Chapter 10

Chapter 10 Logistic Regression

Original Code Credit:: Shmueli, Galit; Bruce, Peter C.; Gedeck, Peter; Patel, Nitin R.. Data Mining for Business Analytics Wiley.

Modifications have been made from the original textbook examples due to version changes in library dependencies and/or for clarity.

Download this notebook and data here.

Import Libraries

import os
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from mord import LogisticIT
import matplotlib.pyplot as plt
import seaborn as sns
from dmba import classificationSummary, gainsChart, liftChart
from dmba.metric import AIC_score

import matplotlib

%matplotlib inline

SEED = 1

10.3 Example: Acceptance of Personal Loan

Logistic regression model for loan acceptance (training data)

bank_df = pd.read_csv(os.path.join('data', 'UniversalBank.csv'))
bank_df.drop(columns=['ID', 'ZIP Code'], inplace=True)
bank_df.columns = [c.replace(' ', '_') for c in bank_df.columns]

# Treat education as categorical, convert to dummy variables
bank_df['Education'] = bank_df['Education'].astype('category')
new_categories = {1: 'Undergrad', 2: 'Graduate', 3: 'Advanced/Professional'}
bank_df['Education'] = bank_df['Education'].cat.rename_categories(new_categories)
bank_df2 = pd.get_dummies(bank_df, prefix_sep='_', drop_first=True, dtype=int)

y = bank_df2['Personal_Loan']
X = bank_df2.drop(columns=['Personal_Loan'])

# partition data
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.4, random_state=SEED)

# fit a logistic regression (set penalty=l2 and C=1e42 to avoid regularization)
logit_reg = LogisticRegression(penalty="l2", C=1e42, solver='liblinear')
logit_reg.fit(train_X, train_y)

print('intercept ', logit_reg.intercept_[0])
pd.DataFrame({'coeff': logit_reg.coef_[0]}, index=X.columns).transpose()

intercept  -12.560703103343736

	Age	Experience	Income	Family	CCAvg	Mortgage	Securities_Account	CD_Account	Online	CreditCard	Education_Graduate	Education_Advanced/Professional
coeff	-0.035686	0.037234	0.058921	0.61268	0.240926	0.001015	-1.029088	3.662056	-0.679537	-0.960725	4.21121	4.36177

print('AIC', AIC_score(valid_y, logit_reg.predict(valid_X), df = len(train_X.columns) + 1))

AIC -733.9975169177105

10.4 Evaluating Classification Performance

Propensities for four customers in validation data

logit_reg_pred = logit_reg.predict(valid_X)
logit_reg_proba = logit_reg.predict_proba(valid_X)
logit_result = pd.DataFrame({'actual': valid_y, 
                             'p(0)': [p[0] for p in logit_reg_proba],
                             'p(1)': [p[1] for p in logit_reg_proba],
                             'predicted': logit_reg_pred })

# display four different cases
interestingCases = [2764, 932, 2721, 702]
print(logit_result.loc[interestingCases])

      actual      p(0)      p(1)  predicted
2764       0  0.976165  0.023835          0
932        0  0.331940  0.668060          1
2721       1  0.031254  0.968746          1
702        1  0.985943  0.014057          0

Training and Validation Confusion Matrices

# training confusion matrix
classificationSummary(train_y, logit_reg.predict(train_X))

# validation confusion matrix
classificationSummary(valid_y, logit_reg.predict(valid_X))

Confusion Matrix (Accuracy 0.9600)

       Prediction
Actual    0    1
     0 2683   30
     1   90  197
Confusion Matrix (Accuracy 0.9600)

       Prediction
Actual    0    1
     0 1791   16
     1   64  129

Cumulative gains chart and decile-wise lift chart for the validation data for Universal Bank loan offer

df = logit_result.sort_values(by=['p(1)'], ascending=False)
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))

gainsChart(df.actual, ax=axes[0])
liftChart(df.actual, title=False, ax=axes[1])

plt.tight_layout()
plt.show()

png

10.5 Logistic Regression for Multi-class Classification

Ordinal and nominal (multi-class) regression in Python

data = pd.read_csv(os.path.join('data', 'accidentsFull.csv'))
outcome = 'MAX_SEV_IR'
predictors = ['ALCHL_I', 'WEATHER_R']

y = data[outcome]
X = data[predictors]
train_X, train_y = X, y

print('Nominal logistic regression')
logit = LogisticRegression(penalty="l2", solver='lbfgs', C=1e24)
logit.fit(X, y)
print('  intercept', logit.intercept_)
print('  coefficients', logit.coef_)
print()
probs = logit.predict_proba(X)
results = pd.DataFrame({
    'actual': y, 'predicted': logit.predict(X),
    'P(0)': [p[0] for p in probs],
    'P(1)': [p[1] for p in probs],
    'P(2)': [p[2] for p in probs],
})
print(results.head())
print()

print('Ordinal logistic regression')
logit = LogisticIT(alpha=0)
logit.fit(X, y)
print('  theta', logit.theta_)
print('  coefficients', logit.coef_)
print()
probs = logit.predict_proba(X)
results = pd.DataFrame({
    'actual': y, 'predicted': logit.predict(X),
    'P(0)': [p[0] for p in probs],
    'P(1)': [p[1] for p in probs],
    'P(2)': [p[2] for p in probs],
})
results.head()

Nominal logistic regression
  intercept [-0.17870851  0.81673248 -0.63802397]
  coefficients [[ 0.52491466  0.40469045]
 [ 0.15763357  0.16071816]
 [-0.68254823 -0.56540861]]

   actual  predicted      P(0)      P(1)      P(2)
0       1          1  0.490569  0.498933  0.010498
1       0          0  0.554023  0.441483  0.004494
2       0          0  0.554023  0.441483  0.004494
3       0          1  0.490569  0.498933  0.010498
4       0          1  0.393700  0.578117  0.028183

Ordinal logistic regression
  theta [-1.06916285  2.77444326]
  coefficients [-0.40112008 -0.25174207]

	actual	predicted	P(0)	P(1)	P(2)
0	1	1	0.496205	0.482514	0.021281
1	0	0	0.558866	0.424510	0.016624
2	0	0	0.558866	0.424510	0.016624
3	0	1	0.496205	0.482514	0.021281
4	0	1	0.397402	0.571145	0.031453

10.6 Example of Complete Analysis: Predicting Delayed Flights

delays_df = pd.read_csv(os.path.join('data', 'FlightDelays.csv'))
# Create an indicator variable
delays_df['isDelayed'] = [1 if status == 'delayed' else 0 
                          for status in delays_df['Flight Status']]

def createGraph(group, xlabel, axis):
    groupAverage = delays_df.groupby([group])['isDelayed'].mean()
    if group == 'DAY_WEEK': # rotate so that display starts on Sunday
        groupAverage = groupAverage.reindex(index=np.roll(groupAverage.index,1))
        groupAverage.index = ['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat']
    ax = groupAverage.plot.bar(color='C0', ax=axis)
    ax.set_ylabel('Average Delay')
    ax.set_xlabel(xlabel)
    return ax

def graphDepartureTime(xlabel, axis):
    temp_df = pd.DataFrame({'CRS_DEP_TIME': delays_df['CRS_DEP_TIME'] // 100, 
                            'isDelayed': delays_df['isDelayed']})
    groupAverage = temp_df.groupby(['CRS_DEP_TIME'])['isDelayed'].mean()
    ax = groupAverage.plot.bar(color='C0', ax=axis)
    ax.set_xlabel(xlabel); ax.set_ylabel('Average Delay')

fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(10, 10))

createGraph('DAY_WEEK', 'Day of week', axis=axes[0][0])
createGraph('DEST', 'Destination', axis=axes[0][1])
graphDepartureTime('Departure time', axis=axes[1][0])
createGraph('CARRIER', 'Carrier', axis=axes[1][1])
createGraph('ORIGIN', 'Origin', axis=axes[2][0])
createGraph('Weather', 'Weather', axis=axes[2][1])
plt.tight_layout()

png

Proportion of delayed flights by each of the six predictors. Time of day is divided into hourly bins

agg = delays_df.groupby(['ORIGIN', 'DAY_WEEK', 'CARRIER']).isDelayed.mean()
agg = agg.reset_index()

# Define the layout of the graph
height_ratios = []
for i, origin in enumerate(sorted(delays_df.ORIGIN.unique())):
    height_ratios.append(len(agg[agg.ORIGIN == origin].CARRIER.unique()))
gridspec_kw = {'height_ratios': height_ratios, 'width_ratios': [15, 1]}
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(10, 6), 
                         gridspec_kw = gridspec_kw)
axes[0, 1].axis('off')
axes[2, 1].axis('off')

maxIsDelay = agg.isDelayed.max()
for i, origin in enumerate(sorted(delays_df.ORIGIN.unique())):
    data = pd.pivot_table(agg[agg.ORIGIN == origin], values='isDelayed', aggfunc='sum', 
                          index=['CARRIER'], columns=['DAY_WEEK'])
    data = data[[7, 1, 2, 3, 4, 5, 6]]  # Shift last columns to first
    ax = sns.heatmap(data, ax=axes[i][0], vmin=0, vmax=maxIsDelay, 
                     cbar_ax=axes[1][1], cmap=sns.light_palette("navy"))
    ax.set_xticklabels(['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat'])
    if i != 2: 
        ax.get_xaxis().set_visible(False)
    ax.set_ylabel('Airport ' + origin)
plt.show()

png

Estimated Logistic regression model for delayed flights (based on the training set)

# convert to categorical
delays_df.DAY_WEEK = delays_df.DAY_WEEK.astype('category')

# create hourly bins departure time 
delays_df.CRS_DEP_TIME = [round(t / 100) for t in delays_df.CRS_DEP_TIME]
delays_df.CRS_DEP_TIME = delays_df.CRS_DEP_TIME.astype('category')

predictors = ['DAY_WEEK', 'CRS_DEP_TIME', 'ORIGIN', 'DEST', 'CARRIER', 'Weather']
outcome = 'isDelayed'

X = pd.get_dummies(delays_df[predictors], drop_first=True)
y = delays_df[outcome]
classes = ['ontime', 'delayed']

# split into training and validation
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.4, random_state=SEED)

logit_full = LogisticRegression(penalty="l2", C=1e42, solver='liblinear')
logit_full.fit(train_X, train_y)

print('intercept ', logit_full.intercept_[0])
pd.DataFrame({'coeff': logit_full.coef_[0]}, index=X.columns).transpose()

intercept  -1.2191068769073172

	Weather	DAY_WEEK_2	DAY_WEEK_3	DAY_WEEK_4	DAY_WEEK_5	DAY_WEEK_6	DAY_WEEK_7	CRS_DEP_TIME_7	CRS_DEP_TIME_8	CRS_DEP_TIME_9	...	ORIGIN_IAD	DEST_JFK	DEST_LGA	CARRIER_DH	CARRIER_DL	CARRIER_MQ	CARRIER_OH	CARRIER_RU	CARRIER_UA	CARRIER_US
coeff	9.325014	-0.597928	-0.704795	-0.798678	-0.295774	-1.129236	-0.135377	0.631496	0.382262	-0.365022	...	-0.134055	-0.523642	-0.545662	0.352302	-0.684517	0.743163	-0.710518	-0.194203	0.315418	-0.97143

1 rows × 33 columns

print('AIC', AIC_score(valid_y, logit_full.predict(valid_X), df = len(train_X.columns) + 1))

AIC 1004.5346225948085

Evaluating performance of all-predictor model on validation

logit_reg_pred = logit_full.predict_proba(valid_X)
full_result = pd.DataFrame({'actual': valid_y, 
                            'p(0)': [p[0] for p in logit_reg_pred],
                            'p(1)': [p[1] for p in logit_reg_pred],
                            'predicted': logit_full.predict(valid_X)})
full_result = full_result.sort_values(by=['p(1)'], ascending=False)

# confusion matrix
classificationSummary(full_result.actual, full_result.predicted, class_names=classes)

gainsChart(full_result.actual, figsize=[5, 5])
plt.show()

Confusion Matrix (Accuracy 0.8309)

        Prediction
 Actual  ontime delayed
 ontime     705       9
delayed     140      27

png

Logistic regression model with fewer predictors

delays_df['CRS_DEP_TIME'] = [round(t / 100) for t in delays_df['CRS_DEP_TIME']]
delays_red_df = pd.DataFrame({
    'Sun_Mon' : [1 if d in (1, 7) else 0 for d in delays_df.DAY_WEEK],
    'Weather' : delays_df.Weather,
    'CARRIER_CO_MQ_DH_RU' : [1 if d in ("CO", "MQ", "DH", "RU") else 0 
                             for d in delays_df.CARRIER],
    'MORNING' : [1 if d in (6, 7, 8, 9) else 0 for d in delays_df.CRS_DEP_TIME],
    'NOON' : [1 if d in (10, 11, 12, 13) else 0 for d in delays_df.CRS_DEP_TIME],
    'AFTER2P' : [1 if d in (14, 15, 16, 17, 18) else 0 for d in delays_df.CRS_DEP_TIME],
    'EVENING' : [1 if d in (19, 20) else 0 for d in delays_df.CRS_DEP_TIME],
    'isDelayed' : [1 if status == 'delayed' else 0 for status in delays_df['Flight Status']],
})

X = delays_red_df.drop(columns=['isDelayed'])
y = delays_red_df['isDelayed']
classes = ['ontime', 'delayed']

# split into training and validation
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.4, 
                                                      random_state=SEED)

logit_red = LogisticRegressionCV(penalty="l1", solver='liblinear', cv=5)
logit_red.fit(train_X, train_y)

print('intercept ', logit_red.intercept_[0])
pd.DataFrame({'coeff': logit_red.coef_[0]}, index=X.columns).transpose()

intercept  -2.509726593729543

	Sun_Mon	Weather	CARRIER_CO_MQ_DH_RU	MORNING	NOON	AFTER2P	EVENING
coeff	0.625337	4.920328	1.281507	0.0	0.0	0.0	0.0

print('AIC', AIC_score(valid_y, logit_red.predict(valid_X), df=len(train_X.columns) + 1))

# confusion matrix
classificationSummary(valid_y, logit_red.predict(valid_X), class_names=classes)

AIC 940.6290360089256
Confusion Matrix (Accuracy 0.8331)

        Prediction
 Actual  ontime delayed
 ontime     714       0
delayed     147      20

logit_reg_proba = logit_red.predict_proba(valid_X)
red_result = pd.DataFrame({'actual': valid_y, 
                            'p(0)': [p[0] for p in logit_reg_proba],
                            'p(1)': [p[1] for p in logit_reg_proba],
                            'predicted': logit_red.predict(valid_X),
                          })
red_result = red_result.sort_values(by=['p(1)'], ascending=False)

ax = gainsChart(full_result.actual, color='C1', figsize=[5, 5])
gainsChart(red_result.actual, color='C0', ax=ax)
plt.show()

png

Logistic regression model for loan acceptance using Statmodels

# add constant column
y = bank_df2['Personal_Loan']
X = bank_df2.drop(columns=['Personal_Loan'])
X = sm.add_constant(X, prepend=True)

# partition data
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.4, random_state=SEED)

# use GLM (general linear model) with the binomial family to fit a logistic regression
logit_reg = sm.GLM(train_y, train_X, family=sm.families.Binomial())
logit_result = logit_reg.fit()
logit_result.summary()

Generalized Linear Model Regression Results
Dep. Variable:	Personal_Loan	No. Observations:	3000
Model:	GLM	Df Residuals:	2987
Model Family:	Binomial	Df Model:	12
Link Function:	Logit	Scale:	1.0000
Method:	IRLS	Log-Likelihood:	-340.15
Date:	Mon, 07 Apr 2025	Deviance:	680.30
Time:	07:32:46	Pearson chi2:	8.10e+03
No. Iterations:	8	Pseudo R-squ. (CS):	0.3325
Covariance Type:	nonrobust

	coef	std err	z	P>\|z\|	[0.025	0.975]
const	-12.5634	2.336	-5.377	0.000	-17.143	-7.984
Age	-0.0354	0.086	-0.412	0.680	-0.204	0.133
Experience	0.0369	0.086	0.431	0.666	-0.131	0.205
Income	0.0589	0.004	15.044	0.000	0.051	0.067
Family	0.6128	0.103	5.931	0.000	0.410	0.815
CCAvg	0.2408	0.060	4.032	0.000	0.124	0.358
Mortgage	0.0010	0.001	1.301	0.193	-0.001	0.003
Securities_Account	-1.0305	0.422	-2.443	0.015	-1.857	-0.204
CD_Account	3.6628	0.460	7.961	0.000	2.761	4.565
Online	-0.6794	0.216	-3.140	0.002	-1.103	-0.255
CreditCard	-0.9609	0.274	-3.507	0.000	-1.498	-0.424
Education_Graduate	4.2075	0.364	11.573	0.000	3.495	4.920
Education_Advanced/Professional	4.3580	0.365	11.937	0.000	3.642	5.074