Appendix F — Regression Code

F.1 Logistic Regression

Data was prepared in Appendix G.

We perform an 80/20 train/test split, stratified on the outcome (approve/deny), for both feature sets: mca (with protected classes) and mca_npc (without).

#imports used throughout this appendix
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
)

#build the training and test data; the shared random_state and
#stratification make both calls partition the rows identically,
#so y_train/y_test are the same for both feature sets
X_train,X_test,y_train,y_test = train_test_split(
    mca, #mca[mca.columns[:126]],
    labels,
    stratify=labels,
    random_state=8808,
    test_size=0.2
)
X_train_npc,X_test_npc,y_train,y_test = train_test_split(
    mca_npc, #mca_npc[mca_npc.columns[:86]],
    labels,
    stratify=labels,
    random_state=8808,
    test_size=0.2
)
y_train_copy = y_train.copy()
#SMOTE oversampling was tried and left disabled
# X_train,y_train = SMOTE(random_state=8808).fit_resample(X_train,y_train_copy.copy())
# X_train_npc,y_train = SMOTE(random_state=8808).fit_resample(X_train_npc,y_train_copy.copy())
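
Because the split is stratified, the approve/deny ratio should be preserved in both partitions. A quick sanity check (not in the original code; it assumes labels is a pandas Series):

#verify the class ratio survives the stratified split
print(labels.value_counts(normalize=True))
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))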

#table to collect metrics for both models
results = pd.DataFrame({
    'Model':[],
    'Data':[],
    'Accuracy':[],
    'Precision':[],
    'Recall':[],
    'F1':[],
    'ROC-AUC':[]
})
#train one model per feature set
lr = LogisticRegression(max_iter=300)      #with protected classes
lr_npc = LogisticRegression(max_iter=300)  #without protected classes
lr.fit(X_train,y_train)
y_pred = lr.predict(X_test)
results.loc[len(results)] = {
    'Model':'Logistic Regression',
    'Data':'With Protected Classes',
    'Accuracy':accuracy_score(y_test,y_pred),
    'Precision':precision_score(y_test,y_pred),
    'Recall':recall_score(y_test,y_pred),
    'F1':f1_score(y_test,y_pred),
    'ROC-AUC':roc_auc_score(y_test,y_pred)
}
#fit and evaluate the model without protected classes
lr_npc.fit(X_train_npc,y_train)
y_pred_npc = lr_npc.predict(X_test_npc)
results.loc[len(results)] = {
    'Model':'Logistic Regression',
    'Data':'Without Protected Classes',
    'Accuracy':accuracy_score(y_test,y_pred_npc),
    'Precision':precision_score(y_test,y_pred_npc),
    'Recall':recall_score(y_test,y_pred_npc),
    'F1':f1_score(y_test,y_pred_npc),
    'ROC-AUC':roc_auc_score(y_test,y_pred_npc)
}
#display the confusion matrices side by side
fig,axes = plt.subplots(nrows=1,ncols=2)

ConfusionMatrixDisplay(
    confusion_matrix(
        y_pred=y_pred,y_true=y_test
    ),
    display_labels=['Deny','Approve']
).plot(ax=axes[0])
ConfusionMatrixDisplay(
    confusion_matrix(
        y_pred=y_pred_npc,y_true=y_test
    ),
    display_labels=['Deny','Approve']
).plot(ax=axes[1])
axes[0].set_title('With Protected\nClasses')
axes[1].set_title('Without Protected\nClasses')
plt.suptitle("Confusion Matrices - Logistic Regression")
plt.tight_layout()
plt.show()

Model                Data                       Accuracy  Precision  Recall    F1        ROC-AUC
Logistic Regression  With Protected Classes     0.961859  0.983022   0.972197  0.977579  0.936481
Logistic Regression  Without Protected Classes  0.958859  0.981304   0.970385  0.975814  0.930562
Are the two models significantly different? To find out, we run a randomization test: 500 train/test splits with random seeds, recording each metric for both models on every split.

#reset the results table for the randomization test
results = pd.DataFrame({
    'Model':[],
    'Data':[],
    'Accuracy':[],
    'Precision':[],
    'Recall':[],
    'F1':[],
    'ROC-AUC':[]
})
np.random.seed(2036)
for i in range(500):
    print('iteration: {}'.format(i+1))
    #draw one random seed so both splits partition the rows identically
    r = np.random.randint(0,5000)
    X_train,X_test,y_train,y_test = train_test_split(
        mca, #mca[mca.columns[:126]],
        labels,
        stratify=labels,
        random_state=r,
        test_size=0.2
    )
    X_train_npc,X_test_npc,y_train,y_test = train_test_split(
        mca_npc, #mca_npc[mca_npc.columns[:86]],
        labels,
        stratify=labels,
        random_state=r,
        test_size=0.2
    )
    y_train_copy = y_train.copy()
    # X_train,y_train = SMOTE(random_state=8808).fit_resample(X_train,y_train_copy.copy())
    # X_train_npc,y_train = SMOTE(random_state=8808).fit_resample(X_train_npc,y_train_copy.copy())
    #refitting the estimators from F.1 resets their coefficients each iteration
    lr.fit(X_train,y_train)
    lr_npc.fit(X_train_npc,y_train)
    y_pred = lr.predict(X_test)
    y_pred_npc = lr_npc.predict(X_test_npc)
    results.loc[len(results)] = {
        'Model':'Logistic Regression',
        'Data':'With Protected Classes',
        'Accuracy':accuracy_score(y_test,y_pred),
        'Precision':precision_score(y_test,y_pred),
        'Recall':recall_score(y_test,y_pred),
        'F1':f1_score(y_test,y_pred),
        'ROC-AUC':roc_auc_score(y_test,y_pred)
    }
    results.loc[len(results)] = {
        'Model':'Logistic Regression',
        'Data':'Without Protected Classes',
        'Accuracy':accuracy_score(y_test,y_pred_npc),
        'Precision':precision_score(y_test,y_pred_npc),
        'Recall':recall_score(y_test,y_pred_npc),
        'F1':f1_score(y_test,y_pred_npc),
        'ROC-AUC':roc_auc_score(y_test,y_pred_npc)
    }
results.to_csv('../data/logRegRandTest.csv',index=False)
#visualize the output distributions
res = results.copy()
res = res.melt(id_vars=['Model','Data'],var_name='Metric',value_name='Score')
g = sns.FacetGrid(data=res,col='Metric',hue='Data',col_wrap=3)
g.map_dataframe(sns.kdeplot,x='Score')
g.add_legend(loc='lower right')
plt.suptitle("Distributions for Randomization of Training/Testing Data\nLogistic Regression")
plt.tight_layout()
plt.show()
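
The code that produced the significance table below is not shown in this appendix. A minimal sketch of one way to compute it, assuming a paired z-test on the per-split metric differences read back from the CSV saved above; the pairing-by-iteration and the normal approximation for the p-value are assumptions, not the original implementation:

#a sketch, not the original code: paired z-test on the 500 per-split
#metric differences saved to logRegRandTest.csv above
from scipy.stats import norm

res = pd.read_csv('../data/logRegRandTest.csv')
res['iter'] = res.groupby('Data').cumcount()  #pair the i-th split of each model
rows = []
for metric in ['Accuracy','Precision','Recall','F1','ROC-AUC']:
    wide = res.pivot(index='iter',columns='Data',values=metric)
    d = wide['With Protected Classes'] - wide['Without Protected Classes']
    z = d.mean()/(d.std(ddof=1)/np.sqrt(len(d)))  #paired z-statistic
    p = 2*norm.sf(abs(z))                         #two-sided normal p-value
    rows.append({
        'Stat':metric,
        'z-score':z,
        'p-value':p,
        'top performer':wide.mean().idxmax(),
        'top mean':wide.mean().max(),
        'difference in means':abs(d.mean())
    })
print(pd.DataFrame(rows))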

Stat       z-score    p-value   top performer           top mean  difference in means
Accuracy   44.648798  0.000000  With Protected Classes  0.963586  0.002457
Precision  47.173034  0.000000  With Protected Classes  0.983867  0.001985
Recall     16.219786  0.000000  With Protected Classes  0.973386  0.000889
F1         43.977874  0.000000  With Protected Classes  0.978598  0.001431
ROC-AUC    50.590439  0.000000  With Protected Classes  0.939526  0.006305

Every metric significantly favors the model that includes protected classes, though the differences in means are small (at most about 0.006, for ROC-AUC).

F.2 Multinomial Naive Bayes

The Multinomial Naive Bayes code and data processing can be found in Appendix D; a brief sketch of the analogous evaluation pipeline follows.
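
For orientation only, the evaluation pattern of F.1 carries over directly. Note that MultinomialNB requires non-negative features, so it cannot be fit on the MCA coordinates used above; X_counts_train and X_counts_test below are hypothetical names standing in for whatever non-negative representation Appendix D prepares.

#a sketch only, not the Appendix D code: MultinomialNB requires
#non-negative features, so X_counts_train/X_counts_test are hypothetical
#stand-ins for the representation prepared in Appendix D
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

mnb = MultinomialNB()
mnb.fit(X_counts_train,y_train)
y_pred_mnb = mnb.predict(X_counts_test)
print(accuracy_score(y_test,y_pred_mnb))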