#imports (assumed carried over from earlier appendices; repeated here so this appendix runs standalone)
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, confusion_matrix,
                             ConfusionMatrixDisplay)

#build the training and test data
X_train, X_test, y_train, y_test = train_test_split(
    clean,
    labels,
    stratify=labels,
    test_size=0.2,
    random_state=9999
)
X_train_npc, X_test_npc, y_train, y_test = train_test_split(
    clean_npc,
    labels,
    stratify=labels,
    test_size=0.2,
    random_state=9999
)

Appendix E — Decision Tree Code
E.1 Label Encoded Data
| | derived_sex | preapproval | open-end_line_of_credit | loan_amount | loan_to_value_ratio | interest_rate | total_loan_costs | origination_charges | discount_points | lender_credits | ... | co-applicant_ethnicity_Not Applicable | co-applicant_ethnicity_No Co-applicant | aus_Desktop Underwriter | aus_Loan Prospector/Product Advisor | aus_TOTAL Scorecard | aus_GUS | aus_Other | aus_Internal Proprietary | aus_Not applicable | aus_Exempt |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 142289 | 3 | 0 | 1 | 1 | 2 | 3 | 1 | 1 | 1 | 1 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 174168 | 0 | 0 | 1 | 2 | 2 | 3 | 1 | 1 | 1 | 1 | ... | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 106229 | 1 | 1 | 1 | 1 | 2 | 2 | 1 | 1 | 1 | 1 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 30766 | 0 | 1 | 1 | 3 | 2 | 2 | 3 | 1 | 1 | 1 | ... | 0 | 1 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 34556 | 0 | 1 | 1 | 1 | 2 | 2 | 1 | 1 | 1 | 1 | ... | 0 | 1 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 60130 | 0 | 1 | 0 | 1 | 1 | 2 | 1 | 1 | 1 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 114480 | 0 | 0 | 1 | 1 | 4 | 2 | 1 | 1 | 1 | 1 | ... | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 61174 | 2 | 1 | 1 | 1 | 2 | 2 | 1 | 1 | 1 | 1 | ... | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
| 131619 | 2 | 0 | 1 | 1 | 4 | 2 | 1 | 1 | 1 | 1 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 77029 | 2 | 1 | 1 | 2 | 2 | 2 | 1 | 1 | 1 | 1 | ... | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
10 rows × 100 columns
| | preapproval | open-end_line_of_credit | loan_amount | loan_to_value_ratio | interest_rate | total_loan_costs | origination_charges | discount_points | lender_credits | loan_term | ... | tract_median_age_of_housing_units | company | aus_Desktop Underwriter | aus_Loan Prospector/Product Advisor | aus_TOTAL Scorecard | aus_GUS | aus_Other | aus_Internal Proprietary | aus_Not applicable | aus_Exempt |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 142289 | 0 | 1 | 1 | 2 | 3 | 1 | 1 | 1 | 1 | 2 | ... | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 174168 | 0 | 1 | 2 | 2 | 3 | 1 | 1 | 1 | 1 | 2 | ... | 1 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 106229 | 1 | 1 | 1 | 2 | 2 | 1 | 1 | 1 | 1 | 2 | ... | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 30766 | 1 | 1 | 3 | 2 | 2 | 3 | 1 | 1 | 1 | 2 | ... | 1 | 1 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 34556 | 1 | 1 | 1 | 2 | 2 | 1 | 1 | 1 | 1 | 2 | ... | 1 | 1 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
| 60130 | 1 | 0 | 1 | 1 | 2 | 1 | 1 | 1 | 1 | 2 | ... | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 114480 | 0 | 1 | 1 | 4 | 2 | 1 | 1 | 1 | 1 | 1 | ... | 1 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 61174 | 1 | 1 | 1 | 2 | 2 | 1 | 1 | 1 | 1 | 2 | ... | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
| 131619 | 0 | 1 | 1 | 4 | 2 | 1 | 1 | 1 | 1 | 1 | ... | 3 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 77029 | 1 | 1 | 2 | 2 | 2 | 1 | 1 | 1 | 1 | 2 | ... | 1 | 4 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
10 rows × 33 columns
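Both previews show features that are already integer-coded; the encoding itself happens upstream of this appendix. As a minimal sketch only (not the report's actual preprocessing step), codes like these could be produced with scikit-learn's OrdinalEncoder, assuming the raw frame still holds string categories:

from sklearn.preprocessing import OrdinalEncoder

#sketch only: map every object/categorical column to integer codes
cat_cols = clean.select_dtypes(include=['object', 'category']).columns
clean[cat_cols] = OrdinalEncoder().fit_transform(clean[cat_cols]).astype(int)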
#fit fresh depth-4 decision trees on both datasets
dt_pc, dt_npc = DecisionTreeClassifier(max_depth=4), DecisionTreeClassifier(max_depth=4)
dt_pc.fit(X_train, y_train)
dt_npc.fit(X_train_npc, y_train)

#generate predictions
y_pred = dt_pc.predict(X_test)
y_pred_npc = dt_npc.predict(X_test_npc)

import graphviz
from sklearn import tree
dot_data = tree.export_graphviz(
    dt_npc,
    class_names=['deny', 'approve'],   #display names for the two classes
    feature_names=X_test_npc.columns,  #the feature columns
    out_file=None
)
graph = graphviz.Source(dot_data)
graph

#display summarized classification results
results = pd.DataFrame({
    'Model': [],
    'Data': [],
    'Accuracy': [],
    'Precision': [],
    'Recall': [],
    'F1': [],
    'ROC-AUC': []
})
results.loc[len(results)] = {
    'Model': 'Decision Tree',
    'Data': 'With Protected Classes',
    'Accuracy': accuracy_score(y_test, y_pred),
    'Precision': precision_score(y_test, y_pred),
    'Recall': recall_score(y_test, y_pred),
    'F1': f1_score(y_test, y_pred),
    'ROC-AUC': roc_auc_score(y_test, y_pred)
}
results.loc[len(results)] = {
    'Model': 'Decision Tree',
    'Data': 'Without Protected Classes',
    'Accuracy': accuracy_score(y_test, y_pred_npc),
    'Precision': precision_score(y_test, y_pred_npc),
    'Recall': recall_score(y_test, y_pred_npc),
    'F1': f1_score(y_test, y_pred_npc),
    'ROC-AUC': roc_auc_score(y_test, y_pred_npc)
}
display(results)

| | Model | Data | Accuracy | Precision | Recall | F1 | ROC-AUC |
|---|---|---|---|---|---|---|---|
| 0 | Decision Tree | With Protected Classes | 0.909013 | 0.911677 | 0.989477 | 0.948985 | 0.711476 |
| 1 | Decision Tree | Without Protected Classes | 0.909013 | 0.911677 | 0.989477 | 0.948985 | 0.711476 |
#display summarized classification results - confusion matrices
import matplotlib.pyplot as plt
fig, axes = plt.subplots(nrows=1, ncols=2)
ConfusionMatrixDisplay(
    confusion_matrix(y_pred=y_pred, y_true=y_test),
    display_labels=['Deny', 'Approve']
).plot(ax=axes[0])
ConfusionMatrixDisplay(
    confusion_matrix(y_pred=y_pred_npc, y_true=y_test),
    display_labels=['Deny', 'Approve']
).plot(ax=axes[1])
axes[0].set_title('With Protected\nClasses')
axes[1].set_title('Without Protected\nClasses')
plt.suptitle("Confusion Matrices - Decision Trees")
plt.tight_layout()
plt.show()
E.1.1 Dropping debt to income ratio
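The cell that drops the column and refits the trees is not reproduced in the appendix; a minimal sketch, assuming the column is named debt_to_income_ratio in both frames:

#sketch: drop the ratio column (name assumed), refit, and regenerate predictions
X_train = X_train.drop(columns=['debt_to_income_ratio'])
X_test = X_test.drop(columns=['debt_to_income_ratio'])
X_train_npc = X_train_npc.drop(columns=['debt_to_income_ratio'])
X_test_npc = X_test_npc.drop(columns=['debt_to_income_ratio'])
dt_pc.fit(X_train, y_train)
dt_npc.fit(X_train_npc, y_train)
y_pred = dt_pc.predict(X_test)
y_pred_npc = dt_npc.predict(X_test_npc)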
import graphviz
from sklearn import tree
dot_data = tree.export_graphviz(
    dt_npc,
    class_names=['deny', 'approve'],
    feature_names=X_test_npc.columns,
    out_file=None
)
graph = graphviz.Source(dot_data)
graph

[Rendered decision-tree images written to imgs\dt1npc.png and imgs\dt1pc.png]
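The image paths above look like return values from graphviz's render method; a sketch of how such files could have been written (filenames taken from the output, arguments assumed):

#hypothetical rendering step: render() writes the image and returns its path,
#e.g. 'imgs\\dt1npc.png' on Windows
graph.render(filename='dt1npc', directory='imgs', format='png', cleanup=True)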
#display summarized classification results
results = pd.DataFrame({
    'Model': [],
    'Data': [],
    'Accuracy': [],
    'Precision': [],
    'Recall': [],
    'F1': [],
    'ROC-AUC': []
})
results.loc[len(results)] = {
    'Model': 'Decision Tree',
    'Data': 'With Protected Classes',
    'Accuracy': accuracy_score(y_test, y_pred),
    'Precision': precision_score(y_test, y_pred),
    'Recall': recall_score(y_test, y_pred),
    'F1': f1_score(y_test, y_pred),
    'ROC-AUC': roc_auc_score(y_test, y_pred)
}
results.loc[len(results)] = {
    'Model': 'Decision Tree',
    'Data': 'Without Protected Classes',
    'Accuracy': accuracy_score(y_test, y_pred_npc),
    'Precision': precision_score(y_test, y_pred_npc),
    'Recall': recall_score(y_test, y_pred_npc),
    'F1': f1_score(y_test, y_pred_npc),
    'ROC-AUC': roc_auc_score(y_test, y_pred_npc)
}
display(results)

| | Model | Data | Accuracy | Precision | Recall | F1 | ROC-AUC |
|---|---|---|---|---|---|---|---|
| 0 | Decision Tree | With Protected Classes | 0.889389 | 0.897131 | 0.983439 | 0.938304 | 0.658499 |
| 1 | Decision Tree | Without Protected Classes | 0.889389 | 0.897131 | 0.983439 | 0.938304 | 0.658499 |
#display summarized classification results - confusion matrices
import matplotlib.pyplot as plt
fig, axes = plt.subplots(nrows=1, ncols=2)
ConfusionMatrixDisplay(
    confusion_matrix(y_pred=y_pred, y_true=y_test),
    display_labels=['Deny', 'Approve']
).plot(ax=axes[0])
ConfusionMatrixDisplay(
    confusion_matrix(y_pred=y_pred_npc, y_true=y_test),
    display_labels=['Deny', 'Approve']
).plot(ax=axes[1])
axes[0].set_title('With Protected\nClasses')
axes[1].set_title('Without Protected\nClasses')
plt.suptitle("Confusion Matrices - Decision Trees")
plt.tight_layout()
plt.show()
E.1.2 Dropping automated underwriting system columns…
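Again, the dropping cell itself is not shown; a minimal sketch, using the aus_ column prefix visible in the data previews above:

#sketch: drop the one-hot automated-underwriting-system indicators, refit, re-predict
aus_cols = [c for c in X_train.columns if c.startswith('aus_')]
X_train, X_test = X_train.drop(columns=aus_cols), X_test.drop(columns=aus_cols)
aus_cols_npc = [c for c in X_train_npc.columns if c.startswith('aus_')]
X_train_npc, X_test_npc = X_train_npc.drop(columns=aus_cols_npc), X_test_npc.drop(columns=aus_cols_npc)
dt_pc.fit(X_train, y_train)
dt_npc.fit(X_train_npc, y_train)
y_pred = dt_pc.predict(X_test)
y_pred_npc = dt_npc.predict(X_test_npc)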
import graphviz
from sklearn import tree
dot_data = tree.export_graphviz(
    dt_npc,
    class_names=['deny', 'approve'],
    feature_names=X_test_npc.columns,
    out_file=None
)
graph = graphviz.Source(dot_data)
graph

#display summarized classification results
results = pd.DataFrame({
    'Model': [],
    'Data': [],
    'Accuracy': [],
    'Precision': [],
    'Recall': [],
    'F1': [],
    'ROC-AUC': []
})
results.loc[len(results)] = {
    'Model': 'Decision Tree',
    'Data': 'With Protected Classes',
    'Accuracy': accuracy_score(y_test, y_pred),
    'Precision': precision_score(y_test, y_pred),
    'Recall': recall_score(y_test, y_pred),
    'F1': f1_score(y_test, y_pred),
    'ROC-AUC': roc_auc_score(y_test, y_pred)
}
results.loc[len(results)] = {
    'Model': 'Decision Tree',
    'Data': 'Without Protected Classes',
    'Accuracy': accuracy_score(y_test, y_pred_npc),
    'Precision': precision_score(y_test, y_pred_npc),
    'Recall': recall_score(y_test, y_pred_npc),
    'F1': f1_score(y_test, y_pred_npc),
    'ROC-AUC': roc_auc_score(y_test, y_pred_npc)
}
display(results)

| | Model | Data | Accuracy | Precision | Recall | F1 | ROC-AUC |
|---|---|---|---|---|---|---|---|
| 0 | Decision Tree | With Protected Classes | 0.863593 | 0.863495 | 0.998332 | 0.926031 | 0.532811 |
| 1 | Decision Tree | Without Protected Classes | 0.863593 | 0.863495 | 0.998332 | 0.926031 | 0.532811 |
#display summarized classification results - confusion matrices
import matplotlib.pyplot as plt
fig, axes = plt.subplots(nrows=1, ncols=2)
ConfusionMatrixDisplay(
    confusion_matrix(y_pred=y_pred, y_true=y_test),
    display_labels=['Deny', 'Approve']
).plot(ax=axes[0])
ConfusionMatrixDisplay(
    confusion_matrix(y_pred=y_pred_npc, y_true=y_test),
    display_labels=['Deny', 'Approve']
).plot(ax=axes[1])
axes[0].set_title('With Protected\nClasses')
axes[1].set_title('Without Protected\nClasses')
plt.suptitle("Confusion Matrices - Decision Trees")
plt.tight_layout()
plt.show()
import graphviz
from sklearn import tree
dot_data = tree.export_graphviz(
    dt_npc,
    class_names=['deny', 'approve'],
    feature_names=X_test_npc.columns,
    out_file=None
)
graph = graphviz.Source(dot_data)
graph

dot_data = tree.export_graphviz(
    dt_pc,
    class_names=['deny', 'approve'],
    feature_names=X_test.columns,
    out_file=None
)
graph = graphviz.Source(dot_data)
graph

#randomization testing...
clean = pd.read_csv('../data/cnb_pc.csv')
clean_npc = pd.read_csv('../data/cnb_npc.csv')
np.random.seed(5505)
results = pd.DataFrame({
    'Model': [],
    'Data': [],
    'Accuracy': [],
    'Precision': [],
    'Recall': [],
    'F1': [],
    'ROC-AUC': []
})
for i in range(500):
    r = np.random.randint(1, 5000, 1)[0]
    X_train, X_test, y_train, y_test = train_test_split(
        clean,
        labels,
        stratify=labels,
        test_size=0.2,
        random_state=r
    )
    X_train_npc, X_test_npc, y_train, y_test = train_test_split(
        clean_npc,
        labels,
        stratify=labels,
        test_size=0.2,
        random_state=r
    )
    dt_pc.fit(X_train, y_train)
    dt_npc.fit(X_train_npc, y_train)
    y_pred = dt_pc.predict(X_test)
    y_pred_npc = dt_npc.predict(X_test_npc)
    results.loc[len(results)] = {
        'Model': 'Decision Tree (D=4)',
        'Data': 'With Protected Classes',
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1': f1_score(y_test, y_pred),
        'ROC-AUC': roc_auc_score(y_test, y_pred)
    }
    results.loc[len(results)] = {
        'Model': 'Decision Tree (D=4)',
        'Data': 'Without Protected Classes',
        'Accuracy': accuracy_score(y_test, y_pred_npc),
        'Precision': precision_score(y_test, y_pred_npc),
        'Recall': recall_score(y_test, y_pred_npc),
        'F1': f1_score(y_test, y_pred_npc),
        'ROC-AUC': roc_auc_score(y_test, y_pred_npc)
    }
results.to_csv('../data/dtRandTest.csv', index=False)

#check for statistically significant differences in model performance
from statsmodels.stats.weightstats import ztest
tbl_pc = results[results['Data']=='With Protected Classes'].describe().T.reset_index()
tbl_npc = results[results['Data']=='Without Protected Classes'].describe().T.reset_index()
sig = pd.DataFrame({
    'Stat': [],
    'z-score': [],
    'p-value': [],
    'top performer': [],
    'top mean': [],
    'difference in means': []
})
scores = tbl_pc['index'].unique()
for score in scores:
    z_stat, p_value = ztest(
        results.loc[results['Data']=='With Protected Classes'][score],
        results.loc[results['Data']=='Without Protected Classes'][score],
    )
    mu_pc, mu_npc = (
        tbl_pc.loc[tbl_pc['index']==score, 'mean'].iloc[0],
        tbl_npc.loc[tbl_npc['index']==score, 'mean'].iloc[0]
    )
    winner = np.select(
        [
            mu_pc < mu_npc,
            mu_npc < mu_pc
        ],
        [
            'Without Protected Classes',
            'With Protected Classes'
        ],
        default='tie'
    )
    diff = np.abs(mu_pc - mu_npc)
    m = max(mu_pc, mu_npc)
    sig.loc[len(sig)] = {
        'Stat': score,
        'z-score': z_stat,
        'p-value': p_value,
        'top performer': winner,
        'top mean': m,
        'difference in means': diff
    }
display(sig.style.hide(axis='index'))

| Stat | z-score | p-value | top performer | top mean | difference in means |
|---|---|---|---|---|---|
| Accuracy | -0.063229 | 0.949584 | Without Protected Classes | 0.909842 | 0.000004 |
| Precision | -0.048038 | 0.961686 | Without Protected Classes | 0.912724 | 0.000002 |
| Recall | -0.045748 | 0.963511 | Without Protected Classes | 0.989174 | 0.000001 |
| F1 | -0.064089 | 0.948899 | Without Protected Classes | 0.949412 | 0.000002 |
| ROC-AUC | -0.049822 | 0.960264 | Without Protected Classes | 0.715086 | 0.000009 |
#visualize the output distributions
res = results.copy()
res = res.melt(id_vars=['Model','Data'], var_name='Metric', value_name='Score')
g = sns.FacetGrid(data=res, col='Metric', hue='Data', col_wrap=3)
g.map_dataframe(sns.kdeplot, x='Score')
g.add_legend(loc='lower right')
plt.suptitle("Distributions for Randomization of Training/Testing Data\nDecision Trees")
plt.tight_layout()
plt.show()