import pandas as pd, numpy as np, seaborn as sns, matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from mpl_toolkits.mplot3d import Axes3D
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
Appendix C — Association Rule Mining Code
C.1 Module and Data Imports
C.2 Transform to Basket Transactions
def _bit_flags(labels):
    """Assign each label its own single-bit integer, in listing order.

    The first label gets 0b1, the second 0b10, and so on, so several labels
    can be combined in one integer and tested with a bitwise AND.
    """
    return {label: 1 << position for position, label in enumerate(labels)}


# Category labels per encoded field, listed from least- to most-significant bit.
_RACE_LABELS = (
    'American Indian/Alaska Native',
    'Asian',
    'Asian Indian',
    'Chinese',
    'Filipino',
    'Japanese',
    'Korean',
    'Vietnamese',
    'Other Asian',
    'Black/African American',
    'Native Hawaiian/Pacific Islander',
    'Native Hawaiian',
    'Guamanian/Chamorro',
    'Samoan',
    'Other Pacific Islander',
    'White',
    'Information not provided',
    'Not Applicable',
    'No Co-applicant',
)
_ETHNICITY_LABELS = (
    'Hispanic/Latino',
    'Mexican',
    'Puerto Rican',
    'Cuban',
    'Other Hispanic/Latino',
    'Not Hispanic/Latino',
    'Information Not Provided',
    'Not Applicable',
    'No Co-applicant',
)
_AUS_LABELS = (
    'Desktop Underwriter',
    'Loan Prospector/Product Advisor',
    'TOTAL Scorecard',
    'GUS',
    'Other',
    'Internal Proprietary',
    'Not applicable',
    'Exempt',
)
_DENIAL_REASON_LABELS = (
    'DTI',
    'Employment History',
    'Credit History',
    'Collateral',
    'Insufficient Cash',
    'Unverifiable Information',
    'Credit Application Incomplete',
    'Mortgage Insurance Denied',
    'Other',
    'Not Applicable',
)

# field name -> {label: bit flag}
mapper = {
    'applicant_race': _bit_flags(_RACE_LABELS),
    # 'co-applicant_race': _bit_flags(_RACE_LABELS),  # disabled; same labels as applicant_race
    'applicant_ethnicity': _bit_flags(_ETHNICITY_LABELS),
    # 'co-applicant_ethnicity': _bit_flags(_ETHNICITY_LABELS),  # disabled; same labels as applicant_ethnicity
    'aus': _bit_flags(_AUS_LABELS),
    'denial_reason': _bit_flags(_DENIAL_REASON_LABELS),
}
# Invert `mapper` so that, for every encoded field, each bit flag maps back to
# its label: {field: {bit: label}}. The basket-building loop below uses this to
# decode the bit-flag columns of each record.
new_mapper = {
    field: {bit: label for label, bit in codes.items()}
    for field, codes in mapper.items()
}

# Load the working data set (previously taken from an in-memory frame via fr.copy()).
fr2 = pd.read_csv('../data/final_clean_r2.csv')
# Percentile cut points and the label of each resulting band. The labels name
# the percentile band a value falls into, not a range of raw values.
pct = [20, 40, 60, 80]
levels = ['0-20', '21-40', '41-60', '61-80', '>80']

# Numeric columns that are replaced in place by their percentile band.
pct_cols = [
    'income',
    'debt_to_income_ratio',
    'loan_to_value_ratio',
    'tract_minority_population_percent',
    'tract_to_msa_income_percentage',
    'tract_median_age_of_housing_units',
    'interest_rate',
]

for col in pct_cols:
    # Interior bin edges are the column's own percentiles; the outer bins are
    # open-ended so every value lands in exactly one band.
    p = [-np.inf] + list(np.percentile(fr2[col], pct)) + [np.inf]
    fr2[col] = pd.cut(fr2[col], bins=p, labels=levels)

# One list of item strings per record; filled by the loop over fr2 below.
basket = []
# Column groupings and empty per-group basket lists.
# NOTE(review): none of these names are referenced in the visible part of the
# file — confirm whether they are still needed.
bc = [
    'income', 'debt_to_income_ratio', 'loan_to_value_ratio',
    'tract_minority_population_percent',
    # Spelled as in pct_cols ('..._percentage'); the previous
    # 'tract_to_msa_income_percent' did not match that column name.
    'tract_to_msa_income_percentage',
    'derived_sex',
]
b1c = ['interest_rate', 'company', 'applicant_race']
b2c = ['interest_rate', 'company', 'outcome']
b3c = ['interest_rate', 'applicant_race', 'outcome']
b1, b2, b3 = [], [], []
# Build one transaction (a list of item strings) per record in fr2.
for i, row in fr2.iterrows():
    curr = []
    # Decode every bit-flag column: emit "<column>:<label>" for each bit set.
    for k, v in new_mapper.items():
        for j, w in v.items():
            if (row[k] & j) > 0:
                curr.append("{}:{}".format(k, w))
    if row['balloon_payment'] == 1:
        curr.append('balloon')
    if row['interest_only_payment'] == 1:
        curr.append('interest only')
    curr.append("{} rooms".format(row['total_units']))
    # Percentile-banded columns contribute "<column>:<band>" items.
    for col in pct_cols:
        curr.append("{}:{}".format(col, row[col]))
    # curr.append(row['company'])
    curr.append(row['derived_sex'])
    curr.append("age_category:{}".format(row['applicant_age']))
    basket.append(curr)

# Vocabulary of distinct items; filled from `basket` in the next step.
items = set()
# Collect every distinct item that occurs in any transaction.
for trans in basket:
    items.update(trans)

# One-hot encode the transactions: one row per transaction, one boolean column
# per item. Columns follow a fixed sorted order so the table layout is the same
# on every run (iteration order of a set of strings is not stable across runs).
item_order = sorted(items, key=str)
result = []
for record in basket:
    rowset = set(record)
    result.append({item: item in rowset for item in item_order})
ohe_df = pd.DataFrame(result)
# Reshape the one-hot table to "single" format: one row per
# (transaction index, item) pair for every item present in the transaction.
single_basket = ohe_df.reset_index().melt(id_vars='index')

# Keep only the cells that are True. This selects the present items explicitly
# instead of going through replace([0], [np.nan]) + dropna, which relies on
# pandas matching the integer 0 against boolean False.
# NOTE(review): newer pandas versions may leave boolean columns untouched in
# that replace call — confirm against the installed version.
single_basket = single_basket[single_basket['value'].eq(True)]

# Group the rows of each transaction together; mergesort keeps the item order
# within a transaction stable.
single_basket = single_basket.sort_values(by='index', kind='mergesort')
# display(
#     ohe_df.head(),
#     single_basket.head()
# )
single_basket.to_csv('../data/ARM_single_basket.csv', index=False)
# C.2.1 Construct Frequent Items
# Mine frequent itemsets from the one-hot table with a minimum support of 0.05,
# reporting itemsets by column name and printing progress.
freq_items = apriori(
    ohe_df,
    min_support=0.05,
    use_colnames=True,
    verbose=1,
)
Processing 3306 combinations | Sampling itemset size 2
Processing 25998 combinations | Sampling itemset size 3
Processing 31860 combinations | Sampling itemset size 4
Processing 29945 combinations | Sampling itemset size 5
Processing 12594 combinations | Sampling itemset size 6
Processing 1253 combinations | Sampling itemset size 7
# Derive association rules from the frequent itemsets, filtering on the
# 'support' metric with a threshold of 0.5, then order them by descending lift.
ars = association_rules(freq_items, metric='support', min_threshold=0.5)
ars = ars.sort_values(by='lift', ascending=False)
# C.2.2 Construct Association Rules (Python)
C.2.3 Save the Top 15 Association Rules by Category
C.2.4 Performing Rule Mining and Visualization in R
Rule mining is performed on the data that was already transformed into single-basket format in Python in the previous steps.
library(tidyverse)
library(arules)
library(arulesViz)
First load the data into R for use in rule mining.
# Read the basket file as transactions in "single" format, taking the
# transaction id and the item from columns 1 and 2 and removing duplicate
# items within a transaction.
# NOTE(review): the path is absolute and machine-specific, and the file name
# differs from the '../data/ARM_single_basket.csv' written by the Python step —
# confirm this is the intended file.
# NOTE(review): if the CSV starts with a header row, header=TRUE is needed so
# the header is not read as a transaction — confirm.
dat <- read.transactions(
  'C:/Users/pconn/OneDrive/Desktop/Machine Learning/ML/data/single_basket.csv',
  sep=',',
  rm.duplicates=TRUE,
  format='single',
  cols=c(1,2)
)
# Now that the data is loaded, R is leveraged to perform rule mining
# Seed the random number generator. This must be a call: `set.seed = 9001`
# only created a variable named `set.seed` and seeded nothing.
set.seed(9001)

# Mine rules of at least two items over all transactions
# (support >= 0.04, confidence >= 0.01), without progress output.
a_rules <- arules::apriori(
  dat,
  control=list(verbose=FALSE),
  parameter=list(support=0.04,confidence=0.01,minlen=2)
)
# Using the mined rules, the rules can now be printed and/or visualized.
# Show the 15 rules with the highest support. head() is used instead of [1:15]
# so the call does not index past the end when fewer than 15 rules exist.
sorted_arules <- sort(a_rules, by='support', decreasing=TRUE)
arules::inspect(head(sorted_arules, 15))
     lhs rhs support confidence coverage lift count
[1] {approve} => {1 rooms} 0.8465587 0.9897986 0.8552837 1.001607 172124
[2] {1 rooms} => {approve} 0.8465587 0.8566580 0.9882108 1.001607 172124
[3] {applicant_ethnicity:Not Hispanic/Latino} => {1 rooms} 0.7152005 0.9893591 0.7228928 1.001162 145416
[4] {1 rooms} => {applicant_ethnicity:Not Hispanic/Latino} 0.7152005 0.7237327 0.9882108 1.001162 145416
[5] {applicant_ethnicity:Not Hispanic/Latino} => {approve} 0.6253676 0.8650905 0.7228928 1.011466 127151
[6] {approve} => {applicant_ethnicity:Not Hispanic/Latino} 0.6253676 0.7311815 0.8552837 1.011466 127151
[7] {applicant_ethnicity:Not Hispanic/Latino,
approve} => {1 rooms} 0.6195837 0.9907512 0.6253676 1.002571 125975
[8] {1 rooms,
applicant_ethnicity:Not Hispanic/Latino} => {approve} 0.6195837 0.8663077 0.7152005 1.012889 125975
[9] {1 rooms,
approve} => {applicant_ethnicity:Not Hispanic/Latino} 0.6195837 0.7318852 0.8465587 1.012439 125975
[10] {applicant_race:White} => {1 rooms} 0.6052911 0.9917401 0.6103324 1.003571 123069
[11] {1 rooms} => {applicant_race:White} 0.6052911 0.6125121 0.9882108 1.003571 123069
[12] {aus:Desktop Underwriter} => {1 rooms} 0.5645626 0.9910383 0.5696678 1.002861 114788
[13] {1 rooms} => {aus:Desktop Underwriter} 0.5645626 0.5712977 0.9882108 1.002861 114788
[14] {applicant_race:White} => {approve} 0.5349397 0.8764727 0.6103324 1.024774 108765
[15] {approve} => {applicant_race:White} 0.5349397 0.6254529 0.8552837 1.024774 108765
# Show the 15 rules with the highest confidence. head() is used instead of
# [1:15] so the call does not index past the end when fewer than 15 rules exist.
sorted_arules <- sort(a_rules, by='confidence', decreasing=TRUE)
arules::inspect(head(sorted_arules, 15))
     lhs rhs support confidence coverage lift count
[1] {interest_rate:>80} => {approve} 0.18578904 1 0.18578904 1.169203 37775
[2] {aus:Loan Prospector/Product Advisor,
aus:Other} => {JP Morgan} 0.15580213 1 0.15580213 4.971320 31678
[3] {interest_rate:>80,
loan_to_value_ratio:61-80} => {approve} 0.04241056 1 0.04241056 1.169203 8623
[4] {interest_rate:>80,
tract_minority_population_percent:0-20} => {approve} 0.04197283 1 0.04197283 1.169203 8534
[5] {debt_to_income_ratio:61-80,
interest_rate:>80} => {approve} 0.04571074 1 0.04571074 1.169203 9294
[6] {interest_rate:>80,
tract_to_msa_income_percentage:21-40} => {approve} 0.04036454 1 0.04036454 1.169203 8207
[7] {debt_to_income_ratio:21-40,
interest_rate:>80} => {approve} 0.04220891 1 0.04220891 1.169203 8582
[8] {Female,
interest_rate:>80} => {approve} 0.04209579 1 0.04209579 1.169203 8559
[9] {2.0,
interest_rate:>80} => {approve} 0.04509104 1 0.04509104 1.169203 9168
[10] {1.0,
interest_rate:>80} => {approve} 0.05827702 1 0.05827702 1.169203 11849
[11] {interest_rate:>80,
loan_to_value_ratio:21-40} => {approve} 0.05279802 1 0.05279802 1.169203 10735
[12] {interest_rate:>80,
Joint} => {approve} 0.06351993 1 0.06351993 1.169203 12915
[13] {interest_rate:>80,
Male} => {approve} 0.06214773 1 0.06214773 1.169203 12636
[14] {aus:Loan Prospector/Product Advisor,
interest_rate:>80} => {approve} 0.08140290 1 0.08140290 1.169203 16551
[15] {interest_rate:>80,
Rocket Mortgage} => {approve} 0.10214832 1 0.10214832 1.169203 20769
# Show the 15 rules with the highest lift. head() is used instead of [1:15]
# so the call does not index past the end when fewer than 15 rules exist.
sorted_arules <- sort(a_rules, by='lift', decreasing=TRUE)
arules::inspect(head(sorted_arules, 15))
     lhs rhs support confidence coverage lift count
[1] {applicant_ethnicity:Mexican} => {applicant_ethnicity:Hispanic/Latino} 0.04056128 0.9169446 0.04423525 8.207934 8247
[2] {applicant_ethnicity:Hispanic/Latino} => {applicant_ethnicity:Mexican} 0.04056128 0.3630800 0.11171442 8.207934 8247
[3] {1 rooms,
applicant_ethnicity:Information Not Provided,
applicant_race:Information not provided,
Rocket Mortgage} => {Sex Not Available} 0.07532387 0.7362273 0.10231062 7.834365 15315
[4] {applicant_ethnicity:Information Not Provided,
applicant_race:Information not provided,
Rocket Mortgage} => {Sex Not Available} 0.07603703 0.7353151 0.10340740 7.824658 15460
[5] {1 rooms,
applicant_ethnicity:Information Not Provided,
applicant_race:Information not provided,
approve,
Rocket Mortgage} => {Sex Not Available} 0.06595941 0.7333224 0.08994600 7.803453 13411
[6] {applicant_ethnicity:Information Not Provided,
applicant_race:Information not provided,
approve,
Rocket Mortgage} => {Sex Not Available} 0.06651026 0.7322395 0.09083129 7.791930 13523
[7] {1 rooms,
applicant_ethnicity:Information Not Provided,
applicant_race:Information not provided,
aus:Desktop Underwriter,
Rocket Mortgage} => {Sex Not Available} 0.05100776 0.7264131 0.07021867 7.729930 10371
[8] {applicant_ethnicity:Information Not Provided,
applicant_race:Information not provided,
aus:Desktop Underwriter,
Rocket Mortgage} => {Sex Not Available} 0.05134712 0.7252518 0.07079903 7.717572 10440
[9] {1 rooms,
applicant_ethnicity:Information Not Provided,
applicant_race:Information not provided,
approve,
aus:Desktop Underwriter,
Rocket Mortgage} => {Sex Not Available} 0.04512547 0.7246663 0.06227068 7.711341 9175
[10] {applicant_ethnicity:Information Not Provided,
applicant_race:Information not provided,
approve,
aus:Desktop Underwriter,
Rocket Mortgage} => {Sex Not Available} 0.04538122 0.7233459 0.06273792 7.697291 9227
[11] {1 rooms,
applicant_ethnicity:Information Not Provided,
applicant_race:Information not provided,
approve,
aus:Desktop Underwriter} => {Sex Not Available} 0.04939456 0.6718625 0.07351885 7.149444 10043
[12] {1 rooms,
applicant_ethnicity:Information Not Provided,
Rocket Mortgage} => {Sex Not Available} 0.07602227 0.6709350 0.11330795 7.139574 15457
[13] {applicant_ethnicity:Information Not Provided,
applicant_race:Information not provided,
approve,
aus:Desktop Underwriter} => {Sex Not Available} 0.04969457 0.6703377 0.07413364 7.133218 10104
[14] {applicant_ethnicity:Information Not Provided,
Rocket Mortgage} => {Sex Not Available} 0.07674034 0.6700880 0.11452278 7.130562 15603
[15] {1 rooms,
applicant_ethnicity:Information Not Provided,
approve,
Rocket Mortgage} => {Sex Not Available} 0.06656437 0.6685768 0.09956129 7.114480 13534
# Plot the ten highest-lift rules as a graph using the HTML engine.
sub <- head(sort(a_rules, by='lift', decreasing=TRUE), 10)
plot(sub, method="graph", engine="html")
# C.2.5 Examining Organization Specific Rules
# Split the transactions by lender: each subset keeps the transactions of `dat`
# selected by `items %in% <lender name>`.
# NOTE(review): presumably every transaction carries its lender's name as an
# item (names such as 'Rocket Mortgage' and 'JP Morgan' appear as items in the
# rule listings in this document) — confirm.
NFCU <- subset(dat, subset = items %in% "Navy Federal Credit Union")
JPM <- subset(dat, subset = items %in% 'JP Morgan')
BOA <- subset(dat, subset = items %in% 'Bank of America')
WF <- subset(dat, subset = items %in% 'Wells Fargo')
RM <- subset(dat, subset = items %in% 'Rocket Mortgage')
# Mine association rules from a set of transactions with arules::apriori,
# with progress output switched off.
#   trns   - transactions to mine
#   appear - item appearance constraints, passed through as `appearance`
#   sup    - minimum support
#   conf   - minimum confidence
#   len    - minimum rule length, passed as `minlen`
# Returns whatever arules::apriori() returns.
get_rules <- function(trns, appear, sup, conf, len) {
  mining_params <- list(support = sup, confidence = conf, minlen = len)
  quiet <- list(verbose = FALSE)
  arules::apriori(
    trns,
    parameter = mining_params,
    appearance = appear,
    control = quiet
  )
}
# Per-lender rule sets: for each lender's transactions, mine rules of at least
# three items whose right-hand side is restricted to 'approve' (*_app) or
# 'deny' (*_den); all other items may only appear on the left-hand side.
NFCU_app <- get_rules(trns=NFCU, appear=list(default='lhs',rhs='approve'), sup=0.04, conf=0.01, len=3)
NFCU_den <- get_rules(trns=NFCU, appear=list(default='lhs',rhs='deny'), sup=0.04, conf=0.01, len=3)
JPM_app <- get_rules(trns=JPM, appear=list(default='lhs',rhs='approve'), sup=0.04, conf=0.01, len=3)
JPM_den <- get_rules(trns=JPM, appear=list(default='lhs',rhs='deny'), sup=0.04, conf=0.01, len=3)
BOA_app <- get_rules(trns=BOA, appear=list(default='lhs',rhs='approve'), sup=0.04, conf=0.01, len=3)
BOA_den <- get_rules(trns=BOA, appear=list(default='lhs',rhs='deny'), sup=0.04, conf=0.01, len=3)
WF_app <- get_rules(trns=WF, appear=list(default='lhs',rhs='approve'), sup=0.04, conf=0.01, len=3)
WF_den <- get_rules(trns=WF, appear=list(default='lhs',rhs='deny'), sup=0.04, conf=0.01, len=3)
RM_app <- get_rules(trns=RM, appear=list(default='lhs',rhs='approve'), sup=0.04, conf=0.01, len=3)
RM_den <- get_rules(trns=RM, appear=list(default='lhs',rhs='deny'), sup=0.04, conf=0.01, len=3)

# Graph the ten highest-lift denial rules of each lender. head() is used
# instead of [1:10] so a lender with fewer than ten rules does not index past
# the end of its rule set.
plot(head(sort(NFCU_den, by='lift'), 10), engine='html', method='graph')
plot(head(sort(JPM_den, by='lift'), 10), engine='html', method='graph')
plot(head(sort(BOA_den, by='lift'), 10), engine='html', method='graph')
plot(head(sort(WF_den, by='lift'), 10), engine='html', method='graph')
plot(head(sort(RM_den, by='lift'), 10), engine='html', method='graph')

# Rules over all transactions whose right-hand side is restricted to 'deny'.
a_rules_den <- arules::apriori(
  dat,
  control=list(verbose=FALSE),
  parameter=list(support=0.04,confidence=0.01,minlen=2),
  appearance = list(default='lhs',rhs='deny')
)
# Rules over all transactions whose right-hand side is restricted to 'approve';
# every other item may only appear on the left-hand side.
a_rules_app <- arules::apriori(
  dat,
  parameter = list(support = 0.04, confidence = 0.01, minlen = 2),
  appearance = list(default = 'lhs', rhs = 'approve'),
  control = list(verbose = FALSE)
)
# List the denial rules whose left-hand side has an item partially matching
# 'race:', ordered by lift.
inspect(sort(
  subset(a_rules_den, lhs %pin% 'race:'),
  by='lift'
))
     lhs rhs support confidence coverage lift count
[1] {applicant_race:White,
interest_rate:41-60} => {deny} 0.04213514 0.3407446 0.1236561 2.3546500 8567
[2] {1 rooms,
applicant_race:White,
interest_rate:41-60} => {deny} 0.04138263 0.3381153 0.1223921 2.3364812 8414
[3] {applicant_race:White} => {deny} 0.07539273 0.1235273 0.6103324 0.8536119 15329
[4] {1 rooms,
applicant_race:White} => {deny} 0.07414348 0.1224923 0.6052911 0.8464593 15075
[5] {applicant_race:White,
aus:Desktop Underwriter} => {deny} 0.04111213 0.1185674 0.3467406 0.8193371 8359
[6] {1 rooms,
applicant_race:White,
aus:Desktop Underwriter} => {deny} 0.04052193 0.1176227 0.3445077 0.8128092 8239
[7] {applicant_ethnicity:Not Hispanic/Latino,
applicant_race:White} => {deny} 0.05616707 0.1111652 0.5052577 0.7681857 11420
[8] {1 rooms,
applicant_ethnicity:Not Hispanic/Latino,
applicant_race:White} => {deny} 0.05537030 0.1103444 0.5017952 0.7625140 11258
# List the ten highest-lift approval rules whose left-hand side has an item
# partially matching 'race:'. head() is used instead of [1:10] so the call does
# not index past the end when fewer than ten rules match.
inspect(head(
  sort(
    subset(a_rules_app, lhs %pin% 'race:'),
    by='lift'
  ),
  10
))
     lhs rhs support confidence coverage lift count
[1] {applicant_race:White,
interest_rate:>80} => {approve} 0.11876236 1 0.11876236 1.169203 24147
[2] {applicant_race:White,
interest_rate:>80,
Joint} => {approve} 0.04705836 1 0.04705836 1.169203 9568
[3] {applicant_race:White,
interest_rate:>80,
Male} => {approve} 0.04324667 1 0.04324667 1.169203 8793
[4] {applicant_race:White,
aus:Loan Prospector/Product Advisor,
interest_rate:>80} => {approve} 0.05488339 1 0.05488339 1.169203 11159
[5] {applicant_race:White,
interest_rate:>80,
Rocket Mortgage} => {approve} 0.06145424 1 0.06145424 1.169203 12495
[6] {applicant_race:White,
aus:Desktop Underwriter,
interest_rate:>80} => {approve} 0.08051760 1 0.08051760 1.169203 16371
[7] {applicant_ethnicity:Not Hispanic/Latino,
applicant_race:White,
interest_rate:>80} => {approve} 0.09885305 1 0.09885305 1.169203 20099
[8] {1 rooms,
applicant_race:White,
interest_rate:>80} => {approve} 0.11780329 1 0.11780329 1.169203 23952
[9] {applicant_race:White,
interest_rate:0-20,
Rocket Mortgage} => {approve} 0.05066840 1 0.05066840 1.169203 10302
[10] {1 rooms,
applicant_race:White,
interest_rate:>80,
Joint} => {approve} 0.04677802 1 0.04677802 1.169203 9511