import pandas as pd, numpy as np, seaborn as sns, matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import OneHotEncoder
from prince import MCA
# Load the input CSV into `fr`; the relative path is resolved against the
# current working directory.
fr = pd.read_csv('../data/final_clean_r2.csv')
labels = fr['outcome'].copy()

Appendix G — Multiple Correspondence Analysis
First, import the necessary modules
G.1 Data Transformations
To perform MCA, the variables must first be separated and transformed into categorical variables. Below, the binary-encoded (bit-packed) fields for applicant race, ethnicity, and underwriting system are broken back out into one binary True/False column per category.
G.1.1 Breakout Race, Ethnicity, and Underwriting System Columns
## Lookup tables for splitting the collapsed columns back out into their
## respective binary values. Every category owns exactly one bit, and the
## bit position is the category's position in the lists below.
_race_names = [
    'American Indian/Alaska Native',
    'Asian',
    'Asian Indian',
    'Chinese',
    'Filipino',
    'Japanese',
    'Korean',
    'Vietnamese',
    'Other Asian',
    'Black/African American',
    'Native Hawaiian/Pacific Islander',
    'Native Hawaiian',
    'Guamanian/Chamorro',
    'Samoan',
    'Other Pacific Islander',
    'White',
    'Information not provided',
    'Not Applicable',
    'No Co-applicant',
]
_ethnicity_names = [
    'Hispanic/Latino',
    'Mexican',
    'Puerto Rican',
    'Cuban',
    'Other Hispanic/Latino',
    'Not Hispanic/Latino',
    'Information Not Provided',
    'Not Applicable',
    'No Co-applicant',
]
_aus_names = [
    'Desktop Underwriter',
    'Loan Prospector/Product Advisor',
    'TOTAL Scorecard',
    'GUS',
    'Other',
    'Internal Proprietary',
    'Not applicable',
    'Exempt',
]


def _bit_flags(names):
    """Return {name: flag}, where the i-th name gets the flag 1 << i."""
    return {name: 1 << position for position, name in enumerate(names)}


## field name -> {category name: bit flag}; each field gets its own dict
mapper = {
    'applicant_race': _bit_flags(_race_names),
    'co-applicant_race': _bit_flags(_race_names),
    'applicant_ethnicity': _bit_flags(_ethnicity_names),
    'co-applicant_ethnicity': _bit_flags(_ethnicity_names),
    'aus': _bit_flags(_aus_names),
}
# Invert every lookup table: for each field, map a bit flag back to the
# category name it stands for (field -> {flag: name}).
new_mapper = {
    field: {flag: name for name, flag in flags.items()}
    for field, flags in mapper.items()
}
# Next, the numeric variables require transformation to categorical. In this case, the standard deviation was leveraged to produce the following categories:
Value <= Mean - 2 * standard deviation => L (Low)
Mean - 2 * standard deviation < Value <= Mean - standard deviation => ML (Mid-Low)
Mean - standard deviation < Value <= Mean + standard deviation => M (Mid)
Mean + standard deviation < Value <= Mean + 2 * standard deviation => MH (Mid-High)
Value > Mean + 2 * standard deviation => H (High)
This categorization allowed for diversity in the source data prior to transforming with MCA
# Turn the numeric columns into categoricals.
# Start by dropping the columns that will not be leveraged in MCA.
fr.drop(
    columns=[
        'balloon_payment',
        'interest_only_payment',
        'other_nonamortizing_features',
        'income_from_median',
        'state_code',
        'county_code'
    ],
    inplace=True
)

# Numeric columns that get converted to categorical.
numerics = [
    'income',
    'loan_amount',
    'interest_rate',
    'total_loan_costs',
    'origination_charges',
    'discount_points',
    'lender_credits',
    'loan_term',
    'intro_rate_period',
    'property_value',
    'total_units',
    'tract_population',
    'tract_minority_population_percent',
    'ffiec_msa_md_median_family_income',
    'tract_to_msa_income_percentage',
    'tract_owner_occupied_units',
    'tract_one_to_four_family_homes',
    'tract_median_age_of_housing_units',
    'loan_to_value_ratio'
]

# set the cutting boundaries
# (0.2, 0.4, 0.6, 0.8 -- not referenced by the binning loop below)
bounds = [step / 5 for step in range(1, 5)]

# Bin edges sit one and two standard deviations either side of the mean.
# pd.cut uses right-closed intervals by default, so a value equal to an edge
# lands in the lower bin.
bin_names = ["L", "ML", "M", "MH", "H"]
for col in numerics:
    # income had some errors, for some reason: non-positive values are
    # replaced with 0.01 so that the logarithm is defined.
    # NOTE(review): indentation was lost in this export; the log transform is
    # assumed to apply to income only -- confirm against the notebook.
    if col == 'income':
        fr.loc[fr[col] <= 0, col] = 0.01
        fr[col] = np.log(fr[col])

    center = fr[col].mean()
    spread = fr[col].std()

    # open-ended outer bins catch everything beyond two standard deviations
    edges = [-np.inf]
    edges += [center + k * spread for k in (-2, -1, 1, 2)]
    edges.append(np.inf)

    # assign each value its bin label, stored as a categorical column
    binned = pd.cut(fr[col], bins=edges, labels=bin_names)
    fr[col] = binned.astype('category')
fr[numerics].head(10)

| | income | loan_amount | interest_rate | total_loan_costs | origination_charges | discount_points | lender_credits | loan_term | intro_rate_period | property_value | total_units | tract_population | tract_minority_population_percent | ffiec_msa_md_median_family_income | tract_to_msa_income_percentage | tract_owner_occupied_units | tract_one_to_four_family_homes | tract_median_age_of_housing_units | loan_to_value_ratio |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | M | MH | L | H | H | M | H | M | H | M | M | M | M | M | M | MH | MH | M | M |
| 1 | L | MH | L | M | M | M | M | M | H | H | M | M | ML | M | H | M | L | ML | L |
| 2 | MH | H | ML | M | M | M | M | M | H | H | M | H | M | M | H | M | L | ML | M |
| 3 | M | MH | ML | M | M | M | M | M | H | M | M | H | M | M | H | H | H | ML | M |
| 4 | M | H | ML | M | M | M | M | M | H | H | M | M | M | H | MH | M | M | MH | M |
| 5 | MH | MH | M | MH | M | M | M | M | M | MH | M | M | ML | M | M | M | M | M | M |
| 6 | MH | H | M | M | M | M | M | M | MH | H | M | M | MH | M | M | M | M | M | M |
| 7 | MH | M | ML | M | M | M | M | M | M | H | M | M | ML | M | H | MH | ML | MH | L |
| 8 | M | H | ML | M | M | M | M | M | H | H | M | H | M | M | MH | MH | ML | H | M |
| 9 | M | M | ML | M | M | M | M | M | H | M | M | M | ML | M | MH | MH | M | ML | ML |
After transforming numerics, the one-hot encoded columns extracted from race, ethnicity, and underwriting system required separation from the rest of the data so that all remaining categorical columns could be converted to a one-hot encoding.
The cell below performs the separation of the race, ethnicity, and underwriting system columns.
# Pull the bit-packed columns into their own frame. They are expanded below
# into one 0/1 indicator column per category, so they need no further
# one-hot encoding.
packed_columns = [
    'applicant_race',
    'applicant_ethnicity',
    'co-applicant_race',
    'co-applicant_ethnicity',
    'aus'  # may need to exclude this/comment it out...
]
fr_bin = fr[packed_columns].copy()

# One indicator per (field, category): 1 when the category's bit is set in
# the packed value, otherwise 0.
for field, flag_names in new_mapper.items():
    for flag, category in flag_names.items():
        bit_is_set = (fr_bin[field] & flag) > 0
        fr_bin[field + '_' + category] = bit_is_set.astype(int)

# The packed originals are not needed once they have been expanded.
fr_bin.drop(columns=packed_columns, inplace=True)

# Take the packed columns out of the main frame as well, together with the
# three additional columns listed here.
fr.drop(
    columns=packed_columns + [
        'denial_reason',
        'outcome',
        'action_taken'
    ],
    inplace=True
)
display(
fr.head(10),
fr_bin.head(10)
)

| | derived_sex | purchaser_type | preapproval | open-end_line_of_credit | loan_amount | loan_to_value_ratio | interest_rate | total_loan_costs | origination_charges | discount_points | ... | applicant_age | co-applicant_age | tract_population | tract_minority_population_percent | ffiec_msa_md_median_family_income | tract_to_msa_income_percentage | tract_owner_occupied_units | tract_one_to_four_family_homes | tract_median_age_of_housing_units | company |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Sex Not Available | 0 | 2 | 2 | MH | M | L | H | H | M | ... | 3.0 | 7.0 | M | M | M | M | MH | MH | M | JP Morgan |
| 1 | Male | 0 | 2 | 2 | MH | L | L | M | M | M | ... | 1.0 | 8.0 | M | ML | M | H | M | L | ML | JP Morgan |
| 2 | Sex Not Available | 0 | 1 | 2 | H | M | ML | M | M | M | ... | 3.0 | 8.0 | H | M | M | H | M | L | ML | JP Morgan |
| 3 | Male | 0 | 2 | 2 | MH | M | ML | M | M | M | ... | 4.0 | 8.0 | H | M | M | H | H | H | ML | JP Morgan |
| 4 | Joint | 0 | 2 | 2 | H | M | ML | M | M | M | ... | 1.0 | 1.0 | M | M | H | MH | M | M | MH | JP Morgan |
| 5 | Joint | 0 | 1 | 2 | MH | M | M | MH | M | M | ... | 2.0 | 3.0 | M | ML | M | M | M | M | M | JP Morgan |
| 6 | Joint | 0 | 2 | 2 | H | M | M | M | M | M | ... | 2.0 | 1.0 | M | MH | M | M | M | M | M | JP Morgan |
| 7 | Sex Not Available | 0 | 1 | 2 | M | L | ML | M | M | M | ... | 1.0 | 1.0 | M | ML | M | H | MH | ML | MH | JP Morgan |
| 8 | Joint | 0 | 1 | 2 | H | M | ML | M | M | M | ... | 1.0 | 1.0 | H | M | M | MH | MH | ML | H | JP Morgan |
| 9 | Sex Not Available | 0 | 1 | 2 | M | ML | ML | M | M | M | ... | 1.0 | 1.0 | M | ML | M | MH | MH | M | ML | JP Morgan |
10 rows × 37 columns
| | applicant_race_American Indian/Alaska Native | applicant_race_Asian | applicant_race_Asian Indian | applicant_race_Chinese | applicant_race_Filipino | applicant_race_Japanese | applicant_race_Korean | applicant_race_Vietnamese | applicant_race_Other Asian | applicant_race_Black/African American | ... | co-applicant_ethnicity_Not Applicable | co-applicant_ethnicity_No Co-applicant | aus_Desktop Underwriter | aus_Loan Prospector/Product Advisor | aus_TOTAL Scorecard | aus_GUS | aus_Other | aus_Internal Proprietary | aus_Not applicable | aus_Exempt |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 6 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 8 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
10 rows × 64 columns
The below cell takes the remaining non-binary encoded data and performs one-hot encoding.
# One-hot encode every column that is still left in `fr`.
ohe = OneHotEncoder()
out = ohe.fit_transform(fr)
outdf = pd.DataFrame(
    out.toarray(),
    columns=ohe.get_feature_names_out().tolist()
)

# Keep a copy taken before the race / ethnicity / underwriting-system
# indicator columns are added.
outdf_nr = outdf.copy()

# Bring over the columns that were already one-hot encoded.
for name, indicator in fr_bin.items():
    outdf[name] = indicator.copy()

# Store every column as an integer.
outdf = outdf.astype(int)
#display the output
display(outdf.head())

| | derived_sex_Female | derived_sex_Joint | derived_sex_Male | derived_sex_Sex Not Available | purchaser_type_0 | purchaser_type_1 | purchaser_type_3 | purchaser_type_5 | purchaser_type_6 | purchaser_type_9 | ... | co-applicant_ethnicity_Not Applicable | co-applicant_ethnicity_No Co-applicant | aus_Desktop Underwriter | aus_Loan Prospector/Product Advisor | aus_TOTAL Scorecard | aus_GUS | aus_Other | aus_Internal Proprietary | aus_Not applicable | aus_Exempt |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 2 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 3 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 4 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
5 rows × 243 columns
G.1.2 Check for Buggy Columns
MCA requires each one-hot encoded column to contain at least one 0 and at least one 1. The cell below checks for any columns that do not meet this requirement; any such columns are excluded from the MCA.
# Report every column whose distinct values are not exactly {0, 1}, i.e.
# columns that only have one value/result.
for col in outdf.columns:
    if set(outdf[col].unique()) != {0, 1}:
        print(col)
# Output:
# applicant_race_No Co-applicant
# applicant_ethnicity_No Co-applicant
# aus_GUS
# aus_Exempt
G.1.3 MCA With Protected Classes
# Fit a multiple correspondence analysis on the one-hot frame that still
# contains the protected-class indicator columns.
ncomp = 181
mcaNd = MCA(n_components=ncomp, one_hot=False)

# Columns with no variability (only a single value) are left out of the fit.
constant_columns = [
    'applicant_race_No Co-applicant',
    'applicant_ethnicity_No Co-applicant',
    'aus_GUS',
    'aus_Exempt'
]
xformNd = mcaNd.fit_transform(outdf.drop(columns=constant_columns))

# Name the components MC1, MC2, ... in order.
xformNd.columns = [
    'MC{}'.format(number)
    for number in range(1, len(xformNd.columns) + 1)
]
#head of the MCA dataframe
xformNd.head(10)

| | MC1 | MC2 | MC3 | MC4 | MC5 | MC6 | MC7 | MC8 | MC9 | MC10 | ... | MC172 | MC173 | MC174 | MC175 | MC176 | MC177 | MC178 | MC179 | MC180 | MC181 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.451521 | 6.128471 | 61.481532 | -0.582675 | 0.142174 | -0.446411 | -0.155301 | 0.005913 | 0.387730 | -0.210268 | ... | -0.038055 | -0.012424 | -0.000177 | -0.001205 | 0.009249 | -0.005225 | -0.003449 | 0.001742 | 0.003897 | -0.004720 |
| 1 | -0.319092 | 0.052840 | 0.031634 | 0.207948 | -0.063021 | 0.541456 | 0.596652 | -0.107125 | 0.091412 | 0.025145 | ... | 0.046762 | -0.086929 | -0.002160 | 0.006824 | -0.084655 | -0.006127 | -0.002646 | 0.012392 | 0.002616 | 0.000280 |
| 2 | -0.233362 | 0.054331 | 0.034305 | -0.336069 | 0.122278 | 0.455142 | 1.004323 | -0.075047 | 0.135570 | -0.089234 | ... | 0.028105 | -0.080263 | -0.000241 | 0.011704 | -0.138774 | -0.004096 | 0.011195 | 0.001980 | -0.000695 | 0.002452 |
| 3 | -0.302904 | 0.025349 | 0.025218 | 0.236800 | -0.090089 | 0.174363 | 0.410053 | -0.525227 | 0.699016 | -0.263994 | ... | 0.046171 | -0.085637 | 0.000008 | 0.006094 | -0.074695 | -0.002369 | 0.002111 | 0.009786 | -0.001861 | 0.001232 |
| 4 | 0.634415 | 0.012939 | 0.028819 | 0.400410 | -0.163398 | 0.245667 | 0.736226 | 0.056253 | -0.248964 | 0.088020 | ... | 0.026770 | -0.069725 | 0.000374 | 0.009017 | -0.129854 | -0.002576 | 0.012426 | 0.013691 | 0.002016 | 0.000075 |
| 5 | 0.684141 | 0.009116 | 0.007519 | 0.267646 | -0.094813 | 0.307864 | 0.136132 | -0.002112 | 0.192184 | -0.031398 | ... | 0.212160 | 0.006548 | -0.001180 | 0.011061 | -0.075641 | 0.006081 | 0.001703 | 0.013096 | 0.002621 | 0.002301 |
| 6 | 0.649954 | 0.004748 | 0.014900 | 0.356419 | -0.135796 | 0.275216 | 0.713745 | 0.010690 | -0.266578 | -0.003736 | ... | 0.032658 | -0.058016 | 0.001301 | 0.007645 | -0.091264 | 0.001304 | -0.112194 | -0.005152 | -0.045460 | -0.004703 |
| 7 | 0.569649 | 0.039957 | 0.014486 | -0.852346 | 0.334152 | 0.879204 | 0.562587 | 0.009737 | -0.127637 | 0.000647 | ... | 0.059040 | -0.090289 | -0.001719 | 0.013249 | -0.097626 | 0.008894 | 0.001857 | 0.000997 | 0.003241 | 0.003394 |
| 8 | 0.643866 | 0.013070 | 0.028519 | 0.316946 | -0.121840 | 0.209269 | 0.495583 | -0.080518 | 0.103247 | -0.041161 | ... | 0.031299 | -0.071656 | 0.000867 | 0.009461 | -0.135678 | -0.001970 | 0.008649 | -0.007681 | 0.002351 | 0.002077 |
| 9 | 0.465507 | 0.023988 | 0.021353 | -1.000502 | 0.358704 | 0.639682 | 0.285072 | -0.181713 | 0.044954 | 0.006286 | ... | 0.066260 | -0.093161 | 0.000555 | 0.006261 | -0.064012 | -0.003694 | 0.002558 | 0.000544 | -0.000423 | 0.001978 |
10 rows × 181 columns
#summary of eigenvalues
display(mcaNd.eigenvalues_summary)

| | eigenvalue | % of variance | % of variance (cumulative) |
|---|---|---|---|
| component | |||
| 0 | 0.215 | 4.69% | 4.69% |
| 1 | 0.167 | 3.63% | 8.32% |
| 2 | 0.166 | 3.62% | 11.93% |
| 3 | 0.115 | 2.49% | 14.43% |
| 4 | 0.102 | 2.21% | 16.64% |
| ... | ... | ... | ... |
| 176 | 0.001 | 0.02% | 99.96% |
| 177 | 0.001 | 0.01% | 99.97% |
| 178 | 0.000 | 0.01% | 99.98% |
| 179 | 0.000 | 0.01% | 99.99% |
| 180 | 0.000 | 0.01% | 100.00% |
181 rows × 3 columns
#summary of column contributions to each component, sorted by the first component (e.g)
#max eigenvalue's top 10 contributors
display(mcaNd.column_contributions_.sort_values(by=0,ascending=False).head(10))

| | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| co-applicant_sex_observed_2 | 0.061418 | 0.000009 | 9.547786e-06 | 0.000537 | 0.000206 | 0.000370 | 0.001228 | 7.078606e-05 | 0.000358 | 0.000027 | ... | 9.090788e-06 | 0.000057 | 9.194148e-04 | 2.548507e-04 | 0.000192 | 2.767818e-06 | 0.003954 | 0.000062 | 0.002840 | 0.005715 |
| co-applicant_ethnicity_observed_2 | 0.061381 | 0.000009 | 9.545378e-06 | 0.000564 | 0.000261 | 0.000386 | 0.001226 | 7.261382e-05 | 0.000358 | 0.000026 | ... | 1.594640e-06 | 0.000037 | 6.545288e-06 | 1.604834e-03 | 0.000099 | 2.790837e-06 | 0.003903 | 0.000058 | 0.002822 | 0.005734 |
| co-applicant_race_observed_2 | 0.061369 | 0.000009 | 9.542219e-06 | 0.000572 | 0.000275 | 0.000386 | 0.001215 | 6.994850e-05 | 0.000360 | 0.000026 | ... | 3.701321e-06 | 0.000033 | 1.013418e-03 | 4.079012e-04 | 0.000224 | 3.182675e-06 | 0.003976 | 0.000059 | 0.002853 | 0.005723 |
| derived_sex_Joint | 0.053307 | 0.000027 | 5.610064e-06 | 0.008021 | 0.000851 | 0.000131 | 0.002191 | 5.000923e-05 | 0.000005 | 0.000049 | ... | 1.003506e-03 | 0.000284 | 9.737514e-05 | 1.936380e-05 | 0.000046 | 1.287449e-07 | 0.000169 | 0.003894 | 0.000058 | 0.000248 |
| co-applicant_credit_score_type_9 | 0.047954 | 0.000029 | 2.215064e-06 | 0.003363 | 0.000227 | 0.004860 | 0.002476 | 8.910958e-07 | 0.001545 | 0.015377 | ... | 9.534944e-06 | 0.000128 | 1.583074e-08 | 1.048725e-08 | 0.000330 | 1.582662e-04 | 0.003368 | 0.000068 | 0.003135 | 0.006879 |
| co-applicant_ethnicity_Not Hispanic/Latino | 0.046580 | 0.000019 | 4.598133e-06 | 0.012423 | 0.001830 | 0.000412 | 0.000204 | 9.886380e-03 | 0.002706 | 0.000063 | ... | 5.756610e-07 | 0.000037 | 3.882840e-06 | 6.085041e-06 | 0.000258 | 6.342339e-06 | 0.004570 | 0.000095 | 0.009585 | 0.204413 |
| co-applicant_ethnicity_observed_4 | 0.045165 | 0.000006 | 3.893077e-07 | 0.000104 | 0.000036 | 0.000117 | 0.000839 | 7.990930e-05 | 0.000380 | 0.000045 | ... | 8.054315e-05 | 0.000096 | 1.192476e-06 | 1.178042e-06 | 0.000159 | 1.314536e-06 | 0.000200 | 0.000037 | 0.002541 | 0.000430 |
| co-applicant_age_8.0 | 0.045165 | 0.000006 | 3.893077e-07 | 0.000104 | 0.000036 | 0.000117 | 0.000839 | 7.990930e-05 | 0.000380 | 0.000045 | ... | 8.054315e-05 | 0.000096 | 1.192476e-06 | 1.178042e-06 | 0.000159 | 1.314536e-06 | 0.000200 | 0.000037 | 0.002541 | 0.000430 |
| co-applicant_race_observed_4 | 0.045165 | 0.000006 | 3.893077e-07 | 0.000104 | 0.000036 | 0.000117 | 0.000839 | 7.990930e-05 | 0.000380 | 0.000045 | ... | 8.054315e-05 | 0.000096 | 1.192476e-06 | 1.178042e-06 | 0.000159 | 1.314536e-06 | 0.000200 | 0.000037 | 0.002541 | 0.000430 |
| co-applicant_sex_observed_4 | 0.045165 | 0.000006 | 3.893077e-07 | 0.000104 | 0.000036 | 0.000117 | 0.000839 | 7.990930e-05 | 0.000380 | 0.000045 | ... | 8.054315e-05 | 0.000096 | 1.192476e-06 | 1.178042e-06 | 0.000159 | 1.314536e-06 | 0.000200 | 0.000037 | 0.002541 | 0.000430 |
10 rows × 181 columns
G.1.4 MCA without Protected Classes
# Second MCA, fitted without any information on age, gender, or race.
## need 99 components to get 100% of variance
## may need these two versions to do full compare
# ncomp = 90
ncomp = 100
mcaNd_nr = MCA(n_components=ncomp, one_hot=False)

# One-hot columns that are withheld from this fit.
protected_columns = [
    'derived_sex_Female',
    'derived_sex_Joint',
    'derived_sex_Male',
    'derived_sex_Sex Not Available',
    'applicant_ethnicity_observed_1',
    'applicant_ethnicity_observed_2',
    'applicant_ethnicity_observed_3',
    'co-applicant_ethnicity_observed_1',
    'co-applicant_ethnicity_observed_2',
    'co-applicant_ethnicity_observed_3',
    'co-applicant_ethnicity_observed_4',
    'applicant_race_observed_1',
    'applicant_race_observed_2',
    'applicant_race_observed_3',
    'co-applicant_race_observed_1',
    'co-applicant_race_observed_2',
    'co-applicant_race_observed_3',
    'co-applicant_race_observed_4',
    'applicant_sex_1',
    'applicant_sex_2',
    'applicant_sex_3',
    'applicant_sex_4',
    'applicant_sex_6',
    'co-applicant_sex_1',
    'co-applicant_sex_2',
    'co-applicant_sex_3',
    'co-applicant_sex_4',
    'co-applicant_sex_5',
    'co-applicant_sex_6',
    'applicant_sex_observed_1',
    'applicant_sex_observed_2',
    'applicant_sex_observed_3',
    'co-applicant_sex_observed_1',
    'co-applicant_sex_observed_2',
    'co-applicant_sex_observed_3',
    'co-applicant_sex_observed_4',
    'applicant_age_0.0',
    'applicant_age_1.0',
    'applicant_age_2.0',
    'applicant_age_3.0',
    'applicant_age_4.0',
    'applicant_age_5.0',
    'applicant_age_6.0',
    'applicant_age_7.0',
    'co-applicant_age_0.0',
    'co-applicant_age_1.0',
    'co-applicant_age_2.0',
    'co-applicant_age_3.0',
    'co-applicant_age_4.0',
    'co-applicant_age_5.0',
    'co-applicant_age_6.0',
    'co-applicant_age_7.0',
    'co-applicant_age_8.0'
]

# drop() returns a new frame here, so outdf_nr itself is left untouched.
xformNd_nr = mcaNd_nr.fit_transform(outdf_nr.drop(columns=protected_columns))

# Name the components MC1, MC2, ... in order.
xformNd_nr.columns = [
    'MC{}'.format(number)
    for number in range(1, len(xformNd_nr.columns) + 1)
]
#head of the MCA dataframe
xformNd_nr.head(10)

| | MC1 | MC2 | MC3 | MC4 | MC5 | MC6 | MC7 | MC8 | MC9 | MC10 | ... | MC91 | MC92 | MC93 | MC94 | MC95 | MC96 | MC97 | MC98 | MC99 | MC100 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.764720 | 0.723758 | 0.125389 | -0.056585 | 0.170870 | 0.548785 | -0.424036 | -0.241991 | 0.097430 | 0.513754 | ... | -0.132230 | -0.380647 | -0.171881 | 0.643932 | -0.047217 | -0.006711 | -0.111764 | 0.019441 | -0.075462 | -0.008763 |
| 1 | 0.140150 | 1.022436 | 0.076617 | -0.388087 | 0.022334 | -0.423135 | -0.292322 | 0.252731 | -0.214779 | 0.088909 | ... | 0.209340 | -0.153709 | 0.088645 | -0.070257 | -0.278045 | 0.102762 | -0.002714 | -0.035754 | -0.016209 | -0.007529 |
| 2 | -0.382006 | 1.207435 | 0.016253 | -0.675044 | 0.254554 | -0.804334 | -0.305620 | 0.294332 | -0.503778 | -0.252445 | ... | 0.530322 | 0.212954 | -0.084617 | 0.070987 | 0.042070 | -0.008835 | -0.001938 | 0.005713 | -0.031815 | -0.006028 |
| 3 | -0.322021 | 0.627399 | -1.025139 | -0.423863 | 1.221655 | -0.311233 | -0.274495 | 0.188478 | -0.149330 | -0.948647 | ... | -0.146744 | -0.033300 | -0.009274 | 0.073137 | 0.166896 | -0.090431 | -0.140857 | 0.015810 | -0.021260 | -0.004191 |
| 4 | -0.222096 | 0.986399 | 0.342411 | -0.517554 | -0.440269 | -0.432887 | -0.167812 | 0.169848 | -0.530523 | -0.208360 | ... | -0.061780 | -0.004806 | 0.018112 | -0.017873 | 0.077980 | -0.052323 | 0.012778 | -0.014635 | -0.026638 | -0.004269 |
| 5 | -0.221344 | 0.601721 | 0.008450 | -0.185320 | -0.387269 | 0.006025 | 0.150320 | 0.375523 | 0.461518 | -0.034283 | ... | 0.022191 | -0.134455 | -0.361678 | 0.100214 | -0.095434 | 0.000053 | 0.081231 | 0.031581 | 0.214385 | 0.007798 |
| 6 | -0.042042 | 0.825237 | 0.223653 | -0.488704 | -0.353445 | -0.016609 | -0.012096 | 0.187879 | -0.816343 | -0.194093 | ... | -0.090645 | -0.056575 | 0.058021 | -0.051419 | 0.022802 | 0.039888 | -0.002228 | -0.028715 | -0.030764 | -0.000873 |
| 7 | 0.072632 | 1.034400 | 0.031038 | -0.342971 | -0.175534 | -0.413475 | -0.225371 | 0.303189 | 0.460221 | 0.414513 | ... | 0.038167 | -0.069464 | 0.040725 | -0.028568 | -0.415168 | 0.299228 | 0.182213 | -0.011537 | -0.010310 | 0.011517 |
| 8 | -0.307864 | 0.906534 | -0.026393 | -0.526053 | 0.337134 | -0.601458 | -0.253877 | 0.238076 | -0.385210 | 0.078999 | ... | 0.605544 | 0.285794 | -0.096174 | 0.047887 | 0.039686 | 0.239895 | -0.010113 | 0.011185 | -0.025836 | -0.002878 |
| 9 | -0.033257 | 0.346177 | -0.359762 | -0.135270 | -0.053814 | -0.052379 | -0.025562 | -0.059310 | -0.197474 | 0.223686 | ... | -0.163153 | -0.069786 | -0.003615 | 0.146050 | 0.032843 | 0.194863 | -0.094570 | 0.014722 | -0.002103 | -0.003909 |
10 rows × 100 columns
#summary of eigenvalues
display(mcaNd_nr.eigenvalues_summary)

| | eigenvalue | % of variance | % of variance (cumulative) |
|---|---|---|---|
| component | |||
| 0 | 0.124 | 3.21% | 3.21% |
| 1 | 0.116 | 3.01% | 6.23% |
| 2 | 0.100 | 2.61% | 8.83% |
| 3 | 0.086 | 2.23% | 11.06% |
| 4 | 0.085 | 2.22% | 13.28% |
| ... | ... | ... | ... |
| 95 | 0.009 | 0.24% | 99.52% |
| 96 | 0.007 | 0.17% | 99.69% |
| 97 | 0.007 | 0.17% | 99.86% |
| 98 | 0.004 | 0.11% | 99.97% |
| 99 | 0.001 | 0.03% | 100.00% |
100 rows × 3 columns
#summary of column contributions to each component, sorted by the first component (e.g)
#max eigenvalue's top contributors
display(mcaNd_nr.column_contributions_.sort_values(by=0,ascending=False).head(20))

| | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| origination_charges_H | 0.088933 | 8.942172e-03 | 3.623890e-02 | 0.034505 | 0.012401 | 0.097634 | 0.045595 | 0.008395 | 0.000913 | 9.811357e-05 | ... | 0.000608 | 3.762587e-02 | 0.113043 | 0.012385 | 0.002687 | 6.808054e-05 | 7.799676e-04 | 1.020531e-04 | 4.273904e-01 | 3.882523e-07 |
| total_loan_costs_H | 0.084998 | 1.035486e-02 | 3.586364e-02 | 0.029490 | 0.011701 | 0.083204 | 0.043297 | 0.006285 | 0.000281 | 7.044364e-05 | ... | 0.000008 | 9.952770e-05 | 0.010288 | 0.170899 | 0.013513 | 1.634182e-04 | 4.238083e-05 | 1.011652e-04 | 3.245955e-01 | 7.379104e-07 |
| discount_points_H | 0.076381 | 6.579247e-03 | 2.680753e-02 | 0.031293 | 0.012241 | 0.078079 | 0.024438 | 0.004070 | 0.007833 | 1.578854e-04 | ... | 0.001013 | 6.219218e-02 | 0.087251 | 0.344232 | 0.025247 | 1.838789e-05 | 6.597205e-04 | 7.203051e-04 | 1.090670e-02 | 8.576594e-08 |
| loan_amount_ML | 0.054607 | 1.096792e-02 | 1.401917e-02 | 0.000428 | 0.003975 | 0.000037 | 0.098195 | 0.098392 | 0.010839 | 4.374160e-04 | ... | 0.000399 | 7.009303e-05 | 0.000026 | 0.014265 | 0.203888 | 5.612735e-04 | 1.900939e-01 | 2.722371e-02 | 1.364451e-05 | 4.457975e-06 |
| total_loan_costs_MH | 0.049097 | 2.092212e-03 | 3.964357e-04 | 0.006480 | 0.000034 | 0.000908 | 0.086860 | 0.125954 | 0.003481 | 7.030098e-05 | ... | 0.001394 | 5.438767e-02 | 0.189695 | 0.001752 | 0.000194 | 7.493222e-08 | 1.082877e-07 | 6.609995e-06 | 6.789266e-02 | 2.494463e-07 |
| origination_charges_MH | 0.047695 | 5.291334e-03 | 1.314798e-05 | 0.007483 | 0.000316 | 0.002069 | 0.114474 | 0.159078 | 0.003506 | 1.969190e-04 | ... | 0.000797 | 6.254710e-02 | 0.309200 | 0.109115 | 0.004430 | 6.008099e-06 | 7.908615e-05 | 8.730695e-08 | 6.953100e-02 | 1.126411e-07 |
| property_value_ML | 0.043231 | 1.311904e-02 | 2.361116e-02 | 0.000225 | 0.006645 | 0.001983 | 0.079686 | 0.111624 | 0.002818 | 1.176477e-04 | ... | 0.003947 | 2.806396e-04 | 0.000114 | 0.008425 | 0.217519 | 2.815655e-06 | 1.540480e-01 | 2.014790e-02 | 2.489933e-05 | 5.612855e-06 |
| discount_points_MH | 0.033078 | 4.344227e-03 | 3.090285e-05 | 0.006287 | 0.000074 | 0.002746 | 0.087229 | 0.116690 | 0.007014 | 3.781955e-04 | ... | 0.000003 | 3.379782e-03 | 0.045276 | 0.146388 | 0.009086 | 1.432915e-05 | 1.318505e-05 | 2.382708e-05 | 4.700539e-04 | 2.749443e-08 |
| purchaser_type_0 | 0.032864 | 5.101933e-02 | 5.134128e-05 | 0.000131 | 0.000346 | 0.017773 | 0.005854 | 0.002768 | 0.024542 | 2.130675e-05 | ... | 0.013549 | 2.877823e-04 | 0.000093 | 0.000437 | 0.000206 | 1.423601e-06 | 5.923642e-04 | 1.386355e-03 | 9.410552e-07 | 2.602806e-07 |
| open-end_line_of_credit_1 | 0.030098 | 3.497459e-02 | 3.147512e-03 | 0.270842 | 0.000043 | 0.016216 | 0.002723 | 0.000650 | 0.000119 | 7.713089e-05 | ... | 0.000072 | 4.264123e-06 | 0.000008 | 0.000191 | 0.000081 | 2.761905e-08 | 1.606608e-05 | 2.476935e-06 | 3.188185e-07 | 4.925486e-01 |
| income_ML | 0.027202 | 2.880571e-03 | 5.876898e-03 | 0.004245 | 0.003249 | 0.000997 | 0.036600 | 0.059010 | 0.000014 | 1.913531e-05 | ... | 0.000243 | 3.641664e-06 | 0.000100 | 0.000925 | 0.001264 | 1.180952e-04 | 1.426697e-04 | 2.402477e-05 | 3.522036e-07 | 4.322885e-06 |
| applicant_credit_score_type_8 | 0.023624 | 2.252189e-02 | 1.936850e-03 | 0.221591 | 0.000008 | 0.014466 | 0.001494 | 0.000439 | 0.006472 | 1.602758e-04 | ... | 0.000325 | 2.646222e-05 | 0.000011 | 0.000186 | 0.000020 | 7.810669e-07 | 4.211464e-05 | 5.847733e-04 | 4.673938e-07 | 3.942036e-01 |
| debt_to_income_ratio_18.0 | 0.022513 | 1.081502e-02 | 1.665984e-03 | 0.020357 | 0.000969 | 0.006126 | 0.002172 | 0.001373 | 0.028440 | 4.319469e-05 | ... | 0.000344 | 3.474722e-07 | 0.000018 | 0.000296 | 0.000127 | 2.200983e-06 | 3.635712e-06 | 3.165596e-04 | 5.901178e-07 | 1.647005e-06 |
| loan_amount_MH | 0.022203 | 1.950365e-02 | 2.109436e-03 | 0.000615 | 0.000016 | 0.001156 | 0.001097 | 0.000228 | 0.009686 | 1.696359e-04 | ... | 0.002764 | 2.114010e-05 | 0.000608 | 0.000015 | 0.012514 | 1.093064e-05 | 5.994630e-02 | 1.001983e-02 | 1.617146e-04 | 3.625249e-07 |
| company_Navy Federal Credit Union | 0.021360 | 1.382622e-02 | 7.652527e-03 | 0.024050 | 0.000687 | 0.138437 | 0.017836 | 0.009102 | 0.041316 | 8.876005e-06 | ... | 0.007175 | 8.647908e-04 | 0.000281 | 0.001228 | 0.000032 | 2.159210e-05 | 2.375829e-02 | 1.576080e-01 | 4.473596e-05 | 1.660420e-04 |
| co-applicant_credit_score_type_9 | 0.020690 | 4.557131e-04 | 6.755788e-07 | 0.000520 | 0.003270 | 0.016488 | 0.001447 | 0.002080 | 0.007025 | 5.846616e-07 | ... | 0.000218 | 2.176786e-06 | 0.000032 | 0.000010 | 0.000025 | 1.267790e-06 | 4.024599e-03 | 1.622137e-02 | 2.032551e-05 | 4.518054e-05 |
| origination_charges_M | 0.018343 | 1.853453e-08 | 2.048184e-03 | 0.004807 | 0.000418 | 0.003443 | 0.002822 | 0.009952 | 0.000624 | 4.297237e-05 | ... | 0.000008 | 9.540820e-04 | 0.008231 | 0.015886 | 0.001038 | 1.369357e-06 | 8.420563e-05 | 5.919102e-06 | 5.366124e-02 | 6.069544e-08 |
| purchaser_type_1 | 0.015311 | 1.264095e-02 | 7.400198e-05 | 0.000004 | 0.002262 | 0.002061 | 0.001437 | 0.001671 | 0.000003 | 3.912600e-05 | ... | 0.005013 | 9.121422e-05 | 0.000009 | 0.000021 | 0.000058 | 6.294184e-06 | 2.411526e-04 | 7.236456e-05 | 4.126922e-07 | 1.541073e-10 |
| total_loan_costs_M | 0.015282 | 1.597282e-04 | 3.768925e-03 | 0.001067 | 0.001801 | 0.001352 | 0.001175 | 0.002676 | 0.009979 | 2.529182e-05 | ... | 0.000004 | 4.037010e-03 | 0.008549 | 0.008052 | 0.000817 | 3.548314e-06 | 7.984920e-07 | 6.811171e-05 | 3.599548e-02 | 1.046963e-08 |
| loan_to_value_ratio_MH | 0.014452 | 4.061576e-03 | 5.026527e-05 | 0.003849 | 0.002806 | 0.085140 | 0.012913 | 0.006081 | 0.017323 | 1.591876e-05 | ... | 0.000027 | 7.783720e-05 | 0.000164 | 0.000191 | 0.000906 | 8.208134e-06 | 1.281691e-04 | 1.841615e-02 | 1.551564e-09 | 1.157982e-05 |
20 rows × 100 columns
#forgot to drop from outdf_nr earlier. fixing...
outdf_nr.drop(
columns=['derived_sex_Female','derived_sex_Joint','derived_sex_Male',
'derived_sex_Sex Not Available'],inplace=True,axis=1
)

G.1.5 Output the Results to CSV
# Export the MCA results and the encoded data to CSV, then plot the first two
# components coloured by outcome.

# Column contributions of both fits, reshaped for export: the index (one
# entry per one-hot column) becomes a 'Column' field and the components are
# renamed MC1, MC2, ...
out = mcaNd.column_contributions_.copy()
out_nr = mcaNd_nr.column_contributions_.copy()
out.reset_index(inplace=True)
out_nr.reset_index(inplace=True)
out.columns = ['Column'] + ["MC{}".format(i + 1) for i in range(len(out.columns) - 1)]
out_nr.columns = ['Column'] + ["MC{}".format(i + 1) for i in range(len(out_nr.columns) - 1)]

# Eigenvalue summaries and column contributions ("npc" = fit without the
# protected-class columns).
mcaNd.eigenvalues_summary.to_csv('../data/mca-Nd-eig.csv', index=False)
out.to_csv('../data/mca-Nd-ColCont.csv', index=False)
mcaNd_nr.eigenvalues_summary.to_csv('../data/mca-Nd-npc-eig.csv', index=False)
out_nr.to_csv('../data/mca-Nd-npc-ColCont.csv', index=False)

# One-hot encoded inputs.
outdf.to_csv('../data/data-one-hot.csv', index=False)
outdf_nr.to_csv('../data/data-one-hot-npc.csv', index=False)

# Transformed data, one row per input record.
xformNd.to_csv('../data/mcaNd.csv', index=False)
xformNd_nr.to_csv('../data/mcaNd-npc.csv', index=False)

# Plot MC1 against MC2, coloured by outcome.
# The transformed rows in `xformNd` are what line up one-to-one with `labels`.
# `out` holds the column-contribution table (one row per one-hot column), so
# pairing it with the per-record outcome would plot contributions against the
# outcomes of unrelated rows.
tmp = pd.concat([xformNd, labels], axis=1)
sns.scatterplot(
    data=tmp,
    x='MC1', y='MC2',
    hue='outcome'
)
