import pandas as pd, numpy as np, seaborn as sns, matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import OneHotEncoder
from prince import MCA
fr = pd.read_csv('../data/final_clean.csv')
Appendix B — Component Analysis Code (PCA and MCA)
B.1 Module and Data Imports
B.1.1 Principal Component Analysis
B.1.1.1 Select & Scale Numeric Columns
# Names of the numeric columns that are passed on to PCA.
# Entries that are commented out are not part of the selection.
columns = [
    # 'loan_amount',
    # 'property_value',
    'income',
    'interest_rate',
    'total_loan_costs',
    'loan_to_value_ratio',
    # 'origination_costs',
    # 'discount_points',
    # 'lender_credits',
    'loan_term',
    'intro_rate_period',
    'total_units',
    'tract_minority_population_percent',
    'tract_population',
    'tract_to_msa_income_percentage',
    'tract_owner_occupied_units',
    'tract_one_to_four_family_homes',
    'tract_median_age_of_housing_units',
    # 'debt_to_income_ratio',
]

# Keep every row of `fr`, restricted to the columns listed above.
X = fr.loc[:, columns]
X = StandardScaler().fit_transform(X)
B.1.1.2 Perform 2D PCA
# Fit a two-component PCA on X and collect the component scores in a frame
# (columns 0 and 1), with the boolean outcome label attached alongside.
pca2d = PCA(n_components=2)
result2d = pd.DataFrame(pca2d.fit_transform(X)).assign(
    outcome=fr['outcome'].astype(bool)
)

# Running total of the per-component explained variance (the eigenvalues).
display(pca2d.explained_variance_.cumsum())

# Plot component 0 against component 1, coloured by outcome.
sns.scatterplot(x=0, y=1, hue='outcome', data=result2d)
np.cumsum(pca2d.explained_variance_ratio_)
array([2.71077093, 4.35879384])
array([0.20851998, 0.33529045])

B.1.1.3 Perform 3D PCA
# Fit a three-component PCA on X and collect the component scores in a frame
# (columns 0, 1 and 2), with the boolean outcome label attached alongside.
pca3d = PCA(n_components=3)
result3d = pd.DataFrame(pca3d.fit_transform(X)).assign(
    outcome=fr['outcome'].astype(bool)
)

# Running total of the per-component explained variance (the eigenvalues).
display(pca3d.explained_variance_.cumsum())

# NOTE(review): a bare expression is only echoed by a notebook when it is the
# last statement of its cell.
result3d
np.cumsum(pca3d.explained_variance_ratio_)
array([2.71077093, 4.35879384, 5.61886437])
array([0.20851998, 0.33529045, 0.43221855])
# 3D scatter of the three principal-component scores, coloured by outcome.
fig = plt.figure(figsize=(12, 12))

# Ask the figure to create the 3D axes rather than instantiating Axes3D and
# attaching it by hand. Whether the Axes3D constructor adds itself to the
# figure has changed across Matplotlib releases (deprecated, then removed),
# while `add_axes(..., projection='3d')` behaves the same on all of them.
# The rectangle and the elev/azim view angles are unchanged.
ax = fig.add_axes([0, 0, .9, 1], projection='3d', elev=5, azim=225)

# Component scores live in the integer-named columns 0, 1 and 2.
x, y, z = result3d[0], result3d[1], result3d[2]

# The outcome is cast to int so it can be mapped through the colormap.
ax.scatter(
    x, y, z,
    c=fr['outcome'].astype(int),
    cmap="RdYlGn",
    edgecolor='k',
    s=40,
)
plt.show()