PCA Tutorial

PCA Tutorial: Heart disease prediction

Author

Benedict Thekkel

import pandas as pd

# Heart-failure prediction dataset, downloaded from Kaggle:
# https://www.kaggle.com/fedesoriano/heart-failure-prediction
# The CSV is read from a path relative to the notebook's working directory.
df = pd.read_csv("Data/heart.csv")
# Preview the first five rows.
df.head()

	Age	Sex	ChestPainType	RestingBP	Cholesterol	RestingECG	MaxHR	ExerciseAngina	Oldpeak	ST_Slope	HeartDisease
0	40	M	ATA	140	289	Normal	172	N	0.0	Up	0
1	49	F	NAP	160	180	Normal	156	N	1.0	Flat	1
2	37	M	ATA	130	283	ST	98	N	0.0	Up	0
3	48	F	ASY	138	214	Normal	108	Y	1.5	Flat	1
4	54	M	NAP	150	195	Normal	122	N	0.0	Up	0

# Dimensions of the raw dataset as (rows, columns).
df.shape

(918, 12)

# Summary statistics of the numeric columns; used below to spot suspicious
# extremes before removing outliers.
df.describe()

	Age	RestingBP	Cholesterol	FastingBS	MaxHR	Oldpeak	HeartDisease
count	918.000000	918.000000	918.000000	918.000000	918.000000	918.000000	918.000000
mean	53.510893	132.396514	198.799564	0.233115	136.809368	0.887364	0.553377
std	9.432617	18.514154	109.384145	0.423046	25.460334	1.066570	0.497414
min	28.000000	0.000000	0.000000	0.000000	60.000000	-2.600000	0.000000
25%	47.000000	120.000000	173.250000	0.000000	120.000000	0.000000	0.000000
50%	54.000000	130.000000	223.000000	0.000000	138.000000	0.600000	1.000000
75%	60.000000	140.000000	267.000000	0.000000	156.000000	1.500000	1.000000
max	77.000000	200.000000	603.000000	1.000000	202.000000	6.200000	1.000000

Treat Outliers

# Show rows whose cholesterol is more than three standard deviations
# above the column mean (upper tail only).
chol_upper = df["Cholesterol"].mean() + 3 * df["Cholesterol"].std()
df[df["Cholesterol"] > chol_upper]

	Age	Sex	ChestPainType	RestingBP	Cholesterol	FastingBS	RestingECG	MaxHR	ExerciseAngina	Oldpeak	ST_Slope	HeartDisease
76	32	M	ASY	118	529	0	Normal	130	N	0.0	Flat	1
149	54	M	ASY	130	603	1	Normal	125	Y	1.0	Flat	1
616	67	F	NAP	115	564	0	LVH	160	N	1.6	Flat	0

# Row/column count before any outlier rows are dropped.
df.shape

(918, 12)

# Keep only the rows at or below the 3-sigma cholesterol ceiling.
chol_upper = df["Cholesterol"].mean() + 3 * df["Cholesterol"].std()
df1 = df.loc[df["Cholesterol"] <= chol_upper]
df1.shape

(915, 12)

# Show rows whose maximum heart rate exceeds mean + 3 standard deviations.
maxhr_upper = df["MaxHR"].mean() + 3 * df["MaxHR"].std()
df[df["MaxHR"] > maxhr_upper]

	Age	Sex	ChestPainType	RestingBP	Cholesterol	FastingBS	RestingECG	MaxHR	ExerciseAngina	Oldpeak	ST_Slope	HeartDisease

# Show rows whose FastingBS exceeds mean + 3 standard deviations.
fbs_upper = df["FastingBS"].mean() + 3 * df["FastingBS"].std()
df[df["FastingBS"] > fbs_upper]

	Age	Sex	ChestPainType	RestingBP	Cholesterol	FastingBS	RestingECG	MaxHR	ExerciseAngina	Oldpeak	ST_Slope	HeartDisease

# Show rows whose Oldpeak exceeds mean + 3 standard deviations
# (statistics taken from the unfiltered frame).
oldpeak_upper = df["Oldpeak"].mean() + 3 * df["Oldpeak"].std()
df[df["Oldpeak"] > oldpeak_upper]

	Age	Sex	ChestPainType	RestingBP	Cholesterol	RestingECG	MaxHR	ExerciseAngina	Oldpeak	ST_Slope	HeartDisease
166	50	M	ASY	140	231	ST	140	Y	5.0	Flat	1
702	59	M	TA	178	270	LVH	145	N	4.2	Down	0
771	55	M	ASY	140	217	Normal	111	Y	5.6	Down	1
791	51	M	ASY	140	298	Normal	122	Y	4.2	Flat	1
850	62	F	ASY	160	164	LVH	145	N	6.2	Down	1
900	58	M	ASY	114	318	ST	140	N	4.4	Down	1

# Drop Oldpeak outliers from the cholesterol-filtered frame; the threshold
# is computed from df1 itself, not from the raw data.
oldpeak_upper = df1["Oldpeak"].mean() + 3 * df1["Oldpeak"].std()
df2 = df1.loc[df1["Oldpeak"] <= oldpeak_upper]
df2.shape

(909, 12)

# Show rows whose resting blood pressure exceeds mean + 3 standard
# deviations (statistics taken from the unfiltered frame).
bp_upper = df["RestingBP"].mean() + 3 * df["RestingBP"].std()
df[df["RestingBP"] > bp_upper]

	Age	Sex	ChestPainType	RestingBP	Cholesterol	FastingBS	RestingECG	MaxHR	ExerciseAngina	Oldpeak	ST_Slope	HeartDisease
109	39	M	ATA	190	241	0	Normal	106	N	0.0	Up	0
241	54	M	ASY	200	198	0	Normal	142	Y	2.0	Flat	1
365	64	F	ASY	200	0	0	Normal	140	Y	1.0	Flat	1
399	61	M	NAP	200	0	1	ST	70	N	0.0	Flat	1
592	61	M	ASY	190	287	1	LVH	150	Y	2.0	Down	1
732	56	F	ASY	200	288	1	LVH	133	Y	4.0	Down	1
759	54	M	ATA	192	283	0	LVH	195	N	0.0	Up	1

# Drop RestingBP outliers from df2; the threshold is computed from df2.
bp_upper = df2["RestingBP"].mean() + 3 * df2["RestingBP"].std()
df3 = df2.loc[df2["RestingBP"] <= bp_upper]
df3.shape

(902, 12)

# List the distinct ChestPainType values to decide how to encode the column.
df.ChestPainType.unique()

array(['ATA', 'NAP', 'ASY', 'TA'], dtype=object)

# List the distinct RestingECG values to decide how to encode the column.
df.RestingECG.unique()

array(['Normal', 'ST', 'LVH'], dtype=object)

# List the distinct ExerciseAngina values to decide how to encode the column.
df.ExerciseAngina.unique()

array(['N', 'Y'], dtype=object)

# List the distinct ST_Slope values to decide how to encode the column.
df.ST_Slope.unique()

array(['Up', 'Flat', 'Down'], dtype=object)

# Work on a copy so the outlier-filtered frame df3 stays untouched.
df4 = df3.copy()

# Label-encode the binary/ordinal text columns as integers.
#
# The result is assigned back to the column explicitly. Calling
# `df4.<col>.replace(..., inplace=True)` operates on a temporary Series:
# recent pandas versions warn about it, and with Copy-on-Write enabled it
# leaves df4 unchanged.
#
# NOTE: `Series.map` turns any value missing from the mapping into NaN
# (instead of leaving it as-is), so every category listed by the
# `.unique()` calls above must appear in the dictionaries below.
df4["ExerciseAngina"] = df4["ExerciseAngina"].map({"N": 0, "Y": 1})

df4["ST_Slope"] = df4["ST_Slope"].map({"Down": 1, "Flat": 2, "Up": 3})

df4["RestingECG"] = df4["RestingECG"].map({"Normal": 1, "ST": 2, "LVH": 3})

df4.head()

	Age	Sex	ChestPainType	RestingBP	Cholesterol	RestingECG	MaxHR	ExerciseAngina	Oldpeak	ST_Slope	HeartDisease
0	40	M	ATA	140	289	1	172	0	0.0	3	0
1	49	F	NAP	160	180	1	156	0	1.0	2	1
2	37	M	ATA	130	283	2	98	0	0.0	3	0
3	48	F	ASY	138	214	1	108	1	1.5	2	1
4	54	M	NAP	150	195	1	122	0	0.0	3	0

# One-hot encode the columns that are still categorical. drop_first=True
# omits the first level of each encoded column so the dummy columns are not
# perfectly collinear.
df5 = pd.get_dummies(df4, drop_first=True)
df5.head()

	Age	RestingBP	Cholesterol	RestingECG	MaxHR	ExerciseAngina	Oldpeak	ST_Slope	HeartDisease	Sex_M	ChestPainType_ATA	ChestPainType_NAP
0	40	140	289	1	172	0	0.0	3	0	1	1	0
1	49	160	180	1	156	0	1.0	2	1	0	0	1
2	37	130	283	2	98	0	0.0	3	0	1	1	0
3	48	138	214	1	108	1	1.5	2	1	0	0	0
4	54	150	195	1	122	0	0.0	3	0	1	0	1

# Separate the target label from the predictor columns.
y = df5["HeartDisease"]
X = df5.drop(columns="HeartDisease")

X.head()

	Age	RestingBP	Cholesterol	RestingECG	MaxHR	ExerciseAngina	Oldpeak	ST_Slope	Sex_M	ChestPainType_ATA	ChestPainType_NAP
0	40	140	289	1	172	0	0.0	3	1	1	0
1	49	160	180	1	156	0	1.0	2	0	0	1
2	37	130	283	2	98	0	0.0	3	1	1	0
3	48	138	214	1	108	1	1.5	2	0	0	0
4	54	150	195	1	122	0	0.0	3	1	0	1

from sklearn.preprocessing import StandardScaler

# Standardize every feature: learn the per-column statistics first, then
# apply them to the same matrix.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
X_scaled

array([[-1.42896269,  0.46089071,  0.85238015, ...,  2.06757196,
        -0.53547478, -0.22914788],
       [-0.47545956,  1.5925728 , -0.16132855, ..., -0.4836591 ,
         1.86750159, -0.22914788],
       [-1.74679706, -0.10495034,  0.79657967, ...,  2.06757196,
        -0.53547478, -0.22914788],
       ...,
       [ 0.37209878, -0.10495034, -0.61703246, ..., -0.4836591 ,
        -0.53547478, -0.22914788],
       [ 0.37209878, -0.10495034,  0.35947592, ...,  2.06757196,
        -0.53547478, -0.22914788],
       [-1.64085227,  0.3477225 , -0.20782894, ..., -0.4836591 ,
         1.86750159, -0.22914788]])

from sklearn.model_selection import train_test_split

# Hold out 20% of the standardized rows for evaluation; the fixed seed
# keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=30,
)

# Size of the training partition as (rows, features).
X_train.shape

(721, 13)

# Size of the held-out test partition as (rows, features).
X_test.shape

(181, 13)

from sklearn.ensemble import RandomForestClassifier

# Baseline: random forest trained on all standardized features.
# The forest is seeded so the accuracy is reproducible between runs; an
# unseeded forest gives a slightly different score each time, so the value
# may differ marginally from a previously recorded one.
model_rf = RandomForestClassifier(random_state=30)
model_rf.fit(X_train, y_train)
# Mean accuracy on the held-out test set.
model_rf.score(X_test, y_test)

0.8674033149171271

Use PCA to reduce dimensions

	Age	RestingBP	Cholesterol	FastingBS	RestingECG	MaxHR	ExerciseAngina	Oldpeak	ST_Slope	Sex_M	ChestPainType_ATA	ChestPainType_NAP	ChestPainType_TA
0	40	140	289	0	1	172	0	0.0	3	1	1	0	0
1	49	160	180	0	1	156	0	1.0	2	0	0	1	0
2	37	130	283	0	2	98	0	0.0	3	1	1	0	0
3	48	138	214	0	1	108	1	1.5	2	0	0	0	0
4	54	150	195	0	1	122	0	0.0	3	1	0	1	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...
913	45	110	264	0	1	132	0	1.2	2	1	0	0	1
914	68	144	193	1	1	141	0	3.4	2	1	0	0	0
915	57	130	131	0	1	115	1	1.2	2	1	0	0	0
916	57	130	236	0	3	174	0	0.0	2	0	1	0	0
917	38	138	175	0	1	173	0	0.0	3	1	0	1	0

902 rows × 13 columns

from sklearn.decomposition import PCA

# Keep as many principal components as are needed to explain 95% of the
# variance.
pca = PCA(0.95)
# Fit on the standardized features, not the raw X. PCA centers the data but
# does not rescale it, so on raw values the columns with the largest
# numeric range dominate the components. Using X_scaled also makes the
# comparison with the baseline model (trained on X_scaled) like-for-like.
# NOTE(review): any array or score recorded from a run on the raw X is
# stale after this change - re-run the cells that follow.
X_pca = pca.fit_transform(X_scaled)
X_pca

array([[ 93.82465373, -29.40099458],
       [-15.58422331, -14.10909233],
       [ 83.29606634,  38.6867453 ],
       ...,
       [-67.57318721,  17.61319354],
       [ 40.70458237, -33.38750602],
       [-19.91368346, -37.29085722]])

# Split the PCA-reduced features using the 80/20 ratio and seed 30.
# Note that y_train / y_test are rebound here.
X_train_pca, X_test_pca, y_train, y_test = train_test_split(
    X_pca,
    y,
    test_size=0.2,
    random_state=30,
)

from sklearn.ensemble import RandomForestClassifier

# Random forest trained on the PCA-reduced features.
# The forest is seeded so the accuracy is reproducible between runs; an
# unseeded forest gives a slightly different score each time, so the value
# may differ marginally from a previously recorded one.
model_rf = RandomForestClassifier(random_state=30)
model_rf.fit(X_train_pca, y_train)
# Mean accuracy on the held-out test set.
model_rf.score(X_test_pca, y_test)

0.7182320441988951