In [27]:
# Import all necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, LabelEncoder
from imblearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
#from xgboost import XGBClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, roc_curve, confusion_matrix,
                             classification_report, ConfusionMatrixDisplay,
                             RocCurveDisplay)
from sklearn.datasets import load_digits
from sklearn.feature_selection import mutual_info_classif
from timeit import default_timer as timer
import os
import warnings
warnings.filterwarnings('ignore')
In [3]:
# Load data
df = pd.read_csv("heart_disease_dataframe.csv")
Data Inspection
In [5]:
# Load first five rows
print("The first five rows:")
print(df.head(5))
The first five rows:
   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   52    1   0       125   212    0        1      168      0      1.0      2
1   53    1   0       140   203    1        0      155      1      3.1      0
2   70    1   0       145   174    0        1      125      1      2.6      0
3   61    1   0       148   203    0        1      161      0      0.0      2
4   62    0   0       138   294    1        1      106      0      1.9      1

   ca  thal  target
0   2     3       0
1   0     3       0
2   0     3       0
3   1     3       0
4   3     2       0
In [7]:
# Print the dimensions of the dataframe with .shape
print("Dataframe dimensions:")
print(df.shape) # inspect dimensions of the df
Dataframe dimensions:
(303, 14)
In [9]:
# Check for missing values for each feature
print("Null value count for each feature:")
print(df.isnull().sum()) # show number of missing values in each column
Null value count for each feature:
age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64
In [11]:
# Since no NA values were identified, explore further for unusual values
# Check the number of unique values for each feature
print("Number of unique values for each feature:")
print(df.nunique().sort_values())
Number of unique values for each feature:
sex           2
fbs           2
exang         2
target        2
restecg       3
slope         3
cp            4
thal          4
ca            5
oldpeak      40
age          41
trestbps     50
thalach      91
chol        152
dtype: int64
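Several columns are low-cardinality codes, so it is worth checking them against the documented value ranges. A small sketch (assuming, per the dataset documentation, that ca should be 0-3 and thal 1-3):

# Sketch: list the observed codes for the two suspect columns
for col in ['ca', 'thal']:
    print(col, sorted(df[col].unique()))
# ca == 4 and thal == 0 fall outside the documented ranges and look like
# missing-value sentinels (handled in the cleaning step below)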
In [13]:
# Plot the distribution of values in the target feature
counts = df['target'].value_counts()
counts.plot(kind='bar', color='blue')
plt.xlabel('Categories')
plt.ylabel('Count')
plt.title('Value Counts of Target Variable')
plt.tight_layout()
plt.show()
Data Cleaning and Preparation
In [15]:
# Recode sentinel values as NA
# For the "ca" variable, NA values are denoted in the data as 4,
# since valid values are only 0, 1, 2 or 3 as specified in the documentation.
# For the "thal" variable, NA values are denoted as 0, since valid values
# are 1-3 as specified in the documentation.
df.loc[df["ca"] == 4, "ca"] = pd.NA
df.loc[df["thal"] == 0, "thal"] = pd.NA
In [17]:
# Drop rows that have an NA value
df = df.dropna()
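A quick shape check (sketch) confirms how few rows the filter removes; the split shapes below imply 297 rows remain, i.e. 6 of the original 303 were dropped:

# Sketch: confirm the row count after dropping sentinel rows
print(df.shape)  # expected (297, 14): 6 of the original 303 rows removed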
In [19]:
# Split into X and y
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
In [21]:
# Split into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
(237, 13) (60, 13) (237,) (60,)
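The plain split happens to keep the two classes roughly balanced here, but train_test_split can enforce that explicitly via its stratify argument; a sketch (the _s-suffixed names are placeholders):

# Sketch: a stratified split preserves the target's class ratio in both sets
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)
print(y_train_s.value_counts(normalize=True))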
In [31]:
# Check information gain of all variables
# Label-encode any object-typed columns first (this dataset has none,
# so the loop is a no-op here)
for col in X.select_dtypes(include='object').columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
# Compute information gain (mutual information)
i_scores = mutual_info_classif(X, y)
# Create a Series to view scores, sorted from most to least informative
info_gain = pd.Series(i_scores, index=X.columns).sort_values(ascending=False)
print(info_gain)
ca          0.168856
thal        0.151349
exang       0.119518
oldpeak     0.105728
cp          0.086175
thalach     0.078578
chol        0.075927
slope       0.066450
sex         0.053453
restecg     0.024446
fbs         0.021515
age         0.001924
trestbps    0.000000
dtype: float64
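Note that mutual_info_classif uses a nearest-neighbour estimator, so the scores vary slightly between runs unless a seed is fixed, and coded columns can be flagged as discrete. A sketch of both options (the discrete column list is an assumption based on the feature descriptions above):

# Sketch: flag coded columns as discrete and pin the seed for reproducibility
discrete_cols = [X.columns.get_loc(c) for c in
                 ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']]
i_scores = mutual_info_classif(X, y, discrete_features=discrete_cols,
                               random_state=42)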
Model Development
In [33]:
# Need to transform and scale variables for modelling
# Scale numerical variables
numerical_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'ca', 'thal']
numeric_transformer = Pipeline(
    steps=[("scaler", StandardScaler())]
)
# Encode categorical variables with a one-hot encoder
categorical_features = ['cp', 'sex', 'fbs', 'restecg', 'exang', 'slope']
categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)
In [35]:
# Count the total number of features after one hot encoding
X_transformed = preprocessor.fit_transform(X_train)
# Get the shape of the transformed data
print("Number of features after preprocessing:", X_transformed.shape[1])
Number of features after preprocessing: 23
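The 23 columns break down as the 7 scaled numerical features plus 16 one-hot dummies (4 for cp, 2 for sex, 2 for fbs, 3 for restecg, 2 for exang, 3 for slope). A sketch to name them, assuming scikit-learn >= 1.0 for get_feature_names_out:

# Sketch: list the transformed feature names from the fitted preprocessor
print(preprocessor.get_feature_names_out())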
In [37]:
# Make Naive Bayes pipeline
NBpipe = Pipeline(steps=[
    ('preprocessor', preprocessor),  # Preprocess the features
    ('nb', GaussianNB())  # Placeholder step; the grid below swaps in NB variants
])
param_grid_NB = {
    'preprocessor__num__scaler': [StandardScaler(), MinMaxScaler()],
    'nb': [GaussianNB(), BernoulliNB()]
}
In [39]:
# Perform grid search to test all parameters
grid_search_NB = GridSearchCV(NBpipe, param_grid=param_grid_NB, cv=10, verbose=1, n_jobs = -1)
start = timer()
grid_search_NB.fit(X_train, y_train)
end = timer()
print(end - start)
Fitting 10 folds for each of 4 candidates, totalling 40 fits
7.643267299979925
In [41]:
# Print best combination of parameters
grid_search_NB.best_params_
Out[41]:
{'nb': BernoulliNB(), 'preprocessor__num__scaler': StandardScaler()}
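cross_validate (imported above but otherwise unused) can re-score the winning pipeline on several metrics at once; a minimal sketch:

# Sketch: multi-metric cross-validation of the refit best NB pipeline
cv_res = cross_validate(grid_search_NB.best_estimator_, X_train, y_train,
                        cv=10, scoring=['accuracy', 'roc_auc'])
print(cv_res['test_accuracy'].mean(), cv_res['test_roc_auc'].mean())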
In [43]:
# Print scores for all combinations of parameters
scores_df_NB = pd.DataFrame(grid_search_NB.cv_results_)
scores_df_NB = scores_df_NB.sort_values(by=['rank_test_score']).reset_index(drop=True)
scores_df_NB
Out[43]:
  | mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_nb | param_preprocessor__num__scaler | params | split0_test_score | split1_test_score | split2_test_score | split3_test_score | split4_test_score | split5_test_score | split6_test_score | split7_test_score | split8_test_score | split9_test_score | mean_test_score | std_test_score | rank_test_score
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 0.040616 | 0.019564 | 0.014977 | 0.002322 | BernoulliNB() | StandardScaler() | {'nb': BernoulliNB(), 'preprocessor__num__scal... | 0.916667 | 0.833333 | 0.916667 | 0.875000 | 0.708333 | 0.750000 | 0.833333 | 0.782609 | 0.782609 | 0.782609 | 0.818116 | 0.066029 | 1 |
1 | 0.027167 | 0.009582 | 0.013641 | 0.002258 | BernoulliNB() | MinMaxScaler() | {'nb': BernoulliNB(), 'preprocessor__num__scal... | 0.833333 | 0.750000 | 0.916667 | 0.875000 | 0.791667 | 0.791667 | 0.750000 | 0.826087 | 0.739130 | 0.739130 | 0.801268 | 0.057932 | 2 |
2 | 0.029242 | 0.004522 | 0.013293 | 0.001206 | GaussianNB() | StandardScaler() | {'nb': GaussianNB(), 'preprocessor__num__scale... | 0.875000 | 0.791667 | 0.833333 | 0.958333 | 0.708333 | 0.708333 | 0.791667 | 0.782609 | 0.695652 | 0.695652 | 0.784058 | 0.082637 | 3 |
3 | 0.025201 | 0.002812 | 0.014205 | 0.001377 | GaussianNB() | MinMaxScaler() | {'nb': GaussianNB(), 'preprocessor__num__scale... | 0.875000 | 0.791667 | 0.833333 | 0.958333 | 0.708333 | 0.666667 | 0.791667 | 0.782609 | 0.695652 | 0.695652 | 0.779891 | 0.087270 | 4 |
In [45]:
# kNN pipeline
kNNpipe = Pipeline(steps=[
('preprocessor', preprocessor), # feature preprocessor
('kNN', KNeighborsClassifier())
])
param_grid_kNN = {
'preprocessor__num__scaler':[StandardScaler(), MinMaxScaler(),'passthrough'],
'kNN__n_neighbors':[5, 10, 30],
'kNN__weights':['uniform', 'distance']
}
In [47]:
# Perform grid search
grid_search_kNN = GridSearchCV(kNNpipe, param_grid=param_grid_kNN, cv=10, verbose=1, n_jobs = -1)
start = timer()
grid_search_kNN.fit(X_train, y_train)
end = timer()
print(end - start)
Fitting 10 folds for each of 18 candidates, totalling 180 fits
1.8598260000580922
In [49]:
# Print best combination of parameters
grid_search_kNN.best_params_
Out[49]:
{'kNN__n_neighbors': 5, 'kNN__weights': 'uniform', 'preprocessor__num__scaler': StandardScaler()}
In [51]:
# Print scores for all combinations of parameters
scores_df_kNN = pd.DataFrame(grid_search_kNN.cv_results_)
scores_df_kNN = scores_df_kNN.sort_values(by=['rank_test_score']).reset_index(drop=True)
scores_df_kNN
Out[51]:
  | mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_kNN__n_neighbors | param_kNN__weights | param_preprocessor__num__scaler | params | split0_test_score | split1_test_score | ... | split3_test_score | split4_test_score | split5_test_score | split6_test_score | split7_test_score | split8_test_score | split9_test_score | mean_test_score | std_test_score | rank_test_score
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 0.026435 | 0.008368 | 0.228611 | 0.162379 | 5 | uniform | StandardScaler() | {'kNN__n_neighbors': 5, 'kNN__weights': 'unifo... | 0.916667 | 0.833333 | ... | 0.833333 | 0.791667 | 0.833333 | 0.833333 | 0.913043 | 0.739130 | 0.869565 | 0.835507 | 0.051875 | 1 |
1 | 0.026703 | 0.005872 | 0.016211 | 0.003462 | 5 | distance | StandardScaler() | {'kNN__n_neighbors': 5, 'kNN__weights': 'dista... | 0.916667 | 0.833333 | ... | 0.833333 | 0.791667 | 0.833333 | 0.833333 | 0.913043 | 0.739130 | 0.869565 | 0.831341 | 0.056680 | 2 |
2 | 0.021128 | 0.000970 | 0.017506 | 0.008723 | 10 | distance | MinMaxScaler() | {'kNN__n_neighbors': 10, 'kNN__weights': 'dist... | 0.791667 | 0.916667 | ... | 0.833333 | 0.791667 | 0.791667 | 0.791667 | 0.782609 | 0.869565 | 0.869565 | 0.823007 | 0.044449 | 3 |
3 | 0.029948 | 0.015580 | 0.017090 | 0.004481 | 5 | distance | MinMaxScaler() | {'kNN__n_neighbors': 5, 'kNN__weights': 'dista... | 0.750000 | 0.916667 | ... | 0.916667 | 0.791667 | 0.833333 | 0.750000 | 0.782609 | 0.869565 | 0.739130 | 0.818297 | 0.063256 | 4 |
4 | 0.024634 | 0.006832 | 0.021316 | 0.011344 | 30 | uniform | StandardScaler() | {'kNN__n_neighbors': 30, 'kNN__weights': 'unif... | 0.916667 | 0.791667 | ... | 0.833333 | 0.791667 | 0.666667 | 0.833333 | 0.913043 | 0.782609 | 0.826087 | 0.814674 | 0.067400 | 5 |
5 | 0.025494 | 0.007245 | 0.022755 | 0.012643 | 10 | distance | StandardScaler() | {'kNN__n_neighbors': 10, 'kNN__weights': 'dist... | 0.875000 | 0.833333 | ... | 0.833333 | 0.791667 | 0.791667 | 0.791667 | 0.913043 | 0.826087 | 0.695652 | 0.810145 | 0.058202 | 6 |
6 | 0.031700 | 0.006640 | 0.013019 | 0.000951 | 30 | distance | StandardScaler() | {'kNN__n_neighbors': 30, 'kNN__weights': 'dist... | 0.916667 | 0.791667 | ... | 0.833333 | 0.791667 | 0.666667 | 0.833333 | 0.913043 | 0.782609 | 0.826087 | 0.806341 | 0.074507 | 7 |
7 | 0.023344 | 0.003545 | 0.013966 | 0.002101 | 10 | uniform | StandardScaler() | {'kNN__n_neighbors': 10, 'kNN__weights': 'unif... | 0.875000 | 0.833333 | ... | 0.833333 | 0.833333 | 0.666667 | 0.791667 | 0.956522 | 0.826087 | 0.695652 | 0.806159 | 0.080619 | 8 |
8 | 0.028623 | 0.006207 | 0.013791 | 0.001699 | 10 | uniform | MinMaxScaler() | {'kNN__n_neighbors': 10, 'kNN__weights': 'unif... | 0.750000 | 0.916667 | ... | 0.791667 | 0.750000 | 0.791667 | 0.791667 | 0.739130 | 0.869565 | 0.826087 | 0.801812 | 0.052992 | 9 |
9 | 0.024178 | 0.006204 | 0.015050 | 0.004553 | 30 | distance | MinMaxScaler() | {'kNN__n_neighbors': 30, 'kNN__weights': 'dist... | 0.791667 | 0.875000 | ... | 0.833333 | 0.791667 | 0.750000 | 0.750000 | 0.869565 | 0.826087 | 0.739130 | 0.801812 | 0.045975 | 9 |
10 | 0.036063 | 0.013179 | 0.017139 | 0.003595 | 5 | uniform | MinMaxScaler() | {'kNN__n_neighbors': 5, 'kNN__weights': 'unifo... | 0.708333 | 0.875000 | ... | 0.875000 | 0.750000 | 0.791667 | 0.791667 | 0.739130 | 0.826087 | 0.782609 | 0.797283 | 0.052931 | 11 |
11 | 0.028173 | 0.010487 | 0.018084 | 0.008248 | 30 | uniform | MinMaxScaler() | {'kNN__n_neighbors': 30, 'kNN__weights': 'unif... | 0.750000 | 0.916667 | ... | 0.833333 | 0.708333 | 0.750000 | 0.750000 | 0.826087 | 0.826087 | 0.695652 | 0.793116 | 0.069301 | 12 |
12 | 0.016396 | 0.001517 | 0.010330 | 0.001038 | 30 | distance | passthrough | {'kNN__n_neighbors': 30, 'kNN__weights': 'dist... | 0.833333 | 0.666667 | ... | 0.583333 | 0.708333 | 0.541667 | 0.666667 | 0.695652 | 0.565217 | 0.739130 | 0.666667 | 0.082744 | 13 |
13 | 0.022407 | 0.010565 | 0.012709 | 0.004054 | 10 | uniform | passthrough | {'kNN__n_neighbors': 10, 'kNN__weights': 'unif... | 0.708333 | 0.666667 | ... | 0.625000 | 0.666667 | 0.583333 | 0.708333 | 0.608696 | 0.565217 | 0.739130 | 0.657971 | 0.056601 | 14 |
14 | 0.021218 | 0.007057 | 0.013671 | 0.006174 | 30 | uniform | passthrough | {'kNN__n_neighbors': 30, 'kNN__weights': 'unif... | 0.750000 | 0.708333 | ... | 0.583333 | 0.708333 | 0.541667 | 0.625000 | 0.608696 | 0.608696 | 0.782609 | 0.650000 | 0.076846 | 15 |
15 | 0.017840 | 0.003139 | 0.015803 | 0.003919 | 10 | distance | passthrough | {'kNN__n_neighbors': 10, 'kNN__weights': 'dist... | 0.708333 | 0.583333 | ... | 0.541667 | 0.666667 | 0.583333 | 0.666667 | 0.652174 | 0.565217 | 0.739130 | 0.637319 | 0.061960 | 16 |
16 | 0.024550 | 0.005031 | 0.017400 | 0.007658 | 5 | uniform | passthrough | {'kNN__n_neighbors': 5, 'kNN__weights': 'unifo... | 0.666667 | 0.541667 | ... | 0.583333 | 0.708333 | 0.541667 | 0.583333 | 0.695652 | 0.608696 | 0.739130 | 0.633514 | 0.067332 | 17 |
17 | 0.025774 | 0.016168 | 0.015783 | 0.014388 | 5 | distance | passthrough | {'kNN__n_neighbors': 5, 'kNN__weights': 'dista... | 0.666667 | 0.541667 | ... | 0.500000 | 0.708333 | 0.625000 | 0.625000 | 0.695652 | 0.608696 | 0.652174 | 0.628986 | 0.062308 | 18 |
18 rows × 21 columns
In [53]:
# Logistic regression pipeline
LRpipe = Pipeline([
    ('preprocessor', preprocessor),
    ('lor', LogisticRegression())
])
param_grid_LR = {
    'preprocessor__num__scaler': [StandardScaler(), MinMaxScaler()],
    'lor__C': [0.1, 1, 10],
    'lor__penalty': ['l1', 'l2'],
    'lor__solver': ['liblinear'],  # liblinear supports both l1 and l2 penalties
    'lor__class_weight': [None, 'balanced']
}
In [55]:
# Perform grid search
grid_search_LR = GridSearchCV(LRpipe, param_grid=param_grid_LR, cv = 10, verbose = 1, n_jobs = -1)
start = timer()
grid_search_LR = grid_search_LR.fit(X_train,y_train)
end = timer()
print(end - start)
Fitting 10 folds for each of 24 candidates, totalling 240 fits
2.2249090999830514
In [57]:
# Print best combination of parameters
grid_search_LR.best_params_
Out[57]:
{'lor__C': 10, 'lor__class_weight': None, 'lor__penalty': 'l2', 'lor__solver': 'liblinear', 'preprocessor__num__scaler': StandardScaler()}
In [59]:
# Print scores for all combinations of parameters
scores_df_LR = pd.DataFrame(grid_search_LR.cv_results_)
scores_df_LR = scores_df_LR.sort_values(by=['rank_test_score']).reset_index(drop=True)
scores_df_LR
Out[59]:
  | mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_lor__C | param_lor__class_weight | param_lor__penalty | param_lor__solver | param_preprocessor__num__scaler | params | ... | split3_test_score | split4_test_score | split5_test_score | split6_test_score | split7_test_score | split8_test_score | split9_test_score | mean_test_score | std_test_score | rank_test_score
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 0.033824 | 0.007303 | 0.020174 | 0.013337 | 10.0 | balanced | l1 | liblinear | MinMaxScaler() | {'lor__C': 10, 'lor__class_weight': 'balanced'... | ... | 0.875000 | 0.750000 | 0.750000 | 0.791667 | 0.869565 | 0.869565 | 0.913043 | 0.827717 | 0.055887 | 1 |
1 | 0.032085 | 0.008151 | 0.013103 | 0.004807 | 10.0 | balanced | l1 | liblinear | StandardScaler() | {'lor__C': 10, 'lor__class_weight': 'balanced'... | ... | 0.875000 | 0.791667 | 0.750000 | 0.791667 | 0.869565 | 0.869565 | 0.913043 | 0.827717 | 0.055887 | 1 |
2 | 0.030107 | 0.009581 | 0.013180 | 0.002110 | 10.0 | None | l2 | liblinear | StandardScaler() | {'lor__C': 10, 'lor__class_weight': None, 'lor... | ... | 0.875000 | 0.791667 | 0.750000 | 0.791667 | 0.869565 | 0.869565 | 0.913043 | 0.827717 | 0.055887 | 1 |
3 | 0.035369 | 0.008313 | 0.015984 | 0.008681 | 10.0 | balanced | l2 | liblinear | StandardScaler() | {'lor__C': 10, 'lor__class_weight': 'balanced'... | ... | 0.875000 | 0.791667 | 0.750000 | 0.791667 | 0.869565 | 0.869565 | 0.913043 | 0.827717 | 0.055887 | 1 |
4 | 0.024468 | 0.002028 | 0.019287 | 0.012827 | 0.1 | None | l1 | liblinear | StandardScaler() | {'lor__C': 0.1, 'lor__class_weight': None, 'lo... | ... | 0.791667 | 0.750000 | 0.833333 | 0.875000 | 0.869565 | 0.869565 | 0.869565 | 0.827536 | 0.056018 | 5 |
5 | 0.032092 | 0.006293 | 0.017227 | 0.010048 | 10.0 | None | l1 | liblinear | MinMaxScaler() | {'lor__C': 10, 'lor__class_weight': None, 'lor... | ... | 0.875000 | 0.791667 | 0.750000 | 0.791667 | 0.869565 | 0.869565 | 0.869565 | 0.827536 | 0.056018 | 5 |
6 | 0.031349 | 0.013079 | 0.021401 | 0.014764 | 1.0 | None | l2 | liblinear | StandardScaler() | {'lor__C': 1, 'lor__class_weight': None, 'lor_... | ... | 0.875000 | 0.791667 | 0.750000 | 0.791667 | 0.869565 | 0.869565 | 0.869565 | 0.827536 | 0.056018 | 5 |
7 | 0.029324 | 0.007512 | 0.014583 | 0.006244 | 10.0 | balanced | l2 | liblinear | MinMaxScaler() | {'lor__C': 10, 'lor__class_weight': 'balanced'... | ... | 0.875000 | 0.750000 | 0.750000 | 0.791667 | 0.826087 | 0.869565 | 0.913043 | 0.827536 | 0.059296 | 5 |
8 | 0.033215 | 0.012689 | 0.018869 | 0.006823 | 0.1 | None | l2 | liblinear | StandardScaler() | {'lor__C': 0.1, 'lor__class_weight': None, 'lo... | ... | 0.875000 | 0.791667 | 0.791667 | 0.791667 | 0.913043 | 0.826087 | 0.826087 | 0.827355 | 0.067847 | 9 |
9 | 0.029831 | 0.011166 | 0.016903 | 0.008972 | 1.0 | None | l1 | liblinear | StandardScaler() | {'lor__C': 1, 'lor__class_weight': None, 'lor_... | ... | 0.875000 | 0.791667 | 0.750000 | 0.791667 | 0.869565 | 0.869565 | 0.869565 | 0.823370 | 0.050512 | 10 |
10 | 0.035238 | 0.012771 | 0.013919 | 0.002458 | 10.0 | None | l1 | liblinear | StandardScaler() | {'lor__C': 10, 'lor__class_weight': None, 'lor... | ... | 0.875000 | 0.791667 | 0.750000 | 0.791667 | 0.869565 | 0.869565 | 0.869565 | 0.823370 | 0.050512 | 10 |
11 | 0.028972 | 0.011720 | 0.014940 | 0.003184 | 1.0 | None | l2 | liblinear | MinMaxScaler() | {'lor__C': 1, 'lor__class_weight': None, 'lor_... | ... | 0.916667 | 0.750000 | 0.791667 | 0.750000 | 0.826087 | 0.869565 | 0.869565 | 0.823188 | 0.054246 | 12 |
12 | 0.026324 | 0.007364 | 0.015885 | 0.006873 | 10.0 | None | l2 | liblinear | MinMaxScaler() | {'lor__C': 10, 'lor__class_weight': None, 'lor... | ... | 0.833333 | 0.791667 | 0.750000 | 0.791667 | 0.869565 | 0.869565 | 0.869565 | 0.819203 | 0.047723 | 13 |
13 | 0.038048 | 0.009816 | 0.017373 | 0.003743 | 0.1 | balanced | l2 | liblinear | StandardScaler() | {'lor__C': 0.1, 'lor__class_weight': 'balanced... | ... | 0.875000 | 0.750000 | 0.750000 | 0.791667 | 0.913043 | 0.782609 | 0.826087 | 0.818841 | 0.069675 | 14 |
14 | 0.041717 | 0.012834 | 0.020325 | 0.007015 | 0.1 | balanced | l1 | liblinear | StandardScaler() | {'lor__C': 0.1, 'lor__class_weight': 'balanced... | ... | 0.791667 | 0.791667 | 0.791667 | 0.916667 | 0.869565 | 0.782609 | 0.869565 | 0.818841 | 0.058607 | 14 |
15 | 0.033617 | 0.009044 | 0.016848 | 0.004173 | 0.1 | balanced | l2 | liblinear | MinMaxScaler() | {'lor__C': 0.1, 'lor__class_weight': 'balanced... | ... | 0.875000 | 0.791667 | 0.750000 | 0.750000 | 0.826087 | 0.869565 | 0.782609 | 0.818659 | 0.052751 | 16 |
16 | 0.026952 | 0.004060 | 0.013113 | 0.002126 | 1.0 | balanced | l2 | liblinear | StandardScaler() | {'lor__C': 1, 'lor__class_weight': 'balanced',... | ... | 0.833333 | 0.750000 | 0.750000 | 0.791667 | 0.869565 | 0.869565 | 0.869565 | 0.815036 | 0.051606 | 17 |
17 | 0.030905 | 0.009868 | 0.017964 | 0.012697 | 1.0 | balanced | l2 | liblinear | MinMaxScaler() | {'lor__C': 1, 'lor__class_weight': 'balanced',... | ... | 0.916667 | 0.750000 | 0.750000 | 0.750000 | 0.826087 | 0.826087 | 0.869565 | 0.814674 | 0.055608 | 18 |
18 | 0.025941 | 0.006350 | 0.014718 | 0.005811 | 1.0 | None | l1 | liblinear | MinMaxScaler() | {'lor__C': 1, 'lor__class_weight': None, 'lor_... | ... | 0.875000 | 0.708333 | 0.750000 | 0.750000 | 0.826087 | 0.869565 | 0.869565 | 0.810688 | 0.058317 | 19 |
19 | 0.037964 | 0.013412 | 0.014545 | 0.003916 | 1.0 | balanced | l1 | liblinear | StandardScaler() | {'lor__C': 1, 'lor__class_weight': 'balanced',... | ... | 0.833333 | 0.750000 | 0.750000 | 0.791667 | 0.869565 | 0.826087 | 0.869565 | 0.810688 | 0.040801 | 20 |
20 | 0.028776 | 0.006499 | 0.013762 | 0.004283 | 1.0 | balanced | l1 | liblinear | MinMaxScaler() | {'lor__C': 1, 'lor__class_weight': 'balanced',... | ... | 0.916667 | 0.708333 | 0.750000 | 0.750000 | 0.826087 | 0.826087 | 0.869565 | 0.802174 | 0.057771 | 21 |
21 | 0.036283 | 0.008165 | 0.022456 | 0.009137 | 0.1 | None | l2 | liblinear | MinMaxScaler() | {'lor__C': 0.1, 'lor__class_weight': None, 'lo... | ... | 0.875000 | 0.791667 | 0.750000 | 0.750000 | 0.826087 | 0.782609 | 0.782609 | 0.801630 | 0.037743 | 22 |
22 | 0.033705 | 0.015333 | 0.018319 | 0.007521 | 0.1 | None | l1 | liblinear | MinMaxScaler() | {'lor__C': 0.1, 'lor__class_weight': None, 'lo... | ... | 0.833333 | 0.708333 | 0.833333 | 0.750000 | 0.826087 | 0.782609 | 0.695652 | 0.780435 | 0.056386 | 23 |
23 | 0.036807 | 0.008219 | 0.021186 | 0.005509 | 0.1 | balanced | l1 | liblinear | MinMaxScaler() | {'lor__C': 0.1, 'lor__class_weight': 'balanced... | ... | 0.833333 | 0.625000 | 0.875000 | 0.750000 | 0.826087 | 0.826087 | 0.652174 | 0.776268 | 0.082503 | 24 |
24 rows × 23 columns
In [61]:
# Random Forest pipeline
RFpipe = Pipeline([
    ('preprocessor', preprocessor),
    ('rf', RandomForestClassifier())
])
param_grid_RF = {
    # Trees are scale-invariant, so numeric scaling is skipped entirely
    'preprocessor__num__scaler': ['passthrough'],
    'rf__max_depth': [20, 50, None],
    'rf__max_features': ['sqrt'],
    'rf__n_estimators': [100, 200, 400, 500]
}
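One caveat: RandomForestClassifier is itself stochastic, so repeated grid searches can crown different configurations. Pinning its seed makes the search reproducible; a sketch (random_state=42 is an arbitrary choice):

# Sketch: a seeded variant of the pipeline for reproducible forests
RFpipe_seeded = Pipeline([
    ('preprocessor', preprocessor),
    ('rf', RandomForestClassifier(random_state=42))
])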
In [63]:
# Perform grid search
grid_search_RF = GridSearchCV(RFpipe, param_grid=param_grid_RF, cv = 10, verbose = 1, n_jobs = -1)
start = timer()
grid_search_RF = grid_search_RF.fit(X_train,y_train)
end = timer()
print(end - start)
Fitting 10 folds for each of 12 candidates, totalling 120 fits
27.08498939999845
In [65]:
# Print best combination of parameters
grid_search_RF.best_params_
Out[65]:
{'preprocessor__num__scaler': 'passthrough', 'rf__max_depth': 50, 'rf__max_features': 'sqrt', 'rf__n_estimators': 100}
In [67]:
# Print scores for all combinations of parameters
scores_df_RF = pd.DataFrame(grid_search_RF.cv_results_)
scores_df_RF = scores_df_RF.sort_values(by=['rank_test_score']).reset_index(drop=True)
scores_df_RF
Out[67]:
  | mean_fit_time | std_fit_time | mean_score_time | std_score_time | param_preprocessor__num__scaler | param_rf__max_depth | param_rf__max_features | param_rf__n_estimators | params | split0_test_score | ... | split3_test_score | split4_test_score | split5_test_score | split6_test_score | split7_test_score | split8_test_score | split9_test_score | mean_test_score | std_test_score | rank_test_score
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 0.413213 | 0.041544 | 0.025372 | 0.002237 | passthrough | 50 | sqrt | 100 | {'preprocessor__num__scaler': 'passthrough', '... | 0.916667 | ... | 0.916667 | 0.833333 | 0.75 | 0.875000 | 0.913043 | 0.869565 | 0.739130 | 0.839674 | 0.067567 | 1 |
1 | 1.740125 | 0.098232 | 0.085991 | 0.013691 | passthrough | 20 | sqrt | 400 | {'preprocessor__num__scaler': 'passthrough', '... | 0.833333 | ... | 0.958333 | 0.875000 | 0.75 | 0.833333 | 0.913043 | 0.869565 | 0.739130 | 0.831341 | 0.070347 | 2 |
2 | 0.568113 | 0.118757 | 0.040052 | 0.006936 | passthrough | 20 | sqrt | 100 | {'preprocessor__num__scaler': 'passthrough', '... | 0.875000 | ... | 0.916667 | 0.833333 | 0.75 | 0.791667 | 0.913043 | 0.826087 | 0.739130 | 0.831159 | 0.057091 | 3 |
3 | 0.789970 | 0.062701 | 0.043109 | 0.004487 | passthrough | 50 | sqrt | 200 | {'preprocessor__num__scaler': 'passthrough', '... | 0.833333 | ... | 0.916667 | 0.833333 | 0.75 | 0.833333 | 0.913043 | 0.782609 | 0.782609 | 0.822826 | 0.055883 | 4 |
4 | 1.955410 | 0.126208 | 0.099494 | 0.010557 | passthrough | 50 | sqrt | 500 | {'preprocessor__num__scaler': 'passthrough', '... | 0.833333 | ... | 0.916667 | 0.833333 | 0.75 | 0.833333 | 0.913043 | 0.869565 | 0.739130 | 0.818841 | 0.067595 | 5 |
5 | 1.565949 | 0.103470 | 0.087932 | 0.018947 | passthrough | 50 | sqrt | 400 | {'preprocessor__num__scaler': 'passthrough', '... | 0.833333 | ... | 0.916667 | 0.833333 | 0.75 | 0.750000 | 0.913043 | 0.869565 | 0.739130 | 0.814674 | 0.065248 | 6 |
6 | 0.941730 | 0.079146 | 0.054663 | 0.011396 | passthrough | 20 | sqrt | 200 | {'preprocessor__num__scaler': 'passthrough', '... | 0.833333 | ... | 0.916667 | 0.833333 | 0.75 | 0.833333 | 0.913043 | 0.826087 | 0.695652 | 0.814312 | 0.066568 | 7 |
7 | 0.779785 | 0.035231 | 0.044444 | 0.004863 | passthrough | None | sqrt | 200 | {'preprocessor__num__scaler': 'passthrough', '... | 0.833333 | ... | 0.958333 | 0.833333 | 0.75 | 0.791667 | 0.869565 | 0.782609 | 0.782609 | 0.814312 | 0.059855 | 7 |
8 | 1.929132 | 0.147457 | 0.094169 | 0.007704 | passthrough | None | sqrt | 500 | {'preprocessor__num__scaler': 'passthrough', '... | 0.833333 | ... | 0.958333 | 0.833333 | 0.75 | 0.750000 | 0.869565 | 0.826087 | 0.739130 | 0.814312 | 0.065635 | 7 |
9 | 2.172357 | 0.190574 | 0.104054 | 0.018580 | passthrough | 20 | sqrt | 500 | {'preprocessor__num__scaler': 'passthrough', '... | 0.833333 | ... | 0.916667 | 0.833333 | 0.75 | 0.791667 | 0.913043 | 0.826087 | 0.739130 | 0.810326 | 0.065554 | 10 |
10 | 1.570740 | 0.133804 | 0.081549 | 0.007444 | passthrough | None | sqrt | 400 | {'preprocessor__num__scaler': 'passthrough', '... | 0.833333 | ... | 0.916667 | 0.875000 | 0.75 | 0.791667 | 0.913043 | 0.782609 | 0.739130 | 0.810145 | 0.068663 | 11 |
11 | 0.397811 | 0.028023 | 0.028380 | 0.004541 | passthrough | None | sqrt | 100 | {'preprocessor__num__scaler': 'passthrough', '... | 0.791667 | ... | 0.916667 | 0.750000 | 0.75 | 0.791667 | 0.913043 | 0.826087 | 0.782609 | 0.806341 | 0.058889 | 12 |
12 rows × 22 columns
In [69]:
# Evaluate the tuned Naive Bayes model on the held-out test set
best_nb = grid_search_NB.best_estimator_
y_pred = best_nb.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap="Blues")
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

           0       0.88      0.81      0.85        27
           1       0.86      0.91      0.88        33

    accuracy                           0.87        60
   macro avg       0.87      0.86      0.86        60
weighted avg       0.87      0.87      0.87        60
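The standalone metric functions imported at the top (accuracy_score, precision_score, recall_score, f1_score) reproduce the report's accuracy and class-1 row; a sketch:

# Sketch: standalone metrics for the positive class (pos_label defaults to 1)
print("accuracy: ", accuracy_score(y_test, y_pred))
print("precision:", precision_score(y_test, y_pred))
print("recall:   ", recall_score(y_test, y_pred))
print("f1:       ", f1_score(y_test, y_pred))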
In [71]:
y_proba = best_nb.predict_proba(X_test)[:,1]
roc_auc = roc_auc_score(y_test, y_proba)
print("ROC-AUC:", roc_auc)
RocCurveDisplay.from_estimator(best_nb, X_test, y_test)
ROC-AUC: 0.9270482603815937
Out[71]:
<sklearn.metrics._plot.roc_curve.RocCurveDisplay at 0x284b8c0a420>
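Equivalently, the curve can be assembled by hand from the imported (but otherwise unused) roc_curve helper; a minimal sketch:

# Sketch: manual ROC curve from the NB predicted probabilities
fpr, tpr, _ = roc_curve(y_test, y_proba)
plt.plot(fpr, tpr, label=f"Naive Bayes (AUC = {roc_auc:.3f})")
plt.plot([0, 1], [0, 1], linestyle='--', color='grey')  # chance line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()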
In [73]:
# Evaluate the tuned logistic regression on the held-out test set
best_lr = grid_search_LR.best_estimator_
y_pred = best_lr.predict(X_test)
cm_lr = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm_lr)
disp.plot(cmap="Blues")
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

           0       0.91      0.78      0.84        27
           1       0.84      0.94      0.89        33

    accuracy                           0.87        60
   macro avg       0.88      0.86      0.86        60
weighted avg       0.87      0.87      0.87        60
In [75]:
y_proba = best_lr.predict_proba(X_test)[:,1]
roc_auc = roc_auc_score(y_test, y_proba)
print("ROC-AUC:", roc_auc)
RocCurveDisplay.from_estimator(best_lr, X_test, y_test)
ROC-AUC: 0.9461279461279462
Out[75]:
<sklearn.metrics._plot.roc_curve.RocCurveDisplay at 0x284b8c0a030>
In [77]:
# Evaluate the tuned kNN model on the held-out test set
best_knn = grid_search_kNN.best_estimator_
y_pred = best_knn.predict(X_test)
cm_knn = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm_knn)
disp.plot(cmap="Blues")
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

           0       0.88      0.78      0.82        27
           1       0.83      0.91      0.87        33

    accuracy                           0.85        60
   macro avg       0.85      0.84      0.85        60
weighted avg       0.85      0.85      0.85        60
In [79]:
y_proba = best_knn.predict_proba(X_test)[:,1]
roc_auc = roc_auc_score(y_test, y_proba)
print("ROC-AUC:", roc_auc)
RocCurveDisplay.from_estimator(best_knn, X_test, y_test)
ROC-AUC: 0.8838383838383839
Out[79]:
<sklearn.metrics._plot.roc_curve.RocCurveDisplay at 0x284baa90230>
In [81]:
# Evaluate the tuned random forest on the held-out test set
best_rf = grid_search_RF.best_estimator_
y_pred = best_rf.predict(X_test)
cm_rf = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm_rf)
disp.plot(cmap="Blues")
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

           0       0.88      0.81      0.85        27
           1       0.86      0.91      0.88        33

    accuracy                           0.87        60
   macro avg       0.87      0.86      0.86        60
weighted avg       0.87      0.87      0.87        60
In [83]:
y_proba = best_rf.predict_proba(X_test)[:,1]
roc_auc = roc_auc_score(y_test, y_proba)
print("ROC-AUC:", roc_auc)
RocCurveDisplay.from_estimator(best_rf, X_test, y_test)
ROC-AUC: 0.9191919191919192
Out[83]:
<sklearn.metrics._plot.roc_curve.RocCurveDisplay at 0x284bb5c43b0>
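Finally, the four tuned models can be compared on a single ROC plot by passing a shared axis to RocCurveDisplay; a sketch reusing the fitted estimators above:

# Sketch: overlay the ROC curves of all four tuned models
fig, ax = plt.subplots()
for name, est in [('Naive Bayes', best_nb), ('Logistic Regression', best_lr),
                  ('kNN', best_knn), ('Random Forest', best_rf)]:
    RocCurveDisplay.from_estimator(est, X_test, y_test, name=name, ax=ax)
plt.title('ROC Curves of Tuned Models')
plt.show()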