Import Requisite Libraries
######################## Standard Library Imports ##############################
import os
import sys

# FIX: this must run BEFORE `from functions import ...` below — 'functions.py'
# lives in the parent directory, and the append previously came after the
# import that needs it.
sys.path.append(os.path.join(os.pardir))

######################## Third-Party Library Imports ###########################
import pandas as pd
import matplotlib.pyplot as plt

######################## Modeling Library Imports ##############################
import shap
import model_tuner
from model_tuner.pickleObjects import loadObjects
import eda_toolkit
from eda_toolkit import ensure_directory

######################## Local Imports #########################################
from functions import evaluate_kfold_oof, build_multimodel_performance_table

# Record the exact interpreter and library versions used for this analysis.
print(
    f"This project uses:\n\n"
    f"Python {sys.version.split()[0]}\n"
    f"model_tuner {model_tuner.__version__}\n"
    f"eda_toolkit {eda_toolkit.__version__}"
)
/home/lshpaner/Python_Projects/circ_milan/venv_circ_311/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
from .autonotebook import tqdm as notebook_tqdm
This project uses:
Python 3.11.11
model_tuner 0.0.34b1
eda_toolkit 0.0.19
Set Paths & Read in the Data
# Resolve project paths relative to this notebook's working directory:
# `base_path` is the parent of the current working directory.
base_path = os.path.join(os.pardir)

# One level up from 'notebooks', then into the 'data' folder.
data_path = os.path.join(os.pardir, "data")

# Image output folders for modeling plots, one per format.
image_path_png = os.path.join(base_path, "images", "png_images", "modeling")
image_path_svg = os.path.join(base_path, "images", "svg_images", "modeling")

# Create any of the directories above that do not already exist.
for _dir in (data_path, image_path_png, image_path_svg):
    ensure_directory(_dir)
Directory exists: ../data
Directory exists: ../images/png_images/modeling
Directory exists: ../images/svg_images/modeling
# NOTE: overrides the `data_path` defined above; processed data lives here.
data_path = "../data/processed/"
model_path = "../mlruns/models/"

# Feature matrix (includes the one-hot BMI indicators and vitals).
df = pd.read_parquet(os.path.join(data_path, "X.parquet"))
print(f"DataFrame Columns w/ Outcome: \n{df.columns.to_list()}")
print(f"DataFrame Shape: {df.shape}")
DataFrame Columns w/ Outcome:
['Age_years', 'BMI', 'Surgical_Technique', 'Intraoperative_Blood_Loss_ml', 'Intraop_Mean_Heart_Rate_bpm', 'Intraop_Mean_Pulse_Ox_Percent', 'Surgical_Time_min', 'BMI_Category_Obese', 'BMI_Category_Overweight', 'BMI_Category_Underweight', 'Intraop_SBP', 'Intraop_DBP', 'Diabetes']
DataFrame Shape: (194, 13)
# Load features and the bleeding/edema outcome label from disk.
X = pd.read_parquet(os.path.join(data_path, "X.parquet" ))
y = pd.read_parquet(os.path.join(data_path, "y_Bleeding_Edema_Outcome.parquet" ))
# Attach the outcome to df, keeping only rows present in both frames.
# NOTE(review): joins on "patient_id" — presumably a column/index shared by
# both frames; confirm it exists on each. Also note df and X are both read
# from X.parquet above, so df duplicates X before this join.
df = df.join(y, how= "inner" , on= "patient_id" )
Load Models
# Each tuned model was logged to MLflow; load the pickled artifacts by run ID.
# Keys: lr = lr_smote_training, rf = rf_over_training, svm = svm_orig_training.
_artifact_paths = {
    "lr": "./192577440515948778/f9e6938832d8403b96c13485638d7ff2/artifacts/lr_Bleeding_Edema_Outcome/model.pkl",
    "rf": "./192577440515948778/fa0e2889515d4f2cb2e37b9a8feef84d/artifacts/rf_Bleeding_Edema_Outcome/model.pkl",
    "svm": "./192577440515948778/936e228e12834c519a472f4a3556db66/artifacts/svm_Bleeding_Edema_Outcome/model.pkl",
}
_loaded = {
    key: loadObjects(os.path.join(model_path, rel_path))
    for key, rel_path in _artifact_paths.items()
}
model_lr = _loaded["lr"]
model_rf = _loaded["rf"]
model_svm = _loaded["svm"]
Object loaded!
Object loaded!
Object loaded!
Set-up Pipelines, Model Titles, and Thresholds
# Models and display titles, kept in the same order for downstream plotting.
pipelines_or_models = [model_lr, model_rf, model_svm]

model_titles = [
    "Logistic Regression",
    "Random Forest Classifier",
    "Support Vector Machines",
]

# Tuned decision threshold per model: each model_tuner object stores its
# threshold in a mapping, so take the first (only) value.
thresholds = {
    title: next(iter(model.threshold.values()))
    for title, model in zip(model_titles, pipelines_or_models)
}
# Sanity check: class balance of each one-hot BMI indicator column.
bmi_indicator_cols = [col for col in X.columns if col.startswith("BMI_")]
for col in bmi_indicator_cols:
    print(f"Value Counts for column {col}:\n")
    print(X[col].value_counts())
    print("\n")
Value Counts for column BMI_Category_Obese:
BMI_Category_Obese
0 183
1 11
Name: count, dtype: int64
Value Counts for column BMI_Category_Overweight:
BMI_Category_Overweight
0 141
1 53
Name: count, dtype: int64
Value Counts for column BMI_Category_Underweight:
BMI_Category_Underweight
0 190
1 4
Name: count, dtype: int64
SHAP Summary Plot
SHAP (SHapley Additive exPlanations) Set-up
# --- SHAP set-up for the SVM model ---

# Run X through the model's preprocessing / feature-selection steps so SHAP
# sees the same matrix the classifier was fit on.
preprocessing = model_svm.get_preprocessing_and_feature_selection_pipeline()
X_transformed = preprocessing.transform(X)

# KernelExplainer cost grows with sample count; subsample for tractable runtime.
sample_size = 100
X_sample = shap.utils.sample(X_transformed, sample_size, random_state=42)

# The fitted classifier itself (pipeline step keyed by the estimator name).
fitted_clf = model_svm.estimator.named_steps[model_svm.estimator_name]

def model_predict(X):
    """Return P(class 1) only — KernelExplainer expects a 1-D output."""
    return fitted_clf.predict_proba(X)[:, 1]

# Model-agnostic explainer over the sampled background data.
explainer = shap.KernelExplainer(
    model_predict, X_sample, feature_names=model_svm.get_feature_names()
)

# SHAP values for the sampled rows (swap in X_transformed for the full set).
shap_values = explainer.shap_values(X_sample)
100%|██████████| 100/100 [03:17<00:00, 1.97s/it]
SHAP Beeswarm Plot
# SHAP beeswarm plot: per-feature distribution of SHAP values across samples.
shap.summary_plot(
    shap_values,
    X_sample,
    feature_names=model_svm.get_feature_names(),
    show=False,
)
plt.savefig(os.path.join(image_path_png, "shap_summary_beeswarm.png"), dpi=600)
# FIX: the SVG copy was previously written with a .png extension/format.
plt.savefig(os.path.join(image_path_svg, "shap_summary_beeswarm.svg"), dpi=600)
SHAP Bar Plot
# SHAP bar plot: mean |SHAP value| per feature (global importance ranking).
shap.summary_plot(
    shap_values,
    X_sample,
    feature_names=model_svm.get_feature_names(),
    plot_type="bar",
    show=False,
)
plt.savefig(os.path.join(image_path_png, "shap_summary_bar.png"), dpi=600)
# FIX: the SVG copy was previously written with a .png extension/format.
plt.savefig(os.path.join(image_path_svg, "shap_summary_bar.svg"), dpi=600)
Plot SVM Decision Boundary
from project_functions import plot_svm_decision_boundary_2d

# 2-D SVM decision boundary over two raw features (the helper resolves the
# model internally — no model argument is passed).
plot_svm_decision_boundary_2d(
    X=X,
    y=y,
    feature_pair=("Intraoperative_Blood_Loss_ml", "Surgical_Technique"),
    title="SVM Decision Boundary: Intraoperative Blood Loss (ml) vs. Surgical Technique",
    image_path_svg=os.path.join(image_path_svg, "svm_decision_surface_2d.svg"),
)
from project_functions import plot_svm_decision_boundary_2d

# Same 2-D boundary as above, but with the SVM margin rendered as well.
plot_svm_decision_boundary_2d(
    X=X,
    y=y,
    feature_pair=("Intraoperative_Blood_Loss_ml", "Surgical_Technique"),
    title="SVM Decision Boundary: Intraoperative Blood Loss (ml) vs. Surgical Technique",
    margin=True,
    image_path_svg=os.path.join(image_path_svg, "svm_decision_surface_2d_margin.svg"),
)
from project_functions import plot_svm_decision_surface_3d

# Static 3-D decision surface over the same feature pair, saved in both formats.
plot_svm_decision_surface_3d(
    X=X,
    y=y,
    feature_pair=("Intraoperative_Blood_Loss_ml", "Surgical_Technique"),
    title="3D SVM Decision Boundary (Intraoperative Blood Loss (ml) vs. Surgical Technique)",
    image_path_png=os.path.join(image_path_png, "svm_decision_surface_3d.png"),
    image_path_svg=os.path.join(image_path_svg, "svm_decision_surface_3d.svg"),
)
from project_functions import plot_svm_decision_surface_3d_plotly

# Interactive (Plotly) 3-D decision surface, exported as a standalone HTML file.
plot_svm_decision_surface_3d_plotly(
    X=df,
    y=df["Bleeding_Edema_Outcome"],
    feature_pair=("Intraoperative_Blood_Loss_ml", "Surgical_Technique"),
    # FIX: dropped the f-prefix — the literal contains no placeholders (F541).
    title="Interactive 3D SVM Decision Boundary:<br>Intraoperative Blood "
    "Loss (ml) vs. Surgical Technique",
    html_path=os.path.join(image_path_svg, "svm_decision_surface_3d_plotly.html"),
)
WebGL is not supported by your browser - visit https://get.webgl.org for more info
Support Vectors (No Complications) Support Vectors (Complications) −3 −2 −1 0 1 Decision f(x) Interactive 3D SVM Decision Boundary: Intraoperative Blood Loss (ml) vs. Surgical Technique
Saved interactive plot to ../images/svg_images/modeling/svm_decision_surface_3d_plotly.html
Calibration
# Overlayed calibration (reliability) curves for all three models.
from model_metrics import show_calibration_curve

show_calibration_curve(
    model=pipelines_or_models,
    X=X,
    y=y,
    model_title=model_titles,
    overlay=True,
    title="",
    save_plot=True,
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
    text_wrap=40,
    # Per-model curve styling, keyed by model title.
    # FIX: the previous key "Decision Tree" matched no entry in `model_titles`
    # (there is no decision tree in this notebook), so the Random Forest curve
    # went unstyled; renamed to "Random Forest Classifier".
    curve_kwgs={
        "Logistic Regression": {"color": "blue", "linewidth": 1},
        "Support Vector Machines": {
            "color": "red",
            "linewidth": 1.5,
        },
        "Random Forest Classifier": {
            "color": "lightblue",
            "linestyle": "--",
            "linewidth": 1.5,
        },
    },
    figsize=(8, 6),
    label_fontsize=10,
    tick_fontsize=10,
    bins=10,
    show_brier_score=True,
    brier_decimals=3,
    subplots=False,
    linestyle_kwgs={"color": "black"},
)
Running k-fold model metrics...
Processing Folds: 100%|██████████| 10/10 [00:02<00:00, 3.42it/s]
Running k-fold model metrics...
Processing Folds: 100%|██████████| 10/10 [00:02<00:00, 3.65it/s]
Running k-fold model metrics...
Processing Folds: 100%|██████████| 10/10 [00:00<00:00, 39.68it/s]
Confusion Matrices
from model_metrics import show_confusion_matrix

# Confusion matrices for all three models side by side, each evaluated at its
# tuned threshold, with a printed classification report per model.
show_confusion_matrix(
    model=pipelines_or_models,
    X=X,
    y=y,
    model_title=model_titles,
    # NOTE(review): a single dict of all per-model thresholds wrapped in a
    # one-element list — confirm this is the shape show_confusion_matrix
    # expects (vs. one threshold entry per model).
    model_threshold=[thresholds],
    cmap="Blues",
    text_wrap=40,
    save_plot=True,
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
    grid=True,
    n_cols=3,
    n_rows=1,
    figsize=(4, 4),
    show_colorbar=False,
    label_fontsize=14,
    tick_fontsize=12,
    inner_fontsize=12,
    class_report=True,
)
Running k-fold model metrics...
Processing Folds: 100%|██████████| 10/10 [00:00<00:00, 30.43it/s]
Confusion Matrix for Logistic Regression:
Predicted 0 Predicted 1
Actual 0 97 39
Actual 1 6 52
Classification Report for Logistic Regression:
precision recall f1-score support
0 0.94 0.71 0.81 136
1 0.57 0.90 0.70 58
accuracy 0.77 194
macro avg 0.76 0.80 0.75 194
weighted avg 0.83 0.77 0.78 194
Running k-fold model metrics...
Processing Folds: 100%|██████████| 10/10 [00:02<00:00, 3.74it/s]
Confusion Matrix for Random Forest Classifier:
Predicted 0 Predicted 1
Actual 0 116 20
Actual 1 10 48
Classification Report for Random Forest Classifier:
precision recall f1-score support
0 0.92 0.85 0.89 136
1 0.71 0.83 0.76 58
accuracy 0.85 194
macro avg 0.81 0.84 0.82 194
weighted avg 0.86 0.85 0.85 194
Running k-fold model metrics...
Processing Folds: 100%|██████████| 10/10 [00:00<00:00, 53.64it/s]
Confusion Matrix for Support Vector Machines:
Predicted 0 Predicted 1
Actual 0 117 19
Actual 1 8 50
Classification Report for Support Vector Machines:
precision recall f1-score support
0 0.94 0.86 0.90 136
1 0.72 0.86 0.79 58
accuracy 0.86 194
macro avg 0.83 0.86 0.84 194
weighted avg 0.87 0.86 0.86 194
ROC AUC Curves
from model_metrics import show_roc_curve

# Per-model ROC curves, one subplot per model (AUC printed for each).
show_roc_curve(
    model=pipelines_or_models,
    X=X,
    y=y,
    overlay=False,
    model_title=model_titles,
    decimal_places=3,
    save_plot=True,
    subplots=True,
    n_cols=3,
    figsize=(12, 4),
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
)
Running k-fold model metrics...
Processing Folds: 100%|██████████| 10/10 [00:00<00:00, 42.06it/s]
AUC for Logistic Regression: 0.900
Running k-fold model metrics...
Processing Folds: 100%|██████████| 10/10 [00:02<00:00, 4.42it/s]
AUC for Random Forest Classifier: 0.887
Running k-fold model metrics...
Processing Folds: 100%|██████████| 10/10 [00:00<00:00, 63.01it/s]
AUC for Support Vector Machines: 0.907
# Overlayed ROC curves for all models on a single axis.
show_roc_curve(
    model=pipelines_or_models,
    X=X,
    y=y,
    overlay=True,
    model_title=model_titles,
    title="AUC ROC - All Models",
    # FIX: key renamed from "Random Forest" to "Random Forest Classifier" to
    # match `model_titles`; the old key matched no model, so that curve was
    # drawn with default styling.
    curve_kwgs={
        "Logistic Regression": {"color": "blue", "linewidth": 1},
        "Random Forest Classifier": {"color": "lightblue", "linewidth": 1},
        "Support Vector Machines": {
            "color": "red",
            "linestyle": "-",
            "linewidth": 2,
        },
    },
    linestyle_kwgs={"color": "grey", "linestyle": "--"},
    save_plot=True,
    subplots=False,
    decimal_places=3,
    figsize=(8, 6),
    label_fontsize=16,
    tick_fontsize=13,
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
)
Running k-fold model metrics...
Processing Folds: 100%|██████████| 10/10 [00:00<00:00, 39.27it/s]
AUC for Logistic Regression: 0.900
Running k-fold model metrics...
Processing Folds: 100%|██████████| 10/10 [00:02<00:00, 4.51it/s]
AUC for Random Forest Classifier: 0.887
Running k-fold model metrics...
Processing Folds: 100%|██████████| 10/10 [00:00<00:00, 48.48it/s]
AUC for Support Vector Machines: 0.907
Precision-Recall Curves
from model_metrics import show_pr_curve

# Per-model precision-recall curves, one subplot per model (AP printed each).
show_pr_curve(
    model=pipelines_or_models,
    X=X,
    y=y,
    model_title=model_titles,
    decimal_places=3,
    overlay=False,
    subplots=True,
    save_plot=True,
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
    figsize=(12, 4),
    n_cols=3,
)
Running k-fold model metrics...
Processing Folds: 100%|██████████| 10/10 [00:00<00:00, 39.78it/s]
Average Precision for Logistic Regression: 0.809
Running k-fold model metrics...
Processing Folds: 100%|██████████| 10/10 [00:01<00:00, 5.19it/s]
Average Precision for Random Forest Classifier: 0.737
Running k-fold model metrics...
Processing Folds: 100%|██████████| 10/10 [00:00<00:00, 62.43it/s]
Average Precision for Support Vector Machines: 0.832
# Overlayed precision-recall curves for all models on a single axis.
show_pr_curve(
    model=pipelines_or_models,
    X=X,
    y=y,
    overlay=True,
    model_title=model_titles,
    title="Precision-Recall - All Models",
    # FIX: key renamed from "Random Forest" to "Random Forest Classifier" to
    # match `model_titles`; the old key matched no model, so that curve was
    # drawn with default styling.
    curve_kwgs={
        "Logistic Regression": {"color": "blue", "linewidth": 1},
        "Random Forest Classifier": {"color": "lightblue", "linewidth": 1},
        "Support Vector Machines": {
            "color": "red",
            "linestyle": "-",
            "linewidth": 2,
        },
    },
    save_plot=True,
    subplots=False,
    decimal_places=3,
    figsize=(8, 6),
    label_fontsize=16,
    tick_fontsize=13,
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
)
Running k-fold model metrics...
Processing Folds: 100%|██████████| 10/10 [00:00<00:00, 40.17it/s]
Average Precision for Logistic Regression: 0.809
Running k-fold model metrics...
Processing Folds: 100%|██████████| 10/10 [00:02<00:00, 4.64it/s]
Average Precision for Random Forest Classifier: 0.736
Running k-fold model metrics...
Processing Folds: 100%|██████████| 10/10 [00:00<00:00, 42.88it/s]
Average Precision for Support Vector Machines: 0.832