Exploratory Data Analysis

1 Import Requisite Libraries

######################## Standard Library Imports ##############################
import pandas as pd
import numpy as np
from scipy.stats import gaussian_kde
from itertools import combinations
import os
import sys

########################## Plotting Libraries ##################################
import matplotlib.pyplot as plt
import seaborn as sns
import eda_toolkit
from eda_toolkit import (
    ensure_directory,
    kde_distributions,
    box_violin_plot,
    stacked_crosstab_plot,
    flex_corr_matrix,
    box_violin_plot,
    highlight_columns,
    scatter_fit_plot,
    generate_table1,
)

################################################################################

# Make the parent directory importable so that 'functions.py' can be found
sys.path.append(os.pardir)

from functions import *  # import custom functions

# Report the interpreter and toolkit versions used for this analysis
python_version = sys.version.split()[0]
toolkit_version = eda_toolkit.__version__
print(f"This project uses Python {python_version}.")
print(f"This project uses EDA_Toolkit {toolkit_version}.")
This project uses Python 3.11.11.
This project uses EDA_Toolkit 0.0.19.

2 Read in the Data

# Base paths: `base_path` is the parent directory of the current working
# directory; the data and image folders are resolved relative to it.
base_path = os.pardir
data_path = os.path.join(base_path, "data")
data_raw = "../data/"
image_path_png = os.path.join(base_path, "images", "png_images")
image_path_svg = os.path.join(base_path, "images", "svg_images")

# Ensure that each directory exists before anything reads from or writes to it
for required_dir in (data_path, image_path_png, image_path_svg):
    ensure_directory(required_dir)
Directory exists: ../data
Directory exists: ../images/png_images
Directory exists: ../images/svg_images
# Load the EDA dataset and index it by "patient_id", then list its columns
circ_eda_csv = os.path.join(data_path, "circ_eda.csv")
circ_eda = pd.read_csv(circ_eda_csv).set_index("patient_id")
circ_eda.columns.to_list()
['Birthday',
 'Age_years',
 'Weight_kg',
 'BMI',
 'Geographical_Origin',
 'Cultural_Religious_Affiliation',
 'Comorbidities',
 'Preop_drugs_antibiotic',
 'Preop_Blood_Pressure_mmHg',
 'Preop_Heart_Rate_bpm',
 'Preop_Pulse_Ox_Percent',
 'Surgical_Technique',
 'Anesthesia_Type',
 'Intraoperative_drugs',
 'Intraoperative_Blood_Loss_ml',
 'Intraop_Mean_Blood_Pressure_mmHg',
 'Intraop_Mean_Heart_Rate_bpm',
 'Intraop_Mean_Pulse_Ox_Percent',
 'Surgical_Time_min',
 'Functional_Outcomes_Pain',
 'Functional_Outcomes_Bleeding',
 'Functional_Outcomes_Edema',
 'Functional_Outcomes_Infection',
 'Functional_Outcomes_Fast_Recovery',
 'Functional_Outcomes_Cosmetic_Satisfaction',
 'Cost_of_Procedure_euros',
 'Cost_Type',
 'BMI_Category',
 'Preop_MAP',
 'Intraop_MAP',
 'Diabetes',
 'Anesthesia_Type_carbocaine',
 'Anesthesia_Type_lidocaine',
 'BMI_Category_Normal_Weight',
 'BMI_Category_Obese',
 'BMI_Category_Overweight',
 'BMI_Category_Underweight',
 'Intraop_SBP',
 'Intraop_DBP',
 'Preop_SBP',
 'Preop_DBP',
 'Comorbidity_Flag']
# Preview the first five rows as a quick sanity check of the loaded data
circ_eda.head()
Birthday Age_years Weight_kg BMI Geographical_Origin Cultural_Religious_Affiliation Comorbidities Preop_drugs_antibiotic Preop_Blood_Pressure_mmHg Preop_Heart_Rate_bpm ... Anesthesia_Type_lidocaine BMI_Category_Normal_Weight BMI_Category_Obese BMI_Category_Overweight BMI_Category_Underweight Intraop_SBP Intraop_DBP Preop_SBP Preop_DBP Comorbidity_Flag
patient_id
424123959 NaN 22 67 26.17 USA Jewish 0 Cefazolina 130/80 85 ... 1 0 0 1 0 130 90 130 80 0
390469576 NaN 50 90 31.14 Italy Catholic IPA Cefazolina 120/70 65 ... 1 0 1 0 0 120 70 120 70 1
633173792 NaN 70 65 23.88 Italy Catholic DM Cefazolina 110/80 90 ... 1 1 0 0 0 110 80 110 80 1
784928164 NaN 68 78 27.64 Italy Catholic DM Cefazolina 120/90 65 ... 1 0 0 1 0 120 90 120 90 1
936242280 NaN 64 88 31.18 Italy Catholic DM Cefazolina 110/70 79 ... 1 0 1 0 0 110 70 110 70 1

5 rows × 42 columns

# Remove the unused "Birthday" column, then keep only patients aged 18 or older
circ_eda = circ_eda.drop(columns=["Birthday"])
is_adult = circ_eda["Age_years"] >= 18
circ_eda = circ_eda.loc[is_adult]
circ_eda.columns
Index(['Age_years', 'Weight_kg', 'BMI', 'Geographical_Origin',
       'Cultural_Religious_Affiliation', 'Comorbidities',
       'Preop_drugs_antibiotic', 'Preop_Blood_Pressure_mmHg',
       'Preop_Heart_Rate_bpm', 'Preop_Pulse_Ox_Percent', 'Surgical_Technique',
       'Anesthesia_Type', 'Intraoperative_drugs',
       'Intraoperative_Blood_Loss_ml', 'Intraop_Mean_Blood_Pressure_mmHg',
       'Intraop_Mean_Heart_Rate_bpm', 'Intraop_Mean_Pulse_Ox_Percent',
       'Surgical_Time_min', 'Functional_Outcomes_Pain',
       'Functional_Outcomes_Bleeding', 'Functional_Outcomes_Edema',
       'Functional_Outcomes_Infection', 'Functional_Outcomes_Fast_Recovery',
       'Functional_Outcomes_Cosmetic_Satisfaction', 'Cost_of_Procedure_euros',
       'Cost_Type', 'BMI_Category', 'Preop_MAP', 'Intraop_MAP', 'Diabetes',
       'Anesthesia_Type_carbocaine', 'Anesthesia_Type_lidocaine',
       'BMI_Category_Normal_Weight', 'BMI_Category_Obese',
       'BMI_Category_Overweight', 'BMI_Category_Underweight', 'Intraop_SBP',
       'Intraop_DBP', 'Preop_SBP', 'Preop_DBP', 'Comorbidity_Flag'],
      dtype='object')
# Summary table ("Table 1") restricted to continuous variables, with the
# cohort split by the values of "Surgical_Technique".
table1_cont = generate_table1(
    circ_eda, include_types="continuous", groupby_col="Surgical_Technique"
)
Using Welch's t-test for continuous variable: Age_years
Using Welch's t-test for continuous variable: BMI
Using Welch's t-test for continuous variable: Cost_of_Procedure_euros
Using Welch's t-test for continuous variable: Intraop_DBP
Using Welch's t-test for continuous variable: Intraop_MAP
Using Welch's t-test for continuous variable: Intraop_Mean_Heart_Rate_bpm
Using Welch's t-test for continuous variable: Intraop_Mean_Pulse_Ox_Percent
Using Welch's t-test for continuous variable: Intraop_SBP
Using Welch's t-test for continuous variable: Intraoperative_Blood_Loss_ml
Using Welch's t-test for continuous variable: Preop_DBP
Using Welch's t-test for continuous variable: Preop_Heart_Rate_bpm
Using Welch's t-test for continuous variable: Preop_MAP
Using Welch's t-test for continuous variable: Preop_Pulse_Ox_Percent
Using Welch's t-test for continuous variable: Preop_SBP
Using Welch's t-test for continuous variable: Surgical_Time_min
Using Welch's t-test for continuous variable: Weight_kg
# Print the continuous-variable summary table as text
print(table1_cont)
 Variable           | Type       | Mean   | SD     | Median | Min    | Max      | Mode   | Missing (n) | Missing (%) | Count | Proportion (%) | 1 (n = 62)   | 0 (n = 132)   | P-value 
--------------------|------------|--------|--------|--------|--------|----------|--------|-------------|-------------|-------|----------------|--------------|---------------|---------
 Age_years          | Continuous | 43.13  | 21.88  | 34.00  | 18.00  | 93.00    | 18.00  | 0           | 0.00        | 194   | 100.00         | 62 (100.00%) | 132 (100.00%) | 0.00    
 BMI                | Continuous | 24.07  | 3.00   | 23.68  | 17.34  | 36.57    | 21.63  | 0           | 0.00        | 194   | 100.00         | 62 (100.00%) | 132 (100.00%) | 0.97    
 Cost_of_Procedure_ | Continuous | 321.65 | 475.78 | 0.00   | 0.00   | 1,200.00 | 0.00   | 0           | 0.00        | 194   | 100.00         | 62 (100.00%) | 132 (100.00%) | 0.00    
 Intraop_DBP        | Continuous | 82.94  | 15.87  | 90.00  | 10.00  | 100.00   | 90.00  | 0           | 0.00        | 194   | 100.00         | 62 (100.00%) | 132 (100.00%) | 0.00    
 Intraop_MAP        | Continuous | 96.08  | 11.13  | 100.00 | 46.67  | 110.00   | 100.00 | 0           | 0.00        | 194   | 100.00         | 62 (100.00%) | 132 (100.00%) | 0.00    
 Intraop_Mean_Heart | Continuous | 75.93  | 5.68   | 80.00  | 60.00  | 88.00    | 80.00  | 0           | 0.00        | 194   | 100.00         | 62 (100.00%) | 132 (100.00%) | 0.05    
 Intraop_Mean_Pulse | Continuous | 96.69  | 1.78   | 97.00  | 91.00  | 99.00    | 98.00  | 0           | 0.00        | 194   | 100.00         | 62 (100.00%) | 132 (100.00%) | 0.87    
 Intraop_SBP        | Continuous | 122.37 | 10.61  | 120.00 | 100.00 | 150.00   | 120.00 | 0           | 0.00        | 194   | 100.00         | 62 (100.00%) | 132 (100.00%) | 0.21    
 Intraoperative_Blo | Continuous | 7.90   | 15.51  | 0.00   | 0.00   | 100.00   | 0.00   | 0           | 0.00        | 194   | 100.00         | 62 (100.00%) | 132 (100.00%) | 0.00    
 Preop_DBP          | Continuous | 72.79  | 12.06  | 70.00  | 10.00  | 100.00   | 80.00  | 0           | 0.00        | 194   | 100.00         | 62 (100.00%) | 132 (100.00%) | 0.09    
 Preop_Heart_Rate_b | Continuous | 76.85  | 6.77   | 77.00  | 59.00  | 93.00    | 68.00  | 0           | 0.00        | 194   | 100.00         | 62 (100.00%) | 132 (100.00%) | 0.15    
 Preop_MAP          | Continuous | 89.34  | 9.90   | 90.00  | 46.67  | 116.67   | 96.67  | 0           | 0.00        | 194   | 100.00         | 62 (100.00%) | 132 (100.00%) | 0.07    
 Preop_Pulse_Ox_Per | Continuous | 96.78  | 1.72   | 97.00  | 91.00  | 99.00    | 98.00  | 0           | 0.00        | 194   | 100.00         | 62 (100.00%) | 132 (100.00%) | 0.68    
 Preop_SBP          | Continuous | 122.42 | 10.67  | 120.00 | 100.00 | 150.00   | 120.00 | 0           | 0.00        | 194   | 100.00         | 62 (100.00%) | 132 (100.00%) | 0.24    
 Surgical_Time_min  | Continuous | 28.18  | 5.64   | 27.50  | 15.00  | 40.00    | 28.00  | 0           | 0.00        | 194   | 100.00         | 62 (100.00%) | 132 (100.00%) | 0.07    
 Weight_kg          | Continuous | 72.80  | 9.40   | 73.00  | 50.00  | 102.00   | 68.00  | 0           | 0.00        | 194   | 100.00         | 62 (100.00%) | 132 (100.00%) | 0.32    
# Summary table for the categorical variables. It is stored under its own
# name so that the continuous summary held in `table1_cont` is not overwritten
# by a table of a different kind.
table1_cat = generate_table1(circ_eda, include_types="categorical")
table1_cat
Variable Type Mode Missing (n) Missing (%) Count Proportion (%)
0 Geographical_Origin Categorical Italy 0 0.00 194 100.00
1 Cultural_Religious_Affiliation Categorical Catholic 0 0.00 194 100.00
2 Comorbidities Categorical 0.00 0 0.00 194 100.00
3 Preop_drugs_antibiotic Categorical Cefazolina 0 0.00 194 100.00
4 Preop_Blood_Pressure_mmHg Categorical 130/80 0 0.00 194 100.00
5 Anesthesia_Type Categorical lidocaine 0 0.00 194 100.00
6 Intraoperative_drugs Categorical ipnovel, tavor 0 0.00 194 100.00
7 Intraop_Mean_Blood_Pressure_mmHg Categorical 120/90 0 0.00 194 100.00
8 Cost_Type Categorical SSN 0 0.00 194 100.00
9 BMI_Category Categorical Normal_Weight 0 0.00 194 100.00
10 Anesthesia_Type_carbocaine Categorical 0.00 0 0.00 194 100.00
11 Anesthesia_Type_lidocaine Categorical 1.00 0 0.00 194 100.00
12 BMI_Category_Normal_Weight Categorical 1.00 0 0.00 194 100.00
13 BMI_Category_Obese Categorical 0.00 0 0.00 194 100.00
14 BMI_Category_Overweight Categorical 0.00 0 0.00 194 100.00
15 BMI_Category_Underweight Categorical 0.00 0 0.00 194 100.00
16 Comorbidity_Flag Categorical 0.00 0 0.00 194 100.00
17 Diabetes Categorical 0.00 0 0.00 194 100.00
18 Functional_Outcomes_Bleeding Categorical 0.00 0 0.00 194 100.00
19 Functional_Outcomes_Cosmetic_Satisfaction Categorical 1.00 0 0.00 194 100.00
20 Functional_Outcomes_Edema Categorical 0.00 0 0.00 194 100.00
21 Functional_Outcomes_Fast_Recovery Categorical 1.00 0 0.00 194 100.00
22 Functional_Outcomes_Infection Categorical 0.00 0 0.00 194 100.00
23 Functional_Outcomes_Pain Categorical 0.00 0 0.00 194 100.00
24 Surgical_Technique Categorical 0.00 0 0.00 194 100.00

3 Define Age Group

# Bin the continuous age into labelled groups so that it is more manageable
# for visualization and analysis.
#
# The intervals are left-closed / right-open ([18, 30), [30, 40), ...), which
# is what the labels describe ("18-29" covers 18 through 29). With pandas'
# default right-closed bins the first interval would be (18, 30], so patients
# aged exactly 18 would fall outside every bin and receive NaN.
bin_ages = [18, 30, 40, 50, 60, 70, 80, 90, 100]
label_ages = [
    "18-29",
    "30-39",
    "40-49",
    "50-59",
    "60-69",
    "70-79",
    "80-89",
    "90-99",
]

circ_eda["age_group"] = pd.cut(
    circ_eda["Age_years"],
    bins=bin_ages,
    labels=label_ages,
    right=False,  # include the lower edge (18) and exclude the upper edge
)

4 Clinical Characteristics

4.1 Prevalence of Comorbidities

# Bar chart comparing patients with and without any recorded comorbidity
comorbid_color = ["#1f77b4", "#c8544c"]
comorbid_flag = {0: "No Comorbidities", 1: "Comorbidities"}

# Count each flag value, then swap the 0/1 index for readable labels
comorb_val_counts = circ_eda["Comorbidity_Flag"].value_counts()
comorb_val_counts.index = comorb_val_counts.index.map(comorbid_flag)

ax = comorb_val_counts.plot(kind="bar", rot=0, width=0.99, color=comorbid_color)

# Write each count inside its bar, slightly below the top edge
for bar_pos, bar_count in enumerate(comorb_val_counts):
    ax.text(bar_pos, bar_count - 20, str(bar_count), ha="center", color="yellow")

ax.set(
    title="Prevalence of Comorbidities",
    xlabel="Comorbidity Flag",
    ylabel="Comorbidity Count",
)

# Save the figure in both raster and vector form before showing it
for image_dir, extension in ((image_path_png, "png"), (image_path_svg, "svg")):
    plt.savefig(
        os.path.join(image_dir, f"comorbidities_vs_no_comorbidities.{extension}"),
    )
plt.show()

4.2 Comorbidities by Age Group

# Cross-tabulate the raw comorbidity entries against age group, adding
# row/column totals labelled "Total"
pd.crosstab(
    index=circ_eda["Comorbidities"],
    columns=circ_eda["age_group"],
    margins_name="Total",
    margins=True,
)
age_group 18-29 30-39 40-49 50-59 60-69 70-79 80-89 90-99 Total
Comorbidities
0 65 16 9 11 13 7 1 0 122
DM 0 2 0 4 12 2 1 0 21
DM, IPA 0 0 0 1 2 0 0 0 3
DM, IPA, ipercolesterolemia 0 0 0 0 0 1 1 1 3
DM, hydrocele 0 0 0 0 0 1 0 0 1
DM, ipercolesterolemia 0 0 0 0 0 0 0 1 1
IPA 0 0 1 3 3 0 1 0 8
IPA, DM, Ipercolesterolemia 0 0 0 0 0 0 1 0 1
IPA, prec peritonite 0 0 0 0 0 1 0 0 1
LS 0 0 0 1 2 1 0 0 4
cardiac stents 0 0 0 0 0 1 0 0 1
depression 2 2 0 0 0 0 0 0 4
epilessia 0 1 0 0 0 0 0 0 1
hydrocele 2 0 0 0 0 0 0 0 2
m. Parkinson 0 0 0 0 0 1 1 0 2
prec peritonite 1 0 0 0 1 0 0 0 2
s.Gilbert 1 0 0 0 0 0 0 0 1
varicocele 4 0 0 0 0 0 0 0 4
Total 75 21 10 20 33 15 6 2 182
# List the distinct age groups present; a NaN entry here would indicate ages
# that did not fall inside any bin
circ_eda["age_group"].unique()
['18-29', '40-49', '60-69', '50-59', '70-79', '30-39', NaN, '80-89', '90-99']
Categories (8, object): ['18-29' < '30-39' < '40-49' < '50-59' < '60-69' < '70-79' < '80-89' < '90-99']
# Keep only patients with a recorded comorbidity; rows whose "Comorbidities"
# value is the string "0" are excluded.
filtered_df = circ_eda[circ_eda["Comorbidities"] != "0"]

# Fixed, chronological column order for the heatmap
age_group_order = [
    "18-29",
    "30-39",
    "40-49",
    "50-59",
    "60-69",
    "70-79",
    "80-89",
    "90-99",
]

# `fill_value=0` keeps the table integer-typed when an age group has no
# patients left after filtering. Without it, reindex would add an all-NaN
# (float) column and the integer annotation format `fmt="d"` would fail.
sorted_crosstab = pd.crosstab(
    filtered_df["Comorbidities"], filtered_df["age_group"]
).reindex(columns=age_group_order, fill_value=0)

plt.figure(figsize=(9, 6))

# Heatmap of comorbidity counts per age group, annotated with integer counts
sns.heatmap(sorted_crosstab, annot=True, cmap="rocket_r", fmt="d")
plt.xlabel("Age Group")

# Save the figure in both formats before displaying it
plt.savefig(
    os.path.join(image_path_png, "comorbidities_by_age_group.png"),
    bbox_inches="tight",
)
plt.savefig(
    os.path.join(image_path_svg, "comorbidities_by_age_group.svg"),
    bbox_inches="tight",
)
plt.show()

4.3 Comorbidities by Geographical Origin

plt.figure(figsize=(9, 6))

# Heatmap of the crosstab between patient comorbidities and geographical
# origin, annotated with integer counts
sns.heatmap(
    pd.crosstab(circ_eda["Comorbidities"], circ_eda["Geographical_Origin"]),
    annot=True,
    cmap="rocket_r",
    fmt="d",
)

# Title spelling corrected ("Comorbidities"); it is rendered into the saved
# figures, so the typo would otherwise appear in every exported image.
plt.title("Comorbidities by Geographical Origin")
plt.xlabel("Geographical Origin")

# Save the figure in both formats before displaying it
plt.savefig(
    os.path.join(image_path_png, "comorbidities_by_geog_origin.png"),
    bbox_inches="tight",
)
plt.savefig(
    os.path.join(image_path_svg, "comorbidities_by_geog_origin.svg"),
    bbox_inches="tight",
)

plt.show()

5 Overall Distributions

# Variables to plot, split into three groups of eight so that each group
# fills one 2 x 4 grid of distribution plots.
# NOTE(review): "Comorbidities" and "Preop_drugs_antibiotic" hold text values;
# confirm how the plotting helper treats non-numeric columns.
dist_list_1 = [
    "Age_years",
    "BMI",
    "Comorbidities",
    "Preop_drugs_antibiotic",
    "Preop_Heart_Rate_bpm",
    "Preop_Pulse_Ox_Percent",
    "Surgical_Technique",
    "Intraoperative_Blood_Loss_ml",
]

dist_list_2 = [
    "Intraop_Mean_Heart_Rate_bpm",
    "Intraop_Mean_Pulse_Ox_Percent",
    "Surgical_Time_min",
    "Functional_Outcomes_Pain",
    "Functional_Outcomes_Bleeding",
    "Functional_Outcomes_Edema",
    "Functional_Outcomes_Infection",
    "Functional_Outcomes_Fast_Recovery",
]

# The dataframe has no plain "SBP" / "DBP" columns; blood pressure is stored
# as "Intraop_SBP" / "Intraop_DBP" and "Preop_SBP" / "Preop_DBP". The
# intraoperative pair is used here.
# NOTE(review): swap in the "Preop_*" columns if those were the intended ones.
dist_list_3 = [
    "Functional_Outcomes_Cosmetic_Satisfaction",
    "Cost_of_Procedure_euros",
    "Preop_MAP",
    "Intraop_MAP",
    "Anesthesia_Type_lidocaine",
    "Intraop_SBP",
    "Intraop_DBP",
    "Comorbidity_Flag",
]
# Total number of variables across the three groups
len(dist_list_1) + len(dist_list_2) + len(dist_list_3)
24
# Names of all numeric columns in the dataframe.
# NOTE(review): `dist_list` is not passed to the call below, which plots
# `dist_list_1` instead; confirm whether it is still needed.
dist_list = circ_eda.select_dtypes(np.number).columns.to_list()

# First 2 x 4 grid of distribution plots (variables in `dist_list_1`), saved
# to the PNG and SVG image folders as "numeric_distributions_1".
kde_distributions(
    df=circ_eda,
    fill=True,
    n_rows=2,
    n_cols=4,
    h_pad=5,
    fill_alpha=0.60,
    text_wrap=40,
    # grid_figsize=(50, 25),  # Size of the overall grid figure
    vars_of_interest=dist_list_1,
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
    plot_type="both",
    image_filename="numeric_distributions_1",
    bbox_inches="tight",
    # Optional overrides, currently disabled:
    # y_axis_label=" ",
    # plot_title=None,
    bins=10,
    tick_fontsize=14,
    # custom_xlabels={"Age_years": "Age"},
    # custom_titles={"Age_years": None},
    label_fontsize=16,
)

# Second 2 x 4 grid of distribution plots (variables in `dist_list_2`), saved
# to the PNG and SVG image folders as "numeric_distributions_2".
# The repeated `dist_list` recomputation that used to precede this call was
# removed: its result was never used and the dataframe's columns had not
# changed since it was last computed.
kde_distributions(
    df=circ_eda,
    fill=True,
    n_rows=2,
    n_cols=4,
    h_pad=5,
    fill_alpha=0.60,
    text_wrap=30,
    vars_of_interest=dist_list_2,
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
    plot_type="both",
    image_filename="numeric_distributions_2",
    bbox_inches="tight",
    y_axis_label="Density",
    bins=10,
    tick_fontsize=14,
    label_fontsize=16,
)

# Third 2 x 4 grid of distribution plots (variables in `dist_list_3`), saved
# to the PNG and SVG image folders as "numeric_distributions_3".
# The repeated `dist_list` recomputation that used to precede this call was
# removed: its result was never used and the dataframe's columns had not
# changed since it was last computed.
kde_distributions(
    df=circ_eda,
    fill=True,
    n_rows=2,
    n_cols=4,
    h_pad=5,
    fill_alpha=0.60,
    text_wrap=40,
    vars_of_interest=dist_list_3,
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
    plot_type="both",
    image_filename="numeric_distributions_3",
    bbox_inches="tight",
    y_axis_label="Density",
    bins=10,
    tick_fontsize=14,
    label_fontsize=16,
)

# Single distribution plot of surgical time with mean, median and 1/2/3
# standard-deviation markers, saved as "surgical_time_distribution" to the
# PNG and SVG image folders.
kde_distributions(
    df=circ_eda,
    figsize=(10, 6),
    text_wrap=50,
    hist_color="brown",
    bbox_inches="tight",
    vars_of_interest=["Surgical_Time_min"],
    y_axis_label="Density",
    bins=10,
    fill_alpha=0.40,
    plot_type="both",
    # NOTE(review): capitalized "Density"; confirm the toolkit accepts or
    # normalizes this spelling of the statistic name.
    stat="Density",
    label_fontsize=16,  # Font size for axis labels
    tick_fontsize=14,  # Font size for tick labels
    plot_mean=True,
    plot_median=True,
    mean_color="blue",
    image_filename="surgical_time_distribution",
    image_path_svg=image_path_svg,
    custom_xlabels=None,  # no x-axis label overrides
    custom_titles={
        "Surgical_Time_min": "Surgical Time in Minutes"
    },  # title override for this variable
    image_path_png=image_path_png,
    # One color per standard-deviation level, in the same order
    std_dev_levels=[
        1,
        2,
        3,
    ],
    std_color=[
        "purple",
        "green",
        "silver",
    ],
)

6 Correlation Matrix

# Function to create a mock dataset
def create_mock_dataset(rows=100, seed=42):
    """Build a reproducible synthetic patient dataset.

    Values are drawn from a private ``np.random.RandomState(seed)`` instead of
    reseeding NumPy's global generator, so calling this function does not
    disturb random state used elsewhere in the session. A freshly seeded
    ``RandomState`` produces the same stream as ``np.random.seed(seed)``
    followed by the module-level ``np.random.*`` functions, so the generated
    values are unchanged.

    Parameters
    ----------
    rows : int, default 100
        Number of rows (mock patients) to generate.
    seed : int, default 42
        Seed for the random number generator.

    Returns
    -------
    pandas.DataFrame
        One column per mock variable, ``rows`` rows.
    """
    rng = np.random.RandomState(seed)
    # Draw order matters: each column consumes the stream in sequence, so the
    # dictionary order below determines the generated values.
    data = {
        "Age_years": rng.randint(18, 65, size=rows),
        "Weight_kg": rng.uniform(50, 120, size=rows),
        "BMI": rng.uniform(18, 35, size=rows),
        "Comorbidities": rng.choice([0, 1], size=rows),
        "Preop_Heart_Rate_bpm": rng.randint(60, 100, size=rows),
        "Intraop_Mean_Heart_Rate_bpm": rng.randint(70, 110, size=rows),
        "Intraop_Mean_Pulse_Ox_Percent": rng.uniform(90, 100, size=rows),
        "Surgical_Time_min": rng.randint(30, 300, size=rows),
        "Cost_of_Procedure_euros": rng.uniform(5000, 20000, size=rows),
        "SBP": rng.randint(100, 140, size=rows),
        "DBP": rng.randint(60, 90, size=rows),
    }
    return pd.DataFrame(data)


# Generate the dataset
mock_dataset = create_mock_dataset(rows=100)
mock_dataset
Age_years Weight_kg BMI Comorbidities Preop_Heart_Rate_bpm Intraop_Mean_Heart_Rate_bpm Intraop_Mean_Pulse_Ox_Percent Surgical_Time_min Cost_of_Procedure_euros SBP DBP
0 56 70.544172 30.089542 1 84 82 92.944939 291 19005.377356 126 79
1 46 50.985588 31.761518 0 99 92 99.958314 82 5113.015447 132 68
2 32 63.918968 23.927322 1 60 94 96.969251 234 8379.991966 103 76
3 60 99.793937 19.635001 0 75 104 93.842018 155 10480.352295 121 79
4 25 105.312288 33.988895 1 98 99 97.371007 264 12317.147012 101 81
... ... ... ... ... ... ... ... ... ... ... ...
95 24 85.573914 20.322563 1 83 104 94.593468 130 16121.913667 125 80
96 26 98.706896 30.051487 1 82 88 98.420914 35 13597.262739 127 73
97 41 110.085116 27.397940 1 91 89 97.689177 209 19965.389239 120 82
98 18 72.817123 23.040672 1 96 87 90.662360 255 16286.046801 106 75
99 61 65.416873 25.136275 1 71 83 90.458613 258 15604.706919 116 77

100 rows × 11 columns

# `flex_corr_matrix` is also imported in the import block at the top of the
# notebook; this repeat keeps the cell runnable on its own.
from eda_toolkit import flex_corr_matrix

# Columns of `circ_eda` to include in the correlation matrix
feature_list = [
    "Age_years",
    "BMI",
    "Surgical_Technique",
    "Intraoperative_Blood_Loss_ml",
    "Intraop_Mean_Heart_Rate_bpm",
    "Intraop_Mean_Pulse_Ox_Percent",
    "Surgical_Time_min",
    "Diabetes",
    "BMI_Category_Obese",
    "BMI_Category_Overweight",
    "BMI_Category_Underweight",
    "Intraop_SBP",
    "Intraop_DBP",
]
# Correlation heatmap of the selected features on the real data, with the
# color scale fixed to [-1, 1]; figures are saved to both image folders.
flex_corr_matrix(
    df=circ_eda,
    # Alternative column set (mock data), currently disabled:
    # cols=mock_dataset.columns.to_list(),
    cols=feature_list,
    annot=True,
    cmap="viridis",
    figsize=(20, 20),
    # title="US Census Correlation Matrix",
    xlabel_alignment="right",
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
    save_plots=True,
    label_fontsize=20,
    tick_fontsize=20,
    xlabel_rot=30,
    ylabel_rot=0,
    text_wrap=30,
    vmin=-1,
    vmax=1,
    cbar_label="Correlation Index",
    # cbar_padding=0.8,  # Adjust spacing as needed
    # cbar_width_ratio=0.05,  # Adjust width as needed
    triangular=True,
    show_colorbar=True,
)

7 Scatter Plots

7.1 Preoperative vs. Intraoperative Characteristics (Scatterplots)

Assessing the clinical relevance of the proposed correlations requires understanding the physiological interactions between these measurements and their possible implications for patient outcomes. Here’s a brief overview of the clinical plausibility of each correlation:

  1. Preoperative Heart Rate (BPM) vs. Preoperative Pulse Oximetry (SpO2):

    Clinical Relevance: Moderately relevant. Heart rate and oxygen saturation can both be indicators of a patient’s cardiorespiratory status. While there’s no direct causal relationship, abnormalities in one might reflect or affect changes in the other, especially in the context of cardiorespiratory diseases.

  2. Preoperative Heart Rate (BPM) vs. Intraoperative Blood Loss (ml):

    Clinical Relevance: Indirect relevance. Preoperative heart rate could reflect the patient’s stress or anxiety level, potentially influencing blood pressure and vascular tone. However, the correlation with intraoperative blood loss is likely to be influenced by many other factors, making this relationship more complex and indirect.

  3. Preoperative Heart Rate (BPM) vs. Preoperative Mean Arterial Pressure (MAP):

    Clinical Relevance: Highly relevant. There’s a physiological interaction where the heart rate can influence and be influenced by arterial pressure due to cardiac output and vascular resistance factors. This relationship is fundamental in understanding the patient’s hemodynamic status.

  4. Preoperative Heart Rate (BPM) vs. Intraoperative Mean Arterial Pressure (MAP):

    Clinical Relevance: Moderately relevant. Similar to the preoperative MAP, but considering the stress and potential complications during surgery, the correlation might offer insights into how preoperative conditions could affect or predict intraoperative hemodynamic stability.

  5. Preoperative Pulse Ox vs. Intraoperative Blood Loss (ml):

    Clinical Relevance: Indirect relevance. While both metrics are important, the direct correlation between preoperative oxygen saturation and intraoperative blood loss is not straightforward. Other factors, such as the surgical site and technique, significantly influence blood loss.

  6. Preoperative Pulse Ox vs. Preoperative MAP:

    Clinical Relevance: Indirect relevance. Both are vital signs but relate to different physiological aspects (cardiorespiratory efficiency vs. circulatory pressure). The relationship is more about how general health can impact these measurements rather than a direct correlation.

  7. Preoperative Pulse Ox vs. Intraoperative MAP:

    Clinical Relevance: Indirect relevance. This relationship might be more about the underlying health status of the patient and how it could affect or be affected by intraoperative hemodynamic management rather than a direct correlation.

  8. Intraoperative Blood Loss (ml) vs. Preoperative MAP:

    Clinical Relevance: Indirect relevance. Preoperative MAP might influence the body’s response to blood loss (through compensatory mechanisms), but the amount of blood loss is more directly related to the surgical procedure and technique.

  9. Intraoperative Blood Loss (ml) vs. Intraoperative MAP:

    Clinical Relevance: Highly relevant. Significant blood loss can lead to a decrease in MAP due to reduced circulating volume, making this correlation critical for monitoring and managing intraoperative hemodynamics.

  10. Preoperative MAP vs. Intraoperative MAP:

    Clinical Relevance: Highly relevant. Understanding the changes from preoperative to intraoperative MAP can provide insights into the patient’s hemodynamic response to surgery and anesthesia, helping to guide management to maintain stability.

For each of these correlations, it’s important to consider the broader clinical context, including the type of surgery, patient health status, and other concurrent interventions. The significance of these correlations can vary based on specific patient populations and conditions.

# Display labels for the variables being compared, keyed by column name
custom_titles = dict(
    Preop_Heart_Rate_bpm="Preoperative Heart Rate (BPM)",
    Preop_Pulse_Ox_Percent="Preoperative Pulse Oximetry (SpO2)",
    Intraoperative_Blood_Loss_ml="Intraoperative Blood Loss (ML)",
    Preop_MAP="Preoperative Mean Arterial Pressure",
    Intraop_MAP="Intraoperative Mean Arterial Pressure",
)

# The variables to pair up are exactly the labelled columns, in the same order
preop_intraop_values = list(custom_titles)

# Variable pairs left out of the scatter plots, with the reason for each.
# Any additional combos based on further clinical insights can be added here.
combinations_to_omit = [
    # Indirect physiological relationship
    ("Preop_Heart_Rate_bpm", "Preop_Pulse_Ox_Percent"),
    # Indirect and complex relationship
    ("Preop_Heart_Rate_bpm", "Intraoperative_Blood_Loss_ml"),
    # No direct physiological relationship
    ("Preop_Pulse_Ox_Percent", "Intraoperative_Blood_Loss_ml"),
    # Indirect relevance and differing physiological systems
    ("Preop_Pulse_Ox_Percent", "Intraop_MAP"),
    # Complex and indirect influences on outcomes
    ("Intraoperative_Blood_Loss_ml", "Preop_MAP"),
]

# Scatter plots for the remaining pairs, shown as subplots and saved one
# file per pair
scatter_fit_plot(
    df=circ_eda,
    all_vars=preop_intraop_values,
    exclude_combinations=combinations_to_omit,
    label_names=custom_titles,
    show_plot="subplots",
    show_legend=True,
    show_correlation=True,
    add_best_fit_line=True,
    scatter_color="#808080",
    label_fontsize=14,
    tick_fontsize=12,
    text_wrap=40,
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
    save_plots="individual",
)

Saving scatter plot(s): 100%|██████████| 5/5 [00:02<00:00,  2.24it/s]

7.2 Mean Arterial Pressure vs. BMI

# Scatter plots of preoperative and intraoperative MAP (x) against BMI (y),
# shown as subplots and saved one file per pair.
# NOTE(review): `custom_titles` has no entry for "BMI"; confirm how the
# plotting helper labels variables that are missing from `label_names`.
scatter_fit_plot(
    df=circ_eda,
    x_vars=["Preop_MAP", "Intraop_MAP"],
    y_vars=["BMI"],
    label_names=custom_titles,
    show_legend=True,
    show_plot="subplots",
    label_fontsize=14,
    tick_fontsize=12,
    add_best_fit_line=True,
    scatter_color="#808080",
    show_correlation=True,
    text_wrap=40,
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
    save_plots="individual",
)

Saving scatter plot(s): 100%|██████████| 2/2 [00:00<00:00,  2.06it/s]

7.3 BMI by Geographical Origin

# Per-country BMI summary: mean, standard deviation, minimum and maximum
bmi_stat_labels = {
    "mean": "Mean",
    "std": "Standard Deviation",
    "min": "Min",
    "max": "Max",
}
bmi_per_origin = circ_eda.groupby("Geographical_Origin")["BMI"]
bmi_by_geog = bmi_per_origin.agg(list(bmi_stat_labels))
bmi_by_geog = bmi_by_geog.rename(columns=bmi_stat_labels)

# Show "-" wherever a statistic is undefined (NaN) so the table reads cleanly
bmi_by_geog = bmi_by_geog.replace(np.nan, "-")
bmi_by_geog
Mean Standard Deviation Min Max
Geographical_Origin
Algeria 19.370000 - 19.37 19.37
China 23.330000 2.160657 21.63 27.64
Egypt 20.840000 1.598061 19.71 21.97
France 26.030000 - 26.03 26.03
Germany 22.240000 - 22.24 22.24
Italy 24.277706 3.060707 17.34 36.57
Morocco 21.800000 1.682914 20.61 22.99
Pakistan 23.700000 - 23.70 23.70
Philippines 23.440000 - 23.44 23.44
Spain 20.800000 - 20.80 20.80
Thailand 22.660000 - 22.66 22.66
Tunisia 20.960000 - 20.96 20.96
USA 22.988333 2.174667 20.20 26.17
# Horizontal bar chart of the mean BMI for each country of origin
mean_bmi_by_origin = circ_eda.groupby("Geographical_Origin")["BMI"].mean()
mean_bmi_by_origin.plot(kind="barh", width=0.9, rot=0)

plt.title("BMI by Country of Origin")
plt.xlabel("Body Mass Index")
plt.ylabel("Country of Origin")

# Save the figure in both formats before displaying it
for image_dir, extension in ((image_path_png, "png"), (image_path_svg, "svg")):
    plt.savefig(
        os.path.join(image_dir, f"bmi_by_geog_origin.{extension}"),
        bbox_inches="tight",
    )
plt.show()

10 Functional Outcomes by Age

# Negative functional outcomes to break down by age group
neg_outcomes = [
    "Functional_Outcomes_Pain",
    "Functional_Outcomes_Bleeding",
    "Functional_Outcomes_Infection",
]

# Legend labels for each outcome, in the same order as `neg_outcomes`
neg_legend_labels = [
    ["No Pain", "Pain"],
    ["No Bleeding", "Bleeding"],
    ["No Infection", "Infection"],
]

# Plot titles for each outcome, in the same order as `neg_outcomes`
neg_title = [
    "Pain",
    "Bleeding",
    "Infection",
]
# Stacked bar plots of each outcome by age group, saved as PNG and SVG with
# the "Stacked_Bar" filename prefix; the returned value is kept in
# `stacked_crosstabs` (requested via `return_dict=True`).
# NOTE(review): `x`, `y` and `p` look like figure width, height and padding;
# confirm against the toolkit documentation.
stacked_crosstabs = stacked_crosstab_plot(
    df=circ_eda,
    col="age_group",
    func_col=neg_outcomes,
    legend_labels_list=neg_legend_labels,
    title=neg_title,
    kind="bar",
    width=0.8,
    rot=0,  # axis rotation angle
    custom_order=None,
    text_wrap=80,
    color=["#1f77b4", "#c8544c"],
    output="both",
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
    return_dict=True,
    save_formats=["png", "svg"],
    x=14,
    y=10,
    p=12,
    file_prefix="Stacked_Bar",
    logscale=False,
    plot_type="both",
    show_legend=True,
    label_fontsize=12,
    tick_fontsize=12,
)
Plot saved as ../images/png_images/Stacked_Bar_Functional_Outcomes_Pain.png
Plot saved as ../images/svg_images/Stacked_Bar_Functional_Outcomes_Pain.svg

Plot saved as ../images/png_images/Stacked_Bar_Functional_Outcomes_Bleeding.png
Plot saved as ../images/svg_images/Stacked_Bar_Functional_Outcomes_Bleeding.svg

Plot saved as ../images/png_images/Stacked_Bar_Functional_Outcomes_Infection.png
Plot saved as ../images/svg_images/Stacked_Bar_Functional_Outcomes_Infection.svg


Crosstab for Functional_Outcomes_Pain

Functional_Outcomes_Pain  No Pain  Pain  Total  No Pain_%  Pain_%
age_group                                                        
18-29                          69     6     75      92.00    8.00
30-39                          20     1     21      95.24    4.76
40-49                           9     1     10      90.00   10.00
50-59                          17     3     20      85.00   15.00
60-69                          33     0     33     100.00    0.00
70-79                          13     2     15      86.67   13.33
80-89                           4     2      6      66.67   33.33
90-99                           1     1      2      50.00   50.00
Total                         166    16    182      91.21    8.79


Crosstab for Functional_Outcomes_Bleeding

Functional_Outcomes_Bleeding  No Bleeding  Bleeding  Total  No Bleeding_%  \
age_group                                                                   
18-29                                  48        27     75          64.00   
30-39                                  18         3     21          85.71   
40-49                                   8         2     10          80.00   
50-59                                  17         3     20          85.00   
60-69                                  31         2     33          93.94   
70-79                                  12         3     15          80.00   
80-89                                   5         1      6          83.33   
90-99                                   1         1      2          50.00   
Total                                 140        42    182          76.92   

Functional_Outcomes_Bleeding  Bleeding_%  
age_group                                 
18-29                              36.00  
30-39                              14.29  
40-49                              20.00  
50-59                              15.00  
60-69                               6.06  
70-79                              20.00  
80-89                              16.67  
90-99                              50.00  
Total                              23.08  


Crosstab for Functional_Outcomes_Infection

Functional_Outcomes_Infection  No Infection  Infection  Total  No Infection_%  \
age_group                                                                       
18-29                                    67          8     75           89.33   
30-39                                    21          0     21          100.00   
40-49                                    10          0     10          100.00   
50-59                                    20          0     20          100.00   
60-69                                    32          1     33           96.97   
70-79                                    15          0     15          100.00   
80-89                                     5          1      6           83.33   
90-99                                     2          0      2          100.00   
Total                                   172         10    182           94.51   

Functional_Outcomes_Infection  Infection_%  
age_group                                   
18-29                                10.67  
30-39                                 0.00  
40-49                                 0.00  
50-59                                 0.00  
60-69                                 3.03  
70-79                                 0.00  
80-89                                16.67  
90-99                                 0.00  
Total                                 5.49  
stacked_crosstabs
{'Functional_Outcomes_Pain': Functional_Outcomes_Pain  No Pain  Pain  Total  No Pain_%  Pain_%
 age_group                                                        
 18-29                          69     6     75      92.00    8.00
 30-39                          20     1     21      95.24    4.76
 40-49                           9     1     10      90.00   10.00
 50-59                          17     3     20      85.00   15.00
 60-69                          33     0     33     100.00    0.00
 70-79                          13     2     15      86.67   13.33
 80-89                           4     2      6      66.67   33.33
 90-99                           1     1      2      50.00   50.00
 Total                         166    16    182      91.21    8.79,
 'Functional_Outcomes_Bleeding': Functional_Outcomes_Bleeding  No Bleeding  Bleeding  Total  No Bleeding_%  \
 age_group                                                                   
 18-29                                  48        27     75          64.00   
 30-39                                  18         3     21          85.71   
 40-49                                   8         2     10          80.00   
 50-59                                  17         3     20          85.00   
 60-69                                  31         2     33          93.94   
 70-79                                  12         3     15          80.00   
 80-89                                   5         1      6          83.33   
 90-99                                   1         1      2          50.00   
 Total                                 140        42    182          76.92   
 
 Functional_Outcomes_Bleeding  Bleeding_%  
 age_group                                 
 18-29                              36.00  
 30-39                              14.29  
 40-49                              20.00  
 50-59                              15.00  
 60-69                               6.06  
 70-79                              20.00  
 80-89                              16.67  
 90-99                              50.00  
 Total                              23.08  ,
 'Functional_Outcomes_Infection': Functional_Outcomes_Infection  No Infection  Infection  Total  No Infection_%  \
 age_group                                                                       
 18-29                                    67          8     75           89.33   
 30-39                                    21          0     21          100.00   
 40-49                                    10          0     10          100.00   
 50-59                                    20          0     20          100.00   
 60-69                                    32          1     33           96.97   
 70-79                                    15          0     15          100.00   
 80-89                                     5          1      6           83.33   
 90-99                                     2          0      2          100.00   
 Total                                   172         10    182           94.51   
 
 Functional_Outcomes_Infection  Infection_%  
 age_group                                   
 18-29                                10.67  
 30-39                                 0.00  
 40-49                                 0.00  
 50-59                                 0.00  
 60-69                                 3.03  
 70-79                                 0.00  
 80-89                                16.67  
 90-99                                 0.00  
 Total                                 5.49  }
## Persist the crosstab dictionary as a pickle for later use in Dash/Plotly
crosstab_pickle_path = os.path.join(data_raw, "stacked_crosstabs.pkl")
pd.to_pickle(stacked_crosstabs, crosstab_pickle_path)

## Write every crosstab to its own CSV file (named after its key) in data_path
for outcome_name, crosstab_df in stacked_crosstabs.items():
    crosstab_df.to_csv(os.path.join(data_path, f"{outcome_name}.csv"), index=True)

## Show the lowercase names under which the crosstabs are exposed below
for outcome_name in stacked_crosstabs:
    print(outcome_name.lower())
print()

## Bind each crosstab to a notebook-level variable named after its lowercase
## key (e.g. `functional_outcomes_pain`) so it can be displayed directly
for outcome_name, crosstab_df in stacked_crosstabs.items():
    globals()[outcome_name.lower()] = crosstab_df
functional_outcomes_pain
functional_outcomes_bleeding
functional_outcomes_infection
functional_outcomes_pain
Functional_Outcomes_Pain No Pain Pain Total No Pain_% Pain_%
age_group
18-29 69 6 75 92.00 8.00
30-39 20 1 21 95.24 4.76
40-49 9 1 10 90.00 10.00
50-59 17 3 20 85.00 15.00
60-69 33 0 33 100.00 0.00
70-79 13 2 15 86.67 13.33
80-89 4 2 6 66.67 33.33
90-99 1 1 2 50.00 50.00
Total 166 16 182 91.21 8.79
functional_outcomes_bleeding
Functional_Outcomes_Bleeding No Bleeding Bleeding Total No Bleeding_% Bleeding_%
age_group
18-29 48 27 75 64.00 36.00
30-39 18 3 21 85.71 14.29
40-49 8 2 10 80.00 20.00
50-59 17 3 20 85.00 15.00
60-69 31 2 33 93.94 6.06
70-79 12 3 15 80.00 20.00
80-89 5 1 6 83.33 16.67
90-99 1 1 2 50.00 50.00
Total 140 42 182 76.92 23.08
functional_outcomes_infection
Functional_Outcomes_Infection No Infection Infection Total No Infection_% Infection_%
age_group
18-29 67 8 75 89.33 10.67
30-39 21 0 21 100.00 0.00
40-49 10 0 10 100.00 0.00
50-59 20 0 20 100.00 0.00
60-69 32 1 33 96.97 3.03
70-79 15 0 15 100.00 0.00
80-89 5 1 6 83.33 16.67
90-99 2 0 2 100.00 0.00
Total 172 10 182 94.51 5.49
circ_eda["Functional_Outcomes_Cosmetic_Satisfaction"].value_counts()
Functional_Outcomes_Cosmetic_Satisfaction
1    182
0     12
Name: count, dtype: int64
# Each entry keeps an outcome column together with its two legend labels and
# its plot title, so the three parallel lists derived below cannot drift apart.
# NOTE(review): the "Comorbidity_Flag" labels are listed positive-first
# ("Comorbidities", "No Comorbidities"), the reverse of the negative-first
# order used for the other two outcomes -- confirm this matches the flag's
# encoding.
pos_outcome_specs = [
    (
        "Functional_Outcomes_Fast_Recovery",
        ["Not Fast Recovery", "Fast Recovery"],
        "Recovery",
    ),
    (
        "Functional_Outcomes_Cosmetic_Satisfaction",
        ["Not Satisfied", "Satisfied"],
        "Cosmetic Satisfaction",
    ),
    (
        "Comorbidity_Flag",
        ["Comorbidities", "No Comorbidities"],
        "Comorbidities",
    ),
]

# Columns passed as `func_col`
pos_outcomes = [column for column, _, _ in pos_outcome_specs]

# One [first, second] legend-label pair per column
pos_legend_labels = [labels for _, labels, _ in pos_outcome_specs]

# One plot title per column
pos_title = [title for _, _, title in pos_outcome_specs]
# Appearance and output settings for the age-group stacked bar charts
pos_plot_options = {
    "kind": "bar",
    "width": 0.8,
    "rot": 0,  # axis rotation angle
    "custom_order": None,
    "text_wrap": 80,
    "color": ["#c8544c", "#1f77b4"],
    "output": "both",
    "image_path_png": image_path_png,
    "image_path_svg": image_path_svg,
    "return_dict": True,
    "x": 14,
    "y": 10,
    "p": 12,
    "save_formats": ["png", "svg"],
    "file_prefix": "Stacked_Bar",
    "logscale": False,
    "plot_type": "both",
    "show_legend": True,
    "label_fontsize": 12,
    "tick_fontsize": 12,
}

# One stacked bar chart (and crosstab) per column in `pos_outcomes`, broken
# down by age group. Note that this rebinds `stacked_crosstabs`, the name that
# was pickled earlier in the notebook.
stacked_crosstabs = stacked_crosstab_plot(
    df=circ_eda,
    col="age_group",
    func_col=pos_outcomes,
    legend_labels_list=pos_legend_labels,
    title=pos_title,
    **pos_plot_options,
)
Plot saved as ../images/png_images/Stacked_Bar_Functional_Outcomes_Fast_Recovery.png
Plot saved as ../images/svg_images/Stacked_Bar_Functional_Outcomes_Fast_Recovery.svg

Plot saved as ../images/png_images/Stacked_Bar_Functional_Outcomes_Cosmetic_Satisfaction.png
Plot saved as ../images/svg_images/Stacked_Bar_Functional_Outcomes_Cosmetic_Satisfaction.svg

Plot saved as ../images/png_images/Stacked_Bar_Comorbidity_Flag.png
Plot saved as ../images/svg_images/Stacked_Bar_Comorbidity_Flag.svg


Crosstab for Functional_Outcomes_Fast_Recovery

Functional_Outcomes_Fast_Recovery  Not Fast Recovery  Fast Recovery  Total  \
age_group                                                                    
18-29                                              9             66     75   
30-39                                              0             21     21   
40-49                                              0             10     10   
50-59                                              0             20     20   
60-69                                              0             33     33   
70-79                                              0             15     15   
80-89                                              1              5      6   
90-99                                              0              2      2   
Total                                             10            172    182   

Functional_Outcomes_Fast_Recovery  Not Fast Recovery_%  Fast Recovery_%  
age_group                                                                
18-29                                            12.00            88.00  
30-39                                             0.00           100.00  
40-49                                             0.00           100.00  
50-59                                             0.00           100.00  
60-69                                             0.00           100.00  
70-79                                             0.00           100.00  
80-89                                            16.67            83.33  
90-99                                             0.00           100.00  
Total                                             5.49            94.51  


Crosstab for Functional_Outcomes_Cosmetic_Satisfaction

Functional_Outcomes_Cosmetic_Satisfaction  Not Satisfied  Satisfied  Total  \
age_group                                                                    
18-29                                                 10         65     75   
30-39                                                  0         21     21   
40-49                                                  0         10     10   
50-59                                                  0         20     20   
60-69                                                  0         33     33   
70-79                                                  0         15     15   
80-89                                                  1          5      6   
90-99                                                  0          2      2   
Total                                                 11        171    182   

Functional_Outcomes_Cosmetic_Satisfaction  Not Satisfied_%  Satisfied_%  
age_group                                                                
18-29                                                13.33        86.67  
30-39                                                 0.00       100.00  
40-49                                                 0.00       100.00  
50-59                                                 0.00       100.00  
60-69                                                 0.00       100.00  
70-79                                                 0.00       100.00  
80-89                                                16.67        83.33  
90-99                                                 0.00       100.00  
Total                                                 6.04        93.96  


Crosstab for Comorbidity_Flag

Comorbidity_Flag  Comorbidities  No Comorbidities  Total  Comorbidities_%  \
age_group                                                                   
18-29                        65                10     75            86.67   
30-39                        16                 5     21            76.19   
40-49                         9                 1     10            90.00   
50-59                        11                 9     20            55.00   
60-69                        13                20     33            39.39   
70-79                         7                 8     15            46.67   
80-89                         1                 5      6            16.67   
90-99                         0                 2      2             0.00   
Total                       122                60    182            67.03   

Comorbidity_Flag  No Comorbidities_%  
age_group                             
18-29                          13.33  
30-39                          23.81  
40-49                          10.00  
50-59                          45.00  
60-69                          60.61  
70-79                          53.33  
80-89                          83.33  
90-99                         100.00  
Total                          32.97  
# Column, legend labels, and title for the surgical-technique breakdown
surgical_outcomes = ["Surgical_Technique"]
surgical_legend_labels = [["Laser", "Traditional"]]
surgical_title = ["Surgical Technique"]

# Appearance and output settings for the stacked bar chart
surg_tech_plot_options = {
    "kind": "bar",
    "width": 0.8,
    "rot": 0,  # axis rotation angle
    "custom_order": None,
    "text_wrap": 80,
    "color": ["#1f77b4", "#203764"],
    "output": "both",
    "image_path_png": image_path_png,
    "image_path_svg": image_path_svg,
    "return_dict": True,
    "x": 14,
    "y": 10,
    "p": 12,
    "save_formats": ["png", "svg"],
    "file_prefix": "Stacked_Bar",
    "logscale": False,
    "plot_type": "both",
    "show_legend": True,
    "label_fontsize": 12,
    "tick_fontsize": 12,
}

# Stacked bar chart (and crosstab) of surgical technique by age group
stacked_cross_surg_tech = stacked_crosstab_plot(
    df=circ_eda,
    col="age_group",
    func_col=surgical_outcomes,
    legend_labels_list=surgical_legend_labels,
    title=surgical_title,
    **surg_tech_plot_options,
)
Plot saved as ../images/png_images/Stacked_Bar_Surgical_Technique.png
Plot saved as ../images/svg_images/Stacked_Bar_Surgical_Technique.svg


Crosstab for Surgical_Technique

Surgical_Technique  Laser  Traditional  Total  Laser_%  Traditional_%
age_group                                                            
18-29                  68            7     75    90.67           9.33
30-39                  15            6     21    71.43          28.57
40-49                   4            6     10    40.00          60.00
50-59                   5           15     20    25.00          75.00
60-69                   9           24     33    27.27          72.73
70-79                  11            4     15    73.33          26.67
80-89                   6            0      6   100.00           0.00
90-99                   2            0      2   100.00           0.00
Total                 120           62    182    65.93          34.07
## Persist the surgical-technique crosstabs as a pickle for later use in
## Dash/Plotly
surg_tech_pickle_path = os.path.join(data_raw, "stacked_cross_surg_tech.pkl")
pd.to_pickle(stacked_cross_surg_tech, surg_tech_pickle_path)

11 Surgical Techniques

11.1 Number of Procedures in each Surgical Category

# Two-color palette shared by the surgical-technique charts
surg_tech_color = ["#1f77b4", "#203764"]

# Number of procedures per technique, smallest count first
surg_tech_values = circ_eda["Surgical_Technique"].value_counts(ascending=True)

# The bars are ordered by count, not by technique value. Choose each bar's
# color from the technique's rank in sorted order so a technique keeps the
# same color here as in the charts that plot techniques in sorted order
# (groupby / crosstab results), instead of depending on which count is larger.
technique_order = sorted(surg_tech_values.index)
bar_colors = [
    surg_tech_color[technique_order.index(technique) % len(surg_tech_color)]
    for technique in surg_tech_values.index
]

ax = surg_tech_values.plot(
    kind="bar",
    rot=0,
    width=0.99,
    color=bar_colors,
)

# Write each count inside its bar, 40 units below the top
for i, v in enumerate(surg_tech_values):
    ax.text(i, v - 40, str(v), ha="center", color="yellow")
ax.set_title("Number of Procedures in Each Surgical Category")
ax.set_xlabel("Surgical Technique")
ax.set_ylabel("Count")
plt.savefig(os.path.join(image_path_svg, "surgical_technique_by_count.svg"))
plt.savefig(os.path.join(image_path_png, "surgical_technique_by_count.png"))
plt.show()

11.2 Surgical Technique by Mean Time (in Minutes)

# Mean surgical time (in minutes) for each surgical technique
surgical_techniques = circ_eda.groupby("Surgical_Technique")[
    "Surgical_Time_min"
].mean()

# One bar per technique, drawn from the aggregate computed above
ax = surgical_techniques.plot(
    kind="bar",
    rot=0,
    width=0.99,
    color=surg_tech_color,
)

# Write each mean (two decimals) inside its bar, 15 units below the top
for bar_pos, mean_minutes in enumerate(surgical_techniques):
    ax.text(
        bar_pos,
        mean_minutes - 15,
        f"{mean_minutes:.2f}",
        ha="center",
        color="yellow",
    )

ax.set_title("Surgical Technique by Mean Time (in Minutes)")
ax.set_xlabel("Surgical Technique")
ax.set_ylabel("Surgical Time in Minutes")

mean_time_svg = os.path.join(image_path_svg, "surgical_technique_by_mean_time.svg")
mean_time_png = os.path.join(image_path_png, "surgical_technique_by_mean_time.png")
plt.savefig(mean_time_svg)
plt.savefig(mean_time_png)
plt.show()

11.3 Antibiotics by Surgical Technique

# Count of procedures for each (preoperative antibiotic, surgical technique)
# pair: antibiotics on the rows, techniques on the columns
antibiotic_by_surgical_technique = pd.crosstab(
    index=circ_eda["Preop_drugs_antibiotic"],
    columns=circ_eda["Surgical_Technique"],
)

# Display the table
antibiotic_by_surgical_technique
Surgical_Technique 0 1
Preop_drugs_antibiotic
Amoxicillina 7 0
Cefazolina 113 62
Ciprofloxacina 8 0
Gentamicina 4 0
# Grouped bar chart: one group per antibiotic, one bar per surgical technique
ax = antibiotic_by_surgical_technique.plot(kind="bar", rot=0, color=surg_tech_color)

ax.set_title("Type of Antibiotic by Surgical Technique")
ax.set_xlabel("Antibiotic")
ax.set_ylabel("Count")

# Save as PNG first, then SVG, before showing the figure
antibiotic_png = os.path.join(image_path_png, "antibiotic_by_surgical_technique.png")
antibiotic_svg = os.path.join(image_path_svg, "antibiotic_by_surgical_technique.svg")
plt.savefig(antibiotic_png)
plt.savefig(antibiotic_svg)
plt.show()

11.4 Anesthesia by Surgical Technique

# Count of procedures for each (anesthesia type, surgical technique) pair:
# anesthesia types on the rows, techniques on the columns
anesthesia_by_surgical_technique = pd.crosstab(
    index=circ_eda["Anesthesia_Type"],
    columns=circ_eda["Surgical_Technique"],
)

# Display the table
anesthesia_by_surgical_technique
Surgical_Technique 0 1
Anesthesia_Type
carbocaine 4 0
lidocaine 126 62
xilocaine 2 0
# Grouped bar chart: one group per anesthesia type, one bar per technique
ax = anesthesia_by_surgical_technique.plot(kind="bar", rot=0, color=surg_tech_color)

ax.set_title("Type of Anesthesia by Surgical Technique")
ax.set_xlabel("Anesthesia Type")
ax.set_ylabel("Count")

# Save as PNG first, then SVG, before showing the figure
anesthesia_png = os.path.join(image_path_png, "anesthesia_by_surgical_technique.png")
anesthesia_svg = os.path.join(image_path_svg, "anesthesia_by_surgical_technique.svg")
plt.savefig(anesthesia_png)
plt.savefig(anesthesia_svg)
plt.show()

11.5 Box Plot of Surgical Time by Surgical Technique

# Box plot for Surgical_Time_min across different Surgical_Techniques

# `unique()` returns values in order of first appearance, which depends on the
# row order of the data. Sort them so each technique is always paired with the
# same entry of `surg_tech_color`, matching the charts that plot techniques in
# sorted order.
unique_techniques = np.sort(circ_eda["Surgical_Technique"].unique())

# technique value -> color; reused as the palette of later technique charts
technique_colors = dict(zip(unique_techniques, surg_tech_color))
sns.boxplot(
    x="Surgical_Technique",
    y="Surgical_Time_min",
    data=circ_eda,
    hue="Surgical_Technique",
    palette=technique_colors,
    medianprops={"color": "yellow", "linewidth": 1},  # Setting median line props
)
plt.title("Box plot of Surgical Time by Surgical Technique")
plt.ylabel("Surgical Time (min)")
plt.xlabel("Surgical Technique")
plt.xticks(rotation=0)
plt.savefig(
    os.path.join(image_path_png, "surgical_time_by_technique_boxplot.png"),
)
plt.savefig(
    os.path.join(image_path_svg, "surgical_time_by_technique_boxplot.svg"),
)
plt.show()

11.6 Box Plot of Intraoperative Blood Loss by Surgical Technique

# Distribution of Intraoperative_Blood_Loss_ml for each Surgical_Technique,
# colored with the shared technique palette and a yellow median line
median_style = {"color": "yellow", "linewidth": 1}
ax = sns.boxplot(
    data=circ_eda,
    x="Surgical_Technique",
    y="Intraoperative_Blood_Loss_ml",
    hue="Surgical_Technique",
    palette=technique_colors,
    medianprops=median_style,
)
ax.set_title("Box plot of Intraoperative Blood Loss by Surgical Technique")
ax.set_ylabel("Intraoperative Blood Loss (ml)")
ax.set_xlabel("Surgical Technique")
plt.xticks(rotation=0)

# Save as PNG first, then SVG, before showing the figure
blood_loss_png = os.path.join(
    image_path_png, "intraop_blood_loss_by_technique_boxplot.png"
)
blood_loss_svg = os.path.join(
    image_path_svg, "intraop_blood_loss_by_technique_boxplot.svg"
)
plt.savefig(blood_loss_png)
plt.savefig(blood_loss_svg)
plt.show()

11.7 Box Plot of Surgical Time by Anesthesia Type

# Distribution of Surgical_Time_min for each Anesthesia_Type
ax = sns.boxplot(
    data=circ_eda,
    x="Anesthesia_Type",
    y="Surgical_Time_min",
    hue="Anesthesia_Type",
)
ax.set_title("Box plot of Surgical Time by Anesthesia Type")
ax.set_ylabel("Surgical Time (min)")
ax.set_xlabel("Anesthesia Type")
plt.xticks(rotation=0)

# Save as PNG first, then SVG, before showing the figure
anesthesia_time_png = os.path.join(
    image_path_png, "anesthesia_surgical_time_boxplot.png"
)
anesthesia_time_svg = os.path.join(
    image_path_svg, "anesthesia_surgical_time_boxplot.svg"
)
plt.savefig(anesthesia_time_png)
plt.savefig(anesthesia_time_svg)
plt.show()

11.8 Box Plot of Intraoperative Blood Loss by Anesthesia Type

# Distribution of Intraoperative_Blood_Loss_ml for each Anesthesia_Type
ax = sns.boxplot(
    data=circ_eda,
    x="Anesthesia_Type",
    y="Intraoperative_Blood_Loss_ml",
    hue="Anesthesia_Type",
)
ax.set_title("Box plot of Intraoperative Blood Loss by Anesthesia Type")
ax.set_ylabel("Intraoperative Blood Loss (ml)")
ax.set_xlabel("Anesthesia Type")
plt.xticks(rotation=0)

# Save as PNG first, then SVG, before showing the figure
anesthesia_loss_png = os.path.join(image_path_png, "anesthesia_blood_loss_boxplot.png")
anesthesia_loss_svg = os.path.join(image_path_svg, "anesthesia_blood_loss_boxplot.svg")
plt.savefig(anesthesia_loss_png)
plt.savefig(anesthesia_loss_svg)
plt.show()

11.9 Box Plot of Surgical Time (min) by Preoperative Antibiotic

# Distribution of Surgical_Time_min for each preoperative antibiotic
ax = sns.boxplot(
    data=circ_eda,
    x="Preop_drugs_antibiotic",
    y="Surgical_Time_min",
    hue="Preop_drugs_antibiotic",
)
ax.set_title("Box plot of Surgical Time (min) by Preoperative Antibiotic")
ax.set_ylabel("Surgical Time (min)")
ax.set_xlabel("Antibiotic Type")
plt.xticks(rotation=0)

# Save as PNG first, then SVG, before showing the figure
antibiotic_time_png = os.path.join(
    image_path_png, "antibiotic_surgical_time_boxplot.png"
)
antibiotic_time_svg = os.path.join(
    image_path_svg, "antibiotic_surgical_time_boxplot.svg"
)
plt.savefig(antibiotic_time_png)
plt.savefig(antibiotic_time_svg)
plt.show()

11.10 Box Plot of Intraoperative Blood Loss by Preoperative Antibiotic

# Distribution of Intraoperative_Blood_Loss_ml for each preoperative antibiotic
ax = sns.boxplot(
    data=circ_eda,
    x="Preop_drugs_antibiotic",
    y="Intraoperative_Blood_Loss_ml",
    hue="Preop_drugs_antibiotic",
)
ax.set_title("Box plot of Intraoperative Blood Loss by Preoperative Antibiotic")
ax.set_ylabel("Intraoperative Blood Loss (ml)")
ax.set_xlabel("Antibiotic Type")
plt.xticks(rotation=0)

# Save as PNG first, then SVG, before showing the figure
antibiotic_loss_png = os.path.join(image_path_png, "antibiotic_blood_loss_boxplot.png")
antibiotic_loss_svg = os.path.join(image_path_svg, "antibiotic_blood_loss_boxplot.svg")
plt.savefig(antibiotic_loss_png)
plt.savefig(antibiotic_loss_svg)
plt.show()

11.11 Prevalence of Functional Outcomes by Surgical Technique

# List of custom plotting titles
functional_title_list = [
    "Pain",
    "Bleeding",
    "Edema",
    "Infection",
    "Recovery",
    "Satisfaction",
    "Comorbidities",
    "Surgical Technique",
]

# Every column whose name contains "Functional"
functional_list = [col for col in circ_eda.columns if "Functional" in col]

# Display label for each 0/1 code of every functional outcome column
functional_labels = {
    "Functional_Outcomes_Pain": {0: "No Pain", 1: "Pain"},
    "Functional_Outcomes_Bleeding": {0: "No Bleeding", 1: "Bleeding"},
    "Functional_Outcomes_Edema": {0: "No Edema", 1: "Edema"},
    "Functional_Outcomes_Infection": {0: "No Infection", 1: "Infection"},
    "Functional_Outcomes_Fast_Recovery": {0: "Not Fast Recovery", 1: "Fast Recovery"},
    "Functional_Outcomes_Cosmetic_Satisfaction": {
        0: "No Satisfaction",
        1: "Satisfaction",
    },
}

# zip pairs columns with titles by position and stops at the shorter list.
# NOTE(review): this relies on the order of the "Functional" columns in
# circ_eda matching functional_title_list -- confirm.
for item, title in zip(functional_list, functional_title_list):
    # Labels in code order (0 first, then 1)
    label_order = list(functional_labels[item].values())

    # pd.crosstab sorts its row labels alphabetically (e.g. "Bleeding" before
    # "No Bleeding"). Reindex to the code order so the bars and their tick
    # labels stay matched; previously the tick labels were overwritten in code
    # order without reordering the bars, which swapped the labels for such
    # outcomes. fill_value=0 keeps a bar group for a label with no rows.
    outcome_by_technique = pd.crosstab(
        circ_eda[item].map(functional_labels[item]),
        circ_eda["Surgical_Technique"],
    ).reindex(label_order, fill_value=0)

    ax = outcome_by_technique.plot(
        kind="bar",
        rot=0,
        color=technique_colors,
    )

    ax.set_ylabel("Count")
    ax.set_title(f"Prevalence of {title} by Surgical Technique")
    # Tick labels come straight from the reindexed row labels
    ax.set_xlabel(title)

    plt.savefig(
        os.path.join(
            image_path_png, f"Prevalance_of_{title}_by_surgical_technique.png"
        ),
    )

    plt.savefig(
        os.path.join(
            image_path_svg, f"Prevalance_of_{title}_by_surgical_technique.svg"
        ),
    )


plt.show()

12 Socioeconomic Impacts

12.1 Religious Affiliation by Geographical Origin

circ_eda["Cultural_Religious_Affiliation"].unique().tolist()
['Jewish', 'Catholic', 'Atheist', 'Buddhist', 'Orthodox', 'Muslims']
# Contingency table of religious affiliation (rows) by geographical origin
# (columns), with a "Total" margin row and column
ct = pd.crosstab(
    index=circ_eda["Cultural_Religious_Affiliation"],
    columns=circ_eda["Geographical_Origin"],
    margins=True,
    margins_name="Total",
)

# Display the table through highlight_columns, passing the "Total" column and
# the color "brown"
highlight_columns(ct, "Total", color="brown")
Geographical_Origin Algeria China Egypt France Germany Italy Morocco Pakistan Philippines Spain Thailand Tunisia USA Total
Cultural_Religious_Affiliation                            
Atheist 0 0 0 0 1 34 0 0 0 1 0 0 1 37
Buddhist 0 1 0 0 0 1 0 0 1 0 1 0 0 4
Catholic 0 4 0 1 0 131 0 0 0 0 0 0 4 140
Jewish 0 0 0 0 0 3 0 0 0 0 0 0 1 4
Muslims 1 0 2 0 0 0 2 1 0 0 0 1 0 7
Orthodox 0 1 0 0 0 1 0 0 0 0 0 0 0 2
Total 1 6 2 1 1 170 2 1 1 1 1 1 6 194
# create a heatmap of the crosstab between geo. origin (rows) and religion
# (columns)
plt.figure(figsize=(9, 6))
sns.heatmap(
    pd.crosstab(
        circ_eda["Geographical_Origin"], circ_eda["Cultural_Religious_Affiliation"]
    ),
    annot=True,
    cmap="rocket_r",
    fmt="d",
)
plt.title("Cultural Religious Affiliation by Geographical Origin")
# The crosstab columns (religious affiliation) run along the x-axis and its
# index (geographical origin) along the y-axis, so label the axes accordingly
plt.xlabel("Cultural Religious Affiliation")
plt.ylabel("Geographical Origin")
plt.savefig(
    os.path.join(image_path_png, "religion_by_geog_origin.png"),
    bbox_inches="tight",
)
plt.savefig(
    os.path.join(image_path_svg, "religion_by_geog_origin.svg"),
    bbox_inches="tight",
)
plt.show()

12.2 Total Cost by Coverage Type

# Number of procedures per coverage category, smallest count first
total_cost_ins_values = circ_eda["Cost_Type"].value_counts(ascending=True)

# Bar colors, applied in bar order
ins_bar_col = ["#1f77b4", "#c8544c", "#555555"]

ax = total_cost_ins_values.plot(
    kind="barh", rot=0, width=0.99, legend=False, color=ins_bar_col
)

# Write each count inside its bar, right-aligned 5 units before the bar's end
for bar_pos, n_procedures in enumerate(total_cost_ins_values.to_numpy()):
    ax.text(
        n_procedures - 5,
        bar_pos,
        str(n_procedures),
        ha="right",
        va="center",
        color="yellow",
    )

ax.set_title("Total Number of Procedures by Coverage Category")
ax.set_xlabel("Number of Procedures")
ax.set_ylabel("Cost Type")

coverage_count_svg = os.path.join(image_path_svg, "total_number_by_coverage.svg")
coverage_count_png = os.path.join(image_path_png, "total_number_by_coverage.png")
plt.savefig(coverage_count_svg)
plt.savefig(coverage_count_png)
plt.show()

# Summed procedure cost per coverage category, rounded to 2 decimals, as a
# one-column frame named "Total_Cost"
total_cost_by_ins = (
    circ_eda.groupby("Cost_Type")["Cost_of_Procedure_euros"]
    .sum()
    .to_frame()
    .round(2)
    .rename(columns={"Cost_of_Procedure_euros": "Total_Cost"})
)

ax = total_cost_by_ins.plot(
    kind="barh", rot=0, width=0.99, legend=False, color=ins_bar_col
)

# Recolor the bars one by one, cycling through the palette
n_bar_colors = len(ins_bar_col)
for bar_pos, bar in enumerate(ax.patches):
    bar.set_facecolor(ins_bar_col[bar_pos % n_bar_colors])

# Label every non-zero bar: the text starts 10000 units before the bar's end
# (left-aligned), at the bar's vertical position
for bar_pos, (_, cost_row) in enumerate(total_cost_by_ins.iterrows()):
    if not cost_row["Total_Cost"] > 0:
        continue
    ax.text(
        cost_row["Total_Cost"] - 10000,
        bar_pos,
        f"{cost_row['Total_Cost']} €",
        ha="left",
        va="center",
        color="yellow",
    )

ax.set_title("Total Cost of Procedure by Coverage Category")
ax.set_xlabel("Cost (in €)")
ax.set_ylabel("Cost Type")

total_cost_svg = os.path.join(image_path_svg, "total_cost_by_coverage.svg")
total_cost_png = os.path.join(image_path_png, "total_cost_by_coverage.png")
plt.savefig(total_cost_svg)
plt.savefig(total_cost_png)

plt.show()

# Display the table behind the chart
total_cost_by_ins

Total_Cost
Cost_Type
Insurance 24000
Private 38400
SSN 0

12.3 Average Cost by Coverage Type

# Mean procedure cost per coverage category, rounded to 2 decimals, as a
# one-column frame named "Average_Cost"
avg_cost_by_ins = (
    circ_eda.groupby("Cost_Type")["Cost_of_Procedure_euros"]
    .mean()
    .to_frame()
    .round(2)
    .rename(columns={"Cost_of_Procedure_euros": "Average_Cost"})
)

ax = avg_cost_by_ins.plot(kind="barh", rot=0, width=0.99, legend=False)

# The category names shown on the y-axis, as a list (not used further in this
# cell)
y_labels = avg_cost_by_ins.index.tolist()

# Recolor the bars one by one, cycling through the palette
n_bar_colors = len(ins_bar_col)
for bar_pos, bar in enumerate(ax.patches):
    bar.set_facecolor(ins_bar_col[bar_pos % n_bar_colors])

# Label every non-zero bar: the text starts 200 units before the bar's end
# (left-aligned), at the bar's vertical position
for bar_pos, (_, cost_row) in enumerate(avg_cost_by_ins.iterrows()):
    if not cost_row["Average_Cost"] > 0:
        continue
    ax.text(
        cost_row["Average_Cost"] - 200,
        bar_pos,
        f"{cost_row['Average_Cost']} €",
        ha="left",
        va="center",
        color="yellow",
    )

ax.set_title("Average Cost of Procedure by Coverage Category")
ax.set_xlabel("Cost (in €)")
ax.set_ylabel("Cost Type")

avg_cost_svg = os.path.join(image_path_svg, "avg_cost_by_coverage.svg")
avg_cost_png = os.path.join(image_path_png, "avg_cost_by_coverage.png")
plt.savefig(avg_cost_svg)
plt.savefig(avg_cost_png)

plt.show()

# Display the table behind the chart
avg_cost_by_ins

Average_Cost
Cost_Type
Insurance 1043.48
Private 984.62
SSN 0.00

12.4 Number of Patients by Country of Origin

# Number of patients per country of origin, smallest count first, drawn as
# horizontal bars
patients_per_country = circ_eda["Geographical_Origin"].value_counts(ascending=True)
ax = patients_per_country.plot(kind="barh")

ax.set_title("Number of Patients by Country of Origin")
ax.set_xlabel("Number of Patients")
ax.set_ylabel("Country of Origin")

# Save with a tight bounding box: SVG first, then PNG
patients_svg = os.path.join(image_path_svg, "number_patients_by_country.svg")
patients_png = os.path.join(image_path_png, "number_patients_by_country.png")
plt.savefig(patients_svg, bbox_inches="tight")
plt.savefig(patients_png, bbox_inches="tight")
plt.show()

12.5 Cost of Procedure by Country of Origin

# Mean procedure cost per country of origin, sorted ascending, drawn as
# horizontal bars
mean_cost_per_country = (
    circ_eda.groupby("Geographical_Origin")["Cost_of_Procedure_euros"]
    .mean()
    .sort_values()
)
ax = mean_cost_per_country.plot(kind="barh")

ax.set_title("Average Cost of Procedure by Country of Origin")
ax.set_xlabel("Cost (in €)")
ax.set_ylabel("Country of Origin")

# Save with a tight bounding box: SVG first, then PNG
cost_country_svg = os.path.join(image_path_svg, "cost_by_country.svg")
cost_country_png = os.path.join(image_path_png, "cost_by_country.png")
plt.savefig(cost_country_svg, bbox_inches="tight")
plt.savefig(cost_country_png, bbox_inches="tight")
plt.show()