Exploratory Data Analysis

Import Requisite Libraries

######################## Standard Library Imports ##############################
import pandas as pd
import numpy as np
from scipy.stats import gaussian_kde
from itertools import combinations
import os
import sys

########################## Plotting Libraries ##################################
import matplotlib.pyplot as plt
import seaborn as sns
import eda_toolkit
from eda_toolkit import (
    ensure_directory,
    kde_distributions,
    box_violin_plot,
    stacked_crosstab_plot,
    flex_corr_matrix,
    box_violin_plot,
    highlight_columns,
    scatter_fit_plot,
    generate_table1,
)

################################################################################

# Put the parent directory on the import path so that the project's
# `core.functions` module can be imported from here
sys.path.append(os.pardir)

from core.functions import *  # import custom functions

# Report the interpreter and toolkit versions used for this run
python_version = sys.version.split()[0]
toolkit_version = eda_toolkit.__version__
print(f"This project uses Python {python_version}.")
print(f"This project uses EDA_Toolkit {toolkit_version}.")

This project uses Python 3.11.0.
This project uses EDA_Toolkit 0.0.19.

Read in the Data

# Base locations; `base_path` is the parent of the current working directory
base_path = os.pardir
data_path = os.path.join(base_path, "data")
data_raw = "../data/"
image_path_png = os.path.join(base_path, "images", "png_images")
image_path_svg = os.path.join(base_path, "images", "svg_images")

# Ensure that each directory exists
for required_dir in (data_path, image_path_png, image_path_svg):
    ensure_directory(required_dir)

Directory exists: ../data
Directory exists: ../images/png_images
Directory exists: ../images/svg_images

# read in the data and index it by "patient_id"
circ_eda_csv = os.path.join(data_path, "circ_eda.csv")
circ_eda = pd.read_csv(circ_eda_csv).set_index("patient_id")

# list every column name so the available fields can be reviewed
circ_eda.columns.to_list()

['Birthday',
 'Age_years',
 'Weight_kg',
 'BMI',
 'Geographical_Origin',
 'Cultural_Religious_Affiliation',
 'Comorbidities',
 'Preop_drugs_antibiotic',
 'Preop_Blood_Pressure_mmHg',
 'Preop_Heart_Rate_bpm',
 'Preop_Pulse_Ox_Percent',
 'Surgical_Technique',
 'Anesthesia_Type',
 'Intraoperative_drugs',
 'Intraoperative_Blood_Loss_ml',
 'Intraop_Mean_Blood_Pressure_mmHg',
 'Intraop_Mean_Heart_Rate_bpm',
 'Intraop_Mean_Pulse_Ox_Percent',
 'Surgical_Time_min',
 'Functional_Outcomes_Pain',
 'Functional_Outcomes_Bleeding',
 'Functional_Outcomes_Edema',
 'Functional_Outcomes_Infection',
 'Functional_Outcomes_Fast_Recovery',
 'Functional_Outcomes_Cosmetic_Satisfaction',
 'Cost_of_Procedure_euros',
 'Cost_Type',
 'BMI_Category',
 'Preop_MAP',
 'Intraop_MAP',
 'Diabetes',
 'Anesthesia_Type_carbocaine',
 'Anesthesia_Type_lidocaine',
 'BMI_Category_Normal_Weight',
 'BMI_Category_Obese',
 'BMI_Category_Overweight',
 'BMI_Category_Underweight',
 'Intraop_SBP',
 'Intraop_DBP',
 'Preop_SBP',
 'Preop_DBP',
 'Comorbidity_Flag']

# inspect the first five rows of the dataframe (head() defaults to n=5)
circ_eda.head()

	Birthday	Age_years	Weight_kg	BMI	Geographical_Origin	Cultural_Religious_Affiliation	Comorbidities	Preop_drugs_antibiotic	Preop_Blood_Pressure_mmHg	Preop_Heart_Rate_bpm	...	Anesthesia_Type_lidocaine	BMI_Category_Normal_Weight	BMI_Category_Obese	BMI_Category_Overweight	BMI_Category_Underweight	Intraop_SBP	Intraop_DBP	Preop_SBP	Preop_DBP	Comorbidity_Flag
patient_id
424123959	NaN	22	67	26.17	USA	Jewish	0	Cefazolina	130/80	85	...	1	0	0	1	0	130	90	130	80	0
390469576	NaN	50	90	31.14	Italy	Catholic	IPA	Cefazolina	120/70	65	...	1	0	1	0	0	120	70	120	70	1
633173792	NaN	70	65	23.88	Italy	Catholic	DM	Cefazolina	110/80	90	...	1	1	0	0	0	110	80	110	80	1
784928164	NaN	68	78	27.64	Italy	Catholic	DM	Cefazolina	120/90	65	...	1	0	0	1	0	120	90	120	90	1
936242280	NaN	64	88	31.18	Italy	Catholic	DM	Cefazolina	110/70	79	...	1	0	1	0	0	110	70	110	70	1

5 rows × 42 columns

# "Birthday" is not used in the analysis, so remove it
circ_eda = circ_eda.drop(columns="Birthday")

# keep only patients aged 18 or older
is_adult = circ_eda["Age_years"] >= 18
circ_eda = circ_eda[is_adult]

circ_eda.columns

Index(['Age_years', 'Weight_kg', 'BMI', 'Geographical_Origin',
       'Cultural_Religious_Affiliation', 'Comorbidities',
       'Preop_drugs_antibiotic', 'Preop_Blood_Pressure_mmHg',
       'Preop_Heart_Rate_bpm', 'Preop_Pulse_Ox_Percent', 'Surgical_Technique',
       'Anesthesia_Type', 'Intraoperative_drugs',
       'Intraoperative_Blood_Loss_ml', 'Intraop_Mean_Blood_Pressure_mmHg',
       'Intraop_Mean_Heart_Rate_bpm', 'Intraop_Mean_Pulse_Ox_Percent',
       'Surgical_Time_min', 'Functional_Outcomes_Pain',
       'Functional_Outcomes_Bleeding', 'Functional_Outcomes_Edema',
       'Functional_Outcomes_Infection', 'Functional_Outcomes_Fast_Recovery',
       'Functional_Outcomes_Cosmetic_Satisfaction', 'Cost_of_Procedure_euros',
       'Cost_Type', 'BMI_Category', 'Preop_MAP', 'Intraop_MAP', 'Diabetes',
       'Anesthesia_Type_carbocaine', 'Anesthesia_Type_lidocaine',
       'BMI_Category_Normal_Weight', 'BMI_Category_Obese',
       'BMI_Category_Overweight', 'BMI_Category_Underweight', 'Intraop_SBP',
       'Intraop_DBP', 'Preop_SBP', 'Preop_DBP', 'Comorbidity_Flag'],
      dtype='object')

# Table 1 for the continuous variables, grouped by "Surgical_Technique"
table1_cont = generate_table1(
    circ_eda,
    include_types="continuous",
    groupby_col="Surgical_Technique",
)

Using Welch's t-test for continuous variable: Age_years
Using Welch's t-test for continuous variable: BMI
Using Welch's t-test for continuous variable: Cost_of_Procedure_euros
Using Welch's t-test for continuous variable: Intraop_DBP
Using Welch's t-test for continuous variable: Intraop_MAP
Using Welch's t-test for continuous variable: Intraop_Mean_Heart_Rate_bpm
Using Welch's t-test for continuous variable: Intraop_Mean_Pulse_Ox_Percent
Using Welch's t-test for continuous variable: Intraop_SBP
Using Welch's t-test for continuous variable: Intraoperative_Blood_Loss_ml
Using Welch's t-test for continuous variable: Preop_DBP
Using Welch's t-test for continuous variable: Preop_Heart_Rate_bpm
Using Welch's t-test for continuous variable: Preop_MAP
Using Welch's t-test for continuous variable: Preop_Pulse_Ox_Percent
Using Welch's t-test for continuous variable: Preop_SBP
Using Welch's t-test for continuous variable: Surgical_Time_min
Using Welch's t-test for continuous variable: Weight_kg

# show the continuous-variable summary table as plain text
print(table1_cont)

 Variable           | Type       | Mean   | SD     | Median | Min    | Max      | Mode   | Missing (n) | Missing (%) | Count | Proportion (%) | 1 (n = 62)   | 0 (n = 132)   | P-value 
--------------------|------------|--------|--------|--------|--------|----------|--------|-------------|-------------|-------|----------------|--------------|---------------|---------
 Age_years          | Continuous | 43.13  | 21.88  | 34.00  | 18.00  | 93.00    | 18.00  | 0           | 0.00        | 194   | 100.00         | 62 (100.00%) | 132 (100.00%) | 0.00    
 BMI                | Continuous | 24.07  | 3.00   | 23.68  | 17.34  | 36.57    | 21.63  | 0           | 0.00        | 194   | 100.00         | 62 (100.00%) | 132 (100.00%) | 0.97    
 Cost_of_Procedure_ | Continuous | 321.65 | 475.78 | 0.00   | 0.00   | 1,200.00 | 0.00   | 0           | 0.00        | 194   | 100.00         | 62 (100.00%) | 132 (100.00%) | 0.00    
 Intraop_DBP        | Continuous | 82.94  | 15.87  | 90.00  | 10.00  | 100.00   | 90.00  | 0           | 0.00        | 194   | 100.00         | 62 (100.00%) | 132 (100.00%) | 0.00    
 Intraop_MAP        | Continuous | 96.08  | 11.13  | 100.00 | 46.67  | 110.00   | 100.00 | 0           | 0.00        | 194   | 100.00         | 62 (100.00%) | 132 (100.00%) | 0.00    
 Intraop_Mean_Heart | Continuous | 75.93  | 5.68   | 80.00  | 60.00  | 88.00    | 80.00  | 0           | 0.00        | 194   | 100.00         | 62 (100.00%) | 132 (100.00%) | 0.05    
 Intraop_Mean_Pulse | Continuous | 96.69  | 1.78   | 97.00  | 91.00  | 99.00    | 98.00  | 0           | 0.00        | 194   | 100.00         | 62 (100.00%) | 132 (100.00%) | 0.87    
 Intraop_SBP        | Continuous | 122.37 | 10.61  | 120.00 | 100.00 | 150.00   | 120.00 | 0           | 0.00        | 194   | 100.00         | 62 (100.00%) | 132 (100.00%) | 0.21    
 Intraoperative_Blo | Continuous | 7.90   | 15.51  | 0.00   | 0.00   | 100.00   | 0.00   | 0           | 0.00        | 194   | 100.00         | 62 (100.00%) | 132 (100.00%) | 0.00    
 Preop_DBP          | Continuous | 72.79  | 12.06  | 70.00  | 10.00  | 100.00   | 80.00  | 0           | 0.00        | 194   | 100.00         | 62 (100.00%) | 132 (100.00%) | 0.09    
 Preop_Heart_Rate_b | Continuous | 76.85  | 6.77   | 77.00  | 59.00  | 93.00    | 68.00  | 0           | 0.00        | 194   | 100.00         | 62 (100.00%) | 132 (100.00%) | 0.15    
 Preop_MAP          | Continuous | 89.34  | 9.90   | 90.00  | 46.67  | 116.67   | 96.67  | 0           | 0.00        | 194   | 100.00         | 62 (100.00%) | 132 (100.00%) | 0.07    
 Preop_Pulse_Ox_Per | Continuous | 96.78  | 1.72   | 97.00  | 91.00  | 99.00    | 98.00  | 0           | 0.00        | 194   | 100.00         | 62 (100.00%) | 132 (100.00%) | 0.68    
 Preop_SBP          | Continuous | 122.42 | 10.67  | 120.00 | 100.00 | 150.00   | 120.00 | 0           | 0.00        | 194   | 100.00         | 62 (100.00%) | 132 (100.00%) | 0.24    
 Surgical_Time_min  | Continuous | 28.18  | 5.64   | 27.50  | 15.00  | 40.00    | 28.00  | 0           | 0.00        | 194   | 100.00         | 62 (100.00%) | 132 (100.00%) | 0.07    
 Weight_kg          | Continuous | 72.80  | 9.40   | 73.00  | 50.00  | 102.00   | 68.00  | 0           | 0.00        | 194   | 100.00         | 62 (100.00%) | 132 (100.00%) | 0.32

# Summary table of the categorical variables. It is stored under its own name
# so that the continuous-variable table held in `table1_cont` is not
# overwritten by a table of a different kind.
table1_cat = generate_table1(circ_eda, include_types="categorical")
table1_cat

	Variable	Type	Mode	Count	Proportion (%)
0	Geographical_Origin	Categorical	Italy	194	100.00
1	Cultural_Religious_Affiliation	Categorical	Catholic	194	100.00
2	Comorbidities	Categorical	0.00	194	100.00
3	Preop_drugs_antibiotic	Categorical	Cefazolina	194	100.00
4	Preop_Blood_Pressure_mmHg	Categorical	130/80	194	100.00
5	Anesthesia_Type	Categorical	lidocaine	194	100.00
6	Intraoperative_drugs	Categorical	ipnovel, tavor	194	100.00
7	Intraop_Mean_Blood_Pressure_mmHg	Categorical	120/90	194	100.00
8	Cost_Type	Categorical	SSN	194	100.00
9	BMI_Category	Categorical	Normal_Weight	194	100.00
10	Anesthesia_Type_carbocaine	Categorical	0.00	194	100.00
11	Anesthesia_Type_lidocaine	Categorical	1.00	194	100.00
12	BMI_Category_Normal_Weight	Categorical	1.00	194	100.00
13	BMI_Category_Obese	Categorical	0.00	194	100.00
14	BMI_Category_Overweight	Categorical	0.00	194	100.00
15	BMI_Category_Underweight	Categorical	0.00	194	100.00
16	Comorbidity_Flag	Categorical	0.00	194	100.00
17	Diabetes	Categorical	0.00	194	100.00
18	Functional_Outcomes_Bleeding	Categorical	0.00	194	100.00
19	Functional_Outcomes_Cosmetic_Satisfaction	Categorical	1.00	194	100.00
20	Functional_Outcomes_Edema	Categorical	0.00	194	100.00
21	Functional_Outcomes_Fast_Recovery	Categorical	1.00	194	100.00
22	Functional_Outcomes_Infection	Categorical	0.00	194	100.00
23	Functional_Outcomes_Pain	Categorical	0.00	194	100.00
24	Surgical_Technique	Categorical	0.00	194	100.00

Define Age Group

# Bucket the continuous ages into groups (mostly decade-wide, starting at 18)
# so that age is easier to visualize and analyze
bin_ages = [18, 30, 40, 50, 60, 70, 80, 90, 100]

# Each label names the inclusive range of its bin, e.g. [18, 30) -> "18-29"
label_ages = [
    f"{lower}-{upper - 1}" for lower, upper in zip(bin_ages[:-1], bin_ages[1:])
]

# right=False makes every bin closed on the left and open on the right,
# which is what the labels above describe
circ_eda["age_group"] = pd.cut(
    circ_eda["Age_years"],
    bins=bin_ages,
    right=False,
    include_lowest=True,
    labels=label_ages,
)

Clinical Characteristics

Prevalence of Comorbidities

comorbid_color = ["#1f77b4", "#c8544c"]
comorbid_flag = {0: "No Comorbidities", 1: "Comorbidities"}

# value_counts() orders its result by frequency; sort by the flag value instead
# so that each colour in `comorbid_color` is always paired with the same
# category, whichever of the two happens to be more common
comorb_val_counts = circ_eda["Comorbidity_Flag"].value_counts().sort_index()
comorb_val_counts.index = comorb_val_counts.index.map(comorbid_flag)
ax = comorb_val_counts.plot(
    kind="bar",
    rot=0,
    width=0.99,
    color=comorbid_color,
)

# write each count inside its bar, 20 count units below the top of the bar
for i, v in enumerate(comorb_val_counts):
    ax.text(i, v - 20, str(v), ha="center", color="yellow")
ax.set_title("Prevalence of Comorbidities")
ax.set_xlabel("Comorbidity Flag")
ax.set_ylabel("Comorbidity Count")

# save the figure in both formats before showing it
plt.savefig(
    os.path.join(image_path_png, "comorbidities_vs_no_comorbidities.png"),
)
plt.savefig(
    os.path.join(image_path_svg, "comorbidities_vs_no_comorbidities.svg"),
)
plt.show()

Comorbidities by Age Group

# check which age groups actually occur in the data
circ_eda["age_group"].unique()

['18-29', '50-59', '70-79', '60-69', '40-49', '30-39', '90-99', '80-89']
Categories (8, object): ['18-29' < '30-39' < '40-49' < '50-59' < '60-69' < '70-79' < '80-89' < '90-99']

# patients per age group (missing values counted too), sorted by the index
circ_eda["age_group"].value_counts(dropna=False).sort_index()

age_group
18-29    85
30-39    22
40-49    10
50-59    17
60-69    30
70-79    20
80-89     7
90-99     3
Name: count, dtype: int64

# Keep only patients with a recorded comorbidity. The column is compared as
# stripped strings (the same normalisation the geographical-origin heatmap
# uses) so that an integer 0, "0" and " 0 " all count as "no comorbidity".
has_comorbidity = circ_eda["Comorbidities"].astype(str).str.strip() != "0"
filtered_df = circ_eda[has_comorbidity]

# Age groups in ascending order. `age_group` is an ordered categorical, so the
# crosstab columns below already follow this order.
age_group_order = [
    "18-29",
    "30-39",
    "40-49",
    "50-59",
    "60-69",
    "70-79",
    "80-89",
    "90-99",
]

# Count patients for every (comorbidity, age group) pair
sorted_crosstab = pd.crosstab(
    filtered_df["Comorbidities"], filtered_df["age_group"]
)


plt.figure(figsize=(9, 6))

# Create the heatmap from the crosstab; fmt="d" prints the counts as integers
sns.heatmap(
    sorted_crosstab,
    annot=True,
    cmap="rocket_r",
    fmt="d",
)
# plt.title("Comorbidities by Age Group")
plt.xlabel("Age Group")

# Save the image, assuming image_path_png and image_path_svg are already defined
plt.savefig(
    os.path.join(image_path_png, "comorbidities_by_age_group.png"),
    bbox_inches="tight",
)
plt.savefig(
    os.path.join(image_path_svg, "comorbidities_by_age_group.svg"),
    bbox_inches="tight",
)
plt.show()

# sanity check: total number of rows and the number of patients per age group
print(len(circ_eda))
print(circ_eda["age_group"].value_counts().sort_index())

194
age_group
18-29    85
30-39    22
40-49    10
50-59    17
60-69    30
70-79    20
80-89     7
90-99     3
Name: count, dtype: int64

Comorbidities by Geographical Origin

plt.figure(figsize=(9, 6))

# Cross-tabulate comorbidity against country of origin. The comorbidity values
# are cast to stripped strings so that the "0" (no comorbidity) row can be
# dropped reliably afterwards.
ct = pd.crosstab(
    circ_eda["Comorbidities"].astype(str).str.strip(),
    circ_eda["Geographical_Origin"],
)

# errors="ignore" keeps this a no-op when there is no "0" row at all
ct = ct.drop(index="0", errors="ignore")

sns.heatmap(
    ct,
    annot=True,
    cmap="rocket_r",
    fmt="d",
)

# Title spelling corrected ("Comorbidites" -> "Comorbidities")
plt.title("Comorbidities by Geographical Origin")
plt.xlabel("Geographical Origin")
plt.savefig(
    os.path.join(image_path_png, "comorbidities_by_geog_origin.png"),
    bbox_inches="tight",
)

plt.savefig(
    os.path.join(image_path_svg, "comorbidities_by_geog_origin.svg"),
    bbox_inches="tight",
)

plt.show()

Overall Distributions

# Variables for the distribution grids, eight per list (one 2 x 4 grid each)
dist_list_1 = [
    "Age_years",
    "BMI",
    "Comorbidities",
    "Preop_drugs_antibiotic",
    "Preop_Heart_Rate_bpm",
    "Preop_Pulse_Ox_Percent",
    "Surgical_Technique",
    "Intraoperative_Blood_Loss_ml",
]

dist_list_2 = [
    "Intraop_Mean_Heart_Rate_bpm",
    "Intraop_Mean_Pulse_Ox_Percent",
    "Surgical_Time_min",
    "Functional_Outcomes_Pain",
    "Functional_Outcomes_Bleeding",
    "Functional_Outcomes_Edema",
    "Functional_Outcomes_Infection",
    "Functional_Outcomes_Fast_Recovery",
]

# NOTE(review): this list previously named "SBP" and "DBP", which are not
# columns of `circ_eda` (only the Preop_/Intraop_ prefixed versions exist).
# The intraoperative readings are used here to mirror the correlation-matrix
# feature list; confirm whether the preoperative readings were intended.
dist_list_3 = [
    "Functional_Outcomes_Cosmetic_Satisfaction",
    "Cost_of_Procedure_euros",
    "Preop_MAP",
    "Intraop_MAP",
    "Anesthesia_Type_lidocaine",
    "Intraop_SBP",
    "Intraop_DBP",
    "Comorbidity_Flag",
]

# total number of variables across the three grids
len(dist_list_1) + len(dist_list_2) + len(dist_list_3)

# Names of all numeric columns (not used by the grids below, which plot the
# hand-picked lists instead)
dist_list = circ_eda.select_dtypes(np.number).columns.to_list()

# Keyword arguments shared by all three 2 x 4 distribution grids
shared_kde_kwargs = dict(
    df=circ_eda,
    fill=True,
    n_rows=2,
    n_cols=4,
    h_pad=5,
    fill_alpha=0.60,
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
    plot_type="both",
    bbox_inches="tight",
    bins=10,
    tick_fontsize=14,
    label_fontsize=16,
)

# Per-grid settings: the variables, the output file name and the options that
# differ between grids (the first grid keeps the default y-axis label)
kde_grids = [
    (dist_list_1, "numeric_distributions_1", {"text_wrap": 40}),
    (
        dist_list_2,
        "numeric_distributions_2",
        {"text_wrap": 30, "y_axis_label": "Density"},
    ),
    (
        dist_list_3,
        "numeric_distributions_3",
        {"text_wrap": 40, "y_axis_label": "Density"},
    ),
]

for grid_vars, grid_filename, grid_overrides in kde_grids:
    kde_distributions(
        vars_of_interest=grid_vars,
        image_filename=grid_filename,
        **shared_kde_kwargs,
        **grid_overrides,
    )

# Single-variable distribution of surgical time, with the mean, median and
# standard-deviation options switched on
kde_distributions(
    df=circ_eda,
    vars_of_interest=["Surgical_Time_min"],
    # labels
    custom_titles={"Surgical_Time_min": "Surgical Time in Minutes"},
    custom_xlabels=None,
    y_axis_label="Density",
    # histogram / density settings
    plot_type="both",
    stat="Density",
    bins=10,
    hist_color="brown",
    fill_alpha=0.40,
    # summary statistics drawn on the plot
    plot_mean=True,
    plot_median=True,
    mean_color="blue",
    std_dev_levels=[1, 2, 3],
    std_color=["purple", "green", "silver"],
    # layout
    figsize=(10, 6),
    text_wrap=50,
    label_fontsize=16,  # Font size for axis labels
    tick_fontsize=14,  # Font size for tick labels
    # output
    bbox_inches="tight",
    image_filename="surgical_time_distribution",
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
)

Correlation Matrix

# Function to create a mock dataset
def create_mock_dataset(rows=100, seed=42):
    """Build a reproducible DataFrame of random, patient-like measurements.

    The values are drawn from a private ``RandomState`` seeded with ``seed``,
    which yields the same numbers as seeding the global NumPy generator with
    ``np.random.seed(seed)`` but leaves the global random state untouched.

    Parameters
    ----------
    rows : int, default 100
        Number of rows to generate.
    seed : int, default 42
        Seed for the random number generator.

    Returns
    -------
    pandas.DataFrame
        One column per simulated measurement, ``rows`` rows long.
    """
    rng = np.random.RandomState(seed)
    # The draws happen in dictionary order, so the order of the keys below
    # determines which random numbers each column receives.
    data = {
        "Age_years": rng.randint(18, 65, size=rows),
        "Weight_kg": rng.uniform(50, 120, size=rows),
        "BMI": rng.uniform(18, 35, size=rows),
        "Comorbidities": rng.choice([0, 1], size=rows),
        "Preop_Heart_Rate_bpm": rng.randint(60, 100, size=rows),
        "Intraop_Mean_Heart_Rate_bpm": rng.randint(70, 110, size=rows),
        "Intraop_Mean_Pulse_Ox_Percent": rng.uniform(90, 100, size=rows),
        "Surgical_Time_min": rng.randint(30, 300, size=rows),
        "Cost_of_Procedure_euros": rng.uniform(5000, 20000, size=rows),
        "SBP": rng.randint(100, 140, size=rows),
        "DBP": rng.randint(60, 90, size=rows),
    }
    return pd.DataFrame(data)


# Generate the dataset (100 rows, default seed) and display it.
# In this section `mock_dataset` is only referenced by a commented-out `cols=`
# argument of the correlation-matrix call; the plots use `circ_eda`.
mock_dataset = create_mock_dataset(rows=100)

mock_dataset

	Age_years	Weight_kg	BMI	Comorbidities	Preop_Heart_Rate_bpm	Intraop_Mean_Heart_Rate_bpm	Intraop_Mean_Pulse_Ox_Percent	Surgical_Time_min	Cost_of_Procedure_euros	SBP	DBP
0	56	70.544172	30.089542	1	84	82	92.944939	291	19005.377356	126	79
1	46	50.985588	31.761518	0	99	92	99.958314	82	5113.015447	132	68
2	32	63.918968	23.927322	1	60	94	96.969251	234	8379.991966	103	76
3	60	99.793937	19.635001	0	75	104	93.842018	155	10480.352295	121	79
4	25	105.312288	33.988895	1	98	99	97.371007	264	12317.147012	101	81
...	...	...	...	...	...	...	...	...	...	...	...
95	24	85.573914	20.322563	1	83	104	94.593468	130	16121.913667	125	80
96	26	98.706896	30.051487	1	82	88	98.420914	35	13597.262739	127	73
97	41	110.085116	27.397940	1	91	89	97.689177	209	19965.389239	120	82
98	18	72.817123	23.040672	1	96	87	90.662360	255	16286.046801	106	75
99	61	65.416873	25.136275	1	71	83	90.458613	258	15604.706919	116	77

100 rows × 11 columns

from eda_toolkit import flex_corr_matrix

# Columns of `circ_eda` to include in the correlation matrix
feature_list = [
    # patient characteristics
    "Age_years",
    "BMI",
    # procedure
    "Surgical_Technique",
    "Intraoperative_Blood_Loss_ml",
    "Intraop_Mean_Heart_Rate_bpm",
    "Intraop_Mean_Pulse_Ox_Percent",
    "Surgical_Time_min",
    # flags
    "Diabetes",
    "BMI_Category_Obese",
    "BMI_Category_Overweight",
    "BMI_Category_Underweight",
    # intraoperative blood pressure
    "Intraop_SBP",
    "Intraop_DBP",
]

flex_corr_matrix(
    df=circ_eda,
    # cols=mock_dataset.columns.to_list(),
    cols=feature_list,
    triangular=True,
    annot=True,
    # colour scale fixed to the full correlation range
    cmap="viridis",
    vmin=-1,
    vmax=1,
    show_colorbar=True,
    cbar_label="Correlation Index",
    # layout and label formatting
    figsize=(20, 20),
    label_fontsize=20,
    tick_fontsize=20,
    text_wrap=30,
    xlabel_alignment="right",
    xlabel_rot=30,
    ylabel_rot=0,
    # output
    save_plots=True,
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
)

Scatter Plots

Preoperative vs. Intraoperative Characteristics (Scatterplots)

Assessing the clinical relevance of the proposed correlations requires understanding the physiological interactions between these measurements and their possible implications for patient outcomes. Here’s a brief overview of how clinically plausible each correlation is:

Preoperative Heart Rate (BPM) vs. Preoperative Pulse Oximetry (SpO2):

Clinical Relevance: Moderately relevant. Heart rate and oxygen saturation can both be indicators of a patient’s cardiorespiratory status. While there’s no direct causal relationship, abnormalities in one might reflect or affect changes in the other, especially in the context of cardiorespiratory diseases.
Preoperative Heart Rate (BPM) vs. Intraoperative Blood Loss (ml):

Clinical Relevance: Indirect relevance. Preoperative heart rate could reflect the patient’s stress or anxiety level, potentially influencing blood pressure and vascular tone. However, the correlation with intraoperative blood loss is likely to be influenced by many other factors, making this relationship more complex and indirect.
Preoperative Heart Rate (BPM) vs. Preoperative Mean Arterial Pressure (MAP):

Clinical Relevance: Highly relevant. There’s a physiological interaction where the heart rate can influence and be influenced by arterial pressure due to cardiac output and vascular resistance factors. This relationship is fundamental in understanding the patient’s hemodynamic status.
Preoperative Heart Rate (BPM) vs. Intraoperative Mean Arterial Pressure (MAP):

Clinical Relevance: Moderately relevant. Similar to the preoperative MAP, but considering the stress and potential complications during surgery, the correlation might offer insights into how preoperative conditions could affect or predict intraoperative hemodynamic stability.
Preoperative Pulse Ox vs. Intraoperative Blood Loss (ml):

Clinical Relevance: Indirect relevance. While both metrics are important, the direct correlation between preoperative oxygen saturation and intraoperative blood loss is not straightforward. Other factors, such as the surgical site and technique, significantly influence blood loss.
Preoperative Pulse Ox vs. Preoperative MAP:

Clinical Relevance: Indirect relevance. Both are vital signs but relate to different physiological aspects (cardiorespiratory efficiency vs. circulatory pressure). The relationship is more about how general health can impact these measurements rather than a direct correlation.
Preoperative Pulse Ox vs. Intraoperative MAP:

Clinical Relevance: Indirect relevance. This relationship might be more about the underlying health status of the patient and how it could affect or be affected by intraoperative hemodynamic management rather than a direct correlation.
Intraoperative Blood Loss (ml) vs. Preoperative MAP:

Clinical Relevance: Indirect relevance. Preoperative MAP might influence the body’s response to blood loss (through compensatory mechanisms), but the amount of blood loss is more directly related to the surgical procedure and technique.
Intraoperative Blood Loss (ml) vs. Intraoperative MAP:

Clinical Relevance: Highly relevant. Significant blood loss can lead to a decrease in MAP due to reduced circulating volume, making this correlation critical for monitoring and managing intraoperative hemodynamics.
Preoperative MAP vs. Intraoperative MAP:

Clinical Relevance: Highly relevant. Understanding the changes from preoperative to intraoperative MAP can provide insights into the patient’s hemodynamic response to surgery and anesthesia, helping to guide management to maintain stability.

For each of these correlations, it’s important to consider the broader clinical context, including the type of surgery, patient health status, and other concurrent interventions. The significance of these correlations can vary based on specific patient populations and conditions.

# Readable display names for the variables plotted below
custom_titles = {
    "Preop_Heart_Rate_bpm": "Preoperative Heart Rate (BPM)",
    "Preop_Pulse_Ox_Percent": "Preoperative Pulse Oximetry (SpO2)",
    "Intraoperative_Blood_Loss_ml": "Intraoperative Blood Loss (ML)",
    "Preop_MAP": "Preoperative Mean Arterial Pressure",
    "Intraop_MAP": "Intraoperative Mean Arterial Pressure",
}

# The variables to pair up: exactly the keys above, in the same order
preop_intraop_values = list(custom_titles)

# Variable pairs that are left out of the scatter plots, with the reason
combinations_to_omit = [
    # Indirect physiological relationship
    ("Preop_Heart_Rate_bpm", "Preop_Pulse_Ox_Percent"),
    # Indirect and complex relationship
    ("Preop_Heart_Rate_bpm", "Intraoperative_Blood_Loss_ml"),
    # No direct physiological relationship
    ("Preop_Pulse_Ox_Percent", "Intraoperative_Blood_Loss_ml"),
    # Indirect relevance and differing physiological systems
    ("Preop_Pulse_Ox_Percent", "Intraop_MAP"),
    # Complex and indirect influences on outcomes
    ("Intraoperative_Blood_Loss_ml", "Preop_MAP"),
    # Additional combinations based on further clinical insight go here
]

scatter_fit_plot(
    df=circ_eda,
    all_vars=preop_intraop_values,
    exclude_combinations=combinations_to_omit,
    label_names=custom_titles,
    # appearance
    scatter_color="#808080",
    add_best_fit_line=True,
    show_correlation=True,
    show_legend=True,
    show_plot="subplots",
    label_fontsize=14,
    tick_fontsize=12,
    text_wrap=40,
    # output
    save_plots="individual",
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
)

Saving scatter plot(s): 100%|██████████| 5/5 [00:00<00:00,  6.71it/s]

Mean Arterial Pressure vs. BMI

# BMI (y-axis) against preoperative and intraoperative MAP (x-axis)
scatter_fit_plot(
    df=circ_eda,
    x_vars=["Preop_MAP", "Intraop_MAP"],
    y_vars=["BMI"],
    label_names=custom_titles,
    # appearance
    scatter_color="#808080",
    add_best_fit_line=True,
    show_correlation=True,
    show_legend=True,
    show_plot="subplots",
    label_fontsize=14,
    tick_fontsize=12,
    text_wrap=40,
    # output
    save_plots="individual",
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
)

Saving scatter plot(s): 100%|██████████| 2/2 [00:00<00:00,  6.39it/s]

BMI by Geographical Origin

# BMI summary per country of origin
bmi_stats = circ_eda.groupby("Geographical_Origin")["BMI"].agg(
    ["mean", "std", "min", "max"]
)
bmi_stats.columns = ["Mean", "Standard Deviation", "Min", "Max"]

# Undefined values (e.g. the standard deviation of a single patient) are
# shown as "-"
bmi_by_geog = bmi_stats.replace(np.nan, "-")
bmi_by_geog

	Mean	Standard Deviation	Min	Max
Geographical_Origin
Algeria	19.370000	-	19.37	19.37
China	23.330000	2.160657	21.63	27.64
Egypt	20.840000	1.598061	19.71	21.97
France	26.030000	-	26.03	26.03
Germany	22.240000	-	22.24	22.24
Italy	24.277706	3.060707	17.34	36.57
Morocco	21.800000	1.682914	20.61	22.99
Pakistan	23.700000	-	23.70	23.70
Philippines	23.440000	-	23.44	23.44
Spain	20.800000	-	20.80	20.80
Thailand	22.660000	-	22.66	22.66
Tunisia	20.960000	-	20.96	20.96
USA	22.988333	2.174667	20.20	26.17

# Horizontal bar chart of the mean BMI for each country of origin
mean_bmi_by_country = circ_eda.groupby("Geographical_Origin")["BMI"].mean()
mean_bmi_by_country.plot.barh(width=0.9, rot=0)

plt.title("BMI by Country of Origin")
plt.xlabel("Body Mass Index")
plt.ylabel("Country of Origin")

# save as PNG first, then as SVG
for image_dir, extension in ((image_path_png, "png"), (image_path_svg, "svg")):
    plt.savefig(
        os.path.join(image_dir, f"bmi_by_geog_origin.{extension}"),
        bbox_inches="tight",
    )
plt.show()

Age-Related Distributions

# Single-variable distribution of age, with the mean, median and
# standard-deviation options switched on
kde_distributions(
    df=circ_eda,
    vars_of_interest=["Age_years"],
    # labels (the title is deliberately a single space)
    custom_titles={"Age_years": " "},
    custom_xlabels=None,
    y_axis_label="Density",
    # histogram / density settings
    plot_type="both",
    stat="Density",
    bins=10,
    hist_color="brown",
    fill_alpha=0.40,
    # summary statistics drawn on the plot
    plot_mean=True,
    plot_median=True,
    mean_color="blue",
    std_dev_levels=[1, 2, 3],
    std_color=["purple", "green", "silver"],
    # layout
    figsize=(10, 6),
    text_wrap=50,
    label_fontsize=16,  # Font size for axis labels
    tick_fontsize=14,  # Font size for tick labels
    # output
    bbox_inches="tight",
    image_filename="age_distribution",
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
)

Slide-Specific Implementation

# Ten bins of width 10 covering ages 0 to 100
bins = np.arange(0, 101, 10)

# Descriptive statistics that are shown in the legend
age_description = circ_eda["Age_years"].describe()

# Density-normalised histogram of age
fig, ax = plt.subplots()
circ_eda["Age_years"].hist(
    bins=bins,
    grid=False,
    edgecolor="black",
    ax=ax,
    density=True,
)

# Gaussian KDE of the non-missing ages, evaluated at 500 points on [0, 100]
kde = gaussian_kde(circ_eda["Age_years"].dropna())
age_range = np.linspace(0, 100, 500)
kde_values = kde(age_range)

# Draw the KDE curve over the histogram
ax.plot(age_range, kde_values, color="red", alpha=0.5)

# One legend line per statistic: (caption, key in describe(), number format)
summary_rows = [
    ("Count", "count", ".0f"),
    ("Mean", "mean", ".2f"),
    ("Std", "std", ".2f"),
    ("Min", "min", ".2f"),
    ("25%", "25%", ".2f"),
    ("50%", "50%", ".2f"),
    ("75%", "75%", ".2f"),
    ("Max", "max", ".2f"),
]
label_text = "\n".join(
    f"{caption}: {age_description[key]:{spec}}"
    for caption, key, spec in summary_rows
)

# An empty, markerless series whose only purpose is to carry the legend text
ax.plot([], [], " ", label=label_text)

ax.set_title("Age Distribution")
ax.set_xlabel("Age")
ax.set_ylabel("Density")
ax.legend(title="Summary Statistics")

plt.savefig(os.path.join(image_path_png, "age_hist.png"))
plt.savefig(os.path.join(image_path_svg, "age_hist.svg"))
plt.show()

Comprehensive Age-Related Boxplots for Continuous Values

# Continuous measurements to compare across the age groups
boxplot_metrics_list = [
    "BMI",
    # mean arterial pressure
    "Preop_MAP",
    "Intraop_MAP",
    # pulse oximetry
    "Preop_Pulse_Ox_Percent",
    "Intraop_Mean_Pulse_Ox_Percent",
    # heart rate
    "Preop_Heart_Rate_bpm",
    "Intraop_Mean_Heart_Rate_bpm",
    "Surgical_Time_min",
]

# Grouping column; two separate lists are defined, only `metrics_comp` is
# passed to the plot below
metrics_boxplot_comp = ["age_group"]
metrics_comp = ["age_group"]

box_violin_plot(
    df=circ_eda,
    metrics_list=boxplot_metrics_list,
    metrics_comp=metrics_comp,
    plot_type="boxplot",
    show_plot="subplots",
    show_legend=False,
    xlabel_rot=90,
    save_plots=True,
    image_path_png=image_path_png,
    image_path_svg=image_path_svg,
)

Mean Arterial Pressure Averages by Age Group

map_values = circ_eda[["Preop_MAP", "Intraop_MAP"]]

# Look the display names up by column name instead of taking the last two
# items of `custom_titles`, so the result does not depend on the order in
# which that dictionary was written. Columns without a custom title fall back
# to their own name.
custom_maps = {col: custom_titles.get(col, col) for col in map_values.columns}

for col in map_values.columns:
    custom_map = custom_maps[col]

    # Group by age_group and calculate statistics (kept numeric for plotting)
    map_stats = (
        circ_eda.groupby("age_group", observed=True)[col]
        .agg(["mean", "std", "min", "max"])
        .rename(
            columns={
                "mean": "Mean",
                "std": "Standard Deviation",
                "min": "Min",
                "max": "Max",
            }
        )
    )

    # Plot from the numeric table, before any "-" placeholders are inserted,
    # so a missing statistic can never put a string into the plotted column
    map_stats["Mean"].plot(
        kind="bar",
        width=0.92,
        rot=0,
    )
    plt.title(f"{custom_map} Averages by Age Group")
    plt.xlabel("Age Group")
    plt.ylabel(f"{custom_map}")

    # Build a file-system friendly name from the display name
    file_name = custom_map.replace(" ", "_").replace("/", "_or_")
    plt.savefig(os.path.join(image_path_png, f"{file_name}_by_age_group.png"))
    plt.savefig(os.path.join(image_path_svg, f"{file_name}_by_age_group.svg"))
    plt.show()

    # Display copy of the table: NaN values are shown as "-"
    map_age = map_stats.replace(np.nan, "-")

    # Display the table
    print(f"Table for {custom_map} by Age Group:")
    display(map_age)  # Use print(map_age) if display is not available

Table for Preoperative Mean Arterial Pressure by Age Group:

	Mean	Standard Deviation	Min	Max
age_group
18-29	87.074471	8.829018	48.00	106.67
30-39	88.637273	8.011866	66.67	100.00
40-49	90.333000	7.277923	73.33	100.00
50-59	90.197059	12.772274	46.67	100.00
60-69	91.334000	12.429313	46.67	116.67
70-79	91.834500	9.270853	70.00	106.67
80-89	98.097143	8.998236	80.00	106.67
90-99	93.333333	3.335001	90.00	96.67

Table for Intraoperative Mean Arterial Pressure by Age Group:

	Mean	Standard Deviation	Min	Max
age_group
18-29	98.626706	7.546773	46.67	106.67
30-39	98.332273	4.570842	86.67	103.33
40-49	91.667000	16.196212	46.67	100.00
50-59	92.158235	12.906321	46.67	106.67
60-69	91.111000	15.764203	46.67	110.00
70-79	97.500500	7.789351	80.00	110.00
80-89	92.382857	22.828299	46.67	106.67
90-99	93.333333	8.821691	83.33	100.00

BMI Averages by Age Group

# average BMI within each age group, drawn as a bar chart
mean_bmi_by_age = circ_eda.groupby("age_group", observed=True)["BMI"].mean()
mean_bmi_by_age.plot.bar(width=0.92, rot=0)

plt.title("BMI Averages by Age Group")
plt.xlabel("Age Group")
plt.ylabel("Body Mass Index")

# Adjust figure size for saving only
# plt.gcf().set_size_inches(12, 6)  # Example: Adjust to desired size for saving
bmi_png = os.path.join(image_path_png, "bmi_by_age_group.png")
bmi_svg = os.path.join(image_path_svg, "bmi_by_age_group.svg")
plt.savefig(bmi_png)
plt.savefig(bmi_svg)
plt.show()

Functional Outcomes by Age

# Titles for the plots, one per adverse outcome
neg_title = ["Pain", "Bleeding", "Edema", "Infection"]

# Outcome columns, named after the titles above
neg_outcomes = [f"Functional_Outcomes_{outcome}" for outcome in neg_title]

# Legend labels (absent, present) for each outcome, in the same order
neg_legend_labels = [[f"No {outcome}", outcome] for outcome in neg_title]

# Stacked crosstab plots of the adverse outcomes by age group.
#
# `stacked_crosstab_plot` is run with `plt.show` and `plt.close` temporarily
# replaced by no-ops and `Figure.savefig` wrapped so every save call is
# recorded. After the originals are restored, the second axes of each open
# figure is switched to a percentage y-axis and the recorded save calls are
# replayed so the files on disk pick up the change.
# NOTE(review): `Path` is imported but not used in this cell.
from pathlib import Path
import matplotlib.pyplot as plt
from matplotlib.figure import Figure
from matplotlib.ticker import PercentFormatter

# Keep references to the real implementations so they can be restored.
_real_show = plt.show
_real_close = plt.close
_real_savefig = Figure.savefig

# captured: {fig_number: [(args, kwargs), ...]}
saved_paths = {}

def _tracking_savefig(self, *args, **kwargs):
    """Record the savefig arguments under the figure's number, then save."""
    saved_paths.setdefault(self.number, []).append((args, kwargs))
    return _real_savefig(self, *args, **kwargs)

plt.show = lambda *a, **k: None
plt.close = lambda *a, **k: None  # keep figs alive so we can edit them after
Figure.savefig = _tracking_savefig

try:
    stacked_crosstabs = stacked_crosstab_plot(
        df=circ_eda,
        col="age_group",
        func_col=neg_outcomes,
        legend_labels_list=neg_legend_labels,
        title=neg_title,
        kind="bar",
        width=0.8,
        rot=0,
        custom_order=None,
        text_wrap=80,
        color=["#1f77b4", "#c8544c"],
        output="both",
        image_path_png=image_path_png,
        image_path_svg=image_path_svg,
        return_dict=True,
        save_formats=["png", "svg"],
        x=14,
        y=10,
        p=12,
        file_prefix="Stacked_Bar",
        logscale=False,
        plot_type="both",
        show_legend=True,
        label_fontsize=12,
        tick_fontsize=12,
    )
finally:
    # Always undo the monkey-patches, even if the plotting call raised.
    plt.show = _real_show
    plt.close = _real_close
    Figure.savefig = _real_savefig

# NOTE(review): this walks every figure that is currently open, not only the
# ones created by the call above; only figures with recorded saves are re-saved.
for num in plt.get_fignums():
    fig = plt.figure(num)
    axes = fig.get_axes()
    if len(axes) >= 2:
        # presumably axes[1] is the normalized panel drawn for
        # plot_type="both" — TODO confirm against eda_toolkit
        ax_norm = axes[1]
        ax_norm.yaxis.set_major_formatter(PercentFormatter(xmax=1.0, decimals=0))
        ax_norm.set_ylabel("Percentage")
    fig.canvas.draw_idle()

    # re-save to the exact same paths eda_toolkit used
    # (Figure.savefig is already restored, so the replay is not recorded again)
    for args, kwargs in saved_paths.get(num, []):
        fig.savefig(*args, **kwargs)

plt.show()

Plot saved as ../images/png_images/Stacked_Bar_Functional_Outcomes_Pain.png
Plot saved as ../images/svg_images/Stacked_Bar_Functional_Outcomes_Pain.svg
Plot saved as ../images/png_images/Stacked_Bar_Functional_Outcomes_Bleeding.png
Plot saved as ../images/svg_images/Stacked_Bar_Functional_Outcomes_Bleeding.svg
Plot saved as ../images/png_images/Stacked_Bar_Functional_Outcomes_Edema.png
Plot saved as ../images/svg_images/Stacked_Bar_Functional_Outcomes_Edema.svg
Plot saved as ../images/png_images/Stacked_Bar_Functional_Outcomes_Infection.png
Plot saved as ../images/svg_images/Stacked_Bar_Functional_Outcomes_Infection.svg

Crosstab for Functional_Outcomes_Pain

Functional_Outcomes_Pain  No Pain  Pain  Total  No Pain_%  Pain_%
age_group                                                        
18-29                          76     9     85      89.41   10.59
30-39                          21     1     22      95.45    4.55
40-49                           9     1     10      90.00   10.00
50-59                          14     3     17      82.35   17.65
60-69                          30     0     30     100.00    0.00
70-79                          18     2     20      90.00   10.00
80-89                           6     1      7      85.71   14.29
90-99                           1     2      3      33.33   66.67
Total                         175    19    194      90.21    9.79


Crosstab for Functional_Outcomes_Bleeding

Functional_Outcomes_Bleeding  No Bleeding  Bleeding  Total  No Bleeding_%  \
age_group                                                                   
18-29                                  54        31     85          63.53   
30-39                                  19         3     22          86.36   
40-49                                   8         2     10          80.00   
50-59                                  14         3     17          82.35   
60-69                                  29         1     30          96.67   
70-79                                  17         3     20          85.00   
80-89                                   5         2      7          71.43   
90-99                                   2         1      3          66.67   
Total                                 148        46    194          76.29   

Functional_Outcomes_Bleeding  Bleeding_%  
age_group                                 
18-29                              36.47  
30-39                              13.64  
40-49                              20.00  
50-59                              17.65  
60-69                               3.33  
70-79                              15.00  
80-89                              28.57  
90-99                              33.33  
Total                              23.71  


Crosstab for Functional_Outcomes_Edema

Functional_Outcomes_Edema  No Edema  Edema  Total  No Edema_%  Edema_%
age_group                                                             
18-29                            79      6     85       92.94     7.06
30-39                            22      0     22      100.00     0.00
40-49                             9      1     10       90.00    10.00
50-59                            17      0     17      100.00     0.00
60-69                            30      0     30      100.00     0.00
70-79                            20      0     20      100.00     0.00
80-89                             7      0      7      100.00     0.00
90-99                             3      0      3      100.00     0.00
Total                           187      7    194       96.39     3.61


Crosstab for Functional_Outcomes_Infection

Functional_Outcomes_Infection  No Infection  Infection  Total  No Infection_%  \
age_group                                                                       
18-29                                    77          8     85           90.59   
30-39                                    22          0     22          100.00   
40-49                                    10          0     10          100.00   
50-59                                    17          0     17          100.00   
60-69                                    30          0     30          100.00   
70-79                                    19          1     20           95.00   
80-89                                     6          1      7           85.71   
90-99                                     3          0      3          100.00   
Total                                   184         10    194           94.85   

Functional_Outcomes_Infection  Infection_%  
age_group                                   
18-29                                 9.41  
30-39                                 0.00  
40-49                                 0.00  
50-59                                 0.00  
60-69                                 0.00  
70-79                                 5.00  
80-89                                14.29  
90-99                                 0.00  
Total                                 5.15

stacked_crosstabs

{'Functional_Outcomes_Pain': Functional_Outcomes_Pain  No Pain  Pain  Total  No Pain_%  Pain_%
 age_group                                                        
 18-29                          76     9     85      89.41   10.59
 30-39                          21     1     22      95.45    4.55
 40-49                           9     1     10      90.00   10.00
 50-59                          14     3     17      82.35   17.65
 60-69                          30     0     30     100.00    0.00
 70-79                          18     2     20      90.00   10.00
 80-89                           6     1      7      85.71   14.29
 90-99                           1     2      3      33.33   66.67
 Total                         175    19    194      90.21    9.79,
 'Functional_Outcomes_Bleeding': Functional_Outcomes_Bleeding  No Bleeding  Bleeding  Total  No Bleeding_%  \
 age_group                                                                   
 18-29                                  54        31     85          63.53   
 30-39                                  19         3     22          86.36   
 40-49                                   8         2     10          80.00   
 50-59                                  14         3     17          82.35   
 60-69                                  29         1     30          96.67   
 70-79                                  17         3     20          85.00   
 80-89                                   5         2      7          71.43   
 90-99                                   2         1      3          66.67   
 Total                                 148        46    194          76.29   
 
 Functional_Outcomes_Bleeding  Bleeding_%  
 age_group                                 
 18-29                              36.47  
 30-39                              13.64  
 40-49                              20.00  
 50-59                              17.65  
 60-69                               3.33  
 70-79                              15.00  
 80-89                              28.57  
 90-99                              33.33  
 Total                              23.71  ,
 'Functional_Outcomes_Edema': Functional_Outcomes_Edema  No Edema  Edema  Total  No Edema_%  Edema_%
 age_group                                                             
 18-29                            79      6     85       92.94     7.06
 30-39                            22      0     22      100.00     0.00
 40-49                             9      1     10       90.00    10.00
 50-59                            17      0     17      100.00     0.00
 60-69                            30      0     30      100.00     0.00
 70-79                            20      0     20      100.00     0.00
 80-89                             7      0      7      100.00     0.00
 90-99                             3      0      3      100.00     0.00
 Total                           187      7    194       96.39     3.61,
 'Functional_Outcomes_Infection': Functional_Outcomes_Infection  No Infection  Infection  Total  No Infection_%  \
 age_group                                                                       
 18-29                                    77          8     85           90.59   
 30-39                                    22          0     22          100.00   
 40-49                                    10          0     10          100.00   
 50-59                                    17          0     17          100.00   
 60-69                                    30          0     30          100.00   
 70-79                                    19          1     20           95.00   
 80-89                                     6          1      7           85.71   
 90-99                                     3          0      3          100.00   
 Total                                   184         10    194           94.85   
 
 Functional_Outcomes_Infection  Infection_%  
 age_group                                   
 18-29                                 9.41  
 30-39                                 0.00  
 40-49                                 0.00  
 50-59                                 0.00  
 60-69                                 0.00  
 70-79                                 5.00  
 80-89                                14.29  
 90-99                                 0.00  
 Total                                 5.15  }

## Persist the whole crosstab dictionary (used later by the Dash/Plotly app)
_crosstabs_pickle_path = os.path.join(data_raw, "stacked_crosstabs.pkl")
pd.to_pickle(stacked_crosstabs, _crosstabs_pickle_path)

## Write one CSV per crosstab into data_path, named after its dictionary key
for key, value in stacked_crosstabs.items():
    value.to_csv(os.path.join(data_path, f"{key}.csv"), index=True)

## List the lower-cased names that are about to become variables
for key in stacked_crosstabs:
    print(key.lower())
print()

## Expose every crosstab as a module-level variable under its lower-cased key
for key, value in stacked_crosstabs.items():
    key = key.lower()
    globals()[key] = value

functional_outcomes_pain
functional_outcomes_bleeding
functional_outcomes_edema
functional_outcomes_infection

functional_outcomes_pain

Functional_Outcomes_Pain	No Pain	Pain	Total	No Pain_%	Pain_%
age_group
18-29	76	9	85	89.41	10.59
30-39	21	1	22	95.45	4.55
40-49	9	1	10	90.00	10.00
50-59	14	3	17	82.35	17.65
60-69	30	0	30	100.00	0.00
70-79	18	2	20	90.00	10.00
80-89	6	1	7	85.71	14.29
90-99	1	2	3	33.33	66.67
Total	175	19	194	90.21	9.79

functional_outcomes_bleeding

Functional_Outcomes_Bleeding	No Bleeding	Bleeding	Total	No Bleeding_%	Bleeding_%
age_group
18-29	54	31	85	63.53	36.47
30-39	19	3	22	86.36	13.64
40-49	8	2	10	80.00	20.00
50-59	14	3	17	82.35	17.65
60-69	29	1	30	96.67	3.33
70-79	17	3	20	85.00	15.00
80-89	5	2	7	71.43	28.57
90-99	2	1	3	66.67	33.33
Total	148	46	194	76.29	23.71

functional_outcomes_infection

Functional_Outcomes_Infection	No Infection	Infection	Total	No Infection_%	Infection_%
age_group
18-29	77	8	85	90.59	9.41
30-39	22	0	22	100.00	0.00
40-49	10	0	10	100.00	0.00
50-59	17	0	17	100.00	0.00
60-69	30	0	30	100.00	0.00
70-79	19	1	20	95.00	5.00
80-89	6	1	7	85.71	14.29
90-99	3	0	3	100.00	0.00
Total	184	10	194	94.85	5.15

# Count how many rows hold each value of the cosmetic-satisfaction column.
circ_eda["Functional_Outcomes_Cosmetic_Satisfaction"].value_counts()

Functional_Outcomes_Cosmetic_Satisfaction
1    182
0     12
Name: count, dtype: int64

# One row per outcome: (data column, [legend labels], plot title).
# NOTE(review): the other label pairs put the "negative" wording first, while
# Comorbidity_Flag lists "Comorbidities" first — confirm this matches how the
# flag is coded in the data.
_pos_outcome_spec = [
    (
        "Functional_Outcomes_Fast_Recovery",
        ["Not Fast Recovery", "Fast Recovery"],
        "Recovery",
    ),
    (
        "Functional_Outcomes_Cosmetic_Satisfaction",
        ["Not Satisfied", "Satisfied"],
        "Cosmetic Satisfaction",
    ),
    (
        "Comorbidity_Flag",
        ["Comorbidities", "No Comorbidities"],
        "Comorbidities",
    ),
]

# Parallel lists consumed by the stacked crosstab plot.
pos_outcomes = [column for column, _, _ in _pos_outcome_spec]
pos_legend_labels = [labels for _, labels, _ in _pos_outcome_spec]
pos_title = [title for _, _, title in _pos_outcome_spec]

# Stacked crosstab plots of the positive outcomes by age group.
#
# `stacked_crosstab_plot` is run with `plt.show` and `plt.close` temporarily
# replaced by no-ops so the figures it creates stay open. The second axes of
# each new figure is then switched to a percentage y-axis and the figure is
# re-saved under the "Stacked_Bar_<outcome>" file names.
from pathlib import Path
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter

# Figures that were already open before this cell ran. They must not be paired
# with `pos_outcomes` below, otherwise an unrelated figure would be saved
# under an outcome's file name.
_fignums_before = set(plt.get_fignums())

_real_show = plt.show
_real_close = plt.close
plt.show = lambda *a, **k: None
plt.close = lambda *a, **k: None  # keep figs alive so we can edit them after

try:
    stacked_crosstabs = stacked_crosstab_plot(
        df=circ_eda,
        col="age_group",
        func_col=pos_outcomes,
        legend_labels_list=pos_legend_labels,
        title=pos_title,
        kind="bar",
        width=0.8,
        rot=0,
        custom_order=None,
        text_wrap=80,
        color=["#c8544c", "#1f77b4"],
        output="both",
        image_path_png=image_path_png,
        image_path_svg=image_path_svg,
        return_dict=True,
        x=14,
        y=10,
        p=12,
        save_formats=["png", "svg"],
        file_prefix="Stacked_Bar",
        logscale=False,
        plot_type="both",
        show_legend=True,
        label_fontsize=12,
        tick_fontsize=12,
    )
finally:
    # Always undo the monkey-patches, even if the plotting call raised.
    plt.show = _real_show
    plt.close = _real_close

# Only the figures created by the call above, in creation order.
_new_fignums = [n for n in plt.get_fignums() if n not in _fignums_before]

for num, outcome in zip(_new_fignums, pos_outcomes):
    fig = plt.figure(num)
    axes = fig.get_axes()
    if len(axes) >= 2:
        # presumably axes[1] is the normalized panel drawn for
        # plot_type="both" — TODO confirm against eda_toolkit
        ax_norm = axes[1]
        ax_norm.yaxis.set_major_formatter(PercentFormatter(xmax=1.0, decimals=0))
        ax_norm.set_ylabel("Percentage")
    fig.canvas.draw_idle()

    # Overwrite the files written by eda_toolkit with the re-formatted figure.
    fig.savefig(Path(image_path_png) / f"Stacked_Bar_{outcome}.png", dpi=300, bbox_inches="tight")
    fig.savefig(Path(image_path_svg) / f"Stacked_Bar_{outcome}.svg", bbox_inches="tight")

plt.show()

Plot saved as ../images/png_images/Stacked_Bar_Functional_Outcomes_Fast_Recovery.png
Plot saved as ../images/svg_images/Stacked_Bar_Functional_Outcomes_Fast_Recovery.svg
Plot saved as ../images/png_images/Stacked_Bar_Functional_Outcomes_Cosmetic_Satisfaction.png
Plot saved as ../images/svg_images/Stacked_Bar_Functional_Outcomes_Cosmetic_Satisfaction.svg
Plot saved as ../images/png_images/Stacked_Bar_Comorbidity_Flag.png
Plot saved as ../images/svg_images/Stacked_Bar_Comorbidity_Flag.svg

Crosstab for Functional_Outcomes_Fast_Recovery

Functional_Outcomes_Fast_Recovery  Not Fast Recovery  Fast Recovery  Total  \
age_group                                                                    
18-29                                             11             74     85   
30-39                                              0             22     22   
40-49                                              0             10     10   
50-59                                              0             17     17   
60-69                                              0             30     30   
70-79                                              0             20     20   
80-89                                              1              6      7   
90-99                                              0              3      3   
Total                                             12            182    194   

Functional_Outcomes_Fast_Recovery  Not Fast Recovery_%  Fast Recovery_%  
age_group                                                                
18-29                                            12.94            87.06  
30-39                                             0.00           100.00  
40-49                                             0.00           100.00  
50-59                                             0.00           100.00  
60-69                                             0.00           100.00  
70-79                                             0.00           100.00  
80-89                                            14.29            85.71  
90-99                                             0.00           100.00  
Total                                             6.19            93.81  


Crosstab for Functional_Outcomes_Cosmetic_Satisfaction

Functional_Outcomes_Cosmetic_Satisfaction  Not Satisfied  Satisfied  Total  \
age_group                                                                    
18-29                                                 10         75     85   
30-39                                                  1         21     22   
40-49                                                  0         10     10   
50-59                                                  0         17     17   
60-69                                                  0         30     30   
70-79                                                  0         20     20   
80-89                                                  1          6      7   
90-99                                                  0          3      3   
Total                                                 12        182    194   

Functional_Outcomes_Cosmetic_Satisfaction  Not Satisfied_%  Satisfied_%  
age_group                                                                
18-29                                                11.76        88.24  
30-39                                                 4.55        95.45  
40-49                                                 0.00       100.00  
50-59                                                 0.00       100.00  
60-69                                                 0.00       100.00  
70-79                                                 0.00       100.00  
80-89                                                14.29        85.71  
90-99                                                 0.00       100.00  
Total                                                 6.19        93.81  


Crosstab for Comorbidity_Flag

Comorbidity_Flag  Comorbidities  No Comorbidities  Total  Comorbidities_%  \
age_group                                                                   
18-29                        72                13     85            84.71   
30-39                        16                 6     22            72.73   
40-49                        10                 0     10           100.00   
50-59                         9                 8     17            52.94   
60-69                        12                18     30            40.00   
70-79                        10                10     20            50.00   
80-89                         1                 6      7            14.29   
90-99                         0                 3      3             0.00   
Total                       130                64    194            67.01   

Comorbidity_Flag  No Comorbidities_%  
age_group                             
18-29                          15.29  
30-39                          27.27  
40-49                           0.00  
50-59                          47.06  
60-69                          60.00  
70-79                          50.00  
80-89                          85.71  
90-99                         100.00  
Total                          32.99

# Column, legend labels and title for the surgical-technique crosstab plot.
surgical_outcomes = ["Surgical_Technique"]
surgical_legend_labels = [["Traditional", "Laser"]]
surgical_title = ["Surgical Technique"]

# `stacked_crosstab_plot` is run with `plt.show` and `plt.close` temporarily
# replaced by no-ops so the figure it creates stays open. Its second axes is
# then switched to a percentage y-axis and the figure is re-saved under the
# "Stacked_Bar_<outcome>" file names.
from pathlib import Path
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter

# Figures that were already open before this cell ran. They must not be paired
# with `surgical_outcomes` below, otherwise an unrelated figure would be saved
# under the surgical-technique file name.
_fignums_before = set(plt.get_fignums())

_real_show = plt.show
_real_close = plt.close
plt.show = lambda *a, **k: None
plt.close = lambda *a, **k: None  # keep figs alive so we can edit them after

try:
    stacked_cross_surg_tech = stacked_crosstab_plot(
        df=circ_eda,
        col="age_group",
        func_col=surgical_outcomes,
        legend_labels_list=surgical_legend_labels,
        title=surgical_title,
        kind="bar",
        width=0.8,
        rot=0,
        custom_order=None,
        text_wrap=80,
        color=["#1f77b4", "#203764"],
        output="both",
        image_path_png=image_path_png,
        image_path_svg=image_path_svg,
        return_dict=True,
        x=14,
        y=10,
        p=12,
        save_formats=["png", "svg"],
        file_prefix="Stacked_Bar",
        logscale=False,
        plot_type="both",
        show_legend=True,
        label_fontsize=12,
        tick_fontsize=12,
    )
finally:
    # Always undo the monkey-patches, even if the plotting call raised.
    plt.show = _real_show
    plt.close = _real_close

# Only the figures created by the call above, in creation order.
_new_fignums = [n for n in plt.get_fignums() if n not in _fignums_before]

for num, outcome in zip(_new_fignums, surgical_outcomes):
    fig = plt.figure(num)
    axes = fig.get_axes()
    if len(axes) >= 2:
        # presumably axes[1] is the normalized panel drawn for
        # plot_type="both" — TODO confirm against eda_toolkit
        ax_norm = axes[1]
        ax_norm.yaxis.set_major_formatter(PercentFormatter(xmax=1.0, decimals=0))
        ax_norm.set_ylabel("Percentage")
    fig.canvas.draw_idle()

    # Overwrite the files written by eda_toolkit with the re-formatted figure.
    fig.savefig(Path(image_path_png) / f"Stacked_Bar_{outcome}.png", dpi=300, bbox_inches="tight")
    fig.savefig(Path(image_path_svg) / f"Stacked_Bar_{outcome}.svg", bbox_inches="tight")

plt.show()

Plot saved as ../images/png_images/Stacked_Bar_Surgical_Technique.png
Plot saved as ../images/svg_images/Stacked_Bar_Surgical_Technique.svg

Crosstab for Surgical_Technique

Surgical_Technique  Traditional  Laser  Total  Traditional_%  Laser_%
age_group                                                            
18-29                        79      6     85          92.94     7.06
30-39                        16      6     22          72.73    27.27
40-49                         4      6     10          40.00    60.00
50-59                         4     13     17          23.53    76.47
60-69                         9     21     30          30.00    70.00
70-79                        10     10     20          50.00    50.00
80-89                         7      0      7         100.00     0.00
90-99                         3      0      3         100.00     0.00
Total                       132     62    194          68.04    31.96

## Persist the surgical-technique crosstab (used later by the Dash/Plotly app)
_surg_tech_pickle_path = os.path.join(data_raw, "stacked_cross_surg_tech.pkl")
pd.to_pickle(stacked_cross_surg_tech, _surg_tech_pickle_path)

Surgical Techniques

Number of Procedures in each Surgical Category

# Display names for the technique codes, and the bar colour of each technique.
mapping = {0: "Traditional", 1: "Laser"}
surg_tech_color = {"Traditional": "#1f77b4", "Laser": "#203764"}

# Procedures per technique, always ordered Traditional -> Laser.
_technique_names = circ_eda["Surgical_Technique"].map(mapping)
surg_tech_values = _technique_names.value_counts().reindex(["Traditional", "Laser"])
colors = [surg_tech_color[name] for name in surg_tech_values.index]

ax = surg_tech_values.plot.bar(rot=0, width=0.99, color=colors)

# Print each count inside its bar, 40 units below the top.
for i, v in enumerate(surg_tech_values):
    ax.text(x=i, y=v - 40, s=str(v), ha="center", color="yellow")

ax.set(
    title="Number of Procedures in Each Surgical Category",
    xlabel="Surgical Technique",
    ylabel="Count",
)

_count_svg = os.path.join(image_path_svg, "surgical_technique_by_count.svg")
_count_png = os.path.join(image_path_png, "surgical_technique_by_count.png")
plt.savefig(_count_svg)
plt.savefig(_count_png)
plt.show()

Surgical Technique by Mean Time (in Minutes)

# Mean surgical time per technique, labelled by name and ordered
# Traditional -> Laser.
_mean_time_by_code = circ_eda.groupby("Surgical_Technique")["Surgical_Time_min"].agg("mean")
surgical_techniques = _mean_time_by_code.rename(index=mapping).reindex(
    ["Traditional", "Laser"]
)
colors = [surg_tech_color[name] for name in surgical_techniques.index]

ax = surgical_techniques.plot.bar(rot=0, width=0.99, color=colors)

# Print each mean (two decimals) inside its bar, 15 units below the top.
for i, v in enumerate(surgical_techniques):
    ax.text(x=i, y=v - 15, s=f"{v:.2f}", ha="center", color="yellow")

ax.set(
    title="Surgical Technique by Mean Time (in Minutes)",
    xlabel="Surgical Technique",
    ylabel="Surgical Time in Minutes",
)

_mean_time_svg = os.path.join(image_path_svg, "surgical_technique_by_mean_time.svg")
_mean_time_png = os.path.join(image_path_png, "surgical_technique_by_mean_time.png")
plt.savefig(_mean_time_svg)
plt.savefig(_mean_time_png)
plt.show()

Antibiotics by Surgical Technique

# Counts of each pre-operative antibiotic per surgical technique, with the
# technique codes replaced by their display names.
_antibiotic_counts = pd.crosstab(
    index=circ_eda["Preop_drugs_antibiotic"],
    columns=circ_eda["Surgical_Technique"],
)
antibiotic_by_surgical_technique = _antibiotic_counts.rename(columns=mapping)

antibiotic_by_surgical_technique

Surgical_Technique	Traditional	Laser
Preop_drugs_antibiotic
Amoxicillina	7	0
Cefazolina	113	62
Ciprofloxacina	8	0
Gentamicina	4	0

# Grouped bar chart of antibiotic counts, one bar colour per technique.
_antibiotic_ax = antibiotic_by_surgical_technique.plot.bar(
    rot=0,
    color=surg_tech_color,
)
_antibiotic_ax.set(
    title="Type of Antibiotic by Surgical Technique",
    xlabel="Antibiotic",
    ylabel="Count",
)

_antibiotic_png = os.path.join(image_path_png, "antibiotic_by_surgical_technique.png")
_antibiotic_svg = os.path.join(image_path_svg, "antibiotic_by_surgical_technique.svg")
plt.savefig(_antibiotic_png)
plt.savefig(_antibiotic_svg)
plt.show()

Anesthesia by Surgical Technique

# Counts of each anesthesia type per surgical technique, with the technique
# codes replaced by their display names.
_anesthesia_counts = pd.crosstab(
    index=circ_eda["Anesthesia_Type"],
    columns=circ_eda["Surgical_Technique"],
)
anesthesia_by_surgical_technique = _anesthesia_counts.rename(columns=mapping)

anesthesia_by_surgical_technique

Surgical_Technique	Traditional	Laser
Anesthesia_Type
carbocaine	4	0
lidocaine	126	62
xilocaine	2	0

# Grouped bar chart of anesthesia counts, one bar colour per technique.
_anesthesia_ax = anesthesia_by_surgical_technique.plot.bar(
    rot=0,
    color=surg_tech_color,
)
_anesthesia_ax.set(
    title="Type of Anesthesia by Surgical Technique",
    xlabel="Anesthesia Type",
    ylabel="Count",
)

_anesthesia_png = os.path.join(image_path_png, "anesthesia_by_surgical_technique.png")
_anesthesia_svg = os.path.join(image_path_svg, "anesthesia_by_surgical_technique.svg")
plt.savefig(_anesthesia_png)
plt.savefig(_anesthesia_svg)
plt.show()

Box Plot of Surgical Time by Surgical Technique

# Distribution of surgical time (minutes) for each surgical technique.
# Work on a copy in which the technique codes are replaced by display names.
plot_df = circ_eda.copy()
plot_df["Surgical_Technique"] = circ_eda["Surgical_Technique"].map(mapping)

_time_box_ax = sns.boxplot(
    data=plot_df,
    x="Surgical_Technique",
    y="Surgical_Time_min",
    hue="Surgical_Technique",
    order=["Traditional", "Laser"],
    palette=surg_tech_color,
    medianprops={"color": "yellow", "linewidth": 1},
)
_time_box_ax.set(
    title="Box plot of Surgical Time by Surgical Technique",
    ylabel="Surgical Time (min)",
    xlabel="Surgical Technique",
)
plt.xticks(rotation=0)

_time_box_png = os.path.join(image_path_png, "surgical_time_by_technique_boxplot.png")
_time_box_svg = os.path.join(image_path_svg, "surgical_time_by_technique_boxplot.svg")
plt.savefig(_time_box_png)
plt.savefig(_time_box_svg)
plt.show()

Box Plot of Intraoperative Blood Loss by Surgical Technique

# Distribution of intraoperative blood loss (ml) for each surgical technique.
# Work on a copy in which the technique codes are replaced by display names.
plot_df = circ_eda.copy()
plot_df["Surgical_Technique"] = circ_eda["Surgical_Technique"].map(mapping)

ax = sns.boxplot(
    data=plot_df,
    x="Surgical_Technique",
    y="Intraoperative_Blood_Loss_ml",
    hue="Surgical_Technique",
    order=["Traditional", "Laser"],
    palette=surg_tech_color,
    medianprops={"color": "yellow", "linewidth": 1},
    legend="full",
)
ax.set(
    title="Box plot of Intraoperative Blood Loss by Surgical Technique",
    ylabel="Intraoperative Blood Loss (ml)",
    xlabel="Surgical Technique",
)
plt.xticks(rotation=0)

_loss_box_png = os.path.join(image_path_png, "intraop_blood_loss_by_technique_boxplot.png")
_loss_box_svg = os.path.join(image_path_svg, "intraop_blood_loss_by_technique_boxplot.svg")
plt.savefig(_loss_box_png)
plt.savefig(_loss_box_svg)
plt.show()

Box Plot of Surgical Time by Anesthesia Type

# Distribution of surgical time (minutes) for each anesthesia type.
_anes_time_ax = sns.boxplot(
    data=circ_eda,
    x="Anesthesia_Type",
    y="Surgical_Time_min",
    hue="Anesthesia_Type",
)
_anes_time_ax.set(
    title="Box plot of Surgical Time by Anesthesia Type",
    ylabel="Surgical Time (min)",
    xlabel="Anesthesia Type",
)
plt.xticks(rotation=0)

_anes_time_png = os.path.join(image_path_png, "anesthesia_surgical_time_boxplot.png")
_anes_time_svg = os.path.join(image_path_svg, "anesthesia_surgical_time_boxplot.svg")
plt.savefig(_anes_time_png)
plt.savefig(_anes_time_svg)
plt.show()

Box Plot of Intraoperative Blood Loss by Anesthesia Type

# Distribution of intraoperative blood loss (ml) for each anesthesia type.
_anes_loss_ax = sns.boxplot(
    data=circ_eda,
    x="Anesthesia_Type",
    y="Intraoperative_Blood_Loss_ml",
    hue="Anesthesia_Type",
)
_anes_loss_ax.set(
    title="Box plot of Intraoperative Blood Loss by Anesthesia Type",
    ylabel="Intraoperative Blood Loss (ml)",
    xlabel="Anesthesia Type",
)
plt.xticks(rotation=0)

_anes_loss_png = os.path.join(image_path_png, "anesthesia_blood_loss_boxplot.png")
_anes_loss_svg = os.path.join(image_path_svg, "anesthesia_blood_loss_boxplot.svg")
plt.savefig(_anes_loss_png)
plt.savefig(_anes_loss_svg)
plt.show()

Box Plot of Surgical Time (min) by Preoperative Antibiotic

# Distribution of surgical time (minutes) for each pre-operative antibiotic.
_abx_time_ax = sns.boxplot(
    data=circ_eda,
    x="Preop_drugs_antibiotic",
    y="Surgical_Time_min",
    hue="Preop_drugs_antibiotic",
)
_abx_time_ax.set(
    title="Box plot of Surgical Time (min) by Preoperative Antibiotic",
    ylabel="Surgical Time (min)",
    xlabel="Antibiotic Type",
)
plt.xticks(rotation=0)

_abx_time_png = os.path.join(image_path_png, "antibiotic_surgical_time_boxplot.png")
_abx_time_svg = os.path.join(image_path_svg, "antibiotic_surgical_time_boxplot.svg")
plt.savefig(_abx_time_png)
plt.savefig(_abx_time_svg)
plt.show()

Box Plot of Intraoperative Blood Loss by Preoperative Antibiotic

# Distribution of intraoperative blood loss (ml) for each pre-operative
# antibiotic.
_abx_loss_ax = sns.boxplot(
    data=circ_eda,
    x="Preop_drugs_antibiotic",
    y="Intraoperative_Blood_Loss_ml",
    hue="Preop_drugs_antibiotic",
)
_abx_loss_ax.set(
    title="Box plot of Intraoperative Blood Loss by Preoperative Antibiotic",
    ylabel="Intraoperative Blood Loss (ml)",
    xlabel="Antibiotic Type",
)
plt.xticks(rotation=0)

_abx_loss_png = os.path.join(image_path_png, "antibiotic_blood_loss_boxplot.png")
_abx_loss_svg = os.path.join(image_path_svg, "antibiotic_blood_loss_boxplot.svg")
plt.savefig(_abx_loss_png)
plt.savefig(_abx_loss_svg)
plt.show()

Prevalence of Functional Outcomes by Surgical Technique

# Every column whose name contains "Functional", in data-frame column order.
functional_list = [col for col in circ_eda.columns if "Functional" in col]

# Plot titles, paired with `functional_list` by position. `zip` stops at the
# shorter list, so any titles beyond the number of matched columns are unused.
functional_title_list = [
    "Pain",
    "Bleeding",
    "Edema",
    "Infection",
    "Recovery",
    "Satisfaction",
    "Comorbidities",
    "Surgical Technique",
]

# Display label for each code (0 / 1) of every functional-outcome column.
functional_labels = {
    "Functional_Outcomes_Pain": {0: "No Pain", 1: "Pain"},
    "Functional_Outcomes_Bleeding": {0: "No Bleeding", 1: "Bleeding"},
    "Functional_Outcomes_Edema": {0: "No Edema", 1: "Edema"},
    "Functional_Outcomes_Infection": {0: "No Infection", 1: "Infection"},
    "Functional_Outcomes_Fast_Recovery": {0: "Not Fast Recovery", 1: "Fast Recovery"},
    "Functional_Outcomes_Cosmetic_Satisfaction": {0: "No Satisfaction", 1: "Satisfaction"},
}

for item, title in zip(functional_list, functional_title_list):
    _item_labels = functional_labels[item]

    # Cross-tabulate on the raw codes and only then attach the display labels.
    # Tabulating the already-mapped strings sorts the rows alphabetically
    # (e.g. "Bleeding" before "No Bleeding"), which no longer matches the
    # 0 -> 1 label order and put the wrong tick label under each bar group.
    # Reindexing also keeps a code with no observations as a zero row.
    _item_counts = (
        pd.crosstab(circ_eda[item], circ_eda["Surgical_Technique"])
        .reindex(index=list(_item_labels), fill_value=0)
        .rename(index=_item_labels, columns=mapping)
    )

    # Tick labels come straight from the renamed index, so they always match
    # the bars they sit under.
    ax = _item_counts.plot(
        kind="bar",
        rot=0,
        color=surg_tech_color,
    )

    ax.set_ylabel("Count")
    ax.set_title(f"Prevalence of {title} by Surgical Technique")
    ax.set_xlabel(title)

    # File names keep their existing spelling so previously written images
    # are overwritten rather than duplicated.
    plt.savefig(os.path.join(image_path_png, f"Prevalance_of_{title}_by_surgical_technique.png"))
    plt.savefig(os.path.join(image_path_svg, f"Prevalance_of_{title}_by_surgical_technique.svg"))

plt.show()

Socioeconomic Impacts

Religious Affiliation by Geographical Origin

# List the distinct values of the cultural/religious affiliation column.
circ_eda["Cultural_Religious_Affiliation"].unique().tolist()

['Jewish', 'Catholic', 'Atheist', 'Buddhist', 'Orthodox', 'Muslims']

# Affiliation (rows) by geographical origin (columns), with a "Total" margin
# on both axes.
_affiliation = circ_eda["Cultural_Religious_Affiliation"]
_origin = circ_eda["Geographical_Origin"]
ct = pd.crosstab(
    index=_affiliation,
    columns=_origin,
    margins=True,
    margins_name="Total",
)

# Display the table with the "Total" column highlighted.
highlight_columns(ct, "Total", color="brown")

Geographical_Origin	Algeria	China	Egypt	France	Germany	Italy	Morocco	Pakistan	Philippines	Spain	Thailand	Tunisia	USA	Total
Cultural_Religious_Affiliation
Atheist	0	0	0	0	1	34	0	0	0	1	0	0	1	37
Buddhist	0	1	0	0	0	1	0	0	1	0	1	0	0	4
Catholic	0	4	0	1	0	131	0	0	0	0	0	0	4	140
Jewish	0	0	0	0	0	3	0	0	0	0	0	0	1	4
Muslims	1	0	2	0	0	0	2	1	0	0	0	1	0	7
Orthodox	0	1	0	0	0	1	0	0	0	0	0	0	0	2
Total	1	6	2	1	1	170	2	1	1	1	1	1	6	194

# Heatmap of the crosstab between geographical origin and religion.
# Rows (y-axis) are countries of origin; columns (x-axis) are affiliations.
plt.figure(figsize=(9, 6))
sns.heatmap(
    pd.crosstab(
        circ_eda["Geographical_Origin"], circ_eda["Cultural_Religious_Affiliation"]
    ),
    annot=True,
    cmap="rocket_r",
    fmt="d",
)
plt.title("Cultural Religious Affiliation by Geographical Origin")
# The crosstab columns are the affiliations, so that is what the x-axis shows;
# label both axes explicitly to match the data actually plotted on them.
plt.xlabel("Cultural Religious Affiliation")
plt.ylabel("Geographical Origin")
plt.savefig(
    os.path.join(image_path_png, "religion_by_geog_origin.png"),
    bbox_inches="tight",
)
plt.savefig(
    os.path.join(image_path_svg, "religion_by_geog_origin.svg"),
    bbox_inches="tight",
)
plt.show()

Total Cost by Coverage Type

# Number of procedures per coverage category, smallest count first
total_cost_ins_values = circ_eda["Cost_Type"].value_counts(ascending=True)

# One color per coverage-category bar
ins_bar_col = ["#1f77b4", "#c8544c", "#555555"]

ax = total_cost_ins_values.plot(
    kind="barh", rot=0, width=0.99, legend=False, color=ins_bar_col
)

# Write each count inside its bar, right-aligned 5 units before the bar's end
for bar_pos, n_procedures in enumerate(total_cost_ins_values.values):
    ax.text(
        n_procedures - 5,
        bar_pos,
        str(n_procedures),
        ha="right",
        va="center",
        color="yellow",
    )

ax.set(
    title="Total Number of Procedures by Coverage Category",
    xlabel="Number of Procedures",
    ylabel="Cost Type",
)

# Save the vector version first, then the raster version
for out_dir, file_name in (
    (image_path_svg, "total_number_by_coverage.svg"),
    (image_path_png, "total_number_by_coverage.png"),
):
    plt.savefig(os.path.join(out_dir, file_name))
plt.show()

# Summed procedure cost per coverage category, as a one-column frame
total_cost_by_ins = (
    circ_eda.groupby("Cost_Type")["Cost_of_Procedure_euros"]
    .sum()
    .to_frame()
    .round(2)
    .rename(columns={"Cost_of_Procedure_euros": "Total_Cost"})
)

ax = total_cost_by_ins.plot(
    kind="barh", rot=0, width=0.99, legend=False, color=ins_bar_col
)

# Give every bar its own color, cycling through the palette if needed
for bar_idx, bar in enumerate(ax.patches):
    bar.set_facecolor(ins_bar_col[bar_idx % len(ins_bar_col)])

# Annotate each non-empty bar with its total; the label starts 10000 units
# before the bar's end and runs rightwards from there
for bar_pos, (_, cost_row) in enumerate(total_cost_by_ins.iterrows()):
    if not cost_row["Total_Cost"] > 0:
        continue
    ax.text(
        x=cost_row["Total_Cost"] - 10000,
        y=bar_pos,
        s=f"{cost_row['Total_Cost']} €",
        ha="left",
        va="center",
        color="yellow",
    )

ax.set(
    title="Total Cost of Procedure by Coverage Category",
    xlabel="Cost (in €)",
    ylabel="Cost Type",
)

# Save the vector version first, then the raster version
for out_dir, file_name in (
    (image_path_svg, "total_cost_by_coverage.svg"),
    (image_path_png, "total_cost_by_coverage.png"),
):
    plt.savefig(os.path.join(out_dir, file_name))

plt.show()
total_cost_by_ins

	Total_Cost
Cost_Type
Insurance	24000
Private	38400
SSN	0

Average Cost by Coverage Type

# Mean procedure cost per coverage category, as a one-column frame
avg_cost_by_ins = (
    circ_eda.groupby("Cost_Type")["Cost_of_Procedure_euros"]
    .agg("mean")
    .to_frame()
    .round(2)
    .rename(columns={"Cost_of_Procedure_euros": "Average_Cost"})
)

ax = avg_cost_by_ins.plot(kind="barh", rot=0, width=0.99, legend=False)

# Coverage categories in plotting order (the frame's index), as a list
y_labels = avg_cost_by_ins.index.tolist()

# Give every bar its own color, cycling through the palette if needed
for bar_idx, bar in enumerate(ax.patches):
    bar.set_facecolor(ins_bar_col[bar_idx % len(ins_bar_col)])

# Annotate each non-empty bar with its average; the label starts 200 units
# before the bar's end and runs rightwards from there
for bar_pos, (_, cost_row) in enumerate(avg_cost_by_ins.iterrows()):
    if not cost_row["Average_Cost"] > 0:
        continue
    ax.text(
        x=cost_row["Average_Cost"] - 200,
        y=bar_pos,
        s=f"{cost_row['Average_Cost']} €",
        ha="left",
        va="center",
        color="yellow",
    )

ax.set(
    title="Average Cost of Procedure by Coverage Category",
    xlabel="Cost (in €)",
    ylabel="Cost Type",
)

# Save the vector version first, then the raster version
for out_dir, file_name in (
    (image_path_svg, "avg_cost_by_coverage.svg"),
    (image_path_png, "avg_cost_by_coverage.png"),
):
    plt.savefig(os.path.join(out_dir, file_name))

plt.show()
avg_cost_by_ins

	Average_Cost
Cost_Type
Insurance	1043.48
Private	984.62
SSN	0.00

Number of Patients by Country of Origin

# Horizontal bars of patient counts per country, smallest count at the bottom
(
    circ_eda["Geographical_Origin"]
    .value_counts(ascending=True)
    .plot(kind="barh")
)

plt.title("Number of Patients by Country of Origin")
plt.xlabel("Number of Patients")
plt.ylabel("Country of Origin")

# Save the vector version first, then the raster version
for out_dir, file_name in (
    (image_path_svg, "number_patients_by_country.svg"),
    (image_path_png, "number_patients_by_country.png"),
):
    plt.savefig(os.path.join(out_dir, file_name), bbox_inches="tight")
plt.show()

Cost of Procedure by Country of Origin

# Horizontal bars of the mean procedure cost per country, sorted ascending
(
    circ_eda.groupby("Geographical_Origin")["Cost_of_Procedure_euros"]
    .agg("mean")
    .sort_values()
    .plot(kind="barh")
)

plt.title("Average Cost of Procedure by Country of Origin")
plt.xlabel("Cost (in €)")
plt.ylabel("Country of Origin")

# Save the vector version first, then the raster version
for out_dir, file_name in (
    (image_path_svg, "cost_by_country.svg"),
    (image_path_png, "cost_by_country.png"),
):
    plt.savefig(os.path.join(out_dir, file_name), bbox_inches="tight")
plt.show()