In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind, chi2_contingency, fisher_exact
In [3]:
# Load the pre-processed statistics table; the first CSV column is used as
# the row index.
data = pd.read_csv(
    './data_processed/data_stats_.csv',
    index_col=0,
)
# Display (n_rows, n_columns) of the loaded frame.
data.shape
Out[3]:
(1127, 25)
In [4]:
# Class balance of the outcome column: number of rows per die_in_icu value.
data['die_in_icu'].value_counts()
Out[4]:
die_in_icu 0 988 1 139 Name: count, dtype: int64
In [97]:
# Univariate comparison of every column of `data` between the two outcome
# groups: die_in_icu == 0 (group A) and die_in_icu == 1 (group B).
#   * bool columns and the columns listed in BINARY_COLS:
#       percentage of positive rows per group + chi-square test
#   * float64 columns:
#       mean±std per group + two-sample t-test
#   * every other dtype: skipped with a warning
# The results are accumulated in Index_res / A_res / B_res / P_res, which the
# CSV export cell further down reads, so those four names must stay defined.

ALPHA = 0.05  # p-values <= ALPHA are flagged with a trailing '*'
# Columns handled as binary even though their dtype is not bool.
# NOTE(review): presumably 0/1-coded, since their sum is used as a count of
# positives -- confirm against the preprocessing step.
BINARY_COLS = ['gender', 'uc_only', 'cd_only']


def _format_p(p_val, alpha=ALPHA):
    """Format a p-value with 4 decimals, appending '*' when p_val <= alpha."""
    marker = '*' if p_val <= alpha else ''
    return f'{p_val:.4f}{marker}'


Index_res = []
A_res = []
B_res = []
P_res = []

for col in data.columns:
    # Single-column Series per group with missing values removed, so that
    # means / sums / tests come back as scalars.
    A = data.loc[data.die_in_icu == 0, col].dropna()
    B = data.loc[data.die_in_icu == 1, col].dropna()

    if data[col].dtype == 'bool' or col in BINARY_COLS:
        # Percentage of positive rows within each group.
        A_weight = A.sum() / A.shape[0] * 100
        B_weight = B.sum() / B.shape[0] * 100
        # Contingency table: outcome (rows) x column value (columns).
        group_counts = data.groupby(['die_in_icu', col]).size().unstack(fill_value=0)
        # Element 1 of chi2_contingency's result is the p-value.
        # NOTE(review): the chi-square approximation is unreliable for sparse
        # tables; the previous code also ran Fisher's exact test here but
        # never used its result. Consider reporting it for small cells.
        p_val = chi2_contingency(group_counts)[1]
        A_text = f'{A_weight:.2f}%'
        B_text = f'{B_weight:.2f}%'
    elif data[col].dtype == 'float64':
        # Element 1 of ttest_ind's result is the p-value. Called with SciPy's
        # defaults. NOTE(review): the default assumes equal variances; with
        # groups this unbalanced, confirm whether Welch's test
        # (equal_var=False) is the intended one.
        p_val = ttest_ind(A, B)[1]
        A_text = f'{A.mean():.2f}±{A.std():.2f}'
        B_text = f'{B.mean():.2f}±{B.std():.2f}'
    else:
        # Reached for any column that is neither bool / listed binary nor
        # float64 (in the recorded run: die_in_icu and icu_count). The column
        # is not absent; it is simply not analysed.
        print(f'WARN: {col} is missing.')
        continue

    Index_res.append(col)
    A_res.append(A_text)
    B_res.append(B_text)
    P_res.append(_format_p(p_val))

# Last expression of the cell: rendered as the summary table.
pd.DataFrame(
    {
        'Not Die in ICU': A_res,
        'Die in ICU': B_res,
        'P-Value': P_res,
    },
    index=Index_res,
)
WARN: die_in_icu is missing. WARN: icu_count is missing.
Out[97]:
 | Not Die in ICU | Die in ICU | P-Value |
---|---|---|---|
los | 3.11±4.20 | 3.88±5.64 | 0.0545 |
gender | 46.66% | 38.13% | 0.0719 |
age | 62.95±15.69 | 67.69±13.78 | 0.0008* |
heart_rate | 87.55±19.51 | 93.30±21.11 | 0.0013* |
respiratory_rate | 19.16±5.60 | 19.91±6.95 | 0.1495 |
hematocrit | 30.20±5.76 | 30.71±5.39 | 0.3284 |
rdw | 16.13±2.57 | 16.04±2.18 | 0.6985 |
platelet | 247.69±156.37 | 246.94±152.51 | 0.9576 |
mcv | 90.51±8.55 | 91.53±8.71 | 0.1893 |
mch | 29.57±3.17 | 29.67±3.02 | 0.7443 |
hemoglobin | 9.89±1.95 | 10.02±1.80 | 0.4598 |
uc_only | 44.64% | 49.64% | 0.3082 |
cd_only | 49.09% | 47.48% | 0.7916 |
language_ENGLISH | 95.45% | 93.53% | 0.4352 |
race_BLACK | 9.51% | 5.76% | 0.1976 |
race_HISPANIC/LATINO | 2.02% | 0.72% | 0.4652 |
race_OTHER | 6.07% | 7.19% | 0.7450 |
race_WHITE | 81.68% | 86.33% | 0.2205 |
marital_status_MARRIED | 44.74% | 50.36% | 0.2478 |
marital_status_SINGLE | 33.60% | 26.62% | 0.1220 |
marital_status_WIDOWED | 11.94% | 13.67% | 0.6568 |
insurance_Medicare | 44.33% | 61.87% | 0.0002* |
insurance_Other | 47.57% | 33.09% | 0.0018* |
In [98]:
# Persist the group-comparison table (one row per analysed column) to disk.
# Relies on Index_res / A_res / B_res / P_res filled by the comparison cell.
summary_columns = {
    'Not Die in ICU': A_res,
    'Die in ICU': B_res,
    'P-Value': P_res,
}
pd.DataFrame(summary_columns, index=Index_res).to_csv('./temp.csv')
In [ ]: