In [ ]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy import stats
Loading Data¶
In [ ]:
# Load the combined ICU + IBD table; admission time must be a datetime for later math.
df = pd.read_csv('./data/icu_ibd_all_table.csv')
df['intime'] = pd.to_datetime(df['intime'])
print(df.shape)
print(df.head())
In [3]:
# Load patient demographics; anchor_year (an integer year) becomes Jan 1 of that year.
df2 = pd.read_csv('./data/patients_ibd.csv')
df2['anchor_year'] = pd.to_datetime(df2['anchor_year'].astype(str) + '-01-01')
print(df2.shape)
print(df2.head())
(2417, 6) subject_id gender anchor_age anchor_year anchor_year_group dod 0 10098672 M 61 2140-01-01 2011 - 2013 NaN 1 10303503 F 23 2144-01-01 2008 - 2010 NaN 2 10312715 M 39 2176-01-01 2008 - 2010 NaN 3 10318500 F 46 2194-01-01 2011 - 2013 NaN 4 10410021 M 49 2135-01-01 2011 - 2013 NaN
Preprocess Data¶
Make a Combination¶
In [4]:
# Attach each patient's anchor_year to their ICU stays, then derive age at admission:
# age_at_admission = years elapsed between anchor_year and intime + recorded age.
merged = df.merge(
    df2[['subject_id', 'anchor_year']],  # only anchor_year is needed from df2
    on='subject_id',
    how='left',
)
# Order each subject's stays chronologically by admission time.
merged = merged.sort_values(by=['subject_id', 'intime'])
merged['age'] = (merged['intime'] - merged['anchor_year']).dt.days / 365 + merged['age']
data = merged.drop(columns=['outtime', 'anchor_year'])
print(data.shape)
print(data.head())
(1161, 31) hadm_id subject_id intime los mortality gender \ 327 22643604 10024331 2141-03-18 19:36:08 4.10571 1 1 1022 28899194 10025647 2176-09-22 17:57:15 1.96810 1 1 896 27617929 10037975 2185-01-17 19:12:12 4.87824 1 1 110 20845468 10048262 2168-08-21 00:21:53 0.44588 0 1 908 27715453 10056223 2122-09-23 15:08:45 5.04106 0 1 age weight bmi heart_rate ... hemoglobin CRP \ 327 73.210959 NaN NaN 70.0 ... 9.4 NaN 1022 84.726027 NaN NaN 70.0 ... 10.5 NaN 896 60.043836 NaN NaN 90.0 ... 12.5 NaN 110 46.641096 NaN NaN 101.0 ... NaN NaN 908 50.728767 NaN NaN 89.0 ... 9.4 NaN race language marital_status insurance \ 327 WHITE ENGLISH MARRIED Medicare 1022 WHITE ENGLISH MARRIED Medicare 896 UNKNOWN ENGLISH MARRIED Medicare 110 WHITE ENGLISH MARRIED Medicare 908 HISPANIC/LATINO - DOMINICAN ? SINGLE Medicaid die_in_icu uc_only cd_only uc_cd 327 0 1 0 0 1022 0 1 0 0 896 1 0 1 0 110 0 1 0 0 908 0 1 0 0 [5 rows x 31 columns]
Drop Columns (Missing Value + Useless)¶
In [5]:
# Find columns whose share of missing values exceeds 5%; they will be dropped later.
# Vectorized: isna().mean() gives the per-column NaN fraction in one pass,
# replacing the original per-column Python loop (same result, same print).
missing_pct = data.isna().mean() * 100
cols_missing = missing_pct[missing_pct > 5].index.tolist()
print('缺失值过多的字段')
print(cols_missing)
缺失值过多的字段 ['weight', 'bmi', 'systolic_pressure', 'diastolic_pressure', 'temperature', 'white_blood_cell', 'red_blood_cell', 'CRP']
In [6]:
# 删除不需要的字段
cols_useless = [
'hadm_id',
'mortality',
]
print('手动删除的字段')
print(cols_useless)
手动删除的字段 ['hadm_id', 'mortality']
In [7]:
# Drop both the high-missingness and the manually excluded columns.
# Rebinding (no inplace=True) keeps the cell idempotent-friendly and avoids
# the pandas inplace anti-pattern; net effect on `data` is identical.
cols_except = cols_missing + cols_useless
print(f'Cols (before): {data.shape[1]}')
data = data.drop(columns=cols_except)
print(f'Cols (after): {data.shape[1]}')
Cols (before): 31 Cols (after): 21
Drop Rows (Missing Value)¶
In [8]:
# Drop rows with too many missing values: a row is kept only if its NaN count
# is at most 5% of (number of columns + 1).
# NOTE(review): the "+1" makes the cutoff slightly looser than a plain
# 5%-of-columns rule (with 21 cols: threshold 1.1, so rows with >1 NaN are
# dropped). Looks deliberate but is undocumented — confirm with the author.
print(f'Rows (before): {data.shape[0]}')
data = data[~(data.isna().sum(axis=1) > (data.shape[1]+1)*0.05)]
print(f'Rows (after): {data.shape[0]}')
Rows (before): 1161 Rows (after): 1127
Process Race¶
In [9]:
def parse_race(race):
    """Collapse a raw race string into one of five coarse categories.

    Substrings are checked in priority order — WHITE, BLACK,
    HISPANIC/LATINO, ASIAN — and anything unmatched maps to 'OTHER'.
    """
    keyword_to_label = (
        ('WHITE', 'WHITE'),
        ('BLACK', 'BLACK'),
        ('HISPANIC', 'HISPANIC/LATINO'),
        ('LATINO', 'HISPANIC/LATINO'),
        ('ASIAN', 'ASIAN'),
    )
    for keyword, label in keyword_to_label:
        if keyword in race:
            return label
    return 'OTHER'
# Normalize the race column in place; .map behaves like .apply for a plain function.
data.loc[:, 'race'] = data.loc[:, 'race'].map(parse_race)
Format Dummy Variables¶
In [10]:
# 对 type 做 drop_first
data.drop(columns=['uc_cd'], inplace=True)
In [11]:
# 需要转换为 dummy variable 的字段
cols = ['language', 'race', 'marital_status', 'insurance']
for col in cols:
tmp = data[col].value_counts(dropna=False)
res = pd.DataFrame({'index': tmp.index, 'count': tmp.values, 'percentage': tmp.values/tmp.sum()*100})
print(f'{res.set_index("index")}')
print()
count percentage index ENGLISH 1073 95.208518 ? 54 4.791482 count percentage index WHITE 927 82.253771 BLACK 102 9.050577 OTHER 70 6.211180 HISPANIC/LATINO 21 1.863354 ASIAN 7 0.621118 count percentage index MARRIED 512 45.430346 SINGLE 369 32.741792 WIDOWED 137 12.156167 DIVORCED 95 8.429459 NaN 14 1.242236 count percentage index Medicare 524 46.495120 Other 516 45.785271 Medicaid 87 7.719610
In [12]:
# 处理 dummy variables
data = pd.get_dummies(data=data,
columns=cols,
prefix=cols,
drop_first=True)
data.columns
Out[12]:
Index(['subject_id', 'intime', 'los', 'gender', 'age', 'heart_rate', 'respiratory_rate', 'hematocrit', 'rdw', 'platelet', 'mcv', 'mch', 'hemoglobin', 'die_in_icu', 'uc_only', 'cd_only', 'language_ENGLISH', 'race_BLACK', 'race_HISPANIC/LATINO', 'race_OTHER', 'race_WHITE', 'marital_status_MARRIED', 'marital_status_SINGLE', 'marital_status_WIDOWED', 'insurance_Medicare', 'insurance_Other'], dtype='object')
Add ICU Count¶
In [13]:
# Number each patient's ICU stays chronologically (1 = first admission).
# cumcount's result realigns on data's original index, so the assignment is safe.
stay_order = (
    data.sort_values(by=['subject_id', 'intime'])
    .groupby('subject_id')
    .cumcount()
)
data['icu_count'] = stay_order + 1
Normalization¶
In [ ]:
# Scale numeric features to [0, 1]; StandardScaler kept as a commented alternative.
cols = [
    'age', 'heart_rate', 'respiratory_rate', 'hematocrit', 'rdw',
    'platelet', 'mcv', 'mch', 'hemoglobin', 'icu_count',
]
# scaler = StandardScaler()
scaler = MinMaxScaler()
data[cols] = scaler.fit_transform(data[cols])
In [ ]:
# 转换所有 bool 类型为 int
for k, v in data.dtypes.items():
if v == bool:
data[k] = data[k].astype(int)
Export Data¶
First ICU Record¶
In [19]:
# For Statistics
(
data
# .drop_duplicates(subset='subject_id', keep='first',)
.drop(columns=['subject_id', 'intime',])
.to_csv('./data_processed/data_stats_.csv')
# .columns
)
In [ ]:
# Export the full processed table (one row per ICU stay).
data.to_csv('./data_processed/data_first_record_.csv')
In [ ]:
# Length-of-stay scatter: one color per patient, marker size encodes ICU stay number.
plt.figure(figsize=(12, 6))
patient_codes = data['subject_id'].astype('category').cat.codes
plt.scatter(
    data['los'],
    data['subject_id'],
    alpha=0.9,                     # point transparency
    s=data['icu_count'] * 100,     # bigger dot = later stay for that patient
    c=patient_codes,
    cmap='viridis',
)
plt.xticks(rotation=45)
In [ ]:
from mpl_toolkits.mplot3d import Axes3D  # registers the '3d' projection

# 3-D view: length of stay x patient x ICU stay number.
fig = plt.figure(figsize=(12, 8), dpi=300)
ax = fig.add_subplot(111, projection='3d')
patient_codes = data['subject_id'].astype('category').cat.codes
scatter = ax.scatter(
    data['los'],
    data['subject_id'],
    data['icu_count'],                  # third axis: stay number
    alpha=0.9,
    s=np.exp(data['icu_count'] * 5),    # marker size also grows with stay number
    c=patient_codes,
    cmap='jet',
)
ax.set_xlabel('Length of Stay (days)')
ax.set_ylabel('Subject ID')
ax.set_zlabel('ICU Count')
ax.set_title('3D Plot of ICU Data')
plt.colorbar(scatter, ax=ax, label='Subject ID')
In [ ]:
# Distribution of anchor_year_group among the retained ICU stays.
(
    data
    .merge(df2[['subject_id', 'anchor_year_group']], on='subject_id', how='left')
    ['anchor_year_group']
    .value_counts()
)
In [ ]:
data
In [ ]: