In [7]:
import pandas as pd
import numpy as np
In [8]:
admissions = pd.read_csv('./data/admissions_ibd.csv')
diagnosis = pd.read_csv('./data/diagnoses_ibd.csv')
In [9]:
# Paper 1 找到的对 Mortality 有显著性的合并症
icd_codes = ['V1582', '5849', '5990', '3051', '2761', '25000', 'I10', '56089', '4280', '5866', '2875', '496', '99592', 'D649', '7850', '51881']
mapping = {
"V1582": "Personal history of tobacco use",
"5849": "Acute kidney failure",
"5990": "Urinary tract infection",
"3051": "Tobacco use disorder",
"2761": "Hyposmolality and/or hyponatremia",
"25000": "Diabetes mellitus",
"I10": "Essential (primary) hypertension",
"56089": "Other specified intestinal obstruction",
"4280": "Congestive heart failure",
"5866": "Long-term (current) use of aspirin",
"2875": "Thrombocytopenia",
"496": "Other finger(s) amputation status",
"99592": "Severe sepsis",
"D649": "Anemia",
"7850": "Tachycardia",
"51881": "Acute respiratory failure"
}
icd = pd.merge(diagnosis, admissions.loc[:, ['hadm_id', 'admittime']], on='hadm_id', how='left')
In [10]:
data = pd.read_csv('./data_processed/data_first_record_.csv', index_col=0)
print(data.shape)
print(data.head())
(652, 27) subject_id intime los gender age heart_rate \ 327 10024331 2141-03-18 19:36:08 4.10571 1 0.684717 0.28750 1022 10025647 2176-09-22 17:57:15 1.96810 1 0.833860 0.28750 896 10037975 2185-01-17 19:12:12 4.87824 1 0.514176 0.41250 908 10056223 2122-09-23 15:08:45 5.04106 1 0.393528 0.40625 559 10063856 2174-03-08 01:04:16 1.37475 0 0.464178 0.11875 respiratory_rate hematocrit rdw platelet ... race_BLACK \ 327 0.32 0.425234 0.484848 0.127773 ... 0 1022 0.38 0.474299 0.579545 0.164153 ... 0 896 0.74 0.635514 0.640152 0.007986 ... 0 908 0.40 0.355140 0.583333 0.062112 ... 0 559 0.34 0.436916 0.481061 0.236912 ... 0 race_HISPANIC/LATINO race_OTHER race_WHITE marital_status_MARRIED \ 327 0 0 1 1 1022 0 0 1 1 896 0 1 0 1 908 1 0 0 0 559 0 0 1 1 marital_status_SINGLE marital_status_WIDOWED insurance_Medicare \ 327 0 0 1 1022 0 0 1 896 0 0 1 908 1 0 0 559 0 0 0 insurance_Other icu_count 327 0 0.0 1022 0 0.0 896 0 0.0 908 0 0.0 559 1 0.0 [5 rows x 27 columns]
In [11]:
result = (
data.loc[:, ['subject_id', 'intime']]
.merge(
icd.loc[icd.icd_code.isin(mapping.keys()), ['subject_id', 'icd_code', 'admittime']],
on='subject_id',
how='left',
)
.query('admittime <= intime')
.drop(columns=['admittime'])
)
result = (
pd.get_dummies(
result,
columns=['icd_code'],
prefix='',
prefix_sep='',
dtype='int32',
).rename(columns=mapping)
.groupby(['subject_id', 'intime'])
.max()
.reset_index()
)
result
Out[11]:
subject_id | intime | Diabetes mellitus | Hyposmolality and/or hyponatremia | Thrombocytopenia | Tobacco use disorder | Congestive heart failure | Other finger(s) amputation status | Acute respiratory failure | Other specified intestinal obstruction | Acute kidney failure | Urinary tract infection | Tachycardia | Severe sepsis | Anemia | Essential (primary) hypertension | Personal history of tobacco use | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10024331 | 2141-03-18 19:36:08 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 10025647 | 2176-09-22 17:57:15 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 10037975 | 2185-01-17 19:12:12 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
3 | 10056223 | 2122-09-23 15:08:45 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
4 | 10063856 | 2174-03-08 01:04:16 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
520 | 19878969 | 2149-03-06 20:30:00 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 |
521 | 19890872 | 2190-07-30 12:49:21 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
522 | 19923690 | 2142-07-20 22:02:00 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 |
523 | 19940147 | 2127-11-11 12:29:24 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
524 | 19970491 | 2129-05-17 17:57:50 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 |
525 rows × 17 columns
In [13]:
(
data.merge(
result,
on=['subject_id', 'intime'],
how='left',
).drop(
columns=[
'subject_id', 'intime',
'race_BLACK', 'race_HISPANIC/LATINO', 'race_OTHER', 'race_WHITE',
'marital_status_SINGLE', 'marital_status_WIDOWED',
]
)
# .fillna(0.)
# .isna().any().any()
.to_csv('./data_processed/data_first_record_with_commorbidities_.csv')
# .columns
)
In [ ]: