In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, auc, roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
Loading Data¶
In [6]:
# Load the processed dataset; the first CSV column becomes the row index.
data = pd.read_csv('./data_processed/data_die_in_icu_with_commorbidities.csv', index_col=0)
# Impute missing values with each column's mean. Reassigning instead of using
# inplace=True keeps the cell idempotent on re-run, and numeric_only=True stops
# the mean from failing if a non-numeric column is present.
# NOTE(review): the means are computed on the full dataset, before the
# train/test split performed inside run(), so held-out rows influence the
# imputed values. Consider imputing after the split.
data = data.fillna(data.mean(numeric_only=True))
# Optional binarisation of the target (length of stay > 4). The
# logistic-regression run() needs a binary `los`; the linear-regression run()
# uses it as-is. NOTE(review): confirm which variant each downstream cell expects.
# data.los = (data.los > 4).astype(int)
In [3]:
def run(data, los_threshold=None):
    """Fit a logistic-regression classifier for ``los`` and plot its ROC curve.

    Every column of ``data`` except ``los`` is used as a feature. The data is
    split 70/30 (``random_state=42``), the model is fitted on the training
    part, and accuracy plus the ROC curve are reported on the held-out part.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``los`` column. Unless ``los_threshold`` is given, this
        column must already hold a binary label, because the ROC computation
        below uses the probability of the second class only.
    los_threshold : float, optional
        When given, the target is binarised as ``los > los_threshold`` before
        fitting. The default ``None`` leaves ``los`` unchanged.

    Returns
    -------
    res : pandas.DataFrame
        One ``Coefficient`` column, indexed by feature name.
    model : LogisticRegression
        The fitted estimator.
    """
    X = data.loc[:, data.columns != 'los']
    # Use a 1-D Series for the target: passing a one-column DataFrame makes
    # scikit-learn emit a DataConversionWarning on fit().
    y = data['los']
    if los_threshold is not None:
        y = (y > los_threshold).astype(int)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42)

    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # coef_ has shape (1, n_features) for a binary problem; flatten it so it
    # lines up with the feature names.
    res = pd.DataFrame(model.coef_.reshape(-1), index=X.columns, columns=['Coefficient'])

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy with best parameters: {accuracy * 100.0}")

    # Probability of the second class (model.classes_[1]) is the ROC score.
    y_scores = model.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, y_scores)
    roc_auc = auc(fpr, tpr)

    fig, ax = plt.subplots(figsize=(5, 5), dpi=300)
    ax.plot(fpr,
            tpr,
            color='darkorange',
            lw=2,
            label=f'ROC curve (area = {roc_auc:.2f})')
    # The alpha parameter controls the transparency of the fill colour.
    ax.fill_between(fpr, tpr, color='darkorange', alpha=0.2)
    # Diagonal reference line: performance of a random classifier.
    ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.legend(loc="lower right")
    plt.show()
    return res, model
In [4]:
# Fit the model with the `run` defined in the cell above and keep both the
# coefficient table and the fitted estimator.
# NOTE(review): that `run` fits a classifier, so `los` presumably has to be
# binary at this point, while the binarisation line in the loading cell is
# commented out. Confirm before re-running from a fresh kernel.
res, model = run(data)
# A bare last expression makes the notebook render the coefficient table.
res
Accuracy with best parameters: 50.0
c:\Users\sitdo\.conda\envs\ibd\Lib\site-packages\sklearn\utils\validation.py:1300: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). y = column_or_1d(y, warn=True)
Out[4]:
Coefficient | |
---|---|
gender | 0.568739 |
age | 0.211650 |
heart_rate | -0.590650 |
respiratory_rate | -0.153657 |
hematocrit | -0.379867 |
rdw | 0.531632 |
platelet | -0.103438 |
mcv | 0.427324 |
mch | 0.011441 |
hemoglobin | 0.502607 |
uc_only | 0.437538 |
cd_only | -0.087142 |
language_ENGLISH | -0.123826 |
race_BLACK | 0.323635 |
race_HISPANIC/LATINO | 0.000000 |
race_OTHER | -0.152254 |
race_WHITE | -0.168238 |
marital_status_MARRIED | 0.466710 |
marital_status_SINGLE | -0.445054 |
marital_status_WIDOWED | -0.514900 |
insurance_Medicare | -0.163893 |
insurance_Other | -0.397304 |
Diabetes mellitus | -0.065035 |
Hyposmolality and/or hyponatremia | 0.242183 |
Thrombocytopenia | 1.121379 |
Tobacco use disorder | -0.243405 |
Congestive heart failure | 0.283854 |
Other finger(s) amputation status | 0.162830 |
Acute respiratory failure | 0.891182 |
Other specified intestinal obstruction | -0.131905 |
Acute kidney failure | -0.386066 |
Urinary tract infection | 0.070269 |
Tachycardia | -0.939251 |
Severe sepsis | -0.008150 |
Anemia | -0.028707 |
Essential (primary) hypertension | -0.851959 |
Personal history of tobacco use | 0.594789 |
In [9]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
def run(data):
    """Fit a linear regression of ``los`` on all other columns and report test error.

    The frame is split 70/30 with ``random_state=42``; the model is fitted on
    the training part and the mean squared error and R² on the held-out part
    are printed.

    Note: this redefines ``run`` and therefore replaces the logistic-regression
    function of the same name defined in an earlier cell.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``los`` column, used as the regression target.

    Returns
    -------
    res : pandas.DataFrame
        One ``Coefficient`` column, indexed by feature name.
    model : LinearRegression
        The fitted estimator.
    """
    # One boolean mask selects the target column; its negation selects the features.
    is_target = data.columns == 'los'
    features = data.loc[:, ~is_target]
    target = data.loc[:, is_target]

    feat_train, feat_test, target_train, target_test = train_test_split(
        features, target, test_size=0.3, random_state=42)

    # fit() returns the estimator itself, so construction and fitting can be chained.
    model = LinearRegression().fit(feat_train, target_train)

    # The target is a one-column frame, so coef_ is 2-D; flatten to one value per feature.
    flat_coefs = model.coef_.reshape(-1)
    res = pd.DataFrame(flat_coefs, index=features.columns, columns=['Coefficient'])

    predictions = model.predict(feat_test)
    mse = mean_squared_error(target_test, predictions)
    r2 = r2_score(target_test, predictions)
    print(f"Mean Squared Error: {mse}")
    print(f"R²: {r2}")
    return res, model
In [10]:
# `run` now refers to the linear-regression version defined in the cell above
# (it shadows the earlier logistic-regression `run`), and `res` / `model` from
# the earlier call are overwritten here.
res, model = run(data)
# A bare last expression makes the notebook render the coefficient table.
res
Mean Squared Error: 42.674823787709485 R²: -1.0568631047239445
Out[10]:
Coefficient | |
---|---|
gender | 6.812909e-01 |
age | -5.182908e-01 |
heart_rate | -4.965463e-01 |
respiratory_rate | -6.312746e-01 |
hematocrit | -6.877053e+00 |
rdw | 1.459312e+00 |
platelet | 1.102119e+00 |
mcv | 4.138899e+00 |
mch | -3.748251e+00 |
hemoglobin | 6.412568e+00 |
uc_only | -2.837385e+00 |
cd_only | -2.015443e+00 |
language_ENGLISH | -4.860911e+00 |
race_BLACK | 6.848052e-01 |
race_HISPANIC/LATINO | 7.105427e-15 |
race_OTHER | -1.054092e+00 |
race_WHITE | 3.692869e-01 |
marital_status_MARRIED | 6.097202e-01 |
marital_status_SINGLE | -1.519793e-01 |
marital_status_WIDOWED | 9.388441e-02 |
insurance_Medicare | -1.032345e+01 |
insurance_Other | -1.014416e+01 |
Diabetes mellitus | 2.126792e-01 |
Hyposmolality and/or hyponatremia | 2.853914e-01 |
Thrombocytopenia | 8.540771e-01 |
Tobacco use disorder | -1.988926e+00 |
Congestive heart failure | 7.958829e-02 |
Other finger(s) amputation status | -3.556449e-01 |
Acute respiratory failure | 4.059665e+00 |
Other specified intestinal obstruction | -1.009630e-01 |
Acute kidney failure | -2.161920e+00 |
Urinary tract infection | -1.869136e+00 |
Tachycardia | -3.503471e+00 |
Severe sepsis | 2.538384e+00 |
Anemia | 8.301675e-01 |
Essential (primary) hypertension | -3.292026e+00 |
Personal history of tobacco use | 1.106142e+00 |
In [ ]: