import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, auc, roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

# Read the processed table from disk; the first CSV column becomes the row index.
data = pd.read_csv('./data_processed/data_die_in_icu_with_commorbidities.csv', index_col=0)

# Impute missing values column by column with that column's mean.
data = data.fillna(data.mean())

# Optional binarisation of the target (currently disabled).
# NOTE(review): the logistic-regression cell calls roc_curve, which needs a
# binary `los` -- presumably this line was enabled for that run; confirm.
# data.los = (data.los > 4).astype(int)

def run(data):
    """Fit a logistic regression on ``los`` and plot its test-set ROC curve.

    Every column of ``data`` other than ``los`` is used as a feature. The rows
    are split 70/30 into train and test sets (``random_state=42``), the model
    is fitted on the training part, the test-set accuracy is printed, and a
    ROC curve for the test set is drawn with matplotlib.

    Args:
        data: DataFrame with a ``los`` column holding exactly two distinct
            class labels; all remaining columns are treated as features.

    Returns:
        A ``(res, model)`` tuple. ``res`` is a one-column DataFrame
        (``'Coefficient'``) indexed by feature name; ``model`` is the fitted
        ``LogisticRegression``.

    Raises:
        ValueError: If ``los`` does not contain exactly two distinct values.
    """
    X = data.drop(columns='los')
    # Use a 1-D Series rather than a one-column DataFrame: scikit-learn
    # classifiers expect y of shape (n_samples,) and otherwise emit a
    # DataConversionWarning on every fit.
    y = data['los']

    # The coefficient table and the ROC curve below are only defined for a
    # two-class problem, so fail early with a clear message instead of an
    # obscure shape/"multiclass" error further down.
    if y.nunique() != 2:
        raise ValueError(
            "run() expects a binary 'los' target; "
            f"found {y.nunique()} distinct values"
        )

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)

    # For a binary problem coef_ has shape (1, n_features); flatten it so each
    # feature name lines up with one coefficient.
    res = pd.DataFrame(model.coef_.reshape(-1), index=X.columns, columns=['Coefficient'])

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # NOTE: no hyper-parameter search happens here; the wording is kept
    # unchanged so the printed output stays comparable with earlier runs.
    print(f"Accuracy with best parameters: {accuracy * 100.0}")

    # Column 1 of predict_proba is the probability of model.classes_[1];
    # pass the same label as pos_label so the ROC curve scores that class.
    y_scores = model.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, y_scores, pos_label=model.classes_[1])
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(5, 5), dpi=300)

    plt.plot(fpr,
             tpr,
             color='darkorange',
             lw=2,
             label=f'ROC curve (area = {roc_auc:.2f})')

    # alpha controls the transparency of the shaded area under the curve.
    plt.fill_between(fpr, tpr, color='darkorange', alpha=0.2)

    # Diagonal reference line: the ROC of a random classifier.
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc="lower right")
    plt.show()

    return res, model

# Fit the classifier on the loaded table; `run` also prints the test accuracy
# and shows the ROC plot as side effects.
res, model = run(data)

# Bare expression -- presumably the last line of a notebook cell, where it
# displays the coefficient table.
res

Accuracy with best parameters: 50.0

c:\Users\sitdo\.conda\envs\ibd\Lib\site-packages\sklearn\utils\validation.py:1300: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

def run(data):
    """Fit an ordinary least-squares model for ``los`` and report test error.

    All columns of ``data`` except ``los`` are the predictors. A 70/30
    train/test split (``random_state=42``) is made, the model is fitted on
    the training rows, and the test-set mean squared error and R² are printed.

    Args:
        data: DataFrame containing a ``los`` column plus the predictor columns.

    Returns:
        A ``(res, model)`` tuple: ``res`` is a one-column DataFrame
        (``'Coefficient'``) indexed by predictor name, ``model`` is the fitted
        ``LinearRegression``.
    """
    is_target = data.columns == 'los'
    features = data.loc[:, ~is_target]
    # Kept as a one-column DataFrame, so the fitted coef_ is 2-D.
    target = data.loc[:, is_target]

    split = train_test_split(features, target, test_size=0.3, random_state=42)
    X_train, X_test, y_train, y_test = split

    model = LinearRegression().fit(X_train, y_train)

    # Flatten the (1, n_features) coefficient matrix into one value per column.
    res = pd.DataFrame(
        model.coef_.ravel(),
        index=features.columns,
        columns=['Coefficient'],
    )

    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)

    print(f"Mean Squared Error: {mse}")
    print(f"R²: {r2}")

    return res, model

# Fit the linear model on the loaded table; `run` also prints the test-set
# mean squared error and R² as side effects.
res, model = run(data)

# Bare expression -- presumably the last line of a notebook cell, where it
# displays the coefficient table.
res

Mean Squared Error: 42.674823787709485
R²: -1.0568631047239445

	Coefficient
gender	0.568739
age	0.211650
heart_rate	-0.590650
respiratory_rate	-0.153657
hematocrit	-0.379867
rdw	0.531632
platelet	-0.103438
mcv	0.427324
mch	0.011441
hemoglobin	0.502607
uc_only	0.437538
cd_only	-0.087142
language_ENGLISH	-0.123826
race_BLACK	0.323635
race_HISPANIC/LATINO	0.000000
race_OTHER	-0.152254
race_WHITE	-0.168238
marital_status_MARRIED	0.466710
marital_status_SINGLE	-0.445054
marital_status_WIDOWED	-0.514900
insurance_Medicare	-0.163893
insurance_Other	-0.397304
Diabetes mellitus	-0.065035
Hyposmolality and/or hyponatremia	0.242183
Thrombocytopenia	1.121379
Tobacco use disorder	-0.243405
Congestive heart failure	0.283854
Other finger(s) amputation status	0.162830
Acute respiratory failure	0.891182
Other specified intestinal obstruction	-0.131905
Acute kidney failure	-0.386066
Urinary tract infection	0.070269
Tachycardia	-0.939251
Severe sepsis	-0.008150
Anemia	-0.028707
Essential (primary) hypertension	-0.851959
Personal history of tobacco use	0.594789

	Coefficient
gender	6.812909e-01
age	-5.182908e-01
heart_rate	-4.965463e-01
respiratory_rate	-6.312746e-01
hematocrit	-6.877053e+00
rdw	1.459312e+00
platelet	1.102119e+00
mcv	4.138899e+00
mch	-3.748251e+00
hemoglobin	6.412568e+00
uc_only	-2.837385e+00
cd_only	-2.015443e+00
language_ENGLISH	-4.860911e+00
race_BLACK	6.848052e-01
race_HISPANIC/LATINO	7.105427e-15
race_OTHER	-1.054092e+00
race_WHITE	3.692869e-01
marital_status_MARRIED	6.097202e-01
marital_status_SINGLE	-1.519793e-01
marital_status_WIDOWED	9.388441e-02
insurance_Medicare	-1.032345e+01
insurance_Other	-1.014416e+01
Diabetes mellitus	2.126792e-01
Hyposmolality and/or hyponatremia	2.853914e-01
Thrombocytopenia	8.540771e-01
Tobacco use disorder	-1.988926e+00
Congestive heart failure	7.958829e-02
Other finger(s) amputation status	-3.556449e-01
Acute respiratory failure	4.059665e+00
Other specified intestinal obstruction	-1.009630e-01
Acute kidney failure	-2.161920e+00
Urinary tract infection	-1.869136e+00
Tachycardia	-3.503471e+00
Severe sepsis	2.538384e+00
Anemia	8.301675e-01
Essential (primary) hypertension	-3.292026e+00
Personal history of tobacco use	1.106142e+00

Loading Data