Setup

rm(list=ls(all=TRUE))
setwd('C:/Users/sitdo/Documents/GitHub/IBD-EDA/paper1/')

Loading Data

library(dplyr)

Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union
data <- read.csv("./data_preprocessed/data.csv") %>% select(-1)

Loading Packages

library(MASS)

Attaching package: ‘MASS’

The following object is masked from ‘package:dplyr’:

    select
library(caret)
Loading required package: ggplot2
Loading required package: lattice

Without Selection

Building Formula

variables <- setdiff(names(data), "dod")

variables_str <- paste(variables, collapse = " + ")
formula_str <- paste("dod ~", variables_str)

formula <- as.formula(formula_str)

Method I: Splitting Data

set.seed(123)
splitting_ratio <- 0.7

indices <- 1:nrow(data)
shuffled_indices <- sample(indices) 
train_size <- floor(splitting_ratio * length(indices))

train_indices <- shuffled_indices[1:train_size]
test_indices <- shuffled_indices[(train_size + 1):length(indices)]

train_data <- data[train_indices, ]
test_data <- data[test_indices, ]
model <- glm(formula = formula, 
             family = binomial, 
             data = train_data)

summary_model <- summary(model)
print(summary_model)

Call:
glm(formula = formula, family = binomial, data = train_data)

Coefficients:
                                Estimate Std. Error z value Pr(>|z|)    
(Intercept)                   -20.568365 457.277673  -0.045 0.964123    
anchor_age                      4.376731   0.682998   6.408 1.47e-10 ***
gender_M                        0.117515   0.224070   0.524 0.599960    
anchor_year_group_2011...2013   0.181411   0.233117   0.778 0.436454    
anchor_year_group_2014...2016   0.191048   0.297431   0.642 0.520659    
insurance_Medicare              0.278722   0.588401   0.474 0.635719    
insurance_Other                 0.122803   0.570508   0.215 0.829571    
language_ENGLISH               -0.063278   0.541505  -0.117 0.906975    
marital_status_MARRIED         -0.303299   0.358921  -0.845 0.398094    
marital_status_SINGLE           0.033429   0.376572   0.089 0.929263    
marital_status_WIDOWED          0.205544   0.416083   0.494 0.621308    
race_BLACK                     13.744950 457.276865   0.030 0.976021    
race_HISPANIC.LATINO           14.889290 457.277429   0.033 0.974025    
race_OTHER                     14.441410 457.276804   0.032 0.974806    
race_WHITE                     14.750642 457.276681   0.032 0.974267    
X4019                           0.281319   0.211344   1.331 0.183157    
X53081                          0.125019   0.228094   0.548 0.583623    
X2859                          -0.220703   0.245522  -0.899 0.368699    
V1582                           0.591333   0.237162   2.493 0.012654 *  
X311                            0.494764   0.241969   2.045 0.040881 *  
X2724                           0.116625   0.235142   0.496 0.619910    
X5849                           0.149032   0.259338   0.575 0.565518    
X30000                          0.012156   0.261345   0.047 0.962901    
X27651                          0.231549   0.277795   0.834 0.404548    
V4572                           0.170045   0.314391   0.541 0.588596    
X5990                           0.557491   0.258880   2.153 0.031281 *  
V5865                          -0.151904   0.281525  -0.540 0.589490    
X2851                          -0.146591   0.283898  -0.516 0.605610    
X78791                         -0.136259   0.290074  -0.470 0.638543    
X2809                           0.167475   0.314437   0.533 0.594298    
X3051                           0.833239   0.290879   2.865 0.004176 ** 
X2449                          -0.032129   0.256031  -0.125 0.900138    
V5861                          -0.104361   0.308857  -0.338 0.735443    
X49390                         -0.157094   0.349305  -0.450 0.652903    
X42731                          0.139982   0.280091   0.500 0.617235    
X2761                           0.784638   0.265078   2.960 0.003076 ** 
X25000                          0.368884   0.292123   1.263 0.206672    
X2762                           0.687437   0.290331   2.368 0.017896 *  
X41401                          0.034478   0.309362   0.111 0.911261    
V442                            0.564639   0.332344   1.699 0.089326 .  
Z87891                         -0.132430   0.375974  -0.352 0.724665    
K219                           -0.553997   0.388538  -1.426 0.153912    
I10                            -0.304855   0.371983  -0.820 0.412478    
X56089                         -0.958778   0.420954  -2.278 0.022748 *  
X28860                         -0.043626   0.328224  -0.133 0.894260    
X2720                          -0.269889   0.297439  -0.907 0.364208    
X2768                           0.587975   0.298933   1.967 0.049193 *  
K5090                           0.335024   0.380618   0.880 0.378745    
X5609                           0.247606   0.375040   0.660 0.509118    
X486                           -0.306877   0.298766  -1.027 0.304351    
X4280                           0.545076   0.289455   1.883 0.059685 .  
X73300                          0.210886   0.283605   0.744 0.457124    
X33829                         -0.166045   0.363199  -0.457 0.647546    
E8788                          -0.127576   0.386857  -0.330 0.741569    
E785                           -0.033408   0.396566  -0.084 0.932864    
X42789                         -0.018041   0.311825  -0.058 0.953863    
F329                            0.267174   0.448486   0.596 0.551359    
X78701                          0.212501   0.385781   0.551 0.581747    
E8497                           0.048812   0.353645   0.138 0.890220    
F419                           -0.935118   0.495969  -1.885 0.059371 .  
V1251                           0.433267   0.360049   1.203 0.228839    
X40390                          0.795588   0.469768   1.694 0.090346 .  
E8798                          -0.123046   0.335416  -0.367 0.713734    
X5601                          -0.263096   0.372585  -0.706 0.480104    
X2800                          -0.585804   0.425886  -1.375 0.168978    
V5866                          -0.500317   0.327123  -1.529 0.126154    
X27652                          0.355667   0.334254   1.064 0.287301    
X28529                          0.298285   0.356542   0.837 0.402814    
N179                            0.626968   0.376630   1.665 0.095977 .  
X99859                          0.840477   0.455358   1.846 0.064928 .  
X56400                          0.671233   0.339971   1.974 0.048338 *  
X5859                          -0.492281   0.481967  -1.021 0.307065    
X2875                           0.710494   0.320059   2.220 0.026427 *  
X2767                          -0.375984   0.346216  -1.086 0.277488    
X78321                         -0.614696   0.412255  -1.491 0.135946    
X27800                         -0.565395   0.394524  -1.433 0.151828    
X4589                           0.106752   0.340144   0.314 0.753639    
X00845                         -0.383894   0.380173  -1.010 0.312596    
E8782                          -0.189694   0.453056  -0.419 0.675436    
X5589                          -0.478533   0.406010  -1.179 0.238548    
Z9049                          -0.834310   0.487129  -1.713 0.086766 .  
X34690                          0.320296   0.411873   0.778 0.436771    
V4589                          -0.253097   0.356998  -0.709 0.478349    
X0389                          -0.177902   0.419587  -0.424 0.671571    
X32723                         -0.577602   0.399418  -1.446 0.148147    
V1279                          -0.617592   0.458913  -1.346 0.178376    
X2639                           0.718825   0.392310   1.832 0.066909 .  
K5190                           0.415341   0.445305   0.933 0.350969    
X3004                           0.058633   0.405844   0.144 0.885128    
X78060                          0.135103   0.394688   0.342 0.732124    
X496                            0.357420   0.312102   1.145 0.252124    
X78052                         -0.106995   0.370721  -0.289 0.772876    
X99592                          0.999731   0.416997   2.397 0.016510 *  
X56210                         -0.007779   0.340509  -0.023 0.981774    
V552                           -0.023422   0.505637  -0.046 0.963053    
D649                            1.407014   0.396461   3.549 0.000387 ***
X56722                         -1.076152   0.573644  -1.876 0.060656 .  
V5867                           0.561122   0.389217   1.442 0.149396    
X7850                           0.567011   0.410517   1.381 0.167213    
E9320                          -0.311771   0.452628  -0.689 0.490948    
X5680                          -0.237204   0.437828  -0.542 0.587974    
Y929                            0.513069   0.427964   1.199 0.230582    
X78820                         -0.076372   0.342501  -0.223 0.823549    
X73390                         -0.036771   0.417633  -0.088 0.929840    
X78702                         -0.237030   0.397751  -0.596 0.551225    
X51881                          0.980037   0.355040   2.760 0.005774 ** 
X7840                           0.164613   0.439942   0.374 0.708277    
X78900                          0.130564   0.490209   0.266 0.789975    
X79092                          0.235925   0.385304   0.612 0.540333    
X71590                         -0.126250   0.349736  -0.361 0.718109    
X78659                         -0.942214   0.423588  -2.224 0.026124 *  
X412                            0.777203   0.401817   1.934 0.053086 .  
V5869                           0.460522   0.417668   1.103 0.270200    
E8490                           0.301587   0.353409   0.853 0.393457    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1478.44  on 1690  degrees of freedom
Residual deviance:  831.18  on 1577  degrees of freedom
AIC: 1059.2

Number of Fisher Scoring iterations: 15
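
Note that the intercept and the race coefficients above have standard errors near 457 and the fit took 15 Fisher scoring iterations, which suggests quasi-complete separation in those dummies. A quick check (a sketch, not part of the original analysis) is to count fitted probabilities pinned numerically at 0 or 1:

# Sketch: look for fitted probabilities at the 0/1 boundary (separation symptom)
range(fitted(model))
sum(fitted(model) < 1e-8 | fitted(model) > 1 - 1e-8)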

Performance

predictions <- predict(model, test_data, type="response")

Confusion Matrix

confusion_matrix <- table(
  as.numeric(test_data$dod), as.numeric(ifelse(predictions > 0.5, 1, 0))
)

TP <- confusion_matrix[1, 1]
TN <- confusion_matrix[2, 2]
FP <- confusion_matrix[2, 1]
FN <- confusion_matrix[1, 2]

## Calculate Accuracy
accuracy <- (TP + TN) / (TP + FP + TN + FN)
cat("Accuracy:", accuracy, "\n")
Accuracy: 0.8636364 
## Calculate Recall
recall <- TP / (TP + FN)
cat("Recall:", recall, "\n")
Recall: 0.9381107 
## Calculate Precision
precision <- TP / (TP + FP)
cat("Precision:", precision, "\n")
Precision: 0.9042386 
## Calculate Specificity
specificity <- TN / (TN + FP)
cat("Specificity:", specificity, "\n")
Specificity: 0.4553571 
## Calculate F1 Score
f1_score <- 2 * (precision * recall) / (precision + recall)
cat("F1 Score:", f1_score, "\n")
F1 Score: 0.9208633 

ROC Curve

library(pROC)
Type 'citation("pROC")' for a citation.

Attaching package: ‘pROC’

The following objects are masked from ‘package:stats’:

    cov, smooth, var
# Calculate ROC curve using the actual values and predictions
roc_obj <- roc(
  as.numeric(test_data$dod), predictions
)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
# Plot the ROC curve
plot(
  roc_obj,
  col = "blue",
  main = "ROC Curve - Logistic Regression",
  legacy.axes = TRUE,
  print.auc = TRUE,
  print.thres = TRUE,
  grid = c(0.2, 0.2),
  grid.col = c("green", "orange")
)

Method II: Cross Validation

# Perform 10-fold cross-validation
num_folds <- 10

folds <- cut(seq(1, nrow(data)), breaks = num_folds, labels = FALSE)

# Create empty vectors to store the predictions and actual values
all_predictions <- vector()
all_actuals <- vector()

for (i in 1:num_folds) {
  # Split the data into training and test sets for the current fold
  train_data <- data[folds != i, ]
  test_data <- data[folds == i, ]
  
  # Logistic Regression
  model <- glm(formula = formula, family = binomial, data = train_data)

  predictions <- predict(model, test_data, type="response")

  # Append the predictions and actual values to the vectors
  all_predictions <- c(all_predictions, predictions)
  all_actuals <- c(all_actuals, test_data$dod)
}

Performance

Confusion Matrix

confusion_matrix <- table(
  as.numeric(all_actuals), as.numeric(ifelse(all_predictions > 0.5, 1, 0))
)

TP <- confusion_matrix[1, 1]
TN <- confusion_matrix[2, 2]
FP <- confusion_matrix[2, 1]
FN <- confusion_matrix[1, 2]

## Calculate Accuracy
accuracy <- (TP + TN) / (TP + FP + TN + FN)
cat("Accuracy:", accuracy, "\n")
Accuracy: 0.860571 
## Calculate Recall
recall <- TP / (TP + FN)
cat("Recall:", recall, "\n")
Recall: 0.9410898 
## Calculate Precision
precision <- TP / (TP + FP)
cat("Precision:", precision, "\n")
Precision: 0.898313 
## Calculate Specificity
specificity <- TN / (TN + FP)
cat("Specificity:", specificity, "\n")
Specificity: 0.4289474 
## Calculate F1 Score
f1_score <- 2 * (precision * recall) / (precision + recall)
cat("F1 Score:", f1_score, "\n")
F1 Score: 0.919204 

ROC Curve

# Calculate ROC curve using the actual values and predictions
roc_obj <- roc(
  as.numeric(all_actuals), all_predictions
)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
# Plot the ROC curve
plot(
  roc_obj,
  col = "blue",
  main = "ROC Curve - Logistic Regression (Cross Validation)",
  legacy.axes = TRUE,
  print.auc = TRUE,
  print.thres = TRUE,
  grid = c(0.2, 0.2),
  grid.col = c("green", "orange")
)

With Selection

Method I: Splitting Data

splitting_ratio <- 0.7

indices <- 1:nrow(data)
shuffled_indices <- sample(indices) 
train_size <- floor(splitting_ratio * length(indices))

train_indices <- shuffled_indices[1:train_size]
test_indices <- shuffled_indices[(train_size + 1):length(indices)]

train_data <- data[train_indices, ]
test_data <- data[test_indices, ]

Building Formula

removed_variables <- c()

variables <- setdiff(names(data), "dod")

formula_str <- paste("dod ~", paste(variables, collapse = " + "))

new_formula <- as.formula(formula_str)

Backward Elimination by p-Value

p_value_threshold <- 0.05

repeat {
  new_model <- glm(
    formula = new_formula,
    family = binomial,
    data = train_data
  )
  
  summary_model <- summary(new_model)

  p_values <- summary_model$coefficients[, "Pr(>|z|)"]
  
  max_p_value <- max(p_values)
  max_p_value_attribute <- names(which.max(p_values))
  
  # Check whether the largest p-value exceeds the threshold
  if (max_p_value > p_value_threshold) {
      # If it does, drop that variable from the formula and keep looping
      print(max_p_value_attribute)
    
      removed_variables <- c(removed_variables, max_p_value_attribute)
      
      variables <- setdiff(names(train_data), c("dod", removed_variables))
      
      formula_str <- paste("dod ~", paste(variables, collapse = " + "))
      
      new_formula <- as.formula(formula_str)
  } else {
      # If no remaining variable exceeds the threshold, stop
      break
  }
}
[1] "X486"
[1] "V552"
[1] "X56210"
[1] "X2851"
[1] "X78900"
[1] "E8798"
[1] "X27651"
[1] "F329"
[1] "X2800"
[1] "X53081"
[1] "X27652"
[1] "X78060"
[1] "Z87891"
[1] "V5867"
[1] "X5680"
[1] "insurance_Medicare"
[1] "V4589"
[1] "X2767"
[1] "X2449"
[1] "language_ENGLISH"
[1] "X4589"
[1] "X5609"
[1] "race_HISPANIC.LATINO"
[1] "race_WHITE"
[1] "marital_status_SINGLE"
[1] "X28529"
[1] "X311"
[1] "X2724"
[1] "E785"
[1] "anchor_year_group_2014...2016"
[1] "race_OTHER"
[1] "X78820"
[1] "V5865"
[1] "X28860"
[1] "E8782"
[1] "E8497"
[1] "X49390"
[1] "V5869"
[1] "X2720"
[1] "X79092"
[1] "X3004"
[1] "V1279"
[1] "E8788"
[1] "E8490"
[1] "marital_status_WIDOWED"
[1] "X73390"
[1] "X56722"
[1] "X2762"
[1] "X78321"
[1] "X78791"
[1] "X41401"
[1] "X42789"
[1] "E9320"
[1] "V4572"
[1] "V442"
[1] "X42731"
[1] "X2809"
[1] "Y929"
[1] "X7840"
[1] "gender_M"
[1] "X5859"
[1] "X00845"
[1] "X71590"
[1] "X32723"
[1] "X2768"
[1] "X99859"
[1] "anchor_year_group_2011...2013"
[1] "X34690"
[1] "K5090"
[1] "K5190"
[1] "X0389"
[1] "Z9049"
[1] "X2859"
[1] "X78052"
[1] "X73300"
[1] "X78659"
[1] "X40390"
[1] "V5861"
[1] "insurance_Other"
[1] "X5589"
[1] "N179"
[1] "X27800"
[1] "X78701"
[1] "race_BLACK"
[1] "X4019"
[1] "X33829"
[1] "marital_status_MARRIED"
[1] "X5601"
[1] "V1251"
[1] "X56400"
[1] "X78702"
[1] "X2639"
[1] "X30000"
[1] "F419"
[1] "K219"
[1] "X412"

Final Model

print(new_formula)
dod ~ anchor_age + V1582 + X5849 + X5990 + X3051 + X2761 + X25000 + 
    I10 + X56089 + X4280 + V5866 + X2875 + X496 + X99592 + D649 + 
    X7850 + X51881
final_model <- glm(
  formula = new_formula,
  family = binomial,
  data = train_data
)

summary_model <- summary(final_model)
print(summary_model)

Call:
glm(formula = new_formula, family = binomial, data = train_data)

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)  -5.4646     0.3200 -17.078  < 2e-16 ***
anchor_age    4.8258     0.4486  10.758  < 2e-16 ***
V1582         0.4463     0.1912   2.334 0.019607 *  
X5849         0.5317     0.1993   2.667 0.007643 ** 
X5990         0.5071     0.2037   2.490 0.012771 *  
X3051         0.7462     0.2484   3.004 0.002663 ** 
X2761         0.5390     0.2288   2.355 0.018503 *  
X25000        0.5162     0.2163   2.387 0.016977 *  
I10          -0.7720     0.2785  -2.772 0.005574 ** 
X56089       -0.7954     0.3512  -2.265 0.023507 *  
X4280         0.8982     0.2327   3.859 0.000114 ***
V5866        -0.5667     0.2839  -1.996 0.045914 *  
X2875         0.9134     0.2722   3.356 0.000791 ***
X496          0.5839     0.2696   2.166 0.030348 *  
X99592        0.8252     0.2879   2.867 0.004150 ** 
D649          1.1706     0.2988   3.917 8.96e-05 ***
X7850         0.7493     0.3340   2.244 0.024859 *  
X51881        0.5906     0.2952   2.000 0.045461 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1461.64  on 1690  degrees of freedom
Residual deviance:  921.35  on 1673  degrees of freedom
AIC: 957.35

Number of Fisher Scoring iterations: 6

Performance

predictions <- predict(final_model, test_data, type="response")

Confusion Matrix

confusion_matrix <- table(
  as.numeric(test_data$dod), as.numeric(ifelse(predictions > 0.5, 1, 0))
)

TP <- confusion_matrix[1, 1]
TN <- confusion_matrix[2, 2]
FP <- confusion_matrix[2, 1]
FN <- confusion_matrix[1, 2]

## Calculate Accuracy
accuracy <- (TP + TN) / (TP + FP + TN + FN)
cat("Accuracy:", accuracy, "\n")
Accuracy: 0.8650138 
## Calculate Recall
recall <- TP / (TP + FN)
cat("Recall:", recall, "\n")
Recall: 0.9507389 
## Calculate Precision
precision <- TP / (TP + FP)
cat("Precision:", precision, "\n")
Precision: 0.8948995 
## Calculate Specificity
specificity <- TN / (TN + FP)
cat("Specificity:", specificity, "\n")
Specificity: 0.4188034 
## Calculate F1 Score
f1_score <- 2 * (precision * recall) / (precision + recall)
cat("F1 Score:", f1_score, "\n")
F1 Score: 0.9219745 

ROC Curve

library(pROC)
# Calculate ROC curve using the actual values and predictions
roc_obj <- roc(
  as.numeric(test_data$dod), predictions
)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
# Plot the ROC curve
plot(
  roc_obj,
  col = "blue",
  main = "ROC Curve - Logistic Regression",
  legacy.axes = TRUE,
  print.auc = TRUE,
  print.thres = TRUE,
  grid = c(0.2, 0.2),
  grid.col = c("green", "orange")
)

Confidence Intervals

ci <- confint(final_model)
Waiting for profiling to be done...
exp(cbind(OR = coef(final_model), ci))
                      OR        2.5 %       97.5 %
(Intercept) 4.234143e-03  0.002203303 7.733731e-03
anchor_age  1.246830e+02 52.795520515 3.070044e+02
V1582       1.562498e+00  1.071189551 2.268741e+00
X5849       1.701755e+00  1.147463197 2.508305e+00
X5990       1.660486e+00  1.110127913 2.468368e+00
X3051       2.108983e+00  1.287595319 3.415782e+00
X2761       1.714326e+00  1.090402570 2.677185e+00
X25000      1.675703e+00  1.092623703 2.552842e+00
I10         4.620937e-01  0.262951827 7.853254e-01
X56089      4.513950e-01  0.217309064 8.676556e-01
X4280       2.455225e+00  1.555041325 3.876640e+00
V5866       5.674154e-01  0.321608918 9.802037e-01
X2875       2.492865e+00  1.460025698 4.251551e+00
X496        1.793003e+00  1.054823472 3.040954e+00
X99592      2.282224e+00  1.296050351 4.014376e+00
D649        3.223989e+00  1.782193288 5.766061e+00
X7850       2.115452e+00  1.084120222 4.027586e+00
X51881      1.804986e+00  1.010362320 3.220061e+00

Method II: Cross Validation

# Perform 10-fold cross-validation
num_folds <- 10

folds <- cut(seq(1, nrow(data)), breaks = num_folds, labels = FALSE)
# Create empty vectors to store the predictions and actual values
all_predictions <- vector()
all_actuals <- vector()

for (i in 1:num_folds) {
  # Split the data into training and test sets for the current fold
  train_data <- data[folds != i, ]
  test_data <- data[folds == i, ]
  
  # Logistic Regression
  model <- glm(formula = new_formula, 
               family = binomial, 
               data = train_data)

  predictions <- predict(model, test_data, type="response")

  # Append the predictions and actual values to the vectors
  all_predictions <- c(all_predictions, predictions)
  all_actuals <- c(all_actuals, test_data$dod)
}

Performance

Confusion Matrix

confusion_matrix <- table(
  as.numeric(all_actuals), as.numeric(ifelse(all_predictions > 0.5, 1, 0))
)

TP <- confusion_matrix[1, 1]
TN <- confusion_matrix[2, 2]
FP <- confusion_matrix[2, 1]
FN <- confusion_matrix[1, 2]

## Calculate Accuracy
accuracy <- (TP + TN) / (TP + FP + TN + FN)
cat("Accuracy:", accuracy, "\n")
Accuracy: 0.8750517 
## Calculate Recall
recall <- TP / (TP + FN)
cat("Recall:", recall, "\n")
Recall: 0.9558174 
## Calculate Precision
precision <- TP / (TP + FP)
cat("Precision:", precision, "\n")
Precision: 0.9018064 
## Calculate Specificity
specificity <- TN / (TN + FP)
cat("Specificity:", specificity, "\n")
Specificity: 0.4421053 
## Calculate F1 Score
f1_score <- 2 * (precision * recall) / (precision + recall)
cat("F1 Score:", f1_score, "\n")
F1 Score: 0.9280267 

ROC Curve

# Calculate ROC curve using the actual values and predictions
roc_obj <- roc(
  as.numeric(all_actuals), all_predictions
)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
# Plot the ROC curve
plot(
  roc_obj,
  col = "blue",
  main = "ROC Curve - Logistic Regression (Cross Validation)",
  legacy.axes = TRUE,
  print.auc = TRUE,
  print.thres = TRUE,
  grid = c(0.2, 0.2),
  grid.col = c("green", "orange")
)

---
title: "LR"
output: 
  html_notebook: 
    toc: true
    theme: cosmo
---

# Setup

```{r}
rm(list=ls(all=TRUE))
setwd('C:/Users/sitdo/Documents/GitHub/IBD-EDA/paper1/')
```

# Loading Data

```{r}
library(dplyr)

data <- read.csv("./data_preprocessed/data.csv") %>% select(-1)
```


# Loading Packages

```{r}
library(MASS)
library(caret)
```

# Without Selection

## Building Formula

```{r}
variables <- setdiff(names(data), "dod")

variables_str <- paste(variables, collapse = " + ")
formula_str <- paste("dod ~", variables_str)

formula <- as.formula(formula_str)
```
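
As an aside, base R's `reformulate()` builds the same formula without pasting strings; a minimal equivalent sketch (the `formula_alt` name is ours):

```{r}
# Equivalent construction via reformulate() (sketch)
formula_alt <- reformulate(termlabels = variables, response = "dod")
```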

## Method I: Splitting Data

```{r}
set.seed(123)
splitting_ratio <- 0.7

indices <- 1:nrow(data)
shuffled_indices <- sample(indices) 
train_size <- floor(splitting_ratio * length(indices))

train_indices <- shuffled_indices[1:train_size]
test_indices <- shuffled_indices[(train_size + 1):length(indices)]

train_data <- data[train_indices, ]
test_data <- data[test_indices, ]
```
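
The split above samples rows uniformly. With an imbalanced outcome like `dod`, a stratified split keeps the class ratio similar in both sets; a sketch using `caret::createDataPartition()` (caret is loaded above; this split is not the one used for the reported results):

```{r}
# Stratified 70/30 split on the outcome (sketch only)
strat_idx <- createDataPartition(factor(data$dod), p = 0.7, list = FALSE)
train_strat <- data[strat_idx, ]
test_strat  <- data[-strat_idx, ]
```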



```{r}
model <- glm(formula = formula, 
             family = binomial, 
             data = train_data)

summary_model <- summary(model)
print(summary_model)
```

### Performance

```{r}
predictions <- predict(model, test_data, type="response")
```

#### Confusion Matrix

```{r}
confusion_matrix <- table(
  as.numeric(test_data$dod), as.numeric(ifelse(predictions > 0.5, 1, 0))
)

TP <- confusion_matrix[1, 1]
TN <- confusion_matrix[2, 2]
FP <- confusion_matrix[2, 1]
FN <- confusion_matrix[1, 2]

## Calculate Accuracy
accuracy <- (TP + TN) / (TP + FP + TN + FN)
cat("Accuracy:", accuracy, "\n")

## Calculate Recall
recall <- TP / (TP + FN)
cat("Recall:", recall, "\n")

## Calculate Precision
precision <- TP / (TP + FP)
cat("Precision:", precision, "\n")

## Calculate Specificity
specificity <- TN / (TN + FP)
cat("Specificity:", specificity, "\n")

## Calculate F1 Score
f1_score <- 2 * (precision * recall) / (precision + recall)
cat("F1 Score:", f1_score, "\n")
```
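
Note that `TP` here is cell [1, 1], i.e. actual `dod = 0` predicted as 0, so the recall and precision above treat the first level (dod = 0) as the positive class. `caret::confusionMatrix()` reproduces the same metrics in one call when the positive class is set accordingly; a sketch:

```{r}
# One-call equivalent (sketch); positive = "0" matches the labeling above
pred_class <- factor(ifelse(predictions > 0.5, 1, 0), levels = c(0, 1))
confusionMatrix(pred_class, factor(test_data$dod, levels = c(0, 1)),
                positive = "0")
```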

#### ROC Curve

```{r}
library(pROC)
# Calculate ROC curve using the actual values and predictions
roc_obj <- roc(
  as.numeric(test_data$dod), predictions
)

# Plot the ROC curve
plot(
  roc_obj,
  col = "blue",
  main = "ROC Curve - Logistic Regression",
  legacy.axes = TRUE,
  print.auc = TRUE,
  print.thres = TRUE,
  grid = c(0.2, 0.2),
  grid.col = c("green", "orange")
)
```
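
The AUC shown on the plot can also be pulled out numerically, with a confidence interval if wanted (a sketch using pROC's own helpers):

```{r}
# Numeric AUC and its 95% CI (sketch)
auc(roc_obj)
ci.auc(roc_obj)
```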

## Method II: Cross Validation

```{r}
# Perform 10-fold cross-validation
num_folds <- 10

folds <- cut(seq(1, nrow(data)), breaks = num_folds, labels = FALSE)

# Create empty vectors to store the predictions and actual values
all_predictions <- vector()
all_actuals <- vector()

for (i in 1:num_folds) {
  # Split the data into training and test sets for the current fold
  train_data <- data[folds != i, ]
  test_data <- data[folds == i, ]
  
  # Logistic Regression
  model <- glm(formula = formula, family = binomial, data = train_data)

  predictions <- predict(model, test_data, type="response")

  # Append the predictions and actual values to the vectors
  all_predictions <- c(all_predictions, predictions)
  all_actuals <- c(all_actuals, test_data$dod)
}

```
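
One caveat: `cut(seq(1, nrow(data)), breaks = num_folds, labels = FALSE)` assigns contiguous blocks of rows to folds, so any ordering in the data leaks into the folds. A shuffled assignment is one line (a sketch; the reported results use the contiguous folds):

```{r}
# Randomized fold assignment (sketch; not used for the results above)
set.seed(123)
folds_random <- sample(rep(1:num_folds, length.out = nrow(data)))
```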

### Performance

#### Confusion Matrix

```{r}
confusion_matrix <- table(
  as.numeric(all_actuals), as.numeric(ifelse(all_predictions > 0.5, 1, 0))
)

TP <- confusion_matrix[1, 1]
TN <- confusion_matrix[2, 2]
FP <- confusion_matrix[2, 1]
FN <- confusion_matrix[1, 2]

## Calculate Accuracy
accuracy <- (TP + TN) / (TP + FP + TN + FN)
cat("Accuracy:", accuracy, "\n")

## Calculate Recall
recall <- TP / (TP + FN)
cat("Recall:", recall, "\n")

## Calculate Precision
precision <- TP / (TP + FP)
cat("Precision:", precision, "\n")

## Calculate Specificity
specificity <- TN / (TN + FP)
cat("Specificity:", specificity, "\n")

## Calculate F1 Score
f1_score <- 2 * (precision * recall) / (precision + recall)
cat("F1 Score:", f1_score, "\n")
```

#### ROC Curve

```{r}
# Calculate ROC curve using the actual values and predictions
roc_obj <- roc(
  as.numeric(all_actuals), all_predictions
)

# Plot the ROC curve
plot(
  roc_obj,
  col = "blue",
  main = "ROC Curve - Logistic Regression (Cross Validation)",
  legacy.axes = TRUE,
  print.auc = TRUE,
  print.thres = TRUE,
  grid = c(0.2, 0.2),
  grid.col = c("green", "orange")
)
```

# With Selection

## Method I: Splitting Data

```{r}
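# Note: no set.seed() before sample() here, so this draws a different split
# than the one used in the "Without Selection" section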
splitting_ratio <- 0.7

indices <- 1:nrow(data)
shuffled_indices <- sample(indices) 
train_size <- floor(splitting_ratio * length(indices))

train_indices <- shuffled_indices[1:train_size]
test_indices <- shuffled_indices[(train_size + 1):length(indices)]

train_data <- data[train_indices, ]
test_data <- data[test_indices, ]
```

### Building Formula

```{r}
removed_variables <- c()

variables <- setdiff(names(data), "dod")

formula_str <- paste("dod ~", paste(variables, collapse = " + "))

new_formula <- as.formula(formula_str)
```

### Backward Elimination by p-Value

```{r}
p_value_threshold <- 0.05

repeat {
  new_model <- glm(
    formula = new_formula,
    family = binomial,
    data = train_data
  )
  
  summary_model <- summary(new_model)

  p_values <- summary_model$coefficients[, "Pr(>|z|)"]
  
  max_p_value <- max(p_values)
  max_p_value_attribute <- names(which.max(p_values))
  
  # Check whether the largest p-value exceeds the threshold
  if (max_p_value > p_value_threshold) {
      # If it does, drop that variable from the formula and keep looping
      print(max_p_value_attribute)
    
      removed_variables <- c(removed_variables, max_p_value_attribute)
      
      variables <- setdiff(names(train_data), c("dod", removed_variables))
      
      formula_str <- paste("dod ~", paste(variables, collapse = " + "))
      
      new_formula <- as.formula(formula_str)
  } else {
      # If no remaining variable exceeds the threshold, stop
      break
  }
}

```
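
The loop above is backward elimination on the largest Wald p-value. For comparison, an AIC-based backward search is available via `MASS::stepAIC()` (MASS is loaded above); a sketch, not used for the reported model:

```{r}
# AIC-based backward elimination (sketch)
full_model <- glm(dod ~ ., family = binomial, data = train_data)
aic_model <- stepAIC(full_model, direction = "backward", trace = FALSE)
```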

## Final Model

```{r}
print(new_formula)
```

```{r}
final_model <- glm(
  formula = new_formula,
  family = binomial,
  data = train_data
)

summary_model <- summary(final_model)
print(summary_model)
```

### Performance

```{r}
predictions <- predict(final_model, test_data, type="response")
```

#### Confusion Matrix

```{r}
confusion_matrix <- table(
  as.numeric(test_data$dod), as.numeric(ifelse(predictions > 0.5, 1, 0))
)

TP <- confusion_matrix[1, 1]
TN <- confusion_matrix[2, 2]
FP <- confusion_matrix[2, 1]
FN <- confusion_matrix[1, 2]

## Calculate Accuracy
accuracy <- (TP + TN) / (TP + FP + TN + FN)
cat("Accuracy:", accuracy, "\n")

## Calculate Recall
recall <- TP / (TP + FN)
cat("Recall:", recall, "\n")

## Calculate Precision
precision <- TP / (TP + FP)
cat("Precision:", precision, "\n")

## Calculate Specificity
specificity <- TN / (TN + FP)
cat("Specificity:", specificity, "\n")

## Calculate F1 Score
f1_score <- 2 * (precision * recall) / (precision + recall)
cat("F1 Score:", f1_score, "\n")
```

#### ROC Curve

```{r}
library(pROC)
# Calculate ROC curve using the actual values and predictions
roc_obj <- roc(
  as.numeric(test_data$dod), predictions
)

# Plot the ROC curve
plot(
  roc_obj,
  col = "blue",
  main = "ROC Curve - Logistic Regression",
  legacy.axes = TRUE,
  print.auc = TRUE,
  print.thres = TRUE,
  grid = c(0.2, 0.2),
  grid.col = c("green", "orange")
)
```

#### Confidence Intervals

```{r}
ci <- confint(final_model)
exp(cbind(OR = coef(final_model), ci))
```
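
`confint()` on a glm profiles the likelihood (hence the "Waiting for profiling to be done..." message in the output), which can be slow with many predictors. Wald-type intervals via `confint.default()` are a quick approximation; a sketch:

```{r}
# Faster Wald-type odds-ratio CIs for comparison (sketch)
exp(cbind(OR = coef(final_model), confint.default(final_model)))
```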

## Method II: Cross Validation

```{r}
# Perform 10-fold cross-validation
num_folds <- 10

folds <- cut(seq(1, nrow(data)), breaks = num_folds, labels = FALSE)
```

```{r}
# Create empty vectors to store the predictions and actual values
all_predictions <- vector()
all_actuals <- vector()

for (i in 1:num_folds) {
  # Split the data into training and test sets for the current fold
  train_data <- data[folds != i, ]
  test_data <- data[folds == i, ]
  
  # Logistic Regression
  model <- glm(formula = new_formula, 
               family = binomial, 
               data = train_data)

  predictions <- predict(model, test_data, type="response")

  # Append the predictions and actual values to the vectors
  all_predictions <- c(all_predictions, predictions)
  all_actuals <- c(all_actuals, test_data$dod)
}
```

### Performance

#### Confusion Matrix

```{r}
confusion_matrix <- table(
  as.numeric(all_actuals), as.numeric(ifelse(all_predictions > 0.5, 1, 0))
)

TP <- confusion_matrix[1, 1]
TN <- confusion_matrix[2, 2]
FP <- confusion_matrix[2, 1]
FN <- confusion_matrix[1, 2]

## Calculate Accuracy
accuracy <- (TP + TN) / (TP + FP + TN + FN)
cat("Accuracy:", accuracy, "\n")

## Calculate Recall
recall <- TP / (TP + FN)
cat("Recall:", recall, "\n")

## Calculate Precision
precision <- TP / (TP + FP)
cat("Precision:", precision, "\n")

## Calculate Specificity
specificity <- TN / (TN + FP)
cat("Specificity:", specificity, "\n")

## Calculate F1 Score
f1_score <- 2 * (precision * recall) / (precision + recall)
cat("F1 Score:", f1_score, "\n")
```

#### ROC Curve

```{r}
# Calculate ROC curve using the actual values and predictions
roc_obj <- roc(
  as.numeric(all_actuals), all_predictions
)

# Plot the ROC curve
plot(
  roc_obj,
  col = "blue",
  main = "ROC Curve - Logistic Regression (Cross Validation)",
  legacy.axes = TRUE,
  print.auc = TRUE,
  print.thres = TRUE,
  grid = c(0.2, 0.2),
  grid.col = c("green", "orange")
)
```
