rm(list=ls(all=TRUE))
setwd('C:/Users/sitdo/Documents/GitHub/IBD-EDA/paper1/')
library(dplyr)
载入程辑包:‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
data <- read.csv("./data_preprocessed/data.csv") %>% select(-1)
library(MASS)
载入程辑包:‘MASS’
The following object is masked from ‘package:dplyr’:
select
library(caret)
载入需要的程辑包:ggplot2
载入需要的程辑包:lattice
Registered S3 method overwritten by 'data.table':
method from
print.data.table
Building Formula
variables <- setdiff(names(data), "dod")
variables_str <- paste(variables, collapse = " + ")
formula_str <- paste("dod ~", variables_str)
formula <- as.formula(formula_str)
set.seed(123)
splitting_ratio <- 0.7
indices <- 1:nrow(data)
shuffled_indices <- sample(indices)
train_size <- floor(splitting_ratio * length(indices))
train_indices <- shuffled_indices[1:train_size]
test_indices <- shuffled_indices[(train_size + 1):length(indices)]
train_data <- data[train_indices, ]
test_data <- data[test_indices, ]
model <- glm(formula = formula,
family = binomial,
data = train_data)
summary_model <- summary(model)
print(summary_model)
Call:
glm(formula = formula, family = binomial, data = train_data)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -20.568365 457.277673 -0.045 0.964123
anchor_age 4.376731 0.682998 6.408 1.47e-10 ***
gender_M 0.117515 0.224070 0.524 0.599960
anchor_year_group_2011...2013 0.181411 0.233117 0.778 0.436454
anchor_year_group_2014...2016 0.191048 0.297431 0.642 0.520659
insurance_Medicare 0.278722 0.588401 0.474 0.635719
insurance_Other 0.122803 0.570508 0.215 0.829571
language_ENGLISH -0.063278 0.541505 -0.117 0.906975
marital_status_MARRIED -0.303299 0.358921 -0.845 0.398094
marital_status_SINGLE 0.033429 0.376572 0.089 0.929263
marital_status_WIDOWED 0.205544 0.416083 0.494 0.621308
race_BLACK 13.744950 457.276865 0.030 0.976021
race_HISPANIC.LATINO 14.889290 457.277429 0.033 0.974025
race_OTHER 14.441410 457.276804 0.032 0.974806
race_WHITE 14.750642 457.276681 0.032 0.974267
X4019 0.281319 0.211344 1.331 0.183157
X53081 0.125019 0.228094 0.548 0.583623
X2859 -0.220703 0.245522 -0.899 0.368699
V1582 0.591333 0.237162 2.493 0.012654 *
X311 0.494764 0.241969 2.045 0.040881 *
X2724 0.116625 0.235142 0.496 0.619910
X5849 0.149032 0.259338 0.575 0.565518
X30000 0.012156 0.261345 0.047 0.962901
X27651 0.231549 0.277795 0.834 0.404548
V4572 0.170045 0.314391 0.541 0.588596
X5990 0.557491 0.258880 2.153 0.031281 *
V5865 -0.151904 0.281525 -0.540 0.589490
X2851 -0.146591 0.283898 -0.516 0.605610
X78791 -0.136259 0.290074 -0.470 0.638543
X2809 0.167475 0.314437 0.533 0.594298
X3051 0.833239 0.290879 2.865 0.004176 **
X2449 -0.032129 0.256031 -0.125 0.900138
V5861 -0.104361 0.308857 -0.338 0.735443
X49390 -0.157094 0.349305 -0.450 0.652903
X42731 0.139982 0.280091 0.500 0.617235
X2761 0.784638 0.265078 2.960 0.003076 **
X25000 0.368884 0.292123 1.263 0.206672
X2762 0.687437 0.290331 2.368 0.017896 *
X41401 0.034478 0.309362 0.111 0.911261
V442 0.564639 0.332344 1.699 0.089326 .
Z87891 -0.132430 0.375974 -0.352 0.724665
K219 -0.553997 0.388538 -1.426 0.153912
I10 -0.304855 0.371983 -0.820 0.412478
X56089 -0.958778 0.420954 -2.278 0.022748 *
X28860 -0.043626 0.328224 -0.133 0.894260
X2720 -0.269889 0.297439 -0.907 0.364208
X2768 0.587975 0.298933 1.967 0.049193 *
K5090 0.335024 0.380618 0.880 0.378745
X5609 0.247606 0.375040 0.660 0.509118
X486 -0.306877 0.298766 -1.027 0.304351
X4280 0.545076 0.289455 1.883 0.059685 .
X73300 0.210886 0.283605 0.744 0.457124
X33829 -0.166045 0.363199 -0.457 0.647546
E8788 -0.127576 0.386857 -0.330 0.741569
E785 -0.033408 0.396566 -0.084 0.932864
X42789 -0.018041 0.311825 -0.058 0.953863
F329 0.267174 0.448486 0.596 0.551359
X78701 0.212501 0.385781 0.551 0.581747
E8497 0.048812 0.353645 0.138 0.890220
F419 -0.935118 0.495969 -1.885 0.059371 .
V1251 0.433267 0.360049 1.203 0.228839
X40390 0.795588 0.469768 1.694 0.090346 .
E8798 -0.123046 0.335416 -0.367 0.713734
X5601 -0.263096 0.372585 -0.706 0.480104
X2800 -0.585804 0.425886 -1.375 0.168978
V5866 -0.500317 0.327123 -1.529 0.126154
X27652 0.355667 0.334254 1.064 0.287301
X28529 0.298285 0.356542 0.837 0.402814
N179 0.626968 0.376630 1.665 0.095977 .
X99859 0.840477 0.455358 1.846 0.064928 .
X56400 0.671233 0.339971 1.974 0.048338 *
X5859 -0.492281 0.481967 -1.021 0.307065
X2875 0.710494 0.320059 2.220 0.026427 *
X2767 -0.375984 0.346216 -1.086 0.277488
X78321 -0.614696 0.412255 -1.491 0.135946
X27800 -0.565395 0.394524 -1.433 0.151828
X4589 0.106752 0.340144 0.314 0.753639
X00845 -0.383894 0.380173 -1.010 0.312596
E8782 -0.189694 0.453056 -0.419 0.675436
X5589 -0.478533 0.406010 -1.179 0.238548
Z9049 -0.834310 0.487129 -1.713 0.086766 .
X34690 0.320296 0.411873 0.778 0.436771
V4589 -0.253097 0.356998 -0.709 0.478349
X0389 -0.177902 0.419587 -0.424 0.671571
X32723 -0.577602 0.399418 -1.446 0.148147
V1279 -0.617592 0.458913 -1.346 0.178376
X2639 0.718825 0.392310 1.832 0.066909 .
K5190 0.415341 0.445305 0.933 0.350969
X3004 0.058633 0.405844 0.144 0.885128
X78060 0.135103 0.394688 0.342 0.732124
X496 0.357420 0.312102 1.145 0.252124
X78052 -0.106995 0.370721 -0.289 0.772876
X99592 0.999731 0.416997 2.397 0.016510 *
X56210 -0.007779 0.340509 -0.023 0.981774
V552 -0.023422 0.505637 -0.046 0.963053
D649 1.407014 0.396461 3.549 0.000387 ***
X56722 -1.076152 0.573644 -1.876 0.060656 .
V5867 0.561122 0.389217 1.442 0.149396
X7850 0.567011 0.410517 1.381 0.167213
E9320 -0.311771 0.452628 -0.689 0.490948
X5680 -0.237204 0.437828 -0.542 0.587974
Y929 0.513069 0.427964 1.199 0.230582
X78820 -0.076372 0.342501 -0.223 0.823549
X73390 -0.036771 0.417633 -0.088 0.929840
X78702 -0.237030 0.397751 -0.596 0.551225
X51881 0.980037 0.355040 2.760 0.005774 **
X7840 0.164613 0.439942 0.374 0.708277
X78900 0.130564 0.490209 0.266 0.789975
X79092 0.235925 0.385304 0.612 0.540333
X71590 -0.126250 0.349736 -0.361 0.718109
X78659 -0.942214 0.423588 -2.224 0.026124 *
X412 0.777203 0.401817 1.934 0.053086 .
V5869 0.460522 0.417668 1.103 0.270200
E8490 0.301587 0.353409 0.853 0.393457
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1478.44 on 1690 degrees of freedom
Residual deviance: 831.18 on 1577 degrees of freedom
AIC: 1059.2
Number of Fisher Scoring iterations: 15
predictions <- predict(model, test_data, type="response")
confusion_matrix <- table(
as.numeric(test_data$dod), as.numeric(ifelse(predictions > 0.5, 1, 0))
)
TP <- confusion_matrix[1, 1]
TN <- confusion_matrix[2, 2]
FP <- confusion_matrix[2, 1]
FN <- confusion_matrix[1, 2]
## Calculate Accuracy
accuracy <- (TP + TN) / (TP + FP + TN + FN)
cat("Accuracy:", accuracy, "\n")
Accuracy: 0.8636364
## Calculate Recall
recall <- TP / (TP + FN)
cat("Recall:", recall, "\n")
Recall: 0.9381107
## Calculate Precision
precision <- TP / (TP + FP)
cat("Precision:", precision, "\n")
Precision: 0.9042386
## Calculate Specificity
specificity <- TN / (TN + FP)
cat("Specificity:", specificity, "\n")
Specificity: 0.4553571
## Calculate F1 Score
f1_score <- 2 * (precision * recall) / (precision + recall)
cat("F1 Score:", f1_score, "\n")
F1 Score: 0.9208633
library(pROC)
Type 'citation("pROC")' for a citation.
载入程辑包:‘pROC’
The following objects are masked from ‘package:stats’:
cov, smooth, var
# Calculate ROC curve using the actual values and predictions
roc_obj <- roc(
as.numeric(test_data$dod), predictions
)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
# Plot the ROC curve
plot(
roc_obj,
col = "blue",
main = "ROC Curve - Logistic Regression",
legacy.axes = TRUE,
print.auc = TRUE,
print.thres = TRUE,
grid = c(0.2, 0.2),
grid.col = c("green", "orange")
)
# Perform 10-fold cross-validation
num_folds <- 10
folds <- cut(seq(1, nrow(data)), breaks = num_folds, labels = FALSE)
# Create empty vectors to store the predictions and actual values
all_predictions <- vector()
all_actuals <- vector()
for (i in 1:num_folds) {
# Split the data into training and test sets for the current fold
train_data <- data[folds != i, ]
test_data <- data[folds == i, ]
# Logistic Regression
model <- glm(formula = formula, family = binomial, data = train_data)
predictions <- predict(model, test_data, type="response")
# Append the predictions and actual values to the vectors
all_predictions <- c(all_predictions, predictions)
all_actuals <- c(all_actuals, test_data$dod)
}
confusion_matrix <- table(
as.numeric(all_actuals), as.numeric(ifelse(all_predictions > 0.5, 1, 0))
)
TP <- confusion_matrix[1, 1]
TN <- confusion_matrix[2, 2]
FP <- confusion_matrix[2, 1]
FN <- confusion_matrix[1, 2]
## Calculate Accuracy
accuracy <- (TP + TN) / (TP + FP + TN + FN)
cat("Accuracy:", accuracy, "\n")
Accuracy: 0.860571
## Calculate Recall
recall <- TP / (TP + FN)
cat("Recall:", recall, "\n")
Recall: 0.9410898
## Calculate Precision
precision <- TP / (TP + FP)
cat("Precision:", precision, "\n")
Precision: 0.898313
## Calculate Specificity
specificity <- TN / (TN + FP)
cat("Specificity:", specificity, "\n")
Specificity: 0.4289474
## Calculate F1 Score
f1_score <- 2 * (precision * recall) / (precision + recall)
cat("F1 Score:", f1_score, "\n")
F1 Score: 0.919204
# Calculate ROC curve using the actual values and predictions
roc_obj <- roc(
as.numeric(all_actuals), all_predictions
)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
# Plot the ROC curve
plot(
roc_obj,
col = "blue",
main = "ROC Curve - Logistic Regression (Cross Validation)",
legacy.axes = TRUE,
print.auc = TRUE,
print.thres = TRUE,
grid = c(0.2, 0.2),
grid.col = c("green", "orange")
)
splitting_ratio <- 0.7
indices <- 1:nrow(data)
shuffled_indices <- sample(indices)
train_size <- floor(splitting_ratio * length(indices))
train_indices <- shuffled_indices[1:train_size]
test_indices <- shuffled_indices[(train_size + 1):length(indices)]
train_data <- data[train_indices, ]
test_data <- data[test_indices, ]
Building Formula
removed_variables <- c()
variables <- setdiff(names(data), "dod")
formula_str <- paste("dod ~", paste(variables, collapse = " + "))
new_formula <- as.formula(formula_str)
AIC Step
p_value_threshold <- 0.05
repeat {
new_model <- glm(
formula = new_formula,
family = binomial,
data = train_data
)
summary_model <- summary(new_model)
p_values <- summary_model$coefficients[, "Pr(>|z|)"]
max_p_value <- max(p_values)
max_p_value_attribute <- names(which.max(p_values))
# 检查最大P值是否超过阈值
if (max_p_value > p_value_threshold) {
# 如果超过阈值,从公式中移除该变量并继续循环
print(max_p_value_attribute)
removed_variables <- c(removed_variables, max_p_value_attribute)
variables <- setdiff(names(train_data), c("dod", removed_variables))
formula_str <- paste("dod ~", paste(variables, collapse = " + "))
new_formula <- as.formula(formula_str)
} else {
# 如果所有变量的P值都不超过阈值,则停止循环
break
}
}
[1] "X486"
[1] "V552"
[1] "X56210"
[1] "X2851"
[1] "X78900"
[1] "E8798"
[1] "X27651"
[1] "F329"
[1] "X2800"
[1] "X53081"
[1] "X27652"
[1] "X78060"
[1] "Z87891"
[1] "V5867"
[1] "X5680"
[1] "insurance_Medicare"
[1] "V4589"
[1] "X2767"
[1] "X2449"
[1] "language_ENGLISH"
[1] "X4589"
[1] "X5609"
[1] "race_HISPANIC.LATINO"
[1] "race_WHITE"
[1] "marital_status_SINGLE"
[1] "X28529"
[1] "X311"
[1] "X2724"
[1] "E785"
[1] "anchor_year_group_2014...2016"
[1] "race_OTHER"
[1] "X78820"
[1] "V5865"
[1] "X28860"
[1] "E8782"
[1] "E8497"
[1] "X49390"
[1] "V5869"
[1] "X2720"
[1] "X79092"
[1] "X3004"
[1] "V1279"
[1] "E8788"
[1] "E8490"
[1] "marital_status_WIDOWED"
[1] "X73390"
[1] "X56722"
[1] "X2762"
[1] "X78321"
[1] "X78791"
[1] "X41401"
[1] "X42789"
[1] "E9320"
[1] "V4572"
[1] "V442"
[1] "X42731"
[1] "X2809"
[1] "Y929"
[1] "X7840"
[1] "gender_M"
[1] "X5859"
[1] "X00845"
[1] "X71590"
[1] "X32723"
[1] "X2768"
[1] "X99859"
[1] "anchor_year_group_2011...2013"
[1] "X34690"
[1] "K5090"
[1] "K5190"
[1] "X0389"
[1] "Z9049"
[1] "X2859"
[1] "X78052"
[1] "X73300"
[1] "X78659"
[1] "X40390"
[1] "V5861"
[1] "insurance_Other"
[1] "X5589"
[1] "N179"
[1] "X27800"
[1] "X78701"
[1] "race_BLACK"
[1] "X4019"
[1] "X33829"
[1] "marital_status_MARRIED"
[1] "X5601"
[1] "V1251"
[1] "X56400"
[1] "X78702"
[1] "X2639"
[1] "X30000"
[1] "F419"
[1] "K219"
[1] "X412"
print(new_formula)
dod ~ anchor_age + V1582 + X5849 + X5990 + X3051 + X2761 + X25000 +
I10 + X56089 + X4280 + V5866 + X2875 + X496 + X99592 + D649 +
X7850 + X51881
final_model <- glm(
formula = new_formula,
family = binomial,
data = train_data
)
summary_model <- summary(final_model)
print(summary_model)
Call:
glm(formula = new_formula, family = binomial, data = train_data)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -5.4646 0.3200 -17.078 < 2e-16 ***
anchor_age 4.8258 0.4486 10.758 < 2e-16 ***
V1582 0.4463 0.1912 2.334 0.019607 *
X5849 0.5317 0.1993 2.667 0.007643 **
X5990 0.5071 0.2037 2.490 0.012771 *
X3051 0.7462 0.2484 3.004 0.002663 **
X2761 0.5390 0.2288 2.355 0.018503 *
X25000 0.5162 0.2163 2.387 0.016977 *
I10 -0.7720 0.2785 -2.772 0.005574 **
X56089 -0.7954 0.3512 -2.265 0.023507 *
X4280 0.8982 0.2327 3.859 0.000114 ***
V5866 -0.5667 0.2839 -1.996 0.045914 *
X2875 0.9134 0.2722 3.356 0.000791 ***
X496 0.5839 0.2696 2.166 0.030348 *
X99592 0.8252 0.2879 2.867 0.004150 **
D649 1.1706 0.2988 3.917 8.96e-05 ***
X7850 0.7493 0.3340 2.244 0.024859 *
X51881 0.5906 0.2952 2.000 0.045461 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1461.64 on 1690 degrees of freedom
Residual deviance: 921.35 on 1673 degrees of freedom
AIC: 957.35
Number of Fisher Scoring iterations: 6
predictions <- predict(final_model, test_data, type="response")
confusion_matrix <- table(
as.numeric(test_data$dod), as.numeric(ifelse(predictions > 0.5, 1, 0))
)
TP <- confusion_matrix[1, 1]
TN <- confusion_matrix[2, 2]
FP <- confusion_matrix[2, 1]
FN <- confusion_matrix[1, 2]
## Calculate Accuracy
accuracy <- (TP + TN) / (TP + FP + TN + FN)
cat("Accuracy:", accuracy, "\n")
Accuracy: 0.8650138
## Calculate Recall
recall <- TP / (TP + FN)
cat("Recall:", recall, "\n")
Recall: 0.9507389
## Calculate Precision
precision <- TP / (TP + FP)
cat("Precision:", precision, "\n")
Precision: 0.8948995
## Calculate Specificity
specificity <- TN / (TN + FP)
cat("Specificity:", specificity, "\n")
Specificity: 0.4188034
## Calculate F1 Score
f1_score <- 2 * (precision * recall) / (precision + recall)
cat("F1 Score:", f1_score, "\n")
F1 Score: 0.9219745
library(pROC)
# Calculate ROC curve using the actual values and predictions
roc_obj <- roc(
as.numeric(test_data$dod), predictions
)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
# Plot the ROC curve
plot(
roc_obj,
col = "blue",
main = "ROC Curve - Logistic Regression",
legacy.axes = TRUE,
print.auc = TRUE,
print.thres = TRUE,
grid = c(0.2, 0.2),
grid.col = c("green", "orange")
)
ci <- confint(final_model)
Waiting for profiling to be done...
exp(cbind(OR <- coef(final_model), ci))
2.5 % 97.5 %
(Intercept) 4.234143e-03 0.002203303 7.733731e-03
anchor_age 1.246830e+02 52.795520515 3.070044e+02
V1582 1.562498e+00 1.071189551 2.268741e+00
X5849 1.701755e+00 1.147463197 2.508305e+00
X5990 1.660486e+00 1.110127913 2.468368e+00
X3051 2.108983e+00 1.287595319 3.415782e+00
X2761 1.714326e+00 1.090402570 2.677185e+00
X25000 1.675703e+00 1.092623703 2.552842e+00
I10 4.620937e-01 0.262951827 7.853254e-01
X56089 4.513950e-01 0.217309064 8.676556e-01
X4280 2.455225e+00 1.555041325 3.876640e+00
V5866 5.674154e-01 0.321608918 9.802037e-01
X2875 2.492865e+00 1.460025698 4.251551e+00
X496 1.793003e+00 1.054823472 3.040954e+00
X99592 2.282224e+00 1.296050351 4.014376e+00
D649 3.223989e+00 1.782193288 5.766061e+00
X7850 2.115452e+00 1.084120222 4.027586e+00
X51881 1.804986e+00 1.010362320 3.220061e+00
# Perform 10-fold cross-validation
num_folds <- 10
folds <- cut(seq(1, nrow(data)), breaks = num_folds, labels = FALSE)
# Create empty vectors to store the predictions and actual values
all_predictions <- vector()
all_actuals <- vector()
for (i in 1:num_folds) {
# Split the data into training and test sets for the current fold
train_data <- data[folds != i, ]
test_data <- data[folds == i, ]
# Logistic Regression
model <- glm(formula = new_formula,
family = binomial,
data = train_data)
predictions <- predict(model, test_data, type="response")
# Append the predictions and actual values to the vectors
all_predictions <- c(all_predictions, predictions)
all_actuals <- c(all_actuals, test_data$dod)
}
confusion_matrix <- table(
as.numeric(all_actuals), as.numeric(ifelse(all_predictions > 0.5, 1, 0))
)
TP <- confusion_matrix[1, 1]
TN <- confusion_matrix[2, 2]
FP <- confusion_matrix[2, 1]
FN <- confusion_matrix[1, 2]
## Calculate Accuracy
accuracy <- (TP + TN) / (TP + FP + TN + FN)
cat("Accuracy:", accuracy, "\n")
Accuracy: 0.8750517
## Calculate Recall
recall <- TP / (TP + FN)
cat("Recall:", recall, "\n")
Recall: 0.9558174
## Calculate Precision
precision <- TP / (TP + FP)
cat("Precision:", precision, "\n")
Precision: 0.9018064
## Calculate Specificity
specificity <- TN / (TN + FP)
cat("Specificity:", specificity, "\n")
Specificity: 0.4421053
## Calculate F1 Score
f1_score <- 2 * (precision * recall) / (precision + recall)
cat("F1 Score:", f1_score, "\n")
F1 Score: 0.9280267
# Calculate ROC curve using the actual values and predictions
roc_obj <- roc(
as.numeric(all_actuals), all_predictions
)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
# Plot the ROC curve
plot(
roc_obj,
col = "blue",
main = "ROC Curve - Logistic Regression (Cross Validation)",
legacy.axes = TRUE,
print.auc = TRUE,
print.thres = TRUE,
grid = c(0.2, 0.2),
grid.col = c("green", "orange")
)