# Script setup: clear the workspace and move to the project directory.
# NOTE(review): `rm(list = ls())` and a hard-coded, user-specific `setwd()`
# make this script non-portable and can silently wipe a collaborator's
# session; prefer project-relative paths (e.g. RStudio projects or the
# `here` package). Kept byte-identical here to preserve behavior.
rm(list=ls(all=TRUE))
setwd('C:/Users/sitdo/Documents/GitHub/IBD-EDA/paper1/')
library(dplyr)
载入程辑包:‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
# Load the preprocessed data set, dropping the first column (presumably a
# row-index column written by the preprocessing step — TODO confirm).
data <- select(read.csv("./data_preprocessed/data.csv"), -1)
library(xgboost)
Registered S3 method overwritten by 'data.table':
method from
print.data.table
载入程辑包:‘xgboost’
The following object is masked from ‘package:dplyr’:
slice
# ---- Train/test split ----------------------------------------------------
# Reproducible random 70/30 split of the rows.
set.seed(123)
splitting_ratio <- 0.7
# seq_len() is safe when the data has zero rows, unlike 1:nrow(data),
# which would yield c(1, 0).
indices <- seq_len(nrow(data))
shuffled_indices <- sample(indices)
train_size <- floor(splitting_ratio * length(indices))
train_indices <- shuffled_indices[seq_len(train_size)]
test_indices <- shuffled_indices[(train_size + 1):length(indices)]
train_data <- data[train_indices, ]
test_data <- data[test_indices, ]

# Format for XGBoost: column 1 is the label, the remaining columns are the
# features. (The evaluation below reads the label as `test_data$dod`, so
# column 1 is assumed to be `dod` — TODO confirm against the CSV.)
train_X <- as.matrix(train_data[, -1])
train_y <- train_data[, 1]
dtrain <- xgb.DMatrix(data = train_X, label = train_y)
test_X <- as.matrix(test_data[, -1])
test_y <- test_data[, 1]
dtest <- xgb.DMatrix(data = test_X, label = test_y)

# Train a 10-round gradient-boosted binary classifier; the logistic
# objective makes predict() return probabilities in [0, 1].
xgb_model <- xgboost(data = dtrain, nrounds = 10, objective = "binary:logistic")
[1] train-logloss:0.516862
[2] train-logloss:0.414344
[3] train-logloss:0.345329
[4] train-logloss:0.299047
[5] train-logloss:0.266506
[6] train-logloss:0.240876
[7] train-logloss:0.220764
[8] train-logloss:0.202892
[9] train-logloss:0.190781
[10] train-logloss:0.181812
# ---- Hold-out evaluation -------------------------------------------------
# Predicted probability of the positive class for the held-out rows.
predictions <- predict(xgb_model, dtest)

# Build the confusion matrix with explicit factor levels so both classes are
# always present even if one class is never predicted (otherwise table()
# silently drops the empty row/column and the [i, j] indexing breaks).
confusion_matrix <- table(
  factor(as.numeric(test_data$dod), levels = c(0, 1)),
  factor(as.numeric(ifelse(predictions > 0.5, 1, 0)), levels = c(0, 1))
)

# BUG FIX: with table(actual, predicted), row/column 1 corresponds to class
# 0, so confusion_matrix[1, 1] is the TRUE NEGATIVES — the original code
# labeled it TP and consequently swapped recall with the negative-class
# rate (its "Recall: 0.959" was really the class-0 recall; its
# "Specificity: 0.366" was the true recall). Index by class label so the
# mapping is explicit and correct.
TP <- confusion_matrix["1", "1"]
TN <- confusion_matrix["0", "0"]
FP <- confusion_matrix["0", "1"]
FN <- confusion_matrix["1", "0"]

## Calculate Accuracy (unaffected by the relabeling — it is symmetric)
accuracy <- (TP + TN) / (TP + FP + TN + FN)
cat("Accuracy:", accuracy, "\n")
#> Accuracy: 0.8677686

## Calculate Recall (sensitivity for the positive class, dod = 1)
recall <- TP / (TP + FN)
cat("Recall:", recall, "\n")
#> Recall: 0.9592834  (transcript value from the original, mislabeled run)

## Calculate Precision (positive predictive value)
precision <- TP / (TP + FP)
cat("Precision:", precision, "\n")
#> Precision: 0.8924242  (original, mislabeled run)

## Calculate Specificity (true-negative rate)
specificity <- TN / (TN + FP)
cat("Specificity:", specificity, "\n")
#> Specificity: 0.3660714  (original, mislabeled run)

## Calculate F1 Score (harmonic mean of precision and recall)
f1_score <- 2 * (precision * recall) / (precision + recall)
cat("F1 Score:", f1_score, "\n")
#> F1 Score: 0.9246468  (original, mislabeled run)
library(pROC)
Type 'citation("pROC")' for a citation.
载入程辑包:‘pROC’
The following objects are masked from ‘package:stats’:
cov, smooth, var
# Calculate the ROC curve from the held-out labels and the predicted
# probabilities. (The bare "Setting ..." lines below were pROC console
# messages pasted into the script — invalid R — now kept as comments.)
roc_obj <- roc(
  as.numeric(test_data$dod), predictions
)
#> Setting levels: control = 0, case = 1
#> Setting direction: controls < cases

# Plot the ROC curve with the AUC and the best threshold annotated.
plot(
  roc_obj,
  col = "blue",
  main = "ROC Curve - XGBoost",
  legacy.axes = TRUE,
  print.auc = TRUE,
  print.thres = TRUE,
  grid = c(0.2, 0.2),
  grid.col = c("green", "orange")
)
# ---- 10-fold cross-validation --------------------------------------------
num_folds <- 10
# NOTE(review): cut() assigns CONTIGUOUS row blocks to folds. If the rows of
# `data` are ordered (e.g. by outcome or by date) the folds will not be
# representative; consider shuffling with `folds <- sample(folds)`. Kept
# as-is to preserve the original results — TODO confirm row ordering.
folds <- cut(seq_len(nrow(data)), breaks = num_folds, labels = FALSE)

# Collect the out-of-fold predictions and labels per fold, then flatten once
# at the end (avoids re-growing a vector with c() on every iteration).
fold_predictions <- vector("list", num_folds)
fold_actuals <- vector("list", num_folds)
for (i in seq_len(num_folds)) {
  # Split the data into training and test sets for the current fold
  train_data <- data[folds != i, ]
  test_data <- data[folds == i, ]
  train_X <- as.matrix(train_data[, -1])
  train_y <- train_data[, 1]
  dtrain <- xgb.DMatrix(data = train_X, label = train_y)
  test_X <- as.matrix(test_data[, -1])
  test_y <- test_data[, 1]
  dtest <- xgb.DMatrix(data = test_X, label = test_y)
  # Train the XGBoost model on the other nine folds
  xgb_model <- xgboost(data = dtrain,
                       nrounds = 10,
                       objective = "binary:logistic")
  # Out-of-fold predictions for the held-out fold
  fold_predictions[[i]] <- predict(xgb_model, dtest)
  fold_actuals[[i]] <- test_y
}
# Same values and order as the original c()-append approach.
all_predictions <- unlist(fold_predictions)
all_actuals <- unlist(fold_actuals)
[1] train-logloss:0.514947
[2] train-logloss:0.410452
[3] train-logloss:0.341302
[4] train-logloss:0.295937
[5] train-logloss:0.263464
[6] train-logloss:0.236689
[7] train-logloss:0.216530
[8] train-logloss:0.200514
[9] train-logloss:0.189588
[10] train-logloss:0.179107
[1] train-logloss:0.516426
[2] train-logloss:0.411312
[3] train-logloss:0.344040
[4] train-logloss:0.298025
[5] train-logloss:0.267050
[6] train-logloss:0.239892
[7] train-logloss:0.220835
[8] train-logloss:0.206187
[9] train-logloss:0.196080
[10] train-logloss:0.186351
[1] train-logloss:0.517139
[2] train-logloss:0.414882
[3] train-logloss:0.347915
[4] train-logloss:0.300564
[5] train-logloss:0.269629
[6] train-logloss:0.242952
[7] train-logloss:0.225045
[8] train-logloss:0.211140
[9] train-logloss:0.199494
[10] train-logloss:0.189307
[1] train-logloss:0.517704
[2] train-logloss:0.416161
[3] train-logloss:0.348609
[4] train-logloss:0.304104
[5] train-logloss:0.272806
[6] train-logloss:0.249079
[7] train-logloss:0.230848
[8] train-logloss:0.213198
[9] train-logloss:0.201362
[10] train-logloss:0.191374
[1] train-logloss:0.514778
[2] train-logloss:0.411650
[3] train-logloss:0.344116
[4] train-logloss:0.297460
[5] train-logloss:0.263377
[6] train-logloss:0.237486
[7] train-logloss:0.218717
[8] train-logloss:0.203552
[9] train-logloss:0.192085
[10] train-logloss:0.182169
[1] train-logloss:0.512115
[2] train-logloss:0.409012
[3] train-logloss:0.342744
[4] train-logloss:0.294665
[5] train-logloss:0.261763
[6] train-logloss:0.235945
[7] train-logloss:0.217731
[8] train-logloss:0.203521
[9] train-logloss:0.190508
[10] train-logloss:0.180136
[1] train-logloss:0.514445
[2] train-logloss:0.410080
[3] train-logloss:0.345503
[4] train-logloss:0.299575
[5] train-logloss:0.266068
[6] train-logloss:0.241382
[7] train-logloss:0.221666
[8] train-logloss:0.207073
[9] train-logloss:0.196356
[10] train-logloss:0.187308
[1] train-logloss:0.518344
[2] train-logloss:0.414575
[3] train-logloss:0.345888
[4] train-logloss:0.300702
[5] train-logloss:0.268699
[6] train-logloss:0.244398
[7] train-logloss:0.226117
[8] train-logloss:0.210062
[9] train-logloss:0.199268
[10] train-logloss:0.187823
[1] train-logloss:0.519144
[2] train-logloss:0.417627
[3] train-logloss:0.348794
[4] train-logloss:0.303320
[5] train-logloss:0.269170
[6] train-logloss:0.247101
[7] train-logloss:0.227207
[8] train-logloss:0.213543
[9] train-logloss:0.200485
[10] train-logloss:0.190767
[1] train-logloss:0.517139
[2] train-logloss:0.415299
[3] train-logloss:0.349253
[4] train-logloss:0.303498
[5] train-logloss:0.272634
[6] train-logloss:0.248983
[7] train-logloss:0.229593
[8] train-logloss:0.216510
[9] train-logloss:0.204303
[10] train-logloss:0.194366
# ---- Cross-validated evaluation ------------------------------------------
# Confusion matrix over all out-of-fold predictions; explicit factor levels
# keep both classes present even if one is never predicted.
confusion_matrix <- table(
  factor(as.numeric(all_actuals), levels = c(0, 1)),
  factor(as.numeric(ifelse(all_predictions > 0.5, 1, 0)), levels = c(0, 1))
)

# BUG FIX: with table(actual, predicted), row/column 1 is class 0, so
# confusion_matrix[1, 1] is the TRUE NEGATIVES — the original code labeled
# it TP, swapping recall/precision with their negative-class counterparts.
# Index by class label so the mapping is explicit and correct.
TP <- confusion_matrix["1", "1"]
TN <- confusion_matrix["0", "0"]
FP <- confusion_matrix["0", "1"]
FN <- confusion_matrix["1", "0"]

## Calculate Accuracy (unaffected by the relabeling — it is symmetric)
accuracy <- (TP + TN) / (TP + FP + TN + FN)
cat("Accuracy:", accuracy, "\n")
#> Accuracy: 0.8696731

## Calculate Recall (sensitivity for the positive class, dod = 1)
recall <- TP / (TP + FN)
cat("Recall:", recall, "\n")
#> Recall: 0.9572901  (transcript value from the original, mislabeled run)

## Calculate Precision (positive predictive value)
precision <- TP / (TP + FP)
cat("Precision:", precision, "\n")
#> Precision: 0.8953168  (original, mislabeled run)

## Calculate Specificity (true-negative rate)
specificity <- TN / (TN + FP)
cat("Specificity:", specificity, "\n")
#> Specificity: 0.4  (original, mislabeled run)

## Calculate F1 Score (harmonic mean of precision and recall)
f1_score <- 2 * (precision * recall) / (precision + recall)
cat("F1 Score:", f1_score, "\n")
#> F1 Score: 0.9252669  (original, mislabeled run)

# Calculate the ROC curve from all out-of-fold labels and predictions.
roc_obj <- roc(
  as.numeric(all_actuals), all_predictions
)
#> Setting levels: control = 0, case = 1
#> Setting direction: controls < cases

# Plot the cross-validated ROC curve with AUC and best threshold annotated.
plot(
  roc_obj,
  col = "blue",
  main = "ROC Curve - XGBoost (Cross Validation)",
  legacy.axes = TRUE,
  print.auc = TRUE,
  print.thres = TRUE,
  grid = c(0.2, 0.2),
  grid.col = c("green", "orange")
)