Setting

rm(list=ls(all=TRUE))
setwd('C:/Users/sitdo/Documents/GitHub/IBD-EDA/paper1/')

Loading Data

library(dplyr)

载入程辑包:‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union
data <- read.csv("./data_preprocessed/data.csv") %>% select(-1)

Installing Packages

library(xgboost)
Registered S3 method overwritten by 'data.table':
  method           from
  print.data.table     

载入程辑包:‘xgboost’

The following object is masked from ‘package:dplyr’:

    slice

Method I: Splitting Data

set.seed(123)
splitting_ratio <- 0.7

indices <- 1:nrow(data)
shuffled_indices <- sample(indices) 
train_size <- floor(splitting_ratio * length(indices))

train_indices <- shuffled_indices[1:train_size]
test_indices <- shuffled_indices[(train_size + 1):length(indices)]

train_data <- data[train_indices, ]
test_data <- data[test_indices, ]

Format for XGBoost

train_X <- as.matrix(train_data[, -1])
train_y <- train_data[, 1]
dtrain <- xgb.DMatrix(data = train_X, label = train_y)

test_X <- as.matrix(test_data[, -1])
test_y <- test_data[, 1]
dtest <- xgb.DMatrix(data = test_X, label = test_y)

Building Model

xgb_model <- xgboost(data = dtrain, nrounds = 10, objective = "binary:logistic")
[1] train-logloss:0.516862 
[2] train-logloss:0.414344 
[3] train-logloss:0.345329 
[4] train-logloss:0.299047 
[5] train-logloss:0.266506 
[6] train-logloss:0.240876 
[7] train-logloss:0.220764 
[8] train-logloss:0.202892 
[9] train-logloss:0.190781 
[10]    train-logloss:0.181812 
predictions <- predict(xgb_model, dtest)

Performance

Confusion Matrix

confusion_matrix <- table(
  as.numeric(test_data$dod), as.numeric(ifelse(predictions > 0.5, 1, 0))
)

TP <- confusion_matrix[1, 1]
TN <- confusion_matrix[2, 2]
FP <- confusion_matrix[2, 1]
FN <- confusion_matrix[1, 2]

## Calculate Accuracy
accuracy <- (TP + TN) / (TP + FP + TN + FN)
cat("Accuracy:", accuracy, "\n")
Accuracy: 0.8677686 
## Calculate Recall
recall <- TP / (TP + FN)
cat("Recall:", recall, "\n")
Recall: 0.9592834 
## Calculate Precision
precision <- TP / (TP + FP)
cat("Precision:", precision, "\n")
Precision: 0.8924242 
## Calculate Specificity
specificity <- TN / (TN + FP)
cat("Specificity:", specificity, "\n")
Specificity: 0.3660714 
## Calculate F1 Score
f1_score <- 2 * (precision * recall) / (precision + recall)
cat("F1 Score:", f1_score, "\n")
F1 Score: 0.9246468 

ROC Curve

library(pROC)
Type 'citation("pROC")' for a citation.

载入程辑包:‘pROC’

The following objects are masked from ‘package:stats’:

    cov, smooth, var
# Calculate ROC curve using the actual values and predictions
roc_obj <- roc(
  as.numeric(test_data$dod), predictions
)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
# Plot the ROC curve
plot(
  roc_obj,
  col = "blue",
  main = "ROC Curve - XGBoost",
  legacy.axes = TRUE,
  print.auc = TRUE,
  print.thres = TRUE,
  grid = c(0.2, 0.2),
  grid.col = c("green", "orange")
)

Method II: Cross Validation

# Perform 10-fold cross-validation
num_folds <- 10
folds <- cut(seq(1, nrow(data)), breaks = num_folds, labels = FALSE)

# Create empty vectors to store the predictions and actual values
all_predictions <- vector()
all_actuals <- vector()

for (i in 1:num_folds) {
  # Split the data into training and test sets for the current fold
  train_data <- data[folds != i, ]
  test_data <- data[folds == i, ]
  
  train_X <- as.matrix(train_data[, -1])
  train_y <- train_data[, 1]
  dtrain <- xgb.DMatrix(data = train_X, label = train_y)
  
  test_X <- as.matrix(test_data[, -1])
  test_y <- test_data[, 1]
  dtest <- xgb.DMatrix(data = test_X, label = test_y)

  # Train the XGBoost model
  xgb_model <- xgboost(data = dtrain, 
                       nrounds = 10, 
                       objective = "binary:logistic")
  
  # Make predictions on the test set
  predictions <- predict(xgb_model, dtest)
  
  # Append the predictions and actual values to the vectors
  all_predictions <- c(all_predictions, predictions)
  all_actuals <- c(all_actuals, test_y)
}
[1] train-logloss:0.514947 
[2] train-logloss:0.410452 
[3] train-logloss:0.341302 
[4] train-logloss:0.295937 
[5] train-logloss:0.263464 
[6] train-logloss:0.236689 
[7] train-logloss:0.216530 
[8] train-logloss:0.200514 
[9] train-logloss:0.189588 
[10]    train-logloss:0.179107 
[1] train-logloss:0.516426 
[2] train-logloss:0.411312 
[3] train-logloss:0.344040 
[4] train-logloss:0.298025 
[5] train-logloss:0.267050 
[6] train-logloss:0.239892 
[7] train-logloss:0.220835 
[8] train-logloss:0.206187 
[9] train-logloss:0.196080 
[10]    train-logloss:0.186351 
[1] train-logloss:0.517139 
[2] train-logloss:0.414882 
[3] train-logloss:0.347915 
[4] train-logloss:0.300564 
[5] train-logloss:0.269629 
[6] train-logloss:0.242952 
[7] train-logloss:0.225045 
[8] train-logloss:0.211140 
[9] train-logloss:0.199494 
[10]    train-logloss:0.189307 
[1] train-logloss:0.517704 
[2] train-logloss:0.416161 
[3] train-logloss:0.348609 
[4] train-logloss:0.304104 
[5] train-logloss:0.272806 
[6] train-logloss:0.249079 
[7] train-logloss:0.230848 
[8] train-logloss:0.213198 
[9] train-logloss:0.201362 
[10]    train-logloss:0.191374 
[1] train-logloss:0.514778 
[2] train-logloss:0.411650 
[3] train-logloss:0.344116 
[4] train-logloss:0.297460 
[5] train-logloss:0.263377 
[6] train-logloss:0.237486 
[7] train-logloss:0.218717 
[8] train-logloss:0.203552 
[9] train-logloss:0.192085 
[10]    train-logloss:0.182169 
[1] train-logloss:0.512115 
[2] train-logloss:0.409012 
[3] train-logloss:0.342744 
[4] train-logloss:0.294665 
[5] train-logloss:0.261763 
[6] train-logloss:0.235945 
[7] train-logloss:0.217731 
[8] train-logloss:0.203521 
[9] train-logloss:0.190508 
[10]    train-logloss:0.180136 
[1] train-logloss:0.514445 
[2] train-logloss:0.410080 
[3] train-logloss:0.345503 
[4] train-logloss:0.299575 
[5] train-logloss:0.266068 
[6] train-logloss:0.241382 
[7] train-logloss:0.221666 
[8] train-logloss:0.207073 
[9] train-logloss:0.196356 
[10]    train-logloss:0.187308 
[1] train-logloss:0.518344 
[2] train-logloss:0.414575 
[3] train-logloss:0.345888 
[4] train-logloss:0.300702 
[5] train-logloss:0.268699 
[6] train-logloss:0.244398 
[7] train-logloss:0.226117 
[8] train-logloss:0.210062 
[9] train-logloss:0.199268 
[10]    train-logloss:0.187823 
[1] train-logloss:0.519144 
[2] train-logloss:0.417627 
[3] train-logloss:0.348794 
[4] train-logloss:0.303320 
[5] train-logloss:0.269170 
[6] train-logloss:0.247101 
[7] train-logloss:0.227207 
[8] train-logloss:0.213543 
[9] train-logloss:0.200485 
[10]    train-logloss:0.190767 
[1] train-logloss:0.517139 
[2] train-logloss:0.415299 
[3] train-logloss:0.349253 
[4] train-logloss:0.303498 
[5] train-logloss:0.272634 
[6] train-logloss:0.248983 
[7] train-logloss:0.229593 
[8] train-logloss:0.216510 
[9] train-logloss:0.204303 
[10]    train-logloss:0.194366 

Performance

Confusion Matrix

confusion_matrix <- table(
  as.numeric(all_actuals), 
  as.numeric(ifelse(all_predictions > 0.5, 1, 0))
)

TP <- confusion_matrix[1, 1]
TN <- confusion_matrix[2, 2]
FP <- confusion_matrix[2, 1]
FN <- confusion_matrix[1, 2]

## Calculate Accuracy
accuracy <- (TP + TN) / (TP + FP + TN + FN)
cat("Accuracy:", accuracy, "\n")
Accuracy: 0.8696731 
## Calculate Recall
recall <- TP / (TP + FN)
cat("Recall:", recall, "\n")
Recall: 0.9572901 
## Calculate Precision
precision <- TP / (TP + FP)
cat("Precision:", precision, "\n")
Precision: 0.8953168 
## Calculate Specificity
specificity <- TN / (TN + FP)
cat("Specificity:", specificity, "\n")
Specificity: 0.4 
## Calculate F1 Score
f1_score <- 2 * (precision * recall) / (precision + recall)
cat("F1 Score:", f1_score, "\n")
F1 Score: 0.9252669 

ROC Curve

# Calculate ROC curve using the actual values and predictions
roc_obj <- roc(
  as.numeric(all_actuals), all_predictions
)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
# Plot the ROC curve
plot(
  roc_obj,
  col = "blue",
  main = "ROC Curve - XGBoost (Cross Validation)",
  legacy.axes = TRUE,
  print.auc = TRUE,
  print.thres = TRUE,
  grid = c(0.2, 0.2),
  grid.col = c("green", "orange")
)

LS0tDQp0aXRsZTogIlhHQm9vc3QiDQpvdXRwdXQ6IA0KICBodG1sX25vdGVib29rOiANCiAgICB0b2M6IHRydWUNCiAgICB0aGVtZTogY29zbW8NCi0tLQ0KDQojIFNldHRpbmcNCg0KYGBge3J9DQpybShsaXN0PWxzKGFsbD1UUlVFKSkNCnNldHdkKCdDOi9Vc2Vycy9zaXRkby9Eb2N1bWVudHMvR2l0SHViL0lCRC1FREEvcGFwZXIxLycpDQpgYGANCg0KIyBMb2FkaW5nIERhdGENCg0KYGBge3J9DQpsaWJyYXJ5KGRwbHlyKQ0KDQpkYXRhIDwtIHJlYWQuY3N2KCIuL2RhdGFfcHJlcHJvY2Vzc2VkL2RhdGEuY3N2IikgJT4lIHNlbGVjdCgtMSkNCmBgYA0KDQojIEluc3RhbGxpbmcgUGFja2FnZXMNCg0KYGBge3J9DQpsaWJyYXJ5KHhnYm9vc3QpDQpgYGANCg0KIyBNZXRob2QgSTogU3BsaXR0aW5nIERhdGENCg0KYGBge3J9DQpzZXQuc2VlZCgxMjMpDQpzcGxpdHRpbmdfcmF0aW8gPC0gMC43DQoNCmluZGljZXMgPC0gMTpucm93KGRhdGEpDQpzaHVmZmxlZF9pbmRpY2VzIDwtIHNhbXBsZShpbmRpY2VzKSANCnRyYWluX3NpemUgPC0gZmxvb3Ioc3BsaXR0aW5nX3JhdGlvICogbGVuZ3RoKGluZGljZXMpKQ0KDQp0cmFpbl9pbmRpY2VzIDwtIHNodWZmbGVkX2luZGljZXNbMTp0cmFpbl9zaXplXQ0KdGVzdF9pbmRpY2VzIDwtIHNodWZmbGVkX2luZGljZXNbKHRyYWluX3NpemUgKyAxKTpsZW5ndGgoaW5kaWNlcyldDQoNCnRyYWluX2RhdGEgPC0gZGF0YVt0cmFpbl9pbmRpY2VzLCBdDQp0ZXN0X2RhdGEgPC0gZGF0YVt0ZXN0X2luZGljZXMsIF0NCmBgYA0KDQpGb3JtYXQgZm9yIFhHQm9vc3QNCg0KYGBge3J9DQp0cmFpbl9YIDwtIGFzLm1hdHJpeCh0cmFpbl9kYXRhWywgLTFdKQ0KdHJhaW5feSA8LSB0cmFpbl9kYXRhWywgMV0NCmR0cmFpbiA8LSB4Z2IuRE1hdHJpeChkYXRhID0gdHJhaW5fWCwgbGFiZWwgPSB0cmFpbl95KQ0KDQp0ZXN0X1ggPC0gYXMubWF0cml4KHRlc3RfZGF0YVssIC0xXSkNCnRlc3RfeSA8LSB0ZXN0X2RhdGFbLCAxXQ0KZHRlc3QgPC0geGdiLkRNYXRyaXgoZGF0YSA9IHRlc3RfWCwgbGFiZWwgPSB0ZXN0X3kpDQpgYGANCg0KIyMgQnVpbGRpbmcgTW9kZWwNCg0KYGBge3J9DQp4Z2JfbW9kZWwgPC0geGdib29zdChkYXRhID0gZHRyYWluLCBucm91bmRzID0gMTAsIG9iamVjdGl2ZSA9ICJiaW5hcnk6bG9naXN0aWMiKQ0KDQpwcmVkaWN0aW9ucyA8LSBwcmVkaWN0KHhnYl9tb2RlbCwgZHRlc3QpDQpgYGANCg0KIyMgUGVyZm9ybWFuY2UNCg0KIyMjIENvbmZ1c2lvbiBNYXRyaXgNCg0KYGBge3J9DQpjb25mdXNpb25fbWF0cml4IDwtIHRhYmxlKA0KICBhcy5udW1lcmljKHRlc3RfZGF0YSRkb2QpLCBhcy5udW1lcmljKGlmZWxzZShwcmVkaWN0aW9ucyA+IDAuNSwgMSwgMCkpDQopDQoNClRQIDwtIGNvbmZ1c2lvbl9tYXRyaXhbMSwgMV0NClROIDwtIGNvbmZ1c2lvbl9tYXRyaXhbMiwgMl0NCkZQIDwtIGNvbmZ1c2lvbl9tYXRyaXhbMiwgMV0NCkZOIDwtIGNvbmZ1c2lvbl9tYXRyaXhbMSwgMl0NCg0KIyMgQ2FsY3VsYXRlIEFjY3VyYWN5DQphY2N1cmFjeSA8LSAoVFAgKyBUTikgLyAoVFAgKyBGUCArIFROICsgRk4pDQpjYXQoIkFjY3VyYWN5OiIsIGFjY3VyYWN5LCAiXG4iKQ0KDQojIyBDYWxjdWxhdGUgUmVjYWxsDQpyZWNhbGwgPC0gVFAgLyAoVFAgKyBGTikNCmNhdCgiUmVjYWxsOiIsIHJlY2FsbCwgIlxuIikNCg0KIyMgQ2FsY3VsYXRlIFByZWNpc2lvbg0KcHJlY2lzaW9uIDwtIFRQIC8gKFRQICsgRlApDQpjYXQoIlByZWNpc2lvbjoiLCBwcmVjaXNpb24sICJcbiIpDQoNCiMjIENhbGN1bGF0ZSBTcGVjaWZpY2l0eQ0Kc3BlY2lmaWNpdHkgPC0gVE4gLyAoVE4gKyBGUCkNCmNhdCgiU3BlY2lmaWNpdHk6Iiwgc3BlY2lmaWNpdHksICJcbiIpDQoNCiMjIENhbGN1bGF0ZSBGMSBTY29yZQ0KZjFfc2NvcmUgPC0gMiAqIChwcmVjaXNpb24gKiByZWNhbGwpIC8gKHByZWNpc2lvbiArIHJlY2FsbCkNCmNhdCgiRjEgU2NvcmU6IiwgZjFfc2NvcmUsICJcbiIpDQpgYGANCg0KIyMjIFJPQyBDdXJ2ZQ0KDQpgYGB7cn0NCmxpYnJhcnkocFJPQykNCiMgQ2FsY3VsYXRlIFJPQyBjdXJ2ZSB1c2luZyB0aGUgYWN0dWFsIHZhbHVlcyBhbmQgcHJlZGljdGlvbnMNCnJvY19vYmogPC0gcm9jKA0KICBhcy5udW1lcmljKHRlc3RfZGF0YSRkb2QpLCBwcmVkaWN0aW9ucw0KKQ0KDQojIFBsb3QgdGhlIFJPQyBjdXJ2ZQ0KcGxvdCgNCiAgcm9jX29iaiwNCiAgY29sID0gImJsdWUiLA0KICBtYWluID0gIlJPQyBDdXJ2ZSAtIFhHQm9vc3QiLA0KICBsZWdhY3kuYXhlcyA9IFRSVUUsDQogIHByaW50LmF1YyA9IFRSVUUsDQogIHByaW50LnRocmVzID0gVFJVRSwNCiAgZ3JpZCA9IGMoMC4yLCAwLjIpLA0KICBncmlkLmNvbCA9IGMoImdyZWVuIiwgIm9yYW5nZSIpDQopDQpgYGANCg0KIyBNZXRob2QgSUk6IENyb3NzIFZhbGlkYXRpb24NCg0KYGBge3J9DQojIFBlcmZvcm0gMTAtZm9sZCBjcm9zcy12YWxpZGF0aW9uDQpudW1fZm9sZHMgPC0gMTANCmZvbGRzIDwtIGN1dChzZXEoMSwgbnJvdyhkYXRhKSksIGJyZWFrcyA9IG51bV9mb2xkcywgbGFiZWxzID0gRkFMU0UpDQoNCiMgQ3JlYXRlIGVtcHR5IHZlY3RvcnMgdG8gc3RvcmUgdGhlIHByZWRpY3Rpb25zIGFuZCBhY3R1YWwgdmFsdWVzDQphbGxfcHJlZGljdGlvbnMgPC0gdmVjdG9yKCkNCmFsbF9hY3R1YWxzIDwtIHZlY3RvcigpDQoNCmZvciAoaSBpbiAxOm51bV9mb2xkcykgew0KICAjIFNwbGl0IHRoZSBkYXRhIGludG8gdHJhaW5pbmcgYW5kIHRlc3Qgc2V0cyBmb3IgdGhlIGN1cnJlbnQgZm9sZA0KICB0cmFpbl9kYXRhIDwtIGRhdGFbZm9sZHMgIT0gaSwgXQ0KICB0ZXN0X2RhdGEgPC0gZGF0YVtmb2xkcyA9PSBpLCBdDQogIA0KICB0cmFpbl9YIDwtIGFzLm1hdHJpeCh0cmFpbl9kYXRhWywgLTFdKQ0KICB0cmFpbl95IDwtIHRyYWluX2RhdGFbLCAxXQ0KICBkdHJhaW4gPC0geGdiLkRNYXRyaXgoZGF0YSA9IHRyYWluX1gsIGxhYmVsID0gdHJhaW5feSkNCiAgDQogIHRlc3RfWCA8LSBhcy5tYXRyaXgodGVzdF9kYXRhWywgLTFdKQ0KICB0ZXN0X3kgPC0gdGVzdF9kYXRhWywgMV0NCiAgZHRlc3QgPC0geGdiLkRNYXRyaXgoZGF0YSA9IHRlc3RfWCwgbGFiZWwgPSB0ZXN0X3kpDQoNCiAgIyBUcmFpbiB0aGUgWEdCb29zdCBtb2RlbA0KICB4Z2JfbW9kZWwgPC0geGdib29zdChkYXRhID0gZHRyYWluLCANCiAgICAgICAgICAgICAgICAgICAgICAgbnJvdW5kcyA9IDEwLCANCiAgICAgICAgICAgICAgICAgICAgICAgb2JqZWN0aXZlID0gImJpbmFyeTpsb2dpc3RpYyIpDQogIA0KICAjIE1ha2UgcHJlZGljdGlvbnMgb24gdGhlIHRlc3Qgc2V0DQogIHByZWRpY3Rpb25zIDwtIHByZWRpY3QoeGdiX21vZGVsLCBkdGVzdCkNCiAgDQogICMgQXBwZW5kIHRoZSBwcmVkaWN0aW9ucyBhbmQgYWN0dWFsIHZhbHVlcyB0byB0aGUgdmVjdG9ycw0KICBhbGxfcHJlZGljdGlvbnMgPC0gYyhhbGxfcHJlZGljdGlvbnMsIHByZWRpY3Rpb25zKQ0KICBhbGxfYWN0dWFscyA8LSBjKGFsbF9hY3R1YWxzLCB0ZXN0X3kpDQp9DQoNCmBgYA0KDQojIyBQZXJmb3JtYW5jZQ0KDQojIyMgQ29uZnVzaW9uIE1hdHJpeA0KDQpgYGB7cn0NCmNvbmZ1c2lvbl9tYXRyaXggPC0gdGFibGUoDQogIGFzLm51bWVyaWMoYWxsX2FjdHVhbHMpLCANCiAgYXMubnVtZXJpYyhpZmVsc2UoYWxsX3ByZWRpY3Rpb25zID4gMC41LCAxLCAwKSkNCikNCg0KVFAgPC0gY29uZnVzaW9uX21hdHJpeFsxLCAxXQ0KVE4gPC0gY29uZnVzaW9uX21hdHJpeFsyLCAyXQ0KRlAgPC0gY29uZnVzaW9uX21hdHJpeFsyLCAxXQ0KRk4gPC0gY29uZnVzaW9uX21hdHJpeFsxLCAyXQ0KDQojIyBDYWxjdWxhdGUgQWNjdXJhY3kNCmFjY3VyYWN5IDwtIChUUCArIFROKSAvIChUUCArIEZQICsgVE4gKyBGTikNCmNhdCgiQWNjdXJhY3k6IiwgYWNjdXJhY3ksICJcbiIpDQoNCiMjIENhbGN1bGF0ZSBSZWNhbGwNCnJlY2FsbCA8LSBUUCAvIChUUCArIEZOKQ0KY2F0KCJSZWNhbGw6IiwgcmVjYWxsLCAiXG4iKQ0KDQojIyBDYWxjdWxhdGUgUHJlY2lzaW9uDQpwcmVjaXNpb24gPC0gVFAgLyAoVFAgKyBGUCkNCmNhdCgiUHJlY2lzaW9uOiIsIHByZWNpc2lvbiwgIlxuIikNCg0KIyMgQ2FsY3VsYXRlIFNwZWNpZmljaXR5DQpzcGVjaWZpY2l0eSA8LSBUTiAvIChUTiArIEZQKQ0KY2F0KCJTcGVjaWZpY2l0eToiLCBzcGVjaWZpY2l0eSwgIlxuIikNCg0KIyMgQ2FsY3VsYXRlIEYxIFNjb3JlDQpmMV9zY29yZSA8LSAyICogKHByZWNpc2lvbiAqIHJlY2FsbCkgLyAocHJlY2lzaW9uICsgcmVjYWxsKQ0KY2F0KCJGMSBTY29yZToiLCBmMV9zY29yZSwgIlxuIikNCg0KYGBgDQoNCiMjIyBST0MgQ3VydmUNCg0KYGBge3J9DQojIENhbGN1bGF0ZSBST0MgY3VydmUgdXNpbmcgdGhlIGFjdHVhbCB2YWx1ZXMgYW5kIHByZWRpY3Rpb25zDQpyb2Nfb2JqIDwtIHJvYygNCiAgYXMubnVtZXJpYyhhbGxfYWN0dWFscyksIGFsbF9wcmVkaWN0aW9ucw0KKQ0KDQojIFBsb3QgdGhlIFJPQyBjdXJ2ZQ0KcGxvdCgNCiAgcm9jX29iaiwNCiAgY29sID0gImJsdWUiLA0KICBtYWluID0gIlJPQyBDdXJ2ZSAtIFhHQm9vc3QgKENyb3NzIFZhbGlkYXRpb24pIiwNCiAgbGVnYWN5LmF4ZXMgPSBUUlVFLA0KICBwcmludC5hdWMgPSBUUlVFLA0KICBwcmludC50aHJlcyA9IFRSVUUsDQogIGdyaWQgPSBjKDAuMiwgMC4yKSwNCiAgZ3JpZC5jb2wgPSBjKCJncmVlbiIsICJvcmFuZ2UiKQ0KKQ0KYGBgDQo=