rm(list=ls(all=TRUE))
setwd('C:/Users/sitdo/Documents/GitHub/IBD-EDA/paper1/')
library(dplyr)
载入程辑包:‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
data <- read.csv("./data_preprocessed/data.csv") %>% select(-1)
library(randomForest)
randomForest 4.7-1.1
Type rfNews() to see new features/changes/bug fixes.
载入程辑包:‘randomForest’
The following object is masked from ‘package:dplyr’:
combine
library(pROC)
Type 'citation("pROC")' for a citation.
载入程辑包:‘pROC’
The following objects are masked from ‘package:stats’:
cov, smooth, var
set.seed(123)
splitting_ratio <- 0.7
indices <- 1:nrow(data)
shuffled_indices <- sample(indices)
train_size <- floor(splitting_ratio * length(indices))
train_indices <- shuffled_indices[1:train_size]
test_indices <- shuffled_indices[(train_size + 1):length(indices)]
train_data <- data[train_indices, ]
test_data <- data[test_indices, ]
train_X <- as.matrix(train_data[, -1])
train_y <- train_data[, 1]
test_X <- as.matrix(test_data[, -1])
test_y <- test_data[, 1]
Building Model
rf_model <- randomForest(train_X, train_y)
Warning: The response has five or fewer unique values. Are you sure you want to do regression?
predictions <- predict(rf_model, test_X)
confusion_matrix <- table(
as.numeric(test_data$dod), as.numeric(ifelse(predictions > 0.5, 1, 0))
)
TP <- confusion_matrix[1, 1]
TN <- confusion_matrix[2, 2]
FP <- confusion_matrix[2, 1]
FN <- confusion_matrix[1, 2]
## Calculate Accuracy
accuracy <- (TP + TN) / (TP + FP + TN + FN)
cat("Accuracy:", accuracy, "\n")
Accuracy: 0.8677686
## Calculate Recall
recall <- TP / (TP + FN)
cat("Recall:", recall, "\n")
Recall: 0.9495114
## Calculate Precision
precision <- TP / (TP + FP)
cat("Precision:", precision, "\n")
Precision: 0.8996914
## Calculate Specificity
specificity <- TN / (TN + FP)
cat("Specificity:", specificity, "\n")
Specificity: 0.4196429
## Calculate F1 Score
f1_score <- 2 * (precision * recall) / (precision + recall)
cat("F1 Score:", f1_score, "\n")
F1 Score: 0.9239303
# Calculate ROC curve using the actual values and predictions
roc_obj <- roc(
as.numeric(test_data$dod), predictions
)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
# Plot the ROC curve
plot(
roc_obj,
col = "blue",
main = "ROC Curve - Random Forest",
legacy.axes = TRUE,
print.auc = TRUE,
print.thres = TRUE,
grid = c(0.2, 0.2),
grid.col = c("green", "orange")
)
# Perform 10-fold cross-validation
num_folds <- 10
folds <- cut(seq(1, nrow(data)), breaks = num_folds, labels = FALSE)
# Create empty vectors to store the predictions and actual values
all_predictions <- vector()
all_actuals <- vector()
for (i in 1:num_folds) {
# Split the data into training and test sets for the current fold
train_data <- data[folds != i, ]
test_data <- data[folds == i, ]
# Convert training data to matrix format
train_X <- as.matrix(train_data[, -1])
train_y <- train_data[, 1]
# Train the random forest model
rf_model <- randomForest(train_X, train_y)
# Convert test data to matrix format
test_X <- as.matrix(test_data[, -1])
test_y <- test_data[, 1]
# Make predictions on the test set
predictions <- predict(rf_model, test_X)
# Append the predictions and actual values to the vectors
all_predictions <- c(all_predictions, predictions)
all_actuals <- c(all_actuals, test_y)
}
Warning: The response has five or fewer unique values. Are you sure you want to do regression?Warning: The response has five or fewer unique values. Are you sure you want to do regression?Warning: The response has five or fewer unique values. Are you sure you want to do regression?Warning: The response has five or fewer unique values. Are you sure you want to do regression?Warning: The response has five or fewer unique values. Are you sure you want to do regression?Warning: The response has five or fewer unique values. Are you sure you want to do regression?Warning: The response has five or fewer unique values. Are you sure you want to do regression?Warning: The response has five or fewer unique values. Are you sure you want to do regression?Warning: The response has five or fewer unique values. Are you sure you want to do regression?Warning: The response has five or fewer unique values. Are you sure you want to do regression?
confusion_matrix <- table(
as.numeric(all_actuals),
as.numeric(ifelse(all_predictions > 0.5, 1, 0))
)
TP <- confusion_matrix[1, 1]
TN <- confusion_matrix[2, 2]
FP <- confusion_matrix[2, 1]
FN <- confusion_matrix[1, 2]
## Calculate Accuracy
accuracy <- (TP + TN) / (TP + FP + TN + FN)
cat("Accuracy:", accuracy, "\n")
Accuracy: 0.8709144
## Calculate Recall
recall <- TP / (TP + FN)
cat("Recall:", recall, "\n")
Recall: 0.9543446
## Calculate Precision
precision <- TP / (TP + FP)
cat("Precision:", precision, "\n")
Precision: 0.8987517
## Calculate Specificity
specificity <- TN / (TN + FP)
cat("Specificity:", specificity, "\n")
Specificity: 0.4236842
## Calculate F1 Score
f1_score <- 2 * (precision * recall) / (precision + recall)
cat("F1 Score:", f1_score, "\n")
F1 Score: 0.9257143
# Calculate ROC curve using the actual values and predictions
roc_obj <- roc(
as.numeric(all_actuals), all_predictions
)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
# Plot the ROC curve
plot(
roc_obj,
col = "blue",
main = "ROC Curve - Random Forest (Cross Validation)",
legacy.axes = TRUE,
print.auc = TRUE,
print.thres = TRUE,
grid = c(0.2, 0.2),
grid.col = c("green", "orange")
)