# 1.i.
library(rpart)          # classification trees
library(randomForest)   # bagging and random forests
library(gbm)            # boosting
library(parallel)       # mclapply for repeated cross-validation
credit = read.csv("/course/data/2024-yong/lab06/credit.csv", stringsAsFactors=TRUE)
colSums(is.na(credit))
## A1 A2 A3 A4 A5 A6 A7 A8 A9 A10 A11 A12 A13
## 12 12 0 6 6 9 9 0 0 0 0 0 0
## A14 A15 class
## 13 0 0
## Variables A1, A2, A4, A5, A6, A7 and A14 have missing values.
sum(rowSums(is.na(credit))>0)/690
## [1] 0.05362319
## About 5.4% of the observations have at least one missing value.
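## Optional cross-check: complete.cases() gives the same proportion of incomplete rows.
mean(!complete.cases(credit))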
# ii.
train = credit[1:400,]
test = credit[-(1:400),]
## most frequent value of a vector (used to impute categorical variables)
calculate_mode <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}
## impute the training set: column mean for numeric variables, column mode otherwise
for (col in colnames(train)) {
  if (is.numeric(train[[col]])) {
    col_mean <- mean(train[[col]], na.rm = TRUE)
    train[[col]][is.na(train[[col]])] <- col_mean
  } else {
    col_mode <- calculate_mode(train[[col]])
    train[[col]][is.na(train[[col]])] <- col_mode
  }
}
## impute the test set with the training-set means/modes, so no test information leaks in
for (col in colnames(test)) {
  if (is.numeric(train[[col]])) {
    col_mean <- mean(train[[col]], na.rm = TRUE)
    test[[col]][is.na(test[[col]])] <- col_mean
  } else {
    col_mode <- calculate_mode(train[[col]])
    test[[col]][is.na(test[[col]])] <- col_mode
  }
}
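## Optional sanity check (a small addition): neither set should contain NAs after imputation.
stopifnot(!anyNA(train), !anyNA(test))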
# 2. Grow an unpruned tree
r = rpart(class ~ ., data=train, minsplit=2, cp=0)
## absolute deviance values
r$frame[1, "dev"] * r$cptable[, "rel error"]
## 1 2 3 4 5 6 7 8 9 10 11
## 182 60 47 42 36 28 24 10 4 2 0
## the size of the trees
r$cptable[, "nsplit"] + 1
## 1 2 3 4 5 6 7 8 9 10 11
## 1 2 4 5 7 11 14 28 37 41 47
As the tree grows, each additional split is chosen to reduce the deviance as much as possible, so the absolute deviance decreases monotonically with tree size, reaching 0 for the fully grown tree.
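This can be verified numerically with a one-line check over the same quantities computed above:
## the absolute deviance values should be non-increasing in tree size
all(diff(r$frame[1, "dev"] * r$cptable[, "rel error"]) <= 0)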
# 3. pruning the unpruned tree
n = nrow(r$cptable)
error_train = numeric(n)
error_test = numeric(n)
for(i in 1:n) {
  rt = prune(r, cp=r$cptable[i, "CP"])
  error_train[i] = mean(predict(rt, train, type="class") != train$class)
  error_test[i] = mean(predict(rt, test, type="class") != test$class)
}
plot(r$cptable[, "nsplit"] + 1, error_train, type = "o", col = "blue",
     ylim = c(0, max(error_train, error_test)),
     xlab = "Tree Size", ylab = "Error", main = "Plot of Training and Test Error")
lines(r$cptable[, "nsplit"] + 1, error_test, type = "o", col = "red")
legend("topright", legend = c("Train", "Test"), col = c("blue", "red"), lty = 1, pch = 1)
# 4. pruning the tree using 10-fold cross-validation with 20 repetitions
## each rpart() fit performs its own 10-fold CV (xval = 10 by default);
## average the cross-validated errors over 20 repetitions
dev = rowMeans(simplify2array(
  mclapply(1:20, function(...) rpart(class ~ ., data=train, method="class",
                                     minsplit=2, cp=0)$cptable[,"xerror"],
           mc.cores=20)))
index = which.min(dev)
size = r$cptable[index,"nsplit"] + 1
## training errors
error_train[index]
## [1] 0.07
## test errors
error_test[index]
## [1] 0.1655172
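## The CV-selected subtree itself is not stored above; a minimal sketch to extract it (introducing `rt_cv`):
rt_cv = prune(r, cp = r$cptable[index, "CP"])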
# 5. Produce a Bagging model for the training data with 500 trees (with nodesize=1) constructed.
set.seed(769)
(r = randomForest(class ~ ., data=train, mtry=ncol(train)-1, importance=TRUE))
##
## Call:
## randomForest(formula = class ~ ., data = train, mtry = ncol(train) - 1, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 15
##
## OOB estimate of error rate: 11%
## Confusion matrix:
## - + class.error
## - 191 27 0.12385321
## + 17 165 0.09340659
o = order(importance(r)[,"MeanDecreaseGini"], decreasing=TRUE)
importance(r)[o,][1:3,]
## - + MeanDecreaseAccuracy MeanDecreaseGini
## A9 64.998848 89.35890 101.796708 100.38619
## A6 8.923318 13.72719 15.259827 19.50405
## A3 3.506488 11.50356 9.928582 13.11326
## The three most important variables (by mean decrease in Gini) are A9, A6 and A3.
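## For a visual version of this ranking, randomForest's varImpPlot() can be used (optional sketch):
varImpPlot(r, n.var = 10, main = "Bagging: variable importance")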
## training error
mean(predict(r, train) != train$class)
## [1] 0
## test error
mean(predict(r, test) != test$class)
## [1] 0.1448276
The test error (14.5%) is a little higher than the OOB estimate (11%), and bagging performs better than the pruned tree found in Task 4.
# 6. Produce a Random Forest model with 500 trees (with nodesize=1) constructed.
set.seed(769)
(r = randomForest(class ~ ., data=train, importance=TRUE))
##
## Call:
## randomForest(formula = class ~ ., data = train, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 12%
## Confusion matrix:
## - + class.error
## - 193 25 0.1146789
## + 23 159 0.1263736
o = order(importance(r)[,"MeanDecreaseAccuracy"], decreasing=TRUE)
importance(r)[o,][1:3,]
## - + MeanDecreaseAccuracy MeanDecreaseGini
## A9 37.12825 35.65146 41.58352 49.78037
## A8 14.34494 11.89728 17.77080 19.30265
## A11 13.39434 13.06843 17.62196 21.08980
## The three most important variables (by mean decrease in accuracy) are A9, A8 and A11.
## training error
mean(predict(r, train) != train$class)
## [1] 0
## test error
mean(predict(r, test) != test$class)
## [1] 0.1482759
The test error (14.8%) is again somewhat higher than the OOB estimate (12%). The random forest performs slightly worse than bagging on the test set.
# 7. Further consider using nodesize=5,10,20, respectively, when building a Random Forest model with 500 trees constructed.
nodesize = c(5,10,20)
error = matrix(nrow=3, ncol=2)
for (i in 1:3) {
  r = randomForest(class ~ ., data=train, nodesize=nodesize[i])
  error[i,1] = mean(train$class != predict(r, train))
  error[i,2] = mean(test$class != predict(r, test))
}
dimnames(error) = list(paste0("nodesize=", nodesize), c("TrainError", "TestError"))
error
## TrainError TestError
## nodesize=5 0.0300 0.1517241
## nodesize=10 0.0550 0.1551724
## nodesize=20 0.0725 0.1620690
Increasing the node size slightly increases both the training and test errors, since larger terminal nodes give shallower, less flexible trees.
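A small optional sketch to visualise this trend from the `error` matrix computed above:
matplot(nodesize, error, type = "b", pch = 1, col = c("blue", "red"),
        xlab = "Node size", ylab = "Error", main = "Random Forest: error vs node size")
legend("right", legend = colnames(error), col = c("blue", "red"), lty = 1:2, pch = 1)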
# 8. Produce a Boosting model, using up to 500 trees, with the number of trees determined via 10-fold cross-validation.
## gbm with the bernoulli loss needs a 0/1 numeric response
train2 = transform(train, class2=as.integer(train$class)-1)
set.seed(769)
(r = gbm(class2 ~ . - class, data=train2, distribution="bernoulli", n.trees=500, interaction.depth=3, cv.folds=10))
## gbm(formula = class2 ~ . - class, distribution = "bernoulli",
## data = train2, n.trees = 500, interaction.depth = 3, cv.folds = 10)
## A gradient boosted model with bernoulli loss function.
## 500 iterations were performed.
## The best cross-validation iteration was 66.
## There were 15 predictors of which 14 had non-zero influence.
summary(r)[1:3,]
## var rel.inf
## A9 A9 35.97674
## A6 A6 17.82184
## A3 A3 10.86759
## The three most important variables are A9, A6 and A3
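## The best number of trees (66, as reported above) can be re-extracted and its CV curve plotted with gbm.perf() (optional sketch):
best_iter = gbm.perf(r, method = "cv")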
## convert gbm's predicted probabilities back into class labels
pre = function(r, data, ...) {
  p = predict(r, data, type="response", ...)
  yhat = (p > 0.5) + 1              # 1 = first level, 2 = second level
  levels(data$class)[yhat]
}
## training error
mean(pre(r, train) != train$class)
## Using 66 trees...
## [1] 0.0575
## test error
mean(pre(r, test) != test$class)
## Using 66 trees...
## [1] 0.1413793
The boosting model achieves a lower test error than the predictors obtained in Tasks 4, 5 and 6.
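For reference, the test errors reported above can be gathered in one place (values copied from Tasks 4-8 and rounded to four decimals):
test_errors = c(pruned_tree = 0.1655, bagging = 0.1448, random_forest = 0.1483, boosting = 0.1414)
sort(test_errors)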
# 9.
error_train <- numeric(500)
error_test <- numeric(500)
for (i in 1:500) {
  train_pred <- pre(r, train, n.trees = i)
  error_train[i] <- mean(train_pred != train$class)
  test_pred <- pre(r, test, n.trees = i)
  error_test[i] <- mean(test_pred != test$class)
}
plot(1:500, error_train, type = "l", col = "blue", ylim = range(c(error_train, error_test)),
xlab = "Number of Trees", ylab = "Error", main = "Training vs Test Error in Boosting")
lines(1:500, error_test, type = "l", col = "red")
legend("right", legend = c("Training Error", "Test Error"), col = c("blue", "red"), lty = 1)
Boosting overfits this data set when too many trees are used: the training error keeps decreasing with more trees, while the test error does not.
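An optional check of where the test error bottoms out, with a marker for the CV-chosen iteration from Task 8 (assuming the plot above is still the active graphics device):
which.min(error_test)        # number of trees with the smallest test error
abline(v = 66, lty = 2)      # CV-selected iteration reported by gbm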
In this lab, several classification methods were applied to the credit data set. About 5.4% of the observations contain missing values; these were imputed with the training-set mean for numeric variables and the training-set mode for categorical variables. An unpruned classification tree was grown, and its absolute deviance decreased monotonically as the tree size increased. The tree was then pruned, with the size chosen by repeated 10-fold cross-validation.

A bagging model with 500 trees performed better than the pruned tree, with A9, A6 and A3 identified as its most important variables. A random forest model performed slightly worse than bagging, with A9, A8 and A11 the most important variables. Increasing the node size of the random forest increased both the training and test errors slightly. A boosting model was fitted with up to 500 trees, the number of trees being chosen by 10-fold cross-validation; it achieved the lowest test error of all the models considered. Finally, plotting the training and test errors against the number of trees showed that boosting can overfit this data set when too many trees are used.