Data Pre-processing

# 1.i.
# packages used throughout this lab
library(rpart)          # classification trees and pruning
library(randomForest)   # bagging and random forests
library(gbm)            # gradient boosting
library(parallel)       # mclapply, for the repeated cross-validation in Task 4
credit = read.csv("/course/data/2024-yong/lab06/credit.csv", stringsAsFactors=TRUE)
colSums(is.na(credit)) 
##    A1    A2    A3    A4    A5    A6    A7    A8    A9   A10   A11   A12   A13 
##    12    12     0     6     6     9     9     0     0     0     0     0     0 
##   A14   A15 class 
##    13     0     0
## Variables A1, A2, A4, A5, A6, A7, and A14 have missing values.
sum(rowSums(is.na(credit))>0)/690
## [1] 0.05362319
## About 5.4% of the observations have at least one missing value.
# ii.
train = credit[1:400,]
test = credit[-(1:400),]
calculate_mode <- function(x) {
  # return the most frequent value of x
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}

# impute the training set: mean for numeric columns, mode for categorical columns
for (col in colnames(train)) {
  if (is.numeric(train[[col]])) {
    col_mean <- mean(train[[col]], na.rm = TRUE)
    train[[col]][is.na(train[[col]])] <- col_mean
  } else {
    col_mode <- calculate_mode(train[[col]])
    train[[col]][is.na(train[[col]])] <- col_mode
  }
}

# impute the test set with the training-set statistics, so no test information is used
for (col in colnames(test)) {
  if (is.numeric(train[[col]])) {
    col_mean <- mean(train[[col]], na.rm = TRUE)
    test[[col]][is.na(test[[col]])] <- col_mean
  } else {
    col_mode <- calculate_mode(train[[col]])
    test[[col]][is.na(test[[col]])] <- col_mode
  }
}
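
A quick sanity check (a minimal sketch using the imputed data frames above): after imputation, no missing values should remain in either set.

c(train = sum(is.na(train)), test = sum(is.na(test)))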

Classification Trees

# 2. Grow an unpruned tree
r = rpart(class ~ ., data=train, minsplit=2, cp=0)
## absolute deviance values
r$frame[1, "dev"] * r$cptable[, "rel error"]  # root deviance * relative error
##   1   2   3   4   5   6   7   8   9  10  11 
## 182  60  47  42  36  28  24  10   4   2   0
## the size of the trees
r$cptable[, "nsplit"] + 1  # number of leaf nodes of each subtree
##  1  2  3  4  5  6  7  8  9 10 11 
##  1  2  4  5  7 11 14 28 37 41 47

As the tree grows, each additional split is chosen because it reduces the deviance further, so the deviance (and the relative error in the cptable) is monotonically non-increasing in the tree size.
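
A quick check of this on the fitted tree (a minimal sketch using the cptable of the unpruned tree r above):

# the relative error column of the cptable should be non-increasing
all(diff(r$cptable[, "rel error"]) <= 0)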

# 3. pruning the unpruned tree
n = nrow(r$cptable)
error_train = numeric(n)
error_test = numeric(n)
for(i in 1:n) {
  rt = prune(r, cp=r$cptable[i, "CP"])
  error_train[i] = mean(predict(rt, train, type="class") != train$class)
  error_test[i] = mean(predict(rt, test, type="class") != test$class)
}

plot(r$cptable[, "nsplit"] + 1, error_train, type = "o", col = "blue", ylim = c(0, max(error_train, error_test)), xlab = "Tree Size", ylab = "Error", main = "Plot of Training and Test Error")
lines(r$cptable[, "nsplit"] + 1, error_test, type = "o", col = "red")
legend("topright", legend = c("Train", "Test"), col = c("blue", "red"), lty = 1, pch = 1)

# 4. pruning the tree using 10-fold cross-validation with 20 repetitions
# average the cross-validated relative errors (xerror) over 20 repetitions of
# rpart's built-in 10-fold cross-validation, run in parallel
dev = rowMeans(simplify2array(
  mclapply(1:20, function(...) rpart(class ~ ., data=train, method="class",
                                     minsplit=2, cp=0)$cptable[,"xerror"],
           mc.cores=20)))
index = which.min(dev)                   # tree with the smallest averaged xerror
size = r$cptable[index, "nsplit"] + 1    # its size (number of leaf nodes)
## training errors
error_train[index]
## [1] 0.07
## test errors
error_test[index]
## [1] 0.1655172
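
To materialise the selected tree, one could prune at the chosen complexity parameter and confirm its size (a sketch using the objects above; rt_best is a hypothetical name, and its leaf count should equal size):

rt_best = prune(r, cp = r$cptable[index, "CP"])
sum(rt_best$frame$var == "<leaf>")   # number of leaf nodes, i.e. the tree size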

Bagging

# 5. Produce a Bagging model for the training data with 500 trees (with nodesize=1) constructed.
set.seed(769)
(r = randomForest(class ~ ., data=train, mtry=ncol(train)-1, importance=TRUE))
## 
## Call:
##  randomForest(formula = class ~ ., data = train, mtry = ncol(train) -      1, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 15
## 
##         OOB estimate of  error rate: 11%
## Confusion matrix:
##     -   + class.error
## - 191  27  0.12385321
## +  17 165  0.09340659
o = order(importance(r)[,"MeanDecreaseGini"], decreasing=TRUE)
importance(r)[o,][1:3,]
##            -        + MeanDecreaseAccuracy MeanDecreaseGini
## A9 64.998848 89.35890           101.796708        100.38619
## A6  8.923318 13.72719            15.259827         19.50405
## A3  3.506488 11.50356             9.928582         13.11326
## The three most important variables are A9, A6, and A3.
## training error
mean(predict(r, train) != train$class)
## [1] 0
## test error
mean(predict(r, test) != test$class)
## [1] 0.1448276

The test error (about 14.5%) is reasonably close to the OOB estimate of 11%. Bagging performs better than the pruned tree found in Task 4, whose test error was about 16.6%.
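
A compact side-by-side check (a sketch; r here is still the bagging forest, and error_test and index come from Tasks 3 and 4):

c(OOB         = r$err.rate[r$ntree, "OOB"],
  pruned_tree = error_test[index],
  bagging     = mean(predict(r, test) != test$class))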

Random Forests

# 6. Produce a Random Forest model with 500 trees (with nodesize=1) constructed.
set.seed(769)
(r = randomForest(class ~ ., data=train, importance=TRUE))
## 
## Call:
##  randomForest(formula = class ~ ., data = train, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 12%
## Confusion matrix:
##     -   + class.error
## - 193  25   0.1146789
## +  23 159   0.1263736
o = order(importance(r)[,"MeanDecreaseAccuracy"], decreasing=TRUE)
importance(r)[o,][1:3,]
##            -        + MeanDecreaseAccuracy MeanDecreaseGini
## A9  37.12825 35.65146             41.58352         49.78037
## A8  14.34494 11.89728             17.77080         19.30265
## A11 13.39434 13.06843             17.62196         21.08980
## The three most important variables are A9, A8, and A11.
## training error
mean(predict(r, train) != train$class)
## [1] 0
## test error
mean(predict(r, test) != test$class)
## [1] 0.1482759

The test error (about 14.8%) is again close to the OOB estimate of 12%. The random forest performs slightly worse than bagging on the test set (0.148 versus 0.145).
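
The only difference from the bagging fit is mtry: bagging uses all 15 predictors at every split, while randomForest's default for classification is floor(sqrt(p)) (a quick check, consistent with the "No. of variables tried at each split" values reported above):

p = ncol(train) - 1   # number of predictors, excluding class
c(bagging_mtry = p, rf_default_mtry = floor(sqrt(p)))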

# 7. Further consider using nodesize=5,10,20, respectively, when building a Random Forest model with 500 trees constructed.
nodesize = c(5,10,20)
error = matrix(nrow=3, ncol=2)
for (i in 1:3) {
   r = randomForest(class ~ ., data=train, nodesize=nodesize[i])
   error[i,1] = mean(train$class != predict(r, train))
   error[i,2] = mean(test$class != predict(r, test))   
}
dimnames(error) = list(paste0("nodesize=", nodesize), c("TrainError", "TestError"))
error
##             TrainError TestError
## nodesize=5      0.0300 0.1517241
## nodesize=10     0.0550 0.1551724
## nodesize=20     0.0725 0.1620690

Both the training and test errors increase slightly as the node size grows: larger terminal nodes give shallower, less flexible trees, so the training error rises the most, while the test error changes only a little.
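
Note that no seed was set before these fits, so the exact errors vary from run to run. A reproducible version of the loop would seed before each fit (a sketch; error_seeded and r_ns are hypothetical names so the objects above are left untouched):

error_seeded = error
for (i in 1:3) {
  set.seed(769)
  r_ns = randomForest(class ~ ., data = train, nodesize = nodesize[i])
  error_seeded[i, 1] = mean(train$class != predict(r_ns, train))
  error_seeded[i, 2] = mean(test$class != predict(r_ns, test))
}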

Gradient Boosting

# 8. Produce a Boosting model, using up to 500 trees, with the number of trees determined via 10-fold cross-validation.
train2 = transform(train, class2=as.integer(train$class)-1)  # gbm's bernoulli loss needs a 0/1 response
set.seed(769)
(r = gbm(class2 ~ . - class, data=train2, distribution="bernoulli", n.trees=500, interaction.depth=3, cv.folds=10))
## gbm(formula = class2 ~ . - class, distribution = "bernoulli", 
##     data = train2, n.trees = 500, interaction.depth = 3, cv.folds = 10)
## A gradient boosted model with bernoulli loss function.
## 500 iterations were performed.
## The best cross-validation iteration was 66.
## There were 15 predictors of which 14 had non-zero influence.
summary(r)[1:3,]

##    var  rel.inf
## A9  A9 35.97674
## A6  A6 17.82184
## A3  A3 10.86759
## The three most important variables are A9, A6, and A3.
pre = function(r, data, ...) {
  # convert predicted probabilities into class labels on the original factor levels
  p = predict(r, data, type="response", ...)
  yhat = (p > 0.5) + 1     # 1 = first level, 2 = second level
  levels(data$class)[yhat]
}

## training error
mean(pre(r, train) != train$class)
## Using 66 trees...
## [1] 0.0575
## test error
mean(pre(r, test) != test$class)
## Using 66 trees...
## [1] 0.1413793

The boosting predictor achieves the lowest test error of all the predictors obtained in Tasks 4, 5 and 6 (see the recap below).
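
For reference, the test errors reported above collected in one place (values copied from the earlier outputs rather than recomputed, since r now holds the boosting fit):

c(pruned_tree = 0.1655172, bagging = 0.1448276,
  random_forest = 0.1482759, boosting = 0.1413793)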

# 9. Training and test errors for every number of boosting iterations (1 to 500)
error_train <- numeric(500)
error_test <- numeric(500)
for (i in 1:500) {
  train_pred <- pre(r, train, n.trees = i)
  error_train[i] <- mean(train_pred != train$class)
  test_pred <- pre(r, test, n.trees = i)
  error_test[i] <- mean(test_pred != test$class)
}

plot(1:500, error_train, type = "l", col = "blue", ylim = range(c(error_train, error_test)),
     xlab = "Number of Trees", ylab = "Error", main = "Training vs Test Error in Boosting")
lines(1:500, error_test, type = "l", col = "red")
legend("right", legend = c("Training Error", "Test Error"), col = c("blue", "red"), lty = 1)

Boosting overfits on this data set when too many trees are used: the training error keeps decreasing, while the test error stops improving and starts to drift upwards.
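
This can be checked numerically (a sketch using the error vectors from Task 9 and the fitted gbm object r): the iteration at which the test error is smallest can be compared with the cross-validation choice reported earlier.

c(best_test_iteration = which.min(error_test),
  cv_choice = gbm.perf(r, method = "cv", plot.it = FALSE))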

Summary

In this lab, several classification methods were applied to the credit data set. About 5.4% of the observations contain missing values; these were imputed in both the training and test sets using training-set statistics, with the mean for numeric variables and the mode for categorical variables. An unpruned classification tree was grown, and its deviance decreased monotonically as the tree size increased. The tree was then pruned, with repeated 10-fold cross-validation (20 repetitions) used to choose the tree size that minimises the cross-validated error. A bagging model with 500 trees performed better than the pruned tree, with A9, A6, and A3 identified as the three most important variables. A random forest model performed slightly worse than bagging, with A9, A8, and A11 as its three most important variables. Increasing the node size from 5 to 20 increased both the training and test errors slightly. Finally, a boosting model was fitted with up to 500 trees, the number of trees chosen by 10-fold cross-validation; it achieved the lowest test error of all the models. Plotting the training and test errors against the number of trees showed that using too many trees causes the boosting model to overfit.