Variable Selection

library(randomForest)  # random forests and variable importance
library(mclust)        # adjustedRandIndex()
library(e1071)         # svm() and tune()

zip = read.csv("/course/data/2024-yong/lab07/zip.csv")
zip = zip[zip$digit %in% c(6,8),]
zip$digit = factor(zip$digit)
# 1.
set.seed(769)
r = randomForest(digit ~ ., data=zip, importance=TRUE)
imp = importance(r)
(top2 = rownames(imp[order(imp[,"MeanDecreaseAccuracy"], decreasing=TRUE),][1:2,]))
## [1] "p75" "p91"
# The two most important predictors: p75, p91
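# Sketch (extra check, not required by the question): the importance
# ranking can also be visualized; both panels should show p75 and p91 at the top
varImpPlot(r, n.var=10, main="Top 10 predictors by importance")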
zip2 = zip[,c("digit",top2)]
r2 = randomForest(digit ~ ., data=zip2, mtry=1)
r$err.rate[500,"OOB"]
##        OOB 
## 0.01037613
r2$err.rate[500,"OOB"]
##        OOB 
## 0.07328145
# The OOB error using all predictors is 0.01037613.
# The OOB error using the two selected predictors is 0.07328145.
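# Sketch (extra check): OOB error as a function of the number of trees,
# comparing the full model with the two-predictor model
plot(r$err.rate[,"OOB"], type="l", xlab="Number of trees", ylab="OOB error")
lines(r2$err.rate[,"OOB"], col=2)
legend("topright", legend=c("all predictors", "p75 + p91"), col=1:2, lty=1)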
# 2.
(top10 = rownames(imp[order(imp[,"MeanDecreaseAccuracy"], decreasing=TRUE),])[1:10])
##  [1] "p75"  "p91"  "p76"  "p120" "p60"  "p37"  "p90"  "p136" "p105" "p106"
# Correlation of each of the other top-10 predictors with p75
cors = sapply(top10[2:10], function(x) cor(zip[,"p75"], zip[,x]))
# Pick the predictor least correlated (most negative correlation) with p75
p2 = names(sort(cors))[1]
pred = c("digit", "p75", p2)
zip3 = zip[,pred]
set.seed(769)
r3 = randomForest(digit ~ ., data = zip3, mtry = 1)
r3$err.rate[500,"OOB"]
##        OOB 
## 0.05901427
# The OOB error is 0.05901427, so p75 paired with the least-correlated of the
# other top-10 predictors forms a better subset of size 2 than the pair
# (p75, p91) obtained in Question 1.
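# Sketch (extra check): the correlations behind the choice of p2, and a
# scatterplot of the selected pair coloured by digit
round(sort(cors), 3)
plot(zip[,"p75"], zip[,p2], col=as.numeric(zip$digit)+1,
     xlab="p75", ylab=p2, main="Least-correlated pair among the top 10")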
train = zip[1:1000, pred]
test = zip[-(1:1000), pred]

Clustering

# 3. 
set.seed(769)
ari = double(6)
r = vector("list", 7)  # r[[K]] stores the K-means fit for K clusters
for(K in 2:7) {
  r[[K]] = kmeans(zip3[,-1], centers=K)
  ari[K-1] = adjustedRandIndex(zip3$digit, r[[K]]$cluster)
}
ari
## [1] 0.6359116 0.5773832 0.5349036 0.5068246 0.4959250 0.5149460

For K=2, K-means performs best, suggesting that the data naturally form two broad clusters, as expected since only the digits 6 and 8 are present. However, with an ARI of 0.64, the clustering is still far from perfectly matching the true class labels.
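
For reference, the ARI measures agreement between a clustering and the true labels via their contingency table: 1 means perfect agreement and values near 0 mean no better than chance. A minimal sketch of this comparison for K=2 (assuming the mclust package is loaded):

# Sketch: cross-tabulate the K=2 clustering against the true digits;
# adjustedRandIndex() summarizes this table in a single number
table(zip3$digit, r[[2]]$cluster)
adjustedRandIndex(zip3$digit, r[[2]]$cluster)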

# 4. 
set.seed(769)
d = dist(zip3[,-1])
# Complete linkage
rC = hclust(d)
Cari = double(6)
for(K in 2:7) Cari[K-1] = adjustedRandIndex(zip3$digit, cutree(rC, K))
Cari
## [1] 0.4097873 0.3518207 0.5152887 0.5151526 0.4876007 0.4813894
# Single linkage
rS = hclust(d, method="single") 
Sari = double(6)
for(K in 2:7) Sari[K-1] = adjustedRandIndex(zip3$digit, cutree(rS, K))
Sari
## [1] 0.000702542 0.002977036 0.003227936 0.003480510 0.003765732 0.003752289
# Average linkage
rA = hclust(d, method="average") 
Aari = double(6)
for(K in 2:7) Aari[K-1] = adjustedRandIndex(zip3$digit, cutree(rA, K))
Aari
## [1] 0.4489173 0.5978563 0.5239059 0.5190509 0.4963023 0.5348528
# Centroid linkage
rCen = hclust(d, method="centroid")
Cen_ari = double(6)
for(K in 2:7) Cen_ari[K-1] = adjustedRandIndex(zip3$digit, cutree(rCen, K))
Cen_ari
## [1] 0.6887968 0.6031554 0.5213233 0.5623690 0.5568440 0.5297467
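# Sketch (an equivalent, more compact form of the four loops above): one ARI
# matrix with a row per K and a column per linkage method
sapply(c("complete", "single", "average", "centroid"), function(m) {
  h = hclust(d, method=m)
  sapply(2:7, function(K) adjustedRandIndex(zip3$digit, cutree(h, K)))
})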
# 5.
par(mfrow=c(2,3))
# Add small jitter so overlapping pixel values remain visible in the plots
data = zip3[,-1] + rnorm(n=nrow(zip3) * (ncol(zip3) - 1), sd=0.1)

plot(data[,1], data[,2], col = as.numeric(zip3$digit) + 1, pch=as.numeric(zip3$digit), main="Original Classes")

plot(data[,1], data[,2], col = r[[2]]$cluster+1, pch=as.numeric(r[[2]]$cluster), main="K-means (K=2)")

methods = c("complete", "single", "average", "centroid")
for(i in 1:4) {
  rr = hclust(d, method = methods[i])
  clust = cutree(rr, k=2) 
  plot(data[,1], data[,2], col=clust+1, pch=clust, main=paste(methods[i], "linkage"))
}

The ARI values are broadly consistent with the plotted clusterings: the higher-ARI methods (K-means and centroid linkage) recover the two classes reasonably cleanly, while single linkage, whose ARI is near zero, appears to chain nearly all points into a single cluster and fails to separate the digits.

Support Vector Machines

# 6.
costs=c(0.001, 0.01, 0.1, 1, 10, 100) 
svm_mods=list() 
plot_svm = function(mod, data, ...){
  col = as.numeric(data$digit) + 1
  plot(data[,2], data[,3], col=col, pch=19, xlab=names(data)[2], ylab=names(data)[3], ...)
  # Highlight the support vectors
  sv = mod$index
  points(data[sv,2], data[sv,3], pch=20, col=4, cex=1.5)
  # Linear decision boundary b0 + w1*x1 + w2*x2 = 0, i.e.
  # x2 = -b0/w2 - (w1/w2)*x1; coef() is only available for linear kernels
  if(mod$kernel == 0) {
    b = coef(mod)
    abline(-b[-3]/b[3], col="red", lwd=2)
  }
}
par(mfrow=c(2,3))  
for(i in 1:length(costs)){
  set.seed(308)  
  svm_mods[[i]] = svm(digit ~ ., data=train, kernel="linear", cost=costs[i], scale=FALSE)
  plot_svm(svm_mods[[i]], train)
  title(paste("Cost =", costs[i]))
}

As cost increases, the number of support vectors decreases and the margin narrows, so the decision boundary fits the training data more strictly.
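
The first claim can be checked directly (a quick sketch):

# Sketch: number of support vectors for each cost value
sapply(svm_mods, function(m) nrow(m$SV))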

# 7. 
# Training and test misclassification rates for a fitted model
compute_err = function(mod) {
  y1 = predict(mod, train)
  y2 = predict(mod, test)
  c(Train_err = mean(train$digit != y1), Test_err = mean(test$digit != y2))
}
err = sapply(1:6, function(i) compute_err(svm_mods[[i]]))
colnames(err)=paste("Cost =", costs)
err
##           Cost = 0.001 Cost = 0.01 Cost = 0.1   Cost = 1  Cost = 10 Cost = 100
## Train_err    0.1710000  0.08000000 0.06700000 0.06700000 0.06700000 0.06700000
## Test_err     0.1383764  0.06642066 0.05904059 0.06088561 0.06088561 0.06088561

The test misclassification rate is highest at cost = 0.001 (0.1384) and reaches its minimum of 0.0590 at cost = 0.1, making that the optimal value. As cost increases from 0.001 to 0.1, the test rate drops sharply; beyond 0.1 it stops improving and settles at about 0.0609.
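
A plot makes the same pattern visible at a glance (a quick sketch using the err matrix above):

# Sketch: training and test error against cost on a log scale
matplot(log10(costs), t(err), type="b", pch=1:2, col=1:2,
        xlab="log10(cost)", ylab="Misclassification rate")
legend("topright", legend=rownames(err), col=1:2, pch=1:2)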

# 8.
cost = 1
gamma = c(0.001, 0.01, 0.1, 1, 10, 100)
par(mfrow=c(2,3))
rad_mods = vector("list", 6)  # radial-kernel fits, kept separate from earlier objects
for(i in 1:6) {
  rad_mods[[i]] = svm(digit ~ ., data=train, scale=FALSE, kernel="radial", cost=cost, gamma=gamma[i], probability=TRUE)
  plot_svm(rad_mods[[i]], train, main=substitute(gamma == g, list(g = gamma[i])))
}
# Number of support vectors for each gamma
sapply(1:6, function(i) nrow(rad_mods[[i]]$SV))
err = sapply(1:6, function(i) compute_err(rad_mods[[i]]))
colnames(err) = paste("Gamma =", gamma)
err

The number of support vectors initially decreases as gamma increases but then rises again. The test error decreases as gamma increases from 0.001 to 10, reaching its lowest value (0.0590) at gamma = 10, and then slightly increases at gamma = 100. The optimal value of gamma is 10. Compared to the linear kernel, the radial kernel with the optimal gamma (10) performs slightly better.
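
The same visual check for gamma (a quick sketch, using the err matrix from Question 8):

# Sketch: test error against gamma on a log scale
plot(log10(gamma), err["Test_err",], type="b",
     xlab="log10(gamma)", ylab="Test misclassification rate")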

# 9.
set.seed(769)
r <- tune(svm, digit ~ ., data=train, kernel="radial", scale=FALSE, ranges=list(cost=10^(-3:2), gamma=10^(-3:2)))
r$best.parameters
r$best.model
(err = compute_err(r$best.model))

## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  100 
## 
## Number of Support Vectors:  155
## 
##  Train_err   Test_err 
## 0.06700000 0.06088561
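
e1071 can also display the whole tuning surface, which makes the selected (cost, gamma) region easy to see (a one-line sketch; the transform arguments put both axes on a log scale):

# Sketch: classification error over the (cost, gamma) grid searched by tune()
plot(r, transform.x=log10, transform.y=log10)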

Summary

In this lab, a random forest was used to identify the two most important predictors (p75 and p91). The OOB error rate using all predictors was 0.010, while using only these two predictors increased it to 0.073. A different subset of two predictors, pairing p75 with the top-10 predictor least correlated with it, improved the OOB error rate to 0.059. K-means and hierarchical clustering were then applied and compared via the ARI. K-means performed best at K=2, achieving an ARI of 0.64; among the hierarchical methods, centroid linkage did best (ARI 0.69 at K=2), while single linkage failed almost completely, with an ARI near zero. Even the best clusterings matched the true class labels only moderately well. Linear SVMs were trained over a range of cost values: as cost increased, the number of support vectors decreased and the boundary fitted the training data more strictly, with the optimal cost of 0.1 giving a test error rate of 0.0590. With the radial kernel, the optimal gamma was 10, again with a test error rate of 0.0590, slightly better than the linear kernel. Finally, a grid search with tune() selected cost = 100, giving a test error of 0.0609. Overall, the choice of kernel and the tuning of hyperparameters such as cost and gamma had a substantial impact on SVM performance.