Data Exploration and Visualisation

airfoil = read.csv("/course/data/2024-yong/lab05/airfoil.csv")
plot_scatter <- function(x, xname) {
  correlation <- round(cor(x, airfoil$pressure), 3)
  plot(x, airfoil$pressure, main = paste("Correlation:", correlation),
       xlab = xname, ylab = "Pressure", pch = 20)
  abline(lm(airfoil$pressure ~ x), col = "red", lwd = 1) # add a linear fitted line
  lines(lowess(x, airfoil$pressure), col = "blue", lwd = 1) # add a lowess smoothed curve
}

plot_scatter(airfoil$frequency, "Frequency")

plot_scatter(airfoil$angle, "Angle")

plot_scatter(airfoil$length, "Length")

plot_scatter(airfoil$velocity, "Velocity")

plot_scatter(airfoil$thickness, "Thickness")

The scatter plots show that most variables have an approximately linear relationship with pressure. Except for velocity, all variables are negatively correlated with pressure. Among the predictors, frequency has the strongest (negative) correlation with pressure, while velocity has the weakest correlation.
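
For reference, the five correlations can also be collected in a single call; a small sketch, assuming the column names used in the plotting calls above:

# correlation of each predictor with pressure, rounded as in the plot titles
round(sapply(airfoil[, c("frequency", "angle", "length", "velocity", "thickness")],
             function(x) cor(x, airfoil$pressure)), 3)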

Classification and Data Resampling

airfoil2 = transform(airfoil, class=cut(pressure, c(0,120,125,130,Inf)))
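
As a quick optional check before fitting any classifier, the class frequencies can be tabulated (output not shown):

table(airfoil2$class) # number of observations in each pressure class
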
  2. For the training and test sets, compute the confusion matrix and misclassification rate, using, respectively:

i. Linear discriminant analysis
library(MASS) # provides lda()
r = lda(class ~ frequency + angle + length + velocity + thickness, data = airfoil2[1:500,])
p = predict(r, newdata=airfoil2[-(1:500),])
yhat = p$class
table(airfoil2[-(1:500),]$class, yhat)  # confusion table (test)
##            yhat
##             (0,120] (120,125] (125,130] (130,Inf]
##   (0,120]       185        23        18        12
##   (120,125]      48        40       130        22
##   (125,130]      13        32       152        73
##   (130,Inf]       6        10        94       145
mean(airfoil2[-(1:500),]$class == yhat) # classification accuracy (test)
## [1] 0.5204387
p = predict(r, newdata=airfoil2[1:500,])
yhat = p$class
table(airfoil2[1:500,]$class, yhat)  # confusion table (training)
##            yhat
##             (0,120] (120,125] (125,130] (130,Inf]
##   (0,120]        97        14         9         2
##   (120,125]      18        17        44        13
##   (125,130]      10        15        99        42
##   (130,Inf]       0         3        53        64
mean(airfoil2[1:500,]$class == yhat) # classification accuracy (training)
## [1] 0.554
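
Note that mean(... == yhat) is the classification accuracy, not the misclassification rate; the two sum to 1. A small helper (the name misclass is hypothetical) makes this explicit and could be reused for all classifiers below:

# hypothetical helper: proportion of misclassified observations (1 - accuracy)
misclass <- function(truth, pred) mean(truth != pred)
# usage: misclass(airfoil2[-(1:500),]$class, predict(r, newdata = airfoil2[-(1:500),])$class)
# for the LDA test predictions above this equals 1 - 0.5204387, i.e. about 0.48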

ii. The Naive Bayes method

library(e1071) # provides naiveBayes()
r = naiveBayes(class ~ frequency + angle + length + velocity + thickness, data = airfoil2[1:500,])
yhat = predict(r, newdata=airfoil2[-(1:500),])
table(airfoil2[-(1:500),]$class, yhat) # confusion table (test)
##            yhat
##             (0,120] (120,125] (125,130] (130,Inf]
##   (0,120]       128        19        65        26
##   (120,125]      53        32       105        50
##   (125,130]      22        23       117       108
##   (130,Inf]      23        10        46       176
mean(airfoil2[-(1:500),]$class == yhat) # classification accuracy (test)
## [1] 0.4516451
yhat = predict(r, newdata=airfoil2[1:500,])
table(airfoil2[1:500,]$class, yhat) # confusion table (training)
##            yhat
##             (0,120] (120,125] (125,130] (130,Inf]
##   (0,120]        81         6        25        10
##   (120,125]      18        14        35        25
##   (125,130]      24         7        84        51
##   (130,Inf]       7         8        25        80
mean(airfoil2[1:500,]$class == yhat) # classification accuracy (training)
## [1] 0.518
  3. 𝐾-nearest-neighbour
library(class) # provides knn()
X = scale(model.matrix(class ~ . - pressure, data=airfoil2)[,-1])   # standardised design matrix
pe_train = numeric(20)
pe_test = numeric(20)
for(k in 1:20) {
    yhat = knn(train=X[1:500,], test=X[1:500,], cl=airfoil2[1:500,7], k=k) # prediction
    pe_train[k] = mean(yhat != airfoil2[1:500,7]) # misclassification rate for training data
}
pe_train
##  [1] 0.000 0.176 0.198 0.250 0.284 0.314 0.328 0.332 0.340 0.376 0.346 0.382
## [13] 0.354 0.370 0.374 0.364 0.388 0.380 0.398 0.412
for(k in 1:20) {
    yhat = knn(train=X[1:500,], test=X[-(1:500),], cl=airfoil2[1:500,7], k=k) # prediction
    pe_test[k] = mean(yhat != airfoil2[-(1:500),7]) # misclassification rate for test data
}
pe_test
##  [1] 0.3998006 0.4875374 0.4715852 0.4765703 0.4476570 0.4695912 0.4925224
##  [8] 0.4765703 0.4845464 0.4805583 0.4755733 0.4735793 0.4805583 0.4875374
## [15] 0.4915254 0.4915254 0.4825523 0.4885344 0.4905284 0.4935194
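
For reference, the value of K minimising each of the curves above can be read off directly; based on the values printed above, both are minimised at K = 1:

which.min(pe_train) # the training error is exactly 0 at K = 1, as expected for 1-NN evaluated on its own training data
which.min(pe_test)  # for this single train/test split the test error is also smallest at K = 1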

  4. Use 10-fold cross-validation (CV), with 20 repetitions, to compute the CV misclassification rates of KNN for 𝐾 (= 1, 2, …, 20) over the training set.

set.seed(769) # set a random seed
n = nrow(airfoil2) # total number of observations
R = 20
M = 10
K = 20
pe = matrix(nrow=R*M, ncol=K)

test.set = function(i, n, K=10)
  (round((i-1)/K*n)+1):round(i/K*n)

for(i in 1:R) { # for each repetition
  ind = sample(n)
  for(j in 1:M) { # for each fold
    index = ind[test.set(j, n, M)]
    for(k in 1:K) { # for each k nearest neighbours (method)
      yhat = knn(train=X[-index,], test=X[index,], cl=airfoil2[-index,7], k=k) # prediction
      pe[M*(i-1)+j,k] = mean(yhat != airfoil2[index,7]) # misclassification rate for test data
    }
  }
}
pe2 = colMeans(pe)

plot(1:K, pe2, type = "b", col = "green", ylim = c(0, 0.5),
     xlab = "K", ylab = "Misclassification Rate", main = "CV vs Train vs Test Misclassification Rates")
lines(1:K, pe_train, type = "b", col = "blue", pch = 2)
lines(1:K, pe_test, type = "b", col = "red", pch =3)
legend("bottomright", legend = c("CV (10-fold, 20 reps)", "Training", "Test"),
       col = c("green", "blue", "red"), lty = 1, pch = 1:3)

(k.optimal = which.min(pe2))
## [1] 12

Because the same random partition (the permutation ind drawn in each repetition) is reused for every value of K, all candidate values of K are trained and validated on identical data splits. This paired comparison improves the reproducibility and stability of the results and makes the evaluation of model performance more reliable. Based on the repeated-CV misclassification rates, the appropriate value of K for this data set is 12.

  5. Use the Jackknifing technique (with 90% of the data for training and 10% for validation) to find an appropriate value of 𝐾 (= 1, 2, …, 20) for KNN. Use R = 200 repetitions.
n = nrow(airfoil2) # total number of observations
n2 = round(n * 0.10) # size of test set
K = 20 # largest K-value in comparison
R = 200 # number of repetitions
pe = matrix(nrow=R, ncol=K) # pre-allocate space for prediction errors

set.seed(769) # set a random seed
for(j in 1:K) {
  for(i in 1:R) {
    index = sample(n, n2)
    yhat = knn(train=X[-index,], test=X[index,], cl=airfoil2[-index,7], k=j) # prediction
    pe[i,j] = mean(yhat != airfoil2[index,7]) # misclassification rate for test data
  }
}
# head(pe)
pe3 = colMeans(pe)
plot(1:K, pe3, type="o", xlab="K", ylab="Misclassification rate")

(k.optimal = which.min(pe3))
## [1] 1
  6. Rewrite/reorganise your code so that each repetition can be carried out independently. Perform the Jackknifing selection of 𝐾 (= 1, 2, …, 20) using parallel computing, with the function mclapply(). Compare the running times with 1, 5, 10 or 20 cores.
library(parallel) # provides mclapply() and detectCores()

n = nrow(airfoil2) # total number of observations
n2 = round(n * 0.10) # size of test set
K = 20 # largest K-value in comparison
R = 200 # number of repetitions

# Perform a single Jackknifing repetition for a given k and return its misclassification rate
jackknife_knn_single <- function(k, n2, X, y) {
  index = sample(n, n2)  # sample indices for the validation set
  yhat = knn(train = X[-index,], test = X[index,], cl = y[-index,7], k = k)  # predictions (column 7 holds the class labels)
  mean(yhat != y[index,7])  # misclassification rate for this repetition
}

# Average the misclassification rate over R independent repetitions for a given k
jackknife_knn <- function(k, n2, X, y, R) {
  pe = sapply(1:R, function(i) jackknife_knn_single(k, n2, X, y))
  return(mean(pe))  # average misclassification rate for this k
}

# Set the number of cores for testing
core_counts <- c(1, 5, 10, 20)

# Initialize a named list (indexed by core count) to store timing results
timing_results <- vector("list", length(core_counts))
names(timing_results) <- core_counts

# Measure execution time for different core counts
for (core_count in core_counts) {
  start_time <- Sys.time()
  
  # Perform parallel computation for all K values
  pe5 <- mclapply(1:K, function(k) jackknife_knn(k, n2, X, airfoil2, R), mc.cores = core_count)
  
  end_time <- Sys.time()
  elapsed_time <- end_time - start_time
  
  # Store results in the timing_results list
  timing_results[[as.character(core_count)]] <- list(time = elapsed_time, errors = unlist(pe5))
}
timing_results$'1'$time
## Time difference of 2.171037 secs
timing_results$'5'$time
## Time difference of 1.31409 secs
timing_results$'10'$time
## Time difference of 0.8627286 secs
timing_results$'20'$time
## Time difference of 0.7211881 secs
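
To summarise the comparison, the elapsed times can be collected into a small table together with the speed-up relative to a single core; a sketch, assuming the timing_results list built above:

# collect elapsed times (in seconds) and compute the speed-up relative to 1 core
times <- sapply(as.character(core_counts), function(cc) as.numeric(timing_results[[cc]]$time, units = "secs"))
data.frame(cores = core_counts, seconds = round(times, 3), speedup = round(times[1] / times, 2))

From the times printed above, 20 cores gives roughly a 3-fold speed-up over a single core (2.17 s vs 0.72 s).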

Regression Trees

  7. Fit an unpruned regression tree with rpart (using cp = 0.005) to the airfoil data and plot it (as clearly as you can, by adjusting the dimensions of the diagram in your R Markdown file). Identify the most important split in the unpruned tree, and explain why it is the most important.
library(rpart) # provides rpart() and prune()
set.seed(769)
r = rpart(pressure ~ ., data = airfoil[1:500,], cp = 0.005)
plot(r)
text(r, pretty=0)
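
If a clearer diagram is wanted than the base plot()/text() output, the rpart.plot package (assumed to be installed) produces a more readable layout; a minimal sketch:

library(rpart.plot) # nicer rendering of rpart trees
rpart.plot(r, type = 2, extra = 101, cex = 0.6) # node means together with n and % of observations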

r$variable.importance
##  thickness  frequency      angle     length   velocity 
## 10337.0910  8944.2277  4455.6268  2967.8959   857.2876

The most important split in the unpruned tree is thickness >= 0.00252473 at node 2 (within the frequency >= 2825 branch). In the default rpart plot the length of each branch is proportional to the reduction in deviance achieved by the split, and this split has the longest branch: it reduces the deviance by 11308.01 − (3321.93 + 2659.71) ≈ 5326.4, more than any other split, including the root split on frequency >= 2825 (≈ 4338.6). The next-longest branches belong to the root split on frequency >= 2825 and the split on length >= 0.0762.

  8. Find the solutions to the following questions from the unpruned tree (without using code).

i. What is the variation (TSS) reduction by the split at the root node?
r
## n= 500 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##   1) root 500 24020.36000 124.8536  
##     2) frequency>=2825 173 11308.01000 120.8037  
##       4) thickness>=0.00252473 104  3321.93200 116.2841  
##         8) thickness>=0.03530115 14   162.17540 109.8151 *
##         9) thickness< 0.03530115 90  2482.75100 117.2904  
##          18) frequency>=4500 53   776.65210 114.7588  
##            36) frequency>=9000 14   115.17630 111.4355 *
##            37) frequency< 9000 39   451.35380 115.9517  
##              74) thickness>=0.00467589 19   167.17730 114.1257 *
##              75) thickness< 0.00467589 20   160.64070 117.6864 *
##          19) frequency< 4500 37   879.82580 120.9168  
##            38) thickness>=0.00590279 19   271.18550 118.6622 *
##            39) thickness< 0.00590279 18   410.09830 123.2968 *
##       5) thickness< 0.00252473 69  2659.70500 127.6159  
##        10) length>=0.0762 31   736.14810 123.5234  
##          20) frequency>=5650 15   167.41440 119.5517 *
##          21) frequency< 5650 16   110.27800 127.2469 *
##        11) length< 0.0762 38   980.79950 130.9545  
##          22) frequency>=9000 13   132.66650 125.3808 *
##          23) frequency< 9000 25   234.27640 133.8528 *
##     3) frequency< 2825 327  8373.79300 126.9962  
##       6) thickness>=0.03530115 34  1202.61700 122.2866  
##        12) frequency>=565 19   347.39410 118.4038 *
##        13) frequency< 565 15   205.96210 127.2047 *
##       7) thickness< 0.03530115 293  6329.54100 127.5427  
##        14) angle>=14.05 39  1111.78300 124.5426  
##          28) frequency< 450 12   211.18000 119.6611 *
##          29) frequency>=450 27   487.56960 126.7121 *
##        15) angle< 14.05 254  4812.83500 128.0033  
##          30) velocity< 47.55 127  2636.10600 126.9384  
##            60) frequency>=1425 37  1032.23700 124.8109  
##             120) thickness>=0.00252473 29   504.96540 122.8911  
##               240) length>=0.127 19   246.71740 121.2292 *
##               241) length< 0.127 10   106.06800 126.0487 *
##             121) thickness< 0.00252473 8    32.92794 131.7702 *
##            61) frequency< 1425 90  1367.54900 127.8131  
##             122) frequency< 450 29   639.59920 126.0020  
##               244) thickness< 0.005975525 13   112.73750 121.5685 *
##               245) thickness>=0.005975525 16    63.72703 129.6042 *
##             123) frequency>=450 61   587.61190 128.6740 *
##          31) velocity>=47.55 127  1888.69000 129.0682  
##            62) angle< 5.05 73   576.77080 127.9411 *
##            63) angle>=5.05 54  1093.82400 130.5919 *
24020.36 - 11308.01 - 8373.793 # root deviance minus the deviances of its two children
## [1] 4338.557

ii. What is the predicted response value by the tree for the following observation? 118.4

iii. In which hyper-rectangular region is the mean of pressure the smallest? frequency >= 2825 & thickness >= 0.0353

  9. Find the pruned tree, using the 1-SE rule, and plot it.

# Use 1-SE rule
imin = which.min(r$cptable[, "xerror"]) # row number with minimum CV error
ise1 = which(r$cptable[, "xerror"] < r$cptable[imin, "xerror"] + r$cptable[imin,"xstd"])[1] # row number selected by 1-SE rule
cp2 = r$cptable[ise1, 1] # complexity parameter value
r2 = prune(r, cp=cp2) # pruned tree
plot(r2)
text(r2, pretty=0)

r2$variable.importance
## thickness frequency     angle    length  velocity 
## 9822.0883 8645.1606 3942.2422 2604.8268  804.6916
# training
# unpruned tree
yhat = predict(r, airfoil[1:500,])
mean((yhat - airfoil[1:500,]$pressure)^2)
## [1] 12.00716
# pruned tree
yhat = predict(r2, airfoil[1:500,])
mean((yhat - airfoil[1:500,]$pressure)^2)
## [1] 13.81211
# test
# unpruned tree
yhat = predict(r, airfoil[-(1:500),])
mean((yhat - airfoil[-(1:500),]$pressure)^2)
## [1] 18.81595
# pruned tree
yhat = predict(r2, airfoil[-(1:500),])
mean((yhat - airfoil[-(1:500),]$pressure)^2) 
## [1] 20.08011
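
The four MSE calculations above repeat the same pattern; a small helper (the name tree_mse is hypothetical) is one way to express this more compactly:

# hypothetical helper: mean squared error of a fitted tree on a subset of the airfoil data
tree_mse <- function(fit, data) mean((predict(fit, data) - data$pressure)^2)
tree_mse(r,  airfoil[1:500,])     # unpruned tree, training set
tree_mse(r2, airfoil[1:500,])     # pruned tree, training set
tree_mse(r,  airfoil[-(1:500),])  # unpruned tree, test set
tree_mse(r2, airfoil[-(1:500),])  # pruned tree, test set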

Summary

In the scatter plots, most variables exhibit an approximately linear relationship with pressure. Except for velocity, all variables are negatively correlated with pressure; frequency has the strongest (negative) correlation, while velocity shows the weakest.

For classification, pressure was first cut into four intervals and LDA, Naive Bayes and KNN classifiers were fitted. Ten-fold cross-validation, repeated 20 times with the same data splits reused for every candidate K, made the comparison reproducible and selected K = 12 as the best value. The Jackknifing technique (200 repetitions, with a 90%/10% training/validation split) selected K = 1 instead, a different choice from the cross-validated one. The Jackknifing selection was then parallelised with mclapply() using 1, 5, 10 and 20 cores; as expected, the running time decreased as more cores were used, with 20 cores being the fastest.

Finally, an unpruned regression tree was fitted with rpart (cp = 0.005) and visualised. The most important splits were thickness >= 0.0025, frequency >= 2825 and length >= 0.0762; in the rpart plot these have the longest branches because branch length is proportional to the reduction in deviance, indicating that they explain the most variation in pressure. The pruned tree obtained with the 1-SE rule had a slightly higher mean squared error than the unpruned tree on both the training data (13.8 vs 12.0) and the test data (20.1 vs 18.8).