library(MASS)      # lda()
library(e1071)     # naiveBayes()
library(class)     # knn()
library(rpart)     # rpart(), prune()
library(parallel)  # mclapply(), detectCores()
airfoil = read.csv("/course/data/2024-yong/lab05/airfoil.csv")
plot_scatter <- function(x, xname) {
  correlation <- round(cor(x, airfoil$pressure), 3)
  plot(x, airfoil$pressure, main = paste("Correlation:", correlation),
       xlab = xname, ylab = "Pressure", pch = 20)
  abline(lm(airfoil$pressure ~ x), col = "red", lwd = 1)     # add a linear fitted line
  lines(lowess(x, airfoil$pressure), col = "blue", lwd = 1)  # add a lowess fitted curve
}
plot_scatter(airfoil$frequency, "Frequency")
plot_scatter(airfoil$angle, "Angle")
plot_scatter(airfoil$length, "Length")
plot_scatter(airfoil$velocity, "Velocity")
plot_scatter(airfoil$thickness, "Thickness")
The scatter plots suggest that most variables have a roughly linear relationship with
pressure. Apart from velocity, all of the variables are negatively correlated with
pressure. Among them, frequency has the strongest (negative) correlation with pressure,
while velocity has the weakest; the correlations can be checked numerically as in the
sketch below.
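As a quick numerical check of these claims, the correlation of each predictor with pressure can be computed directly (a minimal sketch; it assumes the five predictor columns are named as above):
predictors = c("frequency", "angle", "length", "velocity", "thickness")
round(sapply(airfoil[predictors], cor, y = airfoil$pressure), 3)  # correlation of each predictor with pressure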
airfoil2 = transform(airfoil, class=cut(pressure, c(0,120,125,130,Inf)))
r = lda(class ~ frequency + angle + length + velocity + thickness, data = airfoil2[1:500,])
p = predict(r, newdata=airfoil2[-(1:500),])
yhat = p$class
table(airfoil2[-(1:500),]$class, yhat) # confusion table (test)
## yhat
## (0,120] (120,125] (125,130] (130,Inf]
## (0,120] 185 23 18 12
## (120,125] 48 40 130 22
## (125,130] 13 32 152 73
## (130,Inf] 6 10 94 145
mean(airfoil2[-(1:500),]$class == yhat) # classification accuracy (test)
## [1] 0.5204387
p = predict(r, newdata=airfoil2[1:500,])
yhat = p$class
table(airfoil2[1:500,]$class, yhat) # confusion table (training)
## yhat
## (0,120] (120,125] (125,130] (130,Inf]
## (0,120] 97 14 9 2
## (120,125] 18 17 44 13
## (125,130] 10 15 99 42
## (130,Inf] 0 3 53 64
mean(airfoil2[1:500,]$class == yhat) # classification accuracy (training)
## [1] 0.554
ii. The Naive Bayes method
r = naiveBayes(class ~ frequency + angle + length + velocity + thickness, data = airfoil2[1:500,])
yhat = predict(r, newdata=airfoil2[-(1:500),])
table(airfoil2[-(1:500),]$class, yhat) # confusion table (test)
## yhat
## (0,120] (120,125] (125,130] (130,Inf]
## (0,120] 128 19 65 26
## (120,125] 53 32 105 50
## (125,130] 22 23 117 108
## (130,Inf] 23 10 46 176
mean(airfoil2[-(1:500),]$class == yhat) # classification accuracy (test)
## [1] 0.4516451
yhat = predict(r, newdata=airfoil2[1:500,])
table(airfoil2[1:500,]$class, yhat) # confusion table (training)
## yhat
## (0,120] (120,125] (125,130] (130,Inf]
## (0,120] 81 6 25 10
## (120,125] 18 14 35 25
## (125,130] 24 7 84 51
## (130,Inf] 7 8 25 80
mean(airfoil2[1:500,]$class == yhat) # classification accuracy (training)
## [1] 0.518
X = scale(model.matrix(class ~ . - pressure, data=airfoil2)[,-1]) # standardised design matrix
pe_train = numeric(20)
pe_test = numeric(20)
for(k in 1:20) {
  yhat = knn(train=X[1:500,], test=X[1:500,], cl=airfoil2[1:500,7], k=k) # prediction
  pe_train[k] = mean(yhat != airfoil2[1:500,7]) # misclassification rate for training data
}
pe_train
## [1] 0.000 0.176 0.198 0.250 0.284 0.314 0.328 0.332 0.340 0.376 0.346 0.382
## [13] 0.354 0.370 0.374 0.364 0.388 0.380 0.398 0.412
for(k in 1:20) {
  yhat = knn(train=X[1:500,], test=X[-(1:500),], cl=airfoil2[1:500,7], k=k) # prediction
  pe_test[k] = mean(yhat != airfoil2[-(1:500),7]) # misclassification rate for test data
}
pe_test
## [1] 0.3998006 0.4875374 0.4715852 0.4765703 0.4476570 0.4695912 0.4925224
## [8] 0.4765703 0.4845464 0.4805583 0.4755733 0.4735793 0.4805583 0.4875374
## [15] 0.4915254 0.4915254 0.4825523 0.4885344 0.4905284 0.4935194
4. Use 10-fold cross-validation (CV), with 20 repetitions, to compute the CV misclassification rates of KNN for K = 1, 2, ..., 20 over the training set.
set.seed(769) # set a random seed
n = nrow(airfoil2) # total number of observations
R = 20
M = 10
K = 20
pe = matrix(nrow=R*M, ncol=K)
test.set = function(i, n, K=10)   # indices of the i-th of K consecutive folds of 1:n
  (round((i-1)/K*n)+1):round(i/K*n)
for(i in 1:R) {               # for each repetition
  ind = sample(n)             # random permutation of the observations
  for(j in 1:M) {             # for each fold
    index = ind[test.set(j, n, M)]
    for(k in 1:K) {           # for each number of nearest neighbours
      yhat = knn(train=X[-index,], test=X[index,], cl=airfoil2[-index,7], k=k) # prediction
      pe[M*(i-1)+j,k] = mean(yhat != airfoil2[index,7]) # misclassification rate for held-out fold
    }
  }
}
pe2 = colMeans(pe)
plot(1:K, pe2, type = "b", col = "green", ylim = c(0, 0.5),
xlab = "K", ylab = "Misclassification Rate", main = "CV vs Train vs Test Misclassification Rates")
lines(1:K, pe_train, type = "b", col = "blue", pch = 2)
lines(1:K, pe_test, type = "b", col = "red", pch =3)
legend("bottomright", legend = c("CV (10-fold, 20 reps)", "Training", "Test"),
col = c("green", "blue", "red"), lty = 1, pch = 1:3)
(k.optimal = which.min(pe2))
## [1] 12
Using the same data splits for every value of K (within each repetition the fold assignments are generated once and reused for all K, under a fixed random seed) enhances the reproducibility and stability of the results, making the evaluation of model performance more reliable; a small sketch of this idea is given below. Based on the CV curve, the appropriate value of K for this data set is 12, where the CV misclassification rate is smallest.
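A minimal sketch of the shared-splits idea (an illustration only, reusing the objects n, R and M defined above; it is not part of the reported results):
set.seed(769)
splits = replicate(R, sample(n), simplify = FALSE)  # one permutation per repetition, reused for every K
# fold j of repetition i is then splits[[i]][test.set(j, n, M)] for all methods and all K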
n = nrow(airfoil2) # total number of observations
n2 = round(n * 0.10) # size of test set
K = 20 # largest K-value in comparison
R = 200 # number of repetitions
pe = matrix(nrow=R, ncol=K) # pre-allocate space for prediction errors
set.seed(769) # set a random seed
for(j in 1:K) {                # for each number of nearest neighbours
  for(i in 1:R) {              # for each repetition
    index = sample(n, n2)      # random 10% validation set
    yhat = knn(train=X[-index,], test=X[index,], cl=airfoil2[-index,7], k=j) # prediction
    pe[i,j] = mean(yhat != airfoil2[index,7]) # misclassification rate for validation data
  }
}
# head(pe)
pe3 = colMeans(pe)
plot(1:K, pe3, type="o", xlab="K", ylab="Misclassification rate")
(k.optimal = which.min(pe3))
## [1] 1
n = nrow(airfoil2) # total number of observations
n2 = round(n * 0.10) # size of test set
K = 20 # largest K-value in comparison
R = 200 # number of repetitions
# Function to perform a single random 90%/10% split and return its error rate
jackknife_knn_single <- function(k, n2, X, y) {
  index = sample(n, n2)                               # sample indices for the validation set
  yhat = knn(train = X[-index,], test = X[index,],
             cl = y[-index,7], k = k)                 # make predictions
  mean(yhat != y[index,7])                            # misclassification rate for this split
}
# Function to perform Jackknifing over multiple repetitions
jackknife_knn <- function(k, n2, X, y, R) {
  # Perform parallel computation for all repetitions
  pe = mclapply(1:R, function(i) jackknife_knn_single(k, n2, X, y), mc.cores = detectCores() - 1)
  return(mean(unlist(pe)))   # Return average misclassification rate for this k
}
# Set the number of cores for testing
core_counts <- c(1, 5, 10, 20)
# Initialize a list to store timing results
timing_results <- list()   # results are stored by core count (as named elements)
# Measure execution time for different core counts
for (core_count in core_counts) {
  start_time <- Sys.time()
  # Perform parallel computation for all K values
  pe5 <- mclapply(1:K, function(k) jackknife_knn(k, n2, X, airfoil2, R), mc.cores = core_count)
  end_time <- Sys.time()
  elapsed_time <- end_time - start_time
  # Store results in the timing_results list
  timing_results[[as.character(core_count)]] <- list(time = elapsed_time, errors = unlist(pe5))
}
timing_results$'1'$time
## Time difference of 2.171037 secs
timing_results$'5'$time
## Time difference of 1.31409 secs
timing_results$'10'$time
## Time difference of 0.8627286 secs
timing_results$'20'$time
## Time difference of 0.7211881 secs
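To quantify the benefit of parallelisation, the speedup relative to the single-core run can be computed from the recorded times (a small sketch using the timing_results list built above):
base = as.numeric(timing_results$'1'$time, units = "secs")
sapply(core_counts, function(cc)
  base / as.numeric(timing_results[[as.character(cc)]]$time, units = "secs"))  # speedup over 1 core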
set.seed(769)
r = rpart(pressure ~ ., data = airfoil[1:500,], cp = 0.005)
plot(r)
text(r, pretty=0)
r$variable.importance
## thickness frequency angle length velocity
## 10337.0910 8944.2277 4455.6268 2967.8959 857.2876
The most important splits in the unpruned tree are thickness >= 0.002525, frequency >= 2825 and length >= 0.0762, because they correspond to the longest branches in the plotted tree: by default plot(r) draws branch lengths proportional to the reduction in deviance, so the longest branches mark the splits that explain the most variation in pressure. A sketch for extracting these deviance reductions directly is given below.
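As a check, the deviance reduction attributed to each split can be read off r$frame (a minimal sketch, assuming the fitted tree is the object r above; rpart numbers the children of node i as 2i and 2i+1):
fr = r$frame
id = as.integer(rownames(fr))                    # rpart node numbers
internal = which(fr$var != "<leaf>")             # internal (splitting) nodes
drop = sapply(internal, function(i) {
  kids = match(c(2 * id[i], 2 * id[i] + 1), id)  # rows of the two child nodes
  fr$dev[i] - sum(fr$dev[kids])                  # deviance reduction at this split
})
names(drop) = paste0(fr$var[internal], " (node ", id[internal], ")")
head(sort(drop, decreasing = TRUE))              # largest reductions first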
r
## n= 500
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 500 24020.36000 124.8536
## 2) frequency>=2825 173 11308.01000 120.8037
## 4) thickness>=0.00252473 104 3321.93200 116.2841
## 8) thickness>=0.03530115 14 162.17540 109.8151 *
## 9) thickness< 0.03530115 90 2482.75100 117.2904
## 18) frequency>=4500 53 776.65210 114.7588
## 36) frequency>=9000 14 115.17630 111.4355 *
## 37) frequency< 9000 39 451.35380 115.9517
## 74) thickness>=0.00467589 19 167.17730 114.1257 *
## 75) thickness< 0.00467589 20 160.64070 117.6864 *
## 19) frequency< 4500 37 879.82580 120.9168
## 38) thickness>=0.00590279 19 271.18550 118.6622 *
## 39) thickness< 0.00590279 18 410.09830 123.2968 *
## 5) thickness< 0.00252473 69 2659.70500 127.6159
## 10) length>=0.0762 31 736.14810 123.5234
## 20) frequency>=5650 15 167.41440 119.5517 *
## 21) frequency< 5650 16 110.27800 127.2469 *
## 11) length< 0.0762 38 980.79950 130.9545
## 22) frequency>=9000 13 132.66650 125.3808 *
## 23) frequency< 9000 25 234.27640 133.8528 *
## 3) frequency< 2825 327 8373.79300 126.9962
## 6) thickness>=0.03530115 34 1202.61700 122.2866
## 12) frequency>=565 19 347.39410 118.4038 *
## 13) frequency< 565 15 205.96210 127.2047 *
## 7) thickness< 0.03530115 293 6329.54100 127.5427
## 14) angle>=14.05 39 1111.78300 124.5426
## 28) frequency< 450 12 211.18000 119.6611 *
## 29) frequency>=450 27 487.56960 126.7121 *
## 15) angle< 14.05 254 4812.83500 128.0033
## 30) velocity< 47.55 127 2636.10600 126.9384
## 60) frequency>=1425 37 1032.23700 124.8109
## 120) thickness>=0.00252473 29 504.96540 122.8911
## 240) length>=0.127 19 246.71740 121.2292 *
## 241) length< 0.127 10 106.06800 126.0487 *
## 121) thickness< 0.00252473 8 32.92794 131.7702 *
## 61) frequency< 1425 90 1367.54900 127.8131
## 122) frequency< 450 29 639.59920 126.0020
## 244) thickness< 0.005975525 13 112.73750 121.5685 *
## 245) thickness>=0.005975525 16 63.72703 129.6042 *
## 123) frequency>=450 61 587.61190 128.6740 *
## 31) velocity>=47.55 127 1888.69000 129.0682
## 62) angle< 5.05 73 576.77080 127.9411 *
## 63) angle>=5.05 54 1093.82400 130.5919 *
24020.36 - 11308.01 - 8373.793 # deviance reduction achieved by the first split (frequency >= 2825)
## [1] 4338.557
ii. What is the response value predicted by the tree for the following observation? 118.4 (obtained by running the observation down the fitted tree; a sketch of the corresponding predict() call is given below).
iii. In which hyper-rectangular region is the mean of pressure the smallest? frequency>=2825 & thickness>=0.0353, whose terminal node has the smallest fitted value (109.8).
9. Find the pruned tree, using the 1-SE rule, and plot it.
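A sketch of how the prediction in (ii) would be obtained (the observation below is a hypothetical placeholder, not the one given in the question):
newobs = data.frame(frequency = 3000, angle = 5, length = 0.1,
                    velocity = 40, thickness = 0.005)  # hypothetical values for illustration only
predict(r, newdata = newobs)  # drop the observation down the fitted tree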
# Use 1-SE rule
imin = which.min(r$cptable[, "xerror"]) # row number with minimum CV error
ise1 = which(r$cptable[, "xerror"] < r$cptable[imin, "xerror"] + r$cptable[imin,"xstd"])[1] # row number selected by 1-SE rule
cp2 = r$cptable[ise1, 1] # complexity parameter value
r2 = prune(r, cp=cp2) # pruned tree
plot(r2)
text(r2, pretty=0)
r2$variable.importance
## thickness frequency angle length velocity
## 9822.0883 8645.1606 3942.2422 2604.8268 804.6916
# training
# unpruned tree
yhat = predict(r, airfoil[1:500,])
mean((yhat - airfoil[1:500,]$pressure)^2)
## [1] 12.00716
# pruned tree
yhat = predict(r2, airfoil[1:500,])
mean((yhat - airfoil[1:500,]$pressure)^2)
## [1] 13.81211
# test
# unpruned tree
yhat = predict(r, airfoil[-(1:500),])
mean((yhat - airfoil[-(1:500),]$pressure)^2)
## [1] 18.81595
# pruned tree
yhat = predict(r2, airfoil[-(1:500),])
mean((yhat - airfoil[-(1:500),]$pressure)^2)
## [1] 20.08011
In the analysis of the scatter plots, most variables exhibit a roughly linear relationship with pressure. Except for velocity, all variables are negatively correlated with pressure; frequency has the strongest negative correlation, while velocity shows the weakest correlation.

For classification, pressure was first categorised into four intervals and LDA, Naive Bayes and KNN classifiers were compared. Using 10-fold cross-validation repeated 20 times, with the same data splits reused for every value of K, the comparison was made reproducible, and the best value was K = 12. The repeated random-subsampling ("jackknifing") procedure, with 200 repetitions and a 90% training / 10% validation split, selected K = 1 as the minimiser of the average validation error. The jackknifing procedure was parallelised with different core counts (1, 5, 10, 20) to compare execution times; as expected, increasing the number of cores reduced the computation time, with 20 cores being the fastest.

Finally, an unpruned regression tree was fitted with rpart and visualised. The most important splits were thickness >= 0.002525, frequency >= 2825 and length >= 0.0762; these correspond to the longest branches in the plotted tree, i.e. the largest reductions in deviance, indicating their strong explanatory power for pressure.