library(keras)  # needed for keras_model_sequential(), fit(), predict() etc. below
zip = read.csv("/course/data/2024-yong/lab07/zip.csv")
# 1.
digits <- zip[zip$digit %in% 0:9,]
digits[,-1] = (1 - digits[,-1])/2  # rescale pixels from [-1, 1] to [0, 1] for as.raster()
par(mar=rep(0,4), mfrow=c(10,10))  # 10x10 grid of images with no margins
for(i in 0:9) {
  samples <- sample(which(digits$digit == i), 10)  # 10 random examples of digit i
  for(j in samples) {
    matrix_image <- matrix(as.numeric(digits[j,2:257]), nrow=16, ncol=16, byrow=TRUE)
    plot(as.raster(matrix_image))
  }
}
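# A quick check of the rescaling above (optional, not part of the answer):
# the pixels arrive in [-1, 1], and (1 - x)/2 maps them onto the [0, 1]
# grayscale that as.raster() expects, e.g.
(1 - c(-1, 0, 1)) / 2  # gives 1.0 0.5 0.0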
# 2.
data <- zip[zip$digit %in% c(6,8), c("digit", "p75", "p91")]
data$digit <- ifelse(data$digit == 6, 0, 1)  # recode the labels: 6 -> 0, 8 -> 1
train = data[1:500,]
test = data[-(1:500),]
model = keras_model_sequential() |>
  layer_dense(units=3, activation="sigmoid", input_shape=2) |>  # 2 predictors, 3 units in the hidden layer
  layer_dense(units=1, activation="sigmoid")                    # 1 sigmoid unit for the 2-class output
model |> compile(loss="binary_crossentropy",
                 optimizer=optimizer_rmsprop(),
                 metrics=c("accuracy"))
xmat = as.matrix(train[,2:3])
ymat = as.matrix(train[,1])
system.time(history <- model |> fit(xmat, ymat, verbose=0, epochs=500, batch_size=32))
## user system elapsed
## 22.763 2.446 17.800
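# Optional extra (not part of the lab output): the history object returned
# by fit() records per-epoch loss and accuracy, and keras provides a plot()
# method for it, which is useful for checking convergence.
plot(history)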
yhat = model |> predict(xmat, verbose=0)
yhat = ifelse(yhat > 0.5, 1, 0)
mean(yhat != ymat) # training error
## [1] 0.1
yhat = model |> predict(as.matrix(test[,2:3]), verbose=0)
yhat = ifelse(yhat > 0.5, 1, 0)
mean(yhat != as.matrix(test[,1])) # test error
## [1] 0.08445298
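# Optional extra: a confusion table shows how the test errors split
# across the two digits (0 = "6", 1 = "8").
table(predicted = yhat, actual = test$digit)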
# 3.
model2 = keras_model_sequential() |>
  layer_dense(units=2, activation="sigmoid", input_shape=2) |>  # 2 predictors, 2 units in first hidden layer
  layer_dense(units=3, activation="sigmoid") |>                 # 3 units in second hidden layer
  layer_dense(units=1, activation="sigmoid")                    # 1 sigmoid unit for the 2-class output
model2 |> compile(loss="binary_crossentropy",
                  optimizer=optimizer_rmsprop(),
                  metrics=c("accuracy"))
xmat = as.matrix(train[,2:3])
ymat = as.matrix(train[,1])
system.time(history <- model2 |> fit(xmat, ymat, verbose=0, epochs=500, batch_size=32))
## user system elapsed
## 23.916 2.534 17.816
yhat = model2 |> predict(xmat, verbose=0)
yhat = ifelse(yhat > 0.5, 1, 0)
mean(yhat != ymat) # training error
## [1] 0.09
yhat = model2 |> predict(as.matrix(test[,2:3]), verbose=0)
yhat = ifelse(yhat > 0.5, 1, 0)
mean(yhat != as.matrix(test[,1])) # test error
## [1] 0.0815739
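# For comparison (not asked for): counting weights and biases shows how
# small these networks are; summary(model) and summary(model2) report the
# same totals.
c(model1 = 2*3 + 3 + 3*1 + 1,           # 13 parameters
  model2 = 2*2 + 2 + 2*3 + 3 + 3*1 + 1) # 19 parameters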
# grid setting
x_min <- min(train$p75) - 0.5
x_max <- max(train$p75) + 0.5
y_min <- min(train$p91) - 0.5
y_max <- max(train$p91) + 0.5
x_seq <- seq(x_min, x_max, length.out = 101)
y_seq <- seq(y_min, y_max, length.out = 101)
grid <- expand.grid(p75 = x_seq, p91 = y_seq)
grid_xmat <- as.matrix(grid)
yhat_model1 <- model |> predict(grid_xmat)
## 319/319 - 0s - 336ms/epoch - 1ms/step
z1 <- matrix(yhat_model1, nrow = 101)  # expand.grid() varies p75 fastest, so fill column-wise to match contour()
yhat_model2 <- model2 |> predict(grid_xmat)
## 319/319 - 0s - 339ms/epoch - 1ms/step
z2 <- matrix(yhat_model2, nrow = 101)  # column-wise fill, as for z1
col <- ifelse(train$digit == 1, "red", "blue")  # red = digit 8, blue = digit 6
# add jitter
train_jitter <- train
train_jitter$p75 <- jitter(train_jitter$p75, amount = 0.1)
train_jitter$p91 <- jitter(train_jitter$p91, amount = 0.1)
# Model 1
plot(train_jitter$p75, train_jitter$p91, col = col, pch = 19,
     xlab = "p75", ylab = "p91", main = "Model 1 Decision Boundaries", asp = 1)
contour(x_seq, y_seq, z1, levels = c(0.1, 0.5, 0.9), lwd = c(1, 2, 1),
        lty = c(3, 1, 3), drawlabels = FALSE, add = TRUE)
# Model 2
plot(train_jitter$p75, train_jitter$p91, col = col, pch = 19,
     xlab = "p75", ylab = "p91", main = "Model 2 Decision Boundaries", asp = 1)
contour(x_seq, y_seq, z2, levels = c(0.1, 0.5, 0.9), lwd = c(1, 2, 1),
        lty = c(3, 1, 3), drawlabels = FALSE, add = TRUE)
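# A legend could be added after each plot() call (optional, using the
# colour coding defined above):
legend("topright", legend = c("digit 8", "digit 6"), col = c("red", "blue"), pch = 19)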
# 6.
## CNN model
model3 = keras_model_sequential() |>
  layer_conv_2d(filters=16, kernel_size=c(3,3), padding="same", activation="relu", input_shape=c(16,16,1)) |>
  layer_max_pooling_2d(pool_size=c(2,2)) |>   # 16x16 -> 8x8
  layer_conv_2d(filters=64, kernel_size=c(3,3), padding="same", activation="relu") |>
  layer_max_pooling_2d(pool_size=c(2,2)) |>   # 8x8 -> 4x4
  layer_flatten() |>                          # 4*4*64 = 1024 features
  layer_dropout(rate=0.5) |>
  layer_dense(units=128, activation="relu") |>
  layer_dense(units=1, activation="sigmoid")  # binary output: 6 vs 8
model3 |> compile(loss="binary_crossentropy",
                  optimizer=optimizer_rmsprop(),
                  metrics=c("accuracy"))
data <- zip[zip$digit %in% c(6,8),]
data$digit <- ifelse(data$digit == 6, 0, 1)
xmat = as.matrix(data[1:500,-1])
ymat = as.matrix(data[1:500,1])
xmat = array_reshape(xmat,c(nrow(xmat),16,16,1))  # one 16x16x1 image per row
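# Note: array_reshape() fills in row-major (C) order, so each row of 256
# pixel values becomes a 16x16 image in the intended orientation; a plain
# dim<-() assignment would fill column-major and scramble the images.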
system.time(history <- model3 |> fit(xmat, ymat, verbose=0, epochs=100, batch_size=32, validation_split=0.3))
## user system elapsed
## 40.626 4.990 13.869
yhat = model3 |> predict(xmat, verbose=0)
yhat = ifelse(yhat > 0.5, 1, 0)
mean(yhat != ymat) # training error
## [1] 0
xtest = as.matrix(data[-(1:500),-1])
xtest = array_reshape(xtest,c(nrow(xtest),16,16,1))
yhat = model3 |> predict(xtest, verbose=0)
yhat = ifelse(yhat > 0.5, 1, 0)
mean(yhat != data[-(1:500), 1]) # test error
## [1] 0.003838772
# 7.
## CNN model
model4 = keras_model_sequential() |>
  layer_conv_2d(filters=16, kernel_size=c(3,3), padding="same", activation="relu", input_shape=c(16,16,1)) |>
  layer_max_pooling_2d(pool_size=c(2,2)) |>
  layer_conv_2d(filters=64, kernel_size=c(3,3), padding="same", activation="relu") |>
  layer_max_pooling_2d(pool_size=c(2,2)) |>
  layer_flatten() |>
  layer_dropout(rate=0.5) |>
  layer_dense(units=128, activation="relu") |>
  layer_dense(units=10, activation="softmax")  # one softmax unit per digit class
model4 |> compile(loss="categorical_crossentropy",
                  optimizer=optimizer_rmsprop(),
                  metrics=c("accuracy"))
data <- zip[zip$digit %in% 0:9,]
xmat = as.matrix(data[1:2500,-1])
ymat = as.matrix(data[1:2500,1])
ymat = to_categorical(ymat, num_classes = 10)  # one-hot encode the digit labels
xmat = array_reshape(xmat,c(nrow(xmat),16,16,1))
system.time(history <- model4 |> fit(xmat, ymat, verbose=0, epochs=100, batch_size=32, validation_split=0.3))
## user system elapsed
## 177.361 21.928 50.067
p1 <- model4 |> predict(xmat)
## 79/79 - 0s - 305ms/epoch - 4ms/step
p1 <- apply(p1, 1, which.max) - 1  # most probable class, shifted back to 0-9 labels
y <- apply(ymat, 1, which.max) - 1
mean(p1 != y) # training error
## [1] 0.0072
xtest = as.matrix(data[2501:3000,-1])
ytest = as.matrix(data[2501:3000, 1])
xtest = array_reshape(xtest, c(nrow(xtest), 16, 16, 1))
ytest = to_categorical(ytest, num_classes = 10)
p2 <- model4 |> predict(xtest)
## 16/16 - 0s - 73ms/epoch - 5ms/step
p2 <- apply(p2, 1, which.max) - 1
ytest <- apply(ytest, 1, which.max) - 1
mean(p2 != ytest) # test error
## [1] 0.018
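# Optional extra: the 10x10 confusion table shows which digits are
# misclassified as which on the test set.
table(predicted = p2, actual = ytest)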
# 8.
summary(model4)
## Model: "sequential_3"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## conv2d_3 (Conv2D) (None, 16, 16, 16) 160
## max_pooling2d_3 (MaxPooling2D) (None, 8, 8, 16) 0
## conv2d_2 (Conv2D) (None, 8, 8, 64) 9280
## max_pooling2d_2 (MaxPooling2D) (None, 4, 4, 64) 0
## flatten_1 (Flatten) (None, 1024) 0
## dropout_1 (Dropout) (None, 1024) 0
## dense_8 (Dense) (None, 128) 131200
## dense_7 (Dense) (None, 10) 1290
## ================================================================================
## Total params: 141,930
## Trainable params: 141,930
## Non-trainable params: 0
## ________________________________________________________________________________
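# The parameter counts above can be verified by hand (weights + biases):
c(conv2d = (3*3*1 + 1) * 16,     # 160
  conv2d_2 = (3*3*16 + 1) * 64,  # 9280
  dense = 1024*128 + 128,        # 131200 (flatten gives 4*4*64 = 1024 inputs)
  dense_2 = 128*10 + 10)         # 1290; total 141,930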
# 9.
callbacks_list = list(
  callback_early_stopping(monitor="val_accuracy", patience=10),  # stop once val_accuracy fails to improve for 10 epochs
  callback_model_checkpoint(filepath="my_model{epoch:03d}-loss{val_loss:.2f}.h5",
                            monitor="val_loss", save_best_only=TRUE)  # save a checkpoint whenever val_loss improves
)
xmat = as.matrix(data[1:2500,-1])
ymat = as.matrix(data[1:2500,1])
ymat = to_categorical(ymat, num_classes = 10)
xmat = array_reshape(xmat,c(nrow(xmat),16,16,1))
system.time(history <- model4 |> fit(xmat, ymat, verbose=0, epochs=100, batch_size=128, validation_split=0.3, callbacks=callbacks_list))
## user system elapsed
## 23.301 2.582 4.938
p1 <- model4 |> predict(xmat)
## 79/79 - 0s - 212ms/epoch - 3ms/step
p1 <- apply(p1, 1, which.max) - 1
y <- apply(ymat, 1, which.max) - 1
mean(p1 != y) # training error
## [1] 0.0072
xtest = as.matrix(data[2501:3000,-1])
ytest = as.matrix(data[2501:3000, 1])
xtest = array_reshape(xtest, c(nrow(xtest), 16, 16, 1))
ytest = to_categorical(ytest, num_classes = 10)
p2 <- model4 |> predict(xtest)
## 16/16 - 0s - 65ms/epoch - 4ms/step
p2 <- apply(p2, 1, which.max) - 1
y <- apply(ytest, 1, which.max) - 1
mean(p2 != y) # test error
## [1] 0.014
### compute the training and test errors of all saved models
# Find all saved model files
files <- list.files(pattern = "^my_model.*\\.h5$")  # only the checkpoints saved above
test_err <- double(length(files))
train_err <- double(length(files))
for(j in 1:length(files)) {
  model <- load_model_hdf5(files[j])  # load each saved model
  p1 <- model |> predict(xmat)        # predictions from the loaded checkpoint
  p1 <- apply(p1, 1, which.max) - 1
  y <- apply(ymat, 1, which.max) - 1
  train_err[j] <- mean(p1 != y)       # training error
}
## 79/79 - 0s - 225ms/epoch - 3ms/step
## 79/79 - 0s - 205ms/epoch - 3ms/step
## 79/79 - 0s - 204ms/epoch - 3ms/step
for(j in 1:length(files)) {
  model <- load_model_hdf5(files[j])  # load each saved model
  p2 <- model |> predict(xtest)       # again use the loaded checkpoint
  p2 <- apply(p2, 1, which.max) - 1
  yhat <- apply(ytest, 1, which.max) - 1
  test_err[j] <- mean(p2 != yhat)     # test error
}
## 16/16 - 0s - 63ms/epoch - 4ms/step
## 16/16 - 0s - 64ms/epoch - 4ms/step
## 16/16 - 0s - 64ms/epoch - 4ms/step
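# Before deleting the checkpoints, the single best model (lowest saved
# val_loss) could be reloaded by parsing the filenames; a sketch that
# assumes the naming pattern used above:
losses <- as.numeric(sub(".*loss([0-9.]+)\\.h5$", "\\1", files))
best_model <- load_model_hdf5(files[which.min(losses)])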
file.remove(files)
## [1] TRUE TRUE TRUE
data.frame(Train_Error = train_err,
           Test_Error = test_err,
           row.names = files)
##                         Train_Error Test_Error
## my_model001-loss0.22.h5      0.0072      0.014
## my_model009-loss0.21.h5      0.0072      0.014
## my_model017-loss0.20.h5      0.0072      0.014
First, a 10x10 image grid was generated, with each row showing ten handwritten examples of one digit, randomly selected from the zip code dataset. Using the two previously identified important features (p75 and p91), two neural networks were then trained to distinguish digits 6 and 8. The first network had a single hidden layer with three units, giving a training error of 0.1 and a test error of 0.0845; the second had two hidden layers (two and three units) and produced similar errors (0.09 and 0.0816). A scatter plot of the jittered training data, overlaid with contours of the fitted probabilities, showed the decision boundaries of the two networks and illustrated how each classifies points in the (p75, p91) plane.
A convolutional neural network was then designed for the same 6-versus-8 task, this time using all 256 pixel features, with 30% of the training data held out for validation. It performed far better, with a training error of 0 and a test error of 0.0038. Extended to classify all digits 0-9 with 2500 observations for training and validation, the CNN achieved a training error of 0.0072 and a test error of 0.018.
For the CNN model in Question 7, early stopping and model checkpointing callbacks were added to control training via the validation metrics. The final model had a training error of 0.0072 and a test error of 0.014, and early stopping cut the elapsed training time from about 50 seconds to about 5.