zip = read.csv("/course/data/2024-yong/lab07/zip.csv")

Digit Images

# 1.
onetoten <- zip[zip$digit %in% c(0:9),]
onetoten[,-1] = (1-onetoten[,-1])/2

par(mar=rep(0,4), mfrow=c(10,10))
for(i in 0:9) {
  samples <- sample(which(onetoten$digit == i), 10)
  for(j in samples) {
    matrix_image <- matrix(as.numeric(onetoten[j,2:257]), nrow=16, ncol=16, byrow=TRUE)
    plot(as.raster(matrix_image))
  }
}

# 2.
data <- zip[zip$digit %in% c(6,8), c("digit", "p75", "p91")]
data$digit <- factor(data$digit)
data$digit <- ifelse(data$digit==6, 0, 1)
train = data[1:500,]
test = data[-(1:500),]
model = keras_model_sequential() |>
  layer_dense(units=3, activation="sigmoid", input_shape=2) |>  # 2 predictors, 3 units in hidden layer
  layer_dense(units=1, activation="sigmoid")                    # 2 classes

model |> compile(loss="binary_crossentropy",
                 optimizer=optimizer_rmsprop(),
                 metrics=c("accuracy"))
xmat = as.matrix(train[,2:3])
ymat = as.matrix(train[,1])
system.time(history <- model |> fit(xmat, ymat, verbose=0, epochs=500, batch_size=32))
##    user  system elapsed 
##  22.763   2.446  17.800
  user  system elapsed 
 22.275   1.861  17.281  
yhat = model |> predict(xmat, verbose=0) 
yhat = ifelse(yhat > 0.5, 1, 0)
mean(yhat != ymat)                   # training error
## [1] 0.1
yhat = model |> predict(as.matrix(test[,2:3]), verbose=0) 
yhat = ifelse(yhat > 0.5, 1, 0)
mean(yhat != as.matrix(test[,1]))                   # test error
## [1] 0.08445298
# 3.
model2 = keras_model_sequential() |>
  layer_dense(units=2, activation="sigmoid", input_shape=2) |>  # 2 predictors, 2 units in hidden layer
  layer_dense(units=3, activation="sigmoid") |>                 # 3 units in hidden layer
  layer_dense(units=1, activation="sigmoid")                    # 2 classes

model2 |> compile(loss="binary_crossentropy",
                 optimizer=optimizer_rmsprop(),
                 metrics=c("accuracy"))
xmat = as.matrix(train[,2:3])
ymat = as.matrix(train[,1])
system.time(history <- model2 |> fit(xmat, ymat, verbose=0, epochs=500, batch_size=32))
##    user  system elapsed 
##  23.916   2.534  17.816
  user  system elapsed 
24.589   2.227  18.136 
yhat = model2 |> predict(xmat, verbose=0) 
yhat = ifelse(yhat > 0.5, 1, 0)
mean(yhat != ymat)                   # training error
## [1] 0.09
yhat = model2 |> predict(as.matrix(test[,2:3]), verbose=0) 
yhat = ifelse(yhat > 0.5, 1, 0)
mean(yhat != as.matrix(test[,1]))                   # test error
## [1] 0.0815739
# grid setting
x_min <- min(train$p75) - 0.5
x_max <- max(train$p75) + 0.5
y_min <- min(train$p91) - 0.5
y_max <- max(train$p91) + 0.5
x_seq <- seq(x_min, x_max, length.out = 101)
y_seq <- seq(y_min, y_max, length.out = 101)
grid <- expand.grid(p75 = x_seq, p91 = y_seq)
grid_xmat <- as.matrix(grid)

yhat_model1 <- model |> predict(grid_xmat)
## 319/319 - 0s - 336ms/epoch - 1ms/step
z1 <- matrix(yhat_model1, nrow = 101, byrow = TRUE)
yhat_model2 <- model2 |> predict(grid_xmat)
## 319/319 - 0s - 339ms/epoch - 1ms/step
z2 <- matrix(yhat_model2, nrow = 101, byrow = TRUE)

col <- ifelse(train$digit == 1, "red", "blue")

# add jitter
train_jitter <- train
train_jitter$p75 <- jitter(train_jitter$p75, amount = 0.1)
train_jitter$p91 <- jitter(train_jitter$p91, amount = 0.1)

# Model1
plot(train_jitter$p75, train_jitter$p91, col = col, pch = 19, 
     xlab = "p75", ylab = "p91", main = "Model 1 Decision Boundaries", asp = 1)
contour(x_seq, y_seq, z1, levels = c(0.1, 0.5, 0.9), lwd = c(1, 2, 1), 
        lty = c(3, 1, 3), drawlabels = FALSE, add = TRUE)

# Model2
plot(train_jitter$p75, train_jitter$p91, col = col, pch = 19, 
     xlab = "p75", ylab = "p91", main = "Model 2 Decision Boundaries", asp = 1)
contour(x_seq, y_seq, z2, levels = c(0.1, 0.5, 0.9), lwd = c(1, 2, 1), 
        lty = c(3, 1, 3), drawlabels = FALSE, add = TRUE)

5. Explain in detail how the number of parameters at each layer is computed in the neural network used in Question 3.
  1. First Hidden Layer (2 unis): weights = 2 input features * 2 units = 4 weights Biases = 2 Total = 4+2 = 6 parameters
  2. Second Hidden Layer (3 unis): weights = 2 input features * 3 units = 6 weights Biases = 3 Total = 6+3 = 9 parameters
  3. Output Layer (1 unis): weights = 3 input features * 1 unit = 3 weights Biases = 1 Total = 3+1 = 4 parameters

Using All Predictors

# 6. 
## CNN model
model3 = keras_model_sequential() |>
  layer_conv_2d(filters=16, kernel_size=c(3,3), padding="same", activation="relu", input_shape=c(16,16,1)) |>
  layer_max_pooling_2d(pool_size=c(2,2)) |>
  layer_conv_2d(filters=64, kernel_size=c(3,3), padding="same", activation="relu") |>
  layer_max_pooling_2d(pool_size=c(2,2)) |>
  layer_flatten() |>
  layer_dropout(rate=0.5) |>
  layer_dense(units=128, activation="relu") |>
  layer_dense(units=1, activation="sigmoid")

model3 |> compile(loss="binary_crossentropy",
                 optimizer=optimizer_rmsprop(),
                 metrics=c("accuracy"))

data <- zip[zip$digit %in% c(6,8),]
data$digit <- ifelse(data$digit == 6, 0, 1)
xmat = as.matrix(data[1:500,-1])
ymat = as.matrix(data[1:500,1])
xmat = array_reshape(xmat,c(nrow(xmat),16,16,1))
system.time(history <- model3 |> fit(xmat, ymat, verbose=0, epochs=100, batch_size=32, validation_split=0.3))
##    user  system elapsed 
##  40.626   4.990  13.869
 user  system elapsed 
 39.567   4.777  14.755 
yhat = model3 |> predict(xmat, verbose=0) 
yhat = ifelse(yhat > 0.5, 1, 0)
mean(yhat != ymat)  # training error
## [1] 0
xtest = as.matrix(data[-(1:500),-1])
xtest = array_reshape(xtest,c(nrow(xtest),16,16,1))
yhat = model3 |> predict(xtest, verbose=0) 
yhat = ifelse(yhat > 0.5, 1, 0)
mean(yhat != as.matrix((data[-(1:500),1]))) # test error
## [1] 0.003838772
# 7.
## CNN model
model4 = keras_model_sequential() |>
  layer_conv_2d(filters=16, kernel_size=c(3,3), padding="same", activation="relu", input_shape=c(16,16,1)) |>
  layer_max_pooling_2d(pool_size=c(2,2)) |>
  layer_conv_2d(filters=64, kernel_size=c(3,3), padding="same", activation="relu") |>
  layer_max_pooling_2d(pool_size=c(2,2)) |>
  layer_flatten() |>
  layer_dropout(rate=0.5) |>
  layer_dense(units=128, activation="relu") |>
  layer_dense(units=10, activation="softmax")

model4 |> compile(loss="categorical_crossentropy",
                 optimizer=optimizer_rmsprop(),
                 metrics=c("accuracy"))

data <- zip[zip$digit %in% c(0:9),]
xmat = as.matrix(data[1:2500,-1])
ymat = as.matrix(data[1:2500,1])
ymat = to_categorical(ymat, num_classes = 10)
xmat = array_reshape(xmat,c(nrow(xmat),16,16,1))
system.time(history <- model4 |> fit(xmat, ymat, verbose=0, epochs=100, batch_size=32, validation_split=0.3))
##    user  system elapsed 
## 177.361  21.928  50.067
user  system elapsed 
 183.68   22.50   51.98
p1 <- model4 |> predict(xmat)
## 79/79 - 0s - 305ms/epoch - 4ms/step
p1 <- apply(p1, 1, which.max) - 1
y <- apply(ymat, 1, which.max) - 1
mean(p1 != y) # training error
## [1] 0.0072
xtest = as.matrix(data[2501:3000,-1]) 
ytest = as.matrix(data[2501:3000, 1])
xtest = array_reshape(xtest, c(nrow(xtest), 16, 16, 1))
ytest = to_categorical(ytest, num_classes = 10)
p2 <- model4 |> predict(xtest)
## 16/16 - 0s - 73ms/epoch - 5ms/step
p2 <- apply(p2, 1, which.max) - 1
ytest <- apply(ytest, 1, which.max) - 1
mean(p2 != ytest) # test error
## [1] 0.018
# 8.
summary(model4)
## Model: "sequential_3"
## ________________________________________________________________________________
##  Layer (type)                       Output Shape                    Param #     
## ================================================================================
##  conv2d_3 (Conv2D)                  (None, 16, 16, 16)              160         
##  max_pooling2d_3 (MaxPooling2D)     (None, 8, 8, 16)                0           
##  conv2d_2 (Conv2D)                  (None, 8, 8, 64)                9280        
##  max_pooling2d_2 (MaxPooling2D)     (None, 4, 4, 64)                0           
##  flatten_1 (Flatten)                (None, 1024)                    0           
##  dropout_1 (Dropout)                (None, 1024)                    0           
##  dense_8 (Dense)                    (None, 128)                     131200      
##  dense_7 (Dense)                    (None, 10)                      1290        
## ================================================================================
## Total params: 141,930
## Trainable params: 141,930
## Non-trainable params: 0
## ________________________________________________________________________________
  1. First Convolutional Layer(conv2d_13): 64 filters, 3316=144 weights, 64 biases. Total = (331*16)+16=160 parameters
  2. Second Convolutional Layer(conv2d_12): 16 filters, 331=9 weights, 16 biases. Total = (3316*64)+64=9280 parameters
  3. First Dense Layer(dense_37): 1128 units, 1024 weights, each unit has 1 biases. Total = (1024*128)+128=131200 parameters
  4. Output Dense Layer(dense_36): 10 units, each unit has 128 weights, each unit has 10 biases. Total = (128*10)+10=1290 parameters
# 9.
callbacks_list = list(
  callback_early_stopping(monitor="val_accuracy", patience=10),
  callback_model_checkpoint(filepath="my_model{epoch:03d}-loss{val_loss:.2f}.h5", monitor="val_loss", save_best_only=TRUE)
)
xmat = as.matrix(data[1:2500,-1])
ymat = as.matrix(data[1:2500,1])
ymat = to_categorical(ymat, num_classes = 10)
xmat = array_reshape(xmat,c(nrow(xmat),16,16,1))
system.time(history <- model4 |> fit(xmat, ymat, verbose=0, epochs=100, batch_size=128, validation_split=0.3, callbacks=callbacks_list))
##    user  system elapsed 
##  23.301   2.582   4.938
p1 <- model4 |> predict(xmat)
## 79/79 - 0s - 212ms/epoch - 3ms/step
p1 <- apply(p1, 1, which.max) - 1
y <- apply(ymat, 1, which.max) - 1
mean(p1 != y) # training error
## [1] 0.0072
xtest = as.matrix(data[2501:3000,-1]) 
ytest = as.matrix(data[2501:3000, 1])
xtest = array_reshape(xtest, c(nrow(xtest), 16, 16, 1))
ytest = to_categorical(ytest, num_classes = 10)
p2 <- model4 |> predict(xtest)
## 16/16 - 0s - 65ms/epoch - 4ms/step
p2 <- apply(p2, 1, which.max) - 1
y <- apply(ytest, 1, which.max) - 1
mean(p2 != y) # test error
## [1] 0.014
### compute the training and test errors of all saved models
# Find all saved model files
files <- list.files(pattern = "model")
test_err <- double(length(files))
train_err <- double(length(files))

for(j in 1:length(files)) {
  model <- load_model_hdf5(files[j])  # Load each saved model
  p1 <- model4 |> predict(xmat)
  p1 <- apply(p1, 1, which.max) - 1
  y <- apply(ymat, 1, which.max) - 1  
  train_err[j] <- mean(p1 != y)  # training error
}
## 79/79 - 0s - 225ms/epoch - 3ms/step
## 79/79 - 0s - 205ms/epoch - 3ms/step
## 79/79 - 0s - 204ms/epoch - 3ms/step
for(j in 1:length(files)) {
  model <- load_model_hdf5(files[j])  # Load each saved model
  p2 <- model4 |> predict(xtest)
  p2 <- apply(p2, 1, which.max) - 1
  yhat <- apply(ytest, 1, which.max) - 1
  test_err[j] <- mean(p2 != yhat)  # test error
}
## 16/16 - 0s - 63ms/epoch - 4ms/step
## 16/16 - 0s - 64ms/epoch - 4ms/step
## 16/16 - 0s - 64ms/epoch - 4ms/step
file.remove(list.files(pattern="model"))
## [1] TRUE TRUE TRUE
data.frame(Train_Accuracy = train_err,
           Test_Accuracy = test_err,
           row.names = files)
##                         Train_Accuracy Test_Accuracy
## my_model001-loss0.22.h5         0.0072         0.014
## my_model009-loss0.21.h5         0.0072         0.014
## my_model017-loss0.20.h5         0.0072         0.014

Summary

First, the experiment generated a 10x10 image grid, where each cell contained a handwritten digit randomly selected from the zip code dataset. A convolutional neural network was designed to distinguish between digits 6 and 8. The model used all 256 features and 30% of the data for validation. The training process computed the training and test errors, and the final model’s test error was 0.0048, indicating good performance. The experiment was further extended to classify digits 0-9, using 2500 observations for training and validation. The training and test errors were 0.0068 and 0.018, respectively.

Using two previously identified important features, two neural networks were trained. The first network had a hidden layer with three units, yielding a training error of 0 and a test error of 0.0048. The second network had two hidden layers, and its error results were similar. A scatter plot of the training data was generated, showing the decision boundaries of the two neural networks. The visualization clearly illustrated how the networks classified data based on specific decision boundaries.

For the CNN model in Question 7, early stopping and model saving callbacks were added to control the training process based on validation accuracy. The best saved model had a training error of 0.0056 and a test error of 0.016. The training time and performance were also improved.