Lab8

zip = read.csv("/course/data/2024-yong/lab07/zip.csv")

Digit Images

# 1.
onetoten <- zip[zip$digit %in% c(0:9),]
onetoten[,-1] = (1-onetoten[,-1])/2

par(mar=rep(0,4), mfrow=c(10,10))
for(i in 0:9) {
  samples <- sample(which(onetoten$digit == i), 10)
  for(j in samples) {
    matrix_image <- matrix(as.numeric(onetoten[j,2:257]), nrow=16, ncol=16, byrow=TRUE)
    plot(as.raster(matrix_image))
  }
}

# 2.
data <- zip[zip$digit %in% c(6,8), c("digit", "p75", "p91")]
data$digit <- factor(data$digit)
data$digit <- ifelse(data$digit==6, 0, 1)
train = data[1:500,]
test = data[-(1:500),]
model = keras_model_sequential() |>
  layer_dense(units=3, activation="sigmoid", input_shape=2) |>  # 2 predictors, 3 units in hidden layer
  layer_dense(units=1, activation="sigmoid")                    # 2 classes

model |> compile(loss="binary_crossentropy",
                 optimizer=optimizer_rmsprop(),
                 metrics=c("accuracy"))
xmat = as.matrix(train[,2:3])
ymat = as.matrix(train[,1])
system.time(history <- model |> fit(xmat, ymat, verbose=0, epochs=500, batch_size=32))

##    user  system elapsed 
##  22.763   2.446  17.800

  user  system elapsed 
 22.275   1.861  17.281

yhat = model |> predict(xmat, verbose=0) 
yhat = ifelse(yhat > 0.5, 1, 0)
mean(yhat != ymat)                   # training error

## [1] 0.1

yhat = model |> predict(as.matrix(test[,2:3]), verbose=0) 
yhat = ifelse(yhat > 0.5, 1, 0)
mean(yhat != as.matrix(test[,1]))                   # test error

## [1] 0.08445298

# 3.
model2 = keras_model_sequential() |>
  layer_dense(units=2, activation="sigmoid", input_shape=2) |>  # 2 predictors, 2 units in hidden layer
  layer_dense(units=3, activation="sigmoid") |>                 # 3 units in hidden layer
  layer_dense(units=1, activation="sigmoid")                    # 2 classes

model2 |> compile(loss="binary_crossentropy",
                 optimizer=optimizer_rmsprop(),
                 metrics=c("accuracy"))
xmat = as.matrix(train[,2:3])
ymat = as.matrix(train[,1])
system.time(history <- model2 |> fit(xmat, ymat, verbose=0, epochs=500, batch_size=32))

##    user  system elapsed 
##  23.916   2.534  17.816

  user  system elapsed 
24.589   2.227  18.136

yhat = model2 |> predict(xmat, verbose=0) 
yhat = ifelse(yhat > 0.5, 1, 0)
mean(yhat != ymat)                   # training error

## [1] 0.09

yhat = model2 |> predict(as.matrix(test[,2:3]), verbose=0) 
yhat = ifelse(yhat > 0.5, 1, 0)
mean(yhat != as.matrix(test[,1]))                   # test error

## [1] 0.0815739

# grid setting
x_min <- min(train$p75) - 0.5
x_max <- max(train$p75) + 0.5
y_min <- min(train$p91) - 0.5
y_max <- max(train$p91) + 0.5
x_seq <- seq(x_min, x_max, length.out = 101)
y_seq <- seq(y_min, y_max, length.out = 101)
grid <- expand.grid(p75 = x_seq, p91 = y_seq)
grid_xmat <- as.matrix(grid)

yhat_model1 <- model |> predict(grid_xmat)

## 319/319 - 0s - 336ms/epoch - 1ms/step

z1 <- matrix(yhat_model1, nrow = 101, byrow = TRUE)
yhat_model2 <- model2 |> predict(grid_xmat)

## 319/319 - 0s - 339ms/epoch - 1ms/step

z2 <- matrix(yhat_model2, nrow = 101, byrow = TRUE)

col <- ifelse(train$digit == 1, "red", "blue")

# add jitter
train_jitter <- train
train_jitter$p75 <- jitter(train_jitter$p75, amount = 0.1)
train_jitter$p91 <- jitter(train_jitter$p91, amount = 0.1)

# Model1
plot(train_jitter$p75, train_jitter$p91, col = col, pch = 19, 
     xlab = "p75", ylab = "p91", main = "Model 1 Decision Boundaries", asp = 1)
contour(x_seq, y_seq, z1, levels = c(0.1, 0.5, 0.9), lwd = c(1, 2, 1), 
        lty = c(3, 1, 3), drawlabels = FALSE, add = TRUE)

# Model2
plot(train_jitter$p75, train_jitter$p91, col = col, pch = 19, 
     xlab = "p75", ylab = "p91", main = "Model 2 Decision Boundaries", asp = 1)
contour(x_seq, y_seq, z2, levels = c(0.1, 0.5, 0.9), lwd = c(1, 2, 1), 
        lty = c(3, 1, 3), drawlabels = FALSE, add = TRUE)

5. Explain in detail how the number of parameters at each layer is computed in the neural network used in Question 3.

First Hidden Layer (2 unis): weights = 2 input features * 2 units = 4 weights Biases = 2 Total = 4+2 = 6 parameters
Second Hidden Layer (3 unis): weights = 2 input features * 3 units = 6 weights Biases = 3 Total = 6+3 = 9 parameters
Output Layer (1 unis): weights = 3 input features * 1 unit = 3 weights Biases = 1 Total = 3+1 = 4 parameters

Using All Predictors

# 6. 
## CNN model
model3 = keras_model_sequential() |>
  layer_conv_2d(filters=16, kernel_size=c(3,3), padding="same", activation="relu", input_shape=c(16,16,1)) |>
  layer_max_pooling_2d(pool_size=c(2,2)) |>
  layer_conv_2d(filters=64, kernel_size=c(3,3), padding="same", activation="relu") |>
  layer_max_pooling_2d(pool_size=c(2,2)) |>
  layer_flatten() |>
  layer_dropout(rate=0.5) |>
  layer_dense(units=128, activation="relu") |>
  layer_dense(units=1, activation="sigmoid")

model3 |> compile(loss="binary_crossentropy",
                 optimizer=optimizer_rmsprop(),
                 metrics=c("accuracy"))

data <- zip[zip$digit %in% c(6,8),]
data$digit <- ifelse(data$digit == 6, 0, 1)
xmat = as.matrix(data[1:500,-1])
ymat = as.matrix(data[1:500,1])
xmat = array_reshape(xmat,c(nrow(xmat),16,16,1))
system.time(history <- model3 |> fit(xmat, ymat, verbose=0, epochs=100, batch_size=32, validation_split=0.3))

##    user  system elapsed 
##  40.626   4.990  13.869

 user  system elapsed 
 39.567   4.777  14.755

yhat = model3 |> predict(xmat, verbose=0) 
yhat = ifelse(yhat > 0.5, 1, 0)
mean(yhat != ymat)  # training error

## [1] 0

xtest = as.matrix(data[-(1:500),-1])
xtest = array_reshape(xtest,c(nrow(xtest),16,16,1))
yhat = model3 |> predict(xtest, verbose=0) 
yhat = ifelse(yhat > 0.5, 1, 0)
mean(yhat != as.matrix((data[-(1:500),1]))) # test error

## [1] 0.003838772

# 7.
## CNN model
model4 = keras_model_sequential() |>
  layer_conv_2d(filters=16, kernel_size=c(3,3), padding="same", activation="relu", input_shape=c(16,16,1)) |>
  layer_max_pooling_2d(pool_size=c(2,2)) |>
  layer_conv_2d(filters=64, kernel_size=c(3,3), padding="same", activation="relu") |>
  layer_max_pooling_2d(pool_size=c(2,2)) |>
  layer_flatten() |>
  layer_dropout(rate=0.5) |>
  layer_dense(units=128, activation="relu") |>
  layer_dense(units=10, activation="softmax")

model4 |> compile(loss="categorical_crossentropy",
                 optimizer=optimizer_rmsprop(),
                 metrics=c("accuracy"))

data <- zip[zip$digit %in% c(0:9),]
xmat = as.matrix(data[1:2500,-1])
ymat = as.matrix(data[1:2500,1])
ymat = to_categorical(ymat, num_classes = 10)
xmat = array_reshape(xmat,c(nrow(xmat),16,16,1))
system.time(history <- model4 |> fit(xmat, ymat, verbose=0, epochs=100, batch_size=32, validation_split=0.3))

##    user  system elapsed 
## 177.361  21.928  50.067

user  system elapsed 
 183.68   22.50   51.98

p1 <- model4 |> predict(xmat)

## 79/79 - 0s - 305ms/epoch - 4ms/step

p1 <- apply(p1, 1, which.max) - 1
y <- apply(ymat, 1, which.max) - 1
mean(p1 != y) # training error

## [1] 0.0072

xtest = as.matrix(data[2501:3000,-1]) 
ytest = as.matrix(data[2501:3000, 1])
xtest = array_reshape(xtest, c(nrow(xtest), 16, 16, 1))
ytest = to_categorical(ytest, num_classes = 10)
p2 <- model4 |> predict(xtest)

## 16/16 - 0s - 73ms/epoch - 5ms/step

p2 <- apply(p2, 1, which.max) - 1
ytest <- apply(ytest, 1, which.max) - 1
mean(p2 != ytest) # test error

## [1] 0.018

# 8.
summary(model4)

## Model: "sequential_3"
## ________________________________________________________________________________
##  Layer (type)                       Output Shape                    Param #     
## ================================================================================
##  conv2d_3 (Conv2D)                  (None, 16, 16, 16)              160         
##  max_pooling2d_3 (MaxPooling2D)     (None, 8, 8, 16)                0           
##  conv2d_2 (Conv2D)                  (None, 8, 8, 64)                9280        
##  max_pooling2d_2 (MaxPooling2D)     (None, 4, 4, 64)                0           
##  flatten_1 (Flatten)                (None, 1024)                    0           
##  dropout_1 (Dropout)                (None, 1024)                    0           
##  dense_8 (Dense)                    (None, 128)                     131200      
##  dense_7 (Dense)                    (None, 10)                      1290        
## ================================================================================
## Total params: 141,930
## Trainable params: 141,930
## Non-trainable params: 0
## ________________________________________________________________________________

First Convolutional Layer(conv2d_13): 64 filters, 3316=144 weights, 64 biases. Total = (331*16)+16=160 parameters
Second Convolutional Layer(conv2d_12): 16 filters, 331=9 weights, 16 biases. Total = (3316*64)+64=9280 parameters
First Dense Layer(dense_37): 1128 units, 1024 weights, each unit has 1 biases. Total = (1024*128)+128=131200 parameters
Output Dense Layer(dense_36): 10 units, each unit has 128 weights, each unit has 10 biases. Total = (128*10)+10=1290 parameters

# 9.
callbacks_list = list(
  callback_early_stopping(monitor="val_accuracy", patience=10),
  callback_model_checkpoint(filepath="my_model{epoch:03d}-loss{val_loss:.2f}.h5", monitor="val_loss", save_best_only=TRUE)
)
xmat = as.matrix(data[1:2500,-1])
ymat = as.matrix(data[1:2500,1])
ymat = to_categorical(ymat, num_classes = 10)
xmat = array_reshape(xmat,c(nrow(xmat),16,16,1))
system.time(history <- model4 |> fit(xmat, ymat, verbose=0, epochs=100, batch_size=128, validation_split=0.3, callbacks=callbacks_list))

##    user  system elapsed 
##  23.301   2.582   4.938

p1 <- model4 |> predict(xmat)

## 79/79 - 0s - 212ms/epoch - 3ms/step

p1 <- apply(p1, 1, which.max) - 1
y <- apply(ymat, 1, which.max) - 1
mean(p1 != y) # training error

## [1] 0.0072

xtest = as.matrix(data[2501:3000,-1]) 
ytest = as.matrix(data[2501:3000, 1])
xtest = array_reshape(xtest, c(nrow(xtest), 16, 16, 1))
ytest = to_categorical(ytest, num_classes = 10)
p2 <- model4 |> predict(xtest)

## 16/16 - 0s - 65ms/epoch - 4ms/step

p2 <- apply(p2, 1, which.max) - 1
y <- apply(ytest, 1, which.max) - 1
mean(p2 != y) # test error

## [1] 0.014

### compute the training and test errors of all saved models
# Find all saved model files
files <- list.files(pattern = "model")
test_err <- double(length(files))
train_err <- double(length(files))

for(j in 1:length(files)) {
  model <- load_model_hdf5(files[j])  # Load each saved model
  p1 <- model4 |> predict(xmat)
  p1 <- apply(p1, 1, which.max) - 1
  y <- apply(ymat, 1, which.max) - 1  
  train_err[j] <- mean(p1 != y)  # training error
}

## 79/79 - 0s - 225ms/epoch - 3ms/step
## 79/79 - 0s - 205ms/epoch - 3ms/step
## 79/79 - 0s - 204ms/epoch - 3ms/step

for(j in 1:length(files)) {
  model <- load_model_hdf5(files[j])  # Load each saved model
  p2 <- model4 |> predict(xtest)
  p2 <- apply(p2, 1, which.max) - 1
  yhat <- apply(ytest, 1, which.max) - 1
  test_err[j] <- mean(p2 != yhat)  # test error
}

## 16/16 - 0s - 63ms/epoch - 4ms/step
## 16/16 - 0s - 64ms/epoch - 4ms/step
## 16/16 - 0s - 64ms/epoch - 4ms/step

file.remove(list.files(pattern="model"))

## [1] TRUE TRUE TRUE

data.frame(Train_Accuracy = train_err,
           Test_Accuracy = test_err,
           row.names = files)

##                         Train_Accuracy Test_Accuracy
## my_model001-loss0.22.h5         0.0072         0.014
## my_model009-loss0.21.h5         0.0072         0.014
## my_model017-loss0.20.h5         0.0072         0.014

Summary

First, the experiment generated a 10x10 image grid, where each cell contained a handwritten digit randomly selected from the zip code dataset. A convolutional neural network was designed to distinguish between digits 6 and 8. The model used all 256 features and 30% of the data for validation. The training process computed the training and test errors, and the final model’s test error was 0.0048, indicating good performance. The experiment was further extended to classify digits 0-9, using 2500 observations for training and validation. The training and test errors were 0.0068 and 0.018, respectively.

Using two previously identified important features, two neural networks were trained. The first network had a hidden layer with three units, yielding a training error of 0 and a test error of 0.0048. The second network had two hidden layers, and its error results were similar. A scatter plot of the training data was generated, showing the decision boundaries of the two neural networks. The visualization clearly illustrated how the networks classified data based on specific decision boundaries.

For the CNN model in Question 7, early stopping and model saving callbacks were added to control the training process based on validation accuracy. The best saved model had a training error of 0.0056 and a test error of 0.016. The training time and performance were also improved.

Lab8

Yunshu Ran(yran945)

2024-10-15

Digit Images

5. Explain in detail how the number of parameters at each layer is computed in the neural network used in Question 3.

Using All Predictors

Summary