# 1. 
cd /course/users/yran945
mkdir lab-1
cd lab-1
curl https://www.stat.auckland.ac.nz/~su/769/vehicles.tar.gz | tar vxz
# 2. 
ls -lc
# 3.
wc vehicles-2020.csv
# 4. 
cat vehicles-*.csv | grep 8703231915 > car-imports.data
wc car-imports.data
### It matches the row count from Lab Zero.
## total 12128
## -rw-r--r-- 1 yran945 yran945  247152 Jul 30 18:11 car-imports.data
## -rw-r--r-- 1 yran945 yran945 1265211 Jul 30 18:11 lab1.html
## -rw-r--r-- 1 yran945 yran945    4482 Jul 30 18:13 lab1.Rmd
## -rw-r--r-- 1 yran945 yran945 2092355 Jul 29 14:08 new-car-imports.csv
## -rw-r----- 1 yran945 yran945  221672 Jul 30 10:59 vehicles-2000.csv
## -rw-r----- 1 yran945 yran945  249888 Jul 30 10:59 vehicles-2001.csv
## -rw-r----- 1 yran945 yran945  251520 Jul 30 10:59 vehicles-2002.csv
## -rw-r----- 1 yran945 yran945  286618 Jul 30 10:59 vehicles-2003.csv
## -rw-r----- 1 yran945 yran945  314266 Jul 30 10:59 vehicles-2004.csv
## -rw-r----- 1 yran945 yran945  340921 Jul 30 10:59 vehicles-2005.csv
## -rw-r----- 1 yran945 yran945  345942 Jul 30 10:59 vehicles-2006.csv
## -rw-r----- 1 yran945 yran945  351530 Jul 30 10:59 vehicles-2007.csv
## -rw-r----- 1 yran945 yran945  357033 Jul 30 10:59 vehicles-2008.csv
## -rw-r----- 1 yran945 yran945  299115 Jul 30 10:59 vehicles-2009.csv
## -rw-r----- 1 yran945 yran945  342888 Jul 30 10:59 vehicles-2010.csv
## -rw-r----- 1 yran945 yran945  350264 Jul 30 10:59 vehicles-2011.csv
## -rw-r----- 1 yran945 yran945  367853 Jul 30 10:59 vehicles-2012.csv
## -rw-r----- 1 yran945 yran945  409834 Jul 30 10:59 vehicles-2013.csv
## -rw-r----- 1 yran945 yran945  446754 Jul 30 10:59 vehicles-2014.csv
## -rw-r----- 1 yran945 yran945  445346 Jul 30 10:59 vehicles-2015.csv
## -rw-r----- 1 yran945 yran945  469612 Jul 30 10:59 vehicles-2016.csv
## -rw-r----- 1 yran945 yran945  614480 Jul 30 10:59 vehicles-2017.csv
## -rw-r----- 1 yran945 yran945  569281 Jul 30 10:59 vehicles-2018.csv
## -rw-r----- 1 yran945 yran945  571426 Jul 30 10:59 vehicles-2019.csv
## -rw-r----- 1 yran945 yran945  550903 Jul 30 10:59 vehicles-2020.csv
## -rw-r----- 1 yran945 yran945  594970 Jul 30 10:59 vehicles-2021.csv
##   2375  53401 550903 vehicles-2020.csv
##   1080  23300 247152 car-imports.data
# 5. 
cat vehicles-*.csv | sed '1d' | sed 's/[0-9]*,//' | sed 's/,.*//' | sort | uniq -c | sort -n | tail -5
##    1853 8703248005
##    1868 8703248003
##    1931 8703328003
##    2985 8703238005
##    3003 8703238003
# 6. 
head -1 vehicles-2000.csv > new-car-imports.csv
cat vehicles-*.csv | grep -E '8703218006|8703228003|8703238003|8703248003|8703211915|8703221915|8703231915|8703241915' >> new-car-imports.csv
# 7. read the CSV file
cars <- read.csv("new-car-imports.csv")
# 8.clean the file
colnames(cars)[2] <- "HSC"
colnames(cars)[3] <- "HSDescription"
colnames(cars)[4] <- "Unit"
colnames(cars)[6] <- "VFD"
cars$VFD <- as.numeric(gsub("\\,","",cars$VFD))

colnames(cars)[7] <- "CIF"
cars$CIF <- as.numeric(gsub("\\,","",cars$CIF))

colnames(cars)[8] <- "Quantity"
cars$Quantity <- as.numeric(sub("\\,","",cars$Quantity))

cars$Unknown <- rep(NA, length(cars$Month))
cars$Date <- as.Date(paste0(cars$Month, "01"), format="%Y%m%d")
str(cars)
## 'data.frame':    9451 obs. of  11 variables:
##  $ Month        : int  200001 200001 200001 200001 200001 200001 200001 200001 200001 200001 ...
##  $ HSC          : num  8.7e+09 8.7e+09 8.7e+09 8.7e+09 8.7e+09 ...
##  $ HSDescription: chr  "Vehicles; spark-ignition internal combustion reciprocating piston engine, cylinder capacity not exceeding 1000c"| __truncated__ "Vehicles; spark-ignition internal combustion reciprocating piston engine, cylinder capacity exceeding 1000cc bu"| __truncated__ "Vehicles; spark-ignition internal combustion reciprocating piston engine, cylinder capacity exceeding 1000cc bu"| __truncated__ "Vehicles; spark-ignition internal combustion reciprocating piston engine, cylinder capacity exceeding 1000cc bu"| __truncated__ ...
##  $ Unit         : chr  "NMB" "NMB" "NMB" "NMB" ...
##  $ Country      : chr  "Japan" "Spain" "France" "United Kingdom" ...
##  $ VFD          : num  410242 125408 662198 44266 68133 ...
##  $ CIF          : num  432496 137791 730063 49558 72299 ...
##  $ Quantity     : num  42 11 36 3 5 85 144 48 159 2 ...
##  $ Status       : chr  "Final" "Final" "Final" "Final" ...
##  $ Unknown      : logi  NA NA NA NA NA NA ...
##  $ Date         : Date, format: "2000-01-01" "2000-01-01" ...
# 9. A bar plot of total imported value by country
total = sort(tapply(cars$VFD, cars$Country, sum),decreasing = TRUE)
barplot(total, las=2, main="Plot of Total Imported Value by Country")

### From this plot, we can learn that Japan has the highest total imported value, which is significantly more than other countries. Australia and Germany follow behind. We cannot clearly indentify all countries' value in this plot.
barplot(total[1:10], las=2, main="Plot of Total Imported Value by Country")

### In this plot, it only shows the top 10 countries. We can do logarithm transformation to values so that we can see each volume more clearly.
# 10. 
### convert into million NZD
cars$VFD = cars$VFD/1000000
### Compute the aggregated monthly import value (VFD) of each HS code
mon_by_HSC = xtabs(VFD ~ Date + HSC, data = cars)
date = as.Date(rownames(mon_by_HSC))
### draw a line plot
matplot(date, mon_by_HSC, type = "l", lty = 1, 
        xlab = "Month", ylab = "Value for Duty (in millions NZD)",
        main = "Total Monthly Import Value by HS Code")

### This plot shows that, before 2017, there're mainly three HSC have VFD, but after 2017 their VFD are essentially zero. The amplitude of variaty in VFD for each HSC over time is quite noticeable. 
### Further analysis using moving averages can be done to identify the trend in VFD.
# extract Germany data
Germany = cars[cars$Country == "Germany",]
monGermany = aggregate(Germany$VFD, by=list(date=Germany$Date), FUN=sum, simplify = FALSE)
monGermany$x <- unlist(monGermany$x)
# generate training and test data sets
SetNum = floor(nrow(monGermany)*0.9)
training = monGermany[1:SetNum,]
test = monGermany[-(1:SetNum),]

### Model1: a simple overall mean model
mean = mean(training$x)
cat("RMSE for mean:\n")
## RMSE for mean:
sqrt(mean((test$x-mean)^2))
## [1] 8.379609
### Model2: a linear regression model
model2 <- lm(x~date, data=training)
predictions <- predict(model2, test)
residuals <- test$x - predictions
cat("RMSE for linear model:\n")
## RMSE for linear model:
sqrt(mean(residuals^2))
## [1] 11.52066
# plot
plot(x = monGermany$date, y = monGermany$x, type = "l",lty = "1111",
     main = "Monthly New Car Import Value from Germany", 
     xlab = "Year", 
     ylab = "Value for Duti (in millions NZD")
abline(v=training$date[SetNum],lty = "dashed")
abline(h=mean, col="skyblue2",lwd = 2)
abline(a=model2$coefficients[1], b=model2$coefficients[2], col="red",lwd=2)
text(x=2018,y=70,labels="training",col="pink")
text(x=2021,y=70,labels="test",col="pink")

### Mean model performs better. I think the linear model is not sensible, because the trend of data seems not to be linear. Additionally, there may some incident happened around 2020 which caused the import value. So the training data set cannot forecast the test data precisely.

12. Summary of findings

### A portion of HSCs' monthly import value have a sudden drop around 2017. But other HSCs become popular. Meanwhile, monthly new car import value from Germany also has a decreasing trend after achieve the top around 2017.