arrow::read_parquet("/course/data/nyctaxi/parquet/fhvhv_tripdata_2024-01.parquet") -> data

# 1.
which(is.na(data$trip_miles))
## integer(0)
which(is.na(data$trip_time))
## integer(0)
### No NA
summary(data$trip_miles)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.493   2.830   4.839   5.990 417.620
summary(data$trip_time)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0     571     912    1110    1430   52060
### exist 0 values
length(which(data$trip_miles==0))
## [1] 3286
length(which(data$trip_time==0))
## [1] 2
# Distributions are skewed. Use log transformation to plot distribution.
hist(log(data$trip_miles), probability = TRUE, breaks = 50)

hist(log(data$trip_time), probability = TRUE, breaks = 50)

### Log distribution is close to normal distribution.
# 2.
which(is.na(data$driver_pay))
## integer(0)
which(is.na(data$base_passenger_fare))
## integer(0)
### No NA
summary(data$driver_pay)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -67.76    8.21   13.57   18.27   22.83 1218.17
summary(data$base_passenger_fare)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -43.09   11.63   18.00   23.96   28.80 1911.16
### exist non-positive values
length(which(data$driver_pay<=0))
## [1] 4633
length(which(data$base_passenger_fare<=0))
## [1] 7334
# Distributions are skewed. Use log transformation to plot distribution.
hist(log(data$driver_pay), probability = TRUE, breaks = 50)
## Warning in log(data$driver_pay): NaNs produced

hist(log(data$base_passenger_fare), probability = TRUE, breaks = 50)
## Warning in log(data$base_passenger_fare): NaNs produced

### The data shows a right-skewed central tendency, with most values being relatively high. After a logarithmic transformation, the data approximates a normal distribution.
# 3. 
# remove any implausible records
data <- subset(data, trip_miles>0 & trip_time>0 & driver_pay>0 & base_passenger_fare>0)
# model the relations, hip between the trip time, trip length and driver pay
model1 <- lm(driver_pay~trip_time+trip_miles, data = data)
summary(model1)
## 
## Call:
## lm(formula = driver_pay ~ trip_time + trip_miles, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -385.54   -1.22   -0.85    0.35  557.64 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 9.248e-01  1.644e-03   562.4   <2e-16 ***
## trip_time   8.764e-03  1.946e-06  4503.0   <2e-16 ***
## trip_miles  1.574e+00  2.635e-04  5972.3   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.998 on 19649444 degrees of freedom
## Multiple R-squared:  0.9331, Adjusted R-squared:  0.9331 
## F-statistic: 1.371e+08 on 2 and 19649444 DF,  p-value: < 2.2e-16

The driver’s base pay is $0.9248. For each additional hour of trip time, the driver’s pay increases by $0.008764. For each additional mile of trip distance, the driver’s pay increases by $1.574. This indicates that trip distance has a more significant impact on pay.

# 4.
## Compute the RMSE
residuals = data$driver_pay - predict(model1)
sqrt(mean(residuals^2))
## [1] 3.997627
## It's not a good fit as it's relatively large.
# 5.
plot(data$driver_pay, residuals,
     xlim=c(0,500), ylim=c(-100,300),
     main = "Residuals vs Driver Pay",
     xlab = "Driver Pay",
     ylab = "Residuals",
     col = "blue", 
     pch = 20)
abline(h = 0, col = "red")
### There are three lines with different slopes, indicating that different drivers have different prices per kilometer.

# 6. 
library(iotools)
library(biglm)
sink("output.txt")
r = chunk.reader(xzfile("/course/data/nyctaxi/csv/fhvhv_tripdata_2024.csv.xz", "rb"))
start <- 1
while (length(chunk <- read.chunk(r))) {
  d <- dstrsplit(chunk, list(hvfhs_license_num="",NA,NA,NA,NA,NA,NA,NA,NA,
                             trip_miles=1, trip_time=1, base_passenger_fare=1,
                             NA,NA,NA,NA,NA,NA, driver_pay=1),
                 sep=",", strict=FALSE)
  # filter out records with implausible values
  d = subset(d, trip_miles>0 & trip_time>0 & driver_pay>0 & base_passenger_fare>0)
  # convert trip time to hours
  d$trip_time_h = d$trip_time / 3600
  if(start==1){
    start=0
    fit <- biglm(driver_pay ~ trip_miles + hvfhs_license_num * trip_time_h, d)
  }
  else{
    fit <- update(fit, d)
  }
  print(fit)
  print(coef(fit))
}
Large data regression model: biglm(driver_pay ~ trip_miles + hvfhs_license_num * trip_time_h, 
    d)
Sample size =  169968 
                        (Intercept)                          trip_miles 
                           8.091936                            1.156553 
            hvfhs_license_numHV0005                         trip_time_h 
                          -1.897261                           43.714040 
hvfhs_license_numHV0005:trip_time_h 
                          -3.661581 
                          
Large data regression model: biglm(driver_pay ~ trip_miles + hvfhs_license_num * trip_time_h, 
    d)                          
Sample size =  100688998 
                        (Intercept)                          trip_miles 
                          1.0015315                           1.5926714 
            hvfhs_license_numHV0005                         trip_time_h 
                         -0.6114901                          32.6644096 
hvfhs_license_numHV0005:trip_time_h 
                          0.4749846 

7.

It is not possible to calculate the residuals for all data chunks and accumulate them to compute the final RMSE. Because the biglm package processes data in chunks without storing all the observed and predicted values.

8.short summary.

We analyzed the relationship between driver pay and base fare with trip time and distance. First, we examined the data for January 2023 and found that trip time and distance have highly skewed distributions, resembling a log-normal distribution. Some implausible data were identified.

Next, we used a linear model to predict driver pay based on trip time and distance. The model estimated that the driver earns $1.17 per trip, $1.43 per mile, and $29.56 per hour. However, the RMSE was not satisfactory. There are three lines with different slopes in residuals, indicating that different drivers have different prices per kilometer.

Finally, we built a linear model using data from January to April 2023 and found that HV0005 pays more per hour compared to HV0003.