arrow::read_parquet("/course/data/nyctaxi/parquet/fhvhv_tripdata_2024-01.parquet") -> data
# 1.
which(is.na(data$trip_miles))
## integer(0)
which(is.na(data$trip_time))
## integer(0)
### No NA
summary(data$trip_miles)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.493 2.830 4.839 5.990 417.620
summary(data$trip_time)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 571 912 1110 1430 52060
### Zero values exist in both variables
length(which(data$trip_miles==0))
## [1] 3286
length(which(data$trip_time==0))
## [1] 2
# Both distributions are heavily right-skewed, so use a log transformation to plot them.
hist(log(data$trip_miles), probability = TRUE, breaks = 50)

hist(log(data$trip_time), probability = TRUE, breaks = 50)

### The log-transformed distributions are close to normal.
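### A quick visual check of that claim (just a sketch): a normal QQ-plot of the
### log values on a random sample of positive records, to keep plotting fast.
set.seed(1)
s <- sample(data$trip_miles[data$trip_miles > 0], 1e5)
qqnorm(log(s), main = "Normal QQ-plot of log(trip_miles)")
qqline(log(s), col = "red")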
# 2.
which(is.na(data$driver_pay))
## integer(0)
which(is.na(data$base_passenger_fare))
## integer(0)
### No NA
summary(data$driver_pay)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -67.76 8.21 13.57 18.27 22.83 1218.17
summary(data$base_passenger_fare)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -43.09 11.63 18.00 23.96 28.80 1911.16
### Non-positive values exist in both variables
length(which(data$driver_pay<=0))
## [1] 4633
length(which(data$base_passenger_fare<=0))
## [1] 7334
# These distributions are also right-skewed, so use a log transformation to plot them.
hist(log(data$driver_pay), probability = TRUE, breaks = 50)
## Warning in log(data$driver_pay): NaNs produced

hist(log(data$base_passenger_fare), probability = TRUE, breaks = 50)
## Warning in log(data$base_passenger_fare): NaNs produced

### Both variables are right-skewed: most values are moderate, with a long tail of very large fares and payments. After a log transformation the distributions are approximately normal; the NaN warnings come from the non-positive values counted above.
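### One way to avoid the NaN warnings (a sketch): take logs only of the strictly
### positive values before plotting.
hist(log(data$driver_pay[data$driver_pay > 0]), probability = TRUE, breaks = 50)
hist(log(data$base_passenger_fare[data$base_passenger_fare > 0]),
     probability = TRUE, breaks = 50)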
# 3.
# remove any implausible records
data <- subset(data, trip_miles>0 & trip_time>0 & driver_pay>0 & base_passenger_fare>0)
# model the relationship between trip time, trip distance and driver pay
model1 <- lm(driver_pay~trip_time+trip_miles, data = data)
summary(model1)
##
## Call:
## lm(formula = driver_pay ~ trip_time + trip_miles, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -385.54 -1.22 -0.85 0.35 557.64
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.248e-01 1.644e-03 562.4 <2e-16 ***
## trip_time 8.764e-03 1.946e-06 4503.0 <2e-16 ***
## trip_miles 1.574e+00 2.635e-04 5972.3 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.998 on 19649444 degrees of freedom
## Multiple R-squared: 0.9331, Adjusted R-squared: 0.9331
## F-statistic: 1.371e+08 on 2 and 19649444 DF, p-value: < 2.2e-16
The estimated base pay (intercept) is about $0.92 per trip. Trip time in this data
set is recorded in seconds, so each additional second of trip time adds about
$0.0088 to driver pay, i.e. roughly $31.55 per hour; each additional mile adds
about $1.57. Both predictors are highly significant, and for a typical trip
(median 912 seconds and 2.83 miles) the time component contributes more to the
predicted pay than the distance component.
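As a small sanity check on these units (median values taken from the summaries above):

coef(model1)["trip_time"] * 3600   # per-second coefficient expressed per hour, about $31.6
coef(model1)["trip_time"] * 912    # time contribution for the median trip time (912 s)
coef(model1)["trip_miles"] * 2.83  # distance contribution for the median trip (2.83 miles)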
# 4.
## Compute the RMSE
residuals = data$driver_pay - predict(model1)
sqrt(mean(residuals^2))
## [1] 3.997627
## The fit is not great in absolute terms: an RMSE of about $4.0 is large relative to the median driver pay of roughly $13.6, even though R-squared is high.
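## To put the RMSE in context (a sketch): compare it with the spread and the
## median of the response.
sd(data$driver_pay)                                  # overall spread of driver_pay
sqrt(mean(residuals^2)) / median(data$driver_pay)    # RMSE as a fraction of the median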
# 5.
plot(data$driver_pay, residuals,
     xlim = c(0, 500), ylim = c(-100, 300),
     main = "Residuals vs Driver Pay",
     xlab = "Driver Pay",
     ylab = "Residuals",
     col = "blue",
     pch = 20)
abline(h = 0, col = "red")
### The residuals fall along three lines with different slopes, suggesting that different licence holders (companies) pay different rates per mile.
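### One way to probe this (a sketch; it assumes the parquet file also carries the
### standard hvfhs_license_num column): colour the residuals by licence number.
plot(data$driver_pay, residuals,
     xlim = c(0, 500), ylim = c(-100, 300),
     col = factor(data$hvfhs_license_num), pch = 20,
     main = "Residuals coloured by licence number",
     xlab = "Driver Pay", ylab = "Residuals")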

# 6.
library(iotools)
library(biglm)
sink("output.txt")
r <- chunk.reader(xzfile("/course/data/nyctaxi/csv/fhvhv_tripdata_2024.csv.xz", "rb"))
start <- 1
while (length(chunk <- read.chunk(r))) {
  d <- dstrsplit(chunk, list(hvfhs_license_num = "", NA, NA, NA, NA, NA, NA, NA, NA,
                             trip_miles = 1, trip_time = 1, base_passenger_fare = 1,
                             NA, NA, NA, NA, NA, NA, driver_pay = 1),
                 sep = ",", strict = FALSE)
  # filter out records with implausible values
  d <- subset(d, trip_miles > 0 & trip_time > 0 & driver_pay > 0 & base_passenger_fare > 0)
  # convert trip time to hours
  d$trip_time_h <- d$trip_time / 3600
  if (start == 1) {
    start <- 0
    # fit the initial model on the first chunk
    fit <- biglm(driver_pay ~ trip_miles + hvfhs_license_num * trip_time_h, d)
  } else {
    # update the running fit with each subsequent chunk
    fit <- update(fit, d)
  }
  print(fit)
  print(coef(fit))
}
sink()  # restore output to the console
The first and final fits written to output.txt (intermediate chunks omitted):
Large data regression model: biglm(driver_pay ~ trip_miles + hvfhs_license_num * trip_time_h,
d)
Sample size = 169968
(Intercept) trip_miles
8.091936 1.156553
hvfhs_license_numHV0005 trip_time_h
-1.897261 43.714040
hvfhs_license_numHV0005:trip_time_h
-3.661581
Large data regression model: biglm(driver_pay ~ trip_miles + hvfhs_license_num * trip_time_h,
d)
Sample size = 100688998
(Intercept) trip_miles
1.0015315 1.5926714
hvfhs_license_numHV0005 trip_time_h
-0.6114901 32.6644096
hvfhs_license_numHV0005:trip_time_h
0.4749846
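To interpret the interaction term, the implied hourly pay rates can be computed directly from the final coefficients (values copied from the output above; HV0003 is the baseline level):

hv0003_per_hour <- 32.6644096
hv0005_per_hour <- 32.6644096 + 0.4749846   # baseline rate plus the HV0005 interaction
c(HV0003 = hv0003_per_hour, HV0005 = hv0005_per_hour)   # about 32.66 vs 33.14 dollars/hour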
7.
It is not possible to compute the final RMSE by accumulating residuals chunk by
chunk during the single pass above: the coefficients are updated after every
chunk, so residuals computed against an intermediate fit are not residuals of the
final model, and biglm does not store the individual observations or predictions.
Obtaining the RMSE would require a second pass over the data with the final
coefficients.
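If a second pass over the file were acceptable, the RMSE of the final model could be accumulated chunk by chunk using the final coefficients. A sketch, reusing the column layout and filters from the loop above:

r <- chunk.reader(xzfile("/course/data/nyctaxi/csv/fhvhv_tripdata_2024.csv.xz", "rb"))
beta <- coef(fit)   # final coefficients from the first pass
sse <- 0
n <- 0
while (length(chunk <- read.chunk(r))) {
  d <- dstrsplit(chunk, list(hvfhs_license_num = "", NA, NA, NA, NA, NA, NA, NA, NA,
                             trip_miles = 1, trip_time = 1, base_passenger_fare = 1,
                             NA, NA, NA, NA, NA, NA, driver_pay = 1),
                 sep = ",", strict = FALSE)
  d <- subset(d, trip_miles > 0 & trip_time > 0 & driver_pay > 0 & base_passenger_fare > 0)
  d$trip_time_h <- d$trip_time / 3600
  # fix the factor levels so every chunk produces the same model matrix columns
  d$hvfhs_license_num <- factor(d$hvfhs_license_num, levels = c("HV0003", "HV0005"))
  X <- model.matrix(~ trip_miles + hvfhs_license_num * trip_time_h, d)
  pred <- drop(X %*% beta)   # predictions from the final model
  sse <- sse + sum((d$driver_pay - pred)^2)
  n <- n + nrow(d)
}
sqrt(sse / n)   # RMSE of the final model over the whole file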
8. Short summary.
We analysed the relationship between driver pay (and base passenger fare) and
trip time and distance. First, we examined the January 2024 data and found that
trip time, trip distance, driver pay and base fare all have highly right-skewed,
roughly log-normal distributions; a small number of implausible records (zero or
negative values) were identified and removed.
Next, we fitted a linear model predicting driver pay from trip time and trip
distance. The model estimates a base pay of about $0.92 per trip, plus about
$1.57 per mile and roughly $31.55 per hour of trip time. However, the RMSE of
about $4 was not satisfactory, and the residual plot shows three lines with
different slopes, suggesting that different licence holders pay different rates
per mile.
Finally, we fitted a chunked biglm model with a licence-number interaction on the
larger 2024 CSV file (about 100 million trips) and found that HV0005 pays
slightly more per hour of trip time than HV0003 (about $33.1 versus $32.7 per
hour).