knitr::opts_chunk$set(echo = TRUE)OR568_Midterm_2026_0310
Setup/Load Data
pacman::p_load(dplyr, skimr, tidyverse, caret, glmnet, janitor, ggplot2, lubridate, purrr,broom)
# enriched_flights_NEW_2019 <- read.csv("C:\\Users\\valso\\OneDrive - George Mason University - O365 Production\\2026_T1\\GitHub Clone\\OR568_ML_Project\\shared-notebooks\\notebooks\\val\\r\\enriched_flights_NEW.csv",
# stringsAsFactors = FALSE)
###################################################
# Code to Load Data from S3
###################################################
library(dplyr)
library(janitor)
notebook_dir <- dirname(knitr::current_input(dir = TRUE))
util_path <- normalizePath(
file.path(
notebook_dir,
"..", "..", "..",
"common_utils", "r", "load_flight_data.r"
),
winslash = "/",
mustWork = TRUE
)
source(util_path)
# Load data and clean column names
enriched_flights_NEW_2019 <- load_flight_data() %>%
janitor::clean_names()
###################################################
# End Code to Load Data from S3
###################################################Data Prep
#Assign imported dataset
flights_raw <- enriched_flights_NEW_2019 %>%
clean_names()
#Select columns to keep
bts_info <- c("year","month","dayof_month","day_of_week","flight_date","origin","dest","cancelled","dep_time_blk","crs_dep_time", "dep_time","dep_delay","taxi_out","wheels_off","air_time","wheels_on","arr_time_blk","crs_arr_time", "arr_time","arr_delay","taxi_in","distance","reporting_airline","dot_id_reporting_airline","iata_code_reporting_airline","tail_number")
arr_weather_info <- c("arr_tmpf","arr_dwpf","arr_relh","arr_sknt","arr_vsby")
dep_weather_info <- c("dep_tmpf","dep_dwpf","dep_relh","dep_sknt","dep_vsby")
plane_info <- c("aircraft_manufacturer","aircraft_model","aircraft_type","weight_category","num_engines","engine_type")
flights_filtered <- flights_raw %>%
select(bts_info, arr_weather_info, dep_weather_info, plane_info) %>%
# create flag for previous flight delayed
mutate(
dep_ts = ymd_hm(paste(flight_date, sprintf("%04d", dep_time))),
arr_ts = ymd_hm(paste(flight_date, sprintf("%04d", arr_time))),
# Fix overnight arrivals (arr_time < dep_time)
arr_ts = if_else(arr_ts < dep_ts, arr_ts + days(1), arr_ts)
) %>%
arrange(tail_number, dep_ts) %>%
group_by(tail_number) %>%
mutate(
prev_arr_delay = lag(arr_delay),
prev_arr_ts = lag(arr_ts),
time_since_prev = as.numeric(difftime(dep_ts, prev_arr_ts, units = "hours")),
prev_flight_late = prev_arr_delay > 15 & time_since_prev <= 8
) %>%
ungroup() %>%
mutate(
is_dep_delay = factor(dep_delay > 15, levels = c(FALSE, TRUE)),
is_arr_delay = factor(arr_delay > 15, levels = c(FALSE, TRUE)),
hour_of_day = dep_time %/% 100,
# Fallback to lubridate if day_of_week column is absent
is_weekend = day_of_week %in% c("6", "7"),
) %>%
group_by(hour_of_day) %>%
mutate(congestion = n()) %>%
ungroup %>%
filter(cancelled == '0')
# Define the 5-fold Cross-Validation strategy
set.seed(42)
train_control <- trainControl(method = "cv", number = 5)Models
Departure Delay
# Limit data to relevant variables
dep_flight_filtered <- flights_filtered %>%
filter(cancelled == '0') %>%
filter(origin == "BWI", dest == "JFK") %>%
select(c("dep_delay","is_dep_delay","prev_flight_late","dep_tmpf","dep_relh","dep_vsby","reporting_airline","dep_time_blk","is_weekend")) %>%
# Scale weather vars
mutate(across(all_of(c("dep_tmpf","dep_relh","dep_vsby")), ~ as.numeric(scale(.))))
dep_flight <- dep_flight_filtered %>%
filter(
!is.na(dep_delay),
!is.na(prev_flight_late),
dep_delay >= -60,
dep_delay <= 1440
)Regression
Linear
model_lm <- train(
dep_delay ~ prev_flight_late + dep_tmpf + dep_relh + dep_vsby + reporting_airline + dep_time_blk + is_weekend,
data = dep_flight, method = "lm", trControl = train_control
)
interpret_lm <- function(model) {
tidy(model$finalModel) %>%
mutate(
abs_estimate = abs(estimate)
) %>%
arrange(desc(abs_estimate))
}
interpret_lm(model_lm)Model Ridge
model_ridge <- train(
dep_delay ~ prev_flight_late + dep_tmpf + dep_relh + dep_vsby + reporting_airline + dep_time_blk + is_weekend,
data = dep_flight, method = "glmnet",
tuneGrid = expand.grid(alpha = 0, lambda = seq(0, 1, by = 0.1)),
trControl = train_control
)
interpret_glmnet <- function(model) {
lambda <- model$bestTune$lambda
coefs <- coef(model$finalModel, s = lambda)
tibble(
variable = rownames(coefs),
coefficient = as.numeric(coefs)
) %>%
filter(variable != "(Intercept)") %>%
mutate(abs_coef = abs(coefficient)) %>%
arrange(desc(abs_coef))
}
interpret_glmnet(model_ridge)Lasso
model_lasso <- train(
dep_delay ~ prev_flight_late + dep_tmpf + dep_relh + dep_vsby + reporting_airline + dep_time_blk + is_weekend,
data = dep_flight, method = "glmnet",
tuneGrid = expand.grid(alpha = 1, lambda = seq(0, 1, by = 0.1)),
trControl = train_control
)
interpret_glmnet(model_lasso)Elastic Net
model_enet <- train(
dep_delay ~ prev_flight_late + dep_tmpf + dep_relh + dep_vsby + reporting_airline + dep_time_blk + is_weekend,
data = dep_flight, method = "glmnet",
trControl = train_control
)
interpret_glmnet(model_enet)Classification
Logistic Regression
model_logit <- train(
is_dep_delay ~ prev_flight_late + dep_tmpf + dep_relh + dep_vsby + reporting_airline + dep_time_blk + is_weekend,
data = dep_flight, method = "glm",
family = "binomial",
trControl = train_control
)
summary(model_logit)PCA
# Select numeric columns; drop zero-variance and NA rows
pca_input <- dep_flight %>%
select(dep_delay, dep_tmpf, dep_relh, dep_vsby) %>%
select(where(~ n_distinct(.) > 1)) %>%
drop_na()
pca_results <- prcomp(pca_input, scale. = TRUE)
summary(pca_results)
plot(pca_results, type = "l", main = "PCA Variance Explained")
biplot(pca_results, cex = 0.7, main = "PCA: Flight Feature Relationships")Model Comparison
results <- resamples(list(
Linear = model_lm,
Ridge = model_ridge,
Lasso = model_lasso,
ElasticNet = model_enet
))
print(dotplot(results, metric = "Rsquared", main = "Model Comparison: R-squared"))# Predicted vs Actual — Linear Regression
dep_flight$predictions <- predict(model_lm, newdata = dep_flight)
ggplot(dep_flight, aes(x = dep_delay, y = predictions)) +
geom_point(alpha = 0.4, color = "darkgreen") +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
labs(
title = "Linear Regression: Actual vs. Predicted Departure Delay",
x = "Actual Departure Delay (min)",
y = "Predicted Departure Delay (min)"
) +
theme_minimal()# Confusion Matrix — Logistic Regression
logit_preds <- predict(model_logit, dep_flight)
confusionMatrix(logit_preds, dep_flight$is_dep_delay)Departure Delay where the Incoming Flight is not Delayed
# Filter to where incoming flight is not late
dep_flight <- dep_flight %>%
filter(prev_flight_late == FALSE)Linear Regression
model_lm <- train(
dep_delay ~ dep_tmpf + dep_relh + dep_vsby + reporting_airline + dep_time_blk + is_weekend,
data = dep_flight, method = "lm", trControl = train_control
)
interpret_lm(model_lm)Ridge
model_ridge <- train(
dep_delay ~ dep_tmpf + dep_relh + dep_vsby + reporting_airline + dep_time_blk + is_weekend,
data = dep_flight, method = "glmnet",
tuneGrid = expand.grid(alpha = 0, lambda = seq(0, 1, by = 0.1)),
trControl = train_control
)
interpret_glmnet(model_ridge)Lasso
model_lasso <- train(
dep_delay ~ dep_tmpf + dep_relh + dep_vsby + reporting_airline + dep_time_blk + is_weekend,
data = dep_flight, method = "glmnet",
tuneGrid = expand.grid(alpha = 1, lambda = seq(0, 1, by = 0.1)),
trControl = train_control
)
interpret_glmnet(model_lasso)Elastic Net
model_enet <- train(
dep_delay ~ dep_tmpf + dep_relh + dep_vsby + reporting_airline + dep_time_blk + is_weekend,
data = dep_flight, method = "glmnet",
trControl = train_control
)
interpret_glmnet(model_enet)Logistic Regression
model_logit <- train(
is_dep_delay ~ dep_tmpf + dep_relh + dep_vsby + reporting_airline + dep_time_blk + is_weekend,
data = dep_flight, method = "glm",
family = "binomial",
trControl = train_control
)
summary(model_logit)PCA
# Select numeric columns; drop zero-variance and NA rows
pca_input <- dep_flight %>%
select(dep_delay, dep_tmpf, dep_relh, dep_vsby) %>%
select(where(~ n_distinct(.) > 1)) %>%
drop_na()
pca_results <- prcomp(pca_input, scale. = TRUE)
summary(pca_results)
plot(pca_results, type = "l", main = "PCA Variance Explained")
biplot(pca_results, cex = 0.7, main = "PCA: Flight Feature Relationships")Model Result Comparison
results <- resamples(list(
Linear = model_lm,
Ridge = model_ridge,
Lasso = model_lasso,
ElasticNet = model_enet
))
print(dotplot(results, metric = "Rsquared", main = "Model Comparison: R-squared"))Plot Predicted vs Actual
# Predicted vs Actual — Linear Regression
dep_flight$predictions <- predict(model_lm, newdata = dep_flight)
ggplot(dep_flight, aes(x = dep_delay, y = predictions)) +
geom_point(alpha = 0.4, color = "darkgreen") +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
labs(
title = "Linear Regression: Actual vs. Predicted Departure Delay - Where Previous Flight was not Delayed",
x = "Actual Departure Delay (min)",
y = "Predicted Departure Delay (min)"
) +
theme_minimal()Confusion Matrix
# Confusion Matrix — Logistic Regression
logit_preds <- predict(model_logit, dep_flight)
confusionMatrix(logit_preds, dep_flight$is_dep_delay)Arrival Delay
# Limit data to relevant variables
arr_flight_filtered <- flights_filtered %>%
filter(cancelled == '0') %>%
filter(origin == "JFK", dest == "BWI") %>%
select(c("arr_delay","is_arr_delay","is_dep_delay","arr_tmpf","arr_relh","arr_vsby","reporting_airline","arr_time_blk","is_weekend")) %>%
# Scale weather vars
mutate(across(all_of(c("arr_tmpf","arr_relh","arr_vsby")), ~ as.numeric(scale(.))))
arr_flight <- arr_flight_filtered %>%
filter(
!is.na(arr_delay),
!is.na(is_dep_delay),
arr_delay >= -60,
arr_delay <= 1440
)Regression
Linear
model_lm <- train(
arr_delay ~ is_dep_delay + arr_tmpf + arr_relh + arr_vsby + reporting_airline + arr_time_blk + is_weekend,
data = arr_flight, method = "lm", trControl = train_control
)
interpret_lm(model_lm)Model Ridge
model_ridge <- train(
arr_delay ~ is_dep_delay + arr_tmpf + arr_relh + arr_vsby + reporting_airline + arr_time_blk + is_weekend,
data = arr_flight, method = "glmnet",
tuneGrid = expand.grid(alpha = 0, lambda = seq(0, 1, by = 0.1)),
trControl = train_control
)
interpret_glmnet(model_ridge)Lasso
model_lasso <- train(
arr_delay ~ is_dep_delay + arr_tmpf + arr_relh + arr_vsby + reporting_airline + arr_time_blk + is_weekend,
data = arr_flight, method = "glmnet",
tuneGrid = expand.grid(alpha = 1, lambda = seq(0, 1, by = 0.1)),
trControl = train_control
)
interpret_glmnet(model_lasso)Elastic Net
model_enet <- train(
arr_delay ~ is_dep_delay + arr_tmpf + arr_relh + arr_vsby + reporting_airline + arr_time_blk + is_weekend,
data = arr_flight, method = "glmnet",
trControl = train_control
)
interpret_glmnet(model_enet)Classification
Logistic Regression
model_logit <- train(
is_arr_delay ~ is_dep_delay + arr_tmpf + arr_relh + arr_vsby + reporting_airline + arr_time_blk + is_weekend,
data = arr_flight, method = "glm",
family = "binomial",
trControl = train_control
)
summary(model_logit)PCA
# Select numeric columns; drop zero-variance and NA rows
pca_input <- arr_flight %>%
select(arr_delay, arr_tmpf, arr_relh, arr_vsby) %>%
select(where(~ n_distinct(.) > 1)) %>%
drop_na()
pca_results <- prcomp(pca_input, scale. = TRUE)
summary(pca_results)
plot(pca_results, type = "l", main = "PCA Variance Explained")
biplot(pca_results, cex = 0.7, main = "PCA: Flight Feature Relationships")Model Comparison
results <- resamples(list(
Linear = model_lm,
Ridge = model_ridge,
Lasso = model_lasso,
ElasticNet = model_enet
))
print(dotplot(results, metric = "Rsquared", main = "Model Comparison: R-squared"))# Predicted vs Actual — Linear Regression
arr_flight$predictions <- predict(model_lm, newdata = arr_flight)
ggplot(arr_flight, aes(x = arr_delay, y = predictions)) +
geom_point(alpha = 0.4, color = "darkgreen") +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
labs(
title = "Linear Regression: Actual vs. Predicted Arrival Delay",
x = "Actual Arrival Delay (min)",
y = "Predicted Arrival Delay (min)"
) +
theme_minimal()# Confusion Matrix — Logistic Regression
logit_preds <- predict(model_logit, arr_flight)
confusionMatrix(logit_preds, arr_flight$is_arr_delay)Arrival Delay where the Flight Departure is not Delayed
# Filter to where flight is not late upon departure
arr_flight <- arr_flight %>%
filter(is_dep_delay == FALSE)Linear Regression
model_lm <- train(
arr_delay ~ arr_tmpf + arr_relh + arr_vsby + reporting_airline + arr_time_blk + is_weekend,
data = arr_flight, method = "lm", trControl = train_control
)
interpret_lm(model_lm)Ridge
model_ridge <- train(
arr_delay ~ arr_tmpf + arr_relh + arr_vsby + reporting_airline + arr_time_blk + is_weekend,
data = arr_flight, method = "glmnet",
tuneGrid = expand.grid(alpha = 0, lambda = seq(0, 1, by = 0.1)),
trControl = train_control
)
interpret_glmnet(model_ridge)Lasso
model_lasso <- train(
arr_delay ~ arr_tmpf + arr_relh + arr_vsby + reporting_airline + arr_time_blk + is_weekend,
data = arr_flight, method = "glmnet",
tuneGrid = expand.grid(alpha = 1, lambda = seq(0, 1, by = 0.1)),
trControl = train_control
)
interpret_glmnet(model_lasso)Elastic Net
model_enet <- train(
arr_delay ~ arr_tmpf + arr_relh + arr_vsby + reporting_airline + arr_time_blk + is_weekend,
data = arr_flight, method = "glmnet",
trControl = train_control
)
interpret_glmnet(model_enet)Logistic Regression
model_logit <- train(
is_arr_delay ~ arr_tmpf + arr_relh + arr_vsby + reporting_airline + arr_time_blk + is_weekend,
data = arr_flight, method = "glm",
family = "binomial",
trControl = train_control
)
summary(model_logit)PCA
# Select numeric columns; drop zero-variance and NA rows
pca_input <- arr_flight %>%
select(arr_delay, arr_tmpf, arr_relh, arr_vsby) %>%
select(where(~ n_distinct(.) > 1)) %>%
drop_na()
pca_results <- prcomp(pca_input, scale. = TRUE)
summary(pca_results)
plot(pca_results, type = "l", main = "PCA Variance Explained")
biplot(pca_results, cex = 0.7, main = "PCA: Flight Feature Relationships")Model Performance Comparison
results <- resamples(list(
Linear = model_lm,
Ridge = model_ridge,
Lasso = model_lasso,
ElasticNet = model_enet
))
print(dotplot(results, metric = "Rsquared", main = "Model Comparison: R-squared"))Predicted vs Actual
# Predicted vs Actual — Linear Regression
arr_flight$predictions <- predict(model_lm, newdata = arr_flight)
ggplot(arr_flight, aes(x = arr_delay, y = predictions)) +
geom_point(alpha = 0.4, color = "darkgreen") +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
labs(
title = "Linear Regression: Actual vs. Predicted Arrival Delay - Where Flight was not Late upon Departure",
x = "Actual Arrival Delay (min)",
y = "Predicted Arrival Delay (min)"
) +
theme_minimal()Confusion Matrix
# Confusion Matrix — Logistic Regression
logit_preds <- predict(model_logit, arr_flight)
confusionMatrix(logit_preds, arr_flight$is_arr_delay)