# ============================================================
# Step 1. Install and load libraries
# ============================================================
# Bootstrap pacman without attaching it: requireNamespace() only checks
# that the package is installed (require() would attach it and merely
# warn on failure), then p_load() installs/attaches everything else.
if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")
pacman::p_load(
  tidyverse, janitor, lubridate, skimr,  # wrangling, name cleanup, dates, data overview
  caret, glmnet, broom,                  # CV training, ridge/lasso, tidy model output
  ggplot2, knitr, kableExtra             # plots and report tables
)
set.seed(42)

Data Analysis BWI to JFK/EWR
R Markdown
Step 1 Installing and loading the libraries
NOTE:
Used command “pacman” to load up all libraries at once:
- tidyverse: collection of tools (dplyr to filter, tidyr to organize);
- janitor: clean up tool for name clean up;
- lubridate: time and date tool helps with extracting day of week or month;
- skimr: quick scan of data to check how many values are missing, averages, histograms;
- caret: classification and regression training, cross-validation;
- glmnet: ridge and lasso regression, used to check which variable is the strongest predictor;
- broom: sweeps the messy texts in data, clean, organize data;
- ggplot2: building charts;
- knitr: knitting for github;
- kableExtra: adds extra graphics and formatting to tables.
Used set.seed(42) - to generate same random number each time to produce same result.
Step 2 Loading the raw data from enriched_flights_2019
# ============================================================
# Step 2. Load raw data set
# ============================================================
# Read the enriched 2019 flight file and normalise its column names to
# snake_case via janitor::clean_names().
flights_raw <- clean_names(read_csv("enriched_flights_2019.csv"))
NOTE:
clean_names() - changes column names in to coding friendly names
Step 3 Loading the weather data, commented out since its not needed
NOTE:
included this so that I can remember how to retrieve weather data
# ============================================================
# Step 3. NOAA WEATHER DATA (BWI Hourly *INSERT YEAR*)
# ============================================================
# Note: requires httr2 — run separately if needed
# library(httr2)
#
# base_url <- "https://www.ncei.noaa.gov/access/services/data/v1"
# resp <- request(base_url) |>
# req_url_query(
# dataset = "global-hourly",
# stations = "72406093721",
# startDate = "2024-01-01T00:00:00", ***change dates
# endDate = "2024-12-31T23:59:59",
# dataTypes = "TMP,WND,CIG,DEW", ***what data needed
# format = "csv",
# units = "metric",
# includeAttributes = "true",
# includeStationName = "true"
# ) |>
# req_perform()
#
# ncei_clean <- resp |> resp_body_string() |>
# read_csv(show_col_types = FALSE) |>
# separate(WND, into = c("w_dir","w_dir_q","w_type","w_spd","w_spd_q"), sep = ",", fill = "right") |>
# separate(TMP, into = c("t_raw","t_q"), sep = ",", fill = "right") |>
# separate(CIG, into = c("c_height","c_q","c_det","c_cavok"), sep = ",", fill = "right") |>
# mutate(
# temp_c = ifelse(t_raw %in% c("+9999","9999"), NA, as.numeric(t_raw) / 10),
# wind_speed_m_s = ifelse(w_spd == "9999", NA, as.numeric(w_spd) / 10),
# wind_dir = ifelse(w_dir == "999", NA, as.numeric(w_dir)),
# ceiling_m = ifelse(c_height == "99999", NA, as.numeric(c_height)),
# DATE = as.POSIXct(DATE, format = "%Y-%m-%dT%H:%M:%S", tz = "UTC")
# ) |>
# select(DATE, NAME, temp_c, wind_speed_m_s, wind_dir, ceiling_m)
#
# write_csv(ncei_clean, "BWI_Hourly_Weather_2024.csv")
#NOAA library(httr2)
#library(readr)
#library(dplyr)
# base NCEI API endpoint
#base_url <- "https://www.ncei.noaa.gov/access/services/data/v1"
#query parameters according to NCEI API documentation
#req <- request(base_url) |>
# req_url_query(
# dataset = "global-hourly", # Targets the ISD hourly database
# stations = "72406093721", # CORRECTED: 11-digit USAF-WBAN ID #for BWI
# startDate = "2022-06-27T00:00:00Z",# ISO 8601 start time (Midnight UTC)
# endDate = "2022-06-27T23:59:59Z",# ISO 8601 end time (End of day UTC)
# format = "csv", # Requests comma-separated values
# units = "metric", # Converts standard variables to #metric
# includeAttributes = "true", # Includes data quality flags
# includeStationName = "true" # Includes the physical station name
# )
#
# 3. Execute the request and extract the text payload
#resp <- req |> req_perform()
#csv_text <- resp |> resp_body_string()
# CSV text into an R dataframe
#ncei_hourly_datav2 <- read_csv(csv_text, show_col_types = FALSE)
# View the raw data in the console to verify all columns populated
#print(ncei_hourly_datav2)
# 5. Export the dataframe to a CSV file for Excel
#write_csv(ncei_hourly_datav2, file = "ncei_hourly_datav2_June27.csv")
# Load tidyr for the separate() function
#library(tidyr)
#library(dplyr)
#library(readr)
# Take the raw ncei_hourly_data and pass it through the cleaning pipeline
#ncei_clean <- ncei_hourly_data |>
# --- 1. TEMPERATURE (TMP) ---
# Format: Temperature, Quality
#separate(TMP, into = c("temp_raw", "temp_qual"), sep = ",", remove = FALSE) |>
#mutate(
# Missing is "+9999". Otherwise, convert to numeric and divide by 10 for Celsius.
# temp_c = ifelse(temp_raw == "+9999", NA, as.numeric(temp_raw) / 10)
# ) |>
# --- 2. WIND (WND) ---
# Format: Direction, Direction_Quality, Type, Speed, Speed_Quality
# separate(WND, into = c("wind_dir_raw", "wind_dir_qual", "wind_type", #"wind_speed_raw", "wind_speed_qual"), sep = ",", remove = FALSE) |>
# mutate(
# Speed missing is "9999". Divide by 10 for meters per second.
# wind_speed_m_s = ifelse(wind_speed_raw == "9999", NA, as.numeric(wind_speed_raw) / 10),
# Direction missing is "999". Kept as whole degrees.
# wind_dir_deg = ifelse(wind_dir_raw == "999", NA, as.numeric(wind_dir_raw))
#) |>
# --- 3. CLOUD CEILING (CIG) ---
# Format: Height, Quality, Determination, CAVOK
#separate(CIG, into = c("ceiling_raw", "ceiling_qual", "ceiling_det", "ceiling_cavok"), sep = ",", remove = FALSE) |>
#mutate(
# Height missing is "99999". Kept as whole meters.
# ceiling_height_m = ifelse(ceiling_raw == "99999", NA, as.numeric(ceiling_raw))
# ) |>
# --- 4. SELECT FINAL FEATURES ---
# Drop all the messy intermediate text columns so the final dataframe is lightweight
#select(
# DATE,
# temp_c,
# wind_speed_m_s,
# wind_dir_deg,
# ceiling_height_m
#)
# View the final, machine-learning-ready dataset
#print(ncei_clean)
# Export the clean dataframe to an Excel-ready CSV
#write_csv(ncei_clean, file = "NCEI_BWI_Cleanedv2_Features_June27.csv")
Step 4 KARAN
NOTES:
4a.
keep_cols - looking only at specific columns
sample_n - to pick a smaller sample of flights, specified to 30,000 flights to ensure the computer is able to process it
mutate - ensuring its a numeric data for R to process
delay_30 - to look at flights that are at least 30 min late
4b.
skim(df_sub) - quick report of data
4c.
ggplot - added histogram to show delay distribution
4d.
stat_summary - shows how weather severity impacted the delay
4e.
summarise - to calculate delay probability for every route in database
filter n>=50 - ignores any routes that have fewer than 50 flights
4f.
Logistic Regression - to predict a probability of an event, 0% to 100% of being 30 min late
delay_30 time + month + weather + airline + route
exp(coef(final_glm)) - odds ratio - if the odds ratio for weather is 2, each one-point increase in weather severity doubles the odds of being 30 min late
::: {.cell}
```{.r .cell-code}
# ============================================================
# Step 4. KARAN — EDA & LOGISTIC REGRESSION (All Routes)
# ============================================================
# 4a. Feature selection & sampling
# Candidate columns; the intersection with names(flights_raw) below
# means any of them may legitimately be absent from the data.
keep_cols <- c(
  "arr_delay_minutes", "month", "day_of_week", "dep_time_blk",
  "distance", "origin", "dest", "reporting_airline", "dep_weather_severity"
)
keep_cols <- keep_cols[keep_cols %in% names(flights_raw)]

# Sample at most 30,000 rows to keep the EDA and model fits tractable,
# then derive typed columns and the 30-minute delay target.
# NOTE(review): the as.numeric() conversions still assume the delay,
# distance and weather columns exist — confirm against the CSV schema.
df_sub <- flights_raw %>%
  select(all_of(keep_cols)) %>%
  sample_n(min(30000, nrow(.))) %>%
  mutate(
    arr_delay_minutes = as.numeric(arr_delay_minutes),
    distance = as.numeric(distance),
    dep_weather_severity = as.numeric(dep_weather_severity),
    # any_of() (instead of bare names) keeps this from erroring when one
    # of the categorical columns was dropped by the keep_cols filter.
    across(any_of(c("month", "day_of_week", "dep_time_blk",
                    "origin", "dest", "reporting_airline")), as.factor),
    route = as.factor(paste(origin, "to", dest)),
    delay_30 = arr_delay_minutes >= 30,            # TRUE if >= 30 min late
    log_delay = log1p(pmax(arr_delay_minutes, 0))  # log(1 + delay); early arrivals -> 0
  )
# 4b. Data checks ---
# Structure, missingness/summary report, and the NA share of the
# arrival-delay column specifically.
str(df_sub)
skim(df_sub)
summary(df_sub$arr_delay_minutes)
mean(is.na(df_sub$arr_delay_minutes))
# 4c. Arrival delay distribution ---
# Histogram of arrival delays; coord_cartesian clips the view to
# 0-180 min without removing rows, so the counts are unaffected.
ggplot(df_sub, aes(x = arr_delay_minutes)) +
geom_histogram(binwidth = 5) +
coord_cartesian(xlim = c(0, 180)) +
geom_vline(xintercept = 30, linetype = "dashed") +
labs(
title = "Arrival Delay Distribution (0–180 min shown)",
subtitle = "Dashed line = 30 min delay threshold",
x = "Arrival Delay (minutes)", y = "Count"
)
# 4d. P(delay >= 30) vs weather severity ---
# Guarded because dep_weather_severity may be absent (keep_cols only
# retains columns that exist in the raw file).
if ("dep_weather_severity" %in% names(df_sub)) {
# stat_summary(fun = mean) over the 0/1 delay flag yields the empirical
# P(delay >= 30) at each severity level.
ggplot(df_sub, aes(x = dep_weather_severity, y = as.numeric(delay_30))) +
stat_summary(fun = mean, geom = "point") +
stat_summary(fun = mean, geom = "line") +
labs(
title = "P(Delay ≥ 30) vs Departure Weather Severity",
x = "Departure Weather Severity", y = "P(Delay ≥ 30)"
)
}
# 4e. Top 15 routes by delay probability ---
# Per-route sample size, P(delay >= 30) and mean arrival delay; routes
# with fewer than 50 flights are excluded as too noisy, and the 15
# worst routes by delay probability are printed.
df_sub %>%
group_by(route) %>%
summarise(
n = n(),
p_delay30 = mean(delay_30, na.rm = TRUE),
avg_arr_delay = mean(arr_delay_minutes, na.rm = TRUE),
.groups = "drop"
) %>%
filter(n >= 50) %>%
arrange(desc(p_delay30)) %>%
slice_head(n = 15)
# 4f. Logistic regression (GLM) ---
# Model P(delay >= 30) from schedule, carrier, weather and route.
df_glm <- df_sub %>%
filter(!is.na(arr_delay_minutes)) %>%
# NOTE(review): dest is selected but never used in the formula below
# (route already encodes it), yet na.omit() still drops rows where
# dest is NA — confirm whether dest should be removed here.
select(delay_30, dep_time_blk, month, distance,
reporting_airline, dep_weather_severity, day_of_week, route, dest) %>%
na.omit()
# Restrict to routes with at least 100 flights so route coefficients
# are estimated from a reasonable sample.
route_counts <- df_glm %>% count(route, name = "n_route")
df_glm2 <- df_glm %>%
left_join(route_counts, by = "route") %>%
filter(n_route >= 100) %>%
select(-n_route)
# Binomial GLM with a departure-time-block x weather interaction.
final_glm <- glm(
delay_30 ~ dep_time_blk + month + day_of_week + distance +
reporting_airline + dep_weather_severity + route +
dep_time_blk:dep_weather_severity,
data = df_glm2,
family = binomial
)
summary(final_glm)
exp(coef(final_glm)) # Odds ratios
:::
## Step 5 Regression Models
NOTES:
trainControl - cross-validation of the data; cv = cross-validation
5 - divide the data into 5 folds, learn from 4 and test on the 5th. Regression models:
- Linear: Draws a straight line through the data. It assumes if Departure Delay goes up by 1, Arrival Delay goes up by 1.
- Ridge: gives better R-squared than linear, reduce noise
- Lasso: declutters the data to keep model simple,
- ElasticNet: reduces noise and declutters.
Dot Plot - shows the results of the previous calculations
vs. Actual - creates predictions and visualizes as a cloud of green dots with a red dashed line.
The red line is “Perfect Prediction.” If a dot is on the line, the model guessed perfectly. If the green cloud is tight around the red line, your model is a sharpshooter. If the cloud is a messy blob, your model is just guessing.
PCA - variables: Taxi-out, Air time, Distance. PCA squashes them together to see which ones move in the same direction.
The Result: delays are driven mainly by taxi-out time.
# ============================================================
# Step 5. Regression Models (BWI → EWR & BWI → JFK)
# ============================================================
# 5-fold cross-validation specification shared by all caret models below.
train_control <- trainControl(method = "cv", number = 5)
# Fit four regression models of arrival delay (Linear, Ridge, Lasso,
# Elastic Net) for one market pair, print an R-squared comparison plot
# and an Elastic-Net predicted-vs-actual scatter, and invisibly return
# the named list of fitted caret models.
#   data        - flights with arr_delay, dep_delay, hour_of_day, is_weekend
#   route_label - label used in plot titles (e.g. "BWI-EWR")
run_regression_models <- function(data, route_label) {
  # Single shared formula so all four models stay in sync.
  delay_formula <- arr_delay ~ dep_delay + hour_of_day + is_weekend
  models <- list(
    Linear = train(delay_formula, data = data, method = "lm",
                   trControl = train_control),
    # alpha = 0 -> pure ridge penalty; lambda searched over 0..1
    Ridge = train(delay_formula, data = data, method = "glmnet",
                  tuneGrid = expand.grid(alpha = 0, lambda = seq(0, 1, by = 0.1)),
                  trControl = train_control),
    # alpha = 1 -> pure lasso penalty
    Lasso = train(delay_formula, data = data, method = "glmnet",
                  tuneGrid = expand.grid(alpha = 1, lambda = seq(0, 1, by = 0.1)),
                  trControl = train_control),
    # no tuneGrid: caret tunes alpha and lambda itself
    ElasticNet = train(delay_formula, data = data, method = "glmnet",
                       trControl = train_control)
  )
  # Cross-validated R-squared comparison across the four models.
  results <- resamples(models)
  print(dotplot(results, metric = "Rsquared",
                main = paste("Model Comparison: R-squared —", route_label)))
  # Predicted vs actual for the Elastic Net (in-sample predictions).
  data$predictions <- predict(models$ElasticNet, newdata = data)
  print(
    ggplot(data, aes(x = arr_delay, y = predictions)) +
      geom_point(alpha = 0.4, color = "darkgreen") +
      geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
      labs(title = paste("Elastic Net: Predicted vs. Actual —", route_label),
           x = "Actual Arrival Delay (min)", y = "Predicted Arrival Delay (min)") +
      theme_minimal()
  )
  invisible(models)
}
# --- BWI → EWR ---
# Subset to BWI -> EWR flights and derive modelling features.
market_pair_ewr <- flights_raw %>%
filter(origin == "BWI", dest == "EWR") %>%
mutate(
is_delayed = as.factor(ifelse(arr_delay > 15, "Yes", "No")),
# assumes dep_time is numeric HHMM, so %/% 100 extracts the hour — TODO confirm
hour_of_day = dep_time %/% 100,
is_weekend = ifelse(day_of_week %in% c(6, 7), 1, 0)
) %>%
filter(!is.na(arr_delay), !is.na(dep_delay))
# Sanity-check scatter: departure vs arrival delay with a linear fit.
ggplot(market_pair_ewr, aes(x = dep_delay, y = arr_delay)) +
geom_point(alpha = 0.5, color = "steelblue") +
geom_smooth(method = "lm", color = "red") +
labs(title = "BWI → EWR: Departure vs Arrival Delay",
x = "Departure Delay (min)", y = "Arrival Delay (min)") +
theme_minimal()
models_ewr <- run_regression_models(market_pair_ewr, "BWI-EWR")
# --- BWI → JFK (primary market pair) ---
# Same feature engineering as the EWR subset, for the JFK route.
market_pair <- flights_raw %>%
filter(origin == "BWI", dest == "JFK") %>%
mutate(
is_delayed = as.factor(ifelse(arr_delay > 15, "Yes", "No")),
hour_of_day = dep_time %/% 100,
is_weekend = ifelse(day_of_week %in% c(6, 7), 1, 0)
) %>%
filter(!is.na(arr_delay), !is.na(dep_delay), !is.na(hour_of_day))
models_jfk <- run_regression_models(market_pair, "BWI-JFK")
# Logistic regression for classification
# Cross-validated logistic model of the >15 min delay flag.
model_logit <- train(
is_delayed ~ dep_delay + hour_of_day + is_weekend,
data = market_pair, method = "glm", family = "binomial",
trControl = train_control
)
# NOTE(review): predictions are made on the training data itself, so
# this confusion matrix reports in-sample (optimistic) accuracy.
logit_preds <- predict(model_logit, market_pair)
confusionMatrix(logit_preds, market_pair$is_delayed)
# PCA
# Numeric delay/operations features; constant columns are removed first
# because prcomp(scale. = TRUE) cannot scale zero-variance columns, and
# rows with any NA are dropped.
pca_input <- market_pair %>%
select(dep_delay, arr_delay, air_time, distance, taxi_out) %>%
select(where(~ n_distinct(.) > 1)) %>%
drop_na()
pca_results <- prcomp(pca_input, scale. = TRUE)
plot(pca_results, type = "l", main = "PCA Variance Explained")
biplot(pca_results, cex = 0.7, main = "PCA: Flight Feature Relationships")
# Near-zero variance check
# caret's nearZeroVar flags near-constant columns across market_pair.
nzv_metrics <- nearZeroVar(market_pair, saveMetrics = TRUE)
print(nzv_metrics)
# Arrival delays by hour of day
ggplot(market_pair, aes(x = as.factor(hour_of_day), y = arr_delay)) +
geom_boxplot(fill = "orange", alpha = 0.7) +
labs(title = "Arrival Delays by Hour of Day (BWI-JFK)",
x = "Hour of Departure", y = "Arrival Delay (minutes)") +
theme_light()

Step 6 Delays
NOTES:
group_by(tail_number, flight_date) - after looking at the flight delays, tracking each plane by tail number (plane registration) to see whether the previous flight's lateness affected the delay;
Turnaround min - when it landed and when it took off to find the gap between;
prev_flight_delayed - was the plane delayed getting to destination;
ggplot - for visualization.
# ============================================================
# Step 6. Propagated Delays - looking at tail number analysis
# ============================================================
# Follow each aircraft (tail number) through its day: the previous
# flight's wheels-on time gives the ground "turnaround" gap before the
# next departure, and prev_flight_delayed flags whether the inbound
# flight arrived more than 15 minutes late.
propagated_delays <- flights_raw %>%
  filter(!is.na(tail_number), !is.na(wheels_on), !is.na(wheels_off)) %>%
  select(tail_number, flight_date, origin, dest,
         wheels_on, wheels_off, arr_delay, dep_delay) %>%
  arrange(tail_number, flight_date, wheels_off) %>%
  group_by(tail_number, flight_date) %>%
  mutate(
    prev_flight_arrival = lag(wheels_on),
    # HHMM clock values -> minutes between the previous arrival and
    # this departure. as.integer() is required: read_csv parses these
    # columns as doubles, and sprintf("%04d", <double>) is an error
    # in R ("invalid format '%04d'").
    turnaround_mins = as.numeric(difftime(
      strptime(sprintf("%04d", as.integer(wheels_off)), format = "%H%M"),
      strptime(sprintf("%04d", as.integer(prev_flight_arrival)), format = "%H%M"),
      units = "mins"
    )),
    prev_flight_delayed = lag(arr_delay) > 15
  ) %>%
  # turnaround_mins <= 0 also drops turnarounds that cross midnight,
  # which this same-day clock arithmetic cannot represent.
  filter(!is.na(turnaround_mins), turnaround_mins > 0)
# Boxplots of current departure delay split by whether the incoming
# aircraft was itself more than 15 min late.
ggplot(propagated_delays, aes(x = prev_flight_delayed, y = dep_delay, fill = prev_flight_delayed)) +
geom_boxplot(alpha = 0.7) +
labs(
title = "Impact of Propagated Delays",
subtitle = "Departure delay based on whether the incoming aircraft arrived late",
x = "Incoming Aircraft Delayed (>15 min)", y = "Current Departure Delay (min)"
) +
scale_fill_manual(values = c("steelblue", "firebrick")) +
theme_minimal()
# Numeric summary of the same comparison.
propagated_delays %>%
group_by(prev_flight_delayed) %>%
summarise(
avg_dep_delay = mean(dep_delay, na.rm = TRUE),
median_turnaround = median(turnaround_mins, na.rm = TRUE),
flight_count = n()
)
## Step 7 Taxi-Out Time
NOTES:
group by - hour of day in 24h period to get average and median;
flight count - it ignores hours that had 5 or less flights;
taxi_corr - correlation calculation,
ggplot - to visualize data;
Results: the best time to fly would be at 7am, with the least amount of taxi time; the worst time is after 4pm.
# ============================================================
# Step 7. TAXI-OUT — how long the plane sits on the tarmac (congestion)
# (BWI → JFK)
# ============================================================
# Average/median taxi-out time per departure hour; hours with 5 or
# fewer flights are dropped as too noisy.
taxi_summary <- market_pair %>%
group_by(hour_of_day) %>%
summarise(
avg_taxi_out = mean(taxi_out, na.rm = TRUE),
median_taxi_out = median(taxi_out, na.rm = TRUE),
flight_count = n()
) %>%
filter(flight_count > 5)
# Linear association between departure hour and taxi-out minutes.
taxi_corr <- cor(market_pair$hour_of_day, market_pair$taxi_out, use = "complete.obs")
message("Correlation (Hour of Day vs Taxi-Out): ", round(taxi_corr, 3))
# Hourly average taxi-out curve, with the overall daily mean drawn as a
# dashed reference line.
ggplot(taxi_summary, aes(x = hour_of_day, y = avg_taxi_out)) +
geom_line(color = "darkred", linewidth = 1) +
geom_point(size = 3, color = "darkred") +
geom_hline(yintercept = mean(market_pair$taxi_out, na.rm = TRUE),
linetype = "dashed", color = "blue") +
annotate("text", x = 5,
y = mean(market_pair$taxi_out, na.rm = TRUE) + 1,
label = "Daily Average", color = "blue") +
scale_x_continuous(breaks = seq(0, 23, by = 2)) +
labs(
title = "Airport Congestion: Average Taxi-Out Time by Hour",
subtitle = "Identifying the 'Sweet Spot' for BWI-JFK departures",
x = "Hour of Day (24h)", y = "Average Taxi-Out Time (minutes)"
) +
theme_minimal()
Step 8 Weather impact
NOTES:
grouping by - weather severity;
LM and coef - linear model and multiplier;
Looking at the impact of weather on delays depending on severity.
ggplot - to view the results.
# ============================================================
# Step 8. Looking at weather impact
# ============================================================
# Mean departure delay per weather-severity level (NA severity dropped).
weather_summary <- market_pair %>%
group_by(dep_weather_severity) %>%
summarise(avg_dep_delay = mean(dep_delay, na.rm = TRUE), flight_count = n()) %>%
filter(!is.na(dep_weather_severity))
# Simple linear model: the slope is the extra delay (minutes) per
# severity point — the "weather multiplier".
weather_model <- lm(dep_delay ~ dep_weather_severity, data = market_pair)
multiplier <- coef(weather_model)["dep_weather_severity"]
message("Weather Multiplier: ", round(multiplier, 2), " minutes per severity level")
# Bar chart of average delay by severity, labelled with the values.
ggplot(weather_summary, aes(x = dep_weather_severity, y = avg_dep_delay)) +
geom_col(fill = "steelblue", alpha = 0.8) +
geom_text(aes(label = round(avg_dep_delay, 1)), vjust = -0.5) +
labs(
title = "Weather Multiplier: Impact on Departure Delays",
subtitle = paste("Each severity point adds ~", round(multiplier, 1), "minutes of delay"),
x = "Departure Weather Severity (0 = Clear, 4 = Extreme)",
y = "Average Departure Delay (minutes)"
) +
theme_minimal()
Step 9 Scheduling Strategy
NOTES: in this test I will check which airline adds time to their schedule to allow time for delays.
time buffer - the scheduled minus the actual elapsed flight time;
summarise - how many flights arrived on time, how many were late, in the review looking at airline that shows over 1000 flights;
ggplot - to visualize results.
Results: 9E (Endeavor Air, operating for Delta) with a flight count of 1406 shows over 71.4% of arrivals on time.
# ============================================================
# Step 9. Airline Buffer and Scheduling Strategy
# ============================================================
# time_buffer = scheduled minus actual elapsed time (positive = the
# airline padded its schedule); is_on_time = arrived at or before the
# scheduled arrival (arr_delay <= 0).
airline_buffer <- market_pair %>%
mutate(
time_buffer = crs_elapsed_time - actual_elapsed_time,
is_on_time = arr_delay <= 0
) %>%
group_by(reporting_airline) %>%
summarise(
avg_buffer = mean(time_buffer, na.rm = TRUE),
on_time_rate = mean(is_on_time, na.rm = TRUE) * 100,
avg_arr_delay = mean(arr_delay, na.rm = TRUE),
flight_count = n()
) %>%
filter(flight_count > 1000)
# Buffer vs on-time rate; point size = volume, colour = average delay.
ggplot(airline_buffer, aes(x = avg_buffer, y = on_time_rate, label = reporting_airline)) +
geom_point(aes(size = flight_count, color = avg_arr_delay)) +
geom_text(vjust = -1.5) +
geom_vline(xintercept = 0, linetype = "dashed") +
scale_color_gradient(low = "darkgreen", high = "red") +
labs(
title = "Airline Strategy: Scheduling Buffer vs. On-Time Performance",
subtitle = "Positive buffer = padded schedules",
x = "Average Scheduling Buffer (minutes)", y = "On-Time Arrival Rate (%)",
color = "Avg Arrival Delay", size = "Flight Count"
) +
theme_minimal()
Step 10 Scheduling
NOTES:
Looking at the airlines with most flights:
- 9E - Endeavor Air
- AA American Airlines
- B6 - JetBlue
Looking at scheduled minus the actual flight time in data.
added “kable” for formatting
# ============================================================
# Step 10. Scheduling honesty (BWI → JFK)
# ============================================================
# For the three highest-volume carriers, compare the scheduling buffer
# (scheduled minus actual elapsed time) with on-time performance and
# label each carrier's strategy.
honesty_summary <- market_pair %>%
filter(reporting_airline %in% c("9E", "AA", "B6")) %>%
mutate(
time_buffer = crs_elapsed_time - actual_elapsed_time,
Carrier_Name = case_when(
reporting_airline == "9E" ~ "Endeavor (Delta)",
reporting_airline == "B6" ~ "JetBlue",
reporting_airline == "AA" ~ "American",
TRUE ~ reporting_airline
)
) %>%
group_by(reporting_airline, Carrier_Name) %>%
summarise(
Avg_Buffer = round(mean(time_buffer, na.rm = TRUE), 1),
On_Time_Rate = round(mean(arr_delay <= 0, na.rm = TRUE) * 100, 1),
Avg_Arrival_Delay = round(mean(arr_delay, na.rm = TRUE), 1),
.groups = "drop"
) %>%
# Buffer thresholds (minutes) bucket each carrier's strategy.
mutate(Strategy = case_when(
Avg_Buffer > 8 ~ "Realistic (High Padding)",
Avg_Buffer > 0 ~ "Balanced",
TRUE ~ "Optimistic (Risky)"
)) %>%
# Human-friendly column names for the report table.
rename(
`Airline Code` = reporting_airline,
`Carrier Name` = Carrier_Name,
`Avg Buffer (Mins)` = Avg_Buffer,
`On-Time Rate (%)` = On_Time_Rate,
`Avg Arrival Delay (Mins)`= Avg_Arrival_Delay
)
# Styled HTML table; column 3 (Avg Buffer) is highlighted, column 6
# (Strategy) italicised.
kable(honesty_summary, format = "html",
caption = "BWI-JFK Airline Scheduling Honesty Comparison") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE) %>%
column_spec(3, bold = TRUE, color = "white", background = "#2c3e50") %>%
column_spec(6, italic = TRUE)
## Step 11 Market Share
NOTES:
Since 9E belongs to Delta, I wanted to see how many flights they have out of BWI. Results show 79% of the market.
# ============================================================
# Step 11. Market Share — Delta vs. Others (BWI → JFK)
# ============================================================
# Bucket every BWI -> JFK flight as Delta-family (mainline DL or its
# regional carrier 9E / Endeavor) vs everyone else, then count flights
# per bucket and convert the counts into percentages.
delta_comparison <- market_pair %>%
  mutate(
    Carrier_Group = ifelse(reporting_airline %in% c("DL", "9E"),
                           "Delta / Endeavor", "Other Carriers")
  ) %>%
  count(Carrier_Group, name = "Flight_Count") %>%
  mutate(Percentage = round(100 * Flight_Count / sum(Flight_Count), 1))
print(delta_comparison)

# Pie chart: one stacked bar transformed to polar coordinates, with the
# percentage labels centred inside each slice.
ggplot(delta_comparison, aes(x = "", y = Flight_Count, fill = Carrier_Group)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar("y", start = 0) +
  scale_fill_manual(values = c("#E01933", "#808080")) +
  geom_text(aes(label = paste0(Percentage, "%")),
            position = position_stack(vjust = 0.5), color = "white", size = 5) +
  labs(title = "Market Share: Delta vs. All Other Carriers",
       subtitle = "BWI to JFK Route Analysis") +
  theme_void()
Step 12 Carrier Volume
NOTES:
Here I’m looking at all the flights, no matter where to and from. Counting the rows for each airline.
# ============================================================
# Step 12. Carrier Volume on all routes
# ============================================================
# Record counts and global share per carrier, with readable names
# substituted for the common two-letter codes.
global_carrier_stats <- flights_raw %>%
group_by(reporting_airline) %>%
summarise(Total_Records = n(), .groups = "drop") %>%
mutate(
Global_Share = round(Total_Records / sum(Total_Records) * 100, 1),
Carrier_Name = case_when(
reporting_airline == "9E" ~ "Endeavor (Delta)",
reporting_airline == "DL" ~ "Delta Mainline",
reporting_airline == "AA" ~ "American",
reporting_airline == "B6" ~ "JetBlue",
reporting_airline == "WN" ~ "Southwest",
reporting_airline == "UA" ~ "United",
reporting_airline == "OO" ~ "SkyWest",
reporting_airline == "YX" ~ "Republic",
TRUE ~ reporting_airline
)
) %>%
arrange(desc(Total_Records))
print(head(global_carrier_stats, 10))
# Horizontal bar chart of the ten largest carriers by record count.
ggplot(head(global_carrier_stats, 10),
aes(x = reorder(Carrier_Name, Total_Records), y = Total_Records, fill = Carrier_Name)) +
geom_col() +
coord_flip() +
labs(title = "Top 10 Carriers by Data Volume",
subtitle = "Total records across all routes in 2019 dataset",
x = "Airline", y = "Number of Records") +
theme_minimal() +
theme(legend.position = "none")
Step 13 Buffers and Scheduling
NOTES:
Since Southwest had the most volume, I will look at their buffer and scheduling.
# ============================================================
# Step 13. Southwest Buffers & Scheduling Strategy - all
# ============================================================
# Network-wide buffer and on-time statistics for Southwest (WN).
southwest_strategy <- flights_raw %>%
filter(reporting_airline == "WN") %>%
mutate(time_buffer = crs_elapsed_time - actual_elapsed_time) %>%
summarise(
Avg_Buffer = round(mean(time_buffer, na.rm = TRUE), 1),
Median_Buffer = round(median(time_buffer, na.rm = TRUE), 1),
On_Time_Rate = round(mean(arr_delay <= 0, na.rm = TRUE) * 100, 1),
Total_Flights = n()
)
message("Global Southwest Strategy:")
print(southwest_strategy)
# Top 5 Southwest routes (more than 50 flights each) with the largest
# average schedule padding.
sw_padded_routes <- flights_raw %>%
filter(reporting_airline == "WN") %>%
mutate(time_buffer = crs_elapsed_time - actual_elapsed_time) %>%
group_by(origin, dest) %>%
summarise(
Avg_Buffer = round(mean(time_buffer, na.rm = TRUE), 1),
Flight_Count = n(),
.groups = "drop"
) %>%
filter(Flight_Count > 50) %>%
arrange(desc(Avg_Buffer)) %>%
head(5)
kable(sw_padded_routes,
caption = "Top 5 Southwest Routes with Most Buffer (Padding)") %>%
kable_styling(bootstrap_options = c("striped", "hover"))
Step 14 Comparison
NOTES:
which airport should traveler choose when going to NY from BWI, looking at time of the day what is the risk of a delay.
Need to check LGA next
Results: best time for both is 9am and between 6-7pm.
Worst times 4am to go to EWR, 10am for both, 1pm and 4pm to JFK, and anytime after 7pm to both.
JFK has the worst taxi-out time.
# ============================================================
# Step 14. Comparative Analysis: BWI → JFK vs. BWI → EWR
# ============================================================
# --- Create Comparative Subset ---
# All BWI departures to either NYC-area airport, with departure hour,
# a >15 min delay flag, and the scheduling buffer.
nyc_comparison <- flights_raw %>%
filter(origin == "BWI", dest %in% c("JFK", "EWR")) %>%
mutate(
hour_of_day = dep_time %/% 100,
is_delayed = arr_delay > 15,
time_buffer = crs_elapsed_time - actual_elapsed_time
) %>%
filter(!is.na(arr_delay), !is.na(hour_of_day))
# --- Visualizing Delay Probability by Hour ---
# which NYC airport becomes "riskier" at which time
# stat_summary(fun = mean) over the 0/1 delay flag = delay probability.
ggplot(nyc_comparison, aes(x = hour_of_day, y = as.numeric(is_delayed), color = dest)) +
stat_summary(fun = mean, geom = "line", linewidth = 1) +
stat_summary(fun = mean, geom = "point", size = 2) +
scale_x_continuous(breaks = seq(0, 23, by = 2)) +
scale_color_manual(values = c("JFK" = "#003594", "EWR" = "#D50032")) + # Airline-style colors
labs(
title = "BWI to NYC: Delay Probability by Departure Hour",
subtitle = "Comparing Risk Levels for JFK vs. Newark (EWR)",
x = "Hour of Day", y = "Probability of Delay (>15 min)",
color = "Destination"
) +
theme_minimal()
# --- Taxi-Out Comparison (Airport Congestion) ---
# which airport has worse tarmac congestion
# Outliers hidden; the y-axis is clipped (not filtered) to 0-60 min.
ggplot(nyc_comparison, aes(x = dest, y = taxi_out, fill = dest)) +
geom_boxplot(alpha = 0.7, outlier.shape = NA) +
coord_cartesian(ylim = c(0, 60)) +
labs(
title = "Tarmac Times: JFK vs. EWR",
subtitle = "Taxi-out duration for flights departing BWI",
x = "Destination Airport", y = "Taxi-Out Minutes"
) +
theme_minimal() +
guides(fill = "none")
# --- Summary Table Comparison ---
# Side-by-side per-destination performance metrics.
nyc_summary_table <- nyc_comparison %>%
group_by(dest) %>%
summarise(
Total_Flights = n(),
Avg_Arrival_Delay = round(mean(arr_delay, na.rm = TRUE), 1),
On_Time_Rate = round(mean(arr_delay <= 0, na.rm = TRUE) * 100, 1),
Avg_Taxi_Out = round(mean(taxi_out, na.rm = TRUE), 1),
Avg_Buffer = round(mean(time_buffer, na.rm = TRUE), 1)
)
kable(nyc_summary_table, caption = "BWI to NYC Hubs: Side-by-Side Performance") %>%
kable_styling(bootstrap_options = c("striped", "hover"))
## Step 15 Reliability
NOTES:
checking which airline is most reliable from the dataset.
Looking at :
- on time rate;
- extreme delay rate;
- filtering by number of flights > 20.
ggplot - to visualize data
Results: 9E over 80% on time rate with average delay of less than 5 min, MQ over 85% on time.
# ============================================================
# Step 15. How reliable are : BWI → JFK/EWR
# ============================================================
# --- Calculate Reliability Metrics ---
# Per airline x destination: on-time rate, mean delay, and the share of
# flights more than an hour late.
# NOTE(review): On_Time_Rate here uses arr_delay <= 15, while Steps 9,
# 10 and 14 use arr_delay <= 0 — confirm which definition is intended.
reliability_leaderboard <- nyc_comparison %>%
group_by(reporting_airline, dest) %>%
summarise(
Flight_Count = n(),
On_Time_Rate = mean(arr_delay <= 15, na.rm = TRUE) * 100,
Avg_Delay_Mins = mean(arr_delay, na.rm = TRUE),
Extreme_Delay_Rate = mean(arr_delay > 60, na.rm = TRUE) * 100, # % of flights > 1 hour late
.groups = "drop"
) %>%
filter(Flight_Count > 20) %>% # Filter for statistical significance
arrange(desc(On_Time_Rate))
# --- Visualizing Reliability (On-Time vs. Severity) ---
ggplot(reliability_leaderboard, aes(x = On_Time_Rate, y = Avg_Delay_Mins, label = reporting_airline)) +
geom_point(aes(size = Flight_Count, color = dest), alpha = 0.7) +
geom_text(vjust = -1.2, check_overlap = TRUE) +
scale_color_manual(values = c("JFK" = "#003594", "EWR" = "#D50032")) +
labs(
title = "Reliability Leaderboard: BWI to JFK & EWR",
subtitle = "Top-Left = Most Reliable (High On-Time, Low Delay Severity)",
x = "On-Time Arrival Rate (%)",
y = "Average Arrival Delay (Minutes)",
color = "Destination",
size = "Volume"
) +
theme_minimal()
# --- Recommendation Table ---
# Composite score: reward on-time rate, penalise extreme delays.
reliability_leaderboard %>%
mutate(
Reliability_Score = (On_Time_Rate / 10) - (Extreme_Delay_Rate / 5),
Status = case_when(
Reliability_Score > 8 ~ "Elite",
Reliability_Score > 6 ~ "Stable",
TRUE ~ "Volatile"
)
) %>%
select(reporting_airline, dest, On_Time_Rate, Extreme_Delay_Rate, Status) %>%
kable(caption = "The BWI-NYC Reliability Index") %>%
kable_styling(bootstrap_options = c("striped", "hover"))
## Step 16 Delay Risk Heatmap
NOTES:
Looking at the three airlines and best time to fly with them depending on their delay risk.
# ============================================================
# Step 16. Delay Risk Heatmap: airline X hour of the day
# ============================================================
# --- Prepare Heatmap Data ---
# Probability of a >15 min delay for each airline x departure hour cell.
heatmap_data <- nyc_comparison %>%
group_by(reporting_airline, hour_of_day) %>%
summarise(
delay_prob = mean(is_delayed, na.rm = TRUE),
flight_count = n(),
.groups = "drop"
) %>%
# Filter out low-volume combinations to avoid "noise"
filter(flight_count > 5)
# --- Create the Heatmap ---
# Tiles coloured from pale green (low risk) to deep red (high risk).
ggplot(heatmap_data, aes(x = factor(hour_of_day), y = reporting_airline, fill = delay_prob)) +
geom_tile(color = "white") +
scale_fill_gradientn(
colors = c("#e8f5e9", "#fff59d", "#ffcc80", "#e57373", "#b71c1c"),
labels = scales::percent,
name = "Delay Risk"
) +
labs(
title = "BWI-NYC Delay Risk Heatmap",
subtitle = "Probability of >15 min delay by Airline and Hour",
x = "Hour of Departure (24h)",
y = "Airline Code"
) +
theme_minimal() +
theme(
panel.grid = element_blank(),
axis.text.x = element_text(angle = 0)
)