Data Analysis BWI to JFK/EWR

Author

Irena Austin

Published

March 10, 2026

R Markdown

Step 1 Installing and loading the libraries

# ============================================================
# Step 1.  Install and load libraries
# ============================================================
if (!require("pacman")) install.packages("pacman")
pacman::p_load(
  tidyverse, janitor, lubridate, skimr,
  caret, glmnet, broom,
  ggplot2, knitr, kableExtra
)

set.seed(42)


NOTE:
Used command “pacman” to load up all libraries at once:
- tidyverse: collection of tools (dplyr to filter, tidyr to organize);
- janitor: clean up tool for name clean up;
- lubridate: time and date tool helps with extracting day of week or month;
- skimr: quick scan of data to check how many values are missing, averages, histograms;   - caret: classification and regression traing, crossvalidaton;
- glmnet: ridge and lasso regression, used to check which varaiable is the strongest predictor;
- broom: sweeps the messy texts in data, clean, organize data;
- ggplot2: building charts;
- knitr: knitting for github;
- kableExtra: adds extra grapgics, formatting to table.

Used set.seed(42) - to generate same random number each time to produce same result.

Step 2 Loading the raw data from enriched_flights_2019

# ============================================================
# Step 2. Load raw data set
# ============================================================
flights_raw <- read_csv("enriched_flights_2019.csv") %>%
  clean_names()


NOTE:
clean_names() - changes column names in to coding friendly names

Step 3 Loading the weather data, commented out since its not needed


NOTE:
included this so that I can remeber how to retrieve weather data

# ============================================================
# Step 3. NOAA WEATHER DATA (BWI Hourly *INSERT YEAR*)
# ============================================================
# Note: requires httr2 — run separately if needed
# library(httr2)
#
# base_url <- "https://www.ncei.noaa.gov/access/services/data/v1"
# resp <- request(base_url) |>
#   req_url_query(
#     dataset           = "global-hourly",
#     stations          = "72406093721",
#     startDate         = "2024-01-01T00:00:00", ***change dates
#     endDate           = "2024-12-31T23:59:59",
#     dataTypes         = "TMP,WND,CIG,DEW",  ***what data needed
#     format            = "csv",
#     units             = "metric",
#     includeAttributes = "true",
#     includeStationName = "true"
#   ) |>
#   req_perform()
#
# ncei_clean <- resp |> resp_body_string() |>
#   read_csv(show_col_types = FALSE) |>
#   separate(WND, into = c("w_dir","w_dir_q","w_type","w_spd","w_spd_q"), sep = ",", fill = "right") |>
#   separate(TMP, into = c("t_raw","t_q"),                                 sep = ",", fill = "right") |>
#   separate(CIG, into = c("c_height","c_q","c_det","c_cavok"),            sep = ",", fill = "right") |>
#   mutate(
#     temp_c         = ifelse(t_raw   %in% c("+9999","9999"), NA, as.numeric(t_raw)  / 10),
#     wind_speed_m_s = ifelse(w_spd   == "9999",              NA, as.numeric(w_spd)  / 10),
#     wind_dir       = ifelse(w_dir   == "999",               NA, as.numeric(w_dir)),
#     ceiling_m      = ifelse(c_height == "99999",            NA, as.numeric(c_height)),
#     DATE = as.POSIXct(DATE, format = "%Y-%m-%dT%H:%M:%S", tz = "UTC")
#   ) |>
#   select(DATE, NAME, temp_c, wind_speed_m_s, wind_dir, ceiling_m)
#
# write_csv(ncei_clean, "BWI_Hourly_Weather_2024.csv")


#NOAA library(httr2)
#library(readr)
#library(dplyr)
 
# base NCEI API endpoint
#base_url <- "https://www.ncei.noaa.gov/access/services/data/v1"
 
#query parameters according to NCEI API documentation
#req <- request(base_url) |>
# req_url_query(
#   dataset            = "global-hourly",       # Targets the ISD hourly database
#   stations           = "72406093721",         # CORRECTED: 11-digit USAF-WBAN ID #for BWI
#   startDate          = "2022-06-27T00:00:00Z",# ISO 8601 start time (Midnight UTC)
#   endDate            = "2022-06-27T23:59:59Z",# ISO 8601 end time (End of day UTC)
#   format             = "csv",                 # Requests comma-separated values
#   units              = "metric",              # Converts standard variables to #metric
#   includeAttributes  = "true",                # Includes data quality flags
#   includeStationName = "true"                 # Includes the physical station name
# )
# 
# 3. Execute the request and extract the text payload
#resp <- req |> req_perform()
#csv_text <- resp |> resp_body_string()
 
# CSV text into an R dataframe
#ncei_hourly_datav2 <- read_csv(csv_text, show_col_types = FALSE)
 
# View the raw data in the console to verify all columns populated
#print(ncei_hourly_datav2)
 
# 5. Export the dataframe to a CSV file for Excel
#write_csv(ncei_hourly_datav2, file = "ncei_hourly_datav2_June27.csv")
 
# Load tidyr for the separate() function
#library(tidyr)
#library(dplyr)
#library(readr)
 
# Take the raw ncei_hourly_data and pass it through the cleaning pipeline
#ncei_clean <- ncei_hourly_data |>
 
 # --- 1. TEMPERATURE (TMP) ---
 # Format: Temperature, Quality
 #separate(TMP, into = c("temp_raw", "temp_qual"), sep = ",", remove = FALSE) |>
 #mutate(
   # Missing is "+9999". Otherwise, convert to numeric and divide by 10 for Celsius.
#   temp_c = ifelse(temp_raw == "+9999", NA, as.numeric(temp_raw) / 10)
# ) |>
 
 # --- 2. WIND (WND) ---
 # Format: Direction, Direction_Quality, Type, Speed, Speed_Quality
# separate(WND, into = c("wind_dir_raw", "wind_dir_qual", "wind_type", #"wind_speed_raw", "wind_speed_qual"), sep = ",", remove = FALSE) |>
# mutate(
   # Speed missing is "9999". Divide by 10 for meters per second.
#   wind_speed_m_s = ifelse(wind_speed_raw == "9999", NA, as.numeric(wind_speed_raw) / 10),
   # Direction missing is "999". Kept as whole degrees.
 #  wind_dir_deg = ifelse(wind_dir_raw == "999", NA, as.numeric(wind_dir_raw))
 #) |>
 
 # --- 3. CLOUD CEILING (CIG) ---
 # Format: Height, Quality, Determination, CAVOK
 #separate(CIG, into = c("ceiling_raw", "ceiling_qual", "ceiling_det", "ceiling_cavok"), sep = ",", remove = FALSE) |>
 #mutate(
   # Height missing is "99999". Kept as whole meters.
#   ceiling_height_m = ifelse(ceiling_raw == "99999", NA, as.numeric(ceiling_raw))
# ) |>
 
 # --- 4. SELECT FINAL FEATURES ---
 # Drop all the messy intermediate text columns so the final dataframe is lightweight
 #select(
#   DATE, 
 #  temp_c, 
# wind_speed_m_s, 
 #  wind_dir_deg, 
#   ceiling_height_m
 #)
 
# View the final, machine-learning-ready dataset
#print(ncei_clean)
 
# Export the clean dataframe to an Excel-ready CSV
#write_csv(ncei_clean, file = "NCEI_BWI_Cleanedv2_Features_June27.csv")



Step 4 KARAN


NOTES:
4a.
keep_cols - looking only at specific columns
sample_n - to pick a smaller smaple of flights, specified to 30,000 flights to ensure computer is able to process it
mutate - ensuring its a numeric data for R to process
delay_30 - to look at flights that are at least 30 min late
4b.
skim(df_sub) - quick report of data

4c.
ggplot - added histogram to show delay distribution
4d.
stat_summary - shows how weather severity impacted the delay
4e.
summarise - to calculate delay probability for every route in database
filter n>=50 - ignores any routes that have fewer than 50 flights
4f.
Logistic Regression - to predict a probability of an event, 0% to 100% of being 30 min late
delay_30 time + month + weather + airline + route
exp(coef(final_glm)) - odd ratio - if weather is 2, for every point increase in weather the chance of being late double



::: {.cell}

```{.r .cell-code}
# ============================================================
# Step 4. KARAN — EDA & LOGISTIC REGRESSION (All Routes)
# ============================================================

#4a.Feature selection & sampling
keep_cols <- c(
  "arr_delay_minutes", "month", "day_of_week", "dep_time_blk",
  "distance", "origin", "dest", "reporting_airline", "dep_weather_severity"
)
keep_cols <- keep_cols[keep_cols %in% names(flights_raw)]

df_sub <- flights_raw %>%
  select(all_of(keep_cols)) %>%
  sample_n(min(30000, nrow(.))) %>%
  mutate(
    arr_delay_minutes    = as.numeric(arr_delay_minutes),
    distance             = as.numeric(distance),
    dep_weather_severity = as.numeric(dep_weather_severity),
    across(c(month, day_of_week, dep_time_blk, origin, dest, reporting_airline), as.factor),
    route     = as.factor(paste(origin, "to", dest)),
    delay_30  = arr_delay_minutes >= 30,
    log_delay = log1p(pmax(arr_delay_minutes, 0))
  )

# 4b. Data checks ---
str(df_sub)
skim(df_sub)
summary(df_sub$arr_delay_minutes)
mean(is.na(df_sub$arr_delay_minutes))

# 4c. Arrival delay distribution ---
ggplot(df_sub, aes(x = arr_delay_minutes)) +
  geom_histogram(binwidth = 5) +
  coord_cartesian(xlim = c(0, 180)) +
  geom_vline(xintercept = 30, linetype = "dashed") +
  labs(
    title    = "Arrival Delay Distribution (0–180 min shown)",
    subtitle = "Dashed line = 30 min delay threshold",
    x = "Arrival Delay (minutes)", y = "Count"
  )

# 4d. P(delay >= 30) vs weather severity ---
if ("dep_weather_severity" %in% names(df_sub)) {
  ggplot(df_sub, aes(x = dep_weather_severity, y = as.numeric(delay_30))) +
    stat_summary(fun = mean, geom = "point") +
    stat_summary(fun = mean, geom = "line") +
    labs(
      title = "P(Delay ≥ 30) vs Departure Weather Severity",
      x = "Departure Weather Severity", y = "P(Delay ≥ 30)"
    )
}

# 4e. Top 15 routes by delay probability ---
df_sub %>%
  group_by(route) %>%
  summarise(
    n             = n(),
    p_delay30     = mean(delay_30, na.rm = TRUE),
    avg_arr_delay = mean(arr_delay_minutes, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  filter(n >= 50) %>%
  arrange(desc(p_delay30)) %>%
  slice_head(n = 15)

# 4f. Logistic regression (GLM) ---
df_glm <- df_sub %>%
  filter(!is.na(arr_delay_minutes)) %>%
  select(delay_30, dep_time_blk, month, distance,
         reporting_airline, dep_weather_severity, day_of_week, route, dest) %>%
  na.omit()

route_counts <- df_glm %>% count(route, name = "n_route")

df_glm2 <- df_glm %>%
  left_join(route_counts, by = "route") %>%
  filter(n_route >= 100) %>%
  select(-n_route)

final_glm <- glm(
  delay_30 ~ dep_time_blk + month + day_of_week + distance +
    reporting_airline + dep_weather_severity + route +
    dep_time_blk:dep_weather_severity,
  data   = df_glm2,
  family = binomial
)

summary(final_glm)
exp(coef(final_glm))   # Odds ratios

:::



## Step 5 Regression Models
NOTES:
traincontrol - crossvalidation of data  cv - crossvalidation
10 - divide data into 10 piles, learn from 9 and test on 10th  regression models:
- Linear: Draws a straight line through the data. It assumes if Departure Delay goes up by 1, Arrival Delay goes up by 1.
- Ridge: gives better R-squared than linear, reduce noise
- Lasso: declutters the data to keep model simple,
- ElasticNet: reduces noise and declutters.

Dot Plot - shows results of the prevous calculations
vs. Actual - creates predictions and visualizes as a cloud of green dots with a red dashed line.
The red line is “Perfect Prediction.” If a dot is on the line, the model guessed perfectly. If the green cloud is tight around the red line, your model is a sharpshooter. If the cloud is a messy blob, your model is just guessing.
PCA - variables: Taxi-out, Air time, Distance. PCA squashes them together to see which ones move in the same direction.
The Result: Delays cased by taxi out.

# ============================================================
# Step 5. Regression Models (BWI → EWR & BWI → JFK)
# ============================================================
train_control <- trainControl(method = "cv", number = 5)

# coding and building four regression models for any market pair
run_regression_models <- function(data, route_label) {
  
  models <- list(
    Linear     = train(arr_delay ~ dep_delay + hour_of_day + is_weekend,
                       data = data, method = "lm",     trControl = train_control),
    Ridge      = train(arr_delay ~ dep_delay + hour_of_day + is_weekend,
                       data = data, method = "glmnet",
                       tuneGrid = expand.grid(alpha = 0, lambda = seq(0, 1, by = 0.1)),
                       trControl = train_control),
    Lasso      = train(arr_delay ~ dep_delay + hour_of_day + is_weekend,
                       data = data, method = "glmnet",
                       tuneGrid = expand.grid(alpha = 1, lambda = seq(0, 1, by = 0.1)),
                       trControl = train_control),
    ElasticNet = train(arr_delay ~ dep_delay + hour_of_day + is_weekend,
                       data = data, method = "glmnet", trControl = train_control)
  )
  
  # Model comparison
  results <- resamples(models)
  print(dotplot(results, metric = "Rsquared",
                main = paste("Model Comparison: R-squared —", route_label)))
  
  # Predicted vs Actual
  data$predictions <- predict(models$ElasticNet, newdata = data)
  print(
    ggplot(data, aes(x = arr_delay, y = predictions)) +
      geom_point(alpha = 0.4, color = "darkgreen") +
      geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red") +
      labs(title = paste("Elastic Net: Predicted vs. Actual —", route_label),
           x = "Actual Arrival Delay (min)", y = "Predicted Arrival Delay (min)") +
      theme_minimal()
  )
  
  invisible(models)
}

# --- BWI → EWR ---
market_pair_ewr <- flights_raw %>%
  filter(origin == "BWI", dest == "EWR") %>%
  mutate(
    is_delayed  = as.factor(ifelse(arr_delay > 15, "Yes", "No")),
    hour_of_day = dep_time %/% 100,
    is_weekend  = ifelse(day_of_week %in% c(6, 7), 1, 0)
  ) %>%
  filter(!is.na(arr_delay), !is.na(dep_delay))

ggplot(market_pair_ewr, aes(x = dep_delay, y = arr_delay)) +
  geom_point(alpha = 0.5, color = "steelblue") +
  geom_smooth(method = "lm", color = "red") +
  labs(title = "BWI → EWR: Departure vs Arrival Delay",
       x = "Departure Delay (min)", y = "Arrival Delay (min)") +
  theme_minimal()

models_ewr <- run_regression_models(market_pair_ewr, "BWI-EWR")

# --- BWI → JFK (primary market pair) ---
market_pair <- flights_raw %>%
  filter(origin == "BWI", dest == "JFK") %>%
  mutate(
    is_delayed  = as.factor(ifelse(arr_delay > 15, "Yes", "No")),
    hour_of_day = dep_time %/% 100,
    is_weekend  = ifelse(day_of_week %in% c(6, 7), 1, 0)
  ) %>%
  filter(!is.na(arr_delay), !is.na(dep_delay), !is.na(hour_of_day))

models_jfk <- run_regression_models(market_pair, "BWI-JFK")

# Logistic regression for classification
model_logit <- train(
  is_delayed ~ dep_delay + hour_of_day + is_weekend,
  data = market_pair, method = "glm", family = "binomial",
  trControl = train_control
)
logit_preds <- predict(model_logit, market_pair)
confusionMatrix(logit_preds, market_pair$is_delayed)

# PCA
pca_input <- market_pair %>%
  select(dep_delay, arr_delay, air_time, distance, taxi_out) %>%
  select(where(~ n_distinct(.) > 1)) %>%
  drop_na()

pca_results <- prcomp(pca_input, scale. = TRUE)
plot(pca_results, type = "l", main = "PCA Variance Explained")
biplot(pca_results, cex = 0.7, main = "PCA: Flight Feature Relationships")

# Near-zero variance check
nzv_metrics <- nearZeroVar(market_pair, saveMetrics = TRUE)
print(nzv_metrics)

# Arrival delays by hour of day
ggplot(market_pair, aes(x = as.factor(hour_of_day), y = arr_delay)) +
  geom_boxplot(fill = "orange", alpha = 0.7) +
  labs(title = "Arrival Delays by Hour of Day (BWI-JFK)",
       x = "Hour of Departure", y = "Arrival Delay (minutes)") +
  theme_light()

Step 6 Delays


NOTES:
group_by(tail_number, flight_date) - after looking at the flight delays, looking at tail number (plane registration) we are able to track the plane and see if one if that affected the delay;
Turnaround min - when it landed and when it took off to find the gap between;
prev_flight_delayed - was the plane delayed getting to destination;
ggplot - for visualization.

# ============================================================
# Step 6. Propagated Delays - looking at tail number analysis
# ============================================================
propagated_delays <- flights_raw %>%
  filter(!is.na(tail_number), !is.na(wheels_on), !is.na(wheels_off)) %>%
  select(tail_number, flight_date, origin, dest,
         wheels_on, wheels_off, arr_delay, dep_delay) %>%
  arrange(tail_number, flight_date, wheels_off) %>%
  group_by(tail_number, flight_date) %>%
  mutate(
    prev_flight_arrival = lag(wheels_on),
    turnaround_mins = as.numeric(difftime(
      strptime(sprintf("%04d", wheels_off),         format = "%H%M"),
      strptime(sprintf("%04d", prev_flight_arrival), format = "%H%M"),
      units = "mins"
    )),
    prev_flight_delayed = lag(arr_delay) > 15
  ) %>%
  filter(!is.na(turnaround_mins), turnaround_mins > 0)

ggplot(propagated_delays, aes(x = prev_flight_delayed, y = dep_delay, fill = prev_flight_delayed)) +
  geom_boxplot(alpha = 0.7) +
  labs(
    title    = "Impact of Propagated Delays",
    subtitle = "Departure delay based on whether the incoming aircraft arrived late",
    x = "Incoming Aircraft Delayed (>15 min)", y = "Current Departure Delay (min)"
  ) +
  scale_fill_manual(values = c("steelblue", "firebrick")) +
  theme_minimal()

propagated_delays %>%
  group_by(prev_flight_delayed) %>%
  summarise(
    avg_dep_delay      = mean(dep_delay,        na.rm = TRUE),
    median_turnaround  = median(turnaround_mins, na.rm = TRUE),
    flight_count       = n()
  )


## Step 7 Taxi-Out Time
NOTES:
group by - hour of day in 24h period to get average and median;
flight count - it ignores hours that had 5 or less flights;
taxi_corr - correlation calculation,
ggplot - to visualize data;

Results: best time to fly woul dbe at 7am, least amout of taxi time; worst time after 4pm.\

# ============================================================
# Step 7. TAXI-OUT - how long the plane is on turmac/congestion  
# (BWI → JFK)
# ============================================================
taxi_summary <- market_pair %>%
  group_by(hour_of_day) %>%
  summarise(
    avg_taxi_out    = mean(taxi_out,   na.rm = TRUE),
    median_taxi_out = median(taxi_out, na.rm = TRUE),
    flight_count    = n()
  ) %>%
  filter(flight_count > 5)

taxi_corr <- cor(market_pair$hour_of_day, market_pair$taxi_out, use = "complete.obs")
message("Correlation (Hour of Day vs Taxi-Out): ", round(taxi_corr, 3))

ggplot(taxi_summary, aes(x = hour_of_day, y = avg_taxi_out)) +
  geom_line(color = "darkred", linewidth = 1) +
  geom_point(size = 3, color = "darkred") +
  geom_hline(yintercept = mean(market_pair$taxi_out, na.rm = TRUE),
             linetype = "dashed", color = "blue") +
  annotate("text", x = 5,
           y = mean(market_pair$taxi_out, na.rm = TRUE) + 1,
           label = "Daily Average", color = "blue") +
  scale_x_continuous(breaks = seq(0, 23, by = 2)) +
  labs(
    title    = "Airport Congestion: Average Taxi-Out Time by Hour",
    subtitle = "Identifying the 'Sweet Spot' for BWI-JFK departures",
    x = "Hour of Day (24h)", y = "Average Taxi-Out Time (minutes)"
  ) +
  theme_minimal()



Step 8 Weather impact

NOTES:
grouping by - weather severity;
LM and coef - linear model and multiplier;
Looking at the impact of weather on delays depending on severity.
ggplot - to view the results.

# ============================================================
# Step 8. Looking at weather impact
# ============================================================
weather_summary <- market_pair %>%
  group_by(dep_weather_severity) %>%
  summarise(avg_dep_delay = mean(dep_delay, na.rm = TRUE), flight_count = n()) %>%
  filter(!is.na(dep_weather_severity))

weather_model  <- lm(dep_delay ~ dep_weather_severity, data = market_pair)
multiplier     <- coef(weather_model)["dep_weather_severity"]
message("Weather Multiplier: ", round(multiplier, 2), " minutes per severity level")

ggplot(weather_summary, aes(x = dep_weather_severity, y = avg_dep_delay)) +
  geom_col(fill = "steelblue", alpha = 0.8) +
  geom_text(aes(label = round(avg_dep_delay, 1)), vjust = -0.5) +
  labs(
    title    = "Weather Multiplier: Impact on Departure Delays",
    subtitle = paste("Each severity point adds ~", round(multiplier, 1), "minutes of delay"),
    x = "Departure Weather Severity (0 = Clear, 4 = Extreme)",
    y = "Average Departure Delay (minutes)"
  ) +
  theme_minimal()



Step 9 Scheduling Strategy


NOTES: in this test I will check which airline adds time to thier schedule to allow times for delays.
time buffer - what is actual time and schedyled time of departure;
summarise - how many flights arrived on time, how many were late, in the review looking at airline that shows over 1000 flights;
ggplot - to visualize results.
Results: 9E (Endavor Air by Delta) with flight count of 1406 show over 71.4% arrivals on time.

# ============================================================
# Step 9. Airline Buffer and Scheduling Strategy
# ============================================================
airline_buffer <- market_pair %>%
  mutate(
    time_buffer = crs_elapsed_time - actual_elapsed_time,
    is_on_time  = arr_delay <= 0
  ) %>%
  group_by(reporting_airline) %>%
  summarise(
    avg_buffer    = mean(time_buffer, na.rm = TRUE),
    on_time_rate  = mean(is_on_time,  na.rm = TRUE) * 100,
    avg_arr_delay = mean(arr_delay,   na.rm = TRUE),
    flight_count  = n()
  ) %>%
  filter(flight_count > 1000)

ggplot(airline_buffer, aes(x = avg_buffer, y = on_time_rate, label = reporting_airline)) +
  geom_point(aes(size = flight_count, color = avg_arr_delay)) +
  geom_text(vjust = -1.5) +
  geom_vline(xintercept = 0, linetype = "dashed") +
  scale_color_gradient(low = "darkgreen", high = "red") +
  labs(
    title    = "Airline Strategy: Scheduling Buffer vs. On-Time Performance",
    subtitle = "Positive buffer = padded schedules",
    x = "Average Scheduling Buffer (minutes)", y = "On-Time Arrival Rate (%)",
    color = "Avg Arrival Delay", size = "Flight Count"
  ) +
  theme_minimal()



Step 10 Scheduling

NOTES:
Looking at the airlines with most flights:
- 9E - Endavor Air
- AA American Airlines
- B6 - JetBlue
Looking at scheduled minus the actual flight time in data.
added “kable” for formatting

# ============================================================
# Step 10. Scheduling honesty (BWI → JFK)
# ============================================================
honesty_summary <- market_pair %>%
  filter(reporting_airline %in% c("9E", "AA", "B6")) %>%
  mutate(
    time_buffer  = crs_elapsed_time - actual_elapsed_time,
    Carrier_Name = case_when(
      reporting_airline == "9E" ~ "Endeavor (Delta)",
      reporting_airline == "B6" ~ "JetBlue",
      reporting_airline == "AA" ~ "American",
      TRUE                      ~ reporting_airline
    )
  ) %>%
  group_by(reporting_airline, Carrier_Name) %>%
  summarise(
    Avg_Buffer         = round(mean(time_buffer, na.rm = TRUE),          1),
    On_Time_Rate       = round(mean(arr_delay <= 0, na.rm = TRUE) * 100, 1),
    Avg_Arrival_Delay  = round(mean(arr_delay, na.rm = TRUE),            1),
    .groups = "drop"
  ) %>%
  mutate(Strategy = case_when(
    Avg_Buffer >  8 ~ "Realistic (High Padding)",
    Avg_Buffer >  0 ~ "Balanced",
    TRUE            ~ "Optimistic (Risky)"
  )) %>%
  rename(
    `Airline Code`            = reporting_airline,
    `Carrier Name`            = Carrier_Name,
    `Avg Buffer (Mins)`       = Avg_Buffer,
    `On-Time Rate (%)`        = On_Time_Rate,
    `Avg Arrival Delay (Mins)`= Avg_Arrival_Delay
  )

kable(honesty_summary, format = "html",
      caption = "BWI-JFK Airline Scheduling Honesty Comparison") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE) %>%
  column_spec(3, bold = TRUE,  color = "white", background = "#2c3e50") %>%
  column_spec(6, italic = TRUE)



## Step 11 Market Share
NOTES:
Since 9E belongs to Delta, I wanted to see howm any flights they have out of BWI. Results show 79% of the market.

# ============================================================
# Step 11. Market Share — Delta vs. Others (BWI → JFK)
# ============================================================
delta_comparison <- market_pair %>%
  mutate(Carrier_Group = ifelse(reporting_airline %in% c("DL", "9E"),
                                "Delta / Endeavor", "Other Carriers")) %>%
  group_by(Carrier_Group) %>%
  summarise(Flight_Count = n(), .groups = "drop") %>%
  mutate(Percentage = round(Flight_Count / sum(Flight_Count) * 100, 1))

print(delta_comparison)

ggplot(delta_comparison, aes(x = "", y = Flight_Count, fill = Carrier_Group)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar("y", start = 0) +
  scale_fill_manual(values = c("#E01933", "#808080")) +
  geom_text(aes(label = paste0(Percentage, "%")),
            position = position_stack(vjust = 0.5), color = "white", size = 5) +
  labs(title    = "Market Share: Delta vs. All Other Carriers",
       subtitle = "BWI to JFK Route Analysis") +
  theme_void()


Step 12 Carrier Volume

NOTES:
Here I’m looking at all the flights, no mater where to and from. Counting the rows for each airline.

# ============================================================
# Step 12. Carrier Volume on all routes
# ============================================================
global_carrier_stats <- flights_raw %>%
  group_by(reporting_airline) %>%
  summarise(Total_Records = n(), .groups = "drop") %>%
  mutate(
    Global_Share = round(Total_Records / sum(Total_Records) * 100, 1),
    Carrier_Name = case_when(
      reporting_airline == "9E" ~ "Endeavor (Delta)",
      reporting_airline == "DL" ~ "Delta Mainline",
      reporting_airline == "AA" ~ "American",
      reporting_airline == "B6" ~ "JetBlue",
      reporting_airline == "WN" ~ "Southwest",
      reporting_airline == "UA" ~ "United",
      reporting_airline == "OO" ~ "SkyWest",
      reporting_airline == "YX" ~ "Republic",
      TRUE                      ~ reporting_airline
    )
  ) %>%
  arrange(desc(Total_Records))

print(head(global_carrier_stats, 10))

ggplot(head(global_carrier_stats, 10),
       aes(x = reorder(Carrier_Name, Total_Records), y = Total_Records, fill = Carrier_Name)) +
  geom_col() +
  coord_flip() +
  labs(title    = "Top 10 Carriers by Data Volume",
       subtitle = "Total records across all routes in 2019 dataset",
       x = "Airline", y = "Number of Records") +
  theme_minimal() +
  theme(legend.position = "none")



Step 13 Buffers and Scheduling

NOTES:
Since Southwest had the most volume, I will look at their buffer and scheduling.

# ============================================================
# Step 13. Southwest Buffers & Scheduling Strategy - all
# ============================================================
southwest_strategy <- flights_raw %>%
  filter(reporting_airline == "WN") %>%
  mutate(time_buffer = crs_elapsed_time - actual_elapsed_time) %>%
  summarise(
    Avg_Buffer    = round(mean(time_buffer,        na.rm = TRUE), 1),
    Median_Buffer = round(median(time_buffer,      na.rm = TRUE), 1),
    On_Time_Rate  = round(mean(arr_delay <= 0,     na.rm = TRUE) * 100, 1),
    Total_Flights = n()
  )

message("Global Southwest Strategy:")
print(southwest_strategy)

sw_padded_routes <- flights_raw %>%
  filter(reporting_airline == "WN") %>%
  mutate(time_buffer = crs_elapsed_time - actual_elapsed_time) %>%
  group_by(origin, dest) %>%
  summarise(
    Avg_Buffer   = round(mean(time_buffer, na.rm = TRUE), 1),
    Flight_Count = n(),
    .groups = "drop"
  ) %>%
  filter(Flight_Count > 50) %>%
  arrange(desc(Avg_Buffer)) %>%
  head(5)

kable(sw_padded_routes,
      caption = "Top 5 Southwest Routes with Most Buffer (Padding)") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))



Step 14 Comparision

NOTES:
which airport should traveler choose when going to NY from BWI, looking at time of the day what is the risk of a delay.
Need to check LGA next
Results: best time for both is 9am and between 6-7pm.
Worst times 4am to go to EWR, 10am for both, 1pm and 4pm to JFK, and anytime after 7pm to both.
JFK has the worst taxi-out time.

# ============================================================
# Step 14. Comparative Analysis: BWI → JFK vs. BWI → EWR
# ============================================================

# --- Create Comparative Subset ---
nyc_comparison <- flights_raw %>%
  filter(origin == "BWI", dest %in% c("JFK", "EWR")) %>%
  mutate(
    hour_of_day = dep_time %/% 100,
    is_delayed  = arr_delay > 15,
    time_buffer = crs_elapsed_time - actual_elapsed_time
  ) %>%
  filter(!is.na(arr_delay), !is.na(hour_of_day))

# --- Visualizing Delay Probability by Hour ---
# which NYC airport becomes "riskier" at which time
ggplot(nyc_comparison, aes(x = hour_of_day, y = as.numeric(is_delayed), color = dest)) +
  stat_summary(fun = mean, geom = "line", linewidth = 1) +
  stat_summary(fun = mean, geom = "point", size = 2) +
  scale_x_continuous(breaks = seq(0, 23, by = 2)) +
  scale_color_manual(values = c("JFK" = "#003594", "EWR" = "#D50032")) + # Airline-style colors
  labs(
    title = "BWI to NYC: Delay Probability by Departure Hour",
    subtitle = "Comparing Risk Levels for JFK vs. Newark (EWR)",
    x = "Hour of Day", y = "Probability of Delay (>15 min)",
    color = "Destination"
  ) +
  theme_minimal()

# --- Taxi-Out Comparison (Airport Congestion) ---
# which airport has worse tarmac congestion
ggplot(nyc_comparison, aes(x = dest, y = taxi_out, fill = dest)) +
  geom_boxplot(alpha = 0.7, outlier.shape = NA) +
  coord_cartesian(ylim = c(0, 60)) +
  labs(
    title = "Tarmac Times: JFK vs. EWR",
    subtitle = "Taxi-out duration for flights departing BWI",
    x = "Destination Airport", y = "Taxi-Out Minutes"
  ) +
  theme_minimal() +
  guides(fill = "none")

# --- Summary Table Comparison ---
nyc_summary_table <- nyc_comparison %>%
  group_by(dest) %>%
  summarise(
    Total_Flights   = n(),
    Avg_Arrival_Delay = round(mean(arr_delay, na.rm = TRUE), 1),
    On_Time_Rate     = round(mean(arr_delay <= 0, na.rm = TRUE) * 100, 1),
    Avg_Taxi_Out     = round(mean(taxi_out, na.rm = TRUE), 1),
    Avg_Buffer       = round(mean(time_buffer, na.rm = TRUE), 1)
  )

kable(nyc_summary_table, caption = "BWI to NYC Hubs: Side-by-Side Performance") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))



## Step 15 Reliability
NOTES:
checking which airline is most reliable from the dataset.
Looking at :
- on time rate;
- extreme delay rate;
- flierting by m=numnber of flights > 20.
ggplot - to visualize data
Results: 9E over 80% on time rate with average delay of less than 5 min, MQ over 85% on time.

# ============================================================
# Step 15. How reliable are : BWI → JFK/EWR 
# ============================================================

# --- Calculate Reliability Metrics ---
reliability_leaderboard <- nyc_comparison %>%
  group_by(reporting_airline, dest) %>%
  summarise(
    Flight_Count = n(),
    On_Time_Rate = mean(arr_delay <= 15, na.rm = TRUE) * 100,
    Avg_Delay_Mins = mean(arr_delay, na.rm = TRUE),
    Extreme_Delay_Rate = mean(arr_delay > 60, na.rm = TRUE) * 100, # % of flights > 1 hour late
    .groups = "drop"
  ) %>%
  filter(Flight_Count > 20) %>% # Filter for statistical significance
  arrange(desc(On_Time_Rate))

# --- Visualizing Reliability (On-Time vs. Severity) ---
ggplot(reliability_leaderboard, aes(x = On_Time_Rate, y = Avg_Delay_Mins, label = reporting_airline)) +
  geom_point(aes(size = Flight_Count, color = dest), alpha = 0.7) +
  geom_text(vjust = -1.2, check_overlap = TRUE) +
  scale_color_manual(values = c("JFK" = "#003594", "EWR" = "#D50032")) +
  labs(
    title = "Reliability Leaderboard: BWI to JFK & EWR",
    subtitle = "Top-Left = Most Reliable (High On-Time, Low Delay Severity)",
    x = "On-Time Arrival Rate (%)",
    y = "Average Arrival Delay (Minutes)",
    color = "Destination",
    size = "Volume"
  ) +
  theme_minimal()

# --- Recommendation Table ---
reliability_leaderboard %>%
  mutate(
    Reliability_Score = (On_Time_Rate / 10) - (Extreme_Delay_Rate / 5),
    Status = case_when(
      Reliability_Score > 8 ~ "Elite",
      Reliability_Score > 6 ~ "Stable",
      TRUE ~ "Volatile"
    )
  ) %>%
  select(reporting_airline, dest, On_Time_Rate, Extreme_Delay_Rate, Status) %>%
  kable(caption = "The BWI-NYC Reliability Index") %>%
  kable_styling(bootstrap_options = c("striped", "hover"))



## Step 16 Regression Models
NOTES:
Looking at the three airlines and best time to fly with them depending on their delay risk.

# ============================================================ 
# Step 16. Delay Risk Heatmap: airline X hour of the day 
# ============================================================ 

# --- Prepare Heatmap Data --- 
heatmap_data <- nyc_comparison %>% 
group_by(reporting_airline, hour_of_day) %>% 
summarise( 
delay_prob = mean(is_delayed, na.rm = TRUE), 
 flight_count = n(), 
.groups = "drop" 
 ) %>% 
# Filter out low-volume combinations to avoid "noise" 
 filter(flight_count > 5)  

# --- Create the Heatmap --- 
ggplot(heatmap_data, aes(x = factor(hour_of_day), y = reporting_airline, fill = delay_prob)) + 
   geom_tile(color = "white") + 
   scale_fill_gradientn( 
     colors = c("#e8f5e9", "#fff59d", "#ffcc80", "#e57373", "#b71c1c"), 
     labels = scales::percent, 
     name = "Delay Risk" 
   ) + 
   labs(
    title = "BWI-NYC Delay Risk Heatmap", 
    subtitle = "Probability of >15 min delay by Airline and Hour", 
    x = "Hour of Departure (24h)", 
     y = "Airline Code" 
   ) + 
   theme_minimal() + 
  theme( 
    panel.grid = element_blank(), 
     axis.text.x = element_text(angle = 0) 
   )