Flight Delay EDA Plots (from enriched parquet)

from pathlib import Path
import polars as pl
import matplotlib.pyplot as plt
import polars as pl 

BASE_DIR = Path.cwd()  
PARQUET_PATH = "/home/jon/Documents/grad_school/OR568/project/OR568_ML_Project/data_pipeline/outputs/enriched_flights_filtered.parquet"

OUT_DIR = BASE_DIR / "plots"
OUT_DIR.mkdir(exist_ok=True)

PARQUET_PATH, OUT_DIR

df = pl.read_parquet(PARQUET_PATH)

df.head()
shape: (5, 91)
FlightDate Tail_Number Origin OriginState OriginStateFips OriginWac Dest DestState DestStateFips DestWac CRSDepTime DepTime DepDelay DepDelayMinutes DepDel15 DepartureDelayGroups CRSArrTime ArrTime ArrDelay ArrDelayMinutes ArrDel15 ArrivalDelayGroups TaxiOut WheelsOff WheelsOn TaxiIn Cancelled CancellationCode Diverted CRSElapsedTime ActualElapsedTime AirTime Flights Distance DistanceGroup CarrierDelay WeatherDelay dep_ice_accretion_6hr dep_peak_wind_gust dep_peak_wind_drct dep_peak_wind_time dep_weather_severity dep_wx_intensity dep_wx_has_ra dep_wx_has_ts dep_wx_has_sn dep_wx_has_fg dep_wx_has_br dep_wx_has_hz arr_drct arr_sknt arr_p01i arr_vsby arr_gust arr_wxcodes arr_ice_accretion_1hr arr_ice_accretion_3hr arr_ice_accretion_6hr arr_peak_wind_gust arr_peak_wind_drct arr_peak_wind_time arr_weather_severity arr_wx_intensity arr_wx_has_ra arr_wx_has_ts arr_wx_has_sn arr_wx_has_fg arr_wx_has_br arr_wx_has_hz icao24 aircraft_model aircraft_type weight_category num_seats
str str str str i64 i64 str str i64 i64 i64 str str str str str i64 str str str str str str str str str i64 str i64 i64 str str i64 i64 i64 str str str str str str f64 str bool bool bool bool bool bool f64 f64 f64 f64 f64 str str str str str str str f64 str bool bool bool bool bool bool str str str str str
"2019-04-19" "N111US" "CLT" "NC" 37 36 "BWI" "MD" 24 35 1314 "NA" "NA" "NA" "NA" "NA" 1442 "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" "NA" 1 "B" 0 88 "NA" "NA" 1 361 2 "NA" "NA" null null null null 0.0 "M" false false false false false false null null null null null null null null null null null null 0.0 "M" false false false false false false "a031bd" "A320-214" "Fixed wing multi engine" "20,000 and over." "182.0"
"2019-01-05" "N111US" "CLT" "NC" 37 36 "BWI" "MD" 24 35 2051 "2114" "23" "23" "1" "1" 2212 "2233" "21" "21" "1" "1" "16" "2130" "2227" "6" 0 "NA" 0 81 "79" "57" 1 361 2 "0" "0" null null null null 0.0 "M" false false false false false false 80.0 6.0 0.0 10.0 null "" null null null null null null 1.2 "M" false false false false false false "a031bd" "A320-214" "Fixed wing multi engine" "20,000 and over." "182.0"
"2019-01-20" "N111US" "CLT" "NC" 37 36 "BWI" "MD" 24 35 941 "0945" "4" "4" "0" "0" 1117 "1048" "-29" "0" "0" "-2" "9" "0954" "1043" "5" 0 "NA" 0 96 "63" "49" 1 361 2 "NA" "NA" null null null null 0.0 "M" false false false false false false null 6.0 0.0 9.0 null "" null null null null null null 1.3 "M" false false false false false false "a031bd" "A320-214" "Fixed wing multi engine" "20,000 and over." "182.0"
"2019-01-29" "N111US" "CLT" "NC" 37 36 "BWI" "MD" 24 35 1130 "1123" "-7" "0" "0" "-1" 1256 "1234" "-22" "0" "0" "-2" "9" "1132" "1227" "7" 0 "NA" 0 86 "71" "55" 1 361 2 "NA" "NA" null null null null 0.0 "M" false false false false false false null 6.0 0.0 9.0 null "" null null null null null null 1.3 "M" false false false false false false "a031bd" "A320-214" "Fixed wing multi engine" "20,000 and over." "182.0"
"2019-02-22" "N111US" "CLT" "NC" 37 36 "BWI" "MD" 24 35 1130 "1126" "-4" "0" "0" "-1" 1259 "1238" "-21" "0" "0" "-2" "13" "1139" "1232" "6" 0 "NA" 0 89 "72" "53" 1 361 2 "NA" "NA" null null null null 0.0 "M" false false false false false false 320.0 12.0 0.0 10.0 24.0 "" null null null null null null 0.4 "M" false false false false false false "a031bd" "A320-214" "Fixed wing multi engine" "20,000 and over." "182.0"
BASE_DIR = Path.cwd()  
PARQUET_PATH = "/home/jon/Documents/grad_school/OR568/project/OR568_ML_Project/data_pipeline/data/final/bts_weather_faa_filtered_all_years.parquet"
OUT_DIR = BASE_DIR / "plots"
OUT_DIR.mkdir(exist_ok=True)

PARQUET_PATH, OUT_DIR

df = pl.read_parquet(PARQUET_PATH)
df.write_csv("/home/jon/Documents/grad_school/OR568/project/OR568_ML_Project/data_pipeline/outputs/bts_weather_faa_filtered_all_years.csv")
# --------------------------------------------------
# FEATURE GROUPS
# --------------------------------------------------

flight_id_cols = [
    "FlightDate",
    "Tail_Number",
    "icao24",
]

route_cols = [
    "Origin",
    "OriginState",
    "OriginStateFips",
    "OriginWac",
    "Dest",
    "DestState",
    "DestStateFips",
    "DestWac",
]

scheduled_time_cols = [
    "CRSDepTime",
    "CRSArrTime",
    "CRSElapsedTime",
    "dep_ts_sched",
    "arr_ts_sched",
]

actual_time_cols = [
    "DepTime",
    "ArrTime",
    "dep_ts_actual",
    "arr_ts_actual",
    "act_arr_date",
]

departure_delay_cols = [
    "DepDelay",
    "DepDelayMinutes",
    "DepDel15",
    "DepartureDelayGroups",
]

arrival_delay_cols = [
    "ArrDelay",
    "ArrDelayMinutes",
    "ArrDel15",
    "ArrivalDelayGroups",
]

flight_phase_cols = [
    "TaxiOut",
    "WheelsOff",
    "WheelsOn",
    "TaxiIn",
    "AirTime",
    "ActualElapsedTime",
]

flight_status_cols = [
    "Cancelled",
    "CancellationCode",
    "Diverted",
]

delay_cause_cols = [
    "CarrierDelay",
    "WeatherDelay",
    "NASDelay",
    "SecurityDelay",
    "LateAircraftDelay",
]

distance_cols = [
    "Flights",
    "Distance",
    "DistanceGroup",
]

weather_time_cols = [
    "crs_arr_date",
]

dep_weather_cols = [
    "dep_drct",
    "dep_sknt",
    "dep_p01i",
    "dep_vsby",
    "dep_gust",
    "dep_wxcodes",
    "dep_ice_accretion_1hr",
    "dep_ice_accretion_3hr",
    "dep_ice_accretion_6hr",
    "dep_peak_wind_gust",
    "dep_peak_wind_drct",
    "dep_peak_wind_time",
    "dep_weather_severity",
    "dep_wx_intensity",
    "dep_wx_has_ra",
    "dep_wx_has_ts",
    "dep_wx_has_sn",
    "dep_wx_has_fg",
    "dep_wx_has_br",
    "dep_wx_has_hz",
]

arr_weather_cols = [
    "arr_drct",
    "arr_sknt",
    "arr_p01i",
    "arr_vsby",
    "arr_gust",
    "arr_wxcodes",
    "arr_ice_accretion_1hr",
    "arr_ice_accretion_3hr",
    "arr_ice_accretion_6hr",
    "arr_peak_wind_gust",
    "arr_peak_wind_drct",
    "arr_peak_wind_time",
    "arr_weather_severity",
    "arr_wx_intensity",
    "arr_wx_has_ra",
    "arr_wx_has_ts",
    "arr_wx_has_sn",
    "arr_wx_has_fg",
    "arr_wx_has_br",
    "arr_wx_has_hz",
]

aircraft_cols = [
    "aircraft_model",
    "aircraft_type",
    "weight_category",
    "num_seats",
]


# --------------------------------------------------
# MASTER FEATURE DICTIONARY
# --------------------------------------------------

FEATURE_GROUPS = {
    "flight_ids": flight_id_cols,
    "route": route_cols,
    "scheduled_times": scheduled_time_cols,
    "actual_times": actual_time_cols,
    "departure_delay": departure_delay_cols,
    "arrival_delay": arrival_delay_cols,
    "flight_phase": flight_phase_cols,
    "flight_status": flight_status_cols,
    "delay_causes": delay_cause_cols,
    "distance": distance_cols,
    "weather_time": weather_time_cols,
    "dep_weather": dep_weather_cols,
    "arr_weather": arr_weather_cols,
    "aircraft": aircraft_cols,
}

Delay Groups

Delay Group Meaning
-2 < -15 minutes early
-1 -15 to -1 minutes
0 on time
1 1–14 minutes late
2 15–29 minutes
3 30–44 minutes
4 45–59 minutes
5 60–74 minutes
6 75–89 minutes
7 90–104 minutes
8 105–119 minutes
9 120+ minutes
import polars as pl
import matplotlib.pyplot as plt

# ------------------------------------------------
# Prepare data
# Assumes df has: FlightDate, Origin, Dest,
# DepartureDelayGroups, ArrivalDelayGroups
# ------------------------------------------------
d = (
    df.select([
        "FlightDate",
        "Origin",
        "Dest",
        "DepartureDelayGroups",
        "ArrivalDelayGroups"
    ])
    .drop_nulls()
    .with_columns([
        pl.col("FlightDate").str.strptime(pl.Date, strict=False).alias("FlightDate_parsed")
    ])
    .with_columns([
        pl.col("FlightDate_parsed").dt.year().alias("Year"),
        (pl.col("Origin") + pl.lit(" -> ") + pl.col("Dest")).alias("Route")
    ])
    .drop("FlightDate_parsed")
)

# ------------------------------------------------
# Count frequencies by Year + Route + Delay Group
# ------------------------------------------------
dep_counts = (
    d.group_by(["Year", "Route", "DepartureDelayGroups"])
     .count()
     .sort(["Year", "Route", "DepartureDelayGroups"])
)

arr_counts = (
    d.group_by(["Year", "Route", "ArrivalDelayGroups"])
     .count()
     .sort(["Year", "Route", "ArrivalDelayGroups"])
)

# ------------------------------------------------
# Get all year-route combinations
# ------------------------------------------------
year_routes = (
    d.select(["Year", "Route"])
     .unique()
     .sort(["Year", "Route"])
     .to_dicts()
)

# ------------------------------------------------
# Plot Departure Delay Groups for each Year + Route
# ------------------------------------------------
for yr_rt in year_routes:
    year = yr_rt["Year"]
    route = yr_rt["Route"]

    dep_subset = (
        dep_counts
        .filter(
            (pl.col("Year") == year) &
            (pl.col("Route") == route)
        )
        .sort("DepartureDelayGroups")
    )

    if dep_subset.height > 0:
        dep_pd = dep_subset.to_pandas()

        plt.figure(figsize=(10, 5))
        plt.bar(dep_pd["DepartureDelayGroups"], dep_pd["count"])
        plt.title(f"Departure Delay Group Distribution\n{route} | {year}")
        plt.xlabel("Departure Delay Group")
        plt.ylabel("Number of Flights")
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

# ------------------------------------------------
# Plot Arrival Delay Groups for each Year + Route
# ------------------------------------------------
for yr_rt in year_routes:
    year = yr_rt["Year"]
    route = yr_rt["Route"]

    arr_subset = (
        arr_counts
        .filter(
            (pl.col("Year") == year) &
            (pl.col("Route") == route)
        )
        .sort("ArrivalDelayGroups")
    )

    if arr_subset.height > 0:
        arr_pd = arr_subset.to_pandas()

        plt.figure(figsize=(10, 5))
        plt.bar(arr_pd["ArrivalDelayGroups"], arr_pd["count"])
        plt.title(f"Arrival Delay Group Distribution\n{route} | {year}")
        plt.xlabel("Arrival Delay Group")
        plt.ylabel("Number of Flights")
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()
/tmp/ipykernel_73374/1810928616.py:33: DeprecationWarning: `GroupBy.count` was renamed; use `GroupBy.len` instead
  .count()
/tmp/ipykernel_73374/1810928616.py:39: DeprecationWarning: `GroupBy.count` was renamed; use `GroupBy.len` instead
  .count()

import polars as pl

# ------------------------------------------------
# Prepare data
# ------------------------------------------------
d = (
    df.select([
        "FlightDate",
        "Origin",
        "Dest",
        "DepDelayMinutes",
        "ArrDelayMinutes",
    ])
    .with_columns([
        pl.col("FlightDate").str.strptime(pl.Date, strict=False).alias("FlightDate_parsed")
    ])
    .with_columns([
        pl.col("FlightDate_parsed").dt.year().alias("Year"),
        (pl.col("Origin") + pl.lit(" -> ") + pl.col("Dest")).alias("Route"),
        (
            (pl.col("DepDelayMinutes").fill_null(0) > 15) |
            (pl.col("ArrDelayMinutes").fill_null(0) > 15)
        ).alias("IsDelayed")
    ])
    .drop("FlightDate_parsed")
)

# ------------------------------------------------
# Aggregate by Year and Route
# ------------------------------------------------
route_delay_summary = (
    d.group_by(["Year", "Route"])
    .agg([
        pl.len().alias("TotalFlights"),
        pl.col("IsDelayed").sum().alias("DelayedFlights"),
        (pl.col("DepDelayMinutes").fill_null(0) > 15).sum().alias("DepDelayedFlights"),
        (pl.col("ArrDelayMinutes").fill_null(0) > 15).sum().alias("ArrDelayedFlights"),
        pl.col("DepDelayMinutes").fill_null(0).mean().alias("AvgDepDelayMinutes"),
        pl.col("ArrDelayMinutes").fill_null(0).mean().alias("AvgArrDelayMinutes"),
    ])
    .with_columns([
        (pl.col("DelayedFlights") / pl.col("TotalFlights") * 100).alias("DelayRatePct")
    ])
    .sort(["Year", "DelayedFlights"], descending=[False, True])
)

# ------------------------------------------------
# Rank routes within each year
# ------------------------------------------------
ranked_routes = (
    route_delay_summary
    .with_columns([
        pl.col("DelayedFlights")
        .rank(method="dense", descending=True)
        .over("Year")
        .alias("DelayRankWithinYear")
    ])
    .sort(["Year", "DelayRankWithinYear", "Route"])
)

# ------------------------------------------------
# Optional: round numeric columns for readability
# ------------------------------------------------
ranked_routes = ranked_routes.with_columns([
    pl.col("DelayRatePct").round(2),
    pl.col("AvgDepDelayMinutes").round(2),
    pl.col("AvgArrDelayMinutes").round(2),
])

# ------------------------------------------------
# Show full ranked table in Jupyter
# ------------------------------------------------
ranked_routes
shape: (8, 10)
Year Route TotalFlights DelayedFlights DepDelayedFlights ArrDelayedFlights AvgDepDelayMinutes AvgArrDelayMinutes DelayRatePct DelayRankWithinYear
i32 str u32 u32 u32 u32 f64 f64 f64 u32
2018 "BWI -> ATL" 6880 1365 1056 1123 11.42 11.94 19.84 1
2018 "ATL -> BWI" 6886 1342 1221 1002 12.22 10.76 19.49 2
2018 "BWI -> EWR" 850 319 246 291 22.25 25.25 37.53 3
2018 "EWR -> BWI" 849 197 189 148 16.35 13.28 23.2 4
2019 "BWI -> ATL" 6812 1252 1007 998 10.98 10.79 18.38 1
2019 "ATL -> BWI" 6861 1231 1127 913 10.95 9.12 17.94 2
2019 "BWI -> EWR" 59 27 26 21 29.19 27.49 45.76 3
2019 "EWR -> BWI" 60 20 20 17 19.53 18.03 33.33 4

Delays over time of day

import polars as pl
import matplotlib.pyplot as plt

# ------------------------------------------------
# Robust types: CRSDepTime -> hour, DepDelayMinutes -> numeric
# ------------------------------------------------
df2 = df.with_columns([
    # dep_hour from CRSDepTime (handles strings like "0545", ints, floats, blanks)
    (
        pl.col("CRSDepTime")
          .cast(pl.Utf8)
          .str.replace_all(r"[^0-9]", "")
          .replace("", None)
          .cast(pl.Int32)
          .fill_null(0)
          .floordiv(100)
          .cast(pl.Int32)
          .alias("dep_hour")
    ),

    # DepDelayMinutes to float (handles strings/blanks); null if not parseable
    (
        pl.col("DepDelayMinutes")
          .cast(pl.Utf8)
          .str.replace_all(r"[^0-9\.\-]", "")
          .replace("", None)
          .cast(pl.Float64)
          .alias("DepDelayMinutes_num")
    ),
])

# ------------------------------------------------
# Time-of-day labels (readable)
# ------------------------------------------------
df2 = df2.with_columns(
    pl.when(pl.col("dep_hour") < 6)
      .then(pl.lit("Night (12AM–6AM)"))
      .when(pl.col("dep_hour") < 12)
      .then(pl.lit("Morning (6AM–12PM)"))
      .when(pl.col("dep_hour") < 18)
      .then(pl.lit("Midday (12PM–6PM)"))
      .otherwise(pl.lit("Evening (6PM–12AM)"))
      .alias("time_of_day")
)

# ------------------------------------------------
# Delays > 15 minutes distribution across time of day
# ------------------------------------------------
order = ["Night (12AM–6AM)", "Morning (6AM–12PM)", "Midday (12PM–6PM)", "Evening (6PM–12AM)"]

delay_counts = (
    df2.filter(pl.col("DepDelayMinutes_num").is_not_null())
       .filter(pl.col("DepDelayMinutes_num") > 15)
       .group_by("time_of_day")
       .agg(pl.len().alias("count"))
)

# Ensure all categories appear (even if count=0), and in a nice order
delay_counts = (
    pl.DataFrame({"time_of_day": order})
      .join(delay_counts, on="time_of_day", how="left")
      .with_columns(pl.col("count").fill_null(0).cast(pl.Int64))
)

print(delay_counts)

# ------------------------------------------------
# Plot
# ------------------------------------------------
pdf = delay_counts.to_pandas()

plt.figure(figsize=(10, 6))
plt.bar(pdf["time_of_day"], pdf["count"])
plt.title("Departure delays > 15 minutes by time of day")
plt.xlabel("Time of day")
plt.ylabel("Number of flights")
plt.xticks(rotation=20, ha="right")
plt.tight_layout()
plt.show()
shape: (4, 2)
┌────────────────────┬───────┐
│ time_of_day        ┆ count │
│ ---                ┆ ---   │
│ str                ┆ i64   │
╞════════════════════╪═══════╡
│ Night (12AM–6AM)   ┆ 15    │
│ Morning (6AM–12PM) ┆ 954   │
│ Midday (12PM–6PM)  ┆ 1799  │
│ Evening (6PM–12AM) ┆ 2124  │
└────────────────────┴───────┘

import polars as pl
import matplotlib.pyplot as plt

# ------------------------------------------------
# Robust types:
# - FlightDate -> Year
# - CRSDepTime -> dep_hour
# - DepDelayMinutes -> numeric
# - Route = Origin -> Dest
# ------------------------------------------------
df2 = df.with_columns([
    # Year from FlightDate
    (
        pl.col("FlightDate")
          .cast(pl.Utf8)
          .str.strptime(pl.Date, strict=False)
          .dt.year()
          .alias("Year")
    ),

    # Route label
    (
        pl.col("Origin").cast(pl.Utf8) + pl.lit(" -> ") + pl.col("Dest").cast(pl.Utf8)
    ).alias("Route"),

    # dep_hour from CRSDepTime
    (
        pl.col("CRSDepTime")
          .cast(pl.Utf8)
          .str.replace_all(r"[^0-9]", "")
          .replace("", None)
          .cast(pl.Int32)
          .fill_null(0)
          .floordiv(100)
          .cast(pl.Int32)
          .alias("dep_hour")
    ),

    # DepDelayMinutes numeric
    (
        pl.col("DepDelayMinutes")
          .cast(pl.Utf8)
          .str.replace_all(r"[^0-9\.\-]", "")
          .replace("", None)
          .cast(pl.Float64)
          .alias("DepDelayMinutes_num")
    ),
])

# ------------------------------------------------
# Time-of-day labels
# ------------------------------------------------
df2 = df2.with_columns(
    pl.when(pl.col("dep_hour") < 6)
      .then(pl.lit("Night (12AM–6AM)"))
      .when(pl.col("dep_hour") < 12)
      .then(pl.lit("Morning (6AM–12PM)"))
      .when(pl.col("dep_hour") < 18)
      .then(pl.lit("Midday (12PM–6PM)"))
      .otherwise(pl.lit("Evening (6PM–12AM)"))
      .alias("time_of_day")
)

# ------------------------------------------------
# Ordered categories
# ------------------------------------------------
order = [
    "Night (12AM–6AM)",
    "Morning (6AM–12PM)",
    "Midday (12PM–6PM)",
    "Evening (6PM–12AM)"
]

# ------------------------------------------------
# Filter to delayed departures > 15 min
# ------------------------------------------------
delays = (
    df2.filter(pl.col("Year").is_not_null())
       .filter(pl.col("Route").is_not_null())
       .filter(pl.col("DepDelayMinutes_num").is_not_null())
       .filter(pl.col("DepDelayMinutes_num") > 15)
)

# ------------------------------------------------
# Count delays by Year + Route + time_of_day
# ------------------------------------------------
delay_counts = (
    delays.group_by(["Year", "Route", "time_of_day"])
          .agg(pl.len().alias("count"))
)

print(delay_counts)

# ------------------------------------------------
# Get all Year + Route combos
# ------------------------------------------------
year_route_pairs = (
    delays.select(["Year", "Route"])
          .unique()
          .sort(["Year", "Route"])
          .to_dicts()
)

# ------------------------------------------------
# Plot one chart per Year + Route
# ------------------------------------------------
for pair in year_route_pairs:
    year = pair["Year"]
    route = pair["Route"]

    subset = (
        delay_counts
        .filter((pl.col("Year") == year) & (pl.col("Route") == route))
        .select(["time_of_day", "count"])
    )

    # ensure all four time buckets appear
    plot_df = (
        pl.DataFrame({"time_of_day": order})
          .join(subset, on="time_of_day", how="left")
          .with_columns(pl.col("count").fill_null(0).cast(pl.Int64))
    )

    pdf = plot_df.to_pandas()

    plt.figure(figsize=(10, 6))
    plt.bar(pdf["time_of_day"], pdf["count"])
    plt.title(f"Departure delays > 15 minutes by time of day\n{route} | {year}")
    plt.xlabel("Time of day")
    plt.ylabel("Number of delayed flights")
    plt.xticks(rotation=20, ha="right")
    plt.tight_layout()
    plt.show()
shape: (25, 4)
┌──────┬────────────┬────────────────────┬───────┐
│ Year ┆ Route      ┆ time_of_day        ┆ count │
│ ---  ┆ ---        ┆ ---                ┆ ---   │
│ i32  ┆ str        ┆ str                ┆ u32   │
╞══════╪════════════╪════════════════════╪═══════╡
│ 2019 ┆ BWI -> EWR ┆ Evening (6PM–12AM) ┆ 18    │
│ 2019 ┆ BWI -> ATL ┆ Night (12AM–6AM)   ┆ 1     │
│ 2019 ┆ ATL -> BWI ┆ Evening (6PM–12AM) ┆ 539   │
│ 2018 ┆ ATL -> BWI ┆ Evening (6PM–12AM) ┆ 626   │
│ 2018 ┆ ATL -> BWI ┆ Midday (12PM–6PM)  ┆ 379   │
│ …    ┆ …          ┆ …                  ┆ …     │
│ 2019 ┆ BWI -> ATL ┆ Morning (6AM–12PM) ┆ 241   │
│ 2018 ┆ EWR -> BWI ┆ Midday (12PM–6PM)  ┆ 66    │
│ 2018 ┆ BWI -> EWR ┆ Evening (6PM–12AM) ┆ 75    │
│ 2018 ┆ BWI -> ATL ┆ Midday (12PM–6PM)  ┆ 420   │
│ 2018 ┆ BWI -> EWR ┆ Morning (6AM–12PM) ┆ 4     │
└──────┴────────────┴────────────────────┴───────┘

Delays by month

import polars as pl
import matplotlib.pyplot as plt

# ------------------------------------------------
# Convert FlightDate to date and extract month
# ------------------------------------------------
df2 = df.with_columns(
    pl.col("FlightDate")
      .cast(pl.Utf8)
      .str.to_date(strict=False)
      .dt.month()
      .alias("month")
)

# ------------------------------------------------
# Ensure DepDelayMinutes is numeric
# ------------------------------------------------
df2 = df2.with_columns(
    pl.col("DepDelayMinutes")
      .cast(pl.Utf8)
      .str.replace_all(r"[^0-9\.\-]", "")
      .replace("", None)
      .cast(pl.Float64)
      .alias("delay")
)

# ------------------------------------------------
# Filter delays > 15 minutes and count by month
# ------------------------------------------------
delay_month = (
    df2.filter(pl.col("delay") > 15)
       .group_by("month")
       .count()
       .sort("month")
)

print(delay_month)

# ------------------------------------------------
# Convert to pandas for plotting
# ------------------------------------------------
plot_df = delay_month.to_pandas()

# Month labels
month_labels = [
    "Jan","Feb","Mar","Apr","May","Jun",
    "Jul","Aug","Sep","Oct","Nov","Dec"
]

# ------------------------------------------------
# Plot distribution
# ------------------------------------------------
plt.figure(figsize=(12,6))

plt.bar(
    plot_df["month"],
    plot_df["count"]
)

plt.xticks(range(1,13), month_labels)

plt.title("Flights with Departure Delays > 15 Minutes by Month")
plt.xlabel("Month")
plt.ylabel("Number of Delayed Flights")

plt.tight_layout()
plt.show()
/tmp/ipykernel_15558/1624462220.py:33: DeprecationWarning: `GroupBy.count` was renamed; use `GroupBy.len` instead
  .count()
shape: (12, 2)
┌───────┬───────┐
│ month ┆ count │
│ ---   ┆ ---   │
│ i8    ┆ u32   │
╞═══════╪═══════╡
│ 1     ┆ 2337  │
│ 2     ┆ 2730  │
│ 3     ┆ 2636  │
│ 4     ┆ 3155  │
│ 5     ┆ 3801  │
│ …     ┆ …     │
│ 8     ┆ 3922  │
│ 9     ┆ 1766  │
│ 10    ┆ 2679  │
│ 11    ┆ 2587  │
│ 12    ┆ 4008  │
└───────┴───────┘

import matplotlib.pyplot as plt

plot_df = (
    df.select("DepDelayMinutes")
      .drop_nulls()
      .to_pandas()
)

plt.figure(figsize=(10,5))
plt.hist(plot_df["DepDelayMinutes"], bins=80)
plt.title("Departure Delay Distribution")
plt.xlabel("Delay (minutes)")
plt.ylabel("Count")
plt.xlim(-20, 200)
plt.show()

df_clean = (
    df.with_columns(
        pl.col("FlightDate").str.to_date("%Y-%m-%d")
    )
    .sort("FlightDate")   # ← REQUIRED
)

# ---------------------------------------------
# WEEKLY AGGREGATION
# ---------------------------------------------
weekly_flights = (
    df_clean.group_by_dynamic(
        index_column="FlightDate",
        every="1w",
        period="1w"
    )
    .agg(pl.len().alias("num_flights"))
)

# ---------------------------------------------
# PLOT
# ---------------------------------------------
plt.figure(figsize=(14,6))
plt.plot(
    weekly_flights["FlightDate"],
    weekly_flights["num_flights"]
)

plt.title("Total Flights per Week (2019)")
plt.xlabel("Week")
plt.ylabel("Number of Flights")
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

origin_state_counts = (
    df.group_by("OriginState")
      .agg(pl.len().alias("num_flights"))
      .sort("num_flights", descending=True)
      .head(20)   # top states for readability
)

# ---------------------------------------------
# PLOT
# ---------------------------------------------
plt.figure(figsize=(10,6))
plt.barh(
    origin_state_counts["OriginState"],
    origin_state_counts["num_flights"]
)

plt.gca().invert_yaxis()  # highest on top
plt.title("Top Origin States by Number of Flights (2019)")
plt.xlabel("Number of Flights")
plt.ylabel("State")
plt.tight_layout()
plt.show()
---------------------------------------------------------------------------
ColumnNotFoundError                       Traceback (most recent call last)
Cell In[49], line 3
      1 origin_state_counts = (
      2     df.group_by("OriginState")
----> 3       .agg(pl.len().alias("num_flights"))
      4       .sort("num_flights", descending=True)
      5       .head(20)   # top states for readability
      6 )
      8 # ---------------------------------------------
      9 # PLOT
     10 # ---------------------------------------------
     11 plt.figure(figsize=(10,6))

File ~/anaconda3/envs/or568_ml_project/lib/python3.11/site-packages/polars/dataframe/group_by.py:298, in GroupBy.agg(self, *aggs, **named_aggs)
    195 """
    196 Compute aggregations for each group of a group by operation.
    197 
   (...)    291 └─────┴───────┴────────────────┘
    292 """
    293 from polars.lazyframe.opt_flags import QueryOptFlags
    295 return (
    296     self._lgb()
    297     .agg(*aggs, **named_aggs)
--> 298     .collect(optimizations=QueryOptFlags.none())
    299 )

File ~/anaconda3/envs/or568_ml_project/lib/python3.11/site-packages/polars/_utils/deprecation.py:97, in deprecate_streaming_parameter.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
     93         kwargs["engine"] = "in-memory"
     95     del kwargs["streaming"]
---> 97 return function(*args, **kwargs)

File ~/anaconda3/envs/or568_ml_project/lib/python3.11/site-packages/polars/lazyframe/opt_flags.py:326, in forward_old_opt_flags.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
    323         optflags = cb(optflags, kwargs.pop(key))  # type: ignore[no-untyped-call,unused-ignore]
    325 kwargs["optimizations"] = optflags
--> 326 return function(*args, **kwargs)

File ~/anaconda3/envs/or568_ml_project/lib/python3.11/site-packages/polars/lazyframe/frame.py:2440, in LazyFrame.collect(self, type_coercion, predicate_pushdown, projection_pushdown, simplify_expression, slice_pushdown, comm_subplan_elim, comm_subexpr_elim, cluster_with_columns, collapse_joins, no_optimization, engine, background, optimizations, **_kwargs)
   2438 # Only for testing purposes
   2439 callback = _kwargs.get("post_opt_callback", callback)
-> 2440 return wrap_df(ldf.collect(engine, callback))

ColumnNotFoundError: unable to find column "OriginState"; valid columns: ["FlightDate", "Reporting_Airline", "Origin", "Dest", "Tail_Number", "DepDelay", "ArrDelay", "DepDelayMinutes", "ArrDelayMinutes", "Cancelled", "Diverted", "dep_ts_sched", "dep_ts_actual", "arr_ts_sched", "arr_ts_actual", "dep_weather_severity", "arr_weather_severity", "dep_tmpf", "dep_vsby", "dep_sknt", "dep_gust", "dep_p01i", "dep_skyl1", "arr_tmpf", "arr_vsby", "arr_sknt", "arr_gust", "arr_p01i", "arr_skyl1", "dep_wxcodes", "dep_wx_intensity", "dep_wx_has_ra", "dep_wx_has_ts", "dep_wx_has_sn", "dep_wx_has_fg", "dep_wx_has_br", "dep_wx_has_hz", "arr_wxcodes", "arr_wx_intensity", "arr_wx_has_ra", "arr_wx_has_ts", "arr_wx_has_sn", "arr_wx_has_fg", "arr_wx_has_br", "arr_wx_has_hz"]

Resolved plan until failure:

    ---> FAILED HERE RESOLVING 'sink' <---
DF ["FlightDate", "Reporting_Airline", "Origin", "Dest", ...]; PROJECT */45 COLUMNS
def savefig(name: str):
    path = OUT_DIR / f"{name}.png"
    plt.tight_layout()
    plt.savefig(path, dpi=150)
    print("Saved:", path)
    plt.show()

def to_float_np(series: pl.Series):
    return series.cast(pl.Float64, strict=False).drop_nulls().to_numpy()

1) Arrival Delay Distribution (minutes)

if "ArrDelayMinutes" in df.columns:
    arr = to_float_np(df["ArrDelayMinutes"])
    plt.figure(figsize=(8,5))
    plt.hist(arr, bins=60)
    plt.title("Arrival Delay Distribution (minutes)")
    plt.xlabel("Delay Minutes")
    plt.ylabel("Frequency")
else:
    print("ArrDelayMinutes not found")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[14], line 2
      1 if "ArrDelayMinutes" in df.columns:
----> 2     arr = to_float_np(df["ArrDelayMinutes"])
      3     plt.figure(figsize=(8,5))
      4     plt.hist(arr, bins=60)

NameError: name 'to_float_np' is not defined

2) Delay Classification View (≥ 15 minutes)

if "ArrDelayMinutes" in df.columns:
    delayed = (
        df.select(
            (pl.col("ArrDelayMinutes").cast(pl.Float64, strict=False).fill_null(0) >= 15)
            .alias("Delayed15")
        )
        .group_by("Delayed15")
        .agg(pl.len().alias("count"))
        .sort("Delayed15")
    )

    plt.figure(figsize=(6,4))
    plt.bar(delayed["Delayed15"].cast(pl.Utf8).to_list(), delayed["count"].to_list())
    plt.title("Flights Delayed ≥ 15 minutes")
    plt.xlabel("Delayed")
    plt.ylabel("Count")
else:
    print("ArrDelayMinutes not found")

3) Top Airlines by Mean Arrival Delay

if {"Reporting_Airline", "ArrDelayMinutes"}.issubset(set(df.columns)):
    airline_stats = (
        df.lazy()
        .group_by("Reporting_Airline")
        .agg(pl.col("ArrDelayMinutes").cast(pl.Float64, strict=False).mean().alias("mean_delay"))
        .sort("mean_delay", descending=True)
        .limit(15)
        .collect()
    )

    plt.figure(figsize=(9,5))
    plt.bar(airline_stats["Reporting_Airline"].to_list(), airline_stats["mean_delay"].to_list())
    plt.xticks(rotation=45, ha="right")
    plt.title("Top Airlines by Mean Arrival Delay")
    plt.ylabel("Mean Delay (min)")
    savefig("carrier_mean_delay")
else:
    print("Reporting_Airline or ArrDelayMinutes not found")
Reporting_Airline or ArrDelayMinutes not found

4) Departure Weather Severity vs Arrival Delay

if {"dep_weather_severity", "ArrDelayMinutes"}.issubset(set(df.columns)):
    sample = (
        df.select([
            pl.col("dep_weather_severity").cast(pl.Float64, strict=False),
            pl.col("ArrDelayMinutes").cast(pl.Float64, strict=False),
        ])
        .drop_nulls()
        .sample(min(10_000, df.height), seed=0)
    )

    plt.figure(figsize=(7,5))
    plt.scatter(sample["dep_weather_severity"].to_list(), sample["ArrDelayMinutes"].to_list(), alpha=0.3)
    plt.title("Departure Weather Severity vs Arrival Delay")
    plt.xlabel("Departure Weather Severity")
    plt.ylabel("Arrival Delay (min)")
    savefig("weather_vs_delay")
else:
    print("dep_weather_severity or ArrDelayMinutes not found")
Saved: /home/jon/Documents/grad_school/OR568/project/OR568_ML_Project/ml-shared-notebooks/notebooks/jon/python/plots/weather_vs_delay.png

5) Mean Arrival Delay vs Weather Severity Bucket

if {"dep_weather_severity", "ArrDelayMinutes"}.issubset(set(df.columns)):
    bucketed = (
        df.lazy()
        .with_columns(pl.col("dep_weather_severity").cast(pl.Float64, strict=False).floor().alias("wx_bucket"))
        .group_by("wx_bucket")
        .agg(pl.col("ArrDelayMinutes").cast(pl.Float64, strict=False).mean().alias("mean_delay"))
        .sort("wx_bucket")
        .collect()
    )

    plt.figure(figsize=(8,5))
    plt.plot(bucketed["wx_bucket"].to_list(), bucketed["mean_delay"].to_list(), marker="o")
    plt.title("Mean Arrival Delay vs Weather Severity")
    plt.xlabel("Weather Severity Bucket")
    plt.ylabel("Mean Arrival Delay (min)")
    savefig("delay_vs_weather_bucket")
else:
    print("dep_weather_severity or ArrDelayMinutes not found")
Saved: /home/jon/Documents/grad_school/OR568/project/OR568_ML_Project/ml-shared-notebooks/notebooks/jon/python/plots/delay_vs_weather_bucket.png

6) Destinations with Highest Mean Arrival Delay

if {"Dest", "ArrDelayMinutes"}.issubset(set(df.columns)):
    top_dest = (
        df.lazy()
        .group_by("Dest")
        .agg(pl.col("ArrDelayMinutes").cast(pl.Float64, strict=False).mean().alias("mean_delay"))
        .sort("mean_delay", descending=True)
        .limit(15)
        .collect()
    )

    plt.figure(figsize=(9,5))
    plt.bar(top_dest["Dest"].to_list(), top_dest["mean_delay"].to_list())
    plt.xticks(rotation=45, ha="right")
    plt.title("Destinations with Highest Mean Arrival Delay")
    plt.ylabel("Mean Delay (min)")
    savefig("destinations_delay")
else:
    print("Dest or ArrDelayMinutes not found")
Saved: /home/jon/Documents/grad_school/OR568/project/OR568_ML_Project/ml-shared-notebooks/notebooks/jon/python/plots/destinations_delay.png

Done

Plots are saved to the plots/ directory as PNG files.