from pathlib import Path
import sys
import importlib
import polars as pl
# Relax polars' table-printing limits so long and wide frames are shown
# without truncation: up to 1000 rows, up to 100 columns, and 200 characters
# of table width. Each setter returns the Config class, which is what makes
# the chained form possible.
(
    pl.Config
    .set_tbl_rows(1000)
    .set_tbl_cols(100)
    .set_tbl_width_chars(200)
)
# Search the working directory and then each of its ancestors for the
# directory named "shared-notebooks". The for/else raises only when the walk
# reaches the filesystem root without a match; on success `current` is left
# bound to the matching directory.
start_dir = Path.cwd()
for current in (start_dir, *start_dir.parents):
    if current.name == "shared-notebooks":
        break
else:
    raise RuntimeError("Could not locate shared-notebooks directory")

# Make the helper modules under common_utils/python importable. The path is
# appended only once so re-running this code does not pile up duplicates.
utils_path = current / "common_utils" / "python"
if str(utils_path) not in sys.path:
    sys.path.append(str(utils_path))
# This import can only be resolved after the sys.path extension performed
# earlier in this script, which is why it is not at the top of the file.
import load_flight_data

# Re-execute the module so that edits made to it since the first import take
# effect in the current interpreter session.
importlib.reload(load_flight_data)

# Load the flight data and pass it straight through the module's
# feature-engineering step; only the engineered frame is kept, as `df`.
df = load_flight_data.engineer_features(load_flight_data.load_flight_data())
# Columns carried into the working frame, in output order.
# NOTE(review): the group headings below are inferred from the column names
# only; confirm their meaning against load_flight_data before relying on them.
work_columns = [
    # Calendar fields
    "year", "quarter", "month", "dayof_month", "day_of_week", "flight_date",
    # Carrier / aircraft / flight identifiers
    "reporting_airline", "tail_number", "flight_number_reporting_airline",
    # Route endpoints and distance
    "origin", "dest", "distance",
    # Departure timing and delay
    "crs_dep_time", "dep_time", "dep_delay", "dep_delay_minutes", "dep_del15",
    # Taxi and wheels events
    "taxi_out", "wheels_off", "wheels_on", "taxi_in",
    # Arrival timing and delay
    "crs_arr_time", "arr_time", "arr_delay", "arr_delay_minutes", "arr_del15",
    # Flight outcome and durations
    "cancelled", "diverted",
    "crs_elapsed_time", "actual_elapsed_time", "air_time",
    # dep_* weather fields (presumably observations at the origin -- confirm)
    "dep_tmpf", "dep_dwpf", "dep_relh", "dep_drct", "dep_sknt", "dep_p01i",
    "dep_alti", "dep_mslp", "dep_vsby", "dep_gust",
    "dep_skyc1", "dep_skyl1", "dep_wxcodes",
    "dep_peak_wind_gust", "dep_peak_wind_drct", "dep_weather_severity",
    # Schedule-derived fields
    "sched_dep_hour", "sched_arr_hour", "hour_of_day", "is_weekend", "route",
    "is_delayed", "delay_state",
    "sched_dep_min", "sched_arr_min", "dep_min", "arr_min",
    "schedule_buffer",
    # prev_* fields (presumably the aircraft's preceding leg -- confirm)
    "prev_origin", "prev_dest", "prev_dep_delay", "prev_arr_delay",
    "prev_arr_min", "prev_dep_min", "prev_flight_date",
    # Per-aircraft rotation and cumulative delay fields
    "rotation_leg_number", "flights_per_aircraft_day",
    "cum_dep_delay_day", "cum_arr_delay_day",
    "curr_sched_dep_abs_min", "prev_arr_abs_min", "turnaround_minutes",
    "inherited_delay",
    # Weather condition fields
    "bad_visibility", "high_wind", "precipitation", "severe_weather",
]

# Narrow the engineered frame to exactly the columns listed above.
df_work = df.select(work_columns)

# Write the first 10 rows to a CSV in the current working directory.
sample = df_work.head(10)
sample.write_csv("sample_data.csv")