obtained from https://www.transtats.bts.gov/DL_SelectFields.asp
on Sunday, 8-Sept-2019
hdf5
¶import os
import glob
from zipfile import ZipFile
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 500)
import vaex
from tqdm import tqdm_notebook as tqdm
# Columns to read
columns = ['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime', 'ArrTime', 'CRSArrTime',
'Reporting_Airline', 'Flight_Number_Reporting_Airline', 'Tail_Number',
'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay', 'DepDelay', 'Origin', 'Dest',
'Distance', 'TaxiIn', 'TaxiOut', 'Cancelled', 'CancellationCode',
'Diverted', 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay']
# Force dtypes for these columns - some choices are made to optimize disk space
dtypes = {
'Year': np.int16,
'Month': np.int8,
'DayofMonth': np.int16,
'DayOfWeek': np.int8,
'DepTime': 'Int16',
'CRSDepTime': 'Int16',
'ArrTime': 'Int16',
'CRSArrTime': 'Int16',
'Reporting_Airline': np.object,
'Flight_Number_Reporting_Airline': np.int32,
'Tail_Number': np.object,
'ActualElapsedTime': 'Int32',
'CRSElapsedTime': 'Int32',
'AirTime': 'Int32',
'ArrDelay': 'Int32',
'DepDelay': 'Int32',
'Origin': np.object,
'Dest': np.object,
'Distance': 'Int32',
'TaxiIn': 'Int32',
'TaxiOut': 'Int32',
'Cancelled': 'Int8',
'CancellationCode': np.object,
'Diverted': 'Int8',
'CarrierDelay': 'Int32',
'WeatherDelay': 'Int32',
'NASDelay': 'Int32',
'SecurityDelay': 'Int32',
'LateAircraftDelay': 'Int32',
}
# Set up a renaming dictionary, in order to make the column names to match the well known data from
# http://stat-computing.org/dataexpo/2009/the-data.html
rename_dict = {
'DayofMonth': 'DayOfMonth',
'Reporting_Airline': 'UniqueCarrier',
'Flight_Number_Reporting_Airline': 'FlightNum',
'Tail_Number': 'TailNum'
}
# Set up the list of zip files to be opened and converted
zip_list = np.sort(np.array(glob.glob('./airlines-us-original/raw/*.zip')))[::-1]
# The output directory
output_dir = './airlines-us-original/hdf5/'
# The magic happens heree:
for file in tqdm(zip_list, leave=False, desc='Converting to hdf5...'):
# Setting up the files, and directories
zip_file = ZipFile(file)
output_file = file.split('/')[-1][:-3]+'hdf5'
output = output_dir + output_file
# Check if a converted file already exists: if it does skip it, otherwise read in the raw csv and convert it
if (os.path.exists(output) and os.path.isfile(output)):
pass
else:
# Importing the data into pandas
pandas_df = [pd.read_csv(zip_file.open(text_file.filename),
encoding='latin',
usecols=columns,
dtype=dtypes,)
for text_file in zip_file.infolist()
if text_file.filename.endswith('.csv')][0]
# Rename some columns to match the more well known dataset from
# http://stat-computing.org/dataexpo/2009/the-data.html
pandas_df.rename(columns=rename_dict, inplace=True)
# Importing the data from pandas to vaex
vaex_df = vaex.from_pandas(pandas_df, copy_index=False)
# Export the data with vaex to hdf5
vaex_df.export_hdf5(path=output, progress=False)
HBox(children=(IntProgress(value=0, description='Converting to hdf5...', max=372, style=ProgressStyle(descript…
Notes:
Here we go:
Note that macOS has a rather low limit on the number of files one can open at one time. To circumvent this issue, in a terminal run:
>ulimit -n 9999
It should be the same terminal from which later the jupyter server is started.
import re
import glob
import vaex
import numpy as np
def tryint(s):
try:
return int(s)
except:
return s
def alphanum_key(s):
""" Turn a string into a list of string and number chunks.
"z23a" -> ["z", 23, "a"]
"""
return [ tryint(c) for c in re.split('([0-9]+)', s) ]
hdf5_list = glob.glob('./airlines-us-original/hdf5/*.hdf5')
hdf5_list.sort(key=alphanum_key)
hdf5_list = np.array(hdf5_list)
assert len(hdf5_list) == 372, "Incorrect number of files"
# This is an important step
master_df = vaex.open_many(hdf5_list)
# exporting
master_df.export_hdf5(path='./airline_data_1988_2018.hd5', progress=True)
[########################################]: 100.00% estimated time: 0s = 0.0m = 0.0h
# Check how the single file looks like:
df = vaex.open('./airline_data_1988_2018.hd5')
df
# | Year | Month | DayOfMonth | DayOfWeek | UniqueCarrier | TailNum | FlightNum | Origin | Dest | CRSDepTime | DepTime | DepDelay | TaxiOut | TaxiIn | CRSArrTime | ArrTime | ArrDelay | Cancelled | CancellationCode | Diverted | CRSElapsedTime | ActualElapsedTime | AirTime | Distance | CarrierDelay | WeatherDelay | NASDelay | SecurityDelay | LateAircraftDelay |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1988 | 1 | 8 | 5 | PI | None | 930 | BGM | ITH | 1525 | 1532 | 7 | -- | -- | 1545 | 1555 | 10 | 0 | None | 0 | 20 | 23 | -- | 32 | -- | -- | -- | -- | -- |
1 | 1988 | 1 | 9 | 6 | PI | None | 930 | BGM | ITH | 1525 | 1522 | -3 | -- | -- | 1545 | 1535 | -10 | 0 | None | 0 | 20 | 13 | -- | 32 | -- | -- | -- | -- | -- |
2 | 1988 | 1 | 10 | 7 | PI | None | 930 | BGM | ITH | 1525 | 1522 | -3 | -- | -- | 1545 | 1534 | -11 | 0 | None | 0 | 20 | 12 | -- | 32 | -- | -- | -- | -- | -- |
3 | 1988 | 1 | 11 | 1 | PI | None | 930 | BGM | ITH | 1525 | -- | -- | -- | -- | 1545 | -- | -- | 1 | None | 0 | 20 | -- | -- | 32 | -- | -- | -- | -- | -- |
4 | 1988 | 1 | 12 | 2 | PI | None | 930 | BGM | ITH | 1525 | 1524 | -1 | -- | -- | 1545 | 1540 | -5 | 0 | None | 0 | 20 | 16 | -- | 32 | -- | -- | -- | -- | -- |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
183,821,921 | 2018 | 12 | 27 | 4 | DL | N705TW | 864 | JFK | SLC | 935 | 929 | -6 | 28 | 9 | 1311 | 1230 | -41 | 0 | None | 0 | 336 | 301 | 264 | 1990 | -- | -- | -- | -- | -- |
183,821,922 | 2018 | 12 | 27 | 4 | DL | N336NB | 865 | MSP | SLC | 1035 | 1030 | -5 | 14 | 10 | 1240 | 1214 | -26 | 0 | None | 0 | 185 | 164 | 140 | 991 | -- | -- | -- | -- | -- |
183,821,923 | 2018 | 12 | 27 | 4 | DL | N945DN | 866 | DEN | MSP | 1054 | 1100 | 6 | 10 | 5 | 1353 | 1350 | -3 | 0 | None | 0 | 119 | 110 | 95 | 680 | -- | -- | -- | -- | -- |
183,821,924 | 2018 | 12 | 27 | 4 | DL | N945DN | 866 | MSP | DEN | 850 | 850 | 0 | 15 | 8 | 1010 | 951 | -19 | 0 | None | 0 | 140 | 121 | 98 | 680 | -- | -- | -- | -- | -- |
183,821,925 | 2018 | 12 | 27 | 4 | DL | N901DE | 867 | ATL | SYR | 1455 | 1500 | 5 | 19 | 3 | 1703 | 1703 | 0 | 0 | None | 0 | 128 | 123 | 101 | 794 | -- | -- | -- | -- | -- |