from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import sklearn
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import preprocessing
# Load the flight records. index_col=False tells pandas not to promote the
# first CSV column to the row index.
# NOTE: DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
# pd.read_csv is the supported replacement.
air_raw = pd.read_csv("allyears_tiny.csv", index_col=False)
print(air_raw.head())

# Attach one uniform draw from [0, 1) per row. The draw is unseeded, so the
# values (and anything derived from them) differ from run to run.
air_raw['RandNum'] = np.random.uniform(size=len(air_raw))
print(air_raw.head())
Year Month DayofMonth DayOfWeek DepTime CRSDepTime ArrTime \ 0 1987 10 14 3 741 730 912 1 1987 10 15 4 729 730 903 2 1987 10 17 6 741 730 918 3 1987 10 18 7 729 730 847 4 1987 10 19 1 749 730 922 CRSArrTime UniqueCarrier FlightNum ... Cancelled \ 0 849 PS 1451 ... 0 1 849 PS 1451 ... 0 2 849 PS 1451 ... 0 3 849 PS 1451 ... 0 4 849 PS 1451 ... 0 CancellationCode Diverted CarrierDelay WeatherDelay NASDelay \ 0 NaN 0 NaN NaN NaN 1 NaN 0 NaN NaN NaN 2 NaN 0 NaN NaN NaN 3 NaN 0 NaN NaN NaN 4 NaN 0 NaN NaN NaN SecurityDelay LateAircraftDelay IsArrDelayed IsDepDelayed 0 NaN NaN YES YES 1 NaN NaN YES NO 2 NaN NaN YES YES 3 NaN NaN NO NO 4 NaN NaN YES YES [5 rows x 31 columns] Year Month DayofMonth DayOfWeek DepTime CRSDepTime ArrTime \ 0 1987 10 14 3 741 730 912 1 1987 10 15 4 729 730 903 2 1987 10 17 6 741 730 918 3 1987 10 18 7 729 730 847 4 1987 10 19 1 749 730 922 CRSArrTime UniqueCarrier FlightNum ... CancellationCode Diverted \ 0 849 PS 1451 ... NaN 0 1 849 PS 1451 ... NaN 0 2 849 PS 1451 ... NaN 0 3 849 PS 1451 ... NaN 0 4 849 PS 1451 ... NaN 0 CarrierDelay WeatherDelay NASDelay SecurityDelay LateAircraftDelay \ 0 NaN NaN NaN NaN NaN 1 NaN NaN NaN NaN NaN 2 NaN NaN NaN NaN NaN 3 NaN NaN NaN NaN NaN 4 NaN NaN NaN NaN NaN IsArrDelayed IsDepDelayed RandNum 0 YES YES 0.193944 1 YES NO 0.466327 2 YES YES 0.943457 3 NO NO 0.232673 4 YES YES 0.133799 [5 rows x 32 columns]
# Start the modelling frame with the per-row random number and a 0/1 target.
air_mapped = DataFrame()
air_mapped['RandNum'] = air_raw['RandNum']
# Vectorised encoding: 1 where IsDepDelayed == 'YES', 0 for anything else
# (including missing values). This replaces a row-wise apply() and the
# temporary 'IsDepDelayed' column that had to be added and then deleted.
air_mapped['IsDepDelayedInt'] = (air_raw['IsDepDelayed'] == 'YES').astype(int)
print(air_mapped.shape)
# Binarize the 'Origin' labels into 0/1 indicator columns.
lb_origin = preprocessing.LabelBinarizer()
tmp_origin = lb_origin.fit_transform(air_raw['Origin'])
# Give the indicator columns unique string labels ('Origin_0', 'Origin_1', ...)
# instead of bare integers, so they cannot collide with other encoded blocks
# when the frames are concatenated and so all column names share one type.
tmp_origin_df = DataFrame(tmp_origin).add_prefix('Origin_')
print(tmp_origin_df.shape)
# Binarize the 'Dest' labels into 0/1 indicator columns.
lb_dest = preprocessing.LabelBinarizer()
# Transform with the binarizer fitted on 'Dest'. The previous code fitted
# lb_dest but then transformed with lb_origin, so destinations were encoded
# against the origin classes.
tmp_dest = lb_dest.fit_transform(air_raw['Dest'])
# Unique string labels ('Dest_0', 'Dest_1', ...) instead of bare integers, so
# the columns stay distinguishable after concatenation with other blocks.
tmp_dest_df = DataFrame(tmp_dest).add_prefix('Dest_')
print(tmp_dest_df.shape)
# Binarize the 'UniqueCarrier' labels into 0/1 indicator columns.
lb_uniquecarrier = preprocessing.LabelBinarizer()
# Transform with the binarizer fitted on 'UniqueCarrier'. The previous code
# fitted lb_uniquecarrier but then transformed with lb_origin, so carrier
# codes were encoded against the origin classes.
tmp_uniquecarrier = lb_uniquecarrier.fit_transform(air_raw['UniqueCarrier'])
# Unique string labels ('UniqueCarrier_0', ...) instead of bare integers, so
# the columns stay distinguishable after concatenation with other blocks.
tmp_uniquecarrier_df = DataFrame(tmp_uniquecarrier).add_prefix('UniqueCarrier_')
print(tmp_uniquecarrier_df.shape)
# Assemble the modelling frame column-wise: the random number and target
# already in air_mapped, the encoded Origin / Dest / UniqueCarrier blocks, and
# the raw numeric columns. axis=1 joins the pieces on their row index.
air_mapped = pd.concat(
    [
        air_mapped,
        tmp_origin_df,
        tmp_dest_df,
        air_raw['Distance'],
        tmp_uniquecarrier_df,
        air_raw[['Month', 'DayofMonth', 'DayOfWeek']],
    ],
    axis=1,
)
print(air_mapped.shape)
# 'air' is a second name for the same frame (no copy is made).
air = air_mapped
(999, 2) (999, 10) (999, 10) (999, 10) (999, 36)
# Split rows on the per-row random number: RandNum <= 0.8 goes to training,
# RandNum > 0.9 goes to test. Rows in (0.8, 0.9] are left out; the validation
# split for that band is currently disabled.
# NOTE: the .ix indexer was removed in pandas 1.0; .loc with a boolean mask
# selects the same rows.
air_train = air.loc[air['RandNum'] <= 0.8]
# air_valid = air.loc[(air['RandNum'] > 0.8) & (air['RandNum'] <= 0.9)]
air_test = air.loc[air['RandNum'] > 0.9]
print(air_train.shape)
print(air_test.shape)
(824, 36) (91, 36)
# Training features: every column except the random split number and the
# target. drop() returns a new frame, so air_train itself is not modified.
X_train = air_train.drop(['RandNum', 'IsDepDelayedInt'], axis=1)
print(list(X_train.columns.values))
print(X_train.shape)

# Training target: the 0/1 departure-delay flag.
y_train = air_train['IsDepDelayedInt']
print(y_train.shape)
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 'Distance', 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 'Month', 'DayofMonth', 'DayOfWeek'] (824, 34) (824,)
# Build the gradient-boosting classifier (10 trees of depth 3, learning rate
# 0.01) and fit it on the training split; fit() returns the estimator itself.
clf = GradientBoostingClassifier(
    n_estimators=10,
    max_depth=3,
    learning_rate=0.01,
).fit(X_train, y_train)
GradientBoostingClassifier(init=None, learning_rate=0.01, loss='deviance', max_depth=3, max_features=None, max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2, n_estimators=10, random_state=None, subsample=1.0, verbose=0, warm_start=False)
# Test features: same column selection as for training (everything except the
# random split number and the target). drop() returns a new frame, so air_test
# itself is not modified.
X_test = air_test.drop(['RandNum', 'IsDepDelayedInt'], axis=1)
print(list(X_test.columns.values))
print(X_test.shape)

# Banner framed by blank lines, then the predicted class for each test row.
for banner_line in ("", "--- PREDICTIONS ---", ""):
    print(banner_line)
pred = clf.predict(X_test)
print(pred)
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 'Distance', 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 'Month', 'DayofMonth', 'DayOfWeek'] (91, 34) --- PREDICTIONS --- [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]