Tip: Menu "Cell -> Rul all" to see actual contents of this notebook
import re
import os.path as path
import colorlover as cl
import numpy as np
import ipywidgets as widgets
import pandas as pd
from IPython.display import display, Markdown
import plotly.offline as offline
import plotly.graph_objs as go
import plotly.tools as tools
offline.init_notebook_mode(connected=True)
def make_data_path(dataset_name, ext='csv'):
dataset_name = re.sub(r'[^a-zA-Z0-9_-]', '_', dataset_name)
dataset_name = re.sub(r'_+', '_', dataset_name)
dataset_name = re.sub(r'(^_+|_+$)', '', dataset_name)
f = lambda ex: path.join('.', 'data', dataset_name + '.' + ex).lower()
if ext is None:
for ext in ['csv', 'csv.gz']:
if path.isfile(f(ext)):
return f(ext)
ext = 'csv'
return f(ext)
$Pay(dis, ppl) = \left[\left(\frac{1}{ppl}\right)^{0.3} \cdot 500 \cdot \left(\frac{arctan(dis/25)}{arctan(1)}\right)\right] \cdot ppl \cdot dis$
def pay_per_mile_per_unit(distance, units):
maxfare = 1000
crit_distance = 25
return (
((1/units)**0.3)
* (maxfare / 2)
* (np.arctan(distance/crit_distance) / np.arctan(1))
) / distance
distances = np.linspace(1, 400, 100)
@widgets.interact(units=widgets.IntSlider(value=1, min=1, max=27, description='Passengers'))
def plot_fare(units):
fig = tools.make_subplots(
rows=1, cols=2,
subplot_titles=('Pay per mile per passenger', 'Pay per passenger'),
print_grid=False)
fig.append_trace(go.Scatter(
x=distances,
y=pay_per_mile_per_unit(distances, units) * distances,
line=dict(
shape='spline',
),
name='pay/p',
), row=1, col=1)
fig.append_trace(go.Scatter(
x=distances,
y=pay_per_mile_per_unit(distances, units),
line=dict(
shape='spline',
simplify=False,
),
name='pay/mn/p',
), row=1, col=2)
fig['layout']['xaxis1'].update(title='Distance (nm)', rangemode='tozero')
fig['layout']['yaxis1'].update(title='Pay (v$)', range=[0, 1000])
fig['layout']['xaxis2'].update(title='Distance (nm)', rangemode='tozero')
fig['layout']['yaxis2'].update(title='Pay (v$)', range=[0, 30])
fig['layout'].update(title='{} passenger(s) per assignment'.format(units))
offline.iplot(fig)
Preliminary conclusions:
# Load and per-filter all data
def _flight_logs(model):
return make_data_path('Flight logs - {}'.format(model), ext=None)
def _fliter_outliers_stddev(data, label_or_series, width=5):
series = data[label_or_series] if type(
label_or_series) == str else label_or_series
return data[np.abs(series - series.mean()) < width*series.std()]
# Load available models
models = pd.read_csv(make_data_path('aircraft models'))
models = models[models['MakeModel'].map(
lambda mm: path.isfile(_flight_logs(mm)))]
all_flight_logs = {}
for model in models['MakeModel']:
display(Markdown('### Loading ' + model))
# Load flight logs for selected model and fix data types
flight_logs = pd.read_csv(_flight_logs(model), dtype={})
flight_logs['FlightTime'] = pd.to_timedelta(flight_logs['FlightTime'])
flight_logs['Time'] = pd.to_datetime(flight_logs['Time'])
print('Initially loaded flight records:', len(flight_logs))
# Filter out obvious garbage
flight_logs = flight_logs[flight_logs.Distance > 0]
flight_logs = flight_logs[flight_logs.FlightTime > np.timedelta64(0, 'm')]
print('After garbage filtering:', len(flight_logs))
# Derived columns
flight_logs['FlightTimePerDistance'] = flight_logs['FlightTime'] / \
flight_logs['Distance']
# Filter out outliers
flight_logs = _fliter_outliers_stddev(flight_logs, 'Distance')
flight_logs = _fliter_outliers_stddev(flight_logs, 'FlightTime')
flight_logs = _fliter_outliers_stddev(
flight_logs, flight_logs['FlightTime'] / flight_logs['Distance'], width=6)
print('After outliers filtering:', len(flight_logs))
all_flight_logs[model] = flight_logs
def select_model_widget(desc='Model:'):
return widgets.Dropdown(options=models['MakeModel'], value='Cessna 172 Skyhawk', description=desc)
def _hist_color_scale(colors):
return [[float(i)/float(len(colors)-1), colors[i]] for i in range(len(colors))]
def _plots(data, x, y, func=None, time_units=np.timedelta64(1, 'm'), sample=400):
colorscale = _hist_color_scale(cl.scales['4']['seq']['BuPu'])
data_sampled = data.sample(n=sample, random_state=1)
traces = []
traces.append(go.Scatter(
x=data_sampled[x],
y=data_sampled[y] / time_units,
mode='markers',
opacity=0.3,
marker=dict(color='purple', size=3),
showlegend=False,
))
traces.append(go.Histogram2d(
x=data[x],
y=data[y] / time_units,
histnorm="probability",
showscale=False,
colorscale=colorscale,
))
if func is not None:
func_x = np.linspace(0, data[x].max(), 100)
func_y = func(func_x)
traces.append(go.Scatter(
x=func_x,
y=func_y,
opacity=0.5,
line=dict(color='black', width=2),
showlegend=False,
))
return traces
def _fit(data, x, y, deg=3, time_units=np.timedelta64(1, 'm')):
return np.poly1d(np.polyfit(x=data[x], y=data[y]/time_units, deg=deg))
layout = go.Layout(
xaxis=dict(
rangemode='tozero',
),
yaxis=dict(
rangemode='tozero',
),
)
def fit_flight_time(model):
return _fit(all_flight_logs[model], 'Distance', 'FlightTime')
def fit_flight_time_per_distance(model):
return _fit(all_flight_logs[model], 'Distance', 'FlightTimePerDistance')
@widgets.interact(model=select_model_widget())
def analyze_flight_time(model):
flight_logs = all_flight_logs[model]
flight_time_per_distance = fit_flight_time_per_distance(model)
flight_time = fit_flight_time(model)
fig = tools.make_subplots(rows=1, cols=2, print_grid=False)
[fig.append_trace(t, 1, 1) for t in _plots(
flight_logs, x='Distance', y='FlightTime', func=flight_time)]
[fig.append_trace(t, 1, 2) for t in _plots(
flight_logs, x='Distance', y='FlightTimePerDistance', func=flight_time_per_distance)]
fig['layout']['xaxis1'].update(
title='Distance (nm)', rangemode='nonnegative')
fig['layout']['yaxis1'].update(
title='Flight Time (min)', rangemode='nonnegative')
fig['layout']['xaxis2'].update(
title='Distance (nm)', rangemode='nonnegative')
fig['layout']['yaxis2'].update(
title='Flight Time per Distance (min/nm)', rangemode='nonnegative')
fig['layout'].update(title=model)
offline.iplot(fig)
Conclusions:
Let's take assignment pay from the first section and flight time estimation from the second and estimate pay her hour for different aircraft and flight distances. Here we make following assumptions:
Then we simply plot following formula: $pay\_per\_hour(distance) = \frac{pay\_per\_distance\_per\_passenger(distance, 1) \cdot distance \cdot plane\_capacity}{flight\_time(distance)}$
We use three different ways to estimate flight time (to cross-check ourselves):
def pay_per_hour(model, distance, loading_time_mins=10, mode='regression'):
distance = pd.Series(distance)
capacity = models[models['MakeModel'] == model]['Seats'].iloc[0]
capacity = min(capacity, 27*2)
total_pay = pay_per_mile_per_unit(
distance=distance, units=1) * capacity * distance
if mode == 'regression':
flight_time_hours = fit_flight_time(model)(distance) / 60
elif mode == 'regression_normalized':
flight_time_hours = (fit_flight_time_per_distance(
model)(distance) * distance) / 60
elif mode == 'stat':
flight_logs = all_flight_logs[model]
flight_time_hours = pd.Series([flight_logs[np.abs(flight_logs.Distance - d) < 5].FlightTime.mean() / np.timedelta64(1, 'h')
for d in distance])
elif mode == 'cruise':
cruise = models[models['MakeModel'] == model]['CruiseSpeed'].iloc[0]
flight_time_hours = distance / cruise
else:
raise ValueError('Unknown flight time estimation mode: ' + mode)
flight_time_hours += loading_time_mins / 60
return total_pay / flight_time_hours
@widgets.interact(
model_l=select_model_widget(desc='Model (left):'),
model_r=select_model_widget(desc='Model (right):'),
loading_time=widgets.IntSlider(desc='Loading time (mins):', min=0, max=60, value=15, continuous_update=False))
def plot_pay_per_hour(model_l, model_r, loading_time):
def plot_one(model, name):
traces = []
for mode in ['regression', 'stat', 'cruise']:
x = np.linspace(0, 300, 100)
y = pay_per_hour(model, x, mode=mode,
loading_time_mins=loading_time)
traces.append(go.Scatter(
x=x,
y=y,
name='{} ({})'.format(mode, name),
))
return traces
fig = tools.make_subplots(
rows=1, cols=2,
print_grid=False,
shared_yaxes=True,
subplot_titles=(model_l, model_r),
)
[fig.append_trace(t, row=1, col=1) for t in plot_one(model_l, 'left')]
[fig.append_trace(t, row=1, col=2) for t in plot_one(model_r, 'right')]
fig['layout']['xaxis1'].update(title='Distance (nm)')
fig['layout']['yaxis1'].update(title='Pay per hour (v$/h)')
fig['layout']['xaxis2'].update(title='Distance (nm)')
fig['layout']['yaxis2'].update(title='Pay per hour (v$/h)')
offline.iplot(fig)
Conclusions: