FSEconomy: most profitable assignments

Tip: Menu "Cell -> Rul all" to see actual contents of this notebook

In [ ]:
import re
import os.path as path

import colorlover as cl
import numpy as np
import ipywidgets as widgets
import pandas as pd
from IPython.display import display, Markdown

import plotly.offline as offline
import plotly.graph_objs as go
import plotly.tools as tools

offline.init_notebook_mode(connected=True)

def make_data_path(dataset_name, ext='csv'):
    dataset_name = re.sub(r'[^a-zA-Z0-9_-]', '_', dataset_name)
    dataset_name = re.sub(r'_+', '_', dataset_name)
    dataset_name = re.sub(r'(^_+|_+$)', '', dataset_name)
    f = lambda ex: path.join('.', 'data', dataset_name + '.' + ex).lower()
    if ext is None:
        for ext in ['csv', 'csv.gz']:
            if path.isfile(f(ext)):
                return f(ext)
        
        ext = 'csv'
    return f(ext)

Passenger fare: distance, number of people within a ticket

$Pay(dis, ppl) = \left[\left(\frac{1}{ppl}\right)^{0.3} \cdot 500 \cdot \left(\frac{arctan(dis/25)}{arctan(1)}\right)\right] \cdot ppl \cdot dis$

In [ ]:
def pay_per_mile_per_unit(distance, units):
    maxfare = 1000
    crit_distance = 25
    return (
        ((1/units)**0.3)
        * (maxfare / 2)
        * (np.arctan(distance/crit_distance) / np.arctan(1))
    ) / distance


distances = np.linspace(1, 400, 100)


@widgets.interact(units=widgets.IntSlider(value=1, min=1, max=27, description='Passengers'))
def plot_fare(units):
    fig = tools.make_subplots(
        rows=1, cols=2,
        subplot_titles=('Pay per mile per passenger', 'Pay per passenger'),
        print_grid=False)

    fig.append_trace(go.Scatter(
        x=distances,
        y=pay_per_mile_per_unit(distances, units) * distances,
        line=dict(
            shape='spline',
        ),
        name='pay/p',
    ), row=1, col=1)

    fig.append_trace(go.Scatter(
        x=distances,
        y=pay_per_mile_per_unit(distances, units),
        line=dict(
            shape='spline',
            simplify=False,
        ),
        name='pay/mn/p',
    ), row=1, col=2)

    fig['layout']['xaxis1'].update(title='Distance (nm)', rangemode='tozero')
    fig['layout']['yaxis1'].update(title='Pay (v$)', range=[0, 1000])
    fig['layout']['xaxis2'].update(title='Distance (nm)', rangemode='tozero')
    fig['layout']['yaxis2'].update(title='Pay (v$)', range=[0, 30])
    fig['layout'].update(title='{} passenger(s) per assignment'.format(units))

    offline.iplot(fig)

Preliminary conclusions:

  1. One ticket per person! Group discounts for 3 persons is ~30%, we don't want that.
  2. Ticket price levels out beyond 100 nm. Flhing further than 150nm doesn't bring noticably more money.

Flight time estimation

In [ ]:
# Load and per-filter all data


def _flight_logs(model):
    return make_data_path('Flight logs - {}'.format(model), ext=None)


def _fliter_outliers_stddev(data, label_or_series, width=5):
    series = data[label_or_series] if type(
        label_or_series) == str else label_or_series
    return data[np.abs(series - series.mean()) < width*series.std()]


# Load available models
models = pd.read_csv(make_data_path('aircraft models'))
models = models[models['MakeModel'].map(
    lambda mm: path.isfile(_flight_logs(mm)))]

all_flight_logs = {}

for model in models['MakeModel']:
    display(Markdown('### Loading ' + model))
    # Load flight logs for selected model and fix data types
    flight_logs = pd.read_csv(_flight_logs(model), dtype={})
    flight_logs['FlightTime'] = pd.to_timedelta(flight_logs['FlightTime'])
    flight_logs['Time'] = pd.to_datetime(flight_logs['Time'])
    print('Initially loaded flight records:', len(flight_logs))

    # Filter out obvious garbage
    flight_logs = flight_logs[flight_logs.Distance > 0]
    flight_logs = flight_logs[flight_logs.FlightTime > np.timedelta64(0, 'm')]
    print('After garbage filtering:', len(flight_logs))

    # Derived columns
    flight_logs['FlightTimePerDistance'] = flight_logs['FlightTime'] / \
        flight_logs['Distance']

    # Filter out outliers
    flight_logs = _fliter_outliers_stddev(flight_logs, 'Distance')
    flight_logs = _fliter_outliers_stddev(flight_logs, 'FlightTime')
    flight_logs = _fliter_outliers_stddev(
        flight_logs, flight_logs['FlightTime'] / flight_logs['Distance'], width=6)
    print('After outliers filtering:', len(flight_logs))

    all_flight_logs[model] = flight_logs
In [ ]:
def select_model_widget(desc='Model:'):
    return widgets.Dropdown(options=models['MakeModel'], value='Cessna 172 Skyhawk', description=desc)


def _hist_color_scale(colors):
    return [[float(i)/float(len(colors)-1), colors[i]] for i in range(len(colors))]


def _plots(data, x, y, func=None, time_units=np.timedelta64(1, 'm'), sample=400):
    colorscale = _hist_color_scale(cl.scales['4']['seq']['BuPu'])
    data_sampled = data.sample(n=sample, random_state=1)

    traces = []
    traces.append(go.Scatter(
        x=data_sampled[x],
        y=data_sampled[y] / time_units,
        mode='markers',
        opacity=0.3,
        marker=dict(color='purple', size=3),
        showlegend=False,
    ))
    traces.append(go.Histogram2d(
        x=data[x],
        y=data[y] / time_units,
        histnorm="probability",
        showscale=False,
        colorscale=colorscale,
    ))
    if func is not None:
        func_x = np.linspace(0, data[x].max(), 100)
        func_y = func(func_x)
        traces.append(go.Scatter(
            x=func_x,
            y=func_y,
            opacity=0.5,
            line=dict(color='black', width=2),
            showlegend=False,
        ))
    return traces


def _fit(data, x, y, deg=3, time_units=np.timedelta64(1, 'm')):
    return np.poly1d(np.polyfit(x=data[x], y=data[y]/time_units, deg=deg))


layout = go.Layout(
    xaxis=dict(
        rangemode='tozero',
    ),
    yaxis=dict(
        rangemode='tozero',
    ),
)


def fit_flight_time(model):
    return _fit(all_flight_logs[model], 'Distance', 'FlightTime')


def fit_flight_time_per_distance(model):
    return _fit(all_flight_logs[model], 'Distance', 'FlightTimePerDistance')


@widgets.interact(model=select_model_widget())
def analyze_flight_time(model):
    flight_logs = all_flight_logs[model]
    flight_time_per_distance = fit_flight_time_per_distance(model)
    flight_time = fit_flight_time(model)

    fig = tools.make_subplots(rows=1, cols=2, print_grid=False)
    [fig.append_trace(t, 1, 1) for t in _plots(
        flight_logs, x='Distance', y='FlightTime', func=flight_time)]
    [fig.append_trace(t, 1, 2) for t in _plots(
        flight_logs, x='Distance', y='FlightTimePerDistance', func=flight_time_per_distance)]

    fig['layout']['xaxis1'].update(
        title='Distance (nm)', rangemode='nonnegative')
    fig['layout']['yaxis1'].update(
        title='Flight Time (min)', rangemode='nonnegative')
    fig['layout']['xaxis2'].update(
        title='Distance (nm)', rangemode='nonnegative')
    fig['layout']['yaxis2'].update(
        title='Flight Time per Distance (min/nm)', rangemode='nonnegative')
    fig['layout'].update(title=model)
    offline.iplot(fig)

Conclusions:

  • Vast majority of flights is under 200 nm for GA planes.
  • Flight time per mile is higher for shorter flights, which is expected.
  • Curious that it's not very visible on the normal graph.

Flight profit per hour

Let's take assignment pay from the first section and flight time estimation from the second and estimate pay her hour for different aircraft and flight distances. Here we make following assumptions:

  1. All assignments are single-person. Having more people per assignment only reduces pay, so we won't be doing this (in ideal world, at least).
  2. We are able to fill selected airplane to its full capacity. Making this happen in real world is a whole another can of worms, but we're considering an ideal case for the time being.
  3. There is some time overhead for setting up the filght. For experienced FSE people it would be lower, for noobs like myself - higher.

Then we simply plot following formula: $pay\_per\_hour(distance) = \frac{pay\_per\_distance\_per\_passenger(distance, 1) \cdot distance \cdot plane\_capacity}{flight\_time(distance)}$

We use three different ways to estimate flight time (to cross-check ourselves):

  1. Cruise speed: ideal case when we perform all the flight at the specified cruise speed.
  2. Statistical: average flight time for the plane for distances +/- 5nm.
  3. Polynomial approximation: perform polynomial regression of flight time by distance data we have and use that polynomial to calculate flight time.
In [ ]:
def pay_per_hour(model, distance, loading_time_mins=10, mode='regression'):
    distance = pd.Series(distance)
    capacity = models[models['MakeModel'] == model]['Seats'].iloc[0]
    capacity = min(capacity, 27*2)
    total_pay = pay_per_mile_per_unit(
        distance=distance, units=1) * capacity * distance
    if mode == 'regression':
        flight_time_hours = fit_flight_time(model)(distance) / 60
    elif mode == 'regression_normalized':
        flight_time_hours = (fit_flight_time_per_distance(
            model)(distance) * distance) / 60
    elif mode == 'stat':
        flight_logs = all_flight_logs[model]
        flight_time_hours = pd.Series([flight_logs[np.abs(flight_logs.Distance - d) < 5].FlightTime.mean() / np.timedelta64(1, 'h')
                                       for d in distance])
    elif mode == 'cruise':
        cruise = models[models['MakeModel'] == model]['CruiseSpeed'].iloc[0]
        flight_time_hours = distance / cruise
    else:
        raise ValueError('Unknown flight time estimation mode: ' + mode)
    flight_time_hours += loading_time_mins / 60
    return total_pay / flight_time_hours


@widgets.interact(
    model_l=select_model_widget(desc='Model (left):'),
    model_r=select_model_widget(desc='Model (right):'),
    loading_time=widgets.IntSlider(desc='Loading time (mins):', min=0, max=60, value=15, continuous_update=False))
def plot_pay_per_hour(model_l, model_r, loading_time):
    def plot_one(model, name):
        traces = []
        for mode in ['regression', 'stat', 'cruise']:
            x = np.linspace(0, 300, 100)
            y = pay_per_hour(model, x, mode=mode,
                             loading_time_mins=loading_time)
            traces.append(go.Scatter(
                x=x,
                y=y,
                name='{} ({})'.format(mode, name),
            ))
        return traces

    fig = tools.make_subplots(
        rows=1, cols=2,
        print_grid=False,
        shared_yaxes=True,
        subplot_titles=(model_l, model_r),
    )
    [fig.append_trace(t, row=1, col=1) for t in plot_one(model_l, 'left')]
    [fig.append_trace(t, row=1, col=2) for t in plot_one(model_r, 'right')]

    fig['layout']['xaxis1'].update(title='Distance (nm)')
    fig['layout']['yaxis1'].update(title='Pay per hour (v$/h)')
    fig['layout']['xaxis2'].update(title='Distance (nm)')
    fig['layout']['yaxis2'].update(title='Pay per hour (v$/h)')
    
    offline.iplot(fig)

Conclusions:

  1. Bigger planes are much more profitable, assuming we fill them fully (what a surprise!)
  2. Most profitable flight distance is 30-50 nm, depending on a plane and setup time. Peak for bigger and faster aircraft is slightly shifted to the right, but not as much as I expected.
  3. Our polymonial approximation does a very good job! Machine Learning!