%pylab inline
import pandas as pd
import numpy as np
import statsmodels.api as sm
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 30)
idx = pd.IndexSlice
data_file = "../data/input-cleaned/combined.tsv"
df = pd.read_csv(data_file, sep="\t")
df['game_id'] = range(1, len(df) + 1)
for col in ['home_scorers', 'away_scorers']:
df[col] = df[col].fillna('NULL')
print df.shape
print df.iloc[0]
# add column denoting who won
def get_result(row):
h, a = row.home_score, row.away_score
return 'draw' if h == a else 'home' if h > a else 'away'
df['result'] = df.apply(get_result, axis=1)
# result counts
results = pd.DataFrame()
results['games'] = df.groupby('result').size().order(ascending=False)
results['freq'] = (results.games / sum(results.games)).astype(float)
print results
# transpose df so that there is only 1 team column per row
dfs = {}
for side in ['home', 'away']:
df_side = df.copy()
new_names = {}
for col in ['team', 'score', 'scorers', 'g1']:
new_names['%s_%s' % (side, col)] = col
df_side.rename(columns=new_names, inplace=True)
df_side['side'] = side
df_side['result'] = df_side['result'].apply(lambda x:
'draw' if x == 'draw' else 'win' if x == side else 'loss')
cols_to_keep = ['game_id', 'date', 'season', 'team', 'score', 'scorers', 'side', 'result']
df_side = df_side[cols_to_keep]
dfs[side] = df_side
dg = dfs['home'].append(dfs['away'])
# check that it worked
print len(dg)
print len(dg.game_id.unique())
dg[dg.game_id == 1]
# goals scored in a game by team
print dg.groupby(['team', 'score']).size().unstack().fillna(0).head(10)
# new df around indiv goals
data = []
for i, r in dg.iterrows():
if r.scorers != 'NULL':
goals = r.scorers.split(';')
for goal in goals:
goal_info = goal.split(',')
other_info = [r.game_id, r.season, r.team, r.side, r.result, r.score]
combined = other_info + goal_info
data.append(combined)
cols = ['game_id', 'season', 'team', 'side', 'result', 'tot_goals', 'player', 'time', 'type']
goals = pd.DataFrame(data, columns=cols)
goals['time'] = goals['time'].astype(int)
print len(goals)
print goals.head()
goals.time.describe()
# goals scored by 15 min time frames
# 30-45 + 75-90 includes stoppage time before half and end of game
def time_norm(t, bin=5):
t_start = max(min(t, 90) - 1, 0) / bin * bin
return '%d-%d' % (t_start, t_start + bin)
goals['time_norm'] = goals['time'].apply(lambda x: time_norm(x, 15))
goals.groupby('time_norm').size().plot(kind='barh')
# transpose goals df so that index is game and time
goals = goals.sort(['game_id', 'time'], ascending=[1, 1])
# 1/0 for home/away goals at each score times
score_by_time = goals.groupby(['game_id', 'time', 'side']).size().unstack().fillna(0)
# cumulative goals for home/away by game
score_by_time = score_by_time.groupby(level=[0]).cumsum()
print score_by_time.head(10)
# check a specific game to make sure things look good
print score_by_time.loc[idx[[10],:], :]
# densify so that all times are in index for each game
home = score_by_time.unstack()['home'].fillna(method='ffill', axis=1).fillna(0).stack()
away = score_by_time.unstack()['away'].fillna(method='ffill', axis=1).fillna(0).stack()
score_by_time_dense = pd.DataFrame()
score_by_time_dense['home'] = home
score_by_time_dense['away'] = away
# add goal diff column
score_by_time_dense['diff'] = score_by_time_dense['home'] - score_by_time_dense['away']
# check that it worked
print score_by_time_dense.loc[idx[[10], :], :].tail(10)
# add final result to this df
score_by_time_dense2 = pd.merge(score_by_time_dense.reset_index(), df[['game_id', 'result']], on='game_id')
# tally results by time and goal diff
outcomes = score_by_time_dense2.groupby(['time', 'diff', 'result']).size()
outcomes = outcomes.unstack().unstack().fillna(0).stack().loc[idx[:, -3:3], :]
# turn counts in percentages
outcomes["tot_games"] = outcomes.sum(axis=1)
outcomes.loc[:, "away":"home"] = outcomes.loc[:, "away":"home"].div(outcomes["tot_games"], axis=0)
print outcomes.head(15)
# pivot data so index is only time
outcomes2 = outcomes.stack().unstack(level=1).unstack(level=1)
outcomes2.head()
outcomes2[2].ix[:, 'away':'home'].plot()
# add smoothed series for each outcome (using local regression)
outcomes3 = outcomes2.copy()
for col in outcomes3.columns:
if col[1] in ['away', 'draw', 'home']:
s = outcomes3[col]
x, y = s.index.values, s.values
y_smooth = sm.nonparametric.lowess(y, x, frac=0.2, return_sorted=False)
outcomes3[col[0], '%s_smooth' % col[1]] = y_smooth
outcomes3 = outcomes3.sort_index(axis=1)
print outcomes3[0].head()
# check that things seem reasonable
outcomes3[0].ix[:, 'away':'home_smooth'].plot()
# output final dataset
outcomes3.to_csv('../ipy-data/outcome-probs.csv', index=True, tupleize_cols=True)