In [1]:
%pylab inline
Populating the interactive namespace from numpy and matplotlib
In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 30)

idx = pd.IndexSlice
In [4]:
data_file = "../data/input-cleaned/combined.tsv"
df = pd.read_csv(data_file, sep="\t")

df['game_id'] = range(1, len(df) + 1)

for col in ['home_scorers', 'away_scorers']:
    df[col] = df[col].fillna('NULL')

print df.shape
print df.iloc[0]
(4940, 10)
season                                               2001-2002
date                                                2002-06-11
home_team                                     Blackburn Rovers
home_score                                                   3
home_scorers    Andy Cole,53,g;Damien Duff,66,g;Andy Cole,81,g
away_team                                               Fulham
away_score                                                   0
away_scorers                                              NULL
venue                                               Ewood Park
game_id                                                      1
Name: 0, dtype: object
In [5]:
# add column denoting who won
def get_result(row):
    h, a = row.home_score, row.away_score
    return 'draw' if h == a else 'home' if h > a else 'away'

df['result'] = df.apply(get_result, axis=1)

# result counts
results = pd.DataFrame()
results['games'] = df.groupby('result').size().order(ascending=False)
results['freq'] = (results.games / sum(results.games)).astype(float)

print results
        games      freq
result                 
home     2286  0.462753
away     1374  0.278138
draw     1280  0.259109
In [7]:
# transpose df so that there is only 1 team column per row
dfs = {}
for side in ['home', 'away']:    
    df_side = df.copy()
    
    new_names = {}
    for col in ['team', 'score', 'scorers', 'g1']:
        new_names['%s_%s' % (side, col)] = col
    df_side.rename(columns=new_names, inplace=True)

    df_side['side'] = side
    df_side['result'] = df_side['result'].apply(lambda x: 
        'draw' if x == 'draw' else 'win' if x == side else 'loss')

    cols_to_keep = ['game_id', 'date', 'season', 'team', 'score', 'scorers', 'side', 'result']
    df_side = df_side[cols_to_keep]
    
    dfs[side] = df_side

dg = dfs['home'].append(dfs['away'])

# check that it worked
print len(dg)
print len(dg.game_id.unique())

dg[dg.game_id == 1]
9880
4940
Out[7]:
game_id date season team score scorers side result
0 1 2002-06-11 2001-2002 Blackburn Rovers 3 Andy Cole,53,g;Damien Duff,66,g;Andy Cole,81,g home win
0 1 2002-06-11 2001-2002 Fulham 0 NULL away loss
In [8]:
# goals scored in a game by team
print dg.groupby(['team', 'score']).size().unstack().fillna(0).head(10)
score                0    1    2   3   4   5  6  7  8  9
team                                                    
Arsenal             63  150  128  86  43  13  7  4  0  0
Aston Villa        151  175  105  40  16   5  2  0  0  0
Birmingham          60   49   30  10   2   1  0  0  0  0
Birmingham City     32   52   23   5   2   0  0  0  0  0
Blackburn Rovers   126  143   94  39  13   2  0  1  0  0
Blackpool            8   11   15   2   2   0  0  0  0  0
Bolton Wanderers   132  147   92  27  17   3  0  0  0  0
Burnley             13   15    5   3   2   0  0  0  0  0
Cardiff City        19   10    5   4   0   0  0  0  0  0
Charlton Athletic   77   81   47  16   7   0  0  0  0  0
In [9]:
# new df around indiv goals
data = []
for i, r in dg.iterrows():
    if r.scorers != 'NULL':
        goals = r.scorers.split(';')
        for goal in goals:
            goal_info = goal.split(',')
            other_info = [r.game_id, r.season, r.team, r.side, r.result, r.score]
            combined = other_info + goal_info
            data.append(combined)

cols = ['game_id', 'season', 'team', 'side', 'result', 'tot_goals', 'player', 'time', 'type']
goals = pd.DataFrame(data, columns=cols)

goals['time'] = goals['time'].astype(int)

print len(goals)
print goals.head()
13001
   game_id     season              team  side result  tot_goals         player  time type
0        1  2001-2002  Blackburn Rovers  home    win          3      Andy Cole    53    g
1        1  2001-2002  Blackburn Rovers  home    win          3    Damien Duff    66    g
2        1  2001-2002  Blackburn Rovers  home    win          3      Andy Cole    81    g
3        2  2001-2002    Leicester City  home    win          2    Paul Dickov    60    g
4        2  2001-2002    Leicester City  home    win          2  Matthew Piper    71    g
In [10]:
goals.time.describe()
Out[10]:
count    13001.000000
mean        50.486501
std         26.406922
min          0.000000
25%         29.000000
50%         51.000000
75%         74.000000
max         95.000000
Name: time, dtype: float64
In [11]:
# goals scored by 15 min time frames
# 30-45 + 75-90 includes stoppage time before half and end of game
def time_norm(t, bin=5):
    t_start = max(min(t, 90) - 1, 0) / bin * bin
    return '%d-%d' % (t_start, t_start + bin)

goals['time_norm'] = goals['time'].apply(lambda x: time_norm(x, 15))

goals.groupby('time_norm').size().plot(kind='barh')
Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0x10fdc53d0>
In [29]:
# transpose goals df so that index is game and time
goals = goals.sort(['game_id', 'time'], ascending=[1, 1])

# 1/0 for home/away goals at each score times
score_by_time = goals.groupby(['game_id', 'time', 'side']).size().unstack().fillna(0)

# cumulative goals for home/away by game
score_by_time = score_by_time.groupby(level=[0]).cumsum()

print score_by_time.head(10)
side          away  home
game_id time            
1       53       0     1
        66       0     2
        81       0     3
2       54       1     0
        60       1     1
        71       1     2
3       13       0     1
        35       0     2
        46       0     3
        57       0     4
In [24]:
# check a specific game to make sure things look good
print score_by_time.loc[idx[[10],:], :]
side          away  home
game_id time            
10      4        0     1
        20       1     1
        31       2     1
        33       2     2
        72       2     3
        83       2     4
        89       3     4
In [14]:
# densify so that all times are in index for each game
home = score_by_time.unstack()['home'].fillna(method='ffill', axis=1).fillna(0).stack()
away = score_by_time.unstack()['away'].fillna(method='ffill', axis=1).fillna(0).stack()

score_by_time_dense = pd.DataFrame()

score_by_time_dense['home'] = home
score_by_time_dense['away'] = away

# add goal diff column
score_by_time_dense['diff'] = score_by_time_dense['home'] - score_by_time_dense['away']

# check that it worked
print score_by_time_dense.loc[idx[[10], :], :].tail(10)
              home  away  diff
game_id time                  
10      85       4     2     2
        86       4     2     2
        87       4     2     2
        88       4     2     2
        89       4     3     1
        90       4     3     1
        91       4     3     1
        93       4     3     1
        94       4     3     1
        95       4     3     1
In [15]:
# add final result to this df
score_by_time_dense2 = pd.merge(score_by_time_dense.reset_index(), df[['game_id', 'result']], on='game_id')

# tally results by time and goal diff
outcomes = score_by_time_dense2.groupby(['time', 'diff', 'result']).size()
outcomes = outcomes.unstack().unstack().fillna(0).stack().loc[idx[:, -3:3], :]

# turn counts in percentages
outcomes["tot_games"] = outcomes.sum(axis=1)
outcomes.loc[:, "away":"home"] = outcomes.loc[:, "away":"home"].div(outcomes["tot_games"], axis=0)

print outcomes.head(15)
result         away      draw      home  tot_games
time diff                                         
0    -3         NaN       NaN       NaN          0
     -2         NaN       NaN       NaN          0
     -1    0.000000  0.000000  1.000000          2
      0    0.303125  0.192188  0.504687       4480
      1    1.000000  0.000000  0.000000          1
      2         NaN       NaN       NaN          0
      3         NaN       NaN       NaN          0
1    -3         NaN       NaN       NaN          0
     -2         NaN       NaN       NaN          0
     -1    0.500000  0.250000  0.250000         28
      0    0.303030  0.191542  0.505427       4422
      1    0.151515  0.212121  0.636364         33
      2         NaN       NaN       NaN          0
      3         NaN       NaN       NaN          0
2    -3         NaN       NaN       NaN          0
In [16]:
# pivot data so index is only time
outcomes2 = outcomes.stack().unstack(level=1).unstack(level=1)

outcomes2.head()
Out[16]:
diff -3 -2 -1 0 1 2 3
result away draw home tot_games away draw home tot_games away draw home tot_games away draw home tot_games away draw home tot_games away draw home tot_games away draw home tot_games
time
0 NaN NaN NaN 0 NaN NaN NaN 0 0.000000 0.000000 1.000000 2 0.303125 0.192188 0.504687 4480 1.000000 0.000000 0.000000 1 NaN NaN NaN 0 NaN NaN NaN 0
1 NaN NaN NaN 0 NaN NaN NaN 0 0.500000 0.250000 0.250000 28 0.303030 0.191542 0.505427 4422 0.151515 0.212121 0.636364 33 NaN NaN NaN 0 NaN NaN NaN 0
2 NaN NaN NaN 0 NaN NaN NaN 0 0.538462 0.200000 0.261538 65 0.303023 0.192707 0.504270 4333 0.129412 0.152941 0.717647 85 NaN NaN NaN 0 NaN NaN NaN 0
3 NaN NaN NaN 0 0.5 0.5 0 2 0.486957 0.165217 0.347826 115 0.304595 0.193273 0.502132 4222 0.111888 0.174825 0.713287 143 0 0.000000 1.000000 1 NaN NaN NaN 0
4 NaN NaN NaN 0 0.5 0.5 0 2 0.469880 0.180723 0.349398 166 0.305151 0.194121 0.500729 4116 0.124352 0.150259 0.725389 193 0 0.333333 0.666667 6 NaN NaN NaN 0
In [17]:
outcomes2[2].ix[:, 'away':'home'].plot()
Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x1111c8490>
In [18]:
# add smoothed series for each outcome (using local regression) 
outcomes3 = outcomes2.copy()

for col in outcomes3.columns:
    if col[1] in ['away', 'draw', 'home']:    
        s = outcomes3[col]
        x, y = s.index.values, s.values

        y_smooth = sm.nonparametric.lowess(y, x, frac=0.2, return_sorted=False)
        outcomes3[col[0], '%s_smooth' % col[1]] = y_smooth

outcomes3 = outcomes3.sort_index(axis=1)

print outcomes3[0].head()
result      away  away_smooth      draw  draw_smooth      home  home_smooth  tot_games
time                                                                                  
0       0.303125     0.303486  0.192188     0.192624  0.504687     0.503849       4480
1       0.303030     0.303999  0.191542     0.192699  0.505427     0.503266       4422
2       0.303023     0.304492  0.192707     0.192767  0.504270     0.502711       4333
3       0.304595     0.304962  0.193273     0.192828  0.502132     0.502185       4222
4       0.305151     0.305411  0.194121     0.192886  0.500729     0.501684       4116
In [23]:
# check that things seem reasonable
outcomes3[0].ix[:, 'away':'home_smooth'].plot()
Out[23]:
<matplotlib.axes._subplots.AxesSubplot at 0x117a95ed0>
In [33]:
# output final dataset
outcomes3.to_csv('../ipy-data/outcome-probs.csv', index=True, tupleize_cols=True)