#!/usr/bin/env python
# coding: utf-8
"""Notebook export: predict football match wins with a random forest.

Reads ``matches.csv``, derives simple numeric predictors, trains a
``RandomForestClassifier`` on matches before the split date, evaluates it on
the remaining matches, and then experiments with rolling-average features.
"""

# In[1]:

# Reading in the initial data
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score

matches = pd.read_csv("matches.csv", index_col=0)


# In[2]:

matches.head()


# In[3]:

matches.shape


# In[4]:

### number of matches each team has in the dataframe ###
matches['team'].value_counts()


# In[5]:

### look at just Liverpool's matches ###
matches[matches['team'] == 'Liverpool']


# In[6]:

### count the number of matches in each round ###
matches['round'].value_counts()


# In[7]:

matches.dtypes


# In[8]:

### convert the date column into a proper datetime dtype ###
matches['date'] = pd.to_datetime(matches['date'])
del matches["comp"]
del matches["notes"]


# In[9]:

matches.dtypes


# In[10]:

### encode venue as an integer category code ###
### (codes follow sorted category order; presumably Away -> 0, Home -> 1 -- confirm against the data) ###
matches['venue_code'] = matches['venue'].astype('category').cat.codes


# In[11]:

matches


# In[12]:

### create an integer code for each opponent ###
matches['opp_code'] = matches['opponent'].astype('category').cat.codes


# In[13]:

matches


# In[14]:

### strip ':' and everything after it from the time column, keeping the hour ###
matches['hour'] = matches['time'].str.replace(r':.+', "", regex=True).astype('int')


# In[15]:

### integer code for the day of the week ###
matches["day_code"] = matches['date'].dt.dayofweek


# In[16]:

### boolean "result is a win" converted to an integer (True -> 1, False -> 0) ###
matches['target'] = (matches['result'] == 'W').astype('int')


# In[17]:

### 'target' is 1 when result == 'W' and 0 for any other result ###
matches


# In[18]:

### RandomForestClassifier (imported at the top of the file) is the model used below ###


# In[19]:

### initialize the classifier ###
### n_estimators = 50      -> 50 decision trees in the forest
### min_samples_split = 10 -> a node needs at least 10 samples before it may be split
### random_state = 1       -> same results each time we fit the forest
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)


# In[20]:

### split into training data (before the split date) and testing data (on or after it) ###
### The test side uses >= so that matches played exactly on the split date are
### not silently dropped from both sets.
SPLIT_DATE = pd.Timestamp('2022-01-01')
train = matches[matches['date'] < SPLIT_DATE]
test = matches[matches['date'] >= SPLIT_DATE]


# In[21]:

### list of the predictor columns created above ###
predictors = ['venue_code', 'opp_code', 'hour', 'day_code']


# In[22]:

### fit the random forest on the training predictors to predict target ###
rf.fit(train[predictors], train['target'])


# In[23]:

### generate predictions for the test set ###
preds = rf.predict(test[predictors])


# In[24]:

### accuracy: what fraction of the predictions were correct ###
acc = accuracy_score(test['target'], preds)


# In[25]:

acc
### an earlier run reported about 61% accuracy; the figure may shift slightly
### now that split-date matches are part of the test set ###


# In[26]:

### dataframe combining actual values and predicted values ###
combined = pd.DataFrame(dict(actual=test['target'], prediction=preds))


# In[27]:

### two-way table of actual outcome vs. prediction ###
pd.crosstab(index=combined['actual'], columns=combined['prediction'])


# In[28]:

### precision: when we predicted a win, how often did the team actually win ###
precision_score(test['target'], preds)
### an earlier run reported that predicted wins were correct about 47% of the time ###


# In[29]:

### group the matches so there is one dataframe per team ###
grouped_matches = matches.groupby('team')


# In[30]:

### just the matches Manchester City played, in date order ###
group = grouped_matches.get_group('Manchester City').sort_values("date")


# In[31]:

group


# In[32]:

# Modified your code to include the variable 'closed', which is passed each of
# the available options in the experiments below.
def rolling_averages(group, cols, new_cols, closed="left", window=3):
    """Add rolling means of ``cols`` to ``group`` under the names ``new_cols``.

    The input is sorted by its ``"date"`` column first; ``sort_values`` returns
    a new frame, so the caller's ``group`` is not modified.

    Args:
        group: DataFrame holding one team's matches; must have a ``"date"``
            column and every column named in ``cols``.
        cols: Names of the columns to average.
        new_cols: Names for the rolling-average columns, one per entry of
            ``cols`` and in the same order.
        closed: Passed through to ``DataFrame.rolling``; one of ``"left"``,
            ``"right"``, ``"both"`` or ``"neither"``. ``"left"`` leaves the
            current row out of its own window, so each value is built from
            earlier matches only.
        window: Size of the rolling window in rows.

    Returns:
        A date-sorted copy of ``group`` with ``new_cols`` added, without the
        rows where any of ``new_cols`` is missing.
    """
    group = group.sort_values("date")
    rolling_stats = group[cols].rolling(window, closed=closed).mean()
    group[new_cols] = rolling_stats
    # Rows without a full window of history have NaN averages; drop them.
    group = group.dropna(subset=new_cols)
    return group


# In[33]:

cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]
new_cols = [f"{c}_rolling" for c in cols]  ### append "_rolling" to each column label


# In[34]:

new_cols


# ### Experiments with 'closed' parameter

# In[35]:

rolling_averages(group, cols, new_cols, closed="left")[["gf", "gf_rolling"]].head()


# In[36]:

rolling_averages(group, cols, new_cols, closed="right")[["gf", "gf_rolling"]].head()


# In[37]:

rolling_averages(group, cols, new_cols, closed="both")[["gf", "gf_rolling"]].head()


# In[38]:

rolling_averages(group, cols, new_cols, closed="neither")[["gf", "gf_rolling"]].head()


# In[39]:

s = pd.Series(range(5))
help(s.rolling)
pd.__version__