#!/usr/bin/env python
# coding: utf-8

# In[1]:


# Reading in the initial data

import pandas as pd

matches = pd.read_csv("matches.csv", index_col=0)


# In[2]:


### preview the first rows of the loaded data ###
matches.head()


# In[3]:


### (rows, columns) of the dataframe ###
matches.shape


# In[4]:


### how many rows each value of 'team' has in the dataframe ###
matches["team"].value_counts()


# In[5]:


### only the rows whose 'team' is Liverpool ###
matches.loc[matches["team"].eq("Liverpool")]


# In[6]:


### how many rows each value of 'round' has ###
matches["round"].value_counts()


# In[7]:


### column dtypes as loaded from the CSV ###
matches.dtypes


# In[8]:


### parse the 'date' column into proper datetime values ###
matches["date"] = pd.to_datetime(matches["date"])

### remove the 'comp' and 'notes' columns (removed in that order) ###
del matches["comp"], matches["notes"]


# In[9]:


### dtypes again, after the date conversion and column removal ###
matches.dtypes


# In[10]:


### encode 'venue' as integer category codes
### (original note: away = 0, home = 1 -- codes follow the sorted category values)
matches["venue_code"] = pd.Categorical(matches["venue"]).codes


# In[11]:


matches


# In[12]:


### encode each opponent as an integer category code ###
matches["opp_code"] = pd.Categorical(matches["opponent"]).codes


# In[13]:


matches


# In[14]:


### keep only the hour: strip the ':' and everything after it from 'time' ###
matches["hour"] = (
    matches["time"]
    .str.replace(':.+', "", regex=True)
    .astype("int")
)


# In[15]:


### day of the week as a number (pandas counts Monday as 0) ###
matches["day_code"] = matches["date"].dt.weekday


# In[16]:


### label to predict: compare 'result' with 'W' (boolean), then cast to an integer ###
matches["target"] = matches["result"].eq("W").astype("int")


# In[17]:


### 'target' is 1 for a win and 0 for any other result ###
matches


# In[18]:


### Random forest classifier from scikit-learn, used as the prediction model ###
from sklearn.ensemble import RandomForestClassifier


# In[19]:


### configure the classifier:
###   n_estimators = 50       -> train 50 decision trees
###   min_samples_split = 10  -> a node needs at least 10 samples before it may be split
###   random_state = 1        -> fixed seed, so every run builds the same forest

rf = RandomForestClassifier(
    n_estimators=50,
    min_samples_split=10,
    random_state=1,
)


# In[20]:


### Chronological split: matches before the cutoff date train the model,
### matches on or after the cutoff date are held out for testing.
### The test side uses >= so that matches played exactly on the cutoff date
### are not dropped from both sets (strict < and > on both sides excluded them).

split_date = '2022-01-01'

train = matches[matches['date'] < split_date]

test = matches[matches['date'] >= split_date]


# In[21]:


### feature columns fed to the model (all built in the cells above) ###

predictors = ["venue_code", "opp_code", "hour", "day_code"]


# In[22]:


### train the forest: predictor columns as inputs, 'target' as the label ###

rf.fit(X=train[predictors], y=train["target"])


# In[23]:


### predict the label for every row of the test set ###
preds = rf.predict(X=test[predictors])


# In[24]:


### accuracy = share of test rows whose predicted label equals the actual label ###
from sklearn.metrics import accuracy_score

acc = accuracy_score(y_true=test["target"], y_pred=preds)


# In[25]:


acc
### author's recorded result: roughly 61% of predictions were correct ###


# In[26]:


### put the actual labels and the predicted labels side by side in one dataframe ###

combined = pd.DataFrame({"actual": test["target"], "prediction": preds})


# In[27]:


### two-way table: actual outcome (rows) against predicted outcome (columns) ###

pd.crosstab(combined["actual"], combined["prediction"])


# In[28]:


### precision score
### (of the matches predicted as wins, what share were actually won)

from sklearn.metrics import precision_score

precision_score(y_true=test["target"], y_pred=preds)

### author's recorded result: when a win was predicted, the team won only 47% of the time ###


# In[29]:


### group the rows by team, so each team's matches can be pulled out on their own ###

grouped_matches = matches.groupby(by="team")


# In[30]:


### Manchester City's rows only, ordered by date ###
group = grouped_matches.get_group("Manchester City").sort_values(by="date")


# In[31]:


group


# In[32]:


def rolling_averages(group, cols, new_cols, closed="left", window=3):
    """Add rolling means of the given columns to a date-sorted copy of ``group``.

    The rows are ordered by the ``"date"`` column, a fixed-size rolling mean
    is computed over ``cols``, and the result is stored under ``new_cols``.
    Rows where any of the new columns is missing (for example, too few rows
    to fill a window) are dropped.

    Args:
        group: DataFrame containing a ``"date"`` column and every column
            named in ``cols``.
        cols: Names of the columns to average.
        new_cols: Names of the columns that receive the rolling means,
            in the same order as ``cols``.
        closed: Forwarded to ``DataFrame.rolling``. It is a parameter so that
            every option ("left", "right", "both", "neither") can be tried.
            The default "left" leaves the current row out of its own window,
            so each value is built from previous rows only.
        window: Number of rows in each rolling window. Defaults to 3, the
            value that was previously hard-coded.

    Returns:
        A new DataFrame sorted by date with ``new_cols`` added. The frame
        passed in is not modified, because ``sort_values`` returns a new object.
    """
    ordered = group.sort_values("date")
    # A DataFrame assigned to a list of new column names is matched by
    # position, which is why new_cols must follow the order of cols.
    ordered[new_cols] = ordered[cols].rolling(window, closed=closed).mean()
    return ordered.dropna(subset=new_cols)


# In[33]:


### columns to average, and the matching output names with '_rolling' appended ###
cols = ["gf", "ga", "sh", "sot", "dist", "fk", "pk", "pkatt"]
new_cols = [f"{c}_rolling" for c in cols]


# In[34]:


new_cols


# ### Experiments with 'closed' parameter

# In[35]:


### closed = "left" ###
rolling_averages(group, cols, new_cols, closed="left").loc[:, ["gf", "gf_rolling"]].head(5)


# In[36]:


### closed = "right" ###
rolling_averages(group, cols, new_cols, closed="right").loc[:, ["gf", "gf_rolling"]].head(5)


# In[37]:


### closed = "both" ###
rolling_averages(group, cols, new_cols, closed="both").loc[:, ["gf", "gf_rolling"]].head(5)


# In[38]:


### closed = "neither" ###
rolling_averages(group, cols, new_cols, closed="neither").loc[:, ["gf", "gf_rolling"]].head(5)


# In[39]:


### print the built-in help for Series.rolling, then show the pandas version in use ###
s = pd.Series(range(5))
help(s.rolling)

pd.__version__