# Read the match data from CSV; the first CSV column becomes the row index
import pandas as pd
matches = pd.read_csv("matches.csv", index_col=0)
# Preview the first rows of the dataframe
matches.head()
date | time | comp | round | day | venue | result | gf | ga | opponent | ... | match report | notes | sh | sot | dist | fk | pk | pkatt | season | team | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 2021-08-15 | 16:30 | Premier League | Matchweek 1 | Sun | Away | L | 0.0 | 1.0 | Tottenham | ... | Match Report | NaN | 18.0 | 4.0 | 16.9 | 1.0 | 0.0 | 0.0 | 2022 | Manchester City |
2 | 2021-08-21 | 15:00 | Premier League | Matchweek 2 | Sat | Home | W | 5.0 | 0.0 | Norwich City | ... | Match Report | NaN | 16.0 | 4.0 | 17.3 | 1.0 | 0.0 | 0.0 | 2022 | Manchester City |
3 | 2021-08-28 | 12:30 | Premier League | Matchweek 3 | Sat | Home | W | 5.0 | 0.0 | Arsenal | ... | Match Report | NaN | 25.0 | 10.0 | 14.3 | 0.0 | 0.0 | 0.0 | 2022 | Manchester City |
4 | 2021-09-11 | 15:00 | Premier League | Matchweek 4 | Sat | Away | W | 1.0 | 0.0 | Leicester City | ... | Match Report | NaN | 25.0 | 8.0 | 14.0 | 0.0 | 0.0 | 0.0 | 2022 | Manchester City |
6 | 2021-09-18 | 15:00 | Premier League | Matchweek 5 | Sat | Home | D | 0.0 | 0.0 | Southampton | ... | Match Report | NaN | 16.0 | 1.0 | 15.7 | 1.0 | 0.0 | 0.0 | 2022 | Manchester City |
5 rows × 27 columns
### (number of rows, number of columns) of the loaded dataframe ###
matches.shape
(1389, 27)
### number of rows (matches) recorded for each team in the dataframe ###
matches['team'].value_counts()
Newcastle United 72 West Ham United 72 Brighton and Hove Albion 72 Southampton 72 Manchester United 72 Crystal Palace 71 Tottenham Hotspur 71 Burnley 71 Manchester City 71 Arsenal 71 Wolverhampton Wanderers 71 Leeds United 71 Aston Villa 70 Leicester City 70 Everton 70 Chelsea 70 Sheffield United 38 West Bromwich Albion 38 Fulham 38 Liverpool 38 Brentford 34 Watford 33 Norwich City 33 Name: team, dtype: int64
### Show only the rows where the team is Liverpool ###
matches.loc[matches['team'] == 'Liverpool']
date | time | comp | round | day | venue | result | gf | ga | opponent | ... | match report | notes | sh | sot | dist | fk | pk | pkatt | season | team | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 2020-09-12 | 17:30 | Premier League | Matchweek 1 | Sat | Home | W | 4.0 | 3.0 | Leeds United | ... | Match Report | NaN | 20.0 | 4.0 | 17.0 | 0.0 | 2.0 | 2.0 | 2021 | Liverpool |
2 | 2020-09-20 | 16:30 | Premier League | Matchweek 2 | Sun | Away | W | 2.0 | 0.0 | Chelsea | ... | Match Report | NaN | 17.0 | 5.0 | 17.7 | 1.0 | 0.0 | 0.0 | 2021 | Liverpool |
4 | 2020-09-28 | 20:00 | Premier League | Matchweek 3 | Mon | Home | W | 3.0 | 1.0 | Arsenal | ... | Match Report | NaN | 21.0 | 9.0 | 16.8 | 0.0 | 0.0 | 0.0 | 2021 | Liverpool |
6 | 2020-10-04 | 19:15 | Premier League | Matchweek 4 | Sun | Away | L | 2.0 | 7.0 | Aston Villa | ... | Match Report | NaN | 14.0 | 8.0 | 15.8 | 1.0 | 0.0 | 0.0 | 2021 | Liverpool |
7 | 2020-10-17 | 12:30 | Premier League | Matchweek 5 | Sat | Away | D | 2.0 | 2.0 | Everton | ... | Match Report | NaN | 22.0 | 8.0 | 15.0 | 1.0 | 0.0 | 0.0 | 2021 | Liverpool |
9 | 2020-10-24 | 20:00 | Premier League | Matchweek 6 | Sat | Home | W | 2.0 | 1.0 | Sheffield Utd | ... | Match Report | NaN | 17.0 | 5.0 | 18.2 | 1.0 | 0.0 | 0.0 | 2021 | Liverpool |
11 | 2020-10-31 | 17:30 | Premier League | Matchweek 7 | Sat | Home | W | 2.0 | 1.0 | West Ham | ... | Match Report | NaN | 8.0 | 2.0 | 18.6 | 1.0 | 1.0 | 1.0 | 2021 | Liverpool |
13 | 2020-11-08 | 16:30 | Premier League | Matchweek 8 | Sun | Away | D | 1.0 | 1.0 | Manchester City | ... | Match Report | NaN | 9.0 | 2.0 | 21.5 | 0.0 | 1.0 | 1.0 | 2021 | Liverpool |
14 | 2020-11-22 | 19:15 | Premier League | Matchweek 9 | Sun | Home | W | 3.0 | 0.0 | Leicester City | ... | Match Report | NaN | 24.0 | 12.0 | 11.9 | 0.0 | 0.0 | 0.0 | 2021 | Liverpool |
16 | 2020-11-28 | 12:30 | Premier League | Matchweek 10 | Sat | Away | D | 1.0 | 1.0 | Brighton | ... | Match Report | NaN | 6.0 | 2.0 | 20.9 | 0.0 | 0.0 | 0.0 | 2021 | Liverpool |
18 | 2020-12-06 | 19:15 | Premier League | Matchweek 11 | Sun | Home | W | 4.0 | 0.0 | Wolves | ... | Match Report | NaN | 11.0 | 6.0 | 16.6 | 1.0 | 0.0 | 0.0 | 2021 | Liverpool |
20 | 2020-12-13 | 16:30 | Premier League | Matchweek 12 | Sun | Away | D | 1.0 | 1.0 | Fulham | ... | Match Report | NaN | 11.0 | 5.0 | 20.0 | 1.0 | 1.0 | 1.0 | 2021 | Liverpool |
21 | 2020-12-16 | 20:00 | Premier League | Matchweek 13 | Wed | Home | W | 2.0 | 1.0 | Tottenham | ... | Match Report | NaN | 17.0 | 11.0 | 15.5 | 0.0 | 0.0 | 0.0 | 2021 | Liverpool |
22 | 2020-12-19 | 12:30 | Premier League | Matchweek 14 | Sat | Away | W | 7.0 | 0.0 | Crystal Palace | ... | Match Report | NaN | 14.0 | 7.0 | 13.2 | 1.0 | 0.0 | 0.0 | 2021 | Liverpool |
23 | 2020-12-27 | 16:30 | Premier League | Matchweek 15 | Sun | Home | D | 1.0 | 1.0 | West Brom | ... | Match Report | NaN | 17.0 | 2.0 | 17.8 | 2.0 | 0.0 | 0.0 | 2021 | Liverpool |
24 | 2020-12-30 | 20:00 | Premier League | Matchweek 16 | Wed | Away | D | 0.0 | 0.0 | Newcastle Utd | ... | Match Report | NaN | 11.0 | 4.0 | 16.7 | 0.0 | 0.0 | 0.0 | 2021 | Liverpool |
25 | 2021-01-04 | 20:00 | Premier League | Matchweek 17 | Mon | Away | L | 0.0 | 1.0 | Southampton | ... | Match Report | NaN | 17.0 | 1.0 | 14.3 | 0.0 | 0.0 | 0.0 | 2021 | Liverpool |
27 | 2021-01-17 | 16:30 | Premier League | Matchweek 19 | Sun | Home | D | 0.0 | 0.0 | Manchester Utd | ... | Match Report | NaN | 17.0 | 3.0 | 17.1 | 0.0 | 0.0 | 0.0 | 2021 | Liverpool |
28 | 2021-01-21 | 20:00 | Premier League | Matchweek 18 | Thu | Home | L | 0.0 | 1.0 | Burnley | ... | Match Report | NaN | 27.0 | 6.0 | 17.3 | 0.0 | 0.0 | 0.0 | 2021 | Liverpool |
30 | 2021-01-28 | 20:00 | Premier League | Matchweek 20 | Thu | Away | W | 3.0 | 1.0 | Tottenham | ... | Match Report | NaN | 14.0 | 7.0 | 14.7 | 0.0 | 0.0 | 0.0 | 2021 | Liverpool |
31 | 2021-01-31 | 16:30 | Premier League | Matchweek 21 | Sun | Away | W | 3.0 | 1.0 | West Ham | ... | Match Report | NaN | 14.0 | 5.0 | 15.3 | 0.0 | 0.0 | 0.0 | 2021 | Liverpool |
32 | 2021-02-03 | 20:15 | Premier League | Matchweek 22 | Wed | Home | L | 0.0 | 1.0 | Brighton | ... | Match Report | NaN | 11.0 | 0.0 | 19.9 | 0.0 | 0.0 | 0.0 | 2021 | Liverpool |
33 | 2021-02-07 | 16:30 | Premier League | Matchweek 23 | Sun | Home | L | 1.0 | 4.0 | Manchester City | ... | Match Report | NaN | 7.0 | 2.0 | 17.9 | 1.0 | 1.0 | 1.0 | 2021 | Liverpool |
34 | 2021-02-13 | 12:30 | Premier League | Matchweek 24 | Sat | Away | L | 1.0 | 3.0 | Leicester City | ... | Match Report | NaN | 15.0 | 4.0 | 15.4 | 2.0 | 0.0 | 0.0 | 2021 | Liverpool |
36 | 2021-02-20 | 17:30 | Premier League | Matchweek 25 | Sat | Home | L | 0.0 | 2.0 | Everton | ... | Match Report | NaN | 16.0 | 6.0 | 15.9 | 1.0 | 0.0 | 0.0 | 2021 | Liverpool |
37 | 2021-02-28 | 19:15 | Premier League | Matchweek 26 | Sun | Away | W | 2.0 | 0.0 | Sheffield Utd | ... | Match Report | NaN | 15.0 | 8.0 | 14.2 | 0.0 | 0.0 | 0.0 | 2021 | Liverpool |
38 | 2021-03-04 | 20:15 | Premier League | Matchweek 29 | Thu | Home | L | 0.0 | 1.0 | Chelsea | ... | Match Report | NaN | 7.0 | 1.0 | 18.4 | 0.0 | 0.0 | 0.0 | 2021 | Liverpool |
39 | 2021-03-07 | 14:00 | Premier League | Matchweek 27 | Sun | Home | L | 0.0 | 1.0 | Fulham | ... | Match Report | NaN | 16.0 | 3.0 | 17.0 | 1.0 | 0.0 | 0.0 | 2021 | Liverpool |
41 | 2021-03-15 | 20:00 | Premier League | Matchweek 28 | Mon | Away | W | 1.0 | 0.0 | Wolves | ... | Match Report | NaN | 12.0 | 4.0 | 15.9 | 0.0 | 0.0 | 0.0 | 2021 | Liverpool |
42 | 2021-04-03 | 20:00 | Premier League | Matchweek 30 | Sat | Away | W | 3.0 | 0.0 | Arsenal | ... | Match Report | NaN | 16.0 | 7.0 | 17.0 | 0.0 | 0.0 | 0.0 | 2021 | Liverpool |
44 | 2021-04-10 | 15:00 | Premier League | Matchweek 31 | Sat | Home | W | 2.0 | 1.0 | Aston Villa | ... | Match Report | NaN | 23.0 | 8.0 | 16.7 | 1.0 | 0.0 | 0.0 | 2021 | Liverpool |
46 | 2021-04-19 | 20:00 | Premier League | Matchweek 32 | Mon | Away | D | 1.0 | 1.0 | Leeds United | ... | Match Report | NaN | 17.0 | 7.0 | 15.7 | 0.0 | 0.0 | 0.0 | 2021 | Liverpool |
47 | 2021-04-24 | 12:30 | Premier League | Matchweek 33 | Sat | Home | D | 1.0 | 1.0 | Newcastle Utd | ... | Match Report | NaN | 21.0 | 9.0 | 17.0 | 1.0 | 0.0 | 0.0 | 2021 | Liverpool |
48 | 2021-05-08 | 20:15 | Premier League | Matchweek 35 | Sat | Home | W | 2.0 | 0.0 | Southampton | ... | Match Report | NaN | 14.0 | 6.0 | 12.4 | 0.0 | 0.0 | 0.0 | 2021 | Liverpool |
49 | 2021-05-13 | 20:15 | Premier League | Matchweek 34 | Thu | Away | W | 4.0 | 2.0 | Manchester Utd | ... | Match Report | NaN | 17.0 | 8.0 | 14.9 | 0.0 | 0.0 | 0.0 | 2021 | Liverpool |
50 | 2021-05-16 | 16:30 | Premier League | Matchweek 36 | Sun | Away | W | 2.0 | 1.0 | West Brom | ... | Match Report | NaN | 26.0 | 6.0 | 16.9 | 1.0 | 0.0 | 0.0 | 2021 | Liverpool |
51 | 2021-05-19 | 20:15 | Premier League | Matchweek 37 | Wed | Away | W | 3.0 | 0.0 | Burnley | ... | Match Report | NaN | 20.0 | 3.0 | 15.5 | 1.0 | 0.0 | 0.0 | 2021 | Liverpool |
52 | 2021-05-23 | 16:00 | Premier League | Matchweek 38 | Sun | Home | W | 2.0 | 0.0 | Crystal Palace | ... | Match Report | NaN | 19.0 | 5.0 | 14.2 | 1.0 | 0.0 | 0.0 | 2021 | Liverpool |
38 rows × 27 columns
### count how many rows (one per team per match) fall in each matchweek ###
matches['round'].value_counts()
Matchweek 5 39 Matchweek 24 39 Matchweek 3 39 Matchweek 13 39 Matchweek 23 39 Matchweek 29 39 Matchweek 20 39 Matchweek 19 39 Matchweek 9 39 Matchweek 14 39 Matchweek 8 39 Matchweek 17 39 Matchweek 28 39 Matchweek 34 39 Matchweek 15 39 Matchweek 26 39 Matchweek 11 39 Matchweek 2 39 Matchweek 7 39 Matchweek 4 39 Matchweek 16 39 Matchweek 25 39 Matchweek 6 39 Matchweek 32 39 Matchweek 1 39 Matchweek 31 39 Matchweek 10 39 Matchweek 12 39 Matchweek 22 37 Matchweek 30 37 Matchweek 18 37 Matchweek 21 37 Matchweek 27 37 Matchweek 33 32 Matchweek 38 20 Matchweek 37 20 Matchweek 36 20 Matchweek 35 20 Name: round, dtype: int64
### inspect the dtype of every column (at this point 'date' is still a plain object column) ###
matches.dtypes
date object time object comp object round object day object venue object result object gf float64 ga float64 opponent object xg float64 xga float64 poss float64 attendance float64 captain object formation object referee object match report object notes float64 sh float64 sot float64 dist float64 fk float64 pk float64 pkatt float64 season int64 team object dtype: object
### parse the date strings into real datetime values ###
matches['date'] = pd.to_datetime(matches['date'])
### remove the 'comp' and 'notes' columns from the dataframe ###
del matches["comp"], matches["notes"]
### check the dtypes again after the conversion ###
matches.dtypes
date datetime64[ns] time object round object day object venue object result object gf float64 ga float64 opponent object xg float64 xga float64 poss float64 attendance float64 captain object formation object referee object match report object sh float64 sot float64 dist float64 fk float64 pk float64 pkatt float64 season int64 team object dtype: object
### encode the venue as an integer category code (in the output below: Away -> 0, Home -> 1) ###
matches['venue_code'] = matches['venue'].astype('category').cat.codes
matches
date | time | round | day | venue | result | gf | ga | opponent | xg | ... | match report | sh | sot | dist | fk | pk | pkatt | season | team | venue_code | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 2021-08-15 | 16:30 | Matchweek 1 | Sun | Away | L | 0.0 | 1.0 | Tottenham | 1.9 | ... | Match Report | 18.0 | 4.0 | 16.9 | 1.0 | 0.0 | 0.0 | 2022 | Manchester City | 0 |
2 | 2021-08-21 | 15:00 | Matchweek 2 | Sat | Home | W | 5.0 | 0.0 | Norwich City | 2.7 | ... | Match Report | 16.0 | 4.0 | 17.3 | 1.0 | 0.0 | 0.0 | 2022 | Manchester City | 1 |
3 | 2021-08-28 | 12:30 | Matchweek 3 | Sat | Home | W | 5.0 | 0.0 | Arsenal | 3.8 | ... | Match Report | 25.0 | 10.0 | 14.3 | 0.0 | 0.0 | 0.0 | 2022 | Manchester City | 1 |
4 | 2021-09-11 | 15:00 | Matchweek 4 | Sat | Away | W | 1.0 | 0.0 | Leicester City | 2.9 | ... | Match Report | 25.0 | 8.0 | 14.0 | 0.0 | 0.0 | 0.0 | 2022 | Manchester City | 0 |
6 | 2021-09-18 | 15:00 | Matchweek 5 | Sat | Home | D | 0.0 | 0.0 | Southampton | 1.1 | ... | Match Report | 16.0 | 1.0 | 15.7 | 1.0 | 0.0 | 0.0 | 2022 | Manchester City | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
38 | 2021-05-02 | 19:15 | Matchweek 34 | Sun | Away | L | 0.0 | 4.0 | Tottenham | 0.5 | ... | Match Report | 8.0 | 1.0 | 17.4 | 0.0 | 0.0 | 0.0 | 2021 | Sheffield United | 0 |
39 | 2021-05-08 | 15:00 | Matchweek 35 | Sat | Home | L | 0.0 | 2.0 | Crystal Palace | 0.7 | ... | Match Report | 7.0 | 0.0 | 11.4 | 1.0 | 0.0 | 0.0 | 2021 | Sheffield United | 1 |
40 | 2021-05-16 | 19:00 | Matchweek 36 | Sun | Away | W | 1.0 | 0.0 | Everton | 1.6 | ... | Match Report | 10.0 | 3.0 | 17.0 | 0.0 | 0.0 | 0.0 | 2021 | Sheffield United | 0 |
41 | 2021-05-19 | 18:00 | Matchweek 37 | Wed | Away | L | 0.0 | 1.0 | Newcastle Utd | 0.8 | ... | Match Report | 11.0 | 1.0 | 16.0 | 1.0 | 0.0 | 0.0 | 2021 | Sheffield United | 0 |
42 | 2021-05-23 | 16:00 | Matchweek 38 | Sun | Home | W | 1.0 | 0.0 | Burnley | 0.6 | ... | Match Report | 12.0 | 3.0 | 17.0 | 0.0 | 0.0 | 0.0 | 2021 | Sheffield United | 1 |
1389 rows × 26 columns
### encode each opponent name as an integer category code ###
matches['opp_code'] = matches['opponent'].astype('category').cat.codes
matches
date | time | round | day | venue | result | gf | ga | opponent | xg | ... | sh | sot | dist | fk | pk | pkatt | season | team | venue_code | opp_code | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 2021-08-15 | 16:30 | Matchweek 1 | Sun | Away | L | 0.0 | 1.0 | Tottenham | 1.9 | ... | 18.0 | 4.0 | 16.9 | 1.0 | 0.0 | 0.0 | 2022 | Manchester City | 0 | 18 |
2 | 2021-08-21 | 15:00 | Matchweek 2 | Sat | Home | W | 5.0 | 0.0 | Norwich City | 2.7 | ... | 16.0 | 4.0 | 17.3 | 1.0 | 0.0 | 0.0 | 2022 | Manchester City | 1 | 15 |
3 | 2021-08-28 | 12:30 | Matchweek 3 | Sat | Home | W | 5.0 | 0.0 | Arsenal | 3.8 | ... | 25.0 | 10.0 | 14.3 | 0.0 | 0.0 | 0.0 | 2022 | Manchester City | 1 | 0 |
4 | 2021-09-11 | 15:00 | Matchweek 4 | Sat | Away | W | 1.0 | 0.0 | Leicester City | 2.9 | ... | 25.0 | 8.0 | 14.0 | 0.0 | 0.0 | 0.0 | 2022 | Manchester City | 0 | 10 |
6 | 2021-09-18 | 15:00 | Matchweek 5 | Sat | Home | D | 0.0 | 0.0 | Southampton | 1.1 | ... | 16.0 | 1.0 | 15.7 | 1.0 | 0.0 | 0.0 | 2022 | Manchester City | 1 | 17 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
38 | 2021-05-02 | 19:15 | Matchweek 34 | Sun | Away | L | 0.0 | 4.0 | Tottenham | 0.5 | ... | 8.0 | 1.0 | 17.4 | 0.0 | 0.0 | 0.0 | 2021 | Sheffield United | 0 | 18 |
39 | 2021-05-08 | 15:00 | Matchweek 35 | Sat | Home | L | 0.0 | 2.0 | Crystal Palace | 0.7 | ... | 7.0 | 0.0 | 11.4 | 1.0 | 0.0 | 0.0 | 2021 | Sheffield United | 1 | 6 |
40 | 2021-05-16 | 19:00 | Matchweek 36 | Sun | Away | W | 1.0 | 0.0 | Everton | 1.6 | ... | 10.0 | 3.0 | 17.0 | 0.0 | 0.0 | 0.0 | 2021 | Sheffield United | 0 | 7 |
41 | 2021-05-19 | 18:00 | Matchweek 37 | Wed | Away | L | 0.0 | 1.0 | Newcastle Utd | 0.8 | ... | 11.0 | 1.0 | 16.0 | 1.0 | 0.0 | 0.0 | 2021 | Sheffield United | 0 | 14 |
42 | 2021-05-23 | 16:00 | Matchweek 38 | Sun | Home | W | 1.0 | 0.0 | Burnley | 0.6 | ... | 12.0 | 3.0 | 17.0 | 0.0 | 0.0 | 0.0 | 2021 | Sheffield United | 1 | 4 |
1389 rows × 27 columns
### keep only the hour of the kick-off time: the first ':' and everything after it is stripped ###
matches['hour'] = matches['time'].str.replace(pat=':.+', repl="", regex=True).astype(int)
### weekday number of the match date (in the output below: Mon = 0 ... Sun = 6) ###
matches["day_code"] = matches['date'].dt.dayofweek
### target is 1 when the result is a win ('W') and 0 for anything else (draw or loss) ###
matches['target'] = (matches['result'] == 'W').astype(int)
matches
date | time | round | day | venue | result | gf | ga | opponent | xg | ... | fk | pk | pkatt | season | team | venue_code | opp_code | hour | day_code | target | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 2021-08-15 | 16:30 | Matchweek 1 | Sun | Away | L | 0.0 | 1.0 | Tottenham | 1.9 | ... | 1.0 | 0.0 | 0.0 | 2022 | Manchester City | 0 | 18 | 16 | 6 | 0 |
2 | 2021-08-21 | 15:00 | Matchweek 2 | Sat | Home | W | 5.0 | 0.0 | Norwich City | 2.7 | ... | 1.0 | 0.0 | 0.0 | 2022 | Manchester City | 1 | 15 | 15 | 5 | 1 |
3 | 2021-08-28 | 12:30 | Matchweek 3 | Sat | Home | W | 5.0 | 0.0 | Arsenal | 3.8 | ... | 0.0 | 0.0 | 0.0 | 2022 | Manchester City | 1 | 0 | 12 | 5 | 1 |
4 | 2021-09-11 | 15:00 | Matchweek 4 | Sat | Away | W | 1.0 | 0.0 | Leicester City | 2.9 | ... | 0.0 | 0.0 | 0.0 | 2022 | Manchester City | 0 | 10 | 15 | 5 | 1 |
6 | 2021-09-18 | 15:00 | Matchweek 5 | Sat | Home | D | 0.0 | 0.0 | Southampton | 1.1 | ... | 1.0 | 0.0 | 0.0 | 2022 | Manchester City | 1 | 17 | 15 | 5 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
38 | 2021-05-02 | 19:15 | Matchweek 34 | Sun | Away | L | 0.0 | 4.0 | Tottenham | 0.5 | ... | 0.0 | 0.0 | 0.0 | 2021 | Sheffield United | 0 | 18 | 19 | 6 | 0 |
39 | 2021-05-08 | 15:00 | Matchweek 35 | Sat | Home | L | 0.0 | 2.0 | Crystal Palace | 0.7 | ... | 1.0 | 0.0 | 0.0 | 2021 | Sheffield United | 1 | 6 | 15 | 5 | 0 |
40 | 2021-05-16 | 19:00 | Matchweek 36 | Sun | Away | W | 1.0 | 0.0 | Everton | 1.6 | ... | 0.0 | 0.0 | 0.0 | 2021 | Sheffield United | 0 | 7 | 19 | 6 | 1 |
41 | 2021-05-19 | 18:00 | Matchweek 37 | Wed | Away | L | 0.0 | 1.0 | Newcastle Utd | 0.8 | ... | 1.0 | 0.0 | 0.0 | 2021 | Sheffield United | 0 | 14 | 18 | 2 | 0 |
42 | 2021-05-23 | 16:00 | Matchweek 38 | Sun | Home | W | 1.0 | 0.0 | Burnley | 0.6 | ... | 0.0 | 0.0 | 0.0 | 2021 | Sheffield United | 1 | 4 | 16 | 6 | 1 |
1389 rows × 30 columns
### importing Random Forest Classifier Model from sklearn for ML building ###
from sklearn.ensemble import RandomForestClassifier
### set up the random forest classifier
### n_estimators=50      -> the forest is built from 50 decision trees
### min_samples_split=10 -> a node needs at least 10 samples before it may be split further
### random_state=1       -> fixed seed, so re-running gives the same forest
rf = RandomForestClassifier(
    n_estimators=50,
    min_samples_split=10,
    random_state=1,
)
### time-based split: rows dated before 2022-01-01 train the model, rows dated after it test it ###
### NOTE(review): both comparisons are strict, so rows dated exactly 2022-01-01 end up in neither set -- confirm this is intended ###
train = matches.loc[matches['date'] < '2022-01-01']
test = matches.loc[matches['date'] > '2022-01-01']
### the feature columns engineered above that the model is allowed to see ###
predictors = [
    'venue_code',
    'opp_code',
    'hour',
    'day_code',
]
### learn to predict 'target' from the predictor columns of the training rows ###
rf.fit(train[predictors], train['target'])
RandomForestClassifier(min_samples_split=10, n_estimators=50, random_state=1)
### predict the outcome of every row in the test set ###
preds = rf.predict(test[predictors])
### accuracy = share of test rows whose predicted label equals the real label ###
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_true=test['target'], y_pred=preds)
acc
### roughly 61% of the predictions were correct ###
0.6123188405797102
### put the real and the predicted labels side by side in one dataframe ###
combined = pd.DataFrame({'actual': test['target'], 'prediction': preds})
### cross-tabulate them: rows = what actually happened, columns = what was predicted ###
pd.crosstab(combined['actual'], combined['prediction'])
prediction | 0 | 1 |
---|---|---|
actual | ||
0 | 141 | 31 |
1 | 76 | 28 |
### precision: of the matches we predicted as wins, what share did the team actually win ###
from sklearn.metrics import precision_score
precision_score(y_true=test['target'], y_pred=preds)
### only about 47% of the predicted wins were real wins ###
0.4745762711864407
### group the rows by team so that each team's matches can be handled on their own ###
grouped_matches = matches.groupby(by='team')
### pull out Manchester City's matches and put them in chronological order ###
group = grouped_matches.get_group(name='Manchester City')
group = group.sort_values(by="date")
group
date | time | round | day | venue | result | gf | ga | opponent | xg | ... | fk | pk | pkatt | season | team | venue_code | opp_code | hour | day_code | target | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2020-09-21 | 20:15 | Matchweek 2 | Mon | Away | W | 3.0 | 1.0 | Wolves | 1.9 | ... | 2.0 | 1.0 | 1.0 | 2021 | Manchester City | 0 | 22 | 20 | 0 | 1 |
2 | 2020-09-27 | 16:30 | Matchweek 3 | Sun | Home | L | 2.0 | 5.0 | Leicester City | 0.9 | ... | 1.0 | 0.0 | 0.0 | 2021 | Manchester City | 1 | 10 | 16 | 6 | 0 |
4 | 2020-10-03 | 17:30 | Matchweek 4 | Sat | Away | D | 1.0 | 1.0 | Leeds United | 1.5 | ... | 1.0 | 0.0 | 0.0 | 2021 | Manchester City | 0 | 9 | 17 | 5 | 0 |
5 | 2020-10-17 | 17:30 | Matchweek 5 | Sat | Home | W | 1.0 | 0.0 | Arsenal | 1.5 | ... | 0.0 | 0.0 | 0.0 | 2021 | Manchester City | 1 | 0 | 17 | 5 | 1 |
7 | 2020-10-24 | 12:30 | Matchweek 6 | Sat | Away | D | 1.0 | 1.0 | West Ham | 1.1 | ... | 1.0 | 0.0 | 0.0 | 2021 | Manchester City | 0 | 21 | 12 | 5 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
42 | 2022-03-14 | 20:00 | Matchweek 29 | Mon | Away | D | 0.0 | 0.0 | Crystal Palace | 2.3 | ... | 0.0 | 0.0 | 0.0 | 2022 | Manchester City | 0 | 6 | 20 | 0 | 0 |
44 | 2022-04-02 | 15:00 | Matchweek 31 | Sat | Away | W | 2.0 | 0.0 | Burnley | 1.8 | ... | 0.0 | 0.0 | 0.0 | 2022 | Manchester City | 0 | 4 | 15 | 5 | 1 |
46 | 2022-04-10 | 16:30 | Matchweek 32 | Sun | Home | D | 2.0 | 2.0 | Liverpool | 2.0 | ... | 1.0 | 0.0 | 0.0 | 2022 | Manchester City | 1 | 11 | 16 | 6 | 0 |
49 | 2022-04-20 | 20:00 | Matchweek 30 | Wed | Home | W | 3.0 | 0.0 | Brighton | 1.2 | ... | 0.0 | 0.0 | 0.0 | 2022 | Manchester City | 1 | 3 | 20 | 2 | 1 |
50 | 2022-04-23 | 15:00 | Matchweek 34 | Sat | Home | W | 5.0 | 1.0 | Watford | 3.0 | ... | 0.0 | 1.0 | 1.0 | 2022 | Manchester City | 1 | 19 | 15 | 5 | 1 |
71 rows × 30 columns
def rolling_averages(group, cols, new_cols):
    """Add rolling means of the three previous rows to one team's matches.

    The rows are ordered by ``date`` and, for every column in ``cols``, the
    mean of the three rows *before* the current row is stored under the
    matching name in ``new_cols``. The current row is left out of its own
    average, so a value never uses the match it belongs to.

    Args:
        group: DataFrame with a ``date`` column and every column in ``cols``.
        cols: names of the numeric columns to average.
        new_cols: names for the resulting columns, in the same order as
            ``cols``.

    Returns:
        A new DataFrame sorted by ``date`` with the ``new_cols`` added. Rows
        whose rolling values are NaN (fewer than three earlier rows, or a
        missing input inside the window) are dropped. The DataFrame passed
        in is not modified.
    """
    # sort_values returns a new frame, so the caller's object stays untouched
    group = group.sort_values("date")
    # shift(1) moves every value down one row, so a plain 3-row window ends at
    # the previous match. This gives the same result as
    # rolling(3, closed='left') but also works on pandas versions that raise
    # "closed only implemented for datetimelike and offset based windows".
    rolling_stats = group[cols].shift(1).rolling(3).mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group
### match statistics that get a rolling average ###
cols = [
    "gf",
    "ga",
    "sh",
    "sot",
    "dist",
    "fk",
    "pk",
    "pkatt",
]
### the new column names are the originals with a "_rolling" suffix ###
new_cols = [f"{c}_rolling" for c in cols]
new_cols
['gf_rolling', 'ga_rolling', 'sh_rolling', 'sot_rolling', 'dist_rolling', 'fk_rolling', 'pk_rolling', 'pkatt_rolling']
### compute the rolling averages for the single-team dataframe held in `group` ###
rolling_averages(group, cols, new_cols)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-35-f04f66ef61c1> in <module> ----> 1 rolling_averages(group, cols, new_cols) <ipython-input-32-55dbec3105a9> in rolling_averages(group, cols, new_cols) 6 def rolling_averages(group, cols, new_cols): 7 group = group.sort_values("date") ----> 8 rolling_stats = group[cols].rolling(3, closed='left').mean() 9 group[new_cols] = rolling_stats 10 group = group.dropna(subset=new_cols) /dataquest/system/env/python3/lib/python3.8/site-packages/pandas/core/generic.py in rolling(self, window, min_periods, center, win_type, on, axis, closed) 10374 ) 10375 > 10376 return Rolling( 10377 self, 10378 window=window, /dataquest/system/env/python3/lib/python3.8/site-packages/pandas/core/window/rolling.py in __init__(self, obj, window, min_periods, center, win_type, axis, on, closed, **kwargs) 92 self.win_freq = None 93 self.axis = obj._get_axis_number(axis) if axis is not None else None ---> 94 self.validate() 95 self._numba_func_cache: Dict[Optional[str], Callable] = dict() 96 /dataquest/system/env/python3/lib/python3.8/site-packages/pandas/core/window/rolling.py in validate(self) 1865 1866 if not self.is_datetimelike and self.closed is not None: -> 1867 raise ValueError( 1868 "closed only implemented for datetimelike and offset based windows" 1869 ) ValueError: closed only implemented for datetimelike and offset based windows