#!/usr/bin/env python
# coding: utf-8
#
# # StatsBomb 360 Data Parsing
# ##### Notebook to parse and engineer the JSON data from the [StatsBomb Open Data GitHub repository](https://github.com/statsbomb/open-data) using [pandas](http://pandas.pydata.org/).
#
# ### By [Edd Webster](https://www.twitter.com/eddwebster)
# Notebook first written: 29/10/2021
# Notebook last updated: 05/12/2021
#
# ![StatsBomb](../../img/logos/stats-bomb-logo.png)
#
# ![StatsBomb 360](../../img/logos/stats-bomb-360-logo.png)
#
# Click [here](#section3) to jump straight to the Data Sources section and skip the [Notebook Brief](#section2) section.
# ___
#
#
# ## Introduction
# This notebook parses publicly available [StatsBomb](https://statsbomb.com/) Event data, using [pandas](http://pandas.pydata.org/) for data manipulation through DataFrames.
#
# For more information about this notebook and the author, I'm available through all the following channels:
# * [eddwebster.com](https://www.eddwebster.com/);
# * edd.j.webster@gmail.com;
# * [@eddwebster](https://www.twitter.com/eddwebster);
# * [linkedin.com/in/eddwebster](https://www.linkedin.com/in/eddwebster/);
# * [github/eddwebster](https://github.com/eddwebster/); and
# * [public.tableau.com/profile/edd.webster](https://public.tableau.com/profile/edd.webster).
#
# ![Edd Webster](../../img/edd_webster/fifa21eddwebsterbanner.png)
#
# The accompanying GitHub repository for this notebook can be found [here](https://github.com/eddwebster/football_analytics) and a static version of this notebook can be found [here](https://nbviewer.org/github/eddwebster/football_analytics/blob/master/notebooks/2_data_parsing/Parma%20Calcio%201913%20-%20StatsBomb%20Data%20Parsing%20and%20Engineering.ipynb).
# ___
#
# ## Notebook Contents
# 1. [Notebook Dependencies](#section1)
# 2. [Notebook Brief](#section2)
# 3. [Data Sources](#section3)
#   1. [Introduction](#section3.1)
#   2. [Downloading StatsBomb Data](#section3.2)
#   3. [Reading In and Parsing the JSON Data](#section3.3)
#   4. [Join the Datasets](#section3.4)
#   5. [Initial Data Handling](#section3.5)
# 4. [Summary](#section4)
# 5. [Next Steps](#section5)
# 6. [References](#section6)
# ___
#
#
#
# ## 1. Notebook Dependencies
#
# This notebook was written using [Python 3](https://docs.python.org/3.7/) and requires the following core libraries:
# * [`Jupyter notebooks`](https://jupyter.org/) for the environment in which this project is presented;
# * [`NumPy`](http://www.numpy.org/) for multidimensional array computing; and
# * [`pandas`](http://pandas.pydata.org/) for data analysis and manipulation.
#
# Most of the packages used in this notebook come with the [Conda](https://anaconda.org/anaconda/conda) distribution, available on all platforms (Windows, Linux and Mac OSX); the remainder (e.g. `pandas-profiling` and `missingno`) can be installed with `pip`. Step-by-step guides on how to install Anaconda can be found for Windows [here](https://medium.com/@GalarnykMichael/install-python-on-windows-anaconda-c63c7c3d1444) and Mac [here](https://medium.com/@GalarnykMichael/install-python-on-mac-anaconda-ccd9f2014072), as well as in the Anaconda documentation itself [here](https://docs.anaconda.com/anaconda/install/).
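#
# Packages outside the standard Anaconda distribution can be installed with `pip`; a minimal sketch (package names assumed from the imports below; uncomment to run):
# In[ ]:
# Install the third-party packages not bundled with Anaconda
#get_ipython().system('pip install pandas-profiling missingno tqdm chardet')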
# ### Import Libraries and Modules
# In[1]:
# Python ≥3.5 (ideally)
import platform
import sys
assert sys.version_info >= (3, 5)
import csv
# Import Dependencies
get_ipython().run_line_magic('matplotlib', 'inline')
# Math Operations
import numpy as np
from math import pi
# Datetime
import datetime
from datetime import date
import time
# Data Preprocessing
import pandas as pd
import pandas_profiling as pp
import os
import re
import chardet
import random
from io import BytesIO
from pathlib import Path
# Reading Directories
import glob
# Working with JSON
import json
# Data Visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
# Progress Bar
from tqdm import tqdm
# Display in Jupyter
from IPython.display import Image, YouTubeVideo
from IPython.core.display import HTML
# Ignore Warnings
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")
print("Setup Complete")
# In[2]:
# Python / module versions used here for reference
print('Python: {}'.format(platform.python_version()))
print('NumPy: {}'.format(np.__version__))
print('pandas: {}'.format(pd.__version__))
print('matplotlib: {}'.format(mpl.__version__))
# ### Defined Variables
# In[3]:
# Define today's date
today = datetime.datetime.now().strftime('%d/%m/%Y').replace('/', '')
# ### Defined Filepaths
# In[4]:
# Set up initial paths to subfolders
base_dir = os.path.join('..', '..')
data_dir = os.path.join(base_dir, 'data')
data_dir_sb = os.path.join(base_dir, 'data', 'sb')
img_dir = os.path.join(base_dir, 'img')
fig_dir = os.path.join(base_dir, 'img', 'fig')
# ### Create Directory Structure
# In[5]:
# Make the directory structure for the raw data
for folder in ['combined', 'competitions', 'events', 'matches']:
    path = os.path.join(data_dir_sb, 'raw', folder)
    # makedirs also creates the parent 'raw' folder if it is missing
    os.makedirs(path, exist_ok=True)
# ### Custom Functions
# In[6]:
# Define custom functions used in the notebook
## Function to read JSON files that also handles the encoding of special characters e.g. accents in names of players and teams
def read_json_file(filename):
with open(filename, 'rb') as json_file:
return BytesIO(json_file.read()).getvalue().decode('unicode_escape')
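
## Example usage (a sketch; the path assumes the open-data repository has been cloned under data_dir_sb, as in Section 3.2):
#json_competitions = read_json_file(os.path.join(data_dir_sb, 'open-data', 'data', 'competitions.json'))
#df_competitions = pd.read_json(json_competitions)
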
## Function to flatten pandas DataFrames with nested JSON columns. Source: https://stackoverflow.com/questions/39899005/how-to-flatten-a-pandas-dataframe-with-some-columns-as-json
def flatten_nested_json_df(df):
df = df.reset_index()
print(f"original shape: {df.shape}")
print(f"original columns: {df.columns}")
# search for columns to explode/flatten
s = (df.applymap(type) == list).all()
list_columns = s[s].index.tolist()
s = (df.applymap(type) == dict).all()
dict_columns = s[s].index.tolist()
print(f"lists: {list_columns}, dicts: {dict_columns}")
while len(list_columns) > 0 or len(dict_columns) > 0:
new_columns = []
for col in dict_columns:
print(f"flattening: {col}")
# explode dictionaries horizontally, adding new columns
horiz_exploded = pd.json_normalize(df[col]).add_prefix(f'{col}.')
horiz_exploded.index = df.index
df = pd.concat([df, horiz_exploded], axis=1).drop(columns=[col])
new_columns.extend(horiz_exploded.columns) # inplace
for col in list_columns:
print(f"exploding: {col}")
# explode lists vertically, adding new columns
df = df.drop(columns=[col]).join(df[col].explode().to_frame())
new_columns.append(col)
        # check if there are still dict or list fields to flatten
s = (df[new_columns].applymap(type) == list).all()
list_columns = s[s].index.tolist()
s = (df[new_columns].applymap(type) == dict).all()
dict_columns = s[s].index.tolist()
print(f"lists: {list_columns}, dicts: {dict_columns}")
print(f"final shape: {df.shape}")
print(f"final columns: {df.columns}")
return df
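# To illustrate `flatten_nested_json_df`, a minimal example on hypothetical data (the nested shapes mimic the StatsBomb style; not real StatsBomb data):
# In[ ]:
# Toy DataFrame with one dict column and one list column
df_toy = pd.DataFrame({
    'id': [1, 2],
    'type': [{'id': 30, 'name': 'Pass'}, {'id': 16, 'name': 'Shot'}],
    'location': [[60.0, 40.0], [100.0, 30.0]],
})
# The dict column is flattened to 'type.id' / 'type.name'; the list column is exploded vertically
flatten_nested_json_df(df_toy)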
# ### Notebook Settings
# In[7]:
# Display all columns of displayed pandas DataFrames
pd.set_option('display.max_columns', None)
pd.options.mode.chained_assignment=None
# ---
#
#
#
# ## 2. Notebook Brief
# This Jupyter notebook is part of a series of notebooks to parse and engineer StatsBomb Event data.
#
# This particular notebook is the **StatsBomb Data Parsing** notebook for 360 data, which takes the raw JSON data downloaded from the StatsBomb Open Data GitHub Repository and converts it to event-level data, saved as a CSV file.
#
# Links to these notebooks in the [`football_analytics`](https://github.com/eddwebster/football_analytics) GitHub repository can be found at the following:
# * [Data Parsing](https://github.com/eddwebster/football_analytics/tree/master/notebooks/2_data_parsing)
#   + StatsBomb Data Parsing (this notebook, in the Data Parsing directory linked above)
# * [Data Engineering](https://github.com/eddwebster/football_analytics/tree/master/notebooks/3_data_engineering)
#   + StatsBomb Data Engineering (in the Data Engineering directory linked above)
#
# **Notebook Conventions**:
# * Variables that refer to a `DataFrame` object are prefixed with `df_`.
# * Variables that refer to a collection of `DataFrame` objects (e.g., a list, a set or a dict) are prefixed with `dfs_`.
# ---
#
#
#
# ## 3. Data Sources
# ### 3.1. Introduction
# #### 3.1.1. About StatsBomb
# [StatsBomb](https://statsbomb.com/) are a football analytics and data company.
#
# ![StatsBomb](../../img/logos/stats-bomb-logo.png)
#
# Before conducting our analysis, the data first needs to be imported and parsed as a DataFrame in this Data Sources section ([Section 3](#section3)); cleaning and feature engineering are handled in the accompanying Data Engineering notebook.
#
# We'll be using the [pandas](http://pandas.pydata.org/) library to import our data into this notebook as a DataFrame.
# #### 3.1.2. About the StatsBomb publicly available data
# The complete data set contains:
# - 7 competitions;
# - 879 matches;
# - 3,161,917 events; and
# - z players.
#
# The datasets we will be using are:
# - competitions;
# - matches;
# - events;
# - lineups; and
# - tactics.
#
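# For reference, these datasets map to the following files in the open-data repository (these are the paths read later in this notebook; the `three-sixty` folder holds the 360 freeze-frame data):
#
# ```
# data/competitions.json
# data/matches/{competition_id}/{season_id}.json
# data/events/{match_id}.json
# data/lineups/{match_id}.json
# data/three-sixty/{match_id}.json
# ```
#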
# ### 3.2. Downloading StatsBomb Data
# In[8]:
# ADD CODE HERE
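# One way to obtain the data (a minimal sketch, assuming the `git` CLI is available) is to clone the open-data repository into the `open-data` subfolder that the read cells below expect:
# In[ ]:
# Clone the StatsBomb open-data repository (a shallow clone keeps the download smaller)
import subprocess
open_data_dir = os.path.join(data_dir_sb, 'open-data')
if not os.path.exists(open_data_dir):
    subprocess.run(['git', 'clone', '--depth', '1',
                    'https://github.com/statsbomb/open-data.git', open_data_dir],
                   check=True)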
# ### 3.3. Reading In and Parsing the JSON Data
# The following cells read the `JSON` files into [pandas](https://pandas.pydata.org/) `DataFrame` objects, with some basic data engineering to flatten the nested data and select only the columns of interest, ensuring that the Jupyter notebook does not crash on a standard laptop.
# #### 3.3.1. Competitions
# ##### Data dictionary
# In[9]:
# ADD MARKDOWN TABLE OF DATA HERE
# ##### Read in JSON files
# In[10]:
# Show files in directory
print(glob.glob(os.path.join(data_dir_sb, 'raw', 'competitions/*')))
# In[13]:
# Read in the exported CSV file if it exists; if not, read in the JSON file
##
if not os.path.exists(os.path.join(data_dir_sb, 'raw', 'competitions', 'competitions_sb_360.csv')):
    json_competitions = read_json_file(os.path.join(data_dir_sb, 'open-data', 'data', 'competitions.json'))
    df_competitions_flat = pd.read_json(json_competitions)
##
else:
    df_competitions_flat = pd.read_csv(os.path.join(data_dir_sb, 'raw', 'competitions', 'competitions_sb_360.csv'))
# Display DataFrame
df_competitions_flat
# In[14]:
df_competitions_flat.shape
# ##### Identify the Competition of Interest
# For our analysis, we only want the matches from **UEFA Euro 2020** (`competition_id` 55, `season_id` 43), the competition for which StatsBomb has released 360 data.
# In[15]:
# Filter the DataFrame for UEFA Euro 2020 (competition_id 55, season_id 43)
df_competitions_flat = df_competitions_flat.loc[(df_competitions_flat['competition_id'] == 55) &
                                                (df_competitions_flat['season_id'] == 43)
                                               ]
# In[16]:
df_competitions_flat
# ##### Export DataFrame
# In[17]:
# Export DataFrame as a CSV file
##
if not os.path.exists(os.path.join(data_dir_sb, 'raw', 'competitions', 'competitions_sb_360.csv')):
df_competitions_flat.to_csv(os.path.join(data_dir_sb, 'raw', 'competitions', 'competitions_sb_360.csv'), index=None, header=True)
##
else:
pass
# #### 3.3.2. Matches
# ##### Data Dictionary
# In[18]:
# ADD MARKDOWN TABLE OF DATA HERE
# ##### Define competitions
# The following cell derives the list of competitions to include. The full open dataset covers seven different competitions (five domestic and two international); after the filter above, only UEFA Euro 2020 remains.
# In[19]:
# Define a list to select only the competitions of interest.
# Flatmap all Competition IDs to use all available competitions
lst_competitions = df_competitions_flat['competition_id'].unique().tolist()
"""
# Define list of competitions
lst_competitions = [2, # Premier League
11, # La Liga
16, # Champions League
#37, # FA Women's Super League
43, # FIFA World Cup
#49, # NWSL
#55, # UEFA Euro
#72, # Women's World Cup
]
"""
# Display list of competitions
lst_competitions
# In[20]:
# Display the number of competitions
len(lst_competitions)
# ##### Read in JSON files
# In[21]:
# Show files in directory
print(glob.glob(os.path.join(data_dir_sb, 'raw', 'matches/*')))
# Steps:
# * Loop through the match files for the selected competitions.
# * Take each separate JSON file representing the matches of one competition season. These files are located at matches/{competition_id}/{season_id}.json.
# * Read each JSON file as a pandas DataFrame.
# * Append the DataFrames to a list.
# * Finally, concatenate all the separate DataFrames into one DataFrame of matches.
# In[22]:
# Read in selected matches
## Read in the exported CSV file if it exists; if not, read in the JSON files
if not os.path.exists(os.path.join(data_dir_sb, 'raw', 'matches', 'matches_sb_360.csv')):
    ### Create empty list for DataFrames
    dfs_matches_all = []
    ### Loop through the selected competitions
    for competition in lst_competitions:
        #### List the season files for this competition
        lst_filepaths = glob.glob(os.path.join(data_dir_sb, 'open-data', 'data', 'matches', str(competition), '*'))
        #### Loop through the season files
        for filepath in lst_filepaths:
            try:
                ##### Import the StatsBomb JSON Match data
                with open(filepath) as f:
                    json_sb_match_data = json.load(f)
                ##### Flatten the JSON Match data and append the DataFrame to the list
                dfs_matches_all.append(pd.json_normalize(json_sb_match_data))
            ##### Skip files that are missing or fail to parse
            except (OSError, ValueError):
                pass
    ## Concatenate the DataFrames into one DataFrame of matches
    df_matches_flat = pd.concat(dfs_matches_all)
##
else:
    df_matches_flat = pd.read_csv(os.path.join(data_dir_sb, 'raw', 'matches', 'matches_sb_360.csv'))
## Display DataFrame
df_matches_flat.head()
# In[23]:
df_matches_flat.shape
# In[24]:
# Number of matches per competition and season
df_matches_flat.groupby(['competition.competition_name', 'season.season_name']).match_id.count()
# There are 51 matches in UEFA Euro 2020 available to parse.
# ##### Convert `match_id` column to list
# This list is used as the reference set of matches when parsing the Events, Lineups, and Tactics data.
# In[25]:
# Flatmap all Match IDs to use all available matches
lst_matches = df_matches_flat['match_id'].tolist()
# Display the number of matches
len(lst_matches)
# ##### Export DataFrame
# In[26]:
# Export DataFrame as a CSV file
##
if not os.path.exists(os.path.join(data_dir_sb, 'raw', 'matches', 'matches_sb_360.csv')):
df_matches_flat.to_csv(os.path.join(data_dir_sb, 'raw', 'matches', 'matches_sb_360.csv'), index=None, header=True)
##
else:
pass
# #### 3.3.3. Events
# ##### Data dictionary
# The [StatsBomb](https://statsbomb.com/) dataset has one hundred and fourteen features (columns) with the following definitions and data types:
#
# | Feature | Data type |
# |------|-----|
# | `id` | `object`
# | `index` | `object`
# | `period` | `object`
# | `timestamp` | `object`
# | `minute` | `object`
# | `second` | `object`
# | `possession` | `object`
# | `duration` | `object`
# | `type.id` | `object`
# | `type.name` | `object`
# | `possession_team.id` | `object`
# | `possession_team.name` | `object`
# | `play_pattern.id` | `object`
# | `play_pattern.name` | `object`
# | `team.id` | `object`
# | `team.name` | `object`
# | `tactics.formation` | `object`
# | `tactics.lineup` | `object`
# | `related_events` | `object`
# | `location` | `object`
# | `player.id` | `object`
# | `player.name` | `object`
# | `position.id` | `object`
# | `position.name` | `object`
# | `pass.recipient.id` | `object`
# | `pass.recipient.name` | `object`
# | `pass.length` | `object`
# | `pass.angle` | `object`
# | `pass.height.id` | `object`
# | `pass.height.name` | `object`
# | `pass.end_location` | `object`
# | `pass.type.id` | `object`
# | `pass.type.name` | `object`
# | `pass.body_part.id` | `object`
# | `pass.body_part.name` | `object`
# | `carry.end_location` | `object`
# | `under_pressure` | `object`
# | `duel.type.id` | `object`
# | `duel.type.name` | `object`
# | `out` | `object`
# | `miscontrol.aerial_won` | `object`
# | `pass.outcome.id` | `object`
# | `pass.outcome.name` | `object`
# | `ball_receipt.outcome.id` | `object`
# | `ball_receipt.outcome.name` | `object`
# | `pass.aerial_won` | `object`
# | `counterpress` | `object`
# | `off_camera` | `object`
# | `dribble.outcome.id` | `object`
# | `dribble.outcome.name` | `object`
# | `dribble.overrun` | `object`
# | `ball_recovery.offensive` | `object`
# | `shot.statsbomb_xg` | `object`
# | `shot.end_location` | `object`
# | `shot.outcome.id` | `object`
# | `shot.outcome.name` | `object`
# | `shot.type.id` | `object`
# | `shot.type.name` | `object`
# | `shot.body_part.id` | `object`
# | `shot.body_part.name` | `object`
# | `shot.technique.id` | `object`
# | `shot.technique.name` | `object`
# | `shot.freeze_frame` | `object`
# | `goalkeeper.end_location` | `object`
# | `goalkeeper.type.id` | `object`
# | `goalkeeper.type.name` | `object`
# | `goalkeeper.position.id` | `object`
# | `goalkeeper.position.name` | `object`
# | `pass.straight` | `object`
# | `pass.technique.id` | `object`
# | `pass.technique.name` | `object`
# | `clearance.head` | `object`
# | `clearance.body_part.id` | `object`
# | `clearance.body_part.name` | `object`
# | `pass.switch` | `object`
# | `duel.outcome.id` | `object`
# | `duel.outcome.name` | `object`
# | `foul_committed.advantage` | `object`
# | `foul_won.advantage` | `object`
# | `pass.cross` | `object`
# | `pass.assisted_shot_id` | `object`
# | `pass.shot_assist` | `object`
# | `shot.one_on_one` | `object`
# | `shot.key_pass_id` | `object`
# | `goalkeeper.body_part.id` | `object`
# | `goalkeeper.body_part.name` | `object`
# | `goalkeeper.technique.id` | `object`
# | `goalkeeper.technique.name` | `object`
# | `goalkeeper.outcome.id` | `object`
# | `goalkeeper.outcome.name` | `object`
# | `clearance.aerial_won` | `object`
# | `foul_committed.card.id` | `object`
# | `foul_committed.card.name` | `object`
# | `foul_won.defensive` | `object`
# | `clearance.right_foot` | `object`
# | `shot.first_time` | `object`
# | `pass.through_ball` | `object`
# | `interception.outcome.id` | `object`
# | `interception.outcome.name` | `object`
# | `clearance.left_foot` | `object`
# | `ball_recovery.recovery_failure` | `object`
# | `shot.aerial_won` | `object`
# | `pass.goal_assist` | `object`
# | `pass.cut_back` | `object`
# | `pass.deflected` | `object`
# | `clearance.other` | `object`
# | `pass.outswinging` | `object`
# | `substitution.outcome.id` | `object`
# | `substitution.outcome.name` | `object`
# | `substitution.replacement.id` | `object`
# | `substitution.replacement.name` | `object`
# | `block.deflection` | `object`
# | `block.offensive` | `object`
# | `injury_stoppage.in_chain` | `object`
#
# For a full list of definitions, see the official documentation [[link](https://statsbomb.com/stat-definitions/)].
# ##### Read in JSON files
# In[27]:
# Show files in directory
print(glob.glob(os.path.join(data_dir_sb, 'raw', 'events/*')))
# Steps:
# * Loop through the event files for the selected matches.
# * Take each separate JSON file representing the events of one match. These files are named {match_id}.json.
# * Read each JSON file as a pandas DataFrame.
# * Append the DataFrames to a list.
# * Finally, concatenate all the separate DataFrames into one DataFrame.
# In[28]:
# Read in the exported CSV file if it exists; if not, read in the JSON files
##
if not os.path.exists(os.path.join(data_dir_sb, 'raw', 'events', 'events_sb_360.csv')):
    ### Create empty list for DataFrames
    dfs_events = []
    ### Loop through the event files for the selected matches and append each DataFrame to the list
    for match_id in lst_matches:
        #### Import the StatsBomb JSON Event data for this match
        with open(os.path.join(data_dir_sb, 'open-data', 'data', 'events', str(match_id) + '.json')) as f:
            event = json.load(f)
        #### Flatten the JSON Event data and record the Match ID
        df_event_flat = pd.json_normalize(event)
        df_event_flat['match_id'] = match_id
        dfs_events.append(df_event_flat)
    ### Concatenate DataFrames to one DataFrame
    df_events = pd.concat(dfs_events)
    ### Flatten the nested columns
    df_events_flat = flatten_nested_json_df(df_events)
##
else:
    df_events_flat = pd.read_csv(os.path.join(data_dir_sb, 'raw', 'events', 'events_sb_360.csv'))
## Display DataFrame
df_events_flat.head()
# In[29]:
df_events_flat.shape
# ##### Export DataFrame
# In[30]:
# Export DataFrame as a CSV file
##
if not os.path.exists(os.path.join(data_dir_sb, 'raw', 'events', 'events_sb_360.csv')):
df_events_flat.to_csv(os.path.join(data_dir_sb, 'raw', 'events', 'events_sb_360.csv'), index=None, header=True)
##
else:
pass
# ### 3.4. Join the Datasets
# The final step of the data parsing is to join the `Matches` and `Competitions` DataFrames to the `Events` DataFrame. The `Events` data is the base DataFrame, to which we join the other tables via `match_id`, and then via `competition.competition_id` and `season.season_id`.
# In[31]:
# Read in exported CSV file if exists, if not, merge the individual DataFrames
if not os.path.exists(os.path.join(data_dir_sb, 'raw', 'combined', 'combined_sb_360.csv')):
# Join the Matches DataFrame to the Events DataFrame
df_events_matches = pd.merge(df_events_flat, df_matches_flat, left_on=['match_id'], right_on=['match_id'])
# Join the Competitions DataFrame to the Events-Matches DataFrame
df_events_matches_competitions = pd.merge(df_events_matches, df_competitions_flat, left_on=['competition.competition_id', 'season.season_id'], right_on=['competition_id', 'season_id'])
else:
df_events_matches_competitions = pd.read_csv(os.path.join(data_dir_sb, 'raw', 'combined', 'combined_sb_360.csv'))
# Display DataFrame
df_events_matches_competitions.head()
# In[32]:
print('No. rows in Events DataFrame BEFORE join to Matches and Competitions DataFrames: {}'.format(len(df_events_flat)))
print('No. rows in DataFrame AFTER join: {}\n'.format(len(df_events_matches_competitions)))
print('-'*10+'\n')
print('Difference in rows before and after join: {}\n'.format(len(df_events_matches_competitions) - len(df_events_flat)))
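#
# If the joins behave as expected (each event row matched to exactly one match and one competition row), the difference above should be zero. A minimal sketch of an optional sanity check (uncomment to enforce it):
# In[ ]:
# Fail fast if the joins dropped or duplicated event rows
#assert len(df_events_matches_competitions) == len(df_events_flat), 'Join changed the event row count'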
# ##### Export DataFrame
# In[33]:
# Export DataFrame as a CSV file
##
if not os.path.exists(os.path.join(data_dir_sb, 'raw', 'combined', 'combined_sb_360.csv')):
df_events_matches_competitions.to_csv(os.path.join(data_dir_sb, 'raw', 'combined', 'combined_sb_360.csv'), index=None, header=True)
##
else:
pass
# ### 3.5. Initial Data Handling
# Let's check the quality of the joined dataset.
# #### 3.5.1. Summary Report
# The initial step of the data handling and Exploratory Data Analysis (EDA) is to create a quick summary report of the dataset using the [pandas Profiling Report](https://github.com/pandas-profiling/pandas-profiling).
# In[ ]:
# Summary of the data using pandas Profiling Report
#pp.ProfileReport(df_events_matches_competitions)
# #### 3.5.2. Further Inspection
# The following commands provide a more bespoke summary of the dataset. Some of them cover content already included in the [pandas Profiling](https://github.com/pandas-profiling/pandas-profiling) report above, but use the standard [pandas](https://pandas.pydata.org/) functions and methods that most people will be more familiar with.
#
# First, check the quality of the dataset by looking at the first and last rows using the [head()](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.head.html) and [tail()](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.tail.html) methods.
# In[34]:
# Display the first five rows of the DataFrame, df_events_matches_competitions
df_events_matches_competitions.head()
# In[35]:
# Display the last five rows of the DataFrame, df_events_matches_competitions
df_events_matches_competitions.tail()
# In[36]:
# Print the shape of the DataFrame, df_events_matches_competitions
print(df_events_matches_competitions.shape)
# In[37]:
# Print the column names of the DataFrame, df_events_matches_competitions
print(df_events_matches_competitions.columns)
# In[38]:
# Data types of the features of the raw DataFrame, df_events_matches_competitions
df_events_matches_competitions.dtypes
# In[39]:
# Display the data types of all columns
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
print(df_events_matches_competitions.dtypes)
# Full details of these attributes and their data types can be found in the [Data Dictionary](#section3.3.3) above.
# In[40]:
# Counts of missing values
null_value_stats = df_events_matches_competitions.isnull().sum(axis=0)
null_value_stats[null_value_stats != 0]
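# The missing-value counts above can also be visualised; a minimal sketch using the `missingno` library imported earlier (a random sample keeps the plot responsive):
# In[ ]:
# Visualise the pattern of missing values across a random sample of rows
msno.matrix(df_events_matches_competitions.sample(1000))
plt.show()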
# ## 4. Summary
# This notebook parses JSON data from the [StatsBomb Open Data GitHub repository](https://github.com/statsbomb/open-data) using [pandas](http://pandas.pydata.org/).
# ## 5. Next Steps
# The next stage is to engineer this DataFrame.
# ## 6. References
# * [StatsBomb](https://statsbomb.com/) data
# * [StatsBomb Announce The Release Of Free StatsBomb 360 Data: Euro 2020 Available Now](https://statsbomb.com/2021/11/statsbomb-announce-the-release-of-free-statsbomb-360-data-euro-2020-available-now/)
# * [StatsBomb](https://github.com/statsbomb/open-data/tree/master/data) open data GitHub repository
# ---
#
# ***Visit my website [eddwebster.com](https://www.eddwebster.com) or my [GitHub Repository](https://github.com/eddwebster) for more projects. If you'd like to get in contact, my Twitter handle is [@eddwebster](http://www.twitter.com/eddwebster) and my email is: edd.j.webster@gmail.com.***
# [Back to the top](#top)