# default_exp understat

#hide
from nbdev.showdoc import *

#export
import enum
import re
import json
import requests
import bs4


#export
def fetch_html(url: str) -> bs4.BeautifulSoup:
    """
    Fetch HTML and decode into a `bs4.BeautifulSoup` object

    The response body is decoded with the `unicode-escape` codec before
    parsing, so backslash escape sequences in the page source are turned
    into the characters they denote.

    Raises `requests.HTTPError` (via `raise_for_status`) on a 4xx/5xx
    response.
    """
    r = requests.get(url)
    r.raise_for_status()
    # NOTE(review): presumably understat embeds its JSON with escaped
    # characters, which is why `unicode-escape` is used here -- confirm.
    return bs4.BeautifulSoup(str(r.content, 'unicode-escape'), features='html.parser')


def extract_json(soup, json_var: str):
    """
    Extract a JSON variable from understat HTML.

    Looks for the first `<script>` tag whose text mentions `json_var`,
    expects it to start with `var <json_var> = JSON.parse('...')` and
    returns the decoded JSON payload.

    Raises `ValueError` if no script mentions `json_var` or if the script
    does not start with the expected assignment.
    """
    node = next(
        (s for s in soup.select('script') if s.string and json_var in s.string),
        None,
    )
    if node is None:
        raise ValueError(f'No <script> tag mentions {json_var!r}')

    # Clean string by collapsing newlines (\n), tabs (\t) and any other
    # whitespace runs into single spaces
    node_string = ' '.join(node.string.split())

    # Raw f-string so the regex escapes are not treated as string escapes;
    # the variable name is escaped so it is always matched literally.
    pattern = rf"var {re.escape(json_var)} = JSON\.parse\('(?P<json>.*?)'\)"
    match = re.match(pattern, node_string)
    if match is None:
        raise ValueError(
            f'Script mentioning {json_var!r} does not start with '
            f"\"var {json_var} = JSON.parse('...')\""
        )
    return json.loads(match.group('json'))


#export
# 'Competition' might be a better name, but let's stick with understat's terminology
class League(enum.Enum):
    """
    Understat leagues

    Each member's value is the league name as used in understat URLs.
    """
    EPL = 'EPL'
    LA_LIGA = 'La_Liga'
    SERIE_A = 'Serie_A'
    BUNDESLIGA = 'Bundesliga'
    LIGUE_1 = 'Ligue_1'
    RPL = 'RPL'


#export
class Understat:
    """
    Fetches understat data webpages
    """

    def __init__(self, base_url: str = 'https://understat.com'):
        self.base_url = base_url

    def matches(self, league: League, season: int):
        """
        Fetch match data for a given `league` and `season` (start year).

        Returns the decoded `datesData` variable from the league page.
        """
        league_url = f'{self.base_url}/league/{league.value}/{season}'
        soup = fetch_html(league_url)
        return extract_json(soup, 'datesData')

    def shots(self, match_id: int):
        """
        Fetch shot data for a given `match_id`.

        Returns the decoded `shotsData` variable from the match page.
        """
        match_url = f'{self.base_url}/match/{match_id}'
        soup = fetch_html(match_url)
        return extract_json(soup, 'shotsData')


understat = Understat()

# Take the match at index 17 of the 2019 EPL season
matches = understat.matches(League.EPL, 2019)[17]
matches

shots = understat.shots(11660)

# Take the home team's shot at index 5 (i.e. the 6th shot)
shots['h'][5]