#!/usr/bin/env python # coding: utf-8 # In[2]: import json import boto3 from snowflake.snowpark.session import Session # In[3]: def get_secret(secret_name, region_name="us-east-1"): session = boto3.session.Session() client = session.client( service_name='secretsmanager', region_name=region_name) get_secret_value_response = client.get_secret_value(SecretId=secret_name) get_secret_value_response = json.loads(get_secret_value_response['SecretString']) return get_secret_value_response # In[4]: creds = get_secret("wysde") connection_parameters = { "account": creds["SNOWFLAKE_ACCOUNT"], "user": creds["SNOWFLAKE_USERNAME"], "password": creds["SNOWFLAKE_PASSWORD"], "warehouse": creds["SNOWFLAKE_WAREHOUSE"], "role": creds["SNOWFLAKE_ROLE"], "database": "sparsh", "schema": "madcow" } # In[ ]: # Install needed packages # Be sure check that packages also exist in Acaconda repo for Snowflake get_ipython().system('pip install scipy numpy seaborn pandas statsmodels') # In[ ]: get_ipython().system('mkdir -p seeds') get_ipython().system('cp -r ../data/*.csv seeds') # In[9]: get_ipython().system('dbt seed') # In[5]: session = Session.builder.configs(connection_parameters).create() # In[ ]: import pandas as pd import numpy as np import seaborn import statsmodels.api as sm import statsmodels.formula.api as smf from scipy.stats import poisson from snowflake.snowpark.functions import udf # In[7]: # Let's fetch training data that we have in Snowflake about FIFA games # Data is limited to games played after 2000 for validity reasons and also because this limits the size of the model df = session.sql("SELECT * FROM RESULTS WHERE DATE > '2000-01-01'").to_pandas() # In[14]: # It seems that score values are casted as strings, let's cast them to floats df = df[['HOME_TEAM','AWAY_TEAM','HOME_SCORE','AWAY_SCORE']].copy() df['HOME_SCORE'] = df['HOME_SCORE'].astype(float) df['AWAY_SCORE'] = df['AWAY_SCORE'].astype(float) df = df.sample(frac=0.1) # In[15]: # Lets build the model using Poisson model goal_model_data = pd.concat([df[['HOME_TEAM','AWAY_TEAM','HOME_SCORE']].assign(home=1).rename( columns={'HOME_TEAM':'team', 'AWAY_TEAM':'opponent','HOME_SCORE':'goals'}), df[['AWAY_TEAM','HOME_TEAM','AWAY_SCORE']].assign(home=0).rename( columns={'AWAY_TEAM':'team', 'HOME_TEAM':'opponent','AWAY_SCORE':'goals'})]) model = smf.glm(formula="goals ~ home + team + opponent", data=goal_model_data, family=sm.families.Poisson()).fit() # In[16]: # Lets import the model to Snowflake stage session.sql('create or replace stage MODELSTAGE').collect() #Create a model stage if it does not already exist. # In[18]: # Let's build the Snowflake Python UDF using Snowpark @udf command # Necessary packages are imported in the same command # This is part is needed only once @udf(name='predict_result', is_permanent = True, stage_location = '@MODELSTAGE', replace=True, session=session, packages=["numpy", "pandas", "scipy", "statsmodels"]) def simulate_match(homeTeam: str, awayTeam: str) -> float: home_goals_avg = model.predict(pd.DataFrame(data={'team': homeTeam, 'opponent': awayTeam,'home':1}, index=[1])).values[0] away_goals_avg = model.predict(pd.DataFrame(data={'team': awayTeam, 'opponent': homeTeam,'home':0}, index=[1])).values[0] team_pred = [[poisson.pmf(i, team_avg) for i in range(0, 10)] for team_avg in [home_goals_avg, away_goals_avg]] home_win=(np.outer(np.array(team_pred[0]), np.array(team_pred[1]))) result = np.sum(np.tril(home_win, -1)) return result # In[19]: get_ipython().system('mkdir -p models') # In[ ]: get_ipython().run_cell_magic('writefile', 'models/fifa_predictions.py', 'def model(dbt, session):\n dbt.config(\n materialized="table",\n packages=["pandas", "numpy", "statsmodels", "scipy", "joblib"]\n )\n goal_model_data = dbt.ref("goal_model_data").to_pandas()\n\n model = smf.glm(formula="goals ~ home + team + opponent", data=goal_model_data, \n family=sm.families.Poisson()).fit()\n\n session.sql(\'create or replace stage MODELSTAGE\').collect()\n\n model_file = \'tmp/model.joblib\'\n dump(model, model_file)\n session.file.put(model_file, "@MODELSTAGE", overwrite=True)\n\n return goal_model_data #We need to always return a DF from dbt Python\n') # Predict 1st round winners using existing RESULTS Loop through the schedule and populate PREDICT_FOR_HOME_WIN column with value provided by previously build FUNCTION # # Note, full data predicton can take up to 20minutes with X-SMALL warehouse # In[ ]: get_ipython().run_cell_magic('sql', '', 'SELECT * FROM SHOOTOUTS LIMIT 10\nSELECT * FROM GOALSCORERS LIMIT 10\nSELECT * FROM RESULTS LIMIT 10\n\nCREATE TABLE RESULTS_SMALL AS\nSELECT * FROM RESULTS LIMIT 10\n\nALTER TABLE RESULTS_SMALL\nADD PREDICT_FOR_HOME_WIN smallint;\n\nDECLARE\n c1 CURSOR FOR SELECT home_team, away_team FROM RESULTS_SMALL;\n home_team VARCHAR;\n away_team VARCHAR;\nBEGIN\n FOR record IN c1 DO\n home_team := record.home_team;\n away_team := record.away_team;\n UPDATE RESULTS_SMALL f SET predict_for_home_win = prediction.p \n FROM (SELECT predict_result(:home_team, :away_team) p ) AS prediction\n WHERE f.home_team = :home_team AND f.away_team = :away_team ;\n END FOR;\nEND;\n\nSELECT * FROM RESULTS_SMALL LIMIT 10\n') # In[ ]: get_ipython().run_cell_magic('sql', '', '-- 3. Create view to show winners of Group stage\nCREATE OR REPLACE VIEW A1_WINNERS_GROUP_STAGE(\n\tHOME_TEAM,\n\tAWAY_TEAM,\n\tWINNER\n) as (\nSELECT \n HOME_TEAM\n , AWAY_TEAM\n , CASE WHEN PREDICT_FOR_HOME_WIN > 0.5 THEN HOME_TEAM\n ELSE AWAY_TEAM END AS WINNER\nFROM RESULTS_SMALL\n);\n\nSELECT * FROM RESULTS_SMALL LIMIT 10\n') # In[ ]: get_ipython().run_cell_magic('sql', '', "-- 4. Create view to show Round of 16 game pairs\nCREATE OR REPLACE VIEW A2_GAME_PAIRS_ROUND_OF_16(\n\tHOME_TEAM,\n\tAWAY_TEAM\n) as (\nWITH BRACKET_WINNERS AS (\nSELECT \n DISTINCT\n winners_group_stage.HOME_TEAM as W_HOME,\n winners_group_stage.AWAY_TEAM as W_AWAY,\n winners_group_stage.WINNER\nFROM A1_WINNERS_GROUP_STAGE winners_group_stage\nINNER JOIN RESULTS_SMALL fifa_schedule ON TRIM(fifa_schedule.HOME_TEAM) = TRIM(winners_group_stage.HOME_TEAM) AND TRIM(fifa_schedule.AWAY_TEAM) = TRIM(winners_group_stage.AWAY_TEAM)\n GROUP BY W_HOME, W_AWAY, WINNER\n), BRACKET_QUALIFIERS AS (\n SELECT \n WINNER,\n COUNT(WINNER) AS WIN_COUNT,\n RANK() OVER (ORDER BY COUNT(WINNER) DESC) AS rank \n FROM BRACKET_WINNERS\n GROUP BY WINNER\n), BRACKET_RANKS AS (\n SELECT \n bracket_qualifiers.WINNER AS COUNTRY\n , bracket_qualifiers.WIN_COUNT AS WINS\n , CASE WHEN rank = 1 THEN\n 1\n ELSE 2\n END AS RANK_IN_BRACKET\n FROM BRACKET_QUALIFIERS bracket_qualifiers\n WHERE RANK < 3\n), BRACKET_RANK_TO_COUNTRY_SCHEDULE AS (\n SELECT \n COUNTRY\n , REPLACE(RANK_IN_BRACKET, ' ', '') AS RANK_IN_BRACKET\n FROM BRACKET_RANKS\n ORDER BY RANK_IN_BRACKET\n), KNOCKOUT_GAMES AS (SELECT \n *\n FROM RESULTS_SMALL \n) SELECT \n bg_home.country AS HOME_TEAM\n , bg_away.country AS AWAY_TEAM\n FROM KNOCKOUT_GAMES kg \n INNER JOIN BRACKET_RANK_TO_COUNTRY_SCHEDULE bg_home ON bg_home.RANK_IN_BRACKET = kg.HOME_TEAM\n INNER JOIN BRACKET_RANK_TO_COUNTRY_SCHEDULE bg_away ON bg_away.RANK_IN_BRACKET = kg.AWAY_TEAM\n) ;\n\nSELECT * FROM A2_GAME_PAIRS_ROUND_OF_16 LIMIT 10;\n") # In[ ]: