Notebook
# Define function to fit an xG model on the shots and attach xG values to the events
def prepare_xg_df(df, pitch_length_x, pitch_length_y, lr_model):
    """
    Fit a logistic-regression xG model on the shots in `df` and return the
    events with an added 'xG' column.

    The model is trained and evaluated on the same shots, using the features
    'distance_to_goal', 'angle', 'isFoot' and 'isHead' with 'isGoal' as target.

    Parameters
    ----------
    df : pandas.DataFrame
        Events DataFrame. The columns read here are 'event_id', 'isShot',
        'isGoal', 'isHead', 'isLeftFooted', 'isRightFooted', 'x' and 'y'.
    pitch_length_x : number
        Pitch extent along x; the goal is taken to be at
        (pitch_length_x, pitch_length_y / 2).
    pitch_length_y : number
        Pitch extent along y.
    lr_model : object
        NOTE: currently ignored. A fresh ``LogisticRegression(random_state=42)``
        is always created and fitted; the parameter is kept so existing
        callers keep working.

    Returns
    -------
    pandas.DataFrame
        `df` left-merged on 'event_id' with the predicted goal probability of
        each shot in a new column 'xG'. Rows that are not shots get NaN.
        The input `df` is not modified.
    """

    ## Data Engineering

    ### Filter DataFrame for shots. Take an explicit copy so the new feature
    ### columns below are not written onto a slice of `df`.
    df_shots = df[df['isShot'] == 1].copy()

    ### Offsets of every shot from the goal line (x) and the centre line (y)
    dx = pitch_length_x - df_shots['x']
    dy = df_shots['y'] - (pitch_length_y / 2)

    ### Create new features - 'isFoot', 'distance_to_goal', 'distance_to_center', and 'angle'
    is_foot = (df_shots['isLeftFooted'] == 1) | (df_shots['isRightFooted'] == 1)
    df_shots['isFoot'] = np.where(is_foot, 1, 0)
    df_shots['distance_to_goal'] = np.sqrt(dx**2 + dy**2)
    df_shots['distance_to_center'] = dy.abs()
    # arctan2 on the absolute offsets equals |arctan(|dy| / dx)| wherever that
    # is defined, and yields 0 instead of NaN (0/0) for a shot located exactly
    # at the goal centre, which would otherwise break the model fit.
    df_shots['angle'] = np.degrees(np.arctan2(dy.abs(), dx.abs()))

    ### Convert data types
    df_shots['isHead'] = df_shots['isHead'].astype('int64')

    ## Data Preparation - for xG model

    ### Select Features of interest
    features_cols = ['distance_to_goal', 'angle', 'isFoot', 'isHead']

    ### Define Target
    target_col = 'isGoal'

    ### Keep only the identifier, the features and the target
    df_shots = df_shots[['event_id'] + features_cols + [target_col]]

    ## Assign Feature and Target to separate DataFrame and Series.
    ## The target is passed as a 1-D Series; a one-column DataFrame makes
    ## scikit-learn emit a column-vector conversion warning.
    X = df_shots[features_cols]
    y = df_shots[target_col]

    ## Training of the logistic regression on the shots
    lr_model = LogisticRegression(random_state=42)
    lr_model.fit(X, y)

    ## Assign xG values i.e. Probability Predictions.
    ## predict_proba returns one row per row of X, in the same order, so the
    ## second column (probability of a goal) can be assigned positionally.
    y_xg = lr_model.predict_proba(X)
    df_shots['prob_goal'] = y_xg[:, 1]

    ## Final DataFrame preparation

    ### Select columns of interest
    df_shots_xg = df_shots[['event_id', 'prob_goal']]

    ### Join the Shots DataFrame with the xG values back onto the original Events DataFrame
    df = pd.merge(df, df_shots_xg, on='event_id', how='left')

    ### Rename columns
    df = df.rename(columns={'prob_goal': 'xG'})

    ## Return DataFrame
    return df