#!/usr/bin/env python
# coding: utf-8

# ## [02_Probabilistic.ipynb](https://github.com/raybellwaves/xskillscore-tutorial/blob/master/02_Probabilistic.ipynb)
# This notebook shows how to use probabilistic metrics in a typical data
# science task where the data is a pandas.DataFrame.
#
# The metric Continuous Ranked Probability Score (CRPS) is used to verify
# multiple forecasts for the same target.

# In[1]:
import xarray as xr
import pandas as pd
import numpy as np
import xskillscore as xs

# Use the same data as in
# [01_Deterministic.ipynb](https://github.com/raybellwaves/xskillscore-tutorial/blob/master/01_Determinisitic.ipynb):

# In[2]:
# Build a toy sales table: one row per (date, store, sku) combination with a
# random integer quantity sold in the range 1-9 (the target variable ``y``).
stores = np.arange(4)
skus = np.arange(3)
dates = pd.date_range("1/1/2020", "1/5/2020", freq="D")

# A nested comprehension replaces the triple loop of discarded-index
# ``enumerate`` calls and per-row ``rows.append(dict({...}))``.
rows = [
    {
        "DATE": date,
        "STORE": store,
        "SKU": sku,
        "QUANTITY_SOLD": np.random.randint(9) + 1,
    }
    for date in dates
    for store in stores
    for sku in skus
]

df = pd.DataFrame(rows)
df.rename(columns={"QUANTITY_SOLD": "y"}, inplace=True)
df.set_index(["DATE", "STORE", "SKU"], inplace=True)
df.head()
# Instead of making a single prediction as in
# [01_Deterministic.ipynb](https://github.com/raybellwaves/xskillscore-tutorial/blob/master/01_Determinisitic.ipynb)
# we will make multiple forecasts (ensemble forecast). This is akin to more
# complex methods such as **bagging**, **boosting** and **stacking**.
#
# Do 6 forecasts and append them to the `pandas.DataFrame` using an extra
# field called `member`. This will be saved in a new `pandas.DataFrame`
# called `df_yhat`:

# In[3]:
# Each ensemble member perturbs the target by uniform noise in [-1, 1],
# i.e. yhat = y * (1 + noise), truncated to int.
members = []
for i in range(1, 7):
    tmp = df.copy()
    tmp["member"] = i
    noise = np.random.uniform(-1, 1, size=len(df["y"]))
    tmp["yhat"] = (df["y"] + (df["y"] * noise)).astype(int)
    members.append(tmp)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# way to stack the per-member frames.
df_yhat = pd.concat(members)
df_yhat

# Drop the `y` column from `df_yhat` and add `member` to the MultiIndex:

# In[4]:
df_yhat.drop("y", axis=1, inplace=True)
df_yhat.set_index(["member"], append=True, inplace=True)
df_yhat

# Convert the target `pandas.DataFrame` (`df`) to an `xarray.Dataset`:

# In[5]:
ds = df.to_xarray()
ds

# Now add the predicted `pandas.DataFrame` (`df_yhat`) as an
# `xarray.DataArray` called `yhat` to the `xarray.Dataset`:

# In[6]:
ds["yhat"] = df_yhat.to_xarray()["yhat"]
ds

# Notice how an `xarray.Dataset` can handle Data variables which have
# different shape but share some dimensions.

# # Using xskillscore - CRPS
# Continuous Ranked Probability Score (CRPS) can also be considered as the
# probabilistic Mean Absolute Error. It compares the empirical distribution
# of an ensemble forecast to a scalar observation. It is given as:
#
# \begin{align}
# CRPS = \int_{-\infty}^{\infty} (F(f) - H(f - o))^{2} df
# \end{align}
#
# where `F(f)` is the cumulative distribution function (CDF) of the
# forecast and `H()` is the Heaviside step function, whose value is 1 if
# the argument is positive (the prediction overestimates the target) or 0
# (the prediction is equal to or lower than the target).
#
# See https://climpred.readthedocs.io/en/stable/metrics.html#continuous-ranked-probability-score-crps
# for further documentation.

# It is not a common verification metric and in most cases the predictions
# are averaged then verified using deterministic metrics.
# For example, you can see averaging on the `member` dimension gives a
# better prediction than any individual prediction:
#
# Note: for this we will use the function itself instead of the Accessor
# method:

# In[7]:
avg_member_rmse = xs.rmse(
    ds["y"], ds["yhat"].mean(dim="member"), ["DATE", "STORE", "SKU"]
)
print("avg_member_rmse: ", avg_member_rmse.values)

# Iterate over the actual member labels (1..6) instead of reconstructing
# them from positional indices with range(len(...)) and i + 1.
for member in ds.coords["member"].values:
    print(f"member {member}:")
    ind_member_rmse = xs.rmse(
        ds["y"], ds["yhat"].sel(member=member), ["DATE", "STORE", "SKU"]
    )
    print("ind_member_rmse: ", ind_member_rmse.values)
    # NOTE(review): with random noise this ordering is typical but not
    # guaranteed for every draw — the assert can occasionally fail.
    assert avg_member_rmse < ind_member_rmse

# However, you will see it appear in some Kaggle competitions such as the
# [NFL Big Data Bowl](https://www.kaggle.com/c/nfl-big-data-bowl-2020/overview/evaluation)
# and the
# [Second Annual Data Science Bowl](https://www.kaggle.com/c/second-annual-data-science-bowl/overview/evaluation)
# so it's good to have in your arsenal.

# The CRPS is only valid over the `member` dimension and therefore only
# takes 2 arguments:

# In[8]:
ds.xs.crps_ensemble('y', 'yhat')

# To return an overall CRPS it is recommended to average over all
# dimensions before using `crps`:

# In[9]:
y = ds['y'].mean(dim=['DATE', 'STORE', 'SKU'])
yhat = ds['yhat'].mean(dim=['DATE', 'STORE', 'SKU'])
xs.crps_ensemble(y, yhat)

# In[ ]: