#!/usr/bin/env python
# coding: utf-8

# ## [03_Big_Data.ipynb](https://github.com/raybellwaves/xskillscore-tutorial/blob/master/03_Big_Data.ipynb)

# In this notebook I verify 12 million forecasts in a couple of seconds using the RMSE metric on a `dask.array`.

# In[1]:


import xarray as xr
import pandas as pd
import numpy as np
import xskillscore as xs

import dask.array as da
from dask.distributed import Client


# By default the [`dask.distributed.Client`](https://distributed.dask.org/en/latest/client.html) uses a [`LocalCluster`](https://docs.dask.org/en/latest/setup/single-distributed.html#localcluster):
#
# ```
# cluster = LocalCluster()
# client = Client(cluster)
# ```
#
# However, this code can easily be adapted to scale to massive datasets using distributed computing, via various methods of deployment:
#
# - [Kubernetes](https://docs.dask.org/en/latest/setup/kubernetes.html)
# - [High Performance Computers](https://docs.dask.org/en/latest/setup/hpc.html)
# - [YARN](https://yarn.dask.org/en/latest/)
# - [AWS Fargate](https://aws.amazon.com/fargate/)
#
# or vendor products:
#
# - [SaturnCloud](https://www.saturncloud.io/s/)
#
# If anyone does run this example on a large cluster I would be curious how big you can scale `nstores` and `nskus` and how long `rmse` takes to run. You are welcome to post your results in the issues section via this [link](https://github.com/raybellwaves/xskillscore-tutorial/issues/new/choose).

# Set up the client (i.e. connect to the scheduler):

# In[2]:


client = Client()
client


# Due to the success of your previous forecast (and its verification using xskillscore!), the company you work for has expanded. It has grown to 4,000 stores, each stocking 3,000 products:

# In[3]:


nstores = 4000
nskus = 3000
nforecasts = nstores * nskus
print(f"That's {nforecasts:,d} different forecasts to verify!")

stores = np.arange(nstores)
skus = np.arange(nskus)


# The time period of interest is the same dates but for 2021:

# In[4]:


dates = pd.date_range("1/1/2021", "1/5/2021", freq="D")


# Set up the data as a `dask.array` of dates x stores x skus.
#
# `dask.array` mirrors the `numpy` API. In this case, switch the `np.` prefix to `da.` to generate random integers from 1 to 9:

# In[5]:


data = da.random.randint(9, size=(len(dates), len(stores), len(skus))) + 1
data


# Put this into an `xarray.DataArray` and specify the coordinates and dimensions:

# In[6]:


y = xr.DataArray(data, coords=[dates, stores, skus], dims=["DATE", "STORE", "SKU"])
y


# Create a prediction array similar to that in [01_Deterministic.ipynb](https://github.com/raybellwaves/xskillscore-tutorial/blob/master/01_Determinisitic.ipynb):

# In[7]:


noise = da.random.uniform(-1, 1, size=(len(dates), len(stores), len(skus)))
yhat = y + (y * noise)
yhat


# Finally, calculate the RMSE at the store and SKU level.
#
# The result is lazy, so use the [`.compute()`](https://distributed.dask.org/en/latest/manage-computation.html) method to trigger the calculation and return the values:

# In[8]:


get_ipython().run_line_magic('time', "xs.rmse(y, yhat, 'DATE').compute()")
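# Everything above ran on the default `LocalCluster`. A minimal sketch of
# pointing the same code at an existing distributed cluster instead (the
# scheduler address below is hypothetical; use whatever address your
# Kubernetes, HPC, YARN or Fargate deployment reports):
#
# ```
# client = Client("tcp://192.0.2.1:8786")  # hypothetical scheduler address
# ```
#
# Nothing else needs to change: the same `dask.array` and `xskillscore` calls
# will then run on the remote workers.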
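# If you want to try a bigger run before moving to a real cluster, only the
# problem size needs to change. A sketch with illustrative values (roughly 4x
# the forecasts, so expect a few GB of memory use on the workers):

# In[9]:


nstores = 8000  # illustrative value, not a recommendation
nskus = 6000    # illustrative value, not a recommendation
stores = np.arange(nstores)
skus = np.arange(nskus)

# Rebuild the observed and predicted arrays at the new size
data = da.random.randint(9, size=(len(dates), len(stores), len(skus))) + 1
y = xr.DataArray(data, coords=[dates, stores, skus], dims=["DATE", "STORE", "SKU"])
noise = da.random.uniform(-1, 1, size=(len(dates), len(stores), len(skus)))
yhat = y + (y * noise)

# Time the verification again, now at 48 million forecasts
get_ipython().run_line_magic('time', "xs.rmse(y, yhat, 'DATE').compute()")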
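# When you are finished, shut down the client (with the default `LocalCluster`
# this also tears down the local workers):

# In[10]:


client.close()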