#!/usr/bin/env python # coding: utf-8 # # Including teleconnection indices as predictors # # In this tutorial, we demonstrate how to set the atmospheric circulation indices included in the pyESD package as predictor for developing downscaling model. Specifically, in this notebook, we rely on the *Recursive* predictor selection method for the predictor selection and used *LassoLarsCV* as the learning model to evaluate the performance of the models generated with and without the indices. # In[7]: # import all the models required import os import sys import pandas as pd import numpy as np from collections import OrderedDict import socket # modules related to pyESD from pyESD.Weatherstation import read_station_csv from pyESD.standardizer import MonthlyStandardizer from pyESD.ESD_utils import store_pickle, store_csv from pyESD.splitter import KFold from pyESD.ESD_utils import Dataset from pyESD.Weatherstation import read_weatherstationnames # ### Data repositories # To avoid the repetition of the paths to the predictors and predictand datasets, we have included them in a script that can easily be imported in all the notebooks # 1. The ERA5 datasets loaded with the Dataset module implemented in the ```pyESD.ESD_utils``` # 2. The constructed time series of the predictors are stored in a pickle file to avoid loading them again on the next run. We set the predictordir to store the datasets # 3. We use the ```read_weatherstationnames``` to generated a list of all the weather station names that would be used to create the weather station objects # 4. We import all these variable as ```from read_data import * ``` which is this file: [read_data.py](./read_data.py) # # All the weather stations are stored in [data](./data) # ### Predictors setting # # 1. list of predictors to be loaded. Note that these names are the variables names stored in the netCDF files containing the predictors datasets # 2. set the date range that would be used to selected the predictors. Here it should be the same as the training and validation period # In[8]: # define the predictors that includes the teleconnection indices predictors_with_indices = ["NAO", "EA", "SCAN", "EAWR", "t2m", "tp","msl", "v10", "u10", "u250", "u850", "u500","u700", "u1000","v250", "v850", "v500","v700", "v1000", "r250", "r850", "r500","r700", "r1000", "z250", "z500", "z700", "z850", "z1000", "t250", "t850", "t500","t700", "t1000","dtd250", "dtd850", "dtd500","dtd700", "dtd1000" ] # In[9]: # define the predictors without teleconnection indices predictors_without = ["t2m", "tp","msl", "v10", "u10", "u250", "u850", "u500","u700", "u1000","v250", "v850", "v500","v700", "v1000", "r250", "r850", "r500","r700", "r1000", "z250", "z500", "z700", "z850", "z1000", "t250", "t850", "t500","t700", "t1000","dtd250", "dtd850", "dtd500","dtd700", "dtd1000" ] # In[10]: # date-range for model training and validation from1958to2010 = pd.date_range(start="1958-01-01", end="2010-12-31", freq="MS") # ### control function # # Define the control function that performs the predictor selection and model training. # 1. read the station data as object that would apply all the ESD routines # 2. set predictors with the list of predictors defined and the radius to construct the regional means # 3. standardize the data with any of the standardizers. Here we use the MonthlyStandardizer method # 4. defined the scoring metrics to be used for the validation # 5. set the model to be used for the ESD training (here we will use the LassoLarsCV model) # 6. fit the model, here we have to define the predictor selector method (here: Recursive ) to be used for selecting the predictors # 7. get the selected predictors # 8. use the cross_validate_predict to get the cross-validation metrics of the model training # 9. store the selected predictors # 10. stored the validation metrics # In[33]: def run_predictor_selection_example_to_test_indices(variable, cachedir, stationnames, station_datadir, predictors, predictordir, radius): """ Run an experiment using pyESD to perform predictor selection for a given variable. Args: variable (str): The target variable to predict, here Precipitation. regressor (str): The regression method to use, here we use the RidgeCV regression to test all the predictor selection methods. selector_method (str): The method for selecting predictors ("Recursive", "TreeBased", "Sequential"). cachedir (str): Directory to store cached results, here all the files would be stored in the . stationnames (list): List of station names. it would be loaded from the read_data file station_datadir (str): Directory containing station data files: this is also set in the read the data file predictors (list): List of predictor variables. predictordir (str): Directory containing predictor data files. radius (float): Radius for selecting predictors: also defined in the read_data file """ num_of_stations = len(stationnames) # Loop through all stations for i in range(num_of_stations): stationname = stationnames[i] # set the exact path for the station data station_dir = os.path.join(station_datadir, stationname + ".csv") # 1. create the station object using the read_station_csv and apply all the methods on the station object SO_instance = read_station_csv(filename=station_dir, varname=variable) # 2. Setting predictors (generate the predictors using the defined predictor names) SO_instance.set_predictors(variable, predictors, predictordir, radius) # 3. Setting standardizer SO_instance.set_standardizer(variable, standardizer=MonthlyStandardizer(detrending=False, scaling=False)) # perform correlation corr = SO_instance.predictor_correlation(variable, from1958to2010, ERA5Data, fit_predictors=True, fit_predictand=True, method="pearson", use_scipy=True) # 4. define the scoring metrics scoring = ["neg_root_mean_squared_error", "r2", "neg_mean_absolute_error"] # 5. Setting model with cross-validation SO_instance.set_model(variable, method="LassoLarsCV", scoring=scoring, cv=KFold(n_splits=10)) # 6. Fitting model with predictor selector option SO_instance.fit(variable, from1958to2010, ERA5Data, fit_predictors=True, predictor_selector=True, selector_method="Recursive", selector_regressor="ARD", cal_relative_importance=False) # 7. Extracting selected predictors selected_predictors = SO_instance.selected_names(variable) # 8. Training estimate for the same model score, ypred = SO_instance.cross_validate_and_predict(variable, from1958to2010, ERA5Data) # 9-10. Storing results using pickle store_pickle(stationname, "selected_predictors_" + "Recursive", selected_predictors, cachedir) store_pickle(stationname, "validation_score_" + "Recursive", score, cachedir) store_csv(stationname, "corrwith_predictors_scipy", corr, cachedir) # In[34]: from read_data import radius, station_prec_datadir, stationnames_prec, ERA5Data, predictordir, cachedir_prec # ### Experiment with the indices defined # Perform the modelling with indices predictors (then store the data in a folder for the analysis) # In[36]: run_predictor_selection_example_to_test_indices(variable="Precipitation", cachedir=cachedir_prec, stationnames=stationnames_prec, station_datadir=station_prec_datadir, predictors=predictors_with_indices, predictordir=predictordir, radius=radius) # ### Experiment without the indices # Perform the modelling with predictors without indices (then store the data in a folder for the analysis) # In[24]: run_predictor_selection_example_to_test_indices(variable="Precipitation", cachedir=cachedir_prec, stationnames=stationnames_prec, station_datadir=station_prec_datadir, predictors=predictors_without, predictordir=predictordir, radius=radius) # In[25]: path_to_results_exp1 = "C:/Users/dboateng/Desktop/Python_scripts/ESD_Package/examples/tutorials/predictor_selection_with_indices/" path_to_results_exp2 = "C:/Users/dboateng/Desktop/Python_scripts/ESD_Package/examples/tutorials/predictor_selection_without_indices/" # In[32]: from pyESD.plot import barplot import matplotlib.pyplot as plt fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(20,15), sharey=False, sharex=True) barplot(methods=["Recursive"], stationnames=stationnames_prec , path_to_data=path_to_results_exp1, xlabel="Stations", ylabel="CV R² (with indices)", varname= "test_r2", varname_std ="test_r2_std", filename="validation_score_", legend=True, ax=ax1,) barplot(methods=["Recursive"], stationnames=stationnames_prec , path_to_data=path_to_results_exp2, xlabel="Stations", ylabel="CV R²", varname= "test_r2", varname_std ="test_r2_std", filename="validation_score_", legend=True, ax=ax2) # The models with indices estimate slight increase in performance in comparison to the models without indices # In[ ]: