#!/usr/bin/env python
# coding: utf-8

# # EDIPI Training Event 1&2, November 2021
# 
# Introductory exercises: data analysis with xarray and scipy
# # Imports
# 
# The first cell in a notebook should always contain all necessary imports

# In[1]:

import numpy as np
import xarray as xr
import scipy.stats as stats
from matplotlib import pyplot as plt

# # Load the dataset and take a look at the contents
# 
# The dataset contains a single variable called 'tmax' with time coordinate 'time'.
# 
# The 'tmax' variable is ERA5 daily-maximum surface temperature from the Pacific Northwest area of the USA for the period 1950-2021.

# In[2]:

ds = xr.open_dataset('itmax_era5_index.nc')
ds

# # Extract the tmax data array and plot it
# 
# Clearly, the summer of 2021 was an extreme event; you can read about it [here](https://www.worldweatherattribution.org/western-north-american-extreme-heat-virtually-impossible-without-human-caused-climate-change/)

# In[3]:

tx = ds['tmax']  # tx now contains the tmax data

# In[4]:

plt.figure(figsize=(15,5))
tx.plot();

# # The properties of time
# 
# The 'time' coordinate is a datetime64 object, which provides lots of powerful functionality in xarray, see [here](http://xarray.pydata.org/en/stable/user-guide/time-series.html)
# 
# In particular, the .dt accessor gives access to a lot of information: “year”, “month”, “day”, “hour”, “minute”, “second”, “dayofyear”, “week”, “dayofweek”, “weekday”, “quarter” and “season”.

# In[5]:

dates = ds['time'].sel(time = ['1950-01-01', '1966-07-14', '2001-09-11'])  # can select specific dates as strings
dates

# In[6]:

dates.dt.dayofyear  # the day of year at those dates

# In[7]:

dates.dt.season  # the season at those dates
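# As a quick illustration of how far the .dt accessor goes (an added sketch, not
# part of the original exercises), datetime components combine naturally with
# groupby, e.g. to compute the mean tmax for each season:

# In[ ]:

tx.groupby('time.season').mean()  # one mean value per season (DJF, MAM, JJA, SON)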
# # Exercises

# ## Compute the mean, median and 99th percentile temperature
# 
# Use the xarray .mean and [.quantile](http://xarray.pydata.org/en/stable/generated/xarray.DataArray.quantile.html) functions

# In[8]:

print('mean: %4.1f' % tx.mean(dim='time'))
print('median: %4.1f' % tx.quantile(0.5, dim='time'))
print('99th percentile: %4.1f' % tx.quantile(0.99, dim='time'))

# ## Compute the mean summer (May-Sep) temperature for the years 2011-2020
# 
# Use [.sel](http://xarray.pydata.org/en/stable/generated/xarray.DataArray.sel.html) to select the years 2011-2020
# 
# Use [.where](http://xarray.pydata.org/en/stable/generated/xarray.DataArray.where.html) to filter out non-summer days
# 
# Use [.coarsen](https://xarray.pydata.org/en/stable/generated/xarray.DataArray.coarsen.html) to take means over the summer of each year

# In[9]:

# select all data in the time range specified by slice
tx_2011_2020 = tx.sel(time = slice('2011-01-01','2020-12-31'))

# filter data -- replace all data with dates outside May-Sep with nan
tx_2011_2020_summer = tx_2011_2020.where(tx_2011_2020['time'].dt.month.isin([5,6,7,8,9]))

# coarsen data into 365-day chunks and average over each chunk.
# NaNs do not contribute to the mean, so the result is the May-Sep mean for each year
tx_2011_2020_summer_mean = tx_2011_2020_summer.coarsen(time=365, boundary='trim').mean()

# In[10]:

plt.figure(figsize=(15,5))
tx_2011_2020_summer.plot()
tx_2011_2020_summer_mean.plot.line(marker='o');

# ## Compute a smoothed seasonal cycle
# 
# Use [.groupby](http://xarray.pydata.org/en/stable/generated/xarray.DataArray.groupby.html) to group data by day of year and average over all years -- this gives a "raw" seasonal cycle
# 
# Apply [.rolling](http://xarray.pydata.org/en/stable/generated/xarray.DataArray.rolling.html) to the raw seasonal cycle to compute a 31-day running-mean smoothed seasonal cycle; you will need to use [.pad](http://xarray.pydata.org/en/stable/generated/xarray.DataArray.pad.html) with mode='wrap' to extend the raw seasonal cycle periodically, so the running-mean window doesn't run out of data at the beginning and end

# In[70]:

# daily seasonal cycle
tx_sc = tx.groupby(tx['time'].dt.dayofyear).mean()

# smooth seasonal cycle with moving average over window of specified width
window = 31
pad = int(window/2)
tx_sc_smooth = tx_sc.pad(dayofyear=(pad,pad), mode='wrap').rolling(dayofyear=window, center=True).mean().dropna(dim='dayofyear')

# remove seasonal cycle from data
tx_nosc = tx.groupby('time.dayofyear') - tx_sc_smooth

# In[12]:

plt.figure(figsize=(15,5))
tx_sc.plot()
tx_sc_smooth.plot()
plt.legend(['raw','smooth'])
plt.title('Smoothed seasonal cycle (degC)');
plt.ylabel('temperature (degC)');

# In[13]:

plt.figure(figsize=(15,5))
tx_nosc.plot()
plt.title('Deseasonalised data (anomalies from annual cycle, degC)');
plt.ylabel('temperature (degC)');

# ## Extract annual block maxima (i.e. the maximum temperature for each year in the dataset)
# 
# Use [.groupby](http://xarray.pydata.org/en/stable/generated/xarray.DataArray.groupby.html) to group data by year and extract the maximum value for each year

# In[14]:

tx_bm = tx.groupby(tx['time'].dt.year).max()

# In[15]:

plt.figure(figsize=(15,5))
tx_bm.plot.line(marker='o');
plt.ylabel('temperature (degC)');

# ## Compute the linear time trend in the annual maxima
# 
# A linear time trend is just a linear regression of the data onto time.
# 
# Compute the trend in the annual block maxima calculated above using [stats.linregress](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.linregress.html)

# In[16]:

# note that the time axis of tx_bm is no longer called 'time', it's called 'year'
r = stats.linregress(tx_bm['year'], tx_bm)
trend = r.intercept + r.slope * tx_bm['year']
tx_bm_detr = tx_bm - trend + tx_bm.mean()

# In[17]:

plt.figure(figsize=(15,5))
tx_bm.plot.line(marker='o')
tx_bm_detr.plot.line(marker='o')
trend.plot.line()
plt.legend(['full','detrended','trend line'])
plt.title('Removing linear trend in annual maxima');
plt.ylabel('temperature (degC)');
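# As an aside (an added sketch, not part of the original exercises), xarray can
# compute the same least-squares trend natively with .polyfit and xr.polyval,
# avoiding the round trip through scipy; the names coeffs and trend_xr below
# are illustrative:

# In[ ]:

coeffs = tx_bm.polyfit(dim='year', deg=1)  # degree-1 fit, equivalent to stats.linregress
trend_xr = xr.polyval(tx_bm['year'], coeffs.polyfit_coefficients)  # evaluate the fitted line at each year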
# ## Estimating the probability density function (PDF)
# 
# The PDF is interpreted as "in a given year, the temperature will have a value contained in the bin ($\ell$, $\ell+d\ell$) with probability $p \, d\ell$", where $p$ is the PDF.
# 
# The PDF for a given dataset can be estimated in two ways:
# 1. parametrically: assume the data follows a given distribution function, and fit the parameters
# 2. non-parametrically: compute a histogram
# 
# Exercise:
# 
# Apply both these approaches to the annual block maxima:
# 1. Use [stats.genextreme.fit](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.genextreme.html) to fit a generalized extreme value (GEV) distribution to the block maxima, leaving out the extreme data point for 2021
# 2. Generate a "frozen" random variable object by calling stats.genextreme with the fitted parameters: rv = stats.genextreme(params)
# 3. Compute the PDF using rv.pdf
# 4. Compare to the histogram, computed using the xarray .plot.hist function

# In[91]:

# fit a GEV distribution, leaving out 2021
shape, location, scale = stats.genextreme.fit( tx_bm.sel(year=slice(1950, 2020)) )
print(shape, location, scale)

# produce "frozen" random variable for the fitted distribution
rv = stats.genextreme(shape, location, scale)

# set up a dummy tx axis called tx_p which spans from the 0.0001 quantile to the 0.9999 quantile of the fitted distribution
tx_min = rv.ppf(0.0001)
tx_max = rv.ppf(0.9999)
tx_p = np.linspace(tx_min, tx_max, 100)

# compute the PDF
p = rv.pdf(tx_p)

# In[92]:

tx_bm.plot.hist(bins=10, density=True)
plt.plot(tx_p, p)
plt.ylabel('$p$')
plt.xlabel(r'$\ell$', fontsize=14);

# ## Estimating the cumulative distribution function (CDF)
# 
# The CDF is just the integral of the PDF. It can be interpreted as "in a given year, the temperature will stay below level $\ell$ with probability $P(\ell)$", where $P$ is the CDF.
# 
# Alternatively, one can consider the function $1-P$ (sometimes called the survival function); the interpretation is then "in a given year, the temperature will *exceed* $\ell$ with probability $1-P(\ell)$"
# 
# Again, the CDF can be estimated parametrically (via a fit, as above), or non-parametrically directly from the data.
# 
# The simplest non-parametric estimator for the CDF is to rank the data points in ascending order and assume that the CDF increases by 1/N for each data point, where N is the total number of years in the sample. More accurate assumptions are described [here](https://edx.hydrolearn.org/courses/course-v1:HydroLearn+HydroLearn410+2019_S2/courseware/cd43c81fa55944d4ad7d5bd9aabc8837/6bdb39cb4f1847a4afeab59c37706570/3?activate_block_id=block-v1%3AHydroLearn%2BHydroLearn410%2B2019_S2%2Btype%40vertical%2Bblock%40a03085271bac412ebbcc7a25e12b081a).
# 
# Exercise:
# 1. Use the .sortby function to rank the annual block maxima in ascending order
# 2. Create a P array which increases from just above 0 to just below 1 in N even steps, as described in the link above
# 3. Plot the two against one another; this is the non-parametric CDF
# 4. Compute and plot the parametric CDF using rv.cdf with the "frozen" rv computed in the previous exercise

# In[93]:

# Non-parametric CDF estimator
N = len(tx_bm)  # number of data points
tx_np = tx_bm.sortby(tx_bm, ascending=True)
a = 0.5  # this is the Hazen estimator
P_np = (np.linspace(1, N, N) - a) / (N + 1 - 2*a)

# Parametric estimator
P_p = rv.cdf(tx_p)

# In[94]:

fig, axes = plt.subplots(1, 2, figsize=(15,5))
ax = axes[0]
ax.plot(tx_np, P_np, 'k.')
ax.plot(tx_p, P_p)
ax.set_xlabel(r'$\ell$', fontsize=14)
ax.set_ylabel(r'$P(\ell)$', fontsize=14)
ax.set_title('CDF')
ax = axes[1]
ax.plot(tx_np, 1 - P_np, 'k.')
ax.plot(tx_p, 1 - P_p)
ax.set_xlabel(r'$\ell$', fontsize=14)
ax.set_ylabel(r'$1-P(\ell)$', fontsize=14)
ax.set_title('1 - CDF');
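# As a quick consistency check (an added sketch, not part of the original
# exercises), the empirical exceedance frequency of any level can be compared
# against the fitted survival function; the threshold below is an arbitrary
# example value:

# In[ ]:

level = 40.0  # arbitrary example threshold (degC)
print('empirical exceedance prob: %5.3f' % float((tx_bm > level).mean()))
print('fitted GEV exceedance prob: %5.3f' % rv.sf(level))  # rv.sf is 1 - CDF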
# ## Return level plot
# 
# The return period $r$ is defined as the number of years you expect to wait in order to see the temperature exceed a certain level $\ell$.
# 
# You can also think of the return period as the number of years needed for the expected number of exceedances of $\ell$ to be equal to 1, assuming all years are independent:
# 
# $r \, (1-P(\ell)) = 1$
# 
# so
# 
# $r = \frac{1}{1-P(\ell)}$
# 
# A return level plot is just a modified form of the 1-CDF plot, with the temperatures plotted on the y axis (instead of the x axis) and with $r = 1/(1-P)$ on the x axis.
# 
# Exercise:
# 
# Plot the parametric and non-parametric return level estimators. Use a logarithmic x axis.

# In[95]:

# non-parametric return periods
r_np = 1 / (1 - P_np)

# parametric return periods
r_p = 1 / (1 - P_p)

# In[96]:

plt.figure(figsize=(7.5,5))
plt.semilogx(r_np, tx_np, 'k.')
plt.semilogx(r_p, tx_p)
plt.ylabel(r'$\ell$', fontsize=14)
plt.xlabel('return period (years)');
plt.title('Return level plot');

# ## Bootstrap method to determine uncertainty in fit
# 
# Exercise: To get an estimate of the uncertainty in the estimates, do the following:
# 1. Generate 1000 synthetic data samples by resampling the block maxima with replacement (use the np.random.choice function)
# 2. Fit a GEV to each of the samples
# 3. For each fit, compute the return level for a given set of return periods
# 4. Compute the upper and lower percentiles for the return levels at each return period

# In[24]:

def bootstrap_ci(original_sample):
    Nsamples = 1000  # no. of samples in bootstrap
    Nrp = 50         # no. of return periods
    r = np.linspace(10, 1000, Nrp)    # the return periods at which to compute the confidence interval
    P = 1 - 1/r                       # the corresponding probabilities
    data = np.zeros((Nrp, Nsamples))  # array to hold bootstrap samples
    for i in range(Nsamples):
        sample = np.random.choice(original_sample, size=len(original_sample), replace=True)  # resample with replacement
        rv = stats.genextreme( *stats.genextreme.fit(sample) )  # fit GEV to new sample
        data[:,i] = rv.ppf(P)  # get quantiles for given probs
    ci = np.percentile(data, (2.5, 97.5), axis=1)  # 95% confidence interval
    return r, ci

# In[25]:

# perform bootstrap, leaving out the extreme 2021 value (as in the GEV fit)
r_boot, ci = bootstrap_ci(tx_bm[:-1])

# In[26]:

plt.figure(figsize=(7.5,5))
plt.semilogx(r_np, tx_np, 'k.')
plt.semilogx(r_p, tx_p)
plt.semilogx(r_boot, ci[0], r_boot, ci[1])
plt.ylabel(r'$\ell$', fontsize=14)
plt.xlabel('return period (years)');
plt.title('Return level plot with 95% uncertainty bounds');
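# A variant worth knowing (an added sketch, not part of the original exercises):
# the parametric bootstrap, which resamples synthetic series from the fitted GEV
# itself rather than from the observed block maxima; the function name below is
# illustrative:

# In[ ]:

def parametric_bootstrap_ci(rv_fitted, n, Nsamples=1000):
    # draw Nsamples synthetic samples of size n from the fitted distribution,
    # refit a GEV to each, and collect the spread of the resulting return levels
    r = np.linspace(10, 1000, 50)
    P = 1 - 1/r
    data = np.zeros((len(r), Nsamples))
    for i in range(Nsamples):
        sample = rv_fitted.rvs(size=n)  # synthetic sample from the fitted GEV
        rv_i = stats.genextreme( *stats.genextreme.fit(sample) )
        data[:,i] = rv_i.ppf(P)
    return r, np.percentile(data, (2.5, 97.5), axis=1)

# e.g.: r_pboot, ci_pboot = parametric_bootstrap_ci(rv, len(tx_bm) - 1)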