#!/usr/bin/env python
# coding: utf-8

# # The Power of labeled data structures
#
# ***Purpose: Your data has labels; you should use them***
#
# ![](images/dataset-diagram.png)
#
# Scientific data is inherently labeled. For example, time series data includes
# timestamps that label individual periods or points in time, spatial data has
# coordinates (e.g. longitude, latitude, elevation), and model or laboratory
# experiments are often identified by unique identifiers. The figure above
# provides an example of a labeled dataset. In this case the data is a map of
# global air temperature from a numeric weather model. The labels on this
# particular dataset are time (e.g. “2016-05-01”), longitude (x-axis), and
# latitude (y-axis).
#
# ----
#
# ### Outline
# - Numpy index (in brief)
# - Named dimensions/axes
# - Coordinate labels
# - Label based indexing
# - Alignment
#
# ### Tutorial Duration
# 10 minutes
#
# ### Going Further
#
# Xarray Documentation on Indexing: http://xarray.pydata.org/en/latest/indexing.html

# In[1]:


import xarray as xr


# In[2]:


# load a sample dataset
ds = xr.tutorial.load_dataset('air_temperature')
ds


# ## The old way (numpy positional indexing)
#
# When working with numpy, indexing is done by position (slices/ranges/scalars).

# In[3]:


t = ds['air'].data  # numpy array
t


# In[4]:


t.shape


# In[5]:


# extract a time-series for one spatial location
t[:, 10, 20]


# but wait, what labels go with `10` and `20`? Was that lat/lon or lon/lat?
# Where are the timestamps that go along with this time-series?

# # Indexing with Xarray
#
# xarray offers extremely flexible indexing routines that combine the best
# features of NumPy and pandas for data selection.

# In[6]:


da = ds['air']


# In[7]:


# numpy style indexing still works (but preserves the labels/metadata)
da[:, 10, 20]


# In[8]:


# Positional indexing using dimension names
da.isel(lat=10, lon=20)


# In[9]:


# Label-based indexing
da.sel(lat=50., lon=250.)


# In[10]:


# Nearest neighbor lookups
da.sel(lat=52.25, lon=251.8998, method='nearest')


# In[11]:


# all of these indexing methods work on the dataset too, e.g.:
ds.sel(lat=52.25, lon=251.8998, method='nearest')


# ## Vectorized indexing
#
# Like numpy and pandas, xarray supports indexing many array elements at once
# in a vectorized manner.

# In[12]:


# generate coordinates for a transect of points
# (both indexers share the new 'points' dimension, so they are paired up
# element by element)
lat_points = xr.DataArray([52, 52.5, 53], dims='points')
lon_points = xr.DataArray([250, 250, 250], dims='points')


# In[13]:


# nearest neighbor selection along the transect
da.sel(lat=lat_points, lon=lon_points, method='nearest')


# ## Alignment
#
# xarray enforces alignment between index Coordinates (that is, coordinates
# with the same name as a dimension, marked by *) on objects used in binary
# operations.

# In[21]:


da


# In[24]:


arr = da.isel(time=0, lat=slice(5, 10), lon=slice(7, 11))
arr


# In[26]:


# drop the last entry along arr's first dimension so that `part` no longer
# carries every label that `arr` does
part = arr[:-1]
part


# In[32]:


# default behavior is an "inner join"
(arr + part) / 2


# In[31]:


# we can also use an outer join
# (the option is only in effect inside the `with` block)
with xr.set_options(arithmetic_join="outer"):
    print((arr + part) / 2)
# notice that missing values (nan) were inserted


# ## Broadcasting
#
# DataArray objects automatically align themselves (“broadcasting” in the numpy
# parlance) by dimension name instead of axis order. With xarray, you do not
# need to transpose arrays or insert dimensions of length 1 to get array
# operations to work, as commonly done in numpy with np.reshape() or
# np.newaxis.

# In[ ]:
# (empty trailing cell: no broadcasting example is provided in this script)