#!/usr/bin/env python
# coding: utf-8

# ## Demo notebook for accessing NSRDB data on Azure
#
# This notebook provides an example of accessing National Solar Radiation Database (NSRDB) data from blob storage on Azure. The data is stored in one HDF5 file per year.
#
# NSRDB data are stored in the East US Azure region, so this notebook will run most efficiently on Azure compute located in East US. We recommend that substantial computation depending on NSRDB data also be situated in East US. You don't want to download hundreds of terabytes to your laptop! If you are using this data for environmental science applications, consider applying for an [AI for Earth grant](http://aka.ms/ai4egrants) to support your compute requirements.
#
# This notebook was adapted from the [NREL NSRDB/HSDS example](https://github.com/NREL/hsds-examples/blob/master/notebooks/03_NSRDB_introduction.ipynb).

# ### Imports and constants

# In[1]:

import xarray as xr
import pandas as pd
import planetary_computer
from adlfs import AzureBlobFileSystem

# Year to investigate and plot
year = 2015

# Storage resources
storage_account_name = 'nrel'
folder = 'nrel-nsrdb/v3'

# ### List the data files
#
# We can use `adlfs` to list the available files (one per year):

# In[2]:

fs = AzureBlobFileSystem(
    account_name=storage_account_name,
    credential=planetary_computer.sas.get_token("nrel", "nrel-nsrdb").token
)

annual_files = fs.glob(folder + '/*.h5')
print(f'Found {len(annual_files)} annual files:')
for path in annual_files[:10]:
    print(path)
print('...')

# ### Open one data file with xarray

# In[3]:

file = fs.open(f"{folder}/nsrdb_{year}.h5")
ds = xr.open_dataset(file, backend_kwargs={"phony_dims": "sort"}, engine="h5netcdf")
ds

# ### Explore metadata

# In[4]:

# Datasets are stored in 2D grids of shape [time x location]
dset = ds['ghi']
dset.shape

# In[5]:

# Extract the datetime index for the datasets; temporal resolution is 30 minutes
time_index = pd.to_datetime(ds['time_index'][...].astype(str))
time_index

# In[6]:

# Location information is stored in either 'meta' or 'coordinates'.
# Each element contains values for, e.g., latitude, longitude, elevation,
# and land cover type.
meta = ds['meta']
print(meta)

# ### Extract a subset of the data

# In[7]:

# Find the position where time_index matches this timestamp
timestep = time_index.get_loc(pd.Timestamp(str(year) + "-12-31 19:00:00"))

# Capture the metadata in a pandas DataFrame
meta = ds["meta"].data
df = pd.DataFrame(meta)

# Find the positions where df['state'] is California (state names are byte strings)
CA = df[df["state"] == b"California"]

# Load GHI at that timestep for the California locations
subset = ds["ghi"][timestep, CA.index].load()

# ### Plot that subset

# In[8]:

p = pd.DataFrame({"ghi": subset, "lat": CA.latitude, "lon": CA.longitude}).plot.scatter(
    x="lon", y="lat", c="ghi", cmap="viridis"
)
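
# ### Extract a time series at one location
#
# The cell above slices a single timestep across many sites; the complementary access pattern is a full-year series at one site. The cell below is a minimal sketch of that pattern, reusing `ds`, `time_index`, and `CA` from the cells above; the choice of the first California site is arbitrary and for illustration only.

# In[9]:

# Pick one site index from the California subset (arbitrary illustrative choice)
site = CA.index[0]

# Load every timestep for that single location (one column of the [time x location] grid)
series = ds["ghi"][:, site].load()

# Wrap in a pandas Series indexed by timestamp and plot one week in July
ghi_series = pd.Series(series.values, index=time_index)
ghi_series.loc[f"{year}-07-01":f"{year}-07-07"].plot()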
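
# ### Resample to daily means
#
# Because `time_index` is a pandas `DatetimeIndex` at 30-minute resolution, standard pandas resampling applies directly. This is a sketch building on the illustrative `ghi_series` from the previous cell.

# In[10]:

# Average the 30-minute GHI values to one value per day
daily_mean = ghi_series.resample("D").mean()
daily_mean.plot()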
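
# ### Clean up
#
# When finished, close the xarray dataset and the underlying blob file handle.

# In[11]:

ds.close()
file.close()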