#!/usr/bin/env python
# coding: utf-8

# # HRRR Forecast Collection Time Series
# Read a collection of GRIB2 files on AWS as a single dataset using the Zarr library, via fsspec's ReferenceFileSystem.
#
# This notebook demonstrates how to generate the reference JSON files using [Kerchunk](https://github.com/fsspec/kerchunk).

# In[1]:


import xarray as xr
import datetime as dt
import fsspec
import ujson
from kerchunk.grib2 import scan_grib
from kerchunk.combine import MultiZarrToZarr


# `fsspec` file systems to read GRIB2 forecast files from AWS and to write reference JSONs (to an AWS bucket in this case).

# In[2]:


fs_read = fsspec.filesystem('s3', anon=True, skip_instance_cache=True)
fs_write = fsspec.filesystem('s3', anon=False)
#fs_write = fsspec.filesystem('')  # uncomment this if you intend to write the JSONs to a local folder


# Get the latest forecast.

# In[3]:


days_avail = fs_read.glob('s3://noaa-hrrr-bdp-pds/hrrr.*')
files = fs_read.glob(f's3://{days_avail[-1]}/conus/*wrfsfcf01.grib2')
files = sorted(['s3://' + f for f in files])
files


# `scan_grib` does not require a filter and will happily create a reference for every available GRIB message. However, when combining the messages with `MultiZarrToZarr` it is necessary for them to share a coordinate system. To make our lives easier and ensure that all reference outputs from `scan_grib` share a coordinate system, we pass a filter argument.

# In[4]:


afilter = {'typeOfLevel': 'heightAboveGround', 'level': [2, 10]}


# In[5]:


so = {"anon": True}


# In[6]:


json_dir = 's3://esip-qhub/noaa/hrrr/jsons/'


def make_json_name(file_url, message_number):  # create a unique name for each reference file
    date = file_url.split('/')[3].split('.')[1]
    name = file_url.split('/')[5].split('.')[1:3]
    return f'{json_dir}{date}_{name[0]}_{name[1]}_message{message_number}.json'


def gen_json(file_url):
    out = scan_grib(file_url, storage_options=so, filter=afilter)  # create the references using scan_grib
    for i, message in enumerate(out):  # scan_grib outputs a list containing one reference per grib message
        out_file_name = make_json_name(file_url, i)  # get name
        with fs_write.open(out_file_name, "w") as f:
            f.write(ujson.dumps(message))  # write to file


# In[7]:


fs_write.rm(json_dir)  # clear the json directory of old references


# In[8]:


get_ipython().run_cell_magic('time', '', '# this step is best run via a cluster\nfor f in files:\n    gen_json(f)\n')


# In[9]:


reference_jsons = fs_write.ls(json_dir)  # get the list of reference file names
reference_jsons = sorted(['s3://' + f for f in reference_jsons])  # prepend the s3 protocol (not necessary if writing to a local filesystem)


# In[10]:


# combine the individual references into a single consolidated reference
mzz = MultiZarrToZarr(reference_jsons,
                      concat_dims=['valid_time'],
                      identical_dims=['latitude', 'longitude', 'heightAboveGround', 'step'])


# In[11]:


get_ipython().run_cell_magic('time', '', 'd = mzz.translate()\n')


# In[12]:


# open the dataset as a zarr object using fsspec's reference file system and xarray
fs = fsspec.filesystem("reference", fo=d, remote_protocol='s3', remote_options={'anon': True})
m = fs.get_mapper("")
ds = xr.open_dataset(m, engine="zarr", backend_kwargs=dict(consolidated=False),
                     chunks={'valid_time': 1})


# In[13]:


ds


# In[14]:


ds['2t'][-1].plot()  # map of 2 m temperature at the last valid time


# In[15]:


ds['2t'][:, 500, 500].plot()  # time series of 2 m temperature at a single grid cell
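
# The per-file `scan_grib` step above is embarrassingly parallel. The cell below is a minimal local sketch using only the standard library (the Dask cluster suggested above scales better for many files); the thread count of 8 is an arbitrary choice, not part of the original workflow.

# In[ ]:


from concurrent.futures import ThreadPoolExecutor

# run gen_json over all forecast files concurrently; max_workers is an assumed value
with ThreadPoolExecutor(max_workers=8) as pool:
    list(pool.map(gen_json, files))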
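
# The consolidated reference `d` is a plain Python dict, so it can be persisted and reused later without rerunning the kerchunk steps above. The cell below is a sketch; the local path `hrrr_combined.json` is a hypothetical name.

# In[ ]:


# write the combined reference to a local JSON file (path is illustrative)
with open('hrrr_combined.json', 'w') as f:
    f.write(ujson.dumps(d))

# reopen the dataset directly from the saved reference file
fs2 = fsspec.filesystem("reference", fo='hrrr_combined.json',
                        remote_protocol='s3', remote_options={'anon': True})
ds2 = xr.open_dataset(fs2.get_mapper(""), engine="zarr",
                      backend_kwargs=dict(consolidated=False),
                      chunks={'valid_time': 1})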
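
# The index-based selection `ds['2t'][:, 500, 500]` above picks an arbitrary grid cell. HRRR is on a 2-D curvilinear (Lambert conformal) grid, so picking the cell nearest a given latitude/longitude takes a small search over the 2-D coordinates. The cell below is a sketch: the point of interest is a hypothetical example, and it assumes longitude is stored in degrees east (0-360), as is typical for GRIB.

# In[ ]:


import numpy as np

lat_pt, lon_pt = 40.0, 360.0 - 105.27  # example point (near Boulder, CO) in degrees east

# crude nearest-neighbour search over the 2-D latitude/longitude coordinate arrays
dist2 = (ds.latitude.values - lat_pt) ** 2 + (ds.longitude.values - lon_pt) ** 2
jy, ix = np.unravel_index(dist2.argmin(), dist2.shape)
ds['2t'][:, jy, ix].plot()  # time series of 2 m temperature at the nearest grid cell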