#!/usr/bin/env python
# coding: utf-8

# # Accessing the latest HRRR GRIB2 forecast data using Kerchunk
# 
# This notebook demonstrates a method to open all the variables in the latest HRRR forecast as a single xarray-datatree object, using kerchunk to create and edit the necessary reference files.

# In[1]:


import fsspec
import ujson
import kerchunk
from kerchunk.grib2 import scan_grib
from kerchunk.combine import MultiZarrToZarr
import dask.bag as db


# Much of this notebook can be run in parallel, so we start a Dask client.

# In[2]:


from dask.distributed import Client

client = Client()


# First let's create two file systems using `fsspec`: one to read the HRRR data from its S3 bucket and another to save the reference files locally.

# In[3]:


fs_read = fsspec.filesystem('s3', anon=True, skip_instance_cache=True)
fs_write = fsspec.filesystem('')


# Then we generate a list containing the paths to the latest forecast data:

# In[4]:


days_avail = fs_read.glob('s3://noaa-hrrr-bdp-pds/hrrr.*')
runs = fs_read.glob(f's3://{days_avail[-1]}/conus/*wrfsfcf01.grib2')
# select the second-to-last run to ensure it is a complete forecast; note that only the
# 00, 06, 12 and 18z HRRR runs extend to 48 hours, the remainder stop at 18 hours
files = fs_read.glob(f's3://{runs[-2][:-8]}*.grib2')
files = sorted(['s3://' + f for f in files])
files


# Create a folder to save the reference files to.

# In[5]:


json_dir = 'jsons/'
fs_write.mkdir(json_dir)


# We use the two functions below to run `scan_grib` on each HRRR forecast file and save a reference file for each grib message within these files.

# In[6]:


def make_json_name(file_url, message_number):
    # create a unique name for each reference file
    date = file_url.split('/')[3].split('.')[1]
    name = file_url.split('/')[5].split('.')[1:3]
    return f'{json_dir}{date}_{name[0]}_{name[1]}_message{message_number}.json'

def gen_json(file_url):
    # scan_grib returns a list of references, one per grib message in the file
    out = scan_grib(file_url, storage_options={"anon": True})
    for i, message in enumerate(out):
        out_file_name = make_json_name(file_url, i)
        with fs_write.open(out_file_name, "w") as f:
            f.write(ujson.dumps(message))  # write reference to file


# Run the computation. Each file takes about 5-10 minutes to scan, so I recommend starting with just two files to check everything runs smoothly.

# In[7]:


get_ipython().run_cell_magic('time', '', 'bag = db.from_sequence(files[:2])\nbag_map = bag.map(gen_json)\n_ = bag_map.compute()\n')
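# Each reference file is plain JSON containing a `refs` mapping from zarr keys to either inline data or `[url, offset, length]` pointers into the original grib2 file on S3. As an optional sanity check (a minimal sketch, not part of the workflow above), we can peek at the first few entries of one of the files we just wrote:

# In[ ]:


# open the reference for message 0 of the first scanned file and show a few entries
with fs_write.open(sorted(fs_write.glob(f'{json_dir}*message0.json'))[0]) as f:
    ref = ujson.loads(f.read())
list(ref['refs'].items())[:5]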
# There are some variables within the HRRR dataset that will raise the error:
# 
# > MissingDimensionsError: 'x' has more than 1-dimension and the same name as one of its dimensions ('x', 'y'). xarray disallows such variables because they conflict with the coordinates used to label dimensions.
# 
# (see https://github.com/pydata/xarray/issues/2233). To avoid this issue I have created a dictionary mapping the affected grib messages to the variable names that cause it, and use the `preprocess` option of `MultiZarrToZarr` to drop those variables. They are primarily one-dimensional arrays describing the vertical coordinate of their grib message.

# In[8]:


to_drop = {'103.json': 'surface', '106.json': 'isobaricLayer', '107.json': 'surface',
           '108.json': 'surface', '109.json': 'atmosphereSingleLayer',
           '110.json': 'atmosphereSingleLayer', '111.json': 'atmosphereSingleLayer',
           '112.json': 'atmosphere', '113.json': 'atmosphere',
           '114.json': 'boundaryLayerCloudLayer', '115.json': 'lowCloudLayer',
           '116.json': 'middleCloudLayer', '117.json': 'highCloudLayer',
           '118.json': 'atmosphere', '119.json': 'cloudCeiling', '120.json': 'cloudBase',
           '121.json': 'cloudBase', '122.json': 'cloudTop', '123.json': 'cloudTop',
           '124.json': 'nominalTop', '129.json': 'surface', '130.json': 'surface',
           '131.json': 'surface', '132.json': 'nominalTop',
           '138.json': 'heightAboveGroundLayer', '139.json': 'heightAboveGroundLayer',
           '140.json': 'heightAboveGroundLayer', '141.json': 'isothermZero',
           '142.json': 'isothermZero', '143.json': 'isothermZero',
           '144.json': 'highestTroposphericFreezing', '145.json': 'highestTroposphericFreezing',
           '146.json': 'highestTroposphericFreezing', '147.json': 'isothermal',
           '148.json': 'isothermal', '149.json': 'pressureFromGroundLayer',
           '150.json': 'pressureFromGroundLayer', '152.json': 'surface',
           '153.json': 'adiabaticCondensation', '155.json': 'pressureFromGroundLayer',
           '157.json': 'pressureFromGroundLayer', '158.json': 'equilibrium',
           '159.json': 'pressureFromGroundLayer', '162.json': 'surface',
           '163.json': 'heightAboveGroundLayer', '164.json': 'unknown',
           '165.json': 'heightAboveGroundLayer', '166.json': 'atmosphere',
           '167.json': 'surface', '168.json': 'surface'}


# Next we combine the reference files for each grib message along the `valid_time` dimension using `MultiZarrToZarr`.

# In[9]:


fs_write.mkdir(f'{json_dir}combined/')


# In[10]:


def combine(message_number):
    jsons = fs_write.glob(f'{json_dir}*message{message_number}.json')
    file_name = f'{json_dir}combined/{message_number}.json'
    if file_name.split('/')[-1] in to_drop:
        # drop the variables that clash with the x/y dimension names
        preprocess = kerchunk.combine.drop(to_drop[file_name.split('/')[-1]])
        mzz = MultiZarrToZarr(jsons, concat_dims=['valid_time'], preprocess=preprocess)
    else:
        mzz = MultiZarrToZarr(jsons, concat_dims=['valid_time'])
    mzz.translate(filename=file_name)


# And run this combine computation across all 170 grib messages (numbered 0 to 169):

# In[11]:


get_ipython().run_cell_magic('time', '', 'bag = db.from_sequence(range(170))  # one task per grib message (0-169) in an HRRR wrfsfc grib2 file\nbag_map = bag.map(combine)\n_ = bag_map.compute()\n')
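# To verify that the concatenation worked, we can open one of the combined references with `fsspec`'s "reference" filesystem (the same pattern used below to open the final groups) and inspect its `valid_time` axis. A minimal sketch:

# In[ ]:


import xarray as xr

fs_chk = fsspec.filesystem('reference', fo=f'{json_dir}combined/0.json',
                           remote_protocol='s3', remote_options={'anon': True})
ds_chk = xr.open_dataset(fs_chk.get_mapper(''), engine='zarr',
                         backend_kwargs=dict(consolidated=False))
# the forecast hours of the two scanned files should appear as one time axis
print(ds_chk.valid_time.values)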
# The next step is to group the grib messages that can be opened together as a single xarray dataset. I have again created a dictionary that contains the compatible groupings:

# In[12]:


groups = {'atmosphere': [0, 2, 3, 52, 58, 109, 110, 115, 163],
          'cloudTop': [1, 119, 120],
          'surface': [4, 8, 54, 61, 62, 63, 64, 66, 67, 68, 69, 81, 82, 83, 84, 85, 86, 87, 88,
                      89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 104, 105, 122,
                      123, 124, 125, 126, 127, 128, 149, 159, 164, 165],
          'heightAboveGround': [5, 6, 42, 56, 57, 59, 60, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80],
          'isothermal': [7, 43, 144, 145],
          'isobaricInhPa': [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
                            26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 41],
          'nominalTop': [129, 166, 167, 168, 169, 121],
          'heightAboveGroundLayer': [130, 131, 132, 133, 134, 135, 136, 137, 157, 160, 162,
                                     44, 45, 46, 47, 48, 49, 50, 51],
          'isothermZero': [138, 139, 140],
          'highestTroposphericFreezing': [141, 142, 143],
          'pressureFromGroundLayer': [146, 147, 148, 151, 152, 153, 154, 156, 37, 38],
          'adiabaticCondensation': [150],
          'equilibrium': [155],
          'depthBelowLand': [65],
          'unknown': [161, 158],
          'isobaricLayer': [103],
          'sigmaLayer': [39],
          'meanSea': [40],
          'atmosphereSingleLayer': [106, 107, 108, 55],
          'boundaryLayerCloudLayer': [111],
          'lowCloudLayer': [112],
          'middleCloudLayer': [113],
          'highCloudLayer': [114],
          'cloudBase': [117, 118],
          'cloudCeiling': [116],
          'sigma': [53]}


# For each group we create the list of combined reference files and use `MultiZarrToZarr` to merge them into a single reference file. We use `zstd` compression when saving to reduce the size of the reference files.

# In[13]:


def make_groups(group):
    jsons = [f'{json_dir}combined/{m}.json' for m in groups[group]]
    if group == 'unknown':
        # the 'unknown' variables also clash with the dimension names, so drop them here
        preprocess = kerchunk.combine.drop('unknown')
        mzz = MultiZarrToZarr(jsons, concat_dims=['valid_time'], preprocess=preprocess)
    else:
        mzz = MultiZarrToZarr(jsons, concat_dims=['valid_time'])
    mzz.translate(filename=f'{group}.json.zst', storage_options=dict(compression='zstd'))


# Run the computation:

# In[14]:


get_ipython().run_cell_magic('time', '', 'bag = db.from_sequence(list(groups))\nbag_map = bag.map(make_groups)\n_ = bag_map.compute()\n')


# # Opening the reference files as an `xarray-datatree`
# 
# A function to open the references we have just created as zarr stores, using fsspec's reference file system and xarray:

# In[15]:


import xarray as xr
from datatree import DataTree
import hvplot.xarray


# In[16]:


def return_ds(d):
    fs_ = fsspec.filesystem("reference", fo=d, remote_protocol='s3',
                            remote_options={'anon': True},
                            ref_storage_args={"compression": "zstd"})
    m = fs_.get_mapper("")
    return xr.open_dataset(m, engine="zarr", backend_kwargs=dict(consolidated=False))


# In[17]:


reference_files = fs_write.glob('*.json.zst')
datasets = {ref.split('/')[-1].split('.')[0]: return_ds(ref) for ref in reference_files}


# In[18]:


dt = DataTree.from_dict(datasets)


# Xarray-datatree provides a lot of functionality, such as performing `.sel()` across all datasets, but here we just use it as a convenient way to view all the variables available from HRRR.

# In[19]:


print(dt)


# In[20]:


ds = dt['surface'].to_dataset()
# wrap longitudes from [0, 360) to [-180, 180) for plotting
ds = ds.assign_coords(longitude=(((ds.longitude + 180) % 360) - 180))


# In[21]:


ds


# In[22]:


sel_time = ds['valid_time'][0]
ds['vis'].sel(valid_time=sel_time).hvplot.quadmesh(x='longitude', y='latitude', geo=True,
                                                   tiles='OSM', rasterize=True, cmap='viridis_r',
                                                   title=str(sel_time.values))
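# Because the HRRR grid is a Lambert conformal projection, `latitude` and `longitude` are two-dimensional coordinates and cannot be passed to `.sel` directly. As a final example, here is a minimal sketch of pulling a forecast time series at the grid point nearest an arbitrary target location; it assumes the datasets keep the usual `('valid_time', 'y', 'x')` dimension names, and the target coordinates are purely illustrative.

# In[ ]:


import numpy as np

target_lat, target_lon = 40.0, -105.3  # hypothetical point of interest
# squared distance in degrees is good enough to find the nearest grid cell
dist2 = (ds.latitude - target_lat) ** 2 + (ds.longitude - target_lon) ** 2
iy, ix = np.unravel_index(dist2.values.argmin(), dist2.shape)
ds['vis'].isel(y=iy, x=ix).hvplot(x='valid_time', title='Visibility at the nearest grid point')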