#!/usr/bin/env python
# coding: utf-8

# ## Accessing NOAA HRRR data on Azure
#
# The NOAA HRRR (https://www.nco.ncep.noaa.gov/pmb/products/hrrr/) is a
# real-time 3 km resolution, hourly updated, cloud-resolving,
# convection-allowing atmospheric model, initialized by 3 km grids with
# 3 km radar assimilation.
#
# This notebook demonstrates accessing HRRR data from blob storage on Azure:
# (1) finding the data file corresponding to a date and time, (2) retrieving
# only the portion of that file which contains the surface-temperature
# variable, (3) opening the file with the `xarray` library, and (4) rendering
# an image of the forecast.
#
# The dataset is stored in the East US Azure region, so this notebook runs
# most efficiently on Azure compute located in the same region. If you are
# using this data for environmental science applications, consider applying
# for an AI for Earth grant to support your compute requirements.
#
# Dataset documentation:
# https://github.com/microsoft/AIforEarthDataSets/blob/main/data/noaa-hrrr.md

# ### Set up the environment
#
# We're using `xarray` with the `cfgrib` engine to open the GRIB2 data into
# an xarray Dataset. `cfgrib` has some binary dependencies, so it's easiest
# to install with Conda (https://github.com/ecmwf/cfgrib#installation).

# In[1]:

import io
from datetime import date, timedelta

import cartopy.crs as ccrs
import cartopy.feature as cfeature
import cmocean
import matplotlib.pyplot as plt
import requests
import xarray as xr

# Not used directly, but used via xarray's GRIB backend
import cfgrib

# ### Assemble the URL for a single forecast model run
#
# HRRR model data are in files named by date and sector (either CONUS or
# Alaska). We'll load up yesterday's 2D surface-levels model forecast for
# CONUS, run from 12pm UTC. The URL is assembled from the path parameters
# outlined at
# https://github.com/microsoft/AIforEarthDataSets/blob/main/data/noaa-hrrr.md#storage-resources
# In[2]:

# Constants for creating the full URL
blob_container = "https://noaahrrr.blob.core.windows.net/hrrr"
sector = "conus"
yesterday = date.today() - timedelta(days=1)
cycle = 12           # model cycle: the noon (12:00 UTC) run
forecast_hour = 1    # forecast lead time, offset from cycle time
product = "wrfsfcf"  # 2D surface levels

# Put it all together, e.g. hrrr.t12z.wrfsfcf01.grib2 under the
# hrrr.YYYYMMDD/<sector>/ prefix
file_path = f"hrrr.t{cycle:02}z.{product}{forecast_hour:02}.grib2"
url = f"{blob_container}/hrrr.{yesterday:%Y%m%d}/{sector}/{file_path}"
print(url)

# ### Determine the file-subset of our variable
#
# The GRIB2 file format stacks binary representations of 2D arrays under a
# top-level header. Instead of downloading and opening the whole file of
# 170+ variables (150MB+), we can read from a byte offset of just a single
# variable (or several). The NOAA HRRR data comes with a sidecar index file
# that specifies start-byte positions for each variable; it has the same
# name as the HRRR file, suffixed with `.idx`.
#
# Let's use this to calculate the offset of the variable we want, before
# making the data request.

# In[3]:

# Fetch the idx file by appending the .idx extension to our formatted URL
r = requests.get(f"{url}.idx")
# Fail fast on HTTP errors; otherwise a 404 body (e.g. a cycle that hasn't
# been published yet) would be silently parsed as an index file below.
r.raise_for_status()
idx = r.text.splitlines()

# Take a peek at the content of the index
print(*idx[0:10], sep="\n")

# In[4]:

# Each line has a 1-indexed line number, starting byte position, date,
# variable, atmosphere level, and forecast description, colon-delimited.
# Let's grab surface temperature, `TMP:surface`.
# Find the index line for surface temperature (`TMP:surface`); raise a
# clear error rather than an opaque IndexError if it isn't present.
matches = [line for line in idx if ":TMP:surface" in line]
if not matches:
    raise ValueError("TMP:surface not found in the index file")
sfc_temp_idx = matches[0].split(":")
print("Surface temp line:", sfc_temp_idx)

# Pluck the byte offset from this line, plus the beginning offset of the
# next line
line_num = int(sfc_temp_idx[0])
range_start = sfc_temp_idx[1]

# The line number values are 1-indexed, so idx[line_num] is already the
# *next* message in the 0-indexed list -- but check we're not already
# reading the last line.
next_line = idx[line_num].split(":") if line_num < len(idx) else None

# HTTP byte ranges are inclusive at both ends, so end one byte before the
# next message starts. If this was the last message, leave the range
# open-ended ("bytes=N-") to read through the end of the file.
range_end = str(int(next_line[1]) - 1) if next_line else ""

print(f"Byte range: {range_start}-{range_end}")

# ### Request the surface temperature file-subset
#
# Now that we have the byte range of the single variable we want, we can
# make a traditional GET request to the Azure Blob Storage container, which
# supports HTTP Range headers. We have to save this file to disk since
# cfgrib does not currently support reading file-like objects
# (https://github.com/ecmwf/cfgrib/issues/99).

# In[5]:

import tempfile

# delete=False so the file survives for xr.open_dataset below
file = tempfile.NamedTemporaryFile(prefix="tmp_", delete=False)

headers = {"Range": f"bytes={range_start}-{range_end}"}
resp = requests.get(url, headers=headers)
resp.raise_for_status()

with file as f:
    f.write(resp.content)

# ### Open the GRIB2 file as an xarray dataset
#
# The downloaded file subset is a valid GRIB2 file. Open it with the
# `cfgrib` engine for `xarray` and we can do traditional `xarray`
# operations. Setting `indexpath` to an empty string tells `cfgrib` to open
# the file without an associated `.idx` file (which doesn't exist, since we
# just created this subset file).

# In[6]:

ds = xr.open_dataset(file.name, engine="cfgrib", backend_kwargs={"indexpath": ""})

# ### Check the dataset value
#
# Let's do a quick check of the distribution of temperature values across
# the country to make sure things look right.
# In[7]:

ds.t.plot.hist(edgecolor="white")
plt.show()

# ### Plot an image of the temperature forecast
#
# First, let's create a Cartopy CRS (coordinate reference system)
# specification from the attributes in the dataset. The HRRR data comes in
# the Lambert conformal projection. See
# https://github.com/blaylockbk/HRRR_archive_download/blob/4105f21ee01ad5a915d9545008fad94cf8af8213/herbie/accessors.py#L69-L78
# for more information.

# In[8]:

attrs = ds.t.attrs
# `assert` statements are stripped under `python -O`, so validate the grid
# type with an explicit raise instead.
if attrs["GRIB_gridType"] != "lambert":
    raise ValueError(
        f"Expected a Lambert conformal grid, got {attrs['GRIB_gridType']!r}"
    )

# Define the CRS with attributes from the temperature DataArray
prj_kwargs = dict(
    globe=ccrs.Globe(ellipse="sphere"),
    central_latitude=attrs["GRIB_LaDInDegrees"],
    central_longitude=attrs["GRIB_LoVInDegrees"],
    standard_parallels=(
        attrs["GRIB_Latin1InDegrees"],
        attrs["GRIB_Latin2InDegrees"],
    ),
)
prj = ccrs.LambertConformal(**prj_kwargs)

# Now we can plot the entire CONUS with some overlaid context from
# `cartopy` and a nice thermal colormap from `cmocean`.

# In[9]:

# Ignore some matplotlib deprecation warnings
import warnings
warnings.simplefilter("ignore")

fig = plt.figure(figsize=(15, 8))
ax = plt.axes(projection=prj)

plt_kwargs = dict(
    x="longitude",
    y="latitude",
    cmap=cmocean.cm.thermal,
    transform=ccrs.PlateCarree(),
)
ds.t.plot(**plt_kwargs, ax=ax)

ax.coastlines(linewidth=0.5)
ax.add_feature(cfeature.STATES.with_scale("50m"), linewidth=0.5)
plt.show()

# Lastly, let's convert Kelvin to Centigrade, and zoom in on the Southwest
# where there is the most temperature variation.
# In[10]:

# Increase the dpi a bit for some crisper text
fig = plt.figure(figsize=(15, 8), dpi=100)
ax = plt.axes(projection=prj)

# K to C; keep_attrs preserves the GRIB metadata on the derived array
with xr.set_options(keep_attrs=True):
    t_c = ds.t - 273.15
t_c.attrs["units"] = "C"

# Add some context and zoom to the SW
ax.coastlines(linewidth=0.5)
ax.add_feature(cfeature.STATES.with_scale("50m"), linewidth=0.5)
ax.set_extent([-121, -108, 33, 39], crs=ccrs.PlateCarree())

t_c.plot(**plt_kwargs, ax=ax)

# Valid time is the cycle time plus the forecast lead time
plt.title(
    f"HRRR Surface Temperatures Valid {yesterday} {cycle + forecast_hour:02}:00 UTC"
)
plt.show()

# ### Additional resources
#
# Check out the HRRR Archive Download repo
# (https://github.com/blaylockbk/HRRR_archive_download) for a great
# collection of utility functions and more documentation about accessing
# and using this dataset.