#!/usr/bin/env python
# coding: utf-8

# # Making Zarr data from NetCDF files
#
# - Funding: Interagency Implementation and Advanced Concepts Team [IMPACT](https://earthdata.nasa.gov/esds/impact) for the Earth Science Data Systems (ESDS) program and AWS Public Dataset Program
# - Software developed during [OceanHackWeek 2020](https://github.com/oceanhackweek)
#
# ### Credits: Tutorial development
# * [Dr. Chelle Gentemann](mailto:gentemann@faralloninstitute.org) - [Twitter](https://twitter.com/ChelleGentemann) - Farallon Institute
# * [Patrick Gray](mailto:patrick.c.gray@duke.edu) - [Twitter](https://twitter.com/clifgray) - Duke University
# * [Phoebe Hudson](mailto:pahdsn@outlook.com) - University of Southampton
#
# ## Why data format matters
# - NetCDF sprinkles metadata throughout files, making them slow to access and read data
# - Zarr consolidates the metadata, making them FAST for access and reading

# In[ ]:

# filter some warning messages
import warnings
warnings.filterwarnings("ignore")

# libraries
import datetime as dt
import os.path

import fsspec
import numpy as np
import pandas as pd
import s3fs
import xarray as xr
from matplotlib import pyplot as plt

# make datasets display nicely
xr.set_options(display_style="html")

# magic fncts
# put static images of your plot embedded in the notebook
get_ipython().run_line_magic('matplotlib', 'inline')
plt.rcParams['figure.figsize'] = 12, 6
get_ipython().run_line_magic('config', "InlineBackend.figure_format = 'retina'")


# In[ ]:

def get_geo_data(sat, lyr, idyjl):
    """Open one day of geostationary L2 SST data from the NOAA AWS buckets.

    Parameters
    ----------
    sat : str
        Satellite name: 'goes-east', 'goes-west', or 'himawari'.
    lyr : int
        Year.
    idyjl : int
        Day of year (julian day).

    Returns
    -------
    (ds, iexist) : (xarray.Dataset or list, bool)
        The day's files concatenated along 'time' and True, or
        ([], False) when no files exist for that day.
    """
    ds, iexist = [], False
    d = dt.datetime(lyr, 1, 1) + dt.timedelta(days=idyjl)

    fs = s3fs.S3FileSystem(anon=True)  # connect to s3 bucket!

    # create strings for the year and julian day
    imon, idym = d.month, d.day
    syr, sjdy = str(lyr).zfill(4), str(idyjl).zfill(3)
    smon, sdym = str(imon).zfill(2), str(idym).zfill(2)

    # use glob to list all the files in the directory
    # (var is the SST variable name for each product; kept for the
    # commented-out Kelvin->Celsius conversion below)
    if sat == 'goes-east':
        file_location, var = fs.glob('s3://noaa-goes16/ABI-L2-SSTF/' + syr + '/' + sjdy + '/*/*.nc'), 'SST'
    if sat == 'goes-west':
        file_location, var = fs.glob('s3://noaa-goes17/ABI-L2-SSTF/' + syr + '/' + sjdy + '/*/*.nc'), 'SST'
    if sat == 'himawari':
        file_location, var = fs.glob('s3://noaa-himawari8/AHI-L2-FLDK-SST/' + syr + '/' + smon + '/' + sdym + '/*/*L2P*.nc'), 'sea_surface_temperature'

    # no data for this day: tell the caller so it can skip it.
    # BUG FIX: the original did `return file_ob` here -- a NameError,
    # since file_ob is only defined below -- and returned a single value
    # while callers unpack `ds, iexist`.
    if len(file_location) < 1:
        return ds, iexist

    file_ob = [fs.open(file) for file in file_location]  # open connection to files

    # open all the day's data
    with xr.open_mfdataset(file_ob, combine='nested', concat_dim='time') as ds:
        iexist = True  # file exists

        # clean up coordinates which are a MESS in GOES
        # rename one of the coordinates that doesn't match a dim & should
        if not sat == 'himawari':
            ds = ds.rename({'t': 'time'})
            ds = ds.reset_coords()
        else:
            ds = ds.rename({'ni': 'x', 'nj': 'y'})

        # put in to Celsius
        # ds[var] -= 273.15  # nice python shortcut to +- from itself: a -= 273.15 is the same as a = a - 273.15
        # ds[var].attrs['units'] = '$^\circ$C'

    # NOTE(review): the source's line breaks were mangled, so whether this
    # return sat inside or just after the `with` block is ambiguous; placed
    # after it here, matching the original statement order -- confirm the
    # dataset is still readable (lazily) by the zarr-writing loop below.
    return ds, iexist


# ## Open GOES-16 (East Coast) Data
# - Careful of what you ask for....
# each day is about 3 min to access

# In[ ]:

# Loop over three weeks of data for each satellite, appending each day to a
# local zarr store named after the satellite ('goes-east', 'goes-west',
# 'himawari').  The embedded cell body's indentation was reconstructed from
# the whitespace-mangled export.
get_ipython().run_cell_magic('time', '', "lyr = 2020\n\nsatlist = ['goes-east','goes-west','himawari']\n\nfor sat in satlist:\n\n    init = 0 #reset new data store\n\n    for idyjl in range(180,201): #6/28/2020-7/18/2020\n\n        print('starting ', idyjl)\n\n        ds,iexist = get_geo_data(sat,lyr,idyjl)\n\n        if not iexist:\n            continue\n\n        print('writing zarr store')\n\n        if init == 0:\n            ds.to_zarr(sat)\n            init = 1\n        else:\n            ds.to_zarr(sat,append_dim='time')\n")


# #### Now write this to our shared AWS S3 bucket
#
# Note that in order to do this you need the aws command line tools which can be installed by running from the command line
#
# `pip install awscli`
#
# `aws s3 sync ./goes-east s3://ohw-bucket/goes_east`
#
# `aws s3 sync ./goes-west s3://ohw-bucket/goes_west`
#
# `aws s3 sync ./himawari s3://ohw-bucket/himawari`
#
# #### note that putting the ! in front of a command in jupyter send it to the terminal so you could run it here with

# In[ ]:

get_ipython().system(' pip install awscli')
# BUG FIX: ds.to_zarr(sat) above writes local stores with hyphenated names
# ('goes-east', ...), but the original synced './goes_east'/'./goes_west'
# (underscores, which don't exist) and copy-pasted './goes_west' as the
# source for the himawari key.  Local paths corrected; the S3 destinations
# keep their underscored keys because the read test below opens
# 's3://ohw-bucket/goes_east'.
get_ipython().system(' aws s3 sync ./goes-east s3://ohw-bucket/goes_east')
get_ipython().system(' aws s3 sync ./goes-west s3://ohw-bucket/goes_west')
get_ipython().system(' aws s3 sync ./himawari s3://ohw-bucket/himawari')


# ## Test reading the data

# In[ ]:

# Open the consolidated zarr store straight from S3 -- this is the fast
# access that motivated the conversion.
get_ipython().run_cell_magic('time', '', "\nfile_location = 's3://ohw-bucket/goes_east'\n\nds = xr.open_zarr(fsspec.get_mapper(file_location,anon=False))\n\nds\n")


# In[ ]: