#!/usr/bin/env python
# coding: utf-8

# ## Project Eclipse data on the Planetary Computer
#
# The [Project Eclipse Network](https://planetarycomputer.microsoft.com/dataset/eclipse) is a low-cost air quality sensing network for cities and a research project led by the Urban Innovation Group at Microsoft Research.
#
# ### Using the STAC API
#
# Project Eclipse data are distributed as a set of parquet files -- one per week. We can use the STAC API to search for the file covering a specific week.

# In[1]:

import pystac_client
import planetary_computer

catalog = pystac_client.Client.open(
    "https://planetarycomputer.microsoft.com/api/stac/v1",
    modifier=planetary_computer.sign_inplace,
)
search = catalog.search(collections=["eclipse"], datetime="2022-03-01")
items = search.item_collection()
print(f"Found {len(items)} items")
item = items[0]
item

# We'll load the parquet file with pandas.

# In[2]:

import geopandas
import pandas as pd

asset = item.assets["data"]
df = pd.read_parquet(
    asset.href, storage_options=asset.extra_fields["table:storage_options"]
)
df

# In[3]:

# Restrict to readings near Chicago, where the sensors are deployed.
df = df[(df.Longitude > -89) & (df.Longitude < -86)]
len(df)

# In[4]:

df.CalibratedO3

# In[5]:

# Hourly averages of several variables across all sensors.
ts = df.resample("h", on="ReadingDateTimeUTC")[
    ["CalibratedPM25", "Humidity", "CalibratedO3", "CalibratedNO2", "CO"]
].mean()
ts.plot(subplots=True, sharex=True, figsize=(12, 12));

# The dataset contains many observations from each sensor. We can plot the location of each sensor with geopandas by selecting just the first observation for each sensor.

# In[6]:

gdf = geopandas.GeoDataFrame(
    df, geometry=geopandas.points_from_xy(df.Longitude, df.Latitude), crs="epsg:4326"
)
gdf[["LocationName", "geometry"]].drop_duplicates(
    subset="LocationName"
).dropna().explore(marker_kwds=dict(radius=8))

# Using a [named aggregation](https://pandas.pydata.org/docs/user_guide/groupby.html#named-aggregation), we can compute a summary per sensor and plot it on a map. Hover over the markers to see the average calibrated PM2.5 per sensor.

# In[7]:

average_pm25 = geopandas.GeoDataFrame(
    gdf.groupby("LocationName").agg(
        mean_pm25=("CalibratedPM25", "mean"), geometry=("geometry", "first")
    ),
    crs="epsg:4326",
)
average_pm25.explore(
    marker_kwds=dict(radius=10),
)

# ### Reading the full dataset
#
# The STAC collection includes a `data` asset, which links to the root of the parquet dataset. This can be used to read all of the data across time. We'll use [Dask](https://docs.dask.org/) to read in the dataset.

# In[8]:

# Sign the collection-level asset to get read access to the dataset root.
eclipse = catalog.get_collection("eclipse")
asset = planetary_computer.sign(eclipse.assets["data"])

# In[9]:

import adlfs
import dask.dataframe as dd

# List the weekly parquet files under the dataset root, then read them lazily.
fs = adlfs.AzureBlobFileSystem(**asset.extra_fields["table:storage_options"])
files = [f"az://{x}" for x in fs.ls(asset.href)]

ddf = dd.read_parquet(
    files, storage_options=asset.extra_fields["table:storage_options"]
)
ddf
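# As a final sketch, we can aggregate over the full dataset with Dask. The column names below are taken from the weekly file we inspected above; the computation is built lazily and nothing is read until `.compute()` is called, so reading every weekly file may take a while (a `dask.distributed` cluster would speed this up).

# In[10]:

# A minimal, hypothetical example: average calibrated PM2.5 per sensor
# across all weeks. No data is read from Blob Storage until .compute() runs.
ddf.groupby("LocationName").CalibratedPM25.mean().compute()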