#!/usr/bin/env python
# coding: utf-8

# In[1]:

import os
import shutil

import pandas as pd
import geopandas as gpd


# In[2]:

# Fire perimeter datasets for the US can be found via GeoMAC at
# https://rmgsc.cr.usgs.gov/outgoing/GeoMAC/current_year_fire_data/current_year_all_states/
# Fire perimeters for Canada can be found at
# https://catalogue.data.gov.bc.ca/dataset/fire-perimeters-current


# In[3]:

# Define which state/province we are downloading L8 data for
state = 'WY'

# Define the filepaths where the shapefile data resides
CA = r'D:\data\FirePerimeters\2018_2019_Canada_perimeters.shp'
US = r'D:\data\FirePerimeters\perimeters_dd83.shp'
stateboundaries = os.path.join(r'D:\data\boundaries', state + '.shp')
wrsfile = r'D:\data\l8\wrs2_descending.shp'

# Define where the resultant L8 scenes and metadata files will go
l8out = r'D:\data\imagery'
sceneinfo = r'D:\data\l8'


# In[4]:

if state == 'BC' or state == 'AB':
    country = 'Canada'
    # Uses perimeter data from http://cwfis.cfs.nrcan.gc.ca/downloads/hotspots/ (merged 2018/2019)
    fire = gpd.GeoDataFrame.from_file(CA)
else:
    country = 'US'
    fire = gpd.GeoDataFrame.from_file(US)


# In[5]:

# Read the state boundary shapefile and the WRS path/row shapefile
# State files need to be in the same projection as the WRS file
bounds = gpd.read_file(stateboundaries)
wrs = gpd.GeoDataFrame.from_file(wrsfile)

# Select the Landsat path/rows that intersect with the state of interest
wrs_intersection = wrs[wrs.intersects(bounds.geometry[0])]


# In[ ]:

# Select the fires that intersect to later determine the needed imagery date
fires = gpd.sjoin(fire, wrs, how='inner', op='within')

# Sort the dataframe by most recent date; change the date format to match AWS's Landsat metadata date format
if country == 'Canada':
    fires['enddate'] = fires['LASTDATE'] + '.000000'
else:
    fires['enddate'] = fires['DATECRNT'] + ' 00:00:00.000000'

# Empty GeoDataFrame for the most recent fire perimeter date
recent_fire = gpd.GeoDataFrame()

# Select just the fires in the state; make lowercase strings for consistent matching of fire names
if country == 'US':
    fires = fires.loc[fires.STATE == state]
    fires.FIRENAME = fires.FIRENAME.str.lower()
    fires = fires[['FIRENAME', 'PATH', 'ROW', 'enddate']]
else:
    print('Skipping firename')
    fires = fires[['PATH', 'ROW', 'enddate']]

fires['PR'] = fires['PATH'].astype(str) + ' ' + fires['ROW'].astype(str)

# For each fire, pick the latest perimeter date
if country == 'US':
    for firename in fires['FIRENAME'].unique():
        rec_fire = fires.loc[fires.FIRENAME == firename]
        rec_fire = rec_fire.sort_values('enddate').tail(1)
        recent_fire = pd.concat([recent_fire, rec_fire])
else:
    rec_fire = fires
    recent_fire = pd.concat([recent_fire, rec_fire])

# Then find the latest fire date for each path/row
pr_date = gpd.GeoDataFrame()
for pr in recent_fire['PR'].unique():
    prdate = recent_fire.loc[recent_fire.PR == pr]
    prdate = prdate.sort_values('enddate').tail(1)
    pr_date = pd.concat([pr_date, prdate])
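
# In[ ]:

# OPTIONAL (not part of the original workflow): the two loops above can also be written with
# groupby, which can be easier to read for large fire datasets. This is a minimal sketch that
# assumes the 'FIRENAME', 'PR', and 'enddate' columns built above; the *_alt names are
# illustrative only and are not used downstream.
if country == 'US':
    # latest perimeter date per fire name
    recent_fire_alt = fires.sort_values('enddate').groupby('FIRENAME').tail(1)
else:
    recent_fire_alt = fires
# latest fire date per path/row
pr_date_alt = recent_fire_alt.sort_values('enddate').groupby('PR').tail(1)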
# In[7]:

# OPTIONAL: view a folium map of the selected path/rows to visualize coverage
import folium
import numpy as np

xy = np.asarray(bounds.centroid[0].xy).squeeze()
center = list(xy[::-1])
zoom = 6

m = folium.Map(location=center, zoom_start=zoom, control_scale=True)
m.add_child(folium.GeoJson(bounds.__geo_interface__, name='Path/Row Coverage',
                           style_function=lambda x: {'color': 'red', 'alpha': 0}))

for i, row in wrs_intersection.iterrows():
    # Create a string for the name containing the path and row of this polygon
    name = 'path: %03d, row: %03d' % (row.PATH, row.ROW)
    # Create the folium geometry of this polygon
    g = folium.GeoJson(row.geometry.__geo_interface__, name=name)
    # Add a folium Popup object with the name string
    g.add_child(folium.Popup(name))
    # Add the object to the map
    g.add_to(m)

folium.LayerControl().add_to(m)
m


# In[8]:

paths, rows = wrs_intersection['PATH'].values, wrs_intersection['ROW'].values


# In[9]:

# Count how many path/rows there are to download imagery for
count_images = 0
for path, row in zip(paths, rows):
    count_images = count_images + 1
print(str(count_images) + ' scenes')


# In[10]:

# Read the AWS metadata csv for L8 into a dataframe. This is the data we will use to select
# scenes matching our requirements.
s3_scenes = pd.read_csv('http://landsat-pds.s3.amazonaws.com/c1/L8/scene_list.gz',
                        compression='gzip')
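
# In[ ]:

# OPTIONAL (not in the original notebook): a quick sanity check of the scene list before the
# selection loop below. Only the columns the loop and download step actually rely on are shown;
# the column names are taken from the code that follows.
print(s3_scenes[['productId', 'entityId', 'acquisitionDate', 'cloudCover', 'path', 'row']].head())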
# In[11]:

# Bulk download list
bulk_list = []
not_found = []
n = 0

# Find scenes for each path/row
for path, row in zip(paths, rows):
    n = n + 1
    print(n)

    # Define the thresholds for date range and cloud cover:
    datelowest = '2019-06-01 00:00:00.000000'
    datehigh = '2019-09-30 00:00:00.000000'
    cloudcover = 10

    print('Path: ' + str(path) + ' Row: ' + str(row))

    # Check if the path/row has a recent fire; if so, use the fire's end date as the lower bound for the L8 scene search
    pr = str(path) + ' ' + str(row)
    prloc = pr_date.loc[pr_date.PR == pr]
    print(prloc.head())
    if prloc.shape[0] != 1:
        datelow = datelowest
        print('No fire - daterange unchanged ' + datelow[:-15] + 'to ' + datehigh[:-16])
    elif prloc['enddate'].values[0] > datelowest:
        # Ensure the most recent year's imagery is used, if using more than 1 fire year
        datelow = prloc['enddate'].values[0]
        print('Fire occurred - new scene daterange ' + datelow[:-15] + 'to ' + datehigh[:-16])
    else:
        datelow = datelowest
        print('Fires present from previous year, using current year imagery ' + datelow[:-15] + 'to ' + datehigh[:-16])
    if datelow == '':
        datelow = datelowest

    # Filter the Landsat Amazon S3 table for images matching the path/row and cloudcover parameters.
    # Ideally, imagery will be <10% scene cloud cover. The loop below raises the threshold in
    # increments of 10% cover until scenes are found or a 90% threshold has been tried. Change the
    # threshold requirements as needed. Currently there is no way to look at cloud cover within the
    # fire perimeter/AOI before download - but this method (looking at total scene cover) should be
    # adequate for most purposes.
    tries = 10
    while tries >= 10 and tries <= 90:
        if tries > 10:
            ntries = tries // 10
            cloudcover = tries
            print('Try #' + str(ntries) + ': ' + str(cloudcover) + '% cloudcover threshold')
        scenes = s3_scenes[(s3_scenes.path == path) &
                           (s3_scenes.row == row) &
                           (s3_scenes.cloudCover <= cloudcover) &
                           (s3_scenes.acquisitionDate >= datelow) &
                           (s3_scenes.acquisitionDate <= datehigh) &
                           # We don't want any tier 2 / uncorrected data
                           (~s3_scenes.productId.str.contains('_T2')) &
                           (~s3_scenes.productId.str.contains('_RT'))]
        print('Found {} images\n'.format(len(scenes)))
        if len(scenes) == 0:
            tries = tries + 10
            print('Retry with higher cloudcover threshold:')
        else:
            tries = 100

    # Select the scenes that meet the date and cloud cover criteria
    if len(scenes) > 0:
        # Select a scene in the middle of the date range if possible - for my purposes, full-leaf imagery is ideal
        sc = len(scenes)
        sd = sc // 2
        sl = sc - sd
        if sd > 2 and sl < 2:
            sl = -1
        else:
            sl = sl * -1
        # Pick the middle-date scene
        scene = scenes.sort_values('acquisitionDate').iloc[sl]
        # Add the selected scene to the bulk download list.
        bulk_list.append(scene)
    else:
        # If no scenes are found even after raising the cloudcover threshold, add the path/row to a list (find manually)
        print('No scenes were selected for this path/row')
        nf = str(path) + ',' + str(row)
        not_found.append(nf)


# In[12]:

# Concatenate the scene info into two lists: scenes that have no match, and scenes we want to download.
bulk_frame = pd.concat(bulk_list, axis=1).T
nf_frame = pd.DataFrame(not_found)
nf_frame.to_csv(os.path.join(sceneinfo, state + 'scenes_missing.txt'), sep='\t', index=False, header=False)
bulk_frame.head(10)


# In[13]:

# Option 1 - get the scene list to upload to earthexplorer.usgs.gov/filelist
bulklist = bulk_frame[['entityId']]
bulklist.to_csv(os.path.join(sceneinfo, state + 'pathrowlist.txt'), sep='\t', index=False, header=False)
bulk_frame.to_csv(os.path.join(sceneinfo, state + 'frame.txt'), sep='\t', index=False)
# In[ ]:

# Option 2 - download the data directly
import requests
from bs4 import BeautifulSoup

LANDSAT_PATH = os.path.join(l8out, state, 'l8imagery')

# For each scene in the bulk download list
for i, row in bulk_frame.iterrows():

    entity_dir = os.path.join(LANDSAT_PATH, row.productId)

    # Skip the scene if it has already been downloaded - check and re-download any files that may be
    # corrupted if the download was interrupted
    if os.path.isdir(entity_dir):
        print('Skipping ' + entity_dir + ' as it already exists')
    else:
        # Print the product ID
        print('\n', 'EntityId:', row.productId, '\n')

        # Request the html text of the download_url from the Amazon server.
        response = requests.get(row.download_url)

        # If the response status code is fine (200)
        if response.status_code == 200:

            # Import the html to BeautifulSoup
            html = BeautifulSoup(response.content, 'html.parser')

            # Create the dir where we will put this image's files.
            if not os.path.exists(entity_dir):
                os.makedirs(entity_dir)

            # Second loop: for each band of this image that we find using the html <li> tag
            for li in html.find_all('li'):

                # Get the href tag
                file = li.find_next('a').get('href')
                filestring = str(file)
                filen = os.path.join(entity_dir, filestring)

                # Only download the .TIF and metadata files; other formats (.IMD) aren't necessary for what I need
                if filestring.endswith('.TIF') or filestring.endswith('_MTL.txt') or filestring.endswith('_ANG.txt'):
                    if not os.path.isfile(filen):  # skip anything already downloaded
                        print('  Downloading: {}'.format(file))

                        # Download the file
                        response = requests.get(row.download_url.replace('index.html', file), stream=True)
                        with open(os.path.join(entity_dir, file), 'wb') as output:
                            shutil.copyfileobj(response.raw, output)
                        del response
                    else:
                        print(filestring + ' exists')
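

# In[ ]:

# OPTIONAL (not part of the original notebook): a minimal sketch that checks what actually
# landed on disk, useful when a download was interrupted and you want to know which product
# folders to delete and re-run. It only assumes the LANDSAT_PATH layout created above
# (one folder per productId containing .TIF band files plus the _MTL.txt/_ANG.txt metadata).
import glob

if os.path.isdir(LANDSAT_PATH):
    for product in sorted(os.listdir(LANDSAT_PATH)):
        product_dir = os.path.join(LANDSAT_PATH, product)
        if os.path.isdir(product_dir):
            tifs = glob.glob(os.path.join(product_dir, '*.TIF'))
            mtl = glob.glob(os.path.join(product_dir, '*_MTL.txt'))
            print('{}: {} TIF bands, MTL present: {}'.format(product, len(tifs), bool(mtl)))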