import pandas as pd
import geopandas as gpd
import os, shutil
# Fire perimeter datasets for the US can be found via GeoMAC at
# https://rmgsc.cr.usgs.gov/outgoing/GeoMAC/current_year_fire_data/current_year_all_states/
# Fire perimeters for Canada can be found at
# https://catalogue.data.gov.bc.ca/dataset/fire-perimeters-current
# Define which state/province we are downloading L8 data for
state = 'WY'
# Define the filepaths where the shapefile data resides
CA = r'D:\data\FirePerimeters\2018_2019_Canada_perimeters.shp'
US = r'D:\data\FirePerimeters\perimeters_dd83.shp'
stateboundaries = (os.path.join(r'D:\data\boundaries',state + '.shp'))
wrsfile = r'D:\data\l8\wrs2_descending.shp'
# Define where the resultant l8 scenes and metadata files will go
l8out = r'D:\data\imagery'
sceneinfo = r'D:\data\l8'
if state == 'BC' or state == 'AB':
    country = 'Canada'
    fire = gpd.GeoDataFrame.from_file(CA)
    # Uses estimated perimeter data from http://cwfis.cfs.nrcan.gc.ca/downloads/hotspots/ (merged 2018/2019)
else:
    country = 'US'
    fire = gpd.GeoDataFrame.from_file(US)
# Read the state boundary shapefile and the wrs path/row shapefile
# State files need to be the same projection as the WRS file
bounds = gpd.read_file(stateboundaries)
wrs = gpd.GeoDataFrame.from_file(wrsfile)
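# The intersects() test below silently misbehaves if the state boundary is in a different
# CRS than the WRS-2 grid. A minimal guard, assuming both files carry a valid .crs:
if bounds.crs != wrs.crs:
    bounds = bounds.to_crs(wrs.crs)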
# Select the Landsat path/rows that intersect with the state of interest
wrs_intersection = wrs[wrs.intersects(bounds.geometry[0])]
# Spatially join the fires to the WRS grid (keeping fires wholly within a path/row) to later determine the needed imagery date
fires = gpd.sjoin(fire, wrs, how='inner', op='within')
# Change the date format to match AWS's Landsat metadata date format (the dataframe is sorted by date later)
if country == 'Canada':
    fires['enddate'] = fires['LASTDATE'] + '.000000'
else:
    fires['enddate'] = fires['DATECRNT'] + ' 00:00:00.000000'
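# Note: keeping enddate in zero-padded 'YYYY-MM-DD HH:MM:SS.ffffff' form means plain
# string comparison sorts chronologically, which the scene search below relies on.
# A quick illustration of the property:
assert '2019-08-17 00:00:00.000000' > '2019-06-01 00:00:00.000000'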
# empty gdf for most recent fire perimeter date
recent_fire = gpd.GeoDataFrame()
# select just fires in the state, make lowercase strings for consistent matching of fire names
if country == 'US':
    fires = fires.loc[fires.STATE == state]
    fires.FIRENAME = fires.FIRENAME.str.lower()
    fires = fires[['FIRENAME', 'PATH', 'ROW', 'enddate']]
else:
    print 'Skipping firename'
    fires = fires[['PATH', 'ROW', 'enddate']]
fires['PR'] = fires['PATH'].astype(str)+' '+fires['ROW'].astype(str)
# for each fire, pick the latest date
if country == 'US':
    for firename in fires['FIRENAME'].unique():
        rec_fire = fires.loc[fires.FIRENAME == firename]
        # sort_values returns a copy, so assign it back before taking the last (latest) row
        rec_fire = rec_fire.sort_values('enddate').tail(1)
        recent_fire = recent_fire.append(rec_fire)
else:
    rec_fire = fires
    recent_fire = recent_fire.append(rec_fire)
# then find the latest fire date for the path/row
pr_date = gpd.GeoDataFrame()
for pr in recent_fire['PR'].unique():
    prdate = recent_fire.loc[recent_fire.PR == pr]
    prdate = prdate.sort_values('enddate').tail(1)
    pr_date = pr_date.append(prdate)
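# Equivalent one-liner sketch: sort once, then keep the last (latest-enddate) row per
# path/row group. This recomputes pr_date and should match the loop above.
pr_date = recent_fire.sort_values('enddate').groupby('PR').tail(1)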
# OPTIONAL: view folium map of the path/rows selected to visualize coverage
import folium
import numpy as np
xy = np.asarray(bounds.centroid[0].xy).squeeze()
center = list(xy[::-1])
zoom = 6
m = folium.Map(location=center, zoom_start=zoom, control_scale=True)
m.add_child(folium.GeoJson(bounds.__geo_interface__, name='Path/Row Coverage',
                           style_function=lambda x: {'color': 'red', 'fillOpacity': 0}))
for i, row in wrs_intersection.iterrows():
    # Create a string for the name containing the path and row of this Polygon
    name = 'path: %03d, row: %03d' % (row.PATH, row.ROW)
    # Create the folium geometry of this Polygon
    g = folium.GeoJson(row.geometry.__geo_interface__, name=name)
    # Add a folium Popup object with the name string
    g.add_child(folium.Popup(name))
    # Add the object to the map
    g.add_to(m)
folium.LayerControl().add_to(m)
m
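# If running as a plain script rather than in a notebook, the bare `m` above won't display
# anything; folium maps can instead be saved to HTML and opened in a browser. The output
# path here is an assumption:
m.save(os.path.join(sceneinfo, state + '_pathrow_coverage.html'))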
paths, rows = wrs_intersection['PATH'].values, wrs_intersection['ROW'].values
# Count how many path/row combinations there are to download imagery for
count_images = len(paths)
print str(count_images) + ' scenes'
24 scenes
# Read AWS metadata csv for l8 into a dataframe. This is the data we will use to select scenes matching our requirements.
s3_scenes = pd.read_csv('http://landsat-pds.s3.amazonaws.com/c1/L8/scene_list.gz', compression='gzip')
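# Quick peek at the metadata schema before filtering; the columns used below
# (productId, entityId, acquisitionDate, cloudCover, path, row, download_url)
# should all be present:
print s3_scenes.columns.tolist()
print str(len(s3_scenes)) + ' scene records'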
# bulk download list
bulk_list = []
not_found = []
n = 0
# Find scenes for each path/row
for path, row in zip(paths, rows):
    n = n + 1
    print n
    ## Define the thresholds for date range and cloud cover:
    datelowest = '2019-06-01 00:00:00.000000'
    datehigh = '2019-09-30 00:00:00.000000'
    cloudcover = 10
    print 'Path: ' + str(path) + ' Row: ' + str(row)
    # Check if the path/row has a recent fire; if so, use the fire's end date as the datelow for the L8 scene search
    pr = str(path) + ' ' + str(row)
    prloc = pr_date.loc[pr_date.PR == pr]
    print prloc.head()
    if prloc.shape[0] != 1:
        datelow = datelowest
        print 'No fire - daterange unchanged ' + datelow[:-15] + 'to ' + datehigh[:-16]
    elif prloc['enddate'].values[0] > datelowest:  # Ensure the most recent year's imagery is used, if using more than one fire year
        datelow = prloc['enddate'].values[0]
        print 'Fire occurred - new scene daterange ' + datelow[:-15] + 'to ' + datehigh[:-16]
    else:
        datelow = datelowest
        print 'Fires present from previous year, using current year imagery ' + datelow[:-15] + 'to ' + datehigh[:-16]
    if datelow == '':
        datelow = datelowest
    # Filter the Landsat Amazon S3 table for images matching the path/row and cloudcover parameters.
    tries = 10
    # Ideally, imagery will be <10% scene cloud cover. The code below loops through the imagery in increments
    # of 10% cover until a 100% threshold is reached. Change the threshold requirements as needed.
    # Currently there is no way to look at cloud cover within the fire perimeter/AOI before download, but this
    # method (looking at total scene cover) should be adequate for most purposes.
    while tries >= 10 and tries <= 90:
        if tries > 10:
            ntries = tries / 10
            cloudcover = tries
            print 'Try #' + str(ntries) + ': ' + str(cloudcover) + '% cloudcover threshold'
        scenes = s3_scenes[(s3_scenes.path == path) & (s3_scenes.row == row) &
                           (s3_scenes.cloudCover <= cloudcover) &
                           (s3_scenes.acquisitionDate >= datelow) &
                           (s3_scenes.acquisitionDate <= datehigh) &
                           # We don't want any Tier 2 or real-time (uncorrected) data
                           (~s3_scenes.productId.str.contains('_T2')) &
                           (~s3_scenes.productId.str.contains('_RT'))]
        print 'Found {} images\n'.format(len(scenes))
        if len(scenes) == 0:
            tries = tries + 10
            print 'Retry with higher cloudcover threshold:'
        else:
            tries = 100
    # Select the scene that meets the date and cloud cover criteria
    if len(scenes) > 0:
        # select a scene from the middle of the date range if possible - for my purposes, full-leaf imagery is ideal
        sc = len(scenes)
        sd = sc / 2
        sl = sc - sd
        if sd > 2 and sl < 2:
            sl = -1
        else:
            sl = sl * -1
        # pick the middle-date scene
        scene = scenes.sort_values('acquisitionDate').iloc[sl]
        # Add the selected scene to the bulk download list.
        bulk_list.append(scene)
    else:
        # if no scenes are found even after raising the cloudcover threshold, record the path/row so it can be found manually
        print 'No scenes were selected for this path/row'
        nf = str(path) + ',' + str(row)
        not_found.append(nf)
1  Path: 36 Row: 29
      FIRENAME  PATH  ROW                     enddate     PR
2103  valley 2    36   29  2019-08-17 00:00:00.000000  36 29
Fire occurred - new scene daterange 2019-08-17 to 2019-09-30
Found 1 images

2  Path: 36 Row: 30
     FIRENAME  PATH  ROW                     enddate     PR
2328  sawmill    36   30  2019-08-13 00:00:00.000000  36 30
Fire occurred - new scene daterange 2019-08-13 to 2019-09-30
Found 2 images

3  Path: 36 Row: 31
Empty DataFrame
Columns: [FIRENAME, PATH, ROW, enddate, PR]
Index: []
No fire - daterange unchanged 2019-06-01 to 2019-09-30
Found 5 images

4  Path: 36 Row: 32
Empty DataFrame
Columns: [FIRENAME, PATH, ROW, enddate, PR]
Index: []
No fire - daterange unchanged 2019-06-01 to 2019-09-30
Found 5 images

5  Path: 34 Row: 29
              FIRENAME  PATH  ROW                     enddate     PR
1306  prairie dog draw    34   29  2019-09-10 00:00:00.000000  34 29
Fire occurred - new scene daterange 2019-09-10 to 2019-09-30
Found 1 images

6  Path: 34 Row: 30
           FIRENAME  PATH  ROW                     enddate     PR
1784  spring canyon    34   30  2019-08-26 00:00:00.000000  34 30
Fire occurred - new scene daterange 2019-08-26 to 2019-09-30
Found 2 images

7  Path: 34 Row: 31
         FIRENAME  PATH  ROW                     enddate     PR
2304  ashenfelder    34   31  2019-09-06 00:00:00.000000  34 31
Fire occurred - new scene daterange 2019-09-06 to 2019-09-30
Found 1 images

8  Path: 34 Row: 32
Empty DataFrame
Columns: [FIRENAME, PATH, ROW, enddate, PR]
Index: []
No fire - daterange unchanged 2019-06-01 to 2019-09-30
Found 1 images

9  Path: 39 Row: 29
Empty DataFrame
Columns: [FIRENAME, PATH, ROW, enddate, PR]
Index: []
No fire - daterange unchanged 2019-06-01 to 2019-09-30
Found 0 images
Retry with higher cloudcover threshold:
Try #2: 20% cloudcover threshold
Found 2 images

10  Path: 37 Row: 29
       FIRENAME  PATH  ROW                     enddate     PR
2871  brimstone    37   29  2019-09-17 00:00:00.000000  37 29
Fire occurred - new scene daterange 2019-09-17 to 2019-09-30
Found 0 images
Retry with higher cloudcover threshold:
Try #2: 20% cloudcover threshold
Found 1 images

11  Path: 37 Row: 30
         FIRENAME  PATH  ROW                     enddate     PR
2459  bomber lake    37   30  2019-08-29 00:00:00.000000  37 30
Fire occurred - new scene daterange 2019-08-29 to 2019-09-30
Found 0 images
Retry with higher cloudcover threshold:
Try #2: 20% cloudcover threshold
Found 2 images

12  Path: 37 Row: 31
     FIRENAME  PATH  ROW                     enddate     PR
2280  currant    37   31  2019-09-03 00:00:00.000000  37 31
Fire occurred - new scene daterange 2019-09-03 to 2019-09-30
Found 2 images

13  Path: 37 Row: 32
Empty DataFrame
Columns: [FIRENAME, PATH, ROW, enddate, PR]
Index: []
No fire - daterange unchanged 2019-06-01 to 2019-09-30
Found 5 images

14  Path: 35 Row: 29
                 FIRENAME  PATH  ROW                     enddate     PR
1299  fortification creek    35   29  2019-08-22 00:00:00.000000  35 29
Fire occurred - new scene daterange 2019-08-22 to 2019-09-30
Found 1 images

15  Path: 35 Row: 30
     FIRENAME  PATH  ROW                     enddate     PR
1849  whipsaw    35   30  2019-08-19 00:00:00.000000  35 30
Fire occurred - new scene daterange 2019-08-19 to 2019-09-30
Found 2 images

16  Path: 35 Row: 31
            FIRENAME  PATH  ROW                     enddate     PR
2850  pedro mountain    35   31  2019-08-28 00:00:00.000000  35 31
Fire occurred - new scene daterange 2019-08-28 to 2019-09-30
Found 0 images
Retry with higher cloudcover threshold:
Try #2: 20% cloudcover threshold
Found 2 images

17  Path: 35 Row: 32
Empty DataFrame
Columns: [FIRENAME, PATH, ROW, enddate, PR]
Index: []
No fire - daterange unchanged 2019-06-01 to 2019-09-30
Found 2 images

18  Path: 33 Row: 29
Empty DataFrame
Columns: [FIRENAME, PATH, ROW, enddate, PR]
Index: []
No fire - daterange unchanged 2019-06-01 to 2019-09-30
Found 1 images

19  Path: 33 Row: 30
Empty DataFrame
Columns: [FIRENAME, PATH, ROW, enddate, PR]
Index: []
No fire - daterange unchanged 2019-06-01 to 2019-09-30
Found 2 images

20  Path: 33 Row: 31
      FIRENAME  PATH  ROW                     enddate     PR
1428  finkbine    33   31  2019-07-24 00:00:00.000000  33 31
Fire occurred - new scene daterange 2019-07-24 to 2019-09-30
Found 0 images
Retry with higher cloudcover threshold:
Try #2: 20% cloudcover threshold
Found 1 images

21  Path: 33 Row: 32
Empty DataFrame
Columns: [FIRENAME, PATH, ROW, enddate, PR]
Index: []
No fire - daterange unchanged 2019-06-01 to 2019-09-30
Found 3 images

22  Path: 38 Row: 29
       FIRENAME  PATH  ROW                     enddate     PR
1819  box creek    38   29  2019-09-11 00:00:00.000000  38 29
Fire occurred - new scene daterange 2019-09-11 to 2019-09-30
Found 0 images
Retry with higher cloudcover threshold:
Try #2: 20% cloudcover threshold
Found 1 images

23  Path: 38 Row: 30
          FIRENAME  PATH  ROW                     enddate     PR
1587  saddle butte    38   30  2019-09-03 00:00:00.000000  38 30
Fire occurred - new scene daterange 2019-09-03 to 2019-09-30
Found 0 images
Retry with higher cloudcover threshold:
Try #2: 20% cloudcover threshold
Found 1 images

24  Path: 38 Row: 31
Empty DataFrame
Columns: [FIRENAME, PATH, ROW, enddate, PR]
Index: []
No fire - daterange unchanged 2019-06-01 to 2019-09-30
Found 4 images
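# Aside: an alternative to the stepped cloudcover retries above is to query once with no
# cloud ceiling and keep the least-cloudy scene in the date range. A sketch of that
# selection (not used for the run above; it reuses the last loop iteration's path/row/datelow):
alt = s3_scenes[(s3_scenes.path == path) & (s3_scenes.row == row) &
                (s3_scenes.acquisitionDate >= datelow) &
                (s3_scenes.acquisitionDate <= datehigh) &
                (~s3_scenes.productId.str.contains('_T2')) &
                (~s3_scenes.productId.str.contains('_RT'))]
alt = alt.sort_values('cloudCover').head(1)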
# Concatenate the scene info into two frames: scenes that have no match, and scenes we want to download.
bulk_frame = pd.concat(bulk_list, axis=1).T
nf_frame = pd.DataFrame(not_found)
nf_frame.to_csv(os.path.join(sceneinfo, state + 'scenes_missing.txt'), sep='\t', index=False, header=False)
bulk_frame.head(10)
 | productId | entityId | acquisitionDate | cloudCover | processingLevel | path | row | min_lat | min_lon | max_lat | max_lon | download_url
---|---|---|---|---|---|---|---|---|---|---|---|---
1698771 | LC08_L1TP_036029_20190831_20190916_01_T1 | LC80360292019243LGN00 | 2019-08-31 17:54:50.649040 | 1.11 | L1TP | 36 | 29 | 43.4775 | -108.983 | 45.6465 | -105.894 | https://s3-us-west-2.amazonaws.com/landsat-pds...
1698381 | LC08_L1TP_036030_20190831_20190916_01_T1 | LC80360302019243LGN00 | 2019-08-31 17:55:14.535844 | 0.52 | L1TP | 36 | 30 | 42.0485 | -109.453 | 44.2195 | -106.418 | https://s3-us-west-2.amazonaws.com/landsat-pds...
1641340 | LC08_L1TP_036031_20190714_20190720_01_T1 | LC80360312019195LGN00 | 2019-07-14 17:55:23.158334 | 1.07 | L1TP | 36 | 31 | 40.6706 | -109.78 | 42.8245 | -107.077 | https://s3-us-west-2.amazonaws.com/landsat-pds...
1641341 | LC08_L1TP_036032_20190714_20190720_01_T1 | LC80360322019195LGN00 | 2019-07-14 17:55:47.045136 | 2.04 | L1TP | 36 | 32 | 39.2485 | -110.209 | 41.3938 | -107.557 | https://s3-us-west-2.amazonaws.com/landsat-pds...
1715327 | LC08_L1TP_034029_20190918_20190926_01_T1 | LC80340292019261LGN00 | 2019-09-18 17:42:34.829412 | 0 | L1TP | 34 | 29 | 43.5227 | -105.811 | 45.6673 | -102.901 | https://s3-us-west-2.amazonaws.com/landsat-pds...
1715084 | LC08_L1TP_034030_20190918_20190926_01_T1 | LC80340302019261LGN00 | 2019-09-18 17:42:58.716216 | 0.01 | L1TP | 34 | 30 | 42.1046 | -106.284 | 44.2397 | -103.42 | https://s3-us-west-2.amazonaws.com/landsat-pds...
1715409 | LC08_L1TP_034031_20190918_20190926_01_T1 | LC80340312019261LGN00 | 2019-09-18 17:43:22.611492 | 5.09 | L1TP | 34 | 31 | 40.6765 | -106.738 | 42.8102 | -103.92 | https://s3-us-west-2.amazonaws.com/landsat-pds...
1697556 | LC08_L1TP_034032_20190902_20190916_01_T1 | LC80340322019245LGN00 | 2019-09-02 17:43:40.977337 | 7.84 | L1TP | 34 | 32 | 39.2426 | -107.169 | 41.379 | -104.394 | https://s3-us-west-2.amazonaws.com/landsat-pds...
1701659 | LC08_L1TP_039029_20190905_20190917_01_T1 | LC80390292019248LGN00 | 2019-09-05 18:13:23.833895 | 14.55 | L1TP | 39 | 29 | 43.5041 | -113.579 | 45.6572 | -110.568 | https://s3-us-west-2.amazonaws.com/landsat-pds...
1712605 | LC08_L1TP_037029_20190923_20190926_01_T1 | LC80370292019266LGN00 | 2019-09-23 18:01:08.866094 | 14.89 | L1TP | 37 | 29 | 43.5178 | -110.43 | 45.674 | -107.58 | https://s3-us-west-2.amazonaws.com/landsat-pds...
# Option 1 - get the scene list to upload to earthexplorer.usgs.gov/filelist
bulklist = bulk_frame[['entityId']]
bulklist.to_csv((os.path.join(sceneinfo, state + 'pathrowlist.txt')),sep='\t', index=False, header=False)
bulk_frame.to_csv((os.path.join(sceneinfo, state + 'frame.txt')),sep='\t', index=False)
# Option 2 - download the data directly
import requests
from bs4 import BeautifulSoup
LANDSAT_PATH = os.path.join(l8out, state, 'l8imagery')
# For each row
for i, row in bulk_frame.iterrows():
    entity_dir = os.path.join(LANDSAT_PATH, row.productId)
    # Skip the scene if it has already been downloaded - check and re-download any files that
    # may be corrupted if a download was interrupted
    if os.path.isdir(entity_dir):
        print 'Skipping ' + entity_dir + ' as it already exists'
    else:
        # Print the product ID
        print '\n', 'EntityId:', row.productId, '\n'
        # Request the html text of the download_url from the Amazon server.
        response = requests.get(row.download_url)
        # If the response status code is fine (200)
        if response.status_code == 200:
            # Import the html to BeautifulSoup
            html = BeautifulSoup(response.content, 'html.parser')
            # Create the dir where we will put this image's files.
            if not os.path.exists(entity_dir):
                os.makedirs(entity_dir)
            # Second loop: for each band of this image that we find using the html <li> tag
            for li in html.find_all('li'):
                # Get the href tag
                file = li.find_next('a').get('href')
                filestring = str(file)
                filen = os.path.join(entity_dir, filestring)
                # only download the .TIF and metadata files; other formats (.IMD) aren't necessary for what I need
                if filestring[-4:] == '.TIF' or filestring[-8:] == '_MTL.txt' or filestring[-8:] == '_ANG.txt':
                    if not os.path.isfile(filen):  # skip anything already downloaded
                        print ' Downloading: {}'.format(file)
                        # Download the files
                        response = requests.get(row.download_url.replace('index.html', file), stream=True)
                        with open(filen, 'wb') as output:
                            shutil.copyfileobj(response.raw, output)
                        del response
                    else:
                        print filestring + ' exists'
Skipping D:\data\imagery\WY\l8imagery\LC08_L1TP_036029_20190831_20190916_01_T1 as it already exists
Skipping D:\data\imagery\WY\l8imagery\LC08_L1TP_036030_20190831_20190916_01_T1 as it already exists
Skipping D:\data\imagery\WY\l8imagery\LC08_L1TP_036031_20190714_20190720_01_T1 as it already exists

EntityId: LC08_L1TP_036032_20190714_20190720_01_T1

 Downloading: LC08_L1TP_036032_20190714_20190720_01_T1_B6.TIF
 Downloading: LC08_L1TP_036032_20190714_20190720_01_T1_B8.TIF
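# Once the bulk download finishes, it's worth confirming each scene directory is complete
# before processing. A minimal sketch, treating the presence of the scene's _MTL.txt
# metadata file as the completeness check (adjust to your own definition of complete):
for i, row in bulk_frame.iterrows():
    mtl = os.path.join(LANDSAT_PATH, row.productId, row.productId + '_MTL.txt')
    if not os.path.isfile(mtl):
        print 'Incomplete or missing scene: ' + row.productId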