import leafmap
import pandas as pd
import geopandas as gpd
from datetime import datetime
import re
To download and access the data, you will need to create an Earthdata login. You can register for an account at urs.earthdata.nasa.gov.
leafmap.nasa_data_login()
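Under the hood, leafmap's NASA Earthdata utilities use the earthaccess package (in recent leafmap versions), so you can also authenticate non-interactively. A minimal sketch, assuming earthaccess is installed and your credentials are stored in ~/.netrc or in the EARTHDATA_USERNAME / EARTHDATA_PASSWORD environment variables:
import earthaccess
# Reads credentials from ~/.netrc (machine urs.earthdata.nasa.gov);
# use strategy="environment" to read EARTHDATA_USERNAME / EARTHDATA_PASSWORD instead
earthaccess.login(strategy="netrc")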
A TSV of NASA Earthdata products is available in the NASA-Earth-Data repo. We filter it down to just the OPERA products.
url = 'https://github.com/opengeos/NASA-Earth-Data/raw/main/nasa_earth_data.tsv'
earth_data_df = pd.read_csv(url, sep='\t')
opera_df = earth_data_df[earth_data_df['ShortName'].str.contains('OPERA', case=False)]
opera_df
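Before committing to a product, it can help to list the distinct OPERA short names in the filtered table:
print(opera_df['ShortName'].unique())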
This will take some time and quite a bit of RAM. For reference, as of Jan. 2024 there are ~2,100,000 RTC-S1 granules, and it takes somewhere north of 90 minutes to load them into the GeoDataFrame. If the kernel crashes, reduce the number of granules you read into memory by changing count to something other than -1 (which returns all granules); a reduced-query sketch follows the full search below.
results, gdf = leafmap.nasa_data_search(
    short_name='OPERA_L2_RTC-S1_V1',
    cloud_hosted=True,
    bounding_box=(-180.0, -90.0, 180.0, 90.0),  # global coverage
    temporal=("2014-06-15", str(datetime.now().date())),
    count=-1,  # use -1 to return all granules
    return_gdf=True,
)
gdf.tail()
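If the full global search is too heavy for your machine, the same call can be scoped down. A minimal sketch, assuming you only need a regional subset and a capped number of granules (the bounding box and count below are placeholders):
results_small, gdf_small = leafmap.nasa_data_search(
    short_name='OPERA_L2_RTC-S1_V1',
    cloud_hosted=True,
    bounding_box=(-125.0, 32.0, -114.0, 42.0),  # placeholder region (roughly California)
    temporal=("2014-06-15", str(datetime.now().date())),
    count=10000,  # cap the number of granules instead of returning everything
    return_gdf=True,
)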
identifier_list = gdf['native-id'].tolist()
print('Total granules:', len(identifier_list))
print(identifier_list[0:2])
### Extract the portion of the identifier name that indicates true duplicates
print(identifier_list[0][0:-29]) # burst ID and acquisition time (processing time, platform, and version removed)
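The fixed-index slicing used throughout this notebook relies on the RTC-S1 naming convention OPERA_L2_RTC-S1_{burst}_{acquisition}_{processing}_{platform}_{spacing}_{version}. An equivalent, more readable split of one identifier (a sketch, assuming that convention holds for every granule):
sample = identifier_list[0]
# Skip the fixed 'OPERA_L2_RTC-S1_' prefix, then split the remaining fields on underscores
burst_id, acq_time, proc_time, platform, spacing, version = sample[16:].split('_')
print(burst_id, acq_time)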
### Note: this only covers the HDF5 (.h5) file for each granule
duplicate_identifiers = set()
unique_identifiers = set()
for identifier in identifier_list:
potential_duplicate_portion = identifier[0:-29]
# Check if the identifier is already in the set
if potential_duplicate_portion in unique_identifiers:
duplicate_identifiers.add(potential_duplicate_portion)
else:
# Add the identifier to the set if it's not a duplicate
unique_identifiers.add(potential_duplicate_portion)
# If you need the result as a list, you can convert the sets back to lists
duplicate_identifiers_list = list(duplicate_identifiers)
unique_identifiers_list = list(unique_identifiers)
print(f'Total RTC-S1 granules as of {datetime.now().strftime("%d-%m-%Y")}:', len(identifier_list))
print('Granules with more than one version:',len(duplicate_identifiers))
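The same check can be done with pandas value_counts, which also tells you how many versions each burst/acquisition combination has. A minimal sketch:
prefix_counts = pd.Series([identifier[0:-29] for identifier in identifier_list]).value_counts()
print('Granules with more than one version (via value_counts):', (prefix_counts > 1).sum())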
# Create a dictionary mapping each potentially duplicated portion (key) to the list of full identifiers that share it (value)
granules_dictionary = {}
# Create a list to store groups of duplicated granules (a "pair" may contain more than two entries)
duplicate_pairs = []
# Iterate over the elements in the list
for granule in identifier_list:
# Extract the potentially duplicated portion
potential_duplicate_portion = granule[0:-29]
# If the potential duplicate portion is not in the dictionary, add it with the entire element
if potential_duplicate_portion not in granules_dictionary:
granules_dictionary[potential_duplicate_portion] = [granule]
else:
# If the potential duplicate portion is already in the dictionary, add the entire element to the list
granules_dictionary[potential_duplicate_portion].append(granule)
# Create pairs from the dictionary values
for granules in granules_dictionary.values():
if len(granules) > 1:
duplicate_pairs.append(granules)
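An equivalent way to build the same groups is collections.defaultdict, which avoids the explicit membership check. A minimal sketch:
from collections import defaultdict
grouped = defaultdict(list)
for granule in identifier_list:
    grouped[granule[0:-29]].append(granule)
duplicate_groups = [group for group in grouped.values() if len(group) > 1]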
print(f'Total RTC-S1 granules as of {datetime.now().strftime("%d-%m-%Y")}:', len(identifier_list))
duplicates = []
for pair in duplicate_pairs:
for granule in pair:
duplicates.append(granule)
print(f'Total RTC-S1 duplicate tiles as of {datetime.now().strftime("%d-%m-%Y")}: {len(duplicates)} granules')
one_duplicate = []
for pair in duplicate_pairs:
if len(pair) == 2:
one_duplicate.append(pair)
print(f'Total RTC-S1 duplicate tiles with 1 duplicate as of {datetime.now().strftime("%d-%m-%Y")}: {len(one_duplicate)} totaling {len(one_duplicate)*2} granules.')
two_duplicates = []
for pair in duplicate_pairs:
if len(pair) == 3:
two_duplicates.append(pair)
print(f'Total RTC-S1 duplicate tiles with 2 duplicates as of {datetime.now().strftime("%d-%m-%Y")}: {len(two_duplicates)} totaling {len(two_duplicates)*3} granules.')
three_duplicates = []
for pair in duplicate_pairs:
if len(pair) == 4:
three_duplicates.append(pair)
print(f'Total RTC-S1 duplicate tiles with 3 duplicates as of {datetime.now().strftime("%d-%m-%Y")}: {len(three_duplicates)} totaling {len(three_duplicates)*4} granules.')
four_duplicates = []
for pair in duplicate_pairs:
if len(pair) == 5:
four_duplicates.append(pair)
print(f'Total RTC-S1 duplicate tiles with 4 duplicates as of {datetime.now().strftime("%d-%m-%Y")}: {len(four_duplicates)} totaling {len(four_duplicates)*5} granules.')
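The four blocks above can be summarized in a single pass with collections.Counter. A minimal sketch that reports every group size found:
from collections import Counter
group_sizes = Counter(len(group) for group in duplicate_pairs)
for size, n_tiles in sorted(group_sizes.items()):
    print(f'Tiles with {size - 1} duplicate(s): {n_tiles}, totaling {n_tiles * size} granules')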
duplicate_urls = []
for pair in duplicate_pairs:
pair_urls = []
for granule in pair:
pair_urls.append('https://datapool.asf.alaska.edu/RTC/OPERA-S1/'+str(granule)+'.h5')
duplicate_urls.append(pair_urls)
burst_ids = []
dates = []
for pair in duplicate_pairs:
burst_ids.append(pair[0][16:31])
date_str = pair[0][32:48]
datetime_object = datetime.strptime(date_str, '%Y%m%dT%H%M%SZ')
dates.append(datetime_object)
duplicates_df = pd.DataFrame({
'burst_id': burst_ids,
'acquisition_date': dates,
'duplicates': duplicate_urls
})
#df_final = pd.DataFrame(duplicates_df['duplicates'].tolist(), index=duplicates_df[['burst_id', 'date']]).reset_index()
df_final = pd.concat([duplicates_df[['burst_id', 'acquisition_date']], duplicates_df['duplicates'].apply(lambda x: pd.Series(x))], axis=1)
# Rename the columns; adjust this list to match the maximum group size reported above
df_final.columns = ['burst_id', 'acquisition_date', 'duplicate_1', 'duplicate_2']
#df_final.columns = ['burst_id', 'acquisition_date', 'duplicate_1', 'duplicate_2', 'duplicate_3','duplicate_4']
# Sort by burst_id
sorted_df = df_final.sort_values(by='burst_id', ignore_index=True)
sorted_df.head()
df2 = pd.DataFrame(gdf)
df2.tail()
def extract_portion(url):
if pd.isna(url):
return None
match = re.search(r'/([^/]+)\.h5', url)
if match:
return match.group(1)
else:
return None
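A quick sanity check of the helper on one of the URLs built earlier:
print(extract_portion(duplicate_urls[0][0]))  # should print that granule's native-id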
# Apply the function to extract the identifier portion and create new columns
# The try/except approach is used in case fewer than four duplicate columns were found above
try:
sorted_df['extracted_portion_duplicate_1'] = sorted_df['duplicate_1'].apply(extract_portion)
sorted_df['extracted_portion_duplicate_2'] = sorted_df['duplicate_2'].apply(extract_portion)
sorted_df['extracted_portion_duplicate_3'] = sorted_df['duplicate_3'].apply(extract_portion)
sorted_df['extracted_portion_duplicate_4'] = sorted_df['duplicate_4'].apply(extract_portion)
except:
pass
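The hard-coded assignments above can also be written as a loop over whichever duplicate_N columns actually exist, which avoids the bare except. A minimal sketch:
for n in range(1, 5):
    col = f'duplicate_{n}'
    if col in sorted_df.columns:
        sorted_df[f'extracted_portion_{col}'] = sorted_df[col].apply(extract_portion)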
try:
merged_df1 = pd.merge(sorted_df, df2, left_on='extracted_portion_duplicate_1', right_on='native-id', how='inner')
merged_df2 = pd.merge(sorted_df, df2, left_on='extracted_portion_duplicate_2', right_on='native-id', how='inner')
merged_df3 = pd.merge(sorted_df, df2, left_on='extracted_portion_duplicate_3', right_on='native-id', how='inner')
merged_df4 = pd.merge(sorted_df, df2, left_on='extracted_portion_duplicate_4', right_on='native-id', how='inner')
except:
pass
columns_to_drop = ['acquisition_date','size','concept-type','concept-id','Projects','ArchiveAndDistributionInformation','DayNightFlag',
'Identifiers','Platforms','Name','URL','ShortName','AdditionalAttributes','GPolygons','ProviderDates','EntryTitle','PGEName','PGEVersion',
'OrbitCalculatedSpatialDomains','GranuleUR','RelatedUrls','InputGranules','format','provider-id','native-id','revision-id']
try:
merged_df1 = merged_df1.drop(columns=columns_to_drop)
merged_df2 = merged_df2.drop(columns=columns_to_drop)
merged_df3 = merged_df3.drop(columns=columns_to_drop)
merged_df4 = merged_df4.drop(columns=columns_to_drop)
except:
pass
try:
merged_df1['revision-date-1'] = merged_df1['revision-date']
merged_df2['revision-date-2'] = merged_df2['revision-date']
merged_df3['revision-date-3'] = merged_df3['revision-date']
merged_df4['revision-date-4'] = merged_df4['revision-date']
except:
pass
try:
merged_df1['ProductionDateTime-1'] = merged_df1['ProductionDateTime']
merged_df2['ProductionDateTime-2'] = merged_df2['ProductionDateTime']
merged_df3['ProductionDateTime-3'] = merged_df3['ProductionDateTime']
merged_df4['ProductionDateTime-4'] = merged_df4['ProductionDateTime']
except:
pass
try:
merged_df = pd.merge(merged_df1, merged_df2, on='duplicate_1', how='left', suffixes=('_df2', '_df1'))
merged_df = pd.merge(merged_df, merged_df3, on='duplicate_1', how='left', suffixes=('_merged', '_df3'))
merged_df = pd.merge(merged_df, merged_df4, on='duplicate_1', how='left', suffixes=('_merged', '_df4'))
except:
pass
try:
merged_df['burst_id'] = merged_df['burst_id_df2']
merged_df['duplicate_2'] = merged_df['duplicate_2_df2']
merged_df['duplicate_3'] = merged_df['duplicate_3_df2']
merged_df['duplicate_4'] = merged_df['duplicate_4_df2']
except:
pass
suffixes_to_remove = ['_df1', '_df2', '_df3', '_df4', '_merged']
# Iterate over the suffixes and drop columns
for suffix in suffixes_to_remove:
try:
columns_to_drop = [col for col in merged_df.columns if (col.endswith(suffix))]
merged_df = merged_df.drop(columns=columns_to_drop)
except:
pass
merged_df['extracted_portion_duplicate_1'] = merged_df['duplicate_1'].apply(extract_portion)
common_column = 'extracted_portion_duplicate_1'
column_to_include = 'BeginningDateTime'
# Merge the DataFrames based on the common column
merged_df = pd.merge(merged_df, df2[['native-id', column_to_include]], left_on=common_column, right_on='native-id', how='left')
# Drop the duplicate columns and rename the result column
merged_df = merged_df.drop(columns=['native-id','extracted_portion_duplicate_1']).rename(columns={column_to_include: 'BeginningDateTime'})
# Specify the desired column order
desired_order = ['burst_id', 'duplicate_1', 'duplicate_2', 'duplicate_3','duplicate_4','BeginningDateTime','revision-date-1','revision-date-2','revision-date-3','revision-date-4','ProductionDateTime-1','ProductionDateTime-2','ProductionDateTime-3','ProductionDateTime-4']
# Filter the columns that exist in the DataFrame
existing_columns = [col for col in desired_order if col in merged_df.columns]
# Reorder the DataFrame columns based on the desired order
merged_df = merged_df[existing_columns]
merged_df.head()
# Output the DataFrame to a CSV file
csv_file_path = f'RTC-S1_duplicates_{datetime.now().strftime("%d-%m-%Y")}.csv'
merged_df.to_csv(csv_file_path, index=False)
# Build a GeoDataFrame containing only the duplicate granules
duplicates_gdf = gdf[gdf['native-id'].isin(duplicates)]
columns_to_drop = ['size', 'concept-type', 'concept-id', 'revision-id', 'native-id',
'provider-id', 'format', 'revision-date', 'BeginningDateTime',
'EndingDateTime', 'OrbitCalculatedSpatialDomains', 'GranuleUR',
'AdditionalAttributes', 'GPolygons', 'ProviderDates', 'EntryTitle',
'PGEName', 'PGEVersion', 'RelatedUrls', 'InputGranules', 'Projects',
'ArchiveAndDistributionInformation', 'DayNightFlag', 'Identifiers',
'ProductionDateTime', 'Platforms', 'Name', 'URL', 'ShortName',]
duplicates_gdf = duplicates_gdf.drop(columns = columns_to_drop)
duplicates_gdf.to_file(f'RTC-S1_duplicates_{datetime.now().strftime("%d-%m-%Y")}.geojson', driver='GeoJSON')
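To eyeball the duplicate footprints, the GeoDataFrame can also be added to an interactive leafmap map. A sketch; plotting only a small subset keeps the map responsive:
m = leafmap.Map()
m.add_gdf(duplicates_gdf.head(100), layer_name='RTC-S1 duplicates (subset)')
m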