import leafmap
import pandas as pd
import geopandas as gpd
from datetime import datetime
import re
To access the data, you will need an Earthdata login. You can register for an account at urs.earthdata.nasa.gov.
leafmap.nasa_data_login()
A TSV of NASA Earthdata products is available in the NASA-Earth-Data repo. We filter it down to just the OPERA products.
url = 'https://github.com/opengeos/NASA-Earth-Data/raw/main/nasa_earth_data.tsv'
earth_data_df = pd.read_csv(url, sep='\t')
opera_df = earth_data_df[earth_data_df['ShortName'].str.contains('OPERA', case=False)]
opera_df
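To confirm which OPERA collections are listed (the short names are what the search below expects), a quick check, shown as a minimal sketch, is to print the distinct values:
# List the distinct OPERA collection short names in the filtered table
print(opera_df['ShortName'].unique())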
For reference, as of Jan. 2024 there are ~225,000 CSLC-S1 granules, and it takes about 6 minutes to load them into the GeoDataFrame.
results, gdf = leafmap.nasa_data_search(
    short_name='OPERA_L2_CSLC-S1_V1',
    cloud_hosted=True,
    bounding_box=(-180.0, -90.0, 180.0, 90.0),
    temporal=("2014-06-15", str(datetime.now().date())),
    count=-1,  # use -1 to return all granules
    return_gdf=True,
)
gdf.tail()
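The global, full-archive search above takes several minutes; to test the workflow first, a smaller query is faster. A minimal sketch, assuming an illustrative bounding box over central California and a capped result count (both hypothetical choices, not part of the original workflow):
# Smaller test search: limited area and result count, same collection and time range
test_results, test_gdf = leafmap.nasa_data_search(
    short_name='OPERA_L2_CSLC-S1_V1',
    cloud_hosted=True,
    bounding_box=(-122.0, 36.0, -120.0, 38.0),
    temporal=("2014-06-15", str(datetime.now().date())),
    count=100,
    return_gdf=True,
)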
identifier_list = gdf['native-id'].tolist()
print('Total granules:', len(identifier_list))
print(identifier_list[0:2])
### Extract the part of the identifier name that indicates true duplicates
print(identifier_list[0][0:-29])  # burst ID + acquisition datetime (production metadata stripped)
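The slice indices used throughout this notebook follow the CSLC-S1 naming convention. As a purely illustrative example (the values below are hypothetical), an identifier of the form OPERA_L2_CSLC-S1_{burst}_{acquisition time}_{production time}_{platform}_{polarization}_{version} can be decomposed like this, where [0:-29] drops everything after the acquisition time:
# Hypothetical identifier illustrating what each slice selects
example_id = 'OPERA_L2_CSLC-S1_T064-135518-IW1_20240101T015042Z_20240115T120000Z_S1A_VV_v1.1'
print(example_id[0:-29])  # burst ID + acquisition datetime (the duplicate key)
print(example_id[17:32])  # burst ID only
print(example_id[33:41])  # acquisition date (YYYYMMDD)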
duplicate_identifiers = set()
unique_identifiers = set()
for identifier in identifier_list:
    potential_duplicate_portion = identifier[0:-29]
    # Check if this portion has already been seen
    if potential_duplicate_portion in unique_identifiers:
        duplicate_identifiers.add(potential_duplicate_portion)
    else:
        # Add the portion to the set if it's not a duplicate
        unique_identifiers.add(potential_duplicate_portion)
# If you need the result as a list, you can convert the sets back to lists
duplicate_identifiers_list = list(duplicate_identifiers)
unique_identifiers_list = list(unique_identifiers)
print(f'Total CSLC-S1 granules as of {datetime.now().strftime("%d-%m-%Y")}:', len(identifier_list))
print('Granules with more than one version:', len(duplicate_identifiers))
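As a cross-check, the same count can be computed directly on the GeoDataFrame with pandas; a small sketch:
# Count how many granules share each burst-ID + acquisition-time prefix
prefix_counts = gdf['native-id'].str[:-29].value_counts()
print('Granules with more than one version:', (prefix_counts > 1).sum())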
# Create a dictionary keyed by the potentially duplicated portion, with a list of the full identifiers as the value
granules_dictionary = {}
# Create a list to store groups of potentially duplicated granules (a group may hold more than two versions)
duplicate_pairs = []
# Iterate over the identifiers in the list
for granule in identifier_list:
    # Extract the potentially duplicated portion
    potential_duplicate_portion = granule[0:-29]
    # If the potential duplicate portion is not in the dictionary, add it with the full identifier
    if potential_duplicate_portion not in granules_dictionary:
        granules_dictionary[potential_duplicate_portion] = [granule]
    else:
        # If it is already in the dictionary, append the full identifier to its list
        granules_dictionary[potential_duplicate_portion].append(granule)
# Collect the groups that contain more than one version from the dictionary values
for granules in granules_dictionary.values():
    if len(granules) > 1:
        duplicate_pairs.append(granules)
print(f'Total CSLC-S1 granules as of {datetime.now().strftime("%d-%m-%Y")}:', len(identifier_list))
duplicates = []
for pair in duplicate_pairs:
    for granule in pair:
        duplicates.append(granule)
print(f'Total CSLC-S1 duplicate tiles as of {datetime.now().strftime("%d-%m-%Y")}: {len(duplicates)} granules')
one_duplicate = []
for pair in duplicate_pairs:
    if len(pair) == 2:
        one_duplicate.append(pair)
print(f'Total CSLC-S1 duplicate tiles with 1 duplicate as of {datetime.now().strftime("%d-%m-%Y")}: {len(one_duplicate)}, totaling {len(one_duplicate)*2} granules.')
two_duplicates = []
for pair in duplicate_pairs:
    if len(pair) == 3:
        two_duplicates.append(pair)
print(f'Total CSLC-S1 duplicate tiles with 2 duplicates as of {datetime.now().strftime("%d-%m-%Y")}: {len(two_duplicates)}, totaling {len(two_duplicates)*3} granules.')
three_duplicates = []
for pair in duplicate_pairs:
    if len(pair) == 4:
        three_duplicates.append(pair)
print(f'Total CSLC-S1 duplicate tiles with 3 duplicates as of {datetime.now().strftime("%d-%m-%Y")}: {len(three_duplicates)}, totaling {len(three_duplicates)*4} granules.')
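The three blocks above handle the two-, three-, and four-version cases separately; to see the full distribution of versions per burst/date in one pass, a small sketch using collections.Counter:
from collections import Counter
# Distribution of group sizes, e.g. {2: ..., 3: ..., 4: ...}
version_counts = Counter(len(group) for group in duplicate_pairs)
print(version_counts)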
### Add the download URLs to the duplicate granule names
duplicate_urls = []
for pair in duplicate_pairs:
    pair_urls = []
    for granule in pair:
        pair_urls.append('https://datapool.asf.alaska.edu/CSLC/OPERA-S1/' + str(granule) + '.h5')
    duplicate_urls.append(pair_urls)
### Burst IDs and acquisition dates
burst_ids = []
dates = []
for pair in duplicate_pairs:
    burst_ids.append(pair[0][17:32])  # burst ID portion of the identifier
    dates.append(pair[0][33:41])      # acquisition date (YYYYMMDD)
duplicates_df = pd.DataFrame({
    'burst_id': burst_ids,
    'date': dates,
    'duplicates': duplicate_urls
})
#df_final = pd.DataFrame(duplicates_df['duplicates'].tolist(), index=duplicates_df[['burst_id', 'date']]).reset_index()
df_final = pd.concat([duplicates_df[['burst_id', 'date']], duplicates_df['duplicates'].apply(lambda x: pd.Series(x))], axis=1)
# Rename the columns (this assumes at most three versions per burst/date; see the more general sketch below)
df_final.columns = ['burst_id', 'date', 'duplicate_1', 'duplicate_2', 'duplicate_3']
# Sort by burst_id
sorted_df = df_final.sort_values(by='burst_id')
sorted_df.head()
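Note that the hard-coded column names above assume at most three versions per burst/date; if a group ever contained more, the rename would fail. A more general (hypothetical) variant builds the duplicate columns dynamically:
# Expand the URL lists into as many duplicate_N columns as the largest group needs
dup_expanded = pd.DataFrame(duplicates_df['duplicates'].tolist())
dup_expanded.columns = [f'duplicate_{i + 1}' for i in range(dup_expanded.shape[1])]
df_general = pd.concat([duplicates_df[['burst_id', 'date']], dup_expanded], axis=1)
df_general.head()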
df2 = pd.DataFrame(gdf)
df2.head()
# Function to extract the granule name (native-id) from a download URL
def extract_portion(url):
    if pd.notna(url):
        match = re.search(r'([^/]+)\.h5', url)
        if match:
            info_string = match.group(1)
            return info_string
    return None
# Apply the function to extract the portion and create a new column
sorted_df['extracted_portion_duplicate_1'] = sorted_df['duplicate_1'].apply(extract_portion)
sorted_df['extracted_portion_duplicate_2'] = sorted_df['duplicate_2'].apply(extract_portion)
sorted_df['extracted_portion_duplicate_3'] = sorted_df['duplicate_3'].apply(extract_portion)
merged_df1 = pd.merge(sorted_df, df2, left_on='extracted_portion_duplicate_1', right_on='native-id', how='inner')
merged_df2 = pd.merge(sorted_df, df2, left_on='extracted_portion_duplicate_2', right_on='native-id', how='inner')
merged_df3 = pd.merge(sorted_df, df2, left_on='extracted_portion_duplicate_3', right_on='native-id', how='inner')
merged_df1['revision-date-1'] = merged_df1['revision-date']
merged_df2['revision-date-2'] = merged_df2['revision-date']
merged_df3['revision-date-3'] = merged_df3['revision-date']
merged_df1['ProductionDateTime-1'] = merged_df1['ProductionDateTime']
merged_df2['ProductionDateTime-2'] = merged_df2['ProductionDateTime']
merged_df3['ProductionDateTime-3'] = merged_df3['ProductionDateTime']
merged_df = pd.merge(merged_df1, merged_df2, on='duplicate_1', how='left', suffixes=('_df2', '_df1'))
merged_df = pd.merge(merged_df, merged_df3, on='duplicate_1', how='left', suffixes=('_merged', '_df3'))
merged_df['burst_id'] = merged_df['burst_id_df2']
merged_df['duplicate_2'] = merged_df['duplicate_2_df2']
merged_df['duplicate_3'] = merged_df['duplicate_3_df2']
suffixes_to_remove = ['_df1', '_df2', '_df3', '_df4','merged']
# Iterate over the suffixes and drop columns
for suffix in suffixes_to_remove:
    columns_to_drop = [col for col in merged_df.columns if col.endswith(suffix)]
    merged_df = merged_df.drop(columns=columns_to_drop)
merged_df['extracted_portion_duplicate_1'] = merged_df['duplicate_1'].apply(extract_portion)
#merged_df['BeginningDateTime'] = merged_df['BeginningDateTime_y']
common_column = 'extracted_portion_duplicate_1'
column_to_include = 'BeginningDateTime'
# Merge the DataFrames based on the common column
merged_df = pd.merge(merged_df, df2[['native-id', column_to_include]], left_on=common_column, right_on='native-id', how='left')
# Drop the duplicate columns and rename the result column
merged_df = merged_df.drop(columns=['extracted_portion_duplicate_1']).rename(columns={column_to_include: 'BeginningDateTime'})
merged_df['BeginningDateTime'] = merged_df['BeginningDateTime_y']
columns_to_drop = ['date','extracted_portion_duplicate_2', 'extracted_portion_duplicate_3',
'size', 'concept-type', 'concept-id', 'revision-id', 'native-id_x',
'provider-id', 'format', 'revision-date', 'BeginningDateTime_x',
'EndingDateTime', 'OrbitCalculatedSpatialDomains', 'GranuleUR',
'AdditionalAttributes', 'GPolygons', 'ProviderDates', 'EntryTitle',
'PGEName', 'PGEVersion', 'RelatedUrls', 'InputGranules', 'Projects',
'ArchiveAndDistributionInformation', 'DayNightFlag', 'Identifiers',
'ProductionDateTime', 'Platforms', 'Name', 'URL', 'ShortName',
'geometry', 'BeginningDateTime_y','native-id_y']
merged_df = merged_df.drop(columns=columns_to_drop)
# Specify the desired column order
desired_order = ['burst_id', 'duplicate_1', 'duplicate_2', 'duplicate_3','BeginningDateTime','revision-date-1','revision-date-2','revision-date-3','ProductionDateTime-1','ProductionDateTime-2','ProductionDateTime-3']
# Create a new DataFrame with the specified column order
merged_df = merged_df[desired_order]
merged_df.head()
# Output the DataFrame to a CSV file
csv_file_path = f'CSLC-S1_duplicates_{datetime.now().strftime("%d-%m-%Y")}.csv'
merged_df.to_csv(csv_file_path, index=False)
# Make a GeoDataFrame of just the duplicate granules
duplicates_gdf = gdf[gdf['native-id'].isin(duplicates)]
columns_to_drop = ['size', 'concept-type', 'concept-id', 'revision-id', 'native-id',
'provider-id', 'format', 'revision-date', 'BeginningDateTime',
'EndingDateTime', 'OrbitCalculatedSpatialDomains', 'GranuleUR',
'AdditionalAttributes', 'GPolygons', 'ProviderDates', 'EntryTitle',
'PGEName', 'PGEVersion', 'RelatedUrls', 'InputGranules', 'Projects',
'ArchiveAndDistributionInformation', 'DayNightFlag', 'Identifiers',
'ProductionDateTime', 'Platforms', 'Name', 'URL', 'ShortName',]
duplicates_gdf = duplicates_gdf.drop(columns = columns_to_drop)
duplicates_gdf.to_file(f'CSLC-S1_duplicates_{datetime.now().strftime("%d-%m-%Y")}.geojson', driver='GeoJSON')
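To visually inspect where the duplicate granules fall, the resulting GeoDataFrame can be dropped onto an interactive map; a minimal sketch using leafmap:
# Display the duplicate granule footprints on an interactive map
m = leafmap.Map()
m.add_gdf(duplicates_gdf, layer_name='CSLC-S1 duplicates')
m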