import leafmap
import pandas as pd
import geopandas as gpd
from datetime import datetime
import re
To access the data, you will need an Earthdata login. You can register for an account at urs.earthdata.nasa.gov.
leafmap.nasa_data_login()
A TSV of NASA Earthdata products is available in the NASA-Earth-Data repo. We filter it down to just the OPERA products.
url = 'https://github.com/opengeos/NASA-Earth-Data/raw/main/nasa_earth_data.tsv'
earth_data_df = pd.read_csv(url, sep='\t')
opera_df = earth_data_df[earth_data_df['ShortName'].str.contains('OPERA', case=False)]
opera_df
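To confirm which OPERA collections are listed (the short names are what the search below expects), a quick check, shown as a minimal sketch, is to print the distinct values:
# List the distinct OPERA collection short names in the filtered table
print(opera_df['ShortName'].unique())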
For reference, as of Jan. 2024 there are ~225,000 CSLC-S1 granules, and it takes about 6 minutes to load them into the GeoDataFrame.
results, gdf = leafmap.nasa_data_search(
    short_name='OPERA_L2_CSLC-S1_V1',
    cloud_hosted=True,
    bounding_box=(-180.0, -90.0, 180.0, 90.0),
    temporal=("2014-06-15", str(datetime.now().date())),
    count=-1,  # use -1 to return all granules
    return_gdf=True,
)
gdf.tail()
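The global, full-archive search above takes several minutes; to test the workflow first, a smaller query is faster. A minimal sketch, assuming an illustrative bounding box over central California and a capped result count (both hypothetical choices, not part of the original workflow):
# Smaller test search: limited area and result count, same collection and time range
test_results, test_gdf = leafmap.nasa_data_search(
    short_name='OPERA_L2_CSLC-S1_V1',
    cloud_hosted=True,
    bounding_box=(-122.0, 36.0, -120.0, 38.0),
    temporal=("2014-06-15", str(datetime.now().date())),
    count=100,
    return_gdf=True,
)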
identifier_list = gdf['native-id'].tolist()
print('Total granules:', len(identifier_list))
print(identifier_list[0:2])
### Extract the part of the identifier name that indicates true duplicates
print(identifier_list[0][0:-29])  # burst ID + acquisition datetime (production metadata stripped)
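The slice indices used throughout this notebook follow the CSLC-S1 naming convention. As a purely illustrative example (the values below are hypothetical), an identifier of the form OPERA_L2_CSLC-S1_{burst}_{acquisition time}_{production time}_{platform}_{polarization}_{version} can be decomposed like this, where [0:-29] drops everything after the acquisition time:
# Hypothetical identifier illustrating what each slice selects
example_id = 'OPERA_L2_CSLC-S1_T064-135518-IW1_20240101T015042Z_20240115T120000Z_S1A_VV_v1.1'
print(example_id[0:-29])  # burst ID + acquisition datetime (the duplicate key)
print(example_id[17:32])  # burst ID only
print(example_id[33:41])  # acquisition date (YYYYMMDD)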
duplicate_identifiers = set()
unique_identifiers = set()
for identifier in identifier_list:
    potential_duplicate_portion = identifier[0:-29]
    # Check if this portion has already been seen
    if potential_duplicate_portion in unique_identifiers:
        duplicate_identifiers.add(potential_duplicate_portion)
    else:
        # Add the portion to the set if it's not a duplicate
        unique_identifiers.add(potential_duplicate_portion)
# If you need the result as a list, you can convert the sets back to lists
duplicate_identifiers_list = list(duplicate_identifiers)
unique_identifiers_list = list(unique_identifiers)
print(f'Total CSLC-S1 granules as of {datetime.now().strftime("%d-%m-%Y")}:', len(identifier_list))
print('Granules with more than one version:', len(duplicate_identifiers))
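As a cross-check, the same count can be computed directly on the GeoDataFrame with pandas; a small sketch:
# Count how many granules share each burst-ID + acquisition-time prefix
prefix_counts = gdf['native-id'].str[:-29].value_counts()
print('Granules with more than one version:', (prefix_counts > 1).sum())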
# Create a dictionary keyed by the potentially duplicated portion, with a list of the full identifiers as the value
granules_dictionary = {}
# Create a list to store groups of potentially duplicated granules (a group may hold more than two versions)
duplicate_pairs = []
# Iterate over the identifiers in the list
for granule in identifier_list:
    # Extract the potentially duplicated portion
    potential_duplicate_portion = granule[0:-29]
    # If the potential duplicate portion is not in the dictionary, add it with the full identifier
    if potential_duplicate_portion not in granules_dictionary:
        granules_dictionary[potential_duplicate_portion] = [granule]
    else:
        # If it is already in the dictionary, append the full identifier to its list
        granules_dictionary[potential_duplicate_portion].append(granule)
# Collect the groups that contain more than one version from the dictionary values
for granules in granules_dictionary.values():
    if len(granules) > 1:
        duplicate_pairs.append(granules)
print(f'Total CSLC-S1 granules as of {datetime.now().strftime("%d-%m-%Y")}:', len(identifier_list))
duplicates = []
for pair in duplicate_pairs:
    for granule in pair:
        duplicates.append(granule)
print(f'Total CSLC-S1 duplicate tiles as of {datetime.now().strftime("%d-%m-%Y")}: {len(duplicates)} granules')
one_duplicate = []
for pair in duplicate_pairs:
    if len(pair) == 2:
        one_duplicate.append(pair)
print(f'Total CSLC-S1 duplicate tiles with 1 duplicate as of {datetime.now().strftime("%d-%m-%Y")}: {len(one_duplicate)}, totaling {len(one_duplicate)*2} granules.')
two_duplicates = []
for pair in duplicate_pairs:
    if len(pair) == 3:
        two_duplicates.append(pair)
print(f'Total CSLC-S1 duplicate tiles with 2 duplicates as of {datetime.now().strftime("%d-%m-%Y")}: {len(two_duplicates)}, totaling {len(two_duplicates)*3} granules.')
three_duplicates = []
for pair in duplicate_pairs:
    if len(pair) == 4:
        three_duplicates.append(pair)
print(f'Total CSLC-S1 duplicate tiles with 3 duplicates as of {datetime.now().strftime("%d-%m-%Y")}: {len(three_duplicates)}, totaling {len(three_duplicates)*4} granules.')
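The three blocks above handle the two-, three-, and four-version cases separately; to see the full distribution of versions per burst/date in one pass, a small sketch using collections.Counter:
from collections import Counter
# Distribution of group sizes, e.g. {2: ..., 3: ..., 4: ...}
version_counts = Counter(len(group) for group in duplicate_pairs)
print(version_counts)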
### Add the download URLs to the duplicate granule names
duplicate_urls = []
for pair in duplicate_pairs:
    pair_urls = []
    for granule in pair:
        pair_urls.append('https://datapool.asf.alaska.edu/CSLC/OPERA-S1/' + str(granule) + '.h5')
    duplicate_urls.append(pair_urls)
### Burst IDs and acquisition dates
burst_ids = []
dates = []
for pair in duplicate_pairs:
    burst_ids.append(pair[0][17:32])  # burst ID portion of the identifier
    dates.append(pair[0][33:41])      # acquisition date (YYYYMMDD)
duplicates_df = pd.DataFrame({
    'burst_id': burst_ids,
    'date': dates,
    'duplicates': duplicate_urls
})
#df_final = pd.DataFrame(duplicates_df['duplicates'].tolist(), index=duplicates_df[['burst_id', 'date']]).reset_index()
df_final = pd.concat([duplicates_df[['burst_id', 'date']], duplicates_df['duplicates'].apply(lambda x: pd.Series(x))], axis=1)
# Rename the columns (this assumes at most three versions per burst/date; see the more general sketch below)
df_final.columns = ['burst_id', 'date', 'duplicate_1', 'duplicate_2', 'duplicate_3']
# Sort by burst_id
sorted_df = df_final.sort_values(by='burst_id')
sorted_df.head()
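Note that the hard-coded column names above assume at most three versions per burst/date; if a group ever contained more, the rename would fail. A more general (hypothetical) variant builds the duplicate columns dynamically:
# Expand the URL lists into as many duplicate_N columns as the largest group needs
dup_expanded = pd.DataFrame(duplicates_df['duplicates'].tolist())
dup_expanded.columns = [f'duplicate_{i + 1}' for i in range(dup_expanded.shape[1])]
df_general = pd.concat([duplicates_df[['burst_id', 'date']], dup_expanded], axis=1)
df_general.head()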
df2 = pd.DataFrame(gdf)
df2.head()
# Function to extract the granule name (native-id) from a download URL
def extract_portion(url):
    if pd.notna(url):
        match = re.search(r'([^/]+)\.h5', url)
        if match:
            info_string = match.group(1)
            return info_string
    return None
# Apply the function to extract the portion and create a new column
sorted_df['extracted_portion_duplicate_1'] = sorted_df['duplicate_1'].apply(extract_portion)
sorted_df['extracted_portion_duplicate_2'] = sorted_df['duplicate_2'].apply(extract_portion)
sorted_df['extracted_portion_duplicate_3'] = sorted_df['duplicate_3'].apply(extract_portion)
merged_df1 = pd.merge(sorted_df, df2, left_on='extracted_portion_duplicate_1', right_on='native-id', how='inner')
merged_df2 = pd.merge(sorted_df, df2, left_on='extracted_portion_duplicate_2', right_on='native-id', how='inner')
merged_df3 = pd.merge(sorted_df, df2, left_on='extracted_portion_duplicate_3', right_on='native-id', how='inner')
merged_df1['revision-date-1'] = merged_df1['revision-date']
merged_df2['revision-date-2'] = merged_df2['revision-date']
merged_df3['revision-date-3'] = merged_df3['revision-date']
merged_df1['ProductionDateTime-1'] = merged_df1['ProductionDateTime']
merged_df2['ProductionDateTime-2'] = merged_df2['ProductionDateTime']
merged_df3['ProductionDateTime-3'] = merged_df3['ProductionDateTime']
merged_df = pd.merge(merged_df1, merged_df2, on='duplicate_1', how='left', suffixes=('_df2', '_df1'))
merged_df = pd.merge(merged_df, merged_df3, on='duplicate_1', how='left', suffixes=('_merged', '_df3'))
merged_df['burst_id'] = merged_df['burst_id_df2']
merged_df['duplicate_2'] = merged_df['duplicate_2_df2']
merged_df['duplicate_3'] = merged_df['duplicate_3_df2']
suffixes_to_remove = ['_df1', '_df2', '_df3', '_df4','merged']
# Iterate over the suffixes and drop columns
for suffix in suffixes_to_remove:
    columns_to_drop = [col for col in merged_df.columns if col.endswith(suffix)]
    merged_df = merged_df.drop(columns=columns_to_drop)
merged_df['extracted_portion_duplicate_1'] = merged_df['duplicate_1'].apply(extract_portion)
#merged_df['BeginningDateTime'] = merged_df['BeginningDateTime_y']
common_column = 'extracted_portion_duplicate_1'
column_to_include = 'BeginningDateTime'
# Merge the DataFrames based on the common column
merged_df = pd.merge(merged_df, df2[['native-id', column_to_include]], left_on=common_column, right_on='native-id', how='left')
# Drop the duplicate columns and rename the result column
merged_df = merged_df.drop(columns=['extracted_portion_duplicate_1']).rename(columns={column_to_include: 'BeginningDateTime'})
merged_df['BeginningDateTime'] = merged_df['BeginningDateTime_y']
columns_to_drop = ['date','extracted_portion_duplicate_2', 'extracted_portion_duplicate_3',
'size', 'concept-type', 'concept-id', 'revision-id', 'native-id_x',
'provider-id', 'format', 'revision-date', 'BeginningDateTime_x',
'EndingDateTime', 'OrbitCalculatedSpatialDomains', 'GranuleUR',
'AdditionalAttributes', 'GPolygons', 'ProviderDates', 'EntryTitle',
'PGEName', 'PGEVersion', 'RelatedUrls', 'InputGranules', 'Projects',
'ArchiveAndDistributionInformation', 'DayNightFlag', 'Identifiers',
'ProductionDateTime', 'Platforms', 'Name', 'URL', 'ShortName',
'geometry', 'BeginningDateTime_y','native-id_y']
merged_df = merged_df.drop(columns=columns_to_drop)
# Specify the desired column order
desired_order = ['burst_id', 'duplicate_1', 'duplicate_2', 'duplicate_3','BeginningDateTime','revision-date-1','revision-date-2','revision-date-3','ProductionDateTime-1','ProductionDateTime-2','ProductionDateTime-3']
# Create a new DataFrame with the specified column order
merged_df = merged_df[desired_order]
merged_df.head()
# Output the DataFrame to a CSV file
csv_file_path = f'CSLC-S1_duplicates_{datetime.now().strftime("%d-%m-%Y")}.csv'
merged_df.to_csv(csv_file_path, index=False)
# Make a GeoDataFrame of just the duplicate granules
duplicates_gdf = gdf[gdf['native-id'].isin(duplicates)]
columns_to_drop = ['size', 'concept-type', 'concept-id', 'revision-id', 'native-id',
'provider-id', 'format', 'revision-date', 'BeginningDateTime',
'EndingDateTime', 'OrbitCalculatedSpatialDomains', 'GranuleUR',
'AdditionalAttributes', 'GPolygons', 'ProviderDates', 'EntryTitle',
'PGEName', 'PGEVersion', 'RelatedUrls', 'InputGranules', 'Projects',
'ArchiveAndDistributionInformation', 'DayNightFlag', 'Identifiers',
'ProductionDateTime', 'Platforms', 'Name', 'URL', 'ShortName',]
duplicates_gdf = duplicates_gdf.drop(columns = columns_to_drop)
duplicates_gdf.to_file(f'CSLC-S1_duplicates_{datetime.now().strftime("%d-%m-%Y")}.geojson', driver='GeoJSON')
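To visually inspect where the duplicate granules fall, the resulting GeoDataFrame can be dropped onto an interactive map; a minimal sketch using leafmap:
# Display the duplicate granule footprints on an interactive map
m = leafmap.Map()
m.add_gdf(duplicates_gdf, layer_name='CSLC-S1 duplicates')
m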