import leafmap
import pandas as pd
import geopandas as gpd
from datetime import datetime
import re
To download and access the data, you will need to create an Earthdata login. You can register for an account at urs.earthdata.nasa.gov.
leafmap.nasa_data_login()
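Under the hood, leafmap's NASA Earthdata utilities use the earthaccess package (in recent leafmap versions), so you can also authenticate non-interactively. A minimal sketch, assuming earthaccess is installed and your credentials are stored in ~/.netrc or in the EARTHDATA_USERNAME / EARTHDATA_PASSWORD environment variables:
import earthaccess
# Reads credentials from ~/.netrc (machine urs.earthdata.nasa.gov);
# use strategy="environment" to read EARTHDATA_USERNAME / EARTHDATA_PASSWORD instead
earthaccess.login(strategy="netrc")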
A TSV of NASA Earthdata products is available in the NASA-Earth-Data repo. We filter it down to just the OPERA products.
url = 'https://github.com/opengeos/NASA-Earth-Data/raw/main/nasa_earth_data.tsv'
earth_data_df = pd.read_csv(url, sep='\t')
opera_df = earth_data_df[earth_data_df['ShortName'].str.contains('OPERA', case=False)]
opera_df
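Before committing to a product, it can help to list the distinct OPERA short names in the filtered table:
print(opera_df['ShortName'].unique())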
This will take some time and quite a bit of RAM. For reference, as of Jan. 2024 there are ~2,100,000 RTC-S1 granules, and it takes somewhere north of 90 minutes to load them into the GeoDataFrame. If the kernel crashes, reduce the number of granules you read into memory by changing count to something other than -1 (which returns all granules); a reduced-query sketch follows the full search below.
results, gdf = leafmap.nasa_data_search(
    short_name='OPERA_L2_RTC-S1_V1',
    cloud_hosted=True,
    bounding_box=(-180.0, -90.0, 180.0, 90.0),  # global coverage
    temporal=("2014-06-15", str(datetime.now().date())),
    count=-1,  # use -1 to return all granules
    return_gdf=True,
)
gdf.tail()
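If the full global search is too heavy for your machine, the same call can be scoped down. A minimal sketch, assuming you only need a regional subset and a capped number of granules (the bounding box and count below are placeholders):
results_small, gdf_small = leafmap.nasa_data_search(
    short_name='OPERA_L2_RTC-S1_V1',
    cloud_hosted=True,
    bounding_box=(-125.0, 32.0, -114.0, 42.0),  # placeholder region (roughly California)
    temporal=("2014-06-15", str(datetime.now().date())),
    count=10000,  # cap the number of granules instead of returning everything
    return_gdf=True,
)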
identifier_list = gdf['native-id'].tolist()
print('Total granules:', len(identifier_list))
print(identifier_list[0:2])
### Extract the portion of the identifier name that indicates true duplicates
print(identifier_list[0][0:-29]) # burst ID and acquisition time (processing time, platform, and version removed)
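The fixed-index slicing used throughout this notebook relies on the RTC-S1 naming convention OPERA_L2_RTC-S1_{burst}_{acquisition}_{processing}_{platform}_{spacing}_{version}. An equivalent, more readable split of one identifier (a sketch, assuming that convention holds for every granule):
sample = identifier_list[0]
# Skip the fixed 'OPERA_L2_RTC-S1_' prefix, then split the remaining fields on underscores
burst_id, acq_time, proc_time, platform, spacing, version = sample[16:].split('_')
print(burst_id, acq_time)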
### Note: this only covers the HDF5 (.h5) file for each granule
duplicate_identifiers = set()
unique_identifiers = set()
for identifier in identifier_list:
potential_duplicate_portion = identifier[0:-29]
# Check if the identifier is already in the set
if potential_duplicate_portion in unique_identifiers:
duplicate_identifiers.add(potential_duplicate_portion)
else:
# Add the identifier to the set if it's not a duplicate
unique_identifiers.add(potential_duplicate_portion)
# If you need the result as a list, you can convert the sets back to lists
duplicate_identifiers_list = list(duplicate_identifiers)
unique_identifiers_list = list(unique_identifiers)
print(f'Total RTC-S1 granules as of {datetime.now().strftime("%d-%m-%Y")}:', len(identifier_list))
print('Granules with more than one version:',len(duplicate_identifiers))
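The same check can be done with pandas value_counts, which also tells you how many versions each burst/acquisition combination has. A minimal sketch:
prefix_counts = pd.Series([identifier[0:-29] for identifier in identifier_list]).value_counts()
print('Granules with more than one version (via value_counts):', (prefix_counts > 1).sum())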
# Create a dictionary mapping each potentially duplicated portion (key) to the list of full identifiers that share it (value)
granules_dictionary = {}
# Create a list to store groups of duplicated granules (a "pair" may contain more than two entries)
duplicate_pairs = []
# Iterate over the elements in the list
for granule in identifier_list:
# Extract the potentially duplicated portion
potential_duplicate_portion = granule[0:-29]
# If the potential duplicate portion is not in the dictionary, add it with the entire element
if potential_duplicate_portion not in granules_dictionary:
granules_dictionary[potential_duplicate_portion] = [granule]
else:
# If the potential duplicate portion is already in the dictionary, add the entire element to the list
granules_dictionary[potential_duplicate_portion].append(granule)
# Create pairs from the dictionary values
for granules in granules_dictionary.values():
if len(granules) > 1:
duplicate_pairs.append(granules)
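An equivalent way to build the same groups is collections.defaultdict, which avoids the explicit membership check. A minimal sketch:
from collections import defaultdict
grouped = defaultdict(list)
for granule in identifier_list:
    grouped[granule[0:-29]].append(granule)
duplicate_groups = [group for group in grouped.values() if len(group) > 1]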
print(f'Total RTC-S1 granules as of {datetime.now().strftime("%d-%m-%Y")}:', len(identifier_list))
duplicates = []
for pair in duplicate_pairs:
for granule in pair:
duplicates.append(granule)
print(f'Total RTC-S1 duplicate tiles as of {datetime.now().strftime("%d-%m-%Y")}: {len(duplicates)} granules')
one_duplicate = []
for pair in duplicate_pairs:
if len(pair) == 2:
one_duplicate.append(pair)
print(f'Total RTC-S1 duplicate tiles with 1 duplicate as of {datetime.now().strftime("%d-%m-%Y")}: {len(one_duplicate)} totaling {len(one_duplicate)*2} granules.')
two_duplicates = []
for pair in duplicate_pairs:
if len(pair) == 3:
two_duplicates.append(pair)
print(f'Total RTC-S1 duplicate tiles with 2 duplicates as of {datetime.now().strftime("%d-%m-%Y")}: {len(two_duplicates)} totaling {len(two_duplicates)*3} granules.')
three_duplicates = []
for pair in duplicate_pairs:
if len(pair) == 4:
three_duplicates.append(pair)
print(f'Total RTC-S1 duplicate tiles with 3 duplicates as of {datetime.now().strftime("%d-%m-%Y")}: {len(three_duplicates)} totaling {len(three_duplicates)*4} granules.')
four_duplicates = []
for pair in duplicate_pairs:
if len(pair) == 5:
four_duplicates.append(pair)
print(f'Total RTC-S1 duplicate tiles with 4 duplicates as of {datetime.now().strftime("%d-%m-%Y")}: {len(four_duplicates)} totaling {len(four_duplicates)*5} granules.')
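The four blocks above can be summarized in a single pass with collections.Counter. A minimal sketch that reports every group size found:
from collections import Counter
group_sizes = Counter(len(group) for group in duplicate_pairs)
for size, n_tiles in sorted(group_sizes.items()):
    print(f'Tiles with {size - 1} duplicate(s): {n_tiles}, totaling {n_tiles * size} granules')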
duplicate_urls = []
for pair in duplicate_pairs:
pair_urls = []
for granule in pair:
pair_urls.append('https://datapool.asf.alaska.edu/RTC/OPERA-S1/'+str(granule)+'.h5')
duplicate_urls.append(pair_urls)
burst_ids = []
dates = []
for pair in duplicate_pairs:
burst_ids.append(pair[0][16:31])
date_str = pair[0][32:48]
datetime_object = datetime.strptime(date_str, '%Y%m%dT%H%M%SZ')
dates.append(datetime_object)
duplicates_df = pd.DataFrame({
'burst_id': burst_ids,
'acquisition_date': dates,
'duplicates': duplicate_urls
})
#df_final = pd.DataFrame(duplicates_df['duplicates'].tolist(), index=duplicates_df[['burst_id', 'date']]).reset_index()
df_final = pd.concat([duplicates_df[['burst_id', 'acquisition_date']], duplicates_df['duplicates'].apply(lambda x: pd.Series(x))], axis=1)
# Rename the columns; adjust this list to match the maximum group size reported above
df_final.columns = ['burst_id', 'acquisition_date', 'duplicate_1', 'duplicate_2']
#df_final.columns = ['burst_id', 'acquisition_date', 'duplicate_1', 'duplicate_2', 'duplicate_3','duplicate_4']
# Sort by burst_id
sorted_df = df_final.sort_values(by='burst_id', ignore_index=True)
sorted_df.head()
df2 = pd.DataFrame(gdf)
df2.tail()
def extract_portion(url):
if pd.isna(url):
return None
match = re.search(r'/([^/]+)\.h5', url)
if match:
return match.group(1)
else:
return None
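A quick sanity check of the helper on one of the URLs built earlier:
print(extract_portion(duplicate_urls[0][0]))  # should print that granule's native-id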
# Apply the function to extract the identifier portion and create new columns
# The try/except approach is used in case fewer than four duplicate columns were found above
try:
sorted_df['extracted_portion_duplicate_1'] = sorted_df['duplicate_1'].apply(extract_portion)
sorted_df['extracted_portion_duplicate_2'] = sorted_df['duplicate_2'].apply(extract_portion)
sorted_df['extracted_portion_duplicate_3'] = sorted_df['duplicate_3'].apply(extract_portion)
sorted_df['extracted_portion_duplicate_4'] = sorted_df['duplicate_4'].apply(extract_portion)
except:
pass
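The hard-coded assignments above can also be written as a loop over whichever duplicate_N columns actually exist, which avoids the bare except. A minimal sketch:
for n in range(1, 5):
    col = f'duplicate_{n}'
    if col in sorted_df.columns:
        sorted_df[f'extracted_portion_{col}'] = sorted_df[col].apply(extract_portion)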
try:
merged_df1 = pd.merge(sorted_df, df2, left_on='extracted_portion_duplicate_1', right_on='native-id', how='inner')
merged_df2 = pd.merge(sorted_df, df2, left_on='extracted_portion_duplicate_2', right_on='native-id', how='inner')
merged_df3 = pd.merge(sorted_df, df2, left_on='extracted_portion_duplicate_3', right_on='native-id', how='inner')
merged_df4 = pd.merge(sorted_df, df2, left_on='extracted_portion_duplicate_4', right_on='native-id', how='inner')
except:
pass
columns_to_drop = ['acquisition_date','size','concept-type','concept-id','Projects','ArchiveAndDistributionInformation','DayNightFlag',
'Identifiers','Platforms','Name','URL','ShortName','AdditionalAttributes','GPolygons','ProviderDates','EntryTitle','PGEName','PGEVersion',
'OrbitCalculatedSpatialDomains','GranuleUR','RelatedUrls','InputGranules','format','provider-id','native-id','revision-id']
try:
merged_df1 = merged_df1.drop(columns=columns_to_drop)
merged_df2 = merged_df2.drop(columns=columns_to_drop)
merged_df3 = merged_df3.drop(columns=columns_to_drop)
merged_df4 = merged_df4.drop(columns=columns_to_drop)
except:
pass
try:
merged_df1['revision-date-1'] = merged_df1['revision-date']
merged_df2['revision-date-2'] = merged_df2['revision-date']
merged_df3['revision-date-3'] = merged_df3['revision-date']
merged_df4['revision-date-4'] = merged_df4['revision-date']
except:
pass
try:
merged_df1['ProductionDateTime-1'] = merged_df1['ProductionDateTime']
merged_df2['ProductionDateTime-2'] = merged_df2['ProductionDateTime']
merged_df3['ProductionDateTime-3'] = merged_df3['ProductionDateTime']
merged_df4['ProductionDateTime-4'] = merged_df4['ProductionDateTime']
except:
pass
try:
merged_df = pd.merge(merged_df1, merged_df2, on='duplicate_1', how='left', suffixes=('_df2', '_df1'))
merged_df = pd.merge(merged_df, merged_df3, on='duplicate_1', how='left', suffixes=('_merged', '_df3'))
merged_df = pd.merge(merged_df, merged_df4, on='duplicate_1', how='left', suffixes=('_merged', '_df4'))
except:
pass
try:
merged_df['burst_id'] = merged_df['burst_id_df2']
merged_df['duplicate_2'] = merged_df['duplicate_2_df2']
merged_df['duplicate_3'] = merged_df['duplicate_3_df2']
merged_df['duplicate_4'] = merged_df['duplicate_4_df2']
except:
pass
suffixes_to_remove = ['_df1', '_df2', '_df3', '_df4', '_merged']
# Iterate over the suffixes and drop columns
for suffix in suffixes_to_remove:
try:
columns_to_drop = [col for col in merged_df.columns if (col.endswith(suffix))]
merged_df = merged_df.drop(columns=columns_to_drop)
except:
pass
merged_df['extracted_portion_duplicate_1'] = merged_df['duplicate_1'].apply(extract_portion)
common_column = 'extracted_portion_duplicate_1'
column_to_include = 'BeginningDateTime'
# Merge the DataFrames based on the common column
merged_df = pd.merge(merged_df, df2[['native-id', column_to_include]], left_on=common_column, right_on='native-id', how='left')
# Drop the duplicate columns and rename the result column
merged_df = merged_df.drop(columns=['native-id','extracted_portion_duplicate_1']).rename(columns={column_to_include: 'BeginningDateTime'})
# Specify the desired column order
desired_order = ['burst_id', 'duplicate_1', 'duplicate_2', 'duplicate_3','duplicate_4','BeginningDateTime','revision-date-1','revision-date-2','revision-date-3','revision-date-4','ProductionDateTime-1','ProductionDateTime-2','ProductionDateTime-3','ProductionDateTime-4']
# Filter the columns that exist in the DataFrame
existing_columns = [col for col in desired_order if col in merged_df.columns]
# Reorder the DataFrame columns based on the desired order
merged_df = merged_df[existing_columns]
merged_df.head()
# Output the DataFrame to a CSV file
csv_file_path = f'RTC-S1_duplicates_{datetime.now().strftime("%d-%m-%Y")}.csv'
merged_df.to_csv(csv_file_path, index=False)
# Build a GeoDataFrame containing only the duplicate granules
duplicates_gdf = gdf[gdf['native-id'].isin(duplicates)]
columns_to_drop = ['size', 'concept-type', 'concept-id', 'revision-id', 'native-id',
'provider-id', 'format', 'revision-date', 'BeginningDateTime',
'EndingDateTime', 'OrbitCalculatedSpatialDomains', 'GranuleUR',
'AdditionalAttributes', 'GPolygons', 'ProviderDates', 'EntryTitle',
'PGEName', 'PGEVersion', 'RelatedUrls', 'InputGranules', 'Projects',
'ArchiveAndDistributionInformation', 'DayNightFlag', 'Identifiers',
'ProductionDateTime', 'Platforms', 'Name', 'URL', 'ShortName',]
duplicates_gdf = duplicates_gdf.drop(columns = columns_to_drop)
duplicates_gdf.to_file(f'RTC-S1_duplicates_{datetime.now().strftime("%d-%m-%Y")}.geojson', driver='GeoJSON')
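To eyeball the duplicate footprints, the GeoDataFrame can also be added to an interactive leafmap map. A sketch; plotting only a small subset keeps the map responsive:
m = leafmap.Map()
m.add_gdf(duplicates_gdf.head(100), layer_name='RTC-S1 duplicates (subset)')
m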