#!/usr/bin/env python
# coding: utf-8

# [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ThomasAlbin/Astroniz-YT-Tutorials/blob/main/[ML1]-Asteroid-Spectra/2_data_parse.ipynb)

# # Step 2: Data Parsing
#
# This notebook takes the downloaded files, parses and cleans the data, and merges the taxonomy classification with the spectra data. The final dataset is stored in a Level 1 directory for further computations. Further, a first clean-up is performed.

# In[1]:


# Import standard libraries
import glob
import os
import pathlib
import re

# Import installed libraries
import pandas as pd


# In[2]:


# Let's mount the Google Drive, where we store files and models (if applicable, otherwise work
# locally)
try:
    from google.colab import drive
    drive.mount('/gdrive')
    core_path = "/gdrive/MyDrive/Colab/asteroid_taxonomy/"
except ModuleNotFoundError:
    core_path = ""


# In[3]:


# Get a sorted list of all spectra files (consider only the spfit files that have been explained in
# the references)
spectra_filepaths = sorted(glob.glob(os.path.join(core_path, "data/lvl0/", "smass2/*spfit*")))


# ## Asteroid Designation Separation
#
# The spectra files follow two naming conventions: file names starting with "a" and file names starting with "au". The first case corresponds to asteroids with an actual designation number (like (1) Ceres). The second case covers asteroids that had only a temporary designation at the time of the data release (like 1995 BM2). Later, these spectra need to be joined with the taxonomy class file.

# In[4]:


# Separate the file paths into designation and non-designation files
des_file_paths = spectra_filepaths[:-8]
non_file_paths = spectra_filepaths[-8:]

# Convert the lists to dataframes
des_file_paths_df = pd.DataFrame(des_file_paths, columns=["FilePath"])
non_file_paths_df = pd.DataFrame(non_file_paths, columns=["FilePath"])


# In[5]:


# Add now the designation / "non"-designation number
des_file_paths_df.loc[:, "DesNr"] = des_file_paths_df["FilePath"] \
    .apply(lambda x: int(re.search(r'smass2/a(.*).spfit', x).group(1)))
non_file_paths_df.loc[:, "DesNr"] = non_file_paths_df["FilePath"] \
    .apply(lambda x: re.search(r'smass2/au(.*).spfit', x).group(1))


# ## Taxonomy Classification
#
# Now we read the taxonomy classification file. Theoretically, the file has only 3 columns (asteroid name, Tholen classification and Bus classification) with a rather large header. However, due to some formatting errors and the mixed use of white spaces and tab stops, Pandas identifies 5 columns in total.
#
# Since one cannot assign these "unknown" classes correctly to either Tholen or Bus, the corresponding rows are deleted later.
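# The following optional cell is not part of the original notebook; it is a minimal sketch of how
# one could preview the raw taxonomy file to verify the header length (skiprows=21 in the next
# cell) and the mixed white space / tab delimiters before parsing it with Pandas. It assumes the
# file has already been downloaded to data/lvl0/ in Step 1.

# In[ ]:


# Print the first few raw lines of the taxonomy file (repr() makes tabs and spaces visible)
with open(os.path.join(core_path, "data/lvl0/", "Bus.Taxonomy.txt"), "r") as temp_file:
    for _line_nr, _raw_line in enumerate(temp_file):
        if _line_nr > 25:
            break
        print(_line_nr, repr(_raw_line))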
# In[6]:


# Read the classification file
asteroid_class_df = pd.read_csv(os.path.join(core_path, "data/lvl0/", "Bus.Taxonomy.txt"),
                                skiprows=21,
                                sep="\t",
                                names=["Name",
                                       "Tholen_Class",
                                       "Bus_Class",
                                       "unknown1",
                                       "unknown2"
                                      ]
                               )

# Remove leading and trailing white spaces from the names
asteroid_class_df.loc[:, "Name"] = asteroid_class_df["Name"].apply(lambda x: x.strip()).copy()

# Separate between designated and non-designated asteroid classes
des_ast_class_df = asteroid_class_df[:1403].copy()
non_ast_class_df = asteroid_class_df[1403:].copy()


# In[7]:


# Now split the designated names and get the designation number (to link with the spfit files)
des_ast_class_df.loc[:, "DesNr"] = des_ast_class_df["Name"].apply(lambda x: int(x.split(" ")[0]))

# Merge with the spectra file paths
des_ast_class_join_df = des_ast_class_df.merge(des_file_paths_df, on="DesNr")

# For the non-designated names, one needs to remove the white space within the temporary
# designation (e.g., "1995 BM2" becomes "1995BM2") to compare with the file paths
non_ast_class_df.loc[:, "DesNr"] = non_ast_class_df["Name"].apply(lambda x: x.replace(" ", ""))

# Merge with the spectra file paths
non_ast_class_join_df = non_ast_class_df.merge(non_file_paths_df, on="DesNr")


# In[8]:


# Merge now both datasets
asteroids_df = pd.concat([des_ast_class_join_df, non_ast_class_join_df], axis=0)

# Reset the index
asteroids_df.reset_index(drop=True, inplace=True)

# Remove the Tholen class and both unknown columns
asteroids_df.drop(columns=["Tholen_Class", "unknown1", "unknown2"], inplace=True)

# Drop now all rows that do not contain a Bus Class
asteroids_df.dropna(subset=["Bus_Class"], inplace=True)


# ## Read and store the Spectra into a dataframe

# In[9]:


# Read and store the spectra
asteroids_df.loc[:, "SpectrumDF"] = \
    asteroids_df["FilePath"].apply(lambda x: pd.read_csv(x, sep="\t",
                                                         names=["Wavelength_in_microm",
                                                                "Reflectance_norm550nm"]
                                                        )
                                  )

# Reset the index
asteroids_df.reset_index(drop=True, inplace=True)

# Convert the Designation Number to string
asteroids_df.loc[:, "DesNr"] = asteroids_df["DesNr"].astype(str)


# In[10]:


# Create (if applicable) the level 1 directory
pathlib.Path(os.path.join(core_path, "data/lvl1")).mkdir(parents=True, exist_ok=True)

# Save the dataframe as a pickle file
asteroids_df.to_pickle(os.path.join(core_path, "data/lvl1/", "asteroids_merged.pkl"), protocol=4)
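# The following optional cell is not part of the original notebook; it is a minimal sketch showing
# how the stored Level 1 dataframe could be re-loaded and inspected as a sanity check (assuming
# the pickle file was written to the path used above).

# In[ ]:


# Re-load the merged dataframe and print some basic information
_asteroids_check_df = pd.read_pickle(os.path.join(core_path, "data/lvl1/", "asteroids_merged.pkl"))

print(f"Number of spectra with a Bus Class: {len(_asteroids_check_df)}")
print(f"Columns: {_asteroids_check_df.columns.to_list()}")

# Each row carries its spectrum as a nested dataframe; show the first rows of the first spectrum
print(_asteroids_check_df.loc[0, "SpectrumDF"].head())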