#!/usr/bin/env python
# coding: utf-8

# [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ThomasAlbin/Astroniz-YT-Tutorials/blob/main/[ML1]-Asteroid-Spectra/2_data_parse.ipynb)

# # Step 2: Data Parsing
#
# This notebook takes the downloaded files, parses and cleans the data, and merges the taxonomy classification with the spectra data. The final dataset is stored in a Level 1 directory for further computations. Further, a first clean-up is performed.

# In[1]:


# Import standard libraries
import glob
import os
import pathlib
import re

# Import installed libraries
import pandas as pd


# In[2]:


# Let's mount the Google Drive, where we store files and models (if applicable, otherwise work
# locally)
try:
    from google.colab import drive
    drive.mount('/gdrive')
    core_path = "/gdrive/MyDrive/Colab/asteroid_taxonomy/"
except ModuleNotFoundError:
    core_path = ""


# In[3]:


# Get a sorted list of all spectra files (consider only the spfit files that have been explained in
# the references)
spectra_filepaths = sorted(glob.glob(os.path.join(core_path, "data/lvl0/", "smass2/*spfit*")))


# ## Asteroid Designation Separation
#
# The spectra files follow two naming conventions: file names starting with "a" and file names starting with "au". The first case corresponds to asteroids with an actual designation number (like (1) Ceres). The second case covers asteroids that had only a temporary designation at the time of the data release (like 1995 BM2). Later, these spectra need to be joined with the taxonomy class file.

# In[4]:


# Separate the file paths into designation and non-designation files
des_file_paths = spectra_filepaths[:-8]
non_file_paths = spectra_filepaths[-8:]

# Convert the lists to dataframes
des_file_paths_df = pd.DataFrame(des_file_paths, columns=["FilePath"])
non_file_paths_df = pd.DataFrame(non_file_paths, columns=["FilePath"])


# In[5]:


# Add now the designation / "non"-designation number
des_file_paths_df.loc[:, "DesNr"] = des_file_paths_df["FilePath"] \
    .apply(lambda x: int(re.search(r'smass2/a(.*).spfit', x).group(1)))
non_file_paths_df.loc[:, "DesNr"] = non_file_paths_df["FilePath"] \
    .apply(lambda x: re.search(r'smass2/au(.*).spfit', x).group(1))


# ## Taxonomy Classification
#
# Now we read the taxonomy classification file. Theoretically, the file has only 3 columns (asteroid name, Tholen classification and Bus classification) with a rather large header. However, due to some formatting errors and the mixed use of white spaces and tab stops, Pandas identifies 5 columns in total.
#
# Since one cannot assign these "unknown" classes correctly to either Tholen or Bus, the corresponding rows are deleted later.
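# The following optional cell is not part of the original notebook; it is a minimal sketch of how
# one could preview the raw taxonomy file to verify the header length (skiprows=21 in the next
# cell) and the mixed white space / tab delimiters before parsing it with Pandas. It assumes the
# file has already been downloaded to data/lvl0/ in Step 1.

# In[ ]:


# Print the first few raw lines of the taxonomy file (repr() makes tabs and spaces visible)
with open(os.path.join(core_path, "data/lvl0/", "Bus.Taxonomy.txt"), "r") as temp_file:
    for _line_nr, _raw_line in enumerate(temp_file):
        if _line_nr > 25:
            break
        print(_line_nr, repr(_raw_line))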
# In[6]:


# Read the classification file
asteroid_class_df = pd.read_csv(os.path.join(core_path, "data/lvl0/", "Bus.Taxonomy.txt"),
                                skiprows=21,
                                sep="\t",
                                names=["Name",
                                       "Tholen_Class",
                                       "Bus_Class",
                                       "unknown1",
                                       "unknown2"
                                      ]
                               )

# Remove leading and trailing white spaces from the names
asteroid_class_df.loc[:, "Name"] = asteroid_class_df["Name"].apply(lambda x: x.strip()).copy()

# Separate between designated and non-designated asteroid classes
des_ast_class_df = asteroid_class_df[:1403].copy()
non_ast_class_df = asteroid_class_df[1403:].copy()


# In[7]:


# Now split the designated names and get the designation number (to link with the spfit files)
des_ast_class_df.loc[:, "DesNr"] = des_ast_class_df["Name"].apply(lambda x: int(x.split(" ")[0]))

# Merge with the spectra file paths
des_ast_class_join_df = des_ast_class_df.merge(des_file_paths_df, on="DesNr")

# For the non-designated names, one needs to remove the white space within the temporary
# designation (e.g., "1995 BM2" becomes "1995BM2") to compare with the file paths
non_ast_class_df.loc[:, "DesNr"] = non_ast_class_df["Name"].apply(lambda x: x.replace(" ", ""))

# Merge with the spectra file paths
non_ast_class_join_df = non_ast_class_df.merge(non_file_paths_df, on="DesNr")


# In[8]:


# Merge now both datasets
asteroids_df = pd.concat([des_ast_class_join_df, non_ast_class_join_df], axis=0)

# Reset the index
asteroids_df.reset_index(drop=True, inplace=True)

# Remove the Tholen class and both unknown columns
asteroids_df.drop(columns=["Tholen_Class", "unknown1", "unknown2"], inplace=True)

# Drop now all rows that do not contain a Bus Class
asteroids_df.dropna(subset=["Bus_Class"], inplace=True)


# ## Read and store the Spectra into a dataframe

# In[9]:


# Read and store the spectra
asteroids_df.loc[:, "SpectrumDF"] = \
    asteroids_df["FilePath"].apply(lambda x: pd.read_csv(x, sep="\t",
                                                         names=["Wavelength_in_microm",
                                                                "Reflectance_norm550nm"]
                                                        )
                                  )

# Reset the index
asteroids_df.reset_index(drop=True, inplace=True)

# Convert the Designation Number to string
asteroids_df.loc[:, "DesNr"] = asteroids_df["DesNr"].astype(str)


# In[10]:


# Create (if applicable) the level 1 directory
pathlib.Path(os.path.join(core_path, "data/lvl1")).mkdir(parents=True, exist_ok=True)

# Save the dataframe as a pickle file
asteroids_df.to_pickle(os.path.join(core_path, "data/lvl1/", "asteroids_merged.pkl"), protocol=4)
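# The following optional cell is not part of the original notebook; it is a minimal sketch showing
# how the stored Level 1 dataframe could be re-loaded and inspected as a sanity check (assuming
# the pickle file was written to the path used above).

# In[ ]:


# Re-load the merged dataframe and print some basic information
_asteroids_check_df = pd.read_pickle(os.path.join(core_path, "data/lvl1/", "asteroids_merged.pkl"))

print(f"Number of spectra with a Bus Class: {len(_asteroids_check_df)}")
print(f"Columns: {_asteroids_check_df.columns.to_list()}")

# Each row carries its spectrum as a nested dataframe; show the first rows of the first spectrum
print(_asteroids_check_df.loc[0, "SpectrumDF"].head())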