#!/usr/bin/env python
# coding: utf-8

# # Data cleaning and preparation

# ## Imports

# In[1]:


import pandas as pd
import numpy as np
import datetime


# ## Configure data path variables

# In[2]:


# Path where the input csv-files are stored
fpath_src_data_dir = 'src_data/'

# Path where the cleaned data is stored
fpath_clean_data_dir = 'clean_data/'

# Path where the data ready for the ML analysis is stored
fpath_prepared_data_dir = 'ready_data/'


# In[3]:


get_ipython().system('ls {fpath_src_data_dir}*.csv')


# In[4]:


get_ipython().system('mkdir -p {fpath_clean_data_dir}')
get_ipython().system('mkdir -p {fpath_prepared_data_dir}')


# ## Definitions to read in data and what columns to ignore
# According to Thomas Denninger, some columns can be ignored (see below). In a second step, the analysis can and should however be refined, i.e. the information be used nonetheless.
# 
# **Remarks**:
# * We are not taking into consideration the dataset 'hm_mastitis_bact_tfpcr.csv' in this analysis.
# * The columns 'bloodBHB', 'bloodNEFA', 'milkAcetone', 'CH4' of the dataset 'Hm_milkrecording.csv' are removed in this analysis. However, they are stored to a separate file in a different Jupyter notebook.

# In[5]:


data_files = {'hm_BCS.csv': {'delimiter': ';',
                             'columns_to_ignore': []
                            },
            
              'hm_lactation.csv': {'delimiter': ';',
                                   'columns_to_ignore': ['dry_date_prev_lact']
                                  },

              'hm_NSAIET.csv': {'delimiter': ';',
                                'columns_to_ignore': ['ejaculate',
                                                      'INSEM_SEXINGBEHANDLUNG',
                                                      'AI_organization',
                                                      'INSEM_SEXINGGESCHLECHT',
                                                      'idhrd_anon'
                                                     ]
                               },

#               'hm_mastitis_bact_tfpcr.csv': {'delimiter': ';',
#                                              'columns_to_ignore': ['Ct_value']
#                                             },

              'hm_animal.csv': {'delimiter': ';',
                                'columns_to_ignore': ['idbrd', 'idbrd_sire', 'idbrd_dam']
                               },

              'hm_milkrecording.csv': {'delimiter': ';',
                                       'columns_to_ignore': ['idspectrum_num',
                                                             'AR_PESEE_PESCODEMALADEANALYSE',
                                                             'AR_PESEE_PESCODEMALADEPESEE',
                                                             'bloodBHB',
                                                             'bloodNEFA',
                                                             'milkAcetone',
                                                             'CH4',
                                                             'milk_yield_morning_measured',
                                                             'milk_yield_evening_measured'
                                                            ]
                                      },

              'hm_ebv.csv': {'delimiter': ' ',
                             'columns_to_ignore': ['type', 'nDau', 'pubCode', 'rel', 'edc', 'nHerd', 'IVF', 'FIT', 'GZW', 'MIW']
                            },

              'hm_pregnancy.csv': {'delimiter': ';',
                                   'columns_to_ignore': ['pregnancy_detection_method', 'P_N_value']
                                  },

              'hm_health.csv': {'delimiter': ';',
                                'columns_to_ignore': ['intervening_person', 'infected_udder_quarter', 'leg']
                               }
             }

fnames = list(data_files.keys())


# ## Useful functions to clean data

# In[6]:


def read_in_csv(file_path, sep):
    """
    Reads in the specified csv-file.
    
    :param file_path: Path to the csv-file
    :param sep: Separator used in the csv-file
    :return: Pandas dataframe
    """
    return pd.read_csv(file_path, sep=sep)


def save_csv(df, file_path, index=False):
    """
    Save the specified csv-file.
    
    :param df: Pandas dataframe
    :param file_path: Path to the output csv-file
    :param index: Boolean value whether or not the first column (bool) is written to file as well (default: False)
    :return: None
    """
    return df.to_csv(file_path, index=index)


def remove_rows_with_missing_values(df):
    """
    Compute the number of missing values per column and remove all rows with missing values.
    
    :param df: Pandas dataframe
    :return: Pandas dataframe
    """
    
    for _, val in enumerate(df.columns):
        mask = df[val].isnull().values
        if np.sum(mask) > 0:
            print('Column: {:}; Removed missing values: {:.2f}%'.format(val, np.sum(mask)/mask.size*100))
            df = df.drop(np.arange(mask.size)[mask], axis=0)
            df = df.reset_index(drop=True)
    return df


def remove_columns_to_be_ignored(df, columns_to_ignore):
    """
    Remove all columns to be ignored.
    
    :param df: Pandas dataframe
    :param columns_to_ignore: List of the column names of the columns to ignore
    :return: Pandas dataframe
    """
    return df.drop(labels=columns_to_ignore, axis=1)


def convert_column_to_datetime(df, column, format='%Y%m%d', convert_to_int=False):
    """
    Convert a date-like column to a pandas datetime-column.
    
    :param df: Pandas dataframe
    :param column: Column name of the datetime column
    :param format: Format of the date-entry (default: YYYYMMDD)
    :param convert_to_int: Bool value whether the column needs to be converted to an
                           integer column first (default: False)
    :return: Pandas dataframe
    """
    if convert_to_int:
        df[column] = df[column].values.astype(np.int64)
    df[column] = pd.to_datetime(df[column], format='%Y%m%d', errors='coerce').values
    return df


# ## Read in src-data & clean it (with modifications as suggested by Thomas Denninger)
# 
# Mögliche Verbesserungen an den Daten:
# Jahreszeit (wie auch immer man diese definiert :) ) hinzufügen, wann eine Milchleistungsprüfung / Besamung vorgenommen wurde
# Information der Art der Milchleistungsprüfmethode hineinnehmen
# milking_time_morning und milking_time_evening ignorieren
# Die Information AR_PESEE_PESCODEALPAGE (ob auf Alp oder nicht) weiter verfeinern, da sehr viele Einträge den Wert 0 haben
# Information der Herde (idhrd_anon) “retten”, z.B. indem man berechnet wie die Milchleistung im Vergleich der mittleren Milchleistung der Herde ist
# Zeilen entfernen, bei welchen die Spalte label nicht ein “CH” drin hat
# milk_yield_msrmt_type (Typ wie der Milk Yield gemessen wurde) möglicherweise ignorieren
# Werte mit nsaiet_type == Belegung oder == ET entfernen
# Herausfinden, was bei calving_ease schief lief

# In[7]:


for _, fname in enumerate(fnames):
    print('--- Reading in {:} ---'.format(fname))
    df = read_in_csv(file_path=fpath_src_data_dir+fname, sep=data_files[fname]['delimiter'])

    # Convert date columns to datetime objects & sort dataframe
    if fname == 'hm_BCS.csv':
        df = convert_column_to_datetime(df, column='BCS_date', convert_to_int=False)
        df = df.sort_values(by=['idani_anon', 'BCS_date'])
    if fname == 'hm_lactation.csv':
        df = convert_column_to_datetime(df, column='calving_date', convert_to_int=False)
        df = convert_column_to_datetime(df, column='dry_date_prev_lact', convert_to_int=True)
        df = df.sort_values(by=['idani_anon', 'calving_date'])
    if fname == 'hm_NSAIET.csv':
        df = convert_column_to_datetime(df, column='nsaiet_date', convert_to_int=False)
        df = df.sort_values(by=['idani_anon', 'nsaiet_date'])
    if fname == 'hm_animal.csv':
        df = convert_column_to_datetime(df, column='birth_date', convert_to_int=False)
        df = df.sort_values(by=['idani_anon', 'birth_date'])
    if fname == 'hm_milkrecording.csv':
        df = convert_column_to_datetime(df, column='mlksmpl_date', convert_to_int=False)
        df = convert_column_to_datetime(df, column='lab_date', convert_to_int=False)
        df = df.sort_values(by=['idani_anon', 'mlksmpl_date'])
#     if fname == 'hm_ebv.csv':  # Sort afterwards
#         df = df.sort_values(by=['idani_anon'])
    if fname == 'hm_pregnancy.csv':
        df = convert_column_to_datetime(df, column='pregnancy_detection_date', convert_to_int=False)
        df = df.sort_values(by=['idani_anon', 'pregnancy_detection_date'])
    if fname == 'hm_health.csv':
        df = convert_column_to_datetime(df, column='healthevent_date', convert_to_int=False)
        df = df.sort_values(by=['idani_anon', 'healthevent_date'])

    # Modifications to individual datasets, as suggested by Thomas Denninger
    if fname == 'hm_milkrecording.csv':
        # Add new column depending on how the milk_yield was measured
        # "Only milk_yield_24h": 1; "milk_yield_morning_measured": 2; "milk_yield_evening_measured": 3
        new_col = np.ones(df['milk_yield_24h'].size, dtype=np.int)
        new_col[~(df['milk_yield_morning_measured'].isnull().values)] = 2
        new_col[~(df['milk_yield_evening_measured'].isnull().values)] = 3
        df['milk_yield_msrmt_type'] = new_col.copy()

        # Add new column of the fat_24h vs. protein_24h ratio
        new_col = np.ones(df['milk_yield_24h'].size, dtype=np.int)
        new_col[~(df['milk_yield_evening_measured'].isnull().values)] = 3
        df['fat_protein_24h_ratio'] = df['fat_24h'] / df['protein_24h']

        # Replace empty 'AR_PESEE_PESCODEALPAGE' (altitude in m when cow was milked) values with fixed value (350)
        df['AR_PESEE_PESCODEALPAGE'] = df['AR_PESEE_PESCODEALPAGE'].fillna(350)

    if fname == 'hm_lactation.csv':
#         # In a first step, fill empty 'dry_date_prev_lact' values with 'calving_date' + 305 days
#         values = df['dry_date_prev_lact'].values.copy()
#         mask = df['dry_date_prev_lact'].isnull().values & (df['parity'].values > 1)
#         values[mask] = (df['calving_date'][mask] + pd.DateOffset(days=305)).values
#         df['dry_date_prev_lact'] = values.copy()

        # Fill empty 'calving_ease' with constant value 2.5
        df['calving_ease'] = df['calving_ease'].fillna(2.5)

    if fname == 'hm_NSAIET.csv':
        # Overwrite the AI_technician (= ID of person who performed the artificial insemination) column with
        # "nsaiet_type=Belegung": 5; "nsaiet_type=Besamung": 10; "nsaiet_type=ET": 15
        new_col = np.zeros(df['AI_technician'].size, dtype=np.int)
        new_col[df['nsaiet_type'].values == 'Belegung'] = 5
        new_col[df['nsaiet_type'].values == 'Besamung'] = 10
        new_col[df['nsaiet_type'].values == 'ET'] = 15
        df['AI_technician'] = new_col.copy()

    if fname == 'hm_ebv.csv':
        # Replace the values 'XXXX' in the column 'label' with a 'U' (Unbekannt)
        values = df['label'].values.copy()
        mask = df['label'].values == 'XXXX'
        values[mask] = 'U'
        df['label'] = values.copy()
        
        # Convert 'trait' (=genetic trait) and 'estimate' (=value) to individual columns unsing a pivot table 
        column_names = [col for col in list(df.columns.values) if col not in data_files[fname]['columns_to_ignore']]
        column_names.remove('trait')
        column_names.remove('estimate')
        df = pd.DataFrame(df.pivot_table('estimate', column_names, 'trait').to_records())
        
        # Only consider the traits 'mkg', 'fkg', 'fpr', 'ekg', and 'epr' here and remove the rest
        considered_traits = ['ekg', 'epr', 'fkg', 'fpr', 'mkg', 'per', 'scs']
        
        df = df[column_names+considered_traits]
        
        # Sort dataframe
        df = df.sort_values(by=['idani_anon'])
    
    # Remove columns to be ignored and rows with emtpy values
    if fname != 'hm_ebv.csv':
        df = remove_columns_to_be_ignored(df=df, columns_to_ignore=data_files[fname]['columns_to_ignore'])
    
    # Remove rows with missing values
    df = df.reset_index(drop=True)
    df = remove_rows_with_missing_values(df)
    
    assert(0 == df.isnull().sum().sum())
    print(df)
    print()

    # Save file to clean directory
    save_csv(df=df, file_path=fpath_clean_data_dir+fname)


# ### Make an extra file of the hm_milkrecording.csv blood, methane, and acetone values (for other challenge)

# In[8]:


fname = 'hm_milkrecording.csv'
columns_to_ignore = ['idspectrum_num',
                     'AR_PESEE_PESCODEMALADEANALYSE',
                     'AR_PESEE_PESCODEMALADEPESEE',
                     'milk_yield_morning_measured',
                     'milk_yield_evening_measured'
                    ]

df = read_in_csv(file_path=fpath_src_data_dir+fname, sep=data_files[fname]['delimiter'])

# Convert date columns to datetime objects & sort dataframe
df = convert_column_to_datetime(df, column='mlksmpl_date', convert_to_int=False)
df = df.sort_values(by=['idani_anon', 'mlksmpl_date'])

# Add new column depending on how the milk_yield was measured
# "Only milk_yield_24h": 1; "milk_yield_morning_measured": 2; "milk_yield_evening_measured": 3
new_col = np.ones(df['milk_yield_24h'].size, dtype=np.int)
new_col[~(df['milk_yield_morning_measured'].isnull().values)] = 2
new_col[~(df['milk_yield_evening_measured'].isnull().values)] = 3
df['milk_yield_msrmt_type'] = new_col.copy()

# Replace empty 'AR_PESEE_PESCODEALPAGE' (altitude in m when cow was milked) values with fixed value (350)
df['AR_PESEE_PESCODEALPAGE'] = df['AR_PESEE_PESCODEALPAGE'].fillna(350)

# Remove columns to be ignored
df = remove_columns_to_be_ignored(df=df, columns_to_ignore=columns_to_ignore)

# Remove rows with emtpy values
df = df.reset_index(drop=True)
df = remove_rows_with_missing_values(df)

assert(0 == df.isnull().sum().sum())

# Save file to clean directory
save_csv(df=df, file_path=fpath_clean_data_dir+'hm_milkrecording_bloodvalues.csv')


# In[ ]: