#!/usr/bin/env python # coding: utf-8 # # Data cleaning and preparation # ## Imports # In[1]: import pandas as pd import numpy as np import datetime # ## Configure data path variables # In[2]: # Path where the input csv-files are stored fpath_src_data_dir = 'src_data/' # Path where the cleaned data is stored fpath_clean_data_dir = 'clean_data/' # Path where the data ready for the ML analysis is stored fpath_prepared_data_dir = 'ready_data/' # In[3]: get_ipython().system('ls {fpath_src_data_dir}*.csv') # In[4]: get_ipython().system('mkdir -p {fpath_clean_data_dir}') get_ipython().system('mkdir -p {fpath_prepared_data_dir}') # ## Definitions to read in data and what columns to ignore # According to Thomas Denninger, some columns can be ignored (see below). In a second step, the analysis can and should however be refined, i.e. the information be used nonetheless. # # **Remarks**: # * We are not taking into consideration the dataset 'hm_mastitis_bact_tfpcr.csv' in this analysis. # * The columns 'bloodBHB', 'bloodNEFA', 'milkAcetone', 'CH4' of the dataset 'Hm_milkrecording.csv' are removed in this analysis. However, they are stored to a separate file in a different Jupyter notebook. # In[5]: data_files = {'hm_BCS.csv': {'delimiter': ';', 'columns_to_ignore': [] }, 'hm_lactation.csv': {'delimiter': ';', 'columns_to_ignore': ['dry_date_prev_lact'] }, 'hm_NSAIET.csv': {'delimiter': ';', 'columns_to_ignore': ['ejaculate', 'INSEM_SEXINGBEHANDLUNG', 'AI_organization', 'INSEM_SEXINGGESCHLECHT', 'idhrd_anon' ] }, # 'hm_mastitis_bact_tfpcr.csv': {'delimiter': ';', # 'columns_to_ignore': ['Ct_value'] # }, 'hm_animal.csv': {'delimiter': ';', 'columns_to_ignore': ['idbrd', 'idbrd_sire', 'idbrd_dam'] }, 'hm_milkrecording.csv': {'delimiter': ';', 'columns_to_ignore': ['idspectrum_num', 'AR_PESEE_PESCODEMALADEANALYSE', 'AR_PESEE_PESCODEMALADEPESEE', 'bloodBHB', 'bloodNEFA', 'milkAcetone', 'CH4', 'milk_yield_morning_measured', 'milk_yield_evening_measured' ] }, 'hm_ebv.csv': {'delimiter': ' ', 'columns_to_ignore': ['type', 'nDau', 'pubCode', 'rel', 'edc', 'nHerd', 'IVF', 'FIT', 'GZW', 'MIW'] }, 'hm_pregnancy.csv': {'delimiter': ';', 'columns_to_ignore': ['pregnancy_detection_method', 'P_N_value'] }, 'hm_health.csv': {'delimiter': ';', 'columns_to_ignore': ['intervening_person', 'infected_udder_quarter', 'leg'] } } fnames = list(data_files.keys()) # ## Useful functions to clean data # In[6]: def read_in_csv(file_path, sep): """ Reads in the specified csv-file. :param file_path: Path to the csv-file :param sep: Separator used in the csv-file :return: Pandas dataframe """ return pd.read_csv(file_path, sep=sep) def save_csv(df, file_path, index=False): """ Save the specified csv-file. :param df: Pandas dataframe :param file_path: Path to the output csv-file :param index: Boolean value whether or not the first column (bool) is written to file as well (default: False) :return: None """ return df.to_csv(file_path, index=index) def remove_rows_with_missing_values(df): """ Compute the number of missing values per column and remove all rows with missing values. :param df: Pandas dataframe :return: Pandas dataframe """ for _, val in enumerate(df.columns): mask = df[val].isnull().values if np.sum(mask) > 0: print('Column: {:}; Removed missing values: {:.2f}%'.format(val, np.sum(mask)/mask.size*100)) df = df.drop(np.arange(mask.size)[mask], axis=0) df = df.reset_index(drop=True) return df def remove_columns_to_be_ignored(df, columns_to_ignore): """ Remove all columns to be ignored. :param df: Pandas dataframe :param columns_to_ignore: List of the column names of the columns to ignore :return: Pandas dataframe """ return df.drop(labels=columns_to_ignore, axis=1) def convert_column_to_datetime(df, column, format='%Y%m%d', convert_to_int=False): """ Convert a date-like column to a pandas datetime-column. :param df: Pandas dataframe :param column: Column name of the datetime column :param format: Format of the date-entry (default: YYYYMMDD) :param convert_to_int: Bool value whether the column needs to be converted to an integer column first (default: False) :return: Pandas dataframe """ if convert_to_int: df[column] = df[column].values.astype(np.int64) df[column] = pd.to_datetime(df[column], format='%Y%m%d', errors='coerce').values return df # ## Read in src-data & clean it (with modifications as suggested by Thomas Denninger) # # Mögliche Verbesserungen an den Daten: # Jahreszeit (wie auch immer man diese definiert :) ) hinzufügen, wann eine Milchleistungsprüfung / Besamung vorgenommen wurde # Information der Art der Milchleistungsprüfmethode hineinnehmen # milking_time_morning und milking_time_evening ignorieren # Die Information AR_PESEE_PESCODEALPAGE (ob auf Alp oder nicht) weiter verfeinern, da sehr viele Einträge den Wert 0 haben # Information der Herde (idhrd_anon) “retten”, z.B. indem man berechnet wie die Milchleistung im Vergleich der mittleren Milchleistung der Herde ist # Zeilen entfernen, bei welchen die Spalte label nicht ein “CH” drin hat # milk_yield_msrmt_type (Typ wie der Milk Yield gemessen wurde) möglicherweise ignorieren # Werte mit nsaiet_type == Belegung oder == ET entfernen # Herausfinden, was bei calving_ease schief lief # In[7]: for _, fname in enumerate(fnames): print('--- Reading in {:} ---'.format(fname)) df = read_in_csv(file_path=fpath_src_data_dir+fname, sep=data_files[fname]['delimiter']) # Convert date columns to datetime objects & sort dataframe if fname == 'hm_BCS.csv': df = convert_column_to_datetime(df, column='BCS_date', convert_to_int=False) df = df.sort_values(by=['idani_anon', 'BCS_date']) if fname == 'hm_lactation.csv': df = convert_column_to_datetime(df, column='calving_date', convert_to_int=False) df = convert_column_to_datetime(df, column='dry_date_prev_lact', convert_to_int=True) df = df.sort_values(by=['idani_anon', 'calving_date']) if fname == 'hm_NSAIET.csv': df = convert_column_to_datetime(df, column='nsaiet_date', convert_to_int=False) df = df.sort_values(by=['idani_anon', 'nsaiet_date']) if fname == 'hm_animal.csv': df = convert_column_to_datetime(df, column='birth_date', convert_to_int=False) df = df.sort_values(by=['idani_anon', 'birth_date']) if fname == 'hm_milkrecording.csv': df = convert_column_to_datetime(df, column='mlksmpl_date', convert_to_int=False) df = convert_column_to_datetime(df, column='lab_date', convert_to_int=False) df = df.sort_values(by=['idani_anon', 'mlksmpl_date']) # if fname == 'hm_ebv.csv': # Sort afterwards # df = df.sort_values(by=['idani_anon']) if fname == 'hm_pregnancy.csv': df = convert_column_to_datetime(df, column='pregnancy_detection_date', convert_to_int=False) df = df.sort_values(by=['idani_anon', 'pregnancy_detection_date']) if fname == 'hm_health.csv': df = convert_column_to_datetime(df, column='healthevent_date', convert_to_int=False) df = df.sort_values(by=['idani_anon', 'healthevent_date']) # Modifications to individual datasets, as suggested by Thomas Denninger if fname == 'hm_milkrecording.csv': # Add new column depending on how the milk_yield was measured # "Only milk_yield_24h": 1; "milk_yield_morning_measured": 2; "milk_yield_evening_measured": 3 new_col = np.ones(df['milk_yield_24h'].size, dtype=np.int) new_col[~(df['milk_yield_morning_measured'].isnull().values)] = 2 new_col[~(df['milk_yield_evening_measured'].isnull().values)] = 3 df['milk_yield_msrmt_type'] = new_col.copy() # Add new column of the fat_24h vs. protein_24h ratio new_col = np.ones(df['milk_yield_24h'].size, dtype=np.int) new_col[~(df['milk_yield_evening_measured'].isnull().values)] = 3 df['fat_protein_24h_ratio'] = df['fat_24h'] / df['protein_24h'] # Replace empty 'AR_PESEE_PESCODEALPAGE' (altitude in m when cow was milked) values with fixed value (350) df['AR_PESEE_PESCODEALPAGE'] = df['AR_PESEE_PESCODEALPAGE'].fillna(350) if fname == 'hm_lactation.csv': # # In a first step, fill empty 'dry_date_prev_lact' values with 'calving_date' + 305 days # values = df['dry_date_prev_lact'].values.copy() # mask = df['dry_date_prev_lact'].isnull().values & (df['parity'].values > 1) # values[mask] = (df['calving_date'][mask] + pd.DateOffset(days=305)).values # df['dry_date_prev_lact'] = values.copy() # Fill empty 'calving_ease' with constant value 2.5 df['calving_ease'] = df['calving_ease'].fillna(2.5) if fname == 'hm_NSAIET.csv': # Overwrite the AI_technician (= ID of person who performed the artificial insemination) column with # "nsaiet_type=Belegung": 5; "nsaiet_type=Besamung": 10; "nsaiet_type=ET": 15 new_col = np.zeros(df['AI_technician'].size, dtype=np.int) new_col[df['nsaiet_type'].values == 'Belegung'] = 5 new_col[df['nsaiet_type'].values == 'Besamung'] = 10 new_col[df['nsaiet_type'].values == 'ET'] = 15 df['AI_technician'] = new_col.copy() if fname == 'hm_ebv.csv': # Replace the values 'XXXX' in the column 'label' with a 'U' (Unbekannt) values = df['label'].values.copy() mask = df['label'].values == 'XXXX' values[mask] = 'U' df['label'] = values.copy() # Convert 'trait' (=genetic trait) and 'estimate' (=value) to individual columns unsing a pivot table column_names = [col for col in list(df.columns.values) if col not in data_files[fname]['columns_to_ignore']] column_names.remove('trait') column_names.remove('estimate') df = pd.DataFrame(df.pivot_table('estimate', column_names, 'trait').to_records()) # Only consider the traits 'mkg', 'fkg', 'fpr', 'ekg', and 'epr' here and remove the rest considered_traits = ['ekg', 'epr', 'fkg', 'fpr', 'mkg', 'per', 'scs'] df = df[column_names+considered_traits] # Sort dataframe df = df.sort_values(by=['idani_anon']) # Remove columns to be ignored and rows with emtpy values if fname != 'hm_ebv.csv': df = remove_columns_to_be_ignored(df=df, columns_to_ignore=data_files[fname]['columns_to_ignore']) # Remove rows with missing values df = df.reset_index(drop=True) df = remove_rows_with_missing_values(df) assert(0 == df.isnull().sum().sum()) print(df) print() # Save file to clean directory save_csv(df=df, file_path=fpath_clean_data_dir+fname) # ### Make an extra file of the hm_milkrecording.csv blood, methane, and acetone values (for other challenge) # In[8]: fname = 'hm_milkrecording.csv' columns_to_ignore = ['idspectrum_num', 'AR_PESEE_PESCODEMALADEANALYSE', 'AR_PESEE_PESCODEMALADEPESEE', 'milk_yield_morning_measured', 'milk_yield_evening_measured' ] df = read_in_csv(file_path=fpath_src_data_dir+fname, sep=data_files[fname]['delimiter']) # Convert date columns to datetime objects & sort dataframe df = convert_column_to_datetime(df, column='mlksmpl_date', convert_to_int=False) df = df.sort_values(by=['idani_anon', 'mlksmpl_date']) # Add new column depending on how the milk_yield was measured # "Only milk_yield_24h": 1; "milk_yield_morning_measured": 2; "milk_yield_evening_measured": 3 new_col = np.ones(df['milk_yield_24h'].size, dtype=np.int) new_col[~(df['milk_yield_morning_measured'].isnull().values)] = 2 new_col[~(df['milk_yield_evening_measured'].isnull().values)] = 3 df['milk_yield_msrmt_type'] = new_col.copy() # Replace empty 'AR_PESEE_PESCODEALPAGE' (altitude in m when cow was milked) values with fixed value (350) df['AR_PESEE_PESCODEALPAGE'] = df['AR_PESEE_PESCODEALPAGE'].fillna(350) # Remove columns to be ignored df = remove_columns_to_be_ignored(df=df, columns_to_ignore=columns_to_ignore) # Remove rows with emtpy values df = df.reset_index(drop=True) df = remove_rows_with_missing_values(df) assert(0 == df.isnull().sum().sum()) # Save file to clean directory save_csv(df=df, file_path=fpath_clean_data_dir+'hm_milkrecording_bloodvalues.csv') # In[ ]: