import pandas as pd
import numpy as np
import datetime
# Path where the input csv-files are stored
fpath_src_data_dir = 'src_data/'
# Path where the cleaned data is stored
fpath_clean_data_dir = 'clean_data/'
# Path where the data ready for the ML analysis is stored
fpath_prepared_data_dir = 'ready_data/'
!ls {fpath_src_data_dir}*.csv
!mkdir -p {fpath_clean_data_dir}
!mkdir -p {fpath_prepared_data_dir}
According to Thomas Denninger, some columns can be ignored (see below). In a second step, the analysis can and should however be refined, i.e. the information be used nonetheless.
Remarks:
data_files = {'hm_BCS.csv': {'delimiter': ';',
'columns_to_ignore': []
},
'hm_lactation.csv': {'delimiter': ';',
'columns_to_ignore': ['dry_date_prev_lact']
},
'hm_NSAIET.csv': {'delimiter': ';',
'columns_to_ignore': ['ejaculate',
'INSEM_SEXINGBEHANDLUNG',
'AI_organization',
'INSEM_SEXINGGESCHLECHT',
'idhrd_anon'
]
},
# 'hm_mastitis_bact_tfpcr.csv': {'delimiter': ';',
# 'columns_to_ignore': ['Ct_value']
# },
'hm_animal.csv': {'delimiter': ';',
'columns_to_ignore': ['idbrd', 'idbrd_sire', 'idbrd_dam']
},
'hm_milkrecording.csv': {'delimiter': ';',
'columns_to_ignore': ['idspectrum_num',
'AR_PESEE_PESCODEMALADEANALYSE',
'AR_PESEE_PESCODEMALADEPESEE',
'bloodBHB',
'bloodNEFA',
'milkAcetone',
'CH4',
'milk_yield_morning_measured',
'milk_yield_evening_measured'
]
},
'hm_ebv.csv': {'delimiter': ' ',
'columns_to_ignore': ['type', 'nDau', 'pubCode', 'rel', 'edc', 'nHerd', 'IVF', 'FIT', 'GZW', 'MIW']
},
'hm_pregnancy.csv': {'delimiter': ';',
'columns_to_ignore': ['pregnancy_detection_method', 'P_N_value']
},
'hm_health.csv': {'delimiter': ';',
'columns_to_ignore': ['intervening_person', 'infected_udder_quarter', 'leg']
}
}
fnames = list(data_files.keys())
def read_in_csv(file_path, sep):
"""
Reads in the specified csv-file.
:param file_path: Path to the csv-file
:param sep: Separator used in the csv-file
:return: Pandas dataframe
"""
return pd.read_csv(file_path, sep=sep)
def save_csv(df, file_path, index=False):
"""
Save the specified csv-file.
:param df: Pandas dataframe
:param file_path: Path to the output csv-file
:param index: Boolean value whether or not the first column (bool) is written to file as well (default: False)
:return: None
"""
return df.to_csv(file_path, index=index)
def remove_rows_with_missing_values(df):
"""
Compute the number of missing values per column and remove all rows with missing values.
:param df: Pandas dataframe
:return: Pandas dataframe
"""
for _, val in enumerate(df.columns):
mask = df[val].isnull().values
if np.sum(mask) > 0:
print('Column: {:}; Removed missing values: {:.2f}%'.format(val, np.sum(mask)/mask.size*100))
df = df.drop(np.arange(mask.size)[mask], axis=0)
df = df.reset_index(drop=True)
return df
def remove_columns_to_be_ignored(df, columns_to_ignore):
"""
Remove all columns to be ignored.
:param df: Pandas dataframe
:param columns_to_ignore: List of the column names of the columns to ignore
:return: Pandas dataframe
"""
return df.drop(labels=columns_to_ignore, axis=1)
def convert_column_to_datetime(df, column, format='%Y%m%d', convert_to_int=False):
"""
Convert a date-like column to a pandas datetime-column.
:param df: Pandas dataframe
:param column: Column name of the datetime column
:param format: Format of the date-entry (default: YYYYMMDD)
:param convert_to_int: Bool value whether the column needs to be converted to an
integer column first (default: False)
:return: Pandas dataframe
"""
if convert_to_int:
df[column] = df[column].values.astype(np.int64)
df[column] = pd.to_datetime(df[column], format='%Y%m%d', errors='coerce').values
return df
Mögliche Verbesserungen an den Daten: Jahreszeit (wie auch immer man diese definiert :) ) hinzufügen, wann eine Milchleistungsprüfung / Besamung vorgenommen wurde Information der Art der Milchleistungsprüfmethode hineinnehmen milking_time_morning und milking_time_evening ignorieren Die Information AR_PESEE_PESCODEALPAGE (ob auf Alp oder nicht) weiter verfeinern, da sehr viele Einträge den Wert 0 haben Information der Herde (idhrd_anon) “retten”, z.B. indem man berechnet wie die Milchleistung im Vergleich der mittleren Milchleistung der Herde ist Zeilen entfernen, bei welchen die Spalte label nicht ein “CH” drin hat milk_yield_msrmt_type (Typ wie der Milk Yield gemessen wurde) möglicherweise ignorieren Werte mit nsaiet_type == Belegung oder == ET entfernen Herausfinden, was bei calving_ease schief lief
for _, fname in enumerate(fnames):
print('--- Reading in {:} ---'.format(fname))
df = read_in_csv(file_path=fpath_src_data_dir+fname, sep=data_files[fname]['delimiter'])
# Convert date columns to datetime objects & sort dataframe
if fname == 'hm_BCS.csv':
df = convert_column_to_datetime(df, column='BCS_date', convert_to_int=False)
df = df.sort_values(by=['idani_anon', 'BCS_date'])
if fname == 'hm_lactation.csv':
df = convert_column_to_datetime(df, column='calving_date', convert_to_int=False)
df = convert_column_to_datetime(df, column='dry_date_prev_lact', convert_to_int=True)
df = df.sort_values(by=['idani_anon', 'calving_date'])
if fname == 'hm_NSAIET.csv':
df = convert_column_to_datetime(df, column='nsaiet_date', convert_to_int=False)
df = df.sort_values(by=['idani_anon', 'nsaiet_date'])
if fname == 'hm_animal.csv':
df = convert_column_to_datetime(df, column='birth_date', convert_to_int=False)
df = df.sort_values(by=['idani_anon', 'birth_date'])
if fname == 'hm_milkrecording.csv':
df = convert_column_to_datetime(df, column='mlksmpl_date', convert_to_int=False)
df = convert_column_to_datetime(df, column='lab_date', convert_to_int=False)
df = df.sort_values(by=['idani_anon', 'mlksmpl_date'])
# if fname == 'hm_ebv.csv': # Sort afterwards
# df = df.sort_values(by=['idani_anon'])
if fname == 'hm_pregnancy.csv':
df = convert_column_to_datetime(df, column='pregnancy_detection_date', convert_to_int=False)
df = df.sort_values(by=['idani_anon', 'pregnancy_detection_date'])
if fname == 'hm_health.csv':
df = convert_column_to_datetime(df, column='healthevent_date', convert_to_int=False)
df = df.sort_values(by=['idani_anon', 'healthevent_date'])
# Modifications to individual datasets, as suggested by Thomas Denninger
if fname == 'hm_milkrecording.csv':
# Add new column depending on how the milk_yield was measured
# "Only milk_yield_24h": 1; "milk_yield_morning_measured": 2; "milk_yield_evening_measured": 3
new_col = np.ones(df['milk_yield_24h'].size, dtype=np.int)
new_col[~(df['milk_yield_morning_measured'].isnull().values)] = 2
new_col[~(df['milk_yield_evening_measured'].isnull().values)] = 3
df['milk_yield_msrmt_type'] = new_col.copy()
# Add new column of the fat_24h vs. protein_24h ratio
new_col = np.ones(df['milk_yield_24h'].size, dtype=np.int)
new_col[~(df['milk_yield_evening_measured'].isnull().values)] = 3
df['fat_protein_24h_ratio'] = df['fat_24h'] / df['protein_24h']
# Replace empty 'AR_PESEE_PESCODEALPAGE' (altitude in m when cow was milked) values with fixed value (350)
df['AR_PESEE_PESCODEALPAGE'] = df['AR_PESEE_PESCODEALPAGE'].fillna(350)
if fname == 'hm_lactation.csv':
# # In a first step, fill empty 'dry_date_prev_lact' values with 'calving_date' + 305 days
# values = df['dry_date_prev_lact'].values.copy()
# mask = df['dry_date_prev_lact'].isnull().values & (df['parity'].values > 1)
# values[mask] = (df['calving_date'][mask] + pd.DateOffset(days=305)).values
# df['dry_date_prev_lact'] = values.copy()
# Fill empty 'calving_ease' with constant value 2.5
df['calving_ease'] = df['calving_ease'].fillna(2.5)
if fname == 'hm_NSAIET.csv':
# Overwrite the AI_technician (= ID of person who performed the artificial insemination) column with
# "nsaiet_type=Belegung": 5; "nsaiet_type=Besamung": 10; "nsaiet_type=ET": 15
new_col = np.zeros(df['AI_technician'].size, dtype=np.int)
new_col[df['nsaiet_type'].values == 'Belegung'] = 5
new_col[df['nsaiet_type'].values == 'Besamung'] = 10
new_col[df['nsaiet_type'].values == 'ET'] = 15
df['AI_technician'] = new_col.copy()
if fname == 'hm_ebv.csv':
# Replace the values 'XXXX' in the column 'label' with a 'U' (Unbekannt)
values = df['label'].values.copy()
mask = df['label'].values == 'XXXX'
values[mask] = 'U'
df['label'] = values.copy()
# Convert 'trait' (=genetic trait) and 'estimate' (=value) to individual columns unsing a pivot table
column_names = [col for col in list(df.columns.values) if col not in data_files[fname]['columns_to_ignore']]
column_names.remove('trait')
column_names.remove('estimate')
df = pd.DataFrame(df.pivot_table('estimate', column_names, 'trait').to_records())
# Only consider the traits 'mkg', 'fkg', 'fpr', 'ekg', and 'epr' here and remove the rest
considered_traits = ['ekg', 'epr', 'fkg', 'fpr', 'mkg', 'per', 'scs']
df = df[column_names+considered_traits]
# Sort dataframe
df = df.sort_values(by=['idani_anon'])
# Remove columns to be ignored and rows with emtpy values
if fname != 'hm_ebv.csv':
df = remove_columns_to_be_ignored(df=df, columns_to_ignore=data_files[fname]['columns_to_ignore'])
# Remove rows with missing values
df = df.reset_index(drop=True)
df = remove_rows_with_missing_values(df)
assert(0 == df.isnull().sum().sum())
print(df)
print()
# Save file to clean directory
save_csv(df=df, file_path=fpath_clean_data_dir+fname)
fname = 'hm_milkrecording.csv'
columns_to_ignore = ['idspectrum_num',
'AR_PESEE_PESCODEMALADEANALYSE',
'AR_PESEE_PESCODEMALADEPESEE',
'milk_yield_morning_measured',
'milk_yield_evening_measured'
]
df = read_in_csv(file_path=fpath_src_data_dir+fname, sep=data_files[fname]['delimiter'])
# Convert date columns to datetime objects & sort dataframe
df = convert_column_to_datetime(df, column='mlksmpl_date', convert_to_int=False)
df = df.sort_values(by=['idani_anon', 'mlksmpl_date'])
# Add new column depending on how the milk_yield was measured
# "Only milk_yield_24h": 1; "milk_yield_morning_measured": 2; "milk_yield_evening_measured": 3
new_col = np.ones(df['milk_yield_24h'].size, dtype=np.int)
new_col[~(df['milk_yield_morning_measured'].isnull().values)] = 2
new_col[~(df['milk_yield_evening_measured'].isnull().values)] = 3
df['milk_yield_msrmt_type'] = new_col.copy()
# Replace empty 'AR_PESEE_PESCODEALPAGE' (altitude in m when cow was milked) values with fixed value (350)
df['AR_PESEE_PESCODEALPAGE'] = df['AR_PESEE_PESCODEALPAGE'].fillna(350)
# Remove columns to be ignored
df = remove_columns_to_be_ignored(df=df, columns_to_ignore=columns_to_ignore)
# Remove rows with emtpy values
df = df.reset_index(drop=True)
df = remove_rows_with_missing_values(df)
assert(0 == df.isnull().sum().sum())
# Save file to clean directory
save_csv(df=df, file_path=fpath_clean_data_dir+'hm_milkrecording_bloodvalues.csv')