#!/usr/bin/env python
# coding: utf-8

# Real-world data is often damaged or incomplete. Missing values have to be
# handled before training, because most machine-learning models cannot work
# with entries that are missing or not a number.

# In[6]:


import pandas as pd
# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22;
# `sklearn.impute.SimpleImputer` is its replacement.
from sklearn.impute import SimpleImputer


# ## Imputing missing values using SimpleImputer

# In[7]:


df = pd.read_csv('Data.csv')
df.head()


# In[8]:


# Replace every missing value with one computed by `strategy`, which can be
# 'mean', 'median', 'most_frequent' or 'constant'. SimpleImputer always works
# column-wise (each column is filled from its own statistic), which is what
# the old `Imputer(axis=0)` did. Its default `missing_values` is NaN, so the
# marker does not need to be passed explicitly.
imputer = SimpleImputer(strategy='mean')
# Columns 1 and 2 (by position) are assumed to be the numeric ones -- confirm
# against Data.csv.
df.iloc[:, 1:3] = imputer.fit_transform(df.iloc[:, 1:3])
df.head()


# ## Encoding categorical data

# In[9]:


# LabelEncoder replaces every category with an integer code. Useful for
# replacing "yes" by 1 and "no" by 0.
# OneHotEncoder creates a separate column for every category and puts a 1 in
# the column of the category that is present.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder


# In[10]:


lable_encoder = LabelEncoder()
# Work on a copy so the original frame keeps its string categories.
temp = df.copy()
temp.iloc[:, 0] = lable_encoder.fit_transform(df.iloc[:, 0])
temp.head()


# In[11]:


# Legacy example, kept for reference only: the `categorical_features`
# argument was removed from OneHotEncoder in scikit-learn 0.22 (select the
# columns with `sklearn.compose.ColumnTransformer` instead).
# one_hot_encoder = OneHotEncoder(categorical_features=[0])
# temp = df.copy()
# temp.iloc[:, 0] = one_hot_encoder.fit_transform(df.iloc[:, 0])

# You can achieve the same thing using get_dummies (every column except the
# last one is passed in).
pd.get_dummies(df.iloc[:, :-1])


# ## Binarizing
#
# Often we need to do the reverse of what we've done above, that is, convert
# continuous features to discrete values. For instance, we may want to convert
# an output to 0 or 1 depending on a threshold.

# In[16]:


from sklearn.datasets import load_iris

iris_dataset = load_iris()
X = iris_dataset.data
y = iris_dataset.target
feature_names = iris_dataset.feature_names


# Now we'll binarize the sepal width (column 1): 1 where the value is above
# the column mean, 0 otherwise.

# In[22]:


X[:, 1]


# In[26]:


from sklearn.preprocessing import Binarizer

# Compute the threshold before X is overwritten below.
sepal_width_mean = X[:, 1].mean()
binarizer = Binarizer(threshold=sepal_width_mean)
# The transformer expects 2-D input; the slice `1:2` keeps the column
# two-dimensional, so no reshape is needed.
X[:, 1:2] = binarizer.fit_transform(X[:, 1:2])
X[:, 1]


# In[ ]: