#!/usr/bin/env python
# coding: utf-8

# Most of the time, real-world data is damaged or has missing values. We need to take care of this, since machine learning models don't work when values are missing or not a number.

# In[6]:


import numpy as np
import pandas as pd

try:
    # Legacy imputer: deprecated in scikit-learn 0.20 and removed in 0.22.
    from sklearn.preprocessing import Imputer
except ImportError:
    pass
from sklearn.impute import SimpleImputer


# ## Imputing missing values using Imputer

# In[7]:


# Read the dataset from the working directory and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='Data.csv')
df.head(n=5)


# In[8]:


# Replace every missing value (NaN) with a statistic chosen by `strategy`:
# 'mean', 'median', 'most_frequent' (the mode) or 'constant'.
# SimpleImputer always works column by column, i.e. each NaN is filled with
# the statistic of its own column. This matches what the legacy
# `Imputer(axis=0)` did; `Imputer` itself was removed in scikit-learn 0.22.
# NOTE(review): assumes columns 1 and 2 are numeric - mean imputation raises
# on string data; confirm against Data.csv.

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
df.iloc[:, 1:3] = imputer.fit_transform(df.iloc[:, 1:3])
df.head()


# ## Encoding categorical data  

# In[9]:


# LabelEncoder replaces every category of a categorical variable with an integer code. Useful for replacing yes by 1, no by 0.
# OneHotEncoder creates a separate column for every category and gives it a value of 1 in the rows where that category is present (0 elsewhere).
from sklearn.preprocessing import LabelEncoder, OneHotEncoder


# In[10]:


# Encode the first column as integer codes on a copy, so `df` keeps its
# original categorical values for the cells that follow.
# NOTE: scikit-learn documents LabelEncoder for target labels (y);
# OrdinalEncoder is the feature-side counterpart.
lable_encoder = LabelEncoder()
temp = df.copy()
first_column = temp.columns[0]
# Assign by column label rather than `temp.iloc[:, 0] = ...`: on pandas >= 2.0
# the iloc form writes into the existing column in place, which leaves it with
# its original (object) dtype instead of becoming an integer column.
temp[first_column] = lable_encoder.fit_transform(df.iloc[:, 0])
temp.head()


# In[11]:


# OneHotEncoder could be given the indices of the categorical features.
# (`categorical_features` was removed in scikit-learn 0.22; kept here for reference only.)
# one_hot_encoder = OneHotEncoder(categorical_features=[0])
# temp = df.copy()
# temp.iloc[:, 0] = one_hot_encoder.fit_transform(df.iloc[:, 0])

# pandas achieves the same one-hot expansion with get_dummies.
# Every column except the last one is passed in.
predictors = df.iloc[:, :-1]
pd.get_dummies(predictors)


# ## Binarizing
# 
# Often we need to do the reverse of what we've done above: convert continuous features into discrete values. For instance, we may want to convert a value to 0 or 1 depending on whether it exceeds a threshold.

# In[16]:


from sklearn.datasets import load_iris

# Load the iris dataset and pull out the pieces used below.
iris_dataset = load_iris()
# Work on a copy of the feature matrix: X is modified in place further down
# (one column is overwritten with its binarized values), and without the copy
# that write would also change `iris_dataset.data`, since both names would
# refer to the same array.
X = iris_dataset.data.copy()
y = iris_dataset.target
feature_names = iris_dataset.feature_names


# Now we'll binarize the sepal width: 1 where the value is above the mean, 0 where it is at or below the mean.

# In[22]:


# Display column 1 of the feature matrix (the sepal width) before it is binarized.
X[:, 1]


# In[26]:


from sklearn.preprocessing import Binarizer

# Threshold is the mean of column 1, taken before the column is overwritten.
sepal_width_mean = X[:, 1].mean()
# Binarizer maps values strictly greater than the threshold to 1 and all
# other values to 0; it expects a 2-D input, hence the 1:2 column slice.
mean_binarizer = Binarizer(threshold=sepal_width_mean)
# NOTE: this overwrites column 1 of X in place.
X[:, 1:2] = mean_binarizer.fit_transform(X[:, 1:2])
X[:, 1]


# In[ ]: