More often than not, real-world data is damaged or missing, and we need to take care of it, since most machine learning models cannot handle missing or non-numeric values.
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
df = pd.read_csv('Data.csv')
df.head()
| | Country | Age | Salary | Purchased |
|---|---|---|---|---|
| 0 | France | 44.0 | 72000.0 | No |
| 1 | Spain | 27.0 | 48000.0 | Yes |
| 2 | Germany | 30.0 | 54000.0 | No |
| 3 | Spain | 38.0 | 61000.0 | No |
| 4 | Germany | 40.0 | NaN | Yes |
# replace every missing value with the statistic chosen by strategy,
# which can be 'mean', 'median' or 'most_frequent', computed per column
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
df.iloc[:, 1:3] = imputer.fit_transform(df.iloc[:, 1:3])
df.head()
| | Country | Age | Salary | Purchased |
|---|---|---|---|---|
| 0 | France | 44.0 | 72000.000000 | No |
| 1 | Spain | 27.0 | 48000.000000 | Yes |
| 2 | Germany | 30.0 | 54000.000000 | No |
| 3 | Spain | 38.0 | 61000.000000 | No |
| 4 | Germany | 40.0 | 63777.777778 | Yes |
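The mean is easily skewed by outliers; strategy='median' is a common alternative. A minimal sketch of the same step with the median (on a freshly loaded copy, since df has already been imputed above):
# the median is less sensitive to outliers than the mean
df_raw = pd.read_csv('Data.csv')
median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
df_raw.iloc[:, 1:3] = median_imputer.fit_transform(df_raw.iloc[:, 1:3])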
# LabelEncoder replaces every categorical value with a number. Useful for replacing yes with 1 and no with 0.
# OneHotEncoder creates a separate column for every category and puts a 1 in the column matching the value
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
label_encoder = LabelEncoder()
temp = df.copy()
temp.iloc[:, 0] = label_encoder.fit_transform(df.iloc[:, 0])
temp.head()
| | Country | Age | Salary | Purchased |
|---|---|---|---|---|
| 0 | 0 | 44.0 | 72000.000000 | No |
| 1 | 2 | 27.0 | 48000.000000 | Yes |
| 2 | 1 | 30.0 | 54000.000000 | No |
| 3 | 2 | 38.0 | 61000.000000 | No |
| 4 | 1 | 40.0 | 63777.777778 | Yes |
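As noted above, the same encoder maps the yes/no Purchased column to 1/0, since LabelEncoder assigns integers to the sorted class labels:
# 'No' -> 0, 'Yes' -> 1
temp.iloc[:, 3] = label_encoder.fit_transform(df.iloc[:, 3])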
# OneHotEncoder expects a 2-D input and returns one column per category,
# so its output cannot be assigned back into a single DataFrame column
# one_hot_encoder = OneHotEncoder(sparse_output=False)  # use sparse=False on older scikit-learn
# country_onehot = one_hot_encoder.fit_transform(df.iloc[:, [0]])
# you can achieve the same thing using get_dummies
pd.get_dummies(df.iloc[:, :-1])
| | Age | Salary | Country_France | Country_Germany | Country_Spain |
|---|---|---|---|---|---|
| 0 | 44.000000 | 72000.000000 | 1 | 0 | 0 |
| 1 | 27.000000 | 48000.000000 | 0 | 0 | 1 |
| 2 | 30.000000 | 54000.000000 | 0 | 1 | 0 |
| 3 | 38.000000 | 61000.000000 | 0 | 0 | 1 |
| 4 | 40.000000 | 63777.777778 | 0 | 1 | 0 |
| 5 | 35.000000 | 58000.000000 | 1 | 0 | 0 |
| 6 | 38.777778 | 52000.000000 | 0 | 0 | 1 |
| 7 | 48.000000 | 79000.000000 | 1 | 0 | 0 |
| 8 | 50.000000 | 83000.000000 | 0 | 1 | 0 |
| 9 | 37.000000 | 67000.000000 | 1 | 0 | 0 |
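The dummy columns for a feature always sum to 1, which makes them linearly dependent (the dummy variable trap); get_dummies can drop one column per feature to avoid it. A minimal sketch:
# keep k-1 dummy columns per feature instead of k
pd.get_dummies(df.iloc[:, :-1], drop_first=True)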
Often we need to do the reverse of what we've done above, that is, convert continuous features to discrete values. For instance, we may want to map a feature to 0 or 1 depending on whether it falls below or above a threshold.
from sklearn.datasets import load_iris
iris_dataset = load_iris()
X = iris_dataset.data
y = iris_dataset.target
feature_names = iris_dataset.feature_names
Now we'll binarize the sepal width, with 0 or 1 indicating whether each value is below or above the mean.
X[:, 1]
array([ 3.5, 3. , 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1, 3.7, 3.4, 3. , 3. , 4. , 4.4, 3.9, 3.5, 3.8, 3.8, 3.4, 3.7, 3.6, 3.3, 3.4, 3. , 3.4, 3.5, 3.4, 3.2, 3.1, 3.4, 4.1, 4.2, 3.1, 3.2, 3.5, 3.1, 3. , 3.4, 3.5, 2.3, 3.2, 3.5, 3.8, 3. , 3.8, 3.2, 3.7, 3.3, 3.2, 3.2, 3.1, 2.3, 2.8, 2.8, 3.3, 2.4, 2.9, 2.7, 2. , 3. , 2.2, 2.9, 2.9, 3.1, 3. , 2.7, 2.2, 2.5, 3.2, 2.8, 2.5, 2.8, 2.9, 3. , 2.8, 3. , 2.9, 2.6, 2.4, 2.4, 2.7, 2.7, 3. , 3.4, 3.1, 2.3, 3. , 2.5, 2.6, 3. , 2.6, 2.3, 2.7, 3. , 2.9, 2.9, 2.5, 2.8, 3.3, 2.7, 3. , 2.9, 3. , 3. , 2.5, 2.9, 2.5, 3.6, 3.2, 2.7, 3. , 2.5, 2.8, 3.2, 3. , 3.8, 2.6, 2.2, 3.2, 2.8, 2.8, 2.7, 3.3, 3.2, 2.8, 3. , 2.8, 3. , 2.8, 3.8, 2.8, 2.8, 2.6, 3. , 3.4, 3.1, 3. , 3.1, 3.1, 3.1, 2.7, 3.2, 3.3, 3. , 2.5, 3. , 3.4, 3. ])
from sklearn.preprocessing import Binarizer
X[:, 1:2] = Binarizer(threshold=X[:, 1].mean()).fit_transform(X[:, 1].reshape(-1, 1))
X[:, 1]
array([ 1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1., 0., 1., 0., 0., 1., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0., 1., 1., 1., 0., 1., 1., 0., 0., 0., 1., 0.])
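Binarizer covers the two-bucket case; for more than two buckets, scikit-learn also provides KBinsDiscretizer. A minimal sketch (reloading the iris data, since the sepal width above has already been overwritten):
from sklearn.preprocessing import KBinsDiscretizer
X_fresh = load_iris().data
# split sepal width into 3 equal-width bins, encoded as the integers 0, 1, 2
discretizer = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
sepal_width_binned = discretizer.fit_transform(X_fresh[:, 1].reshape(-1, 1))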