#!/usr/bin/env python
# coding: utf-8

# # Chapter 2: Processing data for machine learning
#
# To simplify the code examples in these notebooks, we populate the namespace with functions from numpy and matplotlib:

# In[2]:

get_ipython().run_line_magic('pylab', 'inline')


# ### Converting categorical data to numerical features

# In[26]:

cat_data = array(['male', 'female', 'male', 'male', 'female', 'male', 'female', 'female'])


# In[27]:

def cat_to_num(data):
    # One-hot encoding: one binary feature per unique category
    categories = unique(data)
    features = []
    for cat in categories:
        binary = (data == cat)
        features.append(binary.astype("int"))
    return features


# In[28]:

cat_to_num(cat_data)


# ### Simple feature engineering of the Titanic dataset

# In[23]:

cabin_data = array(["C65", "", "E36", "C54", "B57 B59 B63 B66"])


# In[31]:

def cabin_features(data):
    features = []
    for cabin in data:
        cabins = cabin.split(" ")
        n_cabins = len(cabins)
        # First char is the cabin_char
        try:
            cabin_char = cabins[0][0]
        except IndexError:
            cabin_char = "X"
            n_cabins = 0
        # The rest is the cabin number
        try:
            cabin_num = int(cabins[0][1:])
        except ValueError:
            cabin_num = -1
        # Add 3 features for each passenger
        features.append([cabin_char, cabin_num, n_cabins])
    return features


# In[33]:

cabin_features(cabin_data)


# ### Feature normalization

# In[41]:

num_data = array([1, 10, 0.5, 43, 0.12, 8])


# In[42]:

def normalize_feature(data, f_min=-1, f_max=1):
    # Linearly rescale data so that its minimum maps to f_min and its maximum to f_max
    d_min, d_max = min(data), max(data)
    factor = (f_max - f_min) / (d_max - d_min)
    normalized = f_min + (data - d_min) * factor
    return normalized, factor


# In[43]:

normalize_feature(num_data)
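

# ### Checking cat_to_num against pandas.get_dummies
#
# A minimal sketch, assuming pandas is installed (it is not used elsewhere in this
# notebook): pandas.get_dummies builds the same one-hot columns as cat_to_num,
# one binary column per category.

# In[ ]:

import pandas as pd

dummies = pd.get_dummies(pd.Series(cat_data)).astype(int)
print(dummies.columns.tolist())  # ['female', 'male'], same order as unique(cat_data)
print(dummies.values.T)          # each row matches an array returned by cat_to_num(cat_data)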
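

# ### Making the cabin_char feature numerical
#
# A small follow-up sketch, not part of the original notebook: cabin_features
# returns cabin_char as a string, so that column is still categorical. One option
# is to feed it back through cat_to_num to obtain purely numerical features.

# In[ ]:

cabin_chars = array([row[0] for row in cabin_features(cabin_data)])
cat_to_num(cabin_chars)  # one binary feature per cabin letter ('B', 'C', 'E', 'X')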
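

# ### Sanity-checking normalize_feature
#
# A quick check, not part of the original notebook: after normalization the
# smallest value should map to f_min and the largest to f_max, and the returned
# factor can be reused to place new data points onto the same scale.

# In[ ]:

normalized, factor = normalize_feature(num_data)
print(normalized.min(), normalized.max())    # -1.0 1.0
print(-1 + (5.0 - num_data.min()) * factor)  # where a new value of 5.0 would land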