#!/usr/bin/env python
# coding: utf-8

# # Handling Categorical Data in Python

# ### What do Nominal, Ordinal and Continuous features mean?
#
# Categorical features can only take on a limited, and usually fixed, number of possible values. For example, if a dataset is about information related to users, then you will typically find features like country, gender, age group, etc. Alternatively, if the data you're working with is related to products, you will find features like product type, manufacturer, seller and so on.
#
# These are all categorical features in your dataset. These features are typically stored as text values which represent various traits of the observations. For example, gender is described as Male (M) or Female (F), product type could be described as electronics, apparel, food etc.
#
# **Note that these types of features, where the categories are only labeled without any order of precedence, are called nominal features. Features which have some order associated with them are called ordinal features.**
#
# For example, a feature like economic status, with three categories (low, medium and high), is ordinal because its categories have an order associated with them.
#
# There are also continuous features. These are numeric variables that have an infinite number of values between any two values. A continuous variable can be numeric or a date/time.
#
# Regardless of what the value is used for, the challenge is determining how to use this data in the analysis because of the following constraints:
#
# - Categorical features may have a very large number of levels, known as high cardinality (for example, cities or URLs), where most of the levels appear in a relatively small number of instances.
#
# - Many machine learning models, such as regression or SVM, are algebraic. This means that their input must be numerical. To use these models, categories must be transformed into numbers first, before you can apply the learning algorithm on them.
#
# - While some ML packages or libraries might transform categorical data to numeric automatically based on some default embedding method, many other ML packages don’t support such inputs.
#
# - For the machine, categorical data doesn’t contain the same context or information that humans can easily associate and understand. For example, when looking at a feature called ``City`` with three cities ``New York``, ``New Jersey`` and ``New Delhi``, humans can infer that ``New York`` is closely related to ``New Jersey`` as they are from the same country, while ``New York`` and ``New Delhi`` are very different. But for the model, ``New York``, ``New Jersey`` and ``New Delhi`` are just three different levels (possible values) of the same feature ``City``. If you don’t specify the additional contextual information, it will be impossible for the model to differentiate between highly different levels.
#
# You therefore are faced with the challenge of figuring out how to turn these text values into numerical values for further processing and unmask lots of interesting information which these features might hide. Typically, any standard workflow in feature engineering involves some form of transformation of these categorical values into numeric labels and then applying some encoding scheme on these values.
# ### Encoding Categorical Data
# Existing Encoding Methods (all modules for which code is available, see:
# http://contrib.scikit-learn.org/categorical-encoding/_modules/index.html)
# - category_encoders.backward_difference
# - category_encoders.basen
# - category_encoders.binary
# - category_encoders.hashing
# - category_encoders.helmert
# - category_encoders.leave_one_out
# - category_encoders.one_hot
# - category_encoders.ordinal
# - category_encoders.polynomial
# - category_encoders.sum_coding
# - category_encoders.target_encoder
#
#
# The techniques that you'll cover are the following:
#
# * Replacing values
# * Encoding labels
# * One-Hot encoding
# * Binary encoding
# * Backward difference encoding
# * Polynomial encodings
# * Miscellaneous features

# > ## 1. Replace Values

# In[1]:

import pandas as pd
import numpy as np
import copy

get_ipython().run_line_magic('matplotlib', 'inline')

# In[2]:

# Load the flights dataset straight from GitHub.
df_flights = pd.read_csv('https://raw.githubusercontent.com/ismayc/pnwflights14/master/data/flights.csv')
df_flights.head()

# In[3]:

# Keep only the object-dtype columns; these are the ones encoded below.
cat_df_flights = df_flights.select_dtypes(include=['object']).copy()
cat_df_flights.head()

# In[4]:

# Total number of missing values across the selected columns.
print(cat_df_flights.isnull().values.sum())

# In[5]:

# Impute missing tail numbers with the most frequent tail number.
# The fill is scoped to 'tailnum' so that a tail number can never be written
# into another column that happens to contain a missing value.
most_common_tailnum = cat_df_flights['tailnum'].value_counts().index[0]
cat_df_flights = cat_df_flights.fillna({'tailnum': most_common_tailnum})
print(cat_df_flights.isnull().values.sum())

# In[6]:

# Hand-written mapping shown for illustration; the cells below use the
# generated `replace_map_comp` instead.
replace_map = {
    'carrier': {
        'AA': 1, 'AS': 2, 'B6': 3, 'DL': 4, 'F9': 5, 'HA': 6,
        'OO': 7, 'UA': 8, 'US': 9, 'VX': 10, 'WN': 11,
    }
}

# In[7]:

# Build the same kind of mapping programmatically: the carrier categories,
# in category order, are numbered starting from 1.
labels = cat_df_flights['carrier'].astype('category').cat.categories.tolist()
replace_map_comp = {'carrier': dict(zip(labels, range(1, len(labels) + 1)))}
print(replace_map_comp)

# In[8]:

cat_df_flights_replace = cat_df_flights.copy()

# In[9]:

cat_df_flights_replace.replace(replace_map_comp, inplace=True)
print(cat_df_flights_replace.head())

# In[10]:

print(cat_df_flights_replace['carrier'].dtypes)

# In[11]:

# Same data, but with 'carrier' and 'origin' stored as category dtype.
cat_df_flights_lc = cat_df_flights.copy()
for column in ('carrier', 'origin'):
    cat_df_flights_lc[column] = cat_df_flights_lc[column].astype('category')
print(cat_df_flights_lc.dtypes)

# In[12]:

import time

# Time the same groupby on object-dtype columns ...
get_ipython().run_line_magic('timeit', "cat_df_flights.groupby(['origin','carrier']).count() #DataFrame with object dtype columns")

# In[13]:

# ... and on category-dtype columns.
get_ipython().run_line_magic('timeit', "cat_df_flights_lc.groupby(['origin','carrier']).count() #DataFrame with category dtype columns")

# > ## 2. Label Encoding

# In[14]:

cat_df_flights_lc.head()

# In[15]:

# Replace each carrier category with its integer category code.
cat_df_flights_lc['carrier'] = cat_df_flights_lc['carrier'].cat.codes

# In[16]:

cat_df_flights_lc.head()  # alphabetically labeled from 0 to 10

# In[17]:

# Binary indicator column: 1 where the carrier string contains 'US', else 0.
cat_df_flights_specific = cat_df_flights.copy()
is_us_carrier = cat_df_flights_specific['carrier'].str.contains('US')
cat_df_flights_specific['US_code'] = np.where(is_us_carrier, 1, 0)
cat_df_flights_specific.head()

# In[18]:

cat_df_flights_sklearn = cat_df_flights.copy()

from sklearn.preprocessing import LabelEncoder

lb_make = LabelEncoder()
cat_df_flights_sklearn['carrier_code'] = lb_make.fit_transform(cat_df_flights['carrier'])
cat_df_flights_sklearn.head()  # Results in appending a new column to df

# **Label encoding is intuitive and straightforward and may give you good performance from your learning algorithm, but it has the disadvantage that the numerical values can be misinterpreted by the algorithm. Should the carrier US (encoded to 8) be given 8x more weight than the carrier AS (encoded to 1)?**
#
# **To solve this issue there is another popular way to encode the categories via something called one-hot encoding.**
#
# > ## 3.
# One-Hot encoding

# In[19]:

cat_df_flights.head()

# In[20]:

# pandas route: expand 'carrier' into one indicator column per carrier.
cat_df_flights_onehot = pd.get_dummies(
    cat_df_flights.copy(),
    columns=['carrier'],
    prefix=['carrier'],
)
cat_df_flights_onehot.head()

# In[21]:

# scikit-learn route: LabelBinarizer yields one column per class.
cat_df_flights_onehot_sklearn = cat_df_flights.copy()

from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer()
lb_results = lb.fit_transform(cat_df_flights_onehot_sklearn['carrier'])
lb_results_df = pd.DataFrame(lb_results, columns=lb.classes_)
lb_results_df.head()

# In[22]:

# Put the binarized columns next to the original columns.
result_df = pd.concat([cat_df_flights_onehot_sklearn, lb_results_df], axis=1)
result_df.head()

# > ## 4. Binary Encoding

# In[23]:

cat_df_flights_ce = cat_df_flights.copy()

import category_encoders as ce

encoder = ce.BinaryEncoder(cols=['carrier'])
df_binary = encoder.fit_transform(cat_df_flights_ce)
df_binary.head()

# In[24]:

# NOTE(review): the generated column names ('carrier_0' here, 'col_carrier_1'
# below) presumably depend on the installed category_encoders version - confirm.
df_binary[df_binary['carrier_0'] == 1]

# > ## 5. Backward Difference Encoding

# In[25]:

encoder = ce.BackwardDifferenceEncoder(cols=['carrier'])
df_bd = encoder.fit_transform(cat_df_flights_ce)
df_bd.head()

# In[26]:

np.unique(df_bd['col_carrier_1'])

# > ## 6. Polynomial Encoding

# In[27]:

# `df_bd` is rebound here: from this cell on it holds the polynomial encoding.
encoder = ce.PolynomialEncoder(cols=['carrier'])
df_bd = encoder.fit_transform(cat_df_flights_ce)
df_bd.head()

# In[28]:

np.unique(df_bd['col_carrier_1'])

# > ## 7. Miscellaneous Features

# In[29]:

# Split each 'low-high' age range into separate 'start' and 'end' columns.
dummy_df_age = pd.DataFrame({'age': ['0-20', '20-40', '40-60', '60-80']})
age_bounds = dummy_df_age['age'].str.split('-')
dummy_df_age['start'], dummy_df_age['end'] = zip(*age_bounds)
dummy_df_age.head()

# In[30]:

dummy_df_age = pd.DataFrame({'age': ['0-20', '20-40', '40-60', '60-80']})


def split_mean(x):
    """Return the midpoint of a 'low-high' range string as a float.

    Only the first two '-'-separated parts of `x` are used.
    """
    bounds = x.split('-')
    lower = float(bounds[0])
    upper = float(bounds[1])
    return (lower + upper) / 2


dummy_df_age['age_mean'] = dummy_df_age['age'].apply(split_mean)
dummy_df_age.head()

# ### Dealing with Categorical Features in Big Data with Spark
# - The first step in Spark programming is to create a SparkContext.
#   SparkContext is required when you want to execute operations in a cluster. SparkContext tells Spark how and where to access a cluster. You'll start by importing SparkContext.
#
# - To start working with Spark DataFrames, you first have to create a SparkSession object from your SparkContext.

# > #### 1st way

# In[ ]:

#import findspark
#findspark.init()
#import pyspark
#confspark = pyspark.SparkConf().setMaster("local[*]").set("spark.cores.max", "4").set("spark.executor.memory", "2G").setAppName("--test--")
#sc = pyspark.SparkContext(conf=confspark)
#sc._conf.getAll()
#from pyspark.sql import SparkSession
#spark = SparkSession(sc)
#sc.stop()

# > #### 2nd way

# In[31]:

import findspark

findspark.init()

import pyspark
from pyspark.sql import SparkSession

#confspark = pyspark.SparkConf().setMaster("local[4]").set("spark.cores.max", "4").set("spark.executor.memory", "2G").setAppName("--test--")
#spark = SparkSession.builder.config(conf=confspark).getOrCreate()
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("--test--")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# In[32]:

spark.version

# In[33]:

spark.catalog.listTables()

# In[34]:

# Read the local CSV with a header row and let Spark infer column types.
spark_flights = spark.read.format("csv").option('header', True).load('data/flights.csv', inferSchema=True)
spark_flights.show(3)

# In[35]:

spark_flights.printSchema()

# In[36]:

spark.catalog.listTables()

# In[37]:

# Register the DataFrame as a temporary view named "flights_temp".
spark_flights.createOrReplaceTempView("flights_temp")

# In[38]:

spark.catalog.listTables()

# In[39]:

carrier_df = spark_flights.select("carrier")
carrier_df.show(5)

# > ## StringIndexer

# In[40]:

from pyspark.ml.feature import StringIndexer

carr_indexer = StringIndexer(inputCol="carrier", outputCol="carrier_index")
carr_indexed = carr_indexer.fit(carrier_df).transform(carrier_df)
carr_indexed.show(7)

# > ## OneHotEncoder

# In[41]:

carrier_df_onehot = spark_flights.select("carrier")

from pyspark.ml.feature import OneHotEncoder, StringIndexer


def _one_hot_transform(one_hot_encoder, dataset):
    """Apply a OneHotEncoder stage to `dataset`, fitting it first if required.

    In Spark 2.x OneHotEncoder is a Transformer and is applied directly with
    transform(). From Spark 3.0 it is an Estimator that must be fitted before
    it can transform. Checking for `fit` keeps both versions working.
    """
    fit = getattr(one_hot_encoder, "fit", None)
    stage = fit(dataset) if callable(fit) else one_hot_encoder
    return stage.transform(dataset)


# String labels -> numeric indices -> one-hot vectors.
stringIndexer = StringIndexer(inputCol="carrier", outputCol="carrier_index")
model = stringIndexer.fit(carrier_df_onehot)
indexed = model.transform(carrier_df_onehot)
encoder = OneHotEncoder(dropLast=False, inputCol="carrier_index", outputCol="carrier_vec")
encoded = _one_hot_transform(encoder, indexed)
encoded.show(7)

# > ## Example

# In[42]:

from pyspark.ml.feature import OneHotEncoder, StringIndexer

df1 = spark.createDataFrame(
    [
        (0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"),
        (5, "c"), (6, "a"), (7, "b"), (8, "d"), (9, "d"),
    ],
    ["id", "category"],
)
df2 = spark.createDataFrame(
    [
        (0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c"),
    ],
    ["id", "category"],
)

# Only df2 is encoded below; point `df` at df1 to try the larger example.
df = df2
stringIndexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
model = stringIndexer.fit(df)
indexed = model.transform(df)
encoder = OneHotEncoder(dropLast=True, inputCol="categoryIndex", outputCol="categoryVec")
# Inline version of the fit-if-needed check so this cell also runs on its own.
encoder_stage = encoder.fit(indexed) if hasattr(encoder, "fit") else encoder
encoded = encoder_stage.transform(indexed)
encoded.show()

# > ## VectorIndexer

# In[43]:

from pyspark.ml.feature import VectorIndexer

data = spark.read.format("libsvm").load("data/sample_libsvm_data.txt")
indexer = VectorIndexer(inputCol="features", outputCol="indexed", maxCategories=10)
indexerModel = indexer.fit(data)
categoricalFeatures = indexerModel.categoryMaps
print("Chose %d categorical features: %s" % (len(categoricalFeatures), ", ".join(str(k) for k in categoricalFeatures)))

# Create new column "indexed" with categorical values transformed to indices
indexedData = indexerModel.transform(data)
indexedData.show()

# > https://www.datacamp.com/community/tutorials/categorical-data
#
# > http://contrib.scikit-learn.org/categorical-encoding/_modules/index.html