#!/usr/bin/env python
# coding: utf-8

# # Simple Binary Classification on the Adult Dataset
#
# You can use this notebook to try out StickyLand!
#
# To launch StickyLand, click the note icon in the toolbar above.
#
# ![](https://i.imgur.com/kQyAEF3.png)

# In[1]:


# Install dependencies
get_ipython().run_line_magic('pip', 'install numpy pandas matplotlib scikit-learn')


# In[2]:


import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from collections import Counter

get_ipython().run_line_magic('config', "InlineBackend.figure_format = 'retina'")


# ## 1. Exploratory Data Analysis

# ### 1.1. Loading the Dataset

# In[3]:


df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
    sep=', ',
    engine='python',
    header=None
)

column_names = [
    'Age', 'WorkClass', 'fnlwgt', 'Education', 'EducationNum',
    'MaritalStatus', 'Occupation', 'Relationship', 'Race', 'Gender',
    'CapitalGain', 'CapitalLoss', 'HoursPerWeek', 'NativeCountry', 'Income'
]

df.columns = [n.lower() for n in column_names]
df.shape
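# Before looking at the features, it can be worth checking for missing values. The raw
# UCI file marks unknown entries with a `?` placeholder (an assumption worth verifying on
# your copy of the data); the short sketch below is not part of the original analysis and
# simply counts these placeholders per column. The rest of the notebook keeps such rows as-is.

# In[ ]:


# Count '?' placeholders in each column (sketch)
(df == '?').sum()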
# ### Adult Features
#
# The Adult dataset has 14 features.
# The output variable is binary (`income > 50k`).

# In[4]:


df.head()


# In[5]:


sub_df = df[df['age'] < 20]
sub_df.head()


# ### Task List [02/22]
#
# - [x] Visualize the adult dataset
#     - [x] Histogram of all features
#     - [x] Scatter plot of `age` vs. `income`
# - [x] Test ML models on new dataset
#     - [x] XGBoost
#     - [x] Explainable Boosting Machine
# - [ ] Share the notebook with Ellie 😊
#
# Also support $\LaTeX$!

# In[6]:


def overlay_hist(df, c):
    """
    Plot overlaid histograms of a feature for the two target groups.
    """
    num_unique = len(df[c].unique())

    if df[c].dtype == 'object':
        # Categorical feature: compare per-category densities as a bar chart
        counter_1 = Counter(df[c][df['target'] == 1])
        counter_2 = Counter(df[c][df['target'] != 1])

        bar_names = []
        bar_densities_1 = []
        bar_densities_2 = []

        for f in counter_1:
            bar_names.append(f)
            bar_densities_1.append(counter_1[f] / df.shape[0])
            bar_densities_2.append(counter_2[f] / df.shape[0])

        for f in counter_2:
            if f not in counter_1:
                bar_names.append(f)
                bar_densities_1.append(counter_1[f] / df.shape[0])
                bar_densities_2.append(counter_2[f] / df.shape[0])

        count_df = pd.DataFrame(np.c_[bar_densities_2, bar_densities_1],
                                index=bar_names)
        ax = count_df.plot.bar(alpha=0.5)
        ax.set_title(c)
        ax.figure.autofmt_xdate(rotation=45)

    else:
        # Numerical feature: overlay two density histograms
        plt.hist(df[c][df['target'] == 1], alpha=0.5, density=True,
                 label='>50k', bins=50)
        plt.hist(df[c][df['target'] != 1], alpha=0.5, density=True,
                 label='<=50k', bins=50)
        plt.title(c)
        plt.legend(loc='upper right')

    print('Num of unique values: ', num_unique)
    plt.show()


# ### Task List [02/22]
#
# - [x] Visualize the adult dataset
#     - [x] Histogram of all features
#     - [x] Scatter plot of `age` vs. `income`
# - [x] Test ML models on new dataset
#     - [x] XGBoost
#     - [x] Explainable Boosting Machine
# - [x] Share the notebook with Ellie 😊
#
# Also support $\LaTeX$!

# Transform the target variable `Income` into a binary variable.

# In[7]:


df['target'] = [0 if l else 1 for l in (df['income'] == '<=50K')]
new_df = df.copy()


# ### 1.2. Data Engineering
#
# In this section, we delete or transform some features before training the binary classifier.

# In[8]:


interested_feature = 'maritalstatus'


# In[9]:


overlay_hist(df, interested_feature)


# The distribution difference between these two income groups on this feature is quite significant.

# In[10]:


overlay_hist(df, 'workclass')


# In[11]:


overlay_hist(df, 'fnlwgt')


# `fnlwgt` stands for "Final Weight"; it assigns a weight to each sample so that people
# with similar demographic characteristics carry similar weight in the census estimates.
# This feature is not really useful for this model.

# In[12]:


del new_df['fnlwgt']


# In[13]:


overlay_hist(df, 'education')


# In[14]:


overlay_hist(df, 'educationnum')


# In[15]:


overlay_hist(df, 'maritalstatus')


# In[16]:


overlay_hist(df, 'occupation')


# In[17]:


overlay_hist(df, 'relationship')


# In[18]:


overlay_hist(df, 'race')


# In[19]:


overlay_hist(df, 'gender')


# In[20]:


overlay_hist(df, 'capitalgain')


# In[21]:


overlay_hist(df, 'capitalloss')


# The two features `capitalgain` and `capitalloss` have many 0 values. This makes sense,
# because the census defines capital gain/loss as the profit/loss from asset sales
# (stocks or real estate), and not everyone sells assets in a given year. We can convert
# these two variables into binary features `has_capitalgain` and `has_capitalloss`.

# In[22]:


new_df['has_capitalgain'] = [int(t) for t in df['capitalgain'] != 0]
new_df['has_capitalloss'] = [int(t) for t in df['capitalloss'] != 0]

del new_df['capitalgain']
del new_df['capitalloss']
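# As a quick, optional sanity check on the two new indicator columns (a sketch, not part
# of the original analysis), the cell below computes the fraction of rows with a nonzero
# capital gain or loss; both fractions should be small, matching the reasoning above.

# In[ ]:


# Fraction of rows with any capital gain / capital loss (sketch)
new_df[['has_capitalgain', 'has_capitalloss']].mean()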
# In[23]:


overlay_hist(df, 'hoursperweek')


# Working 40 hours a week is typical in this dataset. Interestingly, people who earn more tend to work longer hours.

# In[24]:


overlay_hist(df, 'nativecountry')


# The majority of samples have the US as their native country. We can encode this feature
# as another binary variable `from_usa` to decrease the number of levels.

# In[25]:


new_df['from_usa'] = [int(t) for t in df['nativecountry'] == 'United-States']
del new_df['nativecountry']


# In[26]:


overlay_hist(df, 'income')


# This shows that the dataset is quite imbalanced.

# In[27]:


new_df.head()


# ## Image Augmentation

# In[28]:


# Install dependencies
get_ipython().run_line_magic('pip', 'install imageio imgaug')


# In[29]:


import imageio
import numpy as np
import imgaug as ia
import imgaug.augmenters as iaa
import matplotlib.pyplot as plt

get_ipython().run_line_magic('config', "InlineBackend.figure_format = 'retina'")
get_ipython().run_line_magic('matplotlib', 'inline')


# In[30]:


counter = 0


# In[31]:


def load_random_image(dataset):
    global counter

    images = [
        'https://i.imgur.com/xnrNBo3.png',
        'https://i.imgur.com/Ch4p4ds.png',
        'https://i.imgur.com/DUSjJ5U.png',
        'https://i.imgur.com/pfM32N4.png'
    ]

    # Cycle through the four images instead of sampling one at random
    # image = imageio.imread(np.random.choice(images))
    image = imageio.imread(images[counter % 4])
    counter += 1

    # Keep only the RGB channels and resize to 250x250
    image = image[:, :, :3]
    s = 250
    aug = iaa.size.Resize([s, s])
    image = aug(image=image)

    return image

dataset = 25


# In[32]:


def load_random_image(dataset):
    # This definition overrides the one above and always loads the same image
    # image = imageio.imread('https://i.imgur.com/Ch4p4ds.png')
    image = imageio.imread('https://i.imgur.com/DUSjJ5U.png')

    # Keep only the RGB channels and resize to 250x250
    image = image[:, :, :3]
    s = 250
    aug = iaa.size.Resize([s, s])
    image = aug(image=image)

    return image


# In[33]:


def rotate(image):
    """Rotate the image."""
    aug = iaa.Affine(rotate=(-10, -9))
    image_aug = aug(image=image)
    return image_aug


def add_noise(image):
    """Add random coarse-dropout noise to the image."""
    aug = iaa.CoarseDropout(0.02, size_percent=0.5)
    image_aug = aug(image=image)
    return image_aug


def corrupt(image):
    """Corrupt the image by shifting its hue twice."""
    aug = iaa.MultiplyHueAndSaturation(mul_hue=4)
    image = aug(image=image)

    aug = iaa.MultiplyHueAndSaturation(mul_hue=4)
    image = aug(image=image)

    return image


# In[34]:


image = load_random_image(dataset)
plt.imshow(image);


# In[35]:


image = rotate(image)
plt.imshow(image);


# In[36]:


image = add_noise(image)
plt.imshow(image);


# In[37]:


image = corrupt(image)
plt.imshow(image);
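# The three augmentations above are applied one at a time. As a follow-up sketch (not part
# of the original notebook), the same steps can be chained into a single imgaug pipeline
# with `iaa.Sequential`, which applies its augmenters in order.

# In[ ]:


# Sketch: combine rotation, coarse dropout, and the hue shift into one pipeline
seq = iaa.Sequential([
    iaa.Affine(rotate=(-10, -9)),
    iaa.CoarseDropout(0.02, size_percent=0.5),
    iaa.MultiplyHueAndSaturation(mul_hue=4),
])

image = load_random_image(dataset)
plt.imshow(seq(image=image));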
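# The task list earlier mentions testing XGBoost and an Explainable Boosting Machine on the
# new dataset, but no training cell appears in this notebook. The cell below is a minimal,
# hypothetical sketch of a baseline classifier on `new_df` from Section 1.2; it assumes
# `xgboost` is installed (e.g. `%pip install xgboost`), which is not among the dependencies
# installed above.

# In[ ]:


# Sketch: baseline XGBoost classifier on the engineered Adult features
from xgboost import XGBClassifier

# One-hot encode the remaining categorical columns and drop the label columns
X = pd.get_dummies(new_df.drop(columns=['income', 'target']))
y = new_df['target']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

model = XGBClassifier(n_estimators=200, max_depth=4, learning_rate=0.1)
model.fit(X_train, y_train)
print('Test accuracy:', model.score(X_test, y_test))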