#!/usr/bin/env python
# coding: utf-8
# # Simple Binary Classification on Adult Dataset
#
# You can use this notebook to try out StickyLand!
#
# To launch StickyLand, click the note icon in the toolbar above.
#
# 
# In[1]:
# Install dependencies.
# The scikit-learn distribution on PyPI is named `scikit-learn`; the legacy
# `sklearn` name is a deprecated placeholder whose installation fails, even
# though the import name remains `sklearn`.
get_ipython().run_line_magic('pip', 'install numpy pandas matplotlib scikit-learn')
# In[2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from collections import Counter
# Ask the inline matplotlib backend to emit figures in the 'retina' format.
get_ipython().run_line_magic('config', "InlineBackend.figure_format = 'retina'")
# ## 1. Exploratory Data Analysis
# ### 1.1. Loading the Dataset
# In[3]:
# Human-readable labels for the 15 columns of the header-less CSV, in file order.
column_names = [
    'Age', 'WorkClass', 'fnlwgt', 'Education', 'EducationNum',
    'MaritalStatus', 'Occupation', 'Relationship', 'Race', 'Gender',
    'CapitalGain', 'CapitalLoss', 'HoursPerWeek', 'NativeCountry', 'Income'
]

# Fields are delimited by a comma followed by a space, and the file carries
# no header row.
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
    sep=', ',
    engine='python',
    header=None
)

# Column labels are used in lower case throughout the rest of the notebook.
df.columns = list(map(str.lower, column_names))
df.shape
# ### Adult Features
#
# The Adult dataset has 14 features.
# The output variable is binary (`income > 50k`).
# In[4]:
df.head()
# In[5]:
# Keep only the rows whose `age` is strictly below 20 and preview them.
sub_df = df.loc[df['age'].lt(20)]
sub_df.head()
# ### Task List [02/22]
#
# - [x] Visualize the adult dataset
# - [x] Histogram of all features
# - [x] Scatter plot of `age` vs. `income`
# - [x] Test ML models on new dataset
# - [x] XGBoost
# - [x] Explainable Boosting Machine
# - [ ] Share the notebook with Ellie 😊
#
# Also support $\LaTeX$!
# In[6]:
def overlay_hist(df, c):
    """
    Plot the distribution of column ``c`` for both target groups, overlaid.

    Rows are split on ``df['target'] == 1`` versus all other rows. Numeric
    columns are drawn as two semi-transparent, density-normalised histograms
    with 50 bins each. Every other column is drawn as a grouped bar chart in
    which each bar is that category's share of *all* rows in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain column ``c`` and a ``target`` column.
    c : str
        Name of the column to plot.

    Returns
    -------
    None
        The number of distinct values in ``c`` is printed and the figure is
        displayed with ``plt.show()``.
    """
    num_unique = df[c].nunique(dropna=False)
    is_positive = df['target'] == 1

    # Branch on "numeric or not" rather than `dtype == 'object'`: string
    # columns stored with pandas' dedicated string dtype are not `object`
    # and would otherwise be sent to the numeric histogram branch.
    if not pd.api.types.is_numeric_dtype(df[c]):
        counter_1 = Counter(df.loc[is_positive, c])
        counter_2 = Counter(df.loc[~is_positive, c])

        # Categories seen in the positive group come first, followed by the
        # ones that only occur in the other group.
        bar_names = list(counter_1)
        bar_names.extend(f for f in counter_2 if f not in counter_1)

        # Counter returns 0 for a missing key, so a category absent from one
        # group simply gets a zero-height bar.
        total = df.shape[0]
        # Named columns give the legend the same labels as the numeric branch
        # instead of the default integer column names 0 and 1.
        count_df = pd.DataFrame(
            {
                '<=50k': [counter_2[f] / total for f in bar_names],
                '>50k': [counter_1[f] / total for f in bar_names],
            },
            index=bar_names,
        )
        ax = count_df.plot.bar(alpha=0.5)
        ax.set_title(c)
        ax.figure.autofmt_xdate(rotation=45)
    else:
        plt.hist(df.loc[is_positive, c], alpha=0.5, density=True, label='>50k', bins=50)
        plt.hist(df.loc[~is_positive, c], alpha=0.5, density=True, label='<=50k', bins=50)
        plt.title(c)

    plt.legend(loc='upper right')
    print('Num of unique values: ', num_unique)
    plt.show()
# ### Task List [02/22]
#
# - [x] Visualize the adult dataset
# - [x] Histogram of all features
# - [x] Scatter plot of `age` vs. `income`
# - [x] Test ML models on new dataset
# - [x] XGBoost
# - [x] Explainable Boosting Machine
# - [x] Share the notebook with Ellie 😊
#
# Also support $\LaTeX$!
# Transform the target variable `income` into a binary variable.
# In[7]:
# Encode the label as 0 when `income` is exactly '<=50K' and 1 otherwise.
# The values are assigned positionally, as a plain integer array.
df['target'] = (df['income'] != '<=50K').astype('int64').to_numpy()
# Independent copy, so that changes made to `new_df` do not alter `df`.
new_df = df.copy()
# ### 1.2. Data Engineering
#
# In this section, we delete or transform some features before training the binary classifier.
# In[8]:
# Name of the column passed to overlay_hist() in the next cell.
# NOTE(review): the identifier is misspelled ("intersted"); it is left as-is
# because other cells may reference it by this name.
intersted_feature = 'maritalstatus'
# In[9]:
# Plot the selected column split by target group.
overlay_hist(df, intersted_feature)
# The distribution difference between these two groups on age is quite significant.
# In[10]:
# Plot `workclass` split by target group.
overlay_hist(df, 'workclass')
# In[11]:
# Plot `fnlwgt` split by target group.
overlay_hist(df, 'fnlwgt')
# `fnlwgt` stands for "Final Weight", which is used to give weight to different samples so that people with similar demographic characteristics have the same weight. This feature is not really useful in this model.
# In[12]:
# Remove the `fnlwgt` column from the working copy, in place.
new_df.drop(columns='fnlwgt', inplace=True)
# In[13]:
# Plot each of these columns split by target group, one figure per column,
# in the listed order.
for feature in (
    'education',
    'educationnum',
    'maritalstatus',
    'occupation',
    'relationship',
    'race',
    'gender',
    'capitalgain',
    'capitalloss',
):
    overlay_hist(df, feature)
# These two features `capitalgain` and `capitalloss` have many 0 values. This makes sense, because the census defines capital gain/loss as the profit/loss from asset sales (stocks or real estate). Not all people realize a capital gain/loss in a particular year. We can convert these two variables into binary features `has_capitalgain` and `has_capitalloss`.
# In[22]:
# Replace the raw capital columns with 0/1 indicators of a non-zero value.
# The indicators are computed from `df` and assigned positionally.
new_df['has_capitalgain'] = (df['capitalgain'] != 0).astype('int64').to_numpy()
new_df['has_capitalloss'] = (df['capitalloss'] != 0).astype('int64').to_numpy()
new_df.drop(columns=['capitalgain', 'capitalloss'], inplace=True)
# In[23]:
# Plot `hoursperweek` split by target group.
overlay_hist(df, 'hoursperweek')
# Working 40 hours a week is typical in the dataset. Interestingly people who earn more tend to work longer.
# In[24]:
# Plot `nativecountry` split by target group.
overlay_hist(df, 'nativecountry')
# The native country is the US for the majority of rows. We can encode it as another binary variable `from_usa` to decrease the number of levels.
# In[25]:
# 1 when `nativecountry` is exactly 'United-States', 0 otherwise; the raw
# column is then removed from the working copy.
new_df['from_usa'] = (df['nativecountry'] == 'United-States').astype('int64').to_numpy()
new_df.drop(columns='nativecountry', inplace=True)
# In[26]:
# Plot the raw `income` label itself, split by the derived `target` column.
overlay_hist(df, 'income')
# It shows this dataset is quite imbalanced.
# In[27]:
# Preview the engineered feature table.
new_df.head()
# ## Image Augmentation
# In[28]:
# Install the packages imported by the image-augmentation cells below
# (imageio and imgaug).
get_ipython().run_line_magic('pip', 'install imageio imgaug')
# In[29]:
import imageio
import numpy as np
import imgaug as ia
import imgaug.augmenters as iaa
import matplotlib.pyplot as plt
# Ask the inline matplotlib backend to emit figures in the 'retina' format.
get_ipython().run_line_magic('config', "InlineBackend.figure_format = 'retina'")
# Enable the inline matplotlib backend for this session.
get_ipython().run_line_magic('matplotlib', 'inline')
# In[30]:
# Position of the next image to serve; advanced by every load_random_image() call.
counter = 0
# In[31]:
def load_random_image(dataset):
    """
    Download the next demo image, keep its first three channels, and resize it.

    Despite the name, the choice is not random: the URLs are served
    round-robin in list order, with the module-level ``counter`` as cursor.

    Parameters
    ----------
    dataset
        Unused; accepted only so that existing call sites keep working.

    Returns
    -------
    The image array after resizing with ``iaa.size.Resize([250, 250])``.
    """
    global counter
    images = [
        'https://i.imgur.com/xnrNBo3.png',
        'https://i.imgur.com/Ch4p4ds.png',
        'https://i.imgur.com/DUSjJ5U.png',
        'https://i.imgur.com/pfM32N4.png'
    ]
    # Wrap on the real list length so that adding or removing a URL cannot
    # leave an entry unreachable or index past the end.
    image = imageio.imread(images[counter % len(images)])
    counter += 1
    # Keep only the first three channels along the last axis.
    image = image[:, :, :3]
    s = 250
    aug = iaa.size.Resize([s, s])
    image = aug(image=image)
    return image
dataset = 25
# In[32]:
def load_random_image(dataset):
    """
    Download one fixed demo image, keep its first three channels, and resize it.

    The ``dataset`` argument is accepted but not used. The image always comes
    from the same URL, so nothing is random here.
    """
    side = 250
    rgb = imageio.imread('https://i.imgur.com/DUSjJ5U.png')[:, :, :3]
    resize = iaa.size.Resize([side, side])
    return resize(image=rgb)
# In[33]:
def rotate(image):
    """Apply an imgaug Affine rotation configured with the range (-10, -9)."""
    rotation = iaa.Affine(rotate=(-10, -9))
    return rotation(image=image)
def add_noise(image):
    """Apply imgaug's CoarseDropout(0.02, size_percent=0.5) to the image."""
    dropout = iaa.CoarseDropout(0.02, size_percent=0.5)
    return dropout(image=image)
def corrupt(image):
    """Apply MultiplyHueAndSaturation(mul_hue=4) to the image twice in a row."""
    # Two consecutive passes, each with a freshly built augmenter; the second
    # pass operates on the output of the first.
    for _ in range(2):
        hue_shift = iaa.MultiplyHueAndSaturation(mul_hue=4)
        image = hue_shift(image=image)
    return image
# In[34]:
# Load the starting image and display it. Each following cell feeds the
# previous cell's `image` into the next augmentation, so the effects stack.
image = load_random_image(dataset)
plt.imshow(image);
# In[35]:
# Step 1: rotation.
image = rotate(image)
plt.imshow(image);
# In[36]:
# Step 2: coarse dropout on the rotated image.
image = add_noise(image)
plt.imshow(image);
# In[37]:
# Step 3: hue corruption on the result of the two previous steps.
image = corrupt(image)
plt.imshow(image);
#
#
#
#
#
#
#
#
#