In [1]:

import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

/home/anirudh/anaconda/lib/python2.7/site-packages/pandas/computation/__init__.py:19: UserWarning: The installed version of numexpr 2.4.4 is not supported in pandas and will be not be used

  UserWarning)

In [2]:

from pprint import *

In [3]:

import seaborn
seaborn.set()

:0: FutureWarning: IPython widgets are experimental and may change in the future.

In [4]:

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

Hypotheses¶

Age
Income
Profession
Marital Status
Urban / Rural
Expenses
Education
Total Work Experience

In [5]:

test.head()

Out[5]:

	ID	Age	Workclass	Education	Marital.Status	Occupation	Relationship	Race	Sex	Hours.Per.Week	Native.Country
0	32562	25	Private	11th	Never-married	Machine-op-inspct	Own-child	Black	Male	40	United-States
1	32563	38	Private	HS-grad	Married-civ-spouse	Farming-fishing	Husband	White	Male	50	United-States
2	32564	28	Local-gov	Assoc-acdm	Married-civ-spouse	Protective-serv	Husband	White	Male	40	United-States
3	32565	44	Private	Some-college	Married-civ-spouse	Machine-op-inspct	Husband	Black	Male	40	United-States
4	32566	18	NaN	Some-college	Never-married	NaN	Own-child	White	Female	30	United-States

In [6]:

train.head()

Out[6]:

	ID	Age	Workclass	Education	Marital.Status	Occupation	Relationship	Race	Sex	Hours.Per.Week	Native.Country	Income.Group
0	1	39	State-gov	Bachelors	Never-married	Adm-clerical	Not-in-family	White	Male	40	United-States	<=50K
1	2	50	Self-emp-not-inc	Bachelors	Married-civ-spouse	Exec-managerial	Husband	White	Male	13	United-States	<=50K
2	3	38	Private	HS-grad	Divorced	Handlers-cleaners	Not-in-family	White	Male	40	United-States	<=50K
3	4	53	Private	11th	Married-civ-spouse	Handlers-cleaners	Husband	Black	Male	40	United-States	<=50K
4	5	28	Private	Bachelors	Married-civ-spouse	Prof-specialty	Wife	Black	Female	40	Cuba	<=50K

Missing Value Treatment¶

In [7]:

train.apply(lambda x: sum(x.isnull()))

Out[7]:

ID                   0
Age                  0
Workclass         1836
Education            0
Marital.Status       0
Occupation        1843
Relationship         0
Race                 0
Sex                  0
Hours.Per.Week       0
Native.Country     583
Income.Group         0
dtype: int64

In [8]:

test.apply(lambda x: sum(x.isnull()))

Out[8]:

ID                  0
Age                 0
Workclass         963
Education           0
Marital.Status      0
Occupation        966
Relationship        0
Race                0
Sex                 0
Hours.Per.Week      0
Native.Country    274
dtype: int64

In [9]:

from scipy.stats import mode

Workclass¶

In [10]:

mode(train.Workclass)[0][0]

/home/anirudh/anaconda/lib/python2.7/site-packages/numpy/lib/arraysetops.py:200: FutureWarning: numpy not_equal will not check object identity in the future. The comparison did not return the same result as suggested by the identity (`is`)) and will change.
  flag = np.concatenate(([True], aux[1:] != aux[:-1]))

Out[10]:

'Private'

In [11]:

Workclass_Null_Indices = train[train.Workclass.isnull()].index
for el in Workclass_Null_Indices: train.Workclass[el] = 'Private'

/home/anirudh/anaconda/lib/python2.7/site-packages/IPython/kernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from IPython.kernel.zmq import kernelapp as app

In [12]:

mode(test.Workclass)[0][0]

Out[12]:

'Private'

In [13]:

Workclass_Null_Indices_test = test[test.Workclass.isnull()].index
for el in Workclass_Null_Indices_test: test.Workclass[el] = 'Private'

/home/anirudh/anaconda/lib/python2.7/site-packages/IPython/kernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from IPython.kernel.zmq import kernelapp as app

Occupation¶

In [14]:

mode(train.Occupation)[0][0]

Out[14]:

'Prof-specialty'

In [15]:

mode(test.Occupation)[0][0]

Out[15]:

'Prof-specialty'

In [16]:

Occupation_Null_Indices_train = train[train.Occupation.isnull()].index
for el in Occupation_Null_Indices_train: train.Occupation[el] = 'Prof-specialty'

/home/anirudh/anaconda/lib/python2.7/site-packages/IPython/kernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from IPython.kernel.zmq import kernelapp as app

In [17]:

Occupation_Null_Indices_test = test[test.Occupation.isnull()].index
for el in Occupation_Null_Indices_test: test.Occupation[el] = 'Prof-specialty'

/home/anirudh/anaconda/lib/python2.7/site-packages/IPython/kernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from IPython.kernel.zmq import kernelapp as app

Native Country¶

In [18]:

mode(train['Native.Country'])[0][0]

Out[18]:

'United-States'

In [19]:

mode(test['Native.Country'])[0][0]

Out[19]:

'United-States'

In [20]:

NativeCountry_Null_Indices_train = train[train['Native.Country'].isnull()].index
for el in NativeCountry_Null_Indices_train: train['Native.Country'][el] = 'United-States'
    
NativeCountry_Null_Indices_test = test[test['Native.Country'].isnull()].index
for el in NativeCountry_Null_Indices_test: test['Native.Country'][el] = 'United-States'

/home/anirudh/anaconda/lib/python2.7/site-packages/IPython/kernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from IPython.kernel.zmq import kernelapp as app
/home/anirudh/anaconda/lib/python2.7/site-packages/IPython/kernel/__main__.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

Univariate Analysis¶

Preprocessing Categorical Variables¶

*Age*

In [21]:

plt.hist(train.Age, bins = 100)
plt.show()
plt.hist(test.Age, bins = 100)
plt.show()

In [22]:

train.Age.describe()

Out[22]:

count    32561.000000
mean        38.581647
std         13.640433
min         17.000000
25%         28.000000
50%         37.000000
75%         48.000000
max         90.000000
Name: Age, dtype: float64

In [23]:

test.Age.describe()

Out[23]:

count    16281.000000
mean        38.767459
std         13.849187
min         17.000000
25%         28.000000
50%         37.000000
75%         48.000000
max         90.000000
Name: Age, dtype: float64

*** 25% of the population is between 17 and 28***

*** 25% of the population is between 28 and 37***

*** 75% of the population is under 48 ***

*** Working Class ***

In [24]:

train.Workclass.describe()

Out[24]:

count       32561
unique          8
top       Private
freq        24532
Name: Workclass, dtype: object

In [25]:

test.Workclass.describe()

Out[25]:

count       16281
unique          8
top       Private
freq        12173
Name: Workclass, dtype: object

In [26]:

unique_working_classes = list(train.Workclass.unique())
unique_working_classes

Out[26]:

['State-gov',
 'Self-emp-not-inc',
 'Private',
 'Federal-gov',
 'Local-gov',
 'Self-emp-inc',
 'Without-pay',
 'Never-worked']

In [27]:

from sklearn.preprocessing import LabelEncoder

In [28]:

number = LabelEncoder()
train.Workclass = number.fit_transform(train.Workclass.astype('str'))
test.Workclass = number.fit_transform(test.Workclass.astype('str'))

In [29]:

fig = plt.figure(figsize=(4,4), dpi=1600)
plt.hist(train.Workclass, bins = len(unique_working_classes))
plt.show()

fig = plt.figure(figsize=(4,4), dpi=1600)
plt.hist(test.Workclass, bins = len(unique_working_classes))
plt.show()

In [30]:

train.Workclass.unique()

Out[30]:

array([6, 5, 3, 0, 1, 4, 7, 2])

In [31]:

# Workclass Decoder

workclass_decoder = dict(zip(train.Workclass.unique(),unique_working_classes))
workclass_decoder

Out[31]:

{0: 'Federal-gov',
 1: 'Local-gov',
 2: 'Never-worked',
 3: 'Private',
 4: 'Self-emp-inc',
 5: 'Self-emp-not-inc',
 6: 'State-gov',
 7: 'Without-pay'}

In [32]:

# # Combine bins based on some common sense
# new_workclass_encoder = {0:0,1:0,2:1,3:2,4:2,5:2,6:0,7:1}

# new_workclass_train = [new_workclass_encoder[i] for i in train.Workclass]
# train.Workclass = new_workclass_train

# new_workclass_test = [new_workclass_encoder[i] for i in test.Workclass]
# test.Workclass = new_workclass_test

# new_workclass_decoder = {0:'Government',1:'Never Worked / Without Pay / Information NA', 2:'Private'}

In [33]:

# new_workclass_decoder

In [34]:

# fig = plt.figure(figsize=(2,5), dpi=1600)
# plt.hist(train.Workclass, bins = 3)

# fig = plt.figure(figsize=(2,5), dpi=1600)
# plt.hist(test.Workclass, bins = 3)

*** Education ***

In [35]:

education_levels_train = train.Education.unique()
print education_levels_train

['Bachelors' 'HS-grad' '11th' 'Masters' '9th' 'Some-college' 'Assoc-acdm'
 'Assoc-voc' '7th-8th' 'Doctorate' 'Prof-school' '5th-6th' '10th' '1st-4th'
 'Preschool' '12th']

In [36]:

train.Education = number.fit_transform(train.Education)
new_education_levels = train.Education.unique()
education_levels_decoder = dict(zip(education_levels_train, new_education_levels))
pprint(education_levels_decoder)
plt.hist(train.Education, bins = len(new_education_levels))
plt.show()

test.Education = number.fit_transform(test.Education)
plt.hist(test.Education, bins = len(new_education_levels))
plt.show()

{'10th': 0,
 '11th': 1,
 '12th': 2,
 '1st-4th': 3,
 '5th-6th': 4,
 '7th-8th': 5,
 '9th': 6,
 'Assoc-acdm': 7,
 'Assoc-voc': 8,
 'Bachelors': 9,
 'Doctorate': 10,
 'HS-grad': 11,
 'Masters': 12,
 'Preschool': 13,
 'Prof-school': 14,
 'Some-college': 15}

...it would make sense to club bins in a more sensible way

In [37]:

pprint(education_levels_decoder)

{'10th': 0,
 '11th': 1,
 '12th': 2,
 '1st-4th': 3,
 '5th-6th': 4,
 '7th-8th': 5,
 '9th': 6,
 'Assoc-acdm': 7,
 'Assoc-voc': 8,
 'Bachelors': 9,
 'Doctorate': 10,
 'HS-grad': 11,
 'Masters': 12,
 'Preschool': 13,
 'Prof-school': 14,
 'Some-college': 15}

In [38]:

# new_education_levels_encoder = {0:1,1:1,2:2,3:0,4:0,5:0,6:0, 7:5, 8:5, 9:3, 10:7, 11:2, 12:4, 13:0, 14:6, 15:3}

# new_education_levels_train = [new_education_levels_encoder[i] for i in train.Education]
# train.Education = new_education_levels_train

# new_education_levels_test = [new_education_levels_encoder[i] for i in test.Education]
# test.Education = new_education_levels_test

# new_education_levels_decoder = {0:'< 9th', 1:'High School', 2:'High School Grad', 3:'Bachelors', 4:'Masters', 5:'Assoc-acdm / Assoc-voc', 6:'Prof School', 7:'Doctorate'}

In [39]:

# plt.hist(train.Education, bins = len(new_education_levels_decoder))
# plt.show()
# plt.hist(test.Education, bins = len(new_education_levels_decoder))
# plt.show()

Income Group¶

In [40]:

# Rows for which Income is under $50k
print train[train['Income.Group'] != '<=50K'].index

Int64Index([    7,     8,     9,    10,    11,    14,    19,    20,    25,
               27,
            ...
            32530, 32532, 32533, 32536, 32538, 32539, 32545, 32554, 32557,
            32560],
           dtype='int64', length=7841)

In [41]:

temp = [0]*len(train)
for i in train[train['Income.Group'] != '<=50K'].index:
    temp[i] = 1
train['Income.Group'] = temp

In [42]:

train.head()

Out[42]:

	ID	Age	Workclass	Education	Marital.Status	Occupation	Relationship	Race	Sex	Hours.Per.Week	Native.Country
0	1	39	6	9	Never-married	Adm-clerical	Not-in-family	White	Male	40	United-States
1	2	50	5	9	Married-civ-spouse	Exec-managerial	Husband	White	Male	13	United-States
2	3	38	3	11	Divorced	Handlers-cleaners	Not-in-family	White	Male	40	United-States
3	4	53	3	1	Married-civ-spouse	Handlers-cleaners	Husband	Black	Male	40	United-States
4	5	28	3	9	Married-civ-spouse	Prof-specialty	Wife	Black	Female	40	Cuba

In [43]:

train.describe()

Out[43]:

	ID	Age	Workclass	Education	Hours.Per.Week	Income.Group
count	32561.000000	32561.000000	32561.000000	32561.000000	32561.000000	32561.000000
mean	16281.000000	38.581647	3.094438	10.298210	40.437456	0.240810
std	9399.695394	13.640433	1.107194	3.870264	12.347429	0.427581
min	1.000000	17.000000	0.000000	0.000000	1.000000	0.000000
25%	8141.000000	28.000000	3.000000	9.000000	40.000000	0.000000
50%	16281.000000	37.000000	3.000000	11.000000	40.000000	0.000000
75%	24421.000000	48.000000	3.000000	12.000000	45.000000	0.000000
max	32561.000000	90.000000	7.000000	15.000000	99.000000	1.000000

Marital Status¶

In [44]:

train['Marital.Status'].describe()

Out[44]:

count                  32561
unique                     7
top       Married-civ-spouse
freq                   14976
Name: Marital.Status, dtype: object

In [45]:

test['Marital.Status'].describe()

Out[45]:

count                  16281
unique                     7
top       Married-civ-spouse
freq                    7403
Name: Marital.Status, dtype: object

In [46]:

marital_status_levels= train['Marital.Status'].unique()
# Encode marital status variable
print marital_status_levels

['Never-married' 'Married-civ-spouse' 'Divorced' 'Married-spouse-absent'
 'Separated' 'Married-AF-spouse' 'Widowed']

In [47]:

train['Marital.Status'] = number.fit_transform(train['Marital.Status'])
test['Marital.Status'] = number.fit_transform(test['Marital.Status'])

In [48]:

new_marital_status_levels = train['Marital.Status'].unique()

In [49]:

new_marital_status_levels

Out[49]:

array([4, 2, 0, 3, 5, 1, 6])

In [50]:

fig = plt.figure(figsize=(4,4), dpi=1600)
plt.hist(train['Marital.Status'], bins = len(new_marital_status_levels))

fig = plt.figure(figsize=(4,4), dpi=1600)
plt.hist(test['Marital.Status'], bins = len(new_marital_status_levels))

Out[50]:

(array([ 2190.,    14.,  7403.,   210.,  5434.,   505.,   525.]),
 array([ 0.        ,  0.85714286,  1.71428571,  2.57142857,  3.42857143,
         4.28571429,  5.14285714,  6.        ]),
 <a list of 7 Patch objects>)

In [51]:

marital_level_status_decoder = dict(zip(new_marital_status_levels, marital_status_levels))
print marital_level_status_decoder

{0: 'Divorced', 1: 'Married-AF-spouse', 2: 'Married-civ-spouse', 3: 'Married-spouse-absent', 4: 'Never-married', 5: 'Separated', 6: 'Widowed'}

Occupation¶

In [52]:

occupation_levels = train.Occupation.unique()
print occupation_levels

['Adm-clerical' 'Exec-managerial' 'Handlers-cleaners' 'Prof-specialty'
 'Other-service' 'Sales' 'Craft-repair' 'Transport-moving'
 'Farming-fishing' 'Machine-op-inspct' 'Tech-support' 'Protective-serv'
 'Armed-Forces' 'Priv-house-serv']

In [53]:

train.Occupation = number.fit_transform(train.Occupation)
test.Occupation = number.fit_transform(test.Occupation)
new_occupation_levels = train.Occupation.unique()
new_occupation_levels

Out[53]:

array([ 0,  3,  5,  9,  7, 11,  2, 13,  4,  6, 12, 10,  1,  8])

In [54]:

new_occupation_levels_decoder = dict(zip(new_occupation_levels,occupation_levels))
new_occupation_levels_decoder

Out[54]:

{0: 'Adm-clerical',
 1: 'Armed-Forces',
 2: 'Craft-repair',
 3: 'Exec-managerial',
 4: 'Farming-fishing',
 5: 'Handlers-cleaners',
 6: 'Machine-op-inspct',
 7: 'Other-service',
 8: 'Priv-house-serv',
 9: 'Prof-specialty',
 10: 'Protective-serv',
 11: 'Sales',
 12: 'Tech-support',
 13: 'Transport-moving'}

In [55]:

plt.hist(train.Occupation, bins = len(new_occupation_levels))
plt.show()

plt.hist(test.Occupation, bins = len(new_occupation_levels))
plt.show()

Relationshop¶

In [56]:

relationship_levels = train.Relationship.unique()
print relationship_levels

['Not-in-family' 'Husband' 'Wife' 'Own-child' 'Unmarried' 'Other-relative']

In [57]:

train.Relationship = number.fit_transform(train.Relationship)
test.Relationship = number.fit_transform(test.Relationship)

In [58]:

new_relationship_levels = train.Relationship.unique()
relationship_levels_decoder = dict(zip(new_relationship_levels, relationship_levels))
relationship_levels_decoder

Out[58]:

{0: 'Husband',
 1: 'Not-in-family',
 2: 'Other-relative',
 3: 'Own-child',
 4: 'Unmarried',
 5: 'Wife'}

In [59]:

plt.hist(train.Relationship, bins=len(relationship_levels))
plt.show()

plt.hist(test.Relationship, bins=len(relationship_levels))
plt.show()

Race¶

In [60]:

race_levels = train.Race.unique()
print race_levels

['White' 'Black' 'Asian-Pac-Islander' 'Amer-Indian-Eskimo' 'Other']

In [61]:

train.Race = number.fit_transform(train.Race)
test.Race = number.fit_transform(test.Race)
new_race_levels = train.Race.unique()
print new_race_levels

[4 2 1 0 3]

In [62]:

race_levels_decoder = dict(zip(new_race_levels,race_levels))
race_levels_decoder

Out[62]:

{0: 'Amer-Indian-Eskimo',
 1: 'Asian-Pac-Islander',
 2: 'Black',
 3: 'Other',
 4: 'White'}

In [63]:

plt.hist(train.Race, bins = len(race_levels))
plt.show()

plt.hist(test.Race, bins = len(race_levels))
plt.show()

Sex¶

In [64]:

sex_levels = train.Sex.unique()
train.Sex = number.fit_transform(train.Sex)
test.Sex = number.fit_transform(test.Sex)
new_sex_levels = train.Sex.unique()
sex_levels_decoder = dict(zip(new_sex_levels, sex_levels))
sex_levels_decoder

Out[64]:

{0: 'Female', 1: 'Male'}

In [65]:

fig = plt.figure(figsize=(3,4), dpi=1600)
plt.hist(train.Sex, bins = 2)
plt.show()

fig = plt.figure(figsize=(3,4), dpi=1600)
plt.hist(test.Sex, bins = 2)
plt.show()

Native Country¶

In [66]:

native_country_levels = train['Native.Country'].unique()
train['Native.Country'] = number.fit_transform(train['Native.Country'])
test['Native.Country'] = number.fit_transform(test['Native.Country'])
new_native_country_levels = train['Native.Country'].unique()
native_country_levels_decoder = dict(zip(new_native_country_levels, native_country_levels))

In [67]:

native_country_levels_decoder

Out[67]:

{0: 'Cambodia',
 1: 'Canada',
 2: 'China',
 3: 'Columbia',
 4: 'Cuba',
 5: 'Dominican-Republic',
 6: 'Ecuador',
 7: 'El-Salvador',
 8: 'England',
 9: 'France',
 10: 'Germany',
 11: 'Greece',
 12: 'Guatemala',
 13: 'Haiti',
 14: 'Holand-Netherlands',
 15: 'Honduras',
 16: 'Hong',
 17: 'Hungary',
 18: 'India',
 19: 'Iran',
 20: 'Ireland',
 21: 'Italy',
 22: 'Jamaica',
 23: 'Japan',
 24: 'Laos',
 25: 'Mexico',
 26: 'Nicaragua',
 27: 'Outlying-US(Guam-USVI-etc)',
 28: 'Peru',
 29: 'Philippines',
 30: 'Poland',
 31: 'Portugal',
 32: 'Puerto-Rico',
 33: 'Scotland',
 34: 'South',
 35: 'Taiwan',
 36: 'Thailand',
 37: 'Trinadad&Tobago',
 38: 'United-States',
 39: 'Vietnam',
 40: 'Yugoslavia'}

In [68]:

plt.hist(train['Native.Country'], bins = 41)
plt.show()

plt.hist(test['Native.Country'], bins = 41)
plt.show()

This feature needs binning countries together. Question is, how best to do that?

In [69]:

pprint(native_country_levels_decoder)

{0: 'Cambodia',
 1: 'Canada',
 2: 'China',
 3: 'Columbia',
 4: 'Cuba',
 5: 'Dominican-Republic',
 6: 'Ecuador',
 7: 'El-Salvador',
 8: 'England',
 9: 'France',
 10: 'Germany',
 11: 'Greece',
 12: 'Guatemala',
 13: 'Haiti',
 14: 'Holand-Netherlands',
 15: 'Honduras',
 16: 'Hong',
 17: 'Hungary',
 18: 'India',
 19: 'Iran',
 20: 'Ireland',
 21: 'Italy',
 22: 'Jamaica',
 23: 'Japan',
 24: 'Laos',
 25: 'Mexico',
 26: 'Nicaragua',
 27: 'Outlying-US(Guam-USVI-etc)',
 28: 'Peru',
 29: 'Philippines',
 30: 'Poland',
 31: 'Portugal',
 32: 'Puerto-Rico',
 33: 'Scotland',
 34: 'South',
 35: 'Taiwan',
 36: 'Thailand',
 37: 'Trinadad&Tobago',
 38: 'United-States',
 39: 'Vietnam',
 40: 'Yugoslavia'}

Count the number of data points per country¶

In [70]:

country_observations_count_train = [len(train[train['Native.Country'] == i]) for i in range(41)]
country_observations_count_test = [len(test[test['Native.Country'] == i]) for i in range(41)]

print dict(zip([native_country_levels_decoder[i] for i in range(41)],country_observations_count_train))
print dict(zip([native_country_levels_decoder[i] for i in range(41)],country_observations_count_test))

{'Canada': 121, 'Hong': 20, 'Dominican-Republic': 70, 'Italy': 73, 'Peru': 31, 'Outlying-US(Guam-USVI-etc)': 14, 'Scotland': 12, 'Cambodia': 19, 'France': 29, 'Ireland': 24, 'Laos': 18, 'Thailand': 18, 'Ecuador': 28, 'Columbia': 59, 'Cuba': 95, 'Guatemala': 64, 'China': 75, 'Germany': 137, 'Haiti': 44, 'Poland': 60, 'United-States': 29753, 'Philippines': 198, 'Vietnam': 67, 'Japan': 62, 'Iran': 43, 'Honduras': 13, 'South': 80, 'Jamaica': 81, 'England': 90, 'Portugal': 37, 'Mexico': 643, 'El-Salvador': 106, 'India': 100, 'Puerto-Rico': 114, 'Yugoslavia': 16, 'Holand-Netherlands': 1, 'Trinadad&Tobago': 19, 'Greece': 29, 'Hungary': 13, 'Taiwan': 51, 'Nicaragua': 34}
{'Canada': 61, 'Hong': 6, 'Dominican-Republic': 33, 'Italy': 25, 'Peru': 97, 'Outlying-US(Guam-USVI-etc)': 15, 'Scotland': 35, 'Cambodia': 9, 'France': 9, 'Ireland': 32, 'Laos': 308, 'Thailand': 8, 'Ecuador': 17, 'Columbia': 26, 'Cuba': 43, 'Guatemala': 24, 'China': 47, 'Germany': 69, 'Haiti': 31, 'Poland': 30, 'United-States': 19, 'Philippines': 27, 'Vietnam': 7, 'Japan': 5, 'Iran': 13, 'Honduras': 10, 'South': 14, 'Jamaica': 30, 'England': 37, 'Portugal': 70, 'Mexico': 15, 'El-Salvador': 49, 'India': 16, 'Puerto-Rico': 9, 'Yugoslavia': 0, 'Holand-Netherlands': 7, 'Trinadad&Tobago': 14936, 'Greece': 20, 'Hungary': 51, 'Taiwan': 12, 'Nicaragua': 9}

let's not over_complicate. Let's club this as 2 bins. One for US and other non-US¶

In [71]:

# country_list = [0]*len(train['Native.Country'])
# US_index = list(train[train['Native.Country'] == 38].index)
# for i in US_index: country_list[i] = 1
# train['Native.Country'] = country_list

# country_list = [0]*len(test['Native.Country'])
# US_index = list(test[test['Native.Country'] == 38].index)
# for i in US_index: country_list[i] = 1
# test['Native.Country'] = country_list

In [72]:

test.head()

Out[72]:

	ID	Age	Workclass	Education	Marital.Status	Occupation	Relationship	Race	Sex	Hours.Per.Week	Native.Country
0	32562	25	3	1	4	6	3	2	1	40	37
1	32563	38	3	11	2	4	0	4	1	50	37
2	32564	28	1	7	2	10	0	4	1	40	37
3	32565	44	3	15	2	6	0	2	1	40	37
4	32566	18	3	15	4	9	3	4	0	30	37

Multivariate Analysis¶

In [73]:

sex_income_crosstab = pd.crosstab(train.Sex, train['Income.Group'], margins = True)
print sex_income_crosstab

Income.Group      0     1    All
Sex                             
0              9592  1179  10771
1             15128  6662  21790
All           24720  7841  32561

In [74]:

sex_income_crosstab.iloc[:-1,:-1]

Out[74]:

Income.Group	0	1
Sex
0	9592	1179
1	15128	6662

In [75]:

fig = plt.figure(figsize=(6,6), dpi=1600)
# fig, ax = plt.subplots()
sex_income_crosstab.iloc[:-1,:-1].plot(kind = 'barh', stacked = True, color = ['red','blue'], alpha = 0.65)

Out[75]:

<matplotlib.axes._subplots.AxesSubplot at 0x7f5d656d1e50>

<matplotlib.figure.Figure at 0x7f5d65b66710>

In [76]:

sex_income_crosstab

Out[76]:

Income.Group	0	1	All
Sex
0	9592	1179	10771
1	15128	6662	21790
All	24720	7841	32561

In [77]:

def percentConvert(x):
    return x / float(x[-1])

sex_income_crosstab.apply(percentConvert, axis = 0)

Out[77]:

Income.Group	0	1	All
Sex
0	0.388026	0.150363	0.330795
1	0.611974	0.849637	0.669205
All	1.000000	1.000000	1.000000

In [78]:

sex_income_crosstab.apply(percentConvert, axis = 1)

Out[78]:

Income.Group	0	1	All
Sex
0	0.890539	0.109461	1.0
1	0.694263	0.305737	1.0
All	0.759190	0.240810	1.0

In [79]:

plt.figure(figsize=(6,10), dpi=1600)

sex_income_crosstab.apply(percentConvert, axis = 1).iloc[:-1,:-1].plot(kind='barh', stacked = True, color = ['red','blue'], alpha = 0.65)
plt.title('Income Distribution by Sex')
plt.show()


sex_income_crosstab.apply(percentConvert, axis = 0).iloc[:-1,:-1].transpose().plot(kind='barh', stacked = True, color = ['red','blue'], alpha = 0.65)
plt.title('Percentage share of Male / Female in each Income Class')
plt.show()

<matplotlib.figure.Figure at 0x7f5d65686050>

In [80]:

test.plot('Age', 'Hours.Per.Week', kind = 'scatter')

Out[80]:

<matplotlib.axes._subplots.AxesSubplot at 0x7f5d65828810>

/home/anirudh/anaconda/lib/python2.7/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):

This shows no real relationship between Age and Hours-Per-Week. Even intuitively we were not expecting any specific trend so this is good. In other cases, you might figure out interesting trends which can be exploited.

In [81]:

train[train.Age > 40]['Income.Group'].mean()

Out[81]:

0.3535669121475861

Let's see how income classification changes solely based on age

In [82]:

ages = []
high_income_proportion = []
for age in range(17,91,3):
    ages.append(age)
    high_income_proportion.append(train[train.Age > age]['Income.Group'].mean())

In [83]:

plt.plot(ages, high_income_proportion)
plt.title('Proportion of people earning above $50K, given age')

Out[83]:

<matplotlib.text.Text at 0x7f5d648055d0>

In [84]:

train.boxplot(column='Hours.Per.Week', by= 'Sex')

Out[84]:

<matplotlib.axes._subplots.AxesSubplot at 0x7f5d65892810>

In [85]:

test.boxplot(column='Hours.Per.Week', by= 'Sex')

Out[85]:

<matplotlib.axes._subplots.AxesSubplot at 0x7f5d64a1dc10>

In [86]:

train.boxplot(column = 'Age', by = 'Income.Group')

Out[86]:

<matplotlib.axes._subplots.AxesSubplot at 0x7f5d6491fd90>

What is the percentage of males which have income <= 50K ?

In [87]:

1 - train[train.Sex == 1]['Income.Group'].mean()

Out[87]:

0.6942634235888022

Predictive Modelling¶

Model I - Decision Tree Classifier¶

In [88]:

from sklearn.tree import DecisionTreeClassifier

In [89]:

# define the predictors

dependent_variable = 'Income.Group'
independent_variable = [x for x in train.columns if x not in ['ID', dependent_variable]]

print independent_variable

# Inititate the algorithm
model_01 = DecisionTreeClassifier(max_depth = 10, min_samples_leaf = 100, max_features= 'sqrt')

# Fit the algorithm
model_01.fit(train[independent_variable], train[dependent_variable])

# Model predictions
model_01_predictions = model_01.predict(test[independent_variable])

# Make submission
submission = pd.read_csv('sample_submission.csv')
submission.ID = test.ID
income_group = ['<=50K']*len(test['ID'])
for i in range(len(model_01_predictions)):
    if model_01_predictions[i] == 1: 
        income_group[i] = '>50K'        
submission['Income.Group'] = income_group
submission.to_csv('submission_01.csv', index = False)

['Age', 'Workclass', 'Education', 'Marital.Status', 'Occupation', 'Relationship', 'Race', 'Sex', 'Hours.Per.Week', 'Native.Country']

Your score for this submission is : 0.826976229961.

Model II - Decision Tree Classifier with reduced number of features¶

In [90]:

from sklearn.tree import DecisionTreeClassifier

In [91]:

# define the predictors

dependent_variable = 'Income.Group'
independent_variable = ['Age', 'Education', 'Workclass', 'Sex', 'Hours.Per.Week']

print independent_variable

# Inititate the algorithm
model_02 = DecisionTreeClassifier(max_depth = 10, min_samples_leaf = 100, max_features= 'sqrt')

# Fit the algorithm
model_02.fit(train[independent_variable], train[dependent_variable])

# Model predictions
model_02_predictions = model_02.predict(test[independent_variable])

submission = pd.read_csv('sample_submission.csv')
submission.ID = test.ID
income_group = ['<=50K']*len(test['ID'])

for i in range(len(model_02_predictions)):
    if model_02_predictions[i] == 1: 
        income_group[i] = '>50K'
        
submission['Income.Group'] = income_group

submission.to_csv('submission_02.csv', index = False)

['Age', 'Education', 'Workclass', 'Sex', 'Hours.Per.Week']

Your score for this submission is : 0.806522940851.

Model III - Random Forest Classifier¶

In [92]:

from sklearn.ensemble import RandomForestClassifier

In [93]:

dependent_variable = 'Income.Group'
independent_variable = [x for x in train.columns if x not in ['ID', dependent_variable]]

# Inititate the algorithm
model_03 = RandomForestClassifier(max_depth = 10, min_samples_leaf = 10, max_features= 'auto')

# Fit the algorithm
model_03.fit(train[independent_variable], train[dependent_variable])

# Model predictions
model_03_predictions = model_03.predict(test[independent_variable])

# Make submission
submission = pd.read_csv('sample_submission.csv')
submission.ID = test.ID
income_group = ['<=50K']*len(test['ID'])
for i in range(len(model_03_predictions)):
    if model_03_predictions[i] == 1: 
        income_group[i] = '>50K'        
submission['Income.Group'] = income_group
submission.to_csv('submission_03.csv', index = False)

Your score for this submission is : 0.8359437381.

Model IV - K Nearest Neighbors Classifier¶

In [94]:

from sklearn.neighbors import KNeighborsClassifier

In [95]:

dependent_variable = 'Income.Group'
independent_variable = [x for x in train.columns if x not in ['ID', dependent_variable]]

# Inititate the algorithm
model_04 = KNeighborsClassifier(n_neighbors=10, leaf_size=1)

# Fit the algorithm
model_04.fit(train[independent_variable], train[dependent_variable])

# Model predictions
model_04_predictions = model_04.predict(test[independent_variable])

# Make submission
submission = pd.read_csv('sample_submission.csv')
submission.ID = test.ID
income_group = ['<=50K']*len(test['ID'])
for i in range(len(model_04_predictions)):
    if model_04_predictions[i] == 1: 
        income_group[i] = '>50K'        
submission['Income.Group'] = income_group
submission.to_csv('submission_04.csv', index = False)

Your score for this submission is : 0.811313801364.

Model V - Bagging¶

In [96]:

from sklearn.ensemble import BaggingClassifier

In [97]:

dependent_variable = 'Income.Group'
independent_variable = [x for x in train.columns if x not in ['ID', dependent_variable]]

# Inititate the algorithm
model_05 = BaggingClassifier()

# Fit the algorithm
model_05.fit(train[independent_variable], train[dependent_variable])

# Model predictions
model_05_predictions = model_05.predict(test[independent_variable])

# Make submission
submission = pd.read_csv('sample_submission.csv')
submission.ID = test.ID
income_group = ['<=50K']*len(test['ID'])
for i in range(len(model_05_predictions)):
    if model_05_predictions[i] == 1: 
        income_group[i] = '>50K'        
submission['Income.Group'] = income_group
submission.to_csv('submission_05.csv', index = False)

Model VI - Ada Boost¶

In [98]:

from sklearn.ensemble import AdaBoostClassifier

In [99]:

dependent_variable = 'Income.Group'
independent_variable = [x for x in train.columns if x not in ['ID', dependent_variable]]

# Inititate the algorithm
model_06 = AdaBoostClassifier(n_estimators=160, learning_rate= 0.5)

# Fit the algorithm
model_06.fit(train[independent_variable], train[dependent_variable])

# Model predictions
model_06_predictions = model_06.predict(test[independent_variable])

# Make submission
submission = pd.read_csv('sample_submission.csv')
submission.ID = test.ID
income_group = ['<=50K']*len(test['ID'])
for i in range(len(model_06_predictions)):
    if model_06_predictions[i] == 1: 
        income_group[i] = '>50K'        
submission['Income.Group'] = income_group
submission.to_csv('submission_06.csv', index = False)

Your score for this submission is : 0.836496529697.

Model VII - Gradient Boosting¶

In [100]:

from sklearn.ensemble import GradientBoostingClassifier

In [101]:

dependent_variable = 'Income.Group'
independent_variable = [x for x in train.columns if x not in ['ID', dependent_variable]]

# Inititate the algorithm
# model_07 = GradientBoostingClassifier(learning_rate=0.5, n_estimators=1700, max_depth=3)

model_07 = GradientBoostingClassifier(learning_rate=0.2, n_estimators=195, max_depth=3)

# Fit the algorithm
model_07.fit(train[independent_variable], train[dependent_variable])

# Model predictions
model_07_predictions = model_07.predict(test[independent_variable])

# Make submission
submission = pd.read_csv('sample_submission.csv')
submission.ID = test.ID
income_group = ['<=50K']*len(test['ID'])
for i in range(len(model_07_predictions)):
    if model_07_predictions[i] == 1: 
        income_group[i] = '>50K'        
submission['Income.Group'] = income_group
submission.to_csv('submission_07.csv', index = False)

Your score for this submission is : 0.841471654075.

Model VIII - Extra Trees Classifier¶

In [102]:

from sklearn.ensemble import ExtraTreesClassifier

In [103]:

dependent_variable = 'Income.Group'
independent_variable = [x for x in train.columns if x not in ['ID', dependent_variable]]

# Inititate the algorithm
model_08 = ExtraTreesClassifier()
    
# Fit the algorithm
model_08.fit(train[independent_variable], train[dependent_variable])

# Model predictions
model_08_predictions = model_08.predict(test[independent_variable])

# Make submission
submission = pd.read_csv('sample_submission.csv')
submission.ID = test.ID
income_group = ['<=50K']*len(test['ID'])
for i in range(len(model_08_predictions)):
    if model_08_predictions[i] == 1: 
        income_group[i] = '>50K'        
submission['Income.Group'] = income_group
submission.to_csv('submission_08.csv', index = False)

Your score for this submission is : 0.81057.