import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
/home/anirudh/anaconda/lib/python2.7/site-packages/pandas/computation/__init__.py:19: UserWarning: The installed version of numexpr 2.4.4 is not supported in pandas and will be not be used UserWarning)
from pprint import *
import seaborn
seaborn.set()
:0: FutureWarning: IPython widgets are experimental and may change in the future.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
test.head()
ID | Age | Workclass | Education | Marital.Status | Occupation | Relationship | Race | Sex | Hours.Per.Week | Native.Country | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 32562 | 25 | Private | 11th | Never-married | Machine-op-inspct | Own-child | Black | Male | 40 | United-States |
1 | 32563 | 38 | Private | HS-grad | Married-civ-spouse | Farming-fishing | Husband | White | Male | 50 | United-States |
2 | 32564 | 28 | Local-gov | Assoc-acdm | Married-civ-spouse | Protective-serv | Husband | White | Male | 40 | United-States |
3 | 32565 | 44 | Private | Some-college | Married-civ-spouse | Machine-op-inspct | Husband | Black | Male | 40 | United-States |
4 | 32566 | 18 | NaN | Some-college | Never-married | NaN | Own-child | White | Female | 30 | United-States |
train.head()
ID | Age | Workclass | Education | Marital.Status | Occupation | Relationship | Race | Sex | Hours.Per.Week | Native.Country | Income.Group | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 39 | State-gov | Bachelors | Never-married | Adm-clerical | Not-in-family | White | Male | 40 | United-States | <=50K |
1 | 2 | 50 | Self-emp-not-inc | Bachelors | Married-civ-spouse | Exec-managerial | Husband | White | Male | 13 | United-States | <=50K |
2 | 3 | 38 | Private | HS-grad | Divorced | Handlers-cleaners | Not-in-family | White | Male | 40 | United-States | <=50K |
3 | 4 | 53 | Private | 11th | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | 40 | United-States | <=50K |
4 | 5 | 28 | Private | Bachelors | Married-civ-spouse | Prof-specialty | Wife | Black | Female | 40 | Cuba | <=50K |
# Count the missing values in every column of train.
train.isnull().sum()
ID 0 Age 0 Workclass 1836 Education 0 Marital.Status 0 Occupation 1843 Relationship 0 Race 0 Sex 0 Hours.Per.Week 0 Native.Country 583 Income.Group 0 dtype: int64
# Count the missing values in every column of test.
test.isnull().sum()
ID 0 Age 0 Workclass 963 Education 0 Marital.Status 0 Occupation 966 Relationship 0 Race 0 Sex 0 Hours.Per.Week 0 Native.Country 274 dtype: int64
from scipy.stats import mode
mode(train.Workclass)[0][0]
/home/anirudh/anaconda/lib/python2.7/site-packages/numpy/lib/arraysetops.py:200: FutureWarning: numpy not_equal will not check object identity in the future. The comparison did not return the same result as suggested by the identity (`is`)) and will change. flag = np.concatenate(([True], aux[1:] != aux[:-1]))
'Private'
# Index labels of the train rows whose Workclass is missing.
Workclass_Null_Indices = train[train.Workclass.isnull()].index
# Fill them with 'Private' in a single .loc assignment on the frame itself.
# The previous element-wise `train.Workclass[el] = ...` is chained assignment:
# it raises SettingWithCopyWarning and may write to a temporary copy.
train.loc[Workclass_Null_Indices, 'Workclass'] = 'Private'
/home/anirudh/anaconda/lib/python2.7/site-packages/IPython/kernel/__main__.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy from IPython.kernel.zmq import kernelapp as app
mode(test.Workclass)[0][0]
'Private'
# Index labels of the test rows whose Workclass is missing.
Workclass_Null_Indices_test = test[test.Workclass.isnull()].index
# Fill them with 'Private' in a single .loc assignment on the frame itself.
# The previous element-wise `test.Workclass[el] = ...` is chained assignment:
# it raises SettingWithCopyWarning and may write to a temporary copy.
test.loc[Workclass_Null_Indices_test, 'Workclass'] = 'Private'
/home/anirudh/anaconda/lib/python2.7/site-packages/IPython/kernel/__main__.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy from IPython.kernel.zmq import kernelapp as app
mode(train.Occupation)[0][0]
'Prof-specialty'
mode(test.Occupation)[0][0]
'Prof-specialty'
# Index labels of the train rows whose Occupation is missing.
Occupation_Null_Indices_train = train[train.Occupation.isnull()].index
# Fill them with 'Prof-specialty' in a single .loc assignment on the frame
# itself; element-wise chained assignment may write to a temporary copy.
train.loc[Occupation_Null_Indices_train, 'Occupation'] = 'Prof-specialty'
/home/anirudh/anaconda/lib/python2.7/site-packages/IPython/kernel/__main__.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy from IPython.kernel.zmq import kernelapp as app
# Index labels of the test rows whose Occupation is missing.
Occupation_Null_Indices_test = test[test.Occupation.isnull()].index
# Fill them with 'Prof-specialty' in a single .loc assignment on the frame
# itself; element-wise chained assignment may write to a temporary copy.
test.loc[Occupation_Null_Indices_test, 'Occupation'] = 'Prof-specialty'
/home/anirudh/anaconda/lib/python2.7/site-packages/IPython/kernel/__main__.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy from IPython.kernel.zmq import kernelapp as app
mode(train['Native.Country'])[0][0]
'United-States'
mode(test['Native.Country'])[0][0]
'United-States'
# Index labels of the rows whose Native.Country is missing, per frame.
NativeCountry_Null_Indices_train = train[train['Native.Country'].isnull()].index
NativeCountry_Null_Indices_test = test[test['Native.Country'].isnull()].index
# Fill them with 'United-States' using one .loc assignment per frame.
# The previous `frame['Native.Country'][el] = ...` is chained assignment:
# it raises SettingWithCopyWarning and may write to a temporary copy.
train.loc[NativeCountry_Null_Indices_train, 'Native.Country'] = 'United-States'
test.loc[NativeCountry_Null_Indices_test, 'Native.Country'] = 'United-States'
/home/anirudh/anaconda/lib/python2.7/site-packages/IPython/kernel/__main__.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy from IPython.kernel.zmq import kernelapp as app /home/anirudh/anaconda/lib/python2.7/site-packages/IPython/kernel/__main__.py:5: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
*Age*
plt.hist(train.Age, bins = 100)
plt.show()
plt.hist(test.Age, bins = 100)
plt.show()
train.Age.describe()
count 32561.000000 mean 38.581647 std 13.640433 min 17.000000 25% 28.000000 50% 37.000000 75% 48.000000 max 90.000000 Name: Age, dtype: float64
test.Age.describe()
count 16281.000000 mean 38.767459 std 13.849187 min 17.000000 25% 28.000000 50% 37.000000 75% 48.000000 max 90.000000 Name: Age, dtype: float64
*** 25% of the population is between 17 and 28***
*** 25% of the population is between 28 and 37***
*** 75% of the population is under 48 ***
*** Working Class ***
train.Workclass.describe()
count 32561 unique 8 top Private freq 24532 Name: Workclass, dtype: object
test.Workclass.describe()
count 16281 unique 8 top Private freq 12173 Name: Workclass, dtype: object
unique_working_classes = list(train.Workclass.unique())
unique_working_classes
['State-gov', 'Self-emp-not-inc', 'Private', 'Federal-gov', 'Local-gov', 'Self-emp-inc', 'Without-pay', 'Never-worked']
from sklearn.preprocessing import LabelEncoder
number = LabelEncoder()
# Fit once on the labels of BOTH frames so that a given class maps to the
# same integer in train and in test. Calling fit_transform on each frame
# separately builds two independent mappings, which disagree as soon as the
# two frames do not contain exactly the same set of classes.
number.fit(list(train.Workclass.astype('str')) + list(test.Workclass.astype('str')))
train.Workclass = number.transform(train.Workclass.astype('str'))
test.Workclass = number.transform(test.Workclass.astype('str'))
fig = plt.figure(figsize=(4,4), dpi=1600)
plt.hist(train.Workclass, bins = len(unique_working_classes))
plt.show()
fig = plt.figure(figsize=(4,4), dpi=1600)
plt.hist(test.Workclass, bins = len(unique_working_classes))
plt.show()
train.Workclass.unique()
array([6, 5, 3, 0, 1, 4, 7, 2])
# Workclass Decoder
workclass_decoder = dict(zip(train.Workclass.unique(),unique_working_classes))
workclass_decoder
{0: 'Federal-gov', 1: 'Local-gov', 2: 'Never-worked', 3: 'Private', 4: 'Self-emp-inc', 5: 'Self-emp-not-inc', 6: 'State-gov', 7: 'Without-pay'}
# # Combine bins based on some common sense
# new_workclass_encoder = {0:0,1:0,2:1,3:2,4:2,5:2,6:0,7:1}
# new_workclass_train = [new_workclass_encoder[i] for i in train.Workclass]
# train.Workclass = new_workclass_train
# new_workclass_test = [new_workclass_encoder[i] for i in test.Workclass]
# test.Workclass = new_workclass_test
# new_workclass_decoder = {0:'Government',1:'Never Worked / Without Pay / Information NA', 2:'Private'}
# new_workclass_decoder
# fig = plt.figure(figsize=(2,5), dpi=1600)
# plt.hist(train.Workclass, bins = 3)
# fig = plt.figure(figsize=(2,5), dpi=1600)
# plt.hist(test.Workclass, bins = 3)
*** Education ***
education_levels_train = train.Education.unique()
print education_levels_train
['Bachelors' 'HS-grad' '11th' 'Masters' '9th' 'Some-college' 'Assoc-acdm' 'Assoc-voc' '7th-8th' 'Doctorate' 'Prof-school' '5th-6th' '10th' '1st-4th' 'Preschool' '12th']
# Fit once on the labels of BOTH frames so that a given education level maps
# to the same integer in train and in test; separate fit_transform calls
# build two independent mappings.
number.fit(list(train.Education) + list(test.Education))
train.Education = number.transform(train.Education)
new_education_levels = train.Education.unique()
# Despite the name, this maps education label -> integer code. Both arrays
# come from unique() on train (before and after encoding), so they line up.
education_levels_decoder = dict(zip(education_levels_train, new_education_levels))
pprint(education_levels_decoder)
plt.hist(train.Education, bins = len(new_education_levels))
plt.show()
test.Education = number.transform(test.Education)
plt.hist(test.Education, bins = len(new_education_levels))
plt.show()
{'10th': 0, '11th': 1, '12th': 2, '1st-4th': 3, '5th-6th': 4, '7th-8th': 5, '9th': 6, 'Assoc-acdm': 7, 'Assoc-voc': 8, 'Bachelors': 9, 'Doctorate': 10, 'HS-grad': 11, 'Masters': 12, 'Preschool': 13, 'Prof-school': 14, 'Some-college': 15}
...it would make sense to club bins in a more sensible way
pprint(education_levels_decoder)
{'10th': 0, '11th': 1, '12th': 2, '1st-4th': 3, '5th-6th': 4, '7th-8th': 5, '9th': 6, 'Assoc-acdm': 7, 'Assoc-voc': 8, 'Bachelors': 9, 'Doctorate': 10, 'HS-grad': 11, 'Masters': 12, 'Preschool': 13, 'Prof-school': 14, 'Some-college': 15}
# new_education_levels_encoder = {0:1,1:1,2:2,3:0,4:0,5:0,6:0, 7:5, 8:5, 9:3, 10:7, 11:2, 12:4, 13:0, 14:6, 15:3}
# new_education_levels_train = [new_education_levels_encoder[i] for i in train.Education]
# train.Education = new_education_levels_train
# new_education_levels_test = [new_education_levels_encoder[i] for i in test.Education]
# test.Education = new_education_levels_test
# new_education_levels_decoder = {0:'< 9th', 1:'High School', 2:'High School Grad', 3:'Bachelors', 4:'Masters', 5:'Assoc-acdm / Assoc-voc', 6:'Prof School', 7:'Doctorate'}
# plt.hist(train.Education, bins = len(new_education_levels_decoder))
# plt.show()
# plt.hist(test.Education, bins = len(new_education_levels_decoder))
# plt.show()
# Rows for which Income is above $50K (Income.Group is anything other than '<=50K')
print train[train['Income.Group'] != '<=50K'].index
Int64Index([ 7, 8, 9, 10, 11, 14, 19, 20, 25, 27, ... 32530, 32532, 32533, 32536, 32538, 32539, 32545, 32554, 32557, 32560], dtype='int64', length=7841)
# Binarise the target: 1 where Income.Group is anything other than '<=50K',
# 0 otherwise. The comparison is vectorised and aligned on the index, so it
# does not rely on the index labels being the positions 0..len(train)-1
# (which the previous list-indexing loop silently assumed).
temp = (train['Income.Group'] != '<=50K').astype(int)
train['Income.Group'] = temp
train.head()
ID | Age | Workclass | Education | Marital.Status | Occupation | Relationship | Race | Sex | Hours.Per.Week | Native.Country | Income.Group | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 39 | 6 | 9 | Never-married | Adm-clerical | Not-in-family | White | Male | 40 | United-States | 0 |
1 | 2 | 50 | 5 | 9 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 13 | United-States | 0 |
2 | 3 | 38 | 3 | 11 | Divorced | Handlers-cleaners | Not-in-family | White | Male | 40 | United-States | 0 |
3 | 4 | 53 | 3 | 1 | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | 40 | United-States | 0 |
4 | 5 | 28 | 3 | 9 | Married-civ-spouse | Prof-specialty | Wife | Black | Female | 40 | Cuba | 0 |
train.describe()
ID | Age | Workclass | Education | Hours.Per.Week | Income.Group | |
---|---|---|---|---|---|---|
count | 32561.000000 | 32561.000000 | 32561.000000 | 32561.000000 | 32561.000000 | 32561.000000 |
mean | 16281.000000 | 38.581647 | 3.094438 | 10.298210 | 40.437456 | 0.240810 |
std | 9399.695394 | 13.640433 | 1.107194 | 3.870264 | 12.347429 | 0.427581 |
min | 1.000000 | 17.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 |
25% | 8141.000000 | 28.000000 | 3.000000 | 9.000000 | 40.000000 | 0.000000 |
50% | 16281.000000 | 37.000000 | 3.000000 | 11.000000 | 40.000000 | 0.000000 |
75% | 24421.000000 | 48.000000 | 3.000000 | 12.000000 | 45.000000 | 0.000000 |
max | 32561.000000 | 90.000000 | 7.000000 | 15.000000 | 99.000000 | 1.000000 |
train['Marital.Status'].describe()
count 32561 unique 7 top Married-civ-spouse freq 14976 Name: Marital.Status, dtype: object
test['Marital.Status'].describe()
count 16281 unique 7 top Married-civ-spouse freq 7403 Name: Marital.Status, dtype: object
marital_status_levels= train['Marital.Status'].unique()
# Encode marital status variable
print marital_status_levels
['Never-married' 'Married-civ-spouse' 'Divorced' 'Married-spouse-absent' 'Separated' 'Married-AF-spouse' 'Widowed']
# Fit once on the labels of BOTH frames so that a given marital status maps
# to the same integer in train and in test; separate fit_transform calls
# build two independent mappings.
number.fit(list(train['Marital.Status']) + list(test['Marital.Status']))
train['Marital.Status'] = number.transform(train['Marital.Status'])
test['Marital.Status'] = number.transform(test['Marital.Status'])
new_marital_status_levels = train['Marital.Status'].unique()
new_marital_status_levels
array([4, 2, 0, 3, 5, 1, 6])
fig = plt.figure(figsize=(4,4), dpi=1600)
plt.hist(train['Marital.Status'], bins = len(new_marital_status_levels))
fig = plt.figure(figsize=(4,4), dpi=1600)
plt.hist(test['Marital.Status'], bins = len(new_marital_status_levels))
(array([ 2190., 14., 7403., 210., 5434., 505., 525.]), array([ 0. , 0.85714286, 1.71428571, 2.57142857, 3.42857143, 4.28571429, 5.14285714, 6. ]), <a list of 7 Patch objects>)
marital_level_status_decoder = dict(zip(new_marital_status_levels, marital_status_levels))
print marital_level_status_decoder
{0: 'Divorced', 1: 'Married-AF-spouse', 2: 'Married-civ-spouse', 3: 'Married-spouse-absent', 4: 'Never-married', 5: 'Separated', 6: 'Widowed'}
occupation_levels = train.Occupation.unique()
print occupation_levels
['Adm-clerical' 'Exec-managerial' 'Handlers-cleaners' 'Prof-specialty' 'Other-service' 'Sales' 'Craft-repair' 'Transport-moving' 'Farming-fishing' 'Machine-op-inspct' 'Tech-support' 'Protective-serv' 'Armed-Forces' 'Priv-house-serv']
# Fit once on the labels of BOTH frames so that a given occupation maps to
# the same integer in train and in test; separate fit_transform calls build
# two independent mappings.
number.fit(list(train.Occupation) + list(test.Occupation))
train.Occupation = number.transform(train.Occupation)
test.Occupation = number.transform(test.Occupation)
new_occupation_levels = train.Occupation.unique()
new_occupation_levels
array([ 0, 3, 5, 9, 7, 11, 2, 13, 4, 6, 12, 10, 1, 8])
new_occupation_levels_decoder = dict(zip(new_occupation_levels,occupation_levels))
new_occupation_levels_decoder
{0: 'Adm-clerical', 1: 'Armed-Forces', 2: 'Craft-repair', 3: 'Exec-managerial', 4: 'Farming-fishing', 5: 'Handlers-cleaners', 6: 'Machine-op-inspct', 7: 'Other-service', 8: 'Priv-house-serv', 9: 'Prof-specialty', 10: 'Protective-serv', 11: 'Sales', 12: 'Tech-support', 13: 'Transport-moving'}
plt.hist(train.Occupation, bins = len(new_occupation_levels))
plt.show()
plt.hist(test.Occupation, bins = len(new_occupation_levels))
plt.show()
relationship_levels = train.Relationship.unique()
print relationship_levels
['Not-in-family' 'Husband' 'Wife' 'Own-child' 'Unmarried' 'Other-relative']
# Fit once on the labels of BOTH frames so that a given relationship maps to
# the same integer in train and in test; separate fit_transform calls build
# two independent mappings.
number.fit(list(train.Relationship) + list(test.Relationship))
train.Relationship = number.transform(train.Relationship)
test.Relationship = number.transform(test.Relationship)
new_relationship_levels = train.Relationship.unique()
relationship_levels_decoder = dict(zip(new_relationship_levels, relationship_levels))
relationship_levels_decoder
{0: 'Husband', 1: 'Not-in-family', 2: 'Other-relative', 3: 'Own-child', 4: 'Unmarried', 5: 'Wife'}
plt.hist(train.Relationship, bins=len(relationship_levels))
plt.show()
plt.hist(test.Relationship, bins=len(relationship_levels))
plt.show()
race_levels = train.Race.unique()
print race_levels
['White' 'Black' 'Asian-Pac-Islander' 'Amer-Indian-Eskimo' 'Other']
# Fit once on the labels of BOTH frames so that a given race maps to the
# same integer in train and in test; separate fit_transform calls build two
# independent mappings.
number.fit(list(train.Race) + list(test.Race))
train.Race = number.transform(train.Race)
test.Race = number.transform(test.Race)
new_race_levels = train.Race.unique()
print new_race_levels
[4 2 1 0 3]
race_levels_decoder = dict(zip(new_race_levels,race_levels))
race_levels_decoder
{0: 'Amer-Indian-Eskimo', 1: 'Asian-Pac-Islander', 2: 'Black', 3: 'Other', 4: 'White'}
plt.hist(train.Race, bins = len(race_levels))
plt.show()
plt.hist(test.Race, bins = len(race_levels))
plt.show()
sex_levels = train.Sex.unique()
# Fit once on the labels of BOTH frames so that a given sex maps to the same
# integer in train and in test; separate fit_transform calls build two
# independent mappings.
number.fit(list(train.Sex) + list(test.Sex))
train.Sex = number.transform(train.Sex)
test.Sex = number.transform(test.Sex)
new_sex_levels = train.Sex.unique()
sex_levels_decoder = dict(zip(new_sex_levels, sex_levels))
sex_levels_decoder
{0: 'Female', 1: 'Male'}
fig = plt.figure(figsize=(3,4), dpi=1600)
plt.hist(train.Sex, bins = 2)
plt.show()
fig = plt.figure(figsize=(3,4), dpi=1600)
plt.hist(test.Sex, bins = 2)
plt.show()
native_country_levels = train['Native.Country'].unique()
# Fit once on the labels of BOTH frames so that a given country maps to the
# same integer in train and in test. With separate fit_transform calls the
# two mappings drift apart whenever one frame lacks a country the other has,
# and codes in test then no longer mean what the train-derived decoder says.
number.fit(list(train['Native.Country']) + list(test['Native.Country']))
train['Native.Country'] = number.transform(train['Native.Country'])
test['Native.Country'] = number.transform(test['Native.Country'])
new_native_country_levels = train['Native.Country'].unique()
native_country_levels_decoder = dict(zip(new_native_country_levels, native_country_levels))
native_country_levels_decoder
{0: 'Cambodia', 1: 'Canada', 2: 'China', 3: 'Columbia', 4: 'Cuba', 5: 'Dominican-Republic', 6: 'Ecuador', 7: 'El-Salvador', 8: 'England', 9: 'France', 10: 'Germany', 11: 'Greece', 12: 'Guatemala', 13: 'Haiti', 14: 'Holand-Netherlands', 15: 'Honduras', 16: 'Hong', 17: 'Hungary', 18: 'India', 19: 'Iran', 20: 'Ireland', 21: 'Italy', 22: 'Jamaica', 23: 'Japan', 24: 'Laos', 25: 'Mexico', 26: 'Nicaragua', 27: 'Outlying-US(Guam-USVI-etc)', 28: 'Peru', 29: 'Philippines', 30: 'Poland', 31: 'Portugal', 32: 'Puerto-Rico', 33: 'Scotland', 34: 'South', 35: 'Taiwan', 36: 'Thailand', 37: 'Trinadad&Tobago', 38: 'United-States', 39: 'Vietnam', 40: 'Yugoslavia'}
plt.hist(train['Native.Country'], bins = 41)
plt.show()
plt.hist(test['Native.Country'], bins = 41)
plt.show()
This feature needs its countries binned together. The question is how best to do that.
pprint(native_country_levels_decoder)
{0: 'Cambodia', 1: 'Canada', 2: 'China', 3: 'Columbia', 4: 'Cuba', 5: 'Dominican-Republic', 6: 'Ecuador', 7: 'El-Salvador', 8: 'England', 9: 'France', 10: 'Germany', 11: 'Greece', 12: 'Guatemala', 13: 'Haiti', 14: 'Holand-Netherlands', 15: 'Honduras', 16: 'Hong', 17: 'Hungary', 18: 'India', 19: 'Iran', 20: 'Ireland', 21: 'Italy', 22: 'Jamaica', 23: 'Japan', 24: 'Laos', 25: 'Mexico', 26: 'Nicaragua', 27: 'Outlying-US(Guam-USVI-etc)', 28: 'Peru', 29: 'Philippines', 30: 'Poland', 31: 'Portugal', 32: 'Puerto-Rico', 33: 'Scotland', 34: 'South', 35: 'Taiwan', 36: 'Thailand', 37: 'Trinadad&Tobago', 38: 'United-States', 39: 'Vietnam', 40: 'Yugoslavia'}
# Number of rows per encoded country value (codes 0..40) in each frame.
country_observations_count_train = [
    int((train['Native.Country'] == code).sum()) for code in range(41)
]
country_observations_count_test = [
    int((test['Native.Country'] == code).sum()) for code in range(41)
]
# Country names in code order, looked up in the decoder built from train.
# NOTE(review): the test counts are labelled correctly only if test was
# encoded with the same label -> code mapping as train - confirm.
country_names_in_code_order = [native_country_levels_decoder[code] for code in range(41)]
print(dict(zip(country_names_in_code_order, country_observations_count_train)))
print(dict(zip(country_names_in_code_order, country_observations_count_test)))
{'Canada': 121, 'Hong': 20, 'Dominican-Republic': 70, 'Italy': 73, 'Peru': 31, 'Outlying-US(Guam-USVI-etc)': 14, 'Scotland': 12, 'Cambodia': 19, 'France': 29, 'Ireland': 24, 'Laos': 18, 'Thailand': 18, 'Ecuador': 28, 'Columbia': 59, 'Cuba': 95, 'Guatemala': 64, 'China': 75, 'Germany': 137, 'Haiti': 44, 'Poland': 60, 'United-States': 29753, 'Philippines': 198, 'Vietnam': 67, 'Japan': 62, 'Iran': 43, 'Honduras': 13, 'South': 80, 'Jamaica': 81, 'England': 90, 'Portugal': 37, 'Mexico': 643, 'El-Salvador': 106, 'India': 100, 'Puerto-Rico': 114, 'Yugoslavia': 16, 'Holand-Netherlands': 1, 'Trinadad&Tobago': 19, 'Greece': 29, 'Hungary': 13, 'Taiwan': 51, 'Nicaragua': 34} {'Canada': 61, 'Hong': 6, 'Dominican-Republic': 33, 'Italy': 25, 'Peru': 97, 'Outlying-US(Guam-USVI-etc)': 15, 'Scotland': 35, 'Cambodia': 9, 'France': 9, 'Ireland': 32, 'Laos': 308, 'Thailand': 8, 'Ecuador': 17, 'Columbia': 26, 'Cuba': 43, 'Guatemala': 24, 'China': 47, 'Germany': 69, 'Haiti': 31, 'Poland': 30, 'United-States': 19, 'Philippines': 27, 'Vietnam': 7, 'Japan': 5, 'Iran': 13, 'Honduras': 10, 'South': 14, 'Jamaica': 30, 'England': 37, 'Portugal': 70, 'Mexico': 15, 'El-Salvador': 49, 'India': 16, 'Puerto-Rico': 9, 'Yugoslavia': 0, 'Holand-Netherlands': 7, 'Trinadad&Tobago': 14936, 'Greece': 20, 'Hungary': 51, 'Taiwan': 12, 'Nicaragua': 9}
# country_list = [0]*len(train['Native.Country'])
# US_index = list(train[train['Native.Country'] == 38].index)
# for i in US_index: country_list[i] = 1
# train['Native.Country'] = country_list
# country_list = [0]*len(test['Native.Country'])
# US_index = list(test[test['Native.Country'] == 38].index)
# for i in US_index: country_list[i] = 1
# test['Native.Country'] = country_list
test.head()
ID | Age | Workclass | Education | Marital.Status | Occupation | Relationship | Race | Sex | Hours.Per.Week | Native.Country | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 32562 | 25 | 3 | 1 | 4 | 6 | 3 | 2 | 1 | 40 | 37 |
1 | 32563 | 38 | 3 | 11 | 2 | 4 | 0 | 4 | 1 | 50 | 37 |
2 | 32564 | 28 | 1 | 7 | 2 | 10 | 0 | 4 | 1 | 40 | 37 |
3 | 32565 | 44 | 3 | 15 | 2 | 6 | 0 | 2 | 1 | 40 | 37 |
4 | 32566 | 18 | 3 | 15 | 4 | 9 | 3 | 4 | 0 | 30 | 37 |
sex_income_crosstab = pd.crosstab(train.Sex, train['Income.Group'], margins = True)
print sex_income_crosstab
Income.Group 0 1 All Sex 0 9592 1179 10771 1 15128 6662 21790 All 24720 7841 32561
sex_income_crosstab.iloc[:-1,:-1]
Income.Group | 0 | 1 |
---|---|---|
Sex | ||
0 | 9592 | 1179 |
1 | 15128 | 6662 |
fig = plt.figure(figsize=(6,6), dpi=1600)
# fig, ax = plt.subplots()
sex_income_crosstab.iloc[:-1,:-1].plot(kind = 'barh', stacked = True, color = ['red','blue'], alpha = 0.65)
<matplotlib.axes._subplots.AxesSubplot at 0x7f5d656d1e50>
<matplotlib.figure.Figure at 0x7f5d65b66710>
sex_income_crosstab
Income.Group | 0 | 1 | All |
---|---|---|---|
Sex | |||
0 | 9592 | 1179 | 10771 |
1 | 15128 | 6662 | 21790 |
All | 24720 | 7841 | 32561 |
def percentConvert(x):
    """Divide every entry of a Series by its last entry.

    Intended for use with ``DataFrame.apply`` on a crosstab built with
    ``margins=True``, where the last entry of each row/column is the 'All'
    total, so the result is each cell's share of that total.

    The last entry is taken by position (``.iloc[-1]``). Plain ``x[-1]`` is a
    label lookup that only falls back to position on some index types and
    fails on an integer-labelled Series.

    Parameters
    ----------
    x : pandas.Series
        Values whose last element is the divisor.

    Returns
    -------
    pandas.Series
        ``x`` divided element-wise by ``float`` of its last element.
    """
    return x / float(x.iloc[-1])
sex_income_crosstab.apply(percentConvert, axis = 0)
Income.Group | 0 | 1 | All |
---|---|---|---|
Sex | |||
0 | 0.388026 | 0.150363 | 0.330795 |
1 | 0.611974 | 0.849637 | 0.669205 |
All | 1.000000 | 1.000000 | 1.000000 |
sex_income_crosstab.apply(percentConvert, axis = 1)
Income.Group | 0 | 1 | All |
---|---|---|---|
Sex | |||
0 | 0.890539 | 0.109461 | 1.0 |
1 | 0.694263 | 0.305737 | 1.0 |
All | 0.759190 | 0.240810 | 1.0 |
plt.figure(figsize=(6,10), dpi=1600)
sex_income_crosstab.apply(percentConvert, axis = 1).iloc[:-1,:-1].plot(kind='barh', stacked = True, color = ['red','blue'], alpha = 0.65)
plt.title('Income Distribution by Sex')
plt.show()
sex_income_crosstab.apply(percentConvert, axis = 0).iloc[:-1,:-1].transpose().plot(kind='barh', stacked = True, color = ['red','blue'], alpha = 0.65)
plt.title('Percentage share of Male / Female in each Income Class')
plt.show()
<matplotlib.figure.Figure at 0x7f5d65686050>
test.plot('Age', 'Hours.Per.Week', kind = 'scatter')
<matplotlib.axes._subplots.AxesSubplot at 0x7f5d65828810>
/home/anirudh/anaconda/lib/python2.7/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison if self._edgecolors == str('face'):
This shows no real relationship between Age and Hours-Per-Week. Even intuitively we were not expecting any specific trend so this is good. In other cases, you might figure out interesting trends which can be exploited.
train[train.Age > 40]['Income.Group'].mean()
0.3535669121475861
Let's see how income classification changes solely based on age
# Age thresholds: 17, 20, ..., 89.
ages = list(range(17, 91, 3))
# For each threshold, the mean of Income.Group over the train rows whose Age
# is strictly greater than that threshold.
high_income_proportion = [
    train[train.Age > threshold]['Income.Group'].mean() for threshold in ages
]
plt.plot(ages, high_income_proportion)
plt.title('Proportion of people earning above $50K, given age')
<matplotlib.text.Text at 0x7f5d648055d0>
train.boxplot(column='Hours.Per.Week', by= 'Sex')
<matplotlib.axes._subplots.AxesSubplot at 0x7f5d65892810>
test.boxplot(column='Hours.Per.Week', by= 'Sex')
<matplotlib.axes._subplots.AxesSubplot at 0x7f5d64a1dc10>
train.boxplot(column = 'Age', by = 'Income.Group')
<matplotlib.axes._subplots.AxesSubplot at 0x7f5d6491fd90>
What proportion of males have an income of <= 50K?
1 - train[train.Sex == 1]['Income.Group'].mean()
0.6942634235888022
from sklearn.tree import DecisionTreeClassifier
# Target column, and every other column except the row ID as predictors.
dependent_variable = 'Income.Group'
independent_variable = [
    column for column in train.columns
    if column != 'ID' and column != dependent_variable
]
print(independent_variable)

# Decision tree: build, fit on train, predict on test.
model_01 = DecisionTreeClassifier(max_depth=10, min_samples_leaf=100, max_features='sqrt')
model_01.fit(train[independent_variable], train[dependent_variable])
model_01_predictions = model_01.predict(test[independent_variable])

# Submission file: a predicted 1 is written as '>50K', anything else as '<=50K'.
submission = pd.read_csv('sample_submission.csv')
submission.ID = test.ID
income_group = ['>50K' if label == 1 else '<=50K' for label in model_01_predictions]
submission['Income.Group'] = income_group
submission.to_csv('submission_01.csv', index=False)
['Age', 'Workclass', 'Education', 'Marital.Status', 'Occupation', 'Relationship', 'Race', 'Sex', 'Hours.Per.Week', 'Native.Country']
Your score for this submission is : 0.826976229961.
from sklearn.tree import DecisionTreeClassifier
# Target column and a hand-picked subset of predictors.
dependent_variable = 'Income.Group'
independent_variable = ['Age', 'Education', 'Workclass', 'Sex', 'Hours.Per.Week']
print(independent_variable)

# Decision tree: build, fit on train, predict on test.
model_02 = DecisionTreeClassifier(max_depth=10, min_samples_leaf=100, max_features='sqrt')
model_02.fit(train[independent_variable], train[dependent_variable])
model_02_predictions = model_02.predict(test[independent_variable])

# Submission file: a predicted 1 is written as '>50K', anything else as '<=50K'.
submission = pd.read_csv('sample_submission.csv')
submission.ID = test.ID
income_group = ['>50K' if label == 1 else '<=50K' for label in model_02_predictions]
submission['Income.Group'] = income_group
submission.to_csv('submission_02.csv', index=False)
['Age', 'Education', 'Workclass', 'Sex', 'Hours.Per.Week']
Your score for this submission is : 0.806522940851.
from sklearn.ensemble import RandomForestClassifier
# Target column, and every other column except the row ID as predictors.
dependent_variable = 'Income.Group'
independent_variable = [x for x in train.columns if x not in ['ID', dependent_variable]]

# Random forest. max_features='sqrt' is what the former 'auto' setting meant
# for classifiers; 'auto' is deprecated and rejected by newer scikit-learn.
model_03 = RandomForestClassifier(max_depth=10, min_samples_leaf=10, max_features='sqrt')
model_03.fit(train[independent_variable], train[dependent_variable])
model_03_predictions = model_03.predict(test[independent_variable])

# Submission file: a predicted 1 is written as '>50K', anything else as '<=50K'.
submission = pd.read_csv('sample_submission.csv')
submission.ID = test.ID
income_group = ['>50K' if label == 1 else '<=50K' for label in model_03_predictions]
submission['Income.Group'] = income_group
submission.to_csv('submission_03.csv', index=False)
Your score for this submission is : 0.8359437381.
from sklearn.neighbors import KNeighborsClassifier
# Target column, and every other column except the row ID as predictors.
dependent_variable = 'Income.Group'
independent_variable = [
    column for column in train.columns
    if column != 'ID' and column != dependent_variable
]

# k-nearest-neighbours: build, fit on train, predict on test.
model_04 = KNeighborsClassifier(n_neighbors=10, leaf_size=1)
model_04.fit(train[independent_variable], train[dependent_variable])
model_04_predictions = model_04.predict(test[independent_variable])

# Submission file: a predicted 1 is written as '>50K', anything else as '<=50K'.
submission = pd.read_csv('sample_submission.csv')
submission.ID = test.ID
income_group = ['>50K' if label == 1 else '<=50K' for label in model_04_predictions]
submission['Income.Group'] = income_group
submission.to_csv('submission_04.csv', index=False)
Your score for this submission is : 0.811313801364.
from sklearn.ensemble import BaggingClassifier
# Target column, and every other column except the row ID as predictors.
dependent_variable = 'Income.Group'
independent_variable = [
    column for column in train.columns
    if column != 'ID' and column != dependent_variable
]

# Bagging classifier with library defaults: build, fit on train, predict on test.
model_05 = BaggingClassifier()
model_05.fit(train[independent_variable], train[dependent_variable])
model_05_predictions = model_05.predict(test[independent_variable])

# Submission file: a predicted 1 is written as '>50K', anything else as '<=50K'.
submission = pd.read_csv('sample_submission.csv')
submission.ID = test.ID
income_group = ['>50K' if label == 1 else '<=50K' for label in model_05_predictions]
submission['Income.Group'] = income_group
submission.to_csv('submission_05.csv', index=False)
from sklearn.ensemble import AdaBoostClassifier
# Target column, and every other column except the row ID as predictors.
dependent_variable = 'Income.Group'
independent_variable = [
    column for column in train.columns
    if column != 'ID' and column != dependent_variable
]

# AdaBoost: build, fit on train, predict on test.
model_06 = AdaBoostClassifier(n_estimators=160, learning_rate=0.5)
model_06.fit(train[independent_variable], train[dependent_variable])
model_06_predictions = model_06.predict(test[independent_variable])

# Submission file: a predicted 1 is written as '>50K', anything else as '<=50K'.
submission = pd.read_csv('sample_submission.csv')
submission.ID = test.ID
income_group = ['>50K' if label == 1 else '<=50K' for label in model_06_predictions]
submission['Income.Group'] = income_group
submission.to_csv('submission_06.csv', index=False)
Your score for this submission is : 0.836496529697.
from sklearn.ensemble import GradientBoostingClassifier
# Target column, and every other column except the row ID as predictors.
dependent_variable = 'Income.Group'
independent_variable = [
    column for column in train.columns
    if column != 'ID' and column != dependent_variable
]

# Gradient boosting: build, fit on train, predict on test.
# Earlier setting kept for reference: learning_rate=0.5, n_estimators=1700, max_depth=3
model_07 = GradientBoostingClassifier(learning_rate=0.2, n_estimators=195, max_depth=3)
model_07.fit(train[independent_variable], train[dependent_variable])
model_07_predictions = model_07.predict(test[independent_variable])

# Submission file: a predicted 1 is written as '>50K', anything else as '<=50K'.
submission = pd.read_csv('sample_submission.csv')
submission.ID = test.ID
income_group = ['>50K' if label == 1 else '<=50K' for label in model_07_predictions]
submission['Income.Group'] = income_group
submission.to_csv('submission_07.csv', index=False)
Your score for this submission is : 0.841471654075.
from sklearn.ensemble import ExtraTreesClassifier
# Target column, and every other column except the row ID as predictors.
dependent_variable = 'Income.Group'
independent_variable = [
    column for column in train.columns
    if column != 'ID' and column != dependent_variable
]

# Extra-trees classifier with library defaults: build, fit on train, predict on test.
model_08 = ExtraTreesClassifier()
model_08.fit(train[independent_variable], train[dependent_variable])
model_08_predictions = model_08.predict(test[independent_variable])

# Submission file: a predicted 1 is written as '>50K', anything else as '<=50K'.
submission = pd.read_csv('sample_submission.csv')
submission.ID = test.ID
income_group = ['>50K' if label == 1 else '<=50K' for label in model_08_predictions]
submission['Income.Group'] = income_group
submission.to_csv('submission_08.csv', index=False)
Your score for this submission is : 0.81057.