import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from pprint import *
import seaborn
seaborn.set()
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
test.head()
train.head()
train.apply(lambda x: sum(x.isnull()))
test.apply(lambda x: sum(x.isnull()))
from scipy.stats import mode
mode(train.Workclass)[0][0]
Workclass_Null_Indices = train[train.Workclass.isnull()].index
for el in Workclass_Null_Indices: train.Workclass[el] = 'Private'
mode(test.Workclass)[0][0]
Workclass_Null_Indices_test = test[test.Workclass.isnull()].index
for el in Workclass_Null_Indices_test: test.Workclass[el] = 'Private'
mode(train.Occupation)[0][0]
mode(test.Occupation)[0][0]
Occupation_Null_Indices_train = train[train.Occupation.isnull()].index
for el in Occupation_Null_Indices_train: train.Occupation[el] = 'Prof-specialty'
Occupation_Null_Indices_test = test[test.Occupation.isnull()].index
for el in Occupation_Null_Indices_test: test.Occupation[el] = 'Prof-specialty'
mode(train['Native.Country'])[0][0]
mode(test['Native.Country'])[0][0]
NativeCountry_Null_Indices_train = train[train['Native.Country'].isnull()].index
for el in NativeCountry_Null_Indices_train: train['Native.Country'][el] = 'United-States'
NativeCountry_Null_Indices_test = test[test['Native.Country'].isnull()].index
for el in NativeCountry_Null_Indices_test: test['Native.Country'][el] = 'United-States'
Age
plt.hist(train.Age, bins = 100)
plt.show()
plt.hist(test.Age, bins = 100)
plt.show()
train.Age.describe()
test.Age.describe()
25% of the population is between 17 and 28
25% of the population is between 28 and 37
75% of the population is under 48
Working Class
train.Workclass.describe()
test.Workclass.describe()
unique_working_classes = list(train.Workclass.unique())
unique_working_classes
from sklearn.preprocessing import LabelEncoder
number = LabelEncoder()
train.Workclass = number.fit_transform(train.Workclass.astype('str'))
test.Workclass = number.fit_transform(test.Workclass.astype('str'))
fig = plt.figure(figsize=(4,4), dpi=1600)
plt.hist(train.Workclass, bins = len(unique_working_classes))
plt.show()
fig = plt.figure(figsize=(4,4), dpi=1600)
plt.hist(test.Workclass, bins = len(unique_working_classes))
plt.show()
train.Workclass.unique()
# Workclass Decoder
workclass_decoder = dict(zip(train.Workclass.unique(),unique_working_classes))
workclass_decoder
# # Combine bins based on some common sense
# new_workclass_encoder = {0:0,1:0,2:1,3:2,4:2,5:2,6:0,7:1}
# new_workclass_train = [new_workclass_encoder[i] for i in train.Workclass]
# train.Workclass = new_workclass_train
# new_workclass_test = [new_workclass_encoder[i] for i in test.Workclass]
# test.Workclass = new_workclass_test
# new_workclass_decoder = {0:'Government',1:'Never Worked / Without Pay / Information NA', 2:'Private'}
# new_workclass_decoder
# fig = plt.figure(figsize=(2,5), dpi=1600)
# plt.hist(train.Workclass, bins = 3)
# fig = plt.figure(figsize=(2,5), dpi=1600)
# plt.hist(test.Workclass, bins = 3)
Education
education_levels_train = train.Education.unique()
print education_levels_train
train.Education = number.fit_transform(train.Education)
new_education_levels = train.Education.unique()
education_levels_decoder = dict(zip(education_levels_train, new_education_levels))
pprint(education_levels_decoder)
plt.hist(train.Education, bins = len(new_education_levels))
plt.show()
test.Education = number.fit_transform(test.Education)
plt.hist(test.Education, bins = len(new_education_levels))
plt.show()
...it would make sense to club bins in a more sensible way
pprint(education_levels_decoder)
# new_education_levels_encoder = {0:1,1:1,2:2,3:0,4:0,5:0,6:0, 7:5, 8:5, 9:3, 10:7, 11:2, 12:4, 13:0, 14:6, 15:3}
# new_education_levels_train = [new_education_levels_encoder[i] for i in train.Education]
# train.Education = new_education_levels_train
# new_education_levels_test = [new_education_levels_encoder[i] for i in test.Education]
# test.Education = new_education_levels_test
# new_education_levels_decoder = {0:'< 9th', 1:'High School', 2:'High School Grad', 3:'Bachelors', 4:'Masters', 5:'Assoc-acdm / Assoc-voc', 6:'Prof School', 7:'Doctorate'}
# plt.hist(train.Education, bins = len(new_education_levels_decoder))
# plt.show()
# plt.hist(test.Education, bins = len(new_education_levels_decoder))
# plt.show()
# Rows for which Income is under $50k
print train[train['Income.Group'] != '<=50K'].index
temp = [0]*len(train)
for i in train[train['Income.Group'] != '<=50K'].index:
temp[i] = 1
train['Income.Group'] = temp
train.head()
train.describe()
train['Marital.Status'].describe()
test['Marital.Status'].describe()
marital_status_levels= train['Marital.Status'].unique()
# Encode marital status variable
print marital_status_levels
train['Marital.Status'] = number.fit_transform(train['Marital.Status'])
test['Marital.Status'] = number.fit_transform(test['Marital.Status'])
new_marital_status_levels = train['Marital.Status'].unique()
new_marital_status_levels
fig = plt.figure(figsize=(4,4), dpi=1600)
plt.hist(train['Marital.Status'], bins = len(new_marital_status_levels))
fig = plt.figure(figsize=(4,4), dpi=1600)
plt.hist(test['Marital.Status'], bins = len(new_marital_status_levels))
marital_level_status_decoder = dict(zip(new_marital_status_levels, marital_status_levels))
print marital_level_status_decoder
occupation_levels = train.Occupation.unique()
print occupation_levels
train.Occupation = number.fit_transform(train.Occupation)
test.Occupation = number.fit_transform(test.Occupation)
new_occupation_levels = train.Occupation.unique()
new_occupation_levels
new_occupation_levels_decoder = dict(zip(new_occupation_levels,occupation_levels))
new_occupation_levels_decoder
plt.hist(train.Occupation, bins = len(new_occupation_levels))
plt.show()
plt.hist(test.Occupation, bins = len(new_occupation_levels))
plt.show()
relationship_levels = train.Relationship.unique()
print relationship_levels
train.Relationship = number.fit_transform(train.Relationship)
test.Relationship = number.fit_transform(test.Relationship)
new_relationship_levels = train.Relationship.unique()
relationship_levels_decoder = dict(zip(new_relationship_levels, relationship_levels))
relationship_levels_decoder
plt.hist(train.Relationship, bins=len(relationship_levels))
plt.show()
plt.hist(test.Relationship, bins=len(relationship_levels))
plt.show()
race_levels = train.Race.unique()
print race_levels
train.Race = number.fit_transform(train.Race)
test.Race = number.fit_transform(test.Race)
new_race_levels = train.Race.unique()
print new_race_levels
race_levels_decoder = dict(zip(new_race_levels,race_levels))
race_levels_decoder
plt.hist(train.Race, bins = len(race_levels))
plt.show()
plt.hist(test.Race, bins = len(race_levels))
plt.show()
sex_levels = train.Sex.unique()
train.Sex = number.fit_transform(train.Sex)
test.Sex = number.fit_transform(test.Sex)
new_sex_levels = train.Sex.unique()
sex_levels_decoder = dict(zip(new_sex_levels, sex_levels))
sex_levels_decoder
fig = plt.figure(figsize=(3,4), dpi=1600)
plt.hist(train.Sex, bins = 2)
plt.show()
fig = plt.figure(figsize=(3,4), dpi=1600)
plt.hist(test.Sex, bins = 2)
plt.show()
native_country_levels = train['Native.Country'].unique()
train['Native.Country'] = number.fit_transform(train['Native.Country'])
test['Native.Country'] = number.fit_transform(test['Native.Country'])
new_native_country_levels = train['Native.Country'].unique()
native_country_levels_decoder = dict(zip(new_native_country_levels, native_country_levels))
native_country_levels_decoder
plt.hist(train['Native.Country'], bins = 41)
plt.show()
plt.hist(test['Native.Country'], bins = 41)
plt.show()
This feature needs binning countries together. Question is, how best to do that?
pprint(native_country_levels_decoder)
country_observations_count_train = [len(train[train['Native.Country'] == i]) for i in range(41)]
country_observations_count_test = [len(test[test['Native.Country'] == i]) for i in range(41)]
print dict(zip([native_country_levels_decoder[i] for i in range(41)],country_observations_count_train))
print dict(zip([native_country_levels_decoder[i] for i in range(41)],country_observations_count_test))
# country_list = [0]*len(train['Native.Country'])
# US_index = list(train[train['Native.Country'] == 38].index)
# for i in US_index: country_list[i] = 1
# train['Native.Country'] = country_list
# country_list = [0]*len(test['Native.Country'])
# US_index = list(test[test['Native.Country'] == 38].index)
# for i in US_index: country_list[i] = 1
# test['Native.Country'] = country_list
test.head()
sex_income_crosstab = pd.crosstab(train.Sex, train['Income.Group'], margins = True)
print sex_income_crosstab
sex_income_crosstab.iloc[:-1,:-1]
fig = plt.figure(figsize=(6,6), dpi=1600)
# fig, ax = plt.subplots()
sex_income_crosstab.iloc[:-1,:-1].plot(kind = 'barh', stacked = True, color = ['red','blue'], alpha = 0.65)
sex_income_crosstab
def percentConvert(x):
return x / float(x[-1])
sex_income_crosstab.apply(percentConvert, axis = 0)
sex_income_crosstab.apply(percentConvert, axis = 1)
plt.figure(figsize=(6,10), dpi=1600)
sex_income_crosstab.apply(percentConvert, axis = 1).iloc[:-1,:-1].plot(kind='barh', stacked = True, color = ['red','blue'], alpha = 0.65)
plt.title('Income Distribution by Sex')
plt.show()
sex_income_crosstab.apply(percentConvert, axis = 0).iloc[:-1,:-1].transpose().plot(kind='barh', stacked = True, color = ['red','blue'], alpha = 0.65)
plt.title('Percentage share of Male / Female in each Income Class')
plt.show()
test.plot('Age', 'Hours.Per.Week', kind = 'scatter')
This shows no real relationship between Age and Hours-Per-Week. Even intuitively we were not expecting any specific trend so this is good. In other cases, you might figure out interesting trends which can be exploited.
train[train.Age > 40]['Income.Group'].mean()
Let's see how income classification changes solely based on age
ages = []
high_income_proportion = []
for age in range(17,91,3):
ages.append(age)
high_income_proportion.append(train[train.Age > age]['Income.Group'].mean())
plt.plot(ages, high_income_proportion)
plt.title('Proportion of people earning above $50K, given age')
train.boxplot(column='Hours.Per.Week', by= 'Sex')
test.boxplot(column='Hours.Per.Week', by= 'Sex')
train.boxplot(column = 'Age', by = 'Income.Group')
What is the percentage of males which have income <= 50K ?
1 - train[train.Sex == 1]['Income.Group'].mean()
from sklearn.tree import DecisionTreeClassifier
# define the predictors
dependent_variable = 'Income.Group'
independent_variable = [x for x in train.columns if x not in ['ID', dependent_variable]]
print independent_variable
# Inititate the algorithm
model_01 = DecisionTreeClassifier(max_depth = 10, min_samples_leaf = 100, max_features= 'sqrt')
# Fit the algorithm
model_01.fit(train[independent_variable], train[dependent_variable])
# Model predictions
model_01_predictions = model_01.predict(test[independent_variable])
# Make submission
submission = pd.read_csv('sample_submission.csv')
submission.ID = test.ID
income_group = ['<=50K']*len(test['ID'])
for i in range(len(model_01_predictions)):
if model_01_predictions[i] == 1:
income_group[i] = '>50K'
submission['Income.Group'] = income_group
submission.to_csv('submission_01.csv', index = False)
Your score for this submission is : 0.826976229961.
from sklearn.tree import DecisionTreeClassifier
# define the predictors
dependent_variable = 'Income.Group'
independent_variable = ['Age', 'Education', 'Workclass', 'Sex', 'Hours.Per.Week']
print independent_variable
# Inititate the algorithm
model_02 = DecisionTreeClassifier(max_depth = 10, min_samples_leaf = 100, max_features= 'sqrt')
# Fit the algorithm
model_02.fit(train[independent_variable], train[dependent_variable])
# Model predictions
model_02_predictions = model_02.predict(test[independent_variable])
submission = pd.read_csv('sample_submission.csv')
submission.ID = test.ID
income_group = ['<=50K']*len(test['ID'])
for i in range(len(model_02_predictions)):
if model_02_predictions[i] == 1:
income_group[i] = '>50K'
submission['Income.Group'] = income_group
submission.to_csv('submission_02.csv', index = False)
Your score for this submission is : 0.806522940851.
from sklearn.ensemble import RandomForestClassifier
dependent_variable = 'Income.Group'
independent_variable = [x for x in train.columns if x not in ['ID', dependent_variable]]
# Inititate the algorithm
model_03 = RandomForestClassifier(max_depth = 10, min_samples_leaf = 10, max_features= 'auto')
# Fit the algorithm
model_03.fit(train[independent_variable], train[dependent_variable])
# Model predictions
model_03_predictions = model_03.predict(test[independent_variable])
# Make submission
submission = pd.read_csv('sample_submission.csv')
submission.ID = test.ID
income_group = ['<=50K']*len(test['ID'])
for i in range(len(model_03_predictions)):
if model_03_predictions[i] == 1:
income_group[i] = '>50K'
submission['Income.Group'] = income_group
submission.to_csv('submission_03.csv', index = False)
Your score for this submission is : 0.8359437381.
from sklearn.neighbors import KNeighborsClassifier
dependent_variable = 'Income.Group'
independent_variable = [x for x in train.columns if x not in ['ID', dependent_variable]]
# Inititate the algorithm
model_04 = KNeighborsClassifier(n_neighbors=10, leaf_size=1)
# Fit the algorithm
model_04.fit(train[independent_variable], train[dependent_variable])
# Model predictions
model_04_predictions = model_04.predict(test[independent_variable])
# Make submission
submission = pd.read_csv('sample_submission.csv')
submission.ID = test.ID
income_group = ['<=50K']*len(test['ID'])
for i in range(len(model_04_predictions)):
if model_04_predictions[i] == 1:
income_group[i] = '>50K'
submission['Income.Group'] = income_group
submission.to_csv('submission_04.csv', index = False)
Your score for this submission is : 0.811313801364.
from sklearn.ensemble import BaggingClassifier
dependent_variable = 'Income.Group'
independent_variable = [x for x in train.columns if x not in ['ID', dependent_variable]]
# Inititate the algorithm
model_05 = BaggingClassifier()
# Fit the algorithm
model_05.fit(train[independent_variable], train[dependent_variable])
# Model predictions
model_05_predictions = model_05.predict(test[independent_variable])
# Make submission
submission = pd.read_csv('sample_submission.csv')
submission.ID = test.ID
income_group = ['<=50K']*len(test['ID'])
for i in range(len(model_05_predictions)):
if model_05_predictions[i] == 1:
income_group[i] = '>50K'
submission['Income.Group'] = income_group
submission.to_csv('submission_05.csv', index = False)
from sklearn.ensemble import AdaBoostClassifier
dependent_variable = 'Income.Group'
independent_variable = [x for x in train.columns if x not in ['ID', dependent_variable]]
# Inititate the algorithm
model_06 = AdaBoostClassifier(n_estimators=160, learning_rate= 0.5)
# Fit the algorithm
model_06.fit(train[independent_variable], train[dependent_variable])
# Model predictions
model_06_predictions = model_06.predict(test[independent_variable])
# Make submission
submission = pd.read_csv('sample_submission.csv')
submission.ID = test.ID
income_group = ['<=50K']*len(test['ID'])
for i in range(len(model_06_predictions)):
if model_06_predictions[i] == 1:
income_group[i] = '>50K'
submission['Income.Group'] = income_group
submission.to_csv('submission_06.csv', index = False)
Your score for this submission is : 0.836496529697.
from sklearn.ensemble import GradientBoostingClassifier
dependent_variable = 'Income.Group'
independent_variable = [x for x in train.columns if x not in ['ID', dependent_variable]]
# Inititate the algorithm
# model_07 = GradientBoostingClassifier(learning_rate=0.5, n_estimators=1700, max_depth=3)
model_07 = GradientBoostingClassifier(learning_rate=0.2, n_estimators=195, max_depth=3)
# Fit the algorithm
model_07.fit(train[independent_variable], train[dependent_variable])
# Model predictions
model_07_predictions = model_07.predict(test[independent_variable])
# Make submission
submission = pd.read_csv('sample_submission.csv')
submission.ID = test.ID
income_group = ['<=50K']*len(test['ID'])
for i in range(len(model_07_predictions)):
if model_07_predictions[i] == 1:
income_group[i] = '>50K'
submission['Income.Group'] = income_group
submission.to_csv('submission_07.csv', index = False)
Your score for this submission is : 0.841471654075.
from sklearn.ensemble import ExtraTreesClassifier
dependent_variable = 'Income.Group'
independent_variable = [x for x in train.columns if x not in ['ID', dependent_variable]]
# Inititate the algorithm
model_08 = ExtraTreesClassifier()
# Fit the algorithm
model_08.fit(train[independent_variable], train[dependent_variable])
# Model predictions
model_08_predictions = model_08.predict(test[independent_variable])
# Make submission
submission = pd.read_csv('sample_submission.csv')
submission.ID = test.ID
income_group = ['<=50K']*len(test['ID'])
for i in range(len(model_08_predictions)):
if model_08_predictions[i] == 1:
income_group[i] = '>50K'
submission['Income.Group'] = income_group
submission.to_csv('submission_08.csv', index = False)
Your score for this submission is : 0.81057.