# This one really didn't do well. Submission 3 performed much better.
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
pd.set_option('display.max_columns', 50)
# pd.DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
# read_csv with index_col=0 is the documented equivalent (first column = Id index).
train = pd.read_csv('train.csv', index_col=0)
###
# Use numerical values for categorical data
def makeNumerical(input_df):
    """Return a copy of *input_df* with every non-int64 column re-coded as
    integer labels (0, 1, ...) assigned in order of first appearance.

    int64 columns are left untouched; NaN values never compare equal, so
    they remain NaN in the output (matching the original behavior).

    NOTE(review): labels depend on value order within the given frame, so
    encoding train and test separately can map the same category to
    different integers.
    """
    output_df = input_df.copy()
    non_numeric_cols = [col for col in output_df.columns
                        if output_df[col].dtype != 'int64']
    for col in non_numeric_cols:
        # enumerate replaces the original `xrange` index loop, which does
        # not exist in Python 3; labeling order is identical (pd.unique
        # returns values in order of first appearance).
        for i, val in enumerate(pd.unique(output_df[col])):
            output_df.loc[(output_df[col] == val), col] = i
    return output_df
# Encode categoricals, then pull out the feature matrix and target.
train_numerical = makeNumerical(train)
features = ['T1_V1','T1_V2','T1_V3','T2_V1','T2_V2']
X = train_numerical[features].copy().astype(float)
# First column of the frame is 'Hazard', the target.
y = train_numerical[train_numerical.columns[0]].copy().astype(float)
# DataFrame.sort() was removed in pandas 0.20; sort_values is the replacement.
# (Notebook display cell: shows the ten highest-hazard rows.)
train_numerical.sort_values('Hazard', ascending=False).head(10)
# Notebook cell output (kept for reference, commented out so the file parses):
# Hazard | T1_V1 | T1_V2 | T1_V3 | T1_V4 | T1_V5 | T1_V6 | T1_V7 | T1_V8 | T1_V9 | T1_V10 | T1_V11 | T1_V12 | T1_V13 | T1_V14 | T1_V15 | T1_V16 | T1_V17 | T2_V1 | T2_V2 | T2_V3 | T2_V4 | T2_V5 | T2_V6 | T2_V7 | T2_V8 | T2_V9 | T2_V10 | T2_V11 | T2_V12 | T2_V13 | T2_V14 | T2_V15 | |
# ---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
# Id | |||||||||||||||||||||||||||||||||
# 28023 | 69 | 13 | 20 | 3 | 0 | 2 | 1 | 0 | 3 | 2 | 7 | 1 | 0 | 10 | 1 | 0 | 1 | 0 | 50 | 11 | 1 | 8 | 0 | 2 | 31 | 1 | 1 | 5 | 0 | 0 | 1 | 3 | 5 |
# 73541 | 64 | 19 | 13 | 4 | 3 | 2 | 1 | 3 | 3 | 2 | 7 | 1 | 1 | 15 | 1 | 0 | 6 | 0 | 57 | 9 | 1 | 11 | 4 | 2 | 40 | 1 | 13 | 3 | 0 | 0 | 1 | 2 | 10 |
# 67188 | 63 | 19 | 7 | 1 | 0 | 0 | 1 | 0 | 3 | 0 | 3 | 0 | 0 | 20 | 1 | 0 | 8 | 0 | 15 | 7 | 0 | 6 | 0 | 2 | 28 | 1 | 1 | 3 | 0 | 0 | 1 | 6 | 1 |
# 61658 | 52 | 10 | 15 | 1 | 0 | 3 | 1 | 0 | 3 | 0 | 12 | 4 | 1 | 15 | 1 | 0 | 7 | 0 | 20 | 10 | 0 | 7 | 2 | 2 | 37 | 1 | 9 | 1 | 0 | 0 | 0 | 4 | 1 |
# 18251 | 51 | 18 | 20 | 9 | 5 | 2 | 0 | 0 | 0 | 4 | 8 | 4 | 0 | 15 | 0 | 2 | 8 | 0 | 40 | 26 | 0 | 16 | 4 | 1 | 25 | 1 | 1 | 3 | 1 | 0 | 2 | 1 | 1 |
# 47720 | 49 | 8 | 23 | 2 | 0 | 1 | 1 | 0 | 3 | 2 | 12 | 1 | 0 | 10 | 1 | 0 | 3 | 0 | 54 | 15 | 0 | 11 | 2 | 3 | 37 | 1 | 8 | 7 | 0 | 0 | 1 | 3 | 8 |
# 66305 | 46 | 9 | 5 | 4 | 3 | 3 | 1 | 0 | 3 | 0 | 3 | 0 | 0 | 10 | 1 | 0 | 0 | 0 | 84 | 12 | 1 | 14 | 2 | 2 | 22 | 1 | 11 | 2 | 0 | 0 | 1 | 2 | 3 |
# 24519 | 46 | 12 | 22 | 1 | 0 | 1 | 1 | 3 | 0 | 2 | 8 | 1 | 0 | 15 | 1 | 0 | 3 | 0 | 43 | 13 | 1 | 16 | 2 | 1 | 31 | 1 | 11 | 3 | 0 | 0 | 3 | 1 | 4 |
# 33278 | 44 | 10 | 6 | 3 | 0 | 1 | 0 | 0 | 3 | 3 | 3 | 1 | 0 | 15 | 1 | 0 | 1 | 0 | 38 | 15 | 1 | 4 | 2 | 2 | 31 | 1 | 18 | 3 | 1 | 0 | 2 | 4 | 3 |
# 22623 | 42 | 15 | 9 | 6 | 3 | 4 | 0 | 0 | 3 | 2 | 3 | 1 | 0 | 20 | 1 | 0 | 1 | 0 | 7 | 13 | 1 | 11 | 2 | 1 | 34 | 1 | 1 | 3 | 1 | 0 | 1 | 1 | 7 |
# Fit a distance-weighted 3-nearest-neighbors classifier on the features.
# NOTE(review): Hazard looks like an ordinal/continuous target — a regressor
# may be more appropriate; kept as a classifier to preserve behavior.
model_params = dict(n_neighbors=3, weights='distance')
knn = KNeighborsClassifier(**model_params)
knn.fit(X, y)
# Notebook cell output (commented out so the file parses):
# KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_neighbors=3, p=2, weights='distance')
# Load test df, predict on it.
# pd.DataFrame.from_csv was removed in pandas 1.0; read_csv(index_col=0)
# is the equivalent replacement.
test = makeNumerical(pd.read_csv('test.csv', index_col=0)[features])
# NOTE(review): encoding test independently of train means the same category
# can get different integer codes in the two frames — the encodings should be
# fit on train and reused here; likely hurts the model.
preds = knn.predict(test)
preds[:10]
# Notebook cell output (commented out so the file parses):
# array([ 4., 1., 8., 1., 1., 6., 1., 2., 1., 4.])
# Sanity check: residuals when predicting back on the training set itself
# (distinct values of predicted-minus-actual Hazard).
training_residuals = knn.predict(X) - y
pd.unique(training_residuals)

# Build the submission frame keyed by the test-set index (Id) and write it out.
submission = pd.DataFrame({'Hazard': preds}, index=test.index)
submission.to_csv("submission_4.csv", index=True)