%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
pd.set_option('display.max_columns', 50)
train = pd.DataFrame.from_csv('train.csv')
train.head()
Hazard | T1_V1 | T1_V2 | T1_V3 | T1_V4 | T1_V5 | T1_V6 | T1_V7 | T1_V8 | T1_V9 | T1_V10 | T1_V11 | T1_V12 | T1_V13 | T1_V14 | T1_V15 | T1_V16 | T1_V17 | T2_V1 | T2_V2 | T2_V3 | T2_V4 | T2_V5 | T2_V6 | T2_V7 | T2_V8 | T2_V9 | T2_V10 | T2_V11 | T2_V12 | T2_V13 | T2_V14 | T2_V15 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Id | |||||||||||||||||||||||||||||||||
1 | 1 | 15 | 3 | 2 | N | B | N | B | B | D | 7 | B | B | 15 | 1 | A | B | N | 36 | 11 | N | 10 | B | 2 | 37 | 1 | 11 | 6 | Y | N | E | 2 | 2 |
2 | 4 | 16 | 14 | 5 | H | B | N | B | B | C | 12 | B | B | 10 | 3 | A | B | Y | 78 | 10 | Y | 17 | C | 2 | 22 | 1 | 18 | 5 | Y | Y | E | 2 | 1 |
3 | 1 | 10 | 10 | 5 | N | K | N | B | B | E | 12 | H | B | 15 | 1 | A | R | Y | 71 | 21 | Y | 13 | C | 6 | 37 | 2 | 14 | 6 | Y | Y | E | 6 | 1 |
4 | 1 | 18 | 18 | 5 | N | K | N | B | B | E | 3 | H | B | 15 | 1 | A | R | N | 71 | 13 | N | 15 | A | 2 | 25 | 1 | 1 | 6 | Y | N | C | 2 | 6 |
5 | 1 | 13 | 19 | 5 | N | H | N | B | B | E | 7 | H | B | 10 | 1 | A | J | N | 75 | 10 | Y | 11 | B | 1 | 22 | 1 | 2 | 7 | N | N | E | 1 | 1 |
# Gonna go ahead and make some training data for almost all of the columns
def makeNumerical(input_df):
    """Return a copy of *input_df* with every non-int64 column replaced by
    integer codes (0, 1, 2, ... in order of first appearance).

    The original frame is not modified.  Note: codes are assigned per-frame,
    so calling this separately on train and test can map the same category
    to different integers — confirm that is acceptable before modeling.
    """
    output_df = input_df.copy()
    for col in output_df.columns:
        if output_df[col].dtype != 'int64':
            # factorize assigns codes by first appearance, exactly the
            # order pd.unique() would produce; it also replaces the old
            # Python-2-only xrange loop (NameError on Python 3) and the
            # O(rows * uniques) repeated boolean masking.
            codes, _ = pd.factorize(output_df[col])
            output_df[col] = codes
    return output_df
# Encode the categorical columns as integers and preview the result.
train_numeric = makeNumerical(train)
train_numeric.head(n=5)
Hazard | T1_V1 | T1_V2 | T1_V3 | T1_V4 | T1_V5 | T1_V6 | T1_V7 | T1_V8 | T1_V9 | T1_V10 | T1_V11 | T1_V12 | T1_V13 | T1_V14 | T1_V15 | T1_V16 | T1_V17 | T2_V1 | T2_V2 | T2_V3 | T2_V4 | T2_V5 | T2_V6 | T2_V7 | T2_V8 | T2_V9 | T2_V10 | T2_V11 | T2_V12 | T2_V13 | T2_V14 | T2_V15 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Id | |||||||||||||||||||||||||||||||||
1 | 1 | 15 | 3 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | 0 | 0 | 15 | 1 | 0 | 0 | 0 | 36 | 11 | 0 | 10 | 0 | 2 | 37 | 1 | 11 | 6 | 0 | 0 | 0 | 2 | 2 |
2 | 4 | 16 | 14 | 5 | 1 | 0 | 0 | 0 | 0 | 1 | 12 | 0 | 0 | 10 | 3 | 0 | 0 | 1 | 78 | 10 | 1 | 17 | 1 | 2 | 22 | 1 | 18 | 5 | 0 | 1 | 0 | 2 | 1 |
3 | 1 | 10 | 10 | 5 | 0 | 1 | 0 | 0 | 0 | 2 | 12 | 1 | 0 | 15 | 1 | 0 | 1 | 1 | 71 | 21 | 1 | 13 | 1 | 6 | 37 | 2 | 14 | 6 | 0 | 1 | 0 | 6 | 1 |
4 | 1 | 18 | 18 | 5 | 0 | 1 | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 15 | 1 | 0 | 1 | 0 | 71 | 13 | 0 | 15 | 2 | 2 | 25 | 1 | 1 | 6 | 0 | 0 | 1 | 2 | 6 |
5 | 1 | 13 | 19 | 5 | 0 | 2 | 0 | 0 | 0 | 2 | 7 | 1 | 0 | 10 | 1 | 0 | 2 | 0 | 75 | 10 | 1 | 11 | 0 | 1 | 22 | 1 | 2 | 7 | 1 | 0 | 0 | 1 | 1 |
# Sanity check: the original `train` frame still holds the raw letter
# categories — makeNumerical worked on a copy, not in place.
train.head()
Hazard | T1_V1 | T1_V2 | T1_V3 | T1_V4 | T1_V5 | T1_V6 | T1_V7 | T1_V8 | T1_V9 | T1_V10 | T1_V11 | T1_V12 | T1_V13 | T1_V14 | T1_V15 | T1_V16 | T1_V17 | T2_V1 | T2_V2 | T2_V3 | T2_V4 | T2_V5 | T2_V6 | T2_V7 | T2_V8 | T2_V9 | T2_V10 | T2_V11 | T2_V12 | T2_V13 | T2_V14 | T2_V15 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Id | |||||||||||||||||||||||||||||||||
1 | 1 | 15 | 3 | 2 | N | B | N | B | B | D | 7 | B | B | 15 | 1 | A | B | N | 36 | 11 | N | 10 | B | 2 | 37 | 1 | 11 | 6 | Y | N | E | 2 | 2 |
2 | 4 | 16 | 14 | 5 | H | B | N | B | B | C | 12 | B | B | 10 | 3 | A | B | Y | 78 | 10 | Y | 17 | C | 2 | 22 | 1 | 18 | 5 | Y | Y | E | 2 | 1 |
3 | 1 | 10 | 10 | 5 | N | K | N | B | B | E | 12 | H | B | 15 | 1 | A | R | Y | 71 | 21 | Y | 13 | C | 6 | 37 | 2 | 14 | 6 | Y | Y | E | 6 | 1 |
4 | 1 | 18 | 18 | 5 | N | K | N | B | B | E | 3 | H | B | 15 | 1 | A | R | N | 71 | 13 | N | 15 | A | 2 | 25 | 1 | 1 | 6 | Y | N | C | 2 | 6 |
5 | 1 | 13 | 19 | 5 | N | H | N | B | B | E | 7 | H | B | 10 | 1 | A | J | N | 75 | 10 | Y | 11 | B | 1 | 22 | 1 | 2 | 7 | N | N | E | 1 | 1 |
from sklearn import metrics

# Everything after the first column ('Hazard') is a feature; Hazard is the target.
features = train_numeric.columns[1:]
y = train_numeric['Hazard']

# NOTE(review): Hazard looks like an ordinal score, so a regressor would
# normally fit better than a classifier — keeping the classifier to
# preserve the original behavior.
clf = RandomForestClassifier()
clf.fit(train_numeric[features], y)

# DataFrame.from_csv was deprecated/removed; read_csv(index_col=0) keeps
# 'Id' as the index just like the training frame.
# NOTE(review): makeNumerical encodes the test set independently of the
# train set, so the same category can get a different code in each —
# verify this is intended before trusting the predictions.
test = makeNumerical(pd.read_csv('test.csv', index_col=0))
test[features].head()

preds = clf.predict(test[features])
Id = test.index
out_df = pd.DataFrame(data=preds, index=Id, columns=['Hazard'])
out_df.head()
Hazard | |
---|---|
Id | |
6 | 1 |
7 | 1 |
8 | 4 |
9 | 1 |
10 | 1 |
# Persist the predictions with the 'Id' index as the first column.
out_df.to_csv("submission_2.csv", index=True)

# Load both submissions for comparison.  DataFrame.from_csv was
# deprecated/removed; read_csv(index_col=0) restores 'Id' as the index.
sub_1 = pd.read_csv('submission_1.csv', index_col=0)
sub_2 = pd.read_csv('submission_2.csv', index_col=0)
sub_2.columns = ['Hazard']
sub_2.head()

# NOTE(review): this rewrites the same file produced above after a CSV
# round-trip — redundant, but kept so the on-disk result is unchanged.
sub_2.to_csv("submission_2.csv", index=True)