%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
pd.set_option('display.max_columns', 50)
# Load the training data, using the first CSV column as the index.
# `DataFrame.from_csv` was deprecated and later removed from pandas;
# `read_csv(..., index_col=0)` is the supported replacement.
df = pd.read_csv('train.csv', index_col=0)

# NOTE(review): this line was originally the incomplete expression `df[]`
# (a syntax error). The intended column selection is unknown, so the name is
# bound to the full frame for now -- confirm what subset was meant.
train_df = df

# Target column followed by the feature columns kept for modelling.
selected_columns = [
    'Hazard',
    'T1_V1', 'T1_V2', 'T1_V3', 'T1_V10', 'T1_V13', 'T1_V14',
    'T2_V1', 'T2_V2', 'T2_V4',
]

# Take an explicit copy so that adding the 'Id' column below modifies an
# independent frame rather than a slice of `df` (avoids SettingWithCopyWarning).
train_dat = df[selected_columns].copy()
train_dat['Id'] = train_dat.index

# Reorder so that 'Id' and 'Hazard' are the first two columns; the model cell
# relies on this when it takes `columns[2:]` as the feature list.
train_dat1 = train_dat[['Id'] + selected_columns]
train_dat1
| | Id | Hazard | T1_V1 | T1_V2 | T1_V3 | T1_V10 | T1_V13 | T1_V14 | T2_V1 | T2_V2 | T2_V4 |
---|---|---|---|---|---|---|---|---|---|---|---|
Id | |||||||||||
1 | 1 | 1 | 15 | 3 | 2 | 7 | 15 | 1 | 36 | 11 | 10 |
2 | 2 | 4 | 16 | 14 | 5 | 12 | 10 | 3 | 78 | 10 | 17 |
3 | 3 | 1 | 10 | 10 | 5 | 12 | 15 | 1 | 71 | 21 | 13 |
4 | 4 | 1 | 18 | 18 | 5 | 3 | 15 | 1 | 71 | 13 | 15 |
5 | 5 | 1 | 13 | 19 | 5 | 7 | 10 | 1 | 75 | 10 | 11 |
12 | 12 | 1 | 14 | 12 | 2 | 12 | 15 | 1 | 65 | 10 | 14 |
15 | 15 | 5 | 8 | 17 | 1 | 8 | 20 | 1 | 100 | 14 | 16 |
19 | 19 | 1 | 14 | 20 | 4 | 3 | 15 | 1 | 83 | 13 | 5 |
21 | 21 | 1 | 8 | 2 | 2 | 8 | 5 | 1 | 20 | 12 | 4 |
22 | 22 | 1 | 5 | 4 | 3 | 8 | 20 | 3 | 88 | 7 | 14 |
23 | 23 | 7 | 4 | 7 | 5 | 7 | 10 | 2 | 86 | 7 | 10 |
24 | 24 | 15 | 18 | 15 | 4 | 3 | 15 | 1 | 23 | 7 | 5 |
25 | 25 | 1 | 3 | 20 | 5 | 3 | 15 | 1 | 49 | 15 | 12 |
26 | 26 | 4 | 14 | 18 | 6 | 7 | 15 | 1 | 92 | 10 | 16 |
31 | 31 | 14 | 11 | 23 | 2 | 3 | 20 | 1 | 41 | 11 | 15 |
32 | 32 | 1 | 9 | 20 | 5 | 7 | 10 | 1 | 33 | 10 | 3 |
33 | 33 | 1 | 3 | 7 | 5 | 7 | 10 | 1 | 81 | 14 | 19 |
39 | 39 | 1 | 4 | 6 | 4 | 2 | 10 | 2 | 81 | 8 | 9 |
41 | 41 | 4 | 3 | 13 | 4 | 3 | 10 | 2 | 75 | 13 | 3 |
43 | 43 | 1 | 7 | 7 | 5 | 12 | 20 | 2 | 72 | 6 | 4 |
44 | 44 | 4 | 11 | 18 | 5 | 8 | 15 | 1 | 77 | 16 | 3 |
45 | 45 | 4 | 15 | 19 | 2 | 8 | 10 | 1 | 61 | 18 | 18 |
50 | 50 | 8 | 12 | 12 | 5 | 12 | 15 | 1 | 38 | 15 | 13 |
58 | 58 | 16 | 9 | 5 | 2 | 8 | 15 | 3 | 18 | 14 | 13 |
59 | 59 | 5 | 14 | 17 | 5 | 3 | 10 | 1 | 92 | 12 | 7 |
62 | 62 | 1 | 6 | 15 | 2 | 8 | 10 | 3 | 23 | 7 | 12 |
63 | 63 | 6 | 9 | 21 | 2 | 8 | 10 | 3 | 32 | 15 | 10 |
67 | 67 | 7 | 19 | 11 | 3 | 2 | 15 | 1 | 36 | 12 | 12 |
69 | 69 | 1 | 14 | 1 | 1 | 12 | 10 | 3 | 72 | 13 | 6 |
72 | 72 | 1 | 8 | 18 | 5 | 3 | 10 | 1 | 98 | 12 | 17 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
101929 | 101929 | 1 | 9 | 19 | 4 | 8 | 10 | 1 | 48 | 12 | 5 |
101931 | 101931 | 4 | 12 | 8 | 5 | 3 | 10 | 3 | 33 | 18 | 7 |
101933 | 101933 | 2 | 7 | 19 | 3 | 8 | 15 | 2 | 70 | 11 | 4 |
101935 | 101935 | 1 | 6 | 15 | 4 | 3 | 20 | 2 | 72 | 8 | 12 |
101937 | 101937 | 14 | 16 | 22 | 7 | 2 | 10 | 1 | 54 | 10 | 11 |
101939 | 101939 | 5 | 7 | 16 | 2 | 8 | 20 | 1 | 79 | 8 | 18 |
101942 | 101942 | 4 | 3 | 5 | 4 | 3 | 10 | 1 | 47 | 12 | 4 |
101943 | 101943 | 2 | 3 | 17 | 1 | 12 | 20 | 1 | 53 | 11 | 17 |
101944 | 101944 | 4 | 3 | 17 | 2 | 8 | 15 | 1 | 38 | 9 | 8 |
101945 | 101945 | 6 | 6 | 13 | 2 | 7 | 20 | 3 | 72 | 8 | 10 |
101946 | 101946 | 9 | 15 | 8 | 6 | 8 | 20 | 1 | 65 | 14 | 10 |
101953 | 101953 | 1 | 4 | 11 | 2 | 3 | 15 | 1 | 92 | 6 | 14 |
101959 | 101959 | 1 | 1 | 14 | 5 | 12 | 10 | 2 | 42 | 15 | 16 |
101962 | 101962 | 5 | 9 | 19 | 2 | 8 | 10 | 1 | 7 | 14 | 12 |
101968 | 101968 | 1 | 17 | 1 | 1 | 7 | 20 | 1 | 29 | 8 | 4 |
101971 | 101971 | 1 | 6 | 4 | 2 | 12 | 15 | 3 | 36 | 4 | 21 |
101975 | 101975 | 1 | 3 | 18 | 2 | 12 | 10 | 1 | 82 | 10 | 12 |
101978 | 101978 | 14 | 6 | 21 | 3 | 12 | 20 | 1 | 46 | 9 | 9 |
101980 | 101980 | 3 | 5 | 14 | 4 | 8 | 15 | 3 | 98 | 6 | 7 |
101981 | 101981 | 1 | 13 | 24 | 4 | 7 | 15 | 1 | 77 | 15 | 4 |
101984 | 101984 | 2 | 5 | 18 | 1 | 8 | 10 | 1 | 93 | 9 | 8 |
101986 | 101986 | 16 | 10 | 19 | 5 | 8 | 15 | 1 | 49 | 16 | 6 |
101987 | 101987 | 12 | 14 | 3 | 2 | 12 | 20 | 3 | 51 | 20 | 10 |
101988 | 101988 | 4 | 5 | 19 | 3 | 3 | 15 | 1 | 88 | 8 | 21 |
101991 | 101991 | 1 | 8 | 2 | 2 | 8 | 10 | 1 | 3 | 10 | 14 |
101992 | 101992 | 7 | 12 | 24 | 1 | 12 | 10 | 1 | 64 | 9 | 6 |
101993 | 101993 | 4 | 12 | 17 | 4 | 3 | 15 | 1 | 75 | 10 | 10 |
101994 | 101994 | 3 | 18 | 7 | 5 | 8 | 20 | 2 | 33 | 13 | 3 |
101998 | 101998 | 14 | 18 | 17 | 5 | 8 | 10 | 1 | 35 | 11 | 18 |
101999 | 101999 | 9 | 5 | 15 | 3 | 8 | 15 | 3 | 49 | 10 | 6 |
50999 rows × 11 columns
# Everything after the first two columns ('Id', 'Hazard') is a model input.
features = train_dat1.columns[2:]
# The target the classifier is trained to predict.
y = train_dat1.loc[:, 'Hazard']
# Random forest with otherwise default settings, trained on two worker jobs.
# `fit` returns the fitted estimator itself, so it can be bound directly.
clf = RandomForestClassifier(n_jobs=2).fit(train_dat1.loc[:, features], y)
RandomForestClassifier(bootstrap=True, compute_importances=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, min_density=None, min_samples_leaf=1, min_samples_split=2, n_estimators=10, n_jobs=2, oob_score=False, random_state=None, verbose=0)
from sklearn import metrics

# Load the rows to score, using the first CSV column as the index.
# `DataFrame.from_csv` was removed from pandas; `read_csv(..., index_col=0)`
# is the supported replacement.
test = pd.read_csv('test.csv', index_col=0)
test[features].head()

# Predict a Hazard value for every test row from the same feature columns the
# classifier was fitted on.
preds = clf.predict(test[features])
Id = test.index

# NOTE: metrics.accuracy_score(train_dat1['Hazard'], preds) is not a valid
# check here -- `preds` are predictions for the test.csv rows, while
# train_dat1['Hazard'] holds labels for the train.csv rows.

# One 'Hazard' column of predictions, indexed by the test-set index.
output_df = pd.DataFrame({'Hazard': preds}, index=Id)

# Sample submission, loaded for reference; not otherwise used in this cell.
sample = pd.read_csv('sample_submission.csv', index_col=0)
output_df.head()
| | Hazard |
---|---|
Id | |
6 | 1 |
7 | 1 |
8 | 1 |
9 | 4 |
10 | 1 |
# Persist the predictions; the frame's index is written as the first column.
submission_path = "submission_1.csv"
output_df.to_csv(submission_path, index=True)