# %matplotlib inline  # IPython magic -- valid only inside a notebook, so kept as a comment
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
pd.set_option('display.max_columns', 50)
# pd.DataFrame.from_csv was deprecated and removed in pandas 1.0;
# read_csv with index_col=0 reproduces its behavior (first column becomes the index).
train = pd.read_csv('train.csv', index_col=0)
###
# Use numerical values for categorical data
def makeNumerical(input_df):
    """Return a copy of *input_df* with every non-int64 column re-coded as
    integer category codes (0, 1, 2, ... in order of first appearance).

    The input frame is left unmodified.
    """
    output_df = input_df.copy()
    # Columns whose dtype is not int64 are treated as categorical.
    # NOTE(review): this would also re-code float columns -- confirm intended.
    non_numeric_cols = [col for col in output_df.columns
                        if output_df[col].dtype != 'int64']
    for col in non_numeric_cols:
        # enumerate() replaces the Python-2-only xrange() index loop.
        for code, val in enumerate(pd.unique(output_df[col])):
            output_df.loc[output_df[col] == val, col] = code
    return output_df
# Recode categorical columns of the training frame to integer codes.
train_numerical = makeNumerical(train)
# Hand-picked feature subset; the remaining columns are ignored here.
features = ['T1_V1','T1_V2','T2_V1','T2_V2']
X = train_numerical[features].copy().astype(float)
# Target is the first column of the frame -- presumably 'Hazard'; confirm against train.csv.
y = train_numerical[train_numerical.columns[0]].copy().astype(float)
from sklearn import svm
# Support-vector regression with hand-tuned hyperparameters.
clf = svm.SVR(C = 0.5, gamma=.001) #try making C a little lower, maybe change gamma
clf.fit(X,y)
# Load the test set, keep the same feature columns as training, and recode
# categoricals with the same helper.
# pd.DataFrame.from_csv was deprecated and removed in pandas 1.0;
# read_csv with index_col=0 is the equivalent call (first column -> index).
test = makeNumerical(pd.read_csv('test.csv', index_col=0)[['T1_V1','T1_V2','T2_V1','T2_V2']])
preds = clf.predict(test)
# Write the Kaggle-style submission: Id index plus a 'Hazard' prediction column.
out_df = pd.DataFrame({'Hazard': preds}, index=test.index)
out_df.to_csv("submission_3.csv", index=True)
# Sanity check: residuals (prediction minus target) on the training data.
same_preds = clf.predict(X) - y
# A bare expression only displays in a notebook; print explicitly so the
# preview also shows up when this runs as a plain script.
print(same_preds.head(20))
# Captured notebook output (first 20 training residuals), kept for reference:
# Id 1 0.853256 2 -0.998832 3 1.116817 4 2.453021 5 2.615416 12 1.730286 15 -1.931312 19 2.509336 21 0.911379 22 0.386930 23 -5.199475 24 -11.527683 25 1.994604 26 -0.331582 31 -9.513998 32 3.166062 33 1.301354 39 0.776953 41 -1.171212 43 0.810963 Name: Hazard, dtype: float64