# %matplotlib inline  # IPython magic -- valid only inside a notebook, so kept as a comment
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
pd.set_option('display.max_columns', 50)
# pd.DataFrame.from_csv was deprecated and removed in pandas 1.0;
# read_csv with index_col=0 reproduces its behavior (first column becomes the index).
train = pd.read_csv('train.csv', index_col=0)
###
# Use numerical values for categorical data
def makeNumerical(input_df):
    """Return a copy of *input_df* with every non-int64 column re-coded as
    integer category codes (0, 1, 2, ... in order of first appearance).

    The input frame is left unmodified.
    """
    output_df = input_df.copy()
    # Columns whose dtype is not int64 are treated as categorical.
    # NOTE(review): this would also re-code float columns -- confirm intended.
    non_numeric_cols = [col for col in output_df.columns
                        if output_df[col].dtype != 'int64']
    for col in non_numeric_cols:
        # enumerate() replaces the Python-2-only xrange() index loop.
        for code, val in enumerate(pd.unique(output_df[col])):
            output_df.loc[output_df[col] == val, col] = code
    return output_df
# Recode categorical columns of the training frame to integer codes.
train_numerical = makeNumerical(train)
# Hand-picked feature subset; the remaining columns are ignored here.
features = ['T1_V1','T1_V2','T2_V1','T2_V2']
X = train_numerical[features].copy().astype(float)
# Target is the first column of the frame -- presumably 'Hazard'; confirm against train.csv.
y = train_numerical[train_numerical.columns[0]].copy().astype(float)
from sklearn import svm
# Support-vector regression with hand-tuned hyperparameters.
clf = svm.SVR(C = 0.5, gamma=.001) #try making C a little lower, maybe change gamma
clf.fit(X,y)
# Load the test set, keep the same feature columns as training, and recode
# categoricals with the same helper.
# pd.DataFrame.from_csv was deprecated and removed in pandas 1.0;
# read_csv with index_col=0 is the equivalent call (first column -> index).
test = makeNumerical(pd.read_csv('test.csv', index_col=0)[['T1_V1','T1_V2','T2_V1','T2_V2']])
preds = clf.predict(test)
# Write the Kaggle-style submission: Id index plus a 'Hazard' prediction column.
out_df = pd.DataFrame({'Hazard': preds}, index=test.index)
out_df.to_csv("submission_3.csv", index=True)
# Sanity check: residuals (prediction minus target) on the training data.
same_preds = clf.predict(X) - y
# A bare expression only displays in a notebook; print explicitly so the
# preview also shows up when this runs as a plain script.
print(same_preds.head(20))
# Captured notebook output (first 20 training residuals), kept for reference:
# Id 1 0.853256 2 -0.998832 3 1.116817 4 2.453021 5 2.615416 12 1.730286 15 -1.931312 19 2.509336 21 0.911379 22 0.386930 23 -5.199475 24 -11.527683 25 1.994604 26 -0.331582 31 -9.513998 32 3.166062 33 1.301354 39 0.776953 41 -1.171212 43 0.810963 Name: Hazard, dtype: float64