import re
import pandas as pd
import string
# Load the pre-split Iris CSVs (relative paths, so they are resolved against
# the current working directory). The files are the ones written by the
# dataset-preparation statements at the end of this file.
training_data, testing_data = (
    pd.read_csv(csv_name)
    for csv_name in ('iris-training.csv', 'iris-testing.csv')
)
# Summary statistics of the training split; as a bare expression the result
# is only displayed in an interactive/notebook session.
training_data.describe()
def to_vw_format(line):
    """Render one labelled row as a Vowpal Wabbit text example.

    The result has the shape ``<label> | <name>:<value> <name>:<value> ...``.

    Args:
        line: A pandas Series holding the feature values plus a ``y`` entry
            with the class label (anything ``int()`` accepts).

    Returns:
        The example string: the integer label, `` |``, then one
        `` name:value`` pair per remaining entry, in index order. A row with
        no features yields just ``"<label> |"``.
    """
    # Collapse every run of punctuation/whitespace in a feature name into a
    # single "_" so the name cannot contain " ", ":" or "|", which would
    # otherwise break up the `name:value` pairs.
    separators = re.compile(r'([' + re.escape(string.punctuation) + r']|\s)+')
    features = ''.join(
        # str() lets non-string column labels (e.g. integers) through re.sub.
        f' {separators.sub("_", str(idx))}:{value}'
        # `items()` replaces `Series.iteritems()`, which pandas 2.0 removed.
        for idx, value in line.drop(['y']).items()
    )
    return f'{int(line.y)} |{features}'
# Vowpal Wabbit has its own input format that we can use. Let's see what it looks like.
# Preview: print the VW-formatted text of the first 10 training rows.
preview_rows = training_data.head(10)
for ex in preview_rows.apply(to_vw_format, axis=1):
    print(ex)
from vowpalwabbit import pyvw
# Build the learner. "--oaa 3" is the option string handed to Vowpal Wabbit
# (one-against-all multiclass over 3 classes, per the VW documentation).
vw = pyvw.vw("--oaa 3")

# Single pass over the training set: every row is converted to VW text and
# fed to the learner one example at a time.
training_examples = training_data.apply(to_vw_format, axis=1)
for example in training_examples:
    vw.learn(example)
# Collect the model's predicted class for every row of the testing set,
# in row order.
testing_examples = testing_data.apply(to_vw_format, axis=1)
predictions = [vw.predict(example) for example in testing_examples]
# Accuracy = fraction of testing rows whose true label `y` equals the
# prediction made for that row.
correct = testing_data.y == predictions
accuracy = int(correct.sum()) / len(testing_data)
# Bare expression: the message is only displayed in an interactive/notebook
# session.
f'Model accuracy {accuracy}'
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import os
# Build a labelled DataFrame from scikit-learn's bundled Iris dataset.
ds = load_iris()
# Shift the targets by one: VW expects class labels starting at 1.
df = pd.DataFrame(data=ds.data, columns=ds.feature_names).assign(y=ds.target + 1)

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
training_data, testing_data = train_test_split(df, random_state=2019, test_size=0.2)

# Write both splits to the current working directory, without the index column.
output_dir = os.getcwd()
for split, csv_name in (
    (training_data, 'iris-training.csv'),
    (testing_data, 'iris-testing.csv'),
):
    split.to_csv(os.path.join(output_dir, csv_name), index=False)