Classification with Vowpal Wabbit

In [ ]:

import re
import pandas as pd
import string

In [ ]:

# Load the pre-split Iris training and testing sets from the working directory.
training_data, testing_data = (
    pd.read_csv(csv_name)
    for csv_name in ('iris-training.csv', 'iris-testing.csv')
)

In [ ]:

# Summary statistics of the training columns, shown as the cell output.
training_data.describe()

In [ ]:

# Matches any run of punctuation and/or whitespace. Vowpal Wabbit's text format
# gives special meaning to ' ', ':' and '|', so feature names must not contain
# them; each run is collapsed into a single underscore.
_VW_UNSAFE_CHARS = re.compile('[' + re.escape(string.punctuation) + r'\s]+')


def to_vw_format(line):
    """Render one data row as a Vowpal Wabbit example string.

    Args:
        line: A pandas Series holding the class label under ``y`` and one
            feature value per remaining entry, keyed by feature name.

    Returns:
        A string of the form ``'<label> | <name>:<value> ...'`` where
        ``<label>`` is ``y`` cast to int and every feature name has its runs
        of punctuation/whitespace replaced by ``_``.
    """
    parts = [f'{int(line.y)} |']
    # Series.items() replaces iteritems(), which was removed in pandas 2.0.
    for name, value in line.drop(['y']).items():
        # str() lets non-string column labels (e.g. integers) pass through.
        feature_name = _VW_UNSAFE_CHARS.sub('_', str(name))
        parts.append(f' {feature_name}:{value}')
    return ''.join(parts)

Vowpal Wabbit input format

Vowpal Wabbit has its own input format that we can use. Let's see what it looks like.

In [ ]:

# Show the first ten training rows rendered in VW's text format.
preview_rows = training_data.head(10)
for vw_line in preview_rows.apply(to_vw_format, axis=1):
    print(vw_line)

In [ ]:

from vowpalwabbit import pyvw

# One-against-all multiclass learner over 3 classes.
# NOTE(review): recent vowpalwabbit releases appear to expose this class as
# `Workspace`; confirm against the installed version.
vw = pyvw.vw("--oaa 3")

# Fit: feed every training row to the learner as a VW-formatted example.
training_examples = training_data.apply(to_vw_format, axis=1)
for vw_example in training_examples:
    vw.learn(vw_example)

# Score: collect the predicted class for each testing row, in row order.
testing_examples = testing_data.apply(to_vw_format, axis=1)
predictions = [vw.predict(vw_example) for vw_example in testing_examples]

In [ ]:

# Fraction of testing rows whose predicted class equals the true label.
hits = testing_data.y == predictions
accuracy = int(hits.sum()) / len(testing_data)

f'Model accuracy {accuracy}'

How was this data set generated?

In [ ]:

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import os

# Build a single frame with the Iris features plus a 1-based label column.
ds = load_iris()
df = pd.DataFrame(data=ds.data, columns=ds.feature_names)
df["y"] = ds.target + 1  # vw expects labels starting at 1

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
training_data, testing_data = train_test_split(df, test_size=0.2, random_state=2019)

# Write both splits to the current working directory without the index column.
output_dir = os.getcwd()
for split, file_name in (
    (training_data, 'iris-training.csv'),
    (testing_data, 'iris-testing.csv'),
):
    split.to_csv(os.path.join(output_dir, file_name), index=False)