Predict a grape sort (cultivar) from the chemical analysis of wine. Employ KNN — the k-nearest-neighbors method — with 3 different distance metrics. Plot the relation of the error to the number of neighbors.
import numpy as np
import pandas as pd
from urllib import request

# URL of the UCI Wine dataset: 13 chemical attributes per wine sample,
# first column is the class label (grape sort, 1-3)
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"
# download the file; bare urllib.urlopen was Python 2 only --
# Python 3 moved it to urllib.request.urlopen
raw_data = request.urlopen(url)
# load the CSV file as a numpy matrix
dataset = np.loadtxt(raw_data, delimiter=",")
column_names = ['Sort', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash',
                'Magnesium', 'Total phenols', 'Flavanoids',
                'Nonflavanoid phenols', 'Proanthocyanins', 'Color intensity',
                'Hue', 'OD280/OD315 of diluted wines', 'Proline']
dataset = pd.DataFrame(dataset, columns=column_names)
# X: matrix of features; y: answers (the grape sort / class label)
X = dataset.drop(columns='Sort')
y = dataset['Sort']
In order to get the most accurate results, all features have to be normalized: KNN is distance-based, so attributes with large numeric ranges (e.g. Proline) would otherwise dominate the distance computation.
from sklearn import preprocessing

# Standardize every feature column to zero mean and unit variance so that
# no single attribute dominates the KNN distance computation.
X = preprocessing.StandardScaler().fit_transform(X)
Here we create subsets for cross-validation
from sklearn.model_selection import KFold

# 5-fold cross-validation with shuffling; a fixed random_state makes the
# shuffled splits -- and therefore every reported error -- reproducible
# (without it each run of the script produces different curves)
folding = KFold(n_splits=5, shuffle=True, random_state=0)
Learning process and accuracy calculation using the cross-validation method
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

metrics_names = ['euclidean', 'chebyshev', 'manhattan']
# metrics_errors[i] holds the CV error (1 - mean accuracy) for each k,
# for distance metric metrics_names[i]
metrics_errors = [[], [], []]

# n_neighbors may not exceed the size of a training fold, so cap k at the
# smallest training-fold size of the 5-fold split.
# (The original looped to len(X_train) -- a name that was never defined.)
max_neighbors = len(X) - (len(X) + folding.n_splits - 1) // folding.n_splits

for i in range(len(metrics_names)):
    for k_neighbors in range(1, max_neighbors + 1):
        # initialize the classifier; no explicit fit() is needed here --
        # cross_val_score clones the estimator and fits it on every fold
        classifier = KNeighborsClassifier(n_neighbors=k_neighbors,
                                          metric=metrics_names[i])
        # mean cross-validated accuracy of the classifier
        accuracy = cross_val_score(classifier, X, y,
                                   scoring='accuracy', cv=folding).mean()
        metrics_errors[i].append(1 - accuracy)
Plotting the relation of error to number of neighbors
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 7))
plt.xlabel('Number of neighbors', fontsize=22)
plt.ylabel('Error', fontsize=22)
for i in range(len(metrics_names)):
    # index 0 of metrics_errors[i] corresponds to k = 1 neighbor, so supply
    # explicit x values; plotting the bare list would start the axis at 0
    k_values = range(1, len(metrics_errors[i]) + 1)
    plt.plot(k_values, metrics_errors[i], label=metrics_names[i])
plt.legend(fontsize=18)
plt.savefig(fname='Problem12_hw2.png', format='png')
Find the relation between the standard error and the number of objects in the training set
number_of_neighbors = 18
number_of_subjects = len(X)
avg = []  # mean accuracy on the test remainder, per training-set size
std = []  # std of per-object correctness (spread of the accuracy), per size

for i in range(number_of_neighbors, number_of_subjects):
    # draw a random training subset of i objects; the test set is the exact
    # complement, obtained by dropping the sampled index labels.
    # (The original concat/drop_duplicates trick silently loses rows if the
    # data happens to contain duplicates.)
    subset = dataset.sample(n=i)
    answers = subset['Sort']
    qualities = preprocessing.scale(subset.drop(columns='Sort'))
    test = dataset.drop(subset.index)
    test_answers = test['Sort']
    # NOTE(review): train and test are standardized independently here, as
    # in the original; fitting the scaler on the training subset only and
    # transforming the test set with it would be methodologically cleaner
    test_qualities = preprocessing.scale(test.drop(columns='Sort'))
    classifier = KNeighborsClassifier(n_neighbors=number_of_neighbors,
                                      metric='chebyshev')
    classifier.fit(qualities, answers)
    prediction = classifier.predict(test_qualities)
    correct = prediction == test_answers
    avg.append(np.mean(correct))
    std.append(np.std(correct))
# Accuracy (with error bars) as a function of the training-subset size.
plt.figure(figsize=(12, 7))
plt.xlabel('Size of train subset', fontsize=22)
plt.ylabel('Accuracy', fontsize=22)
sizes = list(range(number_of_neighbors, number_of_subjects))
plt.errorbar(sizes, avg, yerr=std,
             linewidth=2, elinewidth=0.5, ecolor='red', capsize=1.5)
plt.savefig(fname='Problem12_hw3.png', format='png')