import math

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.pyplot import cm
from sklearn import cross_validation, metrics
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
/home/nekorobov/.local/lib/python3.5/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20. "This module will be removed in 0.20.", DeprecationWarning)
# Load the glass data set: an Id column, chemical-composition features,
# and the target glass 'Type'.
df = pd.read_csv('data.csv')
def split_data(df):
    """Split a glass data frame into features and target.

    Returns a tuple ``(X, Y)`` where ``Y`` is the 'Type' column and
    ``X`` holds every remaining column except the 'Id' identifier.
    """
    labels = df['Type']
    features = df.drop(columns=['Type', 'Id'])
    return features, labels
# X: feature matrix (chemical composition), Y: glass Type labels.
X, Y = split_data(df)
The correlation matrix below shows that the Type of glass is nearly independent of K and Ca. Moreover, the features are fairly correlated with each other, so we can certainly apply PCA or another dimensionality-reduction method without a great loss of accuracy. I'll show this later.
# Annotated correlation heatmap over all columns (Id and Type included)
# to inspect pairwise linear dependence between the variables.
fig, ax = plt.subplots()
fig.set_size_inches(12, 12)
plot = sns.heatmap(
    np.corrcoef(df.T),
    annot=True,
    fmt=".2g",
    linewidths=1,
    xticklabels=df.columns,
    yticklabels=df.columns,
)
If we build a decision tree, we obtain feature importances, which show a medium importance for Ca despite the previous discussion. Mg, Al, RI and Ba are important features, which corresponds to the correlation matrix.
# Fit a decision tree on the full data set and plot the learned
# per-feature importances as a bar chart.
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X, Y)
plt.bar(X.columns, clf.feature_importances_)
<BarContainer object of 9 artists>
def draw_ff_plot(X, y, f1, f2):
    """Scatter-plot the data projected onto the feature pair (f1, f2).

    Each glass Type present in ``y`` is drawn in its own color.

    Fixes over the original version:
    - uses the ``X``/``y`` arguments instead of the global ``df``;
    - iterates over the labels actually present in ``y`` instead of
      ``range(7)``, which silently skipped glass Type 7 (glass types
      are 1, 2, 3, 5, 6, 7 — never 0).
    """
    plt.figure(figsize=(5, 5))
    types = sorted(set(y))
    colors = cm.rainbow(np.linspace(0, 1, max(len(types), 1)))
    for color, t in zip(colors, types):
        mask = (y == t)
        plt.scatter(X[mask][f1], X[mask][f2], color=color, label=t)
    plt.xlabel(f1)
    plt.ylabel(f2)
    plt.legend()
    plt.show()
On the scatter plots we can see projections of the data set onto the different feature axes. There are no clearly separable clusters.
# Draw one scatter plot per unordered pair of features.
feature_names = list(X.columns)
for idx, first in enumerate(feature_names):
    for second in feature_names[idx + 1:]:
        draw_ff_plot(X, Y, first, second)
We use cross-validation to validate our classification accuracy. I chose only 3 cross-validation folds, because one of the classes is represented by only 9 objects, so we would lose accuracy if we increased the number of folds.
def tree(X, Y, n=3):
    """Return the mean n-fold cross-validated accuracy of a decision tree."""
    model = DecisionTreeClassifier(random_state=0)
    scores = cross_val_score(model, X, Y, cv=n)
    return scores.mean()
def neigh(X, Y, k, n=3):
    """Return the mean n-fold cross-validated accuracy of k-nearest-neighbors."""
    model = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(model, X, Y, cv=n)
    return scores.mean()
def preprocess(X, components):
    """Standard-scale X and project it onto ``components`` principal axes.

    Prints the total explained-variance ratio retained by the projection
    and returns the transformed data as a DataFrame.
    """
    scaled = StandardScaler().fit_transform(np.float32(X))
    pca = PCA(n_components=components)
    reduced = pca.fit_transform(scaled)
    print(sum(pca.explained_variance_ratio_))
    return pd.DataFrame(reduced)
We scale the data with a standard scaler and reduce the dimensionality with PCA. As we can see, we lose only about 5% of the information if we reduce the number of features by 3.
# Keep 6 principal components; prints the retained explained-variance ratio.
X_s = preprocess(X, 6)
0.9517311379313469
# Cross-validated accuracy of the decision tree and of 5-nn, on the
# original features and on the scaled + PCA-reduced features.
# (Notebook cells: each bare expression's value was displayed inline.)
tree(X, Y)
tree(X_s, Y)
neigh(X, Y, 5)
neigh(X_s, Y, 5)
0.6495699618785963
The graph presents the dependence between the number of neighbors in the knn method and the classification accuracy. We can see that knn fitted on the preprocessed data gives a better result than knn fitted on the original data. The maximum accuracy is achieved with the 4-nn method.
# knn accuracy as a function of the neighbor count k, comparing the raw
# features against the scaled + PCA-reduced ones.
N_k = 9
ks = range(1, N_k)
orig = [neigh(X, Y, k) for k in ks]
prep = [neigh(X_s, Y, k) for k in ks]
x = np.linspace(1, N_k - 1, N_k - 1)
plt.figure(figsize=(6, 6))
plt.plot(x, orig, color='b', label='original X')
plt.plot(x, prep, color='r', label='preprocessed X')
plt.xlabel('k neighbors')
plt.ylabel('score')
plt.legend()
plt.show()
The graph presents the dependence between the number of components in the data and the classification accuracy. We can see that knn is more accurate than the decision tree. The greater the number of components, the greater the score, but this holds only for the knn method; the decision tree does not depend much on the dimensionality.
# Accuracy vs. dimensionality: fix k=4 for knn and sweep the number of
# PCA components from 1 to 8, comparing knn against a decision tree.
neighbors = []
dec_tree = []
N_k = 9
x = np.linspace(1, N_k - 1, N_k - 1)
for n_components in range(1, N_k):
    X_s = preprocess(X, n_components)
    neighbors.append(neigh(X_s, Y, 4))
    dec_tree.append(tree(X_s, Y))
plt.figure(figsize=(6, 6))
plt.plot(x, neighbors, color='r', label='kneighbors')
plt.plot(x, dec_tree, color='b', label='decision tree')
plt.xlabel('number of components')
plt.ylabel('score')
plt.legend()
plt.show()
0.2790183424949646 0.5068043321371078 0.6628979742527008 0.7915493249893188 0.8931050598621368 0.9517311379313469 0.992726493626833 0.9998212596401572
# Accuracy of 4-nn as a function of the fraction of data held out for
# testing.
x = np.arange(0.1, 0.9, 0.01)
y = []
neig = KNeighborsClassifier(n_neighbors=4)
for test_ratio in x:
    # model_selection.train_test_split replaces the deprecated
    # sklearn.cross_validation version (removed in sklearn 0.20; see the
    # DeprecationWarning this notebook emitted on import).
    train_data, test_data, train_labels, test_labels = train_test_split(
        X, Y, test_size=test_ratio, random_state=1)
    neig.fit(train_data, train_labels)
    predict_KNeighbors = neig.predict(test_data)
    # accuracy_score is documented as (y_true, y_pred); accuracy itself is
    # symmetric, but keep the documented argument order.
    accuracy_KNeighbors = metrics.accuracy_score(test_labels, predict_KNeighbors)
    y.append(accuracy_KNeighbors)
plt.plot(x, y)
plt.ylabel('Accuracy')
plt.xlabel('DataRatioSize')
# Save BEFORE show(): plt.show() finishes the current figure, so the original
# order (savefig after show) wrote an empty canvas to HW3.png — the
# "<Figure ... with 0 Axes>" output confirms it.
plt.savefig('HW3.png')
plt.show()
<Figure size 432x288 with 0 Axes>