A program or a function which maps from unlabeled instances to classes is called a classifier.
A confusion matrix, also called a contingency table or error matrix, is used to visualize the performance of a classifier. The columns of the matrix represent the instances of the predicted classes and the rows represent the instances of the actual class. (Note: It can be the other way around as well.) In the case of binary classification the table has 2 rows and 2 columns.
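As a minimal sketch (assuming scikit-learn is available and using made-up label vectors for a male/female problem), such a matrix can be computed with sklearn.metrics.confusion_matrix:
from sklearn.metrics import confusion_matrix
# hypothetical true and predicted labels for a binary male/female problem
y_true = ['male', 'male', 'female', 'female', 'male', 'female']
y_pred = ['male', 'female', 'female', 'female', 'male', 'male']
# rows correspond to the actual classes, columns to the predicted classes
print(confusion_matrix(y_true, y_pred, labels=['male', 'female']))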
Accuracy is a statistical measure defined as the number of correct predictions made by a classifier divided by the total number of predictions it made.
The classifier in our previous example correctly predicted 42 male instances and 32 female instances.
Therefore, the accuracy can be calculated by:
accuracy = (42+32)/(42+8+18+32)
Accuracy: (TN + TP) / (TN + TP + FN + FP)
Precision: TP / (TP + FP)
Recall: TP / (TP + FN)
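As a quick sketch of these formulas (the counts below mirror the example above; how the misclassified instances split into FP and FN is only an assumption):
TP, TN = 42, 32   # correctly predicted male / female instances
FP, FN = 8, 18    # assumed split of the misclassified instances
accuracy = (TN + TP) / (TN + TP + FN + FP)
precision = TP / (TP + FP)
recall = TP / (TP + FN)
print(accuracy, precision, recall)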
from sklearn.datasets import load_iris
iris = load_iris()
# The features of each sample flower are stored in the data attribute of the dataset:
n_samples, n_features = iris.data.shape
print('Number of samples:', n_samples)
print('Number of features:', n_features)
# the sepal length, sepal width, petal length and petal width of the first sample (first flower)
print(iris.data[0])
Number of samples: 150
Number of features: 4
[5.1 3.5 1.4 0.2]
print(iris.target)
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
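The integers 0, 1 and 2 encode the three iris species. A small sketch (using the iris object from above) to see the mapping and the class balance:
import numpy as np
print(iris.target_names)         # the species behind the integer labels
print(np.bincount(iris.target))  # 50 samples per class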
### Visualising the Features of the Iris Data Set
The feature data is four dimensional, but we can visualize one or two of the dimensions at a time using a simple histogram or scatter-plot.
from sklearn.datasets import load_iris
iris = load_iris()
print(iris.data[iris.target==1][:5])
print(iris.data[iris.target==1, 0][:5])
[[7.  3.2 4.7 1.4]
 [6.4 3.2 4.5 1.5]
 [6.9 3.1 4.9 1.5]
 [5.5 2.3 4.  1.3]
 [6.5 2.8 4.6 1.5]]
[7.  6.4 6.9 5.5 6.5]
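The expression iris.target==1 used above is just a Boolean mask; a minimal sketch of what it does:
mask = iris.target == 1          # True for every sample of class 1
print(mask.shape, mask.sum())    # 150 entries, 50 of them True
print(iris.data[mask].shape)     # the mask selects the 50 matching rows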
import matplotlib.pyplot as plt
%matplotlib inline
fig, ax = plt.subplots()
x_index = 3
colors = ['blue', 'red', 'green']
for label, color in zip(range(len(iris.target_names)), colors):
    ax.hist(iris.data[iris.target==label, x_index],
            label=iris.target_names[label],
            color=color)
ax.set_xlabel(iris.feature_names[x_index])
ax.legend(loc='upper right')
plt.show()
iris.feature_names
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
fig, ax = plt.subplots()
x_index = 3
y_index = 0
colors = ['blue', 'red', 'green']
for label, color in zip(range(len(iris.target_names)), colors):
    ax.scatter(iris.data[iris.target==label, x_index],
               iris.data[iris.target==label, y_index],
               label=iris.target_names[label],
               c=color)
ax.set_xlabel(iris.feature_names[x_index])
ax.set_ylabel(iris.feature_names[y_index])
ax.legend(loc='upper left')
plt.show()
# Change x_index and y_index in the above script and find a combination of two features which maximally separates the three classes.
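For reference, one combination that is known to separate the three species well is petal length against petal width; a possible setting (reusing the script above) would be:
x_index = 2   # petal length (cm)
y_index = 3   # petal width (cm)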
import matplotlib.pyplot as plt
%matplotlib inline
n = len(iris.feature_names)
fig, ax = plt.subplots(n, n, figsize=(16, 16))
colors = ['blue', 'red', 'green']
for x in range(n):
    for y in range(n):
        xname = iris.feature_names[x]
        yname = iris.feature_names[y]
        for color_ind in range(len(iris.target_names)):
            ax[x, y].scatter(iris.data[iris.target==color_ind, x],
                             iris.data[iris.target==color_ind, y],
                             label=iris.target_names[color_ind],
                             c=colors[color_ind])
        ax[x, y].set_xlabel(xname)
        ax[x, y].set_ylabel(yname)
        ax[x, y].legend(loc='upper left')
plt.show()
### Scatterplot Matrices
Instead of doing it manually, we can also use the scatterplot matrix provided by the pandas module. Scatterplot matrices show scatter plots between all features in the data set, as well as histograms to show the distribution of each feature.
import pandas as pd
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
pd.plotting.scatter_matrix(iris_df,
                           c=iris.target,
                           figsize=(8, 8));
from sklearn.datasets import load_digits
digits = load_digits()
digits.keys()
dict_keys(['data', 'target', 'target_names', 'images', 'DESCR'])
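The DESCR entry listed above is a plain-text description of the dataset; a quick way to skim its beginning:
# print the first few hundred characters of the bundled description
print(digits.DESCR[:500])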
n_samples, n_features = digits.data.shape
print((n_samples, n_features))
print(digits.data[0])
print(digits.target)
(1797, 64)
[ 0.  0.  5. 13.  9.  1.  0.  0.  0.  0. 13. 15. 10. 15.  5.  0.  0.  3. 15.  2.  0. 11.  8.  0.  0.  4. 12.  0.  0.  8.  8.  0.  0.  5.  8.  0.  0.  9.  8.  0.  0.  4. 11.  0.  1. 12.  7.  0.  0.  2. 14.  5. 10. 12.  0.  0.  0.  0.  6. 13. 10.  0.  0.  0.]
[0 1 2 ... 8 9 8]
print(digits.target.shape)
(1797,)
# The target is just the digit represented by the data. The data is an array of length 64... but what does this data mean?
# There's a clue in the fact that we have two versions of the data array: data and images. Let's take a look at them:
print(digits.data.shape)
print(digits.images.shape)
#We can see that they're related by a simple reshaping:
import numpy as np
print(np.all(digits.images.reshape((1797, 64)) == digits.data))
(1797, 64)
(1797, 8, 8)
True
# Let's visualize the data. It's a little bit more involved than the simple scatter-plot we used above, but we can do it rather quickly.
import matplotlib.pyplot as plt
%matplotlib inline
# set up the figure
fig = plt.figure(figsize=(6, 6)) # figure size in inches
fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)
# plot the digits: each image is 8x8 pixels
for i in range(64):
    ax = fig.add_subplot(8, 8, i + 1, xticks=[], yticks=[])
    ax.imshow(digits.images[i], cmap=plt.cm.binary, interpolation='nearest')
    # label the image with the target value
    ax.text(0, 7, str(digits.target[i]))
We see now what the features mean. Each feature is a real-valued quantity representing the darkness of a pixel in an 8x8 image of a hand-written digit.
Even though each sample has data that is inherently two-dimensional, the data matrix flattens this 2D data into a single vector, which can be contained in one row of the data matrix.
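A small sketch of undoing that flattening for a single sample, just reshaping the 64-vector back into its 8x8 pixel grid:
print(digits.data[0].reshape(8, 8))                              # the first sample as an 8x8 grid
print(np.all(digits.data[0].reshape(8, 8) == digits.images[0]))  # identical to the stored image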
## Another dataset
from sklearn.datasets import fetch_olivetti_faces
# fetch the faces data
faces = fetch_olivetti_faces()
# Use a script like above to plot the faces image data.
# hint: plt.cm.bone is a good colormap for this data
faces.keys()
downloading Olivetti faces from https://ndownloader.figshare.com/files/5976027 to /home/akash/scikit_learn_data
dict_keys(['data', 'images', 'target', 'DESCR'])
n_samples, n_features = faces.data.shape
print((n_samples, n_features))
(400, 4096)
np.sqrt(4096)
64.0
faces.images.shape
(400, 64, 64)
faces.data.shape
print(np.all(faces.images.reshape((400, 4096)) == faces.data))
True
fig = plt.figure(figsize=(6, 6)) # figure size in inches
fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)
# plot the faces: each image is 64x64 pixels
for i in range(64):
    ax = fig.add_subplot(8, 8, i + 1, xticks=[], yticks=[])
    ax.imshow(faces.images[i], cmap=plt.cm.bone, interpolation='nearest')
    # label the image with the target value
    ax.text(0, 7, str(faces.target[i]))
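The Olivetti targets are person identifiers rather than digits; a small check (using the faces object from above) of the class structure, 40 subjects with 10 images each:
import numpy as np
print(faces.target.min(), faces.target.max())  # subject ids from 0 to 39
print(np.bincount(faces.target))               # 10 images per subject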