#!/usr/bin/env python
# coding: utf-8
# In[1]:
# Load the `watermark` IPython extension, then invoke its `%watermark` line
# magic, passing the tutorial authors' names (-a) and a package list (-p:
# numpy, scipy, matplotlib, pillow, scikit-learn).
# NOTE(review): `get_ipython` is not imported anywhere in the visible file, so
# these calls presumably only work when run under IPython/Jupyter -- confirm.
get_ipython().run_line_magic('load_ext', 'watermark')
get_ipython().run_line_magic('watermark', " -d -u -a 'Andreas Mueller, Kyle Kastner, Sebastian Raschka' -v -p numpy,scipy,matplotlib,pillow,scikit-learn")
# In[ ]:
# Run the `%matplotlib inline` line magic. The `# In[ ]:` cell markers suggest
# this file is a notebook export, which is why the magic shows up as an
# explicit `run_line_magic` call rather than `%matplotlib inline`.
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
import numpy as np
# # SciPy 2016 Scikit-learn Tutorial
# # Supervised Learning Part 1 -- Classification
# To visualize the workings of machine learning algorithms, it is often helpful to study two-dimensional or one-dimensional data, that is, data with only one or two features. While in practice datasets usually have many more features, it is hard to plot high-dimensional data on two-dimensional screens.
#
# We will illustrate some very simple examples before we move on to more "real world" data sets.
#
# First, we will look at a two class classification problem in two dimensions. We use the synthetic data generated by the ``make_blobs`` function.
# In[ ]:
from sklearn.datasets import make_blobs
# Draw a synthetic two-cluster dataset; `random_state=0` pins the random seed.
X, y = make_blobs(random_state=0, centers=2)

# Report the array shapes, then preview the first five samples and labels.
for _caption, _value in (
        ('X ~ n_samples x n_features:', X.shape),
        ('y ~ n_samples:', y.shape),
        ('\nFirst 5 samples:\n', X[:5]),
        ('\nFirst 5 labels:', y[:5]),
):
    print(_caption, _value)
# As the data is two-dimensional, we can plot each sample as a point in a two-dimensional coordinate system, with the first feature being the x-axis and the second feature being the y-axis.
# In[ ]:
# One scatter call per class: class 0 in blue with the default marker,
# class 1 in red with square markers.
for _cls, _color, _label, _extra in (
        (0, 'blue', '0', {}),
        (1, 'red', '1', {'marker': 's'}),
):
    _rows = X[y == _cls]
    plt.scatter(_rows[:, 0], _rows[:, 1], c=_color, s=40, label=_label,
                **_extra)
plt.xlabel('first feature')
plt.ylabel('second feature')
# Trailing semicolon kept from the original cell (presumably to suppress the
# notebook's echo of the returned Legend object).
plt.legend(loc='upper right');
# Classification is a supervised task, and since we are interested in its performance on unseen data, we split our data into two parts:
#
# 1. a training set that the learning algorithm uses to fit the model
# 2. a test set to evaluate the generalization performance of the model
#
# The ``train_test_split`` function from the ``model_selection`` module does that for us -- we will use it to split a dataset into 75% training data and 25% test data. (In scikit-learn releases before 0.18 the same function lived in the ``cross_validation`` module, which was removed in 0.20.)
#
#
#
# In[ ]:
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # scikit-learn < 0.18 only ships the legacy module.
    from sklearn.cross_validation import train_test_split

# Hold out 25% of the samples for testing. ``stratify=y`` asks for the class
# proportions to be preserved in both parts, and the fixed ``random_state``
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    random_state=1234,
    stratify=y,
)
# ### The scikit-learn estimator API
#
#
# Every algorithm is exposed in scikit-learn via an ''Estimator'' object. (All models in scikit-learn have a very consistent interface). For instance, we first import the logistic regression class.
# In[ ]:
from sklearn.linear_model import LogisticRegression
# Next, we instantiate the estimator object.
# In[ ]:
# Construct the estimator with its default settings (no arguments are passed).
classifier = LogisticRegression()
# In[ ]:
# Bare expression: its value is only echoed when run as a notebook cell.
X_train.shape
# In[ ]:
# Likewise for the shape of the training labels.
y_train.shape
# To build the model from our data, that is, to learn how to classify new points, we call the ``fit`` function with the training data and the corresponding training labels (the desired output for each training data point):
# In[ ]:
classifier.fit(X_train, y_train)
# (Some estimator methods such as `fit` return `self` by default. Thus, after executing the code snippet above, you will see the default parameters of this particular instance of `LogisticRegression`. Another way of retrieving the estimator's initialization parameters is to execute `classifier.get_params()`, which returns a parameter dictionary.)
# We can then apply the model to unseen data and use the model to predict the estimated outcome using the ``predict`` method:
# In[ ]:
# Predicted labels for the held-out test samples.
prediction = classifier.predict(X_test)
# We can compare these against the true labels:
# In[ ]:
print(prediction)
print(y_test)
# We can evaluate our classifier quantitatively by measuring what fraction of predictions is correct. This is called **accuracy**:
# In[ ]:
# Element-wise comparison of predictions and true labels; the mean of the
# resulting matches is the fraction that agree.
np.mean(prediction == y_test)
# There is also a convenience function, ``score``, that all scikit-learn classifiers have to compute this directly from the test data:
#
# In[ ]:
classifier.score(X_test, y_test)
# It is often helpful to compare the generalization performance (on the test set) to the performance on the training set:
# In[ ]:
classifier.score(X_train, y_train)
# LogisticRegression is a so-called linear model,
# which means it will create a decision boundary that is linear in the input space. In 2D, this simply means it finds a line to separate the blue from the red:
# In[ ]:
from figures import plot_2d_separator
# Split the samples by class once, then draw each class with its own style.
_class0_pts = X[y == 0]
_class1_pts = X[y == 1]
plt.scatter(_class0_pts[:, 0], _class0_pts[:, 1], label='0', c='blue', s=40)
plt.scatter(_class1_pts[:, 0], _class1_pts[:, 1], label='1', c='red', s=40,
            marker='s')
plt.xlabel("first feature")
plt.ylabel("second feature")
# Overlay the fitted classifier's separator using the helper imported from
# `figures`.
plot_2d_separator(classifier, X)
plt.legend(loc='upper right');
# **Estimated parameters**: All the estimated model parameters are attributes of the estimator object ending with an underscore. Here, these are the coefficients and the offset of the line:
# In[ ]:
# Coefficients first, then the offset (intercept) of the fitted line.
print(classifier.coef_)
print(classifier.intercept_)
# Another classifier: K Nearest Neighbors
# ------------------------------------------------
# Another popular and easy to understand classifier is K nearest neighbors (kNN). It has one of the simplest learning strategies: given a new, unknown observation, look up in your reference database which ones have the closest features and assign the predominant class.
#
# The interface is exactly the same as for ``LogisticRegression`` above.
# In[ ]:
from sklearn.neighbors import KNeighborsClassifier
# This time we set a parameter of the KNeighborsClassifier to tell it we only want to look at one nearest neighbor:
# In[ ]:
knn = KNeighborsClassifier(n_neighbors=1)
# We fit the model with our training data:
# In[ ]:
# Same fit(X_train, y_train) call as used for the LogisticRegression
# classifier earlier in this file.
knn.fit(X_train, y_train)
# In[ ]:
# Per-class drawing styles: class 0 in blue (default marker), class 1 as red
# squares.
_blue_look = dict(c='blue', s=40, label='0')
_red_look = dict(c='red', s=40, label='1', marker='s')
for _cls, _look in ((0, _blue_look), (1, _red_look)):
    _in_cls = y == _cls
    plt.scatter(X[_in_cls, 0], X[_in_cls, 1], **_look)
plt.xlabel("first feature")
plt.ylabel("second feature")
# Overlay the separator of the fitted nearest-neighbor model.
plot_2d_separator(knn, X)
plt.legend(loc='upper right');
# In[ ]:
# Score of the fitted nearest-neighbor model on the held-out test set
# (bare expression: only echoed when run as a notebook cell).
knn.score(X_test, y_test)
# Exercise
# =========
# Apply the KNeighborsClassifier to the ``iris`` dataset. Play with different values of ``n_neighbors`` and observe how the training and test scores change.