#!/usr/bin/env python
# coding: utf-8

# # Applying logistic regression and SVM
# > In this chapter you will learn the basics of applying logistic regression and support vector machines (SVMs) to classification problems. You'll use the scikit-learn library to fit classification models to real data. This is a summary of the lecture "Linear Classifiers in Python", via DataCamp.
#
# - toc: true
# - badges: true
# - comments: true
# - author: Chanseok Kang
# - categories: [Python, Datacamp, Machine_Learning]
# - image: images/plot_4_classifiers.png

# ## Scikit-learn refresher

# ### KNN classification
# In this exercise you'll explore a subset of the [Large Movie Review Dataset](http://ai.stanford.edu/~amaas/data/sentiment/). The X variables contain features based on the words in the movie reviews, and the y variables contain labels for whether the review sentiment is positive (+1) or negative (-1).
# > Large Movie Review Dataset: This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well. Raw text and already processed bag of words formats are provided. See the README file contained in the release for more details.

# In[1]:


import numpy as np
from sklearn.datasets import load_svmlight_file


# In[2]:


X_train, y_train = load_svmlight_file('./dataset/aclImdb/train/labeledBow.feat')
X_test, y_test = load_svmlight_file('./dataset/aclImdb/test/labeledBow.feat')


# In[3]:


# Convert the 1-10 star ratings into binary sentiment labels (-1 = negative, +1 = positive)
y_train[y_train < 5] = -1.0
y_train[y_train >= 5] = 1.0
y_test[y_test < 5] = -1.0
y_test[y_test >= 5] = 1.0


# In[4]:


from sklearn.neighbors import KNeighborsClassifier

# Create and fit the model (trim the training features so the column count matches the test matrix)
knn = KNeighborsClassifier()
knn.fit(X_train[:, :89523], y_train)

# Predict on the test features, print the results
pred = knn.predict(X_test)[0]
print("Prediction for test example 0:", pred)


# ## Applying logistic regression and SVM
#
# ### Running LogisticRegression and SVC
# In this exercise, you'll apply logistic regression and a support vector machine to classify images of handwritten digits.

# In[5]:


from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

digits = datasets.load_digits()
X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target)

# Apply logistic regression and print scores
lr = LogisticRegression()
lr.fit(X_train, y_train)
print(lr.score(X_train, y_train))
print(lr.score(X_test, y_test))

# Apply SVM and print scores
svm = SVC()
svm.fit(X_train, y_train)
print(svm.score(X_train, y_train))
print(svm.score(X_test, y_test))


# ## Linear classifiers
# - Classification: learning to predict categories
# - Decision boundary: the surface separating the different predicted classes
# - Linear classifier: a classifier that learns linear decision boundaries (see the sketch in the next cell)
#   - e.g., logistic regression, linear SVM
# - Linearly separable: a dataset that can be perfectly explained by a linear classifier
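# To make the idea of a linear decision boundary concrete, here is a minimal sketch (an addition, not part of the original DataCamp exercise): for a fitted binary `LogisticRegression`, the boundary is the set of points where `coef_ @ x + intercept_ == 0`, and the sign of that expression reproduces `.predict()`. The toy data below is generated purely for illustration.

# In[ ]:


from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

# Toy 2D binary problem, generated only for this illustration
X_toy, y_toy = make_classification(n_samples=100, n_features=2, n_informative=2,
                                   n_redundant=0, random_state=0)

clf_toy = LogisticRegression().fit(X_toy, y_toy)

# The linear score w @ x + b for every point...
scores = X_toy @ clf_toy.coef_.ravel() + clf_toy.intercept_[0]

# ...predicts class 1 exactly when it is positive, i.e. the decision boundary is w @ x + b = 0
print("sign(w @ x + b) matches predict():",
      np.array_equal(scores > 0, clf_toy.predict(X_toy) == 1))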
# ### Visualizing decision boundaries
# In this exercise, you'll visualize the decision boundaries of various classifier types.

# In[2]:


import matplotlib.pyplot as plt


# In[3]:


#hide
X = np.array([[11.45,  2.4 ], [13.62,  4.95], [13.88,  1.89], [12.42,  2.55], [12.81,  2.31],
              [12.58,  1.29], [13.83,  1.57], [13.07,  1.5 ], [12.7 ,  3.55], [13.77,  1.9 ],
              [12.84,  2.96], [12.37,  1.63], [13.51,  1.8 ], [13.87,  1.9 ], [12.08,  1.39],
              [13.58,  1.66], [13.08,  3.9 ], [11.79,  2.13], [12.45,  3.03], [13.68,  1.83],
              [13.52,  3.17], [13.5 ,  3.12], [12.87,  4.61], [14.02,  1.68], [12.29,  3.17],
              [12.08,  1.13], [12.7 ,  3.87], [11.03,  1.51], [13.32,  3.24], [14.13,  4.1 ],
              [13.49,  1.66], [11.84,  2.89], [13.05,  2.05], [12.72,  1.81], [12.82,  3.37],
              [13.4 ,  4.6 ], [14.22,  3.99], [13.72,  1.43], [12.93,  2.81], [11.64,  2.06],
              [12.29,  1.61], [11.65,  1.67], [13.28,  1.64], [12.93,  3.8 ], [13.86,  1.35],
              [11.82,  1.72], [12.37,  1.17], [12.42,  1.61], [13.9 ,  1.68], [14.16,  2.51]])
y = np.array([ True,  True, False,  True,  True,  True, False, False,  True, False,
               True,  True, False, False,  True, False,  True,  True,  True, False,
               True,  True,  True, False,  True,  True,  True,  True,  True,  True,
               True,  True, False,  True,  True,  True, False, False,  True,  True,
               True,  True, False, False, False,  True,  True,  True, False,  True])


# In[4]:


def make_meshgrid(x, y, h=.02, lims=None):
    """Create a mesh of points to plot in

    Parameters
    ----------
    x: data to base x-axis meshgrid on
    y: data to base y-axis meshgrid on
    h: stepsize for meshgrid, optional

    Returns
    -------
    xx, yy : ndarray
    """
    if lims is None:
        x_min, x_max = x.min() - 1, x.max() + 1
        y_min, y_max = y.min() - 1, y.max() + 1
    else:
        x_min, x_max, y_min, y_max = lims
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    return xx, yy


# In[5]:


def plot_contours(ax, clf, xx, yy, proba=False, **params):
    """Plot the decision boundaries for a classifier.

    Parameters
    ----------
    ax: matplotlib axes object
    clf: a classifier
    xx: meshgrid ndarray
    yy: meshgrid ndarray
    params: dictionary of params to pass to contourf, optional
    """
    if proba:
        Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, -1]
        Z = Z.reshape(xx.shape)
        out = ax.imshow(Z, extent=(np.min(xx), np.max(xx), np.min(yy), np.max(yy)),
                        origin='lower', vmin=0, vmax=1, **params)
        ax.contour(xx, yy, Z, levels=[0.5])
    else:
        Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        out = ax.contourf(xx, yy, Z, **params)
    return out


# In[6]:


def plot_classifier(X, y, clf, ax=None, ticks=False, proba=False, lims=None):
    # assumes classifier "clf" is already fit
    X0, X1 = X[:, 0], X[:, 1]
    xx, yy = make_meshgrid(X0, X1, lims=lims)

    if ax is None:
        plt.figure()
        ax = plt.gca()
        show = True
    else:
        show = False

    # can abstract some of this into a higher-level function for learners to call
    cs = plot_contours(ax, clf, xx, yy, cmap=plt.cm.coolwarm, alpha=0.8, proba=proba)
    if proba:
        cbar = plt.colorbar(cs)
        cbar.ax.set_ylabel(r'probability of red $\Delta$ class', fontsize=20,
                           rotation=270, labelpad=30)
        cbar.ax.tick_params(labelsize=14)
    # ax.scatter(X0, X1, c=y, cmap=plt.cm.coolwarm, s=30, edgecolors='k', linewidth=1)
    labels = np.unique(y)
    if len(labels) == 2:
        ax.scatter(X0[y == labels[0]], X1[y == labels[0]], cmap=plt.cm.coolwarm,
                   s=60, c='b', marker='o', edgecolors='k')
        ax.scatter(X0[y == labels[1]], X1[y == labels[1]], cmap=plt.cm.coolwarm,
                   s=60, c='r', marker='^', edgecolors='k')
    else:
        ax.scatter(X0, X1, c=y, cmap=plt.cm.coolwarm, s=50, edgecolors='k', linewidth=1)
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    # ax.set_xlabel(data.feature_names[0])
    # ax.set_ylabel(data.feature_names[1])
    if ticks:
        ax.set_xticks(())
        ax.set_yticks(())
    # ax.set_title(title)
    if show:
        plt.show()
    else:
        return ax


# In[7]:


def plot_4_classifiers(X, y, clfs):
    # Set up a 2x2 grid for plotting.
    fig, sub = plt.subplots(2, 2)
    plt.subplots_adjust(wspace=0.2, hspace=0.2)

    for clf, ax, title in zip(clfs, sub.flatten(), ("(1)", "(2)", "(3)", "(4)")):
        # clf.fit(X, y)
        plot_classifier(X, y, clf, ax, ticks=True)
        ax.set_title(title)


# In[12]:


from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# Define the classifiers
classifiers = [LogisticRegression(), LinearSVC(), SVC(), KNeighborsClassifier()]

# Fit the classifiers
for c in classifiers:
    c.fit(X, y)

# Plot the classifiers
plot_4_classifiers(X, y, classifiers)
plt.savefig('../images/plot_4_classifiers.png')
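# As a quick follow-up (an addition, not part of the original exercise), you can also compare the training accuracy of the four fitted classifiers on this toy dataset; the non-linear models (SVC with its default RBF kernel, KNN) can bend their boundaries around the data, which often shows up as a higher training score than the two linear models.

# In[ ]:


# Training accuracy of each classifier fitted above
for c in classifiers:
    print(f"{type(c).__name__:>20}: train accuracy = {c.score(X, y):.3f}")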