# Logistic regression walkthrough, part 1: a synthetic two-cluster dataset,
# followed by loading MNIST and picking out the digits 0 and 1.
#
# NOTE(review): `np` and `plt` are used below but are not imported in the
# visible part of this file; presumably they come from a pylab-style session
# (e.g. `%pylab inline`). Confirm, or add `import numpy as np` and
# `import matplotlib.pyplot as plt` at the top of the file.
from sklearn.datasets import fetch_openml, make_blobs
from sklearn.linear_model import LogisticRegression

from utility import plot_decision_boundary

# Generate a dataset consisting of two Gaussian clusters.
X, y = make_blobs(centers=2)
# Single-argument print(...) calls behave the same on Python 2 and Python 3.
print("X.shape: {}".format(X.shape))
print("y:  {}".format(y))

plt.prism()
plt.scatter(X[:, 0], X[:, 1], c=y)

logreg = LogisticRegression()

# The first 50 samples are used for training, the remainder for testing.
X_train = X[:50]
y_train = y[:50]
X_test = X[50:]
y_test = y[50:]

# Training points are coloured by label; test points are drawn as white
# triangles because their labels are treated as unknown at this stage.
plt.prism()
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train)
plt.scatter(X_test[:, 0], X_test[:, 1], c='white', marker='^')

logreg.fit(X_train, y_train)

# Predictions on the training set, drawn together with the decision boundary.
y_pred_train = logreg.predict(X_train)
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_pred_train)
plot_decision_boundary(logreg, X)
print("Accuracy on training set: {}".format(logreg.score(X_train, y_train)))

# Predictions on the held-out test set (triangles) next to the training set.
y_pred_test = logreg.predict(X_test)
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_pred_test, marker='^')
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_pred_train)
plot_decision_boundary(logreg, X)
print("Accuracy on test set: {}".format(logreg.score(X_test, y_test)))

# In a notebook the bare expression `logreg.coef_` displays the learned
# weights; as a script statement it would be a silent no-op, so print it.
print(logreg.coef_)

# `fetch_mldata` has been removed from scikit-learn (its backing service is
# gone); `fetch_openml` is the replacement and needs scikit-learn >= 0.20.
mnist = fetch_openml("mnist_784", version=1)
# np.asarray: newer scikit-learn releases may hand back pandas objects here.
X_digits = np.asarray(mnist.data)
# OpenML delivers the labels as strings; convert them to integers so that the
# comparisons `y_digits == 0` / `y_digits == 1` below select rows.
y_digits = np.asarray(mnist.target).astype(int)
print("X_digits.shape: {}".format(X_digits.shape))
print("Unique entries of y_digits: {}".format(np.unique(y_digits)))

# Each row of X_digits is a flattened 28x28 image.
print(y_digits[0])
plt.rc("image", cmap="binary")
plt.matshow(X_digits[0].reshape(28, 28))

zeros = X_digits[y_digits == 0]  # all rows of X whose label is 0
ones = X_digits[y_digits == 1]  # all rows of X whose label is 1
print("zeros.shape:  {}".format(zeros.shape))
print("ones.shape:  {}".format(ones.shape))

# Change the index 0 to look at other ones, or use `zeros[...]` to look at
# some zeros instead.
plt.matshow(ones[0].reshape(28, 28))
# Logistic regression walkthrough, part 2: a binary classifier that tells
# MNIST zeros from ones.
#
# Relies on `zeros`, `ones` and `logreg` defined earlier in the file.
# NOTE(review): `np` and `plt` are not imported in the visible part of this
# file; presumably they come from a pylab-style session. Confirm.
from sklearn.utils import shuffle

# Stack the zero images on top of the one images to form a single data matrix.
X_new = np.vstack([zeros, ones])
# Single-argument print(...) calls behave the same on Python 2 and Python 3.
print("X_new.shape:  {}".format(X_new.shape))

# Matching label vector: 0 for every row taken from `zeros`, 1 for every row
# taken from `ones`, in the same order as the rows of X_new.
y_new = np.hstack([np.repeat(0, zeros.shape[0]),
                   np.repeat(1, ones.shape[0])])
print("y_new.shape:  {}".format(y_new.shape))
print("y_new:  {}".format(y_new))

# The stacked data is ordered by class, so shuffle rows and labels together
# before slicing off a training set; otherwise the split would be one-sided.
X_new, y_new = shuffle(X_new, y_new)

# First 5000 shuffled samples for training, everything after that for testing.
X_mnist_train = X_new[:5000]
y_mnist_train = y_new[:5000]
X_mnist_test = X_new[5000:]
y_mnist_test = y_new[5000:]

logreg.fit(X_mnist_train, y_mnist_train)

# One weight per pixel: reshape the coefficient vector back into the 28x28
# image grid to see which pixels drive the decision.
plt.matshow(logreg.coef_.reshape(28, 28))
plt.colorbar()

print("Accuracy training set: {}".format(
    logreg.score(X_mnist_train, y_mnist_train)))
print("Accuracy test set: {}".format(
    logreg.score(X_mnist_test, y_mnist_test)))