# Notebook-style script: t-SNE embeddings of the iris, 20 newsgroups and
# MNIST datasets, compared against linear projections (PCA / truncated SVD).

# Script form of the ``%pylab inline`` notebook magic (this is what nbconvert
# emits for it).  It populates the namespace with the numpy / matplotlib names
# used unqualified below (np, arange, random, figure, scatter, cm, matplotlib,
# ...) and renders figures inline.  Requires an IPython kernel.
get_ipython().run_line_magic("pylab", "inline")

from sklearn.manifold import TSNE

help(TSNE)

# ---------------------------------------------------------------------------
# Iris: t-SNE (left) vs. PCA (right)
# ---------------------------------------------------------------------------
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA

iris = load_iris()
X_tsne = TSNE(learning_rate=100).fit_transform(iris.data)
X_pca = PCA().fit_transform(iris.data)

figure(figsize=(10, 5))
subplot(121)
scatter(X_tsne[:, 0], X_tsne[:, 1], c=iris.target)
subplot(122)
scatter(X_pca[:, 0], X_pca[:, 1], c=iris.target)

# ---------------------------------------------------------------------------
# 20 newsgroups: TF-IDF -> truncated SVD (50 components) -> t-SNE
# ---------------------------------------------------------------------------
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

categories = ['alt.atheism', 'talk.religion.misc',
              'comp.graphics', 'sci.space']
newsgroups = fetch_20newsgroups(subset="train", categories=categories)
vectors = TfidfVectorizer().fit_transform(newsgroups.data)
print(repr(vectors))

from sklearn.decomposition import TruncatedSVD

# The TF-IDF matrix is first reduced to 50 dense components, and t-SNE is
# then run on that reduced representation.
X_reduced = TruncatedSVD(n_components=50, random_state=0).fit_transform(vectors)
X_embedded = TSNE(n_components=2, perplexity=40, verbose=2).fit_transform(X_reduced)

fig = figure(figsize=(10, 10))
ax = axes(frameon=False)
setp(ax, xticks=(), yticks=())
subplots_adjust(left=0.0, bottom=0.0, right=1.0, top=0.9,
                wspace=0.0, hspace=0.0)
scatter(X_embedded[:, 0], X_embedded[:, 1], c=newsgroups.target, marker="x")

# ---------------------------------------------------------------------------
# MNIST
# ---------------------------------------------------------------------------
from sklearn.datasets import fetch_openml

# Load MNIST dataset
# ``fetch_mldata`` has been removed from scikit-learn (mldata.org is offline);
# the OpenML dataset "mnist_784" is its documented replacement.
# ``as_frame=False`` keeps the data as plain numpy arrays.
mnist = fetch_openml("mnist_784", version=1, as_frame=False)
# OpenML delivers the labels as strings; cast them to integers so they can be
# used directly as scatter colours.
X, y = mnist.data / 255.0, mnist.target.astype(int)

# Create subset and reduce to first 50 dimensions
indices = arange(X.shape[0])
random.shuffle(indices)  # numpy.random.shuffle (via pylab), in place
n_train_samples = 5000
X_pca = PCA(n_components=50).fit_transform(X)
X_train = X_pca[indices[:n_train_samples]]
y_train = y[indices[:n_train_samples]]

# Plotting function
matplotlib.rc('font', **{'family' : 'sans-serif',
                         'weight' : 'bold',
                         'size'   : 18})
# NOTE: usetex=True needs a working LaTeX installation on the machine.
matplotlib.rc('text', **{'usetex' : True})


def plot_mnist(X, y, X_embedded, name, min_dist=10.0):
    """Scatter-plot a 2-D embedding of digits, overlaying image thumbnails.

    Parameters
    ----------
    X : array
        Original images, one flattened 28x28 image per row; row ``i`` must
        correspond to ``X_embedded[i]``.
    y : array
        Labels, used as the colour of each scatter point.
    X_embedded : array
        Two-dimensional coordinates, one row per sample.
    name : str
        Name of the embedding method, shown in the figure title.
    min_dist : float or None
        Minimum *squared* distance (in embedding units) between any two
        thumbnails.  ``None`` disables thumbnails entirely.
    """
    fig = figure(figsize=(10, 10))
    ax = axes(frameon=False)
    # Report the number of points actually plotted instead of a hard-coded
    # "70,000" (callers typically pass a subset).
    n_points = "{:,}".format(X_embedded.shape[0])
    title("\\textbf{MNIST dataset} -- Two-dimensional "
          "embedding of %s handwritten digits with %s" % (n_points, name))
    setp(ax, xticks=(), yticks=())
    subplots_adjust(left=0.0, bottom=0.0, right=1.0, top=0.9,
                    wspace=0.0, hspace=0.0)
    scatter(X_embedded[:, 0], X_embedded[:, 1], c=y, marker="x")

    if min_dist is not None:
        from matplotlib import offsetbox

        # Seed with a dummy position so the distance array is never empty.
        # Side effect: no thumbnail is placed close to (15, 15).
        shown_images = np.array([[15., 15.]])
        # Visit candidates in random order; named ``order`` so it does not
        # shadow the module-level ``indices``.
        order = arange(X_embedded.shape[0])
        random.shuffle(order)
        for i in order[:5000]:
            # Squared Euclidean distance to every thumbnail placed so far.
            dist = np.sum((X_embedded[i] - shown_images) ** 2, 1)
            if np.min(dist) < min_dist:
                # Too close to an existing thumbnail: skip to avoid overlap.
                continue
            shown_images = np.r_[shown_images, [X_embedded[i]]]
            imagebox = offsetbox.AnnotationBbox(
                offsetbox.OffsetImage(X[i].reshape(28, 28), cmap=cm.gray_r),
                X_embedded[i])
            ax.add_artist(imagebox)


X_train_embedded = TSNE(n_components=2, perplexity=40, verbose=2).fit_transform(X_train)

# Pass the raw images in the same (shuffled) order as the embedded subset so
# that thumbnails line up with their points.
plot_mnist(X[indices[:n_train_samples]], y_train, X_train_embedded,
           "t-SNE", min_dist=20.0)