from sklearn import manifold, datasets
from scipy.stats import entropy, pearsonr
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import random
# Next line to silence pyflakes. This import is needed.
Axes3D
n_points = 1000
X, color = datasets.make_s_curve(n_points, random_state=0)
x, col = datasets.make_swiss_roll(n_points, noise=0.05)
X1, X2, X3 = X.T
# Create figure
fig = plt.figure(figsize=(8, 8))
# Add 3d scatter plot
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=color, cmap=plt.cm.Spectral)
ax.view_init(4, -72)
# Create figure
fig = plt.figure(figsize=(8, 8))
# Add 3d scatter plot
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x[:, 0], x[:, 1], x[:, 2], c=col, cmap=plt.cm.Spectral)
ax.view_init(4, -72)
plt.plot(X1, X3, 'ro');
normalized_mutual_info_score(X1, X3), pearsonr(X1, X3)
(1.0, (-0.08697318258913389, 0.005921261651160034))
def processNegVals(x):
x = np.array(x)
minx = np.min(x)
if minx < 0:
x = x + abs(minx)
""" 0.000001 is used here to avoid 0. """
x = x + 0.000001
px = x/np.sum(x)
return px
entropy(processNegVals(X1), processNegVals(X3))
0.9820849486320271
def KL(P,Q):
epsilon = 0.00001
P = processNegVals(P)
Q = processNegVals(Q)
# You may want to instead make copies to avoid changing the np arrays.
divergence = np.sum(P*np.log(P/Q))
return divergence
KL(X1, X3)
0.9820849486320271
# Kullback-Leibler divergence is basically the sum of the relative entropy of two probabilities:
import scipy
vec = scipy.special.rel_entr(processNegVals(X1), processNegVals(X3))
kl_div = np.sum(vec)
kl_div
0.9820849486320271
plt.plot(x[:,0], x[:,2], 'ro');
KL(x[:,0], x[:,2])
0.6193061278370542
entropy(processNegVals(x[:,0]), processNegVals(x[:,2]))
0.6193061278370544
# Kullback-Leibler divergence is basically the sum of the relative entropy of two probabilities:
import scipy
vec = scipy.special.rel_entr(processNegVals(x[:,0]), processNegVals(x[:,2]))
kl_div = np.sum(vec)
kl_div
0.6193061278370542
n_samples = 1500
noisy_circles = datasets.make_circles(n_samples=n_samples,
factor=.5,
noise=.05)
c1, c2 = noisy_circles[0][:,0], noisy_circles[0][:,1]
plt.scatter(c1, c2);
KL(c1, c2), entropy(processNegVals(c1), processNegVals(c2)), pearsonr(c1, c2)
(0.3705245581961967, 0.3705245581961967, (0.0009233483083248371, 0.9714965955817252))
noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.05)
m1, m2 = noisy_moons[0][:,0], noisy_moons[0][:,1]
plt.scatter(m1, m2);
KL(m1, m2), entropy(processNegVals(m1), processNegVals(m2)), pearsonr(m1, m2)
(0.64323091348723, 0.64323091348723, (-0.44759555499930603, 8.659194606811307e-75))
# Creating our own non-linear dataset
# A good way to create a non-linear dataset is to mix sines with different phases.
n_samples = 1000
de_linearize = lambda X: np.cos(1.5 * np.pi * X) + np.cos( 5 * np.pi * X )
X = np.sort(np.random.rand(n_samples)) * 2
y = de_linearize(X) + np.random.randn(n_samples) * 0.1
plt.plot(X, y);
KL(X, y), entropy(processNegVals(X), processNegVals(y)), pearsonr(X, y)
(0.35368486325681037, 0.35368486325681014, (-0.030606904040838705, 0.3335983907604126))
# generate training set
rng = np.random.RandomState(1)
x = 10 * rng.rand(1000)
x1 = np.random.choice(x, size = 500, replace = False)
x2 = np.array([i for i in x if i not in x1])
y1 = 3 * x1 - 5 + rng.randn(500)
y2 = -3 * x2 + 25 + rng.randn(500)
x = np.hstack((x1, x2))
y = np.hstack((y1, y2))
plt.scatter(x, y);
KL(x, y), entropy(processNegVals(x), processNegVals(y)), pearsonr(x, y)
(0.4341830070262035, 0.4341830070262035, (0.018987388444658082, 0.5486815033156951))
num_samples = 1000
# make a simple unit circle
theta = np.linspace(0, 2*np.pi, num_samples)
a, b = 1 * np.cos(theta), 1 * np.sin(theta)
# generate the points
# theta = np.random.rand((num_samples)) * (2 * np.pi)
r = 10
x, y = r * np.cos(theta)+ rng.randn(num_samples), r * np.sin(theta)+ rng.randn(num_samples)
# plots
plt.figure(figsize=(6,6))
plt.plot(x, y, marker='o', linestyle='');
KL(x, y), entropy(processNegVals(x), processNegVals(y)), pearsonr(x, y)
(0.45900891300219837, 0.45900891300219837, (0.001991373458561054, 0.949850865621781))
David N. Reshef, Yakir A. Reshef, Hilary K. Finucane, Sharon R. Grossman, Gilean McVean, Peter J. Turnbaugh, Eric S. Lander, Michael Mitzenmacher, Pardis C. Sabeti, (2011) Detecting Novel Associations in Large Data Sets. Science.334 (6062):1518-1524.DOI: 10.1126/science.1205438
Abstract
Identifying interesting relationships between pairs of variables in large data sets is increasingly important. Here, we present a measure of dependence for two-variable relationships: the maximal information coefficient (MIC).
We apply MIC and MINE to data sets in global health, gene expression, major-league baseball, and the human gut microbiota and identify known and novel relationships.
!pip install minepy
Collecting minepy Downloading minepy-1.2.5.tar.gz (495 kB) |████████████████████████████████| 495 kB 550 kB/s eta 0:00:01 Requirement already satisfied: numpy>=1.3.0 in /opt/anaconda3/lib/python3.7/site-packages (from minepy) (1.18.1) Building wheels for collected packages: minepy Building wheel for minepy (setup.py) ... done Created wheel for minepy: filename=minepy-1.2.5-cp37-cp37m-macosx_10_9_x86_64.whl size=53204 sha256=bc8898a79054426890486cd6b5726a0e3347dcd73b2616fe865bda505a322376 Stored in directory: /Users/datalab/Library/Caches/pip/wheels/d1/ea/d7/fabbfa6e294adcbc43dabca0e0158dafdd36051246992c7311 Successfully built minepy Installing collected packages: minepy Successfully installed minepy-1.2.5
from __future__ import division
import numpy as np
import matplotlib.pyplot as plt
from minepy import MINE
rs = np.random.RandomState(seed=0)
def mysubplot(x, y, numRows, numCols, plotNum,
xlim=(-4, 4), ylim=(-4, 4)):
r = np.around(np.corrcoef(x, y)[0, 1], 1)
mine = MINE(alpha=0.6, c=15, est="mic_approx")
mine.compute_score(x, y)
mic = np.around(mine.mic(), 1)
ax = plt.subplot(numRows, numCols, plotNum,
xlim=xlim, ylim=ylim)
ax.set_title('Pearson r=%.1f\nMIC=%.1f' % (r, mic),fontsize=10)
ax.set_frame_on(False)
ax.axes.get_xaxis().set_visible(False)
ax.axes.get_yaxis().set_visible(False)
ax.plot(x, y, ',')
ax.set_xticks([])
ax.set_yticks([])
return ax
def rotation(xy, t):
return np.dot(xy, [[np.cos(t), -np.sin(t)], [np.sin(t), np.cos(t)]])
def mvnormal(n=1000):
cors = [1.0, 0.8, 0.4, 0.0, -0.4, -0.8, -1.0]
for i, cor in enumerate(cors):
cov = [[1, cor],[cor, 1]]
xy = rs.multivariate_normal([0, 0], cov, n)
mysubplot(xy[:, 0], xy[:, 1], 3, 7, i+1)
def rotnormal(n=1000):
ts = [0, np.pi/12, np.pi/6, np.pi/4, np.pi/2-np.pi/6,
np.pi/2-np.pi/12, np.pi/2]
cov = [[1, 1],[1, 1]]
xy = rs.multivariate_normal([0, 0], cov, n)
for i, t in enumerate(ts):
xy_r = rotation(xy, t)
mysubplot(xy_r[:, 0], xy_r[:, 1], 3, 7, i+8)
def others(n=1000):
x = rs.uniform(-1, 1, n)
y = 4*(x**2-0.5)**2 + rs.uniform(-1, 1, n)/3
mysubplot(x, y, 3, 7, 15, (-1, 1), (-1/3, 1+1/3))
y = rs.uniform(-1, 1, n)
xy = np.concatenate((x.reshape(-1, 1), y.reshape(-1, 1)), axis=1)
xy = rotation(xy, -np.pi/8)
lim = np.sqrt(2+np.sqrt(2)) / np.sqrt(2)
mysubplot(xy[:, 0], xy[:, 1], 3, 7, 16, (-lim, lim), (-lim, lim))
xy = rotation(xy, -np.pi/8)
lim = np.sqrt(2)
mysubplot(xy[:, 0], xy[:, 1], 3, 7, 17, (-lim, lim), (-lim, lim))
y = 2*x**2 + rs.uniform(-1, 1, n)
mysubplot(x, y, 3, 7, 18, (-1, 1), (-1, 3))
y = (x**2 + rs.uniform(0, 0.5, n)) * \
np.array([-1, 1])[rs.random_integers(0, 1, size=n)]
mysubplot(x, y, 3, 7, 19, (-1.5, 1.5), (-1.5, 1.5))
y = np.cos(x * np.pi) + rs.uniform(0, 1/8, n)
x = np.sin(x * np.pi) + rs.uniform(0, 1/8, n)
mysubplot(x, y, 3, 7, 20, (-1.5, 1.5), (-1.5, 1.5))
xy1 = np.random.multivariate_normal([3, 3], [[1, 0], [0, 1]], int(n/4))
xy2 = np.random.multivariate_normal([-3, 3], [[1, 0], [0, 1]], int(n/4))
xy3 = np.random.multivariate_normal([-3, -3], [[1, 0], [0, 1]], int(n/4))
xy4 = np.random.multivariate_normal([3, -3], [[1, 0], [0, 1]], int(n/4))
xy = np.concatenate((xy1, xy2, xy3, xy4), axis=0)
mysubplot(xy[:, 0], xy[:, 1], 3, 7, 21, (-7, 7), (-7, 7))
plt.figure(facecolor='white', figsize = (16, 10))
mvnormal(n=800)
rotnormal(n=200)
others(n=800)
plt.tight_layout()
plt.show()
/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:65: DeprecationWarning: This function is deprecated. Please call randint(0, 1 + 1) instead
# Author: Jake Vanderplas -- <vanderplas@astro.washington.edu>
from collections import OrderedDict
from functools import partial
from time import time
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.ticker import NullFormatter
from sklearn import manifold, datasets
# Next line to silence pyflakes. This import is needed.
Axes3D
n_points = 1000
X, color = datasets.make_s_curve(n_points, random_state=0)
n_neighbors = 10
n_components = 2
# Create figure
fig = plt.figure(figsize=(15, 8))
fig.suptitle("Manifold Learning with %i points, %i neighbors"
% (1000, n_neighbors), fontsize=14)
# Add 3d scatter plot
ax = fig.add_subplot(251, projection='3d')
ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=color, cmap=plt.cm.Spectral)
ax.view_init(4, -72)
# Set-up manifold methods
LLE = partial(manifold.LocallyLinearEmbedding,
n_neighbors, n_components, eigen_solver='auto')
methods = OrderedDict()
methods['LLE'] = LLE(method='standard')
methods['LTSA'] = LLE(method='ltsa')
methods['Hessian LLE'] = LLE(method='hessian')
methods['Modified LLE'] = LLE(method='modified')
methods['Isomap'] = manifold.Isomap(n_neighbors, n_components)
methods['MDS'] = manifold.MDS(n_components, max_iter=100, n_init=1)
methods['SE'] = manifold.SpectralEmbedding(n_components=n_components,
n_neighbors=n_neighbors)
methods['t-SNE'] = manifold.TSNE(n_components=n_components, init='pca',
random_state=0)
# Plot results
for i, (label, method) in enumerate(methods.items()):
t0 = time()
Y = method.fit_transform(X)
t1 = time()
print("%s: %.2g sec" % (label, t1 - t0))
ax = fig.add_subplot(2, 5, 2 + i + (i > 3))
ax.scatter(Y[:, 0], Y[:, 1], c=color, cmap=plt.cm.Spectral)
ax.set_title("%s (%.2g sec)" % (label, t1 - t0))
ax.xaxis.set_major_formatter(NullFormatter())
ax.yaxis.set_major_formatter(NullFormatter())
ax.axis('tight')
plt.show()
LLE: 0.11 sec LTSA: 0.18 sec Hessian LLE: 0.28 sec Modified LLE: 0.23 sec Isomap: 0.37 sec MDS: 1.5 sec SE: 0.079 sec t-SNE: 3.2 sec