%pylab inline
import pandas as pd
rcParams['axes.spines.top'] = False
rcParams['axes.spines.right'] = False
import theano
theano.config.floatX = 'float64'
import theano.tensor as tt
from theano.tensor.nlinalg import det
import pymc3 as pm
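# Simulate 1,000 observations: a binary cluster label (P(cluster = 1) ≈ 0.33),
# a balanced binary batch, a uniform 'sequencing depth', and standard normal noise.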
cluster = np.random.binomial(1, 0.33, (1000, 1))
batch = np.random.binomial(1, 0.5, (1000, 1))
depth = np.random.uniform(size=(1000, 1))
noise = np.random.normal(size=(1000, 1))
df = pd.DataFrame(data={'cluster': cluster[:, 0],
                        'batch': batch[:, 0],
                        'depth': depth[:, 0],
                        'noise': noise[:, 0]},
                  index=np.arange(cluster.shape[0]))
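# Observed expression: unit cluster effect, 0.5 batch effect, slope 2 on depth, noise sd 0.25.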
df['y'] = df['cluster'] + df['batch'] * 0.5 + df['depth'] * 2 + df['noise'] * 0.25
df['y'].hist(bins=20, grid=False, ec='k', fc='w');
for v, g in df.groupby('cluster'):
    g['y'].hist(bins=20, grid=False, ec='k', fc=cm.Greys(v / 2));
for c, gc in df.groupby('cluster'):
    plt.scatter(gc['depth'], gc['y'], c=cm.Greys(c / 2 + 1 / 2));
for v, g in df.groupby('cluster'):
    (g['y'] - g['depth'] * 2).hist(bins=20, grid=False, ec='k', fc=cm.Greys(v / 2));
for v, g in df.groupby('cluster'):
    (g['y'] - g['batch'] * 0.5 - g['depth'] * 2).hist(bins=20, grid=False, ec='k', fc=cm.Greys(v / 2));
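# First attempt: a K = 2 Gaussian mixture on y alone, ignoring the known technical covariates.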
K = 2
with pm.Model() as model:
    sigma = pm.HalfCauchy('sigma', beta=10, testval=1., shape=(K,))
    mu_cluster = pm.Normal('mu_cluster', mu=0., sd=1., testval=1., shape=(1, K))
    pi = pm.Dirichlet('pi', a=0.1 * np.ones(K), shape=(K,))

    likelihood = pm.NormalMixture('y', w=pi, mu=mu_cluster, sd=sigma, observed=df['y'])
means, sds, elbos = pm.variational.advi(model=model, n=1000, learning_rate=1e-1)
Finished [100%]: Average ELBO = -2,489.5
means
{'mu_cluster': array([[ 0.96668414,  1.57696357]]),
 'pi_stickbreaking_': array([-5.4162677]),
 'sigma_log_': array([-0.42210571, -0.20109472])}
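# sigma and pi are reported on ADVI's unconstrained scales (log and stick-breaking);
# transform them back to the constrained space before interpreting.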
weights = pm.distributions.transforms.stick_breaking.backward(means['pi_stickbreaking_']).eval()
plt.bar(np.arange(K), weights, color='k');
from scipy import stats
RV = stats.norm(loc=means['mu_cluster'], scale=np.exp(means['sigma_log_']))
inferred_cluster = df['y'].apply(RV.logpdf).apply(np.argmax)
df['inferred_cluster'] = inferred_cluster
pd.crosstab(df['cluster'], df['inferred_cluster'])
| cluster | inferred: 0 | inferred: 1 |
|---|---|---|
| 0 | 381 | 296 |
| 1 | 37 | 286 |
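# Ignoring batch and depth, the mixture assignment agrees poorly with the true cluster labels.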
for i, g in df.groupby('inferred_cluster'):
    for c, gc in g.groupby('cluster'):
        jitter = np.random.rand(gc.shape[0]) * 0.5
        plt.scatter(i + gc.y * 0 + jitter, gc.y, c=cm.Greys(c / 2 + 1 / 2))
df.head()
| | batch | cluster | depth | noise | y | inferred_cluster |
|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 0.369236 | -0.309137 | 2.161188 | 1 |
| 1 | 0 | 0 | 0.422379 | -0.506229 | 0.718201 | 0 |
| 2 | 1 | 0 | 0.384382 | -0.639732 | 1.108831 | 0 |
| 3 | 1 | 1 | 0.512384 | -0.012628 | 2.521612 | 1 |
| 4 | 1 | 0 | 0.413098 | -2.631005 | 0.668444 | 0 |
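# Encode the known technical covariates (batch and depth) as a design matrix.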
import patsy
X = patsy.dmatrix('~ C(batch) + depth', df, return_type='dataframe')
XX = X.values
K = 10
P = X.shape[1]
XX.shape
(1000, 3)
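# Broadcasting check: a (1000, 1) covariate term plus a (1, K) row of component
# means gives a (1000, K) matrix of per-cell, per-component means.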
(np.dot(XX, np.ones((P, 1))) + np.ones((1, K))).shape
(1000, 10)
K = 2
P = X.shape[1]
with pm.Model() as model:
    sigma = pm.HalfCauchy('sigma', beta=10, testval=1., shape=(K,))
    alpha = pm.Normal('alpha', mu=0., sd=1., testval=1., shape=(P, 1))
    mu_cluster = pm.Normal('mu_cluster', mu=0., sd=1., testval=1., shape=(1, K))
    pi = pm.Dirichlet('pi', a=0.1 * np.ones(K), shape=(K,))

    mu = pm.math.dot(XX, alpha) + mu_cluster
    likelihood = pm.NormalMixture('y', w=pi, mu=mu, sd=sigma, observed=df['y'])
means, sds, elbos = pm.variational.advi(model=model, n=1000, learning_rate=1e-1)
Finished [100%]: Average ELBO = -1,290.9
means
{'alpha': array([[ 0.06706388],
        [ 0.51137022],
        [ 2.03863715]]),
 'mu_cluster': array([[ 0.87454467, -0.11863096]]),
 'pi_stickbreaking_': array([-0.6993302]),
 'sigma_log_': array([-1.35343494, -1.37067626])}
weights = pm.distributions.transforms.stick_breaking.backward(means['pi_stickbreaking_']).eval()
plt.bar(np.arange(K), weights, color='k');
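# The recovered weights (roughly 0.33 and 0.67) are close to the simulated cluster frequencies.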
inferred_cluster = []
for y, xx in zip(df.y.values[:, None], XX):
    mu = xx.dot(means['alpha'])
    RV = stats.norm(loc=mu + means['mu_cluster'], scale=np.exp(means['sigma_log_']))
    inferred_cluster.append(RV.logpdf(y).argmax())
df['inferred_cluster'] = inferred_cluster
pd.crosstab(df.cluster, df.inferred_cluster)
| cluster | inferred: 0 | inferred: 1 |
|---|---|---|
| 0 | 21 | 656 |
| 1 | 317 | 6 |
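# With batch and depth modelled, the confusion table is nearly diagonal
# (component labels are arbitrary, so true cluster 0 maps to inferred component 1).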
for i, g in df.groupby('inferred_cluster'):
    for c, gc in g.groupby('cluster'):
        jitter = np.random.rand(gc.shape[0]) * 0.5
        plt.scatter(i + gc.y * 0 + jitter, gc.y, c=cm.Greys(c / 2 + 1 / 2), label='Ground Truth: {}'.format(c))

plt.xticks([0.25, 1.25], [0, 1]);
plt.xlabel('Inferred cluster')
plt.ylabel('Expression');
noise2 = np.random.normal(size=(1000, 1))
df['noise2'] = noise2
df['y2'] = 1. + 1.2 * df['cluster'] + df['batch'] * 0.5 + df['depth'] * 1.8 + df['noise2'] * 0.25
plt.scatter(df['y'], df['y2'], c=df.cluster);
plt.scatter(df['y'] - df.depth * 2. - df.batch * 0.5, df['y2'] - df.depth * 1.8 - df.batch * 0.5, c=df.cluster);
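# y2 is a second simulated gene with the same cluster, batch, and depth structure;
# once the known technical effects are subtracted, the two clusters separate cleanly
# in the (y, y2) plane.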
from pymc3.math import logsumexp
# Log likelihood of normal distribution
def logp_normal(mu, tau, value):
    # log probability of individual samples
    k = tau.shape[0]
    delta = lambda mu: value - mu
    return (-1 / 2.) * (k * tt.log(2 * np.pi) + tt.log(1. / det(tau)) +
                        (delta(mu).dot(tau) * delta(mu)).sum(axis=1))

# Log likelihood of Gaussian mixture distribution
def logp_gmix(mus, pi, tau, n_samples):
    def logp_(value):
        logps = [tt.log(pi[i]) + logp_normal(mu, tau, value)
                 for i, mu in enumerate(mus)]
        return tt.sum(logsumexp(tt.stacklists(logps)[:, :n_samples], axis=0))
    return logp_
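# (logp_normal and logp_gmix follow the marginalized-mixture helpers from the
# PyMC3 Gaussian-mixture-with-ADVI example; logp_gmix returns a closure that
# pm.DensityDist accepts as its log-likelihood.)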
G = 2
K = 2
with pm.Model() as model:
    mus = [pm.MvNormal('mu_%d' % i, mu=np.zeros(G), tau=0.1 * np.eye(G), shape=(G,))
           for i in range(K)]
    sigma = pm.HalfCauchy('sigma', beta=10, testval=1.)
    alpha = pm.Normal('alpha', mu=0., sd=1., testval=1., shape=(P, G))
    pi = pm.Dirichlet('pi', a=0.1 * np.ones(K), shape=(K,))

    mu_tech = [pm.math.dot(XX, alpha) + mu for mu in mus]
    yg = pm.DensityDist('Y', logp_gmix(mu_tech, pi, sigma * np.eye(G), df.shape[0]),
                        observed=df[['y', 'y2']])
means, sds, elbos = pm.variational.advi(model=model, n=1000, learning_rate=1e-1)
Finished [100%]: Average ELBO = -789.26
means
{'alpha': array([[ 0.63935756,  1.25256157],
        [ 0.48901349,  0.48521498],
        [ 2.03951192,  1.8011695 ]]),
 'mu_0': array([-0.69458221, -0.24840747]),
 'mu_1': array([ 0.29980015,  0.94996631]),
 'pi_stickbreaking_': array([ 0.78966357]),
 'sigma_log_': array(2.7837811858747386)}
plt.scatter(df.y, df.y2)
plt.scatter(*means['mu_0'])
plt.scatter(*means['mu_1']);
plt.plot(elbos);
plt.scatter(df['y'] - df.depth * 2. - df.batch * 0.5, df['y2'] - df.depth * 1.8 - df.batch * 0.5, c=df.cluster);
plt.scatter(*means['mu_0'])
plt.scatter(*means['mu_1']);
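# Next, try to express the two-gene model with NormalMixture directly, giving every
# gene and component its own sigma and cluster mean; the naive broadcasting fails
# (see the error below).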
K = 2
P = X.shape[1]
G = 2
with pm.Model() as model:
    sigma = pm.HalfCauchy('sigma', beta=10, testval=1., shape=(G, K))
    alpha = pm.Normal('alpha', mu=0., sd=1., testval=1., shape=(P, G))
    mu_cluster = pm.Normal('mu_cluster', mu=0., sd=1., testval=1., shape=(G, K))
    pi = pm.Dirichlet('pi', a=0.1 * np.ones(K), shape=(K,))

    mu = pm.math.dot(XX, alpha) + mu_cluster
    likelihood = pm.NormalMixture('y', w=pi, mu=mu, sd=sigma, observed=df[['y', 'y2']])
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-730-13339fcfa340> in <module>()
      8     pi = pm.Dirichlet('pi', a=0.1 * np.ones(K), shape=(K,))
      9
---> 10     mu = pm.math.dot(XX, alpha) + mu_cluster
     11     likelihood = pm.NormalMixture('y', w=pi, mu=mu, sd=sigma, observed=df[['y', 'y2']])

... (theano internals elided) ...

ValueError: Input dimension mis-match. (input[0].shape[0] = 1000, input[1].shape[0] = 2)
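# Why this fails: pm.math.dot(XX, alpha) has shape (cells, G) = (1000, 2), while
# mu_cluster has shape (G, K) = (2, 2), and elementwise addition cannot broadcast a
# leading axis of 1000 against one of 2. A sketch of one possible shape fix in plain
# numpy (an assumption about a way forward, not the notebook's resolution): insert a
# component axis so (cells, G, 1) broadcasts against (1, G, K).
(np.dot(XX, np.ones((P, G)))[:, :, None] + np.ones((1, G, K))).shape
(1000, 2, 2)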
import patsy
X = patsy.dmatrix('np.log(total_counts) + 1', sample_info, return_type='dataframe')
Y = np.log1p(counts.sample(10, axis=0).T)
K = 3
G = Y.shape[1]
P = X.shape[1]
with pm.Model() as model:
    alpha = pm.Normal('alpha', mu=0., sd=1., testval=1., shape=(G, K, P))
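# Prototype the coefficient array in plain numpy to work out the broadcasting
# (note the axis order here, (P, G, K), differs from the model's (G, K, P)).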
alpha = np.ones((P, G, K))
a_0 = alpha[:, :, 0]
X.dot(a_0).shape
(144, 10)
Y.shape
(144, 10)
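# X.dot(a_0) matches Y's (samples, genes) shape: slicing the (P, G, K) coefficient
# array along its last axis yields one full regression design per mixture component.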