%matplotlib inline

from collections import defaultdict, Counter
import urllib

import numpy as np
from matplotlib import pyplot as plt
import matplotlib.gridspec as gridspec
from sklearn import preprocessing
import scipy.cluster.hierarchy as sch

import nimfa

clinical_fname = 'clinical.BRCA-US.tsv.gz'
urllib.request.urlretrieve('https://dcc.icgc.org/api/v1/download?' \
                           'fn=/release_18/Projects/BRCA-US/' \
                           'clinical.BRCA-US.tsv.gz', clinical_fname);

expression_fname = 'protein_expression.BRCA-US.tsv.gz'
urllib.request.urlretrieve('https://dcc.icgc.org/api/v1/download?' \
                           'fn=/release_18/Projects/BRCA-US/' \
                           'protein_expression.BRCA-US.tsv.gz', expression_fname);

C = np.genfromtxt(clinical_fname, delimiter='\t', dtype='object', 
                  skip_header=1, usecols=(0, 17, 19))

E = np.genfromtxt(expression_fname, delimiter='\t', dtype='object', 
                  skip_header=1, usecols=(0, 2, 7, 10))

donors = set(E[:, 0])
genes = set(E[:, 2])
print(len(donors))
print(len(genes))

spec2gene = defaultdict(set)
for spec_id, gene in E[:, 1:3]:
    spec2gene[spec_id].add(gene)

donor2spec = defaultdict(set)
for donor, spec_id in E[:, :2]:
    donor2spec[donor].add(spec_id)

# how many specimens are there per donor?
tmp = [len(v) for v in donor2spec.values()]
print(Counter(tmp))

# protein expression data is available only for tumor data (no matched normal tissue)
spec2spec_type = {C[idx, 1]: C[idx, 2] for idx, spec in enumerate(C[:, 1]) if spec in spec2gene}
print(set(spec2spec_type.values()))

data = np.zeros((len(donors), len(genes)))

donor2id = {donor.decode('UTF-8'): idx for idx, donor in enumerate(donors)}
id2donor = dict(zip(donor2id.values(), donor2id.keys()))

gene2id = {gene.decode('UTF-8'): idx for idx, gene in enumerate(genes)}
id2gene = dict(zip(gene2id.values(), gene2id.keys()))

for donor, gene, val in E[:, [0, 2, 3]]:
    data[donor2id[donor.decode('UTF-8')], gene2id[gene.decode('UTF-8')]] = float(val)

# level of sparsity
print(np.sum(data != 0)/data.size)

# negative elements
print(np.sum(data < 0))

# remove negative elements
data = data - np.min(data)

# normalize data
data = preprocessing.Normalizer().fit_transform(data)

rank = 10
nmf = nimfa.Nmf(data, rank=rank, seed='random_vcol', max_iter=100, 
                update='divergence', objective='div', n_run=50, track_factor=True)
nmf_fit = nmf()

def clean_axis(ax):
    ax.get_xaxis().set_ticks([])
    ax.get_yaxis().set_ticks([])
    for sp in ax.spines.values():
        sp.set_visible(False)

fig = plt.figure(figsize=(13.9, 10))
heatmapGS = gridspec.GridSpec(1, 2, wspace=.01, hspace=0., width_ratios=[0.25,1])

C = 1 - nmf_fit.fit.consensus()
Y = sch.linkage(C, method='average')

denAX = fig.add_subplot(heatmapGS[0,0])
denD = sch.dendrogram(Y, orientation='right', link_color_func=lambda k: 'black')
clean_axis(denAX)

heatmapAX = fig.add_subplot(heatmapGS[0,1])
D = C[denD['leaves'], :][:, denD['leaves']]
axi = heatmapAX.imshow(D, interpolation='nearest', aspect='equal', origin='lower', cmap='RdBu') 
clean_axis(heatmapAX)

cb = fig.colorbar(axi, fraction=0.046, pad=0.04, aspect=10) 
cb.set_label('Distance', fontsize=20)

rank = 10
nmf = nimfa.Nmf(data.T, rank=rank, seed='random_vcol', max_iter=200, 
                update='euclidean', objective='conn', conn_change=40, 
                n_run=50, track_factor=True)
nmf_fit = nmf()

def clean_axis(ax):
    ax.get_xaxis().set_ticks([])
    ax.get_yaxis().set_ticks([])
    for sp in ax.spines.values():
        sp.set_visible(False)

fig = plt.figure(figsize=(13.9, 10))
heatmapGS = gridspec.GridSpec(1, 2, wspace=.01, hspace=0., width_ratios=[0.25,1])

C = 1 - nmf_fit.fit.consensus()
Y = sch.linkage(C, method='average')

denAX = fig.add_subplot(heatmapGS[0,0])
denD = sch.dendrogram(Y, orientation='right', link_color_func=lambda k: 'black')
clean_axis(denAX)

heatmapAX = fig.add_subplot(heatmapGS[0,1])
D = C[denD['leaves'], :][:, denD['leaves']]
axi = heatmapAX.imshow(D, interpolation='nearest', aspect='equal', origin='lower', cmap='RdBu') 
clean_axis(heatmapAX)

cb = fig.colorbar(axi, fraction=0.046, pad=0.04, aspect=10) 
cb.set_label('Distance', fontsize=20)

rank_cands = range(3, 30, 5)
snmf = nimfa.Snmf(data, seed='random_vcol', max_iter=100)
summary = snmf.estimate_rank(rank_range=rank_cands, n_run=10, what='all')

summary[3].keys()

rss = [summary[rank]['rss'] for rank in rank_cands]
coph = [summary[rank]['cophenetic'] for rank in rank_cands]
disp = [summary[rank]['dispersion'] for rank in rank_cands]
spar = [summary[rank]['sparseness'] for rank in rank_cands]
spar_w, spar_h = zip(*spar)
evar = [summary[rank]['evar'] for rank in rank_cands]

plt.plot(rank_cands, rss, 'o-', label='RSS', linewidth=2)
plt.plot(rank_cands, coph, 'o-', label='Cophenetic correlation', linewidth=2)
plt.plot(rank_cands, disp,'o-', label='Dispersion', linewidth=2)
plt.plot(rank_cands, spar_w, 'o-', label='Sparsity (Basis)', linewidth=2)
plt.plot(rank_cands, spar_h, 'o-', label='Sparsity (Mixture)', linewidth=2)
plt.plot(rank_cands, evar, 'o-', label='Explained variance', linewidth=2)
plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), ncol=3, numpoints=1);

# SNMF/L: for sparse W (where ‘l’ denotes the sparseness imposed on the left factor)
# SNMF/R: for sparse H (where ‘r’ denotes the sparseness imposed on the right factor)
rank = 13
snmf = nimfa.Snmf(data.T, rank=rank, seed='random_vcol', version='l', max_iter=200)
snmf_fit = snmf()

W = snmf_fit.fit.basis()

c = 0
k = 10

topk = np.argsort(np.asarray(W[:, c]).flatten())[-k:]
val = W[topk, c]

plt.barh(np.arange(k) + .5, val, align="center")
labels = [id2gene[idx] for idx in topk]
plt.yticks(np.arange(k) + .5, labels)
plt.xlabel("Weight")
plt.ylabel("Gene");

print('\n'.join(labels))

gs = snmf_fit.fit.score_features()

gss = snmf_fit.fit.select_features()
gene_set = [id2gene[idx] for idx in np.nonzero(gss)[0]]

print('\n'.join(gene_set))

# SNMF/L: for sparse W (where ‘l’ denotes the sparseness imposed on the left factor)
# SNMF/R: for sparse H (where ‘r’ denotes the sparseness imposed on the right factor)
rank = 13
snmf = nimfa.Snmf(data.T, rank=rank, seed='random_vcol', version='r', max_iter=200)
snmf_fit = snmf()

H = snmf_fit.fit.coef().T

c = 0
k = 30

topk = np.argsort(np.asarray(H[:, c]).flatten())[-k:]
val = H[topk, c]

plt.barh(np.arange(k) + .5, val, color="yellow", align="center")
labels = [id2donor[idx] for idx in topk]
plt.yticks(np.arange(k) + .5, labels)
plt.xlabel("Weight")
plt.ylabel("Donor");