%matplotlib inline from collections import defaultdict, Counter import urllib import numpy as np from matplotlib import pyplot as plt import matplotlib.gridspec as gridspec from sklearn import preprocessing import scipy.cluster.hierarchy as sch import nimfa clinical_fname = 'clinical.BRCA-US.tsv.gz' urllib.request.urlretrieve('https://dcc.icgc.org/api/v1/download?' \ 'fn=/release_18/Projects/BRCA-US/' \ 'clinical.BRCA-US.tsv.gz', clinical_fname); expression_fname = 'protein_expression.BRCA-US.tsv.gz' urllib.request.urlretrieve('https://dcc.icgc.org/api/v1/download?' \ 'fn=/release_18/Projects/BRCA-US/' \ 'protein_expression.BRCA-US.tsv.gz', expression_fname); C = np.genfromtxt(clinical_fname, delimiter='\t', dtype='object', skip_header=1, usecols=(0, 17, 19)) E = np.genfromtxt(expression_fname, delimiter='\t', dtype='object', skip_header=1, usecols=(0, 2, 7, 10)) donors = set(E[:, 0]) genes = set(E[:, 2]) print(len(donors)) print(len(genes)) spec2gene = defaultdict(set) for spec_id, gene in E[:, 1:3]: spec2gene[spec_id].add(gene) donor2spec = defaultdict(set) for donor, spec_id in E[:, :2]: donor2spec[donor].add(spec_id) # how many specimens are there per donor? tmp = [len(v) for v in donor2spec.values()] print(Counter(tmp)) # protein expression data is available only for tumor data (no matched normal tissue) spec2spec_type = {C[idx, 1]: C[idx, 2] for idx, spec in enumerate(C[:, 1]) if spec in spec2gene} print(set(spec2spec_type.values())) data = np.zeros((len(donors), len(genes))) donor2id = {donor.decode('UTF-8'): idx for idx, donor in enumerate(donors)} id2donor = dict(zip(donor2id.values(), donor2id.keys())) gene2id = {gene.decode('UTF-8'): idx for idx, gene in enumerate(genes)} id2gene = dict(zip(gene2id.values(), gene2id.keys())) for donor, gene, val in E[:, [0, 2, 3]]: data[donor2id[donor.decode('UTF-8')], gene2id[gene.decode('UTF-8')]] = float(val) # level of sparsity print(np.sum(data != 0)/data.size) # negative elements print(np.sum(data < 0)) # remove negative elements data = data - np.min(data) # normalize data data = preprocessing.Normalizer().fit_transform(data) rank = 10 nmf = nimfa.Nmf(data, rank=rank, seed='random_vcol', max_iter=100, update='divergence', objective='div', n_run=50, track_factor=True) nmf_fit = nmf() def clean_axis(ax): ax.get_xaxis().set_ticks([]) ax.get_yaxis().set_ticks([]) for sp in ax.spines.values(): sp.set_visible(False) fig = plt.figure(figsize=(13.9, 10)) heatmapGS = gridspec.GridSpec(1, 2, wspace=.01, hspace=0., width_ratios=[0.25,1]) C = 1 - nmf_fit.fit.consensus() Y = sch.linkage(C, method='average') denAX = fig.add_subplot(heatmapGS[0,0]) denD = sch.dendrogram(Y, orientation='right', link_color_func=lambda k: 'black') clean_axis(denAX) heatmapAX = fig.add_subplot(heatmapGS[0,1]) D = C[denD['leaves'], :][:, denD['leaves']] axi = heatmapAX.imshow(D, interpolation='nearest', aspect='equal', origin='lower', cmap='RdBu') clean_axis(heatmapAX) cb = fig.colorbar(axi, fraction=0.046, pad=0.04, aspect=10) cb.set_label('Distance', fontsize=20) rank = 10 nmf = nimfa.Nmf(data.T, rank=rank, seed='random_vcol', max_iter=200, update='euclidean', objective='conn', conn_change=40, n_run=50, track_factor=True) nmf_fit = nmf() def clean_axis(ax): ax.get_xaxis().set_ticks([]) ax.get_yaxis().set_ticks([]) for sp in ax.spines.values(): sp.set_visible(False) fig = plt.figure(figsize=(13.9, 10)) heatmapGS = gridspec.GridSpec(1, 2, wspace=.01, hspace=0., width_ratios=[0.25,1]) C = 1 - nmf_fit.fit.consensus() Y = sch.linkage(C, method='average') denAX = fig.add_subplot(heatmapGS[0,0]) denD = sch.dendrogram(Y, orientation='right', link_color_func=lambda k: 'black') clean_axis(denAX) heatmapAX = fig.add_subplot(heatmapGS[0,1]) D = C[denD['leaves'], :][:, denD['leaves']] axi = heatmapAX.imshow(D, interpolation='nearest', aspect='equal', origin='lower', cmap='RdBu') clean_axis(heatmapAX) cb = fig.colorbar(axi, fraction=0.046, pad=0.04, aspect=10) cb.set_label('Distance', fontsize=20) rank_cands = range(3, 30, 5) snmf = nimfa.Snmf(data, seed='random_vcol', max_iter=100) summary = snmf.estimate_rank(rank_range=rank_cands, n_run=10, what='all') summary[3].keys() rss = [summary[rank]['rss'] for rank in rank_cands] coph = [summary[rank]['cophenetic'] for rank in rank_cands] disp = [summary[rank]['dispersion'] for rank in rank_cands] spar = [summary[rank]['sparseness'] for rank in rank_cands] spar_w, spar_h = zip(*spar) evar = [summary[rank]['evar'] for rank in rank_cands] plt.plot(rank_cands, rss, 'o-', label='RSS', linewidth=2) plt.plot(rank_cands, coph, 'o-', label='Cophenetic correlation', linewidth=2) plt.plot(rank_cands, disp,'o-', label='Dispersion', linewidth=2) plt.plot(rank_cands, spar_w, 'o-', label='Sparsity (Basis)', linewidth=2) plt.plot(rank_cands, spar_h, 'o-', label='Sparsity (Mixture)', linewidth=2) plt.plot(rank_cands, evar, 'o-', label='Explained variance', linewidth=2) plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), ncol=3, numpoints=1); # SNMF/L: for sparse W (where ‘l’ denotes the sparseness imposed on the left factor) # SNMF/R: for sparse H (where ‘r’ denotes the sparseness imposed on the right factor) rank = 13 snmf = nimfa.Snmf(data.T, rank=rank, seed='random_vcol', version='l', max_iter=200) snmf_fit = snmf() W = snmf_fit.fit.basis() c = 0 k = 10 topk = np.argsort(np.asarray(W[:, c]).flatten())[-k:] val = W[topk, c] plt.barh(np.arange(k) + .5, val, align="center") labels = [id2gene[idx] for idx in topk] plt.yticks(np.arange(k) + .5, labels) plt.xlabel("Weight") plt.ylabel("Gene"); print('\n'.join(labels)) gs = snmf_fit.fit.score_features() gss = snmf_fit.fit.select_features() gene_set = [id2gene[idx] for idx in np.nonzero(gss)[0]] print('\n'.join(gene_set)) # SNMF/L: for sparse W (where ‘l’ denotes the sparseness imposed on the left factor) # SNMF/R: for sparse H (where ‘r’ denotes the sparseness imposed on the right factor) rank = 13 snmf = nimfa.Snmf(data.T, rank=rank, seed='random_vcol', version='r', max_iter=200) snmf_fit = snmf() H = snmf_fit.fit.coef().T c = 0 k = 30 topk = np.argsort(np.asarray(H[:, c]).flatten())[-k:] val = H[topk, c] plt.barh(np.arange(k) + .5, val, color="yellow", align="center") labels = [id2donor[idx] for idx in topk] plt.yticks(np.arange(k) + .5, labels) plt.xlabel("Weight") plt.ylabel("Donor");