The idea here is to build a similarity graph between gene expression.The idea is the same as for the genotype graph, see the "genotype graph" notebook for more info.
In this notebook, proteins or gene expression are nodes of the graph. They are connected to their k nearest neighbors. The connections are weighted by the similarity between two protein expression according to a chosen distance. To each protein is associated a vector encoding its variations over the BXD mouse dataset. Two proteins are similar if their vectors are close in term of Euclidean distance.
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
import os
import networkx as nx
import sklearn.metrics
import sklearn.neighbors
import matplotlib.pyplot as plt
# Config for accessing the data on the s3 storage
storage_options = {'anon':True, 'client_kwargs':{'endpoint_url':'https://os.unil.cloud.switch.ch'}}
s3_path = 's3://lts2-graphnex/BXDmice/'
# Load the data
# Tissue
tissue_name = 'LiverProt_CD'
# Other examples:
#tissue_name = 'Eye'
#tissue_name = 'Muscle_CD'
#tissue_name = 'Hippocampus'
#tissue_name = 'Gastrointestinal'
#tissue_name = 'Lung'
tissue_path = os.path.join(s3_path, 'expression data', tissue_name + '.txt.gz')
tissue = pd.read_csv(tissue_path, sep='\t', storage_options=storage_options)
print('File {} Opened.'.format(tissue_path))
# Remove the columns (mouse strains) where there are no measurement:
tissue = tissue.dropna(axis=1)
# Extract the data as a numpy array (drop the first columns)
tissue_values = tissue.iloc[:,2:].values
If unormalized, the graph of gene expression may not account for correlated expressions but only for similar concentration.
from sklearn.preprocessing import normalize
tissue_values = normalize(tissue_values, norm='l2', axis=1)
tissue_values.shape
# Default distance is Euclidean
num_neighbors = 4
tissue_knn = sklearn.neighbors.kneighbors_graph(tissue_values, num_neighbors, mode='distance')
# Optionally, one can use the following function to compute all the distances:
#geno_distances = sklearn.metrics.pairwise_distances(geno_values)
# Distribution of weights
plt.hist(tissue_knn.data, bins=50)
plt.title('Distribution of distances')
plt.xlabel('Distance')
plt.ylabel('Nb of edges')
plt.show()
# Distance to weight
# Modify the non-zero values to turn them into weights instead of distances
def distance2weight(d):
sigma = 1
return np.exp(- sigma * d)
M = tissue_knn.copy() #csr_matrix(tissue_knn.shape)
M.data = distance2weight(tissue_knn.data)
print('A distance of 1 becomes a weight of {}.'.format(str(distance2weight(1))))
# Distribution of weights
plt.hist(M.data, bins=20)
plt.title('Distribution of weights')
plt.xlabel('Weight value')
plt.ylabel('Nb of edges')
plt.show()
G = nx.from_scipy_sparse_matrix(M)
# Adding info on the nodes of the graph
tissueinfo_dic = tissue[['gene']].to_dict()
nx.set_node_attributes(G, tissueinfo_dic['gene'], name='Gene') # gene name
# Saving the graph as a gexf file readable with Gephi.
nx.write_gexf(G,tissue_name + 'graph.gexf')
Graph plotted using Gephi, colored by community (communities found automatically with Gephi). The gene expression forms two distinct clusters.
There are different possible applications of this graph, see the "genotype graph" notebook for examples.