In this tutorial, you will:
This notebook makes use of a dataset made of mouse neuron morphologies downloaded from Allen Cell Types Database and MouseLight as well as mouse electrophysiology recordings from Allen Cell Types Database. This dataset is expected to be accessible from a Blue Brain Nexus Project to be configured below. If not, please run the Tutorial: Integrate Neuroscience Datasets from Multiple Sources using MINDS
notebook.
!pip install pyRDF2vec
!pip install nexusforge==0.7.0
!pip install matplotlib
!pip install validators
!pip install gensim
!pip install scikit-learn
!pip install --upgrade nest-asyncio==1.5.1
import getpass
import json
import requests
import time
import numpy as np
import pandas as pd
from community import community_louvain
from pathlib import Path
from kgforge.core import KnowledgeGraphForge
import nexussdk as nexus
from pyrdf2vec import RDF2VecTransformer
from pyrdf2vec.graphs import KG
from pyrdf2vec.graphs.vertex import Vertex
from pyrdf2vec.walkers import WLWalker
from pyrdf2vec.samplers import ObjFreqSampler
from sklearn.manifold import TSNE
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
from urllib.parse import quote_plus
The Nexus sandbox application can be used to get a token:
Step 1: From the web page, click on the login button in the top right corner and follow the instructions on screen.
Step 2: Once logged in, click on the button on the top right that displays your GitHub username. From the dropdown select Copy token
option. This will copy the token to your clipboard.
Once a token is obtained, proceed to paste it as the value of the TOKEN
variable below.
# Prompt for the Nexus access token without echoing it in the notebook output.
TOKEN = getpass.getpass()

# Download the JSON-LD context, pinned to a fixed commit of the Nexus repository.
r = requests.get(
    'https://raw.githubusercontent.com/BlueBrain/nexus/ef830192d4e7bb95f9351c4bdab7b0114c27e2f0/docs/src/main/paradox/docs/getting-started/notebooks/rdfmodel/jsonldcontext.json',
    timeout=30,  # do not hang the notebook forever on a stalled connection
)
# Fail loudly on an HTTP error instead of trying to save an error page as the context.
r.raise_for_status()

# Store the context locally under ./rdfmodel, creating the directory if needed.
dirpath = './rdfmodel'
Path(dirpath).mkdir(parents=True, exist_ok=True)
# Write with an explicit encoding so the result does not depend on the platform default.
with open(f'{dirpath}/jsonldcontext.json', 'w', encoding='utf-8') as outfile:
    json.dump(r.json(), outfile)
# Connection settings of the Nexus sandbox deployment used throughout the notebook.
ENDPOINT = "https://sandbox.bluebrainnexus.io/v1"
ORG = "github-users"
PROJECT = ""  # Provide here the automatically created project name created when you logged into the Nexus sandbox instance.

# Open a forge session from the remote configuration file, targeting the
# "<organisation>/<project>" bucket and authenticating with the token entered above.
forge = KnowledgeGraphForge(
    "https://raw.githubusercontent.com/BlueBrain/nexus/ef830192d4e7bb95f9351c4bdab7b0114c27e2f0/docs/src/main/paradox/docs/getting-started/notebooks/forge.yml",
    token=TOKEN,
    endpoint=ENDPOINT,
    bucket="/".join((ORG, PROJECT)),
)
# Properties kept when reshaping search results for display.
# Each property is listed once; the original list repeated the two
# brainLocation.layer entries.
property_to_display = [
    "id",
    "type",
    "@id",
    "@type",
    "name",
    "subject",
    "brainLocation.brainRegion.id",
    "brainLocation.brainRegion.label",
    "brainLocation.layer.id",
    "brainLocation.layer.label",
    "contribution.agent.label",
]

# Search the project for neuron morphologies (at most `number_of_results`).
_type = "NeuronMorphology"
filters = {"type": _type}
number_of_results = 20
morphologies = forge.search(filters, limit=number_of_results)
print(f"{len(morphologies)} dataset(s) of type {_type} found")

# Keep only the display properties and show the first rows as a table.
reshaped_data = forge.reshape(morphologies, keep=property_to_display)
morphologies_df = forge.as_dataframe(reshaped_data)
morphologies_df.head()
# Search the project for electrophysiology traces (at most `number_of_results`).
_type = "Trace"
filters = dict(type=_type)
number_of_results = 20
ephys = forge.search(filters, limit=number_of_results)
trace_count = len(ephys)
print(f"{trace_count} dataset(s) of type {_type} found")

# Keep only the display properties and show the first rows as a table.
reshaped_data = forge.reshape(ephys, keep=property_to_display)
ephys_df = forge.as_dataframe(reshaped_data)
ephys_df.head()
In this step, from the search results we generate a graph data structure to perform embedding on.
We create a KG
object to feed into a pyrdf2vec
embedder.
# Merge both result sets and convert them into a graph of (subject, predicate, object) triples.
dataset = ephys + morphologies
graph = forge.as_graph(dataset)

# These predicates will not be used during the embedding.
ignored_predicates = {
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#type",
    "http://www.w3.org/2000/01/rdf-schema#label",
    "http://schema.org/distribution",
    "http://schema.org/identifier",
    "http://schema.org/name",
    "http://schema.org/description",
    "https://neuroshapes.org/dateCreated",
}

# These chains of predicates define literals that will not be used during the
# embedding, but will be further incorporated into the similarity search.
# All three chains share the same two leading predicates.
coordinates_prefix = [
    "https://neuroshapes.org/brainLocation",
    "https://neuroshapes.org/coordinatesInBrainAtlas",
]
literal_chains = [
    [*coordinates_prefix, "https://neuroshapes.org/valueX"],
    [*coordinates_prefix, "https://neuroshapes.org/valueY"],
    [*coordinates_prefix, "https://neuroshapes.org/valueZ"],
]

knowledge_graph = KG(skip_predicates=ignored_predicates, literals=literal_chains)

# Register every triple of the graph as a subject -> predicate -> object walk.
for subject_node, predicate_node, object_node in graph:
    head = Vertex(str(subject_node))
    tail = Vertex(str(object_node))
    edge = Vertex(str(predicate_node), predicate=True, vprev=head, vnext=tail)
    knowledge_graph.add_walk(head, edge, tail)
We select the Neuron Morphologies and Electrophysiology recordings to build embeddings.
# Identifiers of the resources to embed: morphologies first, then traces.
morphology_ids = morphologies_df["id"].tolist()
trace_ids = ephys_df["id"].tolist()
entities = morphology_ids + trace_ids
print(f"{len(entities)} Neuron Morphologies and Electrophysiology recordings instances found")
A knowledge graph embedding model represents each node (e.g. each Neuron Morphology or Electrophysiology Trace) in the graph with a numerical dense vector in a space with low dimensionality. The goal of such embedding is to place similar nodes closer in the resulting space. Similarity between two nodes can be computed using a distance score such as cosine similarity between their embedding vectors.
For this tutorial, pyRDF2Vec will be used to generate embeddings for the selected Neuron Morphologies and Electrophysiology Traces.
Note: Similarly to other machine learning tasks, graph embedding relies heavily on the provided dataset: the larger and richer the dataset is, the more accurate and meaningful embeddings can be constructed (and consequently their similarities). In this notebook we use a very small dataset of neuron morphologies and traces and build a 'toy' model serving as a proof of concept, rather than a real-world recommendation model.
# A single WLWalker using an object-frequency sampler.
# NOTE(review): the positional arguments 20 and 100 are presumably the walk
# depth and the number of walks per entity — confirm against the installed
# pyRDF2Vec version.
wl_walker = WLWalker(20, 100, ObjFreqSampler())
walkers = [wl_walker]
transformer = RDF2VecTransformer(walkers=walkers)

# fit_transform returns a pair: the embeddings and the extracted literals.
fit_output = transformer.fit_transform(knowledge_graph, entities)
embeddings, literals = fit_output
Create numerical features from extracted literals.
# One row per entity, one column per extracted coordinate literal.
features = pd.DataFrame(literals, columns=["X", "Y", "Z"])
# Preview up to five random rows; capping the size avoids the ValueError that
# DataFrame.sample raises when asked for more rows than the frame contains.
features.sample(min(5, len(features)))
Aggregate the numerical features into vectors and fill in missing values.
def aggregate_vectors(x):
    """Return the X, Y and Z values of one feature row as a flat list."""
    return [x.X, x.Y, x.Z]


# Turn every row of `features` into an [X, Y, Z] list.
row_vectors = features.apply(aggregate_vectors, axis=1)
feature_vectors = row_vectors.to_list()

# Fill in missing values using SimpleImputer's default settings.
imputer = SimpleImputer()
feature_vectors = imputer.fit_transform(feature_vectors)
Concatenate the produced embedding vectors with the feature vectors.
# Append each entity's feature vector to its embedding, keeping one array per entity.
final_embeddings = []
for position, embedding in enumerate(embeddings):
    combined = np.hstack((embedding, feature_vectors[position]))
    final_embeddings.append(combined)
Reduce dimensionality of vectors to 2D in order to plot them.
# t-SNE needs a perplexity strictly below the number of samples. The searches
# above return at most 20 + 20 = 40 resources, so a fixed perplexity of 40 is
# rejected by recent scikit-learn releases; cap it at n_samples - 1.
n_samples = len(final_embeddings)
tsne_perplexity = min(40, max(1, n_samples - 1))

# NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter` —
# confirm against the installed version.
tsne_model = TSNE(
    perplexity=tsne_perplexity,
    n_components=2,
    init='pca',
    n_iter=3000,
    random_state=23,  # fixed seed so the projection is reproducible
)
embeddings_2D = tsne_model.fit_transform(final_embeddings)
def plot_embeddings(vectors, labels=None, title=None, ids=None):
    """Draw a 2D scatter plot of `vectors`, one colour per distinct label.

    Parameters
    ----------
    vectors : array-like
        Points to plot; column 0 is used as x and column 1 as y.
    labels : sequence, optional
        One label per point. Points sharing a label share a colour and a
        legend entry. When omitted, all points are drawn as a single group
        and no legend is shown.
    title : str, optional
        Plot title; a generic title is used when omitted or empty.
    ids : sequence of str, optional
        One identifier per point. When given, each point is annotated with
        the last "/"-separated segment of its identifier.

    Returns
    -------
    None
        The figure is displayed with `plt.show()`.
    """
    vectors = np.asarray(vectors)

    unlabeled = labels is None
    if unlabeled:
        labels = [0] * len(vectors)

    # NaN never compares equal to itself, so a missing label would match no
    # point (and every NaN object would become its own legend entry). Fold
    # all missing labels into one "nan" group, which is also the text the
    # legend would have displayed for them.
    normalized = [
        "nan" if isinstance(label, float) and label != label else label
        for label in labels
    ]

    # Sort (by text, so mixed types are fine) to get a stable legend order
    # instead of the arbitrary iteration order of a set.
    unique_labels = sorted(set(normalized), key=str)

    # Generate one colour per label, evenly spread over the colour map, then
    # shuffle so that neighbouring labels do not get near-identical colours.
    cm = plt.get_cmap('gist_rainbow')
    generated_colors = np.array([
        cm(1. * i / len(unique_labels))
        for i in range(len(unique_labels))
    ])
    np.random.shuffle(generated_colors)

    alpha = 1
    fig, ax = plt.subplots(figsize=(10, 10))

    # Create one scatter per label so each gets its own legend entry.
    for color, current in zip(generated_colors, unique_labels):
        mask = np.array([label == current for label in normalized], dtype=bool)
        ax.scatter(
            vectors[mask, 0],
            vectors[mask, 1],
            # An explicit colour is given, so no colour map is passed: it
            # would be ignored and only trigger a warning.
            color=tuple(color),
            s=50,
            alpha=alpha,
            label=None if unlabeled else current,
        )

    if not unlabeled:
        ax.legend(loc="center right", bbox_to_anchor=(1.8, 0.5))

    if ids is not None:
        # Annotate on the axes created above rather than on the implicit
        # "current" axes.
        for point_id, point in zip(ids, vectors):
            ax.annotate(point_id.split("/")[-1], point)

    ax.set_title(title if title else "2D visualization of the input node representation")
    plt.show()
# Brain-region label of every embedded resource, in the same order as
# `entities` (morphologies first, then traces).
region_column = "brainLocation.brainRegion.label"
labels = [
    *morphologies_df[region_column].tolist(),
    *ephys_df[region_column].tolist(),
]

# To show the ids of the points, pass `ids=entities` instead of `ids=None`.
plot_embeddings(
    embeddings_2D,
    labels,
    title="2D visualization of the node embedding (colors=brain regions)",
    ids=None,
)
# Write one metadata row per resource (id and brain region) to a TSV file.
metadata_columns = ["id", "brainLocation.brainRegion.label"]
metadata = pd.concat([
    morphologies_df[metadata_columns],
    ephys_df[metadata_columns],
])
metadata = metadata.rename(
    columns={"id": "resource_id", "brainLocation.brainRegion.label": "brain_region"}
)
metadata.to_csv("metadata_rdf2vec_kg_embeddings.tsv", sep="\t", index=None)

# Write the matching vectors, one per line, to a second TSV file.
np.savetxt("vectors_rdf2vec_kg_embeddings.tsv", final_embeddings, delimiter="\t")
Tip: Try playing with the produced embeddings by loading the saved files in https://projector.tensorflow.org/
# Save the trained embedding model to disk.
transformer.embedder._model.save("./kg_embedding_model")

# Attach each vector to its resource as an `embedding` property and push the
# updated resource back through forge.
for entity, vector in zip(entities, final_embeddings):
    resource = forge.retrieve(entity)
    resource.embedding = vector.tolist()
    forge.update(resource)
The goal here is to create an Elasticsearch index within the configured Nexus project in which to store and query the embeddings. Such an index can be created using an ElasticSearchView.
# A timestamp suffix gives every run its own view id.
VIEW_ID = f"https://bbp.epfl.ch/neurosciencegraph/data/views/es/embedding_view_{int(time.time())}"
DIM = final_embeddings[0].shape[0]  # dimensionality of vectors

# Only resources of these types are indexed by the view.
type_to_index = [
    "https://neuroshapes.org/NeuronMorphology",
    "https://neuroshapes.org/Trace"
]

view_data = {
    "@type": [
        "ElasticSearchView"
    ],
    "includeMetadata": True,
    "includeDeprecated": False,
    "resourceTypes": type_to_index,
    "mapping": {
        "properties": {
            "@id": {
                "type": "keyword"
            },
            "@type": {
                "type": "keyword"
            },
            # The vector field queried by the similarity search; its size
            # must match the stored embeddings.
            "embedding": {
                "type": "dense_vector",
                "dims": DIM
            }
        }
    },
    "sourceAsText": False
}

# The view is created through the Nexus SDK rather than through forge, and
# nothing before this point gives the SDK the endpoint or the token. Set both
# explicitly so the request reaches the sandbox with credentials.
# NOTE(review): based on the SDK's documented `config` API — confirm against
# the installed nexussdk version.
nexus.config.set_environment(ENDPOINT)
nexus.config.set_token(TOKEN)

try:
    response = nexus.views.create_(
        org_label=ORG, project_label=PROJECT,
        payload=view_data, view_id=VIEW_ID)
except nexus.HTTPError as ne:
    # Best effort: show the error returned by Nexus and let the notebook continue.
    print(ne.response.json())
Configure forge to point to the newly created ElasticSearch view
# Build the "_search" URL of the view; organisation, project and view id are
# URL-encoded because they are used as path segments.
search_url_parts = [
    ENDPOINT,
    "views",
    quote_plus(ORG),
    quote_plus(PROJECT),
    quote_plus(VIEW_ID),
    "_search",
]
forge._store.service.elastic_endpoint["endpoint"] = "/".join(search_url_parts)
def get_similar_resources(item_id, q="*", number_of_results=10):
    """Get similar resources.

    Given a resource id, execute the recommendation function score query
    to find similar resources, ranked by cosine similarity.

    Parameters
    ----------
    item_id : str
        Identifier of the resource to find neighbours for. The resource must
        carry an `embedding` property.
    q : str, optional
        Currently unused; kept so existing calls keep working.
    number_of_results : int, optional
        Maximum number of hits requested from Elasticsearch.

    Returns
    -------
    tuple
        `(resources, scores)`: two lists of equal length, where `scores[i]`
        is the score of `resources[i]`.
    """
    # Get the item from Nexus and extract its embedding.
    item_source = forge.retrieve(id=item_id)
    item_embedding = list(item_source.embedding)

    # Build the query as a Python structure and serialise it with json.dumps,
    # rather than pasting the repr of a Python list into a hand-written JSON
    # template.
    query = json.dumps({
        "query": {
            "script_score": {
                # Only documents that actually have an embedding can be scored.
                "query": {
                    "exists": {
                        "field": "embedding"
                    }
                },
                "script": {
                    # Adding 1.0 shifts the cosine similarity to a
                    # non-negative range.
                    "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
                    "params": {
                        "query_vector": item_embedding
                    }
                }
            }
        }
    })

    # Treat a missing result set as "no hits" instead of failing on iteration.
    results = forge.elastic(query=query, debug=False, limit=number_of_results) or []

    # Keep only hits that have both a score and a source, so the two returned
    # lists stay aligned element by element.
    hits = [
        hit for hit in results
        if hasattr(hit, "_score") and hasattr(hit, "_source")
    ]
    resources = [forge.from_json(dict(hit._source)) for hit in hits]
    scores = [hit._score for hit in hits]
    return resources, scores
Select a morphology id to recommend similar morphologies for.
# Use the morphology with index label 0 as the query item.
morphology_id = morphologies_df["id"][0]
morphology_id
# Show the row describing the selected morphology.
is_selected = morphologies_df["id"] == morphology_id
morphologies_df[is_selected]
Important: Elasticsearch indexing takes a few seconds. If you run the following cells and get no results, the indexing has most probably not finished yet; try rerunning them in a few seconds.
number_of_results = 10
forge._debug = True

# Fetch the resources most similar to the selected morphology, with their scores.
res, scores = get_similar_resources(
    morphology_id,
    number_of_results=number_of_results,
)
print(f"Found {len(res)} Datasets")

# Tabulate the hits and put the similarity score in the first column.
reshaped_res = forge.reshape(res, keep=property_to_display)
result_df = forge.as_dataframe(reshaped_res)
result_df.insert(0, "score", scores)
result_df
Select an electrophysiology recording id to recommend similar electrophysiology recordings for.
# Use the electrophysiology recording with index label 0 as the query item.
ephys_id = ephys_df["id"][0]
ephys_id

number_of_results = 10
forge._debug = True

# Fetch the resources most similar to the selected recording, with their scores.
res, scores = get_similar_resources(
    ephys_id,
    number_of_results=number_of_results,
)
print(f"Found {len(res)} Datasets")

# Tabulate the hits and put the similarity score in the first column.
reshaped_res = forge.reshape(res, keep=property_to_display)
result_df = forge.as_dataframe(reshaped_res)
result_df.insert(0, "score", scores)
result_df