In this notebook we show the following aspects of the EpiGraphDB platform, and how to use the API to get the information:
For detailed documentation on the API endpoints please visit:
from pprint import pformat
import networkx as nx
import pandas as pd
import requests
# Base URL of the EpiGraphDB web API; the requests in this notebook are
# built from it.
API_URL = "https://api.epigraphdb.org"

# Connectivity check against the /ping endpoint.  The HTTP status is checked
# before decoding so that a failed request is reported as an HTTP error
# rather than as a JSON decoding failure.
r = requests.get(f"{API_URL}/ping")
r.raise_for_status()
r.json()
True
Here we query for the metadata information using the endpoint `GET /meta/schema`, which will be used for downstream processing.
# Fetch the schema metadata from GET /meta/schema; both request flags are
# sent as False.
endpoint = "/meta/schema"
params = {"graphviz": False, "plot": False}
r = requests.get(f"{API_URL}{endpoint}", params=params)
r.raise_for_status()
metadata = r.json()

# Preview of metadata information: the top-level keys first, then the
# pretty-printed value stored under each key, cut off after 1000 characters.
keys = metadata.keys()
print(pformat(keys), "\n")
for key, value in metadata.items():
    print(f"# {key}:")
    print(pformat(value)[:1000], "\n")
dict_keys(['nodes', 'edges', 'connections']) # nodes: {'Disease': {'count': 21829, 'properties': {'definition': {'indexed': False, 'type': 'STRING', 'unique': False}, 'doid': {'indexed': False, 'type': 'LIST', 'unique': False}, 'efo': {'indexed': False, 'type': 'LIST', 'unique': False}, 'icd10': {'indexed': False, 'type': 'LIST', 'unique': False}, 'icd9': {'indexed': False, 'type': 'LIST', 'unique': False}, 'id': {'indexed': True, 'type': 'STRING', # edges: {'BN_GEN_COR': {'count': 904832, 'properties': {'gcov_int': {'array': False, 'type': 'FLOAT'}, 'gcov_int_se': {'array': False, 'type': 'FLOAT'}, 'h2_int': {'array': False, 'type': 'FLOAT'}, 'h2_int_se': {'array': False, 'type': 'FLOAT'}, 'h2_obs': {'array': False, 'type': 'FLOAT'}, 'h2_obs_se': {'array': False, 'type': 'FLOAT'}, 'p': {'array': False, 'type': 'FLOAT'}, 'rg': {'array': False, 'type': 'FLOAT'}, 'se': {'array': False, 'type': 'FLOAT'}, 'z': {'array': False, 'type': 'FLOAT'}}}, 'CPIC': {'count': 355, 'properties': {'cpic_level': {'array': False, 'type': 'STRING'}, 'guideline': {'array': False, 'type': 'STRING'}, 'pgx_on_fda_ # connections: [{'count': 2486, 'from_node': 'Drug', 'rel': 'OPENTARGETS_DRUG_TO_DISEASE', 'to_node': 'Disease'}, {'count': 3414, 'from_node': 'Disease', 'rel': 'MONDO_MAP_UMLS', 'to_node': 'SemmedTerm'}, {'count': 626, 'from_node': 'Protein', 'rel': 'PROTEIN_TO_DISEASE', 'to_node': 'Disease'}, {'count': 2822, 'from_node': 'Disease', 'rel': 'MONDO_MAP_EFO', 'to_node': 'Efo'}, {'count': 541, 'from_node': 'Pathway', 'rel': 'PATHWAY_TO_DISEASE', 'to_node': 'Disease'}, {'count': 41706, 'from_node': 'SemmedTerm', 'rel': 'SEM_GENE', 'to_node': 'Gene'}, {'count': 3428531, 'from_node': 'SemmedTriple', 'rel': 'SEM_SUB', 'to_node': 'SemmedTerm'}, {'count': 2081, 'from_node': 'Gwas', 'rel': 'METAMAP_LITE', 'to_node': 'SemmedTerm'}, {'count': 3428531, 'from_node': 'SemmedTerm', 'rel': 'SEM_PREDICATE', 'to_node': 'SemmedTerm'}, {'count': 3428531, 'from_node': 'SemmedTriple', 'rel': 'SEM_OBJ', 
'to_node': 'SemmedTerm'}, {'count': 12488, 'from_n
We can extract the specific meta node information as a pandas dataframe from the metadata.
# One row per entry of metadata["nodes"], with the dictionary keys as index.
meta_node_df = pd.DataFrame.from_dict(metadata["nodes"], orient="index")

# Display copy: rows ordered by index, counts rendered with thousands
# separators.  `meta_node_df` itself is left unchanged.
meta_node_df.sort_index().assign(
    count=lambda frame: frame["count"].map("{:,}".format)
)
count | properties | |
---|---|---|
Disease | 21,829 | {'doid': {'type': 'LIST', 'indexed': False, 'u... |
Drug | 2,455 | {'molecule_type': {'type': 'STRING', 'indexed'... |
Efo | 25,390 | {'type': {'type': 'STRING', 'indexed': False, ... |
Event | 11,868 | {'name': {'type': 'STRING', 'indexed': False, ... |
Gene | 59,171 | {'druggability_tier': {'type': 'STRING', 'inde... |
Gwas | 31,773 | {'note': {'type': 'STRING', 'indexed': False, ... |
Literature | 29,137,785 | {'pubmed_id': {'type': 'STRING', 'indexed': Tr... |
Pathway | 2,180 | {'name': {'type': 'STRING', 'indexed': False, ... |
Protein | 21,543 | {'uniprot_id': {'type': 'STRING', 'indexed': T... |
SemmedTerm | 103,967 | {'name': {'type': 'STRING', 'indexed': True, '... |
SemmedTriple | 3,428,531 | {'subject_id': {'type': 'STRING', 'indexed': F... |
Tissue | 53 | {'tissue': {'type': 'STRING', 'indexed': True,... |
Variant | 88,176 | {'name': {'type': 'STRING', 'indexed': True, '... |
We can also extract the meta relationship (edge) information, and the connections.
# Attributes of each meta relationship, indexed by relationship name.
edge_df = pd.DataFrame.from_dict(metadata["edges"], orient="index")

# Connection record per relationship name.  When a name occurs more than
# once in metadata["connections"], the last record wins.
connection_by_rel = {}
for connection in metadata["connections"]:
    connection_by_rel[connection["rel"]] = connection
endpoint_df = pd.DataFrame.from_dict(connection_by_rel, orient="index")[
    ["from_node", "to_node"]
]

# Index-on-index merge: attach the source / target meta node of every
# relationship to its attributes.
meta_rel_df = edge_df.merge(endpoint_df, left_index=True, right_index=True)

# Display copy: ordered by source then target meta node, counts rendered
# with thousands separators.
meta_rel_df.sort_values(by=["from_node", "to_node"]).assign(
    count=lambda frame: frame["count"].map("{:,}".format)
)
count | properties | from_node | to_node | |
---|---|---|---|---|
MONDO_MAP_EFO | 2,822 | None | Disease | Efo |
MONDO_MAP_UMLS | 3,414 | None | Disease | SemmedTerm |
OPENTARGETS_DRUG_TO_DISEASE | 2,486 | None | Drug | Disease |
CPIC | 355 | {'pharmgkb_level_of_evidence': {'array': False... | Drug | Gene |
OPENTARGETS_DRUG_TO_TARGET | 6,024 | {'phase': {'array': False, 'type': 'STRING'}, ... | Drug | Gene |
EFO_CHILD_OF | 43,154 | None | Efo | Efo |
PRECEDING_EVENT | 10,418 | None | Event | Event |
INTACT_INTERACTS_WITH_GENE_GENE | 2 | {'intact_confidence_score': {'array': False, '... | Gene | Gene |
XQTL_MULTI_SNP_MR | 3,098,049 | {'p': {'array': False, 'type': 'FLOAT'}, 'se':... | Gene | Gwas |
XQTL_SINGLE_SNP_MR_GENE_GWAS | 8,703,863 | {'p': {'array': False, 'type': 'FLOAT'}, 'rsid... | Gene | Gwas |
GENE_TO_LITERATURE | 771 | None | Gene | Literature |
INTACT_INTERACTS_WITH_GENE_PROTEIN | 1,451 | {'intact_confidence_score': {'array': False, '... | Gene | Protein |
GENE_TO_PROTEIN | 20,762 | None | Gene | Protein |
EXPRESSED_IN | 861,552 | {'tpm': {'array': False, 'type': 'FLOAT'}} | Gene | Tissue |
GWAS_NLP_EFO | 6,936 | {'score': {'array': False, 'type': 'FLOAT'}} | Gwas | Efo |
BN_GEN_COR | 904,832 | {'p': {'array': False, 'type': 'FLOAT'}, 'se':... | Gwas | Gwas |
PRS | 132,703 | {'p': {'array': False, 'type': 'FLOAT'}, 'r2':... | Gwas | Gwas |
MR | 583,619 | {'b': {'array': False, 'type': 'FLOAT'}, 'se':... | Gwas | Gwas |
OBS_COR | 17,932 | {'cor': {'array': False, 'type': 'FLOAT'}} | Gwas | Gwas |
GWAS_NLP | 30,838,964 | {'score': {'array': False, 'type': 'FLOAT'}} | Gwas | Gwas |
GWAS_TO_LIT | 19,079,468 | None | Gwas | Literature |
METAMAP_LITE | 2,081 | {'mmi_score': {'array': False, 'type': 'FLOAT'}} | Gwas | SemmedTerm |
GWAS_SEM | 9,075,020 | {'globalTotal': {'array': False, 'type': 'INTE... | Gwas | SemmedTriple |
TOPHITS | 122,730 | {'pval': {'array': False, 'type': 'FLOAT'}, 'b... | Gwas | Variant |
GWAS_TO_VARIANT | 26,521 | {'se': {'array': False, 'type': 'FLOAT'}, 'nca... | Gwas | Variant |
PATHWAY_TO_DISEASE | 541 | None | Pathway | Disease |
EVENT_IN_PATHWAY | 12,488 | None | Pathway | Event |
PATHWAY_TO_LITERATURE | 8,952 | None | Pathway | Literature |
PROTEIN_TO_DISEASE | 626 | None | Protein | Disease |
PROTEIN_IN_EVENT | 13,484 | None | Protein | Event |
PROTEIN_TO_LITERATURE | 107,315 | None | Protein | Literature |
PROTEIN_IN_PATHWAY | 9,955 | None | Protein | Pathway |
INTACT_NOT_INTERACTS_WITH | 699 | {'intact_confidence_score': {'array': False, '... | Protein | Protein |
STRING_INTERACT_WITH | 390,222 | {'score': {'array': False, 'type': 'INTEGER'}} | Protein | Protein |
INTACT_INTERACTS_WITH_PROTEIN_PROTEIN | 187,426 | {'intact_confidence_score': {'array': False, '... | Protein | Protein |
SEM_GENE | 41,706 | None | SemmedTerm | Gene |
SEM_PREDICATE | 3,428,531 | {'count': {'array': False, 'type': 'INTEGER'},... | SemmedTerm | SemmedTerm |
SEM_TO_LIT | 6,127,985 | None | SemmedTriple | Literature |
SEM_OBJ | 3,428,531 | None | SemmedTriple | SemmedTerm |
SEM_SUB | 3,428,531 | None | SemmedTriple | SemmedTerm |
VARIANT_TO_GENE | 59,157 | {'feature_type': {'array': False, 'type': 'STR... | Variant | Gene |
XQTL_SINGLE_SNP_MR_SNP_GENE | 41,564 | None | Variant | Gene |
We can generate a network diagram of the graph database schema using `networkx`.
# Build the schema graph: each row of meta_rel_df contributes an edge between
# its "from_node" and "to_node" meta nodes.
graph = nx.from_pandas_edgelist(
    meta_rel_df,
    source="from_node",
    target="to_node",
)

# Node positions come from the Kamada-Kawai layout; nodes are drawn in white
# with their labels shown.
layout = nx.kamada_kawai_layout(graph)
nx.draw(graph, pos=layout, with_labels=True, node_color="white")
A detailed version of the schema plot can be obtained from the API:
Users can use the explorer on the Web UI to search for a specific node by:
Here we show how these are done at the API level using `Gwas` nodes as an example.
First we need to know what the "ID" and "name" fields are for the meta nodes using `GET /meta/nodes/id-name-schema`:
# Ask the API which property serves as the "ID" field and which as the
# "name" field for each meta node.
id_name_schema_url = f"{API_URL}/meta/nodes/id-name-schema"
r = requests.get(id_name_schema_url)
r.raise_for_status()
meta_node_fields = r.json()

# Bare expression so the notebook displays the mapping.
meta_node_fields
{'Disease': {'id': 'id', 'name': 'label'}, 'Drug': {'id': 'label', 'name': 'label'}, 'Efo': {'id': 'id', 'name': 'value'}, 'Event': {'id': 'reactome_id', 'name': 'name'}, 'Gene': {'id': 'ensembl_id', 'name': 'name'}, 'Tissue': {'id': 'tissue', 'name': 'tissue'}, 'Gwas': {'id': 'id', 'name': 'trait'}, 'Literature': {'id': 'pubmed_id', 'name': 'pubmed_id'}, 'Pathway': {'id': 'reactome_id', 'name': 'name'}, 'Protein': {'id': 'uniprot_id', 'name': 'uniprot_id'}, 'SemmedTerm': {'id': 'id', 'name': 'name'}, 'Variant': {'id': 'name', 'name': 'name'}}
Here we search for nodes that contain "body mass index" in their traits.
# Search Gwas nodes through the node search endpoint, passing the search
# text as the "name" query parameter.
name = "body mass index"
search_url = f"{API_URL}/meta/nodes/Gwas/search"
r = requests.get(search_url, params={"name": name})
r.raise_for_status()

# Print only the first 3000 characters of the pretty-printed response.
search_result = pformat(r.json())
print(search_result[:3000])
{'metadata': {'empty_results': False, 'query': 'MATCH (node: Gwas) WHERE node.trait =~ "(?i).*body ' 'mass index.*" RETURN node LIMIT 10;', 'total_seconds': 0.009114}, 'results': [{'node': {'access': 'public', 'author': 'Hoffmann TJ', 'category': 'NA', 'consortium': 'NA', 'id': 'ebi-a-GCST006368', 'mr': '1', 'note': 'NA', 'nsnp': '27854527', 'pmid': '30108127', 'population': 'European', 'priority': '0', 'sample_size': '315347', 'sex': 'NA', 'subcategory': 'NA', 'trait': 'Body mass index', 'unit': 'NA', 'year': '2018'}}, {'node': {'access': 'public', 'author': 'Locke AE', 'category': 'Risk factor', 'consortium': 'NA', 'id': 'ieu-a-2', 'mr': '1', 'nsnp': '2555511', 'pmid': '25673413', 'population': 'Mixed', 'priority': '1', 'sample_size': '339224', 'sd': '4.77', 'sex': 'Males and Females', 'subcategory': 'Anthropometric', 'trait': 'Body mass index', 'unit': 'NA', 'year': '2015'}}, {'node': {'access': 'public', 'author': 'Locke AE', 'category': 'Risk factor', 'consortium': 'NA', 'id': 'ieu-a-785', 'mr': '1', 'nsnp': '2477659', 'pmid': '25673413', 'population': 'European', 'priority': '2', 'sample_size': '152893', 'sd': '4.77', 'sex': 'Males', 'subcategory': 'Anthropometric', 'trait': 'Body mass index', 'unit': 'NA', 'year': '2015'}}, {'node': {'access': 'public', 'author': 'Locke AE', 'category': 'Risk factor', 'consortium': 'NA', 'id': 'ieu-a-835', 'mr': '1', 'nsnp': '2554668', 'pmid': '25673413', 'population': 'European', 'priority': '3', 'sample_size': '322154', 'sd': '4.77', 'sex': 'Males and Females',
Similarly, we can match a specific node exactly by its ID.
# Look up a single Gwas node through the node search endpoint, passing the
# identifier as the "id" query parameter.
# The variable is named `gwas_id` rather than `id` so that the builtin
# `id()` is not shadowed for the rest of the session.
gwas_id = "ieu-a-2"
r = requests.get(f"{API_URL}/meta/nodes/Gwas/search", params={"id": gwas_id})
r.raise_for_status()

# Print only the first 3000 characters of the pretty-printed response.
print(pformat(r.json())[:3000])
{'metadata': {'empty_results': False, 'query': 'MATCH (node: Gwas {id: "ieu-a-2"}) RETURN node LIMIT ' '10;', 'total_seconds': 0.002578}, 'results': [{'node': {'access': 'public', 'author': 'Locke AE', 'category': 'Risk factor', 'consortium': 'NA', 'id': 'ieu-a-2', 'mr': '1', 'nsnp': '2555511', 'pmid': '25673413', 'population': 'Mixed', 'priority': '1', 'sample_size': '339224', 'sd': '4.77', 'sex': 'Males and Females', 'subcategory': 'Anthropometric', 'trait': 'Body mass index', 'unit': 'NA', 'year': '2015'}}]}
Advanced users who are familiar with Neo4j Cypher can query the database using Cypher directly.
# Cypher statement sent verbatim to the API: MR relationships going from a
# Gwas node whose trait is "Body mass index" to another Gwas node, limited
# to two results.
query = """
MATCH (exposure:Gwas)-[mr:MR]->(outcome:Gwas)
WHERE exposure.trait = "Body mass index"
RETURN exposure, outcome, mr LIMIT 2
"""
cypher_url = f"{API_URL}/cypher"
r = requests.post(cypher_url, json={"query": query})
r.raise_for_status()

# Print only the first 3000 characters of the pretty-printed response.
cypher_result = pformat(r.json())
print(cypher_result[:3000])
{'metadata': {'empty_results': False, 'query': 'MATCH (exposure:Gwas)-[mr:MR]->(outcome:Gwas) WHERE ' 'exposure.trait = "Body mass index" RETURN exposure, ' 'outcome, mr LIMIT 2', 'total_seconds': 0.049648}, 'results': [{'exposure': {'access': 'public', 'author': 'Locke AE', 'category': 'Risk factor', 'consortium': 'NA', 'id': 'ieu-a-2', 'mr': '1', 'nsnp': '2555511', 'pmid': '25673413', 'population': 'Mixed', 'priority': '1', 'sample_size': '339224', 'sd': '4.77', 'sex': 'Males and Females', 'subcategory': 'Anthropometric', 'trait': 'Body mass index', 'unit': 'NA', 'year': '2015'}, 'mr': {'b': 0.0030348598957061768, 'ci_low': -0.002742477459833026, 'ci_upp': 0.008812196552753448, 'log10pval': 1.0, 'method': 'Weighted median', 'moescore': 0.7799999713897705, 'nsnp': 77, 'pval': 0.3032084107398987, 'se': 0.002947675297036767, 'selection': 'DF + HF'}, 'outcome': {'access': 'public', 'author': 'Neale', 'category': 'NA', 'consortium': 'Neale Lab', 'id': 'ukb-a-99', 'mr': '1', 'ncase': '8718', 'ncontrol': '328441', 'note': 'NA', 'nsnp': '10894596', 'population': 'European', 'priority': '1', 'sample_size': '337159', 'sex': 'Males and Females', 'subcategory': 'NA', 'trait': 'Non-cancer illness code self-reported: ' 'eczema/dermatitis', 'unit': 'SD', 'year': '2017'}}, {'exposure': {'access': 'public', 'author': 'Locke AE', 'category': 'Risk factor', 'consortium': 'NA', 'id': 'ieu-a-2', 'mr': '1', 'nsnp': '2555511', 'pmid': '25673413', 'population': 'Mixed', 'priority': '1', 'sample_size': '339224',
Alternatively, we provide an endpoint `POST /cypher/builder/plain` that assists users in constructing simple Cypher queries.
# Request body for the Cypher builder endpoint: source and target meta
# nodes, the meta relationship between them, WHERE clauses as a list of
# strings, and a result limit.
payload = {
    "source_meta_node": "Gwas",
    "target_meta_node": "Gwas",
    "meta_rel": "MR",
    "where": ["source_node.trait = 'Body mass index'"],
    "limit": 2,
}
builder_url = f"{API_URL}/cypher/builder/plain"
r = requests.post(builder_url, json=payload)
r.raise_for_status()

# Print only the first 3000 characters of the pretty-printed response.
builder_result = pformat(r.json())
print(builder_result[:3000])
{'metadata': {'empty_results': False, 'query': 'MATCH (source_node:Gwas) -[rel:MR]- (target_node:Gwas) ' "WHERE source_node.trait = 'Body mass index' RETURN " 'source_node, rel, target_node LIMIT 2', 'total_seconds': 0.035151}, 'results': [{'rel': {'b': 0.0030348598957061768, 'ci_low': -0.002742477459833026, 'ci_upp': 0.008812196552753448, 'log10pval': 1.0, 'method': 'Weighted median', 'moescore': 0.7799999713897705, 'nsnp': 77, 'pval': 0.3032084107398987, 'se': 0.002947675297036767, 'selection': 'DF + HF'}, 'source_node': {'access': 'public', 'author': 'Locke AE', 'category': 'Risk factor', 'consortium': 'NA', 'id': 'ieu-a-2', 'mr': '1', 'nsnp': '2555511', 'pmid': '25673413', 'population': 'Mixed', 'priority': '1', 'sample_size': '339224', 'sd': '4.77', 'sex': 'Males and Females', 'subcategory': 'Anthropometric', 'trait': 'Body mass index', 'unit': 'NA', 'year': '2015'}, 'target_node': {'access': 'public', 'author': 'Neale', 'category': 'NA', 'consortium': 'Neale Lab', 'id': 'ukb-a-99', 'mr': '1', 'ncase': '8718', 'ncontrol': '328441', 'note': 'NA', 'nsnp': '10894596', 'population': 'European', 'priority': '1', 'sample_size': '337159', 'sex': 'Males and Females', 'subcategory': 'NA', 'trait': 'Non-cancer illness code ' 'self-reported: eczema/dermatitis', 'unit': 'SD', 'year': '2017'}}, {'rel': {'b': -3.053751788684167e-05, 'ci_low': -0.00041146361036226153, 'ci_upp': 0.00035038855276070535, 'log10pval': 0.0, 'method': 'RE IVW', 'moescore': 0.800000011920929, 'nsnp': 79, 'pval'
Again for the detailed documentation on the API endpoints please visit: