In this notebook we show the following aspects of the EpiGraphDB platform, and how to use the API to get the information:
For detailed documentation on the API endpoints please visit:
from pprint import pformat
import networkx as nx
import pandas as pd
import requests
# default parameters
API_URL = "https://api.epigraphdb.org"
# Parameters
# NOTE(review): repeats the default above; presumably a cell injected by a
# notebook parameterisation tool — confirm before removing.
API_URL = "https://api.epigraphdb.org"
print(API_URL)
# Ping the API. Raise on an HTTP error status (as the other requests in this
# notebook do) so a failing service is reported as an HTTP error rather than
# as a JSON decoding problem or an unexpected payload.
r = requests.get(f"{API_URL}/ping")
r.raise_for_status()
r.json()
https://api.epigraphdb.org
True
Here we query for the metadata information using the endpoint `GET /meta/schema`, which will be used for downstream processing.
# Request the schema with the `graphviz` and `plot` options disabled.
endpoint = "/meta/schema"
params = dict(graphviz=False, plot=False)
schema_url = API_URL + endpoint
r = requests.get(schema_url, params=params)
# Stop here on an HTTP error status instead of decoding an error body.
r.raise_for_status()
metadata = r.json()
# Preview of metadata information
keys = metadata.keys()
print(pformat(keys), "\n")
# Print each top-level section under a "# <key>:" heading, truncated to the
# first 1000 characters of its pretty-printed form.
for key, section in metadata.items():
    print(f"# {key}:")
    section_text = pformat(section)
    print(section_text[:1000], "\n")
dict_keys(['nodes', 'edges', 'connections']) # nodes: {'Disease': {'count': 38960, 'properties': {'_id': {'indexed': True, 'type': 'STRING', 'unique': False}, '_name': {'indexed': True, 'type': 'STRING', 'unique': False}, '_source': {'indexed': False, 'type': 'LIST', 'unique': False}, 'definition': {'indexed': False, 'type': 'STRING', 'unique': False}, 'doid': {'indexed': True, 'type': 'LIST', 'unique': False}, 'efo': {'indexed': False, 'type': 'LIST', # edges: {'BIORXIV_OBJ': {'count': 32651, 'properties': {'_source': {'array': True, 'type': 'LIST'}}}, 'BIORXIV_PREDICATE': {'count': 32648, 'properties': {'_source': {'array': True, 'type': 'LIST'}, 'count': {'array': False, 'type': 'INTEGER'}, 'predicate': {'array': False, 'type': 'STRING'}}}, 'BIORXIV_SUB': {'count': 32657, 'properties': {'_source': {'array': True, 'type': 'LIST'}}}, 'BIORXIV_TO_LIT': {'count': 35211, 'properties': {'_source': {'array': True, 'type': 'LIST'}}}, 'CPIC': {'count': 375, 'properties': {'_source': {'array': True, 'type': 'LIST'}, 'cpic_level': {'array': False, 'type': 'STRING'}, 'guideline': {'array': F # connections: [{'count': 2461, 'from_node': 'Drug', 'rel': 'OPENTARGETS_DRUG_TO_DISEASE', 'to_node': 'Disease'}, {'count': 5763, 'from_node': 'Gene', 'rel': 'GENE_TO_DISEASE', 'to_node': 'Disease'}, {'count': 8247, 'from_node': 'Disease', 'rel': 'MONDO_MAP_UMLS', 'to_node': 'LiteratureTerm'}, {'count': 2819, 'from_node': 'Disease', 'rel': 'MONDO_MAP_EFO', 'to_node': 'Efo'}, {'count': 2463, 'from_node': 'Pathway', 'rel': 'PATHWAY_CHILD_OF', 'to_node': 'Pathway'}, {'count': 121873, 'from_node': 'Protein', 'rel': 'PROTEIN_IN_PATHWAY', 'to_node': 'Pathway'}, {'count': 1969, 'from_node': 'LiteratureTerm', 'rel': 'MEDRXIV_PREDICATE', 'to_node': 'LiteratureTerm'}, {'count': 16435, 'from_node': 'LiteratureTerm', 'rel': 'TERM_TO_GENE', 'to_node': 'Gene'}, {'count': 32651, 'from_node': 'LiteratureTriple', 'rel': 'BIORXIV_OBJ', 'to_node': 'LiteratureTerm'}, {'count': 32657, 'from_node': 
'LiteratureTriple', 'rel': 'BIORXIV_SUB', 'to_node': 'Li
We can extract the specific meta node information as a pandas dataframe from the metadata.
# One row per key of metadata["nodes"]; the nested dict values become columns.
meta_node_df = pd.DataFrame.from_dict(metadata["nodes"], orient="index")
# Display-only view: rows sorted by index and `count` rendered as text with
# thousands separators. `meta_node_df` itself is left unchanged.
meta_node_df.sort_index().assign(
    count=lambda frame: frame["count"].apply(lambda n: format(n, ","))
)
count | properties | |
---|---|---|
Disease | 38,960 | {'_name': {'type': 'STRING', 'indexed': True, ... |
Drug | 2,697 | {'molecule_type': {'type': 'STRING', 'indexed'... |
Efo | 25,390 | {'_name': {'type': 'STRING', 'indexed': True, ... |
Gene | 57,737 | {'druggability_tier': {'type': 'STRING', 'inde... |
Gwas | 34,494 | {'note': {'type': 'STRING', 'indexed': False, ... |
Literature | 3,995,672 | {'issn': {'type': 'STRING', 'indexed': False, ... |
LiteratureTerm | 108,905 | {'_name': {'type': 'STRING', 'indexed': True, ... |
LiteratureTriple | 5,609,945 | {'subject_id': {'type': 'STRING', 'indexed': T... |
Pathway | 2,441 | {'_name': {'type': 'STRING', 'indexed': True, ... |
Protein | 20,280 | {'name': {'type': 'STRING', 'indexed': True, '... |
Tissue | 54 | {'name': {'type': 'STRING', 'indexed': True, '... |
Variant | 99,005 | {'ref': {'type': 'STRING', 'indexed': False, '... |
We can also extract the meta relationship (edge) information, and the connections.
# Edge schema: one row per key of metadata["edges"].
edge_df = pd.DataFrame.from_dict(metadata["edges"], orient="index")
# Key each connection record by its relationship name. If the same "rel"
# appears in several connection records, only the last one is kept.
connections_by_rel = {conn["rel"]: conn for conn in metadata["connections"]}
endpoint_df = pd.DataFrame.from_dict(connections_by_rel, orient="index")[
    ["from_node", "to_node"]
]
# Join on the relationship name, which is the index of both frames.
meta_rel_df = edge_df.merge(endpoint_df, left_index=True, right_index=True)
# Display-only view: ordered by source then target node, with `count`
# rendered as text with thousands separators.
meta_rel_df.sort_values(by=["from_node", "to_node"]).assign(
    count=lambda frame: frame["count"].apply(lambda n: format(n, ","))
)
count | properties | from_node | to_node | |
---|---|---|---|---|
MONDO_MAP_EFO | 2,819 | {'_source': {'array': False, 'type': 'STRING'}} | Disease | Efo |
MONDO_MAP_UMLS | 8,247 | {'_source': {'array': False, 'type': 'STRING'}} | Disease | LiteratureTerm |
OPENTARGETS_DRUG_TO_DISEASE | 2,461 | {'_source': {'array': True, 'type': 'LIST'}} | Drug | Disease |
CPIC | 375 | {'pharmgkb_level_of_evidence': {'array': False... | Drug | Gene |
OPENTARGETS_DRUG_TO_TARGET | 6,534 | {'phase': {'array': False, 'type': 'STRING'}, ... | Drug | Gene |
EFO_CHILD_OF | 43,132 | {'_source': {'array': True, 'type': 'LIST'}} | Efo | Efo |
GENE_TO_DISEASE | 5,763 | {'last_updated': {'array': False, 'type': 'STR... | Gene | Disease |
XQTL_MULTI_SNP_MR | 3,015,233 | {'p': {'array': False, 'type': 'FLOAT'}, 'se':... | Gene | Gwas |
XQTL_SINGLE_SNP_MR_GENE_GWAS | 8,449,779 | {'p': {'array': False, 'type': 'FLOAT'}, 'se':... | Gene | Gwas |
GENE_TO_PROTEIN | 19,142 | {'_source': {'array': True, 'type': 'LIST'}} | Gene | Protein |
EXPRESSED_IN | 2,918,240 | {'tpm': {'array': False, 'type': 'FLOAT'}, '_s... | Gene | Tissue |
GWAS_NLP_EFO | 12,302 | {'score': {'array': False, 'type': 'FLOAT'}, '... | Gwas | Efo |
GWAS_EFO_EBI | 281 | {'_source': {'array': True, 'type': 'LIST'}} | Gwas | Efo |
PRS | 118,124 | {'p': {'array': False, 'type': 'FLOAT'}, 'r2':... | Gwas | Gwas |
MR_EVE_MR | 25,804,945 | {'b': {'array': False, 'type': 'FLOAT'}, 'se':... | Gwas | Gwas |
GEN_COR | 840,960 | {'h2_intercept_SE': {'array': False, 'type': '... | Gwas | Gwas |
OBS_COR | 17,932 | {'_source': {'array': True, 'type': 'LIST'}, '... | Gwas | Gwas |
GWAS_NLP | 89,239,773 | {'score': {'array': False, 'type': 'FLOAT'}, '... | Gwas | Gwas |
GWAS_TO_LITERATURE | 28,111,669 | {'_source': {'array': True, 'type': 'LIST'}} | Gwas | Literature |
METAMAP_LITE | 5,556 | {'_source': {'array': True, 'type': 'LIST'}, '... | Gwas | LiteratureTerm |
GWAS_TO_LITERATURE_TRIPLE | 17,531,153 | {'pval': {'array': False, 'type': 'FLOAT'}, 'g... | Gwas | LiteratureTriple |
OPENGWAS_TOPHITS | 160,283 | {'_source': {'array': True, 'type': 'LIST'}, '... | Gwas | Variant |
GWAS_TO_VARIANT | 26,436 | {'se': {'array': False, 'type': 'FLOAT'}, 'nca... | Gwas | Variant |
TERM_TO_GENE | 16,435 | {'_source': {'array': False, 'type': 'STRING'}} | LiteratureTerm | Gene |
SEMMEDDB_PREDICATE | 5,584,547 | {'count': {'array': False, 'type': 'INTEGER'},... | LiteratureTerm | LiteratureTerm |
BIORXIV_PREDICATE | 32,648 | {'count': {'array': False, 'type': 'INTEGER'},... | LiteratureTerm | LiteratureTerm |
MEDRXIV_PREDICATE | 1,969 | {'count': {'array': False, 'type': 'INTEGER'},... | LiteratureTerm | LiteratureTerm |
BIORXIV_TO_LIT | 35,211 | {'_source': {'array': True, 'type': 'LIST'}} | LiteratureTriple | Literature |
SEMMEDDB_TO_LIT | 10,589,785 | {'_source': {'array': True, 'type': 'LIST'}} | LiteratureTriple | Literature |
SEMMEDDB_SUB | 5,584,547 | {'_source': {'array': True, 'type': 'LIST'}} | LiteratureTriple | LiteratureTerm |
BIORXIV_OBJ | 32,651 | {'_source': {'array': True, 'type': 'LIST'}} | LiteratureTriple | LiteratureTerm |
BIORXIV_SUB | 32,657 | {'_source': {'array': True, 'type': 'LIST'}} | LiteratureTriple | LiteratureTerm |
SEMMEDDB_OBJ | 5,584,547 | {'_source': {'array': True, 'type': 'LIST'}} | LiteratureTriple | LiteratureTerm |
PATHWAY_CHILD_OF | 2,463 | {'_source': {'array': True, 'type': 'LIST'}} | Pathway | Pathway |
PROTEIN_IN_PATHWAY | 121,873 | {'_source': {'array': True, 'type': 'LIST'}} | Protein | Pathway |
STRING_INTERACT_WITH | 827,184 | {'score': {'array': False, 'type': 'FLOAT'}, '... | Protein | Protein |
VARIANT_TO_GENE | 108,561 | {'amino_acids': {'array': False, 'type': 'STRI... | Variant | Gene |
XQTL_SINGLE_SNP_MR_SNP_GENE | 41,564 | {'_source': {'array': True, 'type': 'LIST'}} | Variant | Gene |
We can generate a network diagram of the graph db schema using `networkx`.
# Build the schema graph: each row of meta_rel_df contributes one edge from
# its `from_node` to its `to_node`.
# NOTE(review): no `create_using` is passed, so this presumably yields an
# undirected graph and the drawing carries no edge direction — confirm.
graph = nx.from_pandas_edgelist(
    meta_rel_df,
    source="from_node",
    target="to_node",
)
# Compute node positions first, then draw labelled nodes with a white fill.
layout = nx.kamada_kawai_layout(graph)
nx.draw(G=graph, pos=layout, with_labels=True, node_color="white")
A detailed version of the schema plot can be obtained from the API:
Users can use the explorer on the Web UI to search for a specific node by:
Here we show how these are done at the API level using `Gwas` nodes as an example.
First we need to know what the "ID" and "name" fields are for the meta nodes using `GET /meta/nodes/id-name-schema`:
# Look up which property serves as the "id" and which as the "name" for each
# meta node.
id_name_schema_url = API_URL + "/meta/nodes/id-name-schema"
r = requests.get(id_name_schema_url)
r.raise_for_status()  # fail on an HTTP error status before decoding
meta_node_fields = r.json()
meta_node_fields
{'Disease': {'id': 'id', 'name': 'label'}, 'Drug': {'id': 'label', 'name': 'label'}, 'Efo': {'id': 'id', 'name': 'value'}, 'Gene': {'id': 'ensembl_id', 'name': 'name'}, 'Gwas': {'id': 'id', 'name': 'trait'}, 'Literature': {'id': 'id', 'name': 'id'}, 'LiteratureTerm': {'id': 'id', 'name': 'name'}, 'LiteratureTriple': {'id': 'id', 'name': 'name'}, 'Pathway': {'id': 'id', 'name': 'name'}, 'Protein': {'id': 'uniprot_id', 'name': 'uniprot_id'}, 'Tissue': {'id': 'id', 'name': 'name'}, 'Variant': {'id': 'name', 'name': 'name'}}
Here we search for nodes that contain "body mass index" in their traits.
# Search Gwas nodes by name, passing the search text as the `name` query
# parameter.
name = "body mass index"
search_url = f"{API_URL}/meta/nodes/Gwas/search"
r = requests.get(search_url, params=dict(name=name))
r.raise_for_status()
# Print only the first 3000 characters of the pretty-printed response.
response_text = pformat(r.json())
print(response_text[:3000])
{'metadata': {'empty_results': False, 'query': 'MATCH (node: Gwas) WHERE node.trait =~ "(?i).*body ' 'mass index.*" RETURN node LIMIT 10;', 'total_seconds': 0.021328}, 'results': [{'node': {'_id': 'ieu-a-1089', '_name': 'Body mass index', '_source': ['OpenGWAS-2020-10-13'], 'author': 'Wood', 'build': 'HG19/GRCh37', 'category': 'Risk factor', 'id': 'ieu-a-1089', 'mr': '0', 'note': 'Dominance model? If so then not necessarily of ' 'value for MR; Results from interim Biobank ' 'release enriched for smokers; could lead to ' 'bias through collider issues in MR', 'nsnp': '8654252', 'pmid': '26961502.0', 'population': 'European', 'sample_size': '120286.0', 'sex': 'Males and Females', 'subcategory': 'Anthropometric', 'trait': 'Body mass index', 'year': '2016.0'}}, {'node': {'_id': 'ieu-a-974', '_name': 'Body mass index', '_source': ['OpenGWAS-2020-10-13'], 'author': 'Locke AE', 'build': 'HG19/GRCh37', 'category': 'Risk factor', 'id': 'ieu-a-974', 'mr': '1', 'nsnp': '2494613', 'pmid': '25673413.0', 'population': 'European', 'sample_size': '171977.0', 'sd': '4.77', 'sex': 'Females', 'subcategory': 'Anthropometric', 'trait': 'Body mass index', 'year': '2015.0'}}, {'node': {'_id': 'ieu-a-95', '_name': 'Body mass index', '_source': ['OpenGWAS-2020-10-13'], 'author': 'Randall JC', 'build': 'HG19/GRCh37', 'category': 'Risk factor', 'id': 'ieu-a-95', 'mr': '1', 'nsnp': '2736876', 'pmid': '23754948.0', 'population': 'European', 'sample_size': '73137.0', 'sd': '4.77', 'sex': 'Females', 'subcategory': 'Anthropometric', 'trait': 'Body mass index', 'year': '2013.0'}}, {'node': {'_id': 'ebi-a-GCST004904', '_name': 'Body mass index', '_source': ['OpenGWAS-2020-
Similarly, we can match a specific node exactly by its ID.
# Look up a single Gwas node by passing its ID as the `id` query parameter.
# The value is held in `gwas_id` rather than `id` so that the builtin `id()`
# is not shadowed for the rest of the session.
gwas_id = "ieu-a-2"
r = requests.get(f"{API_URL}/meta/nodes/Gwas/search", params={"id": gwas_id})
r.raise_for_status()
# Print only the first 3000 characters of the pretty-printed response.
print(pformat(r.json())[:3000])
{'metadata': {'empty_results': False, 'query': 'MATCH (node: Gwas {id: "ieu-a-2"}) RETURN node LIMIT ' '10;', 'total_seconds': 0.012118}, 'results': [{'node': {'_id': 'ieu-a-2', '_name': 'Body mass index', '_source': ['OpenGWAS-2020-10-13'], 'author': 'Locke AE', 'build': 'HG19/GRCh37', 'category': 'Risk factor', 'id': 'ieu-a-2', 'mr': '1', 'nsnp': '2555511', 'pmid': '25673413.0', 'population': 'Mixed', 'sample_size': '339224.0', 'sd': '4.77', 'sex': 'Males and Females', 'subcategory': 'Anthropometric', 'trait': 'Body mass index', 'year': '2015.0'}}]}
Advanced users who are familiar with Neo4j Cypher can query the database using Cypher directly.
# Cypher statement: up to two MR_EVE_MR relationships whose exposure Gwas
# node has the trait "Body mass index", returned with both end nodes.
query = """
MATCH (exposure:Gwas)-[mr:MR_EVE_MR]->(outcome:Gwas)
WHERE exposure.trait = "Body mass index"
RETURN exposure, outcome, mr LIMIT 2
"""
# The statement is sent as a JSON body under the "query" key.
cypher_payload = {"query": query}
r = requests.post(API_URL + "/cypher", json=cypher_payload)
r.raise_for_status()
# Print only the first 3000 characters of the pretty-printed response.
response_text = pformat(r.json())
print(response_text[:3000])
{'metadata': {'empty_results': False, 'query': 'MATCH (exposure:Gwas)-[mr:MR_EVE_MR]->(outcome:Gwas) ' 'WHERE exposure.trait = "Body mass index" RETURN ' 'exposure, outcome, mr LIMIT 2', 'total_seconds': 0.011951}, 'results': [{'exposure': {'_id': 'ieu-a-974', '_name': 'Body mass index', '_source': ['OpenGWAS-2020-10-13'], 'author': 'Locke AE', 'build': 'HG19/GRCh37', 'category': 'Risk factor', 'id': 'ieu-a-974', 'mr': '1', 'nsnp': '2494613', 'pmid': '25673413.0', 'population': 'European', 'sample_size': '171977.0', 'sd': '4.77', 'sex': 'Females', 'subcategory': 'Anthropometric', 'trait': 'Body mass index', 'year': '2015.0'}, 'mr': {'_source': ['MR-EvE-2021-03-10'], 'b': 0.0289678834636601, 'ci_low': -0.165265753333202, 'ci_upp': 0.22320152026052198, 'method': 'Simple mean', 'moescore': 0.77, 'nsnp': 37, 'pval': 0.771725286643359, 'se': 0.0990987942841133, 'selection': 'HF'}, 'outcome': {'_id': 'prot-a-1729', '_name': 'Galectin-7', '_source': ['OpenGWAS-2020-10-13'], 'author': 'Sun BB', 'build': 'HG19/GRCh37', 'category': 'Immune system', 'id': 'prot-a-1729', 'mr': '1', 'nsnp': '10534735', 'pmid': '29875488.0', 'population': 'European', 'sample_size': '3301.0', 'sex': 'Males and Females', 'subcategory': 'Protein', 'trait': 'Galectin-7', 'year': '2018.0'}}, {'exposure': {'_id': 'ieu-a-974', '_name': 'Body mass index', '_source': ['OpenGWAS-2020-10-13'], 'author': 'Locke AE', 'build': 'HG19/GRCh37', 'category': 'Risk factor', 'id': 'ieu-a-974', 'mr': '1', 'nsnp': '2494613', 'pmid': '25673413.0', 'population': 'European', 'sample_size': '171977.0',
Again for the detailed documentation on the API endpoints please visit: