# Fetch HTML using requests lib and feed to bs4
import requests
from bs4 import BeautifulSoup
from bs4 import NavigableString

# NOTE: verify=False skips SSL certificate verification for this request
# (urllib3 emits an InsecureRequestWarning). Be careful: the response is not
# authenticated, so treat the downloaded content as untrusted.
# The timeout keeps the call from blocking forever if the server stalls.
result = requests.get("https://globalgenes.org/rarelist", verify=False, timeout=60)
# Fail loudly on an HTTP error instead of silently parsing an error page.
result.raise_for_status()
soup = BeautifulSoup(result.content, 'html.parser')
/usr/local/lib/python3.6/site-packages/urllib3/connectionpool.py:858: InsecureRequestWarning: Unverified HTTPS request is being made. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#ssl-warnings InsecureRequestWarning)
# check: display the parsed page's <title> to confirm the expected document was fetched
soup.title
<title>Rare Disease List</title>
# write formatted html to file
# (not used: this is just a useful side effect for exploration)
# The context manager closes the file even if the write raises, and the
# explicit encoding avoids a UnicodeEncodeError when the platform's default
# encoding cannot represent every character on the page.
with open('rarelist.html', 'w', encoding='utf-8') as f:
    f.write(soup.prettify())
# use bs4 to extract names from HTML
#
# For every <h5> heading, the next sibling <ul> is scanned and the first child
# of each <li> is taken as a disease name. If that first child is an element
# rather than plain text, the element's own first child is used instead.
# When the <li> contains a link with an href, the name is also mapped to it.
names = []  ## all disease names found
name2url = {}  ## mapping of names to URLs
h5s = soup.find_all("h5")
for h5 in h5s:
    ul = h5.find_next_sibling('ul')
    if ul is None:
        # heading with no following list: nothing to extract
        continue
    for li in ul.find_all('li'):
        if not li.contents:
            continue
        n = li.contents[0]
        if not isinstance(n, NavigableString):
            # name is wrapped in an element; unwrap one level
            if not n.contents:
                # empty wrapper element: report it and move on
                print('BAD: {}'.format(li))
                continue
            n = n.contents[0]
        # Keep a plain str: a NavigableString holds a reference to the whole
        # parse tree, which would otherwise stay alive as long as `names` does.
        n = str(n)
        link = li.a
        if link is not None and link.has_attr('href'):
            name2url[n] = link['href']
        names.append(n)
# sanity check: peek at the first 20 extracted names
names[:20]
['Aagenaes syndrome', 'Aarskog syndrome', 'Aase Smith syndrome', 'ABCD syndrome', 'Abderhalden Kaufmann Lignac syndrome', 'Abdominal aortic aneurysm', 'Abdominal chemodectomas with cutaneous angiolipomas', 'Abdominal cystic lymphangioma', 'Abdominal obesity metabolic syndrome', 'Aberrant subclavian artery', 'Abetalipoproteinemia', 'Abidi X-linked mental retardation syndrome', 'Ablepharon macrostomia syndrome', "Abrikosov's tumor", 'Abruzzo Erickson syndrome', 'Absence of fingerprints congenital milia', 'Absence of gluteal muscle', 'Absence of septum pellucidum', 'Absence of Tibia', 'Absence of tibia with polydactyly']
## sanity check URL mapping: peek at the first 10 (name, URL) pairs
list(name2url.items())[:10]
[('Acute disseminated encephalomyelitis', 'http://ulf.org/'), ('Acute hemorrhagic leukoencephalitis', 'http://ulf.org/'), ('Adrenoleukodystrophy X-linked', 'http://ulf.org/'), ('Adrenomyeloneuropathy', 'http://ulf.org/'), ('Aicardi-Goutieres syndrome', 'http://ulf.org/'), ('Alexander disease', 'http://ulf.org/'), ('Alkaptonuria', 'http://www.alkaptonuria.info/'), ('Alpers syndrome', 'http://www.umdf.org/site/c.8qKOJ0MvF7LUG/b.7929671/k.BDF0/Home.htm'), ('Alzheimer disease familial', 'http://www.mitoaction.org/'), ('Alzheimer disease type 1', 'http://www.mitoaction.org/')]
import csv

# Write one tab-separated row per extracted name: the name, then its URL.
# Names without a URL yield None, which the csv module writes as an empty field.
# The explicit encoding keeps non-ASCII names from failing under a
# non-UTF-8 default locale.
with open('rare-list.tsv', 'w', newline='', encoding='utf-8') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter='\t')
    spamwriter.writerows([n, name2url.get(n)] for n in names)
## use ontobio lib for fetching ontologies and lexical mapping
from ontobio import OntologyFactory
/usr/local/lib/python3.6/site-packages/cachier/mongo_core.py:24: UserWarning: Cachier warning: pymongo was not found. MongoDB cores will not work. "Cachier warning: pymongo was not found. MongoDB cores will not work.")
# Factory used to obtain ontology objects from a handle string
ofa = OntologyFactory()
# hp and mondo are the two ontologies the scraped names are matched against
# further down (handles presumably resolve to the OBO releases of HP and
# MONDO -- TODO confirm against the ontobio docs)
hp = ofa.create('obo:hp')
mondo = ofa.create('obo:mondo')
from ontobio.lexmap import LexicalMapEngine
# Engine that indexes ontologies and produces the cross-ontology mapping graph
lexmap = LexicalMapEngine()
# Quick hack to make a degenerate 'ontology' from the list of names
from ontobio import Ontology
def ont_from_names(names):
    """Build a flat Ontology (id 'rare') holding one node per given name.

    Each name is passed to ``add_node`` twice, so it serves as the node's
    ID as well as its second argument (presumably the label). No edges
    are added.
    """
    flat = Ontology(id='rare')
    for label in names:
        flat.add_node(label, label)
    return flat
# Wrap the scraped disease names in a pseudo-ontology so they can be indexed
# alongside hp and mondo
rare = ont_from_names(names)
# display the object (notebook cell output)
rare
rare handle: None meta: None
## quick inspection: first 10 node IDs (the IDs are the disease names themselves)
rare.nodes()[0:10]
['Aagenaes syndrome', 'Aarskog syndrome', 'Aase Smith syndrome', 'ABCD syndrome', 'Abderhalden Kaufmann Lignac syndrome', 'Abdominal aortic aneurysm', 'Abdominal chemodectomas with cutaneous angiolipomas', 'Abdominal cystic lymphangioma', 'Abdominal obesity metabolic syndrome', 'Aberrant subclavian artery']
## index the 3 ontologies (hp, then mondo, then the scraped name list)
for ont in (hp, mondo, rare):
    lexmap.index_ontology(ont)
WARNING:root:Incomplete syn: HP:0000991 "" hasRelatedSynonym None [] 1.0 WARNING:root:Incomplete syn: HP:0012377 "" hasRelatedSynonym None [] 1.0 WARNING:root:Incomplete syn: HP:0000510 "" hasRelatedSynonym None [] 1.0 WARNING:root:Ignoring suspicous synonym: UBERON:0002722 "4" hasBroadSynonym None ['http://uri.neuinfo.org/nif/nifstd/birnlex_1488', 'NIFSTD:NeuroNames_abbrevSource'] 1.0 WARNING:root:Ignoring suspicous synonym: UBERON:0001715 "3" hasBroadSynonym None ['http://uri.neuinfo.org/nif/nifstd/birnlex_1240', 'NIFSTD:NeuroNames_abbrevSource'] 1.0
## CONFIGURE
## we will map R to mondo and hp separately
lexmap.ontology_pairs = [(rare.id, target.id) for target in (mondo, hp)]

# align, then turn the resulting mapping graph into a dataframe
g = lexmap.get_xref_graph()
df = lexmap.as_dataframe(g)
df
left | left_label | right | right_label | left_match_type | right_match_type | left_match_val | right_match_val | score | left_simscore | ... | conditional_pr_equiv | pr_subClassOf | pr_superClassOf | pr_equivalentTo | pr_other | left_novel | right_novel | left_consistent | right_consistent | equiv_clique_size | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
3287 | 11-beta-hydroxylase deficiency | 11-beta-hydroxylase deficiency | MONDO:0008729 | congenital adrenal hyperplasia due to 11-beta-... | label | hasRelatedSynonym | 11-beta-hydroxylase deficiency | 11-Beta-Hydroxylase Deficiency | 50.0 | 1.000000 | ... | 1.000000 | 0.061581 | 0.061581 | 0.799654 | 0.077184 | True | True | False | False | 7 |
2199 | 15q13.3 microdeletion syndrome | 15q13.3 microdeletion syndrome | MONDO:0012774 | chromosome 15q13.3 microdeletion syndrome | label | hasExactSynonym | 15q13.3 microdeletion syndrome | 15q13.3 microdeletion syndrome | 90.0 | 1.000000 | ... | 1.000000 | 0.029969 | 0.029969 | 0.918763 | 0.021299 | True | True | False | False | 6 |
3339 | 17-alpha-hydroxylase deficiency | 17-alpha-hydroxylase deficiency | MONDO:0008730 | congenital adrenal hyperplasia due to 17-alpha... | label | hasRelatedSynonym | 17-alpha-hydroxylase deficiency | 17-Alpha-Hydroxylase Deficiency | 50.0 | 1.000000 | ... | 1.000000 | 0.061581 | 0.061581 | 0.799654 | 0.077184 | True | True | False | False | 5 |
3481 | 17-beta hydroxysteroid dehydrogenase 3 deficiency | 17-beta hydroxysteroid dehydrogenase 3 deficiency | MONDO:0009916 | 46,XY disorder of sex development due to 17-be... | label | hasExactSynonym | 17-beta hydroxysteroid dehydrogenase 3 deficiency | 17-beta-hydroxysteroid dehydrogenase 3 deficiency | 58.0 | 1.000000 | ... | 1.000000 | 0.205965 | 0.205965 | 0.392394 | 0.195675 | True | True | False | False | 7 |
2592 | 17q21.31 microdeletion syndrome | 17q21.31 microdeletion syndrome | MONDO:0012496 | Koolen de Vries syndrome | label | hasExactSynonym | 17q21.31 microdeletion syndrome | 17q21.31 microdeletion syndrome | 90.0 | 1.000000 | ... | 0.473684 | 0.168017 | 0.055554 | 0.749591 | 0.026839 | True | True | False | False | 8 |
2593 | 17q21.31 microdeletion syndrome | 17q21.31 microdeletion syndrome | MONDO:0018216 | 17q21.31 microdeletion syndrome | label | label | 17q21.31 microdeletion syndrome | 17q21.31 microdeletion syndrome | 100.0 | 1.000000 | ... | 0.526316 | 0.051671 | 0.108232 | 0.824734 | 0.015363 | True | True | False | False | 8 |
2987 | 18 Hydroxylase deficiency | 18 Hydroxylase deficiency | MONDO:0008751 | Corticosterone methyloxidase type 1 deficiency | label | hasRelatedSynonym | 18 Hydroxylase deficiency | 18-Hydroxylase Deficiency | 32.0 | 1.000000 | ... | 0.355556 | 0.232996 | 0.289482 | 0.283582 | 0.193941 | True | True | False | False | 6 |
2986 | 18 Hydroxylase deficiency | 18 Hydroxylase deficiency | MONDO:0020489 | familial hyperreninemic hypoaldosteronism type 1 | label | hasExactSynonym | 18 Hydroxylase deficiency | 18-hydroxylase deficiency | 58.0 | 1.000000 | ... | 0.644444 | 0.292046 | 0.210145 | 0.309167 | 0.188643 | True | True | False | False | 6 |
1960 | 1q21.1 microdeletion syndrome | 1q21.1 microdeletion syndrome | MONDO:0012914 | chromosome 1q21.1 deletion syndrome | label | hasExactSynonym | 1q21.1 microdeletion syndrome | 1q21.1 microdeletion syndrome | 90.0 | 1.000000 | ... | 1.000000 | 0.030109 | 0.030109 | 0.923042 | 0.016740 | True | True | False | False | 6 |
1428 | 2 4-Dienoyl-CoA reductase deficiency | 2 4-Dienoyl-CoA reductase deficiency | MONDO:0014464 | progressive encephalopathy with leukodystrophy... | label | hasExactSynonym | 2 4-Dienoyl-CoA reductase deficiency | 2,4-dienoyl-CoA reductase deficiency | 58.0 | 1.000000 | ... | 1.000000 | 0.200803 | 0.200803 | 0.382559 | 0.215835 | True | True | False | False | 5 |
4514 | 2-Hydroxyglutaric aciduria | 2-Hydroxyglutaric aciduria | MONDO:0016001 | 2-hydroxyglutaric aciduria | label | label | 2-Hydroxyglutaric aciduria | 2-hydroxyglutaric aciduria | 100.0 | 1.000000 | ... | 1.000000 | 0.028758 | 0.028758 | 0.925963 | 0.016522 | True | True | False | False | 7 |
1888 | 2-methyl-3-hydroxybutyric aciduria | 2-methyl-3-hydroxybutyric aciduria | MONDO:0010327 | HSD10 disease | label | hasExactSynonym | 2-methyl-3-hydroxybutyric aciduria | 2-methyl-3-hydroxybutyric aciduria | 90.0 | 1.000000 | ... | 1.000000 | 0.029969 | 0.029969 | 0.918763 | 0.021299 | True | True | False | False | 5 |
1202 | 2-methylbutyryl-CoA dehydrogenase deficiency | 2-methylbutyryl-CoA dehydrogenase deficiency | MONDO:0012392 | 2-methylbutyryl-CoA dehydrogenase deficiency | label | label | 2-methylbutyryl-CoA dehydrogenase deficiency | 2-methylbutyryl-CoA dehydrogenase deficiency | 100.0 | 1.000000 | ... | 1.000000 | 0.028795 | 0.028795 | 0.927169 | 0.015241 | True | True | False | False | 7 |
3288 | 21-hydroxylase deficiency | 21-hydroxylase deficiency | MONDO:0008728 | classic congenital adrenal hyperplasia due to ... | label | hasRelatedSynonym | 21-hydroxylase deficiency | 21-Hydroxylase Deficiency | 50.0 | 1.000000 | ... | 1.000000 | 0.061581 | 0.061581 | 0.799654 | 0.077184 | True | True | False | False | 5 |
3507 | 22q11.2 deletion syndrome | 22q11.2 deletion syndrome | MONDO:0008644 | velocardiofacial syndrome | label | hasExactSynonym | 22q11.2 deletion syndrome | deletion 22q11.2 syndrome | 58.0 | 1.000000 | ... | 0.134754 | 0.179472 | 0.287938 | 0.282070 | 0.250520 | True | True | False | False | 41 |
2964 | 22q11.2 deletion syndrome | 22q11.2 deletion syndrome | MONDO:0018923 | 22q11.2 deletion syndrome | label | label | 22q11.2 deletion syndrome | 22q11.2 deletion syndrome | 100.0 | 0.166667 | ... | 0.115075 | 0.092223 | 0.035954 | 0.841716 | 0.030107 | True | True | False | False | 41 |
1721 | 3 methylglutaconic aciduria type I | 3 methylglutaconic aciduria type I | MONDO:0009610 | 3-methylglutaconic aciduria type 1 | label | label | 3 methylglutaconic aciduria type I | 3-methylglutaconic aciduria type 1 | 64.0 | 1.000000 | ... | 1.000000 | 0.200803 | 0.200803 | 0.382559 | 0.215835 | True | True | False | False | 9 |
1720 | 3 methylglutaconic aciduria type IV | 3 methylglutaconic aciduria type IV | MONDO:0009611 | 3-methylglutaconic aciduria type 4 | label | label | 3 methylglutaconic aciduria type IV | 3-methylglutaconic aciduria type 4 | 64.0 | 1.000000 | ... | 1.000000 | 0.200803 | 0.200803 | 0.382559 | 0.215835 | True | True | False | False | 8 |
2580 | 3 methylglutaconic aciduria type V | 3 methylglutaconic aciduria type V | MONDO:0012435 | 3-methylglutaconic aciduria type 5 | label | label | 3 methylglutaconic aciduria type V | 3-methylglutaconic aciduria type 5 | 64.0 | 1.000000 | ... | 1.000000 | 0.198342 | 0.198342 | 0.377872 | 0.225444 | True | True | False | False | 7 |
1877 | 3-Hydroxyisobutyric aciduria | 3-Hydroxyisobutyric aciduria | MONDO:0009371 | 3-hydroxyisobutyric aciduria | label | label | 3-Hydroxyisobutyric aciduria | 3-hydroxyisobutyric aciduria | 100.0 | 1.000000 | ... | 1.000000 | 0.028795 | 0.028795 | 0.927169 | 0.015241 | True | True | False | False | 8 |
3289 | 3-beta-hydroxysteroid dehydrogenase deficiency | 3-beta-hydroxysteroid dehydrogenase deficiency | MONDO:0008727 | congenital adrenal hyperplasia due to 3-beta-h... | label | hasRelatedSynonym | 3-beta-hydroxysteroid dehydrogenase deficiency | 3-Beta-Hydroxysteroid Dehydrogenase Deficiency | 50.0 | 1.000000 | ... | 1.000000 | 0.061581 | 0.061581 | 0.799654 | 0.077184 | True | True | False | False | 5 |
3670 | 3-methylglutaconic aciduria type III | 3-methylglutaconic aciduria type III | MONDO:0009787 | 3-methylglutaconic aciduria type 3 | label | hasExactSynonym | 3-methylglutaconic aciduria type III | 3-methylglutaconic aciduria type III | 90.0 | 1.000000 | ... | 1.000000 | 0.029969 | 0.029969 | 0.918763 | 0.021299 | True | True | False | False | 8 |
755 | 4-hydroxyphenylacetic aciduria | 4-hydroxyphenylacetic aciduria | HP:0003607 | 4-Hydroxyphenylacetic aciduria | label | label | 4-hydroxyphenylacetic aciduria | 4-Hydroxyphenylacetic aciduria | 100.0 | 1.000000 | ... | 1.000000 | 0.028891 | 0.028891 | 0.930268 | 0.011949 | True | True | False | False | 2 |
3680 | 46 XX testicular disorder of sex development | 46 XX testicular disorder of sex development | MONDO:0010766 | 46,XX testicular disorder of sex development | label | label | 46 XX testicular disorder of sex development | 46,XX testicular disorder of sex development | 64.0 | 1.000000 | ... | 1.000000 | 0.198342 | 0.198342 | 0.377872 | 0.225444 | True | True | False | False | 6 |
3136 | 47 XXX syndrome | 47 XXX syndrome | MONDO:0018066 | trisomy X | label | hasExactSynonym | 47 XXX syndrome | 47,XXX syndrome | 58.0 | 1.000000 | ... | 1.000000 | 0.226493 | 0.185437 | 0.392394 | 0.195675 | True | True | False | False | 5 |
3166 | 47 XYY syndrome | 47 XYY syndrome | MONDO:0019339 | 47,XYY syndrome | label | label | 47 XYY syndrome | 47,XYY syndrome | 64.0 | 1.000000 | ... | 1.000000 | 0.226493 | 0.185437 | 0.392394 | 0.195675 | True | True | False | False | 5 |
4164 | 49 XXXXX syndrome | 49 XXXXX syndrome | MONDO:0015228 | pentasomy X | label | hasExactSynonym | 49 XXXXX syndrome | 49,XXXXX syndrome | 58.0 | 1.000000 | ... | 1.000000 | 0.205965 | 0.205965 | 0.392394 | 0.195675 | True | True | False | False | 5 |
4531 | 49 XXXXY syndrome | 49 XXXXY syndrome | MONDO:0019929 | 49,XXXXY syndrome | label | label | 49 XXXXY syndrome | 49,XXXXY syndrome | 64.0 | 1.000000 | ... | 1.000000 | 0.219001 | 0.179303 | 0.379414 | 0.222282 | True | True | False | False | 6 |
710 | 5-oxoprolinase deficiency | 5-oxoprolinase deficiency | MONDO:0009825 | 5-oxoprolinase deficiency (disease) | label | hasExactSynonym | 5-oxoprolinase deficiency | 5-oxoprolinase deficiency | 90.0 | 1.000000 | ... | 1.000000 | 0.030109 | 0.030109 | 0.923042 | 0.016740 | True | True | False | False | 7 |
709 | 5-oxoprolinase deficiency | 5-oxoprolinase deficiency | HP:0040142 | 5-oxoprolinase deficiency | label | label | 5-oxoprolinase deficiency | 5-oxoprolinase deficiency | 100.0 | 1.000000 | ... | 1.000000 | 0.028891 | 0.028891 | 0.930268 | 0.011949 | True | True | False | False | 7 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2067 | Wrinkly skin syndrome | Wrinkly skin syndrome | MONDO:0010208 | Wrinkly skin syndrome | label | label | Wrinkly skin syndrome | Wrinkly skin syndrome | 100.0 | 1.000000 | ... | 1.000000 | 0.028758 | 0.028758 | 0.925963 | 0.016522 | True | True | False | False | 7 |
2105 | X-linked adrenal hypoplasia congenita | X-linked adrenal hypoplasia congenita | MONDO:0010264 | X-linked adrenal hypoplasia congenita | label | label | X-linked adrenal hypoplasia congenita | X-linked adrenal hypoplasia congenita | 100.0 | 1.000000 | ... | 1.000000 | 0.028738 | 0.028738 | 0.925323 | 0.017201 | True | True | False | False | 7 |
1552 | X-linked hypohidrotic ectodermal dysplasia | X-linked hypohidrotic ectodermal dysplasia | MONDO:0010585 | X-linked hypohidrotic ectodermal dysplasia | label | label | X-linked hypohidrotic ectodermal dysplasia | X-linked hypohidrotic ectodermal dysplasia | 100.0 | 1.000000 | ... | 1.000000 | 0.028738 | 0.028738 | 0.925323 | 0.017201 | True | True | False | False | 4 |
3900 | X-linked ichthyosis | X-linked ichthyosis | MONDO:0010622 | recessive X-linked ichthyosis | label | hasExactSynonym | X-linked ichthyosis | X-linked ichthyosis | 90.0 | 1.000000 | ... | 1.000000 | 0.029886 | 0.029886 | 0.916224 | 0.024003 | True | True | False | False | 7 |
1968 | X-linked severe combined immunodeficiency | X-linked severe combined immunodeficiency | MONDO:0010315 | gamma chain deficiency | label | hasExactSynonym | X-linked severe combined immunodeficiency | X-Linked Severe Combined Immunodeficiency | 90.0 | 1.000000 | ... | 1.000000 | 0.029969 | 0.029969 | 0.918763 | 0.021299 | True | True | False | False | 8 |
2543 | XFE progeroid syndrome | XFE progeroid syndrome | MONDO:0012590 | XFE progeroid syndrome | label | label | XFE progeroid syndrome | XFE progeroid syndrome | 100.0 | 1.000000 | ... | 1.000000 | 0.028891 | 0.028891 | 0.930268 | 0.011949 | True | True | False | False | 7 |
3037 | XK aprosencephaly | XK aprosencephaly | MONDO:0008811 | XK aprosencephaly | label | label | XK aprosencephaly | XK aprosencephaly | 100.0 | 1.000000 | ... | 1.000000 | 0.028891 | 0.028891 | 0.930268 | 0.011949 | True | True | False | False | 8 |
2070 | Xanthinuria type 1 | Xanthinuria type 1 | MONDO:0010209 | xanthinuria type I | label | label | Xanthinuria type 1 | xanthinuria type I | 64.0 | 1.000000 | ... | 1.000000 | 0.205965 | 0.205965 | 0.392394 | 0.195675 | True | True | False | False | 5 |
2414 | Xanthinuria type 2 | Xanthinuria type 2 | MONDO:0011346 | xanthinuria type II | label | label | Xanthinuria type 2 | xanthinuria type II | 64.0 | 1.000000 | ... | 1.000000 | 0.205965 | 0.205965 | 0.392394 | 0.195675 | True | True | False | False | 6 |
1509 | Xanthogranulomatous cholecystitis | Xanthogranulomatous cholecystitis | MONDO:0004875 | xanthogranulomatous cholecystitis | label | label | Xanthogranulomatous cholecystitis | xanthogranulomatous cholecystitis | 100.0 | 1.000000 | ... | 1.000000 | 0.028795 | 0.028795 | 0.927169 | 0.015241 | True | True | False | False | 8 |
2867 | Xeroderma pigmentosum | Xeroderma pigmentosum | MONDO:0019600 | xeroderma pigmentosum | label | label | Xeroderma pigmentosum | xeroderma pigmentosum | 100.0 | 1.000000 | ... | 1.000000 | 0.028758 | 0.028758 | 0.925963 | 0.016522 | True | True | False | False | 8 |
2077 | Xeroderma pigmentosum variant type | Xeroderma pigmentosum variant type | MONDO:0010214 | xeroderma pigmentosum variant type | label | label | Xeroderma pigmentosum variant type | xeroderma pigmentosum variant type | 100.0 | 1.000000 | ... | 1.000000 | 0.028758 | 0.028758 | 0.925963 | 0.016522 | True | True | False | False | 8 |
3151 | Yaws | Yaws | MONDO:0006019 | yaws | label | label | Yaws | yaws | 100.0 | 1.000000 | ... | 1.000000 | 0.051830 | 0.051830 | 0.874531 | 0.021809 | True | True | False | False | 10 |
3080 | Yellow fever | Yellow fever | MONDO:0020502 | yellow fever | label | label | Yellow fever | yellow fever | 100.0 | 1.000000 | ... | 1.000000 | 0.028891 | 0.028891 | 0.930268 | 0.011949 | True | True | False | False | 8 |
4539 | Yellow nail syndrome | Yellow nail syndrome | MONDO:0007921 | yellow nail syndrome | label | label | Yellow nail syndrome | yellow nail syndrome | 100.0 | 1.000000 | ... | 1.000000 | 0.028891 | 0.028891 | 0.930268 | 0.011949 | True | True | False | False | 9 |
2555 | Yemenite deaf-blind hypopigmentation syndrome | Yemenite deaf-blind hypopigmentation syndrome | MONDO:0011133 | Deaf blind hypopigmentation syndrome, Yemenite... | label | hasExactSynonym | Yemenite deaf-blind hypopigmentation syndrome | Yemenite deaf-blind hypopigmentation syndrome | 90.0 | 1.000000 | ... | 1.000000 | 0.030109 | 0.030109 | 0.923042 | 0.016740 | True | True | False | False | 6 |
4262 | Yolk sac tumor | Yolk sac tumor | MONDO:0005744 | yolk sac tumor | label | label | Yolk sac tumor | yolk sac tumor | 100.0 | 1.000000 | ... | 1.000000 | 0.028758 | 0.028758 | 0.925963 | 0.016522 | True | True | False | False | 7 |
3775 | Yorifuji Okuno syndrome | Yorifuji Okuno syndrome | MONDO:0010802 | pancreatic hypoplasia-diabetes-congenital hear... | label | hasExactSynonym | Yorifuji Okuno syndrome | Yorifuji-Okuno syndrome | 58.0 | 1.000000 | ... | 1.000000 | 0.205965 | 0.205965 | 0.392394 | 0.195675 | True | True | False | False | 5 |
4330 | Young Hughes syndrome | Young Hughes syndrome | MONDO:0017614 | X-linked intellectual disability-hypogonadism-... | label | hasExactSynonym | Young Hughes syndrome | Young-Hughes syndrome | 58.0 | 1.000000 | ... | 1.000000 | 0.200803 | 0.200803 | 0.382559 | 0.215835 | True | True | False | False | 4 |
2384 | Young Simpson syndrome | Young Simpson syndrome | MONDO:0011365 | blepharophimosis-intellectual disability syndr... | label | hasRelatedSynonym | Young Simpson syndrome | Young-Simpson Syndrome | 32.0 | 1.000000 | ... | 1.000000 | 0.200803 | 0.200803 | 0.382559 | 0.215835 | True | True | False | False | 7 |
2059 | Young syndrome | Young syndrome | MONDO:0010220 | young syndrome | label | label | Young syndrome | young syndrome | 100.0 | 1.000000 | ... | 1.000000 | 0.028891 | 0.028891 | 0.930268 | 0.011949 | True | True | False | False | 7 |
2892 | Yunis Varon syndrome | Yunis Varon syndrome | MONDO:0008995 | Yunis-Varon syndrome | label | label | Yunis Varon syndrome | Yunis-Varon syndrome | 64.0 | 1.000000 | ... | 1.000000 | 0.062922 | 0.062922 | 0.817066 | 0.057090 | True | True | False | False | 7 |
1473 | Zechi Ceide syndrome | Zechi Ceide syndrome | MONDO:0013036 | Zechi-Ceide syndrome | label | label | Zechi Ceide syndrome | Zechi-Ceide syndrome | 64.0 | 1.000000 | ... | 1.000000 | 0.205965 | 0.205965 | 0.392394 | 0.195675 | True | True | False | False | 6 |
2866 | Zellweger syndrome | Zellweger syndrome | MONDO:0019609 | Zellweger syndrome | label | label | Zellweger syndrome | Zellweger syndrome | 100.0 | 1.000000 | ... | 1.000000 | 0.028758 | 0.028758 | 0.925963 | 0.016522 | True | True | False | False | 6 |
657 | Zollinger-Ellison syndrome | Zollinger-Ellison syndrome | MONDO:0006020 | Zollinger-Ellison syndrome (disease) | label | hasExactSynonym | Zollinger-Ellison syndrome | Zollinger-Ellison Syndrome | 90.0 | 1.000000 | ... | 0.473684 | 0.075251 | 0.062185 | 0.839062 | 0.023503 | True | True | False | False | 11 |
655 | Zollinger-Ellison syndrome | Zollinger-Ellison syndrome | HP:0002044 | Zollinger-Ellison syndrome | label | label | Zollinger-Ellison syndrome | Zollinger-Ellison syndrome | 100.0 | 1.000000 | ... | 1.000000 | 0.028891 | 0.028891 | 0.930268 | 0.011949 | True | True | False | False | 11 |
656 | Zollinger-Ellison syndrome | Zollinger-Ellison syndrome | MONDO:0019610 | Zollinger-Ellison syndrome | label | label | Zollinger-Ellison syndrome | Zollinger-Ellison syndrome | 100.0 | 1.000000 | ... | 0.526316 | 0.055295 | 0.045694 | 0.882570 | 0.016441 | True | True | False | False | 11 |
3377 | Zori Stalker Williams syndrome | Zori Stalker Williams syndrome | MONDO:0010883 | pectus excavatum-macrocephaly-dysplastic nails... | label | hasExactSynonym | Zori Stalker Williams syndrome | Zori-Stalker-Williams syndrome | 58.0 | 1.000000 | ... | 1.000000 | 0.205965 | 0.205965 | 0.392394 | 0.195675 | True | True | False | False | 5 |
2061 | Zunich neuroectodermal syndrome | Zunich neuroectodermal syndrome | MONDO:0010221 | CHIME syndrome | label | hasRelatedSynonym | Zunich neuroectodermal syndrome | Zunich Neuroectodermal Syndrome | 50.0 | 1.000000 | ... | 1.000000 | 0.061951 | 0.061951 | 0.804454 | 0.071645 | True | True | False | False | 6 |
3649 | Zygomycosis | Zygomycosis | MONDO:0019136 | zygomycosis | label | label | Zygomycosis | zygomycosis | 100.0 | 1.000000 | ... | 1.000000 | 0.051830 | 0.051830 | 0.874531 | 0.021809 | True | True | False | False | 9 |
4558 rows × 22 columns
## write to file (not used here but can be examined separately)
tsv_options = {'sep': "\t", 'index': False}
df.to_csv('rare-matches.tsv', **tsv_options)

## unmapped (TODO this includes unmapped from MONDO/HP to R, which we don't care about so much)
udf = lexmap.unmapped_dataframe(g)
udf.to_csv('rare-no-matches.tsv', **tsv_options)
udf
id | label | mapped_equivs | |
---|---|---|---|
18057 | 16p11.2 deletion syndrome | 16p11.2 deletion syndrome | |
105646 | 2-Methylacetoacetyl CoA thiolase deficiency | 2-Methylacetoacetyl CoA thiolase deficiency | |
41905 | 2-hydroxyethyl methacrylate sensitization | 2-hydroxyethyl methacrylate sensitization | |
29133 | 22q11.2 duplication syndrome | 22q11.2 duplication syndrome | |
100428 | 22q13.3 deletion syndrome | 22q13.3 deletion syndrome | |
96122 | 2q37 deletion syndrome | 2q37 deletion syndrome | |
88482 | 3 Methylcrotonyl-CoA carboxylase 1 deficiency | 3 Methylcrotonyl-CoA carboxylase 1 deficiency | |
34501 | 3 alpha methylcrotonyl-CoA carboxylase 2 defic... | 3 alpha methylcrotonyl-CoA carboxylase 2 defic... | |
85670 | 3-alpha hydroxyacyl-CoA dehydrogenase deficiency | 3-alpha hydroxyacyl-CoA dehydrogenase deficiency | |
77929 | 3p deletion syndrome | 3p deletion syndrome | |
95095 | 46 XX Gonadal dysgenesis epibulbar dermoid | 46 XX Gonadal dysgenesis epibulbar dermoid | |
90032 | 5-Nucleotidase syndrome | 5-Nucleotidase syndrome | |
74374 | 6 alpha mercaptopurine sensitivity | 6 alpha mercaptopurine sensitivity | |
51486 | ACTH-independent macronodular adrenal hyperplasia | ACTH-independent macronodular adrenal hyperplasia | |
26334 | AIDS Dementia Complex | AIDS Dementia Complex | |
12881 | AIDS dysmorphic syndrome | AIDS dysmorphic syndrome | |
77859 | ALK+ histiocytosis | ALK+ histiocytosis | |
26275 | ALS-like syndrome of encephalomyopathy | ALS-like syndrome of encephalomyopathy | |
60831 | Abderhalden Kaufmann Lignac syndrome | Abderhalden Kaufmann Lignac syndrome | |
10975 | Abdominal chemodectomas with cutaneous angioli... | Abdominal chemodectomas with cutaneous angioli... | |
108580 | Abdominal cystic lymphangioma | Abdominal cystic lymphangioma | |
94496 | Aberrant subclavian artery | Aberrant subclavian artery | |
93116 | Abidi X-linked mental retardation syndrome | Abidi X-linked mental retardation syndrome | |
40555 | Absence of fingerprints congenital milia | Absence of fingerprints congenital milia | |
7562 | Absence of gluteal muscle | Absence of gluteal muscle | |
6259 | Absence of tibia with polydactyly | Absence of tibia with polydactyly | |
75137 | Absent T lymphocytes | Absent T lymphocytes | |
49480 | Absent breasts and nipples | Absent breasts and nipples | |
56640 | Abuse dwarfism syndrome | Abuse dwarfism syndrome | |
8138 | Acanthamoeba infection | Acanthamoeba infection | |
... | ... | ... | ... |
25120 | http://www.orpha.net/ORDO/Orphanet_99948 | None | [MONDO:0008961] |
21037 | http://www.orpha.net/ORDO/Orphanet_99949 | None | [MONDO:0011113] |
17454 | http://www.orpha.net/ORDO/Orphanet_99950 | None | [MONDO:0011085] |
20779 | http://www.orpha.net/ORDO/Orphanet_99951 | None | [MONDO:0011527] |
70495 | http://www.orpha.net/ORDO/Orphanet_99952 | None | [] |
94070 | http://www.orpha.net/ORDO/Orphanet_99953 | None | [MONDO:0011534] |
82089 | http://www.orpha.net/ORDO/Orphanet_99954 | None | [] |
106842 | http://www.orpha.net/ORDO/Orphanet_99955 | None | [MONDO:0011066] |
99610 | http://www.orpha.net/ORDO/Orphanet_99956 | None | [MONDO:0011475] |
10190 | http://www.orpha.net/ORDO/Orphanet_99960 | None | [] |
100192 | http://www.orpha.net/ORDO/Orphanet_99961 | None | [] |
30752 | http://www.orpha.net/ORDO/Orphanet_99965 | None | [] |
51288 | http://www.orpha.net/ORDO/Orphanet_99966 | None | [] |
24502 | http://www.orpha.net/ORDO/Orphanet_99967 | None | [] |
102919 | http://www.orpha.net/ORDO/Orphanet_99969 | None | [] |
79112 | http://www.orpha.net/ORDO/Orphanet_99970 | None | [] |
17426 | http://www.orpha.net/ORDO/Orphanet_99971 | None | [] |
15059 | http://www.orpha.net/ORDO/Orphanet_99976 | None | [] |
65846 | http://www.orpha.net/ORDO/Orphanet_99977 | None | [] |
6381 | http://www.orpha.net/ORDO/Orphanet_99978 | None | [MONDO:0003345] |
3759 | http://www.orpha.net/ORDO/Orphanet_99981 | None | [] |
3847 | http://www.orpha.net/ORDO/Orphanet_99983 | None | [] |
85668 | http://www.orpha.net/ORDO/Orphanet_99989 | None | [] |
4382 | http://www.orpha.net/ORDO/Orphanet_99990 | None | [] |
45866 | http://www.orpha.net/ORDO/Orphanet_99991 | None | [] |
21041 | http://www.orpha.net/ORDO/Orphanet_99994 | None | [] |
65561 | http://www.orpha.net/ORDO/Orphanet_99995 | None | [] |
43256 | http://www.w3.org/2000/01/rdf-schema#seeAlso | seeAlso | |
21673 | http://www.w3.org/2002/07/owl#Thing | None | |
15400 | http://www.w3.org/2002/07/owl#topObjectProperty | None |
110240 rows × 3 columns