#!/usr/bin/env python
# coding: utf-8

# In[1]:


import logging
import pathlib

from IPython.display import display
import pandas as pd
import rdflib

ENGINE = 'postgresql://postgres@/glottolog3'

# Extract the Wikidata QID and English Wikipedia title from the links
# stored in each languoid's jsondata JSON blob.
QUERY = '''
SELECT
    l.id AS glottocode,
    l.name,
    ll.level,
    ll.category,
    substring(wikidata_link->>'url' FROM '/([^/]+)$') AS qid,
    substring(wikipedia_link->>'url' FROM '/([^/]+)$') AS title,
    i.name AS iso639_3
FROM language AS l
JOIN languoid AS ll USING (pk)
CROSS JOIN jsonb_path_query(l.jsondata::jsonb,
    '$.links[*] ? (@.url starts with "https://www.wikidata.org/entity/")'
    ) AS wikidata_link
LEFT JOIN jsonb_path_query(l.jsondata::jsonb,
    '$.links[*] ? (@.url starts with "https://en.wikipedia.org/wiki/")'
    ) AS wikipedia_link ON TRUE
LEFT JOIN (languageidentifier AS li
           JOIN identifier AS i
           ON li.identifier_pk = i.pk AND i.type = 'iso639-3')
    ON li.language_pk = l.pk
ORDER BY l.id
'''.strip()

logging.basicConfig(format='[%(levelname)s@%(name)s] %(message)s',
                    level=logging.INFO)
logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)

gf = pd.read_sql_query(QUERY, ENGINE, index_col='glottocode', dtype='string')

gf.info(memory_usage='deep')
assert gf.index.is_unique
assert gf.index.is_monotonic_increasing
gf.head()


# In[2]:


ENDPOINT = 'https://query.wikidata.org/sparql'


# Wikidata serves schema.org terms under http://, not https://.
class SCHEMA(rdflib.SDO):
    """https://github.com/RDFLib/rdflib/issues/1120"""

    _NS = rdflib.Namespace(rdflib.SDO._NS.replace('https://', 'http://'))


PREFIXES = {'schema': SCHEMA}

# Map Wikidata items to Glottocodes (P1394) and, where available,
# their English Wikipedia sitelink.
SPARQL_QUERY = '''
SELECT
  ?glottocode
  (strafter(str(?languoid), str(wd:)) AS ?qid)
  (?languoidLabel AS ?name)
  (strafter(str(?siteLink), "https://en.wikipedia.org/wiki/") AS ?title)
WHERE {
  ?languoid wdt:P1394 ?glottocode.
  FILTER (REGEX(?glottocode, "^[a-z0-9]{4}[0-9]{4}$")).
  OPTIONAL {
    ?siteLink schema:about ?languoid;
              schema:inLanguage "en";
              schema:isPartOf <https://en.wikipedia.org/>.
  }
  SERVICE wikibase:label {
    bd:serviceParam wikibase:language "en".
    ?languoid rdfs:label ?languoidLabel.
  }
}
ORDER BY
  ?glottocode
  xsd:integer(strafter(str(?languoid), str(wd:Q)))
'''.strip()

CSV_PATH = pathlib.Path('wikidata.csv')


def open_sparql_graph(endpoint=ENDPOINT, *, prefixes=PREFIXES):
    logging.info('endpoint: %r', endpoint)
    graph = rdflib.ConjunctiveGraph('SPARQLStore')
    graph.open(endpoint)
    logging.info('graph: %s', graph)
    logging.info('prefixes: %r', prefixes)
    for prefix, namespace in prefixes.items():
        graph.namespace_manager.bind(prefix, namespace, replace=True)
    logging.debug('namespaces: %r', list(graph.namespaces()))
    return graph


def iterrows(query, *, prefixes=PREFIXES,
             limit=None, per_request: int = 100_000):
    """Yield a header row, then result rows, paging with OFFSET/LIMIT."""
    if limit is None:
        limit = float('inf')

    graph = open_sparql_graph(prefixes=prefixes)

    offset = 0
    while offset < limit:
        request_limit = min(limit - offset, per_request)
        request_query = (f'{query}\n'
                         f'OFFSET {offset:d}\n'
                         f'LIMIT {request_limit:d}')
        logging.info("graph.query('''\n%s\n''')", request_query)
        result = graph.query(request_query)
        if not offset:  # emit the header once, from the first page
            yield tuple(v.toPython().removeprefix('?') for v in result.vars)
        n = 0  # so an empty first page terminates instead of raising NameError
        for n, values in enumerate(result, 1):
            yield tuple(v.toPython() if v is not None else None for v in values)
        if n < request_limit:  # short page: no further results
            return
        offset += n


def read_sparql_query(query, *, limit=None, **kwargs):
    rows = iterrows(query, limit=limit)
    columns = list(next(rows))
    return pd.DataFrame.from_records(rows, columns=columns, **kwargs)


def read_glottocodes(*, path=CSV_PATH, encoding='utf-8'):
    """Load the mapping from CSV, downloading it first if the cache is absent."""
    if not path.exists():
        df = read_sparql_query(SPARQL_QUERY, index='glottocode').astype('string')
        df.to_csv(path, encoding=encoding)
    return pd.read_csv(path, index_col='glottocode',
                       encoding=encoding).astype('string')


wf = read_glottocodes()

wf.info(memory_usage='deep')
assert wf.index.is_monotonic_increasing
wf.head(10)
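
# In[ ]:


# Optional sanity check: fetch just a handful of rows to verify the
# OFFSET/LIMIT pagination behind read_sparql_query() before relying on the
# full cached download (the limit of 5 is an arbitrary illustrative choice;
# this queries the live Wikidata endpoint).
sample = read_sparql_query(SPARQL_QUERY, limit=5, index='glottocode')
sample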

# In[3]:


KEEP_ALL = False  # keep=False marks every occurrence of a duplicated index value

wf[wf.index.duplicated(keep=KEEP_ALL)]


# In[4]:


df = (gf.join(wf, on='glottocode', rsuffix='_wd')
        .drop(['category', 'name_wd', 'iso639_3'], axis='columns'))

df.info(memory_usage='deep')
df.head()


# In[5]:


df[df.index.duplicated(keep=KEEP_ALL)]


# In[6]:


mismatch = df.loc[df['qid'] != df['qid_wd'],
                  ['name', 'level', 'title', 'title_wd', 'qid', 'qid_wd']]

mismatch.info(memory_usage='deep')
mismatch[mismatch.index.duplicated(keep=KEEP_ALL)]


# In[7]:


with pd.option_context('display.max_rows', 150):
    display(mismatch)
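
# In[ ]:


# Rough summary of the comparison: the share of rows where both sides name
# the same QID.  Rows with a missing qid on either side compare as NA and
# are not counted as matches; duplicated glottocodes are counted as-is.
n_match = (df['qid'] == df['qid_wd']).sum()
print(f'matching qid: {n_match} of {len(df)} rows ({n_match / len(df):.1%})')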