#!/usr/bin/env python
# coding: utf-8

# In[1]:


import logging
import pathlib

from IPython.display import display
import pandas as pd
import rdflib

ENGINE = 'postgresql://postgres@/glottolog3'

# Extract the Wikidata QID and English Wikipedia title from the links
# stored in each languoid's jsondata JSON blob.
QUERY = '''
SELECT
    l.id AS glottocode,
    l.name,
    ll.level,
    ll.category,
    substring(wikidata_link->>'url' FROM '/([^/]+)$') AS qid,
    substring(wikipedia_link->>'url' FROM '/([^/]+)$') AS title,
    i.name AS iso639_3
FROM language AS l
JOIN languoid AS ll USING (pk)
CROSS JOIN jsonb_path_query(l.jsondata::jsonb,
    '$.links[*] ? (@.url starts with "https://www.wikidata.org/entity/")'
    ) AS wikidata_link
LEFT JOIN jsonb_path_query(l.jsondata::jsonb,
    '$.links[*] ? (@.url starts with "https://en.wikipedia.org/wiki/")'
    ) AS wikipedia_link ON TRUE
LEFT JOIN (languageidentifier AS li
           JOIN identifier AS i
           ON li.identifier_pk = i.pk AND i.type = 'iso639-3')
    ON li.language_pk = l.pk
ORDER BY l.id
'''.strip()

logging.basicConfig(format='[%(levelname)s@%(name)s] %(message)s',
                    level=logging.INFO)
logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)

gf = pd.read_sql_query(QUERY, ENGINE, index_col='glottocode', dtype='string')

gf.info(memory_usage='deep')
assert gf.index.is_unique
assert gf.index.is_monotonic_increasing
gf.head()


# In[2]:


ENDPOINT = 'https://query.wikidata.org/sparql'


# Wikidata serves schema.org terms under http://, not https://.
class SCHEMA(rdflib.SDO):
    """https://github.com/RDFLib/rdflib/issues/1120"""

    _NS = rdflib.Namespace(rdflib.SDO._NS.replace('https://', 'http://'))


PREFIXES = {'schema': SCHEMA}

# Map Wikidata items to Glottocodes (P1394) and, where available,
# their English Wikipedia sitelink.
SPARQL_QUERY = '''
SELECT
  ?glottocode
  (strafter(str(?languoid), str(wd:)) AS ?qid)
  (?languoidLabel AS ?name)
  (strafter(str(?siteLink), "https://en.wikipedia.org/wiki/") AS ?title)
WHERE {
  ?languoid wdt:P1394 ?glottocode.
  FILTER (REGEX(?glottocode, "^[a-z0-9]{4}[0-9]{4}$")).
  OPTIONAL {
    ?siteLink schema:about ?languoid;
              schema:inLanguage "en";
              schema:isPartOf <https://en.wikipedia.org/>.
  }
  SERVICE wikibase:label {
    bd:serviceParam wikibase:language "en".
    ?languoid rdfs:label ?languoidLabel.
  }
}
ORDER BY
  ?glottocode
  xsd:integer(strafter(str(?languoid), str(wd:Q)))
'''.strip()

CSV_PATH = pathlib.Path('wikidata.csv')


def open_sparql_graph(endpoint=ENDPOINT, *, prefixes=PREFIXES):
    logging.info('endpoint: %r', endpoint)
    graph = rdflib.ConjunctiveGraph('SPARQLStore')
    graph.open(endpoint)
    logging.info('graph: %s', graph)
    logging.info('prefixes: %r', prefixes)
    for prefix, namespace in prefixes.items():
        graph.namespace_manager.bind(prefix, namespace, replace=True)
    logging.debug('namespaces: %r', list(graph.namespaces()))
    return graph


def iterrows(query, *, prefixes=PREFIXES,
             limit=None, per_request: int = 100_000):
    """Yield a header row, then result rows, paging with OFFSET/LIMIT."""
    if limit is None:
        limit = float('inf')

    graph = open_sparql_graph(prefixes=prefixes)

    offset = 0
    while offset < limit:
        request_limit = min(limit - offset, per_request)
        request_query = (f'{query}\n'
                         f'OFFSET {offset:d}\n'
                         f'LIMIT {request_limit:d}')
        logging.info("graph.query('''\n%s\n''')", request_query)
        result = graph.query(request_query)
        if not offset:  # emit the header once, from the first page
            yield tuple(v.toPython().removeprefix('?') for v in result.vars)
        n = 0  # so an empty first page terminates instead of raising NameError
        for n, values in enumerate(result, 1):
            yield tuple(v.toPython() if v is not None else None for v in values)
        if n < request_limit:  # short page: no further results
            return
        offset += n


def read_sparql_query(query, *, limit=None, **kwargs):
    rows = iterrows(query, limit=limit)
    columns = list(next(rows))
    return pd.DataFrame.from_records(rows, columns=columns, **kwargs)


def read_glottocodes(*, path=CSV_PATH, encoding='utf-8'):
    """Load the mapping from CSV, downloading it first if the cache is absent."""
    if not path.exists():
        df = read_sparql_query(SPARQL_QUERY, index='glottocode').astype('string')
        df.to_csv(path, encoding=encoding)
    return pd.read_csv(path, index_col='glottocode',
                       encoding=encoding).astype('string')


wf = read_glottocodes()

wf.info(memory_usage='deep')
assert wf.index.is_monotonic_increasing
wf.head(10)
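
# In[ ]:


# Optional sanity check: fetch just a handful of rows to verify the
# OFFSET/LIMIT pagination behind read_sparql_query() before relying on the
# full cached download (the limit of 5 is an arbitrary illustrative choice;
# this queries the live Wikidata endpoint).
sample = read_sparql_query(SPARQL_QUERY, limit=5, index='glottocode')
sample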

# In[3]:


KEEP_ALL = False  # keep=False marks every occurrence of a duplicated index value

wf[wf.index.duplicated(keep=KEEP_ALL)]


# In[4]:


df = (gf.join(wf, on='glottocode', rsuffix='_wd')
        .drop(['category', 'name_wd', 'iso639_3'], axis='columns'))

df.info(memory_usage='deep')
df.head()


# In[5]:


df[df.index.duplicated(keep=KEEP_ALL)]


# In[6]:


mismatch = df.loc[df['qid'] != df['qid_wd'],
                  ['name', 'level', 'title', 'title_wd', 'qid', 'qid_wd']]

mismatch.info(memory_usage='deep')
mismatch[mismatch.index.duplicated(keep=KEEP_ALL)]


# In[7]:


with pd.option_context('display.max_rows', 150):
    display(mismatch)
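
# In[ ]:


# Rough summary of the comparison: the share of rows where both sides name
# the same QID.  Rows with a missing qid on either side compare as NA and
# are not counted as matches; duplicated glottocodes are counted as-is.
n_match = (df['qid'] == df['qid_wd']).sum()
print(f'matching qid: {n_match} of {len(df)} rows ({n_match / len(df):.1%})')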