#!/usr/bin/env python
# coding: utf-8

# # Exploring *Glottolog* with Python
#
# Sebastian Bank (sebastian.bank@uni-leipzig.de)
# http://www.uni-leipzig.de/~sbank/
#
# The latest version of this IPython Notebook is available at
# http://gist.github.com/xflr6/9050337.
#
# Glottolog (http://glottolog.org) provides its comprehensive catalog of the
# world's languages, language families and dialects for download
# (http://glottolog.org/meta/downloads) in linked data format.
#
# Tools used in this notebook:
#   * Python 2.7
#   * rdflib (http://github.com/RDFLib/rdflib)
#   * sqlite3 (included with Python)
#   * pandas (using matplotlib for visualization)
#
# The Anaconda Python Distribution is probably the fastest way to get Python
# installed with the commonly used scientific packages on any platform; on
# Windows there are also Unofficial Windows Binaries for many extension
# packages (http://www.lfd.uci.edu/~gohlke/pythonlibs/).

# ## Getting the file
#
# Download the RDF export file with Python's built-in urllib module.

# In[1]:

import urllib

URL = 'http://glottolog.org/static/download/2.7/glottolog-language.n3.gz'

# Save under the remote basename; urlretrieve returns the local filename
# and the response headers.
filename, headers = urllib.urlretrieve(URL, URL.rpartition('/')[2])

# The file contains RDF in Notation3 compressed with gzip.

# In[2]:

print(filename)

# Display the size in megabytes.
# In[3]:

size = int(headers['Content-Length'])
print(size / 1024.0 ** 2)

# ## A first look
#
# Read the first few bytes from the file with gzip so we can get an
# impression of the format.

# In[4]:

import gzip

with gzip.open(filename) as fd:
    sample = fd.read(4000)

# The namespace definitions and the actual RDF triples are separated by a
# blank line.

# In[5]:

head, _, body = sample.partition('\n\n')
len(head), len(body)

# Inspect the start of the namespaces.

# In[6]:

print(head[:600] + '...')

# Glottolog uses well-known ontologies and some dedicated to linguistics,
# like Lexvo (http://www.lexvo.org/) and GOLD (http://linguistics-ontology.org/).
# Display the first RDF triples.

# In[7]:

print(body[:600] + '...')

# Each entry starts with the full URI of the languoid, followed by its
# types, label, ISO 639-3 code and description.

# ## Using text processing
#
# Load the whole file uncompressed into memory.

# In[8]:

with gzip.open(filename) as fd:
    data = fd.read()

# Display the size in megabytes.

# In[9]:

print(len(data) / 1024.0 ** 2)

# Extract the glottocode from the start of all dcterms:LinguisticSystem
# entries with the re module and count them.

# In[10]:

import re

# NOTE(review): the pattern must CAPTURE the id segment of the subject URI
# (e.g. <http://glottolog.org/resource/languoid/id/aala1237> a
# dcterms:LinguisticSystem) — without a capture group, findall() would
# return identical constant strings and the later g[:4]/g[4:] slicing of
# gcodes would be meaningless. The angle-bracketed URI prefix was
# apparently lost in transit; reconstructed here.
GLOTTOCODE = r'<http://glottolog\.org/resource/languoid/id/(\w+)> a dcterms:LinguisticSystem'

gcodes = re.findall(GLOTTOCODE, data)
len(gcodes)

# Display the glottocodes of the first five entries.

# In[11]:

gcodes[:5]

# Looks unordered; sort them alphabetically and display the first and last
# five entries.

# In[12]:

gcodes.sort()
print(gcodes[:5])
print(gcodes[-5:])

# Extract everything that looks like an ISO code. Count the results.

# In[13]:

ISO_CODE = r'iso639P3PCode "(\w+)"'

icodes = re.findall(ISO_CODE, data)
len(icodes)

# Display the first ten ISO codes.
# In[14]:

icodes[:10]

# Sort them as well and display the start and end.

# In[15]:

icodes.sort()
print(icodes[:10])
print(icodes[-10:])

# ## Glottocodes
#
# Glottocodes consist of four letters and some apparently recurring digit
# combinations. Display the five most common of those digits and their
# frequency with collections.Counter.

# In[16]:

import collections

collections.Counter(g[4:] for g in gcodes).most_common(5)

# Show the most common initial parts.

# In[17]:

collections.Counter(g[:4] for g in gcodes).most_common(5)

# ## Loading into RDFlib
#
# Load the whole graph into memory. This will take a while and fill a
# couple hundred megabytes of RAM.

# In[18]:

import rdflib

graph = rdflib.Graph()
with gzip.open(filename) as fd:
    graph.parse(fd, format='n3')
graph

# Count the number of triples.

# In[19]:

len(graph)

# ## Using the RDF graph
#
# Display some of the triples (subject, predicate, object).
# The subject URI is sliced past the common prefix (43 chars).

# In[20]:

import itertools

for s, p, o in itertools.islice(graph, 15):
    print('%s %s %s' % (s[42:], graph.qname(p), o))

# Show all available predicates.

# In[21]:

for p in sorted(set(graph.predicates())):
    print(graph.qname(p))

# Create shortcuts for querying glottocodes and ISO codes. Translate
# glottocodes into ISO codes.

# In[22]:

glottocode = rdflib.Namespace('http://glottolog.org/resource/languoid/id/')
lexvo = rdflib.Namespace('http://lexvo.org/ontology#')
iso639 = lexvo.iso639P3PCode

for g in gcodes[:5]:
    i = graph.value(glottocode[g], iso639, default='___')
    print('%s -> %s,' % (g, i)),

# Translate ISO codes into glottocodes.

# In[23]:

string = rdflib.namespace.XSD.string

for i in icodes[:5]:
    g = graph.value(None, iso639, rdflib.Literal(i, datatype=string))
    print('%s -> %s,' % (i, g[42:])),

# Retrieve the preferred label of languoids.
# In[24]:

label = rdflib.namespace.RDFS.label

for g in gcodes[:5]:
    l = graph.value(glottocode[g], label)
    print('%s -> %s,' % (g, l)),

# Lookup an arbitrary languoid with a given label.

# In[25]:

print(graph.value(None, label, rdflib.Literal('Aalawa', lang='en')))

# Show the predicates and objects of an individual languoid.

# In[26]:

for p, o in graph[glottocode['aala1237']]:
    print('%s %s' % (graph.qname(p), o))

# Display the nodes along a languoid's path up the tree.

# In[27]:

broader = rdflib.namespace.SKOS.broader

aalawa = graph.resource(glottocode['aala1237'])
print(' -> '.join(b.label() for b in aalawa.transitive_objects(broader)))

# Display the nodes immediately below a languoid.

# In[28]:

narrower = rdflib.namespace.SKOS.narrower

atlaco = graph.resource(glottocode['atla1278'])
print('%s <- %s' % (atlaco.label(),
                    ', '.join(n.label() for n in atlaco.objects(narrower))))

# Count all nodes below a languoid.

# In[29]:

len(list(atlaco.transitive_objects(narrower)))

# ## Querying with SPARQL
#
# Retrieve rows of glottocode, ISO code, and label with SPARQL. Also
# display the annotated language of the label.

# In[30]:

GIL = """
SELECT (substr(str(?s), 43) AS ?glottocode) ?iso ?label WHERE {
  ?s a dcterms:LinguisticSystem ;
     skos:prefLabel ?label
  OPTIONAL { ?s lexvo:iso639P3PCode ?iso }
} ORDER BY ?s LIMIT 10"""

for g, i, l in graph.query(GIL):
    print('%s | %-4s | %-10s | %s' % (g, i, l, l.language))

# Display the result as CSV (json and xml formats are also supported).

# In[31]:

print(graph.query(GIL).serialize(format='csv'))

# Determine the language families with the most child languages.

# In[32]:

FAMILIES = """
SELECT ?label (count(*) as ?n) WHERE {
  ?s a gold:LanguageFamily ;
     rdfs:label ?label ;
     skos:narrower+/a gold:Language
} GROUP BY ?s ORDER BY desc(?n) LIMIT 10"""

for f, n in graph.query(FAMILIES):
    print('%s\t%s' % (f, n))

# Display the immediate children for some families.
# In[33]:

CHILDREN = """
SELECT ?label (group_concat(?o; separator=", ") as ?children) WHERE {
  ?s a gold:LanguageFamily ;
     rdfs:label ?label ;
     skos:narrower/rdfs:label ?o
} GROUP BY ?s ORDER BY ?label OFFSET 10 LIMIT 5"""

for f, c in graph.query(CHILDREN):
    print('%s <- %s' % (f, c))

# Do the same for a specific languoid.

# In[34]:

# NOTE(review): as transmitted, this query read "BASE SELECT ..." with a
# subject-less triple pattern, which is not valid SPARQL — a BASE clause
# requires an IRI and the group pattern needs a subject. The angle-bracketed
# IRIs were apparently stripped in transit; reconstructed against the
# languoid namespace. TODO: confirm the intended subject languoid.
for l, c in graph.query("""BASE <http://glottolog.org/resource/languoid/id/>
SELECT ?label (group_concat(?o; separator=", ") as ?children) WHERE {
  <atla1278> rdfs:label ?label ;
             skos:narrower/rdfs:label ?o
}"""):
    print('%s <- %s' % (l, c))

# A SPARQL query that retrieves most of the functional properties of the
# languoids (one row per languoid).

# In[35]:

LANGUOIDS = """
SELECT (substr(str(?s), 43) AS ?id)
       ?label
       (substr(str(?type), 34) AS ?level)
       (substr(str(?broader), 43) AS ?parent)
       (if(bound(?change_note), 1, 0) AS ?obsolete)
       ?status ?iso639 ?latitude ?longitude
WHERE {
  ?s a dcterms:LinguisticSystem ;
     skos:prefLabel ?label .
  ?s a ?type FILTER (strstarts(str(?type), "http://purl.org/linguistics/gold/"))
  OPTIONAL { ?s skos:broader ?broader }
  OPTIONAL { ?s skos:changeNote ?change_note FILTER (?change_note = "obsolete") }
  OPTIONAL { ?s skos:editorialNote ?status }
  OPTIONAL { ?s lexvo:iso639P3PCode ?iso639 }
  OPTIONAL { ?s geo:lat ?latitude; geo:long ?longitude }
}"""

# Display some results.

# In[36]:

for row in itertools.islice(graph.query(LANGUOIDS), 20):
    print('%s %-20s %-17s %-8s %s %-11s %-4s %-8s %s' % row)

# Write the results into a CSV file. Show the beginning of the file.

# In[37]:

CSV = 'glottolog.csv'

graph.query(LANGUOIDS).serialize(CSV, format='csv')

with open(CSV) as fd:
    sample = fd.read(500)

print(sample + '...')

# Let's put that into a relational database so we can reuse it later.

# ## Export to SQLite
#
# Create an SQLite database file, connecting with sqlite3.
# Activate foreign key checks (http://www.sqlite.org/foreignkeys.html) so
# we notice if something is inconsistent.

# In[38]:

import sqlite3

DB = 'glottolog.sqlite3'

conn = sqlite3.connect(DB)
conn.execute('PRAGMA foreign_keys = ON')
conn.execute('PRAGMA synchronous = OFF')
conn.execute('PRAGMA journal_mode = MEMORY')
conn

# Create a table for the results of the languoids query with some
# additional sanity checks. Insert the query rows. Count them.

# In[39]:

conn.execute("""
CREATE TABLE languoid (
    id TEXT NOT NULL PRIMARY KEY,
    label TEXT NOT NULL,
    level TEXT NOT NULL,
    parent TEXT,
    obsolete BOOLEAN NOT NULL,
    status TEXT,
    iso TEXT UNIQUE,
    latitude REAL,
    longitude REAL,
    FOREIGN KEY(parent) REFERENCES languoid(id) DEFERRABLE INITIALLY DEFERRED,
    CHECK (level IN ('LanguageFamily', 'LanguageSubfamily', 'Language', 'Dialect')),
    CHECK (obsolete IN (0, 1)),
    CHECK (status IN ('established', 'spurious', 'spurious retired',
                      'unattested', 'provisional', 'retired'))
)""")

conn.executemany('INSERT INTO languoid VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)',
                 graph.query(LANGUOIDS))
conn.commit()
conn.execute('SELECT count(*) FROM languoid').fetchone()

# Languoids may have n alternative labels. Create a table for the labels
# and their language, retrieve them with SPARQL, insert the query results,
# count rows.

# In[40]:

conn.execute("""
CREATE TABLE label (
    id TEXT NOT NULL,
    lang TEXT NOT NULL,
    label TEXT NOT NULL,
    PRIMARY KEY (id, lang, label),
    FOREIGN KEY(id) REFERENCES languoid(id)
)""")

LABELS = """
SELECT (substr(str(?s), 43) AS ?id) (lang(?label) AS ?lang) ?label WHERE {
  ?s a dcterms:LinguisticSystem ;
     skos:altLabel ?label
}"""

conn.executemany('INSERT INTO label VALUES (?, ?, ?)', graph.query(LABELS))
conn.commit()
conn.execute('SELECT count(*) FROM label').fetchone()

# Languoids may have n references. Create a table for the references,
# retrieve them with SPARQL, insert, count.
# In[41]:

conn.execute("""
CREATE TABLE reference (
    id TEXT NOT NULL,
    reference INTEGER NOT NULL,
    PRIMARY KEY (id, reference),
    FOREIGN KEY(id) REFERENCES languoid(id)
)""")

REFERENCES = """
SELECT (substr(str(?s), 43) AS ?id) (substr(str(?o), 44) AS ?reference) WHERE {
  ?s a dcterms:LinguisticSystem ;
     dcterms:description ?o
  FILTER (strstarts(str(?o), "http://glottolog.org/resource/reference/id/"))
}"""

conn.executemany('INSERT INTO reference VALUES (?, ?)', graph.query(REFERENCES))
conn.commit()
conn.execute('SELECT count(*) FROM reference').fetchone()

# ## Querying with SQLite
#
# Display the number of languoids, broken down by type, and check the
# proportion of superseded entries. Most of the family entries are obsolete.

# In[42]:

print(conn.execute('SELECT count(*) FROM languoid').fetchone())
conn.execute('SELECT level, count(*), sum(obsolete) FROM languoid GROUP BY level').fetchall()

# Check the distribution of status values by type. Only language entries
# distinguish it.

# In[43]:

conn.execute("""SELECT level, status, count(*) AS n FROM languoid
    GROUP BY level, status ORDER BY level, n DESC""").fetchall()

# Display the number of ISO codes and break the proportions down by
# languoid type. ISO 639-3 also contains macrolanguages.

# In[44]:

print(conn.execute('SELECT count(*), count(iso) FROM languoid').fetchone())
conn.execute('SELECT level, count(*), count(iso) FROM languoid GROUP BY level').fetchall()

# Check how many entries specify location. Only language entries do so.

# In[45]:

conn.execute('SELECT level, count(latitude) FROM languoid GROUP BY level').fetchall()

# Display the first and last glottocodes and ISO codes.
# In[46]:

GLOTTOCODES = 'SELECT id FROM languoid ORDER BY id %s LIMIT 10'

print(', '.join(g for g, in conn.execute(GLOTTOCODES % 'ASC')))
print(', '.join(g for g, in conn.execute(GLOTTOCODES % 'DESC')))

ISO_CODES = 'SELECT iso FROM languoid WHERE iso NOT NULL ORDER BY iso %s LIMIT 20'

print(', '.join(i for i, in conn.execute(ISO_CODES % 'ASC')))
print(', '.join(i for i, in conn.execute(ISO_CODES % 'DESC')))

# ### Labels
#
# Display the number of labels, broken down by language and entry type.

# In[47]:

print(conn.execute('SELECT count(*) FROM label').fetchone())
print(conn.execute("""SELECT lang, count(*) AS n FROM label
    GROUP BY lang ORDER BY n DESC LIMIT 5""").fetchall())
conn.execute("""SELECT languoid.level, count(*) AS n
    FROM label JOIN languoid ON languoid.id=label.id
    GROUP BY languoid.level""").fetchall()

# Show the minimal, mean, and maximal number of labels per entry. Check
# the languoids with the most labels.

# In[48]:

print(conn.execute("""SELECT min(n), avg(n), max(n)
    FROM (SELECT count(*) AS n FROM label GROUP BY id)""").fetchone())
conn.execute("""SELECT languoid.label, count(*) AS n
    FROM label JOIN languoid ON languoid.id=label.id
    GROUP BY label.id ORDER BY n DESC LIMIT 3""").fetchall()

# Show the minimal, mean, and maximal label length. Check the frequencies
# of the most common lengths.

# In[49]:

print(conn.execute("""SELECT min(s), avg(s), max(s)
    FROM (SELECT length(label) AS s FROM label)""").fetchall())
conn.execute("""SELECT length(label) AS l, count(*) AS n FROM label
    GROUP BY l HAVING n > 3200 ORDER BY l""").fetchall()

# ### References
#
# Display the number of references, broken down by entry type. There are
# much fewer references for non-languages.

# In[50]:

print(conn.execute('SELECT count(*) FROM reference').fetchone())
conn.execute("""SELECT l.level, count(*) AS n
    FROM reference AS r JOIN languoid AS l ON l.id=r.id
    GROUP BY l.level""").fetchall()

# Show the minimal, mean, and maximal number of references per entry.
# Check the most referenced languoids.

# In[51]:

print(conn.execute("""SELECT min(n), avg(n), max(n)
    FROM (SELECT count(*) AS n FROM reference GROUP BY id)""").fetchone())
conn.execute("""SELECT l.label, count(*) AS n
    FROM reference AS r JOIN languoid AS l ON l.id=r.id
    GROUP BY r.id ORDER BY n DESC LIMIT 3""").fetchall()

# ## Building the tree
#
# The languoid table only specifies the direct parent of each entry, but we
# want to traverse the tree and query whole paths. SQLite supports
# hierarchical queries only from version 3.8.3+, so we use a more general
# approach and generate a table with all tree paths: the transitive closure
# of the parent relation, a.k.a. a tree closure table. Instead of recursion
# inside the database, we put together a bunch of generated SQL queries and
# feed the results back into a new table.

# In[52]:

# Query template for walks of one fixed length: start glottocode, number of
# steps, glottocode reached, and a flag telling whether there is no further
# ancestor beyond it.
PATH = """SELECT i0 AS child, %(depth)d AS steps, i%(depth)d AS parent,
    i%(next)d IS NULL AS terminal
FROM (SELECT %(select)s FROM languoid AS l0 %(joins)s)
WHERE parent IS NOT NULL"""


def path_query(depth):
    # One self-LEFT-JOIN per step up the tree, plus one extra join so the
    # terminal flag can test whether a grandparent exists.
    select = ', '.join('l%(step)d.id AS i%(step)d' % {'step': i}
                       for i in range(depth + 2))
    joins = ' '.join('LEFT JOIN languoid AS l%(next)d '
                     'ON l%(step)d.parent = l%(next)d.id'
                     % {'step': i, 'next': i + 1}
                     for i in range(depth + 1))
    return PATH % {'depth': depth, 'next': depth + 1,
                   'select': select, 'joins': joins}

# path_query generates a query for a tree walk of the length given by
# depth; zero-step (reflexive) walks are omitted.

# In[53]:

print(path_query(1))

# Each query returns the start glottocode, number of steps, end glottocode
# and a boolean indicating if there is no grandparent.
# In[54]:

print(conn.execute('%s ORDER BY i0 LIMIT 3' % path_query(1)).fetchall())
print(conn.execute('%s ORDER BY i0 LIMIT 3' % path_query(2)).fetchall())

# When all paths in the query are terminal, we have arrived at the maximal
# depth.

# In[55]:

conn.execute(path_query(18)).fetchall()

# Create a table for the results. Insert path walks of increasing depth
# until all walks have ended. Count the walks.

# In[56]:

conn.execute("""
CREATE TABLE tree (
    child TEXT NOT NULL,
    steps INTEGER NOT NULL,
    parent TEXT NOT NULL,
    terminal BOOLEAN NOT NULL,
    PRIMARY KEY (child, steps),
    UNIQUE (child, parent),
    UNIQUE (parent, child),
    FOREIGN KEY (child) REFERENCES languoid (id),
    FOREIGN KEY (parent) REFERENCES languoid (id),
    CHECK (terminal IN (0, 1))
)""")

depth = 1
while True:
    rows = conn.execute(path_query(depth)).fetchall()
    if not rows:
        break
    conn.executemany('INSERT INTO tree VALUES (?, ?, ?, ?)', rows)
    depth += 1

conn.commit()
conn.execute('SELECT count(*) FROM tree').fetchone()

# With SQLite 3.8.3 or later, the rows for the tree closure table can also
# be fetched with a single recursive query:
#
#     WITH RECURSIVE tree(child, steps, parent, terminal) AS (
#         SELECT l.id, 1, l.parent, 0
#         FROM languoid AS l
#         WHERE l.parent IS NOT NULL
#         UNION ALL
#         SELECT t.child, t.steps + 1, p.parent, gp.parent IS NULL
#         FROM languoid AS p
#         JOIN tree AS t ON p.id=t.parent
#         LEFT JOIN languoid AS gp ON gp.id=p.parent
#         WHERE p.parent IS NOT NULL
#     )
#     SELECT * FROM tree

# ## Querying the tree
#
# Show the minimal, mean, and maximal number of languages per family.
# Display the language families with the most child languages.
# In[57]:

print(conn.execute("""SELECT min(n), avg(n), max(n)
    FROM (SELECT count(*) AS n
        FROM languoid AS p
        JOIN tree AS w ON w.parent=p.id AND w.terminal
        JOIN languoid AS c ON w.child=c.id AND c.level='Language'
        WHERE p.level='LanguageFamily' GROUP BY p.id)""").fetchone())
conn.execute("""SELECT p.label, count(*) AS n
    FROM languoid AS p
    JOIN tree AS w ON w.parent=p.id AND w.terminal
    JOIN languoid AS c ON w.child=c.id AND c.level='Language'
    WHERE p.level='LanguageFamily'
    GROUP BY p.id ORDER BY n DESC LIMIT 3""").fetchall()

# Determine the languages with the most dialects.

# In[58]:

conn.execute("""SELECT p.label, count(*) AS n
    FROM languoid AS p
    JOIN tree AS w ON w.parent=p.id AND w.terminal
    JOIN languoid AS c ON w.child=c.id AND c.level='Dialect'
    WHERE p.level='Language'
    GROUP BY p.id ORDER BY n DESC LIMIT 4""").fetchall()

# Display some of the longest paths.

# In[59]:

for child, path in conn.execute("""SELECT c.label,
    (SELECT group_concat(parent, ' <- ')
        FROM (SELECT g.child AS child, p.label AS parent
            FROM tree AS g JOIN languoid AS p ON g.parent=p.id
            WHERE child=c.id ORDER BY g.steps DESC)
        GROUP BY child)
    FROM languoid AS c
    JOIN tree AS w ON w.child=c.id AND w.terminal
    ORDER BY w.steps DESC, c.id LIMIT 3"""):
    print('%s <= %s\n' % (path, child))

# Note that with SPARQL the number of steps is not available
# (http://www.w3.org/TR/sparql11-property-paths/#Outstanding_Issues), so it
# might be difficult to get the path in the right order like this.

# ## Analysis with pandas
#
# Activate inline plotting in this notebook.

# In[60]:

get_ipython().run_line_magic('matplotlib', 'inline')

# Load the language labels into a pandas DataFrame. Display the result.
# In[61]:

import pandas as pd

pd.set_option('max_rows', 15)

labels = pd.read_sql_query("""SELECT label.* FROM label
    JOIN languoid ON label.id=languoid.id
    WHERE languoid.level='Language'
    ORDER BY label.id""", conn, index_col='id')
labels

# Break the number of labels down by language.

# In[62]:

labels_lang = labels.groupby('lang').size().sort_values(ascending=False)
labels_lang[labels_lang > 400].plot.bar();

# Show summary statistics on the number of labels per languoid. Plot the
# more common label count frequencies.

# In[63]:

nlabels = labels.groupby(level='id').size()
nlabels_hist = nlabels.value_counts().sort_index()
print(nlabels.describe())
nlabels_hist[nlabels_hist > 30].plot.area();

# Do statistics on the string length of the labels.

# In[64]:

slabel = labels['label'].str.len()
slabel_hist = slabel.value_counts().sort_index()
print(slabel.describe())
slabel_hist[slabel_hist > 30].plot.area();

# Load the languages and the full paths into data frames. Join them into
# one data frame and show the result.

# In[65]:

languages = pd.read_sql_query("""SELECT * FROM languoid
    WHERE level='Language' AND NOT obsolete
    ORDER BY id""", conn, index_col='id')

tree = pd.read_sql_query('SELECT * FROM tree WHERE terminal', conn,
                         index_col='child')

langs = languages.join(tree, how='left', rsuffix='_tree')
langs

# Analyze the number of languages per top-level family.

# In[66]:

famsizes = langs.groupby('parent_tree').size().sort_values(ascending=False)
print(famsizes.describe())
famsizes[famsizes > 100].plot.bar();

# Analyze the number of steps from languages to their top-level family.

# In[67]:

langs['steps'] = langs['steps'].fillna(0)
print(langs['steps'].describe())
langs['steps'].value_counts().sort_index().plot.area();

# Inspect the geographical distribution of languages.
# In[68]:

langs['latitude'].hist(bins=100);

# In[69]:

langs['longitude'].hist(bins=100);

# Scatter-plot all language coordinates on a world-sized axis grid.

# In[70]:

import matplotlib.pyplot as plt

plt.figure(figsize=(12, 6))
plt.axis([-180, 180, -90, 90])
plt.xticks(range(-180, 181, 60))
plt.yticks(range(-90, 91, 30))
plt.scatter(langs['longitude'], langs['latitude'], 1);