In [1]:
%matplotlib inline

import collections
import os

os.environ['SQLALCHEMY_WARN_20'] = 'true'

import pandas as pd
import sqlalchemy as sa
import matplotlib as mpl
import matplotlib.pyplot as plt

import treedb
import treedb.raw

treedb.configure_logging(log_sql=False)

treedb.print_versions()
treedb version: 2.4.3
sqlalchemy version: 1.4.36
sqlite_version: 3.38.5
csv23 version: 0.3.3

Clone the data repository

In [2]:
%%time

GLOTTOLOG_TAG = 'v4.6'

TARGET = '../glottolog/'

treedb.checkout_or_clone(GLOTTOLOG_TAG, target=TARGET)
CPU times: total: 0 ns
Wall time: 50.6 s
Out[2]:
(CompletedProcess(args=['git', 'clone', '-c', 'advice.detachedHead=false', '--single-branch', '--branch', 'v4.6', '--depth', '1', 'https://github.com/glottolog/glottolog.git', WindowsPath('../glottolog')], returncode=0),
 CompletedProcess(args=['git', 'checkout', '-B', 'treedb', 'v4.6'], returncode=0))
In [3]:
treedb.glottolog_version()
Out[3]:
GlottologVersion(commit='aababe11c3afa6388b84de2b12255155ab0646b2', describe='v4.6')

Use the repository root as source

In [4]:
treedb.set_root(TARGET)
Out[4]:
<treedb._proxies.PathProxy path='../glottolog/languoids/tree' inode=13510798882220529>
In [5]:
next(treedb.iterfiles())
Out[5]:
FileInfo(path=('abin1243',), dentry=<DirEntry 'md.ini'>, config=<treedb.languoids.files.ConfigParser object at 0x00000177AC2836A0>)
In [6]:
dict(treedb.iterlanguoids(limit=1))
Out[6]:
{('abin1243',): {'id': 'abin1243',
  'parent_id': None,
  'name': 'Abinomn',
  'level': 'language',
  'hid': 'bsa',
  'iso639_3': 'bsa',
  'latitude': -2.92281,
  'longitude': 138.891,
  'macroareas': ['Papunesia'],
  'countries': [{'id': 'ID', 'name': 'Indonesia'}],
  'links': [{'url': 'https://en.wikipedia.org/wiki/Abinomn_language',
    'title': None,
    'scheme': 'https'},
   {'url': 'https://www.wikidata.org/entity/Q56648',
    'title': None,
    'scheme': 'https'}],
  'timespan': None,
  'sources': {'glottolog': [{'bibfile': 'hh',
     'bibkey': 'e:Lagerberg:Moegip',
     'pages': None,
     'trigger': None},
    {'bibfile': 'hh',
     'bibkey': 'h:SilzerClouse:Index',
     'pages': None,
     'trigger': None},
    {'bibfile': 'hh',
     'bibkey': 'h:SilzerHeikkinen:Irian',
     'pages': None,
     'trigger': None},
    {'bibfile': 'hh',
     'bibkey': 'hv:Foley:Northwest-New-Guinea',
     'pages': None,
     'trigger': None},
    {'bibfile': 'hh',
     'bibkey': 'hvtyp:DonohueMusgrave:Melanesia',
     'pages': None,
     'trigger': None},
    {'bibfile': 'hh',
     'bibkey': 'w:Fiwei:Abinomn',
     'pages': None,
     'trigger': None}]},
  'altnames': {'multitree': [{'name': '"Baso"', 'lang': None},
    {'name': 'Abinomn', 'lang': None},
    {'name': 'Avinomen', 'lang': None},
    {'name': 'Foja', 'lang': None},
    {'name': 'Foya', 'lang': None}],
   'lexvo': [{'name': 'Abinomn', 'lang': 'en'},
    {'name': 'Abinomn language', 'lang': 'en'},
    {'name': 'Abinomneg', 'lang': 'br'},
    {'name': 'Lingua abinomn', 'lang': 'gl'},
    {'name': 'Llingua Abinomn', 'lang': 'ast'}],
   'hhbib_lgcode': [{'name': 'Baso', 'lang': None}]},
  'triggers': {'lgcode': ['macrohistory', 'moegip']},
  'identifier': {'multitree': 'bsa', 'endangeredlanguages': '1763'},
  'classification': {'familyrefs': [{'bibfile': 'hh',
     'bibkey': 'h:SilzerClouse:Index',
     'pages': None,
     'trigger': None},
    {'bibfile': 'hh',
     'bibkey': 'hvtyp:DonohueMusgrave:Melanesia',
     'pages': None,
     'trigger': None}]},
  'endangerment': {'status': 'shifting',
   'source': {'name': 'ElCat', 'bibfile': None, 'bibkey': None, 'pages': None},
   'date': datetime.datetime(2017, 8, 19, 8, 16, 16),
   'comment': 'Abinomn (1763-bsa) = Endangered (20 percent certain, based on the evidence available)'},
  'hh_ethnologue_comment': None,
  'iso_retirement': None}}
In [7]:
%time treedb.checksum(source='files')
CPU times: total: 13.9 s
Wall time: 16.3 s
Out[7]:
'path_languoid:path:sha256:3e07ebd1d58f84604dab5f717dfcd2745b8dd37adbecab61a6bfdcb4b9e85d1c'

Set the database file and load it

In [8]:
treedb.set_engine('treedb.sqlite3')
Out[8]:
<treedb._proxies.SQLiteEngineProxy filename='treedb.sqlite3' size=None>
In [9]:
%%time

engine = treedb.load(rebuild=False, exclude_raw=False)
engine
0:00:48.633790
CPU times: total: 42.7 s
Wall time: 48.9 s
Out[9]:
<treedb._proxies.SQLiteEngineProxy filename='treedb.sqlite3' size=71778304>

Check the database

In [10]:
engine.file_mtime()
Out[10]:
datetime.datetime(2022, 5, 27, 20, 9, 16, 457834)
In [11]:
engine.file_size(as_megabytes=True)
Out[11]:
68.453125
In [12]:
%time engine.file_sha256()
CPU times: total: 46.9 ms
Wall time: 50 ms
Out[12]:
'fb20b671c08e227187475d23ad3e8b7fb477a770463b59fa8e131c85de119284'
In [13]:
%time treedb.raw.checksum()
CPU times: total: 109 ms
Wall time: 106 ms
Out[13]:
'strong:sha256:fcc129dbfedc77a8238565e4e12eb53198e70dda9f6613c7f467e5b057dc04ca'
In [14]:
%time treedb.raw.checksum(weak=True)
CPU times: total: 3.09 s
Wall time: 3.09 s
Out[14]:
'weak:sha256:1610520f11227a5489b44693b14a5d8db423d66d7ace391c2db221a56036457a'
In [15]:
%time treedb.checksum(source='tables')
CPU times: total: 4.47 s
Wall time: 4.46 s
Out[15]:
'path_languoid:path:sha256:3e07ebd1d58f84604dab5f717dfcd2745b8dd37adbecab61a6bfdcb4b9e85d1c'
In [16]:
%time treedb.checksum(source='raw')
CPU times: total: 5.16 s
Wall time: 5.15 s
Out[16]:
'path_languoid:path:sha256:3e07ebd1d58f84604dab5f717dfcd2745b8dd37adbecab61a6bfdcb4b9e85d1c'
In [17]:
%time treedb.check()
valid_pseudofamily_referencesCheck: OK
pseudofamilies_are_rootsCheck: OK
valid_glottocodeCheck: OK
valid_iso639_3Check: OK
valid_hidCheck: OK
clean_nameCheck: OK
family_parentCheck: OK
language_parentCheck: OK
dialect_parentCheck: OK
family_childrenCheck: OK
family_languagesCheck: OK
no_empty_filesCheck: OK
CPU times: total: 922 ms
Wall time: 919 ms
Out[17]:
True

treedb.sqlite3

In [18]:
treedb.configure_logging(level='INFO', log_sql=True)

treedb.scalar(sa.select(sa.func.sqlite_version()))
[[email protected]_] treedb version: 2.4.3
BEGIN (implicit)
SELECT sqlite_version() AS sqlite_version_1
[generated in 0.00074s] ()
ROLLBACK
Out[18]:
'3.38.5'
In [19]:
application_id = treedb.scalar(sa.text('PRAGMA application_id'))

assert application_id == 1122 == 0x462
assert application_id == sum(ord(c) for c in treedb.Dataset.__tablename__)
assert treedb.Dataset.__tablename__ == '__dataset__'

application_id
BEGIN (implicit)
PRAGMA application_id
[generated in 0.00069s] ()
ROLLBACK
Out[19]:
1122
In [20]:
from treedb import pd_read_sql as read_sql

read_sql(treedb.select_tables_nrows(), index_col='table_name')
BEGIN (implicit)
SELECT sqlite_master.name 
FROM sqlite_master 
WHERE sqlite_master.type = ? AND sqlite_master.name NOT LIKE ? ORDER BY sqlite_master.name
[generated in 0.00075s] ('table', 'sqlite_%')
ROLLBACK
[[email protected]] pandas version: 1.4.2
BEGIN (implicit)
SELECT ? AS table_name, (SELECT count(*) AS n 
FROM __dataset__) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM __producer__) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM _config) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM _file) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM _option) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM _value) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM altname) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM altnameprovider) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM bibfile) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM bibitem) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM classificationcomment) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM classificationref) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM country) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM endangerment) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM endangermentsource) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM endangermentstatus) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM ethnologuecomment) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM identifier) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM identifiersite) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM isoretirement) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM isoretirement_changeto) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM languoid) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM languoid_country) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM languoid_macroarea) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM languoidlevel) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM link) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM macroarea) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM pseudofamily) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM source) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM sourceprovider) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM timespan) AS n_rows UNION ALL SELECT ? AS table_name, (SELECT count(*) AS n 
FROM "trigger") AS n_rows
[generated in 0.00092s] ('__dataset__', '__producer__', '_config', '_file', '_option', '_value', 'altname', 'altnameprovider', 'bibfile', 'bibitem', 'classificationcomment', 'classificationref', 'country', 'endangerment', 'endangermentsource', 'endangermentstatus', 'ethnologuecomment', 'identifier', 'identifiersite', 'isoretirement', 'isoretirement_changeto', 'languoid', 'languoid_country', 'languoid_macroarea', 'languoidlevel', 'link', 'macroarea', 'pseudofamily', 'source', 'sourceprovider', 'timespan', 'trigger')
ROLLBACK
Out[20]:
n_rows
table_name
__dataset__ 1
__producer__ 1
_config 370
_file 26285
_option 53
_value 624375
altname 156868
altnameprovider 11
bibfile 40
bibitem 148716
classificationcomment 12026
classificationref 17321
country 246
endangerment 8345
endangermentsource 52
endangermentstatus 6
ethnologuecomment 613
identifier 21402
identifiersite 4
isoretirement 358
isoretirement_changeto 451
languoid 26285
languoid_country 11463
languoid_macroarea 21616
languoidlevel 3
link 29722
macroarea 6
pseudofamily 8
source 197791
sourceprovider 1
timespan 199
trigger 30258
In [21]:
treedb.print_rows(treedb.backend.sqlite_master.select_views(),
                  format_='{name}')
BEGIN (implicit)
SELECT sqlite_master.name 
FROM sqlite_master 
WHERE sqlite_master.type = ? AND sqlite_master.name NOT LIKE ? ORDER BY sqlite_master.name
[cached since 0.06139s ago] ('view', 'sqlite_%')
ROLLBACK
example
path_languoid
stats
In [22]:
from treedb import print_table_sql as print_sql

print_sql('sqlite_master')
BEGIN (implicit)
SELECT sqlite_master.sql 
FROM sqlite_master 
WHERE sqlite_master.type = ? AND sqlite_master.name = ?
[generated in 0.00091s] ('table', 'sqlite_master')
SELECT count(*) AS n_rows 
FROM sqlite_master
[generated in 0.00049s] ()
ROLLBACK
None
66
In [23]:
treedb.print_dataset()
BEGIN (implicit)
SELECT __dataset__.id, __dataset__.title, __dataset__.git_commit, __dataset__.git_describe, __dataset__.clean, __dataset__.version, __dataset__.exclude_raw 
FROM __dataset__
[generated in 0.00100s] ()
SELECT __producer__.id, __producer__.name, __producer__.version 
FROM __producer__
[cached since 14.05s ago] ()
ROLLBACK
[[email protected]] git describe 'v4.6' clean: True
[[email protected]] __dataset__.git_commit: 'aababe11c3afa6388b84de2b12255155ab0646b2'
[[email protected]] __dataset__.version: '4.6'
[[email protected]] __producer__.name: treedb
[[email protected]] __producer__.version: 2.4.3
git describe 'v4.6' clean: True
__dataset__.title: 'Glottolog treedb''
__dataset__.git_commit: 'aababe11c3afa6388b84de2b12255155ab0646b2'
__dataset__.version: '4.6'
__dataset__.exclude_raw: False
__producer__.name: treedb
__producer__.version: 2.4.3

__dataset__

In [24]:
from treedb import Dataset

print_sql(Dataset)
BEGIN (implicit)
SELECT sqlite_master.sql 
FROM sqlite_master 
WHERE sqlite_master.type = ? AND sqlite_master.name = ?
[cached since 0.03181s ago] ('table', '__dataset__')
SELECT count(*) AS n_rows 
FROM __dataset__
[generated in 0.00058s] ()
ROLLBACK
CREATE TABLE __dataset__ (
	id INTEGER NOT NULL CHECK (id = 1), 
	title TEXT NOT NULL CHECK (title != ''), 
	git_commit VARCHAR(40) NOT NULL CHECK (length(git_commit) = 40), 
	git_describe TEXT NOT NULL CHECK (git_describe != ''), 
	clean BOOLEAN NOT NULL, 
	version TEXT CHECK (version != ''), 
	exclude_raw BOOLEAN NOT NULL, 
	PRIMARY KEY (id), 
	UNIQUE (git_commit), 
	UNIQUE (git_describe), 
	CHECK (clean IN (0, 1)), 
	CHECK (exclude_raw IN (0, 1))
)
1
In [25]:
dataset, = treedb.iterrows(sa.select(Dataset), mappings=True)

pd.DataFrame.from_dict(dataset, orient='index',
                       columns=['__dataset__'])
BEGIN (implicit)
SELECT __dataset__.id, __dataset__.title, __dataset__.git_commit, __dataset__.git_describe, __dataset__.clean, __dataset__.version, __dataset__.exclude_raw 
FROM __dataset__
[cached since 0.03478s ago] ()
ROLLBACK
Out[25]:
__dataset__
id 1
title Glottolog treedb
git_commit aababe11c3afa6388b84de2b12255155ab0646b2
git_describe v4.6
clean True
version 4.6
exclude_raw False

__producer__

In [26]:
from treedb import Producer

print_sql(Producer)
BEGIN (implicit)
SELECT sqlite_master.sql 
FROM sqlite_master 
WHERE sqlite_master.type = ? AND sqlite_master.name = ?
[cached since 0.06319s ago] ('table', '__producer__')
SELECT count(*) AS n_rows 
FROM __producer__
[generated in 0.00045s] ()
ROLLBACK
CREATE TABLE __producer__ (
	id INTEGER NOT NULL CHECK (id = 1), 
	name TEXT NOT NULL CHECK (name != ''), 
	version TEXT NOT NULL CHECK (version != ''), 
	PRIMARY KEY (id), 
	UNIQUE (name)
)
1
In [27]:
producer, = treedb.iterrows(sa.select(Producer), mappings=True)

pd.DataFrame.from_dict(producer, orient='index',
                       columns=['__producer__'])
BEGIN (implicit)
SELECT __producer__.id, __producer__.name, __producer__.version 
FROM __producer__
[cached since 14.11s ago] ()
ROLLBACK
Out[27]:
__producer__
id 1
name treedb
version 2.4.3

treedb.raw

In [28]:
from treedb.raw import File, Option, Value

for model in (File, Option, Value):
    print_sql(model)
BEGIN (implicit)
SELECT sqlite_master.sql 
FROM sqlite_master 
WHERE sqlite_master.type = ? AND sqlite_master.name = ?
[cached since 0.09509s ago] ('table', '_file')
SELECT count(*) AS n_rows 
FROM _file
[generated in 0.00051s] ()
ROLLBACK
CREATE TABLE _file (
	id INTEGER NOT NULL, 
	glottocode VARCHAR(8) NOT NULL CHECK (length(glottocode) = 8), 
	path TEXT NOT NULL CHECK (length(path) >= 8 AND (length(path) + 1) % 9 = 0), 
	size INTEGER NOT NULL CHECK (size > 0), 
	sha256 VARCHAR(64) NOT NULL CHECK (length(sha256) = 64), 
	PRIMARY KEY (id), 
	CHECK (substr(path, -length(glottocode)) = glottocode), 
	UNIQUE (glottocode), 
	UNIQUE (path), 
	UNIQUE (sha256)
)
26285
BEGIN (implicit)
SELECT sqlite_master.sql 
FROM sqlite_master 
WHERE sqlite_master.type = ? AND sqlite_master.name = ?
[cached since 0.103s ago] ('table', '_option')
SELECT count(*) AS n_rows 
FROM _option
[generated in 0.00035s] ()
ROLLBACK
CREATE TABLE _option (
	id INTEGER NOT NULL, 
	section TEXT NOT NULL CHECK (section != ''), 
	option TEXT NOT NULL CHECK (option != ''), 
	is_lines BOOLEAN, 
	defined BOOLEAN NOT NULL, 
	defined_any_options BOOLEAN NOT NULL, 
	ord_section INTEGER CHECK (ord_section >= 1), 
	ord_option INTEGER CHECK (ord_section >= 0), 
	PRIMARY KEY (id), 
	UNIQUE (section, option), 
	CHECK ((is_lines IS NULL) = (defined = 0)), 
	CHECK (defined = 1 OR defined_any_options = 0), 
	CHECK ((defined = 0) = (ord_section IS NULL)), 
	CHECK (ord_section IS NOT NULL OR ord_option IS NULL), 
	CHECK (is_lines IN (0, 1)), 
	CHECK (defined IN (0, 1)), 
	CHECK (defined_any_options IN (0, 1))
)
53
BEGIN (implicit)
SELECT sqlite_master.sql 
FROM sqlite_master 
WHERE sqlite_master.type = ? AND sqlite_master.name = ?
[cached since 0.1096s ago] ('table', '_value')
SELECT count(*) AS n_rows 
FROM _value
[generated in 0.00051s] ()
ROLLBACK
CREATE TABLE _value (
	file_id INTEGER NOT NULL, 
	option_id INTEGER NOT NULL, 
	line INTEGER NOT NULL CHECK (line > 0), 
	value TEXT NOT NULL CHECK (value != ''), 
	PRIMARY KEY (file_id, option_id, line), 
	UNIQUE (file_id, line), 
	FOREIGN KEY(file_id) REFERENCES _file (id), 
	FOREIGN KEY(option_id) REFERENCES _option (id)
) WITHOUT ROWID
624375
In [29]:
read_sql(sa.select(File).limit(5), index_col='id')
BEGIN (implicit)
SELECT _file.id, _file.glottocode, _file.path, _file.size, _file.sha256 
FROM _file
 LIMIT ? OFFSET ?
[generated in 0.00090s] (5, 0)
ROLLBACK
Out[29]:
glottocode path size sha256
id
1 abin1243 abin1243 1109 c41caf5e12bcc9a22c6ff184e6a659ca744c4fe363cf1d...
2 abis1238 abis1238 1702 b9889f1e63c8f7244f4228b21183594aae965eab830396...
3 abkh1242 abkh1242 306 5dd28ae1265a0967b1a236d06c7fd6e7ac3b9e32d866dd...
4 abkh1243 abkh1242/abkh1243 257 a5900355c69af22d94eaa0a147b95fc291270634bc033c...
5 abaz1241 abkh1242/abkh1243/abaz1241 2363 93c6b3b27e142d40f38c8c55b929217edd534f80f44a94...
In [30]:
read_sql(sa.select(Option).limit(5), index_col='id')
BEGIN (implicit)
SELECT _option.id, _option.section, _option.option, _option.is_lines, _option.defined, _option.defined_any_options, _option.ord_section, _option.ord_option 
FROM _option
 LIMIT ? OFFSET ?
[generated in 0.00093s] (5, 0)
ROLLBACK
Out[30]:
section option is_lines defined defined_any_options ord_section ord_option
id
1 core name False True False 1 1
2 core hid False True False 1 2
3 core level False True False 1 3
4 core iso639-3 False True False 1 4
5 core latitude False True False 1 5
In [31]:
read_sql(sa.select(Value).limit(5), index_col=['file_id', 'option_id'])
BEGIN (implicit)
SELECT _value.file_id, _value.option_id, _value.line, _value.value 
FROM _value
 LIMIT ? OFFSET ?
[generated in 0.00085s] (5, 0)
ROLLBACK
Out[31]:
line value
file_id option_id
1 1 1 Abinomn
2 2 bsa
3 3 language
4 4 bsa
5 5 -2.92281
In [32]:
select_file_values = (sa.select(Option.section, Option.option, Value.line, Value.value)
                      .select_from(File)
                      .filter_by(glottocode=sa.bindparam('glottocode'))
                      .join(Value).join(Option))

read_sql(select_file_values, params={'glottocode': 'abin1243'},
         index_col=['section', 'option', 'line'])
BEGIN (implicit)
SELECT _option.section, _option.option, _value.line, _value.value 
FROM _file JOIN _value ON _file.id = _value.file_id JOIN _option ON _option.id = _value.option_id 
WHERE _file.glottocode = ?
[generated in 0.00085s] ('abin1243',)
ROLLBACK
Out[32]:
value
section option line
core name 1 Abinomn
hid 2 bsa
level 3 language
iso639-3 4 bsa
latitude 5 -2.92281
longitude 6 138.891
macroareas 7 Papunesia
countries 8 ID
links 9 https://en.wikipedia.org/wiki/Abinomn_language
10 https://www.wikidata.org/entity/Q56648
sources glottolog 11 **hh:e:Lagerberg:Moegip**
12 **hh:h:SilzerClouse:Index**
13 **hh:h:SilzerHeikkinen:Irian**
14 **hh:hv:Foley:Northwest-New-Guinea**
15 **hh:hvtyp:DonohueMusgrave:Melanesia**
16 **hh:w:Fiwei:Abinomn**
altnames multitree 17 "Baso"
18 Abinomn
19 Avinomen
20 Foja
21 Foya
lexvo 22 Abinomn [en]
23 Abinomn language [en]
24 Abinomneg [br]
25 Lingua abinomn [gl]
26 Llingua Abinomn [ast]
hhbib_lgcode 27 Baso
triggers lgcode 28 macrohistory
29 moegip
identifier multitree 30 bsa
endangeredlanguages 31 1763
classification familyrefs 32 **hh:h:SilzerClouse:Index**
33 **hh:hvtyp:DonohueMusgrave:Melanesia**
endangerment status 34 shifting
source 35 ElCat
date 36 2017-08-19T08:16:16
comment 37 Abinomn (1763-bsa) = Endangered (20 percent ce...
In [33]:
path_depth = File.path_depth()

select_path_depths = (sa.select(path_depth,
                                treedb.Languoid.level, sa.func.count().label('n_files'))
                      .join_from(File, treedb.Languoid, File.glottocode == treedb.Languoid.id)
                      .group_by(path_depth, treedb.Languoid.level)
                      .order_by('path_depth', 'level'))

_ = (read_sql(select_path_depths, index_col=['path_depth', 'level'])
     .unstack(fill_value=0).droplevel(0, axis='columns')[list(treedb.LEVEL)])

_.plot.bar(stacked=True, figsize=(12, 3))

(100 * _.div(_.sum(axis='columns'), axis='rows')).plot.bar(stacked=True, figsize=(12, 3));
BEGIN (implicit)
SELECT floor((length(_file.path) + ?) / ?) AS path_depth, languoid.level, count(*) AS n_files 
FROM _file JOIN languoid ON _file.glottocode = languoid.id GROUP BY floor((length(_file.path) + ?) / ?), languoid.level ORDER BY path_depth, languoid.level
[generated in 0.00076s] (1, 9, 1, 9)
ROLLBACK
In [34]:
file_size = File.size.label('file_size')

select_file_sizes = (sa.select(file_size, sa.func.count().label('n_files'))
                     .group_by(file_size)
                     .order_by('file_size'))

_ = read_sql(select_file_sizes, index_col='file_size')

(_.plot.area(figsize=(12, 3), logx=True)
 .xaxis.set_major_formatter(mpl.ticker.ScalarFormatter()))
(_.index.to_series().describe()
 .to_frame().transpose()[['count', 'min', 'max']])
BEGIN (implicit)
SELECT _file.size AS file_size, count(*) AS n_files 
FROM _file GROUP BY _file.size ORDER BY file_size
[generated in 0.00106s] ()
ROLLBACK
Out[34]:
count min max
file_size 3286.0 65.0 47175.0
In [35]:
file_nvalues = (sa.select(File.glottocode,
                          sa.func.count(Value.option_id.distinct()).label('n_values'))
                .join_from(File, Value)
                .group_by(File.glottocode)
                .alias('file_nvalues')
                .c.n_values)

select_nvalues = (sa.select(file_nvalues, sa.func.count().label('n_files'))
                  .group_by(file_nvalues)
                  .order_by(file_nvalues))

_ = read_sql(select_nvalues, index_col='n_values')

_.plot.bar(figsize=(12, 3))
(_.index.to_series().describe()
 .to_frame().transpose()[['count', 'min', 'max']])
BEGIN (implicit)
SELECT file_nvalues.n_values, count(*) AS n_files 
FROM (SELECT _file.glottocode AS glottocode, count(DISTINCT _value.option_id) AS n_values 
FROM _file JOIN _value ON _file.id = _value.file_id GROUP BY _file.glottocode) AS file_nvalues GROUP BY file_nvalues.n_values ORDER BY file_nvalues.n_values
[generated in 0.00080s] ()
ROLLBACK
Out[35]:
count min max
n_values 36.0 2.0 37.0
In [36]:
value_length = sa.func.length(Value.value).label('value_length')

select_value_length = (sa.select(value_length, sa.func.count().label('n_values'))
                       .group_by(value_length)
                       .order_by('value_length'))

_ = read_sql(select_value_length, index_col='value_length')

(_.plot.area(figsize=(12, 3), logx=True)
 .xaxis.set_major_formatter(mpl.ticker.ScalarFormatter()))
(_.index.to_series().describe()
 .to_frame().transpose()[['count', 'min', 'max']])
BEGIN (implicit)
SELECT length(_value.value) AS value_length, count(*) AS n_values 
FROM _value GROUP BY length(_value.value) ORDER BY value_length
[generated in 0.00092s] ()
ROLLBACK
Out[36]:
count min max
value_length 633.0 1.0 6215.0
In [37]:
%time treedb.raw.print_stats()
[[email protected]] fetch statistics
BEGIN (implicit)
SELECT _option.section, _option.option, count(*) AS n 
FROM _option JOIN _value ON _option.id = _value.option_id GROUP BY _option.section, _option.option ORDER BY _option.defined DESC, _option.ord_section, _option.ord_option, _option.section, n DESC, _option.option
[generated in 0.00075s] ()
ROLLBACK
core                   name                   26,285
core                   hid                    8,858
core                   level                  26,285
core                   iso639-3               8,130
core                   latitude               8,755
core                   longitude              8,755
core                   macroareas             21,616
core                   countries              11,463
core                   name_comment           14
core                   comment                1
core                   location               1
core                   name_pronunciation     1
core                   speakers               1
core                   links                  29,722
core                   timespan               199
sources                glottolog              197,791
altnames               multitree              54,501
altnames               hhbib_lgcode           34,996
altnames               lexvo                  26,027
altnames               elcat                  24,422
altnames               aiatsis                6,542
altnames               wals                   2,442
altnames               wals other             2,097
altnames               moseley & asher (1994) 2,089
altnames               ruhlen (1987)          2,007
altnames               glottolog              1,742
altnames               ethnologue             3
triggers               lgcode                 30,071
triggers               inlg                   187
identifier             multitree              14,849
identifier             endangeredlanguages    3,596
identifier             wals                   2,620
identifier             languagelandscape      337
classification         sub                    11,816
classification         subrefs                16,613
classification         family                 210
classification         familyrefs             708
endangerment           status                 8,345
endangerment           source                 8,345
endangerment           date                   8,345
endangerment           comment                8,345
hh_ethnologue_comment  isohid                 613
hh_ethnologue_comment  comment_type           613
hh_ethnologue_comment  ethnologue_versions    613
hh_ethnologue_comment  comment                613
iso_retirement         code                   358
iso_retirement         name                   358
iso_retirement         change_request         349
iso_retirement         effective              358
iso_retirement         reason                 358
iso_retirement         change_to              451
iso_retirement         remedy                 327
iso_retirement         comment                232
CPU times: total: 766 ms
Wall time: 744 ms

languoid

In [38]:
from treedb import Languoid

print_sql(Languoid)
BEGIN (implicit)
SELECT sqlite_master.sql 
FROM sqlite_master 
WHERE sqlite_master.type = ? AND sqlite_master.name = ?
[cached since 2.673s ago] ('table', 'languoid')
SELECT count(*) AS n_rows 
FROM languoid
[generated in 0.00046s] ()
ROLLBACK
CREATE TABLE languoid (
	id VARCHAR(8) NOT NULL CHECK (length(id) = 8), 
	name VARCHAR NOT NULL CHECK (name != ''), 
	level VARCHAR NOT NULL, 
	parent_id VARCHAR(8), 
	hid TEXT CHECK (length(hid) >= 3), 
	iso639_3 VARCHAR(3) CHECK (length(iso639_3) = 3), 
	latitude FLOAT CHECK (latitude BETWEEN -90 AND 90), 
	longitude FLOAT CHECK (longitude BETWEEN -180 AND 180), 
	PRIMARY KEY (id), 
	CHECK ((latitude IS NULL) = (longitude IS NULL)), 
	UNIQUE (name), 
	FOREIGN KEY(level) REFERENCES languoidlevel (name), 
	FOREIGN KEY(parent_id) REFERENCES languoid (id) DEFERRABLE INITIALLY DEFERRED, 
	UNIQUE (hid), 
	UNIQUE (iso639_3)
) WITHOUT ROWID
26285
In [39]:
from treedb import LEVEL
from treedb.models import LanguoidLevel

FAMILY, LANGUAGE, DIALECT = LEVEL

print_sql(LanguoidLevel)
read_sql(sa.select(LanguoidLevel).order_by('ordinal'), index_col='name')
BEGIN (implicit)
SELECT sqlite_master.sql 
FROM sqlite_master 
WHERE sqlite_master.type = ? AND sqlite_master.name = ?
[cached since 2.69s ago] ('table', 'languoidlevel')
SELECT count(*) AS n_rows 
FROM languoidlevel
[generated in 0.00037s] ()
ROLLBACK
CREATE TABLE languoidlevel (
	name VARCHAR NOT NULL CHECK (name != ''), 
	description TEXT NOT NULL CHECK (description != ''), 
	ordinal INTEGER NOT NULL CHECK (ordinal >= 1), 
	PRIMARY KEY (name)
) WITHOUT ROWID
3
BEGIN (implicit)
SELECT languoidlevel.name, languoidlevel.description, languoidlevel.ordinal 
FROM languoidlevel ORDER BY languoidlevel.ordinal
[generated in 0.00069s] ()
ROLLBACK
Out[39]:
description ordinal
name
family sub-grouping of languoids above the language l... 1
language defined by mutual non-intellegibility 2
dialect any variety which is not a language 3
In [40]:
from treedb.models import PseudoFamily

print_sql(PseudoFamily)
read_sql(sa.select(PseudoFamily).order_by('name'), index_col='config_section')
BEGIN (implicit)
SELECT sqlite_master.sql 
FROM sqlite_master 
WHERE sqlite_master.type = ? AND sqlite_master.name = ?
[cached since 2.724s ago] ('table', 'pseudofamily')
SELECT count(*) AS n_rows 
FROM pseudofamily
[generated in 0.00041s] ()
ROLLBACK
CREATE TABLE pseudofamily (
	languoid_id VARCHAR(8) NOT NULL, 
	name VARCHAR NOT NULL, 
	config_section VARCHAR NOT NULL CHECK (config_section != ''), 
	description TEXT CHECK (description != ''), 
	bookkeeping BOOLEAN CHECK (bookkeeping = 1), 
	PRIMARY KEY (languoid_id), 
	FOREIGN KEY(languoid_id) REFERENCES languoid (id), 
	UNIQUE (name), 
	FOREIGN KEY(name) REFERENCES languoid (name), 
	UNIQUE (config_section), 
	UNIQUE (bookkeeping)
) WITHOUT ROWID
8
BEGIN (implicit)
SELECT pseudofamily.languoid_id, pseudofamily.name, pseudofamily.config_section, pseudofamily.description, pseudofamily.bookkeeping 
FROM pseudofamily ORDER BY pseudofamily.name
[generated in 0.00057s] ()
ROLLBACK
Out[40]:
languoid_id name description bookkeeping
config_section
artificial_language arti1236 Artificial Language A language known to have been created by consc... None
bookkeeping book1242 Bookkeeping An alleged language that has been replaced wit... True
mixed_language mixe1287 Mixed Language None None
pidgin pidg1258 Pidgin A language used for interethnic communication ... None
sign_language sign1238 Sign Language A language with visible manual signs as the mo... None
speech_register spee1234 Speech Register A language not used by a community of speakers... None
unattested unat1236 Unattested A language arguably different from all other e... None
unclassifiable uncl1493 Unclassifiable A language arguably different from all other e... None
In [41]:
%time treedb.print_languoid_stats()
BEGIN (implicit)
WITH RECURSIVE tree(parent_id, child_id) AS 
(SELECT root.id AS parent_id, root.id AS child_id 
FROM languoid AS root 
WHERE root.parent_id IS NULL UNION ALL SELECT tree.parent_id AS parent_id, child.id AS child_id 
FROM tree JOIN languoid AS child ON tree.child_id = child.parent_id)
 SELECT ? AS kind, count(*) AS n 
FROM languoid UNION ALL SELECT ? AS kind, count(*) AS n 
FROM languoid 
WHERE languoid.level = ? AND languoid.parent_id IS NULL UNION ALL SELECT ? AS kind, count(*) AS n 
FROM languoid 
WHERE languoid.level = ? AND languoid.parent_id IS NULL UNION ALL SELECT ? AS kind, count(*) AS n 
FROM languoid 
WHERE languoid.parent_id IS NULL UNION ALL SELECT ? AS kind, count(*) AS n 
FROM languoid 
WHERE languoid.level = ? UNION ALL SELECT ? AS kind, count(*) AS n 
FROM languoid 
WHERE languoid.level = ? AND languoid.parent_id IS NOT NULL UNION ALL SELECT ? AS kind, count(*) AS n 
FROM languoid 
WHERE languoid.level = ? UNION ALL SELECT ? AS kind, count(*) AS n 
FROM tree JOIN languoid AS root ON tree.parent_id = root.id JOIN languoid AS child ON tree.child_id = child.id 
WHERE child.level = ? AND (root.name NOT IN (?, ?, ?, ?, ?, ?, ?, ?)) UNION ALL SELECT ? AS kind, count(*) AS n 
FROM tree JOIN languoid AS root ON tree.parent_id = root.id JOIN languoid AS child ON tree.child_id = child.id 
WHERE child.level = ? AND root.name = ? UNION ALL SELECT ? AS kind, count(*) AS n 
FROM tree JOIN languoid AS root ON tree.parent_id = root.id JOIN languoid AS child ON tree.child_id = child.id 
WHERE child.level = ? AND root.name = ? UNION ALL SELECT ? AS kind, count(*) AS n 
FROM tree JOIN languoid AS root ON tree.parent_id = root.id JOIN languoid AS child ON tree.child_id = child.id 
WHERE child.level = ? AND root.name = ? UNION ALL SELECT ? AS kind, count(*) AS n 
FROM tree JOIN languoid AS root ON tree.parent_id = root.id JOIN languoid AS child ON tree.child_id = child.id 
WHERE child.level = ? AND root.name = ? UNION ALL SELECT ? AS kind, count(*) AS n 
FROM tree JOIN languoid AS root ON tree.parent_id = root.id JOIN languoid AS child ON tree.child_id = child.id 
WHERE child.level = ? AND root.name = ? UNION ALL SELECT ? AS kind, count(*) AS n 
FROM tree JOIN languoid AS root ON tree.parent_id = root.id JOIN languoid AS child ON tree.child_id = child.id 
WHERE child.level = ? AND root.name = ? UNION ALL SELECT ? AS kind, count(*) AS n 
FROM tree JOIN languoid AS root ON tree.parent_id = root.id JOIN languoid AS child ON tree.child_id = child.id 
WHERE child.level = ? AND root.name = ? UNION ALL SELECT ? AS kind, count(*) AS n 
FROM tree JOIN languoid AS root ON tree.parent_id = root.id JOIN languoid AS child ON tree.child_id = child.id 
WHERE child.level = ? AND root.name != ? UNION ALL SELECT ? AS kind, count(*) AS n 
FROM tree JOIN languoid AS root ON tree.parent_id = root.id JOIN languoid AS child ON tree.child_id = child.id 
WHERE child.level = ? AND root.name = ?
[generated in 0.00094s] ('languoids', 'families', 'family', 'isolates', 'language', 'roots', 'languages', 'language', 'subfamilies', 'family', 'dialects', 'dialect', 'Spoken L1 Languages', 'language', 'Sign Language', 'Unclassifiable', 'Pidgin', 'Unattested', 'Artificial Language', 'Mixed Language', 'Speech Register', 'Bookkeeping', 'Sign Language', 'language', 'Sign Language', 'Unclassifiable', 'language', 'Unclassifiable', 'Pidgin', 'language', 'Pidgin', 'Unattested', 'language', 'Unattested', 'Artificial Language', 'language', 'Artificial Language', 'Mixed Language', 'language', 'Mixed Language', 'Speech Register', 'language', 'Speech Register', 'All', 'language', 'Bookkeeping', 'Bookkeeping', 'language', 'Bookkeeping')
26,285 languoids
   245 families
   182 isolates
   427 roots
 8,565 languages
 4,388 subfamilies
13,087 dialects
 7,628 Spoken L1 Languages
   215 Sign Language
   121 Unclassifiable
    84 Pidgin
    68 Unattested
    32 Artificial Language
    10 Mixed Language
    15 Speech Register
ROLLBACK
 8,173 All
   392 Bookkeeping
CPU times: total: 375 ms
Wall time: 369 ms
In [42]:
read_sql(sa.select(Languoid).limit(5), index_col='id')
BEGIN (implicit)
SELECT languoid.id, languoid.name, languoid.level, languoid.parent_id, languoid.hid, languoid.iso639_3, languoid.latitude, languoid.longitude 
FROM languoid
 LIMIT ? OFFSET ?
[generated in 0.00084s] (5, 0)
ROLLBACK
Out[42]:
name level parent_id hid iso639_3 latitude longitude
id
3adt1234 3Ad-Tekles dialect nort3292 None None NaN NaN
aala1237 Aalawa dialect ramo1244 None None NaN NaN
aant1238 Aantantara dialect nort2920 None None NaN NaN
aari1238 Aari-Gayil family ahkk1235 aiz aiz NaN NaN
aari1239 Aari language aari1238 aiw aiw 5.95034 36.5721
In [43]:
read_sql(sa.select(Languoid).order_by('id').limit(5), index_col='id')
BEGIN (implicit)
SELECT languoid.id, languoid.name, languoid.level, languoid.parent_id, languoid.hid, languoid.iso639_3, languoid.latitude, languoid.longitude 
FROM languoid ORDER BY languoid.id
 LIMIT ? OFFSET ?
[generated in 0.00062s] (5, 0)
ROLLBACK
Out[43]:
name level parent_id hid iso639_3 latitude longitude
id
3adt1234 3Ad-Tekles dialect nort3292 None None NaN NaN
aala1237 Aalawa dialect ramo1244 None None NaN NaN
aant1238 Aantantara dialect nort2920 None None NaN NaN
aari1238 Aari-Gayil family ahkk1235 aiz aiz NaN NaN
aari1239 Aari language aari1238 aiw aiw 5.95034 36.5721
In [44]:
read_sql(sa.select(Languoid).order_by('name').limit(5), index_col='id')
BEGIN (implicit)
SELECT languoid.id, languoid.name, languoid.level, languoid.parent_id, languoid.hid, languoid.iso639_3, languoid.latitude, languoid.longitude 
FROM languoid ORDER BY languoid.name
 LIMIT ? OFFSET ?
[generated in 0.00085s] (5, 0)
ROLLBACK
Out[44]:
name level parent_id hid iso639_3 latitude longitude
id
gane1238 !Gã!ne language east2867 NOCODE_Gane None -31.3200 28.7500
oung1238 !O!ung language book1242 oun oun -15.3000 14.3500
kwii1241 !Ui family tuuu1241 None None NaN NaN
abda1238 'Abd Al-Kuri dialect soqo1240 None None 12.1959 52.2282
aden1242 'Aden dialect jude1267 None None NaN NaN
In [45]:
Child, Parent = (sa.orm.aliased(Languoid, name=n) for n in ('child', 'parent'))

select_parent_levels = (sa.select(Child.level.label('child_level'),
                                  Parent.level.label('parent_level'),
                                  sa.func.count().label('n_languoids'))
                        .outerjoin_from(Child, Parent, Child.parent_id == Parent.id)
                        .group_by(Child.level, Parent.level)
                        .order_by('child_level', 'parent_level'))

(read_sql(select_parent_levels, index_col=['child_level', 'parent_level'])
 .unstack(fill_value=0).assign(all=lambda x: x.sum(axis='columns'))
 .pipe(lambda x: pd.concat([x, x.sum().rename('all').to_frame().T.rename_axis('child_level')])))
BEGIN (implicit)
SELECT child.level AS child_level, parent.level AS parent_level, count(*) AS n_languoids 
FROM languoid AS child LEFT OUTER JOIN languoid AS parent ON child.parent_id = parent.id GROUP BY child.level, parent.level ORDER BY child_level, parent_level
[generated in 0.00091s] ()
ROLLBACK
Out[45]:
n_languoids all
parent_level NaN dialect family language
child_level
dialect 0 2310 0 10777 13087
family 245 0 4388 0 4633
language 182 0 8383 0 8565
all 427 2310 12771 10777 26285
In [46]:
select_lang_nisos = (sa.select(Languoid.level.label('level'),
                              sa.func.count().label('n_languoids'),
                              sa.func.count(Languoid.iso639_3).label('n_isos'))
                    .group_by(Languoid.level)
                    .order_by('level'))

(read_sql(select_lang_nisos, index_col='level')
 .assign(ratio=lambda x: 100 * x['n_isos'] / x['n_languoids']))
BEGIN (implicit)
SELECT languoid.level AS level, count(*) AS n_languoids, count(languoid.iso639_3) AS n_isos 
FROM languoid GROUP BY languoid.level ORDER BY level
[generated in 0.00076s] ()
ROLLBACK
Out[46]:
n_languoids n_isos ratio
level
dialect 13087 249 1.902651
family 4633 44 0.949709
language 8565 7837 91.500292
In [47]:
select_lang_nlocations = (sa.select(Languoid.level.label('level'),
                                    sa.func.count().label('n_languoids'),
                                    sa.func.count(Languoid.latitude).label('n_locations'))
                          .group_by(Languoid.level)
                          .order_by('level'))

(read_sql(select_lang_nlocations, index_col='level')
 .assign(ratio=lambda x: 100 * x['n_locations'] / x['n_languoids']))
BEGIN (implicit)
SELECT languoid.level AS level, count(*) AS n_languoids, count(languoid.latitude) AS n_locations 
FROM languoid GROUP BY languoid.level ORDER BY level
[generated in 0.00110s] ()
ROLLBACK
Out[47]:
n_languoids n_locations ratio
level
dialect 13087 487 3.721250
family 4633 28 0.604360
language 8565 8240 96.205487
In [48]:
select_latlon = (sa.select(Languoid.latitude, Languoid.longitude)
                 .select_from(Languoid)
                 .filter_by(level=LANGUAGE))

latitudes, longitudes = zip(*treedb.iterrows(select_latlon))
    
plt.figure(figsize=(12, 6))
plt.axis([-180, 180, -90, 90])
plt.xticks(range(-180, 181, 60))
plt.yticks(range(-90, 91, 30))
plt.scatter(longitudes, latitudes, 1, 'black');
BEGIN (implicit)
SELECT languoid.latitude, languoid.longitude 
FROM languoid 
WHERE languoid.level = ?
[generated in 0.00092s] ('language',)
ROLLBACK
In [49]:
Family, Child, family_child = treedb.Languoid.parent_descendant(parent_root=True,
                                                                parent_level=FAMILY)

n_languages = sa.func.count(Child.id).label('n_languages')

select_family_nlanguages = (sa.select(Family.id.label('family_id'), n_languages)
                            .select_from(family_child)
                            .where(Child.level == LANGUAGE)
                            .group_by(Family.id))

select_top_families = (select_family_nlanguages
                       .having(n_languages >= 100)
                       .order_by('n_languages'))

(read_sql(select_top_families, index_col='family_id')
 .plot.barh());
BEGIN (implicit)
WITH RECURSIVE tree(parent_id, child_id) AS 
(SELECT root.id AS parent_id, child.id AS child_id 
FROM languoid AS root LEFT OUTER JOIN languoid AS child ON root.id = child.parent_id 
WHERE root.parent_id IS NULL AND root.level = ? UNION ALL SELECT tree.parent_id AS parent_id, child.id AS child_id 
FROM tree JOIN languoid AS child ON tree.child_id = child.parent_id)
 SELECT root.id AS family_id, count(child.id) AS n_languages 
FROM tree JOIN languoid AS root ON tree.parent_id = root.id LEFT OUTER JOIN languoid AS child ON tree.child_id = child.id 
WHERE child.level = ? GROUP BY root.id 
HAVING count(child.id) >= ? ORDER BY n_languages
[generated in 0.00077s] ('family', 'language', 100)
ROLLBACK
In [50]:
family_size = (select_family_nlanguages
               .alias('family_nlanguages')
               .c.n_languages.label('family_size'))

select_family_sizes = (sa.select(family_size, sa.func.count().label('n_families'))
                       .group_by(family_size)
                       .order_by('family_size'))

_ = read_sql(select_family_sizes, index_col='family_size')

(_.plot.area(figsize=(12, 3), logx=True)
 .xaxis.set_major_formatter(mpl.ticker.ScalarFormatter()))
(_.index.to_series().describe()
 .to_frame().transpose()[['count', 'min', 'max']])
BEGIN (implicit)
WITH RECURSIVE tree(parent_id, child_id) AS 
(SELECT root.id AS parent_id, child.id AS child_id 
FROM languoid AS root LEFT OUTER JOIN languoid AS child ON root.id = child.parent_id 
WHERE root.parent_id IS NULL AND root.level = ? UNION ALL SELECT tree.parent_id AS parent_id, child.id AS child_id 
FROM tree JOIN languoid AS child ON tree.child_id = child.parent_id)
 SELECT family_nlanguages.n_languages AS family_size, count(*) AS n_families 
FROM (SELECT root.id AS family_id, count(child.id) AS n_languages 
FROM tree JOIN languoid AS root ON tree.parent_id = root.id LEFT OUTER JOIN languoid AS child ON tree.child_id = child.id 
WHERE child.level = ? GROUP BY root.id) AS family_nlanguages GROUP BY family_nlanguages.n_languages ORDER BY family_size
[generated in 0.00084s] ('family', 'language')
ROLLBACK
Out[50]:
count min max
family_size 58.0 2.0 1406.0
In [51]:
levels = (sa.union_all(*[sa.select(sa.literal(l).label('level')) for l in LEVEL])
          .cte(name='levels'))

select_parent_nchildren = (sa.select(Parent.id.label('parent_id'),
                                     Parent.level.label('parent_level'),
                                     levels.c.level.label('child_level'),
                                     sa.func.count(Child.id).label('n_children'))
                           .join_from(Parent, levels, sa.true())
                           .outerjoin(Child, sa.and_(Parent.id == Child.parent_id, 
                                                     Child.level == levels.c.level))
                           .group_by(Parent.id, Parent.level, levels.c.level))

(read_sql(select_parent_nchildren)
 .pivot_table(index='parent_level', columns='child_level', values='n_children',
              aggfunc=['sum', 'max', 'mean'], fill_value=0))
BEGIN (implicit)
WITH levels AS 
(SELECT ? AS level UNION ALL SELECT ? AS level UNION ALL SELECT ? AS level)
 SELECT parent.id AS parent_id, parent.level AS parent_level, levels.level AS child_level, count(child.id) AS n_children 
FROM languoid AS parent JOIN levels ON 1 = 1 LEFT OUTER JOIN languoid AS child ON parent.id = child.parent_id AND child.level = levels.level GROUP BY parent.id, parent.level, levels.level
[generated in 0.00093s] ('family', 'language', 'dialect')
ROLLBACK
Out[51]:
sum max mean
child_level dialect family language dialect family language dialect family language
parent_level
dialect 2310 0 0 22 0 0 0.176511 0.000000 0.000000
family 0 4388 8383 0 46 379 0.000000 0.947118 1.809411
language 10777 0 0 32 0 0 1.258260 0.000000 0.000000
In [52]:
Parent, Child, parent_child = treedb.Languoid.parent_descendant()

select_parent_ndescendants = (sa.select(Parent.id.label('parent_id'),
                                        Parent.level.label('parent_level'),
                                        sa.func.count(Child.id).label('n_descendants'))
                              .select_from(parent_child)
                              .group_by(Parent.id, Parent.level)
                              .alias('parent_ndescendants'))

parent_level = select_parent_ndescendants.c.parent_level

n_descendants = select_parent_ndescendants.c.n_descendants

d_min = sa.func.min(n_descendants).label('min')
d_max = sa.func.max(n_descendants).label('max')
d_mean = (sa.func.sum(n_descendants).cast(sa.Float) / sa.func.count(n_descendants)).label('mean')

select_level_mean_descendants = (sa.select(parent_level, d_min, d_max, d_mean)
                                 .group_by(parent_level)
                                 .order_by(parent_level))

select_total_mean_descendants = sa.select(sa.literal('total').label('parent_level'),
                                          d_min, d_max, d_mean)

select_mean_descendants = [select_level_mean_descendants, select_total_mean_descendants]

_ = pd.concat([read_sql(q, index_col='parent_level') for q in select_mean_descendants])
_.columns = pd.MultiIndex.from_product([['n_descendants'], _.columns])
_
BEGIN (implicit)
WITH RECURSIVE tree(parent_id, child_id) AS 
(SELECT parent.id AS parent_id, child.id AS child_id 
FROM languoid AS parent LEFT OUTER JOIN languoid AS child ON parent.id = child.parent_id UNION ALL SELECT tree.parent_id AS parent_id, child.id AS child_id 
FROM tree JOIN languoid AS child ON tree.child_id = child.parent_id)
 SELECT parent_ndescendants.parent_level, min(parent_ndescendants.n_descendants) AS min, max(parent_ndescendants.n_descendants) AS max, CAST(sum(parent_ndescendants.n_descendants) AS FLOAT) / count(parent_ndescendants.n_descendants) AS mean 
FROM (SELECT parent.id AS parent_id, parent.level AS parent_level, count(child.id) AS n_descendants 
FROM tree JOIN languoid AS parent ON tree.parent_id = parent.id LEFT OUTER JOIN languoid AS child ON tree.child_id = child.id GROUP BY parent.id, parent.level) AS parent_ndescendants GROUP BY parent_ndescendants.parent_level ORDER BY parent_ndescendants.parent_level
[generated in 0.00083s] ()
ROLLBACK
BEGIN (implicit)
WITH RECURSIVE tree(parent_id, child_id) AS 
(SELECT parent.id AS parent_id, child.id AS child_id 
FROM languoid AS parent LEFT OUTER JOIN languoid AS child ON parent.id = child.parent_id UNION ALL SELECT tree.parent_id AS parent_id, child.id AS child_id 
FROM tree JOIN languoid AS child ON tree.child_id = child.parent_id)
 SELECT ? AS parent_level, min(parent_ndescendants.n_descendants) AS min, max(parent_ndescendants.n_descendants) AS max, CAST(sum(parent_ndescendants.n_descendants) AS FLOAT) / count(parent_ndescendants.n_descendants) AS mean 
FROM (SELECT parent.id AS parent_id, parent.level AS parent_level, count(child.id) AS n_descendants 
FROM tree JOIN languoid AS parent ON tree.parent_id = parent.id LEFT OUTER JOIN languoid AS child ON tree.child_id = child.id GROUP BY parent.id, parent.level) AS parent_ndescendants
[generated in 0.00066s] ('total',)
ROLLBACK
Out[52]:
n_descendants
min max mean
parent_level
dialect 0 57 0.242989
family 1 4815 35.261170
language 0 125 1.527963
total 0 4815 6.834012

macroarea

In [53]:
from treedb.models import Macroarea, languoid_macroarea

print_sql(Macroarea)
print_sql(languoid_macroarea)
read_sql(sa.select(Macroarea).order_by('name'), index_col='config_section')
BEGIN (implicit)
SELECT sqlite_master.sql 
FROM sqlite_master 
WHERE sqlite_master.type = ? AND sqlite_master.name = ?
[cached since 5.656s ago] ('table', 'macroarea')
SELECT count(*) AS n_rows 
FROM macroarea
[generated in 0.00036s] ()
ROLLBACK
CREATE TABLE macroarea (
	name VARCHAR NOT NULL CHECK (name != ''), 
	config_section VARCHAR NOT NULL CHECK (config_section != ''), 
	description TEXT NOT NULL CHECK (description != ''), 
	PRIMARY KEY (name), 
	UNIQUE (config_section)
) WITHOUT ROWID
6
BEGIN (implicit)
SELECT sqlite_master.sql 
FROM sqlite_master 
WHERE sqlite_master.type = ? AND sqlite_master.name = ?
[cached since 5.662s ago] ('table', 'languoid_macroarea')
SELECT count(*) AS n_rows 
FROM languoid_macroarea
[generated in 0.00027s] ()
ROLLBACK
CREATE TABLE languoid_macroarea (
	languoid_id VARCHAR(8) NOT NULL, 
	macroarea_name VARCHAR NOT NULL, 
	PRIMARY KEY (languoid_id, macroarea_name), 
	FOREIGN KEY(languoid_id) REFERENCES languoid (id), 
	FOREIGN KEY(macroarea_name) REFERENCES macroarea (name)
) WITHOUT ROWID
21616
BEGIN (implicit)
SELECT macroarea.name, macroarea.config_section, macroarea.description 
FROM macroarea ORDER BY macroarea.name
[generated in 0.00073s] ()
ROLLBACK
Out[53]:
name description
config_section
africa Africa The continent
australia Australia The continent
eurasia Eurasia The Eurasian landmass North of Sinai. Includes...
northamerica North America North and Middle America up to Panama. Include...
pacific Papunesia All islands between Sumatra and the Americas, ...
southamerica South America Everything South of Darién.
In [54]:
select_macroarea_nlanguages = (sa.select(Macroarea.name.label('macroarea'),
                                         sa.func.count().label('n_languages'))
                               .outerjoin_from(Macroarea, languoid_macroarea).join(Languoid)
                               .filter_by(level=LANGUAGE)
                               .group_by(Macroarea.name)
                               .order_by(sa.desc('n_languages')))

_ = read_sql(select_macroarea_nlanguages, index_col='macroarea')
_.plot.pie(y='n_languages')
_
BEGIN (implicit)
SELECT macroarea.name AS macroarea, count(*) AS n_languages 
FROM macroarea LEFT OUTER JOIN languoid_macroarea ON macroarea.name = languoid_macroarea.macroarea_name JOIN languoid ON languoid.id = languoid_macroarea.languoid_id 
WHERE languoid.level = ? GROUP BY macroarea.name ORDER BY n_languages DESC
[generated in 0.00114s] ('language',)
ROLLBACK
Out[54]:
n_languages
macroarea
Africa 2365
Papunesia 2209
Eurasia 2001
North America 792
South America 716
Australia 388
In [55]:
macroareas = sa.func.group_concat(Macroarea.name, ', ').label('macroareas')

select_multiarea_languages = (sa.select(Languoid.id, Languoid.name, macroareas)
                              .select_from(Languoid)
                              .filter_by(level=LANGUAGE)
                              .join(languoid_macroarea).join(Macroarea)
                              .group_by(Languoid.id)
                              .having(sa.func.count() > 1)
                              .order_by('id'))
    
assert read_sql(select_multiarea_languages).empty
BEGIN (implicit)
SELECT languoid.id, languoid.name, group_concat(macroarea.name, ?) AS macroareas 
FROM languoid JOIN languoid_macroarea ON languoid.id = languoid_macroarea.languoid_id JOIN macroarea ON macroarea.name = languoid_macroarea.macroarea_name 
WHERE languoid.level = ? GROUP BY languoid.id 
HAVING count(*) > ? ORDER BY languoid.id
[generated in 0.00083s] (', ', 'language', 1)
ROLLBACK

country

In [56]:
from treedb.models import Country, languoid_country

print_sql(Country)
print_sql(languoid_country)
BEGIN (implicit)
SELECT sqlite_master.sql 
FROM sqlite_master 
WHERE sqlite_master.type = ? AND sqlite_master.name = ?
[cached since 5.862s ago] ('table', 'country')
SELECT count(*) AS n_rows 
FROM country
[generated in 0.00048s] ()
ROLLBACK
CREATE TABLE country (
	id VARCHAR(2) NOT NULL CHECK (length(id) = 2), 
	name TEXT NOT NULL CHECK (name != ''), 
	PRIMARY KEY (id), 
	UNIQUE (name)
) WITHOUT ROWID
246
BEGIN (implicit)
SELECT sqlite_master.sql 
FROM sqlite_master 
WHERE sqlite_master.type = ? AND sqlite_master.name = ?
[cached since 5.869s ago] ('table', 'languoid_country')
SELECT count(*) AS n_rows 
FROM languoid_country
[generated in 0.00038s] ()
ROLLBACK
CREATE TABLE languoid_country (
	languoid_id VARCHAR(8) NOT NULL, 
	country_id VARCHAR(2) NOT NULL, 
	PRIMARY KEY (languoid_id, country_id), 
	FOREIGN KEY(languoid_id) REFERENCES languoid (id), 
	FOREIGN KEY(country_id) REFERENCES country (id)
) WITHOUT ROWID
11463
In [57]:
select_country_nlanguages = (sa.select(Country.name.label('country'),
                                       sa.func.count().label('n_languages'))
                             .outerjoin_from(Country, languoid_country).join(Languoid)
                             .filter_by(level=LANGUAGE)
                             .group_by(Country.id)
                             .order_by(sa.desc('n_languages'))
                             .limit(10))

read_sql(select_country_nlanguages, index_col='country')
BEGIN (implicit)
SELECT country.name AS country, count(*) AS n_languages 
FROM country LEFT OUTER JOIN languoid_country ON country.id = languoid_country.country_id JOIN languoid ON languoid.id = languoid_country.languoid_id 
WHERE languoid.level = ? GROUP BY country.id ORDER BY n_languages DESC
 LIMIT ? OFFSET ?
[generated in 0.00092s] ('language', 10, 0)
ROLLBACK
Out[57]:
n_languages
country
Papua New Guinea 896
Indonesia 748
Nigeria 581
India 508
China 406
Australia 398
Mexico 333
Brazil 330
United States 322
Cameroon 315
In [58]:
select_lang_country = (sa.select(Languoid.id, Languoid.name, Country.id.label('country'))
                       .select_from(Languoid)
                       .filter_by(level=LANGUAGE)
                       .join(languoid_country).join(Country)
                       .order_by(Languoid.id, 'country')
                       .alias('lang_country'))

countries = sa.func.group_concat(select_lang_country.c.country, ', ').label('countries')

select_multicountry_languages = (sa.select(select_lang_country.c.id,
                                           select_lang_country.c.name,
                                           sa.func.count().label('n_countries'),
                                           countries)
                                 .group_by(select_lang_country.c.id,
                                           select_lang_country.c.name)
                                 .having(sa.func.count() > 1)
                                 .order_by(sa.desc('n_countries'),
                                           select_lang_country.c.id)
                                 .limit(10))
    
read_sql(select_multicountry_languages, index_col='id')
BEGIN (implicit)
SELECT lang_country.id, lang_country.name, count(*) AS n_countries, group_concat(lang_country.country, ?) AS countries 
FROM (SELECT languoid.id AS id, languoid.name AS name, country.id AS country 
FROM languoid JOIN languoid_country ON languoid.id = languoid_country.languoid_id JOIN country ON country.id = languoid_country.country_id 
WHERE languoid.level = ? ORDER BY languoid.id, country) AS lang_country GROUP BY lang_country.id, lang_country.name 
HAVING count(*) > ? ORDER BY n_countries DESC, lang_country.id
 LIMIT ? OFFSET ?
[generated in 0.00105s] (', ', 'language', 1, 10, 0)
ROLLBACK
Out[58]:
name n_countries countries
id
stan1293 English 34 AU, BM, BR, BZ, CA, CC, CK, CU, CX, DO, FK, GB...
stan1288 Spanish 30 AD, AR, BO, BR, BZ, CL, CO, CR, CU, DO, EC, ES...
stan1318 Standard Arabic 25 AE, BH, DJ, DZ, EG, ER, IL, IQ, JO, KM, KW, LB...
amer1248 American Sign Language 22 BB, BF, BJ, BO, CA, CD, CF, CI, CN, GA, GH, JM...
east2295 Eastern Yiddish 22 AR, AU, BE, BY, CA, CR, EE, HU, IL, LT, LV, MD...
russ1263 Russian 20 BY, CN, EE, FI, GE, KG, KP, KZ, LT, LV, MD, MN...
stan1295 German 16 AT, BE, CH, CZ, DE, DK, FR, HU, IT, LI, LU, NL...
port1283 Portuguese 15 AD, AR, BO, BR, CO, ES, FR, GF, GY, PE, PT, PY...
sint1235 Sinte-Manus Romani 13 AT, CH, CZ, DE, FR, HR, IT, KZ, ME, NL, PL, RS...
nucl1301 Turkish 12 AL, AM, AZ, BG, CY, GE, GR, IQ, IR, MK, SY, TR
In [59]:
n_countries = (sa.select(Languoid.id, sa.func.count().label('n_countries'))
               .select_from(Languoid)
               .filter_by(level=LANGUAGE)
               .outerjoin(languoid_country)
               .group_by(Languoid.id)
               .alias('language_ncountries')
               .c.n_countries)

select_lc_dist = (sa.select(n_countries, sa.func.count().label('n_languages'))
                  .group_by(n_countries)
                  .order_by('n_countries'))

_ = read_sql(select_lc_dist, index_col='n_countries')

_.plot.bar(figsize=(12, 3))

(read_sql(sa.select(sa.literal('n_countries').label('value'),
                   sa.func.count().label('count'),
                   sa.func.sum(n_countries).label('sum'),
                   sa.func.min(n_countries).label('min'),
                   sa.func.max(n_countries).label('max'),
                   sa.func.avg(n_countries).label('mean')),
          index_col='value')
 .rename_axis(None))
BEGIN (implicit)
SELECT language_ncountries.n_countries, count(*) AS n_languages 
FROM (SELECT languoid.id AS id, count(*) AS n_countries 
FROM languoid LEFT OUTER JOIN languoid_country ON languoid.id = languoid_country.languoid_id 
WHERE languoid.level = ? GROUP BY languoid.id) AS language_ncountries GROUP BY language_ncountries.n_countries ORDER BY language_ncountries.n_countries
[generated in 0.00098s] ('language',)
ROLLBACK
BEGIN (implicit)
SELECT ? AS value, count(*) AS count, sum(language_ncountries.n_countries) AS sum, min(language_ncountries.n_countries) AS min, max(language_ncountries.n_countries) AS max, avg(language_ncountries.n_countries) AS mean 
FROM (SELECT languoid.id AS id, count(*) AS n_countries 
FROM languoid LEFT OUTER JOIN languoid_country ON languoid.id = languoid_country.languoid_id 
WHERE languoid.level = ? GROUP BY languoid.id) AS language_ncountries
[generated in 0.00108s] ('n_countries', 'language')
ROLLBACK
Out[59]:
count sum min max mean
n_countries 8565 11020 1 34 1.286632

altname

In [60]:
from treedb.models import Altname, AltnameProvider

select_provider_nlanguoids = (sa.select(AltnameProvider.name.label('altname_provider'),
                                        sa.func.count(sa.distinct(Altname.languoid_id)).label('n_languoids'))
                              .join_from(AltnameProvider, Altname)
                              .group_by(AltnameProvider.name)
                              .order_by('n_languoids', 'altname_provider'))

(read_sql(select_provider_nlanguoids, index_col='altname_provider')
 .plot.barh());
BEGIN (implicit)
SELECT altnameprovider.name AS altname_provider, count(DISTINCT altname.languoid_id) AS n_languoids 
FROM altnameprovider JOIN altname ON altnameprovider.id = altname.provider_id GROUP BY altnameprovider.name ORDER BY n_languoids, altname_provider
[generated in 0.00095s] ()
ROLLBACK
In [61]:
n_altnames = (sa.select(Languoid.id, sa.func.count().label('n_altnames'))
              .outerjoin_from(Languoid, Altname)
              .group_by(Languoid.id)
              .alias('languoid_naltnames')
              .c.n_altnames)

select_la_dist = (sa.select(n_altnames, sa.func.count().label('n_languoids'))
                  .group_by(n_altnames)
                  .order_by(n_altnames))

_ = read_sql(select_la_dist, index_col='n_altnames')

_.plot.area(figsize=(12, 3))

(read_sql(sa.select(sa.literal('n_altnames').label('value'),
                    sa.func.count().label('count'),
                    sa.func.sum(n_altnames).label('sum'),
                    sa.func.min(n_altnames).label('min'),
                    sa.func.max(n_altnames).label('max'),
                    sa.func.avg(n_altnames).label('mean')),
         index_col='value')
 .rename_axis(None))
BEGIN (implicit)
SELECT languoid_naltnames.n_altnames, count(*) AS n_languoids 
FROM (SELECT languoid.id AS id, count(*) AS n_altnames 
FROM languoid LEFT OUTER JOIN altname ON languoid.id = altname.languoid_id GROUP BY languoid.id) AS languoid_naltnames GROUP BY languoid_naltnames.n_altnames ORDER BY languoid_naltnames.n_altnames
[generated in 0.00077s] ()
ROLLBACK
BEGIN (implicit)
SELECT ? AS value, count(*) AS count, sum(languoid_naltnames.n_altnames) AS sum, min(languoid_naltnames.n_altnames) AS min, max(languoid_naltnames.n_altnames) AS max, avg(languoid_naltnames.n_altnames) AS mean 
FROM (SELECT languoid.id AS id, count(*) AS n_altnames 
FROM languoid LEFT OUTER JOIN altname ON languoid.id = altname.languoid_id GROUP BY languoid.id) AS languoid_naltnames
[generated in 0.00072s] ('n_altnames',)
ROLLBACK
Out[61]:
count sum min max mean
n_altnames 26285 166610 1 261 6.338596

source

In [62]:
from treedb.models import Source, SourceProvider

select_provider_nsources = (sa.select(SourceProvider.name.label('provider'),
                                      Languoid.level,
                                      sa.func.count().label('n_sources'))
                            .join_from(SourceProvider, Source).join(Languoid)
                            .group_by(SourceProvider.name, Languoid.level)
                            .order_by('provider', sa.desc('n_sources')))

read_sql(select_provider_nsources, index_col=['provider', 'level'])
BEGIN (implicit)
SELECT sourceprovider.name AS provider, languoid.level, count(*) AS n_sources 
FROM sourceprovider JOIN source ON sourceprovider.id = source.provider_id JOIN languoid ON languoid.id = source.languoid_id GROUP BY sourceprovider.name, languoid.level ORDER BY provider, n_sources DESC
[generated in 0.00080s] ()
ROLLBACK
Out[62]:
n_sources
provider level
glottolog language 193508
family 2464
dialect 1819
In [63]:
n_sources = (sa.select(Languoid.id,
                       sa.func.count(Source.languoid_id).label('n_sources'))
             .outerjoin_from(Languoid, Source)
             .group_by(Languoid.id)
             .alias('lang_nsources')
             .c.n_sources)

select_nsources_nlangs = (sa.select(n_sources, sa.func.count().label('n_languoids'))
                          .group_by(n_sources)
                          .order_by('n_languoids'))

_ = read_sql(select_nsources_nlangs, index_col='n_sources')
(_.groupby(_.index != 0).sum().rename_axis('n_sources')
    .rename(index={False: '= 0', True: '> 0'}).plot.bar(figsize=(4, 3)))
(_.drop(0).plot.area(figsize=(8, 3), logx=True)
    .xaxis.set_major_formatter(mpl.ticker.ScalarFormatter()))

(read_sql(sa.select(sa.literal('n_sources').label('value'),
                    sa.func.count().label('count'),
                    sa.func.sum(n_sources).label('sum'),
                    sa.func.min(n_sources).label('min'),
                    sa.func.max(n_sources).label('max'),
                    sa.func.avg(n_sources).label('mean')),
          index_col='value')
 .rename_axis(None))
BEGIN (implicit)
SELECT lang_nsources.n_sources, count(*) AS n_languoids 
FROM (SELECT languoid.id AS id, count(source.languoid_id) AS n_sources 
FROM languoid LEFT OUTER JOIN source ON languoid.id = source.languoid_id GROUP BY languoid.id) AS lang_nsources GROUP BY lang_nsources.n_sources ORDER BY n_languoids
[generated in 0.00073s] ()
ROLLBACK
BEGIN (implicit)
SELECT ? AS value, count(*) AS count, sum(lang_nsources.n_sources) AS sum, min(lang_nsources.n_sources) AS min, max(lang_nsources.n_sources) AS max, avg(lang_nsources.n_sources) AS mean 
FROM (SELECT languoid.id AS id, count(source.languoid_id) AS n_sources 
FROM languoid LEFT OUTER JOIN source ON languoid.id = source.languoid_id GROUP BY languoid.id) AS lang_nsources
[generated in 0.00098s] ('n_sources',)
ROLLBACK
Out[63]:
count sum min max mean
n_sources 26285 197791 0 1549 7.524862
In [64]:
select_lang_nsources = (sa.select(Languoid.id, Languoid.level,
                                  sa.func.count(Source.languoid_id).label('n_sources'))
                        .outerjoin_from(Languoid, Source)
                        .group_by(Languoid.id, Languoid.level)
                        .alias('lang_nsources'))

select_ln_nlangs = (sa.select(select_lang_nsources.c.level,
                              select_lang_nsources.c.n_sources,
                              sa.func.count().label('n_languoids'))
                    .group_by(select_lang_nsources.c.level, select_lang_nsources.c.n_sources)
                    .order_by('n_languoids'))

_ = read_sql(select_ln_nlangs).pivot(index='n_sources', columns='level', values='n_languoids')
(_.groupby(_.index != 0).sum()[list(treedb.LEVEL)].rename_axis('n_sources')
 .rename(index={False: '= 0', True: '> 0'}).plot.bar(figsize=(4, 3)))
(_.drop(0)[list(treedb.LEVEL)].plot.area(figsize=(8, 3), logx=True)
 .xaxis.set_major_formatter(mpl.ticker.ScalarFormatter()))
BEGIN (implicit)
SELECT lang_nsources.level, lang_nsources.n_sources, count(*) AS n_languoids 
FROM (SELECT languoid.id AS id, languoid.level AS level, count(source.languoid_id) AS n_sources 
FROM languoid LEFT OUTER JOIN source ON languoid.id = source.languoid_id GROUP BY languoid.id, languoid.level) AS lang_nsources GROUP BY lang_nsources.level, lang_nsources.n_sources ORDER BY n_languoids
[generated in 0.00079s] ()
ROLLBACK
In [65]:
from treedb.models import Bibfile, Bibitem

select_bibfile_nused = (sa.select(Bibfile.name.label('bibfile'), sa.func.count().label('n_used'))
                        .join_from(Bibfile, Bibitem)
                        .group_by(Bibfile.name)
                        .order_by(sa.desc('n_used')))

(read_sql(select_bibfile_nused, index_col='bibfile')
 .plot.bar(figsize=(12, 3)));
BEGIN (implicit)
SELECT bibfile.name AS bibfile, count(*) AS n_used 
FROM bibfile JOIN bibitem ON bibfile.id = bibitem.bibfile_id GROUP BY bibfile.name ORDER BY n_used DESC
[generated in 0.00086s] ()
ROLLBACK

classification

In [66]:
from treedb.models import ClassificationRef

select_cr_levels = (sa.select(ClassificationRef.kind, Languoid.level,
                              sa.func.count().label('n_classificationrefs'))
                    .join_from(ClassificationRef, Languoid)
                    .group_by(ClassificationRef.kind, Languoid.level)
                    .order_by('kind', 'level'))

read_sql(select_cr_levels, index_col=['kind', 'level']).unstack()
BEGIN (implicit)
SELECT classificationref.kind, languoid.level, count(*) AS n_classificationrefs 
FROM classificationref JOIN languoid ON languoid.id = classificationref.languoid_id GROUP BY classificationref.kind, languoid.level ORDER BY classificationref.kind, languoid.level
[generated in 0.00088s] ()
ROLLBACK
Out[66]:
n_classificationrefs
level dialect family language
kind
family 8 444 256
sub 114 6286 10213
In [67]:
n_crefs = (sa.select(Languoid.id, sa.func.count().label('n_crefs'))
           .outerjoin_from(Languoid, ClassificationRef)
           .group_by(Languoid.id)
           .alias('lang_ncrefs')
           .c.n_crefs)

select_lcr_dist = (sa.select(n_crefs, sa.func.count().label('n_languoids'))
                   .group_by(n_crefs)
                   .order_by(n_crefs))

_ = read_sql(select_lcr_dist, index_col='n_crefs')

_.plot.area(figsize=(12, 3))

(read_sql(sa.select(sa.literal('n_crefs').label('value'),
                    sa.func.count().label('count'),
                    sa.func.sum(n_crefs).label('sum'),
                    sa.func.min(n_crefs).label('min'),
                    sa.func.max(n_crefs).label('max'),
                    sa.func.avg(n_crefs).label('mean')),
         index_col='value')
 .rename_axis(None))
BEGIN (implicit)
SELECT lang_ncrefs.n_crefs, count(*) AS n_languoids 
FROM (SELECT languoid.id AS id, count(*) AS n_crefs 
FROM languoid LEFT OUTER JOIN classificationref ON languoid.id = classificationref.languoid_id GROUP BY languoid.id) AS lang_ncrefs GROUP BY lang_ncrefs.n_crefs ORDER BY lang_ncrefs.n_crefs
[generated in 0.00100s] ()
ROLLBACK
BEGIN (implicit)
SELECT ? AS value, count(*) AS count, sum(lang_ncrefs.n_crefs) AS sum, min(lang_ncrefs.n_crefs) AS min, max(lang_ncrefs.n_crefs) AS max, avg(lang_ncrefs.n_crefs) AS mean 
FROM (SELECT languoid.id AS id, count(*) AS n_crefs 
FROM languoid LEFT OUTER JOIN classificationref ON languoid.id = classificationref.languoid_id GROUP BY languoid.id) AS lang_ncrefs
[generated in 0.00086s] ('n_crefs',)
ROLLBACK
Out[67]:
count sum min max mean
n_crefs 26285 31512 1 11 1.198859
In [68]:
from treedb.models import Link

print_sql(Link)
BEGIN (implicit)
SELECT sqlite_master.sql 
FROM sqlite_master 
WHERE sqlite_master.type = ? AND sqlite_master.name = ?
[cached since 7.984s ago] ('table', 'link')
SELECT count(*) AS n_rows 
FROM link
[generated in 0.00036s] ()
ROLLBACK
CREATE TABLE link (
	languoid_id VARCHAR(8) NOT NULL, 
	ord INTEGER NOT NULL CHECK (ord >= 1), 
	url TEXT NOT NULL CHECK (url != ''), 
	title TEXT CHECK (title != ''), 
	scheme TEXT, 
	PRIMARY KEY (languoid_id, ord), 
	UNIQUE (languoid_id, url), 
	CHECK (substr(url, 1, length(scheme) + 3) = scheme || '://'), 
	FOREIGN KEY(languoid_id) REFERENCES languoid (id), 
	CHECK (scheme IN ('http', 'https'))
) WITHOUT ROWID
29722
In [69]:
select_scheme_nlinks = (sa.select(Link.scheme.label('link_scheme'), sa.func.count().label('n_links'))
                        .group_by(Link.scheme)
                        .order_by(sa.desc('n_links')))

(read_sql(select_scheme_nlinks, index_col='link_scheme')
 .plot.pie(y='n_links'));
BEGIN (implicit)
SELECT link.scheme AS link_scheme, count(*) AS n_links 
FROM link GROUP BY link.scheme ORDER BY n_links DESC
[generated in 0.00089s] ()
ROLLBACK
In [70]:
from urllib.parse import urlparse

hosts = collections.Counter(urlparse(url).hostname for url, in treedb.iterrows(sa.select(Link.url)))

(pd.DataFrame.from_dict(hosts, orient='index', columns=['n_links'])
 .sort_values(by='n_links')
 .plot.barh());
BEGIN (implicit)
SELECT link.url 
FROM link
[generated in 0.00093s] ()
ROLLBACK

endangerment

In [71]:
from treedb.models import Endangerment, EndangermentStatus, EndangermentSource

print_sql(Endangerment)
print_sql(EndangermentStatus)
print_sql(EndangermentSource)
read_sql(sa.select(EndangermentStatus).order_by('ordinal'), index_col='config_section')
BEGIN (implicit)
SELECT sqlite_master.sql 
FROM sqlite_master 
WHERE sqlite_master.type = ? AND sqlite_master.name = ?
[cached since 8.491s ago] ('table', 'endangerment')
SELECT count(*) AS n_rows 
FROM endangerment
[generated in 0.00045s] ()
ROLLBACK
CREATE TABLE endangerment (
	languoid_id VARCHAR(8) NOT NULL, 
	status VARCHAR NOT NULL, 
	source_id INTEGER NOT NULL, 
	date DATETIME NOT NULL, 
	comment TEXT NOT NULL CHECK (comment != ''), 
	PRIMARY KEY (languoid_id), 
	FOREIGN KEY(languoid_id) REFERENCES languoid (id), 
	FOREIGN KEY(status) REFERENCES endangermentstatus (name), 
	FOREIGN KEY(source_id) REFERENCES endangermentsource (id)
) WITHOUT ROWID
8345
BEGIN (implicit)
SELECT sqlite_master.sql 
FROM sqlite_master 
WHERE sqlite_master.type = ? AND sqlite_master.name = ?
[cached since 8.501s ago] ('table', 'endangermentstatus')
SELECT count(*) AS n_rows 
FROM endangermentstatus
[generated in 0.00036s] ()
ROLLBACK
CREATE TABLE endangermentstatus (
	name VARCHAR NOT NULL CHECK (name != ''), 
	config_section VARCHAR NOT NULL CHECK (config_section != ''), 
	ordinal INTEGER NOT NULL CHECK (ordinal >= 1), 
	egids VARCHAR NOT NULL CHECK (egids != ''), 
	unesco VARCHAR NOT NULL CHECK (unesco != ''), 
	elcat VARCHAR NOT NULL CHECK (elcat != ''), 
	icon VARCHAR NOT NULL CHECK (icon != ''), 
	bibitem_id INTEGER, 
	PRIMARY KEY (name), 
	UNIQUE (config_section), 
	FOREIGN KEY(bibitem_id) REFERENCES bibitem (id)
) WITHOUT ROWID
6
BEGIN (implicit)
SELECT sqlite_master.sql 
FROM sqlite_master 
WHERE sqlite_master.type = ? AND sqlite_master.name = ?
[cached since 8.507s ago] ('table', 'endangermentsource')
SELECT count(*) AS n_rows 
FROM endangermentsource
[generated in 0.00043s] ()
ROLLBACK
CREATE TABLE endangermentsource (
	id INTEGER NOT NULL, 
	name TEXT NOT NULL CHECK (name != ''), 
	bibitem_id INTEGER, 
	pages TEXT CHECK (pages != ''), 
	PRIMARY KEY (id), 
	UNIQUE (bibitem_id, pages), 
	CHECK ((bibitem_id IS NULL) = (pages IS NULL)), 
	UNIQUE (name), 
	FOREIGN KEY(bibitem_id) REFERENCES bibitem (id)
)
52
BEGIN (implicit)
SELECT endangermentstatus.name, endangermentstatus.config_section, endangermentstatus.ordinal, endangermentstatus.egids, endangermentstatus.unesco, endangermentstatus.elcat, endangermentstatus.icon, endangermentstatus.bibitem_id 
FROM endangermentstatus ORDER BY endangermentstatus.ordinal
[generated in 0.00100s] ()
ROLLBACK
Out[71]:
name ordinal egids unesco elcat icon bibitem_id
config_section
safe not endangered 1 <=6a safe at risk c00ff00 1
vulnerable threatened 2 6b vulnerable vulnerable ca0fb75 1
definite shifting 3 7 definitely endangered threatened/endangered sff6600 1
severe moribund 4 8a severely endangered severly endangered dff4400 1
critical nearly extinct 5 8b critically endangered critically endangered tff0000 1
extinct extinct 6 >=9 extinct dormant/awakening f000000 1
In [72]:
e_source = EndangermentSource.name.label('source')

select_source_nendangerments = (sa.select(e_source, sa.func.count().label('n_endangerments'))
                                .join_from(Endangerment, EndangermentSource)
                                .group_by(e_source)
                                .order_by('n_endangerments'))

(read_sql(select_source_nendangerments, index_col='source')
 .plot.barh());
BEGIN (implicit)
SELECT endangermentsource.name AS source, count(*) AS n_endangerments 
FROM endangerment JOIN endangermentsource ON endangermentsource.id = endangerment.source_id GROUP BY endangermentsource.name ORDER BY n_endangerments
[generated in 0.00080s] ()
ROLLBACK

Example query

In [73]:
%time treedb.hash_csv(treedb.get_example_query())
BEGIN (implicit)
WITH RECURSIVE tree(child_id, parent_id, steps, terminal) AS 
(SELECT child.id AS child_id, child.id AS parent_id, ? AS steps, child.parent_id IS NULL AS terminal 
FROM languoid AS child UNION ALL SELECT tree.child_id AS child_id, parent.parent_id AS parent_id, tree.steps + ? AS steps, grandparent.parent_id IS NULL AS terminal 
FROM tree JOIN languoid AS parent ON tree.parent_id = parent.id AND parent.parent_id IS NOT NULL LEFT OUTER JOIN languoid AS grandparent ON parent.parent_id = grandparent.id)
 SELECT languoid.id, languoid.name, languoid.level, languoid.parent_id, (SELECT group_concat(parent_path.path_part, ?) AS path 
FROM (SELECT tree.parent_id AS path_part 
FROM tree 
WHERE tree.child_id = languoid.id ORDER BY tree.steps DESC) AS parent_path) AS path, (SELECT tree.parent_id 
FROM tree 
WHERE tree.child_id = languoid.id AND tree.steps > ? AND tree.terminal = 1) AS family_id, (SELECT tree.parent_id 
FROM tree 
WHERE tree.child_id = languoid.id AND languoid.level = ? AND (EXISTS (SELECT * 
FROM languoid AS ancestor 
WHERE tree.parent_id = ancestor.id AND ancestor.level = ?))) AS dialect_language_id, languoid.hid, languoid.iso639_3, languoid.latitude, languoid.longitude, (SELECT group_concat(lang_ma.macroarea_name, ?) AS macroareas 
FROM (SELECT languoid_macroarea.macroarea_name AS macroarea_name 
FROM languoid_macroarea 
WHERE languoid_macroarea.languoid_id = languoid.id ORDER BY languoid_macroarea.macroarea_name) AS lang_ma) AS macroareas, (SELECT group_concat(lang_country.country_id, ?) AS countries 
FROM (SELECT languoid_country.country_id AS country_id 
FROM languoid_country 
WHERE languoid_country.languoid_id = languoid.id ORDER BY languoid_country.country_id) AS lang_country) AS countries, (SELECT group_concat(lang_link.printf, ?) AS links 
FROM (SELECT CASE WHEN (link.title IS NOT NULL) THEN printf(?, link.title, link.url) ELSE link.url END AS printf 
FROM link 
WHERE link.languoid_id = languoid.id ORDER BY link.ord) AS lang_link) AS links, (SELECT group_concat(lang_source_glottolog.printf, ?) AS sources_glottolog 
FROM (SELECT CASE WHEN (source_glottolog.pages IS NOT NULL AND source_glottolog."trigger" IS NOT NULL) THEN printf(?, source_bibfile.name, source_bibitem.bibkey, source_glottolog.pages, source_glottolog."trigger") WHEN (source_glottolog.pages IS NOT NULL) THEN printf(?, source_bibfile.name, source_bibitem.bibkey, source_glottolog.pages) WHEN (source_glottolog."trigger" IS NOT NULL) THEN printf(?, source_bibfile.name, source_bibitem.bibkey, source_glottolog."trigger") ELSE printf(?, source_bibfile.name, source_bibitem.bibkey) END AS printf 
FROM source AS source_glottolog JOIN sourceprovider AS source_provider ON source_provider.id = source_glottolog.provider_id JOIN bibitem AS source_bibitem ON source_bibitem.id = source_glottolog.bibitem_id JOIN bibfile AS source_bibfile ON source_bibfile.id = source_bibitem.bibfile_id 
WHERE source_glottolog.languoid_id = languoid.id AND source_provider.name = ? ORDER BY source_bibfile.name, source_bibitem.bibkey) AS lang_source_glottolog) AS sources_glottolog, (SELECT group_concat(lang_altname_aiatsis.printf, ?) AS altnames_aiatsis 
FROM (SELECT CASE WHEN (altname_aiatsis.lang = ?) THEN altname_aiatsis.name ELSE printf(?, altname_aiatsis.name, altname_aiatsis.lang) END AS printf 
FROM altname AS altname_aiatsis JOIN altnameprovider AS altname_aiatsis_provider ON altname_aiatsis_provider.id = altname_aiatsis.provider_id 
WHERE altname_aiatsis.languoid_id = languoid.id AND altname_aiatsis_provider.name = ? ORDER BY altname_aiatsis.name, altname_aiatsis.lang) AS lang_altname_aiatsis) AS altnames_aiatsis, (SELECT group_concat(lang_altname_elcat.printf, ?) AS altnames_elcat 
FROM (SELECT CASE WHEN (altname_elcat.lang = ?) THEN altname_elcat.name ELSE printf(?, altname_elcat.name, altname_elcat.lang) END AS printf 
FROM altname AS altname_elcat JOIN altnameprovider AS altname_elcat_provider ON altname_elcat_provider.id = altname_elcat.provider_id 
WHERE altname_elcat.languoid_id = languoid.id AND altname_elcat_provider.name = ? ORDER BY altname_elcat.name, altname_elcat.lang) AS lang_altname_elcat) AS altnames_elcat, (SELECT group_concat(lang_altname_ethnologue.printf, ?) AS altnames_ethnologue 
FROM (SELECT CASE WHEN (altname_ethnologue.lang = ?) THEN altname_ethnologue.name ELSE printf(?, altname_ethnologue.name, altname_ethnologue.lang) END AS printf 
FROM altname AS altname_ethnologue JOIN altnameprovider AS altname_ethnologue_provider ON altname_ethnologue_provider.id = altname_ethnologue.provider_id 
WHERE altname_ethnologue.languoid_id = languoid.id AND altname_ethnologue_provider.name = ? ORDER BY altname_ethnologue.name, altname_ethnologue.lang) AS lang_altname_ethnologue) AS altnames_ethnologue, (SELECT group_concat(lang_altname_glottolog.printf, ?) AS altnames_glottolog 
FROM (SELECT CASE WHEN (altname_glottolog.lang = ?) THEN altname_glottolog.name ELSE printf(?, altname_glottolog.name, altname_glottolog.lang) END AS printf 
FROM altname AS altname_glottolog JOIN altnameprovider AS altname_glottolog_provider ON altname_glottolog_provider.id = altname_glottolog.provider_id 
WHERE altname_glottolog.languoid_id = languoid.id AND altname_glottolog_provider.name = ? ORDER BY altname_glottolog.name, altname_glottolog.lang) AS lang_altname_glottolog) AS altnames_glottolog, (SELECT group_concat(lang_altname_hhbib_lgcode.printf, ?) AS altnames_hhbib_lgcode 
FROM (SELECT CASE WHEN (altname_hhbib_lgcode.lang = ?) THEN altname_hhbib_lgcode.name ELSE printf(?, altname_hhbib_lgcode.name, altname_hhbib_lgcode.lang) END AS printf 
FROM altname AS altname_hhbib_lgcode JOIN altnameprovider AS altname_hhbib_lgcode_provider ON altname_hhbib_lgcode_provider.id = altname_hhbib_lgcode.provider_id 
WHERE altname_hhbib_lgcode.languoid_id = languoid.id AND altname_hhbib_lgcode_provider.name = ? ORDER BY altname_hhbib_lgcode.name, altname_hhbib_lgcode.lang) AS lang_altname_hhbib_lgcode) AS altnames_hhbib_lgcode, (SELECT group_concat(lang_altname_lexvo.printf, ?) AS altnames_lexvo 
FROM (SELECT CASE WHEN (altname_lexvo.lang = ?) THEN altname_lexvo.name ELSE printf(?, altname_lexvo.name, altname_lexvo.lang) END AS printf 
FROM altname AS altname_lexvo JOIN altnameprovider AS altname_lexvo_provider ON altname_lexvo_provider.id = altname_lexvo.provider_id 
WHERE altname_lexvo.languoid_id = languoid.id AND altname_lexvo_provider.name = ? ORDER BY altname_lexvo.name, altname_lexvo.lang) AS lang_altname_lexvo) AS altnames_lexvo, (SELECT group_concat("lang_altname_moseley & asher (1994)".printf, ?) AS "altnames_moseley & asher (1994)" 
FROM (SELECT CASE WHEN ("altname_moseley & asher (1994)".lang = ?) THEN "altname_moseley & asher (1994)".name ELSE printf(?, "altname_moseley & asher (1994)".name, "altname_moseley & asher (1994)".lang) END AS printf 
FROM altname AS "altname_moseley & asher (1994)" JOIN altnameprovider AS "altname_moseley & asher (1994)_provider" ON "altname_moseley & asher (1994)_provider".id = "altname_moseley & asher (1994)".provider_id 
WHERE "altname_moseley & asher (1994)".languoid_id = languoid.id AND "altname_moseley & asher (1994)_provider".name = ? ORDER BY "altname_moseley & asher (1994)".name, "altname_moseley & asher (1994)".lang) AS "lang_altname_moseley & asher (1994)") AS "altnames_moseley & asher (1994)", (SELECT group_concat(lang_altname_multitree.printf, ?) AS altnames_multitree 
FROM (SELECT CASE WHEN (altname_multitree.lang = ?) THEN altname_multitree.name ELSE printf(?, altname_multitree.name, altname_multitree.lang) END AS printf 
FROM altname AS altname_multitree JOIN altnameprovider AS altname_multitree_provider ON altname_multitree_provider.id = altname_multitree.provider_id 
WHERE altname_multitree.languoid_id = languoid.id AND altname_multitree_provider.name = ? ORDER BY altname_multitree.name, altname_multitree.lang) AS lang_altname_multitree) AS altnames_multitree, (SELECT group_concat("lang_altname_ruhlen (1987)".printf, ?) AS "altnames_ruhlen (1987)" 
FROM (SELECT CASE WHEN ("altname_ruhlen (1987)".lang = ?) THEN "altname_ruhlen (1987)".name ELSE printf(?, "altname_ruhlen (1987)".name, "altname_ruhlen (1987)".lang) END AS printf 
FROM altname AS "altname_ruhlen (1987)" JOIN altnameprovider AS "altname_ruhlen (1987)_provider" ON "altname_ruhlen (1987)_provider".id = "altname_ruhlen (1987)".provider_id 
WHERE "altname_ruhlen (1987)".languoid_id = languoid.id AND "altname_ruhlen (1987)_provider".name = ? ORDER BY "altname_ruhlen (1987)".name, "altname_ruhlen (1987)".lang) AS "lang_altname_ruhlen (1987)") AS "altnames_ruhlen (1987)", (SELECT group_concat(lang_altname_wals.printf, ?) AS altnames_wals 
FROM (SELECT CASE WHEN (altname_wals.lang = ?) THEN altname_wals.name ELSE printf(?, altname_wals.name, altname_wals.lang) END AS printf 
FROM altname AS altname_wals JOIN altnameprovider AS altname_wals_provider ON altname_wals_provider.id = altname_wals.provider_id 
WHERE altname_wals.languoid_id = languoid.id AND altname_wals_provider.name = ? ORDER BY altname_wals.name, altname_wals.lang) AS lang_altname_wals) AS altnames_wals, (SELECT group_concat("lang_altname_wals other".printf, ?) AS "altnames_wals other" 
FROM (SELECT CASE WHEN ("altname_wals other".lang = ?) THEN "altname_wals other".name ELSE printf(?, "altname_wals other".name, "altname_wals other".lang) END AS printf 
FROM altname AS "altname_wals other" JOIN altnameprovider AS "altname_wals other_provider" ON "altname_wals other_provider".id = "altname_wals other".provider_id 
WHERE "altname_wals other".languoid_id = languoid.id AND "altname_wals other_provider".name = ? ORDER BY "altname_wals other".name, "altname_wals other".lang) AS "lang_altname_wals other") AS "altnames_wals other", (SELECT group_concat(lang_trigger_lgcode."trigger", ?) AS triggers_lgcode 
FROM (SELECT trigger_lgcode."trigger" AS "trigger" 
FROM "trigger" AS trigger_lgcode 
WHERE trigger_lgcode.languoid_id = languoid.id AND trigger_lgcode.field = ? ORDER BY trigger_lgcode.ord) AS lang_trigger_lgcode) AS triggers_lgcode, (SELECT group_concat(lang_trigger_inlg."trigger", ?) AS triggers_inlg 
FROM (SELECT trigger_inlg."trigger" AS "trigger" 
FROM "trigger" AS trigger_inlg 
WHERE trigger_inlg.languoid_id = languoid.id AND trigger_inlg.field = ? ORDER BY trigger_inlg.ord) AS lang_trigger_inlg) AS triggers_inlg, ident_endangeredlanguages.identifier AS identifier_endangeredlanguages, ident_languagelandscape.identifier AS identifier_languagelandscape, ident_multitree.identifier AS identifier_multitree, ident_wals.identifier AS identifier_wals, cc_sub.comment AS classification_sub, (SELECT group_concat(lang_cref_sub.printf, ?) AS classification_subrefs 
FROM (SELECT printf(CASE WHEN (cr_sub.pages IS NOT NULL) THEN ? ELSE ? END, bibfile_cr_sub.name, bibitem_cr_sub.bibkey, cr_sub.pages) AS printf 
FROM classificationref AS cr_sub JOIN bibitem AS bibitem_cr_sub ON bibitem_cr_sub.id = cr_sub.bibitem_id JOIN bibfile AS bibfile_cr_sub ON bibfile_cr_sub.id = bibitem_cr_sub.bibfile_id 
WHERE cr_sub.languoid_id = languoid.id AND cr_sub.kind = ? ORDER BY cr_sub.ord) AS lang_cref_sub) AS classification_subrefs, cc_family.comment AS classification_family, (SELECT group_concat(lang_cref_family.printf, ?) AS classification_familyrefs 
FROM (SELECT printf(CASE WHEN (cr_family.pages IS NOT NULL) THEN ? ELSE ? END, bibfile_cr_family.name, bibitem_cr_family.bibkey, cr_family.pages) AS printf 
FROM classificationref AS cr_family JOIN bibitem AS bibitem_cr_family ON bibitem_cr_family.id = cr_family.bibitem_id JOIN bibfile AS bibfile_cr_family ON bibfile_cr_family.id = bibitem_cr_family.bibfile_id 
WHERE cr_family.languoid_id = languoid.id AND cr_family.kind = ? ORDER BY cr_family.ord) AS lang_cref_family) AS classification_familyrefs, endangerment.status AS endangerment_status, endangerment.date AS endangerment_date, endangerment.comment AS endangerment_comment, CASE WHEN (endangermentsource.bibitem_id IS NULL) THEN endangermentsource.name ELSE printf(?, bibfile_e.name, bibitem_e.bibkey, endangermentsource.pages) END AS endangerment_source, ethnologuecomment.isohid AS elcomment_isohid, ethnologuecomment.comment_type AS elcomment_comment_type, ethnologuecomment.ethnologue_versions AS elcomment_ethnologue_versions, ethnologuecomment.comment AS elcomment_comment, isoretirement.code AS iso_retirement_code, isoretirement.name AS iso_retirement_name, isoretirement.change_request AS iso_retirement_change_request, isoretirement.effective AS iso_retirement_effective, isoretirement.reason AS iso_retirement_reason, isoretirement.remedy AS iso_retirement_remedy, isoretirement.comment AS iso_retirement_comment, (SELECT group_concat(lang_irct.code, ?) AS iso_retirement_change_to 
FROM (SELECT isoretirement_changeto.code AS code 
FROM isoretirement_changeto 
WHERE isoretirement_changeto.languoid_id = isoretirement.languoid_id ORDER BY isoretirement_changeto.ord) AS lang_irct) AS iso_retirement_change_to 
FROM languoid LEFT OUTER JOIN (identifier AS ident_endangeredlanguages JOIN identifiersite AS ident_endangeredlanguages_site ON ident_endangeredlanguages.site_id = ident_endangeredlanguages_site.id) ON ident_endangeredlanguages_site.name = ? AND ident_endangeredlanguages.languoid_id = languoid.id LEFT OUTER JOIN (identifier AS ident_languagelandscape JOIN identifiersite AS ident_languagelandscape_site ON ident_languagelandscape.site_id = ident_languagelandscape_site.id) ON ident_languagelandscape_site.name = ? AND ident_languagelandscape.languoid_id = languoid.id LEFT OUTER JOIN (identifier AS ident_multitree JOIN identifiersite AS ident_multitree_site ON ident_multitree.site_id = ident_multitree_site.id) ON ident_multitree_site.name = ? AND ident_multitree.languoid_id = languoid.id LEFT OUTER JOIN (identifier AS ident_wals JOIN identifiersite AS ident_wals_site ON ident_wals.site_id = ident_wals_site.id) ON ident_wals_site.name = ? AND ident_wals.languoid_id = languoid.id LEFT OUTER JOIN classificationcomment AS cc_sub ON cc_sub.kind = ? AND cc_sub.languoid_id = languoid.id LEFT OUTER JOIN classificationcomment AS cc_family ON cc_family.kind = ? AND cc_family.languoid_id = languoid.id LEFT OUTER JOIN (endangerment JOIN endangermentsource ON endangermentsource.id = endangerment.source_id) ON languoid.id = endangerment.languoid_id LEFT OUTER JOIN (bibitem AS bibitem_e JOIN bibfile AS bibfile_e ON bibfile_e.id = bibitem_e.bibfile_id) ON bibitem_e.id = endangermentsource.bibitem_id LEFT OUTER JOIN ethnologuecomment ON languoid.id = ethnologuecomment.languoid_id LEFT OUTER JOIN isoretirement ON languoid.id = isoretirement.languoid_id ORDER BY languoid.id
[generated in 0.00134s] (0, 1, '/', 0, 'dialect', 'language', ', ', ', ', ', ', '[%s](%s)', ', ', '**%s:%s**:%s<trigger "%s">', '**%s:%s**:%s', '**%s:%s**<trigger "%s">', '**%s:%s**', 'glottolog', ', ', '', '%s [%s]', 'aiatsis', ', ', '', '%s [%s]', 'elcat', ', ', '', '%s [%s]', 'ethnologue', ', ', '', '%s [%s]', 'glottolog', ', ', '', '%s [%s]', 'hhbib_lgcode', ', ', '', '%s [%s]', 'lexvo', ', ', '', '%s [%s]', 'moseley & asher (1994)', ', ', '', '%s [%s]', 'multitree', ', ', '', '%s [%s]', 'ruhlen (1987)', ', ', '', '%s [%s]', 'wals', ', ', '', '%s [%s]', 'wals other', ', ', 'lgcode', ', ', 'inlg', ', ', '**%s:%s**:%s', '**%s:%s**', 'sub', ', ', '**%s:%s**:%s', '**%s:%s**', 'family', '**%s:%s**:%s', ', ', 'endangeredlanguages', 'languagelandscape', 'multitree', 'wals', 'sub', 'family')
[[email protected]] hash rows with 'sha256', csv header: ['id', 'name', 'level', 'parent_id', 'path', 'family_id', 'dialect_language_id', 'hid', 'iso639_3', 'latitude', 'longitude', 'macroareas', 'countries', 'links', 'sources_glottolog', 'altnames_aiatsis', 'altnames_elcat', 'altnames_ethnologue', 'altnames_glottolog', 'altnames_hhbib_lgcode', 'altnames_lexvo', 'altnames_moseley & asher (1994)', 'altnames_multitree', 'altnames_ruhlen (1987)', 'altnames_wals', 'altnames_wals other', 'triggers_lgcode', 'triggers_inlg', 'identifier_endangeredlanguages', 'identifier_languagelandscape', 'identifier_multitree', 'identifier_wals', 'classification_sub', 'classification_subrefs', 'classification_family', 'classification_familyrefs', 'endangerment_status', 'endangerment_date', 'endangerment_comment', 'endangerment_source', 'elcomment_isohid', 'elcomment_comment_type', 'elcomment_ethnologue_versions', 'elcomment_comment', 'iso_retirement_code', 'iso_retirement_name', 'iso_retirement_change_request', 'iso_retirement_effective', 'iso_retirement_reason', 'iso_retirement_remedy', 'iso_retirement_comment', 'iso_retirement_change_to']
ROLLBACK
CPU times: total: 4.5 s
Wall time: 4.49 s
Out[73]:
'4479476b397fa7dfbfd560a4bef5be06513ddc54f7c103d1f565e3a26404a90f'