Record properties of the treedb.sqlite3 database
loaded from a Glottolog
master repo commit,
and monitor changes by diffing this notebook.
# Notebook setup: enable inline plots, opt in to SQLAlchemy 2.0 warnings,
# and import the libraries used throughout.
%matplotlib inline
import collections
import os
os.environ['SQLALCHEMY_WARN_20'] = 'true'  # surface SQLAlchemy 2.0 deprecation warnings
import pandas as pd
import sqlalchemy as sa
import matplotlib as mpl
import matplotlib.pyplot as plt
import treedb
import treedb.raw
treedb.configure_logging(log_sql=False)  # quiet for now: no SQL statement logging yet
treedb.print_versions()  # record library versions for reproducibility
# Check out (or clone) the pinned Glottolog release and point treedb at it.
%%time
GLOTTOLOG_TAG = 'v4.6'  # pinned Glottolog release tag
TARGET = '../glottolog/'  # local path of the Glottolog clone
treedb.checkout_or_clone(GLOTTOLOG_TAG, target=TARGET)
treedb.glottolog_version()
treedb.set_root(TARGET)
# Smoke test the checkout: first file and first languoid record.
next(treedb.iterfiles())
dict(treedb.iterlanguoids(limit=1))
%time treedb.checksum(source='files')  # checksum computed directly from the files
# Create/open the SQLite database file and (re)load it from the checkout.
treedb.set_engine('treedb.sqlite3')
%%time
engine = treedb.load(rebuild=False, exclude_raw=False)  # keep raw tables, reuse db if present
engine
engine.file_mtime()
engine.file_size(as_megabytes=True)
%time engine.file_sha256()
# Compare the different checksum flavors (raw tables vs. derived tables vs. files).
%time treedb.raw.checksum()
%time treedb.raw.checksum(weak=True)
%time treedb.checksum(source='tables')
%time treedb.checksum(source='raw')
%time treedb.check()  # run the built-in sanity checks over the loaded database
# Turn on SQL logging and verify the SQLite application_id convention:
# it encodes the Dataset table name as a character ordinal sum.
treedb.configure_logging(level='INFO', log_sql=True)
treedb.scalar(sa.select(sa.func.sqlite_version()))
application_id = treedb.scalar(sa.text('PRAGMA application_id'))
assert application_id == 1122 == 0x462
# 1122 == sum of the character ordinals of '__dataset__'
assert application_id == sum(ord(c) for c in treedb.Dataset.__tablename__)
assert treedb.Dataset.__tablename__ == '__dataset__'
application_id
# Inspect database metadata: per-table row counts, views, and the
# single-row __dataset__ and __producer__ tables.
from treedb import pd_read_sql as read_sql
read_sql(treedb.select_tables_nrows(), index_col='table_name')
treedb.print_rows(treedb.backend.sqlite_master.select_views(),
                  format_='{name}')
from treedb import print_table_sql as print_sql
print_sql('sqlite_master')
treedb.print_dataset()
from treedb import Dataset
print_sql(Dataset)
# The trailing comma asserts the table has exactly one row.
dataset, = treedb.iterrows(sa.select(Dataset), mappings=True)
pd.DataFrame.from_dict(dataset, orient='index',
                       columns=['__dataset__'])
from treedb import Producer
print_sql(Producer)
producer, = treedb.iterrows(sa.select(Producer), mappings=True)
pd.DataFrame.from_dict(producer, orient='index',
                       columns=['__producer__'])
# Inspect the raw tables that mirror the md.ini files:
# File (one row per file), Option (section/option keys), Value (cell values).
treedb.raw
# Fixed: a stray mis-encoded pilcrow ('¶') prefixed the import line,
# making it a SyntaxError.
from treedb.raw import File, Option, Value
for model in (File, Option, Value):
    print_sql(model)
# Peek at the first rows of each raw table.
read_sql(sa.select(File).limit(5), index_col='id')
read_sql(sa.select(Option).limit(5), index_col='id')
read_sql(sa.select(Value).limit(5), index_col=['file_id', 'option_id'])
# All raw option values for a single languoid (glottocode bound at
# execution time), indexed by (section, option, line).
select_file_values = (sa.select(Option.section, Option.option, Value.line, Value.value)
                      .select_from(File)
                      .filter_by(glottocode=sa.bindparam('glottocode'))
                      .join(Value).join(Option))
read_sql(select_file_values, params={'glottocode': 'abin1243'},
         index_col=['section', 'option', 'line'])
# Distribution of file path depth per languoid level,
# plotted absolute and as per-depth percentages.
path_depth = File.path_depth()
select_path_depths = (sa.select(path_depth,
                                treedb.Languoid.level, sa.func.count().label('n_files'))
                      .join_from(File, treedb.Languoid, File.glottocode == treedb.Languoid.id)
                      .group_by(path_depth, treedb.Languoid.level)
                      .order_by('path_depth', 'level'))
_ = (read_sql(select_path_depths, index_col=['path_depth', 'level'])
     .unstack(fill_value=0).droplevel(0, axis='columns')[list(treedb.LEVEL)])
_.plot.bar(stacked=True, figsize=(12, 3))
(100 * _.div(_.sum(axis='columns'), axis='rows')).plot.bar(stacked=True, figsize=(12, 3));
# Distribution of raw file sizes (log-scaled x-axis) plus summary stats.
file_size = File.size.label('file_size')
select_file_sizes = (sa.select(file_size, sa.func.count().label('n_files'))
                     .group_by(file_size)
                     .order_by('file_size'))
_ = read_sql(select_file_sizes, index_col='file_size')
(_.plot.area(figsize=(12, 3), logx=True)
 .xaxis.set_major_formatter(mpl.ticker.ScalarFormatter()))
(_.index.to_series().describe()
 .to_frame().transpose()[['count', 'min', 'max']])
# Distribution of the number of distinct options per file.
file_nvalues = (sa.select(File.glottocode,
                          sa.func.count(Value.option_id.distinct()).label('n_values'))
                .join_from(File, Value)
                .group_by(File.glottocode)
                .alias('file_nvalues')
                .c.n_values)
select_nvalues = (sa.select(file_nvalues, sa.func.count().label('n_files'))
                  .group_by(file_nvalues)
                  .order_by(file_nvalues))
_ = read_sql(select_nvalues, index_col='n_values')
_.plot.bar(figsize=(12, 3))
(_.index.to_series().describe()
 .to_frame().transpose()[['count', 'min', 'max']])
# Distribution of individual value lengths (SQL length()).
value_length = sa.func.length(Value.value).label('value_length')
select_value_length = (sa.select(value_length, sa.func.count().label('n_values'))
                       .group_by(value_length)
                       .order_by('value_length'))
_ = read_sql(select_value_length, index_col='value_length')
(_.plot.area(figsize=(12, 3), logx=True)
 .xaxis.set_major_formatter(mpl.ticker.ScalarFormatter()))
(_.index.to_series().describe()
 .to_frame().transpose()[['count', 'min', 'max']])
%time treedb.raw.print_stats()
# The main Languoid table plus its level and pseudo-family config tables.
from treedb import Languoid
print_sql(Languoid)
from treedb import LEVEL
from treedb.models import LanguoidLevel
FAMILY, LANGUAGE, DIALECT = LEVEL
print_sql(LanguoidLevel)
read_sql(sa.select(LanguoidLevel).order_by('ordinal'), index_col='name')
from treedb.models import PseudoFamily
print_sql(PseudoFamily)
read_sql(sa.select(PseudoFamily).order_by('name'), index_col='config_section')
%time treedb.print_languoid_stats()
# Sample rows under three different orderings.
read_sql(sa.select(Languoid).limit(5), index_col='id')
read_sql(sa.select(Languoid).order_by('id').limit(5), index_col='id')
read_sql(sa.select(Languoid).order_by('name').limit(5), index_col='id')
# Cross-tabulate child level vs. parent level; the outer join keeps
# top-level languoids (NULL parent_level); 'all' totals appended.
Child, Parent = (sa.orm.aliased(Languoid, name=n) for n in ('child', 'parent'))
select_parent_levels = (sa.select(Child.level.label('child_level'),
                                  Parent.level.label('parent_level'),
                                  sa.func.count().label('n_languoids'))
                        .outerjoin_from(Child, Parent, Child.parent_id == Parent.id)
                        .group_by(Child.level, Parent.level)
                        .order_by('child_level', 'parent_level'))
(read_sql(select_parent_levels, index_col=['child_level', 'parent_level'])
 .unstack(fill_value=0).assign(all=lambda x: x.sum(axis='columns'))
 .pipe(lambda x: pd.concat([x, x.sum().rename('all').to_frame().T.rename_axis('child_level')])))
# ISO 639-3 code coverage per level (count(column) skips NULLs).
select_lang_nisos = (sa.select(Languoid.level.label('level'),
                               sa.func.count().label('n_languoids'),
                               sa.func.count(Languoid.iso639_3).label('n_isos'))
                     .group_by(Languoid.level)
                     .order_by('level'))
(read_sql(select_lang_nisos, index_col='level')
 .assign(ratio=lambda x: 100 * x['n_isos'] / x['n_languoids']))
# Coordinate coverage per level (latitude used as the presence marker).
select_lang_nlocations = (sa.select(Languoid.level.label('level'),
                                    sa.func.count().label('n_languoids'),
                                    sa.func.count(Languoid.latitude).label('n_locations'))
                          .group_by(Languoid.level)
                          .order_by('level'))
(read_sql(select_lang_nlocations, index_col='level')
 .assign(ratio=lambda x: 100 * x['n_locations'] / x['n_languoids']))
# Scatter all language coordinates on a plain lat/lon grid.
select_latlon = (sa.select(Languoid.latitude, Languoid.longitude)
                 .select_from(Languoid)
                 .filter_by(level=LANGUAGE))
latitudes, longitudes = zip(*treedb.iterrows(select_latlon))
plt.figure(figsize=(12, 6))
plt.axis([-180, 180, -90, 90])
plt.xticks(range(-180, 181, 60))
plt.yticks(range(-90, 91, 30))
plt.scatter(longitudes, latitudes, 1, 'black');
# Families with at least 100 languages, and the family-size distribution.
Family, Child, family_child = treedb.Languoid.parent_descendant(parent_root=True,
                                                                parent_level=FAMILY)
n_languages = sa.func.count(Child.id).label('n_languages')
select_family_nlanguages = (sa.select(Family.id.label('family_id'), n_languages)
                            .select_from(family_child)
                            .where(Child.level == LANGUAGE)
                            .group_by(Family.id))
select_top_families = (select_family_nlanguages
                       .having(n_languages >= 100)
                       .order_by('n_languages'))
(read_sql(select_top_families, index_col='family_id')
 .plot.barh());
family_size = (select_family_nlanguages
               .alias('family_nlanguages')
               .c.n_languages.label('family_size'))
select_family_sizes = (sa.select(family_size, sa.func.count().label('n_families'))
                       .group_by(family_size)
                       .order_by('family_size'))
_ = read_sql(select_family_sizes, index_col='family_size')
(_.plot.area(figsize=(12, 3), logx=True)
 .xaxis.set_major_formatter(mpl.ticker.ScalarFormatter()))
(_.index.to_series().describe()
 .to_frame().transpose()[['count', 'min', 'max']])
# Direct-children counts per (parent level, child level): cross join each
# parent with all levels so zero-child combinations are kept in the result.
levels = (sa.union_all(*[sa.select(sa.literal(l).label('level')) for l in LEVEL])
          .cte(name='levels'))
select_parent_nchildren = (sa.select(Parent.id.label('parent_id'),
                                     Parent.level.label('parent_level'),
                                     levels.c.level.label('child_level'),
                                     sa.func.count(Child.id).label('n_children'))
                           .join_from(Parent, levels, sa.true())
                           .outerjoin(Child, sa.and_(Parent.id == Child.parent_id,
                                                     Child.level == levels.c.level))
                           .group_by(Parent.id, Parent.level, levels.c.level))
(read_sql(select_parent_nchildren)
 .pivot_table(index='parent_level', columns='child_level', values='n_children',
              aggfunc=['sum', 'max', 'mean'], fill_value=0))
# Min/max/mean number of descendants per parent level, plus a 'total' row.
Parent, Child, parent_child = treedb.Languoid.parent_descendant()
select_parent_ndescendants = (sa.select(Parent.id.label('parent_id'),
                                        Parent.level.label('parent_level'),
                                        sa.func.count(Child.id).label('n_descendants'))
                              .select_from(parent_child)
                              .group_by(Parent.id, Parent.level)
                              .alias('parent_ndescendants'))
parent_level = select_parent_ndescendants.c.parent_level
n_descendants = select_parent_ndescendants.c.n_descendants
d_min = sa.func.min(n_descendants).label('min')
d_max = sa.func.max(n_descendants).label('max')
# cast to Float so the division is not integer division
d_mean = (sa.func.sum(n_descendants).cast(sa.Float) / sa.func.count(n_descendants)).label('mean')
select_level_mean_descendants = (sa.select(parent_level, d_min, d_max, d_mean)
                                 .group_by(parent_level)
                                 .order_by(parent_level))
select_total_mean_descendants = sa.select(sa.literal('total').label('parent_level'),
                                          d_min, d_max, d_mean)
select_mean_descendants = [select_level_mean_descendants, select_total_mean_descendants]
_ = pd.concat([read_sql(q, index_col='parent_level') for q in select_mean_descendants])
_.columns = pd.MultiIndex.from_product([['n_descendants'], _.columns])
_
# Macroareas: config table, m:n link table, and languages per macroarea.
from treedb.models import Macroarea, languoid_macroarea
print_sql(Macroarea)
print_sql(languoid_macroarea)
read_sql(sa.select(Macroarea).order_by('name'), index_col='config_section')
select_macroarea_nlanguages = (sa.select(Macroarea.name.label('macroarea'),
                                         sa.func.count().label('n_languages'))
                               .outerjoin_from(Macroarea, languoid_macroarea).join(Languoid)
                               .filter_by(level=LANGUAGE)
                               .group_by(Macroarea.name)
                               .order_by(sa.desc('n_languages')))
_ = read_sql(select_macroarea_nlanguages, index_col='macroarea')
_.plot.pie(y='n_languages')
_
# Verify that no language is assigned to more than one macroarea.
macroareas = sa.func.group_concat(Macroarea.name, ', ').label('macroareas')
select_multiarea_languages = (sa.select(Languoid.id, Languoid.name, macroareas)
                              .select_from(Languoid)
                              .filter_by(level=LANGUAGE)
                              .join(languoid_macroarea).join(Macroarea)
                              .group_by(Languoid.id)
                              .having(sa.func.count() > 1)
                              .order_by('id'))
assert read_sql(select_multiarea_languages).empty
# Countries: top 10 by number of languages, languages in multiple
# countries, and the per-language country-count distribution with stats.
from treedb.models import Country, languoid_country
print_sql(Country)
print_sql(languoid_country)
select_country_nlanguages = (sa.select(Country.name.label('country'),
                                       sa.func.count().label('n_languages'))
                             .outerjoin_from(Country, languoid_country).join(Languoid)
                             .filter_by(level=LANGUAGE)
                             .group_by(Country.id)
                             .order_by(sa.desc('n_languages'))
                             .limit(10))
read_sql(select_country_nlanguages, index_col='country')
select_lang_country = (sa.select(Languoid.id, Languoid.name, Country.id.label('country'))
                       .select_from(Languoid)
                       .filter_by(level=LANGUAGE)
                       .join(languoid_country).join(Country)
                       .order_by(Languoid.id, 'country')
                       .alias('lang_country'))
countries = sa.func.group_concat(select_lang_country.c.country, ', ').label('countries')
select_multicountry_languages = (sa.select(select_lang_country.c.id,
                                           select_lang_country.c.name,
                                           sa.func.count().label('n_countries'),
                                           countries)
                                 .group_by(select_lang_country.c.id,
                                           select_lang_country.c.name)
                                 .having(sa.func.count() > 1)
                                 .order_by(sa.desc('n_countries'),
                                           select_lang_country.c.id)
                                 .limit(10))
read_sql(select_multicountry_languages, index_col='id')
# Per-language country count (outer join keeps languages with none).
n_countries = (sa.select(Languoid.id, sa.func.count().label('n_countries'))
               .select_from(Languoid)
               .filter_by(level=LANGUAGE)
               .outerjoin(languoid_country)
               .group_by(Languoid.id)
               .alias('language_ncountries')
               .c.n_countries)
select_lc_dist = (sa.select(n_countries, sa.func.count().label('n_languages'))
                  .group_by(n_countries)
                  .order_by('n_countries'))
_ = read_sql(select_lc_dist, index_col='n_countries')
_.plot.bar(figsize=(12, 3))
(read_sql(sa.select(sa.literal('n_countries').label('value'),
                    sa.func.count().label('count'),
                    sa.func.sum(n_countries).label('sum'),
                    sa.func.min(n_countries).label('min'),
                    sa.func.max(n_countries).label('max'),
                    sa.func.avg(n_countries).label('mean')),
          index_col='value')
 .rename_axis(None))
# Alternative names: languoid coverage per altname provider and the
# per-languoid altname-count distribution with summary statistics.
from treedb.models import Altname, AltnameProvider
select_provider_nlanguoids = (sa.select(AltnameProvider.name.label('altname_provider'),
                                        sa.func.count(sa.distinct(Altname.languoid_id)).label('n_languoids'))
                              .join_from(AltnameProvider, Altname)
                              .group_by(AltnameProvider.name)
                              .order_by('n_languoids', 'altname_provider'))
(read_sql(select_provider_nlanguoids, index_col='altname_provider')
 .plot.barh());
# Per-languoid altname count (outer join keeps languoids with none).
n_altnames = (sa.select(Languoid.id, sa.func.count().label('n_altnames'))
              .outerjoin_from(Languoid, Altname)
              .group_by(Languoid.id)
              .alias('languoid_naltnames')
              .c.n_altnames)
select_la_dist = (sa.select(n_altnames, sa.func.count().label('n_languoids'))
                  .group_by(n_altnames)
                  .order_by(n_altnames))
_ = read_sql(select_la_dist, index_col='n_altnames')
_.plot.area(figsize=(12, 3))
(read_sql(sa.select(sa.literal('n_altnames').label('value'),
                    sa.func.count().label('count'),
                    sa.func.sum(n_altnames).label('sum'),
                    sa.func.min(n_altnames).label('min'),
                    sa.func.max(n_altnames).label('max'),
                    sa.func.avg(n_altnames).label('mean')),
          index_col='value')
 .rename_axis(None))
# Sources: counts per provider and level, per-languoid source-count
# distribution (zero vs. non-zero), and the same split out by level.
from treedb.models import Source, SourceProvider
select_provider_nsources = (sa.select(SourceProvider.name.label('provider'),
                                      Languoid.level,
                                      sa.func.count().label('n_sources'))
                            .join_from(SourceProvider, Source).join(Languoid)
                            .group_by(SourceProvider.name, Languoid.level)
                            .order_by('provider', sa.desc('n_sources')))
read_sql(select_provider_nsources, index_col=['provider', 'level'])
# Per-languoid source count (outer join + count(column) yields 0 for none).
n_sources = (sa.select(Languoid.id,
                       sa.func.count(Source.languoid_id).label('n_sources'))
             .outerjoin_from(Languoid, Source)
             .group_by(Languoid.id)
             .alias('lang_nsources')
             .c.n_sources)
select_nsources_nlangs = (sa.select(n_sources, sa.func.count().label('n_languoids'))
                          .group_by(n_sources)
                          .order_by('n_languoids'))
_ = read_sql(select_nsources_nlangs, index_col='n_sources')
# Languoids with vs. without any source.
(_.groupby(_.index != 0).sum().rename_axis('n_sources')
 .rename(index={False: '= 0', True: '> 0'}).plot.bar(figsize=(4, 3)))
# Distribution over the non-zero counts (log-scaled x-axis).
(_.drop(0).plot.area(figsize=(8, 3), logx=True)
 .xaxis.set_major_formatter(mpl.ticker.ScalarFormatter()))
(read_sql(sa.select(sa.literal('n_sources').label('value'),
                    sa.func.count().label('count'),
                    sa.func.sum(n_sources).label('sum'),
                    sa.func.min(n_sources).label('min'),
                    sa.func.max(n_sources).label('max'),
                    sa.func.avg(n_sources).label('mean')),
          index_col='value')
 .rename_axis(None))
# Same analysis broken down by languoid level.
select_lang_nsources = (sa.select(Languoid.id, Languoid.level,
                                  sa.func.count(Source.languoid_id).label('n_sources'))
                        .outerjoin_from(Languoid, Source)
                        .group_by(Languoid.id, Languoid.level)
                        .alias('lang_nsources'))
select_ln_nlangs = (sa.select(select_lang_nsources.c.level,
                              select_lang_nsources.c.n_sources,
                              sa.func.count().label('n_languoids'))
                    .group_by(select_lang_nsources.c.level, select_lang_nsources.c.n_sources)
                    .order_by('n_languoids'))
_ = read_sql(select_ln_nlangs).pivot(index='n_sources', columns='level', values='n_languoids')
(_.groupby(_.index != 0).sum()[list(treedb.LEVEL)].rename_axis('n_sources')
 .rename(index={False: '= 0', True: '> 0'}).plot.bar(figsize=(4, 3)))
(_.drop(0)[list(treedb.LEVEL)].plot.area(figsize=(8, 3), logx=True)
 .xaxis.set_major_formatter(mpl.ticker.ScalarFormatter()))
# Bibfiles: how often each bibfile is referenced by bibitems.
from treedb.models import Bibfile, Bibitem
select_bibfile_nused = (sa.select(Bibfile.name.label('bibfile'), sa.func.count().label('n_used'))
                        .join_from(Bibfile, Bibitem)
                        .group_by(Bibfile.name)
                        .order_by(sa.desc('n_used')))
(read_sql(select_bibfile_nused, index_col='bibfile')
 .plot.bar(figsize=(12, 3)));
# Classification references: counts per kind and level, plus the
# per-languoid classification-ref distribution with summary statistics.
from treedb.models import ClassificationRef
select_cr_levels = (sa.select(ClassificationRef.kind, Languoid.level,
                              sa.func.count().label('n_classificationrefs'))
                    .join_from(ClassificationRef, Languoid)
                    .group_by(ClassificationRef.kind, Languoid.level)
                    .order_by('kind', 'level'))
read_sql(select_cr_levels, index_col=['kind', 'level']).unstack()
n_crefs = (sa.select(Languoid.id, sa.func.count().label('n_crefs'))
           .outerjoin_from(Languoid, ClassificationRef)
           .group_by(Languoid.id)
           .alias('lang_ncrefs')
           .c.n_crefs)
select_lcr_dist = (sa.select(n_crefs, sa.func.count().label('n_languoids'))
                   .group_by(n_crefs)
                   .order_by(n_crefs))
_ = read_sql(select_lcr_dist, index_col='n_crefs')
_.plot.area(figsize=(12, 3))
(read_sql(sa.select(sa.literal('n_crefs').label('value'),
                    sa.func.count().label('count'),
                    sa.func.sum(n_crefs).label('sum'),
                    sa.func.min(n_crefs).label('min'),
                    sa.func.max(n_crefs).label('max'),
                    sa.func.avg(n_crefs).label('mean')),
          index_col='value')
 .rename_axis(None))
# Links: counts per URL scheme and per hostname.
from treedb.models import Link
print_sql(Link)
select_scheme_nlinks = (sa.select(Link.scheme.label('link_scheme'), sa.func.count().label('n_links'))
                        .group_by(Link.scheme)
                        .order_by(sa.desc('n_links')))
(read_sql(select_scheme_nlinks, index_col='link_scheme')
 .plot.pie(y='n_links'));
from urllib.parse import urlparse
# Count hostnames client-side with urlparse + Counter.
hosts = collections.Counter(urlparse(url).hostname for url, in treedb.iterrows(sa.select(Link.url)))
(pd.DataFrame.from_dict(hosts, orient='index', columns=['n_links'])
 .sort_values(by='n_links')
 .plot.barh());
# Endangerment: model tables, status config, and endangerments per source.
from treedb.models import Endangerment, EndangermentStatus, EndangermentSource
print_sql(Endangerment)
print_sql(EndangermentStatus)
print_sql(EndangermentSource)
read_sql(sa.select(EndangermentStatus).order_by('ordinal'), index_col='config_section')
e_source = EndangermentSource.name.label('source')
select_source_nendangerments = (sa.select(e_source, sa.func.count().label('n_endangerments'))
                                .join_from(Endangerment, EndangermentSource)
                                .group_by(e_source)
                                .order_by('n_endangerments'))
(read_sql(select_source_nendangerments, index_col='source')
 .plot.barh());
# Final fingerprint: hash of the example query results, for diffing runs.
%time treedb.hash_csv(treedb.get_example_query())