Exploring the Digitised Books Collection from Trove (GLAM Workbench)

by Adel Rahmani
This Jupyter notebook can be found on GitHub

After my exploratory data analysis of 12,000 parliamentary press releases, I started looking through the other collections of digitised documents available on GLAM Workbench, the wonderful platform developed by Tim Sherratt.

One particular collection which caught my attention was the 9,738 digitised books from Trove Australia, which Tim kindly and conveniently made available on GLAM Workbench.

Since the topic modelling computation below is similar to the one I used previously, I'll be going through that part rather quickly. Please see my notebook on the press releases for more details.

A particular feature of this book collection is that it is multilingual, so I'll focus a bit on that, and on using the topic model to figure out what the collection is about.

Let's jump in!

In [2]:
# basic libraries
import pandas as pd
import numpy as np
from pathlib import Path
from itertools import chain

# plotting
import matplotlib.pyplot as plt
import matplotlib.patheffects as PathEffects
import seaborn as sns
%matplotlib inline

# For some nice interactive plots
import altair as alt
alt.renderers.enable('notebook')

# By default, altair requires us to load the data from
# a file if the data source exceeds 5000 rows. 
# I want to load the data from memory instead, hence the
# modified max_rows.
alt.data_transformers.enable('default', max_rows=10000)

def my_theme(*args, **kwargs):
    '''
    Ensure that altair figures have a white
    background instead of a transparent one
    when saving/exporting them.
    '''
    return {
            "background": "white"
          }
          
alt.themes.register('my_theme', my_theme)
alt.themes.enable('my_theme');

# natural language processing
from sklearn.feature_extraction.text import TfidfVectorizer

# matrix factorisation
from sklearn.decomposition import NMF

# dimensionality reduction
from umap import UMAP

# regular expressions
import re

# language detection
import pycld2 as cld2

Helper functions

These are for the most part the same code snippets as the ones I discussed previously...

In [3]:
def tokenize(corpus, docs_path, **kwargs):
    '''
    Simple wrapper function around a sklearn 
    TfidfVectorizer object. 
    '''
    # create an instance of the vectoriser
    tfidf = TfidfVectorizer(**kwargs)                             

    # the vectoriser returns a sparse array which
    # we convert to a dense array for convenience
    X_tfidf = np.asarray(tfidf.fit_transform(corpus).todense())

    print(f"Tokenised {X_tfidf.shape[0]} documents using a vocabulary of {len(tfidf.get_feature_names())} tokens.")
    
    return X_tfidf, tfidf

def extract_topics(model, vec, sep=' | ', n=5):
    '''
    Extract topics, expressed as vocabulary tokens,
    from a trained tokeniser and a trained NMF model.
    '''
    
    topics = {}
    
    # sort the array so that the most important tokens are first
    idx = model.components_.argsort(axis=1)[:, ::-1]
    
    # extract the most important tokens 
    for i, t in enumerate(np.array(vec.get_feature_names())[idx]):
        topic = sep.join(t[:n])
        topics[i] = topic
    
    return topics

metadata = pd.read_csv('books/trove_digitised_books_with_ocr.csv')

def get_url(doc, metadata=metadata):
    '''
    Get the trove and nla urls for a document from the metadata
    file provided by Tim Sherratt on GLAM Workbench
    '''
    prefix, number = doc.name.partition('nla.obj')[-2:]
    trove_id = prefix + number.split('.')[0]
    nla_url = metadata[metadata.trove_id==trove_id]['fulltext_url'].values[0]
    url = metadata[metadata.trove_id==trove_id]['url'].values[0]
    return (nla_url, url)

def get_most_representative_doc_for_topic(topic, 
                                          n=3, 
                                          docs_path=None, 
                                          df=None, 
                                          plot=True,
                                          print_url=True):            
    '''
    Extract the top n most representative documents for a given topic, 
    based on the topic scores.
    '''
    # sort the results according to the score for the topic of interest
    docs_idx = df.iloc[:, topic].sort_values(ascending=False).index.values[:n]
        
    # extract the top n most representative documents          
    results = np.array(docs_path)[docs_idx]

    # output the results and plot the topic allocations      
    for i, item in zip(docs_idx, results):
        url, url_trove = get_url(item)
        if print_url:
            print(item.name, url, url_trove, sep='\n')
        else:
            print(item.name)
        if plot:
            # plot_topic_allocation comes from the helpers discussed in the
            # previous notebook and is not shown in this excerpt
            plot_topic_allocation(i)
    
    return results
    

STOPWORDS = {'anything', 'mightn', 'upon', 'six', 'herein', 'hers', 'indeed', 'becomes', 'twenty', 'at', 'up', 'will', 'meanwhile', 'same', 'onto', 'seem', 'it', 'had', 'they', "'m", 'beforehand', 'describe', 'was', 'moreover', 'hereupon', 'your', 'due', 'un', 'eleven', 'further', 'him', 'is', 'whereas', 'hasnt', 'in', 'we', 'them', 'ten', 
'however', 'done', 'fire', 'through', 'keep', 'sometimes', 'unless', 'needn', 'until', 'top', 'there', 'just', 'didn', 'because', 'wherever', 'couldnt', 'front', 'someone', 'afterwards', 'within', 'won', 'except', 'he', 'fill', 'ours', 'my', 'others', 'latterly', 'made', 'first', 'about', 'call', 'may', 'thence', 'seeming', 'nor', 'haven', 'couldn','nothing', 'everyone', 'enough', 'her', 'latter', 'detail', 'now', 'where', 'while', 'became', 'wouldn', 'besides','do', 'its', 'wasn', 'another', 'during', 'around', 'shouldn', 'some', 'whoever', 'once', 'inc', 'con', 'll', 'four','back', 'm', 'although', 've', 'either', 'their', 'beside', 'yourself', 'how', 'when', 'whom', 'sincere', 'thereafter', 'out', 'between', 'whether', 'hereafter', 'she', "'re", 'over', 'thru', 'i', 'very', 'whereupon', 'above', 'third','alone', 'aren', 'nevertheless', 'almost', 'various', 'nowhere', 'so', 'make', 'somehow', 'here', 'take', "'d", 'those','whereby', 'whereafter', 'mill', 'get', 'after', 'into', 'ourselves', 'more', 'regarding', 'quite', 'don', 'ever','everywhere', 'whole', 'five', 'ma', 'whence', 'below', 'eg', 'give', 'under', 'ltd', 'yours', 'd', 'whatever', 'might','be', 'using', 'serious', 'not', 'anyhow', 'ca', 'his', 'becoming', 'who', 'hasn', 'therein', 'again', 'me', 'empty', 'noone', 'being', 't', 'nobody', 'hadn', 'theirs', 'since', 'rather', 'mustn', 'nine', 'from', 'none', 'the', 'seems',"'ve", 'system', 'amongst', 'thereby', 'been', 'own', 'next', 'down', 'hundred', 'each', 'seemed', 'other', 'everything','across', 'ain', 'off', 'doesn', 'than', 'many', 'show', 'but', 'an', 'then', 'never', 'without', 'before', 'only', 'anyway', 'namely', 'o', 'etc', 'formerly', 'wherein', 'two', 'did', 'y', 'toward', 'thereupon', "'ll", 'full', 'most','have', 'always', 'were', 'myself', 'name', 'move', 'say', 'put', 'cry', 'become', 'would', 'to', 'am', 'bottom','having', 'amoungst', 'as', 'already', 'whenever', 'thin', 'us', 'that', 'whither', 'our', 'yourselves', 'cant', 'several', "'s", 'really', 'fifteen', 'otherwise', 'must', 'anywhere', 'much', 'hereby', 'anyone', 'for', 'could', 'often', 'themselves', 'can', 'all', 'too', 'sometime', 'what', 'somewhere', 'every', 'find', 'herself', 'together', 'are', 'well', 'de', 'on', 'which', 'interest', 'bill', 'isn', 'himself', 'therefore', 'whose', 'along', 'has', 'though','mostly', 'please', 'beyond', 'neither', 'against', 'go', 'behind', 'amount', 'something', 'hence', 'part', 'this','and', 'you', 'eight', 'per', 'among', 'least', 'side', 'mine', 'towards', 'see', 'a', 'also', 'by', 'via', 'twelve', 'forty', 'found', 'such', 'less', 'even', 'still', 'these', 'few', 's', 'perhaps', 'both', 'throughout', "n't", 'shan','elsewhere', 'co', 'sixty', 'why', 'one', 'if', 'thus', 'itself', 'used', 'ie', 'of', 'fifty', 'former', 'else', 'or','three', 'cannot', 'last', 'any', 'thick', 'no', 're', 'with', 'should', 'doing', 'weren', 'does', 'yet'}

How many documents do we have?

In [4]:
docs_path = [p for p in Path('books').glob('*.txt')]
print(f"Found {len(docs_path)} documents.")
Found 9738 documents.

How long are the documents?

For convenience I use the logarithm (base 10) to visualise the results, because otherwise all the data would be squished (I think that's the technical term) to the left. Accordingly, on the horizontal axis of the figure below, a value of 1 corresponds to a document length of 10 characters, a value of 2 to 100 characters, a value of 3 to 1,000 characters, and so on...
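
As a quick check of this mapping (my addition, relying on the numpy import from the first cell):

# log10 maps 10 -> 1, 100 -> 2, 1000 -> 3, matching the axis values described above
print(np.log10([10, 100, 1000]))  # [1. 2. 3.]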

In [5]:
doc_lengths = pd.Series([len(p.read_text(encoding='utf-8').strip()) for p in docs_path]).rename('Document length')

source  = doc_lengths.map(np.log10).astype(float).to_frame()
chart = alt.Chart(source).mark_bar().encode(
    alt.X("Document length:Q", 
          bin=alt.Bin(maxbins=100), 
          title='Number of Characters (log10)',
         ),
    y='count()',
).properties(height=300, 
             width=700, 
             title=f'{doc_lengths.shape[0]:,} Digitised Books from @TroveAustralia (source: GLAM Workbench)')

chart.configure_axis(
    labelFontSize=14,
    titleFontSize=16
).configure_title(fontSize=16)
Out[5]:

Longest and shortest documents

In [6]:
print('\nLONGEST DOCUMENTS:\n')
for i in doc_lengths.nlargest().index:
    print(f"{docs_path[i].name:<80} {doc_lengths.loc[i]:,.0f} characters.")
    print(f"{get_url(docs_path[i])[0]}")
    print(f"{get_url(docs_path[i])[1]}")
LONGEST DOCUMENTS:

navigantium-atque-itinerantium-bibliotheca-or-a-nla.obj-302079118.txt            8,901,188 characters.
http://nla.gov.au/nla.obj-302079118
https://trove.nla.gov.au/work/16739978
navigantium-atque-itinerantium-bibliotheca-or-a-nla.obj-52928883.txt             8,669,438 characters.
http://nla.gov.au/nla.obj-52928883
https://trove.nla.gov.au/work/16739978
a-new-authentic-and-complete-collection-of-voyag-nla.obj-674833791.txt           7,672,397 characters.
http://nla.gov.au/nla.obj-674833791
https://trove.nla.gov.au/work/5769729
the-cyclopedia-of-western-australia-an-historica-nla.obj-135063414.txt           4,405,157 characters.
http://nla.gov.au/nla.obj-135063414
https://trove.nla.gov.au/work/12456547
official-record-of-the-debates-of-the-australasian-nla.obj-3821074.txt           4,329,931 characters.
http://nla.gov.au/nla.obj-3821074
https://trove.nla.gov.au/work/12982228
In [7]:
print('\nSHORTEST DOCUMENTS:\n')
for i in doc_lengths.nsmallest().index:
    print(f"{docs_path[i].name:<80} {doc_lengths.loc[i]:3d} characters.")
    print(f"{get_url(docs_path[i])[0]}")
    print(f"{get_url(docs_path[i])[1]}")
SHORTEST DOCUMENTS:

religious-sects-of-all-nations-nla.obj-187563224.txt                               4 characters.
http://nla.gov.au/nla.obj-187563224
https://trove.nla.gov.au/work/18240964
hear-heitmann-mp-wedderburn-victoria-oct-1-nla.obj-508107473.txt                   8 characters.
http://nla.gov.au/nla.obj-508107473
https://trove.nla.gov.au/work/228202997
chinesisch-und-tai-sprachvergleichende-untersuch-nla.obj-192209780.txt             9 characters.
http://nla.gov.au/nla.obj-192209780
https://trove.nla.gov.au/work/17417127
in-memoriam-emily-adeline-coleman-of-warbleton-nla.obj-363341982.txt              10 characters.
http://nla.gov.au/nla.obj-363341982
https://trove.nla.gov.au/work/12088738
a-st-albans-pioneer-nla.obj-362072830.txt                                         10 characters.
http://nla.gov.au/nla.obj-362072830
https://trove.nla.gov.au/work/12075464

While the longest documents look reasonable, the shortest ones are clearly documents for which the OCR process did not work...
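
If we wanted to exclude these likely OCR failures before modelling, one simple option is a minimum-length cut-off. A minimal sketch (my addition; the 1,000-character threshold is arbitrary, and the rest of this notebook keeps the full collection):

MIN_CHARS = 1_000  # arbitrary threshold, for illustration only

keep = doc_lengths >= MIN_CHARS
filtered_docs_path = [p for p, k in zip(docs_path, keep) if k]
print(f"Keeping {len(filtered_docs_path)} of {len(docs_path)} documents.")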

Topic Model

Before we can train a topic model we need to tokenise our documents. Let's use the same tokenisation process that we used previously with the Australian parliamentary press releases.

In [8]:
%%time

corpus = (p.read_text(encoding='utf-8').strip() for p in docs_path)

X_tfidf, tfidf = tokenize(corpus,                     # the corpus
                          docs_path,                  # list of paths to the individual documents
                          min_df=10,                  # only consider words which appear in at least 10 docs
                          max_df=0.5,                 # only consider words which appear in at most 50% of the docs
                          lowercase=True,             # convert everything to lowercase
                          token_pattern='[a-z]{2,}',  # what's a token (2 or more letters)
                          stop_words=STOPWORDS,       # which words are to be excluded
                          max_features=10000          # keep the top 10,000 tokens (based on tfidf scores)
                         )
Tokenised 9738 documents using a vocabulary of 10000 tokens.
CPU times: user 1min 34s, sys: 3.54 s, total: 1min 37s
Wall time: 1min 37s

We can now train the model.

In [9]:
%%time
model = NMF(n_components=20, random_state=0)
X_nmf = model.fit_transform(X_tfidf)
CPU times: user 1min 3s, sys: 785 ms, total: 1min 4s
Wall time: 16.9 s
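
As a quick aside (my addition): NMF factorises the TF-IDF matrix into a document-topic matrix (the X_nmf returned by fit_transform) and a topic-term matrix (model.components_), so that X_tfidf is approximately X_nmf @ model.components_. The shapes make this explicit:

print(X_tfidf.shape)            # (9738, 10000) -> documents x vocabulary terms
print(X_nmf.shape)              # (9738, 20)    -> documents x topics
print(model.components_.shape)  # (20, 10000)   -> topics x vocabulary terms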

For future convenience, let's gather the results in a dataframe and convert the topic scores into proportions.

We can then visualise how prevalent each topic is within the collection: summing a topic's proportions across all documents gives an 'effective number of records' for that topic, which is what the chart below shows.

In [10]:
df = pd.DataFrame(X_nmf, columns=extract_topics(model, tfidf, n=5).values())
df = df.div(df.sum(axis=1), axis=0)     
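
A quick sanity check (my addition): after this normalisation each document's topic scores are proportions that should add up to 1 (documents with no retained tokens, if any, are the exception).

# each document's proportions should add up to (approximately) 1
df.sum(axis=1).describe()
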
In [11]:
source = (df
          .sum(axis='rows')
          .rename('records')
          .reset_index()
          .sort_values('records')
          .assign(row=np.arange(df.shape[1]))
         )

chart = alt.Chart(source).mark_bar().encode(
   alt.X("records:Q", 
          title='Effective Number of Records',
         ),
    alt.Y("index", 
          title='Topics',
          sort=alt.EncodingSortField(
                    field="records",  
                    op="sum",  
                    order="descending"  
                    )
         ),
    color=alt.condition(
                alt.Predicate(
                    alt.FieldOneOfPredicate(field='row', oneOf=[0, 1 , 2, 17]),
                ),
                 alt.value('orange'),     
                 alt.value('steelblue'),   
                ),
).properties(height=400, 
             width=600, 
             title=f'{doc_lengths.shape[0]:,} Digitised Books from @TroveAustralia (source: GLAM Workbench)')

(chart
 .configure_axis(
    labelFontSize=14,
    titleFontSize=16,
    labelLimit=1000)
 .configure_title(fontSize=16)
 .configure_axisY(
    titleAngle=-90,
    titleAlign="left",
    titleX=-380)
)
Out[11]:

In the plot above, I've highlighted 4 topics. The ones at the bottom are clearly made up of common words in French, German, and Italian.

This tells us that we have a multilingual collection.
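
As a cross-check (my addition, not part of the original analysis), we could run a random sample of the books through pycld2, which was imported at the top of the notebook but not yet used. cld2.detect returns a tuple (is_reliable, bytes_found, details), where details[0] is the best guess as (language_name, language_code, percent, score). The sample size of 500 is arbitrary, and OCR noise means the counts are only indicative.

from collections import Counter
import random

random.seed(0)
sample = random.sample(docs_path, 500)  # arbitrary sample size

lang_counts = Counter()
for p in sample:
    text = p.read_text(encoding='utf-8').strip()
    if not text:
        continue
    try:
        is_reliable, _, details = cld2.detect(text)
    except cld2.error:
        # cld2 rejects some inputs it considers invalid; skip those
        continue
    lang_counts[details[0][0]] += 1

lang_counts.most_common(10)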

The third topic from the top seems to be composed of Roman numerals. We'll need to have a closer look at it.

It is more convenient to refer to a topic by its number than by its list of tokens, so let's extract the topic numbers.

In [12]:
topics = extract_topics(model, tfidf, n=5)
topics
Out[12]:
{0: 'law | human | moral | science | society',
 1: 'ii | iii | vi | iv | ix',
 2: 'god | christ | jesus | lord | christian',
 3: 'went | oh | night | got | mother',
 4: 'les | la | et | le | que',
 5: 'mrs | miss | john | esq | george',
 6: 'miles | river | creek | feet | north',
 7: 'thy | thee | thou | lord | love',
 8: 'di | che | il | la | del',
 9: 'ship | captain | board | passengers | sea',
 10: 'war | german | germany | british | military',
 11: 'church | rev | bishop | synod | churches',
 12: 'und | der | die | den | zu',
 13: 'natives | island | islands | native | guinea',
 14: 'committee | members | meeting | council | member',
 15: 'inches | ft | fig | plants | feet',
 16: 'colony | acres | lands | acre | population',
 17: 'federal | commonwealth | parliament | federation | colonies',
 18: 'school | schools | education | children | teachers',
 19: 'gold | quartz | mining | ore | reef'}
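
The same dataframe also gives the topic mix of any individual document. For example (my addition), the three largest topic proportions for the first document in the collection:

# the columns of df are labelled with the topic token lists extracted above
df.iloc[0].nlargest(3)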

Topic 1: Documents consisting of Roman numerals? Not quite!

Looking at a few documents representative of this topic, we see that they are in Chinese.

In [13]:
chinese = get_most_representative_doc_for_topic(1, n=10, docs_path=docs_path, df=df, plot=False)
yesu-shi-ji-kao-nla.obj-55712310.txt
http://nla.gov.au/nla.obj-55712310
https://trove.nla.gov.au/work/12524291
gang-jian-yi-zhi-lu-wu-chengquan-deng-ji-nla.obj-57177068.txt
http://nla.gov.au/nla.obj-57177068
https://trove.nla.gov.au/work/12442950
sheng-jiao-yao-jing-by-t-dacosta-nla.obj-78179360.txt
http://nla.gov.au/nla.obj-78179360
https://trove.nla.gov.au/work/12535209
chu-deng-xiao-xue-nu-zi-guo-wen-jiao-ke-shu-nla.obj-45978562.txt
http://nla.gov.au/nla.obj-45978562
https://trove.nla.gov.au/work/12566013
shanghai-yu-ying-tang-zheng-xin-lu-nla.obj-48355311.txt
http://nla.gov.au/nla.obj-48355311
https://trove.nla.gov.au/work/12183597
yin-dou-lue-qiu-xi-shou-ji-nla.obj-46143424.txt
http://nla.gov.au/nla.obj-46143424
https://trove.nla.gov.au/work/12568110
yang-xin-shi-diao-du-jiade-zhuan-nla.obj-45188001.txt
http://nla.gov.au/nla.obj-45188001
https://trove.nla.gov.au/work/12537321
qin-ding-qian-lu-liang-shizheng-et-al-nla.obj-57037535.txt
http://nla.gov.au/nla.obj-57037535
https://trove.nla.gov.au/work/12582067
quan-jie-shi-ya-pian-yan-nla.obj-46101832.txt
http://nla.gov.au/nla.obj-46101832
https://trove.nla.gov.au/work/12568225
ge-zhi-jing-yuan-chen-yuanlong-nla.obj-59228214.txt
http://nla.gov.au/nla.obj-59228214
https://trove.nla.gov.au/work/12567129

Topic 4: French documents.

Topic 4 indeed corresponds to French documents.

In [14]:
french  = get_most_representative_doc_for_topic(4, n=10, docs_path=docs_path, df=df, plot=False)
voyage-en-australie-paul-gingeot-nla.obj-85784669.txt
http://nla.gov.au/nla.obj-85784669
https://trove.nla.gov.au/work/18869125
compte-rendu-par-l-envoye-extraordinaire-du-diable-nla.obj-19892684.txt
http://nla.gov.au/nla.obj-19892684
https://trove.nla.gov.au/work/32541149
discours-dur-roi-et-de-m-de-calonne-prononces-a-nla.obj-52882774.txt
http://nla.gov.au/nla.obj-52882774
https://trove.nla.gov.au/work/28493430
la-vie-des-animaux-et-des-plantes-dans-l-antarctiq-nla.obj-357092018.txt
http://nla.gov.au/nla.obj-357092018
https://trove.nla.gov.au/work/22328688
histoire-et-description-des-iles-sechelles-par-c-nla.obj-289639053.txt
http://nla.gov.au/nla.obj-289639053
https://trove.nla.gov.au/work/17828014
vers-le-pole-sud-conference-faite-a-la-sorbonne-nla.obj-357102024.txt
http://nla.gov.au/nla.obj-357102024
https://trove.nla.gov.au/work/22330839
notice-sur-le-guano-de-malden-pacifique-par-ma-nla.obj-52881455.txt
http://nla.gov.au/nla.obj-52881455
https://trove.nla.gov.au/work/22158247
lettre-de-m-h-d-de-blainville-a-m-le-redacteur-nla.obj-39360717.txt
http://nla.gov.au/nla.obj-39360717
https://trove.nla.gov.au/work/35499514
rapport-fait-a-la-convention-nationale-au-nom-du-nla.obj-193128113.txt
http://nla.gov.au/nla.obj-193128113
https://trove.nla.gov.au/work/28561951
talus-n-est-ce-que-cela-vraiment-je-voudrais-dire-nla.obj-500360847.txt
http://nla.gov.au/nla.obj-500360847
https://trove.nla.gov.au/work/6172654

Topic 8: Italian documents.

It looks like topic 8 is made up of Italian documents. The third document below appears to have a German title; however, if we follow the link to the NLA source, we can see that the document is indeed in Italian.

In [15]:
italian = get_most_representative_doc_for_topic(8, n=10, docs_path=docs_path, df=df, plot=False)
compendio-della-storia-generale-de-viaggi-opera-nla.obj-549108466.txt
http://nla.gov.au/nla.obj-549108466
https://trove.nla.gov.au/work/6397064
compendio-della-storia-generale-de-viaggi-opera-nla.obj-553766859.txt
http://nla.gov.au/nla.obj-553766859
https://trove.nla.gov.au/work/6397064
die-schreiende-grube-roman-arthur-j-rees-b-nla.obj-182882753.txt
http://nla.gov.au/nla.obj-182882753
https://trove.nla.gov.au/work/9600166
compendio-della-storia-generale-de-viaggi-opera-nla.obj-549125673.txt
http://nla.gov.au/nla.obj-549125673
https://trove.nla.gov.au/work/6397064
compendio-della-storia-generale-de-viaggi-opera-nla.obj-549102470.txt
http://nla.gov.au/nla.obj-549102470
https://trove.nla.gov.au/work/6397064
compendio-della-storia-generale-de-viaggi-opera-nla.obj-555769345.txt
http://nla.gov.au/nla.obj-555769345
https://trove.nla.gov.au/work/6397064
piccolo-mondo-melanesiano-p-rinaldo-pavese-s-m-nla.obj-291156674.txt
http://nla.gov.au/nla.obj-291156674
https://trove.nla.gov.au/work/32799873
compendio-della-storia-generale-de-viaggi-opera-nla.obj-554099544.txt
http://nla.gov.au/nla.obj-554099544
https://trove.nla.gov.au/work/6397064
compendio-della-storia-generale-de-viaggi-opera-nla.obj-554063610.txt
http://nla.gov.au/nla.obj-554063610
https://trove.nla.gov.au/work/6397064
gausava-storia-di-una-scuola-di-catechisti-un-nla.obj-291285408.txt
http://nla.gov.au/nla.obj-291285408
https://trove.nla.gov.au/work/32799866

Topic 12: German documents.

Topic 12 does indeed seem to correspond to German documents. Even the last document below, with its non-German title, is actually in German.

In [16]:
german = get_most_representative_doc_for_topic(12, n=10, docs_path=docs_path, df=df, plot=False)
beitrag-zur-kenntnis-der-morphologie-und-phylogeni-nla.obj-51079198.txt
http://nla.gov.au/nla.obj-51079198
https://trove.nla.gov.au/work/13002640
anthropologische-mittheilungen-uber-die-papuas-von-nla.obj-427926338.txt
http://nla.gov.au/nla.obj-427926338
https://trove.nla.gov.au/work/18725142
die-sudseeinseln-und-der-deutsche-sudseehandel-v-nla.obj-108484542.txt
http://nla.gov.au/nla.obj-108484542
https://trove.nla.gov.au/work/19242587
eine-sorgenfreie-zukunft-das-neue-evangelium-v-nla.obj-52869270.txt
http://nla.gov.au/nla.obj-52869270
https://trove.nla.gov.au/work/228963317
die-nutzpflanzen-von-neu-sudwales-von-othm-riet-nla.obj-63219759.txt
http://nla.gov.au/nla.obj-63219759
https://trove.nla.gov.au/work/18106322
anthropologische-mittheilungen-uber-die-papuas-von-nla.obj-79637099.txt
http://nla.gov.au/nla.obj-79637099
https://trove.nla.gov.au/work/18725142
die-ersten-bucher-des-wolkenwanderer-verlages-in-l-nla.obj-852288908.txt
http://nla.gov.au/nla.obj-852288908
https://trove.nla.gov.au/work/19215914
max-klingers-beethoven-kurze-erlauterung-nla.obj-503146672.txt
http://nla.gov.au/nla.obj-503146672
https://trove.nla.gov.au/work/16389605
fuhrer-und-rathgeber-fur-auswanderer-nach-sud-aust-nla.obj-52765537.txt
http://nla.gov.au/nla.obj-52765537
https://trove.nla.gov.au/work/18889938
concert-programmes-etc-nla.obj-530514315.txt
http://nla.gov.au/nla.obj-530514315
https://trove.nla.gov.au/work/34622091

Dimensionality reduction

Let's visualise the whole collection using UMAP.

We first compute a 2-dimensional embedding.

In [17]:
%%time
proj = UMAP(n_components=2, n_neighbors=30, min_dist=0.3, random_state=0)
X_proj = proj.fit_transform(X_nmf)
CPU times: user 28.1 s, sys: 1.38 s, total: 29.5 s
Wall time: 27.4 s
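
X_proj now holds one 2-dimensional point per book (my note; the shape follows from n_components=2 and the 9,738 documents):

print(X_proj.shape)  # (9738, 2): one (x, y) coordinate per document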

Then we extract the dominant topic for each document, so that we can use it to visualise how the collection is made up of different topics.

In [18]:
dominant_topic = X_nmf.argsort(axis=1)[:, -1]

df_proj = (pd.DataFrame(X_proj, columns=['x', 'y'])
               .assign(topic_num = dominant_topic)
          )
df_proj = df_proj.assign(topic=df_proj.topic_num.map(topics))
df_proj = pd.concat((df_proj, pd.DataFrame(df.values)), axis='columns')

Let's also identify which documents have one of the topics highlighted above as their dominant topic, and use this information to plot the location of the non-English documents in the embedding.

In [19]:
lang_mask = {}

lang_subset = lambda t: df_proj.iloc[:, -model.n_components:].idxmax(axis=1).fillna(-1).astype(int) == t

lang_mask['chinese'] = lang_subset(1)
lang_mask['french']  = lang_subset(4)
lang_mask['italian'] = lang_subset(8)
lang_mask['german']  = lang_subset(12)
In [20]:
def plot_embedding(df_proj, xlim=None, ylim=None, hue='topic', palette='tab20', figsize=(18, 10)):

    fig, ax = plt.subplots(figsize=figsize)
    sns.scatterplot(x='x', 
                    y='y', 
                    hue=hue, 
                    data=df_proj, 
                    palette=palette, 
                    alpha=0.8, 
                    s=50,
                    ax=ax)

    leg = ax.legend(bbox_to_anchor = (1.01, 1), markerscale=2, frameon=False, prop={"size":14})
    leg.texts[0].set_text("")
    leg.set_title(hue.title(), prop={"size":20})
    
    if xlim is not None:
        ax.set_xlim(xlim)
        ax.set_title('Topical portrait of the trove books', fontsize=18)
        
    if ylim is not None:
        ax.set_ylim(ylim)
        ax.get_legend().remove()    
        
    path_effects = [PathEffects.withStroke(linewidth=3, foreground="w")]
    
    for lang in lang_mask:
        x_ave, y_ave = df_proj.loc[lang_mask[lang], ['x', 'y']].median()
        ax.annotate(lang.title(), 
                    xy=(x_ave, y_ave), 
                    path_effects=path_effects,
                    fontsize=20)

    ax.set_axis_off() # comment this line to see the axes
    fig.tight_layout()

    return ax

def list_documents_in_frame(ax):
    indices = df_proj[df_proj.x.between(*ax.get_xlim()) & df_proj.y.between(*ax.get_ylim())].index.values
    return [docs_path[i] for i in indices]
In [21]:
ax = plot_embedding(df_proj)
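
For reference (my addition), list_documents_in_frame is meant to be used after zooming: pass xlim and ylim to plot_embedding, then list the documents that fall inside that window. The limits below are placeholders; in practice they would be read off the full embedding plot above.

# hypothetical zoom window; the useful values depend on the embedding above
zoomed_ax = plot_embedding(df_proj, xlim=(0, 5), ylim=(0, 5))
[p.name for p in list_documents_in_frame(zoomed_ax)][:5]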