Explore harvested text files

In [ ]:
import os
import pandas as pd
import fileinput
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from textblob import TextBlob
from operator import itemgetter
from pathlib import Path
import nltk
import numpy as np
import altair as alt

import nltk

# NLTK's standard English stopword list, extended with 'tho' and 'tbe' —
# common OCR misreadings of 'the' in digitised newspaper text
stopwords = nltk.corpus.stopwords.words('english')
stopwords += ['tho', 'tbe']

# Are you using Jupyter Lab?
# If so either don't run this cell or comment out the line below

# alt.renderers.enable('notebook')

# If you forget, run this cell, and then get strange warnings when you make a chart,
# uncomment the following line and run this cell to reset the chart renderer

# alt.renderers.enable('default')

# alt.data_transformers.enable('json')
#stopwords = nltk.corpus.stopwords.words('english')
In [ ]:
# Import a harvest zip file you've created previously
# First upload the zip file to the data directory, then run this cell
import zipfile

# Unzip every previously-created harvest zip found in the data directory.
# (The original cell was truncated: the `with` block had no body, which is
# a syntax error — restore the extraction step.)
for zipped in sorted(Path('data').glob('*.zip')):
    print(f'Unzipping {zipped}...')
    with zipfile.ZipFile(zipped, 'r') as zip_file:
        # Extract the harvest contents back into the data directory
        zip_file.extractall('data')
In [ ]:
def get_latest_harvest():
    """Return the Path of the most recent harvest directory, or None if there are none.

    Harvest directories are named by timestamp, so lexical sort order is
    also chronological order.  (The exported cell had lost the docstring
    quotes and the `try:` line — both restored here.)
    """
    # Ignore files and hidden directories such as .ipynb_checkpoints
    harvests = sorted([d for d in Path('data').iterdir() if d.is_dir() and not d.name.startswith('.')])
    try:
        harvest = harvests[-1]
    except IndexError:
        # No harvest directories exist yet
        print('No harvests!')
        harvest = None
    return harvest
In [ ]:
def get_docs(harvest):
    """Yield the stripped text content of each document in the harvest."""
    for doc_path in get_docs_path(harvest):
        yield doc_path.read_text(encoding='utf-8').strip()
def get_docs_path(harvest):
    """Return the sorted list of paths to the .txt files in the harvest's text directory."""
    text_dir = Path(harvest, 'text')
    return sorted(text_dir.glob('*.txt'))

def get_file_names(harvest):
    """Return the harvested documents' file names without the .txt extension."""
    return [doc.stem for doc in get_docs_path(harvest)]
In [ ]:
# Work with the most recently created harvest directory (None if no harvests exist)
harvest = get_latest_harvest()
In [ ]:
In [ ]:
# Count raw word (unigram) frequencies per document, capped at the 10,000
# most frequent terms across the corpus
vectorizer = CountVectorizer(stop_words=frozenset(stopwords), max_features=10000, ngram_range=(1,1))
# preprocessor = lambda x: re.sub(r'(\d[\d\.])+', 'NUM', x.lower())
# .toarray() converts the sparse matrix straight to a dense ndarray
# (cleaner than wrapping .todense() in np.asarray)
X_freq = vectorizer.fit_transform(get_docs(harvest)).toarray()
# get_feature_names() was removed in scikit-learn 1.2 — use get_feature_names_out().
# Rows are documents (indexed by file name), columns are words.
df_freq = pd.DataFrame(X_freq, columns=vectorizer.get_feature_names_out(), index=get_file_names(harvest))
In [ ]:
In [ ]:
# Display a long-format view: one row per (word, document) pair with its count,
# dropping rows where the count column (named 0 by to_frame) is missing
df_freq.unstack().to_frame().reset_index().dropna(axis=0, subset=[0])
In [ ]:
# The number of words you want to show
num_words = 10
# Iterating over df_freq.T yields its column labels (the document file names);
# for each document take the `num_words` highest-count words, then transpose
# so each row is a document and each column is a rank position
top_words = pd.DataFrame({n: df_freq.T[col].nlargest(num_words).index.tolist() for n, col in enumerate(df_freq.T)}).T
# Re-key the rows by document file name instead of the integer position
top_words.index = get_file_names(harvest)
In [ ]:

Add a 'year' column to the dataframe

Each file name includes the date on which the article was published. For example, 18601224-13-5696044 was published on 24 December 1860. We can easily extract the year by just slicing the first four characters off the index.

In [ ]:
# File names start with the publication date (e.g. 18601224-...), so the
# first four characters of the index give the year of publication
df_freq['article_year'] = df_freq.index.str.slice(0, 4)

Most frequent words each year

In [ ]:
# Group by year and sum the word counts
year_groups = df_freq.groupby(by='article_year')
# One row per year, one column per word, values are total counts for that year
year_group_totals = year_groups.sum()
In [ ]:
# Reshape so that we have columns for year, word, and count
words_by_year = year_group_totals.unstack().to_frame().reset_index()
# unstack() produces a (word, year) MultiIndex; name the columns explicitly
words_by_year.columns = ['word', 'year', 'count']
In [ ]:
# Sort by count (descending) so that head(10) keeps each year's ten most frequent words
top_words_by_year = words_by_year.sort_values('count', ascending=False).groupby(by=['year']).head(10).reset_index(drop=True)
In [ ]:

Visualise top ten words per year

In [ ]:
    y=alt.Y('word:N', sort='-x'),
    facet=alt.Facet('year', columns=4)
    width=120, height=120

Visualise word frequencies over time

Create a faceted chart

In [ ]:
# NOTE(review): this cell was truncated in the export — the y encoding and the
# resolve_scale() argument were missing. Reconstructed as one line chart per
# word with independent y scales; confirm against the original notebook.
alt.Chart(words_by_year.loc[words_by_year['word'].isin(['storm', 'cyclone', 'snow'])]).mark_line().encode(
    # format='c' stops Altair from rendering years with thousands separators
    x=alt.X('year:Q', axis=alt.Axis(format='c', title='Year')),
    y='count:Q',
    # One row per word
    facet=alt.Facet('word:N', columns=1)
).properties(width=700, height=100).resolve_scale(
    y='independent'
)

Created by Tim Sherratt (@wragge) for the GLAM Workbench.
Support this project by becoming a GitHub sponsor.