In this notebook we'll look at some ways of exploring the results.csv
created by the Trove Newspaper and Gazette Harvester.
import os
import pandas as pd
import altair as alt
from wordcloud import WordCloud
import zipfile
from pathlib import Path
from textblob import TextBlob
from operator import itemgetter
import nltk
# Download the NLTK 'stopwords' and 'punkt' resources
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource)

# English stopwords, used further down to filter the word counts
stopwords = nltk.corpus.stopwords.words('english')
By default, this notebook will look for existing harvests in the data
directory. If you want to use a harvest that you downloaded previously, just upload the zipped harvest to the data
directory and run the cell below. It will expand all the zipped files in the data
directory.
# Unpack any zipped harvests you've created previously.
# First upload the zip file(s) to the data directory, then run this cell.
# Each archive is extracted into a directory named after the zip file (minus '.zip').
for archive in Path('data').glob('*.zip'):
    target_dir = Path('data') / archive.name[:-4]
    with zipfile.ZipFile(archive, 'r') as harvest_zip:
        harvest_zip.extractall(target_dir)
These functions open up a harvest and convert the results.csv
into a dataframe for analysis.
def get_latest_harvest(data_dir='data'):
    '''
    Get the timestamp of the most recent harvest.

    The sub-directories of data_dir are sorted by name and the last one is
    returned, so this relies on harvest directory names (generally timestamps)
    sorting in chronological order. Plain files, such as zipped harvests,
    are ignored.

    Parameters:
        data_dir: path of the directory containing the harvest directories
            (defaults to 'data').

    Returns:
        The name of the last harvest directory, as a string.

    Raises:
        FileNotFoundError: if data_dir doesn't exist.
        IndexError: if data_dir doesn't contain any harvest directories.
    '''
    harvests = sorted(entry.name for entry in Path(data_dir).iterdir() if entry.is_dir())
    if not harvests:
        # Same exception type as indexing an empty list, but the message says what's actually wrong
        raise IndexError(f"No harvest directories found in '{data_dir}'")
    return harvests[-1]
def open_harvest_data(timestamp=None):
    '''
    Load the results.csv file from a harvest into a DataFrame.

    Parameters:
        timestamp: name of the harvest directory inside 'data'. If it's not
            supplied, the harvest returned by get_latest_harvest() is used.

    Returns:
        A DataFrame of the harvest's results, with the 'date' column parsed as dates.
    '''
    harvest_name = timestamp or get_latest_harvest()
    results_path = Path('data') / harvest_name / 'results.csv'
    return pd.read_csv(results_path, parse_dates=['date'])
Running open_harvest_data()
without any parameters will load the most recent harvest. To load a different harvest, just supply the name of the directory containing the harvest (this will generally be a timestamp).
# Load the results of the most recent harvest into a dataframe
df = open_harvest_data()
Let's have a peek at the dataset.
# Preview the dataset -- .head() displays the first 5 rows of a dataframe
df.head()
How many articles did we harvest?
# Each row of the dataframe is one article, so the number of rows is the number of articles
len(df)
What's the earliest and latest publication date in the dataset?
# Earliest publication date in the dataset
df['date'].min()
# Latest publication date in the dataset
df['date'].max()
How many different newspapers are represented in our dataset?
# Count the distinct newspaper ids (dropna=False counts a missing id as a value, just like unique() does)
df['newspaper_id'].nunique(dropna=False)
Which article has the most words?
# Find the index label of the row with the largest value in the 'words' column, then display that row
longest_article_index = df['words'].idxmax()
df.loc[longest_article_index]
Here we'll visualise the 25 most common newspapers in the dataset.
# Count the number of articles for each newspaper title / id combination
df_newspapers = (
    df.value_counts(['newspaper_title', 'newspaper_id'])
    .to_frame()
    .reset_index()
)
# Give the columns consistent names -- the last column holds the counts
df_newspapers.columns = ['newspaper_title', 'newspaper_id', 'count']
df_newspapers
# Bar chart of the first 25 rows of df_newspapers, with the bars ordered by number of articles
top_newspapers = df_newspapers[:25]
newspaper_tooltip = [
    alt.Tooltip('newspaper_title:N', title='Newspaper'),
    alt.Tooltip('count:Q', title='Articles')
]
alt.Chart(top_newspapers).mark_bar().encode(
    x=alt.X('count:Q', title='Number of articles'),
    y=alt.Y('newspaper_title:N', title='Newspaper', sort='-x'),
    tooltip=newspaper_tooltip
)
# Add a column containing just the year of publication
df['year'] = df['date'].dt.year
# Count the number of articles published in each year
df_years = (
    df['year']
    .value_counts()
    .to_frame()
    .reset_index()
)
df_years.columns = ['year', 'count']
df_years
# Line chart of the number of articles per year
year_tooltip = [
    alt.Tooltip('year', title='Year'),
    alt.Tooltip('count', title='Articles', format=',d')
]
alt.Chart(df_years).mark_line().encode(
    # format='d' displays the years as plain integers
    x=alt.X('year:Q', axis=alt.Axis(format='d')),
    y=alt.Y('count:Q'),
    tooltip=year_tooltip
).properties(width=700)
# Leave out articles whose title is just 'No Title' or 'Advertising'
has_generic_title = df['title'].isin(['No Title', 'Advertising'])
df_titles = df.loc[~has_generic_title]
# Lowercase all the remaining titles and join them into a single string
title_text = df_titles['title'].str.lower().str.cat(sep=' ')
# Generate a word cloud from the combined titles and display it as an image
wordcloud = WordCloud(width=800, height=500, collocations=True)
wordcloud.generate(title_text)
display(wordcloud.to_image())
# Count how often each word appears in the article titles.
# title_text was lowercased when it was built, so there's no need to lowercase the blob again.
blob = TextBlob(title_text)
# Checking membership of a set is much faster than scanning the stopwords list for every word
stopword_set = set(stopwords)
word_counts = [[word, count] for word, count in blob.word_counts.items() if word not in stopword_set]
# Keep the 25 most frequent words
word_counts = sorted(word_counts, key=itemgetter(1), reverse=True)[:25]
# Display the results as a table, with a bar in each 'count' cell
(
    pd.DataFrame(word_counts, columns=['word', 'count'])
    .style.format({'count': '{:,}'})
    .bar(subset=['count'], color='#d65f5f')
    .set_properties(subset=['count'], **{'width': '300px'})
)
This makes use of a spreadsheet file that maps Trove newspaper titles to locations. Once we've loaded the spreadsheet we can use it to locate all of the harvested articles.
# URL of the Trove places spreadsheet
trove_places = 'https://docs.google.com/spreadsheets/d/1rURriHBSf3MocI8wsdl1114t0YeyU0BVSXWeg232MZs/gviz/tq?tqx=out:csv&sheet=198244298'
# Load the spreadsheet's CSV data into a dataframe
place_df = pd.read_csv(trove_places)
# Match each newspaper in the harvest with its row in the places data, keeping every harvested newspaper
df_located = df_newspapers.merge(place_df, how='left', left_on='newspaper_id', right_on='title_id')
# There may be some newspapers that haven't been added to the locations dataset yet, so we'll drop them
df_located = df_located.dropna(axis=0, subset=['latitude'])
# Load Australian boundaries
australia = alt.topo_feature(
    'https://raw.githubusercontent.com/GLAM-Workbench/trove-newspapers/master/data/aus_state.geojson',
    feature='features'
)

# The background and the points are drawn at the same size so that they line up when layered
map_size = {'width': 600, 'height': 600}

# Create the map of Australia using the boundaries, styled in light gray with white borders
aus_background = (
    alt.Chart(australia)
    .mark_geoshape(fill='lightgray', stroke='white')
    .project('equirectangular')
    .properties(**map_size)
)

# Scale the size of each circle by the number of articles
article_size = alt.Size(
    'count:Q',
    scale=alt.Scale(range=[0, 1000]),
    legend=alt.Legend(title='Number of articles')
)

# More details on hover
hover_details = [alt.Tooltip('newspaper_title_x', title='Newspaper'), 'latitude', 'longitude', 'count']

# Plot the places, positioning each one using its lat and lon
points = (
    alt.Chart(df_located)
    .mark_circle(color='steelblue')
    .encode(
        longitude='longitude:Q',
        latitude='latitude:Q',
        size=article_size,
        tooltip=hover_details
    )
    .properties(**map_size)
)

# Combine map and points
alt.layer(aus_background, points)
Created by Tim Sherratt (@wragge) for the GLAM Workbench.
Support this project by becoming a GitHub sponsor.