Using the Trove API we'll harvest some information about Trove lists and create a dataset containing the following fields:

- id — the list identifier; you can use this to get more information about a list from either the web interface or the API
- title — the title of the list
- number_items — the number of items in the list
- created — the date the list was created
- updated — the date the list was last updated

If you haven't used one of these notebooks before, they're basically web pages in which you can write, edit, and run live code. They're meant to encourage experimentation, so don't feel nervous. Just try running a few cells and see what happens!

Some tips:
import datetime
import os
import time
import warnings
from json import JSONDecodeError
from operator import itemgetter
warnings.simplefilter(action="ignore", category=FutureWarning)
import altair as alt
import nltk
import pandas as pd
import requests
from IPython.display import HTML, display
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from textblob import TextBlob
from tqdm.auto import tqdm
from wordcloud import WordCloud
# Fetch the NLTK data needed for stopword removal and tokenisation.
nltk.download("stopwords")
nltk.download("punkt")

# Build a requests session that automatically retries transient server
# errors (up to 5 attempts, exponential backoff) on both schemes.
s = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
adapter = HTTPAdapter(max_retries=retries)
for scheme in ("http://", "https://"):
    s.mount(scheme, adapter)
[nltk_data] Downloading package stopwords to /home/tim/nltk_data... [nltk_data] Package stopwords is already up-to-date! [nltk_data] Downloading package punkt to /home/tim/nltk_data... [nltk_data] Package punkt is already up-to-date!
%%capture
# Load variables from the .env file if it exists
# Use %%capture to suppress messages
%load_ext dotenv
%dotenv
# Insert your Trove API key between the quotes
API_KEY = "YOUR API KEY"
# Use api key value from environment variables if it is available
if os.getenv("TROVE_API_KEY"):
API_KEY = os.getenv("TROVE_API_KEY")
Your API key is: YOUR API KEY
You could change the value of `q` if you only want to harvest a subset of lists.
# Endpoint for version 2 of the Trove API.
# NOTE(review): the API is also served over https; consider switching.
api_url = "http://api.trove.nla.gov.au/v2/result"
params = {
    "q": " ",  # a single space matches everything, so all lists are harvested
    "zone": "list",  # search only the lists zone
    "encoding": "json",
    "n": 100,  # number of records per request
    "s": "*",  # bulk-harvest cursor; replaced by nextStart after each page
    "key": API_KEY,
    "reclevel": "full",  # return full records rather than brief ones
    "bulkHarvest": "true",  # stable ordering suited to complete harvests
}
def get_total():
    """
    Return the total number of list records matching the current query.

    This will enable us to make a nice progress bar...
    """
    # Use the retry-enabled session `s` (the original called requests.get
    # directly, bypassing the retry adapter configured above).
    response = s.get(api_url, params=params)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    data = response.json()
    return int(data["response"]["zone"][0]["records"]["total"])
# Harvest every list record, following the bulkHarvest cursor until the
# API stops returning a nextStart value.
lists = []
total = get_total()
with tqdm(total=total) as pbar:
    while params["s"]:
        # Use the retry-enabled session `s` (the original called
        # requests.get directly, bypassing the retry adapter).
        response = s.get(api_url, params=params)
        try:
            data = response.json()
        except JSONDecodeError:
            # Dump the raw response to help diagnose API hiccups, then re-raise.
            print(response.text)
            print(response.url)
            raise
        else:
            records = data["response"]["zone"][0]["records"]
            # nextStart is absent on the final page of results.
            try:
                params["s"] = records["nextStart"]
            except KeyError:
                params["s"] = None
            page = records.get("list", [])
            for record in page:
                lists.append(
                    {
                        "id": record["id"],
                        "title": record.get("title", ""),
                        "number_items": record["listItemCount"],
                        "created": record["created"],
                        "updated": record["lastupdated"],
                    }
                )
            # Advance by the number of records actually returned — the
            # final page usually holds fewer than 100.
            pbar.update(len(page))
        # Pause briefly between requests to be polite to the API.
        time.sleep(0.2)
0%| | 0/103207 [00:00<?, ?it/s]
# In dev mode, load a previously harvested snapshot for testing;
# otherwise build a DataFrame from the data just harvested.
if os.getenv("GW_STATUS") == "dev":
    df = pd.read_csv("data/trove-lists-2022-07-05.csv")
else:
    df = pd.DataFrame(lists)
df.head()
df.describe()
id | number_items | |
---|---|---|
count | 103207.000000 | 103207.000000 |
mean | 82882.011433 | 19.137752 |
std | 45973.423472 | 84.193921 |
min | 51.000000 | 0.000000 |
25% | 43685.500000 | 1.000000 |
50% | 83407.000000 | 4.000000 |
75% | 121694.500000 | 13.000000 |
max | 163460.000000 | 10351.000000 |
# Save the harvest to a CSV file named with today's date and show a
# download link.
csv_file = f"data/trove-lists-{datetime.date.today().isoformat()}.csv"
df.to_csv(csv_file, index=False)
HTML(f'<a target="_blank" href="{csv_file}">Download CSV</a>')

# Report the overall size of the harvest.
total_items = df["number_items"].sum()
print(f"There are {total_items:,} items in {df.shape[0]:,} lists.")
There are 1,975,150 items in 103,207 lists.
# Find the list with the most items. idxmax() returns an index *label*,
# so select with .loc (label-based) rather than .iloc (position-based) —
# the two only coincide while df still has a default RangeIndex.
biggest = df.loc[df["number_items"].idxmax()]
biggest
id 71461 title Victoria and elsewhere... number_items 10351 created 2015-04-03T11:50:51Z updated 2016-02-22T04:27:12Z Name: 82443, dtype: object
# Show a link to the biggest list in the Trove web interface.
summary = (
    f'The biggest list is <a target="_blank" '
    f'href="https://trove.nla.gov.au/list?id={biggest["id"]}">'
    f'{biggest["title"]}</a> with {biggest["number_items"]:,} items.'
)
display(HTML(summary))
# This makes it possible to include more than 5000 records
# alt.data_transformers.enable('json', urlpath='files')
alt.data_transformers.disable_max_rows()

# Plot the number of lists created per month.
chart = (
    alt.Chart(df[["created"]])
    .mark_line()
    .encode(
        x="yearmonth(created):T",
        y="count()",
        tooltip=[
            alt.Tooltip("yearmonth(created):T", title="Month"),
            alt.Tooltip("count()", title="Lists"),
        ],
    )
    .properties(width=600)
)
chart
# Combine all list titles into a single lowercased string for text analysis.
lowercased = df["title"].str.lower()
titles = lowercased.str.cat(sep=" ")

# Generate a word cloud image from the combined titles.
wordcloud = WordCloud(width=1200, height=800).generate(titles)
wordcloud.to_image()
blob = TextBlob(titles)
# Use a set for O(1) stopword membership tests — checking against the
# original list made each lookup a linear scan over ~180 entries.
stopwords = set(nltk.corpus.stopwords.words("english"))
word_counts = [
    [word, count]
    for word, count in blob.lower().word_counts.items()
    if word not in stopwords
]
# Keep the 25 most frequent words, highest count first.
word_counts = sorted(word_counts, key=itemgetter(1), reverse=True)[:25]
# Display as a styled table with proportional bars in the count column.
pd.DataFrame(word_counts).style.format({1: "{:,}"}).bar(
    subset=[1], color="#d65f5f"
).set_properties(subset=[1], **{"width": "300px"})
0 | 1 | |
---|---|---|
0 | family | 6,971 |
1 | ww1 | 4,307 |
2 | list | 4,300 |
3 | soldier | 4,295 |
4 | articles | 4,196 |
5 | trove | 3,950 |
6 | john | 2,737 |
7 | william | 2,594 |
8 | history | 2,254 |
9 | james | 1,858 |
10 | george | 1,548 |
11 | thomas | 1,507 |
12 | henry | 1,327 |
13 | charles | 1,085 |
14 | australia | 1,021 |
15 | mary | 967 |
16 | australian | 958 |
17 | edward | 833 |
18 | ww2 | 830 |
19 | nee | 816 |
20 | nsw | 786 |
21 | robert | 782 |
22 | nt | 762 |
23 | joseph | 747 |
24 | arthur | 729 |
# Count the most common two-word phrases in the list titles.
ngrams = [" ".join(pair).lower() for pair in blob.lower().ngrams(2)]
ngram_counts = (
    pd.Series(ngrams)
    .value_counts()
    .rename_axis("ngram")
    .reset_index(name="count")
)
# Show the top 25 bigrams as a styled table with proportional bars.
top_ngrams = ngram_counts[:25]
display(
    top_ngrams.style.format({"count": "{:,}"})
    .bar(subset=["count"], color="#d65f5f")
    .set_properties(subset=["count"], **{"width": "300px"})
)
ngram | count | |
---|---|---|
0 | ww1 soldier | 3,957 |
1 | list of | 3,885 |
2 | of articles | 3,858 |
3 | soldier list | 3,847 |
4 | in trove | 3,756 |
5 | articles in | 3,737 |
6 | family history | 1,050 |
7 | nt ww2 | 725 |
8 | family tree | 362 |
9 | of the | 319 |
10 | in the | 294 |
11 | in australia | 277 |
12 | wwi soldier | 271 |
13 | family of | 231 |
14 | william ww1 | 221 |
15 | south australia | 212 |
16 | port lincoln | 202 |
17 | henry ww1 | 194 |
18 | john ww1 | 182 |
19 | maroochydore slsc | 175 |
20 | james ww1 | 161 |
21 | and the | 158 |
22 | world war | 158 |
23 | motor boat | 153 |
24 | george ww1 | 152 |
# Count the most common three-word phrases in the list titles.
ngrams = [" ".join(triple).lower() for triple in blob.lower().ngrams(3)]
ngram_counts = (
    pd.Series(ngrams)
    .value_counts()
    .rename_axis("ngram")
    .reset_index(name="count")
)
# Show the top 25 trigrams as a styled table with proportional bars.
top_ngrams = ngram_counts[:25]
display(
    top_ngrams.style.format({"count": "{:,}"})
    .bar(subset=["count"], color="#d65f5f")
    .set_properties(subset=["count"], **{"width": "300px"})
)
ngram | count | |
---|---|---|
0 | list of articles | 3,847 |
1 | soldier list of | 3,840 |
2 | articles in trove | 3,728 |
3 | of articles in | 3,721 |
4 | ww1 soldier list | 3,563 |
5 | wwi soldier list | 266 |
6 | william ww1 soldier | 219 |
7 | henry ww1 soldier | 191 |
8 | john ww1 soldier | 180 |
9 | james ww1 soldier | 160 |
10 | george ww1 soldier | 150 |
11 | charles ww1 soldier | 133 |
12 | joseph ww1 soldier | 124 |
13 | edward ww1 soldier | 123 |
14 | of articles on | 118 |
15 | articles on trove | 117 |
16 | thomas ww1 soldier | 115 |
17 | andrews of albury | 106 |
18 | cocker spaniel affix | 101 |
19 | arthur andrews of | 100 |
20 | dr arthur andrews | 100 |
21 | ww1 trophy guns | 82 |
22 | music resources theme | 79 |
23 | robert ww1 soldier | 73 |
24 | frederick ww1 soldier | 70 |
Created by Tim Sherratt for the GLAM Workbench.