#!/usr/bin/env python
# coding: utf-8

# # Harvest summary data from Trove lists
#
# Using the Trove API we'll harvest some information about Trove lists and create a dataset containing the following fields:
#
# * `id` — the list identifier; you can use this to get more information about a list from either the web interface or the API
# * `title`
# * `number_items` — the number of items in the list
# * `created` — the date the list was created
# * `updated` — the date the list was last updated
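#
# Each harvested record will look something like the dictionary below. The values shown are made up, purely to illustrate the shape of the data:
#
# ```python
# {
#     "id": "12345",
#     "title": "My example list",
#     "number_items": 20,
#     "created": "2020-01-01T00:00:00Z",
#     "updated": "2022-07-05T00:00:00Z",
# }
# ```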

# If you haven't used one of these notebooks before, they're basically web pages in which you can write, edit, and run live code. They're meant to encourage experimentation, so don't feel nervous. Just try running a few cells and see what happens!
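#
# The cells below assume the imported packages are already installed in your environment. If they're not, you should be able to install them with something like the following (package names inferred from the imports, so check against your own environment's requirements):
#
# ```
# %pip install altair nltk pandas requests textblob tqdm wordcloud python-dotenv
# ```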
# ## Setting up...

# In[13]:


import datetime
import os
import time
import warnings
from json import JSONDecodeError
from operator import itemgetter

warnings.simplefilter(action="ignore", category=FutureWarning)

import altair as alt
import nltk
import pandas as pd
import requests
from IPython.display import HTML, display
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from textblob import TextBlob
from tqdm.auto import tqdm
from wordcloud import WordCloud

nltk.download("stopwords")
nltk.download("punkt")

# Use a session with automatic retries so transient server errors
# don't derail the harvest
s = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
s.mount("http://", HTTPAdapter(max_retries=retries))
s.mount("https://", HTTPAdapter(max_retries=retries))


# In[3]:


get_ipython().run_cell_magic('capture', '', '# Load variables from the .env file if it exists\n# Use %%capture to suppress messages\n%load_ext dotenv\n%dotenv\n')


# ## Add your Trove API key

# In[2]:


# Insert your Trove API key between the quotes
API_KEY = "YOUR API KEY"

# Use api key value from environment variables if it is available
if os.getenv("TROVE_API_KEY"):
    API_KEY = os.getenv("TROVE_API_KEY")


# ## Set some parameters
#
# You could change the value of `q` if you only want to harvest a subset of lists, as in the example below.

# In[3]:


api_url = "http://api.trove.nla.gov.au/v2/result"
params = {
    "q": " ",
    "zone": "list",
    "encoding": "json",
    "n": 100,
    "s": "*",
    "key": API_KEY,
    "reclevel": "full",
    "bulkHarvest": "true",
}
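

# For example, to harvest only lists whose metadata matches a keyword search, you could set `q` to a search term before running the harvest. The query below is just a hypothetical illustration; substitute whatever search suits your research question.

# In[ ]:


# Hypothetical example: harvest only lists matching a keyword search
# params["q"] = "family history"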
# ## Harvest the data

# In[4]:


def get_total():
    """
    This will enable us to make a nice progress bar...
    """
    response = s.get(api_url, params=params)
    data = response.json()
    return int(data["response"]["zone"][0]["records"]["total"])


# In[5]:


lists = []
total = get_total()
with tqdm(total=total) as pbar:
    # With bulkHarvest set, results are paginated using the nextStart token --
    # keep requesting pages until no token is returned
    while params["s"]:
        response = s.get(api_url, params=params)
        try:
            data = response.json()
        except JSONDecodeError:
            print(response.text)
            print(response.url)
            raise
        else:
            records = data["response"]["zone"][0]["records"]
            try:
                # Update the start parameter to get the next page of results
                params["s"] = records["nextStart"]
            except KeyError:
                # No nextStart token means we've reached the end
                params["s"] = None
            for record in records["list"]:
                lists.append(
                    {
                        "id": record["id"],
                        "title": record.get("title", ""),
                        "number_items": record["listItemCount"],
                        "created": record["created"],
                        "updated": record["lastupdated"],
                    }
                )
            pbar.update(100)
            time.sleep(0.2)


# ## Inspect the results

# In[7]:


# Load past file for testing if in dev
if os.getenv("GW_STATUS") and os.getenv("GW_STATUS") == "dev":
    df = pd.read_csv("data/trove-lists-2022-07-05.csv")
# Otherwise load current harvested data
else:
    df = pd.DataFrame(lists)
df.head()


# In[8]:


df.describe()


# ## Save the harvested data as a CSV file

# In[8]:


csv_file = "data/trove-lists-{}.csv".format(datetime.datetime.now().isoformat()[:10])
df.to_csv(csv_file, index=False)
HTML('<a download href="{}">Download CSV</a>'.format(csv_file))


# ## How many items are in lists?

# In[9]:


total_items = df["number_items"].sum()
print("There are {:,} items in {:,} lists.".format(total_items, df.shape[0]))


# ## What is the biggest list?

# In[10]:


biggest = df.iloc[df["number_items"].idxmax()]
biggest


# In[11]:


display(
    HTML(
        'The biggest list is <a href="https://trove.nla.gov.au/list?id={}">{}</a> with {:,} items.'.format(
            biggest["id"], biggest["title"], biggest["number_items"]
        )
    )
)


# ## When were they created?

# In[14]:


# This makes it possible to include more than 5000 records
# alt.data_transformers.enable('json', urlpath='files')
alt.data_transformers.disable_max_rows()

alt.Chart(df[["created"]]).mark_line().encode(
    x="yearmonth(created):T",
    y="count()",
    tooltip=[
        alt.Tooltip("yearmonth(created):T", title="Month"),
        alt.Tooltip("count()", title="Lists"),
    ],
).properties(width=600)


# ## What words are used in the titles?

# In[15]:


titles = df["title"].str.lower().str.cat(sep=" ")


# In[16]:


# Generate a word cloud image
wordcloud = WordCloud(width=1200, height=800).generate(titles)
wordcloud.to_image()


# ### Word frequency

# In[17]:


blob = TextBlob(titles)
stopwords = nltk.corpus.stopwords.words("english")
word_counts = [
    [word, count]
    for word, count in blob.lower().word_counts.items()
    if word not in stopwords
]
word_counts = sorted(word_counts, key=itemgetter(1), reverse=True)[:25]
pd.DataFrame(word_counts).style.format({1: "{:,}"}).bar(
    subset=[1], color="#d65f5f"
).set_properties(subset=[1], **{"width": "300px"})


# ### Bigram frequency

# In[18]:


ngrams = [" ".join(ngram).lower() for ngram in blob.lower().ngrams(2)]
ngram_counts = (
    pd.DataFrame(ngrams)[0]
    .value_counts()
    .rename_axis("ngram")
    .reset_index(name="count")
)
display(
    ngram_counts[:25]
    .style.format({"count": "{:,}"})
    .bar(subset=["count"], color="#d65f5f")
    .set_properties(subset=["count"], **{"width": "300px"})
)


# ### Trigram frequency

# In[19]:


ngrams = [" ".join(ngram).lower() for ngram in blob.lower().ngrams(3)]
ngram_counts = (
    pd.DataFrame(ngrams)[0]
    .value_counts()
    .rename_axis("ngram")
    .reset_index(name="count")
)
display(
    ngram_counts[:25]
    .style.format({"count": "{:,}"})
    .bar(subset=["count"], color="#d65f5f")
    .set_properties(subset=["count"], **{"width": "300px"})
)


# ----
#
# Created by [Tim Sherratt](https://timsherratt.org/) for the [GLAM Workbench](https://glam-workbench.github.io/).