#!/usr/bin/env python
# coding: utf-8

# # Beyond the copyright cliff of death
#
# Most of the newspaper articles on Trove were published before 1955, but there are some from the later period. Let's find out how many, and which newspapers they were published in.

# In[1]:


import requests
import pandas as pd
from IPython.display import display, FileLink


# In[2]:


trove_api_key = 'YOUR API KEY'

# Seconds to wait for the Trove API before giving up, so that a stalled
# connection raises an error instead of hanging the script indefinitely.
REQUEST_TIMEOUT = 60


# ## Search for articles published from 1955 onwards
#
# First we're going to run a date query to find all the articles published after 1954. But instead of looking at the articles themselves, we're going to get the `title` facet – this will tell us the number of articles for each newspaper.

# In[3]:


params = {
    'q': 'date:[1955 TO *]',  # date range query
    'zone': 'newspaper',
    'facet': 'title',  # get the newspaper facets
    'encoding': 'json',
    'n': 0,  # no articles thanks
    'key': trove_api_key,
}


# In[4]:


# Make our API request
response = requests.get(
    'https://api.trove.nla.gov.au/v2/result',
    params=params,
    timeout=REQUEST_TIMEOUT,
)
# Stop here with a clear HTTP error (for example, a rejected API key) rather
# than failing later with a confusing JSON or KeyError when indexing the payload.
response.raise_for_status()
data = response.json()


# In[5]:


# Get the facet data: one term per newspaper title that has matching articles
facets = data['response']['zone'][0]['facets']['facet']['term']


# In[6]:


# Convert to a dataframe, keep only the article count and the newspaper
# identifier, give those columns meaningful names, and change id to a string
# so we can merge on it later
df_articles = (
    pd.DataFrame(facets)[['count', 'display']]
    .rename(columns={'count': 'number_of_articles', 'display': 'id'})
    .astype({'id': 'str'})
)

# Preview results
df_articles.head()


# ## Match the facets with newspapers
#
# As you can see from the data above, the `title` facet only gives us the identifier for a newspaper, not its title or date range. To get more information about each newspaper, we're going to get a list of newspapers from the Trove API and then merge the two datasets.

# In[7]:


# Get ALL the newspapers
response = requests.get(
    'https://api.trove.nla.gov.au/v2/newspaper/titles',
    params={'encoding': 'json', 'key': trove_api_key},
    timeout=REQUEST_TIMEOUT,
)
# As above: surface HTTP failures immediately instead of on the first key lookup
response.raise_for_status()
newspapers_data = response.json()


# In[8]:


newspapers = newspapers_data['response']['records']['newspaper']

# Convert to a dataframe
df_newspapers = pd.DataFrame(newspapers)


# In[9]:


# Merge the two dataframes by doing a left join on the 'id' column, so every
# newspaper returned by the facet query is kept even if it has no match in the
# list of newspaper titles
df_newspapers_post54 = pd.merge(df_articles, df_newspapers, how='left', on='id')
df_newspapers_post54.head()


# ## Results

# In[10]:


# How many newspapers?
df_newspapers_post54.shape[0]


# In[11]:


# Reorder columns and save as CSV
csv_columns = [
    'title',
    'state',
    'id',
    'startDate',
    'endDate',
    'issn',
    'number_of_articles',
    'troveUrl',
]
df_newspapers_post54[csv_columns].to_csv('newspapers_post_54.csv', index=False)

# Display a link for easy download
display(FileLink('newspapers_post_54.csv'))


# ----
#
# Created by [Tim Sherratt](https://timsherratt.org/) for the [GLAM Workbench](https://glam-workbench.github.io/).