#!/usr/bin/env python
# coding: utf-8

# # Reshaping your newspaper harvest
#
# The Trove Newspaper Harvester downloads the OCRd text of newspaper articles as individual text files – one file for each article. That's great for exploring the content of individual articles in depth, but sometimes you might want to zoom out and aggregate the files into larger chunks. For example, if you're interested in how language changes over time, you might want to create a separate corpus for each year in the results set. Or perhaps you want to examine differences in the way particular newspapers talk about an event by grouping the articles by newspaper. This notebook provides a slice and dice wonder tool for Trove newspaper harvests, enabling you to repackage OCRd text by decade, year, and newspaper title. It saves the results as zip files, concatenated text files, or CSV files with embedded text. These repackaged slices should suit a variety of text analysis tools and questions.

# In[ ]:


import os
import shutil
from pathlib import Path
from zipfile import ZIP_DEFLATED
from zipfile import Path as ZipPath
from zipfile import ZipFile

import pandas as pd
from dotenv import load_dotenv
from natsort import natsorted

from trove_newspaper_harvester.core import Harvester, prepare_query

load_dotenv()


# ## Upload an existing harvest
#
# If you want to reshape a dataset downloaded from a previous run of the Trove Newspaper Harvester, upload the zipped dataset file to the `zips` directory:
#
# - double click the `zips` folder to open it
# - click on the upload icon to select your existing dataset
#
# Once the file has been uploaded to the `zips` directory, run the cell below to unpack the contents into the `data` directory.

# In[ ]:


# Unzip the contents of the `zips` directory and save to the `data` directory
for zip_path in Path("zips").glob("*.zip"):
    zip_file = ZipFile(zip_path)
    # If the archive already contains a top-level directory named after the
    # harvest, extract it straight into `data`
    if ZipPath(zip_file, at=f"{zip_path.stem}/").exists():
        zip_file.extractall("data")
    # Otherwise create a directory named after the harvest and extract into it
    else:
        output_path = Path("data", zip_path.stem)
        output_path.mkdir(exist_ok=True)
        zip_file.extractall(output_path)
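# If you want to confirm that your dataset unpacked as expected, a quick optional check like the sketch below will list the harvest directories now available under `data`:
#
# ``` python
# # List the harvest directories available in the `data` directory
# sorted(p.name for p in Path("data").iterdir() if p.is_dir())
# ```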
# ## Code for the HarvestSlicer

# In[ ]:


class HarvestSlicer:
    def __init__(self, harvest_id, data_dir="data", relevance_percent=None):
        """
        Initialise the Slicer with details of the harvest.
        """
        self.harvest_id = harvest_id
        self.data_path = Path(data_dir, harvest_id)
        self.text_path = Path(self.data_path, "text")
        self.relevance_percent = relevance_percent
        df = pd.read_csv(Path(self.data_path, "results.csv"), low_memory=False)
        df["year"] = df["date"].str.slice(0, 4)
        # If a relevance cut-off is set, only keep articles with relevance
        # scores above the specified percentile
        if relevance_percent:
            df = df.loc[
                df["relevance"] > df["relevance"].quantile(relevance_percent / 100)
            ]
        self.df = df

    def get_years(self):
        """
        Get a list of the years in which articles in the current harvest were published.
        """
        return sorted(self.df["year"].unique())

    def get_titles(self, year=""):
        """
        Get a list of the newspaper titles in which articles in the current harvest
        were published, optionally limited to a single year of publication.

        Parameters:
        - year: only include titles with articles published in this year (optional)
        """
        df = self.df
        # If a year is supplied, only include titles with articles from that year
        if year:
            df = df.loc[df["year"] == str(year)]
        return sorted(df["newspaper_id"].unique())

    def get_top_titles(self, sample_size=10, measure="articles", period=None):
        """
        Get a list of the 'top' titles in which articles in the current harvest were published.
        'Top' can be measured by either the number of articles, or the number of words.

        Parameters:
        - sample_size: number of titles to include
        - measure: how to determine the ranking, either 'articles' or 'words'
        - period: specify a year or decade
        """
        df = self.df.copy()
        if period:
            df = df.loc[df["date"].str.startswith(str(period))]
        if measure == "articles":
            sample = (
                df.groupby("newspaper_id")
                .size()
                .to_frame()
                .reset_index()
                .sort_values(0, ascending=False)[:sample_size]
            )
        elif measure == "words":
            sample = (
                df.groupby("newspaper_id")["words"]
                .sum()
                .to_frame()
                .reset_index()
                .sort_values("words", ascending=False)[:sample_size]
            )
        return sample["newspaper_id"].to_list()

    def slice_by_time_title(self, period=None, unit="year", title=None, save_as="zip"):
        """
        Slice the collection of harvested newspaper articles to create a subset using
        the supplied parameters.

        Parameters:
        - period: value of year or decade, eg: "1950"
        - unit: unit of time, either "year" or "decade"
        - title: newspaper title identifier
        - save_as: how to save the slice, either "zip" or "text"
        """
        relevance = ""
        if self.relevance_percent:
            relevance = f"-relevance-{self.relevance_percent}"
        if period and title:
            glob_pattern = f"{period}*-{title}-*.txt"
            filters = [unit, str(period), "title", str(title)]
            output_path = Path(self.data_path, f"{unit}-title{relevance}")
        elif period:
            glob_pattern = f"{period}*.txt"
            filters = [unit, str(period)]
            output_path = Path(self.data_path, f"{unit}{relevance}")
        elif title:
            glob_pattern = f"*-{title}-*.txt"
            filters = ["title", str(title)]
            output_path = Path(self.data_path, f"title{relevance}")
        else:
            return
        output_path.mkdir(exist_ok=True)
        # Save into a new zip file
        if save_as == "zip":
            zip_path = Path(output_path, f"{self.harvest_id}-{'-'.join(filters)}.zip")
            with ZipFile(zip_path, "w", ZIP_DEFLATED) as zip_file:
                for text_file in self.text_path.glob(glob_pattern):
                    zip_file.write(text_file, text_file.name)
        # Save as one big concatenated text file
        elif save_as == "text":
            with Path(output_path, f"{self.harvest_id}-{'-'.join(filters)}.txt").open(
                "w"
            ) as combined_text:
                for text_file in natsorted(self.text_path.glob(glob_pattern)):
                    combined_text.write(text_file.read_text())
                    combined_text.write("\n\n")

    def slice_titles(self, save_as="zip"):
        """
        Create slices for each newspaper title.

        Parameters:
        - save_as: how to save the slice, either "zip" or "text"
        """
        titles = self.get_titles()
        for title in titles:
            self.slice_by_time_title(title=title, save_as=save_as)

    def slice_top_titles(self, measure="articles", sample_size=10, save_as="zip"):
        """
        Create slices for 'top' newspaper titles.
        'Top' can be measured by either the number of articles, or the number of words.

        Parameters:
        - sample_size: number of titles to include
        - measure: how to determine the ranking, either 'articles' or 'words'
        - save_as: how to save the slice, either "zip" or "text"
        """
        top_titles = self.get_top_titles(measure=measure, sample_size=sample_size)
        for title in top_titles:
            self.slice_by_time_title(title=title, save_as=save_as)

    def slice_years(self, save_as="zip"):
        """
        Create slices by year.

        Parameters:
        - save_as: how to save the slice, either "zip" or "text"
        """
        years = self.get_years()
        for year in years:
            self.slice_by_time_title(period=year, save_as=save_as)

    def slice_decades(self, save_as="zip"):
        """
        Create slices by decade.

        Parameters:
        - save_as: how to save the slice, either "zip" or "text"
        """
        years = self.get_years()
        # The first three digits of a year identify its decade, eg "195" for the 1950s
        decades = sorted(set([str(y)[:3] for y in years]))
        for decade in decades:
            self.slice_by_time_title(period=decade, unit="decade", save_as=save_as)

    def slice_years_titles(self, save_as="zip"):
        """
        Create slices for each combination of newspaper title and year.

        Parameters:
        - save_as: how to save the slice, either "zip" or "text"
        """
        years = self.get_years()
        for year in years:
            titles = self.get_titles(year=year)
            for title in titles:
                self.slice_by_time_title(period=year, title=title, save_as=save_as)

    def slice_years_top_titles(self, measure="articles", sample_size=10, save_as="zip"):
        """
        Create slices for each combination of year and 'top' newspaper titles.
        'Top' can be measured by either the number of articles, or the number of words.

        Parameters:
        - sample_size: number of titles to include
        - measure: how to determine the ranking, either 'articles' or 'words'
        - save_as: how to save the slice, either "zip" or "text"
        """
        years = self.get_years()
        for year in years:
            titles = self.get_top_titles(
                measure=measure, sample_size=sample_size, period=year
            )
            for title in titles:
                self.slice_by_time_title(period=year, title=title, save_as=save_as)
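# The slicing methods below call `get_top_titles()` for you, but you can also use it directly once you've created a slicer (see the next section). As a sketch, something like this would list the identifiers of the five newspapers contributing the most words to your harvest:
#
# ``` python
# # Rank newspapers by total word count and return the top 5 identifiers
# slicer.get_top_titles(sample_size=5, measure="words")
# ```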
# ## Using the Harvest Slicer
#
# To create a new Harvest Slicer run:
#
# ``` python
# slicer = HarvestSlicer("[Your Harvest ID]")
# ```
#
# Substitute your harvest's identifier for `[Your Harvest ID]`. The harvest identifier is the name of the directory containing your harvest. This will usually be a string of numbers representing the date/time when the harvest was started. For example:
#
# ``` python
# slicer = HarvestSlicer("20240522025457")
# ```
#
# By default, harvests are saved in the `data` directory. If your harvest is in a different directory, you need to supply a `data_dir` parameter set to the directory name. For example:
#
# ``` python
# slicer = HarvestSlicer("20240522025457", data_dir="myharvests")
# ```
#
# By default, the HarvestSlicer will operate on *all* the results in the harvested dataset. However, you might want to do some initial filtering by making use of relevance scores. The relevance scores are calculated by Trove's search index and take into account things like where, and how many times, your keywords appear in an article. Use the `relevance_percent` parameter to specify a cut-off value for inclusion. For example, if you set `relevance_percent` to `50`, only articles with relevance scores in the top 50% of scores will be included in your dataset:
#
# ``` python
# slicer = HarvestSlicer("20240522025457", relevance_percent=50)
# ```
#
# Enter your harvest id below and run the cell to create a Harvest Slicer.

# In[ ]:


# Substitute your harvest identifier below
slicer = HarvestSlicer("[Your Harvest ID]")


# Optionally filter your dataset by relevance score.

# In[ ]:


# Substitute your harvest identifier below
# Change relevance_percent to your desired cutoff point
slicer = HarvestSlicer("[Your Harvest ID]", relevance_percent=50)
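# If you want to see how much of the dataset a relevance cut-off removes, a minimal sketch like this compares article counts with and without the filter (substitute your own harvest identifier):
#
# ``` python
# # Compare article counts with and without the relevance cut-off
# unfiltered = HarvestSlicer("[Your Harvest ID]")
# filtered = HarvestSlicer("[Your Harvest ID]", relevance_percent=50)
# print(f"{len(filtered.df)} of {len(unfiltered.df)} articles kept")
# ```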
# ### Slicing by decade or year
#
# You can create slices of harvested articles by year or decade. These slices can be saved as either a zip file containing the individual text files, or as one big text file combining the contents of all the articles.
#
# The slices will be saved in directories named `year` or `decade`. Each slice is named using the harvest identifier and the year or decade. For example, if you sliced the `20240522025457` harvest by year, you'd end up with a `year` directory containing files like `20240522025457-year-1950.txt`.
#
# Slice by year and save the results as zip files (the default):

# In[ ]:


slicer.slice_years()


# Slice by year and save the results as concatenated text files:

# In[ ]:


slicer.slice_years(save_as="text")


# Slice by decade and save the results as zip files (the default):

# In[ ]:


slicer.slice_decades()


# Slice by decade and save the results as concatenated text files:

# In[ ]:


slicer.slice_decades(save_as="text")


# ### Slice by newspaper
#
# You can create slices of harvested articles according to the newspaper in which they were published. These slices can be saved as either a zip file containing the individual text files, or as one big text file combining the contents of all the articles.
#
# There are close to 2,000 different newspapers in Trove. To limit the number of slices, you can choose to only save articles from the 'top' newspapers in the dataset. 'Top' is measured by either the total number of articles, or the total number of words in articles. You can choose how many newspapers in the ranked 'top' list to include.
#
# The slices will be saved in a directory named `title`. Each slice is named using the harvest identifier and the newspaper identifier. For example, if you sliced the `20240522025457` harvest by title, you'd end up with a `title` directory containing files like `20240522025457-title-11.txt` (`11` is Trove's identifier for the *Canberra Times*).
#
# Slice by newspaper title and save the results as zip files (the default):

# In[ ]:


slicer.slice_titles()


# Slice by title and save the results as concatenated text files:

# In[ ]:


slicer.slice_titles(save_as="text")


# Slice by newspaper title and save results from the 10 newspapers with the most articles as zip files (the default settings):

# In[ ]:


slicer.slice_top_titles()


# Slice by newspaper title and save results from the 20 newspapers with the most words as concatenated text files:

# In[ ]:


slicer.slice_top_titles(measure="words", sample_size=20, save_as="text")
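# As a quick example of putting a slice to use, the sketch below loads one of the concatenated `title` files and counts its words. It assumes you've run `slice_titles(save_as="text")` above; substitute your own harvest and title identifiers:
#
# ``` python
# # Read a concatenated slice and report its total word count
# slice_text = Path(
#     "data", "[Your Harvest ID]", "title", "[Your Harvest ID]-title-11.txt"
# ).read_text()
# print(f"{len(slice_text.split()):,} words")
# ```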
# ### Slice by both year and newspaper
#
# You can create slices of harvested articles from each newspaper, published in each year. This means there'll be a slice for each combination of title and year. These slices can be saved as either a zip file containing the individual text files, or as one big text file combining the contents of all the articles.
#
# To limit the number of slices, you can choose to only save articles from the 'top' newspapers in the dataset. 'Top' is measured by either the total number of articles, or the total number of words in articles. You can choose how many newspapers in the ranked 'top' list to include.
#
# The slices will be saved in a directory named `year-title`. Each slice is named using the harvest identifier, the year, and the newspaper identifier. For example, if you sliced the `20240522025457` harvest by year and title, you'd end up with a `year-title` directory containing files like `20240522025457-year-1950-title-11.txt` (`11` is Trove's identifier for the *Canberra Times*).
#
# Slice by year and newspaper title and save the results as zip files (the default):

# In[ ]:


slicer.slice_years_titles()


# Slice by year and newspaper title and save the results as concatenated text files:

# In[ ]:


slicer.slice_years_titles(save_as="text")


# Slice by year and newspaper title and save results from the 10 newspapers with the most articles as zip files (the default settings):

# In[ ]:


slicer.slice_years_top_titles()


# Slice by year and newspaper title and save results from the 20 newspapers with the most words as concatenated text files:

# In[ ]:


slicer.slice_years_top_titles(measure="words", sample_size=20, save_as="text")


# ## Create a CSV file with a subset of results
#
# The `HarvestSlicer` creates new collections of OCRd text. For some purposes it might be more useful to create a subset of the harvested metadata in the `results.csv` file, adding the OCRd text to a new CSV file.
#
# The `filter_results()` function creates a new CSV file with a subset of the original results, filtering by year and/or newspaper title. By default, it will also add the OCRd text from each article to a new `full_text` column, and filter the columns in the dataset to include only `title`, `date`, `page`, `newspaper_title`, `url`, and `full_text`. Both of these defaults can be changed.
#
# The resulting CSV files are saved in the harvest directory. For example, a dataset that was filtered to include results from 1950 published in the *Canberra Times* (id `11`) would be saved as: `filtered-results-year-1950-title-11.csv`.

# In[ ]:


def filter_results(
    harvest_id,
    data_dir="data",
    relevance_percent=None,
    year=None,
    title=None,
    add_text=True,
    fields=None,
):
    """
    Filter an existing results set by year and/or title, adding the OCRd text of each
    individual article to a new `full_text` column.

    Parameters:
    - harvest_id: identifier of the harvest to filter
    - data_dir: location of the harvest (default is 'data')
    - relevance_percent: relevance score cut off
    - year: eg '1950'
    - title: Trove newspaper title identifier, eg '11'
    - add_text: add OCRd text to CSV (default is True)
    - fields: list of fields to include in the resulting CSV

    Result:
    - saves the results as a CSV file
    """
    # Use None as the default for `fields` so that appending `full_text` below
    # doesn't modify a shared default list across calls
    if fields is None:
        fields = ["title", "date", "page", "newspaper_title", "url"]
    data_path = Path(data_dir, harvest_id)
    df = pd.read_csv(Path(data_path, "results.csv"), low_memory=False)
    filters = []
    if relevance_percent:
        df = df.loc[df["relevance"] > df["relevance"].quantile(relevance_percent / 100)]
        filters += ["relevance", str(relevance_percent)]
    if year and title:
        df = df.loc[
            (df["date"].str.startswith(str(year))) & (df["newspaper_id"] == int(title))
        ]
        filters += ["year", str(year), "title", str(title)]
    elif year:
        df = df.loc[df["date"].str.startswith(str(year))]
        filters += ["year", str(year)]
    elif title:
        df = df.loc[df["newspaper_id"] == int(title)]
        filters += ["title", str(title)]
    if add_text:
        df["full_text"] = df["text"].apply(lambda x: Path(data_path, x).read_text())
        fields.append("full_text")
        filters.append("text")
    df[fields].to_csv(
        Path(data_path, f"filtered-results-{'-'.join(filters)}.csv"), index=False
    )


# Create a filtered results set containing articles published in 1950:

# In[ ]:


# Insert your harvest identifier between the quotes
filter_results("[Your harvest id]", year=1950)


# Create a filtered results set containing articles published in the *Canberra Times*:

# In[ ]:


filter_results("[Your harvest id]", title=11)


# Create a filtered results set containing articles published in 1950 in the *Canberra Times*:

# In[ ]:


filter_results("[Your harvest id]", year=1950, title=11)


# Filter the results set using the relevance scores of articles, saving results with scores in the top 50%, without adding the OCRd text to the CSV file.

# In[ ]:


# Insert your harvest identifier between the quotes
filter_results("[Your harvest id]", relevance_percent=50, add_text=False)


# Filter the results set using the relevance scores of articles, saving results with scores in the top 50%, and adding the OCRd text to the CSV file.

# In[ ]:


# Insert your harvest identifier between the quotes
filter_results("[Your harvest id]", relevance_percent=50)
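# To work with one of the filtered datasets, load the CSV back into pandas. A minimal sketch, assuming you've created the 1950 *Canberra Times* subset above (substitute your own harvest identifier):
#
# ``` python
# # Load a filtered results set, including the `full_text` column
# df_filtered = pd.read_csv(
#     Path("data", "[Your harvest id]", "filtered-results-year-1950-title-11.csv")
# )
# df_filtered.head()
# ```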
# In[ ]:


# IGNORE CELL -- TESTING ONLY

if os.getenv("GW_STATUS") == "dev":
    API_KEY = os.getenv("TROVE_API_KEY")
    query = "https://trove.nla.gov.au/search/category/newspapers?keyword=%22octopus%20intelligence%22"
    params = prepare_query(query=query)
    harvester = Harvester(query_params=params, key=API_KEY, text=True)
    harvester.harvest()
    harvester.save_csv()
    harvest_id = harvester.harvest_dir.name

    slicer = HarvestSlicer(harvest_id)
    slicer.slice_titles()
    slicer.slice_years_top_titles(measure="words", sample_size=20, save_as="text")
    slicer.slice_years()
    filter_results(harvest_id, year=1946)

    slicer = HarvestSlicer(harvest_id, relevance_percent=50)
    slicer.slice_titles()
    slicer.slice_years_top_titles(measure="words", sample_size=20, save_as="text")
    slicer.slice_years()
    filter_results(harvest_id, year=1946, relevance_percent=50)

    shutil.rmtree(harvester.harvest_dir)


# ----
#
# Created by [Tim Sherratt](https://timsherratt.au) for the [GLAM Workbench](https://glam-workbench.net/).