import json
import string
import time
from pathlib import Path
import pandas as pd
from recordsearch_data_scraper.scrapers import RSItemSearch, RSSeriesSearch
from tqdm.auto import tqdm
To harvest all of the series data we'll simply loop through the alphabet and search for series identifiers starting with each letter. Note that the recordsearch_data_scraper
caches results. This means that if the harvest fails for some reason, you can just re-run the cell below to start it again and it'll pick up where it left off.
However, this also means that if you want a completely fresh harvest, you should delete the cache_db.sqlite
file before you start.
series = []
for letter in string.ascii_uppercase:
series_results = RSSeriesSearch(record_detail="full", series_id=f"{letter}*")
if series_results.total_results > 0:
with tqdm(total=series_results.total_results, desc=letter) as pbar:
more = True
while more:
data = series_results.get_results()
if data["results"]:
series += data["results"]
pbar.update(len(data["results"]))
time.sleep(0.5)
else:
more = False
time.sleep(1)
How many series did we harvest?
len(series)
The series details include complex relationships which can't be easily saved into a flat file format like CSV. So we'll write all the harvested data to a JSON lines file, where each series description is a JSON object on a single line.
with Path("all_series_april_2022.ndjson").open("w") as series_file:
for s in series:
series_file.write(json.dumps(s) + "\n")
To find the numbers of items digitised, and the access status of items within each series, the scraper fires off an item search. However, RecordSearch returns a maximum of 20,000 results from a search. This means that some values in the harvested data will be '20,000+'. To replace that with something a bit more useful, we have to break the result set into a series of smaller chunks by filtering on the control symbol.
Here we just loop through a list of letters and numbers, adding them to the control symbol symbol search until the results are below 20,000. Then we add the results for all the chunks together to get the total.
First convert the data into a dataframe.
df = pd.json_normalize(series)
control_range = (
[str(number) for number in range(0, 10)]
+ [letter for letter in string.ascii_uppercase]
+ ["/"]
)
def get_results(**kwargs):
s = RSItemSearch(**kwargs)
if s.total_results == "20,000+":
return False
else:
return s.total_results
def refine_controls(current_control, **kwargs):
total_digitised = 0
for control in control_range:
new_control = current_control.strip("*") + control + "*"
# print(new_control)
kwargs["control"] = new_control
total = get_results(**kwargs)
# print(total)
if total is False:
total_digitised += refine_controls(new_control, **kwargs)
else:
total_digitised += total
return total_digitised
def get_large_series_totals(series, digital=None, access=None):
total_digitised = 0
kwargs = {"series": series}
if digital:
kwargs["digital"] = True
if access:
kwargs["access"] = access
for control in control_range:
kwargs["control"] = control + "*"
# print(control1)
total = get_results(**kwargs)
# print(total)
if total is False:
total_digitised += refine_controls(control, **kwargs)
else:
total_digitised += total
return total_digitised
def get_digitised_total(row):
if row["items_digitised"] != "20,000+":
return row["items_digitised"]
else:
print(row["identifier"])
return get_large_series_totals(row["identifier"], digital=True)
def get_open_total(row):
if row["access_status_totals.OPEN"] != "20,000+":
return row["access_status_totals.OPEN"]
else:
print(row["identifier"])
return get_large_series_totals(row["identifier"], access="OPEN")
def get_nye_total(row):
if row["access_status_totals.NYE"] != "20,000+":
return row["access_status_totals.NYE"]
else:
print(row["identifier"])
return get_large_series_totals(row["identifier"], access="NYE")
def get_owe_total(row):
if row["access_status_totals.OWE"] != "20,000+":
return row["access_status_totals.OWE"]
else:
print(row["identifier"])
return get_large_series_totals(row["identifier"], access="OWE")
Now we'll process the fields that have the '20,000+' values.
df["digitised_total"] = df.apply(get_digitised_total, axis=1)
df["access_open_total"] = df.apply(get_open_total, axis=1)
df["access_nye_total"] = df.apply(get_nye_total, axis=1)
df["access_owe_total"] = df.apply(get_owe_total, axis=1)
df.loc[df["digitised_total"] == "20,000+"]
Quantities and locations are stored in a list. Here we'll add up the quantities in the list to get a total quantity for each series.
df["quantity_total"] = (
df["locations"].dropna().apply(lambda l: round(sum([x["quantity"] for x in l]), 2))
)
To save the totals as a CSV, we'll leave out the series relationships with other series and agencies.
df_totals = df.copy()[
[
"identifier",
"title",
"contents_date_str",
"contents_start_date",
"contents_end_date",
"quantity_total",
"items_described_note",
"items_described",
"digitised_total",
"access_open_total",
"access_owe_total",
"access_status_totals.CLOSED",
"access_nye_total",
]
]
Simplify some of the column headings.
df_totals.columns = [
"identifier",
"title",
"contents_date_str",
"contents_start_date",
"contents_end_date",
"quantity_total",
"described_note",
"described_total",
"digitised_total",
"access_open_total",
"access_owe_total",
"access_closed_total",
"access_nye_total",
]
Make sure the totals are stored as integers.
df_totals["described_total"] = df_totals["described_total"].astype("Int64")
df_totals["digitised_total"] = df_totals["digitised_total"].astype("Int64")
df_totals["access_open_total"] = df_totals["access_open_total"].astype("Int64")
df_totals["access_nye_total"] = df_totals["access_nye_total"].astype("Int64")
df_totals["access_owe_total"] = df_totals["access_owe_total"].astype("Int64")
df_totals["access_closed_total"] = df_totals["access_closed_total"].astype("Int64")
Save as a CSV file.
df_totals.to_csv("series_totals_April_2022.csv", index=False)
Created by Tim Sherratt for the GLAM Workbench. Support me by becoming a GitHub sponsor!