Each Sunday I'm automatically harvesting details of files digitised by the National Archives of Australia (NAA) in the previous week. You can view the results in this repository. This notebook analyses the most recent harvest to provide a summary of the results.
import datetime
from urllib.error import HTTPError
import arrow
import pandas as pd
from IPython.display import display
from recordsearch_data_scraper.scrapers import RSSeries
from tqdm.auto import tqdm
# Find the date of the most recent Sunday — the day the weekly harvest runs.
today = arrow.now().to("Australia/Sydney")
# Today is Sunday and it's past 2pm, so this week's harvest should have run
if today.weekday() == 6 and today.time() >= datetime.time(14, 0, 0, 0):
    harvest_day = today
# Otherwise get last Sunday
else:
    # Reuse `today` rather than reading the clock again, so the branch and the
    # shift are guaranteed to start from the same instant. shift(weekday=6)
    # moves forward to the next Sunday (staying put if today is Sunday), so
    # stepping back one week lands on the most recent past Sunday.
    harvest_day = today.shift(weekday=6).shift(weeks=-1)
print(f'Harvested on {harvest_day.format("dddd, D MMMM YYYY")}.')
Harvested on Sunday, 15 January 2023.
# Download the week's harvest results from GitHub; if no file exists for
# that date (HTTP 404), fall back to an empty DataFrame.
csv_url = (
    "https://raw.githubusercontent.com/wragge/naa-recently-digitised/master/data/"
    f'digitised-week-ending-{harvest_day.format("YYYYMMDD")}.csv'
)
try:
    df = pd.read_csv(csv_url)
except HTTPError:
    df = pd.DataFrame()
df.shape
(0, 0)
# Show the ten series with the most newly digitised files (skipped when
# nothing was harvested this week).
if not df.empty:
    df["series"].value_counts().head(10)
if not df.empty:
    # Look up the human-readable title of every series represented in the
    # harvest via the RecordSearch scraper (one request per series).
    cited_series = []
    for series_id in tqdm(list(df["series"].unique())):
        series_data = RSSeries(
            series_id, include_number_digitised=False, include_access_status=False
        ).data
        cited_series.append(
            {"series": series_id, "series_title": series_data["title"]}
        )
    # Attach the titles to the harvested file list.
    df_titles = pd.merge(df, pd.DataFrame(cited_series), how="left", on="series")
    with pd.option_context("display.max_colwidth", 100):
        # Count digitised files per (series, title) pair, most frequent first.
        df_titles = (
            df_titles.value_counts(["series", "series_title"]).to_frame().reset_index()
        )
        df_titles.columns = ["series", "series_title", "total"]
        display(df_titles[:20])
        # Assemble a single summary string covering the top 20 series.
        totals = "".join(
            f"{row.series}, {row.series_title}, {row.total} files digitised; "
            for row in df_titles[:20].itertuples()
        )
        print(totals)