In this notebook we'll summarise data from all the harvested series.
import os
import pandas as pd
from IPython.core.display import display, HTML
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode()
import series_details
# This is a list of all the series harvested as part of this repository
series_list = ['A6119', 'A6122', 'A6126', 'A9626', 'A6335', 'B2836', 'A8703', 'A13828', 'A6281', 'A6285', 'A6283', 'A6282', 'A6126', 'A9106', 'A9108', 'A9105', 'A12694', 'D1902', 'D1915']
Let's combine summaries of all the harvested series into a single dataframe so we can look at the big picture.
# Create a list to store the summaries
summaries = []
# Loop through the list of series in this repo
for series in series_list:
# Open the CSV of each series harvest as a data frame
df = pd.read_csv(os.path.join('data', '{}.csv'.format(series.replace('/', '-'))), parse_dates=['start_date', 'end_date'])
# Extract a summary of each series and add it to the list of summaries
summaries.append(series_details.make_summary(series, df, include_titles=False))
# Convert the list of summaries into a DataFrame for easy manipulation
df = pd.DataFrame(summaries)
# Flatten the access count dictionaries and fill blanks with zero
df = pd.concat([df, pd.DataFrame((d for idx, d in df['access_counts'].iteritems()))], axis=1).fillna(0)
# Change access counts from floats to integers
df[['Closed', 'Not yet examined', 'Open with exception', 'Open']] = df[['Closed', 'Not yet examined', 'Open with exception', 'Open']].astype(int)
# Delete the old 'access_counts' column
del df['access_counts']
# For convenience acronymise 'Not yet examined' and 'Open with exception'
df.rename({'Not yet examined': 'NYE', 'Open with exception': 'OWE'}, axis=1, inplace=True)
Let's display a summary of each series in a nicely formatted table.
# Get the columns into the order we want
df = df[['series', 'total_items', 'date_from', 'date_to', 'Open', 'OWE', 'NYE', 'Closed', 'digitised_files', 'digitised_pages']]
# Calculate and add a percentage open column
df['% open'] = df['Open'] / df['total_items']
# Calculate and add a percentage digitised column
df['% digitised'] = df['digitised_files'] / df['total_items']
# Add a link to the series name
df['series'] = df['series'].apply(lambda x: '<a href="{}-summary.ipynb">{}</a>'.format(x.replace('/', '-'), x))
# Style the output
(df.style
.set_properties(**{'font-size': '120%'})
.set_properties(subset=['series'], **{'text-align': 'left', 'font-weight': 'bold'})
.format('{:,}', ['total_items', 'Open', 'OWE', 'NYE', 'Closed', 'digitised_files', 'digitised_pages'])
.format('{:.2%}', ['% open', '% digitised'])
# Hide the index
.set_table_styles([dict(selector="th", props=[("font-size", "120%"), ("text-align", "center")]),
dict(selector='.row_heading, .blank', props=[('display', 'none')])])
.background_gradient(cmap='Greens', subset=['% open', '% digitised'], high=0.5)
)
series | total_items | date_from | date_to | Open | OWE | NYE | Closed | digitised_files | digitised_pages | % open | % digitised | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | A6119 | 6,741 | 1852 | 2009 | 43 | 6,314 | 363 | 20 | 2,320 | 258,547 | 0.64% | 34.42% |
1 | A6122 | 2,819 | 1800 | 1993 | 162 | 2,376 | 137 | 138 | 565 | 69,007 | 5.75% | 20.04% |
2 | A6126 | 1,409 | 1800 | 1993 | 83 | 1,306 | 8 | 11 | 364 | 13,521 | 5.89% | 25.83% |
3 | A9626 | 1,075 | 1919 | 1998 | 792 | 277 | 6 | 0 | 570 | 9,370 | 73.67% | 53.02% |
4 | A6335 | 42 | 1922 | 1956 | 38 | 4 | 0 | 0 | 25 | 2,607 | 90.48% | 59.52% |
5 | B2836 | 14 | 1926 | 1972 | 14 | 0 | 0 | 0 | 3 | 375 | 100.00% | 21.43% |
6 | A8703 | 641 | 1937 | 1980 | 328 | 0 | 313 | 0 | 0 | 0 | 51.17% | 0.00% |
7 | A13828 | 12 | 1955 | 1974 | 3 | 0 | 9 | 0 | 0 | 0 | 25.00% | 0.00% |
8 | A6281 | 17 | 0 | 0 | 11 | 1 | 5 | 0 | 0 | 0 | 64.71% | 0.00% |
9 | A6285 | 132 | 1954 | 1955 | 83 | 31 | 17 | 0 | 110 | 186 | 62.88% | 83.33% |
10 | A6283 | 256 | 1800 | 1959 | 21 | 208 | 24 | 3 | 23 | 3,352 | 8.20% | 8.98% |
11 | A6282 | 14 | 1954 | 1956 | 13 | 1 | 0 | 0 | 2 | 328 | 92.86% | 14.29% |
12 | A6126 | 1,409 | 1800 | 1993 | 83 | 1,306 | 8 | 11 | 364 | 13,521 | 5.89% | 25.83% |
13 | A9106 | 1 | 1968 | 1968 | 1 | 0 | 0 | 0 | 0 | 0 | 100.00% | 0.00% |
14 | A9108 | 691 | 1920 | 1967 | 220 | 465 | 2 | 4 | 107 | 9,810 | 31.84% | 15.48% |
15 | A9105 | 1 | 1991 | 1991 | 1 | 0 | 0 | 0 | 0 | 0 | 100.00% | 0.00% |
16 | A12694 | 25 | 1965 | 1986 | 5 | 20 | 0 | 0 | 8 | 669 | 20.00% | 32.00% |
17 | D1902 | 3 | 1920 | 1960 | 3 | 0 | 0 | 0 | 0 | 0 | 100.00% | 0.00% |
18 | D1915 | 4,884 | 1800 | 1987 | 2,703 | 101 | 2,007 | 73 | 203 | 13,917 | 55.34% | 4.16% |
Now let's calculate some aggregate data across the complete set of series harvests.
total_items = df['total_items'].sum()
print('Total items: {:,}'.format(total_items))
Total items: 88,580
date_from = df['date_from'].min()
print('Earliest date: {}'.format(date_from))
Earliest date: 1800
date_to = df['date_to'].max()
print('Latest date: {}'.format(date_to))
Latest date: 2005
access_status = {}
total = df['total_items'].sum()
print('Access status:\n')
for status in ['Open', 'OWE', 'NYE', 'Closed', 'Withheld pending agency advice']:
status_total = df[status].sum()
access_status[status] = status_total
print(' {: <30} {: <10,} ({:.2%})'.format(status, status_total, status_total/total))
Access status: Open 4,607 (22.82%) OWE 12,410 (61.48%) NYE 2,899 (14.36%) Closed 260 (1.29%) Withheld pending agency advice 10.0 (0.05%)
digitised_files = df['digitised_files'].sum()
print('Total files digitised: {:,}'.format(digitised_files))
Total files digitised: 21,762
digitised_pages = df['digitised_pages'].sum()
print('Total pages digitised: {:,}'.format(digitised_pages))
Total pages digitised: 173,120
sdfs = []
for series in series_list:
sdf = pd.read_csv(os.path.join('data', '{}.csv'.format(series.replace('/', '-'))), parse_dates=['start_date', 'end_date'])
sdfs.append(sdf)
combined_df = pd.concat(sdfs)
fig = series_details.plot_dates(combined_df)
py.iplot(fig)
statuses = ['Open', 'OWE', 'NYE', 'Closed', 'Withheld pending agency advice']
totals = []
for status in statuses:
totals.append(df[status].sum())
# Create a pie chart
plot_data = [go.Pie(
labels=statuses,
values=totals
)]
py.iplot(plot_data)