import os
import pandas as pd
from IPython.display import Image as DImage
from IPython.core.display import display, HTML
import series_details
# Plotly helps us make pretty charts
import plotly.offline as py
import plotly.graph_objs as go
# This lets Plotly draw charts in cells
py.init_notebook_mode()
This notebook is for analysing a series that you've already harvested. If you haven't harvested any data yet, then you need to go back to the ['Harvesting a series' notebook](Harvesting series.ipynb).
# What series do you want to analyse?
# Insert the series id between the quotes.
series = 'A6122'
# Load the CSV data for the specified series into a dataframe. Parse the dates as dates!
df = pd.read_csv(os.path.join('data', '{}.csv'.format(series.replace('/', '-'))), parse_dates=['start_date', 'end_date'])
We're going to create a simple summary of some of the main characteristics of the series, as reflected in the harvested files.
# We're going to assemble some summary data about the series in a 'summary' dictionary
# Let's create the dictionary and add the series identifier
summary = {'series': series}
# The 'shape' property returns the number of rows and columns. So 'shape[0]' gives us the number of items harvested.
summary['total_items'] = df.shape[0]
print(summary['total_items'])
# Get the frequency of the different access status categories
summary['access_counts'] = df['access_status'].value_counts().to_dict()
print(summary['access_counts'])
# Get the number of files that have been digitised
summary['digitised_files'] = len(df.loc[df['digitised_status'] == True])
print(summary['digitised_files'])
# Get the number of individual pages that have been digitised
summary['digitised_pages'] = df['digitised_pages'].sum()
print(summary['digitised_pages'])
# Get the earliest start date
start = df['start_date'].min()
try:
summary['date_from'] = start.year
except AttributeError:
summary['date_from'] = None
print(summary['date_from'])
# Get the latest end date
end = df['end_date'].max()
try:
summary['date_to'] = end.year
except AttributeError:
summary['date_to'] = None
print(summary['date_to'])
# Let's display all the summary data
print('SERIES: {}'.format(summary['series']))
print('Number of items: {:,}'.format(summary['total_items']))
print('Access status:')
for status, total in summary['access_counts'].items():
print(' {}: {:,}'.format(status, total))
print('Contents dates: {} to {}'.format(summary['date_from'], summary['date_to']))
print('Digitised files: {:,}'.format(summary['digitised_files']))
print('Digitised pages: {:,}'.format(summary['digitised_pages']))
Note that a slightly enhanced version of the code above is available in the series_details
module that you can import into any notebook. So to create a summary of a series you can just:
# Import the module
import series_details
# Call display_series() providing the series name and the dataframe
series_details.display_summary(series, df)
Plotting the dates is a bit tricky. Each file can have both a start date and an end date. So if we want to plot the years covered by a file, we need to include all the years between the start and end dates. Also dates can be recorded at different levels of granularity, for specific days to just years. And sometimes there are no end dates recorded at all – what does this mean?
The code in the cell below does a few things:
I'm sure this is not perfect, but it seems to produce useful results.
# Fill any blank end dates with start dates
df['end_date'] = df[['end_date']].apply(lambda x: x.fillna(value=df['start_date']))
# This is a bit tricky.
# For each item we want to find the years that it has content from -- ie start_year <= year <= end_year.
# Then we want to put all the years from all the items together and look at their frequency
years = []
for row in df.itertuples(index=False):
try:
years_in_range = pd.date_range(start=row.start_date, end=row.end_date, freq='AS').year.to_series()
except ValueError:
# No start date
pass
else:
years.append(years_in_range)
year_counts = pd.concat(years).value_counts()
# Put the resulting series in a dataframe so it looks pretty.
year_totals = pd.DataFrame(year_counts)
# Sort results by year
year_totals.sort_index(inplace=True)
# Display the results
year_totals.style.format({0: '{:,}'})
# Let's graph the frequency of content years
plotly_data = [go.Bar(
x=year_totals.index.values, # The years are the index
y=year_totals[0]
)]
# Add some labels
layout = go.Layout(
title='Content dates',
xaxis=dict(
title='Year'
),
yaxis=dict(
title='Number of items'
)
)
# Create a chart
fig = go.Figure(data=plotly_data, layout=layout)
py.iplot(fig, filename='series-dates-bar')
Note that a slightly enhanced version of the code above is available in the series_details module that you can import into any notebook. So to create a summary of a series you can just:
# Import the module
import series_details
# Call plot_series() providing the series name and the dataframe
fig = series_details.plot_dates(df)
py.iplot(fig)
# Find titles containing a particular phrase -- in this case 'wife'
# This creates a new dataframe
# Try changing this to filter for other words
search_term = 'wife'
df_filtered = df.loc[df['title'].str.contains(search_term, case=False)].copy()
df_filtered
# We can plot this filtered dataframe just like the series
fig = series_details.plot_dates(df)
py.iplot(fig)
# Save the new dataframe as a csv
df_filtered.to_csv(os.path.join('data', '{}-{}.csv'.format(series.replace('/', '-'), search_term)))
# Find titles containing one of two words -- ie an OR statement
# Try changing this to filter for other words
df_filtered = df.loc[df['title'].str.contains('chinese', case=False) | df['title'].str.contains(r'\bah\b', case=False)].copy()
df_filtered
start_year = '1920'
end_year = '1930'
df_filtered = df[(df['start_date'] >= start_year) & (df['end_date'] <= end_year)]
df_filtered
# Import TextBlob for text analysis
from textblob import TextBlob
import nltk
stopwords = nltk.corpus.stopwords.words('english')
# Combine all of the file titles into a single string
title_text = a = df['title'].str.lower().str.cat(sep=' ')
blob = TextBlob(title_text)
words = [[word, count] for word, count in blob.lower().word_counts.items() if word not in stopwords]
word_counts = pd.DataFrame(words).rename({0: 'word', 1: 'count'}, axis=1).sort_values(by='count', ascending=False)
word_counts[:25].style.format({'count': '{:,}'}).bar(subset=['count'], color='#d65f5f').set_properties(subset=['count'], **{'width': '300px'})
def get_ngram_counts(text, size):
blob = TextBlob(text)
# Extract n-grams as WordLists, then convert to a list of strings
ngrams = [' '.join(ngram).lower() for ngram in blob.lower().ngrams(size)]
# Convert to dataframe then count values and rename columns
ngram_counts = pd.DataFrame(ngrams)[0].value_counts().rename_axis('ngram').reset_index(name='count')
return ngram_counts
def display_top_ngrams(text, size):
ngram_counts = get_ngram_counts(text, size)
# Display top 25 results as a bar chart
display(ngram_counts[:25].style.format({'count': '{:,}'}).bar(subset=['count'], color='#d65f5f').set_properties(subset=['count'], **{'width': '300px'}))
display_top_ngrams(title_text, 2)
display_top_ngrams(title_text, 4)