Analyse a series

In [ ]:
import os
import pandas as pd
from IPython.display import Image as DImage
from IPython.core.display import display, HTML
import series_details

# Plotly helps us make pretty charts
import plotly.offline as py
import plotly.graph_objs as go

# This lets Plotly draw charts in cells
py.init_notebook_mode()

This notebook is for analysing a series that you've already harvested. If you haven't harvested any data yet, then you need to go back to the 'Harvesting a series' notebook.

In [ ]:
# What series do you want to analyse?
# Insert the series id between the quotes.
series = 'A6122'
In [ ]:
# Load the CSV data for the specified series into a dataframe. Parse the dates as dates!
df = pd.read_csv(os.path.join('data', '{}.csv'.format(series.replace('/', '-'))), parse_dates=['start_date', 'end_date'])

Get some summary data

We're going to create a simple summary of some of the main characteristics of the series, as reflected in the harvested files.

In [ ]:
# We're going to assemble some summary data about the series in a 'summary' dictionary
# Let's create the dictionary and add the series identifier
summary = {'series': series}
In [ ]:
# The 'shape' property returns the number of rows and columns. So 'shape[0]' gives us the number of items harvested.
summary['total_items'] = df.shape[0]
print(summary['total_items'])
In [ ]:
# Get the frequency of the different access status categories
summary['access_counts'] = df['access_status'].value_counts().to_dict()
print(summary['access_counts'])
In [ ]:
# Get the number of files that have been digitised
summary['digitised_files'] = len(df.loc[df['digitised_status'] == True])
print(summary['digitised_files'])
In [ ]:
# Get the number of individual pages that have been digitised
summary['digitised_pages'] = df['digitised_pages'].sum()
print(summary['digitised_pages'])
In [ ]:
# Get the earliest start date
start = df['start_date'].min()
try:
    summary['date_from'] = start.year
except AttributeError:
    summary['date_from'] = None
print(summary['date_from'])
In [ ]:
# Get the latest end date
end = df['end_date'].max()
try:
    summary['date_to'] = end.year
except AttributeError:
    summary['date_to'] = None
print(summary['date_to'])
In [ ]:
# Let's display all the summary data
print('SERIES: {}'.format(summary['series']))
print('Number of items: {:,}'.format(summary['total_items']))
print('Access status:')
for status, total in summary['access_counts'].items():
    print('    {}: {:,}'.format(status, total))
print('Contents dates: {} to {}'.format(summary['date_from'], summary['date_to']))
print('Digitised files: {:,}'.format(summary['digitised_files']))
print('Digitised pages: {:,}'.format(summary['digitised_pages']))

Note that a slightly enhanced version of the code above is available in the series_details module that you can import into any notebook. So to create a summary of a series you can just:

In [ ]:
# Import the module
import series_details

# Call display_series() providing the series name and the dataframe
series_details.display_summary(series, df)

Plot the contents dates

Plotting the dates is a bit tricky. Each file can have both a start date and an end date. So if we want to plot the years covered by a file, we need to include all the years between the start and end dates. Also dates can be recorded at different levels of granularity, for specific days to just years. And sometimes there are no end dates recorded at all – what does this mean?

The code in the cell below does a few things:

  • It fills any empty end dates with the start date from the same item. This probably means some content years will be missed, but it's the only date we can be certain of.
  • It loops through all the rows in the dataframe, then for each row it extracts the years between the start and end date. Currently this looks to see if the 1 January is covered by the date range, so if there's an exact start date after 1 January I don't think it will be captured. I need to investigate this further.
  • It combines all of the years into one big series and then totals up the frquency of each year.

I'm sure this is not perfect, but it seems to produce useful results.

In [ ]:
# Fill any blank end dates with start dates
df['end_date'] = df[['end_date']].apply(lambda x: x.fillna(value=df['start_date']))

# This is a bit tricky.
# For each item we want to find the years that it has content from -- ie start_year <= year <= end_year.
# Then we want to put all the years from all the items together and look at their frequency
years = []
for row in df.itertuples(index=False):
    try:
        years_in_range = pd.date_range(start=row.start_date, end=row.end_date, freq='AS').year.to_series()
    except ValueError:
        # No start date
        pass
    else:
        years.append(years_in_range)
year_counts = pd.concat(years).value_counts()
In [ ]:
# Put the resulting series in a dataframe so it looks pretty.
year_totals = pd.DataFrame(year_counts)

# Sort results by year
year_totals.sort_index(inplace=True)
In [ ]:
# Display the results
year_totals.style.format({0: '{:,}'})
In [ ]:
# Let's graph the frequency of content years
plotly_data = [go.Bar(
            x=year_totals.index.values, # The years are the index
            y=year_totals[0]
    )]

# Add some labels
layout = go.Layout(
    title='Content dates',
    xaxis=dict(
        title='Year'
    ),
    yaxis=dict(
        title='Number of items'
    )
)

# Create a chart 
fig = go.Figure(data=plotly_data, layout=layout)
py.iplot(fig, filename='series-dates-bar')

Note that a slightly enhanced version of the code above is available in the series_details module that you can import into any notebook. So to create a summary of a series you can just:

In [ ]:
# Import the module
import series_details

# Call plot_series() providing the series name and the dataframe
fig = series_details.plot_dates(df)
py.iplot(fig)

Filter by words in file titles

In [ ]:
# Find titles containing a particular phrase -- in this case 'wife'
# This creates a new dataframe
# Try changing this to filter for other words

search_term = 'wife'
df_filtered = df.loc[df['title'].str.contains(search_term, case=False)].copy()
df_filtered
In [ ]:
# We can plot this filtered dataframe just like the series
fig = series_details.plot_dates(df)
py.iplot(fig)
In [ ]:
# Save the new dataframe as a csv
df_filtered.to_csv(os.path.join('data', '{}-{}.csv'.format(series.replace('/', '-'), search_term)))
In [ ]:
# Find titles containing one of two words -- ie an OR statement
# Try changing this to filter for other words

df_filtered = df.loc[df['title'].str.contains('chinese', case=False) | df['title'].str.contains(r'\bah\b', case=False)].copy()
df_filtered

Filter by date range

In [ ]:
start_year = '1920'
end_year = '1930'
df_filtered = df[(df['start_date'] >= start_year) & (df['end_date'] <= end_year)]
df_filtered

N-gram frequencies in file titles

In [ ]:
# Import TextBlob for text analysis
from textblob import TextBlob
import nltk
stopwords = nltk.corpus.stopwords.words('english')
In [ ]:
# Combine all of the file titles into a single string
title_text = a = df['title'].str.lower().str.cat(sep=' ')
In [ ]:
blob = TextBlob(title_text)
words = [[word, count] for word, count in blob.lower().word_counts.items() if word not in stopwords]
word_counts = pd.DataFrame(words).rename({0: 'word', 1: 'count'}, axis=1).sort_values(by='count', ascending=False)
word_counts[:25].style.format({'count': '{:,}'}).bar(subset=['count'], color='#d65f5f').set_properties(subset=['count'], **{'width': '300px'})
In [ ]:
def get_ngram_counts(text, size):
    blob = TextBlob(text)
    # Extract n-grams as WordLists, then convert to a list of strings
    ngrams = [' '.join(ngram).lower() for ngram in blob.lower().ngrams(size)]
    # Convert to dataframe then count values and rename columns
    ngram_counts = pd.DataFrame(ngrams)[0].value_counts().rename_axis('ngram').reset_index(name='count')
    return ngram_counts
    
def display_top_ngrams(text, size):
    ngram_counts = get_ngram_counts(text, size)
    # Display top 25 results as a bar chart
    display(ngram_counts[:25].style.format({'count': '{:,}'}).bar(subset=['count'], color='#d65f5f').set_properties(subset=['count'], **{'width': '300px'}))
In [ ]:
display_top_ngrams(title_text, 2)
In [ ]:
display_top_ngrams(title_text, 4)
In [ ]: