#!/usr/bin/env python
# coding: utf-8

# # Analyse a series

# In[1]:

import os
import pandas as pd
from IPython.display import Image as DImage
from IPython.core.display import display, HTML
import series_details

# Plotly helps us make pretty charts
import plotly.offline as py
import plotly.graph_objs as go

# This lets Plotly draw charts in cells
py.init_notebook_mode()

# This notebook is for analysing a series that you've already harvested. If you haven't harvested any data yet, then you need to go back to the ['Harvesting a series' notebook](Harvesting series.ipynb).

# In[2]:

# What series do you want to analyse?
# Insert the series id between the quotes.
series = 'B13'

# In[3]:

# Load the CSV data for the specified series into a dataframe. Parse the dates as dates!
df = pd.read_csv(os.path.join('data', '{}.csv'.format(series.replace('/', '-'))), parse_dates=['start_date', 'end_date'])

# ## Get some summary data
#
# We're going to create a simple summary of some of the main characteristics of the series, as reflected in the harvested files.

# In[20]:

# We're going to assemble some summary data about the series in a 'summary' dictionary
# Let's create the dictionary and add the series identifier
summary = {'series': series}

# In[30]:

# The 'shape' property returns the number of rows and columns. So 'shape[0]' gives us the number of items harvested.
summary['total_items'] = df.shape[0]
print(summary['total_items'])

# In[31]:

# Get the frequency of the different access status categories
summary['access_counts'] = df['access_status'].value_counts().to_dict()
print(summary['access_counts'])

# In[32]:

# Get the number of files that have been digitised
summary['digitised_files'] = len(df.loc[df['digitised_status'] == True])
print(summary['digitised_files'])

# In[33]:

# Get the number of individual pages that have been digitised
summary['digitised_pages'] = df['digitised_pages'].sum()
print(summary['digitised_pages'])

# In[34]:

# Get the earliest start date
summary['date_from'] = df['start_date'].min().year
print(summary['date_from'])

# In[35]:

# Get the latest end date
summary['date_to'] = df['end_date'].max().year
print(summary['date_to'])

# In[36]:

# Let's display all the summary data
print('SERIES: {}'.format(summary['series']))
print('Number of items: {:,}'.format(summary['total_items']))
print('Access status:')
for status, total in summary['access_counts'].items():
    print('    {}: {:,}'.format(status, total))
print('Contents dates: {} to {}'.format(summary['date_from'], summary['date_to']))
print('Digitised files: {:,}'.format(summary['digitised_files']))
print('Digitised pages: {:,}'.format(summary['digitised_pages']))

# Note that a slightly enhanced version of the code above is available in the `series_details` module that you can import into any notebook. So to create a summary of a series you can just:

# In[45]:

# Import the module
import series_details

# Call display_summary() providing the series name and the dataframe
series_details.display_summary(series, df)

# ## Plot the contents dates
#
# Plotting the dates is a bit tricky. Each file can have both a start date and an end date, so if we want to plot the years covered by a file, we need to include all the years between the start and end dates. Also, dates can be recorded at different levels of granularity, from specific days to just years. And sometimes there are no end dates recorded at all – what does this mean?
#
# The code in the cell below does a few things:
#
# * It fills any empty end dates with the start date from the same item. This probably means some content years will be missed, but it's the only date we can be certain of.
# * It loops through all the rows in the dataframe, then for each row it extracts the years between the start and end date. Currently this checks whether 1 January falls within the date range, so if an item has an exact start date after 1 January, I don't think that year will be captured. I need to investigate this further (one possible fix is sketched below).
# * It combines all of the years into one big series and then totals up the frequency of each year.
#
# I'm sure this is not perfect, but it seems to produce useful results.

# In[39]:

# Fill any blank end dates with start dates
df['end_date'] = df['end_date'].fillna(df['start_date'])

# This is a bit tricky.
# For each item we want to find the years that it has content from -- ie start_year <= year <= end_year.
# Then we want to put all the years from all the items together and look at their frequency.
# freq='AS' generates a timestamp at the start (1 January) of each year in the range.
years = pd.concat([pd.date_range(
    start=row.start_date,
    end=row.end_date,
    freq='AS').year.to_series() for row in df.itertuples(index=False)]).value_counts()
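# As a sketch of one way around the 1 January issue noted above, you could take the integer years directly from each item's start and end dates, instead of generating 1 January timestamps. That always includes the start year, even when the start date falls after 1 January. The cell below is just an illustration and isn't used in the rest of the notebook.

# In[ ]:

# Alternative year extraction (a sketch -- not used below):
# take the integer years directly, so the start year is always included
years_alt = pd.concat([pd.Series(range(row.start_date.year, row.end_date.year + 1))
                       for row in df.itertuples(index=False)]).value_counts()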
# In[40]:

# Put the resulting series in a dataframe so it looks pretty.
year_totals = pd.DataFrame(years)

# Sort results by year
year_totals.sort_index(inplace=True)

# In[41]:

# Display the results
year_totals.style.format({0: '{:,}'})

# In[42]:

# Let's graph the frequency of content years
plotly_data = [go.Bar(
    x=year_totals.index.values,  # The years are the index
    y=year_totals[0]
)]

# Add some labels
layout = go.Layout(
    title='Content dates',
    xaxis=dict(
        title='Year'
    ),
    yaxis=dict(
        title='Number of items'
    )
)

# Create a chart
fig = go.Figure(data=plotly_data, layout=layout)
py.iplot(fig, filename='series-dates-bar')

# Note that a slightly enhanced version of the code above is available in the `series_details` module that you can import into any notebook. So to plot the contents dates of a series you can just:

# In[51]:

# Import the module
import series_details

# Call plot_dates() providing the dataframe
series_details.plot_dates(df)

# ## Filter by words in file titles

# In[1]:

# Find titles containing a particular phrase -- in this case 'wife'
# This creates a new dataframe called 'df_filtered'
# Try changing this to filter for other words

search_term = 'wife'
df_filtered = df.loc[df['title'].str.contains(search_term, case=False)].copy()
df_filtered

# In[57]:

# We can plot this filtered dataframe just like the series
series_details.plot_dates(df_filtered)

# In[59]:

# Save the new dataframe as a csv
df_filtered.to_csv(os.path.join('data', '{}-{}.csv'.format(series.replace('/', '-'), search_term)))

# In[99]:

# Find titles containing one of two words -- ie an OR statement
# Try changing this to filter for other words
df_filtered = df.loc[df['title'].str.contains('chinese', case=False) | df['title'].str.contains(r'\bah\b', case=False)].copy()
df_filtered

# ## Filter by date range

# In[66]:

# Pandas parses the year strings as dates when comparing them with the datetime columns
start_year = '1920'
end_year = '1930'
df_filtered = df[(df['start_date'] >= start_year) & (df['end_date'] <= end_year)]
df_filtered

# ## N-gram frequencies in file titles

# In[4]:

# Import TextBlob for text analysis
from textblob import TextBlob
import nltk
stopwords = nltk.corpus.stopwords.words('english')
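# If the cell above raises a `LookupError`, you probably haven't downloaded the NLTK stopwords corpus yet. Running the cell below once will fetch it (it's safe to re-run -- NLTK just checks for an existing copy).

# In[ ]:

# Download the NLTK stopwords corpus (only needed once per environment)
nltk.download('stopwords')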
# In[5]:

# Combine all of the file titles into a single string
title_text = df['title'].str.lower().str.cat(sep=' ')

# In[14]:

# Count the frequency of individual words (ignoring stopwords) and display the top 25
blob = TextBlob(title_text)
words = [[word, count] for word, count in blob.lower().word_counts.items() if word not in stopwords]
word_counts = pd.DataFrame(words).rename({0: 'word', 1: 'count'}, axis=1).sort_values(by='count', ascending=False)
word_counts[:25].style.format({'count': '{:,}'}).bar(subset=['count'], color='#d65f5f').set_properties(subset=['count'], **{'width': '300px'})

# In[53]:

def get_ngram_counts(text, size):
    blob = TextBlob(text)
    # Extract n-grams as WordLists, then convert to a list of strings
    ngrams = [' '.join(ngram).lower() for ngram in blob.lower().ngrams(size)]
    # Convert to dataframe then count values and rename columns
    ngram_counts = pd.DataFrame(ngrams)[0].value_counts().rename_axis('ngram').reset_index(name='count')
    return ngram_counts

def display_top_ngrams(text, size):
    ngram_counts = get_ngram_counts(text, size)
    # Display top 25 results as a bar chart
    display(ngram_counts[:25].style.format({'count': '{:,}'}).bar(subset=['count'], color='#d65f5f').set_properties(subset=['count'], **{'width': '300px'}))

# In[54]:

display_top_ngrams(title_text, 2)

# In[55]:

display_top_ngrams(title_text, 6)

# In[ ]:
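# If you want to reuse the word counts elsewhere, you can save them as a CSV, following the same pattern as the filtered-title export above. The filename here is just an example.

# In[ ]:

# Save the word counts to the data directory
word_counts.to_csv(os.path.join('data', '{}-word-counts.csv'.format(series.replace('/', '-'))), index=False)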