#!/usr/bin/env python
# coding: utf-8
# # Tribune collection metadata magic
#
# This notebook helps you explore metadata (and images) from the State Library of NSW's Tribune collection.
#
# The metadata has been harvested from the SLNSW catalogue and [saved in CSV format](https://github.com/wragge/dxlab-tribune/blob/master/negatives/csv/all_items.csv). [See this post for more information](http://timsherratt.org/research-notebook/records-resistance/notes/tribune-metadata/) on the structure of the metadata.
#
# Just work your way down the page running each cell by either clicking the play icon that appears when you hover over a cell, or by clicking a cell then hitting **Shift+Enter**.
#
# To edit a cell, just click on it.
# ## Setting things up
# In[1]:
# Import the libraries we need
import pandas as pd
import altair as alt
from altair import datum
from wordcloud import WordCloud
from textblob import TextBlob
import nltk
import matplotlib.pyplot as plt
# Fetch the NLTK data files used later: 'stopwords' for the word clouds,
# 'punkt' for tokenising (TextBlob relies on it)
nltk.download('stopwords')
nltk.download('punkt')
from IPython.display import display, HTML
# Render Altair charts inline in the classic Jupyter notebook
# NOTE(review): the 'notebook' renderer is for the classic notebook + vega
# extension; newer Altair/JupyterLab setups use the default renderer — confirm
# against the environment this notebook runs in.
alt.renderers.enable('notebook')
# ----
#
# ## Let's load some data
#
# The harvested metadata is currently sitting in another GitHub repository. We can load it directly from there using Pandas.
# In[2]:
# Load the CSV file from GitHub.
# This puts the data in a Pandas DataFrame
# (needs an internet connection — the file is fetched over HTTPS each run)
df = pd.read_csv('https://raw.githubusercontent.com/wragge/dxlab-tribune/master/negatives/csv/all_items.csv')
# ### Have a peek...
# In[142]:
# Let's have a look inside...
# Note that both the columns and rows are truncated in this preview
df
# ----
#
# ## Create some summary data
#
# We can use Pandas to give us a quick overview of the dataset.
#
# ### What are the column headings?
# In[223]:
# What are the column headings?
# Can you see which ones are missing from the truncated sample above?
df.columns
# ### How many records are there?
# In[170]:
# How many items? (one row per catalogue item)
len(df)
# ### How many images are there?
# In[144]:
# How many images? Items can have several images each,
# so we sum the per-item image counts.
df['number_images'].sum()
# ### What's the earliest start date?
# In[145]:
# What's the earliest start date?
# NOTE(review): read_csv leaves these columns as strings, so min()/max()
# compare lexicographically — presumably fine because the values look
# ISO-formatted (e.g. '1900-01-01 00:00:00'); confirm against the CSV.
df['date_start'].min()
# ### What's the latest end date?
# In[146]:
# What's the latest end date?
df['date_end'].max()
# ----
#
# ## Exploring topics
#
# Topics are text tags describing features of the photographs. They're more free-form and fine-grained than the subjects, but there can be a bit of overlap and inconsistency.
#
# ### Create a list of unique topics and sort them alphabetically
# In[5]:
# Pull every topic out of the pipe-separated column, then keep
# the unique values in order of first appearance
topic_values = df['topics'].str.split('|', expand=True).stack()
topics = pd.unique(topic_values).tolist()
# Print them in case-insensitive alphabetical order
for topic in sorted(topics, key=str.lower):
    print(topic)
# In[19]:
# Save as a CSV file
topics_df = pd.DataFrame({'topic': topics})
topics_df.to_csv('data/topics.csv', index=False)
display(HTML('Download topics.csv'))
# ### How often is each topic used?
# In[11]:
# Split the multi-value topics field and count how often each topic is used.
# Stacking the expanded split gives one topic per row (missing values are
# dropped), so a single value_counts() replaces the original per-row
# apply/sum — same result, much faster and easier to read.
topic_counts = df['topics'].str.split('|', expand=True).stack().value_counts().to_frame().reset_index()
# Add column names
topic_counts.columns = ['topic', 'count']
# Display with horizontal bars
display(topic_counts.style.bar(subset=['count'], color='#d65f5f').set_properties(subset=['count'], **{'width': '300px'}))
# In[12]:
# Save as a CSV file
topic_counts.to_csv('data/topic_counts.csv', index=False)
display(HTML('Download topic_counts.csv'))
# ### Search for records that include a topic
# In[35]:
# Find records whose topics mention the search term (case-insensitive;
# rows with no topics at all are treated as non-matches)
search_term = 'work'
topic_mask = df['topics'].str.contains(search_term, case=False, na=False)
topics_filtered = df[topic_mask].copy()
topics_filtered
# In[36]:
# Save as a CSV file
topics_filtered.to_csv('data/topics-search-{}.csv'.format(search_term), index=False)
display(HTML('Download topics-search-{0}.csv'.format(search_term)))
# ----
#
# ## Exploring subjects
#
# Subjects are a set of controlled and hierarchical descriptive labels following the [Library of Congress Subject Headings](http://id.loc.gov/authorities/subjects.html).
#
# ### Create a list of unique subject headings and sort them alphabetically
# In[13]:
# Pull every subject heading out of the pipe-separated column, then keep
# the unique values in order of first appearance
subject_values = df['subjects'].str.split('|', expand=True).stack()
subjects = pd.unique(subject_values).tolist()
# Print them in case-insensitive alphabetical order
for subj in sorted(subjects, key=str.lower):
    print(subj)
# In[14]:
# Save as a CSV file
subjects_df = pd.DataFrame({'subject': subjects})
subjects_df.to_csv('data/subjects.csv', index=False)
display(HTML('Download subjects.csv'))
# ### How often is each subject used?
# In[22]:
# Split the multi-value subjects field and count how often each heading is
# used. Stacking the expanded split gives one subject per row (missing
# values are dropped), so a single value_counts() replaces the original
# per-row apply/sum — same result, much faster and easier to read.
subject_counts = df['subjects'].str.split('|', expand=True).stack().value_counts().to_frame().reset_index()
# Add column names
subject_counts.columns = ['subject', 'count']
# Display with horizontal bars
display(subject_counts.style.bar(subset=['count'], color='#d65f5f').set_properties(subset=['count'], **{'width': '300px'}))
# In[23]:
# Save as a CSV file
subject_counts.to_csv('data/subject_counts.csv', index=False)
display(HTML('Download subject_counts.csv'))
# ### Search for records with a particular subject
# In[33]:
# Find records whose subjects mention the search term (case-insensitive;
# rows with no subjects at all are treated as non-matches)
search_term = 'Pine Gap'
subject_mask = df['subjects'].str.contains(search_term, case=False, na=False)
subjects_filtered = df[subject_mask].copy()
subjects_filtered
# In[34]:
# Save as a CSV file
subjects_filtered.to_csv('data/subjects-search-{}.csv'.format(search_term), index=False)
display(HTML('Download subjects-search-{0}.csv'.format(search_term)))
# ----
#
# ## Exploring people and organisations
#
# ### Create a list of unique names and sort them alphabetically
# In[15]:
# Get unique values
people = pd.unique(df['people'].str.split('|', expand=True).stack()).tolist()
# Sort case-insensitively, matching the topics/subjects/places cells
# (the original sorted case-sensitively, so any lower-cased names would
# have sunk below the capitalised ones)
for person in sorted(people, key=str.lower):
    print(person)
# In[20]:
# Save as a CSV file
people_df = pd.DataFrame(people)
people_df.columns = ['people']
people_df.to_csv('data/people.csv', index=False)
display(HTML('Download people.csv'))
# ### How often is each name used?
# In[24]:
# Split the multi-value people field and count how often each name is used.
# Stacking the expanded split gives one name per row (missing values are
# dropped), so a single value_counts() replaces the original per-row
# apply/sum — same result, much faster and easier to read.
people_counts = df['people'].str.split('|', expand=True).stack().value_counts().to_frame().reset_index()
# Add column names
people_counts.columns = ['name', 'count']
# Display with horizontal bars
display(people_counts.style.bar(subset=['count'], color='#d65f5f').set_properties(subset=['count'], **{'width': '300px'}))
# In[25]:
# Save as a CSV file
people_counts.to_csv('data/people_counts.csv', index=False)
display(HTML('Download people_counts.csv'))
# ### Search for names
# In[31]:
# Find records whose people/organisation names mention the search term
# (case-insensitive; rows with no names at all are treated as non-matches)
search_term = 'union'
name_mask = df['people'].str.contains(search_term, case=False, na=False)
people_filtered = df[name_mask].copy()
people_filtered
# In[32]:
# Save as a CSV file
people_filtered.to_csv('data/people-search-{}.csv'.format(search_term), index=False)
display(HTML('Download people-search-{0}.csv'.format(search_term)))
# ----
#
# ## Exploring places
#
# ### Create a list of unique place names and sort them alphabetically
# In[17]:
# Get unique values in multi-value column
places = pd.unique(df['places'].str.split('|', expand=True).stack()).tolist()
for place in sorted(places, key=str.lower):
print(place)
# In[21]:
# Save as a CSV file
places_df = pd.DataFrame(places)
places_df.columns = ['place']
places_df.to_csv('data/places.csv', index=False)
display(HTML('Download places.csv'))
# ### How many times is each place name used?
# In[26]:
# Split the multi-value places field and count how often each place is used.
# Stacking the expanded split gives one place per row (missing values are
# dropped), so a single value_counts() replaces the original per-row
# apply/sum — same result, much faster and easier to read.
place_counts = df['places'].str.split('|', expand=True).stack().value_counts().to_frame().reset_index()
# Add column names
place_counts.columns = ['place', 'count']
# Display with horizontal bars
display(place_counts.style.bar(subset=['count'], color='#d65f5f').set_properties(subset=['count'], **{'width': '300px'}))
# In[27]:
# Save as a CSV file
place_counts.to_csv('data/place_counts.csv', index=False)
display(HTML('Download place_counts.csv'))
# ### Search for a place
# In[28]:
# Find records whose places mention the search term (case-insensitive;
# rows with no places at all are treated as non-matches)
search_term = 'university'
place_mask = df['places'].str.contains(search_term, case=False, na=False)
places_filtered = df[place_mask].copy()
places_filtered
# In[30]:
# Save as a CSV file
places_filtered.to_csv('data/places-search-{}.csv'.format(search_term), index=False)
display(HTML('Download places-search-{0}.csv'.format(search_term)))
# ----
#
# ## Exploring dates
#
# ### Chart the number of records per year
# In[198]:
# Let's make a chart of the start dates!
# Bar chart: one bar per start year, bar height = number of records.
# What happens when you change the :O to :T?
# (:O treats the year as a discrete ordinal category; :T as a temporal axis)
chart = alt.Chart(df).mark_bar().encode(
    x=alt.X('year(date_start):O', title='Year'),
    y=alt.Y('count()', axis=alt.Axis(title='Number of records')),
    tooltip=[alt.Tooltip('year(date_start):O', title='Year'), alt.Tooltip(aggregate='count', type='quantitative', title='Total')]
)
chart
# ### Search for a date
# In[37]:
# Match records whose start date equals this exact value
date = '1900-01-01 00:00:00'
date_mask = df['date_start'] == date
dates_filtered = df[date_mask].copy()
dates_filtered
# In[41]:
# Save as a CSV file (strip the time part from the filename)
date_label = date.replace(' 00:00:00', '')
dates_filtered.to_csv('data/dates-search-{}.csv'.format(date_label), index=False)
display(HTML('Download dates-search-{0}.csv'.format(date_label)))
# ----
#
# ## Exploring titles
#
# ### Make a word cloud
# In[222]:
stopwords = nltk.corpus.stopwords.words('english')
# Leave out common English stopwords plus collection boilerplate
exclude = stopwords + ['tribune', 'negatives', 'including', 'item']
# Concatenate every title into one big lowercase string
titles = df['title'].str.lower().str.cat(sep=' ')
# Build the word cloud and draw it with matplotlib
cloud = WordCloud(width=1200, height=800, stopwords=exclude).generate(titles)
plt.figure(figsize=(12, 8))
plt.imshow(cloud, interpolation='bilinear')
plt.axis("off")
# ----
#
# ## Exploring descriptions
#
# The `description` field contains free text descriptions of the photographs. Sometimes this can include numbered lists relating descriptions to specific images.
#
# ### Make a word cloud
# In[220]:
stopwords = nltk.corpus.stopwords.words('english')
# Cataloguing boilerplate we don't want dominating the cloud
boilerplate = ['image', 'descriptions', 'description', 'provided', 'cataloguer', 'negative', 'sleeve', 'titled', 'original', 'include', 'includes', 'row', 'rows']
exclude = stopwords + boilerplate
# Concatenate every description into one big lowercase string
descriptions = df['description'].str.lower().str.cat(sep=' ')
# Build the word cloud and draw it with matplotlib
cloud = WordCloud(width=1200, height=800, stopwords=exclude).generate(descriptions)
plt.figure(figsize=(12, 8))
plt.imshow(cloud, interpolation='bilinear')
plt.axis("off")
# ### Search descriptions
# In[42]:
# Find records whose description mentions the search term (case-insensitive;
# rows with no description at all are treated as non-matches)
search_term = 'eviction'
desc_mask = df['description'].str.contains(search_term, case=False, na=False)
desc_filtered = df[desc_mask].copy()
desc_filtered
# In[43]:
# Save as a CSV file
desc_filtered.to_csv('data/description-search-{}.csv'.format(search_term), index=False)
display(HTML('Download description-search-{0}.csv'.format(search_term)))
# ----
#
# ## Examine rows and fields
#
# If you've searched in one of the fields you might like to examine a specific row in more detail. Just change `row_number` in the cell below.
#
# ### Get the metadata for a specific row
# In[162]:
# Inspect a row by row number
# Change the row_number to display a different row
# The row number (or index), is in the first column
row_number = 650
df.loc[row_number]
# ### Get the value of a specific field in the selected row
# In[163]:
# Inspect a field in a particular row
# In this case we're looking at the 'description' field
df.loc[row_number]['description']
# ### Create a clickable link to view this record in the SLNSW catalogue
# In[164]:
# Display a clickable url
# NOTE(review): the surrounding <a> markup appears to have been stripped when
# the notebook was converted to a script — as written this renders the bare
# URL text rather than a clickable link; restore the anchor tag from the
# original notebook.
display(HTML('{0}'.format(df.loc[row_number]['url'])))
# ----
#
# ## Examine an image
#
# Just copy an image identifier from the `images` column and paste in below as the value of `image_id`.
# In[165]:
# Pick an image identifier (copy one from the 'images' column)
image_id = 'FL1817297'
# NOTE(review): the HTML template inside format() appears to have been
# stripped during notebook-to-script conversion — it was presumably an <img>
# tag with a {0} placeholder for image_id; as written it formats an empty
# string and displays nothing. Restore the markup from the original notebook.
display(HTML(''.format(image_id)))
# Show the metadata of the first item whose images field contains this id
df.loc[df['images'].str.contains(image_id, case=False, na=False)].copy().iloc[0]
# ----
#
# ## Browse images by item (row)
#
# Just change the `row_number` to view the images associated with an item. Click on the images to enlarge.
#
# The item metadata is displayed below the images.
# In[239]:
# Pick an item by row number
row_number = 900
# Each item's images are stored as a pipe-separated list of identifiers
images = df.loc[row_number]['images'].split('|')
html = ''
# NOTE(review): the HTML template inside format() appears to have been
# stripped during notebook-to-script conversion — each iteration was
# presumably appending an <img>/<a> tag for the image id; as written it
# appends empty strings and displays nothing. Restore the markup.
for image in images:
    html += ''.format(image)
display(HTML(html))
# Show the item's metadata below the images
df.loc[row_number]
# In[3]:
# This cell creates a CSV file with a row for each image, preserving all the parent item metadata
def tidy_split(df, column, sep='|', keep=False):
    """
    Expand a delimited column so each split value gets its own row.

    Rows where `column` is missing are dropped; every other column is
    duplicated onto each of the new rows.

    Params
    ------
    df : pandas.DataFrame
        dataframe with the column to split and expand
    column : str
        the column to split and expand
    sep : str
        the string used to split the column's values
    keep : bool
        whether to retain the pre-split value as its own row (it is only
        added when the value actually splits into more than one part)

    Returns
    -------
    pandas.DataFrame
        Returns a dataframe with the same columns as `df`.
    """
    df = df.dropna(subset=[column])
    # Positional indices into df (repeated once per split value) and the
    # matching split values, built in a single pass.
    row_positions = []
    split_values = []
    for position, raw in enumerate(df[column].astype(str)):
        parts = raw.split(sep)
        if keep and len(parts) > 1:
            row_positions.append(position)
            split_values.append(raw)
        row_positions.extend([position] * len(parts))
        split_values.extend(parts)
    expanded = df.iloc[row_positions, :].copy()
    expanded[column] = split_values
    return expanded
# Use tidy_split() (defined above) to give each image its own row,
# copying the parent item's metadata onto every row
images_df = tidy_split(df, 'images', sep='|')
images_df.head()
# In[4]:
# Save the expanded one-row-per-image table as a CSV file
images_df.to_csv('data/images.csv', index=False)
display(HTML('Download images.csv'))
# In[ ]: