#!/usr/bin/env python
# coding: utf-8

# # Tribune collection metadata magic
# 
# This notebook helps you explore metadata (and images) from the State Library of NSW's Tribune collection.
# 
# The metadata has been harvested from the SLNSW catalogue and [saved in CSV format](https://github.com/wragge/dxlab-tribune/blob/master/negatives/csv/all_items.csv). [See this post for more information](http://timsherratt.org/research-notebook/records-resistance/notes/tribune-metadata/) on the structure of the metadata.
# 
# Just work your way down the page, running each cell either by clicking the play icon that appears when you hover over it, or by clicking the cell and hitting **Shift+Enter**.
# 
# To edit a cell, just click on it.

# ## Setting things up

# In[1]:

# Import the libraries we need
import pandas as pd
import altair as alt
from altair import datum
from wordcloud import WordCloud
from textblob import TextBlob
import nltk
import matplotlib.pyplot as plt
nltk.download('stopwords')
nltk.download('punkt')
from IPython.display import display, HTML
alt.renderers.enable('notebook')

# ----
# 
# ## Let's load some data
# 
# The harvested metadata is currently sitting in another GitHub repository. We can load it directly from there using Pandas.

# In[2]:

# Load the CSV file from GitHub.
# This puts the data in a Pandas DataFrame
df = pd.read_csv('https://raw.githubusercontent.com/wragge/dxlab-tribune/master/negatives/csv/all_items.csv')

# ### Have a peek...

# In[142]:

# Let's have a look inside...
# Note that both the columns and rows are truncated in this preview
df

# ----
# 
# ## Create some summary data
# 
# We can use Pandas to give us a quick overview of the dataset.
# 
# ### What are the column headings?

# In[223]:

# What are the column headings?
# Can you see which ones are missing from the truncated sample above?
df.columns

# ### How many records are there?

# In[170]:

# How many items?
len(df)

# ### How many images are there?

# In[144]:

# How many images?
df['number_images'].sum()

# ### What's the earliest start date?

# In[145]:

# What's the earliest start date?
df['date_start'].min()

# ### What's the latest end date?

# In[146]:

# What's the latest end date?
df['date_end'].max()

# ----
# 
# ## Exploring topics
# 
# Topics are text tags describing features of the photographs. They're more free-form and fine-grained than the subjects, but there can be a bit of overlap and inconsistency.
# 
# ### Create a list of unique topics and sort them alphabetically

# In[5]:

# Get unique values
topics = pd.unique(df['topics'].str.split('|', expand=True).stack()).tolist()
for topic in sorted(topics, key=str.lower):
    print(topic)

# In[19]:

# Save as a CSV file
topics_df = pd.DataFrame(topics)
topics_df.columns = ['topic']
topics_df.to_csv('data/topics.csv', index=False)
display(HTML('<a href="data/topics.csv" download>Download topics.csv</a>'))
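# Because topics are free-form tags, the same topic can turn up with different capitalisation or stray whitespace. As a rough check on that inconsistency, the sketch below normalises case and whitespace before deduplicating -- it assumes those are the only ways otherwise-identical tags vary.

# In[ ]:

# Normalise topic strings before deduplicating (a sketch -- assumes only
# case and surrounding whitespace vary between otherwise-identical tags)
normalised_topics = (df['topics'].str.split('|', expand=True)
                       .stack()
                       .str.strip()
                       .str.lower())
print('{} unique topics after normalising (vs {} before)'.format(
    normalised_topics.nunique(), len(topics)))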
# ### How often is each topic used?

# In[11]:

# Split the topics column and count the frequency of each value
topic_counts = (df['topics'].str.split('|')
                  .apply(lambda x: pd.Series(x).value_counts())
                  .sum()
                  .astype('int')
                  .sort_values(ascending=False)
                  .to_frame()
                  .reset_index(level=0))
# Add column names
topic_counts.columns = ['topic', 'count']
# Display with horizontal bars
display(topic_counts.style.bar(subset=['count'], color='#d65f5f').set_properties(subset=['count'], **{'width': '300px'}))

# In[12]:

# Save as a CSV file
topic_counts.to_csv('data/topic_counts.csv', index=False)
display(HTML('<a href="data/topic_counts.csv" download>Download topic_counts.csv</a>'))

# ### Search for records that include a topic

# In[35]:

# Search topics
search_term = 'work'
topics_filtered = df.loc[df['topics'].str.contains(search_term, case=False, na=False)].copy()
topics_filtered

# In[36]:

# Save as a CSV file
topics_filtered.to_csv('data/topics-search-{}.csv'.format(search_term), index=False)
display(HTML('<a href="data/topics-search-{0}.csv" download>Download topics-search-{0}.csv</a>'.format(search_term)))

# ----
# 
# ## Exploring subjects
# 
# Subjects are a set of controlled, hierarchical descriptive labels following the [Library of Congress Subject Headings](http://id.loc.gov/authorities/subjects.html).
# 
# ### Create a list of unique subject headings and sort them alphabetically

# In[13]:

# Get unique values
subjects = pd.unique(df['subjects'].str.split('|', expand=True).stack()).tolist()
for subj in sorted(subjects, key=str.lower):
    print(subj)

# In[14]:

# Save as a CSV file
subjects_df = pd.DataFrame(subjects)
subjects_df.columns = ['subject']
subjects_df.to_csv('data/subjects.csv', index=False)
display(HTML('<a href="data/subjects.csv" download>Download subjects.csv</a>'))

# ### How often is each subject used?

# In[22]:

# Split the subjects column and count the frequency of each value
subject_counts = (df['subjects'].str.split('|')
                    .apply(lambda x: pd.Series(x).value_counts())
                    .sum()
                    .astype('int')
                    .sort_values(ascending=False)
                    .to_frame()
                    .reset_index(level=0))
# Add column names
subject_counts.columns = ['subject', 'count']
# Display with horizontal bars
display(subject_counts.style.bar(subset=['count'], color='#d65f5f').set_properties(subset=['count'], **{'width': '300px'}))

# In[23]:

# Save as a CSV file
subject_counts.to_csv('data/subject_counts.csv', index=False)
display(HTML('<a href="data/subject_counts.csv" download>Download subject_counts.csv</a>'))

# ### Search for records with a particular subject

# In[33]:

search_term = 'Pine Gap'
subjects_filtered = df.loc[df['subjects'].str.contains(search_term, case=False, na=False)].copy()
subjects_filtered

# In[34]:

# Save as a CSV file
subjects_filtered.to_csv('data/subjects-search-{}.csv'.format(search_term), index=False)
display(HTML('<a href="data/subjects-search-{0}.csv" download>Download subjects-search-{0}.csv</a>'.format(search_term)))
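# Because LCSH headings are hierarchical, a single string can combine a main heading with geographic or topical subdivisions. The sketch below groups records by main heading alone -- it assumes subdivisions are separated by ' -- ', so check a few values in subjects.csv first and adjust if they look different.

# In[ ]:

# Count subjects by their main (top-level) heading.
# Assumes LCSH subdivisions are separated by ' -- ' -- adjust the separator
# if the harvested strings use something else.
main_headings = (df['subjects'].str.split('|', expand=True)
                   .stack()
                   .str.split(' -- ').str[0]
                   .value_counts())
main_headings.head(20)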
# ----
# 
# ## Exploring people and organisations
# 
# ### Create a list of unique names and sort them alphabetically

# In[15]:

# Get unique values
people = pd.unique(df['people'].str.split('|', expand=True).stack()).tolist()
for person in sorted(people):
    print(person)

# In[20]:

# Save as a CSV file
people_df = pd.DataFrame(people)
people_df.columns = ['people']
people_df.to_csv('data/people.csv', index=False)
display(HTML('<a href="data/people.csv" download>Download people.csv</a>'))

# ### How often is each name used?

# In[24]:

# Split the people column and count the frequency of each value
people_counts = (df['people'].str.split('|')
                   .apply(lambda x: pd.Series(x).value_counts())
                   .sum()
                   .astype('int')
                   .sort_values(ascending=False)
                   .to_frame()
                   .reset_index(level=0))
# Add column names
people_counts.columns = ['name', 'count']
# Display with horizontal bars
display(people_counts.style.bar(subset=['count'], color='#d65f5f').set_properties(subset=['count'], **{'width': '300px'}))

# In[25]:

# Save as a CSV file
people_counts.to_csv('data/people_counts.csv', index=False)
display(HTML('<a href="data/people_counts.csv" download>Download people_counts.csv</a>'))

# ### Search for names

# In[31]:

search_term = 'union'
people_filtered = df.loc[df['people'].str.contains(search_term, case=False, na=False)].copy()
people_filtered

# In[32]:

# Save as a CSV file
people_filtered.to_csv('data/people-search-{}.csv'.format(search_term), index=False)
display(HTML('<a href="data/people-search-{0}.csv" download>Download people-search-{0}.csv</a>'.format(search_term)))

# ----
# 
# ## Exploring places
# 
# ### Create a list of unique place names and sort them alphabetically

# In[17]:

# Get unique values in a multi-value column
places = pd.unique(df['places'].str.split('|', expand=True).stack()).tolist()
for place in sorted(places, key=str.lower):
    print(place)

# In[21]:

# Save as a CSV file
places_df = pd.DataFrame(places)
places_df.columns = ['place']
places_df.to_csv('data/places.csv', index=False)
display(HTML('<a href="data/places.csv" download>Download places.csv</a>'))

# ### How many times is each place name used?

# In[26]:

# Split the places column and count the frequency of each value
place_counts = (df['places'].str.split('|')
                  .apply(lambda x: pd.Series(x).value_counts())
                  .sum()
                  .astype('int')
                  .sort_values(ascending=False)
                  .to_frame()
                  .reset_index(level=0))
# Add column names
place_counts.columns = ['place', 'count']
# Display with horizontal bars
display(place_counts.style.bar(subset=['count'], color='#d65f5f').set_properties(subset=['count'], **{'width': '300px'}))

# In[27]:

# Save as a CSV file
place_counts.to_csv('data/place_counts.csv', index=False)
display(HTML('<a href="data/place_counts.csv" download>Download place_counts.csv</a>'))

# ### Search for a place

# In[28]:

search_term = 'university'
places_filtered = df.loc[df['places'].str.contains(search_term, case=False, na=False)].copy()
places_filtered

# In[30]:

# Save as a CSV file
places_filtered.to_csv('data/places-search-{}.csv'.format(search_term), index=False)
display(HTML('<a href="data/places-search-{0}.csv" download>Download places-search-{0}.csv</a>'.format(search_term)))
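# Not every record has values in these multi-value columns, so the searches and counts above only see the records that do. Here's a quick check of how complete each descriptive field is.

# In[ ]:

# How many records have no value at all in each descriptive column?
df[['topics', 'subjects', 'people', 'places']].isnull().sum()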
# ----
# 
# ## Exploring dates
# 
# ### Chart the number of records per year

# In[198]:

# Let's make a chart of the start dates!
# What happens when you change the :O to :T?
chart = alt.Chart(df).mark_bar().encode(
    x=alt.X('year(date_start):O', title='Year'),
    y=alt.Y('count()', axis=alt.Axis(title='Number of records')),
    tooltip=[alt.Tooltip('year(date_start):O', title='Year'),
             alt.Tooltip(aggregate='count', type='quantitative', title='Total')]
)
chart

# ### Search for a date

# In[37]:

date = '1900-01-01 00:00:00'
dates_filtered = df.loc[df['date_start'] == date].copy()
dates_filtered

# In[41]:

# Save as a CSV file
dates_filtered.to_csv('data/dates-search-{}.csv'.format(date.replace(' 00:00:00', '')), index=False)
display(HTML('<a href="data/dates-search-{0}.csv" download>Download dates-search-{0}.csv</a>'.format(date.replace(' 00:00:00', ''))))

# ----
# 
# ## Exploring titles
# 
# ### Make a word cloud

# In[222]:

stopwords = nltk.corpus.stopwords.words('english')
# Words we want to exclude
exclude = stopwords + ['tribune', 'negatives', 'including', 'item']
# Turn all the titles into one big string
titles = df['title'].str.lower().str.cat(sep=' ')
# Generate a word cloud image
wordcloud = WordCloud(width=1200, height=800, stopwords=exclude).generate(titles)
plt.figure(figsize=(12, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')

# ----
# 
# ## Exploring descriptions
# 
# The `description` field contains free-text descriptions of the photographs. Sometimes this includes numbered lists relating descriptions to specific images.
# 
# ### Make a word cloud

# In[220]:

stopwords = nltk.corpus.stopwords.words('english')
# Words we want to exclude
exclude = stopwords + ['image', 'descriptions', 'description', 'provided',
                       'cataloguer', 'negative', 'sleeve', 'titled', 'original',
                       'include', 'includes', 'row', 'rows']
# Add all the descriptions together to make one big string
descriptions = df['description'].str.lower().str.cat(sep=' ')
# Generate a word cloud image
wordcloud = WordCloud(width=1200, height=800, stopwords=exclude).generate(descriptions)
plt.figure(figsize=(12, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')

# ### Search descriptions

# In[42]:

search_term = 'eviction'
desc_filtered = df.loc[df['description'].str.contains(search_term, case=False, na=False)].copy()
desc_filtered

# In[43]:

# Save as a CSV file
desc_filtered.to_csv('data/description-search-{}.csv'.format(search_term), index=False)
display(HTML('<a href="data/description-search-{0}.csv" download>Download description-search-{0}.csv</a>'.format(search_term)))
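# Word clouds are pretty, but hard to read precisely. Since nltk is already imported (with punkt and stopwords downloaded), here's a sketch of a plain frequency table for the same description text -- it assumes simple word tokenisation is good enough, and reuses the `descriptions` string and `exclude` list from the word cloud cell above.

# In[ ]:

# List the most common words in the descriptions, skipping the excluded words
words = [w for w in nltk.word_tokenize(descriptions)
         if w.isalpha() and w not in exclude]
nltk.FreqDist(words).most_common(25)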
# ----
# 
# ## Examine rows and fields
# 
# If you've searched in one of the fields, you might like to examine a specific row in more detail. Just change `row_number` in the cell below.
# 
# ### Get the metadata for a specific row

# In[162]:

# Inspect a row by row number
# Change the row_number to display a different row
# The row number (or index) is in the first column
row_number = 650
df.loc[row_number]

# ### Get the value of a specific field in the selected row

# In[163]:

# Inspect a field in a particular row
# In this case we're looking at the 'description' field
df.loc[row_number]['description']

# ### Create a clickable link to view this record in the SLNSW catalogue

# In[164]:

# Display a clickable url
display(HTML('<a href="{0}">{0}</a>'.format(df.loc[row_number]['url'])))

# ----
# 
# ## Examine an image
# 
# Just copy an image identifier from the `images` column and paste it in below as the value of `image_id`.

# In[165]:

image_id = 'FL1817297'
# The original <img> markup was lost when this notebook was converted to a
# script, so IMAGE_URL below is a placeholder -- swap in the real SLNSW image
# delivery address before running.
IMAGE_URL = 'https://example.org/images/{}.jpg'  # placeholder, not the real address
display(HTML('<img src="{0}">'.format(IMAGE_URL.format(image_id))))
df.loc[df['images'].str.contains(image_id, case=False, na=False)].copy().iloc[0]

# ----
# 
# ## Browse images by item (row)
# 
# Just change the `row_number` to view the images associated with an item. Click on the images to enlarge.
# 
# The item metadata is displayed below the images.

# In[239]:

row_number = 900
images = df.loc[row_number]['images'].split('|')
html = ''
for image in images:
    # Reconstructed markup -- reuses the placeholder IMAGE_URL defined above
    # to show a thumbnail that links to the full-sized image
    html += '<a href="{0}"><img src="{0}" width="200"></a>'.format(IMAGE_URL.format(image))
display(HTML(html))
df.loc[row_number]

# In[3]:

# This cell creates a CSV file with a row for each image, preserving all the parent item metadata

def tidy_split(df, column, sep='|', keep=False):
    """
    Split the values of a column and expand so the new DataFrame has one split
    value per row. Filters rows where the column is missing.

    Params
    ------
    df : pandas.DataFrame
        dataframe with the column to split and expand
    column : str
        the column to split and expand
    sep : str
        the string used to split the column's values
    keep : bool
        whether to retain the presplit value as its own row

    Returns
    -------
    pandas.DataFrame
        Returns a dataframe with the same columns as `df`.
    """
    indexes = list()
    new_values = list()
    df = df.dropna(subset=[column])
    for i, presplit in enumerate(df[column].astype(str)):
        values = presplit.split(sep)
        if keep and len(values) > 1:
            indexes.append(i)
            new_values.append(presplit)
        for value in values:
            indexes.append(i)
            new_values.append(value)
    new_df = df.iloc[indexes, :].copy()
    new_df[column] = new_values
    return new_df

images_df = tidy_split(df, 'images', sep='|')
images_df.head()

# In[4]:

images_df.to_csv('data/images.csv', index=False)
display(HTML('<a href="data/images.csv" download>Download images.csv</a>'))
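# If you're running a recent version of pandas (0.25 or later), the built-in `DataFrame.explode` method does essentially the same job as `tidy_split` above -- a minimal sketch:

# In[ ]:

# One row per image via pandas' explode (pandas 0.25+), keeping the
# parent item's metadata on every row
exploded = (df.dropna(subset=['images'])
              .assign(images=lambda d: d['images'].str.split('|'))
              .explode('images'))
exploded.head()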