NSW State Archives Index Explorer

NSW State Archives provides a lot of rich descriptive data in its online indexes. But there's so much data it can be hard to understand what's actually in each index. This notebook tries to help by generating an overview of an index, summarising the contents of each field.

When you select an index from the dropdown list, the Index Explorer loads a CSV file containing data harvested from the index. It then looks at each column in the dataset, tries to identify the type of data inside, and attempts to tell you something useful about it.

Given all the possible variations in recording and formatting data, there will be oddities and errors. But hopefully this will provide you with a useful starting point for further exploration.

This notebook is designed to run in 'app mode', which hides all the code and de-clutters the interface. If you see lots of code below, look for an Appmode button in the menu bar above and click it. If there's no button, you can run the Index Explorer using Binder.

The Index Explorer is a slightly modified version of the GLAM CSV Explorer.
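
The column analysis is heuristic rather than definitive. As a rough sketch of the idea (simplified, and not the exact code used below):

import pandas as pd

def guess_column_type(series, unique_cutoff=0.2):
    # A simplified, illustrative version of the checks this notebook applies
    values = series.dropna()
    if values.empty:
        return 'empty'
    if pd.api.types.is_numeric_dtype(values):
        return 'number'
    if values.nunique() / values.size < unique_cutoff:
        return 'category'
    return 'text'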

In [2]:
%%capture
import pandas as pd
from pandas.errors import ParserError
import statistics
import time
import os
from urllib.parse import urlparse, urljoin
from urllib.error import HTTPError
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output
import altair as alt
from wordcloud import WordCloud
from slugify import slugify
alt.renderers.enable('notebook')
alt.data_transformers.enable('json')
In [3]:
%%javascript
// This is necessary to stop the output area folding up
IPython.OutputArea.prototype._should_scroll = function(lines) {return false}
In [4]:
# This is where the results go...
results = widgets.Output()

def read_csv(url, header=0, encoding=0):
    '''
    Loop through some encoding/parsing options to see if we can get the CSV to open properly.
    '''
    encodings = ['ISO-8859-1', 'latin-1']  # fallbacks tried after pandas' default utf-8
    headers = [None]  # fallback: treat the file as having no header row
    try:
        if encoding > 0 and header > 0:
            df = pd.read_csv(url, na_values=['-', ' '], encoding=encodings[encoding-1], header=headers[header-1])
        elif encoding > 0:
            df = pd.read_csv(url, na_values=['-', ' '], encoding=encodings[encoding-1])
        elif header > 0:
            df = pd.read_csv(url, na_values=['-', ' '], header=headers[header-1])
        else:
            df = pd.read_csv(url, na_values=['-', ' '])
    except UnicodeDecodeError:
        if encoding == len(encodings):
            raise
        else:
            return read_csv(url=url, header=header, encoding=encoding+1)
    except ParserError:
        if header == len(headers):
            raise
        else:
            return read_csv(url=url, header=header+1, encoding=encoding)
    else:
        return df
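
# Usage sketch (hypothetical URL; any direct link to a CSV will do):
#   df = read_csv('https://example.com/some-index.csv')
# If the default utf-8 read fails, read_csv() retries with ISO-8859-1 and then
# latin-1; if parsing fails, it retries treating the file as having no header row.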

def analyse_csv(b):
    '''
    Try to open the CSV file, and start the analysis.
    '''
    results.clear_output()
    error = ''
    with results:
        title = select_csv.value
        filename = '{}.csv'.format(slugify(title))
        url = urljoin('https://raw.githubusercontent.com/wragge/srnsw-indexes/master/data/', filename)
        try:
            df = read_csv(url)
        except UnicodeDecodeError:
            error = 'Unicode error: unable to read the CSV!'
        except ParserError:
            error = 'Parser error: unable to read the CSV!'
        except HTTPError:
            error = 'File not found!'
        html = '<hr><h2>{}</h2>'.format(title)
        # html += '<h4>Source</h4><p><a href="{0}">{0}</a></p>'.format(url)
        if error:
            html += '<p class="alert alert-warning">{}</p>'.format(error)
        display(HTML(html))
        if not error:
            rows, cols = df.shape
            size = '<h4>Size</h4><ul>'
            size += '<li>{:,} rows</li>'.format(rows)
            size += '<li>{} columns</li></ul>'.format(cols)
            # Use a separate variable for the column list so we don't clobber the count above
            cols_html = '<h4>Columns</h4><ol>'
            for col in df.columns:
                cols_html += '<li><a style="font-family: monospace" href="#{}">{}</a></li>'.format(slugify(col), col)
            cols_html += '</ol>'
            display(HTML(size))
            display(HTML(cols_html))
            display(HTML('<h4>Sample</h4>'))
            display(df.head())
            analyse_columns(df)
    
In [5]:
# Proportion of values that must look like dates before a column is treated as dates
date_cutoff = 0.8
# General 'most of the values' threshold used in the checks below
cutoff = 0.8
# Intended threshold for mostly-unique columns (not currently used below)
unique_cutoff = 0.2

def display_dates(df, col):
    # Better to group years first, so that the altair data isn't huge
    # Get counts by year
    counts = df[col].groupby([df[col].dt.year]).agg('count').to_frame()
    # Get the full range of years
    years = pd.Index([y for y in range(int(counts.index[0]), int(counts.index[-1]) + 1)])
    # Set missing years to zero
    counts = counts.reindex(years, fill_value=0)
    counts = counts.reset_index()
    counts.columns = [col, 'count']
    chart = alt.Chart(counts).mark_bar().encode(
        x=alt.X('{}:O'.format(col), axis=alt.Axis(format='')),
        y='count:Q',
        tooltip=[alt.Tooltip('{}:O'.format(col), title='Year'), alt.Tooltip('count:Q', title='Count', format=',')],
        color=alt.value('#5254a3')
    )
    display(chart)
    
def display_categories(df, col):
    counts = df[col].value_counts()
    if counts.size > 20:
        counts = counts[:20].to_frame()
    else:
        counts = counts.to_frame()
    counts = counts.reset_index()
    counts.columns = [col, 'count']
    chart = alt.Chart(counts).mark_bar().encode(
        x='count:Q',
        y=alt.Y('{}:N'.format(col), sort=alt.EncodingSortField(field='count', op='sum', order='descending')),
        tooltip=[alt.Tooltip('{}:N'.format(col), title='Category'), alt.Tooltip('count:Q', title='Count', format=',')],
        color=alt.value('#8ca252')
    )
    display(chart)

def display_wordcloud(df, col):
    # Make a word cloud!
    # WordCloud splits the string into individual words and calculates their frequency
    words = df[col].str.cat(sep=' ')
    wordcloud = WordCloud(width=800, height=300, collocations=False).generate(words)
    image = wordcloud.to_image()
    image_file = 'images/{}_cloud_{}.png'.format(slugify(col), int(time.time()))
    try:
        image.save(image_file)
    except FileNotFoundError:
        os.makedirs('images')
        image.save(image_file)
    display(HTML('<a href="{0}"><img src="{0}"></a>'.format(image_file)))
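
# Usage sketch: display_wordcloud(df, 'Occupation') (a hypothetical column name)
# concatenates every value in the column and renders the most frequent words.
# collocations=False stops WordCloud treating common word pairs as single tokens.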

def display_numbers(df, col, unique_count):
    #display(df[col])
    if unique_count <= 20:
        # df[col].replace('0', np.NaN)
        counts = df[col].value_counts().to_frame()
        counts = counts.reset_index()
        counts.columns = [col, 'count']
        #display(counts)
        chart = alt.Chart(counts).mark_bar().encode(
            alt.X('{}:Q'.format(col)),
            y='count',
            tooltip=[alt.Tooltip('{}:Q'.format(col)), alt.Tooltip('count:Q', title='Count', format=',')],
            color=alt.value('#ad494a')
        )
    else:
        chart = alt.Chart(df).mark_bar().encode(
            alt.X('{}:Q'.format(col), bin=alt.Bin(maxbins=10, nice=True)),
            y='count()',
            tooltip=[alt.Tooltip('{}:Q'.format(col), bin=alt.Bin(maxbins=10, nice=True), title='Range'), alt.Tooltip('count():Q', title='Count', format=',')],
            color=alt.value('#ad494a')
        )
    display(chart)
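
# Note: with 20 or fewer distinct values the chart draws one bar per value;
# otherwise Altair bins the column into a histogram of at most 10 ranges.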

def text_field(df, col, value_count, word_counts, details):
    html = 'This looks like a text field.'
    display(HTML(html))
    details['Total number of words'] = word_counts.sum()
    details['Highest number of words'] = word_counts.max()
    details['Median number of words'] = statistics.median(word_counts)
    details['Number of empty records'] = df[col].shape[0] - value_count
    display_details(details)
    # display_wordcloud() builds, saves and displays the image itself
    display_wordcloud(df, col)
    
def textplus_field(df, col, value_count, unique_count, unique_ratio, word_counts, has_year, details, html):
    median_word_count = statistics.median(word_counts)
    mixed = False
    details['Total number of words'] = word_counts.sum()
    details['Highest number of words'] = word_counts.max()
    details['Median number of words'] = median_word_count
    details['Number of empty records'] = df[col].shape[0] - value_count
    display_details(details)
    has_mixed = df[col].str.contains(r'(?=\S*[a-zA-Z\/])(?=\S*[0-9])', regex=True)
    if has_mixed.sum() / value_count > cutoff and median_word_count <= 2:
        mixed = True
        html = '<p>This column contains a small number of words that combine letters and numbers. They\'re probably collection identifiers. Here are some examples:</p><ul>'
        samples = df.loc[df[col].notna()][col].sample(5).to_list()
        for sample in samples:
            html += '<li>{}</li>'.format(sample)
        html += '</ul>'
        display(HTML(html))
    elif unique_ratio > (1 - cutoff):
        if unique_count > 20:
            try:
                wordcloud = display_wordcloud(df, col)
            except ValueError:
                pass
        else:
            display_categories(df, col)
    else:
        html = '<p>Only {:.2%} of the values in this column are unique, so it probably contains categories.</p>'.format(unique_ratio)
        display(HTML(html))
        display_categories(df, col)
    has_number = df[col].str.contains(r'\b\d+\b', regex=True)
    # Check for dates
    if has_year.sum() / value_count > cutoff and mixed is False:
        html = '<p>Most of the values in this column include a number that looks like a year. It might be useful to convert them to dates.</p>'
        df['{}_years_extracted'.format(col)] = df[col].str.extract(r'\b(1[7-9]{1}\d{2}|20[0-1]{1}\d{1})\b')
        if df['{}_years_extracted'.format(col)].nunique(dropna=True) > 1:
            df['{}_date_converted'.format(col)] = pd.to_datetime(df['{}_years_extracted'.format(col)], format='%Y', utc=True)
            html += '<p>{:,} of {:,} values in this column were successfully parsed as dates.</p>'.format(df['{}_date_converted'.format(col)].dropna().size, value_count)
            details = {}
            details['Earliest date'] = df['{}_date_converted'.format(col)].min().strftime('%Y-%m-%d')
            details['Latest date'] = df['{}_date_converted'.format(col)].max().strftime('%Y-%m-%d')
            display(HTML(html))
            display_details(details)
            display_dates(df, '{}_date_converted'.format(col))
    # Check for numbers
    elif has_number.sum() / value_count > cutoff and mixed is False:
        html = '<p>Most of the values in this column include a number. It might be useful to extract the values.</p>'
        df['{}_numbers_extracted'.format(col)] = df[col].str.extract(r'\b(\d+)\b')
        if df['{}_numbers_extracted'.format(col)].nunique(dropna=True) > 2:
            df['{}_numbers_extracted'.format(col)] = pd.to_numeric(df['{}_numbers_extracted'.format(col)], errors='coerce', downcast='integer')
            details = {}
            details['Highest value'] = df['{}_numbers_extracted'.format(col)].max()
            details['Lowest value'] = df['{}_numbers_extracted'.format(col)].dropna().min()
            display(HTML(html))
            display_details(details)
            display_numbers(df, '{}_numbers_extracted'.format(col), unique_count)
        
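# The has_mixed regex above flags tokens mixing letters (or slashes) with digits,
# which in these indexes usually means an archival reference. For example,
# '4/1234' and 'NRS905' match, while 'Sydney' or '1851' alone do not.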
    
def date_field(df, col, value_count, year_count, details, html):
    default_dates = pd.to_datetime(df[col], infer_datetime_format=True, errors='coerce', utc=True)
    default_dates_count = default_dates.dropna().size
    dayfirst_dates = pd.to_datetime(df[col], infer_datetime_format=True, errors='coerce', dayfirst=True, yearfirst=True, utc=True)
    dayfirst_dates_count = dayfirst_dates.dropna().size
    if (default_dates_count / value_count > date_cutoff) and (default_dates_count >= dayfirst_dates_count):
        df['{}_date_converted'.format(col)] = default_dates
    elif (dayfirst_dates_count / value_count > date_cutoff) and (dayfirst_dates_count >= default_dates_count):
        df['{}_date_converted'.format(col)] = dayfirst_dates
    else:
        # It's not a known date format, so let's just get the years
        df['{}_years_extracted'.format(col)] = df[col].str.extract(r'\b(1[7-9]{1}\d{2}|20[0-1]{1}\d{1})\b')
        df['{}_date_converted'.format(col)] = pd.to_datetime(df['{}_years_extracted'.format(col)], format='%Y', utc=True)
    html += '<p>This looks like it contains dates.</p>'
    html += '<p>{:,} of {:,} values in this column were successfully parsed as dates.</p>'.format(df['{}_date_converted'.format(col)].dropna().size, value_count)
    details['Earliest date'] = df['{}_date_converted'.format(col)].min().strftime('%Y-%m-%d')
    details['Latest date'] = df['{}_date_converted'.format(col)].max().strftime('%Y-%m-%d')
    display(HTML(html))
    display_details(details)
    display_dates(df, '{}_date_converted'.format(col))
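
# Why parse twice? A value like '01/02/1900' is ambiguous:
#   pd.to_datetime('01/02/1900')                -> 1900-01-02 (month first)
#   pd.to_datetime('01/02/1900', dayfirst=True) -> 1900-02-01
# Whichever interpretation successfully parses more values is kept.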

def url_field(df, col, details, html):
    display_details(details)
    html += '<p>It looks like this column contains URLs. Here are some examples:</p><ul>'
    samples = df.loc[df[col].notna()][col].sample(5).to_list()
    for sample in samples:
        html += '<li><a href="{0}">{0}</a></li>'.format(sample)
    html += '</ul>'
    display(HTML(html))
    
def unique_field(df, col, details, html):
    display_details(details)
    html += '<p>This column only contains one value:</p>'
    html += '<blockquote>{}</blockquote>'.format(df[col].loc[df[col].first_valid_index()])
    display(HTML(html))
    
def number_field(df, col, value_count, unique_count, unique_ratio, details, html):
    # Select just this column so .size counts matching rows, not every cell in the frame
    has_year = df.loc[(df[col] >= 1700) & (df[col] <= 2019), col]
    if (has_year.size / value_count) > date_cutoff:
        df['{}_date_converted'.format(col)] = pd.to_datetime(df[col], format='%Y', utc=True, errors='coerce')
        html += '<p>This looks like it contains dates.</p>'
        html += '<p>{:,} of {:,} values in this column were successfully parsed as dates.</p>'.format(df['{}_date_converted'.format(col)].dropna().size, value_count)
        details['Earliest date'] = df['{}_date_converted'.format(col)].dropna().min().strftime('%Y-%m-%d')
        details['Latest date'] = df['{}_date_converted'.format(col)].dropna().max().strftime('%Y-%m-%d')
        display(HTML(html))
        display_details(details)
        display_dates(df, '{}_date_converted'.format(col))
    else:
        details['Highest value'] = df[col].max()
        details['Lowest value'] = df[col].dropna().min()
        display_details(details)
        if unique_ratio > cutoff:
            html = '{:.2%} of the values in this column are unique, so it\'s probably some sort of identifier.'.format(unique_ratio)
            display(HTML(html))
        if unique_count <= 20:
            display_categories(df, col)
        else:
            display_numbers(df, col, unique_count)
        #Check for geocoordinates?

def display_details(details):
    details_df = pd.DataFrame.from_dict(details, orient='index', columns=[' '])
    details_df.rename_axis('Summary', axis='columns', inplace=True)
    details_df = details_df.style.set_table_styles([ dict(selector='th', props=[('text-align', 'left')] ) ])
    display(details_df)

def analyse_columns(df):
    enriched_df = df.copy()
    #out = widgets.Output()
    outputs = {}
    for index, col in enumerate(enriched_df.columns):
        display(HTML('<hr><h3 id="{}">{}. <code>{}</code></h3>'.format(slugify(col), index+1, col)))
        details = {}
        html = ''
        # Are there any values in this column
        value_count = enriched_df[col].dropna().size
        details['Number of (non empty) values'] = '{:,} ({:.2%} of rows)'.format(value_count, (value_count / enriched_df[col].size))
        if value_count:
            # How many unique values are there in this column?
            unique_count = enriched_df[col].nunique(dropna=True)
            # What proportion of the values are unique?
            unique_ratio = unique_count / value_count
            details['Number of unique values'] = '{:,} ({:.2%} of non-empty values)'.format(unique_count, unique_ratio)
            if unique_ratio == 1:
                html += '<p>All the values in this column are unique, perhaps it\'s some form of identifier.</p>'
            if unique_count == 1:
                unique_field(enriched_df, col, details, html)
            # Check it's a string field
            elif enriched_df[col].dtype == 'object':
                word_counts = enriched_df[col].dropna().str.split().str.len().fillna(0)
                median_word_count = statistics.median(word_counts)
                # Check for the presence of years
                # year_count = enriched_df[col].str.count(r'\b1[7-9]{1}\d{2}\b|\b20[0-1]{1}\d{1}\b').sum()
                if enriched_df[col].str.startswith('http', na=False).sum() > 1:
                    url_field(enriched_df, col, details, html)
                #elif median_word_count <= 4:
                    # How many have words that combine letters and numbers?
                else:
                    # How many start with words (and no numbers in the first two words)?
                    starts_with_words = enriched_df[col].str.contains(r'^[a-zA-Z]+$|^(?:\b[a-zA-Z]{2,}\b\W*){2}', regex=True)
                    # How many have patterns that look like years?
                    has_year = enriched_df[col].str.contains(r'\b(?:1[7-9]\d{2}|20[0-1]\d)\b', regex=True)
                    # If most don't start with words...
                    # This filters out titles or names that might include dates.
                    if (value_count - starts_with_words.sum()) / value_count > date_cutoff:
                        # If most contain years...
                        if (has_year.sum() / value_count) > date_cutoff:
                            date_field(enriched_df, col, value_count, has_year.sum(), details, html)
                        else:
                            textplus_field(enriched_df, col, value_count, unique_count, unique_ratio, word_counts, has_year, details, html)
                    else:
                        textplus_field(enriched_df, col, value_count, unique_count, unique_ratio, word_counts, has_year, details, html)
            elif enriched_df[col].dtype in ['int64', 'float64']:
                number_field(enriched_df, col, value_count, unique_count, unique_ratio, details, html)
        else:
            html = 'This column is empty.'
            display(HTML(html))
            
def clear_all(b):
    # Reset the dropdown to the first title and clear any previous results
    select_csv.value = options[0]
    csv_url.value = ''
    results.clear_output()
In [7]:
csvs = pd.read_csv('https://raw.githubusercontent.com/wragge/srnsw-indexes/master/indexes.csv')
csvs.sort_values(by=['title'], inplace=True)
options = csvs['title'].tolist()


select_csv = widgets.Dropdown(
        options=options,
        description='',
        disabled=False,
        layout=widgets.Layout(width='80%')
    )

csv_url = widgets.Text(
        placeholder='Enter the url of a CSV file',
        description='Url:',
        disabled=False,
        layout=widgets.Layout(width='100%')
    )

clear_button = widgets.Button(
        description='Clear',
        disabled=False,
        button_style='', # 'success', 'info', 'warning', 'danger' or ''
        tooltip='Clear current data',
        icon=''
    )

analyse_button = widgets.Button(
        description='Analyse CSV',
        disabled=False,
        button_style='primary', # 'success', 'info', 'warning', 'danger' or ''
        tooltip='Analyse CSV',
        icon=''
    )

clear_button.on_click(clear_all)
analyse_button.on_click(analyse_csv)
select_note = widgets.HTML('Select an index:')
select_tab = widgets.VBox([select_note, select_csv])
#tab = widgets.Tab(children=[select_tab])
#tab.set_title(0, 'Select CSV')
display(widgets.VBox([select_tab, widgets.HBox([analyse_button, clear_button]), results]))
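
If you'd rather explore an index directly with pandas, you can skip the widgets and load the harvested CSV yourself. A minimal sketch (the title here is just an example; substitute any value from the dropdown list):

In [ ]:
title = 'Index to convict applications to marry'  # example title only
url = urljoin('https://raw.githubusercontent.com/wragge/srnsw-indexes/master/data/', '{}.csv'.format(slugify(title)))
df = pd.read_csv(url, na_values=['-', ' '])
df.head()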

Created by Tim Sherratt.

Part of the GLAM Workbench project.

Work on this notebook was supported by the Humanities, Arts and Social Sciences (HASS) Data Enhanced Virtual Lab.