In [1]:

# Identifier of the National Archives of Australia series this notebook describes
series = "D1915"

In [2]:

import os
import pandas as pd
import series_details
import plotly.offline as py
# Set up plotly's offline notebook mode so that figures can be rendered inline
py.init_notebook_mode()

In [3]:

# The CSV is named after the series; any '/' in the series id is replaced by '-'
# so that the id can be used as a file name.
csv_path = os.path.join('data', '{}.csv'.format(series.replace('/', '-')))

# Load the harvested item data, parsing the two date columns as datetimes
df = pd.read_csv(
    csv_path,
    parse_dates=['start_date', 'end_date'],
)

In [4]:

# Display a summary of the series: title, item totals, access status,
# digitisation counts and content date range (see the output below)
series_details.display_summary(series, df)

National Archives of Australia: Series D1915

Investigation case files, single number series with 'SA' (South Australia) prefix

Total items	4,884
Access status
Open	2,703 (55.34%)
Not yet examined	2,007 (41.09%)
Open with exception	101 (2.07%)
Closed	73 (1.49%)
Number of items digitised	203 (4.16%)
Number of pages digitised	13,917
Date of earliest content	1800
Date of latest content	1987

Content preview

In [5]:

# Change the number_of_rows value to see more
number_of_rows = 5

# CSS rules for the rendered table: centre the header cells and
# suppress the row-heading / blank corner cells
table_styles = [
    {'selector': 'th', 'props': [('text-align', 'center')]},
    {'selector': '.row_heading, .blank', 'props': [('display', 'none')]},
]

# Display the first rows of the dataframe, with the title column left-aligned
(
    df[:number_of_rows]
    .style
    .set_properties(subset=['title'], **{'text-align': 'left'})
    .set_table_styles(table_styles)
)

Out[5]:

	identifier	series	control_symbol	title	contents_dates	start_date	end_date	access_status	location	digitised_status	digitised_pages
0	277752	D1915	SA20047	Wehrbouern Scheme - Peasant Guards	1939 - 1943	1939-01-01 00:00:00	1943-01-01 00:00:00	Open	Adelaide	False	0
1	323055	D1915	SA13	Circulars [includes instructions for surveillance of Sinn Fein activities]	1917 - 1924	1917-01-01 00:00:00	1924-01-01 00:00:00	Open	Adelaide	True	54
2	323062	D1915	SA26	Intelligence enquiries - co-ordination of [consists mainly of intelligence reports of persons under suspicion in South Australia]	1918 - 1919	1918-01-01 00:00:00	1919-01-01 00:00:00	Open	Adelaide	True	193
3	323065	D1915	SA82	Mormons - movements of	1922 - 1922	1922-01-01 00:00:00	1922-01-01 00:00:00	Open	Adelaide	False	0
4	323069	D1915	SA96	Germans - projected settlement in South Australia	1919 - 1924	1919-01-01 00:00:00	1924-01-01 00:00:00	Open	Adelaide	False	0

Plot content dates

In [6]:

# Build a figure of the items' content dates from the dataframe
fig = series_details.plot_dates(df)
# Render the figure inline via plotly's offline mode
py.iplot(fig, filename='series-dates-bar')

View word frequencies

In [7]:

# Combine all of the file titles into a single lower-cased, space-separated string.
# Missing titles are omitted by str.cat() (its default when na_rep is not given).
title_text = df['title'].str.lower().str.cat(sep=' ')

In [8]:

# Display the most frequent words in the combined titles, with their counts
series_details.display_word_counts(title_text)

Out[8]:

	word	count
137	naturalization	972
120	application	803
189	naturalisation	478
136	sa	401
77	german	313
199	nationality	251
127	enquiry	199
100	also	192
20	australia	179
278	john	137
628	admission	133
1578	giovanni	122
115	adelaide	122
1863	antonio	122
1860	giuseppe	120
180	immigration	117
476	george	110
118	carl	96
1140	whereabouts	89
1983	luigi	82
119	wilhelm	81
605	william	79
221	friedrich	79
282	heinrich	78
102	war	77

In [9]:

# Change ngram_count for larger ngrams (trigrams etc)
ngram_count = 2  # 2 = two-word phrases (bigrams)
# Display the most frequent ngrams of that size in the combined titles, with their counts
series_details.display_top_ngrams(title_text, ngram_count)

	ngram	count
0	application for	734
1	for naturalization	535
2	sa application	338
3	naturalization german	202
4	for naturalisation	161
5	german nationality	146
6	admission of	107
7	by immigration	102
8	enquiry re	83
9	enquiry by	82
10	to australia	74
11	applicant for	71
12	nationality also	63
13	of war	55
14	naturalization syrian	52
15	prisoner of	47
16	adelaide application	44
17	war internee	40
18	also application	39
19	re by	39
20	return to	38
21	on parole	35
22	south australia	34
23	for passport	34
24	to return	33

In [ ]: