# Identifier of the series to analyse; it also determines which CSV file is loaded.
series = 'D1915'
import os
import pandas as pd
import series_details
import plotly.offline as py

# Set up Plotly for notebook output.
py.init_notebook_mode()

# The CSV is named after the series, with any '/' swapped for '-'.
csv_name = '{}.csv'.format(series.replace('/', '-'))
csv_path = os.path.join('data', csv_name)

# Parse the two date columns as datetimes while loading.
df = pd.read_csv(csv_path, parse_dates=['start_date', 'end_date'])

series_details.display_summary(series, df)
Total items | 4,884 |
---|---|
Access status | |
Open | 2,703 (55.34%) |
Not yet examined | 2,007 (41.09%) |
Open with exception | 101 (2.07%) |
Closed | 73 (1.49%) |
Number of items digitised | 203 (4.16%) |
Number of pages digitised | 13,917 |
Date of earliest content | 1800 |
Date of latest content | 1987 |
# Change the number_of_rows value to see more
number_of_rows = 5

# CSS rules for the preview table: centre the header cells and hide the
# index column (row headings plus the blank corner cell).
centred_headers = dict(selector="th", props=[("text-align", "center")])
hidden_index = dict(selector='.row_heading, .blank', props=[('display', 'none')])

# Display dataframe: first rows only, with the title column left-aligned.
preview = df[:number_of_rows].style.set_properties(subset=['title'], **{'text-align': 'left'})
preview.set_table_styles([centred_headers, hidden_index])
| | identifier | series | control_symbol | title | contents_dates | start_date | end_date | access_status | location | digitised_status | digitised_pages |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 277752 | D1915 | SA20047 | Wehrbouern Scheme - Peasant Guards | 1939 - 1943 | 1939-01-01 00:00:00 | 1943-01-01 00:00:00 | Open | Adelaide | False | 0 |
1 | 323055 | D1915 | SA13 | Circulars [includes instructions for surveillance of Sinn Fein activities] | 1917 - 1924 | 1917-01-01 00:00:00 | 1924-01-01 00:00:00 | Open | Adelaide | True | 54 |
2 | 323062 | D1915 | SA26 | Intelligence enquiries - co-ordination of [consists mainly of intelligence reports of persons under suspicion in South Australia] | 1918 - 1919 | 1918-01-01 00:00:00 | 1919-01-01 00:00:00 | Open | Adelaide | True | 193 |
3 | 323065 | D1915 | SA82 | Mormons - movements of | 1922 - 1922 | 1922-01-01 00:00:00 | 1922-01-01 00:00:00 | Open | Adelaide | False | 0 |
4 | 323069 | D1915 | SA96 | Germans - projected settlement in South Australia | 1919 - 1924 | 1919-01-01 00:00:00 | 1924-01-01 00:00:00 | Open | Adelaide | False | 0 |
# Build a figure from the dataframe with the project helper, then render it
# in the notebook via Plotly's offline mode.
# NOTE(review): presumably a bar chart of the items' dates (going by the
# 'series-dates-bar' filename) — confirm in series_details.plot_dates.
fig = series_details.plot_dates(df)
py.iplot(fig, filename='series-dates-bar')
# Combine all of the file titles into a single string:
# lower-case each title, then join them with single spaces.
lowercased_titles = df['title'].str.lower()
# `a` is a second name bound to the same string; kept because other cells
# may refer to it.
title_text = a = lowercased_titles.str.cat(sep=' ')

series_details.display_word_counts(title_text)
| | word | count |
---|---|---|
137 | naturalization | 972 |
120 | application | 803 |
189 | naturalisation | 478 |
136 | sa | 401 |
77 | german | 313 |
199 | nationality | 251 |
127 | enquiry | 199 |
100 | also | 192 |
20 | australia | 179 |
278 | john | 137 |
628 | admission | 133 |
1578 | giovanni | 122 |
115 | adelaide | 122 |
1863 | antonio | 122 |
1860 | giuseppe | 120 |
180 | immigration | 117 |
476 | george | 110 |
118 | carl | 96 |
1140 | whereabouts | 89 |
1983 | luigi | 82 |
119 | wilhelm | 81 |
605 | william | 79 |
221 | friedrich | 79 |
282 | heinrich | 78 |
102 | war | 77 |
# Change ngram_count for larger ngrams (trigrams etc)
# Number of consecutive words per ngram; 2 gives two-word phrases (bigrams).
ngram_count = 2
# Pass the combined title text and the ngram size to the project helper.
# NOTE(review): presumably lists the most frequent ngrams with their counts —
# confirm in series_details.display_top_ngrams.
series_details.display_top_ngrams(title_text, ngram_count)
| | ngram | count |
---|---|---|
0 | application for | 734 |
1 | for naturalization | 535 |
2 | sa application | 338 |
3 | naturalization german | 202 |
4 | for naturalisation | 161 |
5 | german nationality | 146 |
6 | admission of | 107 |
7 | by immigration | 102 |
8 | enquiry re | 83 |
9 | enquiry by | 82 |
10 | to australia | 74 |
11 | applicant for | 71 |
12 | nationality also | 63 |
13 | of war | 55 |
14 | naturalization syrian | 52 |
15 | prisoner of | 47 |
16 | adelaide application | 44 |
17 | war internee | 40 |
18 | also application | 39 |
19 | re by | 39 |
20 | return to | 38 |
21 | on parole | 35 |
22 | south australia | 34 |
23 | for passport | 34 |
24 | to return | 33 |