import datetime
import io
import json
import sqlite3
import zipfile
from pathlib import Path

import markdown2
import pandas as pd
import requests
import requests_cache
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from slugify import slugify
from sqlite_utils import Database
# Shared HTTP session. Responses are cached by requests_cache, and requests
# answered with a 502, 503 or 504 are retried (up to 5 times, with backoff).
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s = requests_cache.CachedSession()
for scheme in ("https://", "http://"):
    s.mount(scheme, HTTPAdapter(max_retries=retries))
# Read the harvested list of CSV files, replacing missing values with empty
# strings.
df_csvs = pd.read_csv("glam-datasets-from-gov-portals-csvs.csv")
df_csvs.fillna("", inplace=True)


def _csv_file_index(row):
    """Return '<publisher>-<file_title>-<first 10 chars of file_created>', each part slugified."""
    return "-".join(
        [
            slugify(row["publisher"]),
            slugify(row["file_title"]),
            slugify(row["file_created"][:10]),
        ]
    )


# file_index doubles as the base name of the downloaded file.
df_csvs["file_index"] = df_csvs.apply(_csv_file_index, axis=1)
def read_csv(url, header=0, encoding=0):
    """
    Try to load a CSV into a dataframe, falling back through alternative options.

    ``encoding`` and ``header`` are 1-based positions in the fallback lists
    below; 0 means "leave that option at the pandas default". The delimiter is
    always sniffed (``sep=None`` with the python engine) and "-" / " " are
    treated as missing values.

    A UnicodeDecodeError retries with the next encoding and a ParserError
    retries with the next header setting; once the relevant list is exhausted
    the error is re-raised.
    """
    encodings = ["ISO-8859-1", "latin-1"]
    headers = [None]

    # Options shared by every attempt; the optional ones are added on demand.
    options = {"sep": None, "engine": "python", "na_values": ["-", " "]}
    if encoding > 0:
        options["encoding"] = encodings[encoding - 1]
    if header > 0:
        options["header"] = headers[header - 1]

    try:
        return pd.read_csv(url, **options)
    except UnicodeDecodeError:
        if encoding != len(encodings):
            return read_csv(url=url, header=header, encoding=encoding + 1)
        raise
    except pd.errors.ParserError:
        if header != len(headers):
            return read_csv(url=url, header=header + 1, encoding=encoding)
        raise
# Download every file listed in df_csvs into the csvs/ directory, one file
# per row, named after the row's file_index.
Path("csvs").mkdir(exist_ok=True)
for csv in df_csvs.itertuples():
    try:
        response = s.get(csv.download_url)
        # raise_for_status has to be *called*: as a bare attribute reference
        # it never checked the status, so HTTP error pages were saved as CSVs.
        response.raise_for_status()
    except requests.exceptions.RequestException:
        # Report the dataset that failed and skip it, rather than writing a
        # stale (previous iteration) or undefined response to disk.
        print(csv.dataset_title)
        continue
    # Write UTF-8 explicitly so the result doesn't depend on the platform's
    # default encoding.
    with Path("csvs", f"{csv.file_index}.csv").open(
        "w", encoding="utf-8"
    ) as csv_file:
        csv_file.write(response.text)
PROV Digitisation Program statistics PROV Workforce Data 19-20 PROV Annual Report - Records Issued & Visitor Statistics 2015-2016
# Build one summary record per downloaded file: its catalogue details plus a
# pipe-separated list of its column names.
# Files whose contents can't be read as CSV are reported as "Error -- ...";
# delete these.
dfs = []
for csv in Path("csvs").glob("*.csv"):
    file_index = csv.name.split(".")[0]
    matches = df_csvs.loc[df_csvs["file_index"] == file_index]
    if matches.empty:
        # The file on disk doesn't correspond to any row of df_csvs.
        print(f"No details -- {file_index}")
        continue
    details = (
        matches[["publisher", "info_url", "file_title", "file_modified"]]
        .iloc[0]
        .to_dict()
    )
    details["csv_file"] = csv.name
    try:
        df_csv = pd.read_csv(csv, low_memory=False)
    except Exception:
        # Best effort: any parsing/decoding failure just flags the file.
        # (Exception rather than a bare except, so Ctrl-C still interrupts.)
        print(f"Error -- {file_index}")
        continue
    details["columns"] = "|".join(list(df_csv.columns))
    dfs.append(details)
df = pd.DataFrame(dfs)
No details -- history-trust-of-sa-suffrage-petition Error -- queensland-state-archives-corporate-school-files-works-facilities-works-establishment-files-1871-1998-2018-02-23 Error -- state-library-of-south-australia-fire-insurance-maps-1911-1914-2014-06-22 No details -- public-records-office-victoria-outwards-passengers-from-victoria-1852-1915-2014-08-01 No details -- history-trust-of-sa-passengers-in-history Error -- south-australian-museum-consultants-2017-18-2019-08-15 Error -- nsw-state-archives-railway-employment-records-2014-09-30 Error -- state-library-of-south-australia-19th-century-photographs-by-ernest-gall-2014-06-10 Error -- queensland-museum-queensland-museum-collection-of-ethnographic-object-records-2014-06-25 Error -- state-library-of-south-australia-bradman-collection-2013-11-18 Error -- nsw-state-archives-nsw-govt-railways-and-tramways-roll-of-honour-1914-1919-csv-2014-09-30 Error -- queensland-museum-queensland-museum-collection-of-historical-object-records-2014-06-25 Error -- state-library-of-south-australia-election-leaflets-2013-05-08 Error -- nsw-state-archives-nominal-roll-of-the-first-railway-section-aif-csv-2014-09-30 Error -- state-library-of-queensland-nasla-music-csv-2013-05-29 No details -- public-records-office-victoria-british-assisted-passengers-to-victoria-1839-1871-2014-08-01 Error -- libraries-tasmania-archives-series-csv-2016-04-06 Error -- south-australian-museum-workplace-health-and-safety-2017-18-2019-08-15 Error -- state-library-of-south-australia-australia-1-63360-military-survey-s-a-1914-1958-2014-06-22 Error -- state-library-of-south-australia-19th-century-photographs-by-townsend-duryea-2014-06-10 Error -- public-records-office-victoria-reading-room-visitors-2014-12-23 Error -- mount-gambier-library-commercial-street-traders-2014-06-10 Error -- state-library-of-south-australia-election-posters-2013-05-08 Error -- south-australian-museum-contractors-2017-18-2019-08-15 No details -- 
public-records-office-victoria-unassisted-inward-passengers-1852-1923-2020-10-27
# Show the summary dataframe (a notebook display expression; it has no effect when run as a script).
df
dataset_title | publisher | author | dataset_issued | dataset_modified | dataset_description | source | info_url | start_date | end_date | ... | download_url | format | file_description | file_created | file_modified | file_size | licence | file_index | csv_file | columns | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | State Library of Queensland - Real estate maps | State Library of Queensland | opendata@slq.qld.gov.au | 2012-12-07T06:05:16.640302 | 2020-12-09T05:55:15.871780 | A unique collection of original maps and plans... | data.qld.gov.au | https://data.qld.gov.au/dataset/959d611f-a9cf-... | ... | https://www.data.qld.gov.au/dataset/959d611f-a... | CSV | This updated dataset includes links to 798 dig... | 2018-02-28T04:50:33.127516 | 2019-08-19T06:18:57.312772 | 252416 | Creative Commons Attribution 4.0 | state-library-of-queensland-real-estate-maps-f... | state-library-of-queensland-real-estate-maps-f... | Title|Description|Lat|Lon|Link|ID | ||
1 | Passport registers 1926 to 1939 | Queensland State Archives | web@archives.qld.gov.au | 2013-10-14T06:10:08.409229 | 2022-06-20T23:00:36.801163 | These indexes were compiled from the passport ... | data.qld.gov.au | https://data.qld.gov.au/dataset/fc87f25a-dc02-... | ... | https://www.data.qld.gov.au/dataset/fc87f25a-d... | CSV | This open data file lists the names of immigra... | 2017-01-11T23:47:35.449465 | 2022-01-10T04:53:27.827980 | 2831155 | Creative Commons Attribution 4.0 | queensland-state-archives-passport-clearances-... | queensland-state-archives-passport-clearances-... | Last name|Given names|Notes|Date of arrival|Ye... | ||
2 | Assisted immigration 1848 to 1912 | Queensland State Archives | web@archives.qld.gov.au | 2013-03-04T06:34:34.270023 | 2022-06-20T12:57:24.964249 | These indexes were created from the [Registers... | data.qld.gov.au | https://data.qld.gov.au/dataset/ba182873-e8a7-... | ... | https://www.data.qld.gov.au/dataset/ba182873-e... | CSV | This open data file lists the names of assiste... | 2013-03-05T23:30:57.308546 | 2022-06-14T07:46:06.234434 | 2621440 | Creative Commons Attribution 4.0 | queensland-state-archives-assisted-immigration... | queensland-state-archives-assisted-immigration... | Last name|Given names|Notes|Age|Ship|Date|Year... | ||
3 | Australian South Sea Islanders 1867 to 1908 | Queensland State Archives | web@archives.qld.gov.au | 2014-06-25T04:29:57.438596 | 2022-06-20T13:07:35.777233 | This index was compiled from a wide variety of... | data.qld.gov.au | https://data.qld.gov.au/dataset/eae0afa9-681c-... | ... | https://www.data.qld.gov.au/dataset/eae0afa9-6... | CSV | This open data file lists the names (L-Z) of A... | 2017-01-11T01:32:27.747955 | 2017-01-11T01:32:27.556535 | 13107200 | Creative Commons Attribution 4.0 | queensland-state-archives-australian-south-sea... | queensland-state-archives-australian-south-sea... | Last name|Given name/s|Page|Date|Ref|Prev sys ... | ||
4 | Queensland Museum collection of protozoan spec... | Queensland Museum | opendata@qm.qld.gov.au | 2014-02-18T23:18:45.102073 | 2019-07-10T16:42:34.524484 | A list of specimens of protozoan species in Qu... | data.qld.gov.au | https://data.qld.gov.au/dataset/4f1071f2-f4fa-... | ... | http://www.qm.qld.gov.au/microsites/data/proto... | CSV | A CSV file containing records of all protozoan... | 2014-02-18T23:19:05.331656 | 2017-06-23T00:00:00 | 41733324 | Creative Commons Attribution 4.0 | queensland-museum-queensland-museum-protozoan-... | queensland-museum-queensland-museum-protozoan-... | dcterms:type|dcterms:modified|dcterms:language... | ||
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
723 | SA FOI – number of fee waiver or reduction by ... | State Records South Australia | State Records | 2018-02-02T04:38:06.752608 | 2018-12-23T01:22:00.630016 | 2017-18 annual reporting data on the number of... | data.sa.gov.au | https://data.sa.gov.au/data/dataset/f923f9b0-b... | 2012-07-01 | 2018-06-30 | ... | https://data.sa.gov.au/data/dataset/f923f9b0-b... | CSV | 2017-18 annual reporting data on the number of... | 2018-02-02T15:38:27.930264 | 2018-12-23T01:22:00.608148 | Creative Commons Attribution | state-records-south-australia-sa-foi-number-of... | state-records-south-australia-sa-foi-number-of... | Reason for fee waiver, by sector|No. of waiver... | |
724 | State Library of Queensland - Catalogue searches | State Library of Queensland | opendata@slq.qld.gov.au | 2012-12-07T05:55:14.502123 | 2021-03-08T07:42:00.611055 | This open data file contains the text strings ... | data.qld.gov.au | https://data.qld.gov.au/dataset/cebb997c-1c42-... | ... | https://www.data.qld.gov.au/dataset/cebb997c-1... | CSV | The text strings searched and count of recurri... | 2019-06-18T06:37:31.010964 | 2019-08-27T01:13:09.674578 | 104448 | Creative Commons Attribution 4.0 | state-library-of-queensland-july-2017-catalogu... | state-library-of-queensland-july-2017-catalogu... | Search strings|Count | ||
725 | World War I Soldiers and Nurses (1914-1928). | Libraries Tasmania | Libraries Tasmania | 2015-06-15T03:04:09.056176 | 2021-11-23T14:36:42.489452 | Photographs, articles and applications for lan... | data.gov.au | https://data.gov.au/dataset/b711231a-2a02-48eb... | 1914 | 1928 | ... | https://data.gov.au/data/dataset/b711231a-2a02... | CSV | 2016-03-22T10:01:58.539607 | 2021-11-23 | 2835528 | Creative Commons Attribution 4.0 International | libraries-tasmania-world-war-one-tasmanian-pho... | libraries-tasmania-world-war-one-tasmanian-pho... | DIGITAL_OBJECT - URL_TEXT|DIGITAL_OBJECT - URL... | |
726 | Deceased Estate Files, 1880-1923 | NSW State Archives | State Records Authority | 2014-09-30T04:52:48.805972 | 2016-07-20T12:09:20.785878 | Researching deceased estates files before 1923... | data.nsw.gov.au | https://data.nsw.gov.au/data/dataset/5d45437c-... | ... | https://data.nsw.gov.au/data/dataset/5d45437c-... | CSV | This dataset contains the following attributes... | 2014-09-30T00:55:53.313012 | Creative Commons Attribution | nsw-state-archives-deceased-estates-2014-09-30 | nsw-state-archives-deceased-estates-2014-09-30... | Surname|FirstName|Locality|DateOfDeath|DateDut... | ||||
727 | SA Memory | State Library of South Australia | State Library of South Australia | 2013-03-07T16:15:35.228085 | 2019-08-29T02:29:51.427322 | A selected and wide range of digitised archiva... | data.sa.gov.au | https://data.sa.gov.au/data/dataset/7cd90f98-1... | 1836-2010 | ... | https://data.sa.gov.au/data/dataset/7cd90f98-1... | CSV | A selected and wide range of digitised archiva... | 2013-05-31T01:01:00.469271 | 2019-08-28T23:40:58.400220 | 1495812 | Creative Commons Attribution | state-library-of-south-australia-sa-memory-201... | state-library-of-south-australia-sa-memory-201... | id|TITLE|CREATOR|INNOPAC|LINK|coverage_place|C... |
728 rows × 21 columns
# Save the summary under a date-stamped name: csvs_for_indexing_YYYYMMDD.csv.
# The f-string is double-quoted because reusing single quotes inside a
# single-quoted f-string is a SyntaxError before Python 3.12.
df.to_csv(
    f"csvs_for_indexing_{datetime.datetime.now().strftime('%Y%m%d')}.csv",
    index=False,
)
# Get previously checked details.
# Change the date in the file name to that of the most recently checked file.
checked_columns = ["publisher", "info_url", "csv_file", "index", "drop", "extract"]
# keep_default_na=False leaves blank cells as empty strings instead of NaN.
df_checked = pd.read_csv(
    "csvs_for_indexing_checked_20211018.csv", keep_default_na=False
)
df_checked = df_checked[checked_columns]
df_checked.head()
publisher | info_url | csv_file | index | drop | extract | |
---|---|---|---|---|---|---|
0 | Australian Institute of Aboriginal and Torres ... | https://data.gov.au/dataset/11cbf24a-a31a-488c... | australian-institute-of-aboriginal-and-torres-... | |||
1 | Libraries Tasmania | https://data.gov.au/dataset/b0627a17-6783-4c18... | libraries-tasmania-bankruptcy-csv-2017-07-14.csv | NAME|NAME_SEE_ALSO | ||
2 | Libraries Tasmania | https://data.gov.au/dataset/069a423b-abd8-4454... | libraries-tasmania-colonial-secretary-correspo... | DESC|NAME|NAME_SEE_ALSO | ||
3 | Libraries Tasmania | https://data.gov.au/dataset/58a9a8d7-01e0-43df... | libraries-tasmania-court-csv-2017-07-14.csv | NAME | ||
4 | Libraries Tasmania | https://data.gov.au/dataset/d7ec2d93-b9dd-482b... | libraries-tasmania-digitised-archives-csv-2016... |
# Merge checking files.
# Left-join the latest harvested data with the indexing info from the checked
# file, matching rows on publisher, info_url and csv_file.
# (An earlier version joined on publisher, info_url, file_title and
# file_modified instead.)
merge_keys = ["publisher", "info_url", "csv_file"]
df_new_check = df.merge(df_checked, how="left", on=merge_keys)
df_new_check
publisher | info_url | file_title_x | file_modified_x | csv_file | file_title_y | file_modified_y | columns | index | drop | extract | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | State Library of Queensland | https://data.qld.gov.au/dataset/959d611f-a9cf-... | Real Estate Maps February 2018 | 2019-08-19T06:18:57.312772 | state-library-of-queensland-real-estate-maps-f... | Real Estate Maps February 2018 | 2019-08-19T06:18:57.313 | Title|Description|Lat|Lon|Link|ID | |||
1 | Queensland State Archives | https://data.qld.gov.au/dataset/fc87f25a-dc02-... | Passport clearances 1923 to 1940 | 2022-01-10T04:53:27.827980 | queensland-state-archives-passport-clearances-... | Passport clearances 1923 to 1940 | 2022-01-10T04:53:27.828 | Last name|Given names|Notes|Date of arrival|Ye... | Last name|Given names | Description | |
2 | Queensland State Archives | https://data.qld.gov.au/dataset/ba182873-e8a7-... | Assisted immigration 1848 to 1912 - A | 2022-06-14T07:46:06.234434 | queensland-state-archives-assisted-immigration... | Assisted immigration 1848 to 1912 - A | 2022-06-14T07:46:06.234 | Last name|Given names|Notes|Age|Ship|Date|Year... | |||
3 | Queensland State Archives | https://data.qld.gov.au/dataset/eae0afa9-681c-... | Australian South Sea Islanders 1867 to 1908 L-Z | 2017-01-11T01:32:27.556535 | queensland-state-archives-australian-south-sea... | Australian South Sea Islanders 1867 to 1908 L-Z | 2017-01-11T01:32:27.557 | Last name|Given name/s|Page|Date|Ref|Prev sys ... | |||
4 | Queensland Museum | https://data.qld.gov.au/dataset/4f1071f2-f4fa-... | Queensland Museum protozoan collection records | 2017-06-23T00:00:00 | queensland-museum-queensland-museum-protozoan-... | Queensland Museum protozoan collection records | 2017-06-23T00:00:00 | dcterms:type|dcterms:modified|dcterms:language... | |||
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
723 | State Records South Australia | https://data.sa.gov.au/data/dataset/f923f9b0-b... | SA FOI – number of fee waiver or reduction by ... | 2018-12-23T01:22:00.608148 | state-records-south-australia-sa-foi-number-of... | SA FOI – number of fee waiver or reduction by ... | 2018-12-23T01:22:00.608 | Reason for fee waiver, by sector|No. of waiver... | |||
724 | State Library of Queensland | https://data.qld.gov.au/dataset/cebb997c-1c42-... | July 2017 Catalogue searches | 2019-08-27T01:13:09.674578 | state-library-of-queensland-july-2017-catalogu... | July 2017 Catalogue searches | 2019-08-27T01:13:09.675 | Search strings|Count | |||
725 | Libraries Tasmania | https://data.gov.au/dataset/b711231a-2a02-48eb... | World War One Tasmanian Photographs - CSV | 2021-11-23 | libraries-tasmania-world-war-one-tasmanian-pho... | World War One Tasmanian Photographs - CSV | 2021-11-23 | DIGITAL_OBJECT - URL_TEXT|DIGITAL_OBJECT - URL... | NAME | ||
726 | NSW State Archives | https://data.nsw.gov.au/data/dataset/5d45437c-... | Deceased Estates | nsw-state-archives-deceased-estates-2014-09-30... | Deceased Estates | Surname|FirstName|Locality|DateOfDeath|DateDut... | |||||
727 | State Library of South Australia | https://data.sa.gov.au/data/dataset/7cd90f98-1... | SA Memory | 2019-08-28T23:40:58.400220 | state-library-of-south-australia-sa-memory-201... | SA Memory | 2019-08-28T23:40:58.400 | id|TITLE|CREATOR|INNOPAC|LINK|coverage_place|C... |
728 rows × 11 columns
Manually check the file saved below to make sure that every historical file containing names has a value in its index column. Check these against the list of new file titles.
# Save new checking file under a date-stamped name:
# csvs_for_indexing_checked_YYYYMMDD.csv.
# The f-string is double-quoted because reusing single quotes inside a
# single-quoted f-string is a SyntaxError before Python 3.12.
df_new_check.to_csv(
    f"csvs_for_indexing_checked_{datetime.datetime.now().strftime('%Y%m%d')}.csv",
    index=False,
)
# Read the list of extra PROV index files and give each row a file_index.
prov_csvs = pd.read_csv("extra-prov-indexes.csv")


def _prov_file_index(row):
    """Return '<publisher>-<file_title>-<first 10 chars of file_created>', each part slugified."""
    return "-".join(
        [
            slugify(row["publisher"]),
            slugify(row["file_title"]),
            slugify(row["file_created"][:10]),
        ]
    )


prov_csvs["file_index"] = prov_csvs.apply(_prov_file_index, axis=1)
# Download the extra PROV files into prov_csvs/. ZIP downloads are unpacked
# into that directory; anything else is saved directly as a CSV.
Path("prov_csvs").mkdir(exist_ok=True)
for csv in prov_csvs.itertuples():
    print(csv.dataset_title)
    response = s.get(csv.download_url)
    # raise_for_status has to be *called*: as a bare attribute reference it
    # never checked the status, so HTTP errors went unnoticed.
    response.raise_for_status()
    file_name = f"{slugify(csv.publisher)}-{slugify(csv.file_title)}-{csv.file_created[:10]}.csv"
    if csv.format == "ZIP":
        # Extracted members keep the names stored in the archive.
        # NOTE(review): the expected name is presumably printed so the
        # extracted file can be renamed to match -- confirm.
        print(file_name)
        try:
            with zipfile.ZipFile(io.BytesIO(response.content)) as z:
                z.extractall("prov_csvs")
        except zipfile.BadZipFile as err:
            # Keep going, but say so instead of silently skipping the file.
            print(f"Bad zip file -- {csv.download_url}: {err}")
    else:
        # Write UTF-8 explicitly so the result doesn't depend on the
        # platform's default encoding.
        with Path("prov_csvs", file_name).open("w", encoding="utf-8") as csv_file:
            csv_file.write(response.text)
Victorian World War One Soldier Settlers British Assisted Passengers to Victoria 1839-1871 public-records-office-victoria-british-assisted-passengers-to-victoria-1839-1871-2014-08-01.csv Unassisted Inward Passengers 1852-1923 public-records-office-victoria-unassisted-inward-passengers-1852-1923-2020-10-27.csv Outwards Passengers from Victoria 1852-1915 public-records-office-victoria-outwards-passengers-from-victoria-1852-1915-2014-08-01.csv
# Build one summary record per file in prov_csvs/: its catalogue details plus
# a pipe-separated list of its column names.
# Records are collected in a list and converted once at the end, because
# DataFrame.append was removed in pandas 2.0.
prov_records = []
for csv in Path("prov_csvs").glob("*.csv"):
    print(csv)
    file_index = csv.name.split(".")[0]
    matches = prov_csvs.loc[prov_csvs["file_index"] == file_index]
    if matches.empty:
        # No row of prov_csvs has this file_index (e.g. a file extracted from
        # a ZIP under its own name); report it rather than fail on .iloc[0].
        print(f"No details -- {file_index}")
        continue
    details = (
        matches[["publisher", "info_url", "file_title", "file_modified"]]
        .iloc[0]
        .to_dict()
    )
    details["csv_file"] = csv.name
    df_csv = pd.read_csv(csv, low_memory=False)
    details["columns"] = "|".join(list(df_csv.columns))
    prov_records.append(details)
prov_df = pd.DataFrame(prov_records)
prov_csvs/public-records-office-victoria-british-assisted-passengers-to-victoria-1839-1871-2014-08-01.csv
/Users/tim/.pyenv/versions/3.8.5/envs/ozglam-data/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3155: DtypeWarning: Columns (4) have mixed types.Specify dtype option on import or set low_memory=False. has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
prov_csvs/public-records-office-victoria-outwards-passengers-from-victoria-1852-1915-2014-08-01.csv prov_csvs/public-records-office-victoria-victorian-world-war-one-soldier-settlers-2015-05-29.csv prov_csvs/public-records-office-victoria-unassisted-inward-passengers-1852-1923-2020-10-27.csv
/Users/tim/.pyenv/versions/3.8.5/envs/ozglam-data/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3155: DtypeWarning: Columns (6,9,11,12) have mixed types.Specify dtype option on import or set low_memory=False. has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
# Write the PROV summary dataframe to CSV, leaving out the row index.
prov_df.to_csv("prov_csvs_for_indexing.csv", index=False)
{
"databases": {
"database1": {
"source": "Alternative source",
"source_url": "http://example.com/",
"tables": {
"example_table": {
"description_html": "Custom <em>table</em> description",
"license": "CC BY 3.0 US",
"license_url": "https://creativecommons.org/licenses/by/3.0/us/",
}
},
}
}
}
# Load the checked list and keep only the rows with a non-empty "index" value.
df_checked = pd.read_csv(
    "csvs_for_indexing_checked_20220809.csv", keep_default_na=False
)
has_index_value = df_checked["index"] != ""
df_checked_filtered = df_checked.loc[has_index_value]

# Stack the filtered list on top of the PROV and SA checked lists.
df_prov_checked = pd.read_csv(
    "prov_csvs_for_indexing_checked.csv", keep_default_na=False
)
df_sa_checked = pd.read_csv("sa_datasets_for_checking.csv", keep_default_na=False)
df_all_checked = pd.concat([df_checked_filtered, df_prov_checked, df_sa_checked])

# Full harvested dataset list, with the csv_file name derived for each row.
df_all = pd.read_csv("glam-datasets-from-gov-portals.csv", keep_default_na=False)


def _csv_file_name(row):
    """Return '<publisher>-<file_title>-<first 10 chars of file_created>.csv', each part slugified."""
    slug = "-".join(
        [
            slugify(row["publisher"]),
            slugify(row["file_title"]),
            slugify(row["file_created"][:10]),
        ]
    )
    return f"{slug}.csv"


df_all["csv_file"] = df_all.apply(_csv_file_name, axis=1)

# Left-join so every checked row is kept and picks up its harvested details.
# (An earlier version joined on file_title, info_url, publisher and
# file_modified instead.)
df_final = df_all_checked.merge(
    df_all, how="left", on=["info_url", "publisher", "csv_file"]
)
df_final.shape
(194, 25)
# List the csv_file names of every History Trust of South Australia row.
is_history_trust = df_all["publisher"] == "History Trust of South Australia"
df_all.loc[is_history_trust, "csv_file"].to_list()
['history-trust-of-south-australia-executive-employment-at-the-history-trust-of-south-australia-2011-2020-2018-09-03.csv', 'history-trust-of-south-australia-consultants-engaged-by-the-history-trust-of-south-australia-2019-11-08.csv', 'history-trust-of-south-australia-public-complaints-received-by-history-trust-of-south-australia-2018-09-03.csv', 'history-trust-of-south-australia-fraud-detection-at-history-trust-of-south-australia-2011-2021-2018-09-03.csv', 'history-trust-of-south-australia-whistleblowers-disclosure-for-history-trust-of-south-australia-2011-2021-2018-09-03.csv', 'history-trust-of-south-australia-contractors-engaged-by-the-history-trust-of-south-australia-2019-11-08.csv', 'history-trust-of-south-australia-passengers-in-history-search-index-2016-06-28.csv', 'history-trust-of-south-australia-workplace-injury-claims-received-by-history-trust-of-south-australia-2019-20-2018-09-03.csv', 'history-trust-of-south-australia-suffrage125-petition-2019-04-04.csv', 'history-trust-of-south-australia-bond-studios-glass-negative-collection-2017-06-27.csv', 'history-trust-of-south-australia-south-australian-government-photographic-collection-api-2017-06-23.csv', 'history-trust-of-south-australia-sa-government-photographic-collection-2017-07-28.csv', 'history-trust-of-south-australia-state-history-collection-2017-06-27.csv', 'history-trust-of-south-australia-community-history-live-stream-2015-05-15.csv', 'history-trust-of-south-australia-historical-places-2015-07-02.csv', 'history-trust-of-south-australia-historical-things-2015-07-02.csv', 'history-trust-of-south-australia-historical-organisations-2015-07-02.csv', 'history-trust-of-south-australia-historical-events-2015-07-02.csv', 'history-trust-of-south-australia-internee-data-2015-05-29.csv', 'history-trust-of-south-australia-history-festival-events-2015-05-14.csv']
# Top-level metadata; one entry per publisher is added under "databases" below.
metadata = {
    "title": "GLAM Name Indexes",
    "description_html": """
<p><b>Search for names across an aggregated collection of name indexes from Australian GLAM organisations.</b></p>
<p>For more information about the datasets, see the <a href="https://glam-workbench.net/glam-data-portals/">GLAM data portals</a> section of the GLAM Workbench.</p>
""",
    "databases": {},
}


def _describe_table(csv):
    """Build the table metadata entry for one row of df_final."""
    # Prefix the file title with the dataset title only when they differ.
    if csv.dataset_title != csv.file_title_y:
        title = f"{csv.dataset_title} – {csv.file_title_y}"
    else:
        title = csv.file_title_y
    # Descriptions are rendered from Markdown; the file description is
    # appended only when it isn't a repeat of the dataset description.
    description = markdown2.markdown(str(csv.dataset_description))
    if csv.dataset_description != csv.file_description:
        description += markdown2.markdown(str(csv.file_description))
    if csv.file_modified_y:
        description += f"<p>Last modified: {csv.file_modified_y}</p>"
    return {
        "title": title,
        "description_html": description,
        "source_url": csv.download_url,
        "about_url": csv.info_url,
        "license": csv.licence,
        "searchmode": "raw",
    }


def _load_table(db, csv):
    """Load one CSV from csvs/ into a table of *db* and enable full-text search on it."""
    table = db[slugify(csv.file_title_y)]
    df_csv = pd.read_csv(
        Path("csvs", csv.csv_file), keep_default_na=False, low_memory=False
    )
    # "drop", "extract" and "index" each hold pipe-separated column names.
    for col in csv.drop.split("|"):
        if col:
            df_csv.drop(columns=col, inplace=True)
    for col in csv.extract.split("|"):
        if col:
            # Keep only the URL (text from "http" up to the next single
            # quote) in a new <col>_url column, then drop the original.
            df_csv[f"{col}_url"] = df_csv[col].str.extract(r"(http.*?)'")
            df_csv.drop(columns=col, inplace=True)
    table.insert_all(df_csv.to_dict("records"))
    table.enable_fts(csv.index.split("|"))


# One SQLite database per publisher, one table per CSV file.
for org, csvs in df_final.groupby(by="publisher"):
    org_slug = slugify(org)
    metadata["databases"][org_slug] = {"title": org, "tables": {}}
    conn = sqlite3.connect(f"{org_slug}.db")
    try:
        db = Database(conn)
        for csv in csvs.itertuples():
            print(csv.csv_file)
            metadata["databases"][org_slug]["tables"][
                slugify(csv.file_title_y)
            ] = _describe_table(csv)
            _load_table(db, csv)
    finally:
        # Close each connection explicitly instead of leaving them all open
        # until the interpreter exits.
        conn.close()

with Path("metadata.json").open("w") as json_file:
    json_file.write(json.dumps(metadata))
history-trust-of-south-australia-passengers-in-history-search-index-2016-06-28.csv history-trust-of-south-australia-suffrage125-petition-2019-04-04.csv libraries-tasmania-bankruptcy-csv-2017-07-14.csv libraries-tasmania-colonial-secretary-correspondence-csv-2019-09-05.csv libraries-tasmania-court-csv-2017-07-14.csv libraries-tasmania-education-csv-2019-09-05.csv libraries-tasmania-eheritage-data-csv-2017-07-17.csv libraries-tasmania-employment-csv-2019-09-05.csv libraries-tasmania-hotels-properties-csv-2016-03-22.csv libraries-tasmania-land-records-csv-2021-03-23.csv libraries-tasmania-miscellaneous-csv-2019-09-05.csv libraries-tasmania-tasmanian-arrivals-csv-2016-03-22.csv libraries-tasmania-tasmanian-births-csv-2016-03-22.csv libraries-tasmania-tasmanian-census-csv-2016-03-22.csv libraries-tasmania-tasmanian-convicts-csv-2016-03-22.csv libraries-tasmania-tasmanian-convicts-permission-to-marry-csv-2016-03-22.csv libraries-tasmania-tasmanian-deaths-csv-2016-03-22.csv libraries-tasmania-tasmanian-departures-csv-2016-03-22.csv libraries-tasmania-tasmanian-divorces-csv-2016-03-22.csv libraries-tasmania-tasmanian-health-welfare-records-csv-2016-03-22.csv libraries-tasmania-tasmanian-immigration-csv-2017-05-09.csv libraries-tasmania-tasmanian-inquests-csv-2016-03-22.csv libraries-tasmania-tasmanian-marriages-csv-2016-03-22.csv libraries-tasmania-tasmanian-naturalisations-csv-2016-03-22.csv libraries-tasmania-tasmanian-prisoners-csv-2016-03-22.csv libraries-tasmania-tasmanian-wills-csv-2016-03-22.csv libraries-tasmania-world-war-one-tasmanian-photographs-csv-2016-03-22.csv nsw-state-archives-index-to-certificates-of-freedom-1823-69-2015-10-01.csv nsw-state-archives-index-to-convict-bank-accounts-1837-70-2015-10-01.csv nsw-state-archives-index-to-convict-pardons-1791-1825-and-1837-47-2015-10-01.csv nsw-state-archives-index-to-tickets-of-exemption-from-government-labour-1827-32-2015-10-01.csv nsw-state-archives-index-to-tickets-of-leave-passports-1835-69-2015-10-01.csv 
nsw-state-archives-index-to-tickets-of-leave-1810-75-2015-10-01.csv nsw-state-archives-index-to-tickets-of-leave-certificates-of-emancipation-and-pardons-1810-19-2015-10-01.csv public-records-office-victoria-index-to-wills-probate-and-administration-records-1841-2009-2014-12-22.csv public-records-office-victoria-victorian-world-war-one-soldier-settlers-2015-05-29.csv public-records-office-victoria-vprs-515-p1-central-register-of-male-prisoners-2014-12-22.csv public-records-office-victoria-british-assisted-passengers-to-victoria-1839-1871-2014-08-01.csv public-records-office-victoria-unassisted-inward-passengers-1852-1923-2020-10-27.csv public-records-office-victoria-outwards-passengers-from-victoria-1852-1915-2014-08-01.csv queensland-state-archives-aboriginal-war-census-1915-to-1916-2015-07-08.csv queensland-state-archives-army-reservist-payments-1909-to-1920-2016-08-16.csv queensland-state-archives-assisted-immigration-1848-to-1912-combined-2018-08-27.csv queensland-state-archives-australian-south-sea-islanders-1867-1948-2021-02-23.csv queensland-state-archives-beaudesert-shire-and-logan-village-burials-1878-2000-2020-06-24.csv queensland-state-archives-brisbane-hospital-admissions-1872-to-1887-2014-09-29.csv queensland-state-archives-brisbane-hospital-registers-of-deaths-1899-to-1913-2014-09-29.csv queensland-state-archives-brisbane-hospital-registers-of-deaths-1933-to-1963-2014-09-29.csv queensland-state-archives-chronological-register-of-convicts-1824-1839-2018-03-02.csv queensland-state-archives-civil-servants-1866-1867-2014-06-24.csv queensland-state-archives-coloured-labour-and-asiatic-aliens-in-queensland-1913-2014-06-24.csv queensland-state-archives-companies-1863-to-1959-2014-07-07.csv queensland-state-archives-consumptive-patients-1897-to-1903-2014-09-29.csv queensland-state-archives-criminal-depositions-1861-1885-2018-12-18.csv queensland-state-archives-dental-board-1910-to-1932-2014-06-25.csv 
queensland-state-archives-dental-board-records-1900-1932-2022-04-12.csv queensland-state-archives-dentist-apprentices-1903-to-1925-2014-06-25.csv queensland-state-archives-dentists-1903-to-1932-2014-06-25.csv queensland-state-archives-discharged-soldier-settler-loans-1917-1919-2017-07-21.csv queensland-state-archives-divorces-1861-1894-2013-03-04.csv queensland-state-archives-equity-files-1857-to-1899-2013-03-03.csv queensland-state-archives-farm-lads-1922-1940-2018-01-19.csv queensland-state-archives-female-prisoners-admitted-toowoomba-1887-1891-2015-07-02.csv queensland-state-archives-immigrants-landed-bowen-1888-1896-2022-05-06.csv queensland-state-archives-immigrants-nominated-for-passage-maryborough-1884-to-1907-2013-06-25.csv queensland-state-archives-immigrants-bowen-immigration-depot-1885-1892-2021-07-22.csv queensland-state-archives-immigration-1922-to-1940-2013-06-25.csv queensland-state-archives-imperial-pensions-1872-to-1915-2014-06-23.csv queensland-state-archives-imperial-pensions-1898-to-1912-2014-06-23.csv queensland-state-archives-index-to-pensions-1908-1909-2020-06-28.csv queensland-state-archives-index-to-boer-war-records-1899-1902-2018-12-14.csv queensland-state-archives-index-to-brisbane-gaol-hospital-admission-registers-1889-1911-2020-06-24.csv queensland-state-archives-index-to-colonial-secretary-s-correspondence-1859-1861-csv-2018-12-17.csv queensland-state-archives-index-to-dunwich-benevolent-asylum-1885-1907-2021-01-19.csv queensland-state-archives-index-to-immigrants-and-crew-1860-1964-2020-06-24.csv queensland-state-archives-index-to-immigration-1909-1932-2020-06-30.csv queensland-state-archives-index-to-mariner-s-certificates-1877-1939-2022-04-27.csv queensland-state-archives-index-to-outdoor-relief-1892-1920-2020-06-29.csv queensland-state-archives-index-to-register-of-cases-and-treatment-at-moreton-bay-hospital-1830-1862-2020-06-28.csv queensland-state-archives-index-to-registers-of-agricultural-lessees-1885-1908-2020-06-29.csv 
queensland-state-archives-index-to-registers-of-immigrants-rockhampton-1882-1915-2020-06-29.csv queensland-state-archives-index-to-wallangarra-flu-camp-1918-1919-2021-07-26.csv queensland-state-archives-indigence-cases-1899-to-1948-2016-08-16.csv queensland-state-archives-inquests-1859-to-1902-2013-03-03.csv queensland-state-archives-instruments-of-renunciation-1915-to-1983-2013-03-04.csv queensland-state-archives-justices-of-the-peace-1857-to-1957-2013-03-04.csv queensland-state-archives-land-orders-1861-to-1874-2013-06-25.csv queensland-state-archives-land-orders-1862-to-1878-2016-12-08.csv queensland-state-archives-land-orders-1865-to-1866-2016-02-22.csv queensland-state-archives-land-selections-1885-1981-2021-07-23.csv queensland-state-archives-lazaret-patient-registers-2021-08-02.csv queensland-state-archives-leases-selections-and-pastoral-runs-and-other-related-records-1850-2014-2020-06-29.csv queensland-state-archives-mackay-hospital-admissions-1891-to-1908-2014-09-29.csv queensland-state-archives-military-service-south-african-boer-war-2022-04-22.csv queensland-state-archives-mineral-leases-1871-to-1940-2013-10-14.csv queensland-state-archives-miners-rights-1874-to-1880-combined-2018-09-05.csv queensland-state-archives-miners-rights-and-warden-s-collections-palmer-goldfields-1874-1880-2018-04-26.csv queensland-state-archives-monthly-and-half-yearly-returns-for-moreton-bay-1829-to-1837-2016-09-08.csv queensland-state-archives-naturalisations-1851-to-1908-2013-03-03.csv queensland-state-archives-naturalisations-1880-1885-2021-01-19.csv queensland-state-archives-nominated-immigrants-1908-to-1922-2013-06-25.csv queensland-state-archives-nurses-examinations-1915-to-1925-2014-06-24.csv queensland-state-archives-oronsay-immigration-1925-to-1972-2015-05-28.csv queensland-state-archives-passage-certificates-1887-to-1906-2013-06-25.csv queensland-state-archives-passport-clearances-1923-to-1940-2017-01-11.csv 
queensland-state-archives-perpetual-lease-selections-of-soldier-settlements-1917-1929-2018-11-13.csv queensland-state-archives-photographic-records-of-prisoners-1875-1913-2021-01-19.csv queensland-state-archives-prisoners-admitted-toowoomba-1895-1906-2015-07-02.csv queensland-state-archives-prisoners-discharged-toowoomba-1869-1879-2017-01-11.csv queensland-state-archives-prisoners-tried-toowoomba-1864-1903-2015-07-01.csv queensland-state-archives-rations-issued-to-immigrants-maryborough-1875-1884-2013-06-25.csv queensland-state-archives-redeemed-land-orders-1860-1907-2021-07-21.csv queensland-state-archives-reformatory-school-for-boys-1871-to-1906-2014-11-03.csv queensland-state-archives-register-of-court-fees-marburg-1885-to-1908-2016-09-08.csv queensland-state-archives-register-of-immigrants-1864-to-1878-2013-10-15.csv queensland-state-archives-register-of-immigrants-brisbane-1885-to-1917-2013-10-15.csv queensland-state-archives-register-of-immigrants-toowoomba-1880-to-1888-2017-01-12.csv queensland-state-archives-register-of-land-sold-1842-to-1859-2014-01-05.csv queensland-state-archives-register-of-lands-1861-to-1868-2014-01-05.csv queensland-state-archives-register-of-lands-sold-1842-1868-2022-05-03.csv queensland-state-archives-register-of-lands-sold-1849-to-1861-2014-03-20.csv queensland-state-archives-register-of-the-engagement-of-immigrants-at-the-immigration-depot-bowen-1873-1912-2020-06-29.csv queensland-state-archives-registers-of-applications-by-selectors-1868-1885-2020-06-23.csv queensland-state-archives-registers-of-immigrants-1882-to-1938-combined-2018-08-28.csv queensland-state-archives-registers-of-immigrants-promissory-notes-maryborough-1874-1903-2021-07-23.csv queensland-state-archives-scholarships-in-the-education-office-gazette-1900-1940-2020-06-24.csv queensland-state-archives-seamen-1882-to-1919-2014-06-23.csv queensland-state-archives-soldier-settlement-ledgers-1917-to-1929-2016-11-02.csv 
queensland-state-archives-south-sea-islanders-1867-to-1908-combined-2017-07-31.csv queensland-state-archives-st-helena-prisoners-1863-to-1936-2012-12-10.csv queensland-state-archives-sugar-exemptions-1922-1923-2018-03-06.csv queensland-state-archives-tb-home-applications-1923-to-1932-2014-06-24.csv queensland-state-archives-teachers-1860-to-1905-2015-07-01.csv queensland-state-archives-teachers-in-the-education-office-gazettes-1899-1925-2020-06-29.csv queensland-state-archives-toowoomba-girls-industrial-school-admissions-and-discharges-1881-to-1903-2014-11-03.csv queensland-state-archives-toowoomba-girls-reformatory-discharges-1882-to-1903-2014-11-03.csv queensland-state-archives-toowoomba-prisoners-1864-1906-2022-04-28.csv queensland-state-archives-transfer-of-runs-1848-to-1874-2015-06-01.csv queensland-state-archives-trustees-files-1889-to-1929-2013-03-04.csv queensland-state-archives-wills-1857-to-1940-2017-01-11.csv queensland-state-archives-windsor-town-council-honour-roll-1914-to-1925-2015-11-24.csv queensland-state-archives-writs-1857-2008-2018-01-19.csv state-library-of-queensland-british-convict-registers-2013-05-29.csv state-library-of-queensland-licensed-victuallers-index-updated-july-2022-2022-08-01.csv state-library-of-queensland-persons-called-before-queensland-government-committees-1860-1920-2019-07-24.csv state-library-of-queensland-police-gazette-inquests-1875-1885-updated-dec-2021-2021-12-02.csv state-library-of-queensland-portraits-of-soldiers-from-the-south-african-war-1899-1902-2014-09-26.csv state-library-of-queensland-portraits-of-ww1-soldiers-australasian-traveller-2018-08-20.csv state-library-of-queensland-queensland-mining-accidents-1882-1945-2014-11-23.csv state-library-of-queensland-queensland-railway-appointees-1890-1915-2018-05-18.csv state-library-of-queensland-queensland-railway-removals-1890-1915-2018-05-18.csv state-library-of-queensland-southern-and-western-railway-appointees-1866-1876-2019-07-23.csv 
state-library-of-queensland-southern-and-western-railway-removals-1866-1876-2019-07-23.csv state-library-of-queensland-world-war-1-soldier-portraits-2015-07-02.csv state-library-of-south-australia-heroes-of-the-great-war-chronicle-1915-1919-2015-07-02.csv state-library-of-south-australia-heroes-of-the-great-war-chronicle-portraits-1915-1919-2015-07-02.csv state-library-of-south-australia-old-colonists-men-2019-06-17.csv state-library-of-south-australia-old-colonists-women-2019-06-17.csv state-library-of-south-australia-oral-histories-2017-07-07.csv state-library-of-south-australia-s-a-speaks-an-oral-history-of-life-in-south-australia-before-1930-2017-07-06.csv state-library-of-south-australia-south-australian-photographs-world-war-1-1914-1929-2018-06-18.csv state-library-of-south-australia-south-australian-photographs-1900-1919-2019-08-28.csv state-library-of-south-australia-south-australian-photographs-1920-1949-2019-08-28.csv state-library-of-south-australia-south-australian-photographs-1950-onwards-2019-08-28.csv state-library-of-south-australia-south-australian-photographs-pre-1900-2019-08-28.csv state-library-of-victoria-melbourne-and-metropolitan-hotels-pubs-and-publicans-2017-03-29.csv state-library-of-western-australia-adopt-a-soldier-photographs-csv-2016-07-25.csv state-library-of-western-australia-eastern-goldfields-2016-07-28.csv state-library-of-western-australia-in-memoriam-cards-csv-2016-07-25.csv state-library-of-western-australia-index-entries-beginning-with-a-2016-07-28.csv state-library-of-western-australia-index-entries-beginning-with-b-2016-07-28.csv state-library-of-western-australia-index-entries-beginning-with-c-2016-07-28.csv state-library-of-western-australia-index-entries-beginning-with-d-and-e-2016-07-28.csv state-library-of-western-australia-index-entries-beginning-with-f-2016-07-28.csv state-library-of-western-australia-index-entries-beginning-with-g-2016-07-28.csv 
state-library-of-western-australia-index-entries-beginning-with-h-2016-07-28.csv state-library-of-western-australia-index-entries-beginning-with-i-and-j-2016-07-28.csv state-library-of-western-australia-index-entries-beginning-with-k-2016-07-28.csv state-library-of-western-australia-index-entries-beginning-with-l-2016-07-28.csv state-library-of-western-australia-index-entries-beginning-with-m-2016-07-28.csv state-library-of-western-australia-index-entries-beginning-with-n-2016-07-28.csv state-library-of-western-australia-index-entries-beginning-with-o-2016-07-28.csv state-library-of-western-australia-index-entries-beginning-with-p-and-q-2016-07-28.csv state-library-of-western-australia-index-entries-beginning-with-r-2016-07-28.csv state-library-of-western-australia-index-entries-beginning-with-s-2016-07-28.csv state-library-of-western-australia-index-entries-beginning-with-t-2016-07-28.csv state-library-of-western-australia-index-entries-beginning-with-u-z-2016-07-28.csv state-library-of-western-australia-indexed-obituaries-csv-2018-09-04.csv state-library-of-western-australia-krantz-sheldon-csv-2016-07-25.csv state-library-of-western-australia-pictorial-collection-csv-2016-07-25.csv state-library-of-western-australia-slwa-centenary-wwi-2016-07-25.csv state-library-of-western-australia-wabi-police-subset-2016-07-28.csv state-library-of-western-australia-york-and-districts-subset-2016-07-28.csv state-records-office-of-western-australia-index-to-group-settlements-in-wa-2017-06-01.csv
# Serialise the `metadata` object to JSON and save it as metadata.json in the
# current working directory. The file is opened (and truncated) before the
# serialisation happens.
with open("metadata.json", "w") as metadata_file:
    metadata_json = json.dumps(metadata)
    metadata_file.write(metadata_json)

# Notebook-style bare expression: the file names of every *.db file found in
# the src directory, sorted and joined with single spaces. The value is not
# assigned to anything.
" ".join(
    sorted(
        db_path.name
        for db_path in Path("/Volumes/Workspace/mycode/ozglam-data/src").glob("*.db")
    )
)
'history-trust-of-south-australia.db libraries-tasmania.db nsw-state-archives.db public-records-office-victoria.db queensland-state-archives.db state-library-of-queensland.db state-library-of-south-australia.db state-library-of-western-australia.db state-records-office-of-western-australia.db'