This notebook visualises changes in Australian government departments over time, using data from Wikidata. It creates a hierarchically-ordered network graph where each agency is represented as a node whose position and colour are determined by the decade in which the agency was created. The size of the node indicates how long the agency was in existence, while edges between nodes connect agencies to their successors. The earliest agencies appear at the top of the graph.
You can view the query used to generate this graph using the Wikidata Query Service.
import json
import arrow
import pandas as pd
from IPython.display import IFrame, display
from pyvis.network import Network
from SPARQLWrapper import JSON, SPARQLWrapper
# Query Wikidata for Australian government departments (instances of
# Q57605562), retrieving each agency's RecordSearch identifier (P10856),
# creation date (P571), and — where present — its dissolution date (P576)
# and the RecordSearch id of the agency that replaced it (P1366).
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setQuery(
"""
SELECT
?item ?label
?id ?start_date ?end_date ?after_id
WHERE
{
?item wdt:P31 wd:Q57605562;
wdt:P10856 ?id;
wdt:P571 ?start_date;
rdfs:label ?agency_label.
OPTIONAL { ?item wdt:P576 ?end_date. }
OPTIONAL { ?item wdt:P1366 ?after.
?after wdt:P10856 ?after_id. }
FILTER (lang(?agency_label) = "en").
# Combine start and end year into a single string, setting end date to "" if it doesn't exist
BIND(concat(xsd:string(YEAR(?start_date)), "-", COALESCE(xsd:string(YEAR(?end_date)), "")) as ?date_range)
# Combine dept name and date range into a single string
BIND(concat(?agency_label, " (", ?date_range, ")") as ?label)
SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
}
"""
)
# Request JSON results and run the query.
sparql.setReturnFormat(JSON)
results = sparql.query().convert()
# Flatten the nested SPARQL JSON bindings into a dataframe. With sep="_"
# the nested fields become columns such as `id_value`, `start_date_value`,
# `end_date_value`, `after_id_value`, and `label_value` (used below).
df = pd.json_normalize(results["results"]["bindings"], sep="_")
df.head()
item_type | item_value | after_id_type | after_id_value | id_type | id_value | start_date_datatype | start_date_type | start_date_value | end_date_datatype | end_date_type | end_date_value | label_type | label_value | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | uri | http://www.wikidata.org/entity/Q16956105 | literal | CA 15 | literal | CA 8 | http://www.w3.org/2001/XMLSchema#dateTime | literal | 1901-01-01T00:00:00Z | http://www.w3.org/2001/XMLSchema#dateTime | literal | 1916-11-14T00:00:00Z | literal | Department of Home Affairs (1901-1916) |
1 | uri | http://www.wikidata.org/entity/Q16956105 | literal | CA 14 | literal | CA 8 | http://www.w3.org/2001/XMLSchema#dateTime | literal | 1901-01-01T00:00:00Z | http://www.w3.org/2001/XMLSchema#dateTime | literal | 1916-11-14T00:00:00Z | literal | Department of Home Affairs (1901-1916) |
2 | uri | http://www.wikidata.org/entity/Q16956110 | literal | CA 27 | literal | CA 24 | http://www.w3.org/2001/XMLSchema#dateTime | literal | 1928-12-10T00:00:00Z | http://www.w3.org/2001/XMLSchema#dateTime | literal | 1932-04-12T00:00:00Z | literal | Department of Home Affairs (1928-1932) |
3 | uri | http://www.wikidata.org/entity/Q16956114 | literal | CA 3068 | literal | CA 2474 | http://www.w3.org/2001/XMLSchema#dateTime | literal | 1977-12-20T00:00:00Z | http://www.w3.org/2001/XMLSchema#dateTime | literal | 1980-11-03T00:00:00Z | literal | Department of Home Affairs (1977-1980) |
4 | uri | http://www.wikidata.org/entity/Q16956119 | literal | CA 4131 | literal | CA 3068 | http://www.w3.org/2001/XMLSchema#dateTime | literal | 1980-11-03T00:00:00Z | http://www.w3.org/2001/XMLSchema#dateTime | literal | 1984-12-13T00:00:00Z | literal | Department of Home Affairs and Environment (19... |
# Tableau style colours from http://tableaufriction.blogspot.com/2012/11/finally-you-can-use-tableau-data-colors.html
# Each entry is a dot-separated "R.G.B" string; the channels are split
# apart and recombined into CSS-style `rgb()` strings further below.
rgb = [
"255.187.120",
"255.127.14",
"174.199.232",
"44.160.44",
"31.119.180",
"255.152.150",
"214.39.40",
"197.176.213",
"152.223.138",
"148.103.189",
"247.182.210",
"227.119.194",
"196.156.148",
"140.86.75",
"127.127.127",
"219.219.141",
"199.199.199",
"188.189.34",
"158.218.229",
"23.190.207",
]
def make_darker(colour, factor=0.75):
    """
    Return a darkened copy of an RGB colour.

    `colour` is a sequence of channel values given as strings (0-255).
    Each channel is scaled by `factor` and rounded; the result is a
    list of channel strings.
    """
    darkened = []
    for channel in colour:
        darkened.append(str(round(int(channel) * factor)))
    return darkened
def make_lighter(colour, factor=0.75):
    """
    Return a lightened copy of an RGB colour.

    `colour` is a sequence of channel values given as strings (0-255).
    Each channel is moved `factor` of the way towards white (255);
    the result is a list of channel strings.
    """
    lightened = []
    for channel in colour:
        value = int(channel)
        lightened.append(str(round((255 - value) * factor) + value))
    return lightened
# Build the CSS-style colour strings used for node backgrounds, borders
# and highlights. Each entry in `rgb` is a dot-separated "R.G.B" string.
colours = []
borders = []
highlights = []
for r in rgb:
    channels = r.split(".")
    colours.append(f'rgb({",".join(channels)})')
    borders.append(f'rgb({",".join(make_darker(channels))})')
    highlights.append(f'rgb({",".join(make_lighter(channels))})')
# Group agencies by the first three digits of their start year ("190" to
# "202"), i.e. by decade, giving each group its own colour scheme.
decades = [str(prefix) for prefix in range(190, 203)]
decade_groups = {}
for index, decade in enumerate(decades):
    decade_groups[decade] = {
        "color": {
            "background": colours[index],
            "border": borders[index],
            "highlight": {"background": highlights[index], "border": borders[index]},
        }
    }
# Calculate the possible range of values for the length of an agency's
# existence, measured in days from 1 January 1901 to today. This range is
# used below (in calculate_size) to scale node sizes.
max_days = (arrow.utcnow() - arrow.get("1901-01-01")).days
min_days = 1
current_range = max_days - min_days
def calculate_size(start, end, current_range=current_range, biggest=150, smallest=30):
    """
    Calculate the size of a node based on the agency's length of existence.

    Parameters:
    - start: ISO-format creation date string
    - end: dissolution date string; may be missing/NaN for agencies
      that still exist
    - current_range: span (in days) of possible agency lifetimes,
      used to normalise the result
    - biggest, smallest: bounds of the desired output range

    Returns a float rescaled into the [smallest, biggest] range.
    See: https://stackoverflow.com/a/929107
    """
    start_date = arrow.get(start)
    try:
        end_date = arrow.get(end)
    except (ValueError, TypeError):
        # No parseable end date -- the agency still exists, so measure to now.
        end_date = arrow.utcnow()
    delta = end_date - start_date
    # Rescale the lifetime in days from [1, max_days] to [smallest, biggest].
    # The original added a hard-coded 20 here, which ignored the `smallest`
    # parameter and pushed results below the intended minimum of 30.
    return (((delta.days - 1) * (biggest - smallest)) / current_range) + smallest
# Build the network graph: one node per agency, with edges linking each
# agency to its successor(s).
net = Network(notebook=True, cdn_resources="remote")
for row in df.itertuples():
    agency_id = row.id_value
    start = row.start_date_value
    # Hyperlink through to the agency's record in RecordSearch
    link = f"<a target='_blank' href='https://recordsearch.naa.gov.au/scripts/AutoSearch.asp?Number={agency_id}'>{agency_id}, {row.label_value}</a>"
    net.add_node(
        agency_id,
        label=agency_id,
        title=link,
        # First three digits of the start year -> decade group (node colour)
        group=start[:3],
        # Start year -> hierarchical level (vertical position by creation date)
        level=int(start[:4]),
        # Node size reflects how long the agency existed
        size=calculate_size(start, row.end_date_value),
    )
# Add edges from each agency to its successor; rows without a successor id
# are dropped so they produce no edge.
for row in df.dropna(subset=["after_id_value"]).itertuples():
    net.add_edge(row.id_value, row.after_id_value)
# Network graph configuration. Building this as a Python dict and dumping
# it to JSON is easier to manage than hand-writing the PyVis options string.
# A hierarchical layout, with levels derived from start dates, orders the
# agencies chronologically from top to bottom.
hierarchy = {
    "enabled": True,
    "sortMethod": "directed",
    "shakeTowards": "leaves",
    "nodeSpacing": 20,
    "levelSeparation": 40,
    "treeSpacing": 20,
}
options = {
    "configure": {"enabled": False},
    "layout": {"hierarchical": hierarchy},
    "physics": {"hierarchicalRepulsion": {"avoidOverlap": 1, "nodeDistance": 180}},
    "nodes": {"font": {"size": 15}},
    # Colour nodes according to the decade groups defined earlier
    "groups": decade_groups,
    "edges": {
        "arrows": {"to": {"enabled": True, "scaleFactor": 0.5}},
        "arrowStrikethrough": False,
        "smooth": {"enabled": True},
        "color": {"color": "#b0bec5", "inherit": True},
    },
}
net.set_options(f"var options = {json.dumps(options)}")
# Writing the HTML file directly (rather than calling net.show()) gives
# better results and predictable sizing in the notebook.
net.write_html("agencies-network.html", notebook=True)
display(IFrame("agencies-network.html", height=800, width="100%"))
Created by Tim Sherratt for the GLAM Workbench.
The development of the Wikidata section of the GLAM Workbench was supported by Wikimedia Australia.