Check Open Data portals for Swedish municipalities
from datetime import datetime

# Record when this notebook run started, for the "last run" stamp below.
start_time = datetime.now()
# FIX: "Last runa" was a typo in the printed message.
print("Last run: ", start_time)
Last runa: 2021-06-29 11:33:35.369172
# pip install sparqlwrapper
# https://rdflib.github.io/sparqlwrapper/
import sys,json
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
# Wikidata Query Service SPARQL endpoint.
endpoint_url = "https://query.wikidata.org/sparql"
# Select Swedish municipalities (instances of Q127448) that are part of a
# county (Q193556) and have an official website (P856) and coordinates,
# but are MISSING an open-data portal statement (P8402) — the MINUS clause.
# The REPLACE(...) extracts the bare Q-id from the entity URI.
queryOpenDataMiss = """SELECT (REPLACE(STR(?org), ".*Q", "Q") AS ?wikidata) ?org ?orgLabel ?www WHERE {
?org wdt:P31 wd:Q127448.
?org wdt:P856 ?www.
?org wdt:P361 ?lan.
?lan wdt:P31 wd:Q193556.
?org wdt:P276?/wdt:P625 ?coord.
minus {?org wdt:P8402 ?portalValue}
SERVICE wikibase:label { bd:serviceParam wikibase:language "sv". }
}
GROUP BY ?org ?orgLabel ?www
ORDER BY DESC (?orgLabel) """
def get_sparql_dataframe(endpoint_url, query):
    """Run a SPARQL *query* against *endpoint_url* and return the result
    bindings as a pandas DataFrame, one column per projected variable.

    Variables unbound in a given solution become None in that row.
    """
    agent = "salgo60/%s.%s" % (sys.version_info[0], sys.version_info[1])
    client = SPARQLWrapper(endpoint_url, agent=agent)
    client.setQuery(query)
    client.setReturnFormat(JSON)
    response = client.query()
    payload = json.load(response.response)

    columns = payload['head']['vars']
    rows = [
        [binding.get(col, {}).get('value') for col in columns]
        for binding in payload['results']['bindings']
    ]
    return pd.DataFrame(rows, columns=columns)
# Fetch all Swedish municipalities currently lacking an open-data
# portal statement (P8402) on Wikidata.
WDOpenDataCheck = get_sparql_dataframe(endpoint_url, queryOpenDataMiss)
# Notebook-style inspection of (rows, columns).
WDOpenDataCheck.shape
(148, 4)
# FIX: use the fully-qualified option key — the bare 'max_colwidth' alias
# was deprecated and removed in pandas 1.0+; 'display.max_colwidth' works
# on both old and new pandas versions.
pd.set_option('display.max_colwidth', 400)
# Eyeball the last 60 candidate municipalities.
WDOpenDataCheck.tail(60)
import urllib3, json
from tqdm import tqdm
# Connect/read timeouts so one dead host cannot stall the whole scan.
timeout = urllib3.util.Timeout(connect=2.0, read=7.0)
# FIX: the timeout object was created but never used — pass it to the
# pool so every request actually honours it.
http = urllib3.PoolManager(timeout=timeout)
listOpenDataCheck = []
# Probe <official website>/psidata for every municipality; record the
# HTTP status per Wikidata item.
for WD, row in tqdm(WDOpenDataCheck.iterrows(), total=WDOpenDataCheck.shape[0]):
    url = row["www"] +"/psidata"
    new_item = dict()
    new_item['wikidata'] = row["wikidata"]
    try:
        r = http.request('GET', url)
        new_item['status'] = r.status
    except Exception as e:
        # FIX: the old bare `except:` re-read `r.status`, which raised
        # NameError if the very first request failed and otherwise
        # recorded a stale status from a previous iteration. Record the
        # failure explicitly instead.
        print ("Error ", e, url, row["wikidata"] )
        new_item['status'] = None
    new_item['url'] = url
    listOpenDataCheck.append(new_item)
print (len(listOpenDataCheck))
22%|██▏ | 32/148 [00:10<00:45, 2.53it/s]
Error 404 http://www.torsas.se/psidata Q515551
35%|███▌ | 52/148 [00:17<00:34, 2.82it/s]
Error 404 http://www.skurup.se/psidata Q515266
50%|█████ | 74/148 [00:35<00:35, 2.07it/s]
Error 404 http://www.monsteras.se/psidata Q515250
66%|██████▌ | 97/148 [30:15<6:30:51, 459.84s/it]
Error 404 http://www.koping.se/psidata Q42009
100%|██████████| 148/148 [30:33<00:00, 12.39s/it]
148
# Collect the probe results into a DataFrame for filtering below.
result_columns = ['wikidata', 'status', 'url']
OpenDataNewtot = pd.DataFrame(listOpenDataCheck, columns=result_columns)
# Notebook-style inspection of (rows, columns).
OpenDataNewtot.shape
(148, 3)
OpenDataNewtot[OpenDataNewtot.status == 200]
wikidata | status | url | |
---|---|---|---|
11 | Q271153 | 200 | http://www.are.se/psidata |
18 | Q515477 | 200 | http://www.vastervik.se/psidata |
36 | Q501432 | 200 | http://www.tibro.se/psidata |
89 | Q504235 | 200 | http://www.ljungby.se/psidata |
108 | Q510223 | 200 | http://www.karlshamn.se/psidata |
109 | Q499435 | 200 | https://www.karlsborg.se/psidata |
116 | Q428749 | 200 | http://www.hjo.se/psidata |
121 | Q499359 | 200 | http://www.hammaro.se/psidata |
142 | Q509476 | 200 | http://www.boden.se/psidata |