Looks like we have a limit of 10000 records
from datetime import datetime
# Record and display the local time at which this run started.
start_time = datetime.now()
print("Last run: ", start_time)
Last run: 2021-10-28 02:32:20.192058
import urllib3, json
import pandas as pd
# Connection pool through which the HTTP requests in this file are issued.
http = urllib3.PoolManager()
# Endpoint that receives the POSTed JSON search queries.
# Two earlier assignments were immediately overwritten and never used; they
# pointed at:
#   https://portrattarkiv.se/endpoints/latest.php
#   https://xn--portrttarkiv-kcb.se/endpoints/search.php  (punycode host name)
url = "https://portrattarkiv.se/endpoints/search.php"
from tqdm.notebook import tqdm
# Presumably the base URL of a record's detail page; it is not referenced
# anywhere else in this section.
SPAdetail = "https://portrattarkiv.se/details/"

limit = 10000
listdf = []

# One search request per start year. With limit == 10000 the inner range only
# yields the offset 0, so each year is fetched as a single page.
# NOTE(review): the recorded run shows the same total (17243) for every year
# and 10000 distinct "_id" values overall, so the "year" field may not narrow
# the result set -- confirm against the endpoint.
for year in range(1780, 1910, 50):
    for start in range(0, 10000, limit):
        print("\n\nyear", year, "\tfrom: ", start, "\tlimit", limit, "")

        # Only records carrying a Swedish Wikipedia URL fact are requested.
        query = {
            "limit": limit,
            "from": start,
            "year": year,
            "facts": {"URL": "https://sv.wikipedia.org/wiki"},
        }
        encoded_body = json.dumps(query)
        r = http.request(
            'POST',
            url,
            headers={'Content-Type': 'application/json'},
            body=encoded_body,
        )
        print("http status ", r.status)
        if r.status != 200:
            print("Exit ", r.status)
            print(r)
            # Leaves the inner loop only; the next year is still requested.
            break

        data = json.loads(r.data.decode('utf-8'))
        urls = []  # never filled in this section
        hits = data["hits"]
        print("\nTotal: ", hits["total"])
        print(data["_shards"])
        print("Hits: ", len(hits["hits"]))

        # Flatten the nested hit dictionaries into dotted column names.
        df = pd.json_normalize(hits["hits"])
        listdf.append(df)
year 1780 from: 0 limit 10000 http status 200 Total: 17243 {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0} Hits: 10000 year 1830 from: 0 limit 10000 http status 200 Total: 17243 {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0} Hits: 10000 year 1880 from: 0 limit 10000 http status 200 Total: 17243 {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0} Hits: 10000
#listdf
# Number of per-request DataFrames collected by the download loop.
len(listdf)
3
# Stack the per-request DataFrames into one frame.
dfTot = pd.concat(listdf)
# Debug aid: list the available column names.
#for c in dfTot.columns:
# print(c)
# Row count before de-duplication.
len(dfTot)
30000
# The separate requests return overlapping hits (30000 rows collapse to 10000
# in the recorded run), so keep a single row per "_id".
df = dfTot.drop_duplicates(subset=["_id"])
# Select the two columns with a list: a set is rejected as an indexer by
# pandas >= 2.0 (deprecated since 1.5) and has no defined column order.
len(df[["_id", "_source.Facts.URL"]])
10000
from wikidata.client import Client
from tqdm.notebook import tqdm
def checkSPA(wikidata_id):
    """Return the P4819 value of a Wikidata entity, or False if it has none.

    P4819 is presumably the portrait-archive identifier property, going by
    the "SPA" naming used here -- confirm on Wikidata.

    Args:
        wikidata_id: Id of the Wikidata entity to load, e.g. "Q6206374".

    Returns:
        Whatever the wikidata client yields for the entity's P4819 statement,
        or False when the entity carries no such statement.

    Raises:
        Any error raised while loading the entity propagates to the caller;
        only the missing-property case is mapped to False.
    """
    client = Client()
    entity = client.get(wikidata_id, load=True)
    try:
        # Mapping-style lookup; a missing property surfaces as KeyError.
        spa_prop = entity[client.get('P4819')]
    except KeyError:
        # Catch only the "no such statement" case so that interrupts and
        # unrelated failures are not silently reported as "missing".
        return False
    return spa_prop
def getWD(url):
    """Look up the Wikidata item id linked to a Swedish Wikipedia article URL.

    Args:
        url: Article URL; the "https://sv.wikipedia.org/wiki/" prefix is
            stripped and the remainder is sent as the page title.

    Returns:
        The "wikibase_item" page property of the first page in the API
        response, or False when the API does not answer with HTTP 200.

    Raises:
        KeyError: if the response lacks "query", "pages", "pageprops" or
            "wikibase_item".
        IndexError: if the response contains no pages.
    """
    page_title = url.replace("https://sv.wikipedia.org/wiki/", "")
    api_url = (
        "https://sv.wikipedia.org/w/api.php?action=query&prop=pageprops|wbentityusage&titles="
        + page_title
        + "&format=json"
    )
    response = http.request(
        'Get',
        api_url,
        headers={'Content-Type': 'application/json'},
    )
    if response.status != 200:
        print("Error ", response.status)
        return False

    pages = json.loads(response.data.decode('utf-8'))['query']['pages']
    # The result is keyed by page id; only the first page is inspected.
    first_page = list(pages.values())[0]
    return first_page['pageprops']['wikibase_item']
# Audit every de-duplicated record: for each Wikipedia URL attached to it,
# resolve the Wikidata item and print a "Todo" line when that item has no
# P4819 statement.
for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    for n in row['_source.Facts.URL']:
        if "wikipedia" not in n.lower():
            continue
        try:
            wdrec = getWD(n)
            if wdrec is False:
                # getWD has already printed the HTTP error and there is no
                # item id to check.
                continue
            if checkSPA(wdrec) is False:
                print("Todo", row['_id'], wdrec, n)
        except Exception as err:
            # Best effort: keep going, but report the failure so that a
            # skipped URL is not mistaken for one that already has P4819.
            # Exception (not a bare except) keeps Ctrl-C working.
            print("Skip", row['_id'], n, repr(err))
0%| | 0/10000 [00:00<?, ?it/s]
Todo sj9PGLAlnmUAAAAAACaEqg Q61992416 https://sv.wikipedia.org/wiki/Fannyudde Todo 5TJc-sPXaKAAAAAAAAAnng Q41660503 https://sv.wikipedia.org/wiki/Fahlcrantz Todo sj9PGLAlnmUAAAAAABT2Cg Q10684951 https://sv.wikipedia.org/wiki/Svenska_Bindgarnsfabriken Todo sj9PGLAlnmUAAAAAAA2A0Q Q10432053 https://sv.wikipedia.org/wiki/Bo_fajans Todo NhqpvvI9tpAAAAAAAAAFYg Q6206374 https://sv.wikipedia.org/wiki/Carl_Herman_Tersmeden Todo sj9PGLAlnmUAAAAAABg2AQ Q10700627 https://sv.wikipedia.org/wiki/Tottie Todo wbbNj8LLWMAAAAAAAABARg Q10454594 https://sv.wikipedia.org/wiki/Classic_kaffe Todo I_2wtd5VefAAAAAAAABLGA Q274489 https://sv.wikipedia.org/wiki/Yngsj%C3%B6mordet Todo wbbNj8LLWMAAAAAAAAAwXw Q27243339 https://sv.wikipedia.org/wiki/Nisser_(sl%C3%A4kt) Todo IQojCnw0WmAAAAAAAAAzTA Q97482829 https://sv.wikipedia.org/wiki/Elias_Fries_(ingenj%C3%B6r) Todo YB0QHyfj0hAAAAAAAAA-fw Q98711021 https://sv.wikipedia.org/wiki/Staaff_(sl%C3%A4kt) Todo sj9PGLAlnmUAAAAAABL3vQ Q27243339 https://sv.wikipedia.org/wiki/Nisser_(sl%C3%A4kt) Todo 9fCwuOWMwGAAAAAAAAAB4w Q10685071 https://sv.wikipedia.org/wiki/Svenska_J%C3%A4rnv%C3%A4gsverkst%C3%A4derna Todo sj9PGLAlnmUAAAAAABThMw Q10541905 https://sv.wikipedia.org/wiki/J%C3%A4rnv%C3%A4gsolyckan_i_Malmsl%C3%A4tt Todo mLh5P0pzYFAAAAAAAABWug Q10728408 https://sv.wikipedia.org/wiki/%C3%96stra_Sm%C3%A5lands_missionsf%C3%B6rening Todo sj9PGLAlnmUAAAAAABF4Ew Q274489 https://sv.wikipedia.org/wiki/Yngsj%C3%B6mordet Todo sj9PGLAlnmUAAAAAABLYnw Q2167776 https://sv.wikipedia.org/wiki/Douglas_(%C3%A4tt) Todo 4e1scVvZ-EAAAAAAAAAGrg Q10427446 https://sv.wikipedia.org/wiki/Baumgardt