#!/usr/bin/env python
# coding: utf-8
# #### Test att hämta bilder och söka i SPA
# * [Denna notebook](https://github.com/salgo60/spa2Commons/blob/main/Notebook/SPA%20Litteraturbanken.ipynb)
#
# Tanken att se vilka möjligheter som finns
# * skapa ett script --> ett klick för att ladda upp bilder se test [spa2commons](https://commons.wikimedia.org/wiki/User:Salgo60/spa2commons.js) - GITHUB [salgo60/spa2Commons](https://github.com/salgo60/spa2Commons)
# * att ta en plats som en kyrkogård eller en koppling Litteraturbanken och se vad som kan matchas
# * att stämma av länkar SPA -> Wikipedia att dessa finns i Wikidata med länk tillbaka via [P4819](https://www.wikidata.org/wiki/Property:P4819?uselang=sv)
#
# ## sv:Wikipedia
# * kopplas till Litteraturbanken med en [mall](https://sv.wikipedia.org/wiki/Mall:Litteraturbanken) som visar användningen i Spårningskategorin [Litteraturbanken](https://sv.wikipedia.org/wiki/Kategori:Litteraturbanken)
# * [mallen är skriven](https://sv.wikipedia.org/w/index.php?title=Mall:Litteraturbanken&action=edit) så att identifieraren hämtas från [Wikidata egenskap P5101](https://www.wikidata.org/wiki/Property:P5101?uselang=sv) om inget annat anges
# In[1]:
# Record when this notebook was last executed.
from datetime import datetime
start_time = datetime.now()
print("Last run: ", start_time)
# In[2]:
import urllib3, json
import pandas as pd
http = urllib3.PoolManager()
# NOTE(review): only the last assignment of `url` takes effect; the first two
# look like leftovers from earlier endpoint experiments (latest.php and the
# punycode hostname) and could be removed.
url= "https://portrattarkiv.se/endpoints/latest.php"
url= "https://xn--portrttarkiv-kcb.se/endpoints/search.php"
url= "https://portrattarkiv.se/endpoints/search.php"
#
# ## Icke kopplade personer till Svenskt Porträttarkiv P4819 som har Litteraturbanken P5101
#
# Testar att söka med lite olika parametrar för att få bättre precision i rankingen när jag söker mot Svenskt Porträttarkiv SPA
# * personer hos Litteraturbanken [Property:P5101](https://www.wikidata.org/wiki/Property:P5101?uselang=sv)
# * bilderna i SPA är gamla --> utan copyright --> begränsa till sökning för födda innan 1920
# * [SPARQL](https://w.wiki/6QWL)
# * 2023 03 05 ger 1290 träff
#
# In[34]:
# Query to get people with no picture in Wikidata but connected to Litteraturbanken.
endpoint_url = "https://query.wikidata.org/sparql"
import sys
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
# Query people connected to Litteraturbanken with no picture and born before 1920.
# https://w.wiki/6QX5
# SPARQL: Litteraturbanken authors (P5101) that lack an image (P18) in
# Wikidata and were born on or before 1920-01-01 (old enough for the SPA
# portraits to be out of copyright).  The exclusion of items already linked
# to SPA (P4819) is commented out inside the query itself.
queryLitt = """#title: Records in Wikidata connected to Litteraturbanken with no picture for people born before 1920
SELECT (REPLACE(STR(?item), ".*Q", "Q") AS ?WikidataID)
(CONCAT (?itemLabel," ",str(year(?birthDayWD))) AS ?search)
?birthDayWD
?firstnameLabel ?lastnameLabel
?item ?itemLabel
(str(year(?birthDayWD)) AS ?BirthYearWD)
?Littauthorid
WHERE {
?item wdt:P5101 ?Littauthorid.
# minus {?item wdt:P4819 ?c}
minus {?item wdt:P18 ?noimage }
OPTIONAL { ?item wdt:P735 ?firstname. }
OPTIONAL { ?item wdt:P734 ?lastname. }
OPTIONAL { ?item wdt:P569 ?birthDayWD. }
SERVICE wikibase:label { bd:serviceParam wikibase:language "sv", "en". }
FILTER (?birthDayWD <= "1920-01-01T00:00:00Z"^^xsd:dateTime )
}"""
# Query for all Litteraturbanken people without a picture in Wikidata but
# with an article on Swedish Wikipedia.
# https://w.wiki/6QWw
# FIX: the original triple `?wikipedia schema:isPartOf .` had no object,
# which is invalid SPARQL; the standard sitelink pattern restricts the
# article to Swedish Wikipedia via its site URL.
queryArticle = """SELECT distinct (CONCAT (?itemLabel," ",str(year(?birthDayWD))) AS ?search)
?birthDayWD
?firstnameLabel ?lastnameLabel
?item ?itemLabel ?wikipedia
(str(year(?birthDayWD)) AS ?BirthYearWD)
WHERE {
?item wdt:P5101 ?Litt.
minus {?item wdt:P18 ?d }
OPTIONAL { ?item wdt:P735 ?firstname. }
OPTIONAL { ?item wdt:P734 ?lastname. }
OPTIONAL { ?item wdt:P569 ?birthDayWD. }
{
?wikipedia schema:about ?item .
?wikipedia schema:inLanguage "sv" .
?wikipedia schema:isPartOf <https://sv.wikipedia.org/> .
}
SERVICE wikibase:label { bd:serviceParam wikibase:language "sv", "en". }
} order by ?itemLabel """
def get_sparql_dataframe(endpoint_url, query):
    """
    Run *query* against the SPARQL *endpoint_url* and return the result
    bindings as a pandas DataFrame, one column per projected variable.
    Unbound variables in a row become None.
    """
    agent = "salgo60/%s.%s" % (sys.version_info[0], sys.version_info[1])
    client = SPARQLWrapper(endpoint_url, agent=agent)
    client.setQuery(query)
    client.setReturnFormat(JSON)
    response = client.query()
    payload = json.load(response.response)
    # Column order follows the SELECT clause as reported by the endpoint.
    columns = payload['head']['vars']
    rows = [
        [binding.get(col, {}).get('value') for col in columns]
        for binding in payload['results']['bindings']
    ]
    return pd.DataFrame(rows, columns=columns)
#SPALittdf = get_sparql_dataframe(endpoint_url, query)
# Fetch the Litteraturbanken candidates (no image in Wikidata, born before 1920).
SPALittdf = get_sparql_dataframe(endpoint_url, queryLitt)
SPALittdf.info()
# In[35]:
SPALittdf
# In[36]:
import warnings
# Silence pandas FutureWarnings (e.g. the DataFrame.append deprecation noise).
warnings.simplefilter(action='ignore', category=FutureWarning)
# In[37]:
from wikidata.client import Client
from tqdm.notebook import tqdm
import urllib3, json
http = urllib3.PoolManager()
# Base URLs for the Svenskt Porträttarkiv (SPA) detail page, search API
# and picture download endpoint.
SPAdetail = "https://portrattarkiv.se/details/"
url= "https://portrattarkiv.se/endpoints/search.php"
urlbasePic = "https://portrattarkiv.se/endpoints/file.php?id="
def getdfScore(panddf):
    """
    Search Svenskt Porträttarkiv (SPA) for every person in *panddf* and
    collect the candidate hits with their search score.

    Parameters
    ----------
    panddf : pandas.DataFrame
        Result of the queryLitt SPARQL query; must contain the columns
        'search', 'WikidataID', 'Littauthorid', 'firstnameLabel',
        'lastnameLabel' and 'BirthYearWD'.

    Returns
    -------
    pandas.DataFrame
        One row per SPA hit with columns:
        all, wd, spaid, Littauthorid, score, FirstNameWD, LastNameWD,
        BirthYearWD, FirstNameSPA, LastNameSPA, BirthYearSPA

    Notes
    -----
    Fixes over the original version:
    * The declared columns ('WikidataID', 'BirthDayWD', 'BirthDaySPA',
      'searchStr') did not match the keys actually appended ('wd',
      'BirthYearWD', 'BirthYearSPA'), producing extra all-NaN columns.
      Downstream cells read 'wd'/'BirthYearWD', so those names are kept.
    * Rows are collected in a list and the DataFrame is built once at the
      end; DataFrame.append was deprecated and removed in pandas 2.0 and
      was O(n^2) anyway.
    * The exception handler referenced __file__, which is undefined in a
      notebook and raised NameError inside the handler itself.
    """
    columns = ['all', 'wd', 'spaid', 'Littauthorid', 'score',
               'FirstNameWD', 'LastNameWD', 'BirthYearWD',
               'FirstNameSPA', 'LastNameSPA', 'BirthYearSPA']
    rows = []
    for index, row in tqdm(panddf.iterrows(), total=panddf.shape[0]):
        allString = row["search"]
        wd = row["WikidataID"]
        Littauthorid = row["Littauthorid"]
        print("\n", allString, wd)
        # SPA search API takes a JSON body; limit to the 5 best hits.
        encoded_body = json.dumps({
            "limit": "5",
            "from": "0",
            "firstname": row["firstnameLabel"],
            "lastname": row["lastnameLabel"],
            "birthyear": row["BirthYearWD"],
            "all": allString
        })
        r = http.request('POST', url,
                         headers={'Content-Type': 'application/json'},
                         body=encoded_body)
        if r.status != 200:
            # Best-effort: log the HTTP status and skip this person.
            print(r.status)
            continue
        data = json.loads(r.data.decode('utf-8'))
        for h in data["hits"]["hits"]:
            spaid = h["_id"]
            source = h["_source"]
            try:
                score = h["_score"]
                FirstNameSPA = source["FirstName"]
                LastNameSPA = source["LastName"]
                BirthYearSPA = source["BirthYear"]
                print("\t\t", score, FirstNameSPA, " ", LastNameSPA, " - ",
                      BirthYearSPA, SPAdetail + spaid, "\t", )
                rows.append({
                    'all': allString,
                    'wd': wd,
                    'Littauthorid': Littauthorid,
                    'spaid': spaid,
                    'score': score,
                    'FirstNameWD': row["firstnameLabel"],
                    'LastNameWD': row["lastnameLabel"],
                    'BirthYearWD': row["BirthYearWD"],
                    'FirstNameSPA': FirstNameSPA,
                    'LastNameSPA': LastNameSPA,
                    'BirthYearSPA': BirthYearSPA
                })
            except Exception as e:
                # Hits occasionally lack a name/birth-year field; log and
                # continue with the next hit.
                print(f"{type(e).__name__} at line {e.__traceback__.tb_lineno}: {e}")
    return pd.DataFrame(rows, columns=columns)
# In[38]:
# Run the SPA search for every Litteraturbanken candidate (slow: one HTTP
# request per person).
dfScoreSPALitt = getdfScore(SPALittdf)
dfScoreSPALitt.info()
# In[39]:
# NOTE(review): neither result below is assigned or displayed, so these two
# lines have no lasting effect (sort_values returns a new frame and the
# groupby object is discarded).
dfScoreSPALitt.sort_values('score', ascending=False)
dfScoreSPALitt.groupby("wd")
# In[40]:
dfScoreSPALitt.info()
# In[41]:
dfScoreSPALitt.head(200)
#Group by WIkidata Q number and get highest score
#create a helper column with max
#dfScoreSPALitt['max'] = dfScoreSPALitt.groupby('wd')['score'].transform('max')
#dfScoreSPALittMax = dfScoreSPALitt.groupby('wd')['max','spaid','FirstNameWD','LastNameWD','BirthYearWD']. \
#                   apply(lambda x: x.nlargest(1, columns=['score'])).sort_values(['max','wd'], ascending=False)
#dfScoreSPALittMax
#ss = dfScoreSPALitt.groupby('wd')['max','spaid','FirstNameWD','LastNameWD','BirthYearWD']. \
#                   apply(lambda x: x.nlargest(1, columns=['score'])).sort_values(ascending=False)
# In[42]:
# Best candidates first.
dfCandidates = dfScoreSPALitt.sort_values(by=['score'], ascending=False)
dfCandidates
#
# ## Candidates SPA
#
# In[43]:
#dfCandidates
# In[44]:
# Build link/wikitext columns for the candidate table and render it as HTML.
from IPython.display import HTML
space = "%20"
# NOTE(review): the "Littbank", "SPA", "WikicommonsUrl" and "UploadUrl"
# columns are plain placeholder strings here, not real <a href> links —
# presumably stubs for the actual link markup; confirm against the
# notebook this was exported from.
dfCandidates["Littbank"] = "link Littbank"
dfCandidates["WD"] = "" + dfCandidates["wd"] + ""
dfCandidates["WD_template"] = "{{Q|" + dfCandidates["wd"] + "}}"
dfCandidates["SPA"] = "link SPA"
dfCandidates["WikicommonsUrl"] = "link Commons"
# URL-encoded (%20-separated) and human-readable "First Last Year" strings.
dfCandidates["name_Ascii"] = dfCandidates["FirstNameWD"] +\
space + dfCandidates["LastNameWD"] + space + dfCandidates["BirthYearWD"]
dfCandidates["name_str"] = dfCandidates["FirstNameWD"] \
+ " " + dfCandidates["LastNameWD"] + " " + dfCandidates["BirthYearWD"]
#dfCandidates["name_Ascii"] = dfCandidates["FirstNameWD"] \
#	+ space + dfCandidates["LastNameWD"] + space + dfCandidates["BirthYearWD"]
dfCandidates["CommonsCat"] = "[[Category:" + dfCandidates["name_Ascii"] + "]]"
# URL-encoded fragments for pre-filling the Commons upload form
# ({{Information}} template, description, source, author, license, categories).
curlyLeftBrackets = "%7B%7B"
curlyRightBrackets = "%7D%7D"
verticalBar = "%7C"
lineFeed = "%0A"
equalSign = "%3D"
information = curlyLeftBrackets + "Information"
description = verticalBar + "description" + equalSign + curlyLeftBrackets
english = curlyLeftBrackets + "en" + verticalBar
swedish = curlyLeftBrackets + "sv" + verticalBar
uploadedFromSwe = "uppladdat%20fr%C3%A5n%20Portrattarkiv.se%20bild%20av%20"
uploadedFromEn = "uploaded%20from%20Portrattarkiv.se%20picture%20of%20"
source = verticalBar + "source" + equalSign + "https://portrattarkiv.se/details/"
author= verticalBar + "author" + equalSign + curlyLeftBrackets + "sv|okänd"+ curlyRightBrackets + \
curlyLeftBrackets +"en|Unknown" + curlyRightBrackets + lineFeed
permission = verticalBar + "permission" + equalSign + lineFeed
otherversion = verticalBar + "other versions" + equalSign + lineFeed
categories = lineFeed + "[[Category:Uploaded with spa2CommonsNotebook]]" + lineFeed + \
"[[Category:Swedish_Portrait_Archive]]"
# NOTE(review): `license` shadows the builtin of the same name; harmless in
# a notebook but worth renaming.
license = "&wpLicense=cc-by-sa-4.0"
wpDestFile = "&wpDestFile="
wpSourceType = "wpSourceType=url&wpUploadFileURL=https://portrattarkiv.se/endpoints/file.php?id="
dfCandidates["UploadUrl"] = "link Upload"
pd.set_option("display.max.columns", None)
HTML(dfCandidates[["score","name_str", "LastNameWD","FirstNameWD","Littbank", "UploadUrl","WD", "SPA", "spaid",\
"WikicommonsUrl"]
].to_html(escape=False))
# In[46]:
dfCandidates.to_csv("Littraturbanken_candidates.csv")
# ### Test med preview
# In[ ]: