#!/usr/bin/env python
# coding: utf-8

# #### Test: fetch pictures and search in SPA (Svenskt Porträttarkiv)
# * [This notebook](https://github.com/salgo60/spa2Commons/blob/main/Notebook/SPA%20Litteraturbanken.ipynb)
#
# Goal: explore what is possible
# * build a script --> one click to upload pictures, see test
#   [spa2commons](https://commons.wikimedia.org/wiki/User:Salgo60/spa2commons.js) -
#   GITHUB [salgo60/spa2Commons](https://github.com/salgo60/spa2Commons)
# * take a place such as a cemetery, or a link to Litteraturbanken, and see what can be matched
# * cross-check SPA -> Wikipedia links: they should exist in Wikidata with a link back via
#   [P4819](https://www.wikidata.org/wiki/Property:P4819?uselang=sv)
#
# ## sv:Wikipedia
# * linked to Litteraturbanken with a [template](https://sv.wikipedia.org/wiki/Mall:Litteraturbanken)
#   whose usage shows up in the tracking category
#   [Litteraturbanken](https://sv.wikipedia.org/wiki/Kategori:Litteraturbanken)
# * [the template](https://sv.wikipedia.org/w/index.php?title=Mall:Litteraturbanken&action=edit)
#   fetches the identifier from
#   [Wikidata property P5101](https://www.wikidata.org/wiki/Property:P5101?uselang=sv)
#   unless one is given explicitly

# In[1]:

# Timestamp the run so the notebook output records when it was produced.
from datetime import datetime

start_time = datetime.now()
print("Last run: ", start_time)

# In[2]:

import urllib3, json
import pandas as pd

# Shared connection pool for all SPA requests below.
http = urllib3.PoolManager()

# Endpoint candidates tried during development; only the LAST assignment
# is effective (the earlier two are kept for documentation of what was tested).
url= "https://portrattarkiv.se/endpoints/latest.php"
url= "https://xn--portrttarkiv-kcb.se/endpoints/search.php"
url= "https://portrattarkiv.se/endpoints/search.php"

# ## People with Litteraturbanken P5101 not yet linked to Svenskt Porträttarkiv P4819
#
# Trying different search parameters for better ranking precision when
# searching Svenskt Porträttarkiv (SPA):
# * people in Litteraturbanken [Property:P5101](https://www.wikidata.org/wiki/Property:P5101?uselang=sv)
# * SPA pictures are old --> out of copyright --> restrict the search to people born before 1920
# * [SPARQL](https://w.wiki/6QWL)
# 2023-03-05: gave 1290 hits

# In[34]:

# Query Wikidata for people linked to Litteraturbanken (P5101) but with no
# image (P18) — candidates whose portrait might be found in SPA.
endpoint_url = "https://query.wikidata.org/sparql"

import sys
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON

# People connected to Litteraturbanken with no picture, born before 1920
# (old enough that a SPA portrait is out of copyright).
# https://w.wiki/6QX5
queryLitt = """#title: Records in Wikidata connected to Litteraturbanken with no picture for people born before 1920
SELECT (REPLACE(STR(?item), ".*Q", "Q") AS ?WikidataID)
       (CONCAT (?itemLabel," ",str(year(?birthDayWD))) AS ?search)
       ?birthDayWD ?firstnameLabel ?lastnameLabel ?item ?itemLabel
       (str(year(?birthDayWD)) AS ?BirthYearWD) ?Littauthorid
WHERE {
  ?item wdt:P5101 ?Littauthorid.
  # minus {?item wdt:P4819 ?c}
  minus {?item wdt:P18 ?noimage }
  OPTIONAL { ?item wdt:P735 ?firstname. }
  OPTIONAL { ?item wdt:P734 ?lastname. }
  OPTIONAL { ?item wdt:P569 ?birthDayWD. }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "sv", "en". }
  FILTER (?birthDayWD <= "1920-01-01T00:00:00Z"^^xsd:dateTime )
}"""

# All Litteraturbanken people without a picture in Wikidata but with a
# Swedish Wikipedia article.  https://w.wiki/6QWw
# NOTE(review): the schema:isPartOf object was missing (lost in export);
# restored to the Swedish Wikipedia site node — verify against https://w.wiki/6QWw
queryArticle = """SELECT distinct (CONCAT (?itemLabel," ",str(year(?birthDayWD))) AS ?search)
       ?birthDayWD ?firstnameLabel ?lastnameLabel ?item ?itemLabel ?wikipedia
       (str(year(?birthDayWD)) AS ?BirthYearWD)
WHERE {
  ?item wdt:P5101 ?Litt.
  minus {?item wdt:P18 ?d }
  OPTIONAL { ?item wdt:P735 ?firstname. }
  OPTIONAL { ?item wdt:P734 ?lastname. }
  OPTIONAL { ?item wdt:P569 ?birthDayWD. }
  {
    ?wikipedia schema:about ?item .
    ?wikipedia schema:inLanguage "sv" .
    ?wikipedia schema:isPartOf <https://sv.wikipedia.org/> .
  }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "sv", "en". }
}
order by ?itemLabel
"""


def get_sparql_dataframe(endpoint_url, query):
    """Run *query* against the SPARQL *endpoint_url* and return a DataFrame.

    One column per projected SPARQL variable; variables unbound in a given
    result row become None.
    """
    user_agent = "salgo60/%s.%s" % (sys.version_info[0], sys.version_info[1])
    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    result = sparql.query()
    processed_results = json.load(result.response)

    cols = processed_results['head']['vars']
    # Unbound variables are simply absent from the binding dict -> None.
    out = [[row.get(c, {}).get('value') for c in cols]
           for row in processed_results['results']['bindings']]
    return pd.DataFrame(out, columns=cols)


#SPALittdf = get_sparql_dataframe(endpoint_url, query)
SPALittdf = get_sparql_dataframe(endpoint_url, queryLitt)
SPALittdf.info()

# In[35]:

SPALittdf

# In[36]:

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# In[37]:

from wikidata.client import Client  # NOTE(review): imported but unused in this notebook
from tqdm.notebook import tqdm
import urllib3, json

http = urllib3.PoolManager()

SPAdetail = "https://portrattarkiv.se/details/"
url= "https://portrattarkiv.se/endpoints/search.php"
urlbasePic = "https://portrattarkiv.se/endpoints/file.php?id="


def getdfScore(panddf):
    """Search Svenskt Porträttarkiv for every person in *panddf*.

    For each row (expects columns 'search', 'WikidataID', 'Littauthorid',
    'firstnameLabel', 'lastnameLabel', 'BirthYearWD') the SPA search
    endpoint is POSTed and up to 5 hits are collected.

    Returns a DataFrame with one row per SPA hit:
    all, wd, Littauthorid, spaid, score,
    FirstNameWD, LastNameWD, BirthYearWD,
    FirstNameSPA, LastNameSPA, BirthYearSPA

    Fixes vs. the first version:
    * DataFrame.append was deprecated/removed in pandas — rows are now
      collected in a list and converted once at the end (also O(n) not O(n^2)).
    * The declared column names now match the keys actually stored
      (the old version declared 'WikidataID'/'BirthDayWD' etc. but stored
      'wd'/'BirthYearWD', leaving permanently-NaN ghost columns).
    * The exception message no longer references __file__, which is not
      defined inside a notebook and raised a NameError of its own.
    """
    rows = []
    for index, row in tqdm(panddf.iterrows(), total=panddf.shape[0]):
        allString = row["search"]
        wd = row["WikidataID"]
        Littauthorid = row["Littauthorid"]
        print("\n", allString, wd)

        encoded_body = json.dumps({
            "limit": "5",
            "from": "0",
            "firstname": row["firstnameLabel"],
            "lastname": row["lastnameLabel"],
            "birthyear": row["BirthYearWD"],
            "all": allString
        })
        r = http.request('POST', url,
                         headers={'Content-Type': 'application/json'},
                         body=encoded_body)
        if r.status != 200:
            # Best effort: log the failing status and move on to the next person.
            print(r.status)
            continue

        data = json.loads(r.data.decode('utf-8'))
        for h in data["hits"]["hits"]:
            spaid = h["_id"]
            source = h["_source"]
            try:
                score = h["_score"]
                FirstNameSPA = source["FirstName"]
                LastNameSPA = source["LastName"]
                BirthYearSPA = source["BirthYear"]
                print("\t\t", score, FirstNameSPA, " ", LastNameSPA, " - ",
                      BirthYearSPA, SPAdetail + spaid, "\t", )
                rows.append({
                    'all': allString,
                    'wd': wd,
                    'Littauthorid': Littauthorid,
                    'spaid': spaid,
                    'score': score,
                    'FirstNameWD': row["firstnameLabel"],
                    'LastNameWD': row["lastnameLabel"],
                    'BirthYearWD': row["BirthYearWD"],
                    'FirstNameSPA': FirstNameSPA,
                    'LastNameSPA': LastNameSPA,
                    'BirthYearSPA': BirthYearSPA
                })
            except Exception as e:
                # A hit with a missing field is logged and skipped, not fatal.
                print(f"{type(e).__name__} at line {e.__traceback__.tb_lineno}: {e}")
    return pd.DataFrame(rows, columns=[
        'all', 'wd', 'Littauthorid', 'spaid', 'score',
        'FirstNameWD', 'LastNameWD', 'BirthYearWD',
        'FirstNameSPA', 'LastNameSPA', 'BirthYearSPA'])


# In[38]:

dfScoreSPALitt = getdfScore(SPALittdf)
dfScoreSPALitt.info()

# In[39]:

dfScoreSPALitt.sort_values('score', ascending=False)
dfScoreSPALitt.groupby("wd")

# In[40]:

dfScoreSPALitt.info()

# In[41]:

dfScoreSPALitt.head(200)

# Commented-out experiments: group by Wikidata Q number and keep the highest score.
#dfScoreSPALitt['max'] = dfScoreSPALitt.groupby('wd')['score'].transform('max')
#dfScoreSPALittMax = dfScoreSPALitt.groupby('wd')['max','spaid','FirstNameWD','LastNameWD','BirthYearWD']. \
#    apply(lambda x: x.nlargest(1, columns=['score'])).sort_values(['max','wd'], ascending=False)
#dfScoreSPALittMax
#ss = dfScoreSPALitt.groupby('wd')['max','spaid','FirstNameWD','LastNameWD','BirthYearWD'].
# \
# apply(lambda x: x.nlargest(1, columns=['score'])).sort_values(ascending=False)

# In[42]:

# Candidate matches, best SPA score first.
dfCandidates = dfScoreSPALitt.sort_values(by=['score'], ascending=False)
dfCandidates

# ## Candidates SPA

# In[43]:

#dfCandidates

# In[44]:

# Build clickable link columns and render the candidate table as HTML.
# NOTE(review): the "link ..." string literals below appear to have lost their
# HTML anchor markup (<a href=...>) when this notebook was exported — as written
# they are plain text, not links; restore from the original notebook.
from IPython.display import HTML
space = "%20"

dfCandidates["Littbank"] = "link Littbank"
dfCandidates["WD"] = "" + dfCandidates["wd"] + ""
# {{Q|...}} template form, handy for pasting into wiki pages.
dfCandidates["WD_template"] = "{{Q|" + dfCandidates["wd"] + "}}"
dfCandidates["SPA"] = "link SPA"
dfCandidates["WikicommonsUrl"] = "link Commons"
# URL-encoded "First Last Year" (spaces as %20) used in Commons links/categories.
dfCandidates["name_Ascii"] = dfCandidates["FirstNameWD"] +\
space + dfCandidates["LastNameWD"] + space + dfCandidates["BirthYearWD"]
# Human-readable "First Last Year".
dfCandidates["name_str"] = dfCandidates["FirstNameWD"] \
+ " " + dfCandidates["LastNameWD"] + " " + dfCandidates["BirthYearWD"]
#dfCandidates["name_Ascii"] = dfCandidates["FirstNameWD"] \
#    + space + dfCandidates["LastNameWD"] + space + dfCandidates["BirthYearWD"]
dfCandidates["CommonsCat"] = "[[Category:" + dfCandidates["name_Ascii"] + "]]"

# URL-encoded fragments for prefilling the Commons upload form
# ({{Information}} template pieces; %7B%7B = "{{", %7C = "|", %3D = "=", %0A = newline).
# NOTE(review): these constants are not referenced anywhere in the visible code —
# presumably they were interpolated into the (now-stripped) UploadUrl anchor; confirm
# against the original notebook before removing.
curlyLeftBrackets = "%7B%7B"
curlyRightBrackets = "%7D%7D"
verticalBar = "%7C"
lineFeed = "%0A"
equalSign = "%3D"
information = curlyLeftBrackets + "Information"
description = verticalBar + "description" + equalSign + curlyLeftBrackets
english = curlyLeftBrackets + "en" + verticalBar
swedish = curlyLeftBrackets + "sv" + verticalBar
uploadedFromSwe = "uppladdat%20fr%C3%A5n%20Portrattarkiv.se%20bild%20av%20"
uploadedFromEn = "uploaded%20from%20Portrattarkiv.se%20picture%20of%20"
source = verticalBar + "source" + equalSign + "https://portrattarkiv.se/details/"
author= verticalBar + "author" + equalSign + curlyLeftBrackets + "sv|okänd"+ curlyRightBrackets + \
curlyLeftBrackets +"en|Unknown" + curlyRightBrackets + lineFeed
permission = verticalBar + "permission" + equalSign + lineFeed
otherversion = verticalBar + "other versions" + equalSign + lineFeed
categories = lineFeed + "[[Category:Uploaded with spa2CommonsNotebook]]" + lineFeed + \
"[[Category:Swedish_Portrait_Archive]]"
# NOTE(review): `license` shadows the built-in of the same name (harmless here,
# but worth renaming, e.g. wpLicense).
license = "&wpLicense=cc-by-sa-4.0"
wpDestFile = "&wpDestFile="
wpSourceType = "wpSourceType=url&wpUploadFileURL=https://portrattarkiv.se/endpoints/file.php?id="
dfCandidates["UploadUrl"] = "link Upload"

pd.set_option("display.max.columns", None)
# escape=False so the (intended) anchor tags render as links in the notebook.
HTML(dfCandidates[["score","name_str", "LastNameWD","FirstNameWD","Littbank", "UploadUrl","WD", "SPA", "spaid",\
"WikicommonsUrl"] ].to_html(escape=False))

# In[46]:

# Persist the candidate list for later review / upload runs.
dfCandidates.to_csv("Littraturbanken_candidates.csv")

# ### Test with preview

# In[ ]: