#!/usr/bin/env python
# coding: utf-8
# ### Hitta poster med Libris-URI
#
#
# Denna [Notebook](https://github.com/salgo60/spa2Commons/blob/main/Notebook/Litteraturbanken%20-%20Libris.ipynb)
#
# * alla poster i Litteraturbanken skall ha samma LIBRIS-URI i WD och Litteraturbanken
# * se Litteraturbanken issue [#15 ökad precision i Wikidataimport](https://github.com/Litteraturbanken/littb-frontend/issues/55)
# In[1]:
from datetime import datetime
# Remember when the run started so the elapsed time can be reported at the end.
start_time = datetime.now()
print("Last run: ", start_time)
# In[2]:
import urllib3, json
import pandas as pd
# Download the author list from the Litteraturbanken API.
http = urllib3.PoolManager()
# Do not truncate the column list when a frame is displayed.
pd.set_option("display.max.columns", None)
url = "https://litteraturbanken.se/api/get_authors"
r = http.request('GET', url)
# Stop with a clear message when the API does not answer 200 OK, instead of
# failing further down while parsing an error page as JSON.
if r.status != 200:
    raise RuntimeError("GET %s failed with HTTP status %s" % (url, r.status))
data = json.loads(r.data)
# Flatten the records found under the "data" key into a table.
df = pd.json_normalize(data["data"])
# In[3]:
# Column overview of the downloaded author table.
df.info()
# In[4]:
# Only objects with show = True, i.e. the ones displayed on the web.
dfShowTrue = df.loc[df["show"].eq(True)].copy()
# Of those, the rows that have a librisid value.
dfShowLibris = dfShowTrue.loc[dfShowTrue["librisid"].notnull()]
dfShowLibris.info()
# In[5]:
dfShowLibris.head()
# ### Get Wikidata
# In[6]:
# pip install sparqlwrapper
# https://rdflib.github.io/sparqlwrapper/
import sys,json
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
# SPARQL endpoint that the queries below are sent to.
endpoint_url = "https://query.wikidata.org/sparql"
#
# Basic query: items that are instances of Q5 and have a P5101 value, bound to
# ?authorid (the key the rest of the script joins on). P3217 (?SBL) and
# P4963 (?SKBL) are optional. ?WikidataID is the item URI with everything
# before the Q-number removed. Rows are ordered by ?authorid.
# NOTE(review): this constant is not passed to get_sparql_dataframe in the
# code visible here; only queryLIBRIS is.
query = """SELECT (REPLACE(STR(?item), ".*Q", "Q") AS ?WikidataID) ?authorid ?SBL ?SKBL WHERE {
?item wdt:P31 wd:Q5.
?item wdt:P5101 ?authorid
OPTIONAL {?item wdt:P3217 ?SBL}
OPTIONAL {?item wdt:P4963 ?SKBL}
} order by ?authorid"""
# Extended query: the same selection plus the item URI itself and the optional
# P5587 (?WD_LIBRISXL) and P906 (?WD_SELIBR_ID) values. The result is grouped
# on every selected variable except ?WD_LIBRISXL, and sample() keeps a single
# ?WD_LIBRISXL value per group.
queryLIBRIS = """SELECT ?item (REPLACE(STR(?item), ".*Q", "Q") AS ?WikidataID) ?authorid ?SBL ?SKBL (sample(?WD_LIBRISXL) AS ?WD_LIBRISXL)
?WD_SELIBR_ID WHERE {
?item wdt:P31 wd:Q5.
?item wdt:P5101 ?authorid
OPTIONAL {?item wdt:P3217 ?SBL}
OPTIONAL {?item wdt:P5587 ?WD_LIBRISXL}
OPTIONAL {?item wdt:P906 ?WD_SELIBR_ID}
OPTIONAL {?item wdt:P4963 ?SKBL}
} group by ?item ?WikidataID ?authorid ?SBL ?SKBL ?WD_SELIBR_ID
order by ?authorid"""
def get_sparql_dataframe(endpoint_url, query):
    """
    Run a SPARQL query and return its result bindings as a Pandas data frame.

    The frame has one column per variable listed in the response head, in
    that order, and one row per binding. A variable that is missing from a
    binding (for example an unmatched OPTIONAL) is stored as None.
    """
    agent = "salgo60/%s.%s" % (sys.version_info[0], sys.version_info[1])
    client = SPARQLWrapper(endpoint_url, agent=agent)
    client.setQuery(query)
    client.setReturnFormat(JSON)
    # Parse the raw HTTP response body as JSON.
    payload = json.load(client.query().response)
    variables = payload['head']['vars']
    records = [
        [binding.get(name, {}).get('value') for name in variables]
        for binding in payload['results']['bindings']
    ]
    return pd.DataFrame(records, columns=variables)
# Run the LIBRIS query against the Wikidata endpoint.
WDLittbanktot = get_sparql_dataframe(endpoint_url, queryLIBRIS)
# In[7]:
WDLittbanktot.head(5)
# ### Compare Littbank Libris field with Wikidata
#
# In[8]:
# Join the Wikidata rows with the shown Litteraturbanken authors that have a
# librisid; the merge indicator column is kept under its own name.
WDLittbank_WD_merge = pd.merge(
    WDLittbanktot, dfShowLibris, on='authorid', indicator=True
).rename(columns={"_merge": "WD_Littbank_merge_False"})
WDLittbank_WD_merge["WD_Littbank_merge_False"].value_counts()
# ## Littbank Show LIBRIS
# In[9]:
columnsShow = ['authorid', 'WD_LIBRISXL', 'WD_SELIBR_ID', 'librisid']
# In[10]:
WDLittbank_WD_merge[columnsShow]
# In[11]:
# True where the Wikidata value equals the Litteraturbanken value, so despite
# the column name a False marks a difference.
WDLittbank_WD_merge["diffLIBRIS"] = WDLittbank_WD_merge['WD_LIBRISXL'].eq(WDLittbank_WD_merge['librisid'])
# In[12]:
columnsShow = columnsShow + ['diffLIBRIS']
# In[13]:
WDLittbank_WD_merge["diffLIBRIS"].value_counts()
# In[14]:
# Rows where the two values are not equal, as an independent copy.
WDLittbank_WD_Librisid_False = WDLittbank_WD_merge.loc[~WDLittbank_WD_merge["diffLIBRIS"]].copy()
# In[15]:
WDLittbank_WD_Librisid_False.shape
# ### List to check
# In[16]:
from IPython.display import HTML
# Add the label columns of the review table in one step.
# NOTE(review): every row receives the same plain text; to_html(escape=False)
# below suggests these columns were meant to hold HTML links - confirm.
WDLittbank_WD_Librisid_False = WDLittbank_WD_Librisid_False.assign(**{
    "Littbank": "link",
    "LittAPI": "link API",
    "LIBRIS_uri": "link LIBRIS",
    "WD": "link WD",
    "WD_sök_Librisid": "sök WD Librisid",
})
pd.set_option("display.max.columns", None)
# Render the mismatching rows as an HTML table without escaping cell content.
HTML(
    WDLittbank_WD_Librisid_False[[
        "Littbank", "LittAPI", "WD", "WikidataID", "WD_sök_Librisid",
        "authorid", "WD_LIBRISXL", "WD_SELIBR_ID", "librisid", "LIBRIS_uri",
    ]].to_html(escape=False)
)
# ### Diff found after corrections
#
# 1. some records at Litteraturbanken use the old id e.g. BrausewetterE
# 2. HenriksonA has the # tag
# 3. CarlssonGottfrid feels like the wrong value = 22551370
# 4. duplicates
# 1. LIBRISXL [SPARQL](https://w.wiki/6Q2c)
# 1. Litteraturbanken [SPARQL](https://w.wiki/6Q2k)
#
# ## Litteraturbanken diff LIBRISXL med Wikidata
# * WDLittbank_WD_merge
# In[17]:
# Shown Litteraturbanken authors that have no librisid value.
dfShowLibrisHasNoLibrisid = dfShowTrue[dfShowTrue.librisid.isnull()]
dfShowLibrisHasNoLibrisid.info()
# In[18]:
# Join those authors with the Wikidata rows on authorid.
WDLittbank_WD_merge_LittNoLibrisXL = pd.merge(WDLittbanktot, dfShowLibrisHasNoLibrisid, on='authorid',indicator=True)
WDLittbank_WD_merge_LittNoLibrisXL.rename(columns={"_merge": "WD_Littbank_merge_NoLibrisid"},inplace = True)
WDLittbank_WD_merge_LittNoLibrisXL["WD_Littbank_merge_NoLibrisid"].value_counts()
# In[19]:
# librisid is null in every row here by construction, so missing values are
# counted too; the default would drop them and return an empty result.
WDLittbank_WD_merge_LittNoLibrisXL["librisid"].value_counts(dropna=False)
# In[20]:
WDLittbank_WD_merge_LittNoLibrisXL[["authorid","WikidataID","librisid","WD_LIBRISXL"]]
# In[21]:
# Find rows where Wikidata has a LIBRISXL value but Litteraturbanken has none.
# Copy the selection so the label columns added below are written to an
# independent frame; this avoids SettingWithCopyWarning without silencing
# chained-assignment warnings globally.
CandidatesLittarurbankenLIBRISXL = WDLittbank_WD_merge_LittNoLibrisXL[WDLittbank_WD_merge_LittNoLibrisXL["WD_LIBRISXL"].notnull()].copy()
CandidatesLittarurbankenLIBRISXL[["authorid","WikidataID","librisid","WD_LIBRISXL"]]
# In[22]:
# Save the candidate list for follow-up.
CandidatesLittarurbankenLIBRISXL[["authorid","WikidataID","librisid","WD_LIBRISXL"]].to_csv("CandidatesLittarurbankenLIBRISXL.csv")
# In[23]:
# Label columns for the candidate table.
# NOTE(review): every row receives the same plain text; to_html(escape=False)
# below suggests these columns were meant to hold HTML links - confirm.
CandidatesLittarurbankenLIBRISXL["Littbank"] = "link Litt"
CandidatesLittarurbankenLIBRISXL["LittAPI"] = "link API"
CandidatesLittarurbankenLIBRISXL["LIBRIS_uri"] = "link LIBRIS"
CandidatesLittarurbankenLIBRISXL["WD"] = "link WD"
pd.set_option("display.max.columns", None)
#
# ### Table with Candidates
#
# In[24]:
HTML(CandidatesLittarurbankenLIBRISXL[["Littbank","LittAPI","WD","WikidataID",
    "authorid","WD_LIBRISXL","WD_SELIBR_ID","librisid","LIBRIS_uri"]
    ].to_html(escape=False))
# In[25]:
# Read the clock once so the printed end time and the elapsed time agree.
end = datetime.now()
print("Ended: ", end)
print('Time elapsed (hh:mm:ss.ms) {}'.format(end - start_time))
# In[ ]: