Set SDC (Structured Data on Commons) depicts statements for files uploaded with spa2Commons
See below. Step 1: get the Wikidata (WD) object that each picture depicts
from datetime import datetime
# Remember when this run began; the final cell subtracts it from the finish
# time to report how long the whole notebook took.
start_time = datetime.now()
print("Last run: ", start_time)
Last run: 2021-11-26 02:50:01.339986
import requests
# One shared HTTP session so every API call below reuses the same connection.
S = requests.Session()
# Action API endpoint of Wikimedia Commons. Commons is hosted on
# commons.wikimedia.org; the previously used commons.wikipedia.org host is not
# the documented endpoint, so call the real one directly.
URL = "https://commons.wikimedia.org/w/api.php"
def get_Category(pageName):
    """Return the topical Commons category of a page.

    Queries the MediaWiki API (``prop=categories``) through the module-level
    session ``S`` and endpoint ``URL``, then discards the licence, upload-tool
    and maintenance categories listed in ``filtercat``.

    Args:
        pageName: Page title as passed to the API ``titles`` parameter,
            e.g. ``"File:A_G_Ahlqvist_SPA10.jpg"``.

    Returns:
        The title (including the ``Category:`` prefix) of the last category
        that is not in the filter list, or ``""`` when the page is missing,
        has no categories, or only has filtered ones.
    """
    PARAMS = {
        "action": "query",
        "format": "json",
        "prop": "categories",
        # The API returns only 10 categories per request by default; files
        # carrying many maintenance categories could otherwise lose the
        # topical one to truncation.
        "cllimit": "max",
        "titles": pageName,
    }
    r = S.get(url=URL, params=PARAMS)
    data = r.json()
    # TODO dont get hidden categories
    filtercat = {
        "Category:CC-BY-4.0",
        "Category:Swedish Portrait Archive",
        "Category:Uploaded with spa2Commons",
        "Category:Template Unknown (author)",
        "Category:Images with extracted images",
        "Category:Extracted images",
        "Category:Scanned with HP Deskjet F4200",
        "Category:Pages using duplicate arguments in template calls",
        "Category:Creative Commons Attribution-Share Alike missing SDC copyright status",
        "Category:Creative Commons Attribution-Share Alike 4.0 missing SDC copyright license",
        "Category:Creative Commons Attribution missing SDC copyright status",
        "Category:Creative Commons Attribution 4.0 missing SDC copyright license",
        "Category:Media requiring renaming - rationale 3",
        # Titles in the response are compared verbatim against this set and
        # every other entry carries the "Category:" prefix; without it this
        # entry could never match.
        "Category:Media requiring renaming - target already exists",
    }
    target_category = ""
    # An error response has no "query"/"pages" section; treat it as "no pages".
    pages = data.get("query", {}).get("pages", {})
    for page in pages.values():
        # Missing pages and uncategorised pages have no "categories" key.
        for cat in page.get("categories", []):
            if cat["title"] not in filtercat:
                # Deliberately keeps overwriting: the last match wins.
                target_category = cat["title"]
    return target_category
# Smoke test: look up the topical category of one known file. As the last
# expression of the cell, the call's return value is displayed by the notebook.
common_name = "File:A_G_Ahlqvist_SPA10.jpg"
get_Category(common_name)
'Category:Alfred Gustaf Ahlqvist'
def getWD(commonsCategory):
    """Resolve a Commons category to its Wikidata item id via hub.toolforge.org.

    Args:
        commonsCategory: Category title including the ``Category:`` prefix.
            It is inserted into the request URL as given.

    Returns:
        The item id (e.g. ``"Q4830349"``), i.e. the hub's destination URL with
        the ``https://www.wikidata.org/wiki/`` prefix removed, or ``""`` when
        the hub reports no destination or returns an unusable response. In
        both failure cases an ``Error`` line is printed.
    """
    urlHub = "https://hub.toolforge.org/commons:" + commonsCategory + "?format=json&site=wd"
    hub = S.get(url=urlHub)
    try:
        data = hub.json()
    except ValueError:
        # Non-JSON body (e.g. an HTML error page): report it and carry on so
        # that one bad response does not abort a long batch run.
        print("Error", hub.text)
        return ""
    try:
        wd = data["destination"]["url"].replace("https://www.wikidata.org/wiki/", "")
    except (KeyError, TypeError, AttributeError):
        # "Not Found" answers carry a "message" instead of a "destination".
        # Only shape errors are caught, so Ctrl-C and real bugs still surface.
        print("Error", data)
        wd = ""
    return wd
# Smoke test: resolve one known category (title already percent-encoded) to
# its Wikidata id. The notebook displays the call's return value.
test_category ="Category:Axel_Rappe_(1838%E2%80%931918)"
getWD(test_category)
'Q4830349'
# used tool xxxx to get csv files with pictures
import csv
# Collected [media id, Wikidata id] pairs, one per row of the CSV export.
mid_wd_list = []
# Export to process. Earlier runs used Cat_2commons.csv and the dated exports
# 20211101, 20211103, 20211106 and 20211114; only the latest one is read.
cat2commonsfiles = "Cat_2commons_20211126.csv"
# newline="" lets the csv module handle line endings inside quoted fields.
# NOTE(review): assumes the export is UTF-8 encoded — confirm against the tool.
with open(cat2commonsfiles, newline="", encoding="utf-8") as csvfile:
    cat_reader = csv.DictReader(csvfile, delimiter=',', quotechar='"')
    for row in cat_reader:
        # Two network round trips per row: file title -> topical category,
        # then category -> Wikidata id ("" when no item is linked).
        mid_wd_list.append([row["mid"], getWD(get_Category(row["title"]))])
Error {'message': 'Not Found', 'context': {'text': 'commons:Category:Bernhard Lundgren 1843', 'lang': 'en'}} Error {'message': 'Not Found', 'context': {'text': 'commons:Category:Bernhard Lundgren 1843', 'lang': 'en'}} Error {'message': 'Not Found', 'context': {'text': 'commons:Category:Axel Wästfelt 1881', 'lang': 'en'}} Error {'message': 'Not Found', 'context': {'text': 'commons:Category:Axel Wästfelt 1881', 'lang': 'en'}} Error {'message': 'Not Found', 'context': {'text': 'commons:Category:Karl Salin 1890', 'lang': 'en'}} Error {'message': 'Not Found', 'context': {'text': 'commons:Category:Karl Salin 1890', 'lang': 'en'}} Error {'message': 'Not Found', 'context': {'text': 'commons:Category:Bernhard Lundgren 1843', 'lang': 'en'}} Error {'message': 'Not Found', 'context': {'text': 'commons:Category:Sven Trägårdh 1814', 'lang': 'en'}} Error {'message': 'Not Found', 'context': {'text': 'commons:Category:Sven Trägårdh 1814', 'lang': 'en'}}
import pandas as pd
# Turn the collected pairs into a two-column table (column 0: media id,
# column 1: Wikidata id) and preview the first ten rows in the notebook.
df = pd.DataFrame(mid_wd_list)
df.head(10)
0 | 1 | |
---|---|---|
0 | M110465275 | Q2824703 |
1 | M111371972 | Q2482314 |
2 | M112375203 | Q18238225 |
3 | M112375216 | Q18238225 |
4 | M112392665 | Q4943098 |
5 | M112408085 | Q29047398 |
6 | M112427640 | Q48709014 |
7 | M112458663 | Q6171709 |
8 | M112489160 | Q5771716 |
9 | M112490909 | Q109545314 |
# Persist the table for the follow-up step; despite the .txt extension the
# file is written by to_csv. df.shape then displays (rows, columns).
df.to_csv("SPACategories_Mid_WD.txt")
df.shape
(1717, 2)
# Capture the finish time once and reuse it, so the printed end time and the
# elapsed time describe the same instant (calling datetime.now() a second time
# made the two figures disagree slightly).
end = datetime.now()
print("Ended: ", end)
print('Time elapsed (hh:mm:ss.ms) {}'.format(end - start_time))
Ended: 2021-11-26 03:03:52.587248 Time elapsed (hh:mm:ss.ms) 0:13:51.248001