from datetime import datetime
# Remember when this run began; the same value is used at the end of the
# script to report the elapsed time.
start_time = datetime.now()
print("Last run: ", start_time, sep=" ")
Last run: 2023-01-05 13:29:52.233378
import urllib3, json
import pandas as pd
# Connection pool used for the API request further down.
http = urllib3.PoolManager()
# Show every column when a DataFrame is displayed. Use the canonical option
# name ("display.max_columns"): the dotted spelling "display.max.columns" only
# worked because pandas resolves option names by regex matching.
pd.set_option("display.max_columns", None)
# Download the author list from the Litteraturbanken API and load it into a
# DataFrame.
url = "https://litteraturbanken.se/api/get_authors"
# An explicit timeout keeps the run from hanging forever if the server stalls.
r = http.request('GET', url, timeout=30.0)
# urllib3 does not raise on HTTP error statuses, so fail loudly here instead
# of trying to parse an error page as JSON.
if r.status != 200:
    raise RuntimeError(
        "GET {} failed with HTTP status {}".format(url, r.status)
    )
data = json.loads(r.data)
# The author records live under the top-level "data" key; json_normalize
# flattens nested objects into dotted column names (e.g. "birth.date").
df = pd.json_normalize(data["data"])
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4712 entries, 0 to 4711 Data columns (total 50 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 authorid 4712 non-null object 1 authorid_norm 4712 non-null object 2 db_checksum 4712 non-null object 3 db_timestamp 4712 non-null int64 4 doc_type 4712 non-null object 5 full_name 4712 non-null object 6 gender 4712 non-null object 7 imported 4072 non-null object 8 intro 746 non-null object 9 librisid 2689 non-null object 10 name_for_index 4712 non-null object 11 pictureinfo 224 non-null object 12 searchable 4712 non-null bool 13 show 4712 non-null bool 14 surname 4712 non-null object 15 updated 4072 non-null object 16 birth.date 4432 non-null object 17 birth.plain 4712 non-null object 18 death.date 2646 non-null object 19 death.plain 3971 non-null object 20 wikidata.birthplace 2156 non-null object 21 wikidata.birthplace_label 2156 non-null object 22 wikidata.deathplace 1870 non-null object 23 wikidata.deathplace_label 1870 non-null object 24 wikidata.image 1652 non-null object 25 wikidata.sbl_link 890 non-null object 26 wikidata.skbl_link 171 non-null object 27 wikidata.sol_link 142 non-null object 28 wikidata.wikidata_id 2881 non-null object 29 wikidata.wikipedia 2115 non-null object 30 db_timestamp_updated 2974 non-null float64 31 intro_text 746 non-null object 32 popularity 2477 non-null float64 33 pseudonym 151 non-null object 34 dramawebben.intro 114 non-null object 35 dramawebben.intro_author 113 non-null object 36 dramawebben.intro_author_norm 113 non-null object 37 dramawebben.legacy_url 127 non-null object 38 dramawebben.picture 82 non-null object 39 sources 543 non-null object 40 other_name 111 non-null object 41 intro_author 419 non-null object 42 intro_author_norm 419 non-null object 43 dramawebben.picture_info 76 non-null object 44 picture 312 non-null object 45 bibliography 19 non-null object 46 external_ref 9 non-null object 47 presentation 37 non-null object 48 seemore 4 non-null 
object 49 dramawebben.sources 6 non-null object dtypes: bool(2), float64(2), int64(1), object(45) memory usage: 1.7+ MB
# Distribution of the "presentation" flag over all authors; rows where the
# flag is missing are left out of the counts (dropna=True is the default).
df["presentation"].value_counts(dropna=True)
True 33 False 4 Name: presentation, dtype: int64
# Keep only the authors that have a value in the "presentation" column.
dfPresentation = df.loc[df["presentation"].notna()]
# Same counts as for the full frame, now computed on the filtered rows.
dfPresentation["presentation"].value_counts(dropna=True)
True 33 False 4 Name: presentation, dtype: int64
# Use the "presentation" column itself as the row mask to keep the authors
# whose flag is True; copy so the result is independent of dfPresentation.
dfPresentationTrue = dfPresentation.loc[dfPresentation["presentation"]].copy()
# The source frame is unchanged by the selection above.
dfPresentation["presentation"].value_counts(dropna=True)
True 33 False 4 Name: presentation, dtype: int64
# Counts of the "presentation" flag in the True-only selection.
dfPresentationTrue["presentation"].value_counts(dropna=True)
True 33 Name: presentation, dtype: int64
# Keep only the authors with show = True, i.e. the ones displayed on the web;
# copy so the result is independent of dfPresentationTrue.
dfPresentationTrueShow = dfPresentationTrue.loc[dfPresentationTrue["show"]].copy()
dfPresentationTrueShow.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 33 entries, 93 to 4589 Data columns (total 50 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 authorid 33 non-null object 1 authorid_norm 33 non-null object 2 db_checksum 33 non-null object 3 db_timestamp 33 non-null int64 4 doc_type 33 non-null object 5 full_name 33 non-null object 6 gender 33 non-null object 7 imported 33 non-null object 8 intro 33 non-null object 9 librisid 30 non-null object 10 name_for_index 33 non-null object 11 pictureinfo 25 non-null object 12 searchable 33 non-null bool 13 show 33 non-null bool 14 surname 33 non-null object 15 updated 33 non-null object 16 birth.date 32 non-null object 17 birth.plain 33 non-null object 18 death.date 23 non-null object 19 death.plain 29 non-null object 20 wikidata.birthplace 31 non-null object 21 wikidata.birthplace_label 31 non-null object 22 wikidata.deathplace 27 non-null object 23 wikidata.deathplace_label 27 non-null object 24 wikidata.image 31 non-null object 25 wikidata.sbl_link 12 non-null object 26 wikidata.skbl_link 12 non-null object 27 wikidata.sol_link 6 non-null object 28 wikidata.wikidata_id 32 non-null object 29 wikidata.wikipedia 32 non-null object 30 db_timestamp_updated 33 non-null float64 31 intro_text 33 non-null object 32 popularity 33 non-null float64 33 pseudonym 6 non-null object 34 dramawebben.intro 3 non-null object 35 dramawebben.intro_author 3 non-null object 36 dramawebben.intro_author_norm 3 non-null object 37 dramawebben.legacy_url 4 non-null object 38 dramawebben.picture 4 non-null object 39 sources 7 non-null object 40 other_name 1 non-null object 41 intro_author 19 non-null object 42 intro_author_norm 19 non-null object 43 dramawebben.picture_info 4 non-null object 44 picture 26 non-null object 45 bibliography 13 non-null object 46 external_ref 3 non-null object 47 presentation 33 non-null object 48 seemore 2 non-null object 49 dramawebben.sources 0 non-null object dtypes: bool(2), float64(2), 
int64(1), object(45) memory usage: 12.7+ KB
# Do not truncate long cell values, so the full URLs are visible.
pd.set_option("display.max_colwidth", None)
# Build the presentation-page URL for each selected author from its authorid.
(
    "http://litteraturbanken.se/författare/"
    + dfPresentationTrueShow["authorid"]
    + "/presentation"
)
93 http://litteraturbanken.se/författare/AlmqvistCJL/presentation 110 http://litteraturbanken.se/författare/AlvingB/presentation 139 http://litteraturbanken.se/författare/AnderssonD/presentation 161 http://litteraturbanken.se/författare/AnderssonP/presentation 251 http://litteraturbanken.se/författare/AurellT/presentation 328 http://litteraturbanken.se/författare/BenedictssonV/presentation 618 http://litteraturbanken.se/författare/BoyeK/presentation 652 http://litteraturbanken.se/författare/BremerF/presentation 807 http://litteraturbanken.se/författare/CederborghF/presentation 970 http://litteraturbanken.se/författare/DiktoniusE/presentation 1012 http://litteraturbanken.se/författare/EdelfeldtI/presentation 1244 http://litteraturbanken.se/författare/FerlinN/presentation 1277 http://litteraturbanken.se/författare/FlygareCarlénE/presentation 1358 http://litteraturbanken.se/författare/FrostensonK/presentation 1431 http://litteraturbanken.se/författare/GierowKR/presentation 1695 http://litteraturbanken.se/författare/HedbergO/presentation 2239 http://litteraturbanken.se/författare/KnorringS/presentation 2366 http://litteraturbanken.se/författare/LagerlöfS/presentation 2424 http://litteraturbanken.se/författare/LarssonStig/presentation 2571 http://litteraturbanken.se/författare/LindegrenE/presentation 2617 http://litteraturbanken.se/författare/LindqvistS/presentation 2861 http://litteraturbanken.se/författare/MattsonO/presentation 3096 http://litteraturbanken.se/författare/NordströmEB/presentation 3207 http://litteraturbanken.se/författare/OlssonJO/presentation 3636 http://litteraturbanken.se/författare/SandelM/presentation 3809 http://litteraturbanken.se/författare/SjögrenL/presentation 3817 http://litteraturbanken.se/författare/SjöstrandI/presentation 4039 http://litteraturbanken.se/författare/SundmanPO/presentation 4111 http://litteraturbanken.se/författare/SöderbergH/presentation 4118 http://litteraturbanken.se/författare/SödergranE/presentation 4144 
http://litteraturbanken.se/författare/TaubeE/presentation 4312 http://litteraturbanken.se/författare/VallquistG/presentation 4589 http://litteraturbanken.se/författare/WägnerE/presentation Name: authorid, dtype: object
# Export the author id together with its Wikidata id (plus the DataFrame
# index) to a CSV file in the working directory.
dfPresentationTrueShow.loc[:, ["authorid", "wikidata.wikidata_id"]].to_csv(
    path_or_buf="LitteraturbankenPresentationer.csv"
)
# Capture the finish time once so that the printed end time and the elapsed
# duration are derived from the same instant (previously a second
# datetime.now() call made the two values disagree slightly).
end = datetime.now()
print("Ended: ", end)
# Subtracting two datetimes gives a timedelta, which formats as h:mm:ss.ffffff.
print('Time elapsed (hh:mm:ss.ms) {}'.format(end - start_time))
Ended: 2023-01-05 13:29:55.588496 Time elapsed (hh:mm:ass.ms) 0:00:03.356294