version 0.8
See the end, where we find an ORCID in the JSON file; it looks like not everyone has an ORCID.
# Try to get ORCID from SWEPUB
# see https://kundo.se/org/swepub/d/api-for-amnesklassificering/#c3571837
import pandas as pd
import json
import time
# Load the SWEPUB JSON Lines export into a single DataFrame, printing the
# elapsed wall-clock time after each stage.
start_time = time.time()
filename = "data/swepub-duplicated-2020-07-05.jsonl"
filestore = "data/swepub-duplicated-2020-07-05_1.pd"  # only used by the optional pickle step below
print(time.ctime())

# With chunksize set, read_json returns an iterator of DataFrames
# (10000 input lines per chunk) instead of parsing the whole file at once.
df_chunk = pd.read_json(filename, lines=True, chunksize=10000)

# Drain the reader; each element of chunk_list is one DataFrame chunk.
chunk_list = list(df_chunk)
print("--- %s seconds ---" % (time.time() - start_time))

# Concatenate the chunks into one DataFrame.
df_concat = pd.concat(chunk_list)
print("--- %s seconds ---" % (time.time() - start_time))
df_concat.info()

# Optional: cache the combined frame as a pickle.
#df_concat.to_pickle(filestore)
#print("--- %s seconds ---" % (time.time() - start_time))
Thu Jul 16 16:56:56 2020
# Lift the column limit when frames are displayed.
# NOTE(review): the option key is spelled with dots ("display.max.columns");
# confirm it resolves to the intended pandas option.
pd.set_option("display.max.columns", None)

# Expand the "instanceOf" column (presumably one nested JSON object per
# record -- TODO confirm) into its own DataFrame.
instance_of = df_concat["instanceOf"]
instance_of
pd.DataFrame(instance_of.tolist())
instanceOfdf = pd.DataFrame(instance_of.tolist())
instanceOfdf

# Same expansion, one level deeper, for the "genreForm" column.
genre_form = instanceOfdf["genreForm"]
pd.DataFrame(genre_form.tolist())

# Column names, non-null counts and dtypes of the expanded frame.
instanceOfdf.info()
# Exploratory look at a few nested columns, using the slice 1:10 as a sample.
title_sample = instanceOfdf["hasTitle"][1:10].tolist()
pd.DataFrame(title_sample)
contribution_sample = instanceOfdf["contribution"][1:10].tolist()
pd.DataFrame(contribution_sample)
pd.DataFrame(instanceOfdf["hasNote"][1:10].tolist())

pd.options.display.width = 0
pd.DataFrame(contribution_sample)

# Drill into the first sampled record's contribution entries.
first_contribution = pd.DataFrame(contribution_sample[0])
first_contribution
first_contribution["agent"]
first_contribution["agent"].tolist()

# Agents of the second sampled record.
pd.DataFrame(contribution_sample[1])["agent"].tolist()

pd.options.display.width = 0

# Title entries of the third sampled record (evaluated twice, as before).
third_title = pd.DataFrame(title_sample[2])
third_title
third_title

# Roles that accompany the first sampled record's agents.
first_contribution["role"].tolist()
pd.DataFrame(instanceOfdf["electronicLocator"][1:10])

# Same agent lookup further into the data set.
pd.DataFrame(instanceOfdf["contribution"][1000:1010].tolist()[1])["agent"].tolist()
later_agents = pd.DataFrame(instanceOfdf["contribution"][2000:2010].tolist()[1])["agent"].tolist()
later_agents

# First agent's identifier entries and the value of the first one --
# presumably the ORCID this file is looking for; verify against the data.
later_agents[0]["identifiedBy"]
later_agents[0]["identifiedBy"][0]["value"]