# import libraries for json import
import json
import urllib.request, json
# import json for visualiziation
import pandas as pd
from pandas.io.json import json_normalize
# import Matpltlib for data visualisation
import matplotlib.pyplot as plt
# import Numpy for data handling
import numpy as np
# connect to DataCite REST API and get items with own search query, e.g. query=Max%20Planck&page[size]=1000'
with urllib.request.urlopen('https://api.datacite.org/dois?query=Ilia%20State%20University&page[size]=5000') as url:
# create a json file out of API call
DataCite = json.load(url)
# create and open a new json file
with open('DataCite_Raw_Metadata.json', 'w') as f:
# write results in json file
json.dump(DataCite, f)
# load to normalize json file
data = json.load(open('DataCite_Raw_Metadata.json'))
# load to normalize json file
data_normalized = pd.json_normalize(data,'data')
# create Pandas dataframe
df = pd.DataFrame(data_normalized)
# display Pandas dataframe as table
display(df)
id | type | attributes.doi | attributes.identifiers | attributes.creators | attributes.titles | attributes.publisher | attributes.publicationYear | attributes.subjects | attributes.contributors | ... | attributes.versionOfCount | attributes.created | attributes.registered | attributes.published | attributes.updated | relationships.client.data.id | relationships.client.data.type | attributes.container.type | attributes.container.identifier | attributes.container.identifierType | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10.5281/zenodo.7223794 | dois | 10.5281/zenodo.7223794 | [{'identifier': 'https://zenodo.org/record/722... | [{'name': 'Grossmann, Yves Vincent', 'givenNam... | [{'title': 'Research Data Policies – A Short O... | Zenodo | 2022 | [{'subject': 'ISUBII22'}, {'subject': 'Ilia St... | [] | ... | 0 | 2022-10-26T12:15:33Z | 2022-10-26T12:15:34Z | None | 2022-10-26T12:15:34Z | cern.zenodo | clients | NaN | NaN | NaN |
1 | 10.5281/zenodo.7223795 | dois | 10.5281/zenodo.7223795 | [] | [{'name': 'Grossmann, Yves Vincent', 'givenNam... | [{'title': 'Research Data Policies – A Short O... | Zenodo | 2022 | [{'subject': 'ISUBII22'}, {'subject': 'Ilia St... | [] | ... | 0 | 2022-10-26T12:15:32Z | 2022-10-26T12:15:33Z | None | 2022-10-26T12:15:33Z | cern.zenodo | clients | NaN | NaN | NaN |
2 | 10.5281/zenodo.7243699 | dois | 10.5281/zenodo.7243699 | [{'identifier': 'https://zenodo.org/record/724... | [{'name': 'Grossmann, Yves Vincent', 'givenNam... | [{'title': 'DataCite ROR Parser'}] | Zenodo | 2022 | [{'subject': 'ISUBII22'}, {'subject': 'Python'... | [] | ... | 0 | 2022-10-24T08:08:18Z | 2022-10-24T08:08:19Z | None | 2022-10-26T06:40:27Z | cern.zenodo | clients | NaN | NaN | NaN |
3 | 10.5281/zenodo.7243700 | dois | 10.5281/zenodo.7243700 | [] | [{'name': 'Grossmann, Yves Vincent', 'givenNam... | [{'title': 'DataCite ROR Parser'}] | Zenodo | 2022 | [{'subject': 'ISUBII22'}, {'subject': 'Python'... | [] | ... | 0 | 2022-10-24T08:08:17Z | 2022-10-24T08:08:18Z | None | 2022-10-26T06:40:26Z | cern.zenodo | clients | NaN | NaN | NaN |
4 | 10.5281/zenodo.7225465 | dois | 10.5281/zenodo.7225465 | [{'identifier': 'https://zenodo.org/record/722... | [{'name': 'Grossmann, Yves Vincent', 'givenNam... | [{'title': 'Research Data Management – A Short... | Zenodo | 2022 | [{'subject': 'ISUBII22'}, {'subject': 'Ilia St... | [] | ... | 0 | 2022-10-25T07:15:22Z | 2022-10-25T07:15:22Z | None | 2022-10-25T07:15:22Z | cern.zenodo | clients | NaN | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
97 | 10.13140/rg.2.1.3170.7766 | dois | 10.13140/rg.2.1.3170.7766 | [{'identifier': 'https://doi.org/10.13140/rg.2... | [{'name': 'Tumanishvili, George G.', 'nameType... | [{'title': 'ხელშეკრულების შედგენის ტექნიკა და ... | ILIA STATE UNIVERSITY PRESS | 2012 | [] | [] | ... | 0 | 2016-06-26T21:47:27Z | 2016-06-26T21:47:28Z | None | 2020-06-29T13:28:40Z | rg.rg | clients | NaN | NaN | NaN |
98 | 10.13140/rg.2.1.1126.8720 | dois | 10.13140/rg.2.1.1126.8720 | [{'identifier': 'https://doi.org/10.13140/rg.2... | [{'name': 'Tumanishvili, George G.', 'nameType... | [{'title': 'სამართლის პროფესიები'}] | ILIA STATE UNIVERSITY PRESS | 2009 | [] | [] | ... | 0 | 2016-06-26T17:15:55Z | 2016-06-26T17:15:56Z | None | 2020-06-29T13:28:40Z | rg.rg | clients | NaN | NaN | NaN |
99 | 10.13140/rg.2.1.3944.9206 | dois | 10.13140/rg.2.1.3944.9206 | [{'identifier': 'https://doi.org/10.13140/rg.2... | [{'name': 'Godoladze, Karlo', 'nameType': 'Per... | [{'title': 'საკონსტიტუციო ცვლილებები საქართველ... | ილიას სახელმწიფო უნივერსიტეტის გამომცემლობა, I... | 2013 | [] | [] | ... | 0 | 2016-06-26T16:37:37Z | 2016-06-26T16:37:38Z | None | 2020-06-29T13:28:40Z | rg.rg | clients | NaN | NaN | NaN |
100 | 10.13140/rg.2.1.5071.3209 | dois | 10.13140/rg.2.1.5071.3209 | [{'identifier': 'https://doi.org/10.13140/rg.2... | [{'name': 'Tumanishvili, George G.', 'nameType... | [{'title': 'საკუთრება, როგორც მოთხოვნის უზრუნვ... | ILIA STATE UNIVERSITY PRESS | 2012 | [] | [] | ... | 0 | 2016-06-26T21:41:26Z | 2016-06-26T21:41:27Z | None | 2020-06-29T13:28:40Z | rg.rg | clients | NaN | NaN | NaN |
101 | 10.15468/dl.77nmev | dois | 10.15468/dl.77nmev | [] | [{'name': 'Occdownload Gbif.Org', 'nameType': ... | [{'title': 'Occurrence Download'}] | The Global Biodiversity Information Facility | 2020 | [{'lang': 'eng', 'subject': 'GBIF'}, {'lang': ... | [] | ... | 0 | 2020-06-11T20:24:17Z | 2020-06-11T20:24:17Z | None | 2020-06-11T20:24:18Z | gbif.gbif | clients | NaN | NaN | NaN |
102 rows × 51 columns
#create csv file
df.to_csv('DataCite_Raw_Metadata.csv')
# count all attributes in Pandas dataframe
print(df.count())
# create csv file of the count
df.to_csv('DataCite-Data_Count_All_Attributes.csv')
id 102 type 102 attributes.doi 102 attributes.identifiers 102 attributes.creators 102 attributes.titles 102 attributes.publisher 102 attributes.publicationYear 102 attributes.subjects 102 attributes.contributors 102 attributes.dates 102 attributes.language 26 attributes.types.ris 102 attributes.types.bibtex 102 attributes.types.citeproc 102 attributes.types.schemaOrg 102 attributes.types.resourceType 51 attributes.types.resourceTypeGeneral 102 attributes.relatedIdentifiers 102 attributes.sizes 102 attributes.formats 102 attributes.version 4 attributes.rightsList 102 attributes.descriptions 102 attributes.geoLocations 102 attributes.fundingReferences 102 attributes.url 102 attributes.contentUrl 0 attributes.metadataVersion 102 attributes.schemaVersion 90 attributes.source 93 attributes.isActive 102 attributes.state 102 attributes.reason 0 attributes.viewCount 102 attributes.downloadCount 102 attributes.referenceCount 102 attributes.citationCount 102 attributes.partCount 102 attributes.partOfCount 102 attributes.versionCount 102 attributes.versionOfCount 102 attributes.created 102 attributes.registered 102 attributes.published 0 attributes.updated 102 relationships.client.data.id 102 relationships.client.data.type 102 attributes.container.type 29 attributes.container.identifier 29 attributes.container.identifierType 29 dtype: int64
# create values with all DOIs
df_DOI = df['attributes.doi']
# give results
print(df_DOI)
# create csv file of the count
df_DOI.to_csv('DataCite-Data_DOI-List.csv')
0 10.5281/zenodo.7223794 1 10.5281/zenodo.7223795 2 10.5281/zenodo.7243699 3 10.5281/zenodo.7243700 4 10.5281/zenodo.7225465 ... 97 10.13140/rg.2.1.3170.7766 98 10.13140/rg.2.1.1126.8720 99 10.13140/rg.2.1.3944.9206 100 10.13140/rg.2.1.5071.3209 101 10.15468/dl.77nmev Name: attributes.doi, Length: 102, dtype: object
# count values in the column "attributes.types.schemaOrg" and list it
df_pub_types = df['attributes.types.schemaOrg'].value_counts(dropna=False).rename_axis('Count_Publication_Types').sort_values(ascending=True)
# give results
print(df_pub_types)
# create csv file of the count
df_pub_types.to_csv('DataCite-Data_Publication-Types.csv')
Count_Publication_Types CreativeWork 1 SoftwareSourceCode 2 Book 9 ScholarlyArticle 43 Dataset 47 Name: attributes.types.schemaOrg, dtype: int64
# count values in the column "relationships.client.data.id" and list it
df_relationship = df['relationships.client.data.id'].value_counts(dropna=False).rename_axis('Count_Relationship_Clients').sort_values(ascending=True)
print(df_relationship)
# create csv file of the count
df_relationship.to_csv('DataCite-Data_Relationship-Clients.csv')
Count_Relationship_Clients tib.wdcrsat 2 gdcc.harvard-dv 5 rg.rg 14 gbif.gbif 40 cern.zenodo 41 Name: relationships.client.data.id, dtype: int64
# count values in the column "attributes.publisher" and list it
df_year = df['attributes.publicationYear'].value_counts(dropna=False).rename_axis('Publication_Year')
# sort values by year
df_year = df_year.sort_index(ascending=True)
# display result
print(df_year)
# create csv file of the count
df_year.to_csv('DataCite-Data_Count_by_Year.csv')
Publication_Year 2008 2 2009 2 2010 4 2011 8 2012 4 2013 2 2014 2 2015 2 2016 1 2017 6 2019 2 2020 12 2021 28 2022 27 Name: attributes.publicationYear, dtype: int64
# create list of counted values in the column "attributes.publisher"
df_publisher = df['attributes.publisher'].value_counts(dropna=False).rename_axis('Count_Publisher').sort_values(ascending=True)
# create csv file of this list
df_publisher.to_csv('DataCite-Data_Publisher.csv')
# print list of counted publisher
print(df_publisher)
Count_Publisher National Herbarium of Georgia TBI, Botanical Institute of Ilia State University 1 Ilia State University Institute of Addictology, Alternative Georgia Addiction Research Center 1 ილიას სახელმწიფო უნივერსიტეტის გამომცემლობა, ILIA STATE UNIVERSITY PRESS 1 Ilia State University 2 Unpublished 2 World Data Center for Remote Sensing of the Atmosphere (WDC-RSAT) 2 Harvard Dataverse 5 ILIA STATE UNIVERSITY PRESS 8 The Global Biodiversity Information Facility 39 Zenodo 41 Name: attributes.publisher, dtype: int64
# create table with publishers by years
df_publisher_y = df[['attributes.publicationYear','attributes.publisher']].copy()
# count the published datasets by publisher and year
df_publisher_y['Count.Year'] = df_publisher_y.groupby(['attributes.publisher'])['attributes.publisher'].transform('count')
# create Pivot table with pandas
df_pivot = df_publisher_y.pivot_table(index='attributes.publisher', columns='attributes.publicationYear', values = 'Count.Year', aggfunc='count', fill_value = 0)
# create file with pivot values
df_pivot.to_csv('DataCite-Data_Publisher-Year-Pivot.csv')
# create plot with bar
ax = df_year.plot(x='attributes.publicationYear', y='Publication_Year', kind='bar', figsize = (15, 15), legend=False, rot=0)
# set plot title
plt.title('Publications by Year')
# set x axis title
plt.xlabel('Count of Publications')
# set y axis title
plt.ylabel('Publisher')
# set values at the bars
ax.bar_label(ax.containers[0], label_type='edge')
# create image file of it
plt.savefig('Publications_by_Year.jpg', bbox_inches='tight')
# create plot with crossbar
ax = df_relationship.plot(x='relationship.client.data.id', y='Count_Relationship_Clients', kind='barh', figsize = (15, 15), legend=False, rot=0)
# set plot title
plt.title('Counts of Relationship Clients')
# set x axis title
plt.xlabel('Count of Publications')
# set y axis title
plt.ylabel('Relationship Clients')
# set values at the bars
ax.bar_label(ax.containers[0], label_type='edge')
# create image file of it
plt.savefig('Relationship_Clients.jpg', bbox_inches='tight')
# create plot with pie chart
df_pub_types.plot.pie(autopct='%.2f', figsize = (10, 10))
# set plot title
plt.title('Count of Publication Types')
# create image file of it
plt.savefig('Publication_Types.jpg', bbox_inches='tight')
# create plot with crossbar
ax = df_publisher.plot(x='attributes.publisher', y='Count_Publisher', kind='barh', figsize = (15,15), legend=False, rot=0)
# set plot title
plt.title('Total Publications by Publishers')
# set x axis title
plt.xlabel('Count of Publications')
# set y axis title
plt.ylabel('Publisher')
# set values at the bars
ax.bar_label(ax.containers[0], label_type='edge')
# create image file of it
plt.savefig('Publications_by_Publisher.jpg', bbox_inches='tight')