In [1]:

# import libraries for json import 
import json
import urllib.request, json 

# import json for visualiziation
import pandas as pd
from pandas.io.json import json_normalize

# import Matpltlib for data visualisation
import matplotlib.pyplot as plt

# import Numpy for data handling
import numpy as np

Get Metadata from DataCite REST API¶

In [2]:

# connect to DataCite REST API and get items with own search query, e.g. query=Max%20Planck&page[size]=1000'
with urllib.request.urlopen('https://api.datacite.org/dois?query=Ilia%20State%20University&page[size]=5000') as url:
    
# create a json file out of API call
    DataCite = json.load(url)

# create and open a new json file    
with open('DataCite_Raw_Metadata.json', 'w') as f:
    # write results in json file
    json.dump(DataCite, f)

# load to normalize json file
data = json.load(open('DataCite_Raw_Metadata.json'))

# load to normalize json file
data_normalized = pd.json_normalize(data,'data')

Create Pandas Dataframe as Preview¶

In [3]:

# create Pandas dataframe
df = pd.DataFrame(data_normalized)

# display Pandas dataframe as table
display(df)

	id	type	attributes.doi	attributes.identifiers	attributes.creators	attributes.titles	attributes.publisher	attributes.publicationYear	attributes.subjects	attributes.contributors	...	attributes.versionOfCount	attributes.created	attributes.registered	attributes.published	attributes.updated	relationships.client.data.id	relationships.client.data.type	attributes.container.type	attributes.container.identifier	attributes.container.identifierType
0	10.5281/zenodo.7223794	dois	10.5281/zenodo.7223794	[{'identifier': 'https://zenodo.org/record/722...	[{'name': 'Grossmann, Yves Vincent', 'givenNam...	[{'title': 'Research Data Policies – A Short O...	Zenodo	2022	[{'subject': 'ISUBII22'}, {'subject': 'Ilia St...	[]	...	0	2022-10-26T12:15:33Z	2022-10-26T12:15:34Z	None	2022-10-26T12:15:34Z	cern.zenodo	clients	NaN	NaN	NaN
1	10.5281/zenodo.7223795	dois	10.5281/zenodo.7223795	[]	[{'name': 'Grossmann, Yves Vincent', 'givenNam...	[{'title': 'Research Data Policies – A Short O...	Zenodo	2022	[{'subject': 'ISUBII22'}, {'subject': 'Ilia St...	[]	...	0	2022-10-26T12:15:32Z	2022-10-26T12:15:33Z	None	2022-10-26T12:15:33Z	cern.zenodo	clients	NaN	NaN	NaN
2	10.5281/zenodo.7243699	dois	10.5281/zenodo.7243699	[{'identifier': 'https://zenodo.org/record/724...	[{'name': 'Grossmann, Yves Vincent', 'givenNam...	[{'title': 'DataCite ROR Parser'}]	Zenodo	2022	[{'subject': 'ISUBII22'}, {'subject': 'Python'...	[]	...	0	2022-10-24T08:08:18Z	2022-10-24T08:08:19Z	None	2022-10-26T06:40:27Z	cern.zenodo	clients	NaN	NaN	NaN
3	10.5281/zenodo.7243700	dois	10.5281/zenodo.7243700	[]	[{'name': 'Grossmann, Yves Vincent', 'givenNam...	[{'title': 'DataCite ROR Parser'}]	Zenodo	2022	[{'subject': 'ISUBII22'}, {'subject': 'Python'...	[]	...	0	2022-10-24T08:08:17Z	2022-10-24T08:08:18Z	None	2022-10-26T06:40:26Z	cern.zenodo	clients	NaN	NaN	NaN
4	10.5281/zenodo.7225465	dois	10.5281/zenodo.7225465	[{'identifier': 'https://zenodo.org/record/722...	[{'name': 'Grossmann, Yves Vincent', 'givenNam...	[{'title': 'Research Data Management – A Short...	Zenodo	2022	[{'subject': 'ISUBII22'}, {'subject': 'Ilia St...	[]	...	0	2022-10-25T07:15:22Z	2022-10-25T07:15:22Z	None	2022-10-25T07:15:22Z	cern.zenodo	clients	NaN	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
97	10.13140/rg.2.1.3170.7766	dois	10.13140/rg.2.1.3170.7766	[{'identifier': 'https://doi.org/10.13140/rg.2...	[{'name': 'Tumanishvili, George G.', 'nameType...	[{'title': 'ხელშეკრულების შედგენის ტექნიკა და ...	ILIA STATE UNIVERSITY PRESS	2012	[]	[]	...	0	2016-06-26T21:47:27Z	2016-06-26T21:47:28Z	None	2020-06-29T13:28:40Z	rg.rg	clients	NaN	NaN	NaN
98	10.13140/rg.2.1.1126.8720	dois	10.13140/rg.2.1.1126.8720	[{'identifier': 'https://doi.org/10.13140/rg.2...	[{'name': 'Tumanishvili, George G.', 'nameType...	[{'title': 'სამართლის პროფესიები'}]	ILIA STATE UNIVERSITY PRESS	2009	[]	[]	...	0	2016-06-26T17:15:55Z	2016-06-26T17:15:56Z	None	2020-06-29T13:28:40Z	rg.rg	clients	NaN	NaN	NaN
99	10.13140/rg.2.1.3944.9206	dois	10.13140/rg.2.1.3944.9206	[{'identifier': 'https://doi.org/10.13140/rg.2...	[{'name': 'Godoladze, Karlo', 'nameType': 'Per...	[{'title': 'საკონსტიტუციო ცვლილებები საქართველ...	ილიას სახელმწიფო უნივერსიტეტის გამომცემლობა, I...	2013	[]	[]	...	0	2016-06-26T16:37:37Z	2016-06-26T16:37:38Z	None	2020-06-29T13:28:40Z	rg.rg	clients	NaN	NaN	NaN
100	10.13140/rg.2.1.5071.3209	dois	10.13140/rg.2.1.5071.3209	[{'identifier': 'https://doi.org/10.13140/rg.2...	[{'name': 'Tumanishvili, George G.', 'nameType...	[{'title': 'საკუთრება, როგორც მოთხოვნის უზრუნვ...	ILIA STATE UNIVERSITY PRESS	2012	[]	[]	...	0	2016-06-26T21:41:26Z	2016-06-26T21:41:27Z	None	2020-06-29T13:28:40Z	rg.rg	clients	NaN	NaN	NaN
101	10.15468/dl.77nmev	dois	10.15468/dl.77nmev	[]	[{'name': 'Occdownload Gbif.Org', 'nameType': ...	[{'title': 'Occurrence Download'}]	The Global Biodiversity Information Facility	2020	[{'lang': 'eng', 'subject': 'GBIF'}, {'lang': ...	[]	...	0	2020-06-11T20:24:17Z	2020-06-11T20:24:17Z	None	2020-06-11T20:24:18Z	gbif.gbif	clients	NaN	NaN	NaN

102 rows × 51 columns

Create csv File for Raw Data Export¶

In [4]:

#create csv file
df.to_csv('DataCite_Raw_Metadata.csv')

Count Attributes in Pandas Dataframe¶

Count all attributes in Pandas dataframe¶

In [5]:

# count all attributes in Pandas dataframe
print(df.count())

# create csv file of the count
df.to_csv('DataCite-Data_Count_All_Attributes.csv')

id                                      102
type                                    102
attributes.doi                          102
attributes.identifiers                  102
attributes.creators                     102
attributes.titles                       102
attributes.publisher                    102
attributes.publicationYear              102
attributes.subjects                     102
attributes.contributors                 102
attributes.dates                        102
attributes.language                      26
attributes.types.ris                    102
attributes.types.bibtex                 102
attributes.types.citeproc               102
attributes.types.schemaOrg              102
attributes.types.resourceType            51
attributes.types.resourceTypeGeneral    102
attributes.relatedIdentifiers           102
attributes.sizes                        102
attributes.formats                      102
attributes.version                        4
attributes.rightsList                   102
attributes.descriptions                 102
attributes.geoLocations                 102
attributes.fundingReferences            102
attributes.url                          102
attributes.contentUrl                     0
attributes.metadataVersion              102
attributes.schemaVersion                 90
attributes.source                        93
attributes.isActive                     102
attributes.state                        102
attributes.reason                         0
attributes.viewCount                    102
attributes.downloadCount                102
attributes.referenceCount               102
attributes.citationCount                102
attributes.partCount                    102
attributes.partOfCount                  102
attributes.versionCount                 102
attributes.versionOfCount               102
attributes.created                      102
attributes.registered                   102
attributes.published                      0
attributes.updated                      102
relationships.client.data.id            102
relationships.client.data.type          102
attributes.container.type                29
attributes.container.identifier          29
attributes.container.identifierType      29
dtype: int64

Give List of DOIs¶

In [6]:

# create values with all DOIs
df_DOI = df['attributes.doi']

# give results
print(df_DOI)

# create csv file of the count
df_DOI.to_csv('DataCite-Data_DOI-List.csv')

0         10.5281/zenodo.7223794
1         10.5281/zenodo.7223795
2         10.5281/zenodo.7243699
3         10.5281/zenodo.7243700
4         10.5281/zenodo.7225465
                 ...            
97     10.13140/rg.2.1.3170.7766
98     10.13140/rg.2.1.1126.8720
99     10.13140/rg.2.1.3944.9206
100    10.13140/rg.2.1.5071.3209
101           10.15468/dl.77nmev
Name: attributes.doi, Length: 102, dtype: object

Count the Publication Types¶

In [7]:

# count values in the column "attributes.types.schemaOrg" and list it
df_pub_types = df['attributes.types.schemaOrg'].value_counts(dropna=False).rename_axis('Count_Publication_Types').sort_values(ascending=True)

# give results
print(df_pub_types)

# create csv file of the count
df_pub_types.to_csv('DataCite-Data_Publication-Types.csv')

Count_Publication_Types
CreativeWork           1
SoftwareSourceCode     2
Book                   9
ScholarlyArticle      43
Dataset               47
Name: attributes.types.schemaOrg, dtype: int64

Count the Relationship Clients¶

In [8]:

# count values in the column "relationships.client.data.id" and list it
df_relationship = df['relationships.client.data.id'].value_counts(dropna=False).rename_axis('Count_Relationship_Clients').sort_values(ascending=True)
print(df_relationship)

# create csv file of the count
df_relationship.to_csv('DataCite-Data_Relationship-Clients.csv')

Count_Relationship_Clients
tib.wdcrsat         2
gdcc.harvard-dv     5
rg.rg              14
gbif.gbif          40
cern.zenodo        41
Name: relationships.client.data.id, dtype: int64

Count by Year¶

In [9]:

# count values in the column "attributes.publisher" and list it
df_year = df['attributes.publicationYear'].value_counts(dropna=False).rename_axis('Publication_Year')

# sort values by year
df_year = df_year.sort_index(ascending=True)

# display result
print(df_year)

# create csv file of the count
df_year.to_csv('DataCite-Data_Count_by_Year.csv')

Publication_Year
2008     2
2009     2
2010     4
2011     8
2012     4
2013     2
2014     2
2015     2
2016     1
2017     6
2019     2
2020    12
2021    28
2022    27
Name: attributes.publicationYear, dtype: int64

Count Publisher¶

In [10]:

# create list of counted values in the column "attributes.publisher"
df_publisher = df['attributes.publisher'].value_counts(dropna=False).rename_axis('Count_Publisher').sort_values(ascending=True)

# create csv file of this list
df_publisher.to_csv('DataCite-Data_Publisher.csv')

# print list of counted publisher
print(df_publisher)

Count_Publisher
National Herbarium of Georgia TBI, Botanical Institute of Ilia State University                   1
Ilia State University Institute of Addictology, Alternative Georgia Addiction Research Center     1
ილიას სახელმწიფო უნივერსიტეტის გამომცემლობა, ILIA STATE UNIVERSITY PRESS                          1
Ilia State University                                                                             2
Unpublished                                                                                       2
World Data Center for Remote Sensing of the Atmosphere (WDC-RSAT)                                 2
Harvard Dataverse                                                                                 5
ILIA STATE UNIVERSITY PRESS                                                                       8
The Global Biodiversity Information Facility                                                     39
Zenodo                                                                                           41
Name: attributes.publisher, dtype: int64

Pivot by Publisher and Years¶

In [11]:

# create table with publishers by years
df_publisher_y = df[['attributes.publicationYear','attributes.publisher']].copy()

# count the published datasets by publisher and year
df_publisher_y['Count.Year'] = df_publisher_y.groupby(['attributes.publisher'])['attributes.publisher'].transform('count')

# create Pivot table with pandas
df_pivot = df_publisher_y.pivot_table(index='attributes.publisher', columns='attributes.publicationYear', values = 'Count.Year', aggfunc='count', fill_value = 0)

# create file with pivot values
df_pivot.to_csv('DataCite-Data_Publisher-Year-Pivot.csv')

Visualisations¶

Graph on Publications by Year¶

In [12]:

# create plot with bar  
ax = df_year.plot(x='attributes.publicationYear', y='Publication_Year', kind='bar', figsize = (15, 15), legend=False, rot=0)

# set plot title
plt.title('Publications by Year')

# set x axis title
plt.xlabel('Count of Publications')

# set y axis title
plt.ylabel('Publisher')

# set values at the bars
ax.bar_label(ax.containers[0], label_type='edge')

# create image file of it
plt.savefig('Publications_by_Year.jpg', bbox_inches='tight')

Graph on Relationship Clients¶

In [16]:

# create plot with crossbar
ax = df_relationship.plot(x='relationship.client.data.id', y='Count_Relationship_Clients', kind='barh', figsize = (15, 15), legend=False, rot=0)

# set plot title
plt.title('Counts of Relationship Clients')

# set x axis title
plt.xlabel('Count of Publications')

# set y axis title
plt.ylabel('Relationship Clients')

# set values at the bars
ax.bar_label(ax.containers[0], label_type='edge')

# create image file of it
plt.savefig('Relationship_Clients.jpg', bbox_inches='tight')

Graph on Publication Types¶

In [14]:

# create plot with pie chart
df_pub_types.plot.pie(autopct='%.2f', figsize = (10, 10))

# set plot title
plt.title('Count of Publication Types')

# create image file of it
plt.savefig('Publication_Types.jpg', bbox_inches='tight')

Graph on Publications by Publisher¶

In [17]:

# create plot with crossbar
ax = df_publisher.plot(x='attributes.publisher', y='Count_Publisher', kind='barh', figsize = (15,15), legend=False, rot=0)

# set plot title
plt.title('Total Publications by Publishers')

# set x axis title
plt.xlabel('Count of Publications')

# set y axis title
plt.ylabel('Publisher')

# set values at the bars
ax.bar_label(ax.containers[0], label_type='edge')

# create image file of it
plt.savefig('Publications_by_Publisher.jpg', bbox_inches='tight')