import pandas as pd
import matplotlib.pyplot as plt
import worldcatidentities
from ipywidgets import interact, fixed, widgets
import download_names as dn
import download_uris as du
First, we tried to automate the retrieval of data from the WorldCat Identities API. However, the downloaded data required extensive review.
#dn.download_names('authors.txt')
We manually reviewed every record obtained for each author to verify that it was correct. When more than one valid record was found for an author, all of them were included.
Because some authors have several records, a total of 590 entries were generated from the 398 authors in EC3's Scholar Mirror. There are 114 authors with more than one record.
# Load the fixed authors table: tab-separated, first row is the header, and
# the literal string 'NA' is read as a missing value.
fixed_authors = pd.read_csv(
    'data/Fixed_Authors.tsv', sep='\t', na_values='NA', header=0
)
fixed_authors
author | identity | work_count | record_count | languages | total_holdings | author_id | source | duplicated | |
---|---|---|---|---|---|---|---|---|---|
0 | Loet Leydesdorff | Leydesdorff, L. A. | 64.0 | 190.0 | 5.0 | 1.232 | lccn-n80112847 | API | True |
1 | Loet Leydesdorff | Lydsdorff, Loet | 1.0 | 2.0 | 1.0 | 2.000 | np-lydsdorff,%20loet | Web | True |
2 | Eugene Garfield* | Garfield, Eugene | 147.0 | 447.0 | 5.0 | 3.399 | lccn-n79061047 | API | True |
3 | Eugene Garfield* | Garfield, Eugen | 1.0 | 1.0 | 1.0 | 0.000 | np-garfield,%20eugen | Web | True |
4 | Mike Thelwall | Thelwall, Mike | 49.0 | 118.0 | 4.0 | 1.161 | lccn-no2005014137 | API | True |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
585 | Alberto Ramos-Alonso | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
586 | vaishali khaparde | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
587 | Vaishali Khaparde | Khaparde, Vaishali | 1.0 | 1.0 | 1.0 | 2.000 | np-khaparde,%20vaishali | API | True |
588 | Vaishali Khaparde | Khaparde, Vaishali S. | 1.0 | 1.0 | 1.0 | 2.000 | np-khaparde,%20vaishali%20s | Web | True |
589 | Jeyashree S | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
590 rows × 9 columns
fixed_authors.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 590 entries, 0 to 589 Data columns (total 9 columns): author 590 non-null object identity 461 non-null object work_count 461 non-null float64 record_count 461 non-null float64 languages 461 non-null float64 total_holdings 461 non-null float64 author_id 461 non-null object source 461 non-null object duplicated 461 non-null object dtypes: float64(4), object(5) memory usage: 41.6+ KB
fixed_authors.describe(include='all')
author | identity | work_count | record_count | languages | total_holdings | author_id | source | duplicated | |
---|---|---|---|---|---|---|---|---|---|
count | 590 | 461 | 461.000000 | 461.000000 | 461.000000 | 461.000000 | 461 | 461 | 461 |
unique | 398 | 427 | NaN | NaN | NaN | NaN | 461 | 2 | 2 |
top | Paul Wouters | Wouters, Paul | NaN | NaN | NaN | NaN | np-maura,%20mariano%20a | Web | True |
freq | 11 | 7 | NaN | NaN | NaN | NaN | 1 | 241 | 305 |
mean | NaN | NaN | 12.878525 | 25.971430 | 1.605206 | 68.487128 | NaN | NaN | NaN |
std | NaN | NaN | 40.322105 | 58.872502 | 1.126892 | 166.368177 | NaN | NaN | NaN |
min | NaN | NaN | 1.000000 | 1.000000 | 0.000000 | 0.000000 | NaN | NaN | NaN |
25% | NaN | NaN | 1.000000 | 1.000000 | 1.000000 | 1.232000 | NaN | NaN | NaN |
50% | NaN | NaN | 3.000000 | 4.000000 | 1.000000 | 4.000000 | NaN | NaN | NaN |
75% | NaN | NaN | 12.000000 | 24.000000 | 2.000000 | 39.000000 | NaN | NaN | NaN |
max | NaN | NaN | 746.000000 | 584.000000 | 11.000000 | 1598.000000 | NaN | NaN | NaN |
pd.crosstab(index=fixed_authors['duplicated'], columns='count')
col_0 | count |
---|---|
duplicated | |
False | 156 |
True | 305 |
A total of 129 authors are not listed in WorldCat Identities. This leaves 461 author records corresponding to 269 unique authors: 113 authors have more than one record and 156 have only one.
# Keep only the three columns needed to identify each record.
fixed_authors = fixed_authors[['author', 'identity', 'author_id']] # or fixed_authors.loc[:, ['author', 'identity', 'author_id']]
# Number of missing values in each remaining column.
print(fixed_authors.isna().sum())
author 0 identity 129 author_id 129 dtype: int64
# Discard the rows that have no identity and renumber the remaining rows from 0.
fixed_authors = fixed_authors[fixed_authors['identity'].notna()].reset_index(drop=True)
fixed_authors
author | identity | author_id | |
---|---|---|---|
0 | Loet Leydesdorff | Leydesdorff, L. A. | lccn-n80112847 |
1 | Loet Leydesdorff | Lydsdorff, Loet | np-lydsdorff,%20loet |
2 | Eugene Garfield* | Garfield, Eugene | lccn-n79061047 |
3 | Eugene Garfield* | Garfield, Eugen | np-garfield,%20eugen |
4 | Mike Thelwall | Thelwall, Mike | lccn-no2005014137 |
... | ... | ... | ... |
456 | John Jeyasekar Jesubright | Jeyasekar, J. John 1965- | lccn-n2017033557 |
457 | Adèle Paul-Hus | Paul-Hus, Adèle | np-paul%20hus,%20adele |
458 | Magdalena Bemke-Świtilnik | Bemke-Świtilnik, Magdalena | viaf-280144782722013431396 |
459 | Vaishali Khaparde | Khaparde, Vaishali | np-khaparde,%20vaishali |
460 | Vaishali Khaparde | Khaparde, Vaishali S. | np-khaparde,%20vaishali%20s |
461 rows × 3 columns
len(set(fixed_authors['author']))
269
# Number of authors with exactly one record: duplicated(keep=False) flags every
# row whose author occurs more than once, so its negation marks the singletons.
int((~fixed_authors['author'].duplicated(keep=False)).sum())
156
Finally, once the authors listed in WorldCat Identities were identified, retrieval of information from their records was automated.
#du.download_uris(fixed_authors)
Because some authors have several records, these records must be aggregated. Before aggregating and merging the different types of data (authors, works, Google Scholar citations, ...), each dataset is imported and checked for errors.
We only consider authors related to the field of bibliometrics.
# Load the author-level table (tab-separated; 'NA' is read as a missing value).
authors_data = pd.read_csv(
    'data/uri_worldcat_identities_author.tsv', sep='\t', na_values='NA'
)
authors_data
author | identity | languages | total_holdings | work_count | record_count | author_id | |
---|---|---|---|---|---|---|---|
0 | Loet Leydesdorff | Leydesdorff, L. A. | 5 | 1232 | 64 | 190 | lccn-n80112847 |
1 | Loet Leydesdorff | Lydsdorff, Loet | 1 | 2 | 1 | 2 | np-lydsdorff,%20loet |
2 | Eugene Garfield* | Garfield, Eugene | 5 | 3399 | 147 | 447 | lccn-n79061047 |
3 | Eugene Garfield* | Garfield, Eugen | 1 | 0 | 1 | 1 | np-garfield,%20eugen |
4 | Mike Thelwall | Thelwall, Mike | 4 | 1161 | 49 | 118 | lccn-no2005014137 |
... | ... | ... | ... | ... | ... | ... | ... |
456 | John Jeyasekar Jesubright | Jeyasekar, J. John 1965- | 1 | 51 | 3 | 8 | lccn-n2017033557 |
457 | Adèle Paul-Hus | Paul-Hus, Adèle | 1 | 11 | 5 | 7 | np-paul%20hus,%20adele |
458 | Magdalena Bemke-Świtilnik | Bemke-Świtilnik, Magdalena | 1 | 3 | 3 | 3 | viaf-280144782722013431396 |
459 | Vaishali Khaparde | Khaparde, Vaishali | 1 | 2 | 1 | 1 | np-khaparde,%20vaishali |
460 | Vaishali Khaparde | Khaparde, Vaishali S. | 1 | 2 | 1 | 1 | np-khaparde,%20vaishali%20s |
461 rows × 7 columns
# Load the author metadata table and keep only the rows whose 'Bibliometrics'
# column equals 'Yes'.
authors_biblio_data = pd.read_csv(
    'data/authors_biblio.txt', sep='\t', na_values='NA'
).loc[lambda frame: frame['Bibliometrics'] == 'Yes']
authors_biblio_data
Author | Bibliometrics | Figure | Status | University | |
---|---|---|---|---|---|
0 | Caroline S. Wagner | Yes | Professor | Active | Ohio State University |
1 | Blaise Cronin | Yes | Professor | Emeritus | Indiana University Bloomington |
2 | Derek J. de Solla Price | Yes | Professor | Deceased | --- |
3 | Chaomei Chen | Yes | Researcher | Active | Drexel University |
4 | Jose Maria López Piñero | Yes | Researcher | Deceased | CSIC |
... | ... | ... | ... | ... | ... |
261 | Peter Sjögårde | Yes | Librarian | Active | Karolinska Institutet |
262 | R Jeyshankar | Yes | Professor | Active | Alagappa University |
263 | Samir Kumar Jalal | Yes | Librarian | Active | Indian Institute of Technology |
264 | Sibele Fausto | Yes | Librarian | Active | University of São Paulo |
265 | Jens Terliesner | Yes | Professor | Active | Heinrich-Heine-University Düsseldorf |
265 rows × 5 columns
list(set(authors_data['author'].tolist()) - set(authors_biblio_data['Author'].tolist()))
['SL Sangam', 'Rafael Bailón Moreno', 'SA Sanni', 'Guo Freeman (Zhang)']
# True when every author in authors_biblio_data also appears in authors_data.
# Each set is built once instead of once per element.
set(authors_biblio_data['Author'].tolist()) <= set(authors_data['author'].tolist())
True
set(fixed_authors['author_id'].tolist()) == set(authors_data['author_id'].tolist())
True
set(authors_data['author_id'].tolist()) == set(fixed_authors['author_id'].tolist())
True
We remove four authors that are not entirely related to the field of bibliometrics.
# Keep only the records whose author is present in authors_biblio_data.
authors_data = authors_data[authors_data['author'].isin(authors_biblio_data['Author'])]
authors_data
author | identity | languages | total_holdings | work_count | record_count | author_id | |
---|---|---|---|---|---|---|---|
0 | Loet Leydesdorff | Leydesdorff, L. A. | 5 | 1232 | 64 | 190 | lccn-n80112847 |
1 | Loet Leydesdorff | Lydsdorff, Loet | 1 | 2 | 1 | 2 | np-lydsdorff,%20loet |
2 | Eugene Garfield* | Garfield, Eugene | 5 | 3399 | 147 | 447 | lccn-n79061047 |
3 | Eugene Garfield* | Garfield, Eugen | 1 | 0 | 1 | 1 | np-garfield,%20eugen |
4 | Mike Thelwall | Thelwall, Mike | 4 | 1161 | 49 | 118 | lccn-no2005014137 |
... | ... | ... | ... | ... | ... | ... | ... |
456 | John Jeyasekar Jesubright | Jeyasekar, J. John 1965- | 1 | 51 | 3 | 8 | lccn-n2017033557 |
457 | Adèle Paul-Hus | Paul-Hus, Adèle | 1 | 11 | 5 | 7 | np-paul%20hus,%20adele |
458 | Magdalena Bemke-Świtilnik | Bemke-Świtilnik, Magdalena | 1 | 3 | 3 | 3 | viaf-280144782722013431396 |
459 | Vaishali Khaparde | Khaparde, Vaishali | 1 | 2 | 1 | 1 | np-khaparde,%20vaishali |
460 | Vaishali Khaparde | Khaparde, Vaishali S. | 1 | 2 | 1 | 1 | np-khaparde,%20vaishali%20s |
456 rows × 7 columns
Filter to only authors with works in WorldCat Identities.
# Load the Google Scholar table (tab-separated; 'NA' is read as a missing value).
google_scholar = pd.read_csv('data/google_scholar.tsv', sep='\t', na_values='NA')
google_scholar
Author | All | Since 2014 | LC | User | |
---|---|---|---|---|---|
0 | Loet Leydesdorff | 49466 | 25909 | Yes | ych9gNYAAAAJ |
1 | Eugene Garfield* | 30681 | 9590 | Yes | 26U7IAEAAAAJ |
2 | Mike Thelwall | 29666 | 18658 | Yes | 8jCKL1sAAAAJ |
3 | Derek J. de Solla Price | 21002 | 5928 | Yes | Ev26B2YAAAAJ |
4 | Francis Narin | 15582 | 4433 | Yes | ZZ56uad45oYC |
... | ... | ... | ... | ... | ... |
392 | Rouhallah Khademi | 81 | 81 | No | pUgDOmUAAAAJ |
393 | Saeed Roshani | 15 | 15 | No | Q46atc0AAAAJ |
394 | Alberto Ramos-Alonso | 4 | 4 | No | CVN6mUAAAAJ |
395 | Vaishali Khaparde | 0 | 0 | No | XPWareQAAAAJ |
396 | Jeyashree S | 8 | 8 | No | zzGa2LIAAAAJ |
397 rows × 5 columns
# Keep only the rows whose 'LC' column equals 'Yes'.
google_scholar = google_scholar.loc[google_scholar['LC'].eq('Yes')]
google_scholar.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 269 entries, 0 to 390 Data columns (total 5 columns): Author 269 non-null object All 269 non-null int64 Since 2014 269 non-null int64 LC 269 non-null object User 269 non-null object dtypes: int64(2), object(3) memory usage: 12.6+ KB
# True when every author in authors_data also appears in google_scholar.
# Each set is built once instead of once per element.
set(authors_data['author'].tolist()) <= set(google_scholar['Author'].tolist())
True
In the language data, four authorities are missing.
# Load the per-language counts table (tab-separated; 'NA' is read as a missing value).
authors_langs = pd.read_csv(
    'data/uri_worldcat_identities_langs.tsv', sep='\t', na_values='NA'
)
authors_langs
author | author_id | lang | count | |
---|---|---|---|---|
0 | Loet Leydesdorff | lccn-n80112847 | eng | 120 |
1 | Loet Leydesdorff | lccn-n80112847 | dut | 15 |
2 | Loet Leydesdorff | lccn-n80112847 | chi | 3 |
3 | Loet Leydesdorff | lccn-n80112847 | ger | 1 |
4 | Loet Leydesdorff | lccn-n80112847 | fre | 1 |
... | ... | ... | ... | ... |
726 | John Jeyasekar Jesubright | lccn-n2017033557 | eng | 8 |
727 | Adèle Paul-Hus | np-paul%20hus,%20adele | eng | 7 |
728 | Magdalena Bemke-Świtilnik | viaf-280144782722013431396 | pol | 3 |
729 | Vaishali Khaparde | np-khaparde,%20vaishali | eng | 1 |
730 | Vaishali Khaparde | np-khaparde,%20vaishali%20s | eng | 1 |
731 rows × 4 columns
Some authorities are missing from the language data.
set(fixed_authors['author_id'].tolist()) == set(authors_langs['author_id'].tolist())
False
Four authorities have no information about language.
list(set(fixed_authors['author_id'].tolist()) - set(authors_langs['author_id'].tolist()))
['np-west,%20jevin', 'np-moravcsik,%20michael%20j%20ed', 'np-mukherjee,%20mohammad%20nazim%20bhaskar', 'np-shelton,%20robert%20duane%201938']
There is one authority issue related to "about" works.
# Load the works table (tab-separated; 'NA' is read as a missing value).
authors_works = pd.read_csv(
    'data/uri_worldcat_identities_works.tsv', sep='\t', na_values='NA'
)
authors_works
author | author_id | title | lang | holdings | editions | type | |
---|---|---|---|---|---|---|---|
0 | Loet Leydesdorff | lccn-n80112847 | Evolutionary economics and chaos theory : new ... | 1 | 332 | 21 | book |
1 | Loet Leydesdorff | lccn-n80112847 | Universities and the global knowledge economy ... | 1 | 304 | 23 | book |
2 | Loet Leydesdorff | lccn-n80112847 | The challenge of scientometrics : the developm... | 2 | 123 | 23 | book |
3 | Loet Leydesdorff | lccn-n80112847 | The knowledge based economy : modeled, measure... | 1 | 101 | 12 | book |
4 | Loet Leydesdorff | lccn-n80112847 | A sociological theory of communication : the s... | 1 | 78 | 14 | book |
... | ... | ... | ... | ... | ... | ... | ... |
3129 | Magdalena Bemke-Świtilnik | viaf-280144782722013431396 | Zarządzanie gromadzeniem źródeł informacji ... | 1 | 1 | 1 | book |
3130 | Magdalena Bemke-Świtilnik | viaf-280144782722013431396 | Analiza bibliometryczna współczesnych czasopi... | 1 | 1 | 1 | file |
3131 | Magdalena Bemke-Świtilnik | viaf-280144782722013431396 | Zarządzanie gromadzeniem źródeł informacji ... | 1 | 1 | 1 | art |
3132 | Vaishali Khaparde | np-khaparde,%20vaishali | BIBLIOMETRIC ANALYSIS : the electronic library | 1 | 2 | 1 | file |
3133 | Vaishali Khaparde | np-khaparde,%20vaishali%20s | LIBRARY MANAGEMENT | 1 | 2 | 1 | file |
3134 rows × 7 columns
As before, some records are missing.
set(fixed_authors['author_id'].tolist()) == set(authors_works['author_id'].tolist())
False
One record has no works by the author, only works about the author.
list(set(fixed_authors['author_id'].tolist()) - set(authors_works['author_id'].tolist()))
['np-hinze,%20sybil']
# Collapse the records to one row per author, summing holdings, works and
# records. as_index=False keeps 'author' as a regular column.
authors_data_group = (
    authors_data[['author', 'total_holdings', 'work_count', 'record_count']]
    .groupby('author', as_index=False)
    .sum()
)
authors_data_group.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 265 entries, 0 to 264 Data columns (total 4 columns): author 265 non-null object total_holdings 265 non-null int64 work_count 265 non-null int64 record_count 265 non-null int64 dtypes: int64(3), object(1) memory usage: 8.4+ KB
# Inner-join the per-author totals with the Google Scholar columns on the
# author name, then keep a single author column in a fixed column order.
authors_gs = authors_data_group.merge(
    google_scholar[['Author', 'All', 'Since 2014', 'User']],
    how='inner',
    left_on='author',
    right_on='Author',
)[['author', 'total_holdings', 'work_count', 'record_count', 'All', 'Since 2014', 'User']]
authors_gs
author | total_holdings | work_count | record_count | All | Since 2014 | User | |
---|---|---|---|---|---|---|---|
0 | Adrián A. Díaz-Faes | 2 | 2 | 2 | 168 | 158 | qbu_JY4AAAAJ |
1 | Adèle Paul-Hus | 11 | 5 | 7 | 719 | 717 | ZsZex3IAAAAJ |
2 | Alan Pritchard | 2515 | 78 | 199 | 2893 | 1596 | quOCDDEAAAAJ |
3 | Alberto Martín-Martín | 39 | 3 | 6 | 1060 | 1056 | YlPd48UAAAAJ |
4 | Alesia Zuccala | 12 | 4 | 6 | 1154 | 716 | FubDq0QAAAAJ |
... | ... | ... | ... | ... | ... | ... | ... |
260 | Yves-François Le Coadic | 1302 | 30 | 91 | 2504 | 904 | BeGPwbgAAAAJ |
261 | Zaida Chinchilla-Rodríguez | 165 | 19 | 38 | 3058 | 1688 | eI_07rMAAAAJ |
262 | Zohreh Zahedi | 5 | 5 | 5 | 1350 | 1337 | X8O5sZ4AAAAJ |
263 | maryam shekofteh | 2 | 2 | 2 | 129 | 96 | KFidCf0AAAAJ |
264 | Álvaro Cabezas-Clavijo | 63 | 5 | 11 | 1189 | 849 | z9mQzfYAAAAJ |
265 rows × 7 columns
Merge with additional information.
# Inner-join the author table with the bibliometrics metadata on the author name.
authors_gs = pd.merge(
    left=authors_gs,
    right=authors_biblio_data,
    how='inner',
    left_on='author',
    right_on='Author',
)
# Remove the redundant right-hand key. The `columns` keyword replaces the
# positional axis argument, which recent pandas versions no longer accept.
authors_gs = authors_gs.drop(columns='Author')
authors_gs
author | total_holdings | work_count | record_count | All | Since 2014 | User | Bibliometrics | Figure | Status | University | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | Adrián A. Díaz-Faes | 2 | 2 | 2 | 168 | 158 | qbu_JY4AAAAJ | Yes | Researcher | Active | ingenio CSIC-UPV |
1 | Adèle Paul-Hus | 11 | 5 | 7 | 719 | 717 | ZsZex3IAAAAJ | Yes | Professor | Active | Université de Montréal |
2 | Alan Pritchard | 2515 | 78 | 199 | 2893 | 1596 | quOCDDEAAAAJ | Yes | Researcher | Deceased | ---- |
3 | Alberto Martín-Martín | 39 | 3 | 6 | 1060 | 1056 | YlPd48UAAAAJ | Yes | Professor | Active | University of Granada |
4 | Alesia Zuccala | 12 | 4 | 6 | 1154 | 716 | FubDq0QAAAAJ | Yes | Professor | Active | University of Copenhagen |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
260 | Yves-François Le Coadic | 1302 | 30 | 91 | 2504 | 904 | BeGPwbgAAAAJ | Yes | Professor | Emeritus | Cnam - Paris |
261 | Zaida Chinchilla-Rodríguez | 165 | 19 | 38 | 3058 | 1688 | eI_07rMAAAAJ | Yes | Researcher | Active | CSIC |
262 | Zohreh Zahedi | 5 | 5 | 5 | 1350 | 1337 | X8O5sZ4AAAAJ | Yes | Researcher | Active | University of Leiden |
263 | maryam shekofteh | 2 | 2 | 2 | 129 | 96 | KFidCf0AAAAJ | Yes | Professor | Active | Shahid Beheshti University of Medical Sciences |
264 | Álvaro Cabezas-Clavijo | 63 | 5 | 11 | 1189 | 849 | z9mQzfYAAAAJ | Yes | Researcher | Active | EC3metrics |
265 rows × 11 columns
@interact
def show_entities_more_than(
    column=['total_holdings', 'work_count', 'record_count', 'All', 'Since 2014'],
    value=widgets.IntSlider(
        min=0,
        max=authors_gs[['total_holdings', 'work_count', 'record_count', 'All', 'Since 2014']].values.max(),
        step=5,
        value=0,
    ),
):
    """Return the rows of ``authors_gs`` where ``column`` is at least ``value``.

    ``column`` names one of the numeric columns; its list default is shown as
    a dropdown by ``interact``. ``value`` is the threshold, chosen with a
    slider that runs from 0 to the largest value found in any of those columns.
    The matching rows are returned sorted by ``column`` in descending order.
    """
    meets_threshold = authors_gs[column] >= value
    return authors_gs.loc[meets_threshold].sort_values(by=[column], ascending=False)
interactive(children=(Dropdown(description='column', options=('total_holdings', 'work_count', 'record_count', …
authors_langs.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 731 entries, 0 to 730 Data columns (total 4 columns): author 731 non-null object author_id 731 non-null object lang 731 non-null object count 731 non-null int64 dtypes: int64(1), object(3) memory usage: 23.0+ KB
# Sum the counts per (author, language) pair, merging the rows that belong to
# different records of the same author.
authors_langs_group = (
    authors_langs[['author', 'lang', 'count']]
    .groupby(['author', 'lang'], as_index=False)
    .sum()
)
authors_langs_group
author | lang | count | |
---|---|---|---|
0 | Adrián A. Díaz-Faes | eng | 1 |
1 | Adrián A. Díaz-Faes | spa | 1 |
2 | Adèle Paul-Hus | eng | 7 |
3 | Alan Pritchard | eng | 147 |
4 | Alberto Martín-Martín | eng | 2 |
... | ... | ... | ... |
513 | Zohreh Zahedi | eng | 3 |
514 | Zohreh Zahedi | per | 2 |
515 | maryam shekofteh | eng | 2 |
516 | Álvaro Cabezas-Clavijo | eng | 1 |
517 | Álvaro Cabezas-Clavijo | spa | 10 |
518 rows × 3 columns
pd.crosstab(index=authors_langs_group['lang'], columns='count').sort_values(by='count', ascending=False)
col_0 | count |
---|---|
lang | |
eng | 223 |
spa | 78 |
ger | 45 |
fre | 24 |
por | 23 |
dut | 16 |
chi | 14 |
cat | 10 |
swe | 9 |
ita | 8 |
dan | 7 |
slv | 6 |
hun | 6 |
mul | 5 |
hrv | 5 |
jpn | 5 |
rus | 4 |
pol | 4 |
tur | 3 |
fin | 3 |
nob | 2 |
nor | 2 |
per | 2 |
glg | 2 |
ara | 2 |
lat | 1 |
may | 1 |
mis | 1 |
kor | 1 |
heb | 1 |
gre | 1 |
rum | 1 |
enm | 1 |
srp | 1 |
cze | 1 |
# Total count per language, plus each language's share of the grand total in
# percent (two decimals), shown from most to least frequent.
langs_group = authors_langs[['lang', 'count']].groupby('lang', as_index=False).sum()
langs_group['perc'] = (100 * langs_group['count'] / langs_group['count'].sum()).round(2)
langs_group.sort_values(by='count', ascending=False)
lang | count | perc | |
---|---|---|---|
6 | eng | 6404 | 68.32 |
31 | spa | 1293 | 13.79 |
10 | ger | 390 | 4.16 |
9 | fre | 296 | 3.16 |
5 | dut | 153 | 1.63 |
33 | swe | 129 | 1.38 |
27 | por | 117 | 1.25 |
16 | ita | 116 | 1.24 |
29 | rus | 85 | 0.91 |
2 | chi | 77 | 0.82 |
26 | pol | 54 | 0.58 |
15 | hun | 48 | 0.51 |
4 | dan | 41 | 0.44 |
24 | nor | 37 | 0.39 |
14 | hrv | 29 | 0.31 |
8 | fin | 18 | 0.19 |
1 | cat | 17 | 0.18 |
17 | jpn | 15 | 0.16 |
34 | tur | 10 | 0.11 |
22 | mul | 6 | 0.06 |
30 | slv | 6 | 0.06 |
7 | enm | 5 | 0.05 |
0 | ara | 5 | 0.05 |
23 | nob | 3 | 0.03 |
25 | per | 3 | 0.03 |
11 | glg | 3 | 0.03 |
19 | lat | 2 | 0.02 |
20 | may | 2 | 0.02 |
13 | heb | 2 | 0.02 |
12 | gre | 2 | 0.02 |
21 | mis | 1 | 0.01 |
18 | kor | 1 | 0.01 |
28 | rum | 1 | 0.01 |
32 | srp | 1 | 0.01 |
3 | cze | 1 | 0.01 |
Count the number of distinct languages per author and merge the result with the author table.
# authors_langs_group has one row per (author, language) pair, so giving every
# row a weight of 1 and summing per author yields the number of languages.
authors_langs_unique = (
    authors_langs_group[['author']]
    .assign(lang=1)
    .groupby('author', as_index=False)
    .sum()
)
authors_langs_unique.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 269 entries, 0 to 268 Data columns (total 2 columns): author 269 non-null object lang 269 non-null int64 dtypes: int64(1), object(1) memory usage: 4.3+ KB
# True when every author in authors_gs has an entry in authors_langs_unique.
# Each set is built once instead of once per element.
set(authors_gs['author'].tolist()) <= set(authors_langs_unique['author'].tolist())
True
# Add each author's language count as the 'lang' column (inner join on 'author').
authors_gs = authors_gs.merge(authors_langs_unique, how='inner', on='author')
authors_gs
author | total_holdings | work_count | record_count | All | Since 2014 | User | Bibliometrics | Figure | Status | University | lang | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Adrián A. Díaz-Faes | 2 | 2 | 2 | 168 | 158 | qbu_JY4AAAAJ | Yes | Researcher | Active | ingenio CSIC-UPV | 2 |
1 | Adèle Paul-Hus | 11 | 5 | 7 | 719 | 717 | ZsZex3IAAAAJ | Yes | Professor | Active | Université de Montréal | 1 |
2 | Alan Pritchard | 2515 | 78 | 199 | 2893 | 1596 | quOCDDEAAAAJ | Yes | Researcher | Deceased | ---- | 1 |
3 | Alberto Martín-Martín | 39 | 3 | 6 | 1060 | 1056 | YlPd48UAAAAJ | Yes | Professor | Active | University of Granada | 2 |
4 | Alesia Zuccala | 12 | 4 | 6 | 1154 | 716 | FubDq0QAAAAJ | Yes | Professor | Active | University of Copenhagen | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
260 | Yves-François Le Coadic | 1302 | 30 | 91 | 2504 | 904 | BeGPwbgAAAAJ | Yes | Professor | Emeritus | Cnam - Paris | 5 |
261 | Zaida Chinchilla-Rodríguez | 165 | 19 | 38 | 3058 | 1688 | eI_07rMAAAAJ | Yes | Researcher | Active | CSIC | 1 |
262 | Zohreh Zahedi | 5 | 5 | 5 | 1350 | 1337 | X8O5sZ4AAAAJ | Yes | Researcher | Active | University of Leiden | 2 |
263 | maryam shekofteh | 2 | 2 | 2 | 129 | 96 | KFidCf0AAAAJ | Yes | Professor | Active | Shahid Beheshti University of Medical Sciences | 1 |
264 | Álvaro Cabezas-Clavijo | 63 | 5 | 11 | 1189 | 849 | z9mQzfYAAAAJ | Yes | Researcher | Active | EC3metrics | 2 |
265 rows × 12 columns
Our final sample is composed of 3134 works (52.89 % of the authors' total work count) and 9484 publications (68.79 % of their total record count).
sum(authors_gs['work_count'])
5925
sum(authors_gs['record_count'])
13786
len(authors_works)
3134
sum(authors_works['editions'])
9484
round(100 * len(authors_works)/sum(authors_gs['work_count']), 2)
52.89
round(100 * sum(authors_works['editions'])/sum(authors_gs['record_count']), 2)
68.79
#authors_gs.to_csv('data/authorities_gs.tsv', sep='\t', index_label=False, index=False)
#authors_langs_group.to_csv('data/langs.tsv', sep='\t', index_label=False, index=False)
#authors_works.to_csv('data/works.tsv', sep='\t', index_label=False, index=False)
authors_gs.describe().round(2)
total_holdings | work_count | record_count | All | Since 2014 | lang | |
---|---|---|---|---|---|---|
count | 265.00 | 265.00 | 265.00 | 265.00 | 265.00 | 265.00 |
mean | 531.69 | 22.36 | 52.02 | 3186.43 | 1651.24 | 1.94 |
std | 1142.85 | 53.38 | 134.67 | 5274.83 | 2646.61 | 1.33 |
min | 0.00 | 1.00 | 1.00 | 8.00 | 8.00 | 1.00 |
25% | 5.00 | 3.00 | 3.00 | 399.00 | 252.00 | 1.00 |
50% | 54.00 | 8.00 | 15.00 | 1201.00 | 727.00 | 2.00 |
75% | 439.00 | 26.00 | 57.00 | 3754.00 | 1994.00 | 2.00 |
max | 7157.00 | 753.00 | 1839.00 | 49466.00 | 25909.00 | 11.00 |
pd.crosstab(index=authors_gs['Status'], columns='count').sort_values(by='count', ascending=False)
col_0 | count |
---|---|
Status | |
Active | 231 |
Emeritus | 13 |
Deceased | 11 |
Retired | 10 |
pd.crosstab(index=authors_gs['Figure'], columns='count').sort_values(by='count', ascending=False)
col_0 | count |
---|---|
Figure | |
Professor | 150 |
Researcher | 70 |
Librarian | 42 |
Other professionals | 3 |
# Percentage of all works held by the 25 authors with the highest work_count.
round(100 * sum(authors_gs['work_count'].nlargest(25)) / sum(authors_gs['work_count']), 2)
49.77
# Percentage of all works held by the 25 'Active' authors with the highest work_count.
round(100 * sum(authors_gs.loc[authors_gs['Status'] == 'Active', 'work_count'].nlargest(25)) / sum(authors_gs['work_count']), 2)
26.43
# Percentage of all works held by 'Active' authors (a total needs no sorting).
round(100 * sum(authors_gs.loc[authors_gs['Status'] == 'Active', 'work_count']) / sum(authors_gs['work_count']), 2)
58.38
# Percentage of all works held by the 25 non-'Active' authors with the highest work_count.
round(100 * sum(authors_gs.loc[authors_gs['Status'] != 'Active', 'work_count'].nlargest(25)) / sum(authors_gs['work_count']), 2)
40.22
# Percentage of all works held by non-'Active' authors (a total needs no sorting).
round(100 * sum(authors_gs.loc[authors_gs['Status'] != 'Active', 'work_count']) / sum(authors_gs['work_count']), 2)
41.62
@interact
def show_boxplot_more_than(
    column=['total_holdings', 'work_count', 'record_count', 'All', 'Since 2014'],
):
    """Draw a box plot of the chosen numeric column of ``authors_gs``.

    ``column`` names the column to plot; its list default is shown as a
    dropdown by ``interact``. Returns whatever ``plt.show()`` returns.
    """
    figure, axes = plt.subplots()
    axes.set_title('Basic Plot')
    axes.boxplot(authors_gs[column])
    return plt.show()
interactive(children=(Dropdown(description='column', options=('total_holdings', 'work_count', 'record_count', …