This is a Colab notebook that demonstrates how to calculate the CD index using Dimensions on Google BigQuery.
For more details, see the accompanying paper TBC.
!pip install cdindex
import cdindex
import datetime
from string import Template
import pandas as pd
import numpy as np
import re
import os
import shutil
import itertools
import tqdm
import plotly.express as px
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, ConfusionMatrixDisplay, precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from google.colab import auth
from google.auth import default
from google.cloud import bigquery
# Authenticate Google
auth.authenticate_user()
creds, _ = default()
# Helper: run a SQL query on Google BigQuery and return the result as a pandas DataFrame
def gbq(sql, verbose=False):
    if verbose:
        print("==========\nQuerying:")
        print(sql)
        print("...")
    df = pd.io.gbq.read_gbq(sql, project_id=gbq_project_id)
    if verbose:
        print("\t...loaded ", len(df), " records")
        display(df)
    return df
# You need to set these variables before running the script
gbq_project_id = "[your GBQ billing project ID]"
cdindex_gbq_table_name = "[your GBQ project].[GBQ dataset].[prefix for tables with CD indices]"
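For illustration, the settings could look like the following (hypothetical values, kept commented out; substitute your own billing project and dataset):
# Hypothetical example values (not real projects; replace with your own):
# gbq_project_id = "my-billing-project"
# cdindex_gbq_table_name = "my-project.my_dataset.publications_cd_index"
# Once both are set, the gbq() helper can be smoke-tested with a trivial query:
# gbq("SELECT 1 AS x", verbose=True)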
Collecting cdindex
  Downloading cdindex-1.0.19.tar.gz (9.6 kB)
  Preparing metadata (setup.py) ... done
Requirement already satisfied: future in /usr/local/lib/python3.10/dist-packages (from cdindex) (0.18.3)
Building wheels for collected packages: cdindex
  Building wheel for cdindex (setup.py) ... done
  Created wheel for cdindex: filename=cdindex-1.0.19-cp310-cp310-linux_x86_64.whl size=26233 sha256=959604541392dfcaadb8090b62b95fc4d5d5a49c40bd256262166176ee286052
  Stored in directory: /root/.cache/pip/wheels/43/75/3c/d1af2e9aaab2040d9907be8ebc640b05912fee43cc98471689
Successfully built cdindex
Installing collected packages: cdindex
Successfully installed cdindex-1.0.19
This is the query shown in our paper.
sql = """
-- This is the focal publication f
DECLARE focal_publication_id STRING DEFAULT "pub.1019844293";
-- This is the impact span t
DECLARE time_diff INT64 DEFAULT 5;
WITH cd_raw_data AS
(
-- Calculating s’ for each citation to the focal publication
-- All are assigned a score s’ = -1. Any other publications that appear in
-- the second SELECT but are not included here
-- implicitly get a score s’ = 0
(
SELECT
DISTINCT -- make sure we list unique citations otherwise we may double count
publications.id AS focal_id, -- focal publication
citation.id AS citation_id, -- citing publication to focal publication
-1 AS score -- s’
-- the Dimensions GBQ table for publications
FROM `dimensions-ai.data_analytics.publications` AS publications
-- fetch all its citing publications: id and year
LEFT JOIN UNNEST(publications.citations) AS citation
-- for this experiment we only look at one publication
WHERE publications.id = focal_publication_id
-- we only consider citations that appear at most time_diff years after
-- the focal publication has been published
AND citation.year - publications.year BETWEEN 1 AND time_diff
)
UNION ALL
-- Calculating s’’ for each citation to the references of
-- the focal publication
-- All are assigned a score s’’ = -2. Any other publications that appear in
-- the first SELECT but are not included here
-- implicitly get a score s’’ = 0
(
SELECT DISTINCT
publications.id AS focal_id, -- focal publication
reference_citation.id AS citation_id,-- citing publication to references
-2 AS score -- s’’
FROM `dimensions-ai.data_analytics.publications` AS publications
-- get all the reference publication IDs of the focal publication
LEFT JOIN UNNEST(publications.reference_ids) AS reference_id
-- get the references’ metadata, mainly the citations to them
INNER JOIN `dimensions-ai.data_analytics.publications` AS references
ON references.id = reference_id
-- get the citations to the references
LEFT JOIN UNNEST(references.citations) AS reference_citation
WHERE publications.id = focal_publication_id
AND reference_citation.year - publications.year BETWEEN 1 AND time_diff
)
)
-- Now add up all scores, count the distinct ids of the citations in both SELECTs
-- above and use that information to calculate the CD index
SELECT focal_id,
((SUM(score)/COUNT(DISTINCT citation_id))+2) AS sql_cd_5
FROM cd_raw_data
GROUP BY focal_id
"""
results = gbq(sql)
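A note on why the scoring trick in this query reproduces the CD index: the textbook definition assigns +1 to a publication that cites only the focal publication, -1 to one that cites both the focal publication and at least one of its references, and 0 to one that cites only the references, and then averages these values over all such citing publications in the window. Summing s’ = -1 and s’’ = -2 per citing publication and adding 2 yields exactly those three values (-1 + 2 = 1, -1 - 2 + 2 = -1, -2 + 2 = 0). A minimal pure-Python sketch with toy data (hypothetical IDs, not from Dimensions) that checks the identity:
# Toy data (hypothetical): three publications citing the focal paper F and/or its references
citing = {
    "A": {"cites_focal": True,  "cites_refs": False},  # cites only F            -> +1
    "B": {"cites_focal": True,  "cites_refs": True},   # cites F and a reference -> -1
    "C": {"cites_focal": False, "cites_refs": True},   # cites only a reference  ->  0
}

# Textbook CD index: average of (+1 / -1 / 0) over all citing publications
cd_textbook = sum(
    1 if c["cites_focal"] and not c["cites_refs"] else (-1 if c["cites_focal"] else 0)
    for c in citing.values()
) / len(citing)

# SQL-style calculation: collect one row of s' = -1 per citation of F and one row
# of s'' = -2 per citation of a reference, sum all scores, divide by the number
# of distinct citing publications and add 2
scores = []
for c in citing.values():
    if c["cites_focal"]:
        scores.append(-1)  # s'
    if c["cites_refs"]:
        scores.append(-2)  # s''
cd_sql_style = sum(scores) / len(citing) + 2

assert cd_textbook == cd_sql_style == 0.0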
Here we calculate the CD index for 3 selected publications in the most basic fashion and compare it against calculations in Park, Michael, Erin Leahey, and Russell J. Funk. 2023. “Papers and Patents Are Becoming Less Disruptive over Time.” Nature 613 (7942): 138–44.
Our result for the CD index will be in column sql_cd_5, and the values from the Nature paper are in column nature_cd_5.
In order to keep the SQL as simple (and as easy to understand) as possible, we do not introduce any limitations on the publication network (i.e. the citations and references).
The following is a slightly more elaborate example where we also supply some metadata. We also explicitly exclude publications and citations with no valid publication year (which is very rare, ca. 0.05% of all publications, so we are being a bit pedantic here).
# The CD index values reported in the Nature paper by Park, Leahey, and Funk
nature_results = pd.DataFrame([
{"focal_publication_id" : "pub.1040343773", "nature_cd_5" : 0.52},
{"focal_publication_id" : "pub.1060431417", "nature_cd_5" : -0.22},
{"focal_publication_id" : "pub.1019844293", "nature_cd_5" : -0.55}
])
focal_publication_ids_text = "'" + "','".join(nature_results.focal_publication_id.unique())+"'"
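# With the three publications above, focal_publication_ids_text is now
# "'pub.1040343773','pub.1060431417','pub.1019844293'"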
sql = f"""
-- These are the focal publications f
DECLARE focal_publication_ids ARRAY<STRING> DEFAULT [{focal_publication_ids_text}];
-- This is the impact span t
DECLARE time_diff INT64 DEFAULT 5;
WITH cd_raw_data AS
(
-- Calculating s’ for each citation to the focal publications
-- All are assigned a score s’ = -1. Any other publications that appear in
-- the second SELECT but are not included here
-- implicitly get a score s’ = 0
(
SELECT
DISTINCT -- make sure we list unique citations otherwise we may double count
publications.id AS focal_id, -- focal publication
citation.id AS citation_id, -- citing publication to focal publication
-1 AS score -- s’
-- the Dimensions GBQ table for publications
FROM `dimensions-ai.data_analytics.publications` AS publications
-- fetch all its citing publications: id and year
LEFT JOIN UNNEST(publications.citations) AS citation
-- restrict to our selected focal publications
WHERE publications.id IN UNNEST(focal_publication_ids)
-- we only consider citations that appear at most time_diff years after
-- the focal publication has been published
AND citation.year - publications.year BETWEEN 1 AND time_diff
-- exclude cases where there is no publication year...very rare
AND citation.year IS NOT NULL
AND publications.year IS NOT NULL
)
UNION ALL
-- Calculating s’’ for each citation to the references of
-- the focal publications
-- All are assigned a score s’’ = -2. Any other publications that appear in
-- the first SELECT but are not included here
-- implicitly get a score s’’ = 0
(
SELECT DISTINCT
publications.id AS focal_id, -- focal publication
reference_citation.id AS citation_id,-- citing publication to references
-2 AS score -- s’’
FROM `dimensions-ai.data_analytics.publications` AS publications
-- get all the reference publication IDs of the focal publication
LEFT JOIN UNNEST(publications.reference_ids) AS reference_id
-- get the references’ metadata, mainly the citations to them
INNER JOIN `dimensions-ai.data_analytics.publications` AS references
ON references.id = reference_id
-- get the citations to the references
LEFT JOIN UNNEST(references.citations) AS reference_citation
WHERE publications.id IN UNNEST(focal_publication_ids)
AND reference_citation.year - publications.year BETWEEN 1 AND time_diff
-- exclude cases where there is no publication year...very rare
AND reference_citation.year IS NOT NULL
AND publications.year IS NOT NULL
)
)
-- Now add up all scores, count the distinct ids of the citations in both SELECTs
-- above and use that information to calculate the CD index
SELECT focal_id AS focal_publication_id,
publications.doi AS doi,
publications.authors[SAFE_OFFSET(0)].last_name AS first_author,
publications.title.preferred AS title,
publications.journal.title AS journal,
publications.year AS publication_year,
((SUM(score)/COUNT(DISTINCT citation_id))+2) AS sql_cd_5
FROM cd_raw_data
LEFT JOIN `dimensions-ai.data_analytics.publications` AS publications ON publications.id = cd_raw_data.focal_id
GROUP BY 1,2,3,4,5,6
"""
results = gbq(sql)
# Add the results of the Nature paper for comparison
results = results.merge(nature_results, on="focal_publication_id")
display(results)
| | focal_publication_id | doi | first_author | title | journal | publication_year | sql_cd_5 | nature_cd_5 |
|---|---|---|---|---|---|---|---|---|
| 0 | pub.1019844293 | 10.1038/2261209a0 | BALTIMORE | Viral RNA-dependent DNA Polymerase: RNA-depend... | Nature | 1970 | -0.443149 | -0.55 |
| 1 | pub.1040343773 | 10.1038/171737a0 | WATSON | Molecular Structure of Nucleic Acids: A Struct... | Nature | 1953 | 0.600000 | 0.52 |
| 2 | pub.1060431417 | 10.1103/physrev.140.a1133 | Kohn | Self-Consistent Equations Including Exchange a... | Physical Review | 1965 | -0.258621 | -0.22 |
We now compare the SQL-based calculations against the standard Python implementation, cdindex, by Russell Funk.
The Python package requires a graph to calculate the CD index. Therefore we build a subgraph of the larger network containing only those citations, references, and citations to references that are needed to calculate the CD index.
We will use the same publications as above. Our result for the CD index will be in column sql_cd_5 and the values from the Python package in column py_cd_5.
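Before building the real subgraph, it may help to see the cdindex interface on a tiny toy network (hypothetical publication IDs and years; the Graph constructor and the cdindex method are called exactly as in the real run below, so treat the expected value as a sketch rather than package documentation):
# Toy graph (hypothetical): focal paper F (2001) cites one reference R (2000);
# "time" is the publication year and an edge source -> target means source cites target
toy_vertices = [
    {"name": "R", "time": 2000},
    {"name": "F", "time": 2001},
    {"name": "A", "time": 2002},  # cites only F   -> contributes +1
    {"name": "B", "time": 2002},  # cites F and R  -> contributes -1
    {"name": "C", "time": 2002},  # cites only R   -> contributes  0
]
toy_edges = [
    {"source": "F", "target": "R"},
    {"source": "A", "target": "F"},
    {"source": "B", "target": "F"},
    {"source": "B", "target": "R"},
    {"source": "C", "target": "R"},
]
toy_graph = cdindex.Graph(vertices=toy_vertices, edges=toy_edges)
print(toy_graph.cdindex("F", 5))  # expected: (1 - 1 + 0) / 3 = 0.0
With the interface clear, we now build the actual subgraph from Dimensions.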
sql = f"""
DECLARE focal_publication_ids ARRAY<STRING> DEFAULT [{focal_publication_ids_text}];
# We now get all the vertices and edges of the directed graph of publications with a vertex attribute "time"
# The relationship source->target means source cites target
# We also don't impose a time limit, i.e. we potentially get more publications than we really need
WITH raw_data AS
(
(
# Citations of the focal publications
SELECT DISTINCT
citation.id AS source,
citation.year AS source_year,
publications.id AS target,
publications.year AS target_year,
"focal paper citations" AS kind # not needed but maybe useful for debugging
FROM `dimensions-ai.data_analytics.publications` AS publications
LEFT JOIN UNNEST(publications.citations) AS citation
WHERE publications.id IN UNNEST(focal_publication_ids)
AND citation.year IS NOT NULL
AND publications.year IS NOT NULL
)
UNION ALL
(
# Citations of the references of the focal publications
SELECT DISTINCT
reference_citation.id AS source,
reference_citation.year AS source_year,
references.id AS target,
references.year AS target_year,
"reference citations" AS kind # not needed but maybe useful for debugging
FROM `dimensions-ai.data_analytics.publications` AS publications
LEFT JOIN UNNEST(publications.reference_ids) AS reference_id
INNER JOIN `dimensions-ai.data_analytics.publications` AS references
ON references.id = reference_id
LEFT JOIN UNNEST(references.citations) AS reference_citation
WHERE publications.id IN UNNEST(focal_publication_ids)
AND reference_citation.year IS NOT NULL
AND publications.year IS NOT NULL
)
)
# It is possible that we get an edge several times, e.g. if focal publications
# share citations, hence we make sure each edge appears only once:
SELECT DISTINCT *
FROM raw_data
"""
raw_edge_list = gbq(sql)
# Paranoia check
assert len(raw_edge_list[(raw_edge_list.source_year.isna()) | (raw_edge_list.target_year.isna())])==0
# Now create a list of all vertices for our mini publication network
# for this we need to concatenate the target and source vertices of all edges and delete duplicates
# We also need the "time" i.e. publication year for each of the publications aka nodes
vertex_list = pd.concat([
    raw_edge_list[["source", "source_year"]].rename(columns={"source": "name", "source_year": "time"}),
    raw_edge_list[["target", "target_year"]].rename(columns={"target": "name", "target_year": "time"})
], axis=0).drop_duplicates()
vertex_list.time = vertex_list.time.astype(int)
# Paranoia check: each vertex appears only once
assert vertex_list.name.nunique() == len(vertex_list)
# The final edge list only needs source and target ids
edge_list = raw_edge_list[["source", "target"]].drop_duplicates()
# Now we feed this graph into cdindex
graph = cdindex.Graph(vertices = vertex_list.to_dict("records"), edges = edge_list.to_dict("records"))
results["py_cd_5"] = results.focal_publication_id.apply(lambda id : graph.cdindex(id, 5))
display(results)
| | focal_publication_id | doi | first_author | title | journal | publication_year | sql_cd_5 | nature_cd_5 | py_cd_5 |
|---|---|---|---|---|---|---|---|---|---|
| 0 | pub.1019844293 | 10.1038/2261209a0 | BALTIMORE | Viral RNA-dependent DNA Polymerase: RNA-depend... | Nature | 1970 | -0.443149 | -0.55 | -0.443149 |
| 1 | pub.1040343773 | 10.1038/171737a0 | WATSON | Molecular Structure of Nucleic Acids: A Struct... | Nature | 1953 | 0.600000 | 0.52 | 0.600000 |
| 2 | pub.1060431417 | 10.1103/physrev.140.a1133 | Kohn | Self-Consistent Equations Including Exchange a... | Physical Review | 1965 | -0.258621 | -0.22 | -0.258621 |
This is the query that allows you to calculate the CD index for all or some publications on Dimensions GBQ. Be advised that this query can take up to 5 hours. In order to track progress, we strongly recommend running this query in the GBQ console rather than in a notebook. You will also need to provide a table name in the variable cdindex_gbq_table_name to which the script will write the results. The result set is too large to make loading it into a pandas dataframe reasonable.
This Python script has 3 modes, depending on the value of run_query:
- all: calculates the CD index for all publications.
- journals: only calculates the CD index for journal articles with at least 10 references. Most of the literature calculates CD indices for journal articles only. The lower limit on references rules out publications with metadata issues; without it, too many publications would end up with CD = 1, since any publication with at least one citation and no references automatically has CD = 1.
- pubmed: only calculates the CD index for publications with a PubMed ID.
# As in most of the literature we calculate CD_5, but you can also compute it for other time spans
time_diff = 5
# Mode
run_query = "pubmed"
if run_query == "journals":
    sql_where = 'AND type = "article" AND ARRAY_LENGTH(reference_ids) >= 10 AND journal.id IS NOT NULL'
elif run_query == "pubmed":
    sql_where = 'AND pmid IS NOT NULL'
elif run_query == "all":
    sql_where = ""
else:
    raise ValueError(f"Invalid value for run_query: {run_query!r}")
print("Running query...")
sql = f"""
CREATE OR REPLACE TABLE `{cdindex_gbq_table_name}_{run_query}`
CLUSTER BY id
AS
(
WITH publications AS
(
SELECT id, year, citations, reference_ids
FROM `dimensions-ai.data_analytics.publications`
WHERE year IS NOT NULL {sql_where}
)
SELECT focal_id AS id,
(SUM(score)/COUNT(DISTINCT citation_id))+2 AS cd_{time_diff},
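-- mcd is the CD index multiplied by the number of citing publications;
-- COUNTIF(score = -1) counts the distinct citing publications of the focal publication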
COUNTIF(score = -1)*((SUM(score)/COUNT(DISTINCT citation_id))+2) AS mcd_{time_diff}
FROM
(
(
SELECT DISTINCT publications.id AS focal_id,
citation.id AS citation_id,
-1 AS score
FROM publications
LEFT JOIN UNNEST(publications.citations) AS citation
WHERE citation.year - publications.year BETWEEN 1 AND {time_diff}
)
UNION ALL
(
SELECT DISTINCT publications.id AS focal_id,
reference_citation.id AS citation_id,
-2 AS score
FROM publications
LEFT JOIN UNNEST(publications.reference_ids) AS reference_id
INNER JOIN publications AS references
ON references.id = reference_id
LEFT JOIN UNNEST(references.citations) AS reference_citation
WHERE reference_citation.year - publications.year BETWEEN 1 AND {time_diff}
)
)
GROUP BY 1
)
"""
print("=========== The SQL ============")
print(sql)
print("================================")
answer = input("Sure you want to run it in colab? (type 'yes' to do it)")
if answer == "yes":
    start = datetime.datetime.now()
    print(f"Started {start}")
    tmp = gbq(sql)
    finish = datetime.datetime.now()
    print(f"Finished {finish}")
    print(f"Duration: {finish-start}")
else:
    print("...not running SQL")
Running query...
=========== The SQL ============

CREATE OR REPLACE TABLE `reports-dimensions-ai.precomputed.publications_cd_index_pubmed`
CLUSTER BY id
AS
(
WITH publications AS
(
SELECT id, year, citations, reference_ids
FROM `dimensions-ai.data_analytics.publications`
WHERE year IS NOT NULL AND pmid IS NOT NULL
)
SELECT focal_id AS id,
(SUM(score)/COUNT(DISTINCT citation_id))+2 AS cd_5,
COUNTIF(score = -1)*((SUM(score)/COUNT(DISTINCT citation_id))+2) AS mcd_5
FROM
(
(
SELECT DISTINCT publications.id AS focal_id,
citation.id AS citation_id,
-1 AS score
FROM publications
LEFT JOIN UNNEST(publications.citations) AS citation
WHERE citation.year - publications.year BETWEEN 1 AND 5
)
UNION ALL
(
SELECT DISTINCT publications.id AS focal_id,
reference_citation.id AS citation_id,
-2 AS score
FROM publications
LEFT JOIN UNNEST(publications.reference_ids) AS reference_id
INNER JOIN publications AS references
ON references.id = reference_id
LEFT JOIN UNNEST(references.citations) AS reference_citation
WHERE reference_citation.year - publications.year BETWEEN 1 AND 5
)
)
GROUP BY 1
)

================================
Sure you want to run it in colab? (type 'yes' to do it)no
...not running SQL
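Once the table has been created (for example by running the statement in the GBQ console), the precomputed indices can be queried back selectively. A minimal sketch, assuming the table from the run above exists:
# Hypothetical follow-up: fetch the ten publications with the highest CD index
# from the result table (assumes the CREATE TABLE query above has actually been run)
top_disruptive = gbq(f"""
SELECT id, cd_{time_diff}, mcd_{time_diff}
FROM `{cdindex_gbq_table_name}_{run_query}`
ORDER BY cd_{time_diff} DESC
LIMIT 10
""")
display(top_disruptive)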