from __future__ import unicode_literals
import json
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import xlrd
from collections import defaultdict
The PLOS thesaurus was kindly provided to us as a spreadsheet with thousands of rows, one node per row. It is a polyhierarchy represented in the form of a tree. We need to transform it into a JSON object that also includes article counts for all the nodes in the tree.
An example of the desired data structure for PLOS thesaurus:
{"count": #total, "name": "PLOS", "children": [
{"name": "Computer and information sciences",
"count": ###,
"children": [
{"name": "Information technology",
"count": ###,
"children": [
{"name": "Data mining", "count": ###},
{"name": "Data reduction", "count": ###},
{"name": "Databases",
"count": ###,
"children": [
{"name": "Relational databases", "count": ###}
]
},
...,
{"name": "Text mining","count": ###}
]
}
]
},
...
]
}
In Python, each node is a dict. Children are specified as a list of dicts. The whole thing is a list of nodes, therefore, a list of dicts.
# Import article data
df = pd.read_pickle('../data/all_plos_df.pkl')
# Drop unused data
df.drop(['author', 'title_display', 'journal', 'abstract', 'publication_date', 'score'], axis=1, inplace=True)
df.set_index('id', inplace=True)
df.head()
subject | |
---|---|
id | |
10.1371/journal.pone.0008858 | [/Biology and life sciences/Biochemistry/Prote... |
10.1371/journal.pone.0004722 | [/Biology and life sciences/Cell biology/Cellu... |
10.1371/journal.pone.0076865 | [/Biology and life sciences/Biochemistry/DNA/D... |
10.1371/journal.pbio.0040157 | [/Research and analysis methods/Research asses... |
10.1371/journal.pone.0080851 | [/Biology and life sciences/Biochemistry/Prote... |
5 rows × 1 columns
# Let's make sure we are counting articles correctly for each subject node.
def count_articles(df, subject_path):
s = df.subject.apply(lambda s: str(s))
matching = s[s.str.contains(subject_path)]
return len(matching)
print 'Total articles:', len(df)
print 'Science policy:', count_articles(df, 'Science policy')
print 'Science policy/Bioethics:', count_articles(df, 'Science policy/Bioethics')
Total articles: 115489 Science policy: 518 Science policy/Bioethics: 71
def tree_from_spreadsheet(f, df, verbose=False):
subjects = df.subject.apply(lambda s: str(s))
book = xlrd.open_workbook(f)
pt = book.sheet_by_index(0)
# spreadsheet cells : (row, col) :: cell A1 : (0, 0)
# Initialize a list to contain the thesaurus.
# Our test case will only have one item in this list.
pt_list = []
# Keep track of the path in the tree.
cur_path = Series([np.nan]*10)
for r in range(1, pt.nrows):
# Start on row two.
# Columns: the hierarchy goes up to 10 tiers.
for c in range(10):
if pt.cell_value(r, c):
# If this condition is satisfied, we are at the node that's in this line.
# Construct the path to this node.
# Clean strings because some terms (RNA nomenclature) cause unicode error
text = pt.cell_value(r, c).replace(u'\u2019', "'")
cur_path[c] = text
cur_path[c+1:] = np.nan
path_list = list(cur_path.dropna())
tier = len(path_list)
path_str = '/'.join(path_list)
if verbose:
print tier, path_str
# Add the node to the JSON-like tree structure.
node = defaultdict(list)
node['name'] = text
node['count']= len(subjects[subjects.str.contains(path_str)])
# This part is completely ridiculous. But it seems to work.
if tier == 1:
pt_list.append(node)
pt_list.append
elif tier == 2:
pt_list[-1]['children'].append(node)
elif tier == 3:
pt_list[-1]['children'][-1]['children'].append(node)
elif tier == 4:
pt_list[-1]['children'][-1]['children'][-1]['children'].append(node)
elif tier == 5:
pt_list[-1]['children'][-1]['children'][-1]['children'].append(node)
elif tier == 6:
pt_list[-1]['children'][-1]['children'][-1]['children'][-1]['children'].append(node)
elif tier == 7:
pt_list[-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'].append(node)
elif tier == 8:
pt_list[-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'].append(node)
elif tier == 9:
pt_list[-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'].append(node)
elif tier == 10:
pt_list[-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'].append(node)
# Go to next row after finding a term. There is only one term listed per row.
break
# Make a single JSON object to contain all the branches.
pt_obj = {'count': len(df), 'name': 'PLOS', 'children': pt_list}
return pt_obj
Experimenting on a smaller subset of the thesaurus: the very small Science policy
branch.
plosthes_test_file = '../data/plosthes_test.xlsx'
json.dumps(tree_from_spreadsheet(plosthes_test_file, df, verbose=True))
1 Science policy 2 Science policy/Bioethics 3 Science policy/Bioethics/Justice in science 3 Science policy/Bioethics/Respect for human dignity 3 Science policy/Bioethics/Sanctity of life 3 Science policy/Bioethics/Scientific beneficence 3 Science policy/Bioethics/Scientific nonmaleficence 2 Science policy/Material transfer agreements 2 Science policy/Research funding 3 Science policy/Research funding/Corporate funding of science 3 Science policy/Research funding/Government funding of science 3 Science policy/Research funding/Institutional funding of science 3 Science policy/Research funding/Military funding of science 3 Science policy/Research funding/Philanthropic funding of science 3 Science policy/Research funding/Research grants 2 Science policy/Research integrity 3 Science policy/Research integrity/Publication ethics 3 Science policy/Research integrity/Scientific misconduct 2 Science policy/Science and technology workforce 3 Science policy/Science and technology workforce/Careers in research 2 Science policy/Science education 3 Science policy/Science education/Science fairs 2 Science policy/Science policy and economics 2 Science policy/Technology regulations
'{"count": 115489, "name": "PLOS", "children": [{"count": 518, "name": "Science policy", "children": [{"count": 71, "name": "Bioethics", "children": [{"count": 1, "name": "Justice in science"}, {"count": 2, "name": "Respect for human dignity"}, {"count": 1, "name": "Sanctity of life"}, {"count": 2, "name": "Scientific beneficence"}, {"count": 1, "name": "Scientific nonmaleficence"}]}, {"count": 1, "name": "Material transfer agreements"}, {"count": 198, "name": "Research funding", "children": [{"count": 17, "name": "Corporate funding of science"}, {"count": 67, "name": "Government funding of science"}, {"count": 18, "name": "Institutional funding of science"}, {"count": 2, "name": "Military funding of science"}, {"count": 4, "name": "Philanthropic funding of science"}, {"count": 91, "name": "Research grants"}]}, {"count": 96, "name": "Research integrity", "children": [{"count": 68, "name": "Publication ethics"}, {"count": 17, "name": "Scientific misconduct"}]}, {"count": 31, "name": "Science and technology workforce", "children": [{"count": 31, "name": "Careers in research"}]}, {"count": 26, "name": "Science education", "children": [{"count": 0, "name": "Science fairs"}]}, {"count": 3, "name": "Science policy and economics"}, {"count": 6, "name": "Technology regulations"}]}]}'
Warning: it takes about 90 minutes.
plosthes_full_file = '../data/plosthes.2014-1.full.xlsx'
# Generate tree structure
# Change to verbose=True if you want to see it happening.
# (Fills up the output cell with ~10000 lines.)
plos_tree = tree_from_spreadsheet(plosthes_full_file, df, verbose=False)
# Export tree structure as JSON
# (changed output filename to something we aren't currently using)
with open('../data/plos_hierarchy_full.json', 'wb') as f:
json.dump(plos_tree, f)