"""Explore PLoS API search results ('biotech500.json'), clean the article
abstracts into word lists, find common bigrams, and export per-word counts
for a word cloud.

Python 3 port of a Python 2 notebook-style script:
  * print statements converted to print() calls,
  * the deprecated/removed DataFrame.ix indexer replaced with .loc,
  * the chained .replace() punctuation loop replaced by one str.translate pass.
"""
import json
import string

import numpy as np
import pandas as pd
from pandas import DataFrame, Series

import nltk
from nltk.corpus import stopwords
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

# ---- Load the raw API response -------------------------------------------
with open('biotech500.json', 'rb') as fp:
    data = json.load(fp)

print(data.keys())
print(data['response'].keys())
len(data['response']['docs'])
print(data['response']['docs'][0].keys())
for k in data['response']['docs'][0].keys():
    print(data['response']['docs'][0][k], type(data['response']['docs'][0][k]))

# ---- Build the articles DataFrame ----------------------------------------
articles_list = data['response']['docs']
articles = DataFrame(articles_list)
articles.head()

# We got more abstracts this time: keep only rows that actually have one.
articles = articles[articles['abstract'].notnull()]
print(len(articles))
articles.head()

# Abstracts and authors are lists.  (Row label 7; in the original these were
# positional columns 0 and 2, i.e. 'abstract' and 'author_display' in the
# DataFrame's alphabetical column order.)
print(type(articles.loc[7, 'abstract']), len(articles.loc[7, 'abstract']))
print(type(articles.loc[7, 'author_display']), len(articles.loc[7, 'author_display']))
DataFrame([articles.abstract.apply(len), articles.author_display.apply(len)])

# Are they all from PLoS ONE? ... We can tell by eISSN == 1932-6203
# ... http://www.plosone.org/static/information.action
len(articles[articles.eissn == '1932-6203'])
# If we care, we can fill in the missing journals based on the eISSN.
articles[articles.eissn != '1932-6203'].loc[:, ['eissn', 'journal']]

# ---- Text cleaning --------------------------------------------------------
# Globally define a set of stopwords.  We can add sciency stuff to it as well.
stops = set(stopwords.words('english'))
stops.add('conclusions')  # just an example

# One translation table, built once: every punctuation character -> space.
_PUNCT_TO_SPACE = str.maketrans({c: ' ' for c in string.punctuation})


def wordify(abs_list):
    """Take the abstract field from the PLoS API (a list of strings) and
    convert it to a filtered list of lowercase words.

    Returns np.nan when the joined text is empty -- callers must handle
    that NaN before, e.g., ' '.join()-ing the result.
    """
    # Make it a single string.
    text = ' '.join(abs_list).strip(' \n')
    if text == '':
        return np.nan
    # Replace punctuation with space (not ''), because we want
    # 'metal-contaminated' => 'metal contaminated', not 'metalcontaminated'.
    text = text.translate(_PUNCT_TO_SPACE)
    # Now make it a Series of words, and do some cleaning.
    words = Series(text.split(' '))
    words = words.str.lower()
    words = words[words.str.len() > 1]
    # Keep only tokens made entirely of a-z, '#' or '@' -- drops numbers,
    # leftover non-ASCII characters, etc.
    words = words[~words.str.contains(r'[^#@a-z]')]
    # Filter globally-defined stopwords.
    ignore = stops & set(words.unique())
    return [w for w in words.tolist() if w not in ignore]


# Quick sanity check on one abstract (row label 16).
test_abstract = articles.loc[16, 'abstract']
wordify(test_abstract)

# Tokenize abstract + title for every article, then drop the raw text columns.
articles['words'] = articles.apply(
    lambda s: wordify(s['abstract'] + [s['title_display']]), axis=1)
articles.drop(['article_type', 'score', 'title_display', 'abstract'],
              axis=1, inplace=True)
articles.head()

# One row per article: all words joined, and the unique (set of) words joined.
abs_df = DataFrame(articles['words'].apply(lambda x: ' '.join(x)).tolist(),
                   columns=['text'])
abs_df.head()
abs_set_df = DataFrame(
    articles['words'].apply(lambda x: ' '.join(set(x))).tolist(),
    columns=['text'])
abs_set_df.head()

# ---- Common word pairs (bigrams) -----------------------------------------
# Include all words from abstracts for getting common word pairs.
words_all = pd.Series(' '.join(abs_df['text']).split(' '))
words_all.value_counts()

relevant_words_pairs = words_all.copy()
relevant_words_pairs.value_counts()

bcf = BigramCollocationFinder.from_words(relevant_words_pairs)
for pair in bcf.nbest(BigramAssocMeasures.likelihood_ratio, 30):
    print(' '.join(pair))

# These are the most common paired words.
bcf.nbest(BigramAssocMeasures.likelihood_ratio, 20)

# ---- Word counts for the word cloud --------------------------------------
# Following http://nbviewer.ipython.org/github/sanand0/ipython-notebooks/blob/master/Text-analysis.ipynb
# Only includes a set() of words from each abstract.
words = pd.Series(' '.join(abs_set_df['text']).split(' '))
words.value_counts()

top_words = words.value_counts().reset_index()
top_words.columns = ['word', 'count']
top_words.head(15)

word_list = list(top_words.word)
word_str = ' '.join(word_list)
print(word_str)

# Note the changed filename
top_words.to_csv('../wordcloud2.csv', index=False)