"""Explore PLoS API search results ('biotech500.json'), clean the article
abstracts into word lists, find common bigrams, and export per-word counts
for a word cloud.

Python 3 port of a Python 2 notebook-style script:
  * print statements converted to print() calls,
  * the deprecated/removed DataFrame.ix indexer replaced with .loc,
  * the chained .replace() punctuation loop replaced by one str.translate pass.
"""
import json
import string

import numpy as np
import pandas as pd
from pandas import DataFrame, Series

import nltk
from nltk.corpus import stopwords
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

# ---- Load the raw API response -------------------------------------------
with open('biotech500.json', 'rb') as fp:
    data = json.load(fp)

print(data.keys())
print(data['response'].keys())
len(data['response']['docs'])
print(data['response']['docs'][0].keys())
for k in data['response']['docs'][0].keys():
    print(data['response']['docs'][0][k], type(data['response']['docs'][0][k]))

# ---- Build the articles DataFrame ----------------------------------------
articles_list = data['response']['docs']
articles = DataFrame(articles_list)
articles.head()

# We got more abstracts this time: keep only rows that actually have one.
articles = articles[articles['abstract'].notnull()]
print(len(articles))
articles.head()

# Abstracts and authors are lists.  (Row label 7; in the original these were
# positional columns 0 and 2, i.e. 'abstract' and 'author_display' in the
# DataFrame's alphabetical column order.)
print(type(articles.loc[7, 'abstract']), len(articles.loc[7, 'abstract']))
print(type(articles.loc[7, 'author_display']), len(articles.loc[7, 'author_display']))
DataFrame([articles.abstract.apply(len), articles.author_display.apply(len)])

# Are they all from PLoS ONE? ... We can tell by eISSN == 1932-6203
# ... http://www.plosone.org/static/information.action
len(articles[articles.eissn == '1932-6203'])
# If we care, we can fill in the missing journals based on the eISSN.
articles[articles.eissn != '1932-6203'].loc[:, ['eissn', 'journal']]

# ---- Text cleaning --------------------------------------------------------
# Globally define a set of stopwords.  We can add sciency stuff to it as well.
stops = set(stopwords.words('english'))
stops.add('conclusions')  # just an example

# One translation table, built once: every punctuation character -> space.
_PUNCT_TO_SPACE = str.maketrans({c: ' ' for c in string.punctuation})


def wordify(abs_list):
    """Take the abstract field from the PLoS API (a list of strings) and
    convert it to a filtered list of lowercase words.

    Returns np.nan when the joined text is empty -- callers must handle
    that NaN before, e.g., ' '.join()-ing the result.
    """
    # Make it a single string.
    text = ' '.join(abs_list).strip(' \n')
    if text == '':
        return np.nan
    # Replace punctuation with space (not ''), because we want
    # 'metal-contaminated' => 'metal contaminated', not 'metalcontaminated'.
    text = text.translate(_PUNCT_TO_SPACE)
    # Now make it a Series of words, and do some cleaning.
    words = Series(text.split(' '))
    words = words.str.lower()
    words = words[words.str.len() > 1]
    # Keep only tokens made entirely of a-z, '#' or '@' -- drops numbers,
    # leftover non-ASCII characters, etc.
    words = words[~words.str.contains(r'[^#@a-z]')]
    # Filter globally-defined stopwords.
    ignore = stops & set(words.unique())
    return [w for w in words.tolist() if w not in ignore]


# Quick sanity check on one abstract (row label 16).
test_abstract = articles.loc[16, 'abstract']
wordify(test_abstract)

# Tokenize abstract + title for every article, then drop the raw text columns.
articles['words'] = articles.apply(
    lambda s: wordify(s['abstract'] + [s['title_display']]), axis=1)
articles.drop(['article_type', 'score', 'title_display', 'abstract'],
              axis=1, inplace=True)
articles.head()

# One row per article: all words joined, and the unique (set of) words joined.
abs_df = DataFrame(articles['words'].apply(lambda x: ' '.join(x)).tolist(),
                   columns=['text'])
abs_df.head()
abs_set_df = DataFrame(
    articles['words'].apply(lambda x: ' '.join(set(x))).tolist(),
    columns=['text'])
abs_set_df.head()

# ---- Common word pairs (bigrams) -----------------------------------------
# Include all words from abstracts for getting common word pairs.
words_all = pd.Series(' '.join(abs_df['text']).split(' '))
words_all.value_counts()

relevant_words_pairs = words_all.copy()
relevant_words_pairs.value_counts()

bcf = BigramCollocationFinder.from_words(relevant_words_pairs)
for pair in bcf.nbest(BigramAssocMeasures.likelihood_ratio, 30):
    print(' '.join(pair))

# These are the most common paired words.
bcf.nbest(BigramAssocMeasures.likelihood_ratio, 20)

# ---- Word counts for the word cloud --------------------------------------
# Following http://nbviewer.ipython.org/github/sanand0/ipython-notebooks/blob/master/Text-analysis.ipynb
# Only includes a set() of words from each abstract.
words = pd.Series(' '.join(abs_set_df['text']).split(' '))
words.value_counts()

top_words = words.value_counts().reset_index()
top_words.columns = ['word', 'count']
top_words.head(15)

word_list = list(top_words.word)
word_str = ' '.join(word_list)
print(word_str)

# Note the changed filename
top_words.to_csv('../wordcloud2.csv', index=False)