#I'm using the scraperwiki Python library for scraping text from PDFs.
#There may be easier alternatives, but it's the approach I'm used to using...
#Unfortunately, there are a few dependencies for the function I actually want to be able to call...
!apt-get -y install poppler-utils
!apt-get -y install libxml2-dev
!apt-get -y install libxslt-dev
!pip install lxml
!pip install scraperwiki

import scraperwiki
import urllib2, lxml.etree

def get_PDF_XML(url):
    pdfdata = urllib2.urlopen(url).read()
    xmldata = scraperwiki.pdftoxml(pdfdata)
    root = lxml.etree.fromstring(xmldata)
    return root

#Usage:
#root=get_PDF_XML('http://www.anc.org.za/2014/wp-content/themes/anc/downloads/Manifesto_Booklet.pdf')

#Get a PDF document from a URL/web address and then XMLify it - here are some example manifestos
manifesto_ANC='http://www.anc.org.za/2014/wp-content/themes/anc/downloads/Manifesto_Booklet.pdf'
manifesto_EFF='http://effighters.org.za/wp-content/uploads/2014/02/EFF-ELECTIONS-MANIFESTO.pdf' #untested
manifesto_DA='http://www.da.org.za/docs/15652/DA%20Manifesto.pdf' #untested

#The document I am going to look at is the ANC manifesto
manifesto=manifesto_ANC
root=get_PDF_XML(manifesto)

#We're now going to try to extract the text elements
pages = list(root)

#In the simplest case we can just look at what's in the text elements - here's a preview (one element from each page):
for page in pages:
    for el in page[5:6]:
        if el.tag == "text":
            print el.text

#By inspection of the preview, we can see we don't want to scrape everything
#For example, we can omit:
## None
## The lines starting with dates: 2014/01/10
## The lines starting with 'ANC Manifesto Booklet_v2.indd'
## Lines that just contain a number
## We can strip the bullet marks
txt=''
for page in pages:
    for el in page:
        if el.tag == "text":
            try:
                #If the line starts with a number (e.g. a page number), flag it so we can omit it
                int(el.text.strip().split()[0])
                notNum = False
            except:
                notNum = True
            #This line tries to exclude things that aren't interesting
            if el.text is not None and notNum and 'ANC Manifesto Booklet_v2.indd' not in el.text and not el.text.startswith('2014/01/'):
                txt = ' '.join([txt, el.text.encode('utf8').replace('•','').strip()])

#The variable txt should contain the substance of the PDF text
print txt

#We're going to use the nltk natural language toolkit
import nltk

# Download nltk packages used in this example
nltk.download('punkt')

#We can parse the original text into a set of sentences
#I'm doing a little bit of tidying along the way by replacing the newline characters with spaces.
sentences = nltk.tokenize.sent_tokenize(txt.replace('\n',' '))
print sentences

#Split each sentence into a list of tokens - that is, words.
tokens = [nltk.tokenize.word_tokenize(s) for s in sentences]
print tokens

# Download nltk packages used in this example
# This tagger can be used to identify different grammatical elements in a sentence (noun, verb etc)
nltk.download('maxent_treebank_pos_tagger')

#Let's see how the tokens appear grammatically
pos_tagged_tokens = [nltk.pos_tag(t) for t in tokens]
print pos_tagged_tokens

# Not sure we need these? If you get an error below, try uncommenting and running these downloads!
#nltk.download('maxent_ne_chunker')
#nltk.download('words')

#import nltk #Already loaded..
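#As an optional aside, here's a minimal sketch of one way to use the POS tags: count the most frequent nouns.
#It assumes the Penn Treebank tagset used by nltk.pos_tag, where the noun tags all start with 'NN'.
from collections import Counter

noun_counts = Counter(word.lower()
                      for sentence in pos_tagged_tokens
                      for word, tag in sentence
                      if tag.startswith('NN'))
print noun_counts.most_common(10)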
import json
import numpy

#This routine allows us to generate a summary of a text
N = 100  # Number of words to consider
CLUSTER_THRESHOLD = 5  # Distance between words to consider
TOP_SENTENCES = 5  # Number of sentences to return for a "top n" summary

# Download nltk packages used in this example
nltk.download('stopwords')

stop_words = nltk.corpus.stopwords.words('english') + ['.', ',', '--', '\'s', '?', ')', '(', ':', '\\', '"', '-', '}', '{']

# Approach taken from "The Automatic Creation of Literature Abstracts" by H.P. Luhn
def _score_sentences(sentences, important_words):
    scores = []
    sentence_idx = -1

    for s in [nltk.tokenize.word_tokenize(s) for s in sentences]:
        sentence_idx += 1
        word_idx = []

        # For each word in the word list...
        for w in important_words:
            try:
                # Compute an index for where any important words occur in the sentence.
                word_idx.append(s.index(w))
            except ValueError, e:
                # w not in this particular sentence
                pass

        word_idx.sort()

        # It is possible that some sentences may not contain any important words at all.
        if len(word_idx) == 0:
            continue

        # Using the word index, compute clusters by using a max distance threshold
        # for any two consecutive words.
        clusters = []
        cluster = [word_idx[0]]
        i = 1
        while i < len(word_idx):
            if word_idx[i] - word_idx[i - 1] < CLUSTER_THRESHOLD:
                cluster.append(word_idx[i])
            else:
                clusters.append(cluster[:])
                cluster = [word_idx[i]]
            i += 1
        clusters.append(cluster)

        # Score each cluster. The max score for any given cluster is the score
        # for the sentence.
        max_cluster_score = 0
        for c in clusters:
            significant_words_in_cluster = len(c)
            total_words_in_cluster = c[-1] - c[0] + 1
            score = 1.0 * significant_words_in_cluster \
                * significant_words_in_cluster / total_words_in_cluster

            if score > max_cluster_score:
                max_cluster_score = score

        scores.append((sentence_idx, max_cluster_score))

    return scores

def summarize(txt):
    sentences = [s for s in nltk.tokenize.sent_tokenize(txt)]
    normalized_sentences = [s.lower() for s in sentences]

    words = [w.lower() for sentence in normalized_sentences
             for w in nltk.tokenize.word_tokenize(sentence)]

    fdist = nltk.FreqDist(words)

    top_n_words = [w[0] for w in fdist.items()
                   if w[0] not in nltk.corpus.stopwords.words('english')][:N]

    scored_sentences = _score_sentences(normalized_sentences, top_n_words)

    # Summarization Approach 1:
    # Filter out nonsignificant sentences by using the average score plus a
    # fraction of the std dev as a filter
    avg = numpy.mean([s[1] for s in scored_sentences])
    std = numpy.std([s[1] for s in scored_sentences])
    mean_scored = [(sent_idx, score) for (sent_idx, score) in scored_sentences
                   if score > avg + 0.5 * std]

    # Summarization Approach 2:
    # Another approach would be to return only the top N ranked sentences
    top_n_scored = sorted(scored_sentences, key=lambda s: s[1])[-TOP_SENTENCES:]
    top_n_scored = sorted(top_n_scored, key=lambda s: s[0])

    # Return both flavours of summary
    return dict(top_n_summary=[sentences[idx] for (idx, score) in top_n_scored],
                mean_scored_summary=[sentences[idx] for (idx, score) in mean_scored])

txt_summarized = summarize(txt)
print txt_summarized['top_n_summary']
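#To get a feel for the Luhn scoring, here's a toy worked example (the numbers are made up, not from the manifesto):
#suppose the important words fall at token positions [3, 5, 11] in a sentence, with CLUSTER_THRESHOLD = 5.
# - positions 3 and 5 are within 5 tokens of each other, so they form one cluster: [3, 5]
# - position 11 is 6 tokens after 5, so it starts a new cluster: [11]
#cluster [3, 5]: 2 significant words over a span of 5 - 3 + 1 = 3 tokens, score = 2 * 2 / 3.0 = 1.33
#cluster [11]: 1 significant word over a span of 1 token, score = 1 * 1 / 1.0 = 1.0
#The sentence gets its best cluster score, 1.33.
#We can also sanity check _score_sentences on a made-up sentence and word list:
print _score_sentences(['the infrastructure plan will improve education and health services'],
                       ['infrastructure', 'education', 'health'])
#Expect something like [(0, 1.29)] - one cluster of 3 significant words spanning 7 tokens.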
#These libraries install a word cloud library - I'm not convinced they didn't also break bits of the underlying nltk installation
#If you can manage without the word cloud, maybe hold off installing this for a bit?
#This cell is currently set to a Raw Text cell type.
#You will need to make it into a Code cell (menu on toolbar at top of notebook)
#and then run it in order to download the libraries required to generate the word clouds
!apt-get install -y python-pygame
!pip install -U pytagcloud
!pip install simplejson

from operator import itemgetter
import re

#Hack of a pytagcloud function - the original threw errors for me with its language handling
#I also added the ability to pass in your own additional stopwords list
def get_tag_counts(text, mystopwords=[]):
    """
    Search tags in a given text. The language detection is based on stop lists.
    This implementation is inspired by https://github.com/jdf/cue.language. Thanks Jonathan Feinberg.
    """
    words = map(lambda x: x.lower(), re.findall(r"[\w']+", text, re.UNICODE))

    #Stopwords are commonly occurring words we aren't interested in, such as: the, and, but, of
    stopwords = nltk.corpus.stopwords.words('english') + mystopwords

    counted = {}
    for word in words:
        if word not in stopwords and len(word) > 1:
            if word in counted:
                counted[word] += 1
            else:
                counted[word] = 1

    return sorted(counted.iteritems(), key=itemgetter(1), reverse=True)

#If you are interested in what stop words are contained in the list, uncomment the following line and run this cell
#nltk.corpus.stopwords.words('english')

from pytagcloud import create_tag_image, make_tags
import IPython.display

#If there are too many tags we get an error, so limit the number of words displayed explicitly
maxwords = 50
maxfontsize = 30

#After you generate the word cloud, if there are words you want to exclude, add them to your own stoplist
mystopwords = ['000', '20']

tags = make_tags(get_tag_counts(txt, mystopwords)[:maxwords], maxsize=maxfontsize)
create_tag_image(tags, 'wordcloud.png', size=(600, 400))

#The word cloud has been saved as an image with the specified filename.
#We can pull it back in to the cell and render it for display purposes
IPython.display.display(IPython.display.Image(filename='wordcloud.png'))

#Look at the most commonly occurring words that aren't in the stop list
wordcounts = get_tag_counts(txt)
#Or if you want to add further stopwords and have already defined a list: mystopwords
#get_tag_counts(txt,mystopwords)
wordcounts[:20]

#Set up the imported text as an nltk tokens list and Text structure
tokens = nltk.word_tokenize(txt)
text = nltk.Text(tokens)

#Generate a plot of where particular words appear in the document
#Unfortunately, this doesn't work for me.
#Not sure if it's an issue with the original VM or if I broke something with additional installs, e.g. for pytagcloud
text.dispersion_plot(['development', 'infrastructure', 'services', 'education', 'health', 'security'])

#If we call the *concordance* function on the nltk-ified text, and provide a key word,
#we can get a list of the words around each occurrence of that key word.
text.concordance('development')
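#A couple of other quick-look helpers on the nltk Text object may be worth a try at this point.
#The probe words below are just examples - swap in whatever terms you're interested in.
#Words that tend to appear in similar contexts to 'development':
text.similar('development')
#Contexts shared by a pair of words:
text.common_contexts(['development', 'education'])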
#Ngrams are groups of N words that commonly occur together (bigrams are N=2, trigrams are N=3)
#If we look for common Ngrams, we can see commonly repeated phrases.
#To find interesting Ngrams, we look for ones whose words appear together more often than we'd expect by chance
#sort of via http://stackoverflow.com/a/2453229/454773
finder = nltk.BigramCollocationFinder.from_words(w for w in tokens if w.isalpha())

# only keep bigrams that appear 3+ times
finder.apply_freq_filter(3)

# return the 20 bigrams with the highest pointwise mutual information (PMI)
#This identifies pairs where the words appear together more frequently than you might expect
#given the frequency of each term taken on its own in the document.
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder.nbest(bigram_measures.pmi, 20)

#We can also look for interesting trigrams
finder = nltk.TrigramCollocationFinder.from_words(w for w in tokens if w.isalpha())

# only keep trigrams that appear 3+ times
finder.apply_freq_filter(3)

# return the 20 trigrams with the highest pointwise mutual information (PMI)
trigram_measures = nltk.collocations.TrigramAssocMeasures()
finder.nbest(trigram_measures.pmi, 20)
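#To get a feel for what PMI is rewarding, here's a rough back-of-the-envelope version of the calculation for a single bigram:
#PMI(w1, w2) = log2( P(w1, w2) / (P(w1) * P(w2)) ), with the probabilities estimated from counts in this document.
#The pair ('economic', 'growth') is just a guess at something that appears in the manifesto - swap in any pair from the nbest list.
#(nltk's collocation measures are built from the same kinds of counts, though handled a little more carefully.)
import math

alpha_words = [w for w in tokens if w.isalpha()]
unigram_counts = nltk.FreqDist(alpha_words)
bigram_counts = nltk.FreqDist(nltk.bigrams(alpha_words))
n = float(len(alpha_words))

w1, w2 = 'economic', 'growth'  #hypothetical example pair
if bigram_counts[(w1, w2)] > 0:
    p_w1w2 = bigram_counts[(w1, w2)] / n
    p_w1 = unigram_counts[w1] / n
    p_w2 = unigram_counts[w2] / n
    print w1, w2, math.log(p_w1w2 / (p_w1 * p_w2), 2)
else:
    print "That pair doesn't occur as a bigram here - try one from the nbest list above"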