#I'm using the scraperwiki Python library for scraping text from PDFs.
#There may be easier alternatives, but it's the approach I'm used to using...
#Unfortunately, there are a few dependencies for the function I actually want to be able to call...
!apt-get -y install poppler-utils
!apt-get -y install libxml2-dev
!apt-get -y install libxslt-dev
!pip install lxml
!pip install scraperwiki

import scraperwiki
import urllib2, lxml.etree

def get_PDF_XML(url):
    pdfdata = urllib2.urlopen(url).read()
    xmldata = scraperwiki.pdftoxml(pdfdata)
    root = lxml.etree.fromstring(xmldata)
    return root

#Usage:
#root=get_PDF_XML('http://www.anc.org.za/2014/wp-content/themes/anc/downloads/Manifesto_Booklet.pdf')

#Get a PDF document from a URL/web address and then XMLify it - here are some example manifestos
manifesto_ANC='http://www.anc.org.za/2014/wp-content/themes/anc/downloads/Manifesto_Booklet.pdf'
manifesto_EFF='http://effighters.org.za/wp-content/uploads/2014/02/EFF-ELECTIONS-MANIFESTO.pdf' #untested
manifesto_DA='http://www.da.org.za/docs/15652/DA%20Manifesto.pdf' #untested

#The document I am going to look at is the ANC manifesto
manifesto=manifesto_ANC
root=get_PDF_XML(manifesto)

#We're now going to try to extract the text elements
pages = list(root)

#In the simplest case we can just look at what's in the text elements - here's a preview (one element from each page):
for page in pages:
    for el in page[5:6]:
        if el.tag == "text":
            print el.text

#By inspection of the preview, we can see we don't want to scrape everything
#For example, we can omit:
## None
## The lines starting with dates: 2014/01/10
## The lines starting with 'ANC Manifesto Booklet_v2.indd'
## Lines that just contain a number
## We can strip the bullet marks
txt=''
for page in pages:
    for el in page:
        if el.tag == "text":
            try:
                #If the line starts with a number (e.g. a page number), flag it so we can omit it
                int(el.text.strip().split()[0])
                notNum = False
            except:
                notNum = True
            #This line tries to exclude things that aren't interesting
            if el.text is not None and notNum and 'ANC Manifesto Booklet_v2.indd' not in el.text and not el.text.startswith('2014/01/'):
                txt = ' '.join([txt, el.text.encode('utf8').replace('•','').strip()])

#The variable txt should contain the substance of the PDF text
print txt

#We're going to use the nltk natural language toolkit
import nltk

# Download nltk packages used in this example
nltk.download('punkt')

#We can parse the original text into a set of sentences
#I'm doing a little bit of tidying along the way by replacing the newline characters with spaces.
sentences = nltk.tokenize.sent_tokenize(txt.replace('\n',' '))
print sentences

#Split each sentence into a list of tokens - that is, words.
tokens = [nltk.tokenize.word_tokenize(s) for s in sentences]
print tokens

# Download nltk packages used in this example
# This tagger can be used to identify different grammatical elements in a sentence (noun, verb etc)
nltk.download('maxent_treebank_pos_tagger')

#Let's see how the tokens appear grammatically
pos_tagged_tokens = [nltk.pos_tag(t) for t in tokens]
print pos_tagged_tokens

# Not sure we need these? If you get an error below, try uncommenting and running these downloads!
#nltk.download('maxent_ne_chunker')
#nltk.download('words')

#import nltk #Already loaded..
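#As an optional aside, here's a minimal sketch of one way to use the POS tags: count the most frequent nouns.
#It assumes the Penn Treebank tagset used by nltk.pos_tag, where the noun tags all start with 'NN'.
from collections import Counter

noun_counts = Counter(word.lower()
                      for sentence in pos_tagged_tokens
                      for word, tag in sentence
                      if tag.startswith('NN'))
print noun_counts.most_common(10)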
import json
import numpy

#This routine allows us to generate a summary of a text
N = 100  # Number of words to consider
CLUSTER_THRESHOLD = 5  # Distance between words to consider
TOP_SENTENCES = 5  # Number of sentences to return for a "top n" summary

# Download nltk packages used in this example
nltk.download('stopwords')

stop_words = nltk.corpus.stopwords.words('english') + ['.', ',', '--', '\'s', '?', ')', '(', ':', '\\', '"', '-', '}', '{']

# Approach taken from "The Automatic Creation of Literature Abstracts" by H.P. Luhn
def _score_sentences(sentences, important_words):
    scores = []
    sentence_idx = -1

    for s in [nltk.tokenize.word_tokenize(s) for s in sentences]:
        sentence_idx += 1
        word_idx = []

        # For each word in the word list...
        for w in important_words:
            try:
                # Compute an index for where any important words occur in the sentence.
                word_idx.append(s.index(w))
            except ValueError, e:
                # w not in this particular sentence
                pass

        word_idx.sort()

        # It is possible that some sentences may not contain any important words at all.
        if len(word_idx) == 0:
            continue

        # Using the word index, compute clusters by using a max distance threshold
        # for any two consecutive words.
        clusters = []
        cluster = [word_idx[0]]
        i = 1
        while i < len(word_idx):
            if word_idx[i] - word_idx[i - 1] < CLUSTER_THRESHOLD:
                cluster.append(word_idx[i])
            else:
                clusters.append(cluster[:])
                cluster = [word_idx[i]]
            i += 1
        clusters.append(cluster)

        # Score each cluster. The max score for any given cluster is the score
        # for the sentence.
        max_cluster_score = 0
        for c in clusters:
            significant_words_in_cluster = len(c)
            total_words_in_cluster = c[-1] - c[0] + 1
            score = 1.0 * significant_words_in_cluster \
                * significant_words_in_cluster / total_words_in_cluster

            if score > max_cluster_score:
                max_cluster_score = score

        scores.append((sentence_idx, max_cluster_score))

    return scores

def summarize(txt):
    sentences = [s for s in nltk.tokenize.sent_tokenize(txt)]
    normalized_sentences = [s.lower() for s in sentences]

    words = [w.lower() for sentence in normalized_sentences
             for w in nltk.tokenize.word_tokenize(sentence)]

    fdist = nltk.FreqDist(words)

    top_n_words = [w[0] for w in fdist.items()
                   if w[0] not in nltk.corpus.stopwords.words('english')][:N]

    scored_sentences = _score_sentences(normalized_sentences, top_n_words)

    # Summarization Approach 1:
    # Filter out nonsignificant sentences by using the average score plus a
    # fraction of the std dev as a filter
    avg = numpy.mean([s[1] for s in scored_sentences])
    std = numpy.std([s[1] for s in scored_sentences])
    mean_scored = [(sent_idx, score) for (sent_idx, score) in scored_sentences
                   if score > avg + 0.5 * std]

    # Summarization Approach 2:
    # Another approach would be to return only the top N ranked sentences
    top_n_scored = sorted(scored_sentences, key=lambda s: s[1])[-TOP_SENTENCES:]
    top_n_scored = sorted(top_n_scored, key=lambda s: s[0])

    # Return both flavours of summary
    return dict(top_n_summary=[sentences[idx] for (idx, score) in top_n_scored],
                mean_scored_summary=[sentences[idx] for (idx, score) in mean_scored])

txt_summarized = summarize(txt)
print txt_summarized['top_n_summary']
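#To get a feel for the Luhn scoring, here's a toy worked example (the numbers are made up, not from the manifesto):
#suppose the important words fall at token positions [3, 5, 11] in a sentence, with CLUSTER_THRESHOLD = 5.
# - positions 3 and 5 are within 5 tokens of each other, so they form one cluster: [3, 5]
# - position 11 is 6 tokens after 5, so it starts a new cluster: [11]
#cluster [3, 5]: 2 significant words over a span of 5 - 3 + 1 = 3 tokens, score = 2 * 2 / 3.0 = 1.33
#cluster [11]: 1 significant word over a span of 1 token, score = 1 * 1 / 1.0 = 1.0
#The sentence gets its best cluster score, 1.33.
#We can also sanity check _score_sentences on a made-up sentence and word list:
print _score_sentences(['the infrastructure plan will improve education and health services'],
                       ['infrastructure', 'education', 'health'])
#Expect something like [(0, 1.29)] - one cluster of 3 significant words spanning 7 tokens.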
#These libraries install a word cloud library - I'm not convinced they didn't also break bits of the underlying nltk installation
#If you can manage without the word cloud, maybe hold off installing this for a bit?
#This cell is currently set to a Raw Text cell type.
#You will need to make it into a Code cell (menu on toolbar at top of notebook)
#and then run it in order to download the libraries required to generate the word clouds
!apt-get install -y python-pygame
!pip install -U pytagcloud
!pip install simplejson

from operator import itemgetter
import re

#Hack of a pytagcloud function - the original threw errors for me with its language handling
#I also added the ability to pass in your own additional stopwords list
def get_tag_counts(text, mystopwords=[]):
    """
    Search tags in a given text. The language detection is based on stop lists.
    This implementation is inspired by https://github.com/jdf/cue.language. Thanks Jonathan Feinberg.
    """
    words = map(lambda x: x.lower(), re.findall(r"[\w']+", text, re.UNICODE))

    #Stopwords are commonly occurring words we aren't interested in, such as: the, and, but, of
    stopwords = nltk.corpus.stopwords.words('english') + mystopwords

    counted = {}
    for word in words:
        if word not in stopwords and len(word) > 1:
            if word in counted:
                counted[word] += 1
            else:
                counted[word] = 1

    return sorted(counted.iteritems(), key=itemgetter(1), reverse=True)

#If you are interested in what stop words are contained in the list, uncomment the following line and run this cell
#nltk.corpus.stopwords.words('english')

from pytagcloud import create_tag_image, make_tags
import IPython.display

#If there are too many tags we get an error, so limit the number of words displayed explicitly
maxwords = 50
maxfontsize = 30

#After you generate the word cloud, if there are words you want to exclude, add them to your own stoplist
mystopwords = ['000', '20']

tags = make_tags(get_tag_counts(txt, mystopwords)[:maxwords], maxsize=maxfontsize)
create_tag_image(tags, 'wordcloud.png', size=(600, 400))

#The word cloud has been saved as an image with the specified filename.
#We can pull it back in to the cell and render it for display purposes
IPython.display.display(IPython.display.Image(filename='wordcloud.png'))

#Look at the most commonly occurring words that aren't in the stop list
wordcounts = get_tag_counts(txt)
#Or if you want to add further stopwords and have already defined a list: mystopwords
#get_tag_counts(txt,mystopwords)
wordcounts[:20]

#Set up the imported text as an nltk tokens list and Text structure
tokens = nltk.word_tokenize(txt)
text = nltk.Text(tokens)

#Generate a plot of where particular words appear in the document
#Unfortunately, this doesn't work for me.
#Not sure if it's an issue with the original VM or if I broke something with additional installs, e.g. for pytagcloud
text.dispersion_plot(['development', 'infrastructure', 'services', 'education', 'health', 'security'])

#If we call the *concordance* function on the nltk-ified text, and provide a key word,
#we can get a list of the words around each occurrence of that key word.
text.concordance('development')
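#A couple of other quick-look helpers on the nltk Text object may be worth a try at this point.
#The probe words below are just examples - swap in whatever terms you're interested in.
#Words that tend to appear in similar contexts to 'development':
text.similar('development')
#Contexts shared by a pair of words:
text.common_contexts(['development', 'education'])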
#Ngrams are groups of N words that commonly occur together (bigrams are N=2, trigrams are N=3)
#If we look for common Ngrams, we can see commonly repeated phrases.
#To find interesting Ngrams, we look for ones whose words appear together more often than we'd expect by chance
#sort of via http://stackoverflow.com/a/2453229/454773
finder = nltk.BigramCollocationFinder.from_words(w for w in tokens if w.isalpha())

# only keep bigrams that appear 3+ times
finder.apply_freq_filter(3)

# return the 20 bigrams with the highest pointwise mutual information (PMI)
#This identifies pairs where the words appear together more frequently than you might expect
#given the frequency of each term taken on its own in the document.
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder.nbest(bigram_measures.pmi, 20)

#We can also look for interesting trigrams
finder = nltk.TrigramCollocationFinder.from_words(w for w in tokens if w.isalpha())

# only keep trigrams that appear 3+ times
finder.apply_freq_filter(3)

# return the 20 trigrams with the highest pointwise mutual information (PMI)
trigram_measures = nltk.collocations.TrigramAssocMeasures()
finder.nbest(trigram_measures.pmi, 20)
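#To get a feel for what PMI is rewarding, here's a rough back-of-the-envelope version of the calculation for a single bigram:
#PMI(w1, w2) = log2( P(w1, w2) / (P(w1) * P(w2)) ), with the probabilities estimated from counts in this document.
#The pair ('economic', 'growth') is just a guess at something that appears in the manifesto - swap in any pair from the nbest list.
#(nltk's collocation measures are built from the same kinds of counts, though handled a little more carefully.)
import math

alpha_words = [w for w in tokens if w.isalpha()]
unigram_counts = nltk.FreqDist(alpha_words)
bigram_counts = nltk.FreqDist(nltk.bigrams(alpha_words))
n = float(len(alpha_words))

w1, w2 = 'economic', 'growth'  #hypothetical example pair
if bigram_counts[(w1, w2)] > 0:
    p_w1w2 = bigram_counts[(w1, w2)] / n
    p_w1 = unigram_counts[w1] / n
    p_w2 = unigram_counts[w2] / n
    print w1, w2, math.log(p_w1w2 / (p_w1 * p_w2), 2)
else:
    print "That pair doesn't occur as a bigram here - try one from the nbest list above"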