This IPython notebook is designed to run on the virtual machine (VM) provided in support of Mining the Social Web, 2nd Edition. Chunks of this script are also borrowed from the same source, specifically the chapter on mining web pages.
If you don't use the suggested VM, you'll need to install packages as you go along (starting with nltk, numpy and matplotlib; the easiest alternative might be to install a scientific Python distribution such as [Anaconda](http://continuum.io/downloads)).
As it is, there are a few additional libraries you will have to install, in which case, if you aren't using Linux or a Linux VM, you're on your own, I'm afraid...
#I'm using the scraperwiki Python library for scraping text from PDFs.
#There may be easier alternatives, but this is the approach I'm using...
#Unfortunately, there are a few dependencies for the function I actually want to be able to call...
!apt-get -y install poppler-utils
!apt-get -y install libxml2-dev
!apt-get -y install libxslt-dev
!pip install lxml
!pip install scraperwiki
import scraperwiki
import urllib2, lxml.etree
def get_PDF_XML(url):
    pdfdata = urllib2.urlopen(url).read()
    xmldata = scraperwiki.pdftoxml(pdfdata)
    root = lxml.etree.fromstring(xmldata)
    return root
#Usage:
#root=get_PDF_XML('http://www.anc.org.za/2014/wp-content/themes/anc/downloads/Manifesto_Booklet.pdf')
#Get a PDF document from a URL/web address and then XMLify it - here are some example manifestos
manifesto_ANC='http://www.anc.org.za/2014/wp-content/themes/anc/downloads/Manifesto_Booklet.pdf'
manifesto_EFF='http://effighters.org.za/wp-content/uploads/2014/02/EFF-ELECTIONS-MANIFESTO.pdf'#untested
manifesto_DA='http://www.da.org.za/docs/15652/DA%20Manifesto.pdf'#untested
#The document I am going to look at is the ANC manifesto
manifesto=manifesto_ANC
root=get_PDF_XML(manifesto)
#We're now going to try to extract the text elements
pages = list(root)
#In the simplest case we can just look at what's in the text elements - here's a preview from one of the pages:
for page in pages:
    for el in page[5:6]:
        if el.tag == "text":
            print el.text
#By inspection of the preview, we can see we don't want to scrape everything
#For example, we can omit:
## None
## The lines starting with dates: 2014/01/10
## The lines starting with 'ANC Manifesto Booklet_v2.indd'
## Lines that just contain a number
## We can strip the bullet marks
txt=''
for page in pages:
    for el in page:
        if el.tag == "text":
            try:
                #If the line is just a number, omit it
                int(el.text.strip().split()[0])
                notNum=False
            except:
                notNum = True
            #This line tries to exclude things that aren't interesting
            if el.text !=None and notNum and 'ANC Manifesto Booklet_v2.indd' not in el.text and not el.text.startswith('2014/01/'):
                txt=' '.join([txt,el.text.encode('utf8').replace('•','').strip()])
#The variable txt should contain the substance of the PDF text
print txt
#We're going to use a natural language toolkit
import nltk
# Downloading nltk packages used in this example
nltk.download('punkt')
[nltk_data] Downloading package 'punkt' to /root/nltk_data... [nltk_data] Package punkt is already up-to-date!
True
#We can parse the original text into a set of sentences
#I'm doing a little bit of tidying along the way by replacing the newline characters with spaces.
sentences = nltk.tokenize.sent_tokenize(txt.replace('\n',' '))
print sentences
#Split each sentence into a list of tokens - that is, words.
tokens = [nltk.tokenize.word_tokenize(s) for s in sentences]
print tokens
# Downloading nltk packages used in this example
# This library can be used to identify different grammatical elements in a sentence (noun, verb etc)
nltk.download('maxent_treebank_pos_tagger')
[nltk_data] Downloading package 'maxent_treebank_pos_tagger' to /root/nltk_data...
[nltk_data] Package maxent_treebank_pos_tagger is already up-to-date!
True
#Let's see how the tokens appear grammatically
pos_tagged_tokens = [nltk.pos_tag(t) for t in tokens]
print pos_tagged_tokens
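#We don't do anything further with the tagged tokens in this notebook, but as a quick (untested) sketch
#of the sort of thing they make possible, here's one way of counting the most common nouns
#(the tagger uses Penn Treebank tags, where the noun tags all start with 'NN'; the variable name is just for illustration)
noun_counts = nltk.FreqDist([w.lower() for sentence in pos_tagged_tokens
                             for (w, tag) in sentence if tag.startswith('NN')])
print sorted(noun_counts.items(), key=lambda x: -x[1])[:20]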
# Not sure we need these? If you get an error below, try uncommenting and running these downloads!
#nltk.download('maxent_ne_chunker')
#nltk.download('words')
#import nltk #Already loaded..
import json
import numpy
#This routine allows us to generate a summary of a text
N = 100 # Number of top-ranked (most frequent) words to treat as "important"
CLUSTER_THRESHOLD = 5 # Maximum distance (in tokens) between important words in the same cluster
TOP_SENTENCES = 5 # Number of sentences to return for a "top n" summary
# Download nltk packages used in this example
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('english') + ['.',',','--','\'s','?',')','(',':','\\','"','-','}','{']
# Approach taken from "The Automatic Creation of Literature Abstracts" by H.P. Luhn
def _score_sentences(sentences, important_words):
scores = []
sentence_idx = -1
for s in [nltk.tokenize.word_tokenize(s) for s in sentences]:
sentence_idx += 1
word_idx = []
# For each word in the word list...
for w in important_words:
try:
# Compute an index for where any important words occur in the sentence.
word_idx.append(s.index(w))
except ValueError, e: # w not in this particular sentence
pass
word_idx.sort()
# It is possible that some sentences may not contain any important words at all.
if len(word_idx)== 0: continue
# Using the word index, compute clusters by using a max distance threshold
# for any two consecutive words.
clusters = []
cluster = [word_idx[0]]
i = 1
while i < len(word_idx):
if word_idx[i] - word_idx[i - 1] < CLUSTER_THRESHOLD:
cluster.append(word_idx[i])
else:
clusters.append(cluster[:])
cluster = [word_idx[i]]
i += 1
clusters.append(cluster)
# Score each cluster. The max score for any given cluster is the score
# for the sentence.
max_cluster_score = 0
for c in clusters:
significant_words_in_cluster = len(c)
total_words_in_cluster = c[-1] - c[0] + 1
score = 1.0 * significant_words_in_cluster \
* significant_words_in_cluster / total_words_in_cluster
if score > max_cluster_score:
max_cluster_score = score
scores.append((sentence_idx, score))
return scores
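#To make the scoring step concrete, here's a toy example (not from the manifesto):
#'growth' and 'jobs' sit two tokens apart, well within the CLUSTER_THRESHOLD of 5,
#so they form a single cluster of 2 significant words spanning 3 tokens,
#which scores 2*2/3 = 1.33 for the sentence
print _score_sentences(['inclusive growth creates jobs for everyone'], ['growth', 'jobs'])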
def summarize(txt):
    sentences = [s for s in nltk.tokenize.sent_tokenize(txt)]
    normalized_sentences = [s.lower() for s in sentences]
    words = [w.lower() for sentence in normalized_sentences for w in
             nltk.tokenize.word_tokenize(sentence)]
    fdist = nltk.FreqDist(words)
    top_n_words = [w[0] for w in fdist.items()
                   if w[0] not in nltk.corpus.stopwords.words('english')][:N]
    scored_sentences = _score_sentences(normalized_sentences, top_n_words)
    # Summarization Approach 1:
    # Filter out nonsignificant sentences by using the average score plus a
    # fraction of the std dev as a filter
    avg = numpy.mean([s[1] for s in scored_sentences])
    std = numpy.std([s[1] for s in scored_sentences])
    mean_scored = [(sent_idx, score) for (sent_idx, score) in scored_sentences
                   if score > avg + 0.5 * std]
    # Summarization Approach 2:
    # Another approach would be to return only the top N ranked sentences
    top_n_scored = sorted(scored_sentences, key=lambda s: s[1])[-TOP_SENTENCES:]
    top_n_scored = sorted(top_n_scored, key=lambda s: s[0])
    # Return both summaries in a dict
    return dict(top_n_summary=[sentences[idx] for (idx, score) in top_n_scored],
                mean_scored_summary=[sentences[idx] for (idx, score) in mean_scored])
[nltk_data] Downloading package 'stopwords' to /root/nltk_data... [nltk_data] Package stopwords is already up-to-date!
txt_summarized= summarize(txt)
print txt_summarized['top_n_summary']
['The private sector must actively contribute to inclusive growth, investment, social development and economic transformation.', 'Ensure all South Africans have access to adequate human settlements and quality living conditions through programmes that provide one million housing opportunities for qualifying households over the next five years, and providing basic services and infrastructure in all existing informal settlements.', 'The state\xe2\x80\x99s buying power will support small enterprises, co-operatives and broad-based black economic empowerment.', 'Nearly 500 informal settlements have been replaced with quality housing and basic services.', 'This manifesto is our pledge to move South Africa forward, together.']
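#The summarize function also returns the mean-score-filtered summary (Approach 1),
#so we can print that as well for comparison
print txt_summarized['mean_scored_summary']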
from operator import itemgetter
import re
#Hack of a pytagcloud function - the original threw errors for me with language handling
#I also added the ability to pass in your own additional stopwords list
def get_tag_counts(text,mystopwords=[]):
    """
    Search tags in a given text. The language detection is based on stop lists.
    This implementation is inspired by https://github.com/jdf/cue.language. Thanks Jonathan Feinberg.
    """
    words = map(lambda x:x.lower(), re.findall(r"[\w']+", text, re.UNICODE))
    #Stopwords are commonly occurring words we aren't interested in, such as: the, and, but, of
    stopwords = nltk.corpus.stopwords.words('english')+mystopwords
    counted = {}
    for word in words:
        if not word in stopwords and len(word) > 1:
            if counted.has_key(word):
                counted[word] += 1
            else:
                counted[word] = 1
    return sorted(counted.iteritems(), key=itemgetter(1), reverse=True)
#If you are interested in what stop words are contained in the list uncomment the following line and run this cell
#nltk.corpus.stopwords.words('english')
from pytagcloud import create_tag_image, make_tags
import IPython.display
#If there are too many tags we get an error. Limit the number of words displayed explicitly
maxwords=50
maxfontsize=30
#After you generate the word cloud, if there are words you want to exclude, add them to your own stoplist
mystopwords=['000','20']
tags = make_tags(get_tag_counts(txt,mystopwords)[:maxwords], maxsize=maxfontsize)
create_tag_image(tags, 'wordcloud.png', size=(600, 400))
#The word cloud has been saved as an image with the specified filename.
#We can pull it back in to the cell and render it for display purposes
IPython.display.display(IPython.display.Image(filename='wordcloud.png'))
#Look at the most commonly occurring words that aren't in the stop list
wordcounts=get_tag_counts(txt)
#Or if you want to add further stopwords and have already defined a list: mystopwords
#get_tag_counts(txt,mystopwords)
wordcounts[:20]
[('years', 65), ('public', 57), ('people', 56), ('south', 55), ('development', 54), ('infrastructure', 38), ('sector', 37), ('ensure', 36), ('support', 36), ('million', 36), ('work', 34), ('social', 33), ('health', 32), ('national', 32), ('new', 31), ('promote', 31), ('last', 30), ('000', 28), ('economic', 28), ('african', 28)]
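#As a quick visual check - a minimal sketch, assuming matplotlib is installed and inline plotting is enabled -
#we can chart the counts of the top twenty words
import matplotlib.pyplot as plt
topwords = wordcounts[:20]
plt.figure(figsize=(10, 4))
plt.bar(range(len(topwords)), [count for (word, count) in topwords])
plt.xticks(range(len(topwords)), [word for (word, count) in topwords], rotation=90)
plt.ylabel('Count')
plt.show()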
#Set up the imported text as an nltk tokens list and Text structure
tokens = nltk.word_tokenize(txt)
text = nltk.Text(tokens)
#Generate a plot of where particular words appear in the document
#Unfortunately, this doesn't work for me.
#Not sure if it's an issue with the original VM or whether I broke something with the additional installs, e.g. for pytagcloud
text.dispersion_plot(['development','infrastructure','services','education', 'health', 'security'])
---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-19-73b3bbd4dadf> in <module>()
----> 1 text.dispersion_plot(['development','infrastructure','services','education', 'health', 'security'])

/usr/local/lib/python2.7/dist-packages/nltk/text.pyc in dispersion_plot(self, words)
    453         :seealso: nltk.draw.dispersion_plot()
    454         """
--> 455         from nltk.draw import dispersion_plot
    456         dispersion_plot(self, words)
    457

ImportError: cannot import name dispersion_plot
/usr/local/lib/python2.7/dist-packages/nltk/draw/__init__.py:14: UserWarning: nltk.draw package not loaded (please install Tkinter library).
  warnings.warn("nltk.draw package not loaded "
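#As a workaround - a rough sketch, assuming matplotlib itself works even though Tkinter/nltk.draw doesn't -
#we can draw our own dispersion plot by marking the token offsets at which each target word appears
import matplotlib.pyplot as plt
targets = ['development','infrastructure','services','education', 'health', 'security']
plt.figure(figsize=(12, 4))
for i, word in enumerate(targets):
    offsets = [j for j, t in enumerate(tokens) if t.lower() == word]
    plt.plot(offsets, [i] * len(offsets), linestyle='', marker='|')
plt.yticks(range(len(targets)), targets)
plt.xlabel('Word offset in document')
plt.show()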
#If we call the *concordance* function on the nltk-ified text, and provide a key word,
# we can get a list of words around occurrences of that key word.
text.concordance('development')
Building index... Displaying 25 of 48 matches: r social fabric and undermine our development efforts. Our economy continues to reduces inequality. The National Development Plan ( NDP ) aims to eradicate po shifts the trajectory of economic development , the National Infrastructure Pla ihoods for inclusive growth Rural development , land reform and food security E ustrialisation and infrastructure development programmes for inclusive growth a sive growth , investment , social development and economic transformation. Prom the financial sector , including development finance institutions , through bo d labour broking. Implement rural development focusing on meeting basic needs , land reform and rural enterprise development , supported by localised markets in support of small-holder farmer development , prioritising former homeland co ety , affirmative action , skills development , minimum wages for workers in vu ive infrastructure and industrial development The skills and capabilities of th spheres of government. Align the development mandate of state-owned enterprise date of state-owned enterprises , development finance institutions ( DFIs ) and l be promoted. Through our skills development initiative we will turn every pub itizens to play a greater role in development We will strengthen existing forum people to play a greater role in development through partnerships with a range sive growth , investment , social development and economic transformation. cons nd technology ; and advance rural development , land and agrarian reform and fo er with state-owned enterprises , development finance institutions and the priv e our work to establish the BRICS development bank , which will help finance in will help finance infrastructure development in developing countries. Invest i the Eastern Cape , an integrated development that includes a new Mzimvubu Dam ger transport systems through the development of bus-rapid transit systems to m new jobs and contribute to skills development as buses , taxis , locomotives an
#Ngrams are groups of N words that commonly occur together (bigrams are N=2, trigrams are N=3)
#If we look for common Ngrams, we can see commonly repeated phrases.
#To find interesting Ngrams, we look for ones whose words co-occur more often than we would expect by chance
#sort of via http://stackoverflow.com/a/2453229/454773
finder = nltk.BigramCollocationFinder.from_words(w for w in tokens if w.isalpha())
# only bigrams that appear 3+ times
finder.apply_freq_filter(3)
# return the 20 n-grams with the highest pointwise mutual information (PMI)
#This identifies pairs where the words appear together more frequently than you might expect
#given the frequency of each term taken on its own in the document.
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder.nbest(bigram_measures.pmi, 20)
[('AFRICA', 'FORWARD'), ('MOVE', 'SOUTH'), ('SOUTH', 'AFRICA'), ('TOGETHER', 'WE'), ('WE', 'MOVE'), ('found', 'guilty'), ('value', 'chains'), ('Take', 'steps'), ('Grade', 'R'), ('actively', 'contribute'), ('firm', 'action'), ('must', 'actively'), ('communal', 'areas'), ('sustainable', 'livelihoods'), ('those', 'who'), ('small', 'scale'), ('set', 'out'), ('inclusive', 'growth'), ('Working', 'together'), ('training', 'space')]
#We can also look for interesting trigrams
finder = nltk.TrigramCollocationFinder.from_words(w for w in tokens if w.isalpha())
# only trigrams that appear 3+ times
finder.apply_freq_filter(3)
# return the 20 n-grams with the highest pointwise mutual information (PMI)
trigram_measures = nltk.collocations.TrigramAssocMeasures()
finder.nbest(trigram_measures.pmi, 20)
[('MOVE', 'SOUTH', 'AFRICA'), ('SOUTH', 'AFRICA', 'FORWARD'), ('TOGETHER', 'WE', 'MOVE'), ('WE', 'MOVE', 'SOUTH'), ('must', 'actively', 'contribute'), ('free', 'primary', 'health'), ('primary', 'health', 'care'), ('move', 'South', 'Africa'), ('development', 'finance', 'institutions'), ('South', 'Africa', 'forward'), ('a', 'training', 'space'), ('Take', 'steps', 'to'), ('public', 'works', 'programmes'), ('actively', 'contribute', 'to'), ('next', 'five', 'years'), ('South', 'Africa', 'is'), ('It', 'has', 'been'), ('into', 'a', 'training'), ('all', 'South', 'Africans'), ('Intensify', 'the', 'fight')]
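#PMI can over-promote rare word combinations; as an alternative (untested) sketch, we could rank
#the same trigrams by likelihood ratio, which tends to favour more frequent collocations
finder.nbest(trigram_measures.likelihood_ratio, 20)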