This IPython notebook is designed to run on the virtual machine (VM) provided in support of Mining the Social Web, 2nd Edition. Chunks of this script are also borrowed from the same source, specifically the chapter on mining web pages.
If you don't use the suggested VM, you'll need to install packages as you go along (starting with nltk, numpy and matplotlib; the easiest alternative might be to install a scientific Python distribution such as [Anaconda](http://continuum.io/downloads)).
As it is, there are a few additional libraries you will have to install, in which case, if you aren't using Linux or a Linux VM, you're on your own, I'm afraid...
#I'm using the scraperwiki Python library for scraping text from PDFs.
#There may be easier alternatives, but this is the approach I'm using...
#Unfortunately, there are a few dependencies for the function I actually want to be able to call...
!apt-get -y install poppler-utils
!apt-get -y install libxml2-dev
!apt-get -y install libxslt-dev
!pip install lxml
!pip install scraperwiki
import scraperwiki
import urllib2, lxml.etree
def get_PDF_XML(url):
    pdfdata = urllib2.urlopen(url).read()
    xmldata = scraperwiki.pdftoxml(pdfdata)
    root = lxml.etree.fromstring(xmldata)
    return root
#Usage:
#root=get_PDF_XML('http://www.anc.org.za/2014/wp-content/themes/anc/downloads/Manifesto_Booklet.pdf')
#Get a PDF document from a URL/web address and then XMLify it - here are some example manifestos
manifesto_ANC='http://www.anc.org.za/2014/wp-content/themes/anc/downloads/Manifesto_Booklet.pdf'
manifesto_EFF='http://effighters.org.za/wp-content/uploads/2014/02/EFF-ELECTIONS-MANIFESTO.pdf'#untested
manifesto_DA='http://www.da.org.za/docs/15652/DA%20Manifesto.pdf'#untested
#The document I am going to look at is the ANC manifesto
manifesto=manifesto_ANC
root=get_PDF_XML(manifesto)
#We're now going to try to extract the text elements
pages = list(root)
#In the simplest case we can just look at what's in the text elements - here's a preview from one of the pages:
for page in pages:
    for el in page[5:6]:
        if el.tag == "text":
            print el.text
#By inspection of the preview, we can see we don't want to scrape everything
#For example, we can omit:
## None
## The lines starting with dates: 2014/01/10
## The lines starting with 'ANC Manifesto Booklet_v2.indd'
## Lines that just contain a number
## We can strip the bullet marks
txt=''
for page in pages:
    for el in page:
        if el.tag == "text":
            try:
                #If the line is just a number, omit it
                int(el.text.strip().split()[0])
                notNum=False
            except:
                notNum = True
            #This line tries to exclude things that aren't interesting
            if el.text !=None and notNum and 'ANC Manifesto Booklet_v2.indd' not in el.text and not el.text.startswith('2014/01/'):
                txt=' '.join([txt,el.text.encode('utf8').replace('•','').strip()])
#The variable txt should contain the substance of the PDF text
print txt
#We're going to use a natural language toolkit
import nltk
# Downloading nltk packages used in this example
nltk.download('punkt')
[nltk_data] Downloading package 'punkt' to /root/nltk_data... [nltk_data] Package punkt is already up-to-date!
True
#We can parse the original text into a set of sentences
#I'm doing a little bit of tidying along the way by replacing the newline characters with spaces.
sentences = nltk.tokenize.sent_tokenize(txt.replace('\n',' '))
print sentences
#Split each sentence into a list of tokens - that is, words.
tokens = [nltk.tokenize.word_tokenize(s) for s in sentences]
print tokens
# Downloading nltk packages used in this example
# This library can be used to identify different grammatical elements in a sentence (noun, verb etc)
nltk.download('maxent_treebank_pos_tagger')
[nltk_data] Downloading package 'maxent_treebank_pos_tagger' to /root/nltk_data...
[nltk_data] Package maxent_treebank_pos_tagger is already up-to-date!
True
#Let's see how the tokens appear grammatically
pos_tagged_tokens = [nltk.pos_tag(t) for t in tokens]
print pos_tagged_tokens
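#We don't do anything further with the tagged tokens in this notebook, but as a quick (untested) sketch
#of the sort of thing they make possible, here's one way of counting the most common nouns
#(the tagger uses Penn Treebank tags, where the noun tags all start with 'NN'; the variable name is just for illustration)
noun_counts = nltk.FreqDist([w.lower() for sentence in pos_tagged_tokens
                             for (w, tag) in sentence if tag.startswith('NN')])
print sorted(noun_counts.items(), key=lambda x: -x[1])[:20]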
# Not sure we need these? If you get an error below, try uncommenting and running these downloads!
#nltk.download('maxent_ne_chunker')
#nltk.download('words')
#import nltk #Already loaded..
import json
import numpy
#This routine allows us to generate a summary of a text
N = 100 # Number of top-ranked (most frequent) words to treat as "important"
CLUSTER_THRESHOLD = 5 # Maximum distance (in tokens) between important words in the same cluster
TOP_SENTENCES = 5 # Number of sentences to return for a "top n" summary
# Download nltk packages used in this example
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('english') + ['.',',','--','\'s','?',')','(',':','\\','"','-','}','{']
# Approach taken from "The Automatic Creation of Literature Abstracts" by H.P. Luhn
def _score_sentences(sentences, important_words):
scores = []
sentence_idx = -1
for s in [nltk.tokenize.word_tokenize(s) for s in sentences]:
sentence_idx += 1
word_idx = []
# For each word in the word list...
for w in important_words:
try:
# Compute an index for where any important words occur in the sentence.
word_idx.append(s.index(w))
except ValueError, e: # w not in this particular sentence
pass
word_idx.sort()
# It is possible that some sentences may not contain any important words at all.
if len(word_idx)== 0: continue
# Using the word index, compute clusters by using a max distance threshold
# for any two consecutive words.
clusters = []
cluster = [word_idx[0]]
i = 1
while i < len(word_idx):
if word_idx[i] - word_idx[i - 1] < CLUSTER_THRESHOLD:
cluster.append(word_idx[i])
else:
clusters.append(cluster[:])
cluster = [word_idx[i]]
i += 1
clusters.append(cluster)
# Score each cluster. The max score for any given cluster is the score
# for the sentence.
max_cluster_score = 0
for c in clusters:
significant_words_in_cluster = len(c)
total_words_in_cluster = c[-1] - c[0] + 1
score = 1.0 * significant_words_in_cluster \
* significant_words_in_cluster / total_words_in_cluster
if score > max_cluster_score:
max_cluster_score = score
scores.append((sentence_idx, score))
return scores
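#To make the scoring step concrete, here's a toy example (not from the manifesto):
#'growth' and 'jobs' sit two tokens apart, well within the CLUSTER_THRESHOLD of 5,
#so they form a single cluster of 2 significant words spanning 3 tokens,
#which scores 2*2/3 = 1.33 for the sentence
print _score_sentences(['inclusive growth creates jobs for everyone'], ['growth', 'jobs'])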
def summarize(txt):
    sentences = [s for s in nltk.tokenize.sent_tokenize(txt)]
    normalized_sentences = [s.lower() for s in sentences]
    words = [w.lower() for sentence in normalized_sentences for w in
             nltk.tokenize.word_tokenize(sentence)]
    fdist = nltk.FreqDist(words)
    top_n_words = [w[0] for w in fdist.items()
                   if w[0] not in nltk.corpus.stopwords.words('english')][:N]
    scored_sentences = _score_sentences(normalized_sentences, top_n_words)
    # Summarization Approach 1:
    # Filter out nonsignificant sentences by using the average score plus a
    # fraction of the std dev as a filter
    avg = numpy.mean([s[1] for s in scored_sentences])
    std = numpy.std([s[1] for s in scored_sentences])
    mean_scored = [(sent_idx, score) for (sent_idx, score) in scored_sentences
                   if score > avg + 0.5 * std]
    # Summarization Approach 2:
    # Another approach would be to return only the top N ranked sentences
    top_n_scored = sorted(scored_sentences, key=lambda s: s[1])[-TOP_SENTENCES:]
    top_n_scored = sorted(top_n_scored, key=lambda s: s[0])
    # Return both summaries in a dict
    return dict(top_n_summary=[sentences[idx] for (idx, score) in top_n_scored],
                mean_scored_summary=[sentences[idx] for (idx, score) in mean_scored])
[nltk_data] Downloading package 'stopwords' to /root/nltk_data... [nltk_data] Package stopwords is already up-to-date!
txt_summarized= summarize(txt)
print txt_summarized['top_n_summary']
['The private sector must actively contribute to inclusive growth, investment, social development and economic transformation.', 'Ensure all South Africans have access to adequate human settlements and quality living conditions through programmes that provide one million housing opportunities for qualifying households over the next five years, and providing basic services and infrastructure in all existing informal settlements.', 'The state\xe2\x80\x99s buying power will support small enterprises, co-operatives and broad-based black economic empowerment.', 'Nearly 500 informal settlements have been replaced with quality housing and basic services.', 'This manifesto is our pledge to move South Africa forward, together.']
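#The summarize function also returns the mean-score-filtered summary (Approach 1),
#so we can print that as well for comparison
print txt_summarized['mean_scored_summary']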
from operator import itemgetter
import re
#Hack of a pytagcloud function - the original threw errors for me with language handling
#I also added the ability to pass in your own additional stopwords list
def get_tag_counts(text,mystopwords=[]):
    """
    Search tags in a given text. The language detection is based on stop lists.
    This implementation is inspired by https://github.com/jdf/cue.language. Thanks Jonathan Feinberg.
    """
    words = map(lambda x:x.lower(), re.findall(r"[\w']+", text, re.UNICODE))
    #Stopwords are commonly occurring words we aren't interested in, such as: the, and, but, of
    stopwords = nltk.corpus.stopwords.words('english')+mystopwords
    counted = {}
    for word in words:
        if not word in stopwords and len(word) > 1:
            if counted.has_key(word):
                counted[word] += 1
            else:
                counted[word] = 1
    return sorted(counted.iteritems(), key=itemgetter(1), reverse=True)
#If you are interested in what stop words are contained in the list uncomment the following line and run this cell
#nltk.corpus.stopwords.words('english')
from pytagcloud import create_tag_image, make_tags
import IPython.display
#If there are too many tags we get an error. Limit the number of words displayed explicitly
maxwords=50
maxfontsize=30
#After you generate the word cloud, if there are words you want to exclude, add them to your own stoplist
mystopwords=['000','20']
tags = make_tags(get_tag_counts(txt,mystopwords)[:maxwords], maxsize=maxfontsize)
create_tag_image(tags, 'wordcloud.png', size=(600, 400))
#The word cloud has been saved as an image with the specified filename.
#We can pull it back in to the cell and render it for display purposes
IPython.display.display(IPython.display.Image(filename='wordcloud.png'))
#Look at the most commonly occurring words that aren't in the stop list
wordcounts=get_tag_counts(txt)
#Or if you want to add further stopwords and have already defined a list: mystopwords
#get_tag_counts(txt,mystopwords)
wordcounts[:20]
[('years', 65), ('public', 57), ('people', 56), ('south', 55), ('development', 54), ('infrastructure', 38), ('sector', 37), ('ensure', 36), ('support', 36), ('million', 36), ('work', 34), ('social', 33), ('health', 32), ('national', 32), ('new', 31), ('promote', 31), ('last', 30), ('000', 28), ('economic', 28), ('african', 28)]
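#As a quick visual check - a minimal sketch, assuming matplotlib is installed and inline plotting is enabled -
#we can chart the counts of the top twenty words
import matplotlib.pyplot as plt
topwords = wordcounts[:20]
plt.figure(figsize=(10, 4))
plt.bar(range(len(topwords)), [count for (word, count) in topwords])
plt.xticks(range(len(topwords)), [word for (word, count) in topwords], rotation=90)
plt.ylabel('Count')
plt.show()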
#Set up the imported text as an nltk tokens list and Text structure
tokens = nltk.word_tokenize(txt)
text = nltk.Text(tokens)
#Generate a plot of where particular words appear in the document
#Unfortunately, this doesn't work for me.
#Not sure if it's an issue with the original VM or whether I broke something with the additional installs, e.g. for pytagcloud
text.dispersion_plot(['development','infrastructure','services','education', 'health', 'security'])
---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-19-73b3bbd4dadf> in <module>()
----> 1 text.dispersion_plot(['development','infrastructure','services','education', 'health', 'security'])

/usr/local/lib/python2.7/dist-packages/nltk/text.pyc in dispersion_plot(self, words)
    453         :seealso: nltk.draw.dispersion_plot()
    454         """
--> 455         from nltk.draw import dispersion_plot
    456         dispersion_plot(self, words)
    457

ImportError: cannot import name dispersion_plot
/usr/local/lib/python2.7/dist-packages/nltk/draw/__init__.py:14: UserWarning: nltk.draw package not loaded (please install Tkinter library).
  warnings.warn("nltk.draw package not loaded "
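#As a workaround - a rough sketch, assuming matplotlib itself works even though Tkinter/nltk.draw doesn't -
#we can draw our own dispersion plot by marking the token offsets at which each target word appears
import matplotlib.pyplot as plt
targets = ['development','infrastructure','services','education', 'health', 'security']
plt.figure(figsize=(12, 4))
for i, word in enumerate(targets):
    offsets = [j for j, t in enumerate(tokens) if t.lower() == word]
    plt.plot(offsets, [i] * len(offsets), linestyle='', marker='|')
plt.yticks(range(len(targets)), targets)
plt.xlabel('Word offset in document')
plt.show()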
#If we call the *concordance* function on the nltk-ified text, and provide a key word,
# we can get a list of words around occurrences of that key word.
text.concordance('development')
Building index... Displaying 25 of 48 matches: r social fabric and undermine our development efforts. Our economy continues to reduces inequality. The National Development Plan ( NDP ) aims to eradicate po shifts the trajectory of economic development , the National Infrastructure Pla ihoods for inclusive growth Rural development , land reform and food security E ustrialisation and infrastructure development programmes for inclusive growth a sive growth , investment , social development and economic transformation. Prom the financial sector , including development finance institutions , through bo d labour broking. Implement rural development focusing on meeting basic needs , land reform and rural enterprise development , supported by localised markets in support of small-holder farmer development , prioritising former homeland co ety , affirmative action , skills development , minimum wages for workers in vu ive infrastructure and industrial development The skills and capabilities of th spheres of government. Align the development mandate of state-owned enterprise date of state-owned enterprises , development finance institutions ( DFIs ) and l be promoted. Through our skills development initiative we will turn every pub itizens to play a greater role in development We will strengthen existing forum people to play a greater role in development through partnerships with a range sive growth , investment , social development and economic transformation. cons nd technology ; and advance rural development , land and agrarian reform and fo er with state-owned enterprises , development finance institutions and the priv e our work to establish the BRICS development bank , which will help finance in will help finance infrastructure development in developing countries. Invest i the Eastern Cape , an integrated development that includes a new Mzimvubu Dam ger transport systems through the development of bus-rapid transit systems to m new jobs and contribute to skills development as buses , taxis , locomotives an
#Ngrams are groups of N words that commonly occur together (bigrams are N=2, trigrams are N=3)
#If we look for common Ngrams, we can see commonly repeated phrases.
#To find interesting Ngrams, we look for ones whose words co-occur more often than we would expect by chance
#sort of via http://stackoverflow.com/a/2453229/454773
finder = nltk.BigramCollocationFinder.from_words(w for w in tokens if w.isalpha())
# only bigrams that appear 3+ times
finder.apply_freq_filter(3)
# return the 20 n-grams with the highest pointwise mutual information (PMI)
#This identifies pairs where the words appear together more frequently than you might expect
#given the frequency of each term taken on its own in the document.
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder.nbest(bigram_measures.pmi, 20)
[('AFRICA', 'FORWARD'), ('MOVE', 'SOUTH'), ('SOUTH', 'AFRICA'), ('TOGETHER', 'WE'), ('WE', 'MOVE'), ('found', 'guilty'), ('value', 'chains'), ('Take', 'steps'), ('Grade', 'R'), ('actively', 'contribute'), ('firm', 'action'), ('must', 'actively'), ('communal', 'areas'), ('sustainable', 'livelihoods'), ('those', 'who'), ('small', 'scale'), ('set', 'out'), ('inclusive', 'growth'), ('Working', 'together'), ('training', 'space')]
#We can also look for interesting trigrams
finder = nltk.TrigramCollocationFinder.from_words(w for w in tokens if w.isalpha())
# only trigrams that appear 3+ times
finder.apply_freq_filter(3)
# return the 20 n-grams with the highest pointwise mutual information (PMI)
trigram_measures = nltk.collocations.TrigramAssocMeasures()
finder.nbest(trigram_measures.pmi, 20)
[('MOVE', 'SOUTH', 'AFRICA'), ('SOUTH', 'AFRICA', 'FORWARD'), ('TOGETHER', 'WE', 'MOVE'), ('WE', 'MOVE', 'SOUTH'), ('must', 'actively', 'contribute'), ('free', 'primary', 'health'), ('primary', 'health', 'care'), ('move', 'South', 'Africa'), ('development', 'finance', 'institutions'), ('South', 'Africa', 'forward'), ('a', 'training', 'space'), ('Take', 'steps', 'to'), ('public', 'works', 'programmes'), ('actively', 'contribute', 'to'), ('next', 'five', 'years'), ('South', 'Africa', 'is'), ('It', 'has', 'been'), ('into', 'a', 'training'), ('all', 'South', 'Africans'), ('Intensify', 'the', 'fight')]
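#PMI can over-promote rare word combinations; as an alternative (untested) sketch, we could rank
#the same trigrams by likelihood ratio, which tends to favour more frequent collocations
finder.nbest(trigram_measures.likelihood_ratio, 20)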