In this exercise, the objective is to figure out what a program does and how it does it, based on the methodology in the programming concepts cheat sheet. The code to be studied comes from a tutorial on the Programming Historian. On a headline level, it takes a transcription of a historic trial and tries to summarize what the trial is about by printing meaningful frequent words. But how does it do it? What processing steps and decisions have gone into it? For example, how is "meaningful" defined?
Feel free to also add/replace print statements in the code below to figure out for example what the variables contain.
Note: for this to work, first load the library by running the second cell, and only then run this cell.
# html-to-freq-3.py
# Fetch one Old Bailey trial transcript and print its words, most
# frequent first, after dropping stopwords.
url = 'https://www.dhi.ac.uk/api/data/oldbailey_record_single?idkey=t17800628-33'

# Lowercase before tokenising: every entry in the stopword list is
# lowercase, so without this step capitalised stopwords ("The", "I", "He")
# survive the filter and "Prisoner"/"prisoner" are counted separately.
text = getTextFromAPI(url).lower()
fullwordlist = stripNonAlphaNum(text)
wordlist = removeStopwords(fullwordlist, stopwords)

# create sorted list of (frequency, word) pairs
dictionary = wordListToFreqDict(wordlist)
sorteddict = sortFreqDict(dictionary)

# compile the pairs into one "word: frequency" line each
outstring = "".join(word + ": " + str(count) + "\n" for count, word in sorteddict)
print(outstring)
# obo.py library

# Lowercase English stopwords used by removeStopwords(). Kept as a list so
# callers can still extend it with `stopwords += [...]`. Every entry appears
# exactly once (the original list repeated 'three').
stopwords = [
    'a', 'about', 'above', 'across', 'after', 'afterwards',
    'again', 'against', 'all', 'almost', 'alone', 'along',
    'already', 'also', 'although', 'always', 'am', 'among',
    'amongst', 'amoungst', 'amount', 'an', 'and', 'another',
    'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere',
    'are', 'around', 'as', 'at', 'back', 'be', 'became',
    'because', 'become', 'becomes', 'becoming', 'been',
    'before', 'beforehand', 'behind', 'being', 'below',
    'beside', 'besides', 'between', 'beyond', 'bill', 'both',
    'bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant',
    'co', 'computer', 'con', 'could', 'couldnt', 'cry', 'de',
    'describe', 'detail', 'did', 'do', 'done', 'down', 'due',
    'during', 'each', 'eg', 'eight', 'either', 'eleven', 'else',
    'elsewhere', 'empty', 'enough', 'etc', 'even', 'ever',
    'every', 'everyone', 'everything', 'everywhere', 'except',
    'few', 'fifteen', 'fifty', 'fill', 'find', 'fire', 'first',
    'five', 'for', 'former', 'formerly', 'forty', 'found',
    'four', 'from', 'front', 'full', 'further', 'get', 'give',
    'go', 'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her',
    'here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers',
    'herself', 'him', 'himself', 'his', 'how', 'however',
    'hundred', 'i', 'ie', 'if', 'in', 'inc', 'indeed',
    'interest', 'into', 'is', 'it', 'its', 'itself', 'keep',
    'last', 'latter', 'latterly', 'least', 'less', 'ltd', 'made',
    'many', 'may', 'me', 'meanwhile', 'might', 'mill', 'mine',
    'more', 'moreover', 'most', 'mostly', 'move', 'much',
    'must', 'my', 'myself', 'name', 'namely', 'neither', 'never',
    'nevertheless', 'next', 'nine', 'no', 'nobody', 'none',
    'noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'of',
    'off', 'often', 'on', 'once', 'one', 'only', 'onto', 'or',
    'other', 'others', 'otherwise', 'our', 'ours', 'ourselves',
    'out', 'over', 'own', 'part', 'per', 'perhaps', 'please',
    'put', 'rather', 're', 's', 'same', 'see', 'seem', 'seemed',
    'seeming', 'seems', 'serious', 'several', 'she', 'should',
    'show', 'side', 'since', 'sincere', 'six', 'sixty', 'so',
    'some', 'somehow', 'someone', 'something', 'sometime',
    'sometimes', 'somewhere', 'still', 'such', 'system', 'take',
    'ten', 'than', 'that', 'the', 'their', 'them', 'themselves',
    'then', 'thence', 'there', 'thereafter', 'thereby',
    'therefore', 'therein', 'thereupon', 'these', 'they',
    'thick', 'thin', 'third', 'this', 'those', 'though', 'three',
    'through', 'throughout', 'thru', 'thus', 'to',
    'together', 'too', 'top', 'toward', 'towards', 'twelve',
    'twenty', 'two', 'un', 'under', 'until', 'up', 'upon',
    'us', 'very', 'via', 'was', 'we', 'well', 'were', 'what',
    'whatever', 'when', 'whence', 'whenever', 'where',
    'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon',
    'wherever', 'whether', 'which', 'while', 'whither', 'who',
    'whoever', 'whole', 'whom', 'whose', 'why', 'will', 'with',
    'within', 'without', 'would', 'yet', 'you', 'your',
    'yours', 'yourself', 'yourselves',
]
# Given the HTML of a page, return the text found between the first "<p>"
# and the last "<br/>", with everything between a "<" and the next ">"
# removed. If a marker is missing, that end of the page is used instead.
def stripTags(pageContents):
    startLoc = pageContents.find("<p>")
    endLoc = pageContents.rfind("<br/>")
    # find()/rfind() return -1 when the marker is absent. Used directly as a
    # slice bound, -1 would silently throw away (almost) the whole page, so
    # fall back to the real start/end of the string.
    if startLoc == -1:
        startLoc = 0
    if endLoc == -1:
        endLoc = len(pageContents)

    inside = False  # True while scanning between '<' and the matching '>'
    kept = []
    for char in pageContents[startLoc:endLoc]:
        if char == '<':
            inside = True
        elif inside and char == '>':
            inside = False
        elif not inside:
            # A '>' met outside a tag is ordinary text and is kept.
            kept.append(char)
    # Join once instead of growing a string one character at a time.
    return ''.join(kept)
# Get the text from an Old Bailey Online API request.
# The response body is decoded as UTF-8 JSON and the "text" field of the
# first hit (hits -> hits -> [0] -> _source -> text) is returned.
# Raises KeyError/IndexError if the response does not have that shape.
def getTextFromAPI(url):
    import json
    from urllib.request import urlopen

    # The with-block closes the connection even if reading or decoding fails.
    with urlopen(url) as response:
        payload = json.loads(response.read().decode('utf-8'))
    return payload['hits']['hits'][0]['_source']['text']
# Given a text string, return the list of its words, i.e. the maximal runs
# of alphanumeric characters (using the Unicode definition of alphanumeric;
# the regex class \w also accepts the underscore).
# Text that starts or ends with punctuation no longer yields empty-string
# "words", which would otherwise be counted like any other token.
def stripNonAlphaNum(text):
    import re
    return re.compile(r'\w+', re.UNICODE).findall(text)
# Given a list of words, return a dictionary of
# word-frequency pairs. Keys appear in order of first occurrence.
def wordListToFreqDict(wordlist):
    from collections import Counter
    # Counter tallies in a single pass; calling wordlist.count() for every
    # word rescans the whole list each time (quadratic in the text length).
    return dict(Counter(wordlist))
# Turn a word -> frequency dictionary into a list of (frequency, word)
# tuples ordered from most to least frequent. Words with equal frequency
# come out in reverse alphabetical order, because whole tuples are compared.
def sortFreqDict(freqdict):
    pairs = [(count, word) for word, count in freqdict.items()]
    return sorted(pairs, reverse=True)
# Given a list of words, remove any that are
# in a list of stop words. The comparison is exact (case-sensitive) and the
# order of the remaining words is preserved.
def removeStopwords(wordlist, stopwords):
    # Build the lookup set once: testing each word against a list would
    # rescan the whole stopword list for every word of the text.
    stopset = frozenset(stopwords)
    return [w for w in wordlist if w not in stopset]
# Given a URL, return string of lowercase text from page.
# The page is decoded as UTF-8 and passed through stripTags() before
# being lowercased.
def webPageToText(url):
    from urllib.request import urlopen

    # The with-block closes the connection even if reading or decoding fails.
    with urlopen(url) as response:
        html = response.read().decode('utf-8')
    return stripTags(html).lower()
# Given name of calling program, a url and a string to wrap,
# write the string into the body of "<program>.html" (in the current working
# directory) together with basic metadata, then open that file in a new tab
# of the default web browser.
# Note: url and body are inserted into the HTML as-is, without escaping.
def wrapStringInHTMLMac(program, url, body):
    import datetime
    from pathlib import Path
    from webbrowser import open_new_tab

    now = datetime.datetime.today().strftime("%Y%m%d-%H%M%S")
    filename = program + '.html'
    wrapper = """<html>
<head>
<title>%s output - %s</title>
</head>
<body><p>URL: <a href=\"%s\">%s</a></p><p>%s</p></body>
</html>"""
    whole = wrapper % (program, now, url, url, body)

    # The with-block closes the file even if the write fails.
    with open(filename, 'w') as f:
        f.write(whole)

    # Open the file that was actually written, as a file:// URL derived from
    # its absolute path, instead of a hard-coded directory that every user
    # would have to edit by hand.
    open_new_tab(Path(filename).resolve().as_uri())
# Given name of calling program, a url and a string to wrap,
# write the string into the body of "<program>.html" (in the current working
# directory) together with basic metadata, then open that file in a new tab
# of the default web browser.
# Note: url and body are inserted into the HTML as-is, without escaping.
def wrapStringInHTMLWindows(program, url, body):
    import datetime
    from webbrowser import open_new_tab

    now = datetime.datetime.today().strftime("%Y%m%d-%H%M%S")
    filename = program + '.html'
    wrapper = """<html>
<head>
<title>%s output - %s</title>
</head>
<body><p>URL: <a href=\"%s\">%s</a></p><p>%s</p></body>
</html>"""
    whole = wrapper % (program, now, url, url, body)

    # The with-block closes the file even if the write fails.
    with open(filename, 'w') as f:
        f.write(whole)

    open_new_tab(filename)
# Slide a window of n consecutive items over wordlist and return every
# window, in order, as a list of slices. A wordlist shorter than n
# produces an empty list.
def getNGrams(wordlist, n):
    window_count = len(wordlist) - n + 1
    ngrams = []
    for start in range(window_count):
        ngrams.append(wordlist[start:start + n])
    return ngrams
# Given a list of n-grams, return a dictionary of KWICs,
# indexed by keyword. The keyword of an n-gram is its middle item (index
# len // 2, taken from the length of the first n-gram); each dictionary
# value is the list of n-grams sharing that keyword, in input order.
# An empty input gives an empty dictionary.
def nGramsToKWICDict(ngrams):
    kwicdict = {}
    # Without this guard, ngrams[0] below raises IndexError on empty input
    # (e.g. when the text is shorter than the n-gram size).
    if not ngrams:
        return kwicdict

    keyindex = len(ngrams[0]) // 2
    for k in ngrams:
        kwicdict.setdefault(k[keyindex], []).append(k)
    return kwicdict
# Format one KWIC (a sequence of words whose middle item is the keyword)
# as a single display line: the left context right-aligned in a column of
# 10 characters per word, the keyword centred with 3 spaces on either
# side, then the right context.
def prettyPrintKWIC(kwic):
    column_width = 10
    keyword_padding = 6
    middle = len(kwic) // 2
    keyword = kwic[middle]

    left_context = ' '.join(kwic[:middle])
    right_context = ' '.join(kwic[middle + 1:])

    pieces = [
        left_context.rjust(column_width * middle),
        str(keyword).center(len(keyword) + keyword_padding),
        right_context,
    ]
    return ''.join(pieces)