This example shows how to automatically build a corpus from research paper abstracts and how to generate insights into the research output of a given field or company.
To go through the following analyses, you should install these Python libraries, with their respective dependencies: BeautifulSoup4 (bs4), nltk, scikit-learn, matplotlib, numpy, scipy, and pandas.
> conda install ipython-notebook
Then start the notebook from your shell or command prompt:
> ipython notebook
For some analyses, you also need a few standard corpora from the nltk library. In a Python shell, run:
>>> import nltk
>>> nltk.download()
Then select and install the corpora "WordNet" and "English Stopwords".
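If you prefer to skip the download GUI, both resources can also be fetched non-interactively (the identifiers below are nltk's own resource names):
>>> import nltk
>>> nltk.download('stopwords')  # the stopword lists, including English
>>> nltk.download('wordnet')    # the lexicon used by the WordNet lemmatizer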
The five fundamental steps in analyses such as this one are:
1. Harvest the raw data (here: scraping the publication pages).
2. Extract and clean the text (titles, abstracts, metadata).
3. Tokenize and normalize it (stopword removal, stemming, lemmatization).
4. Analyze it (word frequencies, collocations, concordances, similarities).
5. Visualize and model the results (frequency plots, trends, clustering).
For the technical details of these steps, see nltk.org.
For practice, we will take a look at the research papers published at Google. Here's the data in human-readable form:
from IPython.display import HTML
HTML('<iframe src="http://research.google.com/pubs/papers.html" width=100% height=350></iframe>')
What we want to find out: which topics dominate Google's research, which authors and journals appear most often, how the publication output develops over time, and which topics stand out in the most recent year. First, we collect the links to the individual publication pages:
from bs4 import BeautifulSoup
import urllib2
url = "http://research.google.com/pubs/papers.html" # Base URL
raw = urllib2.urlopen(url).read() # Download page
soup = BeautifulSoup(raw, "html.parser") # Parse HTML
links = [l.get("href") for l in soup.find_all("a")] # Collect all link targets
links = ["http://research.google.com" + l for l in links if l and "/pubs/" in l] # Keep only publication links, skipping anchors without href
links = [l for l in links if l != url] # Exclude self-referential links
links[:4]
['http://research.google.com/pubs/AlgorithmsandTheory.html', 'http://research.google.com/pubs/ArtificialIntelligenceandMachineLearning.html', 'http://research.google.com/pubs/DataManagement.html', 'http://research.google.com/pubs/DataMining.html']
title = list()
abstract = list()
for l in links:
    raw = urllib2.urlopen(l).read()
    soup = BeautifulSoup(raw, "html.parser")
    title = title + [t.string.strip() for t in soup.find_all("p", class_="pub-title")] # Titles sit in pub-title paragraphs
    abstract = abstract + [a["href"] for a in soup.find_all("a", class_="abstract-icon tooltip")] # Abstract links sit in tooltip anchors
import re
title = [re.sub(' +', ' ', t) for t in title] # Collapse runs of spaces
title = [t.replace('\n', '') for t in title]  # Remove newlines
print title[:10]
[u'2014 Recent Books and Journals in Public Opinion, Survey Methods, and Survey Statistics', u'A critical review of studies investigating the quality of data obtained with online panels based on probability and nonprobability samples', u'An efficient reconciliation algorithm for social networks.', u'Circumlocution in Diagnostic Medical Queries', u'Collaboration in the Cloud at Google', u'Coupled and k-Sided Placements: Generalizing Generalized Assignment', u'Data enrichment for incremental reach estimation', u'Inferring causal impact using Bayesian structural time-series models', u'Insulin Resistance: Regression and Clustering', u'Internet and mobile ratings panels']
words = [ w.lower() for t in title for w in t.split() ]
words[:10]
[u'2014', u'recent', u'books', u'and', u'journals', u'in', u'public', u'opinion,', u'survey', u'methods,']
import nltk
# Download stopwords
# nltk.download() # Opens the download manager
stop = nltk.corpus.stopwords.words('english')
print stop[:10]
words_without_stopwords = [i for i in words if i not in stop] # Keep only non-stopwords
print words_without_stopwords[:10]
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your']
[u'2014', u'recent', u'books', u'journals', u'public', u'opinion,', u'survey', u'methods,', u'survey', u'statistics']
from collections import Counter
print "All words"
c = Counter(words)
print c.most_common()[:10] # top 10
print "Without stopwords"
c = Counter(words_without_stopwords)
print c.most_common()[:10] # top 10
All words
[(u'for', 1600), (u'of', 1360), (u'and', 1192), (u'the', 1102), (u'in', 956), (u'a', 900), (u'with', 586), (u'on', 466), (u'to', 454), (u'learning', 352)]
Without stopwords
[(u'learning', 352), (u'search', 334), (u'using', 310), (u'web', 306), (u'data', 302), (u'online', 220), (u'networks', 180), (u'google', 168), (u'models', 162), (u'social', 152)]
from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()
raw = " ".join(title)
tokens = nltk.word_tokenize(raw)
text = nltk.Text(tokens)
text.collocations()
Building collocations list
Large Scale; Machine Translation; Neural Networks; Speech Recognition; Deep Neural; Language Modeling; Finite-State Transducers; Dependency Parsing; Grammar Induction; Public Opinion; Weighted Finite-State; Latent Variable; Domain Adaptation; Image Annotation; Extended Abstract; World Wide; Information Retrieval; Statistical Machine; search engine; Automatic Speech
porter = nltk.PorterStemmer()
text_stemmed = [porter.stem(t).lower() for t in tokens if t not in stop] # Note: capitalized stopwords ('A', 'The') and punctuation survive the filter, as the counts below show
c = Counter(text_stemmed)
c.most_common()[:20]
[(u':', 1456), (u'a', 492), (u'model', 408), (u',', 402), (u'search', 378), (u'learn', 368), (u'use', 346), (u'web', 332), (u'network', 324), (u'data', 312), (u'the', 278), (u'onlin', 228), (u'algorithm', 204), (u'system', 202), (u'googl', 196), (u'distribut', 184), (u'comput', 182), (u'optim', 172), (u'languag', 162), (u'queri', 158)]
# The lemmatizer needs the nltk corpus "wordnet",
# installed via nltk.download() in a Python shell
wnl = nltk.WordNetLemmatizer()
tokens_lower = [w.lower() for w in tokens]
text_lemmata = [wnl.lemmatize(t) for t in tokens_lower if t not in stop]
# Remove single-character punctuation tokens
text_lemmata = filter(lambda word: word not in ',-:', text_lemmata)
c = Counter(text_lemmata)
c.most_common()[:20]
[(u'learning', 358), (u'search', 348), (u'web', 332), (u'data', 312), (u'network', 310), (u'using', 310), (u'model', 286), (u'online', 228), (u'system', 202), (u'google', 196), (u'algorithm', 184), (u'language', 162), (u'query', 154), (u'social', 152), (u'video', 152), (u'large', 148), (u'user', 146), (u'application', 146), (u'mobile', 136), (u'distributed', 136)]
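The difference between the two normalizations shows up clearly on single words. A quick illustration, not part of the original analysis, using the porter and wnl objects defined above:
for w in ['queries', 'learning', 'networks', 'distributed']:
    print w, '->', porter.stem(w), '/', wnl.lemmatize(w)  # stem vs. lemma
The stemmer truncates aggressively ('queries' becomes 'queri'), while the lemmatizer maps to dictionary forms ('queries' becomes 'query') and leaves words like 'learning' untouched.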
text.concordance('code')
print
text.concordance('user')
Building index...
Displaying 25 of 32 matches:
Handle an Enormous Error-Correcting Code A Unified Construction of the Glushk
obabilistic Automata A Loopless Gray Code for Minimal Signed-Binary Representa
tterns as partial orders from source code : from usage scenarios to specificat
Change Education ? Google summer of code and google code-in BoF ( abstract on
transcription factor affinity-based code for mammalian transcription initiati
ime adaptation : a case for reactive code alignment Bubble-Up : Increasing Uti
x : A Two-Way Sandbox for x86 Native Code SSAC Advisory on Search List Process
t detection and forensic analysis of code injection attacks Third Internationa
( at Google ) RLint : Reformatting R Code to Follow the Google Style Guide 2nd
r – data race detection in practice. Code Coverage , Performance , Approximati
ction Service RLint : Reformatting R Code to Follow the Google Style Guide Res
ime adaptation : a case for reactive code alignment Simbeeotic : a simulation-
-Time Compilation and Self-Modifying Code Logical Attestation : An Authorizati
r Safe and Unwinding Aware Identical Code Folding in Gold Scalable I/O Event H
for Portable , Untrusted x86 Native Code Optimizing Programs with Intended Se
ive Java , Second Edition Fault-Safe Code Motion for Type-Safe Languages Feedb
Handle an Enormous Error-Correcting Code A Unified Construction of the Glushk
obabilistic Automata A Loopless Gray Code for Minimal Signed-Binary Representa
tterns as partial orders from source code : from usage scenarios to specificat
Change Education ? Google summer of code and google code-in BoF ( abstract on
transcription factor affinity-based code for mammalian transcription initiati
ime adaptation : a case for reactive code alignment Bubble-Up : Increasing Uti
x : A Two-Way Sandbox for x86 Native Code SSAC Advisory on Search List Process
t detection and forensic analysis of code injection attacks Third Internationa
( at Google ) RLint : Reformatting R Code to Follow the Google Style Guide 2nd
Displaying 25 of 118 matches:
? A multinational survey of YouTube user satisfaction Incremental Clicks Impa
ierarchical Geographical Modeling of User locations from Social Media Posts Im
ith Additive Hierarchically-smoothed User Preferences Learning Binary Codes fo
Factorization by Embedding Multiple User Interests On Rectified Linear Units
inese Restaurant Franchise Process : User Tracking and Document Modeling Trans
ta Integration Systems Pay-as-you-go User Feedback for Dataspace Systems The S
ith Additive Hierarchically-smoothed User Preferences Nowcasting with Google T
inese Restaurant Franchise Process : User Tracking and Document Modeling Track
s Reviews Mining advertiser-specific user behavior using adfactors Overlapping
for Orkut Communities : Discovery of User Latent Behavior Computers and iPhone
g : Addressing system complexity and user behavior Characterization and Compar
nal media Mining advertiser-specific user behavior using adfactors Online Stoc
, transparency & control in inferred user interest models Advisory on Internal
? A multinational survey of YouTube user satisfaction Sticking Together : Han
l on Android Online Microsurveys for User Experience Research Photographing in
, transparency & control in inferred user interest models Would a privacy fund
acking 2nd international workshop on user evaluations for software engineering
r software engineering researchers ( USER ) A meteoroid on steroids : ranking
nizers Using Crowdsourcing Design of user interfaces for selective editing of
idelines Exploring and enhancing the user experience for TV FFitts Law : Model
dation of a Questionnaire to Measure User Satisfaction with the Intranet Swipe
l comparative analysis for analyzing user behavior Comparing collaboration and
raction : a new dimension for mobile user interfaces Google Image Swirl : A La
lization System How do designers and user experience professionals actually pe
rch : Getting stakeholder buy-in for user experience research projects Look Wh
text.similar("code")
print
text.similar("user")
print
text.similar("data")
Building word-context index...
automata citations classes compilation content control dynamic fame finger hierarchical human moments multisets network objects oi online pagerank privacy searching
mobile benefits body case chromecast comparative comparing concepts cray crowd design digital directed enterprise extent field formal goals home numa
assignments based efficient future gaussians graph halfspaces high images new point resource sampled signals speech strategies stream svms token tsp
%matplotlib inline
fdist = nltk.FreqDist(text)
fdist.plot(50, cumulative=True)
import re
abstracts = list()
title = list()
year = list()
author = list()
journal = list()
for l in list(set(abstract)):
    raw = urllib2.urlopen("http://research.google.com" + l).read() # Download page
    soup = BeautifulSoup(raw, "html.parser") # Parse page
    # Note: assumes exactly one abstract per page so the lists stay index-aligned
    a = [t.text.strip().lower() for t in (td.find('div') for td in soup.find_all('div', class_="maia-col-8")) if t] # Abstract text, skipping empty columns
    if a:
        abstracts = abstracts + a
    bibtex = soup.find('textarea').text # The BibTeX record holds the metadata
    m = re.search(r"year\s*=\s*(\d\d\d\d)", bibtex)
    if m:
        year.append(m.group(1))
    else:
        year.append("No Year")
    m = re.search(r"title\s*=\s*{(.*?)}", bibtex)
    if m:
        title.append(m.group(1))
    else:
        title.append("No Title")
    m = re.search(r"author\s*=\s*{(.*?)}", bibtex)
    if m:
        author.append(m.group(1).replace(" and ", ",").split(","))
    else:
        author.append(["No Author"]) # Wrap in a list so flattening below works
    m = re.search(r"journal\s*=\s*{(.*?)}", bibtex)
    if m:
        journal.append(m.group(1))
    else:
        journal.append("Other")
import codecs
with codecs.open('research_abstracts.txt', 'r', 'utf-8') as f:
    abstracts = f.read().splitlines() # One abstract per line
textfile = "\n".join(abstracts)
tokens = nltk.word_tokenize(textfile)
text = nltk.Text(tokens)
text_clean = [word for word in text if word not in ',-:()'] # Drop single-character punctuation tokens
all_authors = [a for sublist in author for a in sublist] # Flatten the per-paper author lists
print "Authors"
c = Counter(all_authors)
c.most_common()[:20]
Authors
[(u'Samy Bengio', 33), (u'Ciprian Chelba', 18), (u'Mario Callegaro', 15), (u'Martin Jansche', 14), (u'Jason Weston', 14), (u'Richard F. Lyon', 13), (u'Robert Hundt', 12), (u'Michele Covell', 12), (u'Andrew Senior', 12), (u'Michael Riley', 11), (u'Shumeet Baluja', 11), (u'Jim Koehler', 11), (u'Rahul Sukthankar', 11), (u'Luiz Andr\xe9 Barroso', 11), (u'Dennis Abts', 10), (u'Fernando Pereira', 10), (u'Thomas Dean', 10), (u'Vivek Kwatra', 10), (u'Mark S. Miller', 10)]
print "Journals"
c = Counter(journal)
c.most_common()[:20]
Journals
[('Other', 855), (u'Communications of the ACM', 10), (u'Journal of Machine Learning Research', 6), (u'Journal of Advertising Research', 4), (u'IEEE Micro', 4), (u'Theory of Computing', 4), (u'Computer', 3), (u'Journal Machine Learning Research (JMLR)', 3), (u'IEEE Computer', 3), (u'IEEE Internet Computing', 3), (u'Lecture Notes in Computer Science', 2), (u'IEEE Transactions on Multimedia', 2), (u'IEEE Transactions on Information Theory', 2), (u'IEEE Transactions on Visualization and Computer Graphics', 2), (u'Science', 2), (u'Personal and Ubiquitous Computing', 2), (u'Computational Linguistics', 2), (u'Mathematical Structures in Computer Science', 2), (u'Microprocessors and Microsystems', 2), (u'Social Science Computer Review', 2)]
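The counts contain near-duplicates such as 'Journal of Machine Learning Research' and 'Journal Machine Learning Research (JMLR)'. A crude normalization pass merges some of them (a sketch; norm_journal is a hand-rolled helper and its rules are hand-picked):
def norm_journal(j):
    j = j.lower().strip()
    j = re.sub(r'\s*\(.*?\)\s*$', '', j)  # drop a trailing parenthetical acronym
    return j.replace('journal machine learning', 'journal of machine learning')
c = Counter(norm_journal(j) for j in journal)
print c.most_common()[:20]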
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
t = Counter(year)
x = sorted(t)         # Years in ascending order
y = [t[c] for c in x] # Publication count per year
width = .75
ind = np.arange(len(y))
plt.bar(ind, y)
plt.xticks(ind + width / 2, x, rotation=45)
plt.show()
from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()
print "Bigrams"
finder = BigramCollocationFinder.from_words(text_clean)
scored = finder.score_ngrams(bigram_measures.likelihood_ratio)
print scored[:20]
print "Trigrams"
finder = TrigramCollocationFinder.from_words(text_clean)
scored = finder.score_ngrams(trigram_measures.raw_freq)
print scored[:20]
Bigrams
[((u'this', u'paper'), 7630.523877332284), ((u'can', u'be'), 4915.636479420465), ((u'based', u'on'), 3986.709338297177), ((u'such', u'as'), 3495.67445642069), ((u'in', u'this'), 2902.8211764870275), ((u'we', u'present'), 2807.9677939185112), ((u'paper', u'we'), 2727.079679798322), ((u'show', u'that'), 2593.1166653981827), ((u'of', u'the'), 2340.8347275647625), ((u'number', u'of'), 2140.233668940303), ((u'we', u'propose'), 2002.6025420464705), ((u'as', u'well'), 1745.5958675196175), ((u'well', u'as'), 1677.4215856080632), ((u'we', u'describe'), 1607.7850364146311), ((u'we', u'show'), 1559.9124633680044), ((u'has', u'been'), 1473.8730755374886), ((u'set', u'of'), 1462.8419092156469), ((u'we', u'also'), 1369.6673211246862), ((u'the', u'same'), 1348.5215392768089), ((u'in', u'the'), 1333.7289111945145)]
Trigrams
[((u'in', u'this', u'paper'), 0.0015237422586699626), ((u'this', u'paper', u'we'), 0.0014845042605926245), ((u'as', u'well', u'as'), 0.0006801253000071936), ((u'we', u'present', u'a'), 0.0006539666346223016), ((u'we', u'show', u'that'), 0.0005558716394289563), ((u'we', u'propose', u'a'), 0.0004904749759667261), ((u'the', u'number', u'of'), 0.0004773956432742801), ((u'the', u'use', u'of'), 0.00044469731154316505), ((u'a', u'set', u'of'), 0.00043815764519694204), ((u'abstract', u'this', u'paper'), 0.0003858403144271579), ((u'in', u'order', u'to'), 0.0003662213153884889), ((u'in', u'terms', u'of'), 0.0003662213153884889), ((u'abstract', u'in', u'this'), 0.0003269833173111508), ((u'the', u'problem', u'of'), 0.0003269833173111508), ((u'abstract', u'we', u'present'), 0.00030082465192625874), ((u'can', u'be', u'used'), 0.00029428498558003573), ((u'paper', u'we', u'present'), 0.0002615866538489206), ((u'a', u'variety', u'of'), 0.0002550469875026976), ((u'based', u'on', u'the'), 0.0002550469875026976), ((u'show', u'that', u'the'), 0.0002550469875026976)]
print "Important bigrams"
finder = BigramCollocationFinder.from_words(text_clean)
scored = finder.score_ngrams(bigram_measures.raw_freq)
scored_no_stop = [bi for bi in scored if (bi[0][0] not in stop) and (bi[0][1] not in stop)] # Keep bigrams where neither word is a stopword
print scored_no_stop[:20]
Important bigrams
[((u'machine', u'learning'), 0.00030082465192625874), ((u'paper', u'presents'), 0.00029428498558003573), ((u'experimental', u'results'), 0.0002877453192338127), ((u'speech', u'recognition'), 0.0002615866538489206), ((u'large', u'scale'), 0.00022888832211780555), ((u'web', u'search'), 0.00022888832211780555), ((u'training', u'data'), 0.00019618999038669048), ((u'paper', u'describes'), 0.00018311065769424444), ((u'data', u'sets'), 0.0001765709913480214), ((u'results', u'show'), 0.0001765709913480214), ((u'open', u'source'), 0.0001700313250017984), ((u'%', u'.'), 0.0001634916586555754), ((u'user', u"'s"), 0.0001634916586555754), ((u'search', u'engine'), 0.00015695199230935237), ((u'large', u'number'), 0.00014387265961690634), ((u'wide', u'range'), 0.00014387265961690634), ((u'google', u"'s"), 0.00013733299327068333), ((u'machine', u'translation'), 0.00013733299327068333), ((u'experiments', u'show'), 0.0001307933269244603), ((u'previous', u'work'), 0.0001307933269244603)]
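Raw frequency still favors common pairs. nltk's finder can also discard rare bigrams and rank the remainder by pointwise mutual information, which tends to surface tighter technical terms (a variation on the cell above):
finder = BigramCollocationFinder.from_words(text_clean)
finder.apply_freq_filter(5)  # ignore bigrams occurring fewer than 5 times
print finder.nbest(bigram_measures.pmi, 20)  # top 20 by PMI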
50 shades of gray?
print text.concordance("gray")
print [abstract[i] for i in [abstracts.index(a) for a in abstracts if "gray codes" in a]]
print
print text.concordance("markov")
print [abstract[i] for i in [abstracts.index(a) for a in abstracts if "markov" in a]]
Building index...
Displaying 5 of 5 matches:
ocal optimization. to achieve this , gray codes are often employed for encodin
roximate high-dimensional variant of gray codes. the labeling procedure is use
and simulated annealing , to employ gray codes for encoding ordinal points or
roximate high-dimensional variant of gray codes with standard gray codes as a
variant of gray codes with standard gray codes as a subset of the labels foun
['/pubs/pub36483.html', '/pubs/pub41097.html']
Displaying 25 of 30 matches:
e , and 17.0\ % over vanilla hidden markov models induced with em. abstract :
d approaches to the existing hidden markov model-based one. abstract : this pa
ently outperforms a standard hidden markov model in terms of conditional predi
ed through the partially observable markov decision process ( pomdp ) framewor
roblem is solved with the theory of markov chains by modeling the stochastic p
ic process of face recognition as a markov chain. as conventional face alignme
lignment is most often modeled as a markov process that generates a sentence f
-clustered context-dependent hidden markov models ( hmms ) to represent probab
t networks embedding variable-order markov models. each component network has
n the hierarchy. the variable-order markov models account for features that ar
t networks embedding variable-order markov models. each component network has
n the hierarchy. the variable-order markov models account for features that ar
ently outperforms a standard hidden markov model in terms of conditional predi
outperforms an input/output hidden markov model. furthermore , these models a
ently outperforms a standard hidden markov model in terms of conditional predi
es derived from input/output hidden markov models ( iohmms ) . likelihoods and
f artificial neural network - hidden markov model ( ann/hmm ) hybrid systems fo
est gaussian mixture model - hidden markov model ( gmm/hmm ) baseline , built
ently outperforms a standard hidden markov model in terms of conditional predi
show that a real-time hybrid hidden markov model / neural network ( hmm/nn ) l
text that are represented by hidden markov models. this modeling approach may
and is suitable primarily for large markov chains where reachability analysis
ion probability -- -when applied to markov chains -- -is tied to the subdomina
rforming multiple random walks on a markov graph that approximates user search
eech recognition systems use hidden markov models ( hmms ) to deal with the te
['/pubs/pub38137.html', '/pubs/pub41466.html', '/pubs/pub37230.html', '/pubs/pub41769.html', '/pubs/pub42510.html', '/pubs/pub42191.html', '/pubs/pub41892.html', '/pubs/pub37194.html', '/pubs/pub36356.html', '/pubs/pub41688.html', '/pubs/pub41398.html', '/pubs/pub34503.html', '/pubs/pub41438.html', '/pubs/pub36461.html', '/pubs/pub34911.html', '/pubs/pub34510.html', '/pubs/pub35613.html', '/pubs/pub34342.html', '/pubs/pub42235.html', '/pubs/pub41185.html', '/pubs/pub38107.html', '/pubs/pub40812.html', '/pubs/pub36240.html']
search = "markov"
finder = BigramCollocationFinder.from_words(text_clean)
scored = finder.score_ngrams(bigram_measures.raw_freq)
scored_no_stop = [bi for bi in scored if (search in bi[0][0]) or (search in bi[0][1])]
print scored_no_stop[:20]
[((u'hidden', u'markov'), 9.809499519334524e-05), ((u'markov', u'model'), 5.231733076978413e-05), ((u'markov', u'models'), 3.923799807733809e-05), ((u'a', u'markov'), 2.6158665384892064e-05), ((u'variable-order', u'markov'), 2.6158665384892064e-05), ((u'markov', u'chains'), 1.9618999038669047e-05), ((u'markov', u'models.'), 1.9618999038669047e-05), ((u'the', u'markov'), 1.3079332692446032e-05), ((u'large', u'markov'), 6.539666346223016e-06), ((u'markov', u'chain'), 6.539666346223016e-06), ((u'markov', u'chain.'), 6.539666346223016e-06), ((u'markov', u'decision'), 6.539666346223016e-06), ((u'markov', u'graph'), 6.539666346223016e-06), ((u'markov', u'inequality'), 6.539666346223016e-06), ((u'markov', u'inequality.'), 6.539666346223016e-06), ((u'markov', u'model-based'), 6.539666346223016e-06), ((u'markov', u'model.'), 6.539666346223016e-06), ((u'markov', u'process'), 6.539666346223016e-06), ((u'markov', u'state'), 6.539666346223016e-06), ((u'observable', u'markov'), 6.539666346223016e-06)]
text.generate(100)
Building ngram index...
abstract : we describe the key role mobile photos play in a data structure for the capacity of datacenters , possibly modifying database schemata , and behavioral optimization , and mathematically tractable generative model loses sparsity with the stacked cards-based switching interface allows for carving around fast moving salient regions. additionally , a relevant criterion in a container of gas. the conjugate variables of these vulnerabilities , we show that it outperforms several existing online as well : pruning kneser-ney models ( e.g. , by a cluster analysis , but to share each computer. unfortunately , the number of applications
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer().fit_transform(abstracts)
pairwise_similarity = tfidf * tfidf.T
print pairwise_similarity
(0, 742) 0.000826670891242 (0, 170) 0.00126923093472 (0, 503) 0.00300675382624 (0, 428) 0.00774612927246 (0, 130) 0.00922215439267 (0, 357) 0.0378511512937 (0, 52) 0.0309375609278 (0, 1023) 0.0297836091549 (0, 583) 0.0366965293426 (0, 976) 0.0243614233825 (0, 685) 0.0349825603202 (0, 49) 0.0114511817329 (0, 740) 0.0421584157434 (0, 721) 0.0268484760517 (0, 390) 0.0406558918989 (0, 252) 0.026976177692 (0, 222) 0.0173791005636 (0, 1048) 0.0178477951235 (0, 686) 0.0256307629053 (0, 916) 0.025792007925 (0, 909) 0.0353774758612 (0, 46) 0.0530485033593 (0, 999) 0.0561473971456 (0, 957) 0.0299272761011 (0, 898) 0.0393577113207 : : (1058, 972) 0.115156529311 (1058, 862) 0.0721348276585 (1058, 727) 0.119843137638 (1058, 699) 0.106540292132 (1058, 695) 0.106232962195 (1058, 617) 0.139077371705 (1058, 498) 0.0604916306015 (1058, 420) 0.144904306104 (1058, 379) 0.131266285378 (1058, 358) 0.0741461957034 (1058, 931) 0.0627358516624 (1058, 890) 0.105201141288 (1058, 439) 0.0564022532165 (1058, 989) 0.0997134763274 (1058, 789) 0.0949863212759 (1058, 455) 0.0932557428069 (1058, 540) 0.103824592638 (1058, 942) 0.123999992957 (1058, 743) 0.217744583663 (1058, 637) 0.0938344506503 (1058, 1053) 0.122416272868 (1058, 1015) 0.21258000495 (1058, 845) 0.121965567775 (1058, 955) 0.147951649139 (1058, 1058) 1.0
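The raw sparse matrix is hard to read. A small sketch extracts the most similar pair of distinct abstracts (assuming title and abstracts were filled in the same order by the harvesting loop above):
import numpy as np
sim = pairwise_similarity.toarray()
np.fill_diagonal(sim, 0)  # mask self-similarity on the diagonal
i, j = np.unravel_index(sim.argmax(), sim.shape)
print title[i]
print title[j]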
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words='english')
dtm = cv.fit_transform(abstracts)
dtm = dtm.toarray()
from sklearn.metrics.pairwise import euclidean_distances
dist = euclidean_distances(dtm)
dist
array([[ 0. , 14.6628783 , 22.3159136 , ..., 20.5669638 , 12.80624847, 20.49390153], [ 14.6628783 , 0. , 22.64950331, ..., 20.39607805, 13.89244399, 21.37755833], [ 22.3159136 , 22.64950331, 0. , ..., 25.90366769, 22.3159136 , 26.94438717], ..., [ 20.5669638 , 20.39607805, 25.90366769, ..., 0. , 20.1246118 , 24.31049156], [ 12.80624847, 13.89244399, 22.3159136 , ..., 20.1246118 , 0. , 20.09975124], [ 20.49390153, 21.37755833, 26.94438717, ..., 24.31049156, 20.09975124, 0. ]])
from scipy.cluster.hierarchy import ward, dendrogram
normalizing = np.sqrt(np.sum(dtm * dtm, axis=1, keepdims=True)) # L2 norm of each document vector
dtm_normed = dtm / normalizing # Unit-length rows, so document length does not dominate the distances
clust = ward(dtm_normed)
%matplotlib inline
import matplotlib.pyplot as plt
plt.figure(figsize=(20,100))
dendrogram(clust, orientation="right", labels=title)
plt.tight_layout()
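With more than a thousand leaves, the dendrogram is mainly for eyeballing. scipy can also cut the tree into a fixed number of flat clusters (a quick sketch):
from scipy.cluster.hierarchy import fcluster
flat = fcluster(clust, t=10, criterion='maxclust')  # cut the tree into at most 10 clusters
print Counter(flat).most_common()                   # cluster sizes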
from sklearn.cluster import KMeans
km_fit = KMeans(init='k-means++', n_clusters=8, n_init=10, n_jobs=-1).fit(dtm_normed)
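The cluster labels by themselves are opaque. The heaviest vocabulary terms per centroid give each cluster a rough name (a sketch; the centroids live in the same term space as dtm_normed, and get_feature_names is the vectorizer API of this sklearn generation):
terms = cv.get_feature_names()
for k, center in enumerate(km_fit.cluster_centers_):
    top = center.argsort()[::-1][:8]  # eight heaviest terms in this centroid
    print k, ", ".join(terms[i] for i in top)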
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca_fit = pca.fit_transform(dtm_normed)
colors = ['blue', 'red', 'green', 'black', 'purple', 'gray', 'yellow', 'brown']
plt.figure(figsize=(16,9))
plt.scatter(pca_fit[:,0], pca_fit[:,1], color=[colors[i] for i in km_fit.labels_])
<matplotlib.collections.PathCollection at 0x3876f048>
[title[i] for i in range(len(title)) if km_fit.labels_[i] == 4]
[u'Fast and Scalable Decoding with Language Model Look-Ahead for Phrase-based Statistical Machine Translation', u'User browsing models: relevance versus examination', u"Google's Cross-Dialect Arabic Voice Search", u'Adolescent search roles', u'sos: Searching Help Pages of R Packages', u'Deploying Google Search by Voice in Cantonese', u'Blognoon: Exploring a Topic in the Blogosphere', u'Language Modeling for Automatic Speech Recognition Meets the Web: Google Search by Voice', u'Impact Of Ranking Of Organic Search Results On The Incrementality Of Search Ads', u'Incremental Clicks Impact of Mobile Search Advertising', u'Point Representation for Local Optimization: Towards Multi-Dimensional Gray Codes', u'Good Abandonment in Mobile and PC Internet Search', u'Computers and iPhones and Mobile Phones, oh my! A logs-based comparison of search users on different devices', u'Google Correlate Whitepaper', u'Permutation Indexing: Fast Approximate Retrieval from Large Corpora', u'Learning to Search Efficiently in High Dimensions', u'Neighborhood Preserving Codes for Assigning Point Labels: Applications to Stochastic Search', u'Shopping for Top Forums: Discovering Online Discussion for Product Research', u'Nearest Neighbor Search in Google Correlate', u'The Politics of Search: A Decade Retrospective.', u'A Tale of Two (Similar) Cities: Inferring City Similarity Through Geo-Spatial Query Log Analysis', u'VisualRank: Applying PageRank to Large-Scale Image Search', u"A Room with a View: Understanding Users' Stages in Picking a Hotel Online", u'Query Language Modeling for Voice Search', u'Large-Scale Speaker Identification', u'Perception and Understanding of Social Annotations in Web Search', u'On-Demand Language Model Interpolation for Mobile Speech Input', u'Optimal Size, Freshness and Time-frame for Voice Search Vocabulary', u'Say What? 
Why users choose to speak their web queries', u'The SMAPH System for Query Entity Recognition and Disambiguation', u'Biperpedia: An Ontology for Search Applications', u'Vine Pruning for Efficient Multi-Pass Dependency Parsing', u'Empirical Exploration of Language Modeling for the google.com Query Stream as Applied to Mobile Voice Search', u'Google TV Search: Dual-Wielding Search and Discovery in a Large-Scale Product', u'Knowledge Base Completion via Search-Based Question Answering', u'Learning Query-Specific Distance Functions for Large-Scale Web Image Search', u'Online Effects of Offline Ads', u'Incremental Clicks: The Impact of Search Advertising', u'Search by Voice in Mandarin Chinese', u'Modelling Score Distributions Without Actual Scores', u'Answer typing for information retrieval', u'Search Worms', u'Voice Search for Development', u'Incremental Clicks Impact Of Search Advertising', u'Optimizing Budget Constrained Spend in Search Advertising', u'Robust Local Search for Solving RCPSP/max with Durational Uncertainty', u'Clustering Query Refinements by User Intent', u'Translating Queries into Snippets for Improved Query Expansion', u'SSAC Advisory on Search List Processing', u"Children's Roles Using Keyword Search Interfaces in the Home", u'Topical clustering of search results', u'Near to the brain: Functional near-infrared spectroscopy as a lightweight brain imaging technique for visualization', u'Gesture Search: A Tool for Fast Mobile Data Access', u'YouPivot: Improving Recall with Contextual Search', u'Detecting influenza epidemics using search engine query data', u'Extracting Unambiguous Keywords from Microposts Using Web and Query Logs Data', u'Piggyback: Using Search Engines for Robust Cross-Domain Named Entity Recognition', u'Confucius and Its Intelligent Disciples: Integrating Social with Search', u'On the Difficulty of Nearest Neighbor Search', u'Social Annotations in Web Search', u'Theoretical Convergence Guarantees for Cooperative Coevolutionary Algorithms']
abs_14 = " ".join([abstracts[y] for y in range(len(year)) if year[y] == "2014"])
tok_14 = nltk.WordPunctTokenizer().tokenize(abs_14)
txt_14 = nltk.Text(tok_14)
abs_bm = " ".join([abstracts[y] for y in range(len(year)) if year[y] != "2014"])
tok_bm = nltk.WordPunctTokenizer().tokenize(abs_bm)
txt_bm = nltk.Text(tok_bm)
import pandas as pd
fd_14 = nltk.FreqDist(txt_14)
fd_bm = nltk.FreqDist(txt_bm)
data_14 = pd.DataFrame(fd_14.items())
data_14.columns = ['word', 'count14']
numwords = float(sum(data_14['count14']))
data_14['rel14'] = data_14['count14'] / numwords # Relative frequency in 2014
data_bm = pd.DataFrame(fd_bm.items())
data_bm.columns = ['word', 'countbm']
numwords = float(sum(data_bm['countbm']))
data_bm['relbm'] = data_bm['countbm'] / numwords # Relative frequency in the benchmark
data_comp = data_14.merge(data_bm)
data_comp['delta'] = data_comp.rel14 / data_comp.relbm # How many times more frequent in 2014
data_comp[data_comp.count14 > 1].sort("delta", ascending=False)
 | word | count14 | rel14 | countbm | relbm | delta
---|---|---|---|---|---|---
531 | √ | 5 | 0.000313 | 1 | 0.000006 | 50.754977 |
670 | retrospective | 4 | 0.000250 | 1 | 0.000006 | 40.603981 |
159 | course | 12 | 0.000751 | 3 | 0.000019 | 40.603981 |
598 | folding | 4 | 0.000250 | 1 | 0.000006 | 40.603981 |
587 | employees | 4 | 0.000250 | 1 | 0.000006 | 40.603981 |
893 | predicts | 3 | 0.000188 | 1 | 0.000006 | 30.452986 |
701 | %). | 3 | 0.000188 | 1 | 0.000006 | 30.452986 |
738 | causal | 3 | 0.000188 | 1 | 0.000006 | 30.452986 |
849 | kd | 3 | 0.000188 | 1 | 0.000006 | 30.452986 |
761 | convolutional | 3 | 0.000188 | 1 | 0.000006 | 30.452986 |
879 | parses | 3 | 0.000188 | 1 | 0.000006 | 30.452986 |
880 | particles | 3 | 0.000188 | 1 | 0.000006 | 30.452986 |
746 | competition | 3 | 0.000188 | 1 | 0.000006 | 30.452986 |
956 | troubleshooting | 3 | 0.000188 | 1 | 0.000006 | 30.452986 |
765 | cosine | 3 | 0.000188 | 1 | 0.000006 | 30.452986 |
942 | substitutes | 3 | 0.000188 | 1 | 0.000006 | 30.452986 |
933 | snapshot | 3 | 0.000188 | 1 | 0.000006 | 30.452986 |
830 | ignore | 3 | 0.000188 | 1 | 0.000006 | 30.452986 |
414 | taxonomy | 6 | 0.000376 | 2 | 0.000012 | 30.452986 |
812 | facts | 3 | 0.000188 | 1 | 0.000006 | 30.452986 |
608 | hijacking | 4 | 0.000250 | 2 | 0.000012 | 20.301991 |
403 | release | 6 | 0.000376 | 3 | 0.000019 | 20.301991 |
1288 | office | 2 | 0.000125 | 1 | 0.000006 | 20.301991 |
1374 | representativeness | 2 | 0.000125 | 1 | 0.000006 | 20.301991 |
1120 | dropped | 2 | 0.000125 | 1 | 0.000006 | 20.301991 |
1388 | shifted | 2 | 0.000125 | 1 | 0.000006 | 20.301991 |
1194 | habits | 2 | 0.000125 | 1 | 0.000006 | 20.301991 |
1285 | noting | 2 | 0.000125 | 1 | 0.000006 | 20.301991 |
1422 | synchronization | 2 | 0.000125 | 1 | 0.000006 | 20.301991 |
1299 | outsourcing | 2 | 0.000125 | 1 | 0.000006 | 20.301991 |
... | ... | ... | ... | ... | ... | ... |
1195 | hardware | 2 | 0.000125 | 50 | 0.000308 | 0.406040 |
783 | does | 3 | 0.000188 | 75 | 0.000463 | 0.406040 |
853 | latency | 3 | 0.000188 | 76 | 0.000469 | 0.400697 |
931 | size | 3 | 0.000188 | 76 | 0.000469 | 0.400697 |
1111 | difficult | 2 | 0.000125 | 51 | 0.000315 | 0.398078 |
1148 | examples | 2 | 0.000125 | 51 | 0.000315 | 0.398078 |
1429 | testing | 2 | 0.000125 | 52 | 0.000321 | 0.390423 |
1306 | parameters | 2 | 0.000125 | 52 | 0.000321 | 0.390423 |
1337 | programs | 2 | 0.000125 | 54 | 0.000333 | 0.375963 |
921 | security | 3 | 0.000188 | 82 | 0.000506 | 0.371378 |
1189 | give | 2 | 0.000125 | 55 | 0.000339 | 0.369127 |
1020 | because | 2 | 0.000125 | 55 | 0.000339 | 0.369127 |
1387 | server | 2 | 0.000125 | 55 | 0.000339 | 0.369127 |
1400 | sparse | 2 | 0.000125 | 56 | 0.000345 | 0.362536 |
1393 | similarity | 2 | 0.000125 | 56 | 0.000345 | 0.362536 |
612 | images | 4 | 0.000250 | 112 | 0.000691 | 0.362536 |
479 | machine | 5 | 0.000313 | 141 | 0.000870 | 0.359964 |
1254 | loss | 2 | 0.000125 | 57 | 0.000352 | 0.356175 |
950 | three | 3 | 0.000188 | 88 | 0.000543 | 0.346057 |
1199 | help | 2 | 0.000125 | 59 | 0.000364 | 0.344102 |
1062 | computer | 2 | 0.000125 | 62 | 0.000382 | 0.327451 |
1181 | functions | 2 | 0.000125 | 65 | 0.000401 | 0.312338 |
696 | will | 4 | 0.000250 | 131 | 0.000808 | 0.309954 |
294 | algorithms | 7 | 0.000438 | 231 | 0.001425 | 0.307606 |
1053 | common | 2 | 0.000125 | 69 | 0.000426 | 0.294232 |
1107 | detection | 2 | 0.000125 | 70 | 0.000432 | 0.290028 |
1330 | presents | 2 | 0.000125 | 70 | 0.000432 | 0.290028 |
1392 | similar | 2 | 0.000125 | 81 | 0.000500 | 0.250642 |
620 | language | 4 | 0.000250 | 173 | 0.001067 | 0.234705 |
1460 | without | 2 | 0.000125 | 92 | 0.000567 | 0.220674 |
1468 rows × 6 columns
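The raw delta ratio is dominated by words that happen to occur exactly once in the benchmark. Add-one smoothing damps those outliers (a hedged refinement; delta_smooth is a made-up column name, and the totals are restricted to the shared vocabulary of the merged frame):
n14 = float(data_comp['count14'].sum())
nbm = float(data_comp['countbm'].sum())
data_comp['delta_smooth'] = ((data_comp.count14 + 1) / n14) / ((data_comp.countbm + 1) / nbm)
print data_comp.sort("delta_smooth", ascending=False).head(10)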