from IPython.display import HTML
# Notebook-only: render an (empty) HTML snippet in the output cell —
# presumably a leftover placeholder for custom notebook CSS; it has no
# effect on the analysis below. NOTE(review): safe to delete — confirm.
HTML('')
import json

# Load the pre-crawled Strata session abstracts (so the data does not have
# to be crawled again). Based on later usage, `abstracts` maps a year key
# (e.g. "2012") to a list of abstract strings — TODO confirm against the
# crawler that produced strata_abstracts.json.
with open('strata_abstracts.json') as f:
    abstracts = json.load(f)
import nltk

# Collocation scoring helpers and the English stopword list used throughout.
bigram_measures = nltk.collocations.BigramAssocMeasures()
stop = nltk.corpus.stopwords.words('english')

text = {}   # year -> nltk.Text over the raw token stream
words = {}  # year -> lowercased tokens with stopwords/punctuation removed
for year in abstracts:
    # One long string per year, tokenized on word/punctuation boundaries.
    raw = " ".join(abstracts[year])
    tokens = nltk.WordPunctTokenizer().tokenize(raw)
    text[year] = nltk.Text(tokens)
    cleaned = [w.lower() for w in text[year]]
    cleaned = [w for w in cleaned if w not in stop]
    # Drop punctuation tokens. A list comprehension (not filter()) is used so
    # words[year] is a real list: under Python 3, filter() returns a one-shot
    # iterator that would be silently exhausted after the first FreqDist pass.
    cleaned = [w for w in cleaned if w not in u'%,-:()$\/;?.’–“”']
    # Remove tokenizer debris: split contractions ("ll"), URL fragments,
    # and stray single characters/digits.
    cleaned = [w for w in cleaned if w not in ["ll", "II", "http", "://", "e", "g", "2", "0"]]
    words[year] = cleaned
# Bare expression: only displays anything when run in a notebook cell.
text["2012"]

# NLTK's built-in collocation summary for every year.
for year in text:
    print(year)
    text[year].collocations()
    print()
# Total and unique token counts per year (computed over the raw token
# stream, not the cleaned word lists).
numwords = {}
uniwords = {}
for year in text:
    numwords[year] = len(text[year])
    uniwords[year] = len(set(text[year]))
print(numwords)
print(uniwords)
import pandas as pd

# Build a word-frequency table with one Freq_<year> column per year.
# Successive years are inner-merged on "Word", so only words that appear in
# every year survive.
freq_table = pd.DataFrame()
for year in words:
    fd = nltk.FreqDist(words[year])
    # list(...) so the (word, count) pairs are concrete rows for pandas.
    year_df = pd.DataFrame(list(fd.items()), columns=["Word", "Freq_" + str(year)])
    if len(freq_table) == 0:
        freq_table = year_df
    else:
        freq_table = freq_table.merge(year_df)
print(freq_table[:10])

# Normalise raw counts to a percentage of each year's total token count.
for year in numwords:
    freq_table["Perc_" + year] = 100.0 * freq_table["Freq_" + year] / numwords[year]
# Year-over-year "growth index": a word's percentage share this year relative
# to the previous year, restricted to words with at least a 0.08% share.
for year in ["2012", "2013", "2014"]:
    print(year)
    prev = str(int(year) - 1)
    freq_table["Growth_" + year] = (
        100.0 * freq_table["Perc_" + year] / freq_table["Perc_" + prev]
    )
    cols = ["Word", "Freq_" + year, "Perc_" + year, "Growth_" + year]
    # .sort_values: DataFrame.sort(columns=...) was removed from pandas.
    tb = (freq_table[freq_table["Perc_" + year] >= 0.08]
          .sort_values(by="Growth_" + year, ascending=False)[cols])
    tb.columns = ["Word", "Freq", "Percent", "Index"]
    # Bracket indexing: tb.Index would collide with DataFrame attribute space.
    tb["Index"] = tb["Index"].round(1)
    tb["Percent"] = tb["Percent"].round(4)
    print(tb[:10])
from nltk.collocations import *

bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()

# Top-10 bigrams per year by raw frequency, from the cleaned word lists.
for year in ["2011", "2012", "2013", "2014"]:
    print("Bigrams " + str(year))
    finder = BigramCollocationFinder.from_words(words[year])
    scored = finder.score_ngrams(bigram_measures.raw_freq)
    print(pd.DataFrame(scored[:10]))

# Top-10 trigrams per year, from the raw (uncleaned) token stream.
for year in abstracts:
    print("Trigrams " + str(year))
    finder = TrigramCollocationFinder.from_words(text[year])
    scored = finder.score_ngrams(trigram_measures.raw_freq)
    print(pd.DataFrame(scored[:10]))
from collections import Counter
import pandas as pd

# Relative word frequencies per year. Years are outer-merged on "word", so a
# word missing from some year gets NaN there (filled with 0 below).
trending_words = pd.DataFrame()
for year in words:
    fdist = nltk.FreqDist(words[year])
    year_df = pd.DataFrame(list(fdist.items()), columns=["word", str(year)])
    if len(trending_words) == 0:
        trending_words = year_df
    else:
        trending_words = trending_words.merge(year_df, how="outer")
    # Normalise this year's counts to its share of the year total.
    trending_words[str(year)] = trending_words[str(year)] / float(trending_words[str(year)].sum())
print(trending_words[:10])

# Year-over-year growth ratios.
trending_words["plus12"] = trending_words["2012"] / trending_words["2011"]
trending_words["plus13"] = trending_words["2013"] / trending_words["2012"]
trending_words["plus14"] = trending_words["2014"] / trending_words["2013"]
trending_words = trending_words.fillna(0)

# Fastest-growing words per year; the minimum-share threshold and the
# "seen in 2011" condition cut out one-off noise.
print(trending_words[(trending_words["2012"] > 0.001) & (trending_words["2011"] > 0)].sort_values("plus12", ascending=False)[:10])
print()
print(trending_words[(trending_words["2013"] > 0.0005) & (trending_words["2011"] > 0)].sort_values("plus13", ascending=False)[:10])
print()
print(trending_words[(trending_words["2014"] > 0.0005) & (trending_words["2011"] > 0)].sort_values("plus14", ascending=False)[:10])
import pandas as pd

# Per-year relative bigram frequencies, inner-merged on "ngram" so only
# bigrams present in every year remain.
result = pd.DataFrame()
for year in words:
    finder = BigramCollocationFinder.from_words(words[year], window_size=2)
    # finder.apply_freq_filter(2)
    ignored_words = nltk.corpus.stopwords.words('english')
    # Drop any bigram containing a short token or a stopword.
    finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
    scores = finder.score_ngrams(bigram_measures.raw_freq)
    year_df = pd.DataFrame(scores, columns=["ngram", str(year)])
    if len(result) == 0:
        result = year_df
    else:
        result = result.merge(year_df)
print(result[:10])

# Year-over-year growth ratios for bigrams.
result["plus12"] = result["2012"] / result["2011"]
result["plus13"] = result["2013"] / result["2012"]
result["plus14"] = result["2014"] / result["2013"]

# .sort_values: DataFrame.sort(...) was removed from pandas.
print(result[result["2014"] > 0.0005].sort_values("plus14", ascending=False)[:10])
print()
print(result[result["2013"] > 0.0005].sort_values("plus13", ascending=False)[:10])
print()
print(result[result["2012"] > 0.0005].sort_values("plus12", ascending=False)[:10])
%matplotlib inline
import matplotlib.pyplot as plt
query = [("big", "data"), ("data", "science"), ("real", "time"), ("machine", "learning"), ("social", "media"), ("open", "source")]
query_results = result[result['ngram'].isin(query)][["2011", "2012", "2013", "2014"]].transpose()
query_results.columns = [" ".join(q) for q in query]
print query_results.plot(figsize=(10,5), title="Strata topics")
# IPython magic: run the external LDA topic-model script over the raw
# abstracts text file. Flags presumably select stemming (-s), stopword
# removal, and k=7 topics — confirm against lda.py's argument parser.
%run lda.py -f strata_abstracts.txt -s --stopwords -k 7
%matplotlib inline
import matplotlib.pyplot as plt
query = ["hadoop", "yarn", "storm"]
query = ["python", "julia", "r", "sas", "stata", "excel"]
query_results = trending_words[trending_words['word'].isin(query)][["word", "2011", "2012", "2013", "2014"]]
query_results = query_results.set_index(query_results['word']).drop("word", 1).transpose()
print query_results.plot(figsize=(10,6), title="Programming Langugages @ Strata Conferences 2011-2014")
# Industry / domain terms.
query = ["business", "energy", "advertising", "banking", "health", "politics",
         "government", "finance", "automotive"]
query_results = trending_words[trending_words['word'].isin(query)][["word", "2011", "2012", "2013", "2014"]]
# drop(columns=...): positional-axis drop("word", 1) was removed from pandas 2.x.
query_results = query_results.set_index(query_results['word']).drop(columns="word").transpose()
print(query_results.plot(figsize=(10,6), title="Strata topics"))
# Company mentions.
query = ["google", "facebook", "yahoo", "linkedin", "microsoft"]
query_results = trending_words[trending_words['word'].isin(query)][["word", "2011", "2012", "2013", "2014"]]
# drop(columns=...): positional-axis drop("word", 1) was removed from pandas 2.x.
query_results = query_results.set_index(query_results['word']).drop(columns="word").transpose()
print(query_results.plot(figsize=(10,6), title="Strata topics"))
# "Modern machine learning" terms, plotted and saved to disk.
query = ["modern", "machine", "learning"]
query_results = trending_words[trending_words['word'].isin(query)][["word", "2011", "2012", "2013", "2014"]]
# drop(columns=...): positional-axis drop("word", 1) was removed from pandas 2.x.
query_results = query_results.set_index(query_results['word']).drop(columns="word").transpose()
ax = query_results.plot(figsize=(10,6), title="Topics at Strata Conferences 2011-14")
print(ax)
# Save through the Axes' own figure instead of relying on pyplot's implicit
# "current figure", which may not be the one pandas just drew on.
ax.get_figure().savefig("Strata_ModernMachineLearning.png")