#!/usr/bin/env python
# coding: utf-8

# In[2]:

import datetime
import os
import re

import langdetect
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from langdetect.lang_detect_exception import LangDetectException
from nltk.corpus import sentiwordnet as swn

get_ipython().run_line_magic('matplotlib', 'inline')
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')


# In[3]:

# Get Data

# Title between `# "` and the last `" (` of a tagline header line.
_TAGLINE_TITLE_RE = re.compile(r'(?<=# ").*(?=" \()')
# Four-digit year that directly follows `" (`.
_YEAR_RE = re.compile(r'(?<=" \()\d{4}')


def _parseTaglineHeader(rawLine):
    """Parse a '#' header line into ``(title, year)``.

    Returns None when the line holds non-ASCII bytes or does not match the
    title/year patterns, so the caller can reject the whole entry.
    """
    try:
        line = rawLine.decode('ascii')
    except UnicodeDecodeError:
        return None
    titleMatch = _TAGLINE_TITLE_RE.search(line)
    yearMatch = _YEAR_RE.search(line)
    if titleMatch is None or yearMatch is None:
        return None
    return titleMatch.group(0), int(yearMatch.group(0))


def _appendTagline(rows, header, tagLines):
    """Append ``[title, year, tagline]`` to *rows* if the entry is usable.

    An entry is dropped when it has no valid header, no tagline text, or when
    langdetect does not classify the text as English (including the case where
    it cannot classify the text at all).
    """
    if header is None or not tagLines:
        return
    tagText = ''.join(tagLines)
    try:
        if langdetect.detect(tagText) != 'en':
            return
    except LangDetectException:
        return
    # Strip html tags; the parser is named explicitly so the result does not
    # depend on which optional parsers happen to be installed.
    cleanTag = BeautifulSoup(tagText, 'html.parser').get_text()
    rows.append([header[0], header[1], cleanTag])


def _parseTaglines(filePath):
    """Read a taglines list file and return ``[title, year, tagline]`` rows.

    An entry starts at a line beginning with '#' and ends at the next blank
    line, the next header, or end of file. Lines are read as bytes and decoded
    as ASCII; an entry containing a non-ASCII header or tagline line is
    skipped. This keeps the "skip on UnicodeDecodeError" policy explicit
    instead of relying on implicit str/unicode coercion.
    """
    rows = []
    header = None   # (title, year) of the entry being read; None if none/rejected
    tagLines = []
    with open(filePath, 'rb') as f:
        for rawLine in f:
            isHeader = rawLine.startswith(b'#')
            if isHeader or rawLine == b'\n':
                # Entry boundary: store what was collected, then reset so the
                # same entry can never be stored twice.
                _appendTagline(rows, header, tagLines)
                header = _parseTaglineHeader(rawLine) if isHeader else None
                tagLines = []
            elif header is not None:
                try:
                    tagLines.append(rawLine.decode('ascii'))
                except UnicodeDecodeError:
                    header = None   # reject the whole entry
    # The last entry may not be followed by a blank line.
    _appendTagline(rows, header, tagLines)
    return rows


reload = False

# check if data already exists
if not os.path.isfile('taglines.pickle') or reload:
    filePath = '/Users/arimorcos/Databases/imdb/taglines.list'
    # Build the frame once from a list of rows; growing a DataFrame one
    # `.loc` row at a time copies the data on every insert.
    df = pd.DataFrame(_parseTaglines(filePath),
                      columns=['title', 'year', 'tagline'])
    df.to_pickle('taglines.pickle')
else:
    # NOTE: unpickling can execute arbitrary code; only load caches written
    # by this script.
    df = pd.read_pickle('taglines.pickle')


# In[13]:

# Get genres

# Everything between the first and the last double quote on the line.
_GENRE_TITLE_RE = re.compile(r'(?<=").*(?=")')
# Trailing run of letters/hyphens preceded by whitespace (e.g. "Sci-Fi").
_GENRE_RE = re.compile(r'(?<=\s)[A-Za-z-]+$')


def _parseGenres(filePath):
    """Read a genres list file and return ``[title, year, genre]`` rows.

    Only lines starting with a double quote are considered. Lines with
    non-ASCII bytes, or lines on which the title, year or genre pattern does
    not match, are skipped.
    """
    rows = []
    with open(filePath, 'rb') as f:
        for rawLine in f:
            if not rawLine.startswith(b'"'):
                continue
            try:
                line = rawLine.decode('ascii')
            except UnicodeDecodeError:
                continue
            titleMatch = _GENRE_TITLE_RE.search(line)
            yearMatch = _YEAR_RE.search(line)
            genreMatch = _GENRE_RE.search(line)
            if titleMatch is None or yearMatch is None or genreMatch is None:
                continue
            rows.append([titleMatch.group(0),
                         int(yearMatch.group(0)),
                         genreMatch.group(0)])
    return rows


genreReload = False

# check if data already exists
if not os.path.isfile('genres.pickle') or genreReload:
    filePath = '/Users/arimorcos/Databases/imdb/genres.list'
    genreDF = pd.DataFrame(_parseGenres(filePath),
                           columns=['title', 'year', 'genre'])
    genreDF.to_pickle('genres.pickle')
else:
    # NOTE: unpickling can execute arbitrary code; only load caches written
    # by this script.
    genreDF = pd.read_pickle('genres.pickle')


# In[7]:

# Keep a single genre row (the first one seen) per title/year pair.
genreDF = genreDF.drop_duplicates(subset=['title', 'year'])
genreDF.to_pickle('genres.pickle')


# In[8]:

# Merge on both keys: genreDF is unique per (title, year), and merging on the
# title alone would produce year_x/year_y columns and break the later
# groupby on 'year'. Skip the merge when a cached frame already has genres so
# re-running this cell does not create genre_x/genre_y.
if 'genre' not in df.columns:
    df = df.merge(genreDF, on=['title', 'year'], how='left')


# In[4]:

# Penn Treebank tags accepted for each single-letter part of speech found in
# a synset's repr.
# NOTE(review): any other letter (e.g. 's') never matches -- confirm intended.
_TAGS_BY_SYNSET_POS = {
    'n': ('NN',),
    'v': ('VBG', 'VBZ', 'VBP', 'MD'),
    'a': ('JJ', 'JJS'),
    'r': ('RB',),
}
# Single lowercase letter between a '.' and a '.NN' suffix, as in 'good.n.01'.
_SYNSET_POS_RE = re.compile(r'(?<=\.)[a-z](?=\.\d\d)')


def synSetMatch(synSet, POS):
    """Return True if *synSet*'s part of speech is compatible with tag *POS*.

    The part of speech is read from the synset's textual representation:
    ``unicode_repr()`` when the object provides it, ``repr()`` otherwise.
    Returns False when no part of speech can be found in that text.
    """
    describe = getattr(synSet, 'unicode_repr', None)
    text = describe() if describe is not None else repr(synSet)
    m = _SYNSET_POS_RE.search(text)
    if m is None:
        return False
    return POS in _TAGS_BY_SYNSET_POS.get(m.group(0), ())


# In[179]:

# Save
df.to_pickle('taglines.pickle')


# In[133]:

# Get word count for each tagline
# Column-wise apply does not depend on the frame having a 0..n-1 index.
df['wordCount'] = df['tagline'].apply(
    lambda tagline: len(nltk.wordpunct_tokenize(tagline)))


# In[4]:

# Get score for each tagline

# Only tokens tagged with one of these contribute to the score.
keepPOS = frozenset(['VBG', 'NN', 'VBZ', 'JJ', 'JJS', 'RB', 'MD', 'VBP'])


def _taglineScore(tagline):
    """Return the summed net sentiment of *tagline*.

    For every token whose POS tag is in ``keepPOS``, the positive and negative
    scores of all SentiWordNet synsets compatible with that tag are averaged,
    and (positive - negative) is added to the total. Tokens with no matching
    synset contribute nothing.
    """
    totalScore = 0.0
    for token, tag in nltk.pos_tag(nltk.wordpunct_tokenize(tagline)):
        if tag not in keepPOS:
            continue
        try:
            wordSet = [candidate for candidate in swn.senti_synsets(token)
                       if synSetMatch(candidate, tag)]
        except IndexError:
            # Best effort: a failed lookup just skips this token.
            continue
        if not wordSet:
            continue
        posScore = np.nanmean([candidate.pos_score() for candidate in wordSet])
        negScore = np.nanmean([candidate.neg_score() for candidate in wordSet])
        netScore = posScore - negScore
        if np.isnan(netScore):
            continue
        totalScore += netScore
    return totalScore


df['score'] = df['tagline'].apply(_taglineScore)


# In[5]:

df.shape


# In[8]:

df[df.genre == 'Musical'].shape


# In[6]:

def addCopyright(ax, xOffset=0, yOffset=0):
    """Write a copyright notice with the current year below axes *ax*.

    The text is anchored at (0 + xOffset, -0.05 + yOffset) in axes
    coordinates (``transform=ax.transAxes``). Axes that have an
    ``_axis3don`` attribute get the three-coordinate ``text`` call with z=0.
    """
    year = datetime.date.today().year
    label = r'$\copyright$ Ari Morcos ' + str(year)
    if hasattr(ax, '_axis3don'):
        ax.text(0 + xOffset, -.05 + yOffset, 0, label, transform=ax.transAxes)
    else:
        ax.text(0 + xOffset, -.05 + yOffset, label, transform=ax.transAxes)


# In[7]:

# %matplotlib inline
get_ipython().run_line_magic('pylab', 'inline')
x = df.groupby(['year'], as_index=False).agg({'wordCount': 'mean',
                                              'score': 'mean'})
ax = x.plot(x='year', y='score', legend=False, figsize=(20, 10), fontsize=20,
            lw=2.5, color='b')
ax.figure.autofmt_xdate()
ax.set_xlabel('Year', fontsize=30)
ax.set_ylabel('Mean Sentiment Score', fontsize=30)
addCopyright(ax, yOffset=-0.1)


# In[8]:

def getMeanScore(tempDF, minYear=1960):
    """Calculate the mean score for each year.

    Args:
        tempDF: frame with ``year`` and ``score`` columns.
        minYear: first year to report (default 1960).

    Returns:
        ``(avgScore, yearList)`` where ``yearList`` runs from *minYear* up to,
        but not including, the largest year in *tempDF*, and ``avgScore``
        holds the mean score per listed year (NaN for years without rows).
        NOTE(review): the last year is excluded because the range end is
        exclusive -- confirm this is intended.
    """
    maxYear = int(tempDF.year.max())
    yearList = list(range(minYear, maxYear))
    avgScore = [np.mean(tempDF[tempDF.year == year]['score'])
                for year in yearList]
    return avgScore, yearList


# In[12]:

avgScore, yearList = getMeanScore(df)


# In[13]:

# 5-year rolling mean; pd.rolling_mean was removed from pandas in favour of
# the Series.rolling API.
smoothScore = pd.Series(avgScore).rolling(5).mean()
plt.plot(yearList, smoothScore)


# In[9]:

uniqueGenres = df[df.genre.notnull()].genre.unique()
genreScores = {}
genreYearLists = {}
for genre in uniqueGenres:
    genreScores[genre], genreYearLists[genre] = getMeanScore(
        df[df.genre == genre])


# In[10]:

fig = plt.figure(figsize=(20, 10))
for genre in uniqueGenres:
    plt.plot(genreYearLists[genre], genreScores[genre], lw=2)
fig.autofmt_xdate()


# In[11]:
# Mean sentiment score and its standard error for every genre.
meanGenreScores = {}
semGenreScores = {}
for genre in uniqueGenres:
    subset = df[df.genre == genre]
    meanGenreScores[genre] = subset.score.mean()
    # np.sqrt rather than a bare `sqrt`, which only exists when the %pylab
    # magic has injected numpy names into the namespace.
    semGenreScores[genre] = subset.score.std() / np.sqrt(subset.shape[0])


# In[13]:

get_ipython().run_line_magic('pylab', 'inline')

# A single ordering (ascending mean score) drives the bar heights, the error
# bars and the tick labels, so the three can never get out of step.
genreOrder = sorted(meanGenreScores, key=meanGenreScores.get)
barPositions = list(range(len(genreOrder)))

fig = plt.figure(figsize=(20, 10))
plt.bar(barPositions,
        [meanGenreScores[genre] for genre in genreOrder],
        yerr=[semGenreScores[genre] for genre in genreOrder],
        align='center',
        ecolor='k',
        error_kw={'lw': 3})
plt.xticks(barPositions)
ax = plt.gca()
ax.set_xticklabels(genreOrder)
ax.set_xlim(left=-0.6, right=len(meanGenreScores) - 0.4)
fig.autofmt_xdate()
ax.set_ylabel('Mean Sentiment Score', fontsize=30)
for item in ax.get_xticklabels() + ax.get_yticklabels():
    item.set_fontsize(20)
addCopyright(ax, yOffset=-0.1)
plt.savefig('temp.png', transparent=True)


# In[29]:

# Standard errors in the same order as the plotted bars.
[semGenreScores[genre] for genre in genreOrder]


# In[16]:

# Get word distributions
from nltk.probability import FreqDist

taglineDists = {}
for genre in uniqueGenres:
    # concatenate all taglines of the genre and make them lowercase
    catTags = ' '.join(df[df.genre == genre]['tagline'].values).lower()
    # tokenize
    tagTokens = nltk.wordpunct_tokenize(catTags)
    # remove punctuation (keep purely alphabetic tokens)
    noPunct = [token for token in tagTokens if token.isalpha()]
    # create distribution
    taglineDists[genre] = FreqDist(noPunct)


# In[74]:

# Plot the 30 most frequent words of the Horror taglines.
taglineDists['Horror'].plot(30)