The data is from:
Mahesh Joshi, Dipanjan Das, Kevin Gimpel, and Noah A. Smith. Movie Reviews and Revenues: An Experiment in Text Regression. In Proceedings of the North American Chapter of the Association for Computational Linguistics Human Language Technologies Conference, Los Angeles, CA, June 2010.
%matplotlib inline
import scattertext as st
import re, io
from pprint import pprint
import pandas as pd
import numpy as np
import spacy.en
import seaborn as sns
from urllib.request import urlopen
from IPython.display import IFrame
from IPython.core.display import display, HTML
# Inject CSS that stretches the notebook container to 98% of the page width
# (presumably so the wide explorer iframes rendered below fit -- confirm).
display(HTML("<style>.container { width:98% !important; }</style>"))
# Shared spaCy English pipeline; passed as `nlp` wherever a corpus is built.
nlp = spacy.en.English()
def draw_corpus(df, corpus, category, other_category, category_col, extra='', scores=None,
                singleScoreMode=False, minimum_term_frequency=2, term_ranker=None):
    """Write a Scattertext explorer for ``corpus`` to an HTML file and embed it.

    Args:
        df: DataFrame the corpus was built from; its ``'metadata'`` column is
            passed to the explorer as per-document metadata.
        corpus: Scattertext corpus to visualise.
        category: Category value plotted as the "category" axis.
        other_category: Category value plotted as the "not category" axis.
        category_col: Name of the category column. Not used here; kept so the
            signature parallels ``draw_plot``.
        extra: Suffix appended to the output file name (before ``.html``).
        scores: Optional term scores forwarded to the explorer.
        singleScoreMode: Forwarded to the explorer.
        minimum_term_frequency: Forwarded to the explorer.
        term_ranker: Optional term-ranker class (e.g.
            ``st.termranking.OncePerDocFrequencyRanker``). When ``None`` the
            argument is omitted so the explorer's own default applies.

    Returns:
        An ``IFrame`` pointing at the HTML file that was written.
    """
    explorer_kwargs = dict(
        category=category,
        category_name=category.lower() + ' Term',
        not_category_name=other_category.lower() + ' Term',
        pmi_filter_thresold=6,
        minimum_term_frequency=minimum_term_frequency,
        metadata=df['metadata'],
        scores=scores,
        width_in_pixels=1000,
        singleScoreMode=singleScoreMode,
        use_full_doc=False,
    )
    # Only forward term_ranker when the caller asked for one, so existing
    # calls behave exactly as before.
    if term_ranker is not None:
        explorer_kwargs['term_ranker'] = term_ranker
    html = st.produce_scattertext_explorer(corpus, **explorer_kwargs)

    file_name = category.lower() + '-' + other_category.lower() + extra + '.html'
    # Context manager guarantees the file is flushed and closed before the
    # IFrame tries to load it.
    with open(file_name, 'wb') as out_file:
        out_file.write(html.encode('utf-8'))
    return IFrame(src=file_name, width=1200, height=1000)


def draw_plot(df, category, other_category, category_col, extra='', minimum_term_frequency=3,
              term_ranker=None):
    """Build a two-category corpus from ``df`` and draw its Scattertext explorer.

    Args:
        df: DataFrame with a ``'text'`` column, a ``'metadata'`` column and
            the column named by ``category_col``.
        category: Category value to contrast against ``other_category``.
        other_category: The second category value.
        category_col: Name of the column holding the category labels.
        extra: Suffix appended to the output file name.
        minimum_term_frequency: Forwarded to ``draw_corpus``.
        term_ranker: Optional term-ranker class forwarded to ``draw_corpus``.

    Returns:
        The ``IFrame`` returned by ``draw_corpus``.
    """
    # Keep only rows belonging to one of the two categories being compared.
    in_either_category = (df[category_col] == category) | (df[category_col] == other_category)
    category_vs_other_df = df[in_either_category]
    corpus = st.CorpusFromPandas(category_vs_other_df,
                                 category_col=category_col,
                                 text_col='text',
                                 nlp=nlp).build()
    return draw_corpus(category_vs_other_df, corpus, category, other_category, category_col,
                       extra=extra,
                       minimum_term_frequency=minimum_term_frequency,
                       term_ranker=term_ranker)
def _revenue_level(percentile):
    """Label a revenue percentile as 'High', 'Mid' or 'Low' Revenue (tercile cut-offs)."""
    if percentile >= 2./3:
        bucket = 'High'
    elif percentile <= 1./3:
        bucket = 'Low'
    else:
        bucket = 'Mid'
    return bucket + ' Revenue'


df = pd.read_csv('movies_and_revenue.csv')
df['revenue_level'] = df['revenue percentile'].apply(_revenue_level)
# Human-readable one-line description of each row, built from name, rating and revenue.
formatted_revenue = df['revenue'].map('${:,.0f}'.format)
df['metadata'] = df['name'] + ' Rated: ' + df['rating'] + ' Made: ' + formatted_revenue
# Only the training split is analysed from here on.
is_train = df['split'] == 'train'
df = df[is_train]
print('Number of review', len(df))
print('Number of movies', len(df['id'].unique()))
import matplotlib.pyplot as plt
plt.figure(figsize=(15, 5))
# Revenue distribution per rating, restricted to this fixed set of rating labels.
plotted_ratings = {'R', 'G', 'PG', 'PG-13', 'NC-17'}
sns.boxplot(x="rating", y="revenue", data=df[df['rating'].isin(plotted_ratings)])
Number of review 4779 Number of movies 1147
<matplotlib.axes._subplots.AxesSubplot at 0x11d961358>
df_pg13 = df[df['rating'] == 'PG-13']
print(len(df_pg13))
# Recompute revenue_level relative to PG-13 movies only.
# One row per movie id (first occurrence), so a movie with many reviews is
# ranked once rather than once per review.
pg13_revenue_by_movie = df_pg13.drop_duplicates('id').set_index('id')['revenue']
pg13_revenue_level = (
    pg13_revenue_by_movie
    .rank(pct=True)
    .apply(lambda x: 'High Revenue' if x > 2./3 else ('Low Revenue' if x < 1./3 else 'Mid Revenue'))
    .to_frame('revenue_level')
)
# Swap the whole-dataset revenue_level for the PG-13-relative one, matched on movie id.
df_pg13 = df_pg13.drop('revenue_level', axis=1).join(pg13_revenue_level, on='id')
df_pg13.iloc[0]
1383
id 1408 revenue 2.06177e+07 split train text Based on a Stephen King short story that bears... revenue percentile 0.854363 label True name 1408 number_of_screens 2678 origin USA rating PG-13 metadata 1408 Rated: PG-13 Made: $20,617,667 revenue_level High Revenue Name: 7, dtype: object
# Naive view: each row of df_pg13 is treated as its own document.
draw_plot(
    df=df_pg13,
    category='High Revenue',
    other_category='Low Revenue',
    category_col='revenue_level',
    extra='naive',
    minimum_term_frequency=10,
)
# Collapse all rows of the same movie into a single document: the per-movie
# columns become the group key and the texts are joined with blank lines.
movie_key_cols = ['id', 'revenue', 'revenue_level', 'rating', 'name', 'metadata']
texts_by_movie = df_pg13.groupby(movie_key_cols)['text'].agg('\n\n\n'.join)
one_movie_per_doc_df = texts_by_movie.reset_index()
one_movie_per_doc_df.iloc[0]
id 1408 revenue 2.06177e+07 revenue_level High Revenue rating PG-13 name 1408 metadata 1408 Rated: PG-13 Made: $20,617,667 text Based on a Stephen King short story that bears... Name: 0, dtype: object
# Same comparison, but over the one-document-per-movie frame.
draw_plot(
    df=one_movie_per_doc_df,
    category='High Revenue',
    other_category='Low Revenue',
    category_col='revenue_level',
    extra='onemoveperdoc',
    # Count a term at most once per document when ranking by frequency.
    term_ranker=st.termranking.OncePerDocFrequencyRanker,
    minimum_term_frequency=8,
)
# These names were previously undefined in this cell (NameError); they are
# now set explicitly to the two revenue levels being compared.
category = 'High Revenue'
other_category = 'Low Revenue'
extra = 'similarity'  # suffix for the output file name
target_term = 'war'

# Build the corpus and the metadata from the SAME filtered frame so that
# each document lines up with its metadata entry.
high_low_df = df[df['revenue_level'].isin((category, other_category))]
corpus = st.CorpusFromPandas(high_low_df,
                             category_col='revenue_level',
                             text_col='text',
                             nlp=nlp).build()

html = st.word_similarity_explorer(corpus,
                                   category=category,
                                   category_name=category.lower() + ' Term',
                                   not_category_name=other_category.lower() + ' Term',
                                   pmi_filter_thresold=6,
                                   minimum_term_frequency=10,
                                   metadata=high_low_df['metadata'],
                                   target_term=target_term,
                                   width_in_pixels=1200,
                                   singleScoreMode=True,
                                   use_full_doc=False,
                                   alpha=0.01,
                                   max_p_val=0.05,
                                   term_ranker=st.termranking.OncePerDocFrequencyRanker)
file_name = category.lower() + '-' + other_category.lower() + extra + '.html'
# Context manager closes the file before the IFrame loads it.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
# Top-level cell code cannot `return`; a trailing expression is what the
# notebook displays.
IFrame(src=file_name, width=1400, height=1000)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-25-88f81cff6846> in <module>() 1 target_term = 'war' 2 html = st.word_similarity_explorer(corpus, ----> 3 category=category, 4 category_name=category.lower() +' Term', 5 not_category_name=other_category.lower() + ' Term', NameError: name 'category' is not defined