Analyze Worldwide Box Office Data with Seaborn and Python (Part 1)

In [1]:
!pip install plotly
Requirement already satisfied: plotly in d:\softwares\anaconda\lib\site-packages (4.11.0)
Requirement already satisfied: six in d:\softwares\anaconda\lib\site-packages (from plotly) (1.15.0)
Requirement already satisfied: retrying>=1.3.3 in d:\softwares\anaconda\lib\site-packages (from plotly) (1.3.3)

Libraries

In [2]:
import numpy as np
import pandas as pd
pd.set_option('max_columns', None)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('ggplot')
import datetime
from scipy import stats
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split, KFold
from wordcloud import WordCloud
from collections import Counter
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler
import nltk
nltk.download('stopwords')
stop = set(stopwords.words('english'))
import os
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import json
import ast
from urllib.request import urlopen
from PIL import Image
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\azam_\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!

Task 1: Data Loading and Exploration

In [3]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
In [4]:
train.head(1)
Out[4]:
id belongs_to_collection budget genres homepage imdb_id original_language original_title overview popularity poster_path production_companies production_countries release_date runtime spoken_languages status tagline title Keywords cast crew revenue
0 1 [{'id': 313576, 'name': 'Hot Tub Time Machine ... 14000000 [{'id': 35, 'name': 'Comedy'}] NaN tt2637294 en Hot Tub Time Machine 2 When Lou, who has become the "father of the In... 6.575393 /tQtWuwvMf0hCc2QR2tkolwl7c3c.jpg [{'name': 'Paramount Pictures', 'id': 4}, {'na... [{'iso_3166_1': 'US', 'name': 'United States o... 2/20/15 93.0 [{'iso_639_1': 'en', 'name': 'English'}] Released The Laws of Space and Time are About to be Vio... Hot Tub Time Machine 2 [{'id': 4379, 'name': 'time travel'}, {'id': 9... [{'cast_id': 4, 'character': 'Lou', 'credit_id... [{'credit_id': '59ac067c92514107af02c8c8', 'de... 12314651

Task 2: Visualizing the Target Distribution

In [5]:
train.revenue.hist()
Out[5]:
<matplotlib.axes._subplots.AxesSubplot at 0x2125af52970>
In [6]:
fig, ax = plt.subplots(figsize =[10,10])
plt.subplot(1,2,1)
sns.distplot(train["revenue"],kde= True);
plt.title("Distribution of revenue");
plt.subplot(1,2,2)
sns.distplot(np.log1p(train["revenue"]), kde= False);
plt.title("Distribution of log-transformed revenue");
In [7]:
train["log_revenue"] = np.log1p(train["revenue"])
In [ ]:
 

Task 3: Relationship between Film Revenue and Budget

In [8]:
plt.figure(figsize=(16,8))
plt.subplot(1,2,1)#First row, there are two columns and this plot is on the first.
sns.scatterplot(train["budget"],train["revenue"]);
plt.title("Revenue vs Budget")

plt.subplot(1,2,2)
sns.scatterplot(np.log1p(train["budget"]),train["log_revenue"]);
plt.title("Log Revenue vs Log Budget")
Out[8]:
Text(0.5, 1.0, 'Log Revenue vs Log Budget')
In [9]:
train["log_budget"] = np.log1p(train["budget"])
test["log_budget"] = np.log1p(test["budget"])

Task 4: Does having an Official Homepage Affect Revenue?

In [10]:
train["homepage"].value_counts().head(10)
Out[10]:
http://www.transformersmovie.com/                                                4
http://www.lordoftherings.net/                                                   2
http://www.thehobbit.com/                                                        2
http://baladatristedetrompeta.blogspot.com/                                      1
http://www.wim-wenders.com/movies/movies_spec/wingsofdesire/wingsofdesire.htm    1
http://www.filminfocus.com/film/in_bruges                                        1
http://movies.disney.com/frankenweenie-2012                                      1
http://www.mgm.com/view/movie/234/Quantum-of-Solace/                             1
http://www.anjaanaanjaani.erosentertainment.com                                  1
http://www.foxmovies.com/movies/spy                                              1
Name: homepage, dtype: int64
In [17]:
train["homepage"].count()
Out[17]:
946
In [11]:
train["has_homepage"] = 0
train.loc[train["homepage"].isnull() == False, "has_homepage"] = 1
test.loc[test["homepage"].isnull() == False, "has_homepage"] = 1
In [12]:
sns.catplot(x = "has_homepage", y= "revenue", data = train);
plt.title("Revenue for films with and without a homepage")
Out[12]:
Text(0.5, 1.0, 'Revenue for films with and without a homepage')

Task 5: Distribution of Languages in Film

In [13]:
print(train["original_language"].value_counts().head(10))
print(train["original_language"].value_counts().head(10).index)
en    2575
fr      78
ru      47
es      43
hi      42
ja      37
it      24
cn      20
ko      20
zh      19
Name: original_language, dtype: int64
Index(['en', 'fr', 'ru', 'es', 'hi', 'ja', 'it', 'cn', 'ko', 'zh'], dtype='object')
In [14]:
language_data = train.loc[train["original_language"].isin(train["original_language"].value_counts().head(10).index)]
print(language_data)
        id                              belongs_to_collection    budget  \
0        1  [{'id': 313576, 'name': 'Hot Tub Time Machine ...  14000000   
1        2  [{'id': 107674, 'name': 'The Princess Diaries ...  40000000   
2        3                                                NaN   3300000   
3        4                                                NaN   1200000   
4        5                                                NaN         0   
...    ...                                                ...       ...   
2994  2995                                                NaN         0   
2995  2996                                                NaN         0   
2997  2998                                                NaN  65000000   
2998  2999                                                NaN  42000000   
2999  3000                                                NaN  35000000   

                                                 genres  \
0                        [{'id': 35, 'name': 'Comedy'}]   
1     [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...   
2                         [{'id': 18, 'name': 'Drama'}]   
3     [{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...   
4     [{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...   
...                                                 ...   
2994                      [{'id': 18, 'name': 'Drama'}]   
2995  [{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...   
2997  [{'id': 80, 'name': 'Crime'}, {'id': 28, 'name...   
2998  [{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...   
2999  [{'id': 53, 'name': 'Thriller'}, {'id': 28, 'n...   

                               homepage    imdb_id original_language  \
0                                   NaN  tt2637294                en   
1                                   NaN  tt0368933                en   
2     http://sonyclassics.com/whiplash/  tt2582802                en   
3            http://kahaanithefilm.com/  tt1821480                hi   
4                                   NaN  tt1380152                ko   
...                                 ...        ...               ...   
2994                                NaN  tt0105327                en   
2995                                NaN  tt0109403                en   
2997                                NaN  tt0116908                en   
2998     http://www.alongcamepolly.com/  tt0343135                en   
2999   http://www.abductionthefilm.com/  tt1600195                en   

                                original_title  \
0                       Hot Tub Time Machine 2   
1     The Princess Diaries 2: Royal Engagement   
2                                     Whiplash   
3                                      Kahaani   
4                                         마린보이   
...                                        ...   
2994                               School Ties   
2995                                   Chasers   
2997                   The Long Kiss Goodnight   
2998                          Along Came Polly   
2999                                 Abduction   

                                               overview  popularity  \
0     When Lou, who has become the "father of the In...    6.575393   
1     Mia Thermopolis is now a college graduate and ...    8.248895   
2     Under the direction of a ruthless instructor, ...   64.299990   
3     Vidya Bagchi (Vidya Balan) arrives in Kolkata ...    3.174936   
4     Marine Boy is the story of a former national s...    1.148070   
...                                                 ...         ...   
2994  When David Greene receives a football scholars...    7.438381   
2995  Military men Rock Reilly and Eddie Devane are ...    9.853270   
2997  Samantha Caine, suburban homemaker, is the ide...   14.482345   
2998  Reuben Feffer is a guy who's spent his entire ...   15.725542   
2999  A young man sets out to uncover the truth abou...   10.512109   

                           poster_path  \
0     /tQtWuwvMf0hCc2QR2tkolwl7c3c.jpg   
1     /w9Z7A0GHEhIp7etpj0vyKOeU1Wx.jpg   
2     /lIv1QinFqz4dlp5U4lQ6HaiskOZ.jpg   
3     /aTXRaPrWSinhcmCrcfJK17urp3F.jpg   
4     /m22s7zvkVFDU9ir56PiiqIEWFdT.jpg   
...                                ...   
2994  /poV3j71mcmQkmjezc2H35xJsAhD.jpg   
2995  /j8Q7pQ27hvH54wpxJzIuQgQCdro.jpg   
2997  /4MENR8x6mYqnZvp2hGjSaPJz64J.jpg   
2998  /nIY4kvJTTnxoBR0wycrXng5MOYs.jpg   
2999  /cUT6NQP5LAJpmUoStGtXmvNt4zA.jpg   

                                   production_companies  \
0     [{'name': 'Paramount Pictures', 'id': 4}, {'na...   
1           [{'name': 'Walt Disney Pictures', 'id': 2}]   
2     [{'name': 'Bold Films', 'id': 2266}, {'name': ...   
3                                                   NaN   
4                                                   NaN   
...                                                 ...   
2994          [{'name': 'Paramount Pictures', 'id': 4}]   
2995  [{'name': 'Warner Bros.', 'id': 6194}, {'name'...   
2997  [{'name': 'New Line Cinema', 'id': 12}, {'name...   
2998  [{'name': 'Jersey Films', 'id': 216}, {'name':...   
2999  [{'name': 'Lions Gate Films', 'id': 35}, {'nam...   

                                   production_countries release_date  runtime  \
0     [{'iso_3166_1': 'US', 'name': 'United States o...      2/20/15     93.0   
1     [{'iso_3166_1': 'US', 'name': 'United States o...       8/6/04    113.0   
2     [{'iso_3166_1': 'US', 'name': 'United States o...     10/10/14    105.0   
3               [{'iso_3166_1': 'IN', 'name': 'India'}]       3/9/12    122.0   
4         [{'iso_3166_1': 'KR', 'name': 'South Korea'}]       2/5/09    118.0   
...                                                 ...          ...      ...   
2994  [{'iso_3166_1': 'US', 'name': 'United States o...      9/18/92    106.0   
2995  [{'iso_3166_1': 'US', 'name': 'United States o...      4/22/94    102.0   
2997  [{'iso_3166_1': 'US', 'name': 'United States o...     10/11/96    120.0   
2998  [{'iso_3166_1': 'US', 'name': 'United States o...      1/16/04     90.0   
2999  [{'iso_3166_1': 'US', 'name': 'United States o...      9/22/11    106.0   

                                       spoken_languages    status  \
0              [{'iso_639_1': 'en', 'name': 'English'}]  Released   
1              [{'iso_639_1': 'en', 'name': 'English'}]  Released   
2              [{'iso_639_1': 'en', 'name': 'English'}]  Released   
3     [{'iso_639_1': 'en', 'name': 'English'}, {'iso...  Released   
4              [{'iso_639_1': 'ko', 'name': '한국어/조선말'}]  Released   
...                                                 ...       ...   
2994           [{'iso_639_1': 'en', 'name': 'English'}]  Released   
2995           [{'iso_639_1': 'en', 'name': 'English'}]  Released   
2997           [{'iso_639_1': 'en', 'name': 'English'}]  Released   
2998           [{'iso_639_1': 'en', 'name': 'English'}]  Released   
2999           [{'iso_639_1': 'en', 'name': 'English'}]  Released   

                                                tagline  \
0     The Laws of Space and Time are About to be Vio...   
1     It can take a lifetime to find true love; she'...   
2       The road to greatness can take you to the edge.   
3                                                   NaN   
4                                                   NaN   
...                                                 ...   
2994  Just because you’re accepted doesn’t mean you ...   
2995  It was supposed to be a routine prisoner trans...   
2997               What's forgotten is not always gone.   
2998  For the most cautious man on Earth, life is ab...   
2999          They stole his life. He's taking it back.   

                                         title  \
0                       Hot Tub Time Machine 2   
1     The Princess Diaries 2: Royal Engagement   
2                                     Whiplash   
3                                      Kahaani   
4                                   Marine Boy   
...                                        ...   
2994                               School Ties   
2995                                   Chasers   
2997                   The Long Kiss Goodnight   
2998                          Along Came Polly   
2999                                 Abduction   

                                               Keywords  \
0     [{'id': 4379, 'name': 'time travel'}, {'id': 9...   
1     [{'id': 2505, 'name': 'coronation'}, {'id': 42...   
2     [{'id': 1416, 'name': 'jazz'}, {'id': 1523, 'n...   
3     [{'id': 10092, 'name': 'mystery'}, {'id': 1054...   
4                                                   NaN   
...                                                 ...   
2994  [{'id': 6075, 'name': 'sport'}, {'id': 10144, ...   
2995  [{'id': 378, 'name': 'prison'}, {'id': 572, 'n...   
2997  [{'id': 441, 'name': 'assassination'}, {'id': ...   
2998  [{'id': 966, 'name': 'beach'}, {'id': 2676, 'n...   
2999  [{'id': 591, 'name': 'cia'}, {'id': 822, 'name...   

                                                   cast  \
0     [{'cast_id': 4, 'character': 'Lou', 'credit_id...   
1     [{'cast_id': 1, 'character': 'Mia Thermopolis'...   
2     [{'cast_id': 5, 'character': 'Andrew Neimann',...   
3     [{'cast_id': 1, 'character': 'Vidya Bagchi', '...   
4     [{'cast_id': 3, 'character': 'Chun-soo', 'cred...   
...                                                 ...   
2994  [{'cast_id': 2, 'character': 'David Greene', '...   
2995  [{'cast_id': 2, 'character': 'Rock Reilly', 'c...   
2997  [{'cast_id': 10, 'character': 'Samantha Caine ...   
2998  [{'cast_id': 8, 'character': 'Reuben Feffer', ...   
2999  [{'cast_id': 2, 'character': 'Nathan Harper', ...   

                                                   crew    revenue  \
0     [{'credit_id': '59ac067c92514107af02c8c8', 'de...   12314651   
1     [{'credit_id': '52fe43fe9251416c7502563d', 'de...   95149435   
2     [{'credit_id': '54d5356ec3a3683ba0000039', 'de...   13092000   
3     [{'credit_id': '52fe48779251416c9108d6eb', 'de...   16000000   
4     [{'credit_id': '52fe464b9251416c75073b43', 'de...    3923970   
...                                                 ...        ...   
2994  [{'credit_id': '5637777ac3a3681b4d01f9f5', 'de...   14715067   
2995  [{'credit_id': '52fe4494c3a368484e02ac7d', 'de...    1596687   
2997  [{'credit_id': '52fe443a9251416c7502d579', 'de...   89456761   
2998  [{'credit_id': '556f817b9251410866000a63', 'de...  171963386   
2999  [{'credit_id': '5391990d0e0a260fb5001629', 'de...   82087155   

      log_revenue  log_budget  has_homepage  
0       16.326300   16.454568             0  
1       18.370959   17.504390             0  
2       16.387512   15.009433             1  
3       16.588099   13.997833             1  
4       15.182615    0.000000             0  
...           ...         ...           ...  
2994    16.504383    0.000000             0  
2995    14.283442    0.000000             0  
2997    18.309266   17.989898             0  
2998    18.962792   17.553180             1  
2999    18.223292   17.370859             1  

[2905 rows x 26 columns]
In [15]:
plt.figure(figsize=(8,4))
plt.subplot(1,2,1)
sns.boxplot(x = "original_language", y = "revenue", data = language_data)
plt.title("Mean revenue per language")

plt.subplot(1,2,2)
sns.boxplot(x = "original_language", y = "log_revenue", data = language_data)
plt.title("Mean log revenue per language")
Out[15]:
Text(0.5, 1.0, 'Mean log revenue per language')

Task 6: Frequent Words in Film Titles and Descriptions

In [16]:
stop = ['the','man', 'Last']
#mask = np.array(Image.open("wine glass.jpg"))
plt.figure(figsize=(12,12))
text = " ".join(train["original_title"].fillna("").values)
wordcloud = WordCloud(max_font_size = 30,  max_words= 10, background_color="white", 
                      width=1200,  height=800, stopwords = stop, contour_width=1, contour_color='firebrick').generate(text)
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
% contour_width=3, contour_color='firebrick' max_words
UsageError: Line magic function `%` not found.
In [ ]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
text="'azam', 'akram', 'ayesha', 'azam', 'ayesha', 'sameena', 'azizu', 'azizu', 'burhan', 'sameena', 'sameena', 'zakir', 'nazim'"
wordcloud = WordCloud(width=200,height=200,margin=0).generate(text)
plt.imshow(wordcloud)
plt.grid(False)
In [ ]:
text="'azam', 'akram', 'ayesha', 'azam', 'ayesha', 'sameena', 'azizu', 'azizu', 'burhan', 'sameena', 'sameena', 'zakir', 'nazim'"
wordcloud = WordCloud(width=480,height=480,margin=10,colormap="Blues",background_color="purple",#contour_width=100,contour_color='black'
                     ).generate(text)
plt.imshow(wordcloud)
plt.grid(False)
In [ ]:
import numpy as np; np.random.seed(0)
import seaborn as sns;
uniform_data = np.random.rand(10, 12)
ax = sns.heatmap(uniform_data,annot=True,linewidths=1)
In [ ]:
a=np.ones_like(uniform_data, dtype=np.bool)
a
In [ ]:
a[:,-1]=False
In [ ]:
ax = sns.heatmap(uniform_data,annot=True,linewidths=1,mask=a)
In [ ]:
b=np.ones_like(uniform_data, dtype=np.bool)
b[:,:-1]=False
b[0,:]=True
sns.heatmap(uniform_data,linewidths=1,annot=b*uniform_data,cmap='YlGnBu')
In [ ]:
sns.heatmap(uniform_data,linewidths=1,annot=a*uniform_data,cmap='YlGnBu')

Task 7: Do Film Descriptions Impact Revenue?

In [ ]:
!pip install eli5
In [ ]:
import eli5
from sklearn.linear_model import LinearRegression
In [ ]:
vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 2),
    min_df=5
)
overview_text = vectorizer.fit_transform(train["overview"].fillna(""))
linreg = LinearRegression()
linreg.fit(overview_text, train["log_revenue"])
eli5.show_weights(linreg, vec=vectorizer, top=20, feature_filter=lambda x: x!= "<BIAS>")
In [ ]:
 
In [ ]: