!pip install plotly
Requirement already satisfied: plotly in d:\softwares\anaconda\lib\site-packages (4.11.0) Requirement already satisfied: six in d:\softwares\anaconda\lib\site-packages (from plotly) (1.15.0) Requirement already satisfied: retrying>=1.3.3 in d:\softwares\anaconda\lib\site-packages (from plotly) (1.3.3)
import numpy as np
import pandas as pd
# Show every column when a DataFrame is displayed (None = no column limit).
# The fully qualified option name is required: the bare 'max_columns' pattern
# is ambiguous on pandas versions that define more than one "*max_columns"
# option (e.g. styler.render.max_columns) and then raises an OptionError.
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('ggplot')
import datetime
from scipy import stats
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split, KFold
from wordcloud import WordCloud
from collections import Counter
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import StandardScaler
import nltk
nltk.download('stopwords')
stop = set(stopwords.words('english'))
import os
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import json
import ast
from urllib.request import urlopen
from PIL import Image
[nltk_data] Downloading package stopwords to [nltk_data] C:\Users\azam_\AppData\Roaming\nltk_data... [nltk_data] Package stopwords is already up-to-date!
# Load the training and test splits from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))
# Show the first training row to inspect the column layout.
train.head(1)
id | belongs_to_collection | budget | genres | homepage | imdb_id | original_language | original_title | overview | popularity | poster_path | production_companies | production_countries | release_date | runtime | spoken_languages | status | tagline | title | Keywords | cast | crew | revenue | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | [{'id': 313576, 'name': 'Hot Tub Time Machine ... | 14000000 | [{'id': 35, 'name': 'Comedy'}] | NaN | tt2637294 | en | Hot Tub Time Machine 2 | When Lou, who has become the "father of the In... | 6.575393 | /tQtWuwvMf0hCc2QR2tkolwl7c3c.jpg | [{'name': 'Paramount Pictures', 'id': 4}, {'na... | [{'iso_3166_1': 'US', 'name': 'United States o... | 2/20/15 | 93.0 | [{'iso_639_1': 'en', 'name': 'English'}] | Released | The Laws of Space and Time are About to be Vio... | Hot Tub Time Machine 2 | [{'id': 4379, 'name': 'time travel'}, {'id': 9... | [{'cast_id': 4, 'character': 'Lou', 'credit_id... | [{'credit_id': '59ac067c92514107af02c8c8', 'de... | 12314651 |
# Histogram of the raw revenue column.
train["revenue"].hist()
<matplotlib.axes._subplots.AxesSubplot at 0x2125af52970>
# Raw revenue next to its log1p transform, as two side-by-side panels.
# plt.figure is used instead of plt.subplots so that no extra full-figure Axes
# is created underneath the panels added by plt.subplot; recent Matplotlib
# releases no longer remove such an overlapped Axes automatically.
fig = plt.figure(figsize=(10, 10))
ax = plt.subplot(1, 2, 1)  # left panel: raw revenue with a KDE curve
sns.distplot(train["revenue"], kde=True);
plt.title("Distribution of revenue");
plt.subplot(1, 2, 2)  # right panel: log1p(revenue), histogram only
sns.distplot(np.log1p(train["revenue"]), kde=False);
plt.title("Distribution of log-transformed revenue");
# Store the transformed revenue as its own column for the cells that follow.
train["log_revenue"] = np.log1p(train["revenue"])
# Revenue against budget, on the raw scale (left) and the log1p scale (right).
plt.figure(figsize=(16, 8))
plt.subplot(1, 2, 1)  # one row, two columns; this is the first panel
# x and y are passed by keyword: newer seaborn releases no longer accept them
# as positional arguments, while keywords work on older releases too.
sns.scatterplot(x=train["budget"], y=train["revenue"]);
plt.title("Revenue vs Budget")
plt.subplot(1, 2, 2)
sns.scatterplot(x=np.log1p(train["budget"]), y=train["log_revenue"]);
plt.title("Log Revenue vs Log Budget")
Text(0.5, 1.0, 'Log Revenue vs Log Budget')
# Add a log1p-transformed budget column to both the train and test frames.
for frame in (train, test):
    frame["log_budget"] = np.log1p(frame["budget"])
# Ten most frequent homepage URLs in the training data.
train["homepage"].value_counts().head(10)
http://www.transformersmovie.com/ 4 http://www.lordoftherings.net/ 2 http://www.thehobbit.com/ 2 http://baladatristedetrompeta.blogspot.com/ 1 http://www.wim-wenders.com/movies/movies_spec/wingsofdesire/wingsofdesire.htm 1 http://www.filminfocus.com/film/in_bruges 1 http://movies.disney.com/frankenweenie-2012 1 http://www.mgm.com/view/movie/234/Quantum-of-Solace/ 1 http://www.anjaanaanjaani.erosentertainment.com 1 http://www.foxmovies.com/movies/spy 1 Name: homepage, dtype: int64
# Number of training rows that have a non-null homepage.
train.homepage.count()
946
# Binary flag: 1 when a film lists a homepage, 0 otherwise.
# The column is initialised to 0 on BOTH frames before the 1s are filled in;
# without that default, test rows lacking a homepage would end up as NaN
# instead of 0 and the column would differ between train and test.
for frame in (train, test):
    frame["has_homepage"] = 0
    frame.loc[~frame["homepage"].isnull(), "has_homepage"] = 1
sns.catplot(x="has_homepage", y="revenue", data=train);
plt.title("Revenue for films with and without a homepage")
Text(0.5, 1.0, 'Revenue for films with and without a homepage')
# Ten most common original languages: first the counts, then only the codes.
top_languages = train["original_language"].value_counts().head(10)
print(top_languages)
print(top_languages.index)
en 2575 fr 78 ru 47 es 43 hi 42 ja 37 it 24 cn 20 ko 20 zh 19 Name: original_language, dtype: int64 Index(['en', 'fr', 'ru', 'es', 'hi', 'ja', 'it', 'cn', 'ko', 'zh'], dtype='object')
# Keep only the films whose original language is among the ten most frequent.
top_language_codes = train["original_language"].value_counts().head(10).index
in_top_languages = train["original_language"].isin(top_language_codes)
language_data = train.loc[in_top_languages]
print(language_data)
id belongs_to_collection budget \ 0 1 [{'id': 313576, 'name': 'Hot Tub Time Machine ... 14000000 1 2 [{'id': 107674, 'name': 'The Princess Diaries ... 40000000 2 3 NaN 3300000 3 4 NaN 1200000 4 5 NaN 0 ... ... ... ... 2994 2995 NaN 0 2995 2996 NaN 0 2997 2998 NaN 65000000 2998 2999 NaN 42000000 2999 3000 NaN 35000000 genres \ 0 [{'id': 35, 'name': 'Comedy'}] 1 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... 2 [{'id': 18, 'name': 'Drama'}] 3 [{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n... 4 [{'id': 28, 'name': 'Action'}, {'id': 53, 'nam... ... ... 2994 [{'id': 18, 'name': 'Drama'}] 2995 [{'id': 35, 'name': 'Comedy'}, {'id': 10749, '... 2997 [{'id': 80, 'name': 'Crime'}, {'id': 28, 'name... 2998 [{'id': 35, 'name': 'Comedy'}, {'id': 10749, '... 2999 [{'id': 53, 'name': 'Thriller'}, {'id': 28, 'n... homepage imdb_id original_language \ 0 NaN tt2637294 en 1 NaN tt0368933 en 2 http://sonyclassics.com/whiplash/ tt2582802 en 3 http://kahaanithefilm.com/ tt1821480 hi 4 NaN tt1380152 ko ... ... ... ... 2994 NaN tt0105327 en 2995 NaN tt0109403 en 2997 NaN tt0116908 en 2998 http://www.alongcamepolly.com/ tt0343135 en 2999 http://www.abductionthefilm.com/ tt1600195 en original_title \ 0 Hot Tub Time Machine 2 1 The Princess Diaries 2: Royal Engagement 2 Whiplash 3 Kahaani 4 마린보이 ... ... 2994 School Ties 2995 Chasers 2997 The Long Kiss Goodnight 2998 Along Came Polly 2999 Abduction overview popularity \ 0 When Lou, who has become the "father of the In... 6.575393 1 Mia Thermopolis is now a college graduate and ... 8.248895 2 Under the direction of a ruthless instructor, ... 64.299990 3 Vidya Bagchi (Vidya Balan) arrives in Kolkata ... 3.174936 4 Marine Boy is the story of a former national s... 1.148070 ... ... ... 2994 When David Greene receives a football scholars... 7.438381 2995 Military men Rock Reilly and Eddie Devane are ... 9.853270 2997 Samantha Caine, suburban homemaker, is the ide... 14.482345 2998 Reuben Feffer is a guy who's spent his entire ... 
15.725542 2999 A young man sets out to uncover the truth abou... 10.512109 poster_path \ 0 /tQtWuwvMf0hCc2QR2tkolwl7c3c.jpg 1 /w9Z7A0GHEhIp7etpj0vyKOeU1Wx.jpg 2 /lIv1QinFqz4dlp5U4lQ6HaiskOZ.jpg 3 /aTXRaPrWSinhcmCrcfJK17urp3F.jpg 4 /m22s7zvkVFDU9ir56PiiqIEWFdT.jpg ... ... 2994 /poV3j71mcmQkmjezc2H35xJsAhD.jpg 2995 /j8Q7pQ27hvH54wpxJzIuQgQCdro.jpg 2997 /4MENR8x6mYqnZvp2hGjSaPJz64J.jpg 2998 /nIY4kvJTTnxoBR0wycrXng5MOYs.jpg 2999 /cUT6NQP5LAJpmUoStGtXmvNt4zA.jpg production_companies \ 0 [{'name': 'Paramount Pictures', 'id': 4}, {'na... 1 [{'name': 'Walt Disney Pictures', 'id': 2}] 2 [{'name': 'Bold Films', 'id': 2266}, {'name': ... 3 NaN 4 NaN ... ... 2994 [{'name': 'Paramount Pictures', 'id': 4}] 2995 [{'name': 'Warner Bros.', 'id': 6194}, {'name'... 2997 [{'name': 'New Line Cinema', 'id': 12}, {'name... 2998 [{'name': 'Jersey Films', 'id': 216}, {'name':... 2999 [{'name': 'Lions Gate Films', 'id': 35}, {'nam... production_countries release_date runtime \ 0 [{'iso_3166_1': 'US', 'name': 'United States o... 2/20/15 93.0 1 [{'iso_3166_1': 'US', 'name': 'United States o... 8/6/04 113.0 2 [{'iso_3166_1': 'US', 'name': 'United States o... 10/10/14 105.0 3 [{'iso_3166_1': 'IN', 'name': 'India'}] 3/9/12 122.0 4 [{'iso_3166_1': 'KR', 'name': 'South Korea'}] 2/5/09 118.0 ... ... ... ... 2994 [{'iso_3166_1': 'US', 'name': 'United States o... 9/18/92 106.0 2995 [{'iso_3166_1': 'US', 'name': 'United States o... 4/22/94 102.0 2997 [{'iso_3166_1': 'US', 'name': 'United States o... 10/11/96 120.0 2998 [{'iso_3166_1': 'US', 'name': 'United States o... 1/16/04 90.0 2999 [{'iso_3166_1': 'US', 'name': 'United States o... 9/22/11 106.0 spoken_languages status \ 0 [{'iso_639_1': 'en', 'name': 'English'}] Released 1 [{'iso_639_1': 'en', 'name': 'English'}] Released 2 [{'iso_639_1': 'en', 'name': 'English'}] Released 3 [{'iso_639_1': 'en', 'name': 'English'}, {'iso... Released 4 [{'iso_639_1': 'ko', 'name': '한국어/조선말'}] Released ... ... ... 
2994 [{'iso_639_1': 'en', 'name': 'English'}] Released 2995 [{'iso_639_1': 'en', 'name': 'English'}] Released 2997 [{'iso_639_1': 'en', 'name': 'English'}] Released 2998 [{'iso_639_1': 'en', 'name': 'English'}] Released 2999 [{'iso_639_1': 'en', 'name': 'English'}] Released tagline \ 0 The Laws of Space and Time are About to be Vio... 1 It can take a lifetime to find true love; she'... 2 The road to greatness can take you to the edge. 3 NaN 4 NaN ... ... 2994 Just because you’re accepted doesn’t mean you ... 2995 It was supposed to be a routine prisoner trans... 2997 What's forgotten is not always gone. 2998 For the most cautious man on Earth, life is ab... 2999 They stole his life. He's taking it back. title \ 0 Hot Tub Time Machine 2 1 The Princess Diaries 2: Royal Engagement 2 Whiplash 3 Kahaani 4 Marine Boy ... ... 2994 School Ties 2995 Chasers 2997 The Long Kiss Goodnight 2998 Along Came Polly 2999 Abduction Keywords \ 0 [{'id': 4379, 'name': 'time travel'}, {'id': 9... 1 [{'id': 2505, 'name': 'coronation'}, {'id': 42... 2 [{'id': 1416, 'name': 'jazz'}, {'id': 1523, 'n... 3 [{'id': 10092, 'name': 'mystery'}, {'id': 1054... 4 NaN ... ... 2994 [{'id': 6075, 'name': 'sport'}, {'id': 10144, ... 2995 [{'id': 378, 'name': 'prison'}, {'id': 572, 'n... 2997 [{'id': 441, 'name': 'assassination'}, {'id': ... 2998 [{'id': 966, 'name': 'beach'}, {'id': 2676, 'n... 2999 [{'id': 591, 'name': 'cia'}, {'id': 822, 'name... cast \ 0 [{'cast_id': 4, 'character': 'Lou', 'credit_id... 1 [{'cast_id': 1, 'character': 'Mia Thermopolis'... 2 [{'cast_id': 5, 'character': 'Andrew Neimann',... 3 [{'cast_id': 1, 'character': 'Vidya Bagchi', '... 4 [{'cast_id': 3, 'character': 'Chun-soo', 'cred... ... ... 2994 [{'cast_id': 2, 'character': 'David Greene', '... 2995 [{'cast_id': 2, 'character': 'Rock Reilly', 'c... 2997 [{'cast_id': 10, 'character': 'Samantha Caine ... 2998 [{'cast_id': 8, 'character': 'Reuben Feffer', ... 2999 [{'cast_id': 2, 'character': 'Nathan Harper', ... 
crew revenue \ 0 [{'credit_id': '59ac067c92514107af02c8c8', 'de... 12314651 1 [{'credit_id': '52fe43fe9251416c7502563d', 'de... 95149435 2 [{'credit_id': '54d5356ec3a3683ba0000039', 'de... 13092000 3 [{'credit_id': '52fe48779251416c9108d6eb', 'de... 16000000 4 [{'credit_id': '52fe464b9251416c75073b43', 'de... 3923970 ... ... ... 2994 [{'credit_id': '5637777ac3a3681b4d01f9f5', 'de... 14715067 2995 [{'credit_id': '52fe4494c3a368484e02ac7d', 'de... 1596687 2997 [{'credit_id': '52fe443a9251416c7502d579', 'de... 89456761 2998 [{'credit_id': '556f817b9251410866000a63', 'de... 171963386 2999 [{'credit_id': '5391990d0e0a260fb5001629', 'de... 82087155 log_revenue log_budget has_homepage 0 16.326300 16.454568 0 1 18.370959 17.504390 0 2 16.387512 15.009433 1 3 16.588099 13.997833 1 4 15.182615 0.000000 0 ... ... ... ... 2994 16.504383 0.000000 0 2995 14.283442 0.000000 0 2997 18.309266 17.989898 0 2998 18.962792 17.553180 1 2999 18.223292 17.370859 1 [2905 rows x 26 columns]
# Box plots of revenue by original language: raw scale on the left panel,
# log scale on the right panel.
plt.figure(figsize=(8, 4))
left_panel = plt.subplot(1, 2, 1)
sns.boxplot(x="original_language", y="revenue", data=language_data, ax=left_panel)
left_panel.set_title("Mean revenue per language")
right_panel = plt.subplot(1, 2, 2)
sns.boxplot(x="original_language", y="log_revenue", data=language_data, ax=right_panel)
right_panel.set_title("Mean log revenue per language")
Text(0.5, 1.0, 'Mean log revenue per language')
# Word cloud of the original film titles.
# NOTE: this rebinds `stop`, replacing the NLTK English stopword set created
# during setup with a short custom list of words to leave out of the cloud.
stop = ['the', 'man', 'Last']
#mask = np.array(Image.open("wine glass.jpg"))
plt.figure(figsize=(12, 12))
# Join all titles into one string; missing titles contribute an empty string.
text = " ".join(train["original_title"].fillna("").values)
wordcloud = WordCloud(
    max_font_size=30,
    max_words=10,
    background_color="white",
    width=1200,
    height=800,
    stopwords=stop,
    contour_width=1,
    contour_color='firebrick',
).generate(text)
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
# Scratch note on other WordCloud settings: contour_width=3, contour_color='firebrick', max_words
# (kept as a comment: a bare line starting with "%" is parsed by IPython as a
# line magic and fails with "UsageError: Line magic function `%` not found").
UsageError: Line magic function `%` not found.
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Small demo word cloud built from a hand-written list of quoted names.
text = "'azam', 'akram', 'ayesha', 'azam', 'ayesha', 'sameena', 'azizu', 'azizu', 'burhan', 'sameena', 'sameena', 'zakir', 'nazim'"
small_cloud = WordCloud(width=200, height=200, margin=0)
wordcloud = small_cloud.generate(text)
plt.imshow(wordcloud)
plt.grid(False)  # no grid lines over the image
# Same demo text, rendered larger with a blue colormap on a purple background.
text = "'azam', 'akram', 'ayesha', 'azam', 'ayesha', 'sameena', 'azizu', 'azizu', 'burhan', 'sameena', 'sameena', 'zakir', 'nazim'"
cloud_options = dict(
    width=480,
    height=480,
    margin=10,
    colormap="Blues",
    background_color="purple",
    # contour_width=100, contour_color='black'  (left disabled)
)
wordcloud = WordCloud(**cloud_options).generate(text)
plt.imshow(wordcloud)
plt.grid(False)  # no grid lines over the image
import numpy as np
import seaborn as sns

# Heatmap experiments on a random 10x12 matrix; the seed is fixed so the
# values are the same on every run.
np.random.seed(0)
uniform_data = np.random.rand(10, 12)
ax = sns.heatmap(uniform_data, annot=True, linewidths=1)

# Boolean mask `a`: True everywhere except the last column. seaborn hides the
# cells where the mask is True, so only the last column is drawn.
# The builtin `bool` is used as dtype because the `np.bool` alias was
# deprecated in NumPy 1.20 and removed in NumPy 1.24.
a = np.ones_like(uniform_data, dtype=bool)
a
a[:, -1] = False
ax = sns.heatmap(uniform_data, annot=True, linewidths=1, mask=a)

# Boolean array `b`: True only in the first row and the last column.
b = np.ones_like(uniform_data, dtype=bool)
b[:, :-1] = False
b[0, :] = True
# Multiplying by a boolean array zeroes the annotation values where it is
# False, while the cell colours still come from the full data.
sns.heatmap(uniform_data, linewidths=1, annot=b * uniform_data, cmap='YlGnBu')
sns.heatmap(uniform_data, linewidths=1, annot=a * uniform_data, cmap='YlGnBu')
!pip install eli5
import eli5
from sklearn.linear_model import LinearRegression
# TF-IDF over the film overviews: word unigrams and bigrams, sublinear term
# frequency, keeping only terms that appear in at least 5 documents.
tfidf_settings = {
    "sublinear_tf": True,
    "analyzer": 'word',
    "token_pattern": r'\w{1,}',
    "ngram_range": (1, 2),
    "min_df": 5,
}
vectorizer = TfidfVectorizer(**tfidf_settings)

# Missing overviews are vectorised as empty strings.
overview_text = vectorizer.fit_transform(train["overview"].fillna(""))

# Fit a linear model of log revenue on the TF-IDF features, then show the 20
# largest weights with the bias term filtered out.
linreg = LinearRegression().fit(overview_text, train["log_revenue"])
eli5.show_weights(linreg, vec=vectorizer, top=20, feature_filter=lambda name: name != "<BIAS>")