Notebook

In [14]:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.simplefilter('ignore')

In [15]:

fandango = pd.read_csv(r"C:\Users\Teni\Desktop\Git-Github\06-Capstone-Project\fandango_scrape.csv")

In [16]:

fandango

Out[16]:

	FILM	STARS	RATING	VOTES
0	Fifty Shades of Grey (2015)	4.0	3.9	34846
1	Jurassic World (2015)	4.5	4.5	34390
2	American Sniper (2015)	5.0	4.8	34085
3	Furious 7 (2015)	5.0	4.8	33538
4	Inside Out (2015)	4.5	4.5	15749
...	...	...	...	...
499	Valiyavan (2015)	0.0	0.0	0
500	WWE SummerSlam 2015 (2015)	0.0	0.0	0
501	Yagavarayinum Naa Kaakka (2015)	0.0	0.0	0
502	Yesterday, Today and Tomorrow (1964)	0.0	0.0	0
503	Zarafa (2012)	0.0	0.0	0

504 rows × 4 columns

Understand the Data

In [17]:

fandango.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 504 entries, 0 to 503
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   FILM    504 non-null    object 
 1   STARS   504 non-null    float64
 2   RATING  504 non-null    float64
 3   VOTES   504 non-null    int64  
dtypes: float64(2), int64(1), object(1)
memory usage: 15.9+ KB

In [18]:

fandango.describe()

Out[18]:

	STARS	RATING	VOTES
count	504.000000	504.000000	504.000000
mean	3.558532	3.375794	1147.863095
std	1.563133	1.491223	3830.583136
min	0.000000	0.000000	0.000000
25%	3.500000	3.100000	3.000000
50%	4.000000	3.800000	18.500000
75%	4.500000	4.300000	189.750000
max	5.000000	5.000000	34846.000000

Explore the relationship between the Rating and Votes

In [19]:

plt.figure(figsize=(10, 4), dpi=200)

sns.scatterplot(data=fandango, x='RATING', y='VOTES');

Calculating the correlation between the columns.

This helps undersyand how closely knitted each column is to each other

In [20]:

fandango.corr()

Out[20]:

	STARS	RATING	VOTES
STARS	1.000000	0.994696	0.164218
RATING	0.994696	1.000000	0.163764
VOTES	0.164218	0.163764	1.000000

In [21]:

fandango['YEAR'] = fandango['FILM'].str.extract(r'\((\d+)\)')

# pd.to_datetime(fandango['YEAR'])

In [22]:

fandango

Out[22]:

	FILM	STARS	RATING	VOTES	YEAR
0	Fifty Shades of Grey (2015)	4.0	3.9	34846	2015
1	Jurassic World (2015)	4.5	4.5	34390	2015
2	American Sniper (2015)	5.0	4.8	34085	2015
3	Furious 7 (2015)	5.0	4.8	33538	2015
4	Inside Out (2015)	4.5	4.5	15749	2015
...	...	...	...	...	...
499	Valiyavan (2015)	0.0	0.0	0	2015
500	WWE SummerSlam 2015 (2015)	0.0	0.0	0	2015
501	Yagavarayinum Naa Kaakka (2015)	0.0	0.0	0	2015
502	Yesterday, Today and Tomorrow (1964)	0.0	0.0	0	1964
503	Zarafa (2012)	0.0	0.0	0	2012

504 rows × 5 columns

In [24]:

# fandango.drop('Numbers_in_brackets', axis=1, inplace=True)

In [25]:

fandango

Out[25]:

	FILM	STARS	RATING	VOTES	YEAR
0	Fifty Shades of Grey (2015)	4.0	3.9	34846	2015
1	Jurassic World (2015)	4.5	4.5	34390	2015
2	American Sniper (2015)	5.0	4.8	34085	2015
3	Furious 7 (2015)	5.0	4.8	33538	2015
4	Inside Out (2015)	4.5	4.5	15749	2015
...	...	...	...	...	...
499	Valiyavan (2015)	0.0	0.0	0	2015
500	WWE SummerSlam 2015 (2015)	0.0	0.0	0	2015
501	Yagavarayinum Naa Kaakka (2015)	0.0	0.0	0	2015
502	Yesterday, Today and Tomorrow (1964)	0.0	0.0	0	1964
503	Zarafa (2012)	0.0	0.0	0	2012

504 rows × 5 columns

How many movies are in the Fandango Dataframe per year?

In [26]:

fandango['YEAR'].value_counts()

Out[26]:

2015    477
2014     24
2016      1
1964      1
2012      1
Name: YEAR, dtype: int64

TASK: Visualize the count of movies per year with a plot:

In [29]:

# Not using the barplot because it'd require a Y variable- which we don't have or do not need. The countlot allows for only counting

plt.figure(figsize=(10, 6), dpi=200)
sns.countplot(data=fandango, x='YEAR');

What are the 10 movies with the highest number of votes?

In [30]:

fandango.loc[fandango['VOTES'].nlargest(10).index]

Out[30]:

	FILM	STARS	RATING	VOTES	YEAR
0	Fifty Shades of Grey (2015)	4.0	3.9	34846	2015
1	Jurassic World (2015)	4.5	4.5	34390	2015
2	American Sniper (2015)	5.0	4.8	34085	2015
3	Furious 7 (2015)	5.0	4.8	33538	2015
4	Inside Out (2015)	4.5	4.5	15749	2015
5	The Hobbit: The Battle of the Five Armies (2014)	4.5	4.3	15337	2014
6	Kingsman: The Secret Service (2015)	4.5	4.2	15205	2015
7	Minions (2015)	4.0	4.0	14998	2015
8	Avengers: Age of Ultron (2015)	5.0	4.5	14846	2015
9	Into the Woods (2014)	3.5	3.4	13055	2014

In [31]:

# OR
fandango.sort_values(by='VOTES', ascending=False).head(10)

Out[31]:

	FILM	STARS	RATING	VOTES	YEAR
0	Fifty Shades of Grey (2015)	4.0	3.9	34846	2015
1	Jurassic World (2015)	4.5	4.5	34390	2015
2	American Sniper (2015)	5.0	4.8	34085	2015
3	Furious 7 (2015)	5.0	4.8	33538	2015
4	Inside Out (2015)	4.5	4.5	15749	2015
5	The Hobbit: The Battle of the Five Armies (2014)	4.5	4.3	15337	2014
6	Kingsman: The Secret Service (2015)	4.5	4.2	15205	2015
7	Minions (2015)	4.0	4.0	14998	2015
8	Avengers: Age of Ultron (2015)	5.0	4.5	14846	2015
9	Into the Woods (2014)	3.5	3.4	13055	2014

How many movies have zero votes

In [104]:

fandango[fandango['VOTES']==0]

Out[104]:

	FILM	STARS	RATING	VOTES	YEAR
435	6 Years (2015)	0.0	0.0	0	2015
436	7 Minutes (2015)	0.0	0.0	0	2015
437	A Year in Champagne (2015)	0.0	0.0	0	2015
438	Balls Out (2015)	0.0	0.0	0	2015
439	Before I Wake (2015)	0.0	0.0	0	2015
...	...	...	...	...	...
499	Valiyavan (2015)	0.0	0.0	0	2015
500	WWE SummerSlam 2015 (2015)	0.0	0.0	0	2015
501	Yagavarayinum Naa Kaakka (2015)	0.0	0.0	0	2015
502	Yesterday, Today and Tomorrow (1964)	0.0	0.0	0	1964
503	Zarafa (2012)	0.0	0.0	0	2012

69 rows × 5 columns

In [105]:

(fandango['VOTES']==0).value_counts()

Out[105]:

False    435
True      69
Name: VOTES, dtype: int64

In [106]:

(fandango['VOTES']==0).sum()

Out[106]:

Create DataFrame of only reviewed films by removing any films that have zero votes.

In [107]:

fandango

Out[107]:

	FILM	STARS	RATING	VOTES	YEAR
0	Fifty Shades of Grey (2015)	4.0	3.9	34846	2015
1	Jurassic World (2015)	4.5	4.5	34390	2015
2	American Sniper (2015)	5.0	4.8	34085	2015
3	Furious 7 (2015)	5.0	4.8	33538	2015
4	Inside Out (2015)	4.5	4.5	15749	2015
...	...	...	...	...	...
499	Valiyavan (2015)	0.0	0.0	0	2015
500	WWE SummerSlam 2015 (2015)	0.0	0.0	0	2015
501	Yagavarayinum Naa Kaakka (2015)	0.0	0.0	0	2015
502	Yesterday, Today and Tomorrow (1964)	0.0	0.0	0	1964
503	Zarafa (2012)	0.0	0.0	0	2012

504 rows × 5 columns

In [108]:

reviewed_films = fandango[fandango['VOTES']!=0]

reviewed_films

Out[108]:

	FILM	STARS	RATING	VOTES	YEAR
0	Fifty Shades of Grey (2015)	4.0	3.9	34846	2015
1	Jurassic World (2015)	4.5	4.5	34390	2015
2	American Sniper (2015)	5.0	4.8	34085	2015
3	Furious 7 (2015)	5.0	4.8	33538	2015
4	Inside Out (2015)	4.5	4.5	15749	2015
...	...	...	...	...	...
430	That Sugar Film (2015)	5.0	5.0	1	2015
431	The Intern (2015)	5.0	5.0	1	2015
432	The Park Bench (2015)	5.0	5.0	1	2015
433	The Wanted 18 (2015)	5.0	5.0	1	2015
434	Z For Zachariah (2015)	5.0	5.0	1	2015

435 rows × 5 columns

Create a KDE plot (or multiple kdeplots) that displays the distribution of ratings that are displayed (STARS) versus what the true rating was from votes (RATING). Clip the KDEs to 0-5.**

In [109]:

from matplotlib.ticker import MaxNLocator  # Ensure MaxNLocator is imported

plt.figure(figsize=[10,4], dpi=500)

sns.kdeplot(reviewed_films['RATING'], clip=[0, 5], shade=True, label='True Rating')
sns.kdeplot(reviewed_films['STARS'], clip=[0, 5], shade=True, label='Stars Displayed')

plt.title('KDE Plot for True Rating & the Stars Displayed Comparism')
# plt.xlim(0, 5) The clip already catered to the ranges
# plt.ylim(0.0, 0.6) The clip already catered to the ranges

# plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True))  # Display integers on x-axis
# plt.gca().yaxis.set_major_locator(MaxNLocator(integer=True))  # Display integers on y-axis

plt.legend(loc=(1.05, 0.5)) 
plt.show()

TASK: Let's now actually quantify this discrepancy. Create a new column of the different between STARS displayed versus true RATING. Calculate this difference with STARS-RATING and round these differences to the nearest decimal point.

In [110]:

reviewed_films['STARS_DIFF'] = round(reviewed_films['STARS'] - reviewed_films['RATING'], 2)

In [111]:

reviewed_films

Out[111]:

	FILM	STARS	RATING	VOTES	YEAR	STARS_DIFF
0	Fifty Shades of Grey (2015)	4.0	3.9	34846	2015	0.1
1	Jurassic World (2015)	4.5	4.5	34390	2015	0.0
2	American Sniper (2015)	5.0	4.8	34085	2015	0.2
3	Furious 7 (2015)	5.0	4.8	33538	2015	0.2
4	Inside Out (2015)	4.5	4.5	15749	2015	0.0
...	...	...	...	...	...	...
430	That Sugar Film (2015)	5.0	5.0	1	2015	0.0
431	The Intern (2015)	5.0	5.0	1	2015	0.0
432	The Park Bench (2015)	5.0	5.0	1	2015	0.0
433	The Wanted 18 (2015)	5.0	5.0	1	2015	0.0
434	Z For Zachariah (2015)	5.0	5.0	1	2015	0.0

435 rows × 6 columns

TASK: Create a count plot to display the number of times a certain difference occurs

In [112]:

plt.figure(figsize=[12, 8], dpi=200)
sns.countplot(data = reviewed_films, x='STARS_DIFF');

TASK: We can see from the plot that one movie was displaying over a 1 star difference than its true rating! What movie had this close to 1 star differential?

In [115]:

reviewed_films[reviewed_films['STARS_DIFF']>=1.0]

Out[115]:

	FILM	STARS	RATING	VOTES	YEAR	STARS_DIFF
381	Turbo Kid (2015)	5.0	4.0	2	2015	1.0

New Data

In [116]:

all_sites = pd.read_csv(r"C:\Users\Teni\Desktop\Git-Github\06-Capstone-Project\all_sites_scores.csv")

In [117]:

all_sites

Out[117]:

	FILM	RottenTomatoes	RottenTomatoes_User	Metacritic	Metacritic_User	IMDB	Metacritic_user_vote_count	IMDB_user_vote_count
0	Avengers: Age of Ultron (2015)	74	86	66	7.1	7.8	1330	271107
1	Cinderella (2015)	85	80	67	7.5	7.1	249	65709
2	Ant-Man (2015)	80	90	64	8.1	7.8	627	103660
3	Do You Believe? (2015)	18	84	22	4.7	5.4	31	3136
4	Hot Tub Time Machine 2 (2015)	14	28	29	3.4	5.1	88	19560
...	...	...	...	...	...	...	...	...
141	Mr. Holmes (2015)	87	78	67	7.9	7.4	33	7367
142	'71 (2015)	97	82	83	7.5	7.2	60	24116
143	Two Days, One Night (2014)	97	78	89	8.8	7.4	123	24345
144	Gett: The Trial of Viviane Amsalem (2015)	100	81	90	7.3	7.8	19	1955
145	Kumiko, The Treasure Hunter (2015)	87	63	68	6.4	6.7	19	5289

146 rows × 8 columns

In [121]:

all_sites.head(5)

Out[121]:

	FILM	RottenTomatoes	RottenTomatoes_User	Metacritic	Metacritic_User	IMDB	Metacritic_user_vote_count	IMDB_user_vote_count
0	Avengers: Age of Ultron (2015)	74	86	66	7.1	7.8	1330	271107
1	Cinderella (2015)	85	80	67	7.5	7.1	249	65709
2	Ant-Man (2015)	80	90	64	8.1	7.8	627	103660
3	Do You Believe? (2015)	18	84	22	4.7	5.4	31	3136
4	Hot Tub Time Machine 2 (2015)	14	28	29	3.4	5.1	88	19560

In [120]:

all_sites.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146 entries, 0 to 145
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   FILM                        146 non-null    object 
 1   RottenTomatoes              146 non-null    int64  
 2   RottenTomatoes_User         146 non-null    int64  
 3   Metacritic                  146 non-null    int64  
 4   Metacritic_User             146 non-null    float64
 5   IMDB                        146 non-null    float64
 6   Metacritic_user_vote_count  146 non-null    int64  
 7   IMDB_user_vote_count        146 non-null    int64  
dtypes: float64(2), int64(5), object(1)
memory usage: 9.2+ KB

In [119]:

all_sites.describe()

Out[119]:

	RottenTomatoes	RottenTomatoes_User	Metacritic	Metacritic_User	IMDB	Metacritic_user_vote_count	IMDB_user_vote_count
count	146.000000	146.000000	146.000000	146.000000	146.000000	146.000000	146.000000
mean	60.849315	63.876712	58.808219	6.519178	6.736986	185.705479	42846.205479
std	30.168799	20.024430	19.517389	1.510712	0.958736	316.606515	67406.509171
min	5.000000	20.000000	13.000000	2.400000	4.000000	4.000000	243.000000
25%	31.250000	50.000000	43.500000	5.700000	6.300000	33.250000	5627.000000
50%	63.500000	66.500000	59.000000	6.850000	6.900000	72.500000	19103.000000
75%	89.000000	81.000000	75.000000	7.500000	7.400000	168.500000	45185.750000
max	100.000000	94.000000	94.000000	9.600000	8.600000	2375.000000	334164.000000

Rotten Tomatoes

Let's first take a look at Rotten Tomatoes. RT has two sets of reviews, their critics reviews (ratings published by official critics) and user reviews.

TASK: Create a scatterplot exploring the relationship between RT Critic reviews and RT User reviews.

In [132]:

plt.figure(figsize=[12, 8], dpi=200)
sns.scatterplot(data=all_sites, x='RottenTomatoes', y= 'RottenTomatoes_User', s=80)

plt.xlim(0,100)
plt.ylim(0, 100);

Rotten Tomatoes

Let's quantify this difference by comparing the critics ratings and the RT User ratings. We will calculate this with RottenTomatoes-RottenTomatoes_User. Note: diff here is Critics - User Score. So values closer to 0 means aggrement between Critics and Users. Larger positive values means critics rated much higher than users. Larger negative values means users rated much higher than critics.

TASK: Create a new column based off the difference between critics ratings and users ratings for Rotten Tomatoes. Calculate this with RottenTomatoes-RottenTomatoes_User

In [133]:

all_sites['diff'] = round(all_sites['RottenTomatoes'] - all_sites['RottenTomatoes_User'], 2)

In [134]:

all_sites

Out[134]:

	FILM	RottenTomatoes	RottenTomatoes_User	Metacritic	Metacritic_User	IMDB	Metacritic_user_vote_count	IMDB_user_vote_count	diff
0	Avengers: Age of Ultron (2015)	74	86	66	7.1	7.8	1330	271107	-12
1	Cinderella (2015)	85	80	67	7.5	7.1	249	65709	5
2	Ant-Man (2015)	80	90	64	8.1	7.8	627	103660	-10
3	Do You Believe? (2015)	18	84	22	4.7	5.4	31	3136	-66
4	Hot Tub Time Machine 2 (2015)	14	28	29	3.4	5.1	88	19560	-14
...	...	...	...	...	...	...	...	...	...
141	Mr. Holmes (2015)	87	78	67	7.9	7.4	33	7367	9
142	'71 (2015)	97	82	83	7.5	7.2	60	24116	15
143	Two Days, One Night (2014)	97	78	89	8.8	7.4	123	24345	19
144	Gett: The Trial of Viviane Amsalem (2015)	100	81	90	7.3	7.8	19	1955	19
145	Kumiko, The Treasure Hunter (2015)	87	63	68	6.4	6.7	19	5289	24

146 rows × 9 columns

Rotten Tomatoes

Let's now compare the overall mean difference. Since we're dealing with differences that could be negative or positive, first take the absolute value of all the differences, then take the mean. This would report back on average to absolute difference between the critics rating versus the user rating.

TASK: Calculate the Mean Absolute Difference between RT scores and RT User scores as described above.

In [136]:

all_sites['diff'].apply(abs).mean()

Out[136]:

15.095890410958905

Rotten Tomatoes

TASK: Plot the distribution of the differences between RT Critics Score and RT User Score. There should be negative values in this distribution plot. Feel free to use KDE or Histograms to display this distribution.

In [150]:

plt.figure(figsize=[12, 6], dpi=200)
sns.displot(data=all_sites,x='diff', kde=True, bins=25)

plot.set_axis_labels("Rotten_Diff")
plot.fig.suptitle('RT Critics Score minus RT User Score', y=1.02)  # Title for the plot

plt.show()


# sns.displot(data=sample_ages, x='age', rug=True, bins=30, kde=True)

<Figure size 2400x1200 with 0 Axes>

Rotten Tomatoes

TASK: Now create a distribution showing the absolute value difference between Critics and Users on Rotten Tomatoes.

In [158]:

abs_mean = all_sites['diff'].apply(abs).mean()

all_sites['task_need'] = round(abs_mean-all_sites['diff'], 2)

all_sites

Out[158]:

	FILM	RottenTomatoes	RottenTomatoes_User	Metacritic	Metacritic_User	IMDB	Metacritic_user_vote_count	IMDB_user_vote_count	diff	task_need
0	Avengers: Age of Ultron (2015)	74	86	66	7.1	7.8	1330	271107	-12	27.1
1	Cinderella (2015)	85	80	67	7.5	7.1	249	65709	5	10.1
2	Ant-Man (2015)	80	90	64	8.1	7.8	627	103660	-10	25.1
3	Do You Believe? (2015)	18	84	22	4.7	5.4	31	3136	-66	81.1
4	Hot Tub Time Machine 2 (2015)	14	28	29	3.4	5.1	88	19560	-14	29.1
...	...	...	...	...	...	...	...	...	...	...
141	Mr. Holmes (2015)	87	78	67	7.9	7.4	33	7367	9	6.1
142	'71 (2015)	97	82	83	7.5	7.2	60	24116	15	0.1
143	Two Days, One Night (2014)	97	78	89	8.8	7.4	123	24345	19	-3.9
144	Gett: The Trial of Viviane Amsalem (2015)	100	81	90	7.3	7.8	19	1955	19	-3.9
145	Kumiko, The Treasure Hunter (2015)	87	63	68	6.4	6.7	19	5289	24	-8.9

146 rows × 10 columns

In [176]:

plt.figure(figsize=[12, 8], dpi=200)
sns.displot(x=all_sites['diff'].apply(abs), bins=25, kde=True)

plt.title('Abs Difference between RT Critics Score and RT User Score')
plt.show()

<Figure size 2400x1600 with 0 Axes>

In [178]:

plt.figure(figsize=[12, 8], dpi=200)
sns.histplot(x=all_sites['diff'].apply(abs), bins=25, kde=True)

plt.title('Abs Difference between RT Critics Score and RT User Score')
plt.show()

# Apparently the Histplot is neater and more srisp than the displot

Rotten Tomatoes

TASK: What are the top 5 movies users rated higher than critics on average:

In [216]:

all_sites.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146 entries, 0 to 145
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   FILM                        146 non-null    object 
 1   RottenTomatoes              146 non-null    int64  
 2   RottenTomatoes_User         146 non-null    int64  
 3   Metacritic                  146 non-null    int64  
 4   Metacritic_User             146 non-null    float64
 5   IMDB                        146 non-null    float64
 6   Metacritic_user_vote_count  146 non-null    int64  
 7   IMDB_user_vote_count        146 non-null    int64  
 8   diff                        146 non-null    int64  
 9   task_need                   146 non-null    float64
dtypes: float64(3), int64(6), object(1)
memory usage: 11.5+ KB

In [185]:

print('Users Love but Critics Hate')
all_sites.nsmallest(5, 'diff')[['FILM', 'diff']]

Users Love but Critics Hate

Out[185]:

	FILM	diff
3	Do You Believe? (2015)	-66
85	Little Boy (2015)	-61
105	Hitman: Agent 47 (2015)	-42
134	The Longest Ride (2015)	-42
125	The Wedding Ringer (2015)	-39

OR¶

In [194]:

print('Users Love but Critics Hate')
all_sites.sort_values('diff', ascending=True)[['FILM', 'diff']].head(5)

Users Love but Critics Hate

Out[194]:

	FILM	diff
3	Do You Believe? (2015)	-66
85	Little Boy (2015)	-61
134	The Longest Ride (2015)	-42
105	Hitman: Agent 47 (2015)	-42
125	The Wedding Ringer (2015)	-39

OR¶

In [199]:

print('Users Love but Critics Hate')
all_sites.sort_values('diff', ascending=False)[['FILM', 'diff']].tail(5)

Users Love but Critics Hate

Out[199]:

	FILM	diff
125	The Wedding Ringer (2015)	-39
105	Hitman: Agent 47 (2015)	-42
134	The Longest Ride (2015)	-42
85	Little Boy (2015)	-61
3	Do You Believe? (2015)	-66

Rotten Tomatoes

TASK: Now show the top 5 movies critics scores higher than users on average.

In [190]:

print('Critics love, but Users Hate')
all_sites.nlargest(5, 'diff')[['FILM', 'diff']]

Critics love, but Users Hate

Out[190]:

	FILM	diff
69	Mr. Turner (2014)	42
112	It Follows (2015)	31
115	While We're Young (2015)	31
37	Welcome to Me (2015)	24
40	I'll See You In My Dreams (2015)	24

OR¶

In [192]:

print('Critics love, but Users Hate')
all_sites.sort_values('diff', ascending=False)[['FILM', 'diff']].head(5)

Critics love, but Users Hate

Out[192]:

	FILM	diff
69	Mr. Turner (2014)	42
112	It Follows (2015)	31
115	While We're Young (2015)	31
145	Kumiko, The Treasure Hunter (2015)	24
37	Welcome to Me (2015)	24

MetaCritic¶

Now let's take a quick look at the ratings from MetaCritic. Metacritic also shows an average user rating versus their official displayed rating.

TASK: Display a scatterplot of the Metacritic Rating versus the Metacritic User rating.

In [201]:

all_sites.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146 entries, 0 to 145
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   FILM                        146 non-null    object 
 1   RottenTomatoes              146 non-null    int64  
 2   RottenTomatoes_User         146 non-null    int64  
 3   Metacritic                  146 non-null    int64  
 4   Metacritic_User             146 non-null    float64
 5   IMDB                        146 non-null    float64
 6   Metacritic_user_vote_count  146 non-null    int64  
 7   IMDB_user_vote_count        146 non-null    int64  
 8   diff                        146 non-null    int64  
 9   task_need                   146 non-null    float64
dtypes: float64(3), int64(6), object(1)
memory usage: 11.5+ KB

In [215]:

plt.figure(figsize=[12, 8], dpi=200)
sns.scatterplot(data= all_sites, x= 'Metacritic',y= 'Metacritic_User', s=80)

plt.xlim(0, 100)
plt.ylim(0, 10)

plt.show()

IMDB

Finally let's explore IMDB. Notice that both Metacritic and IMDB report back vote counts. Let's analyze the most popular movies.

TASK: Create a scatterplot for the relationship between vote counts on MetaCritic versus vote counts on IMDB.

In [217]:

all_sites.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146 entries, 0 to 145
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   FILM                        146 non-null    object 
 1   RottenTomatoes              146 non-null    int64  
 2   RottenTomatoes_User         146 non-null    int64  
 3   Metacritic                  146 non-null    int64  
 4   Metacritic_User             146 non-null    float64
 5   IMDB                        146 non-null    float64
 6   Metacritic_user_vote_count  146 non-null    int64  
 7   IMDB_user_vote_count        146 non-null    int64  
 8   diff                        146 non-null    int64  
 9   task_need                   146 non-null    float64
dtypes: float64(3), int64(6), object(1)
memory usage: 11.5+ KB

In [220]:

plt.figure(figsize=[12, 8], dpi=200)

sns.scatterplot(data=all_sites, x= 'Metacritic_user_vote_count', y='IMDB_user_vote_count',s=70)

plt.show()

IMDB

Notice there are two outliers here. The movie with the highest vote count on IMDB only has about 500 Metacritic ratings. What is this movie?

TASK: What movie has the highest IMDB user vote count?

In [238]:

largest = all_sites[all_sites['Metacritic'] <=500]

largest.nlargest(1, 'IMDB_user_vote_count')

Out[238]:

	FILM	RottenTomatoes	RottenTomatoes_User	Metacritic	Metacritic_User	IMDB	Metacritic_user_vote_count	IMDB_user_vote_count	diff	task_need
14	The Imitation Game (2014)	90	92	73	8.2	8.1	566	334164	-2	17.1

OR¶

In [ ]:

largest = all_sites[all_sites['Metacritic'] <=500]

largest.sort_values('IMDB_user_vote_count', ascending=False).head(1)

IMDB

TASK: What movie has the highest Metacritic User Vote count?

In [240]:

all_sites.nlargest(1, 'Metacritic_user_vote_count')

Out[240]:

	FILM	RottenTomatoes	RottenTomatoes_User	Metacritic	Metacritic_User	IMDB	Metacritic_user_vote_count	IMDB_user_vote_count	diff	task_need
88	Mad Max: Fury Road (2015)	97	88	89	8.7	8.3	2375	292023	9	6.1

OR¶

In [242]:

all_sites.sort_values( 'Metacritic_user_vote_count', ascending=False).head(1)

Out[242]:

	FILM	RottenTomatoes	RottenTomatoes_User	Metacritic	Metacritic_User	IMDB	Metacritic_user_vote_count	IMDB_user_vote_count	diff	task_need
88	Mad Max: Fury Road (2015)	97	88	89	8.7	8.3	2375	292023	9	6.1

Fandago Scores vs. All Sites

Finally let's begin to explore whether or not Fandango artificially displays higher ratings than warranted to boost ticket sales.

TASK: Combine the Fandango Table with the All Sites table. Not every movie in the Fandango table is in the All Sites table, since some Fandango movies have very little or no reviews. We only want to compare movies that are in both DataFrames, so do an inner merge to merge together both DataFrames based on the FILM columns.

In [245]:

fandango.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 504 entries, 0 to 503
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   FILM    504 non-null    object 
 1   STARS   504 non-null    float64
 2   RATING  504 non-null    float64
 3   VOTES   504 non-null    int64  
 4   YEAR    504 non-null    object 
dtypes: float64(2), int64(1), object(2)
memory usage: 19.8+ KB

In [246]:

all_sites.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146 entries, 0 to 145
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   FILM                        146 non-null    object 
 1   RottenTomatoes              146 non-null    int64  
 2   RottenTomatoes_User         146 non-null    int64  
 3   Metacritic                  146 non-null    int64  
 4   Metacritic_User             146 non-null    float64
 5   IMDB                        146 non-null    float64
 6   Metacritic_user_vote_count  146 non-null    int64  
 7   IMDB_user_vote_count        146 non-null    int64  
 8   diff                        146 non-null    int64  
 9   task_need                   146 non-null    float64
dtypes: float64(3), int64(6), object(1)
memory usage: 11.5+ KB

In [256]:

df = pd.merge(fandango, all_sites,how='inner', on='FILM')

In [257]:

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 145 entries, 0 to 144
Data columns (total 14 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   FILM                        145 non-null    object 
 1   STARS                       145 non-null    float64
 2   RATING                      145 non-null    float64
 3   VOTES                       145 non-null    int64  
 4   YEAR                        145 non-null    object 
 5   RottenTomatoes              145 non-null    int64  
 6   RottenTomatoes_User         145 non-null    int64  
 7   Metacritic                  145 non-null    int64  
 8   Metacritic_User             145 non-null    float64
 9   IMDB                        145 non-null    float64
 10  Metacritic_user_vote_count  145 non-null    int64  
 11  IMDB_user_vote_count        145 non-null    int64  
 12  diff                        145 non-null    int64  
 13  task_need                   145 non-null    float64
dtypes: float64(5), int64(7), object(2)
memory usage: 17.0+ KB

In [259]:

df.head(5)

Out[259]:

	FILM	STARS	RATING	VOTES	YEAR	RottenTomatoes	RottenTomatoes_User	Metacritic	Metacritic_User	IMDB	Metacritic_user_vote_count	IMDB_user_vote_count	diff	task_need
0	Fifty Shades of Grey (2015)	4.0	3.9	34846	2015	25	42	46	3.2	4.2	778	179506	-17	32.1
1	Jurassic World (2015)	4.5	4.5	34390	2015	71	81	59	7.0	7.3	1281	241807	-10	25.1
2	American Sniper (2015)	5.0	4.8	34085	2015	72	85	72	6.6	7.4	850	251856	-13	28.1
3	Furious 7 (2015)	5.0	4.8	33538	2015	81	84	67	6.8	7.4	764	207211	-3	18.1
4	Inside Out (2015)	4.5	4.5	15749	2015	98	90	94	8.9	8.6	807	96252	8	7.1

Normalize columns to Fandango STARS and RATINGS 0-5¶

Notice that RT,Metacritic, and IMDB don't use a score between 0-5 stars like Fandango does. In order to do a fair comparison, we need to normalize these values so they all fall between 0-5 stars and the relationship between reviews stays the same.

TASK: Create new normalized columns for all ratings so they match up within the 0-5 star range shown on Fandango. There are many ways to do this.

In [263]:

df['RT_norm']=np.round(df['RottenTomatoes']/20, 1)
df['RTU_norm']=np.round(df['RottenTomatoes_User']/20, 1)

In [264]:

df['Meta_norm']=np.round(df['Metacritic']/20, 1)
df['MetaU_norm']=np.round(df['Metacritic_User']/2, 1)

In [265]:

df['IMDB_norm']=np.round(df['IMDB']/2, 1)

In [266]:

df.head()

Out[266]:

	FILM	STARS	RATING	VOTES	YEAR	RottenTomatoes	RottenTomatoes_User	Metacritic	Metacritic_User	IMDB	Metacritic_user_vote_count	IMDB_user_vote_count	diff	task_need	RT_norm	RTU_norm	Meta_norm	MetaU_norm	IMDB_norm
0	Fifty Shades of Grey (2015)	4.0	3.9	34846	2015	25	42	46	3.2	4.2	778	179506	-17	32.1	1.2	2.1	2.3	1.6	2.1
1	Jurassic World (2015)	4.5	4.5	34390	2015	71	81	59	7.0	7.3	1281	241807	-10	25.1	3.6	4.0	3.0	3.5	3.6
2	American Sniper (2015)	5.0	4.8	34085	2015	72	85	72	6.6	7.4	850	251856	-13	28.1	3.6	4.2	3.6	3.3	3.7
3	Furious 7 (2015)	5.0	4.8	33538	2015	81	84	67	6.8	7.4	764	207211	-3	18.1	4.0	4.2	3.4	3.4	3.7
4	Inside Out (2015)	4.5	4.5	15749	2015	98	90	94	8.9	8.6	807	96252	8	7.1	4.9	4.5	4.7	4.4	4.3

TASK: Now create a norm_scores DataFrame that only contains the normalizes ratings. Include both STARS and RATING from the original Fandango table.

In [267]:

norm_scores = df[['STARS', 'RATING', 'RT_norm', 'RTU_norm', 'Meta_norm', 'MetaU_norm', 'IMDB_norm']]

In [268]:

norm_scores

Out[268]:

	STARS	RATING	RT_norm	RTU_norm	Meta_norm	MetaU_norm	IMDB_norm
0	4.0	3.9	1.2	2.1	2.3	1.6	2.1
1	4.5	4.5	3.6	4.0	3.0	3.5	3.6
2	5.0	4.8	3.6	4.2	3.6	3.3	3.7
3	5.0	4.8	4.0	4.2	3.4	3.4	3.7
4	4.5	4.5	4.9	4.5	4.7	4.4	4.3
...	...	...	...	...	...	...	...
140	3.5	3.5	4.4	3.2	3.4	3.2	3.4
141	4.0	3.6	4.8	4.0	4.4	3.2	3.5
142	4.5	4.2	4.6	4.2	3.4	3.5	3.9
143	4.0	3.9	4.8	4.3	4.3	3.6	3.7
144	3.5	3.1	3.0	2.3	3.4	2.9	3.2

145 rows × 7 columns

In [269]:

norm_scores.head(5)

Out[269]:

	STARS	RATING	RT_norm	RTU_norm	Meta_norm	MetaU_norm	IMDB_norm
0	4.0	3.9	1.2	2.1	2.3	1.6	2.1
1	4.5	4.5	3.6	4.0	3.0	3.5	3.6
2	5.0	4.8	3.6	4.2	3.6	3.3	3.7
3	5.0	4.8	4.0	4.2	3.4	3.4	3.7
4	4.5	4.5	4.9	4.5	4.7	4.4	4.3

Comparing Distribution of Scores Across Sites¶

Now the moment of truth! Does Fandango display abnormally high ratings? We already know it pushs displayed RATING higher than STARS, but are the ratings themselves higher than average?

TASK: Create a plot comparing the distributions of normalized ratings across all sites. There are many ways to do this, but explore the Seaborn KDEplot docs for some simple ways to quickly show this. Don't worry if your plot format does not look exactly the same as ours, as long as the differences in distribution are clear.

In [277]:

def move_legend(ax, new_loc, **kws):
    old_legend = ax.legend_
    handles = old_legend.legendHandles
    labels = [t.get_text() for t in old_legend.get_texts()]
    title = old_legend.get_title().get_text()
    ax.legend(handles, labels, loc=new_loc, title=title, **kws)
    

In [284]:

fig, ax = plt.subplots(figsize=(15,6),dpi=150)

sns.kdeplot(data=norm_scores,clip=[0,5],shade=True,palette='Set1', ax=ax)

move_legend(ax, "upper left")

Clearly Fandango has an uneven distribution. We can also see that RT critics have the most uniform distribution. Let's directly compare these two.

TASK: Create a KDE plot that compare the distribution of RT critic ratings against the STARS displayed by Fandango.

In [285]:

norm_scores.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 145 entries, 0 to 144
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   STARS       145 non-null    float64
 1   RATING      145 non-null    float64
 2   RT_norm     145 non-null    float64
 3   RTU_norm    145 non-null    float64
 4   Meta_norm   145 non-null    float64
 5   MetaU_norm  145 non-null    float64
 6   IMDB_norm   145 non-null    float64
dtypes: float64(7)
memory usage: 9.1 KB

In [297]:

fig, ax = plt.subplots(figsize=[12, 8], dpi=200)

sns.kdeplot(data=norm_scores[['RT_norm', 'STARS']], clip=[0, 5], shade=True, ax=ax)

move_legend(ax, 'upper left')

OR¶

In [310]:

fig = plt.figure(figsize=[12, 8], dpi=200)

sns.kdeplot(norm_scores['RT_norm'], clip=[0, 5], shade=True, label='RT_norm')
sns.kdeplot(norm_scores[ 'STARS'], clip=[0, 5], shade=True, label='Stars')

plt.legend(loc=(0.01, 0.9)) 
plt.show()

OPTIONAL TASK: Create a histplot comparing all normalized scores.

In [311]:

norm_scores.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 145 entries, 0 to 144
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   STARS       145 non-null    float64
 1   RATING      145 non-null    float64
 2   RT_norm     145 non-null    float64
 3   RTU_norm    145 non-null    float64
 4   Meta_norm   145 non-null    float64
 5   MetaU_norm  145 non-null    float64
 6   IMDB_norm   145 non-null    float64
dtypes: float64(7)
memory usage: 9.1 KB

In [328]:

fig, ax= plt.subplots(figsize=[12, 8], dpi=200)

sns.histplot(data=norm_scores, ax=ax, bins=50)

move_legend(ax, "center left")

plt.show()

How are the worst movies rated across all platforms?¶

TASK: Create a clustermap visualization of all normalized scores. Note the differences in ratings, highly rated movies should be clustered together versus poorly rated movies. Note: This clustermap does not need to have the FILM titles as the index, feel free to drop it for the clustermap.

In [331]:

sns.clustermap(norm_scores);

In [336]:

sns.clustermap(norm_scores, col_cluster=False, cmap='magma');

TASK: Clearly Fandango is rating movies much higher than other sites, especially considering that it is then displaying a rounded up version of the rating. Let's examine the top 10 worst movies. Based off the Rotten Tomatoes Critic Ratings, what are the top 10 lowest rated movies? What are the normalized scores across all platforms for these movies? You may need to add the FILM column back in to your DataFrame of normalized scores to see the results.

In [342]:

norm_scores

Out[342]:

	STARS	RATING	RT_norm	RTU_norm	Meta_norm	MetaU_norm	IMDB_norm
0	4.0	3.9	1.2	2.1	2.3	1.6	2.1
1	4.5	4.5	3.6	4.0	3.0	3.5	3.6
2	5.0	4.8	3.6	4.2	3.6	3.3	3.7
3	5.0	4.8	4.0	4.2	3.4	3.4	3.7
4	4.5	4.5	4.9	4.5	4.7	4.4	4.3
...	...	...	...	...	...	...	...
140	3.5	3.5	4.4	3.2	3.4	3.2	3.4
141	4.0	3.6	4.8	4.0	4.4	3.2	3.5
142	4.5	4.2	4.6	4.2	3.4	3.5	3.9
143	4.0	3.9	4.8	4.3	4.3	3.6	3.7
144	3.5	3.1	3.0	2.3	3.4	2.9	3.2

145 rows × 7 columns

In [346]:

norm_scores = df[['STARS','RATING','RT_norm','RTU_norm','Meta_norm','MetaU_norm','IMDB_norm','FILM']]

In [347]:

norm_scores

Out[347]:

	STARS	RATING	RT_norm	RTU_norm	Meta_norm	MetaU_norm	IMDB_norm	FILM
0	4.0	3.9	1.2	2.1	2.3	1.6	2.1	Fifty Shades of Grey (2015)
1	4.5	4.5	3.6	4.0	3.0	3.5	3.6	Jurassic World (2015)
2	5.0	4.8	3.6	4.2	3.6	3.3	3.7	American Sniper (2015)
3	5.0	4.8	4.0	4.2	3.4	3.4	3.7	Furious 7 (2015)
4	4.5	4.5	4.9	4.5	4.7	4.4	4.3	Inside Out (2015)
...	...	...	...	...	...	...	...	...
140	3.5	3.5	4.4	3.2	3.4	3.2	3.4	Kumiko, The Treasure Hunter (2015)
141	4.0	3.6	4.8	4.0	4.4	3.2	3.5	The Diary of a Teenage Girl (2015)
142	4.5	4.2	4.6	4.2	3.4	3.5	3.9	The Wrecking Crew (2015)
143	4.0	3.9	4.8	4.3	4.3	3.6	3.7	Tangerine (2015)
144	3.5	3.1	3.0	2.3	3.4	2.9	3.2	Maps to the Stars (2015)

145 rows × 8 columns

In [350]:

norm_scores.nsmallest(10, 'RT_norm')

Out[350]:

	STARS	RATING	RT_norm	RTU_norm	Meta_norm	MetaU_norm	IMDB_norm	FILM
49	3.5	3.5	0.2	1.8	0.6	1.2	2.2	Paul Blart: Mall Cop 2 (2015)
25	4.5	4.1	0.4	2.3	1.3	2.3	3.0	Taken 3 (2015)
28	3.0	2.7	0.4	1.0	1.4	1.2	2.0	Fantastic Four (2015)
54	4.0	3.7	0.4	1.8	1.6	1.8	2.4	Hot Pursuit (2015)
84	4.0	3.9	0.4	2.4	1.4	1.6	3.0	Hitman: Agent 47 (2015)
50	4.0	3.6	0.5	1.8	1.5	2.8	2.3	The Boy Next Door (2015)
77	3.5	3.2	0.6	1.8	1.5	2.0	2.8	Seventh Son (2015)
78	3.5	3.2	0.6	1.5	1.4	1.6	2.8	Mortdecai (2015)
83	3.5	3.3	0.6	1.7	1.6	2.5	2.8	Sinister 2 (2015)
87	3.5	3.2	0.6	1.4	1.6	1.9	2.7	Unfinished Business (2015)

OR¶

In [353]:

norm_scores.sort_values('RT_norm', ascending = True).head(10)

# norm_scores.sort_values('RT_norm', ascending = False).tail(10)

Out[353]:

	STARS	RATING	RT_norm	RTU_norm	Meta_norm	MetaU_norm	IMDB_norm	FILM
49	3.5	3.5	0.2	1.8	0.6	1.2	2.2	Paul Blart: Mall Cop 2 (2015)
25	4.5	4.1	0.4	2.3	1.3	2.3	3.0	Taken 3 (2015)
28	3.0	2.7	0.4	1.0	1.4	1.2	2.0	Fantastic Four (2015)
84	4.0	3.9	0.4	2.4	1.4	1.6	3.0	Hitman: Agent 47 (2015)
54	4.0	3.7	0.4	1.8	1.6	1.8	2.4	Hot Pursuit (2015)
50	4.0	3.6	0.5	1.8	1.5	2.8	2.3	The Boy Next Door (2015)
109	3.0	2.9	0.6	1.0	1.8	2.7	2.3	The Vatican Tapes (2015)
88	4.0	3.6	0.6	2.0	1.2	1.2	3.2	The Loft (2015)
87	3.5	3.2	0.6	1.4	1.6	1.9	2.7	Unfinished Business (2015)
83	3.5	3.3	0.6	1.7	1.6	2.5	2.8	Sinister 2 (2015)

FINAL TASK: Visualize the distribution of ratings across all sites for the top 10 worst movies.

In [355]:

norm_scores

Out[355]:

	STARS	RATING	RT_norm	RTU_norm	Meta_norm	MetaU_norm	IMDB_norm	FILM
0	4.0	3.9	1.2	2.1	2.3	1.6	2.1	Fifty Shades of Grey (2015)
1	4.5	4.5	3.6	4.0	3.0	3.5	3.6	Jurassic World (2015)
2	5.0	4.8	3.6	4.2	3.6	3.3	3.7	American Sniper (2015)
3	5.0	4.8	4.0	4.2	3.4	3.4	3.7	Furious 7 (2015)
4	4.5	4.5	4.9	4.5	4.7	4.4	4.3	Inside Out (2015)
...	...	...	...	...	...	...	...	...
140	3.5	3.5	4.4	3.2	3.4	3.2	3.4	Kumiko, The Treasure Hunter (2015)
141	4.0	3.6	4.8	4.0	4.4	3.2	3.5	The Diary of a Teenage Girl (2015)
142	4.5	4.2	4.6	4.2	3.4	3.5	3.9	The Wrecking Crew (2015)
143	4.0	3.9	4.8	4.3	4.3	3.6	3.7	Tangerine (2015)
144	3.5	3.1	3.0	2.3	3.4	2.9	3.2	Maps to the Stars (2015)

145 rows × 8 columns

In [368]:

print('\n\n\n\n\n')
# This signmifies some space before the plot or whatever comes next

plt.figure(figsize=[12, 8], dpi=200)

worst_films= norm_scores.nsmallest(10, 'RT_norm').drop('FILM', axis=1)
sns.kdeplot(data= worst_films, shade=True, clip=[0, 5])

plt.title("Ratings for RT Critics' Worst Reviewed Films")
plt.show()

Final thoughts: Wow! Fandango is showing around 3-4 star ratings for films that are clearly bad! Notice the biggest offender, Taken 3!. Fandango is displaying 4.5 stars on their site for a film with an average rating of 1.86 across the other platforms!

In [371]:

norm_scores.iloc[25]

Out[371]:

STARS                    4.5
RATING                   4.1
RT_norm                  0.4
RTU_norm                 2.3
Meta_norm                1.3
MetaU_norm               2.3
IMDB_norm                3.0
FILM          Taken 3 (2015)
Name: 25, dtype: object

In [ ]:

	STARS	RATING	RT_norm	RTU_norm	Meta_norm	MetaU_norm	IMDB_norm
0	4.0	3.9	1.2	2.1	2.3	1.6	2.1
1	4.5	4.5	3.6	4.0	3.0	3.5	3.6
2	5.0	4.8	3.6	4.2	3.6	3.3	3.7
3	5.0	4.8	4.0	4.2	3.4	3.4	3.7
4	4.5	4.5	4.9	4.5	4.7	4.4	4.3
...	...	...	...	...	...	...	...
140	3.5	3.5	4.4	3.2	3.4	3.2	3.4
141	4.0	3.6	4.8	4.0	4.4	3.2	3.5
142	4.5	4.2	4.6	4.2	3.4	3.5	3.9
143	4.0	3.9	4.8	4.3	4.3	3.6	3.7
144	3.5	3.1	3.0	2.3	3.4	2.9	3.2

	STARS	RATING	RT_norm	RTU_norm	Meta_norm	MetaU_norm	IMDB_norm
0	4.0	3.9	1.2	2.1	2.3	1.6	2.1
1	4.5	4.5	3.6	4.0	3.0	3.5	3.6
2	5.0	4.8	3.6	4.2	3.6	3.3	3.7
3	5.0	4.8	4.0	4.2	3.4	3.4	3.7
4	4.5	4.5	4.9	4.5	4.7	4.4	4.3

	STARS	RATING	RT_norm	RTU_norm	Meta_norm	MetaU_norm	IMDB_norm
0	4.0	3.9	1.2	2.1	2.3	1.6	2.1
1	4.5	4.5	3.6	4.0	3.0	3.5	3.6
2	5.0	4.8	3.6	4.2	3.6	3.3	3.7
3	5.0	4.8	4.0	4.2	3.4	3.4	3.7
4	4.5	4.5	4.9	4.5	4.7	4.4	4.3
...	...	...	...	...	...	...	...
140	3.5	3.5	4.4	3.2	3.4	3.2	3.4
141	4.0	3.6	4.8	4.0	4.4	3.2	3.5
142	4.5	4.2	4.6	4.2	3.4	3.5	3.9
143	4.0	3.9	4.8	4.3	4.3	3.6	3.7
144	3.5	3.1	3.0	2.3	3.4	2.9	3.2

	STARS	RATING	RT_norm	RTU_norm	Meta_norm	MetaU_norm	IMDB_norm
0	4.0	3.9	1.2	2.1	2.3	1.6	2.1
1	4.5	4.5	3.6	4.0	3.0	3.5	3.6
2	5.0	4.8	3.6	4.2	3.6	3.3	3.7
3	5.0	4.8	4.0	4.2	3.4	3.4	3.7
4	4.5	4.5	4.9	4.5	4.7	4.4	4.3
...	...	...	...	...	...	...	...
140	3.5	3.5	4.4	3.2	3.4	3.2	3.4
141	4.0	3.6	4.8	4.0	4.4	3.2	3.5
142	4.5	4.2	4.6	4.2	3.4	3.5	3.9
143	4.0	3.9	4.8	4.3	4.3	3.6	3.7
144	3.5	3.1	3.0	2.3	3.4	2.9	3.2

	STARS	RATING	RT_norm	RTU_norm	Meta_norm	MetaU_norm	IMDB_norm
0	4.0	3.9	1.2	2.1	2.3	1.6	2.1
1	4.5	4.5	3.6	4.0	3.0	3.5	3.6
2	5.0	4.8	3.6	4.2	3.6	3.3	3.7
3	5.0	4.8	4.0	4.2	3.4	3.4	3.7
4	4.5	4.5	4.9	4.5	4.7	4.4	4.3

	STARS	RATING	RT_norm	RTU_norm	Meta_norm	MetaU_norm	IMDB_norm
0	4.0	3.9	1.2	2.1	2.3	1.6	2.1
1	4.5	4.5	3.6	4.0	3.0	3.5	3.6
2	5.0	4.8	3.6	4.2	3.6	3.3	3.7
3	5.0	4.8	4.0	4.2	3.4	3.4	3.7
4	4.5	4.5	4.9	4.5	4.7	4.4	4.3
...	...	...	...	...	...	...	...
140	3.5	3.5	4.4	3.2	3.4	3.2	3.4
141	4.0	3.6	4.8	4.0	4.4	3.2	3.5
142	4.5	4.2	4.6	4.2	3.4	3.5	3.9
143	4.0	3.9	4.8	4.3	4.3	3.6	3.7
144	3.5	3.1	3.0	2.3	3.4	2.9	3.2

	STARS	RATING	RT_norm	RTU_norm	Meta_norm	MetaU_norm	IMDB_norm
0	4.0	3.9	1.2	2.1	2.3	1.6	2.1
1	4.5	4.5	3.6	4.0	3.0	3.5	3.6
2	5.0	4.8	3.6	4.2	3.6	3.3	3.7
3	5.0	4.8	4.0	4.2	3.4	3.4	3.7
4	4.5	4.5	4.9	4.5	4.7	4.4	4.3
...	...	...	...	...	...	...	...
140	3.5	3.5	4.4	3.2	3.4	3.2	3.4
141	4.0	3.6	4.8	4.0	4.4	3.2	3.5
142	4.5	4.2	4.6	4.2	3.4	3.5	3.9
143	4.0	3.9	4.8	4.3	4.3	3.6	3.7
144	3.5	3.1	3.0	2.3	3.4	2.9	3.2

	STARS	RATING	RT_norm	RTU_norm	Meta_norm	MetaU_norm	IMDB_norm
0	4.0	3.9	1.2	2.1	2.3	1.6	2.1
1	4.5	4.5	3.6	4.0	3.0	3.5	3.6
2	5.0	4.8	3.6	4.2	3.6	3.3	3.7
3	5.0	4.8	4.0	4.2	3.4	3.4	3.7
4	4.5	4.5	4.9	4.5	4.7	4.4	4.3

	STARS	RATING	RT_norm	RTU_norm	Meta_norm	MetaU_norm	IMDB_norm
0	4.0	3.9	1.2	2.1	2.3	1.6	2.1
1	4.5	4.5	3.6	4.0	3.0	3.5	3.6
2	5.0	4.8	3.6	4.2	3.6	3.3	3.7
3	5.0	4.8	4.0	4.2	3.4	3.4	3.7
4	4.5	4.5	4.9	4.5	4.7	4.4	4.3
...	...	...	...	...	...	...	...
140	3.5	3.5	4.4	3.2	3.4	3.2	3.4
141	4.0	3.6	4.8	4.0	4.4	3.2	3.5
142	4.5	4.2	4.6	4.2	3.4	3.5	3.9
143	4.0	3.9	4.8	4.3	4.3	3.6	3.7
144	3.5	3.1	3.0	2.3	3.4	2.9	3.2