In [1]:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

df = pd.read_csv('vgsales.csv')

df.head()

Out[1]:

	Name	Platform	Year_of_Release	Genre	Publisher	NA_Sales	EU_Sales	JP_Sales	Other_Sales	Global_Sales	Critic_Score	Critic_Count	User_Score	User_Count	Developer	Rating
0	Wii Sports	Wii	2006.0	Sports	Nintendo	41.36	28.96	3.77	8.45	82.53	76.0	51.0	8	322.0	Nintendo	E
1	Super Mario Bros.	NES	1985.0	Platform	Nintendo	29.08	3.58	6.81	0.77	40.24	NaN	NaN	NaN	NaN	NaN	NaN
2	Mario Kart Wii	Wii	2008.0	Racing	Nintendo	15.68	12.76	3.79	3.29	35.52	82.0	73.0	8.3	709.0	Nintendo	E
3	Wii Sports Resort	Wii	2009.0	Sports	Nintendo	15.61	10.93	3.28	2.95	32.77	80.0	73.0	8	192.0	Nintendo	E
4	Pokemon Red/Pokemon Blue	GB	1996.0	Role-Playing	Nintendo	11.27	8.89	10.22	1.00	31.37	NaN	NaN	NaN	NaN	NaN	NaN

In [2]:

df.head()

Out[2]:

	Name	Platform	Year_of_Release	Genre	Publisher	NA_Sales	EU_Sales	JP_Sales	Other_Sales	Global_Sales	Critic_Score	Critic_Count	User_Score	User_Count	Developer	Rating
0	Wii Sports	Wii	2006.0	Sports	Nintendo	41.36	28.96	3.77	8.45	82.53	76.0	51.0	8	322.0	Nintendo	E
1	Super Mario Bros.	NES	1985.0	Platform	Nintendo	29.08	3.58	6.81	0.77	40.24	NaN	NaN	NaN	NaN	NaN	NaN
2	Mario Kart Wii	Wii	2008.0	Racing	Nintendo	15.68	12.76	3.79	3.29	35.52	82.0	73.0	8.3	709.0	Nintendo	E
3	Wii Sports Resort	Wii	2009.0	Sports	Nintendo	15.61	10.93	3.28	2.95	32.77	80.0	73.0	8	192.0	Nintendo	E
4	Pokemon Red/Pokemon Blue	GB	1996.0	Role-Playing	Nintendo	11.27	8.89	10.22	1.00	31.37	NaN	NaN	NaN	NaN	NaN	NaN

In [3]:

df.tail()

Out[3]:

	Name	Platform	Year_of_Release	Genre	Publisher	NA_Sales	EU_Sales	JP_Sales	Global_Sales	Critic_Score	Critic_Count	User_Score	User_Count	Developer	Rating
16714	Samurai Warriors: Sanada Maru	PS3	2016.0	Action	Tecmo Koei	0.00	0.00	0.01	0.01	NaN	NaN	NaN	NaN	NaN	NaN
16715	LMA Manager 2007	X360	2006.0	Sports	Codemasters	0.00	0.01	0.00	0.01	NaN	NaN	NaN	NaN	NaN	NaN
16716	Haitaka no Psychedelica	PSV	2016.0	Adventure	Idea Factory	0.00	0.00	0.01	0.01	NaN	NaN	NaN	NaN	NaN	NaN
16717	Spirits & Spells	GBA	2003.0	Platform	Wanadoo	0.01	0.00	0.00	0.01	NaN	NaN	NaN	NaN	NaN	NaN
16718	Winning Post 8 2016	PSV	2016.0	Simulation	Tecmo Koei	0.00	0.00	0.01	0.01	NaN	NaN	NaN	NaN	NaN	NaN

In [4]:

df.sample()

Out[4]:

	Name	Platform	Year_of_Release	Genre	Publisher	NA_Sales	EU_Sales	JP_Sales	Other_Sales	Global_Sales	Critic_Score	Critic_Count	User_Score	User_Count	Developer	Rating
15145	Toukiden 2	PS3	2016.0	Action	Tecmo Koei	0.0	0.0	0.02	0.0	0.02	NaN	NaN	NaN	NaN	NaN	NaN

In [5]:

df.shape

Out[5]:

(16719, 16)

In [6]:

data_types = df.dtypes
print(data_types)

Name                object
Platform            object
Year_of_Release    float64
Genre               object
Publisher           object
NA_Sales           float64
EU_Sales           float64
JP_Sales           float64
Other_Sales        float64
Global_Sales       float64
Critic_Score       float64
Critic_Count       float64
User_Score          object
User_Count         float64
Developer           object
Rating              object
dtype: object

In [7]:

df['Year_of_Release'] = df['Year_of_Release'].fillna(0).astype(int)

In [8]:

df['User_Score'] = pd.to_numeric(df['User_Score'], errors='coerce')

In [9]:

df['Critic_Count'] = df['Critic_Count'].fillna(0).astype(int)
df['User_Count'] = df['User_Count'].fillna(0).astype(int)

In [10]:

print("\nMissing values in the dataset:")
print(df.isnull().sum())

Missing values in the dataset:
Name                  2
Platform              0
Year_of_Release       0
Genre                 2
Publisher            54
NA_Sales              0
EU_Sales              0
JP_Sales              0
Other_Sales           0
Global_Sales          0
Critic_Score       8582
Critic_Count          0
User_Score         9129
User_Count            0
Developer          6623
Rating             6769
dtype: int64

In [11]:

# Remove rows where 'Name' or 'Genre' is missing
df.dropna(subset=['Name', 'Genre'], inplace=True)

# Replace missing 'Publisher' values with 'Unknown'
df['Publisher'].fillna('Unknown', inplace=True)

# Display the DataFrame to confirm changes
df.head()

Out[11]:

	Name	Platform	Year_of_Release	Genre	Publisher	NA_Sales	EU_Sales	JP_Sales	Other_Sales	Global_Sales	Critic_Score	Critic_Count	User_Score	User_Count	Developer	Rating
0	Wii Sports	Wii	2006	Sports	Nintendo	41.36	28.96	3.77	8.45	82.53	76.0	51	8.0	322	Nintendo	E
1	Super Mario Bros.	NES	1985	Platform	Nintendo	29.08	3.58	6.81	0.77	40.24	NaN	0	NaN	0	NaN	NaN
2	Mario Kart Wii	Wii	2008	Racing	Nintendo	15.68	12.76	3.79	3.29	35.52	82.0	73	8.3	709	Nintendo	E
3	Wii Sports Resort	Wii	2009	Sports	Nintendo	15.61	10.93	3.28	2.95	32.77	80.0	73	8.0	192	Nintendo	E
4	Pokemon Red/Pokemon Blue	GB	1996	Role-Playing	Nintendo	11.27	8.89	10.22	1.00	31.37	NaN	0	NaN	0	NaN	NaN

In [12]:

# Calculate the average difference where both scores are present
df['User_Score'] = pd.to_numeric(df['User_Score'], errors='coerce')
valid_scores = df.dropna(subset=['User_Score', 'Critic_Score'])
average_diff = (valid_scores['User_Score'] - valid_scores['Critic_Score']).mean()

# Impute missing User_Scores with Critic_Score + average_diff
missing_user = df['User_Score'].isnull() & df['Critic_Score'].notnull()
df.loc[missing_user, 'User_Score'] = df['Critic_Score'] + average_diff

# Impute missing Critic_Scores with User_Score - average_diff
missing_critic = df['Critic_Score'].isnull() & df['User_Score'].notnull()
df.loc[missing_critic, 'Critic_Score'] = df['User_Score'] - average_diff

In [13]:

print("\nStatistical details of the dataset:")
print(df.describe())

Statistical details of the dataset:
       Year_of_Release      NA_Sales      EU_Sales      JP_Sales  \
count     16717.000000  16717.000000  16717.000000  16717.000000   
mean       1974.201771      0.263255      0.145010      0.077610   
std         252.545637      0.813475      0.503303      0.308836   
min           0.000000      0.000000      0.000000      0.000000   
25%        2003.000000      0.000000      0.000000      0.000000   
50%        2007.000000      0.080000      0.020000      0.000000   
75%        2010.000000      0.240000      0.110000      0.040000   
max        2020.000000     41.360000     28.960000     10.220000   

        Other_Sales  Global_Sales  Critic_Score  Critic_Count   User_Score  \
count  16717.000000  16717.000000   8710.000000  16717.000000  8710.000000   
mean       0.047333      0.533462     69.002023     12.831130     5.934629   
std        0.186721      1.547956     13.481816     18.680383     5.311803   
min        0.000000      0.010000     13.000000      0.000000   -40.067393   
25%        0.000000      0.060000     61.000000      0.000000     5.900000   
50%        0.010000      0.170000     70.267393      0.000000     7.300000   
75%        0.030000      0.470000     79.000000     21.000000     8.200000   
max       10.570000     82.530000     98.000000    113.000000    26.932607   

         User_Count  
count  16717.000000  
mean      73.657056  
std      386.717446  
min        0.000000  
25%        0.000000  
50%        0.000000  
75%       20.000000  
max    10665.000000

In [14]:

# Normalize Critic_Score to be out of 10
df['Normalized_Critic_Score'] = df['Critic_Score'] / 10

# Fill missing values with 0 for calculation purposes
df['Normalized_Critic_Score'].fillna(0, inplace=True)
df['User_Score'].fillna(0, inplace=True)
df['Critic_Count'].fillna(0, inplace=True)
df['User_Count'].fillna(0, inplace=True)

# Calculate the weighted score
df['Weighted_Rating_Score'] = df.apply(lambda x: (x['Normalized_Critic_Score'] * x['Critic_Count'] + x['User_Score'] * x['User_Count']) / (x['Critic_Count'] + x['User_Count']) if (x['Critic_Count'] + x['User_Count']) > 0 else 0, axis=1)

In [15]:

# Exclude 'Year_of_Release' from the statistical summary
statistical_details = df.drop(columns='Year_of_Release').describe()

# Print the statistical details of the dataset excluding 'Year_of_Release'
print("\nStatistical details of the dataset (excluding 'Year_of_Release'):")
print(statistical_details)

Statistical details of the dataset (excluding 'Year_of_Release'):
           NA_Sales      EU_Sales      JP_Sales   Other_Sales  Global_Sales  \
count  16717.000000  16717.000000  16717.000000  16717.000000  16717.000000   
mean       0.263255      0.145010      0.077610      0.047333      0.533462   
std        0.813475      0.503303      0.308836      0.186721      1.547956   
min        0.000000      0.000000      0.000000      0.000000      0.010000   
25%        0.000000      0.000000      0.000000      0.000000      0.060000   
50%        0.080000      0.020000      0.000000      0.010000      0.170000   
75%        0.240000      0.110000      0.040000      0.030000      0.470000   
max       41.360000     28.960000     10.220000     10.570000     82.530000   

       Critic_Score  Critic_Count    User_Score    User_Count  \
count   8710.000000  16717.000000  16717.000000  16717.000000   
mean      69.002023     12.831130      3.092099     73.657056   
std       13.481816     18.680383      4.846648    386.717446   
min       13.000000      0.000000    -40.067393      0.000000   
25%       61.000000      0.000000      0.000000      0.000000   
50%       70.267393      0.000000      0.000000      0.000000   
75%       79.000000     21.000000      7.400000     20.000000   
max       98.000000    113.000000     26.932607  10665.000000   

       Normalized_Critic_Score  Weighted_Rating_Score  
count             16717.000000           16717.000000  
mean                  3.595188               3.580325  
std                   3.581874               3.573915  
min                   0.000000               0.000000  
25%                   0.000000               0.000000  
50%                   4.200000               4.000000  
75%                   7.100000               7.173810  
max                   9.800000               9.700000

In [16]:

platform_counts = df['Platform'].value_counts()
print("\nNumber of games per platform:")
print(platform_counts)

Number of games per platform:
PS2     2161
DS      2152
PS3     1331
Wii     1320
X360    1262
PSP     1209
PS      1197
PC       974
XB       824
GBA      822
GC       556
3DS      520
PSV      432
PS4      393
N64      319
XOne     247
SNES     239
SAT      173
WiiU     147
2600     133
NES       98
GB        98
DC        52
GEN       27
NG        12
SCD        6
WS         6
3DO        3
TG16       2
GG         1
PCFX       1
Name: Platform, dtype: int64

In [17]:

genre_counts = df['Genre'].value_counts()
print("\nNumber of games per genre:")
print(genre_counts)

Number of games per genre:
Action          3370
Sports          2348
Misc            1750
Role-Playing    1500
Shooter         1323
Adventure       1303
Racing          1249
Platform         888
Simulation       874
Fighting         849
Strategy         683
Puzzle           580
Name: Genre, dtype: int64

In [19]:

# Adding a small constant because log(0) is undefined
df['Log_Global_Sales'] = np.log(df['Global_Sales'] + 0.01)  

plt.figure(figsize=(10, 6))
sns.histplot(df['Log_Global_Sales'].dropna(), kde=False, bins=50)
plt.title('Log-transformed Distribution of Global Sales')
plt.xlabel('Log of Global Sales')
plt.show()

In [20]:

plt.figure(figsize=(10, 6))
sns.histplot(df[df['Global_Sales'] < 5]['Global_Sales'], kde=False, bins=50)  # Adjust the threshold as needed
plt.title('Distribution of Global Sales (Zoomed In)')
plt.show()

In [ ]:

In [21]:

# Sum of global sales by genre
sales_by_genre = df.groupby('Genre')['Global_Sales'].sum().sort_values(ascending=False)

plt.figure(figsize=(12, 8))
sns.barplot(x=sales_by_genre.values, y=sales_by_genre.index)
plt.title('Global Sales by Genre')
plt.xlabel('Global Sales (in millions)')
plt.ylabel('Genre')
plt.show()

Part 1

WHat were the top 5 global sales?

In [23]:

# Sort the DataFrame by 'Global_Sales' select the top 5
top_5_games = df.sort_values(by='Global_Sales', ascending=False).head(5)

# Display the top 5 games
print(top_5_games[['Name', 'Global_Sales']])

                       Name  Global_Sales
0                Wii Sports         82.53
1         Super Mario Bros.         40.24
2            Mario Kart Wii         35.52
3         Wii Sports Resort         32.77
4  Pokemon Red/Pokemon Blue         31.37

In [24]:

plt.figure(figsize=(12, 6))
sns.barplot(x='Global_Sales', y='Name', data=top_5_games, palette='viridis')
plt.title('Top 5 Games by Global Sales')
plt.xlabel('Global Sales (in millions)')
plt.ylabel('Game Name')
plt.show()

In [25]:

# Convert 'Year_of_Release' to string for concatenation
top_5_games['Year_of_Release'] = top_5_games['Year_of_Release'].astype(str)

# Create a new column 'Name_Year' that combines 'Name' and 'Year_of_Release'
top_5_games['Name_Year'] = top_5_games['Name'] + ' (' + top_5_games['Year_of_Release'] + ')'

plt.figure(figsize=(12, 6))
sns.barplot(x='Global_Sales', y='Name_Year', data=top_5_games, palette='viridis')
plt.title('Top 5 Games by Global Sales')
plt.xlabel('Global Sales (in millions)')
plt.ylabel('Game Name (Year of Release)')
plt.show()

Is there a correlation between the “na_sales” and “jp_sales” for the years 2010-2014?

In [ ]:

print(df.columns)

In [29]:

# Filter the DataFrame for the years 2010-2014
df_filtered = df[(df['Year_of_Release'] >= 2010) & (df['Year_of_Release'] <= 2014)]

# Calculate the correlation coefficient between 'NA_Sales' and 'JP_Sales'
correlation = df_filtered['NA_Sales'].corr(df_filtered['JP_Sales'])

print(f"The correlation between NA sales and JP sales for the years 2010-2014 is: {correlation}")

The correlation between NA sales and JP sales for the years 2010-2014 is: 0.26043134778810034

In [34]:

import matplotlib.pyplot as plt
import seaborn as sns

palette = sns.cubehelix_palette(start=.5, rot=-.75, as_cmap=True)

# Create a scatter plot with a regression line
plt.figure(figsize=(10, 6))

sns.regplot(x='NA_Sales', y='JP_Sales', data=df_filtered,
            scatter_kws={'alpha':0.6, 'cmap': palette}, line_kws={'color':'#2ca02c'},
            scatter=True, fit_reg=True)

plt.title('Relationship Between NA Sales and JP Sales (2010-2014)')
plt.xlabel('NA Sales (in millions)')
plt.ylabel('JP Sales (in millions)')

norm = plt.Normalize(df_filtered['NA_Sales'].min(), df_filtered['NA_Sales'].max())
sm = plt.cm.ScalarMappable(cmap=palette, norm=norm)
sm.set_array([])
plt.colorbar(sm, label='NA Sales Density')

plt.show()

C:\Users\Luke Holmes\anaconda3\Lib\site-packages\seaborn\regression.py:395: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  ax.scatter(x, y, **kws)
C:\Users\Luke Holmes\AppData\Local\Temp\ipykernel_22852\4170679984.py:20: MatplotlibDeprecationWarning: Unable to determine Axes to steal space for Colorbar. Using gca(), but will raise in the future. Either provide the *cax* argument to use as the Axes for the Colorbar, provide the *ax* argument to steal space from it, or add *mappable* to an Axes.
  plt.colorbar(sm, label='NA Sales Density')

In [35]:

plt.figure(figsize=(10, 6))
# Adding a small constant to avoid log(0)
df_filtered['Log_NA_Sales'] = np.log(df_filtered['NA_Sales'] + 0.01)
df_filtered['Log_JP_Sales'] = np.log(df_filtered['JP_Sales'] + 0.01)
sns.regplot(x='Log_NA_Sales', y='Log_JP_Sales', data=df_filtered,
            scatter_kws={'alpha':0.5}, line_kws={'color':'red'})
plt.title('Log-Transformed Relationship Between NA Sales and JP Sales (2010-2014)')
plt.xlabel('Log of NA Sales')
plt.ylabel('Log of JP Sales')
plt.show()

C:\Users\Luke Holmes\AppData\Local\Temp\ipykernel_22852\2392025704.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Log_NA_Sales'] = np.log(df_filtered['NA_Sales'] + 0.01)
C:\Users\Luke Holmes\AppData\Local\Temp\ipykernel_22852\2392025704.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Log_JP_Sales'] = np.log(df_filtered['JP_Sales'] + 0.01)

In [36]:

plt.figure(figsize=(10, 6))
sns.regplot(x='NA_Sales', y='JP_Sales', data=df_filtered,
            scatter_kws={'alpha':0.2, 's': 20}, line_kws={'color':'red'})
plt.title('Relationship Between NA Sales and JP Sales (2010-2014) with Adjusted Point Opacity and Size')
plt.xlabel('NA Sales (in millions)')
plt.ylabel('JP Sales (in millions)')
plt.show()

In [37]:

plt.figure(figsize=(10, 6))
sns.regplot(x='NA_Sales', y='JP_Sales', data=df_filtered,
            scatter_kws={'alpha':0.5}, line_kws={'color':'red'})
plt.xlim(0, 5)  # Adjust limits based on your data
plt.ylim(0, 2.5)
plt.title('Zoomed Relationship Between NA Sales and JP Sales (2010-2014)')
plt.xlabel('NA Sales (in millions)')
plt.ylabel('JP Sales (in millions)')
plt.show()

In [38]:

plt.figure(figsize=(10, 6))
# Assuming 'Genre' is a relevant variable
sns.scatterplot(x='NA_Sales', y='JP_Sales', data=df_filtered, hue='Genre', alpha=0.5, palette='Set1')
plt.title('Colored Relationship Between NA Sales and JP Sales by Genre (2010-2014)')
plt.xlabel('NA Sales (in millions)')
plt.ylabel('JP Sales (in millions)')
plt.legend(title='Genre', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

In [39]:

plt.figure(figsize=(12, 8))
# Apply log transformation and add a small constant to avoid log(0)
df_filtered['Log_NA_Sales'] = np.log(df_filtered['NA_Sales'] + 0.01)
df_filtered['Log_JP_Sales'] = np.log(df_filtered['JP_Sales'] + 0.01)

# Use scatterplot from seaborn to plot the data with color by 'Genre'
sns.scatterplot(x='Log_NA_Sales', y='Log_JP_Sales', data=df_filtered, hue='Genre', alpha=0.5, palette='Set2')

plt.title('Log-Transformed Relationship Between NA Sales and JP Sales by Genre (2010-2014)')
plt.xlabel('Log of NA Sales')
plt.ylabel('Log of JP Sales')
plt.legend(title='Genre', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

C:\Users\Luke Holmes\AppData\Local\Temp\ipykernel_22852\1098487685.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Log_NA_Sales'] = np.log(df_filtered['NA_Sales'] + 0.01)
C:\Users\Luke Holmes\AppData\Local\Temp\ipykernel_22852\1098487685.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Log_JP_Sales'] = np.log(df_filtered['JP_Sales'] + 0.01)

What is the distribution of the most popular 4 game genres?

In [44]:

genre_sales = df.groupby('Genre')['Global_Sales'].sum().sort_values(ascending=False)

In [45]:

top_4_genres = genre_sales.head(4).index

In [46]:

top_genres_df = df[df['Genre'].isin(top_4_genres)]

In [47]:

plt.figure(figsize=(10, 6))
sns.boxplot(x='Genre', y='Global_Sales', data=top_genres_df)
plt.title('Distribution of Global Sales for Top 4 Game Genres')
plt.xlabel('Genre')
plt.ylabel('Global Sales (in millions)')
plt.show()

In [48]:

plt.figure(figsize=(10, 6))
sns.violinplot(x='Genre', y='Global_Sales', data=top_genres_df)
plt.title('Distribution of Global Sales for Top 4 Game Genres')
plt.xlabel('Genre')
plt.ylabel('Global Sales (in millions)')
plt.show()

In [54]:

top_genres_df['Log_Global_Sales'] = np.log1p(top_genres_df['Global_Sales'])

plt.figure(figsize=(10, 6))
sns.violinplot(x='Genre', y='Log_Global_Sales', data=top_genres_df, palette='Set2')

plt.title('Log-Transformed Distribution of Global Sales for Top 4 Game Genres')
plt.xlabel('Genre')
plt.ylabel('Log of Global Sales (in millions)')

plt.show()

C:\Users\Luke Holmes\AppData\Local\Temp\ipykernel_22852\689221707.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_genres_df['Log_Global_Sales'] = np.log1p(top_genres_df['Global_Sales'])

In [56]:

plt.figure(figsize=(12, 6))
sns.swarmplot(x='Genre', y='Log_Global_Sales', data=top_genres_df, palette='Set2')
plt.title('Swarm Plot of Log-Transformed Global Sales for Top 4 Game Genres')
plt.xlabel('Genre')
plt.ylabel('Log of Global Sales (in millions)')
plt.show()

C:\Users\Luke Holmes\AppData\Local\Temp\ipykernel_22852\86292373.py:2: FutureWarning: Passing `palette` without assigning `hue` is deprecated.
  sns.swarmplot(x='Genre', y='Log_Global_Sales', data=top_genres_df, palette='Set2')
C:\Users\Luke Holmes\anaconda3\Lib\site-packages\seaborn\categorical.py:3544: UserWarning: 74.6% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
  warnings.warn(msg, UserWarning)
C:\Users\Luke Holmes\anaconda3\Lib\site-packages\seaborn\categorical.py:3544: UserWarning: 65.5% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
  warnings.warn(msg, UserWarning)
C:\Users\Luke Holmes\anaconda3\Lib\site-packages\seaborn\categorical.py:3544: UserWarning: 57.1% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
  warnings.warn(msg, UserWarning)
C:\Users\Luke Holmes\anaconda3\Lib\site-packages\seaborn\categorical.py:3544: UserWarning: 80.1% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
  warnings.warn(msg, UserWarning)
C:\Users\Luke Holmes\anaconda3\Lib\site-packages\seaborn\categorical.py:3544: UserWarning: 77.9% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
  warnings.warn(msg, UserWarning)
C:\Users\Luke Holmes\anaconda3\Lib\site-packages\seaborn\categorical.py:3544: UserWarning: 69.4% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
  warnings.warn(msg, UserWarning)
C:\Users\Luke Holmes\anaconda3\Lib\site-packages\seaborn\categorical.py:3544: UserWarning: 61.8% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
  warnings.warn(msg, UserWarning)
C:\Users\Luke Holmes\anaconda3\Lib\site-packages\seaborn\categorical.py:3544: UserWarning: 82.5% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
  warnings.warn(msg, UserWarning)

In [57]:

plt.figure(figsize=(12, 6))
sns.stripplot(x='Genre', y='Log_Global_Sales', data=top_genres_df, palette='Set2', jitter=True, alpha=0.5)
plt.title('Strip Plot of Log-Transformed Global Sales for Top 4 Game Genres')
plt.xlabel('Genre')
plt.ylabel('Log of Global Sales (in millions)')
plt.show()

C:\Users\Luke Holmes\AppData\Local\Temp\ipykernel_22852\1324788717.py:2: FutureWarning: Passing `palette` without assigning `hue` is deprecated.
  sns.stripplot(x='Genre', y='Log_Global_Sales', data=top_genres_df, palette='Set2', jitter=True, alpha=0.5)

In [58]:

plt.figure(figsize=(12, 6))
sns.pointplot(x='Genre', y='Log_Global_Sales', data=top_genres_df, capsize=.2, palette='Set2')
plt.title('Point Plot of Log-Transformed Global Sales for Top 4 Game Genres')
plt.xlabel('Genre')
plt.ylabel('Log of Global Sales (in millions)')
plt.show()

In [59]:

g = sns.FacetGrid(top_genres_df, col='Genre', col_wrap=2, height=4, aspect=1.5)
g.map(sns.histplot, 'Log_Global_Sales', kde=True, bins=15, color='skyblue')
g.add_legend()
g.set_titles('{col_name} Genre')
g.set_axis_labels('Log of Global Sales (in millions)', 'Count')
plt.show()

Do older games (2005 and earlier) have a higher MEAN “eu_sales” than newer games (after 2005)?

In [61]:

# Group the dataset into older and newer games
older_games = df[df['Year_of_Release'] <= 2005]
newer_games = df[df['Year_of_Release'] > 2005]

# Calculate the mean EU sales for each group
mean_eu_sales_older = older_games['EU_Sales'].mean()
mean_eu_sales_newer = newer_games['EU_Sales'].mean()

print(f"Mean EU Sales for Older Games (2005 and earlier): {mean_eu_sales_older:.2f}")
print(f"Mean EU Sales for Newer Games (after 2005): {mean_eu_sales_newer:.2f}")

# Compare the means
if mean_eu_sales_older > mean_eu_sales_newer:
    print("Older games (2005 and earlier) have higher mean EU sales than newer games.")
elif mean_eu_sales_older < mean_eu_sales_newer:
    print("Newer games (after 2005) have higher mean EU sales than older games.")
else:
    print("Mean EU sales are the same for older and newer games.")

Mean EU Sales for Older Games (2005 and earlier): 0.15
Mean EU Sales for Newer Games (after 2005): 0.14
Older games (2005 and earlier) have higher mean EU sales than newer games.

In [64]:

import matplotlib.pyplot as plt
import seaborn as sns

# Data for plotting
categories = ['Games (≤2005)', 'Games (>2005)']
mean_sales = [mean_eu_sales_older, mean_eu_sales_newer]

plt.figure(figsize=(8, 6))

# Create a bar plot
sns.barplot(x=categories, y=mean_sales, palette='coolwarm')

# Add titles and labels
plt.title('Comparison of Mean EU Sales')
plt.ylabel('Mean EU Sales (in millions)')
plt.xlabel('Game Category')

# Display the values on the bars
for i, value in enumerate(mean_sales):
    plt.text(i, value + 0.01, f"{value:.2f}", ha='center', va='bottom')

plt.show()

In [66]:

plt.figure(figsize=(8, 6))
sns.barplot(x=categories, y=mean_sales, palette='coolwarm')
plt.ylim(0.13, 0.16)  # Adjust the limits based on your data to zoom in
plt.title('Comparison of Mean EU Sales for Older vs. Newer Games')
plt.ylabel('Mean EU Sales (in millions)')
plt.xlabel('Game Category')
plt.show()

In [69]:

plt.figure(figsize=(8, 6))
sns.barplot(x=categories, y=mean_sales, palette='coolwarm')

plt.ylim(0.14, 0.155)

plt.title('Comparison of Mean EU Sales for Older vs. Newer Games')
plt.ylabel('Mean EU Sales (in millions)')
plt.xlabel('Game Category')

# Add horizontal grid lines for better readability
plt.grid(axis='y', linestyle='--', alpha=0.7)

plt.show()

What are the 3 most common “developer” in the dataset?

In [71]:

# Get the counts of each unique developer and select the top 3
top_3_developers = df['Developer'].value_counts().head(3)

print(top_3_developers)

Ubisoft      204
EA Sports    172
EA Canada    167
Name: Developer, dtype: int64

In [75]:

import matplotlib.pyplot as plt
import seaborn as sns

top_3_developers = pd.Series([204, 172, 167], index=['Ubisoft', 'EA Sports', 'EA Canada'])

plt.figure(figsize=(10, 6))

sns.barplot(x=top_3_developers.values, y=top_3_developers.index, palette='viridis')

for i, value in enumerate(top_3_developers.values):
    plt.text(value + 1, i, f'{value}', va='center')  # Adding a small offset (+1) for better visibility

plt.xlim(min(top_3_developers.values) - 5, max(top_3_developers.values) + 5)

plt.title('Top 3 Most Common Game Developers')
plt.xlabel('Number of Games Developed')
plt.ylabel('Developer')

plt.show()

Part 2:

How do the dynamics of game genre preferences, regional sales patterns, and review scores collectively impact the global sales of video games, and which of these factors most strongly predict market success?

In [76]:

# Aggregate sales by genre and region
genre_region_sales = df.groupby('Genre')[['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']].sum()

# Visualize genre popularity in different regions with a bar chart
genre_region_sales.plot(kind='bar', figsize=(14, 8), title='Genre Popularity by Region')
plt.ylabel('Sales (in millions)')
plt.show()

In [77]:

# Scatter plot for Critic Scores vs Global Sales
sns.regplot(x='Critic_Score', y='Global_Sales', data=df, scatter_kws={'alpha':0.3})
plt.title('Critic Score vs Global Sales')
plt.show()

# Scatter plot for User Scores vs Global Sales
sns.regplot(x='User_Score', y='Global_Sales', data=df, scatter_kws={'alpha':0.3})
plt.title('User Score vs Global Sales')
plt.show()

Step 1: Aggregate Data by Genre and Region with Weighted Score

In [79]:

# Calculate the mean weighted rating score and sales by genre
genre_analysis = df.groupby('Genre').agg({
    'Weighted_Rating_Score': 'mean',
    'NA_Sales': 'sum',
    'EU_Sales': 'sum',
    'JP_Sales': 'sum',
    'Other_Sales': 'sum'
}).reset_index()

In [80]:

regions = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']

for region in regions:
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x='Weighted_Rating_Score', y=region, data=genre_analysis, hue='Genre', s=100)
    plt.title(f'Genre Weighted Rating Score vs. {region}')
    plt.xlabel('Average Weighted Rating Score')
    plt.ylabel(f'Total Sales in {region} (in millions)')
    plt.legend(title='Genre', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.show()

In [81]:

for region in regions:
    correlation = genre_analysis['Weighted_Rating_Score'].corr(genre_analysis[region])
    print(f'Correlation between Weighted Rating Score and {region}: {correlation:.2f}')

Correlation between Weighted Rating Score and NA_Sales: 0.58
Correlation between Weighted Rating Score and EU_Sales: 0.57
Correlation between Weighted Rating Score and JP_Sales: 0.14
Correlation between Weighted Rating Score and Other_Sales: 0.53

Why is Japan Different?

In [82]:

# Aggregate sales data by platform for each region
platform_sales = df.groupby('Platform').agg({
    'JP_Sales': 'sum',
    'NA_Sales': 'sum',
    'EU_Sales': 'sum'
}).reset_index()

In [83]:

import matplotlib.pyplot as plt

# Plotting platform sales in Japan
plt.figure(figsize=(12, 8))
platform_sales.sort_values('JP_Sales', ascending=False).plot(x='Platform', y='JP_Sales', kind='bar', color='skyblue')
plt.title('Game Sales by Platform in Japan')
plt.xlabel('Platform')
plt.ylabel('Total Sales (in millions)')
plt.xticks(rotation=45)
plt.show()

# Plotting platform sales in North America
plt.figure(figsize=(12, 8))
platform_sales.sort_values('NA_Sales', ascending=False).plot(x='Platform', y='NA_Sales', kind='bar', color='orange')
plt.title('Game Sales by Platform in North America')
plt.xlabel('Platform')
plt.ylabel('Total Sales (in millions)')
plt.xticks(rotation=45)
plt.show()

# Plotting platform sales in Europe
plt.figure(figsize=(12, 8))
platform_sales.sort_values('EU_Sales', ascending=False).plot(x='Platform', y='EU_Sales', kind='bar', color='green')
plt.title('Game Sales by Platform in Europe')
plt.xlabel('Platform')
plt.ylabel('Total Sales (in millions)')
plt.xticks(rotation=45)
plt.show()

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

In [84]:

# Print top platforms in Japan
print("Top Platforms in Japan:")
print(platform_sales[['Platform', 'JP_Sales']].sort_values('JP_Sales', ascending=False).head())

# Print top platforms in North America
print("\nTop Platforms in North America:")
print(platform_sales[['Platform', 'NA_Sales']].sort_values('NA_Sales', ascending=False).head())

# Print top platforms in Europe
print("\nTop Platforms in Europe:")
print(platform_sales[['Platform', 'EU_Sales']].sort_values('EU_Sales', ascending=False).head())

Top Platforms in Japan:
   Platform  JP_Sales
4        DS    175.57
15       PS    139.82
16      PS2    139.20
23     SNES    116.55
2       3DS    100.67

Top Platforms in North America:
   Platform  NA_Sales
28     X360    602.47
16      PS2    583.84
26      Wii    496.90
17      PS3    393.49
4        DS    382.67

Top Platforms in Europe:
   Platform  EU_Sales
16      PS2    339.29
17      PS3    330.29
28     X360    270.76
26      Wii    262.21
15       PS    213.61

In [85]:

# Top platforms in Japan excluding the bottom 8
top_platforms_japan = platform_sales.sort_values('JP_Sales', ascending=False).head(len(platform_sales) - 8)

plt.figure(figsize=(12, 8))
top_platforms_japan.plot(x='Platform', y='JP_Sales', kind='bar', color='skyblue')
plt.title('Top Game Sales by Platform in Japan')
plt.xlabel('Platform')
plt.ylabel('Total Sales (in millions)')
plt.xticks(rotation=45)
plt.show()

<Figure size 1200x800 with 0 Axes>

In [86]:

# Top platforms in North America excluding the bottom 8
top_platforms_na = platform_sales.sort_values('NA_Sales', ascending=False).head(len(platform_sales) - 8)

plt.figure(figsize=(12, 8))
top_platforms_na.plot(x='Platform', y='NA_Sales', kind='bar', color='orange')
plt.title('Top Game Sales by Platform in North America')
plt.xlabel('Platform')
plt.ylabel('Total Sales (in millions)')
plt.xticks(rotation=45)
plt.show()

<Figure size 1200x800 with 0 Axes>

In [87]:

# Top platforms in Europe excluding the bottom 8
top_platforms_europe = platform_sales.sort_values('EU_Sales', ascending=False).head(len(platform_sales) - 8)

plt.figure(figsize=(12, 8))
top_platforms_europe.plot(x='Platform', y='EU_Sales', kind='bar', color='green')
plt.title('Top Game Sales by Platform in Europe')
plt.xlabel('Platform')
plt.ylabel('Total Sales (in millions)')
plt.xticks(rotation=45)
plt.show()

<Figure size 1200x800 with 0 Axes>

In [94]:

# Sort by sales and exclude the bottom 8 platforms for each region
top_platforms_jp = platform_sales.sort_values('JP_Sales', ascending=False).head(-11)
top_platforms_na = platform_sales.sort_values('NA_Sales', ascending=False).head(-11)
top_platforms_eu = platform_sales.sort_values('EU_Sales', ascending=False).head(-11)

In [95]:

fig, ax = plt.subplots(1, 3, figsize=(18, 6))

# Japan
top_platforms_jp.plot(ax=ax[0], x='Platform', y='JP_Sales', kind='bar', color='skyblue')
ax[0].set_title('Top Game Platforms in Japan')
ax[0].set_xlabel('Platform')
ax[0].set_ylabel('Total Sales (in millions)')
ax[0].tick_params(axis='x', rotation=45)

# North America
top_platforms_na.plot(ax=ax[1], x='Platform', y='NA_Sales', kind='bar', color='orange')
ax[1].set_title('Top Game Platforms in North America')
ax[1].set_xlabel('Platform')
ax[1].set_ylabel('Total Sales (in millions)')
ax[1].tick_params(axis='x', rotation=45)

# Europe
top_platforms_eu.plot(ax=ax[2], x='Platform', y='EU_Sales', kind='bar', color='green')
ax[2].set_title('Top Game Platforms in Europe')
ax[2].set_xlabel('Platform')
ax[2].set_ylabel('Total Sales (in millions)')
ax[2].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [100]:

# Exclude 'DC', 'GEN', and '2600' platforms from the combined data
excluded_platforms = ['DC', 'GEN', '2600', 'SAT']
filtered_combined_platforms = combined_platforms[~combined_platforms['Platform'].isin(excluded_platforms)]
plt.figure(figsize=(14, 8))

# Recalculate positions for the updated set of platforms
positions = np.arange(len(filtered_combined_platforms['Platform']))

# Stacked bar chart with the updated filtered data
plt.bar(positions - width, filtered_combined_platforms['JP_Sales'], width, label='Japan', color='skyblue')
plt.bar(positions, filtered_combined_platforms['NA_Sales'], width, label='North America', color='orange')
plt.bar(positions + width, filtered_combined_platforms['EU_Sales'], width, label='Europe', color='green')

plt.title('Comparative Game Sales by Top Platforms Across Regions')
plt.xlabel('Platform')
plt.ylabel('Total Sales (in millions)')
plt.xticks(positions, filtered_combined_platforms['Platform'], rotation=45)
plt.legend()

plt.show()

In [101]:

top5_jp = platform_sales.sort_values('JP_Sales', ascending=False).head(5)['Platform']
top5_na = platform_sales.sort_values('NA_Sales', ascending=False).head(5)['Platform']
top5_eu = platform_sales.sort_values('EU_Sales', ascending=False).head(5)['Platform']

In [102]:

import seaborn as sns
import matplotlib.pyplot as plt

def plot_genre_preferences(region_top_platforms, region_sales_col, region_name):
    # Filter data for the top platforms in the region
    df_top_platforms = df[df['Platform'].isin(region_top_platforms)]

    # Aggregate sales by platform and genre
    genre_sales = df_top_platforms.groupby(['Platform', 'Genre'])[region_sales_col].sum().unstack().fillna(0)

    # Plot
    genre_sales.plot(kind='bar', stacked=True, figsize=(14, 8), colormap='viridis')
    plt.title(f'Genre Preferences for Top Platforms in {region_name}')
    plt.xlabel('Platform')
    plt.ylabel(f'Total Sales in {region_name} (in millions)')
    plt.legend(title='Genre', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.xticks(rotation=45)
    plt.show()

# Plot genre preferences for top platforms in Japan
plot_genre_preferences(top5_jp, 'JP_Sales', 'Japan')

In [105]:

def plot_ratings_impact(region_top_platforms, region_sales_col, region_name):
    # Filter data for top platforms
    df_top_platforms = df[df['Platform'].isin(region_top_platforms)]

    # Plot
    plt.figure(figsize=(14, 8))
    sns.scatterplot(data=df_top_platforms, x='Critic_Score', y=region_sales_col, hue='Platform', style='Platform', alpha=0.6)
    plt.title(f'Impact of Critic Scores on Sales in {region_name}')
    plt.xlabel('Critic Score')
    plt.ylabel(f'Total Sales in {region_name} (in millions)')
    plt.legend(title='Platform', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.show()

# Plot the impact of game ratings on sales for top platforms in Japan
plot_ratings_impact(top5_jp, 'JP_Sales', 'Japan')

In [106]:

# Define thresholds for high ratings and high sales
critic_score_threshold = df['Critic_Score'].quantile(0.75)
user_score_threshold = df['User_Score'].quantile(0.75)
sales_threshold = df['Global_Sales'].quantile(0.75)

In [107]:

highly_rated_and_high_sales = df[
    (df['Critic_Score'] >= critic_score_threshold) & 
    (df['User_Score'] >= user_score_threshold) & 
    (df['Global_Sales'] >= sales_threshold)
]

In [108]:

summary = highly_rated_and_high_sales[['Critic_Score', 'User_Score', 'Global_Sales']].describe()
print(summary)

       Critic_Score   User_Score  Global_Sales
count   1044.000000  1044.000000   1044.000000
mean      85.851533     8.454079      2.247787
std        4.632826     1.445498      3.103293
min       79.000000     7.400000      0.470000
25%       82.000000     7.900000      0.777500
50%       85.000000     8.300000      1.270000
75%       89.000000     8.700000      2.482500
max       98.000000    26.932607     35.520000

In [109]:

plt.figure(figsize=(10, 6))
sns.scatterplot(data=highly_rated_and_high_sales, x='Critic_Score', y='Global_Sales', alpha=0.6)
plt.title('Critic Score vs. Global Sales for Top-Performing Games')
plt.xlabel('Critic Score')
plt.ylabel('Global Sales (in millions)')
plt.show()

In [112]:

# Define thresholds for high ratings and high sales
high_rating_threshold = 80  # Adjust based on your scoring scale
high_sales_threshold = df['Global_Sales'].quantile(0.75)  # Top 25% of sales

# Filter games that are both highly rated and have high sales
highly_rated_and_high_sales_games = df[(df['Critic_Score'] > high_rating_threshold) & (df['Global_Sales'] > high_sales_threshold)]

In [113]:

# Aggregate the number of games by genre
top_genres = highly_rated_and_high_sales_games['Genre'].value_counts()

In [114]:

fig, ax = plt.subplots(figsize=(10, 6))

top_genres.plot(kind='bar', ax=ax, color='skyblue')
ax.set_title('Top Genres Among Highly Rated and High Sales Games')
ax.set_ylabel('Number of Games')
ax.set_xlabel('Genre')

plt.xticks(rotation=45)
plt.show()

Highly Rated and Low Sales

In [115]:

# Define the threshold for highest-rated games based on critic scores
rating_threshold = df['Critic_Score'].quantile(0.75)  # Top 25% of scores

# Filter highest-rated games
highest_rated_games = df[df['Critic_Score'] >= rating_threshold]

In [116]:

# Japan
least_popular_highest_rated_jp = highest_rated_games.sort_values('JP_Sales').head(5)

# North America
least_popular_highest_rated_na = highest_rated_games.sort_values('NA_Sales').head(5)

# Europe
least_popular_highest_rated_eu = highest_rated_games.sort_values('EU_Sales').head(5)

In [117]:

# Display least popular highest-rated games in Japan
print("Least Popular Highest-Rated Games in Japan:")
print(least_popular_highest_rated_jp[['Name', 'Critic_Score', 'JP_Sales']])

# Display least popular highest-rated games in North America
print("\nLeast Popular Highest-Rated Games in North America:")
print(least_popular_highest_rated_na[['Name', 'Critic_Score', 'NA_Sales']])

# Display least popular highest-rated games in Europe
print("\nLeast Popular Highest-Rated Games in Europe:")
print(least_popular_highest_rated_eu[['Name', 'Critic_Score', 'EU_Sales']])

Least Popular Highest-Rated Games in Japan:
                                        Name  Critic_Score  JP_Sales
3171    X-Men Legends II: Rise of Apocalypse          82.0       0.0
5170                           Left 4 Dead 2          89.0       0.0
5165  Sid Meier's Civilization: Beyond Earth          81.0       0.0
5162                       Elite Beat Agents          87.0       0.0
5156                           NBA Live 2003          82.0       0.0

Least Popular Highest-Rated Games in North America:
                                       Name  Critic_Score  NA_Sales
16696     Metal Gear Solid V: Ground Zeroes          80.0       0.0
13440  Silent Hunter: Wolves of the Pacific          79.0       0.0
7521                   Phantasy Star Online          89.0       0.0
13431                     Company of Heroes          93.0       0.0
7643                             Grandia II          90.0       0.0

Least Popular Highest-Rated Games in Europe:
                                            Name  Critic_Score  EU_Sales
10450  Winning Eleven: Pro Evolution Soccer 2007          79.0       0.0
9417                                   Ninjatown          80.0       0.0
9280                    Persona 4: Arena Ultimax          84.0       0.0
9204                           College Hoops 2K8          82.0       0.0
9024                     Geometry Wars: Galaxies          79.0       0.0

In [118]:

# Combine the lists and drop duplicates
combined_games = pd.concat([least_popular_highest_rated_jp, least_popular_highest_rated_na, least_popular_highest_rated_eu]).drop_duplicates(subset=['Name'])

# Reset index
combined_games.reset_index(drop=True, inplace=True)

In [119]:

# Display sales across regions for the combined list of games
print("Sales Across Regions for Least Popular Highest-Rated Games:")
print(combined_games[['Name', 'JP_Sales', 'NA_Sales', 'EU_Sales']])

Sales Across Regions for Least Popular Highest-Rated Games:
                                         Name  JP_Sales  NA_Sales  EU_Sales
0        X-Men Legends II: Rise of Apocalypse      0.00      0.53      0.02
1                               Left 4 Dead 2      0.00      0.00      0.32
2      Sid Meier's Civilization: Beyond Earth      0.00      0.11      0.22
3                           Elite Beat Agents      0.00      0.30      0.03
4                               NBA Live 2003      0.00      0.31      0.04
5           Metal Gear Solid V: Ground Zeroes      0.00      0.00      0.01
6        Silent Hunter: Wolves of the Pacific      0.00      0.00      0.04
7                        Phantasy Star Online      0.20      0.00      0.00
8                           Company of Heroes      0.00      0.00      0.04
9                                  Grandia II      0.20      0.00      0.00
10  Winning Eleven: Pro Evolution Soccer 2007      0.00      0.09      0.00
11                                  Ninjatown      0.00      0.12      0.00
12                   Persona 4: Arena Ultimax      0.13      0.00      0.00
13                          College Hoops 2K8      0.00      0.13      0.00
14                    Geometry Wars: Galaxies      0.00      0.13      0.00

In [122]:

# Calculate the variance in sales across regions to find games with the most significant differences
combined_games['Sales_Variance'] = combined_games[['JP_Sales', 'NA_Sales', 'EU_Sales']].var(axis=1)

# Sort by 'Sales_Variance' and select the top N games
top_n_games = combined_games.sort_values('Sales_Variance', ascending=False).head(10)

# Plotting sales for the top N games with the most significant differences across regions
top_n_games[['JP_Sales', 'NA_Sales', 'EU_Sales']].plot(kind='barh', figsize=(10, 8), width=0.75)
plt.title('Top Games with Significant Regional Sales Differences')
plt.xlabel('Sales (in millions)')
plt.ylabel('Game Name')
plt.legend(title='Region')
plt.tight_layout()
plt.show()

In [123]:

# Example mappings
platform_to_brand = {
    'Wii': 'Nintendo', 'NES': 'Nintendo', 'GB': 'Nintendo', 'DS': 'Nintendo', 'SNES': 'Nintendo',
    '3DS': 'Nintendo', 'N64': 'Nintendo', 'GBA': 'Nintendo', 'GC': 'Nintendo', 'WiiU': 'Nintendo', 'Switch': 'Nintendo',
    'PS': 'Sony', 'PS2': 'Sony', 'PS3': 'Sony', 'PS4': 'Sony', 'PSP': 'Sony', 'PSV': 'Sony',
    'X360': 'Microsoft', 'XB': 'Microsoft', 'XOne': 'Microsoft',
    # Add other platforms and their corresponding brands
}

In [124]:

df['Console_Brand'] = df['Platform'].map(platform_to_brand)

In [125]:

brand_sales_by_region = df.groupby('Console_Brand').agg({
    'JP_Sales': 'sum',
    'NA_Sales': 'sum',
    'EU_Sales': 'sum',
}).reset_index()

In [126]:

# Stacked bar chart for sales by console brand in each region
brand_sales_by_region.set_index('Console_Brand')[['JP_Sales', 'NA_Sales', 'EU_Sales']].plot(kind='bar', stacked=True, figsize=(12, 8))
plt.title('Sales by Console Brand Across Regions')
plt.xlabel('Console Brand')
plt.ylabel('Total Sales (in millions)')
plt.legend(title='Region')
plt.xticks(rotation=45)
plt.show()

# Alternatively, use a grouped bar chart for a side-by-side comparison
brand_sales_by_region.plot(x='Console_Brand', kind='bar', figsize=(12, 8))
plt.title('Sales by Console Brand Across Regions')
plt.xlabel('Console Brand')
plt.ylabel('Total Sales (in millions)')
plt.xticks(rotation=45)
plt.legend(title='Region')
plt.show()

In [127]:

brands = ['Nintendo', 'Sony', 'Microsoft']
regions = ['JP_Sales', 'NA_Sales', 'EU_Sales']

top_games_by_brand_and_region = {}

for brand in brands:
    for region in regions:
        top_games = df[df['Console_Brand'] == brand].sort_values(by=region, ascending=False).head(5)[['Name', region]]
        key = f'{brand} - {region}'
        top_games_by_brand_and_region[key] = top_games

# Print the top-selling games for each brand in each region
for key, value in top_games_by_brand_and_region.items():
    print(f'\nTop Selling Games for {key}:')
    print(value)

Top Selling Games for Nintendo - JP_Sales:
                             Name  JP_Sales
4        Pokemon Red/Pokemon Blue     10.22
12    Pokemon Gold/Pokemon Silver      7.20
1               Super Mario Bros.      6.81
6           New Super Mario Bros.      6.50
20  Pokemon Diamond/Pokemon Pearl      6.04

Top Selling Games for Nintendo - NA_Sales:
                Name  NA_Sales
0         Wii Sports     41.36
1  Super Mario Bros.     29.08
9          Duck Hunt     26.93
5             Tetris     23.20
2     Mario Kart Wii     15.68

Top Selling Games for Nintendo - EU_Sales:
                                            Name  EU_Sales
0                                     Wii Sports     28.96
2                                 Mario Kart Wii     12.76
10                                    Nintendogs     10.95
3                              Wii Sports Resort     10.93
19  Brain Age: Train Your Brain in Minutes a Day      9.20

Top Selling Games for Sony - JP_Sales:
                                              Name  JP_Sales
215                       Monster Hunter Freedom 3      4.87
163                   Monster Hunter Freedom Unite      4.13
244             Dragon Quest VII: Warriors of Eden      4.10
88                              Final Fantasy VIII      3.63
186  Dragon Quest VIII: Journey of the Cursed King      3.61

Top Selling Games for Sony - NA_Sales:
                             Name  NA_Sales
17  Grand Theft Auto: San Andreas      9.43
24    Grand Theft Auto: Vice City      8.41
16             Grand Theft Auto V      7.02
38           Grand Theft Auto III      6.99
28         Gran Turismo 3: A-Spec      6.85

Top Selling Games for Sony - EU_Sales:
                         Name  EU_Sales
16         Grand Theft Auto V      9.09
42         Grand Theft Auto V      6.31
77                    FIFA 16      6.12
31  Call of Duty: Black Ops 3      5.86
94                    FIFA 17      5.75

Top Selling Games for Microsoft - JP_Sales:
                                   Name  JP_Sales
14                   Kinect Adventures!      0.24
987                     Dead or Alive 3      0.24
2044  Ace Combat 6: Fires of Liberation      0.22
2262                        Blue Dragon      0.21
2608          Star Ocean: The Last Hope      0.21

Top Selling Games for Microsoft - NA_Sales:
                              Name  NA_Sales
14              Kinect Adventures!     15.00
32         Call of Duty: Black Ops      9.70
23              Grand Theft Auto V      9.66
29  Call of Duty: Modern Warfare 3      9.04
36  Call of Duty: Modern Warfare 2      8.52

Top Selling Games for Microsoft - EU_Sales:
                              Name  EU_Sales
23              Grand Theft Auto V      5.14
14              Kinect Adventures!      4.89
29  Call of Duty: Modern Warfare 3      4.24
35      Call of Duty: Black Ops II      4.24
32         Call of Duty: Black Ops      3.68

In [137]:

data_microsoft_jp = {
    'Name': ['Kinect Adventures!', 'Dead or Alive 3', 'Ace Combat 6: Fires of Liberation', 'Blue Dragon', 'Star Ocean: The Last Hope'],
    'JP_Sales': [0.24, 0.24, 0.22, 0.21, 0.21]
}
df_microsoft_jp = pd.DataFrame(data_microsoft_jp)

data_microsoft_na = {
    'Name': ['Kinect Adventures!', 'Call of Duty: Black Ops', 'Grand Theft Auto V', 'Call of Duty: Modern Warfare 3', 'Call of Duty: Modern Warfare 2'],
    'NA_Sales': [15.00, 9.70, 9.66, 9.04, 8.52]
}
df_microsoft_na = pd.DataFrame(data_microsoft_na)

data_microsoft_eu = {
    'Name': ['Grand Theft Auto V', 'Kinect Adventures!', 'Call of Duty: Modern Warfare 3', 'Call of Duty: Black Ops II', 'Call of Duty: Black Ops'],
    'EU_Sales': [5.14, 4.89, 4.24, 4.24, 3.68]
}
df_microsoft_eu = pd.DataFrame(data_microsoft_eu)

In [146]:

import matplotlib.pyplot as plt

%matplotlib inline

def plot_sales(dataframe, title, sales_column):
    fig, ax = plt.subplots(figsize=(10, 6))
    dataframe.plot(kind='bar', x='Name', y=sales_column, ax=ax, legend=False, color='skyblue')
    ax.set_title(title)
    ax.set_ylabel('Sales (in millions)')
    ax.set_xlabel('')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

In [145]:

# Plot for Microsoft in Japan
plot_sales(df_microsoft_jp, 'Top Selling Microsoft Games in Japan', 'JP_Sales')

# Plot for Microsoft in North America
plot_sales(df_microsoft_na, 'Top Selling Microsoft Games in North America', 'NA_Sales')

# Plot for Microsoft in Europe
plot_sales(df_microsoft_eu, 'Top Selling Microsoft Games in Europe', 'EU_Sales')
plt.show

Out[145]:

<function matplotlib.pyplot.show(close=None, block=None)>

In [ ]: