import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
df = pd.read_csv('vgsales.csv')
df.head()
Name | Platform | Year_of_Release | Genre | Publisher | NA_Sales | EU_Sales | JP_Sales | Other_Sales | Global_Sales | Critic_Score | Critic_Count | User_Score | User_Count | Developer | Rating | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Wii Sports | Wii | 2006.0 | Sports | Nintendo | 41.36 | 28.96 | 3.77 | 8.45 | 82.53 | 76.0 | 51.0 | 8 | 322.0 | Nintendo | E |
1 | Super Mario Bros. | NES | 1985.0 | Platform | Nintendo | 29.08 | 3.58 | 6.81 | 0.77 | 40.24 | NaN | NaN | NaN | NaN | NaN | NaN |
2 | Mario Kart Wii | Wii | 2008.0 | Racing | Nintendo | 15.68 | 12.76 | 3.79 | 3.29 | 35.52 | 82.0 | 73.0 | 8.3 | 709.0 | Nintendo | E |
3 | Wii Sports Resort | Wii | 2009.0 | Sports | Nintendo | 15.61 | 10.93 | 3.28 | 2.95 | 32.77 | 80.0 | 73.0 | 8 | 192.0 | Nintendo | E |
4 | Pokemon Red/Pokemon Blue | GB | 1996.0 | Role-Playing | Nintendo | 11.27 | 8.89 | 10.22 | 1.00 | 31.37 | NaN | NaN | NaN | NaN | NaN | NaN |
df.head()
Name | Platform | Year_of_Release | Genre | Publisher | NA_Sales | EU_Sales | JP_Sales | Other_Sales | Global_Sales | Critic_Score | Critic_Count | User_Score | User_Count | Developer | Rating | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Wii Sports | Wii | 2006.0 | Sports | Nintendo | 41.36 | 28.96 | 3.77 | 8.45 | 82.53 | 76.0 | 51.0 | 8 | 322.0 | Nintendo | E |
1 | Super Mario Bros. | NES | 1985.0 | Platform | Nintendo | 29.08 | 3.58 | 6.81 | 0.77 | 40.24 | NaN | NaN | NaN | NaN | NaN | NaN |
2 | Mario Kart Wii | Wii | 2008.0 | Racing | Nintendo | 15.68 | 12.76 | 3.79 | 3.29 | 35.52 | 82.0 | 73.0 | 8.3 | 709.0 | Nintendo | E |
3 | Wii Sports Resort | Wii | 2009.0 | Sports | Nintendo | 15.61 | 10.93 | 3.28 | 2.95 | 32.77 | 80.0 | 73.0 | 8 | 192.0 | Nintendo | E |
4 | Pokemon Red/Pokemon Blue | GB | 1996.0 | Role-Playing | Nintendo | 11.27 | 8.89 | 10.22 | 1.00 | 31.37 | NaN | NaN | NaN | NaN | NaN | NaN |
df.tail()
Name | Platform | Year_of_Release | Genre | Publisher | NA_Sales | EU_Sales | JP_Sales | Other_Sales | Global_Sales | Critic_Score | Critic_Count | User_Score | User_Count | Developer | Rating | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
16714 | Samurai Warriors: Sanada Maru | PS3 | 2016.0 | Action | Tecmo Koei | 0.00 | 0.00 | 0.01 | 0.0 | 0.01 | NaN | NaN | NaN | NaN | NaN | NaN |
16715 | LMA Manager 2007 | X360 | 2006.0 | Sports | Codemasters | 0.00 | 0.01 | 0.00 | 0.0 | 0.01 | NaN | NaN | NaN | NaN | NaN | NaN |
16716 | Haitaka no Psychedelica | PSV | 2016.0 | Adventure | Idea Factory | 0.00 | 0.00 | 0.01 | 0.0 | 0.01 | NaN | NaN | NaN | NaN | NaN | NaN |
16717 | Spirits & Spells | GBA | 2003.0 | Platform | Wanadoo | 0.01 | 0.00 | 0.00 | 0.0 | 0.01 | NaN | NaN | NaN | NaN | NaN | NaN |
16718 | Winning Post 8 2016 | PSV | 2016.0 | Simulation | Tecmo Koei | 0.00 | 0.00 | 0.01 | 0.0 | 0.01 | NaN | NaN | NaN | NaN | NaN | NaN |
df.sample()
Name | Platform | Year_of_Release | Genre | Publisher | NA_Sales | EU_Sales | JP_Sales | Other_Sales | Global_Sales | Critic_Score | Critic_Count | User_Score | User_Count | Developer | Rating | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
15145 | Toukiden 2 | PS3 | 2016.0 | Action | Tecmo Koei | 0.0 | 0.0 | 0.02 | 0.0 | 0.02 | NaN | NaN | NaN | NaN | NaN | NaN |
df.shape
(16719, 16)
data_types = df.dtypes
print(data_types)
Name object Platform object Year_of_Release float64 Genre object Publisher object NA_Sales float64 EU_Sales float64 JP_Sales float64 Other_Sales float64 Global_Sales float64 Critic_Score float64 Critic_Count float64 User_Score object User_Count float64 Developer object Rating object dtype: object
df['Year_of_Release'] = df['Year_of_Release'].fillna(0).astype(int)
df['User_Score'] = pd.to_numeric(df['User_Score'], errors='coerce')
df['Critic_Count'] = df['Critic_Count'].fillna(0).astype(int)
df['User_Count'] = df['User_Count'].fillna(0).astype(int)
print("\nMissing values in the dataset:")
print(df.isnull().sum())
Missing values in the dataset: Name 2 Platform 0 Year_of_Release 0 Genre 2 Publisher 54 NA_Sales 0 EU_Sales 0 JP_Sales 0 Other_Sales 0 Global_Sales 0 Critic_Score 8582 Critic_Count 0 User_Score 9129 User_Count 0 Developer 6623 Rating 6769 dtype: int64
# Remove rows where 'Name' or 'Genre' is missing
df.dropna(subset=['Name', 'Genre'], inplace=True)
# Replace missing 'Publisher' values with 'Unknown'
df['Publisher'].fillna('Unknown', inplace=True)
# Display the DataFrame to confirm changes
df.head()
Name | Platform | Year_of_Release | Genre | Publisher | NA_Sales | EU_Sales | JP_Sales | Other_Sales | Global_Sales | Critic_Score | Critic_Count | User_Score | User_Count | Developer | Rating | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Wii Sports | Wii | 2006 | Sports | Nintendo | 41.36 | 28.96 | 3.77 | 8.45 | 82.53 | 76.0 | 51 | 8.0 | 322 | Nintendo | E |
1 | Super Mario Bros. | NES | 1985 | Platform | Nintendo | 29.08 | 3.58 | 6.81 | 0.77 | 40.24 | NaN | 0 | NaN | 0 | NaN | NaN |
2 | Mario Kart Wii | Wii | 2008 | Racing | Nintendo | 15.68 | 12.76 | 3.79 | 3.29 | 35.52 | 82.0 | 73 | 8.3 | 709 | Nintendo | E |
3 | Wii Sports Resort | Wii | 2009 | Sports | Nintendo | 15.61 | 10.93 | 3.28 | 2.95 | 32.77 | 80.0 | 73 | 8.0 | 192 | Nintendo | E |
4 | Pokemon Red/Pokemon Blue | GB | 1996 | Role-Playing | Nintendo | 11.27 | 8.89 | 10.22 | 1.00 | 31.37 | NaN | 0 | NaN | 0 | NaN | NaN |
# Calculate the average difference where both scores are present
df['User_Score'] = pd.to_numeric(df['User_Score'], errors='coerce')
valid_scores = df.dropna(subset=['User_Score', 'Critic_Score'])
average_diff = (valid_scores['User_Score'] - valid_scores['Critic_Score']).mean()
# Impute missing User_Scores with Critic_Score + average_diff
missing_user = df['User_Score'].isnull() & df['Critic_Score'].notnull()
df.loc[missing_user, 'User_Score'] = df['Critic_Score'] + average_diff
# Impute missing Critic_Scores with User_Score - average_diff
missing_critic = df['Critic_Score'].isnull() & df['User_Score'].notnull()
df.loc[missing_critic, 'Critic_Score'] = df['User_Score'] - average_diff
print("\nStatistical details of the dataset:")
print(df.describe())
Statistical details of the dataset: Year_of_Release NA_Sales EU_Sales JP_Sales \ count 16717.000000 16717.000000 16717.000000 16717.000000 mean 1974.201771 0.263255 0.145010 0.077610 std 252.545637 0.813475 0.503303 0.308836 min 0.000000 0.000000 0.000000 0.000000 25% 2003.000000 0.000000 0.000000 0.000000 50% 2007.000000 0.080000 0.020000 0.000000 75% 2010.000000 0.240000 0.110000 0.040000 max 2020.000000 41.360000 28.960000 10.220000 Other_Sales Global_Sales Critic_Score Critic_Count User_Score \ count 16717.000000 16717.000000 8710.000000 16717.000000 8710.000000 mean 0.047333 0.533462 69.002023 12.831130 5.934629 std 0.186721 1.547956 13.481816 18.680383 5.311803 min 0.000000 0.010000 13.000000 0.000000 -40.067393 25% 0.000000 0.060000 61.000000 0.000000 5.900000 50% 0.010000 0.170000 70.267393 0.000000 7.300000 75% 0.030000 0.470000 79.000000 21.000000 8.200000 max 10.570000 82.530000 98.000000 113.000000 26.932607 User_Count count 16717.000000 mean 73.657056 std 386.717446 min 0.000000 25% 0.000000 50% 0.000000 75% 20.000000 max 10665.000000
# Normalize Critic_Score to be out of 10
df['Normalized_Critic_Score'] = df['Critic_Score'] / 10
# Fill missing values with 0 for calculation purposes
df['Normalized_Critic_Score'].fillna(0, inplace=True)
df['User_Score'].fillna(0, inplace=True)
df['Critic_Count'].fillna(0, inplace=True)
df['User_Count'].fillna(0, inplace=True)
# Calculate the weighted score
df['Weighted_Rating_Score'] = df.apply(lambda x: (x['Normalized_Critic_Score'] * x['Critic_Count'] + x['User_Score'] * x['User_Count']) / (x['Critic_Count'] + x['User_Count']) if (x['Critic_Count'] + x['User_Count']) > 0 else 0, axis=1)
# Exclude 'Year_of_Release' from the statistical summary
statistical_details = df.drop(columns='Year_of_Release').describe()
# Print the statistical details of the dataset excluding 'Year_of_Release'
print("\nStatistical details of the dataset (excluding 'Year_of_Release'):")
print(statistical_details)
Statistical details of the dataset (excluding 'Year_of_Release'): NA_Sales EU_Sales JP_Sales Other_Sales Global_Sales \ count 16717.000000 16717.000000 16717.000000 16717.000000 16717.000000 mean 0.263255 0.145010 0.077610 0.047333 0.533462 std 0.813475 0.503303 0.308836 0.186721 1.547956 min 0.000000 0.000000 0.000000 0.000000 0.010000 25% 0.000000 0.000000 0.000000 0.000000 0.060000 50% 0.080000 0.020000 0.000000 0.010000 0.170000 75% 0.240000 0.110000 0.040000 0.030000 0.470000 max 41.360000 28.960000 10.220000 10.570000 82.530000 Critic_Score Critic_Count User_Score User_Count \ count 8710.000000 16717.000000 16717.000000 16717.000000 mean 69.002023 12.831130 3.092099 73.657056 std 13.481816 18.680383 4.846648 386.717446 min 13.000000 0.000000 -40.067393 0.000000 25% 61.000000 0.000000 0.000000 0.000000 50% 70.267393 0.000000 0.000000 0.000000 75% 79.000000 21.000000 7.400000 20.000000 max 98.000000 113.000000 26.932607 10665.000000 Normalized_Critic_Score Weighted_Rating_Score count 16717.000000 16717.000000 mean 3.595188 3.580325 std 3.581874 3.573915 min 0.000000 0.000000 25% 0.000000 0.000000 50% 4.200000 4.000000 75% 7.100000 7.173810 max 9.800000 9.700000
platform_counts = df['Platform'].value_counts()
print("\nNumber of games per platform:")
print(platform_counts)
Number of games per platform: PS2 2161 DS 2152 PS3 1331 Wii 1320 X360 1262 PSP 1209 PS 1197 PC 974 XB 824 GBA 822 GC 556 3DS 520 PSV 432 PS4 393 N64 319 XOne 247 SNES 239 SAT 173 WiiU 147 2600 133 NES 98 GB 98 DC 52 GEN 27 NG 12 SCD 6 WS 6 3DO 3 TG16 2 GG 1 PCFX 1 Name: Platform, dtype: int64
genre_counts = df['Genre'].value_counts()
print("\nNumber of games per genre:")
print(genre_counts)
Number of games per genre: Action 3370 Sports 2348 Misc 1750 Role-Playing 1500 Shooter 1323 Adventure 1303 Racing 1249 Platform 888 Simulation 874 Fighting 849 Strategy 683 Puzzle 580 Name: Genre, dtype: int64
# Adding a small constant because log(0) is undefined
df['Log_Global_Sales'] = np.log(df['Global_Sales'] + 0.01)
plt.figure(figsize=(10, 6))
sns.histplot(df['Log_Global_Sales'].dropna(), kde=False, bins=50)
plt.title('Log-transformed Distribution of Global Sales')
plt.xlabel('Log of Global Sales')
plt.show()
plt.figure(figsize=(10, 6))
sns.histplot(df[df['Global_Sales'] < 5]['Global_Sales'], kde=False, bins=50) # Adjust the threshold as needed
plt.title('Distribution of Global Sales (Zoomed In)')
plt.show()
# Sum of global sales by genre
sales_by_genre = df.groupby('Genre')['Global_Sales'].sum().sort_values(ascending=False)
plt.figure(figsize=(12, 8))
sns.barplot(x=sales_by_genre.values, y=sales_by_genre.index)
plt.title('Global Sales by Genre')
plt.xlabel('Global Sales (in millions)')
plt.ylabel('Genre')
plt.show()
Part 1
WHat were the top 5 global sales?
# Sort the DataFrame by 'Global_Sales' select the top 5
top_5_games = df.sort_values(by='Global_Sales', ascending=False).head(5)
# Display the top 5 games
print(top_5_games[['Name', 'Global_Sales']])
Name Global_Sales 0 Wii Sports 82.53 1 Super Mario Bros. 40.24 2 Mario Kart Wii 35.52 3 Wii Sports Resort 32.77 4 Pokemon Red/Pokemon Blue 31.37
plt.figure(figsize=(12, 6))
sns.barplot(x='Global_Sales', y='Name', data=top_5_games, palette='viridis')
plt.title('Top 5 Games by Global Sales')
plt.xlabel('Global Sales (in millions)')
plt.ylabel('Game Name')
plt.show()
# Convert 'Year_of_Release' to string for concatenation
top_5_games['Year_of_Release'] = top_5_games['Year_of_Release'].astype(str)
# Create a new column 'Name_Year' that combines 'Name' and 'Year_of_Release'
top_5_games['Name_Year'] = top_5_games['Name'] + ' (' + top_5_games['Year_of_Release'] + ')'
plt.figure(figsize=(12, 6))
sns.barplot(x='Global_Sales', y='Name_Year', data=top_5_games, palette='viridis')
plt.title('Top 5 Games by Global Sales')
plt.xlabel('Global Sales (in millions)')
plt.ylabel('Game Name (Year of Release)')
plt.show()
Is there a correlation between the “na_sales” and “jp_sales” for the years 2010-2014?
print(df.columns)
# Filter the DataFrame for the years 2010-2014
df_filtered = df[(df['Year_of_Release'] >= 2010) & (df['Year_of_Release'] <= 2014)]
# Calculate the correlation coefficient between 'NA_Sales' and 'JP_Sales'
correlation = df_filtered['NA_Sales'].corr(df_filtered['JP_Sales'])
print(f"The correlation between NA sales and JP sales for the years 2010-2014 is: {correlation}")
The correlation between NA sales and JP sales for the years 2010-2014 is: 0.26043134778810034
import matplotlib.pyplot as plt
import seaborn as sns
palette = sns.cubehelix_palette(start=.5, rot=-.75, as_cmap=True)
# Create a scatter plot with a regression line
plt.figure(figsize=(10, 6))
sns.regplot(x='NA_Sales', y='JP_Sales', data=df_filtered,
scatter_kws={'alpha':0.6, 'cmap': palette}, line_kws={'color':'#2ca02c'},
scatter=True, fit_reg=True)
plt.title('Relationship Between NA Sales and JP Sales (2010-2014)')
plt.xlabel('NA Sales (in millions)')
plt.ylabel('JP Sales (in millions)')
norm = plt.Normalize(df_filtered['NA_Sales'].min(), df_filtered['NA_Sales'].max())
sm = plt.cm.ScalarMappable(cmap=palette, norm=norm)
sm.set_array([])
plt.colorbar(sm, label='NA Sales Density')
plt.show()
C:\Users\Luke Holmes\anaconda3\Lib\site-packages\seaborn\regression.py:395: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored ax.scatter(x, y, **kws) C:\Users\Luke Holmes\AppData\Local\Temp\ipykernel_22852\4170679984.py:20: MatplotlibDeprecationWarning: Unable to determine Axes to steal space for Colorbar. Using gca(), but will raise in the future. Either provide the *cax* argument to use as the Axes for the Colorbar, provide the *ax* argument to steal space from it, or add *mappable* to an Axes. plt.colorbar(sm, label='NA Sales Density')
plt.figure(figsize=(10, 6))
# Adding a small constant to avoid log(0)
df_filtered['Log_NA_Sales'] = np.log(df_filtered['NA_Sales'] + 0.01)
df_filtered['Log_JP_Sales'] = np.log(df_filtered['JP_Sales'] + 0.01)
sns.regplot(x='Log_NA_Sales', y='Log_JP_Sales', data=df_filtered,
scatter_kws={'alpha':0.5}, line_kws={'color':'red'})
plt.title('Log-Transformed Relationship Between NA Sales and JP Sales (2010-2014)')
plt.xlabel('Log of NA Sales')
plt.ylabel('Log of JP Sales')
plt.show()
C:\Users\Luke Holmes\AppData\Local\Temp\ipykernel_22852\2392025704.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_filtered['Log_NA_Sales'] = np.log(df_filtered['NA_Sales'] + 0.01) C:\Users\Luke Holmes\AppData\Local\Temp\ipykernel_22852\2392025704.py:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_filtered['Log_JP_Sales'] = np.log(df_filtered['JP_Sales'] + 0.01)
plt.figure(figsize=(10, 6))
sns.regplot(x='NA_Sales', y='JP_Sales', data=df_filtered,
scatter_kws={'alpha':0.2, 's': 20}, line_kws={'color':'red'})
plt.title('Relationship Between NA Sales and JP Sales (2010-2014) with Adjusted Point Opacity and Size')
plt.xlabel('NA Sales (in millions)')
plt.ylabel('JP Sales (in millions)')
plt.show()
plt.figure(figsize=(10, 6))
sns.regplot(x='NA_Sales', y='JP_Sales', data=df_filtered,
scatter_kws={'alpha':0.5}, line_kws={'color':'red'})
plt.xlim(0, 5) # Adjust limits based on your data
plt.ylim(0, 2.5)
plt.title('Zoomed Relationship Between NA Sales and JP Sales (2010-2014)')
plt.xlabel('NA Sales (in millions)')
plt.ylabel('JP Sales (in millions)')
plt.show()
plt.figure(figsize=(10, 6))
# Assuming 'Genre' is a relevant variable
sns.scatterplot(x='NA_Sales', y='JP_Sales', data=df_filtered, hue='Genre', alpha=0.5, palette='Set1')
plt.title('Colored Relationship Between NA Sales and JP Sales by Genre (2010-2014)')
plt.xlabel('NA Sales (in millions)')
plt.ylabel('JP Sales (in millions)')
plt.legend(title='Genre', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()
plt.figure(figsize=(12, 8))
# Apply log transformation and add a small constant to avoid log(0)
df_filtered['Log_NA_Sales'] = np.log(df_filtered['NA_Sales'] + 0.01)
df_filtered['Log_JP_Sales'] = np.log(df_filtered['JP_Sales'] + 0.01)
# Use scatterplot from seaborn to plot the data with color by 'Genre'
sns.scatterplot(x='Log_NA_Sales', y='Log_JP_Sales', data=df_filtered, hue='Genre', alpha=0.5, palette='Set2')
plt.title('Log-Transformed Relationship Between NA Sales and JP Sales by Genre (2010-2014)')
plt.xlabel('Log of NA Sales')
plt.ylabel('Log of JP Sales')
plt.legend(title='Genre', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()
C:\Users\Luke Holmes\AppData\Local\Temp\ipykernel_22852\1098487685.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_filtered['Log_NA_Sales'] = np.log(df_filtered['NA_Sales'] + 0.01) C:\Users\Luke Holmes\AppData\Local\Temp\ipykernel_22852\1098487685.py:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df_filtered['Log_JP_Sales'] = np.log(df_filtered['JP_Sales'] + 0.01)
What is the distribution of the most popular 4 game genres?
genre_sales = df.groupby('Genre')['Global_Sales'].sum().sort_values(ascending=False)
top_4_genres = genre_sales.head(4).index
top_genres_df = df[df['Genre'].isin(top_4_genres)]
plt.figure(figsize=(10, 6))
sns.boxplot(x='Genre', y='Global_Sales', data=top_genres_df)
plt.title('Distribution of Global Sales for Top 4 Game Genres')
plt.xlabel('Genre')
plt.ylabel('Global Sales (in millions)')
plt.show()
plt.figure(figsize=(10, 6))
sns.violinplot(x='Genre', y='Global_Sales', data=top_genres_df)
plt.title('Distribution of Global Sales for Top 4 Game Genres')
plt.xlabel('Genre')
plt.ylabel('Global Sales (in millions)')
plt.show()
top_genres_df['Log_Global_Sales'] = np.log1p(top_genres_df['Global_Sales'])
plt.figure(figsize=(10, 6))
sns.violinplot(x='Genre', y='Log_Global_Sales', data=top_genres_df, palette='Set2')
plt.title('Log-Transformed Distribution of Global Sales for Top 4 Game Genres')
plt.xlabel('Genre')
plt.ylabel('Log of Global Sales (in millions)')
plt.show()
C:\Users\Luke Holmes\AppData\Local\Temp\ipykernel_22852\689221707.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy top_genres_df['Log_Global_Sales'] = np.log1p(top_genres_df['Global_Sales'])
plt.figure(figsize=(12, 6))
sns.swarmplot(x='Genre', y='Log_Global_Sales', data=top_genres_df, palette='Set2')
plt.title('Swarm Plot of Log-Transformed Global Sales for Top 4 Game Genres')
plt.xlabel('Genre')
plt.ylabel('Log of Global Sales (in millions)')
plt.show()
C:\Users\Luke Holmes\AppData\Local\Temp\ipykernel_22852\86292373.py:2: FutureWarning: Passing `palette` without assigning `hue` is deprecated. sns.swarmplot(x='Genre', y='Log_Global_Sales', data=top_genres_df, palette='Set2') C:\Users\Luke Holmes\anaconda3\Lib\site-packages\seaborn\categorical.py:3544: UserWarning: 74.6% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot. warnings.warn(msg, UserWarning) C:\Users\Luke Holmes\anaconda3\Lib\site-packages\seaborn\categorical.py:3544: UserWarning: 65.5% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot. warnings.warn(msg, UserWarning) C:\Users\Luke Holmes\anaconda3\Lib\site-packages\seaborn\categorical.py:3544: UserWarning: 57.1% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot. warnings.warn(msg, UserWarning) C:\Users\Luke Holmes\anaconda3\Lib\site-packages\seaborn\categorical.py:3544: UserWarning: 80.1% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot. warnings.warn(msg, UserWarning) C:\Users\Luke Holmes\anaconda3\Lib\site-packages\seaborn\categorical.py:3544: UserWarning: 77.9% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot. warnings.warn(msg, UserWarning) C:\Users\Luke Holmes\anaconda3\Lib\site-packages\seaborn\categorical.py:3544: UserWarning: 69.4% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot. warnings.warn(msg, UserWarning) C:\Users\Luke Holmes\anaconda3\Lib\site-packages\seaborn\categorical.py:3544: UserWarning: 61.8% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot. warnings.warn(msg, UserWarning) C:\Users\Luke Holmes\anaconda3\Lib\site-packages\seaborn\categorical.py:3544: UserWarning: 82.5% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot. warnings.warn(msg, UserWarning)
plt.figure(figsize=(12, 6))
sns.stripplot(x='Genre', y='Log_Global_Sales', data=top_genres_df, palette='Set2', jitter=True, alpha=0.5)
plt.title('Strip Plot of Log-Transformed Global Sales for Top 4 Game Genres')
plt.xlabel('Genre')
plt.ylabel('Log of Global Sales (in millions)')
plt.show()
C:\Users\Luke Holmes\AppData\Local\Temp\ipykernel_22852\1324788717.py:2: FutureWarning: Passing `palette` without assigning `hue` is deprecated. sns.stripplot(x='Genre', y='Log_Global_Sales', data=top_genres_df, palette='Set2', jitter=True, alpha=0.5)
plt.figure(figsize=(12, 6))
sns.pointplot(x='Genre', y='Log_Global_Sales', data=top_genres_df, capsize=.2, palette='Set2')
plt.title('Point Plot of Log-Transformed Global Sales for Top 4 Game Genres')
plt.xlabel('Genre')
plt.ylabel('Log of Global Sales (in millions)')
plt.show()
g = sns.FacetGrid(top_genres_df, col='Genre', col_wrap=2, height=4, aspect=1.5)
g.map(sns.histplot, 'Log_Global_Sales', kde=True, bins=15, color='skyblue')
g.add_legend()
g.set_titles('{col_name} Genre')
g.set_axis_labels('Log of Global Sales (in millions)', 'Count')
plt.show()
Do older games (2005 and earlier) have a higher MEAN “eu_sales” than newer games (after 2005)?
# Group the dataset into older and newer games
older_games = df[df['Year_of_Release'] <= 2005]
newer_games = df[df['Year_of_Release'] > 2005]
# Calculate the mean EU sales for each group
mean_eu_sales_older = older_games['EU_Sales'].mean()
mean_eu_sales_newer = newer_games['EU_Sales'].mean()
print(f"Mean EU Sales for Older Games (2005 and earlier): {mean_eu_sales_older:.2f}")
print(f"Mean EU Sales for Newer Games (after 2005): {mean_eu_sales_newer:.2f}")
# Compare the means
if mean_eu_sales_older > mean_eu_sales_newer:
print("Older games (2005 and earlier) have higher mean EU sales than newer games.")
elif mean_eu_sales_older < mean_eu_sales_newer:
print("Newer games (after 2005) have higher mean EU sales than older games.")
else:
print("Mean EU sales are the same for older and newer games.")
Mean EU Sales for Older Games (2005 and earlier): 0.15 Mean EU Sales for Newer Games (after 2005): 0.14 Older games (2005 and earlier) have higher mean EU sales than newer games.
import matplotlib.pyplot as plt
import seaborn as sns
# Data for plotting
categories = ['Games (≤2005)', 'Games (>2005)']
mean_sales = [mean_eu_sales_older, mean_eu_sales_newer]
plt.figure(figsize=(8, 6))
# Create a bar plot
sns.barplot(x=categories, y=mean_sales, palette='coolwarm')
# Add titles and labels
plt.title('Comparison of Mean EU Sales')
plt.ylabel('Mean EU Sales (in millions)')
plt.xlabel('Game Category')
# Display the values on the bars
for i, value in enumerate(mean_sales):
plt.text(i, value + 0.01, f"{value:.2f}", ha='center', va='bottom')
plt.show()
plt.figure(figsize=(8, 6))
sns.barplot(x=categories, y=mean_sales, palette='coolwarm')
plt.ylim(0.13, 0.16) # Adjust the limits based on your data to zoom in
plt.title('Comparison of Mean EU Sales for Older vs. Newer Games')
plt.ylabel('Mean EU Sales (in millions)')
plt.xlabel('Game Category')
plt.show()
plt.figure(figsize=(8, 6))
sns.barplot(x=categories, y=mean_sales, palette='coolwarm')
plt.ylim(0.14, 0.155)
plt.title('Comparison of Mean EU Sales for Older vs. Newer Games')
plt.ylabel('Mean EU Sales (in millions)')
plt.xlabel('Game Category')
# Add horizontal grid lines for better readability
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
What are the 3 most common “developer” in the dataset?
# Get the counts of each unique developer and select the top 3
top_3_developers = df['Developer'].value_counts().head(3)
print(top_3_developers)
Ubisoft 204 EA Sports 172 EA Canada 167 Name: Developer, dtype: int64
import matplotlib.pyplot as plt
import seaborn as sns
top_3_developers = pd.Series([204, 172, 167], index=['Ubisoft', 'EA Sports', 'EA Canada'])
plt.figure(figsize=(10, 6))
sns.barplot(x=top_3_developers.values, y=top_3_developers.index, palette='viridis')
for i, value in enumerate(top_3_developers.values):
plt.text(value + 1, i, f'{value}', va='center') # Adding a small offset (+1) for better visibility
plt.xlim(min(top_3_developers.values) - 5, max(top_3_developers.values) + 5)
plt.title('Top 3 Most Common Game Developers')
plt.xlabel('Number of Games Developed')
plt.ylabel('Developer')
plt.show()
Part 2:
How do the dynamics of game genre preferences, regional sales patterns, and review scores collectively impact the global sales of video games, and which of these factors most strongly predict market success?
# Aggregate sales by genre and region
genre_region_sales = df.groupby('Genre')[['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']].sum()
# Visualize genre popularity in different regions with a bar chart
genre_region_sales.plot(kind='bar', figsize=(14, 8), title='Genre Popularity by Region')
plt.ylabel('Sales (in millions)')
plt.show()
# Scatter plot for Critic Scores vs Global Sales
sns.regplot(x='Critic_Score', y='Global_Sales', data=df, scatter_kws={'alpha':0.3})
plt.title('Critic Score vs Global Sales')
plt.show()
# Scatter plot for User Scores vs Global Sales
sns.regplot(x='User_Score', y='Global_Sales', data=df, scatter_kws={'alpha':0.3})
plt.title('User Score vs Global Sales')
plt.show()
Step 1: Aggregate Data by Genre and Region with Weighted Score
# Calculate the mean weighted rating score and sales by genre
genre_analysis = df.groupby('Genre').agg({
'Weighted_Rating_Score': 'mean',
'NA_Sales': 'sum',
'EU_Sales': 'sum',
'JP_Sales': 'sum',
'Other_Sales': 'sum'
}).reset_index()
regions = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']
for region in regions:
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Weighted_Rating_Score', y=region, data=genre_analysis, hue='Genre', s=100)
plt.title(f'Genre Weighted Rating Score vs. {region}')
plt.xlabel('Average Weighted Rating Score')
plt.ylabel(f'Total Sales in {region} (in millions)')
plt.legend(title='Genre', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()
for region in regions:
correlation = genre_analysis['Weighted_Rating_Score'].corr(genre_analysis[region])
print(f'Correlation between Weighted Rating Score and {region}: {correlation:.2f}')
Correlation between Weighted Rating Score and NA_Sales: 0.58 Correlation between Weighted Rating Score and EU_Sales: 0.57 Correlation between Weighted Rating Score and JP_Sales: 0.14 Correlation between Weighted Rating Score and Other_Sales: 0.53
Why is Japan Different?
# Aggregate sales data by platform for each region
platform_sales = df.groupby('Platform').agg({
'JP_Sales': 'sum',
'NA_Sales': 'sum',
'EU_Sales': 'sum'
}).reset_index()
import matplotlib.pyplot as plt
# Plotting platform sales in Japan
plt.figure(figsize=(12, 8))
platform_sales.sort_values('JP_Sales', ascending=False).plot(x='Platform', y='JP_Sales', kind='bar', color='skyblue')
plt.title('Game Sales by Platform in Japan')
plt.xlabel('Platform')
plt.ylabel('Total Sales (in millions)')
plt.xticks(rotation=45)
plt.show()
# Plotting platform sales in North America
plt.figure(figsize=(12, 8))
platform_sales.sort_values('NA_Sales', ascending=False).plot(x='Platform', y='NA_Sales', kind='bar', color='orange')
plt.title('Game Sales by Platform in North America')
plt.xlabel('Platform')
plt.ylabel('Total Sales (in millions)')
plt.xticks(rotation=45)
plt.show()
# Plotting platform sales in Europe
plt.figure(figsize=(12, 8))
platform_sales.sort_values('EU_Sales', ascending=False).plot(x='Platform', y='EU_Sales', kind='bar', color='green')
plt.title('Game Sales by Platform in Europe')
plt.xlabel('Platform')
plt.ylabel('Total Sales (in millions)')
plt.xticks(rotation=45)
plt.show()
<Figure size 1200x800 with 0 Axes>
<Figure size 1200x800 with 0 Axes>
<Figure size 1200x800 with 0 Axes>
# Print top platforms in Japan
print("Top Platforms in Japan:")
print(platform_sales[['Platform', 'JP_Sales']].sort_values('JP_Sales', ascending=False).head())
# Print top platforms in North America
print("\nTop Platforms in North America:")
print(platform_sales[['Platform', 'NA_Sales']].sort_values('NA_Sales', ascending=False).head())
# Print top platforms in Europe
print("\nTop Platforms in Europe:")
print(platform_sales[['Platform', 'EU_Sales']].sort_values('EU_Sales', ascending=False).head())
Top Platforms in Japan: Platform JP_Sales 4 DS 175.57 15 PS 139.82 16 PS2 139.20 23 SNES 116.55 2 3DS 100.67 Top Platforms in North America: Platform NA_Sales 28 X360 602.47 16 PS2 583.84 26 Wii 496.90 17 PS3 393.49 4 DS 382.67 Top Platforms in Europe: Platform EU_Sales 16 PS2 339.29 17 PS3 330.29 28 X360 270.76 26 Wii 262.21 15 PS 213.61
# Top platforms in Japan excluding the bottom 8
top_platforms_japan = platform_sales.sort_values('JP_Sales', ascending=False).head(len(platform_sales) - 8)
plt.figure(figsize=(12, 8))
top_platforms_japan.plot(x='Platform', y='JP_Sales', kind='bar', color='skyblue')
plt.title('Top Game Sales by Platform in Japan')
plt.xlabel('Platform')
plt.ylabel('Total Sales (in millions)')
plt.xticks(rotation=45)
plt.show()
<Figure size 1200x800 with 0 Axes>
# Top platforms in North America excluding the bottom 8
top_platforms_na = platform_sales.sort_values('NA_Sales', ascending=False).head(len(platform_sales) - 8)
plt.figure(figsize=(12, 8))
top_platforms_na.plot(x='Platform', y='NA_Sales', kind='bar', color='orange')
plt.title('Top Game Sales by Platform in North America')
plt.xlabel('Platform')
plt.ylabel('Total Sales (in millions)')
plt.xticks(rotation=45)
plt.show()
<Figure size 1200x800 with 0 Axes>
# Top platforms in Europe excluding the bottom 8
top_platforms_europe = platform_sales.sort_values('EU_Sales', ascending=False).head(len(platform_sales) - 8)
plt.figure(figsize=(12, 8))
top_platforms_europe.plot(x='Platform', y='EU_Sales', kind='bar', color='green')
plt.title('Top Game Sales by Platform in Europe')
plt.xlabel('Platform')
plt.ylabel('Total Sales (in millions)')
plt.xticks(rotation=45)
plt.show()
<Figure size 1200x800 with 0 Axes>
# Sort by sales and exclude the bottom 8 platforms for each region
top_platforms_jp = platform_sales.sort_values('JP_Sales', ascending=False).head(-11)
top_platforms_na = platform_sales.sort_values('NA_Sales', ascending=False).head(-11)
top_platforms_eu = platform_sales.sort_values('EU_Sales', ascending=False).head(-11)
fig, ax = plt.subplots(1, 3, figsize=(18, 6))
# Japan
top_platforms_jp.plot(ax=ax[0], x='Platform', y='JP_Sales', kind='bar', color='skyblue')
ax[0].set_title('Top Game Platforms in Japan')
ax[0].set_xlabel('Platform')
ax[0].set_ylabel('Total Sales (in millions)')
ax[0].tick_params(axis='x', rotation=45)
# North America
top_platforms_na.plot(ax=ax[1], x='Platform', y='NA_Sales', kind='bar', color='orange')
ax[1].set_title('Top Game Platforms in North America')
ax[1].set_xlabel('Platform')
ax[1].set_ylabel('Total Sales (in millions)')
ax[1].tick_params(axis='x', rotation=45)
# Europe
top_platforms_eu.plot(ax=ax[2], x='Platform', y='EU_Sales', kind='bar', color='green')
ax[2].set_title('Top Game Platforms in Europe')
ax[2].set_xlabel('Platform')
ax[2].set_ylabel('Total Sales (in millions)')
ax[2].tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()
# Exclude 'DC', 'GEN', and '2600' platforms from the combined data
excluded_platforms = ['DC', 'GEN', '2600', 'SAT']
filtered_combined_platforms = combined_platforms[~combined_platforms['Platform'].isin(excluded_platforms)]
plt.figure(figsize=(14, 8))
# Recalculate positions for the updated set of platforms
positions = np.arange(len(filtered_combined_platforms['Platform']))
# Stacked bar chart with the updated filtered data
plt.bar(positions - width, filtered_combined_platforms['JP_Sales'], width, label='Japan', color='skyblue')
plt.bar(positions, filtered_combined_platforms['NA_Sales'], width, label='North America', color='orange')
plt.bar(positions + width, filtered_combined_platforms['EU_Sales'], width, label='Europe', color='green')
plt.title('Comparative Game Sales by Top Platforms Across Regions')
plt.xlabel('Platform')
plt.ylabel('Total Sales (in millions)')
plt.xticks(positions, filtered_combined_platforms['Platform'], rotation=45)
plt.legend()
plt.show()
top5_jp = platform_sales.sort_values('JP_Sales', ascending=False).head(5)['Platform']
top5_na = platform_sales.sort_values('NA_Sales', ascending=False).head(5)['Platform']
top5_eu = platform_sales.sort_values('EU_Sales', ascending=False).head(5)['Platform']
import seaborn as sns
import matplotlib.pyplot as plt
def plot_genre_preferences(region_top_platforms, region_sales_col, region_name):
# Filter data for the top platforms in the region
df_top_platforms = df[df['Platform'].isin(region_top_platforms)]
# Aggregate sales by platform and genre
genre_sales = df_top_platforms.groupby(['Platform', 'Genre'])[region_sales_col].sum().unstack().fillna(0)
# Plot
genre_sales.plot(kind='bar', stacked=True, figsize=(14, 8), colormap='viridis')
plt.title(f'Genre Preferences for Top Platforms in {region_name}')
plt.xlabel('Platform')
plt.ylabel(f'Total Sales in {region_name} (in millions)')
plt.legend(title='Genre', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45)
plt.show()
# Plot genre preferences for top platforms in Japan
plot_genre_preferences(top5_jp, 'JP_Sales', 'Japan')
def plot_ratings_impact(region_top_platforms, region_sales_col, region_name):
# Filter data for top platforms
df_top_platforms = df[df['Platform'].isin(region_top_platforms)]
# Plot
plt.figure(figsize=(14, 8))
sns.scatterplot(data=df_top_platforms, x='Critic_Score', y=region_sales_col, hue='Platform', style='Platform', alpha=0.6)
plt.title(f'Impact of Critic Scores on Sales in {region_name}')
plt.xlabel('Critic Score')
plt.ylabel(f'Total Sales in {region_name} (in millions)')
plt.legend(title='Platform', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()
# Plot the impact of game ratings on sales for top platforms in Japan
plot_ratings_impact(top5_jp, 'JP_Sales', 'Japan')
# Define thresholds for high ratings and high sales
critic_score_threshold = df['Critic_Score'].quantile(0.75)
user_score_threshold = df['User_Score'].quantile(0.75)
sales_threshold = df['Global_Sales'].quantile(0.75)
highly_rated_and_high_sales = df[
(df['Critic_Score'] >= critic_score_threshold) &
(df['User_Score'] >= user_score_threshold) &
(df['Global_Sales'] >= sales_threshold)
]
summary = highly_rated_and_high_sales[['Critic_Score', 'User_Score', 'Global_Sales']].describe()
print(summary)
Critic_Score User_Score Global_Sales count 1044.000000 1044.000000 1044.000000 mean 85.851533 8.454079 2.247787 std 4.632826 1.445498 3.103293 min 79.000000 7.400000 0.470000 25% 82.000000 7.900000 0.777500 50% 85.000000 8.300000 1.270000 75% 89.000000 8.700000 2.482500 max 98.000000 26.932607 35.520000
plt.figure(figsize=(10, 6))
sns.scatterplot(data=highly_rated_and_high_sales, x='Critic_Score', y='Global_Sales', alpha=0.6)
plt.title('Critic Score vs. Global Sales for Top-Performing Games')
plt.xlabel('Critic Score')
plt.ylabel('Global Sales (in millions)')
plt.show()
# Define thresholds for high ratings and high sales
high_rating_threshold = 80 # Adjust based on your scoring scale
high_sales_threshold = df['Global_Sales'].quantile(0.75) # Top 25% of sales
# Filter games that are both highly rated and have high sales
highly_rated_and_high_sales_games = df[(df['Critic_Score'] > high_rating_threshold) & (df['Global_Sales'] > high_sales_threshold)]
# Aggregate the number of games by genre
top_genres = highly_rated_and_high_sales_games['Genre'].value_counts()
fig, ax = plt.subplots(figsize=(10, 6))
top_genres.plot(kind='bar', ax=ax, color='skyblue')
ax.set_title('Top Genres Among Highly Rated and High Sales Games')
ax.set_ylabel('Number of Games')
ax.set_xlabel('Genre')
plt.xticks(rotation=45)
plt.show()
Highly Rated and Low Sales
# Define the threshold for highest-rated games based on critic scores
rating_threshold = df['Critic_Score'].quantile(0.75) # Top 25% of scores
# Filter highest-rated games
highest_rated_games = df[df['Critic_Score'] >= rating_threshold]
# Japan
least_popular_highest_rated_jp = highest_rated_games.sort_values('JP_Sales').head(5)
# North America
least_popular_highest_rated_na = highest_rated_games.sort_values('NA_Sales').head(5)
# Europe
least_popular_highest_rated_eu = highest_rated_games.sort_values('EU_Sales').head(5)
# Display least popular highest-rated games in Japan
print("Least Popular Highest-Rated Games in Japan:")
print(least_popular_highest_rated_jp[['Name', 'Critic_Score', 'JP_Sales']])
# Display least popular highest-rated games in North America
print("\nLeast Popular Highest-Rated Games in North America:")
print(least_popular_highest_rated_na[['Name', 'Critic_Score', 'NA_Sales']])
# Display least popular highest-rated games in Europe
print("\nLeast Popular Highest-Rated Games in Europe:")
print(least_popular_highest_rated_eu[['Name', 'Critic_Score', 'EU_Sales']])
Least Popular Highest-Rated Games in Japan: Name Critic_Score JP_Sales 3171 X-Men Legends II: Rise of Apocalypse 82.0 0.0 5170 Left 4 Dead 2 89.0 0.0 5165 Sid Meier's Civilization: Beyond Earth 81.0 0.0 5162 Elite Beat Agents 87.0 0.0 5156 NBA Live 2003 82.0 0.0 Least Popular Highest-Rated Games in North America: Name Critic_Score NA_Sales 16696 Metal Gear Solid V: Ground Zeroes 80.0 0.0 13440 Silent Hunter: Wolves of the Pacific 79.0 0.0 7521 Phantasy Star Online 89.0 0.0 13431 Company of Heroes 93.0 0.0 7643 Grandia II 90.0 0.0 Least Popular Highest-Rated Games in Europe: Name Critic_Score EU_Sales 10450 Winning Eleven: Pro Evolution Soccer 2007 79.0 0.0 9417 Ninjatown 80.0 0.0 9280 Persona 4: Arena Ultimax 84.0 0.0 9204 College Hoops 2K8 82.0 0.0 9024 Geometry Wars: Galaxies 79.0 0.0
# Combine the lists and drop duplicates
combined_games = pd.concat([least_popular_highest_rated_jp, least_popular_highest_rated_na, least_popular_highest_rated_eu]).drop_duplicates(subset=['Name'])
# Reset index
combined_games.reset_index(drop=True, inplace=True)
# Display sales across regions for the combined list of games
print("Sales Across Regions for Least Popular Highest-Rated Games:")
print(combined_games[['Name', 'JP_Sales', 'NA_Sales', 'EU_Sales']])
Sales Across Regions for Least Popular Highest-Rated Games: Name JP_Sales NA_Sales EU_Sales 0 X-Men Legends II: Rise of Apocalypse 0.00 0.53 0.02 1 Left 4 Dead 2 0.00 0.00 0.32 2 Sid Meier's Civilization: Beyond Earth 0.00 0.11 0.22 3 Elite Beat Agents 0.00 0.30 0.03 4 NBA Live 2003 0.00 0.31 0.04 5 Metal Gear Solid V: Ground Zeroes 0.00 0.00 0.01 6 Silent Hunter: Wolves of the Pacific 0.00 0.00 0.04 7 Phantasy Star Online 0.20 0.00 0.00 8 Company of Heroes 0.00 0.00 0.04 9 Grandia II 0.20 0.00 0.00 10 Winning Eleven: Pro Evolution Soccer 2007 0.00 0.09 0.00 11 Ninjatown 0.00 0.12 0.00 12 Persona 4: Arena Ultimax 0.13 0.00 0.00 13 College Hoops 2K8 0.00 0.13 0.00 14 Geometry Wars: Galaxies 0.00 0.13 0.00
# Calculate the variance in sales across regions to find games with the most significant differences
combined_games['Sales_Variance'] = combined_games[['JP_Sales', 'NA_Sales', 'EU_Sales']].var(axis=1)
# Sort by 'Sales_Variance' and select the top N games
top_n_games = combined_games.sort_values('Sales_Variance', ascending=False).head(10)
# Plotting sales for the top N games with the most significant differences across regions
top_n_games[['JP_Sales', 'NA_Sales', 'EU_Sales']].plot(kind='barh', figsize=(10, 8), width=0.75)
plt.title('Top Games with Significant Regional Sales Differences')
plt.xlabel('Sales (in millions)')
plt.ylabel('Game Name')
plt.legend(title='Region')
plt.tight_layout()
plt.show()
# Example mappings
platform_to_brand = {
'Wii': 'Nintendo', 'NES': 'Nintendo', 'GB': 'Nintendo', 'DS': 'Nintendo', 'SNES': 'Nintendo',
'3DS': 'Nintendo', 'N64': 'Nintendo', 'GBA': 'Nintendo', 'GC': 'Nintendo', 'WiiU': 'Nintendo', 'Switch': 'Nintendo',
'PS': 'Sony', 'PS2': 'Sony', 'PS3': 'Sony', 'PS4': 'Sony', 'PSP': 'Sony', 'PSV': 'Sony',
'X360': 'Microsoft', 'XB': 'Microsoft', 'XOne': 'Microsoft',
# Add other platforms and their corresponding brands
}
df['Console_Brand'] = df['Platform'].map(platform_to_brand)
brand_sales_by_region = df.groupby('Console_Brand').agg({
'JP_Sales': 'sum',
'NA_Sales': 'sum',
'EU_Sales': 'sum',
}).reset_index()
# Stacked bar chart for sales by console brand in each region
brand_sales_by_region.set_index('Console_Brand')[['JP_Sales', 'NA_Sales', 'EU_Sales']].plot(kind='bar', stacked=True, figsize=(12, 8))
plt.title('Sales by Console Brand Across Regions')
plt.xlabel('Console Brand')
plt.ylabel('Total Sales (in millions)')
plt.legend(title='Region')
plt.xticks(rotation=45)
plt.show()
# Alternatively, use a grouped bar chart for a side-by-side comparison
brand_sales_by_region.plot(x='Console_Brand', kind='bar', figsize=(12, 8))
plt.title('Sales by Console Brand Across Regions')
plt.xlabel('Console Brand')
plt.ylabel('Total Sales (in millions)')
plt.xticks(rotation=45)
plt.legend(title='Region')
plt.show()
brands = ['Nintendo', 'Sony', 'Microsoft']
regions = ['JP_Sales', 'NA_Sales', 'EU_Sales']
top_games_by_brand_and_region = {}
for brand in brands:
for region in regions:
top_games = df[df['Console_Brand'] == brand].sort_values(by=region, ascending=False).head(5)[['Name', region]]
key = f'{brand} - {region}'
top_games_by_brand_and_region[key] = top_games
# Print the top-selling games for each brand in each region
for key, value in top_games_by_brand_and_region.items():
print(f'\nTop Selling Games for {key}:')
print(value)
Top Selling Games for Nintendo - JP_Sales: Name JP_Sales 4 Pokemon Red/Pokemon Blue 10.22 12 Pokemon Gold/Pokemon Silver 7.20 1 Super Mario Bros. 6.81 6 New Super Mario Bros. 6.50 20 Pokemon Diamond/Pokemon Pearl 6.04 Top Selling Games for Nintendo - NA_Sales: Name NA_Sales 0 Wii Sports 41.36 1 Super Mario Bros. 29.08 9 Duck Hunt 26.93 5 Tetris 23.20 2 Mario Kart Wii 15.68 Top Selling Games for Nintendo - EU_Sales: Name EU_Sales 0 Wii Sports 28.96 2 Mario Kart Wii 12.76 10 Nintendogs 10.95 3 Wii Sports Resort 10.93 19 Brain Age: Train Your Brain in Minutes a Day 9.20 Top Selling Games for Sony - JP_Sales: Name JP_Sales 215 Monster Hunter Freedom 3 4.87 163 Monster Hunter Freedom Unite 4.13 244 Dragon Quest VII: Warriors of Eden 4.10 88 Final Fantasy VIII 3.63 186 Dragon Quest VIII: Journey of the Cursed King 3.61 Top Selling Games for Sony - NA_Sales: Name NA_Sales 17 Grand Theft Auto: San Andreas 9.43 24 Grand Theft Auto: Vice City 8.41 16 Grand Theft Auto V 7.02 38 Grand Theft Auto III 6.99 28 Gran Turismo 3: A-Spec 6.85 Top Selling Games for Sony - EU_Sales: Name EU_Sales 16 Grand Theft Auto V 9.09 42 Grand Theft Auto V 6.31 77 FIFA 16 6.12 31 Call of Duty: Black Ops 3 5.86 94 FIFA 17 5.75 Top Selling Games for Microsoft - JP_Sales: Name JP_Sales 14 Kinect Adventures! 0.24 987 Dead or Alive 3 0.24 2044 Ace Combat 6: Fires of Liberation 0.22 2262 Blue Dragon 0.21 2608 Star Ocean: The Last Hope 0.21 Top Selling Games for Microsoft - NA_Sales: Name NA_Sales 14 Kinect Adventures! 15.00 32 Call of Duty: Black Ops 9.70 23 Grand Theft Auto V 9.66 29 Call of Duty: Modern Warfare 3 9.04 36 Call of Duty: Modern Warfare 2 8.52 Top Selling Games for Microsoft - EU_Sales: Name EU_Sales 23 Grand Theft Auto V 5.14 14 Kinect Adventures! 4.89 29 Call of Duty: Modern Warfare 3 4.24 35 Call of Duty: Black Ops II 4.24 32 Call of Duty: Black Ops 3.68
data_microsoft_jp = {
'Name': ['Kinect Adventures!', 'Dead or Alive 3', 'Ace Combat 6: Fires of Liberation', 'Blue Dragon', 'Star Ocean: The Last Hope'],
'JP_Sales': [0.24, 0.24, 0.22, 0.21, 0.21]
}
df_microsoft_jp = pd.DataFrame(data_microsoft_jp)
data_microsoft_na = {
'Name': ['Kinect Adventures!', 'Call of Duty: Black Ops', 'Grand Theft Auto V', 'Call of Duty: Modern Warfare 3', 'Call of Duty: Modern Warfare 2'],
'NA_Sales': [15.00, 9.70, 9.66, 9.04, 8.52]
}
df_microsoft_na = pd.DataFrame(data_microsoft_na)
data_microsoft_eu = {
'Name': ['Grand Theft Auto V', 'Kinect Adventures!', 'Call of Duty: Modern Warfare 3', 'Call of Duty: Black Ops II', 'Call of Duty: Black Ops'],
'EU_Sales': [5.14, 4.89, 4.24, 4.24, 3.68]
}
df_microsoft_eu = pd.DataFrame(data_microsoft_eu)
import matplotlib.pyplot as plt
%matplotlib inline
def plot_sales(dataframe, title, sales_column):
fig, ax = plt.subplots(figsize=(10, 6))
dataframe.plot(kind='bar', x='Name', y=sales_column, ax=ax, legend=False, color='skyblue')
ax.set_title(title)
ax.set_ylabel('Sales (in millions)')
ax.set_xlabel('')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
# Plot for Microsoft in Japan
plot_sales(df_microsoft_jp, 'Top Selling Microsoft Games in Japan', 'JP_Sales')
# Plot for Microsoft in North America
plot_sales(df_microsoft_na, 'Top Selling Microsoft Games in North America', 'NA_Sales')
# Plot for Microsoft in Europe
plot_sales(df_microsoft_eu, 'Top Selling Microsoft Games in Europe', 'EU_Sales')
plt.show
<function matplotlib.pyplot.show(close=None, block=None)>