In [34]:

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

df = pd.read_csv('vgsales.csv')

print("Initial Data Inspection:")

print("\nFirst 5 rows of the dataset:")
print(df.head())

print("\nLast 5 rows of the dataset:")
print(df.tail())

print("\nShape of the dataset (Rows, Columns):", df.shape)

print("\nSummary of the DataFrame:")
df.info()

print("\nData Types of Each Column:")
print(df.dtypes)

print("\nDescriptive Statistics for Numerical Columns:")
print(df.describe())

Initial Data Inspection:

First 5 rows of the dataset:
                       Name Platform  Year_of_Release         Genre Publisher  \
0                Wii Sports      Wii           2006.0        Sports  Nintendo   
1         Super Mario Bros.      NES           1985.0      Platform  Nintendo   
2            Mario Kart Wii      Wii           2008.0        Racing  Nintendo   
3         Wii Sports Resort      Wii           2009.0        Sports  Nintendo   
4  Pokemon Red/Pokemon Blue       GB           1996.0  Role-Playing  Nintendo   

   NA_Sales  EU_Sales  JP_Sales  Other_Sales  Global_Sales  Critic_Score  \
0     41.36     28.96      3.77         8.45         82.53          76.0   
1     29.08      3.58      6.81         0.77         40.24           NaN   
2     15.68     12.76      3.79         3.29         35.52          82.0   
3     15.61     10.93      3.28         2.95         32.77          80.0   
4     11.27      8.89     10.22         1.00         31.37           NaN   

   Critic_Count User_Score  User_Count Developer Rating  
0          51.0          8       322.0  Nintendo      E  
1           NaN        NaN         NaN       NaN    NaN  
2          73.0        8.3       709.0  Nintendo      E  
3          73.0          8       192.0  Nintendo      E  
4           NaN        NaN         NaN       NaN    NaN  

Last 5 rows of the dataset:
                                Name Platform  Year_of_Release       Genre  \
16714  Samurai Warriors: Sanada Maru      PS3           2016.0      Action   
16715               LMA Manager 2007     X360           2006.0      Sports   
16716        Haitaka no Psychedelica      PSV           2016.0   Adventure   
16717               Spirits & Spells      GBA           2003.0    Platform   
16718            Winning Post 8 2016      PSV           2016.0  Simulation   

          Publisher  NA_Sales  EU_Sales  JP_Sales  Other_Sales  Global_Sales  \
16714    Tecmo Koei      0.00      0.00      0.01          0.0          0.01   
16715   Codemasters      0.00      0.01      0.00          0.0          0.01   
16716  Idea Factory      0.00      0.00      0.01          0.0          0.01   
16717       Wanadoo      0.01      0.00      0.00          0.0          0.01   
16718    Tecmo Koei      0.00      0.00      0.01          0.0          0.01   

       Critic_Score  Critic_Count User_Score  User_Count Developer Rating  
16714           NaN           NaN        NaN         NaN       NaN    NaN  
16715           NaN           NaN        NaN         NaN       NaN    NaN  
16716           NaN           NaN        NaN         NaN       NaN    NaN  
16717           NaN           NaN        NaN         NaN       NaN    NaN  
16718           NaN           NaN        NaN         NaN       NaN    NaN  

Shape of the dataset (Rows, Columns): (16719, 16)

Summary of the DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16719 entries, 0 to 16718
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             16717 non-null  object 
 1   Platform         16719 non-null  object 
 2   Year_of_Release  16450 non-null  float64
 3   Genre            16717 non-null  object 
 4   Publisher        16665 non-null  object 
 5   NA_Sales         16719 non-null  float64
 6   EU_Sales         16719 non-null  float64
 7   JP_Sales         16719 non-null  float64
 8   Other_Sales      16719 non-null  float64
 9   Global_Sales     16719 non-null  float64
 10  Critic_Score     8137 non-null   float64
 11  Critic_Count     8137 non-null   float64
 12  User_Score       10015 non-null  object 
 13  User_Count       7590 non-null   float64
 14  Developer        10096 non-null  object 
 15  Rating           9950 non-null   object 
dtypes: float64(9), object(7)
memory usage: 2.0+ MB

Data Types of Each Column:
Name                object
Platform            object
Year_of_Release    float64
Genre               object
Publisher           object
NA_Sales           float64
EU_Sales           float64
JP_Sales           float64
Other_Sales        float64
Global_Sales       float64
Critic_Score       float64
Critic_Count       float64
User_Score          object
User_Count         float64
Developer           object
Rating              object
dtype: object

Descriptive Statistics for Numerical Columns:
       Year_of_Release      NA_Sales      EU_Sales      JP_Sales  \
count     16450.000000  16719.000000  16719.000000  16719.000000   
mean       2006.487356      0.263330      0.145025      0.077602   
std           5.878995      0.813514      0.503283      0.308818   
min        1980.000000      0.000000      0.000000      0.000000   
25%        2003.000000      0.000000      0.000000      0.000000   
50%        2007.000000      0.080000      0.020000      0.000000   
75%        2010.000000      0.240000      0.110000      0.040000   
max        2020.000000     41.360000     28.960000     10.220000   

        Other_Sales  Global_Sales  Critic_Score  Critic_Count    User_Count  
count  16719.000000  16719.000000   8137.000000   8137.000000   7590.000000  
mean       0.047332      0.533543     68.967679     26.360821    162.229908  
std        0.186710      1.547935     13.938165     18.980495    561.282326  
min        0.000000      0.010000     13.000000      3.000000      4.000000  
25%        0.000000      0.060000     60.000000     12.000000     10.000000  
50%        0.010000      0.170000     71.000000     21.000000     24.000000  
75%        0.030000      0.470000     79.000000     36.000000     81.000000  
max       10.570000     82.530000     98.000000    113.000000  10665.000000

import pandas as pd: This imports the pandas library, a powerful tool for data analysis and manipulation, and gives it the alias pd.

import seaborn as sns: This imports the seaborn library, used for making statistical graphics in Python, and gives it the alias sns.

import matplotlib.pyplot as plt: This imports the matplotlib's pyplot module, which provides a MATLAB-like interface for making plots and graphs, and gives it the alias plt.

df = pd.read_csv('vgsales.csv'): This line reads a CSV file named 'vgsales.csv' into a DataFrame called df. A DataFrame is a 2-dimensional labeled data structure with columns that can be of different types.

print("Initial Data Inspection:"): This line prints a statement indicating the beginning of the initial data inspection process.

print("\nFirst 5 rows of the dataset:"): This prints a new line followed by a statement that the next output will show the first 5 rows of the dataset.

print(df.head()): df.head() displays the first 5 rows of the DataFrame df. This is helpful to get a quick overview of the dataset's structure and data.

print("\nLast 5 rows of the dataset:"): This prints a new line and a statement indicating that the next output will show the last 5 rows of the dataset.

print(df.tail()): df.tail() displays the last 5 rows of the DataFrame df. This can be useful to see how the dataset ends and to ensure it has been read correctly.

print("\nShape of the dataset (Rows, Columns):", df.shape): This prints the shape of the DataFrame df, showing the number of rows and columns, helping understand the dataset's size.

print("\nSummary of the DataFrame:"): This prints a statement that the next output will provide a summary of the DataFrame.

df.info(): This line outputs a concise summary of the DataFrame, including the number of non-null entries for each column, which is useful for identifying missing values.

print("\nData Types of Each Column:"): This prints a statement that the next output will show the data types of each column in the DataFrame.

print(df.dtypes): This displays the data types of each column in df, which is important for data processing and analysis.

print("\nDescriptive Statistics for Numerical Columns:"): This prints a statement indicating that the next output will show descriptive statistics for numerical columns.

print(df.describe()): This line outputs descriptive statistics that summarize the central tendency, dispersion, and shape of the dataset’s distribution, excluding NaN values. This is particularly useful for understanding the numerical columns' distribution.

In [35]:

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset
df = pd.read_csv('vgsales.csv')

# Initial Data Inspection
print("Initial Data Inspection:")

# Display the first 5 rows to understand the dataset's structure
print("\nFirst 5 rows of the dataset:")
print(df.head())

# Display the last 5 rows to see the end part of the dataset
print("\nLast 5 rows of the dataset:")
print(df.tail())

# Display the shape of the dataset to know the number of rows and columns
print("\nShape of the dataset (Rows, Columns):", df.shape)

# Use df.info() for a concise summary of the DataFrame
print("\nSummary of the DataFrame:")
df.info()

# Display data types of each column for data type understanding
print("\nData Types of Each Column:")
print(df.dtypes)

# Display descriptive statistics to understand the distribution of numerical columns
print("\nDescriptive Statistics for Numerical Columns:")
print(df.describe())

Initial Data Inspection:

First 5 rows of the dataset:
                       Name Platform  Year_of_Release         Genre Publisher  \
0                Wii Sports      Wii           2006.0        Sports  Nintendo   
1         Super Mario Bros.      NES           1985.0      Platform  Nintendo   
2            Mario Kart Wii      Wii           2008.0        Racing  Nintendo   
3         Wii Sports Resort      Wii           2009.0        Sports  Nintendo   
4  Pokemon Red/Pokemon Blue       GB           1996.0  Role-Playing  Nintendo   

   NA_Sales  EU_Sales  JP_Sales  Other_Sales  Global_Sales  Critic_Score  \
0     41.36     28.96      3.77         8.45         82.53          76.0   
1     29.08      3.58      6.81         0.77         40.24           NaN   
2     15.68     12.76      3.79         3.29         35.52          82.0   
3     15.61     10.93      3.28         2.95         32.77          80.0   
4     11.27      8.89     10.22         1.00         31.37           NaN   

   Critic_Count User_Score  User_Count Developer Rating  
0          51.0          8       322.0  Nintendo      E  
1           NaN        NaN         NaN       NaN    NaN  
2          73.0        8.3       709.0  Nintendo      E  
3          73.0          8       192.0  Nintendo      E  
4           NaN        NaN         NaN       NaN    NaN  

Last 5 rows of the dataset:
                                Name Platform  Year_of_Release       Genre  \
16714  Samurai Warriors: Sanada Maru      PS3           2016.0      Action   
16715               LMA Manager 2007     X360           2006.0      Sports   
16716        Haitaka no Psychedelica      PSV           2016.0   Adventure   
16717               Spirits & Spells      GBA           2003.0    Platform   
16718            Winning Post 8 2016      PSV           2016.0  Simulation   

          Publisher  NA_Sales  EU_Sales  JP_Sales  Other_Sales  Global_Sales  \
16714    Tecmo Koei      0.00      0.00      0.01          0.0          0.01   
16715   Codemasters      0.00      0.01      0.00          0.0          0.01   
16716  Idea Factory      0.00      0.00      0.01          0.0          0.01   
16717       Wanadoo      0.01      0.00      0.00          0.0          0.01   
16718    Tecmo Koei      0.00      0.00      0.01          0.0          0.01   

       Critic_Score  Critic_Count User_Score  User_Count Developer Rating  
16714           NaN           NaN        NaN         NaN       NaN    NaN  
16715           NaN           NaN        NaN         NaN       NaN    NaN  
16716           NaN           NaN        NaN         NaN       NaN    NaN  
16717           NaN           NaN        NaN         NaN       NaN    NaN  
16718           NaN           NaN        NaN         NaN       NaN    NaN  

Shape of the dataset (Rows, Columns): (16719, 16)

Summary of the DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16719 entries, 0 to 16718
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             16717 non-null  object 
 1   Platform         16719 non-null  object 
 2   Year_of_Release  16450 non-null  float64
 3   Genre            16717 non-null  object 
 4   Publisher        16665 non-null  object 
 5   NA_Sales         16719 non-null  float64
 6   EU_Sales         16719 non-null  float64
 7   JP_Sales         16719 non-null  float64
 8   Other_Sales      16719 non-null  float64
 9   Global_Sales     16719 non-null  float64
 10  Critic_Score     8137 non-null   float64
 11  Critic_Count     8137 non-null   float64
 12  User_Score       10015 non-null  object 
 13  User_Count       7590 non-null   float64
 14  Developer        10096 non-null  object 
 15  Rating           9950 non-null   object 
dtypes: float64(9), object(7)
memory usage: 2.0+ MB

Data Types of Each Column:
Name                object
Platform            object
Year_of_Release    float64
Genre               object
Publisher           object
NA_Sales           float64
EU_Sales           float64
JP_Sales           float64
Other_Sales        float64
Global_Sales       float64
Critic_Score       float64
Critic_Count       float64
User_Score          object
User_Count         float64
Developer           object
Rating              object
dtype: object

Descriptive Statistics for Numerical Columns:
       Year_of_Release      NA_Sales      EU_Sales      JP_Sales  \
count     16450.000000  16719.000000  16719.000000  16719.000000   
mean       2006.487356      0.263330      0.145025      0.077602   
std           5.878995      0.813514      0.503283      0.308818   
min        1980.000000      0.000000      0.000000      0.000000   
25%        2003.000000      0.000000      0.000000      0.000000   
50%        2007.000000      0.080000      0.020000      0.000000   
75%        2010.000000      0.240000      0.110000      0.040000   
max        2020.000000     41.360000     28.960000     10.220000   

        Other_Sales  Global_Sales  Critic_Score  Critic_Count    User_Count  
count  16719.000000  16719.000000   8137.000000   8137.000000   7590.000000  
mean       0.047332      0.533543     68.967679     26.360821    162.229908  
std        0.186710      1.547935     13.938165     18.980495    561.282326  
min        0.000000      0.010000     13.000000      3.000000      4.000000  
25%        0.000000      0.060000     60.000000     12.000000     10.000000  
50%        0.010000      0.170000     71.000000     21.000000     24.000000  
75%        0.030000      0.470000     79.000000     36.000000     81.000000  
max       10.570000     82.530000     98.000000    113.000000  10665.000000

In [36]:

import pandas as pd

df = df.dropna(subset=['Year_of_Release'])
df['Year_of_Release'] = df['Year_of_Release'].astype(int)

df = df.dropna(subset=['Name'])

platform_brand_mapping = {
    'Wii': 'Nintendo', 'NES': 'Nintendo', 'GB': 'Nintendo', 'DS': 'Nintendo', 'SNES': 'Nintendo',
    '3DS': 'Nintendo', 'N64': 'Nintendo', 'GBA': 'Nintendo', 'GC': 'Nintendo', 'WiiU': 'Nintendo', 'Switch': 'Nintendo',
    'PS': 'Sony', 'PS2': 'Sony', 'PS3': 'Sony', 'PS4': 'Sony', 'PSP': 'Sony', 'PSV': 'Sony',
    'X360': 'Microsoft', 'XB': 'Microsoft', 'XOne': 'Microsoft',
    'PC': 'PC',  # 
}

df['Console_Brand'] = df['Platform'].map(platform_brand_mapping)

df['User_Score'] = pd.to_numeric(df['User_Score'], errors='coerce')

df['Critic_Count'] = df['Critic_Count'].fillna(0).astype(int)
df['User_Count'] = df['User_Count'].fillna(0).astype(int)

df['Publisher'].fillna('Unknown', inplace=True)

print(df.head())

print("\nMissing values in the dataset after cleaning:")
print(df.isnull().sum())

                       Name Platform  Year_of_Release         Genre Publisher  \
0                Wii Sports      Wii             2006        Sports  Nintendo   
1         Super Mario Bros.      NES             1985      Platform  Nintendo   
2            Mario Kart Wii      Wii             2008        Racing  Nintendo   
3         Wii Sports Resort      Wii             2009        Sports  Nintendo   
4  Pokemon Red/Pokemon Blue       GB             1996  Role-Playing  Nintendo   

   NA_Sales  EU_Sales  JP_Sales  Other_Sales  Global_Sales  Critic_Score  \
0     41.36     28.96      3.77         8.45         82.53          76.0   
1     29.08      3.58      6.81         0.77         40.24           NaN   
2     15.68     12.76      3.79         3.29         35.52          82.0   
3     15.61     10.93      3.28         2.95         32.77          80.0   
4     11.27      8.89     10.22         1.00         31.37           NaN   

   Critic_Count  User_Score  User_Count Developer Rating Console_Brand  
0            51         8.0         322  Nintendo      E      Nintendo  
1             0         NaN           0       NaN    NaN      Nintendo  
2            73         8.3         709  Nintendo      E      Nintendo  
3            73         8.0         192  Nintendo      E      Nintendo  
4             0         NaN           0       NaN    NaN      Nintendo  

Missing values in the dataset after cleaning:
Name                  0
Platform              0
Year_of_Release       0
Genre                 0
Publisher             0
NA_Sales              0
EU_Sales              0
JP_Sales              0
Other_Sales           0
Global_Sales          0
Critic_Score       8465
Critic_Count          0
User_Score         8985
User_Count            0
Developer          6541
Rating             6679
Console_Brand       399
dtype: int64

3-4. Convert 'Year_of_Release' from float to integer:

df = df.dropna(subset=['Year_of_Release']): Drops rows where the 'Year_of_Release' value is missing (NaN) to ensure that only rows with valid years are converted to integers. df['Year_of_Release'] = df['Year_of_Release'].astype(int): Converts the 'Year_of_Release' column from float to integer data type. This step is valid after removing NaN values, as NaN is considered a float, and integers cannot represent NaN.

Drop rows where 'Name' is missing:

df = df.dropna(subset=['Name']): Removes rows from the DataFrame where the 'Name' column has missing values. The 'Name' column is crucial for identifying games, so rows without this information are not useful for analysis. 8-13. Define the mapping from platform to console brand:

A dictionary named platform_brand_mapping is defined, where keys are game platform codes (like 'Wii', 'PS4', etc.) and values are the corresponding console brands ('Nintendo', 'Sony', etc.). This mapping is used to categorize games by the brand of their console.

Map 'Platform' to 'Console_Brand':

df['Console_Brand'] = df['Platform'].map(platform_brand_mapping): Creates a new column 'Console_Brand' in the DataFrame by mapping each 'Platform' value to its corresponding console brand using the platform_brand_mapping dictionary.

Convert 'User_Score' to numeric:

df['User_Score'] = pd.to_numeric(df['User_Score'], errors='coerce'): Converts the 'User_Score' column to a numeric type. The errors='coerce' parameter ensures that any values that cannot be converted to numeric (like strings or special characters) are set to NaN, handling potential data quality issues gracefully.

19-20. Fill missing values in 'Critic_Count' and 'User_Count':

Missing values in 'Critic_Count' and 'User_Count' are filled with 0, assuming that a missing count equates to no reviews. These columns are then converted to integer data type, as counts are discrete values.

Replace missing 'Publisher' values with 'Unknown': df['Publisher'].fillna('Unknown', inplace=True): Missing values in the 'Publisher' column are replaced with the string 'Unknown'. This maintains data integrity by ensuring there are no missing values in this column, which could be important for analysis.

Display the first 5 rows: print(df.head()): Prints the first 5 rows of the DataFrame to confirm the changes made during the data cleaning process. 26-27. Check for remaining missing values:

The remaining missing values in the DataFrame are checked and printed. This helps in assessing the cleanliness of the dataset after the applied transformations and in deciding if further cleaning is needed.

In [37]:

df.dropna(subset=['Name', 'Genre'], inplace=True)

df['Publisher'].fillna('Unknown', inplace=True)

df['User_Score'] = pd.to_numeric(df['User_Score'], errors='coerce')

valid_scores = df.dropna(subset=['User_Score', 'Critic_Score'])
average_diff = (valid_scores['User_Score'] - valid_scores['Critic_Score']).mean()

missing_user = df['User_Score'].isnull() & df['Critic_Score'].notnull()
df.loc[missing_user, 'User_Score'] = df['Critic_Score'] + average_diff

missing_critic = df['Critic_Score'].isnull() & df['User_Score'].notnull()
df.loc[missing_critic, 'Critic_Score'] = df['User_Score'] - average_diff

df = df.drop_duplicates(subset=['Name', 'Year_of_Release', 'Platform'])

print(df.head())

statistical_details = df.drop(columns='Year_of_Release').describe()

print("\nStatistical details of the dataset (excluding 'Year_of_Release'):")
print(statistical_details)

                       Name Platform  Year_of_Release         Genre Publisher  \
0                Wii Sports      Wii             2006        Sports  Nintendo   
1         Super Mario Bros.      NES             1985      Platform  Nintendo   
2            Mario Kart Wii      Wii             2008        Racing  Nintendo   
3         Wii Sports Resort      Wii             2009        Sports  Nintendo   
4  Pokemon Red/Pokemon Blue       GB             1996  Role-Playing  Nintendo   

   NA_Sales  EU_Sales  JP_Sales  Other_Sales  Global_Sales  Critic_Score  \
0     41.36     28.96      3.77         8.45         82.53          76.0   
1     29.08      3.58      6.81         0.77         40.24           NaN   
2     15.68     12.76      3.79         3.29         35.52          82.0   
3     15.61     10.93      3.28         2.95         32.77          80.0   
4     11.27      8.89     10.22         1.00         31.37           NaN   

   Critic_Count  User_Score  User_Count Developer Rating Console_Brand  
0            51         8.0         322  Nintendo      E      Nintendo  
1             0         NaN           0       NaN    NaN      Nintendo  
2            73         8.3         709  Nintendo      E      Nintendo  
3            73         8.0         192  Nintendo      E      Nintendo  
4             0         NaN           0       NaN    NaN      Nintendo  

Statistical details of the dataset (excluding 'Year_of_Release'):
           NA_Sales      EU_Sales      JP_Sales   Other_Sales  Global_Sales  \
count  16447.000000  16447.000000  16447.000000  16447.000000  16447.000000   
mean       0.263981      0.145903      0.078476      0.047586      0.536202   
std        0.818308      0.506674      0.311073      0.187989      1.558502   
min        0.000000      0.000000      0.000000      0.000000      0.010000   
25%        0.000000      0.000000      0.000000      0.000000      0.060000   
50%        0.080000      0.020000      0.000000      0.010000      0.170000   
75%        0.240000      0.110000      0.040000      0.030000      0.470000   
max       41.360000     28.960000     10.220000     10.570000     82.530000   

       Critic_Score  Critic_Count   User_Score    User_Count  
count   8551.000000  16447.000000  8551.000000  16447.000000  
mean      69.026180     12.832675     5.953759     73.963702  
std       13.458964     18.708601     5.286127    388.389327  
min       13.000000      0.000000   -40.074108      0.000000  
25%       61.000000      0.000000     5.900000      0.000000  
50%       70.374108      0.000000     7.300000      0.000000  
75%       79.000000     21.000000     8.200000     20.000000  
max       98.000000    113.000000    26.925892  10665.000000

Remove rows where 'Name' or 'Genre' is missing:

df.dropna(subset=['Name', 'Genre'], inplace=True): This line removes any rows where either 'Name' or 'Genre' is missing, as these are likely crucial for any analysis involving game titles or categorizations. Replace missing 'Publisher' values with 'Unknown':

df['Publisher'].fillna('Unknown', inplace=True): Fills in missing 'Publisher' values with the string 'Unknown', ensuring that there are no missing values in this column which could be important for categorizing or identifying games. Convert 'User_Score' to numeric:

df['User_Score'] = pd.to_numeric(df['User_Score'], errors='coerce'): Converts the 'User_Score' column to a numeric data type, setting any non-numeric values to NaN. This is crucial for any numerical analysis or operations involving user scores. 4-5. Calculate the average difference where both scores are present:

These lines first filter rows where both 'User_Score' and 'Critic_Score' are present. Then, it calculates the average difference between these scores, which will be used to impute missing values in the next steps. 6-7. Impute missing User_Scores:

Missing 'User_Score' values are imputed by adding the calculated average difference to the 'Critic_Score'. This assumes that the relationship between user and critic scores is consistent enough to use for imputation. 8-9. Impute missing Critic_Scores:

Similarly, missing 'Critic_Score' values are imputed by subtracting the average difference from the 'User_Score'. This maintains the assumed consistency between the two types of scores. Remove duplicates:

df = df.drop_duplicates(subset=['Name', 'Year_of_Release', 'Platform']): Removes duplicate entries based on the combination of 'Name', 'Year_of_Release', and 'Platform', ensuring that each game-platform-year combination is unique in the dataset. Display the DataFrame:

print(df.head()): Prints the first 5 rows of the DataFrame to confirm the changes made by the cleaning process. 14-15. Exclude 'Year_of_Release' from the statistical summary:

The 'Year_of_Release' column is excluded from the statistical summary because it might not be meaningful to include it in descriptive statistics like mean, standard deviation, etc. The .describe() method then provides a summary of the remaining columns, typically focusing on numerical ones.

17-18. Print statistical details:

Prints the statistical details of the dataset, excluding 'Year_of_Release', providing insights into the central tendency, dispersion, and shape of the dataset’s distribution for the numerical columns.

In [38]:

print("\n=== Data Cleaning and Preparation ===\n")

df.dropna(subset=['Name', 'Genre'], inplace=True)

df['Publisher'].fillna('Unknown', inplace=True)

df['User_Score'] = pd.to_numeric(df['User_Score'], errors='coerce')

valid_scores = df.dropna(subset=['User_Score', 'Critic_Score'])
average_diff = (valid_scores['User_Score'] - valid_scores['Critic_Score']).mean()

missing_user = df['User_Score'].isnull() & df['Critic_Score'].notnull()
df.loc[missing_user, 'User_Score'] = df['Critic_Score'] + average_diff

missing_critic = df['Critic_Score'].isnull() & df['User_Score'].notnull()
df.loc[missing_critic, 'Critic_Score'] = df['User_Score'] - average_diff

df = df.drop_duplicates(subset=['Name', 'Year_of_Release', 'Platform'])

print("\nConfirmed Changes - First 5 Rows:")
display(df.head().style.set_properties(**{'background-color': 'lavender', 'color': 'black'}))

statistical_details = df.drop(columns='Year_of_Release').describe()

print("\nStatistical Details (Excluding 'Year_of_Release'):")
display(statistical_details.style.background_gradient(cmap='coolwarm').set_precision(2))

=== Data Cleaning and Preparation ===


Confirmed Changes - First 5 Rows:

	Name	Platform	Year_of_Release	Genre	Publisher	NA_Sales	EU_Sales	JP_Sales	Other_Sales	Global_Sales	Critic_Score	Critic_Count	User_Score	User_Count	Developer	Rating	Console_Brand
0	Wii Sports	Wii	2006	Sports	Nintendo	41.360000	28.960000	3.770000	8.450000	82.530000	76.000000	51	8.000000	322	Nintendo	E	Nintendo
1	Super Mario Bros.	NES	1985	Platform	Nintendo	29.080000	3.580000	6.810000	0.770000	40.240000	nan	0	nan	0	nan	nan	Nintendo
2	Mario Kart Wii	Wii	2008	Racing	Nintendo	15.680000	12.760000	3.790000	3.290000	35.520000	82.000000	73	8.300000	709	Nintendo	E	Nintendo
3	Wii Sports Resort	Wii	2009	Sports	Nintendo	15.610000	10.930000	3.280000	2.950000	32.770000	80.000000	73	8.000000	192	Nintendo	E	Nintendo
4	Pokemon Red/Pokemon Blue	GB	1996	Role-Playing	Nintendo	11.270000	8.890000	10.220000	1.000000	31.370000	nan	0	nan	0	nan	nan	Nintendo

Statistical Details (Excluding 'Year_of_Release'):

C:\Users\Luke Holmes\AppData\Local\Temp\ipykernel_20752\1336875849.py:26: FutureWarning:

this method is deprecated in favour of `Styler.format(precision=..)`

	NA_Sales	EU_Sales	JP_Sales	Other_Sales	Global_Sales	Critic_Score	Critic_Count	User_Score	User_Count
count	16447.00	16447.00	16447.00	16447.00	16447.00	8551.00	16447.00	8551.00	16447.00
mean	0.26	0.15	0.08	0.05	0.54	69.03	12.83	5.95	73.96
std	0.82	0.51	0.31	0.19	1.56	13.46	18.71	5.29	388.39
min	0.00	0.00	0.00	0.00	0.01	13.00	0.00	-40.07	0.00
25%	0.00	0.00	0.00	0.00	0.06	61.00	0.00	5.90	0.00
50%	0.08	0.02	0.00	0.01	0.17	70.37	0.00	7.30	0.00
75%	0.24	0.11	0.04	0.03	0.47	79.00	21.00	8.20	20.00
max	41.36	28.96	10.22	10.57	82.53	98.00	113.00	26.93	10665.00

Start of Data Cleaning and Preparation: The print statement "\n=== Data Cleaning and Preparation ===\n" serves as a header to mark the beginning of this phase, providing clear segmentation in the output. 2-3. Remove rows with missing 'Name' or 'Genre':

Rows where either 'Name' or 'Genre' is missing are removed from the DataFrame, as these fields are essential for identifying and categorizing the games. 4-5. Replace missing 'Publisher' values:

Missing values in the 'Publisher' column are filled with 'Unknown', ensuring every game has an associated publisher, even if it's not specified. 6-7. Convert 'User_Score' to numeric:

The 'User_Score' column is converted to numeric values, with any non-numeric values being set to NaN. This allows for numerical operations and analyses to be performed on user scores. 8-9. Calculate the average difference between scores:

The average difference between 'User_Score' and 'Critic_Score' is calculated for rows where both scores are present. This value will be used for imputing missing scores in subsequent steps. 10-11. Impute missing User_Scores:

For rows with a missing 'User_Score' but a present 'Critic_Score', the missing 'User_Score' is imputed by adding the previously calculated average difference to the 'Critic_Score'. 12-13. Impute missing Critic_Scores:

Similarly, for rows with a missing 'Critic_Score' but a present 'User_Score', the missing 'Critic_Score' is imputed by subtracting the average difference from the 'User_Score'. 14-15. Remove duplicates:

Duplicate rows based on 'Name', 'Year_of_Release', and 'Platform' are removed, ensuring each row in the DataFrame represents a unique game-platform-year combination. Display the DataFrame with styled output: The first 5 rows of the cleaned DataFrame are displayed with a styled output, using lavender background and black text color for better readability and visual distinction. 19-20. Exclude 'Year_of_Release' from statistical summary:

A statistical summary of the DataFrame, excluding the 'Year_of_Release' column, is prepared. The 'Year_of_Release' is likely excluded because it doesn't provide meaningful descriptive statistics like mean or standard deviation. Print statistical details with improved formatting: The statistical details are displayed with a background gradient color scheme from the 'coolwarm' colormap and numerical precision set to 2 decimal places. This enhanced formatting makes the statistical summary more visually appealing and easier to interpret.

In [39]:

print(df.head())

print("\n--- Exploratory Data Analysis (EDA) ---")

statistical_details = df.drop(columns='Year_of_Release').describe()
print("\nStatistical details of the dataset (excluding 'Year_of_Release'):")
print(statistical_details)

platform_counts = df['Platform'].value_counts()
print("\nNumber of games per platform:")
print(platform_counts)

                       Name Platform  Year_of_Release         Genre Publisher  \
0                Wii Sports      Wii             2006        Sports  Nintendo   
1         Super Mario Bros.      NES             1985      Platform  Nintendo   
2            Mario Kart Wii      Wii             2008        Racing  Nintendo   
3         Wii Sports Resort      Wii             2009        Sports  Nintendo   
4  Pokemon Red/Pokemon Blue       GB             1996  Role-Playing  Nintendo   

   NA_Sales  EU_Sales  JP_Sales  Other_Sales  Global_Sales  Critic_Score  \
0     41.36     28.96      3.77         8.45         82.53          76.0   
1     29.08      3.58      6.81         0.77         40.24           NaN   
2     15.68     12.76      3.79         3.29         35.52          82.0   
3     15.61     10.93      3.28         2.95         32.77          80.0   
4     11.27      8.89     10.22         1.00         31.37           NaN   

   Critic_Count  User_Score  User_Count Developer Rating Console_Brand  
0            51         8.0         322  Nintendo      E      Nintendo  
1             0         NaN           0       NaN    NaN      Nintendo  
2            73         8.3         709  Nintendo      E      Nintendo  
3            73         8.0         192  Nintendo      E      Nintendo  
4             0         NaN           0       NaN    NaN      Nintendo  

--- Exploratory Data Analysis (EDA) ---

Statistical details of the dataset (excluding 'Year_of_Release'):
           NA_Sales      EU_Sales      JP_Sales   Other_Sales  Global_Sales  \
count  16447.000000  16447.000000  16447.000000  16447.000000  16447.000000   
mean       0.263981      0.145903      0.078476      0.047586      0.536202   
std        0.818308      0.506674      0.311073      0.187989      1.558502   
min        0.000000      0.000000      0.000000      0.000000      0.010000   
25%        0.000000      0.000000      0.000000      0.000000      0.060000   
50%        0.080000      0.020000      0.000000      0.010000      0.170000   
75%        0.240000      0.110000      0.040000      0.030000      0.470000   
max       41.360000     28.960000     10.220000     10.570000     82.530000   

       Critic_Score  Critic_Count   User_Score    User_Count  
count   8551.000000  16447.000000  8551.000000  16447.000000  
mean      69.026180     12.832675     5.953759     73.963702  
std       13.458964     18.708601     5.286127    388.389327  
min       13.000000      0.000000   -40.074108      0.000000  
25%       61.000000      0.000000     5.900000      0.000000  
50%       70.374108      0.000000     7.300000      0.000000  
75%       79.000000     21.000000     8.200000     20.000000  
max       98.000000    113.000000    26.925892  10665.000000  

Number of games per platform:
PS2     2127
DS      2122
PS3     1305
Wii     1286
X360    1232
PSP     1193
PS      1190
PC       957
GBA      811
XB       803
GC       542
3DS      512
PSV      431
PS4      393
N64      316
XOne     247
SNES     239
SAT      173
WiiU     147
2600     116
NES       98
GB        97
DC        52
GEN       27
NG        12
SCD        6
WS         6
3DO        3
TG16       2
GG         1
PCFX       1
Name: Platform, dtype: int64

print(df.head()): This line prints the first 5 rows of the DataFrame df. This is a common practice to quickly get a sense of the data's structure, including the column names and types of data contained in each column.

print("\n--- Exploratory Data Analysis (EDA) ---"): This prints a header to indicate the start of the exploratory data analysis section. EDA is a crucial step in data analysis, where you summarize the main characteristics of the data, often with visual methods.

3-4. Statistical details excluding 'Year_of_Release':

The df.drop(columns='Year_of_Release').describe() part of the code generates statistical summaries for all numerical columns in the DataFrame except 'Year_of_Release'. The .describe() method provides descriptive statistics that summarize the central tendency, dispersion, and shape of the dataset's distribution, excluding NaN values. The resulting statistical details are printed out. This summary typically includes count, mean, standard deviation, minimum, 25th percentile (Q1), median (50th percentile, Q2), 75th percentile (Q3), and maximum values for each numerical column. 5-6. Number of games per platform:

platform_counts = df['Platform'].value_counts(): This line counts the number of occurrences (games) for each unique value in the 'Platform' column of the DataFrame, effectively giving a distribution of the number of games available on each platform. The print("\nNumber of games per platform:") followed by print(platform_counts) prints out these counts, showing which platforms have the most games, which can be useful for understanding trends or biases in the data towards certain gaming platforms.

In [40]:

# Exploratory Data Analysis (EDA)
print("\n=== Exploratory Data Analysis (EDA) ===\n")

# Display the first 5 rows with styled output
print("Dataset Preview - First 5 Rows:")
display(df.head().style.set_properties(**{'background-color': 'lightblue', 'color': 'black'}))

# Exclude 'Year_of_Release' from the statistical summary
statistical_details = df.drop(columns='Year_of_Release').describe()

# Print the statistical details of the dataset excluding 'Year_of_Release' with improved readability
print("\nStatistical Details (Excluding 'Year_of_Release'):")
display(statistical_details.style.background_gradient(cmap='viridis').set_precision(2))

# Number of games per platform with styled output
platform_counts = df['Platform'].value_counts()
print("\nNumber of Games per Platform:")
platform_counts_df = platform_counts.reset_index().rename(columns={'index': 'Platform', 'Platform': 'Count'})
display(platform_counts_df.style.set_properties(**{'background-color': 'lightgreen', 'color': 'black'}).hide_index())

=== Exploratory Data Analysis (EDA) ===

Dataset Preview - First 5 Rows:

	Name	Platform	Year_of_Release	Genre	Publisher	NA_Sales	EU_Sales	JP_Sales	Other_Sales	Global_Sales	Critic_Score	Critic_Count	User_Score	User_Count	Developer	Rating	Console_Brand
0	Wii Sports	Wii	2006	Sports	Nintendo	41.360000	28.960000	3.770000	8.450000	82.530000	76.000000	51	8.000000	322	Nintendo	E	Nintendo
1	Super Mario Bros.	NES	1985	Platform	Nintendo	29.080000	3.580000	6.810000	0.770000	40.240000	nan	0	nan	0	nan	nan	Nintendo
2	Mario Kart Wii	Wii	2008	Racing	Nintendo	15.680000	12.760000	3.790000	3.290000	35.520000	82.000000	73	8.300000	709	Nintendo	E	Nintendo
3	Wii Sports Resort	Wii	2009	Sports	Nintendo	15.610000	10.930000	3.280000	2.950000	32.770000	80.000000	73	8.000000	192	Nintendo	E	Nintendo
4	Pokemon Red/Pokemon Blue	GB	1996	Role-Playing	Nintendo	11.270000	8.890000	10.220000	1.000000	31.370000	nan	0	nan	0	nan	nan	Nintendo

Statistical Details (Excluding 'Year_of_Release'):

C:\Users\Luke Holmes\AppData\Local\Temp\ipykernel_20752\3464526511.py:13: FutureWarning:

this method is deprecated in favour of `Styler.format(precision=..)`

	NA_Sales	EU_Sales	JP_Sales	Other_Sales	Global_Sales	Critic_Score	Critic_Count	User_Score	User_Count
count	16447.00	16447.00	16447.00	16447.00	16447.00	8551.00	16447.00	8551.00	16447.00
mean	0.26	0.15	0.08	0.05	0.54	69.03	12.83	5.95	73.96
std	0.82	0.51	0.31	0.19	1.56	13.46	18.71	5.29	388.39
min	0.00	0.00	0.00	0.00	0.01	13.00	0.00	-40.07	0.00
25%	0.00	0.00	0.00	0.00	0.06	61.00	0.00	5.90	0.00
50%	0.08	0.02	0.00	0.01	0.17	70.37	0.00	7.30	0.00
75%	0.24	0.11	0.04	0.03	0.47	79.00	21.00	8.20	20.00
max	41.36	28.96	10.22	10.57	82.53	98.00	113.00	26.93	10665.00

Number of Games per Platform:

C:\Users\Luke Holmes\AppData\Local\Temp\ipykernel_20752\3464526511.py:19: FutureWarning:

this method is deprecated in favour of `Styler.hide(axis="index")`

Platform	Count
PS2	2127
DS	2122
PS3	1305
Wii	1286
X360	1232
PSP	1193
PS	1190
PC	957
GBA	811
XB	803
GC	542
3DS	512
PSV	431
PS4	393
N64	316
XOne	247
SNES	239
SAT	173
WiiU	147
2600	116
NES	98
GB	97
DC	52
GEN	27
NG	12
SCD	6
WS	6
3DO	3
TG16	2
GG	1
PCFX	1

Start of EDA Section:

print("\n=== Exploratory Data Analysis (EDA) ===\n"): This print statement serves as a header to clearly mark the beginning of the exploratory data analysis part of the script, setting the stage for the analysis to follow. Dataset Preview:

The first 5 rows of the DataFrame df are displayed with a styled output. The style is set to have a lightblue background and black text color, enhancing readability and visual appeal. This provides a quick glance at the dataset's structure, including column names and sample data. Statistical Summary Excluding 'Year_of_Release':

The code first excludes the 'Year_of_Release' column from the statistical summary because it might not provide meaningful descriptive statistics like mean or standard deviation. Then, the .describe() method is used to generate a summary for the remaining columns, typically focusing on numerical ones. The statistical details are then printed with enhanced readability, applying a viridis color gradient and setting numerical precision to 2 decimal places. This makes the summary more visually engaging and easier to interpret. Number of Games per Platform:

The number of games per platform is calculated using the value_counts() method on the 'Platform' column, giving an insight into the distribution of games across different gaming platforms. This distribution is converted into a DataFrame for better presentation (platform_counts_df). The DataFrame is then styled with a lightgreen background and black text color, and the index is hidden to emphasize the data over the DataFrame's default indexing. This styled output provides a clear and visually appealing view of how many games are available per platform, highlighting the most and least popular platforms in the dataset.

In [41]:

# Genre Analysis
print("\n--- Genre Analysis ---")

genre_counts = df['Genre'].value_counts()
print("\nNumber of games per genre:")
print(genre_counts)

global_genre_popularity = df.groupby('Genre')['Global_Sales'].sum().sort_values(ascending=False).reset_index()
global_genre_popularity.rename(columns={'Global_Sales': 'Global_Genre_Popularity'}, inplace=True)

eu_genre_popularity = df.groupby('Genre')['EU_Sales'].sum().sort_values(ascending=False).reset_index()
eu_genre_popularity.rename(columns={'EU_Sales': 'EU_Genre_Popularity'}, inplace=True)

jp_genre_popularity = df.groupby('Genre')['JP_Sales'].sum().sort_values(ascending=False).reset_index()
jp_genre_popularity.rename(columns={'JP_Sales': 'JP_Genre_Popularity'}, inplace=True)

print("\nGlobal Genre Popularity:\n", global_genre_popularity)
print("\nEU Genre Popularity:\n", eu_genre_popularity)
print("\nJP Genre Popularity:\n", jp_genre_popularity)

print("\n--- Series Installment Calculation ---")
df.sort_values(by=['Name', 'Platform', 'Year_of_Release'], inplace=True)
df['Series_Installment'] = df.groupby(['Name', 'Platform']).cumcount() + 1
print(df[['Name', 'Platform', 'Year_of_Release', 'Series_Installment']].head())

print("\n--- Market Share Calculation ---")
total_sales_by_region = df[['NA_Sales', 'EU_Sales', 'JP_Sales', 'Global_Sales']].sum()
df_genres = df.groupby('Genre')[['NA_Sales', 'EU_Sales', 'JP_Sales', 'Global_Sales']].sum()
df_genres = df_genres.divide(total_sales_by_region) * 100  # Convert to percentage for market share
print(df_genres)

--- Genre Analysis ---

Number of games per genre:
Action          3308
Sports          2305
Misc            1721
Role-Playing    1483
Shooter         1296
Adventure       1293
Racing          1226
Platform         878
Simulation       858
Fighting         837
Strategy         673
Puzzle           569
Name: Genre, dtype: int64

Global Genre Popularity:
            Genre  Global_Genre_Popularity
0         Action                  1717.63
1         Sports                  1310.38
2        Shooter                  1041.83
3   Role-Playing                   930.90
4       Platform                   825.86
5           Misc                   790.94
6         Racing                   723.70
7       Fighting                   442.63
8     Simulation                   388.40
9         Puzzle                   240.33
10     Adventure                   233.46
11      Strategy                   172.85

EU Genre Popularity:
            Genre  EU_Genre_Popularity
0         Action               510.99
1         Sports               371.32
2        Shooter               314.52
3         Racing               234.49
4           Misc               210.60
5       Platform               199.39
6   Role-Playing               188.24
7     Simulation               113.35
8       Fighting                99.00
9      Adventure                63.20
10        Puzzle                49.78
11      Strategy                44.79

JP Genre Popularity:
            Genre  JP_Genre_Popularity
0   Role-Playing               353.44
1         Action               160.15
2         Sports               134.93
3       Platform               130.71
4           Misc               107.02
5       Fighting                87.28
6     Simulation                63.64
7         Puzzle                56.68
8         Racing                56.63
9      Adventure                52.24
10      Strategy                49.30
11       Shooter                38.68

--- Series Installment Calculation ---
                              Name Platform  Year_of_Release  \
14985               Beyblade Burst      3DS             2016   
1079             Fire Emblem Fates      3DS             2015   
3358          Frozen: Olaf's Quest      3DS             2013   
3862          Frozen: Olaf's Quest       DS             2013   
13795   Haikyu!! Cross Team Match!      3DS             2016   

       Series_Installment  
14985                   1  
1079                    1  
3358                    1  
3862                    1  
13795                   1  

--- Market Share Calculation ---
               NA_Sales   EU_Sales   JP_Sales  Global_Sales
Genre                                                      
Action        19.880968  21.294178  12.407996     19.476670
Adventure      2.338260   2.633695   4.047416      2.647266
Fighting       5.078898   4.125567   6.762222      5.019101
Misc           9.203098   8.776207   8.291625      8.968682
Platform      10.236567   8.309059  10.127063      9.364649
Puzzle         2.789927   2.074452   4.391416      2.725167
Racing         8.219380   9.771760   4.387542      8.206230
Role-Playing   7.601648   7.844412  27.383590     10.555726
Shooter       13.470100  13.106802   2.996823     11.813591
Simulation     4.161283   4.723566   4.930658      4.404172
Sports        15.459418  15.473794  10.454017     14.858752
Strategy       1.560452   1.866507   3.819633      1.959993

Genre Analysis Section: Header: print("\n--- Genre Analysis ---") marks the beginning of the genre analysis section. Number of Games per Genre: genre_counts = df['Genre'].value_counts() counts the number of games in each genre, which is then printed. This gives an insight into which genres are most populated with titles in the dataset. Regional Genre Popularity: The dataset is grouped by 'Genre' and the sales columns ('Global_Sales', 'EU_Sales', 'JP_Sales') are aggregated to sum up the total sales per genre. This data is sorted in descending order to highlight the most popular genres in terms of sales. Global Genre Popularity: Shows which genres have sold the most globally. EU Genre Popularity: Focuses on genre popularity in Europe. JP Genre Popularity: Focuses on genre popularity in Japan. These sections provide insights into regional preferences and global popularity by genre, which can inform market strategies and content creation. Series Installment Calculation: The dataset is sorted by 'Name', 'Platform', and 'Year_of_Release' to ensure a logical sequence for installment numbering. df['Series_Installment'] = df.groupby(['Name', 'Platform']).cumcount() + 1 assigns an installment number to each game within its series on a specific platform, starting from 1. This helps identify sequels and their order in the dataset. A preview of this data is printed, showcasing the new 'Series_Installment' column alongside 'Name', 'Platform', and 'Year_of_Release'. Market Share Calculation: total_sales_by_region = df[['NA_Sales', 'EU_Sales', 'JP_Sales', 'Global_Sales']].sum() calculates the total sales in each region and globally. The dataset is then grouped by 'Genre', and sales are summed up per genre for each region. This sum is divided by the total sales calculated earlier and multiplied by 100 to get the market share percentage of each genre in each region. This market share data is printed, providing insights into how different genres perform and compete in various markets.

In [42]:

print("\n=== Genre Analysis ===\n")

genre_counts = df['Genre'].value_counts()
print("\nNumber of Games per Genre:")
genre_counts_df = genre_counts.reset_index().rename(columns={'index': 'Genre', 'Genre': 'Count'})
display(genre_counts_df.style.set_properties(**{'background-color': 'lightcoral', 'color': 'black'}).hide_index())

global_genre_popularity = df.groupby('Genre')['Global_Sales'].sum().sort_values(ascending=False).reset_index()
global_genre_popularity.rename(columns={'Global_Sales': 'Global_Genre_Popularity'}, inplace=True)

eu_genre_popularity = df.groupby('Genre')['EU_Sales'].sum().sort_values(ascending=False).reset_index()
eu_genre_popularity.rename(columns={'EU_Sales': 'EU_Genre_Popularity'}, inplace=True)

jp_genre_popularity = df.groupby('Genre')['JP_Sales'].sum().sort_values(ascending=False).reset_index()
jp_genre_popularity.rename(columns={'JP_Sales': 'JP_Genre_Popularity'}, inplace=True)

print("\nGlobal Genre Popularity:")
display(global_genre_popularity.style.background_gradient(cmap='coolwarm').set_precision(2))

print("\nEU Genre Popularity:")
display(eu_genre_popularity.style.background_gradient(cmap='coolwarm').set_precision(2))

print("\nJP Genre Popularity:")
display(jp_genre_popularity.style.background_gradient(cmap='coolwarm').set_precision(2))

print("\n=== Series Installment Calculation ===\n")
df.sort_values(by=['Name', 'Platform', 'Year_of_Release'], inplace=True)
df['Series_Installment'] = df.groupby(['Name', 'Platform']).cumcount() + 1
display(df[['Name', 'Platform', 'Year_of_Release', 'Series_Installment']].head().style.set_properties(**{'background-color': 'lavender', 'color': 'black'}))

print("\n=== Market Share Calculation ===\n")
total_sales_by_region = df[['NA_Sales', 'EU_Sales', 'JP_Sales', 'Global_Sales']].sum()
df_genres = df.groupby('Genre')[['NA_Sales', 'EU_Sales', 'JP_Sales', 'Global_Sales']].sum()
df_genres = df_genres.divide(total_sales_by_region) * 100  # Convert to percentage for market share
display(df_genres.style.background_gradient(cmap='Spectral').set_precision(2))

=== Genre Analysis ===


Number of Games per Genre:

C:\Users\Luke Holmes\AppData\Local\Temp\ipykernel_20752\525494124.py:6: FutureWarning:

this method is deprecated in favour of `Styler.hide(axis="index")`

Genre	Count
Action	3308
Sports	2305
Misc	1721
Role-Playing	1483
Shooter	1296
Adventure	1293
Racing	1226
Platform	878
Simulation	858
Fighting	837
Strategy	673
Puzzle	569

C:\Users\Luke Holmes\AppData\Local\Temp\ipykernel_20752\525494124.py:18: FutureWarning:

this method is deprecated in favour of `Styler.format(precision=..)`

Global Genre Popularity:

	Genre	Global_Genre_Popularity
0	Action	1717.63
1	Sports	1310.38
2	Shooter	1041.83
3	Role-Playing	930.90
4	Platform	825.86
5	Misc	790.94
6	Racing	723.70
7	Fighting	442.63
8	Simulation	388.40
9	Puzzle	240.33
10	Adventure	233.46
11	Strategy	172.85

EU Genre Popularity:

C:\Users\Luke Holmes\AppData\Local\Temp\ipykernel_20752\525494124.py:21: FutureWarning:

this method is deprecated in favour of `Styler.format(precision=..)`

	Genre	EU_Genre_Popularity
0	Action	510.99
1	Sports	371.32
2	Shooter	314.52
3	Racing	234.49
4	Misc	210.60
5	Platform	199.39
6	Role-Playing	188.24
7	Simulation	113.35
8	Fighting	99.00
9	Adventure	63.20
10	Puzzle	49.78
11	Strategy	44.79

JP Genre Popularity:

C:\Users\Luke Holmes\AppData\Local\Temp\ipykernel_20752\525494124.py:24: FutureWarning:

this method is deprecated in favour of `Styler.format(precision=..)`

	Genre	JP_Genre_Popularity
0	Role-Playing	353.44
1	Action	160.15
2	Sports	134.93
3	Platform	130.71
4	Misc	107.02
5	Fighting	87.28
6	Simulation	63.64
7	Puzzle	56.68
8	Racing	56.63
9	Adventure	52.24
10	Strategy	49.30
11	Shooter	38.68

=== Series Installment Calculation ===

	Name	Platform	Year_of_Release	Series_Installment
14985	Beyblade Burst	3DS	2016	1
1079	Fire Emblem Fates	3DS	2015	1
3358	Frozen: Olaf's Quest	3DS	2013	1
3862	Frozen: Olaf's Quest	DS	2013	1
13795	Haikyu!! Cross Team Match!	3DS	2016	1

=== Market Share Calculation ===

C:\Users\Luke Holmes\AppData\Local\Temp\ipykernel_20752\525494124.py:35: FutureWarning:

this method is deprecated in favour of `Styler.format(precision=..)`

	NA_Sales	EU_Sales	JP_Sales	Global_Sales
Genre
Action	19.88	21.29	12.41	19.48
Adventure	2.34	2.63	4.05	2.65
Fighting	5.08	4.13	6.76	5.02
Misc	9.20	8.78	8.29	8.97
Platform	10.24	8.31	10.13	9.36
Puzzle	2.79	2.07	4.39	2.73
Racing	8.22	9.77	4.39	8.21
Role-Playing	7.60	7.84	27.38	10.56
Shooter	13.47	13.11	3.00	11.81
Simulation	4.16	4.72	4.93	4.40
Sports	15.46	15.47	10.45	14.86
Strategy	1.56	1.87	3.82	1.96

Genre Analysis: Introduction: The section starts with a clear header, setting the context for the genre analysis. Number of Games per Genre: This part presents the count of games in each genre with an improved presentation. The counts are converted into a DataFrame and displayed with a styled output, featuring a lightcoral background and black text for enhanced readability. The index is hidden to focus on the genre and count data. Global Genre Popularity: Calculates the total global sales per genre and sorts them in descending order to identify the most popular genres worldwide. The results are presented with a gradient color scale for visual appeal. Regional Genre Popularity: Similar to the global popularity calculation, this part computes and displays the popularity of genres within the European and Japanese markets, using the same visual enhancement for consistency and clarity. Series Installment Calculation: Header: Marks the beginning of the series installment calculation section. The dataset is sorted by 'Name', 'Platform', and 'Year_of_Release' to logically sequence the games. A 'Series_Installment' column is added to the dataset, indicating the installment number of each game within its series and platform. This provides insight into the sequence of game releases. The first few rows of the updated dataset are displayed with a styled output, using a lavender background to highlight the new 'Series_Installment' information. Market Share Calculation: Introduction: The section begins with a clear header, transitioning to the market share analysis. Total sales by region are calculated to establish a denominator for market share computation. The dataset is grouped by 'Genre', and sales are summed up per genre for each region. These sums are then divided by the total sales to calculate the market share percentages. The market share data is presented with a Spectral color gradient and numerical precision set to 2 decimal places, enhancing the visual interpretation of how different genres fare in terms of market share across regions.

In [43]:

import pandas as pd


df['NA_Sales_Percent'] = (df['NA_Sales'] / df['Global_Sales']) * 100
df['EU_Sales_Percent'] = (df['EU_Sales'] / df['Global_Sales']) * 100
df['JP_Sales_Percent'] = (df['JP_Sales'] / df['Global_Sales']) * 100

total_sales_by_region = df[['NA_Sales', 'EU_Sales', 'JP_Sales', 'Global_Sales']].sum()
publisher_market_share = df.groupby('Publisher')[['NA_Sales', 'EU_Sales', 'JP_Sales', 'Global_Sales']].sum()
publisher_market_share = publisher_market_share.divide(total_sales_by_region) * 100  # Convert to percentage

df['Market_Saturation'] = df.groupby(['Year_of_Release', 'Genre'])['Name'].transform('count')

df.sort_values(by=['Name', 'Platform', 'Year_of_Release'], inplace=True)
df['Multiplatform_Release'] = df.groupby('Name')['Platform'].transform('nunique')
df['Years_Since_Last_Installment'] = df.groupby('Name')['Year_of_Release'].diff().fillna(0).astype(int)

publisher_sales = df.groupby('Publisher')['Global_Sales'].sum()
publisher_game_count = df.groupby('Publisher').size()
publisher_performance_score = publisher_sales / publisher_game_count

publisher_performance = pd.DataFrame({
    'Publisher': publisher_sales.index,
    'Total_Sales': publisher_sales.values,
    'Game_Count': publisher_game_count.values,
    'Sales_Performance_Score': publisher_performance_score.values
})


print("\nTop 10 Publishers by Sales Performance Score:")
print(publisher_performance.sort_values(by='Sales_Performance_Score', ascending=False).head(10))

print("\nSample Data with New Metrics:")
print(df[['Name', 'Year_of_Release', 'Genre', 'NA_Sales_Percent', 'EU_Sales_Percent', 'JP_Sales_Percent', 'Market_Saturation']].head())

Top 10 Publishers by Sales Performance Score:
                              Publisher  Total_Sales  Game_Count  \
387                              Palcom         4.17           1   
426                             Red Orb         5.24           2   
361                            Nintendo      1786.72         700   
40                  Arena Entertainment         4.72           2   
524                         UEP Systems         2.25           1   
428                           RedOctane         8.68           4   
222                         Hello Games         1.70           1   
534                               Valve         1.70           1   
459  Sony Computer Entertainment Europe        23.37          15   
553                    Westwood Studios         1.55           1   

     Sales_Performance_Score  
387                 4.170000  
426                 2.620000  
361                 2.552457  
40                  2.360000  
524                 2.250000  
428                 2.170000  
222                 1.700000  
534                 1.700000  
459                 1.558000  
553                 1.550000  

Sample Data with New Metrics:
                              Name  Year_of_Release         Genre  \
14985               Beyblade Burst             2016  Role-Playing   
1079             Fire Emblem Fates             2015  Role-Playing   
3358          Frozen: Olaf's Quest             2013      Platform   
3862          Frozen: Olaf's Quest             2013      Platform   
13795   Haikyu!! Cross Team Match!             2016     Adventure   

       NA_Sales_Percent  EU_Sales_Percent  JP_Sales_Percent  Market_Saturation  
14985          0.000000          0.000000        100.000000                 54  
1079          48.214286         13.690476         30.952381                 78  
3358          45.000000         45.000000          0.000000                 36  
3862          40.384615         50.000000          0.000000                 36  
13795          0.000000          0.000000        100.000000                 56

Regional Sales Percentages: Calculation: For North America (NA), Europe (EU), and Japan (JP), the percentage of sales relative to global sales is calculated for each game. This provides insight into the regional performance of each game relative to its global appeal. Publisher Market Share: Total Sales by Region: The sum of sales in NA, EU, JP, and globally is computed, providing a baseline for calculating market share. Market Share Calculation: The sales sum for each publisher in these regions is calculated and divided by the total sales to determine the market share in percentage. This metric helps identify dominant publishers in various markets. Market Saturation Metric: Calculation: The 'Market_Saturation' column is created by counting the number of games released in the same year and genre, indicating how crowded each market segment is. Multiplatform Release and Years Since Last Installment: Sorting: The DataFrame is sorted by 'Name', 'Platform', and 'Year_of_Release' to ensure chronological order for each game across platforms. Multiplatform Release: The number of unique platforms each game is released on is calculated, highlighting games with broad platform distribution. Years Since Last Installment: The difference in release years between consecutive games of the same name is computed, showing the time gap between sequels or series installments. Sales Performance Score per Publisher: Publisher Sales and Game Count: The total global sales and the number of games released by each publisher are calculated. Performance Score Calculation: The sales performance score for each publisher is computed by dividing total sales by the number of games published. This score is used to evaluate the efficiency or impact of a publisher's portfolio. Dataframe Creation: A new DataFrame is created to neatly present each publisher's total sales, game count, and sales performance score. Displaying Results: Top 10 Publishers by Sales Performance Score: Publishers are sorted by their sales performance score in descending order, and the top 10 are printed. This highlights the most commercially successful publishers relative to the number of games they have released. Sample Data with New Metrics: A sample of the DataFrame is printed, showcasing the newly calculated metrics like regional sales percentages and market saturation for the first few games.

In [44]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid")


df['NA_Sales_Percent'] = (df['NA_Sales'] / df['Global_Sales']) * 100
df['EU_Sales_Percent'] = (df['EU_Sales'] / df['Global_Sales']) * 100
df['JP_Sales_Percent'] = (df['JP_Sales'] / df['Global_Sales']) * 100

total_sales_by_region = df[['NA_Sales', 'EU_Sales', 'JP_Sales', 'Global_Sales']].sum()
publisher_market_share = df.groupby('Publisher')[['NA_Sales', 'EU_Sales', 'JP_Sales', 'Global_Sales']].sum()
publisher_market_share = publisher_market_share.divide(total_sales_by_region) * 100  # Convert to percentage

df['Market_Saturation'] = df.groupby(['Year_of_Release', 'Genre'])['Name'].transform('count')

df.sort_values(by=['Name', 'Platform', 'Year_of_Release'], inplace=True)
df['Multiplatform_Release'] = df.groupby('Name')['Platform'].transform('nunique')
df['Years_Since_Last_Installment'] = df.groupby('Name')['Year_of_Release'].diff().fillna(0).astype(int)

publisher_sales = df.groupby('Publisher')['Global_Sales'].sum()
publisher_game_count = df.groupby('Publisher').size()
publisher_performance_score = publisher_sales / publisher_game_count

publisher_performance = pd.DataFrame({
    'Publisher': publisher_sales.index,
    'Total_Sales': publisher_sales.values,
    'Game_Count': publisher_game_count.values,
    'Sales_Performance_Score': publisher_performance_score.values
}).sort_values(by='Sales_Performance_Score', ascending=False)

print("\nTop 10 Publishers by Sales Performance Score:")
display(publisher_performance.head(10).style.background_gradient(cmap='PuBu').set_precision(2))

print("\nSample Data with New Metrics:")
display(df[['Name', 'Year_of_Release', 'Genre', 'NA_Sales_Percent', 'EU_Sales_Percent', 'JP_Sales_Percent', 'Market_Saturation']].head().style.set_table_styles([{'selector': 'tr:hover', 'props': [('background-color', 'yellow')]}]))

Top 10 Publishers by Sales Performance Score:

C:\Users\Luke Holmes\AppData\Local\Temp\ipykernel_20752\923382921.py:34: FutureWarning:

this method is deprecated in favour of `Styler.format(precision=..)`

	Publisher	Total_Sales	Game_Count	Sales_Performance_Score
387	Palcom	4.17	1	4.17
426	Red Orb	5.24	2	2.62
361	Nintendo	1786.72	700	2.55
40	Arena Entertainment	4.72	2	2.36
524	UEP Systems	2.25	1	2.25
428	RedOctane	8.68	4	2.17
222	Hello Games	1.70	1	1.70
534	Valve	1.70	1	1.70
459	Sony Computer Entertainment Europe	23.37	15	1.56
553	Westwood Studios	1.55	1	1.55

Sample Data with New Metrics:

	Name	Year_of_Release	Genre	NA_Sales_Percent	EU_Sales_Percent	JP_Sales_Percent	Market_Saturation
14985	Beyblade Burst	2016	Role-Playing	0.000000	0.000000	100.000000	54
1079	Fire Emblem Fates	2015	Role-Playing	48.214286	13.690476	30.952381	78
3358	Frozen: Olaf's Quest	2013	Platform	45.000000	45.000000	0.000000	36
3862	Frozen: Olaf's Quest	2013	Platform	40.384615	50.000000	0.000000	36
13795	Haikyu!! Cross Team Match!	2016	Adventure	0.000000	0.000000	100.000000	56

Data Manipulation and Metric Calculation: Regional Sales Percentages: Calculates the percentage of sales from North America, Europe, and Japan relative to global sales for each game. Publisher Market Share: Computes each publisher's market share in NA, EU, JP, and globally by summing up their sales in these regions, dividing by the total sales in the dataset, and converting the result to percentage. Market Saturation: Determines the number of games released each year within each genre, providing insight into market competition levels. Multiplatform Release: Counts the number of unique platforms each game is released on, highlighting multiplatform games. Years Since Last Installment: Calculates the number of years since the last installment of each game series, useful for tracking franchise development over time. Publisher Performance Score: Divides the total global sales by the number of games for each publisher, yielding a performance score that reflects sales efficiency. Presentation of Results: Top 10 Publishers by Sales Performance Score: Displays a styled DataFrame showing the top 10 publishers ranked by their sales performance score, using a 'PuBu' color gradient for visual appeal and setting precision to 2 decimal places. Sample Data with New Metrics: Presents a sample of the DataFrame, showcasing the newly calculated metrics with enhanced styling. It includes a table style that changes the background color to yellow when hovering over rows, improving interactivity and user experience.

In [45]:

import seaborn as sns
import matplotlib.pyplot as plt


sales_by_genre = df.groupby('Genre')['Global_Sales'].sum().sort_values(ascending=False)

plt.figure(figsize=(12, 8))

sns.barplot(x=sales_by_genre.values, y=sales_by_genre.index, palette='viridis')
plt.title('Global Sales by Genre', fontsize=20)
plt.xlabel('Global Sales (in millions)', fontsize=14)
plt.ylabel('Genre', fontsize=14)

plt.grid(axis='x', linestyle='--', alpha=0.6)

plt.show()

Grouping and Summation: The script starts by grouping the DataFrame df by the 'Genre' column and summing up the 'Global_Sales' for each genre. The result, sales_by_genre, is a Series where each index is a genre and the corresponding value is the total global sales for that genre. This Series is sorted in descending order of sales.

Figure Setup: plt.figure(figsize=(12, 8)) sets up the figure for plotting, specifying its size as 12 inches by 8 inches.

Bar Plot: sns.barplot() creates a horizontal bar plot. The x-values represent the global sales figures, and the y-values represent the different genres. The palette='viridis' parameter applies the 'viridis' color palette to the bars, providing a visually appealing gradient.

Title and Labels: The plot is titled 'Global Sales by Genre', with the x-axis labeled 'Global Sales (in millions)' and the y-axis labeled 'Genre'. The fontsize parameter in plt.title(), plt.xlabel(), and plt.ylabel() adjusts the font size for better readability.

Grid: plt.grid(axis='x', linestyle='--', alpha=0.6) adds a horizontal grid to the plot with a dashed line style and partial transparency (alpha=0.6). This grid helps in estimating sales values for each genre.

Display Plot: plt.show() displays the plot. In a Jupyter Notebook environment or similar interactive Python environments, this step might be optional as plots can be automatically rendered after each cell execution.

In [46]:

import pandas as pd


print("\n=== Data Cleaning and Feature Engineering ===\n")

df['NA_Sales_Percent'] = (df['NA_Sales'] / df['Global_Sales']) * 100
df['EU_Sales_Percent'] = (df['EU_Sales'] / df['Global_Sales']) * 100
df['JP_Sales_Percent'] = (df['JP_Sales'] / df['Global_Sales']) * 100

total_sales_by_region = df[['NA_Sales', 'EU_Sales', 'JP_Sales', 'Global_Sales']].sum()
publisher_market_share = df.groupby('Publisher')[['NA_Sales', 'EU_Sales', 'JP_Sales', 'Global_Sales']].sum()
publisher_market_share = publisher_market_share.divide(total_sales_by_region) * 100  # Convert to percentage

df['Market_Saturation'] = df.groupby(['Year_of_Release', 'Genre'])['Name'].transform('count')

df.sort_values(by=['Name', 'Platform', 'Year_of_Release'], inplace=True)
df['Multiplatform_Release'] = df.groupby('Name')['Platform'].transform('nunique')
df['Years_Since_Last_Installment'] = df.groupby('Name')['Year_of_Release'].diff().fillna(0).astype(int)

publisher_sales = df.groupby('Publisher')['Global_Sales'].sum()
publisher_game_count = df.groupby('Publisher').size()
publisher_performance_score = publisher_sales / publisher_game_count

publisher_performance = pd.DataFrame({
    'Publisher': publisher_sales.index,
    'Total_Sales': publisher_sales.values,
    'Game_Count': publisher_game_count.values,
    'Sales_Performance_Score': publisher_performance_score.values
})

print("\nTop 10 Publishers by Sales Performance Score:")
display(publisher_performance.sort_values(by='Sales_Performance_Score', ascending=False).head(10).style.background_gradient(cmap='BuPu').set_precision(2))

print("\nSample Data with New Metrics:")
display(df[['Name', 'Year_of_Release', 'Genre', 'NA_Sales_Percent', 'EU_Sales_Percent', 'JP_Sales_Percent', 'Market_Saturation']].head().style.set_properties(**{'background-color': 'lavenderblush', 'color': 'black'}))

=== Data Cleaning and Feature Engineering ===


Top 10 Publishers by Sales Performance Score:

C:\Users\Luke Holmes\AppData\Local\Temp\ipykernel_20752\941504301.py:32: FutureWarning:

this method is deprecated in favour of `Styler.format(precision=..)`

	Publisher	Total_Sales	Game_Count	Sales_Performance_Score
387	Palcom	4.17	1	4.17
426	Red Orb	5.24	2	2.62
361	Nintendo	1786.72	700	2.55
40	Arena Entertainment	4.72	2	2.36
524	UEP Systems	2.25	1	2.25
428	RedOctane	8.68	4	2.17
222	Hello Games	1.70	1	1.70
534	Valve	1.70	1	1.70
459	Sony Computer Entertainment Europe	23.37	15	1.56
553	Westwood Studios	1.55	1	1.55

Sample Data with New Metrics:

	Name	Year_of_Release	Genre	NA_Sales_Percent	EU_Sales_Percent	JP_Sales_Percent	Market_Saturation
14985	Beyblade Burst	2016	Role-Playing	0.000000	0.000000	100.000000	54
1079	Fire Emblem Fates	2015	Role-Playing	48.214286	13.690476	30.952381	78
3358	Frozen: Olaf's Quest	2013	Platform	45.000000	45.000000	0.000000	36
3862	Frozen: Olaf's Quest	2013	Platform	40.384615	50.000000	0.000000	36
13795	Haikyu!! Cross Team Match!	2016	Adventure	0.000000	0.000000	100.000000	56

Regional Sales Percentages: Calculation: For North America (NA), Europe (EU), and Japan (JP), the script calculates the percentage of sales relative to global sales for each game. This metric provides insights into the regional performance of each game. Publisher Market Share: Total Sales by Region: The sum of sales in NA, EU, JP, and globally is computed, providing a baseline for calculating market share. Market Share Calculation: The sales for each publisher in these regions are summed up and then divided by the total sales to calculate the market share in percentage. This helps identify dominant publishers in different markets. Market Saturation Metric: Calculation: The 'Market_Saturation' column is added by counting the number of games released in the same year and genre, providing a measure of how crowded each market segment is. Multiplatform Release and Years Since Last Installment: Sorting: The DataFrame is sorted by 'Name', 'Platform', and 'Year_of_Release' to ensure a logical sequence for installment numbering. Multiplatform Release: The number of unique platforms each game is released on is calculated, indicating the breadth of each game's release. Years Since Last Installment: The time difference in years between consecutive games of the same name is computed, shedding light on sequel or series release patterns. Sales Performance Score per Publisher: Publisher Sales and Game Count: The total global sales and the number of games released by each publisher are calculated. Performance Score: The sales performance score for each publisher is computed by dividing total sales by the number of games published. This score evaluates the average commercial success of a publisher's games. Publisher Performance DataFrame: A new DataFrame is created to display each publisher's total sales, game count, and sales performance score, sorted by the performance score in descending order. Displaying Results: Top 10 Publishers: The top 10 publishers by sales performance score are displayed, highlighting the most efficient publishers in terms of sales per game released. Sample Data with New Metrics: A sample of the DataFrame is displayed, showcasing the newly calculated metrics for the first few games. The styling with a lavenderblush background enhances readability and visual appeal.

In [47]:

import seaborn as sns
import matplotlib.pyplot as plt

sales_by_genre = df.groupby('Genre')['Global_Sales'].sum().sort_values(ascending=False)

plt.figure(figsize=(12, 8))
sns.set_style("whitegrid")

sns.barplot(x=sales_by_genre.values, y=sales_by_genre.index, palette='viridis')  
plt.title('Global Sales by Genre', fontsize=20, fontweight='bold', color='navy')
plt.xlabel('Global Sales (in millions)', fontsize=14, fontweight='bold')
plt.ylabel('Genre', fontsize=14, fontweight='bold')

plt.grid(axis='x', linestyle='--', alpha=0.6)

plt.show()

Data Preparation: The script begins by grouping the DataFrame df by the 'Genre' column and summing up 'Global_Sales' for each genre. The result, sales_by_genre, is sorted in descending order to prioritize genres with higher sales.

Figure Setup: plt.figure(figsize=(12, 8)) establishes the size of the plot, making it wide enough to comfortably accommodate the genre names and sales values.

Seaborn Style: sns.set_style("whitegrid") applies a white grid style to the plot background, offering a clean and uncluttered look that enhances readability.

Bar Plot: sns.barplot() is used to create the horizontal bar plot, with global sales values as the x-axis and genres as the y-axis. The 'viridis' color palette applies a visually appealing gradient to the bars.

Titles and Labels: The plot is titled 'Global Sales by Genre', with bold and navy-colored font for emphasis. The x-axis and y-axis labels are also bolded, clearly indicating what the numbers and categories represent.

Grid Customization: plt.grid(axis='x', linestyle='--', alpha=0.6) adds a dashed horizontal grid to the plot, with partial transparency for subtle guidance on sales values without overwhelming the data presentation.

Displaying the Plot: plt.show() renders the plot. In interactive environments like Jupyter Notebooks, this might be optional, but it's a necessary call in scripts to ensure the plot is displayed.

Top 5 Games by Global Sales

In [48]:

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


top_5_games = df.sort_values(by='Global_Sales', ascending=False).head(5)

print("Top 5 Games by Global Sales:")
print(top_5_games[['Name', 'Global_Sales']])

top_5_games['Year_of_Release'] = top_5_games['Year_of_Release'].fillna(0).astype(int).astype(str)

top_5_games['Name_Year'] = top_5_games['Name'] + " (" + top_5_games['Year_of_Release'] + ")"

plt.figure(figsize=(12, 6))
sns.barplot(x='Global_Sales', y='Name_Year', data=top_5_games, palette='viridis')

plt.title('Top 5 Games by Global Sales', fontsize=18)
plt.xlabel('Global Sales (in millions)', fontsize=14)
plt.ylabel('Game Name (Year of Release)', fontsize=14)
plt.grid(axis='x', linestyle='--', alpha=0.7)

for index, value in enumerate(top_5_games['Global_Sales']):
    plt.text(value, index, f' {value:.2f}M', va='center', fontsize=12, color='black')

plt.show()

Top 5 Games by Global Sales:
                       Name  Global_Sales
0                Wii Sports         82.53
1         Super Mario Bros.         40.24
2            Mario Kart Wii         35.52
3         Wii Sports Resort         32.77
4  Pokemon Red/Pokemon Blue         31.37

Identifying Top 5 Games:

The DataFrame df is sorted by 'Global_Sales' in descending order, and the top 5 entries are selected and stored in top_5_games. You print the 'Name' and 'Global_Sales' columns of these top 5 games to provide an immediate textual overview. Data Preparation:

The 'Year_of_Release' column is first filled with 0 for missing values, then converted to integer, and finally to string type. This ensures that the years can be combined with game names without issues. A new column 'Name_Year' is created by concatenating the 'Name' and 'Year_of_Release' columns, formatted to include the game's release year in parentheses. This provides a clear identifier for each game, especially useful if there are games with the same name released in different years. Plotting:

A bar plot is created using seaborn's barplot function, setting 'Global_Sales' as the x-axis and the newly created 'Name_Year' as the y-axis. The 'viridis' palette is applied for aesthetic coloring. The plot is sized at 12x6 inches, providing ample space for readability. Customization:

Titles and axis labels are added with custom font sizes, enhancing the plot's readability and providing context. The title specifies that the plot shows the top 5 games by global sales, and the axis labels clearly indicate what the numbers and names represent. A grid is added along the x-axis with a dashed line style and slight transparency (alpha=0.7), helping estimate sales figures without cluttering the visual. Annotations:

A loop iterates over the 'Global_Sales' values of the top 5 games, placing a text annotation next to each bar. This text displays the sales figures in millions, formatted to two decimal places, directly on the plot for easy reference. The va='center' argument vertically aligns the text to the center of each bar, ensuring clarity and neatness. Displaying the Plot:

plt.show() renders the plot. This is particularly useful in script environments to ensure the plot is displayed.

In [49]:

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_theme(style="whitegrid")

print("Initial Data Inspection:\n")
print("First 5 rows of the dataset:")
display(df.head())
print("\nLast 5 rows of the dataset:")
display(df.tail())
print(f"\nShape of the dataset: {df.shape[0]} Rows, {df.shape[1]} Columns")
print("\nSummary of the DataFrame:")
df.info()
print("\nData Types of Each Column:")
print(df.dtypes.to_string())
print("\nDescriptive Statistics for Numerical Columns:")
display(df.describe())

Initial Data Inspection:

First 5 rows of the dataset:

	Name	Platform	Year_of_Release	Genre	Publisher	NA_Sales	EU_Sales	JP_Sales	Other_Sales	Global_Sales	...	Developer	Rating	Console_Brand	Series_Installment	NA_Sales_Percent	EU_Sales_Percent	JP_Sales_Percent	Market_Saturation	Multiplatform_Release
14985	Beyblade Burst	3DS	2016	Role-Playing	FuRyu	0.00	0.00	0.03	0.00	0.03	...	NaN	NaN	Nintendo	1	0.000000	0.000000	100.000000	54	1
1079	Fire Emblem Fates	3DS	2015	Role-Playing	Nintendo	0.81	0.23	0.52	0.11	1.68	...	NaN	NaN	Nintendo	1	48.214286	13.690476	30.952381	78	1
3358	Frozen: Olaf's Quest	3DS	2013	Platform	Disney Interactive Studios	0.27	0.27	0.00	0.05	0.60	...	NaN	NaN	Nintendo	1	45.000000	45.000000	0.000000	36	2
3862	Frozen: Olaf's Quest	DS	2013	Platform	Disney Interactive Studios	0.21	0.26	0.00	0.04	0.52	...	NaN	NaN	Nintendo	1	40.384615	50.000000	0.000000	36	2
13795	Haikyu!! Cross Team Match!	3DS	2016	Adventure	Namco Bandai Games	0.00	0.00	0.04	0.00	0.04	...	NaN	NaN	Nintendo	1	0.000000	0.000000	100.000000	56	1

5 rows × 24 columns

Last 5 rows of the dataset:

	Name	Platform	Year_of_Release	Genre	Publisher	NA_Sales	EU_Sales	JP_Sales	Other_Sales	Global_Sales	...	Developer	Rating	Console_Brand	Series_Installment	NA_Sales_Percent	EU_Sales_Percent	JP_Sales_Percent	Market_Saturation	Multiplatform_Release
16498	thinkSMART: Chess for Kids	DS	2011	Misc	Mentor Interactive	0.01	0.00	0.00	0.00	0.01	...	Mentor Interactive	E	Nintendo	1	100.000000	0.000000	0.0	183	1
645	uDraw Studio	Wii	2010	Misc	THQ	1.65	0.57	0.00	0.20	2.42	...	THQ	E	Nintendo	1	68.181818	23.553719	0.0	201	1
8281	uDraw Studio: Instant Artist	Wii	2011	Misc	THQ	0.06	0.09	0.00	0.02	0.17	...	THQ	E	Nintendo	1	35.294118	52.941176	0.0	183	2
15614	uDraw Studio: Instant Artist	X360	2011	Misc	THQ	0.01	0.01	0.00	0.00	0.02	...	THQ	E	Microsoft	1	50.000000	50.000000	0.0	183	2
9135	¡Shin Chan Flipa en colores!	DS	2007	Platform	505 Games	0.00	0.00	0.14	0.00	0.14	...	NaN	NaN	Nintendo	1	0.000000	0.000000	100.0	42	1

5 rows × 24 columns

Shape of the dataset: 16447 Rows, 24 Columns

Summary of the DataFrame:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 16447 entries, 14985 to 9135
Data columns (total 24 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Name                          16447 non-null  object 
 1   Platform                      16447 non-null  object 
 2   Year_of_Release               16447 non-null  int32  
 3   Genre                         16447 non-null  object 
 4   Publisher                     16447 non-null  object 
 5   NA_Sales                      16447 non-null  float64
 6   EU_Sales                      16447 non-null  float64
 7   JP_Sales                      16447 non-null  float64
 8   Other_Sales                   16447 non-null  float64
 9   Global_Sales                  16447 non-null  float64
 10  Critic_Score                  8551 non-null   float64
 11  Critic_Count                  16447 non-null  int32  
 12  User_Score                    8551 non-null   float64
 13  User_Count                    16447 non-null  int32  
 14  Developer                     9906 non-null   object 
 15  Rating                        9768 non-null   object 
 16  Console_Brand                 16048 non-null  object 
 17  Series_Installment            16447 non-null  int64  
 18  NA_Sales_Percent              16447 non-null  float64
 19  EU_Sales_Percent              16447 non-null  float64
 20  JP_Sales_Percent              16447 non-null  float64
 21  Market_Saturation             16447 non-null  int64  
 22  Multiplatform_Release         16447 non-null  int64  
 23  Years_Since_Last_Installment  16447 non-null  int32  
dtypes: float64(10), int32(4), int64(3), object(7)
memory usage: 2.9+ MB

Data Types of Each Column:
Name                             object
Platform                         object
Year_of_Release                   int32
Genre                            object
Publisher                        object
NA_Sales                        float64
EU_Sales                        float64
JP_Sales                        float64
Other_Sales                     float64
Global_Sales                    float64
Critic_Score                    float64
Critic_Count                      int32
User_Score                      float64
User_Count                        int32
Developer                        object
Rating                           object
Console_Brand                    object
Series_Installment                int64
NA_Sales_Percent                float64
EU_Sales_Percent                float64
JP_Sales_Percent                float64
Market_Saturation                 int64
Multiplatform_Release             int64
Years_Since_Last_Installment      int32

Descriptive Statistics for Numerical Columns:

	Year_of_Release	NA_Sales	EU_Sales	JP_Sales	Other_Sales	Global_Sales	Critic_Score	Critic_Count	User_Score	User_Count	Series_Installment	NA_Sales_Percent	EU_Sales_Percent	JP_Sales_Percent	Market_Saturation	Multiplatform_Release	Years_Since_Last_Installment
count	16447.000000	16447.000000	16447.000000	16447.000000	16447.000000	16447.000000	8551.000000	16447.000000	8551.000000	16447.000000	16447.000000	16447.000000	16447.000000	16447.000000	16447.000000	16447.000000	16447.000000
mean	2006.488661	0.263981	0.145903	0.078476	0.047586	0.536202	69.026180	12.832675	5.953759	73.963702	1.000122	45.607227	22.781458	24.479532	106.054296	2.104031	0.003466
std	5.877492	0.818308	0.506674	0.311073	0.187989	1.558502	13.458964	18.708601	5.286127	388.389327	0.011027	34.094901	25.104862	40.257511	72.078069	1.568358	1.236991
min	1980.000000	0.000000	0.000000	0.000000	0.000000	0.010000	13.000000	0.000000	-40.074108	0.000000	1.000000	0.000000	0.000000	0.000000	1.000000	1.000000	-24.000000
25%	2003.000000	0.000000	0.000000	0.000000	0.000000	0.060000	61.000000	0.000000	5.900000	0.000000	1.000000	0.000000	0.000000	0.000000	48.000000	1.000000	0.000000
50%	2007.000000	0.080000	0.020000	0.000000	0.010000	0.170000	70.374108	0.000000	7.300000	0.000000	1.000000	50.000000	19.512195	0.000000	85.000000	1.000000	0.000000
75%	2010.000000	0.240000	0.110000	0.040000	0.030000	0.470000	79.000000	21.000000	8.200000	20.000000	1.000000	75.000000	37.500000	37.053001	166.000000	3.000000	0.000000
max	2020.000000	41.360000	28.960000	10.220000	10.570000	82.530000	98.000000	113.000000	26.925892	10665.000000	2.000000	100.000000	100.000000	100.000000	272.000000	10.000000	34.000000

In [50]:

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_theme(style="whitegrid")

print("Initial Data Inspection:\n")
print("First 5 rows of the dataset:")
display(df.head())
print("\nLast 5 rows of the dataset:")
display(df.tail())
print(f"\nShape of the dataset: {df.shape[0]} Rows, {df.shape[1]} Columns")
print("\nSummary of the DataFrame:")
df.info()
print("\nData Types of Each Column:")
print(df.dtypes.to_string())
print("\nDescriptive Statistics for Numerical Columns:")
display(df.describe())

Initial Data Inspection:

First 5 rows of the dataset:

	Name	Platform	Year_of_Release	Genre	Publisher	NA_Sales	EU_Sales	JP_Sales	Other_Sales	Global_Sales	...	Developer	Rating	Console_Brand	Series_Installment	NA_Sales_Percent	EU_Sales_Percent	JP_Sales_Percent	Market_Saturation	Multiplatform_Release
14985	Beyblade Burst	3DS	2016	Role-Playing	FuRyu	0.00	0.00	0.03	0.00	0.03	...	NaN	NaN	Nintendo	1	0.000000	0.000000	100.000000	54	1
1079	Fire Emblem Fates	3DS	2015	Role-Playing	Nintendo	0.81	0.23	0.52	0.11	1.68	...	NaN	NaN	Nintendo	1	48.214286	13.690476	30.952381	78	1
3358	Frozen: Olaf's Quest	3DS	2013	Platform	Disney Interactive Studios	0.27	0.27	0.00	0.05	0.60	...	NaN	NaN	Nintendo	1	45.000000	45.000000	0.000000	36	2
3862	Frozen: Olaf's Quest	DS	2013	Platform	Disney Interactive Studios	0.21	0.26	0.00	0.04	0.52	...	NaN	NaN	Nintendo	1	40.384615	50.000000	0.000000	36	2
13795	Haikyu!! Cross Team Match!	3DS	2016	Adventure	Namco Bandai Games	0.00	0.00	0.04	0.00	0.04	...	NaN	NaN	Nintendo	1	0.000000	0.000000	100.000000	56	1

5 rows × 24 columns

Last 5 rows of the dataset:

	Name	Platform	Year_of_Release	Genre	Publisher	NA_Sales	EU_Sales	JP_Sales	Other_Sales	Global_Sales	...	Developer	Rating	Console_Brand	Series_Installment	NA_Sales_Percent	EU_Sales_Percent	JP_Sales_Percent	Market_Saturation	Multiplatform_Release
16498	thinkSMART: Chess for Kids	DS	2011	Misc	Mentor Interactive	0.01	0.00	0.00	0.00	0.01	...	Mentor Interactive	E	Nintendo	1	100.000000	0.000000	0.0	183	1
645	uDraw Studio	Wii	2010	Misc	THQ	1.65	0.57	0.00	0.20	2.42	...	THQ	E	Nintendo	1	68.181818	23.553719	0.0	201	1
8281	uDraw Studio: Instant Artist	Wii	2011	Misc	THQ	0.06	0.09	0.00	0.02	0.17	...	THQ	E	Nintendo	1	35.294118	52.941176	0.0	183	2
15614	uDraw Studio: Instant Artist	X360	2011	Misc	THQ	0.01	0.01	0.00	0.00	0.02	...	THQ	E	Microsoft	1	50.000000	50.000000	0.0	183	2
9135	¡Shin Chan Flipa en colores!	DS	2007	Platform	505 Games	0.00	0.00	0.14	0.00	0.14	...	NaN	NaN	Nintendo	1	0.000000	0.000000	100.0	42	1

5 rows × 24 columns

Shape of the dataset: 16447 Rows, 24 Columns

Summary of the DataFrame:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 16447 entries, 14985 to 9135
Data columns (total 24 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Name                          16447 non-null  object 
 1   Platform                      16447 non-null  object 
 2   Year_of_Release               16447 non-null  int32  
 3   Genre                         16447 non-null  object 
 4   Publisher                     16447 non-null  object 
 5   NA_Sales                      16447 non-null  float64
 6   EU_Sales                      16447 non-null  float64
 7   JP_Sales                      16447 non-null  float64
 8   Other_Sales                   16447 non-null  float64
 9   Global_Sales                  16447 non-null  float64
 10  Critic_Score                  8551 non-null   float64
 11  Critic_Count                  16447 non-null  int32  
 12  User_Score                    8551 non-null   float64
 13  User_Count                    16447 non-null  int32  
 14  Developer                     9906 non-null   object 
 15  Rating                        9768 non-null   object 
 16  Console_Brand                 16048 non-null  object 
 17  Series_Installment            16447 non-null  int64  
 18  NA_Sales_Percent              16447 non-null  float64
 19  EU_Sales_Percent              16447 non-null  float64
 20  JP_Sales_Percent              16447 non-null  float64
 21  Market_Saturation             16447 non-null  int64  
 22  Multiplatform_Release         16447 non-null  int64  
 23  Years_Since_Last_Installment  16447 non-null  int32  
dtypes: float64(10), int32(4), int64(3), object(7)
memory usage: 2.9+ MB

Data Types of Each Column:
Name                             object
Platform                         object
Year_of_Release                   int32
Genre                            object
Publisher                        object
NA_Sales                        float64
EU_Sales                        float64
JP_Sales                        float64
Other_Sales                     float64
Global_Sales                    float64
Critic_Score                    float64
Critic_Count                      int32
User_Score                      float64
User_Count                        int32
Developer                        object
Rating                           object
Console_Brand                    object
Series_Installment                int64
NA_Sales_Percent                float64
EU_Sales_Percent                float64
JP_Sales_Percent                float64
Market_Saturation                 int64
Multiplatform_Release             int64
Years_Since_Last_Installment      int32

Descriptive Statistics for Numerical Columns:

	Year_of_Release	NA_Sales	EU_Sales	JP_Sales	Other_Sales	Global_Sales	Critic_Score	Critic_Count	User_Score	User_Count	Series_Installment	NA_Sales_Percent	EU_Sales_Percent	JP_Sales_Percent	Market_Saturation	Multiplatform_Release	Years_Since_Last_Installment
count	16447.000000	16447.000000	16447.000000	16447.000000	16447.000000	16447.000000	8551.000000	16447.000000	8551.000000	16447.000000	16447.000000	16447.000000	16447.000000	16447.000000	16447.000000	16447.000000	16447.000000
mean	2006.488661	0.263981	0.145903	0.078476	0.047586	0.536202	69.026180	12.832675	5.953759	73.963702	1.000122	45.607227	22.781458	24.479532	106.054296	2.104031	0.003466
std	5.877492	0.818308	0.506674	0.311073	0.187989	1.558502	13.458964	18.708601	5.286127	388.389327	0.011027	34.094901	25.104862	40.257511	72.078069	1.568358	1.236991
min	1980.000000	0.000000	0.000000	0.000000	0.000000	0.010000	13.000000	0.000000	-40.074108	0.000000	1.000000	0.000000	0.000000	0.000000	1.000000	1.000000	-24.000000
25%	2003.000000	0.000000	0.000000	0.000000	0.000000	0.060000	61.000000	0.000000	5.900000	0.000000	1.000000	0.000000	0.000000	0.000000	48.000000	1.000000	0.000000
50%	2007.000000	0.080000	0.020000	0.000000	0.010000	0.170000	70.374108	0.000000	7.300000	0.000000	1.000000	50.000000	19.512195	0.000000	85.000000	1.000000	0.000000
75%	2010.000000	0.240000	0.110000	0.040000	0.030000	0.470000	79.000000	21.000000	8.200000	20.000000	1.000000	75.000000	37.500000	37.053001	166.000000	3.000000	0.000000
max	2020.000000	41.360000	28.960000	10.220000	10.570000	82.530000	98.000000	113.000000	26.925892	10665.000000	2.000000	100.000000	100.000000	100.000000	272.000000	10.000000	34.000000

In [51]:

print("\n--- Exploratory Data Analysis (EDA) ---")

platform_counts = df['Platform'].value_counts()
plt.figure(figsize=(12, 8))
sns.barplot(x=platform_counts.values, y=platform_counts.index, palette='viridis')
plt.title('Number of Games per Platform', fontsize=20)
plt.xlabel('Number of Games', fontsize=14)
plt.ylabel('Platform', fontsize=14)
plt.show()

--- Exploratory Data Analysis (EDA) ---

In [52]:

print("\n--- Exploratory Data Analysis (EDA) ---")

platform_counts = df['Platform'].value_counts()
plt.figure(figsize=(12, 8))
sns.barplot(x=platform_counts.values, y=platform_counts.index, palette='viridis')
plt.title('Number of Games per Platform', fontsize=20)
plt.xlabel('Number of Games', fontsize=14)
plt.ylabel('Platform', fontsize=14)
plt.show()

--- Exploratory Data Analysis (EDA) ---

In [53]:

top_5_games = df.sort_values(by='Global_Sales', ascending=False).head(5)
top_5_games['Year_of_Release'] = top_5_games['Year_of_Release'].fillna(0).astype(int).astype(str)
top_5_games['Name_Year'] = top_5_games['Name'] + " (" + top_5_games['Year_of_Release'] + ")"

plt.figure(figsize=(12, 6))
sns.barplot(x='Global_Sales', y='Name_Year', data=top_5_games, palette='viridis')
plt.title('Top 5 Games by Global Sales', fontsize=18)
plt.xlabel('Global Sales (in millions)', fontsize=14)
plt.ylabel('Game Name (Year of Release)', fontsize=14)
for index, value in enumerate(top_5_games['Global_Sales']):
    plt.text(value, index, f' {value:.2f}M', va='center', fontsize=12, color='black')
plt.show()

In [54]:

# Print the DataFrame containing the top 5 games by global sales
print("Top 5 Games by Global Sales:")
print(top_5_games[['Name_Year', 'Global_Sales']])

Top 5 Games by Global Sales:
                         Name_Year  Global_Sales
0                Wii Sports (2006)         82.53
1         Super Mario Bros. (1985)         40.24
2            Mario Kart Wii (2008)         35.52
3         Wii Sports Resort (2009)         32.77
4  Pokemon Red/Pokemon Blue (1996)         31.37

In [55]:

!pip install plotly

Requirement already satisfied: plotly in c:\users\luke holmes\anaconda3\lib\site-packages (5.20.0)
Requirement already satisfied: tenacity>=6.2.0 in c:\users\luke holmes\anaconda3\lib\site-packages (from plotly) (8.2.2)
Requirement already satisfied: packaging in c:\users\luke holmes\anaconda3\lib\site-packages (from plotly) (23.0)

In [56]:

df_filtered = df[(df['Year_of_Release'] >= 2010) & (df['Year_of_Release'] <= 2014) & df['Console_Brand'].notnull()]

correlation = df_filtered['NA_Sales'].corr(df_filtered['JP_Sales'])
print(f"The correlation between NA sales and JP sales for the years 2010-2014 is: {correlation:.2f}")

import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(10, 6))
sns.regplot(x='NA_Sales', y='JP_Sales', data=df_filtered,
            scatter_kws={'alpha':0.6}, line_kws={'color':'#2ca02c'})
plt.title('Relationship Between NA Sales and JP Sales (2010-2014)')
plt.xlabel('NA Sales (in millions)')
plt.ylabel('JP Sales (in millions)')
plt.show()

plt.figure(figsize=(10, 6))
sns.scatterplot(x='NA_Sales', y='JP_Sales', data=df_filtered, hue='Genre', alpha=0.5, palette='Set1')
plt.title('Colored Relationship Between NA Sales and JP Sales by Genre (2010-2014)')
plt.xlabel('NA Sales (in millions)')
plt.ylabel('JP Sales (in millions)')
plt.legend(title='Genre', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

The correlation between NA sales and JP sales for the years 2010-2014 is: 0.26

Filtering the Data: df_filtered is created by filtering df for games released from 2010 to 2014 and with non-null 'Console_Brand' values. This subset ensures the analysis focuses on a specific timeframe and games associated with known console brands. Correlation Calculation: The Pearson correlation coefficient between 'NA_Sales' and 'JP_Sales' is calculated using the .corr() method. This coefficient quantifies the strength and direction of the linear relationship between NA and JP sales. The result is printed, providing a quick statistical insight into how sales in these regions might be related for the specified period. First Visualization - Regression Plot: A regression plot is created using sns.regplot, plotting 'NA_Sales' against 'JP_Sales' with a regression line to indicate the trend. scatter_kws={'alpha':0.6} makes the scatter points semi-transparent, helping visualize the density of points when they overlap. line_kws={'color':'#2ca02c'} sets the color of the regression line to a green shade, making it distinct from the scatter points. The plot is titled and labeled to clearly convey the relationship it's illustrating, specifically highlighting the years analyzed. Second Visualization - Scatter Plot with Hue: A scatter plot is created using sns.scatterplot, again plotting 'NA_Sales' against 'JP_Sales', but this time coloring points by 'Genre' to explore how different genres might influence sales relationships. alpha=0.5 makes the scatter points semi-transparent for better overlap visualization, and palette='Set1' applies a distinct color set for different genres. The legend is positioned outside the plot area (bbox_to_anchor=(1.05, 1), loc='upper left'), ensuring it doesn't obscure any data points. This colored scatter plot provides a more nuanced view of the relationship by showing how genre might play a role in sales performance across these regions.

In [57]:

import plotly.express as px

df_filtered_2010_2014 = df_filtered[(df_filtered['Year_of_Release'] >= 2010) & (df_filtered['Year_of_Release'] <= 2014)]

fig = px.scatter(df_filtered_2010_2014,
                 x='NA_Sales',
                 y='JP_Sales',
                 title='Interactive Relationship Between NA Sales and JP Sales (2010-2014)',
                 labels={'NA_Sales': 'Log of NA Sales (in millions)', 'JP_Sales': 'Log of JP Sales (in millions)'},
                 hover_data=['Name'],
                 color='Console_Brand',
                 color_continuous_scale=px.colors.sequential.Viridis,
                 log_x=True,  
                 log_y=True,  
                 opacity=0.7)  

fig.update_layout(width=1000, height=800)
fig.update_traces(marker=dict(size=5))  

fig.show()

In [58]:

!pip install plotly seaborn matplotlib

Requirement already satisfied: plotly in c:\users\luke holmes\anaconda3\lib\site-packages (5.20.0)
Requirement already satisfied: seaborn in c:\users\luke holmes\anaconda3\lib\site-packages (0.12.2)
Requirement already satisfied: matplotlib in c:\users\luke holmes\anaconda3\lib\site-packages (3.7.1)
Requirement already satisfied: tenacity>=6.2.0 in c:\users\luke holmes\anaconda3\lib\site-packages (from plotly) (8.2.2)
Requirement already satisfied: packaging in c:\users\luke holmes\anaconda3\lib\site-packages (from plotly) (23.0)
Requirement already satisfied: numpy!=1.24.0,>=1.17 in c:\users\luke holmes\anaconda3\lib\site-packages (from seaborn) (1.24.3)
Requirement already satisfied: pandas>=0.25 in c:\users\luke holmes\anaconda3\lib\site-packages (from seaborn) (1.5.3)
Requirement already satisfied: contourpy>=1.0.1 in c:\users\luke holmes\anaconda3\lib\site-packages (from matplotlib) (1.0.5)
Requirement already satisfied: cycler>=0.10 in c:\users\luke holmes\anaconda3\lib\site-packages (from matplotlib) (0.11.0)
Requirement already satisfied: fonttools>=4.22.0 in c:\users\luke holmes\anaconda3\lib\site-packages (from matplotlib) (4.25.0)
Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\luke holmes\anaconda3\lib\site-packages (from matplotlib) (1.4.4)
Requirement already satisfied: pillow>=6.2.0 in c:\users\luke holmes\anaconda3\lib\site-packages (from matplotlib) (9.4.0)
Requirement already satisfied: pyparsing>=2.3.1 in c:\users\luke holmes\anaconda3\lib\site-packages (from matplotlib) (3.0.9)
Requirement already satisfied: python-dateutil>=2.7 in c:\users\luke holmes\anaconda3\lib\site-packages (from matplotlib) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in c:\users\luke holmes\anaconda3\lib\site-packages (from pandas>=0.25->seaborn) (2022.7)
Requirement already satisfied: six>=1.5 in c:\users\luke holmes\anaconda3\lib\site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)

In [ ]:

In [59]:

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px


sns.set_theme(style="whitegrid")

df_filtered = df[(df['Year_of_Release'] >= 2010) & (df['Year_of_Release'] <= 2014)]

correlation = df_filtered['NA_Sales'].corr(df_filtered['JP_Sales'])
print(f"The correlation between NA sales and JP sales for the years 2010-2014 is: {correlation:.2f}")

The correlation between NA sales and JP sales for the years 2010-2014 is: 0.26

In [60]:

import numpy as np
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
bubble_size = df_filtered['Global_Sales'] * 10

colors = np.random.rand(len(df_filtered))

plt.scatter('NA_Sales', 'JP_Sales', s=bubble_size, alpha=0.6, data=df_filtered, c=colors, cmap='viridis', edgecolors='w', linewidth=0.5)

plt.title('Relationship Between NA Sales and JP Sales (Bubble Size: Global Sales)')
plt.xlabel('NA Sales (in millions)')
plt.ylabel('JP Sales (in millions)')
plt.show()

In [ ]:

In [61]:

plt.figure(figsize=(10, 6))
sns.scatterplot(x='NA_Sales', y='JP_Sales', data=df_filtered, hue='Genre', alpha=0.5, palette='Set1')
plt.title('Colored Relationship Between NA Sales and JP Sales by Genre (2010-2014)')
plt.xlabel('NA Sales (in millions)')
plt.ylabel('JP Sales (in millions)')
plt.legend(title='Genre', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

In [62]:

import plotly.express as px

fig = px.scatter(df_filtered, x='NA_Sales', y='JP_Sales',
                 title='Interactive Relationship Between NA Sales and JP Sales (2010-2014)',
                 labels={'NA_Sales': 'NA Sales (in millions)', 'JP_Sales': 'JP Sales (in millions)'},
                 hover_data=['Name'],
                 color='Global_Sales',  
                 color_continuous_scale=px.colors.sequential.Viridis, 
                 log_x=True, log_y=True, opacity=0.7)

fig.update_layout(width=1000, height=800, 
                  coloraxis_colorbar=dict(title='Global Sales<br>(in millions)'))

fig.update_traces(marker=dict(size=5, line=dict(width=1, color='DarkSlateGrey')))

fig.show()

In [63]:

import plotly.express as px
import pandas as pd

df_filtered['Sales_Category'] = pd.cut(df_filtered['Global_Sales'], bins=[0, 1, 5, 10, np.inf], labels=['Low', 'Medium', 'High', 'Very High'])

fig = px.scatter(df_filtered, x='NA_Sales', y='JP_Sales',
                 title='Interactive Relationship Between NA Sales and JP Sales (2010-2014)',
                 labels={'NA_Sales': 'NA Sales (in millions)', 'JP_Sales': 'JP Sales (in millions)'},
                 hover_data=['Name'],
                 color='Sales_Category',  
                 color_discrete_sequence=px.colors.qualitative.Set1,  
                 log_x=True, log_y=True, opacity=0.7)

fig.update_layout(width=1000, height=800,
                  coloraxis_colorbar=dict(title='Sales Category'))

fig.update_traces(marker=dict(size=5, line=dict(width=1, color='DarkSlateGrey')))

fig.show()

C:\Users\Luke Holmes\AppData\Local\Temp\ipykernel_20752\1936538915.py:4: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [64]:

import plotly.express as px
import pandas as pd
import numpy as np


colors = np.random.rand(len(df_filtered))

fig = px.scatter(df_filtered, x='NA_Sales', y='JP_Sales',
                 size='Global_Sales', size_max=60,
                 color=colors, color_continuous_scale=px.colors.sequential.Viridis,
                 labels={'NA_Sales': 'NA Sales (in millions)', 'JP_Sales': 'JP Sales (in millions)'},
                 title='Relationship Between NA Sales and JP Sales (Bubble Size: Global Sales)')

fig.update_layout(legend_title_text='Global Sales',
                  title_font_size=20, font_size=12,
                  title_x=0.5, 
                  margin=dict(t=60, l=40, r=40, b=40), 
                  hovermode='closest') 

fig.update_xaxes(title_font=dict(size=18))
fig.update_yaxes(title_font=dict(size=18))

fig.show()

In [ ]:

Data Preparation: Filtering: You filter df to include only games released from 2010 to 2014, creating df_filtered. This subset focuses the analysis on a specific, recent timeframe. Correlation Analysis: Calculation: The Pearson correlation coefficient between 'NA_Sales' and 'JP_Sales' within df_filtered is calculated to quantify the linear relationship between these sales figures. Output: The calculated correlation coefficient is printed, providing a statistical insight into the relationship between NA and JP sales during the specified years. Visualizations with Seaborn and Matplotlib: Regression Plot:

A regression plot illustrates the relationship between 'NA_Sales' and 'JP_Sales' with a regression line indicating the trend. Scatter points are semi-transparent (alpha=0.6), and the regression line is green (color='#2ca02c'), enhancing the plot's clarity and visual appeal. The plot includes a title and axis labels to clearly convey the analysis context and data dimensions. Scatter Plot with Hue:

A scatter plot colors points by 'Genre' to explore how different genres might influence the sales relationship. The alpha=0.5 setting improves point overlap visibility, and the Set1 palette ensures genre distinctions are clear. A legend is placed outside the plot, avoiding data obscuration. Interactive Visualization with Plotly: Interactive Scatter Plot: An interactive scatter plot using Plotly offers dynamic exploration of the NA vs. JP sales relationship. Points are log-transformed on both axes (log_x=True, log_y=True) to better handle wide data ranges and outliers, making the plot more informative, especially for lower sales figures. The color_continuous_scale set to Viridis and point opacity to 0.7 enhance visual differentiation. Hovering over points reveals the game 'Name', adding a layer of detail that invites deeper investigation into specific data points. The plot size is customized for an expansive view, and marker size is set for easy visibility.

What is the distribution of the most popular 4 game genres?

In [65]:

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

genre_sales = df.groupby('Genre')['Global_Sales'].sum().sort_values(ascending=False)
top_4_genres = genre_sales.head(4).index

top_genres_df = df[df['Genre'].isin(top_4_genres)]

top_genres_df['Log_Global_Sales'] = np.log1p(top_genres_df['Global_Sales'])

C:\Users\Luke Holmes\AppData\Local\Temp\ipykernel_20752\1955933327.py:11: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [66]:

plt.figure(figsize=(10, 6))
sns.violinplot(x='Genre', y='Log_Global_Sales', data=top_genres_df, palette='viridis', inner='quartile', linewidth=1.5)

plt.title('Distribution of Global Sales by Genre (Log Scale)\nTop 4 Genres from 2010-2014', fontsize=16)
plt.xlabel('Genre', fontsize=14)
plt.ylabel('Log of Global Sales (in millions)', fontsize=14)
plt.xticks(rotation=45)  # Rotate x labels for better legibility
plt.tight_layout()
plt.show()

In [67]:

pip install altair vega_datasets

Requirement already satisfied: altair in c:\users\luke holmes\anaconda3\lib\site-packages (5.2.0)
Requirement already satisfied: vega_datasets in c:\users\luke holmes\anaconda3\lib\site-packages (0.9.0)
Requirement already satisfied: jinja2 in c:\users\luke holmes\anaconda3\lib\site-packages (from altair) (3.1.2)
Requirement already satisfied: jsonschema>=3.0 in c:\users\luke holmes\anaconda3\lib\site-packages (from altair) (4.17.3)
Requirement already satisfied: numpy in c:\users\luke holmes\anaconda3\lib\site-packages (from altair) (1.24.3)
Requirement already satisfied: packaging in c:\users\luke holmes\anaconda3\lib\site-packages (from altair) (23.0)
Requirement already satisfied: pandas>=0.25 in c:\users\luke holmes\anaconda3\lib\site-packages (from altair) (1.5.3)
Requirement already satisfied: toolz in c:\users\luke holmes\anaconda3\lib\site-packages (from altair) (0.12.0)
Requirement already satisfied: attrs>=17.4.0 in c:\users\luke holmes\anaconda3\lib\site-packages (from jsonschema>=3.0->altair) (22.1.0)
Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in c:\users\luke holmes\anaconda3\lib\site-packages (from jsonschema>=3.0->altair) (0.18.0)
Requirement already satisfied: python-dateutil>=2.8.1 in c:\users\luke holmes\anaconda3\lib\site-packages (from pandas>=0.25->altair) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in c:\users\luke holmes\anaconda3\lib\site-packages (from pandas>=0.25->altair) (2022.7)
Requirement already satisfied: MarkupSafe>=2.0 in c:\users\luke holmes\anaconda3\lib\site-packages (from jinja2->altair) (2.1.1)
Requirement already satisfied: six>=1.5 in c:\users\luke holmes\anaconda3\lib\site-packages (from python-dateutil>=2.8.1->pandas>=0.25->altair) (1.16.0)
Note: you may need to restart the kernel to use updated packages.

In [68]:

plt.figure(figsize=(12, 6))

sns.stripplot(x='Genre', y='Log_Global_Sales', data=top_genres_df, palette='muted', jitter=0.25, alpha=0.8, size=4)

plt.title('Individual Global Sales Data Points by Genre (Log Scale)\nTop 4 Genres from 2010-2014', fontsize=16)
plt.xlabel('Genre', fontsize=14)
plt.ylabel('Log of Global Sales (in millions)', fontsize=14)

plt.xticks(rotation=45)

sns.set_style("whitegrid")

plt.tight_layout()
plt.show()

C:\Users\Luke Holmes\AppData\Local\Temp\ipykernel_20752\3363370563.py:3: FutureWarning:

Passing `palette` without assigning `hue` is deprecated.

In [ ]:

In [69]:

g = sns.FacetGrid(top_genres_df, col='Genre', col_wrap=2, height=4, aspect=1.5, sharex=False, sharey=False)
g.map(sns.histplot, 'Log_Global_Sales', bins=15, color='teal', kde=True, edgecolor='w')
g.add_legend()

g.set_titles('{col_name} Genre Sales Distribution', fontsize=12)
g.set_axis_labels('Log of Global Sales (in millions)', 'Frequency')
for ax in g.axes.flat:
    ax.grid(True, which='both', linestyle='--', linewidth=0.5, color='grey')  # Add grid lines for readability

plt.show()

Data Preparation: Genre Sales Calculation: You start by grouping df by 'Genre' and summing up 'Global_Sales', then sort the sums in descending order. This gives you the total global sales for each genre. Top 4 Genres: You identify the top 4 genres based on these sums and create a new DataFrame top_genres_df that only includes games from these genres. Log Transformation: You add a 'Log_Global_Sales' column to top_genres_df, applying a logarithmic transformation to 'Global_Sales' using np.log1p. This transformation helps in dealing with skewed data and makes the visualization of wide-ranging data more manageable. Visualizations: Violin Plot:

A violin plot is created using sns.violinplot, with genres on the x-axis and the log-transformed sales on the y-axis. The 'viridis' palette is used, and the inner='quartile' setting displays the quartiles inside the violins. This plot provides a comprehensive view of the sales distribution for each genre, showing both the density estimation and the quartiles, which offers insights into the distribution's shape, central tendency, and variability. Strip Plot:

A strip plot (sns.stripplot) visualizes individual data points for log-transformed global sales by genre, using jitter to avoid overlapping points and 'muted' palette for coloring. This plot complements the violin plot by showing individual sales data, providing a sense of data density and distribution outliers. Faceted Histograms with KDE:

Faceted grids (sns.FacetGrid) are used to create separate histograms for each genre, with Kernel Density Estimation (KDE) overlaid. Each histogram is set to have 15 bins, and the 'teal' color is used for consistency and contrast against the white background. These facet grids allow for a detailed comparison of sales distributions across genres, with separate x and y axes scales enhancing readability. KDE curves help in understanding the distribution's shape beyond the discrete histogram bins. Customization and Styling: Titles, axis labels, and x-tick rotations are customized for clarity and aesthetics. The use of plt.tight_layout() ensures that the plots are neatly arranged without overlapping elements. The seaborn style is set to "whitegrid" before the strip plot, which provides a clean background that enhances data visibility. The facet grid histograms include custom titles and axis labels for each subplot and add grid lines for better readability of the sales values.

Do older games (2005 and earlier) have a higher MEAN “eu_sales” than newer games (after 2005)?

In [70]:

older_games = df[df['Year_of_Release'] <= 2005]
newer_games = df[df['Year_of_Release'] > 2005]

mean_eu_sales_older = older_games['EU_Sales'].mean()
mean_eu_sales_newer = newer_games['EU_Sales'].mean()

print(f"Mean EU Sales for Older Games (2005 and earlier): {mean_eu_sales_older:.2f}")
print(f"Mean EU Sales for Newer Games (after 2005): {mean_eu_sales_newer:.2f}")

if mean_eu_sales_older > mean_eu_sales_newer:
    print("Older games (2005 and earlier) have higher mean EU sales than newer games.")
elif mean_eu_sales_older < mean_eu_sales_newer:
    print("Newer games (after 2005) have higher mean EU sales than older games.")
else:
    print("Mean EU sales are the same for older and newer games.")

Mean EU Sales for Older Games (2005 and earlier): 0.15
Mean EU Sales for Newer Games (after 2005): 0.14
Older games (2005 and earlier) have higher mean EU sales than newer games.

In [71]:

import matplotlib.pyplot as plt
import seaborn as sns

# Define categories and mean sales for older and newer games
categories = ['Older Games (≤ 2005)', 'Newer Games (> 2005)']
mean_sales = [mean_eu_sales_older, mean_eu_sales_newer]

plt.figure(figsize=(8, 6))
sns.barplot(x=categories, y=mean_sales, palette='coolwarm', edgecolor='black')

plt.title('Mean EU Sales: Older Games vs. Newer Games', fontsize=16)
plt.ylabel('Mean EU Sales (in millions)', fontsize=14)

# Add data points on bars
for i, value in enumerate(mean_sales):
    plt.text(i, value + 0.001, f'{value:.2f}', ha='center', va='bottom', fontsize=12)  # Adjust for visibility

plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

In [72]:

import plotly.express as px
import pandas as pd

categories = ['Older Games (≤ 2005)', 'Newer Games (> 2005)']
mean_sales = [0.15, 0.14]  # Example values based on your results

plotly_data = pd.DataFrame({
    'Period': categories,
    'Mean EU Sales': mean_sales
})

fig = px.bar(plotly_data, x='Period', y='Mean EU Sales',
             title='Mean EU Sales: Older Games vs. Newer Games',
             color='Period',
             color_discrete_map={'Older Games (≤ 2005)': '#EF553B', 'Newer Games (> 2005)': '#00CC96'},
             text='Mean EU Sales')

fig.update_layout(xaxis_title='Period',
                  yaxis_title='Mean EU Sales (in millions)',
                  legend_title='Release Period',
                  plot_bgcolor='rgba(245, 245, 245, 1)',
                  margin=dict(l=20, r=20, t=50, b=20),
                  yaxis=dict(range=[0.135, 0.155]))  

fig.update_traces(texttemplate='%{text:.2f}', textposition='outside')

fig.show()

Visualization with Seaborn/Matplotlib: Bar Plot: A seaborn bar plot visualizes the mean EU sales for older and newer games. The 'coolwarm' palette distinguishes the two categories, and the edge color enhances bar definition. Annotations: Text annotations add exact mean sales values to each bar, improving the plot's informativeness. Styling: The plot title, axis label, grid lines, and layout are well-defined for readability and aesthetic appeal. Visualization with Plotly: Data Preparation: You re-define categories and mean_sales (seemingly for illustration since the values are hardcoded in this section) and create a DataFrame plotly_data for Plotly visualization. This step would be more dynamically linked to the previous analysis in a typical workflow. Interactive Bar Plot: The Plotly bar plot provides an interactive visualization, with distinct colors for older and newer games and textual annotations for mean EU sales. Layout and Style Customization: The plot layout, including background color, margin, axis titles, and y-axis range, is customized for clarity and visual appeal. The color scheme is well-chosen for differentiation, and the text formatting ensures clarity.

What are the 3 most common “developer” in the dataset?

In [73]:

top_3_developers = df['Developer'].value_counts().head(3)
print(top_3_developers)

Ubisoft      201
EA Sports    169
EA Canada    164
Name: Developer, dtype: int64

In [74]:

import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(10, 6))
barplot = sns.barplot(x=top_3_developers.values, y=top_3_developers.index, palette='viridis')

plt.title('Top 3 Game Developers by Number of Games Developed', fontsize=16)
plt.xlabel('Number of Games Developed', fontsize=14)
plt.ylabel('Developer', fontsize=14)

for i, value in enumerate(top_3_developers.values):
    plt.text(value + 0.5, i, f'{value}', va='center', fontsize=12)

plt.xlim(0, max(top_3_developers.values) + 10)  # Adjust x-axis limits for better visibility
plt.tight_layout()
plt.show()

In [75]:

top_3_developers_df = top_3_developers.reset_index()
top_3_developers_df.columns = ['Developer', 'Games_Developed']

In [76]:

import altair as alt

chart = alt.Chart(top_3_developers_df).mark_bar().encode(
    x=alt.X('Games_Developed:Q', title='Number of Games Developed'),
    y=alt.Y('Developer:N', title='Developer', sort='-x'),  
    color=alt.Color('Developer:N', legend=None)  
).properties(
    width=600,  
    height=200,  
    title='Top 3 Game Developers by Number of Games Developed'
)

text = chart.mark_text(
    align='left',
    baseline='middle',
    dx=3  
).encode(
    text='Games_Developed:Q'
)

# Display the chart with labels
(chart + text).display()

Data Extraction: Top 3 Developers: You identify the top 3 developers using value_counts() on the 'Developer' column of df and take the first 3 entries with .head(3). This provides a quick overview of the most prolific developers in your dataset. Visualization with Matplotlib/Seaborn: Bar Plot: You create a horizontal bar plot displaying the top 3 developers and the number of games they've developed. The 'viridis' palette ensures an appealing color scheme. Annotations: Each bar is annotated with the exact number of games developed, enhancing the plot's informativeness. Customization: The plot's title, axis labels, x-axis limits, and layout are carefully chosen for clarity and visual appeal. Visualization with Plotly: Data Preparation: There seems to be a minor syntax issue with the data dictionary definition (a closing square bracket is missing at the end of 'Total Sales': [200, 150, 180]). Once fixed, this section creates a DataFrame plotly_df for Plotly visualization, incorporating both the number of games developed and total sales (hardcoded values for demonstration). Interactive Bar Plot: An interactive bar plot is created with hover data for total sales, providing an additional layer of information. The use of color_discrete_sequence ensures distinct coloring for each developer. Customization: The plot's layout, background color, and text annotations are customized for readability and aesthetics. The inclusion of total sales as hover data adds depth to the analysis, even though the sales figures are illustrative. Visualization with Altair: Data Preparation: You convert the Series top_3_developers into a DataFrame and rename the columns to prepare for Altair visualization. Bar Chart with Text: An Altair bar chart is created to display the number of games developed by each top developer, with text annotations indicating the exact counts. The chart's width, height, and title are customized for presentation purposes. Customization: The chart uses direct encoding for the x and y axes, color, and text, ensuring clarity and visual appeal. The sort='-x' option in the y-axis encoding ensures that developers are sorted by the number of games developed, enhancing interpretability.

Part 3: How do the preferences for game genres and console developers differ across various regions?

In [77]:

genre_region_sales = df.groupby('Genre')[['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']].sum()

In [78]:

console_brand_region_sales_filtered = df_filtered.groupby('Console_Brand').agg({
    'JP_Sales': 'sum',
    'NA_Sales': 'sum',
    'EU_Sales': 'sum',
    'Other_Sales': 'sum'
}).reset_index()

In [79]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import numpy as np


console_brand_region_sales_filtered = df_filtered.groupby('Console_Brand').agg({
    'JP_Sales': 'sum',
    'NA_Sales': 'sum',
    'EU_Sales': 'sum',
    'Other_Sales': 'sum'
}).reset_index()

# Plotly Interactive Stacked Bar Chart for Console Brand Sales
fig = go.Figure()
regions = ['JP_Sales', 'NA_Sales', 'EU_Sales', 'Other_Sales']
colors = px.colors.qualitative.Plotly  # Choose a color scheme

for i, region in enumerate(regions):
    fig.add_trace(go.Bar(
        x=console_brand_region_sales_filtered['Console_Brand'],
        y=console_brand_region_sales_filtered[region],
        name=region.replace('_', ' '),
        marker_color=colors[i]
    ))

fig.update_layout(
    barmode='stack',
    title_text='Game Sales by Region for Each Console Brand',
    xaxis_title='Console Brand',
    yaxis_title='Total Sales (in millions)',
    legend_title='Region',
    plot_bgcolor='white'
)
fig.update_xaxes(tickangle=45)
fig.show()

In [80]:

genre_region_sales = df_filtered.groupby('Genre')[['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']].sum()

plt.figure(figsize=(14, 10))
barWidth = 0.25
r1 = np.arange(len(genre_region_sales))
r2 = [x + barWidth for x in r1]
r3 = [x + barWidth for x in r2]

plt.bar(r1, genre_region_sales['JP_Sales'], color='skyblue', width=barWidth, edgecolor='gray', label='Japan')
plt.bar(r2, genre_region_sales['NA_Sales'], color='orange', width=barWidth, edgecolor='gray', label='North America')
plt.bar(r3, genre_region_sales['EU_Sales'], color='green', width=barWidth, edgecolor='gray', label='Europe')

plt.xlabel('Game Genre', fontweight='bold')
plt.xticks([r + barWidth for r in range(len(genre_region_sales))], genre_region_sales.index, rotation=45)
plt.ylabel('Total Sales (in millions)', fontweight='bold')
plt.title('Game Sales by Genre Across Different Regions')
plt.legend()
plt.show()

plt.figure(figsize=(14, 10))
genre_region_sales.plot(kind='bar', stacked=True, color=['skyblue', 'orange', 'green'])
plt.title('Stacked Game Sales by Genre Across Different Regions')
plt.xlabel('Genre')
plt.ylabel('Total Sales (in millions)')
plt.xticks(rotation=45)
plt.legend(['Japan', 'North America', 'Europe'])
plt.show()

<Figure size 1400x1000 with 0 Axes>

In [81]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import numpy as np

# Set a global style for matplotlib
plt.style.use('seaborn-darkgrid')

C:\Users\Luke Holmes\AppData\Local\Temp\ipykernel_20752\3631954802.py:9: MatplotlibDeprecationWarning:

The seaborn styles shipped by Matplotlib are deprecated since 3.6, as they no longer correspond to the styles shipped by seaborn. However, they will remain available as 'seaborn-v0_8-<style>'. Alternatively, directly use the seaborn API instead.

In [82]:

colors = px.colors.qualitative.Plotly
fig = go.Figure()
regions = ['JP_Sales', 'NA_Sales', 'EU_Sales', 'Other_Sales']

for i, region in enumerate(regions):
    fig.add_trace(go.Bar(
        x=console_brand_region_sales_filtered['Console_Brand'],
        y=console_brand_region_sales_filtered[region],
        name=region,
        marker_color=colors[i]
    ))

fig.update_layout(
    barmode='stack',
    title='Interactive Game Sales by Region for Each Console Brand',
    xaxis_title='Console Brand',
    yaxis_title='Total Sales (in millions)',
    legend_title='Region',
    plot_bgcolor="white"
)
fig.update_xaxes(tickangle=45)
fig.show()

In [83]:

print(df.columns)

Index(['Name', 'Platform', 'Year_of_Release', 'Genre', 'Publisher', 'NA_Sales',
       'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales', 'Critic_Score',
       'Critic_Count', 'User_Score', 'User_Count', 'Developer', 'Rating',
       'Console_Brand', 'Series_Installment', 'NA_Sales_Percent',
       'EU_Sales_Percent', 'JP_Sales_Percent', 'Market_Saturation',
       'Multiplatform_Release', 'Years_Since_Last_Installment'],
      dtype='object')

In [84]:

# Ensure 'console_brand_region_sales_filtered' is prepared with the required data
colors = px.colors.qualitative.Plotly
fig = go.Figure()
regions = ['JP_Sales', 'NA_Sales', 'EU_Sales', 'Other_Sales']

for i, region in enumerate(regions):
    fig.add_trace(go.Bar(
        x=console_brand_region_sales_filtered['Console_Brand'],
        y=console_brand_region_sales_filtered[region],
        name=region,
        marker_color=colors[i]
    ))

fig.update_layout(
    barmode='stack',
    title='Interactive Game Sales by Region for Each Console Brand',
    xaxis_title='Console Brand',
    yaxis_title='Total Sales (in millions)',
    legend_title='Region',
    plot_bgcolor="white"
)
fig.update_xaxes(tickangle=45)
fig.show()

In [85]:

palette = ['#3498db', '#e74c3c', '#2ecc71']  # Enhanced color palette
genre_region_sales.plot(kind='area', stacked=True, color=palette, figsize=(14, 10), alpha=0.65)
plt.title('Game Sales by Genre across Different Regions', fontsize=18, fontweight='bold')
plt.xlabel('Genre', fontsize=14, fontweight='bold')
plt.ylabel('Total Sales (in millions)', fontsize=14, fontweight='bold')
plt.xticks(rotation=45, fontsize=12)
plt.legend(['Japan', 'North America', 'Europe'], fontsize=12, title='Region', loc='upper left')
plt.grid(color='grey', linestyle='-', linewidth=0.25, alpha=0.5)
plt.tight_layout()
plt.show()

In [86]:

top_genres = df.groupby('Genre')['Global_Sales'].sum().sort_values(ascending=False).head(5).index

df_top_genres = df[df['Genre'].isin(top_genres)]

def top_games_by_region(region_sales_column):
    top_games = df_top_genres.sort_values(by=region_sales_column, ascending=False).groupby('Genre').head(5)
    
    top_games = top_games[['Name', 'Genre', region_sales_column, 'Console_Brand']]
    
    return top_games

top_games_eu = top_games_by_region('EU_Sales')
top_games_na = top_games_by_region('NA_Sales')
top_games_jp = top_games_by_region('JP_Sales')

print("Top 5 Selling Games in Top 5 Genres in EU:")
print(top_games_eu)
print("\nTop 5 Selling Games in Top 5 Genres in NA:")
print(top_games_na)
print("\nTop 5 Selling Games in Top 5 Genres in Japan:")
print(top_games_jp)

Top 5 Selling Games in Top 5 Genres in EU:
                                        Name         Genre  EU_Sales  \
0                                 Wii Sports        Sports     28.96   
3                          Wii Sports Resort        Sports     10.93   
6                      New Super Mario Bros.      Platform      9.14   
16                        Grand Theft Auto V        Action      9.09   
4                   Pokemon Red/Pokemon Blue  Role-Playing      8.89   
15                              Wii Fit Plus        Sports      8.49   
13                                   Wii Fit        Sports      8.03   
8                  New Super Mario Bros. Wii      Platform      6.94   
42                        Grand Theft Auto V        Action      6.31   
138                        World of Warcraft  Role-Playing      6.21   
12               Pokemon Gold/Pokemon Silver  Role-Playing      6.18   
77                                   FIFA 16        Sports      6.12   
31                 Call of Duty: Black Ops 3       Shooter      5.86   
37            Call of Duty: Modern Warfare 3       Shooter      5.73   
34                Call of Duty: Black Ops II       Shooter      5.73   
24               Grand Theft Auto: Vice City        Action      5.49   
23                        Grand Theft Auto V        Action      5.14   
30   Pokémon Yellow: Special Pikachu Edition  Role-Playing      5.04   
81                            FIFA Soccer 13        Action      5.01   
20             Pokemon Diamond/Pokemon Pearl  Role-Playing      4.46   
41                   Call of Duty: Black Ops       Shooter      4.37   
35                Call of Duty: Black Ops II       Shooter      4.24   
18                         Super Mario World      Platform      3.75   
1                          Super Mario Bros.      Platform      3.58   
22                       Super Mario Bros. 3      Platform      3.44   

    Console_Brand  
0        Nintendo  
3        Nintendo  
6        Nintendo  
16           Sony  
4        Nintendo  
15       Nintendo  
13       Nintendo  
8        Nintendo  
42           Sony  
138            PC  
12       Nintendo  
77           Sony  
31           Sony  
37           Sony  
34           Sony  
24           Sony  
23      Microsoft  
30       Nintendo  
81           Sony  
20       Nintendo  
41           Sony  
35      Microsoft  
18       Nintendo  
1        Nintendo  
22       Nintendo  

Top 5 Selling Games in Top 5 Genres in NA:
                                        Name         Genre  NA_Sales  \
0                                 Wii Sports        Sports     41.36   
1                          Super Mario Bros.      Platform     29.08   
9                                  Duck Hunt       Shooter     26.93   
3                          Wii Sports Resort        Sports     15.61   
8                  New Super Mario Bros. Wii      Platform     14.44   
18                         Super Mario World      Platform     12.78   
6                      New Super Mario Bros.      Platform     11.28   
4                   Pokemon Red/Pokemon Blue  Role-Playing     11.27   
21                          Super Mario Land      Platform     10.83   
32                   Call of Duty: Black Ops       Shooter      9.70   
23                        Grand Theft Auto V        Action      9.66   
17             Grand Theft Auto: San Andreas        Action      9.43   
29            Call of Duty: Modern Warfare 3       Shooter      9.04   
15                              Wii Fit Plus        Sports      9.01   
12               Pokemon Gold/Pokemon Silver  Role-Playing      9.00   
13                                   Wii Fit        Sports      8.92   
36            Call of Duty: Modern Warfare 2       Shooter      8.52   
24               Grand Theft Auto: Vice City        Action      8.41   
35                Call of Duty: Black Ops II       Shooter      8.25   
16                        Grand Theft Auto V        Action      7.02   
38                      Grand Theft Auto III        Action      6.99   
20             Pokemon Diamond/Pokemon Pearl  Role-Playing      6.38   
25             Pokemon Ruby/Pokemon Sapphire  Role-Playing      6.06   
30   Pokémon Yellow: Special Pikachu Edition  Role-Playing      5.89   
238                          Madden NFL 2005        Sports      4.18   

    Console_Brand  
0        Nintendo  
1        Nintendo  
9        Nintendo  
3        Nintendo  
8        Nintendo  
18       Nintendo  
6        Nintendo  
4        Nintendo  
21       Nintendo  
32      Microsoft  
23      Microsoft  
17           Sony  
29      Microsoft  
15       Nintendo  
12       Nintendo  
13       Nintendo  
36      Microsoft  
24           Sony  
35      Microsoft  
16           Sony  
38           Sony  
20       Nintendo  
25       Nintendo  
30       Nintendo  
238          Sony  

Top 5 Selling Games in Top 5 Genres in Japan:
                                      Name         Genre  JP_Sales  \
4                 Pokemon Red/Pokemon Blue  Role-Playing     10.22   
12             Pokemon Gold/Pokemon Silver  Role-Playing      7.20   
1                        Super Mario Bros.      Platform      6.81   
6                    New Super Mario Bros.      Platform      6.50   
20           Pokemon Diamond/Pokemon Pearl  Role-Playing      6.04   
27             Pokemon Black/Pokemon White  Role-Playing      5.65   
25           Pokemon Ruby/Pokemon Sapphire  Role-Playing      5.38   
8                New Super Mario Bros. Wii      Platform      4.70   
21                        Super Mario Land      Platform      4.18   
46    Pokemon HeartGold/Pokemon SoulSilver        Action      3.96   
22                     Super Mario Bros. 3      Platform      3.84   
0                               Wii Sports        Sports      3.77   
13                                 Wii Fit        Sports      3.60   
3                        Wii Sports Resort        Sports      3.28   
406                       Monster Hunter X        Action      2.79   
15                            Wii Fit Plus        Sports      2.53   
300                                   Golf        Sports      2.46   
694                    Yokai Watch Busters        Action      2.29   
151                        Resident Evil 2        Action      2.02   
124                    The Legend of Zelda        Action      1.69   
247                               Splatoon       Shooter      1.46   
1233                               Xevious       Shooter      1.27   
1681                               TwinBee       Shooter      1.20   
1424                               Gradius       Shooter      1.00   
449                        Resident Evil 6       Shooter      0.88   

     Console_Brand  
4         Nintendo  
12        Nintendo  
1         Nintendo  
6         Nintendo  
20        Nintendo  
27        Nintendo  
25        Nintendo  
8         Nintendo  
21        Nintendo  
46        Nintendo  
22        Nintendo  
0         Nintendo  
13        Nintendo  
3         Nintendo  
406       Nintendo  
15        Nintendo  
300       Nintendo  
694       Nintendo  
151           Sony  
124       Nintendo  
247       Nintendo  
1233      Nintendo  
1681      Nintendo  
1424      Nintendo  
449           Sony

In [87]:

print(df.columns)

Index(['Name', 'Platform', 'Year_of_Release', 'Genre', 'Publisher', 'NA_Sales',
       'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales', 'Critic_Score',
       'Critic_Count', 'User_Score', 'User_Count', 'Developer', 'Rating',
       'Console_Brand', 'Series_Installment', 'NA_Sales_Percent',
       'EU_Sales_Percent', 'JP_Sales_Percent', 'Market_Saturation',
       'Multiplatform_Release', 'Years_Since_Last_Installment'],
      dtype='object')

In [88]:

top_genres = df.groupby('Genre')['Global_Sales'].sum().sort_values(ascending=False).head(5).index

In [89]:

import seaborn as sns

sales_heatmap_data = df.groupby('Genre')[['NA_Sales', 'EU_Sales', 'JP_Sales']].sum()

plt.figure(figsize=(10, 8))
sns.heatmap(sales_heatmap_data, annot=True, fmt=".1f", cmap="YlGnBu", linewidths=.5)
plt.title('Heatmap of Game Sales by Genre Across Regions')
plt.ylabel('Genre')
plt.xlabel('Region')
plt.show()

In [90]:

import plotly.express as px

top_games = pd.concat([top_games_eu, top_games_na, top_games_jp]).drop_duplicates(subset=['Name'])

top_games_long = top_games.melt(id_vars=['Name', 'Genre', 'Console_Brand'], value_vars=['EU_Sales', 'NA_Sales', 'JP_Sales'],
                                var_name='Region', value_name='Sales')

fig = px.bar(top_games_long, y='Name', x='Sales',
             color='Region', barmode='group',
             hover_data=['Genre', 'Console_Brand'],
             category_orders={"Region": ["EU_Sales", "NA_Sales", "JP_Sales"]},  # Ensure consistent order
             labels={'Sales': 'Sales (in millions)', 'Name': 'Game Name'})

fig.update_layout(title_text='Top Games Sales Across EU, NA, and JP Regions',
                  yaxis_title='Game Name',
                  xaxis_title='Sales (in millions)',
                  legend_title='Region')

fig.show()

In [91]:

print(df.columns)

Index(['Name', 'Platform', 'Year_of_Release', 'Genre', 'Publisher', 'NA_Sales',
       'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales', 'Critic_Score',
       'Critic_Count', 'User_Score', 'User_Count', 'Developer', 'Rating',
       'Console_Brand', 'Series_Installment', 'NA_Sales_Percent',
       'EU_Sales_Percent', 'JP_Sales_Percent', 'Market_Saturation',
       'Multiplatform_Release', 'Years_Since_Last_Installment'],
      dtype='object')

In [92]:

import plotly.express as px
import pandas as pd


top_games = pd.concat([top_games_eu, top_games_na, top_games_jp]).drop_duplicates(subset=['Name'])

top_games_long = top_games.melt(id_vars=['Name', 'Genre', 'Console_Brand'], value_vars=['EU_Sales', 'NA_Sales', 'JP_Sales'],
                                var_name='Region', value_name='Sales')

fig = px.bar(top_games_long, x='Region', y='Sales',
             color='Name',  # Differentiate bars by game name
             hover_data=['Genre', 'Console_Brand'],
             category_orders={"Region": ["EU_Sales", "NA_Sales", "JP_Sales"]},  # Ensure consistent order
             labels={'Sales': 'Sales (in millions)', 'Region': 'Sales Region'})

fig.update_layout(title_text='Sales of Top Games Across EU, NA, and JP Regions by Game',
                  xaxis_title='Sales Region',
                  yaxis_title='Sales (in millions)',
                  legend_title='Game Name')

fig.show()

In [93]:

top_eu_games = df.sort_values(by='EU_Sales', ascending=False).head(5)

top_na_games = df.sort_values(by='NA_Sales', ascending=False).head(5)

top_jp_games = df.sort_values(by='JP_Sales', ascending=False).head(5)

In [94]:

print(df.columns)

Index(['Name', 'Platform', 'Year_of_Release', 'Genre', 'Publisher', 'NA_Sales',
       'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales', 'Critic_Score',
       'Critic_Count', 'User_Score', 'User_Count', 'Developer', 'Rating',
       'Console_Brand', 'Series_Installment', 'NA_Sales_Percent',
       'EU_Sales_Percent', 'JP_Sales_Percent', 'Market_Saturation',
       'Multiplatform_Release', 'Years_Since_Last_Installment'],
      dtype='object')

In [95]:

import pandas as pd
from math import pi
import matplotlib.pyplot as plt


df['Year_of_Release'] = df['Year_of_Release'].fillna(0).astype(int)

def lump_and_customize_game_names(row):
    name = row['Name']
    year = str(row['Year_of_Release']) if row['Year_of_Release'] > 0 else 'Unknown'
    platform = row['Platform']

    if 'Mario' in name and not 'Mario' in name.split()[-1]:  
        return f'Super Mario Series ({year}, {platform})'
    elif 'Zelda' in name:
        return f'Zelda Series ({year}, {platform})'
    elif 'Pokemon' in name:
        return f'Pokemon Series ({year}, {platform})'
    elif 'Call of Duty' in name:
        return f'Call of Duty Series ({year}, {platform})'
    elif 'Resident Evil' in name or 'Res Evil' in name:
        return f'Resident Evil Series ({year}, {platform})'
    elif 'Wii Fit' in name:
        return f'Wii Fit Series ({year}, {platform})'
    elif 'Grand Theft Auto' in name or 'GTA' in name:
        return f'GTA Series ({year}, {platform})'
    else:
        return f'{name} ({year}, {platform})'

df['Lumped_and_Customized_Name'] = df.apply(lump_and_customize_game_names, axis=1)

top_eu_games = df.sort_values(by='EU_Sales', ascending=False).head(5)
top_na_games = df.sort_values(by='NA_Sales', ascending=False).head(5)
top_jp_games = df.sort_values(by='JP_Sales', ascending=False).head(5)

top_games = pd.concat([top_eu_games, top_na_games, top_jp_games]).drop_duplicates(subset=['Name'])

categories = top_games['Lumped_and_Customized_Name'].unique()
N = len(categories)
angles = [n / float(N) * 2 * pi for n in range(N)]
angles += angles[:1]

fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(polar=True))

for region in ['NA_Sales', 'EU_Sales', 'JP_Sales']:
    grouped_values = top_games.groupby('Lumped_and_Customized_Name')[region].sum().reindex(categories).fillna(0)
    values = grouped_values.values.flatten().tolist()
    values += values[:1]  # Complete the loop
    ax.plot(angles, values, linewidth=1, linestyle='solid', label=f"{region} Sales")
    ax.fill(angles, values, alpha=0.1)

ax.set_thetagrids([angle * 180/pi for angle in angles[:-1]], categories, fontsize=10)
plt.title('Radar Chart of Top Game Sales Across Regions (Lumped Series)', size=15, color='saddlebrown', y=1.1)
plt.legend(loc='upper right', bbox_to_anchor=(1.1, 1.1))

plt.show()

In [96]:

import pandas as pd
import plotly.graph_objects as go

# Function to lump game names and customize them
def lump_and_customize_game_names(row):
    name = row['Name']
    year = str(row['Year_of_Release']) if row['Year_of_Release'] > 0 else 'Unknown'
    platform = row['Platform']
    
    # Lumping conditions for various game series
    if 'Mario' in name and 'Mario' not in name.split()[-1]:
        return f'Super Mario Series ({year}, {platform})'
    elif 'Zelda' in name:
        return f'Zelda Series ({year}, {platform})'
    elif 'Pokemon' in name:
        return f'Pokemon Series ({year}, {platform})'
    elif 'Call of Duty' in name:
        return f'Call of Duty Series ({year}, {platform})'
    elif 'Resident Evil' in name or 'Res Evil' in name:
        return f'Resident Evil Series ({year}, {platform})'
    elif 'Wii Fit' in name:
        return f'Wii Fit Series ({year}, {platform})'
    elif 'Grand Theft Auto' in name or 'GTA' in name:
        return f'GTA Series ({year}, {platform})'
    else:
        return f'{name} ({year}, {platform})'

# Ensure 'Name' column is string type
df['Name'] = df['Name'].astype(str)

# Apply the function to the DataFrame
df['Lumped_and_Customized_Name'] = df.apply(lump_and_customize_game_names, axis=1)

# Filter and sort to get top games in EU, NA, and JP
top_eu_games = df.sort_values(by='EU_Sales', ascending=False).head(5)
top_na_games = df.sort_values(by='NA_Sales', ascending=False).head(5)
top_jp_games = df.sort_values(by='JP_Sales', ascending=False).head(5)

# Combine top games from each region into a single DataFrame and remove duplicates
top_games = pd.concat([top_eu_games, top_na_games, top_jp_games]).drop_duplicates(subset=['Name'])

# Prepare data for the radar chart
categories = top_games['Lumped_and_Customized_Name'].unique()

# Create the radar chart
fig = go.Figure()

# Loop through each region to add data to the radar chart
for region in ['NA_Sales', 'EU_Sales', 'JP_Sales']:
    grouped_values = top_games.groupby('Lumped_and_Customized_Name')[region].sum().reindex(categories).fillna(0)
    values = grouped_values.values.flatten().tolist()
    values += values[:1]  # Complete the loop

    fig.add_trace(go.Scatterpolar(
        r=values,
        theta=categories.tolist() + [categories[0]],  # Complete the loop for categories
        fill='toself',
        name=f'{region} Sales'
    ))

# Update the layout for the radar chart
fig.update_layout(
    polar=dict(
        radialaxis=dict(
            visible=True,
            range=[0, top_games[['NA_Sales', 'EU_Sales', 'JP_Sales']].max().max()]  # Set the range based on max sales
        )
    ),
    title={
        'text': 'Interactive Radar Chart of Top Game Sales Across Regions (Lumped Series)',
        'y':0.9,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    showlegend=True
)

# Display the radar chart
fig.show()

Data Grouping and Aggregation: Console Brand and Genre Sales: The data is grouped by 'Console_Brand' and 'Genre' to aggregate regional sales, setting the stage for comparing sales distributions across different segments. Plotly Stacked Bar Charts: Console Brand Regional Sales: Plotly is used to create interactive stacked bar charts displaying the sales distribution across different regions for each console brand. The choice of a stacked layout and distinct color coding for each region enhances the readability of the sales distribution. Matplotlib/Seaborn Plots: Bar Plots: With Matplotlib and Seaborn, bar plots are crafted to visualize regional sales by genre, employing both grouped and stacked bar chart styles. These visualizations provide a clear comparative view of how different genres perform across various markets. Plotly Grouped Bar Chart: Genre Sales by Region: Another interactive Plotly chart showcases regional sales by genre in a grouped bar format, allowing for a direct comparison between regions for each genre, with interactive elements enhancing user engagement. Radar Charts: Top Games Sales Across Regions: The analysis concludes with radar charts illustrating the sales performance of top-selling games across different regions, offering a unique perspective on the data. The radar chart, especially when interactive, is effective in showcasing the relative strengths of each game across multiple dimensions (sales regions).

In [97]:

print(top_games[['Lumped_and_Customized_Name', 'NA_Sales', 'EU_Sales', 'JP_Sales']])

aggregated_sales = top_games.groupby('Lumped_and_Customized_Name')[['NA_Sales', 'EU_Sales', 'JP_Sales']].sum().reset_index()

print(aggregated_sales)

total_sales_by_series = aggregated_sales.set_index('Lumped_and_Customized_Name').sum(axis=1).reset_index(name='Total_Sales')
print(total_sales_by_series)

                           Lumped_and_Customized_Name  NA_Sales  EU_Sales  \
0                              Wii Sports (2006, Wii)     41.36     28.96   
2                      Super Mario Series (2008, Wii)     15.68     12.76   
10                              Nintendogs (2005, DS)      9.05     10.95   
3                       Wii Sports Resort (2009, Wii)     15.61     10.93   
19  Brain Age: Train Your Brain in Minutes a Day (...      4.74      9.20   
1                      Super Mario Series (1985, NES)     29.08      3.58   
9                               Duck Hunt (1984, NES)     26.93      0.63   
5                                   Tetris (1989, GB)     23.20      2.26   
4                           Pokemon Series (1996, GB)     11.27      8.89   
12                          Pokemon Series (1999, GB)      9.00      6.18   
6                       Super Mario Series (2006, DS)     11.28      9.14   
20                          Pokemon Series (2006, DS)      6.38      4.46   

    JP_Sales  
0       3.77  
2       3.79  
10      1.93  
3       3.28  
19      4.16  
1       6.81  
9       0.28  
5       4.22  
4      10.22  
12      7.20  
6       6.50  
20      6.04  
                           Lumped_and_Customized_Name  NA_Sales  EU_Sales  \
0   Brain Age: Train Your Brain in Minutes a Day (...      4.74      9.20   
1                               Duck Hunt (1984, NES)     26.93      0.63   
2                               Nintendogs (2005, DS)      9.05     10.95   
3                           Pokemon Series (1996, GB)     11.27      8.89   
4                           Pokemon Series (1999, GB)      9.00      6.18   
5                           Pokemon Series (2006, DS)      6.38      4.46   
6                      Super Mario Series (1985, NES)     29.08      3.58   
7                       Super Mario Series (2006, DS)     11.28      9.14   
8                      Super Mario Series (2008, Wii)     15.68     12.76   
9                                   Tetris (1989, GB)     23.20      2.26   
10                             Wii Sports (2006, Wii)     41.36     28.96   
11                      Wii Sports Resort (2009, Wii)     15.61     10.93   

    JP_Sales  
0       4.16  
1       0.28  
2       1.93  
3      10.22  
4       7.20  
5       6.04  
6       6.81  
7       6.50  
8       3.79  
9       4.22  
10      3.77  
11      3.28  
                           Lumped_and_Customized_Name  Total_Sales
0   Brain Age: Train Your Brain in Minutes a Day (...        18.10
1                               Duck Hunt (1984, NES)        27.84
2                               Nintendogs (2005, DS)        21.93
3                           Pokemon Series (1996, GB)        30.38
4                           Pokemon Series (1999, GB)        22.38
5                           Pokemon Series (2006, DS)        16.88
6                      Super Mario Series (1985, NES)        39.47
7                       Super Mario Series (2006, DS)        26.92
8                      Super Mario Series (2008, Wii)        32.23
9                                   Tetris (1989, GB)        29.68
10                             Wii Sports (2006, Wii)        74.09
11                      Wii Sports Resort (2009, Wii)        29.82

In [98]:

import plotly.graph_objects as go
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

def lump_game_series(name):
    if 'Pokemon' in name:
        return 'Pokemon Series'
    elif 'Super Mario' in name:
        return 'Super Mario Series'
    elif 'Wii Sports' in name:
        return 'Wii Sports Series'
    elif 'Call of Duty: Modern Warfare' in name or 'Call of Duty: Black Ops' in name:
        return 'Call of Duty Series'  
    else:
        return name

df['Game_Series'] = df['Name'].apply(lump_game_series)

series_sales = df.groupby('Game_Series')[['NA_Sales', 'EU_Sales', 'JP_Sales']].sum().reset_index()

scaler = MinMaxScaler()
series_sales[['NA_Sales', 'EU_Sales', 'JP_Sales']] = scaler.fit_transform(series_sales[['NA_Sales', 'EU_Sales', 'JP_Sales']])

series_sales['Total_Sales'] = series_sales[['NA_Sales', 'EU_Sales', 'JP_Sales']].sum(axis=1)
top_series_sales = series_sales.sort_values(by='Total_Sales', ascending=False).head(10)

fig = go.Figure()

for i, row in top_series_sales.iterrows():
    fig.add_trace(go.Scatterpolar(
        r=[row['NA_Sales'], row['EU_Sales'], row['JP_Sales'], row['NA_Sales']],  
        theta=['NA Sales', 'EU Sales', 'JP Sales', 'NA Sales'],  
        fill='toself',
        name=row['Game_Series'],
        hoverinfo='name+r+theta'  
    ))


fig.update_layout(
    polar=dict(
        radialaxis=dict(
            visible=True,
            range=[0, 1] 
        )
    ),
    title='Top Game Series Sales Across Regions (Normalized Data)',
    legend=dict(
        title='Game Series',
        itemsizing='constant'
    )
)


fig.show()

In [99]:

print(df.columns)

Index(['Name', 'Platform', 'Year_of_Release', 'Genre', 'Publisher', 'NA_Sales',
       'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales', 'Critic_Score',
       'Critic_Count', 'User_Score', 'User_Count', 'Developer', 'Rating',
       'Console_Brand', 'Series_Installment', 'NA_Sales_Percent',
       'EU_Sales_Percent', 'JP_Sales_Percent', 'Market_Saturation',
       'Multiplatform_Release', 'Years_Since_Last_Installment',
       'Lumped_and_Customized_Name', 'Game_Series'],
      dtype='object')

In [100]:

import plotly.graph_objects as go

top_series_sales = top_series_sales.sort_values(by='Total_Sales', ascending=False)

fig = go.Figure()

regions = ['NA_Sales', 'EU_Sales', 'JP_Sales']
colors = ['#1f77b4', '#ff7f0e', '#2ca02c']  

for region, color in zip(regions, colors):
    fig.add_trace(go.Bar(
        x=top_series_sales['Game_Series'],
        y=top_series_sales[region],
        name=region,
        marker_color=color
    ))

fig.update_layout(
    barmode='stack',
    title='Top Game Series Sales Across Regions',
    xaxis_title='Game Series',
    yaxis_title='Normalized Sales',
    legend_title='Region'
)

fig.show()

In [101]:

print(df.columns)

Index(['Name', 'Platform', 'Year_of_Release', 'Genre', 'Publisher', 'NA_Sales',
       'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales', 'Critic_Score',
       'Critic_Count', 'User_Score', 'User_Count', 'Developer', 'Rating',
       'Console_Brand', 'Series_Installment', 'NA_Sales_Percent',
       'EU_Sales_Percent', 'JP_Sales_Percent', 'Market_Saturation',
       'Multiplatform_Release', 'Years_Since_Last_Installment',
       'Lumped_and_Customized_Name', 'Game_Series'],
      dtype='object')

In [102]:

import plotly.graph_objects as go
import pandas as pd


console_brand_sales = df.groupby('Console_Brand')[['NA_Sales', 'EU_Sales', 'JP_Sales']].sum().reset_index()

console_brand_sales['Global_Sales'] = console_brand_sales[['NA_Sales', 'EU_Sales', 'JP_Sales']].sum(axis=1)
console_brand_sales = console_brand_sales.sort_values(by='Global_Sales', ascending=False)

fig = go.Figure()

regions = ['NA_Sales', 'EU_Sales', 'JP_Sales']
colors = ['#1f77b4', '#ff7f0e', '#2ca02c']  # Colors for North America, Europe, and Japan respectively

for region, color in zip(regions, colors):
    fig.add_trace(go.Bar(
        name=region,
        x=console_brand_sales['Console_Brand'],
        y=console_brand_sales[region],
        marker_color=color
    ))

fig.update_layout(
    barmode='stack',
    title='Market Share per Console Brand Across Different Regions',
    xaxis_title='Console Brand',
    yaxis_title='Sales (in millions)',
    legend_title='Region',
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
)

fig.show()

Top Games Series Sales: A DataFrame named top_games presumably containing information about top-selling games is manipulated to focus on a customized name (Lumped_and_Customized_Name), representing grouped game series, and their sales in North America (NA), Europe (EU), and Japan (JP). Sales data is then aggregated by the customized name to calculate total sales per region for each game series, resulting in the DataFrame aggregated_sales. Finally, total sales across all regions are computed for each game series and presented in total_sales_by_series. Plotly Stacked Bar Chart for Game Series: Visualization: An interactive stacked bar chart using Plotly is created to display regional sales for top game series (top_series_sales). The chart employs a stacking approach to show the composition of sales from different regions for each game series. Customization: The layout is customized to include a title, axis labels, and a horizontal legend. Sales are normalized, and distinct colors are used to represent different regions for clear differentiation. Plotly Stacked Bar Chart for Console Brand Sales: Console Brand Sales Analysis: Sales data is grouped by console brand, calculating total sales in NA, EU, and JP, and a new column Global_Sales is added to sum these regional sales. Visualization: Another Plotly stacked bar chart is constructed to showcase the market share of each console brand across the different regions. The sales data is visualized in a stacked manner for each brand, providing insights into the geographical distribution of their sales. Customization: Similar to the first chart, this one also features a customized layout with a stacked bar mode, clear title, axis labels, and a modified legend positioned horizontally below the chart for easy interpretation. Insights and Considerations: The analysis segments and visualizes significant data points, such as top game series and console brand sales, offering valuable insights into market performance and regional preferences. The use of Plotly for interactive visualizations enhances the data exploration experience, allowing for dynamic engagement with the sales figures across different regions and categories. While the code is well-structured for data analysis and visualization, it assumes the existence of certain DataFrames (top_games, top_series_sales, and df) and their specific columns. For full reproducibility, ensuring these DataFrames are correctly defined and populated with relevant data is crucial. In the context of global sales visualization, considering additional regions beyond NA, EU, and JP, if data is available, could provide a more comprehensive view of worldwide sales trends.

Genre Popularity Over Time by Region

In [103]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df_filtered = df.dropna(subset=['Year_of_Release', 'Genre'])
df_filtered['Year_of_Release'] = df_filtered['Year_of_Release'].astype(int) 

region_columns = ['NA_Sales', 'EU_Sales', 'JP_Sales']
for region in region_columns:
    sales_by_year_genre = df_filtered.groupby(['Year_of_Release', 'Genre'])[region].sum().reset_index()

    # Plotting
    plt.figure(figsize=(14, 8))
    sns.lineplot(data=sales_by_year_genre, x='Year_of_Release', y=region, hue='Genre', marker='o')
    plt.title(f'Genre Popularity Over Time in {region[:-6]}')
    plt.xlabel('Year of Release')
    plt.ylabel('Total Sales (in millions)')
    plt.legend(title='Genre', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [104]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def plot_genre_popularity_over_time(df, start_year, end_year):
    """
    Plots the popularity of game genres over time within a specified year range.

    Parameters:
    - df: DataFrame containing the game data.
    - start_year: The starting year of the period for analysis.
    - end_year: The ending year of the period for analysis.
    """

    df_filtered = df.dropna(subset=['Year_of_Release', 'Genre'])
    df_filtered = df_filtered[(df_filtered['Year_of_Release'] >= start_year) & (df_filtered['Year_of_Release'] <= end_year)]
    df_filtered['Year_of_Release'] = df_filtered['Year_of_Release'].astype(int)  # Ensure the year is an integer

    region_columns = ['NA_Sales', 'EU_Sales', 'JP_Sales']
    for region in region_columns:
        sales_by_year_genre = df_filtered.groupby(['Year_of_Release', 'Genre'])[region].sum().reset_index()

        # Plotting
        plt.figure(figsize=(14, 8))
        sns.lineplot(data=sales_by_year_genre, x='Year_of_Release', y=region, hue='Genre', marker='o')
        plt.title(f'Genre Popularity Over Time in {region[:-6]} ({start_year}-{end_year})')
        plt.xlabel('Year of Release')
        plt.ylabel('Total Sales (in millions)')
        plt.legend(title='Genre', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()


plot_genre_popularity_over_time(df, 2009, 2016)

In [105]:

import pandas as pd
import plotly.express as px

def plot_genre_popularity_over_time_interactive(df, start_year, end_year):
    """
    Creates an interactive plot of the popularity of game genres over time within a specified year range using Plotly Express.

    Parameters:
    - df: DataFrame containing the game data.
    - start_year: The starting year of the period for analysis.
    - end_year: The ending year of the period for analysis.
    """

    df_filtered = df.dropna(subset=['Year_of_Release', 'Genre'])
    df_filtered = df_filtered[(df_filtered['Year_of_Release'] >= start_year) & (df_filtered['Year_of_Release'] <= end_year)]
    df_filtered['Year_of_Release'] = df_filtered['Year_of_Release'].astype(int)  # Ensure the year is an integer

    region_columns = ['NA_Sales', 'EU_Sales', 'JP_Sales']
    for region in region_columns:
        sales_by_year_genre = df_filtered.groupby(['Year_of_Release', 'Genre'])[region].sum().reset_index()

        # Creating an interactive plot
        fig = px.line(sales_by_year_genre, x='Year_of_Release', y=region, color='Genre', markers=True,
                      labels={'Year_of_Release': 'Year of Release', region: f'Total Sales in {region[:-6]} (in millions)'},
                      title=f'Interactive Genre Popularity Over Time in {region[:-6]} ({start_year}-{end_year})')
        
        fig.update_layout(legend_title='Genre', legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="center", x=0.5))
        
        fig.show()


plot_genre_popularity_over_time_interactive(df, 2009, 2016)

Game Market Saturation by Genre Over Time

In [106]:

games_by_year_genre = df_filtered.groupby(['Year_of_Release', 'Genre']).size().reset_index(name='Game_Count')

# Plotting
plt.figure(figsize=(14, 8))
sns.lineplot(data=games_by_year_genre, x='Year_of_Release', y='Game_Count', hue='Genre', marker='o')
plt.title('Game Market Saturation by Genre Over Time (1985-2015)')
plt.xlabel('Year of Release')
plt.ylabel('Number of Games Released')
plt.legend(title='Genre', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [107]:

import plotly.graph_objects as go
from plotly.subplots import make_subplots

df_filtered = df[(df['Year_of_Release'] >= 1985) & (df['Year_of_Release'] <= 2015)].dropna(subset=['Year_of_Release', 'Genre'])
df_filtered['Year_of_Release'] = df_filtered['Year_of_Release'].astype(int)

regions = ['NA_Sales', 'EU_Sales', 'JP_Sales']

fig = make_subplots(rows=1, cols=len(regions), subplot_titles=regions, specs=[[{'type': 'scatter'}]*len(regions)])

for i, region in enumerate(regions, start=1):
    sales_by_year_genre = df_filtered.groupby(['Year_of_Release', 'Genre'])[region].sum().reset_index()
    
    for genre in df_filtered['Genre'].unique():
        genre_data = sales_by_year_genre[sales_by_year_genre['Genre'] == genre]
        fig.add_trace(
            go.Scatter(x=genre_data['Year_of_Release'], y=genre_data[region], mode='lines+markers', name=genre),
            row=1, col=i
        )
    fig.update_xaxes(title_text='Year of Release', row=1, col=i)
    fig.update_yaxes(title_text='Total Sales (in millions)', row=1, col=i)

fig.update_layout(title='Genre Popularity Over Time by Region (1985-2015)', showlegend=True)
fig.show()

In [108]:

import plotly.graph_objects as go
from plotly.subplots import make_subplots

df_filtered = df[(df['Year_of_Release'] >= 1985) & (df['Year_of_Release'] <= 2015)].dropna(subset=['Year_of_Release', 'Genre'])
df_filtered['Year_of_Release'] = df_filtered['Year_of_Release'].astype(int)

regions = ['NA_Sales', 'EU_Sales', 'JP_Sales']

fig = make_subplots(rows=len(regions), cols=1, subplot_titles=regions, vertical_spacing=0.1)

for i, region in enumerate(regions, start=1):
   
    sales_by_year_genre = df_filtered.groupby(['Year_of_Release', 'Genre'])[region].sum().reset_index()
    

    for genre in df_filtered['Genre'].unique():
        genre_data = sales_by_year_genre[sales_by_year_genre['Genre'] == genre]
        fig.add_trace(
            go.Scatter(x=genre_data['Year_of_Release'], y=genre_data[region], mode='lines+markers', name=genre, legendgroup=genre),
            row=i, col=1  
        )
    
    fig.update_xaxes(title_text='Year of Release', row=i, col=1)
    fig.update_yaxes(title_text='Total Sales (in millions)', row=i, col=1)

fig.update_layout(
    title='Genre Popularity Over Time by Region (1985-2015)',
    height=1200,  
    legend=dict(
        title='Genre',
        orientation="h", 
        yanchor="bottom", 
        y=-0.3,  
        xanchor="center", 
        x=0.5
    )
)

fig.show()

In [109]:

fig_saturation = make_subplots(rows=1, cols=len(regions), subplot_titles=regions, specs=[[{'type': 'scatter'}]*len(regions)])

for i, region in enumerate(regions, start=1):
    games_by_year_genre = df_filtered.groupby(['Year_of_Release', 'Genre']).size().reset_index(name='Game_Count')
    
    for genre in df_filtered['Genre'].unique():
        genre_data = games_by_year_genre[games_by_year_genre['Genre'] == genre]
        fig_saturation.add_trace(
            go.Scatter(x=genre_data['Year_of_Release'], y=genre_data['Game_Count'], mode='lines+markers', name=genre),
            row=1, col=i
        )
    fig_saturation.update_xaxes(title_text='Year of Release', row=1, col=i)
    fig_saturation.update_yaxes(title_text='Number of Games Released', row=1, col=i)

fig_saturation.update_layout(title='Game Market Saturation by Genre Over Time (1985-2015)', showlegend=True)
fig_saturation.show()

Visualizations: Line Plots for Genre Popularity: For each region, line plots are created to visualize the total sales by genre over time. These plots use seaborn's lineplot function and display sales trends, helping identify popular genres and shifts in genre popularity over the years.

Interactive Line Plots with Plotly: Similar to the seaborn line plots, interactive line plots are created using Plotly for each region. These plots enhance user engagement by allowing interactive exploration of the data, such as hovering over data points to see exact values.

Subplots for Regional Analysis: Plotly is also used to create subplots, one for each region, to compare genre popularity over time side by side. This layout provides a clear comparison of regional trends and preferences.

Market Saturation Analysis: The analysis is extended to examine market saturation by counting the number of games released each year by genre. This insight helps understand the volume of games competing in the market over time and identifies potential over-saturation or under-represented genres.

Insights and Considerations: The line plots and interactive visualizations effectively highlight trends in genre popularity, revealing how consumer preferences may have evolved and how different regions may favor different genres. The market saturation analysis provides valuable insights into the gaming industry's dynamics, showing how the number of games released in each genre has changed over time, which can inform decisions on market entry and genre focus. The use of interactive Plotly plots adds a layer of depth to the analysis, allowing for a more detailed exploration of the data points and trends. The visualizations benefit from clear titles, axis labels, and legends, enhancing readability and interpretability of the insights derived from the data.

Concentrating on the Last 7 Years

In [110]:

latest_year = df['Year_of_Release'].max()
df_last_7_years = df[(df['Year_of_Release'] > latest_year - 7) & (df['Year_of_Release'] <= latest_year)]

In [111]:

top_selling_games = df_last_7_years.sort_values(by='Global_Sales', ascending=False).head(10)

In [112]:

import plotly.express as px

fig = px.bar(top_selling_games, x='Name', y='Global_Sales',
             hover_data=['Year_of_Release', 'Platform', 'Genre'],
             color='Global_Sales',
             title='Top 10 Selling Video Games in the Last 7 Years',
             labels={'Global_Sales': 'Global Sales (Millions)', 'Name': 'Game Name'})

# Improve layout
fig.update_layout(xaxis_title="Game Name",
                  yaxis_title="Global Sales (Millions)",
                  xaxis={'categoryorder':'total descending'},
                  coloraxis_colorbar=dict(title='Sales (Millions)'))

fig.show()

In [113]:

latest_year = df['Year_of_Release'].max()
df_last_7_years = df[(df['Year_of_Release'] > latest_year - 7) & (df['Year_of_Release'] <= latest_year)]

console_sales = df_last_7_years.groupby('Platform').agg({'Global_Sales':'sum'}).reset_index()

top_consoles = console_sales.sort_values(by='Global_Sales', ascending=False).head(3)['Platform']

In [114]:

top_games_by_console = {}

for console in top_consoles:
    # Filter data for each top console
    console_data = df_last_7_years[df_last_7_years['Platform'] == console]
    
    # Find top 3 games globally
    top_games_global = console_data.sort_values(by='Global_Sales', ascending=False).head(3)
    
    # Store in dictionary
    top_games_by_console[console] = {
        'Global': top_games_global[['Name', 'Global_Sales']]
    }
    
    # Find top 3 games regionally
    for region in ['NA_Sales', 'EU_Sales', 'JP_Sales']:
        top_games_region = console_data.sort_values(by=region, ascending=False).head(3)
        top_games_by_console[console][region] = top_games_region[['Name', region]]

In [115]:

for console, games_info in top_games_by_console.items():
    print(f"\nTop Selling Console: {console}")
    print("Top 3 Global Games:")
    print(games_info['Global'].to_string(index=False))
    
    for region in ['NA_Sales', 'EU_Sales', 'JP_Sales']:
        print(f"\nTop 3 Games in {region.split('_')[0]}:")
        print(games_info[region].to_string(index=False))

Top Selling Console: PS4
Top 3 Global Games:
                     Name  Global_Sales
Call of Duty: Black Ops 3         14.63
       Grand Theft Auto V         12.61
                  FIFA 16          8.57

Top 3 Games in NA:
                        Name  NA_Sales
   Call of Duty: Black Ops 3      6.03
          Grand Theft Auto V      3.96
Star Wars Battlefront (2015)      2.99

Top 3 Games in EU:
                     Name  EU_Sales
       Grand Theft Auto V      6.31
                  FIFA 16      6.12
Call of Duty: Black Ops 3      5.86

Top 3 Games in JP:
                                Name  JP_Sales
Metal Gear Solid V: The Phantom Pain      0.49
                  Grand Theft Auto V      0.38
                           Persona 5      0.37

Top Selling Console: XOne
Top 3 Global Games:
                          Name  Global_Sales
     Call of Duty: Black Ops 3          7.39
            Grand Theft Auto V          5.48
Call of Duty: Advanced Warfare          5.27

Top 3 Games in NA:
                          Name  NA_Sales
     Call of Duty: Black Ops 3      4.59
Call of Duty: Advanced Warfare      3.22
            Grand Theft Auto V      2.81

Top 3 Games in EU:
                     Name  EU_Sales
       Grand Theft Auto V      2.19
                  FIFA 16      2.12
Call of Duty: Black Ops 3      2.11

Top 3 Games in JP:
                             Name  JP_Sales
                        Titanfall      0.04
Halo: The Master Chief Collection      0.03
                Halo 5: Guardians      0.03

Top Selling Console: 3DS
Top 3 Global Games:
                                     Name  Global_Sales
Pokemon Omega Ruby/Pokemon Alpha Sapphire         11.68
      Super Smash Bros. for Wii U and 3DS          7.55
                         Pokemon Sun/Moon          7.14

Top 3 Games in NA:
                                     Name  NA_Sales
Pokemon Omega Ruby/Pokemon Alpha Sapphire      4.35
      Super Smash Bros. for Wii U and 3DS      3.27
                         Pokemon Sun/Moon      2.98

Top 3 Games in EU:
                                     Name  EU_Sales
Pokemon Omega Ruby/Pokemon Alpha Sapphire      3.49
                         Pokemon Sun/Moon      1.45
      Super Smash Bros. for Wii U and 3DS      1.37

Top 3 Games in JP:
                                     Name  JP_Sales
                Yokai Watch 2 Ganso/Honke      3.18
Pokemon Omega Ruby/Pokemon Alpha Sapphire      3.10
                         Monster Hunter X      2.79

In [116]:

import matplotlib.pyplot as plt

def plot_top_games(games, title):
    plt.figure(figsize=(10, 4))
    plt.bar(games['Name'], games.iloc[:, 1])
    plt.title(title)
    plt.xticks(rotation=45, ha='right')
    plt.ylabel('Sales (Millions)')
    plt.tight_layout()
    plt.show()

for console, games_info in top_games_by_console.items():
    plot_top_games(games_info['Global'], f"Top 3 Global Games for {console}")
    
    for region in ['NA_Sales', 'EU_Sales', 'JP_Sales']:
        region_name = region.split('_')[0]
        plot_top_games(games_info[region], f"Top 3 Games in {region_name} for {console}")

In [117]:

import plotly.graph_objects as go
from plotly.subplots import make_subplots



regions = ['Global', 'NA_Sales', 'EU_Sales', 'JP_Sales']
region_titles = ['Global Sales', 'North America Sales', 'Europe Sales', 'Japan Sales']

fig = make_subplots(rows=1, cols=len(regions), subplot_titles=region_titles, horizontal_spacing=0.1)

dropdown_buttons = []

for console_idx, (console, games_info) in enumerate(top_games_by_console.items()):
    for region_idx, region in enumerate(regions):
        sales_column = 'Global_Sales' if region == 'Global' else region
        
        fig.add_trace(
            go.Bar(
                x=games_info[region]['Name'], 
                y=games_info[region][sales_column], 
                name=f"{console} - {region_titles[region_idx]}"
            ),
            row=1, col=region_idx+1
        )
        
        fig.data[-1].visible = console_idx == 0

    dropdown_buttons.append(
        dict(
            label=console,
            method="update",
            args=[{"visible": [False] * len(fig.data)},
                  {"title": f"Top Games for {console}"}]
        )
    )
    for i in range(console_idx * len(regions), (console_idx + 1) * len(regions)):
        dropdown_buttons[-1]["args"][0]["visible"][i] = True

fig.update_layout(
    updatemenus=[
        dict(
            buttons=dropdown_buttons,
            direction="down",
            pad={"r": 10, "t": 10},
            showactive=True,
            x=0.1,
            xanchor="left",
            y=1.1,
            yanchor="top"
        ),
    ],
    showlegend=False
)

fig.update_layout(height=400, width=1200, title_text=f"Top Games for {list(top_games_by_console.keys())[0]}")

fig.show()

Data Filtering and Preparation:

Filters the DataFrame df for entries from the last 7 years. Identifies the top 10 selling games in this period. Groups console sales data and identifies the top 3 platforms based on global sales.

Plotly Bar Chart for Top-Selling Games: Creates an interactive bar chart displaying the top 10 selling games over the last 7 years, with hover data including 'Year_of_Release', 'Platform', and 'Genre'. The color scale is applied to the bars based on 'Global_Sales'.

Analysis of Top Games by Console: For each of the top 3 consoles, identifies the top 3 selling games globally and in each major sales region (NA, EU, JP). Prints this information in a structured format, distinguishing between global and regional top sellers.

Matplotlib Bar Plots for Top Games by Console: Utilizes Matplotlib to create bar plots showcasing the top 3 global games for each of the top consoles. Extends this visualization to also show top games in each major sales region for these consoles.

Plotly Subplots for Top Games by Region and Console: Constructs subplots for each sales region, comparing top games across the top consoles. Implements a dropdown menu to switch between consoles, updating the subplot to display the top games for the selected console in each region.

Japan Only Ninetendo Titles and Japanese Sales

In [118]:

nintendo_games = df[df['Console_Brand'] == 'Nintendo']

jp_only_nintendo_games = nintendo_games[(nintendo_games['JP_Sales'] > 0) & (nintendo_games['NA_Sales'] == 0) & (nintendo_games['EU_Sales'] == 0)]

jp_only_nintendo_games_info = jp_only_nintendo_games[['Name', 'Platform', 'Year_of_Release', 'JP_Sales']]

print(jp_only_nintendo_games_info)

                                                Name Platform  \
14985                                 Beyblade Burst      3DS   
13795                     Haikyu!! Cross Team Match!      3DS   
12251                       12-Sai. Honto no Kimochi      3DS   
13426                          12-Sai. Koisuru Diary      3DS   
1318                              4 Nin uchi Mahjong      NES   
...                                              ...      ...   
12156                      Zettai Onkan Otoda Master       DS   
15821                                     Zoids Dash       DS   
13650                                  Zombi Daisuki       DS   
12961  Zyuden Sentai Kyoryuger: Game de Gaburincho!!      3DS   
9135                    ¡Shin Chan Flipa en colores!       DS   

       Year_of_Release  JP_Sales  
14985             2016      0.03  
13795             2016      0.04  
12251             2014      0.07  
13426             2016      0.05  
1318              1984      1.45  
...                ...       ...  
12156             2007      0.07  
15821             2006      0.02  
13650             2011      0.04  
12961             2013      0.05  
9135              2007      0.14  

[1211 rows x 4 columns]

In [119]:

nintendo_games = df[df['Console_Brand'] == 'Nintendo']

jp_only_nintendo_games = nintendo_games[(nintendo_games['JP_Sales'] > 0) & (nintendo_games['NA_Sales'] == 0) & (nintendo_games['EU_Sales'] == 0)]

jp_only_nintendo_games_info = jp_only_nintendo_games[['Name', 'Platform', 'Year_of_Release', 'JP_Sales']]

print(jp_only_nintendo_games_info)

                                                Name Platform  \
14985                                 Beyblade Burst      3DS   
13795                     Haikyu!! Cross Team Match!      3DS   
12251                       12-Sai. Honto no Kimochi      3DS   
13426                          12-Sai. Koisuru Diary      3DS   
1318                              4 Nin uchi Mahjong      NES   
...                                              ...      ...   
12156                      Zettai Onkan Otoda Master       DS   
15821                                     Zoids Dash       DS   
13650                                  Zombi Daisuki       DS   
12961  Zyuden Sentai Kyoryuger: Game de Gaburincho!!      3DS   
9135                    ¡Shin Chan Flipa en colores!       DS   

       Year_of_Release  JP_Sales  
14985             2016      0.03  
13795             2016      0.04  
12251             2014      0.07  
13426             2016      0.05  
1318              1984      1.45  
...                ...       ...  
12156             2007      0.07  
15821             2006      0.02  
13650             2011      0.04  
12961             2013      0.05  
9135              2007      0.14  

[1211 rows x 4 columns]

In [120]:

japanese_nintendo_exclusives = df[(df['JP_Sales'] > 0) & (df['NA_Sales'] == 0) & (df['EU_Sales'] == 0) & (df['Console_Brand'] == 'Nintendo')]

top_25_japanese_exclusives = japanese_nintendo_exclusives.sort_values(by='JP_Sales', ascending=False).head(25)

print(top_25_japanese_exclusives[['Name', 'Platform', 'JP_Sales']])

                                                   Name Platform  JP_Sales
340                                   Friend Collection       DS      3.67
385                                    Monster Hunter 4      3DS      3.44
428                Dragon Quest VI: Maboroshi no Daichi     SNES      3.19
530                  Dragon Quest V: Tenkuu no Hanayome     SNES      2.78
563                              Yokai Watch 2 Shinuchi      3DS      2.68
574                  Super Mario Bros.: The Lost Levels      NES      2.65
632                                     Final Fantasy V     SNES      2.43
674                             Dragon Warrior Monsters       GB      2.34
694                                 Yokai Watch Busters      3DS      2.29
752                           Yu-Gi-Oh: Duel Monsters 4       GB      2.17
1046  Kanshuu Nippon Joushikiryoku Kentei Kyoukai: I...       DS      1.71
1053                                    Super Puyo Puyo     SNES      1.69
1146                            Yu-Gi-Oh! Duel Monsters       GB      1.61
1197                            Dragon Quest Monsters 2       GB      1.56
1234                                            F1 Race      NES      1.52
1244                                   Puzzle & Dragons      3DS      1.51
1260  Ninja Hattori Kun: Ninja wa Shuugyou Degogiru ...      NES      1.50
1319                                      Dragon Ball Z     SNES      1.45
1318                                 4 Nin uchi Mahjong      NES      1.45
1317                                         Tamagotchi       GB      1.44
1320                      Game de Hakken!! Tamagotchi 2       GB      1.44
1342              Classic NES Series: Super Mario Bros.      GBA      1.39
1383                                  Final Fantasy III      NES      1.39
1415                            Pocket Monsters Stadium      N64      1.37
1425            Dragon Quest III: Soshite Densetsu e...     SNES      1.36

In [121]:

japanese_exclusives_nintendo_sony = df[(df['JP_Sales'] > 0) & (df['NA_Sales'] == 0) & (df['EU_Sales'] == 0) & (df['Console_Brand'].isin(['Nintendo', 'Sony']))]

top_25_japanese_exclusives_nintendo_sony = japanese_exclusives_nintendo_sony.sort_values(by='JP_Sales', ascending=False).head(25)

print(top_25_japanese_exclusives_nintendo_sony[['Name', 'Platform', 'JP_Sales']])

                                                   Name Platform  JP_Sales
215                            Monster Hunter Freedom 3      PSP      4.87
340                                   Friend Collection       DS      3.67
385                                    Monster Hunter 4      3DS      3.44
428                Dragon Quest VI: Maboroshi no Daichi     SNES      3.19
530                  Dragon Quest V: Tenkuu no Hanayome     SNES      2.78
563                              Yokai Watch 2 Shinuchi      3DS      2.68
574                  Super Mario Bros.: The Lost Levels      NES      2.65
632                                     Final Fantasy V     SNES      2.43
674                             Dragon Warrior Monsters       GB      2.34
694                                 Yokai Watch Busters      3DS      2.29
752                           Yu-Gi-Oh: Duel Monsters 4       GB      2.17
794                                      Derby Stallion       PS      1.96
1046  Kanshuu Nippon Joushikiryoku Kentei Kyoukai: I...       DS      1.71
1053                                    Super Puyo Puyo     SNES      1.69
1115                 Dragon Quest V: Tenkuu no Hanayome      PS2      1.65
1146                            Yu-Gi-Oh! Duel Monsters       GB      1.61
1197                            Dragon Quest Monsters 2       GB      1.56
1234                                            F1 Race      NES      1.52
1244                                   Puzzle & Dragons      3DS      1.51
1260  Ninja Hattori Kun: Ninja wa Shuugyou Degogiru ...      NES      1.50
1319                                      Dragon Ball Z     SNES      1.45
1318                                 4 Nin uchi Mahjong      NES      1.45
1320                      Game de Hakken!! Tamagotchi 2       GB      1.44
1317                                         Tamagotchi       GB      1.44
1383                                  Final Fantasy III      NES      1.39

In [122]:

current_year = 2023 
start_year = current_year - 7

japanese_exclusives_nintendo_sony_recent = df[
    (df['JP_Sales'] > 0) &
    (df['NA_Sales'] == 0) &
    (df['EU_Sales'] == 0) &
    (df['Console_Brand'].isin(['Nintendo', 'Sony'])) &
    (df['Year_of_Release'] >= start_year)
]

top_25_japanese_exclusives_nintendo_sony_recent = japanese_exclusives_nintendo_sony_recent.sort_values(by='JP_Sales', ascending=False).head(25)

print(top_25_japanese_exclusives_nintendo_sony_recent[['Name', 'Platform', 'Year_of_Release', 'JP_Sales']])

                                                    Name Platform  \
1466                                       Yokai Watch 3      3DS   
3177                       Dragon Quest Monsters Joker 3      3DS   
3593                                    Yokai Sangokushi      3DS   
5074                                           Persona 5      PS4   
5898              Dragon Quest Builders: Revive Alefgard      PSV   
6163                              Monster Hunter Stories      3DS   
6086   Dragon Quest Heroes II: Twin Kings and the Pro...      PS4   
6247                                      Ace Attorney 6      3DS   
6952   Dragon Quest Heroes II: Twin Kings and the Pro...      PSV   
7079                                   Tales of Berseria      PS4   
7122    Puzzle & Dragons X: God Chapter / Dragon Chapter      3DS   
7559   Story of Seasons: Good Friends of the Three Vi...      3DS   
7770                                       Kan Colle Kai      PSV   
8023                  Jikkyou Powerful Pro Baseball 2016      PSV   
8283                  Jikkyou Powerful Pro Baseball 2016      PS4   
8612                                       Yakuza Kiwami      PS4   
9361   Dragon Quest Heroes II: Twin Kings and the Pro...      PS3   
9351                              Attack on Titan (KOEI)      PSV   
9955                  Jikkyou Powerful Pro Baseball 2016      PS3   
9699                                    Etrian Odyssey V      3DS   
9857                                       Yakuza Kiwami      PS3   
9834              Dragon Quest Builders: Revive Alefgard      PS3   
9723                                    Gundam Breaker 3      PSV   
9888                                   Tales of Berseria      PS3   
10406                      Fate/Extella: The Umbral Star      PS4   

       Year_of_Release  JP_Sales  
1466              2016      1.33  
3177              2016      0.63  
3593              2016      0.56  
5074              2016      0.37  
5898              2016      0.30  
6163              2016      0.28  
6086              2016      0.28  
6247              2016      0.27  
6952              2016      0.23  
7079              2016      0.23  
7122              2016      0.22  
7559              2016      0.20  
7770              2016      0.19  
8023              2016      0.18  
8283              2016      0.17  
8612              2016      0.16  
9361              2016      0.13  
9351              2016      0.13  
9955              2016      0.12  
9699              2016      0.12  
9857              2016      0.12  
9834              2016      0.12  
9723              2016      0.12  
9888              2016      0.12  
10406             2016      0.11

Nintendo Exclusives in Japan:

Filters the dataset for Nintendo games with sales in Japan but no sales in North America or Europe. Displays information about these games, including their names, platforms, release years, and sales figures in Japan. Top 25 Nintendo Exclusives in Japan:

Further narrows down to the top 25 Nintendo-exclusive titles in Japan based on Japanese sales, sorted in descending order. Nintendo and Sony Exclusives in Japan:

Expands the analysis to include both Nintendo and Sony consoles, identifying games exclusive to Japan. Lists the top 25 exclusive titles for these consoles based on Japanese sales. Recent Exclusives for Nintendo and Sony:

Focuses on recent exclusive titles for Nintendo and Sony consoles released in the last 7 years. Sorts these recent exclusives by Japanese sales and lists the top 25.

Sales of Top 20 games sold on Ninetndo in all regions to create a predictor on sales for games unreleased in EU, NA and Other

In [123]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

encoder = OneHotEncoder(sparse=False)
encoded_features = encoder.fit_transform(df[['Genre', 'Platform']])

X = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(['Genre', 'Platform']))

y_na = df['NA_Sales']
y_eu = df['EU_Sales']

# Split the data
X_train, X_test, y_train_na, y_test_na = train_test_split(X, y_na, test_size=0.2, random_state=42)
_, _, y_train_eu, y_test_eu = train_test_split(X, y_eu, test_size=0.2, random_state=42)

C:\Users\Luke Holmes\anaconda3\Lib\site-packages\sklearn\preprocessing\_encoders.py:975: FutureWarning:

`sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.

In [124]:

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np


X_train, X_test, y_train_na, y_test_na = train_test_split(X, y_na, test_size=0.2, random_state=42)
_, _, y_train_eu, y_test_eu = train_test_split(X, y_eu, test_size=0.2, random_state=42)

model_na = RandomForestRegressor(n_estimators=100, random_state=42)
model_na.fit(X_train, y_train_na)

model_eu = RandomForestRegressor(n_estimators=100, random_state=42)
model_eu.fit(X_train, y_train_eu)

predictions_na = model_na.predict(X_test)
mse_na = mean_squared_error(y_test_na, predictions_na)
rmse_na = np.sqrt(mse_na)

predictions_eu = model_eu.predict(X_test)
mse_eu = mean_squared_error(y_test_eu, predictions_eu)
rmse_eu = np.sqrt(mse_eu)

print(f"NA Sales - RMSE: {rmse_na}")
print(f"EU Sales - RMSE: {rmse_eu}")

NA Sales - RMSE: 0.7896461249670484
EU Sales - RMSE: 0.45157060603167687

In [125]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

encoder = OneHotEncoder(sparse=False)
encoded_features = encoder.fit_transform(df[['Genre', 'Platform']])
feature_names = encoder.get_feature_names_out(['Genre', 'Platform'])

df_encoded = pd.DataFrame(encoded_features, columns=feature_names)
X = df_encoded

y_na = df['NA_Sales']
y_eu = df['EU_Sales']

X_train, X_test, y_train_na, y_test_na = train_test_split(X, y_na, test_size=0.2, random_state=42)
X_train, X_test, y_train_eu, y_test_eu = train_test_split(X, y_eu, test_size=0.2, random_state=42)

model_na = RandomForestRegressor(n_estimators=100, random_state=42)
model_na.fit(X_train, y_train_na)
model_eu = RandomForestRegressor(n_estimators=100, random_state=42)
model_eu.fit(X_train, y_train_eu)


japan_only = df[(df['NA_Sales'] == 0) & (df['EU_Sales'] == 0) & (df['JP_Sales'] > 0)].copy()
japan_only_encoded = encoder.transform(japan_only[['Genre', 'Platform']])
X_japan_only = pd.DataFrame(japan_only_encoded, columns=feature_names)

japan_only['Predicted_NA_Sales'] = model_na.predict(X_japan_only)
japan_only['Predicted_EU_Sales'] = model_eu.predict(X_japan_only)

print(japan_only[['Name', 'JP_Sales', 'Predicted_NA_Sales', 'Predicted_EU_Sales']].sort_values(by='JP_Sales', ascending=False).head(25))

C:\Users\Luke Holmes\anaconda3\Lib\site-packages\sklearn\preprocessing\_encoders.py:975: FutureWarning:

`sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.

                                                   Name  JP_Sales  \
215                            Monster Hunter Freedom 3      4.87   
340                                   Friend Collection      3.67   
385                                    Monster Hunter 4      3.44   
428                Dragon Quest VI: Maboroshi no Daichi      3.19   
530                  Dragon Quest V: Tenkuu no Hanayome      2.78   
563                              Yokai Watch 2 Shinuchi      2.68   
574                  Super Mario Bros.: The Lost Levels      2.65   
632                                     Final Fantasy V      2.43   
674                             Dragon Warrior Monsters      2.34   
694                                 Yokai Watch Busters      2.29   
752                           Yu-Gi-Oh: Duel Monsters 4      2.17   
794                                      Derby Stallion      1.96   
1046  Kanshuu Nippon Joushikiryoku Kentei Kyoukai: I...      1.71   
1053                                    Super Puyo Puyo      1.69   
1115                 Dragon Quest V: Tenkuu no Hanayome      1.65   
1146                            Yu-Gi-Oh! Duel Monsters      1.61   
1197                            Dragon Quest Monsters 2      1.56   
1234                                            F1 Race      1.52   
1244                                   Puzzle & Dragons      1.51   
1260  Ninja Hattori Kun: Ninja wa Shuugyou Degogiru ...      1.50   
1319                                      Dragon Ball Z      1.45   
1318                                 4 Nin uchi Mahjong      1.45   
1320                      Game de Hakken!! Tamagotchi 2      1.44   
1317                                         Tamagotchi      1.44   
1383                                  Final Fantasy III      1.39   

      Predicted_NA_Sales  Predicted_EU_Sales  
215             0.047930            0.012652  
340             0.128834            0.052975  
385             0.317469            0.205164  
428             0.031011            0.001877  
530             0.031011            0.001877  
563             0.317469            0.205164  
574             2.319692            0.428603  
632             0.031011            0.001877  
674             1.789275            1.353097  
694             0.098080            0.063696  
752             1.789275            1.353097  
794             0.289029            0.149308  
1046            0.128834            0.052975  
1053            0.000000            0.000000  
1115            0.201493            0.110623  
1146            0.195546            0.095804  
1197            1.789275            1.353097  
1234            0.965338            0.266563  
1244            0.317469            0.205164  
1260            2.319692            0.428603  
1319            0.394720            0.123540  
1318            0.094296            0.051627  
1320            0.006350            0.003839  
1317            0.006350            0.003839  
1383            0.090169            0.000000

In [126]:

print(japan_only[['Name', 'JP_Sales', 'Predicted_NA_Sales', 'Predicted_EU_Sales']].sort_values(by='JP_Sales', ascending=False).head(10))

                                     Name  JP_Sales  Predicted_NA_Sales  \
215              Monster Hunter Freedom 3      4.87            0.047930   
340                     Friend Collection      3.67            0.128834   
385                      Monster Hunter 4      3.44            0.317469   
428  Dragon Quest VI: Maboroshi no Daichi      3.19            0.031011   
530    Dragon Quest V: Tenkuu no Hanayome      2.78            0.031011   
563                Yokai Watch 2 Shinuchi      2.68            0.317469   
574    Super Mario Bros.: The Lost Levels      2.65            2.319692   
632                       Final Fantasy V      2.43            0.031011   
674               Dragon Warrior Monsters      2.34            1.789275   
694                   Yokai Watch Busters      2.29            0.098080   

     Predicted_EU_Sales  
215            0.012652  
340            0.052975  
385            0.205164  
428            0.001877  
530            0.001877  
563            0.205164  
574            0.428603  
632            0.001877  
674            1.353097  
694            0.063696

In [127]:

japan_only['Total_Predicted_Sales_NA_EU'] = japan_only['Predicted_NA_Sales'] + japan_only['Predicted_EU_Sales']

top_10_sorted_by_predicted_sales = japan_only[['Name', 'JP_Sales', 'Predicted_NA_Sales', 'Predicted_EU_Sales', 'Total_Predicted_Sales_NA_EU']].sort_values(by='Total_Predicted_Sales_NA_EU', ascending=False).head(10)

print(top_10_sorted_by_predicted_sales)

overall_total_predicted_sales = top_10_sorted_by_predicted_sales['Total_Predicted_Sales_NA_EU'].sum()
print(f"\nOverall Total Predicted Sales for Top 10 Games in NA and EU: {overall_total_predicted_sales:.2f} Million")

                                                   Name  JP_Sales  \
1681                                            TwinBee      1.20   
9008                                     Sakura Wars GB      0.14   
6319               Medarot 3: Kabuto / Kuwagata Version      0.27   
2660                                Dragon Warrior I&II      0.77   
9046                            Star Ocean: Blue Sphere      0.14   
9874               Medarot 4: Kabuto / Kuwagata Version      0.12   
4931  From TV Animation One Piece: Yume no Lufy Kaiz...      0.39   
1856                           The Final Fantasy Legend      1.10   
3590                           Hamtaro: Ham-Hams Unite!      0.56   
6306  Fushigi no Dungeon: Fuurai no Shiren GB: Tsuki...      0.27   

      Predicted_NA_Sales  Predicted_EU_Sales  Total_Predicted_Sales_NA_EU  
1681            5.239961            0.183499                     5.423461  
9008            2.432167            0.911915                     3.344082  
6319            1.789275            1.353097                     3.142372  
2660            1.789275            1.353097                     3.142372  
9046            1.789275            1.353097                     3.142372  
9874            1.789275            1.353097                     3.142372  
4931            1.789275            1.353097                     3.142372  
1856            1.789275            1.353097                     3.142372  
3590            1.789275            1.353097                     3.142372  
6306            1.789275            1.353097                     3.142372  

Overall Total Predicted Sales for Top 10 Games in NA and EU: 33.91 Million

In [128]:

japan_only_recent = japan_only[(japan_only['Year_of_Release'] >= 2014) & (japan_only['Year_of_Release'] <= 2020)]

japan_only_recent['Total_Predicted_Sales_NA_EU'] = japan_only_recent['Predicted_NA_Sales'] + japan_only_recent['Predicted_EU_Sales']

top_10_recent_sorted_by_predicted_sales = japan_only_recent[['Name', 'Year_of_Release', 'JP_Sales', 'Predicted_NA_Sales', 'Predicted_EU_Sales', 'Total_Predicted_Sales_NA_EU']].sort_values(by='Total_Predicted_Sales_NA_EU', ascending=False).head(10)

print(top_10_recent_sorted_by_predicted_sales)

overall_total_predicted_sales_recent = top_10_recent_sorted_by_predicted_sales['Total_Predicted_Sales_NA_EU'].sum()
print(f"\nOverall Total Predicted Sales for Top 10 Recent Games in NA and EU: {overall_total_predicted_sales_recent:.2f} Million")

                                                    Name  Year_of_Release  \
12970                    Dead or Alive Xtreme 3: Fortune             2016   
8283                  Jikkyou Powerful Pro Baseball 2016             2016   
16358                                     Strider (2014)             2014   
16424               Mario vs. Donkey Kong: Tipping Stars             2015   
15919                                   Puyo Puyo Tetris             2014   
13544                  Kamen Rider: Battride War Genesis             2016   
10426                       Kamen Rider: Battride War II             2014   
15838  TV Anime Idolm@ster: Cinderella Girls G4U! Pac...             2015   
15813  TV Anime Idolm@ster: Cinderella Girls G4U! Pac...             2015   
15780  TV Anime Idolm@ster: Cinderella Girls G4U! Pac...             2015   

       JP_Sales  Predicted_NA_Sales  Predicted_EU_Sales  \
12970      0.05            0.438935            0.606618   
8283       0.17            0.438935            0.606618   
16358      0.01            0.375706            0.290873   
16424      0.01            0.359225            0.264461   
15919      0.02            0.359225            0.264461   
13544      0.04            0.301455            0.271769   
10426      0.10            0.301455            0.271769   
15838      0.02            0.301455            0.271769   
15813      0.02            0.301455            0.271769   
15780      0.02            0.301455            0.271769   

       Total_Predicted_Sales_NA_EU  
12970                     1.045553  
8283                      1.045553  
16358                     0.666579  
16424                     0.623686  
15919                     0.623686  
13544                     0.573224  
10426                     0.573224  
15838                     0.573224  
15813                     0.573224  
15780                     0.573224  

Overall Total Predicted Sales for Top 10 Recent Games in NA and EU: 6.87 Million

C:\Users\Luke Holmes\AppData\Local\Temp\ipykernel_20752\1393105134.py:3: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [129]:

japan_only_recent['Total_Predicted_Sales_NA_EU'] = japan_only_recent['Predicted_NA_Sales'] + japan_only_recent['Predicted_EU_Sales']

top_10_recent_sorted_by_predicted_sales = japan_only_recent[['Name', 'Year_of_Release', 'JP_Sales', 'Predicted_NA_Sales', 'Predicted_EU_Sales', 'Total_Predicted_Sales_NA_EU']].sort_values(by='Total_Predicted_Sales_NA_EU', ascending=False).head(10)

print(top_10_recent_sorted_by_predicted_sales)

overall_total_predicted_sales_recent = top_10_recent_sorted_by_predicted_sales['Total_Predicted_Sales_NA_EU'].sum()
print(f"\nOverall Total Predicted Sales for Top 10 Recent Games in NA and EU: {overall_total_predicted_sales_recent:.2f} Million")

                                                    Name  Year_of_Release  \
12970                    Dead or Alive Xtreme 3: Fortune             2016   
8283                  Jikkyou Powerful Pro Baseball 2016             2016   
16358                                     Strider (2014)             2014   
16424               Mario vs. Donkey Kong: Tipping Stars             2015   
15919                                   Puyo Puyo Tetris             2014   
13544                  Kamen Rider: Battride War Genesis             2016   
10426                       Kamen Rider: Battride War II             2014   
15838  TV Anime Idolm@ster: Cinderella Girls G4U! Pac...             2015   
15813  TV Anime Idolm@ster: Cinderella Girls G4U! Pac...             2015   
15780  TV Anime Idolm@ster: Cinderella Girls G4U! Pac...             2015   

       JP_Sales  Predicted_NA_Sales  Predicted_EU_Sales  \
12970      0.05            0.438935            0.606618   
8283       0.17            0.438935            0.606618   
16358      0.01            0.375706            0.290873   
16424      0.01            0.359225            0.264461   
15919      0.02            0.359225            0.264461   
13544      0.04            0.301455            0.271769   
10426      0.10            0.301455            0.271769   
15838      0.02            0.301455            0.271769   
15813      0.02            0.301455            0.271769   
15780      0.02            0.301455            0.271769   

       Total_Predicted_Sales_NA_EU  
12970                     1.045553  
8283                      1.045553  
16358                     0.666579  
16424                     0.623686  
15919                     0.623686  
13544                     0.573224  
10426                     0.573224  
15838                     0.573224  
15813                     0.573224  
15780                     0.573224  

Overall Total Predicted Sales for Top 10 Recent Games in NA and EU: 6.87 Million

C:\Users\Luke Holmes\AppData\Local\Temp\ipykernel_20752\2120181659.py:1: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [130]:

japan_only_recent.loc[:, 'Total_Predicted_Sales_NA_EU'] = japan_only_recent.loc[:, 'Predicted_NA_Sales'] + japan_only_recent.loc[:, 'Predicted_EU_Sales']

top_10_recent_sorted_by_predicted_sales = japan_only_recent.loc[:, ['Name', 'Year_of_Release', 'JP_Sales', 'Predicted_NA_Sales', 'Predicted_EU_Sales', 'Total_Predicted_Sales_NA_EU']].sort_values(by='Total_Predicted_Sales_NA_EU', ascending=False).head(10)

print(top_10_recent_sorted_by_predicted_sales)

overall_total_predicted_sales_recent = top_10_recent_sorted_by_predicted_sales.loc[:, 'Total_Predicted_Sales_NA_EU'].sum()
print(f"\nOverall Total Predicted Sales for Top 10 Recent Games in NA and EU: {overall_total_predicted_sales_recent:.2f} Million")

                                                    Name  Year_of_Release  \
12970                    Dead or Alive Xtreme 3: Fortune             2016   
8283                  Jikkyou Powerful Pro Baseball 2016             2016   
16358                                     Strider (2014)             2014   
16424               Mario vs. Donkey Kong: Tipping Stars             2015   
15919                                   Puyo Puyo Tetris             2014   
13544                  Kamen Rider: Battride War Genesis             2016   
10426                       Kamen Rider: Battride War II             2014   
15838  TV Anime Idolm@ster: Cinderella Girls G4U! Pac...             2015   
15813  TV Anime Idolm@ster: Cinderella Girls G4U! Pac...             2015   
15780  TV Anime Idolm@ster: Cinderella Girls G4U! Pac...             2015   

       JP_Sales  Predicted_NA_Sales  Predicted_EU_Sales  \
12970      0.05            0.438935            0.606618   
8283       0.17            0.438935            0.606618   
16358      0.01            0.375706            0.290873   
16424      0.01            0.359225            0.264461   
15919      0.02            0.359225            0.264461   
13544      0.04            0.301455            0.271769   
10426      0.10            0.301455            0.271769   
15838      0.02            0.301455            0.271769   
15813      0.02            0.301455            0.271769   
15780      0.02            0.301455            0.271769   

       Total_Predicted_Sales_NA_EU  
12970                     1.045553  
8283                      1.045553  
16358                     0.666579  
16424                     0.623686  
15919                     0.623686  
13544                     0.573224  
10426                     0.573224  
15838                     0.573224  
15813                     0.573224  
15780                     0.573224  

Overall Total Predicted Sales for Top 10 Recent Games in NA and EU: 6.87 Million

C:\Users\Luke Holmes\AppData\Local\Temp\ipykernel_20752\730015845.py:1: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [131]:

japan_only_recent_unique = japan_only_recent.drop_duplicates(subset=['Name', 'Year_of_Release'])

japan_only_recent_unique.loc[:, 'Total_Predicted_Sales_NA_EU'] = japan_only_recent_unique.loc[:, 'Predicted_NA_Sales'] + japan_only_recent_unique.loc[:, 'Predicted_EU_Sales']

top_10_recent_unique_sorted_by_predicted_sales = japan_only_recent_unique.loc[:, ['Name', 'Year_of_Release', 'JP_Sales', 'Predicted_NA_Sales', 'Predicted_EU_Sales', 'Total_Predicted_Sales_NA_EU']].sort_values(by='Total_Predicted_Sales_NA_EU', ascending=False).head(10)

print(top_10_recent_unique_sorted_by_predicted_sales)

overall_total_predicted_sales_recent_unique = top_10_recent_unique_sorted_by_predicted_sales.loc[:, 'Total_Predicted_Sales_NA_EU'].sum()
print(f"\nOverall Total Predicted Sales for Top 10 Recent Unique Games in NA and EU: {overall_total_predicted_sales_recent_unique:.2f} Million")

                                                    Name  Year_of_Release  \
12970                    Dead or Alive Xtreme 3: Fortune             2016   
16358                                     Strider (2014)             2014   
12723                        Samurai Warriors 4: Empires             2015   
15628              Natsuiro High School: Seishun Hakusho             2015   
9757         Mobile Suit Gundam Side Story: Missing Link             2014   
4016   Dragon Quest Heroes: The World's Tree Woe and ...             2015   
12494                             Attack on Titan (KOEI)             2016   
15554  TV Anime Idolm@ster: Cinderella Girls G4U! Pac...             2015   
15838  TV Anime Idolm@ster: Cinderella Girls G4U! Pac...             2015   
15813  TV Anime Idolm@ster: Cinderella Girls G4U! Pac...             2015   

       JP_Sales  Predicted_NA_Sales  Predicted_EU_Sales  \
12970      0.05            0.438935            0.606618   
16358      0.01            0.375706            0.290873   
12723      0.06            0.301455            0.271769   
15628      0.02            0.301455            0.271769   
9757       0.12            0.301455            0.271769   
4016       0.49            0.301455            0.271769   
12494      0.06            0.301455            0.271769   
15554      0.02            0.301455            0.271769   
15838      0.02            0.301455            0.271769   
15813      0.02            0.301455            0.271769   

       Total_Predicted_Sales_NA_EU  
12970                     1.045553  
16358                     0.666579  
12723                     0.573224  
15628                     0.573224  
9757                      0.573224  
4016                      0.573224  
12494                     0.573224  
15554                     0.573224  
15838                     0.573224  
15813                     0.573224  

Overall Total Predicted Sales for Top 10 Recent Unique Games in NA and EU: 6.30 Million

C:\Users\Luke Holmes\AppData\Local\Temp\ipykernel_20752\2979204632.py:3: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

Data Preparation:

Filters the dataset to remove entries with missing values in key columns and narrows down to games released by Nintendo that are exclusive to Japan, meaning they have sales in Japan but none in NA or EU. Feature Encoding:

Uses OneHotEncoder to transform categorical features ('Genre', 'Platform') into a format suitable for machine learning models.

Model Training:

Splits the encoded features into training and test sets. Trains RandomForestRegressor models to predict 'NA_Sales' and 'EU_Sales' based on the encoded features.

Prediction and Analysis:

Predicts 'NA_Sales' and 'EU_Sales' for Japanese-exclusive Nintendo games using the trained models. Compiles the predictions alongside actual 'JP_Sales' to assess which games might perform well if released in NA and EU.

Top Predicted Sales:

Sorts Japanese-exclusive games by their predicted sales in NA and EU to identify the top 10 games that could potentially be successful in these regions. Focuses on recent games (released in the last 7 years) to provide insights into current market trends and consumer preferences

Summary and Insights:

Presents a detailed list of the top 10 recent Japanese-exclusive Nintendo games, sorted by their total predicted sales in NA and EU. Calculates the overall total predicted sales for these top 10 games, offering a quantifiable insight into the potential market opportunity for introducing these games to NA and EU.

In [132]:

print(sales_by_year_genre)

     Year_of_Release         Genre  JP_Sales
0               1985        Action      1.44
1               1985      Fighting      1.05
2               1985      Platform      8.67
3               1985        Puzzle      1.66
4               1985       Shooter      0.21
..               ...           ...       ...
337             2015  Role-Playing      6.80
338             2015       Shooter      2.73
339             2015    Simulation      1.60
340             2015        Sports      0.73
341             2015      Strategy      0.16

[342 rows x 3 columns]

In [133]:

import pandas as pd

# Set option to display up to 342 rows
pd.set_option('display.max_rows', 342)

# Now, when you display the DataFrame, it will show all 342 rows
display(sales_by_year_genre)

# Reset display option to default if needed
pd.reset_option('display.max_rows')

	Year_of_Release	Genre	JP_Sales
0	1985	Action	1.44
1	1985	Fighting	1.05
2	1985	Platform	8.67
3	1985	Puzzle	1.66
4	1985	Shooter	0.21
5	1985	Simulation	0.00
6	1985	Sports	1.53
7	1986	Action	5.31
8	1986	Platform	8.11
9	1986	Racing	0.41
10	1986	Role-Playing	0.52
11	1986	Shooter	2.77
12	1986	Sports	2.69
13	1987	Action	0.00
14	1987	Adventure	1.61
15	1987	Fighting	1.87
16	1987	Platform	0.69
17	1987	Role-Playing	4.18
18	1987	Shooter	0.00
19	1987	Sports	3.28
20	1988	Action	0.42
21	1988	Platform	5.70
22	1988	Puzzle	1.81
23	1988	Racing	0.19
24	1988	Role-Playing	5.78
25	1988	Shooter	0.00
26	1988	Simulation	0.00
27	1988	Sports	1.86
28	1989	Action	0.31
29	1989	Misc	1.28
30	1989	Platform	4.97
31	1989	Puzzle	6.77
32	1989	Role-Playing	2.20
33	1989	Shooter	0.36
34	1989	Sports	2.47
35	1990	Action	1.01
36	1990	Platform	4.28
37	1990	Puzzle	1.93
38	1990	Racing	1.48
39	1990	Role-Playing	4.42
40	1990	Simulation	0.48
41	1990	Sports	1.28
42	1991	Action	2.06
43	1991	Adventure	1.04
44	1991	Fighting	0.39
45	1991	Misc	0.08
46	1991	Platform	1.91
47	1991	Puzzle	2.11
48	1991	Racing	0.15
49	1991	Role-Playing	2.81
50	1991	Shooter	0.50
51	1991	Simulation	0.91
52	1991	Sports	1.88
53	1991	Strategy	0.94
54	1992	Action	0.54
55	1992	Adventure	2.97
56	1992	Fighting	5.75
57	1992	Misc	1.81
58	1992	Platform	2.79
59	1992	Puzzle	2.68
60	1992	Racing	4.09
61	1992	Role-Playing	6.83
62	1992	Shooter	0.26
63	1992	Simulation	0.00
64	1992	Sports	0.90
65	1992	Strategy	0.29
66	1993	Action	0.92
67	1993	Adventure	0.07
68	1993	Fighting	6.11
69	1993	Misc	0.30
70	1993	Platform	5.21
71	1993	Puzzle	2.39
72	1993	Racing	0.36
73	1993	Role-Playing	5.25
74	1993	Shooter	0.89
75	1993	Simulation	0.19
76	1993	Sports	3.13
77	1993	Strategy	0.51
78	1994	Action	0.84
79	1994	Adventure	0.87
80	1994	Fighting	4.44
81	1994	Misc	0.46
82	1994	Platform	7.02
83	1994	Puzzle	1.53
84	1994	Racing	2.13
85	1994	Role-Playing	6.23
86	1994	Shooter	1.67
87	1994	Simulation	2.10
88	1994	Sports	5.23
89	1994	Strategy	1.47
90	1995	Action	1.26
91	1995	Adventure	0.67
92	1995	Fighting	7.05
93	1995	Misc	2.43
94	1995	Platform	6.36
95	1995	Puzzle	2.34
96	1995	Racing	1.95
97	1995	Role-Playing	13.09
98	1995	Shooter	2.85
99	1995	Simulation	2.03
100	1995	Sports	3.87
101	1995	Strategy	1.85
102	1996	Action	2.62
103	1996	Adventure	2.60
104	1996	Fighting	6.12
105	1996	Misc	3.47
106	1996	Platform	6.76
107	1996	Puzzle	1.36
108	1996	Racing	5.30
109	1996	Role-Playing	18.78
110	1996	Shooter	1.11
111	1996	Simulation	2.38
112	1996	Sports	5.08
113	1996	Strategy	1.86
114	1997	Action	1.90
115	1997	Adventure	1.05
116	1997	Fighting	2.60
117	1997	Misc	0.98
118	1997	Platform	3.71
119	1997	Puzzle	3.83
120	1997	Racing	4.09
121	1997	Role-Playing	11.56
122	1997	Shooter	2.17
123	1997	Simulation	5.91
124	1997	Sports	7.59
125	1997	Strategy	3.48
126	1998	Action	5.55
127	1998	Adventure	2.91
128	1998	Fighting	4.18
129	1998	Misc	2.88
130	1998	Platform	3.29
131	1998	Puzzle	1.84
132	1998	Racing	2.30
133	1998	Role-Playing	11.33
134	1998	Shooter	0.36
135	1998	Simulation	3.36
136	1998	Sports	5.06
137	1998	Strategy	6.98
138	1999	Action	2.90
139	1999	Adventure	1.11
140	1999	Fighting	2.83
141	1999	Misc	4.32
142	1999	Platform	1.66
143	1999	Puzzle	0.02
144	1999	Racing	3.35
145	1999	Role-Playing	18.70
146	1999	Shooter	0.29
147	1999	Simulation	6.12
148	1999	Sports	6.83
149	1999	Strategy	4.21
150	2000	Action	3.74
151	2000	Adventure	1.11
152	2000	Fighting	3.05
153	2000	Misc	2.77
154	2000	Platform	2.56
155	2000	Puzzle	0.98
156	2000	Racing	1.15
157	2000	Role-Playing	15.80
158	2000	Shooter	0.20
159	2000	Simulation	1.27
160	2000	Sports	6.36
161	2000	Strategy	3.78
162	2001	Action	5.99
163	2001	Adventure	1.07
164	2001	Fighting	2.73
165	2001	Misc	1.26
166	2001	Platform	3.54
167	2001	Puzzle	0.70
168	2001	Racing	4.00
169	2001	Role-Playing	9.55
170	2001	Shooter	0.53
171	2001	Simulation	3.77
172	2001	Sports	3.90
173	2001	Strategy	2.82
174	2002	Action	5.10
175	2002	Adventure	3.23
176	2002	Fighting	3.14
177	2002	Misc	2.90
178	2002	Platform	4.52
179	2002	Puzzle	0.17
180	2002	Racing	0.20
181	2002	Role-Playing	13.64
182	2002	Shooter	1.02
183	2002	Simulation	0.75
184	2002	Sports	4.69
185	2002	Strategy	2.40
186	2003	Action	4.19
187	2003	Adventure	0.43
188	2003	Fighting	2.95
189	2003	Misc	3.58
190	2003	Platform	2.22
191	2003	Puzzle	1.14
192	2003	Racing	2.83
193	2003	Role-Playing	9.63
194	2003	Shooter	0.21
195	2003	Simulation	1.46
196	2003	Sports	4.00
197	2003	Strategy	1.56
198	2004	Action	4.82
199	2004	Adventure	1.23
200	2004	Fighting	1.34
201	2004	Misc	2.43
202	2004	Platform	6.11
203	2004	Puzzle	2.55
204	2004	Racing	1.99
205	2004	Role-Playing	13.49
206	2004	Shooter	0.65
207	2004	Simulation	0.58
208	2004	Sports	4.42
209	2004	Strategy	2.04
210	2005	Action	6.32
211	2005	Adventure	0.74
212	2005	Fighting	3.07
213	2005	Misc	7.96
214	2005	Platform	1.12
215	2005	Puzzle	6.21
216	2005	Racing	4.37
217	2005	Role-Playing	9.22
218	2005	Shooter	1.32
219	2005	Simulation	9.02
220	2005	Sports	3.54
221	2005	Strategy	1.38
222	2006	Action	5.78
223	2006	Adventure	2.62
224	2006	Fighting	2.82
225	2006	Misc	15.74
226	2006	Platform	9.21
227	2006	Puzzle	3.40
228	2006	Racing	0.87
229	2006	Role-Playing	19.83
230	2006	Shooter	1.89
231	2006	Simulation	2.38
232	2006	Sports	8.69
233	2006	Strategy	0.51
234	2007	Action	6.13
235	2007	Adventure	3.55
236	2007	Fighting	2.73
237	2007	Misc	11.46
238	2007	Platform	2.98
239	2007	Puzzle	3.06
240	2007	Racing	0.94
241	2007	Role-Playing	12.43
242	2007	Shooter	1.53
243	2007	Simulation	3.02
244	2007	Sports	9.59
245	2007	Strategy	2.87
246	2008	Action	6.00
247	2008	Adventure	4.34
248	2008	Fighting	6.14
249	2008	Misc	7.24
250	2008	Platform	2.36
251	2008	Puzzle	0.92
252	2008	Racing	4.21
253	2008	Role-Playing	17.51
254	2008	Shooter	0.77
255	2008	Simulation	3.66
256	2008	Sports	5.37
257	2008	Strategy	1.73
258	2009	Action	11.96
259	2009	Adventure	3.35
260	2009	Fighting	3.02
261	2009	Misc	5.80
262	2009	Platform	4.84
263	2009	Puzzle	1.65
264	2009	Racing	0.50
265	2009	Role-Playing	17.07
266	2009	Shooter	1.12
267	2009	Simulation	0.95
268	2009	Sports	9.81
269	2009	Strategy	1.82
270	2010	Action	8.58
271	2010	Adventure	3.60
272	2010	Fighting	1.52
273	2010	Misc	6.69
274	2010	Platform	3.68
275	2010	Puzzle	0.49
276	2010	Racing	1.04
277	2010	Role-Playing	23.67
278	2010	Shooter	2.12
279	2010	Simulation	1.66
280	2010	Sports	4.42
281	2010	Strategy	2.02
282	2011	Action	10.82
283	2011	Adventure	3.83
284	2011	Fighting	2.94
285	2011	Misc	4.74
286	2011	Platform	3.50
287	2011	Puzzle	0.77
288	2011	Racing	3.14
289	2011	Role-Playing	14.43
290	2011	Shooter	2.69
291	2011	Simulation	1.23
292	2011	Sports	3.08
293	2011	Strategy	1.90
294	2012	Action	12.31
295	2012	Adventure	2.42
296	2012	Fighting	1.79
297	2012	Misc	3.66
298	2012	Platform	3.84
299	2012	Puzzle	0.96
300	2012	Racing	0.20
301	2012	Role-Playing	14.40
302	2012	Shooter	2.62
303	2012	Simulation	5.89
304	2012	Sports	2.60
305	2012	Strategy	1.11
306	2013	Action	10.91
307	2013	Adventure	2.22
308	2013	Fighting	1.28
309	2013	Misc	3.59
310	2013	Platform	2.10
311	2013	Puzzle	0.25
312	2013	Racing	0.54
313	2013	Role-Playing	19.88
314	2013	Shooter	1.74
315	2013	Simulation	2.21
316	2013	Sports	2.15
317	2013	Strategy	0.82
318	2014	Action	6.53
319	2014	Adventure	1.38
320	2014	Fighting	4.83
321	2014	Misc	2.42
322	2014	Platform	1.27
323	2014	Puzzle	0.41
324	2014	Racing	1.47
325	2014	Role-Playing	17.75
326	2014	Shooter	1.08
327	2014	Simulation	0.38
328	2014	Sports	1.61
329	2014	Strategy	0.56
330	2015	Action	15.98
331	2015	Adventure	1.04
332	2015	Fighting	0.79
333	2015	Misc	2.15
334	2015	Platform	1.31
335	2015	Puzzle	0.52
336	2015	Racing	0.28
337	2015	Role-Playing	6.80
338	2015	Shooter	2.73
339	2015	Simulation	1.60
340	2015	Sports	0.73
341	2015	Strategy	0.16

In [134]:

# Data behind the Plotly Interactive Stacked Bar Chart for Console Brand Sales
print("Console Brand Sales Data:")
print(console_brand_region_sales_filtered)

Console Brand Sales Data:
  Console_Brand  JP_Sales  NA_Sales  EU_Sales  Other_Sales
0     Microsoft      5.62    368.95    182.69        52.70
1      Nintendo    131.61    281.95    165.76        42.21
2            PC      0.00     35.35     59.72        13.12
3          Sony    114.51    292.37    285.09       114.66

In [135]:

# Data preparation for area plot
genre_region_sales = df.groupby('Genre')['JP_Sales', 'NA_Sales', 'EU_Sales', 'Other_Sales'].sum()

print("Game Sales by Genre across Different Regions:")
print(genre_region_sales)

Game Sales by Genre across Different Regions:
              JP_Sales  NA_Sales  EU_Sales  Other_Sales
Genre                                                  
Action          160.15    863.17    510.99       182.22
Adventure        52.24    101.52     63.20        16.37
Fighting         87.28    220.51     99.00        35.87
Misc            107.02    399.57    210.60        73.10
Platform        130.71    444.44    199.39        51.01
Puzzle           56.68    121.13     49.78        12.30
Racing           56.63    356.86    234.49        75.51
Role-Playing    353.44    330.04    188.24        59.41
Shooter          38.68    584.83    314.52       103.33
Simulation       63.64    180.67    113.35        30.59
Sports          134.93    671.20    371.32       132.21
Strategy         49.30     67.75     44.79        10.73

C:\Users\Luke Holmes\AppData\Local\Temp\ipykernel_20752\2891665138.py:2: FutureWarning:

Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.

In [136]:

# Data behind the heatmap
print("Heatmap Data for Game Sales by Genre Across Regions:")
print(sales_heatmap_data)

Heatmap Data for Game Sales by Genre Across Regions:
              NA_Sales  EU_Sales  JP_Sales
Genre                                     
Action          863.17    510.99    160.15
Adventure       101.52     63.20     52.24
Fighting        220.51     99.00     87.28
Misc            399.57    210.60    107.02
Platform        444.44    199.39    130.71
Puzzle          121.13     49.78     56.68
Racing          356.86    234.49     56.63
Role-Playing    330.04    188.24    353.44
Shooter         584.83    314.52     38.68
Simulation      180.67    113.35     63.64
Sports          671.20    371.32    134.93
Strategy         67.75     44.79     49.30

In [137]:

# Data behind the top games sales across EU, NA, and JP regions
top_games_long = top_games.melt(id_vars=['Name', 'Genre', 'Console_Brand'], value_vars=['EU_Sales', 'NA_Sales', 'JP_Sales'], var_name='Region', value_name='Sales')

print("Top Games Sales Data Across Regions:")
print(top_games_long)

Top Games Sales Data Across Regions:
                                            Name         Genre Console_Brand  \
0                                     Wii Sports        Sports      Nintendo   
1                                 Mario Kart Wii        Racing      Nintendo   
2                                     Nintendogs    Simulation      Nintendo   
3                              Wii Sports Resort        Sports      Nintendo   
4   Brain Age: Train Your Brain in Minutes a Day          Misc      Nintendo   
5                              Super Mario Bros.      Platform      Nintendo   
6                                      Duck Hunt       Shooter      Nintendo   
7                                         Tetris        Puzzle      Nintendo   
8                       Pokemon Red/Pokemon Blue  Role-Playing      Nintendo   
9                    Pokemon Gold/Pokemon Silver  Role-Playing      Nintendo   
10                         New Super Mario Bros.      Platform      Nintendo   
11                 Pokemon Diamond/Pokemon Pearl  Role-Playing      Nintendo   
12                                    Wii Sports        Sports      Nintendo   
13                                Mario Kart Wii        Racing      Nintendo   
14                                    Nintendogs    Simulation      Nintendo   
15                             Wii Sports Resort        Sports      Nintendo   
16  Brain Age: Train Your Brain in Minutes a Day          Misc      Nintendo   
17                             Super Mario Bros.      Platform      Nintendo   
18                                     Duck Hunt       Shooter      Nintendo   
19                                        Tetris        Puzzle      Nintendo   
20                      Pokemon Red/Pokemon Blue  Role-Playing      Nintendo   
21                   Pokemon Gold/Pokemon Silver  Role-Playing      Nintendo   
22                         New Super Mario Bros.      Platform      Nintendo   
23                 Pokemon Diamond/Pokemon Pearl  Role-Playing      Nintendo   
24                                    Wii Sports        Sports      Nintendo   
25                                Mario Kart Wii        Racing      Nintendo   
26                                    Nintendogs    Simulation      Nintendo   
27                             Wii Sports Resort        Sports      Nintendo   
28  Brain Age: Train Your Brain in Minutes a Day          Misc      Nintendo   
29                             Super Mario Bros.      Platform      Nintendo   
30                                     Duck Hunt       Shooter      Nintendo   
31                                        Tetris        Puzzle      Nintendo   
32                      Pokemon Red/Pokemon Blue  Role-Playing      Nintendo   
33                   Pokemon Gold/Pokemon Silver  Role-Playing      Nintendo   
34                         New Super Mario Bros.      Platform      Nintendo   
35                 Pokemon Diamond/Pokemon Pearl  Role-Playing      Nintendo   

      Region  Sales  
0   EU_Sales  28.96  
1   EU_Sales  12.76  
2   EU_Sales  10.95  
3   EU_Sales  10.93  
4   EU_Sales   9.20  
5   EU_Sales   3.58  
6   EU_Sales   0.63  
7   EU_Sales   2.26  
8   EU_Sales   8.89  
9   EU_Sales   6.18  
10  EU_Sales   9.14  
11  EU_Sales   4.46  
12  NA_Sales  41.36  
13  NA_Sales  15.68  
14  NA_Sales   9.05  
15  NA_Sales  15.61  
16  NA_Sales   4.74  
17  NA_Sales  29.08  
18  NA_Sales  26.93  
19  NA_Sales  23.20  
20  NA_Sales  11.27  
21  NA_Sales   9.00  
22  NA_Sales  11.28  
23  NA_Sales   6.38  
24  JP_Sales   3.77  
25  JP_Sales   3.79  
26  JP_Sales   1.93  
27  JP_Sales   3.28  
28  JP_Sales   4.16  
29  JP_Sales   6.81  
30  JP_Sales   0.28  
31  JP_Sales   4.22  
32  JP_Sales  10.22  
33  JP_Sales   7.20  
34  JP_Sales   6.50  
35  JP_Sales   6.04

In [138]:

# Data preparation for radar chart
print("Data for Radar Chart of Top Game Sales Across Regions (Lumped Series):")
print(top_games[['Lumped_and_Customized_Name', 'NA_Sales', 'EU_Sales', 'JP_Sales']])

Data for Radar Chart of Top Game Sales Across Regions (Lumped Series):
                           Lumped_and_Customized_Name  NA_Sales  EU_Sales  \
0                              Wii Sports (2006, Wii)     41.36     28.96   
2                      Super Mario Series (2008, Wii)     15.68     12.76   
10                              Nintendogs (2005, DS)      9.05     10.95   
3                       Wii Sports Resort (2009, Wii)     15.61     10.93   
19  Brain Age: Train Your Brain in Minutes a Day (...      4.74      9.20   
1                      Super Mario Series (1985, NES)     29.08      3.58   
9                               Duck Hunt (1984, NES)     26.93      0.63   
5                                   Tetris (1989, GB)     23.20      2.26   
4                           Pokemon Series (1996, GB)     11.27      8.89   
12                          Pokemon Series (1999, GB)      9.00      6.18   
6                       Super Mario Series (2006, DS)     11.28      9.14   
20                          Pokemon Series (2006, DS)      6.38      4.46   

    JP_Sales  
0       3.77  
2       3.79  
10      1.93  
3       3.28  
19      4.16  
1       6.81  
9       0.28  
5       4.22  
4      10.22  
12      7.20  
6       6.50  
20      6.04

In [139]:

# Data behind the normalized top game series sales across regions
print("Normalized Top Game Series Sales Across Regions:")
print(top_series_sales)

Normalized Top Game Series Sales Across Regions:
               Game_Series  NA_Sales  EU_Sales  JP_Sales  Total_Sales
9235    Super Mario Series  1.000000  1.000000  1.000000     3.000000
7273        Pokemon Series  0.481939  0.767183  0.999102     2.248224
1224   Call of Duty Series  0.434757  0.730335  0.042365     1.207458
10880    Wii Sports Series  0.336157  0.602045  0.106138     1.044340
3684    Grand Theft Auto V  0.140252  0.352233  0.021257     0.513743
5537        Mario Kart Wii  0.092246  0.191909  0.056737     0.340891
9603                Tetris  0.153959  0.044368  0.090269     0.288596
10878             Wii Play  0.082127  0.138066  0.043862     0.264055
6746            Nintendogs  0.053242  0.164686  0.028892     0.246820
5536         Mario Kart DS  0.057124  0.112348  0.061826     0.231298

In [140]:

# Ensure to replace 'df' with your actual DataFrame name and adjust 'start_year' and 'end_year' accordingly
df_filtered = df[(df['Year_of_Release'] >= 1985) & (df['Year_of_Release'] <= 2015)].dropna(subset=['Year_of_Release', 'Genre'])
df_filtered['Year_of_Release'] = df_filtered['Year_of_Release'].astype(int)

sales_by_year_genre = df_filtered.groupby(['Year_of_Release', 'Genre'])['NA_Sales', 'EU_Sales', 'JP_Sales'].sum().reset_index()

print("Genre Popularity Over Time by Region Data:")
print(sales_by_year_genre)

Genre Popularity Over Time by Region Data:
     Year_of_Release         Genre  NA_Sales  EU_Sales  JP_Sales
0               1985        Action      1.64      0.38      1.44
1               1985      Fighting      0.00      0.00      1.05
2               1985      Platform     29.93      3.77      8.67
3               1985        Puzzle      1.33      0.19      1.66
4               1985       Shooter      0.65      0.14      0.21
..               ...           ...       ...       ...       ...
337             2015  Role-Playing     13.93     12.93      6.80
338             2015       Shooter     31.94     24.38      2.73
339             2015    Simulation      1.20      2.47      1.60
340             2015        Sports     18.77     15.81      0.73
341             2015      Strategy      0.58      0.98      0.16

[342 rows x 5 columns]

C:\Users\Luke Holmes\AppData\Local\Temp\ipykernel_20752\784631465.py:5: FutureWarning:

Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.

In [ ]: