#!/usr/bin/env python
# coding: utf-8

# # 1-3. DATA CLEANING / DATA WRANGLING

# In[6]:

# Importing libraries
import pandas as pd
pd.options.display.max_columns = None
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


# In[7]:

# Load the FIFA21 players dataset
df = pd.read_csv('fifa21_male2.csv')


# In[8]:

# Review of data fields
df.head(5)


# In[9]:

# Drop the rows whose club is 'Colombia'; copy so that the column
# assignments below operate on an independent frame, not a view of the CSV.
df = df.loc[df['Club'] != 'Colombia'].copy()


# In[10]:

# Show the DataFrame's shape
df.shape


# In[11]:

# Print all columns in the dataset for better understanding
def col_data():
    """Print the name of every column of the global ``df``, one per line."""
    for column_name in df.columns.tolist():
        print(column_name)


col_data()


# In[12]:

# Make sure there are no duplicate columns
if df.columns.duplicated().any():
    print("Hay columnas repetidas.")
else:
    print("No hay columnas repetidas.")


# In[13]:

# Standardize column names: lower case, spaces replaced by underscores
df.columns = [e.lower().replace(' ', '_') for e in df.columns]
df.columns


# In[14]:

# Delete columns that are irrelevant for the analysis
# ('team_&_contract' and 'contract' repeat the club/contract information)
df = df.drop(columns=['player_photo', 'club_logo', 'flag_photo', 'gender',
                      'team_&_contract', 'contract', 'id', 'loan_date_end'])
df


# In[15]:

# Checking data types
for col_name, dtype in df.dtypes.items():
    print(col_name, "is", dtype)


# In[16]:

# Converting weight, height and hits into numerical types
# (raw regex literal; expand=False returns a Series rather than a 1-column frame)
df['weight'] = df['weight'].str.extract(r'(\d+)', expand=False).astype(int)
df['weight']


# In[17]:

def height_to_cm(heights):
    """Parse a height column into whole centimetres.

    Values written as feet and inches (``6'2"``) are converted to cm.
    Any other value keeps its first run of digits, which is what the
    previous ``extract('(\\d+)')`` did for every value (it therefore reduced
    ``6'2"`` to ``6``).

    NOTE(review): assumes the raw column is either feet'inches" or a plain
    number with a unit suffix - confirm against the CSV.

    Raises:
        ValueError: if a value contains no digits at all.
    """
    text = heights.astype(str)
    feet_inches = text.str.extract(r"^\s*(\d+)\s*'\s*(\d+)").astype(float)
    from_imperial = (feet_inches[0] * 12 + feet_inches[1]) * 2.54
    first_number = pd.to_numeric(text.str.extract(r'(\d+)', expand=False))
    return from_imperial.fillna(first_number).round().astype(int)


df['height'] = height_to_cm(df['height'])
df['height']


# In[18]:

valores_unicos = df['hits'].unique()
print(valores_unicos)


# In[19]:

def clean_hits(x):
    """Expand a 'K'-suffixed hit count into a float (``'1.8K'`` -> ``1800.0``).

    Values without the suffix are returned as their string form and are
    converted later by ``pd.to_numeric``.
    """
    x = str(x)
    if x.endswith('K'):
        return float(x.replace('K', '')) * 1000
    return x


# In[20]:

# Quick manual check of the helper
x = '1.8K'
test = clean_hits(x)
test


# In[21]:

df['hits'] = df['hits'].apply(clean_hits)
df['hits'] = pd.to_numeric(df['hits'])


# In[22]:

df.hits.dtypes


# In[23]:

df.head(5)


# In[24]:

# Numerical Data Cleaning


# In[25]:

financials = ['value', 'wage', 'release_clause']


def clean_value(x):
    """Convert a money string such as ``'€67.5M'`` or ``'€500K'`` to a float.

    The euro sign is stripped and the ``K``/``M`` suffix is applied as a
    multiplier. The previous implementation replaced ``K`` with the text
    ``'000'``, which turned ``'1.5K'`` into ``1.5`` instead of ``1500``.
    Values without a suffix are returned as strings (euro sign removed) and
    are converted later by ``pd.to_numeric``.
    """
    x = str(x).replace('€', '')
    if x.endswith('K'):
        return float(x[:-1]) * 1000
    if x.endswith('M'):
        return float(x[:-1]) * 1000000
    return x


# In[26]:

# Clean and convert every financial column in one pass
for column in financials:
    df[column] = pd.to_numeric(df[column].apply(clean_value))


# In[33]:

df.head(1)


# In[34]:

# Remove the star symbols
star_columns = ['w/f', 'sm', 'ir']

# Check unique values
for column in star_columns:
    print(df[column].unique())


# In[35]:

# Select only the first character from the string, then convert the data type
# to integer and check the operation was successful
for column in star_columns:
    df[column] = df[column].str[0]
    df[column] = pd.to_numeric(df[column], errors='raise')
    print(df[column].dtypes)


# In[36]:

df.head(1)


# In[37]:

# Checking for duplicate rows in the data to remove if any
df.duplicated().sum()


# In[38]:

# Percentage of nulls per column (columns above 75% are candidates for removal)
print("Nulls")
for i in df.columns:
    print(i, "contains", 100 * df[i].isnull().sum() / len(df[i]), "%")


# In[39]:

# Split data into categorical and numerical to deal with nulls and check unique values


# In[40]:

df_cat = df.select_dtypes(include=['object'])
df_cat.head(5)


# In[41]:

df_cat.columns


# In[42]:

# Delete the name column and the object columns that actually hold numeric
# ratings, because they are not relevant for the categorical analysis
df_cat = df_cat.drop(columns=['name', 'joined', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw',
                              'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb',
                              'ldm', 'cdm', 'rdm', 'rwb', 'lb', 'lcb', 'cb', 'rcb', 'rb',
                              'gk', 'ls'])


# In[43]:

df_cat


# In[44]:

# Checking unique values
df_cat.value_counts()


# In[45]:

# Check whether any column holds a single unique value
def check_unique_values(df_cat):
    """Return the names of the columns of ``df_cat`` that hold one distinct value.

    NaN counts as a value. The previous version ignored its argument and
    scanned the global ``df`` instead.
    """
    return [
        column
        for column in df_cat.columns
        if df_cat[column].nunique(dropna=False) == 1
    ]


check_unique_values(df_cat)


# In[46]:

# Check nulls in the categorical data
df_cat.isnull().sum()


# In[47]:

# Fill null categorical values with the mode


# In[48]:

# Assign the result back: `fillna(..., inplace=True)` on a column selection is
# chained assignment and does not reliably update the parent frame.
for column in ['club', 'position', 'a/w', 'd/w']:
    df_cat[column] = df_cat[column].fillna(df_cat[column].mode()[0])


# In[49]:

df_cat.isnull().sum()


# In[50]:

# Save numerical data (copy, because columns are overwritten below)
df_num = df.select_dtypes(exclude=["object"]).copy()
df_num


# In[51]:

# Check null values in descending order
df_num.isnull().sum().sort_values(ascending=False)


# In[52]:

# Fill nulls with the column mean
for column in ['composure', 'curve', 'jumping', 'vision', 'balance', 'agility',
               'volleys', 'sliding_tackle', 'interceptions', 'positioning']:
    df_num[column] = df_num[column].fillna(df_num[column].mean())
df_num


# In[53]:

df_num.isnull().sum()


# In[54]:

# Save "cleaner" data to csv


# In[55]:

df_num.shape


# In[56]:

df_cat.shape


# In[57]:

df_concat = pd.concat([df_num, df_cat], axis=1)


# In[58]:

df_concat.to_csv('df_concat.csv')


# In[59]:

df_concat


# ## 3-4. DATA WRANGLING, EDA
#
# Business Case:
#
# ## Let's imagine that one of the top teams in the world is interested in signing a player for the next season and has to make a decision soon.
# We will identify the best options by answering the following questions:
#
# 1. Who are the 10 best players according to their statistics and performance?
# 2. Who are the best players in every position in the game?
# 3. Who are the 5 players under 25 years old with the highest potential?
# 4. Players with the highest release clauses and their ratings.
# 5. What is the market value of the top 10 players?

# In[60]:

df_concat.describe()


# ## 1. Who are the 10 best players according to their statistics and performance?

# In[61]:

# Rank by overall rating, breaking ties with base stats, and number the rows from 1
top_players = (
    df.sort_values(['ova', 'base_stats'], ascending=False)
    .head(10)
    .reset_index(drop=True)
)
top_players.index = top_players.index + 1
top_players.index.name = 'Rank'
top_players


# In[62]:

# Bar chart of the 10 best players by OVA (one vertical bar per player)


# In[63]:

sns.set(style="whitegrid")
plt.figure(figsize=(10, 6))
ova_axes = sns.barplot(data=top_players, x="name", y="ova", palette="Blues_r")

# Title and tick-label formatting
ova_axes.set_title("TOP 10 PLAYERS BY OVA", fontsize=18)
plt.setp(ova_axes.get_xticklabels(), rotation=45, ha='right', fontsize=12)

# Show the chart
plt.show()


# ## 2. Who are the best players in every position in the game?

# In[64]:

# The `counter` function takes one position-rating column (for example 'ls')
# and turns every value into a single integer: the number at the start of the
# value plus the digit at its end. This collapses each rating into one number
# per position.
def counter(df):
    """Collapse rating strings such as ``'85+3'`` into integers (``88``).

    Args:
        df: a pandas Series of strings (despite the name, not a DataFrame),
            each holding a base rating optionally followed by a signed
            modifier.

    Returns:
        An integer Series with base + modifier for every row.

    Raises:
        ValueError: if a value is not of the form ``<digits>[+-<digits>]``.

    The values are parsed instead of sliced by character position. Slicing
    added the last digit twice for values without a modifier (``'85'`` gave
    90) and treated a negative modifier as positive.
    """
    parts = df.str.strip().str.extract(r"^(\d+)([+-]\d+)?$")
    base = parts[0].astype(int)
    modifier = parts[1].fillna("0").astype(int)
    return base + modifier


# Select the relevant columns
position_columns = ['ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam', 'cam', 'ram',
                    'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb', 'ldm', 'cdm', 'rdm', 'rwb',
                    'lb', 'lcb', 'cb', 'rcb', 'rb']
positions = df[['name'] + position_columns]

# Drop every row with a missing value
positions = positions.dropna()

# Build a frame ("Position") without the 'name' column and apply `counter` to
# every column so that each rating becomes a single number.
Position = positions.drop('name', axis=1)
for x in Position.columns:
    Position[x] = counter(Position[x])

# For every position column, find the highest rating and the player who holds
# it. The lookup is positional (first row holding the maximum), so `Position`
# and `positions` only need to share row order, not index labels.
for x in Position.columns:
    rating = Position[x].max()
    best_row = Position[x].to_numpy().argmax()
    player = positions['name'].iloc[best_row]
    best_player = f'Best {x}, is {player} with {rating} rating'
    print(best_player)


# ### 3. Who are the 5 players under 25 years old with the highest potential?
# In[65]:

# "Under 25" is a strict bound and the question is about potential, so filter
# with `<` and rank by `pot` (ties broken by current overall rating). The
# previous version used `<= 25` and ranked by `ova`.
young_players = (
    df.loc[df['age'] < 25, ['name', 'age', 'ova', 'pot']]
    .sort_values(by=['pot', 'ova'], ascending=False)
    .reset_index(drop=True)
    .head(5)
)
young_players


# In[66]:

sns.set_style('whitegrid')
# The figure must exist before plotting; creating it afterwards opened an
# empty figure and left the bar chart at the default size.
plt.figure(figsize=(8, 6))
sns.barplot(data=young_players, x='name', y='pot', palette='Dark2')
plt.title('TOP 5 PLAYERS WITH HIGHEST POTENCIAL')
plt.xlabel('Players')
plt.ylabel('Potential')
plt.xticks(rotation=45)
plt.show()


# ## 4. Players with the highest release clauses and their ratings

# In[67]:

# Rank the whole dataset by release clause. The previous version ranked only
# `top_players` (the top 10 by OVA), so it could not surface a high clause
# held by anyone outside that table.
release_clause = (
    df[['name', 'release_clause', 'ova']]
    .dropna(subset=['release_clause'])
    .sort_values(by='release_clause', ascending=False)
    .head(10)
    .set_index('name')
)
release_clause


# ## 5. What is the market value of the top 10 players?
# In[68]:

Top10_values = top_players[['name', 'value']]
Top10_values


# In[69]:

Top10_values_by_value = Top10_values.sort_values(by='value', ascending=False)
Top10_values_by_value


# In[70]:

# Plot some more data from the dataset


# In[71]:

df_cat.columns


# In[72]:

for c in df_cat.columns:
    plt.figure(figsize=(30, 15))
    df[c].value_counts().plot(kind='bar')
    plt.xticks(rotation=90)
    plt.title(c)
    plt.show()


# In[73]:

from ipywidgets import interact


@interact
def plot_bar(col=df_cat.columns.tolist()):
    """Interactive bar chart of the value counts of one categorical column.

    Labels and heights both come from the same ``value_counts`` result.
    Pairing ``unique()`` (order of appearance, NaN included) with
    ``value_counts()`` (sorted by frequency, NaN excluded) attached counts
    to the wrong labels and failed when the lengths differed.
    """
    counts = df[col].value_counts()
    plt.figure(figsize=(10, 5))
    plt.bar(counts.index.astype(str), counts.values)
    plt.title(col)
    plt.show()


# In[74]:

df_num.columns


# In[75]:

for c in df_num.columns:
    plt.figure(figsize=(10, 4))
    sns.histplot(df[c], kde=True)
    plt.title(c)
    plt.show()


# We can see that many of the attributes are normally distributed, except for wage, value, release_clause, hits and ir, which are heavily skewed towards lower values. We'll also represent the data using boxplots, to get an idea of the number of outliers present:

# In[76]:

for col in df_num.columns:
    # One figure per column
    fig, ax = plt.subplots(figsize=(8, 6))

    # Draw the boxplot with seaborn
    sns.boxplot(x=df[col], ax=ax)

    # Title of each chart
    ax.set_title('Boxplot de {}'.format(col))

    # Show each chart
    plt.show()


# In[77]:

# Check correlation (computed once and reused by both heatmaps below)
corr_matrix = df_num.corr()
corr_matrix


# In[78]:

# Lower-triangle heatmap: mask the upper triangle, then drop the first row and
# the last column, which would be fully masked.
ones = np.ones(corr_matrix.shape, dtype=bool)
mask = np.triu(ones)
mask = mask[1:, :-1]
corr = corr_matrix.iloc[1:, :-1].copy()

fig, ax = plt.subplots(figsize=(40, 30))
sns.heatmap(corr, mask=mask, annot=True, fmt=".2f", cmap="Greens", linewidths=.6,
            cbar_kws={"shrink": .9})
ax.xaxis.tick_bottom()
plt.title("Correlations heatmap for numerical data".upper(), fontdict={"fontsize": 18}, loc="left")


# In[79]:

fig, ax = plt.subplots(figsize=(40, 30))
heatmap = sns.heatmap(corr_matrix, vmin=-1, vmax=1, annot=True, cmap='BrBG')


# In[80]:

# High correlation: .85 to 1
# Medium correlation: .60 to .85
# Low correlation: .40 to .60
# Author's observation: the prediction target `value` shows no high correlation


# In[81]:

# Multicollinearity: check the relationship between the numerical variables
# selected for the prediction


# In[82]:

def report_high_correlations(frame, threshold=0.8):
    """Print every pair of columns of ``frame`` that is strongly correlated.

    A pair is reported once when the absolute value of its Pearson
    correlation is at least ``threshold``. Only the strict upper triangle of
    the matrix is scanned, so the diagonal is never reported and no pair is
    printed twice. The message says "correlation" because that is the
    statistic being thresholded; it used to say "variance".
    """
    matrix = frame.corr()
    names = matrix.columns
    values = matrix.to_numpy()
    for row, column in zip(*np.triu_indices(len(names), k=1)):
        strength = values[row, column]
        if abs(strength) >= threshold:
            print("High correlation ", strength, " in", names[row], "&", names[column])


# Copy: this frame is modified later and must not be a view of `df`
df_num = df[['age', 'ova', 'bov', 'pot', 'height', 'weight', 'release_clause', 'attacking',
             'sm', 'movement', 'power', 'mentality', 'defending', 'value', 'wage']].copy()
report_high_correlations(df_num)


# In[83]:

df_num = df[['age', 'ova', 'pot', 'height', 'weight', 'wage', 'release_clause', 'power',
             'movement', 'sm', 'defending', 'value']].copy()
report_high_correlations(df_num)


# In[84]:

fig, ax = plt.subplots(figsize=(10, 5))
heatmap = sns.heatmap(df_num.corr(), vmin=-1, vmax=1, annot=True, cmap='BrBG')


# In[85]:

df_num


# In[86]:

# `sns.distplot` is deprecated; `histplot` with a KDE on a density axis is its
# documented replacement and is already used above.
for i in df_num:
    sns.histplot(df_num[i], kde=True, stat="density")
    plt.show()


# In[ ]:


# ## 5. Predicting market value
# Real Madrid F.C. needs to create a model to predict the market value of players to make wise budget decisions for next season.
# In[87]:

import scipy.stats as stats
from math import sqrt
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler


def report_errors(y_true, y_pred):
    """Print and return the MSE, MAE and RMSE of ``y_pred`` against ``y_true``."""
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    rmse = sqrt(mse)
    print('Mean Squared Error:', mse)
    print('Mean absolute Error:', mae)
    print('Root Mean Squared Error:', rmse)
    return mse, mae, rmse


def null_out_iqr_outliers(frame, columns, factor=1.5):
    """Return a copy of ``frame`` with IQR outliers replaced by NaN.

    For every column in ``columns``, values below ``Q1 - factor * IQR`` or
    above ``Q3 + factor * IQR`` become NaN. Quartiles are computed per column
    with ``Series.quantile``, which ignores NaN. ``Series.where`` upcasts
    integer columns to float instead of writing NaN into them.
    """
    cleaned = frame.copy()
    for column in columns:
        q25, q75 = cleaned[column].quantile([0.25, 0.75])
        spread = q75 - q25
        lower = q25 - factor * spread
        upper = q75 + factor * spread
        cleaned[column] = cleaned[column].where(cleaned[column].between(lower, upper))
    return cleaned


df_cat.head(2)


# In[88]:

df_num.head(2)


# In[89]:

# Check the linear correlation of every feature with the target `value`
# (one loop replaces eleven identical cells, In[89]-In[99])
for feature in df_num.columns.drop('value'):
    print(feature, stats.pearsonr(df_num[feature], df_num['value']))


# In[100]:

# X-y split
y = df_concat['value']
x_x = df_concat.drop('value', axis=1)


# In[101]:

# `np.object` was removed from NumPy; the string "object" selects the same dtype
X_num = x_x.select_dtypes(include=np.number)
X_cat = x_x.select_dtypes(include="object")


# In[102]:

# Standardize the numerical features
# NOTE(review): the scaler and encoder are fitted on the full data before the
# train/test split, so test rows influence the preprocessing.
transformer = StandardScaler().fit(X_num)
X_norm = transformer.transform(X_num)
X_norm


# In[103]:

# `sns.distplot` is deprecated; plot the pooled standardized values instead
sns.histplot(X_norm.ravel(), kde=True, stat="density")


# In[104]:

# Encoding categorical data; the fitted encoder stays in `encoder` and the
# dense encoded matrix gets its own name
encoder = OneHotEncoder(drop='first').fit(X_cat)
X_cat_encoded = encoder.transform(X_cat).toarray()
X_cat_encoded


# In[105]:

X_em = np.concatenate((X_norm, X_cat_encoded), axis=1)
X_em


# In[106]:

# 1. Running linear regression model
# Splitting into train set and test set
X_em_train, X_em_test, y_train, y_test = train_test_split(X_em, y, test_size=0.3, random_state=42)


# In[107]:

X_em_train.shape


# In[108]:

X_em_test.shape


# In[109]:

# Running linear regression model
lm = linear_model.LinearRegression()
model = lm.fit(X_em_train, y_train)
model


# In[110]:

model.coef_


# In[111]:

model.intercept_


# In[112]:

train_predictions = lm.predict(X_em_train)
# Report the held-out score next to the training score
print('R2 train:', r2_score(y_train, train_predictions))
print('R2 test:', r2_score(y_test, lm.predict(X_em_test)))


# In[113]:

mse_new, mae_new, rmse_new = report_errors(y_train, train_predictions)


# In[114]:

# Numerical features only
y = df_num['value']
X = df_num.drop(['value'], axis=1)


# In[115]:

# Splitting into train set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


# In[116]:

X_train.shape


# In[117]:

X_test.shape


# In[118]:

lm = linear_model.LinearRegression()
model = lm.fit(X_train, y_train)
model


# In[119]:

model.coef_


# In[120]:

model.intercept_


# In[121]:

train_predictions = lm.predict(X_train)


# In[122]:

print('R2 train:', r2_score(y_train, train_predictions))
print('R2 test:', r2_score(y_test, lm.predict(X_test)))


# In[123]:

mse_new, mae_new, rmse_new = report_errors(y_train, train_predictions)


# In[124]:

# Remove outliers by turning them into nulls and filling them with the mean


# In[125]:

df_num


# In[126]:

# One helper call replaces twelve copy-pasted per-column loops (In[126]-In[137]),
# which also rebound the builtins `max` and `min`.
# NOTE(review): the target `value` is treated too, so the model below is
# trained and scored against an altered target.
df_num = null_out_iqr_outliers(df_num, df_num.columns)


# In[138]:

df_num.isnull().sum()


# In[139]:

# Fill every nulled outlier with its column mean
df_num = df_num.fillna(df_num.mean())


# In[140]:

df_num.isnull().sum()


# In[141]:

# Split
X = df_num.drop(columns=['value'])
y = df_num.value


# In[142]:

# Split first, then fit the scaler on the training rows only and apply it to
# both parts. The previous version scaled the full matrix into `X_norm_x` and
# then trained on the unscaled `X`, so the scaling had no effect.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
transformer = StandardScaler().fit(X_train)
X_train = transformer.transform(X_train)
X_test = transformer.transform(X_test)


# In[144]:

X_train.shape


# In[145]:

X_test.shape


# In[146]:

lm = linear_model.LinearRegression()
model = lm.fit(X_train, y_train)
model


# In[147]:
train_predictions = lm.predict(X_train)


# In[148]:

# R2 on the training split
train_score = lm.score(X_train, y_train)

# R2 on the test split
test_score = lm.score(X_test, y_test)

print(train_score, test_score)


# In[149]:

mse_new = mean_squared_error(y_train, train_predictions)
mae_new = mean_absolute_error(y_train, train_predictions)
rmse_new = sqrt(mse_new)

print('Mean Squared Error:', mse_new)
print('Mean absolute Error:', mae_new)
print('Root Mean Squared Error:', rmse_new)


# In[150]:

# Dropping two of the numerical columns


# In[151]:

df_num


# In[152]:

df_num = df_num.drop(['release_clause', 'height'], axis=1)
df_num


# In[153]:

df_cat


# In[154]:

df_concat_model = pd.concat([df_num, df_cat], axis=1)


# In[155]:

# X-y split
y = df_concat_model['value']
x_m = df_concat_model.drop('value', axis=1)


# In[156]:

# Build the features from `x_m`, the frame created just above. The previous
# version read `x_x` (the features of the first model), so the two dropped
# columns were still in the model. `np.object` no longer exists in NumPy;
# "object" selects the same dtype.
X_num_m = x_m.select_dtypes(include=np.number)
X_cat_c = x_m.select_dtypes(include="object")


# In[157]:

from sklearn.preprocessing import Normalizer

# NOTE(review): `Normalizer` rescales each row (sample) to unit norm, unlike
# the `StandardScaler` used for the earlier models, which scales each column.
# Confirm that row-wise scaling is intended here.
transformer = Normalizer().fit(X_num_m)
x_norm_m = transformer.transform(X_num_m)
x_norm_m


# In[158]:

from sklearn import preprocessing

labels = preprocessing.OrdinalEncoder().fit(X_cat_c).transform(X_cat_c)
labels


# In[159]:

X = np.concatenate((x_norm_m, labels), axis=1)


# In[160]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


# In[161]:

lm = linear_model.LinearRegression()
model = lm.fit(X_train, y_train)


# In[162]:

predictions = lm.predict(X_train)


# In[163]:

# Report the held-out score next to the training score
print('R2 train:', r2_score(y_train, predictions))
print('R2 test:', r2_score(y_test, lm.predict(X_test)))


# In[164]:

mse_new = mean_squared_error(y_train, predictions)
mae_new = mean_absolute_error(y_train, predictions)
rmse_new = sqrt(mse_new)

print('Mean Squared Error:', mse_new)
print('Mean absolute Error:', mae_new)
print('Root Mean Squared Error:', rmse_new)


# ## RESULTS
#
# Figures recorded from the author's original run. The error metrics are the
# ones printed above, which are computed on the training split. Re-run the
# notebook to refresh them; the last experiment in particular was recorded
# before its feature selection was corrected to use `x_m`.
#
# # Numerical and categorical data
#
# **R2: 0.97901212553766**
# Mean Squared Error: 608462032800.9713
# Mean absolute Error: 380202.62010170677
# Root Mean Squared Error: 780039.7636024534
#
# # Numerical data only
#
# **R2: 0.9701210029938898**
# Mean Squared Error: 866225652769.8446
# Mean absolute Error: 428295.5255877738
# Root Mean Squared Error: 930712.4436526271
#
# # Removing outliers
#
# **R2: 0.6982558910881252**
# Mean Squared Error: 334912734051.11316
# Mean absolute Error: 291899.0036340945
# Root Mean Squared Error: 578716.453931554
#
# # Removing two numerical columns: release clause and height
#
# **R2: 0.07250571063361666**
# Mean Squared Error: 1029447267052.3954
# Mean absolute Error: 708737.3275927786
# Root Mean Squared Error: 1014616.8079883141


# In[ ]: