#!/usr/bin/env python
# coding: utf-8

# # 1-3.DATA CLEANING/ DATA WRANGLING

# In[6]:


#Importing libraries
import pandas as pd
pd.options.display.max_columns = None
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


# In[7]:


# Load the FIFA21 players dataset from the CSV file in the working directory
df = pd.read_csv('fifa21_male2.csv')


# In[8]:


# Preview the first five rows to review the available fields

df. head(5)


# In[9]:


# Keep only the rows whose 'Club' value is not 'Colombia'
# NOTE(review): presumably this removes national-team entries stored in the club field -- confirm
df = df.loc[df['Club'] != 'Colombia']


# In[10]:


# Show the DataFrame's shape (rows, columns) after the filter
df.shape


# In[ ]:


# In[11]:


#Printing all columns in the dataset for better understanding

def col_data():
    """Print every column label of the module-level ``df``, one per line."""
    for column_label in df.columns.tolist():
        print(column_label)


col_data()


# In[12]:


# Make sure no column label appears more than once
print(
    "Hay columnas repetidas."
    if df.columns.duplicated().any()
    else "No hay columnas repetidas."
)


# In[13]:


# Standardize column names: lower-case, with spaces replaced by underscores
df.columns = [label.lower().replace(' ', '_') for label in df.columns]
df.columns


# In[14]:


# Drop columns that are not needed for the analysis
# (original note: 'team_&_contract' repeats the club and contract information)
df = df.drop(
    columns=[
        'player_photo',
        'club_logo',
        'flag_photo',
        'gender',
        'team_&_contract',
        'contract',
        'id',
        'loan_date_end',
    ]
)
df


# In[15]:


# Print the dtype of every remaining column
for column_label, column_dtype in df.dtypes.items():
    print(column_label, "is", column_dtype)


# In[16]:


# Convert weight and height from text to integers.
# The patterns are raw strings so "\d" is not treated as an (invalid) string
# escape, and expand=False makes str.extract return a Series rather than a
# one-column DataFrame.
df['weight'] = df['weight'].str.extract(r'(\d+)', expand=False).astype(int)
df['weight']


# In[17]:


# Heights written as feet'inches (e.g. 6'2") are converted to total inches;
# keeping only the first run of digits would store the feet and silently drop
# the inches. Values that do not match that pattern fall back to their first
# integer, which is what the previous implementation stored for every row.
# NOTE(review): assumes the column uses a single notation throughout --
# confirm against the raw file.
feet_and_inches = df['height'].str.extract(r"(\d+)\s*'\s*(\d+)").astype(float)
first_integer = df['height'].str.extract(r'(\d+)', expand=False).astype(float)
total_inches = feet_and_inches[0] * 12 + feet_and_inches[1]
df['height'] = total_inches.fillna(first_integer).astype(int)
df['height']


# In[18]:


# Inspect the distinct raw values of 'hits' before cleaning them
valores_unicos = df['hits'].unique()
print(valores_unicos)


# In[19]:


def clean_hits(x):
    """Prepare one 'hits' cell for numeric conversion.

    The value is first coerced to ``str``. Text ending in ``'K'`` (for
    example ``'1.8K'``) is returned as a float multiplied by 1000; any other
    text is returned unchanged as a string, so the caller still has to run a
    numeric conversion afterwards.
    """
    text = str(x)
    if not text.endswith('K'):
        return text
    return float(text.replace('K', '')) * 1000


# In[20]:


# Quick sanity check of clean_hits on a sample value
x = '1.8K'
test = clean_hits(x)
test


# In[21]:


# Clean every 'hits' cell, then convert the whole column to a numeric dtype
df['hits'] = pd.to_numeric(df['hits'].apply(clean_hits))


# In[22]:


df['hits'].dtypes


# In[23]:


df.head(5)


# In[24]:


# Numerical Data Cleaning


# In[25]:


financials = ['value', 'wage', 'release_clause']


def clean_value(x):
    """Convert a currency string such as '€525K' or '€2.5M' into a number.

    The value is coerced to ``str`` and the euro sign is removed. A trailing
    ``'K'`` scales the number by 1,000 and a trailing ``'M'`` by 1,000,000;
    both are returned as floats. Text without either suffix is returned as
    the stripped string, so the caller still needs a numeric conversion.

    Raises:
        ValueError: if the text before a 'K'/'M' suffix is not a valid number.
    """
    text = str(x).replace('€', '')
    if text.endswith('K'):
        # Scale arithmetically: substituting 'K' with the text '000' turns
        # '1.5K' into '1.5000', i.e. 1.5 instead of 1500.
        return float(text[:-1]) * 1000
    if text.endswith('M'):
        return float(text[:-1]) * 1000000
    return text


# In[26]:


# Strip the currency formatting from every financial column
# (financials = ['value', 'wage', 'release_clause'])
for money_column in financials:
    df[money_column] = df[money_column].apply(clean_value)


# In[29]:


type(x)


# In[30]:


# Convert the cleaned financial columns to a numeric dtype
for money_column in financials:
    df[money_column] = pd.to_numeric(df[money_column])


# In[33]:


df.head(1)


# In[34]:


# Remove the star symbol from the star-rated columns
star_columns = ['w/f', 'sm', 'ir']

# Inspect the distinct raw values of each column
for column in star_columns:
    print(df[column].unique())


# In[35]:


# Keep only the first character of each string, convert it to a number
# (raising on anything non-numeric) and print the resulting dtype as a check
for column in star_columns:
    leading_character = df[column].str[0]
    df[column] = pd.to_numeric(leading_character, errors='raise')
    print(df[column].dtypes)


# In[36]:


df.head(1)


# In[37]:


# Count fully duplicated rows so they can be removed if any exist

df.duplicated().sum()


# In[38]:


# Percentage of nulls per column; columns above 75% are candidates for removal
print("Nulls")
for column_label in df.columns:
    null_count = int(df[column_label].isnull().sum())
    print(column_label, "contains", 100 * null_count / len(df[column_label]), "%")


# In[39]:


# Split the data into categorical and numerical parts to deal with nulls
# and to check unique values


# In[40]:


df_cat = df.select_dtypes(include='object')
df_cat.head(5)


# In[41]:


df_cat.columns


# In[42]:


# Drop 'name' (not relevant for the analysis), 'joined', and the
# per-position rating columns, which hold numeric information stored as text
df_cat = df_cat.drop(
    columns=[
        'name', 'joined',
        'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw',
        'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm',
        'lwb', 'ldm', 'cdm', 'rdm', 'rwb',
        'lb', 'lcb', 'cb', 'rcb', 'rb',
        'gk', 'ls',
    ]
)


# In[43]:


df_cat


# In[44]:


# Count the distinct row combinations
df_cat.value_counts()


# In[45]:


#chequeamso si hay valor unico para alguna columna, no lo hay porque ya elimamos gender 

def check_unique_values(df_cat):
    """Return the labels of the columns of ``df_cat`` that hold one distinct value.

    The frame passed in is the one inspected (previously the argument was
    ignored and the module-level ``df`` was scanned instead). Missing values
    count as a value of their own, so an all-null column is reported too.

    Args:
        df_cat: DataFrame whose columns are checked.

    Returns:
        A list of column labels, in column order.
    """
    return [
        column
        for column in df_cat.columns
        if df_cat[column].nunique(dropna=False) == 1
    ]

check_unique_values(df_cat)


# In[46]:


# Count the nulls in each categorical column
df_cat.isnull().sum()


# In[47]:


# Fill null categorical values with the column mode


# In[48]:


# The filled Series is assigned back to the frame: calling
# fillna(..., inplace=True) on a column selection acts on an intermediate
# object and is not guaranteed to modify df_cat itself.
for column in ['club', 'position', 'a/w', 'd/w']:
    most_frequent = df_cat[column].mode()[0]
    df_cat[column] = df_cat[column].fillna(most_frequent)


# In[49]:


df_cat.isnull().sum()


# In[50]:


# Keep the non-object (numerical) columns in their own frame
df_num = df.select_dtypes(exclude=["object"])
df_num


# In[51]:


# Null counts per column, largest first
df_num.isnull().sum().sort_values(ascending=False)


# In[52]:


# Fill the nulls of these columns with the column mean
for stat_column in [
    'composure',
    'curve',
    'jumping',
    'vision',
    'balance',
    'agility',
    'volleys',
    'sliding_tackle',
    'interceptions',
    'positioning',
]:
    column_mean = np.mean(df_num[stat_column])
    df_num[stat_column] = df_num[stat_column].fillna(column_mean)
df_num


# In[53]:


df_num.isnull().sum()


# In[54]:


# Save "cleaner" data to csv


# In[55]:


# Shape of the numerical part before joining
df_num.shape


# In[56]:


# Shape of the categorical part before joining
df_cat.shape


# In[57]:


# Join the numerical and categorical parts side by side (column-wise concat)
df_concat = pd.concat([df_num, df_cat], axis=1)


# In[58]:


# Save the cleaned data; the frame index is written as the first CSV column
df_concat.to_csv('df_concat.csv')


# In[59]:


df_concat


# ## 3-4. DATA WRANGLING, EDA

# # Business Case:
# 
# ## Let's imagine that one of the top teams in the world is interested in signing a player for the next season and has to make a decision soon. We will identify the best options taking as a sample the following questions:
# 
# 1. Who are the 10 best players according to their statistics and performance?
# 2. Who are the best players in every position in the game?
# 3. Who are the 5 players under 25 years old with the highest potential?
# 4. Players with the highest release clauses and their Ratings.
# 5. Which are the market value of the Top 10 players?

# In[60]:


df_concat.describe()


# ## 1.Who are the 10 best players according to their statistics and performance?

# In[61]:


# Ten highest rows by 'ova' (ties broken by 'base_stats'), re-indexed from 1 as 'Rank'
top_players = (
    df.sort_values(['ova', 'base_stats'], ascending=False)
    .head(10)
    .reset_index(drop=True)
)
top_players.index = (top_players.index + 1).rename('Rank')
top_players


# In[62]:


## Bar chart of the 10 best players by OVA (one vertical bar per player)


# In[63]:


sns.set(style="whitegrid")
plt.figure(figsize=(10, 6))
sns.barplot(data=top_players, x="name", y="ova", palette="Blues_r")

# Title and tick-label formatting
plt.title("TOP 10 PLAYERS BY OVA", fontsize=18)
plt.xticks(rotation=45, ha='right', fontsize=12)

# Show the chart
plt.show()


# ## 2.Who are the best players in every position in the game?

# In[64]:


# "counter" takes one column of per-position ratings stored as text (for
# example the 'ls' column) and turns each value into a single integer by adding
# the number formed by its first two characters to its last character.
# NOTE(review): this presumably targets strings shaped like "NN+M" -- confirm
# against the raw data.

def counter(df):
    """Return, per element, ``int(first two characters) + int(last character)``.

    Args:
        df: a pandas Series of strings (despite the parameter name).

    Returns:
        An integer Series with the same index as the input.
    """
    return df.str[:2].astype(int) + df.str[-1].astype(int)

# Select the player name plus every per-position rating column
position_columns = [
    'ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw',
    'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb', 'ldm',
    'cdm', 'rdm', 'rwb', 'lb', 'lcb', 'cb', 'rcb', 'rb',
]
# Rows with any missing value are discarded
positions = df[['name'] + position_columns].dropna()
# "Position" holds the same rows without 'name', each column converted to a
# single integer per player by counter
Position = positions.drop(columns='name').apply(counter)
# For every position column, find the first row holding the maximum rating
# and report the player name stored on that same row
player_names = positions['name'].tolist()
for position_label in Position.columns:
    rating = Position[position_label].max()
    first_best_row = Position[position_label].tolist().index(rating)
    player = player_names[first_best_row]
    best_player = f'Best {position_label}, is {player} with {rating} rating'
    print(best_player)


# ### 3.Who are the 5 players under 25 years old with the highest potential?

# In[65]:


# Five players strictly under 25 with the highest potential.
# The section asks for potential, so the ranking uses 'pot' rather than the
# current overall rating 'ova' (kept in the table for reference), and the age
# filter is strict so 25-year-olds are not counted as "under 25".
# NOTE(review): 'pot' is presumably the potential rating -- confirm.
young_players = df.loc[df['age'] < 25, ['name', 'age', 'ova', 'pot']]
young_players = (
    young_players.sort_values(by=['pot'], ascending=False)
    .reset_index(drop=True)
    .head(5)
)
young_players


# In[66]:


sns.set_style('whitegrid')
# The figure must exist before drawing; creating it afterwards opens a second,
# empty figure and leaves the bars on a default-sized one.
plt.figure(figsize=(8, 6))
sns.barplot(data=young_players, x='name', y='pot', palette='Dark2')
plt.title('TOP 5 PLAYERS WITH HIGHEST POTENCIAL')
plt.xlabel('Players')
plt.ylabel('Potential')
plt.xticks(rotation=45)
plt.show()


# ## 4.Players with the highest release clauses and their Ratings

# In[67]:


# Release clauses and ratings, sorted from the highest clause down.
# NOTE(review): the rows come from top_players (the ten best by 'ova'), so
# this ranks release clauses within that group only -- confirm that is intended.

release_clause = (
    top_players.loc[:, ['name', 'release_clause', 'ova']]
    .dropna(subset=['release_clause'])
    .sort_values(by='release_clause', ascending=False)[:10]
    .set_index('name')
)
release_clause


# ## 5.Which are the top 10 players market value?

# In[68]:


# Name and market value of the top 10 players, in rank order
Top10_values = top_players[['name', 'value']]
Top10_values


# In[69]:


# The same table ordered from the most to the least valuable player
Top10_values_by_value = Top10_values.sort_values(by='value', ascending=False)
Top10_values_by_value


# In[70]:


#Graficamos algunso datos más del dataset


# In[71]:


df_cat.columns


# In[72]:


# One bar chart per categorical column, built from the value counts in df
for categorical_column in df_cat.columns:
    plt.figure(figsize=(30, 15))
    frequencies = df[categorical_column].value_counts()
    frequencies.plot(kind='bar')
    plt.xticks(rotation=90)
    plt.title(categorical_column)
    plt.show()


# In[73]:


from ipywidgets import interact

@interact
def plot_bar(col = df_cat.columns):
    """Draw a bar chart of how often each value of ``df[col]`` occurs.

    Labels and heights both come from the same ``value_counts()`` result.
    Pairing ``unique()`` (order of first appearance, nulls included) with
    ``value_counts()`` (sorted by frequency, nulls dropped) attaches counts
    to the wrong labels and can fail on a length mismatch.

    Args:
        col: label of the column of ``df`` to plot; the default is the set of
            ``df_cat`` column labels handed to ``interact``.
    """
    counts = df[col].value_counts()

    plt.figure(figsize = (10,5))

    # Labels are rendered as text so mixed-type categories plot consistently.
    plt.bar(counts.index.astype(str), counts.to_numpy())

    plt.title(col)

    plt.show()


# In[74]:


df_num.columns


# In[75]:


# Histogram with a KDE overlay for every numerical column
for numeric_column in df_num.columns:
    plt.figure(figsize=(10, 4))
    sns.histplot(df[numeric_column], kde=True)
    plt.title(numeric_column)
    plt.show()


# We can see that many of the attributes are roughly normally distributed, except for wage, value, release_clause, hits and ir, which are heavily skewed towards lower values. We also draw boxplots to get a picture of how many outliers are present:

# In[76]:


for numeric_column in df_num.columns:
    # A new figure and axes for each chart
    fig, ax = plt.subplots(figsize=(8, 6))
    # Draw the boxplot with seaborn on those axes
    sns.boxplot(x=df[numeric_column], ax=ax)
    # Title each chart with the column it shows
    ax.set_title(f'Boxplot de {numeric_column}')
    # Display the chart
    plt.show()


# In[77]:


# Check the pairwise correlation between the numerical columns
corr_matrix=df_num.corr()
corr_matrix


# In[78]:


# Heatmap that hides the cells on and above the diagonal of the full matrix
corr = df_num.corr()
ones = np.ones_like(corr, dtype="bool")
# np.triu keeps the upper triangle, diagonal included, as True (= masked cells)
mask = np.triu(ones)
# Drop the first row and the last column of both the mask and the matrix so
# the two stay aligned with each other
mask = mask[1:, :-1]
corr = corr.iloc[1:,:-1].copy()
fig, ax = plt.subplots(figsize=(40,30))
sns.heatmap(corr, mask=mask, annot=True, fmt=".2f", cmap="Greens", linewidths=.6, cbar_kws={"shrink":.9})
ax.xaxis.tick_bottom()
plt.title("Correlations heatmap for numerical data".upper(), fontdict={"fontsize": 18}, loc="left")


# In[79]:


# Full (unmasked) correlation heatmap with the colour scale fixed to [-1, 1]
fig, ax = plt.subplots(figsize=(40,30)  )
heatmap = sns.heatmap(df_num.corr(), vmin=-1, vmax=1, annot=True, cmap='BrBG')


# In[80]:


# High correlation: .85 to 1
# Medium correlation: .60 to .85
# Low correlation: .40 to .60

# We see that our prediction target, value, has no high correlation


# In[81]:


# Multicollinearity: we check the relationship between the numerical variables selected for the prediction


# In[82]:


def _report_high_correlations(frame, threshold=0.8):
    """Print each pair of distinct columns whose correlation is >= ``threshold``.

    Only the cells above the diagonal of ``frame.corr()`` are visited, so
    every pair is reported once and a column is never compared with itself.
    The partner column is taken from the cell position rather than searched
    for by coefficient value.

    Args:
        frame: DataFrame of numerical columns.
        threshold: minimum correlation coefficient to report.
    """
    correlations = frame.corr()
    labels = list(correlations.columns)
    for position, first in enumerate(labels):
        for second in labels[position + 1:]:
            coefficient = correlations.loc[first, second]
            if coefficient >= threshold:
                # The figure printed is a correlation coefficient, not a variance.
                print("High correlation ", coefficient, " in", first, "&", second)


# First candidate feature set
df_num = df[['age','ova','bov','pot','height','weight','release_clause', 'attacking', 'sm', 'movement', 'power', 'mentality', 'defending', 'value','wage']]
_report_high_correlations(df_num)


# In[83]:


# Reduced feature set: 'bov', 'attacking' and 'mentality' are left out
df_num = df[['age','ova','pot','height','weight','wage','release_clause','power','movement', 'sm', 'defending','value' ]]
_report_high_correlations(df_num)


# In[84]:


# Correlation heatmap of the selected features, colour scale fixed to [-1, 1]
fig, ax = plt.subplots(figsize=(10, 5))
heatmap = sns.heatmap(df_num.corr(), vmin=-1, vmax=1, annot=True, cmap='BrBG')


# In[85]:


df_num


# In[86]:


# Distribution of every selected feature.
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# draws the equivalent chart and matches the histplot calls used earlier.
for i in df_num:
    sns.histplot(df_num[i], kde=True, stat='density')
    plt.show()


# In[ ]:


# ## 5. Predicting market value 
# Real Madrid F.C. needs to create a model to predict the market value of players to make wise budget decisions for next season.

# In[87]:


df_cat.head(2)


# In[88]:


df_num.head(2)


# In[89]:


# Check the linear correlation of every selected feature with 'value'

import scipy.stats as stats

# Each entry maps a feature to the result of stats.pearsonr(feature, value);
# the features are evaluated in the order listed
pearson_vs_value = {
    feature: stats.pearsonr(df_num[feature], df_num['value'])
    for feature in [
        'age',
        'ova',
        'pot',
        'height',
        'weight',
        'wage',
        'release_clause',
        'power',
        'movement',
        'sm',
        'defending',
    ]
}
pearson_vs_value


# In[100]:


# X/y split: 'value' is the target, every other column is a feature
y = df_concat['value']
x_x = df_concat.drop('value', axis=1)


# In[101]:


# Separate numerical from categorical features.
# The dtype is given as the string 'object' because the np.object alias no
# longer exists in current NumPy releases.
X_num = x_x.select_dtypes(include=np.number)
X_cat = x_x.select_dtypes(include='object')


# In[102]:


# Standardize the numerical features
# NOTE(review): the scaler is fitted on every row of X_num, before any
# train/test split -- confirm that is intended
from sklearn.preprocessing import StandardScaler 
transformer=StandardScaler().fit(X_num)
X_norm=transformer.transform(X_num)
X_norm


# In[103]:


# Distribution of the standardized values
# NOTE(review): sns.distplot is deprecated in recent seaborn releases
sns.distplot(X_norm)


# In[104]:


# Encode the categorical data as one-hot columns, dropping the first level of each
from sklearn.preprocessing import OneHotEncoder
encoder=OneHotEncoder(drop='first').fit(X_cat)
# From here on 'encoder' holds the dense encoded array, not the fitted encoder
encoder=encoder.transform(X_cat).toarray()
encoder


# In[105]:


# Feature matrix: standardized numerical columns followed by the one-hot columns
X_em=np.concatenate((X_norm, encoder),axis=1)
X_em


# In[106]:


# 1. Linear regression on numerical + categorical features

# Split into train and test sets (70% / 30%, fixed seed)
from sklearn.model_selection import train_test_split
X_em_train, X_em_test, y_train, y_test = train_test_split(X_em, y, test_size=0.3, random_state=42)


# In[107]:


X_em_train.shape


# In[108]:


X_em_test.shape


# In[109]:


# Fit the linear regression model on the training split
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
lm=linear_model.LinearRegression()
model=lm.fit(X_em_train,y_train)
model


# In[110]:


# Fitted coefficients, one per feature column
model.coef_


# In[111]:


# Fitted intercept
model.intercept_


# In[112]:


# R2 on the training split
# NOTE(review): every metric below is computed on the training data only;
# X_em_test / y_test are not scored in this section
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from math import sqrt
train_predictions=lm.predict(X_em_train)
r2_score(y_train, train_predictions)


# In[113]:


# Error metrics on the training split
mse_new = mean_squared_error(y_train, train_predictions)
mae_new = mean_absolute_error(y_train, train_predictions)
rmse_new = sqrt(mse_new)


print('Mean Squared Error:', mse_new)
print('Mean absolute Error:', mae_new)
print('Root Mean Squared Error:',rmse_new)


# In[114]:


# Numerical features only: 'value' is the target, the rest are features
y = df_num['value']
X = df_num.drop(columns=['value'])


# In[115]:


# Split into train and test sets (70% / 30%, fixed seed)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)


# In[116]:


X_train.shape


# In[117]:


X_test.shape


# In[118]:


# Fit a fresh linear regression on the training split
lm = linear_model.LinearRegression()
model = lm.fit(X_train, y_train)
model


# In[119]:


model.coef_


# In[120]:


model.intercept_


# In[121]:


# fit() returns the estimator itself, so model and lm are the same object
train_predictions = model.predict(X_train)


# In[122]:


r2_score(y_train, train_predictions)


# In[123]:


# Error metrics on the training split
mse_new = mean_squared_error(y_train, train_predictions)
mae_new = mean_absolute_error(y_train, train_predictions)
rmse_new = sqrt(mse_new)


for metric_label, metric_value in (
    ('Mean Squared Error:', mse_new),
    ('Mean absolute Error:', mae_new),
    ('Root Mean Squared Error:', rmse_new),
):
    print(metric_label, metric_value)


# In[124]:


# Remove outliers by turning them into nulls; the nulls are filled with the
# column mean in the next step


# In[125]:


df_num


# In[126]:


# Work on an independent copy so the writes below never target a slice of df
df_num = df_num.copy()

# Tukey fences: values more than 1.5 * IQR below the first quartile or above
# the third quartile are treated as outliers. A single loop handles every
# column, and the bounds use their own names so the builtins max() and min()
# are not overwritten.
for outlier_column in [
    'age',
    'ova',
    'pot',
    'height',
    'weight',
    'value',
    'wage',
    'release_clause',
    'power',
    'movement',
    'sm',
    'defending',
]:
    q25, q75 = np.percentile(df_num[outlier_column], [25, 75])
    intr_qr = q75 - q25

    lower_fence = q25 - (1.5 * intr_qr)
    upper_fence = q75 + (1.5 * intr_qr)

    is_outlier = (df_num[outlier_column] < lower_fence) | (
        df_num[outlier_column] > upper_fence
    )
    # mask() swaps the flagged values for NaN, widening integer columns to
    # float as needed
    df_num[outlier_column] = df_num[outlier_column].mask(is_outlier)


# In[138]:


df_num.isnull().sum()


# In[139]:


# Replace the nulls of every listed column with that column's mean
for filled_column in (
    'age',
    'ova',
    'pot',
    'height',
    'weight',
    'value',
    'wage',
    'release_clause',
    'power',
    'movement',
    'sm',
    'defending',
):
    column_mean = np.mean(df_num[filled_column])
    df_num[filled_column] = df_num[filled_column].fillna(column_mean)


# In[140]:


df_num.isnull().sum()


# In[141]:


# X/y split: 'value' is the target, the remaining columns are features
X = df_num.drop(columns=['value'])
y = df_num.value


# In[143]:


# Split first, so the scaler below only learns from the training rows
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)


# In[142]:


# Standardize the features. The scaled data is what the model is trained and
# scored on; previously the scaled matrix was computed and then ignored.
from sklearn.preprocessing import StandardScaler
transformer = StandardScaler().fit(X_train)
X_train = transformer.transform(X_train)
X_test = transformer.transform(X_test)
# Whole-dataset transform kept under its original name for inspection
X_norm_x = transformer.transform(X)
X_norm_x


# In[144]:


X_train.shape


# In[145]:


X_test.shape


# In[146]:


lm = linear_model.LinearRegression()
model = lm.fit(X_train, y_train)
model


# In[147]:


train_predictions = lm.predict(X_train)


# In[148]:


# R2 on the training and on the test split
train_score = lm.score(X_train, y_train)
test_score = lm.score(X_test, y_test)
print(train_score, test_score)


# In[149]:


# Error metrics on the training split
mse_new = mean_squared_error(y_train, train_predictions)
mae_new = mean_absolute_error(y_train, train_predictions)
rmse_new = sqrt(mse_new)


print('Mean Squared Error:', mse_new)
print('Mean absolute Error:', mae_new)
print('Root Mean Squared Error:', rmse_new)


# In[150]:


##Elimando dos columnas de numericas


# In[151]:


df_num


# In[152]:


# Drop two numerical columns before building the reduced model
df_num = df_num.drop(['release_clause', 'height'], axis=1)
df_num


# In[153]:


df_cat


# In[154]:


# Reduced numerical columns joined with the categorical columns
df_concat_model = pd.concat([df_num, df_cat], axis=1)


# In[155]:


# X/y split
y = df_concat_model['value']
x_m = df_concat_model.drop('value', axis=1)


# In[156]:


# Take the features from x_m, the frame built just above. Reading them from
# the earlier x_x frame would silently bring back the columns dropped in this
# experiment. The dtype is given as the string 'object' because the np.object
# alias no longer exists in current NumPy releases.
X_num_m = x_m.select_dtypes(include=np.number)
X_cat_c = x_m.select_dtypes(include='object')


# In[157]:


# Normalize the numerical features with sklearn's Normalizer
from sklearn.preprocessing import Normalizer
transformer = Normalizer().fit(X_num_m)
x_norm_m = transformer.transform(X_num_m)
x_norm_m


# In[158]:


# Encode every categorical column as ordinal integer codes
from sklearn import preprocessing
labels = preprocessing.OrdinalEncoder().fit(X_cat_c).transform(X_cat_c)
labels


# In[159]:


# Feature matrix: normalized numerical columns followed by the ordinal codes
X = np.concatenate((x_norm_m, labels), axis=1)


# In[160]:


# Split into train and test sets (70% / 30%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)


# In[161]:


# Fit a fresh linear regression on the training split
lm = linear_model.LinearRegression()
model = lm.fit(X_train, y_train)


# In[162]:


# fit() returns the estimator itself, so model and lm are the same object
predictions = model.predict(X_train)


# In[163]:


r2_score(y_train, predictions)


# In[164]:


# Error metrics on the training split
mse_new = mean_squared_error(y_train, predictions)
mae_new = mean_absolute_error(y_train, predictions)
rmse_new = sqrt(mse_new)


for metric_label, metric_value in (
    ('Mean Squared Error:', mse_new),
    ('Mean absolute Error:', mae_new),
    ('Root Mean Squared Error:', rmse_new),
):
    print(metric_label, metric_value)


# ## RESULTS

# 
# # Numerical and categorical data
# 
# **R2:0.97901212553766**    
# Mean Squared Error: 608462032800.9713  
# Mean absolute Error: 380202.62010170677  
# Root Mean Squared Error: 780039.7636024534    
#   
# 
# 
# # Data numerical only  
# **R2:0.9701210029938898**       
# Mean Squared Error: 866225652769.8446  
# Mean absolute Error: 428295.5255877738  
# Root Mean Squared Error: 930712.4436526271    
# 
# # Removing Outliers
# **R2: 0.6982558910881252**       
# Mean Squared Error: 334912734051.11316    
# Mean absolute Error: 291899.0036340945    
# Root Mean Squared Error: 578716.453931554        
# 
# # Removing two numerical columns: Release clause and Height
# **R2:0.07250571063361666**   
# Mean Squared Error: 1029447267052.3954    
# Mean absolute Error: 708737.3275927786    
# Root Mean Squared Error: 1014616.8079883141    
# 
# 

# In[ ]:


# In[ ]:


# In[ ]:


# In[ ]:


# In[ ]:


# In[ ]:


# In[ ]:


# In[ ]:


# In[ ]:


# In[ ]:


# In[ ]:


# In[ ]:


# In[ ]:


# In[ ]:


# In[ ]:


# In[ ]:


# In[ ]:


# In[ ]:


# In[ ]:


# In[ ]:


# In[ ]:


# In[ ]:


# In[ ]:


# In[ ]:


# In[ ]:


# In[ ]:


# In[ ]:


# In[ ]:


# In[ ]:


# In[ ]:


# In[ ]:


# In[ ]:


# In[ ]:


# In[ ]:


# In[ ]:


# In[ ]:


# In[ ]:


# In[ ]:


# In[ ]:


# In[ ]:


# In[ ]:


# In[ ]:


# In[ ]:


# In[ ]: