#!/usr/bin/env python
# coding: utf-8

# In[1]:


import pandas as pd
import numpy as np

# Column labels for the data file, which is read without a header row.
# The eighth label was misspelled 'srive-wheels'; it is 'drive-wheels'.
cols = [
    'symboling', 'normalized-losses', 'make', 'fuel-type',
    'aspiration', 'num-of-doors', 'body-style', 'drive-wheels',
    'engine-location', 'wheel-base', 'length', 'width', 'height',
    'curb-weight', 'engine-type', 'num-of-cylinders', 'engine-size',
    'fuel-system', 'bore', 'stroke', 'compression-ratio', 'horsepower',
    'peak-rpm', 'city-mpg', 'highway-mpg', 'price',
]

# Load the data, assigning the labels above to the columns.
cars = pd.read_csv('imports-85.data', names=cols)
# Preview the first five rows (displayed when run as a notebook cell).
cars.head(5)


# The numeric columns are symboling, normalized-losses, wheel-base, length, width, height, curb-weight, engine-size, bore, stroke, compression-ratio, horsepower, peak-rpm, city-mpg, highway-mpg, and price, plus possibly num-of-doors and num-of-cylinders. The ones that can be used as features to predict the price of a car are num-of-doors, num-of-cylinders, engine-size, city-mpg, and highway-mpg. The target column is price since this is what we are going to predict.

# In[2]:


# Turn every "?" entry into NaN so pandas treats it as missing.
cars = cars.replace(to_replace="?", value=np.nan)


# In[3]:


# Summarize each column's dtype and non-null count.
cars.info()


# Because missing values were encoded as "?", the affected columns were read in as the object data type, and they stay object after "?" is replaced with NaN. In order to use the numeric columns, we will convert them to a numeric type.

# In[4]:


# Columns to convert to a numeric dtype.
continuous_val_cols = [
    'normalized-losses',
    'wheel-base',
    'length',
    'width',
    'height',
    'curb-weight',
    'engine-size',
    'bore',
    'stroke',
    'compression-ratio',
    'horsepower',
    'peak-rpm',
    'city-mpg',
    'highway-mpg',
    'price',
]

# Select just those columns and cast them to float.
cars_to_numeric = cars.loc[:, continuous_val_cols].astype(float)


# In[5]:


# Confirm the resulting dtypes and non-null counts.
cars_to_numeric.info()


# We've now converted all of the numeric columns to float64, a numeric data type. In our analysis, we will focus on these columns.

# In[6]:


# Print the number of missing normalized-losses values, then the
# (rows, columns) shape of the frame, one per line.
print(
    cars_to_numeric['normalized-losses'].isna().sum(),
    cars_to_numeric.shape,
    sep="\n",
)


# There are 41 rows with missing values in the normalized-losses column. This is 20% of our data so we will drop this column. Let's do so below.

# In[7]:


# Remove the normalized-losses column from the working frame.
cars_to_numeric = cars_to_numeric.drop(columns='normalized-losses')


# Now we will consider the missing values remaining in other columns of the dataframe.

# In[8]:


# Count the missing values left in each column.
cars_to_numeric.isnull().sum()


# The remaining missing values are only 2%, at most, of the total values. Rows with a missing price are dropped, because price is the target and an imputed target would distort both the training data and the RMSE scores. The missing feature values are then replaced with the average values of their columns.

# In[9]:


# Never impute the target: discard the rows that have no price.
cars_to_numeric = cars_to_numeric.dropna(subset=['price'])
# Fill the gaps that remain in the feature columns with each column's mean.
cars_to_numeric = cars_to_numeric.fillna(cars_to_numeric.mean())


# Normalize all columns except for price, which will be our target column.

# In[10]:


# Min-max scale every column: (value - column min) / (column max - column min).
normalized_cars_to_numeric = cars_to_numeric.sub(cars_to_numeric.min()).div(
    cars_to_numeric.max().sub(cars_to_numeric.min())
)
# Put the original, unscaled price back, since price is the target.
normalized_cars_to_numeric = normalized_cars_to_numeric.assign(
    price=cars_to_numeric['price']
)


# In[11]:


def knn_train_test(train_col, target_col, dataframe):
    """Train a single-feature k-nearest-neighbors regressor and score it.

    The rows of ``dataframe`` are shuffled after seeding NumPy's global
    random generator with 1. The first half of the shuffled rows is used
    for fitting and the remaining rows for evaluation.

    Args:
        train_col: Label of the one column used as the feature.
        target_col: Label of the column to predict.
        dataframe: DataFrame containing both columns.

    Returns:
        The root mean squared error of the predictions on the test half.
    """
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.metrics import mean_squared_error
    from math import sqrt

    # Seed first so the permutation below is the same on every call.
    np.random.seed(1)
    shuffled = dataframe.reindex(np.random.permutation(dataframe.index))

    # First half trains the model, the rest is held out for testing.
    split_at = len(shuffled) // 2
    train_part = shuffled.iloc[:split_at]
    test_part = shuffled.iloc[split_at:]

    # Regressor with its default settings; double brackets keep the
    # single feature as a 2-D frame.
    model = KNeighborsRegressor()
    model.fit(train_part[[train_col]], train_part[target_col])
    predicted = model.predict(test_part[[train_col]])

    return sqrt(mean_squared_error(test_part[target_col], predicted))

#Score each feature column on its own with knn_train_test. price is
#left out of the feature list because it is the target.
train_col = normalized_cars_to_numeric.columns.drop('price')
#Map each feature column to its RMSE
rmses = {
    col: knn_train_test(col, 'price', normalized_cars_to_numeric)
    for col in train_col
}

#Show the RMSEs as a Series sorted in ascending order
rmses_series = pd.Series(rmses)
rmses_series.sort_values()


# Engine size performed best using the default k value. Now we will modify the function above to also accept k as a parameter.

# In[12]:


def updated_knn_train_test(train_col, target_col, dataframe,
                           k_vals=(1, 3, 5, 7, 9)):
    """Train single-feature k-NN regressors for several k values and score each.

    The rows of ``dataframe`` are shuffled after seeding NumPy's global
    random generator with 1. The first half of the shuffled rows is used
    for fitting and the remaining rows for evaluation. One model is fitted
    per entry in ``k_vals``.

    Args:
        train_col: Label of the one column used as the feature.
        target_col: Label of the column to predict.
        dataframe: DataFrame containing both columns.
        k_vals: Iterable of neighbor counts to evaluate. Defaults to
            1, 3, 5, 7 and 9, the values that used to be hard-coded.

    Returns:
        A dict mapping each k to the root mean squared error of that
        model's predictions on the test half.
    """
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.metrics import mean_squared_error
    from math import sqrt
    np.random.seed(1)

    #Shuffle the rows of the dataframe
    shuffled_rows = np.random.permutation(dataframe.index)
    randomized_df = dataframe.reindex(shuffled_rows)
    length = int(len(randomized_df) / 2)
    train_df = randomized_df.iloc[0:length]
    test_df = randomized_df.iloc[length:]

    #The train/test inputs do not depend on k, so select them once
    train_features = train_df[[train_col]]
    train_target = train_df[target_col]
    test_features = test_df[[train_col]]
    test_target = test_df[target_col]

    k_rmses = {}
    for k_vl in k_vals:
        knn = KNeighborsRegressor(n_neighbors=k_vl)
        knn.fit(train_features, train_target)
        predictions = knn.predict(test_features)
        k_rmses[k_vl] = sqrt(mean_squared_error(test_target, predictions))
    return k_rmses

#Score each feature column on its own with updated_knn_train_test.
#price is left out of the feature list because it is the target.
train_col = normalized_cars_to_numeric.columns.drop('price')
#Map each feature column to its {k: rmse} results
rmses = {
    col: updated_knn_train_test(col, 'price', normalized_cars_to_numeric)
    for col in train_col
}
#rmses

import matplotlib.pyplot as plt
# get_ipython only exists inside IPython/Jupyter. Skip the inline magic
# when this exported script is run with a plain Python interpreter.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

# Draw one figure per feature column: RMSE against k.
for a, b in rmses.items():
    x = list(b.keys())
    y = list(b.values())

    plt.plot(x, y)
    # Title the figure with its feature so the plots can be told apart.
    plt.title(a)
    plt.ylabel("RMSE (Price, USD)")
    plt.xlabel("k-value, number of similar prices")
    plt.show()


# The resulting plots look like this:

# In[14]:


import matplotlib.pyplot as plt
# get_ipython only exists inside IPython/Jupyter. Skip the inline magic
# when this exported script is run with a plain Python interpreter.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

# Draw one figure per feature column: RMSE against k.
for a, b in rmses.items():
    x = list(b.keys())
    y = list(b.values())

    plt.plot(x, y)
    # Title the figure with its feature so the plots can be told apart.
    plt.title(a)
    plt.ylabel("RMSE (Price, USD)")
    plt.xlabel("k-value, number of similar prices")
    plt.show()
    

# In[ ]: