import pandas as pd
import numpy as np
# Column names for the automobile data file, in file order.
# Fixed: 'srive-wheels' was a typo for 'drive-wheels'.
cols = [
    'symboling', 'normalized-losses', 'make', 'fuel-type',
    'aspiration', 'num-of-doors', 'body-style', 'drive-wheels',
    'engine-location', 'wheel-base', 'length', 'width', 'height',
    'curb-weight', 'engine-type', 'num-of-cylinders', 'engine-size',
    'fuel-system', 'bore', 'stroke', 'compression-ratio', 'horsepower',
    'peak-rpm', 'city-mpg', 'highway-mpg', 'price',
]

# Passing `names` makes pandas treat every line of the file as data and
# use the list above as the header.
cars = pd.read_csv('imports-85.data', names=cols)
cars.head(5)
The numeric columns are symboling, normalized-losses, wheel-base, length, width, height, curb-weight, engine-size, bore, stroke, compression-ratio, horsepower, peak-rpm, city-mpg, highway-mpg, and price. Two more columns, num-of-doors and num-of-cylinders, describe quantities but may be stored as text rather than numbers, so they would need to be converted before use. The ones that can be used as features to predict the price of a car are num-of-doors, num-of-cylinders, engine-size, city-mpg, and highway-mpg. The target column is price, since this is what we are going to predict.
# Turn every "?" placeholder into a proper missing value, then inspect
# the resulting column dtypes and non-null counts.
cars = cars.replace(to_replace="?", value=np.nan)
cars.info()
Even after replacing "?" with NaN, the affected columns are still stored with the object data type. In order to use the numeric columns, we will convert them to a numeric type.
# Columns holding continuous values; only these are carried forward.
continuous_val_cols = [
    'normalized-losses', 'wheel-base',
    'length', 'width', 'height', 'curb-weight', 'engine-size',
    'bore', 'stroke', 'compression-ratio', 'horsepower',
    'peak-rpm', 'city-mpg', 'highway-mpg', 'price',
]

# Select the continuous columns and cast them all to floating point.
cars_to_numeric = cars[continuous_val_cols].astype(float)
cars_to_numeric.info()
We've now converted all of the numeric columns to float64, a numeric data type. In our analysis, we will focus on these columns.
# Count the missing entries in 'normalized-losses' and show the overall
# (rows, columns) shape for comparison.
print(cars_to_numeric['normalized-losses'].isna().sum())
print(cars_to_numeric.shape)
There are 41 rows with missing values in the normalized-losses column. This is 20% of our data so we will drop this column. Let's do so below.
# Remove the 'normalized-losses' column from the working dataframe.
cars_to_numeric = cars_to_numeric.drop(columns='normalized-losses')
Now we will consider the missing values remaining in other columns of the dataframe.
# Per-column count of the missing values that remain.
cars_to_numeric.isna().sum()
In each of the remaining columns, at most about 2% of the values are missing, so we will replace them with the mean of their column.
# Fill each remaining gap with the mean of its own column.
cars_to_numeric = cars_to_numeric.fillna(value=cars_to_numeric.mean())
Normalize all columns except for price, which will be our target column.
# Min-max scale every column: (value - column min) / (column max - column min).
normalized_cars_to_numeric = cars_to_numeric.sub(cars_to_numeric.min()).div(
    cars_to_numeric.max() - cars_to_numeric.min()
)
# The target keeps its original, unscaled values, so overwrite the scaled
# price column with the raw one.
normalized_cars_to_numeric['price'] = cars_to_numeric['price']
def knn_train_test(train_col, target_col, dataframe):
    """Fit a default k-nearest-neighbors regressor on one column and score it.

    The rows of ``dataframe`` are shuffled with a fixed seed and split at
    the midpoint: the first part trains the model and the rest is used to
    evaluate it.

    Args:
        train_col: Name of the single feature column.
        target_col: Name of the column to predict.
        dataframe: DataFrame containing both columns.

    Returns:
        The root mean squared error of the predictions on the test part.
    """
    from math import sqrt
    from sklearn.metrics import mean_squared_error
    from sklearn.neighbors import KNeighborsRegressor

    # Fixed seed so every call produces the same shuffle (this reseeds
    # NumPy's global random state).
    np.random.seed(1)
    shuffled = dataframe.reindex(np.random.permutation(dataframe.index))

    split_at = len(shuffled) // 2
    train_part = shuffled.iloc[:split_at]
    test_part = shuffled.iloc[split_at:]

    model = KNeighborsRegressor()
    model.fit(train_part[[train_col]], train_part[target_col])
    predicted = model.predict(test_part[[train_col]])

    return sqrt(mean_squared_error(test_part[target_col], predicted))
# Every column except the target ('price') is tried as the lone feature.
train_col = normalized_cars_to_numeric.columns.drop('price')

# Map each feature column to the RMSE it achieves on its own.
rmses = {
    col: knn_train_test(col, 'price', normalized_cars_to_numeric)
    for col in train_col
}

# Show the results as a Series ordered from lowest to highest RMSE.
rmses_series = pd.Series(rmses)
rmses_series.sort_values()
Engine size performed best using the default k value. Now we will modify the function above to also accept k as a parameter.
def updated_knn_train_test(train_col, target_col, dataframe,
                           k_vals=(1, 3, 5, 7, 9)):
    """Score a single-feature k-nearest-neighbors regressor for several k.

    The rows of ``dataframe`` are shuffled with a fixed seed and split at
    the midpoint into a training part and a test part. One model is then
    fitted and evaluated for every value in ``k_vals``.

    Args:
        train_col: Name of the single feature column.
        target_col: Name of the column to predict.
        dataframe: DataFrame containing both columns.
        k_vals: Iterable of neighbor counts to try. Defaults to
            ``(1, 3, 5, 7, 9)``, the values that were previously hard-coded.

    Returns:
        A dict mapping each k to the root mean squared error obtained on
        the test part.
    """
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.metrics import mean_squared_error
    from math import sqrt

    # Fixed seed so every call produces the same shuffle (this reseeds
    # NumPy's global random state).
    np.random.seed(1)
    shuffled_rows = np.random.permutation(dataframe.index)
    randomized_df = dataframe.reindex(shuffled_rows)

    length = len(randomized_df) // 2
    train_df = randomized_df.iloc[:length]
    test_df = randomized_df.iloc[length:]

    # The split does not depend on k, so select the inputs once.
    train_features = train_df[[train_col]]
    train_target = train_df[target_col]
    test_features = test_df[[train_col]]
    test_target = test_df[target_col]

    k_rmses = {}
    for k_vl in k_vals:
        knn = KNeighborsRegressor(n_neighbors=k_vl)
        knn.fit(train_features, train_target)
        predictions = knn.predict(test_features)
        k_rmses[k_vl] = sqrt(mean_squared_error(test_target, predictions))
    return k_rmses
# Every column except the target ('price') is tried as the lone feature.
train_col = normalized_cars_to_numeric.columns.drop('price')

# Map each feature column to its {k: RMSE} results.
rmses = {
    col: updated_knn_train_test(col, 'price', normalized_cars_to_numeric)
    for col in train_col
}
import matplotlib.pyplot as plt
%matplotlib inline
for a, b in rmses.items():
x = list(b.keys())
y = list(b.values())
plt.plot(x, y)
plt.ylabel("RMSE (Price, USD)")
plt.xlabel("k-value, number of similar prices")
plt.show()
The resulting plot looks like this:
import matplotlib.pyplot as plt
%matplotlib inline
for a, b in rmses.items():
x = list(b.keys())
y = list(b.values())
plt.plot(x, y)
plt.ylabel("RMSE (Price, USD)")
plt.xlabel("k-value, number of similar prices")
plt.show()