import pandas as pd
import numpy as np
cols = ['symboling', 'normalized-losses', 'make', 'fuel-type',
'aspiration', 'num-of-doors', 'body-style', 'drive-wheels',
'engine-location', 'wheel-base', 'length', 'width', 'height',
'curb-weight', 'engine-type', 'num-of-cylinders', 'engine-size',
'fuel-system', 'bore', 'stroke', 'compression-ratio', 'horsepower',
'peak-rpm', 'city-mpg', 'highway-mpg', 'price']
cars = pd.read_csv('imports-85.data', names=cols)
cars.head(5)
| | symboling | normalized-losses | make | fuel-type | aspiration | num-of-doors | body-style | drive-wheels | engine-location | wheel-base | ... | engine-size | fuel-system | bore | stroke | compression-ratio | horsepower | peak-rpm | city-mpg | highway-mpg | price |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3 | ? | alfa-romero | gas | std | two | convertible | rwd | front | 88.6 | ... | 130 | mpfi | 3.47 | 2.68 | 9.0 | 111 | 5000 | 21 | 27 | 13495 |
| 1 | 3 | ? | alfa-romero | gas | std | two | convertible | rwd | front | 88.6 | ... | 130 | mpfi | 3.47 | 2.68 | 9.0 | 111 | 5000 | 21 | 27 | 16500 |
| 2 | 1 | ? | alfa-romero | gas | std | two | hatchback | rwd | front | 94.5 | ... | 152 | mpfi | 2.68 | 3.47 | 9.0 | 154 | 5000 | 19 | 26 | 16500 |
| 3 | 2 | 164 | audi | gas | std | four | sedan | fwd | front | 99.8 | ... | 109 | mpfi | 3.19 | 3.40 | 10.0 | 102 | 5500 | 24 | 30 | 13950 |
| 4 | 2 | 164 | audi | gas | std | four | sedan | 4wd | front | 99.4 | ... | 136 | mpfi | 3.19 | 3.40 | 8.0 | 115 | 5500 | 18 | 22 | 17450 |
5 rows × 26 columns
The numeric columns are symboling, normalized-losses, wheel-base, length, width, height, curb-weight, engine-size, bore, stroke, compression-ratio, horsepower, peak-rpm, city-mpg, highway-mpg, and price. Note that num-of-doors and num-of-cylinders store their values as words ('two', 'four', and so on), so they cannot be used as numeric features without conversion. The continuous columns, from normalized-losses through highway-mpg, can be used as features to predict the price of a car. The target column is price, since this is what we are going to predict.
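As an aside, if we did want to use num-of-doors and num-of-cylinders as features, we could map the words to integers first. A minimal sketch, not applied in this analysis (the mapping dictionary is our own):

# Hypothetical word-to-integer mapping for the two word-valued columns
word_to_num = {'two': 2, 'three': 3, 'four': 4, 'five': 5,
               'six': 6, 'eight': 8, 'twelve': 12}
# cars['num-of-doors'] = cars['num-of-doors'].map(word_to_num)
# cars['num-of-cylinders'] = cars['num-of-cylinders'].map(word_to_num)

First, we replace the "?" placeholders with NaN so that the affected columns can later be converted to a numeric type.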
cars = cars.replace("?", np.nan)
cars.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
symboling            205 non-null int64
normalized-losses    164 non-null object
make                 205 non-null object
fuel-type            205 non-null object
aspiration           205 non-null object
num-of-doors         203 non-null object
body-style           205 non-null object
drive-wheels         205 non-null object
engine-location      205 non-null object
wheel-base           205 non-null float64
length               205 non-null float64
width                205 non-null float64
height               205 non-null float64
curb-weight          205 non-null int64
engine-type          205 non-null object
num-of-cylinders     205 non-null object
engine-size          205 non-null int64
fuel-system          205 non-null object
bore                 201 non-null object
stroke               201 non-null object
compression-ratio    205 non-null float64
horsepower           203 non-null object
peak-rpm             203 non-null object
city-mpg             205 non-null int64
highway-mpg          205 non-null int64
price                201 non-null object
dtypes: float64(5), int64(5), object(16)
memory usage: 41.7+ KB
Replacing "?" to NaN caused the affected columns to be converted to object data type. In order to use the numeric columns, we will convert them to the numeric type.
continuous_val_cols = ['normalized-losses', 'wheel-base',
'length', 'width', 'height', 'curb-weight', 'engine-size',
'bore', 'stroke', 'compression-ratio', 'horsepower',
'peak-rpm', 'city-mpg', 'highway-mpg', 'price']
cars_to_numeric = cars[continuous_val_cols].astype('float')
cars_to_numeric.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 15 columns):
normalized-losses    164 non-null float64
wheel-base           205 non-null float64
length               205 non-null float64
width                205 non-null float64
height               205 non-null float64
curb-weight          205 non-null float64
engine-size          205 non-null float64
bore                 201 non-null float64
stroke               201 non-null float64
compression-ratio    205 non-null float64
horsepower           203 non-null float64
peak-rpm             203 non-null float64
city-mpg             205 non-null float64
highway-mpg          205 non-null float64
price                201 non-null float64
dtypes: float64(15)
memory usage: 24.1 KB
We've now converted all of the numeric columns to float64, a numeric data type. In our analysis, we will focus on these columns.
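As a side note, pd.to_numeric with errors='coerce' would accomplish the same conversion in one step, since it turns any unparseable string (such as "?") into NaN:

# Hedged alternative: coerce each column to numeric, mapping
# non-numeric strings straight to NaN in a single step
# cars_to_numeric = cars[continuous_val_cols].apply(pd.to_numeric, errors='coerce')

Next, let's count the missing values in the normalized-losses column.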
print(cars_to_numeric['normalized-losses'].isnull().sum())
print(cars_to_numeric.shape)
41
(205, 15)
There are 41 rows with missing values in the normalized-losses column. That is 20% of our data (41 of 205 rows), so rather than lose that many rows or impute a fifth of the column, we will drop the column entirely. Let's do so below.
cars_to_numeric = cars_to_numeric.drop('normalized-losses', axis=1)
Now we will consider the missing values remaining in other columns of the dataframe.
cars_to_numeric.isnull().sum()
wheel-base           0
length               0
width                0
height               0
curb-weight          0
engine-size          0
bore                 4
stroke               4
compression-ratio    0
horsepower           2
peak-rpm             2
city-mpg             0
highway-mpg          0
price                4
dtype: int64
The remaining missing values make up at most 2% of any column (4 of 205 values), so we will replace them with the mean of each column.
cars_to_numeric = cars_to_numeric.fillna(cars_to_numeric.mean())
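A quick sanity check confirms that no missing values remain:

# Should print 0 now that every NaN has been filled with a column mean
print(cars_to_numeric.isnull().sum().sum())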
Next, we normalize all of the columns to the 0-1 range using min-max scaling, except for price, which will be our target column.
normalized_cars_to_numeric = (cars_to_numeric - cars_to_numeric.min()) / (cars_to_numeric.max() - cars_to_numeric.min())
# Restore the original price values, since price is the target, not a feature
normalized_cars_to_numeric['price'] = cars_to_numeric['price']
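To verify the scaling, every feature column should now span the 0-1 range while price keeps its original dollar values:

# Each normalized feature should have min 0.0 and max 1.0
features_only = normalized_cars_to_numeric.drop('price', axis=1)
print(features_only.min().min(), features_only.max().max())

Now we define a function that trains and tests a univariate k-nearest neighbors model on a given feature column, using a 50/50 train/test split, and returns the RMSE of its price predictions.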
def knn_train_test(train_col, target_col, dataframe):
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.metrics import mean_squared_error
    from math import sqrt

    np.random.seed(1)
    # Shuffle the rows of the dataframe
    shuffled_rows = np.random.permutation(dataframe.index)
    randomized_df = dataframe.reindex(shuffled_rows)
    # Split the shuffled rows 50/50 into train and test sets
    length = int(len(randomized_df) / 2)
    train_df = randomized_df.iloc[0:length]
    test_df = randomized_df.iloc[length:]
    # Fit on the training half and evaluate RMSE on the test half
    knn = KNeighborsRegressor()
    train_features = train_df[[train_col]]
    train_target = train_df[target_col]
    knn.fit(train_features, train_target)
    predictions = knn.predict(test_df[[train_col]])
    mse = mean_squared_error(test_df[target_col], predictions)
    rmse = sqrt(mse)
    return rmse
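As an aside, the 50/50 holdout split inside the function could also be written with scikit-learn's train_test_split helper; a minimal sketch, not used here (its shuffling differs from np.random.permutation, so the exact RMSE values would change):

# from sklearn.model_selection import train_test_split
# train_df, test_df = train_test_split(dataframe, test_size=0.5, random_state=1)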
# Use the function above to calculate an RMSE for each feature.
# First drop price from the candidate columns, since it is the target.
train_cols = normalized_cars_to_numeric.columns.drop('price')

# Calculate the RMSE for every candidate feature
rmses = {}
for col in train_cols:
    rmse_val = knn_train_test(col, 'price', normalized_cars_to_numeric)
    rmses[col] = rmse_val
# View the RMSE results in sorted order
rmses_series = pd.Series(rmses)
rmses_series.sort_values()
engine-size          4244.103421
highway-mpg          4245.734567
curb-weight          4436.523561
width                5030.139338
city-mpg             5085.047194
horsepower           5092.272401
length               5418.778301
wheel-base           5743.877086
bore                 6746.031651
compression-ratio    7177.202061
height               7832.152833
peak-rpm             7965.541705
stroke               8096.653898
dtype: float64
Engine size performed best using the default k value (5). Now we will modify the function above to also accept the set of k values to test as a parameter, defaulting to k = 1, 3, 5, 7, and 9.
def updated_knn_train_test(train_col, target_col, dataframe, k_vals=(1, 3, 5, 7, 9)):
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.metrics import mean_squared_error
    from math import sqrt

    np.random.seed(1)
    # Shuffle the rows of the dataframe
    shuffled_rows = np.random.permutation(dataframe.index)
    randomized_df = dataframe.reindex(shuffled_rows)
    # Split the shuffled rows 50/50 into train and test sets
    length = int(len(randomized_df) / 2)
    train_df = randomized_df.iloc[0:length]
    test_df = randomized_df.iloc[length:]
    # Fit and evaluate one model per k value
    k_rmses = {}
    for k_val in k_vals:
        knn = KNeighborsRegressor(n_neighbors=k_val)
        train_features = train_df[[train_col]]
        train_target = train_df[target_col]
        knn.fit(train_features, train_target)
        predictions = knn.predict(test_df[[train_col]])
        k_mse = mean_squared_error(test_df[target_col], predictions)
        k_rmse = sqrt(k_mse)
        k_rmses[k_val] = k_rmse
    return k_rmses
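Calling the updated function on a single feature returns a dictionary mapping each k value to its RMSE. For example, for engine-size:

updated_knn_train_test('engine-size', 'price', normalized_cars_to_numeric)
# -> {1: 4775.98, 3: 4211.35, 5: 4244.10, 7: 4028.23, 9: 4125.12}
# (rounded; full-precision values appear in the results below)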
# Recompute the RMSE values, this time across the range of k values.
# As before, drop price from the candidate columns, since it is the target.
train_cols = normalized_cars_to_numeric.columns.drop('price')

rmses = {}
for col in train_cols:
    rmse_val = updated_knn_train_test(col, 'price', normalized_cars_to_numeric)
    rmses[col] = rmse_val
rmses
{'bore': {1: 7930.548590614734, 3: 6353.8909942494565, 5: 6746.031650636273, 7: 7246.705124103924, 9: 7184.417703627346},
 'city-mpg': {1: 6449.848972168971, 3: 4808.843642391777, 5: 5085.0471938761675, 7: 4958.158653064454, 9: 5021.788908379828},
 'compression-ratio': {1: 7255.772135357892, 3: 7485.226183865361, 5: 7177.202060550313, 7: 7333.967327565648, 9: 7426.77234607814},
 'curb-weight': {1: 5842.169653813219, 3: 4527.849193303364, 5: 4436.523561278551, 7: 4320.057970537657, 9: 4181.086220663609},
 'engine-size': {1: 4775.980594973188, 3: 4211.354845313718, 5: 4244.103420588581, 7: 4028.227456215567, 9: 4125.116174076774},
 'height': {1: 10653.050778292903, 3: 8176.934988463841, 5: 7832.152832500197, 7: 7743.870132519971, 9: 7632.665665042622},
 'highway-mpg': {1: 4445.090471315778, 3: 4442.016315845971, 5: 4245.734566600798, 7: 4266.445995724475, 9: 4522.647593669363},
 'horsepower': {1: 5261.091645623908, 3: 4995.302906280692, 5: 5092.272401434547, 7: 5025.308887429083, 9: 4989.9865689273765},
 'length': {1: 6853.715819594109, 3: 5113.209767682691, 5: 5418.778300690459, 7: 5618.149068803402, 9: 5641.426309574451},
 'peak-rpm': {1: 9386.46161279999, 3: 8096.1146568480835, 5: 7965.54170513815, 7: 8256.807774278761, 9: 8101.58286959599},
 'stroke': {1: 7345.451363903154, 3: 7455.212313651712, 5: 8096.65389823726, 7: 7935.188913776914, 9: 8066.594604492951},
 'wheel-base': {1: 5400.51418967299, 3: 5629.538534558065, 5: 5743.8770860517825, 7: 6048.744597831812, 9: 6360.529322830884},
 'width': {1: 5739.245757378005, 3: 5291.949721784998, 5: 5030.139338284197, 7: 4933.6396353832, 9: 4951.829909070571}}
Plotting RMSE against k for each feature makes these results easier to compare:
import matplotlib.pyplot as plt
%matplotlib inline

# Plot one RMSE-versus-k curve per feature
for col, k_rmse in rmses.items():
    x = list(k_rmse.keys())
    y = list(k_rmse.values())
    plt.plot(x, y)

plt.ylabel("RMSE (Price, USD)")
plt.xlabel("k value (number of nearest neighbors)")
plt.show()
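A natural next step is to rank the features by their average RMSE across the tested k values, for example to pick candidates for a multivariate model; a minimal sketch:

# Average RMSE over k = 1, 3, 5, 7, 9 for each feature
avg_rmses = {feature: np.mean(list(k_rmse.values()))
             for feature, k_rmse in rmses.items()}
pd.Series(avg_rmses).sort_values()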