In [1]:
import pandas as pd
import numpy as np

# Column names taken from the UCI "Automobile" data set documentation
# (imports-85.names) — the raw .data file has no header row.
# Fixed typo: 'srive-wheels' -> 'drive-wheels'.
cols = ['symboling', 'normalized-losses', 'make', 'fuel-type',
        'aspiration', 'num-of-doors', 'body-style', 'drive-wheels',
        'engine-location', 'wheel-base', 'length', 'width', 'height',
        'curb-weight', 'engine-type', 'num-of-cylinders', 'engine-size',
        'fuel-system', 'bore', 'stroke', 'compression-ratio', 'horsepower',
        'peak-rpm', 'city-mpg', 'highway-mpg', 'price']

cars = pd.read_csv('imports-85.data', names=cols)
cars.head(5)
Out[1]:
symboling normalized-losses make fuel-type aspiration num-of-doors body-style srive-wheels engine-location wheel-base ... engine-size fuel-system bore stroke compression-ratio horsepower peak-rpm city-mpg highway-mpg price
0 3 ? alfa-romero gas std two convertible rwd front 88.6 ... 130 mpfi 3.47 2.68 9.0 111 5000 21 27 13495
1 3 ? alfa-romero gas std two convertible rwd front 88.6 ... 130 mpfi 3.47 2.68 9.0 111 5000 21 27 16500
2 1 ? alfa-romero gas std two hatchback rwd front 94.5 ... 152 mpfi 2.68 3.47 9.0 154 5000 19 26 16500
3 2 164 audi gas std four sedan fwd front 99.8 ... 109 mpfi 3.19 3.40 10.0 102 5500 24 30 13950
4 2 164 audi gas std four sedan 4wd front 99.4 ... 136 mpfi 3.19 3.40 8.0 115 5500 18 22 17450

5 rows × 26 columns

The numeric columns are symboling, normalized-losses, wheel-base, length, width, height, curb-weight, engine-size, bore, stroke, compression-ratio, horsepower, peak-rpm, city-mpg, highway-mpg, and price. The num-of-doors and num-of-cylinders columns also describe quantities, but store them as words (e.g. "two", "four"), so they would need conversion before being used numerically. Columns that can be used as features to predict the price of a car include num-of-doors, num-of-cylinders, engine-size, city-mpg, and highway-mpg. The target column is price, since this is what we are going to predict.

In [2]:
# The raw data marks missing values with "?"; convert those markers to NaN
# so pandas recognizes them as missing.
cars = cars.replace(to_replace="?", value=np.nan)
In [3]:
cars.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
symboling            205 non-null int64
normalized-losses    164 non-null object
make                 205 non-null object
fuel-type            205 non-null object
aspiration           205 non-null object
num-of-doors         203 non-null object
body-style           205 non-null object
srive-wheels         205 non-null object
engine-location      205 non-null object
wheel-base           205 non-null float64
length               205 non-null float64
width                205 non-null float64
height               205 non-null float64
curb-weight          205 non-null int64
engine-type          205 non-null object
num-of-cylinders     205 non-null object
engine-size          205 non-null int64
fuel-system          205 non-null object
bore                 201 non-null object
stroke               201 non-null object
compression-ratio    205 non-null float64
horsepower           203 non-null object
peak-rpm             203 non-null object
city-mpg             205 non-null int64
highway-mpg          205 non-null int64
price                201 non-null object
dtypes: float64(5), int64(5), object(16)
memory usage: 41.7+ KB

Replacing "?" with NaN caused the affected columns to be stored with the object data type. In order to use the numeric columns, we will convert them to a numeric type.

In [4]:
# Columns holding continuous numeric measurements (currently object dtype
# because they previously contained the "?" placeholder).
continuous_val_cols = [
    'normalized-losses', 'wheel-base', 'length', 'width', 'height',
    'curb-weight', 'engine-size', 'bore', 'stroke', 'compression-ratio',
    'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price',
]

# astype('float') converts every selected column to float64; NaN entries
# are preserved as missing values.
cars_to_numeric = cars[continuous_val_cols].astype('float')
In [5]:
cars_to_numeric.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 15 columns):
normalized-losses    164 non-null float64
wheel-base           205 non-null float64
length               205 non-null float64
width                205 non-null float64
height               205 non-null float64
curb-weight          205 non-null float64
engine-size          205 non-null float64
bore                 201 non-null float64
stroke               201 non-null float64
compression-ratio    205 non-null float64
horsepower           203 non-null float64
peak-rpm             203 non-null float64
city-mpg             205 non-null float64
highway-mpg          205 non-null float64
price                201 non-null float64
dtypes: float64(15)
memory usage: 24.1 KB

We've now converted all of the numeric columns to float64, a numeric data type. In our analysis, we will focus on these columns.

In [6]:
# Count missing normalized-losses entries and compare against total rows.
missing_losses = cars_to_numeric['normalized-losses'].isnull().sum()
print(missing_losses)
print(cars_to_numeric.shape)
41
(205, 15)

There are 41 rows with missing values in the normalized-losses column. This is 20% of our data so we will drop this column. Let's do so below.

In [7]:
# normalized-losses is missing in ~20% of rows — too sparse to impute, so drop it.
cars_to_numeric = cars_to_numeric.drop(columns=['normalized-losses'])

Now we will consider the missing values remaining in other columns of the dataframe.

In [8]:
cars_to_numeric.isnull().sum()
Out[8]:
wheel-base           0
length               0
width                0
height               0
curb-weight          0
engine-size          0
bore                 4
stroke               4
compression-ratio    0
horsepower           2
peak-rpm             2
city-mpg             0
highway-mpg          0
price                4
dtype: int64

Each of the remaining columns is missing at most 2% of its values, so we will replace the missing entries with the mean value of the corresponding column.

In [9]:
# Impute the few remaining gaps (at most ~2% per column) with column means.
column_means = cars_to_numeric.mean()
cars_to_numeric = cars_to_numeric.fillna(column_means)

Normalize all columns except price, which will be our target column.

In [10]:
# Min-max scale every column into the [0, 1] range.
col_min = cars_to_numeric.min()
col_max = cars_to_numeric.max()
normalized_cars_to_numeric = (cars_to_numeric - col_min) / (col_max - col_min)
# Keep the target on its original dollar scale so reported RMSEs are in USD.
normalized_cars_to_numeric['price'] = cars_to_numeric['price']
In [11]:
def knn_train_test(train_col, target_col, dataframe):
    """Fit a default KNeighborsRegressor on a single feature column and
    return the RMSE on a 50/50 holdout split.

    train_col  : name of the one feature column to train on
    target_col : name of the column to predict
    dataframe  : source DataFrame (left unmodified)
    """
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.metrics import mean_squared_error
    from math import sqrt

    # Fixed seed so the shuffle — and therefore the split — is reproducible.
    np.random.seed(1)
    shuffled = dataframe.reindex(np.random.permutation(dataframe.index))

    # First half of the shuffled rows trains the model; second half tests it.
    split_at = int(len(shuffled) / 2)
    train_df = shuffled.iloc[:split_at]
    test_df = shuffled.iloc[split_at:]

    model = KNeighborsRegressor()
    model.fit(train_df[[train_col]], train_df[target_col])
    predictions = model.predict(test_df[[train_col]])
    return sqrt(mean_squared_error(test_df[target_col], predictions))

# Every numeric column except the target is a candidate feature.
train_col = normalized_cars_to_numeric.columns.drop('price')

# RMSE of a univariate k-NN model for each candidate feature.
rmses = {
    col: knn_train_test(col, 'price', normalized_cars_to_numeric)
    for col in train_col
}

# Rank the features from best (lowest RMSE) to worst.
rmses_series = pd.Series(rmses)
rmses_series.sort_values()
Out[11]:
engine-size          4244.103421
highway-mpg          4245.734567
curb-weight          4436.523561
width                5030.139338
city-mpg             5085.047194
horsepower           5092.272401
length               5418.778301
wheel-base           5743.877086
bore                 6746.031651
compression-ratio    7177.202061
height               7832.152833
peak-rpm             7965.541705
stroke               8096.653898
dtype: float64

Engine size performed best using the default k value. Now we will modify the function above to also accept k as a parameter.

In [12]:
def updated_knn_train_test(train_col, target_col, dataframe, k_values=(1, 3, 5, 7, 9)):
    """Fit KNeighborsRegressor models on a single feature column for several
    values of k and return {k: RMSE} for a 50/50 holdout split.

    train_col  : name of the one feature column to train on
    target_col : name of the column to predict
    dataframe  : source DataFrame (left unmodified)
    k_values   : iterable of n_neighbors values to evaluate
                 (previously hard-coded; now a parameter with the same
                 default so existing calls behave identically)
    """
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.metrics import mean_squared_error
    from math import sqrt

    # Fixed seed so the shuffle — and therefore the split — is reproducible.
    np.random.seed(1)
    shuffled_rows = np.random.permutation(dataframe.index)
    randomized_df = dataframe.reindex(shuffled_rows)

    # First half trains, second half tests.
    length = int(len(randomized_df) / 2)
    train_df = randomized_df.iloc[0:length]
    test_df = randomized_df.iloc[length:]

    k_rmses = {}
    for k_vl in k_values:
        knn = KNeighborsRegressor(n_neighbors=k_vl)
        knn.fit(train_df[[train_col]], train_df[target_col])
        predictions = knn.predict(test_df[[train_col]])
        k_mse = mean_squared_error(test_df[target_col], predictions)
        k_rmses[k_vl] = sqrt(k_mse)
    return k_rmses

#Use function above to calculate rmses. First drop price from
#the training dataset since it is our target.
train_col = normalized_cars_to_numeric.columns.drop('price')
#calc rmses for all train columns
rmses = {}
for col in train_col:
    rmse_val = updated_knn_train_test(col, 'price', normalized_cars_to_numeric)
    rmses[col] = rmse_val
#rmses

import matplotlib.pyplot as plt
%matplotlib inline

for a, b in rmses.items():
    x = list(b.keys())
    y = list(b.values())
    
    plt.plot(x, y)
    plt.ylabel("RMSE (Price, USD)")
    plt.xlabel("k-value, number of similar prices")
    plt.show()
Out[12]:
{'bore': {1: 7930.548590614734,
  3: 6353.8909942494565,
  5: 6746.031650636273,
  7: 7246.705124103924,
  9: 7184.417703627346},
 'city-mpg': {1: 6449.848972168971,
  3: 4808.843642391777,
  5: 5085.0471938761675,
  7: 4958.158653064454,
  9: 5021.788908379828},
 'compression-ratio': {1: 7255.772135357892,
  3: 7485.226183865361,
  5: 7177.202060550313,
  7: 7333.967327565648,
  9: 7426.77234607814},
 'curb-weight': {1: 5842.169653813219,
  3: 4527.849193303364,
  5: 4436.523561278551,
  7: 4320.057970537657,
  9: 4181.086220663609},
 'engine-size': {1: 4775.980594973188,
  3: 4211.354845313718,
  5: 4244.103420588581,
  7: 4028.227456215567,
  9: 4125.116174076774},
 'height': {1: 10653.050778292903,
  3: 8176.934988463841,
  5: 7832.152832500197,
  7: 7743.870132519971,
  9: 7632.665665042622},
 'highway-mpg': {1: 4445.090471315778,
  3: 4442.016315845971,
  5: 4245.734566600798,
  7: 4266.445995724475,
  9: 4522.647593669363},
 'horsepower': {1: 5261.091645623908,
  3: 4995.302906280692,
  5: 5092.272401434547,
  7: 5025.308887429083,
  9: 4989.9865689273765},
 'length': {1: 6853.715819594109,
  3: 5113.209767682691,
  5: 5418.778300690459,
  7: 5618.149068803402,
  9: 5641.426309574451},
 'peak-rpm': {1: 9386.46161279999,
  3: 8096.1146568480835,
  5: 7965.54170513815,
  7: 8256.807774278761,
  9: 8101.58286959599},
 'stroke': {1: 7345.451363903154,
  3: 7455.212313651712,
  5: 8096.65389823726,
  7: 7935.188913776914,
  9: 8066.594604492951},
 'wheel-base': {1: 5400.51418967299,
  3: 5629.538534558065,
  5: 5743.8770860517825,
  7: 6048.744597831812,
  9: 6360.529322830884},
 'width': {1: 5739.245757378005,
  3: 5291.949721784998,
  5: 5030.139338284197,
  7: 4933.6396353832,
  9: 4951.829909070571}}

The resulting plots look like this:

In [14]:
import matplotlib.pyplot as plt
%matplotlib inline

for a, b in rmses.items():
    x = list(b.keys())
    y = list(b.values())
    
    plt.plot(x, y)
    plt.ylabel("RMSE (Price, USD)")
    plt.xlabel("k-value, number of similar prices")
    plt.show()
    
In [ ]: