Predicting Car Prices

In [1]:
from IPython.display import display, HTML
display(HTML(data="""
<style>
    div#notebook-container    { width: 85%; }
    div#menubar-container     { width: 65%; }
    div#maintoolbar-container { width: 99%; }
</style>
"""))
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

from scipy.spatial import distance

from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import KFold, cross_val_score

Introduction to the Data Set

The notebook uses the 1985 Automobile Data Set from the UCI Machine Learning Repository (imports-85.data): 205 cars described by 26 attributes, with missing values marked by '?'.

In [2]:
columns = ['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style', 'drive-wheels', 'engine-location', 
           'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type', 'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 
           'stroke', 'compression-rate', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price']
cars = pd.read_csv('imports-85.data', names=columns)
In [3]:
cars.head(3)
Out[3]:
symboling normalized-losses make fuel-type aspiration num-of-doors body-style drive-wheels engine-location wheel-base ... engine-size fuel-system bore stroke compression-rate horsepower peak-rpm city-mpg highway-mpg price
0 3 ? alfa-romero gas std two convertible rwd front 88.6 ... 130 mpfi 3.47 2.68 9.0 111 5000 21 27 13495
1 3 ? alfa-romero gas std two convertible rwd front 88.6 ... 130 mpfi 3.47 2.68 9.0 111 5000 21 27 16500
2 1 ? alfa-romero gas std two hatchback rwd front 94.5 ... 152 mpfi 2.68 3.47 9.0 154 5000 19 26 16500

3 rows × 26 columns

In [4]:
numerical_values = ['normalized-losses', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-size', 'bore', 'stroke', 'compression-rate', 
            'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price']
cars_numerical_values = cars[numerical_values].copy()  # copy() so later edits don't trigger SettingWithCopyWarning
In [5]:
cars_numerical_values.head(3)
Out[5]:
normalized-losses wheel-base length width height curb-weight engine-size bore stroke compression-rate horsepower peak-rpm city-mpg highway-mpg price
0 ? 88.6 168.8 64.1 48.8 2548 130 3.47 2.68 9.0 111 5000 21 27 13495
1 ? 88.6 168.8 64.1 48.8 2548 130 3.47 2.68 9.0 111 5000 21 27 16500
2 ? 94.5 171.2 65.5 52.4 2823 152 2.68 3.47 9.0 154 5000 19 26 16500

Data Cleaning

In [6]:
cars_numerical_values.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 15 columns):
normalized-losses    205 non-null object
wheel-base           205 non-null float64
length               205 non-null float64
width                205 non-null float64
height               205 non-null float64
curb-weight          205 non-null int64
engine-size          205 non-null int64
bore                 205 non-null object
stroke               205 non-null object
compression-rate     205 non-null float64
horsepower           205 non-null object
peak-rpm             205 non-null object
city-mpg             205 non-null int64
highway-mpg          205 non-null int64
price                205 non-null object
dtypes: float64(5), int64(4), object(6)
memory usage: 24.1+ KB
In [7]:
cars_numerical_values = cars_numerical_values.replace('?', np.nan)  # '?' is this data set's missing-value marker
In [8]:
cars_numerical_values
Out[8]:
normalized-losses wheel-base length width height curb-weight engine-size bore stroke compression-rate horsepower peak-rpm city-mpg highway-mpg price
0 NaN 88.6 168.8 64.1 48.8 2548 130 3.47 2.68 9.0 111 5000 21 27 13495
1 NaN 88.6 168.8 64.1 48.8 2548 130 3.47 2.68 9.0 111 5000 21 27 16500
2 NaN 94.5 171.2 65.5 52.4 2823 152 2.68 3.47 9.0 154 5000 19 26 16500
3 164 99.8 176.6 66.2 54.3 2337 109 3.19 3.40 10.0 102 5500 24 30 13950
4 164 99.4 176.6 66.4 54.3 2824 136 3.19 3.40 8.0 115 5500 18 22 17450
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
200 95 109.1 188.8 68.9 55.5 2952 141 3.78 3.15 9.5 114 5400 23 28 16845
201 95 109.1 188.8 68.8 55.5 3049 141 3.78 3.15 8.7 160 5300 19 25 19045
202 95 109.1 188.8 68.9 55.5 3012 173 3.58 2.87 8.8 134 5500 18 23 21485
203 95 109.1 188.8 68.9 55.5 3217 145 3.01 3.40 23.0 106 4800 26 27 22470
204 95 109.1 188.8 68.9 55.5 3062 141 3.78 3.15 9.5 114 5400 19 25 22625

205 rows × 15 columns

In [9]:
cars_numerical_values.describe()
Out[9]:
wheel-base length width height curb-weight engine-size compression-rate city-mpg highway-mpg
count 205.000000 205.000000 205.000000 205.000000 205.000000 205.000000 205.000000 205.000000 205.000000
mean 98.756585 174.049268 65.907805 53.724878 2555.565854 126.907317 10.142537 25.219512 30.751220
std 6.021776 12.337289 2.145204 2.443522 520.680204 41.642693 3.972040 6.542142 6.886443
min 86.600000 141.100000 60.300000 47.800000 1488.000000 61.000000 7.000000 13.000000 16.000000
25% 94.500000 166.300000 64.100000 52.000000 2145.000000 97.000000 8.600000 19.000000 25.000000
50% 97.000000 173.200000 65.500000 54.100000 2414.000000 120.000000 9.000000 24.000000 30.000000
75% 102.400000 183.100000 66.900000 55.500000 2935.000000 141.000000 9.400000 30.000000 34.000000
max 120.900000 208.100000 72.300000 59.800000 4066.000000 326.000000 23.000000 49.000000 54.000000
In [10]:
# convert every column to float; several were read as object dtype because of the '?' markers
cars_numerical_values = cars_numerical_values.astype(float)
In [11]:
# drop rows with a missing price, since price is the target we're trying to predict
cars_numerical_values = cars_numerical_values.dropna(subset=['price'])
cars_numerical_values.isnull().sum()
Out[11]:
normalized-losses    37
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
engine-size           0
bore                  4
stroke                4
compression-rate      0
horsepower            2
peak-rpm              2
city-mpg              0
highway-mpg           0
price                 0
dtype: int64
In [12]:
# fill the remaining missing values with each column's mean
cars_numerical_values = cars_numerical_values.fillna(cars_numerical_values.mean())
cars_numerical_values.isnull().sum()
Out[12]:
normalized-losses    0
wheel-base           0
length               0
width                0
height               0
curb-weight          0
engine-size          0
bore                 0
stroke               0
compression-rate     0
horsepower           0
peak-rpm             0
city-mpg             0
highway-mpg          0
price                0
dtype: int64
In [13]:
# normalize the features with min-max scaling; keep price (the target) in its original units
price_column = cars_numerical_values['price']
cars_numerical_values = (cars_numerical_values - cars_numerical_values.min())/(cars_numerical_values.max() - cars_numerical_values.min())
cars_numerical_values['price'] = price_column
cars_numerical_values.head(3)
Out[13]:
normalized-losses wheel-base length width height curb-weight engine-size bore stroke compression-rate horsepower peak-rpm city-mpg highway-mpg price
0 0.298429 0.058309 0.413433 0.324786 0.083333 0.411171 0.260377 0.664286 0.290476 0.125 0.294393 0.346939 0.222222 0.289474 13495.0
1 0.298429 0.058309 0.413433 0.324786 0.083333 0.411171 0.260377 0.664286 0.290476 0.125 0.294393 0.346939 0.222222 0.289474 16500.0
2 0.298429 0.230321 0.449254 0.444444 0.383333 0.517843 0.343396 0.100000 0.666667 0.125 0.495327 0.346939 0.166667 0.263158 16500.0
In [14]:
cars_numerical_values.describe()
Out[14]:
normalized-losses wheel-base length width height curb-weight engine-size bore stroke compression-rate horsepower peak-rpm city-mpg highway-mpg price
count 201.000000 201.000000 201.000000 201.000000 201.000000 201.000000 201.000000 201.000000 201.000000 201.000000 201.000000 201.000000 201.000000 201.000000 201.000000
mean 0.298429 0.355598 0.494045 0.477697 0.497222 0.414145 0.248587 0.564793 0.565192 0.197767 0.258864 0.394934 0.338308 0.386489 13207.129353
std 0.167520 0.176862 0.183913 0.179613 0.203985 0.200658 0.156781 0.191480 0.150499 0.250310 0.174606 0.195148 0.178423 0.179346 7947.066342
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 5118.000000
25% 0.188482 0.230321 0.383582 0.324786 0.350000 0.264158 0.139623 0.435714 0.495238 0.100000 0.102804 0.265306 0.166667 0.236842 7775.000000
50% 0.298429 0.303207 0.479104 0.444444 0.525000 0.359193 0.222642 0.550000 0.580952 0.125000 0.219626 0.394934 0.305556 0.368421 10295.000000
75% 0.376963 0.460641 0.632836 0.538462 0.641667 0.557797 0.301887 0.742857 0.638095 0.150000 0.317757 0.551020 0.472222 0.473684 16500.000000
max 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 45400.000000
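The describe() output confirms that every feature now spans [0, 1] while price keeps its dollar scale. As a quick sanity check, here is the min-max formula applied by hand to one value, using the wheel-base extremes (86.6 and 120.9) from the earlier describe() output:

x, x_min, x_max = 88.6, 86.6, 120.9
scaled = (x - x_min) / (x_max - x_min)
print(round(scaled, 6))  # 0.058309 -- matches wheel-base in row 0 above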

Univariate Model

A univariate model predicts the target from a single variable (feature).
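Before the scikit-learn version below, here is a minimal hand-rolled sketch of the idea behind KNN regression on one feature (illustrative only; manual_knn_predict is a hypothetical helper, not part of the pipeline):

def manual_knn_predict(feature_values, prices, query, k=5):
    # distance from the query to every training car on this one feature
    distances = (feature_values - query).abs()
    nearest = distances.sort_values().index[:k]  # indices of the k closest cars
    return prices.loc[nearest].mean()            # prediction = mean of their prices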

In [15]:
def knn_train_test(train, target, k):
    # 2-fold cross-validation with a fixed seed for reproducible splits
    kf = KFold(n_splits=2, shuffle=True, random_state=1)
    knn = KNeighborsRegressor(n_neighbors=k)
    # cross_val_score returns negated MSEs, hence the abs() before the square root
    mses = cross_val_score(knn, train, target, scoring="neg_mean_squared_error", cv=kf)
    rmse = np.mean(np.sqrt(np.abs(mses)))
    return rmse
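As a usage sketch (assuming the cells above have run), a single call scoring engine-size with k = 5 reproduces the 3364 RMSE that tops the ranking below:

knn_train_test(cars_numerical_values[['engine-size']], cars_numerical_values['price'], 5)  # ~3364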
In [16]:
# loop through each feature on its own to see which performs best
features = cars_numerical_values.columns.drop('price').to_list()
target = cars_numerical_values['price']
k = 5
features_rmse = {}

for feature in features:
    rmse = knn_train_test(cars_numerical_values[[feature]], target, k)
    features_rmse[feature] = int(rmse)

# convert the dictionary to a Series so the results are easy to sort and inspect
features_rmse = pd.Series(features_rmse)
features_rmse.sort_values()
Out[16]:
engine-size          3364
horsepower           3983
curb-weight          4130
highway-mpg          4336
width                4480
city-mpg             4788
length               5645
wheel-base           5709
bore                 6561
compression-rate     6875
normalized-losses    7482
peak-rpm             7721
height               7735
stroke               7768
dtype: int64
In [17]:
k_range = range(1, 31)
i_list = []

features = cars_numerical_values.columns.drop('price').to_list()
target = cars_numerical_values['price']
columns = ['k', 'feature', 'score']

# score every feature at every k from 1 to 30
for k in k_range:
    for feature in features:
        rmse = knn_train_test(cars_numerical_values[[feature]], target, k)
        i_list.append([k, feature, int(rmse)])

k_features_scores = pd.DataFrame(i_list, columns=columns)
k_features_scores
Out[17]:
k feature score
0 1 normalized-losses 10243
1 1 wheel-base 5676
2 1 length 5289
3 1 width 4657
4 1 height 10146
... ... ... ...
415 30 compression-rate 7634
416 30 horsepower 5428
417 30 peak-rpm 7832
418 30 city-mpg 5596
419 30 highway-mpg 5359

420 rows × 3 columns

In [18]:
k_features_scores.sort_values(by=['score']).head(10)
Out[18]:
k feature score
34 3 engine-size 3282
48 4 engine-size 3327
62 5 engine-size 3364
76 6 engine-size 3487
20 2 engine-size 3519
90 7 engine-size 3650
104 8 engine-size 3697
24 2 horsepower 3802
118 9 engine-size 3841
66 5 horsepower 3983
In [19]:
plt.figure(figsize=(20, 10))
# one line per feature; plotting the raw frame would interleave every feature into a single zigzag
for feature in features:
    subset = k_features_scores[k_features_scores['feature'] == feature]
    plt.plot(subset['k'], subset['score'], label=feature)
plt.legend()
plt.xlabel('K Value for KNN')
plt.ylabel('Cross Validation RMSE')
Out[19]:
Text(0, 0.5, 'Cross Validation RMSE')
In [20]:
# assessing which k value performs best, averaged across features
k_means = k_features_scores.drop(columns='feature').groupby('k').mean()
k_means.sort_values('score').head(3)
Out[20]:
score
k
4 5699.000000
3 5745.785714
5 5755.500000
In [21]:
# assessing which feature produces the best score
feature_means = k_features_scores.drop(columns='k').groupby('feature').mean()
feature_means.sort_values('score').head(7)
Out[21]:
score
feature
engine-size 4404.800000
curb-weight 4698.633333
horsepower 4771.966667
highway-mpg 5064.666667
city-mpg 5095.133333
width 5140.200000
length 5718.933333

Multivariate Model

A multivariate model predicts the target from two or more variables (features). KNN then measures distance across all of them at once, which is why the earlier min-max scaling matters: it keeps any one feature from dominating the metric.
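A quick illustration of that distance computation (a sketch on two of the scaled features; rows 0 and 1 are the two Alfa Romeo convertibles, which share these specs):

a = cars_numerical_values.loc[0, ['engine-size', 'curb-weight']]
b = cars_numerical_values.loc[1, ['engine-size', 'curb-weight']]
distance.euclidean(a, b)  # 0.0 -- the rows are identical on both features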

In [23]:
# run the model on all features with k = 5
i_list = []

features = cars_numerical_values.columns.drop('price').to_list()
train = cars_numerical_values[features]
target = cars_numerical_values['price']
columns = ['k', 'score']
k = 5

rmse = knn_train_test(train, target, k)
i_list.append([k, int(rmse)])

k_scores = pd.DataFrame(i_list, columns=columns)
k_scores.sort_values(by='score').head()
Out[23]:
k score
0 5 4009
In [25]:
# convert resorted groupby object indexes into a list for model
best_features = feature_means.sort_values('score').head(5)
best_features = best_features.index.to_list()
best_features
Out[25]:
['engine-size', 'curb-weight', 'horsepower', 'highway-mpg', 'city-mpg']
In [26]:
# run the model on the best 2, 3, 4, and 5 features
i_list = []
n_feature = []

target = cars_numerical_values['price']
columns = ['n features', 'score']
k = 5

for feature in best_features:
    n_feature.append(feature)
    if len(n_feature) > 1:  # skip the single-feature case, already covered by the univariate model
        rmse = knn_train_test(cars_numerical_values[n_feature], target, k)
        i_list.append([len(n_feature), int(rmse)])

k_scores = pd.DataFrame(i_list, columns=columns)
k_scores.sort_values(by='score').head()
Out[26]:
n features score
2 4 3299
3 5 3353
1 3 3422
0 2 3427

It looks like the multivariate model performs best with four features.

Hyperparameter Tuning

In [28]:
# grid-search k from 1 to 30 over the best 2-5 feature subsets
i_list = []
n_feature = []
k_range = range(1, 31)

target = cars_numerical_values['price']
columns = ['n features', 'k', 'score']

for feature in best_features:
    n_feature.append(feature)
    if len(n_feature) > 1:  # always test at least two features
        for k in k_range:
            rmse = knn_train_test(cars_numerical_values[n_feature], target, k)
            i_list.append([len(n_feature), k, int(rmse)])

best_feature_k_scores = pd.DataFrame(i_list, columns=columns)
best_feature_k_scores.sort_values(by='score').head()
Out[28]:
n features k score
91 5 2 2882
90 5 1 2949
61 4 2 2980
60 4 1 3030
92 5 3 3063

According to the KNN model, the most accurate price predictions come from using the five best features with only two nearest neighbors (k = 2).
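To put that configuration to use, here is a minimal sketch of fitting the tuned model end-to-end (a sketch, not part of the original pipeline, which stops at scoring):

final_knn = KNeighborsRegressor(n_neighbors=2)
final_knn.fit(cars_numerical_values[best_features], target)
final_knn.predict(cars_numerical_values[best_features].iloc[:1])  # sanity check on the first car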

In [29]:
plt.style.use('fivethirtyeight')
plt.figure(figsize=(20, 10))
# one line per feature-set size, instead of interleaving all four sets into a single line
for n, group in best_feature_k_scores.groupby('n features'):
    plt.plot(group['k'], group['score'], label='{} features'.format(n))
plt.legend()
plt.xlabel('K Value for KNN')
plt.ylabel('RMSE')
plt.title("Cross Validation of K values", fontdict={'fontsize': '40'})
Out[29]:
Text(0.5, 1.0, 'Cross Validation of K values')