Predicting Car Prices

In [1]:
from IPython.display import display, HTML
display(HTML(data="""
<style>
    div#notebook-container    { width: 85%; }
    div#menubar-container     { width: 65%; }
    div#maintoolbar-container { width: 99%; }
</style>
"""))
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

from scipy.spatial import distance

from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import KFold, cross_val_score

Introduction to the Data Set

The notebook uses the 1985 Automobile Data Set from the UCI Machine Learning Repository (imports-85.data): 205 cars described by 26 attributes, with missing values marked by '?'.

In [2]:
columns = ['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style', 'drive-wheels', 'engine-location', 
           'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type', 'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 
           'stroke', 'compression-rate', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price']
cars = pd.read_csv('imports-85.data', names=columns)
In [3]:
cars.head(3)
Out[3]:
symboling normalized-losses make fuel-type aspiration num-of-doors body-style drive-wheels engine-location wheel-base ... engine-size fuel-system bore stroke compression-rate horsepower peak-rpm city-mpg highway-mpg price
0 3 ? alfa-romero gas std two convertible rwd front 88.6 ... 130 mpfi 3.47 2.68 9.0 111 5000 21 27 13495
1 3 ? alfa-romero gas std two convertible rwd front 88.6 ... 130 mpfi 3.47 2.68 9.0 111 5000 21 27 16500
2 1 ? alfa-romero gas std two hatchback rwd front 94.5 ... 152 mpfi 2.68 3.47 9.0 154 5000 19 26 16500

3 rows × 26 columns

In [4]:
numerical_values = ['normalized-losses', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-size', 'bore', 'stroke', 'compression-rate', 
            'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price']
cars_numerical_values = cars[numerical_values].copy()  # copy() so later edits don't trigger SettingWithCopyWarning
In [5]:
cars_numerical_values.head(3)
Out[5]:
normalized-losses wheel-base length width height curb-weight engine-size bore stroke compression-rate horsepower peak-rpm city-mpg highway-mpg price
0 ? 88.6 168.8 64.1 48.8 2548 130 3.47 2.68 9.0 111 5000 21 27 13495
1 ? 88.6 168.8 64.1 48.8 2548 130 3.47 2.68 9.0 111 5000 21 27 16500
2 ? 94.5 171.2 65.5 52.4 2823 152 2.68 3.47 9.0 154 5000 19 26 16500

Data Cleaning

In [6]:
cars_numerical_values.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 15 columns):
normalized-losses    205 non-null object
wheel-base           205 non-null float64
length               205 non-null float64
width                205 non-null float64
height               205 non-null float64
curb-weight          205 non-null int64
engine-size          205 non-null int64
bore                 205 non-null object
stroke               205 non-null object
compression-rate     205 non-null float64
horsepower           205 non-null object
peak-rpm             205 non-null object
city-mpg             205 non-null int64
highway-mpg          205 non-null int64
price                205 non-null object
dtypes: float64(5), int64(4), object(6)
memory usage: 24.1+ KB
In [7]:
cars_numerical_values = cars_numerical_values.replace('?', np.nan)  # '?' is this data set's missing-value marker
In [8]:
cars_numerical_values
Out[8]:
normalized-losses wheel-base length width height curb-weight engine-size bore stroke compression-rate horsepower peak-rpm city-mpg highway-mpg price
0 NaN 88.6 168.8 64.1 48.8 2548 130 3.47 2.68 9.0 111 5000 21 27 13495
1 NaN 88.6 168.8 64.1 48.8 2548 130 3.47 2.68 9.0 111 5000 21 27 16500
2 NaN 94.5 171.2 65.5 52.4 2823 152 2.68 3.47 9.0 154 5000 19 26 16500
3 164 99.8 176.6 66.2 54.3 2337 109 3.19 3.40 10.0 102 5500 24 30 13950
4 164 99.4 176.6 66.4 54.3 2824 136 3.19 3.40 8.0 115 5500 18 22 17450
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
200 95 109.1 188.8 68.9 55.5 2952 141 3.78 3.15 9.5 114 5400 23 28 16845
201 95 109.1 188.8 68.8 55.5 3049 141 3.78 3.15 8.7 160 5300 19 25 19045
202 95 109.1 188.8 68.9 55.5 3012 173 3.58 2.87 8.8 134 5500 18 23 21485
203 95 109.1 188.8 68.9 55.5 3217 145 3.01 3.40 23.0 106 4800 26 27 22470
204 95 109.1 188.8 68.9 55.5 3062 141 3.78 3.15 9.5 114 5400 19 25 22625

205 rows × 15 columns

In [9]:
cars_numerical_values.describe()
Out[9]:
wheel-base length width height curb-weight engine-size compression-rate city-mpg highway-mpg
count 205.000000 205.000000 205.000000 205.000000 205.000000 205.000000 205.000000 205.000000 205.000000
mean 98.756585 174.049268 65.907805 53.724878 2555.565854 126.907317 10.142537 25.219512 30.751220
std 6.021776 12.337289 2.145204 2.443522 520.680204 41.642693 3.972040 6.542142 6.886443
min 86.600000 141.100000 60.300000 47.800000 1488.000000 61.000000 7.000000 13.000000 16.000000
25% 94.500000 166.300000 64.100000 52.000000 2145.000000 97.000000 8.600000 19.000000 25.000000
50% 97.000000 173.200000 65.500000 54.100000 2414.000000 120.000000 9.000000 24.000000 30.000000
75% 102.400000 183.100000 66.900000 55.500000 2935.000000 141.000000 9.400000 30.000000 34.000000
max 120.900000 208.100000 72.300000 59.800000 4066.000000 326.000000 23.000000 49.000000 54.000000
In [10]:
# convert every column to float; several were read as object dtype because of the '?' markers
cars_numerical_values = cars_numerical_values.astype(float)
In [11]:
# drop rows with a missing price, since price is the target we're trying to predict
cars_numerical_values = cars_numerical_values.dropna(subset=['price'])
cars_numerical_values.isnull().sum()
Out[11]:
normalized-losses    37
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
engine-size           0
bore                  4
stroke                4
compression-rate      0
horsepower            2
peak-rpm              2
city-mpg              0
highway-mpg           0
price                 0
dtype: int64
In [12]:
# fill the remaining missing values with each column's mean
cars_numerical_values = cars_numerical_values.fillna(cars_numerical_values.mean())
cars_numerical_values.isnull().sum()
Out[12]:
normalized-losses    0
wheel-base           0
length               0
width                0
height               0
curb-weight          0
engine-size          0
bore                 0
stroke               0
compression-rate     0
horsepower           0
peak-rpm             0
city-mpg             0
highway-mpg          0
price                0
dtype: int64
In [13]:
# normalize the features with min-max scaling; keep price (the target) in its original units
price_column = cars_numerical_values['price']
cars_numerical_values = (cars_numerical_values - cars_numerical_values.min())/(cars_numerical_values.max() - cars_numerical_values.min())
cars_numerical_values['price'] = price_column
cars_numerical_values.head(3)
Out[13]:
normalized-losses wheel-base length width height curb-weight engine-size bore stroke compression-rate horsepower peak-rpm city-mpg highway-mpg price
0 0.298429 0.058309 0.413433 0.324786 0.083333 0.411171 0.260377 0.664286 0.290476 0.125 0.294393 0.346939 0.222222 0.289474 13495.0
1 0.298429 0.058309 0.413433 0.324786 0.083333 0.411171 0.260377 0.664286 0.290476 0.125 0.294393 0.346939 0.222222 0.289474 16500.0
2 0.298429 0.230321 0.449254 0.444444 0.383333 0.517843 0.343396 0.100000 0.666667 0.125 0.495327 0.346939 0.166667 0.263158 16500.0
In [14]:
cars_numerical_values.describe()
Out[14]:
normalized-losses wheel-base length width height curb-weight engine-size bore stroke compression-rate horsepower peak-rpm city-mpg highway-mpg price
count 201.000000 201.000000 201.000000 201.000000 201.000000 201.000000 201.000000 201.000000 201.000000 201.000000 201.000000 201.000000 201.000000 201.000000 201.000000
mean 0.298429 0.355598 0.494045 0.477697 0.497222 0.414145 0.248587 0.564793 0.565192 0.197767 0.258864 0.394934 0.338308 0.386489 13207.129353
std 0.167520 0.176862 0.183913 0.179613 0.203985 0.200658 0.156781 0.191480 0.150499 0.250310 0.174606 0.195148 0.178423 0.179346 7947.066342
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 5118.000000
25% 0.188482 0.230321 0.383582 0.324786 0.350000 0.264158 0.139623 0.435714 0.495238 0.100000 0.102804 0.265306 0.166667 0.236842 7775.000000
50% 0.298429 0.303207 0.479104 0.444444 0.525000 0.359193 0.222642 0.550000 0.580952 0.125000 0.219626 0.394934 0.305556 0.368421 10295.000000
75% 0.376963 0.460641 0.632836 0.538462 0.641667 0.557797 0.301887 0.742857 0.638095 0.150000 0.317757 0.551020 0.472222 0.473684 16500.000000
max 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 45400.000000
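The describe() output confirms that every feature now spans [0, 1] while price keeps its dollar scale. As a quick sanity check, here is the min-max formula applied by hand to one value, using the wheel-base extremes (86.6 and 120.9) from the earlier describe() output:

x, x_min, x_max = 88.6, 86.6, 120.9
scaled = (x - x_min) / (x_max - x_min)
print(round(scaled, 6))  # 0.058309 -- matches wheel-base in row 0 above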

Univariate Model

A univariate model predicts the target from a single variable (feature).
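Before the scikit-learn version below, here is a minimal hand-rolled sketch of the idea behind KNN regression on one feature (illustrative only; manual_knn_predict is a hypothetical helper, not part of the pipeline):

def manual_knn_predict(feature_values, prices, query, k=5):
    # distance from the query to every training car on this one feature
    distances = (feature_values - query).abs()
    nearest = distances.sort_values().index[:k]  # indices of the k closest cars
    return prices.loc[nearest].mean()            # prediction = mean of their prices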

In [15]:
def knn_train_test(train, target, k):
    # 2-fold cross-validation with a fixed seed for reproducible splits
    kf = KFold(n_splits=2, shuffle=True, random_state=1)
    knn = KNeighborsRegressor(n_neighbors=k)
    # cross_val_score returns negated MSEs, hence the abs() before the square root
    mses = cross_val_score(knn, train, target, scoring="neg_mean_squared_error", cv=kf)
    rmse = np.mean(np.sqrt(np.abs(mses)))
    return rmse
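As a usage sketch (assuming the cells above have run), a single call scoring engine-size with k = 5 reproduces the 3364 RMSE that tops the ranking below:

knn_train_test(cars_numerical_values[['engine-size']], cars_numerical_values['price'], 5)  # ~3364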
In [16]:
# loop through each feature on its own to see which performs best
features = cars_numerical_values.columns.drop('price').to_list()
target = cars_numerical_values['price']
k = 5
features_rmse = {}

for feature in features:
    rmse = knn_train_test(cars_numerical_values[[feature]], target, k)
    features_rmse[feature] = int(rmse)

# convert the dictionary to a Series so the results are easy to sort and inspect
features_rmse = pd.Series(features_rmse)
features_rmse.sort_values()
Out[16]:
engine-size          3364
horsepower           3983
curb-weight          4130
highway-mpg          4336
width                4480
city-mpg             4788
length               5645
wheel-base           5709
bore                 6561
compression-rate     6875
normalized-losses    7482
peak-rpm             7721
height               7735
stroke               7768
dtype: int64
In [17]:
k_range = range(1, 31)
i_list = []

features = cars_numerical_values.columns.drop('price').to_list()
target = cars_numerical_values['price']
columns = ['k', 'feature', 'score']

# score every feature at every k from 1 to 30
for k in k_range:
    for feature in features:
        rmse = knn_train_test(cars_numerical_values[[feature]], target, k)
        i_list.append([k, feature, int(rmse)])

k_features_scores = pd.DataFrame(i_list, columns=columns)
k_features_scores
Out[17]:
k feature score
0 1 normalized-losses 10243
1 1 wheel-base 5676
2 1 length 5289
3 1 width 4657
4 1 height 10146
... ... ... ...
415 30 compression-rate 7634
416 30 horsepower 5428
417 30 peak-rpm 7832
418 30 city-mpg 5596
419 30 highway-mpg 5359

420 rows × 3 columns

In [18]:
k_features_scores.sort_values(by=['score']).head(10)
Out[18]:
k feature score
34 3 engine-size 3282
48 4 engine-size 3327
62 5 engine-size 3364
76 6 engine-size 3487
20 2 engine-size 3519
90 7 engine-size 3650
104 8 engine-size 3697
24 2 horsepower 3802
118 9 engine-size 3841
66 5 horsepower 3983
In [19]:
plt.figure(figsize=(20, 10))
# one line per feature; plotting the raw frame would interleave every feature into a single zigzag
for feature in features:
    subset = k_features_scores[k_features_scores['feature'] == feature]
    plt.plot(subset['k'], subset['score'], label=feature)
plt.legend()
plt.xlabel('K Value for KNN')
plt.ylabel('Cross Validation RMSE')
Out[19]:
Text(0, 0.5, 'Cross Validation RMSE')
In [20]:
# assessing which k value performs best, averaged across features
k_means = k_features_scores.drop(columns='feature').groupby('k').mean()
k_means.sort_values('score').head(3)
Out[20]:
score
k
4 5699.000000
3 5745.785714
5 5755.500000
In [21]:
# assessing which feature produces the best score
feature_means = k_features_scores.drop(columns='k').groupby('feature').mean()
feature_means.sort_values('score').head(7)
Out[21]:
score
feature
engine-size 4404.800000
curb-weight 4698.633333
horsepower 4771.966667
highway-mpg 5064.666667
city-mpg 5095.133333
width 5140.200000
length 5718.933333

Multivariate Model

A multivariate model predicts the target from two or more variables (features). KNN then measures distance across all of them at once, which is why the earlier min-max scaling matters: it keeps any one feature from dominating the metric.
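A quick illustration of that distance computation (a sketch on two of the scaled features; rows 0 and 1 are the two Alfa Romeo convertibles, which share these specs):

a = cars_numerical_values.loc[0, ['engine-size', 'curb-weight']]
b = cars_numerical_values.loc[1, ['engine-size', 'curb-weight']]
distance.euclidean(a, b)  # 0.0 -- the rows are identical on both features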

In [23]:
# run the model on all features with k = 5
i_list = []

features = cars_numerical_values.columns.drop('price').to_list()
train = cars_numerical_values[features]
target = cars_numerical_values['price']
columns = ['k', 'score']
k = 5

rmse = knn_train_test(train, target, k)
i_list.append([k, int(rmse)])

k_scores = pd.DataFrame(i_list, columns=columns)
k_scores.sort_values(by='score').head()
Out[23]:
k score
0 5 4009
In [25]:
# convert resorted groupby object indexes into a list for model
best_features = feature_means.sort_values('score').head(5)
best_features = best_features.index.to_list()
best_features
Out[25]:
['engine-size', 'curb-weight', 'horsepower', 'highway-mpg', 'city-mpg']
In [26]:
# run the model on the best 2, 3, 4, and 5 features
i_list = []
n_feature = []

target = cars_numerical_values['price']
columns = ['n features', 'score']
k = 5

for feature in best_features:
    n_feature.append(feature)
    if len(n_feature) > 1:  # skip the single-feature case, already covered by the univariate model
        rmse = knn_train_test(cars_numerical_values[n_feature], target, k)
        i_list.append([len(n_feature), int(rmse)])

k_scores = pd.DataFrame(i_list, columns=columns)
k_scores.sort_values(by='score').head()
Out[26]:
n features score
2 4 3299
3 5 3353
1 3 3422
0 2 3427

It looks like the multivariate model performs best with four features.

Hyperparameter Tuning

In [28]:
# grid-search k from 1 to 30 over the best 2-5 feature subsets
i_list = []
n_feature = []
k_range = range(1, 31)

target = cars_numerical_values['price']
columns = ['n features', 'k', 'score']

for feature in best_features:
    n_feature.append(feature)
    if len(n_feature) > 1:  # always test at least two features
        for k in k_range:
            rmse = knn_train_test(cars_numerical_values[n_feature], target, k)
            i_list.append([len(n_feature), k, int(rmse)])

best_feature_k_scores = pd.DataFrame(i_list, columns=columns)
best_feature_k_scores.sort_values(by='score').head()
Out[28]:
n features k score
91 5 2 2882
90 5 1 2949
61 4 2 2980
60 4 1 3030
92 5 3 3063

According to the KNN model, the most accurate price predictions come from using the five best features with only two nearest neighbors (k = 2).
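To put that configuration to use, here is a minimal sketch of fitting the tuned model end-to-end (a sketch, not part of the original pipeline, which stops at scoring):

final_knn = KNeighborsRegressor(n_neighbors=2)
final_knn.fit(cars_numerical_values[best_features], target)
final_knn.predict(cars_numerical_values[best_features].iloc[:1])  # sanity check on the first car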

In [29]:
plt.style.use('fivethirtyeight')
plt.figure(figsize=(20, 10))
# one line per feature-set size, instead of interleaving all four sets into a single line
for n, group in best_feature_k_scores.groupby('n features'):
    plt.plot(group['k'], group['score'], label='{} features'.format(n))
plt.legend()
plt.xlabel('K Value for KNN')
plt.ylabel('RMSE')
plt.title("Cross Validation of K values", fontdict={'fontsize': '40'})
Out[29]:
Text(0.5, 1.0, 'Cross Validation of K values')