# Widen the Jupyter notebook layout so wide DataFrames are easier to read.
from IPython.display import display, HTML

_layout_css = """
<style>
div#notebook-container { width: 85%; }
div#menubar-container { width: 65%; }
div#maintoolbar-container { width: 99%; }
</style>
"""
display(HTML(data=_layout_css))
import datetime as dt
from itertools import permutations
from collections import Counter
import numpy as np
from numpy import arange
from numpy.random import randint, seed, random
import pandas as pd
import pandas_profiling
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.style as style
import seaborn as sns
from scipy.stats import percentileofscore, chisquare, chi2_contingency
from scipy import stats
from scipy.spatial import distance
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, cross_val_score
# Column names for the imports-85 data set (the raw file ships without a header row).
columns = [
    'symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
    'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
    'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type',
    'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke',
    'compression-rate', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg',
    'price',
]
cars = pd.read_csv('imports-85.data', names=columns)
cars.head(3)
symboling | normalized-losses | make | fuel-type | aspiration | num-of-doors | body-style | drive-wheels | engine-location | wheel-base | ... | engine-size | fuel-system | bore | stroke | compression-rate | horsepower | peak-rpm | city-mpg | highway-mpg | price | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3 | ? | alfa-romero | gas | std | two | convertible | rwd | front | 88.6 | ... | 130 | mpfi | 3.47 | 2.68 | 9.0 | 111 | 5000 | 21 | 27 | 13495 |
1 | 3 | ? | alfa-romero | gas | std | two | convertible | rwd | front | 88.6 | ... | 130 | mpfi | 3.47 | 2.68 | 9.0 | 111 | 5000 | 21 | 27 | 16500 |
2 | 1 | ? | alfa-romero | gas | std | two | hatchback | rwd | front | 94.5 | ... | 152 | mpfi | 2.68 | 3.47 | 9.0 | 154 | 5000 | 19 | 26 | 16500 |
3 rows × 26 columns
# Keep only the columns treated as numeric features, plus the price target.
numerical_values = [
    'normalized-losses', 'wheel-base', 'length', 'width', 'height',
    'curb-weight', 'engine-size', 'bore', 'stroke', 'compression-rate',
    'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price',
]
cars_numerical_values = cars[numerical_values]
cars_numerical_values.head(3)
normalized-losses | wheel-base | length | width | height | curb-weight | engine-size | bore | stroke | compression-rate | horsepower | peak-rpm | city-mpg | highway-mpg | price | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | ? | 88.6 | 168.8 | 64.1 | 48.8 | 2548 | 130 | 3.47 | 2.68 | 9.0 | 111 | 5000 | 21 | 27 | 13495 |
1 | ? | 88.6 | 168.8 | 64.1 | 48.8 | 2548 | 130 | 3.47 | 2.68 | 9.0 | 111 | 5000 | 21 | 27 | 16500 |
2 | ? | 94.5 | 171.2 | 65.5 | 52.4 | 2823 | 152 | 2.68 | 3.47 | 9.0 | 154 | 5000 | 19 | 26 | 16500 |
# Inspect dtypes: several numeric-looking columns load as object because the
# raw data uses '?' as its missing-value marker (see the head() output above).
cars_numerical_values.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 205 entries, 0 to 204 Data columns (total 15 columns): normalized-losses 205 non-null object wheel-base 205 non-null float64 length 205 non-null float64 width 205 non-null float64 height 205 non-null float64 curb-weight 205 non-null int64 engine-size 205 non-null int64 bore 205 non-null object stroke 205 non-null object compression-rate 205 non-null float64 horsepower 205 non-null object peak-rpm 205 non-null object city-mpg 205 non-null int64 highway-mpg 205 non-null int64 price 205 non-null object dtypes: float64(5), int64(4), object(6) memory usage: 24.1+ KB
# '?' is this data set's missing-value marker; convert it to NaN so the
# columns can be treated numerically. Assigning the returned frame instead of
# using inplace=True avoids the SettingWithCopyWarning triggered by mutating
# a slice of the original `cars` DataFrame.
cars_numerical_values = cars_numerical_values.replace('?', np.nan)
C:\Users\Leo\Anaconda3\lib\site-packages\pandas\core\frame.py:4263: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy method=method,
# Display the frame to confirm the '?' markers are now NaN.
cars_numerical_values
normalized-losses | wheel-base | length | width | height | curb-weight | engine-size | bore | stroke | compression-rate | horsepower | peak-rpm | city-mpg | highway-mpg | price | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | NaN | 88.6 | 168.8 | 64.1 | 48.8 | 2548 | 130 | 3.47 | 2.68 | 9.0 | 111 | 5000 | 21 | 27 | 13495 |
1 | NaN | 88.6 | 168.8 | 64.1 | 48.8 | 2548 | 130 | 3.47 | 2.68 | 9.0 | 111 | 5000 | 21 | 27 | 16500 |
2 | NaN | 94.5 | 171.2 | 65.5 | 52.4 | 2823 | 152 | 2.68 | 3.47 | 9.0 | 154 | 5000 | 19 | 26 | 16500 |
3 | 164 | 99.8 | 176.6 | 66.2 | 54.3 | 2337 | 109 | 3.19 | 3.40 | 10.0 | 102 | 5500 | 24 | 30 | 13950 |
4 | 164 | 99.4 | 176.6 | 66.4 | 54.3 | 2824 | 136 | 3.19 | 3.40 | 8.0 | 115 | 5500 | 18 | 22 | 17450 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
200 | 95 | 109.1 | 188.8 | 68.9 | 55.5 | 2952 | 141 | 3.78 | 3.15 | 9.5 | 114 | 5400 | 23 | 28 | 16845 |
201 | 95 | 109.1 | 188.8 | 68.8 | 55.5 | 3049 | 141 | 3.78 | 3.15 | 8.7 | 160 | 5300 | 19 | 25 | 19045 |
202 | 95 | 109.1 | 188.8 | 68.9 | 55.5 | 3012 | 173 | 3.58 | 2.87 | 8.8 | 134 | 5500 | 18 | 23 | 21485 |
203 | 95 | 109.1 | 188.8 | 68.9 | 55.5 | 3217 | 145 | 3.01 | 3.40 | 23.0 | 106 | 4800 | 26 | 27 | 22470 |
204 | 95 | 109.1 | 188.8 | 68.9 | 55.5 | 3062 | 141 | 3.78 | 3.15 | 9.5 | 114 | 5400 | 19 | 25 | 22625 |
205 rows × 15 columns
# describe() summarizes only the already-numeric columns; the object-dtype
# columns (normalized-losses, bore, stroke, horsepower, peak-rpm, price)
# are omitted until they are cast to float below.
cars_numerical_values.describe()
wheel-base | length | width | height | curb-weight | engine-size | compression-rate | city-mpg | highway-mpg | |
---|---|---|---|---|---|---|---|---|---|
count | 205.000000 | 205.000000 | 205.000000 | 205.000000 | 205.000000 | 205.000000 | 205.000000 | 205.000000 | 205.000000 |
mean | 98.756585 | 174.049268 | 65.907805 | 53.724878 | 2555.565854 | 126.907317 | 10.142537 | 25.219512 | 30.751220 |
std | 6.021776 | 12.337289 | 2.145204 | 2.443522 | 520.680204 | 41.642693 | 3.972040 | 6.542142 | 6.886443 |
min | 86.600000 | 141.100000 | 60.300000 | 47.800000 | 1488.000000 | 61.000000 | 7.000000 | 13.000000 | 16.000000 |
25% | 94.500000 | 166.300000 | 64.100000 | 52.000000 | 2145.000000 | 97.000000 | 8.600000 | 19.000000 | 25.000000 |
50% | 97.000000 | 173.200000 | 65.500000 | 54.100000 | 2414.000000 | 120.000000 | 9.000000 | 24.000000 | 30.000000 |
75% | 102.400000 | 183.100000 | 66.900000 | 55.500000 | 2935.000000 | 141.000000 | 9.400000 | 30.000000 | 34.000000 |
max | 120.900000 | 208.100000 | 72.300000 | 59.800000 | 4066.000000 | 326.000000 | 23.000000 | 49.000000 | 54.000000 |
# Cast every column to float, then drop the rows with a missing price:
# price is the prediction target, so those rows cannot be used at all.
cars_numerical_values = (
    cars_numerical_values
    .astype(float)
    .dropna(subset=['price'])
)
cars_numerical_values.isnull().sum()
normalized-losses 37 wheel-base 0 length 0 width 0 height 0 curb-weight 0 engine-size 0 bore 4 stroke 4 compression-rate 0 horsepower 2 peak-rpm 2 city-mpg 0 highway-mpg 0 price 0 dtype: int64
# Impute the remaining NaNs with each column's mean value.
column_means = cars_numerical_values.mean()
cars_numerical_values = cars_numerical_values.fillna(column_means)
cars_numerical_values.isnull().sum()
normalized-losses 0 wheel-base 0 length 0 width 0 height 0 curb-weight 0 engine-size 0 bore 0 stroke 0 compression-rate 0 horsepower 0 peak-rpm 0 city-mpg 0 highway-mpg 0 price 0 dtype: int64
# Min-max scale every column to [0, 1], then restore the target so price
# stays in its original dollar units.
price_column = cars_numerical_values['price']
col_min = cars_numerical_values.min()
col_max = cars_numerical_values.max()
cars_numerical_values = (cars_numerical_values - col_min) / (col_max - col_min)
cars_numerical_values['price'] = price_column
cars_numerical_values.head(3)
normalized-losses | wheel-base | length | width | height | curb-weight | engine-size | bore | stroke | compression-rate | horsepower | peak-rpm | city-mpg | highway-mpg | price | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.298429 | 0.058309 | 0.413433 | 0.324786 | 0.083333 | 0.411171 | 0.260377 | 0.664286 | 0.290476 | 0.125 | 0.294393 | 0.346939 | 0.222222 | 0.289474 | 13495.0 |
1 | 0.298429 | 0.058309 | 0.413433 | 0.324786 | 0.083333 | 0.411171 | 0.260377 | 0.664286 | 0.290476 | 0.125 | 0.294393 | 0.346939 | 0.222222 | 0.289474 | 16500.0 |
2 | 0.298429 | 0.230321 | 0.449254 | 0.444444 | 0.383333 | 0.517843 | 0.343396 | 0.100000 | 0.666667 | 0.125 | 0.495327 | 0.346939 | 0.166667 | 0.263158 | 16500.0 |
# Sanity check: every feature now spans [0, 1]; price keeps its dollar scale.
cars_numerical_values.describe()
normalized-losses | wheel-base | length | width | height | curb-weight | engine-size | bore | stroke | compression-rate | horsepower | peak-rpm | city-mpg | highway-mpg | price | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 201.000000 | 201.000000 | 201.000000 | 201.000000 | 201.000000 | 201.000000 | 201.000000 | 201.000000 | 201.000000 | 201.000000 | 201.000000 | 201.000000 | 201.000000 | 201.000000 | 201.000000 |
mean | 0.298429 | 0.355598 | 0.494045 | 0.477697 | 0.497222 | 0.414145 | 0.248587 | 0.564793 | 0.565192 | 0.197767 | 0.258864 | 0.394934 | 0.338308 | 0.386489 | 13207.129353 |
std | 0.167520 | 0.176862 | 0.183913 | 0.179613 | 0.203985 | 0.200658 | 0.156781 | 0.191480 | 0.150499 | 0.250310 | 0.174606 | 0.195148 | 0.178423 | 0.179346 | 7947.066342 |
min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 5118.000000 |
25% | 0.188482 | 0.230321 | 0.383582 | 0.324786 | 0.350000 | 0.264158 | 0.139623 | 0.435714 | 0.495238 | 0.100000 | 0.102804 | 0.265306 | 0.166667 | 0.236842 | 7775.000000 |
50% | 0.298429 | 0.303207 | 0.479104 | 0.444444 | 0.525000 | 0.359193 | 0.222642 | 0.550000 | 0.580952 | 0.125000 | 0.219626 | 0.394934 | 0.305556 | 0.368421 | 10295.000000 |
75% | 0.376963 | 0.460641 | 0.632836 | 0.538462 | 0.641667 | 0.557797 | 0.301887 | 0.742857 | 0.638095 | 0.150000 | 0.317757 | 0.551020 | 0.472222 | 0.473684 | 16500.000000 |
max | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 45400.000000 |
A univariate model involves the analysis of a single variable (feature)
def knn_train_test(train, target, k):
    """Return the mean RMSE of a k-NN regressor under 2-fold cross-validation.

    train  -- DataFrame holding the feature column(s)
    target -- Series with the value to predict
    k      -- number of neighbors for KNeighborsRegressor
    """
    folds = KFold(n_splits=2, shuffle=True, random_state=1)
    model = KNeighborsRegressor(n_neighbors=k)
    # cross_val_score returns *negative* MSE, so flip the sign before sqrt.
    neg_mses = cross_val_score(model, train, target,
                               scoring="neg_mean_squared_error", cv=folds)
    return np.sqrt(np.abs(neg_mses)).mean()
# Score every feature in a univariate model (k = 5) to see which predicts
# price best. The unused `train`/`target` assignments from the original were
# dead code (the loop indexes cars_numerical_values directly) and are removed.
features = cars_numerical_values.columns.drop('price').to_list()
k = 5
features_rmse = {}
for feature in features:
    rmse = knn_train_test(cars_numerical_values[[feature]], cars_numerical_values['price'], k)
    features_rmse[feature] = int(rmse)
# A Series makes the scores easy to sort and inspect.
features_rmse = pd.Series(features_rmse)
features_rmse.sort_values()
engine-size 3364 horsepower 3983 curb-weight 4130 highway-mpg 4336 width 4480 city-mpg 4788 length 5645 wheel-base 5709 bore 6561 compression-rate 6875 normalized-losses 7482 peak-rpm 7721 height 7735 stroke 7768 dtype: int64
# Sweep k from 1 to 30 for every univariate model and collect (k, feature,
# RMSE) rows. The unused `train`/`target` assignments from the original were
# dead code and are removed.
k_range = range(1, 31)
i_list = []
features = cars_numerical_values.columns.drop('price').to_list()
columns = ['k', 'feature', 'score']
for k in k_range:
    for feature in features:
        rmse = knn_train_test(cars_numerical_values[[feature]], cars_numerical_values['price'], k)
        i_list.append([k, feature, int(rmse)])
k_features_scores = pd.DataFrame(i_list, columns=columns)
k_features_scores
k | feature | score | |
---|---|---|---|
0 | 1 | normalized-losses | 10243 |
1 | 1 | wheel-base | 5676 |
2 | 1 | length | 5289 |
3 | 1 | width | 4657 |
4 | 1 | height | 10146 |
... | ... | ... | ... |
415 | 30 | compression-rate | 7634 |
416 | 30 | horsepower | 5428 |
417 | 30 | peak-rpm | 7832 |
418 | 30 | city-mpg | 5596 |
419 | 30 | highway-mpg | 5359 |
420 rows × 3 columns
# The ten best (k, feature) combinations by RMSE (lower is better).
k_features_scores.sort_values(by=['score']).head(10)
k | feature | score | |
---|---|---|---|
34 | 3 | engine-size | 3282 |
48 | 4 | engine-size | 3327 |
62 | 5 | engine-size | 3364 |
76 | 6 | engine-size | 3487 |
20 | 2 | engine-size | 3519 |
90 | 7 | engine-size | 3650 |
104 | 8 | engine-size | 3697 |
24 | 2 | horsepower | 3802 |
118 | 9 | engine-size | 3841 |
66 | 5 | horsepower | 3983 |
# Plot every (k, feature) RMSE against k to eyeball how error varies with k.
plt.figure(figsize=(20, 10))
plt.plot(k_features_scores['k'], k_features_scores['score'])
plt.xlabel('K Value for KNN')
# The plotted scores are RMSE values, not accuracy -- label the axis correctly
# (consistent with the 'RMSE' label used on the final plot below).
plt.ylabel('Cross Validation RMSE')
Text(0, 0.5, 'Cross Validation Accuracy')
# Average the RMSE across all features for each k to rank the k values.
# Selecting ['score'] explicitly avoids averaging the string 'feature'
# column, which raises a TypeError on modern pandas (older versions merely
# dropped it silently).
k_means = k_features_scores.groupby('k')[['score']].mean()
k_means.sort_values('score').head(3)
score | |
---|---|
k | |
4 | 5699.000000 |
3 | 5745.785714 |
5 | 5755.500000 |
# Mean RMSE per feature across all k values -- lower means a stronger
# univariate predictor of price.
feature_means = k_features_scores.groupby('feature')[['score']].mean()
feature_means.sort_values('score').head(7)
score | |
---|---|
feature | |
engine-size | 4404.800000 |
curb-weight | 4698.633333 |
horsepower | 4771.966667 |
highway-mpg | 5064.666667 |
city-mpg | 5095.133333 |
width | 5140.200000 |
length | 5718.933333 |
A multivariate model analysis examines two or more variables (features)
def knn_train_test(train, target, k):
    """Return the mean RMSE of a k-NN regressor under 2-fold cross-validation.

    NOTE(review): this is a byte-identical re-definition of knn_train_test
    from earlier in the file; consider removing one copy.
    """
    kf = KFold(n_splits=2, shuffle=True, random_state=1)
    knn = KNeighborsRegressor(n_neighbors=k)
    # cross_val_score yields negative MSE; abs + sqrt converts it to RMSE.
    mses = cross_val_score(knn, train, target, scoring="neg_mean_squared_error", cv=kf)
    rmse = np.mean(np.sqrt(np.abs(mses)))
    return rmse
# Multivariate baseline: every feature at once with k fixed at 5.
# The `k_range = range(1,31)` assignment from the original was dead code here
# (no loop over k in this cell; k_range is reassigned before its next use)
# and is removed.
i_list = []
features = cars_numerical_values.columns.drop('price').to_list()
train = cars_numerical_values[features]
target = cars_numerical_values['price']
columns = ['k', 'score']
k = 5
rmse = knn_train_test(train, target, k)
i_list.append([k, int(rmse)])
k_scores = pd.DataFrame(i_list, columns=columns)
k_scores.sort_values(by='score').head()
k | score | |
---|---|---|
0 | 5 | 4009 |
# Re-rank features by their mean RMSE over k = 1..30 (repeats the earlier cell).
feature_means = k_features_scores.groupby('feature')[['score']].agg('mean')
feature_means.sort_values('score').head(7)
score | |
---|---|
feature | |
engine-size | 4404.800000 |
curb-weight | 4698.633333 |
horsepower | 4771.966667 |
highway-mpg | 5064.666667 |
city-mpg | 5095.133333 |
width | 5140.200000 |
length | 5718.933333 |
# Take the five lowest-RMSE features, in ranked order, as a plain list.
best_features = feature_means.sort_values('score').head(5).index.to_list()
best_features
['engine-size', 'curb-weight', 'horsepower', 'highway-mpg', 'city-mpg']
# Evaluate cumulative subsets of the best features (top 2, top 3, ... top 5)
# with k = 5.
i_list = []
n_feature = []
target = cars_numerical_values['price']
columns = ['n features', 'score']
k = 5
for feature in best_features:
    n_feature.append(feature)
    # Skip the one-feature model (already covered by the univariate analysis).
    # Testing the subset size is sturdier than the original's hard-coded
    # `feature != 'engine-size'`, which breaks if the ranking ever changes.
    if len(n_feature) > 1:
        rmse = knn_train_test(cars_numerical_values[n_feature], target, k)
        i_list.append([len(n_feature), int(rmse)])
k_scores = pd.DataFrame(i_list, columns=columns)
k_scores.sort_values(by='score').head()
n features | score | |
---|---|---|
2 | 4 | 3299 |
3 | 5 | 3353 |
1 | 3 | 3422 |
0 | 2 | 3427 |
It looks like the multivariate model performs best with four features.
# Rebuild the ranked list of the five strongest features (repeats earlier cell).
top5 = feature_means.sort_values('score').head(5)
best_features = list(top5.index)
best_features
['engine-size', 'curb-weight', 'horsepower', 'highway-mpg', 'city-mpg']
# Grid search: cumulative top-n feature subsets crossed with k = 1..30.
i_list = []
n_feature = []
k_range = range(1, 31)
target = cars_numerical_values['price']
columns = ['n features', 'k', 'score']
for feature in best_features:
    n_feature.append(feature)
    # Skip single-feature models (covered by the univariate analysis).
    # Checking the subset size is sturdier than the original's hard-coded
    # `feature != 'engine-size'`, which breaks if the ranking ever changes.
    if len(n_feature) > 1:
        for k in k_range:
            rmse = knn_train_test(cars_numerical_values[n_feature], target, k)
            i_list.append([len(n_feature), k, int(rmse)])
best_feature_k_scores = pd.DataFrame(i_list, columns=columns)
best_feature_k_scores.sort_values(by='score').head()
n features | k | score | |
---|---|---|---|
91 | 5 | 2 | 2882 |
90 | 5 | 1 | 2949 |
61 | 4 | 2 | 2980 |
60 | 4 | 1 | 3030 |
92 | 5 | 3 | 3063 |
According to the KNN model, in order to predict the most accurate price per car, it's best to use the 5 best features with only 2 nearest neighbors (k-value).
# Plot RMSE against k for the cumulative best-feature models.
plt.style.use('fivethirtyeight')
plt.figure(figsize=(20, 10))
plt.plot(best_feature_k_scores['k'], best_feature_k_scores['score'])
plt.xlabel('K Value for KNN')
plt.ylabel('RMSE')
plt.title("Cross Validation of K values ", fontdict={'fontsize': '40'})
Text(0.5, 1.0, 'Cross Validation of K values ')