from IPython.display import display, HTML
display(HTML(data="""
<style>
div#notebook-container { width: 85%; }
div#menubar-container { width: 65%; }
div#maintoolbar-container { width: 99%; }
</style>
"""))
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import KFold, cross_val_score
columns = ['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type', 'num-of-cylinders', 'engine-size', 'fuel-system', 'bore',
'stroke', 'compression-rate', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price']
cars = pd.read_csv('imports-85.data', names=columns)
cars.head(3)
numerical_values = ['normalized-losses', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-size', 'bore', 'stroke', 'compression-rate',
'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price']
cars_numerical_values = cars[numerical_values]
cars_numerical_values.head(3)
cars_numerical_values.info()
# replace '?' placeholders with NaN so the columns can be cast to float
# (reassigning instead of inplace=True avoids a SettingWithCopyWarning on the slice)
cars_numerical_values = cars_numerical_values.replace('?', np.nan)
cars_numerical_values
cars_numerical_values.describe()
# convert all fields to float data type.
cars_numerical_values = cars_numerical_values.astype(float)
# drop rows with a missing price, since price is the prediction target
cars_numerical_values = cars_numerical_values.dropna(subset=['price'])
cars_numerical_values.isnull().sum()
# replace remaining null values with the mean of their column
cars_numerical_values = cars_numerical_values.fillna(cars_numerical_values.mean())
cars_numerical_values.isnull().sum()
# normalize the features with min-max scaling; keep price on its original scale
price_column = cars_numerical_values['price']
cars_numerical_values = (cars_numerical_values - cars_numerical_values.min())/(cars_numerical_values.max() - cars_numerical_values.min())
cars_numerical_values['price'] = price_column
cars_numerical_values.head(3)
cars_numerical_values.describe()
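As an optional sanity check (not part of the original analysis), every scaled feature should now span exactly [0, 1] while price keeps its original units:
# every min-max scaled feature should have min 0.0 and max 1.0
scaled = cars_numerical_values.drop(columns='price')
print(scaled.min().min(), scaled.max().max())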
A univariate model predicts the target from a single variable (feature).
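The scoring metric throughout is root mean squared error, where $y_i$ is the true price, $\hat{y}_i$ is the model's prediction, and $n$ is the number of held-out rows:

$$\mathrm{RMSE} = \sqrt{\frac{1}{n}\sum_{i=1}^{n}\left(y_i - \hat{y}_i\right)^2}$$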
def knn_train_test(train, target, k):
    # 2-fold cross-validation with a fixed seed so the splits are reproducible
    kf = KFold(n_splits=2, shuffle=True, random_state=1)
    knn = KNeighborsRegressor(n_neighbors=k)
    # scikit-learn returns negative MSEs for this scorer, hence the abs() before the root
    mses = cross_val_score(knn, train, target, scoring="neg_mean_squared_error", cv=kf)
    rmse = np.mean(np.sqrt(np.abs(mses)))
    return rmse
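A single call scores one feature; 'horsepower' here is just an illustrative choice:
# illustrative: RMSE for the 'horsepower' feature alone with k = 5
knn_train_test(cars_numerical_values[['horsepower']], cars_numerical_values['price'], 5)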
# loop through all features to see which performs best
features = cars_numerical_values.columns.drop('price').to_list()
target = cars_numerical_values['price']
k = 5
features_rmse = {}
for feature in features:
    rmse = knn_train_test(cars_numerical_values[[feature]], target, k)
    features_rmse[feature] = int(rmse)
# create a Series from the dictionary so the results are easy to view and sort
features_rmse = pd.Series(features_rmse)
features_rmse.sort_values()
k_range = range(1, 31)
i_list = []
features = cars_numerical_values.columns.drop('price').to_list()
target = cars_numerical_values['price']
columns = ['k', 'feature', 'score']
for k in k_range:
    for feature in features:
        rmse = knn_train_test(cars_numerical_values[[feature]], target, k)
        i_list.append([k, feature, int(rmse)])
k_features_scores = pd.DataFrame(i_list, columns=columns)
k_features_scores
k_features_scores.sort_values(by=['score']).head(10)
plt.figure(figsize=(20, 10))
# one line per feature; plotting all rows as a single line would zigzag across features
for feature, group in k_features_scores.groupby('feature'):
    plt.plot(group['k'], group['score'], label=feature)
plt.xlabel('K Value for KNN')
plt.ylabel('RMSE')
plt.legend()
# assessing which k value performs best on average across features
k_means = k_features_scores.groupby('k').mean(numeric_only=True)
k_means.sort_values('score').head(3)
# assessing which feature produces the best score
feature_means = k_features_scores.drop(columns='k').groupby('feature').mean()
feature_means.sort_values('score').head(7)
A multivariate model examines two or more variables (features) at once.
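The knn_train_test() function from the univariate section is reused unchanged; as a quick illustration (the feature pair here is an arbitrary choice), passing several columns scores them jointly:
# illustrative: score an arbitrary pair of features jointly with k = 5
knn_train_test(cars_numerical_values[['horsepower', 'engine-size']], cars_numerical_values['price'], 5)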
# running the model on all features with k = 5
i_list = []
features = cars_numerical_values.columns.drop('price').to_list()
train = cars_numerical_values[features]
target = cars_numerical_values['price']
columns = ['k', 'score']
k = 5
rmse = knn_train_test(train, target, k)
i_list.append([k, int(rmse)])
k_scores = pd.DataFrame(i_list, columns=columns)
k_scores
# take the five best features (sorted by mean RMSE) as a list for the model
best_features = feature_means.sort_values('score').head(5).index.to_list()
best_features
# run the model on the best 2-5 features; the single-feature case was already
# covered by the univariate analysis above
i_list = []
target = cars_numerical_values['price']
columns = ['n features', 'score']
k = 5
for n in range(2, len(best_features) + 1):
    rmse = knn_train_test(cars_numerical_values[best_features[:n]], target, k)
    i_list.append([n, int(rmse)])
k_scores = pd.DataFrame(i_list, columns=columns)
k_scores.sort_values(by='score').head()
It looks like the multivariate model performs best with four features.
# rerun the best 2-5 feature sets across k values from 1 to 30
i_list = []
k_range = range(1, 31)
target = cars_numerical_values['price']
columns = ['n features', 'k', 'score']
for n in range(2, len(best_features) + 1):
    for k in k_range:
        rmse = knn_train_test(cars_numerical_values[best_features[:n]], target, k)
        i_list.append([n, k, int(rmse)])
best_feature_k_scores = pd.DataFrame(i_list, columns=columns)
best_feature_k_scores.sort_values(by='score').head()
According to the KNN model, the most accurate price predictions come from using the five best features with only 2 nearest neighbors (k = 2).
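As a closing sketch (an illustration, not part of the analysis above), the winning configuration can be fit and exercised directly:
# fit the five best features with k = 2, the best-scoring configuration found above
final_knn = KNeighborsRegressor(n_neighbors=2)
final_knn.fit(cars_numerical_values[best_features], cars_numerical_values['price'])
# smoke test on the first (scaled) row; a real evaluation would use a held-out split
final_knn.predict(cars_numerical_values[best_features].head(1))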
plt.style.use('fivethirtyeight')
plt.figure(figsize=(20, 10))
# one line per feature count so the curves do not zigzag across groups
for n, group in best_feature_k_scores.groupby('n features'):
    plt.plot(group['k'], group['score'], label=f'{n} features')
plt.xlabel('K Value for KNN')
plt.ylabel('RMSE')
plt.title('Cross Validation of K Values', fontdict={'fontsize': 40})
plt.legend()