#!/usr/bin/env python
# coding: utf-8

# # Predicting Car Prices

# In[1]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import KFold, cross_val_score

# # Introduction to the Data Set

# In[2]:

columns = ['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
           'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
           'wheel-base', 'length', 'width', 'height', 'curb-weight',
           'engine-type', 'num-of-cylinders', 'engine-size', 'fuel-system',
           'bore', 'stroke', 'compression-ratio', 'horsepower', 'peak-rpm',
           'city-mpg', 'highway-mpg', 'price']
cars = pd.read_csv('imports-85.data', names=columns)

# In[3]:

cars.head(3)

# In[4]:

numerical_values = ['normalized-losses', 'wheel-base', 'length', 'width',
                    'height', 'curb-weight', 'engine-size', 'bore', 'stroke',
                    'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg',
                    'highway-mpg', 'price']
cars_numerical_values = cars[numerical_values]

# In[5]:

cars_numerical_values.head(3)

# # Data Cleaning

# In[6]:

cars_numerical_values.info()

# In[7]:

# replace the '?' placeholder used in the raw file with NaN
cars_numerical_values = cars_numerical_values.replace('?', np.nan)

# In[8]:

cars_numerical_values

# In[9]:

cars_numerical_values.describe()

# In[10]:

# convert all fields to the float data type
cars_numerical_values = cars_numerical_values.astype(float)

# In[11]:

# drop rows with a missing price, since price is what we're trying to predict
cars_numerical_values = cars_numerical_values.dropna(subset=['price'])
cars_numerical_values.isnull().sum()

# In[12]:

# replace the remaining null values with the mean of their column
cars_numerical_values = cars_numerical_values.fillna(cars_numerical_values.mean())
cars_numerical_values.isnull().sum()

# In[13]:

# normalize the features with min-max scaling, leaving the price column untouched
price_column = cars_numerical_values['price']
cars_numerical_values = (cars_numerical_values - cars_numerical_values.min()) / (
    cars_numerical_values.max() - cars_numerical_values.min())
cars_numerical_values['price'] = price_column
cars_numerical_values.head(3)

# In[14]:

cars_numerical_values.describe()

# # Univariate Model

# A univariate model analyzes a single variable (feature) at a time.

# In[15]:

def knn_train_test(train, target, k):
    """Return the average RMSE of a k-nearest-neighbors regressor
    across two cross-validation folds."""
    kf = KFold(n_splits=2, shuffle=True, random_state=1)
    knn = KNeighborsRegressor(n_neighbors=k)
    mses = cross_val_score(knn, train, target,
                           scoring="neg_mean_squared_error", cv=kf)
    rmse = np.mean(np.sqrt(np.abs(mses)))
    return rmse
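# In[ ]:

# A quick sanity check (an aside, not part of the original analysis): run the
# helper on a small synthetic dataset to confirm what it returns.
# cross_val_score reports *negative* MSE for each of the two folds, so the
# helper flips the sign, takes the square root per fold, and averages the
# two fold RMSEs into a single score.
rng = np.random.default_rng(1)
toy_X = pd.DataFrame({'x': rng.random(100)})
toy_y = pd.Series(toy_X['x'] * 10 + rng.random(100))  # noisy linear target
knn_train_test(toy_X, toy_y, 5)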
# In[16]:

# loop through all features to see which performs best
features = cars_numerical_values.columns.drop('price').to_list()
target = cars_numerical_values['price']
k = 5
features_rmse = {}
for feature in features:
    rmse = knn_train_test(cars_numerical_values[[feature]], target, k)
    features_rmse[feature] = int(rmse)

# create a Series from the dictionary so the results are easier to view and work with
features_rmse = pd.Series(features_rmse)
features_rmse.sort_values()

# In[17]:

# repeat the univariate search across a range of k values
k_range = range(1, 31)
i_list = []
features = cars_numerical_values.columns.drop('price').to_list()
target = cars_numerical_values['price']
columns = ['k', 'feature', 'score']
for k in k_range:
    for feature in features:
        rmse = knn_train_test(cars_numerical_values[[feature]], target, k)
        i_list.append([k, feature, int(rmse)])
k_features_scores = pd.DataFrame(i_list, columns=columns)
k_features_scores

# In[18]:

k_features_scores.sort_values(by=['score']).head(10)

# In[19]:

plt.figure(figsize=(20, 10))
plt.plot(k_features_scores['k'], k_features_scores['score'])
plt.xlabel('K Value for KNN')
plt.ylabel('Cross Validation RMSE')

# In[20]:

# assessing which k value is best on average
k_means = k_features_scores.groupby('k').mean()
k_means.sort_values('score').head(3)

# In[21]:

# assessing which feature produces the best average score
feature_means = k_features_scores.drop(columns='k').groupby('feature').mean()
feature_means.sort_values('score').head(7)

# # Multivariate Model

# A multivariate model examines two or more variables (features) together. It
# reuses the knn_train_test helper defined above.

# In[23]:

# running the model on all features with k = 5
features = cars_numerical_values.columns.drop('price').to_list()
train = cars_numerical_values[features]
target = cars_numerical_values['price']
columns = ['k', 'score']
k = 5
i_list = []
rmse = knn_train_test(train, target, k)
i_list.append([k, int(rmse)])
k_scores = pd.DataFrame(i_list, columns=columns)
k_scores.sort_values(by='score').head()

# In[24]:

# assessing which feature produces the best average score
feature_means = k_features_scores.drop(columns='k').groupby('feature').mean()
feature_means.sort_values('score').head(7)

# In[25]:

# convert the sorted groupby index into a list of the five best features
best_features = feature_means.sort_values('score').head(5)
best_features = best_features.index.to_list()
best_features

# In[26]:

# running the model on the best n features, adding one feature at a time
# (the nested subsets are listed explicitly in the sketch after this cell)
i_list = []
n_feature = []
target = cars_numerical_values['price']
columns = ['n features', 'score']
k = 5
for feature in best_features:
    n_feature.append(feature)
    if len(n_feature) > 1:  # skip the single-feature model; it was covered above
        rmse = knn_train_test(cars_numerical_values[n_feature], target, k)
        i_list.append([len(n_feature), int(rmse)])
k_scores = pd.DataFrame(i_list, columns=columns)
k_scores.sort_values(by='score').head()

# It looks like the **multivariate model** performs best with **four features**.
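# In[ ]:

# For readability (an aside, not in the original notebook): list the exact
# nested feature subsets the loop above evaluated. Each model adds the next
# best univariate feature to the previous subset, so the n-feature model is
# always a superset of the (n-1)-feature model.
for n in range(2, len(best_features) + 1):
    print(n, best_features[:n])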
# # Hyperparameter Tuning

# In[27]:

# convert the sorted groupby index into a list of the five best features
best_features = feature_means.sort_values('score').head(5)
best_features = best_features.index.to_list()
best_features

# In[28]:

# running the model on the best n features across a range of k values
i_list = []
n_feature = []
k_range = range(1, 31)
target = cars_numerical_values['price']
columns = ['n features', 'k', 'score']
for feature in best_features:
    n_feature.append(feature)
    if len(n_feature) > 1:  # skip the single-feature model, as above
        for k in k_range:
            rmse = knn_train_test(cars_numerical_values[n_feature], target, k)
            i_list.append([len(n_feature), k, int(rmse)])
best_feature_k_scores = pd.DataFrame(i_list, columns=columns)
best_feature_k_scores.sort_values(by='score').head()

# According to the KNN model, **the most accurate price predictions** come from using the **5 best features** with only **2 nearest neighbors** (the k value).

# In[29]:

plt.style.use('fivethirtyeight')
plt.figure(figsize=(20, 10))
plt.plot(best_feature_k_scores['k'], best_feature_k_scores['score'])
plt.xlabel('K Value for KNN')
plt.ylabel('RMSE')
plt.title("Cross Validation of K Values", fontsize=40)
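# In[ ]:

# A minimal sketch of the final model implied by the tuning above (an aside,
# not part of the original notebook): fit a KNeighborsRegressor with the best
# k on the best feature subset and predict the first car's price as a smoke
# test. In practice, predictions should be made on a held-out test set rather
# than on rows the model was trained on.
best_row = best_feature_k_scores.sort_values(by='score').iloc[0]
final_knn = KNeighborsRegressor(n_neighbors=int(best_row['k']))
final_knn.fit(cars_numerical_values[best_features], target)
final_knn.predict(cars_numerical_values[best_features].head(1))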