#!/usr/bin/env python
# coding: utf-8

# # *Predicting Car Prices*
# 
# ***In this project, we will predict a car's market price using its attributes. The data set we will be working with contains information on various cars. For each car we have information about the technical aspects of the vehicle, such as the motor's displacement, the weight of the car, the miles per gallon, how fast the car accelerates, and more.***
# 
# [Download](https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data)
# [Documentation](https://archive.ics.uci.edu/ml/datasets/automobile)
# 
# ***We will be using the k-nearest neighbors algorithm to predict a car's price.***

# ## Exploring Data
# 
# We will read the data into a dataframe. Since the data file doesn't come with a header row, we need to add proper column names.
# Also, we will drop non-numerical columns that can't be used as features for our model.

# In[1]:

import pandas as pd
import numpy as np

pd.set_option('display.max_columns', 100)

cars = pd.read_csv('imports-85.csv')
print(cars.shape)
cars.head()

# It looks like this dataset does not include the column names. We'll have to add the column names manually using the [documentation](https://archive.ics.uci.edu/ml/datasets/automobile).

# In[2]:

columns = ['symboling', 'normalized_losses', 'make', 'fuel_type', 'aspiration', 'num_of_doors',
           'body_style', 'drive_wheels', 'engine_location', 'wheel_base', 'length', 'width',
           'height', 'curb_weight', 'engine_type', 'num_of_cylinders', 'engine_size',
           'fuel_system', 'bore', 'stroke', 'compression_ratio', 'horsepower', 'peak_rpm',
           'city_mpg', 'highway_mpg', 'price']

cars = pd.read_csv('imports-85.csv', names=columns)
cars.head()

# ## Data Cleaning and Preparation
# 
# The k-nearest neighbors algorithm uses the distance formula to determine the nearest neighbors. That means we can only use numerical columns for this machine learning algorithm. We'll also have to do a little bit of data cleaning. We will perform the following steps:
# 
# 1. Replace missing and meaningless values like `?` with `np.nan`
# 2. Convert string columns (which are actually numeric) to a numeric datatype
# 3. Drop rows where the target column is missing
# 4. Replace remaining missing values with the average value of their column
# 5. Normalize the dataframe, except the `price` column
# 
# We can also separate out the numerical (continuous) columns given in the documentation, as follows:

# In[3]:

continuous_values_cols = ['normalized_losses', 'wheel_base', 'length', 'width', 'height',
                          'curb_weight', 'bore', 'stroke', 'compression_ratio', 'horsepower',
                          'peak_rpm', 'city_mpg', 'highway_mpg', 'price']

numeric_cars = cars[continuous_values_cols].copy()
numeric_cars.head()

# 1. Replace missing and meaningless values like `?` with `np.nan`

# In[4]:

numeric_cars.replace('?', np.nan, inplace=True)

# 2. Convert string columns (which are actually numeric) to a numeric datatype

# In[5]:

# Check which columns are of object type
text_cols = numeric_cars.select_dtypes(include=['object']).columns
print(text_cols)

numeric_cars[text_cols] = numeric_cars[text_cols].astype('float')

# Checking whether any non-numerical column is left
numeric_cars.dtypes.value_counts()

# 3. Drop rows where the target column is missing

# In[6]:

# Because `price` is the column we want to predict, let's remove any rows with missing `price` values.
numeric_cars.dropna(subset=['price'], inplace=True)

# Checking whether any null values remain in `price`
numeric_cars['price'].isnull().sum()

# 4. Replace remaining missing values with the average value of their column

# In[7]:

# Replace missing values in other columns using their respective column means.
numeric_cars.fillna(numeric_cars.mean(), inplace=True)
numeric_cars.isnull().sum().value_counts()

# The k-nearest neighbors algorithm uses the Euclidean distance to determine the closest neighbors.
# 
# $$ Distance = \sqrt{(q_1-p_1)^2+(q_2-p_2)^2+\dots+(q_n-p_n)^2} $$
# 
# where q and p represent two rows and the subscripts represent the columns. However, each column has a different scale. For example, if we take rows 2 and 3, the peak RPM differs by 500 while the width differs by only 0.7. The algorithm would therefore give far more weight to the difference in peak RPM.
# 
# That is why it is important to normalize the dataset so that every feature is on a comparable scale.
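# As a quick check, the cell below prints these raw differences directly (a small illustrative cell; it is not needed for the rest of the analysis):

# In[ ]:

# Raw differences between rows 2 and 3 (by position) for two columns on very
# different scales: an unscaled Euclidean distance would be dominated by peak_rpm.
row_a = numeric_cars.iloc[2]
row_b = numeric_cars.iloc[3]
print('peak_rpm difference:', abs(row_a['peak_rpm'] - row_b['peak_rpm']))
print('width difference:   ', abs(row_a['width'] - row_b['width']))
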
# After normalization, each feature's values will fall between -1 and 1. For more information on feature scaling click [here](https://en.wikipedia.org/wiki/Feature_scaling).
# 
# $$ x' = \frac{x - \text{mean}(x)}{\max(x) - \min(x)} $$
# 
# In pandas this would be:
# 
# $$ df' = \frac{df - df.mean()}{df.max() - df.min()} $$
# 
# where `df` is any dataframe.
# 
# -------------
# 5. Normalize the dataframe, except the `price` column

# In[8]:

# Normalizing the dataframe using the mean-normalization formula above,
# then restoring the unscaled `price` column, since it is the target.
normalised_cars = (numeric_cars - numeric_cars.mean()) / (numeric_cars.max() - numeric_cars.min())
normalised_cars['price'] = numeric_cars['price']
normalised_cars.head()

# ### Applying Machine Learning
# 
# > **K-Nearest Neighbors**
# 
# Suppose we have a dataframe named 'train' and a row named 'test'. The idea behind k-nearest neighbors is to find the k rows from 'train' with the lowest distance to 'test'. We then take the average of the target column over those k rows and use it as the prediction for 'test'.
# 
# ***We will create a `knn_train_test` function which uses the `KNeighborsRegressor` class from scikit-learn.***

# In[9]:

from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

def knn_train_test(feature, target, df):
    # Randomizing the dataset
    np.random.seed(1)
    new_df = df.iloc[np.random.permutation(len(df))].copy()

    # Divide the data in half
    half_point = int(len(df)/2)
    train_df = new_df[:half_point]
    test_df = new_df[half_point:]

    # Fit a KNN model using the default k value (5 neighbors)
    knn = KNeighborsRegressor()
    knn.fit(train_df[[feature]], train_df[target])

    # Making predictions using the model
    predictions = knn.predict(test_df[[feature]])

    # Calculate and return the RMSE value
    rmse = np.sqrt(mean_squared_error(test_df[target], predictions))
    return rmse

# This function will train and test univariate models.
# 
# **First, we will evaluate which features give us the most accurate predictions.**

# In[10]:

# Extracting all feature names except price
columns = normalised_cars.columns.tolist()
columns.remove('price')

# Create a dictionary of RMSE values along with features
rmse_results = {}
for col in columns:
    rmse_results[col] = knn_train_test(col, 'price', normalised_cars)

# Converting the dictionary into a Series and sorting it to display the results
rmse_results = pd.Series(rmse_results)
rmse_results.sort_values()

# It looks like the `horsepower` feature gives us the lowest error. We should keep this ranking in mind when using the function with multiple features.
# 
# But we need to explore further. Let's modify the function to accept the k value (the number of neighbors) as a parameter. Then we can loop through a list of k values and features to determine which k value and features are most optimal for our machine learning model.
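# Before modifying the function, the toy cell below (made-up numbers, not the cars dataset) illustrates what changing `k` does: the prediction is the mean of the `k` nearest training targets, so it changes as `k` grows.

# In[ ]:

# Toy data (made-up values, not from the cars dataset)
toy_X = pd.DataFrame({'feature': [1.0, 2.0, 3.0, 10.0]})
toy_y = pd.Series([100, 200, 300, 1000])

for k in [1, 2, 3]:
    toy_knn = KNeighborsRegressor(n_neighbors=k)
    toy_knn.fit(toy_X, toy_y)
    # For a query of 1.4, the nearest rows are 1.0, then 2.0, then 3.0,
    # so the predictions should be 100, 150, and 200 respectively.
    print('k =', k, '->', toy_knn.predict(pd.DataFrame({'feature': [1.4]})))
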
# ***Modifying the `knn_train_test()` function to accept a `k` value as a parameter.***

# In[11]:

def knn_train_test2(feature, target, df, k_value):
    # Randomizing the dataset
    np.random.seed(1)
    new_df = df.iloc[np.random.permutation(len(df))].copy()

    # Divide the data in half
    half_point = int(len(df)/2)
    train_df = new_df[:half_point]
    test_df = new_df[half_point:]

    k_results = []
    # Fitting one model for each value of k
    for k in k_value:
        knn = KNeighborsRegressor(n_neighbors=k)
        knn.fit(train_df[[feature]], train_df[target])

        # Making predictions using the model
        predictions = knn.predict(test_df[[feature]])

        # Calculate the RMSE value and store it
        rmse = np.sqrt(mean_squared_error(test_df[target], predictions))
        k_results.append(rmse)

    return k_results

# ***Training and testing a univariate model using the following `k` values: `1, 3, 5, 7, and 9`.***

# In[12]:

# k values to try for the nearest-neighbors model
k_values = [1, 3, 5, 7, 9]

# Create a dictionary of RMSE values along with features
k_rmse_results = {}

# Looping through all the features
for col in columns:
    k_rmse_results[col] = knn_train_test2(col, 'price', normalised_cars, k_values)

k_rmse_results

# ***Visualising `RMSE`s for various values of `k` and different `features`***

# In[96]:

import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.style as style
style.use('fivethirtyeight')

plt.figure(figsize=(10, 12))
for k, v in k_rmse_results.items():
    x = [1, 3, 5, 7, 9]
    y = v
    plt.plot(x, y, label=k)

plt.xlabel('k value')
plt.ylabel('RMSE')
plt.legend(bbox_to_anchor=(1.3, 1), borderaxespad=0)
plt.show()

# The visualisation isn't very helpful. Let's arrange the *average RMSE (Root Mean Squared Error)* and *features* in sorted order instead.
# 
# ***Finding the best `features` (those with the lowest `RMSE`s)***

# In[14]:

# Getting the average RMSE across different `k` values for each feature.
feature_rmse = {}
for k, v in k_rmse_results.items():
    avg_rmse = np.mean(v)
    feature_rmse[k] = avg_rmse

top_features = pd.Series(feature_rmse).sort_values()
top_features

# ***The table above reiterates our earlier finding that `horsepower` gives the lowest error.***
# 
# ## Multivariate Model
# 
# ***Now, we will modify the `knn_train_test` function to work with multiple features at once.***

# In[15]:

from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

def knn_train_test3(features, target, df):
    # Randomizing the dataset
    np.random.seed(1)
    shuffled_index = np.random.permutation(df.index)
    new_df = df.reindex(shuffled_index)

    # Divide the data in half
    half_point = int(len(df)/2)
    train_df = new_df[:half_point]
    test_df = new_df[half_point:]

    # Fitting the model with the default 5 neighbors
    knn = KNeighborsRegressor(n_neighbors=5)
    knn.fit(train_df[features], train_df[target])

    # Making predictions using the model
    predictions = knn.predict(test_df[features])

    # Calculate and return the RMSE value
    rmse = np.sqrt(mean_squared_error(test_df[target], predictions))
    return rmse

# ***Applying this function to the top features (those with the lowest error) should further improve the accuracy of our model.***

# In[16]:

rmse_results = {}

rmse_results['Top Two Features'] = knn_train_test3(top_features[:2].index, 'price', normalised_cars)
rmse_results['Top Three Features'] = knn_train_test3(top_features[:3].index, 'price', normalised_cars)
rmse_results['Top Four Features'] = knn_train_test3(top_features[:4].index, 'price', normalised_cars)
rmse_results['Top Five Features'] = knn_train_test3(top_features[:5].index, 'price', normalised_cars)

# Displaying the results sorted by RMSE
pd.Series(rmse_results).sort_values()

# We got the lowest error from the top `three` features, followed by the top `four` and `five` features.
# 
# ## Hyperparameter Tuning
# 
# ***Now, let's try varying the `K` values.***
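# One way to automate this kind of search is scikit-learn's `GridSearchCV`; the sketch below assumes the top four features and 5-fold cross-validation, and is shown only for reference. In the next cells we will perform the search manually so we can inspect the RMSE for every `k`.

# In[ ]:

from sklearn.model_selection import GridSearchCV

# Grid search over n_neighbors = 1..25 (the feature subset and cv=5 are assumptions)
grid = GridSearchCV(KNeighborsRegressor(),
                    param_grid={'n_neighbors': list(range(1, 26))},
                    scoring='neg_mean_squared_error',
                    cv=5)
grid.fit(normalised_cars[top_features[:4].index], normalised_cars['price'])

print('Best k:', grid.best_params_['n_neighbors'])
print('Best RMSE:', np.sqrt(-grid.best_score_))
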
# ***We can further tune our machine learning model by finding the optimal `K` value to use.***

# In[17]:

def knn_train_test_hyp(feature, target, df, k_value):
    # Randomizing the dataset
    np.random.seed(1)
    new_df = df.iloc[np.random.permutation(len(df))].copy()

    # Divide the data in half
    half_point = int(len(df)/2)
    train_df = new_df[:half_point]
    test_df = new_df[half_point:]

    k_results = []
    # Fitting one model for each value of k
    for k in k_value:
        knn = KNeighborsRegressor(n_neighbors=k)
        knn.fit(train_df[feature], train_df[target])

        # Making predictions using the model
        predictions = knn.predict(test_df[feature])

        # Calculate the RMSE value and store it
        rmse = np.sqrt(mean_squared_error(test_df[target], predictions))
        k_results.append(rmse)

    return k_results

# In[18]:

# Training and testing on the top three, four, and five feature combinations
col_names = ['Top Three', 'Top Four', 'Top Five']
k_values = list(range(1, 26))

rmse_results = {}
for i in range(3):
    rmse = knn_train_test_hyp(top_features[:i+3].index, 'price', normalised_cars, k_values)
    rmse_results['{} Features'.format(col_names[i])] = rmse

rmse_results

# In[85]:

labels = ['{} features'.format(x) for x in col_names]

plt.figure(figsize=(8, 4))
for k, v in rmse_results.items():
    x = np.arange(1, 26, 1)
    y = v
    plt.plot(x, y)

plt.xlabel('K Value')
plt.ylabel('RMSE')
font = {'family': 'serif', 'color': 'gray', 'weight': 'bold', 'size': 14}
plt.legend(labels=labels, loc='lower right')
plt.tight_layout()
plt.title('RMSE Scores for Various K and Feature Combinations', fontdict=font)
plt.savefig('rmse.png', dpi=300)
plt.show()

# In[20]:

# Getting the minimum RMSE across different `k` values for each feature combination.
# Because `k` runs from 1 to 25, the best `k` is the position of the minimum plus one.
k_rmse = {}
for k, v in rmse_results.items():
    min_rmse = min(v)
    k_rmse[min_rmse] = [k, v.index(min_rmse) + 1]

pd.Series(k_rmse).sort_index()

# From the last two cells, we can observe that choosing the top `four features` with a `K value of 2` gives us the `lowest RMSE of 2514`.
# 
# ### K-Fold Cross Validation
# 
# We can improve our model by splitting the data into more than 2 folds. Now, we can use cross-validation with `KFold` and check how many `splits` help us predict the price best.

# In[21]:

from sklearn.model_selection import cross_val_score, KFold

num_folds = [3, 5, 7, 9, 10, 11, 13, 15, 17, 19, 21, 23]

rmse_scores = {}
for fold in num_folds:
    kf = KFold(fold, shuffle=True, random_state=1)
    knn = KNeighborsRegressor(n_neighbors=2)
    mses = cross_val_score(knn, normalised_cars[top_features[:4].index], normalised_cars["price"],
                           scoring="neg_mean_squared_error", cv=kf)
    rmses = np.sqrt(np.absolute(mses))
    avg_rmse = np.mean(rmses)
    rmse_scores[avg_rmse] = [str(fold) + ' folds']

pd.Series(rmse_scores).sort_index()

# ***Here, we can observe that the lowest RMSE score of `2416.12` occurs with `21 folds` and `k = 2`, using the top four features: `horsepower`, `width`, `curb_weight`, and `city_mpg`.***
# 
# ***That is it for now; the goal of this project was to explore the fundamentals of k-nearest neighbors.***
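# As a final illustration, the chosen configuration (`k = 2` with the top four features) can be fit on the full normalised dataset and used to generate a prediction. The example row below comes from the training data itself, so this is only a usage sketch, not an accuracy estimate.

# In[ ]:

# Fit the final model on all rows using the selected features
final_features = top_features[:4].index
final_knn = KNeighborsRegressor(n_neighbors=2)
final_knn.fit(normalised_cars[final_features], normalised_cars['price'])

# Predict the price of the first car in the dataset (a row the model has already seen)
example_car = normalised_cars[final_features].iloc[[0]]
print('Predicted price:', final_knn.predict(example_car)[0])
print('Actual price:   ', normalised_cars['price'].iloc[0])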