#!/usr/bin/env python
# coding: utf-8

# In[1]:

from math import sqrt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor

cols = [
    'symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
    'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
    'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type',
    'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke',
    'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg',
    'price',
]
cars = pd.read_csv('imports-85.data', names=cols)
cars.head(5)

# The numeric columns are symboling, normalized-losses, num-of-doors?,
# wheel-base, length, width, height, curb-weight, num-of-cylinders?,
# engine-size, bore, stroke, compression-ratio, horsepower, peak-rpm,
# city-mpg, highway-mpg, and price. The ones that can be used as features to
# predict the price of a car are num-of-doors, num-of-cylinders, engine-size,
# city-mpg, and highway-mpg. The target column is price since this is what we
# are going to predict.

# In[2]:

cars = cars.replace("?", np.nan)

# In[3]:

cars.info()

# Columns that contained "?" were read as the object data type, and they stay
# object after the "?" values are replaced with NaN. In order to use the
# numeric columns, we will convert them to a numeric type.

# In[4]:

continuous_val_cols = [
    'normalized-losses', 'wheel-base', 'length', 'width', 'height',
    'curb-weight', 'engine-size', 'bore', 'stroke', 'compression-ratio',
    'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price',
]
cars_to_numeric = cars[continuous_val_cols].astype('float')

# In[5]:

cars_to_numeric.info()

# We've now converted all of the numeric columns to float64, a numeric data
# type. In our analysis, we will focus on these columns.

# In[6]:

print(cars_to_numeric['normalized-losses'].isnull().sum())
print(cars_to_numeric.shape)

# There are 41 rows with missing values in the normalized-losses column. This
# is 20% of our data so we will drop this column. Let's do so below.

# In[7]:

cars_to_numeric = cars_to_numeric.drop('normalized-losses', axis=1)

# Now we will consider the missing values remaining in other columns of the
# dataframe.

# In[8]:

cars_to_numeric.isnull().sum()

# The remaining missing values are only 2%, at most, of the total values. So
# we will replace them with the average values of the columns.

# In[9]:

cars_to_numeric = cars_to_numeric.fillna(cars_to_numeric.mean())

# Normalize all columns except for price, which will be our target column.

# In[10]:

col_min = cars_to_numeric.min()
col_max = cars_to_numeric.max()
normalized_cars_to_numeric = (cars_to_numeric - col_min) / (col_max - col_min)
# Replace the normalized price column with the original, unscaled values so
# that the RMSEs below are expressed in the same units as price.
normalized_cars_to_numeric['price'] = cars_to_numeric['price']

# In[11]:


def _split_train_test(dataframe):
    """Shuffle the rows with a fixed seed and split them in half.

    Returns a ``(train_df, test_df)`` tuple. The first ``len // 2`` shuffled
    rows form the training set and the rest form the test set.
    """
    # Fixed seed so that every call produces the same shuffle, and therefore
    # the same split, making the RMSEs comparable between features.
    np.random.seed(1)
    shuffled_rows = np.random.permutation(dataframe.index)
    randomized_df = dataframe.reindex(shuffled_rows)
    half = len(randomized_df) // 2
    return randomized_df.iloc[:half], randomized_df.iloc[half:]


def _fit_and_score(knn, train_df, test_df, train_col, target_col):
    """Fit ``knn`` on one feature column and return its RMSE on the test set."""
    knn.fit(train_df[[train_col]], train_df[target_col])
    predictions = knn.predict(test_df[[train_col]])
    return sqrt(mean_squared_error(test_df[target_col], predictions))


def knn_train_test(train_col, target_col, dataframe):
    """Train a univariate k-nearest-neighbors model and return its RMSE.

    Args:
        train_col: Name of the single feature column to train on.
        target_col: Name of the column to predict.
        dataframe: DataFrame containing both columns.

    Returns:
        The root mean squared error of the predictions on the held-out half
        of the rows, using KNeighborsRegressor with its default settings.
    """
    train_df, test_df = _split_train_test(dataframe)
    return _fit_and_score(
        KNeighborsRegressor(), train_df, test_df, train_col, target_col
    )


# Use the function above to calculate rmses. First drop price from
# the training columns since it is our target.
train_col = normalized_cars_to_numeric.columns.drop('price')

# Calculate rmses for all training columns.
rmses = {
    col: knn_train_test(col, 'price', normalized_cars_to_numeric)
    for col in train_col
}

# View the rmse results in an agreeable sorted format.
rmses_series = pd.Series(rmses)
rmses_series.sort_values()

# Engine size performed best using the default k value.
# Now we will modify the function above to also accept k as a parameter.

# In[12]:


def updated_knn_train_test(train_col, target_col, dataframe,
                           k_vals=(1, 3, 5, 7, 9)):
    """Train univariate k-nearest-neighbors models for several k values.

    Args:
        train_col: Name of the single feature column to train on.
        target_col: Name of the column to predict.
        dataframe: DataFrame containing both columns.
        k_vals: Iterable of ``n_neighbors`` values to evaluate. Defaults to
            1, 3, 5, 7 and 9.

    Returns:
        A dict mapping each k value to the RMSE of the predictions on the
        held-out half of the rows.
    """
    # The split does not depend on k, so it is computed once for all models.
    train_df, test_df = _split_train_test(dataframe)
    return {
        k_vl: _fit_and_score(
            KNeighborsRegressor(n_neighbors=k_vl),
            train_df, test_df, train_col, target_col,
        )
        for k_vl in k_vals
    }


def _plot_rmse_by_k(k_rmses_by_feature):
    """Draw one RMSE-versus-k line per feature on a single figure."""
    for k_rmses in k_rmses_by_feature.values():
        x = list(k_rmses.keys())
        y = list(k_rmses.values())
        plt.plot(x, y)
    plt.ylabel("RMSE (Price, USD)")
    plt.xlabel("k-value, number of similar prices")
    plt.show()


# Use the function above to calculate rmses. First drop price from
# the training columns since it is our target.
train_col = normalized_cars_to_numeric.columns.drop('price')

# Calculate rmses for all training columns; each value is a dict of k -> rmse.
rmses = {
    col: updated_knn_train_test(col, 'price', normalized_cars_to_numeric)
    for col in train_col
}
#rmses

# get_ipython() is not defined in this file; it is supplied by IPython/Jupyter.
get_ipython().run_line_magic('matplotlib', 'inline')

_plot_rmse_by_k(rmses)

# The plot looks like this:

# In[14]:

get_ipython().run_line_magic('matplotlib', 'inline')

_plot_rmse_by_k(rmses)

# In[ ]: