import pandas as pd
import numpy as np
cols = ['symboling', 'normalized-losses', 'make', 'fuel-type',
'aspiration', 'num-of-doors', 'body-style', 'drive-wheels',
'engine-location', 'wheel-base', 'length', 'width', 'height',
'curb-weight', 'engine-type', 'num-of-cylinders', 'engine-size',
'fuel-system', 'bore', 'stroke', 'compression-ratio', 'horsepower',
'peak-rpm', 'city-mpg', 'highway-mpg', 'price']
cars = pd.read_csv('imports-85.data', names=cols)
cars.head(5)
| | symboling | normalized-losses | make | fuel-type | aspiration | num-of-doors | body-style | drive-wheels | engine-location | wheel-base | ... | engine-size | fuel-system | bore | stroke | compression-ratio | horsepower | peak-rpm | city-mpg | highway-mpg | price |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3 | ? | alfa-romero | gas | std | two | convertible | rwd | front | 88.6 | ... | 130 | mpfi | 3.47 | 2.68 | 9.0 | 111 | 5000 | 21 | 27 | 13495 |
| 1 | 3 | ? | alfa-romero | gas | std | two | convertible | rwd | front | 88.6 | ... | 130 | mpfi | 3.47 | 2.68 | 9.0 | 111 | 5000 | 21 | 27 | 16500 |
| 2 | 1 | ? | alfa-romero | gas | std | two | hatchback | rwd | front | 94.5 | ... | 152 | mpfi | 2.68 | 3.47 | 9.0 | 154 | 5000 | 19 | 26 | 16500 |
| 3 | 2 | 164 | audi | gas | std | four | sedan | fwd | front | 99.8 | ... | 109 | mpfi | 3.19 | 3.40 | 10.0 | 102 | 5500 | 24 | 30 | 13950 |
| 4 | 2 | 164 | audi | gas | std | four | sedan | 4wd | front | 99.4 | ... | 136 | mpfi | 3.19 | 3.40 | 8.0 | 115 | 5500 | 18 | 22 | 17450 |
5 rows × 26 columns
The numeric columns are symboling, normalized-losses, wheel-base, length, width, height, curb-weight, engine-size, bore, stroke, compression-ratio, horsepower, peak-rpm, city-mpg, highway-mpg, and price. Note that num-of-doors and num-of-cylinders store their values as words ('two', 'four', and so on), so they cannot be used as numeric features without conversion. The continuous columns, from normalized-losses through highway-mpg, can be used as features to predict the price of a car. The target column is price, since this is what we are going to predict.
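As an aside, if we did want to use num-of-doors and num-of-cylinders as features, we could map the words to integers first. A minimal sketch, not applied in this analysis (the mapping dictionary is our own):

# Hypothetical word-to-integer mapping for the two word-valued columns
word_to_num = {'two': 2, 'three': 3, 'four': 4, 'five': 5,
               'six': 6, 'eight': 8, 'twelve': 12}
# cars['num-of-doors'] = cars['num-of-doors'].map(word_to_num)
# cars['num-of-cylinders'] = cars['num-of-cylinders'].map(word_to_num)

First, we replace the "?" placeholders with NaN so that the affected columns can later be converted to a numeric type.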
cars = cars.replace("?", np.nan)
cars.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
symboling            205 non-null int64
normalized-losses    164 non-null object
make                 205 non-null object
fuel-type            205 non-null object
aspiration           205 non-null object
num-of-doors         203 non-null object
body-style           205 non-null object
drive-wheels         205 non-null object
engine-location      205 non-null object
wheel-base           205 non-null float64
length               205 non-null float64
width                205 non-null float64
height               205 non-null float64
curb-weight          205 non-null int64
engine-type          205 non-null object
num-of-cylinders     205 non-null object
engine-size          205 non-null int64
fuel-system          205 non-null object
bore                 201 non-null object
stroke               201 non-null object
compression-ratio    205 non-null float64
horsepower           203 non-null object
peak-rpm             203 non-null object
city-mpg             205 non-null int64
highway-mpg          205 non-null int64
price                201 non-null object
dtypes: float64(5), int64(5), object(16)
memory usage: 41.7+ KB
Replacing "?" to NaN caused the affected columns to be converted to object data type. In order to use the numeric columns, we will convert them to the numeric type.
continuous_val_cols = ['normalized-losses', 'wheel-base',
'length', 'width', 'height', 'curb-weight', 'engine-size',
'bore', 'stroke', 'compression-ratio', 'horsepower',
'peak-rpm', 'city-mpg', 'highway-mpg', 'price']
cars_to_numeric = cars[continuous_val_cols].astype('float')
cars_to_numeric.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 15 columns):
normalized-losses    164 non-null float64
wheel-base           205 non-null float64
length               205 non-null float64
width                205 non-null float64
height               205 non-null float64
curb-weight          205 non-null float64
engine-size          205 non-null float64
bore                 201 non-null float64
stroke               201 non-null float64
compression-ratio    205 non-null float64
horsepower           203 non-null float64
peak-rpm             203 non-null float64
city-mpg             205 non-null float64
highway-mpg          205 non-null float64
price                201 non-null float64
dtypes: float64(15)
memory usage: 24.1 KB
We've now converted all of the numeric columns to float64, a numeric data type. In our analysis, we will focus on these columns.
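As a side note, pd.to_numeric with errors='coerce' would accomplish the same conversion in one step, since it turns any unparseable string (such as "?") into NaN:

# Hedged alternative: coerce each column to numeric, mapping
# non-numeric strings straight to NaN in a single step
# cars_to_numeric = cars[continuous_val_cols].apply(pd.to_numeric, errors='coerce')

Next, let's count the missing values in the normalized-losses column.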
print(cars_to_numeric['normalized-losses'].isnull().sum())
print(cars_to_numeric.shape)
41
(205, 15)
There are 41 rows with missing values in the normalized-losses column. That is 20% of our data (41 of 205 rows), so rather than lose that many rows or impute a fifth of the column, we will drop the column entirely. Let's do so below.
cars_to_numeric = cars_to_numeric.drop('normalized-losses', axis=1)
Now we will consider the missing values remaining in other columns of the dataframe.
cars_to_numeric.isnull().sum()
wheel-base           0
length               0
width                0
height               0
curb-weight          0
engine-size          0
bore                 4
stroke               4
compression-ratio    0
horsepower           2
peak-rpm             2
city-mpg             0
highway-mpg          0
price                4
dtype: int64
The remaining missing values make up at most 2% of any column (4 of 205 values), so we will replace them with the mean of each column.
cars_to_numeric = cars_to_numeric.fillna(cars_to_numeric.mean())
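A quick sanity check confirms that no missing values remain:

# Should print 0 now that every NaN has been filled with a column mean
print(cars_to_numeric.isnull().sum().sum())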
Next, we normalize all of the columns to the 0-1 range using min-max scaling, except for price, which will be our target column.
normalized_cars_to_numeric = (cars_to_numeric - cars_to_numeric.min()) / (cars_to_numeric.max() - cars_to_numeric.min())
# Restore the original price values, since price is the target, not a feature
normalized_cars_to_numeric['price'] = cars_to_numeric['price']
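To verify the scaling, every feature column should now span the 0-1 range while price keeps its original dollar values:

# Each normalized feature should have min 0.0 and max 1.0
features_only = normalized_cars_to_numeric.drop('price', axis=1)
print(features_only.min().min(), features_only.max().max())

Now we define a function that trains and tests a univariate k-nearest neighbors model on a given feature column, using a 50/50 train/test split, and returns the RMSE of its price predictions.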
def knn_train_test(train_col, target_col, dataframe):
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.metrics import mean_squared_error
    from math import sqrt

    np.random.seed(1)
    # Shuffle the rows of the dataframe
    shuffled_rows = np.random.permutation(dataframe.index)
    randomized_df = dataframe.reindex(shuffled_rows)
    # Split the shuffled rows 50/50 into train and test sets
    length = int(len(randomized_df) / 2)
    train_df = randomized_df.iloc[0:length]
    test_df = randomized_df.iloc[length:]
    # Fit on the training half and evaluate RMSE on the test half
    knn = KNeighborsRegressor()
    train_features = train_df[[train_col]]
    train_target = train_df[target_col]
    knn.fit(train_features, train_target)
    predictions = knn.predict(test_df[[train_col]])
    mse = mean_squared_error(test_df[target_col], predictions)
    rmse = sqrt(mse)
    return rmse
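As an aside, the 50/50 holdout split inside the function could also be written with scikit-learn's train_test_split helper; a minimal sketch, not used here (its shuffling differs from np.random.permutation, so the exact RMSE values would change):

# from sklearn.model_selection import train_test_split
# train_df, test_df = train_test_split(dataframe, test_size=0.5, random_state=1)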
# Use the function above to calculate an RMSE for each feature.
# First drop price from the candidate columns, since it is the target.
train_cols = normalized_cars_to_numeric.columns.drop('price')

# Calculate the RMSE for every candidate feature
rmses = {}
for col in train_cols:
    rmse_val = knn_train_test(col, 'price', normalized_cars_to_numeric)
    rmses[col] = rmse_val
# View the RMSE results in sorted order
rmses_series = pd.Series(rmses)
rmses_series.sort_values()
engine-size          4244.103421
highway-mpg          4245.734567
curb-weight          4436.523561
width                5030.139338
city-mpg             5085.047194
horsepower           5092.272401
length               5418.778301
wheel-base           5743.877086
bore                 6746.031651
compression-ratio    7177.202061
height               7832.152833
peak-rpm             7965.541705
stroke               8096.653898
dtype: float64
Engine size performed best using the default k value (5). Now we will modify the function above to also accept the set of k values to test as a parameter, defaulting to k = 1, 3, 5, 7, and 9.
def updated_knn_train_test(train_col, target_col, dataframe, k_vals=(1, 3, 5, 7, 9)):
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.metrics import mean_squared_error
    from math import sqrt

    np.random.seed(1)
    # Shuffle the rows of the dataframe
    shuffled_rows = np.random.permutation(dataframe.index)
    randomized_df = dataframe.reindex(shuffled_rows)
    # Split the shuffled rows 50/50 into train and test sets
    length = int(len(randomized_df) / 2)
    train_df = randomized_df.iloc[0:length]
    test_df = randomized_df.iloc[length:]
    # Fit and evaluate one model per k value
    k_rmses = {}
    for k_val in k_vals:
        knn = KNeighborsRegressor(n_neighbors=k_val)
        train_features = train_df[[train_col]]
        train_target = train_df[target_col]
        knn.fit(train_features, train_target)
        predictions = knn.predict(test_df[[train_col]])
        k_mse = mean_squared_error(test_df[target_col], predictions)
        k_rmse = sqrt(k_mse)
        k_rmses[k_val] = k_rmse
    return k_rmses
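Calling the updated function on a single feature returns a dictionary mapping each k value to its RMSE. For example, for engine-size:

updated_knn_train_test('engine-size', 'price', normalized_cars_to_numeric)
# -> {1: 4775.98, 3: 4211.35, 5: 4244.10, 7: 4028.23, 9: 4125.12}
# (rounded; full-precision values appear in the results below)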
# Recompute the RMSE values, this time across the range of k values.
# As before, drop price from the candidate columns, since it is the target.
train_cols = normalized_cars_to_numeric.columns.drop('price')

rmses = {}
for col in train_cols:
    rmse_val = updated_knn_train_test(col, 'price', normalized_cars_to_numeric)
    rmses[col] = rmse_val
rmses
{'bore': {1: 7930.548590614734, 3: 6353.8909942494565, 5: 6746.031650636273, 7: 7246.705124103924, 9: 7184.417703627346},
 'city-mpg': {1: 6449.848972168971, 3: 4808.843642391777, 5: 5085.0471938761675, 7: 4958.158653064454, 9: 5021.788908379828},
 'compression-ratio': {1: 7255.772135357892, 3: 7485.226183865361, 5: 7177.202060550313, 7: 7333.967327565648, 9: 7426.77234607814},
 'curb-weight': {1: 5842.169653813219, 3: 4527.849193303364, 5: 4436.523561278551, 7: 4320.057970537657, 9: 4181.086220663609},
 'engine-size': {1: 4775.980594973188, 3: 4211.354845313718, 5: 4244.103420588581, 7: 4028.227456215567, 9: 4125.116174076774},
 'height': {1: 10653.050778292903, 3: 8176.934988463841, 5: 7832.152832500197, 7: 7743.870132519971, 9: 7632.665665042622},
 'highway-mpg': {1: 4445.090471315778, 3: 4442.016315845971, 5: 4245.734566600798, 7: 4266.445995724475, 9: 4522.647593669363},
 'horsepower': {1: 5261.091645623908, 3: 4995.302906280692, 5: 5092.272401434547, 7: 5025.308887429083, 9: 4989.9865689273765},
 'length': {1: 6853.715819594109, 3: 5113.209767682691, 5: 5418.778300690459, 7: 5618.149068803402, 9: 5641.426309574451},
 'peak-rpm': {1: 9386.46161279999, 3: 8096.1146568480835, 5: 7965.54170513815, 7: 8256.807774278761, 9: 8101.58286959599},
 'stroke': {1: 7345.451363903154, 3: 7455.212313651712, 5: 8096.65389823726, 7: 7935.188913776914, 9: 8066.594604492951},
 'wheel-base': {1: 5400.51418967299, 3: 5629.538534558065, 5: 5743.8770860517825, 7: 6048.744597831812, 9: 6360.529322830884},
 'width': {1: 5739.245757378005, 3: 5291.949721784998, 5: 5030.139338284197, 7: 4933.6396353832, 9: 4951.829909070571}}
Plotting RMSE against k for each feature makes these results easier to compare:
import matplotlib.pyplot as plt
%matplotlib inline

# Plot one RMSE-versus-k curve per feature
for col, k_rmse in rmses.items():
    x = list(k_rmse.keys())
    y = list(k_rmse.values())
    plt.plot(x, y)

plt.ylabel("RMSE (Price, USD)")
plt.xlabel("k value (number of nearest neighbors)")
plt.show()
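A natural next step is to rank the features by their average RMSE across the tested k values, for example to pick candidates for a multivariate model; a minimal sketch:

# Average RMSE over k = 1, 3, 5, 7, 9 for each feature
avg_rmses = {feature: np.mean(list(k_rmse.values()))
             for feature, k_rmse in rmses.items()}
pd.Series(avg_rmses).sort_values()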