In this guided project, we'll practice the machine learning workflow to predict a car's market price using its attributes.
The dataset we will be working with contains information on various cars. For each car we have technical specifications such as the engine's displacement, the curb weight, the fuel efficiency in miles per gallon, the horsepower, and more.
import pandas as pd
# Read dataset into a DataFrame
cars = pd.read_csv('imports-85.data', header = None) # Data has no header
cars.head()
  | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 3 | ? | alfa-romero | gas | std | two | convertible | rwd | front | 88.6 | 168.8 | 64.1 | 48.8 | 2548 | dohc | four | 130 | mpfi | 3.47 | 2.68 | 9.0 | 111 | 5000 | 21 | 27 | 13495 |
1 | 3 | ? | alfa-romero | gas | std | two | convertible | rwd | front | 88.6 | 168.8 | 64.1 | 48.8 | 2548 | dohc | four | 130 | mpfi | 3.47 | 2.68 | 9.0 | 111 | 5000 | 21 | 27 | 16500 |
2 | 1 | ? | alfa-romero | gas | std | two | hatchback | rwd | front | 94.5 | 171.2 | 65.5 | 52.4 | 2823 | ohcv | six | 152 | mpfi | 2.68 | 3.47 | 9.0 | 154 | 5000 | 19 | 26 | 16500 |
3 | 2 | 164 | audi | gas | std | four | sedan | fwd | front | 99.8 | 176.6 | 66.2 | 54.3 | 2337 | ohc | four | 109 | mpfi | 3.19 | 3.40 | 10.0 | 102 | 5500 | 24 | 30 | 13950 |
4 | 2 | 164 | audi | gas | std | four | sedan | 4wd | front | 99.4 | 176.6 | 66.4 | 54.3 | 2824 | ohc | five | 136 | mpfi | 3.19 | 3.40 | 8.0 | 115 | 5500 | 18 | 22 | 17450 |
The dataset has no header row, but we can get the header information from the documentation that accompanies it:
# Create a string with all the header information
header = '''1. symboling: -3, -2, -1, 0, 1, 2, 3.A value of +3 indicates that the auto is risky, -3 that it is probably pretty safe.
2. normalized-losses: continuous from 65 to 256.
3. make:
alfa-romero, audi, bmw, chevrolet, dodge, honda,
isuzu, jaguar, mazda, mercedes-benz, mercury,
mitsubishi, nissan, peugot, plymouth, porsche,
renault, saab, subaru, toyota, volkswagen, volvo
4. fuel-type: diesel, gas.
5. aspiration: std, turbo.
6. num-of-doors: four, two.
7. body-style: hardtop, wagon, sedan, hatchback, convertible.
8. drive-wheels: 4wd, fwd, rwd.
9. engine-location: front, rear.
10. wheel-base: continuous from 86.6 120.9.
11. length: continuous from 141.1 to 208.1.
12. width: continuous from 60.3 to 72.3.
13. height: continuous from 47.8 to 59.8.
14. curb-weight: continuous from 1488 to 4066.
15. engine-type: dohc, dohcv, l, ohc, ohcf, ohcv, rotor.
16. num-of-cylinders: eight, five, four, six, three, twelve, two.
17. engine-size: continuous from 61 to 326.
18. fuel-system: 1bbl, 2bbl, 4bbl, idi, mfi, mpfi, spdi, spfi.
19. bore: continuous from 2.54 to 3.94.
20. stroke: continuous from 2.07 to 4.17.
21. compression-ratio: continuous from 7 to 23.
22. horsepower: continuous from 48 to 288.
23. peak-rpm: continuous from 4150 to 6600.
24. city-mpg: continuous from 13 to 49.
25. highway-mpg: continuous from 16 to 54.
26. price: continuous from 5118 to 45400.'''
# Each attribute name is preceded by its number and '. ', and '. ' never occurs
# inside the descriptions themselves (note e.g. the missing space in '3.A value'),
# so splitting on '. ' isolates each attribute
header = header.split('. ')
header
['1', 'symboling: -3, -2, -1, 0, 1, 2, 3.A value of +3 indicates that the auto is risky, -3 that it is probably pretty safe.\n\n2', 'normalized-losses: continuous from 65 to 256.\n3', 'make:\nalfa-romero, audi, bmw, chevrolet, dodge, honda,\nisuzu, jaguar, mazda, mercedes-benz, mercury,\nmitsubishi, nissan, peugot, plymouth, porsche,\nrenault, saab, subaru, toyota, volkswagen, volvo\n4', 'fuel-type: diesel, gas.\n5', 'aspiration: std, turbo.\n6', 'num-of-doors: four, two.\n7', 'body-style: hardtop, wagon, sedan, hatchback, convertible.\n8', 'drive-wheels: 4wd, fwd, rwd.\n9', 'engine-location: front, rear.\n10', 'wheel-base: continuous from 86.6 120.9.\n11', 'length: continuous from 141.1 to 208.1.\n12', 'width: continuous from 60.3 to 72.3.\n13', 'height: continuous from 47.8 to 59.8.\n14', 'curb-weight: continuous from 1488 to 4066.\n15', 'engine-type: dohc, dohcv, l, ohc, ohcf, ohcv, rotor.\n16', 'num-of-cylinders: eight, five, four, six, three, twelve, two.\n17', 'engine-size: continuous from 61 to 326.\n18', 'fuel-system: 1bbl, 2bbl, 4bbl, idi, mfi, mpfi, spdi, spfi.\n19', 'bore: continuous from 2.54 to 3.94.\n20', 'stroke: continuous from 2.07 to 4.17.\n21', 'compression-ratio: continuous from 7 to 23.\n22', 'horsepower: continuous from 48 to 288.\n23', 'peak-rpm: continuous from 4150 to 6600.\n24', 'city-mpg: continuous from 13 to 49.\n25', 'highway-mpg: continuous from 16 to 54.\n26', 'price: continuous from 5118 to 45400.']
# Extract column names from the list of headers
import re
pat = '[^:]*'  # Matches any run of characters that are not ':', so it stops at the first colon
columns = []
for h in header:
    m = re.search(pat, h)
    if m:
        found = m.group(0)  # If the pattern matches, extract the matched text
        columns.append(found)
columns, len(columns)  # Check the result: all 26 column names plus the stray leading '1'
(['1', 'symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style', 'drive-wheels', 'engine-location', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type', 'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke', 'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price'], 27)
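As a sanity check, a simpler regex-free approach that keeps everything before the first colon should produce the same list (a small sketch, not part of the original workflow):
# Equivalent extraction: everything before the first ':' of each entry
columns_alt = [h.split(':')[0] for h in header]
assert columns_alt == columns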
# Assign column names to the cars DataFrame, skipping the stray '1' left over from the split
cars.columns = columns[1:]
cars.head()
  | symboling | normalized-losses | make | fuel-type | aspiration | num-of-doors | body-style | drive-wheels | engine-location | wheel-base | length | width | height | curb-weight | engine-type | num-of-cylinders | engine-size | fuel-system | bore | stroke | compression-ratio | horsepower | peak-rpm | city-mpg | highway-mpg | price
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 3 | ? | alfa-romero | gas | std | two | convertible | rwd | front | 88.6 | 168.8 | 64.1 | 48.8 | 2548 | dohc | four | 130 | mpfi | 3.47 | 2.68 | 9.0 | 111 | 5000 | 21 | 27 | 13495 |
1 | 3 | ? | alfa-romero | gas | std | two | convertible | rwd | front | 88.6 | 168.8 | 64.1 | 48.8 | 2548 | dohc | four | 130 | mpfi | 3.47 | 2.68 | 9.0 | 111 | 5000 | 21 | 27 | 16500 |
2 | 1 | ? | alfa-romero | gas | std | two | hatchback | rwd | front | 94.5 | 171.2 | 65.5 | 52.4 | 2823 | ohcv | six | 152 | mpfi | 2.68 | 3.47 | 9.0 | 154 | 5000 | 19 | 26 | 16500 |
3 | 2 | 164 | audi | gas | std | four | sedan | fwd | front | 99.8 | 176.6 | 66.2 | 54.3 | 2337 | ohc | four | 109 | mpfi | 3.19 | 3.40 | 10.0 | 102 | 5500 | 24 | 30 | 13950 |
4 | 2 | 164 | audi | gas | std | four | sedan | 4wd | front | 99.4 | 176.6 | 66.4 | 54.3 | 2824 | ohc | five | 136 | mpfi | 3.19 | 3.40 | 8.0 | 115 | 5500 | 18 | 22 | 17450 |
cars.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   symboling          205 non-null    int64
 1   normalized-losses  205 non-null    object
 2   make               205 non-null    object
 3   fuel-type          205 non-null    object
 4   aspiration         205 non-null    object
 5   num-of-doors       205 non-null    object
 6   body-style         205 non-null    object
 7   drive-wheels       205 non-null    object
 8   engine-location    205 non-null    object
 9   wheel-base         205 non-null    float64
 10  length             205 non-null    float64
 11  width              205 non-null    float64
 12  height             205 non-null    float64
 13  curb-weight        205 non-null    int64
 14  engine-type        205 non-null    object
 15  num-of-cylinders   205 non-null    object
 16  engine-size        205 non-null    int64
 17  fuel-system        205 non-null    object
 18  bore               205 non-null    object
 19  stroke             205 non-null    object
 20  compression-ratio  205 non-null    float64
 21  horsepower         205 non-null    object
 22  peak-rpm           205 non-null    object
 23  city-mpg           205 non-null    int64
 24  highway-mpg        205 non-null    int64
 25  price              205 non-null    object
dtypes: float64(5), int64(5), object(16)
memory usage: 41.8+ KB
pd.options.display.max_columns = 26
cars.describe(include = 'all')
  | symboling | normalized-losses | make | fuel-type | aspiration | num-of-doors | body-style | drive-wheels | engine-location | wheel-base | length | width | height | curb-weight | engine-type | num-of-cylinders | engine-size | fuel-system | bore | stroke | compression-ratio | horsepower | peak-rpm | city-mpg | highway-mpg | price
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
count | 205.000000 | 205 | 205 | 205 | 205 | 205 | 205 | 205 | 205 | 205.000000 | 205.000000 | 205.000000 | 205.000000 | 205.000000 | 205 | 205 | 205.000000 | 205 | 205 | 205 | 205.000000 | 205 | 205 | 205.000000 | 205.000000 | 205 |
unique | NaN | 52 | 22 | 2 | 2 | 3 | 5 | 3 | 2 | NaN | NaN | NaN | NaN | NaN | 7 | 7 | NaN | 8 | 39 | 37 | NaN | 60 | 24 | NaN | NaN | 187 |
top | NaN | ? | toyota | gas | std | four | sedan | fwd | front | NaN | NaN | NaN | NaN | NaN | ohc | four | NaN | mpfi | 3.62 | 3.40 | NaN | 68 | 5500 | NaN | NaN | ? |
freq | NaN | 41 | 32 | 185 | 168 | 114 | 96 | 120 | 202 | NaN | NaN | NaN | NaN | NaN | 148 | 159 | NaN | 94 | 23 | 20 | NaN | 19 | 37 | NaN | NaN | 4 |
mean | 0.834146 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 98.756585 | 174.049268 | 65.907805 | 53.724878 | 2555.565854 | NaN | NaN | 126.907317 | NaN | NaN | NaN | 10.142537 | NaN | NaN | 25.219512 | 30.751220 | NaN |
std | 1.245307 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 6.021776 | 12.337289 | 2.145204 | 2.443522 | 520.680204 | NaN | NaN | 41.642693 | NaN | NaN | NaN | 3.972040 | NaN | NaN | 6.542142 | 6.886443 | NaN |
min | -2.000000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 86.600000 | 141.100000 | 60.300000 | 47.800000 | 1488.000000 | NaN | NaN | 61.000000 | NaN | NaN | NaN | 7.000000 | NaN | NaN | 13.000000 | 16.000000 | NaN |
25% | 0.000000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 94.500000 | 166.300000 | 64.100000 | 52.000000 | 2145.000000 | NaN | NaN | 97.000000 | NaN | NaN | NaN | 8.600000 | NaN | NaN | 19.000000 | 25.000000 | NaN |
50% | 1.000000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 97.000000 | 173.200000 | 65.500000 | 54.100000 | 2414.000000 | NaN | NaN | 120.000000 | NaN | NaN | NaN | 9.000000 | NaN | NaN | 24.000000 | 30.000000 | NaN |
75% | 2.000000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 102.400000 | 183.100000 | 66.900000 | 55.500000 | 2935.000000 | NaN | NaN | 141.000000 | NaN | NaN | NaN | 9.400000 | NaN | NaN | 30.000000 | 34.000000 | NaN |
max | 3.000000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 120.900000 | 208.100000 | 72.300000 | 59.800000 | 4066.000000 | NaN | NaN | 326.000000 | NaN | NaN | NaN | 23.000000 | NaN | NaN | 49.000000 | 54.000000 | NaN |
cars.columns
Index(['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style', 'drive-wheels', 'engine-location', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type', 'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke', 'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price'], dtype='object')
After exploring the dataset, we can identify the numeric columns that can serve as features.
The 'num-of-doors' and 'num-of-cylinders' columns are stored as words but can be converted to numbers.
The 'price' column will be our target.
# Keep only selected features and target columns
cars_selected = cars[['symboling', 'normalized-losses', 'num-of-doors', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'num-of-cylinders', 'engine-size', 'bore', 'stroke', 'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price']].copy()
From the exploration above, we can see that missing values are marked with '?' (for example in the 'normalized-losses' and 'price' columns). Let's replace every '?' with a null value.
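Before replacing them, a quick count shows how many markers each selected column contains (a short sanity-check sketch):
# Count '?' markers per column; purely numeric columns simply report 0
(cars_selected == '?').sum()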
import numpy as np
cars_selected = cars_selected.replace('?', np.nan)
# Convert strings in columns 'num-of-doors', 'num-of-cylinders' to numerical values
cars_selected['num-of-doors'] = cars_selected['num-of-doors'].map({'two':2, 'four':4})
cars_selected['num-of-cylinders'] = cars_selected['num-of-cylinders'].map(
    {'eight': 8, 'five': 5, 'four': 4, 'six': 6, 'three': 3, 'twelve': 12, 'two': 2})
# Convert all columns in the dataframe to type float
cars_selected = cars_selected.astype(float)
# Check for missing values in the dataframe
cars_selected.isnull().sum()
symboling             0
normalized-losses    41
num-of-doors          2
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
num-of-cylinders      0
engine-size           0
bore                  4
stroke                4
compression-ratio     0
horsepower            2
peak-rpm              2
city-mpg              0
highway-mpg           0
price                 4
dtype: int64
Since we are predicting car prices and there are only 4 rows with a missing price, we should drop those rows. There are also 2 missing values in the 'num-of-doors' column; let's look those rows up in the original dataframe and check the make and body style, which may let us figure out the number of doors.
For the other columns, we can fill the missing values with their column means.
# Drop rows with missing price
cars_selected.dropna(subset = ['price'], inplace = True)
# Look at the rows with a missing num-of-doors value in the original dataframe
idx = cars_selected[cars_selected['num-of-doors'].isnull()].index
cars.loc[idx]  # Select by label; cars still has its original RangeIndex, so the labels line up
  | symboling | normalized-losses | make | fuel-type | aspiration | num-of-doors | body-style | drive-wheels | engine-location | wheel-base | length | width | height | curb-weight | engine-type | num-of-cylinders | engine-size | fuel-system | bore | stroke | compression-ratio | horsepower | peak-rpm | city-mpg | highway-mpg | price
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
27 | 1 | 148 | dodge | gas | turbo | ? | sedan | fwd | front | 93.7 | 157.3 | 63.8 | 50.6 | 2191 | ohc | four | 98 | mpfi | 3.03 | 3.39 | 7.6 | 102 | 5500 | 24 | 30 | 8558 |
63 | 0 | ? | mazda | diesel | std | ? | sedan | fwd | front | 98.8 | 177.8 | 66.5 | 55.5 | 2443 | ohc | four | 122 | idi | 3.39 | 3.39 | 22.7 | 64 | 4650 | 36 | 42 | 10795 |
We can see that the cars with missing door counts are a Dodge sedan and a Mazda sedan. A little googling confirms that both models have four doors.
# Assign door number values to rows with the missing values
cars_selected.loc[idx, 'num-of-doors'] = 4
# Fill the remaining missing values with their column means
cars_selected = cars_selected.fillna(cars_selected.mean())
# Make sure there are no null values anymore
cars_selected.isnull().sum()
symboling            0
normalized-losses    0
num-of-doors         0
wheel-base           0
length               0
width                0
height               0
curb-weight          0
num-of-cylinders     0
engine-size          0
bore                 0
stroke               0
compression-ratio    0
horsepower           0
peak-rpm             0
city-mpg             0
highway-mpg          0
price                0
dtype: int64
Next, we will standardize the feature columns so that each has zero mean and unit variance; otherwise, features measured on larger scales would dominate the distance computation in k-nearest neighbors.
# Standardize features (z-score): subtract the column mean, divide by the standard deviation
cars_features = cars_selected.drop('price', axis = 1)
cars_features = (cars_features - cars_features.mean()) / np.std(cars_features)
cars_features.head()
  | symboling | normalized-losses | num-of-doors | wheel-base | length | width | height | curb-weight | num-of-cylinders | engine-size | bore | stroke | compression-ratio | horsepower | peak-rpm | city-mpg | highway-mpg
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 1.725050 | 0.000000 | -1.156378 | -1.685107 | -0.439409 | -0.853460 | -2.034081 | -0.014858 | -0.343660 | 0.075389 | 0.520894 | -1.829927 | -0.291435 | 0.203984 | -0.246556 | -0.652249 | -0.542288 |
1 | 1.725050 | 0.000000 | -1.156378 | -1.685107 | -0.439409 | -0.853460 | -2.034081 | -0.014858 | -0.343660 | 0.075389 | 0.520894 | -1.829927 | -0.291435 | 0.203984 | -0.246556 | -0.652249 | -0.542288 |
2 | 0.127193 | 0.000000 | -1.156378 | -0.710103 | -0.244152 | -0.185597 | -0.559713 | 0.518080 | 1.548823 | 0.606234 | -2.433435 | 0.675938 | -0.291435 | 1.357649 | -0.246556 | -0.964397 | -0.689386 |
3 | 0.926121 | 1.315931 | 0.864769 | 0.165748 | 0.195176 | 0.148335 | 0.218425 | -0.423766 | -0.343660 | -0.431327 | -0.526210 | 0.453899 | -0.041121 | -0.037480 | 0.801833 | -0.184027 | -0.100993 |
4 | 0.926121 | 1.315931 | 0.864769 | 0.099646 | 0.195176 | 0.243744 | 0.218425 | 0.520017 | 0.602582 | 0.220165 | -0.526210 | 0.453899 | -0.541748 | 0.311302 | 0.801833 | -1.120471 | -1.277779 |
cars_clean = pd.concat([cars_features, cars_selected.price], axis = 1)
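As a quick check on the standardization, every feature column should now have a mean of roughly zero and a population standard deviation of one (a small verification sketch):
# Means should be ~0 (up to floating-point error) and standard deviations exactly 1
print(cars_features.mean().abs().max())
print(cars_features.std(ddof = 0).round(6).unique())  # [1.]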
First, we will create a function named knn_train_test() that encapsulates the training and simple holdout-validation process.
# Import the model & validation helpers from scikit-learn
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
# Training & validation function
def knn_train_test(feature_col, target_col, df):
    # Hold out 20% of the rows for validation; fix random_state for reproducibility
    train, test = train_test_split(df, train_size = 0.8, test_size = 0.2, random_state = 1)
    model = KNeighborsRegressor()
    model.fit(train[feature_col], train[target_col])
    predictions = model.predict(test[feature_col])
    mse = mean_squared_error(test[target_col], predictions)
    rmse = np.sqrt(mse)  # MSE is never negative, so no abs() is needed
    return rmse
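For example, a single call trains one univariate model and returns its holdout RMSE (a usage sketch; the expected value comes from the loop output below):
# Univariate model on 'horsepower' with scikit-learn's default k=5
knn_train_test(['horsepower'], 'price', cars_clean)  # ~3089.57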
Next, we use this function to train and test a univariate model for each numeric column in the dataset.
# Train one univariate model per feature column and record its RMSE
rmses = {}
feature_cols = cars_features.columns
for col in feature_cols:
    rmses[col] = knn_train_test([col], 'price', cars_clean)
rmses
{'symboling': 7836.647600451262, 'normalized-losses': 6497.942733571948, 'num-of-doors': 7799.557801536713, 'wheel-base': 5209.889397262414, 'length': 3680.2602213138903, 'width': 3497.269895915677, 'height': 6949.164753038889, 'curb-weight': 3298.8369422733394, 'num-of-cylinders': 5193.949944629528, 'engine-size': 3382.540534323493, 'bore': 6369.594598042691, 'stroke': 6045.587048175981, 'compression-ratio': 7135.188143689951, 'horsepower': 3089.5711228141076, 'peak-rpm': 6208.025557214177, 'city-mpg': 3654.3038536271374, 'highway-mpg': 4009.908052105617}
# Get the key of the minimum value in the rmses dictionary
min(rmses, key=rmses.get)
'horsepower'
We can see from the RMSE values that the horsepower column performed best with the default k=5.
Next, we will modify the knn_train_test() function we wrote to accept a parameter for the k value.
# Modify knn_train_test() to accept a parameter for the k value
def knn_train_test(feature_col, target_col, df, k):
    train, test = train_test_split(df, train_size = 0.8, test_size = 0.2, random_state = 1)
    model = KNeighborsRegressor(n_neighbors = k)
    model.fit(train[feature_col], train[target_col])
    predictions = model.predict(test[feature_col])
    mse = mean_squared_error(test[target_col], predictions)
    rmse = np.sqrt(mse)
    return rmse
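The call looks the same as before, with the k value appended (a usage sketch; the expected value comes from the table below):
# Univariate model on 'horsepower', now with k=1 instead of the default 5
knn_train_test(['horsepower'], 'price', cars_clean, 1)  # ~3337.03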
Next, for each numeric column, we will create, train, and test a univariate model using the following k values (1, 3, 5, 7, and 9).
# List of k values to try: 1, 3, 5, 7, 9
k_values = range(1, 10, 2)
# Create a dataframe to store the results
univariate_k_rmse = pd.DataFrame(data = 0, index = range(len(k_values)), columns = feature_cols)
univariate_k_rmse['k_values'] = k_values
for col in feature_cols:
    for n in k_values:
        univariate_k_rmse.loc[univariate_k_rmse.k_values == n, col] = knn_train_test([col], 'price', cars_clean, n)
univariate_k_rmse
  | symboling | normalized-losses | num-of-doors | wheel-base | length | width | height | curb-weight | num-of-cylinders | engine-size | bore | stroke | compression-ratio | horsepower | peak-rpm | city-mpg | highway-mpg | k_values
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 8258.989783 | 8995.791322 | 10116.487225 | 3645.964087 | 4263.596095 | 2693.358408 | 7240.737771 | 4653.181458 | 6055.146483 | 3714.054228 | 11903.649134 | 6789.369646 | 8655.748861 | 3337.032938 | 6134.101163 | 3587.858686 | 4560.978663 | 1 |
1 | 7668.881921 | 6968.008202 | 7882.924284 | 4484.303173 | 4343.596243 | 3043.840925 | 7353.395464 | 3605.752663 | 4761.428050 | 3231.245093 | 8203.990749 | 5775.936419 | 6153.628320 | 2842.623602 | 5906.222247 | 3780.269293 | 3422.217502 | 3 |
2 | 7836.647600 | 6497.942734 | 7799.557802 | 5209.889397 | 3680.260221 | 3497.269896 | 6949.164753 | 3298.836942 | 5193.949945 | 3382.540534 | 6369.594598 | 6045.587048 | 7135.188144 | 3089.571123 | 6208.025557 | 3654.303854 | 4009.908052 | 5 |
3 | 6772.504429 | 7147.670924 | 8169.327266 | 4815.355890 | 3607.974629 | 4174.658870 | 6774.763764 | 3050.303391 | 4692.004353 | 3635.696170 | 6096.932125 | 6314.676349 | 6783.971660 | 3166.497053 | 6824.139775 | 3657.336525 | 3625.796908 | 7 |
4 | 6749.572221 | 6958.498001 | 7725.307668 | 4837.831088 | 3729.111331 | 3652.480414 | 6699.551811 | 2749.952443 | 4721.111597 | 3704.126061 | 6283.913801 | 6447.946926 | 6375.728573 | 3597.161999 | 7323.484184 | 3802.811718 | 3462.182438 | 9 |
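For reference, the single best feature/k combination can be read straight off this table (a small sketch using the dataframe above):
# Column whose minimum RMSE across k values is lowest, and the k that achieves it
best_feature = univariate_k_rmse[feature_cols].min().idxmin()
best_k = univariate_k_rmse.loc[univariate_k_rmse[best_feature].idxmin(), 'k_values']
best_feature, best_k  # ('width', 1) per the table above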
Let's visualize the results to get a better sense of how the feature columns and k values influence the RMSE.
# Visualize with a plotly bar graph and slider
import plotly.graph_objects as go
import plotly.offline as pyo
pyo.init_notebook_mode()
# Initialize a set of colors
colors = ['#30336b',
          '#4834d4', '#686de0',
          '#22a6b3', '#7ed6df']
# Create figure
fig = go.Figure()
i = 0
# Add traces, one for each slider step
for step in np.arange(1, 10, 2):
    fig.add_trace(
        go.Bar(
            visible=False,
            name="k-value = " + str(step),
            x=feature_cols,
            y=univariate_k_rmse.loc[i, feature_cols],
            marker=dict(
                color=colors[i]
            )))
    i += 1
# Make the first trace visible
fig.data[0].visible = True
# Create and add the slider
steps = []
for i in range(len(fig.data)):
    step = dict(
        method="update",
        args=[{"visible": [False] * len(fig.data)},
              {"title": "Slider switched to K-Value: " + str(k_values[i])}],
        label=str(k_values[i])  # layout attribute
    )
    step["args"][0]["visible"][i] = True  # Toggle the i'th trace to "visible"
    steps.append(step)
sliders = [dict(
    active=0,
    currentvalue={"prefix": "K-Value: "},
    pad={"t": 50},
    steps=steps
)]
fig.update_layout(
    sliders=sliders,
    yaxis=dict(range=[0, 1.2e4])
)
fig.show()
*The knn_train_test() function from the last step already accepts a list of columns, so we can use it for multivariate models in this section.*
First, we will train models on the best 1 through 7 features from the previous step, ranked by mean RMSE, with k fixed at the default value of 5.
# Rank features by their mean RMSE across the k values from the previous step
best_five = univariate_k_rmse[feature_cols].mean().sort_values().index[:5]
best_eight = univariate_k_rmse[feature_cols].mean().sort_values().index[:8]
best_eight
Index(['horsepower', 'width', 'curb-weight', 'engine-size', 'city-mpg', 'highway-mpg', 'length', 'wheel-base'], dtype='object')
# Train and test multivariate models on the best 1 through 7 features with the default k
for i in range(7):
    features = list(best_eight[:i + 1])
    rmse = knn_train_test(features, 'price', cars_clean, 5)
    print('RMSE from default k and feature columns', features, 'is:', rmse)
RMSE from default k and feature columns ['horsepower'] is: 3089.5711228141076
RMSE from default k and feature columns ['horsepower', 'width'] is: 2513.780568648957
RMSE from default k and feature columns ['horsepower', 'width', 'curb-weight'] is: 2332.5805193014003
RMSE from default k and feature columns ['horsepower', 'width', 'curb-weight', 'engine-size'] is: 2154.537920467335
RMSE from default k and feature columns ['horsepower', 'width', 'curb-weight', 'engine-size', 'city-mpg'] is: 2124.816029683511
RMSE from default k and feature columns ['horsepower', 'width', 'curb-weight', 'engine-size', 'city-mpg', 'highway-mpg'] is: 2531.65145387402
RMSE from default k and feature columns ['horsepower', 'width', 'curb-weight', 'engine-size', 'city-mpg', 'highway-mpg', 'length'] is: 2219.2931990646557
Let's now take three of these multivariate models (built from the best 2, 3, and 4 features) and tune them over a range of k values.
# List of k values from 1 to 25
multi_k = range(1, 26)
# Dataframe to store the results
models = ['2_best_features', '3_best_features', '4_best_features']
multivariate_k_rmse = pd.DataFrame(data = 0, columns = models, index = range(len(multi_k)))
multivariate_k_rmse['k_values'] = multi_k
# Fit each model (best 2, 3, and 4 features) for every k value
for i in range(3):
    for n in multi_k:
        rmse = knn_train_test(list(best_five[:i + 2]), 'price', cars_clean, n)
        multivariate_k_rmse.loc[multivariate_k_rmse.k_values == n, models[i]] = rmse
multivariate_k_rmse
  | 2_best_features | 3_best_features | 4_best_features | k_values
---|---|---|---|---
0 | 3517.353984 | 2594.354556 | 1574.476289 | 1 |
1 | 3346.549908 | 1997.573494 | 1833.899833 | 2 |
2 | 2871.387166 | 2366.558834 | 2070.619683 | 3 |
3 | 2768.578544 | 2262.544261 | 1984.776493 | 4 |
4 | 2513.780569 | 2332.580519 | 2154.537920 | 5 |
5 | 2509.702498 | 2342.225881 | 2323.359547 | 6 |
6 | 2705.569230 | 2326.922493 | 2334.653131 | 7 |
7 | 2815.604530 | 2156.943969 | 2571.886333 | 8 |
8 | 2860.156889 | 2198.884918 | 2668.581346 | 9 |
9 | 2948.031106 | 2192.460243 | 2770.550951 | 10 |
10 | 3011.056938 | 2216.034270 | 2781.717352 | 11 |
11 | 2978.508997 | 2299.147739 | 2888.216214 | 12 |
12 | 2866.344422 | 2404.647893 | 3009.064161 | 13 |
13 | 2761.342904 | 2467.923574 | 3083.214543 | 14 |
14 | 2822.926437 | 2552.949496 | 3176.558768 | 15 |
15 | 2909.767037 | 2741.170002 | 3241.015530 | 16 |
16 | 3067.722573 | 2845.815408 | 3181.600411 | 17 |
17 | 3177.587386 | 2892.138078 | 3223.500825 | 18 |
18 | 3184.740400 | 2992.402606 | 3339.882703 | 19 |
19 | 3242.551053 | 3053.509345 | 3411.867303 | 20 |
20 | 3302.079503 | 3134.606971 | 3400.589978 | 21 |
21 | 3339.894333 | 3156.278021 | 3460.414819 | 22 |
22 | 3476.333512 | 3231.840893 | 3451.310640 | 23 |
23 | 3561.803102 | 3288.584844 | 3435.873065 | 24 |
24 | 3605.410364 | 3370.702134 | 3496.670729 | 25 |
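The best k for each model can also be pulled from this table directly (a small sketch):
# Lowest RMSE achieved by each model and the k value that achieves it
for m in models:
    best_row = multivariate_k_rmse.loc[multivariate_k_rmse[m].idxmin()]
    print(m, '-> k =', int(best_row['k_values']), ', RMSE =', round(best_row[m], 1))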
Now let's visualize the results!
# Visualize with a plotly line graph and slider
# Initialize a set of colors
colors = ['#d54062', '#ffa36c', '#799351']
# Create figure
fig = go.Figure()
i = 0
# Add traces, one for each slider step
for step in models:
    fig.add_trace(
        go.Scatter(
            visible=False,
            name="Number of features = " + str(step),
            x=multivariate_k_rmse.k_values,
            y=multivariate_k_rmse[step],
            marker=dict(
                color=colors[i]
            )))
    i += 1
# Make the first trace visible
fig.data[0].visible = True
# Create and add the slider
steps = []
for i in range(len(fig.data)):
    step = dict(
        method="update",
        args=[{"visible": [False] * len(fig.data)},
              {"title": "Slider switched to: " + str(models[i])}],
        label=str(models[i])  # layout attribute
    )
    step["args"][0]["visible"][i] = True  # Toggle the i'th trace to "visible"
    steps.append(step)
sliders = [dict(
    active=0,
    currentvalue={"prefix": "Model: "},
    pad={"t": 50},
    steps=steps
)]
fig.update_layout(
    sliders=sliders,
    yaxis=dict(range=[1500, 3600]),
    xaxis=dict(range=[1, 25],
               nticks=25)
)
fig.show()
In this project, we experimented with k-nearest neighbors regression and tuned the model with different features and k values. Here's what we found:
Among the univariate models with the default k=5, horsepower produced the lowest RMSE (about 3090).
Combining the top-ranked features helped: with the default k, the best 5 features brought the RMSE down to about 2125.
Tuning k helped further; the lowest RMSE overall (about 1574) came from the model built on the best 4 features with k=1.