This is my first attempt at a machine learning model. In this notebook I develop a simple ML model and tune it with different columns, k-values, etc. Being me, I couldn't obediently follow all the lesson's steps. No, instead I've decided to fiddle with a few more knobs and combinations:
Links: Dataset | My GitHub | My LinkedIn
Imports:
import pandas as pd
import numpy as np
import random
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
from numpy import arange
from matplotlib.patches import ConnectionPatch
from matplotlib.patches import Rectangle
from collections import Counter
from itertools import combinations
import matplotlib.image as mpimg
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score, KFold
cols = ['symboling', 'normalized_losses', 'make', 'fuel_type', 'aspiration', 'num_doors', 'body_style',
'drive_wheels', 'engine_location', 'wheel_base', 'length', 'width', 'height', 'curb_weight', 'engine_type',
'num_cylinders', 'engine_size', 'fuel_system', 'bore', 'stroke', 'compression_rate', 'horsepower', 'peak_rpm', 'city_mpg', 'highway_mpg', 'price']
df = pd.read_csv('imports-85.data', names=cols)
df.head()
| | symboling | normalized_losses | make | fuel_type | aspiration | num_doors | body_style | drive_wheels | engine_location | wheel_base | ... | engine_size | fuel_system | bore | stroke | compression_rate | horsepower | peak_rpm | city_mpg | highway_mpg | price |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3 | ? | alfa-romero | gas | std | two | convertible | rwd | front | 88.6 | ... | 130 | mpfi | 3.47 | 2.68 | 9.0 | 111 | 5000 | 21 | 27 | 13495 |
1 | 3 | ? | alfa-romero | gas | std | two | convertible | rwd | front | 88.6 | ... | 130 | mpfi | 3.47 | 2.68 | 9.0 | 111 | 5000 | 21 | 27 | 16500 |
2 | 1 | ? | alfa-romero | gas | std | two | hatchback | rwd | front | 94.5 | ... | 152 | mpfi | 2.68 | 3.47 | 9.0 | 154 | 5000 | 19 | 26 | 16500 |
3 | 2 | 164 | audi | gas | std | four | sedan | fwd | front | 99.8 | ... | 109 | mpfi | 3.19 | 3.40 | 10.0 | 102 | 5500 | 24 | 30 | 13950 |
4 | 2 | 164 | audi | gas | std | four | sedan | 4wd | front | 99.4 | ... | 136 | mpfi | 3.19 | 3.40 | 8.0 | 115 | 5500 | 18 | 22 | 17450 |
5 rows × 26 columns
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   symboling          205 non-null    int64
 1   normalized_losses  205 non-null    object
 2   make               205 non-null    object
 3   fuel_type          205 non-null    object
 4   aspiration         205 non-null    object
 5   num_doors          205 non-null    object
 6   body_style         205 non-null    object
 7   drive_wheels       205 non-null    object
 8   engine_location    205 non-null    object
 9   wheel_base         205 non-null    float64
 10  length             205 non-null    float64
 11  width              205 non-null    float64
 12  height             205 non-null    float64
 13  curb_weight        205 non-null    int64
 14  engine_type        205 non-null    object
 15  num_cylinders      205 non-null    object
 16  engine_size        205 non-null    int64
 17  fuel_system        205 non-null    object
 18  bore               205 non-null    object
 19  stroke             205 non-null    object
 20  compression_rate   205 non-null    float64
 21  horsepower         205 non-null    object
 22  peak_rpm           205 non-null    object
 23  city_mpg           205 non-null    int64
 24  highway_mpg        205 non-null    int64
 25  price              205 non-null    object
dtypes: float64(5), int64(5), object(16)
memory usage: 41.8+ KB
df.describe(include='all')
| | symboling | normalized_losses | make | fuel_type | aspiration | num_doors | body_style | drive_wheels | engine_location | wheel_base | ... | engine_size | fuel_system | bore | stroke | compression_rate | horsepower | peak_rpm | city_mpg | highway_mpg | price |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 205.000000 | 205 | 205 | 205 | 205 | 205 | 205 | 205 | 205 | 205.000000 | ... | 205.000000 | 205 | 205 | 205 | 205.000000 | 205 | 205 | 205.000000 | 205.000000 | 205 |
unique | NaN | 52 | 22 | 2 | 2 | 3 | 5 | 3 | 2 | NaN | ... | NaN | 8 | 39 | 37 | NaN | 60 | 24 | NaN | NaN | 187 |
top | NaN | ? | toyota | gas | std | four | sedan | fwd | front | NaN | ... | NaN | mpfi | 3.62 | 3.40 | NaN | 68 | 5500 | NaN | NaN | ? |
freq | NaN | 41 | 32 | 185 | 168 | 114 | 96 | 120 | 202 | NaN | ... | NaN | 94 | 23 | 20 | NaN | 19 | 37 | NaN | NaN | 4 |
mean | 0.834146 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 98.756585 | ... | 126.907317 | NaN | NaN | NaN | 10.142537 | NaN | NaN | 25.219512 | 30.751220 | NaN |
std | 1.245307 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 6.021776 | ... | 41.642693 | NaN | NaN | NaN | 3.972040 | NaN | NaN | 6.542142 | 6.886443 | NaN |
min | -2.000000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 86.600000 | ... | 61.000000 | NaN | NaN | NaN | 7.000000 | NaN | NaN | 13.000000 | 16.000000 | NaN |
25% | 0.000000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 94.500000 | ... | 97.000000 | NaN | NaN | NaN | 8.600000 | NaN | NaN | 19.000000 | 25.000000 | NaN |
50% | 1.000000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 97.000000 | ... | 120.000000 | NaN | NaN | NaN | 9.000000 | NaN | NaN | 24.000000 | 30.000000 | NaN |
75% | 2.000000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 102.400000 | ... | 141.000000 | NaN | NaN | NaN | 9.400000 | NaN | NaN | 30.000000 | 34.000000 | NaN |
max | 3.000000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 120.900000 | ... | 326.000000 | NaN | NaN | NaN | 23.000000 | NaN | NaN | 49.000000 | 54.000000 | NaN |
11 rows × 26 columns
We have some missing values - instead of applying one method to a single dataframe, we'll create 6 different dataframes:

  * numeric_a - drop the rows with a missing price, fill the remaining gaps with column means
  * numeric_b - drop the rows missing any of bore, stroke, horsepower, peak_rpm or price, fill the rest with column means
  * numeric_c - drop the rows missing bore, stroke, horsepower or peak_rpm, then fill the missing prices more carefully (details below)

We'll also create normalized versions of the above dataframes: numeric_a_n, numeric_b_n, numeric_c_n
numeric_cols = ['normalized_losses', 'price', 'bore', 'stroke', 'horsepower', 'peak_rpm']
df[numeric_cols] = df[numeric_cols].replace('?', np.nan)
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric)
df[df.columns[df.isna().any()]].isna().sum()
normalized_losses    41
bore                  4
stroke                4
horsepower            2
peak_rpm              2
price                 4
dtype: int64
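As a side note, the same cleanup can be done in a single step at load time via read_csv's na_values parameter - an alternative to the replace-and-convert route above, not what this notebook does:

# alternative: have pandas treat '?' as NaN while reading the file
df = pd.read_csv('imports-85.data', names=cols, na_values='?')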
numeric = df.loc[:, df.dtypes != object].copy()
numeric_a = numeric.dropna(subset=['price']).copy()
numeric_a = numeric_a.fillna(numeric.mean())
numeric_b = numeric.dropna(subset=['bore', 'stroke', 'horsepower', 'peak_rpm', 'price']).copy()
numeric_b = numeric_b.fillna(numeric_b.mean())
corr_df = numeric_a.corr()['price'].to_frame().merge(numeric_b.corr()['price'],
left_index=True, right_index=True, suffixes=['_A_corr', '_B_corr'])
corr_df.sort_values('price_A_corr')
| | price_A_corr | price_B_corr |
---|---|---|
highway_mpg | -0.704692 | -0.715590 |
city_mpg | -0.686571 | -0.702685 |
peak_rpm | -0.101616 | -0.104333 |
symboling | -0.082391 | -0.084118 |
compression_rate | 0.071107 | 0.069500 |
stroke | 0.082269 | 0.093746 |
normalized_losses | 0.133999 | 0.131736 |
height | 0.135486 | 0.138291 |
bore | 0.543155 | 0.546873 |
wheel_base | 0.584642 | 0.585793 |
length | 0.690628 | 0.695331 |
width | 0.751265 | 0.754273 |
horsepower | 0.809575 | 0.811027 |
curb_weight | 0.834415 | 0.835729 |
engine_size | 0.872335 | 0.888942 |
price | 1.000000 | 1.000000 |
How to fill in the missing price values for numeric_c?
In the numeric_a and numeric_b dataframes we filled in the missing values with the column mean - a very simplistic approach that we're going to modify and upgrade for the numeric_c dataframe. Our plan is to be much more thorough when filling in the missing price values in this last dataframe, which is why we created a correlation dataframe for the 'price' column: engine_size correlates with price most strongly, so we'll use it to impute the missing prices.
numeric_c = numeric.dropna(subset=['bore', 'stroke', 'horsepower', 'peak_rpm']).copy()
numeric_c[numeric_c['price'].isnull()]
| | symboling | normalized_losses | wheel_base | length | width | height | curb_weight | engine_size | bore | stroke | compression_rate | horsepower | peak_rpm | city_mpg | highway_mpg | price |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
9 | 0 | NaN | 99.5 | 178.2 | 67.9 | 52.0 | 3053 | 131 | 3.13 | 3.40 | 7.0 | 160.0 | 5500.0 | 16 | 22 | NaN |
44 | 1 | NaN | 94.5 | 155.9 | 63.6 | 52.0 | 1874 | 90 | 3.03 | 3.11 | 9.6 | 70.0 | 5400.0 | 38 | 43 | NaN |
45 | 0 | NaN | 94.5 | 155.9 | 63.6 | 52.0 | 1909 | 90 | 3.03 | 3.11 | 9.6 | 70.0 | 5400.0 | 38 | 43 | NaN |
129 | 1 | NaN | 98.4 | 175.7 | 72.3 | 50.5 | 3366 | 203 | 3.94 | 3.11 | 10.0 | 288.0 | 5750.0 | 17 | 28 | NaN |
# split engine_size into 20 equal-width bins and compute the mean price per bin
bins = numeric_c['engine_size'].value_counts(bins=20).index
numeric_c.groupby(pd.cut(numeric_c.engine_size, bins=bins))['price'].mean()
engine_size
(87.5, 100.75]      7265.948276
(100.75, 114.0]     9754.628571
(114.0, 127.25]    11619.034483
(140.5, 153.75]    15516.047619
(127.25, 140.5]    17510.142857
(180.25, 193.5]    21027.000000
(153.75, 167.0]    16678.333333
(193.5, 206.75]    34528.000000
(167.0, 180.25]    17432.750000
(206.75, 220.0]    36318.333333
(233.25, 246.5]    34620.000000
(299.5, 312.75]    43180.000000
(246.5, 259.75]    33900.000000
(60.734, 74.25]     5151.000000
(74.25, 87.5]       5399.000000
(312.75, 326.0]    36000.000000
(220.0, 233.25]             NaN
(259.75, 273.0]             NaN
(273.0, 286.25]             NaN
(286.25, 299.5]             NaN
Name: price, dtype: float64
# fill the missing prices by hand, based on the engine_size bin means above
numeric_c.loc[9, 'price'] = 16520.750000
numeric_c.loc[44, 'price'] = 7265.948276
numeric_c.loc[45, 'price'] = 7265.948276
numeric_c.loc[129, 'price'] = 34528.000000
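Hard-coding four values works, but the same idea can be automated. A minimal sketch (price_from_bin is my own helper name; it keeps existing prices, uses the mean price of the row's engine_size bin, and falls back to the overall mean when the bin has no priced cars):

bin_means = numeric_c.groupby(pd.cut(numeric_c.engine_size, bins=bins))['price'].mean()

def price_from_bin(row):
    # keep existing prices untouched
    if pd.notnull(row['price']):
        return row['price']
    # otherwise use the mean price of the engine_size bin the row falls into
    for interval, mean_price in bin_means.items():
        if row['engine_size'] in interval and pd.notnull(mean_price):
            return mean_price
    return numeric_c['price'].mean()

# numeric_c['price'] = numeric_c.apply(price_from_bin, axis=1)  # would replace the manual fills above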
numeric_c[numeric_c.columns[numeric_c.isna().any()]].isna().sum()
normalized_losses    39
dtype: int64
numeric_c = numeric_c.fillna(numeric_c.mean())
Now that we have 3 different dataframes, let's create normalized versions of them. We'll add "_n" at the end of their names, for 'normalized'.
price_col = numeric_a['price']
numeric_a_n = (numeric_a - numeric_a.min()) / (numeric_a.max() - numeric_a.min())
numeric_a_n['price'] = price_col
price_col = numeric_b['price']
numeric_b_n = (numeric_b - numeric_b.min()) / (numeric_b.max() - numeric_b.min())
numeric_b_n['price'] = price_col
price_col = numeric_c['price']
numeric_c_n = (numeric_c - numeric_c.min()) / (numeric_c.max() - numeric_c.min())
numeric_c_n['price'] = price_col
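The same min-max scaling is written out three times above; a small helper would do it in one place (a sketch - minmax_normalize is my name for it, not something from the lesson):

def minmax_normalize(df, keep='price'):
    # min-max scale every column, then restore the target column unscaled
    normalized = (df - df.min()) / (df.max() - df.min())
    normalized[keep] = df[keep]
    return normalized

numeric_a_n = minmax_normalize(numeric_a)
numeric_b_n = minmax_normalize(numeric_b)
numeric_c_n = minmax_normalize(numeric_c)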
In the following steps we'll create a model that uses only one column from the dataframe to predict the price. We'll test out different dataframes and columns and check their performance.
def knn_train_test(df, feature_col, target_col, r=1):
    # randomize
    np.random.seed(r)
    shuffled_index = np.random.permutation(df.index)
    df = df.reindex(index=shuffled_index)
    split_loc = int(0.5 * len(df))
    # 50/50 train/test split
    train_set = df.iloc[:split_loc].copy()
    test_set = df.iloc[split_loc:].copy()
    # train and predict (default k=5 neighbors)
    knn = KNeighborsRegressor()
    knn.fit(train_set[[feature_col]], train_set[target_col])
    predictions = knn.predict(test_set[[feature_col]])
    rmse = np.sqrt(mean_squared_error(test_set[target_col], predictions))
    return rmse
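A quick usage example - with the default seed this should reproduce the numeric_a / engine_size entry of the results table below:

# RMSE when predicting price from engine_size alone (seed r=1, k=5)
knn_train_test(numeric_a, 'engine_size', 'price')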
all_features = numeric.columns.tolist()
all_features.remove('price')
df_list = [numeric_a, numeric_b, numeric_c]
df_n_list = [numeric_a_n, numeric_b_n, numeric_c_n]
def fill_dict(model, dlist=df_list, r=1):
    # one RMSE dictionary per dataframe, keyed by feature column
    rmse_dict_a = {}
    rmse_dict_b = {}
    rmse_dict_c = {}
    for col in all_features:
        rmse_dict_a[col] = model(dlist[0], col, 'price', r)
        rmse_dict_b[col] = model(dlist[1], col, 'price', r)
        rmse_dict_c[col] = model(dlist[2], col, 'price', r)
    return rmse_dict_a, rmse_dict_b, rmse_dict_c
def give_results_df(dict1, dict2, dict3):
    results_a = pd.DataFrame.from_dict(dict1, orient='index')
    results_b = pd.DataFrame.from_dict(dict2, orient='index')
    results_c = pd.DataFrame.from_dict(dict3, orient='index')
    results_frame_ab = results_a.merge(results_b, left_index=True, right_index=True, suffixes=['_numeric_A', '_numeric_B'])
    results_frame = results_frame_ab.merge(results_c, left_index=True, right_index=True)
    results_frame = results_frame.rename(columns={results_frame.columns[0]: 'numeric_a',
                                                  results_frame.columns[1]: 'numeric_b',
                                                  results_frame.columns[2]: 'numeric_c'})
    # sort by the column holding the single lowest RMSE
    results_frame = results_frame.sort_values(results_frame.loc[results_frame.idxmin(axis=0)[0], :].idxmin(axis=0))
    return results_frame
rmse_dict_a, rmse_dict_b, rmse_dict_c = fill_dict(knn_train_test)
results_frame = give_results_df(rmse_dict_a, rmse_dict_b, rmse_dict_c )
results_frame
| | numeric_a | numeric_b | numeric_c |
---|---|---|---|
engine_size | 3271.449938 | 4206.269209 | 3014.293169 |
curb_weight | 4445.141920 | 4594.629028 | 3667.924168 |
highway_mpg | 4643.046278 | 4368.230464 | 4139.949639 |
width | 4772.459149 | 5038.643838 | 4290.488430 |
horsepower | 4064.464572 | 4543.317619 | 4441.723870 |
city_mpg | 4760.449842 | 4440.237523 | 4717.122297 |
wheel_base | 5460.851458 | 5692.794469 | 6121.214509 |
length | 5428.810101 | 5212.513611 | 6186.265681 |
compression_rate | 6610.812153 | 7020.882648 | 6210.718457 |
peak_rpm | 7649.170564 | 7309.974616 | 7043.426558 |
normalized_losses | 7339.675755 | 7083.841259 | 7217.866305 |
stroke | 7954.301034 | 7217.395544 | 7269.245529 |
bore | 6778.637424 | 7225.667155 | 7590.625100 |
symboling | 7979.434920 | 7368.869758 | 8050.034443 |
height | 8073.119759 | 7574.428009 | 8304.016043 |
# let's create some helper functions we'll reuse on every plot:
def spines(ax, yl='RMSE', xl=''):
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.spines['left'].set_linewidth(2)
    ax.spines['bottom'].set_linewidth(2)
    ax.set_ylabel(yl, size=14)
    ax.set_xlabel(xl, size=14)
def spines2(ax=None):
    # default to the current axes so the function doesn't depend on a global `ax`
    ax = ax if ax is not None else plt.gca()
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.spines['left'].set_linewidth(3)
    ax.spines['bottom'].set_linewidth(3)
labels = ['numeric_a','numeric_b' ,'numeric_c']
y_list = [results_frame['numeric_a'], results_frame['numeric_b'], results_frame['numeric_c']]
def scatter_3(x, y_list, labels, size=10, alpha=1):
    p1 = plt.scatter(x=x, y=y_list[0], label=labels[0], s=size, alpha=alpha)
    p2 = plt.scatter(x=x, y=y_list[1], label=labels[1], s=size, alpha=alpha)
    p3 = plt.scatter(x=x, y=y_list[2], label=labels[2], s=size, alpha=alpha)
    return p1, p2, p3
def scatter_3v2(x, y_list, labels, size=1, alpha=1, z=True):
    p1 = sns.stripplot(x=x, y=y_list[0], label=labels[0], s=size, alpha=alpha, color='#1f77b4', jitter=z)
    p2 = sns.stripplot(x=x, y=y_list[1], label=labels[1], s=size, alpha=alpha, color='#ff7f0e', jitter=z)
    p3 = sns.stripplot(x=x, y=y_list[2], label=labels[2], s=size, alpha=alpha, color='green', jitter=z)
    return p1, p2, p3
fig, ax = plt.subplots(figsize=(16,8))
scatter_3(results_frame.index, y_list, labels, 50)
spines(ax,xl='column')
ax.tick_params(axis = 'x',labelsize=12, rotation=65)
plt.title('RMSE results for each column and dataframe')
plt.legend()
plt.show()
# the same train/test function, but with the random seed as a required argument
def knn_train_test_v2(df, feature_col, target_col, r):
    # randomize
    np.random.seed(r)
    shuffled_index = np.random.permutation(df.index)
    df = df.reindex(index=shuffled_index)
    split_loc = int(0.5 * len(df))
    # split
    train_set = df.iloc[:split_loc].copy()
    test_set = df.iloc[split_loc:].copy()
    # train and test
    knn = KNeighborsRegressor(n_neighbors=5)
    knn.fit(train_set[[feature_col]], train_set[target_col])
    predictions = knn.predict(test_set[[feature_col]])
    rmse = np.sqrt(mean_squared_error(test_set[target_col], predictions))
    return rmse
df_list = [numeric_a, numeric_b, numeric_c]
results = []
def df_seed_check(df_list, col_name, n):
    # tries n random seeds per dataframe, appends every RMSE to the global
    # `results` list and returns the best (seed, RMSE) pair per dataframe
    best_results = []
    for df in df_list:
        seed_dictionary = {}
        for i in range(n):
            seed_dictionary[i] = knn_train_test_v2(df, col_name, 'price', i)
            results.append(seed_dictionary[i])
        best_results.append(min(seed_dictionary.items(), key=lambda x: x[1]))
    return best_results
results = []
labels = ['numeric_a','numeric_b' ,'numeric_c']
df_seed_check(df_list,'engine_size',1000)
num_a_results = results[:1000].copy()
num_b_results = results[1000:2000].copy()
num_c_results = results[2000:].copy()
fig, ax = plt.subplots(figsize=(16,12))
fig.suptitle('Why is the random seed important?')
grid = plt.GridSpec(8, 1, wspace=0, hspace=19.3)
ax1 = plt.subplot(grid[0:6, 0])
y_list = [num_a_results, num_b_results, num_c_results]
scatter_3(list(range(1,1001)), y_list, labels, 10)
plt.legend()
plt.title('engine_size column RMSE results')
plt.xlabel('Random seed', size=16)
plt.ylabel('RMSE', size=16)
ax1.add_patch(Rectangle((0, 2300), 1000, 400, alpha=0.3, facecolor = 'grey'))
plt.legend(loc=2)
plt.ylim(2000,6500)
ax2 = plt.subplot(grid[6:, 0])
scatter_3(list(range(1,1001)), y_list, labels, 40)
plt.legend()
plt.title('engine_size column results ZOOMED IN')
spines(ax1,xl='Random seed')
ax.tick_params(labelsize=12)
plt.ylim(2300,2700)
plt.legend(loc=3)
con = ConnectionPatch(
xyA=(0.1, 0.1), coordsA=ax1.transAxes,
xyB=(0.1, 2759.7), coordsB=ax2.get_yaxis_transform(),
arrowstyle="->", linewidth=1.5)
ax2.add_artist(con)
plt.show()
The scatter plot above presents 1000 RMSE results for each of our 3 dataframes, using only the engine_size column. For each result we changed the random seed. It clearly shows how much the results can differ when we change the order of the dataframe's index before the train/test split.
At first, using various random seeds looks very attractive - we can improve our results in a very easy way. But we're only improving them on this single dataset... and our end goal is to create a model that can be 'released into the wild' and work on various datasets, isn't it? Thus we should pursue not the single best result but the mean of many results. All of this comes at a price: computing models with multiple random seeds takes time and computational power. We must try to answer the question: how many seeds are enough?
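As a side note, k-fold cross-validation (imported at the top but not used so far) is the standard way to obtain a spread of results without hand-rolling seed loops. A minimal sketch on numeric_c:

# 10-fold CV: every car lands in a test fold exactly once
kf = KFold(n_splits=10, shuffle=True, random_state=1)
mses = cross_val_score(KNeighborsRegressor(), numeric_c[['engine_size']],
                       numeric_c['price'], scoring='neg_mean_squared_error', cv=kf)
rmses = np.sqrt(np.abs(mses))
print(rmses.mean(), rmses.std())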
nums = [4**n for n in range(1, 7)]
nums
[4, 16, 64, 256, 1024, 4096]
mean_list = []
std_list = []
best_list = []
for n in nums:
    results = []
    df_list = [numeric_c_n]
    x = df_seed_check(df_list, 'engine_size', n)
    std_list.append(np.std(results))
    mean_list.append(np.mean(results))
    best_list.append(x)
seed_frame = pd.DataFrame([std_list, mean_list, best_list], index=('std', 'mean', 'best result'), columns=[str(col) for col in nums])
seed_frame = seed_frame.transpose()
# pull the RMSE out of the [(seed, rmse)] pair returned by df_seed_check
seed_frame['best result'] = seed_frame['best result'].str[0].str[1]
seed_frame['mean change'] = (seed_frame['mean'] - seed_frame['mean'].shift()) / seed_frame['mean'].shift() * 100
seed_frame['best result change'] = (seed_frame['best result'] - seed_frame['best result'].shift()) / seed_frame['best result'].shift() * 100
seed_frame.index.name = 'number of seeds'
seed_frame
number of seeds | std | mean | best result | mean change | best result change
---|---|---|---|---|---
4 | 169.773707 | 3220.754961 | 3014.293169 | NaN | NaN |
16 | 531.362191 | 3388.531008 | 2636.971126 | 5.209215 | -12.517762 |
64 | 420.287777 | 3438.397706 | 2636.971126 | 1.471632 | 0.000000 |
256 | 447.763487 | 3397.101321 | 2636.971126 | -1.201036 | 0.000000 |
1024 | 453.550087 | 3385.32257 | 2353.206318 | -0.346729 | -10.761013 |
4096 | 439.748785 | 3372.912832 | 2306.099998 | -0.366575 | -2.001793 |
df_list = [numeric_c]
%timeit df_seed_check(df_list,'engine_size',4)
21.8 ms ± 731 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit df_seed_check(df_list,'engine_size',16)
84.8 ms ± 1.68 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit df_seed_check(df_list,'engine_size',256)
1.43 s ± 79.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit df_seed_check(df_list,'engine_size',1024)
5.59 s ± 101 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
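The runtime grows roughly linearly with the number of seeds, so longer runs are easy to estimate from the timings above:

# back-of-envelope: ~5.59 s for 1024 seeds is ~5.5 ms per seed
per_seed = 5.59 / 1024
print(f'estimated time for 4096 seeds: {per_seed * 4096:.1f} s')  # ~22.4 s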
df_n_list = [numeric_a_n, numeric_b_n, numeric_c_n]
results = []
labels = ['numeric_a_n','numeric_b_n' ,'numeric_c_n']
df_seed_check(df_n_list,'engine_size',100)
num_a_results = results[:100].copy()
num_b_results = results[100:200].copy()
num_c_results = results[200:].copy()
fig, ax = plt.subplots(figsize=(16,8))
ax1 = plt.subplot()
y_list = [num_a_results, num_b_results, num_c_results]
scatter_3(list(range(1,101)), y_list, labels=labels, size=50)
plt.title('Engine_size column RMSE results - 100 random seeds', size=18)
plt.legend(loc=1)
spines(ax1,xl='Random seed')
plt.show()
# create dataframes with average results for every column:
# non normalized results:
df_list = [numeric_a, numeric_b, numeric_c]
# zero out the frame, then accumulate RMSEs over 100 random seeds
for col in results_frame.columns:
    results_frame[col].values[:] = 0
for n in range(100):
    rmse_dict_a, rmse_dict_b, rmse_dict_c = fill_dict(knn_train_test, dlist=df_list, r=n)
    results_df = give_results_df(rmse_dict_a, rmse_dict_b, rmse_dict_c)
    results_frame += results_df
results_frame_non = results_frame / 100
# normalized results:
for col in results_frame.columns:
    results_frame[col].values[:] = 0
for n in range(100):
    rmse_dict_a, rmse_dict_b, rmse_dict_c = fill_dict(knn_train_test, dlist=df_n_list, r=n)
    results_df = give_results_df(rmse_dict_a, rmse_dict_b, rmse_dict_c)
    results_frame += results_df
results_frame_norm = results_frame / 100
results_frame_norm.columns = ['numeric_a_n', 'numeric_b_n', 'numeric_c_n']
y_list1 = [results_frame_non['numeric_a'], results_frame_non['numeric_b'], results_frame_non['numeric_c']]
y_list2 = [results_frame_norm['numeric_a_n'], results_frame_norm['numeric_b_n'], results_frame_norm['numeric_c_n']]
fig, ax = plt.subplots(figsize=(16,8))
scatter_3v2(results_frame_non.index, y_list1, labels, 13, alpha=0.6)
scatter_3v2(results_frame_norm.index, y_list2, labels, 13)
spines(ax,xl='column')
ax.tick_params(axis = 'x',labelsize=12, rotation=65)
handles, labels = ax.get_legend_handles_labels()
l = plt.legend(handles[::15], ['numeric_a','numeric_b','numeric_c','numeric_a_n','numeric_b_n','numeric_c_n'], bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.title('RMSE average results for each column and dataframe - 100 random seeds')
plt.show()