This is my first attempt at a machine learning model. The following notebook develops a simple ML model and tunes it with different columns, k-values, etc. Being me, I couldn't obediently follow all the lesson's steps. No, instead I decided to fiddle with a few more knobs and combinations:
Links: Dataset | My GitHub | My LinkedIn
Imports:
import pandas as pd
import numpy as np
import random
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
from numpy import arange
from matplotlib.patches import ConnectionPatch
from matplotlib.patches import Rectangle
from collections import Counter
from itertools import combinations
import matplotlib.image as mpimg
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score, KFold
cols = ['symboling', 'normalized_losses', 'make', 'fuel_type', 'aspiration', 'num_doors', 'body_style',
'drive_wheels', 'engine_location', 'wheel_base', 'length', 'width', 'height', 'curb_weight', 'engine_type',
'num_cylinders', 'engine_size', 'fuel_system', 'bore', 'stroke', 'compression_rate', 'horsepower', 'peak_rpm', 'city_mpg', 'highway_mpg', 'price']
df = pd.read_csv('imports-85.data', names=cols)
df.head()
df.info()
df.describe(include='all')
We have some missing values. Instead of applying one method to a single dataframe, we'll create six different dataframes:
We'll also create normalized versions of the above dataframes: numeric_a_n, numeric_b_n, numeric_c_n
numeric_cols = ['normalized_losses', 'price', 'bore', 'stroke', 'horsepower', 'peak_rpm',]
# '?' marks missing values in this dataset; turn them into NaN, then cast to numbers
df[numeric_cols] = df[numeric_cols].replace('?', np.nan)
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric)
df[df.columns[df.isna().any()]].isna().sum()
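As a side note, the two cleanup lines above can be collapsed into one step. A minimal sketch, assuming '?' is the only non-numeric marker in these columns:
# errors='coerce' turns any unparsable string (like '?') straight into NaN
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')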
numeric = df.loc[:, df.dtypes != object].copy()
numeric_a = numeric.dropna(subset=['price']).copy()
numeric_a = numeric_a.fillna(numeric_a.mean())
numeric_b = numeric.dropna(subset = [ 'bore', 'stroke', 'horsepower', 'peak_rpm', 'price']).copy()
numeric_b = numeric_b.fillna(numeric_b.mean())
corr_df = numeric_a.corr()['price'].to_frame().merge(numeric_b.corr()['price'],
left_index=True, right_index=True, suffixes=['_A_corr', '_B_corr'])
corr_df.sort_values('price_A_corr')
How do we fill in the missing price values for numeric_c?
In the numeric_a and numeric_b dataframes we filled in the missing values using the column mean. That's a very simplistic approach, which we're going to modify and upgrade for the numeric_c dataframe. Our plan is to be much more thorough about filling in the missing price values in this last dataframe, which is why we created a correlation dataframe for the 'price' column.
numeric_c = numeric.dropna(subset = [ 'bore', 'stroke', 'horsepower', 'peak_rpm']).copy()
numeric_c[numeric_c['price'].isnull()]
# bin engine_size into 20 intervals and compute the mean price per bin
bins = numeric_c['engine_size'].value_counts(bins=20).index
numeric_c.groupby(pd.cut(numeric_c.engine_size, bins=bins))['price'].mean()
numeric_c[numeric_c['price'].isnull()]
# fill each missing price with the mean price of its engine_size bin (values read from the table above)
numeric_c.loc[9, 'price'] = 16520.750000
numeric_c.loc[44, 'price'] = 7265.948276
numeric_c.loc[45, 'price'] = 7265.948276
numeric_c.loc[129, 'price'] = 34528.000000
numeric_c[numeric_c.columns[numeric_c.isna().any()]].isna().sum()
numeric_c = numeric_c.fillna(numeric_c.mean())
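Hardcoding those four prices works, but it's brittle if the data ever changes. A minimal sketch of the same bin-mean idea done programmatically (an alternative to the cells above, reusing the bins variable and assuming every missing-price row falls inside one of the bins):
# per-row mean price of the engine_size bin each car falls into
bin_means = numeric_c.groupby(pd.cut(numeric_c.engine_size, bins=bins))['price'].transform('mean')
numeric_c['price'] = numeric_c['price'].fillna(bin_means)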
Now that we have 3 different datasets, let's create normalized versions of them. We'll add "_n" at the end of their names for 'normalized'.
price_col = numeric_a['price']
numeric_a_n = (numeric_a - numeric_a.min()) / (numeric_a.max() - numeric_a.min())
numeric_a_n['price'] = price_col
price_col = numeric_b['price']
numeric_b_n = (numeric_b - numeric_b.min()) / (numeric_b.max() - numeric_b.min())
numeric_b_n['price'] = price_col
price_col = numeric_c['price']
numeric_c_n = (numeric_c - numeric_c.min()) / (numeric_c.max() - numeric_c.min())
numeric_c_n['price'] = price_col
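For reference, scikit-learn ships the same min-max scaling. A minimal sketch (feature_cols and numeric_a_n2 are names of my own; 'price' stays unscaled, as above):
from sklearn.preprocessing import MinMaxScaler

# scale every column except the target, mirroring the manual formula above
feature_cols = [c for c in numeric_a.columns if c != 'price']
numeric_a_n2 = numeric_a.copy()
numeric_a_n2[feature_cols] = MinMaxScaler().fit_transform(numeric_a[feature_cols])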
In the following steps we'll create a model using only one column from the dataframe to predict the price value. We'll test different dataframes and columns and check their performance.
def knn_train_test(df, feature_col, target_col, r=1):
# randomize
np.random.seed(r)
shuffled_index = np.random.permutation(df.index)
df = df.reindex(index = shuffled_index)
split_loc = int(0.5*len(df))
# split
train_set = df.iloc[:split_loc].copy()
test_set = df.iloc[split_loc:].copy()
# train and predict
knn = KNeighborsRegressor()
knn.fit(train_set[[feature_col]], train_set[target_col])
predictions = knn.predict(test_set[[feature_col]])
rmse = np.sqrt(mean_squared_error(test_set[target_col], predictions))
return rmse
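A quick sanity check of the function on a single dataframe/column pair (the exact value depends on the split, so treat the output as illustrative):
# univariate model: engine_size -> price, default k=5 and seed=1
knn_train_test(numeric_a, 'engine_size', 'price')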
all_features = numeric.columns.tolist()
all_features.remove('price')
df_list = [numeric_a, numeric_b, numeric_c]
df_n_list = [numeric_a_n, numeric_b_n, numeric_c_n]
def fill_dict(model, dlist=df_list, r=1):
    rmse_dict_a = {}
    rmse_dict_b = {}
    rmse_dict_c = {}
    for col in all_features:
        rmse_dict_a[col] = model(dlist[0], col, 'price', r)
        rmse_dict_b[col] = model(dlist[1], col, 'price', r)
        rmse_dict_c[col] = model(dlist[2], col, 'price', r)
    return rmse_dict_a, rmse_dict_b, rmse_dict_c
def give_results_df(dict1, dict2, dict3):
results_a = pd.DataFrame.from_dict(dict1, orient='index')
results_b = pd.DataFrame.from_dict(dict2, orient='index')
results_c = pd.DataFrame.from_dict(dict3, orient='index')
results_frame_ab = results_a.merge(results_b, left_index=True, right_index=True, suffixes=['_numeric_A', '_numeric_B'])
results_frame = results_frame_ab.merge(results_c, left_index=True, right_index=True)
results_frame = results_frame.rename(columns={results_frame.columns[0]: 'numeric_a', results_frame.columns[1]: 'numeric_b',
results_frame.columns[2]: 'numeric_c'
})
    # sort by the column that contains the overall lowest RMSE
    results_frame = results_frame.sort_values(results_frame.min().idxmin())
return results_frame
rmse_dict_a, rmse_dict_b, rmse_dict_c = fill_dict(knn_train_test)
results_frame = give_results_df(rmse_dict_a, rmse_dict_b, rmse_dict_c )
results_frame
# let's create some helper functions we're going to use repeatedly on plots:
def spines(ax, yl='RMSE', xl=''):
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.spines['left'].set_linewidth(2)
    ax.spines['bottom'].set_linewidth(2)
    ax.set_ylabel(yl, size=14)
    ax.set_xlabel(xl, size=14)
def spines2(ax):
    # heavier-spines variant (takes the axis explicitly instead of relying on a global)
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.spines['left'].set_linewidth(3)
    ax.spines['bottom'].set_linewidth(3)
labels = ['numeric_a','numeric_b' ,'numeric_c']
y_list = [results_frame['numeric_a'], results_frame['numeric_b'], results_frame['numeric_c']]
def scatter_3(x, y_list, labels, size=10, alpha=1):
    p1 = plt.scatter(x=x, y=y_list[0], label=labels[0], s=size, alpha=alpha)
    p2 = plt.scatter(x=x, y=y_list[1], label=labels[1], s=size, alpha=alpha)
    p3 = plt.scatter(x=x, y=y_list[2], label=labels[2], s=size, alpha=alpha)
    return p1, p2, p3
def scatter_3v2(x, y_list, labels, size=1, alpha = 1, z=True):
p1 = sns.stripplot(x=x, y=y_list[0], label=labels[0], s=size, alpha=alpha, color='#1f77b4', jitter=z)
p2 = sns.stripplot(x=x, y=y_list[1], label=labels[1], s=size, alpha=alpha, color='#ff7f0e' , jitter=z)
p3 = sns.stripplot(x=x, y=y_list[2], label=labels[2], s=size, alpha=alpha, color='green', jitter=z)
return p1, p2, p3
fig, ax = plt.subplots(figsize=(16,8))
scatter_3(results_frame.index, y_list, labels, 50)
spines(ax,xl='column')
ax.tick_params(axis = 'x',labelsize=12, rotation=65)
plt.title('RMSE results for each column and dataframe')
plt.legend()
plt.show()
# the same train/test function, but with the random seed as a required input:
def knn_train_test_v2(df, feature_col, target_col, r):
# randomize
np.random.seed(r)
shuffled_index = np.random.permutation(df.index)
df = df.reindex(index = shuffled_index)
split_loc = int(0.5*len(df))
# split
train_set = df.iloc[:split_loc].copy()
test_set = df.iloc[split_loc:].copy()
# train and test
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(train_set[[feature_col]], train_set[target_col])
predictions = knn.predict(test_set[[feature_col]])
rmse = np.sqrt(mean_squared_error(test_set[target_col], predictions))
return rmse
df_list = [numeric_a, numeric_b, numeric_c]
results = []
def df_seed_check(df_list, col_name, n):
    # note: appends every RMSE to the global `results` list as a side effect
    best_results = []
    for df in df_list:
        seed_dictionary = {}
        for i in range(n):
            seed_dictionary[i] = knn_train_test_v2(df, col_name, 'price', i)
            results.append(seed_dictionary[i])
        # keep the (seed, rmse) pair with the lowest RMSE for this dataframe
        best_results.append(min(seed_dictionary.items(), key=lambda x: x[1]))
    return best_results
results = []
labels = ['numeric_a','numeric_b' ,'numeric_c']
df_seed_check(df_list,'engine_size',1000)
num_a_results = results[:1000].copy()
num_b_results = results[1000:2000].copy()
num_c_results = results[2000:].copy()
fig, ax = plt.subplots(figsize=(16,12))
fig.suptitle('Why is the random seed important?')
grid = plt.GridSpec(8, 1, wspace=0, hspace=19.3)
ax1 = plt.subplot(grid[0:6, 0])
y_list = [num_a_results, num_b_results, num_c_results]
scatter_3(list(range(1,1001)), y_list, labels, 10)
plt.legend()
plt.title('engine_size column RMSE results')
plt.xlabel('Random seed', size=16)
plt.ylabel('RMSE', size=16)
ax1.add_patch(Rectangle((0, 2300), 1000, 400, alpha=0.3, facecolor = 'grey'))
plt.legend(loc=2)
plt.ylim(2000,6500)
ax2 = plt.subplot(grid[6:, 0])
scatter_3(list(range(1,1001)), y_list, labels, 40)
plt.legend()
plt.title('engine_size column results (zoomed in)')
spines(ax1,xl='Random seed')
ax.tick_params(labelsize=12)
plt.ylim(2300,2700)
plt.legend(loc=3)
con = ConnectionPatch(
xyA=(0.1, 0.1), coordsA=ax1.transAxes,
xyB=(0.1, 2759.7), coordsB=ax2.get_yaxis_transform(),
arrowstyle="->", linewidth=1.5)
ax2.add_artist(con)
plt.show()
The above scatter plot presents 1000 RMSE results for each of our 3 dataframes, using only the engine_size column. For each result we changed the random seed. This clearly shows how much the results can vary when we change the order of the dataframe's index before splitting.
At first, using various random seeds looks very attractive: we can improve our results in a very easy way. But we're only improving them on this single dataset... and our end goal is a model that can be 'released into the wild' and work on various datasets, isn't it? So we should pursue not the single best result but the mean of many results. All of this comes at a price: computing models with multiple random seeds takes time and computational power. We must try to answer the question: how much is enough?
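One standard answer is k-fold cross-validation, which rotates the test fold instead of re-splitting at random; we imported cross_val_score and KFold at the top but haven't used them yet. A minimal sketch (cv_rmse is a name of my own, not part of the pipeline above):
def cv_rmse(df, feature_col, target_col, folds=10, seed=1):
    # shuffle once, then average the RMSE over the folds
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    knn = KNeighborsRegressor()
    mses = cross_val_score(knn, df[[feature_col]], df[target_col],
                           scoring='neg_mean_squared_error', cv=kf)
    return np.sqrt(np.abs(mses)).mean()

cv_rmse(numeric_c_n, 'engine_size', 'price')
Sticking with the random-seed approach for now, though, let's measure how the results change as we add more seeds: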
# seed counts to test: powers of 4
nums = [4**n for n in range(1, 7)]
nums
mean_list = []
std_list = []
best_list = []
for n in nums:
    results = []
    df_list = [numeric_c_n]
    x = df_seed_check(df_list, 'engine_size', n)
    mean_list.append(np.mean(results))
    std_list.append(np.std(results))
    best_list.append(x)
seed_frame = pd.DataFrame([std_list, mean_list, best_list], index=('std', 'mean', 'best result'), columns=[str(col) for col in nums])
seed_frame = seed_frame.transpose()
# each 'best result' cell holds [(seed, rmse)]; extract the rmse value
seed_frame['best result'] = seed_frame['best result'].str[0].str[1]
seed_frame['mean change'] = (seed_frame['mean'] - seed_frame['mean'].shift()) / seed_frame['mean'].shift() * 100
seed_frame['best result change'] = (seed_frame['best result'] - seed_frame['best result'].shift()) / seed_frame['best result'].shift() * 100
seed_frame.index.name = 'number of seeds'
seed_frame
df_list = [numeric_c]
%timeit df_seed_check(df_list,'engine_size',4)
%timeit df_seed_check(df_list,'engine_size',16)
%timeit df_seed_check(df_list,'engine_size',256)
%timeit df_seed_check(df_list,'engine_size',1024)
df_n_list = [numeric_a_n, numeric_b_n, numeric_c_n]
results = []
labels = ['numeric_a_n','numeric_b_n' ,'numeric_c_n']
df_seed_check(df_n_list,'engine_size',100)
num_a_results = results[:100].copy()
num_b_results = results[100:200].copy()
num_c_results = results[200:].copy()
fig, ax = plt.subplots(figsize=(16,8))
ax1 = plt.subplot()
y_list = [num_a_results, num_b_results, num_c_results]
scatter_3(list(range(1,101)), y_list, labels=labels, size=50)
plt.title('Engine_size column RMSE results - 100 random seeds', size=18)
plt.legend(loc=1)
spines(ax1,xl='Random seed')
plt.show()
# create dataframes with average results for every column:
# non-normalized results:
df_list = [numeric_a, numeric_b, numeric_c]
for col in results_frame.columns:
    results_frame[col].values[:] = 0
for n in range(100):
    rmse_dict_a, rmse_dict_b, rmse_dict_c = fill_dict(knn_train_test, dlist=df_list, r=n)
    results_df = give_results_df(rmse_dict_a, rmse_dict_b, rmse_dict_c)
    results_frame += results_df
results_frame_non = results_frame / 100
# normalized results:
for col in results_frame.columns:
    results_frame[col].values[:] = 0
for n in range(100):
    rmse_dict_a, rmse_dict_b, rmse_dict_c = fill_dict(knn_train_test, dlist=df_n_list, r=n)
    results_df = give_results_df(rmse_dict_a, rmse_dict_b, rmse_dict_c)
    results_frame += results_df
results_frame_norm = results_frame / 100
results_frame_norm.columns = ['numeric_a_n','numeric_b_n','numeric_c_n']
y_list1 = [results_frame_non['numeric_a'], results_frame_non['numeric_b'], results_frame_non['numeric_c']]
y_list2 = [results_frame_norm['numeric_a_n'], results_frame_norm['numeric_b_n'], results_frame_norm['numeric_c_n']]
fig, ax = plt.subplots(figsize=(16,8))
scatter_3v2(results_frame_non.index, y_list1, labels, 13, alpha=0.6)
scatter_3v2(results_frame_norm.index, y_list2, labels, 13)
spines(ax,xl='column')
ax.tick_params(axis = 'x',labelsize=12, rotation=65)
handles, labels = ax.get_legend_handles_labels()
l = plt.legend(handles[::15], ['numeric_a','numeric_b','numeric_c','numeric_a_n','numeric_b_n','numeric_c_n'], bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.title('RMSE average results for each column and dataframe - 100 random seeds')
plt.show()
Observations:
# let's check our average results, BUT only for the 6 best columns (the idea:
# we won't need all of the columns for multi-column models)
pd.concat([results_frame_norm[:6].mean(), results_frame_non[:6].mean()])
Knowing that we'll try to develop multiple-column models in the future, it's worth trying to guess which columns we're going to use in those models. The engine_size column is always going to be included, but what other columns are worthy of our attention?
results_frame_norm[:6]
We'll use the index of the above dataframe for column selection.
Let's check the average RMSE across the 2 best columns for every dataframe:
results_frame_norm[:2].mean()
Let's check the average RMSE across the 4 best columns for every dataframe:
results_frame_norm[:4].mean()
Let's check the average RMSE across the 6 best columns for every dataframe:
results_frame_norm[:6].mean()
Let's include multiple columns in our model. We'll use the index of the dataframe with the best columns (step 2.4) as an indicator of which columns we should use.
# let's change our previous train/test function so that it includes multiple columns:
def knn_train_test_v3(df, train_cols, target_col, seed, k=5):
    # randomize
    np.random.seed(seed)
    shuffled_index = np.random.permutation(df.index)
    df = df.reindex(index=shuffled_index)
    split_loc = int(0.5 * len(df))
    # split
    train_set = df.iloc[:split_loc].copy()
    test_set = df.iloc[split_loc:].copy()
    # futureproof: we're not going to use multiple k values yet, but we will in the future
    knn = KNeighborsRegressor(n_neighbors=k)
    # model:
    knn.fit(train_set[train_cols], train_set[target_col])
    predictions = knn.predict(test_set[train_cols])
    rmse = np.sqrt(mean_squared_error(test_set[target_col], predictions))
    return rmse
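For example, a two-column model on the normalized numeric_c dataframe (the exact value depends on the seed and k):
knn_train_test_v3(numeric_c_n, ['engine_size', 'horsepower'], 'price', seed=1, k=5)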
def fill_dict_v2(model, col_list, dlist=df_n_list, r=1, k=5):
rmse_dict_a = {}
rmse_dict_b = {}
rmse_dict_c = {}
rmse_dict_a[str(col_list)] = model(dlist[0], col_list, 'price', r, k)
rmse_dict_b[str(col_list)] = model(dlist[1], col_list, 'price', r, k)
rmse_dict_c[str(col_list)] = model(dlist[2], col_list, 'price', r, k)
return rmse_dict_a, rmse_dict_b, rmse_dict_c
# loop of columns:
df_results_list = []
for n in list(range(2,8)):
col_list = results_frame_norm.index[:n].tolist()
dict_a, dict_b, dict_c = fill_dict_v2(knn_train_test_v3, col_list)
results_frame = give_results_df(dict_a, dict_b, dict_c)
df_results_list.append(results_frame)
multi_cols_df = pd.concat(df_results_list)
multi_cols_df.columns = ['numeric_a_n', 'numeric_b_n', 'numeric_c_n']
multi_cols_df.sort_values('numeric_c_n')
That's a significant improvement. Let's not forget, though: it's a single test with one random seed.
for col in multi_cols_df.columns:
    multi_cols_df[col].values[:] = 0
cols_frame = multi_cols_df[:0].copy()
# loop of random seeds:
for i in range(100):
    # loop of columns:
    df_results_list = []
    for n in range(2, 8):
        col_list = results_frame_norm.index[:n].tolist()
        dict_a, dict_b, dict_c = fill_dict_v2(knn_train_test_v3, col_list, r=i)
        results_frame = give_results_df(dict_a, dict_b, dict_c)
        df_results_list.append(results_frame)
    to_add = pd.concat(df_results_list)
    to_add.columns = ['numeric_a_n', 'numeric_b_n', 'numeric_c_n']
    cols_frame = pd.concat([cols_frame, to_add])
cols_frame[:5]
Let's group the results by their input columns and display the mean value for every column combination:
mean_frame = cols_frame.groupby(level=0).mean()
mean_frame['min value'] = mean_frame.min(axis=1)
mean_frame['best df'] = mean_frame.loc[:,:'numeric_c_n'].idxmin(axis=1)
mean_frame.sort_values('min value')[:5]
Mean observations:
Let's group the results by their input columns and display the minimum value for every column combination:
min_frame = cols_frame.groupby(level=0).min()
min_frame['min value'] = min_frame.min(axis=1)
min_frame['best df'] = min_frame.loc[:,:'numeric_c_n'].idxmin(axis=1)
min_frame.sort_values('min value')
cols_frame.groupby(level=0).min().min()
Min value observations:
std_frame = cols_frame.groupby(level=0).std()
std_frame['min value'] = std_frame.min(axis=1)
std_frame['best df'] = std_frame.loc[:,:'numeric_c_n'].idxmin(axis=1)
std_frame.sort_values('min value')
cols_frame.groupby(level=0).std().min()
Std value observations:
Having tested a multivariate model, we've improved our results. But let's not forget how we chose the columns for that model:
best_cols_list = results_frame_norm.index[:7].tolist()
top6cols = best_cols_list[1:7]
best_cols_list
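To see what the combinations loop below will work with, here's itertools.combinations on a toy list; in the loop, every such subset then gets engine_size (the first element of best_cols_list) appended:
list(combinations(['a', 'b', 'c'], 2))
# [('a', 'b'), ('a', 'c'), ('b', 'c')]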
for col in multi_cols_df.columns:
    multi_cols_df[col].values[:] = 0
cols_frame = multi_cols_df[:0].copy()
# random seed loop:
for i in range(100):
    # column combinations loop:
    for n in range(1, 7):
        cc = list(combinations(top6cols, n))
        cols_list = []
        for el in cc:
            cols_list.append(list(el))
        # engine_size (the best single column) joins every combination:
        for el in cols_list:
            el.append(best_cols_list[0])
        # cols list ready now
        df_results_list = []
        for el in cols_list:
            dict_a, dict_b, dict_c = fill_dict_v2(knn_train_test_v3, el, r=i)
            results_frame = give_results_df(dict_a, dict_b, dict_c)
            df_results_list.append(results_frame)
        to_add = pd.concat(df_results_list)
        to_add.columns = ['numeric_a_n', 'numeric_b_n', 'numeric_c_n']
        cols_frame = pd.concat([cols_frame, to_add])
cols_frame[:10]
That's quite a few rows to process and analyse. We'll start with a simple visualization to help us understand the next steps.
cols_frame_plot = cols_frame.copy()
index_list = []
for el in cols_frame_plot.index:
    # count the commas in the stringified column list to get the column count
    index_list.append(str(el.count(',') + 1) + ' cols')
cols_frame_plot.index = index_list
cols_frame_plot = cols_frame_plot.sort_index()
mean_frame = cols_frame.groupby(level=0).mean()
index_list = []
for el in mean_frame.index:
index_list.append((str(el.count(',')+1)+' cols'))
mean_frame_plot = mean_frame.copy()
mean_frame_plot.index = index_list
mean_frame_plot = mean_frame_plot.sort_index()
mean_frame_plot2 = mean_frame_plot.groupby(level=0).mean()
fig, ax = plt.subplots(figsize=(16,23))
fig.subplots_adjust(hspace=0.2, wspace=0.2)
ax0=plt.subplot(311)
y_list = [cols_frame_plot['numeric_a_n'], cols_frame_plot['numeric_b_n'], cols_frame_plot['numeric_c_n']]
scatter_3v2(cols_frame_plot.index,y_list,labels, size=1.2, alpha = 1, z=0.3)
plt.title('Plot 1: Results (63 column combinations, 100 random seeds)',size=14)
handles, labels = ax0.get_legend_handles_labels()
l = plt.legend(handles[::6], ['numeric_a_n','numeric_b_n','numeric_c_n'], loc=1, borderaxespad=0.)
# legend points too small? solution:
for el in l.legendHandles:
el._sizes = [40]
ax0.set_facecolor('black')
ax1=plt.subplot(312)
y_list = [mean_frame_plot['numeric_a_n'], mean_frame_plot['numeric_b_n'], mean_frame_plot['numeric_c_n']]
scatter_3v2(mean_frame_plot.index,y_list,labels, size=8, alpha = 1)
handles, labels = ax1.get_legend_handles_labels()
l = plt.legend(handles[::6], ['numeric_a_n','numeric_b_n','numeric_c_n'], loc=1, borderaxespad=0.)
plt.title('Plot 2: Average results for every column combination',size=14)
ax2=plt.subplot(313)
y_list = [mean_frame_plot2['numeric_a_n'], mean_frame_plot2['numeric_b_n'], mean_frame_plot2['numeric_c_n']]
scatter_3v2(mean_frame_plot2.index, y_list, labels, size=12, alpha = 1)
handles, labels = ax2.get_legend_handles_labels()
l = plt.legend(handles[::6], ['numeric_a_n','numeric_b_n','numeric_c_n'], loc=2, borderaxespad=0.)
plt.title('Plot 3: Average results for column count',size=14)
axes = [ax0, ax1, ax2]
for ax in axes:
spines(ax, xl='Column count')
# ax.set_ylabel('Rmse')
# ax.set_xlabel('Column count')
plt.show()
The three plots may look a bit overwhelming at first, but they shouldn't be hard to understand:
Now, instead of trying to guess which dot is lower, let's look at the numbers:
# let's group all the '100 random seed runs' into 1 row:
mean_frame = cols_frame.groupby(level=0).mean()
# analyze results:
mean_frame['min value'] = mean_frame.min(axis=1)
mean_frame['best df'] = mean_frame.loc[:,:'numeric_c_n'].idxmin(axis=1)
mean_frame.sort_values('min value')[:5]
(side note: remember which columns and dataframe were considered the best in step 2.4? Are they different from the columns in the above table?)
Quick reminder: every row is an individual column combination that went through our model 100 times, with a different random seed each time. Then we calculated the mean of those 100 results for every combination, and now we can see the best (lowest) 5 results above.
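A quick sanity check of that claim (every combination should appear exactly 100 times in cols_frame):
cols_frame.groupby(level=0).size().unique()
# expected: array([100])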
Observations:
# let's group all the '100 random seed runs' into 1 row:
std_frame = cols_frame.groupby(level=0).std()
# analyze results:
std_frame['min value'] = std_frame.min(axis=1)
std_frame['best df'] = std_frame.loc[:,:'numeric_c_n'].idxmin(axis=1)
std_frame.sort_values('min value')[:5]
Observations:
# let's group all the '100 random seed runs' into 1 row:
min_frame = cols_frame.groupby(level=0).min()
# analyze results:
min_frame['min value'] = min_frame.min(axis=1)
min_frame['best df'] = min_frame.loc[:,:'numeric_c_n'].idxmin(axis=1)
min_frame.sort_values('min value')[:5]
Having looked at the mean of the results and their std, it's worth taking a peek at the single best result and the combination that achieved it (the 'curb_weight', 'highway_mpg', 'horsepower', 'city_mpg', 'engine_size' columns on the numeric_a_n dataframe):
std_frame.loc["['curb_weight', 'highway_mpg', 'horsepower', 'city_mpg', 'engine_size']"]
mean_frame.loc["['curb_weight', 'highway_mpg', 'horsepower', 'city_mpg', 'engine_size']"]
Now let's recreate all the steps from the previous chapter and multiply them by 25 'k' values... Whoa, that's 3 dataframes, 63 column combinations, 100 random seeds and now 25 different k values. Remember when we used to test more than 1000 random seeds?
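For scale, a back-of-the-envelope count of the model fits in the loop below:
# 63 column combinations x 3 dataframes x 100 seeds x 25 k values
combos = sum(len(list(combinations(top6cols, n))) for n in range(1, 7))  # 2**6 - 1 = 63
combos * 3 * 100 * 25  # 472,500 fits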
best_cols_list = results_frame_norm.index[:7].tolist()
top6cols = best_cols_list[1:7]
best_cols_list
for col in multi_cols_df.columns:
    multi_cols_df[col].values[:] = 0
cols_framek = multi_cols_df[:0].copy()
# k value loop (25 values: k = 1..25)
for z in range(1, 26):
    # random seed loop (100):
    for i in range(100):
        # column combinations loop (sizes 1..6):
        for n in range(1, 7):
            cc = list(combinations(top6cols, n))
            cols_list = []
            for el in cc:
                cols_list.append(list(el))
            for el in cols_list:
                el.append(best_cols_list[0])
            # cols list ready now
            df_results_list = []
            for el in cols_list:
                dict_a, dict_b, dict_c = fill_dict_v2(knn_train_test_v3, el, r=i, k=z)
                results_frame = give_results_df(dict_a, dict_b, dict_c)
                results_frame = results_frame.rename(index={results_frame.index[-1]: results_frame.index[-1] + ' k: ' + str(z)})
                df_results_list.append(results_frame)
            to_add = pd.concat(df_results_list)
            to_add.columns = ['numeric_a_n', 'numeric_b_n', 'numeric_c_n']
            cols_framek = pd.concat([cols_framek, to_add])
cols_framek[:10]
# let's group all the '100 random seed runs' into 1 row (per column combination and k value):
mean_frame = cols_framek.groupby(level=0).mean()
# analyze results:
mean_frame['min value'] = mean_frame.min(axis=1)
mean_frame['best df'] = mean_frame.loc[:,:'numeric_c_n'].idxmin(axis=1)
mean_frame.sort_values('min value')[:5]
min_frame = cols_framek.groupby(level=0).min()
# analyze results:
min_frame['min value'] = min_frame.min(axis=1)
min_frame['best df'] = min_frame.loc[:,:'numeric_c_n'].idxmin(axis=1)
min_frame.sort_values('min value')[:5]
# let's group the dataframe rows by number of columns and k value:
cols_framek_plot = cols_framek.groupby(level=0).mean()
index_list = []
for el in cols_framek_plot.index:
index_list.append((str(el.count(',')+1)+' cols' + el[el.find(' k:'):]))
cols_framek_plot.index = index_list
cols_ks_frame = cols_framek_plot.groupby(level=0).mean()
cols_ks_frame['cols'] = cols_ks_frame.index.str[:6]
index_list = []
for el in cols_ks_frame.index:
index_list.append(int(el[el.find('k:')+2:]))
cols_ks_frame.index = index_list
cols_ks_frame.sort_index()
n = 0
fig, ax = plt.subplots(figsize=(16,16))
for el in cols_ks_frame['cols'].value_counts().index.tolist()[:-2]:
n += 0.25
frame1 = cols_ks_frame[cols_ks_frame['cols']==el].sort_index()
y_list = [frame1['numeric_a_n'], frame1['numeric_b_n'], frame1['numeric_c_n']]
scatter_3v2(frame1.index,y_list,labels, size=10, alpha = 1.25-n, z=0.3)
spines(ax,xl='k value')
# legend labels:
labels_list = []
col_label_list = cols_ks_frame['cols'].value_counts().index.tolist()[:-2]
df_label_list = ['numeric_a_n', 'numeric_b_n', 'numeric_c_n']
for ele in col_label_list:
for el in df_label_list:
labels_list.append(el+' '+ele)
handles, labels = ax.get_legend_handles_labels()
l = plt.legend(handles[::25],labels_list, loc=2, borderaxespad=0.)
plt.title('Average RMSE results per k value, dataframe and number of columns')
plt.show()
mean_frame_plot0 = cols_framek.groupby(level=0).mean().copy()
k_index = []
for el in mean_frame_plot0.index:
k_index.append(int(el[el.find('k:')+2:]))
mean_frame_plot0.index = k_index
index_list = []
mean_frame_plot = cols_framek.groupby(level=0).mean()
for el in mean_frame_plot.index:
index_list.append((str(el.count(',')+1)+' cols'))
mean_frame_plot.index = index_list
fig, ax = plt.subplots(figsize=(16,12))
ax1 = plt.subplot(211)
y_list = [mean_frame_plot0['numeric_a_n'], mean_frame_plot0['numeric_b_n'], mean_frame_plot0['numeric_c_n']]
scatter_3v2(mean_frame_plot0.index,y_list,labels, size=3, alpha = 1, z=0.4)
spines(ax1, xl='K value')
handles, labels = ax1.get_legend_handles_labels()
l = plt.legend(handles[::6], ['numeric_a_n','numeric_b_n','numeric_c_n'], loc=2, borderaxespad=0.)
# legend points too small? solution:
for el in l.legendHandles:
    el._sizes = [40]
ax2 = plt.subplot(212)
y_list = [mean_frame_plot['numeric_a_n'], mean_frame_plot['numeric_b_n'], mean_frame_plot['numeric_c_n']]
scatter_3v2(mean_frame_plot.index,y_list,labels, size=3, alpha = 1,z=0.4)
spines(ax2, xl='Number of columns')
handles, labels = ax2.get_legend_handles_labels()
l = plt.legend(handles[::6], ['numeric_a_n', 'numeric_b_n', 'numeric_c_n'], loc=1, borderaxespad=0.)
# legend points too small? solution:
for el in l.legendHandles:
    el._sizes = [40]
plt.show()