import pandas as pd
import numpy as np
import time
# metrics
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_val_score, GridSearchCV
# models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
# Load the raw StockX sales export.
x = pd.read_csv('stockx.csv')

def _clean_currency(col):
    """Strip '$' and ',' from a currency-string Series and return int64.

    regex=False is required here: with regex matching enabled (the old
    pandas default), '$' is the end-of-string anchor and the literal
    dollar sign is never removed, so the astype('int64') would fail.
    """
    return (col.str.replace('$', '', regex=False)
               .str.replace(',', '', regex=False)
               .astype('int64'))

# Remove $ and , from the price columns and convert to integers.
x['Sale Price'] = _clean_currency(x['Sale Price'])
x['Retail Price'] = _clean_currency(x['Retail Price'])

# The user's profit per item, treating retail price as the acquisition cost.
x['Profit Margin'] = x['Sale Price'] - x['Retail Price']
x['Profit Percent'] = round((x['Profit Margin'] / x['Sale Price']), 5)

# Handle datetime columns and the days elapsed between release and order.
x['Order Date'] = pd.to_datetime(x['Order Date'], infer_datetime_format=True)
x['Release Date'] = pd.to_datetime(x['Release Date'], infer_datetime_format=True)
# .dt.days converts the timedelta column to plain integers in one
# vectorized step (the old apply(lambda x: x.days) also shadowed the
# DataFrame name `x`).
x['Date Difference'] = (x['Order Date'] - x['Release Date']).dt.days
# Pivot table (later moved into Excel for Tableau visualizations):
# aggregate stats of sale price and profit percent per (sneaker, size).
_aggs = [lambda s: len(s.unique()), 'mean', 'max', 'min', 'median']
average_sale = (
    pd.pivot_table(
        x,
        values=['Sale Price', 'Profit Percent'],
        index=['Sneaker Name', 'Shoe Size'],
        aggfunc=_aggs,
    )
    .reset_index()
)
# The unique-count column is only meaningful for Sale Price, so drop the
# Profit Percent copy produced by the lambda aggregator.
average_sale = average_sale.drop(columns=[('<lambda>', 'Profit Percent')])
# average_sale.to_excel('sneaker_size_labels.xlsx')
# Categorical columns to be one-hot encoded with pd.get_dummies();
# shoe size is cast to str so every size becomes its own category.
y = x[['Sneaker Name', 'Shoe Size']].copy()
y['Shoe Size'] = y['Shoe Size'].astype('str')

# One-hot encode the categoricals (the first level is deliberately kept,
# i.e. no drop_first).
g = pd.get_dummies(y, prefix=['Sneaker Name', 'Size'])

# Numeric columns carried over unchanged from the original dataset.
xc = x[['Sale Price', 'Retail Price', 'Date Difference']].copy()

# Full modelling frame: numerics first, then the dummy columns.
n = pd.concat([xc, g], axis=1)
n_target = n['Sale Price']
# Feature matrix: everything after the two price columns.
X = n.iloc[:, 2:]
# Accumulates one metrics dict per model, keyed by display name.
score_dict = {}

def score_me(mn, mc, pn):
    """Record train/test R^2 and error metrics for a fitted regressor.

    Parameters
    ----------
    mn : str
        Display name, used as the key in the global ``score_dict``.
    mc : fitted estimator
        Must expose ``.score()`` (sklearn regressor API).
    pn : array-like
        Predictions for the global ``X_test``.

    Side effects: rebinds the global ``model_name`` to this model's
    metrics dict (the caller attaches a run time to it afterwards) and
    stores the same dict in ``score_dict[mn]``.  Reads the globals
    X_train, X_test, y_train and y_test.
    """
    global model_name
    # Compute MSE once and derive RMSE from it, instead of calling
    # mean_squared_error twice as before.
    mse = metrics.mean_squared_error(y_test, pn)
    model_name = {
        'Train Score': mc.score(X_train, y_train),
        'Test Score': mc.score(X_test, y_test),
        'MAE': metrics.mean_absolute_error(y_test, pn),
        'MSE': mse,
        'RMSE': np.sqrt(mse),
    }
    score_dict[mn] = model_name
# --- Linear Regression baseline for predicting the Sale Price ---
X_train, X_test, y_train, y_test = train_test_split(
    X, n_target, random_state=42, test_size=0.25)

linear_start = time.time()
linear_regressor = LinearRegression().fit(X_train, y_train)
y_pred = linear_regressor.predict(X_test)
score_me(mn='Linear Regression', mc=linear_regressor, pn=y_pred)
linear_end = time.time()
linear_time = linear_end - linear_start
# score_me leaves this model's dict bound to the global `model_name`.
model_name['Run Time(secs)'] = round(linear_time, 3)
# --- Ridge Regression (alpha so small it is nearly plain least squares) ---
ridge_start = time.time()
ridgelin = Ridge(alpha=0.0001)
ridgelin.fit(X_train, y_train)
ridge_pred = ridgelin.predict(X_test)
score_me('Ridge Regression', ridgelin, ridge_pred)
ridge_end = time.time()
ridge_time = ridge_end - ridge_start
model_name['Run Time(secs)'] = round(ridge_time, 3)
# --- Lasso Regression ---
# NOTE(review): with an alpha this small the coordinate-descent solver is
# extremely slow to converge — a ConvergenceWarning was recorded on a
# previous run (see below); max_iter is already raised to 100000.
lasso_start = time.time()
lassolin = Lasso(alpha=0.00001, max_iter=100000)
lassolin.fit(X_train, y_train)
lasso_pred = lassolin.predict(X_test)
score_me('Lasso Regression', lassolin, lasso_pred)
lasso_end = time.time()
lasso_time = lasso_end - lasso_start
model_name['Run Time(secs)'] = round(lasso_time, 3)
# Warning observed on a previous run (Python 3.7 / sklearn coordinate descent):
# ConvergenceWarning: Objective did not converge. You might want to increase
# the number of iterations. Duality gap: 230244458.61999825,
# tolerance: 491388.22208783333
# --- Random Forest Regression ---
rf_start = time.time()
rf = RandomForestRegressor(n_estimators=50, random_state=42).fit(X_train, y_train)
predictions = rf.predict(X_test)
score_me('Random Forest Regression', rf, predictions)
rf_end = time.time()
rf_time = rf_end - rf_start
model_name['Run Time(secs)'] = round(rf_time, 3)
# --- Decision Tree Regression ---
# NOTE(review): no random_state is set here, so tie-breaking in splits may
# vary between runs.
dt_start = time.time()
tree = DecisionTreeRegressor()
tree.fit(X_train, y_train)
tree_predictor = tree.predict(X_test)
score_me('Decision Tree Regression', tree, tree_predictor)
dt_end = time.time()
dt_time = dt_end - dt_start
model_name['Run Time(secs)'] = round(dt_time, 3)
# Collect every model's metrics into one comparison table
# (models as columns, metrics as rows).
model_results = pd.DataFrame(score_dict)
model_results
# Results from a previous run:
#
#                  | Linear Regression | Ridge Regression | Lasso Regression | Random Forest Regression | Decision Tree Regression
# Train Score      |      0.837779     |     0.837779     |     0.837779     |         0.996295         |         0.998664
# Test Score       |      0.837084     |     0.837084     |     0.837084     |         0.980792         |         0.975012
# MAE              |     60.432033     |    60.432033     |    60.432031     |        15.163111         |        16.986706
# MSE              |   10664.318866    |   10664.318800   |   10664.319873   |       1257.314259        |       1635.722941
# RMSE             |    103.268189     |    103.268189    |    103.268194    |         35.458627        |         40.444072
# Run Time(secs)   |      0.227000     |     0.130000     |    527.401000    |        25.100000         |         0.777000
# Sanity check: predict the first ten rows with the fitted decision tree.
test_1 = n.iloc[0:10, 2:]
pd.Series(tree.predict(test_1))
# Output from a previous run:
# 0 1097.0  1 685.0  2 690.0  3 1075.0  4 828.0
# 5  798.0  6 784.0  7 466.5  8  465.0  9 465.0   dtype: float64
n.iloc[:10, :1]
# Output from a previous run (actual Sale Price of the first ten rows):
#
#    | Sale Price
#  0 |    1097
#  1 |     685
#  2 |     690
#  3 |    1075
#  4 |     828
#  5 |     798
#  6 |     784
#  7 |     460
#  8 |     465
#  9 |     465