Off-White & Yeezy Sneaker Analysis by Stockx.com

In [53]:
import pandas as pd
import numpy as np
import time

# data splitting, model selection and evaluation metrics
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from sklearn.model_selection import cross_val_score, GridSearchCV

# models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
In [54]:
# Load the raw sales export from the working directory. Later cells read the
# columns 'Sale Price', 'Retail Price', 'Order Date', 'Release Date',
# 'Sneaker Name' and 'Shoe Size' from this frame.
x = pd.read_csv('stockx.csv')
In [55]:
# Strip the currency formatting ("$" and thousands separators) from both price
# columns and store them as int64 so they can be used in arithmetic.
#  - regex=False is explicit because "$" is a regex metacharacter and the
#    default of `regex` differs between pandas versions.
#  - astype(str) first makes the cell safe to re-run: on a second run the
#    columns are already integers and the .str accessor would otherwise raise.
for price_col in ['Sale Price', 'Retail Price']:
    x[price_col] = (
        x[price_col]
        .astype(str)
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype('int64')
    )

# Profit per item, treating the retail price as the acquisition cost.
x['Profit Margin'] = x['Sale Price'] - x['Retail Price']
# Profit expressed as a fraction of the sale price, rounded to 5 decimals.
x['Profit Percent'] = (x['Profit Margin'] / x['Sale Price']).round(5)

# Parse the date columns. `infer_datetime_format` is deprecated in recent
# pandas releases, so it is no longer passed.
x['Order Date'] = pd.to_datetime(x['Order Date'])
x['Release Date'] = pd.to_datetime(x['Release Date'])

# Whole days between release and order, via the vectorised .dt accessor
# (avoids a per-row lambda whose parameter shadowed the DataFrame `x`).
x['Date Difference'] = (x['Order Date'] - x['Release Date']).dt.days
In [56]:
# Summary table per (sneaker, shoe size), intended for export to Excel and use
# in Tableau. For 'Sale Price' and 'Profit Percent' it aggregates the number of
# distinct values, the mean, max, min and median.
average_sale = pd.pivot_table(
    x,
    index=['Sneaker Name', 'Shoe Size'],
    values=['Sale Price', 'Profit Percent'],
    aggfunc=[lambda values: len(values.unique()), 'mean', 'max', 'min', 'median'],
)
average_sale = average_sale.reset_index()
# Keep the distinct-value count for 'Sale Price' only; discard it for 'Profit Percent'.
average_sale = average_sale.drop(('<lambda>', 'Profit Percent'), axis=1)
# average_sale.to_excel('sneaker_size_labels.xlsx')
In [57]:
# Numeric columns carried over unchanged from the cleaned frame.
xc = x.loc[:, ['Sale Price', 'Retail Price', 'Date Difference']].copy()

# Categorical columns to one-hot encode ('Brand' and 'Buyer Region' are left out).
y = x.loc[:, ['Sneaker Name', 'Shoe Size']].copy()
# Sizes are cast to strings so that each size is encoded as its own category.
y['Shoe Size'] = y['Shoe Size'].astype(str)

# One indicator column per category value; the first level is deliberately kept.
g = pd.get_dummies(y, prefix=['Sneaker Name', 'Size'])

# Final modelling frame: numeric columns first, indicator columns after.
n = pd.concat([xc, g], axis='columns')
In [58]:
# Target: the resale price.
n_target = n.loc[:, 'Sale Price']
# Features: everything from the third column of `n` onwards. The first two
# columns are 'Sale Price' and 'Retail Price', so both are excluded.
X = n[n.columns[2:]]
In [59]:
# Registry of evaluation results, keyed by model display name.
score_dict = {}


def score_me(mn, mc, pn):
    """Evaluate a fitted model on the current split and record the results.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    Parameters
    ----------
    mn : str
        Display name of the model; used as the key in ``score_dict``.
    mc : fitted estimator
        Object exposing ``score(X, y)``.
    pn : array-like
        Predictions of ``mc`` for ``X_test``.

    Returns
    -------
    dict
        The metrics dictionary stored under ``score_dict[mn]`` with the keys
        'Train Score', 'Test Score', 'MAE', 'MSE' and 'RMSE'. The same object
        is also bound to the global ``model_name`` so that existing callers
        that add further entries through that name keep working.
    """
    global model_name
    # Compute the MSE once and derive the RMSE from it.
    mse = metrics.mean_squared_error(y_test, pn)
    model_name = {
        'Train Score': mc.score(X_train, y_train),
        'Test Score': mc.score(X_test, y_test),
        'MAE': metrics.mean_absolute_error(y_test, pn),
        'MSE': mse,
        'RMSE': np.sqrt(mse),
    }
    score_dict[mn] = model_name
    return model_name
    
In [60]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, n_target, test_size=0.25, random_state=42
)

# Ordinary least squares model for the sale price.
# The timed span covers fitting, predicting and the scoring done by score_me.
linear_start = time.time()
linear_regressor = LinearRegression().fit(X_train, y_train)
y_pred = linear_regressor.predict(X_test)
score_me(mn='Linear Regression', mc=linear_regressor, pn=y_pred)
linear_end = time.time()
linear_time = linear_end - linear_start
# score_me stored this model's metrics dict under its name; add the run time to it.
score_dict['Linear Regression']['Run Time(secs)'] = round(linear_time, 3)
In [61]:
# Ridge model for the sale price, with a very small L2 penalty.
# The timed span covers fitting, predicting and the scoring done by score_me.
ridge_start = time.time()
ridgelin = Ridge(alpha=1e-4)
ridgelin.fit(X_train, y_train)
ridge_pred = ridgelin.predict(X_test)
score_me(mn='Ridge Regression', mc=ridgelin, pn=ridge_pred)
ridge_end = time.time()
ridge_time = ridge_end - ridge_start
# score_me stored this model's metrics dict under its name; add the run time to it.
score_dict['Ridge Regression']['Run Time(secs)'] = round(ridge_time, 3)
In [62]:
# Lasso model for the sale price, with a very small L1 penalty.
# NOTE(review): in the recorded run this fit emitted a ConvergenceWarning and the
# cell took about 527 s; the hyperparameters are left unchanged here.
lasso_start = time.time()
lassolin = Lasso(alpha=1e-5, max_iter=100_000)
lassolin.fit(X_train, y_train)
lasso_pred = lassolin.predict(X_test)
score_me(mn='Lasso Regression', mc=lassolin, pn=lasso_pred)
lasso_end = time.time()
lasso_time = lasso_end - lasso_start
# score_me stored this model's metrics dict under its name; add the run time to it.
score_dict['Lasso Regression']['Run Time(secs)'] = round(lasso_time, 3)
/Users/oldvasegreenbird/miniconda3/lib/python3.7/site-packages/sklearn/linear_model/_coordinate_descent.py:476: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 230244458.61999825, tolerance: 491388.22208783333
  positive)
In [63]:
# Random forest model for the sale price: 50 trees, seeded for repeatability.
# The timed span covers fitting, predicting and the scoring done by score_me.
rf_start = time.time()
rf = RandomForestRegressor(n_estimators=50, random_state=42).fit(X_train, y_train)
predictions = rf.predict(X_test)
score_me(mn='Random Forest Regression', mc=rf, pn=predictions)
rf_end = time.time()
rf_time = rf_end - rf_start
# score_me stored this model's metrics dict under its name; add the run time to it.
score_dict['Random Forest Regression']['Run Time(secs)'] = round(rf_time, 3)
In [64]:
# Decision tree model for the sale price.
# random_state is fixed, matching the seed used for the train/test split and
# the random forest, so repeated runs build the same tree and report the same scores.
# The timed span covers fitting, predicting and the scoring done by score_me.
dt_start = time.time()
tree = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
tree_predictor = tree.predict(X_test)
score_me('Decision Tree Regression', tree, tree_predictor)
dt_end = time.time()
dt_time = dt_end - dt_start
# score_me stored this model's metrics dict under its name; add the run time to it.
score_dict['Decision Tree Regression']['Run Time(secs)'] = round(dt_time, 3)
In [65]:
# Comparison table built from the results registry: one column per model, one row per metric.
model_results = pd.DataFrame(score_dict)
In [66]:
# Display the comparison table (models as columns, metrics as rows).
model_results
Out[66]:
Linear Regression Ridge Regression Lasso Regression Random Forest Regression Decision Tree Regression
Train Score 0.837779 0.837779 0.837779 0.996295 0.998664
Test Score 0.837084 0.837084 0.837084 0.980792 0.975012
MAE 60.432033 60.432033 60.432031 15.163111 16.986706
MSE 10664.318866 10664.318800 10664.319873 1257.314259 1635.722941
RMSE 103.268189 103.268189 103.268194 35.458627 40.444072
Run Time(secs) 0.227000 0.130000 527.401000 25.100000 0.777000
In [67]:
# Spot check: decision-tree predictions for the first ten rows of the modelling frame.
# NOTE(review): these rows come from the full frame `n`, not from the held-out
# X_test, so some of them were probably part of the training data.
test_1 = n.iloc[:, 2:].head(10)
pd.Series(tree.predict(test_1))
Out[67]:
0    1097.0
1     685.0
2     690.0
3    1075.0
4     828.0
5     798.0
6     784.0
7     466.5
8     465.0
9     465.0
dtype: float64
In [68]:
# Actual sale prices for the same first ten rows, for comparison with the predictions above.
n[['Sale Price']].head(10)
Out[68]:
Sale Price
0 1097
1 685
2 690
3 1075
4 828
5 798
6 784
7 460
8 465
9 465
In [ ]: