#!/usr/bin/env python
# coding: utf-8

# # Top 10 Statistics Mistakes Made by Data Scientists - Examples Code
#
# Blog post available at https://github.com/d6t/d6t-python/blob/master/blogs/top10-mistakes-statistics.md

# In[1]:


# calc
import pandas as pd
import numpy as np
import scipy.stats

# viz
import seaborn as sns
from matplotlib import pyplot as plt

# ML
import sklearn.model_selection
import sklearn.metrics
from sklearn.metrics import mean_squared_error as sklmse


# In[2]:


# helpers
def print2(a, b, ta=None, tb=None):
    """Print two values rounded to 3 decimals, each preceded by its label.

    Args:
        a: first value (anything ``np.round`` accepts).
        b: second value (anything ``np.round`` accepts).
        ta: label printed before ``a``.
        tb: label printed before ``b``.
    """
    print(ta, np.round(a, 3), tb, np.round(b, 3))


def CVTestMean(model, dfg):
    """Return the mean 10-fold cross-validated test MSE of ``model`` on ``dfg``.

    Every column of ``dfg`` except the last is used as a feature; the last
    column is the target.

    Args:
        model: estimator passed to ``sklearn.model_selection.cross_validate``.
        dfg: table indexed with ``.iloc`` (features first, target last).

    Returns:
        Mean of the per-fold test mean squared errors.
    """
    cv_result = sklearn.model_selection.cross_validate(
        model,
        dfg.iloc[:, :-1],
        dfg.iloc[:, -1],
        cv=10,
        scoring='neg_mean_squared_error',
    )
    # The scorer returns negated MSE, so flip the sign back.
    return -cv_result['test_score'].mean()


def mse(model, dfg):
    """Return the MSE of an already fitted ``model`` on ``dfg``.

    Every column of ``dfg`` except the last is used as a feature; the last
    column is the target.

    Args:
        model: fitted estimator exposing ``predict``.
        dfg: table indexed with ``.iloc`` (features first, target last).

    Returns:
        Mean squared error between the target column and the predictions.
    """
    y_true = dfg.iloc[:, -1]
    y_pred = model.predict(dfg.iloc[:, :-1])
    # mean_squared_error expects (y_true, y_pred); the value is the same
    # either way for MSE, but keep the documented order.
    return sklearn.metrics.mean_squared_error(y_true, y_pred)


# ## Load data
#
# The code uses the workflow management library [d6tflow](https://github.com/d6t/d6tflow), and the data is shared with the dataset management library [d6tpipe](https://github.com/d6t/d6tpipe).
# In[3]:


import d6tflow.pipes

# Initialise the shared pipe and pull the example data.
d6tflow.pipes.init('top10-mistakes-stats')
d6tflow.pipes.get_pipe().pull()


# In[4]:


from pathlib import Path

import cfg
import tasks


# ## Example 1

# In[5]:


# objective function
from sklearn.metrics import f1_score, accuracy_score

y_true = [0, 0, 0, 0, 0]
y_pred = [0, 1, 0, 1, 0]

# NOTE(review): the original wrote `np.tile(y_true, 10) + [1]` and
# `np.tile(y_pred, 10) + [0]`. On an ndarray `+` broadcasts, so these add 1
# (resp. 0) to every element instead of appending one sample. The resulting
# values are kept as they were, with the arithmetic spelled out explicitly;
# switch to np.append if appending a single observation was the intent.
y_true = np.tile(y_true, 10) + 1
y_pred = np.tile(y_pred, 10)

print2(f1_score(y_true, y_pred), accuracy_score(y_true, y_pred), 'f1', 'accuracy')


# ## Example 2

# In[6]:


df = tasks.DataRegression().outputLoad()

# Long format: one row per (y, variable, value) so each x gets its own panel.
dfp = df.melt(value_vars=['x1', 'x2'], id_vars='y')
sns.lmplot(x="value", y="y", col="variable", data=dfp)

# savefig does not create missing directories.
Path('reports').mkdir(parents=True, exist_ok=True)
plt.savefig('reports/top10-stats-example2.png')


# ## Example 3

# In[7]:


df = tasks.DataOutliers().outputLoad()
m1, m2 = tasks.ModelOutliers().outputLoad()

dfp = df.melt(value_vars=['x1', 'x2'], id_vars='y')
sns.lmplot(x="value", y="y", col="variable", data=dfp)

Path('reports').mkdir(parents=True, exist_ok=True)
plt.savefig('reports/top10-stats-example3.png')

print2(m1.coef_[0], m2.coef_[0], 'slope x1 w/o outlier', 'slope w/ outlier')


# ## Example 4

# In[8]:


df_ts = tasks.DataTS().outputLoad()
m1, m2 = tasks.ModelTS().outputLoad()

print('ols CV mse', round(CVTestMean(m1, df_ts), 3))
print('rf CV mse', round(CVTestMean(m2, df_ts), 3))

# Naive benchmark: predict each observation with the previous value of y.
y_actual = df_ts.iloc[1:, -1]
y_previous = df_ts['y'].shift().dropna()
print('last out-sample mse', round(sklmse(y_actual, y_previous), 3))


# ## Example 5

# In[9]:


df = tasks.DataRegression().outputLoad()
m1, m2 = tasks.OLSvsRF().outputLoad()

print('in-sample')
print2(mse(m2, df), mse(m1, df), 'rf mse', 'ols mse')

print('out-sample')
print2(CVTestMean(m2, df), CVTestMean(m1, df), 'rf mse', 'ols mse')


# ## Example 6

# In[10]:


df = tasks.DataRegression().outputLoad()
m1, m2 = tasks.OLSvsRF().outputLoad()

# mixing training and test data
df_insample = pd.DataFrame(scipy.stats.zscore(df.copy()))
# everything just became training data!

# better: standardise inside each fold so train and test are scaled separately
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
test_error = []
for train_index, test_index in sklearn.model_selection.KFold(n_splits=10).split(df):
    X_train = scipy.stats.zscore(X.iloc[train_index])
    X_test = scipy.stats.zscore(X.iloc[test_index])
    y_train = scipy.stats.zscore(y.iloc[train_index])
    y_test = scipy.stats.zscore(y.iloc[test_index])
    m1.fit(X_train, y_train)
    test_error.append(sklmse(y_test, m1.predict(X_test)))

# => distributional properties haven't changed
print2(CVTestMean(m1, df_insample), np.mean(test_error),
       'mixed out-sample CV mse', 'true out-sample CV mse')


# ## Example 7

# In[11]:


# Imported here explicitly: the file-level imports only load
# sklearn.model_selection and sklearn.metrics.
import sklearn.ensemble
import sklearn.linear_model

df_ts = tasks.DataTS().outputLoad().reset_index(drop=True).sort_index()
# data for two entities which are highly correlated, in this case identical
df_ts = pd.concat([df_ts, df_ts])

# default CV
m1 = sklearn.linear_model.LinearRegression()
m2 = sklearn.ensemble.RandomForestRegressor(n_estimators=10)
print('normal CV')
print2(CVTestMean(m1, df_ts), CVTestMean(m2, df_ts), 'ols', 'rf')

# roll-forward testing
# The original loop read `df`, i.e. whatever an earlier cell left behind
# (the regression data), not the time-series panel built above. Use the panel,
# ordered by its time index (stable sort keeps the two entities adjacent) so a
# training window never contains rows later than the row being predicted.
# NOTE(review): confirm the time-ordered panel is the intended roll-forward input.
df_roll = df_ts.sort_index(kind='mergesort')

mroll = sklearn.linear_model.LinearRegression()
mroll2 = sklearn.ensemble.RandomForestRegressor(n_estimators=10)
pred_ols = []
pred_rf = []

# presumably cfg.nobs is the number of observations per dataset -- verify in cfg
start = cfg.nobs // 3
for i in range(start, df_roll.shape[0]):
    # Train on rows [0, i-1): row i-1 is left out, as in the original slice.
    x_train = df_roll.iloc[:i - 1, :-1]
    y_train = df_roll.iloc[:i - 1, -1]
    # Keep the test row as a one-row DataFrame so predict gets 2-D input
    # with the same column names used for fitting.
    x_test = df_roll.iloc[[i], :-1]
    mroll.fit(x_train, y_train)
    mroll2.fit(x_train, y_train)
    pred_ols.append(mroll.predict(x_test)[0])
    pred_rf.append(mroll2.predict(x_test)[0])

y_os_true = df_roll.iloc[start:, -1]
print('true out-sample error')
print2(sklmse(y_os_true, np.array(pred_ols)), sklmse(y_os_true, np.array(pred_rf)), 'ols', 'rf')


# ## Example 8

# In[12]:


df1 = tasks.DataRegression().outputLoad()
m1, m2 = tasks.OLSvsRF().outputLoad()

print('first dataset')
print2(CVTestMean(m2, df1), CVTestMean(m1, df1), 'rf mse', 'ols mse')

print('new dataset')
params = {'random_state': 10, 'noise': 20}
# Generate the regression data with the new parameters before loading it.
tasks.DataRegression(**params).run()
df2 = tasks.DataRegression(**params).outputLoad()
print2(CVTestMean(m2, df2), CVTestMean(m1, df2), 'rf mse', 'ols mse')


# In[ ]: