%load_ext watermark
%watermark -d -v -u -t -z -p numpy
Last updated: 05/07/2014 18:39:56 EDT CPython 3.4.1 IPython 2.1.0 numpy 1.8.1
More information about the watermark magic command extension.
Z-score standardization and Min-Max scaling (normalization) are important data pre-processing steps for many machine learning and pattern classification tasks, as well as for various other data analyses.
The popular open-source scikit-learn machine learning library provides handy feature scaling methods that make this task very convenient.
In one of my recent articles, I discussed the importance of feature scaling in more detail; here, I want to see which approach is actually the faster one, which is especially interesting for scaling large datasets: a NumPy bottom-up approach vs. the scikit-learn preprocessing tools...
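Z-score standardization is only mentioned in passing here and is not benchmarked below; purely for reference, a minimal sketch of the two routes for it might look like the following (the helper names numpy_zscore and sci_zscore are hypothetical; both standardize each column to zero mean and unit variance, and scikit-learn's StandardScaler uses the population standard deviation, matching NumPy's default ddof=0).
import numpy as np
from sklearn import preprocessing

def numpy_zscore(X):
    # column-wise standardization: zero mean, unit variance
    return (X - X.mean(axis=0)) / X.std(axis=0)

def sci_zscore(X):
    # StandardScaler standardizes each feature (column) independently
    return preprocessing.StandardScaler().fit_transform(X)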
Min-Max scaling uses the following equation: \begin{equation} X_{norm} = \frac{X - X_{min}}{X_{max}-X_{min}} \end{equation}
which rescales our data to a range between 0 and 1.
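For example, a feature column with the values $1$, $5$, and $10$ (so that $X_{min} = 1$ and $X_{max} = 10$) would be mapped to $0$, $4/9 \approx 0.44$, and $1$, respectively.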
import numpy as np

np.random.seed(123)

# A random 100x2 array with values in the range [0, 100)
# (np.random.rand already returns float64 values drawn from [0, 1))
X = np.random.rand(100, 2)
X *= 100
def numpy_minmax(X):
    # scales based on the global minimum and maximum of the whole array
    xmin = X.min()
    return (X - xmin) / (X.max() - xmin)
from sklearn import preprocessing

def sci_minmax(X):
    # MinMaxScaler rescales each feature (column) to the given range
    minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1), copy=True)
    return minmax_scale.fit_transform(X)
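One subtlety worth noting: MinMaxScaler rescales each feature (column) independently, whereas numpy_minmax above uses the global minimum and maximum of the entire array, so the two results can differ slightly. A column-wise NumPy variant (just a sketch with a hypothetical name, not part of the comparison here) would look like this:
def numpy_minmax_columnwise(X):
    # scale each feature (column) independently, mirroring MinMaxScaler's behavior
    xmin = X.min(axis=0)
    return (X - xmin) / (X.max(axis=0) - xmin)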
%matplotlib inline
from matplotlib import pyplot as plt

sci_mm = sci_minmax(X)
numpy_mm = numpy_minmax(X)

plt.scatter(numpy_mm[:, 0], numpy_mm[:, 1],
            color='g',
            label='NumPy bottom-up',
            alpha=0.5,
            marker='o')

plt.scatter(sci_mm[:, 0], sci_mm[:, 1],
            color='b',
            label='scikit-learn',
            alpha=0.5,
            marker='x')

plt.legend()
plt.grid()
plt.show()
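Since the question raised at the beginning is which of the two approaches is faster, a minimal timing sketch along the following lines could be used; the 10,000 x 2 array size and the X_large name are my own choices here, and no timing numbers are reported.
import timeit

# time both Min-Max implementations on a larger random array
X_large = np.random.rand(10000, 2) * 100

for f in ('numpy_minmax', 'sci_minmax'):
    t = min(timeit.Timer('%s(X_large)' % f,
                         'from __main__ import %s, X_large' % f).repeat(repeat=3, number=100))
    print('%s: %.5f s (best of 3, 100 loops each)' % (f, t))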
timeit
import timeit
from numpy import append as np_append
from numpy import concatenate as np_concatenate
from numpy import hstack as np_hstack
from numpy import vstack as np_vstack
funcs = ('np_append', 'np_concatenate', 'np_hstack', 'np_vstack')
t_append, t_hconc, t_vconc, t_hstack, t_vstack = [], [], [], [], []
orders_5 = [10**i for i in range(1, 5)]

for n in orders_5:
    nxn_dim = np.random.randn(n, n)
    # take the minimum of 5 repetitions to reduce the impact of background noise
    t_vconc.append(min(timeit.Timer('np_concatenate((nxn_dim, nxn_dim))',
            'from __main__ import nxn_dim, np_concatenate').repeat(repeat=5, number=1)))
    t_vstack.append(min(timeit.Timer('np_vstack((nxn_dim, nxn_dim))',
            'from __main__ import nxn_dim, np_vstack').repeat(repeat=5, number=1)))

orders_6 = [10**i for i in range(1, 6)]

for n in orders_6:
    nx1_dim = np.random.randn(n, 1)
    t_append.append(min(timeit.Timer('np_append(nx1_dim, nx1_dim)',
            'from __main__ import nx1_dim, np_append').repeat(repeat=5, number=1)))
    t_hconc.append(min(timeit.Timer('np_concatenate((nx1_dim, nx1_dim), axis=1)',
            'from __main__ import nx1_dim, np_concatenate').repeat(repeat=5, number=1)))
    t_hstack.append(min(timeit.Timer('np_hstack((nx1_dim, nx1_dim))',
            'from __main__ import nx1_dim, np_hstack').repeat(repeat=5, number=1)))
%matplotlib inline
from matplotlib import pyplot as plt
def plot():

    def settings():
        plt.xlim([min(orders_6) / 10, max(orders_6) * 10])
        plt.grid()
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)
        plt.xscale('log')
        plt.yscale('log')
        plt.legend(loc=2, fontsize=14)

    fig = plt.figure(figsize=(15, 8))

    plt.subplot(1, 2, 1)
    plt.plot(orders_5, t_vconc, alpha=0.7, label='np.concatenate((a,b))')
    plt.plot(orders_5, t_vstack, alpha=0.7, label='np.vstack((a,b))')
    plt.xlabel(r'sample size $n$ ($n \times n$ NumPy array)', fontsize=14)
    plt.ylabel('time per computation in seconds', fontsize=14)
    plt.title('Vertical stacking of NumPy arrays (row wise)', fontsize=14)
    settings()

    plt.subplot(1, 2, 2)
    plt.plot(orders_6, t_hconc, alpha=0.7, label='np.concatenate((a,b), axis=1)')
    plt.plot(orders_6, t_hstack, alpha=0.7, label='np.hstack((a,b))')
    plt.plot(orders_6, t_append, alpha=0.7, label='np.append(a,b)')
    plt.xlabel(r'sample size $n$ ($n \times 1$ NumPy array)', fontsize=14)
    plt.ylabel('time per computation in seconds', fontsize=14)
    plt.title('Horizontal stacking of NumPy arrays (column wise)', fontsize=14)
    settings()

    plt.tight_layout()
    plt.show()

plot()
%watermark
05/07/2014 19:18:43
CPython 3.4.1
IPython 2.1.0
compiler   : GCC 4.2.1 (Apple Inc. build 5577)
system     : Darwin
release    : 13.2.0
machine    : x86_64
processor  : i386
CPU cores  : 2
interpreter: 64bit
The plots above indicate that np.concatenate is indeed the faster call for small sample sizes. However, performance really matters for large arrays, and there the other functions catch up as the array size grows. This makes sense, since np.vstack, np.hstack, and np.append are essentially thin convenience wrappers around np.concatenate: their constant call overhead dominates for tiny arrays but becomes negligible once copying the actual data takes over.