Sebastian Raschka
last updated: 05/25/2014



I would be happy to hear your comments and suggestions.
Please feel free to drop me a note via Twitter, email, or Google+.


Day 2 - One Python Benchmark per Day

Calculating sample means



Mean functions

In [1]:
# The statistics module has been added to
# the standard library in Python 3.4

import statistics as stats
import numpy as np

def calc_mean(samples):
    """Return the arithmetic mean of `samples` using only built-ins.

    `samples` must support both `sum()` and `len()`. The count is cast to
    float so that the division is a true division on Python 2 as well.
    """
    total = sum(samples)
    count = float(len(samples))
    return total / count

def np_mean(samples):
    """Return the mean of `samples` as computed by `numpy.mean`.

    In this benchmark the argument is a plain Python list, which is handed
    to NumPy unchanged.
    """
    mean_value = np.mean(samples)
    return mean_value

def np_mean_ary(np_array):
    """Return the mean of `np_array` as computed by `numpy.mean`.

    In this benchmark the argument is already a NumPy array, so this is the
    array counterpart of `np_mean`.
    """
    mean_value = np.mean(np_array)
    return mean_value

def st_mean(samples):
    """Return the mean of `samples` as computed by `statistics.mean`."""
    mean_value = stats.mean(samples)
    return mean_value

def np_convert_and_mean_ary(samples):
    """Convert `samples` with `numpy.array`, then return its `numpy.mean`.

    The conversion is deliberately part of the function so that its cost is
    included whenever this function is timed.
    """
    as_array = np.array(samples)
    return np.mean(as_array)



Verifying that functions work correctly

In [2]:
# Sanity check: every implementation must produce the same mean for the
# same 1000 integers (as a list, and as an equivalent NumPy array).
n = 1000
samples = list(range(n))
samples_array = np.arange(n)

means = [
    st_mean(samples),
    np_mean(samples),
    calc_mean(samples),
    np_mean_ary(samples_array),
    np_convert_and_mean_ary(samples),
]

# Comparing each result with its neighbour is what a chained `==` does.
assert all(left == right for left, right in zip(means, means[1:]))
print('ok')
ok



Timing

In [3]:
import timeit

# Names of the benchmarked functions; each is looked up in __main__ by timeit.
funcs = ['st_mean', 'np_mean', 'calc_mean', 'np_mean_ary', 'np_convert_and_mean_ary']
# Sample sizes: 10, 100, ..., 100000.
orders_n = [10**n for n in range(1, 6)]
# Maps function name -> best timing for each sample size in `orders_n`.
times_n = {f: [] for f in funcs}

for n in orders_n:
    # Build both input flavours once per size so that every function is
    # timed on the input type it is meant to receive.
    list_input = list(range(n))
    array_input = np.arange(n)
    for f in funcs:
        # Only 'np_mean_ary' takes a ready-made array. Rebinding `samples`
        # once and for all here would leak the array into the functions timed
        # afterwards (notably 'np_convert_and_mean_ary', which must pay for
        # the list -> array conversion).
        # `samples` has to be a module-level name because the timeit setup
        # statement imports it from __main__.
        samples = array_input if f == 'np_mean_ary' else list_input
        timer = timeit.Timer('%s(samples)' % f,
                             'from __main__ import %s, samples' % f)
        # Best of 3 repeats; each repeat is the total seconds for 1000 calls,
        # which is numerically the milliseconds per single call.
        times_n[f].append(min(timer.repeat(repeat=3, number=1000)))



Setting up the plot

In [4]:
import platform
import multiprocessing


def print_sysinfo():
    """Print interpreter, NumPy and platform details to stdout.

    Each entry is printed as a label followed by its value; the report ends
    with a run of blank lines to separate it from subsequent output.
    """
    report = (
        ('\nPython version:', platform.python_version()),
        ('NumPy version', np.__version__),
        ('compiler:', platform.python_compiler()),
        # The leading newline starts the second, system-related group.
        ('\nsystem     :', platform.system()),
        ('release    :', platform.release()),
        ('machine    :', platform.machine()),
        ('processor  :', platform.processor()),
        ('interpreter:', platform.architecture()[0]),
        ('CPU count  :', multiprocessing.cpu_count()),
    )
    for label, value in report:
        print(label, value)
    print('\n\n')
In [5]:
%matplotlib inline
In [6]:
import matplotlib.pyplot as plt

def plot_timing():
    """Draw the benchmark results as a log-log line plot and show it.

    Reads the module-level `orders_n` (sample sizes, x-axis) and `times_n`
    (function name -> list of timings, y-axis). One line is drawn per
    benchmarked function, and a caption states how `statistics.mean()`
    compares with converting to an array and calling `numpy.mean()`.
    """
    # (key into times_n, legend text), in drawing order.
    series = (
        ('st_mean', 'statistics.mean()'),
        ('np_mean', 'numpy.mean() on list'),
        ('np_mean_ary', 'numpy.mean() on array'),
        ('calc_mean', 'sum(samples)/len(samples)'),
        ('np_convert_and_mean_ary', 'convert to array then numpy.mean()'),
    )

    plt.rcParams.update({'font.size': 12})
    plt.figure(figsize=(10, 8))

    for func_name, legend_text in series:
        plt.plot(orders_n, times_n[func_name],
                 alpha=0.5, label=legend_text, marker='o', lw=3)

    plt.xlabel('sample size n')
    plt.ylabel('time per computation in milliseconds [ms]')
    plt.legend(loc=2)
    plt.grid()
    plt.xscale('log')
    plt.yscale('log')
    plt.title('Performance of different approaches for calculating sample means')

    # Per-sample-size ratio of statistics.mean() time to convert+numpy time.
    speedups = [
        stat_time / conv_time
        for stat_time, conv_time in zip(times_n['st_mean'],
                                        times_n['np_convert_and_mean_ary'])
    ]
    max_perf = max(speedups)
    min_perf = min(speedups)

    caption = (
        'Converting a list to a numpy array and then using numpy.mean() \n is {:.2f}x to '
        '{:.2f}x faster than statistics.mean() on lists'
    )
    ftext = caption.format(min_perf, max_perf)
    plt.figtext(.14, .15, ftext, fontsize=11, ha='left')

    plt.show()



Results

In [7]:
# Report the environment the benchmark ran in, then draw the timing plot.
print_sysinfo()
plot_timing()
('\nPython version:', '2.7.5')
('NumPy version', '1.6.2')
('compiler:', 'GCC 4.2.1 Compatible Apple LLVM 5.0 (clang-500.0.68)')
('\nsystem     :', 'Darwin')
('release    :', '13.4.0')
('machine    :', 'x86_64')
('processor  :', 'i386')
('interpreter:', '64bit')
('CPU count  :', 8)



In [7]: