#!/usr/bin/env python3
# -*- coding: utf-8 -*-
10_000
(only python 3.6)_
_ = f()
(dangerous with internationnaization)_something
(won't be imported with import *
)list_
__stuff
-> _ClassName__mangled_stuff
__init__
_()
Use
if __name__ == '__main__':
some_computation()
for name, surname in zip(names, surnames):
...
for index, prime in enumerate(primes):
...
False, 0, None, __nonzero__(), __len__()
square1 = lambda x: x ** 2
def square2(x):
return x ** 2
print(square1(5))
print(square2(5))
a is b # a and b are references to the same object
a = 2
b = a
print(a, b)
a = 3
print(a, b)
l1 = [1, 2, 3]
l2 = l1
l3 = l1[:]
print(l1, l2, l3)
l1[1] = 4
print(l1, l2, l3)
def good(a_list=None):
if a_list is None:
a_list = []
a_list.append(1)
return a_list
print(good())
print(good())
print(good([2]))
print(good())
def wrong(a_list=[]): # never use a mutable optionnal argument
a_list.append(1)
return a_list
print(wrong())
print(wrong())
print(wrong([2]))
print(wrong())
from copy import copy
def good(my_set):
for value in copy(my_set):
if 'bert' in value:
my_set.remove(value)
return(my_set)
print("list ok ", good(["einstein", "albert", "bert", "curie"]))
print("set ok ", good({"einstein", "albert", "bert", "curie"}))
def wrong(my_set):
for value in my_set:
if 'bert' in value:
my_set.remove(value)
return(my_set)
print("list Nok ", wrong(["einstein", "albert", "bert", "curie"]))
print("set Nok ", wrong({"einstein", "albert", "bert", "curie"}))
print("END")
try:
print("set Nok ", wrong({"einstein", "albert", "bert", "curie"}))
except RuntimeError as e:
print()
print("Oups, something went wrong:")
print(e)
print("Continuing anyway")
print()
print("I'm continuing")
import sys
class MyException(Exception):
pass
try:
for i in range(10):
for j in range(10):
if i == j == 5:
raise MyException("Found it")
except MyException as e:
print("Out of the loop")
print(e)
print("Stop")
# In a script, use a non-zero return code
# exit(1)
# In jupyter you can do
raise
print("I will never appear")
from functools import wraps
from time import time
def PrintAppel(f):
def before_f():
new_f.NbAppels += 1
print("Entering {}".format(f.__name__))
new_f.tin = time()
def after_f():
new_f.tout = time()
new_f.tcum += new_f.tout - new_f.tin
print("Exiting {}".format(f.__name__))
print("This was the call n° {}".format(new_f.NbAppels))
print("It took {} s".format(new_f.tout - new_f.tin))
print("in average {} s".format(new_f.tcum / new_f.NbAppels))
@wraps(f)
def new_f(*args, **xargs):
before_f()
res = f(*args, **xargs)
after_f()
return res
new_f.NbAppels = 0
new_f.tcum = 0.
return new_f
import numpy as np
from time import sleep
import numpy as np
@PrintAppel
def a_function(x):
np.random.rand()
sleep(np.random.rand())
return 2 * x
res = a_function(2)
print(res)
print(a_function.tcum / a_function.NbAppels)
class egg(object): # All objects derived from the same object "object"
""" Full exemple of a class in python """
total_number = 0 # shared attribut between all instances **DANGER** !
def __init__(self, number=1): # constructor
""" constructor from number """
self.number = number # Good way of defining attributes
egg.total_number += number
@classmethod
def from_recipe(cls, recipe): # Alternative constructor
""" constructor from recipe """
return cls(recipe["oeufs"])
def __del__(self): # destructor (rare)
""" destructor """
egg.total_number -= self.number
def __str__(self): # convert your object into printable string
""" egg to str convertor """
return "On a total of {} eggs, I own {}".format(egg.total_number, self.number)
def how_many(self): # a function of the instance
""" Return the current number of eggs in the recipe """
return self.number
@staticmethod
def how_many_egg(): # a function on the class (rare)
""" Return the total number of eggs for all recipes """
return egg.total_number
fried_egg = egg()
omelette = egg(3)
recipe_pancake = {"oeufs":2, "lait":0.5, "farine":300}
pancake = egg.from_recipe(recipe_pancake)
print("Fried egg : ", fried_egg)
print("Omelette : ", omelette)
print("Pancake : ", pancake)
print()
print("{:<12} : {:>5} | {}".format("egg",
"NaN",
egg.how_many_egg()))
print("{:<12} : {:>5} | {}".format("fried_egg",
fried_egg.how_many(),
fried_egg.how_many_egg()))
print("{:<12} : {:>5} | {}".format("omelette",
omelette.how_many(), omelette.how_many_egg()))
print("{:<12} : {:>5} | {}".format("pancake",
pancake.how_many(),
pancake.how_many_egg()))
del omelette
print("{:<12} : {:>5} | {}".format("egg",
"NaN",
egg.how_many_egg()))
print("{:<12} : {:>5} | {}".format("fried_egg",
fried_egg.how_many(),
fried_egg.how_many_egg()))
print("{:<12} : {:>5} | {}".format("pancake",
pancake.how_many(),
pancake.how_many_egg()))
del fried_egg
del pancake
print()
help(egg)
subprocess.check_call(["cmd", "arg1", "arg2"])
# or in jupyter
!cmd arg1 arg2
data = subprocess.check_output(["cmd", "arg1", "arg2"]).decode('utf-8')
!python3 script.py Marc
import subprocess
import sys
data = subprocess.check_output([sys.executable, "script.py", "Marc"]).decode('utf-8')
print(data)
__init__.py
)def greeting(name: str) -> str:
var = "Hello" # type: str
# python 3.7 : var = "Hello" : str
return var + " " + name
#ONLY for ipython
import ipytest.magics
import pytest
__file__ = '04.advanced.ipynb'
%%run_pytest[clean] -qq
#this was only for ipython
def test_sorted():
assert sorted([5, 1, 4, 2, 3]) == [1, 2, 3, 4, 5]
# as does parametrize
@pytest.mark.parametrize('input, expected', [
([2, 1], [1, 2]),
('zasdqw', list('adqswz')),
]
)
def test_exemples(input, expected):
actual = sorted(input)
assert actual == expected
import logging
import warnings
def prepare_logging():
"""
Prepare all logging facilities
This should be done in a separate module
"""
# if not already done, initialize logging facilities
logging.basicConfig()
# create a logger for the current module
logger = logging.getLogger(__name__)
## ONLY FOR IPYTHON
# clean logger (ipython + multiple call)
from copy import copy
for handler in copy(logger.handlers):
logger.removeHandler(handler)
# Do not propagate message to ipython (or else thy will be printed twice)
logger.propagate=False
## ONLY FOR IPYTHON
# optionnal : change format of the log
logFormatter = logging.Formatter("%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s] %(message)s")
# optionnal : create a handler for file output
fileHandler = logging.FileHandler("{logPath}/{fileName}.log".format(logPath=".", fileName="test"))
# optionnal : create a handler for console output
consoleHandler = logging.StreamHandler()
# optionnal : Apply formatter to both handles
fileHandler.setFormatter(logFormatter)
consoleHandler.setFormatter(logFormatter)
# optionnal : attach handler to the logger
logger.addHandler(fileHandler)
logger.addHandler(consoleHandler)
# what severity to log (default is NOTSET, i.e. all)
logger.setLevel(logging.DEBUG) # ALL
fileHandler.setLevel(logging.INFO) # NO DEBUG
consoleHandler.setLevel(logging.WARNING) # ONLY WARNING AND ERRORS
return logger
def egg():
warnings.warn("A warning only once")
logger = prepare_logging()
egg()
logger.info('Start reading database')
records = {'john': 55, 'tom': 66}
logger.debug('Records: {}'.format(records))
logger.info('Updating records ...')
logger.warning("There is only 2 record !")
logger.info('Saving records ...')
logger.error("Something happend, impossible to save the records")
logger.info('Restoring records ...')
logger.critical("Database corrupted !")
logger.info('End of program')
egg()
%timeit [1 + i for i in range(1,10000)]
%timeit [1 * i for i in range(1,10000)]
%timeit [1 / i for i in range(1,10000)]
%timeit [1 // i for i in range(1,10000)]
%timeit [1. + float(i) for i in range(1,10000)]
%timeit [1. * float(i) for i in range(1,10000)]
%timeit [1. / float(i) for i in range(1,10000)]
%timeit [1. // float(i) for i in range(1,10000)]
import numpy as np
import cProfile
import re
def function2(array):
for i in range(500):
array += 3
array = array * 2
return array
def function1():
array = np.random.randint(500000, size=5000000)
array = function2(array)
return sorted(array)
cProfile.run('function1()', sort="tottime")
# or in jupyter
%prun function1()
Or, for a beautifull call graph of a complex program:
python3 -m cProfile -o profile.pstats script.py
gprof2dot -f pstats profile.pstats | dot -Tpng -o profile.png
from IPython.display import Image
Image(filename='images/profile.png')
Stop optimizing your Python code here (and compile it)
def init_copy(f, g):
"""
Just to be sure that the arguments are constant
"""
return f.copy(), g.copy()
def finish(res):
"""
A dummy function that does nothing
"""
return res
import textwrap
def checker(ref, res):
"""
A function that, given two results, check that they are identical
"""
if (type(ref) is not type(res) or
ref.dtype != res.dtype or
not np.array_equal(res, ref)):
print("Failed")
print("types: ",type(ref), type(res))
print("dtypes: ",ref.dtype, res.dtype)
differ = np.where(np.logical_not(np.isclose(ref,res)))
print(textwrap.dedent("""results:
ref shape: {}
res shape: {}
idx
{}
ref
{}
res
{}""".format(ref.shape,
res.shape,
differ,
ref[differ],
res[differ])
))
return False
return True
def instrument(check=None, setup=init_copy, finish=finish, timing=True):
"""
A decorator that will time a function, and if given,
check it's result with a reference function
setup and finish are part of the function not timed
"""
def _check(function):
def wrapped(*arg, check=check, timing=timing):
# Our result
a, b = setup(*arg)
res = function(a, b)
res = finish(res)
if check is not None:
print("Testing ", function.__name__, " ... ",end="")
# The reference (might not be decorated)
try:
ref = check(*arg, check=None, timing=False)
except TypeError:
ref = check(*arg)
if not checker(ref, res):
raise RuntimeError
else:
print("OK")
if timing:
print("Timing ", function.__name__, " ...")
%timeit function(a, b)
return res
return wrapped
return _check
# Create data
a = np.arange(1,1e6)
b = np.arange(1,1e6)
n = 4
s = 2 * n + 1
g = np.arange(s ** 2, dtype=np.int).reshape((s, s))
N = 200
small_f = np.arange(N * N, dtype=np.int).reshape((N, N))
N = 2000
large_f = np.arange(N * N, dtype=np.int).reshape((N, N))
@instrument()
def py_simple_operations_with_tmparrays_and_loops(a, b):
n = len(a)
c = np.empty_like(a)
for i in range(n):
c[i] = a[i] * b[i]
d = np.empty_like(a)
for i in range(n):
d[i] = 4.1 * a[i]
e = np.empty_like(a)
for i in range(n):
e[i] = c[i] - d[i]
f = np.empty_like(a)
for i in range(n):
f[i] = 2.5 * b[i]
g = np.empty_like(a, dtype=np.bool)
for i in range(n):
g[i] = e[i] > f[i]
return g
py_simple_operations_with_tmparrays_and_loops(a, b);
@instrument(check=py_simple_operations_with_tmparrays_and_loops)
def py_simple_operations_with_loops(a, b):
n = len(a)
g = np.empty_like(a, dtype=np.bool)
for i in range(n):
c = a[i] * b[i]
d = 4.1 * a[i]
e = c - d
f = 2.5 * b[i]
g[i] = e > f
return g
py_simple_operations_with_loops(a, b);
@instrument(check=py_simple_operations_with_loops)
def py_simple_operations(a, b):
return a * b - 4.1 * a > 2.5 * b
py_simple_operations(a, b);
def py_tough_operations(a, b):
return np.sin(a) + np.arcsinh(a / b)
i_py_tough_operations = instrument()(py_tough_operations)
i_py_tough_operations(a, b);
And use it to optimize some things
import dis
print(dis.code_info(py_tough_operations))
print()
print("Code :")
dis.dis(py_tough_operations)
print()
from numpy import sin, arcsinh
# We can avoid the step 0 and 12
def py_tough_operations_with_localsin(a, b):
return sin(a) + arcsinh(a / b)
i_py_tough_operations_with_localsin = instrument()(py_tough_operations_with_localsin)
i_py_tough_operations_with_localsin(a, b);
import dis
print(dis.code_info(py_tough_operations_with_localsin))
print()
print("Code :")
dis.dis(py_tough_operations_with_localsin)
print()
from numpy import sum as npsum
@instrument()
def convolve_python(f, g):
# f is an image and is indexed by (v, w)
# g is a filter kernel and is indexed by (s, t),
# it needs odd dimensions
# h is the output image and is indexed by (x, y),
if g.shape[0] % 2 != 1 or g.shape[1] % 2 != 1:
raise ValueError("Only odd dimensions on filter supported")
# smid and tmid are number of pixels between the center pixel
# and the edge, ie for a 5x5 filter they will be 2.
vmax, wmax = f.shape
smax, tmax = g.shape
smid = smax // 2
tmid = tmax // 2
# Allocate result image.
h = np.zeros_like(f)
# Do convolution
for x in range(smid, vmax - smid):
for y in range(tmid, wmax - tmid):
v1 = x - smid
v2 = v1 + smax
w1 = y - tmid
w2 = w1 + tmax
h[x, y] = npsum(g * f[v1:v2, w1:w2])
return h
convolve_python(small_f, g);
from scipy.signal import convolve2d
def scipy_setup(f, g):
# for some reason, scipy take the filter in reverse...
gr = g[::-1,::-1]
return f.copy(), gr.copy()
@instrument(check=convolve_python, setup=scipy_setup)
def scipy_convolve(f, g):
vmax, wmax = f.shape
smax, tmax = g.shape
smid = smax // 2
tmid = tmax // 2
h = np.zeros_like(f)
h[smid:vmax - smid, tmid:wmax - tmid] = convolve2d(f, g, mode="valid")
return h
scipy_convolve(small_f, g);
scipy_convolve(large_f, g, check=None);
And is multithreaded
import numexpr as ne
@instrument(check=py_simple_operations)
def ne_simple_operations(a, b):
return ne.evaluate('a * b - 4.1 * a > 2.5 * b')
ne_simple_operations(a,b);
@instrument(check=py_tough_operations_with_localsin)
def ne_tough_operations(a, b):
return ne.evaluate("sin(a) + arcsinh(a / b)")
ne_tough_operations(a,b);
And can parallelize part of your code
But it doesn't work everytime
import numba as nb
@instrument(check=ne_simple_operations)
@nb.jit(nopython=True, nogil=True, cache=False, parallel=True)
def nb_simple_operations(a, b):
return a * b - 4.1 * a > 2.5 * b
nb_simple_operations(a, b);
@instrument(check=ne_tough_operations)
@nb.jit(nopython=True, nogil=True, cache=False, parallel=True)
def nb_tough_operations(a, b):
return np.sin(a) + np.arcsinh(a / b)
nb_tough_operations(a, b);
@instrument(check=scipy_convolve)
@nb.jit(nopython=True, nogil=True, cache=False, parallel=True)
def convolve_numba(f, g):
# smid and tmid are number of pixels between the center pixel
# and the edge, ie for a 5x5 filter they will be 2.
vmax, wmax = f.shape
smax, tmax = g.shape
if smax % 2 != 1 or tmax % 2 != 1:
raise ValueError("Only odd dimensions on filter supported")
smid = smax // 2
tmid = tmax // 2
# Allocate result image.
h = np.zeros_like(f)
# Do convolution
for x in range(smid, vmax - smid):
for y in range(tmid, wmax - tmid):
# Calculate pixel value for h at (x,y). Sum one component
# for each pixel (s, t) of the filter g.
value = 0
for s in range(smax):
for t in range(tmax):
v = x - smid + s
w = y - tmid + t
value += g[s, t] * f[v, w]
h[x, y] = value
return h
convolve_numba(small_f, g);
convolve_numba(large_f, g);
# Stencil contains implicit loops
@nb.stencil(standard_indexing=("g",),neighborhood=((-4, 4),(-4, 4)))
def convolve_kernel(f, g):
smax, tmax = g.shape
smid = smax // 2
tmid = tmax // 2
h = 0
for s in range(smax):
for t in range(tmax):
h += g[s, t] * f[s - smid, t - tmid]
return h
@instrument(check=convolve_numba)
@nb.jit(nopython=True, nogil=True, cache=False, parallel=True)
def convolve_numba_with_stencil(f, g):
if g.shape[0] % 2 != 1 or g.shape[1] % 2 != 1:
raise ValueError("Only odd dimensions on filter supported")
return convolve_kernel(f, g).astype(f.dtype)
convolve_numba_with_stencil(small_f, g);
convolve_numba_with_stencil(large_f, g);
But you have to:
%load_ext Cython
%set_env CFLAGS="-Ofast -march=native -fvect-cost-model=cheap" \
"-fopenmp -Wno-unused-variable -Wno-cpp -Wno-maybe-uninitialized"
%%cython
# cython: language_level=3
# cython: initializedcheck=False
# cython: binding=True
# cython: nonecheck=False
# distutils: extra_link_args = -fopenmp
import numpy as np
cimport numpy as np
cimport cython
from cython.parallel cimport parallel, prange
@cython.boundscheck(False) # turn off bounds-checking for entire function
@cython.wraparound(False) # turn off negative index wrapping for entire function
def csimple_operations(const double[::1]& a, const double[::1]& b):
cdef long n = a.shape[0]
cdef long[:] res = np.empty([n], dtype=long)
cdef Py_ssize_t i
for i in prange(n, nogil=True, num_threads=8):
res[i] = a[i] * b[i] - 4.1 * a[i] > 2.5 * b[i]
return np.asarray(res, dtype=bool)
i_csimple_operations = instrument(check=nb_simple_operations)(csimple_operations)
i_csimple_operations(a, b);
%%cython
# cython: language_level=3
# cython: initializedcheck=False
# cython: binding=True
# cython: nonecheck=False
# cython: boundscheck=False
# cython: wraparound=False
# distutils: extra_link_args = -fopenmp
import numpy as np
cimport numpy as np
from libc.math cimport sin, asinh
from cython.parallel cimport parallel, prange
def ctough_operations(const double[::1]& a, const double[::1]& b):
cdef long n = a.shape[0]
cdef double[:] res = np.empty([n], dtype=np.double)
cdef Py_ssize_t i
for i in prange(n, nogil=True, num_threads=8):
res[i] = sin(a[i]) + asinh(a[i] / b[i])
return np.asarray(res)
i_ctough_operations = instrument(check=nb_tough_operations)(ctough_operations)
i_ctough_operations(a, b);
%%cython
# cython: language_level=3
# cython: initializedcheck=False
# cython: binding=True
# cython: nonecheck=False
# cython: boundscheck=False
# cython: wraparound=False
# distutils: extra_link_args = -fopenmp
import numpy as np
cimport numpy as np
from cython.parallel cimport parallel, prange
def convolve_cython(const long[:,::1]& f, const long[:,::1]& g):
cdef long vmax = f.shape[0]
cdef long wmax = f.shape[1]
cdef long smax = g.shape[0]
cdef long tmax = g.shape[1]
# f is an image and is indexed by (v, w)
# g is a filter kernel and is indexed by (s, t),
# it needs odd dimensions
# h is the output image and is indexed by (x, y),
if smax % 2 != 1 or tmax % 2 != 1:
raise ValueError("Only odd dimensions on filter supported")
# smid and tmid are number of pixels between the center pixel
# and the edge, ie for a 5x5 filter they will be 2.
cdef long smid = smax // 2
cdef long tmid = tmax // 2
# Allocate result image.
cdef long[:,::1] h = np.zeros([vmax, wmax], dtype=long)
cdef long value
cdef long x, y, s, t, v, w
# Do convolution
for x in prange(smid, vmax - smid, nogil=True, num_threads=8):
for y in range(tmid, wmax - tmid):
# Calculate pixel value for h at (x,y). Sum one component
# for each pixel (s, t) of the filter g.
value = 0
for s in range(smax):
for t in range(tmax):
v = x - smid + s
w = y - tmid + t
value = value + g[s, t] * f[v, w]
h[x, y] = value
return np.asarray(h)
i_convolve_cython = instrument(check=convolve_numba_with_stencil)(convolve_cython)
i_convolve_cython(small_f, g);
i_convolve_cython(large_f, g);
%%writefile CModule.h
#include <stddef.h>
void convolve_c (const long f[],
const long g[],
long h[],
const size_t vmax,
const size_t wmax,
const size_t smax,
const size_t tmax);
%%writefile CModule.c
#include "CModule.h"
void convolve_c (const long f[],
const long g[],
long h[],
const size_t vmax,
const size_t wmax,
const size_t smax,
const size_t tmax)
{
const size_t smid = smax / 2;
const size_t tmid = tmax / 2;
for(size_t s = 0; s < vmax * wmax; ++s) {
h[s] = 0;
}
// Do convolution
#pragma omp parallel for default(shared) num_threads(8)
for(size_t x = smid; x < vmax - smid; ++x) {
for(size_t y = tmid; y < wmax - tmid; ++y) {
// Calculate pixel value for h at (x,y).
// Sum one component for each pixel (s, t) of the filter g.
long value = 0;
for(size_t s = 0; s < smax; ++s) {
for(size_t t = 0; t < tmax; ++t) {
size_t v = x - smid + s;
size_t w = y - tmid + t;
value = value + g[s*tmax + t] * f[v*wmax + w];
}
}
h[x*wmax + y] = value;
}
}
}
%%cython
# cython: language_level=3
# cython: initializedcheck=False
# cython: binding=True
# cython: nonecheck=False
# cython: boundscheck=False
# cython: wraparound=False
# distutils: extra_link_args = -fopenmp
# distutils: sources = CModule.c
import numpy as np
cimport numpy as np
from cython.parallel cimport parallel, prange
cdef extern from "CModule.h":
long* convolve_c (const long f[],
const long g[],
long h[],
const size_t vmax,
const size_t wmax,
const size_t smax,
const size_t tmax) nogil
def convolve_cython_pure(const long[:,::1]& f, const long[:,::1]& g):
# f is an image and is indexed by (v, w)
# g is a filter kernel and is indexed by (s, t),
# it needs odd dimensions
# h is the output image and is indexed by (x, y),
cdef long vmax = f.shape[0]
cdef long wmax = f.shape[1]
cdef long smax = g.shape[0]
cdef long tmax = g.shape[1]
if smax % 2 != 1 or tmax % 2 != 1:
raise ValueError("Only odd dimensions on filter supported")
cdef long[:,::1] h = np.empty([vmax, wmax], dtype=long)
# Do convolution
with nogil:
convolve_c(&f[0,0],
&g[0,0],
&h[0,0], vmax, wmax, smax, tmax)
return np.asarray(h)
i_convolve_cython_pure = instrument(check=i_convolve_cython)(convolve_cython_pure)
i_convolve_cython_pure(small_f, g);
i_convolve_cython_pure(large_f, g);
!gcc CModule.c main.c -o full_c -Ofast -march=native -fopenmp -fvect-cost-model=cheap
!./full_c
But you have to
%load_ext fortranmagic
%set_env F90FLAGS="-Ofast -march=native -fvect-cost-model=cheap -fopenmp"
%fortran_config --extra="-lgomp" --extra="--noarch"
%%fortran
subroutine fsimple_operations(a, b, c, n)
implicit none
integer(kind=8), intent(in) :: n
double precision,intent(in) :: a(n)
double precision,intent(in) :: b(n)
logical,intent(out) :: c(n)
!$OMP PARALLEL WORKSHARE NUM_THREADS(8)
c = a * b - 4.1 * a > 2.5 * b
!$OMP END PARALLEL WORKSHARE
end subroutine fsimple_operations
@instrument(check=i_csimple_operations)
def i_fsimple_operations(a ,b):
return fsimple_operations(a, b).astype(bool)
i_fsimple_operations(a, b);
%%fortran
subroutine ftough_operations(a, b, c, n)
implicit none
integer(kind=8), intent(in) :: n
double precision,intent(in) :: a(n)
double precision,intent(in) :: b(n)
double precision,intent(out) :: c(n)
!$OMP PARALLEL WORKSHARE NUM_THREADS(8)
c = sin(a) + asinh(a / b)
!$OMP END PARALLEL WORKSHARE
end subroutine ftough_operations
@instrument(check=i_ctough_operations)
def i_ftough_operations(a ,b):
return ftough_operations(a, b)
i_ftough_operations(a, b);
%%fortran
subroutine convolve_fortran(f, g, vmax, wmax, smax, tmax, h, err)
implicit none
integer(kind=8),intent(in) :: vmax,wmax,smax,tmax
integer(kind=8),intent(in) :: f(vmax, wmax), g(smax, tmax)
integer(kind=8),intent(out) :: h(vmax, wmax)
integer(kind=8),intent(out) :: err
integer(kind=8) :: smid,tmid
integer(kind=8) :: x, y
integer(kind=8) :: v1,v2,w1,w2
! f is an image and is indexed by (v, w)
! g is a filter kernel and is indexed by (s, t),
! it needs odd dimensions
! h is the output image and is indexed by (v, w),
err = 0
if (modulo(smax, 2) /= 1 .or. modulo(tmax, 2) /= 1) then
err = 1
return
endif
! smid and tmid are number of pixels between the center pixel
! and the edge, ie for a 5x5 filter they will be 2.
smid = smax / 2
tmid = tmax / 2
h = 0
! Do convolution
! warning : memory layout is different in fortran
! warning : array start at 1 in fortran
!$OMP PARALLEL DO DEFAULT(SHARED) COLLAPSE(1) &
!$OMP PRIVATE(v1,v2,w1,w2) NUM_THREADS(8)
do y = tmid + 1,wmax - tmid
do x = smid + 1,vmax - smid
! Calculate pixel value for h at (x,y). Sum one component
! for each pixel (s, t) of the filter g.
v1 = x - smid
v2 = v1 + smax
w1 = y - tmid
w2 = w1 + tmax
h(x, y) = sum(g(1:smax,1:tmax) * f(v1:v2,w1:w2))
enddo
enddo
!$OMP END PARALLEL DO
return
end subroutine convolve_fortran
def fortran_setup(f, g):
# memory ordering for fortran
ft = np.asfortranarray(f.copy())
gt = np.asfortranarray(g.copy())
return ft, gt
@instrument(check=i_convolve_cython_pure, setup=fortran_setup)
def i_convolve_fortran(f, g):
h, err = convolve_fortran(f, g)
if err:
print(err)
raise ValueError("FORTRAN ERROR ! (Probably : Only odd dimensions on filter supported)")
return h
i_convolve_fortran(small_f, g);
i_convolve_fortran(large_f, g);
%%writefile fortranModule.f90
module fortranmodule
implicit none
contains
subroutine convolve_fortran_pure(f, g, vmax, wmax, smax, tmax, h, err)
implicit none
integer(kind=8),intent(in) :: vmax,wmax,smax,tmax
integer(kind=8),intent(in) :: f(vmax, wmax), g(smax, tmax)
integer(kind=8),intent(out) :: h(vmax, wmax)
integer(kind=8),intent(out) :: err
integer(kind=8) :: smid,tmid
integer(kind=8) :: x, y
integer(kind=8) :: v1,v2,w1,w2
! f is an image and is indexed by (v, w)
! g is a filter kernel and is indexed by (s, t),
! it needs odd dimensions
! h is the output image and is indexed by (v, w),
err = 0
if (modulo(smax, 2) /= 1 .or. modulo(tmax, 2) /= 1) then
err = 1
return
endif
! smid and tmid are number of pixels between the center pixel
! and the edge, ie for a 5x5 filter they will be 2.
smid = smax / 2
tmid = tmax / 2
h = 0
! Do convolution
! warning : memory layout is different in fortran
! warning : array start at 1 in fortran
!$OMP PARALLEL DO DEFAULT(SHARED) COLLAPSE(1) &
!$OMP PRIVATE(v1,v2,w1,w2) NUM_THREADS(8)
do y = tmid + 1,wmax - tmid
do x = smid + 1,vmax - smid
! Calculate pixel value for h at (x,y). Sum one component
! for each pixel (s, t) of the filter g.
v1 = x - smid
v2 = v1 + smax
w1 = y - tmid
w2 = w1 + tmax
h(x, y) = sum(g(1:smax,1:tmax) * f(v1:v2,w1:w2))
enddo
enddo
!$OMP END PARALLEL DO
return
end subroutine convolve_fortran_pure
end module fortranmodule
!gfortran fortranModule.f90 main.f90 -o full_f -Ofast -march=native -fopenmp -fvect-cost-model=cheap
!./full_f
"""
CUDA DOESN'T WORK ON THE VIRTUAL MACHINE
YOU ARE WELCOME TO TRY THIS ON YOU OWN COMPUTER
"""
from string import Template
cuda_src_template = Template("""
// Cuda splitting
#define MTB ${max_threads_per_block}
#define MBP ${max_blocks_per_grid}
// Array size
#define fx ${fx}
#define fy ${fy}
#define gx ${gx}
#define gy ${gy}
// Macro for converting subscripts to linear index:
#define f_INDEX(i, j) (i) * (fy) + (j)
// Macro for converting subscripts to linear index:
#define g_INDEX(i, j) (i) * (gy) + (j)
__global__ void convolve_cuda(long *f, long *g, long *h) {
unsigned int idx = blockIdx.y * MTB * MBP + blockIdx.x * MTB + threadIdx.x;
// Convert the linear index to subscripts:
unsigned int i = idx / fy;
unsigned int j = idx % fy;
long smax = gx;
long tmax = gy;
long smid = smax / 2;
long tmid = tmax / 2;
if (smid <= i && i < fx - smid) {
if (tmid <= j && j < fy - tmid) {
h[f_INDEX(i, j)] = 0.;
for (long s = 0; s < smax; s++)
for (long t = 0; t < tmax; t++)
h[f_INDEX(i, j)] += g[g_INDEX(s, t)] * f[f_INDEX(i + s - smid, j + t - tmid)];
}
}
}
""")
"""
CUDA DOESN'T WORK ON THE VIRTUAL MACHINE
YOU ARE WELCOME TO TRY THIS ON YOU OWN COMPUTER
"""
import skcuda.misc as misc
import pycuda.autoinit
device = pycuda.autoinit.device
max_threads_per_block, _, max_grid_dim = misc.get_dev_attrs(device)
max_blocks_per_grid = max(max_grid_dim)
"""
CUDA DOESN'T WORK ON THE VIRTUAL MACHINE
YOU ARE WELCOME TO TRY THIS ON YOU OWN COMPUTER
"""
from functools import partial
from pycuda.compiler import SourceModule
cuda_src = cuda_src_template.substitute(max_threads_per_block=max_threads_per_block,
max_blocks_per_grid=max_blocks_per_grid,
fx=large_f.shape[0], fy=large_f.shape[1],
gx=g.shape[0], gy=g.shape[1]
)
cuda_module = SourceModule(cuda_src, options= ["-O3", "-use_fast_math", "-default-stream=per-thread"])
print("Compilation OK")
__convolve_cuda = cuda_module.get_function('convolve_cuda')
block_dim, grid_dim = misc.select_block_grid_sizes(device, large_f.shape)
_convolve_cuda = partial(__convolve_cuda,
block=block_dim,
grid=grid_dim)
"""
CUDA DOESN'T WORK ON THE VIRTUAL MACHINE
YOU ARE WELCOME TO TRY THIS ON YOU OWN COMPUTER
"""
import pycuda.gpuarray as gpuarray
def cuda_setup(f, g):
f_gpu = gpuarray.to_gpu(f)
g_gpu = gpuarray.to_gpu(g)
return f_gpu, g_gpu
def cuda_finish(h_gpu):
return h_gpu.get()
@instrument(check=i_convolve_cython_pure)
def convolve_cuda(f, g):
f_gpu, g_gpu = cuda_setup(f, g)
h_gpu = gpuarray.zeros_like(f_gpu)
_convolve_cuda(f_gpu, g_gpu, h_gpu)
return cuda_finish(h_gpu)
convolve_cuda(large_f, g);
"""
CUDA DOESN'T WORK ON THE VIRTUAL MACHINE
YOU ARE WELCOME TO TRY THIS ON YOU OWN COMPUTER
"""
@instrument(check=i_convolve_cython_pure, setup=cuda_setup, finish=cuda_finish)
def convolve_cuda2(f_gpu, g_gpu):
h_gpu = gpuarray.zeros_like(f_gpu)
_convolve_cuda(f_gpu, g_gpu, h_gpu)
return h_gpu
convolve_cuda2(large_f, g);
(values may be different as before)
context | time | comment |
---|---|---|
Python | 1040ms | naive implementation |
Python | 578ms | removing tmparrays |
Python | 4.67ms | using implicit loops |
numexpr | 1.02ms | |
numba | 793us | |
cython | 2.36ms | |
f2py | 1.28ms |
context | time | comment |
---|---|---|
Python | 58.8ms | naive implementation |
Python | 58.6ms | using local sin |
numexpr | 11.7ms | |
numba | 14.2ms | |
cython | 15ms | |
f2py | 12ms |
context | time small case |
time large case |
comment |
---|---|---|---|
Python | 169ms | naive implementation | |
scipy | 6.19ms | 693ms | |
numba | 2.34ms | 253ms | |
numba | 1.50ms | 103ms | using stencil |
cython | 748us | 81.2ms | using numpy datastructure |
cython | 731us | 64.8ms | using c datastructure |
c | 57.2ms | ||
f2py | 707ms | 70ms | |
fortran | 61.9ms | ||
cuda | 42.1ms | including communication | |
cuda | 31.9ms | excluding communication |
import time
import numpy as np
def heavy_fonction(i):
t = np.random.rand() / 10
time.sleep(t)
return i, t
from joblib import Parallel, delayed
if __name__ == "__main__":
tic = time.time()
res = Parallel(n_jobs=-1, backend='threading')(delayed(heavy_fonction)(i) \
for i in range(2000))
tac = time.time()
index, times = np.asarray(res).T
print(tac - tic)
print(times.sum())
from threading import Thread, RLock
N = 2000
N_t = 10
current = 0
nprocs = 8
output_list = np.empty(N)
lock = RLock()
class ThreadJob(Thread):
def run(self):
"""This code will be executed by each thread"""
global current
while current < N:
with lock:
position = current
current += N_t
fin = min(position + N_t + 1, N)
for i in range(position, fin):
j, t = heavy_fonction(i)
output_list[j] = t
if __name__ == "__main__":
# Threads creation
threads = [ThreadJob() for i in range(nprocs)]
tic = time.time()
# Threads starts
for thread in threads:
thread.start()
# Waiting that all thread have finish
for thread in threads:
thread.join()
tac = time.time()
print(tac - tic)
print(output_list.sum())
import multiprocessing as mp
from queue import Empty
def process_job(q,r):
"""This code will be executed by each process"""
while True:
try:
i = q.get(block=False)
r.put(heavy_fonction(i))
except Empty:
if q.empty():
if q.qsize() == 0:
break
if __name__ == "__main__":
# Define an output queue
r = mp.Queue()
# Define an input queue
q = mp.Queue()
for i in range(2000):
q.put(i)
nprocs = 8
# Setup a list of processes that we want to run
processes = [mp.Process(target=process_job, args=(q, r)) for i in range(nprocs)]
tic = time.time()
# Run processes
for p in processes:
p.start()
# Get process results from the output queue
results = np.empty(2000)
for i in range(2000):
j, t = r.get()
results[j] = t
tac = time.time()
# Exit the completed processes
for p in processes:
p.join()
print(tac - tic)
print(results.sum())
The following code read an image in pgm format (ascii) and store it in a 2D list.
For each pixel of the image a kernel get all neighbors (9 counting the pixel itself) and apply a computation.
Analyse the performance of the code, identify bottleneck and try to optimize it.
You can apply your code on the following images :
For reference, the timing on my computer are :
For data/test.pgm
On my computer :
Reading Files
6.67 ms ± 262 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Computing
503 µs ± 5.41 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
My solution
Reading Files
65.3 µs ± 1.58 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
Computing
66.2 µs ± 1.02 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
And, the bigger the image, the bigger the gain !
Part 1
Part 2
Part 3
def get_description(filename):
"""
Read the header part of the file
"""
f = open(filename, 'r')
nline = 0
description = {}
while nline < 3:
line = f.readline()
if line[0] == '#':
continue
nline += 1
if nline == 1:
description['format'] = line.strip()
elif nline == 2:
description['dimension'] = int(line.split()[1]), int(line.split()[0])
elif nline ==3:
description['deep'] = int(line.strip())
f.close()
return description
def get_value(filename, coord):
"""
Get value at coord in an image in the PGM format
The main problem here is that the file have a limited width, and the values are wrapped
Thus, the value at coord 12,32 might be in the 24,6 in the file
"""
description = get_description(filename)
xdim, ydim = description['dimension']
i = coord[0]
j = coord[1]
f = open(filename, 'r', encoding='utf-8')
nline = 0
while nline < 3:
line = f.readline()
if line[0] == '#':
continue
nline += 1
#here we are at coordinate (0,0)
icur, jcur = 0,0
test = True
while(test):
values = f.readline().split()
nvalues = len(values)
if (icur == i):
if (jcur + nvalues > j):
jvalues = j - jcur
value = values[jvalues]
test=False
else:
jcur += nvalues
else:
jcur += nvalues
if (jcur >= ydim):
icur += jcur // ydim
jcur = jcur % ydim
f.close()
return int(value)
def read_values(filename, description):
"""
Read all the values
"""
values = []
for i in range(description['dimension'][0]):
values.append([])
for j in range(description['dimension'][1]):
values[i].append(get_value(filename, (i, j)))
return values
def read_file(filename):
"""
Read an image in the PGM format
"""
# read the header part
description = get_description(filename)
# read the values
values = read_values(filename, description)
return values
def init(files):
"""
Read all files
"""
data = []
for file in files:
data.append(read_file(file))
return data
def get_neighbors(tab, i, j):
"""
Extract from the array the neighbors of a pixel
"""
neigh = []
for jrange in [-1, 0, 1]:
for irange in [-1, 0, 1]:
neigh.append(tab[i + irange][j + jrange])
return neigh
import math
def compute_wtf(neigh):
"""
Apply a reduction operation on the array neigh
"""
value = 1.
for i in range(len(neigh)):
value *= math.exp(neigh[i]) ** (1 / len(neigh))
value = math.log(value)
return float(value)
def kernel(tab):
"""
Apply compute_wtf on each pixel except boundary
"""
xdim = len(tab)
ydim = len(tab[0])
# create the result list
result = []
#1st line contains only 0
result.append([0])
for jrange in range(1, ydim):
result[0].append(0)
for irange in range(1, xdim - 1):
#1st column contains only 0
result.append([0])
# For each pixel inside the image
for jrange in range(1, ydim - 1):
# Extract the neighboring pixels
neigh = get_neighbors(tab, irange, jrange)
# Apply compute_wtf on it
res = compute_wtf(neigh)
# Store the result
result[irange].append(res)
#last colum contains only 0
result[irange].append(0)
#last line contains only 0
result.append([])
for jrange in range(ydim):
result[xdim - 1].append(0)
return result
def job(data):
"""
Apply kernel of each image
"""
results = []
for image in data:
results.append(kernel(image))
return results
%matplotlib notebook
import matplotlib.pyplot as plt
def plot(data):
nimages = len(data)
if nimages > 1:
fig, axes = plt.subplots(nimages, 1)
for image, ax in zip(data, axes):
ax.imshow(image)
else:
plt.figure()
plt.imshow(data[0])
plt.show()
files = ["data/test.pgm"] #,
#"data/test32.pgm",
#"data/brain_604.ascii.pgm",
#"data/apollonian_gasket.ascii.pgm",
#"data/dla.ascii.pgm",
#]
if __name__ == "__main__":
print("Reading Files")
data = init(files)
%timeit init(files)
print("Computing")
result = job(data)
%timeit job(data)
plot(data)
plot(result)
%load_ext Cython
%set_env CFLAGS="-Ofast -march=native -fvect-cost-model=cheap -fopenmp -Wno-unused-variable -Wno-cpp -Wno-maybe-uninitialized"
%%cython --link-args=-fopenmp
# cython: language_level=3
# cython: initializedcheck=False
# cython: binding=True
# cython: nonecheck=False
# cython: boundscheck=False
# cython: wraparound=False
# distutils: extra_link_args = -fopenmp
import numpy as np
cimport numpy as np
cimport cython
from cython.parallel cimport parallel, prange
def ckernel(const double[:,::1] &data, const long nt):
cdef long n = data.shape[0]
cdef long m = data.shape[1]
cdef double[:,::1] res = np.zeros([n, m], dtype=np.double)
cdef double value
cdef long i, j, s, t
with nogil, parallel(num_threads=nt):
for i in prange(1, n - 1):
for j in range(1, m - 1):
value = 0
for s in range(-1, 2):
for t in range(-1, 2):
value += data[i + s, j + t]
res[i, j] += value / 9
return np.asarray(res)
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
from threading import Thread, RLock
from copy import deepcopy
import os
nprocs = os.cpu_count()
result = []
data = []
current = 0
verrou = RLock()
class ThreadJob(Thread):
def run(self):
global current, verrou
"""Code à exécuter pendant l'exécution du thread."""
while current < len(data):
with verrou:
position = current
current += 1
kernel(position)
def get_description(file):
"""
Read the header part of the file
if file is an opened file:
go back the the begining of the file
read the header
leave the file at the end of header (start of values)
else:
call itself with the file openened
(this should be never called)
"""
if isinstance(file, str):
with open(file,'r', encoding="utf-8") as opened_file:
return get_description(opened_file)
# return to begining
file.seek(0)
nline = 0
description = {}
while nline < 3:
line = file.readline()
if line[0] == '#':
continue
nline += 1
if nline == 1:
description['format']=line.strip()
elif nline == 2:
description['dimension']=int(line.split()[1]), int(line.split()[0])
elif nline == 3:
description['deep']=int(line.strip())
return description
def read_values(file, description):
"""
Read all the values directly
The file must be already opened
The values are stored in a numpy array
"""
# pre-allocate the array
nx, ny = description['dimension']
values = np.empty((nx * ny))
i = 0
for line in file:
if line[0] == '#':
continue
vals = line.split()
nvals = len(vals)
values[i:i + nvals] = vals
i += nvals
return values.reshape((nx, ny))
def read_file(filename):
"""
Read an image in the PGM format
Open the file *once*
"""
# open the file once
with open(filename, 'r', encoding="utf-8") as file:
# read the header part
description = get_description(file)
# read the values
values = read_values(file, description)
return values
def kernel(i):
"""
Apply compute_wtf on each pixel except boundary
"""
global data, result, nt_omp, nt_job
if data[i].size < 1000:
result[i] = ckernel(data[i], 1)
else:
result[i] = ckernel(data[i], nt_job)
def job(data):
"""
Apply kernel of each image
"""
global current, nt_job
current = 0
# Création des threads
threads = [ThreadJob() for i in range(nt_job)]
# Lancement des threads
for thread in threads:
thread.start()
# Attend que les threads se terminent
for thread in threads:
thread.join()
def init(files):
"""
Read all files
"""
data = []
for file in files:
data.append(read_file(file))
return data
def plot(data):
nimages = len(data)
if nimages > 1:
fig, axes = plt.subplots(nimages, 1)
for image, ax in zip(data, axes):
ax.imshow(image)
else:
plt.figure()
plt.imshow(data[0])
plt.show()
files = ["data/test.pgm"] #,
#"data/test32.pgm",
#"data/brain_604.ascii.pgm",
#"data/apollonian_gasket.ascii.pgm",
#"data/dla.ascii.pgm",
#]
nt_job = min(nprocs // 2, len(files))
nt_omp = nprocs - nt_job
if __name__ == "__main__":
print("Reading Files")
data = init(files)
%timeit init(files)
print("Computing")
#sort data: biggest images first for better equilibrium in parallel
data = sorted(data, key=np.size, reverse=True)
#prepare result array
result = deepcopy(data)
job(data)
%timeit job(data)
plot(data)
plot(result)