#!/usr/bin/env python
# coding: utf-8

# In[1]:


# Select the "nbagg" matplotlib backend for the figures created in this notebook.
get_ipython().run_line_magic('matplotlib', 'nbagg')


# In[2]:


import os
os.environ["PYOPENCL_COMPILER_OUTPUT"]="1"
import numpy
import fabio
import pyopencl
from pyopencl import array as cla
from matplotlib.pyplot import subplots


# In[3]:


# Create the OpenCL context (interactive=True asks pyopencl to prompt for the
# platform/device) and a command queue with profiling enabled, so that kernel
# events expose the start/end timestamps read by the timing printouts below.
# NOTE(review): the interactive prompt blocks an unattended "Run All".
ctx = pyopencl.create_some_context(interactive=True)
queue = pyopencl.CommandQueue(ctx, properties=pyopencl.command_queue_properties.PROFILING_ENABLE)
# Bare expression: shows the selected context as the cell output.
ctx


# In[4]:


# Load the test frame with fabio and keep its pixel array.
# NOTE(review): hard-coded absolute path on the author's machine; it has to be
# adapted before this cell can run elsewhere.
image = fabio.open("/users/kieffer/workspace-400/tmp/pyFAI/test/testimages/Pilatus6M.cbf").data
# int8 array holding 1 where the pixel value is negative, 0 elsewhere.
# It is not referenced again in the cells visible in this file.
mask = (image<0).astype("int8")


# In[5]:


# Preview of the frame with intensities clipped to [0, 100]; `ax` is reused by
# later cells to overlay the detected peak positions.
fig, ax = subplots()
ax.imshow(image.clip(0,100))


# In[6]:


# Load the pyopencl IPython extension, which provides the `cl_kernel` cell magic
# used by the next cell.
get_ipython().run_line_magic('load_ext', 'pyopencl.ipython_ext')


# In[10]:


# OpenCL program compiled through the `cl_kernel` cell magic. It defines two
# kernels that are called from the Python cells below:
#   * spot_finder:        1D launch (one work-item per column) walking down the
#                         rows with a rolling cache of image lines in local memory.
#   * simple_spot_finder: 2D launch (one work-item per pixel), no caching.
# Both record the flattened index (col + width*row) of every pixel that exceeds
# mean + threshold*std of its window and whose window centroid lies within
# `radius` of the window centre.
#
# Fixes compared to the previous revision of this cell:
#   * read_and_store skipped image row 0 (`row>0` instead of `row>=0`).
#   * spot_finder zeroed local_high without a bounds check.
#   * both kernels dropped *all* hits of a workgroup when the hit counter reached
#     local_size; the count is now clamped to the buffer capacity instead.
#   * simple_spot_finder copied entries beyond the reserved range into `high`
#     (overwriting results of other workgroups); the copy is now bounded.
get_ipython().run_cell_magic('cl_kernel', '', '''
//read without caching
float inline read_simple(global int *img,
                     int height,
                     int width,
                     int row,
                     int col){
    //Returns the pixel (row, col) as float without active caching.
    //NAN is returned outside of the image and for negative pixel values.
    float value = NAN;

    // Read
    if ((col>=0) && (col<width) && (row>=0) && (row<height)){
        int read_pos = col + row*width;
        value = (float)img[read_pos];
        if (value<0){
            value = NAN;
        }
    }
    return value;
}


void inline read_and_store(global int *img,
                     int height,
                     int width,
                     int row,
                     int col,
                     int half_wind_height,
                     int half_wind_width,
                     local float* storage){
    //Reads the pixel (row, col) and stores it in the local storage.
    //NAN is stored outside of the image and for negative pixel values.
    int line_size, write_pos, idx_line;
    float value = NAN;

    // Read: row 0 is a valid image line, hence row>=0 (same test as read_simple)
    if ((col>=0) && (col<width) && (row>=0) && (row<height)){
        int read_pos = col + row*width;
        value = (float)img[read_pos];
        if (value<0){
            value = NAN;
        }
    }
    // Save locally: the storage is a ring of (2*half_wind_height+1) lines,
    // each line holds the columns of the workgroup plus half_wind_width on both sides
    if ((col>=-half_wind_width) && (col<=width+half_wind_width) && (row>-half_wind_height) && (row<=height+half_wind_height)){
        line_size = get_local_size(0) + 2 * half_wind_width;
        idx_line = (half_wind_height+row)%(2*half_wind_height+1);
        write_pos = line_size*idx_line + half_wind_width + col - get_group_id(0)*get_local_size(0);
        storage[write_pos] = value;
    }
}

//Store a complete line: every work-item loads its own column, the first
//half_wind_width work-items additionally load the left and right margins
void inline store_line(global int *img,
                       int height,
                       int width,
                       int row,
                       int half_wind_height,
                       int half_wind_width,
                       local float* storage){
        read_and_store(img, height, width,
                       row, get_global_id(0),
                       half_wind_height, half_wind_width, storage);
        if (get_local_id(0)<half_wind_width){
            // read_and_store_left
            read_and_store(img, height, width,
                           row, get_group_id(0)*get_local_size(0)-half_wind_width+get_local_id(0),
                           half_wind_height, half_wind_width, storage);
            //read_and_store_right
            read_and_store(img, height, width,
                           row, (get_group_id(0)+1)*get_local_size(0)+get_local_id(0),
                           half_wind_height, half_wind_width, storage);
        }
}

//Read a value previously stored with read_and_store, NAN when out of the cached range
float read_back( int height,
                 int width,
                 int row,
                 int col,
                 int half_wind_height,
                 int half_wind_width,
                 local float* storage){
    float value=NAN;
    int write_pos, line_size, idx_line;
    if ((col>=-half_wind_width) && (col<=width+half_wind_width) && (row>-half_wind_height) && (row<=height+half_wind_height)){
        line_size = get_local_size(0) + 2 * half_wind_width;
        idx_line = (half_wind_height+row)%(2*half_wind_height+1);
        write_pos = line_size*idx_line + half_wind_width + col - get_group_id(0)*get_local_size(0);
        value = storage[write_pos];
    }
    return value;
}

// workgroup size of kernel: 32 to 128, cache_read needs to be (wg+2*half_wind_width)*(2*half_wind_height+1)*sizeof(float)
// local_high needs to hold local_size integers
kernel void spot_finder(global int *img,
                               int height,
                               int width,
                               int half_wind_height,
                               int half_wind_width,
                               float threshold,
                               float radius,
                        global int *cnt_high, //output
                        global int *high,     //output
                               int high_size,
                        local  float *cache_read,
                        local  int *local_high,
                               int local_size){
    //declaration of variables
    int col, row, cnt, i, j, where;
    float value, sum, std, centroid_r, centroid_c, dist, mean;
    col = get_global_id(0);

    local int local_cnt_high[1];
    local_cnt_high[0] = 0;
    //reset the local output buffer without writing past local_size
    for (i=get_local_id(0); i<local_size; i+=get_local_size(0)){
        local_high[i] = 0;
    }

    row=0;

    //pre-load data for the first line
    for (i=-half_wind_height; i<half_wind_height; i++){
        store_line(img, height, width, row+i, half_wind_height, half_wind_width, cache_read);
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    //loop within a column
    for (row=0;row<height; row++){
        //read data
        store_line(img, height, width, row+half_wind_height, half_wind_height, half_wind_width, cache_read);
        barrier(CLK_LOCAL_MEM_FENCE);
        //calculate mean
        sum = 0.0f;
        centroid_r = 0.0f;
        centroid_c = 0.0f;
        cnt = 0;
        for (i=-half_wind_height; i<=half_wind_height; i++){
            for (j=-half_wind_width; j<=half_wind_width; j++){
                value = read_back(height, width, row+i, col+j, half_wind_height, half_wind_width, cache_read);
                if (isfinite(value)){
                    sum += value;
                    centroid_r += value*i;
                    centroid_c += value*j;
                    cnt += 1;
                }
            }
        }
        if (cnt){
            mean = sum/cnt;
            dist = sum*radius;
            if ((fabs(centroid_r)<dist) && (fabs(centroid_c)<dist)){
                // calculate std
                sum = 0.0f;
                for (i=-half_wind_height; i<=half_wind_height; i++){
                    for (j=-half_wind_width; j<=half_wind_width; j++){
                        value = read_back(height, width, row+i, col+j, half_wind_height, half_wind_width, cache_read);
                        if (isfinite(value)){
                            sum += pown(mean-value,2);
                        }
                    }
                }
                std = sqrt(sum/cnt);
                value = read_back(height, width, row, col, half_wind_height, half_wind_width, cache_read);
                if ((value-mean)>threshold*std){
                    where = atomic_inc(local_cnt_high);
                    if (where<local_size){
                        local_high[where] = col+width*row;
                    }
                } // if intense signal
            } // if properly centered
        } // if patch not empty
        //the oldest cached line is overwritten at the next iteration: wait for all readers
        barrier(CLK_LOCAL_MEM_FENCE);
    } //for row

    //Store the results in global memory
    barrier(CLK_LOCAL_MEM_FENCE);
    if (get_local_id(0) == 0) {
        //local_high holds at most local_size hits: clamp instead of dropping everything
        cnt = min(local_cnt_high[0], local_size);
        if (cnt>0) {
            where = atomic_add(cnt_high, cnt);
            if (where+cnt>high_size){
                cnt = high_size-where; //store what we can
            }
            for (i=0; i<cnt; i++){
                high[where+i] = local_high[i];
            }
        }
    }//store results
} //kernel

// 2D kernel, one work-item per pixel, without caching of the reads
// local_high needs to hold local_size integers
kernel void simple_spot_finder(global int *img,
                               int height,
                               int width,
                               int half_wind_height,
                               int half_wind_width,
                               float threshold,
                               float radius,
                        global int *cnt_high, //output
                        global int *high,     //output
                               int high_size,
                        local  int *local_high,
                               int local_size){
    //declaration of variables
    int col, row, cnt, i, j, where, tid, blocksize;
    float value, sum, std, centroid_r, centroid_c, dist, mean, M2, delta, delta2, target_value;
    col = get_global_id(0);
    row = get_global_id(1);

    //Initialization of output array in shared
    //local_cnt_high[0]: number of hits of the workgroup, local_cnt_high[1]: offset in the global output
    local int local_cnt_high[2];
    blocksize = get_local_size(0) * get_local_size(1);
    tid = get_local_id(0) + get_local_id(1) * get_local_size(0);
    if (tid < 2){
        local_cnt_high[tid] = 0;
    }

    for (i=tid; i<local_size; i+=blocksize){
        local_high[i] = 0;
    }
    barrier(CLK_LOCAL_MEM_FENCE);


    //Calculate mean + std + centroids in a single pass (running mean and M2)
    mean = 0.0f;
    M2 = 0.0f;
    centroid_r = 0.0f;
    centroid_c = 0.0f;
    cnt = 0;

    for (i=-half_wind_height; i<=half_wind_height; i++){
        for (j=-half_wind_width; j<=half_wind_width; j++){
            value = read_simple(img, height, width, row+i, col+j);
            if (isfinite(value)){
                centroid_r += value*i;
                centroid_c += value*j;
                cnt += 1;
                delta = value - mean;
                mean += delta / cnt;
                delta2 = value - mean;
                M2 += delta * delta2;
            }
        }
    }
    if (cnt){
        dist = mean*radius*cnt;
        std = sqrt(M2 / cnt);
        target_value = read_simple(img, height, width, row, col);
        if (((target_value-mean)>threshold*std) && (fabs(centroid_r)<dist) && (fabs(centroid_c)<dist)){
                where = atomic_inc(local_cnt_high);
                if (where<local_size){
                    local_high[where] = col+width*row;
                }
        } // if intense signal properly centered
    } // if patch not empty

    //Store the results in global memory
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid==0) {
        //local_high holds at most local_size hits: clamp instead of dropping everything
        cnt = min(local_cnt_high[0], local_size);
        where = 0;
        if (cnt>0) {
            where = atomic_add(cnt_high, cnt);
            if (where+cnt>high_size){
                cnt = high_size-where; //store what we can
            }
        }
        //always publish the number of entries to copy (possibly <=0) and the offset
        local_cnt_high[0] = cnt;
        local_cnt_high[1] = where;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    //copy the data from local to global memory, only the reserved range is written
    cnt = local_cnt_high[0];
    where = local_cnt_high[1];
    for (i=tid; i<cnt; i+=blocksize){
        high[where+i] = local_high[i];
    }//store results
} //kernel
''')


# In[11]:


def peak_count(img,
               window=3,
               threshold=3.0,
               radius=1.0,
               workgroup=32,
               array_size=10000):
    """Find intense pixels with the `spot_finder` OpenCL kernel (cached reads).

    The kernel is launched in 1D with one work-item per image column.

    :param img: 2D image (height, width); it is converted to a C-contiguous
                int32 array because the kernel reads `global int *`.
    :param window: half-size of the window, used for both height and width.
                   The kernel loads the side margins with the first `window`
                   work-items of a group, so it should not exceed `workgroup`.
    :param threshold: a pixel is kept when (value - mean) > threshold * std
                      of its window.
    :param radius: tolerance passed to the kernel for the centroid test.
    :param workgroup: number of work-items per workgroup.
    :param array_size: capacity (in entries) of the global output buffer.
    :return: int32 array of flattened pixel indices (col + width*row), at most
             `array_size` long.
    """
    # Capacity (in entries) of the per-workgroup output buffer in local memory.
    local_size = 1024
    int_bytes = numpy.dtype(numpy.int32).itemsize

    # Upload the image given as argument (not the notebook-level global).
    img = numpy.ascontiguousarray(img, dtype=numpy.int32)
    height, width = img.shape
    img_d = cla.to_device(queue, img)
    high_d = cla.zeros(queue, (array_size,), dtype=numpy.int32)
    high_cnt_d = cla.zeros(queue, (1,), dtype=numpy.int32)
    # Rolling cache: (2*window+1) lines of (workgroup + 2*window) float32.
    read_cache = pyopencl.LocalMemory(4*(workgroup+2*window)*(2*window+1))
    write_cache = pyopencl.LocalMemory(int_bytes*local_size)
    # Round the width up to a multiple of the workgroup size (valid for any
    # workgroup size, not only powers of two).
    size = ((width + workgroup - 1) // workgroup) * workgroup
    ev = spot_finder(queue, (size,), (workgroup,),
                     img_d.data,
                     numpy.int32(height),
                     numpy.int32(width),
                     numpy.int32(window),
                     numpy.int32(window),
                     numpy.float32(threshold),
                     numpy.float32(radius),
                     high_cnt_d.data,
                     high_d.data,
                     numpy.int32(array_size),
                     read_cache,
                     write_cache,
                     numpy.int32(local_size))
    # The counter may exceed array_size; slicing below clamps to what was stored.
    size = high_cnt_d.get()[0]
    print("found %i peaks in %.3fms"%(size, (ev.profile.end-ev.profile.start)*1e-6))
    return high_d.get()[:size]
# Time the cached kernel on the loaded frame; `raw` holds flattened pixel indices.
get_ipython().run_line_magic('time', 'raw = peak_count(image, window=5, threshold=6)')
# Split the flattened indices into column (x) and row (y) using the image width.
x=raw%image.shape[-1]
y=raw//image.shape[-1]
# Overlay the detected positions as white dots on the preview figure.
ax.plot(x,y,".w")


# In[12]:


def simple_peak_count(img,
               window=3,
               threshold=3.0,
               radius=1.0,
               workgroup=32,
               array_size=10000):
    """Find intense pixels with the `simple_spot_finder` OpenCL kernel (no cache).

    The kernel is launched in 2D with one work-item per pixel and square
    workgroups of `workgroup` x `workgroup` work-items.

    :param img: 2D image (height, width); it is converted to a C-contiguous
                int32 array because the kernel reads `global int *`.
    :param window: half-size of the window, used for both height and width.
    :param threshold: a pixel is kept when (value - mean) > threshold * std
                      of its window.
    :param radius: tolerance passed to the kernel for the centroid test.
    :param workgroup: edge of the square workgroup. NOTE: the default gives
                      32*32 = 1024 work-items per group, which may exceed the
                      limit of some devices.
    :param array_size: capacity (in entries) of the global output buffer.
    :return: int32 array of flattened pixel indices (col + width*row), at most
             `array_size` long.
    """
    # Capacity (in entries) of the per-workgroup output buffer in local memory.
    local_size = 1024
    int_bytes = numpy.dtype(numpy.int32).itemsize

    # Upload the image given as argument (not the notebook-level global).
    img = numpy.ascontiguousarray(img, dtype=numpy.int32)
    height, width = img.shape
    img_d = cla.to_device(queue, img)
    high_d = cla.zeros(queue, (array_size,), dtype=numpy.int32)
    high_cnt_d = cla.zeros(queue, (1,), dtype=numpy.int32)
    write_cache = pyopencl.LocalMemory(int_bytes*local_size)
    # Round both dimensions up to a multiple of the workgroup edge (valid for
    # any workgroup size, not only powers of two).
    size_w = ((width + workgroup - 1) // workgroup) * workgroup
    size_h = ((height + workgroup - 1) // workgroup) * workgroup
    ev = simple_spot_finder(queue, (size_w, size_h), (workgroup, workgroup),
                            img_d.data,
                            numpy.int32(height),
                            numpy.int32(width),
                            numpy.int32(window),
                            numpy.int32(window),
                            numpy.float32(threshold),
                            numpy.float32(radius),
                            high_cnt_d.data,
                            high_d.data,
                            numpy.int32(array_size),
                            write_cache,
                            numpy.int32(local_size))
    # The counter may exceed array_size; slicing below clamps to what was stored.
    size = high_cnt_d.get()[0]
    print("found %i peaks in %.3fms"%(size, (ev.profile.end-ev.profile.start)*1e-6))
    return high_d.get()[:size]
# Time the non-cached kernel on the loaded frame; `raw` holds flattened pixel indices.
get_ipython().run_line_magic('time', 'raw = simple_peak_count(image, window=5, threshold=6)')
# Split the flattened indices into column (x) and row (y) using the image width.
x=raw%image.shape[-1]
y=raw//image.shape[-1]
# Overlay the detected positions as yellow dots on the preview figure.
ax.plot(x,y,".y")


# In[43]:


# Work on scan
from math import log2
n = 32
ary = numpy.ones(n)
ary


# In[44]:


# Running sum of `ary` computed in log2(n) passes over two ping-pong buffers:
# at each pass every element at index >= start adds the element located
# `start` positions before it, and `start` doubles from one pass to the next.
ary1 = ary.copy()
ary2 = numpy.empty_like(ary)

for i in range(int(log2(n))):
    start = 2**i
    print(i,start)
    for j in range(n):
        # The first `start` elements are already final: plain copy.
        ary2[j] = ary1[j] if j < start else ary1[j] + ary1[j-start]
    # The freshly written buffer becomes the input of the next pass.
    ary1, ary2 = ary2, ary1
print(ary1)


# In[34]:


# Check of the scan against numpy's cumulative sum: all zeros when they agree.
# The scan result lives in `ary1`; `ary` is the untouched all-ones input, so
# comparing `ary` (as this cell did) does not validate anything on a fresh run.
ary1-numpy.ones(n).cumsum()


# In[39]:


# Scratch arithmetic. (32+6)*7*4 matches the read-cache size formula
# 4*(workgroup+2*window)*(2*window+1) used in peak_count for workgroup=32 and
# window=3; the meaning of the trailing *2*4 is not stated here — TODO confirm.
(32+6)*7*4*2*4


# In[ ]: