#ifndef GROUP_SIZE
#define GROUP_SIZE (64)
#endif
__kernel void
reduce(__global float *output, __global const float *input,
                       __local float *shared, unsigned int n)
{
    const unsigned int lid = get_local_id(0);
    const unsigned int lsize = GROUP_SIZE;//get_local_size(0);
    // NOTE: get_local_size(0) must equal GROUP_SIZE
    const unsigned int gid = get_group_id(0);
    const unsigned int gsize = get_num_groups(0);
    const unsigned int gs2 = GROUP_SIZE * 2;
    const size_t stride = gs2 * gsize;
    shared[lid] = 0.0f;
    size_t i = gid * gs2 + lid;
    while (i < n)
    {
         shared[lid] += input[i] + input[(i+GROUP_SIZE)];
         i += stride;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
#if (GROUP_SIZE >= 512)
    if (lid < 256)
        shared[lid] += shared[lid + 256];
    barrier(CLK_LOCAL_MEM_FENCE);
#endif
#if (GROUP_SIZE >= 256)
    if (lid < 128)
        shared[lid] += shared[lid + 128];
    barrier(CLK_LOCAL_MEM_FENCE);
#endif
#if (GROUP_SIZE >= 128)
    if (lid < 64)
        shared[lid] += shared[lid +   64];
    barrier(CLK_LOCAL_MEM_FENCE);
#endif
    if (lid < 32)
    {
#if (GROUP_SIZE >= 64)
         shared[lid] += shared[lid + 32];
        barrier(CLK_LOCAL_MEM_FENCE);
#endif
#if (GROUP_SIZE >= 32)
         shared[lid] += shared[lid + 16];
        barrier(CLK_LOCAL_MEM_FENCE);
#endif
#if (GROUP_SIZE >= 16)
         shared[lid] += shared[lid + 8];
        barrier(CLK_LOCAL_MEM_FENCE);
#endif
#if (GROUP_SIZE >=    8)
         shared[lid] += shared[lid + 4];
        barrier(CLK_LOCAL_MEM_FENCE);
#endif
#if (GROUP_SIZE >=    4)
         shared[lid] += shared[lid + 2];
        barrier(CLK_LOCAL_MEM_FENCE);
#endif
#if (GROUP_SIZE >=    2)
         shared[lid] += shared[lid + 1];
        barrier(CLK_LOCAL_MEM_FENCE);
#endif
    }
    if (lid == 0)
        output[gid] = shared[0];
}

