/* True when min <= x <= max (both bounds inclusive). Arguments may be
 * evaluated more than once, so avoid passing expressions with side effects. */
#define IN_RANGE(x, min, max) ((x) >= (min) && (x) <= (max))

/* Clamps the lvalue x into [min, max] in place and yields the clamped value.
 * Every argument and the whole expansion are parenthesized so the macro
 * composes safely inside larger expressions (e.g. CLAMP_RANGE(v, 0, 5) + 1). */
#define CLAMP_RANGE(x, min, max) \
	((x) = ((x) < (min)) ? (min) : (((x) > (max)) ? (max) : (x)))

/* Smaller of a and b; yields a when they compare equal. Arguments may be
 * evaluated more than once. */
#define MIN(a, b) ((a) <= (b) ? (a) : (b))

/*
 * Advances a row-by-row minimum-cost computation by `iteration` rows.
 *
 * Each work-group loads BLOCK_SIZE consecutive entries of gpuSrc into local
 * memory and repeatedly replaces every entry by
 *     min(left, self, right) + gpuWall[row][column]
 * for rows startStep .. startStep+iteration-1. The range of work-items that
 * compute shrinks by one on each side per step, so only the work-items still
 * computing on the last step write to gpuResults.
 *
 * iteration    number of rows to advance in this launch
 * gpuWall      global cost array, indexed as cols*row + column
 * gpuSrc       global input row (values before the first step)
 * gpuResults   global output row (values after the last step)
 * cols         number of columns in gpuWall / gpuSrc / gpuResults
 * rows         not used by this kernel
 * startStep    row offset into gpuWall for the first step
 * border       number of columns the work-group's window starts to the left
 *              of its non-overlapping output block
 *              (presumably iteration*HALO -- confirm against the host code)
 * HALO         per-step halo width used to size the output block
 * prev         local scratch, at least get_local_size(0) ints
 * result       local scratch, at least get_local_size(0) ints
 * outputBuffer global debug buffer written by the debug hook below
 */
__kernel void dynproc_kernel (int iteration,
                              __global int* gpuWall,
                              __global int* gpuSrc,
                              __global int* gpuResults,
                              int cols,
                              int rows,
                              int startStep,
                              int border,
                              int HALO,
                              __local int* prev,
                              __local int* result,
                              __global int* outputBuffer)
{
	int BLOCK_SIZE = get_local_size(0);
	int bx = get_group_id(0);
	int tx = get_local_id(0);

	// Each work-group ultimately produces results for one "small block";
	// the small blocks are non-overlapping and together cover all columns.
	int small_block_cols = BLOCK_SIZE - (iteration*HALO*2);

	// Window of columns loaded by this work-group: the small block
	// extended to the left by `border`, BLOCK_SIZE columns wide.
	int blkX = (small_block_cols*bx) - border;
	int blkXmax = blkX+BLOCK_SIZE-1;

	// Global column handled by this work-item.
	int xidx = blkX+tx;

	// Range of local indices whose global column lies inside [0, cols-1];
	// used to rule out computation outside the data boundary.
	int validXmin = (blkX < 0) ? -blkX : 0;
	int validXmax = (blkXmax > cols-1) ? BLOCK_SIZE-1-(blkXmax-cols+1) : BLOCK_SIZE-1;

	// Left/right neighbours, clamped so that edge columns reuse the
	// nearest valid entry instead of reading outside the data.
	int W = tx-1;
	int E = tx+1;

	W = (W < validXmin) ? validXmin : W;
	E = (E > validXmax) ? validXmax : E;

	bool isValid = IN_RANGE(tx, validXmin, validXmax);

	if(IN_RANGE(xidx, 0, cols-1))
	{
		prev[tx] = gpuSrc[xidx];
	}

	// Make every prev[] entry visible to the whole work-group.
	barrier(CLK_LOCAL_MEM_FENCE);

	// Initialized so that the final write-back is skipped (instead of reading
	// an indeterminate value) when the loop body never runs (iteration <= 0).
	bool computed = false;
	for (int i = 0; i < iteration; i++)
	{
		computed = false;

		// Step i can only be computed where both neighbours still hold
		// step i-1 data, i.e. i+1 entries in from each end of the window.
		if( IN_RANGE(tx, i+1, BLOCK_SIZE-i-2) && isValid )
		{
			computed = true;
			int left = prev[W];
			int up = prev[tx];
			int right = prev[E];
			int shortest = MIN(left, up);
			shortest = MIN(shortest, right);

			int index = cols*(startStep+i)+xidx;
			result[tx] = shortest + gpuWall[index];

			// ===================================================================
			// add debugging info to the debug output buffer...
			// NOTE(review): the index is a data value read from gpuSrc and is
			// not range-checked here; confirm outputBuffer is sized for every
			// value gpuSrc can hold (and that those values are non-negative).
			if (tx==11 && i==0)
			{
				// set bufIndex to what value/range of values you want to know.
				int bufIndex = gpuSrc[xidx];
				// dont touch the line below.
				outputBuffer[bufIndex] = 1;
			}
			// ===================================================================
		}

		// All reads of prev[] for this step must finish before it is overwritten.
		barrier(CLK_LOCAL_MEM_FENCE);

		// i and iteration are the same for every work-item, so the whole
		// work-group leaves the loop together and no barrier is skipped
		// by only part of the group.
		if(i==iteration-1)
		{
			// last step: nothing to prepare for a following step.
			break;
		}

		if(computed)
		{
			// Publish this step's value as the input of the next step.
			prev[tx] = result[tx];
		}
		barrier(CLK_LOCAL_MEM_FENCE);
	}

	// Update global memory: after the last step only the work-items that
	// computed on that step (those with "computed" set) hold a final value.
	if (computed)
	{
		gpuResults[xidx] = result[tx];
	}
}