#ifdef DOUBLE_PRECISION
	#pragma OPENCL EXTENSION cl_khr_fp64: enable
	#define FP_TYPE double
	#define FP_CONST(num) num
#else
	#define FP_TYPE float
	#define FP_CONST(num) num##f
#endif

// Constants used in the MGVF computation
#define PI FP_CONST(3.14159)
#define ONE_OVER_PI (FP_CONST(1.0) / PI)
#define MU FP_CONST(0.5)
#define LAMBDA (FP_CONST(8.0) * MU + FP_CONST(1.0))

// The number of threads per thread block
#define LOCAL_WORK_SIZE 256
#define NEXT_LOWEST_POWER_OF_TWO 256


// Regularized version of the Heaviside step function:
// He(x) = (atan(x) / pi) + 0.5
FP_TYPE heaviside(FP_TYPE x) {
	return (atan(x) * ONE_OVER_PI) + FP_CONST(0.5);

	// A simpler, faster approximation of the Heaviside function
	/* float out = 0.0;
	if (x > -0.0001) out = 0.5;
	if (x >  0.0001) out = 1.0;
	return out; */
}


// Kernel to compute the Motion Gradient Vector Field (MGVF) matrix for multiple cells
__kernel void IMGVF_kernel(__global FP_TYPE *IMGVF_array, __global FP_TYPE *I_array, __constant int *I_offsets, int __constant *m_array,
						   __constant int *n_array, FP_TYPE vx, FP_TYPE vy, FP_TYPE e, int max_iterations, FP_TYPE cutoff) {
	
	// Shared copy of the matrix being computed
	__local FP_TYPE IMGVF[41 * 81];
	
	// Shared buffer used for two purposes:
	// 1) To temporarily store newly computed matrix values so that only
	//    values from the previous iteration are used in the computation.
	// 2) To store partial sums during the tree reduction which is performed
	//    at the end of each iteration to determine if the computation has converged.
	__local FP_TYPE buffer[LOCAL_WORK_SIZE];
	
	// Figure out which cell this thread block is working on
	int cell_num = get_group_id(0);
	
	// Get pointers to current cell's input image and inital matrix
	int I_offset = I_offsets[cell_num];
	__global FP_TYPE *IMGVF_global = &(IMGVF_array[I_offset]);
	__global FP_TYPE *I = &(I_array[I_offset]);
	
	// Get current cell's matrix dimensions
	int m = m_array[cell_num];
	int n = n_array[cell_num];
	
	// Compute the number of virtual thread blocks
	int IMGVF_Size = m * n;
	int tb_count = (IMGVF_Size + LOCAL_WORK_SIZE - 1) / LOCAL_WORK_SIZE;
	
	// Load the initial IMGVF matrix into shared memory
	int thread_id = get_local_id(0);
	int thread_block, i, j;
	for (thread_block = 0; thread_block < tb_count; thread_block++) {
		int offset = thread_block * LOCAL_WORK_SIZE + thread_id;
		if (offset < IMGVF_Size)
			IMGVF[offset] = IMGVF_global[offset];
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	
	// Set the converged flag to false
	__local int cell_converged;
	if (thread_id == 0) cell_converged = 0;
	barrier(CLK_LOCAL_MEM_FENCE);
	
	// Constants used to iterate through virtual thread blocks
	const float one_nth = 1.0f / (float) n;
	const int tid_mod = thread_id % n;
	const int tbsize_mod = LOCAL_WORK_SIZE % n;
	
	// Constant used in the computation of Heaviside values
	FP_TYPE one_over_e = FP_CONST(1.0) / e;
	
	// Iteratively compute the IMGVF matrix until the computation has
	//  converged or we have reached the maximum number of iterations
	int iterations = 0;
	while ((! cell_converged) && (iterations < max_iterations)) {
	
		// The total change to this thread's matrix elements in the current iteration
		FP_TYPE total_diff = FP_CONST(0.0);
		
		int old_i = 0, old_j = 0;
		j = tid_mod - tbsize_mod;
		
		// Iterate over virtual thread blocks
		for (thread_block = 0; thread_block < tb_count; thread_block++) {
			// Store the index of this thread's previous matrix element
			//  (used in the buffering scheme below)
			old_i = i;
			old_j = j;
			
			// Determine the index of this thread's current matrix element 
			int offset = thread_block * LOCAL_WORK_SIZE;
			i = (thread_id + offset) * one_nth;
			j += tbsize_mod;
			if (j >= n) j -= n;
			
			FP_TYPE new_val, old_val;
			
			// Make sure the thread has not gone off the end of the matrix
			if (i < m) {
				// Compute neighboring matrix element indices
				int rowU = (i == 0) ? 0 : i - 1;
				int rowD = (i == m - 1) ? m - 1 : i + 1;
				int colL = (j == 0) ? 0 : j - 1;
				int colR = (j == n - 1) ? n - 1 : j + 1;
				
				// Compute the difference between the matrix element and its eight neighbors
				old_val = IMGVF[(i * n) + j];
				FP_TYPE U  = IMGVF[(rowU * n) + j   ] - old_val;
				FP_TYPE D  = IMGVF[(rowD * n) + j   ] - old_val;
				FP_TYPE L  = IMGVF[(i    * n) + colL] - old_val;
				FP_TYPE R  = IMGVF[(i    * n) + colR] - old_val;
				FP_TYPE UR = IMGVF[(rowU * n) + colR] - old_val;
				FP_TYPE DR = IMGVF[(rowD * n) + colR] - old_val;
				FP_TYPE UL = IMGVF[(rowU * n) + colL] - old_val;
				FP_TYPE DL = IMGVF[(rowD * n) + colL] - old_val;
				
				// Compute the regularized heaviside value for these differences
				FP_TYPE UHe  = heaviside((U  *       -vy)  * one_over_e);
				FP_TYPE DHe  = heaviside((D  *        vy)  * one_over_e);
				FP_TYPE LHe  = heaviside((L  *  -vx     )  * one_over_e);
				FP_TYPE RHe  = heaviside((R  *   vx     )  * one_over_e);
				FP_TYPE URHe = heaviside((UR * ( vx - vy)) * one_over_e);
				FP_TYPE DRHe = heaviside((DR * ( vx + vy)) * one_over_e);
				FP_TYPE ULHe = heaviside((UL * (-vx - vy)) * one_over_e);
				FP_TYPE DLHe = heaviside((DL * (-vx + vy)) * one_over_e);
				
				// Update the IMGVF value in two steps:
				// 1) Compute IMGVF += (mu / lambda)(UHe .*U  + DHe .*D  + LHe .*L  + RHe .*R +
				//                                   URHe.*UR + DRHe.*DR + ULHe.*UL + DLHe.*DL);
				new_val = old_val + (MU / LAMBDA) * (UHe  * U  + DHe  * D  + LHe  * L  + RHe  * R +
													 URHe * UR + DRHe * DR + ULHe * UL + DLHe * DL);
				// 2) Compute IMGVF -= (1 / lambda)(I .* (IMGVF - I))
				FP_TYPE vI = I[(i * n) + j];
				new_val -= ((1.0 / LAMBDA) * vI * (new_val - vI));

				// Keep track of the total change of this thread's matrix elements
				total_diff += fabs(new_val - old_val);
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			// Save the previous virtual thread block's value (if it exists)
			if (thread_block > 0) {
				offset = (thread_block - 1) * LOCAL_WORK_SIZE;
				if (old_i < m) IMGVF[(old_i * n) + old_j] = buffer[thread_id];
			}
			if (thread_block < tb_count - 1) {
				// Write the new value to the buffer
				buffer[thread_id] = new_val;
			} else {
				// We've reached the final virtual thread block,
				//  so write directly to the matrix
				if (i < m) IMGVF[(i * n) + j] = new_val;
			}
			
			// We need to synchronize between virtual thread blocks to prevent
			//  threads from writing the values from the buffer to the actual
			//  IMGVF matrix too early
			barrier(CLK_LOCAL_MEM_FENCE);
		}
		
		// We need to compute the overall sum of the change at each matrix element
		//  by performing a tree reduction across the whole threadblock
		buffer[thread_id] = total_diff;
		barrier(CLK_LOCAL_MEM_FENCE);
		
		// Account for thread block sizes that are not a power of 2
		if (thread_id >= NEXT_LOWEST_POWER_OF_TWO) {
			buffer[thread_id - NEXT_LOWEST_POWER_OF_TWO] += buffer[thread_id];
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		// Perform the tree reduction
		int th;
		for (th = NEXT_LOWEST_POWER_OF_TWO / 2; th > 0; th /= 2) {
			if (thread_id < th) {
				buffer[thread_id] += buffer[thread_id + th];
			}
			barrier(CLK_LOCAL_MEM_FENCE);
		}
		
		// Figure out if we have converged
		if(thread_id == 0) {
			FP_TYPE mean = buffer[thread_id] / (FP_TYPE) (m * n);
			if (mean < cutoff) {
				// We have converged, so set the appropriate flag
				cell_converged = 1;
			}
		}
		
		// We need to synchronize to ensure that all threads
		//  read the correct value of the convergence flag
		barrier(CLK_LOCAL_MEM_FENCE);
		
		// Keep track of the number of iterations we have performed
		iterations++;
	}
	
	// Save the final IMGVF matrix to global memory
	for (thread_block = 0; thread_block < tb_count; thread_block++) {
		int offset = thread_block * LOCAL_WORK_SIZE + thread_id;
		if (offset < IMGVF_Size)
			IMGVF_global[offset] = IMGVF[offset];
	}
}