/*===--------------------------------------------------------------------------
 *              ATMI (Asynchronous Task and Memory Interface)
 *
 * This file is distributed under the MIT License. See LICENSE.txt for details.
 *===------------------------------------------------------------------------*/

#include "nw.h"

/*
 * Row-major accessors for the two per-work-group tiles.  Both macros expand
 * to an lvalue and rely on arrays named input_itemsets_l / reference_l being
 * in scope at the point of use.
 *
 *   SCORE(i, j) - element (row i, column j) of the score tile, whose rows are
 *                 BLOCK_SIZE + 1 elements long.
 *   REF(i, j)   - element (row i, column j) of the reference tile, whose rows
 *                 are BLOCK_SIZE elements long.
 *
 * The parameters are parenthesized so that expression arguments such as
 * SCORE(tx + 1, 0) index the intended element instead of being torn apart by
 * operator precedence.
 */
#define SCORE(i, j) input_itemsets_l[(j) + (i) * (BLOCK_SIZE + 1)]
#define REF(i, j)   reference_l[(j) + (i) * BLOCK_SIZE]

/*
 * Return the largest of the three integers a, b and c.
 */
int maximum(int a, int b, int c)
{
  /* Larger of the first two, then compare that against the third. */
  int best = (a > b) ? a : b;

  return (best > c) ? best : c;
}

  kernel void
dummy_kernel_gpu(
    global int * tmp_var)
{
  /*
   * Increment the integer pointed to by tmp_var by one.  Only the work-item
   * whose global id in dimension 0 is 0 performs the update; every other
   * work-item returns without touching memory.
   */
  if (get_global_id(0) != 0)
    return;

  *tmp_var += 1;
}

/*
 * Score one BLOCK_SIZE x BLOCK_SIZE tile of input_itemsets_d per work-group.
 *
 * The work-group selects the tile at block coordinates
 * (b_index_x, b_index_y) = (bx, blk - 1 - bx), where
 * bx = offset + get_group_id(0).  The tile's north-west corner, north row and
 * west column are read from input_itemsets_d, the matching reference_d values
 * are staged in local memory, and every interior cell is then set to
 *
 *   maximum(north-west + reference, west - penalty, north - penalty)
 *
 * one anti-diagonal at a time.  The finished interior is written back to
 * input_itemsets_d.
 *
 * reference_d        read-only here; indexed like input_itemsets_d.
 * input_itemsets_d   score matrix, read and updated in place; rows are `cols`
 *                    elements apart.
 * output_itemsets_d  not used by this kernel.
 * cols               row stride of both global matrices.
 * penalty            value subtracted for the west and north candidates.
 * blk                selects which tiles are processed (see above).
 * block_width        not used by this kernel.
 * worksize           not used by this kernel.
 * offset_r, offset_c row/column offset of the processed region.
 * offset             added to the work-group id to form bx.
 *
 * NOTE(review): each work-item owns column tx of the tile, so this appears to
 * assume a work-group of exactly BLOCK_SIZE work-items -- confirm against the
 * host-side launch configuration.
 */
kernel void
nw_kernel1_gpu(
    global int  * reference_d,
    global int  * input_itemsets_d,
    global int  * output_itemsets_d,
    int cols,
    int penalty,
    int blk,
    int block_width,
    int worksize,
    int offset_r,
    int offset_c,
    int offset
    )
{
  /* Score tile with one extra row/column for the north and west borders. */
  local int input_itemsets_l[(BLOCK_SIZE + 1) * (BLOCK_SIZE + 1)];
  local int reference_l[BLOCK_SIZE * BLOCK_SIZE];

  int tx = get_local_id(0);
  int bx = offset + get_group_id(0);

  int b_index_x = bx;
  int b_index_y = blk - 1 - bx;

  /* Global index of the tile's north-west corner element. */
  int base = offset_r * cols + offset_c;
  int corner = base + cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x;

  int interior = corner + tx + ( cols + 1 );  /* first interior row, column tx */
  int north    = corner + tx + ( 1 );         /* border row above column tx    */
  int west     = corner + ( cols );           /* border column, first row      */

  /* Local row/column owned by this work-item (index 0 is the border). */
  int lane = tx + 1;

  if (tx == 0) {
    SCORE(0, 0) = input_itemsets_d[corner];
  }

  barrier(CLK_LOCAL_MEM_FENCE);

  /* Stage column tx of the reference tile. */
  for (int row = 0; row < BLOCK_SIZE; row++) {
    REF(row, tx) = reference_d[interior + cols * row];
  }

  barrier(CLK_LOCAL_MEM_FENCE);

  /* West border: work-item tx fetches the element beside tile row tx. */
  SCORE(lane, 0) = input_itemsets_d[west + cols * tx];

  barrier(CLK_LOCAL_MEM_FENCE);

  /* North border: work-item tx fetches the element above tile column tx. */
  SCORE(0, lane) = input_itemsets_d[north];

  barrier(CLK_LOCAL_MEM_FENCE);

  /* Growing anti-diagonals: diagonal `diag` holds diag + 1 cells. */
  for (int diag = 0; diag < BLOCK_SIZE; diag++) {

    if (tx <= diag) {
      int col  = tx + 1;
      int row  = diag - tx + 1;
      int up   = row - 1;
      int left = col - 1;

      int via_nw    = SCORE(up, left) + REF(up, left);
      int via_west  = SCORE(row, left) - (penalty);
      int via_north = SCORE(up, col) - (penalty);

      SCORE(row, col) = maximum(via_nw, via_west, via_north);
    }

    /* Every work-item reaches the barrier, active on this diagonal or not. */
    barrier(CLK_LOCAL_MEM_FENCE);
  }

  barrier(CLK_LOCAL_MEM_FENCE);

  /* Shrinking anti-diagonals below the main one. */
  for (int diag = BLOCK_SIZE - 2; diag >= 0; diag--) {

    if (tx <= diag) {
      int col  = tx + BLOCK_SIZE - diag;
      int row  = BLOCK_SIZE - tx;
      int up   = row - 1;
      int left = col - 1;

      int via_nw    = SCORE(up, left) + REF(up, left);
      int via_west  = SCORE(row, left) - (penalty);
      int via_north = SCORE(up, col) - (penalty);

      SCORE(row, col) = maximum(via_nw, via_west, via_north);
    }

    barrier(CLK_LOCAL_MEM_FENCE);
  }

  /* Write column tx of the finished interior back to global memory. */
  for (int row = 0; row < BLOCK_SIZE; row++) {
    int src_row = row + 1;
    input_itemsets_d[interior + cols * row] = SCORE(src_row, lane);
  }
}

/*
 * Score one BLOCK_SIZE x BLOCK_SIZE tile of input_itemsets_d per work-group.
 *
 * Performs the same per-tile computation as nw_kernel1_gpu but picks the tile
 * at block coordinates
 *   b_index_x = bx + block_width - blk
 *   b_index_y = block_width - bx - 1
 * with bx = offset + get_group_id(0).  Borders come from input_itemsets_d,
 * reference values from reference_d, and each interior cell becomes
 *
 *   maximum(north-west + reference, west - penalty, north - penalty)
 *
 * computed one anti-diagonal at a time before the interior is stored back
 * into input_itemsets_d.
 *
 * reference_d        read-only here; indexed like input_itemsets_d.
 * input_itemsets_d   score matrix, read and updated in place; rows are `cols`
 *                    elements apart.
 * output_itemsets_d  not used by this kernel.
 * cols               row stride of both global matrices.
 * penalty            value subtracted for the west and north candidates.
 * blk, block_width   together select which tiles are processed (see above).
 * worksize           not used by this kernel.
 * offset_r, offset_c row/column offset of the processed region.
 * offset             added to the work-group id to form bx.
 *
 * NOTE(review): each work-item owns column tx of the tile, so this appears to
 * assume a work-group of exactly BLOCK_SIZE work-items -- confirm against the
 * host-side launch configuration.
 */
kernel void
nw_kernel2_gpu(
    global int  * reference_d,
    global int  * input_itemsets_d,
    global int  * output_itemsets_d,
    int cols,
    int penalty,
    int blk,
    int block_width,
    int worksize,
    int offset_r,
    int offset_c,
    int offset
    )
{
  /* Score tile with one extra row/column for the north and west borders. */
  local int input_itemsets_l[(BLOCK_SIZE + 1) * (BLOCK_SIZE + 1)];
  local int reference_l[BLOCK_SIZE * BLOCK_SIZE];

  int bx = offset + get_group_id(0);
  int tx = get_local_id(0);

  int b_index_x = bx + block_width - blk  ;
  int b_index_y = block_width - bx -1;

  /* Global index of the tile's north-west corner element. */
  int base = offset_r * cols + offset_c;
  int tile_nw = base + cols * BLOCK_SIZE * b_index_y + BLOCK_SIZE * b_index_x;

  int cell0      = tile_nw + tx + ( cols + 1 );  /* first interior row, column tx */
  int above      = tile_nw + tx + ( 1 );         /* border row above column tx    */
  int left_edge  = tile_nw + ( cols );           /* border column, first row      */

  /* Local row/column owned by this work-item (index 0 is the border). */
  int own = tx + 1;

  if (tx == 0) {
    SCORE(0, 0) = input_itemsets_d[tile_nw];
  }

  /* Stage column tx of the reference tile. */
  for (int r = 0; r < BLOCK_SIZE; r++) {
    REF(r, tx) = reference_d[cell0 + cols * r];
  }

  barrier(CLK_LOCAL_MEM_FENCE);

  /* West border: work-item tx fetches the element beside tile row tx. */
  SCORE(own, 0) = input_itemsets_d[left_edge + cols * tx];

  barrier(CLK_LOCAL_MEM_FENCE);

  /* North border: work-item tx fetches the element above tile column tx. */
  SCORE(0, own) = input_itemsets_d[above];

  barrier(CLK_LOCAL_MEM_FENCE);

  /* Growing anti-diagonals: step m updates m + 1 cells. */
  for (int m = 0; m < BLOCK_SIZE; m++) {

    if (tx <= m) {
      int x      = tx + 1;
      int y      = m - tx + 1;
      int y_prev = y - 1;
      int x_prev = x - 1;

      int match  = SCORE(y_prev, x_prev) + REF(y_prev, x_prev);
      int gap_w  = SCORE(y, x_prev) - (penalty);
      int gap_n  = SCORE(y_prev, x) - (penalty);

      SCORE(y, x) = maximum(match, gap_w, gap_n);
    }

    /* Every work-item reaches the barrier, active on this step or not. */
    barrier(CLK_LOCAL_MEM_FENCE);
  }

  /* Shrinking anti-diagonals below the main one. */
  for (int m = BLOCK_SIZE - 2; m >= 0; m--) {

    if (tx <= m) {
      int x      = tx + BLOCK_SIZE - m ;
      int y      = BLOCK_SIZE - tx;
      int y_prev = y - 1;
      int x_prev = x - 1;

      int match  = SCORE(y_prev, x_prev) + REF(y_prev, x_prev);
      int gap_w  = SCORE(y, x_prev) - (penalty);
      int gap_n  = SCORE(y_prev, x) - (penalty);

      SCORE(y, x) = maximum(match, gap_w, gap_n);
    }

    barrier(CLK_LOCAL_MEM_FENCE);
  }

  /* Write column tx of the finished interior back to global memory. */
  for (int r = 0; r < BLOCK_SIZE; r++) {
    int tile_row = r + 1;
    input_itemsets_d[cell0 + r * cols] = SCORE(tile_row, own);
  }
}