///
///  hostcall_stubs.hip
///

#include "hip/hip_runtime.h"
#include "hostcall_service_id.h"
#include "hostcall_stubs.h"
// -----------------------------------------------------------------------------
//
// printf: stubs to support printf 
//
// GPUs typically do not support varargs-style functions.  Implementing
// printf (or any varargs function) as a hostcall service therefore requires
// the compiler to generate code that allocates a buffer, fills the buffer
// with the value of each argument, and then calls a stub that executes the
// service with a pointer to that buffer. The clang compiler does this in the
// CGGPUBuiltin.cpp source.
// Here we define the printf_alloc and printf_execute device functions; the
// clang compiler generates calls to them when it encounters a printf
// statement. Both are implemented as hostcall stubs. We assume that the
// host routine for printf_execute will free the buffer that was allocated
// by printf_alloc.

// Requests a buffer from the host through the HOSTCALL_SERVICE_MALLOC
// hostcall service.
//   bufsz  : requested buffer size, forwarded to the host as argument 0.
//   return : the second result slot (result.arg1) reinterpreted as a
//            char pointer.
// NOTE(review): result.arg0 is not examined here; if the host uses it as a
// status code, a failed allocation goes undetected -- confirm against the
// host-side handler.
EXTERN char * printf_alloc(uint bufsz) {
  // All eight slots are passed by value to hostcall_invoke, so the unused
  // ones are zeroed instead of being left indeterminate.
  ulong arg0 = (ulong) bufsz;
  ulong arg1 = 0, arg2 = 0, arg3 = 0, arg4 = 0, arg5 = 0, arg6 = 0, arg7 = 0;
  hostcall_result_t result = hostcall_invoke(PACK_VERS(HOSTCALL_SERVICE_MALLOC),
    arg0,arg1,arg2,arg3,arg4,arg5,arg6,arg7);
  // Barrier kept directly after the invoke, exactly where the original had it.
  __builtin_amdgcn_wave_barrier();
  return (char *) result.arg1;
}

// Hands a filled argument buffer to the host through the
// HOSTCALL_SERVICE_PRINTF hostcall service.
//   print_buffer : buffer to pass to the host (argument 1).
//   bufsz        : size associated with that buffer (argument 0).
//   return       : the first result slot (result.arg0) truncated to int.
// NOTE(review): the file-level comment assumes the host routine frees
// print_buffer; nothing in this function releases it.
EXTERN int printf_execute(char * print_buffer, uint bufsz) {
  // Barrier kept ahead of the invoke, exactly where the original had it.
  __builtin_amdgcn_wave_barrier();
  // All eight slots are passed by value to hostcall_invoke, so the unused
  // ones are zeroed instead of being left indeterminate.
  ulong arg0 = (ulong) bufsz;
  ulong arg1 = (ulong) print_buffer;
  ulong arg2 = 0, arg3 = 0, arg4 = 0, arg5 = 0, arg6 = 0, arg7 = 0;
  hostcall_result_t result = hostcall_invoke(PACK_VERS(HOSTCALL_SERVICE_PRINTF),
    arg0,arg1,arg2,arg3,arg4,arg5,arg6,arg7);
  return (int) result.arg0;
}

// Bounded string-length helper for printf arguments that are variable-length
// strings. The clang compiler generates calls to it only when a string's
// length is not a compile-time constant.
//   instr     : string to measure.
//   maxstrlen : maximum number of characters to inspect.
//   return    : number of characters up to and INCLUDING the terminating
//               NUL when one is found within the first maxstrlen characters;
//               otherwise maxstrlen.
EXTERN uint32_t __strlen_max(char *instr, uint32_t maxstrlen) {
  uint32_t scanned = 0;
  while (scanned < maxstrlen) {
    char current = instr[scanned];
    scanned++;
    // The count already covers the terminator at this point.
    if (current == '\0') {
      return scanned;
    }
  }
  return maxstrlen;
}

// -----------------------------------------------------------------------------
//
// vector_product_zeros: Example stub to demonstrate hostcall services
//
// This is an example hostcall stub for a service called vector_product_zeros.
// The service calculates C = A*B and returns the number of zeros in C.
// Naturally, one would typically do this type of operation on a GPU,
// but this is a demo to illustrate the use of hostcall to run a service
// on the host.  The host service for HOSTCALL_SERVICE_DEMO is in
//        aomp-extras/hostcall/lib/src/hostall_handlers.c 
// After copying the vectors from the GPU, the host service calls the
// following routine on the host:
//
// int vector_product_zeros(int N, int*A, int*B, int*C) {
//    int zeros = 0;   
//    for (int i =0 ; i<N; i++) {
//       C[i] = A[i] * B[i];
//       if ( C[i] == 0  )
//          zeros++ ;
//    }
//    return zeros;
// }
//

// Device-side stub for the HOSTCALL_SERVICE_DEMO hostcall service.
//   N       : element count, forwarded to the host as argument 0.
//   A, B, C : vector pointers, forwarded as arguments 1..3 so the host can
//             copy the data.
//   return  : the second result slot (result.arg1) truncated to int, i.e.
//             the zero count reported by the host.
// NOTE(review): result.arg0 looks like a host return code (the original read
// it into an unused local named rc); it is still not acted upon here.
EXTERN int vector_product_zeros(int N, int*A,  int*B, int*C) {
  ulong arg0 = (ulong) N;
  // Pass these pointers to the host for memcpy
  ulong arg1 = (ulong) A;
  ulong arg2 = (ulong) B;
  ulong arg3 = (ulong) C;
  // All eight slots are passed by value to hostcall_invoke, so the unused
  // ones are zeroed instead of being left indeterminate.
  ulong arg4 = 0, arg5 = 0, arg6 = 0, arg7 = 0;
  hostcall_result_t result = hostcall_invoke(PACK_VERS(HOSTCALL_SERVICE_DEMO),
    arg0,arg1,arg2,arg3,arg4,arg5,arg6,arg7);
  int num_zeros = (int) result.arg1;
  return num_zeros;
}
