#
# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

function print_hdrs()
{
  print "\
/*\n\
 *     Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.\n\
 *\n\
 * NVIDIA CORPORATION and its licensors retain all intellectual property\n\
 * and proprietary rights in and to this software, related documentation\n\
 * and any modifications thereto.  Any use, reproduction, disclosure or\n\
 * distribution of this software and related documentation without an express\n\
 * license agreement from NVIDIA CORPORATION is strictly prohibited.\n\
 *\n\
 */\n\
\n\n\
/*\n\
 *\n\
 * WARNING - this file is automatically generated. DO NOT EDIT.\n\
 *\n\
 */\n\
\n\
#include \"mth_intrinsics.h\" \n\
#include \"mth_tbldefs.h\" \n\
\n\n\
#if defined (TARGET_X8664) \n\
#include \"immintrin.h\" \n\
#else \n\
#error Unknown TARGET. Must be \"TARGET_X8664\" \n\
#endif \n\
\n\
#include \"mth_z2yy.h\"\n\
\n\
/*\n\
 * Common set of interface routines to convert an intrinsic math library call\n\
 * using Intel AVX-512 vectors in to two calls of the corresponding AVX2\n\
 * implementation.\n\
 *\n\
 * Note: code is common to both AVX-512 and KNL architectures.\n\
 *       Thus, have to use Intel intrinsics that are common to both systems.\n\
 */\n\
"
if (0) {
print "\n\
static\nvrs16_t\n\
__attribute__((noinline))\n\
__gs_z2yy_x(vrs16_t x, vrs8_t(*func)(vrs8_t))\n\
{ \n\
  vrs8_t rl, ru;\n\
  ru = func((vrs8_t) _mm512_extractf64x4_pd((__m512d)x, 1));\n\
  rl = func((vrs8_t) _mm512_castps512_ps256(x));\n\
  return (vrs16_t) _mm512_insertf64x4(_mm512_castpd256_pd512((__m256d)rl),\n\
                                      (__m256d)ru, 1);\n\
}\n\
\n\
static\nvrs16_t\n\
__attribute__((noinline))\n\
__gs_z2yy_xy(vrs16_t x, vrs16_t y, vrs8_t(*func)(vrs8_t, vrs8_t))\n\
{ \n\
  vrs8_t rl, ru;\n\
  ru = func((vrs8_t) _mm512_extractf64x4_pd((__m512d)x, 1),\n\
            (vrs8_t) _mm512_extractf64x4_pd((__m512d)y, 1));\n\
  rl = func((vrs8_t) _mm512_castps512_ps256(x),\n\
            (vrs8_t) _mm512_castps512_ps256(y));\n\
  return (vrs16_t) _mm512_insertf64x4(_mm512_castpd256_pd512((__m256d)rl),\n\
                                      (__m256d)ru, 1);\n\
}\n\
\n\
static\nvrs16_t\n\
__attribute__((noinline))\n\
__gs_z2yy_sincos(vrs16_t x, vrs8_t(*func)(vrs8_t))\n\
{ \n\
  vrs8_t su, sl, cu;\n\
  su = func((vrs8_t) _mm512_extractf64x4_pd((__m512d)x, 1));\n\
  asm(\"vmovaps  %%ymm1, %0\" : :\"m\"(cu) :);\n\
  sl = func((vrs8_t) _mm512_castps512_ps256(x));\n\
  asm(\"vinsertf64x4 $0x1,%0,%%zmm1,%%zmm1\" : : \"m\"(cu) : );\n\
  return (vrs16_t) _mm512_insertf64x4(_mm512_castpd256_pd512((__m256d)sl),\n\
                                      (__m256d)su, 1);\n\
}\n\
\n\
static\nvrs16_t\n\
__attribute__((noinline))\n\
__gs_z2yy_xk1(vrs16_t x, int64_t iy, vrs8_t(*func)(vrs8_t, int64_t))\n\
{\n\
  vrs8_t rl, ru;\n\
  ru = func((vrs8_t) _mm512_extractf64x4_pd((__m512d)x, 1), iy);\n\
  rl = func((vrs8_t) _mm512_castps512_ps256(x), iy);\n\
  return (vrs16_t) _mm512_insertf64x4(_mm512_castpd256_pd512((__m256d)rl),\n\
                                      (__m256d)ru, 1);\n\
}\n\
\n\
static\nvrs16_t\n\
__attribute__((noinline))\n\
__gs_z2yy_xi(vrs16_t x, vis16_t iy, vrs8_t(*func)(vrs8_t, vis8_t))\n\
{\n\
  vrs8_t rl, ru;\n\
  ru = func((vrs8_t) _mm512_extractf64x4_pd((__m512d)x, 1),\n\
            (vis8_t) _mm512_extractf64x4_pd((__m512d)iy, 1));\n\
  rl = func((vrs8_t) _mm512_castps512_ps256(x),\n\
            (vis8_t) _mm512_castps512_ps256((__m512)iy));\n\
  return (vrs16_t) _mm512_insertf64x4(_mm512_castpd256_pd512((__m256d)rl),\n\
                                     (__m256d)ru, 1);\n\
}\n\
\n\
static\nvrs16_t\n\
__attribute__((noinline))\n\
__gs_z2yy_xk(vrs16_t x, vid8_t iyu, vid8_t iyl, vrs8_t(*func)(vrs8_t, vid4_t, vid4_t))\n\
{\n\
  vrs8_t rl, ru;\n\
  ru = func((vrs8_t) _mm512_extractf64x4_pd((__m512d)x, 1),\n\
            (vid4_t) _mm512_extractf64x4_pd((__m512d)iyu, 1),\n\
            (vid4_t) _mm512_extractf64x4_pd((__m512d)iyu, 0));\n\
  rl = func((vrs8_t) _mm512_castps512_ps256(x),\n\
            (vid4_t) _mm512_extractf64x4_pd((__m512d)iyl, 1),\n\
            (vid4_t) _mm512_extractf64x4_pd((__m512d)iyl, 0));\n\
  return (vrs16_t) _mm512_insertf64x4(_mm512_castpd256_pd512((__m256d)rl),\n\
                                     (__m256d)ru, 1);\n\
}\n\
\n\
static\nvrd8_t\n\
__attribute__((noinline))\n\
__gd_z2yy_x(vrd8_t x, vrd4_t(*func)(vrd4_t))\n\
{\n\
  vrd4_t rl, ru;\n\
  ru = func((vrd4_t) _mm512_extractf64x4_pd((__m512d)x, 1));\n\
  rl = func((vrd4_t) _mm512_castpd512_pd256(x));\n\
  return (vrd8_t) _mm512_insertf64x4(_mm512_castpd256_pd512((__m256d)rl),\n\
                                     (__m256d)ru, 1);\n\
}\n\
\n\
static\nvrd8_t\n\
__attribute__((noinline))\n\
__gd_z2yy_xy(vrd8_t x, vrd8_t y, vrd4_t(*func)(vrd4_t, vrd4_t))\n\
{\n\
  vrd4_t rl, ru;\n\
  ru = func((vrd4_t) _mm512_extractf64x4_pd((__m512d)x, 1),\n\
            (vrd4_t) _mm512_extractf64x4_pd((__m512d)y, 1));\n\
  rl = func((vrd4_t) _mm512_castpd512_pd256(x),\n\
            (vrd4_t) _mm512_castpd512_pd256(y));\n\
  return (vrd8_t) _mm512_insertf64x4(_mm512_castpd256_pd512((__m256d)rl),\n\
                                     (__m256d)ru, 1);\n\
}\n\
\n\
static\nvrd8_t\n\
__attribute__((noinline))\n\
__gd_z2yy_sincos(vrd8_t x, vrd4_t(*func)(vrd4_t))\n\
{ \n\
  vrd4_t su, sl, cu;\n\
  su = func((vrd4_t) _mm512_extractf64x4_pd((__m512d)x, 1));\n\
  asm(\"vmovaps  %%ymm1, %0\" : :\"m\"(cu) :);\n\
  sl = func((vrd4_t) _mm512_castpd512_pd256(x));\n\
  asm(\"vinsertf64x4 $0x1,%0,%%zmm1,%%zmm1\" : : \"m\"(cu) : );\n\
  return (vrd8_t) _mm512_insertf64x4(_mm512_castpd256_pd512((__m256d)sl),\n\
                                      (__m256d)su, 1);\n\
}\n\
\n\
static\nvrd8_t\n\
__attribute__((noinline))\n\
__gd_z2yy_xk1(vrd8_t x, int64_t iy, vrd4_t(*func)(vrd4_t, int64_t))\n\
{\n\
  vrd4_t rl, ru;\n\
  ru = func((vrd4_t) _mm512_extractf64x4_pd((__m512d)x, 1), iy);\n\
  rl = func((vrd4_t) _mm512_castpd512_pd256(x), iy);\n\
  return (vrd8_t) _mm512_insertf64x4(_mm512_castpd256_pd512((__m256d)rl),\n\
                                     (__m256d)ru, 1);\n\
}\n\
\n\
static\nvrd8_t\n\
__attribute__((noinline))\n\
__gd_z2yy_xk(vrd8_t x, vid8_t iy, vrd4_t(*func)(vrd4_t, vid4_t))\n\
{\n\
  vrd4_t rl, ru;\n\
  ru = func((vrd4_t) _mm512_extractf64x4_pd((__m512d)x, 1),\n\
            (vid4_t) _mm512_extractf64x4_pd((__m512d)iy, 1));\n\
  rl = func((vrd4_t) _mm512_castpd512_pd256(x),\n\
            (vid4_t) _mm512_castpd512_pd256((__m512d)iy));\n\
  return (vrd8_t) _mm512_insertf64x4(_mm512_castpd256_pd512((__m256d)rl),\n\
                                     (__m256d)ru, 1);\n\
}\n\
\n\
static\nvrd8_t\n\
__attribute__((noinline))\n\
__gd_z2yy_xi(vrd8_t x, vis8_t iy, vrd4_t(*func)(vrd4_t, vis4_t))\n\
{\n\
  vrd4_t rl, ru;\n\
  ru = func((vrd4_t) _mm512_extractf64x4_pd((__m512d)x, 1),\n\
            (vis4_t) _mm256_extractf128_si256((__m256i)iy, 1));\n\
  rl = func((vrd4_t) _mm512_castpd512_pd256(x),\n\
            (vis4_t) _mm256_castsi256_si128((__m256i)iy));\n\
  return (vrd8_t) _mm512_insertf64x4(_mm512_castpd256_pd512((__m256d)rl),\n\
                                     (__m256d)ru, 1);\n\
}\n\
\n\
"
} # if (0)
}

function init_target_arrays()
{
    _mm = "_mm512"
    __m = "__m512"
    _si = "_si512"
   
  frps["f"]= ""
  frps["r"]= ""
  frps["p"]= ""
  sds["s"]= ""
  sds["d"]= ""
  iks["i"]= ""
  iks["k"]= ""
}

function VL(sd)
{
  return sd == "s" ? VLS : VLD
}

function VL_BY2(sd)
{
  return sd == "s" ? VLS/2 : VLD/2
}

function VR_T(sd) {
  return "vr" sd (sd == "s" ? VLS : VLD) "_t"
}

function VI_T(sd) {
  return "vi" sd (sd == "s" ? VLS : VLD) "_t"
}

function VR_T_BY2(sd) {
  return "vr" sd VL_BY2(sd) "_t"
}

function VI_T_BY2(sd) {
  return "vi" sd VL_BY2(sd) "_t"
}

function arg_ne_0(yarg, a, b)
{
  return yarg != 0 ? a : b
}

function func_r_decl(name, frp, sd, yarg)
{
  print "\n" VR_T(sd)
  print "__" frp sd "_" name "_" VL(sd) "_z2yy" \
        "(" VR_T(sd) " x" \
        arg_ne_0(yarg, ", " VR_T(sd) " y",  "") \
        ")"
        
}

function func_rr_def(name, frp, sd, safeval, yarg) {
  func_r_decl(name, frp, sd, yarg)
  print "{\n  " \
        VR_T_BY2(sd) " rl, ru;\n  " \
        VR_T_BY2(sd) " (*fptr) (" VR_T_BY2(sd) \
        arg_ne_0(yarg, ", " VR_T_BY2(sd), "") \
        ");"
  print "  fptr = (" VR_T_BY2(sd) "(*) (" VR_T_BY2(sd), \
          (yarg != 0) ? ", " VR_T_BY2(sd) : "", \
          ")) MTH_DISPATCH_TBL[func_" name "][sv_" sd "v" VL_BY2(sd) "][frp_" frp "];"
  print "  return __g" sd "_z2yy_" (name != "sincos" ? "x" : name) \
        arg_ne_0(yarg, "y", "") "(x" arg_ne_0(yarg, ", y", "") ", fptr);"
#  print "  ru = fptr((" VR_T_BY2(sd) ") _mm512_extractf64x4_pd((__m512d)x, 1)" \
#          arg_ne_0(yarg, ", (" VR_T_BY2(sd) ") _mm512_extractf64x4_pd((__m512d)y, 1)", "") \
#          ");"
#  print "  rl = fptr((" VR_T_BY2(sd) ") _mm512_castp" sd "512_p" sd "256(x)" \
#          arg_ne_0(yarg, ", (" VR_T_BY2(sd) ") _mm512_castp" sd "512_p" sd "256(y)") \
#          ");"
#  print "  return (" VR_T(sd) ") "\
#          "_mm512_insertf64x4(_mm512_castpd256_pd512((__m256d)rl), (__m256d)ru, 1);"
  print "}"
}

function func_pow_args(sd, is_scalar, ik, with_vars, by2)
{
  ll = arg_ne_0(by2, VR_T_BY2(sd), VR_T(sd)) arg_ne_0(with_vars, " x", "") ", "
  if (is_scalar) {
    ll = ll ((ik == "i") ? "int32_t" : "int64_t") arg_ne_0(with_vars, " iy", "")
  } else {
    if (sd == "s" && ik == "k") {
      ll = ll arg_ne_0(by2, VI_T_BY2("d"), VI_T("d")) \
           arg_ne_0(with_vars, " iyu", "") ", " \
           arg_ne_0(by2, VI_T_BY2("d"), VI_T("d")) \
           arg_ne_0(with_vars, " iyl", "")
    } else {
      ll = ll arg_ne_0(by2, VI_T_BY2(sd), VI_T(sd)) \
           arg_ne_0(with_vars, " iy", "")
    }
  }

  return ll
}

function func_pow_decl(name, frp, sd, is_scalar, ik)
{
  print "\n" VR_T(sd)
  l = "__" frp sd "_" name arg_ne_0(is_scalar, ik"1", ik)"_" VL(sd) "_z2yy" "("
  l = l func_pow_args(sd, is_scalar, ik, 1, 0)
  l = l ")"
  print l
        
}

function func_pow_decl_scalar(name, frp, sd, ik)
{
#
# Waring: when iy is an int32_t, we promote it to an int64_t
# and then call the func_powk1 entry point from the dispatch
# table.
# It is inconceivable that the int32_t and int64_t power routines
# do not produce the same bit-for-bit results.
#
  print VR_T(sd)
  print "__" frp sd "_pow" ik "1_" VL(sd) "_z2yy(" VR_T(sd) " x, " \
        (ik == "i" ? "int32_t" : "int64_t") " iy)"
  print "{"
  print "  " VR_T_BY2(sd) " (*fptr)(" VR_T_BY2(sd) ", int64_t);"
  print "  fptr = (" VR_T_BY2(sd) "(*) (" \
        VR_T_BY2(sd) ", int64_t" \
        ")) MTH_DISPATCH_TBL[func_" name \
        "k1][sv_" sd "v" VL_BY2(sd) "][frp_" frp "];"
  print "  return __g" sd "_z2yy_xk1(x, iy, fptr);\n}\n"
}

function func_pow_decl_vect(name, frp, sd, ik)
{
  print VR_T(sd)
  print "__" frp sd "_pow" ik "_" VL(sd) "_z2yy(" VR_T(sd) " x, " \
        VI_T(sd) " iy)"
  print "{"
  print "  " VR_T_BY2(sd) " (*fptr)(" VR_T_BY2(sd) ", " VI_T_BY2(sd) ");"
  print "  fptr = (" VR_T_BY2(sd) "(*) (" \
        VR_T_BY2(sd) ", " VI_T_BY2(sd) \
        ")) MTH_DISPATCH_TBL[func_" name ik \
        "][sv_" sd "v" VL_BY2(sd) "][frp_" frp "];"
  print "  return __g" sd "_z2yy_x" ik "(x, iy, fptr);\n}\n"
}

function func_pow_decl_vect_di(name, frp)
{
  print VR_T("d")
  print "__" frp "d_powi_" VL("d") "_z2yy(" VR_T("d") " x, " \
        VI_T_BY2("s") " iy)"
  print "{"
#  print "  " VR_T_BY2(sd) " (*fptr)(" VR_T_BY2(sd) ", " VI_T_BY2("s") ");"
#  print "  fptr = (" VR_T_BY2("d") "(*) (" \
#        VR_T_BY2("d") ", " VI_T_BY2("s") \
#        ")) MTH_DISPATCH_TBL[func_" name "i" \
#        "][sv_" sd "v" VL_BY2("d") "][frp_" frp "];"
# Gave up trying to make this general.
  print "  vrd4_t (*fptr)(vrd4_t, vis4_t);"
  print "  fptr = (vrd4_t(*) (vrd4_t, vis4_t)) MTH_DISPATCH_TBL[func_powi][sv_dv4][frp_" frp "];"
  print "  return __gd_z2yy_xi(x, iy, fptr);\n}\n"
}

function func_pow_decl_vect_sk(name, frp)
{
  print VR_T("s")
  print "__" frp "s_powk_" VL("s") "_z2yy(" VR_T("s") " x, " \
        VI_T("d") " iyu, " VI_T("d") " iyl)"
  print "{"
  print "  vrs8_t (*fptr)(vrs8_t, vid4_t, vid4_t);"
  print "  fptr = (vrs8_t(*) (vrs8_t, vid4_t, vid4_t)) MTH_DISPATCH_TBL[func_powk][sv_dv4][frp_" frp "];"
  print "  return __gs_z2yy_xk(x, iyu, iyl, fptr);\n}\n"
}

function func_pow_def(name, frp, sd, is_scalar, ik)
{
  if (is_scalar) {
    func_pow_decl_scalar(name, frp, sd, ik)
  } else {
    # Four variants of R(:)**I(:)
    # 1) sd == "d" && ik == "k" - trivial both args same size(512)
    # 2) sd == "s" && ik == "i" - trivial both args same size(512)
    # 3) sd == "d" && ik == "i" - x is 512, iy is effectively 256
    # 4) sd == "s" && ik == "k" - x is 512, iy is effectively 1024

    # Trivial first
    if (sd == "d" && ik == "k" || sd == "s" && ik == "i") {
      func_pow_decl_vect(name, frp, sd, ik)
    } else if (sd == "d" && ik == "i") {
      func_pow_decl_vect_di(name, frp)
    } else {
      func_pow_decl_vect_sk(name, frp)
    }
  }
  return
}
function XXfunc_pow_def(name, frp, sd, is_scalar, ik)
{
  func_pow_decl(name, frp, sd, is_scalar, ik)
  print "{"

  if (is_scalar) {
    print "  " VR_T_BY2(sd) " (*fptr)(" VR_T_BY2(sd) ", int64_t);"
    print "  fptr = (" VR_T_BY2(sd) "(*) (" \
          func_pow_args(sd, is_scalar, "k", 0, 1) \
          ")) MTH_DISPATCH_TBL[func_" name arg_ne_0(is_scalar, ik"1", ik) \
          "][sv_" sd "v" VL_BY2(sd) "][frp_" frp "];"
    print "  return __g" sd "_z2yy_xk1(x, iy, fptr);\n}\n"
    return
  }

#
# This has turned in to a bit of a mess/hack.
#

  print "  " VR_T_BY2(sd) " rl, ru;\n  " \
        VR_T_BY2(sd) " (*fptr) (" func_pow_args(sd, is_scalar, ik, 0, 1) ");"
  print "  fptr = (" VR_T_BY2(sd) "(*) (" \
        func_pow_args(sd, is_scalar, ik, 0, 1) \
        ")) MTH_DISPATCH_TBL[func_" name arg_ne_0(is_scalar, ik"1", ik) \
        "][sv_" sd "v" VL_BY2(sd) "][frp_" frp "];"

  printf ("%s", "  ru = fptr((" VR_T_BY2(sd) ") _mm512_extractf64x4_pd((__m512d)x, 1), ")
  if (is_scalar) {
    printf ("%s", "iy")
  } else {
    if (sd == "s" && ik == "k") {
      printf ("%s", "(" VI_T_BY2("d") \
             ") _mm512_extractf64x4_pd((__m512d)iyu, 1) , ("\
             VI_T_BY2("d") ") _mm512_extractf64x4_pd((__m512d)iyl, 1)")
    } else {
      printf ("%s", "(" VI_T_BY2(sd) \
              ") _mm512_extractf64x4_pd((__m512d) iy, 1)")
    }
  }
  print ");"

  printf ("%s", "  rl = fptr((" VR_T_BY2(sd) ") _mm512_castp" sd "512_p" sd "256(x), ")
  if (is_scalar) {
    printf ("%s", "iy")
  } else {
    if (sd == "s" && ik == "k") {
      printf("%s", "(" VI_T_BY2("d") \
             ") _mm512_castps512_ps256((__m512)iyu) , ("\
             VI_T_BY2("d") ") _mm512_castps512_ps256((__m512)iyl)")
    } else {
    printf ("%s", "(" VI_T_BY2(sd) \
            ") _mm512_castps512_ps256((__m512)iy)")
    }
  }
  print ");"
  print "  return (" VR_T(sd) ") _mm512_insertf64x4(_mm512_castpd256_pd512((__m256d)rl), (__m256d)ru, 1);"
  print "}"
}

function do_all_rr(name, safeval, yarg)
{

  for (frp in frps) {
    for (sd in sds) {
      func_rr_def(name, frp, sd, safeval, yarg)
    }
  }
}

function do_all_pow_r2i()
{
  for (frp in frps) {
#    frp = "f"
    for (sd in sds) {
      for (ik in iks) {
#         ik = "k"
        func_pow_def("pow", frp, sd, 1, ik)
        func_pow_def("pow", frp, sd, 0, ik)
      }
    }
  }
}

BEGIN {
  if (TARGET != "X8664") {
    print "TARGET must X8664"
    exit(1)
  }
  MAX_VREG_SIZE=512
  VLS = 16
  VLD = 8

# Initialize some associative arrays
  init_target_arrays()

  print_hdrs()
  one_arg = 0
  two_args = 1


#  do_all_rr("acos", 0, one_arg)
#  do_all_rr("atan2", 1, two_args)
  do_all_rr("acos", 0, one_arg)
  do_all_rr("asin", 0, one_arg)
  do_all_rr("atan", 0, one_arg)
  do_all_rr("atan2", 1, two_args)
  do_all_rr("cos", 0, one_arg)
  do_all_rr("sin", 0, one_arg)
  do_all_rr("tan", 0, one_arg)
  do_all_rr("sincos", 0, one_arg)
  do_all_rr("cosh", 0, one_arg)
  do_all_rr("sinh", 0, one_arg)
  do_all_rr("tanh", 0, one_arg)
  do_all_rr("exp", 0, one_arg)
  do_all_rr("log", 1, one_arg)
  do_all_rr("log10", 1, one_arg)
  do_all_rr("pow", 0, two_args)
  do_all_rr("mod", 1, two_args)
  do_all_rr("aint", 0, one_arg)
  do_all_rr("ceil", 0, one_arg)
  do_all_rr("floor", 0, one_arg)
#not used  do_all_rr("div", 1, two_args)
#not used  do_all_rr("sqrt", 0, one_arg)

  do_all_pow_r2i()
}