/*
 * Copyright 2011 Christoph Bumiller
 *           2014 Red Hat Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "codegen/nv50_ir.h"
#include "codegen/nv50_ir_build_util.h"

#include "codegen/nv50_ir_target_nvc0.h"
#include "codegen/nv50_ir_lowering_gm107.h"

#include <limits>

namespace nv50_ir {

#define QOP_ADD  0
#define QOP_SUBR 1
#define QOP_SUB  2
#define QOP_MOV2 3

//             UL UR LL LR
#define QUADOP(q, r, s, t)                      \
   ((QOP_##q << 6) | (QOP_##r << 4) |           \
    (QOP_##s << 2) | (QOP_##t << 0))

bool
GM107LoweringPass::handleManualTXD(TexInstruction *i)
{
   static const uint8_t qOps[4][2] =
   {
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(MOV2, MOV2, ADD,  ADD) }, // l0
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD,  ADD) }, // l1
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
   };
   Value *def[4][4];
   Value *crd[3];
   Value *tmp;
   Instruction *tex, *add;
   Value *zero = bld.loadImm(bld.getSSA(), 0);
   int l, c;
   const int dim = i->tex.target.getDim();

   i->op = OP_TEX; // no need to clone dPdx/dPdy later

   for (c = 0; c < dim; ++c)
      crd[c] = bld.getScratch();
   tmp = bld.getScratch();

   for (l = 0; l < 4; ++l) {
      // mov coordinates from lane l to all lanes
      bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
      for (c = 0; c < dim; ++c) {
         bld.mkOp2(OP_SHFL, TYPE_F32, crd[c], i->getSrc(c), bld.mkImm(l));
         add = bld.mkOp2(OP_QUADOP, TYPE_F32, crd[c], crd[c], zero);
         add->subOp = 0x00;
         add->lanes = 1; /* abused for .ndv */
      }

      // add dPdx from lane l to lanes dx
      for (c = 0; c < dim; ++c) {
         bld.mkOp2(OP_SHFL, TYPE_F32, tmp, i->dPdx[c].get(), bld.mkImm(l));
         add = bld.mkOp2(OP_QUADOP, TYPE_F32, crd[c], tmp, crd[c]);
         add->subOp = qOps[l][0];
         add->lanes = 1; /* abused for .ndv */
      }

      // add dPdy from lane l to lanes dy
      for (c = 0; c < dim; ++c) {
         bld.mkOp2(OP_SHFL, TYPE_F32, tmp, i->dPdy[c].get(), bld.mkImm(l));
         add = bld.mkOp2(OP_QUADOP, TYPE_F32, crd[c], tmp, crd[c]);
         add->subOp = qOps[l][1];
         add->lanes = 1; /* abused for .ndv */
      }

      // texture
      bld.insert(tex = cloneForward(func, i));
      for (c = 0; c < dim; ++c)
         tex->setSrc(c, crd[c]);
      bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);

      // save results
      for (c = 0; i->defExists(c); ++c) {
         Instruction *mov;
         def[c][l] = bld.getSSA();
         mov = bld.mkMov(def[c][l], tex->getDef(c));
         mov->fixed = 1;
         mov->lanes = 1 << l;
      }
   }

   for (c = 0; i->defExists(c); ++c) {
      Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
      for (l = 0; l < 4; ++l)
         u->setSrc(l, def[c][l]);
   }

   i->bb->remove(i);
   return true;
}

bool
GM107LoweringPass::handleDFDX(Instruction *insn)
{
   Instruction *shfl;
   int qop = 0, xid = 0;

   switch (insn->op) {
   case OP_DFDX:
      qop = QUADOP(SUB, SUBR, SUB, SUBR);
      xid = 1;
      break;
   case OP_DFDY:
      qop = QUADOP(SUB, SUB, SUBR, SUBR);
      xid = 2;
      break;
   default:
      assert(!"invalid dfdx opcode");
      break;
   }

   shfl = bld.mkOp2(OP_SHFL, TYPE_F32, bld.getScratch(),
                    insn->getSrc(0), bld.mkImm(xid));
   shfl->subOp = NV50_IR_SUBOP_SHFL_BFLY;
   insn->op = OP_QUADOP;
   insn->subOp = qop;
   insn->lanes = 0; /* abused for !.ndv */
   insn->setSrc(1, insn->getSrc(0));
   insn->setSrc(0, shfl->getDef(0));
   return true;
}

bool
GM107LoweringPass::handlePFETCH(Instruction *i)
{
   Value *tmp0 = bld.getScratch();
   Value *tmp1 = bld.getScratch();
   Value *tmp2 = bld.getScratch();
   bld.mkOp1(OP_RDSV, TYPE_U32, tmp0, bld.mkSysVal(SV_INVOCATION_INFO, 0));
   bld.mkOp2(OP_SHR , TYPE_U32, tmp1, tmp0, bld.mkImm(16));
   bld.mkOp2(OP_AND , TYPE_U32, tmp0, tmp0, bld.mkImm(0xff));
   bld.mkOp2(OP_AND , TYPE_U32, tmp1, tmp1, bld.mkImm(0xff));
   bld.mkOp1(OP_MOV , TYPE_U32, tmp2, bld.mkImm(i->getSrc(0)->reg.data.u32));
   bld.mkOp3(OP_MAD , TYPE_U32, tmp0, tmp0, tmp1, tmp2);
   i->setSrc(0, tmp0);
   i->setSrc(1, NULL);
   return true;
}

bool
GM107LoweringPass::handlePOPCNT(Instruction *i)
{
   Value *tmp = bld.mkOp2v(OP_AND, i->sType, bld.getScratch(),
                           i->getSrc(0), i->getSrc(1));
   i->setSrc(0, tmp);
   i->setSrc(1, NULL);
   return TRUE;
}

//
// - add quadop dance for texturing
// - put FP outputs in GPRs
// - convert instruction sequences
//
bool
GM107LoweringPass::visit(Instruction *i)
{
   bld.setPosition(i, false);

   if (i->cc != CC_ALWAYS)
      checkPredicate(i);

   switch (i->op) {
   case OP_TEX:
   case OP_TXB:
   case OP_TXL:
   case OP_TXF:
   case OP_TXG:
      return handleTEX(i->asTex());
   case OP_TXD:
      return handleTXD(i->asTex());
   case OP_TXLQ:
      return handleTXLQ(i->asTex());
   case OP_TXQ:
      return handleTXQ(i->asTex());
   case OP_EX2:
      bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
      i->setSrc(0, i->getDef(0));
      break;
   case OP_POW:
      return handlePOW(i);
   case OP_DIV:
      return handleDIV(i);
   case OP_MOD:
      return handleMOD(i);
   case OP_SQRT:
      return handleSQRT(i);
   case OP_EXPORT:
      return handleEXPORT(i);
   case OP_PFETCH:
      return handlePFETCH(i);
   case OP_EMIT:
   case OP_RESTART:
      return handleOUT(i);
   case OP_RDSV:
      return handleRDSV(i);
   case OP_WRSV:
      return handleWRSV(i);
   case OP_LOAD:
      if (i->src(0).getFile() == FILE_SHADER_INPUT) {
         if (prog->getType() == Program::TYPE_COMPUTE) {
            i->getSrc(0)->reg.file = FILE_MEMORY_CONST;
            i->getSrc(0)->reg.fileIndex = 0;
         } else
         if (prog->getType() == Program::TYPE_GEOMETRY &&
             i->src(0).isIndirect(0)) {
            // XXX: this assumes vec4 units
            Value *ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
                                    i->getIndirect(0, 0), bld.mkImm(4));
            i->setIndirect(0, 0, ptr);
         } else {
            i->op = OP_VFETCH;
            assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP
         }
      }
      break;
   case OP_ATOM:
   {
      const bool cctl = i->src(0).getFile() == FILE_MEMORY_GLOBAL;
      handleATOM(i);
      handleCasExch(i, cctl);
   }
      break;
   case OP_SULDB:
   case OP_SULDP:
   case OP_SUSTB:
   case OP_SUSTP:
   case OP_SUREDB:
   case OP_SUREDP:
      handleSurfaceOpNVE4(i->asTex());
      break;
   case OP_DFDX:
   case OP_DFDY:
      handleDFDX(i);
      break;
   case OP_POPCNT:
      handlePOPCNT(i);
      break;
   default:
      break;
   }
   return true;
}

} // namespace nv50_ir
