////////////////////////////////////////////////////////////////////////////////
//
// The University of Illinois/NCSA
// Open Source License (NCSA)
//
// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved.
//
// Developed by:
//
//                 AMD Research and AMD HSA Software Development
//
//                 Advanced Micro Devices, Inc.
//
//                 www.amd.com
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal with the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
//  - Redistributions of source code must retain the above copyright notice,
//    this list of conditions and the following disclaimers.
//  - Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimers in
//    the documentation and/or other materials provided with the distribution.
//  - Neither the names of Advanced Micro Devices, Inc,
//    nor the names of its contributors may be used to endorse or promote
//    products derived from this Software without specific prior written
//    permission.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS WITH THE SOFTWARE.
//
////////////////////////////////////////////////////////////////////////////////

#define NOMINMAX
#include "image_manager_nv.h"

#include <assert.h>

#include <algorithm>
#include <climits>

#include "inc/hsa_ext_amd.h"
#include "core/inc/hsa_internal.h"
#include "addrlib/src/core/addrlib.h"
#include "image_runtime.h"
#include "resource.h"
#include "resource_nv.h"
#include "util.h"
#include "device_info.h"

namespace rocr {
namespace image {

//-----------------------------------------------------------------------------
// Workaround for the switch to combined format/type codes and the missing
// gfx10-specific lookup table.  Only covers types used in image_lut_kv.cpp.
//-----------------------------------------------------------------------------
// One row of the format conversion table: a (data format, numeric type) pair
// together with the single combined format code that stands for both.
struct formatconverstion_t {
  FMT fmt;        // Data format half of the lookup key.
  type type;      // Numeric type half of the lookup key.
  FORMAT format;  // Combined code produced for this (fmt, type) pair.
};

// Format/Type to combined format code table.
// Sorted and indexed to allow fast searches: rows that share an FMT are kept
// adjacent, and FormatEntryPoint records the index at which each FMT's rows
// begin.  Reordering rows therefore requires updating FormatEntryPoint too.
static const formatconverstion_t FormatLUT[] = {
  {FMT_1_5_5_5, TYPE_UNORM, CFMT_1_5_5_5_UNORM},
  {FMT_10_10_10_2, TYPE_UNORM, CFMT_10_10_10_2_UNORM},
  {FMT_10_10_10_2, TYPE_SNORM, CFMT_10_10_10_2_SNORM},
  {FMT_10_10_10_2, TYPE_UINT, CFMT_10_10_10_2_UINT},
  {FMT_10_10_10_2, TYPE_SINT, CFMT_10_10_10_2_SINT},
  {FMT_16, TYPE_UNORM, CFMT_16_UNORM},
  {FMT_16, TYPE_SNORM, CFMT_16_SNORM},
  {FMT_16, TYPE_UINT, CFMT_16_UINT},
  {FMT_16, TYPE_SINT, CFMT_16_SINT},
  {FMT_16, TYPE_FLOAT, CFMT_16_FLOAT},
  {FMT_16_16, TYPE_UNORM, CFMT_16_16_UNORM},
  {FMT_16_16, TYPE_SNORM, CFMT_16_16_SNORM},
  {FMT_16_16, TYPE_UINT, CFMT_16_16_UINT},
  {FMT_16_16, TYPE_SINT, CFMT_16_16_SINT},
  {FMT_16_16, TYPE_FLOAT, CFMT_16_16_FLOAT},
  {FMT_16_16_16_16, TYPE_UNORM, CFMT_16_16_16_16_UNORM},
  {FMT_16_16_16_16, TYPE_SNORM, CFMT_16_16_16_16_SNORM},
  {FMT_16_16_16_16, TYPE_UINT, CFMT_16_16_16_16_UINT},
  {FMT_16_16_16_16, TYPE_SINT, CFMT_16_16_16_16_SINT},
  {FMT_16_16_16_16, TYPE_FLOAT, CFMT_16_16_16_16_FLOAT},
  {FMT_2_10_10_10, TYPE_UNORM, CFMT_2_10_10_10_UNORM},
  {FMT_2_10_10_10, TYPE_SNORM, CFMT_2_10_10_10_SNORM},
  {FMT_2_10_10_10, TYPE_UINT, CFMT_2_10_10_10_UINT},
  {FMT_2_10_10_10, TYPE_SINT, CFMT_2_10_10_10_SINT},
  {FMT_24_8, TYPE_UNORM, CFMT_24_8_UNORM},
  {FMT_24_8, TYPE_UINT, CFMT_24_8_UINT},
  {FMT_32, TYPE_UINT, CFMT_32_UINT},
  {FMT_32, TYPE_SINT, CFMT_32_SINT},
  {FMT_32, TYPE_FLOAT, CFMT_32_FLOAT},
  {FMT_32_32, TYPE_UINT, CFMT_32_32_UINT},
  {FMT_32_32, TYPE_SINT, CFMT_32_32_SINT},
  {FMT_32_32, TYPE_FLOAT, CFMT_32_32_FLOAT},
  {FMT_32_32_32, TYPE_UINT, CFMT_32_32_32_UINT},
  {FMT_32_32_32, TYPE_SINT, CFMT_32_32_32_SINT},
  {FMT_32_32_32, TYPE_FLOAT, CFMT_32_32_32_FLOAT},
  {FMT_32_32_32_32, TYPE_UINT, CFMT_32_32_32_32_UINT},
  {FMT_32_32_32_32, TYPE_SINT, CFMT_32_32_32_32_SINT},
  {FMT_32_32_32_32, TYPE_FLOAT, CFMT_32_32_32_32_FLOAT},
  {FMT_5_5_5_1, TYPE_UNORM, CFMT_5_5_5_1_UNORM},
  {FMT_5_6_5, TYPE_UNORM, CFMT_5_6_5_UNORM},
  {FMT_8, TYPE_UNORM, CFMT_8_UNORM},
  {FMT_8, TYPE_SNORM, CFMT_8_SNORM},
  {FMT_8, TYPE_UINT, CFMT_8_UINT},
  {FMT_8, TYPE_SINT, CFMT_8_SINT},
  {FMT_8, TYPE_SRGB, CFMT_8_SRGB},
  {FMT_8_24, TYPE_UNORM, CFMT_8_24_UNORM},
  {FMT_8_24, TYPE_UINT, CFMT_8_24_UINT},
  {FMT_8_8, TYPE_UNORM, CFMT_8_8_UNORM},
  {FMT_8_8, TYPE_SNORM, CFMT_8_8_SNORM},
  {FMT_8_8, TYPE_UINT, CFMT_8_8_UINT},
  {FMT_8_8, TYPE_SINT, CFMT_8_8_SINT},
  {FMT_8_8, TYPE_SRGB, CFMT_8_8_SRGB},
  {FMT_8_8_8_8, TYPE_UNORM, CFMT_8_8_8_8_UNORM},
  {FMT_8_8_8_8, TYPE_SNORM, CFMT_8_8_8_8_SNORM},
  {FMT_8_8_8_8, TYPE_UINT, CFMT_8_8_8_8_UINT},
  {FMT_8_8_8_8, TYPE_SINT, CFMT_8_8_8_8_SINT},
  {FMT_8_8_8_8, TYPE_SRGB, CFMT_8_8_8_8_SRGB}
};
// Number of rows in FormatLUT.
static const int FormatLUTSize = sizeof(FormatLUT)/sizeof(formatconverstion_t);

// Index in FormatLUT to start search, indexed by FMT enum.
// An entry equal to the number of rows in FormatLUT (57) means the FMT has no
// rows at all; GetCombinedFormat's search window is then empty and it reports
// CFMT_INVALID.
static const int FormatEntryPoint[] = {
  57,  // no rows in FormatLUT
  40,  // first FMT_8 row
  5,   // first FMT_16 row
  47,  // first FMT_8_8 row
  26,  // first FMT_32 row
  10,  // first FMT_16_16 row
  57,  // no rows in FormatLUT
  57,  // no rows in FormatLUT
  1,   // first FMT_10_10_10_2 row
  20,  // first FMT_2_10_10_10 row
  52,  // first FMT_8_8_8_8 row
  29,  // first FMT_32_32 row
  15,  // first FMT_16_16_16_16 row
  32,  // first FMT_32_32_32 row
  35,  // first FMT_32_32_32_32 row
  57,  // no rows in FormatLUT
  39,  // FMT_5_6_5 row
  0,   // FMT_1_5_5_5 row
  38,  // FMT_5_5_5_1 row
  57,  // no rows in FormatLUT
  45,  // first FMT_8_24 row
  24   // first FMT_24_8 row
};

// Translates a separate data format / numeric type pair into the combined
// format code listed in FormatLUT.
//
//   fmt  - FMT enum value; used as the index into FormatEntryPoint.
//   type - numeric type value matched against the rows for that FMT.
//
// Returns the combined FORMAT of the matching row, or CFMT_INVALID when fmt
// lies outside FormatEntryPoint or no row matches the pair.
static FORMAT GetCombinedFormat(uint8_t fmt, uint8_t type) {
  const size_t entry_count =
      sizeof(FormatEntryPoint) / sizeof(FormatEntryPoint[0]);
  assert(fmt < entry_count && "FMT out of range.");
  // assert() disappears in release builds; never index past the table.
  if (fmt >= entry_count) {
    return CFMT_INVALID;
  }

  // Rows for one FMT are adjacent, so only a short window has to be scanned.
  // Only 6 types are used in image_lut_kv.cpp.
  const int first = FormatEntryPoint[fmt];
  const int last = std::min(first + 6, FormatLUTSize);

  for (int row = first; row < last; ++row) {
    if ((FormatLUT[row].fmt == fmt) && (FormatLUT[row].type == type)) {
      return FormatLUT[row].format;
    }
  }
  return CFMT_INVALID;
}
//-----------------------------------------------------------------------------
// End workaround 
//-----------------------------------------------------------------------------

// Default constructor; does nothing beyond running the ImageManagerKv base
// constructor.
ImageManagerNv::ImageManagerNv() : ImageManagerKv() {}

// Destructor; has no work of its own.
ImageManagerNv::~ImageManagerNv() {}

// TODO(cfreehil) remove from class, make it a utility function
//
// Computes the backing-store size and base alignment for an image.
//
//   component              - agent the image is created for.
//   desc                   - image geometry, dimensions and format.
//   image_data_layout      - requested layout (opaque or linear).
//   image_data_row_pitch   - caller supplied row pitch in bytes, 0 if none.
//   image_data_slice_pitch - caller supplied slice pitch in bytes, 0 if none.
//   image_info             - receives the size and alignment on success.
//
// Returns HSA_STATUS_SUCCESS on success, the failing status if the agent
// profile cannot be queried, HSA_STATUS_ERROR if the surface computation
// fails, or HSA_EXT_STATUS_ERROR_IMAGE_PITCH_UNSUPPORTED if a linear layout
// was requested with a pitch that differs from the computed one.
hsa_status_t ImageManagerNv::CalculateImageSizeAndAlignment(
    hsa_agent_t component, const hsa_ext_image_descriptor_t& desc,
    hsa_ext_image_data_layout_t image_data_layout,
    size_t image_data_row_pitch,
    size_t image_data_slice_pitch,
    hsa_ext_image_data_info_t& image_info) const {
  ADDR2_COMPUTE_SURFACE_INFO_OUTPUT out = {0};

  // The profile decides whether an opaque layout is tiled; do not read it
  // unless the query actually succeeded.
  hsa_profile_t profile;
  const hsa_status_t status =
      HSA::hsa_agent_get_info(component, HSA_AGENT_INFO_PROFILE, &profile);
  if (status != HSA_STATUS_SUCCESS) {
    return status;
  }

  // Only an opaque layout on a base-profile agent is tiled, and never for
  // 1DB images; everything else stays linear.
  Image::TileMode tileMode = Image::TileMode::LINEAR;
  if (image_data_layout == HSA_EXT_IMAGE_DATA_LAYOUT_OPAQUE) {
    tileMode = (profile == HSA_PROFILE_BASE &&
                desc.geometry != HSA_EXT_IMAGE_GEOMETRY_1DB)?
      Image::TileMode::TILED : Image::TileMode::LINEAR;
  }
  if (GetAddrlibSurfaceInfoNv(component, desc, tileMode,
        image_data_row_pitch, image_data_slice_pitch, out) ==
                                                             (uint32_t)(-1)) {
    return HSA_STATUS_ERROR;
  }

  // Pitches in bytes as computed for the surface.
  size_t rowPitch   = (out.bpp >> 3) * out.pitch;
  size_t slicePitch = rowPitch * out.height;
  // A non-zero caller pitch on a linear (non-1DB) image must match exactly.
  if (desc.geometry != HSA_EXT_IMAGE_GEOMETRY_1DB &&
      image_data_layout == HSA_EXT_IMAGE_DATA_LAYOUT_LINEAR &&
      ((image_data_row_pitch && (rowPitch != image_data_row_pitch)) ||
       (image_data_slice_pitch && (slicePitch != image_data_slice_pitch)))) {
    return static_cast<hsa_status_t>(
                                HSA_EXT_STATUS_ERROR_IMAGE_PITCH_UNSUPPORTED);
  }

  image_info.size = out.surfSize;
  assert(image_info.size != 0);
  image_info.alignment = out.baseAlign;
  assert(image_info.alignment != 0);

  return HSA_STATUS_SUCCESS;
}

// Reports whether the given address is local memory.  This implementation
// does not inspect the address and always answers true.
bool ImageManagerNv::IsLocalMemory(const void* address) const {
  return true;
}

// Builds the resource descriptor (image.srd) for an image whose descriptor
// words are supplied by imported metadata.
//
// The eight metadata words are copied into image.srd and then patched with
// the image's base address, the combined format code and the channel swizzle.
// For non-1DB images the metadata address stored in words 6/7 is treated as
// an offset and rebased onto the image address.
//
//   image      - image to populate; srd, row_pitch and slice_pitch are written.
//   descriptor - imported metadata, reinterpreted as metadata_amd_nv_t.
//
// Returns HSA_STATUS_SUCCESS, or HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED
// when the format is not supported or its element size disagrees with the
// pixel size reported by the lookup table.
hsa_status_t ImageManagerNv::PopulateImageSrd(Image& image,
                                     const metadata_amd_t* descriptor) const {
  const metadata_amd_nv_t* desc =
                       reinterpret_cast<const metadata_amd_nv_t*>(descriptor);
  const void* image_data_addr = image.data;

  ImageProperty image_prop =
                 image_lut_.MapFormat(image.desc.format, image.desc.geometry);
  if ((image_prop.cap == HSA_EXT_IMAGE_CAPABILITY_NOT_SUPPORTED) ||
     (image_prop.element_size == 0))
    return (hsa_status_t)HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED;

  const Swizzle swizzle =
                       image_lut_.MapSwizzle(image.desc.format.channel_order);

  // Local memory is addressed relative to the local memory base.
  if (IsLocalMemory(image.data)) {
    image_data_addr = reinterpret_cast<const void*>(
        reinterpret_cast<uintptr_t>(image.data) - local_memory_base_address_);
  }

  // Start from the imported words, then patch the fields owned by the runtime.
  image.srd[0] = desc->word0.u32All;
  image.srd[1] = desc->word1.u32All;
  image.srd[2] = desc->word2.u32All;
  image.srd[3] = desc->word3.u32All;
  image.srd[4] = desc->word4.u32All;
  image.srd[5] = desc->word5.u32All;
  image.srd[6] = desc->word6.u32All;
  image.srd[7] = desc->word7.u32All;

  if (image.desc.geometry == HSA_EXT_IMAGE_GEOMETRY_1DB) {
    // 1DB images use the buffer descriptor layout; word 2 is kept exactly as
    // imported.
    SQ_BUF_RSRC_WORD0 word0;
    SQ_BUF_RSRC_WORD1 word1;
    SQ_BUF_RSRC_WORD3 word3;

    word0.val = 0;
    word0.f.BASE_ADDRESS = PtrLow32(image_data_addr);

    word1.val = image.srd[1];
    word1.f.BASE_ADDRESS_HI = PtrHigh32(image_data_addr);
    word1.f.STRIDE = image_prop.element_size;

    word3.val = image.srd[3];
    word3.f.DST_SEL_X = swizzle.x;
    word3.f.DST_SEL_Y = swizzle.y;
    word3.f.DST_SEL_Z = swizzle.z;
    word3.f.DST_SEL_W = swizzle.w;

    word3.f.FORMAT = GetCombinedFormat(image_prop.data_format, image_prop.data_type);
    word3.f.INDEX_STRIDE = image_prop.element_size;

    image.srd[0] = word0.val;
    image.srd[1] = word1.val;
    image.srd[3] = word3.val;
  } else {
    uint32_t hwPixelSize = image_lut_.GetPixelSize(image_prop.data_format, image_prop.data_type);

    if (image_prop.element_size != hwPixelSize) {
      return (hsa_status_t)HSA_EXT_STATUS_ERROR_IMAGE_FORMAT_UNSUPPORTED;
    }

    // Typed views over the words being patched in place.
    SQ_IMG_RSRC_WORD0* word0 = reinterpret_cast<SQ_IMG_RSRC_WORD0*>(&image.srd[0]);
    SQ_IMG_RSRC_WORD1* word1 = reinterpret_cast<SQ_IMG_RSRC_WORD1*>(&image.srd[1]);
    SQ_IMG_RSRC_WORD3* word3 = reinterpret_cast<SQ_IMG_RSRC_WORD3*>(&image.srd[3]);
    SQ_IMG_RSRC_WORD6* word6 = reinterpret_cast<SQ_IMG_RSRC_WORD6*>(&image.srd[6]);
    SQ_IMG_RSRC_WORD7* word7 = reinterpret_cast<SQ_IMG_RSRC_WORD7*>(&image.srd[7]);

    word0->bits.BASE_ADDRESS = PtrLow40Shift8(image_data_addr);
    word1->bits.BASE_ADDRESS_HI = PtrHigh64Shift40(image_data_addr);
    word1->bits.FORMAT = GetCombinedFormat(image_prop.data_format, image_prop.data_type);
    word3->bits.DST_SEL_X = swizzle.x;
    word3->bits.DST_SEL_Y = swizzle.y;
    word3->bits.DST_SEL_Z = swizzle.z;
    word3->bits.DST_SEL_W = swizzle.w;
    // Only the 1D geometries have the resource type overridden here; other
    // geometries keep the type from the imported word.
    if (image.desc.geometry == HSA_EXT_IMAGE_GEOMETRY_1DA ||
        image.desc.geometry == HSA_EXT_IMAGE_GEOMETRY_1D) {
      word3->bits.TYPE = image_lut_.MapGeometry(image.desc.geometry);
    }

    // Imported metadata holds the offset to metadata, add the image base address.
    uintptr_t meta = uintptr_t(word7->bits.META_DATA_ADDRESS_HI) << 16;
    meta |= uintptr_t(word6->bits.META_DATA_ADDRESS) << 8;
    meta += reinterpret_cast<uintptr_t>(image_data_addr);

    word6->bits.META_DATA_ADDRESS = PtrLow16Shift8((void*)meta);
    word7->bits.META_DATA_ADDRESS_HI = PtrHigh64Shift16((void*)meta);
  }
  // Looks like this is only used for CPU copies.
  image.row_pitch = 0;
  image.slice_pitch = 0;

  // Used by HSAIL shader ABI
  image.srd[8] = image.desc.format.channel_type;
  image.srd[9] = image.desc.format.channel_order;
  image.srd[10] = static_cast<uint32_t>(image.desc.width);

  return HSA_STATUS_SUCCESS;
}

// Picks the border-color swizzle that goes with a channel swizzle.
//
// The decision is keyed on which output channel selects source X, checked in
// the order alpha, red, green, blue.  If none of them selects X the identity
// swizzle (XYZW) is returned.
static TEX_BC_SWIZZLE GetBcSwizzle(const Swizzle& swizzle) {
  const SEL red = (SEL)swizzle.x;
  const SEL green = (SEL)swizzle.y;
  const SEL blue = (SEL)swizzle.z;
  const SEL alpha = (SEL)swizzle.w;

  if (alpha == SEL_X) {
    // Candidates are TEX_BC_Swizzle_WZYX and TEX_BC_Swizzle_WXYZ.
    //
    // With the pre-defined border colors (white, opaque black, transparent
    // black) the RGB channels are all equal, so all that matters is where
    // alpha ends up and either enumeration works.  Behavior with border
    // color palettes is not known.
    if (blue == SEL_Y) {
      return TEX_BC_Swizzle_WZYX;  // ABGR
    }
    const bool every_channel_is_x =
        (red == SEL_X) && (green == SEL_X) && (blue == SEL_X);
    return every_channel_is_x ? TEX_BC_Swizzle_XYZW   // RGBA
                              : TEX_BC_Swizzle_WXYZ;  // ARGB
  }

  if (red == SEL_X) {
    // Candidates are TEX_BC_Swizzle_XYZW and TEX_BC_Swizzle_XWYZ.
    if (green == SEL_Y) {
      return TEX_BC_Swizzle_XYZW;  // RGBA
    }
    const bool replicated_with_alpha =
        (green == SEL_X) && (blue == SEL_X) && (alpha == SEL_W);
    return replicated_with_alpha ? TEX_BC_Swizzle_XYZW   // RGBA
                                 : TEX_BC_Swizzle_XWYZ;  // RAGB
  }

  if (green == SEL_X) {
    return TEX_BC_Swizzle_YXWZ;  // GRAB
  }

  if (blue == SEL_X) {
    return TEX_BC_Swizzle_ZYXW;  // BGRA
  }

  return TEX_BC_Swizzle_XYZW;
}


// Builds the resource descriptor (image.srd) for an image from its own
// description, and records the row and slice pitch of its storage.
//
// 1DB images get a four-word buffer descriptor; every other geometry gets an
// eight-word image descriptor whose layout is obtained from
// GetAddrlibSurfaceInfoNv.  Words 8-10 carry the channel type, channel order
// and width.
//
//   image - image to populate; desc, data, component, tile_mode and the
//           current row_pitch/slice_pitch are read, srd and the pitches are
//           written.
//
// Returns HSA_STATUS_SUCCESS, or HSA_STATUS_ERROR if the surface layout could
// not be computed.
hsa_status_t ImageManagerNv::PopulateImageSrd(Image& image) const {
  ImageProperty image_prop =
      image_lut_.MapFormat(image.desc.format, image.desc.geometry);
  assert(image_prop.cap != HSA_EXT_IMAGE_CAPABILITY_NOT_SUPPORTED);
  assert(image_prop.element_size != 0);

  const void* image_data_addr = image.data;

  // Local memory is addressed relative to the local memory base.
  if (IsLocalMemory(image.data)) {
    image_data_addr = reinterpret_cast<const void*>(
        reinterpret_cast<uintptr_t>(image.data) - local_memory_base_address_);
  }

  if (image.desc.geometry == HSA_EXT_IMAGE_GEOMETRY_1DB) {
    SQ_BUF_RSRC_WORD0 word0;
    SQ_BUF_RSRC_WORD1 word1;
    SQ_BUF_RSRC_WORD2 word2;
    SQ_BUF_RSRC_WORD3 word3;

    word0.val = 0;
    word0.f.BASE_ADDRESS = PtrLow32(image_data_addr);

    word1.val = 0;
    word1.f.BASE_ADDRESS_HI = PtrHigh32(image_data_addr);
    word1.f.STRIDE = image_prop.element_size;
    word1.f.SWIZZLE_ENABLE = false;
    word1.f.CACHE_SWIZZLE = false;

    // Clear the whole word first so no bit is left indeterminate.
    word2.val = 0;
    word2.f.NUM_RECORDS = image.desc.width * image_prop.element_size;

    const Swizzle swizzle =
        image_lut_.MapSwizzle(image.desc.format.channel_order);
    word3.val = 0;
    word3.f.RESOURCE_LEVEL = 1;
    word3.f.DST_SEL_X = swizzle.x;
    word3.f.DST_SEL_Y = swizzle.y;
    word3.f.DST_SEL_Z = swizzle.z;
    word3.f.DST_SEL_W = swizzle.w;
    word3.f.FORMAT = GetCombinedFormat(image_prop.data_format, image_prop.data_type);
    word3.f.INDEX_STRIDE = image_prop.element_size;
    word3.f.TYPE = image_lut_.MapGeometry(image.desc.geometry);

    image.srd[0] = word0.val;
    image.srd[1] = word1.val;
    image.srd[2] = word2.val;
    image.srd[3] = word3.val;

    image.row_pitch = image.desc.width * image_prop.element_size;
    image.slice_pitch = image.row_pitch;
  } else {
    SQ_IMG_RSRC_WORD0 word0;
    SQ_IMG_RSRC_WORD1 word1;
    SQ_IMG_RSRC_WORD2 word2;
    SQ_IMG_RSRC_WORD3 word3;
    SQ_IMG_RSRC_WORD4 word4;

    ADDR2_COMPUTE_SURFACE_INFO_OUTPUT out = {0};

    uint32_t swizzleMode = GetAddrlibSurfaceInfoNv(
         image.component, image.desc, image.tile_mode,
                                     image.row_pitch, image.slice_pitch, out);
    if (swizzleMode == (uint32_t)(-1)) {
      return HSA_STATUS_ERROR;
    }

    assert((out.bpp / 8) == image_prop.element_size);

    const size_t row_pitch_size = out.pitch * image_prop.element_size;

    // Clear the whole word first so no bit is left indeterminate.
    word0.val = 0;
    word0.f.BASE_ADDRESS = PtrLow40Shift8(image_data_addr);

    word1.val = 0;
    word1.f.BASE_ADDRESS_HI = PtrHigh64Shift40(image_data_addr);
    word1.f.MIN_LOD = 0;
    word1.f.FORMAT = GetCombinedFormat(image_prop.data_format, image_prop.data_type);
    // Only take the lowest 2 bits of (image.desc.width - 1)
    word1.f.WIDTH = BitSelect<0, 1>(image.desc.width - 1);

    word2.val = 0;
    // Take the high 12 bits of (image.desc.width - 1)
    word2.f.WIDTH_HI = BitSelect<2, 13>(image.desc.width - 1);
    word2.f.HEIGHT = image.desc.height ? image.desc.height - 1 : 0;
    word2.f.RESOURCE_LEVEL = 1;

    const Swizzle swizzle =
        image_lut_.MapSwizzle(image.desc.format.channel_order);
    word3.val = 0;
    word3.f.DST_SEL_X = swizzle.x;
    word3.f.DST_SEL_Y = swizzle.y;
    word3.f.DST_SEL_Z = swizzle.z;
    word3.f.DST_SEL_W = swizzle.w;
    word3.f.SW_MODE = swizzleMode;
    word3.f.BC_SWIZZLE = GetBcSwizzle(swizzle);
    word3.f.TYPE = image_lut_.MapGeometry(image.desc.geometry);

    const bool image_array =
        (image.desc.geometry == HSA_EXT_IMAGE_GEOMETRY_1DA ||
         image.desc.geometry == HSA_EXT_IMAGE_GEOMETRY_2DA ||
         image.desc.geometry == HSA_EXT_IMAGE_GEOMETRY_2DADEPTH);
    const bool image_3d = (image.desc.geometry == HSA_EXT_IMAGE_GEOMETRY_3D);

    // DEPTH holds (array size - 1) for arrays, (depth - 1) for 3D images and
    // 0 otherwise.
    word4.val = 0;
    word4.f.DEPTH =
        (image_array) // Doesn't hurt but isn't array_size already >0?
            ? std::max(image.desc.array_size, static_cast<size_t>(1)) - 1
            : (image_3d) ? image.desc.depth - 1 : 0;
    uint32_t minor_ver = MinorVerFromDevID(chip_id_);
    // For 1d, 2d and 2d-msaa in gfx1030 and beyond this is pitch-1
    if ((minor_ver >= 3) && !image_array && !image_3d)
      word4.f.PITCH = out.pitch - 1;

    image.srd[0] = word0.val;
    image.srd[1] = word1.val;
    image.srd[2] = word2.val;
    image.srd[3] = word3.val;
    image.srd[4] = word4.val;
    // Words 5-7 are left all-zero.
    image.srd[5] = 0;
    image.srd[6] = 0;
    image.srd[7] = 0;

    image.row_pitch = row_pitch_size;
    image.slice_pitch = out.sliceSize;
  }

  image.srd[8] = image.desc.format.channel_type;
  image.srd[9] = image.desc.format.channel_order;
  image.srd[10] = static_cast<uint32_t>(image.desc.width);

  return HSA_STATUS_SUCCESS;
}

// Re-targets an already populated descriptor to a new image format.
//
// The image's format is replaced by new_format, then the combined format code
// and the channel swizzle in the descriptor are rewritten to match.  1DB
// images keep both in buffer word 3; other geometries keep the format in
// image word 1 and the swizzle in image word 3.  Words 8-10 are refreshed as
// well.  Always returns HSA_STATUS_SUCCESS.
hsa_status_t ImageManagerNv::ModifyImageSrd(
    Image& image, hsa_ext_image_format_t& new_format) const {
  image.desc.format = new_format;

  ImageProperty image_prop =
      image_lut_.MapFormat(image.desc.format, image.desc.geometry);
  assert(image_prop.cap != HSA_EXT_IMAGE_CAPABILITY_NOT_SUPPORTED);
  assert(image_prop.element_size != 0);

  // Both descriptor layouts need the same two lookups.
  const Swizzle channel_map =
      image_lut_.MapSwizzle(image.desc.format.channel_order);
  const FORMAT combined_format =
      GetCombinedFormat(image_prop.data_format, image_prop.data_type);

  if (image.desc.geometry != HSA_EXT_IMAGE_GEOMETRY_1DB) {
    SQ_IMG_RSRC_WORD1* img_word1 =
        reinterpret_cast<SQ_IMG_RSRC_WORD1*>(&image.srd[1]);
    SQ_IMG_RSRC_WORD3* img_word3 =
        reinterpret_cast<SQ_IMG_RSRC_WORD3*>(&image.srd[3]);

    img_word1->bits.FORMAT = combined_format;
    img_word3->bits.DST_SEL_X = channel_map.x;
    img_word3->bits.DST_SEL_Y = channel_map.y;
    img_word3->bits.DST_SEL_Z = channel_map.z;
    img_word3->bits.DST_SEL_W = channel_map.w;
  } else {
    SQ_BUF_RSRC_WORD3* buf_word3 =
        reinterpret_cast<SQ_BUF_RSRC_WORD3*>(&image.srd[3]);

    buf_word3->bits.DST_SEL_X = channel_map.x;
    buf_word3->bits.DST_SEL_Y = channel_map.y;
    buf_word3->bits.DST_SEL_Z = channel_map.z;
    buf_word3->bits.DST_SEL_W = channel_map.w;
    buf_word3->bits.FORMAT = combined_format;
  }

  image.srd[8] = image.desc.format.channel_type;
  image.srd[9] = image.desc.format.channel_order;
  image.srd[10] = static_cast<uint32_t>(image.desc.width);

  return HSA_STATUS_SUCCESS;
}

// Fills sampler.srd (four words) from sampler.desc.
//
// The addressing mode is applied to all three clamp fields, the filter mode
// to both the magnification and minification filter, and the coordinate mode
// decides FORCE_UNNORMALIZED.  Returns HSA_STATUS_ERROR_INVALID_ARGUMENT for
// an unrecognized addressing or filter mode, in which case sampler.srd is
// left untouched; otherwise HSA_STATUS_SUCCESS.
hsa_status_t ImageManagerNv::PopulateSamplerSrd(Sampler& sampler) const {
  const hsa_ext_sampler_descriptor_t& desc = sampler.desc;

  // Translate the addressing mode; an undefined mode is treated as repeat.
  int clamp_mode = 0;
  switch (desc.address_mode) {
    case HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE:
      clamp_mode = static_cast<int>(SQ_TEX_CLAMP_LAST_TEXEL);
      break;
    case HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER:
      clamp_mode = static_cast<int>(SQ_TEX_CLAMP_BORDER);
      break;
    case HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT:
      clamp_mode = static_cast<int>(SQ_TEX_MIRROR);
      break;
    case HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED:
    case HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT:
      clamp_mode = static_cast<int>(SQ_TEX_WRAP);
      break;
    default:
      return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }

  // Translate the filter mode.
  int xy_filter = 0;
  switch (desc.filter_mode) {
    case HSA_EXT_SAMPLER_FILTER_MODE_NEAREST:
      xy_filter = static_cast<int>(SQ_TEX_XY_FILTER_POINT);
      break;
    case HSA_EXT_SAMPLER_FILTER_MODE_LINEAR:
      xy_filter = static_cast<int>(SQ_TEX_XY_FILTER_BILINEAR);
      break;
    default:
      return HSA_STATUS_ERROR_INVALID_ARGUMENT;
  }

  SQ_IMG_SAMP_WORD0 clamp_word;
  clamp_word.u32All = 0;
  clamp_word.bits.CLAMP_X = clamp_mode;
  clamp_word.bits.CLAMP_Y = clamp_word.bits.CLAMP_X;
  clamp_word.bits.CLAMP_Z = clamp_word.bits.CLAMP_X;
  clamp_word.bits.FORCE_UNNORMALIZED =
      (desc.coordinate_mode == HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED);

  SQ_IMG_SAMP_WORD1 lod_word;
  lod_word.u32All = 0;
  lod_word.bits.MAX_LOD = 4095;

  SQ_IMG_SAMP_WORD2 filter_word;
  filter_word.u32All = 0;
  filter_word.bits.XY_MAG_FILTER = xy_filter;
  filter_word.bits.XY_MIN_FILTER = filter_word.bits.XY_MAG_FILTER;
  filter_word.bits.Z_FILTER = SQ_TEX_Z_FILTER_NONE;
  filter_word.bits.MIP_FILTER = SQ_TEX_MIP_FILTER_NONE;

  SQ_IMG_SAMP_WORD3 border_word;
  border_word.u32All = 0;
  // TODO: check this bit with HSAIL spec.
  border_word.bits.BORDER_COLOR_TYPE = SQ_TEX_BORDER_COLOR_TRANS_BLACK;

  sampler.srd[0] = clamp_word.u32All;
  sampler.srd[1] = lod_word.u32All;
  sampler.srd[2] = filter_word.u32All;
  sampler.srd[3] = border_word.u32All;

  return HSA_STATUS_SUCCESS;
}

// Asks addrlib for the preferred swizzle mode and the resulting surface
// layout of an image.
//
//   component              - agent used to look up the image property.
//   desc                   - image geometry, dimensions and format.
//   tileMode               - LINEAR forbids every block type except linear;
//                            any other mode only forbids the 64KB blocks.
//   image_data_row_pitch   - caller row pitch in bytes; only used on devices
//                            whose minor version is 3 or higher.
//   image_data_slice_pitch - not used by this function.
//   out                    - receives the computed surface information.
//
// Returns the chosen swizzle mode, or (uint32_t)(-1) if the format has no
// element size, an addrlib call fails, or the computed surface size is zero.
uint32_t ImageManagerNv::GetAddrlibSurfaceInfoNv(
    hsa_agent_t component, const hsa_ext_image_descriptor_t& desc,
    Image::TileMode tileMode,
    size_t image_data_row_pitch,
    size_t image_data_slice_pitch,
    ADDR2_COMPUTE_SURFACE_INFO_OUTPUT& out) const {
  const ImageProperty image_prop =
      GetImageProperty(component, desc.format, desc.geometry);

  // A zero element size would make the pitch division below divide by zero
  // and cannot describe a surface anyway.
  if (image_prop.element_size == 0) {
    return (uint32_t)(-1);
  }

  const AddrFormat addrlib_format = GetAddrlibFormat(image_prop);

  const uint32_t width = static_cast<uint32_t>(desc.width);
  const uint32_t height = static_cast<uint32_t>(desc.height);
  // Slice count is the larger of array size and depth, but at least one.
  static const size_t kMinNumSlice = 1;
  const uint32_t num_slice = static_cast<uint32_t>(
      std::max(kMinNumSlice, std::max(desc.array_size, desc.depth)));

  uint32_t minor_ver = MinorVerFromDevID(chip_id_);
  ADDR2_COMPUTE_SURFACE_INFO_INPUT in = {0};
  in.size = sizeof(ADDR2_COMPUTE_SURFACE_INFO_INPUT);
  in.format = addrlib_format;
  in.bpp = static_cast<unsigned int>(image_prop.element_size) * 8;
  in.width = width;
  in.height = height;
  in.numSlices = num_slice;
  // Custom Pitch is supported in gfx1030 and beyond
  if (minor_ver >= 3)
    in.pitchInElement = image_data_row_pitch / image_prop.element_size;
  switch (desc.geometry) {
    case HSA_EXT_IMAGE_GEOMETRY_1D:
    case HSA_EXT_IMAGE_GEOMETRY_1DB:
    case HSA_EXT_IMAGE_GEOMETRY_1DA:
      in.resourceType = ADDR_RSRC_TEX_1D;
      break;

    case HSA_EXT_IMAGE_GEOMETRY_2D:
    case HSA_EXT_IMAGE_GEOMETRY_2DDEPTH:
    case HSA_EXT_IMAGE_GEOMETRY_2DA:
    case HSA_EXT_IMAGE_GEOMETRY_2DADEPTH:
      in.resourceType = ADDR_RSRC_TEX_2D;
      break;

    case HSA_EXT_IMAGE_GEOMETRY_3D:
      in.resourceType = ADDR_RSRC_TEX_3D;
      break;
  }
  in.flags.texture = 1;

  // Mirror the surface description into the preferred-setting query.
  ADDR2_GET_PREFERRED_SURF_SETTING_INPUT  prefSettingsInput = { 0 };
  ADDR2_GET_PREFERRED_SURF_SETTING_OUTPUT prefSettingsOutput = { 0 };

  prefSettingsInput.size            = sizeof(prefSettingsInput);
  prefSettingsInput.flags           = in.flags;
  prefSettingsInput.bpp             = in.bpp;
  prefSettingsInput.format          = in.format;
  prefSettingsInput.width           = in.width;
  prefSettingsInput.height          = in.height;
  prefSettingsInput.numFrags        = in.numFrags;
  prefSettingsInput.numSamples      = in.numSamples;
  prefSettingsInput.numMipLevels    = in.numMipLevels;
  prefSettingsInput.numSlices       = in.numSlices;
  prefSettingsInput.resourceLoction = ADDR_RSRC_LOC_UNDEF;
  prefSettingsInput.resourceType    = in.resourceType;

  // Disallow all swizzles but linear.
  if (tileMode == Image::TileMode::LINEAR) {
      prefSettingsInput.forbiddenBlock.macroThin4KB = 1;
      prefSettingsInput.forbiddenBlock.macroThick4KB = 1;
      prefSettingsInput.forbiddenBlock.macroThin64KB = 1;
      prefSettingsInput.forbiddenBlock.macroThick64KB = 1;
      prefSettingsInput.forbiddenBlock.micro = 1;
      prefSettingsInput.forbiddenBlock.var = 1;
  } else {
      // Debug setting, simplifies buffer alignment until language runtimes have official gfx10
      // support.
      prefSettingsInput.forbiddenBlock.macroThin64KB = 1;
      prefSettingsInput.forbiddenBlock.macroThick64KB = 1;
  }

  // but don't ever allow the 256b swizzle modes
  //prefSettingsInput.forbiddenBlock.micro = 1;
  // and don't allow variable-size block modes
  //prefSettingsInput.forbiddenBlock.var = 1;

  if (ADDR_OK != Addr2GetPreferredSurfaceSetting(addr_lib_,
                                   &prefSettingsInput, &prefSettingsOutput)) {
    return (uint32_t)(-1);
  }

  in.swizzleMode = prefSettingsOutput.swizzleMode;

  out.size = sizeof(ADDR2_COMPUTE_SURFACE_INFO_OUTPUT);
  if (ADDR_OK != Addr2ComputeSurfaceInfo(addr_lib_, &in, &out)) {
    return (uint32_t)(-1);
  }
  // A zero-sized surface is reported as a failure.
  if (out.surfSize == 0) {
    return (uint32_t)(-1);
  }

  return in.swizzleMode;
}

// Fills a region of an image with a pattern using the blit kernel.
//
// The image descriptor is patched in place for the duration of the blit and
// put back afterwards:
//   - UNORM_SHORT_101010 images have DST_SEL_W forced to SEL_0;
//   - sRGB channel orders have the pattern's RGB components converted with
//     LinearToStandardRGB and word 1's FORMAT switched to the UNORM variant
//     of the same data format.
//
// Returns HSA_STATUS_ERROR_OUT_OF_RESOURCES if the blit queue is unavailable,
// otherwise the status reported by the blit kernel.
hsa_status_t ImageManagerNv::FillImage(const Image& image, const void* pattern,
                                       const hsa_ext_image_region_t& region) {
  if (BlitQueueInit().queue_ == NULL) {
    return HSA_STATUS_ERROR_OUT_OF_RESOURCES;
  }

  // The descriptor is edited temporarily, so constness has to be dropped.
  Image* target = const_cast<Image*>(&image);

  // --- Temporary DST_SEL_W override -----------------------------------------
  SQ_BUF_RSRC_WORD3* buffer_word3 = NULL;
  SQ_IMG_RSRC_WORD3* image_word3 = NULL;
  uint32_t saved_dst_sel_w = 0;
  const bool is_101010 = (target->desc.format.channel_type ==
                          HSA_EXT_IMAGE_CHANNEL_TYPE_UNORM_SHORT_101010);
  if (is_101010) {
    // Force GPU to ignore the last two bits (alpha bits).
    if (target->desc.geometry != HSA_EXT_IMAGE_GEOMETRY_1DB) {
      image_word3 = reinterpret_cast<SQ_IMG_RSRC_WORD3*>(&target->srd[3]);
      saved_dst_sel_w = image_word3->bits.DST_SEL_W;
      image_word3->bits.DST_SEL_W = SEL_0;
    } else {
      buffer_word3 = reinterpret_cast<SQ_BUF_RSRC_WORD3*>(&target->srd[3]);
      saved_dst_sel_w = buffer_word3->bits.DST_SEL_W;
      buffer_word3->bits.DST_SEL_W = SEL_0;
    }
  }

  // --- Temporary FORMAT override for sRGB channel orders --------------------
  SQ_IMG_RSRC_WORD1* format_word = NULL;
  uint32_t saved_format = 0;
  const void* effective_pattern = pattern;
  // Must live until the blit call below: effective_pattern may point at it.
  float converted[4] = {0};

  const bool is_srgb_order =
      (target->desc.format.channel_order == HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBA) ||
      (target->desc.format.channel_order == HSA_EXT_IMAGE_CHANNEL_ORDER_SRGB) ||
      (target->desc.format.channel_order == HSA_EXT_IMAGE_CHANNEL_ORDER_SRGBX) ||
      (target->desc.format.channel_order == HSA_EXT_IMAGE_CHANNEL_ORDER_SBGRA);
  if (is_srgb_order) {
    // Write the image as its plain UNORM counterpart: convert the color
    // components to standard form up front and pass alpha through unchanged.
    const float* linear = reinterpret_cast<const float*>(pattern);
    converted[0] = LinearToStandardRGB(linear[0]);
    converted[1] = LinearToStandardRGB(linear[1]);
    converted[2] = LinearToStandardRGB(linear[2]);
    converted[3] = linear[3];
    effective_pattern = converted;

    ImageProperty image_prop = image_lut_.MapFormat(image.desc.format, image.desc.geometry);

    format_word = reinterpret_cast<SQ_IMG_RSRC_WORD1*>(&target->srd[1]);
    saved_format = format_word->bits.FORMAT;
    format_word->bits.FORMAT = GetCombinedFormat(image_prop.data_format, TYPE_UNORM);
  }

  const hsa_status_t status = ImageRuntime::instance()->blit_kernel().FillImage(
      blit_queue_, blit_code_catalog_, *target, effective_pattern, region);

  // --- Put the descriptor back the way it was -------------------------------
  if (buffer_word3 != NULL) {
    buffer_word3->bits.DST_SEL_W = saved_dst_sel_w;
  }

  if (image_word3 != NULL) {
    image_word3->bits.DST_SEL_W = saved_dst_sel_w;
  }

  if (format_word != NULL) {
    format_word->bits.FORMAT = saved_format;
  }

  return status;
}

}  // namespace image
}  // namespace rocr
