//
// Copyright (c) 2010 Advanced Micro Devices, Inc. All rights reserved.
//

#include "platform/commandqueue.hpp"
#include "device/device.hpp"
#include "device/blit.hpp"
#include "utils/debug.hpp"

#include <cmath>

namespace device {

HostBlitManager::HostBlitManager(VirtualDevice& vDev, Setup setup)
    : BlitManager(setup), vDev_(vDev), dev_(vDev.device()) {}

bool HostBlitManager::readBuffer(device::Memory& srcMemory, void* dstHost,
                                 const amd::Coord3D& origin, const amd::Coord3D& size,
                                 bool entire) const {
  // Map the device memory to CPU visible
  void* src = srcMemory.cpuMap(vDev_, Memory::CpuReadOnly);
  if (NULL == src) {
    LogError("Couldn't map device memory for host read");
    return false;
  }

  // Copy memory
  amd::Os::fastMemcpy(dstHost, reinterpret_cast<const_address>(src) + origin[0], size[0]);

  // Unmap device memory
  srcMemory.cpuUnmap(vDev_);

  return true;
}

bool HostBlitManager::readBufferRect(device::Memory& srcMemory, void* dstHost,
                                     const amd::BufferRect& bufRect,
                                     const amd::BufferRect& hostRect, const amd::Coord3D& size,
                                     bool entire) const {
  // Map source memory
  void* src = srcMemory.cpuMap(vDev_, Memory::CpuReadOnly);
  if (src == NULL) {
    LogError("Couldn't map source memory");
    return false;
  }

  size_t srcOffset;
  size_t dstOffset;

  for (size_t z = 0; z < size[2]; ++z) {
    for (size_t y = 0; y < size[1]; ++y) {
      srcOffset = bufRect.offset(0, y, z);
      dstOffset = hostRect.offset(0, y, z);

      // Copy memory line by line
      amd::Os::fastMemcpy((reinterpret_cast<address>(dstHost) + dstOffset),
                          (reinterpret_cast<const_address>(src) + srcOffset), size[0]);
    }
  }

  // Unmap source memory
  srcMemory.cpuUnmap(vDev_);

  return true;
}

bool HostBlitManager::readImage(device::Memory& srcMemory, void* dstHost,
                                const amd::Coord3D& origin, const amd::Coord3D& size,
                                size_t rowPitch, size_t slicePitch, bool entire) const {
  size_t startLayer = origin[2];
  size_t numLayers = size[2];
  if (srcMemory.owner()->getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
    startLayer = origin[1];
    numLayers = size[1];
  }

  // rowPitch and slicePitch in bytes
  size_t srcRowPitch;
  size_t srcSlicePitch;

  // Get physical GPU memmory
  void* src = srcMemory.cpuMap(vDev_, Memory::CpuReadOnly, startLayer, numLayers, &srcRowPitch,
                               &srcSlicePitch);
  if (NULL == src) {
    LogError("Couldn't map GPU memory for host read");
    return false;
  }

  size_t elementSize = srcMemory.owner()->asImage()->getImageFormat().getElementSize();
  size_t srcOffsBase = origin[0] * elementSize;
  size_t copySize = size[0] * elementSize;
  size_t srcOffs;
  size_t dstOffs = 0;

  // Make sure we use the right pitch if it's not specified
  if (rowPitch == 0) {
    rowPitch = size[0] * elementSize;
  }

  // Make sure we use the right slice if it's not specified
  if (slicePitch == 0) {
    slicePitch = size[0] * size[1] * elementSize;
  }

  // Adjust destination offset with Y dimension
  srcOffsBase += srcRowPitch * origin[1];

  // Adjust the destination offset with Z dimension
  srcOffsBase += srcSlicePitch * origin[2];

  // Copy memory line by line
  for (size_t slice = 0; slice < size[2]; ++slice) {
    srcOffs = srcOffsBase + slice * srcSlicePitch;
    dstOffs = slice * slicePitch;

    // Copy memory line by line
    for (size_t row = 0; row < size[1]; ++row) {
      // Copy memory
      amd::Os::fastMemcpy((reinterpret_cast<address>(dstHost) + dstOffs),
                          (reinterpret_cast<const_address>(src) + srcOffs), copySize);

      srcOffs += srcRowPitch;
      dstOffs += rowPitch;
    }
  }

  // Unmap the device memory
  srcMemory.cpuUnmap(vDev_);

  return true;
}

bool HostBlitManager::writeBuffer(const void* srcHost, device::Memory& dstMemory,
                                  const amd::Coord3D& origin, const amd::Coord3D& size,
                                  bool entire) const {
  uint flags = 0;
  if (entire) {
    flags = Memory::CpuWriteOnly;
  }

  // Map the device memory to CPU visible
  void* dst = dstMemory.cpuMap(vDev_, flags);
  if (NULL == dst) {
    LogError("Couldn't map GPU memory for host write");
    return false;
  }

  // Copy memory
  amd::Os::fastMemcpy(reinterpret_cast<address>(dst) + origin[0], srcHost, size[0]);

  // Unmap the device memory
  dstMemory.cpuUnmap(vDev_);

  return true;
}

bool HostBlitManager::writeBufferRect(const void* srcHost, device::Memory& dstMemory,
                                      const amd::BufferRect& hostRect,
                                      const amd::BufferRect& bufRect, const amd::Coord3D& size,
                                      bool entire) const {
  // Map destination memory
  void* dst = dstMemory.cpuMap(vDev_, (entire) ? Memory::CpuWriteOnly : 0);
  if (dst == NULL) {
    LogError("Couldn't map destination memory");
    return false;
  }

  size_t srcOffset;
  size_t dstOffset;

  for (size_t z = 0; z < size[2]; ++z) {
    for (size_t y = 0; y < size[1]; ++y) {
      srcOffset = hostRect.offset(0, y, z);
      dstOffset = bufRect.offset(0, y, z);

      // Copy memory line by line
      amd::Os::fastMemcpy((reinterpret_cast<address>(dst) + dstOffset),
                          (reinterpret_cast<const_address>(srcHost) + srcOffset), size[0]);
    }
  }

  // Unmap destination memory
  dstMemory.cpuUnmap(vDev_);

  return true;
}

bool HostBlitManager::writeImage(const void* srcHost, device::Memory& dstMemory,
                                 const amd::Coord3D& origin, const amd::Coord3D& size,
                                 size_t rowPitch, size_t slicePitch, bool entire) const {
  uint flags = 0;
  if (entire) {
    flags = Memory::CpuWriteOnly;
  }

  size_t startLayer = origin[2];
  size_t numLayers = size[2];
  if (dstMemory.owner()->getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
    startLayer = origin[1];
    numLayers = size[1];
  }

  // rowPitch and slicePitch in bytes
  size_t dstRowPitch;
  size_t dstSlicePitch;
  // Map the device memory to CPU visible
  void* dst = dstMemory.cpuMap(vDev_, flags, startLayer, numLayers, &dstRowPitch, &dstSlicePitch);
  if (NULL == dst) {
    LogError("Couldn't map GPU memory for host write");
    return false;
  }

  size_t elementSize = dstMemory.owner()->asImage()->getImageFormat().getElementSize();
  size_t srcOffs = 0;
  size_t copySize = size[0] * elementSize;
  size_t dstOffsBase = origin[0] * elementSize;
  size_t dstOffs;

  // Make sure we use the right pitch if it's not specified
  if (rowPitch == 0) {
    rowPitch = size[0] * elementSize;
  }

  // Make sure we use the right slice if it's not specified
  if (slicePitch == 0) {
    slicePitch = size[0] * size[1] * elementSize;
  }

  // Adjust the destination offset with Y dimension
  dstOffsBase += dstRowPitch * origin[1];

  // Adjust the destination offset with Z dimension
  dstOffsBase += dstSlicePitch * origin[2];

  // Copy memory slice by slice
  for (size_t slice = 0; slice < size[2]; ++slice) {
    dstOffs = dstOffsBase + slice * dstSlicePitch;
    srcOffs = slice * slicePitch;

    // Copy memory line by line
    for (size_t row = 0; row < size[1]; ++row) {
      // Copy memory
      amd::Os::fastMemcpy((reinterpret_cast<address>(dst) + dstOffs),
                          (reinterpret_cast<const_address>(srcHost) + srcOffs), copySize);

      dstOffs += dstRowPitch;
      srcOffs += rowPitch;
    }
  }

  // Unmap the device memory
  dstMemory.cpuUnmap(vDev_);

  return true;
}

bool HostBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& dstMemory,
                                 const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
                                 const amd::Coord3D& size, bool entire) const {
  // Map source memory
  void* src = srcMemory.cpuMap(vDev_,
                               // Overlap detection
                               (&srcMemory == &dstMemory) ? 0 : Memory::CpuReadOnly);
  if (src == NULL) {
    LogError("Couldn't map source memory");
    return false;
  }

  // Map destination memory
  void* dst = dstMemory.cpuMap(vDev_, (entire) ? Memory::CpuWriteOnly : 0);
  if (dst == NULL) {
    LogError("Couldn't map destination memory");
    return false;
  }

  // Straight forward buffer copy
  amd::Os::fastMemcpy((reinterpret_cast<address>(dst) + dstOrigin[0]),
                      (reinterpret_cast<const_address>(src) + srcOrigin[0]), size[0]);

  // Unmap source and destination memory
  dstMemory.cpuUnmap(vDev_);
  srcMemory.cpuUnmap(vDev_);

  return true;
}

bool HostBlitManager::copyBufferRect(device::Memory& srcMemory, device::Memory& dstMemory,
                                     const amd::BufferRect& srcRect, const amd::BufferRect& dstRect,
                                     const amd::Coord3D& size, bool entire) const {
  // Map source memory
  void* src = srcMemory.cpuMap(vDev_,
                               // Overlap detection
                               (&srcMemory == &dstMemory) ? 0 : Memory::CpuReadOnly);
  if (src == NULL) {
    LogError("Couldn't map source memory");
    return false;
  }

  // Map destination memory
  void* dst = dstMemory.cpuMap(vDev_, (entire) ? Memory::CpuWriteOnly : 0);
  if (dst == NULL) {
    LogError("Couldn't map destination memory");
    return false;
  }

  for (size_t z = 0; z < size[2]; ++z) {
    for (size_t y = 0; y < size[1]; ++y) {
      size_t srcOffset = srcRect.offset(0, y, z);
      size_t dstOffset = dstRect.offset(0, y, z);

      // Copy memory line by line
      amd::Os::fastMemcpy((reinterpret_cast<address>(dst) + dstOffset),
                          (reinterpret_cast<const_address>(src) + srcOffset), size[0]);
    }
  }

  // Unmap source and destination memory
  dstMemory.cpuUnmap(vDev_);
  srcMemory.cpuUnmap(vDev_);

  return true;
}

bool HostBlitManager::copyImageToBuffer(device::Memory& srcMemory, device::Memory& dstMemory,
                                        const amd::Coord3D& srcOrigin,
                                        const amd::Coord3D& dstOrigin, const amd::Coord3D& size,
                                        bool entire, size_t rowPitch, size_t slicePitch) const {
  size_t startLayer = srcOrigin[2];
  size_t numLayers = size[2];
  if (srcMemory.owner()->getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
    startLayer = srcOrigin[1];
    numLayers = size[1];
  }
  // rowPitch and slicePitch in bytes
  size_t srcRowPitch;
  size_t srcSlicePitch;
  // Map source memory
  void* src = srcMemory.cpuMap(vDev_, Memory::CpuReadOnly, startLayer, numLayers, &srcRowPitch,
                               &srcSlicePitch);
  if (src == NULL) {
    LogError("Couldn't map source memory");
    return false;
  }
  size_t elementSize = srcMemory.owner()->asImage()->getImageFormat().getElementSize();

  // Map destination memory
  void* dst = dstMemory.cpuMap(vDev_, (entire) ? Memory::CpuWriteOnly : 0);
  if (dst == NULL) {
    LogError("Couldn't map destination memory");
    return false;
  }

  size_t srcOffs = srcOrigin[0];
  size_t dstOffs = dstOrigin[0];
  size_t srcOffsOrg;
  size_t copySize = size[0];

  // Calculate the offset in bytes
  srcOffs *= elementSize;
  copySize *= elementSize;

  // Adjust source offset with Y and Z dimensions
  srcOffs += srcRowPitch * srcOrigin[1];
  srcOffs += srcSlicePitch * srcOrigin[2];

  srcOffsOrg = srcOffs;

  // Copy memory slice by slice
  for (size_t slice = 0; slice < size[2]; ++slice) {
    srcOffs = srcOffsOrg + slice * srcSlicePitch;

    // Copy memory line by line
    for (size_t rows = 0; rows < size[1]; ++rows) {
      amd::Os::fastMemcpy((reinterpret_cast<address>(dst) + dstOffs),
                          (reinterpret_cast<const_address>(src) + srcOffs), copySize);

      srcOffs += srcRowPitch;
      dstOffs += copySize;
    }
  }

  // Unmap source and destination memory
  srcMemory.cpuUnmap(vDev_);
  dstMemory.cpuUnmap(vDev_);

  return true;
}

bool HostBlitManager::copyBufferToImage(device::Memory& srcMemory, device::Memory& dstMemory,
                                        const amd::Coord3D& srcOrigin,
                                        const amd::Coord3D& dstOrigin, const amd::Coord3D& size,
                                        bool entire, size_t rowPitch, size_t slicePitch) const {
  // Map source memory
  void* src = srcMemory.cpuMap(vDev_, Memory::CpuReadOnly);
  if (src == NULL) {
    LogError("Couldn't map source memory");
    return false;
  }

  size_t startLayer = dstOrigin[2];
  size_t numLayers = size[2];
  if (dstMemory.owner()->getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
    startLayer = dstOrigin[1];
    numLayers = size[1];
  }
  // rowPitch and slicePitch in bytes
  size_t dstRowPitch;
  size_t dstSlicePitch;
  // Map destination memory
  void* dst = dstMemory.cpuMap(vDev_, (entire) ? Memory::CpuWriteOnly : 0, startLayer, numLayers,
                               &dstRowPitch, &dstSlicePitch);
  if (dst == NULL) {
    LogError("Couldn't map destination memory");
    return false;
  }

  size_t elementSize = dstMemory.owner()->asImage()->getImageFormat().getElementSize();
  size_t srcOffs = srcOrigin[0];
  size_t dstOffs = dstOrigin[0];
  size_t dstOffsOrg;
  size_t copySize = size[0];

  // Calculate the offset in bytes
  dstOffs *= elementSize;
  copySize *= elementSize;

  // Adjust destination offset with Y and Z dimension
  dstOffs += dstRowPitch * dstOrigin[1];
  dstOffs += dstSlicePitch * dstOrigin[2];

  dstOffsOrg = dstOffs;

  // Copy memory slice by slice
  for (size_t slice = 0; slice < size[2]; ++slice) {
    dstOffs = dstOffsOrg + slice * dstSlicePitch;

    // Copy memory line by line
    for (size_t rows = 0; rows < size[1]; ++rows) {
      amd::Os::fastMemcpy((reinterpret_cast<address>(dst) + dstOffs),
                          (reinterpret_cast<const_address>(src) + srcOffs), copySize);

      srcOffs += copySize;
      dstOffs += dstRowPitch;
    }
  }

  // Unmap source and destination memory
  srcMemory.cpuUnmap(vDev_);
  dstMemory.cpuUnmap(vDev_);

  return true;
}

bool HostBlitManager::copyImage(device::Memory& srcMemory, device::Memory& dstMemory,
                                const amd::Coord3D& srcOrigin, const amd::Coord3D& dstOrigin,
                                const amd::Coord3D& size, bool entire) const {
  size_t startLayer = srcOrigin[2];
  size_t numLayers = size[2];
  if (srcMemory.owner()->getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
    startLayer = srcOrigin[1];
    numLayers = size[1];
  }
  // rowPitch and slicePitch in bytes
  size_t srcRowPitch;
  size_t srcSlicePitch;
  // Map source memory
  void* src = srcMemory.cpuMap(vDev_, Memory::CpuReadOnly, startLayer, numLayers, &srcRowPitch,
                               &srcSlicePitch);
  if (src == NULL) {
    LogError("Couldn't map source memory");
    return false;
  }
  if (dstMemory.owner()->getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
    startLayer = dstOrigin[1];
    numLayers = size[1];
  } else {
    startLayer = dstOrigin[2];
    numLayers = size[2];
  }

  // rowPitch and slicePitch in bytes
  size_t dstRowPitch;
  size_t dstSlicePitch;
  // Map destination memory
  void* dst = dstMemory.cpuMap(vDev_, (entire) ? Memory::CpuWriteOnly : 0, startLayer, numLayers,
                               &dstRowPitch, &dstSlicePitch);
  if (dst == NULL) {
    LogError("Couldn't map destination memory");
    return false;
  }

  size_t elementSize = dstMemory.owner()->asImage()->getImageFormat().getElementSize();
  assert(elementSize == srcMemory.owner()->asImage()->getImageFormat().getElementSize());

  size_t srcOffs = srcOrigin[0];
  size_t dstOffs = dstOrigin[0];
  size_t srcOffsOrg;
  size_t dstOffsOrg;
  size_t copySize = size[0];

  // Calculate the offsets in bytes
  srcOffs *= elementSize;
  dstOffs *= elementSize;
  copySize *= elementSize;

  // Adjust destination and sorce offsets with Y dimension
  srcOffs += srcRowPitch * srcOrigin[1];
  dstOffs += dstRowPitch * dstOrigin[1];

  // Adjust destination and sorce offsets with Z dimension
  srcOffs += srcSlicePitch * srcOrigin[2];
  dstOffs += dstSlicePitch * dstOrigin[2];

  srcOffsOrg = srcOffs;
  dstOffsOrg = dstOffs;

  // Copy memory slice by slice
  for (size_t slice = 0; slice < size[2]; ++slice) {
    srcOffs = srcOffsOrg + slice * srcSlicePitch;
    dstOffs = dstOffsOrg + slice * dstSlicePitch;

    // Copy memory line by line
    for (size_t rows = 0; rows < size[1]; ++rows) {
      amd::Os::fastMemcpy((reinterpret_cast<address>(dst) + dstOffs),
                          (reinterpret_cast<const_address>(src) + srcOffs), copySize);

      srcOffs += srcRowPitch;
      dstOffs += dstRowPitch;
    }
  }

  // Unmap source and destination memory
  srcMemory.cpuUnmap(vDev_);
  dstMemory.cpuUnmap(vDev_);

  return true;
}

bool HostBlitManager::fillBuffer(device::Memory& memory, const void* pattern, size_t patternSize,
                                 const amd::Coord3D& origin, const amd::Coord3D& size,
                                 bool entire) const {
  // Map memory
  void* fillMem = memory.cpuMap(vDev_, (entire) ? Memory::CpuWriteOnly : 0);
  if (fillMem == NULL) {
    LogError("Couldn't map destination memory");
    return false;
  }

  size_t offset = origin[0];
  size_t fillSize = size[0];

  if ((fillSize % patternSize) != 0) {
    LogError("Misaligned buffer size and pattern size!");
  }

  // Fill the buffer memory with a pattern
  for (size_t i = 0; i < (fillSize / patternSize); i++) {
    memcpy((reinterpret_cast<address>(fillMem) + offset),
           (reinterpret_cast<const_address>(pattern)), patternSize);
    offset += patternSize;
  }

  // Unmap source and destination memory
  memory.cpuUnmap(vDev_);

  return true;
}

bool HostBlitManager::fillImage(device::Memory& memory, const void* pattern,
                                const amd::Coord3D& origin, const amd::Coord3D& size,
                                bool entire) const {
  size_t startLayer = origin[2];
  size_t numLayers = size[2];
  if (memory.owner()->getType() == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
    startLayer = origin[1];
    numLayers = size[1];
  }
  // rowPitch and slicePitch in bytes
  size_t devRowPitch;
  size_t devSlicePitch;

  void* newpattern = const_cast<void*>(pattern);
  cl_float4 fFillColor;

  // Converting a linear RGB floating-point color value to a normalized 8-bit unsigned integer sRGB
  // value so that the cpu path can treat sRGB as RGB for host transfer.
  if (memory.owner()->asImage()->getImageFormat().image_channel_order == CL_sRGBA) {
    float* fColor = static_cast<float*>(newpattern);
    fFillColor.s[0] = sRGBmap(fColor[0]) / 255.0f;
    fFillColor.s[1] = sRGBmap(fColor[1]) / 255.0f;
    fFillColor.s[2] = sRGBmap(fColor[2]) / 255.0f;
    fFillColor.s[3] = fColor[3];
    newpattern = static_cast<void*>(&fFillColor);
  }

  // Map memory
  void* fillMem = memory.cpuMap(vDev_, (entire) ? Memory::CpuWriteOnly : 0, startLayer, numLayers,
                                &devRowPitch, &devSlicePitch);
  if (fillMem == NULL) {
    LogError("Couldn't map destination memory");
    return false;
  }

  float fillValue[4];
  memset(fillValue, 0, sizeof(fillValue));
  memory.owner()->asImage()->getImageFormat().formatColor(newpattern, fillValue);

  size_t elementSize = memory.owner()->asImage()->getImageFormat().getElementSize();
  size_t offset = origin[0] * elementSize;
  size_t offsetOrg;

  // Adjust offset with Y dimension
  offset += devRowPitch * origin[1];

  // Adjust offset with Z dimension
  offset += devSlicePitch * origin[2];

  offsetOrg = offset;

  // Fill the image memory with a pattern
  for (size_t slice = 0; slice < size[2]; ++slice) {
    offset = offsetOrg + slice * devSlicePitch;

    for (size_t rows = 0; rows < size[1]; ++rows) {
      size_t pixOffset = offset;

      // Copy memory pixel by pixel
      for (size_t column = 0; column < size[0]; ++column) {
        memcpy((reinterpret_cast<address>(fillMem) + pixOffset),
               (reinterpret_cast<const_address>(fillValue)), elementSize);
        pixOffset += elementSize;
      }

      offset += devRowPitch;
    }
  }

  // Unmap memory
  memory.cpuUnmap(vDev_);

  return true;
}

cl_uint HostBlitManager::sRGBmap(float fc) const {
  double c = (double)fc;

#ifdef ATI_OS_LINUX
  if (std::isnan(c)) c = 0.0;
#else
  if (_isnan(c)) c = 0.0;
#endif

  if (c > 1.0)
    c = 1.0;
  else if (c < 0.0)
    c = 0.0;
  else if (c < 0.0031308)
    c = 12.92 * c;
  else
    c = (1055.0 / 1000.0) * pow(c, 5.0 / 12.0) - (55.0 / 1000.0);

  return (cl_uint)(c * 255.0 + 0.5);
}
}  // namespace gpu
