/****************************************************************************
**
** Copyright (C) 2016 The Qt Company Ltd.
** Contact: https://www.qt.io/licensing/
**
** This file is part of the QtQuick module of the Qt Toolkit.
**
** $QT_BEGIN_LICENSE:LGPL$
** Commercial License Usage
** Licensees holding valid commercial Qt licenses may use this file in
** accordance with the commercial license agreement provided with the
** Software or, alternatively, in accordance with the terms contained in
** a written agreement between you and The Qt Company. For licensing terms
** and conditions see https://www.qt.io/terms-conditions. For further
** information use the contact form at https://www.qt.io/contact-us.
**
** GNU Lesser General Public License Usage
** Alternatively, this file may be used under the terms of the GNU Lesser
** General Public License version 3 as published by the Free Software
** Foundation and appearing in the file LICENSE.LGPL3 included in the
** packaging of this file. Please review the following information to
** ensure the GNU Lesser General Public License version 3 requirements
** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
**
** GNU General Public License Usage
** Alternatively, this file may be used under the terms of the GNU
** General Public License version 2.0 or (at your option) the GNU General
** Public license version 3 or any later version approved by the KDE Free
** Qt Foundation. The licenses are as published by the Free Software
** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
** included in the packaging of this file. Please review the following
** information to ensure the GNU General Public License requirements will
** be met: https://www.gnu.org/licenses/gpl-2.0.html and
** https://www.gnu.org/licenses/gpl-3.0.html.
**
** $QT_END_LICENSE$
**
****************************************************************************/

#include "qsgd3d12engine_p.h"
#include "qsgd3d12engine_p_p.h"
#include "cs_mipmapgen.hlslh"
#include <QString>
#include <QColor>
#include <QLoggingCategory>
#include <qmath.h>
#include <qalgorithms.h>

// Comment out to disable DeviceLossTester functionality in order to reduce
// code size and improve startup perf a tiny bit.
#define DEVLOSS_TEST

#ifdef DEVLOSS_TEST
#include "cs_tdr.hlslh"
#endif

#ifdef Q_OS_WINRT
#include <QtCore/private/qeventdispatcher_winrt_p.h>
#include <functional>
#include <windows.ui.xaml.h>
#include <windows.ui.xaml.media.dxinterop.h>
#endif

#include <comdef.h>

QT_BEGIN_NAMESPACE

// NOTE: Avoid categorized logging. It is slow.

// Defines a file-local predicate debug_<variable>() that reports whether the
// QSG_RENDERER_DEBUG environment variable contains the token <variable>. The
// environment is queried when the function-local static is initialized (i.e.
// on the first call); later calls return the cached value.
#define DECLARE_DEBUG_VAR(variable) \
    static bool debug_ ## variable() \
    { static bool value = qgetenv("QSG_RENDERER_DEBUG").contains(QT_STRINGIFY(variable)); return value; }

// Debug switches available in this file: debug_render(), debug_descheap(),
// debug_buffer() and debug_texture().
DECLARE_DEBUG_VAR(render)
DECLARE_DEBUG_VAR(descheap)
DECLARE_DEBUG_VAR(buffer)
DECLARE_DEBUG_VAR(texture)

// Except for system info on startup.
Q_LOGGING_CATEGORY(QSG_LOG_INFO_GENERAL, "qt.scenegraph.general")


// Any changes to the defaults below must be reflected in adaptations.qdoc as
// well and proven by qmlbench or similar.

// Fallbacks used by QSGD3D12EnginePrivate::initialize():
//  - buffer count: used when QT_D3D_BUFFER_COUNT yields a value below 2,
//  - frames in flight: used when QT_D3D_FRAME_COUNT yields a value below 1,
//  - max latency: used when QT_D3D_WAITABLE_SWAP_CHAIN_MAX_LATENCY is unset;
//    a value of 0 leaves the frame latency waitable object disabled.
static const int DEFAULT_SWAP_CHAIN_BUFFER_COUNT = 3;
static const int DEFAULT_FRAME_IN_FLIGHT_COUNT = 2;
static const int DEFAULT_WAITABLE_SWAP_CHAIN_MAX_LATENCY = 0;

// NOTE(review): not referenced in the part of the file covered here;
// presumably an upper bound on draw calls recorded into one command list.
static const int MAX_DRAW_CALLS_PER_LIST = 4096;

// Cache limits. MAX_CACHED_PSO is passed to psoCache.setMaxCost() in
// initialize(); MAX_CACHED_ROOTSIG is presumably applied to rootSigCache the
// same way (not visible here).
static const int MAX_CACHED_ROOTSIG = 16;
static const int MAX_CACHED_PSO = 64;

// Size passed to createCbvSrvUavHeap() for each in-flight frame.
static const int GPU_CBVSRVUAV_DESCRIPTORS = 512;

// Color format requested for the swap chain buffers.
static const DXGI_FORMAT RT_COLOR_FORMAT = DXGI_FORMAT_R8G8B8A8_UNORM;

// Layout of the CPU descriptor heaps managed by
// QSGD3D12CPUDescriptorHeapManager: each heap is tracked by a bitmap of
// BUCKETS_PER_HEAP 32-bit words, one bit per descriptor (set == free).
static const int BUCKETS_PER_HEAP = 8; // must match freeMap
static const int DESCRIPTORS_PER_BUCKET = 32; // the bit map (freeMap) is quint32
static const int MAX_DESCRIPTORS_PER_HEAP = BUCKETS_PER_HEAP * DESCRIPTORS_PER_BUCKET;

// Formats an HRESULT for log output as "Error 0x<hex code>", followed by
// ": <description>" when _com_error provides a message for the code.
static QString comErrorMessage(HRESULT hr)
{
#ifndef Q_OS_WINRT
    const _com_error comError(hr);
#else
    const _com_error comError(hr, nullptr);
#endif
    QString text = QLatin1String("Error 0x");
    text += QString::number(ulong(hr), 16);

    const wchar_t *description = comError.ErrorMessage();
    if (description) {
        text += QLatin1String(": ");
        text += QString::fromWCharArray(description);
    }
    return text;
}

// Hands out one CPU descriptor handle of the given heap type.
//
// Existing heaps of a matching type are searched first; the lowest free bit in
// the first non-empty bucket of the free map is claimed. If no heap has room, a
// new non-shader-visible heap with MAX_DESCRIPTORS_PER_HEAP entries is created
// and its first descriptor is returned. On heap creation failure a warning is
// printed and a zero-initialized handle is returned.
D3D12_CPU_DESCRIPTOR_HANDLE QSGD3D12CPUDescriptorHeapManager::allocate(D3D12_DESCRIPTOR_HEAP_TYPE type)
{
    D3D12_CPU_DESCRIPTOR_HANDLE result = {};

    // Pass 1: reuse a free slot in a heap we already own.
    for (Heap &candidate : m_heaps) {
        if (candidate.type != type)
            continue;

        for (int bucket = 0; bucket < _countof(candidate.freeMap); ++bucket) {
            if (!candidate.freeMap[bucket])
                continue; // bucket fully occupied

            // A set bit marks a free descriptor; take the lowest one.
            uint slot = qCountTrailingZeroBits(candidate.freeMap[bucket]);
            candidate.freeMap[bucket] &= ~(1UL << slot);
            if (Q_UNLIKELY(debug_descheap()))
                qDebug("descriptor handle heap %p type %x reserve in bucket %d index %d", &candidate, type, bucket, slot);

            // Convert (bucket, bit) into an index within the heap.
            slot += bucket * DESCRIPTORS_PER_BUCKET;
            result = candidate.start;
            result.ptr += slot * candidate.handleSize;
            return result;
        }
    }

    // Pass 2: nothing free, so create another heap of this type.
    Heap fresh;
    fresh.type = type;
    fresh.handleSize = m_handleSizes[type];

    // The heaps created here are _never_ shader-visible.
    D3D12_DESCRIPTOR_HEAP_DESC desc = {};
    desc.NumDescriptors = MAX_DESCRIPTORS_PER_HEAP;
    desc.Type = type;

    const HRESULT hr = m_device->CreateDescriptorHeap(&desc, IID_PPV_ARGS(&fresh.heap));
    if (FAILED(hr)) {
        qWarning("Failed to create heap with type 0x%x: %s",
                 type, qPrintable(comErrorMessage(hr)));
        return result;
    }

    fresh.start = fresh.heap->GetCPUDescriptorHandleForHeapStart();

    if (Q_UNLIKELY(debug_descheap()))
        qDebug("new descriptor heap, type %x, start %llu", type, fresh.start.ptr);

    // Everything is free except descriptor 0, which is returned to the caller.
    for (int i = 0; i < _countof(fresh.freeMap); ++i)
        fresh.freeMap[i] = (i == 0) ? 0xFFFFFFFE : 0xFFFFFFFF;

    result = fresh.start;
    m_heaps.append(fresh);
    return result;
}

// Returns a handle previously obtained from allocate() to its heap by setting
// the corresponding bit in the heap's free map. The owning heap is located by
// type and address range; a warning is printed when no heap contains the
// handle.
void QSGD3D12CPUDescriptorHeapManager::release(D3D12_CPU_DESCRIPTOR_HANDLE handle, D3D12_DESCRIPTOR_HEAP_TYPE type)
{
    for (Heap &candidate : m_heaps) {
        if (candidate.type != type)
            continue;

        const bool beforeHeap = handle.ptr < candidate.start.ptr;
        const bool afterHeap = handle.ptr >= candidate.start.ptr + candidate.handleSize * MAX_DESCRIPTORS_PER_HEAP;
        if (beforeHeap || afterHeap)
            continue;

        // Descriptor index within the heap, then split into bucket + bit.
        unsigned long slot = (handle.ptr - candidate.start.ptr) / candidate.handleSize;
        const int bucket = slot / DESCRIPTORS_PER_BUCKET;
        const int bit = slot % DESCRIPTORS_PER_BUCKET;
        candidate.freeMap[bucket] |= 1UL << bit;
        if (Q_UNLIKELY(debug_descheap()))
            qDebug("free descriptor handle heap %p type %x bucket %d index %d", &candidate, type, bucket, bit);
        return;
    }

    qWarning("QSGD3D12CPUDescriptorHeapManager: Attempted to release untracked descriptor handle %llu of type %d", handle.ptr, type);
}

// Stores the device and caches the descriptor handle increment size for every
// descriptor heap type, so allocate() does not have to query the device.
void QSGD3D12CPUDescriptorHeapManager::initialize(ID3D12Device *device)
{
    m_device = device;

    int type = 0;
    while (type < D3D12_DESCRIPTOR_HEAP_TYPE_NUM_TYPES) {
        m_handleSizes[type] = m_device->GetDescriptorHandleIncrementSize(D3D12_DESCRIPTOR_HEAP_TYPE(type));
        ++type;
    }
}

void QSGD3D12CPUDescriptorHeapManager::releaseResources()
{
    for (Heap &heap : m_heaps)
        heap.heap = nullptr;

    m_heaps.clear();

    m_device = nullptr;
}

// One device per process, one everything else (engine) per window.
// deviceManager() returns the lazily created, process-wide
// QSGD3D12DeviceManager instance shared by all engines.
Q_GLOBAL_STATIC(QSGD3D12DeviceManager, deviceManager)

// Selects the DXGI adapter the D3D12 device should be created on.
//
// All adapters are first listed via the QSG_LOG_INFO_GENERAL category. If
// QT_D3D_ADAPTER_INDEX is set and the adapter at that index supports feature
// level 11_0, it is chosen. Otherwise the first non-software adapter that
// passes a D3D12CreateDevice() probe is chosen.
//
// \a outAdapter receives the chosen adapter (the caller takes over the
// reference), or null when no suitable adapter was found.
static void getHardwareAdapter(IDXGIFactory1 *factory, IDXGIAdapter1 **outAdapter)
{
    const D3D_FEATURE_LEVEL fl = D3D_FEATURE_LEVEL_11_0;
    ComPtr<IDXGIAdapter1> adapter;
    DXGI_ADAPTER_DESC1 desc;

    // Enumerate with SUCCEEDED() instead of comparing against
    // DXGI_ERROR_NOT_FOUND: on any other failure 'adapter' is null and must
    // not be dereferenced.
    for (int adapterIndex = 0; SUCCEEDED(factory->EnumAdapters1(adapterIndex, &adapter)); ++adapterIndex) {
        adapter->GetDesc1(&desc);
        const QString name = QString::fromWCharArray(desc.Description);
        qCDebug(QSG_LOG_INFO_GENERAL, "Adapter %d: '%s' (flags 0x%x)", adapterIndex, qPrintable(name), desc.Flags);
    }

    if (qEnvironmentVariableIsSet("QT_D3D_ADAPTER_INDEX")) {
        const int adapterIndex = qEnvironmentVariableIntValue("QT_D3D_ADAPTER_INDEX");
        if (SUCCEEDED(factory->EnumAdapters1(adapterIndex, &adapter))) {
            adapter->GetDesc1(&desc);
            const QString name = QString::fromWCharArray(desc.Description);
            // Passing a null device pointer only probes for support; no
            // device is actually created here.
            HRESULT hr = D3D12CreateDevice(adapter.Get(), fl, _uuidof(ID3D12Device), nullptr);
            if (SUCCEEDED(hr)) {
                qCDebug(QSG_LOG_INFO_GENERAL, "Using requested adapter '%s'", qPrintable(name));
                *outAdapter = adapter.Detach();
                return;
            } else {
                qWarning("Failed to create device for requested adapter '%s': %s",
                         qPrintable(name), qPrintable(comErrorMessage(hr)));
            }
        }
    }

    // Fall back to the first hardware adapter that can host a D3D12 device.
    // When the enumeration runs out, 'adapter' is left null.
    for (int adapterIndex = 0; SUCCEEDED(factory->EnumAdapters1(adapterIndex, &adapter)); ++adapterIndex) {
        adapter->GetDesc1(&desc);
        if (desc.Flags & DXGI_ADAPTER_FLAG_SOFTWARE)
            continue;

        if (SUCCEEDED(D3D12CreateDevice(adapter.Get(), fl, _uuidof(ID3D12Device), nullptr))) {
            const QString name = QString::fromWCharArray(desc.Description);
            qCDebug(QSG_LOG_INFO_GENERAL, "Using adapter '%s'", qPrintable(name));
            break;
        }
    }

    *outAdapter = adapter.Detach();
}

// Takes a reference on the shared device, creating the DXGI factory and the
// device first if needed, and returns the device pointer. The reference count
// is incremented unconditionally, so the result may be null when
// ensureCreated() failed. Every call is expected to be balanced by unref().
ID3D12Device *QSGD3D12DeviceManager::ref()
{
    ensureCreated();
    m_ref.ref();
    return m_device.Get();
}

// Drops one reference taken via ref(). When the count reaches zero the device
// and the DXGI factory are released; a later ref() recreates them.
void QSGD3D12DeviceManager::unref()
{
    if (m_ref.deref())
        return; // still in use elsewhere

    if (Q_UNLIKELY(debug_render()))
        qDebug("destroying d3d device");

    m_device = nullptr;
    m_factory = nullptr;
}

// Notifies every registered observer, in registration order, that the device
// has been lost by calling DeviceLossObserver::deviceLost() on it.
void QSGD3D12DeviceManager::deviceLossDetected()
{
    for (DeviceLossObserver *observer : qAsConst(m_observers))
        observer->deviceLost();

    // Nothing else to do here. All windows are expected to release their
    // resources and call unref() in response immediately.
}

// Returns the DXGI factory, creating factory and device first if needed.
// Does not affect the reference count. May return null if factory creation
// failed in ensureCreated().
IDXGIFactory4 *QSGD3D12DeviceManager::dxgi()
{
    ensureCreated();
    return m_factory.Get();
}

void QSGD3D12DeviceManager::ensureCreated()
{
    if (m_device)
        return;

    HRESULT hr = CreateDXGIFactory2(0, IID_PPV_ARGS(&m_factory));
    if (FAILED(hr)) {
        qWarning("Failed to create DXGI: %s", qPrintable(comErrorMessage(hr)));
        return;
    }

    ComPtr<IDXGIAdapter1> adapter;
    getHardwareAdapter(m_factory.Get(), &adapter);

    bool warp = true;
    if (adapter) {
        HRESULT hr = D3D12CreateDevice(adapter.Get(), D3D_FEATURE_LEVEL_11_0, IID_PPV_ARGS(&m_device));
        if (SUCCEEDED(hr))
            warp = false;
        else
            qWarning("Failed to create device: %s", qPrintable(comErrorMessage(hr)));
    }

    if (warp) {
        qCDebug(QSG_LOG_INFO_GENERAL, "Using WARP");
        m_factory->EnumWarpAdapter(IID_PPV_ARGS(&adapter));
        HRESULT hr = D3D12CreateDevice(adapter.Get(), D3D_FEATURE_LEVEL_11_0, IID_PPV_ARGS(&m_device));
        if (FAILED(hr)) {
            qWarning("Failed to create WARP device: %s", qPrintable(comErrorMessage(hr)));
            return;
        }
    }

    ComPtr<IDXGIAdapter3> adapter3;
    if (SUCCEEDED(adapter.As(&adapter3))) {
        DXGI_QUERY_VIDEO_MEMORY_INFO vidMemInfo;
        if (SUCCEEDED(adapter3->QueryVideoMemoryInfo(0, DXGI_MEMORY_SEGMENT_GROUP_LOCAL, &vidMemInfo))) {
            qCDebug(QSG_LOG_INFO_GENERAL, "Video memory info: LOCAL: Budget %llu KB CurrentUsage %llu KB AvailableForReservation %llu KB CurrentReservation %llu KB",
                    vidMemInfo.Budget / 1024, vidMemInfo.CurrentUsage / 1024,
                    vidMemInfo.AvailableForReservation / 1024, vidMemInfo.CurrentReservation / 1024);
        }
        if (SUCCEEDED(adapter3->QueryVideoMemoryInfo(0, DXGI_MEMORY_SEGMENT_GROUP_NON_LOCAL, &vidMemInfo))) {
            qCDebug(QSG_LOG_INFO_GENERAL, "Video memory info: NON-LOCAL: Budget %llu KB CurrentUsage %llu KB AvailableForReservation %llu KB CurrentReservation %llu KB",
                    vidMemInfo.Budget / 1024, vidMemInfo.CurrentUsage / 1024,
                    vidMemInfo.AvailableForReservation / 1024, vidMemInfo.CurrentReservation / 1024);
        }
    }
}

// Adds \a observer to the list notified by deviceLossDetected(). Registering
// the same observer more than once has no effect.
void QSGD3D12DeviceManager::registerDeviceLossObserver(DeviceLossObserver *observer)
{
    if (m_observers.contains(observer))
        return;

    m_observers.append(observer);
}

// Creates the private implementation object. No D3D resources are set up
// here; that happens in attachToWindow().
QSGD3D12Engine::QSGD3D12Engine()
{
    d = new QSGD3D12EnginePrivate;
}

// Waits for the GPU before releasing the engine's resources, then destroys
// the private implementation object.
QSGD3D12Engine::~QSGD3D12Engine()
{
    d->waitGPU();
    d->releaseResources();
    delete d;
}

// Initializes the engine for \a window. Returns true when initialization
// succeeded. Fails with a warning, and returns false, when the engine is
// already initialized.
bool QSGD3D12Engine::attachToWindow(WId window, const QSize &size, float dpr, int surfaceFormatSamples, bool alpha)
{
    if (!d->isInitialized()) {
        d->initialize(window, size, dpr, surfaceFormatSamples, alpha);
        // initialize() reports failure only through the initialized state.
        return d->isInitialized();
    }

    qWarning("QSGD3D12Engine: Cannot attach active engine to window");
    return false;
}

// Releases the engine's resources by forwarding to
// QSGD3D12EnginePrivate::releaseResources().
void QSGD3D12Engine::releaseResources()
{
    d->releaseResources();
}

// Returns true while the engine is initialized.
bool QSGD3D12Engine::hasResources() const
{
    // An explicit releaseResources() or a device loss results in initialized == false.
    return d->isInitialized();
}

// Passes the new window size and device pixel ratio on to the private
// implementation.
void QSGD3D12Engine::setWindowSize(const QSize &size, float dpr)
{
    d->setWindowSize(size, dpr);
}

// Returns the window id reported by the private implementation.
WId QSGD3D12Engine::window() const
{
    return d->currentWindow();
}

// Returns the window size currently known to the private implementation.
QSize QSGD3D12Engine::windowSize() const
{
    return d->currentWindowSize();
}

// Returns the device pixel ratio currently known to the private
// implementation.
float QSGD3D12Engine::windowDevicePixelRatio() const
{
    return d->currentWindowDpr();
}

// Returns the sample count currently known to the private implementation.
uint QSGD3D12Engine::windowSamples() const
{
    return d->currentWindowSamples();
}

// Forwards to QSGD3D12EnginePrivate::beginFrame().
void QSGD3D12Engine::beginFrame()
{
    d->beginFrame();
}

// Forwards to QSGD3D12EnginePrivate::endFrame().
void QSGD3D12Engine::endFrame()
{
    d->endFrame();
}

// Forwards to QSGD3D12EnginePrivate::beginLayer().
void QSGD3D12Engine::beginLayer()
{
    d->beginLayer();
}

// Forwards to QSGD3D12EnginePrivate::endLayer().
void QSGD3D12Engine::endLayer()
{
    d->endLayer();
}

// Forwards to QSGD3D12EnginePrivate::invalidateCachedFrameState().
void QSGD3D12Engine::invalidateCachedFrameState()
{
    d->invalidateCachedFrameState();
}

// Forwards to QSGD3D12EnginePrivate::restoreFrameState(), passing \a minimal
// through unchanged.
void QSGD3D12Engine::restoreFrameState(bool minimal)
{
    d->restoreFrameState(minimal);
}

// Forwards \a pipelineState to QSGD3D12EnginePrivate::finalizePipeline().
void QSGD3D12Engine::finalizePipeline(const QSGD3D12PipelineState &pipelineState)
{
    d->finalizePipeline(pipelineState);
}

// Returns the buffer id produced by QSGD3D12EnginePrivate::genBuffer().
uint QSGD3D12Engine::genBuffer()
{
    return d->genBuffer();
}

// Forwards to QSGD3D12EnginePrivate::releaseBuffer() for buffer \a id.
void QSGD3D12Engine::releaseBuffer(uint id)
{
    d->releaseBuffer(id);
}

// Forwards to QSGD3D12EnginePrivate::resetBuffer() with the buffer id, data
// pointer and size unchanged.
void QSGD3D12Engine::resetBuffer(uint id, const quint8 *data, int size)
{
    d->resetBuffer(id, data, size);
}

// Forwards to QSGD3D12EnginePrivate::markBufferDirty() with the buffer id,
// offset and size unchanged.
void QSGD3D12Engine::markBufferDirty(uint id, int offset, int size)
{
    d->markBufferDirty(id, offset, size);
}

// Forwards \a rect to QSGD3D12EnginePrivate::queueViewport().
void QSGD3D12Engine::queueViewport(const QRect &rect)
{
    d->queueViewport(rect);
}

// Forwards \a rect to QSGD3D12EnginePrivate::queueScissor().
void QSGD3D12Engine::queueScissor(const QRect &rect)
{
    d->queueScissor(rect);
}

// Forwards render target \a id to
// QSGD3D12EnginePrivate::queueSetRenderTarget().
void QSGD3D12Engine::queueSetRenderTarget(uint id)
{
    d->queueSetRenderTarget(id);
}

// Forwards \a color to QSGD3D12EnginePrivate::queueClearRenderTarget().
void QSGD3D12Engine::queueClearRenderTarget(const QColor &color)
{
    d->queueClearRenderTarget(color);
}

// Forwards the depth value, stencil value and clear flags to
// QSGD3D12EnginePrivate::queueClearDepthStencil().
void QSGD3D12Engine::queueClearDepthStencil(float depthValue, quint8 stencilValue, ClearFlags which)
{
    d->queueClearDepthStencil(depthValue, stencilValue, which);
}

// Forwards \a factor to QSGD3D12EnginePrivate::queueSetBlendFactor().
void QSGD3D12Engine::queueSetBlendFactor(const QVector4D &factor)
{
    d->queueSetBlendFactor(factor);
}

// Forwards \a ref to QSGD3D12EnginePrivate::queueSetStencilRef().
void QSGD3D12Engine::queueSetStencilRef(quint32 ref)
{
    d->queueSetStencilRef(ref);
}

// Forwards \a params to QSGD3D12EnginePrivate::queueDraw().
void QSGD3D12Engine::queueDraw(const DrawParams &params)
{
    d->queueDraw(params);
}

// Forwards to QSGD3D12EnginePrivate::present().
void QSGD3D12Engine::present()
{
    d->present();
}

// Forwards to QSGD3D12EnginePrivate::waitGPU().
void QSGD3D12Engine::waitGPU()
{
    d->waitGPU();
}

// Returns the texture id produced by QSGD3D12EnginePrivate::genTexture().
uint QSGD3D12Engine::genTexture()
{
    return d->genTexture();
}

// Forwards the texture id, size, image format and creation flags to
// QSGD3D12EnginePrivate::createTexture().
void QSGD3D12Engine::createTexture(uint id, const QSize &size, QImage::Format format, TextureCreateFlags flags)
{
    d->createTexture(id, size, format, flags);
}

// Forwards the texture id and new size to
// QSGD3D12EnginePrivate::queueTextureResize().
void QSGD3D12Engine::queueTextureResize(uint id, const QSize &size)
{
    d->queueTextureResize(id, size);
}

// Convenience overload for uploading a single image: wraps \a image and
// \a dstPos in one-element vectors and hands them to the private
// implementation's multi-image upload.
void QSGD3D12Engine::queueTextureUpload(uint id, const QImage &image, const QPoint &dstPos, TextureUploadFlags flags)
{
    QVector<QImage> singleImage;
    singleImage.append(image);

    QVector<QPoint> singlePos;
    singlePos.append(dstPos);

    d->queueTextureUpload(id, singleImage, singlePos, flags);
}

// Forwards the texture id, the images with their destination positions, and
// the upload flags to QSGD3D12EnginePrivate::queueTextureUpload().
void QSGD3D12Engine::queueTextureUpload(uint id, const QVector<QImage> &images, const QVector<QPoint> &dstPos,
                                        TextureUploadFlags flags)
{
    d->queueTextureUpload(id, images, dstPos, flags);
}

// Forwards to QSGD3D12EnginePrivate::releaseTexture() for texture \a id.
void QSGD3D12Engine::releaseTexture(uint id)
{
    d->releaseTexture(id);
}

// Forwards to QSGD3D12EnginePrivate::useTexture() for texture \a id.
void QSGD3D12Engine::useTexture(uint id)
{
    d->useTexture(id);
}

// Returns the render target id produced by
// QSGD3D12EnginePrivate::genRenderTarget().
uint QSGD3D12Engine::genRenderTarget()
{
    return d->genRenderTarget();
}

// Forwards the render target id, size, clear color and sample count to
// QSGD3D12EnginePrivate::createRenderTarget().
void QSGD3D12Engine::createRenderTarget(uint id, const QSize &size, const QVector4D &clearColor, uint samples)
{
    d->createRenderTarget(id, size, clearColor, samples);
}

// Forwards to QSGD3D12EnginePrivate::releaseRenderTarget() for render
// target \a id.
void QSGD3D12Engine::releaseRenderTarget(uint id)
{
    d->releaseRenderTarget(id);
}

// Forwards to QSGD3D12EnginePrivate::useRenderTargetAsTexture() for render
// target \a id.
void QSGD3D12Engine::useRenderTargetAsTexture(uint id)
{
    d->useRenderTargetAsTexture(id);
}

// Returns the id reported by QSGD3D12EnginePrivate::activeRenderTarget().
uint QSGD3D12Engine::activeRenderTarget() const
{
    return d->activeRenderTarget();
}

// Returns the image produced by
// QSGD3D12EnginePrivate::executeAndWaitReadbackRenderTarget() for render
// target \a id.
QImage QSGD3D12Engine::executeAndWaitReadbackRenderTarget(uint id)
{
    return d->executeAndWaitReadbackRenderTarget(id);
}

// Forwards to QSGD3D12EnginePrivate::simulateDeviceLoss().
void QSGD3D12Engine::simulateDeviceLoss()
{
    d->simulateDeviceLoss();
}

// Returns whatever QSGD3D12EnginePrivate::getResource() reports for
// \a resource. The QQuickWindow argument is unused.
void *QSGD3D12Engine::getResource(QQuickWindow *, QSGRendererInterface::Resource resource) const
{
    return d->getResource(resource);
}

// Rounds \a size up to the next multiple of \a byteAlign. The bit mask
// arithmetic is only meaningful when \a byteAlign is a power of two.
static inline quint32 alignedSize(quint32 size, quint32 byteAlign)
{
    const quint32 mask = byteAlign - 1;
    return (size + mask) & ~mask;
}

// Rounds \a size up to a multiple of
// D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT.
quint32 QSGD3D12Engine::alignedConstantBufferSize(quint32 size)
{
    return alignedSize(size, D3D12_CONSTANT_BUFFER_DATA_PLACEMENT_ALIGNMENT);
}

// Maps a scenegraph geometry attribute type and tuple size to the engine's
// format enum.
//
// \a sgtype    the QSGGeometry component type.
// \a tupleSize number of components per element. For the byte and float
//              types it selects the entry in a lookup table; values outside
//              the tables (or without a matching format, such as 3 unsigned
//              bytes) yield FmtUnknown.
// \a size      optional; when non-null it receives the element size in bytes
//              (component size * tupleSize) for the supported types.
//
// Returns FmtUnknown, after printing a warning, for types that have no
// mapping.
QSGD3D12Format QSGD3D12Engine::toDXGIFormat(QSGGeometry::Type sgtype, int tupleSize, int *size)
{
    QSGD3D12Format format = FmtUnknown;

    static const QSGD3D12Format formatMap_ub[] = { FmtUnknown,
                                                   FmtUNormByte,
                                                   FmtUNormByte2,
                                                   FmtUnknown,
                                                   FmtUNormByte4 };

    static const QSGD3D12Format formatMap_f[] = { FmtUnknown,
                                                  FmtFloat,
                                                  FmtFloat2,
                                                  FmtFloat3,
                                                  FmtFloat4 };

    switch (sgtype) {
    case QSGGeometry::UnsignedByteType:
        // Guard the table lookup: an unexpected tuple size must not read
        // past the end of the array.
        if (tupleSize >= 0 && tupleSize < int(_countof(formatMap_ub)))
            format = formatMap_ub[tupleSize];
        if (size)
            *size = tupleSize;
        break;
    case QSGGeometry::FloatType:
        if (tupleSize >= 0 && tupleSize < int(_countof(formatMap_f)))
            format = formatMap_f[tupleSize];
        if (size)
            *size = sizeof(float) * tupleSize;
        break;

    case QSGGeometry::UnsignedShortType:
        format = FmtUnsignedShort;
        if (size)
            *size = sizeof(ushort) * tupleSize;
        break;
    case QSGGeometry::UnsignedIntType:
        format = FmtUnsignedInt;
        if (size)
            *size = sizeof(uint) * tupleSize;
        break;

    case QSGGeometry::ByteType:
    case QSGGeometry::IntType:
    case QSGGeometry::ShortType:
        qWarning("no mapping for GL type 0x%x", sgtype);
        break;

    default:
        qWarning("unknown GL type 0x%x", sgtype);
        break;
    }

    return format;
}

// Returns the number of mip levels for a texture of the given size:
// ceil(log2(largest dimension)) + 1.
//
// A largest dimension of 1 or less (including empty or invalid sizes) yields
// a single level. Without this guard log2() of zero or a negative value would
// produce -inf/NaN, and converting that to int is undefined.
int QSGD3D12Engine::mipMapLevels(const QSize &size)
{
    const int largestDim = qMax(size.width(), size.height());
    if (largestDim <= 1)
        return 1;

    return int(ceil(log2(double(largestDim)))) + 1;
}

// Returns true when \a x has exactly one bit set.
inline static bool isPowerOfTwo(int x)
{
    // Assumption: x >= 1
    // Clearing the lowest set bit leaves zero only for a power of two.
    return (x & (x - 1)) == 0;
}

// Returns the size a source image must have to be mipmap-capable: each
// dimension that is not already a power of two is replaced by
// qNextPowerOfTwo() of it. Empty sizes are returned unchanged.
QSize QSGD3D12Engine::mipMapAdjustedSourceSize(const QSize &size)
{
    if (size.isEmpty())
        return size;

    // ### for now only power-of-two sizes are mipmap-capable
    int width = size.width();
    if (!isPowerOfTwo(width))
        width = qNextPowerOfTwo(size.width());

    int height = size.height();
    if (!isPowerOfTwo(height))
        height = qNextPowerOfTwo(size.height());

    return QSize(width, height);
}

// Releases everything set up by initialize() and drops this engine's
// reference on the shared device. Does nothing when the engine is not
// initialized. The window-related members are kept so the engine can be
// re-initialized for the same window, e.g. after a device loss.
void QSGD3D12EnginePrivate::releaseResources()
{
    if (!initialized)
        return;

    mipmapper.releaseResources();
    devLossTest.releaseResources();

    frameCommandList = nullptr;
    copyCommandList = nullptr;

    copyCommandAllocator = nullptr;
    for (int i = 0; i < frameInFlightCount; ++i) {
        frameCommandAllocator[i] = nullptr;
        pframeData[i].gpuCbvSrvUavHeap = nullptr;
        // Reset after delete so no dangling fence pointer is left behind.
        delete frameFence[i];
        frameFence[i] = nullptr;
    }

    defaultDS = nullptr;
    for (int i = 0; i < swapChainBufferCount; ++i) {
        backBufferRT[i] = nullptr;
        defaultRT[i] = nullptr;
    }

    psoCache.clear();
    rootSigCache.clear();
    buffers.clear();
    textures.clear();
    renderTargets.clear();

    cpuDescHeapManager.releaseResources();

    commandQueue = nullptr;
    copyCommandQueue = nullptr;

#ifndef Q_OS_WINRT
    dcompTarget = nullptr;
    dcompVisual = nullptr;
    dcompDevice = nullptr;
#endif

    swapChain = nullptr;

    delete presentFence;
    presentFence = nullptr;
    textureUploadFence = nullptr;

    // Balances the ref() taken in initialize().
    deviceManager()->unref();

    initialized = false;

    // 'window' must be kept, may just be a device loss
}

void QSGD3D12EnginePrivate::initialize(WId w, const QSize &size, float dpr, int surfaceFormatSamples, bool alpha)
{
    if (initialized)
        return;

    window = w;
    windowSize = size;
    windowDpr = dpr;
    windowSamples = qMax(1, surfaceFormatSamples); // may be -1 or 0, whereas windowSamples is uint and >= 1
    windowAlpha = alpha;

    swapChainBufferCount = qMin(qEnvironmentVariableIntValue("QT_D3D_BUFFER_COUNT"), MAX_SWAP_CHAIN_BUFFER_COUNT);
    if (swapChainBufferCount < 2)
        swapChainBufferCount = DEFAULT_SWAP_CHAIN_BUFFER_COUNT;

    frameInFlightCount = qMin(qEnvironmentVariableIntValue("QT_D3D_FRAME_COUNT"), MAX_FRAME_IN_FLIGHT_COUNT);
    if (frameInFlightCount < 1)
        frameInFlightCount = DEFAULT_FRAME_IN_FLIGHT_COUNT;

    static const char *latReqEnvVar = "QT_D3D_WAITABLE_SWAP_CHAIN_MAX_LATENCY";
    if (!qEnvironmentVariableIsSet(latReqEnvVar))
        waitableSwapChainMaxLatency = DEFAULT_WAITABLE_SWAP_CHAIN_MAX_LATENCY;
    else
        waitableSwapChainMaxLatency = qBound(0, qEnvironmentVariableIntValue(latReqEnvVar), 16);

    if (qEnvironmentVariableIsSet("QSG_INFO"))
        const_cast<QLoggingCategory &>(QSG_LOG_INFO_GENERAL()).setEnabled(QtDebugMsg, true);

    qCDebug(QSG_LOG_INFO_GENERAL, "d3d12 engine init. swap chain buffer count %d, max frames prepared without blocking %d",
            swapChainBufferCount, frameInFlightCount);
    if (waitableSwapChainMaxLatency)
        qCDebug(QSG_LOG_INFO_GENERAL, "Swap chain frame latency waitable object enabled. Frame latency is %d", waitableSwapChainMaxLatency);

    const bool debugLayer = qEnvironmentVariableIntValue("QT_D3D_DEBUG") != 0;
    if (debugLayer) {
        qCDebug(QSG_LOG_INFO_GENERAL, "Enabling debug layer");
#if !defined(Q_OS_WINRT) || !defined(NDEBUG)
        ComPtr<ID3D12Debug> debugController;
        if (SUCCEEDED(D3D12GetDebugInterface(IID_PPV_ARGS(&debugController))))
            debugController->EnableDebugLayer();
#else
        qCDebug(QSG_LOG_INFO_GENERAL, "Using DebugInterface will not allow certification to pass");
#endif
    }

    QSGD3D12DeviceManager *dev = deviceManager();
    device = dev->ref();
    dev->registerDeviceLossObserver(this);

    if (debugLayer) {
        ComPtr<ID3D12InfoQueue> infoQueue;
        if (SUCCEEDED(device->QueryInterface(IID_PPV_ARGS(&infoQueue)))) {
            infoQueue->SetBreakOnSeverity(D3D12_MESSAGE_SEVERITY_CORRUPTION, true);
            infoQueue->SetBreakOnSeverity(D3D12_MESSAGE_SEVERITY_ERROR, true);
            const bool breakOnWarning = qEnvironmentVariableIntValue("QT_D3D_DEBUG_BREAK_ON_WARNING") != 0;
            infoQueue->SetBreakOnSeverity(D3D12_MESSAGE_SEVERITY_WARNING, breakOnWarning);
            D3D12_INFO_QUEUE_FILTER filter = {};
            D3D12_MESSAGE_ID suppressedMessages[] = {
                // When using a render target other than the default one we
                // have no way to know the custom clear color, if there is one.
                D3D12_MESSAGE_ID_CLEARRENDERTARGETVIEW_MISMATCHINGCLEARVALUE
            };
            filter.DenyList.NumIDs = _countof(suppressedMessages);
            filter.DenyList.pIDList = suppressedMessages;
            // setting the filter would enable Info messages which we don't need
            D3D12_MESSAGE_SEVERITY infoSev = D3D12_MESSAGE_SEVERITY_INFO;
            filter.DenyList.NumSeverities = 1;
            filter.DenyList.pSeverityList = &infoSev;
            infoQueue->PushStorageFilter(&filter);
        }
    }

    D3D12_COMMAND_QUEUE_DESC queueDesc = {};
    queueDesc.Type = D3D12_COMMAND_LIST_TYPE_DIRECT;
    if (FAILED(device->CreateCommandQueue(&queueDesc, IID_PPV_ARGS(&commandQueue)))) {
        qWarning("Failed to create command queue");
        return;
    }

    queueDesc.Type = D3D12_COMMAND_LIST_TYPE_COPY;
    if (FAILED(device->CreateCommandQueue(&queueDesc, IID_PPV_ARGS(&copyCommandQueue)))) {
        qWarning("Failed to create copy command queue");
        return;
    }

#ifndef Q_OS_WINRT
    HWND hwnd = reinterpret_cast<HWND>(w);

    if (windowAlpha) {
        // Go through DirectComposition for semi-transparent windows since the
        // traditional approaches won't fly with flip model swapchains.
        HRESULT hr = DCompositionCreateDevice(nullptr, IID_PPV_ARGS(&dcompDevice));
        if (SUCCEEDED(hr)) {
            hr = dcompDevice->CreateTargetForHwnd(hwnd, true, &dcompTarget);
            if (SUCCEEDED(hr)) {
                hr = dcompDevice->CreateVisual(&dcompVisual);
                if (FAILED(hr)) {
                    qWarning("Failed to create DirectComposition visual: %s",
                             qPrintable(comErrorMessage(hr)));
                    windowAlpha = false;
                }
            } else {
                qWarning("Failed to create DirectComposition target: %s",
                         qPrintable(comErrorMessage(hr)));
                windowAlpha = false;
            }
        } else {
            qWarning("Failed to create DirectComposition device: %s",
                     qPrintable(comErrorMessage(hr)));
            windowAlpha = false;
        }
    }

    if (windowAlpha) {
        DXGI_SWAP_CHAIN_DESC1 swapChainDesc = {};
        swapChainDesc.Width = windowSize.width() * windowDpr;
        swapChainDesc.Height = windowSize.height() * windowDpr;
        swapChainDesc.Format = RT_COLOR_FORMAT;
        swapChainDesc.SampleDesc.Count = 1;
        swapChainDesc.BufferUsage = DXGI_USAGE_RENDER_TARGET_OUTPUT;
        swapChainDesc.BufferCount = swapChainBufferCount;
        swapChainDesc.Scaling = DXGI_SCALING_STRETCH;
        swapChainDesc.SwapEffect = DXGI_SWAP_EFFECT_FLIP_DISCARD;
        swapChainDesc.AlphaMode = DXGI_ALPHA_MODE_PREMULTIPLIED;
        if (waitableSwapChainMaxLatency)
            swapChainDesc.Flags = DXGI_SWAP_CHAIN_FLAG_FRAME_LATENCY_WAITABLE_OBJECT;

        ComPtr<IDXGISwapChain1> baseSwapChain;
        HRESULT hr = dev->dxgi()->CreateSwapChainForComposition(commandQueue.Get(), &swapChainDesc, nullptr, &baseSwapChain);
        if (SUCCEEDED(hr)) {
            if (SUCCEEDED(baseSwapChain.As(&swapChain))) {
                hr = dcompVisual->SetContent(swapChain.Get());
                if (SUCCEEDED(hr)) {
                    hr = dcompTarget->SetRoot(dcompVisual.Get());
                    if (FAILED(hr)) {
                        qWarning("SetRoot failed for DirectComposition target: %s",
                                 qPrintable(comErrorMessage(hr)));
                        windowAlpha = false;
                    }
                } else {
                    qWarning("SetContent failed for DirectComposition visual: %s",
                             qPrintable(comErrorMessage(hr)));
                    windowAlpha = false;
                }
            } else {
                qWarning("Failed to cast swap chain");
                windowAlpha = false;
            }
        } else {
            qWarning("Failed to create swap chain for composition: 0x%x", hr);
            windowAlpha = false;
        }
    }

    if (!windowAlpha) {
        DXGI_SWAP_CHAIN_DESC swapChainDesc = {};
        swapChainDesc.BufferCount = swapChainBufferCount;
        swapChainDesc.BufferDesc.Width = windowSize.width() * windowDpr;
        swapChainDesc.BufferDesc.Height = windowSize.height() * windowDpr;
        swapChainDesc.BufferDesc.Format = RT_COLOR_FORMAT;
        swapChainDesc.BufferUsage = DXGI_USAGE_RENDER_TARGET_OUTPUT;
        swapChainDesc.SwapEffect = DXGI_SWAP_EFFECT_FLIP_DISCARD; // D3D12 requires the flip model
        swapChainDesc.OutputWindow = hwnd;
        swapChainDesc.SampleDesc.Count = 1; // Flip does not support MSAA so no choice here
        swapChainDesc.Windowed = TRUE;
        if (waitableSwapChainMaxLatency)
            swapChainDesc.Flags = DXGI_SWAP_CHAIN_FLAG_FRAME_LATENCY_WAITABLE_OBJECT;

        ComPtr<IDXGISwapChain> baseSwapChain;
        HRESULT hr = dev->dxgi()->CreateSwapChain(commandQueue.Get(), &swapChainDesc, &baseSwapChain);
        if (FAILED(hr)) {
            qWarning("Failed to create swap chain: %s", qPrintable(comErrorMessage(hr)));
            return;
        }
        hr = baseSwapChain.As(&swapChain);
        if (FAILED(hr)) {
            qWarning("Failed to cast swap chain: %s", qPrintable(comErrorMessage(hr)));
            return;
        }
    }

    dev->dxgi()->MakeWindowAssociation(hwnd, DXGI_MWA_NO_ALT_ENTER);
#else
    DXGI_SWAP_CHAIN_DESC1 swapChainDesc = {};
    swapChainDesc.Width = windowSize.width() * windowDpr;
    swapChainDesc.Height = windowSize.height() * windowDpr;
    swapChainDesc.Format = RT_COLOR_FORMAT;
    swapChainDesc.SampleDesc.Count = 1;
    swapChainDesc.BufferUsage = DXGI_USAGE_RENDER_TARGET_OUTPUT;
    swapChainDesc.BufferCount = swapChainBufferCount;
    swapChainDesc.Scaling = DXGI_SCALING_STRETCH;
    swapChainDesc.SwapEffect = DXGI_SWAP_EFFECT_FLIP_DISCARD;
    swapChainDesc.AlphaMode = DXGI_ALPHA_MODE_PREMULTIPLIED;
    if (waitableSwapChainMaxLatency)
        swapChainDesc.Flags = DXGI_SWAP_CHAIN_FLAG_FRAME_LATENCY_WAITABLE_OBJECT;

    ComPtr<IDXGISwapChain1> baseSwapChain;
    HRESULT hr = dev->dxgi()->CreateSwapChainForComposition(commandQueue.Get(), &swapChainDesc, nullptr, &baseSwapChain);
    if (FAILED(hr)) {
        qWarning("Failed to create swap chain for composition: 0x%x", hr);
        return;
    }
    if (FAILED(baseSwapChain.As(&swapChain))) {
        qWarning("Failed to cast swap chain");
        return;
    }

    // The winrt platform plugin returns an ISwapChainPanel* from winId().
    ComPtr<ABI::Windows::UI::Xaml::Controls::ISwapChainPanel> swapChainPanel
            = reinterpret_cast<ABI::Windows::UI::Xaml::Controls::ISwapChainPanel *>(window);
    ComPtr<ISwapChainPanelNative> swapChainPanelNative;
    if (FAILED(swapChainPanel.As(&swapChainPanelNative))) {
        qWarning("Failed to cast swap chain panel to native");
        return;
    }
    hr = QEventDispatcherWinRT::runOnXamlThread([this, &swapChainPanelNative]() {
        return swapChainPanelNative->SetSwapChain(swapChain.Get());
    });
    if (FAILED(hr)) {
        qWarning("Failed to set swap chain on panel: 0x%x", hr);
        return;
    }
#endif

    if (waitableSwapChainMaxLatency) {
        if (FAILED(swapChain->SetMaximumFrameLatency(waitableSwapChainMaxLatency)))
            qWarning("Failed to set maximum frame latency to %d", waitableSwapChainMaxLatency);
        swapEvent = swapChain->GetFrameLatencyWaitableObject();
    }

    for (int i = 0; i < frameInFlightCount; ++i) {
        if (FAILED(device->CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_DIRECT, IID_PPV_ARGS(&frameCommandAllocator[i])))) {
            qWarning("Failed to create command allocator");
            return;
        }
    }

    if (FAILED(device->CreateCommandAllocator(D3D12_COMMAND_LIST_TYPE_COPY, IID_PPV_ARGS(&copyCommandAllocator)))) {
        qWarning("Failed to create copy command allocator");
        return;
    }

    for (int i = 0; i < frameInFlightCount; ++i) {
        if (!createCbvSrvUavHeap(i, GPU_CBVSRVUAV_DESCRIPTORS))
            return;
    }

    cpuDescHeapManager.initialize(device);

    setupDefaultRenderTargets();

    if (FAILED(device->CreateCommandList(0, D3D12_COMMAND_LIST_TYPE_DIRECT, frameCommandAllocator[0].Get(),
                                         nullptr, IID_PPV_ARGS(&frameCommandList)))) {
        qWarning("Failed to create command list");
        return;
    }
    // created in recording state, close it for now
    frameCommandList->Close();

    if (FAILED(device->CreateCommandList(0, D3D12_COMMAND_LIST_TYPE_COPY, copyCommandAllocator.Get(),
                                         nullptr, IID_PPV_ARGS(&copyCommandList)))) {
        qWarning("Failed to create copy command list");
        return;
    }
    copyCommandList->Close();

    frameIndex = 0;

    presentFence = createCPUWaitableFence();
    for (int i = 0; i < frameInFlightCount; ++i)
        frameFence[i] = createCPUWaitableFence();

    if (FAILED(device->CreateFence(0, D3D12_FENCE_FLAG_NONE, IID_PPV_ARGS(&textureUploadFence)))) {
        qWarning("Failed to create fence");
        return;
    }

    psoCache.setMaxCost(MAX_CACHED_PSO);
    rootSigCache.setMaxCost(MAX_CACHED_ROOTSIG);

    if (!mipmapper.initialize(this))
        return;

    if (!devLossTest.initialize(this))
        return;

    currentRenderTarget = 0;

    initialized = true;
}

bool QSGD3D12EnginePrivate::createCbvSrvUavHeap(int pframeIndex, int descriptorCount)
{
    D3D12_DESCRIPTOR_HEAP_DESC gpuDescHeapDesc = {};
    gpuDescHeapDesc.NumDescriptors = descriptorCount;
    gpuDescHeapDesc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV;
    gpuDescHeapDesc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE;

    if (FAILED(device->CreateDescriptorHeap(&gpuDescHeapDesc, IID_PPV_ARGS(&pframeData[pframeIndex].gpuCbvSrvUavHeap)))) {
        qWarning("Failed to create shader-visible CBV-SRV-UAV heap");
        return false;
    }

    pframeData[pframeIndex].gpuCbvSrvUavHeapSize = descriptorCount;

    return true;
}

// Builds the sample description to use for a resource of the given format
// when 'samples' samples per pixel are requested.
//
// For samples <= 1, or when the device reports no support (or the query
// itself fails), the result is the non-multisampled { 1, 0 }. Otherwise Count
// is 'samples' and Quality is the highest quality level the device reports
// for the format/count combination.
DXGI_SAMPLE_DESC QSGD3D12EnginePrivate::makeSampleDesc(DXGI_FORMAT format, uint samples)
{
    // Start out with the fallback and upgrade only once support is confirmed.
    DXGI_SAMPLE_DESC result;
    result.Count = 1;
    result.Quality = 0;

    if (samples <= 1)
        return result;

    D3D12_FEATURE_DATA_MULTISAMPLE_QUALITY_LEVELS levels = {};
    levels.Format = format;
    levels.SampleCount = samples;

    if (FAILED(device->CheckFeatureSupport(D3D12_FEATURE_MULTISAMPLE_QUALITY_LEVELS, &levels, sizeof(levels)))) {
        qWarning("Failed to query multisample quality levels for sample count %d", samples);
        return result;
    }

    if (levels.NumQualityLevels == 0) {
        qWarning("No quality levels for multisampling with sample count %d", samples);
        return result;
    }

    result.Count = samples;
    result.Quality = levels.NumQualityLevels - 1;
    return result;
}

// Creates a committed 2D texture of the given size in RT_COLOR_FORMAT that is
// usable as a render target, and writes a render target view for it to
// viewHandle. clearColor becomes the resource's optimized clear value.
// 'samples' goes through makeSampleDesc(), so an unsupported sample count
// yields a non-multisampled resource.
//
// Returns the new resource, for which the caller takes over the reference, or
// nullptr (after printing a warning) when the resource cannot be created.
ID3D12Resource *QSGD3D12EnginePrivate::createColorBuffer(D3D12_CPU_DESCRIPTOR_HANDLE viewHandle, const QSize &size,
                                                         const QVector4D &clearColor, uint samples)
{
    D3D12_HEAP_PROPERTIES defaultHeap = {};
    defaultHeap.Type = D3D12_HEAP_TYPE_DEFAULT;

    D3D12_RESOURCE_DESC desc = {};
    desc.Dimension = D3D12_RESOURCE_DIMENSION_TEXTURE2D;
    desc.Width = size.width();
    desc.Height = size.height();
    desc.DepthOrArraySize = 1;
    desc.MipLevels = 1;
    desc.Format = RT_COLOR_FORMAT;
    desc.SampleDesc = makeSampleDesc(desc.Format, samples);
    desc.Flags = D3D12_RESOURCE_FLAG_ALLOW_RENDER_TARGET;

    D3D12_CLEAR_VALUE optimizedClear = {};
    optimizedClear.Format = RT_COLOR_FORMAT;
    optimizedClear.Color[0] = clearColor.x();
    optimizedClear.Color[1] = clearColor.y();
    optimizedClear.Color[2] = clearColor.z();
    optimizedClear.Color[3] = clearColor.w();

    // The initial state follows the requested sample count (not the count
    // that makeSampleDesc() settled on): multisample targets start out as
    // render targets, everything else as a pixel shader resource.
    D3D12_RESOURCE_STATES initialState = D3D12_RESOURCE_STATE_RENDER_TARGET;
    if (samples <= 1)
        initialState = D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE;

    ID3D12Resource *rt = nullptr;
    const bool created = SUCCEEDED(device->CreateCommittedResource(&defaultHeap, D3D12_HEAP_FLAG_NONE, &desc,
                                                                   initialState, &optimizedClear, IID_PPV_ARGS(&rt)));
    if (!created) {
        qWarning("Failed to create offscreen render target of size %dx%d", size.width(), size.height());
        return nullptr;
    }

    // A null view description requests the default view for the resource.
    device->CreateRenderTargetView(rt, nullptr, viewHandle);

    return rt;
}

// Creates a committed D24_UNORM_S8_UINT depth-stencil texture of the given
// size, initially in the DEPTH_WRITE state, and writes a depth-stencil view
// for it to viewHandle. The optimized clear value is depth 1.0, stencil 0.
// 'samples' goes through makeSampleDesc(); the view dimension follows the
// sample count that was actually chosen.
//
// Returns the new resource, for which the caller takes over the reference, or
// nullptr (after printing a warning) when the resource cannot be created.
ID3D12Resource *QSGD3D12EnginePrivate::createDepthStencil(D3D12_CPU_DESCRIPTOR_HANDLE viewHandle, const QSize &size, uint samples)
{
    const DXGI_FORMAT dsFormat = DXGI_FORMAT_D24_UNORM_S8_UINT;

    D3D12_HEAP_PROPERTIES defaultHeap = {};
    defaultHeap.Type = D3D12_HEAP_TYPE_DEFAULT;

    D3D12_RESOURCE_DESC desc = {};
    desc.Dimension = D3D12_RESOURCE_DIMENSION_TEXTURE2D;
    desc.Width = size.width();
    desc.Height = size.height();
    desc.DepthOrArraySize = 1;
    desc.MipLevels = 1;
    desc.Format = dsFormat;
    desc.SampleDesc = makeSampleDesc(desc.Format, samples);
    desc.Layout = D3D12_TEXTURE_LAYOUT_UNKNOWN;
    desc.Flags = D3D12_RESOURCE_FLAG_ALLOW_DEPTH_STENCIL;

    D3D12_CLEAR_VALUE optimizedClear = {};
    optimizedClear.Format = dsFormat;
    optimizedClear.DepthStencil.Depth = 1.0f;
    optimizedClear.DepthStencil.Stencil = 0;

    ID3D12Resource *ds = nullptr;
    const bool created = SUCCEEDED(device->CreateCommittedResource(&defaultHeap, D3D12_HEAP_FLAG_NONE, &desc,
                                                                   D3D12_RESOURCE_STATE_DEPTH_WRITE, &optimizedClear, IID_PPV_ARGS(&ds)));
    if (!created) {
        qWarning("Failed to create depth-stencil buffer of size %dx%d", size.width(), size.height());
        return nullptr;
    }

    D3D12_DEPTH_STENCIL_VIEW_DESC viewDesc = {};
    viewDesc.Format = dsFormat;
    if (desc.SampleDesc.Count <= 1)
        viewDesc.ViewDimension = D3D12_DSV_DIMENSION_TEXTURE2D;
    else
        viewDesc.ViewDimension = D3D12_DSV_DIMENSION_TEXTURE2DMS;

    device->CreateDepthStencilView(ds, &viewDesc, viewHandle);

    return ds;
}

void QSGD3D12EnginePrivate::setupDefaultRenderTargets()
{
    for (int i = 0; i < swapChainBufferCount; ++i) {
        if (FAILED(swapChain->GetBuffer(i, IID_PPV_ARGS(&backBufferRT[i])))) {
            qWarning("Failed to get buffer %d from swap chain", i);
            return;
        }
        defaultRTV[i] = cpuDescHeapManager.allocate(D3D12_DESCRIPTOR_HEAP_TYPE_RTV);
        if (windowSamples == 1) {
            defaultRT[i] = backBufferRT[i];
            device->CreateRenderTargetView(defaultRT[i].Get(), nullptr, defaultRTV[i]);
        } else {
            const QSize size(windowSize.width() * windowDpr, windowSize.height() * windowDpr);
            // Not optimal if the user called setClearColor, but there's so
            // much we can do. The debug layer warning is suppressed so we're good to go.
            const QColor cc(Qt::white);
            const QVector4D clearColor(cc.redF(), cc.greenF(), cc.blueF(), cc.alphaF());
            ID3D12Resource *msaaRT = createColorBuffer(defaultRTV[i], size, clearColor, windowSamples);
            if (msaaRT)
                defaultRT[i].Attach(msaaRT);
        }
    }

    defaultDSV = cpuDescHeapManager.allocate(D3D12_DESCRIPTOR_HEAP_TYPE_DSV);
    const QSize size(windowSize.width() * windowDpr, windowSize.height() * windowDpr);
    ID3D12Resource *ds = createDepthStencil(defaultDSV, size, windowSamples);
    if (ds)
        defaultDS.Attach(ds);

    presentFrameIndex = 0;
}

void QSGD3D12EnginePrivate::setWindowSize(const QSize &size, float dpr)
{
    if (!initialized || (windowSize == size && windowDpr == dpr))
        return;

    waitGPU();

    windowSize = size;
    windowDpr = dpr;

    if (Q_UNLIKELY(debug_render()))
        qDebug() << "resize" << size << dpr;

    // Clear these, otherwise resizing will fail.
    defaultDS = nullptr;
    cpuDescHeapManager.release(defaultDSV, D3D12_DESCRIPTOR_HEAP_TYPE_DSV);
    for (int i = 0; i < swapChainBufferCount; ++i) {
        backBufferRT[i] = nullptr;
        defaultRT[i] = nullptr;
        cpuDescHeapManager.release(defaultRTV[i], D3D12_DESCRIPTOR_HEAP_TYPE_RTV);
    }

    const int w = windowSize.width() * windowDpr;
    const int h = windowSize.height() * windowDpr;
    HRESULT hr = swapChain->ResizeBuffers(swapChainBufferCount, w, h, RT_COLOR_FORMAT,
                                          waitableSwapChainMaxLatency ? DXGI_SWAP_CHAIN_FLAG_FRAME_LATENCY_WAITABLE_OBJECT : 0);
    if (hr == DXGI_ERROR_DEVICE_REMOVED || hr == DXGI_ERROR_DEVICE_RESET) {
        deviceManager()->deviceLossDetected();
        return;
    } else if (FAILED(hr)) {
        qWarning("Failed to resize buffers: %s", qPrintable(comErrorMessage(hr)));
        return;
    }

    setupDefaultRenderTargets();
}

// Reacts to a lost device by warning about it and releasing all resources.
void QSGD3D12EnginePrivate::deviceLost()
{
    qWarning("D3D device lost, will attempt to reinitialize");

    // Everything has to be released, otherwise reinitialization may fail.
    releaseResources();

    // The engine is now uninitialized, although 'window' stays valid. The
    // resources get recreated on the next beginFrame().
}

// Allocates a fence object consisting of a D3D12 fence and an auto-reset,
// initially non-signaled Win32 event that the CPU can block on.
//
// Always returns a newly allocated object. When creating the fence fails, a
// warning is printed and the object is returned without an event. A failure
// to create the event is reported with a warning as well.
QSGD3D12CPUWaitableFence *QSGD3D12EnginePrivate::createCPUWaitableFence() const
{
    QSGD3D12CPUWaitableFence *f = new QSGD3D12CPUWaitableFence;
    HRESULT hr = device->CreateFence(f->value, D3D12_FENCE_FLAG_NONE, IID_PPV_ARGS(&f->fence));
    if (FAILED(hr)) {
        qWarning("Failed to create fence: %s", qPrintable(comErrorMessage(hr)));
        return f;
    }
    f->event = CreateEvent(nullptr, FALSE, FALSE, nullptr);
    // CreateEvent returns a null handle on failure. Do not let that pass
    // silently since waits on this fence rely on the event.
    if (!f->event)
        qWarning("Failed to create event for fence");
    return f;
}

// Blocks the calling thread until the command queue has reached a newly
// signaled value of the fence f. The fence's CPU-side counter is incremented
// to obtain that value. Returns without waiting, after printing a warning,
// when the completion event cannot be registered.
void QSGD3D12EnginePrivate::waitForGPU(QSGD3D12CPUWaitableFence *f) const
{
    const UINT64 targetValue = f->value.fetchAndAddAcquire(1) + 1;
    commandQueue->Signal(f->fence.Get(), targetValue);

    // Nothing to wait for when the queue got there already.
    if (f->fence->GetCompletedValue() >= targetValue)
        return;

    const HRESULT hr = f->fence->SetEventOnCompletion(targetValue, f->event);
    if (FAILED(hr)) {
        qWarning("SetEventOnCompletion failed: %s", qPrintable(comErrorMessage(hr)));
        return;
    }

    WaitForSingleObject(f->event, INFINITE);
}

// Records a transition barrier on commandList that moves all subresources of
// 'resource' from the state 'before' to the state 'after'.
void QSGD3D12EnginePrivate::transitionResource(ID3D12Resource *resource, ID3D12GraphicsCommandList *commandList,
                                               D3D12_RESOURCE_STATES before, D3D12_RESOURCE_STATES after) const
{
    D3D12_RESOURCE_BARRIER transitionBarrier = {};
    transitionBarrier.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
    transitionBarrier.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;

    auto &transition = transitionBarrier.Transition;
    transition.pResource = resource;
    transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
    transition.StateBefore = before;
    transition.StateAfter = after;

    commandList->ResourceBarrier(1, &transitionBarrier);
}

// Records the commands that resolve subresource 0 of the multisampled render
// target 'msaa' into subresource 0 of 'resolve'.
//
// 'msaa' is expected to be in the RENDER_TARGET state and 'resolve' in the
// state resolveUsage; both are back in those states after the resolve.
void QSGD3D12EnginePrivate::resolveMultisampledTarget(ID3D12Resource *msaa,
                                                      ID3D12Resource *resolve,
                                                      D3D12_RESOURCE_STATES resolveUsage,
                                                      ID3D12GraphicsCommandList *commandList) const
{
    // Produces a transition barrier covering all subresources of res.
    const auto makeTransition = [](ID3D12Resource *res, D3D12_RESOURCE_STATES from, D3D12_RESOURCE_STATES to) {
        D3D12_RESOURCE_BARRIER b = {};
        b.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
        b.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
        b.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
        b.Transition.pResource = res;
        b.Transition.StateBefore = from;
        b.Transition.StateAfter = to;
        return b;
    };

    // Put both resources into the states needed by the resolve...
    D3D12_RESOURCE_BARRIER enterResolve[2] = {
        makeTransition(msaa, D3D12_RESOURCE_STATE_RENDER_TARGET, D3D12_RESOURCE_STATE_RESOLVE_SOURCE),
        makeTransition(resolve, resolveUsage, D3D12_RESOURCE_STATE_RESOLVE_DEST)
    };
    commandList->ResourceBarrier(2, enterResolve);

    commandList->ResolveSubresource(resolve, 0, msaa, 0, RT_COLOR_FORMAT);

    // ...and back to where they were afterwards.
    D3D12_RESOURCE_BARRIER leaveResolve[2] = {
        makeTransition(msaa, D3D12_RESOURCE_STATE_RESOLVE_SOURCE, D3D12_RESOURCE_STATE_RENDER_TARGET),
        makeTransition(resolve, D3D12_RESOURCE_STATE_RESOLVE_DEST, resolveUsage)
    };
    commandList->ResourceBarrier(2, leaveResolve);
}

// Records a UAV barrier for 'resource' on commandList.
void QSGD3D12EnginePrivate::uavBarrier(ID3D12Resource *resource, ID3D12GraphicsCommandList *commandList) const
{
    // Zero-initialized, so the flags stay at their default.
    D3D12_RESOURCE_BARRIER uav = {};
    uav.UAV.pResource = resource;
    uav.Type = D3D12_RESOURCE_BARRIER_TYPE_UAV;

    commandList->ResourceBarrier(1, &uav);
}

// Creates a committed buffer resource of 'size' bytes on the upload heap, in
// the GENERIC_READ state.
//
// Returns the new resource, for which the caller takes over the reference, or
// nullptr (after printing a warning) when the resource cannot be created.
ID3D12Resource *QSGD3D12EnginePrivate::createBuffer(int size)
{
    // Must start out as null: the pointer is handed back to the caller, which
    // has no other way of detecting a failed creation.
    ID3D12Resource *buf = nullptr;

    D3D12_HEAP_PROPERTIES uploadHeapProp = {};
    uploadHeapProp.Type = D3D12_HEAP_TYPE_UPLOAD;

    D3D12_RESOURCE_DESC bufDesc = {};
    bufDesc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER;
    bufDesc.Width = size;
    bufDesc.Height = 1;
    bufDesc.DepthOrArraySize = 1;
    bufDesc.MipLevels = 1;
    bufDesc.Format = DXGI_FORMAT_UNKNOWN;
    bufDesc.SampleDesc.Count = 1;
    bufDesc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR;

    HRESULT hr = device->CreateCommittedResource(&uploadHeapProp, D3D12_HEAP_FLAG_NONE, &bufDesc,
                                                 D3D12_RESOURCE_STATE_GENERIC_READ, nullptr, IID_PPV_ARGS(&buf));
    if (FAILED(hr)) {
        qWarning("Failed to create buffer resource: %s", qPrintable(comErrorMessage(hr)));
        // Never hand out whatever the failed call may have left in buf.
        return nullptr;
    }

    return buf;
}

// Makes sure the current in-flight frame's resource for buf exists and is
// large enough to hold buf's CPU-side data. Resources only ever grow. The
// current CPU-side data size is recorded in the per-frame data in any case.
void QSGD3D12EnginePrivate::ensureBuffer(Buffer *buf)
{
    Buffer::InFlightData &slot(buf->d[currentPFrameIndex]);

    // A missing resource and a too small one are handled the same way.
    const bool mustCreate = !slot.buffer || buf->cpuDataRef.size > slot.resourceSize;
    if (mustCreate) {
        // Allocate in multiples of 4096 bytes so that a subsequent rebuild of
        // the contents with a slightly larger total size can reuse the
        // resource instead of creating yet another one.
        const quint32 allocSize = alignedSize(buf->cpuDataRef.size, 4096);
        if (Q_UNLIKELY(debug_buffer()))
            qDebug("new buffer[pf=%d] of size %d (actual data size %d)", currentPFrameIndex, allocSize, buf->cpuDataRef.size);
        slot.buffer.Attach(createBuffer(allocSize));
        slot.resourceSize = allocSize;
    }

    // Remember how much of the resource holds actual data.
    slot.dataSize = buf->cpuDataRef.size;
}

// Copies the dirty ranges of buf's CPU-side data into the current in-flight
// frame's resource and clears the dirty list afterwards.
//
// Does nothing when there are no dirty ranges. When the resource is missing
// or cannot be mapped, a warning is printed and the dirty list is left as is.
void QSGD3D12EnginePrivate::updateBuffer(Buffer *buf)
{
    if (buf->cpuDataRef.dirty.isEmpty())
        return;

    Buffer::InFlightData &bfd(buf->d[currentPFrameIndex]);
    // The resource is null when creating it in ensureBuffer() failed. Bail out
    // instead of dereferencing a null pointer.
    if (!bfd.buffer) {
        qWarning("No resource to update for buffer of size %d", buf->cpuDataRef.size);
        return;
    }

    quint8 *p = nullptr;
    // An empty read range: the CPU does not read from the mapped resource.
    const D3D12_RANGE readRange = { 0, 0 };
    if (FAILED(bfd.buffer->Map(0, &readRange, reinterpret_cast<void **>(&p)))) {
        qWarning("Map failed for buffer of size %d", buf->cpuDataRef.size);
        return;
    }
    // Each entry is an (offset, size) pair that applies both to the CPU-side
    // data and to the resource.
    for (const auto &r : qAsConst(buf->cpuDataRef.dirty)) {
        if (Q_UNLIKELY(debug_buffer()))
            qDebug("%p o %d s %d", buf, r.first, r.second);
        memcpy(p + r.first, buf->cpuDataRef.p + r.first, r.second);
    }
    bfd.buffer->Unmap(0, nullptr);
    buf->cpuDataRef.dirty.clear();
}

// Runs initialize() with the stored window parameters in case the engine is
// not initialized while a window is set.
void QSGD3D12EnginePrivate::ensureDevice()
{
    if (initialized || !window)
        return;

    initialize(window, windowSize, windowDpr, windowSamples, windowAlpha);
}

// Starts a new frame, or continues the one started early by beginLayer().
//
// For a genuinely new frame this:
//  - reinitializes the device if needed (ensureDevice),
//  - waits on the swap chain's waitable object, when in use,
//  - blocks until the frame that previously used this in-flight slot has
//    finished on the GPU and resets that slot's command allocator,
//  - carries over buffer dirty ranges from the other in-flight frames,
//  - cleans up finished texture uploads and runs the deferred deletes and
//    slot releases queued for this in-flight slot,
//  - and finally starts recording draw calls.
//
// Calling it while a frame without layers is in progress is a fatal error.
void QSGD3D12EnginePrivate::beginFrame()
{
    if (inFrame && !activeLayers)
        qFatal("beginFrame called again without an endFrame, frame index was %d", frameIndex);

    if (Q_UNLIKELY(debug_render()))
        qDebug() << "***** begin frame, logical" << frameIndex << "present" << presentFrameIndex << "layer" << activeLayers;

    if (inFrame && activeLayers) {
        if (Q_UNLIKELY(debug_render()))
            qDebug("frame %d already in progress", frameIndex);
        if (!currentLayerDepth) {
            // There are layers and the real frame preparation starts now. Prepare for present.
            beginFrameDraw();
        }
        return;
    }

    inFrame = true;

    // The device may have been lost. This is the point to attempt to start
    // again from scratch. Except when it is not. Operations that can happen
    // out of frame (e.g. textures, render targets) may trigger reinit earlier
    // than beginFrame.
    ensureDevice();

    // Wait for a buffer to be available for Present, if the waitable event is in use.
    if (waitableSwapChainMaxLatency)
        WaitForSingleObject(swapEvent, INFINITE);

    // Block if needed. With 2 frames in flight frame N waits for frame N - 2, but not N - 1, to finish.
    currentPFrameIndex = frameIndex % frameInFlightCount;
    if (frameIndex >= frameInFlightCount) {
        ID3D12Fence *fence = frameFence[currentPFrameIndex]->fence.Get();
        HANDLE event = frameFence[currentPFrameIndex]->event;
        // Frame fence values start from 1, hence the +1.
        const quint64 inFlightFenceValue = frameIndex - frameInFlightCount + 1;
        if (fence->GetCompletedValue() < inFlightFenceValue) {
            fence->SetEventOnCompletion(inFlightFenceValue, event);
            WaitForSingleObject(event, INFINITE);
        }
        // The GPU is done with the commands recorded via this allocator.
        frameCommandAllocator[currentPFrameIndex]->Reset();
    }

    PersistentFrameData &pfd(pframeData[currentPFrameIndex]);
    // Descriptors on the shader-visible heap are handed out from the start again.
    pfd.cbvSrvUavNextFreeDescriptorIndex = 0;

    // This frame starts collecting its own dirty ranges from scratch.
    for (Buffer &b : buffers) {
        if (b.entryInUse())
            b.d[currentPFrameIndex].dirty.clear();
    }

    if (frameIndex >= frameInFlightCount - 1) {
        // Now sync the buffer changes from the previous, potentially still in
        // flight, frames. This is done by taking the ranges dirtied in those
        // frames and adding them to the global CPU-side buffer's dirty list,
        // as if this frame changed those ranges. (however, dirty ranges
        // inherited this way are not added to this frame's persistent
        // per-frame dirty list because the next frame after this one should
        // inherit this frame's genuine changes only, the rest will come from
        // the earlier ones)
        for (int delta = frameInFlightCount - 1; delta >= 1; --delta) {
            const int prevPFrameIndex = (frameIndex - delta) % frameInFlightCount;
            PersistentFrameData &prevFrameData(pframeData[prevPFrameIndex]);
            for (uint id : qAsConst(prevFrameData.buffersUsedInFrame)) {
                Buffer &b(buffers[id - 1]); // ids are 1-based
                // Individual ranges can only be taken over when this frame's
                // resource exists and holds data of the current size.
                if (b.d[currentPFrameIndex].buffer && b.d[currentPFrameIndex].dataSize == b.cpuDataRef.size) {
                    if (Q_UNLIKELY(debug_buffer()))
                        qDebug() << "frame" << frameIndex << "takes dirty" << b.d[prevPFrameIndex].dirty
                                 << "from frame" << frameIndex - delta << "for buffer" << id;
                    for (const auto &range : qAsConst(b.d[prevPFrameIndex].dirty))
                        addDirtyRange(&b.cpuDataRef.dirty, range.first, range.second, b.cpuDataRef.size);
                } else {
                    if (Q_UNLIKELY(debug_buffer()))
                        qDebug() << "frame" << frameIndex << "makes all dirty from frame" << frameIndex - delta
                                 << "for buffer" << id;
                    addDirtyRange(&b.cpuDataRef.dirty, 0, b.cpuDataRef.size, b.cpuDataRef.size);
                }
            }
        }
    }

    if (frameIndex >= frameInFlightCount) {
        // Do some texture upload bookkeeping.
        const quint64 finishedFrameIndex = frameIndex - frameInFlightCount; // we know since we just blocked for this
        // pfd conveniently refers to the same slot that was used by that frame
        if (!pfd.pendingTextureUploads.isEmpty()) {
            if (Q_UNLIKELY(debug_texture()))
                qDebug("Removing texture upload data for frame %llu", finishedFrameIndex); // quint64, hence %llu
            for (uint id : qAsConst(pfd.pendingTextureUploads)) {
                const int idx = id - 1;
                Texture &t(textures[idx]);
                // fenceValue is 0 when the previous frame cleared it, skip in
                // this case. Skip also when fenceValue > the value it was when
                // adding the last GPU wait - this is the case when more
                // uploads were queued for the same texture in the meantime.
                if (t.fenceValue && t.fenceValue == t.lastWaitFenceValue) {
                    t.fenceValue = 0;
                    t.lastWaitFenceValue = 0;
                    t.stagingBuffers.clear();
                    t.stagingHeaps.clear();
                    if (Q_UNLIKELY(debug_texture()))
                        qDebug("Cleaned staging data for texture %u", id);
                }
            }
            pfd.pendingTextureUploads.clear();
            if (!pfd.pendingTextureMipMap.isEmpty()) {
                if (Q_UNLIKELY(debug_texture()))
                    qDebug() << "cleaning mipmap generation data for " << pfd.pendingTextureMipMap;
                // no special cleanup is needed as mipmap generation uses the frame's resources
                pfd.pendingTextureMipMap.clear();
            }
            // The copy command allocator is shared between the frames and can
            // only be reset once no other in-flight frame has uploads pending.
            bool hasPending = false;
            for (int delta = 1; delta < frameInFlightCount; ++delta) {
                const PersistentFrameData &prevFrameData(pframeData[(frameIndex - delta) % frameInFlightCount]);
                if (!prevFrameData.pendingTextureUploads.isEmpty()) {
                    hasPending = true;
                    break;
                }
            }
            if (!hasPending) {
                if (Q_UNLIKELY(debug_texture()))
                    qDebug("no more pending textures");
                copyCommandAllocator->Reset();
            }
        }

        // Do the deferred deletes.
        if (!pfd.deleteQueue.isEmpty()) {
            for (PersistentFrameData::DeleteQueueEntry &e : pfd.deleteQueue) {
                e.res = nullptr;
                e.descHeap = nullptr;
                if (e.cpuDescriptorPtr) {
                    D3D12_CPU_DESCRIPTOR_HANDLE h = { e.cpuDescriptorPtr };
                    cpuDescHeapManager.release(h, e.descHeapType);
                }
            }
            pfd.deleteQueue.clear();
        }
        // Deferred deletes issued outside a begin-endFrame go to the next
        // frame's out-of-frame delete queue as these cannot be executed in the
        // next beginFrame, only in next + frameInFlightCount. Move to the
        // normal queue if this is the next beginFrame.
        if (!pfd.outOfFrameDeleteQueue.isEmpty()) {
            pfd.deleteQueue = pfd.outOfFrameDeleteQueue;
            pfd.outOfFrameDeleteQueue.clear();
        }

        // Mark released texture, buffer, etc. slots free.
        if (!pfd.pendingReleases.isEmpty()) {
            for (const auto &pr : qAsConst(pfd.pendingReleases)) {
                Q_ASSERT(pr.id);
                if (pr.type == PersistentFrameData::PendingRelease::TypeTexture) {
                    Texture &t(textures[pr.id - 1]);
                    Q_ASSERT(t.entryInUse());
                    // NOTE(review): this clears RenderTarget::EntryInUse on a
                    // Texture - presumably the same bit as the Texture flag;
                    // confirm against the declarations.
                    t.flags &= ~RenderTarget::EntryInUse; // createTexture() can now reuse this entry
                    t.texture = nullptr;
                } else if (pr.type == PersistentFrameData::PendingRelease::TypeBuffer) {
                    Buffer &b(buffers[pr.id - 1]);
                    Q_ASSERT(b.entryInUse());
                    b.flags &= ~Buffer::EntryInUse;
                    for (int i = 0; i < frameInFlightCount; ++i)
                        b.d[i].buffer = nullptr;
                } else {
                    qFatal("Corrupt pending release list, type %d", pr.type);
                }
            }
            pfd.pendingReleases.clear();
        }
        // Same scheme as for the out-of-frame delete queue above.
        if (!pfd.outOfFramePendingReleases.isEmpty()) {
            pfd.pendingReleases = pfd.outOfFramePendingReleases;
            pfd.outOfFramePendingReleases.clear();
        }
    }

    pfd.buffersUsedInFrame.clear();

    beginDrawCalls();

    // Prepare for present if this is a frame without layers.
    if (!activeLayers)
        beginFrameDraw();
}

// Puts the frame command list back into the recording state, backed by the
// current frame's command allocator, makes it the active command list and
// drops all cached per-frame state.
void QSGD3D12EnginePrivate::beginDrawCalls()
{
    auto *allocator = frameCommandAllocator[frameIndex % frameInFlightCount].Get();
    frameCommandList->Reset(allocator, nullptr); // no initial pipeline state

    commandList = frameCommandList.Get();

    invalidateCachedFrameState();
}

// Resets the state cached in tframeData to values that match nothing, so that
// whatever gets queued next is treated as a change.
void QSGD3D12EnginePrivate::invalidateCachedFrameState()
{
    auto &cached = tframeData;

    // Pipeline related
    cached.lastPso = nullptr;
    cached.lastRootSig = nullptr;
    cached.descHeapSet = false;

    // Geometry and draw call related
    cached.drawingMode = QSGGeometry::DrawingMode(-1);
    cached.currentIndexBuffer = 0;
    cached.activeTextureCount = 0;
    cached.drawCount = 0;
}

// Queues the commands that re-apply the state stored in tframeData. The
// current render target and the pipeline state are always re-applied;
// viewport, scissor, blend factor and stencil ref only when 'minimal' is
// false.
void QSGD3D12EnginePrivate::restoreFrameState(bool minimal)
{
    queueSetRenderTarget(currentRenderTarget);

    const bool fullRestore = !minimal;
    if (fullRestore) {
        queueViewport(tframeData.viewport);
        queueScissor(tframeData.scissor);
        queueSetBlendFactor(tframeData.blendFactor);
        queueSetStencilRef(tframeData.stencilRef);
    }

    finalizePipeline(tframeData.pipelineState);
}

// Prepares the current backbuffer for rendering. Without multisampling the
// backbuffer is the render target itself, so it is transitioned from the
// PRESENT to the RENDER_TARGET state. Nothing is recorded otherwise.
void QSGD3D12EnginePrivate::beginFrameDraw()
{
    if (windowSamples != 1)
        return;

    const auto bufferIdx = presentFrameIndex % swapChainBufferCount;
    transitionResource(defaultRT[bufferIdx].Get(), commandList,
                       D3D12_RESOURCE_STATE_PRESENT, D3D12_RESOURCE_STATE_RENDER_TARGET);
}

// Finishes the current frame: submits the remaining draw calls, makes the
// queue signal the frame's fence with frameIndex + 1, and moves on to the next
// frame index. Calling it without a beginFrame is a fatal error.
void QSGD3D12EnginePrivate::endFrame()
{
    if (!inFrame)
        qFatal("endFrame called without beginFrame, frame index %d", frameIndex);

    if (Q_UNLIKELY(debug_render()))
        qDebug("***** end frame");

    endDrawCalls(true);

    // Fence values are one ahead of the frame index; beginFrame relies on
    // this when waiting for an in-flight frame.
    auto *fence = frameFence[frameIndex % frameInFlightCount]->fence.Get();
    commandQueue->Signal(fence, frameIndex + 1);

    ++frameIndex;
    inFrame = false;
}

void QSGD3D12EnginePrivate::endDrawCalls(bool lastInFrame)
{
    PersistentFrameData &pfd(pframeData[currentPFrameIndex]);

    // Now is the time to sync all the changed areas in the buffers.
    if (Q_UNLIKELY(debug_buffer()))
        qDebug() << "buffers used in drawcall set" << pfd.buffersUsedInDrawCallSet;
    for (uint id : qAsConst(pfd.buffersUsedInDrawCallSet))
        updateBuffer(&buffers[id - 1]);

    pfd.buffersUsedInFrame += pfd.buffersUsedInDrawCallSet;
    pfd.buffersUsedInDrawCallSet.clear();

    // Add a wait on the 3D queue for the relevant texture uploads on the copy queue.
    if (!pfd.pendingTextureUploads.isEmpty()) {
        quint64 topFenceValue = 0;
        for (uint id : qAsConst(pfd.pendingTextureUploads)) {
            const int idx = id - 1;
            Texture &t(textures[idx]);
            Q_ASSERT(t.fenceValue);
            // skip if already added a Wait in the previous frame
            if (t.lastWaitFenceValue == t.fenceValue)
                continue;
            t.lastWaitFenceValue = t.fenceValue;
            if (t.fenceValue > topFenceValue)
                topFenceValue = t.fenceValue;
            if (t.mipmap())
                pfd.pendingTextureMipMap.insert(id);
        }
        if (topFenceValue) {
            if (Q_UNLIKELY(debug_texture()))
                qDebug("added wait for texture fence %llu", topFenceValue);
            commandQueue->Wait(textureUploadFence.Get(), topFenceValue);
            // Generate mipmaps after the wait, when necessary.
            if (!pfd.pendingTextureMipMap.isEmpty()) {
                if (Q_UNLIKELY(debug_texture()))
                    qDebug() << "starting mipmap generation for" << pfd.pendingTextureMipMap;
                for (uint id : qAsConst(pfd.pendingTextureMipMap))
                    mipmapper.queueGenerate(textures[id - 1]);
            }
        }
    }

    if (lastInFrame) {
        // Resolve and transition the backbuffer for present, if needed.
        const int idx = presentFrameIndex % swapChainBufferCount;
        if (windowSamples == 1) {
            transitionResource(defaultRT[idx].Get(), commandList,
                               D3D12_RESOURCE_STATE_RENDER_TARGET, D3D12_RESOURCE_STATE_PRESENT);
        } else {
            if (Q_UNLIKELY(debug_render())) {
                const D3D12_RESOURCE_DESC desc = defaultRT[idx]->GetDesc();
                qDebug("added resolve for multisampled render target (count %d, quality %d)",
                       desc.SampleDesc.Count, desc.SampleDesc.Quality);
            }
            resolveMultisampledTarget(defaultRT[idx].Get(), backBufferRT[idx].Get(),
                                      D3D12_RESOURCE_STATE_PRESENT, commandList);
        }

        if (activeLayers) {
            if (Q_UNLIKELY(debug_render()))
                qDebug("this frame had %d layers", activeLayers);
            activeLayers = 0;
        }
    }

    // Go!
    HRESULT hr = frameCommandList->Close();
    if (FAILED(hr)) {
        qWarning("Failed to close command list: %s", qPrintable(comErrorMessage(hr)));
        if (hr == E_INVALIDARG)
            qWarning("Invalid arguments. Some of the commands in the list is invalid in some way.");
    }

    ID3D12CommandList *commandLists[] = { frameCommandList.Get() };
    commandQueue->ExecuteCommandLists(_countof(commandLists), commandLists);

    commandList = nullptr;
}

// Starts rendering a layer. The first layer of a frame also triggers an early
// beginFrame(). Starting a layer while a frame without layers is active is a
// fatal error.
void QSGD3D12EnginePrivate::beginLayer()
{
    if (inFrame && !activeLayers)
        qFatal("Layer rendering cannot be started while a frame is active");

    if (Q_UNLIKELY(debug_render()))
        qDebug("===== beginLayer active %d depth %d (inFrame=%d)", activeLayers, currentLayerDepth, inFrame);

    const bool firstLayerInFrame = (++activeLayers == 1);
    ++currentLayerDepth;

    // The early beginFrame leads to the following sequence with multiple layers:
    // beginLayer - beginFrame - endLayer - beginLayer - beginFrame - endLayer - ... - (*) beginFrame - endFrame
    // with (*) marking the start of the preparation of the actual, non-layer frame.

    if (firstLayerInFrame)
        beginFrame();
}

// Ends the layer started by the matching beginLayer(). Calling it outside a
// frame, or without an open layer, is a fatal error.
void QSGD3D12EnginePrivate::endLayer()
{
    const bool layerOpen = inFrame && activeLayers && currentLayerDepth;
    if (!layerOpen)
        qFatal("Mismatched endLayer");

    if (Q_UNLIKELY(debug_render()))
        qDebug("===== endLayer active %d depth %d", activeLayers, currentLayerDepth);

    // Only the nesting depth changes here. activeLayers has to stay as it is
    // until the frame ends.
    --currentLayerDepth;
}

// Root signature:
// [0] CBV - always present
// [1] table with one SRV per texture (must be a table since root descriptor SRVs cannot be textures) - optional
// one static sampler per texture - optional
//
// SRVs can be created freely via QSGD3D12CPUDescriptorHeapManager and stored
// in QSGD3D12TextureView. The engine will copy them onto a dedicated,
// shader-visible CBV-SRV-UAV heap in the correct order.

// Makes the given pipeline state current on the command list being recorded.
//
// The root signature and the pipeline state object are looked up in
// rootSigCache and psoCache and are only created (and then inserted into the
// respective cache) when the lookup yields nothing. The SetPipelineState and
// SetGraphicsRootSignature calls are skipped when the object is the same as
// the one recorded last in tframeData. When the root signature has texture
// views, the descriptor heap is set as well.
//
// Only valid between beginFrame and endFrame; otherwise a warning is printed
// and nothing happens. When creating the root signature or the pipeline state
// object fails, a warning is printed and the function returns without issuing
// any command. The cache entry allocated for the failed object is destroyed in
// that case because it has not been handed over to the cache.
void QSGD3D12EnginePrivate::finalizePipeline(const QSGD3D12PipelineState &pipelineState)
{
    if (!inFrame) {
        qWarning("%s: Cannot be called outside begin/endFrame", __FUNCTION__);
        return;
    }

    tframeData.pipelineState = pipelineState;

    RootSigCacheEntry *cachedRootSig = rootSigCache[pipelineState.shaders.rootSig];
    if (!cachedRootSig) {
        if (Q_UNLIKELY(debug_render()))
            qDebug("NEW ROOTSIG");

        cachedRootSig = new RootSigCacheEntry;

        D3D12_ROOT_PARAMETER rootParams[4];
        int rootParamCount = 0;

        rootParams[0].ParameterType = D3D12_ROOT_PARAMETER_TYPE_CBV;
        rootParams[0].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL;
        rootParams[0].Descriptor.ShaderRegister = 0; // b0
        rootParams[0].Descriptor.RegisterSpace = 0;
        ++rootParamCount;

        // Declared outside the branch below since the root parameter only
        // stores a pointer to it and it must stay alive until serialization.
        D3D12_DESCRIPTOR_RANGE tvDescRange;
        if (pipelineState.shaders.rootSig.textureViewCount > 0) {
            rootParams[rootParamCount].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE;
            rootParams[rootParamCount].ShaderVisibility = D3D12_SHADER_VISIBILITY_PIXEL;
            rootParams[rootParamCount].DescriptorTable.NumDescriptorRanges = 1;
            tvDescRange.RangeType = D3D12_DESCRIPTOR_RANGE_TYPE_SRV;
            tvDescRange.NumDescriptors = pipelineState.shaders.rootSig.textureViewCount;
            tvDescRange.BaseShaderRegister = 0; // t0, t1, ...
            tvDescRange.RegisterSpace = 0;
            tvDescRange.OffsetInDescriptorsFromTableStart = D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND;
            rootParams[rootParamCount].DescriptorTable.pDescriptorRanges = &tvDescRange;
            ++rootParamCount;
        }

        Q_ASSERT(rootParamCount <= _countof(rootParams));
        D3D12_ROOT_SIGNATURE_DESC desc;
        desc.NumParameters = rootParamCount;
        desc.pParameters = rootParams;
        // Mixing up samplers and resource views in QSGD3D12TextureView means
        // that the number of static samplers has to match the number of
        // textures. This is not really ideal in general but works for Quick's use cases.
        // The shaders can still choose to declare and use fewer samplers, if they want to.
        desc.NumStaticSamplers = pipelineState.shaders.rootSig.textureViewCount;
        D3D12_STATIC_SAMPLER_DESC staticSamplers[8];
        int sdIdx = 0;
        Q_ASSERT(pipelineState.shaders.rootSig.textureViewCount <= _countof(staticSamplers));
        for (int i = 0; i < pipelineState.shaders.rootSig.textureViewCount; ++i) {
            const QSGD3D12TextureView &tv(pipelineState.shaders.rootSig.textureViews[i]);
            D3D12_STATIC_SAMPLER_DESC sd = {};
            sd.Filter = D3D12_FILTER(tv.filter);
            sd.AddressU = D3D12_TEXTURE_ADDRESS_MODE(tv.addressModeHoriz);
            sd.AddressV = D3D12_TEXTURE_ADDRESS_MODE(tv.addressModeVert);
            sd.AddressW = D3D12_TEXTURE_ADDRESS_MODE_CLAMP;
            sd.MinLOD = 0.0f;
            sd.MaxLOD = D3D12_FLOAT32_MAX;
            sd.ShaderRegister = sdIdx; // s0, s1, ...
            sd.ShaderVisibility = D3D12_SHADER_VISIBILITY_PIXEL;
            staticSamplers[sdIdx++] = sd;
        }
        desc.pStaticSamplers = staticSamplers;
        desc.Flags = D3D12_ROOT_SIGNATURE_FLAG_ALLOW_INPUT_ASSEMBLER_INPUT_LAYOUT;

        ComPtr<ID3DBlob> signature;
        ComPtr<ID3DBlob> error;
        if (FAILED(D3D12SerializeRootSignature(&desc, D3D_ROOT_SIGNATURE_VERSION_1, &signature, &error))) {
            // The error blob is not guaranteed to be there, do not dereference it blindly.
            if (error.Get()) {
                QByteArray msg(static_cast<const char *>(error->GetBufferPointer()), error->GetBufferSize());
                qWarning("Failed to serialize root signature: %s", qPrintable(msg));
            } else {
                qWarning("Failed to serialize root signature: %s", "(no error details available)");
            }
            // Not in the cache yet, so nobody else would free it.
            delete cachedRootSig;
            return;
        }
        if (FAILED(device->CreateRootSignature(0, signature->GetBufferPointer(), signature->GetBufferSize(),
                                               IID_PPV_ARGS(&cachedRootSig->rootSig)))) {
            qWarning("Failed to create root signature");
            // Not in the cache yet, so nobody else would free it.
            delete cachedRootSig;
            return;
        }

        rootSigCache.insert(pipelineState.shaders.rootSig, cachedRootSig);
    }

    PSOCacheEntry *cachedPso = psoCache[pipelineState];
    if (!cachedPso) {
        if (Q_UNLIKELY(debug_render()))
            qDebug("NEW PSO");

        cachedPso = new PSOCacheEntry;

        D3D12_GRAPHICS_PIPELINE_STATE_DESC psoDesc = {};

        // Input layout.
        D3D12_INPUT_ELEMENT_DESC inputElements[QSGD3D12_MAX_INPUT_ELEMENTS];
        int ieIdx = 0;
        for (int i = 0; i < pipelineState.inputElementCount; ++i) {
            const QSGD3D12InputElement &ie(pipelineState.inputElements[i]);
            D3D12_INPUT_ELEMENT_DESC ieDesc = {};
            ieDesc.SemanticName = ie.semanticName;
            ieDesc.SemanticIndex = ie.semanticIndex;
            ieDesc.Format = DXGI_FORMAT(ie.format);
            ieDesc.InputSlot = ie.slot;
            ieDesc.AlignedByteOffset = ie.offset;
            ieDesc.InputSlotClass = D3D12_INPUT_CLASSIFICATION_PER_VERTEX_DATA;
            if (Q_UNLIKELY(debug_render()))
                qDebug("input [%d]: %s %d 0x%x %d", ieIdx, ie.semanticName, ie.offset, ie.format, ie.slot);
            inputElements[ieIdx++] = ieDesc;
        }

        psoDesc.InputLayout = { inputElements, UINT(ieIdx) };

        psoDesc.pRootSignature = cachedRootSig->rootSig.Get();

        // Shaders.
        D3D12_SHADER_BYTECODE vshader;
        vshader.pShaderBytecode = pipelineState.shaders.vs;
        vshader.BytecodeLength = pipelineState.shaders.vsSize;
        D3D12_SHADER_BYTECODE pshader;
        pshader.pShaderBytecode = pipelineState.shaders.ps;
        pshader.BytecodeLength = pipelineState.shaders.psSize;

        psoDesc.VS = vshader;
        psoDesc.PS = pshader;

        // Rasterizer state.
        D3D12_RASTERIZER_DESC rastDesc = {};
        rastDesc.FillMode = D3D12_FILL_MODE_SOLID;
        rastDesc.CullMode = D3D12_CULL_MODE(pipelineState.cullMode);
        rastDesc.FrontCounterClockwise = pipelineState.frontCCW;
        rastDesc.DepthBias = D3D12_DEFAULT_DEPTH_BIAS;
        rastDesc.DepthBiasClamp = D3D12_DEFAULT_DEPTH_BIAS_CLAMP;
        rastDesc.SlopeScaledDepthBias = D3D12_DEFAULT_SLOPE_SCALED_DEPTH_BIAS;
        rastDesc.DepthClipEnable = TRUE;

        psoDesc.RasterizerState = rastDesc;

        // Blend state. Only render target 0 is described; colorWrite controls
        // the write mask in all three blend modes.
        D3D12_BLEND_DESC blendDesc = {};
        if (pipelineState.blend == QSGD3D12PipelineState::BlendNone) {
            D3D12_RENDER_TARGET_BLEND_DESC noBlendDesc = {};
            noBlendDesc.RenderTargetWriteMask = pipelineState.colorWrite ? D3D12_COLOR_WRITE_ENABLE_ALL : 0;
            blendDesc.RenderTarget[0] = noBlendDesc;
        } else if (pipelineState.blend == QSGD3D12PipelineState::BlendPremul) {
            const D3D12_RENDER_TARGET_BLEND_DESC premulBlendDesc = {
                TRUE, FALSE,
                D3D12_BLEND_ONE, D3D12_BLEND_INV_SRC_ALPHA, D3D12_BLEND_OP_ADD,
                D3D12_BLEND_ONE, D3D12_BLEND_INV_SRC_ALPHA, D3D12_BLEND_OP_ADD,
                D3D12_LOGIC_OP_NOOP,
                UINT8(pipelineState.colorWrite ? D3D12_COLOR_WRITE_ENABLE_ALL : 0)
            };
            blendDesc.RenderTarget[0] = premulBlendDesc;
        } else if (pipelineState.blend == QSGD3D12PipelineState::BlendColor) {
            const D3D12_RENDER_TARGET_BLEND_DESC colorBlendDesc = {
                TRUE, FALSE,
                D3D12_BLEND_BLEND_FACTOR, D3D12_BLEND_INV_SRC_COLOR, D3D12_BLEND_OP_ADD,
                D3D12_BLEND_BLEND_FACTOR, D3D12_BLEND_INV_SRC_ALPHA, D3D12_BLEND_OP_ADD,
                D3D12_LOGIC_OP_NOOP,
                UINT8(pipelineState.colorWrite ? D3D12_COLOR_WRITE_ENABLE_ALL : 0)
            };
            blendDesc.RenderTarget[0] = colorBlendDesc;
        }
        psoDesc.BlendState = blendDesc;

        // Depth-stencil state. The same stencil ops are used for front and back faces.
        psoDesc.DepthStencilState.DepthEnable = pipelineState.depthEnable;
        psoDesc.DepthStencilState.DepthWriteMask = pipelineState.depthWrite ? D3D12_DEPTH_WRITE_MASK_ALL : D3D12_DEPTH_WRITE_MASK_ZERO;
        psoDesc.DepthStencilState.DepthFunc = D3D12_COMPARISON_FUNC(pipelineState.depthFunc);

        psoDesc.DepthStencilState.StencilEnable = pipelineState.stencilEnable;
        psoDesc.DepthStencilState.StencilReadMask = psoDesc.DepthStencilState.StencilWriteMask = 0xFF;
        D3D12_DEPTH_STENCILOP_DESC stencilOpDesc = {
            D3D12_STENCIL_OP(pipelineState.stencilFailOp),
            D3D12_STENCIL_OP(pipelineState.stencilDepthFailOp),
            D3D12_STENCIL_OP(pipelineState.stencilPassOp),
            D3D12_COMPARISON_FUNC(pipelineState.stencilFunc)
        };
        psoDesc.DepthStencilState.FrontFace = psoDesc.DepthStencilState.BackFace = stencilOpDesc;

        psoDesc.SampleMask = UINT_MAX;
        psoDesc.PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE(pipelineState.topologyType);
        psoDesc.NumRenderTargets = 1;
        psoDesc.RTVFormats[0] = RT_COLOR_FORMAT;
        psoDesc.DSVFormat = DXGI_FORMAT_D24_UNORM_S8_UINT;
        psoDesc.SampleDesc = defaultRT[0]->GetDesc().SampleDesc;

        HRESULT hr = device->CreateGraphicsPipelineState(&psoDesc, IID_PPV_ARGS(&cachedPso->pso));
        if (FAILED(hr)) {
            qWarning("Failed to create graphics pipeline state: %s",
                     qPrintable(comErrorMessage(hr)));
            // Not in the cache yet, so nobody else would free it.
            delete cachedPso;
            return;
        }

        psoCache.insert(pipelineState, cachedPso);
    }

    // Avoid redundant state changes on the command list.
    if (cachedPso->pso.Get() != tframeData.lastPso) {
        tframeData.lastPso = cachedPso->pso.Get();
        commandList->SetPipelineState(tframeData.lastPso);
    }

    if (cachedRootSig->rootSig.Get() != tframeData.lastRootSig) {
        tframeData.lastRootSig = cachedRootSig->rootSig.Get();
        commandList->SetGraphicsRootSignature(tframeData.lastRootSig);
    }

    if (pipelineState.shaders.rootSig.textureViewCount > 0)
        setDescriptorHeaps();
}

// Sets the current persistent frame's CBV-SRV-UAV heap on the command list.
// Does nothing when tframeData already records the heap as set, unless force
// is true.
void QSGD3D12EnginePrivate::setDescriptorHeaps(bool force)
{
    const bool alreadySet = tframeData.descHeapSet;
    if (alreadySet && !force)
        return;

    tframeData.descHeapSet = true;

    ID3D12DescriptorHeap *heaps[] = { pframeData[currentPFrameIndex].gpuCbvSrvUavHeap.Get() };
    commandList->SetDescriptorHeaps(_countof(heaps), heaps);
}

void QSGD3D12EnginePrivate::queueViewport(const QRect &rect)
{
    if (!inFrame) {
        qWarning("%s: Cannot be called outside begin/endFrame", __FUNCTION__);
        return;
    }

    tframeData.viewport = rect;
    const D3D12_VIEWPORT viewport = { float(rect.x()), float(rect.y()), float(rect.width()), float(rect.height()), 0, 1 };
    commandList->RSSetViewports(1, &viewport);
}

void QSGD3D12EnginePrivate::queueScissor(const QRect &rect)
{
    if (!inFrame) {
        qWarning("%s: Cannot be called outside begin/endFrame", __FUNCTION__);
        return;
    }

    tframeData.scissor = rect;
    const D3D12_RECT scissorRect = { rect.x(), rect.y(), rect.x() + rect.width(), rect.y() + rect.height() };
    commandList->RSSetScissorRects(1, &scissorRect);
}

// Records an OMSetRenderTargets call for the render target with the given id
// and makes it the current render target. An id of 0 selects the default
// render target (the swapchain buffer for the present frame index) together
// with the default depth-stencil view.
void QSGD3D12EnginePrivate::queueSetRenderTarget(uint id)
{
    if (!inFrame) {
        qWarning("%s: Cannot be called outside begin/endFrame", __FUNCTION__);
        return;
    }

    D3D12_CPU_DESCRIPTOR_HANDLE colorView;
    D3D12_CPU_DESCRIPTOR_HANDLE depthStencilView;

    if (id) {
        // Ids are one-based indices into renderTargets.
        const int rtIndex = id - 1;
        Q_ASSERT(rtIndex < renderTargets.count() && renderTargets[rtIndex].entryInUse());
        RenderTarget &target(renderTargets[rtIndex]);
        colorView = target.rtv;
        depthStencilView = target.dsv;

        const bool readBarrierPending = target.flags & RenderTarget::NeedsReadBarrier;
        if (!readBarrierPending) {
            target.flags |= RenderTarget::NeedsReadBarrier;
            // Non-multisample targets get transitioned from shader resource to render target here.
            const bool multisample = target.flags & RenderTarget::Multisample;
            if (!multisample) {
                transitionResource(target.color.Get(), commandList, D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE,
                                   D3D12_RESOURCE_STATE_RENDER_TARGET);
            }
        }
    } else {
        colorView = defaultRTV[presentFrameIndex % swapChainBufferCount];
        depthStencilView = defaultDSV;
    }

    commandList->OMSetRenderTargets(1, &colorView, FALSE, &depthStencilView);

    currentRenderTarget = id;
}

// Records a clear of the current render target's color buffer to the given
// color. The current render target is the default one (the swapchain buffer
// for the present frame index) when currentRenderTarget is 0, otherwise the
// entry in renderTargets with that one-based id.
//
// Only valid between beginFrame and endFrame; otherwise a warning is printed
// and nothing is recorded.
void QSGD3D12EnginePrivate::queueClearRenderTarget(const QColor &color)
{
    if (!inFrame) {
        qWarning("%s: Cannot be called outside begin/endFrame", __FUNCTION__);
        return;
    }

    // ClearRenderTargetView expects the components in R, G, B, A order.
    const float clearColor[] = { float(color.redF()), float(color.greenF()), float(color.blueF()), float(color.alphaF()) };
    D3D12_CPU_DESCRIPTOR_HANDLE rtv = !currentRenderTarget
            ? defaultRTV[presentFrameIndex % swapChainBufferCount]
            : renderTargets[currentRenderTarget - 1].rtv;
    commandList->ClearRenderTargetView(rtv, clearColor, 0, nullptr);
}

void QSGD3D12EnginePrivate::queueClearDepthStencil(float depthValue, quint8 stencilValue, QSGD3D12Engine::ClearFlags which)
{
    if (!inFrame) {
        qWarning("%s: Cannot be called outside begin/endFrame", __FUNCTION__);
        return;
    }

    D3D12_CPU_DESCRIPTOR_HANDLE dsv = !currentRenderTarget
            ? defaultDSV
            : renderTargets[currentRenderTarget - 1].dsv;
    commandList->ClearDepthStencilView(dsv, D3D12_CLEAR_FLAGS(int(which)), depthValue, stencilValue, 0, nullptr);
}

// Remembers factor as the frame's blend factor and records a matching
// OMSetBlendFactor call.
void QSGD3D12EnginePrivate::queueSetBlendFactor(const QVector4D &factor)
{
    if (!inFrame) {
        qWarning("%s: Cannot be called outside begin/endFrame", __FUNCTION__);
        return;
    }

    tframeData.blendFactor = factor;

    float components[4];
    components[0] = factor.x();
    components[1] = factor.y();
    components[2] = factor.z();
    components[3] = factor.w();
    commandList->OMSetBlendFactor(components);
}

// Remembers ref as the frame's stencil reference value and records a matching
// OMSetStencilRef call.
void QSGD3D12EnginePrivate::queueSetStencilRef(quint32 ref)
{
    if (inFrame) {
        tframeData.stencilRef = ref;
        commandList->OMSetStencilRef(ref);
    } else {
        qWarning("%s: Cannot be called outside begin/endFrame", __FUNCTION__);
    }
}

// Records one draw call described by params on the current command list.
//
// params.vertexBuf and params.constantBuf must be valid, in-use buffer ids;
// params.indexBuf is optional (0 means no index buffer). An indexed draw is
// issued when params.startIndexIndex is >= 0, a non-indexed one otherwise.
//
// When the scissor rectangle stored in tframeData is empty, no command is
// recorded at all. The buffers are still registered as used and ensured, and
// the list of active textures is still consumed, in that case.
//
// Only valid between beginFrame and endFrame; otherwise a warning is printed
// and nothing happens.
void QSGD3D12EnginePrivate::queueDraw(const QSGD3D12Engine::DrawParams &params)
{
    if (!inFrame) {
        qWarning("%s: Cannot be called outside begin/endFrame", __FUNCTION__);
        return;
    }

    // An empty scissor rect means no command gets recorded for this draw.
    const bool skip = tframeData.scissor.isEmpty();

    PersistentFrameData &pfd(pframeData[currentPFrameIndex]);

    // Register the buffers as used by this frame. Buffer ids are one-based
    // indices into 'buffers'.
    pfd.buffersUsedInDrawCallSet.insert(params.vertexBuf);
    const int vertexBufIdx = params.vertexBuf - 1;
    Q_ASSERT(params.vertexBuf && vertexBufIdx < buffers.count() && buffers[vertexBufIdx].entryInUse());
    pfd.buffersUsedInDrawCallSet.insert(params.constantBuf);
    const int constantBufIdx = params.constantBuf - 1;
    Q_ASSERT(params.constantBuf && constantBufIdx < buffers.count() && buffers[constantBufIdx].entryInUse());
    int indexBufIdx = -1; // stays -1 when there is no index buffer
    if (params.indexBuf) {
        pfd.buffersUsedInDrawCallSet.insert(params.indexBuf);
        indexBufIdx = params.indexBuf - 1;
        Q_ASSERT(indexBufIdx < buffers.count() && buffers[indexBufIdx].entryInUse());
    }

    // Ensure buffers are created but do not copy the data here, leave that to endDrawCalls().
    ensureBuffer(&buffers[vertexBufIdx]);
    ensureBuffer(&buffers[constantBufIdx]);
    if (indexBufIdx >= 0)
        ensureBuffer(&buffers[indexBufIdx]);

    // Set the CBV.
    // Root parameter 0 is the constant buffer view. A negative cboOffset
    // leaves the previously set one in place.
    if (!skip && params.cboOffset >= 0) {
        ID3D12Resource *cbuf = buffers[constantBufIdx].d[currentPFrameIndex].buffer.Get();
        if (cbuf)
            commandList->SetGraphicsRootConstantBufferView(0, cbuf->GetGPUVirtualAddress() + params.cboOffset);
    }

    // Set up vertex and index buffers.
    ID3D12Resource *vbuf = buffers[vertexBufIdx].d[currentPFrameIndex].buffer.Get();
    ID3D12Resource *ibuf = indexBufIdx >= 0 && params.startIndexIndex >= 0
            ? buffers[indexBufIdx].d[currentPFrameIndex].buffer.Get() : nullptr;

    // The topology is only set when it differs from the one recorded last.
    if (!skip && params.mode != tframeData.drawingMode) {
        D3D_PRIMITIVE_TOPOLOGY topology;
        switch (params.mode) {
        case QSGGeometry::DrawPoints:
            topology = D3D_PRIMITIVE_TOPOLOGY_POINTLIST;
            break;
        case QSGGeometry::DrawLines:
            topology = D3D_PRIMITIVE_TOPOLOGY_LINELIST;
            break;
        case QSGGeometry::DrawLineStrip:
            topology = D3D_PRIMITIVE_TOPOLOGY_LINESTRIP;
            break;
        case QSGGeometry::DrawTriangles:
            topology = D3D_PRIMITIVE_TOPOLOGY_TRIANGLELIST;
            break;
        case QSGGeometry::DrawTriangleStrip:
            topology = D3D_PRIMITIVE_TOPOLOGY_TRIANGLESTRIP;
            break;
        default:
            qFatal("Unsupported drawing mode 0x%x", params.mode);
            break;
        }
        commandList->IASetPrimitiveTopology(topology);
        tframeData.drawingMode = params.mode;
    }

    if (!skip) {
        D3D12_VERTEX_BUFFER_VIEW vbv;
        vbv.BufferLocation = vbuf->GetGPUVirtualAddress() + params.vboOffset;
        vbv.SizeInBytes = params.vboSize;
        vbv.StrideInBytes = params.vboStride;

        // must be set after the topology
        commandList->IASetVertexBuffers(0, 1, &vbv);
    }

    // The index buffer is only set when it differs from the one recorded
    // last. The view always covers the buffer's entire CPU-side data.
    if (!skip && params.startIndexIndex >= 0 && ibuf && tframeData.currentIndexBuffer != params.indexBuf) {
        tframeData.currentIndexBuffer = params.indexBuf;
        D3D12_INDEX_BUFFER_VIEW ibv;
        ibv.BufferLocation = ibuf->GetGPUVirtualAddress();
        ibv.SizeInBytes = buffers[indexBufIdx].cpuDataRef.size;
        ibv.Format = DXGI_FORMAT(params.indexFormat);
        commandList->IASetIndexBuffer(&ibv);
    }

    // Copy the SRVs to a drawcall-dedicated area of the shader-visible descriptor heap.
    Q_ASSERT(tframeData.activeTextureCount == tframeData.pipelineState.shaders.rootSig.textureViewCount);
    if (tframeData.activeTextureCount > 0) {
        if (!skip) {
            // May replace the heap and reset the next free index, so this
            // has to come before querying the heap start below.
            ensureGPUDescriptorHeap(tframeData.activeTextureCount);
            const uint stride = cpuDescHeapManager.handleSize(D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV);
            D3D12_CPU_DESCRIPTOR_HANDLE dst = pfd.gpuCbvSrvUavHeap->GetCPUDescriptorHandleForHeapStart();
            dst.ptr += pfd.cbvSrvUavNextFreeDescriptorIndex * stride;
            for (int i = 0; i < tframeData.activeTextureCount; ++i) {
                const TransientFrameData::ActiveTexture &t(tframeData.activeTextures[i]);
                Q_ASSERT(t.id);
                const int idx = t.id - 1;
                // The id refers either to a texture or to a render target.
                const bool isTex = t.type == TransientFrameData::ActiveTexture::TypeTexture;
                device->CopyDescriptorsSimple(1, dst, isTex ? textures[idx].srv : renderTargets[idx].srv,
                                              D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV);
                dst.ptr += stride;
            }

            // Root parameter 1 is the SRV table; point it at the area filled above.
            D3D12_GPU_DESCRIPTOR_HANDLE gpuAddr = pfd.gpuCbvSrvUavHeap->GetGPUDescriptorHandleForHeapStart();
            gpuAddr.ptr += pfd.cbvSrvUavNextFreeDescriptorIndex * stride;
            commandList->SetGraphicsRootDescriptorTable(1, gpuAddr);

            pfd.cbvSrvUavNextFreeDescriptorIndex += tframeData.activeTextureCount;
        }
        // The active textures are consumed even when the draw call is skipped.
        tframeData.activeTextureCount = 0;
    }

    // Add the draw call.
    if (!skip) {
        ++tframeData.drawCount;
        if (params.startIndexIndex >= 0)
            commandList->DrawIndexedInstanced(params.count, 1, params.startIndexIndex, 0, 0);
        else
            commandList->DrawInstanced(params.count, 1, 0, 0);
    }

    if (tframeData.drawCount == MAX_DRAW_CALLS_PER_LIST) {
        if (Q_UNLIKELY(debug_render()))
            qDebug("Limit of %d draw calls reached, executing command list", MAX_DRAW_CALLS_PER_LIST);
        // submit the command list
        endDrawCalls();
        // start a new one
        beginDrawCalls();
        // prepare for the upcoming drawcalls
        restoreFrameState();
    }
}

void QSGD3D12EnginePrivate::ensureGPUDescriptorHeap(int cbvSrvUavDescriptorCount)
{
    PersistentFrameData &pfd(pframeData[currentPFrameIndex]);
    int newSize = pfd.gpuCbvSrvUavHeapSize;
    while (pfd.cbvSrvUavNextFreeDescriptorIndex + cbvSrvUavDescriptorCount > newSize)
        newSize *= 2;
    if (newSize != pfd.gpuCbvSrvUavHeapSize) {
        if (Q_UNLIKELY(debug_descheap()))
            qDebug("Out of space for SRVs, creating new CBV-SRV-UAV descriptor heap with descriptor count %d", newSize);
        deferredDelete(pfd.gpuCbvSrvUavHeap);
        createCbvSrvUavHeap(currentPFrameIndex, newSize);
        setDescriptorHeaps(true);
        pfd.cbvSrvUavNextFreeDescriptorIndex = 0;
    }
}

void QSGD3D12EnginePrivate::present()
{
    if (!initialized)
        return;

    if (Q_UNLIKELY(debug_render()))
        qDebug("--- present with vsync ---");

    // This call will not block the CPU unless at least 3 buffers are queued,
    // unless the waitable frame latency event is enabled. Then the latency of
    // 3 is changed to whatever value desired, and blocking happens in
    // beginFrame. If none of these hold, the fence-based wait in beginFrame
    // throttles. Vsync (interval 1) is always enabled.
    HRESULT hr = swapChain->Present(1, 0);
    if (hr == DXGI_ERROR_DEVICE_REMOVED || hr == DXGI_ERROR_DEVICE_RESET) {
        deviceManager()->deviceLossDetected();
        return;
    } else if (FAILED(hr)) {
        qWarning("Present failed: %s", qPrintable(comErrorMessage(hr)));
        return;
    }

#ifndef Q_OS_WINRT
    if (dcompDevice)
        dcompDevice->Commit();
#endif

    ++presentFrameIndex;
}

// Waits on presentFence via waitForGPU(). Does nothing when the engine is
// not initialized.
void QSGD3D12EnginePrivate::waitGPU()
{
    if (initialized) {
        if (Q_UNLIKELY(debug_render())) {
            qDebug("--- blocking wait for GPU ---");
        }

        waitForGPU(presentFence);
    }
}

// Hands out an id for a table entry. Ids are one-based indices into tbl. The
// first entry that is not in use gets recycled; when all are in use the table
// grows by one element. The entry's flags are reset to just EntryInUse.
template<class T> uint newId(T *tbl)
{
    // Look for the first free slot.
    int slot = 0;
    while (slot < tbl->count() && (*tbl)[slot].entryInUse())
        ++slot;

    // No free slot: append a new element and use that one.
    if (slot >= tbl->count()) {
        tbl->resize(tbl->size() + 1);
        slot = tbl->count() - 1;
    }

    (*tbl)[slot].flags = 0x01; // reset flags and set EntryInUse

    return uint(slot + 1);
}

// Sets the given flag bit(s) in e->flags when b is true and clears them
// otherwise. Other bits are left untouched.
template<class T> void syncEntryFlags(T *e, int flag, bool b)
{
    if (!b) {
        e->flags &= ~flag;
        return;
    }

    e->flags |= flag;
}

// Reserves an entry in the buffer table and returns its one-based id.
uint QSGD3D12EnginePrivate::genBuffer()
{
    const uint bufferId = newId(&buffers);
    return bufferId;
}

// Schedules the buffer with the given id for release. The per-frame D3D
// resources are queued for deferred deletion and the id is added to a
// pending-release set; the table entry itself is not touched here. Does
// nothing for id 0, when the engine is not initialized, or when the entry is
// not in use.
void QSGD3D12EnginePrivate::releaseBuffer(uint id)
{
    if (!id || !initialized)
        return;

    const int bufferIndex = id - 1;
    Q_ASSERT(bufferIndex < buffers.count());

    if (Q_UNLIKELY(debug_buffer())) {
        qDebug("releasing buffer %u", id);
    }

    Buffer &buf(buffers[bufferIndex]);
    if (!buf.entryInUse())
        return;

    // Do not null out and do not mark the entry reusable yet.
    // Do that only when the frames potentially in flight have finished for sure.

    for (int frame = 0; frame < frameInFlightCount; ++frame) {
        if (!buf.d[frame].buffer)
            continue;
        deferredDelete(buf.d[frame].buffer);
    }

    // Inside a frame the release is tracked by the current frame's set,
    // outside of a frame by the next frame's out-of-frame set.
    QSet<PersistentFrameData::PendingRelease> *releaseSet;
    if (inFrame)
        releaseSet = &pframeData[currentPFrameIndex].pendingReleases;
    else
        releaseSet = &pframeData[(currentPFrameIndex + 1) % frameInFlightCount].outOfFramePendingReleases;

    const PersistentFrameData::PendingRelease release(PersistentFrameData::PendingRelease::TypeBuffer, id);
    releaseSet->insert(release);
}

// Points the buffer at new CPU-side data of the given size. Only the pointer
// is stored, no data is copied here. The dirty lists (the CPU-side one and
// the current frame's) are cleared and, for a non-empty buffer, replaced by a
// single range spanning all of the data.
void QSGD3D12EnginePrivate::resetBuffer(uint id, const quint8 *data, int size)
{
    if (!inFrame) {
        qWarning("%s: Cannot be called outside begin/endFrame", __FUNCTION__);
        return;
    }

    Q_ASSERT(id);
    const int bufferIndex = id - 1;
    Q_ASSERT(bufferIndex < buffers.count() && buffers[bufferIndex].entryInUse());
    Buffer &buf(buffers[bufferIndex]);

    if (Q_UNLIKELY(debug_buffer())) {
        qDebug("reset buffer %u, size %d", id, size);
    }

    buf.cpuDataRef.p = data;
    buf.cpuDataRef.size = size;

    buf.cpuDataRef.dirty.clear();
    buf.d[currentPFrameIndex].dirty.clear();

    // Nothing to mark dirty for an empty buffer.
    if (size <= 0)
        return;

    const QPair<int, int> everything(0, size);
    buf.cpuDataRef.dirty.append(everything);
    buf.d[currentPFrameIndex].dirty.append(everything);
}

// Appends the (offset, size) range to the dirty list unless the list's first
// entry already covers the entire buffer of bufferSize bytes, or the exact
// same range is in the list already.
void QSGD3D12EnginePrivate::addDirtyRange(DirtyList *dirty, int offset, int size, int bufferSize)
{
    // A first entry of (0, bufferSize) means everything is dirty anyway.
    const bool coversEntireBuffer = !dirty->isEmpty()
            && dirty->at(0).first == 0
            && dirty->at(0).second == bufferSize;
    if (coversEntireBuffer)
        return;

    const QPair<int, int> candidate(offset, size);
    if (dirty->contains(candidate))
        return;

    dirty->append(candidate);
}

// Marks size bytes starting at offset as changed in the buffer with the given
// id. The range is added both to the CPU-side dirty list and to the dirty
// list of the current frame.
void QSGD3D12EnginePrivate::markBufferDirty(uint id, int offset, int size)
{
    if (!inFrame) {
        qWarning("%s: Cannot be called outside begin/endFrame", __FUNCTION__);
        return;
    }

    Q_ASSERT(id);
    const int bufferIndex = id - 1;
    Q_ASSERT(bufferIndex < buffers.count() && buffers[bufferIndex].entryInUse());
    Buffer &buf(buffers[bufferIndex]);

    DirtyList *const cpuDirty = &buf.cpuDataRef.dirty;
    addDirtyRange(cpuDirty, offset, size, buf.cpuDataRef.size);

    DirtyList *const frameDirty = &buf.d[currentPFrameIndex].dirty;
    addDirtyRange(frameDirty, offset, size, buf.cpuDataRef.size);
}

// Reserves an entry in the texture table, resets its upload fence value to 0
// and returns the entry's one-based id.
uint QSGD3D12EnginePrivate::genTexture()
{
    const uint textureId = newId(&textures);
    Texture &entry(textures[textureId - 1]);
    entry.fenceValue = 0;
    return textureId;
}

// Picks the DXGI format to use for a texture that gets its content from
// images of the given QImage format.
//
// wantsAlpha selects between formats with and without alpha whenever a
// conversion is needed, mipmap forces RGBA, and force32bit prevents using the
// single-channel format for 8 bit source formats. When non-null, imageFormat
// receives the QImage format the source image is to be converted to and
// bytesPerPixel the pixel size that goes with the returned DXGI format.
static inline DXGI_FORMAT textureFormat(QImage::Format format, bool wantsAlpha, bool mipmap, bool force32bit,
                                        QImage::Format *imageFormat, int *bytesPerPixel)
{
    // Generic conversion target for everything that has no dedicated mapping.
    const QImage::Format rgbaFallback = wantsAlpha ? QImage::Format_RGBA8888_Premultiplied
                                                   : QImage::Format_RGBX8888;

    DXGI_FORMAT dxgiFormat = DXGI_FORMAT_R8G8B8A8_UNORM;
    QImage::Format targetImageFormat = format;
    int pixelSize = 4;

    if (mipmap) {
        // Mipmap generation needs unordered access and BGRA is not an option for that. Stick to RGBA.
        targetImageFormat = rgbaFallback;
    } else {
        switch (format) {
        case QImage::Format_Grayscale8:
        case QImage::Format_Indexed8:
        case QImage::Format_Alpha8:
            if (force32bit) {
                targetImageFormat = QImage::Format_RGBA8888;
            } else {
                dxgiFormat = DXGI_FORMAT_R8_UNORM;
                pixelSize = 1;
            }
            break;
        case QImage::Format_RGB32:
            dxgiFormat = DXGI_FORMAT_B8G8R8A8_UNORM;
            break;
        case QImage::Format_ARGB32:
            dxgiFormat = DXGI_FORMAT_B8G8R8A8_UNORM;
            if (wantsAlpha)
                targetImageFormat = QImage::Format_ARGB32_Premultiplied;
            else
                targetImageFormat = QImage::Format_RGB32;
            break;
        case QImage::Format_ARGB32_Premultiplied:
            dxgiFormat = DXGI_FORMAT_B8G8R8A8_UNORM;
            if (!wantsAlpha)
                targetImageFormat = QImage::Format_RGB32;
            break;
        default:
            targetImageFormat = rgbaFallback;
            break;
        }
    }

    if (imageFormat)
        *imageFormat = targetImageFormat;

    if (bytesPerPixel)
        *bytesPerPixel = pixelSize;

    return dxgiFormat;
}

// Maps a texture's DXGI format to the QImage format with a matching layout.
// Returns QImage::Format_Invalid for formats without a mapping.
static inline QImage::Format imageFormatForTexture(DXGI_FORMAT format)
{
    switch (format) {
    case DXGI_FORMAT_R8G8B8A8_UNORM:
        return QImage::Format_RGBA8888_Premultiplied;
    case DXGI_FORMAT_B8G8R8A8_UNORM:
        return QImage::Format_ARGB32_Premultiplied;
    case DXGI_FORMAT_R8_UNORM:
        return QImage::Format_Grayscale8;
    default:
        break;
    }

    return QImage::Format_Invalid;
}

// Creates the D3D12 resource and the shader resource view for the texture
// entry with the given id (as returned from genTexture()).
//
// size is the requested size; with TextureWithMipMaps it is adjusted via
// QSGD3D12Engine::mipMapAdjustedSourceSize(). format is the QImage format of
// the future content and, together with the alpha/mipmap/32 bit flags,
// determines the DXGI format. For mipmapped textures an unordered access
// view is created for every mip level except level 0.
//
// On failure to create the resource a warning is printed and no views are
// created.
void QSGD3D12EnginePrivate::createTexture(uint id, const QSize &size, QImage::Format format,
                                          QSGD3D12Engine::TextureCreateFlags createFlags)
{
    ensureDevice();

    Q_ASSERT(id);
    const int idx = id - 1; // ids are one-based indices into 'textures'
    Q_ASSERT(idx < textures.count() && textures[idx].entryInUse());
    Texture &t(textures[idx]);

    // Mirror the relevant creation flags in the entry's own flags.
    syncEntryFlags(&t, Texture::Alpha, createFlags & QSGD3D12Engine::TextureWithAlpha);
    syncEntryFlags(&t, Texture::MipMap, createFlags & QSGD3D12Engine::TextureWithMipMaps);

    const QSize adjustedSize = !t.mipmap() ? size : QSGD3D12Engine::mipMapAdjustedSourceSize(size);

    D3D12_HEAP_PROPERTIES defaultHeapProp = {};
    defaultHeapProp.Type = D3D12_HEAP_TYPE_DEFAULT;

    D3D12_RESOURCE_DESC textureDesc = {};
    textureDesc.Dimension = D3D12_RESOURCE_DIMENSION_TEXTURE2D;
    textureDesc.Width = adjustedSize.width();
    textureDesc.Height = adjustedSize.height();
    textureDesc.DepthOrArraySize = 1;
    textureDesc.MipLevels = !t.mipmap() ? 1 : QSGD3D12Engine::mipMapLevels(adjustedSize);
    // Only the DXGI format is of interest here, hence the null out parameters.
    textureDesc.Format = textureFormat(format, t.alpha(), t.mipmap(),
                                       createFlags.testFlag(QSGD3D12Engine::TextureAlways32Bit),
                                       nullptr, nullptr);
    textureDesc.SampleDesc.Count = 1;
    textureDesc.Layout = D3D12_TEXTURE_LAYOUT_UNKNOWN;
    // The per-level UAVs created below require this flag on the resource.
    if (t.mipmap())
        textureDesc.Flags = D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS;

    HRESULT hr = device->CreateCommittedResource(&defaultHeapProp, D3D12_HEAP_FLAG_NONE, &textureDesc,
                                                 D3D12_RESOURCE_STATE_COMMON, nullptr, IID_PPV_ARGS(&t.texture));
    if (FAILED(hr)) {
        qWarning("Failed to create texture resource: %s", qPrintable(comErrorMessage(hr)));
        return;
    }

    t.srv = cpuDescHeapManager.allocate(D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV);

    // The SRV covers all mip levels of the texture.
    D3D12_SHADER_RESOURCE_VIEW_DESC srvDesc = {};
    srvDesc.Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING;
    srvDesc.Format = textureDesc.Format;
    srvDesc.ViewDimension = D3D12_SRV_DIMENSION_TEXTURE2D;
    srvDesc.Texture2D.MipLevels = textureDesc.MipLevels;

    device->CreateShaderResourceView(t.texture.Get(), &srvDesc, t.srv);

    if (t.mipmap()) {
        // Mipmap generation will need an UAV for each level that needs to be generated.
        // Level 0 is not included, so mipUAVs[i] refers to mip level i + 1.
        t.mipUAVs.clear();
        for (int level = 1; level < textureDesc.MipLevels; ++level) {
            D3D12_UNORDERED_ACCESS_VIEW_DESC uavDesc = {};
            uavDesc.Format = textureDesc.Format;
            uavDesc.ViewDimension = D3D12_UAV_DIMENSION_TEXTURE2D;
            uavDesc.Texture2D.MipSlice = level;
            D3D12_CPU_DESCRIPTOR_HANDLE h = cpuDescHeapManager.allocate(D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV);
            device->CreateUnorderedAccessView(t.texture.Get(), nullptr, &uavDesc, h);
            t.mipUAVs.append(h);
        }
    }

    if (Q_UNLIKELY(debug_texture()))
        qDebug("created texture %u, size %dx%d, miplevels %d", id, adjustedSize.width(), adjustedSize.height(), textureDesc.MipLevels);
}

// Changes the size of an already created, non-mipmapped texture.
//
// A new resource is created with the old resource's description and the new
// width and height, a new shader resource view is allocated for it, and a
// copy of subresource 0 of the old resource into the new one is submitted on
// the copy queue. t.fenceValue is set to the value the copy queue signals on
// textureUploadFence after the copy.
//
// Prints a warning and does nothing when the texture has not been created
// yet or is mipmapped. When creating the new resource fails, a warning is
// printed and no copy is submitted.
void QSGD3D12EnginePrivate::queueTextureResize(uint id, const QSize &size)
{
    Q_ASSERT(id);
    const int idx = id - 1; // ids are one-based indices into 'textures'
    Q_ASSERT(idx < textures.count() && textures[idx].entryInUse());
    Texture &t(textures[idx]);

    if (!t.texture) {
        qWarning("Cannot resize non-created texture %u", id);
        return;
    }

    if (t.mipmap()) {
        qWarning("Cannot resize mipmapped texture %u", id);
        return;
    }

    if (Q_UNLIKELY(debug_texture()))
        qDebug("resizing texture %u, size %dx%d", id, size.width(), size.height());

    // Start from the existing description so that everything but the size is kept.
    D3D12_RESOURCE_DESC textureDesc = t.texture->GetDesc();
    textureDesc.Width = size.width();
    textureDesc.Height = size.height();

    D3D12_HEAP_PROPERTIES defaultHeapProp = {};
    defaultHeapProp.Type = D3D12_HEAP_TYPE_DEFAULT;

    // Hold on to the old resource, it is the source of the copy below.
    // NOTE(review): presumably the deferred deletion keeps it alive long
    // enough for the copy to finish on the GPU; confirm against deferredDelete().
    ComPtr<ID3D12Resource> oldTexture = t.texture;
    deferredDelete(t.texture);

    HRESULT hr = device->CreateCommittedResource(&defaultHeapProp, D3D12_HEAP_FLAG_NONE, &textureDesc,
                                                 D3D12_RESOURCE_STATE_COMMON, nullptr, IID_PPV_ARGS(&t.texture));
    if (FAILED(hr)) {
        qWarning("Failed to create resized texture resource: %s",
                 qPrintable(comErrorMessage(hr)));
        return;
    }

    // The old SRV refers to the old resource, replace it with a new one.
    deferredDelete(t.srv, D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV);
    t.srv = cpuDescHeapManager.allocate(D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV);

    D3D12_SHADER_RESOURCE_VIEW_DESC srvDesc = {};
    srvDesc.Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING;
    srvDesc.Format = textureDesc.Format;
    srvDesc.ViewDimension = D3D12_SRV_DIMENSION_TEXTURE2D;
    srvDesc.Texture2D.MipLevels = textureDesc.MipLevels;

    device->CreateShaderResourceView(t.texture.Get(), &srvDesc, t.srv);

    // Copy the old content (subresource 0) to the top-left corner of the new texture.
    D3D12_TEXTURE_COPY_LOCATION dstLoc;
    dstLoc.pResource = t.texture.Get();
    dstLoc.Type = D3D12_TEXTURE_COPY_TYPE_SUBRESOURCE_INDEX;
    dstLoc.SubresourceIndex = 0;

    D3D12_TEXTURE_COPY_LOCATION srcLoc;
    srcLoc.pResource = oldTexture.Get();
    srcLoc.Type = D3D12_TEXTURE_COPY_TYPE_SUBRESOURCE_INDEX;
    srcLoc.SubresourceIndex = 0;

    copyCommandList->Reset(copyCommandAllocator.Get(), nullptr);

    // A null source box means the entire source subresource is copied.
    copyCommandList->CopyTextureRegion(&dstLoc, 0, 0, 0, &srcLoc, nullptr);

    copyCommandList->Close();
    ID3D12CommandList *commandLists[] = { copyCommandList.Get() };
    copyCommandQueue->ExecuteCommandLists(_countof(commandLists), commandLists);

    // fetchAndAddAcquire returns the value before the increment, hence the + 1.
    t.fenceValue = nextTextureUploadFenceValue.fetchAndAddAcquire(1) + 1;
    copyCommandQueue->Signal(textureUploadFence.Get(), t.fenceValue);

    if (Q_UNLIKELY(debug_texture()))
        qDebug("submitted old content copy for texture %u on the copy queue, fence %llu", id, t.fenceValue);
}

// Queues an upload of images into subresource 0 of texture id on the copy queue.
//
// images[i] is copied to position dstPos[i] of the texture; both vectors must
// have the same length. An empty images vector is a no-op. Mipmapped textures
// only accept a single image placed at (0, 0), and only while no earlier
// upload is pending for them; violations are reported with a warning and
// ignored. flags is consulted for TextureUploadAlways32Bit when choosing the
// upload format.
//
// On success t.fenceValue holds the value that is signaled on
// textureUploadFence after the submitted copy commands. When a staging
// resource cannot be created or mapped, nothing is submitted: the copy
// command list is closed again, t.fenceValue is restored to its previous
// value and no staging resources are registered with the texture.
void QSGD3D12EnginePrivate::queueTextureUpload(uint id, const QVector<QImage> &images, const QVector<QPoint> &dstPos,
                                               QSGD3D12Engine::TextureUploadFlags flags)
{
    Q_ASSERT(id);
    Q_ASSERT(images.count() == dstPos.count());
    if (images.isEmpty())
        return;

    const int idx = id - 1;
    Q_ASSERT(idx < textures.count() && textures[idx].entryInUse());
    Texture &t(textures[idx]);
    Q_ASSERT(t.texture);

    // When mipmapping is not in use, image can be smaller than the size passed
    // to createTexture() and dstPos can specify a non-zero destination position.

    if (t.mipmap() && (images.count() != 1 || dstPos.count() != 1 || !dstPos[0].isNull())) {
        qWarning("Mipmapped textures (%u) do not support partial uploads", id);
        return;
    }

    // Make life simpler by disallowing queuing a new mipmapped upload before the previous one finishes.
    if (t.mipmap() && t.fenceValue) {
        qWarning("Attempted to queue mipmapped texture upload (%u) while a previous upload is still in progress", id);
        return;
    }

    // The new fence value is only signaled at the very end of this function.
    // Remember the old one so that a failed attempt does not leave the texture
    // pointing at a value that nothing will ever signal.
    const auto previousFenceValue = t.fenceValue;
    t.fenceValue = nextTextureUploadFenceValue.fetchAndAddAcquire(1) + 1;

    if (Q_UNLIKELY(debug_texture()))
        qDebug("adding upload for texture %u on the copy queue, fence %llu", id, t.fenceValue);

    D3D12_RESOURCE_DESC textureDesc = t.texture->GetDesc();
    const QSize adjustedTextureSize(textureDesc.Width, textureDesc.Height);

    // First pass: figure out how large the staging heap has to be. Every
    // sub-upload gets its own placed buffer, so each one is rounded up to the
    // placement alignment.
    int totalSize = 0;
    for (const QImage &image : images) {
        int bytesPerPixel;
        textureFormat(image.format(), t.alpha(), t.mipmap(),
                      flags.testFlag(QSGD3D12Engine::TextureUploadAlways32Bit),
                      nullptr, &bytesPerPixel);
        // Mipmapped uploads are scaled to the texture size further below.
        const int w = !t.mipmap() ? image.width() : adjustedTextureSize.width();
        const int h = !t.mipmap() ? image.height() : adjustedTextureSize.height();
        const int stride = alignedSize(w * bytesPerPixel, D3D12_TEXTURE_DATA_PITCH_ALIGNMENT);
        totalSize += alignedSize(h * stride, D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT);
    }

    if (Q_UNLIKELY(debug_texture()))
        qDebug("%d sub-uploads, heap size %d bytes", images.count(), totalSize);

    // Instead of individual committed resources for each upload buffer,
    // allocate only once and use placed resources.
    D3D12_HEAP_PROPERTIES uploadHeapProp = {};
    uploadHeapProp.Type = D3D12_HEAP_TYPE_UPLOAD;
    D3D12_HEAP_DESC uploadHeapDesc = {};
    uploadHeapDesc.SizeInBytes = totalSize;
    uploadHeapDesc.Properties = uploadHeapProp;
    uploadHeapDesc.Flags = D3D12_HEAP_FLAG_ALLOW_ONLY_BUFFERS;

    Texture::StagingHeap sheap;
    if (FAILED(device->CreateHeap(&uploadHeapDesc, IID_PPV_ARGS(&sheap.heap)))) {
        qWarning("Failed to create texture upload heap of size %d", totalSize);
        t.fenceValue = previousFenceValue;
        return;
    }

    // Staging buffers are collected locally and only handed over to the
    // texture once all of them were created and filled successfully.
    QVector<Texture::StagingBuffer> newStagingBuffers;

    copyCommandList->Reset(copyCommandAllocator.Get(), nullptr);

    int placedOffset = 0;
    for (int i = 0; i < images.count(); ++i) {
        QImage::Format convFormat;
        int bytesPerPixel;
        textureFormat(images[i].format(), t.alpha(), t.mipmap(),
                      flags.testFlag(QSGD3D12Engine::TextureUploadAlways32Bit),
                      &convFormat, &bytesPerPixel);
        if (Q_UNLIKELY(debug_texture() && i == 0))
            qDebug("source image format %d, target format %d, bpp %d", images[i].format(), convFormat, bytesPerPixel);

        QImage convImage = images[i].format() == convFormat ? images[i] : images[i].convertToFormat(convFormat);

        if (t.mipmap() && adjustedTextureSize != convImage.size())
            convImage = convImage.scaled(adjustedTextureSize, Qt::IgnoreAspectRatio, Qt::SmoothTransformation);

        const int stride = alignedSize(convImage.width() * bytesPerPixel, D3D12_TEXTURE_DATA_PITCH_ALIGNMENT);

        D3D12_RESOURCE_DESC bufDesc = {};
        bufDesc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER;
        bufDesc.Width = stride * convImage.height();
        bufDesc.Height = 1;
        bufDesc.DepthOrArraySize = 1;
        bufDesc.MipLevels = 1;
        bufDesc.Format = DXGI_FORMAT_UNKNOWN;
        bufDesc.SampleDesc.Count = 1;
        bufDesc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR;

        Texture::StagingBuffer sbuf;
        if (FAILED(device->CreatePlacedResource(sheap.heap.Get(), placedOffset,
                                                &bufDesc, D3D12_RESOURCE_STATE_GENERIC_READ,
                                                nullptr, IID_PPV_ARGS(&sbuf.buffer)))) {
            qWarning("Failed to create texture upload buffer");
            // A command list has to be closed before it can be Reset() again.
            copyCommandList->Close();
            t.fenceValue = previousFenceValue;
            return;
        }

        quint8 *p = nullptr;
        // Empty read range: the CPU only writes to the upload buffer.
        const D3D12_RANGE readRange = { 0, 0 };
        if (FAILED(sbuf.buffer->Map(0, &readRange, reinterpret_cast<void **>(&p)))) {
            qWarning("Map failed (texture upload buffer)");
            copyCommandList->Close();
            t.fenceValue = previousFenceValue;
            return;
        }
        // Copy row by row since the destination rows are padded to the pitch alignment.
        for (int y = 0, ye = convImage.height(); y < ye; ++y) {
            memcpy(p, convImage.constScanLine(y), convImage.width() * bytesPerPixel);
            p += stride;
        }
        sbuf.buffer->Unmap(0, nullptr);

        D3D12_TEXTURE_COPY_LOCATION dstLoc;
        dstLoc.pResource = t.texture.Get();
        dstLoc.Type = D3D12_TEXTURE_COPY_TYPE_SUBRESOURCE_INDEX;
        dstLoc.SubresourceIndex = 0;

        D3D12_TEXTURE_COPY_LOCATION srcLoc;
        srcLoc.pResource = sbuf.buffer.Get();
        srcLoc.Type = D3D12_TEXTURE_COPY_TYPE_PLACED_FOOTPRINT;
        srcLoc.PlacedFootprint.Offset = 0;
        srcLoc.PlacedFootprint.Footprint.Format = textureDesc.Format;
        srcLoc.PlacedFootprint.Footprint.Width = convImage.width();
        srcLoc.PlacedFootprint.Footprint.Height = convImage.height();
        srcLoc.PlacedFootprint.Footprint.Depth = 1;
        srcLoc.PlacedFootprint.Footprint.RowPitch = stride;

        copyCommandList->CopyTextureRegion(&dstLoc, dstPos[i].x(), dstPos[i].y(), 0, &srcLoc, nullptr);

        newStagingBuffers.append(sbuf);
        placedOffset += alignedSize(bufDesc.Width, D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT);
    }

    // Everything was recorded; the texture now takes over the staging
    // resources referenced by the copy commands.
    t.stagingHeaps.append(sheap);
    for (const Texture::StagingBuffer &sbuf : newStagingBuffers)
        t.stagingBuffers.append(sbuf);

    copyCommandList->Close();
    ID3D12CommandList *commandLists[] = { copyCommandList.Get() };
    copyCommandQueue->ExecuteCommandLists(_countof(commandLists), commandLists);
    copyCommandQueue->Signal(textureUploadFence.Get(), t.fenceValue);
}

// Queues the D3D12 objects of texture id for deferred deletion and records a
// pending release for the texture entry itself. Does nothing for id 0, when
// the engine is not initialized, or when the entry is not in use.
void QSGD3D12EnginePrivate::releaseTexture(uint id)
{
    if (!initialized || !id)
        return;

    const int textureIndex = id - 1;
    Q_ASSERT(textureIndex < textures.count());

    if (Q_UNLIKELY(debug_texture()))
        qDebug("releasing texture %d", id);

    Texture &tex(textures[textureIndex]);
    if (!tex.entryInUse())
        return;

    // The resource and its descriptors go through the delete queue instead
    // of being dropped right away.
    if (tex.texture) {
        deferredDelete(tex.texture);
        deferredDelete(tex.srv, D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV);
        for (D3D12_CPU_DESCRIPTOR_HANDLE uavHandle : tex.mipUAVs)
            deferredDelete(uavHandle, D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV);
    }

    // Inside a frame the release is tied to the current frame slot, otherwise
    // to the out-of-frame set of the next slot.
    const PersistentFrameData::PendingRelease pending(PersistentFrameData::PendingRelease::TypeTexture, id);
    if (inFrame)
        pframeData[currentPFrameIndex].pendingReleases.insert(pending);
    else
        pframeData[(currentPFrameIndex + 1) % frameInFlightCount].outOfFramePendingReleases.insert(pending);
}

// Registers texture id as the next active texture of the frame being built.
// Only valid between beginFrame() and endFrame(); otherwise a warning is
// printed and nothing happens.
void QSGD3D12EnginePrivate::useTexture(uint id)
{
    if (!inFrame) {
        qWarning("%s: Cannot be called outside begin/endFrame", __FUNCTION__);
        return;
    }

    Q_ASSERT(id);
    const int textureIndex = id - 1;
    Q_ASSERT(textureIndex < textures.count() && textures[textureIndex].entryInUse());

    // Within one frame the order of calling this function determines the
    // texture register (0, 1, ...), hence the entries are simply appended.
    const TransientFrameData::ActiveTexture entry(TransientFrameData::ActiveTexture::TypeTexture, id);
    tframeData.activeTextures[tframeData.activeTextureCount++] = entry;

    // A non-zero fence value means the texture has an upload associated with
    // it; note that down for the current frame slot.
    if (textures[textureIndex].fenceValue)
        pframeData[currentPFrameIndex].pendingTextureUploads.insert(id);
}

// Creates the compute root signature and pipeline state used for mipmap
// generation and stores enginePriv for later use.
//
// Root signature layout:
//   parameter 0: descriptor table with 1 SRV (t0)
//   parameter 1: descriptor table with 4 UAVs (u0..u3)
//   parameter 2: four 32-bit root constants (b0)
//   plus one static linear, clamping sampler
//
// Returns true on success. On failure a warning is printed and false is
// returned.
bool QSGD3D12EnginePrivate::MipMapGen::initialize(QSGD3D12EnginePrivate *enginePriv)
{
    engine = enginePriv;

    D3D12_STATIC_SAMPLER_DESC sampler = {};
    sampler.Filter = D3D12_FILTER_MIN_MAG_MIP_LINEAR;
    sampler.AddressU = D3D12_TEXTURE_ADDRESS_MODE_CLAMP;
    sampler.AddressV = D3D12_TEXTURE_ADDRESS_MODE_CLAMP;
    sampler.AddressW = D3D12_TEXTURE_ADDRESS_MODE_CLAMP;
    sampler.MinLOD = 0.0f;
    sampler.MaxLOD = D3D12_FLOAT32_MAX;

    D3D12_DESCRIPTOR_RANGE descRange[2];
    descRange[0].RangeType = D3D12_DESCRIPTOR_RANGE_TYPE_SRV;
    descRange[0].NumDescriptors = 1;
    descRange[0].BaseShaderRegister = 0; // t0
    descRange[0].RegisterSpace = 0;
    descRange[0].OffsetInDescriptorsFromTableStart = D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND;
    descRange[1].RangeType = D3D12_DESCRIPTOR_RANGE_TYPE_UAV;
    descRange[1].NumDescriptors = 4;
    descRange[1].BaseShaderRegister = 0; // u0..u3
    descRange[1].RegisterSpace = 0;
    descRange[1].OffsetInDescriptorsFromTableStart = D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND;

    // Split into two to allow switching between the first and second set of UAVs later.
    D3D12_ROOT_PARAMETER rootParameters[3];
    rootParameters[0].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE;
    rootParameters[0].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL;
    rootParameters[0].DescriptorTable.NumDescriptorRanges = 1;
    rootParameters[0].DescriptorTable.pDescriptorRanges = &descRange[0];

    rootParameters[1].ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE;
    rootParameters[1].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL;
    rootParameters[1].DescriptorTable.NumDescriptorRanges = 1;
    rootParameters[1].DescriptorTable.pDescriptorRanges = &descRange[1];

    rootParameters[2].ParameterType = D3D12_ROOT_PARAMETER_TYPE_32BIT_CONSTANTS;
    rootParameters[2].ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL;
    rootParameters[2].Constants.Num32BitValues = 4; // uint2 mip1Size, uint sampleLevel, uint totalMips
    rootParameters[2].Constants.ShaderRegister = 0; // b0
    rootParameters[2].Constants.RegisterSpace = 0;

    D3D12_ROOT_SIGNATURE_DESC desc = {};
    desc.NumParameters = 3;
    desc.pParameters = rootParameters;
    desc.NumStaticSamplers = 1;
    desc.pStaticSamplers = &sampler;

    ComPtr<ID3DBlob> signature;
    ComPtr<ID3DBlob> error;
    if (FAILED(D3D12SerializeRootSignature(&desc, D3D_ROOT_SIGNATURE_VERSION_1, &signature, &error))) {
        // The error blob is optional output; it may be missing even when the
        // call fails, so only read it when it is actually there.
        QByteArray msg;
        if (error)
            msg = QByteArray(static_cast<const char *>(error->GetBufferPointer()), error->GetBufferSize());
        qWarning("Failed to serialize compute root signature: %s", qPrintable(msg));
        return false;
    }
    if (FAILED(engine->device->CreateRootSignature(0, signature->GetBufferPointer(), signature->GetBufferSize(),
                                                   IID_PPV_ARGS(&rootSig)))) {
        qWarning("Failed to create compute root signature");
        return false;
    }

    // The compute shader bytecode comes from the precompiled header included
    // at the top of the file.
    D3D12_COMPUTE_PIPELINE_STATE_DESC psoDesc = {};
    psoDesc.pRootSignature = rootSig.Get();
    psoDesc.CS.pShaderBytecode = g_CS_Generate4MipMaps;
    psoDesc.CS.BytecodeLength = sizeof(g_CS_Generate4MipMaps);

    if (FAILED(engine->device->CreateComputePipelineState(&psoDesc, IID_PPV_ARGS(&pipelineState)))) {
        qWarning("Failed to create compute pipeline state");
        return false;
    }

    return true;
}

// Drops the references to the compute pipeline state and root signature
// created by initialize().
void QSGD3D12EnginePrivate::MipMapGen::releaseResources()
{
    pipelineState.Reset();
    rootSig.Reset();
}

// The mipmap generator is used to insert commands on the main 3D queue. It is
// guaranteed that the queue has a wait for the base texture level upload
// before invoking queueGenerate(). There can be any number of invocations
// without waiting for earlier ones to finish. finished() is invoked when it is
// known for sure that the frame containing the upload and mipmap generation
// has finished on the GPU.

// Records the compute work that fills the mip levels of t on the engine's
// command list. The texture is transitioned from COPY_DEST to
// UNORDERED_ACCESS for the dispatches and ends up in PIXEL_SHADER_RESOURCE
// state. The SRV and the per-level UAVs of t are copied into the current
// frame's shader-visible CBV/SRV/UAV heap, consuming 1 + (MipLevels - 1)
// descriptors from it.
void QSGD3D12EnginePrivate::MipMapGen::queueGenerate(const Texture &t)
{
    D3D12_RESOURCE_DESC textureDesc = t.texture->GetDesc();

    engine->commandList->SetPipelineState(pipelineState.Get());
    engine->commandList->SetComputeRootSignature(rootSig.Get());

    // 1 SRV + (miplevels - 1) UAVs
    const int descriptorCount = 1 + (textureDesc.MipLevels - 1);

    engine->ensureGPUDescriptorHeap(descriptorCount);

    // The descriptor heap is set on the command list either because the
    // ensure() call above resized, or, typically, due to a texture-dependent
    // draw call earlier.

    engine->transitionResource(t.texture.Get(), engine->commandList,
                               D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_UNORDERED_ACCESS);

    QSGD3D12EnginePrivate::PersistentFrameData &pfd(engine->pframeData[engine->currentPFrameIndex]);

    // CPU-side write position: the first free slot of the frame's descriptor heap.
    const uint stride = engine->cpuDescHeapManager.handleSize(D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV);
    D3D12_CPU_DESCRIPTOR_HANDLE h = pfd.gpuCbvSrvUavHeap->GetCPUDescriptorHandleForHeapStart();
    h.ptr += pfd.cbvSrvUavNextFreeDescriptorIndex * stride;

    // Slot 0: the texture's SRV.
    engine->device->CopyDescriptorsSimple(1, h, t.srv, D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV);
    h.ptr += stride;

    // Following slots: one UAV per mip level, starting with level 1.
    for (int level = 1; level < textureDesc.MipLevels; ++level, h.ptr += stride)
        engine->device->CopyDescriptorsSimple(1, h, t.mipUAVs[level - 1], D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV);

    // GPU-side address of the same first slot.
    D3D12_GPU_DESCRIPTOR_HANDLE gpuAddr = pfd.gpuCbvSrvUavHeap->GetGPUDescriptorHandleForHeapStart();
    gpuAddr.ptr += pfd.cbvSrvUavNextFreeDescriptorIndex * stride;

    engine->commandList->SetComputeRootDescriptorTable(0, gpuAddr);
    gpuAddr.ptr += stride; // now points to the first UAV

    // One dispatch per group of four levels; the UAV table (root parameter 1)
    // is moved forward by four descriptors each time.
    for (int level = 1; level < textureDesc.MipLevels; level += 4, gpuAddr.ptr += stride * 4) {
        engine->commandList->SetComputeRootDescriptorTable(1, gpuAddr);

        // Size of mip level 'level', clamped to at least 1x1.
        QSize sz(textureDesc.Width, textureDesc.Height);
        sz.setWidth(qMax(1, sz.width() >> level));
        sz.setHeight(qMax(1, sz.height() >> level));

        // Root constants, in the order given where the root signature is
        // built: uint2 mip1Size, uint sampleLevel, uint totalMips.
        const quint32 constants[4] = { quint32(sz.width()), quint32(sz.height()),
                                       quint32(level - 1),
                                       quint32(textureDesc.MipLevels - 1) };

        engine->commandList->SetComputeRoot32BitConstants(2, 4, constants, 0);
        engine->commandList->Dispatch(sz.width(), sz.height(), 1);
        // UAV barrier between dispatches: the next group samples from a level
        // written by this one (sampleLevel is level - 1).
        engine->uavBarrier(t.texture.Get(), engine->commandList);
    }

    engine->transitionResource(t.texture.Get(), engine->commandList,
                               D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE);

    // Mark the descriptors written above as used.
    pfd.cbvSrvUavNextFreeDescriptorIndex += descriptorCount;
}

// Puts a reference to res on a delete queue instead of dropping it right
// away: the current frame slot's queue while a frame is being built, the
// out-of-frame queue of the next frame slot otherwise.
void QSGD3D12EnginePrivate::deferredDelete(ComPtr<ID3D12Resource> res)
{
    PersistentFrameData::DeleteQueueEntry entry;
    entry.res = res;

    if (inFrame)
        pframeData[currentPFrameIndex].deleteQueue.append(entry);
    else
        pframeData[(currentPFrameIndex + 1) % frameInFlightCount].outOfFrameDeleteQueue.append(entry);
}

// Puts a reference to the descriptor heap dh on a delete queue: the current
// frame slot's queue while a frame is being built, the out-of-frame queue of
// the next frame slot otherwise.
void QSGD3D12EnginePrivate::deferredDelete(ComPtr<ID3D12DescriptorHeap> dh)
{
    PersistentFrameData::DeleteQueueEntry entry;
    entry.descHeap = dh;

    if (inFrame)
        pframeData[currentPFrameIndex].deleteQueue.append(entry);
    else
        pframeData[(currentPFrameIndex + 1) % frameInFlightCount].outOfFrameDeleteQueue.append(entry);
}

// Queues the CPU descriptor h of the given heap type on a delete queue: the
// current frame slot's queue while a frame is being built, the out-of-frame
// queue of the next frame slot otherwise.
void QSGD3D12EnginePrivate::deferredDelete(D3D12_CPU_DESCRIPTOR_HANDLE h, D3D12_DESCRIPTOR_HEAP_TYPE type)
{
    PersistentFrameData::DeleteQueueEntry entry;
    entry.cpuDescriptorPtr = h.ptr;
    entry.descHeapType = type;

    if (inFrame)
        pframeData[currentPFrameIndex].deleteQueue.append(entry);
    else
        pframeData[(currentPFrameIndex + 1) % frameInFlightCount].outOfFrameDeleteQueue.append(entry);
}

// Returns a new render target id obtained from newId() for the
// renderTargets container.
uint QSGD3D12EnginePrivate::genRenderTarget()
{
    const uint renderTargetId = newId(&renderTargets);
    return renderTargetId;
}

// Creates the D3D12 objects backing render target id: a color buffer of the
// given size, clear color and sample count, a depth-stencil buffer, and a
// shader resource view. id must refer to an entry that is in use.
//
// For multisampled targets an additional single-sample resolve texture is
// created and the shader resource view refers to that one instead of the
// color buffer.
//
// If the color buffer cannot be created, a warning is printed, the
// descriptors allocated here are queued for release and the function returns
// without creating anything else.
void QSGD3D12EnginePrivate::createRenderTarget(uint id, const QSize &size, const QVector4D &clearColor, uint samples)
{
    ensureDevice();

    Q_ASSERT(id);
    const int idx = id - 1;
    Q_ASSERT(idx < renderTargets.count() && renderTargets[idx].entryInUse());
    RenderTarget &rt(renderTargets[idx]);

    rt.rtv = cpuDescHeapManager.allocate(D3D12_DESCRIPTOR_HEAP_TYPE_RTV);
    rt.dsv = cpuDescHeapManager.allocate(D3D12_DESCRIPTOR_HEAP_TYPE_DSV);
    rt.srv = cpuDescHeapManager.allocate(D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV);

    ID3D12Resource *res = createColorBuffer(rt.rtv, size, clearColor, samples);
    if (!res) {
        // Without a color buffer there is nothing to query or view below.
        // releaseRenderTarget() only frees descriptors of existing buffers,
        // so give the ones allocated above back here.
        qWarning("Failed to create color buffer for render target %u", id);
        deferredDelete(rt.rtv, D3D12_DESCRIPTOR_HEAP_TYPE_RTV);
        deferredDelete(rt.dsv, D3D12_DESCRIPTOR_HEAP_TYPE_DSV);
        deferredDelete(rt.srv, D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV);
        return;
    }
    rt.color.Attach(res);

    ID3D12Resource *dsres = createDepthStencil(rt.dsv, size, samples);
    if (dsres)
        rt.ds.Attach(dsres);

    // Go by the sample count the color buffer reports for itself rather than
    // by the requested one.
    const bool multisample = rt.color->GetDesc().SampleDesc.Count > 1;
    syncEntryFlags(&rt, RenderTarget::Multisample, multisample);

    if (!multisample) {
        device->CreateShaderResourceView(rt.color.Get(), nullptr, rt.srv);
    } else {
        // The shader resource view needs a single-sample texture to refer to.
        D3D12_HEAP_PROPERTIES defaultHeapProp = {};
        defaultHeapProp.Type = D3D12_HEAP_TYPE_DEFAULT;

        D3D12_RESOURCE_DESC textureDesc = {};
        textureDesc.Dimension = D3D12_RESOURCE_DIMENSION_TEXTURE2D;
        textureDesc.Width = size.width();
        textureDesc.Height = size.height();
        textureDesc.DepthOrArraySize = 1;
        textureDesc.MipLevels = 1;
        textureDesc.Format = RT_COLOR_FORMAT;
        textureDesc.SampleDesc.Count = 1;
        textureDesc.Layout = D3D12_TEXTURE_LAYOUT_UNKNOWN;

        HRESULT hr = device->CreateCommittedResource(&defaultHeapProp, D3D12_HEAP_FLAG_NONE, &textureDesc,
                                                     D3D12_RESOURCE_STATE_COMMON, nullptr, IID_PPV_ARGS(&rt.colorResolve));
        if (FAILED(hr)) {
            qWarning("Failed to create resolve buffer: %s",
                     qPrintable(comErrorMessage(hr)));
            return;
        }

        device->CreateShaderResourceView(rt.colorResolve.Get(), nullptr, rt.srv);
    }

    if (Q_UNLIKELY(debug_render()))
        qDebug("created new render target %u, size %dx%d, samples %d", id, size.width(), size.height(), samples);
}

// Queues the buffers and descriptors of render target id for deferred
// deletion and marks the entry as unused. Does nothing for id 0, when the
// engine is not initialized, or when the entry is not in use.
void QSGD3D12EnginePrivate::releaseRenderTarget(uint id)
{
    if (!initialized || !id)
        return;

    const int targetIndex = id - 1;
    Q_ASSERT(targetIndex < renderTargets.count());
    RenderTarget &target(renderTargets[targetIndex]);
    if (!target.entryInUse())
        return;

    if (Q_UNLIKELY(debug_render()))
        qDebug("releasing render target %u", id);

    // Resolve texture, present for multisampled targets only.
    if (target.colorResolve) {
        deferredDelete(target.colorResolve);
        target.colorResolve.Reset();
    }

    // Color buffer together with its render target and shader resource views.
    if (target.color) {
        deferredDelete(target.color);
        target.color.Reset();
        deferredDelete(target.rtv, D3D12_DESCRIPTOR_HEAP_TYPE_RTV);
        deferredDelete(target.srv, D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV);
    }

    // Depth-stencil buffer together with its view.
    if (target.ds) {
        deferredDelete(target.ds);
        target.ds.Reset();
        deferredDelete(target.dsv, D3D12_DESCRIPTOR_HEAP_TYPE_DSV);
    }

    target.flags &= ~RenderTarget::EntryInUse;
}

// Registers render target id as the next active texture of the frame being
// built. Only valid between beginFrame() and endFrame(); otherwise a warning
// is printed and nothing happens.
void QSGD3D12EnginePrivate::useRenderTargetAsTexture(uint id)
{
    if (!inFrame) {
        qWarning("%s: Cannot be called outside begin/endFrame", __FUNCTION__);
        return;
    }

    Q_ASSERT(id);
    const int targetIndex = id - 1;
    Q_ASSERT(targetIndex < renderTargets.count());
    RenderTarget &target(renderTargets[targetIndex]);
    Q_ASSERT(target.entryInUse() && target.color);

    // When the NeedsReadBarrier flag is set, make the contents readable from
    // pixel shaders first: single-sample targets get a state transition,
    // multisampled ones are resolved into the resolve texture.
    if (target.flags & RenderTarget::NeedsReadBarrier) {
        target.flags &= ~RenderTarget::NeedsReadBarrier;
        if (!(target.flags & RenderTarget::Multisample))
            transitionResource(target.color.Get(), commandList, D3D12_RESOURCE_STATE_RENDER_TARGET, D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE);
        else
            resolveMultisampledTarget(target.color.Get(), target.colorResolve.Get(), D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE, commandList);
    }

    const TransientFrameData::ActiveTexture entry(TransientFrameData::ActiveTexture::TypeRenderTarget, id);
    tframeData.activeTextures[tframeData.activeTextureCount++] = entry;
}

// Copies the contents of a render target into a QImage.
//
// id 0 selects the swap chain back buffer for the current present index, any
// other id the corresponding entry in renderTargets. Multisampled sources
// are resolved first. The copy is recorded on frameCommandList, submitted to
// commandQueue, and waitForGPU() is called before the readback buffer is
// read on the CPU.
//
// For id 0 the rows are copied top-down; for other targets the rows are
// written in reverse order, i.e. the image is flipped vertically.
//
// Returns a null QImage when called during frame preparation, when the
// readback buffer cannot be created or mapped, or when the render target
// format has no matching QImage format.
QImage QSGD3D12EnginePrivate::executeAndWaitReadbackRenderTarget(uint id)
{
    // Readback due to QQuickWindow::grabWindow() happens outside
    // begin-endFrame, but QQuickItemGrabResult leads to rendering a layer
    // without a real frame afterwards and triggering readback. This has to be
    // supported as well.
    if (inFrame && (!activeLayers || currentLayerDepth)) {
        qWarning("%s: Cannot be called while frame preparation is active", __FUNCTION__);
        return QImage();
    }

    // Due to the above we insert a fake "real" frame when a layer was just rendered into.
    if (inFrame) {
        beginFrame();
        endFrame();
    }

    frameCommandList->Reset(frameCommandAllocator[frameIndex % frameInFlightCount].Get(), nullptr);

    // bstate is the state a non-resolved source is expected to be in; it is
    // only meaningful (and only read) when needsBarrier is true.
    D3D12_RESOURCE_STATES bstate;
    bool needsBarrier = false;
    ID3D12Resource *rtRes;
    if (id == 0) {
        const int idx = presentFrameIndex % swapChainBufferCount;
        if (windowSamples > 1) {
            resolveMultisampledTarget(defaultRT[idx].Get(), backBufferRT[idx].Get(),
                                      D3D12_RESOURCE_STATE_COPY_SOURCE, frameCommandList.Get());
        } else {
            bstate = D3D12_RESOURCE_STATE_PRESENT;
            needsBarrier = true;
        }
        rtRes = backBufferRT[idx].Get();
    } else {
        const int idx = id - 1;
        Q_ASSERT(idx < renderTargets.count());
        RenderTarget &rt(renderTargets[idx]);
        Q_ASSERT(rt.entryInUse() && rt.color);

        if (rt.flags & RenderTarget::Multisample) {
            resolveMultisampledTarget(rt.color.Get(), rt.colorResolve.Get(),
                                      D3D12_RESOURCE_STATE_COPY_SOURCE, frameCommandList.Get());
            rtRes = rt.colorResolve.Get();
        } else {
            rtRes = rt.color.Get();
            bstate = D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE;
            needsBarrier = true;
        }
    }

    ComPtr<ID3D12Resource> readbackBuf;

    // Ask the device for the buffer layout and size needed to hold subresource 0.
    D3D12_RESOURCE_DESC rtDesc = rtRes->GetDesc();
    UINT64 textureByteSize = 0;
    D3D12_PLACED_SUBRESOURCE_FOOTPRINT textureLayout = {};
    device->GetCopyableFootprints(&rtDesc, 0, 1, 0, &textureLayout, nullptr, nullptr, &textureByteSize);

    D3D12_HEAP_PROPERTIES heapProp = {};
    heapProp.Type = D3D12_HEAP_TYPE_READBACK;

    D3D12_RESOURCE_DESC bufDesc = {};
    bufDesc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER;
    bufDesc.Width = textureByteSize;
    bufDesc.Height = 1;
    bufDesc.DepthOrArraySize = 1;
    bufDesc.MipLevels = 1;
    bufDesc.Format = DXGI_FORMAT_UNKNOWN;
    bufDesc.SampleDesc.Count = 1;
    bufDesc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR;

    if (FAILED(device->CreateCommittedResource(&heapProp, D3D12_HEAP_FLAG_NONE, &bufDesc,
                                               D3D12_RESOURCE_STATE_COPY_DEST, nullptr, IID_PPV_ARGS(&readbackBuf)))) {
        qWarning("Failed to create committed resource (readback buffer)");
        // The command list was reset above; close it again so that the next
        // Reset() on it does not fail.
        frameCommandList->Close();
        return QImage();
    }

    D3D12_TEXTURE_COPY_LOCATION dstLoc;
    dstLoc.pResource = readbackBuf.Get();
    dstLoc.Type = D3D12_TEXTURE_COPY_TYPE_PLACED_FOOTPRINT;
    dstLoc.PlacedFootprint = textureLayout;
    D3D12_TEXTURE_COPY_LOCATION srcLoc;
    srcLoc.pResource = rtRes;
    srcLoc.Type = D3D12_TEXTURE_COPY_TYPE_SUBRESOURCE_INDEX;
    srcLoc.SubresourceIndex = 0;

    // Non-resolved sources are moved to COPY_SOURCE for the copy and back to
    // their previous state afterwards.
    ID3D12GraphicsCommandList *cl = frameCommandList.Get();
    if (needsBarrier)
        transitionResource(rtRes, cl, bstate, D3D12_RESOURCE_STATE_COPY_SOURCE);
    cl->CopyTextureRegion(&dstLoc, 0, 0, 0, &srcLoc, nullptr);
    if (needsBarrier)
        transitionResource(rtRes, cl, D3D12_RESOURCE_STATE_COPY_SOURCE, bstate);

    cl->Close();
    ID3D12CommandList *commandLists[] = { cl };
    commandQueue->ExecuteCommandLists(_countof(commandLists), commandLists);

    QScopedPointer<QSGD3D12CPUWaitableFence> f(createCPUWaitableFence());
    waitForGPU(f.data()); // uh oh

    QImage::Format fmt = imageFormatForTexture(rtDesc.Format);
    if (fmt == QImage::Format_Invalid) {
        qWarning("Could not map render target format %d to a QImage format", rtDesc.Format);
        return QImage();
    }
    QImage img(rtDesc.Width, rtDesc.Height, fmt);
    quint8 *p = nullptr;
    // The whole buffer is read on the CPU below. A null read range says
    // exactly that, whereas an empty range would declare that nothing is read.
    if (FAILED(readbackBuf->Map(0, nullptr, reinterpret_cast<void **>(&p)))) {
        qWarning("Mapping the readback buffer failed");
        return QImage();
    }
    const int bpp = 4; // ###
    // Source rows are RowPitch bytes apart, which can be more than the
    // number of bytes in an image row, so copy one row at a time.
    if (id == 0) {
        for (UINT y = 0; y < rtDesc.Height; ++y) {
            quint8 *dst = img.scanLine(y);
            memcpy(dst, p, rtDesc.Width * bpp);
            p += textureLayout.Footprint.RowPitch;
        }
    } else {
        // Fill the image from the last row upwards, flipping it vertically.
        for (int y = rtDesc.Height - 1; y >= 0; --y) {
            quint8 *dst = img.scanLine(y);
            memcpy(dst, p, rtDesc.Width * bpp);
            p += textureLayout.Footprint.RowPitch;
        }
    }
    // Nothing was written by the CPU; an empty written range expresses that.
    const D3D12_RANGE writtenRange = { 0, 0 };
    readbackBuf->Unmap(0, &writtenRange);

    return img;
}

// Prints a warning and asks the device loss tester to kill the device.
// DeviceLossTester::killDevice() is empty unless the file is built with
// DEVLOSS_TEST, so without that define only the warning is printed.
void QSGD3D12EnginePrivate::simulateDeviceLoss()
{
    qWarning("QSGD3D12Engine: Triggering device loss via TDR");
    devLossTest.killDevice();
}

// Stores enginePriv and, when built with DEVLOSS_TEST, creates the compute
// root signature (one descriptor table with a CBV and a UAV range) and the
// compute pipeline state for the g_timeout shader.
//
// Returns true on success (always, without DEVLOSS_TEST). On failure a
// warning is printed and false is returned.
bool QSGD3D12EnginePrivate::DeviceLossTester::initialize(QSGD3D12EnginePrivate *enginePriv)
{
    engine = enginePriv;

#ifdef DEVLOSS_TEST
    D3D12_DESCRIPTOR_RANGE descRange[2];
    descRange[0].RangeType = D3D12_DESCRIPTOR_RANGE_TYPE_CBV;
    descRange[0].NumDescriptors = 1;
    descRange[0].BaseShaderRegister = 0;
    descRange[0].RegisterSpace = 0;
    descRange[0].OffsetInDescriptorsFromTableStart = D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND;
    descRange[1].RangeType = D3D12_DESCRIPTOR_RANGE_TYPE_UAV;
    descRange[1].NumDescriptors = 1;
    descRange[1].BaseShaderRegister = 0;
    descRange[1].RegisterSpace = 0;
    descRange[1].OffsetInDescriptorsFromTableStart = D3D12_DESCRIPTOR_RANGE_OFFSET_APPEND;

    D3D12_ROOT_PARAMETER param;
    param.ParameterType = D3D12_ROOT_PARAMETER_TYPE_DESCRIPTOR_TABLE;
    param.ShaderVisibility = D3D12_SHADER_VISIBILITY_ALL;
    param.DescriptorTable.NumDescriptorRanges = 2;
    param.DescriptorTable.pDescriptorRanges = descRange;

    D3D12_ROOT_SIGNATURE_DESC desc = {};
    desc.NumParameters = 1;
    desc.pParameters = &param;

    ComPtr<ID3DBlob> signature;
    ComPtr<ID3DBlob> error;
    if (FAILED(D3D12SerializeRootSignature(&desc, D3D_ROOT_SIGNATURE_VERSION_1, &signature, &error))) {
        // The error blob is optional output; it may be missing even when the
        // call fails, so only read it when it is actually there.
        QByteArray msg;
        if (error)
            msg = QByteArray(static_cast<const char *>(error->GetBufferPointer()), error->GetBufferSize());
        qWarning("Failed to serialize compute root signature: %s", qPrintable(msg));
        return false;
    }
    if (FAILED(engine->device->CreateRootSignature(0, signature->GetBufferPointer(), signature->GetBufferSize(),
                                        IID_PPV_ARGS(&computeRootSignature)))) {
        qWarning("Failed to create compute root signature");
        return false;
    }

    D3D12_COMPUTE_PIPELINE_STATE_DESC psoDesc = {};
    psoDesc.pRootSignature = computeRootSignature.Get();
    psoDesc.CS.pShaderBytecode = g_timeout;
    psoDesc.CS.BytecodeLength = sizeof(g_timeout);

    if (FAILED(engine->device->CreateComputePipelineState(&psoDesc, IID_PPV_ARGS(&computeState)))) {
        qWarning("Failed to create compute pipeline state");
        return false;
    }
#endif

    return true;
}

// Drops the references to the compute pipeline state and root signature
// held by the device loss tester.
void QSGD3D12EnginePrivate::DeviceLossTester::releaseResources()
{
    computeState.Reset();
    computeRootSignature.Reset();
}

// With DEVLOSS_TEST defined: records a dispatch of the g_timeout compute
// pipeline on the engine's frame command list, submits it on the engine's
// command queue and calls waitGPU(). Without the define this is a no-op.
void QSGD3D12EnginePrivate::DeviceLossTester::killDevice()
{
#ifdef DEVLOSS_TEST
    ID3D12GraphicsCommandList *list = engine->frameCommandList.Get();
    ID3D12CommandAllocator *allocator = engine->frameCommandAllocator[engine->frameIndex % engine->frameInFlightCount].Get();

    // Start recording with the compute pipeline state already bound.
    list->Reset(allocator, computeState.Get());
    list->SetComputeRootSignature(computeRootSignature.Get());
    list->Dispatch(256, 1, 1);
    list->Close();

    ID3D12CommandList *submitted[] = { list };
    engine->commandQueue->ExecuteCommandLists(_countof(submitted), submitted);

    engine->waitGPU();
#endif
}

// Returns the native object behind the given QSGRendererInterface resource:
// the device, the command queue or the command list. Any other value yields
// nullptr.
void *QSGD3D12EnginePrivate::getResource(QSGRendererInterface::Resource resource) const
{
    if (resource == QSGRendererInterface::DeviceResource)
        return device;

    if (resource == QSGRendererInterface::CommandQueueResource)
        return commandQueue.Get();

    if (resource == QSGRendererInterface::CommandListResource)
        return commandList;

    return nullptr;
}

QT_END_NAMESPACE
