/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifdef USE_VC4_SIMULATOR

#include "util/u_memory.h"
#include "util/ralloc.h"

#include "vc4_screen.h"
#include "vc4_context.h"
#include "kernel/vc4_drv.h"
#include "vc4_simulator_validate.h"
#include "simpenrose/simpenrose.h"

static mtx_t exec_mutex = _MTX_INITIALIZER_NP;

/* A marker placed just after each BO, then checked after rendering to make
 * sure it's still there.
 */
#define BO_SENTINEL		0xfedcba98

#define OVERFLOW_SIZE (32 * 1024 * 1024)

static struct drm_gem_cma_object *
vc4_wrap_bo_with_cma(struct drm_device *dev, struct vc4_bo *bo)
{
        struct vc4_context *vc4 = dev->vc4;
        struct vc4_screen *screen = vc4->screen;
        struct drm_vc4_bo *drm_bo = CALLOC_STRUCT(drm_vc4_bo);
        struct drm_gem_cma_object *obj = &drm_bo->base;
        uint32_t size = align(bo->size, 4096);

        drm_bo->bo = bo;
        obj->base.size = size;
        obj->base.dev = dev;
        obj->vaddr = screen->simulator_mem_base + dev->simulator_mem_next;
        obj->paddr = simpenrose_hw_addr(obj->vaddr);

        dev->simulator_mem_next += size + sizeof(uint32_t);
        dev->simulator_mem_next = align(dev->simulator_mem_next, 4096);
        assert(dev->simulator_mem_next <= screen->simulator_mem_size);

        *(uint32_t *)(obj->vaddr + bo->size) = BO_SENTINEL;

        return obj;
}

struct drm_gem_cma_object *
drm_gem_cma_create(struct drm_device *dev, size_t size)
{
        struct vc4_context *vc4 = dev->vc4;
        struct vc4_screen *screen = vc4->screen;

        struct vc4_bo *bo = vc4_bo_alloc(screen, size, "simulator validate");
        return vc4_wrap_bo_with_cma(dev, bo);
}

static int
vc4_simulator_pin_bos(struct drm_device *dev, struct vc4_job *job,
                      struct vc4_exec_info *exec)
{
        struct drm_vc4_submit_cl *args = exec->args;
        struct vc4_bo **bos = job->bo_pointers.base;

        exec->bo_count = args->bo_handle_count;
        exec->bo = calloc(exec->bo_count, sizeof(void *));
        for (int i = 0; i < exec->bo_count; i++) {
                struct vc4_bo *bo = bos[i];
                struct drm_gem_cma_object *obj = vc4_wrap_bo_with_cma(dev, bo);

                struct drm_vc4_bo *drm_bo = to_vc4_bo(&obj->base);
#if 0
                fprintf(stderr, "bo hindex %d: %s\n", i, bo->name);
#endif

                vc4_bo_map(bo);
                memcpy(obj->vaddr, bo->map, bo->size);

                exec->bo[i] = obj;

                /* The kernel does this validation at shader create ioctl
                 * time.
                 */
                if (strcmp(bo->name, "code") == 0) {
                        drm_bo->validated_shader = vc4_validate_shader(obj);
                        if (!drm_bo->validated_shader)
                                abort();
                }
        }
        return 0;
}

static int
vc4_simulator_unpin_bos(struct vc4_exec_info *exec)
{
        for (int i = 0; i < exec->bo_count; i++) {
                struct drm_gem_cma_object *obj = exec->bo[i];
                struct drm_vc4_bo *drm_bo = to_vc4_bo(&obj->base);
                struct vc4_bo *bo = drm_bo->bo;

                assert(*(uint32_t *)(obj->vaddr + bo->size) == BO_SENTINEL);
                memcpy(bo->map, obj->vaddr, bo->size);

                if (drm_bo->validated_shader) {
                        free(drm_bo->validated_shader->texture_samples);
                        free(drm_bo->validated_shader);
                }
                free(obj);
        }

        free(exec->bo);

        return 0;
}

static void
vc4_dump_to_file(struct vc4_exec_info *exec)
{
        static int dumpno = 0;
        struct drm_vc4_get_hang_state *state;
        struct drm_vc4_get_hang_state_bo *bo_state;
        unsigned int dump_version = 0;

        if (!(vc4_debug & VC4_DEBUG_DUMP))
                return;

        state = calloc(1, sizeof(*state));

        int unref_count = 0;
        list_for_each_entry_safe(struct drm_vc4_bo, bo, &exec->unref_list,
                                 unref_head) {
                unref_count++;
        }

        /* Add one more for the overflow area that isn't wrapped in a BO. */
        state->bo_count = exec->bo_count + unref_count + 1;
        bo_state = calloc(state->bo_count, sizeof(*bo_state));

        char *filename = NULL;
        asprintf(&filename, "vc4-dri-%d.dump", dumpno++);
        FILE *f = fopen(filename, "w+");
        if (!f) {
                fprintf(stderr, "Couldn't open %s: %s", filename,
                        strerror(errno));
                return;
        }

        fwrite(&dump_version, sizeof(dump_version), 1, f);

        state->ct0ca = exec->ct0ca;
        state->ct0ea = exec->ct0ea;
        state->ct1ca = exec->ct1ca;
        state->ct1ea = exec->ct1ea;
        state->start_bin = exec->ct0ca;
        state->start_render = exec->ct1ca;
        fwrite(state, sizeof(*state), 1, f);

        int i;
        for (i = 0; i < exec->bo_count; i++) {
                struct drm_gem_cma_object *cma_bo = exec->bo[i];
                bo_state[i].handle = i; /* Not used by the parser. */
                bo_state[i].paddr = cma_bo->paddr;
                bo_state[i].size = cma_bo->base.size;
        }

        list_for_each_entry_safe(struct drm_vc4_bo, bo, &exec->unref_list,
                                 unref_head) {
                struct drm_gem_cma_object *cma_bo = &bo->base;
                bo_state[i].handle = 0;
                bo_state[i].paddr = cma_bo->paddr;
                bo_state[i].size = cma_bo->base.size;
                i++;
        }

        /* Add the static overflow memory area. */
        bo_state[i].handle = exec->bo_count;
        bo_state[i].paddr = 0;
        bo_state[i].size = OVERFLOW_SIZE;
        i++;

        fwrite(bo_state, sizeof(*bo_state), state->bo_count, f);

        for (int i = 0; i < exec->bo_count; i++) {
                struct drm_gem_cma_object *cma_bo = exec->bo[i];
                fwrite(cma_bo->vaddr, cma_bo->base.size, 1, f);
        }

        list_for_each_entry_safe(struct drm_vc4_bo, bo, &exec->unref_list,
                                 unref_head) {
                struct drm_gem_cma_object *cma_bo = &bo->base;
                fwrite(cma_bo->vaddr, cma_bo->base.size, 1, f);
        }

        void *overflow = calloc(1, OVERFLOW_SIZE);
        fwrite(overflow, 1, OVERFLOW_SIZE, f);
        free(overflow);

        free(state);
        free(bo_state);
        fclose(f);
}

int
vc4_simulator_flush(struct vc4_context *vc4,
                    struct drm_vc4_submit_cl *args, struct vc4_job *job)
{
        struct vc4_screen *screen = vc4->screen;
        struct vc4_surface *csurf = vc4_surface(vc4->framebuffer.cbufs[0]);
        struct vc4_resource *ctex = csurf ? vc4_resource(csurf->base.texture) : NULL;
        uint32_t winsys_stride = ctex ? ctex->bo->simulator_winsys_stride : 0;
        uint32_t sim_stride = ctex ? ctex->slices[0].stride : 0;
        uint32_t row_len = MIN2(sim_stride, winsys_stride);
        struct vc4_exec_info exec;
        struct drm_device local_dev = {
                .vc4 = vc4,
                .simulator_mem_next = OVERFLOW_SIZE,
        };
        struct drm_device *dev = &local_dev;
        int ret;

        memset(&exec, 0, sizeof(exec));
        list_inithead(&exec.unref_list);

        if (ctex && ctex->bo->simulator_winsys_map) {
#if 0
                fprintf(stderr, "%dx%d %d %d %d\n",
                        ctex->base.b.width0, ctex->base.b.height0,
                        winsys_stride,
                        sim_stride,
                        ctex->bo->size);
#endif

                for (int y = 0; y < ctex->base.b.height0; y++) {
                        memcpy(ctex->bo->map + y * sim_stride,
                               ctex->bo->simulator_winsys_map + y * winsys_stride,
                               row_len);
                }
        }

        exec.args = args;

        ret = vc4_simulator_pin_bos(dev, job, &exec);
        if (ret)
                return ret;

        ret = vc4_cl_validate(dev, &exec);
        if (ret)
                return ret;

        if (vc4_debug & VC4_DEBUG_CL) {
                fprintf(stderr, "RCL:\n");
                vc4_dump_cl(screen->simulator_mem_base + exec.ct1ca,
                            exec.ct1ea - exec.ct1ca, true);
        }

        vc4_dump_to_file(&exec);

        if (exec.ct0ca != exec.ct0ea) {
                int bfc = simpenrose_do_binning(exec.ct0ca, exec.ct0ea);
                if (bfc != 1) {
                        fprintf(stderr, "Binning returned %d flushes, should be 1.\n",
                                bfc);
                        fprintf(stderr, "Relocated binning command list:\n");
                        vc4_dump_cl(screen->simulator_mem_base + exec.ct0ca,
                                    exec.ct0ea - exec.ct0ca, false);
                        abort();
                }
        }
        int rfc = simpenrose_do_rendering(exec.ct1ca, exec.ct1ea);
        if (rfc != 1) {
                fprintf(stderr, "Rendering returned %d frames, should be 1.\n",
                        rfc);
                fprintf(stderr, "Relocated render command list:\n");
                vc4_dump_cl(screen->simulator_mem_base + exec.ct1ca,
                            exec.ct1ea - exec.ct1ca, true);
                abort();
        }

        ret = vc4_simulator_unpin_bos(&exec);
        if (ret)
                return ret;

        list_for_each_entry_safe(struct drm_vc4_bo, bo, &exec.unref_list,
                                 unref_head) {
		list_del(&bo->unref_head);
                assert(*(uint32_t *)(bo->base.vaddr + bo->bo->size) ==
                       BO_SENTINEL);
                vc4_bo_unreference(&bo->bo);
                free(bo);
        }

        if (ctex && ctex->bo->simulator_winsys_map) {
                for (int y = 0; y < ctex->base.b.height0; y++) {
                        memcpy(ctex->bo->simulator_winsys_map + y * winsys_stride,
                               ctex->bo->map + y * sim_stride,
                               row_len);
                }
        }

        return 0;
}

static void *sim_mem_base = NULL;
static int sim_mem_refcount = 0;
static ssize_t sim_mem_size = 256 * 1024 * 1024;

void
vc4_simulator_init(struct vc4_screen *screen)
{
        mtx_lock(&exec_mutex);
        if (sim_mem_refcount++) {
                screen->simulator_mem_size = sim_mem_size;
                screen->simulator_mem_base = sim_mem_base;
                mtx_unlock(&exec_mutex);
                return;
        }

        sim_mem_base = calloc(sim_mem_size, 1);
        if (!sim_mem_base)
                abort();

        screen->simulator_mem_size = sim_mem_size;
        screen->simulator_mem_base = sim_mem_base;

        /* We supply our own memory so that we can have more aperture
         * available (256MB instead of simpenrose's default 64MB).
         */
        simpenrose_init_hardware_supply_mem(screen->simulator_mem_base,
                                            screen->simulator_mem_size);

        /* Carve out low memory for tile allocation overflow.  The kernel
         * should be automatically handling overflow memory setup on real
         * hardware, but for simulation we just get one shot to set up enough
         * overflow memory before execution.  This overflow mem will be used
         * up over the whole lifetime of simpenrose (not reused on each
         * flush), so it had better be big.
         */
        simpenrose_supply_overflow_mem(0, OVERFLOW_SIZE);

        mtx_unlock(&exec_mutex);
}

void
vc4_simulator_destroy(struct vc4_screen *screen)
{
        mtx_lock(&exec_mutex);
        if (!--sim_mem_refcount) {
                free(sim_mem_base);
                sim_mem_base = NULL;
        }
        mtx_unlock(&exec_mutex);
}

#endif /* USE_VC4_SIMULATOR */
