# Copyright (c) 2018 - 2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

from __future__ import print_function
from __future__ import absolute_import
from __future__ import division
from __future__ import unicode_literals
from future import standard_library
standard_library.install_aliases()
from builtins import *
from builtins import str
from builtins import range
import os, sys, struct, subprocess
import datetime, pytz
from nnir import *

# One row per supported NNIR tensor type code:
#   (NNIR type code, OpenVX data type enum name, element size)
# The element size is the factor the generator multiplies a tensor's element
# count by when it computes buffer sizes.
_TENSOR_TYPE_TABLE = (
    ('F032', 'VX_TYPE_FLOAT32', 4),
    ('F016', 'VX_TYPE_FLOAT16', 2),
    ('U016', 'VX_TYPE_UINT16',  2),
    ('I016', 'VX_TYPE_INT16',   2),
    ('U008', 'VX_TYPE_UINT8',   1),
)

# NNIR tensor type code -> name of the corresponding OpenVX data type enum.
tensor_type_nnir2openvx = {
    type_code: vx_enum_name for type_code, vx_enum_name, _ in _TENSOR_TYPE_TABLE
}

# NNIR tensor type code -> size of a single element.
tensor_type2size = {
    type_code: elem_size for type_code, _, elem_size in _TENSOR_TYPE_TABLE
}

def generateLicenseForCPP(f):
    """Write the MIT license banner, as a C/C++ block comment, to ``f``.

    The banner is followed by a one-line comment stating that the file was
    generated by nnir_to_clib.py, stamped with the current time in the
    America/Los_Angeles time zone (ISO 8601 text from ``isoformat()``).

    ``f`` is the output object; only its ``write`` method is used.
    """
    la_zone = pytz.timezone('America/Los_Angeles')
    generated_on = datetime.datetime.now(tz=la_zone).isoformat()
    # The template carries a single %s placeholder for the timestamp.
    banner = """/*
MIT License

Copyright (c) 2019 - 2022 Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

/* This file is generated by nnir_to_clib.py on %s */
"""
    f.write(banner % generated_on)

def generateLicenseForScript(f):
    """Write the MIT license banner, as '#'-prefixed comment lines, to ``f``.

    The banner is followed by a one-line comment stating that the file was
    generated by nnir_to_clib.py, stamped with the current time in the
    America/Los_Angeles time zone (ISO 8601 text from ``isoformat()``).

    ``f`` is the output object; only its ``write`` method is used.
    """
    la_zone = pytz.timezone('America/Los_Angeles')
    generated_on = datetime.datetime.now(tz=la_zone).isoformat()
    # The template carries a single %s placeholder for the timestamp.
    banner = """################################################################################
#
# MIT License
#
# Copyright (c) 2019 - 2022 Advanced Micro Devices, Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
################################################################################

# This file is generated by nnir_to_clib.py on %s
"""
    f.write(banner % generated_on)

def generateCMakeFiles(graph,outputFolder):
    """Write the CMake build files for the generated deployment project.

    Two files are created below ``outputFolder``:

    * ``CMakeLists.txt`` -- declares the ``mv_deploy`` shared library and the
      ``mvtestdeploy`` executable.
    * ``cmake/FindOpenCL.cmake`` -- an OpenCL find module; the ``cmake``
      directory is created first when it does not exist yet.

    Each file begins with the banner written by generateLicenseForScript, and
    a ``creating <path> ...`` message is printed before each file is opened.

    ``graph`` is accepted but not used by this function.
    """
    cmake_lists_body = """
cmake_minimum_required (VERSION 3.0)
project (mvdeploy)
set (CMAKE_CXX_STANDARD 11)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}/lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}/bin)
set(CMAKE_INSTALL_PREFIX /opt/rocm/mivisionx)

list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)

set(ROCM_PATH /opt/rocm CACHE PATH "ROCm Installation Path")
#find the OPENVX backend type
set(OPENVX_BACKEND_OPENCL_FOUND 0)
set(OPENVX_BACKEND_HIP_FOUND 0)
if(EXISTS ${ROCM_PATH}/mivisionx/include/openvx_backend.h)
    file(READ ${ROCM_PATH}/mivisionx/include/openvx_backend.h OPENVX_BACKEND_FILE)
    string(REGEX MATCH "ENABLE_OPENCL ([0-9]*)" _ ${OPENVX_BACKEND_FILE})
    set(OPENVX_BACKEND_OPENCL_FOUND ${CMAKE_MATCH_1})
    string(REGEX MATCH "ENABLE_HIP ([0-9]*)" _ ${OPENVX_BACKEND_FILE})
    set(OPENVX_BACKEND_HIP_FOUND ${CMAKE_MATCH_1})
else()
    message("-- ${Red}WARNING: ${ROCM_PATH}/mivisionx/include/openvx_backend.h file Not Found. please install the latest mivisionx! ${ColourReset}")
endif()

if (OPENVX_BACKEND_OPENCL_FOUND)
    find_package(OpenCL REQUIRED)
    include_directories (${OpenCL_INCLUDE_DIRS} ${OpenCL_INCLUDE_DIRS}/Headers )
endif()

find_package(OpenCV QUIET)
include_directories (/opt/rocm/mivisionx/include)
link_directories    (/opt/rocm/mivisionx/lib)
list(APPEND SOURCES mvmodule.cpp)
add_library(mv_deploy SHARED ${SOURCES})
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2 -std=gnu++14")
target_compile_definitions(mv_deploy PRIVATE ENABLE_MVDEPLOY=1)
target_link_libraries(mv_deploy openvx vx_nn pthread ${CMAKE_DL_LIBS})
install (TARGETS mv_deploy DESTINATION lib)

option (USE_POSTPROC  "Use postprocessing module implementation" OFF) 
add_executable(mvtestdeploy mvtestdeploy.cpp mvdeploy_api.cpp)
if (OpenCV_FOUND)
  target_compile_definitions(mvtestdeploy PUBLIC ENABLE_OPENCV=1)
  include_directories(${OpenCV_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}/lib)
  target_link_libraries(mvtestdeploy ${OpenCV_LIBRARIES})
else(OpenCV_FOUND)
  target_compile_definitions(mvtestdeploy PUBLIC ENABLE_OPENCV=0)
  include_directories(${PROJECT_SOURCE_DIR}/lib)
endif(OpenCV_FOUND)
#add optional postprocess module
if (USE_POSTPROC)
  include_directories ("${PROJECT_SOURCE_DIR}/mv_extras")
  add_subdirectory (mv_extras)
  set (EXTRA_LIBS ${EXTRA_LIBS} mv_extras)
endif (USE_POSTPROC)
target_compile_definitions (mvtestdeploy PRIVATE ENABLE_MVDEPLOY=1)
target_link_libraries(mvtestdeploy openvx vx_nn vx_amd_media pthread mv_deploy ${EXTRA_LIBS} ${CMAKE_DL_LIBS})
install (TARGETS mvtestdeploy DESTINATION bin)

"""
    find_opencl_body = """
find_path(OPENCL_INCLUDE_DIRS
    NAMES OpenCL/cl.h CL/cl.h
    HINTS
    ${OPENCL_ROOT}/include
    $ENV{AMDAPPSDKROOT}/include
    PATHS
    /usr/include
    /usr/local/include
    /opt/rocm/opencl/include
    DOC "OpenCL header file path"
    )
mark_as_advanced( OPENCL_INCLUDE_DIRS )

if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
    find_library( OPENCL_LIBRARIES
        NAMES OpenCL
        HINTS
        ${OPENCL_ROOT}/lib
        $ENV{AMDAPPSDKROOT}/lib
        DOC "OpenCL dynamic library path"
        PATH_SUFFIXES x86_64 x64 x86_64/sdk
        PATHS
        /usr/lib
        /opt/rocm/opencl/lib
        )
else( )
    find_library( OPENCL_LIBRARIES
        NAMES OpenCL
        HINTS
        ${OPENCL_ROOT}/lib
        $ENV{AMDAPPSDKROOT}/lib
        DOC "OpenCL dynamic library path"
        PATH_SUFFIXES x86 Win32

        PATHS
        /usr/lib
        )
endif( )
mark_as_advanced( OPENCL_LIBRARIES )

include( FindPackageHandleStandardArgs )
find_package_handle_standard_args( OPENCL DEFAULT_MSG OPENCL_LIBRARIES OPENCL_INCLUDE_DIRS )

set(OpenCL_FOUND ${OPENCL_FOUND} CACHE INTERNAL "")
set(OpenCL_LIBRARIES ${OPENCL_LIBRARIES} CACHE INTERNAL "")
set(OpenCL_INCLUDE_DIRS ${OPENCL_INCLUDE_DIRS} CACHE INTERNAL "")

if( NOT OPENCL_FOUND )
    message( STATUS "FindOpenCL looked for libraries named: OpenCL" )
endif()
"""

    def _emit(path, body):
        # Announce the file, then write the script license banner and the body.
        print('creating ' + path + ' ...')
        with open(path, 'w') as out:
            generateLicenseForScript(out)
            out.write(body)

    _emit(outputFolder + '/CMakeLists.txt', cmake_lists_body)

    # The find module lives in a 'cmake' sub-directory, created on demand.
    module_dir = outputFolder + '/cmake'
    if not os.path.isdir(module_dir):
        os.mkdir(module_dir)
    _emit(module_dir + '/FindOpenCL.cmake', find_opencl_body)

def generateCMakeExtras(graph,outputFolder):
    """Write the ``CMakeLists.txt`` for the ``mv_extras`` project.

    The file is created directly inside ``outputFolder``. It starts with the
    banner written by generateLicenseForScript, followed by a CMake script
    that declares a shared library built from ``mv_extras_postproc.cpp``.
    A ``creating <path> ...`` message is printed before the file is opened.

    ``graph`` is accepted but not used by this function.
    """
    extras_cmake_body = """
cmake_minimum_required (VERSION 3.0)
project (mv_extras)
set (CMAKE_CXX_STANDARD 11)
list(APPEND CMAKE_MODULE_PATH ../cmake)

set(ROCM_PATH /opt/rocm CACHE PATH "ROCm Installation Path")
#find the OPENVX backend type
set(OPENVX_BACKEND_OPENCL_FOUND 0)
set(OPENVX_BACKEND_HIP_FOUND 0)
if(EXISTS ${ROCM_PATH}/mivisionx/include/openvx_backend.h)
    file(READ ${ROCM_PATH}/mivisionx/include/openvx_backend.h OPENVX_BACKEND_FILE)
    string(REGEX MATCH "ENABLE_OPENCL ([0-9]*)" _ ${OPENVX_BACKEND_FILE})
    set(OPENVX_BACKEND_OPENCL_FOUND ${CMAKE_MATCH_1})
    string(REGEX MATCH "ENABLE_HIP ([0-9]*)" _ ${OPENVX_BACKEND_FILE})
    set(OPENVX_BACKEND_HIP_FOUND ${CMAKE_MATCH_1})
else()
    message("-- ${Red}WARNING: ${ROCM_PATH}/mivisionx/include/openvx_backend.h file Not Found. please install the latest mivisionx! ${ColourReset}")
endif()

if (OPENVX_BACKEND_OPENCL_FOUND)
    find_package(OpenCL REQUIRED)
    include_directories (${OpenCL_INCLUDE_DIRS} ${OpenCL_INCLUDE_DIRS}/Headers )
endif()

find_package(OpenCV QUIET)
include_directories (/opt/rocm/mivisionx/include ../)
link_directories    (/opt/rocm/mivisionx/lib)
add_library(${PROJECT_NAME} SHARED mv_extras_postproc.cpp)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2 -std=gnu++14")
if (OpenCV_FOUND)
  target_compile_definitions(mv_extras PUBLIC ENABLE_OPENCV=1)
  include_directories(${OpenCV_INCLUDE_DIRS})
  target_link_libraries(mv_extras ${OpenCV_LIBRARIES})
else(OpenCV_FOUND)
  target_compile_definitions(mv_extras PUBLIC ENABLE_OPENCV=0)
endif(OpenCV_FOUND)
target_link_libraries(mv_extras openvx vx_nn vx_amd_media pthread ${CMAKE_DL_LIBS})
"""
    cmake_lists_path = outputFolder + '/CMakeLists.txt'
    print('creating ' + cmake_lists_path + ' ...')
    with open(cmake_lists_path, 'w') as out:
        # License banner first, then the CMake script itself.
        generateLicenseForScript(out)
        out.write(extras_cmake_body)

def generateModuleCPP(graph,fileName):
    print('creating ' + fileName + ' ...')
    with open(fileName, 'w') as f:
        generateLicenseForCPP(f)
        if len(graph.inputs) < 1 or len(graph.outputs) < 1:
            f.write( \
"""
#include "mvdeploy.h"

MIVID_API_ENTRY void MIVID_API_CALL mvSetLogCallback(mivid_log_callback_f log_callback_f)
{
    return;
}

MIVID_API_ENTRY void MIVID_API_CALL mvSetPreProcessCallback(mivid_add_preprocess_callback_f preproc_f, mv_preprocess_callback_args *preproc_args)
{
    return;
}

MIVID_API_ENTRY void MIVID_API_CALL mvSetPostProcessCallback(mivid_add_postprocess_callback_f postproc_f)
{
    return;
}


MIVID_API_ENTRY mv_status MIVID_API_CALL mvQueryInference(int *num_inputs, int *num_outputs, const char **inp_out_config)
{
    return MV_FAILURE;
}

MIVID_API_ENTRY mivid_handle MIVID_API_CALL mvCreateInference(const char * binaryFilename, int mem_type)
{
    return nullptr;
}

MIVID_API_ENTRY mv_status MIVID_API_CALL mvCopyToTensorFromMem(mivid_handle handle, int input_num, void *input_data_ptr, size_t size, mivid_memory_type type)
{
    return MV_FAILURE;
}

MIVID_API_ENTRY mv_status MIVID_API_CALL mvCopyToTensorFromFile(mivid_handle handle, int input_num, const char *input_name, bool reverseOrder, float preprocess_mulfac, float preprocess_addfac)
{
    return MV_FAILURE;
}

MIVID_API_ENTRY mv_status MIVID_API_CALL mvGetOutput(mivid_handle handle, int output_num, void *out_tensor_mem, vx_size size)
{
    return MV_FAILURE;    
}

MIVID_API_ENTRY mv_status MIVID_API_CALL mvProcessInference(mivid_handle handle, float *ptime_in_ms, int num_iterations)
{
    return MV_FAILURE;
}

MIVID_API_ENTRY mv_status MIVID_API_CALL mvScheduleInference(mivid_handle handle);
{
    return MV_FAILURE;
}

MIVID_API_ENTRY mv_status MIVID_API_CALL mvWaitForCompletion(mivid_handle handle)
{
    return MV_FAILURE;
}

MIVID_API_ENTRY mv_status MIVID_API_CALL mvReleaseInference(mivid_handle handle)
{
    return MV_FAILURE;
}

""")

        else:
            input_shape = graph.inputs[0].shape
            input_elm_size = tensor_type2size[graph.inputs[0].type]
            output_elm_size =  tensor_type2size[graph.outputs[0].type]           
            input_buf_size = eval('*'.join([str(v) for v in input_shape])) * input_elm_size
            output_shape = []
            output_buf_size = []
            output_str = []
            input_str = []
            for i in range(len(graph.inputs)):
                if i==0:
                    config = 'input,' + graph.inputs[0].name + ',' + ','.join(str(v) for v in input_shape) + ';'
                else:
                    config += 'input' + str(i) + ',' +graph.inputs[i].name + ',' + ','.join(str(v) for v in input_shape) + ';'                    
                input_str.append('handle->inputs['+ str(i) + ']')
            for i in range(len(graph.outputs)):
                output_shape.append(graph.outputs[i].shape)
                output_buf_size.append(eval('*'.join([str(v) for v in output_shape[i]])) * tensor_type2size[graph.outputs[i].type])
                config += 'output' + str(i) + ',' +graph.outputs[i].name + ',' + ','.join(str(v) for v in output_shape[i])+';'
                output_str.append('handle->outputs['+ str(i) + ']')
            f.write( \
"""
#include "mvdeploy.h"

static mivid_log_callback_f g_mv_log_message_callback = nullptr;
static mivid_add_preprocess_callback_f g_mv_preprocess_callback = nullptr;
static mivid_add_postprocess_callback_f g_mv_postprocess_callback = nullptr;
static mv_preprocess_callback_args *g_mv_preproc_args = nullptr;

inline int64_t clockCounter()
{
    return std::chrono::high_resolution_clock::now().time_since_epoch().count();
}

inline int64_t clockFrequency()
{
    return std::chrono::high_resolution_clock::period::den / std::chrono::high_resolution_clock::period::num;
}

static void MIVID_CALLBACK log_callback(vx_context context, vx_reference ref, vx_status status, const vx_char string[])
{
    if (g_mv_log_message_callback) {
        g_mv_log_message_callback(string);
    }else
    {
        size_t len = strlen(string);
        if (len > 0) {
            printf("%%s", string);
            if (string[len - 1] != '\\n')
                printf("\\n");
            fflush(stdout);
        }
    }
}

static mv_status MIVID_CALLBACK preprocess_callback(mivid_session session, vx_tensor inp_tensor)
{
    if (g_mv_preprocess_callback) {
        return (mv_status)g_mv_preprocess_callback(session, inp_tensor, g_mv_preproc_args);
    }else
    {
        printf("ERROR: preprocess callback function is not set by user \\n");
        return MV_FAILURE;
    }
}

static mv_status MIVID_CALLBACK postprocess_callback(mivid_session session, vx_tensor outp_tensor)
{
    if (g_mv_preprocess_callback) {
        return (mv_status)g_mv_postprocess_callback(session, outp_tensor);
    }else
    {
        printf("ERROR: postprocess callback function is not set by user \\n");
        return MV_FAILURE;
    }
}

static vx_status initializeTensor(vx_context context, vx_tensor tensor, FILE * fp, const char * binaryFilename)
{
    vx_enum data_type = VX_TYPE_FLOAT32;
    vx_size num_of_dims = 4, dims[4] = { 1, 1, 1, 1 }, stride[4];
    ERROR_CHECK_STATUS(vxQueryTensor(tensor, VX_TENSOR_DATA_TYPE, &data_type, sizeof(vx_enum)));
    ERROR_CHECK_STATUS(vxQueryTensor(tensor, VX_TENSOR_NUMBER_OF_DIMS, &num_of_dims, sizeof(vx_size)));
    ERROR_CHECK_STATUS(vxQueryTensor(tensor, VX_TENSOR_DIMS, &dims, num_of_dims * sizeof(vx_size)));
    vx_size itemsize = sizeof(float);
    if(data_type == VX_TYPE_UINT8 || data_type == VX_TYPE_INT8) {
        itemsize = sizeof(vx_uint8);
    }
    else if(data_type == VX_TYPE_UINT16 || data_type == VX_TYPE_INT16 || data_type == VX_TYPE_FLOAT16) {
        itemsize = sizeof(vx_uint16);
    }
    vx_size count = dims[0] * dims[1] * dims[2] * dims[3];

    vx_uint32 h[2] = { 0 };
    fread(h, 1, sizeof(h), fp);
    if(h[0] != 0xf00dd1e1 || (vx_size)h[1] != (count*itemsize)) {
      vxAddLogEntry((vx_reference)tensor, VX_FAILURE, "ERROR: invalid data (magic,size)=(0x%%x,%%d) in %%s at byte position %%d -- expected size is %%ld\\n", h[0], h[1], binaryFilename, ftell(fp)-sizeof(h), count*itemsize);
      return VX_FAILURE;
    }

    vx_map_id map_id;
    void * ptr;
    ERROR_CHECK_STATUS(vxMapTensorPatch(tensor, num_of_dims, nullptr, nullptr, &map_id, stride, (void **)&ptr, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST));
    vx_size n = fread(ptr, itemsize, count, fp);
    if(n != count) {
        vxAddLogEntry((vx_reference)tensor, VX_FAILURE, "ERROR: expected char[%%ld], but got char[%%ld] in %%s\\n", count*itemsize, n*itemsize, binaryFilename);
        return VX_FAILURE;
    }
    ERROR_CHECK_STATUS(vxUnmapTensorPatch(tensor, map_id));

    return VX_SUCCESS;
}


//! \brief Set callback for log messages.
//  - by default, log messages from library will be printed to stdout
//  - the log messages can be redirected to application using a callback
MIVID_API_ENTRY void MIVID_API_CALL mvSetLogCallback(mivid_log_callback_f log_callback_f)
{
    g_mv_log_message_callback = log_callback_f;
}

//! \brief: load and add preprocessing module/nodes to graph if needed.
// need to call this before calling CreateInferenceSession
// output of the preprocessing node should be same as input tensor NN module
MIVID_API_ENTRY void MIVID_API_CALL mvSetPreProcessCallback(mivid_add_preprocess_callback_f preproc_f, mv_preprocess_callback_args *preproc_args)
{
    g_mv_preprocess_callback = preproc_f;
    g_mv_preproc_args = preproc_args;
}

//! \brief: load and add postprocessing modules/nodes to graph if needed.
// need to call this before calling CreateInferenceSession
// input to the preprocessing node should be same as output tensor of NN module
MIVID_API_ENTRY void MIVID_API_CALL mvSetPostProcessCallback(mivid_add_postprocess_callback_f postproc_f)
{
    g_mv_postprocess_callback = postproc_f;
}


MIVID_API_ENTRY vx_status MIVID_API_CALL mvAddToGraph(vx_graph graph, %s, %s, const char * binaryFilename)
{
    vx_context context = vxGetContext((vx_reference)graph);
    ERROR_CHECK_OBJECT(context);

    // create variables
""" % (', '.join(['vx_tensor ' + tensor.name for tensor in graph.inputs]), \
       ', '.join(['vx_tensor ' + tensor.name for tensor in graph.outputs])))
        for tensor in graph.initializers:
            f.write( \
"""    vx_size dims_%s[%d] = { %s };
    vx_tensor %s = vxCreateTensor(context, %d, dims_%s, %s, 0);
    ERROR_CHECK_OBJECT(%s);
""" %(tensor.name, len(tensor.shape), ', '.join([str(v) for v in reversed(tensor.shape)]), \
      tensor.name, len(tensor.shape), tensor.name, tensor_type_nnir2openvx[tensor.type], tensor.name))
        f.write( \
"""
    // initialize variables
    FILE * fp__variables = fopen(binaryFilename, "rb");
    if(!fp__variables) {
        vxAddLogEntry((vx_reference)context, VX_FAILURE, "ERROR: unable to open: %s\\n", binaryFilename);
        return VX_FAILURE;
    }
    { vx_uint32 magic = 0;
      fread(&magic, 1, sizeof(magic), fp__variables);
      if(magic != 0xf00dd1e0) {
        vxAddLogEntry((vx_reference)context, VX_FAILURE, "ERROR: invalid file magic in %s\\n", binaryFilename);
        return VX_FAILURE;
      }
    }
""")
        for tensor in graph.initializers:
            f.write( \
"""    ERROR_CHECK_STATUS(initializeTensor(context, %s, fp__variables, binaryFilename));
""" %(tensor.name))
        f.write( \
"""    { vx_uint32 magic = 0;
      fread(&magic, 1, sizeof(magic), fp__variables);
      if(magic != 0xf00dd1e2) {
        vxAddLogEntry((vx_reference)context, VX_FAILURE, "ERROR: invalid eoff magic in %s\\n", binaryFilename);
        return VX_FAILURE;
      }
      fclose(fp__variables);
    }

    // create local tensors used in graph
""")
        localList = []
        for tensor in graph.locals:
            localList.append(tensor.name)
        outputList = []
        for tensor in graph.outputs:
            outputList.append(tensor.name)
        for idx, tensor in enumerate(graph.locals):
            if (not tensor.name in outputList) and (not tensor.name in localList[:idx]):
                f.write( \
"""    vx_size dims_%s[%d] = { %s };
    vx_tensor %s = vxCreateVirtualTensor(graph, %d, dims_%s, %s, 0);
    ERROR_CHECK_OBJECT(%s);
""" %(tensor.name, len(tensor.shape), ', '.join([str(v) for v in reversed(tensor.shape)]), \
      tensor.name, len(tensor.shape), tensor.name, tensor_type_nnir2openvx[tensor.type], tensor.name))
        f.write( \
"""
    // create nodes in graph
""")
        for node in graph.nodes:
            if node.type == 'conv':
                pads = node.attr.get('pads')
                dilations = node.attr.get('dilations')
                f.write( \
"""
    { vx_nn_convolution_params_t conv_params = { 0 };
      conv_params.padding_x = %d;
      conv_params.padding_y = %d;
      conv_params.overflow_policy = VX_CONVERT_POLICY_SATURATE;
      conv_params.rounding_policy = VX_ROUND_POLICY_TO_NEAREST_EVEN;
      conv_params.down_scale_size_rounding = VX_NN_DS_SIZE_ROUNDING_FLOOR;
      conv_params.dilation_x = %d;
      conv_params.dilation_y = %d;
      vx_node node = vxConvolutionLayer(graph, %s, %s, %s, &conv_params, sizeof(conv_params), %s);
      ERROR_CHECK_OBJECT(node);
""" % (pads[0], pads[1], dilations[0] - 1, dilations[1] - 1, \
      node.inputs[0], node.inputs[1], node.inputs[2] if len(node.inputs) == 3 else 'nullptr', node.outputs[0]))
                if (node.attr.get('mode') != 0):
                    f.write( \
"""      vx_float32 alpha = 0;
      vx_scalar s_alpha = vxCreateScalarWithSize(context, VX_TYPE_FLOAT32, &alpha, sizeof(alpha));
      ERROR_CHECK_STATUS(vxSetParameterByIndex(node, 5, (vx_reference) s_alpha));
      ERROR_CHECK_STATUS(vxReleaseScalar(&s_alpha));
""")
                f.write( \
"""      ERROR_CHECK_STATUS(vxReleaseNode(&node));
    }
""")
            elif node.type == 'conv_transpose':
                pads = node.attr.get('pads')
                dilations = node.attr.get('dilations')
                kernel_shape = node.attr.get('kernel_shape')
                output_pads = [(dilations[0] - 1) * (kernel_shape[0] - 1), \
                                (dilations[1] - 1) * (kernel_shape[1] - 1)]
                f.write( \
"""
    { vx_nn_deconvolution_params_t conv_params = { 0 };
      conv_params.padding_x = %d;
      conv_params.padding_y = %d;
      conv_params.overflow_policy = VX_CONVERT_POLICY_SATURATE;
      conv_params.rounding_policy = VX_ROUND_POLICY_TO_NEAREST_EVEN;
      conv_params.a_x = %d;
      conv_params.a_y = %d;
      vx_node node = vxDeconvolutionLayer(graph, %s, %s, %s, &conv_params, sizeof(conv_params), %s);
      ERROR_CHECK_OBJECT(node);
      ERROR_CHECK_STATUS(vxReleaseNode(&node));
    }
""" % (pads[0], pads[1], output_pads[0] , output_pads[1] , \
      node.inputs[0], node.inputs[1], node.inputs[2] if len(node.inputs) == 3 else 'nullptr', node.outputs[0]))
            elif node.type == 'gemm':
                alpha = node.attr.get('alpha')
                beta = node.attr.get('beta')
                transA = node.attr.get('transA')
                transB = node.attr.get('transB')
                hasBias = False
                if beta == 1.0 and len(node.inputs) == 3 and len(graph.tensor_shapes[node.inputs[2]]) <= 2:
                    hasBias = True
                if alpha == 1.0 and transA == 0 and transB == 1 and (beta == 0.0 or hasBias):
                    f.write( \
"""
    { vx_node node = vxFullyConnectedLayer(graph, %s, %s, %s, VX_CONVERT_POLICY_SATURATE, VX_ROUND_POLICY_TO_NEAREST_EVEN, %s);
      ERROR_CHECK_OBJECT(node);
      ERROR_CHECK_STATUS(vxReleaseNode(&node));
    }
""" % ( \
        node.inputs[0], node.inputs[1], node.inputs[2] if hasBias else 'nullptr', node.outputs[0]))
                else:
                    raise ValueError("Unsupported gemm configuration by OpenVX: alpha={} beta={} transA={} transB={}".format(alpha, beta, transA, transB))
            elif node.type == 'max_pool' or node.type == 'avg_pool':
                f.write( \
"""
    { vx_node node = vxPoolingLayer(graph, %s, %s, %d, %d, %d, %d, VX_ROUND_POLICY_TO_NEAREST_EVEN, %s);
      ERROR_CHECK_OBJECT(node);
      vx_enum border_mode = %d;
      vx_scalar s_border_mode = vxCreateScalarWithSize(context, VX_TYPE_ENUM, &border_mode, sizeof(border_mode));
      ERROR_CHECK_OBJECT(s_border_mode);
      ERROR_CHECK_STATUS(vxSetParameterByIndex(node, 8, (vx_reference) s_border_mode));
      ERROR_CHECK_STATUS(vxReleaseScalar(&s_border_mode));
""" % (node.inputs[0], 'VX_NN_POOLING_AVG' if node.type == 'avg_pool' else 'VX_NN_POOLING_MAX', \
       node.attr.get('kernel_shape')[0], node.attr.get('kernel_shape')[1], \
       node.attr.get('pads')[0], node.attr.get('pads')[1], node.outputs[0], \
       (1 if node.attr.get('border_mode') == 'discard' else 0)))
                if (node.attr.get('mode') != 0):
                    f.write( \
"""      vx_int32 mode = %s;
      vx_scalar s_mode = vxCreateScalarWithSize(context, VX_TYPE_INT32, &mode, sizeof(mode));
      ERROR_CHECK_STATUS(vxSetParameterByIndex(node, 9, (vx_reference) s_mode));
      ERROR_CHECK_STATUS(vxReleaseScalar(&s_mode));
""" % (node.attr.get('mode')))
                f.write( \
"""      ERROR_CHECK_STATUS(vxReleaseNode(&node));
    }
""")
            elif node.type == 'global_avg_pool':
                f.write( \
"""
    { vx_node node = vxPoolingLayer(graph, %s, VX_NN_POOLING_AVG, %d, %d, %d, %d, VX_ROUND_POLICY_TO_NEAREST_EVEN, %s);
      ERROR_CHECK_OBJECT(node);
""" % (node.inputs[0], graph.tensor_shapes[node.inputs[0]][2], graph.tensor_shapes[node.inputs[0]][3], \
       node.attr.get('pads')[0], node.attr.get('pads')[1], node.outputs[0]))
                if (node.attr.get('mode') != 0):
                    f.write( \
"""      vx_int32 mode = %s;
      vx_scalar s_mode = vxCreateScalarWithSize(context, VX_TYPE_INT32, &mode, sizeof(mode));
      ERROR_CHECK_STATUS(vxSetParameterByIndex(node, 9, (vx_reference) s_mode));
      ERROR_CHECK_STATUS(vxReleaseScalar(&s_mode));
""" % (node.attr.get('mode')))
                f.write( \
"""      ERROR_CHECK_STATUS(vxReleaseNode(&node));
    }
""")
            elif node.type == 'relu':
                f.write( \
"""
    { vx_node node = vxActivationLayer(graph, %s, VX_NN_ACTIVATION_RELU, 0.0f, 0.0f, %s);
      ERROR_CHECK_OBJECT(node);
      ERROR_CHECK_STATUS(vxReleaseNode(&node));
    }
""" % (node.inputs[0], node.outputs[0]))
            elif node.type == 'leaky_relu':
                f.write( \
"""
    {  vx_node node = vxActivationLayer(graph, %s, VX_NN_ACTIVATION_LEAKY_RELU, %f, 0.0f, %s);
       ERROR_CHECK_OBJECT(node);
       ERROR_CHECK_STATUS(vxReleaseNode(&node));
    }
""" % (node.inputs[0], node.attr.get('alpha'), node.outputs[0]))
            elif node.type == 'add' or node.type == 'sum':
                if len(node.inputs) == 2:
                    f.write( \
"""
    { vx_node node = vxTensorAddNode(graph, %s, %s, VX_CONVERT_POLICY_SATURATE, %s);
      ERROR_CHECK_OBJECT(node);
      ERROR_CHECK_STATUS(vxReleaseNode(&node));
    }
""" % (node.inputs[0], node.inputs[1], node.outputs[0]))
                else:
                    raise ValueError("Unsupported number of input arguments by OpenVX: {}".format(node.type))
            elif node.type == 'sub':
                if len(node.inputs) == 2:
                    f.write( \
"""
    { vx_node node = vxTensorSubtractNode(graph, %s, %s, VX_CONVERT_POLICY_SATURATE, %s);
      ERROR_CHECK_OBJECT(node);
      ERROR_CHECK_STATUS(vxReleaseNode(&node));
    }
""" % (node.inputs[0], node.inputs[1], node.outputs[0]))
                else:
                    raise ValueError("Unsupported number of input arguments by OpenVX: {}".format(node.type))
            elif node.type == 'mul':
                if len(node.inputs) == 2:
                    f.write( \
"""
    { vx_float32 value = 1.0f;
      vx_scalar scale = vxCreateScalar(context, VX_TYPE_FLOAT32, &value);
      ERROR_CHECK_OBJECT(scale);
      vx_node node = vxTensorMultiplyNode(graph, %s, %s, scale, VX_CONVERT_POLICY_SATURATE, VX_ROUND_POLICY_TO_NEAREST_EVEN, %s);
      ERROR_CHECK_STATUS(vxReleaseScalar(&scale));
      ERROR_CHECK_OBJECT(node);
      ERROR_CHECK_STATUS(vxReleaseNode(&node));
    }
""" % (node.inputs[0], node.inputs[1], node.outputs[0]))
                else:
                    raise ValueError("Unsupported number of input arguments by OpenVX: {}".format(node.type))
            elif node.type == 'muladd':
                tensor = graph.tensor_dict[node.inputs[0]]
                f.write( \
"""
    { vx_float32 value = 1.0f;
      vx_scalar scale = vxCreateScalar(context, VX_TYPE_FLOAT32, &value);
      ERROR_CHECK_OBJECT(scale);
      vx_size dims[%d] = { %s };
      vx_tensor tmp__tensor = vxCreateVirtualTensor(graph, %d, dims, %s, 0);
      ERROR_CHECK_OBJECT(tmp__tensor);
      vx_node node = vxTensorMultiplyNode(graph, %s, %s, scale, VX_CONVERT_POLICY_SATURATE, VX_ROUND_POLICY_TO_NEAREST_EVEN, tmp__tensor);
      ERROR_CHECK_STATUS(vxReleaseScalar(&scale));
      ERROR_CHECK_OBJECT(node);
      ERROR_CHECK_STATUS(vxReleaseNode(&node));
      node = vxTensorAddNode(graph, tmp__tensor, %s, VX_CONVERT_POLICY_SATURATE, %s);
      ERROR_CHECK_OBJECT(node);
      ERROR_CHECK_STATUS(vxReleaseNode(&node));
    }
""" % (len(tensor.shape), ', '.join([str(v) for v in reversed(tensor.shape)]), len(tensor.shape), \
       tensor_type_nnir2openvx[tensor.type], node.inputs[0], node.inputs[1], node.inputs[2], node.outputs[0]))
            elif node.type == 'batch_norm':
                f.write( \
"""
    { vx_node node = vxBatchNormalizationLayer(graph, %s, %s, %s, %s, %s, %ef, %s);
      ERROR_CHECK_OBJECT(node);
      ERROR_CHECK_STATUS(vxReleaseNode(&node));
    }
""" % (node.inputs[0], node.inputs[3], node.inputs[4], node.inputs[1], node.inputs[2], node.attr.get('epsilon'), node.outputs[0]))
            elif node.type == 'lrn':
                f.write( \
"""
    { vx_node node = vxNormalizationLayer(graph, %s, %s , %d, %ef, %ef, %s);
""" % (node.inputs[0], "VX_NN_NORMALIZATION_SAME_MAP" if node.attr.get('mode') == 0 else "VX_NN_NORMALIZATION_ACROSS_MAPS" , \
       node.attr.get('size'), node.attr.get('alpha'), node.attr.get('beta'), node.outputs[0]))
                if (node.attr.get('bias') != 1.0):
                    f.write( \
"""   vx_float32 bias = %s;
      vx_scalar s_bias = vxCreateScalarWithSize(context, VX_TYPE_FLOAT32, &bias, sizeof(bias));
      ERROR_CHECK_STATUS(vxSetParameterByIndex(node, 6, (vx_reference) s_bias));
      ERROR_CHECK_STATUS(vxReleaseScalar(&s_bias));
""" % (node.attr.get('bias')))
                f.write( \
"""   ERROR_CHECK_OBJECT(node);
      ERROR_CHECK_STATUS(vxReleaseNode(&node));
    }
""")                        
            elif node.type == 'slice':
                f.write( \
"""
    { vx_node node = vxSliceLayer(graph, %s, %s, %s);
      ERROR_CHECK_OBJECT(node);
      ERROR_CHECK_STATUS(vxReleaseNode(&node));
    }
""" % (node.inputs[0], ', '.join([name for name in node.outputs]), \
       ', '.join(['nullptr' for i in range(8 - len(node.outputs))])))
            elif node.type == 'concat':
                f.write( \
"""
    { vx_node node = vxConcatLayer(graph, %s, %s, %s);
      ERROR_CHECK_OBJECT(node);
      ERROR_CHECK_STATUS(vxReleaseNode(&node));
    }
""" % (node.outputs[0], ', '.join([name for name in node.inputs]), \
       ', '.join(['nullptr' for i in range(8 - len(node.inputs))])))
            elif node.type == 'softmax':
                f.write( \
"""
    { vx_node node = vxSoftmaxLayer(graph, %s, %s);
      ERROR_CHECK_OBJECT(node);
      ERROR_CHECK_STATUS(vxReleaseNode(&node));
    }
""" % (node.inputs[0], node.outputs[0]))
            elif node.type == 'reshape':
                f.write( \
"""
    { vx_node node = vxReshapeLayer(graph, %s, %s);
      ERROR_CHECK_OBJECT(node);
      ERROR_CHECK_STATUS(vxReleaseNode(&node));
    }
""" % (node.inputs[0], node.outputs[0]))
            elif node.type == 'copy'or node.type == 'transpose':
                f.write( \
"""
      ERROR_CHECK_OBJECT(node);
      ERROR_CHECK_STATUS(vxReleaseNode(&node));
    }
""" % (node.inputs[0], node.outputs[0]))
            elif node.type == 'upsample':
                f.write( \
"""
    { vx_node node = vxUpsampleNearestLayer(graph, %s, %s);
      ERROR_CHECK_OBJECT(node);
      ERROR_CHECK_STATUS(vxReleaseNode(&node));
    }    
""" % (node.inputs[0], node.outputs[0]))
            else:
                raise ValueError("Unsupported node by OpenVX: {}".format(node.type))
        f.write( \
"""
    // release local tensors
""")
        for idx, tensor in enumerate(graph.locals):
            if (not tensor.name in outputList) and (not tensor.name in localList[:idx]):
                f.write( \
"""    ERROR_CHECK_STATUS(vxReleaseTensor(&%s));
""" %(tensor.name))
        f.write( \
"""
    // release initializer tensors
""")
        for tensor in graph.initializers:
            f.write( \
"""    ERROR_CHECK_STATUS(vxReleaseTensor(&%s));
""" %(tensor.name))
        f.write( \
"""
    return VX_SUCCESS;
}

const char * MIVID_API_CALL mvQueryInference(int *num_inputs, int *num_outputs)
{
    *num_inputs = %d;
    *num_outputs = %d;
    return "%s";
}
""" % (len(graph.inputs), len(graph.outputs), config))
        f.write( \
"""

MIVID_API_ENTRY mivid_handle MIVID_API_CALL mvCreateInference(const char * binaryFilename, int mem_type)
{
    bool successful = false;

    mivid_handle handle = new mivid_handle_t();
    memset(handle, 0, sizeof(mivid_handle_t));
    if(!handle) {
        printf("ERROR: new mv_handle: failed (nullptr)\\n");
        return handle;
    }
    else {
        vx_status status;
        handle->mv_add_preprocess_cb = g_mv_preprocess_callback;
        handle->mv_add_postprocess_cb = g_mv_postprocess_callback;        
        handle->context = vxCreateContext();
        if((status = vxGetStatus((vx_reference)handle->context)) != VX_SUCCESS) {
            printf("ERROR: vxCreateContext: failed (%d)\\n", status);
        }
        else {
            vxRegisterLogCallback(handle->context, log_callback, vx_false_e);
            handle->graph = vxCreateGraph(handle->context);
            if((status = vxGetStatus((vx_reference)handle->graph)) != VX_SUCCESS) {
                printf("ERROR: vxCreateGraph: failed (%d)\\n", status);
                goto fallback;
            }
            vx_tensor input_tensor;
            vx_tensor output_tensor;
            vx_size inp_stride[4];
            handle->mem_type_in = mem_type;
            if (( status = vxLoadKernels(handle->context, "vx_nn")) != VX_SUCCESS) {
                printf("ERROR: vxLoadKernels for vx_nn: failed (%d)\\n", status);
                goto fallback;
            }
""")
        for tensor in graph.inputs:
            f.write( \
"""            vx_size inp_dim_%s[%d] = { %s };
            inp_stride[0] = %d, inp_stride[1] = %d, inp_stride[2] = %d, inp_stride[3] = %d;
            void *inp_mem = nullptr;
            if (g_mv_preprocess_callback != nullptr) {
                input_tensor = vxCreateVirtualTensor(handle->graph, 4, inp_dim_%s, %s, 0);
                preprocess_callback(handle, input_tensor);
                handle->inputs.push_back(input_tensor);
            } else {
                if (!mem_type) {
                    input_tensor = vxCreateTensorFromHandle(handle->context, 4, inp_dim_%s, %s, 0, inp_stride, inp_mem, VX_MEMORY_TYPE_HOST);
                }
                else {
#if OPENVX_BACKEND_OPENCL_FOUND
                    input_tensor = vxCreateTensorFromHandle(handle->context, 4, inp_dim_%s, %s, 0, inp_stride, inp_mem, VX_MEMORY_TYPE_OPENCL);
#elif OPENVX_BACKEND_HIP_FOUND
                    input_tensor = vxCreateTensorFromHandle(handle->context, 4, inp_dim_%s, %s, 0, inp_stride, inp_mem, VX_MEMORY_TYPE_HIP);
#endif
                }
                if ((status = vxGetStatus((vx_reference)input_tensor)) != VX_SUCCESS) {
                    printf("ERROR: vxCreateTensor(input:[%s]): failed (%%d)\\n", status);
                }else
                {
                    handle->inputs.push_back(input_tensor);
                }
            }
""" % (tensor.name, len(tensor.shape), ', '.join([str(v) for v in reversed(tensor.shape)]), \
        input_elm_size, tensor.shape[3]*input_elm_size, tensor.shape[2]*tensor.shape[3]*input_elm_size, tensor.shape[1]*tensor.shape[2]*tensor.shape[3]*input_elm_size, \
        tensor.name, tensor_type_nnir2openvx[tensor.type], tensor.name, tensor_type_nnir2openvx[tensor.type], tensor.name, tensor_type_nnir2openvx[tensor.type], \
        tensor.name, tensor_type_nnir2openvx[tensor.type], tensor.name))
        for tensor in graph.outputs:
            f.write( \
"""            vx_size out_dim_%d[%d] = { %s };
            output_tensor = vxCreateTensor(handle->context, %d, out_dim_%d, %s, 0);
            if ((status = vxGetStatus((vx_reference)output_tensor)) != VX_SUCCESS) {
                printf("ERROR: vxCreateTensor(output:[%s]): failed (%%d)\\n", status);
            }
            else {
                handle->outputs.push_back(output_tensor);
""" % (i, len(tensor.shape), ', '.join([str(v) for v in reversed(tensor.shape)]), \
       len(tensor.shape), i, tensor_type_nnir2openvx[tensor.type], 'x'.join([str(v) for v in tensor.shape])))
            f.write( \
"""
                if((status = mvAddToGraph(handle->graph, %s, %s, binaryFilename)) != VX_SUCCESS) {
                    printf("ERROR: mvAddToGraph: failed (%%d)\\n", status);
                }
                else if((status = vxVerifyGraph(handle->graph)) != VX_SUCCESS) {
                    printf("ERROR: vxVerifyGraph: failed (%%d)\\n", status);
                }
                else {
                    successful = true;
                }
            }
        }
    }
fallback:
    if(!successful && handle) {
""" % (', '.join(input_str), ', '.join(output_str)))
            for i in range(len(graph.inputs)):
                f.write( \
"""        if(handle->inputs[%d])
            vxReleaseTensor(&handle->inputs[%d]);
""" % ( i, i))          
            for i in range(len(graph.outputs)):
                f.write( \
"""        if(handle->outputs[%d])
            vxReleaseTensor(&handle->outputs[%d]);
""" % ( i, i))          
            f.write( \
"""        if(handle->graph)
            vxReleaseGraph(&handle->graph);
        if(handle->context)
            vxReleaseContext(&handle->context);
        delete handle;
        handle = nullptr;
    }
    return handle;
}

static mv_status copyTensor(std::string fileName, vx_size *dims, vx_size *stride, void *write_ptr, vx_enum data_type, float preprocess_mulfac, float preprocess_addfac)
{
#if ENABLE_OPENCV
    if(dims[2] == 3 && fileName.size() > 4 && (fileName.substr(fileName.size()-4, 4) == ".png" || fileName.substr(fileName.size()-4, 4) == ".jpg"))
    {
        for(size_t n = 0; n < dims[3]; n++) {
            char imgFileName[1024];
            sprintf(imgFileName, fileName.c_str(), (int)n);
            Mat img = imread(imgFileName, CV_LOAD_IMAGE_COLOR);
            if(!img.data || img.rows != dims[1] || img.cols != dims[0]) {
                printf("ERROR: invalid image or dimensions: %%s\\n", imgFileName);
                return MV_ERROR_INVALID_TYPE;
            }
            for(vx_size y = 0; y < dims[1]; y++) {
                unsigned char * src = img.data + y*dims[0]*3;
                if(data_type == VX_TYPE_FLOAT32) {
                    float * dstR = (float *)write_ptr + ((n * stride[3] + y * stride[1]) >> 2);
                    float * dstG = dstR + (stride[2] >> 2);
                    float * dstB = dstG + (stride[2] >> 2);
                    for(vx_size x = 0; x < dims[0]; x++, src += 3) {
                        *dstR++ = src[2]*preprocess_mulfac + preprocess_addfac;
                        *dstG++ = src[1]*preprocess_mulfac + preprocess_addfac;
                        *dstB++ = src[0]*preprocess_mulfac + preprocess_addfac;
                    }
                } else
                {
                    unsigned short * dstR = (unsigned short *)write_ptr + ((n * stride[3] + y * stride[1]) >> 2);
                    unsigned short * dstG = dstR + (stride[2] >> 2);
                    unsigned short * dstB = dstG + (stride[2] >> 2);                    
                    for(vx_size x = 0; x < dims[0]; x++, src += 3) {
                        *dstR++ = _cvtss_sh((float)(src[2]*preprocess_mulfac + preprocess_addfac), 0);
                        *dstG++ = _cvtss_sh((float)(src[2]*preprocess_mulfac + preprocess_addfac), 0);
                        *dstB++ = _cvtss_sh((float)(src[2]*preprocess_mulfac + preprocess_addfac), 0);
                    }
                }
            }
        }
    }
    else
#endif    
    {
        FILE * fp = fopen(fileName.c_str(), "rb");
        if(!fp) {
            std::cerr << "ERROR: unable to open: " << fileName << std::endl;
            return MV_ERROR_INVALID_REFERENCE;
        }
        for(size_t n = 0; n < dims[3]; n++) {
            for(size_t c = 0; c < dims[2]; c++) {
                for(size_t y = 0; y < dims[1]; y++) {
                    if(data_type == VX_TYPE_FLOAT32) {
                        float * ptrY = (float *)write_ptr + ((n * stride[3] + c * stride[2] + y * stride[1]) >> 2);
                        vx_size n = fread(ptrY, sizeof(float), dims[0], fp);
                        if(n != dims[0]) {
                            std::cerr << "ERROR: couldn't read expected num. of bytes from file " << fileName << std::endl;
                            return MV_ERROR_INVALID_DIMENSION;
                        }
                    } else {
                        unsigned short * ptrY = (unsigned short *)write_ptr + ((n * stride[3] + c * stride[2] + y * stride[1]) >> 2);
                        vx_size n = fread(ptrY, sizeof(unsigned short), dims[0], fp);
                        if(n != dims[0]) {
                            std::cerr << "couldn't read expected num. of bytes from file " << fileName << std::endl;
                            return MV_ERROR_INVALID_DIMENSION;
                        }
                    }
                }
            }
        }
        fclose(fp);
    }
    return MV_SUCCESS;
}


MIVID_API_ENTRY mv_status MIVID_API_CALL mvCopyToTensorFromMem(mivid_handle handle, int input_num, void *input_data_ptr, size_t size, mivid_memory_type type)
{
    // create local tensors used in graph: mem_type = 0 for CPU buffer and 1 for OpenCL
    vx_status status = VX_SUCCESS;
    if (!input_data_ptr || !handle) {
        printf("ERROR: mvCopyToTensorFromMem: invalid input memory pointer or inference handle\\n");
        return MV_FAILURE;
    }else {
        // application is passing device memory pointer, do SwapTensorHandle()
        vx_enum data_type = VX_TYPE_FLOAT32;
        vx_size num_of_dims = 4, dims[4] = { 1, 1, 1, 1 }, stride[4];
        vxQueryTensor(handle->inputs[input_num], VX_TENSOR_DATA_TYPE, &data_type, sizeof(data_type));
        vxQueryTensor(handle->inputs[input_num], VX_TENSOR_NUMBER_OF_DIMS, &num_of_dims, sizeof(num_of_dims));
        vxQueryTensor(handle->inputs[input_num], VX_TENSOR_DIMS, &dims, sizeof(dims[0])*num_of_dims);
        if((data_type != VX_TYPE_FLOAT32) && (data_type != VX_TYPE_FLOAT16)) {
            std::cerr << "ERROR: mvCopyToTensorFromMem() supports only VX_TYPE_FLOAT32 or VX_TYPE_FLOAT16:" << std::endl;
            return MV_ERROR_INVALID_TYPE;
        }
        vx_size count = dims[0] * dims[1] * dims[2] * dims[3];
        if (size != count*%d )
            return MV_ERROR_INVALID_VALUE;

        if ((status = vxSwapTensorHandle(handle->inputs[input_num], input_data_ptr, nullptr)) != VX_SUCCESS) {
            printf("ERROR: mvCopyToTensorFromMem: vxSwapTensorHandle: failed (%%d)\\n", status);
        }
    }
    return (mv_status)status;
}
""" % (input_elm_size))          
            f.write( \
"""

MIVID_API_ENTRY mv_status MIVID_API_CALL mvCopyToTensorFromFile(mivid_handle handle, int input_num, const char *input_name, bool reverseOrder, float preprocess_mulfac, float preprocess_addfac)
{
    // create local tensors used in graph: mem_type = 0 for CPU buffer and 1 for OpenCL
    vx_status status = VX_SUCCESS;
    if (input_name == nullptr) {
        printf("ERROR: mvCopyToTensorFromFile: invalid input memory pointer or inference handle\\n");
        return MV_FAILURE;
    }else {
        std::string fileName(input_name);
        // application is passing device memory pointer, do SwapTensorHandle()
        vx_enum data_type = VX_TYPE_FLOAT32;
        vx_size num_of_dims = 4, dims[4] = { 1, 1, 1, 1 };
        vxQueryTensor(handle->inputs[input_num], VX_TENSOR_DATA_TYPE, &data_type, sizeof(data_type));
        vxQueryTensor(handle->inputs[input_num], VX_TENSOR_NUMBER_OF_DIMS, &num_of_dims, sizeof(num_of_dims));
        vxQueryTensor(handle->inputs[input_num], VX_TENSOR_DIMS, &dims, sizeof(dims[0])*num_of_dims);
        if((data_type != VX_TYPE_FLOAT32) && (data_type != VX_TYPE_FLOAT16)) {
            std::cerr << "ERROR: mvCopyToTensorFromMem() supports only VX_TYPE_FLOAT32 or VX_TYPE_FLOAT16:" << std::endl;
            return MV_ERROR_INVALID_TYPE;
        }
        if (handle->mem_type_in == mv_mem_type_host) {
            void * ptr;
            vx_map_id map_id;
            vx_size stride[4] = {%d, dims[0]*%d, dims[0]*dims[1]*%d, dims[0]*dims[1]*dims[2]*%d};
            vx_status status = vxMapTensorPatch(handle->inputs[input_num], num_of_dims, nullptr, nullptr, &map_id, stride, (void **)&ptr, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST);
            if(status) {
                std::cerr << "ERROR: vxMapTensorPatch() failed for " << fileName << std::endl;
                return MV_FAILURE;
            }
            status = copyTensor(fileName, dims, stride, ptr, data_type, preprocess_mulfac, preprocess_addfac);
            status = vxUnmapTensorPatch(handle->inputs[input_num], map_id);
            if(status) {
                std::cerr << "ERROR: vxUnmapTensorPatch() failed for " << fileName << std::endl;
                return MV_FAILURE;
            }
        } else if (handle->mem_type_in == mv_mem_type_opencl) {
            printf("ERROR: mvSetInputDataFromFile: INVALID_MEM_TYPE\\n");
            return MV_ERROR_INVALID_VALUE;

        } else {
            printf("ERROR: mvSetInputDataFromFile: INVALID_MEM_TYPE\\n");
            return MV_ERROR_INVALID_VALUE;
        }
    }
    return (mv_status)status;
}
""" % (input_elm_size, input_elm_size, input_elm_size, input_elm_size))          
            f.write( \
"""

MIVID_API_ENTRY mv_status MIVID_API_CALL mvGetOutput(mivid_handle handle, int output_num, void *out_tensor_mem, vx_size size)
{
    vx_status status = VX_SUCCESS;
    if(!handle || !handle->outputs[output_num] || !out_tensor_mem) {
        printf("ERROR: mvGetOutput: invalid handle or output_mem pointer\\n");
        status = MV_FAILURE;
    }
    else if(handle->outputs[output_num]) {
        // access the tensor object
        vx_enum data_type = VX_TYPE_FLOAT32;
        vx_size num_of_dims = 4, dims[4] = { 1, 1, 1, 1 };
        vxQueryTensor(handle->outputs[output_num], VX_TENSOR_DATA_TYPE, &data_type, sizeof(data_type));
        vxQueryTensor(handle->outputs[output_num], VX_TENSOR_NUMBER_OF_DIMS, &num_of_dims, sizeof(num_of_dims));
        vxQueryTensor(handle->outputs[output_num], VX_TENSOR_DIMS, &dims, sizeof(dims[0])*num_of_dims);
        if((data_type != VX_TYPE_FLOAT32) && (data_type != VX_TYPE_FLOAT16)) {
            std::cerr << "ERROR: mvGetOutput() supports only VX_TYPE_FLOAT32 or VX_TYPE_FLOAT16 " << std::endl;
            return MV_ERROR_INVALID_TYPE;
        }
        vx_size count = dims[0] * dims[1] * dims[2] * dims[3];
        vx_size stride[4] = {%d, dims[0]*%d, dims[0]*dims[1]*%d, dims[0]*dims[1]*dims[2]*%d};
        if (size < count*%d){
            return MV_ERROR_INVALID_DIMENSION;
        }

        if ((status = vxCopyTensorPatch(handle->outputs[output_num], num_of_dims, nullptr, nullptr, stride, out_tensor_mem, VX_READ_ONLY, VX_MEMORY_TYPE_HOST)) != VX_SUCCESS) {
            printf("ERROR: ideGetOutput: vxCopyTensorPatch: failed (%%d)\\n", status);
        }
    }
    return (mv_status)status;
}
""" % (output_elm_size, output_elm_size, output_elm_size, output_elm_size, output_elm_size))          
        f.write( \
"""
MIVID_API_ENTRY mv_status MIVID_API_CALL mvProcessInference(mivid_handle handle, float *ptime_in_ms, int num_iterations)
{
    vx_status status = VX_SUCCESS;
    if(!handle) {
        printf("ERROR: mvProcessInference: invalid handle\\n");
        status = MV_FAILURE;
    }
    else {
        if (num_iterations > 1) {
            int64_t freq = clockFrequency(), t0, t1;
            t0 = clockCounter();
            for(int i = 0; i < num_iterations; i++) {
                status = vxProcessGraph(handle->graph);
                if(status != VX_SUCCESS)
                    break;
            }
            t1 = clockCounter();
            *ptime_in_ms = (float)(t1-t0)*1000.0f/(float)freq/(float)num_iterations;
            printf("OK: mvProcessInference() took %.3f msec (average over %d iterations)\\n", *ptime_in_ms, num_iterations);
        } else {
            status = vxProcessGraph(handle->graph);
        }
    }
    return (mv_status)status;
}

MIVID_API_ENTRY mv_status MIVID_API_CALL mvScheduleInference(mivid_handle handle)
{
    vx_status status = VX_SUCCESS;
    if(!handle) {
        status = VX_FAILURE;
        printf("ERROR: ideScheduleInference: invalid handle\\n");
    }
    else if ((status = vxScheduleGraph(handle->graph)) == VX_SUCCESS)
        handle->scheduled = true;
    else {
        handle->scheduled = false;
        printf("ERROR: mvScheduleInference: failed (%d)\\n", status);
    }
    return (mv_status)status;
}

MIVID_API_ENTRY mv_status MIVID_API_CALL mvWaitForCompletion(mivid_handle handle)
{
    vx_status status = VX_SUCCESS;
    if(!handle || !handle->scheduled) {
        status = VX_FAILURE;
        printf("ERROR: ideWaitForCompletion: invalid handle\\n");
    }
    else if(handle->scheduled && ((status = vxWaitGraph(handle->graph)) == VX_SUCCESS)) {
        handle->scheduled = false;
    } else
        printf("ERROR: mvWaitForCompletion: failed\\n");

    return (mv_status)status;
}

mv_status MIVID_API_CALL mvReleaseInference(mivid_handle handle)
{
    mv_status status = MV_SUCCESS;
    vx_status ret;
    if(!handle) {
        printf("ERROR: mvReleaseInference: invalid handle\\n");
        status = MV_FAILURE;
    }
    else if(handle->graph && (ret = vxReleaseGraph(&handle->graph)) != VX_SUCCESS) {
        printf("ERROR: mvReleaseInference: vxReleaseGraph: failed (%d)\\n", ret);
        status = MV_FAILURE;
    }
    else {
        for (int i=0; i<handle->num_inputs; i++) {
            if(handle->inputs[i] && (ret = vxReleaseTensor(&handle->inputs[i])) != VX_SUCCESS) {
                printf("ERROR: mvReleaseInference: vxReleaseTensor(input<%d>): failed (%d)\\n", i,ret);
                status = MV_FAILURE;
            }
        }
        for (int i=0; i<handle->num_outputs; i++) {
            if(handle->outputs[i] && (ret = vxReleaseTensor(&handle->outputs[i])) != VX_SUCCESS) {
                printf("ERROR: mvReleaseInference: vxReleaseTensor(output<%d>): failed (%d)\\n", i,ret);
                status = MV_FAILURE;
            }
        }
    }
    if(handle->context && (ret = vxReleaseContext(&handle->context)) != VX_SUCCESS) {
        printf("ERROR: mvReleaseInference: vxReleaseContext: failed (%d)\\n", ret);
        status = MV_FAILURE;
    }
    else {
        delete handle;
    }
    return status;
}

""")

def generateDeployCPP(graph, fileName):
    """Write the C++ source of the deployment API wrapper to `fileName`.

    The emitted file includes "mvdeploy.h", defines the `mvDeployAPI`
    constructor (which dlopen()s the deployment library and resolves its entry
    points with dlsym()) and the thin public wrappers that forward to those
    resolved function pointers.

    The template below is written verbatim (no `%` formatting is applied), so
    printf-style conversions appear with a single `%`, and `\\\\n` in this
    Python source becomes `\\n` in the generated C++.

    Args:
        graph: accepted for signature parity with the other generators; it is
            not read by this function.
        fileName: path of the C++ file to create (overwritten if it exists).
    """
    print('creating ' + fileName + ' ...')
    with open(fileName, 'w') as f:
        generateLicenseForCPP(f)
        # Generated-code notes:
        #  * every wrapper verifies that the dlsym()-resolved function pointer is
        #    non-null before calling through it (only mvQueryInference_f is
        #    validated at initialization time);
        #  * pointer arguments that are dereferenced or turned into std::string
        #    are checked for nullptr first.
        f.write( \
"""
//mivid_api.cpp :: some definitions for mivid_api.cpp
#include "mvdeploy.h"

mvDeployAPI::mvDeployAPI(const char *library_name)
{
    libHandle = dlopen(library_name, RTLD_NOW | RTLD_LOCAL);
    if (!libHandle)
    {
        printf("ERROR: couldn't load MIVisionX deployment lib %s errorcode: %s \\n", library_name, dlerror());
        mvQueryInference_f          = nullptr;
        mvSetLogCallback_f          = nullptr;
        mvSetPreProcessCallback_f   = nullptr;
        mvSetPostProcessCallback_f  = nullptr;
        mvCreateInference_f         = nullptr;
        mvCopyToTensorFromMem_f     = nullptr;
        mvCopyToTensorFromFile_f    = nullptr;
        mvProcessInference_f        = nullptr;
        mvGetOutput_f               = nullptr;
        mvScheduleInference_f       = nullptr;
        mvWaitForCompletion_f       = nullptr;
        mvReleaseInference_f        = nullptr;
    } else {
        mvQueryInference_f          = (mvQueryInference_t) dlsym(libHandle, "mvQueryInference");
        mvSetLogCallback_f          = (mvSetLogCallback_t) dlsym(libHandle, "mvSetLogCallback");
        mvSetPreProcessCallback_f   = (mvSetPreProcessCallback_t) dlsym(libHandle, "mvSetPreProcessCallback");
        mvSetPostProcessCallback_f  = (mvSetPostProcessCallback_t) dlsym(libHandle, "mvSetPostProcessCallback");
        mvCreateInference_f         = (mvCreateInference_t) dlsym(libHandle, "mvCreateInference");
        mvCopyToTensorFromMem_f     = (mvCopyToTensorFromMem_t) dlsym(libHandle, "mvCopyToTensorFromMem");
        mvCopyToTensorFromFile_f    = (mvCopyToTensorFromFile_t) dlsym(libHandle, "mvCopyToTensorFromFile");
        mvProcessInference_f        = (mvProcessInference_t) dlsym(libHandle, "mvProcessInference");
        mvGetOutput_f               = (mvGetOutput_t) dlsym(libHandle, "mvGetOutput");
        mvScheduleInference_f       = (mvScheduleInference_t) dlsym(libHandle, "mvScheduleInference");
        mvWaitForCompletion_f       = (mvWaitForCompletion_t) dlsym(libHandle, "mvWaitForCompletion");
        mvReleaseInference_f        = (mvReleaseInference_t) dlsym(libHandle, "mvReleaseInference");
    }

    if (!mvQueryInference_f) {
        printf("ERROR: couldn't find function mvQueryInference_f in module %s \\n", library_name);
    }
    if (!mvSetLogCallback_f) {
        printf("ERROR: couldn't find function mvSetLogCallback_f in module %s \\n", library_name);
    }
    if (!mvSetPreProcessCallback_f) {
        printf("ERROR: couldn't find function mvSetPreProcessCallback_f in module %s \\n", library_name);
    }
    if (!mvSetPostProcessCallback_f) {
        printf("ERROR: couldn't find function mvSetPostProcessCallback_f in module %s \\n", library_name);
    }
    if (!mvCreateInference_f) {
        printf("ERROR: couldn't find function mvCreateInference_f in module %s \\n", library_name);
    }
    if (!mvCopyToTensorFromFile_f) {
        printf("ERROR: couldn't find function mvCopyToTensorFromFile_f in module %s \\n", library_name);
    }
    if (!mvCopyToTensorFromMem_f) {
        printf("ERROR: couldn't find function mvCopyToTensorFromMem_f in module %s \\n", library_name);
    }
    if (!mvProcessInference_f) {
        printf("ERROR: couldn't find function mvProcessInference_f in module %s \\n", library_name);
    }
    if (!mvGetOutput_f) {
        printf("ERROR: couldn't find function mvGetOutput_f in module %s \\n", library_name);
    }
    if (!mvScheduleInference_f) {
        printf("ERROR: couldn't find function mvScheduleInference_f in module %s \\n", library_name);
    }
    if (!mvWaitForCompletion_f) {
        printf("ERROR: couldn't find function mvWaitForCompletion_f in module %s \\n", library_name);
    }
    if (!mvReleaseInference_f) {
        printf("ERROR: couldn't find function mvReleaseInference_f in module %s \\n", library_name);
    }
}

static mvDeployAPI *mvDeploy = nullptr;
static std::string INF_DEPLOY_LIB_NAME  = "libmv_deploy.so";

// helper function
void error(const char * format, ...)
{
    printf("ERROR: ");
    va_list args;
    va_start(args, format);
    vprintf(format, args);
    va_end(args);
    printf("\\n");
}

//! \\brief Query the version of the MivisionX inference engine.
MIVID_API_ENTRY const char * MIVID_API_CALL mvGetVersion()
{
    return MIVIDA_VERSION;
}

//! \\brief Creates deployment instance (loads the deployment library for the specific compiled backend and intializes all function pointers)
MIVID_API_ENTRY mv_status MIVID_API_CALL mvInitializeDeployment(const char* install_folder)
{
    if (!install_folder) {
        printf("ERROR: mvInitializeDeployment: invalid install folder (nullptr)\\n");
        return MV_FAILURE;
    }
    std::string libname = std::string(install_folder);
    libname += "/lib/" + INF_DEPLOY_LIB_NAME;
    mvDeploy = new mvDeployAPI((const char *)libname.c_str());
    if (!mvDeploy || !mvDeploy->mvQueryInference_f) {
        return MV_FAILURE;
    }
    printf("Success::loading deployment library %s \\n", libname.c_str());
    return MV_SUCCESS;
}

MIVID_API_ENTRY mv_status MIVID_API_CALL QueryInference(int *num_inputs, int *num_outputs,  const char **inp_out_config)
{
    if (mvDeploy && mvDeploy->mvQueryInference_f && inp_out_config) {
        *inp_out_config = mvDeploy->mvQueryInference_f(num_inputs, num_outputs);
        return MV_SUCCESS;
    }else
        return MV_FAILURE;
}

void MIVID_API_CALL SetLogCallback(mivid_log_callback_f log_callback_f)
{
    if (mvDeploy && mvDeploy->mvSetLogCallback_f) mvDeploy->mvSetLogCallback_f(log_callback_f);
}

//! \\brief: load and add preprocessing module/nodes to graph if needed.
// need to call this before calling CreateInferenceSession
// output of the preprocessing node should be same as input tensor NN module
MIVID_API_ENTRY void MIVID_API_CALL SetPreProcessCallback(mivid_add_preprocess_callback_f preproc_f, mv_preprocess_callback_args *preproc_args)
{
    if (mvDeploy && mvDeploy->mvSetPreProcessCallback_f) mvDeploy->mvSetPreProcessCallback_f(preproc_f, preproc_args);
}

//! \\brief: load and add postprocessing modules/nodes to graph if needed.
// need to call this before calling CreateInferenceSession
// input to the preprocessing node should be same as output tensor of NN module
MIVID_API_ENTRY void MIVID_API_CALL SetPostProcessCallback(mivid_add_postprocess_callback_f postproc_f)
{
    if (mvDeploy && mvDeploy->mvSetPostProcessCallback_f) mvDeploy->mvSetPostProcessCallback_f(postproc_f);
}

MIVID_API_ENTRY mv_status MIVID_API_CALL mvCreateInferenceSession(mivid_session *inf_session, const char *install_folder, mivid_memory_type in_type)
{
    if (!inf_session || !install_folder) {
        printf("ERROR: mvCreateInferenceSession: invalid session pointer or install folder (nullptr)\\n");
        return MV_FAILURE;
    }
    //load automatically build inference deplyment library and intialize the functions. 
    std::string binfilename = std::string(install_folder) + "/weights.bin";
    mivid_handle mv_handle;

    if (mvDeploy && mvDeploy->mvCreateInference_f && ((mv_handle = mvDeploy->mvCreateInference_f(binfilename.c_str(), in_type)) != nullptr)) {
        mv_handle->install_folder = install_folder;
        *inf_session = (mivid_session)mv_handle;
        printf("OK::mvCreateInferenceSession\\n");
        return MV_SUCCESS;
    } else {
        return MV_FAILURE;
    }
}

//! \\brief: Releases inference session and all the resources associated
MIVID_API_ENTRY mv_status MIVID_API_CALL mvReleaseInferenceSession(mivid_session inf_session)
{
    if (inf_session && mvDeploy && mvDeploy->mvReleaseInference_f) {
        return mvDeploy->mvReleaseInference_f((mivid_handle) inf_session);
    }else
        return MV_FAILURE;
}

MIVID_API_ENTRY mv_status MIVID_API_CALL mvSetInputDataFromMemory(mivid_session inf_session, int input_num, void *input_data, size_t size, mivid_memory_type type)
{
    if (inf_session && mvDeploy && mvDeploy->mvCopyToTensorFromMem_f) {
        return mvDeploy->mvCopyToTensorFromMem_f((mivid_handle)inf_session, input_num, input_data, size, type);
    }else
        return MV_FAILURE;
}

MIVID_API_ENTRY mv_status MIVID_API_CALL mvSetInputDataFromFile(mivid_session inf_session, int input_num, char *input_name, bool reverseOrder, float preprocess_mulfac, float preprocess_addfac)
{
    if (inf_session && mvDeploy && mvDeploy->mvCopyToTensorFromFile_f) {
        return mvDeploy->mvCopyToTensorFromFile_f((mivid_handle)inf_session, input_num, input_name, reverseOrder, preprocess_mulfac, preprocess_addfac);
    }else
        return MV_FAILURE;
}

//! \\brief: run an instance of the inference engine: can be run multiple iterations for performance timing
MIVID_API_ENTRY mv_status MIVID_API_CALL mvRunInference(mivid_session inf_session, float *p_time_in_millisec, int num_iterations)
{
    if (inf_session && mvDeploy && mvDeploy->mvProcessInference_f) {
        return mvDeploy->mvProcessInference_f((mivid_handle)inf_session, p_time_in_millisec, num_iterations);
    }else
        return MV_FAILURE;

}

//! \\brief: run an instance of the inference engine: can be run multiple iterations for performance timing
MIVID_API_ENTRY mv_status MIVID_API_CALL mvGetOutputData(mivid_session inf_session, int out_num, void *out_buf, size_t size)
{
    if (inf_session && mvDeploy && mvDeploy->mvGetOutput_f) {
        return mvDeploy->mvGetOutput_f((mivid_handle)inf_session, out_num, out_buf, size);
    }else
        return MV_FAILURE;
}

MIVID_API_ENTRY mv_status MIVID_API_CALL mvScheduleInferenceSession(mivid_session inf_session)
{
    if (inf_session && mvDeploy && mvDeploy->mvScheduleInference_f) {
        return mvDeploy->mvScheduleInference_f((mivid_handle)inf_session);
    }else
        return MV_FAILURE;
}

MIVID_API_ENTRY mv_status MIVID_API_CALL mvWaitForSessionCompletion(mivid_session inf_session)
{
    if (inf_session && mvDeploy && mvDeploy->mvWaitForCompletion_f) {
        return mvDeploy->mvWaitForCompletion_f((mivid_handle)inf_session);
    }else
        return MV_FAILURE;
}

MIVID_API_ENTRY void MIVID_API_CALL mvShutdown()
{
    if (mvDeploy) {
        delete mvDeploy;
        mvDeploy = nullptr;
    }
}
""")

def generateDeployH(graph,fileName):
    """Write the C++ deployment header (guard `included_file_mvdeploy_h`) to `fileName`.

    The emitted header contains the system/OpenVX includes, the
    ERROR_CHECK_OBJECT / ERROR_CHECK_STATUS macros, the `mivid_handle_t`
    struct, a comment block listing every graph input and output with its
    dimensions, the extern "C" prototypes and function-pointer typedefs of the
    deployment library, and the `mvDeployAPI` loader class.

    Only the two small per-tensor comment templates are `%`-formatted; the
    large templates are written verbatim, so printf-style conversions in them
    must be spelled with a single `%`.

    Args:
        graph: object exposing `inputs` and `outputs`, iterables of tensors
            with `name` and `shape` attributes.
        fileName: path of the header file to create (overwritten if it exists).
    """
    print('creating ' + fileName + ' ...')
    with open(fileName, 'w') as f:
        generateLicenseForCPP(f)
        # NOTE: this template is NOT %-formatted, hence "%d" (not "%%d") in the
        # ERROR_CHECK_* macros; "%%d" would reach vxAddLogEntry as an escaped
        # percent sign and the status / line number would never be printed.
        f.write( \
"""
#ifndef included_file_mvdeploy_h
#define included_file_mvdeploy_h

#define ENABLE_MV_DEPLOY

#include <VX/vx.h>
#include "mvdeploy_api.h"
#if OPENVX_BACKEND_OPENCL_FOUND
    #include <CL/cl.h>
#endif
#include <VX/vx_khr_nn.h>
#include <vx_amd_nn.h>
#include <vx_ext_amd.h>
#include <stdarg.h>
#include <iostream>
#include <sstream>
#include <vector>
#include <stdio.h>
#include <string.h>
#include <string>
#include <chrono>
#include <inttypes.h>
#include <dlfcn.h>
#include <unistd.h> 

#if ENABLE_OPENCV
#include <opencv2/opencv.hpp>
#include <opencv/cv.h>
#include <opencv/highgui.h>
using namespace cv;
#endif
#include <half.hpp>
#include <immintrin.h>
using half_float::half;

#define MIVIDA_VERSION  "0.9.1"

#define ERROR_CHECK_OBJECT(obj) { vx_status status = vxGetStatus((vx_reference)(obj)); if(status != VX_SUCCESS) { vxAddLogEntry((vx_reference)context, status     , "ERROR: failed with status = (%d) at " __FILE__ "#%d\\n", status, __LINE__); return status; } }
#define ERROR_CHECK_STATUS(call) { vx_status status = (call); if(status != VX_SUCCESS) { vxAddLogEntry((vx_reference)context, status, "ERROR: failed with status = (%d) at " __FILE__ "#%d\\n", status, __LINE__); return status; } }

typedef struct mivid_handle_t {
    vx_context  context;
    vx_graph    graph;
    const char        *install_folder;
    mivid_log_callback_f mv_log_message_callback;
    mivid_add_preprocess_callback_f mv_add_preprocess_cb;
    mivid_add_postprocess_callback_f mv_add_postprocess_cb;
    std::vector<vx_tensor>   inputs;
    std::vector<vx_tensor>   outputs;
    vx_image inp_image;
    int mem_type_in;
    std::string model_url, model_name;
    int num_inputs, num_outputs;
    bool scheduled;
    void *postproc_data;
} *mivid_handle;

////
// MIVision Inference Deployment Engine(mivid) input output
""")
        # One comment line per graph input/output; dimensions are listed in
        # reversed order of the tensor's shape.
        for tensor in graph.inputs:
            f.write( \
"""//   %s -- dims[] = { %s } (input)
""" % (tensor.name, ', '.join([str(v) for v in reversed(tensor.shape)])))
        for tensor in graph.outputs:
            f.write( \
"""//   %s -- dims[] = { %s, } (output)
""" % (tensor.name, ', '.join([str(v) for v in reversed(tensor.shape)])))
        # The mvDeployAPI constructor leaves libHandle null when dlopen() fails,
        # so the destructor must not hand that null handle to dlclose().
        f.write( \
"""//
extern "C" {
    MIVID_API_ENTRY void MIVID_API_CALL mvSetLogCallback(mivid_log_callback_f log_callback_f);
    MIVID_API_ENTRY void MIVID_API_CALL mvSetPreProcessCallback(mivid_add_preprocess_callback_f preproc_f, mv_preprocess_callback_args *preproc_args);
    MIVID_API_ENTRY void MIVID_API_CALL mvSetPostProcessCallback(mivid_add_postprocess_callback_f postproc_f);
    MIVID_API_ENTRY const char * MIVID_API_CALL mvQueryInference(int *num_inputs, int *num_outputs);
    MIVID_API_ENTRY mivid_handle MIVID_API_CALL mvCreateInference(const char * binaryFilename, int mem_type);
    MIVID_API_ENTRY mv_status MIVID_API_CALL mvReleaseInference(mivid_handle handle);
    MIVID_API_ENTRY mv_status MIVID_API_CALL mvCopyToTensorFromMem(mivid_handle handle, int input_num, void *input_data_ptr, size_t size, mivid_memory_type type);
    MIVID_API_ENTRY mv_status MIVID_API_CALL mvCopyToTensorFromFile(mivid_handle handle, int input_num, const char *input_name, bool reverseOrder, float preprocess_mulfac, float preprocess_addfac);
    MIVID_API_ENTRY mv_status MIVID_API_CALL mvGetOutput(mivid_handle handle, int output_num, void *out_tensor_mem, vx_size size);
    MIVID_API_ENTRY mv_status MIVID_API_CALL mvProcessInference(mivid_handle handle, float *ptime_in_ms, int num_iterations);
    MIVID_API_ENTRY mv_status MIVID_API_CALL mvScheduleInference(mivid_handle handle);
    MIVID_API_ENTRY mv_status MIVID_API_CALL mvWaitForCompletion(mivid_handle handle);
    MIVID_API_ENTRY mv_status MIVID_API_CALL mvReleaseInference(mivid_handle handle);
};


extern "C" {
    typedef MIVID_API_ENTRY mv_status (MIVID_API_CALL *mvGetVersion_t)();
    typedef MIVID_API_ENTRY const char * (MIVID_API_CALL *mvQueryInference_t)(int *num_inputs, int *num_outputs);
    typedef MIVID_API_ENTRY mv_status (MIVID_API_CALL *mvSetLogCallback_t)(mivid_log_callback_f log_callback_f);
    typedef MIVID_API_ENTRY void (MIVID_API_CALL *mvSetPreProcessCallback_t)(mivid_add_preprocess_callback_f preproc_f, mv_preprocess_callback_args *preproc_args);
    typedef MIVID_API_ENTRY void (MIVID_API_CALL *mvSetPostProcessCallback_t)(mivid_add_postprocess_callback_f postproc_f);
    typedef MIVID_API_ENTRY mivid_handle (MIVID_API_CALL *mvCreateInference_t)(const char * binaryFilename, int mem_type);
    typedef MIVID_API_ENTRY mv_status (MIVID_API_CALL *mvReleaseInference_t)(mivid_handle handle);    
    typedef MIVID_API_ENTRY mv_status (MIVID_API_CALL *mvCopyToTensorFromMem_t)(mivid_handle inf_session, int input_num, void *input_data, size_t size, mivid_memory_type type);
    typedef MIVID_API_ENTRY mv_status (MIVID_API_CALL *mvCopyToTensorFromFile_t)(mivid_handle inf_session, int input_num, char *input_name, bool reverseOrder, float preprocess_mulfac, float preprocess_addfac);
    typedef MIVID_API_ENTRY mv_status (MIVID_API_CALL *mvProcessInference_t)(mivid_handle inf_session, float *p_time_in_millisec, int num_iterations);
    typedef MIVID_API_ENTRY mv_status (MIVID_API_CALL *mvGetOutput_t)(mivid_handle handle, int output_num, void *out_tensor_mem, vx_size size);
    typedef MIVID_API_ENTRY mv_status (MIVID_API_CALL *mvScheduleInference_t)(mivid_handle inf_session);
    typedef MIVID_API_ENTRY mv_status (MIVID_API_CALL *mvWaitForCompletion_t)(mivid_handle inf_session);
};

class mvDeployAPI
{
public:
    mvQueryInference_t          mvQueryInference_f;
    mvSetLogCallback_t          mvSetLogCallback_f;
    mvSetPreProcessCallback_t   mvSetPreProcessCallback_f;
    mvSetPostProcessCallback_t  mvSetPostProcessCallback_f;
    mvCreateInference_t         mvCreateInference_f;
    mvCopyToTensorFromMem_t     mvCopyToTensorFromMem_f;
    mvCopyToTensorFromFile_t    mvCopyToTensorFromFile_f;
    mvProcessInference_t        mvProcessInference_f;
    mvGetOutput_t               mvGetOutput_f;
    mvScheduleInference_t       mvScheduleInference_f;
    mvWaitForCompletion_t       mvWaitForCompletion_f;
    mvReleaseInference_t        mvReleaseInference_f;
private:    
    void * libHandle;

public: 
    mvDeployAPI(const char *library_name);
    ~mvDeployAPI(){ if (libHandle) dlclose(libHandle); }
};

#endif

""")

def generateBinary(graph,fileName):
    """Serialize every initializer tensor of ``graph`` into one weights file.

    File layout (each marker/length is an unsigned int packed with the
    platform's native byte order, as produced by ``struct.pack('I', ...)``):

        FILE magic
        for each tensor in ``graph.initializers``:
            DATA magic, payload length in bytes, raw payload
        EOF magic

    Args:
        graph: object exposing ``initializers`` (iterable of items with a
            ``name`` attribute) and ``binaries`` (mapping name -> bytes).
        fileName: path of the binary file to create (overwritten if present).
    """
    magic_file, magic_data, magic_eof = 0xF00DD1E0, 0xF00DD1E1, 0xF00DD1E2
    print('creating ' + fileName + ' ...')
    with open(fileName, 'wb') as out:
        out.write(struct.pack('I', magic_file))
        for initializer in graph.initializers:
            payload = graph.binaries[initializer.name]
            # record header: marker followed by the payload size
            record_header = struct.pack('II', magic_data, len(payload))
            out.write(record_header)
            out.write(payload)
        out.write(struct.pack('I', magic_eof))


def generateTestCPP(graph,argmaxOutput,fileName):
    """Write the C++ source of the ``mvtestdeploy`` sample program.

    The emitted text is a fixed template: the license header produced by
    ``generateLicenseForCPP`` followed by a ``main()`` that parses the
    command line, queries the deployed model for its input/output
    configuration, feeds one input, runs inference and optionally writes
    the first output and prints its argmax.

    Args:
        graph: not referenced by this generator; kept so all generators
            share the same calling convention.
        argmaxOutput: not referenced by this generator (the emitted program
            takes ``--argmax`` on its own command line instead).
        fileName: path of the ``.cpp`` file to create (overwritten).
    """
    print('creating ' + fileName + ' ...')
    with open(fileName, 'w') as f:
        generateLicenseForCPP(f)
        # NOTE: non-raw string on purpose -- "\\n" becomes the C escape \n in
        # the emitted source, while "\t" is emitted as a literal tab character.
        f.write(
"""
#include "mvdeploy.h"
#include "vx_amd_media.h"
#include <iterator>

// current value of the high resolution clock, in clock ticks
inline int64_t clockCounter()
{
    return std::chrono::high_resolution_clock::now().time_since_epoch().count();
}

// number of high resolution clock ticks per second
inline int64_t clockFrequency()
{
    return std::chrono::high_resolution_clock::period::den / std::chrono::high_resolution_clock::period::num;
}

// callback function for adding preprocessing nodes
// the module should output in the outp_tensor passed by the callback function
static vx_status MIVID_CALLBACK preprocess_addnodes_callback_fn(mivid_session inf_session, vx_tensor outp_tensor, const char *inp_string, float a, float b)
{
    if (inf_session) {
        // add your preprocessing OpenVX nodes here. Output of preprocessing goes to outp_tensor
        return MV_ERROR_NOT_IMPLEMENTED;
    } else {
        printf("preprocess_addnodes_callback_fn:: inf_session not valid\\n");
        return VX_FAILURE;
    }
}

void printUsage() {
    printf("Usage: mvtestdeploy <options>\\n"
        "\t<input-data-file>: is raw tensor file OR .jpg/.png file OR <-> for empty input\t[required]\\n"
        "\t<output-data-file>: output file for inference output OR <-> for no output     \t[required]\\n"
        "\t--install_folder <install_folder>: location of the compiled model binary      \t[optional: default-current folder]\\n"
        "\t--backend <backend>: the name of the backend for compilation                  \t[optional: default-OpenVX_Rocm_GPU]\\n"
        "\t--t <num of interations>: to run for performance                              \t[optional: default 1]\\n"
        "\t--argmax <UINT/Lut>: give argmax output in UINT or LUT                        \t[optional: default no argmax]\\n"
        "\t--label <labels.txt>: labels file for classes                                 \t[optional: default no class detected]\\n"
        "\t--preprocess <pmul, padd>: prepeocess multiply and add in floats              \t[optional: default (1.f, 0.f)]\\n"
    );
}


int main(int argc, const char ** argv)
{
    // check command-line usage
    if(argc < 3) {
        printUsage();
        return -1;
    }
    mv_status status;
    size_t out_dims[4];
    const char *inoutConfig;
    int num_inputs=1, num_outputs=1;
    std::string install_folder = ".";       // default for install folder
    std::string  weightsFile  = "./weights.bin";    // default for weights file
    mivid_backend backend = OpenVX_Rocm_GPU;
    std::string inpFileName  = std::string(argv[1]);
    std::string outFileName  = std::string(argv[2]);
    int bPeformanceRun = 0, numIterations = 1;
    int  argmaxOutput = 0, gotLabels = 0;
    std::vector<std::string> labelText;     // labels file contents, one entry per line
    float padd = 0.f, pmul = 1.f;

    for (int arg = 3; arg < argc; arg++) {
        if (!strcmp(argv[arg], "--install_folder")) {
            if ((arg + 1) == argc)
            {
                printf("ERROR: missing install_folder value on command-line (see help for details)\\n");
                return -1;
            }
            arg++;
            install_folder = std::string(argv[arg]);
            weightsFile = install_folder + "/" + "weights.bin";
        }
        if (!strcmp(argv[arg], "--backend")) {
            if ((arg + 1) == argc)
            {
                printf("ERROR: missing backend value on command-line (see help for details)\\n");
                return -1;
            }
            arg++;
            backend = (mivid_backend)atoi(argv[arg]);
        }
        if (!strcmp(argv[arg], "--t")) {
            if ((arg + 1) == argc)
            {
                printf("ERROR: missing number of iterations on command-line (see help for details)\\n");
                return -1;
            }
            arg++;
            numIterations = atoi(argv[arg]);
        }
        if (!strcmp(argv[arg], "--argmax")) {
            if ((arg + 1) == argc)
            {
                printf("ERROR: missing argmax type on command-line (see help for details)\\n");
                return -1;
            }
            arg++;
            if (!strcmp(argv[arg], "UINT")) {
                argmaxOutput = 1;
            }
            else if (!strcmp(argv[arg], "LUT")) {
                argmaxOutput = 2;
            }
        }
        if (!strcmp(argv[arg], "--label")) {
            if ((arg + 1) == argc)
            {
                printf("ERROR: missing label.txt file on command-line (see help for details)\\n");
                return -1;
            }
            arg++;
            std::string labelFileName = argv[arg];
            std::string line;
            std::ifstream out(labelFileName);
            while(getline(out, line)) {
                labelText.push_back(line);
            }
            out.close();
            gotLabels = 1;
        }
        if (!strcmp(argv[arg], "--preprocess")) {
            // both pmul and padd must follow the option
            if ((arg + 2) >= argc)
            {
                printf("ERROR: missing pmul and padd parameters on command-line (see help for details)\\n");
                return -1;
            }
            arg++;
            pmul = atof(argv[arg++]);
            padd = atof(argv[arg]);
        }
    }
    // initialize deployment
    if ((status = mvInitializeDeployment(install_folder.c_str()))){
        printf("ERROR: mvInitializeDeployment failed with status %d\\n", status);
        return -1;
    }

    if ((status = QueryInference(&num_inputs, &num_outputs, &inoutConfig))) {
        printf("ERROR: QueryInference returned status %d\\n", status);
    }
    else {
        float *inpMem = nullptr;
        float *outMem = nullptr;
        size_t inp_dims[4], out_dims[4];
        mivid_session infSession;
        mv_status status;
        float time_in_millisec;

        // get input and output dimensions from inoutConfig
        std::stringstream inout_dims(inoutConfig);
        std::vector<std::string> config_vec;
        std::string substr;
        std::string in_names[num_inputs];
        std::string out_names[num_outputs];
        std::vector<std::tuple<int, int, int, int>> input_dims;
        std::vector<std::tuple<int, int, int, int>> output_dims;
        while( inout_dims.good()) {
            getline(inout_dims, substr, ';' );
            if (!substr.empty()) {
                config_vec.push_back(substr);
            }
        }
        int in_num = 0, out_num = 0, n, c, h, w;
        for (int i=0; i < config_vec.size(); i++)
        {
            std::stringstream ss(config_vec[i]);
            getline(ss, substr, ',');
            if ((substr.compare(0,5,"input") == 0))
            {
                getline(ss, substr, ',');
                in_names[in_num] =  substr;
                getline(ss, substr, ','); w = atoi(substr.c_str());
                getline(ss, substr, ','); h = atoi(substr.c_str());
                getline(ss, substr, ','); c = atoi(substr.c_str());
                getline(ss, substr, ','); n = atoi(substr.c_str());
                printf("Config_input::<%d %d %d %d>:%s \\n", w,h,c,n, in_names[in_num].c_str());
                input_dims.push_back(std::tuple<int,int,int,int>(w,h,c,n));
                in_num++;
            }
            else if ((substr.compare(0,6,"output") == 0))
            {
                getline(ss, substr, ',');
                out_names[out_num] =  substr;
                getline(ss, substr, ','); w = atoi(substr.c_str());
                getline(ss, substr, ','); h = atoi(substr.c_str());
                getline(ss, substr, ','); c = atoi(substr.c_str());
                getline(ss, substr, ','); n = atoi(substr.c_str());
                printf("Config_output::<%d %d %d %d>:%s \\n", w,h,c,n, out_names[out_num].c_str());
                output_dims.push_back(std::tuple<int,int,int,int>(w,h,c,n));
                out_num ++;
            }
        }

        // the first input/output entries are dereferenced below, so make sure they exist
        if (input_dims.size() == 0 || output_dims.size() == 0 ) {
            printf("ERROR: Couldn't get input and output dims %d %d \\n", (int)input_dims.size(), (int)output_dims.size());
            return -1;
        }

        inp_dims[3] = std::get<0>(input_dims[0]);
        inp_dims[2] = std::get<1>(input_dims[0]);
        inp_dims[1] = std::get<2>(input_dims[0]);
        inp_dims[0] = std::get<3>(input_dims[0]);
        out_dims[3] = std::get<0>(output_dims[0]);
        out_dims[2] = std::get<1>(output_dims[0]);
        out_dims[1] = std::get<2>(output_dims[0]);
        out_dims[0] = std::get<3>(output_dims[0]);

        status = mvCreateInferenceSession(&infSession, install_folder.c_str(), mv_mem_type_host);
        if (status != MV_SUCCESS)
        {
            printf("ERROR: mvCreateInferenceSession returned failure \\n");
            return -1;
        }
        // create input tensor memory for swaphandle
        size_t inputSizeInBytes = 4 *inp_dims[0] * inp_dims[1] * inp_dims[2] * inp_dims[3];

        // read input and call mvSetInputDataFromMemory
        if ((strcmp(inpFileName.c_str(), "-") != 0)) {
            inpMem = (float *)new char[inputSizeInBytes];
            size_t istride[4] = { 4, (size_t)4 * inp_dims[0], (size_t)4 * inp_dims[0] * inp_dims[1], (size_t)4 * inp_dims[0] * inp_dims[1] * inp_dims[2] };
    #if ENABLE_OPENCV
            if(inp_dims[2] == 3 && inpFileName.size() > 4 && (inpFileName.substr(inpFileName.size()-4, 4) == ".png" || inpFileName.substr(inpFileName.size()-4, 4) == ".jpg" ||
                                                             inpFileName.substr(inpFileName.size()-4, 4) == ".PNG" || inpFileName.substr(inpFileName.size()-4, 4) == ".JPG"))
            {
                for(size_t n = 0; n < inp_dims[3]; n++) {
                    char imgFileName[1024];
                    sprintf(imgFileName, inpFileName.c_str(), (int)n);
                    unsigned char *img_data;
                    Mat img = imread(imgFileName, CV_LOAD_IMAGE_COLOR);
                    if(!img.data) {
                        std::cerr << "ERROR: unable to open: " << imgFileName << std::endl;
                        return -1;
                    }
                    // matScaled is declared here because img_data may point into its
                    // pixel buffer, which has to stay alive while the pixels are read
                    Mat matScaled;
                    img_data = img.data;
                    if(img.rows != inp_dims[1] || img.cols != inp_dims[0]) {
                        cv::resize(img, matScaled, cv::Size(inp_dims[0], inp_dims[1]));
                        img_data = matScaled.data;
                    }
                    for(vx_size y = 0; y < inp_dims[1]; y++) {
                        unsigned char * src = img_data + y*inp_dims[0]*3;
                        float * dstR = inpMem + ((n * istride[3] + y * istride[1]) >> 2);
                        float * dstG = dstR + (istride[2] >> 2);
                        float * dstB = dstG + (istride[2] >> 2);
                        for(vx_size x = 0; x < inp_dims[0]; x++, src += 3) {
                            *dstR++ = src[2]*pmul + padd;
                            *dstG++ = src[1]*pmul + padd;
                            *dstB++ = src[0]*pmul + padd;
                        }
                    }
                }
            }
            else
    #endif
            {
                FILE * fp = fopen(inpFileName.c_str(), "rb");
                if(!fp) {
                    std::cerr << "ERROR: unable to open: " << inpFileName << std::endl;
                    return -1;
                }
                for(size_t n = 0; n < inp_dims[3]; n++) {
                    for(size_t c = 0; c < inp_dims[2]; c++) {
                        for(size_t y = 0; y < inp_dims[1]; y++) {
                            float * ptrY = inpMem + ((n * istride[3] + c * istride[2] + y * istride[1]) >> 2);
                            vx_size n = fread(ptrY, sizeof(float), inp_dims[0], fp);
                            if(n != inp_dims[0]) {
                                std::cerr << "ERROR: reading from file less than expected # of bytes " << inpFileName << std::endl;
                                return -1;
                            }
                        }
                    }
                }
                fclose(fp);
            }
            if ((status = mvSetInputDataFromMemory(infSession, 0, (void *)inpMem, inputSizeInBytes, mv_mem_type_host)) != MV_SUCCESS) {
                printf("ERROR: mvSetInputDataFromMemory returned failure(%d) \\n", status);
                return -1;
            }
        }
        // allocate output buffer corresponding to the first output
        size_t outputSizeInBytes = 4 *out_dims[0]*out_dims[1]*out_dims[2]*out_dims[3];
        outMem = (float *)new char[outputSizeInBytes];
        FILE *fp = nullptr;

        if (strcmp(outFileName.c_str(), "-") != 0)
        {
            fp = fopen(outFileName.c_str(), "wb");
            if(!fp) {
                std::cerr << "ERROR: unable to open: " << outFileName << std::endl;
                return -1;
            }
        }
        int64_t freq = clockFrequency(), t0, t1;
        if (numIterations == 1) t0 = clockCounter();

        if ((status = mvRunInference(infSession, &time_in_millisec, numIterations))) {
            printf("ERROR: mvRunInference terminated with status(%d) \\n", status);
            return -1;
        }
        if (numIterations == 1) {
            t1 = clockCounter();
            time_in_millisec = (float)(t1-t0)*1000.0f/(float)freq;
        }

        // get output
        if ((status = mvGetOutputData(infSession, 0, (void *) outMem, outputSizeInBytes)) != MV_SUCCESS) {
            printf("ERROR: mvGetOutputData returned failure(%d) \\n", status);
            return -1;
        }
        if (fp) {
            fwrite(outMem, sizeof(float), outputSizeInBytes>>2, fp);
        }
        if (fp) fclose(fp);
        printf("OK: mvRunInference() took %.3f msec (average over %d iterations)\\n", time_in_millisec, numIterations);

        if (argmaxOutput) {
            float *out_elements = (float*)outMem;
            int classID = std::distance(out_elements, std::max_element(out_elements, (out_elements + out_dims[1])));
            // only print a label when the labels file actually has an entry for this class
            if (gotLabels && classID < (int)labelText.size())
                printf("Argmax output is %d with label: %s\\n", classID, labelText[classID].c_str());
            else
                printf("Argmax output is %d \\n", classID);
        }
        // Relese Inference
        mvReleaseInferenceSession(infSession);
        printf("OK: Inference Deploy Successful \\n");
        // delete resources
        if (inpMem) delete[] inpMem;
        if (outMem) delete[] outMem;
        mvShutdown();
    }
}
""")

def generateExtrasCPP(graph,extraFolder):
    """Copy ``mv_extras_postproc.cpp`` into ``extraFolder``.

    The source file is taken from the parent of the directory that contains
    this script. The copy is done with ``shutil.copy`` rather than a shell
    ``cp`` command line, so folder names containing spaces or shell
    metacharacters work, and an absolute ``extraFolder`` is honored instead
    of being re-rooted under the current directory.

    Args:
        graph: not referenced; kept for a uniform generator signature.
        extraFolder: destination folder, absolute or relative to the
            current working directory.

    Failures are reported on stdout; no exception is propagated for I/O
    errors, matching the previous best-effort behavior.
    """
    import shutil  # local import: shutil is not among the file-level imports

    print('copying mv_extras_postproc.cpp to ' + extraFolder + ' ...')
    file_dir = os.path.dirname(os.path.abspath(__file__))
    source = os.path.join(file_dir, os.pardir, 'mv_extras_postproc.cpp')
    # join() keeps relative folders anchored at "./" and leaves absolute ones untouched
    destination = os.path.join(os.curdir, extraFolder)
    try:
        shutil.copy(source, destination)
    except (IOError, OSError) as err:
        print('ERROR: generateExtrasCPP: %s' % err)
    else:
        print('OK: generateExtrasCPP')

def generateExtrasH(graph,extraFolder):
    """Copy ``mv_extras_postproc.h`` into ``extraFolder``.

    The source file is taken from the parent of the directory that contains
    this script. The copy is done with ``shutil.copy`` rather than a shell
    ``cp`` command line, so folder names containing spaces or shell
    metacharacters work, and an absolute ``extraFolder`` is honored instead
    of being re-rooted under the current directory.

    Args:
        graph: not referenced; kept for a uniform generator signature.
        extraFolder: destination folder, absolute or relative to the
            current working directory.

    Failures are reported on stdout; no exception is propagated for I/O
    errors, matching the previous best-effort behavior.
    """
    import shutil  # local import: shutil is not among the file-level imports

    print('copying mv_extras_postproc.h to ' + extraFolder + ' ...')
    file_dir = os.path.dirname(os.path.abspath(__file__))
    source = os.path.join(file_dir, os.pardir, 'mv_extras_postproc.h')
    # join() keeps relative folders anchored at "./" and leaves absolute ones untouched
    destination = os.path.join(os.curdir, extraFolder)
    try:
        shutil.copy(source, destination)
    except (IOError, OSError) as err:
        # status lines name this function (they used to say generateExtrasCPP)
        print('ERROR: generateExtrasH: %s' % err)
    else:
        print('OK: generateExtrasH')

def generateCode(graph,argmaxOutput,outputFolder):
    """Generate the complete deployment project for ``graph``.

    Creates ``outputFolder`` and its ``mv_extras`` sub-folder when they do
    not exist yet, then runs every generator in turn: CMake files, module
    source, weights binary, deploy header/API source, the test program and
    the post-processing extras.

    Args:
        graph: IR graph handed unchanged to each generator.
        argmaxOutput: argmax setting, forwarded to ``generateTestCPP``.
        outputFolder: root folder of the generated project.
    """
    extraFolder = outputFolder + '/mv_extras'
    # makedirs (not mkdir) so that an outputFolder whose parent folders are
    # missing is created too instead of failing
    for folder in (outputFolder, extraFolder):
        if not os.path.isdir(folder):
            os.makedirs(folder)
    generateCMakeFiles(graph,outputFolder)
    generateCMakeExtras(graph, extraFolder)
    generateModuleCPP(graph,outputFolder + '/mvmodule.cpp')
    generateBinary(graph,outputFolder + '/weights.bin')
    generateDeployH(graph, outputFolder + '/mvdeploy.h')
    generateDeployCPP(graph, outputFolder + '/mvdeploy_api.cpp')
    generateTestCPP(graph,argmaxOutput,outputFolder + '/mvtestdeploy.cpp')
    generateExtrasH(graph,extraFolder)
    generateExtrasCPP(graph,extraFolder)

def main():
    """Command-line entry point: parse options, load the IR model, emit code.

    Leading ``--option value`` pairs are consumed first; the two remaining
    positional arguments are the NNIR input folder and the output folder.
    ``--argmax`` accepts ``UINT8``, ``UINT16`` or the path of a LUT text
    file (RGBA when the name ends in ``rgba.txt``, RGB otherwise). Any
    other option, or too few arguments, prints the usage text and exits
    with status 1.
    """
    usage = """
Usage: python nnir_to_clib.py [OPTIONS] <nnirInputFolder> <outputFolder>

  OPTIONS:
    --argmax UINT8                    -- argmax at the end with 8-bit output
    --argmax UINT16                   -- argmax at the end with 16-bit output
    --argmax <fileNamePrefix>rgb.txt  -- argmax at the end with RGB color mapping using LUT
    --argmax <fileNamePrefix>rgba.txt -- argmax at the end with RGBA color mapping using LUT
    --help                            -- show this help message

  LUT File Format (RGB): 8-bit R G B values one per each label in text format
    R0 G0 B0
    R1 G1 B1
    ...

  LUT File Format (RGBA): 8-bit R G B A values one per each label in text format
    R0 G0 B0 A0
    R1 G1 B1 A1
    ...

"""
    argmaxOutput = None
    pos = 1
    # an option is only recognized while at least one more argument follows it
    while len(sys.argv) - pos >= 2 and sys.argv[pos].startswith('--'):
        option = sys.argv[pos]
        if option != '--argmax':
            # --help and unknown options both end here; only the latter is an error
            if option != '--help':
                print('ERROR: invalid option: %s' % (option))
            print(usage)
            sys.exit(1)
        choice = sys.argv[pos + 1]
        if choice == 'UINT8':
            argmaxOutput = 'vx_uint8'
        elif choice == 'UINT16':
            argmaxOutput = 'vx_uint16'
        else:
            if not os.path.isfile(choice):
                print('ERROR: unable to open: %s' % (choice))
                sys.exit(1)
            # one LUT row per label: 4 columns for RGBA files, 3 for RGB files
            channels = 4 if choice[-8:] == 'rgba.txt' else 3
            with open(choice, 'r') as lut_file:
                argmaxOutput = np.reshape(np.array([int(v) for v in lut_file.read().split()]), [-1, channels]).transpose()
        pos += 2
    if len(sys.argv) - pos < 2:
        print(usage)
        sys.exit(1)
    inputFolder, outputFolder = sys.argv[pos], sys.argv[pos + 1]
    print('reading IR model from ' + inputFolder + ' ...')
    graph = IrGraph(True)
    graph.fromFile(inputFolder)
    print('creating C code in ' + outputFolder + ' ...')
    generateCode(graph, argmaxOutput, outputFolder)

# Run the converter only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()