/*
MIT License

Copyright (c) 2019 - 2022 Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

// header file "annmodule.h" generated by nnir_to_openvx.py
#include "annmodule.h"

// header file to include OpenVX Modules
#include <vx_ext_amd.h>
#include <vx_amd_nn.h>

// c/c++ includes
#include <chrono>
#include <thread>
#include <functional>

// header files for the different run modes (detection, classification, segmentation)
#include "detection.h"
#include "classification.h"
#include "segmentation.h"

// True once main() has added the model to the graph and verified it; main()
// checks this flag before copying frames into the input tensor, running the
// graph, and reading the output tensor.
bool runModel = false;
// Model execution time in milliseconds. main() seeds it with the averaged
// timing and then overwrites it with the duration of the latest
// vxProcessGraph() call; it is handed to the classifier visualization.
float modelTime_g;

// ERROR_CHECK_OBJECT(obj): queries vxGetStatus() on an OpenVX object; if it is
// not VX_SUCCESS, adds a log entry (file and line) and returns the status from
// the ENCLOSING function. Requires a variable named `context` to be in scope at
// the expansion site.
// NOTE: expands to a braced block, not do { } while (0), so a trailing ';'
// after the macro forms an extra empty statement (matters in unbraced if/else).
#define ERROR_CHECK_OBJECT(obj) { vx_status status = vxGetStatus((vx_reference)(obj)); if(status != VX_SUCCESS) { vxAddLogEntry((vx_reference)context, status     , "ERROR: failed with status = (%d) at " __FILE__ "#%d\n", status, __LINE__); return status; } }
// ERROR_CHECK_STATUS(call): evaluates `call` once; if the resulting vx_status
// is not VX_SUCCESS, prints the status with file and line to stdout and returns
// -1 from the ENCLOSING function. Same braced-block caveat as above.
#define ERROR_CHECK_STATUS(call) { vx_status status = (call); if(status != VX_SUCCESS) { printf("ERROR: failed with status = (%d) at " __FILE__ "#%d\n", status, __LINE__); return -1; } }

// Log callback registered with vxRegisterLogCallback(): writes the message to
// stdout, guarantees it ends with a newline, and flushes so log lines show up
// immediately. The context, ref and status arguments are not used.
// Empty messages are ignored entirely (no output, no flush).
static void VX_CALLBACK log_callback(vx_context context, vx_reference ref, vx_status status, const vx_char string[])
{
    const size_t msgLen = strlen(string);
    if (msgLen == 0)
        return;

    // Append a line break only when the message does not already carry one.
    if (string[msgLen - 1] == '\n')
        printf("%s", string);
    else
        printf("%s\n", string);

    fflush(stdout);
}

// Current reading of std::chrono::high_resolution_clock as a raw tick count
// (ticks elapsed since that clock's epoch). Divide a difference of two
// readings by clockFrequency() to convert it to seconds.
inline int64_t clockCounter()
{
    using Clock = std::chrono::high_resolution_clock;

    const Clock::time_point currentTime = Clock::now();
    const Clock::duration sinceEpoch = currentTime.time_since_epoch();
    return sinceEpoch.count();
}

// Number of std::chrono::high_resolution_clock ticks per second, i.e. the
// unit of the values returned by clockCounter().
inline int64_t clockFrequency()
{
    using TickPeriod = std::chrono::high_resolution_clock::period;

    // The period is the ratio num/den seconds per tick, so ticks per second
    // is den/num (integer division).
    return TickPeriod::den / TickPeriod::num;
}

// Prints the command-line usage guide to stdout: the required options
// (--mode, an input source, --model_weights, --label, --model_input_dims,
// --model_output_dims) followed by the optional ones and their defaults.
// The default shown for --model_name matches the value main() actually
// starts with ("NN-Model").
static void show_usage()
{
    printf(
            "\n"
            "Usage:\n\n"
            "./classifier"
            "\t--mode\t\t\t\t<1/2/3 - 1:classification 2:detection 3:segmentation>\t[required]\n"
            "\t\t--video/--capture/--image\t<video file>/<0>/<image file>\t\t\t\t[required]\n"
            "\t\t--model_weights\t\t\t<model_weights.bin>\t\t\t\t\t[required]\n"
            "\t\t--label\t\t\t\t<label text>\t\t\t\t\t\t[required]\n"
            "\t\t--model_input_dims\t\t<c,h,w - channel,height,width>\t\t\t\t[required]\n"
            "\t\t--model_output_dims\t\t<c,h,w - channel,height,width>\t\t\t\t[required]\n\n"
            "\t\t--model_name\t\t\t<model name>\t\t\t\t\t[optional - default:NN-Model]\n"
            "\t\t--add\t\t\t\t<Ax,Ay,Az - input preprocessing factor>\t\t[optional - default:0,0,0]\n"
            "\t\t--multiply\t\t\t<Mx,My,Mz - input preprocessing factor>\t\t[optional - default:1,1,1]\n\n"
            "\n[usage help]\t--help/--h\n"
            "\n"
        );
}

int main(int argc, const char ** argv)
{
    // check command-line usage   
    std::string modelWeights_str = "empty";
    std::string modeType = "empty"; 
    std::string videoFile = "empty";
    std::string imageFile = "empty";
    std::string labelFileName = "empty";
    std::string modelInputs = "empty";
    std::string modelOutputs = "empty";
    std::string NN_ModelName = "NN-Model";
    std::string labelText[10000];
    std::string preprocessAdd = "empty";
    std::string preprocessMultiply = "empty";

    int captureID = -1;
    bool captureFromVideo = false;
    bool imageFileInput = false;
    int input_c, input_h, input_w;
    int output_c, output_h, output_w ;
    float Mx = 1, My = 1 , Mz = 1;
    float Ax = 0, Ay = 0, Az = 0;
    int classes;
    
    int parameter = 0;
    vx_status status = 0;

    bool modeType_bool = false, model_weights_bool = false, label_bool = false, runType_bool = false, model_inputs_bool = false, model_outputs_bool = false;

    for(int arg = 1; arg < argc; arg++)
    {
        if (!strcasecmp(argv[arg], "--help") || !strcasecmp(argv[arg], "--H") || !strcasecmp(argv[arg], "--h"))
        {
            show_usage();
            exit(status);
        }
        else if (!strcasecmp(argv[arg], "--model_weights"))
        {
            if ((arg + 1) == argc)
            {
                printf("\n\nERROR: missing model weights .bin file location on command-line (see help for details)\n\n\n");
                show_usage();
                status = -1;
                exit(status);
            }
            arg++;
            modelWeights_str = (argv[arg]);
            parameter++;
            model_weights_bool = true;
        }
        else if (!strcasecmp(argv[arg], "--label"))
        {
            if ((arg + 1) == argc)
            {
                printf("\n\nERROR: missing label.txt file on command-line (see help for details)\n\n\n");
                show_usage();
                status = -1;
                exit(status);
            }
            arg++;
            labelFileName = (argv[arg]);
            std::string line;
            std::ifstream out(labelFileName);
            int lineNum = 0;
            while(getline(out, line)) {
                labelText[lineNum] = line;
                lineNum++;
            }
            classes = lineNum;
            out.close();
            parameter++;
            label_bool = true;
        }
        else if (!strcasecmp(argv[arg], "--video"))
        {
            if ((arg + 1) == argc)
            {
                printf("\n\nERROR: missing video file on command-line (see help for details)\n\n\n");
                show_usage();
                status = -1;
                exit(status);
            }
            arg++;
            videoFile = (argv[arg]);
            captureFromVideo = true;
            parameter++;
            runType_bool = true;
        }
        else if (!strcasecmp(argv[arg], "--image"))
        {
            if ((arg + 1) == argc)
            {
                printf("\n\nERROR: missing image file on command-line (see help for details)\n\n\n");
                show_usage();
                status = -1;
                exit(status);
            }
            arg++;
            imageFile = (argv[arg]);
            imageFileInput = true;
            parameter++;
            runType_bool = true;
        }
        else if (!strcasecmp(argv[arg], "--capture"))
        {
            if ((arg + 1) == argc)
            {
                printf("\n\nERROR: missing camera source on command-line (see help for details)\n\n\n");
                show_usage();
                status = -1;
                exit(status);
            }
            arg++;
            captureID = atoi(argv[arg]);
            parameter++;
            runType_bool = true;
        }
        else if(!strcasecmp(argv[arg], "--mode"))
        {
            if ((arg + 1) == argc)
            {
                printf("\n\nERROR: missing mode number on command-line (see help for details)\n\n\n");
                show_usage();
                status = -1;
                exit(status);
            }
            arg++;
            modeType = (argv[arg]);
            parameter++;
            modeType_bool = true;
        }
        else if (!strcasecmp(argv[arg], "--model_name"))
        {
            if ((arg + 1) == argc)
            {
                printf("\n\nERROR: missing model name on command-line (see help for details)\n\n\n");
                show_usage();
                status = -1;
                exit(status);
            }
            arg++;
            NN_ModelName = (argv[arg]);
        }
        else if (!strcasecmp(argv[arg], "--model_input_dims"))
        {
            if ((arg + 1) == argc)
            {
                printf("\n\nERROR: missing model inputs on command-line (see help for details)\n\n\n");
                show_usage();
                status = -1;
                exit(status);
            }
            arg++;
            modelInputs = (argv[arg]);
            
            std::vector<int> vect;
            std::stringstream ss(modelInputs);
            int i;
            while (ss >> i)
            {
                vect.push_back(i);
                if (ss.peek() == ',')
                    ss.ignore();
            }
            input_c = vect.at(0);
            input_h = vect.at(1);
            input_w = vect.at(2);
            parameter++;         
            model_inputs_bool = true;   
        }
        else if (!strcasecmp(argv[arg], "--model_output_dims"))
        {
            if ((arg + 1) == argc)
            {
                printf("\n\nERROR: missing model outputs on command-line (see help for details)\n\n\n");
                show_usage();
                status = -1;
                exit(status);
            }
            arg++;
            modelOutputs = (argv[arg]);
            
            std::vector<int> vect;
            std::stringstream ss(modelOutputs);
            int i;
            while (ss >> i)
            {
                vect.push_back(i);
                if (ss.peek() == ',')
                    ss.ignore();
            }
            output_c = vect.at(0);
            output_h = vect.at(1);
            output_w = vect.at(2);
            parameter++;
            model_outputs_bool = true;
        }
        else if (!strcasecmp(argv[arg], "--add"))
        {
            if ((arg + 1) == argc)
            {
                printf("\n\nERROR: missing model inputs on command-line (see help for details)\n\n\n");
                show_usage();
                status = -1;
                exit(status);
            }
            arg++;
            preprocessAdd = (argv[arg]);
            
            std::vector<float> vect;
            std::stringstream ss(preprocessAdd);
            float i;
            while (ss >> i)
            {
                vect.push_back(i);
                if (ss.peek() == ',')
                    ss.ignore();
            }
            Ax = vect.at(0);
            Ay = vect.at(1);
            Az = vect.at(2);
        }
        else if (!strcasecmp(argv[arg], "--multiply"))
        {
            if ((arg + 1) == argc)
            {
                printf("\n\nERROR: missing model inputs on command-line (see help for details)\n\n\n");
                show_usage();
                status = -1;
                exit(status);
            }
            arg++;
            preprocessMultiply = (argv[arg]);
            
            std::vector<float> vect;
            std::stringstream ss(preprocessMultiply);
            float i;
            while (ss >> i)
            {
                vect.push_back(i);
                if (ss.peek() == ',')
                    ss.ignore();
            }
            Mx = vect.at(0);
            My = vect.at(1);
            Mz = vect.at(2);
        }
    }
    
    if (parameter < 6)
    {
        if(modeType_bool == false)
            printf("\nERROR: missing parameter in command-line: mode type.\n");
        if(model_weights_bool == false)
            printf("\nERROR: missing parameter in command-line: model weights.\n");
        if(label_bool == false)
            printf("\nERROR: missing parameter in command-line: label file.\n");
        if(runType_bool == false)
            printf("\nERROR: missing parameter in command-line: image/video/capture.\n");
        if(model_inputs_bool == false)
            printf("\nERROR: missing parameter in command-line: model input dimensions (c,h,w).\n");
        if(model_outputs_bool == false)
            printf("\nERROR: missing parameter in command-line: model output dimensions (c,h,w).\n");
        show_usage();
        status = -1;
        exit(status);
    }

    // create context, input, output, and graph
    vxRegisterLogCallback(NULL, log_callback, vx_false_e);
    vx_context context = vxCreateContext();
    status = vxGetStatus((vx_reference)context);
    if(status) {
        printf("ERROR: vxCreateContext() failed\n");
        return -1;
    }
    vxRegisterLogCallback(context, log_callback, vx_false_e);

    // creation of graphs
    vx_graph model_graph = vxCreateGraph(context);
    status = vxGetStatus((vx_reference)model_graph);
    if(status) {
        printf("ERROR: vxCreateGraph(...) failed (%d)\n", status);
        return -1;
    }
    
    // create and initialize input tensor data
    vx_size dims_input_data[4] = { vx_size(input_w), vx_size(input_h), vx_size(input_c), 1 };

    // create data for different sizes
    vx_tensor input_data_tensor = vxCreateTensor(context, 4, dims_input_data, VX_TYPE_FLOAT32, 0);
    if(vxGetStatus((vx_reference)input_data_tensor)) {
        printf("ERROR: vxCreateTensor() failed for data\n");
        return -1;
    }

    // create output tensor prob
    vx_size dims_prob_data[4] = { vx_size(output_w), vx_size(output_h), vx_size(output_c), 1 };
    vx_tensor output_prob_tensor = vxCreateTensor(context, 4, dims_prob_data, VX_TYPE_FLOAT32, 0);
    if(vxGetStatus((vx_reference)output_prob_tensor)) {
        printf("ERROR: vxCreateTensor() failed for prob\n");
        return -1;
    }
    // build graph using annmodule
    int64_t freq = clockFrequency(), t0, t1;
    char modelWeights[1024];
    strcpy(modelWeights, modelWeights_str.c_str());


    // create model graph
    t0 = clockCounter();
    if(modelWeights_str != "empty"){
        status = annAddToGraph(model_graph, input_data_tensor, output_prob_tensor, modelWeights);
        if(status) {
            printf("ERROR: Model annAddToGraph() failed (%d)\n", status);
            return -1;
        }
        status = vxVerifyGraph(model_graph);
        if(status) {
            printf("ERROR: Model vxVerifyGraph(...) failed (%d)\n", status);
            return -1;
        }
        runModel = true;
    } 
    t1 = clockCounter();
    printf("OK: graph initialization with annAddToGraph() took %.3f msec\n", (float)(t1-t0)*1000.0f/(float)freq);

    // test process graph 
    t0 = clockCounter();
    status = vxProcessGraph(model_graph);
    if(status != VX_SUCCESS) {
        printf("ERROR: vxProcessGraph() failed (%d)\n", status);
        return -1;
    }
    t1 = clockCounter();
    printf("OK: vxProcessGraph() took %.3f msec (1st iteration)\n", (float)(t1-t0)*1000.0f/(float)freq);

    // avg model time on the system
    int N = 100;
    float modelTime;
    if(modelWeights_str != "empty")
    {
        t0 = clockCounter();
        for(int i = 0; i < N; i++) {
            status = vxProcessGraph(model_graph);
            if(status != VX_SUCCESS)
                break;
        }
        t1 = clockCounter();
        modelTime = (float)(t1-t0)*1000.0f/(float)freq/(float)N;
        printf("OK: NN Model took %.3f msec (average over %d iterations)\n", (float)(t1-t0)*1000.0f/(float)freq/(float)N, N);
    }

    /*****Additions for classification****/
    Classifier *mClassifier;
    float *outputBuffer;
    if(modeType == "1" or modeType == "classification") 
    {
        mClassifier= new Classifier;
        outputBuffer = new float[output_c];
    }

    /*****Additions for object detection****/
    Region *mRegion;
    if(modeType == "2" or modeType == "detection") 
        mRegion = new Region;
    float nms = 0.4;
    int targetBlockwd = 13;
    std::vector<DetectedObject> results;
    float threshold_detect = 0.18;
    
    /*****Additions for segmentation****/
    Segment *mSegment;
    int pipelineDepth = 2;
    int total_size = output_w*output_h*output_c*1;
    int input_dims[4]={0};
    input_dims[0] = 1; input_dims[1] = output_c;
    input_dims[2] = output_h; input_dims[3] = output_w;
    std::thread pipeLineThread[pipelineDepth];
    cv::Mat inputFrame[pipelineDepth];
    cv::Mat maskImage[pipelineDepth];

    int outputImgWidth = 1080, outputImgHeight = 720;
    cv::Size input_geometry = cv::Size(input_dims[3], input_dims[2]);
    cv::Size output_geometry = cv::Size(outputImgWidth, outputImgHeight);
    float *outputBuffer_seg[pipelineDepth];
    unsigned char *classIDBuf[pipelineDepth];
    float *prob[pipelineDepth];
    if (modeType == "3" or modeType == "segmentation"){
        mSegment  = new Segment;
        for(int p = 0; p < pipelineDepth; p++){
            outputBuffer_seg[p] = new float[total_size];
            classIDBuf[p] = new unsigned char[output_w * output_h];
            prob[p] = new float[output_w * output_h];
            maskImage[p].create(input_geometry, CV_8UC3);
        }
    }

    // Time per frame
    modelTime_g = modelTime;

    // define variables for run
    cv::Mat frame, img_cp;
    cv::Mat inputFrame_data_resized;
    int loopSeg = 1;
    
    //mode - image
    if(imageFileInput == true)
    {
        frame = cv::imread(imageFile);
        if (frame.empty()) {
            std::cout << "Unable to open the image: " << imageFile << std::endl;
            exit(1);
        }
        img_cp = frame.clone();
        int pipelinePointer = 0;

        if(modeType == "1" or modeType == "classification" or modeType == "2" or modeType == "detection")
            cv::resize(frame, inputFrame_data_resized, cv::Size(input_h,input_w));
        else if(modeType == "3" or modeType == "segmentation")
            cv::resize(frame, inputFrame[pipelinePointer], cv::Size(2048,1024));

        vx_enum usage = VX_WRITE_ONLY;
        vx_enum data_type = VX_TYPE_FLOAT32;
        vx_size num_of_dims = 4, dims[4] = { 1, 1, 1, 1 }, stride[4];
        vx_map_id map_id;
        float * ptr;
        vx_size count;

        // copy - image tensor
        if(runModel)
        {
            vxQueryTensor(input_data_tensor, VX_TENSOR_DATA_TYPE, &data_type, sizeof(data_type));
            vxQueryTensor(input_data_tensor, VX_TENSOR_NUMBER_OF_DIMS, &num_of_dims, sizeof(num_of_dims));
            vxQueryTensor(input_data_tensor, VX_TENSOR_DIMS, &dims, sizeof(dims[0])*num_of_dims);
            if(data_type != VX_TYPE_FLOAT32) {
                std::cerr << "ERROR: copyTensor() supports only VX_TYPE_FLOAT32: invalid for " <<  std::endl;
                return -1;
            }
            count = dims[0] * dims[1] * dims[2] * dims[3];

            vx_status status = vxMapTensorPatch(input_data_tensor, num_of_dims, nullptr, nullptr, &map_id, stride, (void **)&ptr, usage, VX_MEMORY_TYPE_HOST);
            if(status) {
                std::cerr << "ERROR: vxMapTensorPatch() failed for " <<  std::endl;
                return -1;
            }

            cv::Mat srcImg;
            float *dstR, *dstG, *dstB;
            for(size_t n = 0; n < dims[3]; n++) {
                if(modeType == "1" or modeType == "classification" or modeType == "2" or modeType == "detection")
                    srcImg = inputFrame_data_resized;
                else if(modeType == "3" or modeType == "segmentation")
                    srcImg = inputFrame[pipelinePointer];
                
                if (dims[2] == 1) {
                    cv::cvtColor(srcImg, srcImg, CV_BGR2GRAY);
                }
                for(vx_size y = 0; y < dims[1]; y++) {
                    unsigned char * src = srcImg.data + y*dims[0]*dims[2];
                    dstR = ptr + ((n * stride[3] + y * stride[1]) >> 2);
                    if (dims[2] == 3) {
                        dstG = dstR + (stride[2] >> 2);
                        dstB = dstG + (stride[2] >> 2);
                    }
                    for(vx_size x = 0; x < dims[0]; x++, src += dims[2]) {
                        if(dims[2] == 3)
                        {
                            *dstR++ = (src[2] * Mx) + Ax;
                            *dstG++ = (src[1] * My) + Ay;
                            *dstB++ = (src[0] * Mz) + Az;
                        }
                        else if(dims[2] == 1)
                        {
                            *dstR++ = (src[2] * Mx) + Ax;
                        }
                    }
                }
            }
            status = vxUnmapTensorPatch(input_data_tensor, map_id);
            if(status) {
                std::cerr << "ERROR: vxUnmapTensorPatch() failed for " <<  std::endl;
                return -1;
            }
        }
        
        // process graph for the input           
        if(runModel)
        {
            t0 = clockCounter();
            status = vxProcessGraph(model_graph);
            if(status != VX_SUCCESS) std::cerr << "ERROR: vxProcessGraph() failed"  << std::endl;;
            t1 = clockCounter();
            modelTime_g = (float)(t1-t0)*1000.0f/(float)freq;
            //printf("LIVE: Process Resnet50 Classification Time -- %.3f msec\n", (float)(t1-t0)*1000.0f/(float)freq);
        }
            
        usage = VX_READ_ONLY;
        if(runModel)
        {
            if(modeType == "1" or modeType == "classification")
            {
                vxQueryTensor(output_prob_tensor, VX_TENSOR_DATA_TYPE, &data_type, sizeof(data_type));
                vxQueryTensor(output_prob_tensor, VX_TENSOR_NUMBER_OF_DIMS, &num_of_dims, sizeof(num_of_dims));
                vxQueryTensor(output_prob_tensor, VX_TENSOR_DIMS, &dims, sizeof(dims[0])*num_of_dims);
                if(data_type != VX_TYPE_FLOAT32) {
                    std::cerr << "ERROR: copyTensor() supports only VX_TYPE_FLOAT32: invalid for "  << std::endl;
                    return -1;
                }
                count = dims[0] * dims[1] * dims[2] * dims[3];
                status = vxMapTensorPatch(output_prob_tensor, num_of_dims, nullptr, nullptr, &map_id, stride, (void **)&ptr, usage, VX_MEMORY_TYPE_HOST);
                if(status) {
                    std::cerr << "ERROR: vxMapTensorPatch() failed for "  << std::endl;
                    return -1;
                }
                memcpy(outputBuffer, ptr, (count*sizeof(float)));
                status = vxUnmapTensorPatch(output_prob_tensor, map_id);
                if(status) {
                    std::cerr << "ERROR: vxUnmapTensorPatch() failed for "  << std::endl;
                    return -1;
                }   

                mClassifier->visualize(img_cp, output_c, outputBuffer, NN_ModelName, labelText, modelTime_g);
                cv::waitKey(0);
            }
            else if(modeType == "2" or modeType == "detection")
            {
                vxQueryTensor(output_prob_tensor, VX_TENSOR_DATA_TYPE, &data_type, sizeof(data_type));
                vxQueryTensor(output_prob_tensor, VX_TENSOR_NUMBER_OF_DIMS, &num_of_dims, sizeof(num_of_dims));
                vxQueryTensor(output_prob_tensor, VX_TENSOR_DIMS, &dims, sizeof(dims[0])*num_of_dims);
                if(data_type != VX_TYPE_FLOAT32) {
                    std::cerr << "ERROR: copyTensor() supports only VX_TYPE_FLOAT32: invalid for "  << std::endl;
                    return -1;
                }
                count = dims[0] * dims[1] * dims[2] * dims[3];
                status = vxMapTensorPatch(output_prob_tensor, num_of_dims, nullptr, nullptr, &map_id, stride, (void **)&ptr, usage, VX_MEMORY_TYPE_HOST);
                if(status) {
                    std::cerr << "ERROR: vxMapTensorPatch() failed for "  << std::endl;
                    return -1;
                }

                //call detect function for boxes!!
                mRegion->GetDetections(img_cp, ptr, (int)output_c, (int)output_h, (int)output_w, classes, frame.cols, frame.rows, threshold_detect, nms, targetBlockwd, results, labelText);

                status = vxUnmapTensorPatch(output_prob_tensor, map_id);
                if(status) {
                    std::cerr << "ERROR: vxUnmapTensorPatch() failed for "  << std::endl;
                    return -1;
                }
                cv::waitKey(0);
            }

            else if (modeType == "3" or modeType == "segmentation")
            {
            	if(runModel)
                {
                    t0 = clockCounter();
                    usage = VX_READ_ONLY;
                    vx_enum data_type = VX_TYPE_FLOAT32;
                    vx_size num_of_dims = 4, dims[4] = { 1, 1, 1, 1 }, stride[4];
                    vx_map_id map_id;
                    float * ptr;
                    vx_size count;
                    vx_enum usage = VX_READ_ONLY;
                    vxQueryTensor(output_prob_tensor, VX_TENSOR_DATA_TYPE, &data_type, sizeof(data_type));
                    vxQueryTensor(output_prob_tensor, VX_TENSOR_NUMBER_OF_DIMS, &num_of_dims, sizeof(num_of_dims));
                    vxQueryTensor(output_prob_tensor, VX_TENSOR_DIMS, &dims, sizeof(dims[0])*num_of_dims);
                    if(data_type != VX_TYPE_FLOAT32) {
                        std::cerr << "ERROR: copyTensor() supports only VX_TYPE_FLOAT32: invalid for "  << std::endl;
                    }
                    count = dims[0] * dims[1] * dims[2] * dims[3];
                    vx_status status = vxMapTensorPatch(output_prob_tensor, num_of_dims, nullptr, nullptr, &map_id, stride, (void **)&ptr, usage, VX_MEMORY_TYPE_HOST);
                    if(status) {
                        std::cerr << "ERROR: vxMapTensorPatch() failed for "  << std::endl;
                    }
                    memcpy(outputBuffer_seg[pipelinePointer], ptr, (count*sizeof(float)));
                    status = vxUnmapTensorPatch(output_prob_tensor, map_id);
                    if(status) {
                        std::cerr << "ERROR: vxUnmapTensorPatch() failed for "  << std::endl;
                    }
                
                    mSegment->getMaskImage(img_cp, input_dims, prob[pipelinePointer], classIDBuf[pipelinePointer], outputBuffer_seg[pipelinePointer], input_geometry, maskImage[pipelinePointer], labelText);
                    cv::waitKey(0);
                }
            }
        }
    }
    //mode - video/ live capture
    else
    {
        while(argc && loopSeg)
        {
            cv::VideoCapture cap;
            if (captureFromVideo) {
                cap.open(videoFile);
                if(!cap.isOpened()) {
                    std::cout << "Unable to open the video: " << videoFile << std::endl;
                    return 0;
                }
            }
            else {
                cap.open(captureID);
                if(!cap.isOpened()) {
                    std::cout << "Unable to open the camera feed: " << captureID << std::endl;
                    return 0;
                }
            }

            int frameCount = 0;
            float msFrame = 0, fpsAvg = 0, frameMsecs = 0;
            int pipelinePointer = -1;

            for(;;)
            {
                if(modeType == "3" or modeType == "segmentation")
                {
                    // find pipeline pointer number as a variable of pipeline depth
                    if((frameCount%pipelineDepth) == 0) 
                        pipelinePointer = 0; 
                    else 
                        pipelinePointer = 1;
                }

                msFrame = 0;
                // capture image frame
                t0 = clockCounter();
                cap >> frame;
                img_cp = frame.clone();
                if( frame.empty() ) break; // end of video stream
                t1 = clockCounter();
                msFrame += (float)(t1-t0)*1000.0f/(float)freq;
                //printf("\n\nLIVE: OpenCV Frame Capture Time -- %.3f msec\n", (float)(t1-t0)*1000.0f/(float)freq);

                // preprocess image frame
                t0 = clockCounter();
                if(modeType == "1" or modeType == "classification" or modeType == "2" or modeType == "detection")

                    cv::resize(frame, inputFrame_data_resized, cv::Size(input_h,input_w));
                else if(modeType == "3" or modeType == "segmentation")
                     cv::resize(frame, inputFrame[pipelinePointer], cv::Size(2048,1024));

                t1 = clockCounter();
                msFrame += (float)(t1-t0)*1000.0f/(float)freq;
                //printf("LIVE: OpenCV Frame Resize Time -- %.3f msec\n", (float)(t1-t0)*1000.0f/(float)freq);

                // Copy Image frame into the input tensor
                t0 = clockCounter();
                vx_enum usage = VX_WRITE_ONLY;
                vx_enum data_type = VX_TYPE_FLOAT32;
                vx_size num_of_dims = 4, dims[4] = { 1, 1, 1, 1 }, stride[4];
                vx_map_id map_id;
                float * ptr;
                vx_size count;

                // copy - 224x224 image tensor
                if(runModel)
                {
                    vxQueryTensor(input_data_tensor, VX_TENSOR_DATA_TYPE, &data_type, sizeof(data_type));
                    vxQueryTensor(input_data_tensor, VX_TENSOR_NUMBER_OF_DIMS, &num_of_dims, sizeof(num_of_dims));
                    vxQueryTensor(input_data_tensor, VX_TENSOR_DIMS, &dims, sizeof(dims[0])*num_of_dims);
                    if(data_type != VX_TYPE_FLOAT32) {
                        std::cerr << "ERROR: copyTensor() supports only VX_TYPE_FLOAT32: invalid for " <<  std::endl;
                        return -1;
                    }
                    count = dims[0] * dims[1] * dims[2] * dims[3];

                    vx_status status = vxMapTensorPatch(input_data_tensor, num_of_dims, nullptr, nullptr, &map_id, stride, (void **)&ptr, usage, VX_MEMORY_TYPE_HOST);
                    if(status) {
                        std::cerr << "ERROR: vxMapTensorPatch() failed for " <<  std::endl;
                        return -1;
                    }
                    cv::Mat srcImg;
                    float *dstR, *dstG, *dstB;
                    for(size_t n = 0; n < dims[3]; n++) {
                        if(modeType == "1" or modeType == "classification" or modeType == "2" or modeType == "detection")

                            srcImg = inputFrame_data_resized;
                        else if(modeType == "3" or modeType == "segmentation")
                             srcImg = inputFrame[pipelinePointer];

                        if (dims[2] == 1) {
                            cv::cvtColor(srcImg, srcImg, CV_BGR2GRAY);
                        }
                        for(vx_size y = 0; y < dims[1]; y++) {
                            unsigned char * src = srcImg.data + y*dims[0]*dims[2];
                            dstR = ptr + ((n * stride[3] + y * stride[1]) >> 2);
                            if (dims[2] == 3) {
                                dstG = dstR + (stride[2] >> 2);
                                dstB = dstG + (stride[2] >> 2);
                            }
                            for(vx_size x = 0; x < dims[0]; x++, src += dims[2]) {
                                   if(dims[2] == 3)
                                    {
                                        *dstR++ = (src[2] * Mx) + Ax;
                                        *dstG++ = (src[1] * My) + Ay;
                                        *dstB++ = (src[0] * Mz) + Az;
                                    }
                                    else if(dims[2] == 1)
                                    {
                                        *dstR++ = (src[2] * Mx) + Ax;
                                    }
                            }
                        }
                    }
                    status = vxUnmapTensorPatch(input_data_tensor, map_id);
                    if(status) {
                        std::cerr << "ERROR: vxUnmapTensorPatch() failed for " <<  std::endl;
                        return -1;
                    }
                }
                t1 = clockCounter();
                msFrame += (float)(t1-t0)*1000.0f/(float)freq;
                //printf("LIVE: Convert Image to Tensor Time -- %.3f msec\n", (float)(t1-t0)*1000.0f/(float)freq);
       
                // process graph for the input           
                if(runModel)
                {
                    t0 = clockCounter();
                    status = vxProcessGraph(model_graph);
                    if(status != VX_SUCCESS) break;
                    t1 = clockCounter();
                    modelTime_g = (float)(t1-t0)*1000.0f/(float)freq;
                    msFrame += (float)(t1-t0)*1000.0f/(float)freq;
                    //printf("LIVE: Process Resnet50 Classification Time -- %.3f msec\n", (float)(t1-t0)*1000.0f/(float)freq);
                }

                if(modeType == "1" or modeType == "classification")
                {   // copy output data into local buffer
                    t0 = clockCounter();
                    usage = VX_READ_ONLY;
                    if(runModel)
                    {
                        vxQueryTensor(output_prob_tensor, VX_TENSOR_DATA_TYPE, &data_type, sizeof(data_type));
                        vxQueryTensor(output_prob_tensor, VX_TENSOR_NUMBER_OF_DIMS, &num_of_dims, sizeof(num_of_dims));
                        vxQueryTensor(output_prob_tensor, VX_TENSOR_DIMS, &dims, sizeof(dims[0])*num_of_dims);
                        if(data_type != VX_TYPE_FLOAT32) {
                            std::cerr << "ERROR: copyTensor() supports only VX_TYPE_FLOAT32: invalid for "  << std::endl;
                            return -1;
                        }
                        count = dims[0] * dims[1] * dims[2] * dims[3];
                        status = vxMapTensorPatch(output_prob_tensor, num_of_dims, nullptr, nullptr, &map_id, stride, (void **)&ptr, usage, VX_MEMORY_TYPE_HOST);
                        if(status) {
                            std::cerr << "ERROR: vxMapTensorPatch() failed for "  << std::endl;
                            return -1;
                        }
                        memcpy(outputBuffer, ptr, (count*sizeof(float)));
                        status = vxUnmapTensorPatch(output_prob_tensor, map_id);
                        if(status) {
                            std::cerr << "ERROR: vxUnmapTensorPatch() failed for "  << std::endl;
                            return -1;
                        }
                    }
                    t1 = clockCounter();
                    msFrame += (float)(t1-t0)*1000.0f/(float)freq;
                    //printf("LIVE: Copy probability Output Time -- %.3f msec\n", (float)(t1-t0)*1000.0f/(float)freq);
        
                    // visualize the classification output
                    t0 = clockCounter();
                    mClassifier->visualize(img_cp, output_c, outputBuffer, NN_ModelName, labelText, modelTime_g);
                    t1 = clockCounter();
                    msFrame += (float)(t1-t0)*1000.0f/(float)freq;
                    //printf("LIVE: Get Classification Results Time -- %.3f msec\n", (float)(t1-t0)*1000.0f/(float)freq);
                }
                else if(modeType == "2" or modeType == "detection")
                {  
                    t0 = clockCounter();
                    usage = VX_READ_ONLY;
                    if(runModel)
                    {
                        vxQueryTensor(output_prob_tensor, VX_TENSOR_DATA_TYPE, &data_type, sizeof(data_type));
                        vxQueryTensor(output_prob_tensor, VX_TENSOR_NUMBER_OF_DIMS, &num_of_dims, sizeof(num_of_dims));
                        vxQueryTensor(output_prob_tensor, VX_TENSOR_DIMS, &dims, sizeof(dims[0])*num_of_dims);
                        if(data_type != VX_TYPE_FLOAT32) {
                            std::cerr << "ERROR: copyTensor() supports only VX_TYPE_FLOAT32: invalid for "  << std::endl;
                            return -1;
                        }
                        count = dims[0] * dims[1] * dims[2] * dims[3];
                        status = vxMapTensorPatch(output_prob_tensor, num_of_dims, nullptr, nullptr, &map_id, stride, (void **)&ptr, usage, VX_MEMORY_TYPE_HOST);
                        if(status) {
                            std::cerr << "ERROR: vxMapTensorPatch() failed for "  << std::endl;
                            return -1;
                        }
                        
                        //call detect function for boxes!!
                        mRegion->GetDetections(img_cp, ptr, (int)output_c, (int)output_h, (int)output_w, classes, frame.cols, frame.rows, threshold_detect, nms, targetBlockwd, results, labelText);

                        status = vxUnmapTensorPatch(output_prob_tensor, map_id);
                        if(status) {
                            std::cerr << "ERROR: vxUnmapTensorPatch() failed for "  << std::endl;
                            return -1;
                        }
                    }
                    t1 = clockCounter();
                    msFrame += (float)(t1-t0)*1000.0f/(float)freq;
                }

                else if(modeType == "3" or modeType == "segmentation")
                {
                    if(runModel)
                    {
                        t0 = clockCounter();
                        usage = VX_READ_ONLY;
                        vx_enum data_type = VX_TYPE_FLOAT32;
    				    vx_size num_of_dims = 4, dims[4] = { 1, 1, 1, 1 }, stride[4];
    				    vx_map_id map_id;
    				    float * ptr;
    				    vx_size count;
    				    vx_enum usage = VX_READ_ONLY;
    				    vxQueryTensor(output_prob_tensor, VX_TENSOR_DATA_TYPE, &data_type, sizeof(data_type));
    				    vxQueryTensor(output_prob_tensor, VX_TENSOR_NUMBER_OF_DIMS, &num_of_dims, sizeof(num_of_dims));
    				    vxQueryTensor(output_prob_tensor, VX_TENSOR_DIMS, &dims, sizeof(dims[0])*num_of_dims);
    				    if(data_type != VX_TYPE_FLOAT32) {
    				        std::cerr << "ERROR: copyTensor() supports only VX_TYPE_FLOAT32: invalid for "  << std::endl;
    				    }
    				    count = dims[0] * dims[1] * dims[2] * dims[3];
    				    vx_status status = vxMapTensorPatch(output_prob_tensor, num_of_dims, nullptr, nullptr, &map_id, stride, (void **)&ptr, usage, VX_MEMORY_TYPE_HOST);
    				    if(status) {
    				        std::cerr << "ERROR: vxMapTensorPatch() failed for "  << std::endl;
    				    }
    				    memcpy(outputBuffer_seg[pipelinePointer], ptr, (count*sizeof(float)));
    				    status = vxUnmapTensorPatch(output_prob_tensor, map_id);
    				    if(status) {
    				        std::cerr << "ERROR: vxUnmapTensorPatch() failed for "  << std::endl;
    				    }
                    
                        mSegment->getMaskImage(img_cp, input_dims, prob[pipelinePointer], classIDBuf[pipelinePointer], outputBuffer_seg[pipelinePointer], input_geometry, maskImage[pipelinePointer], labelText);
                    }
                    
                    t1 = clockCounter();
                    msFrame += (float)(t1-t0)*1000.0f/(float)freq;
                }


                // calculate FPS
                //printf("LIVE: msec for frame -- %.3f msec\n", (float)msFrame);
                frameMsecs += msFrame;
                if(frameCount && frameCount%10 == 0){
                    printf("FPS LIVE: Avg FPS -- %d\n", (int)((ceil)(1000/(frameMsecs/10))));
                    frameMsecs = 0;
                }

                // wait to close live inference application
                int key = cv::waitKey(1);
                //if( cv::waitKey(2) == 27 ){ loopSeg = 0; break; } // stop capturing by pressing ESC
                if((key & 255) == 27) { loopSeg = 0; break; }
                if( (key & 255) == 32) { int key = cv::waitKey(0); if((key&255) == 32) { continue; }} //press space bar for pause/play frame
                if( (key & 255) == 114 ){ break; } // for restart pressing R
                frameCount++;
            }  
        }
    }
   
    
    // release resources;
    if(modeType == "1" or modeType == "classification")
    {
        delete mClassifier;
        delete [] outputBuffer;  
    }
    else if(modeType == "2" or modeType == "detection")
        delete mRegion;
    else if(modeType == "3" or modeType == "segmentation")
    {
        delete mSegment;
        for(int p = 0; p < pipelineDepth; p++){
        delete outputBuffer_seg[p];
        delete classIDBuf[p];
        delete prob[p];
        }
    }
    // release input data
    ERROR_CHECK_STATUS(vxReleaseTensor(&input_data_tensor));
    // release output data
    ERROR_CHECK_STATUS(vxReleaseTensor(&output_prob_tensor));
    // release graphs
    ERROR_CHECK_STATUS(vxReleaseGraph(&model_graph));
    // release context
    ERROR_CHECK_STATUS(vxReleaseContext(&context));

    printf("OK: MIVisionX Classifier Successful\n");
    return 0;
}
