#include "circular_buffer.h"
#include "log.h"
CircularBuffer::CircularBuffer(DeviceResources ocl, size_t buffer_depth):
        BUFF_DEPTH(buffer_depth),
        _cl_cmdq(ocl.cmd_queue),
        _cl_context(ocl.context),
        _device_id(ocl.device_id),
        _dev_buffer(BUFF_DEPTH, nullptr),
        _host_buffer_ptrs(BUFF_DEPTH, nullptr),
        _actual_host_buffers(BUFF_DEPTH),
        _write_ptr(0),
        _read_ptr(0),
        _level(0)
{
    for(size_t bufIdx = 0; bufIdx < BUFF_DEPTH; bufIdx++)
        _dev_buffer[bufIdx] = nullptr;

}


void CircularBuffer::cancel_reading()
{
    // Wake up the reader thread in case it's waiting for a load
    _wait_for_load.notify_one();
}

void CircularBuffer::cancel_writing()
{
    // Wake up the writer thread in case it's waiting for an unload
    _wait_for_unload.notify_one();
}


cl_mem CircularBuffer::get_read_buffer_dev()
{
    wait_if_empty();
    return _dev_buffer[_read_ptr];
}

unsigned char* CircularBuffer::get_read_buffer_host()
{
    wait_if_empty();
    return _host_buffer_ptrs[_read_ptr];
}

unsigned char*  CircularBuffer::get_write_buffer()
{
    wait_if_full();
    return(_host_buffer_ptrs[_write_ptr]);
}

void CircularBuffer::sync()
{
    cl_int err = CL_SUCCESS;
    if(_output_mem_type== RaliMemType::OCL) 
    {
#if 0
        if(clEnqueueWriteBuffer(_cl_cmdq, _dev_sub_buffer[_write_ptr], CL_TRUE, 0, _output_mem_size, _host_buffer_ptrs[_write_ptr], 0, NULL, NULL) != CL_SUCCESS)
            THROW("clEnqueueMapBuffer of size "+ TOSTR(_output_mem_size) + " failed " + TOSTR(err));

#else        
        //NOTE: instead of calling clEnqueueWriteBuffer (shown above), 
        // an unmap/map cen be done to make sure data is copied from the host to device, it's fast
        //NOTE: Using clEnqueueUnmapMemObject/clEnqueuenmapMemObject when buffer is allocated with 
        // CL_MEM_ALLOC_HOST_PTR adds almost no overhead
        clEnqueueUnmapMemObject(_cl_cmdq, _dev_buffer[_write_ptr], _host_buffer_ptrs[_write_ptr], 0, NULL, NULL);
        _host_buffer_ptrs[_write_ptr] = (unsigned char*) clEnqueueMapBuffer(_cl_cmdq,
                                                                            _dev_buffer[_write_ptr] ,
                                                                            CL_FALSE,
                                                                            CL_MAP_WRITE,
                                                                            0,
                                                                            _output_mem_size,
                                                                            0, NULL, NULL, &err );
        if(err)
            THROW("clEnqueueUnmapMemObject of size "+ TOSTR(_output_mem_size) + " failed " + TOSTR(err));

#endif        
    }
    else 
    {
        // For the host processing no copy is needed, since data is already loaded in the host buffers
        // and handle will be swaped on it
    }
}

void CircularBuffer::push()
{
    sync();
    increment_write_ptr();
}

void CircularBuffer::pop()
{
    increment_read_ptr();
}
void CircularBuffer::init(RaliMemType output_mem_type, size_t output_mem_size)
{
    _output_mem_type = output_mem_type;
    _output_mem_size = output_mem_size;
    if(BUFF_DEPTH < 2)
        THROW ("Error internal buffer size for the circular buffer should be greater than one")
    
    // Allocating buffers
    if(_output_mem_type== RaliMemType::OCL) 
    {
        if(_cl_cmdq == nullptr || _device_id == nullptr || _cl_context == nullptr)
            THROW("Error ocl structure needed since memory type is OCL");

        cl_int err = CL_SUCCESS;

        for(size_t buffIdx = 0; buffIdx < BUFF_DEPTH; buffIdx++)
        {
            //NOTE: we don't need to use CL_MEM_ALLOC_HOST_PTR memory if this buffer is not going to be
            // used in the host. But we cannot ensure which Rali's copy function is going to be called 
            // (copy to host or OCL) by the user
            _dev_buffer[buffIdx] = clCreateBuffer(  _cl_context, 
                                                    CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR,
                                                    _output_mem_size, NULL, &err);// Create pinned memory
            if (!_dev_buffer[buffIdx]  || err)
                THROW("clCreateBuffer of size" + TOSTR(_output_mem_size)+  "failed " + TOSTR(err));

            //TODO: we don't need to map the buffers to host here if the output of the output of this
            //  loader_module is not required by the user to be part of the augmented output
            _host_buffer_ptrs[buffIdx] = (unsigned char*) clEnqueueMapBuffer(_cl_cmdq,
                                                                             _dev_buffer[buffIdx] ,
                                                                             CL_TRUE, CL_MAP_WRITE,
                                                                             0,
                                                                             _output_mem_size,
                                                                             0, NULL, NULL, &err );
            if(err)
                THROW("clEnqueueMapBuffer of size" + TOSTR(_output_mem_size)+  "failed " + TOSTR(err));
            clRetainMemObject(_dev_buffer[buffIdx]);


        }
    } 
    else 
    {
        for(size_t buffIdx = 0; buffIdx < BUFF_DEPTH; buffIdx++)
        {
            _actual_host_buffers[buffIdx].resize(_output_mem_size);
            _host_buffer_ptrs[buffIdx] = _actual_host_buffers[buffIdx].data();
        }


    }
}

bool CircularBuffer::empty()
{
    return (_level <= 0);
}

bool CircularBuffer::full()
{
    return (_level >= BUFF_DEPTH - 1);
}

size_t CircularBuffer::level()
{
    return _level;
}

void CircularBuffer::increment_read_ptr() 
{
    std::unique_lock<std::mutex> lock(_lock);
    _read_ptr = (_read_ptr+1)%BUFF_DEPTH;
    _level--;
    lock.unlock();
    // Wake up the writer thread (in case waiting) since there is an empty spot to write to,
    _wait_for_unload.notify_all();

}

void CircularBuffer::increment_write_ptr() 
{
    std::unique_lock<std::mutex> lock(_lock);
    _write_ptr = (_write_ptr+1)%BUFF_DEPTH;
    _level++;
    lock.unlock();
    // Wake up the reader thread (in case waiting) since there is a new load to be read
    _wait_for_load.notify_all();
}

void CircularBuffer::wait_if_empty() 
{
    std::unique_lock<std::mutex> lock(_lock);
    if(empty()) 
    { // if the current read buffer is being written wait on it
        _wait_for_load.wait(lock);
    }
}

void CircularBuffer:: wait_if_full() 
{
    std::unique_lock<std::mutex> lock(_lock);
    // Write the whole buffer except for the last spot which is being read by the reader thread
    if(full()) 
    {
        _wait_for_unload.wait(lock);
    }
}

CircularBuffer::~CircularBuffer() 
{
    for(size_t buffIdx = 0; buffIdx < BUFF_DEPTH; buffIdx++) 
    {
        if(_output_mem_type== RaliMemType::OCL) 
        {
            if(clEnqueueUnmapMemObject(_cl_cmdq, _dev_buffer[buffIdx], _host_buffer_ptrs[buffIdx], 0, NULL, NULL) != CL_SUCCESS)
                ERR("Could not unmap ocl memory")
            if(clReleaseMemObject(_dev_buffer[buffIdx]) != CL_SUCCESS)
                ERR("Could not release ocl memory in the circular buffer")
        }
    }

    _dev_buffer.clear();
    _host_buffer_ptrs.clear();
    _actual_host_buffers.clear();
    _write_ptr = 0;
    _read_ptr = 0;
    _level = 0;
    _cl_cmdq = 0;
    _cl_context = 0;
    _device_id = 0;
} 


