/* 
Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
 
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
 
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/


#include "ago_internal.h"

static DWORD WINAPI agoGraphThreadFunction(LPVOID graph_)
{
	AgoGraph * graph = (AgoGraph *)graph_;
	while (WaitForSingleObject(graph->hSemToThread, INFINITE) == WAIT_OBJECT_0) {
		if (graph->threadThreadTerminationState)
			break;

		// execute graph
		graph->status = agoProcessGraph(graph);

		// inform caller
		graph->threadExecuteCount++;
		ReleaseSemaphore(graph->hSemFromThread, 1, nullptr);
	}
	// inform caller about termination
	graph->threadThreadTerminationState = 2;
	ReleaseSemaphore(graph->hSemFromThread, 1, nullptr);
	return 0;
}

AgoContext * agoCreateContextFromPlatform(struct _vx_platform * platform)
{
	CAgoLockGlobalContext lock;

	// check if CPU hardware supports
	bool isHardwareSupported = agoIsCpuHardwareSupported();
	if (!isHardwareSupported) {
		agoAddLogEntry(NULL, VX_FAILURE, "ERROR: Unsupported CPU (requires SSE 4.2)\n");
		return NULL;
	}

	// create context and initialize
	AgoContext * acontext = new AgoContext;
	if (acontext) {
		acontext->ref.platform = platform;
		agoResetReference(&acontext->ref, VX_TYPE_CONTEXT, acontext, NULL);
		acontext->ref.external_count++;
		// initialize image formats
		if (agoInitializeImageComponentsAndPlanes(acontext)) {
			delete acontext;
			return NULL;
		}
		// initialize kernels
		if (agoPublishKernels(acontext)) {
			delete acontext;
			return NULL;
		}
		// initialize thread config
		char textBuffer[1024];
		if (agoGetEnvironmentVariable("AGO_THREAD_CONFIG", textBuffer, sizeof(textBuffer))) {
			acontext->thread_config = atoi(textBuffer);
		}
	}
	return (AgoContext *)acontext;
}

int agoReleaseContext(AgoContext * acontext)
{
	CAgoLockGlobalContext lock;

	if (!agoIsValidContext(acontext))
		return -1;

	EnterCriticalSection(&acontext->cs);
	// release all the resources
	LeaveCriticalSection(&acontext->cs);
	delete acontext;
	return 0;
}

AgoGraph * agoCreateGraph(AgoContext * acontext)
{
	AgoGraph * agraph = new AgoGraph;
	if (!agraph) {
		return nullptr;
	}

	// initialize
	agoResetReference(&agraph->ref, VX_TYPE_GRAPH, acontext, NULL);
	agraph->attr_affinity = acontext->attr_affinity;
	char textBuffer[256];
	if (agoGetEnvironmentVariable("VX_GRAPH_ATTRIBUTE_AMD_OPTIMIZER_FLAGS", textBuffer, sizeof(textBuffer))) {
		if (sscanf(textBuffer, "%i", &agraph->optimizer_flags) == 1) {
			agoAddLogEntry(&agraph->ref, VX_SUCCESS, "DEBUG: VX_GRAPH_ATTRIBUTE_AMD_OPTIMIZER_FLAGS = 0x%08x\n", agraph->optimizer_flags);
		}
	}

	{ // link graph to the context
		CAgoLock lock(acontext->cs);
		agoAddGraph(&acontext->graphList, agraph);
		agraph->ref.external_count++;
	}

	if (acontext->thread_config & 1) {
		// create semaphore and thread for graph scheduling: limit 1000 pending requests
		agraph->hSemToThread = CreateSemaphore(nullptr, 0, 1000, nullptr);
		agraph->hSemFromThread = CreateSemaphore(nullptr, 0, 1000, nullptr);
		if (agraph->hSemToThread == NULL || agraph->hSemFromThread == NULL) { 
			agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: CreateSemaphore() failed\n");
			agoReleaseGraph(agraph); 
			return nullptr; 
		}
		agraph->hThread = CreateThread(NULL, 0, agoGraphThreadFunction, agraph, 0, NULL);
#if _WIN32 // TBD: need to enable this check for non-windows platforms
		if (agraph->hThread == NULL) { 
			agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: CreateThread() failed\n");
			agoReleaseGraph(agraph); 
			return nullptr; 
		}
#if _DEBUG
		agoAddLogEntry(&agraph->ref, VX_SUCCESS, "OK: enabled graph scheduling in separate threads\n");
#endif
#endif
	}

	return (AgoGraph *)agraph;
}

int agoReleaseGraph(AgoGraph * agraph)
{
	CAgoLock lock(agraph->ref.context->cs);

	int status = 0;
	agraph->ref.external_count--;
	if (agraph->ref.external_count == 0) {
		EnterCriticalSection(&agraph->cs);
		// stop graph thread
		if (agraph->hThread) {
			if (agraph->hThread) {
				agraph->threadThreadTerminationState = 1;
				ReleaseSemaphore(agraph->hSemToThread, 1, nullptr);
				while (agraph->threadThreadTerminationState == 1) {
					std::this_thread::sleep_for(std::chrono::milliseconds(1));
				}
				CloseHandle(agraph->hThread);
			}
			if (agraph->hSemToThread) {
				CloseHandle(agraph->hSemToThread);
			}
			if (agraph->hSemFromThread) {
				CloseHandle(agraph->hSemFromThread);
			}
		}
		// deinitialize the graph
		for (AgoNode * node = agraph->nodeList.head; node; node = node->next)
		{
			status = agoShutdownNode(node);
			if (status) {
				break;
			}
		}
		if (!status) {
			// remove graph from context
			if (agoRemoveGraph(&agraph->ref.context->graphList, agraph) != agraph) {
				status = -1;
				LeaveCriticalSection(&agraph->cs);
			}
			else {
#if ENABLE_OPENCL
				// Releasing the command queue for the graph because it is not needed
				agoGpuOclReleaseGraph(agraph);
#endif
				LeaveCriticalSection(&agraph->cs);
				// move graph to garbage list
				agraph->next = agraph->ref.context->graph_garbage_list;
				agraph->ref.context->graph_garbage_list = agraph;
			}
		}
		else {
			LeaveCriticalSection(&agraph->cs);
		}
	}

	return status;
}

int agoOptimizeGraph(AgoGraph * agraph)
{
	if (!agraph->status) {
		CAgoLock lock(agraph->cs);
		CAgoLock lock2(agraph->ref.context->cs);

		// run DRAMA graph optimizer
		agraph->status = agoOptimizeDrama(agraph);
	}

	return agraph->status;
}

int agoWriteGraph(AgoGraph * agraph, AgoReference * * ref, int num_ref, FILE * fp, const char * comment)
{
	CAgoLock lock(agraph->cs);
	CAgoLock lock2(agraph->ref.context->cs);

#if ENABLE_DEBUG_MESSAGES
	agoOptimizeDramaMarkDataUsage(agraph);
#endif

	bool * imported = new bool[agraph->ref.context->num_active_modules + 1];
	for (vx_uint32 i = 0; i < agraph->ref.context->num_active_modules; i++)
		imported[i] = false;
	fprintf(fp, "# ago graph dump BEGIN [%s]\n", comment ? comment : "");
	for (auto aus = agraph->ref.context->userStructList.begin(); aus != agraph->ref.context->userStructList.end(); aus++) {
		if (aus->importing_module_index_plus1) {
			if (!imported[aus->importing_module_index_plus1 - 1]) {
				fprintf(fp, "import %s\n", agraph->ref.context->modules[aus->importing_module_index_plus1 - 1].module_name);
				imported[aus->importing_module_index_plus1 - 1] = true;
			}
		}
		else {
			if (!aus->name.length()) {
				vx_char name[64];
				sprintf(name, "AUTO-USER-STRUCT!%03d!", aus->id - VX_TYPE_USER_STRUCT_START + 1);
				aus->name = name;
			}
			fprintf(fp, "type %s userstruct:" VX_FMT_SIZE "\n", aus->name.c_str(), aus->size);
		}
	}
	for (AgoKernel * akernel = agraph->ref.context->kernelList.head; akernel; akernel = akernel->next) {
		if (akernel->flags & AGO_KERNEL_FLAG_GROUP_USER) {
			if (akernel->importing_module_index_plus1) {
				if (!imported[akernel->importing_module_index_plus1 - 1]) {
					fprintf(fp, "import %s\n", agraph->ref.context->modules[akernel->importing_module_index_plus1 - 1].module_name);
					imported[akernel->importing_module_index_plus1 - 1] = true;
				}
			}
		}
	}
	for (AgoData * adata = agraph->ref.context->dataList.head; adata; adata = adata->next) {
		// check if data is part of specified ref[] arguments
		int index = -1;
		for (int i = 0; i < num_ref; i++) {
			if (adata == (AgoData *)ref[i]) {
				index = i;
				break;
			}
		}
		// output data statements for non ref[] and non internal generated data objects
		if (index < 0 && adata->name.length() > 0 && adata->name[0] != '!' && !adata->parent) {
			char desc[1024] = "*ERROR*";
			agoGetDescriptionFromData(agraph->ref.context, desc, adata);
			fprintf(fp, "data %s = %s", adata->name.length() ? adata->name.c_str() : "*UNKNOWN*", desc);
#if ENABLE_DEBUG_MESSAGES
			if (adata->inputUsageCount | adata->outputUsageCount | adata->inoutUsageCount)
				fprintf(fp, " #usageCount[%d,%d,%d]", adata->inputUsageCount, adata->outputUsageCount, adata->inoutUsageCount);
#endif
			fprintf(fp, "\n");
		}
	}
	for (AgoData * adata = agraph->dataList.head; adata; adata = adata->next) {
		// check if data is part of specified ref[] arguments
		int index = -1;
		for (int i = 0; i < num_ref; i++) {
			if (adata == (AgoData *)ref[i]) {
				index = i;
				break;
			}
		}
		// output data statements for non ref[] and non internal generated data objects
		if (index < 0 && adata->name.length() > 0 && adata->name[0] != '!' && !adata->parent) {
			char desc[1024] = "*ERROR*";
			agoGetDescriptionFromData(agraph->ref.context, desc, adata);
			fprintf(fp, "data %s = %s", adata->name.length() ? adata->name.c_str() : "*UNKNOWN*", desc);
#if ENABLE_DEBUG_MESSAGES
			if (adata->inputUsageCount | adata->outputUsageCount | adata->inoutUsageCount)
				fprintf(fp, " #usageCount[%d,%d,%d]", adata->inputUsageCount, adata->outputUsageCount, adata->inoutUsageCount);
			fprintf(fp, " #(virtual)");
#endif
			fprintf(fp, "\n");
		}
	}
	for (AgoNode * anode = agraph->nodeList.head; anode; anode = anode->next) {
		fprintf(fp, "node %s", anode->akernel->name);
		vx_uint32 paramCount = anode->paramCount;
		while (paramCount > 0 && !anode->paramList[paramCount - 1])
			paramCount--;
		for (vx_uint32 i = 0; i < paramCount; i++) {
			AgoData * data = anode->paramList[i];
			if (!data) {
				fprintf(fp, " null");
			}
			else {
				// check if data is part of specified ref[] arguments, if so use $1..$N in output
				int index = -1;
				for (int i = 0; i < num_ref; i++) {
					if (data == (AgoData *)ref[i]) {
						index = i;
						break;
					}
				}
				if (index >= 0) {
					fprintf(fp, " $%d", index + 1);
				}
				else {
					char name[1024];
					agoGetDataName(name, data);
					if (name[0]) {
						fprintf(fp, " %s", name);
					}
					else {
						char desc[1024];
						agoGetDescriptionFromData(agraph->ref.context, desc, data);
						fprintf(fp, " %s", desc);
					}
				}
			}
		}
		if (anode->attr_border_mode.mode == VX_BORDER_MODE_REPLICATE) fprintf(fp, " attr:BORDER_MODE:REPLICATE");
		else if (anode->attr_border_mode.mode == VX_BORDER_MODE_CONSTANT) fprintf(fp, " attr:BORDER_MODE:CONSTANT,0x%08x", anode->attr_border_mode.constant_value.U32);
		if (anode->attr_affinity.device_type) {
			fprintf(fp, " attr:AFFINITY:%s", (anode->attr_affinity.device_type == AGO_KERNEL_FLAG_DEVICE_GPU) ? "GPU" : "CPU");
			if (anode->attr_affinity.device_info) 
				fprintf(fp, "%d", anode->attr_affinity.device_info);
			if (anode->attr_affinity.group)
				fprintf(fp, ",%d", anode->attr_affinity.group);
		}
#if _DEBUG || ENABLE_DEBUG_MESSAGES
		fprintf(fp, " #L%d", anode->hierarchical_level);
#endif
		fprintf(fp, "\n");
	}
	fprintf(fp, "# ago graph dump END [%s]\n", comment ? comment : "");
	fflush(fp);
	delete[] imported;

	return 0;
}

static const char * agoReadLine(char * line, int size, const char * str)
{
	if (!str || !*str)
		return NULL;
	line[0] = 0; size -= 2;
	for (int i = 0; i < size; i++) {
		char c = line[i] = *str++;
		if (c == 0) {
			str--;
			break;
		}
		else if (c == '\n') {
			line[i + 1] = 0;
			break;
		}
	}
	return str;
}

static void agoUpdateLine(char * line, std::vector< std::pair< std::string, std::string > >& vars, std::string localPrefix)
{
	char lineOriginal[2048]; strcpy(lineOriginal, line);
	int ki = 0;
	for (int i = 0; lineOriginal[i]; i++, ki++) {
		line[ki] = lineOriginal[i];
		if (lineOriginal[i] == '$' && lineOriginal[i + 1] >= 'A' && lineOriginal[i + 1] <= 'Z') {
			// get variable name
			char * s = &lineOriginal[i + 1];
			int k = 1;
			for (; (s[k] >= 'A' && s[k] <= 'Z') || (s[k] >= 'a' && s[k] <= 'z') || (s[k] >= '0' && s[k] <= '9') || s[k] == '_'; k++)
				;
			// search variable name
			for (std::vector< std::pair< std::string, std::string > >::iterator it = vars.begin(); it != vars.end(); ++it) {
				if (!strncmp(it->first.c_str(), s, k)) {
					strcpy(&line[ki], it->second.c_str());
					ki = (int)strlen(line) - 1;
					i += k;
					break;
				}
			}
		}
		else if (lineOriginal[i] == '$' && lineOriginal[i + 1] == '!') {
			strcpy(&line[ki], localPrefix.c_str());
			ki = (int)strlen(line) - 1;
			line[++ki] = '!';
			i += 1;
		}
	}
	line[ki] = 0;
}

static void agoUpdateN(char * output, char * input, int N, int Nchar)
{
	int ki = 0;
	for (int i = 0; input[i]; i++, ki++) {
		output[ki] = input[i];
		if (input[i] == '{') {
			// get variable name
			char * s = &input[i + 1];
			int k = 0;
			int index = 0, v = 0, op = '+';
			for (; (s[k] >= '0' && s[k] <= '9') || (Nchar && s[k] == Nchar) || (s[k] == '+') || (s[k] == '-'); k++) {
				if (s[k] == Nchar) v = N;
				else if (s[k] == '+' || s[k] == '-') {
					index += (op == '+') ? v : -v;
					op = s[k];
					v = 0;
				}
				else v = v * 10 + s[k] - '0';
			}
			index += (op == '+') ? v : -v;
			if (s[k] == '}') {
				// replace $[expr] with index
				sprintf(&output[ki], "%d", index);
				ki = (int)strlen(output) - 1;
				i += k + 1;
			}
		}
	}
	output[ki] = 0;
}

static void agoReadGraphFromStringInternal(AgoGraph * agraph, AgoReference * * ref, int num_ref, ago_data_registry_callback_f callback_f, void * callback_obj, const char * str, vx_int32 dumpToConsole, std::vector< std::pair< std::string, std::string > >& vars, std::string localPrefix)
{
	vx_context context = agraph->ref.context;
	std::vector< std::pair< std::string, std::string > > aliases;
	// set default values to for/if constructs
	vx_int32 Nbegin = 0, Nend = 0, Nstep = 1, Nchar = '\0', forConstruct = 0;
	vx_uint32 ifdepth = 0, ifcur = 0, ifall = 0;
	// process one line at a time
	char line[2048];
	for (int lineno = 1; (str = agoReadLine(line, sizeof(line) - 16, str)) != NULL; lineno++)
	{
		int N = (int)strlen(line);
		while (N > 0 && (line[N - 1] == '\r' || line[N - 1] == '\n'))
			line[--N] = 0;
		if (dumpToConsole) agoAddLogEntry(NULL, VX_SUCCESS, "%s\n", line);
		while (N > 0 && line[N - 1] == '\\') {
			int pos = N - 1;
			if (!(str = agoReadLine(line + pos, sizeof(line) - 16 - pos, str))) break;
			N = (int)strlen(line);
			while (N > 0 && (line[N - 1] == '\r' || line[N - 1] == '\n'))
				line[--N] = 0;
			if (dumpToConsole) agoAddLogEntry(NULL, VX_SUCCESS, "%s\n", line+pos);
			lineno++;
		}
		agoUpdateLine(line, vars, localPrefix);
		char lineCopy[sizeof(line)]; strcpy(lineCopy, line);
		char * s = strstr(line, "#");
		if (s) { *s = 0; N = (int)strlen(line); }
		char * argv[64] = { 0 };
		int narg = 0;
		for (s = line; narg < 64;)
		{
			while (*s && (*s == ' ' || *s == '\t' || *s == '\r' || *s == '\n')) s++;
			if (!*s) break;
			argv[narg++] = s;
			while (*s && !(*s == ' ' || *s == '\t' || *s == '\r' || *s == '\n')) s++;
			if (*s) *s++ = 0;
			else break;
		}
		// process for construct
		if (!forConstruct) {
			// reset for-loop parameters to single iteration
			Nbegin = 0, Nend = 0, Nstep = 1, Nchar = '\0';
			if (narg == 4 && !strcmp(argv[0], "for") && !strcmp(argv[2], "in") && strlen(argv[1]) == 1 && (argv[1][0] >= 'a' && argv[1][0] <= 'z')) {
				// set for-loop parameters
				Nchar = argv[1][0];
				char range[128]; agoUpdateN(range, argv[3], 0, '\0');
				if (sscanf(range, "%d:%d,%d", &Nbegin, &Nend, &Nstep) < 2 || Nstep <= 0) {
					agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: invalid for syntax: should be 'for i in <begin>:<end>[,<step>]'\n>>>> %s\n", lineno, lineCopy);
					agraph->status = -1;
					break;
				}
				forConstruct = 1;
				continue;
			}
		}
		else if (narg == 1 && !strcmp(argv[0], "endfor")) {
			// reset for-loop parameters to single iteration
			Nbegin = 0, Nend = 0, Nstep = 1, Nchar = '\0';
			forConstruct = 0;
			continue;
		}
		if (narg == 4 && (!strcmp(argv[0], "if") || !strcmp(argv[0], "elseif"))) {
			char expr1[128]; agoUpdateN(expr1, argv[1], 0, '\0');
			char expr2[128]; agoUpdateN(expr2, argv[3], 0, '\0');
			int value1 = atoi(expr1);
			int value2 = atoi(expr2);
			bool result = false;
			if (!strcmp(argv[2], "==")) result = (value1 == value2);
			else if (!strcmp(argv[2], "!=")) result = (value1 != value2);
			else if (!strcmp(argv[2], "<=")) result = (value1 <= value2);
			else if (!strcmp(argv[2], ">=")) result = (value1 >= value2);
			else if (!strcmp(argv[2], "<")) result = (value1 < value2);
			else if (!strcmp(argv[2], ">")) result = (value1 > value2);
			else {
				agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: invalid if-command syntax: should be '[else]if <value1> ==|!=|<|>|<=|>= <value2>'\n>>>> %s\n", lineno, lineCopy);
				agraph->status = -1;
				break;
			}
			if (!strcmp(argv[0], "if")) {
				// increase the depth and mark result in lowest bit (0:true, 1:false)
				ifdepth++;
				ifcur <<= 1;
				ifall <<= 1;
				if (!result) {
					ifcur += 1;
					ifall += 1;
				}
			}
			else {
				// if previously if/elseif resulted in true, mark result as false
				if (!(ifall & 1))
					result = false;
				// set lowest bit of both ifcur and ifall
				if (result) {
					ifcur &= ~1;
					ifall &= ~1;
				}
				else {
					ifcur |= 1;
				}
			}
			continue;
		}
		else if (narg == 1 && !strcmp(argv[0], "else")) {
			if (ifdepth == 0) {
				agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: found else without matching if statement'\n>>>> %s\n", lineno, lineCopy);
				agraph->status = -1;
				break;
			}
			ifcur = (ifcur & ~1) | !(ifall & 1);
			continue;
		}
		else if (narg == 1 && !strcmp(argv[0], "endif")) {
			if (ifdepth == 0) {
				agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: found endif without matching if statement'\n>>>> %s\n", lineno, lineCopy);
				agraph->status = -1;
				break;
			}
			ifdepth--;
			ifcur >>= 1;
			ifall >>= 1;
			continue;
		}
		// check skip if earlier conditional statements required to do so
		if (ifcur)
			continue;
		// process command with optional for-command-prefix support
		for (int N = Nbegin; N <= Nend; N += Nstep) {
			// create arguments with {N} expression substitution
			char argBuf[2048] = { 0 }, *arg[64] = { 0 };
			for (int i = 0, j = 0; i < narg; i++) {
				arg[i] = argBuf + j;
				agoUpdateN(arg[i], argv[i], N, Nchar);
				j += (int)strlen(arg[i]) + 1;
			}
			// process the actual commands
			if (narg == 4 && !strcmp(arg[0], "data") && !strcmp(arg[2], "=")) {
				// create new AgoData and add it to the dataList
				AgoData * data = agoCreateDataFromDescription(context, agraph, arg[3], false);
				if (!data) {
					agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: data type not supported\n>>>> %s\n", lineno, lineCopy);
					agraph->status = -1;
					break;
				}
				data->name = arg[1];
				agoAddData(data->isVirtual ? &agraph->dataList : &context->dataList, data);
				// if data has children (e.g., pyramid, delay, image), add them too
				if (data->children) {
					for (vx_uint32 i = 0; i < data->numChildren; i++) {
						if (data->children[i]) {
							for (vx_uint32 j = 0; j < data->children[i]->numChildren; j++) {
								if (data->children[i]->children[j]) {
									agoAddData(data->isVirtual ? &agraph->dataList : &context->dataList, data->children[i]->children[j]);
								}
							}
							agoAddData(data->isVirtual ? &agraph->dataList : &context->dataList, data->children[i]);
						}
					}
				}
				// inform application about data -- ignore this for scalar strings
				if (callback_f && !(data->ref.type == VX_TYPE_SCALAR && data->u.scalar.type == VX_TYPE_STRING_AMD)) {
					// skip till ':'
					const char * param = arg[3];
					for (; *param && *param != ':'; param++)
						;
					if (*param == ':') {
						// still till another ':'
						for (param++; *param && *param != ':'; param++)
							;
						if (*param == ':') {
							param++;
							// invoke the application callback with object name and parameter strings
							data->ref.external_count++;
							callback_f(callback_obj, &data->ref, data->name.c_str(), param);
						}
					}
				}
			}
			else if ((narg >= 3 && !strcmp(arg[0], "node")) || (narg >= 3 && !strcmp(arg[0], "macro")) || (narg >= 2 && !strcmp(arg[0], "file"))) {
				std::string localSuffix = "!";
				AgoKernel * akernel = NULL;
				AgoNode * node = NULL;
				char * str_subgraph = NULL;
				bool str_subgraph_allocated = false;
				AgoReference * ref_subgraph[AGO_MAX_PARAMS] = { 0 };
				if (!strcmp(arg[0], "node")) {
					if (!(akernel = agoFindKernelByName(context, arg[1]))) {
						agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: kernel not supported\n>>>> %s\n", lineno, lineCopy);
						agraph->status = -1;
						break;
					}
					// create a new AgoNode and add it to the nodeList
					node = agoCreateNode(agraph, akernel);
				}
				else if (!strcmp(arg[0], "macro")) {
					for (auto it = context->macros.begin(); it != context->macros.end(); ++it) {
						if (!strcmp(it->name, arg[1])) {
							localSuffix += it->name;
							str_subgraph = it->text;
							break;
						}
					}
					if (!str_subgraph) {
						agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: unable to find macro '%s'\n>>>> %s\n", lineno, arg[1], lineCopy);
						agraph->status = -1;
						break;
					}
				}
				else {
					FILE * fp = fopen(arg[1], "rb");
					if (!fp) {
						agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: unable to open '%s'\n>>>> %s\n", lineno, arg[1], lineCopy);
						agraph->status = -1;
						break;
					}
					fseek(fp, 0L, SEEK_END); long size = ftell(fp); fseek(fp, 0L, SEEK_SET);
					if (!(str_subgraph = new char[size + 1]()) || (fread(str_subgraph, sizeof(char), size, fp) != size)) {
						agoAddLogEntry(&agraph->ref, VX_FAILURE, "FATAL: calloc/fread(%d) failed\n", (int)size + 1);
						agraph->status = -1; 
						break; 
					}
					str_subgraph_allocated = true;
					fclose(fp);
					// update suffix
					const char * name = arg[1];
					for (char *p = arg[1]; *p; p++) {
						if (*p == '/' || *p == '\\' || *p == ':')
							name = p + 1;
						else if (*p == '.') *p = '\0';
					}
					localSuffix += name;
				}
				// look through all parameters
				for (int p = 0; p < narg - 2; p++)
				{
					if (node && strncmp(arg[2 + p], "attr:", 5) == 0) {
						if (!strncmp(&arg[2 + p][5], "BORDER_MODE:", 12)) {
							if (!strcmp(&arg[2 + p][17], "UNDEFINED")) {
								node->attr_border_mode.mode = VX_BORDER_MODE_UNDEFINED;
								memset(&node->attr_border_mode.constant_value, 0, sizeof(node->attr_border_mode.constant_value));
							}
							else if (!strcmp(&arg[2 + p][17], "REPLICATE")) {
								node->attr_border_mode.mode = VX_BORDER_MODE_REPLICATE;
								memset(&node->attr_border_mode.constant_value, 0, sizeof(node->attr_border_mode.constant_value));
							}
							else if (!strncmp(&arg[2 + p][17], "CONSTANT,", 9)) {
								node->attr_border_mode.mode = VX_BORDER_MODE_CONSTANT;
								memset(&node->attr_border_mode.constant_value, 0, sizeof(node->attr_border_mode.constant_value));
								(void)sscanf(&arg[2 + p][17 + 9], "%i", &node->attr_border_mode.constant_value.U32);
							}
							else {
								agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: invalid/unsupported border mode attribute -- arg#%d\n>>>> %s\n", lineno, p, lineCopy);
								agraph->status = -1;
								break;
							}
						}
						else if (!strncmp(&arg[2 + p][5], "AFFINITY:", 9)) {
							vx_uint32 group = 0;
							char device[64] = "CPU";
							const char * szGroup = strstr(&arg[2 + p][14], ",");
							if (szGroup) {
								group = atoi(&szGroup[1]);
							}
							node->attr_affinity.group = group;
							(void)sscanf(&arg[2 + p][14], "%s", device);
							if (!strncmp(device, "CPU", 3)) {
								node->attr_affinity.device_type = AGO_KERNEL_FLAG_DEVICE_CPU;
								node->attr_affinity.device_info = atoi(&device[3]);
							}
							else if (!strncmp(device, "GPU", 3)) {
								node->attr_affinity.device_type = AGO_KERNEL_FLAG_DEVICE_GPU;
								node->attr_affinity.device_info = atoi(&device[3]);
							}
							else {
								agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: invalid/unsupported affinity attribute -- arg#%d\n>>>> %s\n", lineno, p, lineCopy);
								agraph->status = -1;
								break;
							}
						}
					}
					else if (!strcmp(arg[0], "file") && strncmp(arg[2 + p], "/def-var:", 9) == 0) {
						char command[256]; sprintf(command, "def-var %s\n", &arg[2 + p][9]);
						char * equal = strstr(command, "=");
						if (!equal) {
							agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: invalid def-var syntax: expected /def-var:<NAME>=<VALUE>\n>>>> %s\n", lineno, lineCopy);
							agraph->status = -1;
							break;
						}
						*equal = ' ';
						agoReadGraphFromStringInternal(agraph, ref, num_ref, callback_f, callback_obj, command, 0, vars, localPrefix);
						if (agraph->status)
							break;
					}
					else {
						AgoData * data = NULL;
						if (arg[2 + p][0] == '$') {
							int index = atoi(&arg[2 + p][1]) - 1;
							if (index >= 0 && index < num_ref) {
								data = (AgoData *)ref[index];
							}
						}
						else if (strcmp(arg[2 + p], "null") != 0) {
							char name[128]; strcpy(name, arg[2 + p]);
							// check if there is an name alias
							for (std::vector< std::pair< std::string, std::string > >::iterator it = aliases.begin(); it != aliases.end(); ++it) {
								if (!strcmp(it->first.c_str(), name)) {
									strcpy(name, it->second.c_str());
									if (name[0] == '$') {
										int index = atoi(&name[1]) - 1;
										if (index >= 0 && index < num_ref) {
											data = (AgoData *)ref[index];
										}
									}
									break;
								}
							}
							// get data object
							if (!data) {
								data = agoFindDataByName(context, agraph, name);
							}
							if (!data) {
								// create new AgoData and add it to the dataList
								data = agoCreateDataFromDescription(context, agraph, name, false);
								if (!data) {
									agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: data type not supported -- arg#%d\n>>>> %s\n", lineno, p, lineCopy);
									agraph->status = -1;
									break;
								}
								agoAddData(&agraph->dataList, data);
								// if data has children (e.g., pyramid), add them too
								if (data->children) {
									for (vx_uint32 i = 0; i < data->numChildren; i++) {
										if (data->children[i]) {
											char childname[256];
											sprintf(childname, "%s[%d]", data->name.c_str(), i);
											data->children[i]->name = childname;
											agoAddData(&agraph->dataList, data->children[i]);
										}
									}
								}
							}
						}
						if (data) {
							if (node) {
								node->paramList[p] = data;
								// check if specified data type is correct
								// NOTE: kernel can specify to ignore this checking by setting argType[] to ZERO
								if (akernel->argType[p] && (akernel->argType[p] != data->ref.type)) {
									char type_expected_buf[64], type_specified_buf[64];
									const char * type_expected = agoEnum2Name(akernel->argType[p]);
									const char * type_specified = agoEnum2Name(data->ref.type);
									if (!type_expected) { sprintf(type_expected_buf, "0x%08x", akernel->argType[p]); type_expected = type_expected_buf; }
									if (!type_specified) { sprintf(type_specified_buf, "0x%08x", data->ref.type); type_specified = type_specified_buf; }
									agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: data type %s expected -- arg#%d has %s\n>>>> %s\n", lineno, type_expected, p, type_specified, lineCopy);
									agraph->status = -1;
									break;
								}
							}
							else if (p < AGO_MAX_PARAMS)
								ref_subgraph[p] = &data->ref;
							else {
								agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: number of arguments exceeded MAX(%d)\n>>>> %s\n", lineno, AGO_MAX_PARAMS, lineCopy);
								agraph->status = -1;
								break;
							}
						}
					}
				}
				if (str_subgraph && !agraph->status) {
					agoReadGraphFromStringInternal(agraph, ref_subgraph, narg - 2, callback_f, callback_obj, str_subgraph, (dumpToConsole > 0) ? dumpToConsole - 1 : vx_false_e, vars, localPrefix + localSuffix);
				}
				if (str_subgraph_allocated)
					delete[] str_subgraph;
				if (agraph->status)
					break;
			}
			else if (narg == 2 && !strcmp(arg[0], "import")) {
				char * module_name = arg[1];
				if (agoLoadModule(context, module_name)) {
					agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: import: unable to load module: %s\n", module_name);
					agraph->status = -1;
					break;
				}
			}
			else if (narg == 3 && !strcmp(arg[0], "type") && !strncmp(arg[2], "userstruct:", 11)) {
				vx_enum user_struct_id = 0;
				char * name = arg[1];
				if (agoGetUserStructSize(context, name) > 0) {
					agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: type: name already in-use: %s\n", name);
					agraph->status = -1;
					break;
				}
				vx_size size = atoi(&arg[2][11]);
				if (agoAddUserStruct(context, size, name) == VX_TYPE_INVALID) {
					agraph->status = -1;
				}
			}
			else if (narg == 2 && !strcmp(arg[0], "def-macro")) {
				char macro_name[256]; strncpy(macro_name, arg[1], sizeof(macro_name));
				const char * str_begin = str;
				const char * str_end = str;
				for (; (str = agoReadLine(line, sizeof(line)-16, str)) != NULL; lineno++) {
					if (dumpToConsole) agoAddLogEntry(NULL, VX_SUCCESS, "%s", line);
					agoUpdateLine(line, vars, localPrefix);
					char word[256];
					if (sscanf(line, "%s", word) == 1 && !strcmp(word, "endmacro"))
						break;
					str_end = str;
				}
				if (!str) {
					agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: incomplete macro definition: %s\n>>>> %s\n", lineno, macro_name, lineCopy);
					agraph->status = -1;
					break;
				}
				else {
					for (auto it = context->macros.begin(); it != context->macros.end(); ++it) {
						if (!strcmp(it->name, macro_name)) {
							agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: macro already exists: %s\n>>>> %s\n", lineno, macro_name, lineCopy);
							agraph->status = -1;
							break;
						}
					}
					if (agraph->status)
						break;
					else {
						MacroData macro;
						macro.text = macro.text_allocated = (char *)calloc(1, str_end - str_begin + 1);
						strncpy(macro.name, macro_name, sizeof(macro.name) - 1);
						strncpy(macro.text, str_begin, str_end - str_begin);
						context->macros.push_back(macro);
					}
				}
			}
			else if ((narg == 2 || narg == 3) && (!strcmp(arg[0], "def-var") || !strcmp(arg[0], "def-var-default"))) {
				for (vx_uint32 i = 0; arg[1][i]; i++) {
					char c = arg[1][i];
					if (!(i == 0 && c >= 'A' && c <= 'Z') && !(i > 0 && ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '_'))) {
						agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: variable names can only have [A-Z][A-Za-z0-9_]* format: %s\n>>>> %s\n", lineno, arg[1], lineCopy);
						agraph->status = -1;
						break;
					}
				}
				if (agraph->status)
					break;
				bool found = false;
				for (std::vector< std::pair< std::string, std::string > >::iterator it = vars.begin(); it != vars.end(); ++it) {
					if (!strcmp(it->first.c_str(), arg[1])) {
						found = true;
						if (!strcmp(arg[0], "def-var")) {
							agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: variable already exists: %s\n>>>> %s\n", lineno, arg[1], lineCopy);
							agraph->status = -1;
						}
						break;
					}
				}
				if (agraph->status)
					break;
				else if (!found) {
					char value[2048];
					if (narg == 2) {
						value[0] = 0;
					}
					else {
						strcpy(value, arg[2]);
						agoEvaluateIntegerExpression(value);
					}
					if ((!strncmp(value, "WIDTH(", 6) || !strncmp(value, "HEIGHT(", 7) || !strncmp(value, "FORMAT(", 7)) && value[strlen(value) - 1] == ')') {
						char * name = strstr(value, "(") + 1; value[strlen(value) - 1] = 0;
						AgoData * pdata = agoFindDataByName(context, agraph, name);
						if (!pdata && name[0] == '$' && name[1] >= '1' && name[1] <= '9') {
							int v = atoi(&name[1]) - 1;
							if (v < num_ref)
								pdata = (AgoData *)ref[v];
							if (!pdata) {
								agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: specified argument is not available: %s\n>>>> %s\n", lineno, arg[2], lineCopy);
								agraph->status = -1;
								break;
							}
						}
						if (!pdata || (pdata->ref.type != VX_TYPE_IMAGE && pdata->ref.type != VX_TYPE_PYRAMID)) {
							agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: invalid data name specified: %s\n>>>> %s\n", lineno, name, lineCopy);
							agraph->status = -1;
							break;
						}
						if (!strncmp(value, "WIDTH(", 6)) {
							int v = 0;
							if (pdata->ref.type == VX_TYPE_IMAGE) v = pdata->u.img.width;
							else if (pdata->ref.type == VX_TYPE_PYRAMID) v = pdata->u.pyr.width;
							sprintf(value, "%d", v);
						}
						else if (!strncmp(value, "HEIGHT(", 7)) {
							int v = 0;
							if (pdata->ref.type == VX_TYPE_IMAGE) v = pdata->u.img.height;
							else if (pdata->ref.type == VX_TYPE_PYRAMID) v = pdata->u.pyr.height;
							sprintf(value, "%d", v);
						}
						else if (!strncmp(value, "FORMAT(", 7)) {
							vx_df_image v = VX_DF_IMAGE_U8;
							if (pdata->ref.type == VX_TYPE_IMAGE) v = pdata->u.img.format;
							else if (pdata->ref.type == VX_TYPE_PYRAMID) v = pdata->u.pyr.format;
							sprintf(value, "%4.4s", FORMAT_STR(v));
						}
					}
					vars.push_back(std::pair< std::string, std::string >(arg[1], value));
					// special AGO flags
					if (!strcmp(arg[1], "AgoOptimizerFlags") && value[0] >= '0' && value[0] <= '9') {
						agraph->optimizer_flags = atoi(value);
						break;
					}
				}
			}
			else if (narg == 2 && !strcmp(arg[0], "affinity")) {
				if (!strcmp(arg[1], "GPU")) {
					agraph->attr_affinity.device_type = AGO_KERNEL_FLAG_DEVICE_GPU;
				}
				else if (!strcmp(arg[1], "CPU")) {
					agraph->attr_affinity.device_type = AGO_KERNEL_FLAG_DEVICE_CPU;
				}
				else {
					agraph->attr_affinity.device_type = 0;
				}
			}
			else if (narg == 3 && !strcmp(arg[0], "alias")) {
				for (vx_uint32 i = 0; arg[1][i]; i++) {
					char c = arg[1][i];
					if (!(i == 0 && ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) && !(i > 0 && ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '_'))) {
						agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: alias names can only have [A-Z][A-Za-z0-9_]* format: %s\n>>>> %s\n", lineno, arg[1], lineCopy);
						agraph->status = -1;
						break;
					}
				}
				if (agraph->status)
					break;
				char name1[128]; agoUpdateN(name1, arg[1], 0, '\0');
				char name2[128]; agoUpdateN(name2, arg[2], 0, '\0');
				for (std::vector< std::pair< std::string, std::string > >::iterator it = aliases.begin(); it != aliases.end(); ++it) {
					if (!strcmp(it->first.c_str(), name1)) {
						agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: alias already exists: %s\n>>>> %s\n", lineno, name1, lineCopy);
						agraph->status = -1;
						break;
					}
				}
				if (agraph->status)
					break;
				aliases.push_back(std::pair< std::string, std::string >(name1, name2));
			}
			else if (narg > 0 && !strcmp(arg[0], "set-args")) {
				if (narg - 1 > num_ref) {
					agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: set-args: number of argument (%d) exceeded internal buffer (%d)\n>>>> %s\n", lineno, narg - 1, num_ref, lineCopy);
					agraph->status = -1;
					break;
				}
				// clear all previous arguments
				for (int i = 0; i < num_ref; i++) {
					// TBD handle memory leaks
					ref[i] = NULL;
				}
				for (int j = 0; j < narg - 1; j++) {
					// create new AgoData and add it to the dataList
					AgoData * data = agoCreateDataFromDescription(context, agraph, arg[j + 1], false);
					if (!data) {
						agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: set-args: invalid object description: %s\n>>>> %s\n", lineno, arg[j + 1], lineCopy);
						agraph->status = -1;
						break;
					}
					agoAddData(data->isVirtual ? &agraph->dataList : &agraph->ref.context->dataList, data);
					// if data has children (e.g., pyramid), add them too
					if (data->children) {
						for (vx_uint32 i = 0; i < data->numChildren; i++) {
							if (data->children[i]) {
								agoAddData(data->isVirtual ? &agraph->dataList : &agraph->ref.context->dataList, data->children[i]);
							}
						}
					}
					ref[j] = &data->ref;
				}
				if (agraph->status)
					break;
			}
			else if (narg == 3 && !strcmp(arg[0], "directive")) {
				agraph->status = -1;
				char name1[128]; agoUpdateN(name1, arg[1], 0, '\0');
				char name2[128]; agoUpdateN(name2, arg[2], 0, '\0');
				AgoData * data = agoFindDataByName(context, agraph, name1);
				if (data) {
					vx_enum directive = agoName2Enum(name2);
					if (!directive) {
						(void)sscanf(name2, "%i", &directive);
					}
					if (directive) {
						agraph->status = agoDirective((vx_reference)data, directive);
					}
				}
				if (agraph->status) {
					agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: invalid object or directive: directive %s %s\n>>>> %s\n", lineno, name1, name2, lineCopy);
					break;
				}
			}
			else if (narg >= 1 && !strcmp(arg[0], "exit")) {
				break;
			}
			else if (narg > 0) {
				agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: line %d: syntax error\n>>>> %s\n", lineno, lineCopy);
				agraph->status = -1;
				break;
			}
		}
		if (agraph->status)
			break;
	}
}

int agoReadGraph(AgoGraph * agraph, AgoReference * * ref, int num_ref, ago_data_registry_callback_f callback_f, void * callback_obj, FILE * fp, vx_int32 dumpToConsole)
{
	if (!agraph) return -1;
	vx_context context = agraph->ref.context;
	CAgoLock lock(agraph->cs);
	CAgoLock lock2(context->cs);

	// read the whole file into a local buffer
	long cur = ftell(fp); fseek(fp,  0L, SEEK_END);
	long end = ftell(fp); fseek(fp, cur, SEEK_SET);
	long size = end - cur; if (size < 1) return agraph->status;
	char * str = new char [size + 1]();
	if (!str || (fread(str, sizeof(char), size, fp) != size))
		return -1;

	// read the graph from file
	std::vector< std::pair< std::string, std::string > > vars;
	agoReadGraphFromStringInternal(agraph, ref, num_ref, callback_f, callback_obj, str, dumpToConsole, vars, "L");
	delete[] str;

	// mark the scope of all virtual data to graph
	for (AgoData * data = agraph->dataList.head; data; data = data->next) {
		data->ref.scope = &agraph->ref;
	}
	// check if graph is a valid graph
	if (agraph->status == VX_SUCCESS) {
		agraph->status = agoVerifyGraph(agraph);
		if (agraph->status) {
			agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: agoVerifyGraph() => %d (failed)\n", agraph->status);
		}
	}
	return agraph->status;
}

int agoReadGraphFromString(AgoGraph * agraph, AgoReference * * ref, int num_ref, ago_data_registry_callback_f callback_f, void * callback_obj, char * str, vx_int32 dumpToConsole)
{
	if (!agraph) return -1;
	vx_context context = agraph->ref.context;
	CAgoLock lock(agraph->cs);
	CAgoLock lock2(context->cs);

	// read the graph from string
	std::vector< std::pair< std::string, std::string > > vars;
	agoReadGraphFromStringInternal(agraph, ref, num_ref, callback_f, callback_obj, str, dumpToConsole, vars, "L");

	// mark the scope of all virtual data to graph
	for (AgoData * data = agraph->dataList.head; data; data = data->next) {
		data->ref.scope = &agraph->ref;
	}
	// check if graph is a valid graph
	if (agraph->status == VX_SUCCESS) {
		agraph->status = agoVerifyGraph(agraph);
		if (agraph->status) {
			agoAddLogEntry(&agraph->ref, VX_FAILURE, "ERROR: agoReadGraph: agoVerifyGraph() => %d (failed)\n", agraph->status);
		}
	}
	return agraph->status;
}

int agoLoadModule(AgoContext * context, const char * module)
{
	vx_status status = VX_ERROR_INVALID_REFERENCE;
	if (agoIsValidContext(context)) {
		CAgoLock lock(context->cs);
		status = VX_ERROR_INVALID_PARAMETERS;
		char filePath[1024]; sprintf(filePath, SHARED_LIBRARY_PREFIX "%s" SHARED_LIBRARY_EXTENSION, module);
		for (vx_uint32 index = 0; index < context->num_active_modules; index++) {
			if (strcmp(filePath, context->modules[index].module_path) == 0) {
				agoAddLogEntry(&context->ref, VX_SUCCESS, "WARNING: kernels already loaded from %s\n", filePath);
				return VX_SUCCESS;
			}
		}
		ago_module hmodule = agoOpenModule(filePath);
		if (hmodule == NULL) {
			status = VX_ERROR_INVALID_MODULE;
			agoAddLogEntry(&context->ref, status, "ERROR: Unable to load module %s\n", filePath);
		}
		else {
			vx_publish_kernels_f publish_kernels_f = (vx_publish_kernels_f)agoGetFunctionAddress(hmodule, "vxPublishKernels");
			if (!publish_kernels_f) {
				status = VX_ERROR_INVALID_MODULE;
				agoAddLogEntry(&context->ref, status, "ERROR: vxPublishKernels symbol missing in %s\n", filePath);
			}
			else {
				// add module entry into context
				ModuleData data;
				strncpy(data.module_name, module, sizeof(data.module_name) - 1);
				strncpy(data.module_path, filePath, sizeof(data.module_path) - 1);
				data.hmodule = hmodule;
				data.module_internal_data_ptr = nullptr;
				data.module_internal_data_size = 0;
				context->modules.push_back(data);
				context->num_active_modules++;
				// invoke vxPublishKernels
				vx_uint32 count = context->kernelList.count;
				context->importing_module_index_plus1 = context->num_active_modules;
				status = publish_kernels_f(context);
				context->importing_module_index_plus1 = 0;
				if (status == VX_SUCCESS) {
					agoAddLogEntry(&context->ref, VX_SUCCESS, "OK: loaded %d kernels from %s\n", context->kernelList.count - count, filePath);
				}
				else {
					agoAddLogEntry(&context->ref, VX_FAILURE, "ERROR: vxPublishKernels(%s) failed (%d:%s)\n", module, status, agoEnum2Name(status));
				}
			}
		}
	}
	return status;
}

int agoUnloadModule(AgoContext * context, const char * module)
{
	vx_status status = VX_ERROR_INVALID_REFERENCE;
	if (agoIsValidContext(context)) {
		CAgoLock lock(context->cs);
		status = VX_ERROR_INVALID_MODULE;
		char filePath[1024]; sprintf(filePath, SHARED_LIBRARY_PREFIX "%s" SHARED_LIBRARY_EXTENSION, module);
		for (vx_uint32 index = 0; index < context->num_active_modules; index++) {
			if (strcmp(filePath, context->modules[index].module_path) == 0) {
				vx_unpublish_kernels_f unpublish_kernels_f = (vx_unpublish_kernels_f)agoGetFunctionAddress(context->modules[index].hmodule, "vxUnpublishKernels");
				if (!unpublish_kernels_f) {
					status = VX_ERROR_NOT_SUPPORTED;
					agoAddLogEntry(&context->ref, status, "ERROR: vxUnpublishKernels symbol missing in %s\n", filePath);
				}
				else {
					status = unpublish_kernels_f(context);
					if (status == VX_SUCCESS) {
						agoCloseModule(context->modules[index].hmodule);
						context->modules[index].hmodule = nullptr;
						context->modules[index].module_path[0] = '\0';
					}
				}
				break;
			}
		}
	}
	return status;
}

vx_status agoVerifyNode(AgoNode * node)
{
	AgoGraph * graph = (AgoGraph *)node->ref.scope;
	AgoKernel * kernel = node->akernel;
	vx_status status = VX_SUCCESS;

	// check if node has required arguments and initialize data required for further graph processing
	node->hierarchical_level = 0;
	for (vx_uint32 arg = 0; arg < AGO_MAX_PARAMS; arg++) {
		AgoData * data = node->paramList[arg];
		if (!data || (arg >= node->paramCount)) {
			if (((kernel->argConfig[arg] & AGO_KERNEL_ARG_OPTIONAL_FLAG) == 0) && ((kernel->argConfig[arg] & (AGO_KERNEL_ARG_INPUT_FLAG | AGO_KERNEL_ARG_OUTPUT_FLAG)) != 0)) {
				agoAddLogEntry(&kernel->ref, VX_ERROR_NOT_SUFFICIENT, "ERROR: agoVerifyGraph: kernel %s: missing argument#%d\n", kernel->name, arg);
				return VX_ERROR_NOT_SUFFICIENT;
			}
		}
		else if ((kernel->argConfig[arg] & (AGO_KERNEL_ARG_INPUT_FLAG | AGO_KERNEL_ARG_OUTPUT_FLAG)) == 0) {
			agoAddLogEntry(&kernel->ref, VX_ERROR_NOT_SUFFICIENT, "ERROR: agoVerifyGraph: kernel %s: unexpected argument#%d\n", kernel->name, arg);
			return VX_ERROR_NOT_SUFFICIENT;
		}
		if (data) {
			data->hierarchical_level = 0;
			// reset meta data of the node for output argument processing
			if ((kernel->argConfig[arg] & (AGO_KERNEL_ARG_INPUT_FLAG | AGO_KERNEL_ARG_OUTPUT_FLAG)) == AGO_KERNEL_ARG_OUTPUT_FLAG) {
				vx_meta_format meta = &node->metaList[arg];
				meta->data.ref.type = data->ref.type;
				meta->data.ref.external_count = 1;
				meta->set_valid_rectangle_callback = nullptr;
				if (data->ref.type == VX_TYPE_IMAGE) {
					meta->data.u.img.rect_valid.start_x = 0;
					meta->data.u.img.rect_valid.start_y = 0;
					meta->data.u.img.rect_valid.end_x = INT_MAX;
					meta->data.u.img.rect_valid.end_y = INT_MAX;
				}
				else if (data->ref.type == VX_TYPE_PYRAMID) {
					meta->data.u.pyr.rect_valid.start_x = 0;
					meta->data.u.pyr.rect_valid.start_y = 0;
					meta->data.u.pyr.rect_valid.end_x = INT_MAX;
					meta->data.u.pyr.rect_valid.end_y = INT_MAX;
				}
			}
		}
	}

	// mark the kernels with VX_KERNEL_ATTRIBUTE_AMD_OPENCL_BUFFER_UPDATE_CALLBACK
	// with enableUserBufferOpenCL
	if (kernel->opencl_buffer_update_callback_f) {
		AgoData * data = node->paramList[kernel->opencl_buffer_update_param_index];
		if (!data || !data->isVirtual || data->ref.type != VX_TYPE_IMAGE || data->u.img.planes != 1 || data->ownerOfUserBufferOpenCL || data->u.img.enableUserBufferOpenCL) {
			status = VX_ERROR_INVALID_PARAMETERS;
			agoAddLogEntry(&kernel->ref, status, "ERROR: agoVerifyGraph: kernel %s: unexpected/unsupported argument#%d -- needs virtual image with single-plane\n", kernel->name, kernel->opencl_buffer_update_param_index);
			return status;
		}
		// mark that the buffer gets initialized a node
		data->u.img.enableUserBufferOpenCL = vx_true_e;
		data->ownerOfUserBufferOpenCL = node;
	}

	// check if node arguments are valid
	if (kernel->func) {
		// validate arguments for built-in kernel functions
		vx_status status = kernel->func(node, ago_kernel_cmd_validate);
		if (status) {
			agoAddLogEntry(&kernel->ref, status, "ERROR: agoVerifyGraph: kernel %s: ago_kernel_cmd_validate failed (%d)\n", kernel->name, status);
			return status;
		}
	}
	else if (kernel->validate_f) {
		// validate arguments for user-kernels functions
		vx_meta_format metaList[AGO_MAX_PARAMS];
		for (int i = 0; i < AGO_MAX_PARAMS; i++)
			metaList[i] = &node->metaList[i];
		status = kernel->validate_f(node, (vx_reference *)node->paramList, node->paramCount, metaList);
		if (status) {
			agoAddLogEntry(&kernel->ref, status, "ERROR: agoVerifyGraph: kernel %s: kernel_validate failed (%d)\n", kernel->name, status);
			return status;
		}
	}
	else {
		// check if node input arguments are valid
		for (vx_uint32 arg = 0; arg < node->paramCount; arg++) {
			if (node->paramList[arg]) {
				if (kernel->argConfig[arg] & AGO_KERNEL_ARG_INPUT_FLAG) {
					if (kernel->input_validate_f) {
						vx_status status = kernel->input_validate_f(node, arg);
						if (status) {
							agoAddLogEntry(&kernel->ref, status, "ERROR: agoVerifyGraph: kernel %s: input_validate failed (%d) for argument#%d\n", kernel->name, status, arg);
							return status;
						}
					}
				}
			}
		}
		// check if node output arguments are valid
		for (vx_uint32 arg = 0; arg < node->paramCount; arg++) {
			AgoData * data = node->paramList[arg];
			if (data) {
				if ((kernel->argConfig[arg] & (AGO_KERNEL_ARG_INPUT_FLAG | AGO_KERNEL_ARG_OUTPUT_FLAG)) == AGO_KERNEL_ARG_OUTPUT_FLAG) {
					if (kernel->output_validate_f) {
						vx_meta_format meta = &node->metaList[arg];
						vx_status status = kernel->output_validate_f(node, arg, meta);
						if (status) {
							agoAddLogEntry(&kernel->ref, status, "ERROR: agoVerifyGraph: kernel %s: output_validate failed (%d) for argument#%d\n", kernel->name, status, arg);
							return status;
						}
					}
				}
			}
		}
	}
	// check if node output arguments are valid
	for (vx_uint32 arg = 0; arg < node->paramCount; arg++) {
		AgoData * data = node->paramList[arg];
		if (data) {
			if ((kernel->argConfig[arg] & (AGO_KERNEL_ARG_INPUT_FLAG | AGO_KERNEL_ARG_OUTPUT_FLAG)) == AGO_KERNEL_ARG_OUTPUT_FLAG) {
				vx_meta_format meta = &node->metaList[arg];
				if (kernel->argType[arg] && kernel->argType[arg] != VX_TYPE_REFERENCE && (meta->data.ref.type != kernel->argType[arg])) {
					agoAddLogEntry(&kernel->ref, VX_ERROR_INVALID_TYPE, "ERROR: agoVerifyGraph: kernel %s: output argument type mismatch for argument#%d\n", kernel->name, arg);
					return VX_ERROR_INVALID_TYPE;
				}
				else if (meta->data.ref.type == VX_TYPE_IMAGE) {
					bool updated = false;
					if (data->isVirtual) {
						// update format/width/height if not specified
						if (data->u.img.format == VX_DF_IMAGE_VIRT) {
							data->u.img.format = meta->data.u.img.format;
							updated = true;
						}
						if (data->u.img.width == 0) {
							data->u.img.width = meta->data.u.img.width;
							updated = true;
						}
						if (data->u.img.height == 0) {
							data->u.img.height = meta->data.u.img.height;
							updated = true;
						}
					}
					// make sure that the data come from output validator matches with object
					if (data->u.img.format != meta->data.u.img.format) {
						agoAddLogEntry(&kernel->ref, VX_ERROR_INVALID_FORMAT, "ERROR: agoVerifyGraph: kernel %s: invalid format for argument#%d\n", kernel->name, arg);
						return VX_ERROR_INVALID_FORMAT;
					}
					else if (data->u.img.width != meta->data.u.img.width || data->u.img.height != meta->data.u.img.height) {
						agoAddLogEntry(&kernel->ref, VX_ERROR_INVALID_DIMENSION, "ERROR: agoVerifyGraph: kernel %s: invalid dimension for argument#%d\n", kernel->name, arg);
						return VX_ERROR_INVALID_DIMENSION;
					}
					// re-initialize, if updated
					if (updated) {
						char desc[64]; sprintf(desc, "image-virtual:%4.4s,%d,%d", FORMAT_STR(data->u.img.format), data->u.img.width, data->u.img.height);
						data->isNotFullyConfigured = vx_true_e;
						if (agoGetDataFromDescription(graph->ref.context, graph, data, desc)) {
							agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: agoVerifyGraph: agoVerifyGraph update failed for virtual-image: %4.4s %dx%d\n", FORMAT_STR(data->u.img.format), data->u.img.width, data->u.img.height);
							return -1;
						}
						if (data->children) {
							for (vx_uint32 i = 0; i < data->numChildren; i++) {
								agoAddData(&graph->dataList, data->children[i]);
							}
						}
						data->isNotFullyConfigured = vx_false_e;
					}
					// update valid rectangle
					data->u.img.rect_valid = meta->data.u.img.rect_valid;
					if (data->u.img.rect_valid.end_x == INT_MAX)
						data->u.img.rect_valid.end_x = data->u.img.width;
					if (data->u.img.rect_valid.end_y == INT_MAX)
						data->u.img.rect_valid.end_y = data->u.img.height;
					// check for VX_IMAGE_ATTRIBUTE_AMD_ENABLE_USER_BUFFER_OPENCL attribute
					if (meta->data.u.img.enableUserBufferOpenCL) {
						// supports only virtual images with single color plane and without ROI
						if (!data->isVirtual || data->u.img.planes != 1 || data->u.img.isROI || data->ownerOfUserBufferOpenCL) {
							agoAddLogEntry(&kernel->ref, VX_ERROR_NOT_SUPPORTED, "ERROR: agoVerifyGraph: kernel %s: VX_IMAGE_ATTRIBUTE_AMD_ENABLE_USER_BUFFER_OPENCL is not supported for argument#%d\n", kernel->name, arg);
							return VX_ERROR_NOT_SUPPORTED;
						}
						data->u.img.enableUserBufferOpenCL = vx_true_e;
					}
				}
				else if (meta->data.ref.type == VX_TYPE_PYRAMID) {
					bool updated = false;
					if (data->isVirtual) {
						// update format/width/height if not specified
						if (data->u.pyr.format == VX_DF_IMAGE_VIRT) {
							data->u.pyr.format = meta->data.u.pyr.format;
							updated = true;
						}
						if (data->u.pyr.width == 0) {
							data->u.pyr.width = meta->data.u.pyr.width;
							updated = true;
						}
						if (data->u.pyr.height == 0) {
							data->u.pyr.height = meta->data.u.pyr.height;
							updated = true;
						}
					}
					// make sure that the data come from output validator matches with object
					if (data->u.pyr.levels != meta->data.u.pyr.levels || data->u.pyr.scale != meta->data.u.pyr.scale) {
						agoAddLogEntry(&kernel->ref, VX_ERROR_INVALID_VALUE, "ERROR: agoVerifyGraph: kernel %s: invalid value for argument#%d\n", kernel->name, arg);
						return VX_ERROR_INVALID_VALUE;
					}
					else if (data->u.pyr.format != meta->data.u.pyr.format) {
						agoAddLogEntry(&kernel->ref, VX_ERROR_INVALID_FORMAT, "ERROR: agoVerifyGraph: kernel %s: invalid format for argument#%d\n", kernel->name, arg);
						return VX_ERROR_INVALID_FORMAT;
					}
					else if (data->u.pyr.width != meta->data.u.pyr.width || data->u.pyr.height != meta->data.u.pyr.height) {
						agoAddLogEntry(&kernel->ref, VX_ERROR_INVALID_DIMENSION, "ERROR: agoVerifyGraph: kernel %s: invalid dimension for argument#%d\n", kernel->name, arg);
						return VX_ERROR_INVALID_DIMENSION;
					}
					// re-initialize, if updated
					if (updated) {
						char scale[64], desc[64];
						if (data->u.pyr.scale == VX_SCALE_PYRAMID_HALF) sprintf(scale, "HALF");
						else if (data->u.pyr.scale == VX_SCALE_PYRAMID_ORB) sprintf(scale, "ORB");
						else sprintf(scale, "%g", data->u.pyr.scale);
						sprintf(desc, "pyramid-virtual:%4.4s,%d,%d," VX_FMT_SIZE ",%s", FORMAT_STR(data->u.pyr.format), data->u.pyr.width, data->u.pyr.height, data->u.pyr.levels, scale);
						data->isNotFullyConfigured = vx_true_e;
						if (agoGetDataFromDescription(graph->ref.context, graph, data, desc)) {
							agoAddLogEntry(&data->ref, VX_FAILURE, "ERROR: agoVerifyGraph: agoVerifyGraph update failed for %s\n", desc);
							return -1;
						}
						if (data) {
							agoAddData(&graph->dataList, data);
							// add the children too
							for (vx_uint32 i = 0; i < data->numChildren; i++) {
								agoAddData(&graph->dataList, data->children[i]);
								for (vx_uint32 j = 0; j < data->children[i]->numChildren; j++) {
									if (data->children[i]->children[j]) {
										agoAddData(&graph->dataList, data->children[i]->children[j]);
									}
								}
							}
						}
						data->isNotFullyConfigured = vx_false_e;
					}
					// update valid rectangle
					data->u.pyr.rect_valid = meta->data.u.pyr.rect_valid;
					if (data->u.pyr.rect_valid.end_x == INT_MAX)
						data->u.pyr.rect_valid.end_x = data->u.pyr.width;
					if (data->u.pyr.rect_valid.end_y == INT_MAX)
						data->u.pyr.rect_valid.end_y = data->u.pyr.height;
					// propagate valid rectangle to all images inside the pyramid
					for (vx_uint32 i = 0; i < data->numChildren; i++) {
						AgoData * img = data->children[i];
						if (img) {
							vx_float32 xscale = (vx_float32)img->u.img.width / (vx_float32)data->u.pyr.width;
							vx_float32 yscale = (vx_float32)img->u.img.height / (vx_float32)data->u.pyr.height;
							img->u.img.rect_valid.start_x = (vx_uint32)ceilf(data->u.pyr.rect_valid.start_x * xscale);
							img->u.img.rect_valid.start_y = (vx_uint32)ceilf(data->u.pyr.rect_valid.start_y * yscale);
							img->u.img.rect_valid.end_x = (vx_uint32)floorf(data->u.pyr.rect_valid.end_x * xscale);
							img->u.img.rect_valid.end_y = (vx_uint32)floorf(data->u.pyr.rect_valid.end_y * yscale);
						}
					}
				}
				else if (meta->data.ref.type == VX_TYPE_ARRAY) {
					bool updated = false;
					if (data->isVirtual) {
						// update itemtype/capacity if not specified
						if (data->u.arr.itemtype == VX_TYPE_INVALID) {
							data->u.arr.itemtype = meta->data.u.arr.itemtype;
							updated = true;
						}
						if (data->u.arr.capacity == 0) {
							data->u.arr.capacity = meta->data.u.arr.capacity;
							updated = true;
						}
					}
					// make sure that the data come from output validator matches with object
					if (data->u.arr.itemtype != meta->data.u.arr.itemtype) {
						agoAddLogEntry(&kernel->ref, VX_ERROR_INVALID_TYPE, "ERROR: agoVerifyGraph: kernel %s: invalid array type for argument#%d\n", kernel->name, arg);
						return VX_ERROR_INVALID_TYPE;
					}
					else if (!data->u.arr.capacity || (meta->data.u.arr.capacity && meta->data.u.arr.capacity > data->u.arr.capacity)) {
						agoAddLogEntry(&kernel->ref, VX_ERROR_INVALID_DIMENSION, "ERROR: agoVerifyGraph: kernel %s: invalid dimension for argument#%d\n", kernel->name, arg);
						return VX_ERROR_INVALID_DIMENSION;
					}
					if (updated) {
						data->isNotFullyConfigured = vx_true_e;
						char desc[64]; sprintf(desc, "array-virtual:%u,%u", data->u.arr.itemtype, (vx_uint32)data->u.arr.capacity);
						if (agoGetDataFromDescription(graph->ref.context, graph, data, desc)) {
							agoAddLogEntry(&graph->ref, VX_FAILURE, "ERROR: agoVerifyGraph: agoVerifyGraph update failed for %s\n", desc);
							return -1;
						}
						data->isNotFullyConfigured = vx_false_e;
					}
				}
				else if (meta->data.ref.type == VX_TYPE_SCALAR) {
					// make sure that the data come from output validator matches with object
					if (data->u.scalar.type != meta->data.u.scalar.type) {
						agoAddLogEntry(&kernel->ref, VX_ERROR_INVALID_TYPE, "ERROR: agoVerifyGraph: kernel %s: invalid type for argument#%d\n", kernel->name, arg);
						return VX_ERROR_INVALID_TYPE;
					}
				}
				else if (meta->data.ref.type == VX_TYPE_MATRIX) {
					// make sure that the data come from output validator matches with object
					if ((data->u.mat.type != meta->data.u.mat.type) || (data->u.mat.columns != meta->data.u.mat.columns) || (data->u.mat.rows != meta->data.u.mat.rows)) {
						agoAddLogEntry(&kernel->ref, VX_ERROR_INVALID_TYPE, "ERROR: agoVerifyGraph: kernel %s: invalid matrix meta for argument#%d\n", kernel->name, arg);
						return VX_ERROR_INVALID_TYPE;
					}
				}
				else if (meta->data.ref.type == VX_TYPE_DISTRIBUTION) {
					// nothing to do
				}
				else if (meta->data.ref.type == VX_TYPE_LUT) {
					// nothing to do
				}
				else if (meta->data.ref.type == VX_TYPE_REMAP) {
					// nothing to do
				}
				else if (meta->data.ref.type == VX_TYPE_TENSOR) {
					// make sure that the data come from output validator matches with object
					bool mismatched = false;
					if ((data->u.tensor.num_dims != meta->data.u.tensor.num_dims) || (data->u.tensor.data_type != meta->data.u.tensor.data_type) || (data->u.tensor.fixed_point_pos != meta->data.u.tensor.fixed_point_pos)) {
						mismatched = true;
					}
					for (vx_size i = 0; i < data->u.tensor.num_dims; i++) {
						if (data->u.tensor.dims[i] != meta->data.u.tensor.dims[i])
							mismatched = true;
					}
					if (mismatched) {
						agoAddLogEntry(&kernel->ref, VX_ERROR_INVALID_TYPE, "ERROR: agoVerifyGraph: kernel %s: invalid tensor meta for argument#%d\n", kernel->name, arg);
						return VX_ERROR_INVALID_TYPE;
					}
				}
				else if (meta->data.ref.type == AGO_TYPE_CANNY_STACK) {
					// nothing to do
				}
				else if (meta->data.ref.type == AGO_TYPE_MINMAXLOC_DATA) {
					// nothing to do
				}
				else if (meta->data.ref.type == AGO_TYPE_MEANSTDDEV_DATA) {
					// nothing to do
				}
				else if (kernel->argType[arg]) {
					agoAddLogEntry(&kernel->ref, VX_ERROR_INVALID_TYPE, "ERROR: agoVerifyGraph: kernel %s: invalid type for argument#%d\n", kernel->name, arg);
					return VX_ERROR_INVALID_TYPE;
				}
			}
			else if ((kernel->argConfig[arg] & (AGO_KERNEL_ARG_INPUT_FLAG | AGO_KERNEL_ARG_OUTPUT_FLAG)) == (AGO_KERNEL_ARG_INPUT_FLAG | AGO_KERNEL_ARG_OUTPUT_FLAG)) {
#if 0 // TBD: disabled temporarily as a quick workaround for bidirectional buffer issue
				// virtual objects can not be used as bidirectional arguments
				if (data->isVirtual) {
					agoAddLogEntry(&kernel->ref, VX_ERROR_INVALID_PARAMETERS, "ERROR: agoVerifyGraph: kernel %s: bidirectional argument shouldn't be virtual for argument#%d (%s)\n", kernel->name, arg, data->name.c_str());
					return VX_ERROR_INVALID_PARAMETERS;
				}
#endif
			}
		}
	}

	return status;
}

int agoVerifyGraph(AgoGraph * graph)
{
	// compute node hierarchy in the graph: this takes care of
	//    - single writers
	//    - no loops
	vx_status status = agoOptimizeDramaComputeGraphHierarchy(graph);
	if (status) {
		return status;
	}
	agoOptimizeDramaSortGraphHierarchy(graph);

	// initialize valid region every input image/pyramid to its full region
	// and reset the user virtul buffer owner
	for (AgoNode * node = graph->nodeList.head; node; node = node->next) {
		for (vx_uint32 i = 0; i < node->paramCount; i++) {
			AgoData * data = node->paramList[i];
			if (data) {
				if (data->ref.type == VX_TYPE_IMAGE) {
					if (data->isVirtual) {
						data->ownerOfUserBufferOpenCL = nullptr;
					}
				}
			}
		}
	}

	////////////////////////////////////////////////
	// validate node arguments
	////////////////////////////////////////////////
	graph->detectedInvalidNode = false;
	for (AgoNode * node = graph->nodeList.head; node; node = node->next) {
		status = agoVerifyNode(node);
		if (status) {
			return status;
		}
	}

	// compute node hierarchy in the graph: this takes care of
	//    - single writers
	//    - no loops
	status = agoOptimizeDramaComputeGraphHierarchy(graph);
	if (status) {
		return status;
	}

#if ENABLE_OPENCL
	// if all nodes run on GPU, clear enable_node_level_opencl_flush
	bool cpuTargetBufferNodesExists = false;
	for (AgoNode * node = graph->nodeList.head; node; node = node->next) {
		if (node->attr_affinity.device_type == AGO_KERNEL_FLAG_DEVICE_CPU &&
			!node->akernel->opencl_buffer_access_enable)
			cpuTargetBufferNodesExists = true;
	}
	if(!cpuTargetBufferNodesExists) {
		graph->enable_node_level_opencl_flush = false;
	}
#endif

	return status;
}

vx_status agoPrepareImageValidRectangleBuffers(AgoGraph * graph)
{
	vx_status status = VX_SUCCESS;

	////////////////////////////////////////////////
	// prepare for image valid rectangle computation
	////////////////////////////////////////////////
	for (AgoNode * node = graph->nodeList.head; node; node = node->next) {
		vx_uint32 valid_rect_num_inputs = 0, valid_rect_num_outputs = 0;
		for (vx_uint32 i = 0; i < node->paramCount; i++) {
			AgoParameter * param = &node->parameters[i];
			AgoData * data = node->paramList[i];
			if (data) {
				if (data->ref.type == VX_TYPE_IMAGE) {
					if (param->direction == VX_INPUT)
						valid_rect_num_inputs++;
					if (param->direction == VX_OUTPUT)
						valid_rect_num_outputs++;
				}
				else if (data->ref.type == VX_TYPE_PYRAMID) {
					if (param->direction == VX_INPUT)
						valid_rect_num_inputs += (vx_uint32)data->u.pyr.levels;
					if (param->direction == VX_OUTPUT)
						valid_rect_num_outputs += (vx_uint32)data->u.pyr.levels;
				}
			}
		}
		node->valid_rect_num_inputs = valid_rect_num_inputs;
		node->valid_rect_num_outputs = valid_rect_num_outputs;
		if (node->akernel->func && ((node->akernel->flags & AGO_KERNEL_FLAG_GROUP_MASK) == AGO_KERNEL_FLAG_GROUP_AMDLL ||
			(node->akernel->flags & AGO_KERNEL_FLAG_GROUP_MASK) == AGO_KERNEL_FLAG_GROUP_OVX10))
		{
			// nothing to do for built-in kernels
		}
		else
		{
			if (node->valid_rect_inputs)
				delete[] node->valid_rect_inputs;
			if (node->valid_rect_outputs)
				delete[] node->valid_rect_outputs;
			node->valid_rect_inputs = nullptr;
			node->valid_rect_outputs = nullptr;
			if (valid_rect_num_outputs > 0) {
				// allocate valid_rect_outputs[] and valid_rect_inputs[]
				node->valid_rect_outputs = new vx_rectangle_t *[valid_rect_num_outputs]();
				if (!node->valid_rect_outputs) {
					status = VX_ERROR_NO_MEMORY;
					break;
				}
				if (valid_rect_num_inputs > 0) {
					node->valid_rect_inputs = new vx_rectangle_t *[valid_rect_num_inputs]();
					if (!node->valid_rect_inputs) {
						status = VX_ERROR_NO_MEMORY;
						break;
					}
					// prepare valid_rect_inputs[] with valid pointers
					vx_uint32 index = 0;
					for (vx_uint32 i = 0; i < node->paramCount; i++) {
						AgoParameter * param = &node->parameters[i];
						AgoData * data = node->paramList[i];
						if (data && param->direction == VX_INPUT) {
							if (data->ref.type == VX_TYPE_IMAGE) {
								node->valid_rect_inputs[index++] = &data->u.img.rect_valid;
							}
							else if (data->ref.type == VX_TYPE_PYRAMID) {
								for (vx_size level = 0; level < data->u.pyr.levels; level++) {
									node->valid_rect_inputs[index++] = &data->children[level]->u.img.rect_valid;
								}
							}
						}
					}
				}
			}
		}
	}

	return status;
}

vx_status agoComputeImageValidRectangleOutputs(AgoGraph * graph)
{
	vx_status status = VX_SUCCESS;

	// compute output image valid rectangles
	for (AgoNode * node = graph->nodeList.head; node; node = node->next) {
		if (node->akernel->func && ((node->akernel->flags & AGO_KERNEL_FLAG_GROUP_MASK) == AGO_KERNEL_FLAG_GROUP_AMDLL ||
			(node->akernel->flags & AGO_KERNEL_FLAG_GROUP_MASK) == AGO_KERNEL_FLAG_GROUP_OVX10))
		{
			status = node->akernel->func(node, ago_kernel_cmd_valid_rect_callback);
			if (status == AGO_ERROR_KERNEL_NOT_IMPLEMENTED) {
				// consider unimplemented cases as success (nothing needed)
				status = VX_SUCCESS;
			}
		}
		else if (node->valid_rect_outputs) {
			// use node->valid_rect_outputs[] to propagate valid rectangles
			for (vx_uint32 i = 0; i < node->paramCount; i++) {
				AgoParameter * param = &node->parameters[i];
				AgoData * data = node->paramList[i];
				if (data && param->direction == VX_OUTPUT) {
					if (data->ref.type == VX_TYPE_IMAGE) {
						if (node->metaList[i].set_valid_rectangle_callback) {
							node->valid_rect_outputs[0] = &data->u.img.rect_valid;
							status = node->metaList[i].set_valid_rectangle_callback(node, i, node->valid_rect_inputs, node->valid_rect_outputs);
						}
						else if (node->valid_rect_reset) {
							data->u.img.rect_valid.start_x = 0;
							data->u.img.rect_valid.start_y = 0;
							data->u.img.rect_valid.end_x = data->u.img.width;
							data->u.img.rect_valid.end_y = data->u.img.height;
						}
					}
					else if (data->ref.type == VX_TYPE_PYRAMID) {
						if (node->metaList[i].set_valid_rectangle_callback) {
							for (vx_size level = 0; level < data->u.pyr.levels; level++) {
								node->valid_rect_outputs[level] = &data->children[level]->u.img.rect_valid;
							}
							status = node->metaList[i].set_valid_rectangle_callback(node, i, node->valid_rect_inputs, node->valid_rect_outputs);
						}
						else if (node->valid_rect_reset) {
							for (vx_size level = 0; level < data->u.pyr.levels; level++) {
								data->children[level]->u.img.rect_valid.start_x = 0;
								data->children[level]->u.img.rect_valid.start_y = 0;
								data->children[level]->u.img.rect_valid.end_x = data->children[level]->u.img.width;
								data->children[level]->u.img.rect_valid.end_y = data->children[level]->u.img.height;
							}
						}
					}
				}
			}
		}
#if 0 // TBD remove -- dump valid rectangle info
		for (vx_uint32 i = 0; i < node->paramCount; i++) {
			AgoParameter * param = &node->parameters[i];
			AgoData * data = node->paramList[i];
			if (data && param->direction == VX_OUTPUT) {
				if (data->ref.type == VX_TYPE_IMAGE) {
					printf("valid_rect [ %5d %5d %5d %5d ] image %s\n", data->u.img.rect_valid.start_x, data->u.img.rect_valid.start_y, data->u.img.rect_valid.end_x, data->u.img.rect_valid.end_y, data->name.c_str());
				}
				else if (data->ref.type == VX_TYPE_PYRAMID) {
					for (vx_size level = 0; level < data->u.pyr.levels; level++) {
						printf("valid_rect [ %5d %5d %5d %5d ] pyrL%d %s\n", data->children[level]->u.img.rect_valid.start_x, data->children[level]->u.img.rect_valid.start_y, data->children[level]->u.img.rect_valid.end_x, data->children[level]->u.img.rect_valid.end_y, (int)level, data->name.c_str());
					}
				}
			}
		}
#endif
	}

	return status;
}

int agoInitializeGraph(AgoGraph * graph)
{
	for (AgoNode * node = graph->nodeList.head; node; node = node->next)
	{
		AgoKernel * kernel = node->akernel;
		vx_status status = VX_SUCCESS;
		if (kernel->func) {
			status = kernel->func(node, ago_kernel_cmd_initialize);
		}
		else if (kernel->initialize_f) {
			status = kernel->initialize_f(node, (vx_reference *)node->paramList, node->paramCount);
		}
		if (status) {
			return status;
		}
		else {
			if (node->localDataSize > 0 && node->localDataPtr == nullptr) {
				if (node->localDataPtr_allocated)
					delete[] node->localDataPtr_allocated;
				node->localDataPtr = node->localDataPtr_allocated = (vx_uint8 *)agoAllocMemory(node->localDataSize);
				if (!node->localDataPtr) {
					return VX_ERROR_NO_MEMORY;
				}
				memset(node->localDataPtr, 0, node->localDataSize);
			}
			node->initialized = true;
			// keep a copy of paramList into paramListForAgeDelay
			// TBD: needs to handle reverification path
			memcpy(node->paramListForAgeDelay, node->paramList, sizeof(node->paramListForAgeDelay));
		}
	}
	return VX_SUCCESS;
}

#if ENABLE_OPENCL
static int agoWaitForNodesCompletion(AgoGraph * graph)
{
	int status = VX_SUCCESS;
	if (!graph->opencl_nodeListQueued.empty()) {
		for (vx_size i = 0; i < graph->opencl_nodeListQueued.size(); i++) {
			AgoNode * node = graph->opencl_nodeListQueued[i];
			if (node->supernode) {
				if (!node->supernode->launched || agoGpuOclSuperNodeWait(graph, node->supernode) < 0) {
					agoAddLogEntry(&node->ref, VX_FAILURE, "ERROR: agoWaitForNodesCompletion: launched=%d supernode wait failed\n", node->supernode->launched);
					return VX_FAILURE;
				}
				agoPerfCaptureStop(&node->perf);
				for (size_t index = 0; index < node->supernode->nodeList.size(); index++) {
					AgoNode * anode = node->supernode->nodeList[index];
					// node callback
					if (anode->callback) {
						vx_action action = anode->callback(anode);
						if (action == VX_ACTION_ABANDON) {
							status = VX_ERROR_GRAPH_ABANDONED;
							break;
						}
					}
				}
			}
			else {
				if (agoGpuOclSingleNodeWait(graph, node) < 0) {
					agoAddLogEntry(&node->ref, VX_FAILURE, "ERROR: agoWaitForNodesCompletion: single node wait failed\n");
					return VX_FAILURE;
				}
				agoPerfCaptureStop(&node->perf);
				// node callback
				if (node->callback) {
					vx_action action = node->callback(node);
					if (action == VX_ACTION_ABANDON) {
						status = VX_ERROR_GRAPH_ABANDONED;
						break;
					}
				}
			}
		}
		graph->opencl_nodeListQueued.clear();
	}
	return status;
}

static int agoDataSyncFromGpuToCpu(AgoGraph * graph, AgoNode * node, AgoData * dataToSync)
{
	cl_command_queue opencl_cmdq = graph->opencl_cmdq ? graph->opencl_cmdq : graph->ref.context->opencl_cmdq;

	if (dataToSync->opencl_buffer && !(dataToSync->buffer_sync_flags & AGO_BUFFER_SYNC_FLAG_DIRTY_SYNCHED)) {
		if (node->flags & AGO_KERNEL_FLAG_DEVICE_GPU) {
			if (dataToSync->buffer_sync_flags & (AGO_BUFFER_SYNC_FLAG_DIRTY_BY_NODE | AGO_BUFFER_SYNC_FLAG_DIRTY_BY_COMMIT)) {
				int64_t stime = agoGetClockCounter();
				if (dataToSync->ref.type == VX_TYPE_LUT) {
					size_t origin[3] = { 0, 0, 0 };
					size_t region[3] = { 256, 1, 1 };
					cl_int err = clEnqueueWriteImage(opencl_cmdq, dataToSync->opencl_buffer, CL_TRUE, origin, region, 256, 0, dataToSync->buffer, 0, NULL, NULL);
					if (err) { 
						agoAddLogEntry((vx_reference)graph, VX_FAILURE, "ERROR: clEnqueueWriteImage(lut) => %d\n", err);
						return -1; 
					}
				}
				else {
					vx_size size = dataToSync->size;
					if (dataToSync->ref.type == VX_TYPE_ARRAY) {
						// transfer only valid data
						size = dataToSync->u.arr.itemsize * dataToSync->u.arr.numitems;
					}
					if (size > 0) {
						cl_int err = clEnqueueWriteBuffer(opencl_cmdq, dataToSync->opencl_buffer, CL_TRUE, dataToSync->opencl_buffer_offset, size, dataToSync->buffer, 0, NULL, NULL);
						if (err) { 
							agoAddLogEntry((vx_reference)graph, VX_FAILURE, "ERROR: clEnqueueWriteBuffer() => %d\n", err);
							return -1; 
						}
					}
				}
				dataToSync->buffer_sync_flags |= AGO_BUFFER_SYNC_FLAG_DIRTY_SYNCHED;
				int64_t etime = agoGetClockCounter();
				graph->opencl_perf.buffer_write += etime - stime;
			}
		}
		else {
			if (dataToSync->buffer_sync_flags & (AGO_BUFFER_SYNC_FLAG_DIRTY_BY_NODE_CL)) {
				if (dataToSync->ref.type == VX_TYPE_LUT) {
					int64_t stime = agoGetClockCounter();
					size_t origin[3] = { 0, 0, 0 };
					size_t region[3] = { 256, 1, 1 };
					cl_int err = clEnqueueReadImage(opencl_cmdq, dataToSync->opencl_buffer, CL_TRUE, origin, region, 256, 0, dataToSync->buffer, 0, NULL, NULL);
					if (err) { 
						agoAddLogEntry((vx_reference)graph, VX_FAILURE, "ERROR: clEnqueueReadImage(lut) => %d\n", err);
						return -1; 
					}
					dataToSync->buffer_sync_flags |= AGO_BUFFER_SYNC_FLAG_DIRTY_SYNCHED;
					int64_t etime = agoGetClockCounter();
					graph->opencl_perf.buffer_read += etime - stime;
				}
				else {
					vx_size size = dataToSync->size;
					if (dataToSync->ref.type == VX_TYPE_ARRAY) {
						// transfer only region that has valid data
						size = dataToSync->u.arr.numitems * dataToSync->u.arr.itemsize;
					}
					else if (node->akernel->opencl_buffer_access_enable) {
						// no need to transfer to CPU for this node
						size = 0;
					}
					if (size > 0) {
						int64_t stime = agoGetClockCounter();
						if (size > 0) {
							cl_int err = clEnqueueReadBuffer(opencl_cmdq, dataToSync->opencl_buffer, CL_TRUE, dataToSync->opencl_buffer_offset, size, dataToSync->buffer, 0, NULL, NULL);
							if (err) {
								agoAddLogEntry((vx_reference)graph, VX_FAILURE, "ERROR: clEnqueueReadBuffer(0x%x,%s,%ld,%ld) => %d\n", dataToSync->ref.type, dataToSync->name.c_str(), dataToSync->opencl_buffer_offset, size, err);
								return -1;
							}
						}
						dataToSync->buffer_sync_flags |= AGO_BUFFER_SYNC_FLAG_DIRTY_SYNCHED;
						int64_t etime = agoGetClockCounter();
						graph->opencl_perf.buffer_read += etime - stime;
					}
				}
			}
		}
	}
	return 0;
}
#endif

int agoUpdateDelaySlots(AgoNode * node)
{
	vx_graph graph = (vx_graph)node->ref.scope;
	for (vx_uint32 arg = 0; arg < node->paramCount; arg++) {
		AgoData * data = node->paramList[arg];
		if (data && agoIsPartOfDelay(data)) {
			// get the trace to delay object from original node parameter without vxAgeDelay changes
			int siblingTrace[AGO_MAX_DEPTH_FROM_DELAY_OBJECT], siblingTraceCount = 0;
			AgoData * delay = agoGetSiblingTraceToDelayForUpdate(node->paramListForAgeDelay[arg], siblingTrace, siblingTraceCount);
			if (delay) {
				// get the data 
				data = agoGetDataFromTrace(delay, siblingTrace, siblingTraceCount);
				if (data) {
					// update the node parameter
					node->paramList[arg] = data;
				}
				else {
					agoAddLogEntry((vx_reference)graph, VX_FAILURE, "ERROR: SiblingTrace#1 missing\n");
					return VX_FAILURE;
				}
			}
			else {
				agoAddLogEntry((vx_reference)graph, VX_FAILURE, "ERROR: SiblingTrace#2 missing\n");
				return VX_FAILURE;
			}
		}
	}
	return 0;
}

int agoExecuteGraph(AgoGraph * graph)
{
	if (graph->detectedInvalidNode) {
		agoAddLogEntry(&graph->ref, VX_FAILURE, "ERROR: agoExecuteGraph: detected invalid node\n");
		return VX_FAILURE;
	}
	else if (!graph->nodeList.head)
		return VX_SUCCESS;
	int status = VX_SUCCESS;

	agoPerfProfileEntry(graph, ago_profile_type_exec_begin, &graph->ref);
	agoPerfCaptureStart(&graph->perf);

	// update delay slots
	for (AgoNode * node = graph->nodeList.head; node; node = node->next) {
		status = agoUpdateDelaySlots(node);
		if (status != VX_SUCCESS)
			return status;
	}
#if ENABLE_OPENCL
	for (AgoSuperNode * supernode = graph->supernodeList; supernode; supernode = supernode->next) {
		for (size_t arg = 0; arg < supernode->dataList.size(); arg++) {
			AgoData * data = supernode->dataList[arg];
			if (data && agoIsPartOfDelay(data)) {
				// get the trace to delay object from original node parameter without vxAgeDelay changes
				int siblingTrace[AGO_MAX_DEPTH_FROM_DELAY_OBJECT], siblingTraceCount = 0;
				AgoData * delay = agoGetSiblingTraceToDelayForUpdate(supernode->dataListForAgeDelay[arg], siblingTrace, siblingTraceCount);
				if (delay) {
					// get the data 
					data = agoGetDataFromTrace(delay, siblingTrace, siblingTraceCount);
					if (data) {
						// update the supernode parameter
						supernode->dataList[arg] = data;
					}
					else {
						agoAddLogEntry((vx_reference)graph, VX_FAILURE, "ERROR: SiblingTrace#3 missing\n");
						return VX_FAILURE;
					}
				}
				else {
					agoAddLogEntry((vx_reference)graph, VX_FAILURE, "ERROR: SiblingTrace#4 missing\n");
					return VX_FAILURE;
				}
			}
		}
	}
#endif

#if ENABLE_OPENCL
	// clear opencl_buffer for all virtual images with enableUserBufferOpenCL == true
	for (AgoData * data = graph->dataList.head; data; data = data->next) {
		if (data->ref.type == VX_TYPE_IMAGE && data->u.img.enableUserBufferOpenCL) {
			data->opencl_buffer = nullptr;
		}
	}
#endif
	// mark that none of the supernode has been launched
	for (AgoNode * node = graph->nodeList.head; node; node = node->next) {
		if (node->supernode) {
			node->supernode->launched = false;
		}
	}
#if ENABLE_OPENCL
	graph->opencl_nodeListQueued.clear();
	vx_uint32 nodeLaunchHierarchicalLevel = 0;
	memset(&graph->opencl_perf, 0, sizeof(graph->opencl_perf));
#endif
	// execute one nodes in one hierarchical level at a time
	bool opencl_buffer_access_enable = false;
	for (auto enode = graph->nodeList.head; enode;) {
		// get snode..enode with next hierarchical_level 
		auto hierarchical_level = enode->hierarchical_level;
		auto snode = enode; enode = enode->next;
		while (enode && enode->hierarchical_level == hierarchical_level)
			enode = enode->next;
#if ENABLE_OPENCL
		// process GPU nodes at current hierarchical level
		for (auto node = snode; node != enode; node = node->next) {
			if (node->attr_affinity.device_type == AGO_KERNEL_FLAG_DEVICE_GPU) {
				bool launched = true;
				agoPerfProfileEntry(graph, ago_profile_type_launch_begin, &node->ref);
				agoPerfCaptureStart(&node->perf);
				if (!node->supernode) {
					// launch the single node
					if (agoGpuOclSingleNodeLaunch(graph, node) < 0) {
						return -1;
					}
				}
				else if (!node->supernode->launched) {
					// launch the super node
					if (agoGpuOclSuperNodeLaunch(graph, node->supernode) < 0) {
						return -1;
					}
					node->supernode->launched = true;
				}
				else {
					launched = false;
				}
				if (launched) {
					graph->opencl_nodeListQueued.push_back(node);
					if (nodeLaunchHierarchicalLevel == 0) {
						nodeLaunchHierarchicalLevel = node->hierarchical_level;
					}
				}
				agoPerfProfileEntry(graph, ago_profile_type_launch_end, &node->ref);
			}
		}
#endif
		// process CPU nodes at current hierarchical level
		for (auto node = snode; node != enode; node = node->next) {
			if (node->attr_affinity.device_type == AGO_KERNEL_FLAG_DEVICE_CPU) {
#if ENABLE_OPENCL
				opencl_buffer_access_enable |= (node->akernel->opencl_buffer_access_enable ? true : false);
				if (!node->akernel->opencl_buffer_access_enable) {
					agoPerfProfileEntry(graph, ago_profile_type_wait_begin, &node->ref);
					if (nodeLaunchHierarchicalLevel > 0 && nodeLaunchHierarchicalLevel < node->hierarchical_level) {
						status = agoWaitForNodesCompletion(graph);
						if (status != VX_SUCCESS) {
							agoAddLogEntry((vx_reference)graph, VX_FAILURE, "ERROR: agoWaitForNodesCompletion failed (%d:%s)\n", status, agoEnum2Name(status));
							return status;
						}
						nodeLaunchHierarchicalLevel = 0;
					}
					if(opencl_buffer_access_enable) {
						cl_int err = clFinish(graph->opencl_cmdq);
						if (err) {
							agoAddLogEntry(NULL, VX_FAILURE, "ERROR: clFinish(graph) => %d\n", err);
							return VX_FAILURE;
						}
						opencl_buffer_access_enable = false;
					}
					agoPerfProfileEntry(graph, ago_profile_type_wait_end, &node->ref);
				}
				agoPerfProfileEntry(graph, ago_profile_type_copy_begin, &node->ref);
				// make sure that all input buffers are synched
				if (node->akernel->opencl_buffer_access_enable) {
					for (vx_uint32 i = 0; i < node->paramCount; i++) {
						AgoData * data = node->paramList[i];
						if (data && data->opencl_buffer &&
							(node->parameters[i].direction == VX_INPUT || node->parameters[i].direction == VX_BIDIRECTIONAL))
						{
							auto dataToSync = (data->ref.type == VX_TYPE_IMAGE && data->u.img.isROI) ? data->u.img.roiMasterImage : data;
							if (dataToSync->buffer_sync_flags & (AGO_BUFFER_SYNC_FLAG_DIRTY_BY_NODE | AGO_BUFFER_SYNC_FLAG_DIRTY_BY_COMMIT) &&
							    dataToSync->opencl_buffer && !(dataToSync->buffer_sync_flags & AGO_BUFFER_SYNC_FLAG_DIRTY_SYNCHED))
							{
								status = agoDirective((vx_reference)dataToSync, VX_DIRECTIVE_AMD_COPY_TO_OPENCL);
								if(status != VX_SUCCESS) {
									agoAddLogEntry((vx_reference)graph, VX_FAILURE, "ERROR: agoDirective(*,VX_DIRECTIVE_AMD_COPY_TO_OPENCL) failed (%d:%s)\n", status, agoEnum2Name(status));
									return status;
								}
							}
						}
					}
				}
				else {
					for (vx_uint32 i = 0; i < node->paramCount; i++) {
						AgoData * data = node->paramList[i];
						if (data && (node->parameters[i].direction == VX_INPUT || node->parameters[i].direction == VX_BIDIRECTIONAL)) {
							auto dataToSync = (data->ref.type == VX_TYPE_IMAGE && data->u.img.isROI) ? data->u.img.roiMasterImage : data;
							status = agoDataSyncFromGpuToCpu(graph, node, dataToSync);
							for (vx_uint32 j = 0; !status && j < dataToSync->numChildren; j++) {
								AgoData * jdata = dataToSync->children[j];
								if (jdata)
									status = agoDataSyncFromGpuToCpu(graph, node, jdata);
							}
							if (status) {
								agoAddLogEntry((vx_reference)graph, VX_FAILURE, "ERROR: agoDataSyncFromGpuToCpu failed (%d:%s) for node(%s) arg#%d data(%s)\n", status, agoEnum2Name(status), node->akernel->name, i, data->name.c_str());
								return status;
							}
						}
					}
				}
				agoPerfProfileEntry(graph, ago_profile_type_copy_end, &node->ref);
#endif
				// execute node
				agoPerfProfileEntry(graph, ago_profile_type_exec_begin, &node->ref);
				agoPerfCaptureStart(&node->perf);
				AgoKernel * kernel = node->akernel;
				status = VX_SUCCESS;
				if (kernel->func) {
					status = kernel->func(node, ago_kernel_cmd_execute);
					if (status == AGO_ERROR_KERNEL_NOT_IMPLEMENTED)
						status = VX_ERROR_NOT_IMPLEMENTED;
				}
				else if (kernel->kernel_f) {
					status = kernel->kernel_f(node, (vx_reference *)node->paramList, node->paramCount);
				}
				if (status) {
					agoAddLogEntry((vx_reference)graph, VX_FAILURE, "ERROR: kernel %s exec failed (%d:%s)\n", kernel->name, status, agoEnum2Name(status));
					return status;
				}
				agoPerfCaptureStop(&node->perf);
				agoPerfProfileEntry(graph, ago_profile_type_exec_end, &node->ref);
#if ENABLE_OPENCL
				// mark that node outputs are dirty
				for (vx_uint32 i = 0; i < node->paramCount; i++) {
					AgoData * data = node->paramList[i];
					if (data && data->opencl_buffer &&
						(node->parameters[i].direction == VX_OUTPUT || node->parameters[i].direction == VX_BIDIRECTIONAL))
					{
						auto dataToSync = (data->ref.type == VX_TYPE_IMAGE && data->u.img.isROI) ? data->u.img.roiMasterImage : data;
						dataToSync->buffer_sync_flags &= ~AGO_BUFFER_SYNC_FLAG_DIRTY_MASK;
						dataToSync->buffer_sync_flags |=
							((node->akernel->opencl_buffer_access_enable || data->u.img.enableUserBufferOpenCL)
								? AGO_BUFFER_SYNC_FLAG_DIRTY_BY_NODE_CL
								: AGO_BUFFER_SYNC_FLAG_DIRTY_BY_NODE);
					}
				}
#endif
				// node callback
				if (node->callback) {
					vx_action action = node->callback(node);
					if (action == VX_ACTION_ABANDON) {
						return VX_ERROR_GRAPH_ABANDONED;
					}
				}
			}
		}
	}
#if ENABLE_OPENCL
	agoPerfProfileEntry(graph, ago_profile_type_wait_begin, &graph->ref);
	if (nodeLaunchHierarchicalLevel > 0) {
		status = agoWaitForNodesCompletion(graph);
		if (status != VX_SUCCESS) {
			agoAddLogEntry((vx_reference)graph, VX_FAILURE, "ERROR: agoWaitForNodesCompletion failed (%d:%s)\n", status, agoEnum2Name(status));
			return status;
		}
	}
	if(opencl_buffer_access_enable) {
		cl_int err = clFinish(graph->opencl_cmdq);
		if (err) {
			agoAddLogEntry(NULL, VX_FAILURE, "ERROR: clFinish(graph) => %d\n", err);
			return VX_FAILURE;
		}
	}
	agoPerfProfileEntry(graph, ago_profile_type_wait_end, &graph->ref);
	graph->opencl_perf_total.kernel_enqueue += graph->opencl_perf.kernel_enqueue;
	graph->opencl_perf_total.kernel_wait += graph->opencl_perf.kernel_wait;
	graph->opencl_perf_total.buffer_read += graph->opencl_perf.buffer_read;
	graph->opencl_perf_total.buffer_write += graph->opencl_perf.buffer_write;
#endif

	// auto age delays
	for (auto it = graph->autoAgeDelayList.begin(); it != graph->autoAgeDelayList.end(); it++) {
		if (agoIsValidData(*it, VX_TYPE_DELAY)) {
			agoAgeDelay(*it);
		}
	}

	agoPerfCaptureStop(&graph->perf);
	agoPerfProfileEntry(graph, ago_profile_type_exec_end, &graph->ref);
	graph->execFrameCount++;
	return status;
}

int agoAgeDelay(AgoData * delay)
{
	// cycle through all the pointers by swapping
	AgoData * childLast = delay->children[delay->u.delay.count - 1];
	for (vx_int32 i = (vx_int32)delay->u.delay.count - 1; i > 0; i--) {
		delay->children[i] = delay->children[i - 1];
	}
	delay->children[0] = childLast;
	delay->u.delay.age++;
	return VX_SUCCESS;
}

vx_status agoDirective(vx_reference reference, vx_enum directive)
{
	vx_status status = VX_ERROR_INVALID_REFERENCE;
	if (agoIsValidReference(reference)) {
		vx_context context = reference->context;
		if (agoIsValidContext(context)) {
			CAgoLock lock(context->cs);
			status = VX_SUCCESS;
			switch (directive)
			{
			case VX_DIRECTIVE_ENABLE_LOGGING:
				reference->enable_logging = true;
				break;
			case VX_DIRECTIVE_DISABLE_LOGGING:
				reference->enable_logging = false;
				break;
			case VX_DIRECTIVE_AMD_READ_ONLY:
				if (reference->type == VX_TYPE_CONVOLUTION || reference->type == VX_TYPE_MATRIX) {
					if (((AgoData *)reference)->buffer) {
						reference->read_only = true;
					}
					else {
						status = VX_ERROR_NOT_SUPPORTED;
					}
				}
				else {
					status = VX_ERROR_NOT_SUPPORTED;
				}
				break;
#if ENABLE_OPENCL
			case VX_DIRECTIVE_AMD_COPY_TO_OPENCL:
				status = VX_ERROR_NOT_SUPPORTED;
				if (reference->context->opencl_cmdq) {
					auto data = (AgoData *)reference;
					auto dataToSync = (data->ref.type == VX_TYPE_IMAGE && data->u.img.isROI) ? data->u.img.roiMasterImage : data;
					if (dataToSync->ref.type == VX_TYPE_LUT) {
						if (dataToSync->opencl_buffer) {
							size_t origin[3] = { 0, 0, 0 };
							size_t region[3] = { 256, 1, 1 };
							cl_int err = clEnqueueWriteImage(dataToSync->ref.context->opencl_cmdq, dataToSync->opencl_buffer, CL_TRUE, origin, region, 256, 0, dataToSync->buffer, 0, NULL, NULL);
							if (err) { 
								agoAddLogEntry(NULL, VX_FAILURE, "ERROR: clEnqueueWriteImage(lut) => %d\n", err);
								return VX_FAILURE; 
							}
							dataToSync->buffer_sync_flags |= AGO_BUFFER_SYNC_FLAG_DIRTY_SYNCHED;
							status = VX_SUCCESS;
						}
						else {
							status = VX_ERROR_NOT_ALLOCATED;
						}
					}
					else if (dataToSync->ref.type == VX_TYPE_IMAGE && dataToSync->numChildren > 0) {
						status = VX_ERROR_NOT_ALLOCATED;
						for (vx_uint32 plane = 0; plane < dataToSync->numChildren; plane++) {
							AgoData * img = dataToSync->children[plane];
							if (img) {
								if (img->opencl_buffer) {
									cl_int err = clEnqueueWriteBuffer(img->ref.context->opencl_cmdq, img->opencl_buffer, CL_TRUE, img->opencl_buffer_offset, img->size, img->buffer, 0, NULL, NULL);
									if (err) {
										agoAddLogEntry(NULL, VX_FAILURE, "ERROR: clEnqueueWriteBuffer() => %d\n", err);
										return VX_FAILURE;
									}
									img->buffer_sync_flags |= AGO_BUFFER_SYNC_FLAG_DIRTY_SYNCHED;
									status = VX_SUCCESS;
								}
							}
						}
					}
					else {
						if (dataToSync->opencl_buffer) {
							vx_size size = dataToSync->size;
							if (dataToSync->ref.type == VX_TYPE_ARRAY) {
								// transfer only valid data
								size = dataToSync->u.arr.itemsize * dataToSync->u.arr.numitems;
							}
							if (size > 0) {
								cl_int err = clEnqueueWriteBuffer(dataToSync->ref.context->opencl_cmdq, dataToSync->opencl_buffer, CL_TRUE, dataToSync->opencl_buffer_offset, size, dataToSync->buffer, 0, NULL, NULL);
								if (err) { 
									agoAddLogEntry(NULL, VX_FAILURE, "ERROR: clEnqueueWriteBuffer() => %d\n", err);
									return VX_FAILURE; 
								}
							}
							dataToSync->buffer_sync_flags |= AGO_BUFFER_SYNC_FLAG_DIRTY_SYNCHED;
							status = VX_SUCCESS;
						}
						else {
							status = VX_ERROR_NOT_ALLOCATED;
						}
					}
				}
				break;
#endif
			case VX_DIRECTIVE_AMD_ENABLE_PROFILE_CAPTURE:
			case VX_DIRECTIVE_AMD_DISABLE_PROFILE_CAPTURE:
				if (reference->type == VX_TYPE_GRAPH) {
					((AgoGraph *)reference)->enable_performance_profiling = 
						(directive == VX_DIRECTIVE_AMD_ENABLE_PROFILE_CAPTURE) ? true : false;
				}
				else {
					status = VX_ERROR_NOT_SUPPORTED;
				}
				break;
#if ENABLE_OPENCL
			case VX_DIRECTIVE_AMD_DISABLE_OPENCL_FLUSH:
				if (reference->type == VX_TYPE_GRAPH) {
					((AgoGraph *)reference)->enable_node_level_opencl_flush = false;
				}
				else {
					status = VX_ERROR_NOT_SUPPORTED;
				}
				break;
#endif
			default:
				status = VX_ERROR_NOT_SUPPORTED;
				break;
			}
		}
	}
	return status;
}

vx_status agoGraphDumpPerformanceProfile(AgoGraph * graph, const char * fileName)
{
	bool use_stdout = true;
	FILE * fp = stdout;
	if (fileName && strcmp(fileName, "stdout") != 0) {
		use_stdout = false;
		fp = fopen(fileName, "w");
		if (!fp) {
			agoAddLogEntry(NULL, VX_FAILURE, "ERROR: unable to create: %s\n", fileName);
			return VX_FAILURE;
		}
	}
	fprintf(fp, " COUNT,tmp(ms),avg(ms),min(ms),max(ms),DEV,KERNEL\n");
	int64_t freq = agoGetClockFrequency();
	float factor = 1000.0f / (float)freq; // to convert clock counter to ms
	if (graph->perf.num > 0) {
		fprintf(fp, "%6d,%7.3f,%7.3f,%7.3f,%7.3f,%s,%s\n",
			(int)graph->perf.num, (float)graph->perf.tmp * factor,
			(float)graph->perf.sum * factor / (float)graph->perf.num,
			(float)graph->perf.min * factor, (float)graph->perf.max * factor,
			graph->attr_affinity.device_type == AGO_TARGET_AFFINITY_GPU ? "GPU" : "CPU",
			"GRAPH");
	}
	for (AgoNode * node = graph->nodeList.head; node; node = node->next) {
		if (node->perf.num > 0) {
			fprintf(fp, "%6d,%7.3f,%7.3f,%7.3f,%7.3f,%s,%s\n",
				(int)node->perf.num, (float)node->perf.tmp * factor,
				(float)node->perf.sum * factor / (float)node->perf.num,
				(float)node->perf.min * factor, (float)node->perf.max * factor,
				node->attr_affinity.device_type == AGO_TARGET_AFFINITY_GPU ? "GPU" : "CPU",
				node->akernel->name);
		}
	}
	if (graph->enable_performance_profiling && graph->performance_profile.size() > 0) {
		fprintf(fp, "***PROFILER-OUTPUT***\n");
		fprintf(fp, " frame,type,timestamp(ms),object-name\n");
		int64_t stime = graph->performance_profile[0].time;
		for (auto entry : graph->performance_profile) {
			char name[256];
			if (entry.ref->type == VX_TYPE_GRAPH) strcpy(name, "GRAPH");
			else if (entry.ref->type == VX_TYPE_NODE) strncpy(name, ((AgoNode *)entry.ref)->akernel->name, sizeof(name) - 1);
			else agoGetDataName(name, (AgoData *)entry.ref);
			static const char * type_str[] = {
				"launch(s)", "launch(e)", "wait(s)", "wait(e)", "copy(s)", "copy(e)", "exec(s)", "exec(e)",
				"8", "9", "10", "11", "12", "13", "14", "15"
			};
			fprintf(fp, "%6d,%-9.9s,%13.3f,%s\n", entry.id, type_str[entry.type], (float)(entry.time - stime) * factor, name);
		}
		// clear the profiling data
		graph->performance_profile.clear();
	}
	fflush(fp);
	if (!use_stdout) {
		fclose(fp);
	}
	return VX_SUCCESS;
}

int agoProcessGraph(AgoGraph * graph)
{
	vx_status status = VX_ERROR_INVALID_REFERENCE;
	if (agoIsValidGraph(graph)) {
		CAgoLock lock(graph->cs);

		// make sure that graph is verified
		status = VX_SUCCESS;
		if (!graph->verified) {
			status = vxVerifyGraph(graph);
		}

		// execute graph if possible
		if (status == VX_SUCCESS) {
			if (graph->verified && graph->isReadyToExecute) {
				status = agoExecuteGraph(graph);
			}
			else {
				agoAddLogEntry(&graph->ref, VX_FAILURE, "ERROR: agoProcessGraph: not verified (%d) or not ready to execute (%d)\n", graph->verified, graph->isReadyToExecute);
				status = VX_FAILURE;
			}
		}
	}
	return status;
}

int agoScheduleGraph(AgoGraph * graph)
{
	vx_status status = VX_ERROR_INVALID_REFERENCE;
	if (agoIsValidGraph(graph)) {
		status = VX_SUCCESS;
		graph->threadScheduleCount++;
		if (graph->hThread) {
			if (!graph->verified) {
				// make sure to verify the graph in master thread
				CAgoLock lock(graph->cs);
				status = vxVerifyGraph(graph);
			}
			if (status == VX_SUCCESS) {
				// inform graph thread to execute
				if (!ReleaseSemaphore(graph->hSemToThread, 1, nullptr)) {
					status = VX_ERROR_NO_RESOURCES;
				}
			}
		}
		else {
			status = agoProcessGraph(graph);
		}
	}
	return status;
}

int agoWaitGraph(AgoGraph * graph)
{
	vx_status status = VX_ERROR_INVALID_REFERENCE;
	if (agoIsValidGraph(graph)) {
		status = VX_SUCCESS;
		graph->threadWaitCount++;
		if (graph->hThread) {
			while (graph->threadExecuteCount != graph->threadScheduleCount) {
				if (WaitForSingleObject(graph->hSemFromThread, INFINITE) != WAIT_OBJECT_0) {
					agoAddLogEntry(&graph->ref, VX_FAILURE, "ERROR: agoWaitGraph: WaitForSingleObject failed\n");
					status = VX_FAILURE;
					break;
				}
			}
		}
		if (status == VX_SUCCESS) {
			status = graph->status;
		}
	}
	return status;
}
