// RUN: %hc %s -I/opt/rocm/hsa/include -L/opt/rocm/lib -lhsa-runtime64 -o %t.out && %t.out #include #include #include #include // loop to deliberately slow down kernel execution #define LOOP_COUNT (1024) #define TEST_DEBUG (0) // An example which shows how to use accelerator_view::create_blocking_marker(completion_future&) /// /// The test case only works on HSA because it directly uses HSA runtime API /// It would use completion_future::get_native_handle() to retrieve the /// underlying hsa_signal_t data structure to query if dependent kernels have /// really finished execution before the new kernel is executed. bool test() { bool ret = true; // define inputs and output const int vecSize = 2048; hc::array_view table_a(vecSize); hc::array_view table_b(vecSize); hc::array_view table_c(vecSize); // initialize test data std::random_device rd; std::uniform_int_distribution int_dist; for (int i = 0; i < vecSize; ++i) { table_a[i] = int_dist(rd); table_b[i] = int_dist(rd); } // launch kernel hc::extent<1> e(vecSize); hc::completion_future fut = hc::parallel_for_each( e, [=](hc::index<1> idx) __HC__ { for (int i = 0; i < LOOP_COUNT; ++i) table_c(idx) = table_a(idx) + table_b(idx); }); // create a barrier packet hc::accelerator_view av = hc::accelerator().get_default_view(); hc::accelerator_view av2 = hc::accelerator().create_view(); hc::accelerator_view av3 = hc::accelerator().create_view(); hc::completion_future fut2 = av.create_blocking_marker(fut); void* nativeHandle = fut.get_native_handle(); void* nativeHandle2 = fut2.get_native_handle(); #if TEST_DEBUG std::cout << nativeHandle << "\n"; std::cout << nativeHandle2 << "\n"; #endif hsa_signal_value_t signal_value; hsa_signal_value_t signal_value2; signal_value = hsa_signal_load_scacquire(*static_cast(nativeHandle)); #if TEST_DEBUG std::cout << "kernel signal value: " << signal_value << "\n"; #endif signal_value2 = hsa_signal_load_scacquire(*static_cast(nativeHandle2)); #if TEST_DEBUG std::cout << "blocking barrier signal value: " << signal_value << "\n"; #endif // wait on the barrier packet fut2.wait(); // the barrier packet would ensure all previous packets were processed signal_value = hsa_signal_load_scacquire(*static_cast(nativeHandle)); #if TEST_DEBUG std::cout << "kernel signal value: " << signal_value << "\n"; #endif ret &= (signal_value == 0); signal_value2 = hsa_signal_load_scacquire(*static_cast(nativeHandle2)); #if TEST_DEBUG std::cout << "barrier signal value: " << signal_value << "\n"; #endif ret &= (signal_value2 == 0); // verify int error = 0; for(unsigned i = 0; i < vecSize; i++) { error += table_c[i] - (table_a[i] + table_b[i]); } if (error == 0) { std::cout << "Verify first part success!\n"; } else { std::cout << "Verify first part failed!\n"; } ret &= (error == 0); // Try another case with events that are not dispatched. // These will have null asyncOps so tests corner case in HCC: { hc::completion_future nullcf[7]; hc::completion_future cfA; cfA = av.create_blocking_marker(nullcf[0]); cfA.wait(); cfA= av.create_blocking_marker({nullcf[0], nullcf[1], nullcf[2]}); cfA.wait(); // Test case with > 6 completion futures: cfA = av.create_blocking_marker({nullcf[0], nullcf[1], nullcf[2], nullcf[3], nullcf[4], nullcf[5], nullcf[6]}); cfA.wait(); std::cout << "null signals verify OK\n"; hc::completion_future cf_pfe, cf_pfe2; cf_pfe = hc::parallel_for_each(av, e, [=](hc::index<1> idx) __HC__ { for (int i = 0; i < LOOP_COUNT; ++i) table_c(idx) = table_a(idx) + table_b(idx); }); cfA = av2.create_blocking_marker({nullcf[0], cf_pfe}); cfA.wait(); hsa_signal_value_t signal_value_pfe = hsa_signal_load_scacquire(*static_cast(cf_pfe.get_native_handle())); hsa_signal_value_t signal_value_cbm = hsa_signal_load_scacquire(*static_cast(cfA.get_native_handle())); std::cout << "create_blocking_marker on single PFE verify OK\n"; // Both signals should have completed. assert(signal_value_pfe == 0); assert(signal_value_cbm == 0); // Try a 3-way: // Two kernels sent to different PFE, then wait on all three cf_pfe = hc::parallel_for_each(av, e, [=](hc::index<1> idx) __HC__ { for (int i = 0; i < LOOP_COUNT; ++i) table_c(idx) = table_a(idx) + table_b(idx); }); cf_pfe2 = hc::parallel_for_each(av2, e, [=](hc::index<1> idx) __HC__ { for (int i = 0; i < LOOP_COUNT; ++i) table_c(idx) = table_a(idx) + table_b(idx); }); cfA = av2.create_blocking_marker({nullcf[0], cf_pfe, nullcf[1], cf_pfe2}); cfA.wait(); assert (hsa_signal_load_scacquire(*static_cast(cf_pfe.get_native_handle())) == 0); assert (hsa_signal_load_scacquire(*static_cast(cf_pfe2.get_native_handle())) == 0); assert (hsa_signal_load_scacquire(*static_cast(cfA.get_native_handle())) == 0); std::cout << "create_blocking_marker on dual PFE verify OK\n"; } return ret; } int main() { bool ret = true; ret &= test(); return !(ret == true); }