/* ************************************************************************ * Copyright 2018-2022 Advanced Micro Devices, Inc. * ************************************************************************ */ #pragma once #include "cblas_interface.hpp" #include "norm.hpp" #include "rocblas.hpp" #include "rocblas_init.hpp" #include "rocblas_math.hpp" #include "rocblas_random.hpp" #include "rocblas_test.hpp" #include "rocblas_vector.hpp" #include "unit.hpp" #include "utility.hpp" template using rocblas_reduction_strided_batched_t = rocblas_status (*)(rocblas_handle handle, rocblas_int n, const T* x, rocblas_int incx, rocblas_stride stridex, rocblas_int batch_count, R* result); template void template_testing_reduction_strided_batched_bad_arg( const Arguments& arg, rocblas_reduction_strided_batched_t func) { rocblas_int N = 100, incx = 1, batch_count = 5; static const size_t safe_size = 100; rocblas_local_handle handle{arg}; // // allocate memory on device // device_vector dx(batch_count); CHECK_DEVICE_ALLOCATION(dx.memcheck()); R h_rocblas_result; EXPECT_ROCBLAS_STATUS(func(handle, N, nullptr, incx, incx * N, batch_count, &h_rocblas_result), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(func(handle, N, dx, incx, incx * N, batch_count, nullptr), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS(func(nullptr, N, dx, incx, incx * N, batch_count, &h_rocblas_result), rocblas_status_invalid_handle); } template void template_testing_reduction_strided_batched( const Arguments& arg, rocblas_reduction_strided_batched_t func, void (*REFBLAS_FUNC)(rocblas_int, const T*, rocblas_int, R*)) { rocblas_int N = arg.N, incx = arg.incx, batch_count = arg.batch_count; rocblas_stride stridex = arg.stride_x; double rocblas_error_1, rocblas_error_2; rocblas_local_handle handle{arg}; // check to prevent undefined memory allocation error if(N <= 0 || incx <= 0 || batch_count <= 0) { device_vector d_rocblas_result(std::max(batch_count, 1)); CHECK_DEVICE_ALLOCATION(d_rocblas_result.memcheck()); host_vector h_rocblas_result(std::max(batch_count, 1)); CHECK_HIP_ERROR(h_rocblas_result.memcheck()); CHECK_ROCBLAS_ERROR(rocblas_set_pointer_mode(handle, rocblas_pointer_mode_device)); EXPECT_ROCBLAS_STATUS( func(handle, N, nullptr, incx, stridex, batch_count, d_rocblas_result), rocblas_status_success); CHECK_ROCBLAS_ERROR(rocblas_set_pointer_mode(handle, rocblas_pointer_mode_host)); EXPECT_ROCBLAS_STATUS( func(handle, N, nullptr, incx, stridex, batch_count, h_rocblas_result), rocblas_status_success); if(batch_count > 0) { host_vector cpu_0(batch_count); host_vector gpu_0(batch_count); CHECK_HIP_ERROR(gpu_0.transfer_from(d_rocblas_result)); unit_check_general(1, 1, 1, 1, cpu_0, gpu_0, batch_count); unit_check_general(1, 1, 1, 1, cpu_0, h_rocblas_result, batch_count); } return; } host_vector hr1(batch_count); CHECK_HIP_ERROR(hr1.memcheck()); host_vector hr2(batch_count); CHECK_HIP_ERROR(hr2.memcheck()); host_vector cpu_result(batch_count); CHECK_HIP_ERROR(cpu_result.memcheck()); device_strided_batch_vector dx(N, incx, stridex, batch_count); CHECK_DEVICE_ALLOCATION(dx.memcheck()); host_strided_batch_vector hx(N, incx, stridex, batch_count); CHECK_HIP_ERROR(hx.memcheck()); device_vector dr(batch_count); CHECK_DEVICE_ALLOCATION(dr.memcheck()); double gpu_time_used, cpu_time_used; // // Initialize the host vector. // rocblas_init_vector(hx, arg, rocblas_client_alpha_sets_nan, true); // // Transfer host data to device. // CHECK_HIP_ERROR(dx.transfer_from(hx)); if(arg.unit_check || arg.norm_check) { // // GPU BLAS, rocblas_pointer_mode_host // { CHECK_ROCBLAS_ERROR(rocblas_set_pointer_mode(handle, rocblas_pointer_mode_host)); CHECK_ROCBLAS_ERROR(func(handle, N, dx, incx, stridex, batch_count, hr1)); } // // GPU BLAS, rocblas_pointer_mode_device // { CHECK_ROCBLAS_ERROR(rocblas_set_pointer_mode(handle, rocblas_pointer_mode_device)); CHECK_ROCBLAS_ERROR(func(handle, N, dx, incx, stridex, batch_count, dr)); // // Copy result back to host. // CHECK_HIP_ERROR(hr2.transfer_from(dr)); } // // COMPARE WITH CPU BLAS // { // // Time to execution // cpu_time_used = get_time_us_no_sync(); for(rocblas_int batch_index = 0; batch_index < batch_count; ++batch_index) { REFBLAS_FUNC(N, hx[batch_index], incx, cpu_result + batch_index); } cpu_time_used = get_time_us_no_sync() - cpu_time_used; } // // Check the results // if(arg.unit_check) { unit_check_general(batch_count, 1, 1, cpu_result, hr1); unit_check_general(batch_count, 1, 1, cpu_result, hr2); } // // Check the norm. // if(arg.norm_check) { rocblas_error_1 = 0.0; rocblas_error_2 = 0.0; for(rocblas_int batch_index = 0; batch_index < batch_count; ++batch_index) { double a1 = double(hr1[batch_index]); double a2 = double(hr2[batch_index]); double c = double(cpu_result[batch_index]); rocblas_error_1 = std::max(rocblas_error_1, std::abs((c - a1) / c)); rocblas_error_2 = std::max(rocblas_error_2, std::abs((c - a2) / c)); } } } if(arg.timing) { int number_cold_calls = arg.cold_iters; int number_hot_calls = arg.iters; CHECK_ROCBLAS_ERROR(rocblas_set_pointer_mode(handle, rocblas_pointer_mode_host)); for(int iter = 0; iter < number_cold_calls; iter++) { func(handle, N, dx, incx, stridex, batch_count, hr2); } hipStream_t stream; CHECK_ROCBLAS_ERROR(rocblas_get_stream(handle, &stream)); gpu_time_used = get_time_us_sync(stream); // in microseconds for(int iter = 0; iter < number_hot_calls; iter++) { func(handle, N, dx, incx, stridex, batch_count, hr2); } gpu_time_used = (get_time_us_sync(stream) - gpu_time_used) / number_hot_calls; rocblas_cout << "N,incx,stridex,batch_count,rocblas(us)"; if(arg.norm_check) rocblas_cout << ",CPU(us),error_host_ptr,error_dev_ptr"; rocblas_cout << std::endl; rocblas_cout << N << "," << incx << "," << stridex << "," << batch_count << "," << gpu_time_used; if(arg.norm_check) rocblas_cout << "," << cpu_time_used << "," << rocblas_error_1 << "," << rocblas_error_2; rocblas_cout << std::endl; } }