/* ************************************************************************ * Copyright 2018-2019 Advanced Micro Devices, Inc. * ************************************************************************ */ #include "cblas_interface.hpp" #include "near.hpp" #include "norm.hpp" #include "rocblas.hpp" #include "rocblas_init.hpp" #include "rocblas_math.hpp" #include "rocblas_random.hpp" #include "rocblas_test.hpp" #include "rocblas_vector.hpp" #include "unit.hpp" #include "utility.hpp" template void testing_nrm2_batched_bad_arg_template(const Arguments& arg) { rocblas_int N = 100; rocblas_int incx = 1; rocblas_int batch_count = 1; static const size_t safe_size = 100; rocblas_local_handle handle; device_vector dx(safe_size); device_vector d_rocblas_result(1); if(!dx || !d_rocblas_result) { CHECK_HIP_ERROR(hipErrorOutOfMemory); return; } CHECK_ROCBLAS_ERROR(rocblas_set_pointer_mode(handle, rocblas_pointer_mode_device)); EXPECT_ROCBLAS_STATUS( (rocblas_nrm2_batched(handle, N, nullptr, incx, batch_count, d_rocblas_result)), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS((rocblas_nrm2_batched(handle, N, dx, incx, batch_count, nullptr)), rocblas_status_invalid_pointer); EXPECT_ROCBLAS_STATUS( (rocblas_nrm2_batched(nullptr, N, dx, incx, batch_count, d_rocblas_result)), rocblas_status_invalid_handle); } template void testing_nrm2_batched_template(const Arguments& arg) { rocblas_int N = arg.N; rocblas_int incx = arg.incx; rocblas_int batch_count = arg.batch_count; double rocblas_error_1; double rocblas_error_2; rocblas_local_handle handle; // check to prevent undefined memory allocation error if(N <= 0 || incx <= 0 || batch_count <= 0) { device_batch_vector dx(3, 1, 2); CHECK_HIP_ERROR(dx.memcheck()); device_vector dr(std::max(2, std::abs(batch_count))); CHECK_HIP_ERROR(dr.memcheck()); CHECK_ROCBLAS_ERROR(rocblas_set_pointer_mode(handle, rocblas_pointer_mode_device)); EXPECT_ROCBLAS_STATUS( (rocblas_nrm2_batched(handle, N, dx.ptr_on_device(), incx, batch_count, dr)), (N > 0 && incx > 0 && batch_count < 0) ? rocblas_status_invalid_size : rocblas_status_success); return; } T2 rocblas_result_1[batch_count]; T2 rocblas_result_2[batch_count]; T2 cpu_result[batch_count]; size_t size_x = N * size_t(incx); // allocate memory on device device_vector d_rocblas_result_2(batch_count); if(!d_rocblas_result_2) { CHECK_HIP_ERROR(hipErrorOutOfMemory); return; } // Naming: dx is in GPU (device) memory. hx is in CPU (host) memory, plz follow this practice host_vector hx[batch_count]; // Initial Data on CPU rocblas_seedrand(); for(int i = 0; i < batch_count; i++) { hx[i] = host_vector(size_x); rocblas_init(hx[i], 1, N, incx); } device_batch_vector hdx(batch_count, size_x); // copy data from host to device for(int i = 0; i < batch_count; i++) { CHECK_HIP_ERROR(hipMemcpy(hdx[i], hx[i], size_x * sizeof(T1), hipMemcpyHostToDevice)); } // vector pointers on gpu device_vector dx_pvec(batch_count); if(!dx_pvec) { CHECK_HIP_ERROR(hipErrorOutOfMemory); return; } // copy gpu vector pointers from host to device pointer array CHECK_HIP_ERROR(hipMemcpy(dx_pvec, hdx, sizeof(T1*) * batch_count, hipMemcpyHostToDevice)); double gpu_time_used, cpu_time_used; if(arg.unit_check || arg.norm_check) { // GPU BLAS, rocblas_pointer_mode_host CHECK_ROCBLAS_ERROR(rocblas_set_pointer_mode(handle, rocblas_pointer_mode_host)); CHECK_ROCBLAS_ERROR(( rocblas_nrm2_batched(handle, N, dx_pvec, incx, batch_count, rocblas_result_1))); // GPU BLAS, rocblas_pointer_mode_device CHECK_ROCBLAS_ERROR(rocblas_set_pointer_mode(handle, rocblas_pointer_mode_device)); CHECK_ROCBLAS_ERROR((rocblas_nrm2_batched( handle, N, dx_pvec, incx, batch_count, d_rocblas_result_2))); CHECK_HIP_ERROR(hipMemcpy( rocblas_result_2, d_rocblas_result_2, batch_count * sizeof(T2), hipMemcpyDeviceToHost)); // CPU BLAS cpu_time_used = get_time_us(); for(int i = 0; i < batch_count; i++) { cblas_nrm2(N, hx[i], incx, cpu_result + i); } cpu_time_used = get_time_us() - cpu_time_used; // allowable error is sqrt of precision. This is based on nrm2 calculating the // square root of a sum. It is assumed that the sum will have accuracy =approx= // precision, so nrm2 will have accuracy =approx= sqrt(precision) T2 abs_error = pow(10.0, -(std::numeric_limits::digits10 / 2.0)) * cpu_result[0]; T2 tolerance = 2.0; // accounts for rounding in reduction sum. depends on n. // If test fails, try decreasing n or increasing tolerance. abs_error *= tolerance; if(arg.unit_check) { near_check_general(batch_count, 1, 1, cpu_result, rocblas_result_1, abs_error); near_check_general(batch_count, 1, 1, cpu_result, rocblas_result_2, abs_error); } if(arg.norm_check) { printf("cpu=%e, gpu_host_ptr=%e, gpu_dev_ptr=%e\n", cpu_result[0], rocblas_result_1[0], rocblas_result_2[0]); rocblas_error_1 = std::abs((cpu_result[0] - rocblas_result_1[0]) / cpu_result[0]); rocblas_error_2 = std::abs((cpu_result[0] - rocblas_result_2[0]) / cpu_result[0]); } } if(arg.timing) { int number_cold_calls = 2; int number_hot_calls = 100; CHECK_ROCBLAS_ERROR(rocblas_set_pointer_mode(handle, rocblas_pointer_mode_host)); for(int iter = 0; iter < number_cold_calls; iter++) { rocblas_nrm2_batched(handle, N, dx_pvec, incx, batch_count, rocblas_result_2); } gpu_time_used = get_time_us(); // in microseconds for(int iter = 0; iter < number_hot_calls; iter++) { rocblas_nrm2_batched(handle, N, dx_pvec, incx, batch_count, rocblas_result_2); } gpu_time_used = (get_time_us() - gpu_time_used) / number_hot_calls; std::cout << "N,incx,batch_count,rocblas(us)"; if(arg.norm_check) std::cout << ",CPU(us),error_host_ptr,error_dev_ptr"; std::cout << std::endl; std::cout << N << "," << incx << "," << batch_count << "," << gpu_time_used; if(arg.norm_check) std::cout << "," << cpu_time_used << "," << rocblas_error_1 << "," << rocblas_error_2; std::cout << std::endl; } } template void testing_nrm2_batched_bad_arg(const Arguments& arg) { testing_nrm2_batched_bad_arg_template(arg); } template <> void testing_nrm2_batched_bad_arg(const Arguments& arg) { testing_nrm2_batched_bad_arg_template(arg); } template <> void testing_nrm2_batched_bad_arg(const Arguments& arg) { testing_nrm2_batched_bad_arg_template(arg); } template void testing_nrm2_batched(const Arguments& arg) { testing_nrm2_batched_template(arg); } template <> void testing_nrm2_batched(const Arguments& arg) { testing_nrm2_batched_template(arg); } template <> void testing_nrm2_batched(const Arguments& arg) { testing_nrm2_batched_template(arg); }