/* ************************************************************************ * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ #include #include #include #include "testing_common.hpp" using namespace std; /* ============================================================================================ */ template hipblasStatus_t testing_dot_strided_batched(const Arguments& argus) { bool FORTRAN = argus.fortran; auto hipblasDotStridedBatchedFn = FORTRAN ? (CONJ ? hipblasDotcStridedBatched : hipblasDotStridedBatched) : (CONJ ? hipblasDotcStridedBatched : hipblasDotStridedBatched); int N = argus.N; int incx = argus.incx; int incy = argus.incy; double stride_scale = argus.stride_scale; int batch_count = argus.batch_count; int abs_incx = incx >= 0 ? incx : -incx; int abs_incy = incy >= 0 ? incy : -incy; hipblasStride stridex = size_t(N) * abs_incx * stride_scale; hipblasStride stridey = size_t(N) * abs_incy * stride_scale; size_t sizeX = stridex * batch_count; size_t sizeY = stridey * batch_count; if(!sizeX) sizeX = 1; if(!sizeY) sizeY = 1; hipblasLocalHandle handle(argus); // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory if(N <= 0 || batch_count <= 0) { device_vector d_hipblas_result_0(std::max(batch_count, 1)); host_vector h_hipblas_result_0(std::max(1, batch_count)); hipblas_init_nan(h_hipblas_result_0.data(), std::max(1, batch_count)); CHECK_HIP_ERROR(hipMemcpy(d_hipblas_result_0, h_hipblas_result_0, sizeof(T) * std::max(1, batch_count), hipMemcpyHostToDevice)); CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); CHECK_HIPBLAS_ERROR(hipblasDotStridedBatchedFn(handle, N, nullptr, incx, stridex, nullptr, incy, stridey, batch_count, d_hipblas_result_0)); if(batch_count > 0) { host_vector cpu_0(batch_count); host_vector gpu_0(batch_count); CHECK_HIP_ERROR(hipMemcpy( gpu_0, d_hipblas_result_0, sizeof(T) * batch_count, hipMemcpyDeviceToHost)); unit_check_general(1, batch_count, 1, cpu_0, gpu_0); } return HIPBLAS_STATUS_SUCCESS; } // Naming: dX is in GPU (device) memory. hK is in CPU (host) memory, plz follow this practice host_vector hx(sizeX); host_vector hy(sizeY); host_vector h_hipblas_result1(batch_count); host_vector h_hipblas_result2(batch_count); host_vector h_cpu_result(batch_count); device_vector dx(sizeX); device_vector dy(sizeY); device_vector d_hipblas_result(batch_count); double gpu_time_used, hipblas_error_host, hipblas_error_device; // Initial Data on CPU srand(1); hipblas_init_alternating_sign(hx, 1, N, abs_incx, stridex, batch_count); hipblas_init(hy, 1, N, abs_incy, stridey, batch_count); // copy data from CPU to device, does not work for incx != 1 CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T) * sizeX, hipMemcpyHostToDevice)); CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * sizeY, hipMemcpyHostToDevice)); if(argus.unit_check || argus.norm_check) { /* ===================================================================== HIPBLAS =================================================================== */ // hipblasDot accept both dev/host pointer for the scalar CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); CHECK_HIPBLAS_ERROR((hipblasDotStridedBatchedFn)(handle, N, dx, incx, stridex, dy, incy, stridey, batch_count, d_hipblas_result)); CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); CHECK_HIPBLAS_ERROR((hipblasDotStridedBatchedFn)(handle, N, dx, incx, stridex, dy, incy, stridey, batch_count, h_hipblas_result1)); CHECK_HIP_ERROR(hipMemcpy( h_hipblas_result2, d_hipblas_result, sizeof(T) * batch_count, hipMemcpyDeviceToHost)); /* ===================================================================== CPU BLAS =================================================================== */ for(int b = 0; b < batch_count; b++) { (CONJ ? cblas_dotc : cblas_dot)(N, hx.data() + b * stridex, incx, hy.data() + b * stridey, incy, &h_cpu_result[b]); } if(argus.unit_check) { unit_check_general(1, batch_count, 1, h_cpu_result, h_hipblas_result1); unit_check_general(1, batch_count, 1, h_cpu_result, h_hipblas_result2); } if(argus.norm_check) { hipblas_error_host = norm_check_general('F', 1, batch_count, 1, h_cpu_result, h_hipblas_result1); hipblas_error_device = norm_check_general('F', 1, batch_count, 1, h_cpu_result, h_hipblas_result2); } } // end of if unit/norm check if(argus.timing) { hipStream_t stream; CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); int runs = argus.cold_iters + argus.iters; for(int iter = 0; iter < runs; iter++) { if(iter == argus.cold_iters) gpu_time_used = get_time_us_sync(stream); CHECK_HIPBLAS_ERROR((hipblasDotStridedBatchedFn)(handle, N, dx, incx, stridex, dy, incy, stridey, batch_count, d_hipblas_result)); } gpu_time_used = get_time_us_sync(stream) - gpu_time_used; ArgumentModel{}.log_args( std::cout, argus, gpu_time_used, dot_gflop_count(N), dot_gbyte_count(N), hipblas_error_host, hipblas_error_device); } return HIPBLAS_STATUS_SUCCESS; } template hipblasStatus_t testing_dotc_strided_batched(const Arguments& argus) { return testing_dot_strided_batched(argus); }