/* ************************************************************************ * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ #include #include #include #include #include "testing_common.hpp" using namespace std; /* ============================================================================================ */ template hipblasStatus_t testing_gbmv_strided_batched(const Arguments& argus) { bool FORTRAN = argus.fortran; auto hipblasGbmvStridedBatchedFn = FORTRAN ? hipblasGbmvStridedBatched : hipblasGbmvStridedBatched; int M = argus.M; int N = argus.N; int KL = argus.KL; int KU = argus.KU; int lda = argus.lda; int incx = argus.incx; int incy = argus.incy; double stride_scale = argus.stride_scale; int batch_count = argus.batch_count; hipblasStride stride_A = size_t(lda) * N * stride_scale; hipblasStride stride_x; hipblasStride stride_y; size_t A_size = stride_A * batch_count; int dim_x; int dim_y; hipblasOperation_t transA = char2hipblas_operation(argus.transA_option); if(transA == HIPBLAS_OP_N) { dim_x = N; dim_y = M; } else { dim_x = M; dim_y = N; } int abs_incx = incx >= 0 ? incx : -incx; int abs_incy = incy >= 0 ? incy : -incy; stride_x = size_t(dim_x) * abs_incx * stride_scale; stride_y = size_t(dim_y) * abs_incy * stride_scale; size_t X_size = stride_x * batch_count; size_t Y_size = stride_y * batch_count; hipblasLocalHandle handle(argus); // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory bool invalid_size = M < 0 || N < 0 || lda < KL + KU + 1 || !incx || !incy || KL < 0 || KU < 0 || batch_count < 0; if(invalid_size || !M || !N || !batch_count) { hipblasStatus_t actual = hipblasGbmvStridedBatchedFn(handle, transA, M, N, KL, KU, nullptr, nullptr, lda, stride_A, nullptr, incx, stride_x, nullptr, nullptr, incy, stride_y, batch_count); EXPECT_HIPBLAS_STATUS( actual, (invalid_size ? HIPBLAS_STATUS_INVALID_VALUE : HIPBLAS_STATUS_SUCCESS)); return actual; } // Naming: dK is in GPU (device) memory. hK is in CPU (host) memory host_vector hA(A_size); host_vector hx(X_size); host_vector hy(Y_size); host_vector hy_host(Y_size); host_vector hy_device(Y_size); host_vector hy_cpu(Y_size); device_vector dA(A_size); device_vector dx(X_size); device_vector dy(Y_size); device_vector d_alpha(1); device_vector d_beta(1); double gpu_time_used, hipblas_error_host, hipblas_error_device; T h_alpha = argus.get_alpha(); T h_beta = argus.get_beta(); // Initial Data on CPU srand(1); hipblas_init(hA, M, N, lda, stride_A, batch_count); hipblas_init(hx, 1, dim_x, abs_incx, stride_x, batch_count); hipblas_init(hy, 1, dim_y, abs_incy, stride_y, batch_count); // copy vector is easy in STL; hz = hy: save a copy in hz which will be output of CPU BLAS hy_cpu = hy; // copy data from CPU to device CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), sizeof(T) * A_size, hipMemcpyHostToDevice)); CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T) * X_size, hipMemcpyHostToDevice)); CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * Y_size, hipMemcpyHostToDevice)); CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(T), hipMemcpyHostToDevice)); CHECK_HIP_ERROR(hipMemcpy(d_beta, &h_beta, sizeof(T), hipMemcpyHostToDevice)); if(argus.unit_check || argus.norm_check) { /* ===================================================================== HIPBLAS =================================================================== */ CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); CHECK_HIPBLAS_ERROR(hipblasGbmvStridedBatchedFn(handle, transA, M, N, KL, KU, (T*)&h_alpha, dA, lda, stride_A, dx, incx, stride_x, (T*)&h_beta, dy, incy, stride_y, batch_count)); CHECK_HIP_ERROR(hipMemcpy(hy_host.data(), dy, sizeof(T) * Y_size, hipMemcpyDeviceToHost)); CHECK_HIP_ERROR(hipMemcpy(dy, hy.data(), sizeof(T) * Y_size, hipMemcpyHostToDevice)); CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); CHECK_HIPBLAS_ERROR(hipblasGbmvStridedBatchedFn(handle, transA, M, N, KL, KU, d_alpha, dA, lda, stride_A, dx, incx, stride_x, d_beta, dy, incy, stride_y, batch_count)); CHECK_HIP_ERROR(hipMemcpy(hy_device.data(), dy, sizeof(T) * Y_size, hipMemcpyDeviceToHost)); /* ===================================================================== CPU BLAS =================================================================== */ for(int b = 0; b < batch_count; b++) { cblas_gbmv(transA, M, N, KL, KU, h_alpha, hA.data() + b * stride_A, lda, hx.data() + b * stride_x, incx, h_beta, hy_cpu.data() + b * stride_y, incy); } // enable unit check, notice unit check is not invasive, but norm check is, // unit check and norm check can not be interchanged their order if(argus.unit_check) { unit_check_general(1, dim_y, batch_count, abs_incy, stride_y, hy_cpu, hy_host); unit_check_general(1, dim_y, batch_count, abs_incy, stride_y, hy_cpu, hy_device); } if(argus.norm_check) { hipblas_error_host = norm_check_general( 'F', 1, dim_y, abs_incy, stride_y, hy_cpu, hy_host, batch_count); hipblas_error_device = norm_check_general( 'F', 1, dim_y, abs_incy, stride_y, hy_cpu, hy_device, batch_count); } } if(argus.timing) { hipMemcpy(dy, hy.data(), sizeof(T) * Y_size, hipMemcpyHostToDevice); hipStream_t stream; CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_DEVICE)); int runs = argus.cold_iters + argus.iters; for(int iter = 0; iter < runs; iter++) { if(iter == argus.cold_iters) gpu_time_used = get_time_us_sync(stream); CHECK_HIPBLAS_ERROR(hipblasGbmvStridedBatchedFn(handle, transA, M, N, KL, KU, d_alpha, dA, lda, stride_A, dx, incx, stride_x, d_beta, dy, incy, stride_y, batch_count)); } gpu_time_used = get_time_us_sync(stream) - gpu_time_used; ArgumentModel{} .log_args(std::cout, argus, gpu_time_used, gbmv_gflop_count(transA, M, N, KL, KU), gbmv_gbyte_count(transA, M, N, KL, KU), hipblas_error_host, hipblas_error_device); } return HIPBLAS_STATUS_SUCCESS; }