/* ************************************************************************ * Copyright 2016 Advanced Micro Devices, Inc. * * ************************************************************************ */ #include #include #include #include #include "cblas_interface.h" #include "flops.h" #include "hipblas.hpp" #include "norm.h" #include "unit.h" #include "utility.h" using namespace std; /* ============================================================================================ */ template hipblasStatus_t testing_ger_strided_batched(Arguments argus) { int M = argus.M; int N = argus.N; int incx = argus.incx; int incy = argus.incy; int lda = argus.lda; double stride_scale = argus.stride_scale; int batch_count = argus.batch_count; int stride_A = lda * N * stride_scale; int stride_x = M * incx * stride_scale; int stride_y = N * incy * stride_scale; int A_size = stride_A * batch_count; int x_size = stride_x * batch_count; int y_size = stride_y * batch_count; hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS; // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory if(M < 0 || N < 0 || lda < 0 || incx <= 0 || incy <= 0 || batch_count < 0) { return HIPBLAS_STATUS_INVALID_VALUE; } else if(batch_count == 0) { return HIPBLAS_STATUS_SUCCESS; } // Naming: dK is in GPU (device) memory. hK is in CPU (host) memory host_vector hA(A_size); host_vector hB(A_size); host_vector hx(x_size); host_vector hy(y_size); device_vector dA(A_size); device_vector dx(x_size); device_vector dy(y_size); double gpu_time_used, cpu_time_used; double hipblasGflops, cblas_gflops, hipblasBandwidth; double rocblas_error; T alpha = (T)argus.alpha; hipblasHandle_t handle; hipblasCreate(&handle); // Initial Data on CPU srand(1); hipblas_init(hA, M, N, lda, stride_A, batch_count); hipblas_init(hx, 1, M, incx, stride_x, batch_count); hipblas_init(hy, 1, N, incy, stride_y, batch_count); // copy matrix is easy in STL; hB = hA: save a copy in hB which will be output of CPU BLAS hB = hA; // copy data from CPU to device hipMemcpy(dA, hA.data(), sizeof(T) * A_size, hipMemcpyHostToDevice); hipMemcpy(dx, hx.data(), sizeof(T) * x_size, hipMemcpyHostToDevice); hipMemcpy(dy, hy.data(), sizeof(T) * y_size, hipMemcpyHostToDevice); /* ===================================================================== ROCBLAS =================================================================== */ if(argus.timing) { gpu_time_used = get_time_us(); // in microseconds } for(int iter = 0; iter < 1; iter++) { status = hipblasGerStridedBatched(handle, M, N, (T*)&alpha, dx, incx, stride_x, dy, incy, stride_y, dA, lda, stride_A, batch_count); if(status != HIPBLAS_STATUS_SUCCESS) { hipblasDestroy(handle); return status; } } // copy output from device to CPU hipMemcpy(hA.data(), dA, sizeof(T) * A_size, hipMemcpyDeviceToHost); if(argus.unit_check) { /* ===================================================================== CPU BLAS =================================================================== */ for(int b = 0; b < batch_count; b++) { cblas_ger(M, N, alpha, hx.data() + b * stride_x, incx, hy.data() + b * stride_y, incy, hB.data() + b * stride_A, lda); } // enable unit check, notice unit check is not invasive, but norm check is, // unit check and norm check can not be interchanged their order if(argus.unit_check) { unit_check_general(M, N, batch_count, lda, stride_A, hB.data(), hA.data()); } } hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; }