/* ************************************************************************ * Copyright 2016 Advanced Micro Devices, Inc. * * ************************************************************************ */ #include #include #include #include #include "cblas_interface.h" #include "flops.h" #include "hipblas.hpp" #include "norm.h" #include "unit.h" #include "utility.h" using namespace std; /* ============================================================================================ */ template hipblasStatus_t testing_syr_strided_batched(Arguments argus) { int M = argus.M; int N = argus.N; int incx = argus.incx; int lda = argus.lda; char char_uplo = argus.uplo_option; hipblasFillMode_t uplo = char2hipblas_fill(char_uplo); double stride_scale = argus.stride_scale; int batch_count = argus.batch_count; int abs_incx = incx < 0 ? -incx : incx; int strideA = lda * N * stride_scale; int stridex = abs_incx * N * stride_scale; int A_size = strideA * batch_count; int x_size = stridex * batch_count; hipblasStatus_t status = HIPBLAS_STATUS_SUCCESS; // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory // TODO: not ACTUALLY incx < 0 returns invalid as a workaround for cuda tests right now if(M < 0 || N < 0 || lda < 0 || incx <= 0 || batch_count < 0) { return HIPBLAS_STATUS_INVALID_VALUE; } else if(batch_count == 0) { return HIPBLAS_STATUS_SUCCESS; } // Naming: dK is in GPU (device) memory. hK is in CPU (host) memory host_vector hA(A_size); host_vector hA_cpu(A_size); host_vector hx(x_size); device_vector dA(A_size); device_vector dx(x_size); double gpu_time_used, cpu_time_used; double hipblasGflops, cblas_gflops, hipblasBandwidth; double rocblas_error; T alpha = (T)argus.alpha; hipblasHandle_t handle; hipblasCreate(&handle); // Initial Data on CPU srand(1); hipblas_init(hA, M, N, lda, strideA, batch_count); hipblas_init(hx, 1, N, abs_incx, stridex, batch_count); hA_cpu = hA; // copy data from CPU to device hipMemcpy(dA, hA.data(), sizeof(T) * A_size, hipMemcpyHostToDevice); hipMemcpy(dx, hx.data(), sizeof(T) * x_size, hipMemcpyHostToDevice); /* ===================================================================== ROCBLAS =================================================================== */ if(argus.timing) { gpu_time_used = get_time_us(); // in microseconds } for(int iter = 0; iter < 1; iter++) { status = hipblasSyrStridedBatched( handle, uplo, N, (T*)&alpha, dx, incx, stridex, dA, lda, strideA, batch_count); if(status != HIPBLAS_STATUS_SUCCESS) { hipblasDestroy(handle); return status; } } // copy output from device to CPU hipMemcpy(hA.data(), dA, sizeof(T) * A_size, hipMemcpyDeviceToHost); if(argus.unit_check) { /* ===================================================================== CPU BLAS =================================================================== */ for(int b = 0; b < batch_count; b++) { cblas_syr( uplo, N, alpha, hx.data() + b * stridex, incx, hA_cpu.data() + b * strideA, lda); } // enable unit check, notice unit check is not invasive, but norm check is, // unit check and norm check can not be interchanged their order if(argus.unit_check) { unit_check_general(1, N, batch_count, lda, strideA, hA, hA_cpu); } } hipblasDestroy(handle); return HIPBLAS_STATUS_SUCCESS; }