/* ************************************************************************ * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ #include #include #include #include #include "testing_common.hpp" using namespace std; /* ============================================================================================ */ template hipblasStatus_t testing_trsv_batched(const Arguments& argus) { bool FORTRAN = argus.fortran; auto hipblasTrsvBatchedFn = FORTRAN ? hipblasTrsvBatched : hipblasTrsvBatched; int M = argus.M; int incx = argus.incx; int lda = argus.lda; char char_uplo = argus.uplo_option; char char_diag = argus.diag_option; char char_transA = argus.transA_option; hipblasFillMode_t uplo = char2hipblas_fill(char_uplo); hipblasDiagType_t diag = char2hipblas_diagonal(char_diag); hipblasOperation_t transA = char2hipblas_operation(char_transA); int batch_count = argus.batch_count; int abs_incx = incx < 0 ? -incx : incx; size_t size_A = size_t(lda) * M; hipblasLocalHandle handle(argus); // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory bool invalid_size = M < 0 || lda < M || lda < 1 || !incx || batch_count < 0; if(invalid_size || !M || !batch_count) { hipblasStatus_t actual = hipblasTrsvBatchedFn( handle, uplo, transA, diag, M, nullptr, lda, nullptr, incx, batch_count); EXPECT_HIPBLAS_STATUS( actual, (invalid_size ? HIPBLAS_STATUS_INVALID_VALUE : HIPBLAS_STATUS_SUCCESS)); return actual; } // Naming: dK is in GPU (device) memory. hK is in CPU (host) memory host_batch_vector hA(size_A, 1, batch_count); host_batch_vector AAT(size_A, 1, batch_count); host_batch_vector hb(M, incx, batch_count); host_batch_vector hx(M, incx, batch_count); host_batch_vector hx_or_b_1(M, incx, batch_count); device_batch_vector dA(size_A, 1, batch_count); device_batch_vector dx_or_b(M, incx, batch_count); CHECK_HIP_ERROR(dA.memcheck()); CHECK_HIP_ERROR(dx_or_b.memcheck()); double gpu_time_used, hipblas_error, cumulative_hipblas_error; // Initial Data on CPU srand(1); for(int b = 0; b < batch_count; b++) { srand(1); hipblas_init(hA[b], M, M, lda); // calculate AAT = hA * hA ^ T cblas_gemm(HIPBLAS_OP_N, HIPBLAS_OP_T, M, M, M, (T)1.0, (T*)hA[b], lda, (T*)hA[b], lda, (T)0.0, (T*)AAT[b], lda); // copy AAT into hA, make hA strictly diagonal dominant, and therefore SPD for(int i = 0; i < M; i++) { T t = 0.0; for(int j = 0; j < M; j++) { hA[b][i + j * lda] = AAT[b][i + j * lda]; t += abs(AAT[b][i + j * lda]); } hA[b][i + i * lda] = t; } // calculate Cholesky factorization of SPD matrix hA cblas_potrf(char_uplo, M, hA[b], lda); // make hA unit diagonal if diag == rocblas_diagonal_unit if(char_diag == 'U' || char_diag == 'u') { if('L' == char_uplo || 'l' == char_uplo) for(int i = 0; i < M; i++) { T diag = hA[b][i + i * lda]; for(int j = 0; j <= i; j++) hA[b][i + j * lda] = hA[b][i + j * lda] / diag; } else for(int j = 0; j < M; j++) { T diag = hA[b][j + j * lda]; for(int i = 0; i <= j; i++) hA[b][i + j * lda] = hA[b][i + j * lda] / diag; } } } hipblas_init(hx, false); hb.copy_from(hx); for(int b = 0; b < batch_count; b++) { // Calculate hb = hA*hx; cblas_trmv(uplo, transA, diag, M, hA[b], lda, hb[b], incx); } hx_or_b_1.copy_from(hb); CHECK_HIP_ERROR(dx_or_b.transfer_from(hx_or_b_1)); CHECK_HIP_ERROR(dA.transfer_from(hA)); /* ===================================================================== HIPBLAS =================================================================== */ if(argus.unit_check || argus.norm_check) { CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); CHECK_HIPBLAS_ERROR(hipblasTrsvBatchedFn(handle, uplo, transA, diag, M, dA.ptr_on_device(), lda, dx_or_b.ptr_on_device(), incx, batch_count)); CHECK_HIP_ERROR(hx_or_b_1.transfer_from(dx_or_b)); // Calculating error // For norm_check/bench, currently taking the cumulative sum of errors over all batches for(int b = 0; b < batch_count; b++) { hipblas_error = std::abs(vector_norm_1(M, abs_incx, hx[b], hx_or_b_1[b])); if(argus.unit_check) { double tolerance = std::numeric_limits>::epsilon() * 40 * M; unit_check_error(hipblas_error, tolerance); } cumulative_hipblas_error += hipblas_error; } } if(argus.timing) { hipStream_t stream; CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); int runs = argus.cold_iters + argus.iters; for(int iter = 0; iter < runs; iter++) { if(iter == argus.cold_iters) gpu_time_used = get_time_us_sync(stream); CHECK_HIPBLAS_ERROR(hipblasTrsvBatchedFn(handle, uplo, transA, diag, M, dA.ptr_on_device(), lda, dx_or_b.ptr_on_device(), incx, batch_count)); } gpu_time_used = get_time_us_sync(stream) - gpu_time_used; // in microseconds ArgumentModel{} .log_args(std::cout, argus, gpu_time_used, trsv_gflop_count(M), trsv_gbyte_count(M), cumulative_hipblas_error); } return HIPBLAS_STATUS_SUCCESS; }