/* ************************************************************************ * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ #include #include #include #include #include "testing_common.hpp" using namespace std; /* ============================================================================================ */ template hipblasStatus_t testing_tpsv_batched(const Arguments& argus) { bool FORTRAN = argus.fortran; auto hipblasTpsvBatchedFn = FORTRAN ? hipblasTpsvBatched : hipblasTpsvBatched; int N = argus.N; int incx = argus.incx; char char_uplo = argus.uplo_option; char char_diag = argus.diag_option; char char_transA = argus.transA_option; hipblasFillMode_t uplo = char2hipblas_fill(char_uplo); hipblasDiagType_t diag = char2hipblas_diagonal(char_diag); hipblasOperation_t transA = char2hipblas_operation(char_transA); int batch_count = argus.batch_count; int abs_incx = incx < 0 ? -incx : incx; size_t size_A = size_t(N) * N; size_t size_AP = size_t(N) * (N + 1) / 2; size_t size_x = abs_incx * size_t(N); hipblasLocalHandle handle(argus); // argument sanity check, quick return if input parameters are invalid before allocating invalid // memory bool invalid_size = N < 0 || !incx || batch_count < 0; if(invalid_size || !N || !batch_count) { hipblasStatus_t actual = hipblasTpsvBatchedFn( handle, uplo, transA, diag, N, nullptr, nullptr, incx, batch_count); EXPECT_HIPBLAS_STATUS( actual, (invalid_size ? HIPBLAS_STATUS_INVALID_VALUE : HIPBLAS_STATUS_SUCCESS)); return actual; } // Naming: dK is in GPU (device) memory. hK is in CPU (host) memory host_batch_vector hA(size_A, 1, batch_count); host_batch_vector hAP(size_AP, 1, batch_count); host_batch_vector AAT(size_A, 1, batch_count); host_batch_vector hb(N, incx, batch_count); host_batch_vector hx(N, incx, batch_count); host_batch_vector hx_or_b_1(N, incx, batch_count); host_batch_vector hx_or_b_2(N, incx, batch_count); host_batch_vector cpu_x_or_b(N, incx, batch_count); device_batch_vector dAP(size_AP, 1, batch_count); device_batch_vector dx_or_b(N, incx, batch_count); CHECK_HIP_ERROR(dAP.memcheck()); CHECK_HIP_ERROR(dx_or_b.memcheck()); double gpu_time_used, hipblas_error, cumulative_hipblas_error = 0; // Initial Data on CPU hipblas_init(hA, true); hipblas_init(hx); hb.copy_from(hx); for(int b = 0; b < batch_count; b++) { // calculate AAT = hA * hA ^ T cblas_gemm(HIPBLAS_OP_N, HIPBLAS_OP_T, N, N, N, (T)1.0, (T*)hA[b], N, (T*)hA[b], N, (T)0.0, (T*)AAT[b], N); // copy AAT into hA, make hA strictly diagonal dominant, and therefore SPD for(int i = 0; i < N; i++) { T t = 0.0; for(int j = 0; j < N; j++) { hA[b][i + j * N] = AAT[b][i + j * N]; t += abs(AAT[b][i + j * N]); } hA[b][i + i * N] = t; } // calculate Cholesky factorization of SPD matrix hA cblas_potrf(char_uplo, N, hA[b], N); // make hA unit diagonal if diag == rocblas_diagonal_unit if(char_diag == 'U' || char_diag == 'u') { if('L' == char_uplo || 'l' == char_uplo) for(int i = 0; i < N; i++) { T diag = hA[b][i + i * N]; for(int j = 0; j <= i; j++) hA[b][i + j * N] = hA[b][i + j * N] / diag; } else for(int j = 0; j < N; j++) { T diag = hA[b][j + j * N]; for(int i = 0; i <= j; i++) hA[b][i + j * N] = hA[b][i + j * N] / diag; } } // Calculate hb = hA*hx; cblas_trmv(uplo, transA, diag, N, hA[b], N, hb[b], incx); regular_to_packed(uplo == HIPBLAS_FILL_MODE_UPPER, (T*)hA[b], (T*)hAP[b], N); } cpu_x_or_b.copy_from(hb); hx_or_b_1.copy_from(hb); hx_or_b_2.copy_from(hb); CHECK_HIP_ERROR(dAP.transfer_from(hAP)); CHECK_HIP_ERROR(dx_or_b.transfer_from(hx_or_b_1)); /* ===================================================================== HIPBLAS =================================================================== */ if(argus.unit_check || argus.norm_check) { CHECK_HIPBLAS_ERROR(hipblasTpsvBatchedFn(handle, uplo, transA, diag, N, dAP.ptr_on_device(), dx_or_b.ptr_on_device(), incx, batch_count)); // copy output from device to CPU CHECK_HIP_ERROR(hx_or_b_1.transfer_from(dx_or_b)); // Calculating error // For norm_check/bench, currently taking the cumulative sum of errors over all batches for(int b = 0; b < batch_count; b++) { hipblas_error = std::abs(vector_norm_1(N, abs_incx, hx[b], hx_or_b_1[b])); if(argus.unit_check) { double tolerance = std::numeric_limits>::epsilon() * 40 * N; unit_check_error(hipblas_error, tolerance); } cumulative_hipblas_error += hipblas_error; } } if(argus.timing) { hipStream_t stream; CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); CHECK_HIPBLAS_ERROR(hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST)); int runs = argus.cold_iters + argus.iters; for(int iter = 0; iter < runs; iter++) { if(iter == argus.cold_iters) gpu_time_used = get_time_us_sync(stream); CHECK_HIPBLAS_ERROR(hipblasTpsvBatchedFn(handle, uplo, transA, diag, N, dAP.ptr_on_device(), dx_or_b.ptr_on_device(), incx, batch_count)); } gpu_time_used = get_time_us_sync(stream) - gpu_time_used; // in microseconds ArgumentModel{} .log_args(std::cout, argus, gpu_time_used, tpsv_gflop_count(N), tpsv_gbyte_count(N), cumulative_hipblas_error); } return HIPBLAS_STATUS_SUCCESS; }