/* ************************************************************************ * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ #include #include #include #include #include "testing_common.hpp" using namespace std; template hipblasStatus_t testing_getrf_strided_batched(const Arguments& argus) { using U = real_t; bool FORTRAN = argus.fortran; auto hipblasGetrfStridedBatchedFn = FORTRAN ? hipblasGetrfStridedBatched : hipblasGetrfStridedBatched; int M = argus.N; int N = argus.N; int lda = argus.lda; int batch_count = argus.batch_count; double stride_scale = argus.stride_scale; hipblasStride strideA = size_t(lda) * N * stride_scale; hipblasStride strideP = min(M, N) * stride_scale; size_t A_size = strideA * batch_count; size_t Ipiv_size = strideP * batch_count; // Check to prevent memory allocation error if(M < 0 || N < 0 || lda < M || batch_count < 0) { return HIPBLAS_STATUS_INVALID_VALUE; } if(batch_count == 0) { return HIPBLAS_STATUS_SUCCESS; } // Naming: dK is in GPU (device) memory. hK is in CPU (host) memory host_vector hA(A_size); host_vector hA1(A_size); host_vector hIpiv(Ipiv_size); host_vector hIpiv1(Ipiv_size); host_vector hInfo(batch_count); host_vector hInfo1(batch_count); device_vector dA(A_size); device_vector dIpiv(Ipiv_size); device_vector dInfo(batch_count); double gpu_time_used, hipblas_error; hipblasLocalHandle handle(argus); // Initial hA on CPU srand(1); for(int b = 0; b < batch_count; b++) { T* hAb = hA.data() + b * strideA; hipblas_init(hAb, M, N, lda); // scale A to avoid singularities for(int i = 0; i < M; i++) { for(int j = 0; j < N; j++) { if(i == j) hAb[i + j * lda] += 400; else hAb[i + j * lda] -= 4; } } } // Copy data from CPU to device CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), A_size * sizeof(T), hipMemcpyHostToDevice)); CHECK_HIP_ERROR(hipMemset(dIpiv, 0, Ipiv_size * sizeof(int))); CHECK_HIP_ERROR(hipMemset(dInfo, 0, batch_count * sizeof(int))); if(argus.unit_check || argus.norm_check) { /* ===================================================================== HIPBLAS =================================================================== */ CHECK_HIPBLAS_ERROR(hipblasGetrfStridedBatchedFn( handle, N, dA, lda, strideA, dIpiv, strideP, dInfo, batch_count)); // Copy output from device to CPU CHECK_HIP_ERROR(hipMemcpy(hA1.data(), dA, A_size * sizeof(T), hipMemcpyDeviceToHost)); CHECK_HIP_ERROR( hipMemcpy(hIpiv1.data(), dIpiv, Ipiv_size * sizeof(int), hipMemcpyDeviceToHost)); CHECK_HIP_ERROR( hipMemcpy(hInfo1.data(), dInfo, batch_count * sizeof(int), hipMemcpyDeviceToHost)); /* ===================================================================== CPU LAPACK =================================================================== */ for(int b = 0; b < batch_count; b++) { hInfo[b] = cblas_getrf(M, N, hA.data() + b * strideA, lda, hIpiv.data() + b * strideP); } hipblas_error = norm_check_general('F', M, N, lda, strideA, hA, hA1, batch_count); if(argus.unit_check) { U eps = std::numeric_limits::epsilon(); double tolerance = eps * 2000; unit_check_error(hipblas_error, tolerance); } } if(argus.timing) { hipStream_t stream; CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); int runs = argus.cold_iters + argus.iters; for(int iter = 0; iter < runs; iter++) { if(iter == argus.cold_iters) gpu_time_used = get_time_us_sync(stream); CHECK_HIPBLAS_ERROR(hipblasGetrfStridedBatchedFn( handle, N, dA, lda, strideA, dIpiv, strideP, dInfo, batch_count)); } gpu_time_used = get_time_us_sync(stream) - gpu_time_used; ArgumentModel{}.log_args( std::cout, argus, gpu_time_used, getrf_gflop_count(N, M), ArgumentLogging::NA_value, hipblas_error); } return HIPBLAS_STATUS_SUCCESS; }