/* ************************************************************************ * Copyright 2016-2021 Advanced Micro Devices, Inc. * * ************************************************************************ */ #include #include #include #include #include "testing_common.hpp" using namespace std; template hipblasStatus_t testing_geqrf_strided_batched(const Arguments& argus) { using U = real_t; bool FORTRAN = argus.fortran; auto hipblasGeqrfStridedBatchedFn = FORTRAN ? hipblasGeqrfStridedBatched : hipblasGeqrfStridedBatched; int M = argus.M; int N = argus.N; int lda = argus.lda; int batch_count = argus.batch_count; double stride_scale = argus.stride_scale; hipblasStride strideA = lda * N * stride_scale; hipblasStride strideP = min(M, N) * stride_scale; int A_size = strideA * batch_count; int Ipiv_size = strideP * batch_count; // Check to prevent memory allocation error if(M < 0 || N < 0 || lda < M || batch_count < 0) { return HIPBLAS_STATUS_INVALID_VALUE; } if(batch_count == 0) { return HIPBLAS_STATUS_SUCCESS; } // Naming: dK is in GPU (device) memory. hK is in CPU (host) memory host_vector hA(A_size); host_vector hA1(A_size); host_vector hIpiv(Ipiv_size); host_vector hIpiv1(Ipiv_size); int info; device_vector dA(A_size); device_vector dIpiv(Ipiv_size); double gpu_time_used, hipblas_error; hipblasLocalHandle handle(argus); // Initial hA on CPU srand(1); for(int b = 0; b < batch_count; b++) { T* hAb = hA.data() + b * strideA; hipblas_init(hAb, M, N, lda); // scale A to avoid singularities for(int i = 0; i < M; i++) { for(int j = 0; j < N; j++) { if(i == j) hAb[i + j * lda] += 400; else hAb[i + j * lda] -= 4; } } } // Copy data from CPU to device CHECK_HIP_ERROR(hipMemcpy(dA, hA.data(), A_size * sizeof(T), hipMemcpyHostToDevice)); CHECK_HIP_ERROR(hipMemset(dIpiv, 0, Ipiv_size * sizeof(T))); /* ===================================================================== HIPBLAS =================================================================== */ CHECK_HIPBLAS_ERROR(hipblasGeqrfStridedBatchedFn( handle, M, N, dA, lda, strideA, dIpiv, strideP, &info, batch_count)); // Copy output from device to CPU CHECK_HIP_ERROR(hipMemcpy(hA1.data(), dA, A_size * sizeof(T), hipMemcpyDeviceToHost)); CHECK_HIP_ERROR(hipMemcpy(hIpiv1.data(), dIpiv, Ipiv_size * sizeof(T), hipMemcpyDeviceToHost)); if(argus.unit_check || argus.norm_check) { /* ===================================================================== CPU LAPACK =================================================================== */ // Workspace query host_vector work(1); cblas_geqrf(M, N, hA.data(), lda, hIpiv.data(), work.data(), -1); int lwork = type2int(work[0]); // Perform factorization work = host_vector(lwork); for(int b = 0; b < batch_count; b++) { cblas_geqrf( M, N, hA.data() + b * strideA, lda, hIpiv.data() + b * strideP, work.data(), N); } double e1 = norm_check_general('F', M, N, lda, strideA, hA, hA1, batch_count); double e2 = norm_check_general( 'F', min(M, N), 1, min(M, N), strideP, hIpiv, hIpiv1, batch_count); hipblas_error = e1 + e2; if(argus.unit_check) { U eps = std::numeric_limits::epsilon(); double tolerance = eps * 2000; unit_check_error(e1, tolerance); unit_check_error(e2, tolerance); } } if(argus.timing) { hipStream_t stream; CHECK_HIPBLAS_ERROR(hipblasGetStream(handle, &stream)); int runs = argus.cold_iters + argus.iters; for(int iter = 0; iter < runs; iter++) { if(iter == argus.cold_iters) gpu_time_used = get_time_us_sync(stream); CHECK_HIPBLAS_ERROR(hipblasGeqrfStridedBatchedFn( handle, M, N, dA, lda, strideA, dIpiv, strideP, &info, batch_count)); } gpu_time_used = get_time_us_sync(stream) - gpu_time_used; ArgumentModel{}.log_args( std::cout, argus, gpu_time_used, geqrf_gflop_count(N, M), ArgumentLogging::NA_value, hipblas_error); } return HIPBLAS_STATUS_SUCCESS; }