/*******************************************************************************
 *
 * MIT License
 *
 * Copyright (c) 2020 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 *******************************************************************************/
; generated by igemm_codegen.py
;
.macro .v_u32_div v_q, v_n, v_d, v_tmp4, s_tmp4
    v_cvt_f32_u32     v[\v_tmp4+0],   v[\v_d]
    v_rcp_f32         v[\v_tmp4+0],   v[\v_tmp4+0]
    v_mul_f32         v[\v_tmp4+0],   0x4f800000, v[\v_tmp4+0]
    v_cvt_u32_f32     v[\v_tmp4+0],   v[\v_tmp4+0]
    v_mul_lo_u32      v[\v_tmp4+1],   v[\v_d],      v[\v_tmp4+0]
    v_mul_hi_u32      v[\v_tmp4+2],   v[\v_d],      v[\v_tmp4+0]
    v_sub_co_u32      v[\v_tmp4+3],   vcc, 0,     v[\v_tmp4+1]
    v_cmp_ne_i32      s[\s_tmp4:\s_tmp4+1], 0,          v[\v_tmp4+2]
    v_cndmask_b32     v[\v_tmp4+1],   v[\v_tmp4+3],   v[\v_tmp4+1],   s[\s_tmp4:\s_tmp4+1]
    v_mul_hi_u32      v[\v_tmp4+1],   v[\v_tmp4+1],   v[\v_tmp4+0]
    v_sub_co_u32      v[\v_tmp4+2],   vcc,        v[\v_tmp4+0],   v[\v_tmp4+1]
    v_add_co_u32      v[\v_tmp4+0],   vcc,        v[\v_tmp4+0],   v[\v_tmp4+1]
    v_cndmask_b32     v[\v_tmp4+0],   v[\v_tmp4+0],   v[\v_tmp4+2],   s[\s_tmp4:\s_tmp4+1]
    v_mul_hi_u32      v[\v_tmp4+0],   v[\v_tmp4+0],   v[\v_n]
    v_mul_lo_u32      v[\v_tmp4+1],   v[\v_tmp4+0],   v[\v_d]
    v_sub_co_u32      v[\v_tmp4+2],   vcc,        v[\v_n],      v[\v_tmp4+1]
    v_cmp_ge_u32      s[\s_tmp4:\s_tmp4+1], v[\v_n],      v[\v_tmp4+1]
    v_cmp_ge_u32      s[\s_tmp4+2:\s_tmp4+3], v[\v_tmp4+2],   v[\v_d]
    v_add_co_u32      v[\v_tmp4+2],   vcc, 1, v[\v_tmp4+0]
    s_and_b64         s[\s_tmp4+2:\s_tmp4+3], s[\s_tmp4:\s_tmp4+1], s[\s_tmp4+2:\s_tmp4+3]
    v_add_co_u32      v[\v_tmp4+1],   vcc, -1,    v[\v_tmp4+0]
    v_cndmask_b32     v[\v_tmp4+2],   v[\v_tmp4+0],   v[\v_tmp4+2],      s[\s_tmp4+2:\s_tmp4+3]
    v_cndmask_b32     v[\v_tmp4+2],   v[\v_tmp4+1],   v[\v_tmp4+2],      s[\s_tmp4:\s_tmp4+1]
    v_cmp_ne_i32      vcc,          0,          v[\v_d]
    v_cndmask_b32     v[\v_q],      -1,         v[\v_tmp4+2],      vcc
.endm

.macro .v_u32_div_vs v_q, v_n, s_d, v_tmp4, s_tmp4
    v_cvt_f32_u32     v[\v_tmp4+0],   s[\s_d]
    v_rcp_f32         v[\v_tmp4+0],   v[\v_tmp4+0]
    v_mul_f32         v[\v_tmp4+0],   0x4f800000, v[\v_tmp4+0]
    v_cvt_u32_f32     v[\v_tmp4+0],   v[\v_tmp4+0]
    v_mul_lo_u32      v[\v_tmp4+1],   s[\s_d],      v[\v_tmp4+0]
    v_mul_hi_u32      v[\v_tmp4+2],   s[\s_d],      v[\v_tmp4+0]
    v_sub_co_u32      v[\v_tmp4+3],   vcc, 0,     v[\v_tmp4+1]
    v_cmp_ne_i32      s[\s_tmp4:\s_tmp4+1], 0,          v[\v_tmp4+2]
    v_cndmask_b32     v[\v_tmp4+1],   v[\v_tmp4+3],   v[\v_tmp4+1],   s[\s_tmp4:\s_tmp4+1]
    v_mul_hi_u32      v[\v_tmp4+1],   v[\v_tmp4+1],   v[\v_tmp4+0]
    v_sub_co_u32      v[\v_tmp4+2],   vcc,        v[\v_tmp4+0],   v[\v_tmp4+1]
    v_add_co_u32      v[\v_tmp4+0],   vcc,        v[\v_tmp4+0],   v[\v_tmp4+1]
    v_cndmask_b32     v[\v_tmp4+0],   v[\v_tmp4+0],   v[\v_tmp4+2],   s[\s_tmp4:\s_tmp4+1]
    v_mul_hi_u32      v[\v_tmp4+0],   v[\v_tmp4+0],   v[\v_n]
    v_mul_lo_u32      v[\v_tmp4+1],   s[\s_d],     v[\v_tmp4+0]
    v_sub_co_u32      v[\v_tmp4+2],   vcc,        v[\v_n],      v[\v_tmp4+1]
    v_cmp_ge_u32      s[\s_tmp4:\s_tmp4+1], v[\v_n],      v[\v_tmp4+1]
    v_cmp_le_u32      s[\s_tmp4+2:\s_tmp4+3],  s[\s_d],    v[\v_tmp4+2]
    v_add_co_u32      v[\v_tmp4+2],   vcc, 1, v[\v_tmp4+0]
    s_and_b64         s[\s_tmp4+2:\s_tmp4+3], s[\s_tmp4:\s_tmp4+1], s[\s_tmp4+2:\s_tmp4+3]
    v_add_co_u32      v[\v_tmp4+1],   vcc, -1,    v[\v_tmp4+0]
    v_cndmask_b32     v[\v_tmp4+2],   v[\v_tmp4+0],   v[\v_tmp4+2],      s[\s_tmp4+2:\s_tmp4+3]
    v_cndmask_b32     v[\v_tmp4+2],   v[\v_tmp4+1],   v[\v_tmp4+2],      s[\s_tmp4:\s_tmp4+1]
    v_cmp_ne_i32      vcc,          s[\s_d],   0
    v_cndmask_b32     v[\v_q],      -1,         v[\v_tmp4+2],      vcc
.endm

.macro .v_u32_div_ss v_q, s_n, s_d, v_tmp4, s_tmp4
    v_cvt_f32_u32     v[\v_tmp4+0],   s[\s_d]
    v_rcp_f32         v[\v_tmp4+0],   v[\v_tmp4+0]
    v_mul_f32         v[\v_tmp4+0],   0x4f800000, v[\v_tmp4+0]
    v_cvt_u32_f32     v[\v_tmp4+0],   v[\v_tmp4+0]
    v_mul_lo_u32      v[\v_tmp4+1],   s[\s_d],      v[\v_tmp4+0]
    v_mul_hi_u32      v[\v_tmp4+2],   s[\s_d],      v[\v_tmp4+0]
    v_sub_co_u32      v[\v_tmp4+3],   vcc, 0,     v[\v_tmp4+1]
    v_cmp_ne_i32      s[\s_tmp4:\s_tmp4+1], 0,          v[\v_tmp4+2]
    v_cndmask_b32     v[\v_tmp4+1],   v[\v_tmp4+3],   v[\v_tmp4+1],   s[\s_tmp4:\s_tmp4+1]
    v_mul_hi_u32      v[\v_tmp4+1],   v[\v_tmp4+1],   v[\v_tmp4+0]
    v_sub_co_u32      v[\v_tmp4+2],   vcc,        v[\v_tmp4+0],   v[\v_tmp4+1]
    v_add_co_u32      v[\v_tmp4+0],   vcc,        v[\v_tmp4+0],   v[\v_tmp4+1]
    v_cndmask_b32     v[\v_tmp4+0],   v[\v_tmp4+0],   v[\v_tmp4+2],   s[\s_tmp4:\s_tmp4+1]
    v_mul_hi_u32      v[\v_tmp4+0],   s[\s_n],   v[\v_tmp4+0]
    v_mul_lo_u32      v[\v_tmp4+1],   s[\s_d],     v[\v_tmp4+0]
    v_sub_co_u32      v[\v_tmp4+2],   vcc,        s[\s_n],      v[\v_tmp4+1]
    v_cmp_ge_u32      s[\s_tmp4:\s_tmp4+1], s[\s_n],      v[\v_tmp4+1]
    v_cmp_le_u32      s[\s_tmp4+2:\s_tmp4+3],  s[\s_d],    v[\v_tmp4+2]
    v_add_co_u32      v[\v_tmp4+2],   vcc, 1, v[\v_tmp4+0]
    s_and_b64         s[\s_tmp4+2:\s_tmp4+3], s[\s_tmp4:\s_tmp4+1], s[\s_tmp4+2:\s_tmp4+3]
    v_add_co_u32      v[\v_tmp4+1],   vcc, -1,    v[\v_tmp4+0]
    v_cndmask_b32     v[\v_tmp4+2],   v[\v_tmp4+0],   v[\v_tmp4+2],      s[\s_tmp4+2:\s_tmp4+3]
    v_cndmask_b32     v[\v_tmp4+2],   v[\v_tmp4+1],   v[\v_tmp4+2],      s[\s_tmp4:\s_tmp4+1]
    v_cmp_ne_i32      vcc,          s[\s_d],   0
    v_cndmask_b32     v[\v_q],      -1,         v[\v_tmp4+2],      vcc
.endm

; write 1d tensor to global with stride
.macro .v_write1d_strided v_src, s_p_buf_dst, v_dst_os, s_dst_diff, s_dst_os, t_dim_1d
    .itr_1d = 0
    .rept \t_dim_1d
        buffer_store_dword v[\v_src+.itr_1d], v[\v_dst_os], s[\s_p_buf_dst:\s_p_buf_dst+3], s[\s_dst_os] offen
        .if .itr_1d != \t_dim_1d - 1
            s_add_u32 s[\s_dst_os], s[\s_dst_os], s[\s_dst_diff]
        .endif
        .itr_1d = .itr_1d + 1
    .endr
.endm

; write 2d tensor to global with stride
.macro .v_write2d_strided v_src, s_p_dst, v_dst_os, s_dst_diff1d, s_dst_diff2d, s_dst_os_2, t_dim_1d, t_dim_2d
    .itr_2d = 0
    .rept \t_dim_2d
    .v_write1d_strided (\v_src + .itr_2d * \t_dim_1d), \s_p_dst, \v_dst_os, \s_dst_diff1d, \s_dst_os_2, \t_dim_1d
    .if .itr_2d != \t_dim_2d - 1
        s_add_u32 s[\s_dst_os_2+1], s[\s_dst_os_2+1], s[\s_dst_diff2d]
        s_mov_b32 s[\s_dst_os_2], s[\s_dst_os_2+1]
    .endif
    .itr_2d = .itr_2d + 1
    .endr
.endm

; write 3d tensor to global with stride
.macro .v_write3d_strided v_src, s_p_dst, v_dst_os, s_dst_diff1d, s_dst_diff2d, s_dst_diff3d, s_dst_os_3, t_dim_1d, t_dim_2d, t_dim_3d
    .itr_3d = 0
    .rept \t_dim_3d
    .v_write2d_strided (\v_src+ .itr_3d * \t_dim_1d * \t_dim_2d), \s_p_dst, \v_dst_os, \s_dst_diff1d, \s_dst_diff2d, \s_dst_os_3, \t_dim_1d, \t_dim_2d
    .if .itr_3d != \t_dim_3d - 1
        s_add_u32 s[\s_dst_os_3+2], s[\s_dst_os_3+2], s[\s_dst_diff3d]
        s_mov_b32 s[\s_dst_os_3+1], s[\s_dst_os_3+2]
        s_mov_b32 s[\s_dst_os_3], s[\s_dst_os_3+1]
    .endif
    .itr_3d = .itr_3d + 1
    .endr
.endm

; write 4d tensor to global with stride
.macro .v_write4d_strided v_src, s_p_dst, v_dst_os, s_dst_diff1d, s_dst_diff2d, s_dst_diff3d, s_dst_diff4d, s_dst_os_4, t_dim_1d, t_dim_2d, t_dim_3d, t_dim_4d
    .itr_4d = 0
    .rept \t_dim_4d
    .v_write3d_strided (\v_src+ .itr_4d * \t_dim_1d * \t_dim_2d * \t_dim_3d), \s_p_dst, \v_dst_os, \s_dst_diff1d, \s_dst_diff2d, \s_dst_diff3d, \s_dst_os_4, \t_dim_1d, \t_dim_2d, \t_dim_3d
    .if .itr_4d != \t_dim_4d - 1
        s_add_u32 s[\s_dst_os_4+3], s[\s_dst_os_4+3], s[\s_dst_diff4d]
        s_mov_b32 s[\s_dst_os_4+2], s[\s_dst_os_4+3]
        s_mov_b32 s[\s_dst_os_4+1], s[\s_dst_os_4+2]
        s_mov_b32 s[\s_dst_os_4], s[\s_dst_os_4+1]
    .endif
    .itr_4d = .itr_4d + 1
    .endr
.endm

.macro .v_clear_nc vid, num
    _v = \vid
    .rept \num
        v_mov_b32 v[_v], 0
        _v = _v + 1
    .endr
.endm

.macro .v_fma_2x2_s4 c, a, b
    v_mac_f32 v[\c], v[\a], v[\b]
    v_mac_f32 v[\c+1], v[\a], v[\b+1]
    v_mac_f32 v[\c+4], v[\a+1], v[\b]
    v_mac_f32 v[\c+5], v[\a+1], v[\b+1]
.endm

.macro .v_fma_4x4_s8 c, a, b
    v_mac_f32 v[\c], v[\a], v[\b]
    v_mac_f32 v[\c+1], v[\a], v[\b+1]
    v_mac_f32 v[\c+2], v[\a], v[\b+2]
    v_mac_f32 v[\c+3], v[\a], v[\b+3]
    v_mac_f32 v[\c+8], v[\a+1], v[\b]
    v_mac_f32 v[\c+9], v[\a+1], v[\b+1]
    v_mac_f32 v[\c+10], v[\a+1], v[\b+2]
    v_mac_f32 v[\c+11], v[\a+1], v[\b+3]
    v_mac_f32 v[\c+16], v[\a+2], v[\b]
    v_mac_f32 v[\c+17], v[\a+2], v[\b+1]
    v_mac_f32 v[\c+18], v[\a+2], v[\b+2]
    v_mac_f32 v[\c+19], v[\a+2], v[\b+3]
    v_mac_f32 v[\c+24], v[\a+3], v[\b]
    v_mac_f32 v[\c+25], v[\a+3], v[\b+1]
    v_mac_f32 v[\c+26], v[\a+3], v[\b+2]
    v_mac_f32 v[\c+27], v[\a+3], v[\b+3]
.endm

; update v_flag
.macro .v_in_set_flag v_flag, v_in_ihi, v_in_iwi, s_hi, s_wi, s_tmp2
    ;   flag: 0<= * <wi
    v_cmp_le_i32 vcc, 0, v[\v_in_ihi]
    v_cmp_gt_i32 s[\s_tmp2:\s_tmp2+1], s[\s_hi], v[\v_in_ihi]
    s_and_b64 vcc, vcc, s[\s_tmp2:\s_tmp2+1]
    v_cndmask_b32 v[\v_flag], 0, 1, vcc
    ;   flag: 0<= * <wi
    v_cmp_le_i32 vcc, 0, v[\v_in_iwi]
    v_cmp_gt_i32 s[\s_tmp2:\s_tmp2+1], s[\s_wi], v[\v_in_iwi]
    s_and_b64 vcc, vcc, s[\s_tmp2:\s_tmp2+1]
    v_cndmask_b32 v[\v_flag], 0, v[\v_flag], vcc
.endm

    ;   flag: 0<= * <wi
    ;v_cmp_gt_u32 vcc, s[\s_hi], v[\v_in_ihi]
    ;v_cndmask_b32 v[\v_flag], 0, 1, vcc

    ;   flag: 0<= * <wi
    ;v_cmp_gt_u32 vcc, s[\s_wi], v[\v_in_iwi] 
    ;v_cndmask_b32 v[\v_flag], 0, v[\v_flag], vcc

; load input from global. {e,n1,b,n2}:{1,2,1,4}
.macro .v_in_load_e_n1_b_n2_1_2_1_4 v_dst, s_p_buf_in, v_in_os, s_in_stride_n1, s_in_stride_n2, v_flag, s_tmp4
    .v_clear_nc \v_dst, 8
    v_cmp_eq_u32 vcc, 1, v[\v_flag]
    s_and_saveexec_b64 s[\s_tmp4+2:\s_tmp4+3], vcc
    buffer_load_dword v[\v_dst+0], v[\v_in_os], s[\s_p_buf_in:\s_p_buf_in+3], 0 offen
    s_mov_b32 s[\s_tmp4], s[\s_in_stride_n2]
    buffer_load_dword v[\v_dst+1], v[\v_in_os], s[\s_p_buf_in:\s_p_buf_in+3], s[\s_tmp4] offen
    s_add_u32 s[\s_tmp4], s[\s_tmp4], s[\s_in_stride_n2]
    buffer_load_dword v[\v_dst+2], v[\v_in_os], s[\s_p_buf_in:\s_p_buf_in+3], s[\s_tmp4] offen
    s_add_u32 s[\s_tmp4], s[\s_tmp4], s[\s_in_stride_n2]
    buffer_load_dword v[\v_dst+3], v[\v_in_os], s[\s_p_buf_in:\s_p_buf_in+3], s[\s_tmp4] offen
    s_mul_i32 s[\s_tmp4], 1, s[\s_in_stride_n1]
    buffer_load_dword v[\v_dst+4], v[\v_in_os], s[\s_p_buf_in:\s_p_buf_in+3], s[\s_tmp4] offen
    s_add_u32 s[\s_tmp4], s[\s_tmp4], s[\s_in_stride_n2]
    buffer_load_dword v[\v_dst+5], v[\v_in_os], s[\s_p_buf_in:\s_p_buf_in+3], s[\s_tmp4] offen
    s_add_u32 s[\s_tmp4], s[\s_tmp4], s[\s_in_stride_n2]
    buffer_load_dword v[\v_dst+6], v[\v_in_os], s[\s_p_buf_in:\s_p_buf_in+3], s[\s_tmp4] offen
    s_add_u32 s[\s_tmp4], s[\s_tmp4], s[\s_in_stride_n2]
    buffer_load_dword v[\v_dst+7], v[\v_in_os], s[\s_p_buf_in:\s_p_buf_in+3], s[\s_tmp4] offen
    s_or_b64 exec, exec, s[\s_tmp4+2:\s_tmp4+3]
.endm

; load input from global. {e,n1,b,n2}:{1,1,1,2}
.macro .v_in_load_e_n1_b_n2_1_1_1_2 v_dst, s_p_buf_in, v_in_os, s_in_stride_n1, s_in_stride_n2, v_flag, s_tmp4
    .v_clear_nc \v_dst, 2
    v_cmp_eq_u32 vcc, 1, v[\v_flag]
    s_and_saveexec_b64 s[\s_tmp4+2:\s_tmp4+3], vcc
    buffer_load_dword v[\v_dst+0], v[\v_in_os], s[\s_p_buf_in:\s_p_buf_in+3], 0 offen
    s_mov_b32 s[\s_tmp4], s[\s_in_stride_n2]
    buffer_load_dword v[\v_dst+1], v[\v_in_os], s[\s_p_buf_in:\s_p_buf_in+3], s[\s_tmp4] offen
    s_or_b64 exec, exec, s[\s_tmp4+2:\s_tmp4+3]
.endm

; load weight from global. {e,k}:{4,2}, vector_e:4
.macro .v_wei_load_e_k_4_2_ev4 v_dst, s_p_buf_wei, v_wei_os, s_wei_stride_k, s_tmp2
    buffer_load_dwordx4 v[\v_dst+0:\v_dst+3], v[\v_wei_os], s[\s_p_buf_wei:\s_p_buf_wei+3], 0 offen
    buffer_load_dwordx4 v[\v_dst+4:\v_dst+7], v[\v_wei_os], s[\s_p_buf_wei:\s_p_buf_wei+3], s[\s_wei_stride_k] offen
.endm

; load weight from global. {e,k}:{1,8}, vector_e:1
.macro .v_wei_load_e_k_1_8_ev1 v_dst, s_p_buf_wei, v_wei_os, s_wei_stride_k, s_tmp2
    buffer_load_dword v[\v_dst+0], v[\v_wei_os], s[\s_p_buf_wei:\s_p_buf_wei+3], 0 offen
    buffer_load_dword v[\v_dst+1], v[\v_wei_os], s[\s_p_buf_wei:\s_p_buf_wei+3], s[\s_wei_stride_k] offen
    s_lshl_b32 s[\s_tmp2], s[\s_wei_stride_k], 1
    buffer_load_dword v[\v_dst+2], v[\v_wei_os], s[\s_p_buf_wei:\s_p_buf_wei+3], s[\s_tmp2] offen
    s_add_u32 s[\s_tmp2], s[\s_tmp2], s[\s_wei_stride_k]
    buffer_load_dword v[\v_dst+3], v[\v_wei_os], s[\s_p_buf_wei:\s_p_buf_wei+3], s[\s_tmp2] offen
    s_add_u32 s[\s_tmp2], s[\s_tmp2], s[\s_wei_stride_k]
    buffer_load_dword v[\v_dst+4], v[\v_wei_os], s[\s_p_buf_wei:\s_p_buf_wei+3], s[\s_tmp2] offen
    s_add_u32 s[\s_tmp2], s[\s_tmp2], s[\s_wei_stride_k]
    buffer_load_dword v[\v_dst+5], v[\v_wei_os], s[\s_p_buf_wei:\s_p_buf_wei+3], s[\s_tmp2] offen
    s_add_u32 s[\s_tmp2], s[\s_tmp2], s[\s_wei_stride_k]
    buffer_load_dword v[\v_dst+6], v[\v_wei_os], s[\s_p_buf_wei:\s_p_buf_wei+3], s[\s_tmp2] offen
    s_add_u32 s[\s_tmp2], s[\s_tmp2], s[\s_wei_stride_k]
    buffer_load_dword v[\v_dst+7], v[\v_wei_os], s[\s_p_buf_wei:\s_p_buf_wei+3], s[\s_tmp2] offen
.endm

; load weight from global. {e,k}:{1,2}, vector_e:1
.macro .v_wei_load_e_k_1_2_ev1 v_dst, s_p_buf_wei, v_wei_os, s_wei_stride_k, s_tmp2
    buffer_load_dword v[\v_dst+0], v[\v_wei_os], s[\s_p_buf_wei:\s_p_buf_wei+3], 0 offen
    buffer_load_dword v[\v_dst+1], v[\v_wei_os], s[\s_p_buf_wei:\s_p_buf_wei+3], s[\s_wei_stride_k] offen
.endm

; store input to LDS. {e,n1,b,n2}:{1,2,1,4}, stride_n1:256, vector_n2:4, offset:0
.macro .v_in_sst_e_n1_b_n2_1_2_1_4_n1s256_n2v4 v_src, v_sst_os
    ds_write_b128 v[\v_sst_os], v[\v_src:\v_src+3] 
    ds_write_b128 v[\v_sst_os], v[\v_src+4:\v_src+4+3] offset:256
.endm

; store input to LDS. {e,n1,b,n2}:{1,1,1,2}, stride_n1:64, vector_n2:2, offset:0
.macro .v_in_sst_e_n1_b_n2_1_1_1_2_n1s64_n2v2 v_src, v_sst_os
    ds_write_b64 v[\v_sst_os], v[\v_src:\v_src+1] 
.endm

; store weight to LDS. {e,k}:{4,2}, stride_e:512, vector_k:2, offset:0
.macro .v_wei_sst_e_k_4_2_es512_kv2 v_src, v_sst_os
    v_swap_b32 v[\v_src+1], v[\v_src+4]
    ds_write2_b64 v[\v_sst_os], v[\v_src:\v_src+1], v[\v_src+4:\v_src+5], offset0:0, offset1:64
    v_swap_b32 v[\v_src+3], v[\v_src+6]
    ds_write2_b64 v[\v_sst_os], v[\v_src+2:\v_src+3], v[\v_src+6:\v_src+7], offset0:128, offset1:192
.endm

; store weight to LDS. {e,k}:{1,8}, vector_k:8, offset:0
.macro .v_wei_sst_e_k_1_8_kv8 v_src, v_sst_os
    ds_write_b128 v[\v_sst_os], v[\v_src:\v_src+3] 
    ds_write_b128 v[\v_sst_os], v[\v_src+4:\v_src+7] offset:16
.endm

; store weight to LDS. {e,k}:{1,2}, stride_e:128, vector_k:2, offset:0
.macro .v_wei_sst_e_k_1_2_es128_kv2 v_src, v_sst_os
    ds_write_b64 v[\v_sst_os], v[\v_src:\v_src+1] offset:0
.endm

; store output to global. s_dst_os_4 need be zero {k0,k1,n1,b,n2}:{2,4,2,1,4}
.macro .v_out_write_k0_k1_n1_b_n2_2_4_2_1_4 v_src, s_p_out, v_out_os, s_out_stride_k0, s_out_stride_k1, s_out_stride_n1, s_out_stride_n2, s_dst_os_4, t_k0, t_k1, t_n1, t_n2
    .v_write4d_strided \v_src,\s_p_out,\v_out_os,\s_out_stride_n2,\s_out_stride_n1,\s_out_stride_k1,\s_out_stride_k0,\s_dst_os_4,4,2,4,2
.endm

; store output to global. s_dst_os_4 need be zero {k0,k1,n1,b,n2}:{2,2,2,1,2}
.macro .v_out_write_k0_k1_n1_b_n2_2_2_2_1_2 v_src, s_p_out, v_out_os, s_out_stride_k0, s_out_stride_k1, s_out_stride_n1, s_out_stride_n2, s_dst_os_4, t_k0, t_k1, t_n1, t_n2
    .v_write4d_strided \v_src,\s_p_out,\v_out_os,\s_out_stride_n2,\s_out_stride_n1,\s_out_stride_k1,\s_out_stride_k0,\s_dst_os_4,2,2,2,2
.endm

; move input slice window. unified for all tunable along e=c*y*x 
; update v_in_os, v_flag, update v_in_ic, v_in_iy, v_in_ix (zero or possitive), v_in_ihi, v_in_iwi (negative, zero, possitive)
.macro .v_in_move_slice_window v_in_os, v_in_ic, v_in_iy, v_in_ix, v_in_ihi, v_in_iwi, v_flag, s_hi, s_wi, s_y, s_x, s_in_stride_c, s_dilation_h, s_dilation_w, s_in_ic, s_in_iy, s_in_ix, v_idc, v_idy, v_idx, s_tmp2
    ; record old ic, iy, ix
    v_mov_b32 v[\v_idx], v[\v_in_ix]
    v_mov_b32 v[\v_idy], v[\v_in_iy]
    v_mov_b32 v[\v_idc], v[\v_in_ic]

    ; update ix, calculate idx, carry-out to iy
    v_add_u32 v[\v_in_ix], s[\s_in_ix], v[\v_in_ix] ; v_in_ix=v_in_ix+s_in_ix
    v_cmp_le_u32 vcc, s[\s_x], v[\v_in_ix]
    s_and_saveexec_b64 s[\s_tmp2:\s_tmp2+1], vcc
    v_subrev_u32 v[\v_in_ix], s[\s_x], v[\v_in_ix]
    v_add_u32 v[\v_in_iy], 1, v[\v_in_iy]
    s_or_b64 exec, exec, s[\s_tmp2:\s_tmp2+1]
    v_sub_i32 v[\v_idx], v[\v_in_ix], v[\v_idx]

    ; update iy, calculate idy, carry-out to ic
    v_add_u32 v[\v_in_iy], s[\s_in_iy], v[\v_in_iy]
    v_cmp_le_u32 vcc, s[\s_y], v[\v_in_iy]
    s_and_saveexec_b64 s[\s_tmp2:\s_tmp2+1], vcc
    v_subrev_u32 v[\v_in_iy], s[\s_y], v[\v_in_iy]
    v_add_u32 v[\v_in_ic], 1, v[\v_in_ic]
    s_or_b64 exec, exec, s[\s_tmp2:\s_tmp2+1]
    v_sub_i32 v[\v_idy], v[\v_in_iy], v[\v_idy]

    ; update ic, calculate idc, ignore overflow check
    v_add_u32 v[\v_in_ic], s[\s_in_ic], v[\v_in_ic]
    v_sub_u32 v[\v_idc], v[\v_in_ic], v[\v_idc]

    ; calculate offset: idc*(s_hi*s_wi) + idy*s_dilation_h*s_wi + idx*s_dilation_w
    ; we use i24 as multiplier, for 24bit(-8388607 ~ 8388608) is enough for index
    ; also, update ihi, iwi here
    v_mul_i32_i24 v[\v_idy], s[\s_dilation_h], v[\v_idy]
    v_mul_i32_i24 v[\v_idx], s[\s_dilation_w], v[\v_idx]
    v_add_i32 v[\v_in_ihi], v[\v_idy], v[\v_in_ihi]
    v_add_i32 v[\v_in_iwi], v[\v_idx], v[\v_in_iwi]
    v_mul_i32_i24 v[\v_idy], s[\s_wi], v[\v_idy]

    v_add_i32 v[\v_idx], v[\v_idx], v[\v_idy]
    v_mul_lo_u32 v[\v_idc], s[\s_in_stride_c], v[\v_idc]
    v_add_i32 v[\v_idc], v[\v_idc], v[\v_idx]
    v_lshl_add_u32 v[\v_in_os], v[\v_idc], 2, v[\v_in_os]   ; indeed, v_idc here must be possitive

    ; update v_flag
    .v_in_set_flag \v_flag, \v_in_ihi, \v_in_iwi, \s_hi, \s_wi, \s_tmp2
.endm

; move weight slice window. unified for all tunable along e=c*y*x 
; update v_wei_os, update v_wei_ic, v_wei_iy, v_wei_ix (zero or possitive)
.macro .v_wei_move_slice_window v_wei_os, v_wei_ic, v_wei_iy, v_wei_ix, s_y, s_x, s_wei_stride_c, s_wei_ic, s_wei_iy, s_wei_ix, v_idc, v_idy, v_idx, s_tmp2
    ; record old ic, iy, ix
    v_mov_b32 v[\v_idx], v[\v_wei_ix]
    v_mov_b32 v[\v_idy], v[\v_wei_iy]
    v_mov_b32 v[\v_idc], v[\v_wei_ic]

    ; update ix, calculate idx, carry-out to iy
    v_add_u32 v[\v_wei_ix], s[\s_wei_ix], v[\v_wei_ix]
    v_cmp_le_u32 vcc, s[\s_x], v[\v_wei_ix]
    s_and_saveexec_b64 s[\s_tmp2:\s_tmp2+1], vcc
    v_subrev_u32 v[\v_wei_ix], s[\s_x], v[\v_wei_ix]
    v_add_u32 v[\v_wei_iy], 1, v[\v_wei_iy]
    s_or_b64 exec, exec, s[\s_tmp2:\s_tmp2+1]
    v_sub_i32 v[\v_idx], v[\v_wei_ix], v[\v_idx]

    ; update iy, calculate idy, carry-out to ic
    v_add_u32 v[\v_wei_iy], s[\s_wei_iy], v[\v_wei_iy]
    v_cmp_le_u32 vcc, s[\s_y], v[\v_wei_iy]
    s_and_saveexec_b64 s[\s_tmp2:\s_tmp2+1], vcc
    v_subrev_u32 v[\v_wei_iy], s[\s_y], v[\v_wei_iy]
    v_add_u32 v[\v_wei_ic], 1, v[\v_wei_ic]
    s_or_b64 exec, exec, s[\s_tmp2:\s_tmp2+1]
    v_sub_i32 v[\v_idy], v[\v_wei_iy], v[\v_idy]

    ; update ic, calculate idc, ignore overflow check
    v_add_u32 v[\v_wei_ic], s[\s_wei_ic], v[\v_wei_ic]
    v_sub_u32 v[\v_idc], v[\v_wei_ic], v[\v_idc]

    ; calculate offset: idc*(s_y*s_x) + idy*s_x + idx
    ; we use i24 as multiplier, for 24bit(-8388607 ~ 8388608) is enough for index
    v_mad_i32_i24 v[\v_idy], s[\s_x], v[\v_idy], v[\v_idx]
    v_mul_lo_u32 v[\v_idc], s[\s_wei_stride_c], v[\v_idc]
    v_add_i32 v[\v_idc], v[\v_idc], v[\v_idy]
    v_lshl_add_u32 v[\v_wei_os], v[\v_idc], 2, v[\v_wei_os]  ; indeed, idc here must be possitive
.endm

; move input slice window. unified for all tunable along e=c*y*x 
; update v_in_os, v_flag, update v_in_ic, v_in_iy, v_in_ix (zero or possitive), v_in_ihi, v_in_iwi (negative, zero, possitive)
.macro .v_in_wei_move_slice_window_wrw v_in_os, v_wei_os, v_in_ic, v_in_iy, v_in_ix, v_in_ihi, v_in_iwi, v_flag, s_hi, s_wi, s_y, s_x, s_in_stride_c, s_wei_stride_c, s_dilation_h, s_dilation_w, s_in_ic, s_in_iy, s_in_ix, v_idc, v_idy, v_idx, v_wei_idc, s_tmp2
    ; record old ic, iy, ix
    v_mov_b32 v[\v_idx], v[\v_in_ix]
    v_mov_b32 v[\v_idy], v[\v_in_iy]
    v_mov_b32 v[\v_idc], v[\v_in_ic]

    ; update ix, calculate idx, carry-out to iy
    v_add_u32 v[\v_in_ix], s[\s_in_ix], v[\v_in_ix] ; v_in_ix=v_in_ix+s_in_ix
    v_cmp_le_u32 vcc, s[\s_x], v[\v_in_ix]
    s_and_saveexec_b64 s[\s_tmp2:\s_tmp2+1], vcc
    v_subrev_u32 v[\v_in_ix], s[\s_x], v[\v_in_ix]
    v_add_u32 v[\v_in_iy], 1, v[\v_in_iy]
    s_or_b64 exec, exec, s[\s_tmp2:\s_tmp2+1]
    v_sub_i32 v[\v_idx], v[\v_in_ix], v[\v_idx]

    ; update iy, calculate idy, carry-out to ic
    v_add_u32 v[\v_in_iy], s[\s_in_iy], v[\v_in_iy]
    v_cmp_le_u32 vcc, s[\s_y], v[\v_in_iy]
    s_and_saveexec_b64 s[\s_tmp2:\s_tmp2+1], vcc
    v_subrev_u32 v[\v_in_iy], s[\s_y], v[\v_in_iy]
    v_add_u32 v[\v_in_ic], 1, v[\v_in_ic]
    s_or_b64 exec, exec, s[\s_tmp2:\s_tmp2+1]
    v_sub_i32 v[\v_idy], v[\v_in_iy], v[\v_idy]

    ; update ic, calculate idc, ignore overflow check
    v_add_u32 v[\v_in_ic], s[\s_in_ic], v[\v_in_ic]
    v_sub_u32 v[\v_idc], v[\v_in_ic], v[\v_idc]

    ; calculate wei offset: idc*(k*y*x)+idy*x+idx
    v_mul_lo_u32 v[\v_wei_idc], s[\s_wei_stride_c], v[\v_idc]
    v_mad_i32_i24 v[\v_wei_idc], v[\v_idy], s[\s_x], v[\v_wei_idc]
    v_add_u32 v[\v_wei_idc], v[\v_wei_idc], v[\v_idx]
    v_lshl_add_u32 v[\v_wei_os], v[\v_wei_idc], 2, v[\v_wei_os]

    ; calculate input offset: idc*(n*s_hi*s_wi) + idy*s_dilation_h*s_wi + idx*s_dilation_w
    ; we use i24 as multiplier, for 24bit(-8388607 ~ 8388608) is enough for index
    ; also, update ihi, iwi here
    v_mul_i32_i24 v[\v_idy], s[\s_dilation_h], v[\v_idy]
    v_mul_i32_i24 v[\v_idx], s[\s_dilation_w], v[\v_idx]
    v_add_i32 v[\v_in_ihi], v[\v_idy], v[\v_in_ihi]
    v_add_i32 v[\v_in_iwi], v[\v_idx], v[\v_in_iwi]
    v_mul_i32_i24 v[\v_idy], s[\s_wi], v[\v_idy]

    v_add_i32 v[\v_idx], v[\v_idx], v[\v_idy]
    v_mul_lo_u32 v[\v_idc], s[\s_in_stride_c], v[\v_idc]
    v_add_i32 v[\v_idc], v[\v_idc], v[\v_idx]
    v_lshl_add_u32 v[\v_in_os], v[\v_idc], 2, v[\v_in_os]   ; indeed, v_idc here must be possitive

    ; update v_flag
    .v_in_set_flag \v_flag, \v_in_ihi, \v_in_iwi, \s_hi, \s_wi, \s_tmp2
.endm

;----------------------------------------------------------
; starting of kernel igemm_v4r1_dynamic_wrw_32x32x4_4x4_2x2x4x2x4x2_4x2x8x1_4x16
; b_per_block                      : 8
; k_per_block                      : 32
; e_per_block                      : 4
; gemm_n_repeat                    : 2
; gemm_m_per_thread_subc           : 2
; gemm_m_level0_cluster            : 2
; gemm_m_level1_cluster            : 4
; gemm_n_per_thread_subc           : 2
; gemm_n_level0_cluster            : 4
; gemm_n_level1_cluster            : 2
; in_block_copy_cluster_lengths_e  : 4
; in_block_copy_cluster_lengths_n1 : 2
; in_block_copy_cluster_lengths_b  : 8
; in_block_copy_cluster_lengths_n2 : 1
; wei_block_copy_cluster_lengths_e : 4
; wei_block_copy_cluster_lengths_k : 16
; 
; in_block_copy_sub_lengths_e      : 1
; in_block_copy_sub_lengths_n1     : 1
; in_block_copy_sub_lengths_b      : 1
; in_block_copy_sub_lengths_n2     : 2
; wei_block_copy_sub_lengths_e     : 1
; wei_block_copy_sub_lengths_k     : 2
; block_size                       : 64
; thread_tile                      : 4x4
; 
; kernarg offset
.set k_p_in,                0
.set k_p_wei,               8
.set k_p_out,               16
.set k_hi,                  24
.set k_wi,                  28
.set k_n,                   32
.set k_k,                   36
.set k_c,                   40
.set k_ho,                  44
.set k_wo,                  48
.set k_stride_h,            52
.set k_stride_w,            56
.set k_dilation_h,          60
.set k_dilation_w,          64
.set k_pad_h,               68
.set k_pad_w,               72
.set k_y,                   76
.set k_x,                   80
.set k_gemmk_groups,        84

; sgpr
.set s_ka,                  0
.set s_bx,                  2
.set s_p_in,                4
.set s_p_wei,               6
.set s_hi,                  8
.set s_wi,                  9
.set s_n,                   10
.set s_k,                   11
.set s_c,                   12
.set s_ho,                  13
.set s_wo,                  14
.set s_stride_h,            15
.set s_stride_w,            16
.set s_dilation_h,          17
.set s_dilation_w,          18
.set s_pad_h,               19
.set s_pad_w,               20
.set s_y,                   21
.set s_x,                   22
.set s_gemmkgroups,         23
.set s_p_out,               24
.set s_block_ie,            52
.set s_block_ik,            26
.set s_block_ib,            27
.set s_in_stride_c,         28
.set s_in_stride_n2,        29
.set s_in_stride_n1,        30
.set s_in_ic,               31
.set s_in_iy,               32
.set s_in_ix,               33
.set s_group_stride,        53
.set s_wei_stride_c,        34
.set s_wei_stride_k,        35
.set s_wei_ic,              s_in_ic     ; weight&input ic, iy, ix from EPerBlock is the same
.set s_wei_iy,              s_in_iy
.set s_wei_ix,              s_in_ix
.set s_out_stride_k0,       36
.set s_out_stride_k1,       37
.set s_out_stride_n1,       38
.set s_out_stride_n2,       39
.set s_kitr,                0
.set s_tmp,                 40
.set s_p_buf_in,            s_p_in      ; 4 sgpr used for MUBUF
.set s_p_buf_wei,           44
.set s_p_buf_out,           s_p_out
.set s_wei_slice,           1
.set s_sub_c,               54
.set s_group_left,          55

; vgpr
.set v_c,                   0
.set v_a,                   16
.set v_b,                   20
.set v_gld_a,               24
.set v_gld_b,               26
.set v_in_os,               28
.set v_wei_os,              29
.set v_sst_a_os,            30
.set v_sst_b_os,            31
.set v_sld_a_os,            32
.set v_sld_b_os,            33
.set v_out_os,              34
.set v_flag,                35
.set v_in_ic,               36
.set v_in_iy,               37
.set v_in_ix,               38
.set v_in_ihi,              39
.set v_in_iwi,              40
.set v_wei_ic,              41
.set v_wei_iy,              42
.set v_wei_ix,              43
.set v_in_in0,              15
.set v_in_iho,              14
.set v_in_iwo,              13
.set v_in_ie,               12
.set v_in_in1,              11
.set v_in_ib,               10
.set v_in_in2,              9
.set v_wei_ie,              8
.set v_wei_ik,              7
.set v_out_ik0,             6
.set v_out_ik1,             44
.set v_out_ib,              45
.set v_gemm_in,             46
.set v_gemm_im,             47
.set v_idc,                 48
.set v_idy,                 49
.set v_idx,                 50
.set v_tmp,                 51
.set v_end,                 58

.text
.globl igemm_v4r1_dynamic_wrw_32x32x4_4x4_2x2x4x2x4x2_4x2x8x1_4x16
.p2align 8
.type igemm_v4r1_dynamic_wrw_32x32x4_4x4_2x2x4x2x4x2_4x2x8x1_4x16,@function
igemm_v4r1_dynamic_wrw_32x32x4_4x4_2x2x4x2x4x2_4x2x8x1_4x16:
    s_load_dwordx4  s[s_p_in:s_p_in+3],         s[s_ka:s_ka+1],     0+k_p_in
    s_load_dwordx2  s[s_p_out:s_p_out+1],       s[s_ka:s_ka+1],     0+k_p_out
    s_load_dwordx8  s[s_hi:s_hi+7],             s[s_ka:s_ka+1],     0+k_hi
    s_load_dwordx4  s[s_stride_w:s_stride_w+3], s[s_ka:s_ka+1],     0+k_stride_w
    s_load_dwordx2  s[s_pad_w:s_pad_w+1],       s[s_ka:s_ka+1],     0+k_pad_w
    s_load_dwordx2  s[s_x:s_gemmkgroups],       s[s_ka:s_ka+1],     0+k_x

    ; in e_n1_b_n2 cluster_lengths:{4,2,8,1}, sub_lengths:{1,1,1,2}, order:{0,1,3,2}
    v_and_b32 v[v_in_ib], 7, v0 ; v_in_ib=tid%8
    v_lshrrev_b32 v[v_tmp], 3, v0 ; v_tmp=tid/8
    v_mov_b32 v[v_in_in2], 0 ; v_in_in2=0
    v_and_b32 v[v_in_in1], 1, v[v_tmp] ; v_in_in1=(tid/8)%2
    v_lshrrev_b32 v[v_tmp], 1, v[v_tmp] ; v_tmp=tid/8/2
    v_and_b32 v[v_in_ie], 3, v[v_tmp] ; v_tmp=(tid/8/2)%4
    ; wei e_k cluster_lengths:{4,16}, sub_lengths:{1,2}, order:{1,0}
    v_and_b32 v[v_wei_ie], 3, v0 ; v_wei_ie=tid%4
    v_lshrrev_b32 v[v_tmp], 2, v0 ; v_tmp=tid/4
    v_and_b32 v[v_wei_ik], 15, v[v_tmp] ; v_wei_ik=(tid/4)%16
    v_lshlrev_b32 v[v_wei_ik], 1, v[v_wei_ik] ; v_wei_ik=(tid/4)%16*2
    s_waitcnt lgkmcnt(0)

    ; calculate c per group
    s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmkgroups]

    ; calculate index
    s_mul_i32 s[s_out_stride_n2], s[s_ho], s[s_wo] ; out_stride_n2=ho*wo
    s_mul_i32 s[s_out_stride_k1], s[s_n], s[s_out_stride_n2] ; out_stride_k1=ho*wo*n
    s_lshl_b32 s[s_out_stride_k0], s[s_out_stride_k1], 4 ; out_stride_k0=ho*wo*n*16
    s_lshl_b32 s[s_out_stride_n1], s[s_out_stride_n2], 1 ; out_stride_n1=ho*wo*2
    s_mul_i32 s[s_in_stride_n2], s[s_hi], s[s_wi] ; in_stride_n2=hi*wi
    s_mul_i32 s[s_in_stride_c], s[s_n], s[s_in_stride_n2] ; in_stride_c=hi*wi*n
    s_mul_i32 s[s_wei_stride_k], s[s_y], s[s_x] ; wei_stride_k=y*x
    s_mul_i32 s[s_wei_stride_c], s[s_k], s[s_wei_stride_k] ; wei_stride_c=k*y*x
    s_mov_b64 s[s_p_buf_wei:s_p_buf_wei+1], s[s_p_wei:s_p_wei+1]
    s_mov_b32 s[s_p_buf_in+2], 0xffffffff
    s_mov_b32 s[s_p_buf_in+3], 0x27000
    s_mov_b32 s[s_p_buf_wei+2], 0xffffffff
    s_mov_b32 s[s_p_buf_wei+3], 0x27000

    ; block e,k,b index global
    s_lshr_b32 s[s_tmp], s[s_n], 2 ; tmp=n/4 maybe n0
    s_mul_i32 s[s_tmp+1], s[s_out_stride_n2], s[s_tmp]; tmp_1=ho*wo*n/4
    s_lshr_b32 s[0], s[s_tmp+1], 3 ; s_0=ho*wo*n/4/8
    s_lshr_b32 s[s_tmp], s[s_k], 5 ; s_tmp=k/32
    s_mul_i32 s[s_group_stride], s[s_tmp], s[0] ; s_group_stride=n*ho*wo/32*k/32

    ; block e index on global
    .v_u32_div_ss v_tmp+5, s_bx, s_group_stride, v_tmp, s_tmp ; v_tmp_5=block_idx/(n*ho*wo/32*k/32)
    v_readfirstlane_b32 s[s_block_ie], v[v_tmp+5] ; s_block_ie=block_idx/(n*ho*wo/32*k/32)
    
    ; block k, b index on global
    s_mul_i32 s[s_tmp+2], s[s_block_ie], s[s_group_stride] ; s_tmp_2=block_idx/(n*ho*wo/32*k/32)*(n*ho*wo/32*k/32)
    s_sub_i32 s[s_group_left], s[s_bx], s[s_tmp+2] ; s_tmp_1=block_idx-block_idx/(n*ho*wo/32*k/32)*(n*ho*wo/32*k/32)
    
    .v_u32_div_ss v_tmp+5, s_group_left, 0, v_tmp, s_tmp ; v_tmp_5=block_idx/(ho*wo*n/4/8)
    v_readfirstlane_b32 s[s_tmp], v[v_tmp+5] ; s_tmp=v_tmp_5
    s_mul_i32 s[s_tmp+2], s[s_tmp], s[0] ; s_tmp_2=block_idx/(ho*wo*n/4/8)*(ho*wo*n/4/8)
    s_sub_i32 s[s_tmp+1], s[s_group_left], s[s_tmp+2] ; s_tmp_1=block_idx-s_tmp_2
    s_lshl_b32 s[s_block_ik], s[s_tmp], 5 ; s_block_ik=block_idx/(ho*wo*n/4/8)*32
    s_lshl_b32 s[s_block_ib], s[s_tmp+1], 3 ; s_block_ib=s_tmp_1*8
    ;s_mul_i32 s[s_block_ie], s[s_block_ie], s[s_sub_c] ; s_block_ie*=s_sub_c

    ; calculate input transform
    ; e_n1_b_n2:b, transform: b -> n0*ho*wo
    v_add_u32 v[v_tmp+4], s[s_block_ib], v[v_in_ib] ; v_tmp_4=block_ib+v_in_ib(thread_ib)
    .v_u32_div_vs v_in_in0, v_tmp+4, s_out_stride_n2, v_tmp, s_tmp ; v_in_n0=(block_ib+v_in_ib)/(ho*wo)
    v_mul_lo_u32 v[v_tmp], s[s_out_stride_n2], v[v_in_in0] ; v_tmp=v_in_n0*(ho*wo)
    v_sub_u32 v[v_tmp+4], v[v_tmp+4], v[v_tmp] ; v_tmp_4=block_ib+v_in_ib-v_in_n0*(ho*wo)
    .v_u32_div_vs v_in_iho, v_tmp+4, s_wo, v_tmp, s_tmp ; v_in_iho=v_tmp_4/wo
    v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_in_iho] 
    v_sub_u32 v[v_in_iwo], v[v_tmp+4], v[v_tmp] ; v_in_iwo=v_tmp_4-v_in_iho*wo

    ; e_n1_b_n2:e
    ;   1) transform e -> c*y*x
    .v_u32_div_vs v_in_ic, v_in_ie, s_wei_stride_k, v_tmp, s_tmp ; v_in_ic=v_in_ie/(y*x)
    v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_in_ic] ; v_tmp=v_in_ic*(y*x)
    v_sub_u32 v[v_tmp+4], v[v_in_ie], v[v_tmp] ; v_tmp_4=v_in_ie-v_in_ic*(y*x)
    .v_u32_div_vs v_in_iy, v_tmp+4, s_x, v_tmp, s_tmp ; v_in_iy=v_tmp_4/x
    v_mul_lo_u32 v[v_tmp], s[s_x], v[v_in_iy] ; v_tmp=x*v_in_iy
    v_sub_u32 v[v_in_ix], v[v_tmp+4], v[v_tmp] ; v_in_ix=v_in_ie-v_in_ic*(y*x)-x*v_in_iy

    ;   2) transform iho, iwo, iy, ix -> hip, wip
    v_mul_lo_u32 v[v_tmp], s[s_stride_h], v[v_in_iho]
    v_mul_lo_u32 v[v_tmp+1], s[s_stride_w], v[v_in_iwo]
    v_mul_lo_u32 v[v_tmp+2], s[s_dilation_h], v[v_in_iy]
    v_mul_lo_u32 v[v_tmp+3], s[s_dilation_w], v[v_in_ix]

    ;   3) transform hip, wip -> hi, wi
    v_add_u32 v[v_tmp], v[v_tmp], v[v_tmp+2]
    v_add_u32 v[v_tmp+1], v[v_tmp+1], v[v_tmp+3]
    v_sub_i32 v[v_in_ihi], v[v_tmp], s[s_pad_h]
    v_sub_i32 v[v_in_iwi], v[v_tmp+1], s[s_pad_w]

    ; set input flag
    .v_in_set_flag v_flag, v_in_ihi, v_in_iwi, s_hi, s_wi, s_tmp

    ; in offset: from ihi, iwi, ic, in, calculate v_in_os
    v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi] ; v_tmp=v_in_ihi*s_wi
    v_add_u32 v[v_tmp], v[v_tmp], v[v_in_iwi] ; v_tmp=v_in_ihi*s_wi+v_in_iwi
    s_mul_i32 s[s_tmp], s[s_block_ie], s[s_sub_c] ; s_tmp=s_block_ie*s_sub_c
    v_add_i32 v[v_tmp+2], s[s_tmp], v[v_in_ic] ; v_tmp_2=s_block_ie*s_sub_c+v_in_ic
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_c], v[v_tmp+2] ; v_tmp_1=v_in_ic*n*hi*wi
    v_add_u32 v[v_tmp], v[v_tmp], v[v_tmp+1] ; v_tmp=v_in_ic*n*hi*wi+v_in_ihi*s_wi+v_in_iwi
    v_lshl_add_u32 v[v_tmp+1], v[v_in_in0], 2, v[v_in_in2] ; v_tmp_1=v_in_in0*8+v_in_in2
    v_lshl_add_u32 v[v_tmp+1], v[v_in_in1], 1, v[v_tmp+1] ; v_tmp_1=v_in_in0*8+v_in_in2+v_in_in1*4
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n2], v[v_tmp+1] ; v_tmp_1=(v_in_in0*8+v_in_in2+v_in_in1*4)*hi*wi
    v_add_lshl_u32 v[v_in_os], v[v_tmp], v[v_tmp+1], 2 ; v_in_os=((v_in_in0*8+v_in_in2+v_in_in1*4)*hi*wi
                                                       ;           +v_in_ic*n*hi*wi+v_in_ihi*s_wi+v_in_iwi)*sizeof(float)

    s_lshl_b32 s[s_in_stride_n2], s[s_in_stride_n2], 2
    s_lshl_b32 s[s_in_stride_n1], s[s_in_stride_n2], 1
    ; load input from global
    .v_in_load_e_n1_b_n2_1_1_1_2 v_gld_b, s_p_buf_in, v_in_os, s_in_stride_n1, s_in_stride_n2, v_flag, s_tmp

    ; calculate SliceWindow e=c*y*x. this is same for both input/weight
    s_mov_b32 s[s_wei_slice], 4
    .v_u32_div_ss v_tmp+4, s_wei_slice, s_wei_stride_k, v_tmp, s_tmp
    v_readfirstlane_b32 s[s_in_ic], v[v_tmp+4]
    s_mul_i32 s[s_tmp], s[s_wei_stride_k], s[s_in_ic]
    s_sub_i32 s[s_wei_slice], s[s_wei_slice], s[s_tmp]
    .v_u32_div_ss v_tmp+4, s_wei_slice, s_x, v_tmp, s_tmp
    v_readfirstlane_b32 s[s_in_iy], v[v_tmp+4]
    s_mul_i32 s[s_tmp], s[s_x], s[s_in_iy]
    s_sub_i32 s[s_in_ix], s[s_wei_slice], s[s_tmp]

    ; c thread mapping
    v_and_b32 v[v_tmp+4], 7, v0
    v_and_b32 v[v_tmp], 3, v[v_tmp+4]
    v_lshrrev_b32 v[v_tmp+1], 2, v[v_tmp+4]

    v_lshrrev_b32 v[v_tmp+4], 3, v0
    v_and_b32 v[v_tmp+2], 1, v[v_tmp+4]
    v_lshrrev_b32 v[v_tmp+3], 1, v[v_tmp+4]

    v_lshl_or_b32 v[v_gemm_in], v[v_tmp+2], 2, v[v_tmp]               ; in
    v_lshl_or_b32 v[v_gemm_im], v[v_tmp+3], 1, v[v_tmp+1]             ; im
    v_lshlrev_b32 v[v_sld_b_os], 3, v[v_gemm_in]
    v_lshlrev_b32 v[v_sld_a_os], 3, v[v_gemm_im]
    v_add_u32 v[v_sld_a_os], 512, v[v_sld_a_os]

    ; calculate weight transform
    ; e_k: e->c*y*x
    .v_u32_div_vs v_wei_ic, v_wei_ie, s_wei_stride_k, v_tmp, s_tmp
    v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_wei_ic]
    v_sub_u32 v[v_tmp+4], v[v_wei_ie], v[v_tmp]
    .v_u32_div_vs v_wei_iy, v_tmp+4, s_x, v_tmp, s_tmp
    v_mul_lo_u32 v[v_tmp], s[s_x], v[v_wei_iy]
    v_sub_u32 v[v_wei_ix], v[v_tmp+4], v[v_tmp]

    ; wei offset: from ic, iy, ix, ik, calculate v_wei_os
    s_mul_i32 s[s_tmp], s[s_block_ie], s[s_sub_c] ; s_tmp=s_block_ie*s_sub_c
    v_add_i32 v[v_tmp+3], s[s_tmp], v[v_wei_ic] ; v_tmp_3=s_block_ie*s_sub_c+v_wei_ic
    v_mul_lo_u32 v[v_tmp], s[s_wei_stride_c], v[v_tmp+3]
    v_mul_lo_u32 v[v_tmp+1], s[s_x], v[v_wei_iy]
    v_add3_u32 v[v_wei_os], v[v_tmp], v[v_tmp+1], v[v_wei_ix]

    v_add_u32 v[v_tmp], s[s_block_ik], v[v_wei_ik]
    v_mul_lo_u32 v[v_tmp+1], s[s_wei_stride_k], v[v_tmp]
    v_add_lshl_u32 v[v_wei_os], v[v_wei_os], v[v_tmp+1], 2

    s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2

    ; load wei from global
    .v_wei_load_e_k_1_2_ev1 v_gld_a, s_p_buf_wei, v_wei_os, s_wei_stride_k, s_tmp

    ; calculate out index ik0, ik1, ib
    v_lshlrev_b32 v[v_tmp+1], 1, v[v_gemm_im]
    v_add_u32 v[v_tmp], s[s_block_ik], v[v_tmp+1]
    v_lshrrev_b32 v[v_out_ik0], 4, v[v_tmp]
    v_and_b32 v[v_out_ik1], 15, v[v_tmp]

    v_add_u32 v[v_out_ib], s[s_block_ib], v[v_gemm_in]
    .v_u32_div_vs v_tmp+4, v_out_ib, s_out_stride_n2, v_tmp, s_tmp
    v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n2], v[v_tmp+4]
    v_sub_u32 v[v_tmp+5], v[v_out_ib], v[v_tmp+1]
    .v_u32_div_vs v_tmp+6, v_tmp+5, s_wo, v_tmp, s_tmp
    v_mul_lo_u32 v[v_tmp+1], s[s_wo], v[v_tmp+6]
    v_sub_u32 v[v_tmp+5], v[v_tmp+5], v[v_tmp+1]
    ; v_tmp+4:n0, v_tmp+6:ho, v_tmp+5:wo

    v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_tmp+6]
    v_add_u32 v[v_out_os], v[v_tmp], v[v_tmp+5]
    s_lshl_b32 s[s_tmp+1], s[s_out_stride_n2], 2
    v_mul_lo_u32 v[v_tmp], s[s_tmp+1], v[v_tmp+4]
    v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp]

    s_lshl_b32 s[s_out_stride_k0], s[s_out_stride_k0], 2
    v_lshl_or_b32 v[v_tmp], v[v_out_ik0], 4, v[v_out_ik1]
    s_lshl_b32 s[s_out_stride_n1], s[s_out_stride_n1], 2
    v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_k1], v[v_tmp]
    s_lshl_b32 s[s_out_stride_n2], s[s_out_stride_n2], 2
    v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp+1] ; v_out_os = (ik0*16+ik1)*n*ho*wo+in0*ho*wo*2+iho*wo+iwo
    s_mul_i32 s[s_tmp], s[s_out_stride_k1], s[s_k] ; s_tmp=k*n*wo*ho
    s_mul_i32 s[s_tmp], s[s_tmp], s[s_block_ie] ; s_tmp=s_block_ie*k*n*wo*ho
    v_add_u32 v[v_out_os], v[v_out_os], s[s_tmp] ; v_out_os=v_out_os+s_block_ie*k*n*wo*ho
    s_lshl_b32 s[s_out_stride_k1], s[s_out_stride_k1], 2
    v_lshlrev_b32 v[v_out_os], 2, v[v_out_os]

    ; in lds offset block e_n1_b_n2
    v_lshlrev_b32 v[v_tmp], 5, v[v_in_ie]
    v_lshl_or_b32 v[v_tmp], v[v_in_in1], 4, v[v_tmp]
    v_lshl_or_b32 v[v_tmp], v[v_in_ib], 1, v[v_tmp]
    v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp]

    ; wei lds offset block e_k
    v_lshl_or_b32 v[v_tmp], v[v_wei_ie], 5, v[v_wei_ik]
    v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp]
    v_add_u32 v[v_sst_a_os], 512, v[v_sst_a_os]

    s_mov_b32 s[s_p_buf_out+2], 0xffffffff
    s_mov_b32 s[s_p_buf_out+3], 0x27000
    .v_clear_nc v_c, 16

    ; start FMA loop, 4x4 thread tile with 2x2 sub-tile
    s_waitcnt vmcnt(2)
    .v_in_sst_e_n1_b_n2_1_1_1_2_n1s64_n2v2 v_gld_b, v_sst_b_os

    s_waitcnt vmcnt(0)
    .v_wei_sst_e_k_1_2_es128_kv2 v_gld_a, v_sst_a_os

    ; E = C * Y * X
    s_mul_i32 s[s_tmp], s[s_sub_c], s[s_wei_stride_k] ; s_tmp=sub_c*y*x
    s_lshr_b32 s[s_tmp], s[s_tmp], 2 ; cause wei stride k has shl 2
    s_sub_i32 s[s_kitr], s[s_tmp], 4
    s_cmp_gt_i32 s[s_kitr], 0
    s_cbranch_scc0 L_igemm_v4r1_dynamic_wrw_32x32x4_4x4_2x2x4x2x4x2_4x2x8x1_4x16_end

    .v_in_move_slice_window v_in_os, v_in_ic, v_in_iy, v_in_ix, v_in_ihi, v_in_iwi, v_flag, s_hi, s_wi, s_y, s_x, s_in_stride_c, s_dilation_h, s_dilation_w, s_in_ic, s_in_iy, s_in_ix, v_idc, v_idy, v_idx, s_tmp
    .v_wei_move_slice_window v_wei_os, v_wei_ic, v_wei_iy, v_wei_ix, s_y, s_x, s_wei_stride_c, s_wei_ic, s_wei_iy, s_wei_ix, v_idc, v_idy, v_idx, s_tmp
    v_xor_b32 v[v_sst_b_os], 0x400, v[v_sst_b_os] ; switch double buffer b store
    v_xor_b32 v[v_sst_a_os], 0x400, v[v_sst_a_os] ; switch double buffer a store
    s_waitcnt lgkmcnt(0)
    s_barrier

    .v_in_load_e_n1_b_n2_1_1_1_2 v_gld_b, s_p_buf_in, v_in_os, s_in_stride_n1, s_in_stride_n2, v_flag, s_tmp
    .v_wei_load_e_k_1_2_ev1 v_gld_a, s_p_buf_wei, v_wei_os, s_wei_stride_k, s_tmp

L_igemm_v4r1_dynamic_wrw_32x32x4_4x4_2x2x4x2x4x2_4x2x8x1_4x16_fma_body:
    ; do fma accumulate with unroll 4
    ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] 
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] 
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:64
    ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:64
    .itr_k = 0
    .rept 3
        s_waitcnt lgkmcnt(2)
        .v_fma_2x2_s4 v_c,v_a,v_b

        s_waitcnt lgkmcnt(1)
        .v_fma_2x2_s4 v_c+2,v_a,v_b+2

        ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:0+(.itr_k+1)*128
        s_waitcnt lgkmcnt(1)
        .v_fma_2x2_s4 v_c+8,v_a+2,v_b

        ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+(.itr_k+1)*128
        .v_fma_2x2_s4 v_c+10,v_a+2,v_b+2

        ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+(.itr_k+1)*128+64
        ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:0+(.itr_k+1)*128+64
        .itr_k = .itr_k + 1
    .endr

    ; last unroll
    v_xor_b32 v[v_sld_b_os], 0x400, v[v_sld_b_os] ; switch double buffer b load
    v_xor_b32 v[v_sld_a_os], 0x400, v[v_sld_a_os] ; switch double buffer a load
    s_waitcnt lgkmcnt(2)
    .v_fma_2x2_s4 v_c,v_a,v_b

    s_waitcnt lgkmcnt(1)
    .v_fma_2x2_s4 v_c+2,v_a,v_b+2

    s_waitcnt vmcnt(2)
    .v_in_sst_e_n1_b_n2_1_1_1_2_n1s64_n2v2 v_gld_b, v_sst_b_os
    s_waitcnt vmcnt(0)
    .v_wei_sst_e_k_1_2_es128_kv2 v_gld_a, v_sst_a_os
    s_sub_i32 s[s_kitr], s[s_kitr], 4
    s_cmp_gt_i32 s[s_kitr], 0
    s_cbranch_scc0 L_igemm_v4r1_dynamic_wrw_32x32x4_4x4_2x2x4x2x4x2_4x2x8x1_4x16_fma_finishing
    .v_in_move_slice_window v_in_os, v_in_ic, v_in_iy, v_in_ix, v_in_ihi, v_in_iwi, v_flag, s_hi, s_wi, s_y, s_x, s_in_stride_c, s_dilation_h, s_dilation_w, s_in_ic, s_in_iy, s_in_ix, v_idc, v_idy, v_idx, s_tmp
    .v_wei_move_slice_window v_wei_os, v_wei_ic, v_wei_iy, v_wei_ix, s_y, s_x, s_wei_stride_c, s_wei_ic, s_wei_iy, s_wei_ix, v_idc, v_idy, v_idx, s_tmp
    s_waitcnt lgkmcnt(2)
    .v_fma_2x2_s4 v_c+8,v_a+2,v_b

    v_xor_b32 v[v_sst_b_os], 0x400, v[v_sst_b_os] ; switch double buffer b store
    v_xor_b32 v[v_sst_a_os], 0x400, v[v_sst_a_os] ; switch double buffer a store
    s_waitcnt lgkmcnt(0)
    s_barrier
    .v_in_load_e_n1_b_n2_1_1_1_2 v_gld_b, s_p_buf_in, v_in_os, s_in_stride_n1, s_in_stride_n2, v_flag, s_tmp
    .v_wei_load_e_k_1_2_ev1 v_gld_a, s_p_buf_wei, v_wei_os, s_wei_stride_k, s_tmp
    .v_fma_2x2_s4 v_c+10,v_a+2,v_b+2

    s_branch L_igemm_v4r1_dynamic_wrw_32x32x4_4x4_2x2x4x2x4x2_4x2x8x1_4x16_fma_body
L_igemm_v4r1_dynamic_wrw_32x32x4_4x4_2x2x4x2x4x2_4x2x8x1_4x16_fma_finishing:
    s_waitcnt lgkmcnt(2)
    .v_fma_2x2_s4 v_c+8,v_a+2,v_b
    .v_fma_2x2_s4 v_c+10,v_a+2,v_b+2
L_igemm_v4r1_dynamic_wrw_32x32x4_4x4_2x2x4x2x4x2_4x2x8x1_4x16_end:
    s_waitcnt lgkmcnt(0)
    s_barrier
    ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] 
    ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] 
    ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:64
    ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:64
    .itr_k = 0
    .rept 3
        s_waitcnt lgkmcnt(2)
        .v_fma_2x2_s4 v_c,v_a,v_b

        s_waitcnt lgkmcnt(1)
        .v_fma_2x2_s4 v_c+2,v_a,v_b+2

        ds_read_b64 v[v_a:v_a+1], v[v_sld_a_os] offset:0+(.itr_k+1)*128
        s_waitcnt lgkmcnt(1)
        .v_fma_2x2_s4 v_c+8,v_a+2,v_b

        ds_read_b64 v[v_b:v_b+1], v[v_sld_b_os] offset:0+(.itr_k+1)*128
        .v_fma_2x2_s4 v_c+10,v_a+2,v_b+2

        ds_read_b64 v[v_b+2:v_b+2+1], v[v_sld_b_os] offset:0+(.itr_k+1)*128+64
        ds_read_b64 v[v_a+2:v_a+2+1], v[v_sld_a_os] offset:0+(.itr_k+1)*128+64
        .itr_k = .itr_k + 1
    .endr

    ; last unroll
    s_waitcnt lgkmcnt(2)
    .v_fma_2x2_s4 v_c,v_a,v_b

    s_waitcnt lgkmcnt(1)
    .v_fma_2x2_s4 v_c+2,v_a,v_b+2

    s_waitcnt lgkmcnt(0)
    .v_fma_2x2_s4 v_c+8,v_a+2,v_b

    .v_fma_2x2_s4 v_c+10,v_a+2,v_b+2

    s_mov_b32 s[s_tmp], 0
    s_mov_b32 s[s_tmp+1], 0
    s_mov_b32 s[s_tmp+2], 0
    s_mov_b32 s[s_tmp+3], 0
    .v_out_write_k0_k1_n1_b_n2_2_2_2_1_2 v_c, s_p_buf_out, v_out_os, s_out_stride_k0, s_out_stride_k1, s_out_stride_n1, s_out_stride_n2, s_tmp
    s_endpgm
.rodata
.p2align 6
.amdhsa_kernel igemm_v4r1_dynamic_wrw_32x32x4_4x4_2x2x4x2x4x2_4x2x8x1_4x16
    .amdhsa_group_segment_fixed_size 2048
    .amdhsa_user_sgpr_kernarg_segment_ptr 1
    .amdhsa_system_sgpr_workgroup_id_x 1
    .amdhsa_system_vgpr_workitem_id 0
    .amdhsa_next_free_vgpr 60
    .amdhsa_next_free_sgpr 58
    .amdhsa_ieee_mode 0
    .amdhsa_dx10_clamp 0
    //xnack disabled by default for asm kernels
    .amdhsa_reserve_xnack_mask 0
.end_amdhsa_kernel

;----------------------------------------------------------
; starting of kernel  igemm_v4r1_dynamic_wrw_128x128x16_8x8_4x4x4x4x4x4_16x1x16x1_4x64
; b_per_block                      : 16
; k_per_block                      : 128
; e_per_block                      : 16
; gemm_n_repeat                    : 2
; gemm_m_per_thread_subc           : 4
; gemm_m_level0_cluster            : 4
; gemm_m_level1_cluster            : 4
; gemm_n_per_thread_subc           : 4
; gemm_n_level0_cluster            : 4
; gemm_n_level1_cluster            : 4
; in_block_copy_cluster_lengths_e  : 16
; in_block_copy_cluster_lengths_n1 : 1
; in_block_copy_cluster_lengths_b  : 16
; in_block_copy_cluster_lengths_n2 : 1
; wei_block_copy_cluster_lengths_e : 4
; wei_block_copy_cluster_lengths_k : 64
; 
; in_block_copy_sub_lengths_e      : 1
; in_block_copy_sub_lengths_n1     : 2
; in_block_copy_sub_lengths_b      : 1
; in_block_copy_sub_lengths_n2     : 4
; wei_block_copy_sub_lengths_e     : 4
; wei_block_copy_sub_lengths_k     : 2
; block_size                       : 256
; thread_tile                      : 8x8
; 
; kernarg offset
.set k_p_in,                0
.set k_p_wei,               8
.set k_p_out,               16
.set k_hi,                  24
.set k_wi,                  28
.set k_n,                   32
.set k_k,                   36
.set k_c,                   40
.set k_ho,                  44
.set k_wo,                  48
.set k_stride_h,            52
.set k_stride_w,            56
.set k_dilation_h,          60
.set k_dilation_w,          64
.set k_pad_h,               68
.set k_pad_w,               72
.set k_y,                   76
.set k_x,                   80
.set k_gemmk_groups,        84

; sgpr
.set s_ka,                  0
.set s_bx,                  2
.set s_p_in,                4
.set s_p_wei,               6
.set s_hi,                  8
.set s_wi,                  9
.set s_n,                   10
.set s_k,                   11
.set s_c,                   12
.set s_ho,                  13
.set s_wo,                  14
.set s_stride_h,            15
.set s_stride_w,            16
.set s_dilation_h,          17
.set s_dilation_w,          18
.set s_pad_h,               19
.set s_pad_w,               20
.set s_y,                   21
.set s_x,                   22
.set s_gemmkgroups,         23
.set s_p_out,               24
.set s_block_ie,            52
.set s_block_ik,            26
.set s_block_ib,            27
.set s_in_stride_c,         28
.set s_in_stride_n2,        29
.set s_in_stride_n1,        30
.set s_in_ic,               31
.set s_in_iy,               32
.set s_in_ix,               33
.set s_group_stride,        53
.set s_wei_stride_c,        35
.set s_wei_stride_k,        36
.set s_out_stride_k0,       37
.set s_out_stride_k1,       38
.set s_out_stride_n1,       39
.set s_out_stride_n2,       40
.set s_kitr,                0
.set s_tmp,                 44
.set s_p_buf_in,            s_p_in      ; 4 sgpr used for MUBUF
.set s_p_buf_wei,           48
.set s_p_buf_out,           s_p_out
.set s_wei_slice,           1
.set s_sub_c,               54
.set s_group_left,          55

; vgpr
.set v_c,                   0
.set v_a,                   64
.set v_b,                   72
.set v_gld_a,               80
.set v_gld_b,               88
.set v_in_os,               96
.set v_wei_os,              97
.set v_sst_a_os,            98
.set v_sst_b_os,            99
.set v_sld_a_os,            100
.set v_sld_b_os,            101
.set v_out_os,              102
.set v_flag,                103
.set v_in_ic,               104
.set v_in_iy,               105
.set v_in_ix,               106
.set v_in_ihi,              107
.set v_in_iwi,              108
.set v_in_in0,              63
.set v_in_iho,              62
.set v_in_iwo,              61
.set v_in_ie,               60
.set v_in_in1,              59
.set v_in_ib,               58
.set v_in_in2,              57
.set v_wei_ie,              56
.set v_wei_ik,              55
.set v_out_ik0,             54
.set v_out_ik1,             53
.set v_out_ib,              52
.set v_gemm_in,             51
.set v_gemm_im,             50
.set v_idc,                 109
.set v_idy,                 110
.set v_idx,                 111
.set v_wei_idc,             112
.set v_wei_idyx,            113
.set v_tmp,                 44
.set v_end,                 114

.text
.globl  igemm_v4r1_dynamic_wrw_128x128x16_8x8_4x4x4x4x4x4_16x1x16x1_4x64
.p2align 8
.type  igemm_v4r1_dynamic_wrw_128x128x16_8x8_4x4x4x4x4x4_16x1x16x1_4x64,@function
 igemm_v4r1_dynamic_wrw_128x128x16_8x8_4x4x4x4x4x4_16x1x16x1_4x64:
    s_load_dwordx4  s[s_p_in:s_p_in+3],         s[s_ka:s_ka+1],     0+k_p_in
    s_load_dwordx2  s[s_p_out:s_p_out+1],       s[s_ka:s_ka+1],     0+k_p_out
    s_load_dwordx8  s[s_hi:s_hi+7],             s[s_ka:s_ka+1],     0+k_hi
    s_load_dwordx4  s[s_stride_w:s_stride_w+3], s[s_ka:s_ka+1],     0+k_stride_w
    s_load_dwordx2  s[s_pad_w:s_pad_w+1],       s[s_ka:s_ka+1],     0+k_pad_w
    s_load_dwordx2  s[s_x:s_gemmkgroups],       s[s_ka:s_ka+1],     0+k_x

    ; in e_n1_b_n2 cluster_lengths:{16,1,16,1}, sub_lengths:{1,2,1,4}, order:{0,1,3,2}
    v_and_b32 v[v_in_ib], 15, v0 ; in_ib=tid%16
    v_lshrrev_b32 v[v_tmp], 4, v0 ; tmp=tid/16
    v_mov_b32 v[v_in_in2], 0 ; in_in2=0
    v_mov_b32 v[v_in_in1], 0 ; in_in1=0
    v_and_b32 v[v_in_ie], 15, v[v_tmp] ; in_ie=(tid/16)%16
    ; wei e_k cluster_lengths:{4,64}, sub_lengths:{4,2}, order:{1,0}
    v_and_b32 v[v_wei_ie], 3, v0 ; wei_ie=tid%4
    v_lshrrev_b32 v[v_tmp], 2, v0 ; tmp=tid/4
    v_lshlrev_b32 v[v_wei_ie], 2, v[v_wei_ie] ; wei_ie=(tid%4)*4
    v_and_b32 v[v_wei_ik], 63, v[v_tmp] ; wei_ik=(tid/4)%64
    v_lshlrev_b32 v[v_wei_ik], 1, v[v_wei_ik] ; wei_ik=((tid/4)%64)*2
    s_waitcnt lgkmcnt(0)

    ; calculate c per group
    s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmkgroups]

    ; calculate index
    s_mul_i32 s[s_out_stride_n2], s[s_ho], s[s_wo] ; out_stride_n2=ho*wo
    s_mul_i32 s[s_out_stride_k1], s[s_n], s[s_out_stride_n2] ; out_stride_k1=ho*wo*n
    s_lshl_b32 s[s_out_stride_k0], s[s_out_stride_k1], 6 ; out_stride_k0=ho*wo*n*64
    s_lshl_b32 s[s_out_stride_n1], s[s_out_stride_n2], 2 ; out_stride_n1=ho*wo*4
    s_mul_i32 s[s_in_stride_n2], s[s_hi], s[s_wi] ; in_stride_n2=hi*wi
    s_mul_i32 s[s_in_stride_c], s[s_n], s[s_in_stride_n2] ; in_stride_c=hi*wi*n
    s_mul_i32 s[s_wei_stride_k], s[s_y], s[s_x] ; wei_stride_k=y*x
    s_mul_i32 s[s_wei_stride_c], s[s_k], s[s_wei_stride_k] ; wei_stride_c=k*y*x
    s_mov_b64 s[s_p_buf_wei:s_p_buf_wei+1], s[s_p_wei:s_p_wei+1]
    s_mov_b32 s[s_p_buf_in+2], 0xffffffff
    s_mov_b32 s[s_p_buf_in+3], 0x27000
    s_mov_b32 s[s_p_buf_wei+2], 0xffffffff
    s_mov_b32 s[s_p_buf_wei+3], 0x27000

    ; block e,k,b index global
    s_lshr_b32 s[s_tmp], s[s_n], 3 ; tmp=n/8 maybe n0
    s_mul_i32 s[s_tmp+1], s[s_out_stride_n2], s[s_tmp]; tmp_1=ho*wo*n/8
    s_lshr_b32 s[0], s[s_tmp+1], 4 ; s_0=ho*wo*n/8/16
    s_lshr_b32 s[s_tmp], s[s_k], 7 ; s_tmp=k/128
    s_mul_i32 s[s_group_stride], s[s_tmp], s[0] ; s_group_stride=n*ho*wo/128*k/128

    ; block e index on global
    .v_u32_div_ss v_tmp+5, s_bx, s_group_stride, v_tmp, s_tmp ; v_tmp_5=block_idx/(n*ho*wo/128*k/128)
    v_readfirstlane_b32 s[s_block_ie], v[v_tmp+5] ; s_block_ie=block_idx/(n*ho*wo/128*k/128)
    
    ; block k, b index on global
    s_mul_i32 s[s_tmp+2], s[s_block_ie], s[s_group_stride] ; s_tmp_2=block_idx/(n*ho*wo/128*k/128)*(n*ho*wo/128*k/128)
    s_sub_i32 s[s_group_left], s[s_bx], s[s_tmp+2] ; s_tmp_1=block_idx-block_idx/(n*ho*wo/128*k/128)*(n*ho*wo/128*k/128)
    
    .v_u32_div_ss v_tmp+5, s_group_left, 0, v_tmp, s_tmp ; v_tmp_5=block_idx/(ho*wo*n/8/16)
    v_readfirstlane_b32 s[s_tmp], v[v_tmp+5] ; s_tmp=v_tmp_5
    s_mul_i32 s[s_tmp+2], s[s_tmp], s[0] ; s_tmp_2=block_idx/(ho*wo*n/8/16)*(ho*wo*n/8/16)
    s_sub_i32 s[s_tmp+1], s[s_group_left], s[s_tmp+2] ; s_tmp_1=block_idx-s_tmp_2
    s_lshl_b32 s[s_block_ik], s[s_tmp], 7 ; s_block_ik=block_idx/(ho*wo*n/8/16)*128
    s_lshl_b32 s[s_block_ib], s[s_tmp+1], 4 ; s_block_ib=s_tmp_1*16
    ;s_mul_i32 s[s_block_ie], s[s_block_ie], s[s_sub_c] ; s_block_ie*=s_sub_c

    ; calculate input transform
    ; e_n1_b_n2:b, transform: b -> n0*ho*wo
    v_add_u32 v[v_tmp+4], s[s_block_ib], v[v_in_ib] ; v_tmp_4=block_ib+v_in_ib(thread_ib)
    .v_u32_div_vs v_in_in0, v_tmp+4, s_out_stride_n2, v_tmp, s_tmp ; v_in_n0=(block_ib+v_in_ib)/(ho*wo)
    v_mul_lo_u32 v[v_tmp], s[s_out_stride_n2], v[v_in_in0] ; v_tmp=v_in_n0*(ho*wo)
    v_sub_u32 v[v_tmp+4], v[v_tmp+4], v[v_tmp] ; v_tmp_4=block_ib+v_in_ib-v_in_n0*(ho*wo)
    .v_u32_div_vs v_in_iho, v_tmp+4, s_wo, v_tmp, s_tmp ; v_in_iho=v_tmp_4/wo
    v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_in_iho] 
    v_sub_u32 v[v_in_iwo], v[v_tmp+4], v[v_tmp] ; v_in_iwo=v_tmp_4-v_in_iho*wo

    ; e_n1_b_n2:e
    ;   1) transform e -> c*y*x
    .v_u32_div_vs v_in_ic, v_in_ie, s_wei_stride_k, v_tmp, s_tmp ; v_in_ic=v_in_ie/(y*x)
    v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_in_ic] ; v_tmp=v_in_ic*(y*x)
    v_sub_u32 v[v_tmp+4], v[v_in_ie], v[v_tmp] ; v_tmp_4=v_in_ie-v_in_ic*(y*x)
    .v_u32_div_vs v_in_iy, v_tmp+4, s_x, v_tmp, s_tmp ; v_in_iy=v_tmp_4/x
    v_mul_lo_u32 v[v_tmp], s[s_x], v[v_in_iy] ; v_tmp=x*v_in_iy
    v_sub_u32 v[v_in_ix], v[v_tmp+4], v[v_tmp] ; v_in_ix=v_in_ie-v_in_ic*(y*x)-x*v_in_iy

    ;   2) transform iho, iwo, iy, ix -> hip, wip
    v_mul_lo_u32 v[v_tmp], s[s_stride_h], v[v_in_iho]
    v_mul_lo_u32 v[v_tmp+1], s[s_stride_w], v[v_in_iwo]
    v_mul_lo_u32 v[v_tmp+2], s[s_dilation_h], v[v_in_iy]
    v_mul_lo_u32 v[v_tmp+3], s[s_dilation_w], v[v_in_ix]

    ;   3) transform hip, wip -> hi, wi
    v_add_u32 v[v_tmp], v[v_tmp], v[v_tmp+2]
    v_add_u32 v[v_tmp+1], v[v_tmp+1], v[v_tmp+3]
    v_sub_i32 v[v_in_ihi], v[v_tmp], s[s_pad_h] ; v_in_ihi=v_in_iho*s_stride_h+v_in_iy*s_dilation_h+s_pad_h
    v_sub_i32 v[v_in_iwi], v[v_tmp+1], s[s_pad_w] ; v_in_iwi=v_in_iwo*s_stride_w+v_in_ix*s_dilation_w+s_pad_w

    ; set input flag
    .v_in_set_flag v_flag, v_in_ihi, v_in_iwi, s_hi, s_wi, s_tmp

    ; in offset: from ihi, iwi, ic, in, calculate v_in_os
    v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi] ; v_tmp=v_in_ihi*s_wi
    v_add_u32 v[v_tmp], v[v_tmp], v[v_in_iwi] ; v_tmp=v_in_ihi*s_wi+v_in_iwi
    s_mul_i32 s[s_tmp], s[s_block_ie], s[s_sub_c] ; s_tmp=s_block_ie*s_sub_c
    v_add_i32 v[v_tmp+2], s[s_tmp], v[v_in_ic] ; v_tmp_2=s_block_ie*s_sub_c+v_in_ic
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_c], v[v_tmp+2] ; v_tmp_1=v_in_ic*n*hi*wi
    v_add_u32 v[v_tmp], v[v_tmp], v[v_tmp+1] ; v_tmp=v_in_ic*n*hi*wi+v_in_ihi*s_wi+v_in_iwi
    v_lshl_add_u32 v[v_tmp+1], v[v_in_in0], 3, v[v_in_in2] ; v_tmp_1=v_in_in0*8+v_in_in2
    v_lshl_add_u32 v[v_tmp+1], v[v_in_in1], 2, v[v_tmp+1] ; v_tmp_1=v_in_in0*8+v_in_in2+v_in_in1*4
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n2], v[v_tmp+1] ; v_tmp_1=(v_in_in0*8+v_in_in2+v_in_in1*4)*hi*wi
    v_add_lshl_u32 v[v_in_os], v[v_tmp], v[v_tmp+1], 2 ; v_in_os=((v_in_in0*8+v_in_in2+v_in_in1*4)*hi*wi
                                                       ;           +v_in_ic*n*hi*wi+v_in_ihi*s_wi+v_in_iwi)*sizeof(float)

    s_lshl_b32 s[s_in_stride_n2], s[s_in_stride_n2], 2
    s_lshl_b32 s[s_in_stride_n1], s[s_in_stride_n2], 2
    ; load input from global
    .v_in_load_e_n1_b_n2_1_2_1_4 v_gld_b, s_p_buf_in, v_in_os, s_in_stride_n1, s_in_stride_n2, v_flag, s_tmp

    ; calculate SliceWindow e=c*y*x. this is same for both input/weight
    s_mov_b32 s[s_wei_slice], 16
    .v_u32_div_ss v_tmp+4, s_wei_slice, s_wei_stride_k, v_tmp, s_tmp
    v_readfirstlane_b32 s[s_in_ic], v[v_tmp+4]
    s_mul_i32 s[s_tmp], s[s_wei_stride_k], s[s_in_ic]
    s_sub_i32 s[s_wei_slice], s[s_wei_slice], s[s_tmp]
    .v_u32_div_ss v_tmp+4, s_wei_slice, s_x, v_tmp, s_tmp
    v_readfirstlane_b32 s[s_in_iy], v[v_tmp+4]
    s_mul_i32 s[s_tmp], s[s_x], s[s_in_iy]
    s_sub_i32 s[s_in_ix], s[s_wei_slice], s[s_tmp]

    ; c thread mapping
    v_and_b32 v[v_tmp+4], 15, v0 ; tid%16
    v_and_b32 v[v_tmp], 3, v[v_tmp+4] ; (tid%16)%4
    v_lshrrev_b32 v[v_tmp+1], 2, v[v_tmp+4] ; (tid%16)/4

    v_lshrrev_b32 v[v_tmp+4], 4, v0 ; tid/16
    v_and_b32 v[v_tmp+2], 3, v[v_tmp+4] ; (tid/16)%4
    v_lshrrev_b32 v[v_tmp+3], 2, v[v_tmp+4] ; (tid/16)/4

    v_lshl_or_b32 v[v_gemm_in], v[v_tmp+2], 2, v[v_tmp]               ; in
    v_lshl_or_b32 v[v_gemm_im], v[v_tmp+3], 2, v[v_tmp+1]             ; im
    v_lshlrev_b32 v[v_sld_b_os], 4, v[v_gemm_in]
    v_lshlrev_b32 v[v_sld_a_os], 4, v[v_gemm_im]
    v_add_u32 v[v_sld_a_os], 8192, v[v_sld_a_os]

    ; calculate weight transform
    v_add_u32 v[v_tmp], s[s_block_ik], v[v_wei_ik] ; v_tmp=block_ik+v_wei_ik
    v_mul_lo_u32 v[v_tmp+1], s[s_wei_stride_k], v[v_tmp] ; v_tmp_1=(block_ik+v_wei_ik)*y*x
    .v_u32_div_vs v_tmp, v_wei_ie, s_wei_stride_k, v_tmp+2, s_tmp
    s_mul_i32 s[s_tmp], s[s_block_ie], s[s_sub_c] ; s_tmp=s_block_ie*s_sub_c
    v_add_i32 v[v_tmp+3], s[s_tmp], v[v_tmp] ; v_tmp_3=s_block_ie*s_sub_c+v_wei_ic
    v_mul_lo_u32 v[v_tmp+2], s[s_wei_stride_c], v[v_tmp+3] ; v_tmp_2=(s_block_ie*s_sub_c+v_wei_ic)*k*y*x
    v_add_u32 v[v_tmp+1], v[v_tmp+2], v[v_tmp+1] ; v_tmp_1=(s_block_ie*s_sub_c+v_wei_ic)*k*y*x+(block_ik+v_wei_ik)*y*x
    v_mul_lo_u32 v[v_tmp+2], s[s_wei_stride_k], v[v_tmp]
    v_sub_u32 v[v_tmp+4], v[v_wei_ie], v[v_tmp+2] ; v_tmp_4=v_wei_ie-(v_wei_ie/(y*x))*(y*x)
    v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+1], 2 ; v_wei_os=v_wei_ic*k*y*x+(block_ik+v_wei_ik)*y*x+v_wei_iy*x+v_wei_ix
    s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2

    ; load wei from global
    .v_wei_load_e_k_4_2_ev4 v_gld_a, s_p_buf_wei, v_wei_os, s_wei_stride_k, s_tmp

    ; calculate out index ik0, ik1, ib
    v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_im]
    v_add_u32 v[v_tmp], s[s_block_ik], v[v_tmp+1]
    v_lshrrev_b32 v[v_out_ik0], 6, v[v_tmp]
    v_and_b32 v[v_out_ik1], 63, v[v_tmp]

    v_add_u32 v[v_out_ib], s[s_block_ib], v[v_gemm_in]
    .v_u32_div_vs v_tmp+4, v_out_ib, s_out_stride_n2, v_tmp, s_tmp
    v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n2], v[v_tmp+4]
    v_sub_u32 v[v_tmp+5], v[v_out_ib], v[v_tmp+1]
    .v_u32_div_vs v_tmp+6, v_tmp+5, s_wo, v_tmp, s_tmp
    v_mul_lo_u32 v[v_tmp+1], s[s_wo], v[v_tmp+6]
    v_sub_u32 v[v_tmp+5], v[v_tmp+5], v[v_tmp+1]
    ; v_tmp+4:in0, v_tmp+6:iho, v_tmp+5:iwo

    v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_tmp+6] ; v_tmp=iho*wo
    v_add_u32 v[v_out_os], v[v_tmp], v[v_tmp+5] ; v_out_os=iho*wo+iwo
    s_lshl_b32 s[s_tmp+1], s[s_out_stride_n2], 3 ; s_tmp_1=ho*wo*8
    v_mul_lo_u32 v[v_tmp], s[s_tmp+1], v[v_tmp+4] ; v_tmp=in0*ho*wo*8
    v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp]

    s_lshl_b32 s[s_out_stride_k0], s[s_out_stride_k0], 2
    v_lshl_or_b32 v[v_tmp], v[v_out_ik0], 6, v[v_out_ik1] ; v_tmp=ik0*64+ik1
    s_lshl_b32 s[s_out_stride_n1], s[s_out_stride_n1], 2
    v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_k1], v[v_tmp] ; v_tmp_1=(ik0*64+ik1)*out_stride_k1
    s_lshl_b32 s[s_out_stride_n2], s[s_out_stride_n2], 2
    v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp+1] ; v_out_os = (ik0*64+ik1)*n*ho*wo+in0*ho*wo*8+iho*wo+iwo
    s_mul_i32 s[s_tmp], s[s_out_stride_k1], s[s_k] ; s_tmp=k*n*wo*ho
    s_mul_i32 s[s_tmp], s[s_tmp], s[s_block_ie] ; s_tmp=s_block_ie*k*n*wo*ho
    v_add_u32 v[v_out_os], v[v_out_os], s[s_tmp] ; v_out_os=v_out_os+s_block_ie*k*n*wo*ho
    s_lshl_b32 s[s_out_stride_k1], s[s_out_stride_k1], 2
    v_lshlrev_b32 v[v_out_os], 2, v[v_out_os] 

    ; in lds offset block e_n1_b_n2
    v_lshlrev_b32 v[v_tmp], 7, v[v_in_ie]
    v_lshl_or_b32 v[v_tmp], v[v_in_ib], 2, v[v_tmp]
    v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] ; v_sst_b_os=ie*128+ib*4

    ; wei lds offset block e_k
    v_lshl_or_b32 v[v_tmp], v[v_wei_ie], 7, v[v_wei_ik]
    v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp]
    v_add_u32 v[v_sst_a_os], 8192, v[v_sst_a_os]

    s_mov_b32 s[s_p_buf_out+2], 0xffffffff
    s_mov_b32 s[s_p_buf_out+3], 0x27000
    .v_clear_nc v_c, 64

    ; start FMA loop, 8x8 thread tile with 4x4 sub-tile
    s_waitcnt vmcnt(2)
    .v_in_sst_e_n1_b_n2_1_2_1_4_n1s256_n2v4 v_gld_b, v_sst_b_os

    s_waitcnt vmcnt(0)
    .v_wei_sst_e_k_4_2_es512_kv2 v_gld_a, v_sst_a_os

    ; E = C * Y * X
    s_mul_i32 s[s_tmp], s[s_sub_c], s[s_wei_stride_k] ; s_tmp=sub_c*y*x
    s_lshr_b32 s[s_tmp], s[s_tmp], 2 ; cause wei stride k has shl 2
    s_sub_i32 s[s_kitr], s[s_tmp], 16
    s_cmp_gt_i32 s[s_kitr], 0
    s_cbranch_scc0 L_igemm_v4r1_dynamic_wrw_128x128x16_8x8_4x4x4x4x4x4_16x1x16x1_4x64_end

    .v_in_wei_move_slice_window_wrw v_in_os, v_wei_os, v_in_ic, v_in_iy, v_in_ix, v_in_ihi, v_in_iwi, v_flag, s_hi, s_wi, s_y, s_x, s_in_stride_c, s_wei_stride_c s_dilation_h, s_dilation_w, s_in_ic, s_in_iy, s_in_ix, v_idc, v_idy, v_idx, v_wei_idc, s_tmp
    v_xor_b32 v[v_sst_b_os], 0x4000, v[v_sst_b_os] ; switch double buffer b store
    v_xor_b32 v[v_sst_a_os], 0x4000, v[v_sst_a_os] ; switch double buffer a store
    s_waitcnt lgkmcnt(0)
    s_barrier


    .v_in_load_e_n1_b_n2_1_2_1_4 v_gld_b, s_p_buf_in, v_in_os, s_in_stride_n1, s_in_stride_n2, v_flag, s_tmp
    .v_wei_load_e_k_4_2_ev4 v_gld_a, s_p_buf_wei, v_wei_os, s_wei_stride_k, s_tmp

L_igemm_v4r1_dynamic_wrw_128x128x16_8x8_4x4x4x4x4x4_16x1x16x1_4x64_fma_body:
    ; do fma accumulate with unroll 16
    ds_read_b128 v[v_a:v_a+3], v[v_sld_a_os] 
    ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] 
    ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:256
    ds_read_b128 v[v_a+4:v_a+4+3], v[v_sld_a_os] offset:256

    .itr_k = 0
    .rept 15
        s_waitcnt lgkmcnt(2)
        .v_fma_4x4_s8 v_c,v_a,v_b

        s_waitcnt lgkmcnt(1)
        .v_fma_4x4_s8 v_c+4,v_a,v_b+4

        ds_read_b128 v[v_a:v_a+3], v[v_sld_a_os] offset:0+(.itr_k+1)*512
        s_waitcnt lgkmcnt(1)
        .v_fma_4x4_s8 v_c+32,v_a+4,v_b

        ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:0+(.itr_k+1)*512
        .v_fma_4x4_s8 v_c+36,v_a+4,v_b+4

        ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:0+(.itr_k+1)*512+256
        ds_read_b128 v[v_a+4:v_a+4+3], v[v_sld_a_os] offset:0+(.itr_k+1)*512+256
        .itr_k = .itr_k + 1
    .endr

    ; last unroll
    v_xor_b32 v[v_sld_b_os], 0x4000, v[v_sld_b_os] ; switch double buffer b load
    v_xor_b32 v[v_sld_a_os], 0x4000, v[v_sld_a_os] ; switch double buffer a load
    s_waitcnt lgkmcnt(2)
    .v_fma_4x4_s8 v_c,v_a,v_b

    s_waitcnt lgkmcnt(1)
    .v_fma_4x4_s8 v_c+4,v_a,v_b+4

    s_waitcnt vmcnt(2)
    .v_in_sst_e_n1_b_n2_1_2_1_4_n1s256_n2v4 v_gld_b, v_sst_b_os
    s_waitcnt vmcnt(0)
    .v_wei_sst_e_k_4_2_es512_kv2 v_gld_a, v_sst_a_os
    s_sub_i32 s[s_kitr], s[s_kitr], 16
    s_cmp_gt_i32 s[s_kitr], 0
    s_cbranch_scc0 L_igemm_v4r1_dynamic_wrw_128x128x16_8x8_4x4x4x4x4x4_16x1x16x1_4x64_fma_finishing
    .v_in_wei_move_slice_window_wrw v_in_os, v_wei_os, v_in_ic, v_in_iy, v_in_ix, v_in_ihi, v_in_iwi, v_flag, s_hi, s_wi, s_y, s_x, s_in_stride_c, s_wei_stride_c, s_dilation_h, s_dilation_w, s_in_ic, s_in_iy, s_in_ix, v_idc, v_idy, v_idx, v_wei_idc, s_tmp

    s_waitcnt lgkmcnt(4)
    .v_fma_4x4_s8 v_c+32,v_a+4,v_b

    v_xor_b32 v[v_sst_b_os], 0x4000, v[v_sst_b_os] ; switch double buffer b store
    v_xor_b32 v[v_sst_a_os], 0x4000, v[v_sst_a_os] ; switch double buffer a store
    s_waitcnt lgkmcnt(0)
    s_barrier
    .v_in_load_e_n1_b_n2_1_2_1_4 v_gld_b, s_p_buf_in, v_in_os, s_in_stride_n1, s_in_stride_n2, v_flag, s_tmp
    .v_wei_load_e_k_4_2_ev4 v_gld_a, s_p_buf_wei, v_wei_os, s_wei_stride_k, s_tmp
    .v_fma_4x4_s8 v_c+36,v_a+4,v_b+4

    s_branch L_igemm_v4r1_dynamic_wrw_128x128x16_8x8_4x4x4x4x4x4_16x1x16x1_4x64_fma_body
L_igemm_v4r1_dynamic_wrw_128x128x16_8x8_4x4x4x4x4x4_16x1x16x1_4x64_fma_finishing:
    s_waitcnt lgkmcnt(4)
    .v_fma_4x4_s8 v_c+32,v_a+4,v_b
    .v_fma_4x4_s8 v_c+36,v_a+4,v_b+4
L_igemm_v4r1_dynamic_wrw_128x128x16_8x8_4x4x4x4x4x4_16x1x16x1_4x64_end:
    s_waitcnt lgkmcnt(0)
    s_barrier
    ds_read_b128 v[v_a:v_a+3], v[v_sld_a_os] 
    ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] 
    ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:256
    ds_read_b128 v[v_a+4:v_a+4+3], v[v_sld_a_os] offset:256
    .itr_k = 0
    .rept 15
        s_waitcnt lgkmcnt(2)
        .v_fma_4x4_s8 v_c,v_a,v_b

        s_waitcnt lgkmcnt(1)
        .v_fma_4x4_s8 v_c+4,v_a,v_b+4

        ds_read_b128 v[v_a:v_a+3], v[v_sld_a_os] offset:0+(.itr_k+1)*512
        s_waitcnt lgkmcnt(1)
        .v_fma_4x4_s8 v_c+32,v_a+4,v_b

        ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:0+(.itr_k+1)*512
        .v_fma_4x4_s8 v_c+36,v_a+4,v_b+4

        ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:0+(.itr_k+1)*512+256
        ds_read_b128 v[v_a+4:v_a+4+3], v[v_sld_a_os] offset:0+(.itr_k+1)*512+256
        .itr_k = .itr_k + 1
    .endr

    ; last unroll
    s_waitcnt lgkmcnt(2)
    .v_fma_4x4_s8 v_c,v_a,v_b

    s_waitcnt lgkmcnt(1)
    .v_fma_4x4_s8 v_c+4,v_a,v_b+4

    s_waitcnt lgkmcnt(0)
    .v_fma_4x4_s8 v_c+32,v_a+4,v_b

    .v_fma_4x4_s8 v_c+36,v_a+4,v_b+4

    s_mov_b32 s[s_tmp], 0
    s_mov_b32 s[s_tmp+1], 0
    s_mov_b32 s[s_tmp+2], 0
    s_mov_b32 s[s_tmp+3], 0
    .v_out_write_k0_k1_n1_b_n2_2_4_2_1_4 v_c, s_p_buf_out, v_out_os, s_out_stride_k0, s_out_stride_k1, s_out_stride_n1, s_out_stride_n2, s_tmp
    
L_program_end:
    s_endpgm
.rodata
.p2align 6
.amdhsa_kernel  igemm_v4r1_dynamic_wrw_128x128x16_8x8_4x4x4x4x4x4_16x1x16x1_4x64
    .amdhsa_group_segment_fixed_size 32768
    .amdhsa_user_sgpr_kernarg_segment_ptr 1
    .amdhsa_system_sgpr_workgroup_id_x 1
    .amdhsa_system_vgpr_workitem_id 0
    .amdhsa_next_free_vgpr 120
    .amdhsa_next_free_sgpr 58
    .amdhsa_ieee_mode 0
    .amdhsa_dx10_clamp 0
    //xnack disabled by default for asm kernels
    .amdhsa_reserve_xnack_mask 0
.end_amdhsa_kernel

;----------------------------------------------------------
; starting of kernel  igemm_v4r1_dynamic_wrw_128x128x16_8x8_4x4x4x4x4x4_16x1x16x1_16x16
; b_per_block                      : 16
; k_per_block                      : 128
; e_per_block                      : 16
; gemm_n_repeat                    : 2
; gemm_m_per_thread_subc           : 4
; gemm_m_level0_cluster            : 4
; gemm_m_level1_cluster            : 4
; gemm_n_per_thread_subc           : 4
; gemm_n_level0_cluster            : 4
; gemm_n_level1_cluster            : 4
; in_block_copy_cluster_lengths_e  : 16
; in_block_copy_cluster_lengths_n1 : 1
; in_block_copy_cluster_lengths_b  : 16
; in_block_copy_cluster_lengths_n2 : 1
; wei_block_copy_cluster_lengths_e : 16
; wei_block_copy_cluster_lengths_k : 16
; 
; in_block_copy_sub_lengths_e      : 1
; in_block_copy_sub_lengths_n1     : 2
; in_block_copy_sub_lengths_b      : 1
; in_block_copy_sub_lengths_n2     : 4
; wei_block_copy_sub_lengths_e     : 1
; wei_block_copy_sub_lengths_k     : 8
; block_size                       : 256
; thread_tile                      : 8x8
; 
; kernarg offset
.set k_p_in,                0
.set k_p_wei,               8
.set k_p_out,               16
.set k_hi,                  24
.set k_wi,                  28
.set k_n,                   32
.set k_k,                   36
.set k_c,                   40
.set k_ho,                  44
.set k_wo,                  48
.set k_stride_h,            52
.set k_stride_w,            56
.set k_dilation_h,          60
.set k_dilation_w,          64
.set k_pad_h,               68
.set k_pad_w,               72
.set k_y,                   76
.set k_x,                   80
.set k_gemmk_groups,        84

; sgpr
.set s_ka,                  0
.set s_bx,                  2
.set s_p_in,                4
.set s_p_wei,               6
.set s_hi,                  8
.set s_wi,                  9
.set s_n,                   10
.set s_k,                   11
.set s_c,                   12
.set s_ho,                  13
.set s_wo,                  14
.set s_stride_h,            15
.set s_stride_w,            16
.set s_dilation_h,          17
.set s_dilation_w,          18
.set s_pad_h,               19
.set s_pad_w,               20
.set s_y,                   21
.set s_x,                   22
.set s_gemmkgroups,         23
.set s_p_out,               24
.set s_block_ie,            52
.set s_block_ik,            26
.set s_block_ib,            27
.set s_in_stride_c,         28
.set s_in_stride_n2,        29
.set s_in_stride_n1,        30
.set s_in_ic,               31
.set s_in_iy,               32
.set s_in_ix,               33
.set s_group_stride,        53
.set s_wei_stride_c,        35
.set s_wei_stride_k,        36
.set s_out_stride_k0,       37
.set s_out_stride_k1,       38
.set s_out_stride_n1,       39
.set s_out_stride_n2,       40
.set s_kitr,                0
.set s_tmp,                 44
.set s_p_buf_in,            s_p_in      ; 4 sgpr used for MUBUF
.set s_p_buf_wei,           48
.set s_p_buf_out,           s_p_out
.set s_wei_slice,           1
.set s_sub_c,               54
.set s_group_left,          55

; vgpr
.set v_c,                   0
.set v_a,                   64
.set v_b,                   72
.set v_gld_a,               80
.set v_gld_b,               88
.set v_in_os,               96
.set v_wei_os,              97
.set v_sst_a_os,            98
.set v_sst_b_os,            99
.set v_sld_a_os,            100
.set v_sld_b_os,            101
.set v_out_os,              102
.set v_flag,                103
.set v_in_ic,               104
.set v_in_iy,               105
.set v_in_ix,               106
.set v_in_ihi,              107
.set v_in_iwi,              108
.set v_in_in0,              63
.set v_in_iho,              62
.set v_in_iwo,              61
.set v_in_ie,               60
.set v_in_in1,              59
.set v_in_ib,               58
.set v_in_in2,              57
.set v_wei_ie,              56
.set v_wei_ik,              55
.set v_out_ik0,             54
.set v_out_ik1,             53
.set v_out_ib,              52
.set v_gemm_in,             51
.set v_gemm_im,             50
.set v_idc,                 109
.set v_idy,                 110
.set v_idx,                 111
.set v_wei_idc,             112
.set v_wei_idyx,            113
.set v_tmp,                 44
.set v_end,                 114

.text
.globl  igemm_v4r1_dynamic_wrw_128x128x16_8x8_4x4x4x4x4x4_16x1x16x1_16x16
.p2align 8
.type  igemm_v4r1_dynamic_wrw_128x128x16_8x8_4x4x4x4x4x4_16x1x16x1_16x16,@function
 igemm_v4r1_dynamic_wrw_128x128x16_8x8_4x4x4x4x4x4_16x1x16x1_16x16:
    s_load_dwordx4  s[s_p_in:s_p_in+3],         s[s_ka:s_ka+1],     0+k_p_in
    s_load_dwordx2  s[s_p_out:s_p_out+1],       s[s_ka:s_ka+1],     0+k_p_out
    s_load_dwordx8  s[s_hi:s_hi+7],             s[s_ka:s_ka+1],     0+k_hi
    s_load_dwordx4  s[s_stride_w:s_stride_w+3], s[s_ka:s_ka+1],     0+k_stride_w
    s_load_dwordx2  s[s_pad_w:s_pad_w+1],       s[s_ka:s_ka+1],     0+k_pad_w
    s_load_dwordx2  s[s_x:s_gemmkgroups],       s[s_ka:s_ka+1],     0+k_x

    ; in e_n1_b_n2 cluster_lengths:{16,1,16,1}, sub_lengths:{1,2,1,4}, order:{0,1,3,2}
    v_and_b32 v[v_in_ie], 15, v0 ; in_ib=tid%16
    v_lshrrev_b32 v[v_tmp], 4, v0 ; tmp=tid/16
    v_mov_b32 v[v_in_in2], 0 ; in_in2=0
    v_mov_b32 v[v_in_in1], 0 ; in_in1=0
    v_and_b32 v[v_in_ib], 15, v[v_tmp] ; in_ie=(tid/16)%16
    ; wei e_k cluster_lengths:{16,16}, sub_lengths:{1,8}, order:{1,0}
    v_and_b32 v[v_wei_ie], 15, v0 ; wei_ie=tid%16
    v_lshrrev_b32 v[v_tmp], 4, v0 ; tmp=tid/16
    ;v_lshlrev_b32 v[v_wei_ie], 0, v[v_wei_ie] ; wei_ie=(tid%16)*1
    v_and_b32 v[v_wei_ik], 15, v[v_tmp] ; wei_ik=(tid/16)%16
    v_lshlrev_b32 v[v_wei_ik], 3, v[v_wei_ik] ; wei_ik=((tid/16)%16)*8
    s_waitcnt lgkmcnt(0)

    ; calculate c per group
    s_lshr_b32 s[s_sub_c], s[s_c], s[s_gemmkgroups]

    ; calculate index
    s_mul_i32 s[s_out_stride_n2], s[s_ho], s[s_wo] ; out_stride_n2=ho*wo
    s_mul_i32 s[s_out_stride_k1], s[s_n], s[s_out_stride_n2] ; out_stride_k1=ho*wo*n
    s_lshl_b32 s[s_out_stride_k0], s[s_out_stride_k1], 6 ; out_stride_k0=ho*wo*n*64
    s_lshl_b32 s[s_out_stride_n1], s[s_out_stride_n2], 2 ; out_stride_n1=ho*wo*4
    s_mul_i32 s[s_in_stride_n2], s[s_hi], s[s_wi] ; in_stride_n2=hi*wi
    s_mul_i32 s[s_in_stride_c], s[s_n], s[s_in_stride_n2] ; in_stride_c=hi*wi*n
    s_mul_i32 s[s_wei_stride_k], s[s_y], s[s_x] ; wei_stride_k=y*x
    s_mul_i32 s[s_wei_stride_c], s[s_k], s[s_wei_stride_k] ; wei_stride_c=k*y*x
    s_mov_b64 s[s_p_buf_wei:s_p_buf_wei+1], s[s_p_wei:s_p_wei+1]
    s_mov_b32 s[s_p_buf_in+2], 0xffffffff
    s_mov_b32 s[s_p_buf_in+3], 0x27000
    s_mov_b32 s[s_p_buf_wei+2], 0xffffffff
    s_mov_b32 s[s_p_buf_wei+3], 0x27000

    ; block e,k,b index global
    s_lshr_b32 s[s_tmp], s[s_n], 3 ; tmp=n/8 maybe n0
    s_mul_i32 s[s_tmp+1], s[s_out_stride_n2], s[s_tmp]; tmp_1=ho*wo*n/8
    s_lshr_b32 s[0], s[s_tmp+1], 4 ; s_0=ho*wo*n/8/16
    s_lshr_b32 s[s_tmp], s[s_k], 7 ; s_tmp=k/128
    s_mul_i32 s[s_group_stride], s[s_tmp], s[0] ; s_group_stride=n*ho*wo/128*k/128

    ; block e index on global
    .v_u32_div_ss v_tmp+5, s_bx, s_group_stride, v_tmp, s_tmp ; v_tmp_5=block_idx/(n*ho*wo/128*k/128)
    v_readfirstlane_b32 s[s_block_ie], v[v_tmp+5] ; s_block_ie=block_idx/(n*ho*wo/128*k/128)
    
    ; block k, b index on global
    s_mul_i32 s[s_tmp+2], s[s_block_ie], s[s_group_stride] ; s_tmp_2=block_idx/(n*ho*wo/128*k/128)*(n*ho*wo/128*k/128)
    s_sub_i32 s[s_group_left], s[s_bx], s[s_tmp+2] ; s_tmp_1=block_idx-block_idx/(n*ho*wo/128*k/128)*(n*ho*wo/128*k/128)
    
    .v_u32_div_ss v_tmp+5, s_group_left, 0, v_tmp, s_tmp ; v_tmp_5=block_idx/(ho*wo*n/8/16)
    v_readfirstlane_b32 s[s_tmp], v[v_tmp+5] ; s_tmp=v_tmp_5
    s_mul_i32 s[s_tmp+2], s[s_tmp], s[0] ; s_tmp_2=block_idx/(ho*wo*n/8/16)*(ho*wo*n/8/16)
    s_sub_i32 s[s_tmp+1], s[s_group_left], s[s_tmp+2] ; s_tmp_1=block_idx-s_tmp_2
    s_lshl_b32 s[s_block_ik], s[s_tmp], 7 ; s_block_ik=block_idx/(ho*wo*n/8/16)*128
    s_lshl_b32 s[s_block_ib], s[s_tmp+1], 4 ; s_block_ib=s_tmp_1*16
    ;s_mul_i32 s[s_block_ie], s[s_block_ie], s[s_sub_c] ; s_block_ie*=s_sub_c

    ; calculate input transform
    ; e_n1_b_n2:b, transform: b -> n0*ho*wo
    v_add_u32 v[v_tmp+4], s[s_block_ib], v[v_in_ib] ; v_tmp_4=block_ib+v_in_ib(thread_ib)
    .v_u32_div_vs v_in_in0, v_tmp+4, s_out_stride_n2, v_tmp, s_tmp ; v_in_n0=(block_ib+v_in_ib)/(ho*wo)
    v_mul_lo_u32 v[v_tmp], s[s_out_stride_n2], v[v_in_in0] ; v_tmp=v_in_n0*(ho*wo)
    v_sub_u32 v[v_tmp+4], v[v_tmp+4], v[v_tmp] ; v_tmp_4=block_ib+v_in_ib-v_in_n0*(ho*wo)
    .v_u32_div_vs v_in_iho, v_tmp+4, s_wo, v_tmp, s_tmp ; v_in_iho=v_tmp_4/wo
    v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_in_iho] 
    v_sub_u32 v[v_in_iwo], v[v_tmp+4], v[v_tmp] ; v_in_iwo=v_tmp_4-v_in_iho*wo

    ; e_n1_b_n2:e
    ;   1) transform e -> c*y*x
    .v_u32_div_vs v_in_ic, v_in_ie, s_wei_stride_k, v_tmp, s_tmp ; v_in_ic=v_in_ie/(y*x)
    v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_in_ic] ; v_tmp=v_in_ic*(y*x)
    v_sub_u32 v[v_tmp+4], v[v_in_ie], v[v_tmp] ; v_tmp_4=v_in_ie-v_in_ic*(y*x)
    .v_u32_div_vs v_in_iy, v_tmp+4, s_x, v_tmp, s_tmp ; v_in_iy=v_tmp_4/x
    v_mul_lo_u32 v[v_tmp], s[s_x], v[v_in_iy] ; v_tmp=x*v_in_iy
    v_sub_u32 v[v_in_ix], v[v_tmp+4], v[v_tmp] ; v_in_ix=v_in_ie-v_in_ic*(y*x)-x*v_in_iy

    ;   2) transform iho, iwo, iy, ix -> hip, wip
    v_mul_lo_u32 v[v_tmp], s[s_stride_h], v[v_in_iho]
    v_mul_lo_u32 v[v_tmp+1], s[s_stride_w], v[v_in_iwo]
    v_mul_lo_u32 v[v_tmp+2], s[s_dilation_h], v[v_in_iy]
    v_mul_lo_u32 v[v_tmp+3], s[s_dilation_w], v[v_in_ix]

    ;   3) transform hip, wip -> hi, wi
    v_add_u32 v[v_tmp], v[v_tmp], v[v_tmp+2]
    v_add_u32 v[v_tmp+1], v[v_tmp+1], v[v_tmp+3]
    v_sub_i32 v[v_in_ihi], v[v_tmp], s[s_pad_h] ; v_in_ihi=v_in_iho*s_stride_h+v_in_iy*s_dilation_h+s_pad_h
    v_sub_i32 v[v_in_iwi], v[v_tmp+1], s[s_pad_w] ; v_in_iwi=v_in_iwo*s_stride_w+v_in_ix*s_dilation_w+s_pad_w

    ; set input flag
    .v_in_set_flag v_flag, v_in_ihi, v_in_iwi, s_hi, s_wi, s_tmp

    ; in offset: from ihi, iwi, ic, in, calculate v_in_os
    v_mul_lo_u32 v[v_tmp], s[s_wi], v[v_in_ihi] ; v_tmp=v_in_ihi*s_wi
    v_add_u32 v[v_tmp], v[v_tmp], v[v_in_iwi] ; v_tmp=v_in_ihi*s_wi+v_in_iwi
    s_mul_i32 s[s_tmp], s[s_block_ie], s[s_sub_c] ; s_tmp=s_block_ie*s_sub_c
    v_add_i32 v[v_tmp+2], s[s_tmp], v[v_in_ic] ; v_tmp_2=s_block_ie*s_sub_c+v_in_ic
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_c], v[v_tmp+2] ; v_tmp_1=v_in_ic*n*hi*wi
    v_add_u32 v[v_tmp], v[v_tmp], v[v_tmp+1] ; v_tmp=v_in_ic*n*hi*wi+v_in_ihi*s_wi+v_in_iwi
    v_lshl_add_u32 v[v_tmp+1], v[v_in_in0], 3, v[v_in_in2] ; v_tmp_1=v_in_in0*8+v_in_in2
    v_lshl_add_u32 v[v_tmp+1], v[v_in_in1], 2, v[v_tmp+1] ; v_tmp_1=v_in_in0*8+v_in_in2+v_in_in1*4
    v_mul_lo_u32 v[v_tmp+1], s[s_in_stride_n2], v[v_tmp+1] ; v_tmp_1=(v_in_in0*8+v_in_in2+v_in_in1*4)*hi*wi
    v_add_lshl_u32 v[v_in_os], v[v_tmp], v[v_tmp+1], 2 ; v_in_os=((v_in_in0*8+v_in_in2+v_in_in1*4)*hi*wi
                                                       ;           +v_in_ic*n*hi*wi+v_in_ihi*s_wi+v_in_iwi)*sizeof(float)

    s_lshl_b32 s[s_in_stride_n2], s[s_in_stride_n2], 2
    s_lshl_b32 s[s_in_stride_n1], s[s_in_stride_n2], 2
    ; load input from global
    .v_in_load_e_n1_b_n2_1_2_1_4 v_gld_b, s_p_buf_in, v_in_os, s_in_stride_n1, s_in_stride_n2, v_flag, s_tmp

    ; calculate SliceWindow e=c*y*x. this is same for both input/weight
    s_mov_b32 s[s_wei_slice], 16
    .v_u32_div_ss v_tmp+4, s_wei_slice, s_wei_stride_k, v_tmp, s_tmp
    v_readfirstlane_b32 s[s_in_ic], v[v_tmp+4]
    s_mul_i32 s[s_tmp], s[s_wei_stride_k], s[s_in_ic]
    s_sub_i32 s[s_wei_slice], s[s_wei_slice], s[s_tmp]
    .v_u32_div_ss v_tmp+4, s_wei_slice, s_x, v_tmp, s_tmp
    v_readfirstlane_b32 s[s_in_iy], v[v_tmp+4]
    s_mul_i32 s[s_tmp], s[s_x], s[s_in_iy]
    s_sub_i32 s[s_in_ix], s[s_wei_slice], s[s_tmp]

    ; c thread mapping
    v_and_b32 v[v_tmp+4], 15, v0 ; tid%16
    v_and_b32 v[v_tmp], 3, v[v_tmp+4] ; (tid%16)%4
    v_lshrrev_b32 v[v_tmp+1], 2, v[v_tmp+4] ; (tid%16)/4

    v_lshrrev_b32 v[v_tmp+4], 4, v0 ; tid/16
    v_and_b32 v[v_tmp+2], 3, v[v_tmp+4] ; (tid/16)%4
    v_lshrrev_b32 v[v_tmp+3], 2, v[v_tmp+4] ; (tid/16)/4

    v_lshl_or_b32 v[v_gemm_in], v[v_tmp+2], 2, v[v_tmp]               ; in
    v_lshl_or_b32 v[v_gemm_im], v[v_tmp+3], 2, v[v_tmp+1]             ; im
    v_lshlrev_b32 v[v_sld_b_os], 4, v[v_gemm_in]
    v_lshlrev_b32 v[v_sld_a_os], 4, v[v_gemm_im]
    v_add_u32 v[v_sld_a_os], 8192, v[v_sld_a_os]

    ; calculate weight transform
    v_add_u32 v[v_tmp], s[s_block_ik], v[v_wei_ik] ; v_tmp=block_ik+v_wei_ik
    v_mul_lo_u32 v[v_tmp+1], s[s_wei_stride_k], v[v_tmp] ; v_tmp_1=(block_ik+v_wei_ik)*y*x
    .v_u32_div_vs v_tmp, v_wei_ie, s_wei_stride_k, v_tmp+2, s_tmp
    s_mul_i32 s[s_tmp], s[s_block_ie], s[s_sub_c] ; s_tmp=s_block_ie*s_sub_c
    v_add_i32 v[v_tmp+3], s[s_tmp], v[v_tmp] ; v_tmp_3=s_block_ie*s_sub_c+v_wei_ic
    v_mul_lo_u32 v[v_tmp+2], s[s_wei_stride_c], v[v_tmp+3] ; v_tmp_2=(s_block_ie*s_sub_c+v_wei_ic)*k*y*x
    v_add_u32 v[v_tmp+1], v[v_tmp+2], v[v_tmp+1] ; v_tmp_1=(s_block_ie*s_sub_c+v_wei_ic)*k*y*x+(block_ik+v_wei_ik)*y*x
    v_mul_lo_u32 v[v_tmp+2], s[s_wei_stride_k], v[v_tmp]
    v_sub_u32 v[v_tmp+4], v[v_wei_ie], v[v_tmp+2] ; v_tmp_4=v_wei_ie-(v_wei_ie/(y*x))*(y*x)
    v_add_lshl_u32 v[v_wei_os], v[v_tmp+4], v[v_tmp+1], 2 ; v_wei_os=v_wei_ic*k*y*x+(block_ik+v_wei_ik)*y*x+v_wei_iy*x+v_wei_ix
    s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2

    ; load wei from global
    .v_wei_load_e_k_1_8_ev1 v_gld_a, s_p_buf_wei, v_wei_os, s_wei_stride_k, s_tmp

    ; calculate out index ik0, ik1, ib
    v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_im]
    v_add_u32 v[v_tmp], s[s_block_ik], v[v_tmp+1]
    v_lshrrev_b32 v[v_out_ik0], 6, v[v_tmp]
    v_and_b32 v[v_out_ik1], 63, v[v_tmp]

    v_add_u32 v[v_out_ib], s[s_block_ib], v[v_gemm_in]
    .v_u32_div_vs v_tmp+4, v_out_ib, s_out_stride_n2, v_tmp, s_tmp
    v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_n2], v[v_tmp+4]
    v_sub_u32 v[v_tmp+5], v[v_out_ib], v[v_tmp+1]
    .v_u32_div_vs v_tmp+6, v_tmp+5, s_wo, v_tmp, s_tmp
    v_mul_lo_u32 v[v_tmp+1], s[s_wo], v[v_tmp+6]
    v_sub_u32 v[v_tmp+5], v[v_tmp+5], v[v_tmp+1]
    ; v_tmp+4:in0, v_tmp+6:iho, v_tmp+5:iwo

    v_mul_lo_u32 v[v_tmp], s[s_wo], v[v_tmp+6] ; v_tmp=iho*wo
    v_add_u32 v[v_out_os], v[v_tmp], v[v_tmp+5] ; v_out_os=iho*wo+iwo
    s_lshl_b32 s[s_tmp+1], s[s_out_stride_n2], 3 ; s_tmp_1=ho*wo*8
    v_mul_lo_u32 v[v_tmp], s[s_tmp+1], v[v_tmp+4] ; v_tmp=in0*ho*wo*8
    v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp]

    s_lshl_b32 s[s_out_stride_k0], s[s_out_stride_k0], 2
    v_lshl_or_b32 v[v_tmp], v[v_out_ik0], 6, v[v_out_ik1] ; v_tmp=ik0*64+ik1
    s_lshl_b32 s[s_out_stride_n1], s[s_out_stride_n1], 2
    v_mul_lo_u32 v[v_tmp+1], s[s_out_stride_k1], v[v_tmp] ; v_tmp_1=(ik0*64+ik1)*out_stride_k1
    s_lshl_b32 s[s_out_stride_n2], s[s_out_stride_n2], 2
    v_add_u32 v[v_out_os], v[v_out_os], v[v_tmp+1] ; v_out_os = (ik0*64+ik1)*n*ho*wo+in0*ho*wo*8+iho*wo+iwo
    s_mul_i32 s[s_tmp], s[s_out_stride_k1], s[s_k] ; s_tmp=k*n*wo*ho
    s_mul_i32 s[s_tmp], s[s_tmp], s[s_block_ie] ; s_tmp=s_block_ie*k*n*wo*ho
    v_add_u32 v[v_out_os], v[v_out_os], s[s_tmp] ; v_out_os=v_out_os+s_block_ie*k*n*wo*ho
    s_lshl_b32 s[s_out_stride_k1], s[s_out_stride_k1], 2
    v_lshlrev_b32 v[v_out_os], 2, v[v_out_os] 

    ; in lds offset block e_n1_b_n2
    v_lshlrev_b32 v[v_tmp], 7, v[v_in_ie]
    v_lshl_or_b32 v[v_tmp], v[v_in_ib], 2, v[v_tmp]
    v_lshlrev_b32 v[v_sst_b_os], 2, v[v_tmp] ; v_sst_b_os=ie*128+ib*4

    ; wei lds offset block e_k
    v_lshl_or_b32 v[v_tmp], v[v_wei_ie], 7, v[v_wei_ik]
    v_lshlrev_b32 v[v_sst_a_os], 2, v[v_tmp]
    v_add_u32 v[v_sst_a_os], 8192, v[v_sst_a_os]

    s_mov_b32 s[s_p_buf_out+2], 0xffffffff
    s_mov_b32 s[s_p_buf_out+3], 0x27000
    .v_clear_nc v_c, 64

    ; start FMA loop, 8x8 thread tile with 4x4 sub-tile
    s_waitcnt vmcnt(8)
    .v_in_sst_e_n1_b_n2_1_2_1_4_n1s256_n2v4 v_gld_b, v_sst_b_os

    s_waitcnt vmcnt(0)
    .v_wei_sst_e_k_1_8_kv8 v_gld_a, v_sst_a_os

    ; E = C * Y * X
    s_mul_i32 s[s_tmp], s[s_sub_c], s[s_wei_stride_k] ; s_tmp=sub_c*y*x
    s_lshr_b32 s[s_tmp], s[s_tmp], 2 ; cause wei stride k has shl 2
    s_sub_i32 s[s_kitr], s[s_tmp], 16
    s_cmp_gt_i32 s[s_kitr], 0
    s_cbranch_scc0 L_igemm_v4r1_dynamic_wrw_128x128x16_8x8_4x4x4x4x4x4_16x1x16x1_16x16_end

    .v_in_wei_move_slice_window_wrw v_in_os, v_wei_os, v_in_ic, v_in_iy, v_in_ix, v_in_ihi, v_in_iwi, v_flag, s_hi, s_wi, s_y, s_x, s_in_stride_c, s_wei_stride_c s_dilation_h, s_dilation_w, s_in_ic, s_in_iy, s_in_ix, v_idc, v_idy, v_idx, v_wei_idc, s_tmp
    v_xor_b32 v[v_sst_b_os], 0x4000, v[v_sst_b_os] ; switch double buffer b store
    v_xor_b32 v[v_sst_a_os], 0x4000, v[v_sst_a_os] ; switch double buffer a store
    s_waitcnt lgkmcnt(0)
    s_barrier

    .v_in_load_e_n1_b_n2_1_2_1_4 v_gld_b, s_p_buf_in, v_in_os, s_in_stride_n1, s_in_stride_n2, v_flag, s_tmp
    .v_wei_load_e_k_1_8_ev1 v_gld_a, s_p_buf_wei, v_wei_os, s_wei_stride_k, s_tmp

L_igemm_v4r1_dynamic_wrw_128x128x16_8x8_4x4x4x4x4x4_16x1x16x1_16x16_fma_body:
    ; do fma accumulate with unroll 16
    ds_read_b128 v[v_a:v_a+3], v[v_sld_a_os] 
    ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] 
    ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:256
    ds_read_b128 v[v_a+4:v_a+4+3], v[v_sld_a_os] offset:256
    
    .itr_k = 0
    .rept 15
        s_waitcnt lgkmcnt(2)
        .v_fma_4x4_s8 v_c,v_a,v_b

        s_waitcnt lgkmcnt(1)
        .v_fma_4x4_s8 v_c+4,v_a,v_b+4

        ds_read_b128 v[v_a:v_a+3], v[v_sld_a_os] offset:0+(.itr_k+1)*512
        s_waitcnt lgkmcnt(1)
        .v_fma_4x4_s8 v_c+32,v_a+4,v_b

        ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:0+(.itr_k+1)*512
        .v_fma_4x4_s8 v_c+36,v_a+4,v_b+4

        ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:0+(.itr_k+1)*512+256
        ds_read_b128 v[v_a+4:v_a+4+3], v[v_sld_a_os] offset:0+(.itr_k+1)*512+256
        .itr_k = .itr_k + 1
    .endr

    ; last unroll
    v_xor_b32 v[v_sld_b_os], 0x4000, v[v_sld_b_os] ; switch double buffer b load
    v_xor_b32 v[v_sld_a_os], 0x4000, v[v_sld_a_os] ; switch double buffer a load
    s_waitcnt lgkmcnt(2)
    .v_fma_4x4_s8 v_c,v_a,v_b

    s_waitcnt lgkmcnt(1)
    .v_fma_4x4_s8 v_c+4,v_a,v_b+4

    s_waitcnt vmcnt(8) 
    .v_in_sst_e_n1_b_n2_1_2_1_4_n1s256_n2v4 v_gld_b, v_sst_b_os
    s_waitcnt vmcnt(0)
    .v_wei_sst_e_k_1_8_kv8 v_gld_a, v_sst_a_os
    s_sub_i32 s[s_kitr], s[s_kitr], 16
    s_cmp_gt_i32 s[s_kitr], 0
    s_cbranch_scc0 L_igemm_v4r1_dynamic_wrw_128x128x16_8x8_4x4x4x4x4x4_16x1x16x1_16x16_fma_finishing
    .v_in_wei_move_slice_window_wrw v_in_os, v_wei_os, v_in_ic, v_in_iy, v_in_ix, v_in_ihi, v_in_iwi, v_flag, s_hi, s_wi, s_y, s_x, s_in_stride_c, s_wei_stride_c, s_dilation_h, s_dilation_w, s_in_ic, s_in_iy, s_in_ix, v_idc, v_idy, v_idx, v_wei_idc, s_tmp

    s_waitcnt lgkmcnt(4)
    .v_fma_4x4_s8 v_c+32,v_a+4,v_b

    v_xor_b32 v[v_sst_b_os], 0x4000, v[v_sst_b_os] ; switch double buffer b store
    v_xor_b32 v[v_sst_a_os], 0x4000, v[v_sst_a_os] ; switch double buffer a store
    s_waitcnt lgkmcnt(0)
    s_barrier
    .v_in_load_e_n1_b_n2_1_2_1_4 v_gld_b, s_p_buf_in, v_in_os, s_in_stride_n1, s_in_stride_n2, v_flag, s_tmp
    .v_wei_load_e_k_1_8_ev1 v_gld_a, s_p_buf_wei, v_wei_os, s_wei_stride_k, s_tmp
    .v_fma_4x4_s8 v_c+36,v_a+4,v_b+4

    s_branch L_igemm_v4r1_dynamic_wrw_128x128x16_8x8_4x4x4x4x4x4_16x1x16x1_16x16_fma_body
L_igemm_v4r1_dynamic_wrw_128x128x16_8x8_4x4x4x4x4x4_16x1x16x1_16x16_fma_finishing:
    s_waitcnt lgkmcnt(4)
    .v_fma_4x4_s8 v_c+32,v_a+4,v_b
    .v_fma_4x4_s8 v_c+36,v_a+4,v_b+4
L_igemm_v4r1_dynamic_wrw_128x128x16_8x8_4x4x4x4x4x4_16x1x16x1_16x16_end:
    s_waitcnt lgkmcnt(0)
    s_barrier
    ds_read_b128 v[v_a:v_a+3], v[v_sld_a_os] 
    ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] 
    ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:256
    ds_read_b128 v[v_a+4:v_a+4+3], v[v_sld_a_os] offset:256
    .itr_k = 0
    .rept 15
        s_waitcnt lgkmcnt(2)
        .v_fma_4x4_s8 v_c,v_a,v_b

        s_waitcnt lgkmcnt(1)
        .v_fma_4x4_s8 v_c+4,v_a,v_b+4

        ds_read_b128 v[v_a:v_a+3], v[v_sld_a_os] offset:0+(.itr_k+1)*512
        s_waitcnt lgkmcnt(1)
        .v_fma_4x4_s8 v_c+32,v_a+4,v_b

        ds_read_b128 v[v_b:v_b+3], v[v_sld_b_os] offset:0+(.itr_k+1)*512
        .v_fma_4x4_s8 v_c+36,v_a+4,v_b+4

        ds_read_b128 v[v_b+4:v_b+4+3], v[v_sld_b_os] offset:0+(.itr_k+1)*512+256
        ds_read_b128 v[v_a+4:v_a+4+3], v[v_sld_a_os] offset:0+(.itr_k+1)*512+256
        .itr_k = .itr_k + 1
    .endr

    ; last unroll
    s_waitcnt lgkmcnt(2)
    .v_fma_4x4_s8 v_c,v_a,v_b

    s_waitcnt lgkmcnt(1)
    .v_fma_4x4_s8 v_c+4,v_a,v_b+4

    s_waitcnt lgkmcnt(0)
    .v_fma_4x4_s8 v_c+32,v_a+4,v_b

    .v_fma_4x4_s8 v_c+36,v_a+4,v_b+4

    s_mov_b32 s[s_tmp], 0
    s_mov_b32 s[s_tmp+1], 0
    s_mov_b32 s[s_tmp+2], 0
    s_mov_b32 s[s_tmp+3], 0
    .v_out_write_k0_k1_n1_b_n2_2_4_2_1_4 v_c, s_p_buf_out, v_out_os, s_out_stride_k0, s_out_stride_k1, s_out_stride_n1, s_out_stride_n2, s_tmp
    
L_program_end_1:
    s_endpgm
.rodata
.p2align 6
.amdhsa_kernel  igemm_v4r1_dynamic_wrw_128x128x16_8x8_4x4x4x4x4x4_16x1x16x1_16x16
    .amdhsa_group_segment_fixed_size 32768
    .amdhsa_user_sgpr_kernarg_segment_ptr 1
    .amdhsa_system_sgpr_workgroup_id_x 1
    .amdhsa_system_vgpr_workitem_id 0
    .amdhsa_next_free_vgpr 120
    .amdhsa_next_free_sgpr 58
    .amdhsa_ieee_mode 0
    .amdhsa_dx10_clamp 0
    //xnack disabled by default for asm kernels
    .amdhsa_reserve_xnack_mask 0
.end_amdhsa_kernel

.amdgpu_metadata
---
amdhsa.version: [ 1, 0 ]
amdhsa.kernels:
  - .name: igemm_v4r1_dynamic_wrw_32x32x4_4x4_2x2x4x2x4x2_4x2x8x1_4x16
    .symbol: igemm_v4r1_dynamic_wrw_32x32x4_4x4_2x2x4x2x4x2_4x2x8x1_4x16.kd
    .sgpr_count: 58
    .vgpr_count: 60
    .kernarg_segment_align: 8
    .kernarg_segment_size: 88
    .group_segment_fixed_size: 2048
    .private_segment_fixed_size: 0
    .wavefront_size: 64
    .reqd_workgroup_size : [64, 1, 1]
    .max_flat_workgroup_size: 64
    .args:
    - { .name: p_in          , .size: 8, .offset:   0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
    - { .name: p_wei         , .size: 8, .offset:   8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
    - { .name: p_in          , .size: 8, .offset:  16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false}
    - { .name: hi            , .size: 4, .offset:  24, .value_kind: by_value, .value_type: i32}
    - { .name: wi            , .size: 4, .offset:  28, .value_kind: by_value, .value_type: i32}
    - { .name: n             , .size: 4, .offset:  32, .value_kind: by_value, .value_type: i32}
    - { .name: k             , .size: 4, .offset:  36, .value_kind: by_value, .value_type: i32}
    - { .name: c             , .size: 4, .offset:  40, .value_kind: by_value, .value_type: i32}
    - { .name: ho            , .size: 4, .offset:  44, .value_kind: by_value, .value_type: i32}
    - { .name: wo            , .size: 4, .offset:  48, .value_kind: by_value, .value_type: i32}
    - { .name: stride_h      , .size: 4, .offset:  52, .value_kind: by_value, .value_type: i32}
    - { .name: stride_w      , .size: 4, .offset:  56, .value_kind: by_value, .value_type: i32}
    - { .name: dilation_h    , .size: 4, .offset:  60, .value_kind: by_value, .value_type: i32}
    - { .name: dilation_w    , .size: 4, .offset:  64, .value_kind: by_value, .value_type: i32}
    - { .name: pad_h         , .size: 4, .offset:  68, .value_kind: by_value, .value_type: i32}
    - { .name: pad_w         , .size: 4, .offset:  72, .value_kind: by_value, .value_type: i32}
    - { .name: y             , .size: 4, .offset:  76, .value_kind: by_value, .value_type: i32}
    - { .name: x             , .size: 4, .offset:  80, .value_kind: by_value, .value_type: i32}
    - { .name: k_gemmk_groups, .size: 4, .offset:  84, .value_kind: by_value, .value_type: i32}
  - .name:  igemm_v4r1_dynamic_wrw_128x128x16_8x8_4x4x4x4x4x4_16x1x16x1_4x64
    .symbol:  igemm_v4r1_dynamic_wrw_128x128x16_8x8_4x4x4x4x4x4_16x1x16x1_4x64.kd
    .sgpr_count: 58
    .vgpr_count: 120
    .kernarg_segment_align: 8
    .kernarg_segment_size: 88
    .group_segment_fixed_size: 32768
    .private_segment_fixed_size: 0
    .wavefront_size: 64
    .reqd_workgroup_size : [256, 1, 1]
    .max_flat_workgroup_size: 256
    .args:
    - { .name: p_in          , .size: 8, .offset:   0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
    - { .name: p_wei         , .size: 8, .offset:   8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
    - { .name: p_in          , .size: 8, .offset:  16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false}
    - { .name: hi            , .size: 4, .offset:  24, .value_kind: by_value, .value_type: i32}
    - { .name: wi            , .size: 4, .offset:  28, .value_kind: by_value, .value_type: i32}
    - { .name: n             , .size: 4, .offset:  32, .value_kind: by_value, .value_type: i32}
    - { .name: k             , .size: 4, .offset:  36, .value_kind: by_value, .value_type: i32}
    - { .name: c             , .size: 4, .offset:  40, .value_kind: by_value, .value_type: i32}
    - { .name: ho            , .size: 4, .offset:  44, .value_kind: by_value, .value_type: i32}
    - { .name: wo            , .size: 4, .offset:  48, .value_kind: by_value, .value_type: i32}
    - { .name: stride_h      , .size: 4, .offset:  52, .value_kind: by_value, .value_type: i32}
    - { .name: stride_w      , .size: 4, .offset:  56, .value_kind: by_value, .value_type: i32}
    - { .name: dilation_h    , .size: 4, .offset:  60, .value_kind: by_value, .value_type: i32}
    - { .name: dilation_w    , .size: 4, .offset:  64, .value_kind: by_value, .value_type: i32}
    - { .name: pad_h         , .size: 4, .offset:  68, .value_kind: by_value, .value_type: i32}
    - { .name: pad_w         , .size: 4, .offset:  72, .value_kind: by_value, .value_type: i32}
    - { .name: y             , .size: 4, .offset:  76, .value_kind: by_value, .value_type: i32}
    - { .name: x             , .size: 4, .offset:  80, .value_kind: by_value, .value_type: i32}
    - { .name: k_gemmk_groups, .size: 4, .offset:  84, .value_kind: by_value, .value_type: i32}
  - .name:  igemm_v4r1_dynamic_wrw_128x128x16_8x8_4x4x4x4x4x4_16x1x16x1_16x16
    .symbol:  igemm_v4r1_dynamic_wrw_128x128x16_8x8_4x4x4x4x4x4_16x1x16x1_16x16.kd
    .sgpr_count: 58
    .vgpr_count: 120
    .kernarg_segment_align: 8
    .kernarg_segment_size: 88
    .group_segment_fixed_size: 32768
    .private_segment_fixed_size: 0
    .wavefront_size: 64
    .reqd_workgroup_size : [256, 1, 1]
    .max_flat_workgroup_size: 256
    .args:
    - { .name: p_in          , .size: 8, .offset:   0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
    - { .name: p_wei         , .size: 8, .offset:   8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
    - { .name: p_in          , .size: 8, .offset:  16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false}
    - { .name: hi            , .size: 4, .offset:  24, .value_kind: by_value, .value_type: i32}
    - { .name: wi            , .size: 4, .offset:  28, .value_kind: by_value, .value_type: i32}
    - { .name: n             , .size: 4, .offset:  32, .value_kind: by_value, .value_type: i32}
    - { .name: k             , .size: 4, .offset:  36, .value_kind: by_value, .value_type: i32}
    - { .name: c             , .size: 4, .offset:  40, .value_kind: by_value, .value_type: i32}
    - { .name: ho            , .size: 4, .offset:  44, .value_kind: by_value, .value_type: i32}
    - { .name: wo            , .size: 4, .offset:  48, .value_kind: by_value, .value_type: i32}
    - { .name: stride_h      , .size: 4, .offset:  52, .value_kind: by_value, .value_type: i32}
    - { .name: stride_w      , .size: 4, .offset:  56, .value_kind: by_value, .value_type: i32}
    - { .name: dilation_h    , .size: 4, .offset:  60, .value_kind: by_value, .value_type: i32}
    - { .name: dilation_w    , .size: 4, .offset:  64, .value_kind: by_value, .value_type: i32}
    - { .name: pad_h         , .size: 4, .offset:  68, .value_kind: by_value, .value_type: i32}
    - { .name: pad_w         , .size: 4, .offset:  72, .value_kind: by_value, .value_type: i32}
    - { .name: y             , .size: 4, .offset:  76, .value_kind: by_value, .value_type: i32}
    - { .name: x             , .size: 4, .offset:  80, .value_kind: by_value, .value_type: i32}
    - { .name: k_gemmk_groups, .size: 4, .offset:  84, .value_kind: by_value, .value_type: i32}
...
.end_amdgpu_metadata