shader main
// Compute shader, in execution order:
//   1. writes 0 to acc[0]..acc[255],
//   2. spins in a delay loop for s8 iterations,
//   3. loads two 4-dword blocks from the address in s[0:1],
//   4. computes a flat thread id, thread-group id and absolute thread id,
//   5. zeroes the VGPRs (indexed writes) and then a region of LDS,
//   6. stores a per-SIMD "coverage" index into the buffer and reads it back.
type(CS)
user_sgpr_count(8)   // number of user SGPRs preloaded for the wave (register map below)
    // NOTE(review): the map below lists nine registers (s0..s8), while the count
    // above is 8 and the original remark here read "2 for the buffer resource +
    // 5 for thread/thread group parameters" (= 7). Confirm against the dispatcher.
    //s[0:1] the memory address for the buffer resource
    //s2 x
    //s3 x*y
    //s4 x*y*z
    //s5 X
    //s6 X*Y
    //s7 output offset   (not read anywhere in this shader)
    //s8 loop            (iteration count of the delay loop below)

    // Thread-group id SGPRs; the code refers to them symbolically as s_tgid_*.
    // NOTE(review): the s9/s10/s11 numbers assume nine user SGPRs - see above.
    tgid_x_en(1)       //s_tgid_x s9
    tgid_y_en(1)       //s_tgid_y s10
    tgid_z_en(1)       //s_tgid_z s11

    //v0 for tid_x
    //v1 for tid_y
    //v2 for tid_z

    // Clear ACC VGPR: write 0 to every accumulation register acc[0]..acc[255].
    // The for/end construct is an assembler-level loop over the register index.
	for var vgpr = 0; vgpr < 256; ++vgpr
		v_accvgpr_write acc[vgpr], 0
	end

    //sp3 loop for lifetime
    // Delay loop: s12 counts 0, 1, ... while s12 < s8 (signed compare), so the
    // body never runs when s8 <= 0. Each iteration only copies the counter to v4.
    // Presumably this exists to keep the wave resident longer - TODO confirm.
    s_mov_b32 s12, 0                //init loop idx s12
label_0001:
    s_cmp_lt_i32 s12, s8            //scc = (s12 < s8) ? 1 : 0
    s_cbranch_scc0  label_0006      //if(scc == 0) then jump to label_0006; else nop
    v_mov_b32 v4,s12
    s_add_i32 s12, s12, 1           //add loop incr
    s_branch label_0001
label_0006: //end of SP3 loop

    //fetch the buffer resource through SQC
    // s[24:27] = 4 dwords at [s[0:1] + 0x0]; wait until the load has returned.
    s_load_dwordx4 s[24:27], s[0:1], 0x0
    s_waitcnt 0

    // s[40:43] = 4 dwords at [s[0:1] + 0x20].
    // NOTE(review): s[40:43] is not referenced again in this shader.
    s_load_dwordx4 s[40:43], s[0:1], 0x20
    s_waitcnt 0

    // v3  thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
    // v_mad_u32_u24 multiplies 24-bit operands, then adds the third operand.
    v_mad_u32_u24 v3, v1, s2, v0 
    v_mad_u32_u24 v3, v2, s3, v3

    //s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
    s_mul_i32 s28, s_tgid_y, s5
    s_add_i32 s28, s28, s_tgid_x
    s_mul_i32 s29, s6, s_tgid_z
    s_add_i32 s28, s29, s28

    //v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group
    // NOTE(review): v3, v9 and s28 are not read again below, and the VGPR clear
    // that follows appears to overwrite v3 and v9. Confirm these values are
    // intentionally unused.
    v_mov_b32 v9, s28
    v_mad_u32_u24 v9, v9, s4, v3

    // Clear VGPR and LDS
    // VGPR clear: with GPR indexing enabled, the eight v_mov_b32 below write
    // v[idx+0]..v[idx+7]. idx starts at 0xf8 and drops by 8 per pass, so the
    // passes cover v248..v255 down to v0..v7 (32 passes, 256 registers).
    // Mode 0x8 is taken to select destination-relative indexing - TODO confirm
    // against the ISA manual.
    s_movk_i32    m0, 0x0000
    s_mov_b32     s12, 0x000000f8
    s_set_gpr_idx_on  s12, 0x8
label_0004:
    v_mov_b32     v0, 0
    v_mov_b32     v1, 0
    v_mov_b32     v2, 0
    v_mov_b32     v3, 0
    v_mov_b32     v4, 0
    v_mov_b32     v5, 0
    v_mov_b32     v6, 0
    v_mov_b32     v7, 0
    // s_sub_u32 sets SCC on unsigned borrow, i.e. only when s12 was already 0.
    // The branch below tests that SCC, so this relies on s_set_gpr_idx_idx
    // leaving SCC untouched.
    s_sub_u32     s12, s12, 8
    s_set_gpr_idx_idx  s12
    s_cbranch_scc0  label_0004
    s_set_gpr_idx_off

    // LDS clear: build a per-lane byte address in v1, then store zeros.
    // v1 = count from the mbcnt pair (the lane index when all lanes are active).
    // NOTE(review): exec_hi is passed to the _lo instruction and exec_lo to the
    // _hi instruction, which looks swapped relative to the usual pairing. The
    // result is the same when every lane is active; confirm this is intended.
    v_mbcnt_lo_u32_b32  v1, exec_hi, 0
    v_mbcnt_hi_u32_b32  v1, exec_lo, v1
    v_mul_u32_u24  v1, 8, v1                        // v1 *= 8 (one 8-byte slot per lane)
    s_getreg_b32  s13, hwreg(HW_REG_HW_ID, 4, 2)    // s13 = HW_ID bits [5:4] (the SIMD field used further down)
    s_mulk_i32    s13, 0x4000                       // s13 *= 0x4000: per-SIMD byte stride
    v_add_co_u32  v1, vcc, v1, s13                  // v1 += SIMD base
    s_mov_b32     s12, 7                            // loop counter: 8 passes (7..0, exit on borrow)
    s_mov_b32     m0, -1                            // presumably lifts the M0-based LDS limit - TODO confirm
label_001B:
    // v[2:3] and v[4:5] are zero after the VGPR clear above. The ds_write2
    // offsets are in 8-byte units, so one pass stores at v1 + 0, 0x200, 0x400
    // and 0x600. v1 then advances by 0x800; 8 passes span 0x4000 bytes, which
    // matches the per-SIMD stride.
    ds_write2_b64  v1, v[2:3], v[2:3] offset1:64
    ds_write2_b64  v1, v[4:5], v[4:5] offset0:128 offset1:192
    v_add_co_u32  v1, vcc, 0x00000800, v1
    s_sub_u32     s12, s12, 1                       // SCC = borrow, set only when s12 was 0
    s_cbranch_scc0  label_001B

    // Save coverage in the memory
    // Read the full HW_ID register and extract the SIMD, CU and SE fields.
    s_getreg_b32 s20, hwreg(HW_REG_HW_ID, 0, 32)
    // s12 = SIMD  (bits [5:4])
    s_lshr_b32 s12,s20,4
    s_and_b32 s12, s12, 0x3
    // s13 = CU    (bits [11:8])
    s_lshr_b32 s13,s20,8
    s_and_b32 s13, s13, 0xf
    // s14 = SE    (bits [15:13])
    s_lshr_b32 s14,s20,13
    s_and_b32 s14, s14, 0x7
    // s15 = SE * 16 * 4 + CU * 4 + SIMD
    s_mul_i32 s16, s14, 64
    s_mul_i32 s17, s13, 4
    s_add_i32 s15, s16, s17
    s_add_i32 s15, s15, s12
    s_mul_i32 s16, s15, 4           // s16 = byte offset of this SIMD's dword slot

    // Store the index s15 into its own slot (offset s16) and wait for completion.
    // NOTE(review): the descriptor was loaded into s[24:27] but is referenced
    // here as s24; confirm the assembler treats that as the 4-SGPR resource.
    s_buffer_store_dword s15, s24, s16 glc
    s_waitcnt 0

    // Read the same slot back into s17. The value is not used afterwards;
    // presumably this forces the store to be visible before the wave ends -
    // TODO confirm.
    s_buffer_load_dword s17, s24, s16 glc
    s_waitcnt 0
s_endpgm
end
