shader main
type(CS)

// Purpose: each thread loads one dword from a buffer, writes it to the
// data share (ds_write_b32) s8 times, reads it back (ds_read_b32) s8 times,
// and finally a marker dword is stored to memory at [s[0:1] + 0x40].

user_sgpr_count(9)   // 9 user SGPRs: s[0:1] (2) + s2..s6 (5) + s7, s8 (2) -- see the list below

// User SGPR layout:
//s[0:1] memory address from which the buffer resource is loaded
//s2 x      (multiplied with tid_y below; presumably the thread-group size in x)
//s3 x*y    (multiplied with tid_z below)
//s4 x*y*z  (multiplied with the thread group id below; presumably threads per group)
//s5 X      (multiplied with tgid_y below; presumably the number of groups in x)
//s6 X*Y    (multiplied with tgid_z below)
//s7 output offset (not referenced anywhere in this shader)
//s8 loop   (iteration count used by both STORE_LOOP and LOAD_LOOP)

tgid_x_en(1)       //s_tgid_x s9
tgid_y_en(1)       //s_tgid_y s10
tgid_z_en(1)       //s_tgid_z s11

// Input VGPRs:
//v0 for tid_x
//v1 for tid_y
//v2 for tid_z

// Fetch the 4-dword buffer resource from the address in s[0:1] into s[24:27]
// (scalar load, through SQC), then wait for it before s24 is used.
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_waitcnt 0

// v3  thread_id_in_group = (tid_z *x*y) + (tid_y*x) + tid_x
// NOTE(review): v_mad_u32_u24 presumably uses only the low 24 bits of each
// multiplicand -- confirm the dimensions fit.
v_mad_u32_u24 v3, v1, s2, v0 
v_mad_u32_u24 v3, v2, s3, v3

//s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
// s29 is a scratch register for the tgid_z term.
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28

//v9 absolute thread id = thread_group_id *(x*y*z) + thread_id_in_group 
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3

// Read one dword from the buffer, indexed by the absolute thread id
// (idxen:1, index in v9), with a zero scalar offset in s31. The result
// overwrites v0; tid_x is no longer needed because v3 is already computed.
s_mov_b32 s31, 0x0
buffer_load_dword v0, v9, s24, s31 idxen:1 glc:1 
s_waitcnt 0

// Store and load s8 times.
// s30 keeps a copy of the loop count so it can be restored for LOAD_LOOP.
// v10 = thread_id_in_group * 4: the starting byte offset, one dword per thread.
// NOTE(review): m0 = 0xFFFF is presumably the bounds limit the ds_* instructions
// require, and the s_nops presumably pad between the m0 write and the first
// ds_* instruction -- confirm against the target ISA.
s_mov_b32 s30, s8
v_lshlrev_b32 v10, 2, v3
s_mov_b32 m0, 0xFFFF
s_nop 0x1
s_nop 0x1
s_nop 0x1

// Each iteration: write v0 at offset v10, wait, advance v10 by 0x10 bytes,
// decrement s8, and branch back while s8 != 0.
// With a start offset of tid*4 and a step of 16 bytes, the offsets touched by
// different threads overlap across iterations.
// NOTE(review): s8 is decremented before it is tested, so s8 == 0 on entry
// would not exit after zero iterations -- confirm the host always passes s8 >= 1.
// NOTE(review): v_add_u16 is a 16-bit add on the offset -- confirm that is
// intended for the offset range used.
STORE_LOOP:
ds_write_b32 v10, v0
s_waitcnt 0
v_add_u16 v10, v10, 0x10
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 STORE_LOOP

// Restore the loop count and the per-thread starting offset for the read pass.
s_mov_b32 s8, s30
v_lshlrev_b32 v10, 2, v3
  
// Each iteration: read a dword at offset v10 into v11, wait, copy it to v12,
// then advance and count down exactly as in STORE_LOOP. v12 is not read again
// in this shader; the copy only consumes the loaded value.
LOAD_LOOP:
ds_read_b32 v11, v10
s_waitcnt 0
v_mov_b32 v12, v11
v_add_u16 v10, v10, 0x10
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 LOAD_LOOP

// Store the constant 0xa5a50000 at offset 0x40 from the address in s[0:1]
// (presumably a completion marker for the host -- confirm).
// NOTE(review): nothing waits for or flushes this scalar store before
// s_endpgm -- confirm whether the target needs s_waitcnt / s_dcache_wb here.
s_mov_b32 s16, 0xa5a50000
s_store_dword s16, s[0:1], 0x40 glc

s_endpgm
end 

