shader main
type(CS)

// Purpose: for s8 iterations, store the remaining iteration count as one dword
// through the buffer resource in s[20:23], advancing the offset in m0 by
// PAGE_STRIDE_BYTES after every store. When DEBUG_FUNCTION is non-zero the same
// offsets are read back afterwards. A loop count of zero performs no accesses.

user_sgpr_count(9)   // 2 for the buffer resource pointer + 7 for thread/thread group parameters

// User SGPR layout:
// s[0:1] memory address of the buffer resource descriptors
// s2     x
// s3     x*y
// s4     x*y*z
// s5     X
// s6     X*Y
// s7     output offset (not read by any instruction in this shader)
// s8     loop count

tgid_x_en(1)       //s_tgid_x s9
tgid_y_en(1)       //s_tgid_y s10
tgid_z_en(1)       //s_tgid_z s11

// v0 for tid_x
// v1 for tid_y
// v2 for tid_z

// Fetch two 4-dword buffer resource descriptors through SQC:
// offset 0 -> s[24:27], offset 16 -> s[20:23].
s_load_dwordx4 s[24:27], s[0:1], 0x0
s_load_dwordx4 s[20:23], s[0:1], 16   // load atc mem surface rsrc
s_waitcnt 0                           // both descriptors must have landed before use

// v3 thread_id_in_group = (tid_z*x*y) + (tid_y*x) + tid_x
// (v_mad_u32_u24 is a 24-bit multiply-add, so the factors are expected to fit in 24 bits.)
v_mad_u32_u24 v3, v1, s2, v0
v_mad_u32_u24 v3, v2, s3, v3

// s28 thread_group_id = (tgid_z*X*Y) + (tgid_y*X) + tgid_x
s_mul_i32 s28, s_tgid_y, s5
s_add_i32 s28, s28, s_tgid_x
s_mul_i32 s29, s6, s_tgid_z
s_add_i32 s28, s29, s28

// v9 absolute thread id = thread_group_id*(x*y*z) + thread_id_in_group
// NOTE(review): v3, s28/s29 and v9 are not read by any later instruction in
// this shader, so every thread accesses the same offsets. Kept as in the
// original; confirm whether this is leftover template code.
v_mov_b32 v9, s28
v_mad_u32_u24 v9, v9, s4, v3


var MTYPE_UC = 0x38000000          // bits 27..29 of descriptor dword 3
var PAGE_STRIDE_BYTES = 4*1024     // step one 4KB page table address per iteration

// NOTE(review): this modifies s27, i.e. dword 3 of the descriptor held in
// s[24:27], while all stores/loads below go through s[20:23]. Confirm that
// s[24:27] is really the descriptor that was meant to be patched.
s_or_b32 s27, s27, MTYPE_UC

s_mov_b32 s30, s8                  // keep the original loop count for the optional read-back pass
s_mov_b32 m0, 0x0                  // byte offset into the buffer, starts at 0

// The loops below test the counter only after decrementing it, so entering
// them with a count of 0 would wrap to 0xFFFFFFFF. Skip all accesses instead.
s_cmpk_eq_u32 s8, 0
s_cbranch_scc1 SHADER_DONE

STORE_LOOP:
s_buffer_store_dword s8, s[20:23], m0 glc:1   // value written = iterations remaining
s_waitcnt 0                                   // wait for the store before issuing the next one
s_add_u32 m0, m0, PAGE_STRIDE_BYTES
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0                           // SCC = 1 once the counter reaches zero
s_cbranch_scc0 STORE_LOOP

var DEBUG_FUNCTION = 0
// The read-back check is compiled out by default to halve the shader run time.
if DEBUG_FUNCTION
s_mov_b32 s8, s30                  // restore loop count (non-zero here because of the guard above)
s_mov_b32 m0, 0x0

LOAD_LOOP:
s_buffer_load_dword s0, s[20:23], m0 glc:1    // result lands in s0 and is not compared against anything
s_waitcnt 0
s_add_u32 m0, m0, PAGE_STRIDE_BYTES
s_sub_u32 s8, s8, 1
s_cmpk_eq_u32 s8, 0
s_cbranch_scc0 LOAD_LOOP
end

SHADER_DONE:
s_endpgm
end

