#include "gfx_metrics.xml"

<gfx8_expr>
  <metric name="TA_BUSY_avr" expr=avr(TA_TA_BUSY,16) descr="TA block is busy. Average over TA instances."></metric>
  <metric name="TA_BUSY_max" expr=max(TA_TA_BUSY,16) descr="TA block is busy. Max over TA instances."></metric>
  <metric name="TA_BUSY_min" expr=min(TA_TA_BUSY,16) descr="TA block is busy. Min over TA instances."></metric>
  <metric name="TA_FLAT_READ_WAVEFRONTS_sum" expr=sum(TA_FLAT_READ_WAVEFRONTS,16) descr="Number of flat opcode reads processed by the TA. Sum over TA instances."></metric>
  <metric name="TA_FLAT_WRITE_WAVEFRONTS_sum" expr=sum(TA_FLAT_WRITE_WAVEFRONTS,16) descr="Number of flat opcode writes processed by the TA. Sum over TA instances."></metric>

  <metric name="TCC_HIT_sum" expr=sum(TCC_HIT,16) descr="Number of cache hits. Sum over TCC instances."></metric>
  <metric name="TCC_MISS_sum" expr=sum(TCC_MISS,16) descr="Number of cache misses. Sum over TCC instances."></metric>
  <metric name="TCC_MC_RDREQ_sum" expr=sum(TCC_MC_RDREQ,16) descr="Number of 32-byte reads. Sum over TCC instaces."></metric>
  <metric name="TCC_MC_WRREQ_sum" expr=sum(TCC_MC_WRREQ,16) descr="Number of 32-byte transactions going over the TC_MC_wrreq interface. Sum over TCC instaces."></metric>
  <metric name="TCC_WRREQ_STALL_max" expr=max(TCC_MC_WRREQ_STALL,16) descr="Number of cycles a write request was stalled. Max over TCC instances."></metric>

  <metric name="FETCH_SIZE" expr=(TCC_MC_RDREQ_sum*32)/1024 descr="The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."></metric>
  <metric name="WRITE_SIZE" expr=(TCC_MC_WRREQ_sum*32)/1024 descr="The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."></metric>
  <metric name="WRITE_REQ_32B" expr=TCC_MC_WRREQ_sum descr="The total number of 32-byte effective memory writes."></metric>
</gfx8_expr>

<gfx9_expr>
  <metric name="TA_BUSY_avr" expr=avr(TA_TA_BUSY,16) descr="TA block is busy. Average over TA instances."></metric>
  <metric name="TA_BUSY_max" expr=max(TA_TA_BUSY,16) descr="TA block is busy. Max over TA instances."></metric>
  <metric name="TA_BUSY_min" expr=min(TA_TA_BUSY,16) descr="TA block is busy. Min over TA instances."></metric>
  <metric name="TA_FLAT_READ_WAVEFRONTS_sum" expr=sum(TA_FLAT_READ_WAVEFRONTS,16) descr="Number of flat opcode reads processed by the TA. Sum over TA instances."></metric>
  <metric name="TA_FLAT_WRITE_WAVEFRONTS_sum" expr=sum(TA_FLAT_WRITE_WAVEFRONTS,16) descr="Number of flat opcode writes processed by the TA. Sum over TA instances."></metric>

  <metric name="TCC_HIT_sum" expr=sum(TCC_HIT,16) descr="Number of cache hits. Sum over TCC instances."></metric>
  <metric name="TCC_MISS_sum" expr=sum(TCC_MISS,16) descr="Number of cache misses. Sum over TCC instances."></metric>
  <metric name="TCC_EA_RDREQ_32B_sum" expr=sum(TCC_EA_RDREQ_32B,16) descr="Number of 32-byte TCC/EA read requests. Sum over TCC instances."></metric>
  <metric name="TCC_EA_RDREQ_sum" expr=sum(TCC_EA_RDREQ,16) descr="Number of TCC/EA read requests (either 32-byte or 64-byte). Sum over TCC instances."></metric>
  <metric name="TCC_EA_WRREQ_sum" expr=sum(TCC_EA_WRREQ,16) descr="Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Sum over TCC instances."></metric>
  <metric name="TCC_EA_WRREQ_64B_sum" expr=sum(TCC_EA_WRREQ_64B,16) descr="Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. Sum over TCC instances."></metric>
  <metric name="TCC_WRREQ_STALL_max" expr=max(TCC_EA_WRREQ_STALL,16) descr="Number of cycles a write request was stalled. Max over TCC instances."></metric>

  <metric name="FETCH_SIZE" expr=(TCC_EA_RDREQ_32B_sum*32+(TCC_EA_RDREQ_sum-TCC_EA_RDREQ_32B_sum)*64)/1024 descr="The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."></metric>
  <metric name="WRITE_SIZE" expr=((TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum)*32+TCC_EA_WRREQ_64B_sum*64)/1024 descr="The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."></metric>
  <metric name="WRITE_REQ_32B" expr=TCC_EA_WRREQ_64B_sum*2+(TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum) descr="The total number of 32-byte effective memory writes."></metric>
</gfx9_expr>

<gfx906_expr base="gfx9_expr">
  # EA1
  <metric name="TCC_EA1_RDREQ_32B_sum" expr=sum(TCC_EA1_RDREQ_32B,16) descr="Number of 32-byte TCC/EA read requests. Sum over TCC EA1s."></metric>
  <metric name="TCC_EA1_RDREQ_sum" expr=sum(TCC_EA1_RDREQ,16) descr="Number of TCC/EA read requests (either 32-byte or 64-byte). Sum over TCC EA1s."></metric>
  <metric name="TCC_EA1_WRREQ_sum" expr=sum(TCC_EA1_WRREQ,16) descr="Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Sum over TCC EA1s."></metric>
  <metric name="TCC_EA1_WRREQ_64B_sum" expr=sum(TCC_EA1_WRREQ_64B,16) descr="Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. Sum over TCC EA1s."></metric>
  <metric name="TCC_WRREQ1_STALL_max" expr=max(TCC_EA1_WRREQ_STALL,16) descr="Number of cycles a write request was stalled. Max over TCC instances."></metric>

  <metric name="RDATA1_SIZE" expr=(TCC_EA1_RDREQ_32B_sum*32+(TCC_EA1_RDREQ_sum-TCC_EA1_RDREQ_32B_sum)*64) descr="The total kilobytes fetched from the video memory. This is measured on EA1s."></metric>
  <metric name="WDATA1_SIZE" expr=((TCC_EA1_WRREQ_sum-TCC_EA1_WRREQ_64B_sum)*32+TCC_EA1_WRREQ_64B_sum*64) descr="The total kilobytes written to the video memory. This is measured on EA1s."></metric>

  # both EA0 and EA1 should be included
  <metric name="FETCH_SIZE" expr=(TCC_EA_RDREQ_32B_sum*32+(TCC_EA_RDREQ_sum-TCC_EA_RDREQ_32B_sum)*64+RDATA1_SIZE)/1024 descr="The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."></metric>
  <metric name="WRITE_SIZE" expr=((TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum)*32+TCC_EA_WRREQ_64B_sum*64+WDATA1_SIZE)/1024 descr="The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."></metric>
  <metric name="WRITE_REQ_32B" expr=(TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum)+(TCC_EA1_WRREQ_sum-TCC_EA1_WRREQ_64B_sum)+(TCC_EA_WRREQ_64B_sum+TCC_EA1_WRREQ_64B_sum)*2 descr="The total number of 32-byte effective memory writes."></metric>
</gfx906_expr>

<gfx908_expr base="gfx9_expr">
  <metric name="TCC_HIT_sum" expr=sum(TCC_HIT,32) descr="Number of cache hits. Sum over TCC instances."></metric>
  <metric name="TCC_MISS_sum" expr=sum(TCC_MISS,32) descr="Number of cache misses. Sum over TCC instances."></metric>
  <metric name="TCC_EA_RDREQ_32B_sum" expr=sum(TCC_EA_RDREQ_32B,32) descr="Number of 32-byte TCC/EA read requests. Sum over TCC instances."></metric>
  <metric name="TCC_EA_RDREQ_sum" expr=sum(TCC_EA_RDREQ,32) descr="Number of TCC/EA read requests (either 32-byte or 64-byte). Sum over TCC instances."></metric>
  <metric name="TCC_EA_WRREQ_sum" expr=sum(TCC_EA_WRREQ,32) descr="Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Sum over TCC instances."></metric>
  <metric name="TCC_EA_WRREQ_64B_sum" expr=sum(TCC_EA_WRREQ_64B,32) descr="Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. Sum over TCC instances."></metric>
  <metric name="TCC_WRREQ_STALL_max" expr=max(TCC_EA_WRREQ_STALL,32) descr="Number of cycles a write request was stalled. Max over TCC instances."></metric>
</gfx908_expr>

<gfx8 base="gfx8_expr"></gfx8>
<gfx9 base="gfx9_expr"></gfx9>
# VG20
<gfx906 base="gfx906_expr"></gfx906>
# MI100
<gfx908 base="gfx908_expr"></gfx908>

<global>
  # GPUBusy         The percentage of time GPU was busy.
  <metric
    name="GPUBusy"
    descr="The percentage of time GPU was busy."
    expr=100*GRBM_GUI_ACTIVE/GRBM_COUNT
  ></metric>

  # Wavefronts      Total wavefronts.
  <metric
    name="Wavefronts"
    descr="Total wavefronts."
    expr=SQ_WAVES
  ></metric>

  # VALUInsts       The average number of vector ALU instructions executed per work-item (affected by flow control).
  <metric
    name="VALUInsts"
    descr="The average number of vector ALU instructions executed per work-item (affected by flow control)."
    expr=SQ_INSTS_VALU/SQ_WAVES
  ></metric>

  # SALUInsts       The average number of scalar ALU instructions executed per work-item (affected by flow control).
  <metric
    name="SALUInsts"
    descr="The average number of scalar ALU instructions executed per work-item (affected by flow control)."
    expr=SQ_INSTS_SALU/SQ_WAVES
  ></metric>

  # VFetchInsts     The average number of vector fetch instructions from the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that fetch from video memory.
  <metric
    name="VFetchInsts"
    descr="The average number of vector fetch instructions from the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that fetch from video memory."
    expr=(SQ_INSTS_VMEM_RD-TA_FLAT_READ_WAVEFRONTS_sum)/SQ_WAVES
  ></metric>

  # SFetchInsts     The average number of scalar fetch instructions from the video memory executed per work-item (affected by flow control).
  <metric
    name="SFetchInsts"
    descr="The average number of scalar fetch instructions from the video memory executed per work-item (affected by flow control)."
    expr=SQ_INSTS_SMEM/SQ_WAVES
  ></metric>

  # VWriteInsts     The average number of vector write instructions to the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that write to video memory.
  <metric
    name="VWriteInsts"
    descr="The average number of vector write instructions to the video memory executed per work-item (affected by flow control). Excludes FLAT instructions that write to video memory."
    expr=(SQ_INSTS_VMEM_WR-TA_FLAT_WRITE_WAVEFRONTS_sum)/SQ_WAVES
  ></metric>

  # FlatVMemInsts   The average number of FLAT instructions that read from or write to the video memory executed per work item (affected by flow control). Includes FLAT instructions that read from or write to scratch.
  <metric
    name="FlatVMemInsts"
    descr="The average number of FLAT instructions that read from or write to the video memory executed per work item (affected by flow control). Includes FLAT instructions that read from or write to scratch."
    expr=(SQ_INSTS_FLAT-SQ_INSTS_FLAT_LDS_ONLY)/SQ_WAVES
  ></metric>

  # LDSInsts        The average number of LDS read or LDS write instructions executed per work item (affected by flow control).  Excludes FLAT instructions that read from or write to LDS.
  <metric
    name="LDSInsts"
    descr="The average number of LDS read or LDS write instructions executed per work item (affected by flow control).  Excludes FLAT instructions that read from or write to LDS."
    expr=(SQ_INSTS_LDS-SQ_INSTS_FLAT_LDS_ONLY)/SQ_WAVES
  ></metric>

  # FlatLDSInsts    The average number of FLAT instructions that read or write to LDS executed per work item (affected by flow control).
  <metric
    name="FlatLDSInsts"
    descr="The average number of FLAT instructions that read or write to LDS executed per work item (affected by flow control)."
    expr=SQ_INSTS_FLAT_LDS_ONLY/SQ_WAVES
  ></metric>

  # GDSInsts        The average number of GDS read or GDS write instructions executed per work item (affected by flow control).
  <metric
    name="GDSInsts"
    descr="The average number of GDS read or GDS write instructions executed per work item (affected by flow control)."
    expr=SQ_INSTS_GDS/SQ_WAVES
  ></metric>

  # VALUUtilization The percentage of active vector ALU threads in a wave. A lower number can mean either more thread divergence in a wave or that the work-group size is not a multiple of 64. Value range: 0% (bad), 100% (ideal - no thread divergence).
  <metric
    name="VALUUtilization"
    descr="The percentage of active vector ALU threads in a wave. A lower number can mean either more thread divergence in a wave or that the work-group size is not a multiple of 64. Value range: 0% (bad), 100% (ideal - no thread divergence)."
    expr=100*SQ_THREAD_CYCLES_VALU/(SQ_ACTIVE_INST_VALU*MAX_WAVE_SIZE)
  ></metric>

  # VALUBusy        The percentage of GPUTime vector ALU instructions are processed. Value range: 0% (bad) to 100% (optimal).
  <metric
    name="VALUBusy"
    descr="The percentage of GPUTime vector ALU instructions are processed. Value range: 0% (bad) to 100% (optimal)."
    expr=100*SQ_ACTIVE_INST_VALU*4/SIMD_NUM/GRBM_GUI_ACTIVE
  ></metric>

  # SALUBusy        The percentage of GPUTime scalar ALU instructions are processed. Value range: 0% (bad) to 100% (optimal).
  <metric
    name="SALUBusy"
    descr="The percentage of GPUTime scalar ALU instructions are processed. Value range: 0% (bad) to 100% (optimal)."
    expr=100*SQ_INST_CYCLES_SALU*4/SIMD_NUM/GRBM_GUI_ACTIVE
  ></metric>

  # FetchSize       The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account.
  <metric
    name="FetchSize"
    descr="The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."
    expr=FETCH_SIZE
  ></metric>

  # WriteSize       The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account.
  <metric
    name="WriteSize"
    descr="The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."
    expr=WRITE_SIZE
  ></metric>

  # MemWrites32B    The total number of effective 32B write transactions to the memory
  <metric
    name="MemWrites32B"
    descr="The total number of effective 32B write transactions to the memory"
    expr=WRITE_REQ_32B
  ></metric>

  # L2CacheHit      The percentage of fetch, write, atomic, and other instructions that hit the data in L2 cache. Value range: 0% (no hit) to 100% (optimal).
  <metric
    name="L2CacheHit"
    descr="The percentage of fetch, write, atomic, and other instructions that hit the data in L2 cache. Value range: 0% (no hit) to 100% (optimal)."
    expr=100*sum(TCC_HIT,16)/(sum(TCC_HIT,16)+sum(TCC_MISS,16))
  ></metric>

  # MemUnitBusy     The percentage of GPUTime the memory unit is active. The result includes the stall time (MemUnitStalled). This is measured with all extra fetches and writes and any cache or memory effects taken into account. Value range: 0% to 100% (fetch-bound).
  <metric
    name="MemUnitBusy"
    descr="The percentage of GPUTime the memory unit is active. The result includes the stall time (MemUnitStalled). This is measured with all extra fetches and writes and any cache or memory effects taken into account. Value range: 0% to 100% (fetch-bound)."
    expr=100*max(TA_TA_BUSY,16)/GRBM_GUI_ACTIVE/SE_NUM
  ></metric>

  # MemUnitStalled  The percentage of GPUTime the memory unit is stalled. Try reducing the number or size of fetches and writes if possible. Value range: 0% (optimal) to 100% (bad).
  <metric
    name="MemUnitStalled"
    descr="The percentage of GPUTime the memory unit is stalled. Try reducing the number or size of fetches and writes if possible. Value range: 0% (optimal) to 100% (bad)."
    expr=100*max(TCP_TCP_TA_DATA_STALL_CYCLES,16)/GRBM_GUI_ACTIVE/SE_NUM
  ></metric>

  # WriteUnitStalled The percentage of GPUTime the Write unit is stalled. Value range: 0% to 100% (bad).
  <metric
    name="WriteUnitStalled"
    descr="The percentage of GPUTime the Write unit is stalled. Value range: 0% to 100% (bad)."
    expr=100*TCC_WRREQ_STALL_max/GRBM_GUI_ACTIVE
  ></metric>

  # ALUStalledByLDS The percentage of GPUTime ALU units are stalled by the LDS input queue being full or the output queue being not ready. If there are LDS bank conflicts, reduce them. Otherwise, try reducing the number of LDS accesses if possible. Value range: 0% (optimal) to 100% (bad).
  <metric
    name="ALUStalledByLDS"
    descr="The percentage of GPUTime ALU units are stalled by the LDS input queue being full or the output queue being not ready. If there are LDS bank conflicts, reduce them. Otherwise, try reducing the number of LDS accesses if possible. Value range: 0% (optimal) to 100% (bad)."
    expr=100*SQ_WAIT_INST_LDS*4/SQ_WAVES/GRBM_GUI_ACTIVE
  ></metric>

  # LDSBankConflict The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad).
  <metric
    name="LDSBankConflict"
    descr="The percentage of GPUTime LDS is stalled by bank conflicts. Value range: 0% (optimal) to 100% (bad)."
    expr=100*SQ_LDS_BANK_CONFLICT/GRBM_GUI_ACTIVE/CU_NUM
  ></metric>

</global>
