
/******************************************/
/* Begin Kernel                           */
/******************************************/
.amdgcn_target "amdgcn-amd-amdhsa--gfx942"
.text
.protected Custom_Cijk_Alik_Bljk_F8NBS_BH_BiasSB_AS_SABV_SAV_UserArgs_MT256x256x128_MI16x16x1_SN_K1_MIWT4_16_DTVA
.globl Custom_Cijk_Alik_Bljk_F8NBS_BH_BiasSB_AS_SABV_SAV_UserArgs_MT256x256x128_MI16x16x1_SN_K1_MIWT4_16_DTVA
.p2align 8
.type Custom_Cijk_Alik_Bljk_F8NBS_BH_BiasSB_AS_SABV_SAV_UserArgs_MT256x256x128_MI16x16x1_SN_K1_MIWT4_16_DTVA,@function
.section .rodata,#alloc
.p2align 6
.amdhsa_kernel Custom_Cijk_Alik_Bljk_F8NBS_BH_BiasSB_AS_SABV_SAV_UserArgs_MT256x256x128_MI16x16x1_SN_K1_MIWT4_16_DTVA
  .amdhsa_user_sgpr_kernarg_segment_ptr 1
  .amdhsa_accum_offset 256 // accvgpr offset
  .amdhsa_next_free_vgpr 512 // vgprs
  .amdhsa_next_free_sgpr 84 // sgprs
  .amdhsa_group_segment_fixed_size 33280 // lds bytes
  .amdhsa_private_segment_fixed_size 0
  .amdhsa_system_sgpr_workgroup_id_x 1
  .amdhsa_system_sgpr_workgroup_id_y 1
  .amdhsa_system_sgpr_workgroup_id_z 1
  .amdhsa_system_vgpr_workitem_id 0
  .amdhsa_float_denorm_mode_32 3
  .amdhsa_float_denorm_mode_16_64 3
  .amdhsa_user_sgpr_count 13
  .amdhsa_user_sgpr_kernarg_preload_length 11
  .amdhsa_user_sgpr_kernarg_preload_offset 0
.end_amdhsa_kernel
.text
/* Num VGPR   =256 */
/* Num AccVGPR=256 */
/* Num SGPR   =84 */

/******************************************/
/* Optimizations and Config:              */
/******************************************/
/* ThreadTile= 16 x 16 */
/* SubGroup= 16 x 16 */
/* VectorWidthA=4 */
/* VectorWidthB=16 */
/* GlobalReadVectorWidthA=16, GlobalReadVectorWidthB=16 */
/* DirectToLdsA=False */
/* DirectToLdsB=False */
/* UseSgprForGRO=1 */
.amdgpu_metadata
---
custom.config:
   ProblemType:
      OperationType: GEMM
      UseScaleAB: "Vector"
      DataType: f8n
      DestDataType: b
      ComputeDataType: s
      HighPrecisionAccumulate: True
      TransposeA: True
      TransposeB: False
      UseBias: 1
      BiasDataTypeList: [S,B]
      Activation: True
      UseScaleAlphaVec: 1
      UseBeta: True
      Batched: True
      GroupedGemm: False
      SupportUserArgs: True
   MatrixInstruction: [16, 16, 32, 1, 1, 4, 16, 4, 1]
   WavefrontSize: 64
   1LDSBuffer: 1
   ScheduleIterAlg: 3
   DepthU: 128
   GlobalReadVectorWidthA: 16
   GlobalReadVectorWidthB: 16
   AssertFree0ElementMultiple: 1
   AssertFree1ElementMultiple: 1
   AssertSummationElementMultiple: 1
   NoReject: True
   InternalSupportParams:
      KernArgsVersion: 2
      SupportUserGSU: True
      SupportCustomWGM: True
      SupportCustomStaggerU: True
      UseUniversalArgs: True
amdhsa.version:
  - 1
  - 1
amdhsa.kernels:
  - .name: Custom_Cijk_Alik_Bljk_F8NBS_BH_BiasSB_AS_SABV_SAV_UserArgs_MT256x256x128_MI16x16x1_SN_K1_MIWT4_16_DTVA
    .symbol: 'Custom_Cijk_Alik_Bljk_F8NBS_BH_BiasSB_AS_SABV_SAV_UserArgs_MT256x256x128_MI16x16x1_SN_K1_MIWT4_16_DTVA.kd'
    .language:                   OpenCL C
    .language_version:
      - 2
      - 0
    .args:
      - .name:            Gemm info
        .size:            4
        .offset:          0
        .value_kind:      by_value
        .value_type:      u32
      - .name:            kernel info0
        .size:            4
        .offset:          4
        .value_kind:      by_value
        .value_type:      u32
      - .name:            kernel info1
        .size:            4
        .offset:          8
        .value_kind:      by_value
        .value_type:      u32
      - .name:            numWG
        .size:            4
        .offset:          12
        .value_kind:      by_value
        .value_type:      u32
      - .name:            SizesFree0
        .size:            4
        .offset:          16
        .value_kind:      by_value
        .value_type:      u32
      - .name:            SizesFree1
        .size:            4
        .offset:          20
        .value_kind:      by_value
        .value_type:      u32
      - .name:            SizesFree2
        .size:            4
        .offset:          24
        .value_kind:      by_value
        .value_type:      u32
      - .name:            SizesSum0
        .size:            4
        .offset:          28
        .value_kind:      by_value
        .value_type:      u32
      - .name:            D
        .size:            8
        .offset:          32
        .value_kind:      global_buffer
        .value_type:      bf16
        .address_space:   generic
      - .name:            C
        .size:            8
        .offset:          40
        .value_kind:      global_buffer
        .value_type:      bf16
        .address_space:   generic
      - .name:            A
        .size:            8
        .offset:          48
        .value_kind:      global_buffer
        .value_type:      fp8
        .address_space:   generic
      - .name:            B
        .size:            8
        .offset:          56
        .value_kind:      global_buffer
        .value_type:      fp8
        .address_space:   generic
      - .name:            strideD0
        .size:            4
        .offset:          64
        .value_kind:      by_value
        .value_type:      u32
      - .name:            strideD1
        .size:            4
        .offset:          68
        .value_kind:      by_value
        .value_type:      u32
      - .name:            strideC0
        .size:            4
        .offset:          72
        .value_kind:      by_value
        .value_type:      u32
      - .name:            strideC1
        .size:            4
        .offset:          76
        .value_kind:      by_value
        .value_type:      u32
      - .name:            strideA0
        .size:            4
        .offset:          80
        .value_kind:      by_value
        .value_type:      u32
      - .name:            strideA1
        .size:            4
        .offset:          84
        .value_kind:      by_value
        .value_type:      u32
      - .name:            strideB0
        .size:            4
        .offset:          88
        .value_kind:      by_value
        .value_type:      u32
      - .name:            strideB1
        .size:            4
        .offset:          92
        .value_kind:      by_value
        .value_type:      u32
      - .name:            alpha
        .size:            4
        .offset:          96
        .value_kind:      by_value
        .value_type:      f32
      - .name:            beta
        .size:            4
        .offset:          100
        .value_kind:      by_value
        .value_type:      f32
      - .name:            AddressScaleA
        .size:            8
        .offset:          104
        .value_kind:      global_buffer
        .value_type:      f32
        .address_space:   generic
      - .name:            AddressScaleB
        .size:            8
        .offset:          112
        .value_kind:      global_buffer
        .value_type:      f32
        .address_space:   generic
      - .name:            AddressScaleAlphaVec
        .size:            8
        .offset:          120
        .value_kind:      global_buffer
        .value_type:      f32
        .address_space:   generic
      - .name:            bias
        .size:            8
        .offset:          128
        .value_kind:      global_buffer
        .value_type:      void
        .address_space:   generic
      - .name:            biasType
        .size:            4
        .offset:          136
        .value_kind:      by_value
        .value_type:      u32
      - .name:            StrideBias
        .size:            4
        .offset:          140
        .value_kind:      by_value
        .value_type:      u32
      - .name:            activationAlpha
        .size:            4
        .offset:          144
        .value_kind:      by_value
        .value_type:      f32
      - .name:            activationBeta
        .size:            4
        .offset:          148
        .value_kind:      by_value
        .value_type:      f32
      - .name:            activationType
        .size:            4
        .offset:          152
        .value_kind:      by_value
        .value_type:      u32
    .group_segment_fixed_size:   33280
    .kernarg_segment_align:      8
    .kernarg_segment_size:       160
    .max_flat_workgroup_size:    256
    .private_segment_fixed_size: 0
    .sgpr_count:                 84
    .sgpr_spill_count:           0
    .vgpr_count:                 256
    .vgpr_spill_count:           0
    .wavefront_size:             64
...
.end_amdgpu_metadata
Custom_Cijk_Alik_Bljk_F8NBS_BH_BiasSB_AS_SABV_SAV_UserArgs_MT256x256x128_MI16x16x1_SN_K1_MIWT4_16_DTVA:
label_ASM_Start:  /// Main body of the asm kernel

/* Magic div and mod functions */
/*
 * V_MAGIC_DIV: division by a precomputed magic triple.
 *   v[dstIdx+0] = (mul_hi(dividend, magicNumber) + mul_lo(dividend, magicA)) >> magicShift
 * dstIdx      : index of the first of two consecutive VGPRs. v[dstIdx+0] receives
 *               the result; v[dstIdx+1] is overwritten as scratch.
 * dividend, magicNumber, magicShift, magicA : operands substituted verbatim into
 *               the instructions below.
 * The add is a plain 32-bit v_add_u32, so any carry out of the sum is dropped.
 */
.macro V_MAGIC_DIV dstIdx:req dividend:req magicNumber:req magicShift:req magicA:req
    v_mul_hi_u32 v[\dstIdx+1] \dividend \magicNumber // scratch = high 32 bits of dividend * magicNumber
    v_mul_lo_u32 v[\dstIdx+0] \dividend \magicA // dst = low 32 bits of dividend * magicA
    v_add_u32 v[\dstIdx+0] v[\dstIdx+0] v[\dstIdx+1] // dst = dst + scratch
    v_lshrrev_b32 v[\dstIdx+0] \magicShift v[\dstIdx+0] // dst = dst >> magicShift
.endm

/******************************************/
/* VGPR Assignments                       */
/******************************************/
/* ValuC range: [0-0), serializedStore enabled */
.set vgprValuC, 0
/* ValuA/B   Xn=PLR buffer idx,  In=InnerUnroll idx */
.set vgprValuA_X0_I0_0, 0
.set vgprValuA_X2_I0_0, 16
.set vgprValuB_X0_I0, 32
.set vgprValuB_X2_I0, 96
.set vgprLocalWriteAddrA, 160
.set vgprLocalWriteAddrB, 161
.set vgprGlobalReadOffsetA, 162
.set vgprGlobalReadOffsetB, 163
.set vgprG2LB, 164
.set vgprValuA_X0_I0_1, 196
.set vgprValuA_X2_I0_1, 212
.set vgprLocalReadAddrA, 228
.set vgprLocalReadAddrB, 229
.set vgprSerial, 230


/******************************************/
/* SGPR Assignments                       */
/******************************************/
.set sgprKernArgAddress, 0
.set sgprWorkGroup0, 2
.set sgprWorkGroup1, 3
.set sgprWorkGroup2, 4
.set sgprArgType, 5
.set sgprGSUSumIdx, 6
.set sgprGSULog2BpeC, 8
.set sgprGSULog2BpeD, 9
.set sgprStaggerU, 10
.set sgprWGM, 11
.set sgprLoopCounterL, 12
.set sgprOrigLoopCounter, 13
.set sgprSrdD, 16
.set sgprSrdC, 20
.set sgprNumWorkGroups0, 14
.set sgprNumWorkGroups1, 15
.set sgprSizesFree, 24
.set sgprSizesSum, 27
.set sgprAddressD, 28
.set sgprAddressC, 30
.set sgprAddressA, 32
.set sgprAddressB, 34
.set sgprStridesD, 36
.set sgprStridesC, 38
.set sgprStridesA, 40
.set sgprStridesB, 42
.set sgprAlpha, 44
.set sgprBeta, 45
.set sgprGSU, 46

/* Size Assignments */
.set sgprSizeI, sgprSizesFree+0
.set sgprSizeJ, sgprSizesFree+1
.set sgprSizeK, sgprSizesFree+2
.set sgprSizeL, sgprSizesSum+0

/* Stride Assignments */
.set constStrideD0I, 1
.set sgprStrideD1J, sgprStridesD+0
.set sgprStrideDK, sgprStridesD+1
.set constStrideC0I, 1
.set sgprStrideC1J, sgprStridesC+0
.set sgprStrideCK, sgprStridesC+1
.set constStrideAL, 1
.set sgprStrideA0I, sgprStridesA+0
.set sgprStrideAK, sgprStridesA+1
.set constStrideBL, 1
.set sgprStrideB1J, sgprStridesB+0
.set sgprStrideBK, sgprStridesB+1

.set MT0, 256
.set MT1, 256
.set DepthU, 128
.set BpeA, 1
.set BpeALog2, 0
.set BpeB, 1
.set BpeBLog2, 0
.set BpeAGR, 1
.set BpeAGRLog2, 0
.set BpeBGR, 1
.set BpeBGRLog2, 0
/* Number of elements to shift-left SRD */
.set SrdShiftLeftA, 16
.set SrdShiftLeftB, 16
/* 2GB limit - set offsets to -1 to exceed this and clamp */
.set BufferLimit, 0xffffffff
.set BufferOOB, 0x80000000

/******************************************/
/* Bits 127:96 of SRD.                    */
/* hex: 0x00020000                        */
/* dst_sel_x (3b): 0                      */
/* dst_sel_y (3b): 0                      */
/* dst_sel_z (3b): 0                      */
/* dst_sel_w (3b): 0                      */
/* num_format (3b): 0                     */
/* data_format (4b): 4                    */
/* user_vm_enable (1b): 0                 */
/* user_vm_mode (1b): 0                   */
/* index_stride (2b): 0                   */
/* add_tid_enable (1b): 0                 */
/* _unusedA (3b): 0                       */
/* nv (1b): 0                             */
/* _unusedB (2b): 0                       */
/* type (2b): 0                           */
/******************************************/
.set Srd127_96, 0x00020000

/* Global Offset A */
/*
 * GLOBAL_OFFSET_A: per-thread global read offset for A, derived from vgprSerial.
 * With tid = v[vgprSerial] and stride = s[sgprStrideA0I] the result is
 *   v[vgprAddr] = 4*stride*(tid % 16)        (row inside the wave, times VectorWidth 4)
 *               + 16*((tid % 64) / 16)       (K offset inside the wave)
 *               + 64*stride*((tid / 64) % 4) (wave offset)
 *               + 0x10                       (prepad for pointer shift)
 * vgprAddr : index of the destination VGPR.
 * vgprTmp  : index of a scratch VGPR, overwritten. It must differ from vgprAddr
 *            because both are live at the same time.
 */
.macro GLOBAL_OFFSET_A vgprAddr:req vgprTmp:req
    v_and_b32 v[\vgprTmp+0], 63, v[vgprSerial]                                // 0. thread id in wave: wtid = tid % wavelength(64)
    v_and_b32 v[\vgprAddr+0], 15, v[\vgprTmp+0]                               // 1. M offset: mIdx = wtid % MI_M(16)
    v_mul_lo_u32 v[\vgprAddr+0], s[sgprStrideA0I], v[\vgprAddr+0]             // 1. M offset: mOffset = mIdx * StrideA0I
    v_lshlrev_b32 v[\vgprAddr+0], 0x2, v[\vgprAddr+0]                         // 4. apply VectorWidth: bnOffset = bnOffset * vw(4)
    v_and_b32 v[\vgprTmp+0], 63, v[vgprSerial]                                // 5. thread id in wave: wtid = tid % wavelength(64)
    v_lshrrev_b32 v[\vgprTmp+0], 4, v[\vgprTmp+0]                             // 5. K offset: kIdx = wtid / (MIN(16) * MIBB(1))
    v_lshlrev_b32 v[\vgprTmp+0], 0x4, v[\vgprTmp+0]                           // 5. K offset: lrKOffset = kIdx * 16
    v_add_u32 v[\vgprAddr+0], v[\vgprTmp+0], v[\vgprAddr+0]                   // 6. offset in wave: lrOffset = bnOffset + lrKOffset
    v_lshrrev_b32 v[\vgprTmp+0], 6, v[vgprSerial]                             // 7. wave offset in M dimen: wtid = tid / dividedForWaveId(64)
    v_and_b32 v[\vgprTmp+0], 3, v[\vgprTmp+0]                                 // 7. wave offset in M dimen: wtid0 = wtid % num1DWaves(4)
    v_mul_lo_u32 v[\vgprTmp+0], s[sgprStrideA0I], v[\vgprTmp+0]               // 7. wave offset in M dimen: wOffset = wtid0 * s[sgprStrideA0I]
    v_lshlrev_b32 v[\vgprTmp+0], 0x6, v[\vgprTmp+0]                           // 7. wave offset in M dimen: wOffset = wOffset * 16 * vw(4)
    v_add_u32 v[\vgprAddr+0], v[\vgprTmp+0], v[\vgprAddr+0]                   // 7. final offset: flrOffset = lrOffset + wOffset
    v_add_u32 v[\vgprAddr+0] 0x10 v[\vgprAddr+0]                              // add prepad (0x10) for pointer shift; no bytes/element scaling is applied here
.endm

/* Global Offset B */
/*
 * GLOBAL_OFFSET_B: per-thread global read offset for B.
 *   v[vgprAddr] = v[vgprOffsetL] + s[sgprStrideB1J] * v[vgprOffset1J] + 0x10
 * vgprAddr     : index of the destination VGPR.
 * vgprOffsetL  : index of the VGPR holding the offset along the summation index.
 * vgprOffset1J : index of the VGPR holding the offset along the free index 1J.
 * vgprTmp      : index of a scratch VGPR, overwritten with the stride product.
 * Also overwrites vcc (carry-out of the v_add_co_u32, which is not consumed).
 */
.macro GLOBAL_OFFSET_B vgprAddr:req vgprOffsetL:req vgprOffset1J:req vgprTmp:req
    v_mul_lo_u32 v[\vgprTmp+0] s[sgprStrideB1J] v[\vgprOffset1J] // mul d1 lower
    v_add_co_u32 v[\vgprAddr+0] vcc v[\vgprOffsetL] v[\vgprTmp+0] // accumulate K lower
    v_add_u32 v[\vgprAddr+0] 0x10 v[\vgprAddr+0]     // add prepad for pointer shift
                                                       // offset *= bytes/element (multiplier is 1 do nothing)
.endm

/*
 * DYNAMIC_VECTOR_DIVIDE: per-lane unsigned 32-bit divide on VGPRs.
 *   v[vQuotient]  = v[vDividend] / v[vDivisor]
 *   v[vRemainder] = v[vDividend] % v[vDivisor]
 * vQuotient, vRemainder : indices of the result VGPRs.
 * vDividend, vDivisor   : indices of the input VGPRs (only read).
 * vTmp0, vTmp1          : indices of scratch VGPRs, overwritten.
 * sTmp                  : index of the first of two consecutive scratch SGPRs
 *                         used as a lane mask, overwritten.
 * Also overwrites vcc. A zero divisor yields quotient -1 (0xffffffff); the
 * remainder is then still computed as dividend - quotient * divisor.
 * Method: estimate 2^32 / divisor through the float reciprocal, apply one
 * refinement step to the estimate, multiply by the dividend, then correct the
 * quotient by +1 or -1 from the trial remainder.
 */
.macro DYNAMIC_VECTOR_DIVIDE vQuotient vRemainder vDividend vDivisor vTmp0 vTmp1 sTmp
    v_cvt_f32_u32 v[\vQuotient] v[\vDivisor] // float(divisor)
    v_rcp_f32 v[\vQuotient] v[\vQuotient] // 1.0 / divisor
    v_mul_f32 v[\vQuotient] 0x4f800000 v[\vQuotient] // scale by 2^32 (0x4f800000 is 4294967296.0f)
    v_cvt_u32_f32 v[\vQuotient] v[\vQuotient] // est = estimate of 2^32 / divisor
    v_mul_lo_u32 v[\vRemainder] v[\vDivisor] v[\vQuotient] // low 32 bits of divisor * est
    v_mul_hi_u32 v[\vTmp0] v[\vDivisor] v[\vQuotient] // high 32 bits of divisor * est
    v_sub_co_u32 v[\vTmp1] vcc 0x0 v[\vRemainder] // negated low half
    v_cmp_ne_i32 s[\sTmp:\sTmp+1] 0x0 v[\vTmp0] // mask = high half is non-zero
    v_cndmask_b32 v[\vRemainder] v[\vTmp1] v[\vRemainder] s[\sTmp:\sTmp+1] // err = mask ? low : -low
    v_mul_hi_u32 v[\vRemainder] v[\vRemainder] v[\vQuotient] // corr = high 32 bits of err * est
    v_sub_co_u32 v[\vTmp0] vcc v[\vQuotient] v[\vRemainder] // est - corr
    v_add_co_u32 v[\vQuotient] vcc v[\vQuotient] v[\vRemainder] // est + corr
    v_cndmask_b32 v[\vQuotient] v[\vQuotient] v[\vTmp0] s[\sTmp:\sTmp+1] // est = mask ? est - corr : est + corr
    v_mul_hi_u32 v[\vQuotient] v[\vQuotient] v[\vDividend] // q = high 32 bits of est * dividend
    v_mul_lo_u32 v[\vRemainder] v[\vQuotient] v[\vDivisor] // q * divisor
    v_sub_co_u32 v[\vTmp0] vcc v[\vDividend] v[\vRemainder] // trial remainder = dividend - q * divisor
    v_cmp_ge_u32 s[\sTmp:\sTmp+1] v[\vDividend] v[\vRemainder] // mask = trial remainder did not underflow
    v_add_co_u32 v[\vRemainder] vcc 0x1 v[\vQuotient] // q + 1
    v_add_co_u32 v[\vTmp1] vcc -1 v[\vQuotient] // q - 1
    v_cmp_le_u32 vcc v[\vDivisor] v[\vTmp0] // trial remainder >= divisor
    s_and_b64 vcc s[\sTmp:\sTmp+1] vcc // ... and it did not underflow
    v_cndmask_b32 v[\vQuotient] v[\vQuotient] v[\vRemainder] vcc // q too small: take q + 1
    v_cndmask_b32 v[\vQuotient] v[\vTmp1] v[\vQuotient] s[\sTmp:\sTmp+1] // q too large (underflow): take q - 1
    v_cmp_ne_i32 vcc 0x0 v[\vDivisor] // divisor != 0
    v_cndmask_b32 v[\vQuotient] -1 v[\vQuotient] vcc // final result
    v_mul_lo_u32 v[\vRemainder] v[\vQuotient] v[\vDivisor] // quotient * divisor
    v_sub_co_u32 v[\vRemainder] vcc v[\vDividend] v[\vRemainder] // final result
.endm

/******************************************/
/* Allocate Resources                     */
/******************************************/

/*
 * Non-preload entry: every argument is fetched from the kernarg segment.
 *   offset 0x0 -> s47        : bits 31:30 = argument type, bits 29:0 = number of gemms
 *   offset 0x4 -> s49        : packed StaggerU / GSU word
 *   offset 0x8 -> sgprWGM
 * Argument type 0 means the remaining arguments follow inline in the kernarg
 * segment; any other type means offset 0x10 holds a 64-bit pointer to them.
 */
/* Load num of Gemms */
s_load_dword s47, s[sgprKernArgAddress:sgprKernArgAddress+1], 0x0

/* Load packed kernel args (StaggerU/GSU) */
s_load_dword s49, s[sgprKernArgAddress:sgprKernArgAddress+1], 0x4

/* Load WGM data */
s_load_dword s[sgprWGM], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x8
s_waitcnt lgkmcnt(0)
s_lshr_b32 s48, s47, 0x1e                          // Get arg type (bits 31:30)
s_and_b32 s47, 0x3fffffff, s47                     // Get nums of gemm (bits 29:0)
s_cmp_eq_u32 s48, 0                                // Is kernel args
s_cbranch_scc0 label_HBMArgs                       // arg type != 0: arguments are behind a pointer
s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], 0x10 // Shift common args
s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0x0

/* Load Kernel Args */
s_load_dwordx16 s[24:39], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x0 // SizesFree, SizesSum, AddressD/C/A/B, StridesD, StridesC
s_load_dwordx4 s[40:43], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x40 // StridesA, StridesB
s_load_dwordx2 s[44:45], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x50 // Alpha, Beta
s_waitcnt lgkmcnt(0)
s_branch label_LoadArgsEnd
label_HBMArgs:

/* Load address of kernel arguments */
s_load_dwordx2 s[sgprKernArgAddress:sgprKernArgAddress+1], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x10 // replace the kernarg pointer with the pointer stored at offset 0x10
s_waitcnt lgkmcnt(0)                               // wait for args to load
label_LoadArgsEnd:
s_branch label_common_kernel_entry                 // skip the padding and the preload entry below

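/* pad 39 snops to satisfy 0x100 code size for Preload Backward Compatibility Prologue */
/* NOTE(review): the instructions above appear to encode to 0x68 bytes (seven 8-byte s_load, one s_and_b32 carrying a 32-bit literal, ten 4-byte ops). With 39 s_nop that would put label_Preload_Offset_Start at 0x104, so offset 0x100 would be the last s_nop, which simply falls through to the label. Confirm against a disassembly before changing the pad count. */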
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
s_nop 0
/*
 * Preload entry: the first kernarg dwords are taken from s2..s12 instead of
 * being loaded from memory. The uses below mirror the non-preload path:
 *   s2 = gemm info (arg type / gemm count), s3 = packed StaggerU / GSU word,
 *   s4 = WGM word, s6..s12 = the seven dwords starting at kernarg offset 0x10.
 * NOTE(review): this register layout is inferred from those uses together with
 * .amdhsa_user_sgpr_kernarg_preload_length 11 - confirm against the ABI docs.
 */
label_Preload_Offset_Start:
s_and_b32 s47, 0x3fffffff, s2                      // Get nums of gemm
s_lshr_b32 s48, s2, 0x1e                           // Get arg type
s_mov_b32 s49, s3                                  // Preload internal args
s_cmp_eq_u32 s48, 0                                // Is kernel args
s_cbranch_scc0 label_Preload_HBMArgs
s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], 0x10 // Shift common args
s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0x0

/* Load Kernel Args */
/* Only the dwords that were not preloaded are fetched; no s_waitcnt is issued in this block. */
s_load_dword s31, s[sgprKernArgAddress:sgprKernArgAddress+1], 0x1c // upper dword of AddressC (lower dword comes from s12)
s_load_dwordx8 s[32:39], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x20 // AddressA, AddressB, StridesD, StridesC
s_load_dwordx4 s[40:43], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x40 // StridesA, StridesB
s_load_dwordx2 s[44:45], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x50 // Alpha, Beta
s_mov_b32 s24, s6                                  // move preload data to correct sgpr
s_mov_b32 s25, s7                                  // move preload data to correct sgpr
s_mov_b32 s26, s8                                  // move preload data to correct sgpr
s_mov_b32 s27, s9                                  // move preload data to correct sgpr
s_mov_b32 s28, s10                                 // move preload data to correct sgpr
s_mov_b32 s29, s11                                 // move preload data to correct sgpr (s11 is read here before sgprWGM, also s11, is written below)
s_mov_b32 s30, s12                                 // move preload data to correct sgpr
s_branch label_Preload_LoadArgsEnd
label_Preload_HBMArgs:
s_mov_b64 s[sgprKernArgAddress:sgprKernArgAddress+1], s[6:7] // Load address of kernel arguments
label_Preload_LoadArgsEnd:
s_mov_b32 s[sgprWGM], s4                           // Preload internal args2
/*
 * Common entry for both argument paths. On arrival s48 holds the argument
 * type, s49 the packed StaggerU / GSU word and s13..s15 the workgroup ids.
 */
label_common_kernel_entry:  /// for both preload/non-preload common code
s_mov_b32 s[sgprWorkGroup0+0], s13                 // restore workgroup id
s_mov_b32 s[sgprWorkGroup0+1], s14                 // restore workgroup id
s_mov_b32 s[sgprWorkGroup0+2], s15                 // restore workgroup id
s_and_b32 s[sgprStaggerU], s49, 0xffff0000         // Restore StaggerU related vars (upper 16 bits of s49)
s_lshr_b32 s[sgprStaggerU], s[sgprStaggerU], 0x10  // shift them down to bits 15:0
s_and_b32 s[sgprGSU], s49, 0xffff                  // Restore GSUConfig and GSU (lower 16 bits of s49)
s_mov_b32 s[sgprArgType], s48                      // keep the argument type; s48 is reused as scratch later
s_mov_b32 m0, 0x8200                               // LDS clamp at 33280 bytes
v_mov_b32 v[vgprSerial], v0                        // thread serial id
s_cmp_eq_u32 s48, 0
s_cbranch_scc0 label_MultiGemm                     // argument type != 0 takes the grouped-gemm path
/* init: add vgpr [0...160) to pool */
/* init: add vgpr [0...0) to pool */
/* init: add agpr [0...256) to pool */

/******************************************/
/* Local Read Addresses                   */
/******************************************/

/* local read addresses: tile assignments a/b */
/* lr0I: with tid = v[vgprSerial], v0 = 512*(tid % 16) + 16*((tid % 64) / 16) + 8192*((tid / 64) % 4) */
v_and_b32 v1, 63, v[vgprSerial]                    // 0. thread id in wave: wtid = tid % wavelength(64)
v_and_b32 v0, 15, v1                               // 1. M offset: mIdx = wtid % MI_M(16)
v_lshlrev_b32 v0, 0x7, v0                          // 1. M offset: mOffset = mIdx * mStride(128)
/* Skip. 2. block offset: bnOffset = 0 when num1DBlocks = 1 */
v_lshlrev_b32 v0, 0x2, v0                          // 4. apply VectorWidth: bnOffset = bnOffset * vw(4)
v_and_b32 v1, 63, v[vgprSerial]                    // 5. thread id in wave: wtid = tid % wavelength(64)
v_lshrrev_b32 v1, 4, v1                            // 5. K offset: kIdx = wtid / (MIN(16) * MIBB(1))
v_lshlrev_b32 v1, 0x4, v1                          // 5. K offset: lrKOffset = kIdx * 16
v_add_u32 v0, v1, v0                               // 6. offset in wave: lrOffset = bnOffset + lrKOffset
v_lshrrev_b32 v1, 6, v[vgprSerial]                 // 7. wave offset in M dimen: wtid = tid / dividedForWaveId(64)
v_and_b32 v1, 3, v1                                // 7. wave offset in M dimen: wtid0 = wtid % num1DWaves(4)
v_lshlrev_b32 v1, 0xd, v1                          // 7. wave offset in M dimen: wOffset = wtid0 * W0Stride(8192)
v_add_u32 v0, v1, v0                               // 7. final local read offset: flrOffset = lrOffset + WOffset
/* lr1J: v1 = 2048*(tid % 16) + 16*((tid % 64) / 16); no wave offset is added for this dimension */
v_and_b32 v2, 63, v[vgprSerial]                    // 0. thread id in wave: wtid = tid % wavelength(64)
v_and_b32 v1, 15, v2                               // 1. N offset: nIdx = wtid % MI_N(16)
v_lshlrev_b32 v1, 0x7, v1                          // 1. N offset: nOffset = nIdx * nStride(128)
/* Skip. 2. block offset: bnOffset = 0 when num1DBlocks = 1 */
v_lshlrev_b32 v1, 0x4, v1                          // 4. apply VectorWidth: bnOffset = bnOffset * vw(16)
v_and_b32 v2, 63, v[vgprSerial]                    // 5. thread id in wave: wtid = tid % wavelength(64)
v_lshrrev_b32 v2, 4, v2                            // 5. K offset: kIdx = wtid / (MIN(16) * MIBB(1))
v_lshlrev_b32 v2, 0x4, v2                          // 5. K offset: lrKOffset = kIdx * 16
v_add_u32 v1, v2, v1                               // 6. offset in wave: lrOffset = bnOffset + lrKOffset

/* local read addresses: final offsets a */
/* LocalReadAddrA = x + 32*(x / 512) with x = v0 + 128*(tid / 256) */
v_lshrrev_b32 v2, 6, v[vgprSerial]                 // v2 = v[vgprSerial] / 64
v_lshrrev_b32 v2, 2, v2                            // LSU offset: Get LSU wave_id (= tid / 256)
s_mov_b32 s49, 128                                 // LSU offset: stride = lsuStride(128) when umlds==True
v_mul_lo_u32 v2, s49, v2                           // LSU offset: lsuoffset = wave_id * lsuStride(128)
v_add_u32 v[vgprLocalReadAddrA], v2, v0            // Final Offset: offset = (lro0+lsuoffset)*bpeDS(1)
v_lshrrev_b32 v3, 9, v[vgprLocalReadAddrA]         // Final Offset: padding 32 per block 512
v_lshlrev_b32 v3, 0x5, v3                          // Final Offset: padding 32 per block 512
v_add_u32 v[vgprLocalReadAddrA], v3, v[vgprLocalReadAddrA] // Final Offset: add padding 32 per block 512

/* local read addresses: final offsets b */
/* LocalReadAddrB = y + 32*(y / 2048) with y = v1 + 128*(tid / 256) */
v_lshrrev_b32 v0, 6, v[vgprSerial]                 // v0 = v[vgprSerial] / 64
v_lshrrev_b32 v0, 2, v0                            // LSU offset: Get LSU wave_id (= tid / 256)
s_mov_b32 s49, 128                                 // LSU offset: stride = lsuStride(128) when umlds==True
v_mul_lo_u32 v0, s49, v0                           // LSU offset: lsuoffset = wave_id * lsuStride(128)
v_add_u32 v[vgprLocalReadAddrB], v0, v1            // Final Offset: offset = (lro1+lsuoffset)*bpeDS(1)
v_lshrrev_b32 v2, 11, v[vgprLocalReadAddrB]        // Final Offset: padding 32 per block 2048
v_lshlrev_b32 v2, 0x5, v2                          // Final Offset: padding 32 per block 2048
v_add_u32 v[vgprLocalReadAddrB], v2, v[vgprLocalReadAddrB] // Final Offset: add padding 32 per block 2048

/* local read addresses: declare addresses a */
/* N/A */

/* local read addresses: declare addresses b */

/******************************************/
/* Local Write Addresses                  */
/******************************************/
/*
 * Local write addresses. With tid = v[vgprSerial]:
 *   row    = tid / 8            (v0 for A, v2 for B)
 *   unroll = (tid % 8) * 16     (v4 for A, v5 for B)
 *   LocalWriteAddrA = x + 32*(x / 512),  x = 128*row + unroll
 *   LocalWriteAddrB = y + 32*(y / 2048), y = 128*row + unroll
 * v1, v3 and v6 are scratch.
 */
/* LVCA = 8 */
/* v1 = A-unroll = serial%LVCA */
v_lshrrev_b32 v0, 3, v[vgprSerial]                 // v0 = v[vgprSerial] / 8
v_and_b32 v1, 7, v[vgprSerial]                     // v1 = v[vgprSerial] % 8
/* unroll *= glvw */
v_lshlrev_b32 v1, 0x4, v1                          // v1 = v1 * 16
v_mov_b32 v4, v1                                   // copy for GlobalSplitU
/* LVCB = 8 */
/* v3 = B-unroll = serial%LVCB */
v_lshrrev_b32 v2, 3, v[vgprSerial]                 // v2 = v[vgprSerial] / 8
v_and_b32 v3, 7, v[vgprSerial]                     // v3 = v[vgprSerial] % 8
/* unroll *= glvw */
v_lshlrev_b32 v3, 0x4, v3                          // v3 = v3 * 16
v_mov_b32 v5, v3                                   // copy for GlobalSplitU
/* lwaUnrollAssignmentA = v4 */
/* lwaUnrollAssignmentB = v5 */

/* local write addresses: first offset a */
v_mul_u32_u24 v[vgprLocalWriteAddrA], 0x80, v0     // lwAL*(DepthU_Compute + PAD): row * 128
v_add_u32 v[vgprLocalWriteAddrA], v4, v[vgprLocalWriteAddrA] // lwFOA = (lwAA + lwAL*(DepthU+PAD))*bpeDS(1)
v_lshrrev_b32 v6, 9, v[vgprLocalWriteAddrA]        // padding 32 per block 512
v_lshlrev_b32 v6, 0x5, v6                          // padding 32 per block 512
v_add_u32 v[vgprLocalWriteAddrA], v6, v[vgprLocalWriteAddrA] // add padding 32 per block 512

/* local write addresses: first offset b */
v_mul_u32_u24 v[vgprLocalWriteAddrB], 0x80, v2     // lwBL*(DepthU_Compute + PAD): row * 128
v_add_u32 v[vgprLocalWriteAddrB], v5, v[vgprLocalWriteAddrB] // lwFOB = (lwBB + lwBL*(DepthU+PAD))*bpeDS(1)
v_lshrrev_b32 v6, 11, v[vgprLocalWriteAddrB]       // padding 32 per block 2048
v_lshlrev_b32 v6, 0x5, v6                          // padding 32 per block 2048
v_add_u32 v[vgprLocalWriteAddrB], v6, v[vgprLocalWriteAddrB] // add padding 32 per block 2048
/*
 * Number of macro tiles along each free dimension:
 *   NumWorkGroups0 = ceil(SizeI / MT0)
 *   NumWorkGroups1 = ceil(SizeJ / MT1)
 * MT0 and MT1 are both 256 in this kernel, so the ceiling division is done
 * exactly in scalar integer arithmetic: shift right by 8 for the quotient, then
 * add one when any of the low 8 bits is set. s_and_b32 sets SCC when its result
 * is non-zero and s_addc_u32 adds SCC; this is the same idiom the grouped-gemm
 * tile count (label_Loop_GemmCount) uses.
 * The previous float-reciprocal sequence rounded sizes above 2^24 when
 * converting them to f32 and could then report one tile too many.
 * The constants 8 and 255 must be kept in step with MT0 / MT1.
 * s48 is scratch here: its argument-type value is already saved in
 * s[sgprArgType], and s48 is rewritten before it is next read.
 */
s_lshr_b32 s[sgprNumWorkGroups0], s[sgprSizesFree+0], 8 // SizeI / 256
s_and_b32 s48, 255, s[sgprSizesFree+0]             // SizeI % 256; SCC = (remainder != 0)
s_addc_u32 s[sgprNumWorkGroups0], s[sgprNumWorkGroups0], 0x0 // round up when there is a partial tile
s_lshr_b32 s[sgprNumWorkGroups1], s[sgprSizesFree+1], 8 // SizeJ / 256
s_and_b32 s48, 255, s[sgprSizesFree+1]             // SizeJ % 256; SCC = (remainder != 0)
s_addc_u32 s[sgprNumWorkGroups1], s[sgprNumWorkGroups1], 0x0 // round up when there is a partial tile
s_waitcnt lgkmcnt(0)                               // wait for 44/0 bytes of kern args

/* remap wg from 1D(idxWG012) to 3D(wg2,wg1,wg0) */
/*
 * Both divisions below use the same pattern: estimate the quotient as
 * u32(float(dividend) * rcp(float(divisor))), form dividend - estimate*divisor,
 * and add one to the estimate in the lanes where that remainder equals the
 * divisor. v_cmpx narrows exec for the conditional add; exec is then restored
 * to all lanes before the quotient is read back with v_readfirstlane.
 */
/* wg2 = idxWG012 / (numWG0 * numWG1 * GSU) */
s_mul_i32 s48, s[sgprNumWorkGroups0], s[sgprNumWorkGroups1] // s48 = numWG0 * numWG1
s_and_b32 s49, s[sgprGSU], 0x3fff                  // Restore GSU
s_mul_i32 s48, s48, s49                            // s48 = numWG0 * numWG1 * GSU (divisor)
v_cvt_f32_u32 v6, s48                              // v6 = float(divisor)
v_rcp_iflag_f32 v6, v6                             // v6 = 1.0 / divisor
v_cvt_f32_u32 v7, s[sgprWorkGroup0]                // v7 = float(idxWG012)
v_mul_f32 v6, v6, v7                               // v6 = idxWG012 / divisor (float)
v_cvt_u32_f32 v6, v6                               // v6 = quotient estimate
v_mul_u32_u24 v7, v6, s48                          // v7 = estimate * divisor (24-bit multiply)
v_sub_u32 v7, s[sgprWorkGroup0], v7                // v7 = idxWG012 - estimate * divisor
v_cmpx_eq_u32 exec, v7, s48                        // keep only lanes whose remainder equals the divisor
v_add_u32 v6, 1, v6                                // estimate was one too small in those lanes
s_mov_b64 exec, -1                                 // re-enable all lanes
v_readfirstlane_b32 s48, v6                        // quotient
s_mov_b32 s[sgprWorkGroup2], s48
/* idxWG01 = idxWG012 - wg2 * numWG0 * numWG1 * GSU */
s_mul_i32 s48, s[sgprNumWorkGroups1], s[sgprNumWorkGroups0]
s_mul_i32 s48, s48, s[sgprWorkGroup2]
s_mul_i32 s48, s48, s49                            // s49 still holds GSU
s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s48
/* wg1 = idxWG01 / numWG0 */
v_cvt_f32_u32 v6, s[sgprNumWorkGroups0]            // v6 = float(numWG0)
v_rcp_iflag_f32 v6, v6                             // v6 = 1.0 / numWG0
v_cvt_f32_u32 v7, s[sgprWorkGroup0]                // v7 = float(idxWG01)
v_mul_f32 v6, v6, v7                               // v6 = idxWG01 / numWG0 (float)
v_cvt_u32_f32 v6, v6                               // v6 = quotient estimate
v_mul_u32_u24 v7, v6, s[sgprNumWorkGroups0]        // v7 = estimate * numWG0 (24-bit multiply)
v_sub_u32 v7, s[sgprWorkGroup0], v7                // v7 = idxWG01 - estimate * numWG0
v_cmpx_eq_u32 exec, v7, s[sgprNumWorkGroups0]      // keep only lanes whose remainder equals numWG0
v_add_u32 v6, 1, v6                                // estimate was one too small in those lanes
s_mov_b64 exec, -1                                 // re-enable all lanes
v_readfirstlane_b32 s48, v6                        // quotient
s_mov_b32 s[sgprWorkGroup1], s48
/* wg0 = idxWG01 - wg1 * numWG0 */
s_mul_i32 s48, s[sgprWorkGroup1], s[sgprNumWorkGroups0]
s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s48
s_branch label_MultiGemmEnd
/*
 * Grouped-gemm path (argument type != 0). Selects the per-gemm argument layout:
 *   s15     = byte stride between consecutive gemm argument records
 *   s54     = byte offset of the first record from the base address
 *   s[48:49]= base address (copy of the kernarg address)
 * For argument type 2 the records are 196 bytes and start at offset 0;
 * otherwise they are 140 bytes and start after 4 bytes per gemm.
 */
label_MultiGemm:

/* Check if custom structure pointer is null */
s_cmp_eq_u32 s[sgprArgType], 2                     // ArgType == 2 ?
s_cbranch_scc1 label_IsExternalValid               // branch if ArgType == 2
s_mov_b32 s15, 140                                 // record stride in bytes
s_mul_i32 s54, s47, 4                              // first record offset = 4 * number of gemms
s_mov_b64 s[48:49], s[sgprKernArgAddress:sgprKernArgAddress+1]
s_branch label_IsExternalValidEnd
label_IsExternalValid:
s_mov_b32 s15, 196                                 // record stride in bytes
s_mov_b32 s54, 0x0                                 // records start at the base address
s_mov_b64 s[48:49], s[sgprKernArgAddress:sgprKernArgAddress+1]
label_IsExternalValidEnd:

/* Grouped Gemm:: prefetch 1 arg load */
/*
 * Walks the gemm records to find the one that owns this workgroup.
 *   s14 = 1-based index of the record currently held in s[24:27]
 *   s55 = running total of tiles over the records processed so far
 *   s52 = tile count of the current record:
 *         ceil(s24/256) * ceil(s25/256) * s26 * GSU
 * The load of the next record is issued before looping back, so the final
 * record is handled by the copy of the tile computation after the loop.
 */
s_mov_b32 s14, 1
s_mov_b32 s55, 0
s_load_dwordx4 s[24:27], s[48:49], s54             // first four dwords of record 0
s_cmpk_eq_u32 s47, 1                               // if gemm_count is 1?
s_cbranch_scc1 label_wgTable_noLoadLoop

/* Grouped Gemm:: accumulate numTiles for each gemm */
/* Grouped Gemm:: loop start */
label_Loop_GemmCount:
s_waitcnt lgkmcnt(0)
s_lshr_b32 s52, s24, 8                             // s52 = s24 / 256
s_and_b32 s50, 255, s24                            // s50 = s24 % 256
s_addc_u32 s52, s52, 0x0                           // round up: s_and_b32 left SCC = (remainder != 0)
s_lshr_b32 s53, s25, 8                             // s53 = s25 / 256
s_and_b32 s50, 255, s25                            // s50 = s25 % 256
s_addc_u32 s53, s53, 0x0                           // round up: s_and_b32 left SCC = (remainder != 0)
s_mul_i32 s52, s52, s53
s_mul_i32 s52, s52, s26
s_and_b32 s53, s[sgprGSU], 0x3fff                  // Restore GSU
s_mul_i32 s52, s52, s53
s_add_u32 s55, s55, s52                            // running tile total including this record
s_cmp_lt_u32 s[sgprWorkGroup0], s55
s_cbranch_scc1 label_FOUND                         // workgroup id falls inside this record
s_add_u32 s54, s54, s15                            // advance to the next record
s_load_dwordx4 s[24:27], s[48:49], s54
s_add_u32 s14, s14, 1
s_cmp_lt_u32 s14, s47
s_cbranch_scc1 label_Loop_GemmCount

/* Grouped Gemm:: noLoadLoop */
label_wgTable_noLoadLoop:
s_waitcnt lgkmcnt(0)
s_lshr_b32 s52, s24, 8                             // s52 = s24 / 256
s_and_b32 s50, 255, s24                            // s50 = s24 % 256
s_addc_u32 s52, s52, 0x0                           // round up: s_and_b32 left SCC = (remainder != 0)
s_lshr_b32 s53, s25, 8                             // s53 = s25 / 256
s_and_b32 s50, 255, s25                            // s50 = s25 % 256
s_addc_u32 s53, s53, 0x0                           // round up: s_and_b32 left SCC = (remainder != 0)
s_mul_i32 s52, s52, s53
s_mul_i32 s52, s52, s26
s_and_b32 s48, s[sgprGSU], 0x3fff                  // Restore GSU (s48 is reused as scratch; no load from s[48:49] follows in this block)
s_mul_i32 s52, s52, s48
s_add_u32 s55, s55, s52
/* Grouped Gemm:: gemmIndex found */
/*
 * On entry s14 is the 1-based index of the owning record, s55 the tile total
 * up to and including it, and s52 its own tile count. The workgroup id is made
 * relative to the record, the kernarg address is advanced to that record, and
 * the remaining arguments are loaded. s[24:27] already hold its first four
 * dwords. No s_waitcnt is issued in this block for the loads it starts.
 */
label_FOUND:
s_sub_u32 s49, s14, 1                              // s49 = 0-based record index
s_sub_u32 s48, s55, s52                            // s48 = tiles owned by earlier records
s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s48 // workgroup id relative to this record
/* Check if custom structure pointer is null */
s_cmp_eq_u32 s[sgprArgType], 2                     // ArgType == 2 ?
s_cbranch_scc1 label_LoadExternalStruct            // branch if ArgType == 2

/* Grouped Gemm: offset argument address to gemm */
/* Grouped Gemm: offset address from wg_table_start to args_start */
s_lshl2_add_u32 s[sgprKernArgAddress], s47, s[sgprKernArgAddress] // address += 4 * number of gemms
s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0x0
/* Grouped Gemm: offset address from args_start to gemm_start */
s_mul_i32 s49, s49, 140                            // record index * 140-byte record stride
s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], s49
s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0x0

/* Load Kernel Args */
s_load_dwordx16 s[28:43], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x10 // AddressD/C/A/B, StridesD/C/A/B
s_load_dwordx2 s[44:45], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x50 // Alpha, Beta
s_branch label_LoadExternalStructEnd
label_LoadExternalStruct:
/* Grouped Gemm: offset address from args_start to gemm_start */
s_mul_i32 s49, s49, 196                            // record index * 196-byte record stride
s_add_u32 s[sgprKernArgAddress], s[sgprKernArgAddress], s49
s_addc_u32 s[sgprKernArgAddress+1], s[sgprKernArgAddress+1], 0x0
s_load_dwordx16 s[28:43], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x10 // AddressD/C/A/B, StridesD/C/A/B
s_load_dword s44, s[sgprKernArgAddress:sgprKernArgAddress+1], 0x50 // Alpha
// Read Beta
s_load_dword s45, s[sgprKernArgAddress:sgprKernArgAddress+1], 0x60 // Beta is at 0x60 in this layout, not directly after Alpha
label_LoadExternalStructEnd:
/* init: add vgpr [0...160) to pool */
/* init: add vgpr [0...0) to pool */
/* init: add agpr [0...256) to pool */

/******************************************/
/* Local Read Addresses                   */
/******************************************/

/* local read addresses: tile assignments a/b */
/* lr0I */
/* Result: v0 = (tid%16)*512 + ((tid%64)/16)*16 + ((tid/64)%4)*8192, with tid = v[vgprSerial]. */
/* v1 is scratch. v0 is consumed by "final offsets a" below.                                    */
v_and_b32 v1, 63, v[vgprSerial]                    // 0. thread id in wave: wtid = tid % wavelength(64)
v_and_b32 v0, 15, v1                               // 1. N offset: nIdx = wtid % MI_N(16)
v_lshlrev_b32 v0, 0x7, v0                          // 1. N offset: nOffset = nIdx * nStride(128)
/* Skip. 2. block offset: bnOffset = 0 when num1DBlocks = 1 */
v_lshlrev_b32 v0, 0x2, v0                          // 4. apply VectorWidth: bnOffset = bnOffset * vw(4)
v_and_b32 v1, 63, v[vgprSerial]                    // 5. thread id in wave: wtid = tid % wavelength(64) (same value v1 already holds)
v_lshrrev_b32 v1, 4, v1                            // 5. K offset: kIdx = wtid / (MIN(16) * MIBB(1))
v_lshlrev_b32 v1, 0x4, v1                          // 5. K offset: lrKOffset = kIdx * mStride(16)
v_add_u32 v0, v1, v0                               // 6. offset in wave: lrOffset = bnOffset + lrKOffset
v_lshrrev_b32 v1, 6, v[vgprSerial]                 // 7. wave id: wid = tid / dividedForWaveId(64)
v_and_b32 v1, 3, v1                                // 7. wave offset in M dimen: wtid0 = wid % num1DWaves(4)
v_lshlrev_b32 v1, 0xd, v1                          // 7. wave offset in M dimen: wOffset = wtid0 * W0Stride(8192)
v_add_u32 v0, v1, v0                               // 7. final local read offset: flrOffset = lrOffset + WOffset
/* lr1J */
/* Result: v1 = (tid%16)*2048 + ((tid%64)/16)*16, with tid = v[vgprSerial]. */
/* v2 is scratch. No wave offset is added on this side. v1 is consumed by   */
/* "final offsets b" below.                                                 */
v_and_b32 v2, 63, v[vgprSerial]                    // 0. thread id in wave: wtid = tid % wavelength(64)
v_and_b32 v1, 15, v2                               // 1. N offset: nIdx = wtid % MI_N(16)
v_lshlrev_b32 v1, 0x7, v1                          // 1. N offset: nOffset = nIdx * nStride(128)
/* Skip. 2. block offset: bnOffset = 0 when num1DBlocks = 1 */
v_lshlrev_b32 v1, 0x4, v1                          // 4. apply VectorWidth: bnOffset = bnOffset * vw(16)
v_and_b32 v2, 63, v[vgprSerial]                    // 5. thread id in wave: wtid = tid % wavelength(64) (same value v2 already holds)
v_lshrrev_b32 v2, 4, v2                            // 5. K offset: kIdx = wtid / (MIN(16) * MIBB(1))
v_lshlrev_b32 v2, 0x4, v2                          // 5. K offset: lrKOffset = kIdx * mStride(16)
v_add_u32 v1, v2, v1                               // 6. offset in wave: lrOffset = bnOffset + lrKOffset

/* local read addresses: final offsets a */
/* LocalReadAddrA = v0 + (tid/256)*128, then +32 for every complete 512 bytes  */
/* of that offset. Reads v0 from the lr0I computation; clobbers v2, v3, s49.   */
v_lshrrev_b32 v2, 6, v[vgprSerial]                 // v2 = v[vgprSerial] / 64
v_lshrrev_b32 v2, 2, v2                            // LSU offset: Get LSU wave_id (v2 = tid / 256)
s_mov_b32 s49, 128                                 // LSU offset: stride = lsuStride(128) when umlds==True
v_mul_lo_u32 v2, s49, v2                           // LSU offset: lsuoffset = wave_id * lsuStride(128)
v_add_u32 v[vgprLocalReadAddrA], v2, v0            // Final Offset: offset = (lro0+lsuoffset)*bpeDS(1)
v_lshrrev_b32 v3, 9, v[vgprLocalReadAddrA]         // Final Offset: v3 = offset / 512 (number of complete 512-byte blocks)
v_lshlrev_b32 v3, 0x5, v3                          // Final Offset: v3 *= 32 (padding bytes per block)
v_add_u32 v[vgprLocalReadAddrA], v3, v[vgprLocalReadAddrA] // Final Offset: add padding 32 per block 512

/* local read addresses: final offsets b */
/* LocalReadAddrB = v1 + (tid/256)*128, then +32 for every complete 2048 bytes */
/* of that offset. Reads v1 from the lr1J computation; clobbers v0, v2, s49.   */
v_lshrrev_b32 v0, 6, v[vgprSerial]                 // v0 = v[vgprSerial] / 64
v_lshrrev_b32 v0, 2, v0                            // LSU offset: Get LSU wave_id (v0 = tid / 256)
s_mov_b32 s49, 128                                 // LSU offset: stride = lsuStride(128) when umlds==True
v_mul_lo_u32 v0, s49, v0                           // LSU offset: lsuoffset = wave_id * lsuStride(128)
v_add_u32 v[vgprLocalReadAddrB], v0, v1            // Final Offset: offset = (lro1+lsuoffset)*bpeDS(1)
v_lshrrev_b32 v2, 11, v[vgprLocalReadAddrB]        // Final Offset: v2 = offset / 2048 (number of complete 2048-byte blocks)
v_lshlrev_b32 v2, 0x5, v2                          // Final Offset: v2 *= 32 (padding bytes per block)
v_add_u32 v[vgprLocalReadAddrB], v2, v[vgprLocalReadAddrB] // Final Offset: add padding 32 per block 2048

/* local read addresses: declare addresses a */
/* N/A */

/* local read addresses: declare addresses b */

/******************************************/
/* Local Write Addresses                  */
/******************************************/
/* Splits the thread id into a row index (tid / 8) and a 16-wide unroll       */
/* offset ((tid % 8) * 16), once for A (v0, v1, copy in v4) and once for B    */
/* (v2, v3, copy in v5). Both sides compute the same values.                  */
/* LVCA = 8 */
/* v1 = A-unroll = serial%LVCA */
v_lshrrev_b32 v0, 3, v[vgprSerial]                 // v0 = v[vgprSerial] / 8
v_and_b32 v1, 7, v[vgprSerial]                     // v1 = v[vgprSerial] % 8
/* unroll *= glvw */
v_lshlrev_b32 v1, 0x4, v1                          // v1 = v1 * 16
v_mov_b32 v4, v1                                   // copy for GlobalSplitU
/* LVCB = 8 */
/* v3 = B-unroll = serial%LVCB */
v_lshrrev_b32 v2, 3, v[vgprSerial]                 // v2 = v[vgprSerial] / 8
v_and_b32 v3, 7, v[vgprSerial]                     // v3 = v[vgprSerial] % 8
/* unroll *= glvw */
v_lshlrev_b32 v3, 0x4, v3                          // v3 = v3 * 16
v_mov_b32 v5, v3                                   // copy for GlobalSplitU
/* lwaUnrollAssignmentA = v4 */
/* lwaUnrollAssignmentB = v5 */

/* local write addresses: first offset a */
/* LocalWriteAddrA = v0*128 + v4, then +32 for every complete 512 bytes of that offset. */
/* Reads v0 (row index) and v4 (unroll offset) set up above; clobbers v6.               */
v_mul_u32_u24 v[vgprLocalWriteAddrA], 0x80, v0     // lwAL * 128 (24-bit multiply)
v_add_u32 v[vgprLocalWriteAddrA], v4, v[vgprLocalWriteAddrA] // lwFOA = (lwAA + lwAL*(DepthU+PAD))*bpeDS(1)
v_lshrrev_b32 v6, 9, v[vgprLocalWriteAddrA]        // v6 = offset / 512 (number of complete 512-byte blocks)
v_lshlrev_b32 v6, 0x5, v6                          // v6 *= 32 (padding bytes per block)
v_add_u32 v[vgprLocalWriteAddrA], v6, v[vgprLocalWriteAddrA] // add padding 32 per block 512

/* local write addresses: first offset b */
/* LocalWriteAddrB = v2*128 + v5, then +32 for every complete 2048 bytes of that offset. */
/* Reads v2 (row index) and v5 (unroll offset) set up above; clobbers v6.                */
v_mul_u32_u24 v[vgprLocalWriteAddrB], 0x80, v2     // lwBL * 128 (24-bit multiply)
v_add_u32 v[vgprLocalWriteAddrB], v5, v[vgprLocalWriteAddrB] // lwFOB = (lwBB + lwBL*(DepthU+PAD))*bpeDS(1)
v_lshrrev_b32 v6, 11, v[vgprLocalWriteAddrB]       // v6 = offset / 2048 (number of complete 2048-byte blocks)
v_lshlrev_b32 v6, 0x5, v6                          // v6 *= 32 (padding bytes per block)
v_add_u32 v[vgprLocalWriteAddrB], v6, v[vgprLocalWriteAddrB] // add padding 32 per block 2048
/* NumWorkGroups0 = ceil(SizesFree+0 / MT0), NumWorkGroups1 = ceil(SizesFree+1 / MT1).   */
/* Each quotient is estimated as u32(float(size) * rcp(float(MT))) and bumped by one     */
/* when size - estimate*MT is non-zero. MT0/MT1 are symbols defined outside this chunk.  */
/* Clobbers v6-v9 and vcc. The second sequence starts overwriting v7/v8 before the first */
/* result is read out of v6, so the instruction order here matters.                      */
v_mov_b32 v8, MT0                                  // v8 = MT0 (divisor, held in a vgpr)
v_mov_b32 v7, s[sgprSizesFree+0]                   // v7 = Free0 size (dividend)
v_cvt_f32_u32 v6, v8                               // v6 = float(MT0)
v_rcp_iflag_f32 v6, v6                             // v6 = 1.0 / MT0
v_cvt_f32_u32 v9, v7                               // v9 = float(size)
v_mul_f32 v6, v6, v9                               // v6 = size * (1.0 / MT0)
v_cvt_u32_f32 v6, v6                               // v6 = quotient estimate as u32
v_mul_u32_u24 v9, v6, v8                           // v9 = estimate * MT0 (24-bit multiply)
v_sub_u32 v9, v7, v9                               // v9 = size - estimate * MT0
v_cmp_ne_u32 vcc, v9, 0                            // vcc = (remainder != 0)
v_addc_co_u32 v6, vcc, v6, 0, vcc                  // ceil: v6 += 1 where the remainder is non-zero
v_mov_b32 v8, MT1                                  // v8 = MT1 (divisor for the second division)
v_mov_b32 v7, s[sgprSizesFree+1]                   // v7 = Free1 size (dividend)
v_readfirstlane_b32 s[sgprNumWorkGroups0], v6      // set back to numWorkGroup0 (v6 still holds the first result)
v_cvt_f32_u32 v6, v8                               // v6 = float(MT1)
v_rcp_iflag_f32 v6, v6                             // v6 = 1.0 / MT1
v_cvt_f32_u32 v9, v7                               // v9 = float(size)
v_mul_f32 v6, v6, v9                               // v6 = size * (1.0 / MT1)
v_cvt_u32_f32 v6, v6                               // v6 = quotient estimate as u32
v_mul_u32_u24 v9, v6, v8                           // v9 = estimate * MT1 (24-bit multiply)
v_sub_u32 v9, v7, v9                               // v9 = size - estimate * MT1
v_cmp_ne_u32 vcc, v9, 0                            // vcc = (remainder != 0)
v_addc_co_u32 v6, vcc, v6, 0, vcc                  // ceil: v6 += 1 where the remainder is non-zero
s_nop 0                                            // 1 wait states
v_readfirstlane_b32 s[sgprNumWorkGroups1], v6      // set back to numWorkGroup1
s_waitcnt lgkmcnt(0)                               // wait for 44/0 bytes of kern args

/* Early stop if N(SizeFreeJ) == 0 */
/* Ends the wave immediately when SizeJ is zero; otherwise falls through to label_NoEarlyStop_N0. */
s_cmp_eq_u32 s[sgprSizeJ], 0x0                     // SCC = (SizeJ == 0)
s_cbranch_scc0 label_NoEarlyStop_N0                // SizeJ != 0 -> keep going
label_EarlyStop_if_N_is_0:
s_endpgm                                           // SizeJ == 0 -> terminate this wave
label_NoEarlyStop_N0:

/* remap wg from 1D(idxWG012) to 3D(wg2,wg1,wg0) */
/* Input : WorkGroup0 holds the linear index idxWG012.                                   */
/* Output: WorkGroup2 = idxWG012 / (numWG0*numWG1*GSU)                                   */
/*         WorkGroup1 = (idxWG012 - wg2*numWG0*numWG1*GSU) / numWG0                      */
/*         WorkGroup0 = what is left after subtracting wg1*numWG0                        */
/* Each division estimates the quotient as u32(float(dividend) * rcp(float(divisor)))    */
/* and adds one in the lanes where dividend - estimate*divisor equals the divisor.       */
/* v_mul_u32_u24 only multiplies the low 24 bits of its operands.                        */
/* Clobbers v6, v7, s48, s49; EXEC is forced to all-ones after each division.            */
/* wg2 = idxWG012 * smallMagicNumber(1/(numWG0*numWG1)) */
s_mul_i32 s48, s[sgprNumWorkGroups0], s[sgprNumWorkGroups1] // s48 = numWG0 * numWG1
s_and_b32 s49, s[sgprGSU], 0x3fff                  // s49 = GSU value (low 14 bits of sgprGSU)
s_mul_i32 s48, s48, s49                            // s48 = numWG0 * numWG1 * GSU (divisor)
v_cvt_f32_u32 v6, s48                              // v6 = float(divisor)
v_rcp_iflag_f32 v6, v6                             // v6 = 1.0 / divisor
v_cvt_f32_u32 v7, s[sgprWorkGroup0]                // v7 = float(idxWG012)
v_mul_f32 v6, v6, v7                               // v6 = idxWG012 * (1.0 / divisor)
v_cvt_u32_f32 v6, v6                               // v6 = quotient estimate as u32
v_mul_u32_u24 v7, v6, s48                          // v7 = estimate * divisor
v_sub_u32 v7, s[sgprWorkGroup0], v7                // v7 = idxWG012 - estimate * divisor
v_cmpx_eq_u32 exec, v7, s48                        // EXEC = lanes where that difference equals the divisor
v_add_u32 v6, 1, v6                                // bump the quotient in those lanes only
s_mov_b64 exec, -1                                 // re-enable all lanes
v_readfirstlane_b32 s48, v6                        // quotient
s_mov_b32 s[sgprWorkGroup2], s48                   // WorkGroup2 = quotient
/* idxWG01 = idxWG012 - wg2 * numWG0 * numWG1 */
s_mul_i32 s48, s[sgprNumWorkGroups1], s[sgprNumWorkGroups0] // s48 = numWG1 * numWG0
s_mul_i32 s48, s48, s[sgprWorkGroup2]              // s48 *= wg2
s_mul_i32 s48, s48, s49                            // s48 *= GSU (s49 still holds GSU from above)
s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s48 // WorkGroup0 = idxWG01
/* wg1 = idxWG01 * smallMagicNumber(1/numWG0) */
v_cvt_f32_u32 v6, s[sgprNumWorkGroups0]            // v6 = float(numWG0)
v_rcp_iflag_f32 v6, v6                             // v6 = 1.0 / numWG0
v_cvt_f32_u32 v7, s[sgprWorkGroup0]                // v7 = float(idxWG01)
v_mul_f32 v6, v6, v7                               // v6 = idxWG01 * (1.0 / numWG0)
v_cvt_u32_f32 v6, v6                               // v6 = quotient estimate as u32
v_mul_u32_u24 v7, v6, s[sgprNumWorkGroups0]        // v7 = estimate * numWG0
v_sub_u32 v7, s[sgprWorkGroup0], v7                // v7 = idxWG01 - estimate * numWG0
v_cmpx_eq_u32 exec, v7, s[sgprNumWorkGroups0]      // EXEC = lanes where that difference equals numWG0
v_add_u32 v6, 1, v6                                // bump the quotient in those lanes only
s_mov_b64 exec, -1                                 // re-enable all lanes
v_readfirstlane_b32 s48, v6                        // quotient
s_mov_b32 s[sgprWorkGroup1], s48                   // WorkGroup1 = quotient
/* wg0 = idxWG01 - wg1 * numWG0 */
s_mul_i32 s48, s[sgprWorkGroup1], s[sgprNumWorkGroups0] // s48 = wg1 * numWG0
s_sub_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s48 // WorkGroup0 = idxWG01 - wg1 * numWG0

/* Early stop if wg exceed */
/* Ends the wave when WorkGroup2 >= SizesFree+2 (unsigned); otherwise falls through. */
s_cmp_ge_u32 s[sgprWorkGroup2], s[sgprSizesFree+2] // SCC = (wg2 >= SizesFree[2])
s_cbranch_scc0 label_NoEarlyStop_wgExceed          // wg2 < SizesFree[2] -> keep going
label_EarlyStop_if_wg_exceed:
s_endpgm                                           // wg2 out of range -> terminate this wave
label_NoEarlyStop_wgExceed:

/* Names the SGPRs used from here on and moves both input base addresses back by 16 bytes. */
/* The numbers assigned below (47, 48.., 52.., 56..) overlap registers that the code above */
/* used as plain temporaries (s47, s48, s49, s52, s53, s55), so those values are dead once */
/* the new symbols are written.                                                            */
label_MultiGemmEnd:
.set sgprSrdA, 48
.set sgprSrdB, 52
.set sgprShadowLimitA, 56
.set sgprShadowLimitB, 58
.set sgprStaggerUIter, 47
.set sgprWrapUA, 60
.set sgprWrapUB, 62
.set sgprGlobalReadIncsA, 64
.set sgprGlobalReadIncsB, 65
.set sgprScalarGlobalReadOffsetA, 66
.set sgprScalarGlobalReadOffsetB, 73
s_sub_u32 s[sgprAddressA+0], s[sgprAddressA+0], 16 // pre-pad to make room for possible pointer shift (lo -= 16, SCC = borrow)
s_subb_u32 s[sgprAddressA+1], s[sgprAddressA+1], 0 // pre-pad to make room for possible pointer shift (hi -= borrow)
s_sub_u32 s[sgprAddressB+0], s[sgprAddressB+0], 16 // pre-pad to make room for possible pointer shift (lo -= 16, SCC = borrow)
s_subb_u32 s[sgprAddressB+1], s[sgprAddressB+1], 0 // pre-pad to make room for possible pointer shift (hi -= borrow)

/* Short circuit condition if Alpha == 0, then sumDims=0 */
/* When Alpha compares equal to 0.0f, SizesSum+0 is forced to zero. */
v_cmp_eq_f32 vcc, s[sgprAlpha], 0.0                // s[Alpha] == 0.0f ?
s_cbranch_vccz label_AlphaNonZero                  // branch if s[Alpha] != 0 (vcc is all zero)
s_mov_b32 s[sgprSizesSum+0], 0x0                   // Set summation dim=0 if Alpha == 0
label_AlphaNonZero:

/******************************************/
/* Begin setupNewTile                     */
/******************************************/

/* global read addresses: work-group */
/* graWorkGroup mapping */
/* Splits WorkGroup1 into a work-group index and a GSU summation index, and sets the     */
/* GSULog2BpeC / GSULog2BpeD values. Three cases, selected from sgprGSU:                 */
/*   (GSU & 0x3fff) == 1 : GSUSumIdx = GSUSumIdx+1 = 0, Log2BpeC = 1, Log2BpeD = 1       */
/*   bit 0x4000 clear    : WorkGroup1 = wg1 / GSU,    GSUSumIdx = wg1 % GSU              */
/*   bit 0x4000 set      : GSUSumIdx = wg1 / numWG1,  WorkGroup1 = wg1 % numWG1          */
/* The two non-trivial cases finish with Log2BpeC = 1, Log2BpeD = 2.                     */
/* Division: quotient estimated via float reciprocal; in lanes where                     */
/* dividend - estimate*divisor equals the divisor the quotient is bumped and the         */
/* remainder is zeroed. Clobbers v6, v7, s80; EXEC is forced to all-ones afterwards.     */
s_and_b32 s80, s[sgprGSU], 0x3fff                  // s80 = GSU value (low 14 bits of sgprGSU)
s_cmp_eq_u32 s80, 1                                // GSU == 1 ?
s_cbranch_scc1 label_GSU                           // branch if GSU == 1
// GSU-not-WGMapRR :nwg1 = (size1J + MT1J - 1) / MT1J;
s_and_b32 s80, s[sgprGSU], 0x4000                  // SCC = (GSUWGMRR == 1) ?
s_cbranch_scc1 label_GSUWGMRR                      // branch if GSUWGMRR == 1
s_and_b32 s80, s[sgprGSU], 0x3fff                  // s80 = GSU value again (divisor)
v_cvt_f32_u32 v6, s80                              // v6 = float(GSU)
v_rcp_iflag_f32 v6, v6                             // v6 = 1.0 / GSU
v_cvt_f32_u32 v7, s[sgprWorkGroup1]                // v7 = float(wg1)
v_mul_f32 v6, v6, v7                               // v6 = wg1 * (1.0 / GSU)
v_cvt_u32_f32 v6, v6                               // v6 = quotient estimate as u32
v_mul_u32_u24 v7, v6, s80                          // v7 = estimate * GSU (24-bit multiply)
v_sub_u32 v7, s[sgprWorkGroup1], v7                // v7 = wg1 - estimate * GSU (remainder candidate)
v_cmpx_eq_u32 exec, v7, s80                        // EXEC = lanes where the remainder candidate equals GSU
v_add_u32 v6, 1, v6                                // in those lanes: quotient += 1
v_mov_b32 v7, 0                                    // in those lanes: remainder = 0
s_mov_b64 exec, -1                                 // re-enable all lanes
v_readfirstlane_b32 s[sgprWorkGroup1], v6          // quotient
v_readfirstlane_b32 s[sgprGSUSumIdx], v7           // remainder
s_branch label_GSUWGMRR_End
label_GSUWGMRR:
v_cvt_f32_u32 v6, s[sgprNumWorkGroups1]            // v6 = float(numWG1)
v_rcp_iflag_f32 v6, v6                             // v6 = 1.0 / numWG1
v_cvt_f32_u32 v7, s[sgprWorkGroup1]                // v7 = float(wg1)
v_mul_f32 v6, v6, v7                               // v6 = wg1 * (1.0 / numWG1)
v_cvt_u32_f32 v6, v6                               // v6 = quotient estimate as u32
v_mul_u32_u24 v7, v6, s[sgprNumWorkGroups1]        // v7 = estimate * numWG1 (24-bit multiply)
v_sub_u32 v7, s[sgprWorkGroup1], v7                // v7 = wg1 - estimate * numWG1 (remainder candidate)
v_cmpx_eq_u32 exec, v7, s[sgprNumWorkGroups1]      // EXEC = lanes where the remainder candidate equals numWG1
v_add_u32 v6, 1, v6                                // in those lanes: quotient += 1
v_mov_b32 v7, 0                                    // in those lanes: remainder = 0
s_mov_b64 exec, -1                                 // re-enable all lanes
v_readfirstlane_b32 s[sgprGSUSumIdx], v6           // quotient
v_readfirstlane_b32 s[sgprWorkGroup1], v7          // remainder
label_GSUWGMRR_End:
s_mov_b32 s[sgprGSULog2BpeC], 1                    // GSULog2BpeC = 1
s_mov_b32 s[sgprGSULog2BpeD], 2                    // GSULog2BpeD = 2 when GSU != 1
s_branch label_GSU_End
label_GSU:
s_mov_b64 s[sgprGSUSumIdx:sgprGSUSumIdx+1], 0      // Set GSUSumIdx to 0
s_mov_b32 s[sgprGSULog2BpeC], 1                    // GSULog2BpeC = 1
s_mov_b32 s[sgprGSULog2BpeD], 1                    // GSULog2BpeD = 1 when GSU == 1
label_GSU_End:
/* Work-group remap controlled by WGM (sign-extended from its low 16 bits):              */
/*   WGM > 1      -> label_WGMPositive                                                   */
/*   WGM == 0 / 1 -> label_WGM (no remap)                                                */
/*   WGM < 0      -> the path below, using W = abs(WGM)                                  */
/* Negative path, with wg0/wg1 the incoming WorkGroup0/WorkGroup1:                       */
/*   blockId    = wg0 / W                                       (s82)                    */
/*   wgSerial   = (wg0 % W) * numWG1 + wg1                      (s83)                    */
/*   fullBlocks = numWG0 / W;  tail = numWG0 % W, or W if that is 0                      */
/*   width      = (blockId >= fullBlocks) ? tail : W            (s80)                    */
/*   WorkGroup1 = wgSerial / width                                                       */
/*   WorkGroup0 = wgSerial % width + blockId * W                                         */
/* Clobbers v6, v7, s80-s83; EXEC is forced to all-ones after each division.             */
s_sext_i32_i16 s[sgprWGM], s[sgprWGM]              // Restore WGM
s_cmp_gt_i32 s[sgprWGM], 1                         // WGM > 1 ?
s_cbranch_scc1 label_WGMPositive                   // branch if WGM > 1
s_cmp_ge_i32 s[sgprWGM], 0                         // WGM >= 0 ?
s_cbranch_scc1 label_WGM                           // branch if WGM >= 0
s_abs_i32 s[sgprWGM], s[sgprWGM]                   // abs(WGM)
v_cvt_f32_u32 v6, s[sgprWGM]                       // v6 = float(W)
v_rcp_iflag_f32 v6, v6                             // v6 = 1.0 / W
v_cvt_f32_u32 v7, s[sgprWorkGroup0]                // v7 = float(wg0)
v_mul_f32 v6, v6, v7                               // v6 = wg0 * (1.0 / W)
v_cvt_u32_f32 v6, v6                               // v6 = quotient estimate as u32
v_mul_u32_u24 v7, v6, s[sgprWGM]                   // v7 = estimate * W (24-bit multiply)
v_sub_u32 v7, s[sgprWorkGroup0], v7                // v7 = wg0 - estimate * W
v_cmpx_eq_u32 exec, v7, s[sgprWGM]                 // EXEC = lanes where that difference equals W
v_add_u32 v6, 1, v6                                // bump the quotient in those lanes only
s_mov_b64 exec, -1                                 // re-enable all lanes
v_readfirstlane_b32 s82, v6                        // quotient: blockId = wg0 / W
s_mul_i32 s83, s82, s[sgprWGM]                     // quotient * non-magic divisor
s_sub_u32 s83, s[sgprWorkGroup0], s83              // s83 = wg0 % W
s_mul_i32 s83, s83, s[sgprNumWorkGroups1]          // (wg0 % WGM)*NumWorkGroups1
s_add_u32 s83, s83, s[sgprWorkGroup1]              // wgSerial = wg1 + (wg0 % WGM)*NumWorkGroups1
v_cvt_f32_u32 v6, s[sgprWGM]                       // v6 = float(W)
v_rcp_iflag_f32 v6, v6                             // v6 = 1.0 / W
v_cvt_f32_u32 v7, s[sgprNumWorkGroups0]            // v7 = float(numWG0)
v_mul_f32 v6, v6, v7                               // v6 = numWG0 * (1.0 / W)
v_cvt_u32_f32 v6, v6                               // v6 = quotient estimate as u32
v_mul_u32_u24 v7, v6, s[sgprWGM]                   // v7 = estimate * W (24-bit multiply)
v_sub_u32 v7, s[sgprNumWorkGroups0], v7            // v7 = numWG0 - estimate * W
v_cmpx_eq_u32 exec, v7, s[sgprWGM]                 // EXEC = lanes where that difference equals W
v_add_u32 v6, 1, v6                                // bump the quotient in those lanes only
s_mov_b64 exec, -1                                 // re-enable all lanes
v_readfirstlane_b32 s80, v6                        // quotient: fullBlocks = numWG0 / W
s_mul_i32 s81, s[sgprWGM], s80                     // quotient * non-magic divisor
s_sub_u32 s81, s[sgprNumWorkGroups0], s81          // s81 = numWG0 % W
s_cmp_eq_u32 s81, 0                                // remainder == 0 ?
s_cmov_b32 s81, s[sgprWGM]                         // remainder = WGM if remainder == 0
s_cmp_ge_u32 s82, s80                              // blockId >= numFullBlocks ?
s_cselect_b32 s80, s81, s[sgprWGM]                 // width = tail size for the last block, else W
v_cvt_f32_u32 v6, s80                              // v6 = float(width)
v_rcp_iflag_f32 v6, v6                             // v6 = 1.0 / width
v_cvt_f32_u32 v7, s83                              // v7 = float(wgSerial)
v_mul_f32 v6, v6, v7                               // v6 = wgSerial * (1.0 / width)
v_cvt_u32_f32 v6, v6                               // v6 = quotient estimate as u32
v_mul_u32_u24 v7, v6, s80                          // v7 = estimate * width (24-bit multiply)
v_sub_u32 v7, s83, v7                              // v7 = wgSerial - estimate * width (remainder candidate)
v_cmpx_eq_u32 exec, v7, s80                        // EXEC = lanes where the remainder candidate equals width
v_add_u32 v6, 1, v6                                // in those lanes: quotient += 1
v_mov_b32 v7, 0                                    // in those lanes: remainder = 0
s_mov_b64 exec, -1                                 // re-enable all lanes
v_readfirstlane_b32 s[sgprWorkGroup1], v6          // quotient
v_readfirstlane_b32 s[sgprWorkGroup0], v7          // remainder (overwritten by the next instruction, which recomputes it)
s_mul_i32 s[sgprWorkGroup0], s[sgprWorkGroup1], s80 // quotient * non-magic divisor
s_sub_u32 s[sgprWorkGroup0], s83, s[sgprWorkGroup0] // WorkGroup0=remainder
s_mul_i32 s82, s82, s[sgprWGM]                     // blockId * WGM
s_add_u32 s[sgprWorkGroup0], s[sgprWorkGroup0], s82 // wg0 += blockId * WGM
s_branch label_WGM
/* Work-group remap for WGM > 1, with W = WGM and wg0/wg1 the incoming values:           */
/*   blockId    = wg1 / W                                       (s82)                    */
/*   wgSerial   = (wg1 % W) * numWG0 + wg0                      (s83)                    */
/*   fullBlocks = numWG1 / W;  tail = numWG1 % W, or W if that is 0                      */
/*   width      = (blockId >= fullBlocks) ? tail : W            (s80)                    */
/*   WorkGroup0 = wgSerial / width                                                       */
/*   WorkGroup1 = wgSerial % width + blockId * W                                         */
/* Clobbers v6, v7, s80-s83; EXEC is forced to all-ones after each division.             */
label_WGMPositive:
v_cvt_f32_u32 v6, s[sgprWGM]                       // v6 = float(W)
v_rcp_iflag_f32 v6, v6                             // v6 = 1.0 / W
v_cvt_f32_u32 v7, s[sgprWorkGroup1]                // v7 = float(wg1)
v_mul_f32 v6, v6, v7                               // v6 = wg1 * (1.0 / W)
v_cvt_u32_f32 v6, v6                               // v6 = quotient estimate as u32
v_mul_u32_u24 v7, v6, s[sgprWGM]                   // v7 = estimate * W (24-bit multiply)
v_sub_u32 v7, s[sgprWorkGroup1], v7                // v7 = wg1 - estimate * W
v_cmpx_eq_u32 exec, v7, s[sgprWGM]                 // EXEC = lanes where that difference equals W
v_add_u32 v6, 1, v6                                // bump the quotient in those lanes only
s_mov_b64 exec, -1                                 // re-enable all lanes
v_readfirstlane_b32 s82, v6                        // quotient: blockId = wg1 / W
s_mul_i32 s83, s82, s[sgprWGM]                     // quotient * non-magic divisor
s_sub_u32 s83, s[sgprWorkGroup1], s83              // WorkGroup1=remainder (s83 = wg1 % W)
s_mul_i32 s83, s83, s[sgprNumWorkGroups0]          // (wg1 % WGM)*NumWorkGroups0
s_add_u32 s83, s83, s[sgprWorkGroup0]              // wgSerial = wg0 + (wg1 % WGM)*NumWorkGroups0
v_cvt_f32_u32 v6, s[sgprWGM]                       // v6 = float(W)
v_rcp_iflag_f32 v6, v6                             // v6 = 1.0 / W
v_cvt_f32_u32 v7, s[sgprNumWorkGroups1]            // v7 = float(numWG1)
v_mul_f32 v6, v6, v7                               // v6 = numWG1 * (1.0 / W)
v_cvt_u32_f32 v6, v6                               // v6 = quotient estimate as u32
v_mul_u32_u24 v7, v6, s[sgprWGM]                   // v7 = estimate * W (24-bit multiply)
v_sub_u32 v7, s[sgprNumWorkGroups1], v7            // v7 = numWG1 - estimate * W
v_cmpx_eq_u32 exec, v7, s[sgprWGM]                 // EXEC = lanes where that difference equals W
v_add_u32 v6, 1, v6                                // bump the quotient in those lanes only
s_mov_b64 exec, -1                                 // re-enable all lanes
v_readfirstlane_b32 s80, v6                        // quotient: fullBlocks = numWG1 / W
s_mul_i32 s81, s[sgprWGM], s80                     // quotient * non-magic divisor
s_sub_u32 s81, s[sgprNumWorkGroups1], s81          // NumWorkGroups1=remainder (s81 = numWG1 % W)
s_cmp_eq_u32 s81, 0                                // remainder == 0 ?
s_cmov_b32 s81, s[sgprWGM]                         // remainder = WGM if remainder == 0
s_cmp_ge_u32 s82, s80                              // blockId >= numFullBlocks ?
s_cselect_b32 s80, s81, s[sgprWGM]                 // width = tail size for the last block, else W
v_cvt_f32_u32 v6, s80                              // v6 = float(width)
v_rcp_iflag_f32 v6, v6                             // v6 = 1.0 / width
v_cvt_f32_u32 v7, s83                              // v7 = float(wgSerial)
v_mul_f32 v6, v6, v7                               // v6 = wgSerial * (1.0 / width)
v_cvt_u32_f32 v6, v6                               // v6 = quotient estimate as u32
v_mul_u32_u24 v7, v6, s80                          // v7 = estimate * width (24-bit multiply)
v_sub_u32 v7, s83, v7                              // v7 = wgSerial - estimate * width (remainder candidate)
v_cmpx_eq_u32 exec, v7, s80                        // EXEC = lanes where the remainder candidate equals width
v_add_u32 v6, 1, v6                                // in those lanes: quotient += 1
v_mov_b32 v7, 0                                    // in those lanes: remainder = 0
s_mov_b64 exec, -1                                 // re-enable all lanes
v_readfirstlane_b32 s[sgprWorkGroup0], v6          // quotient
v_readfirstlane_b32 s[sgprWorkGroup1], v7          // remainder (overwritten by the next instruction, which recomputes it)
s_mul_i32 s[sgprWorkGroup1], s[sgprWorkGroup0], s80 // quotient * non-magic divisor
s_sub_u32 s[sgprWorkGroup1], s83, s[sgprWorkGroup1] // WorkGroup1=remainder
s_mul_i32 s82, s82, s[sgprWGM]                     // blockId * WGM
s_add_u32 s[sgprWorkGroup1], s[sgprWorkGroup1], s82 // wg1 += blockId * WGM
label_WGM:

/* global read addresses: tile offset assignment a */
/* graTileAssignmentA = v0 */

/* global read addresses: tile offset assignment b */
/* graTileAssignmentB = v2 */

/* global read addresses: unroll assignment a */
/* v1 */

/* global read addresses: unroll assignment b */
/* v3 */

/* global read addresses: other free assignments */
/* s[sgprWorkGroup2] */

/* global read addresses: tile offsets a */

/* global read addresses: tile offsets b */

/* global read addresses: unroll offsets a */

/* global read addresses: unroll offsets b */

/* global read addresses: final offsets a */
/* Invokes GLOBAL_OFFSET_A (macro defined outside this chunk), then fills seven scalar   */
/* offsets: [0..2] = StrideA0I * {1,2,3}, [3] = 64, [4..6] = [0..2] + 64.                */
/* NOTE(review): GLOBAL_OFFSET_A is given two operands here while GLOBAL_OFFSET_B gets   */
/* four -- confirm against the macro definitions.                                        */
GLOBAL_OFFSET_A vgprGlobalReadOffsetA+0, 6
s_mul_i32 s[sgprScalarGlobalReadOffsetA+0], s[sgprStrideA0I], 1 // compute offset diff (scaled tileDim)
                                                   // scalar offset *= bytes/element
s_mul_i32 s[sgprScalarGlobalReadOffsetA+1], s[sgprStrideA0I], 2 // compute offset diff (scaled tileDim)
                                                   // scalar offset *= bytes/element
s_mul_i32 s[sgprScalarGlobalReadOffsetA+2], s[sgprStrideA0I], 3 // compute offset diff (scaled tileDim)
                                                   // scalar offset *= bytes/element
s_mul_i32 s[sgprScalarGlobalReadOffsetA+3], 1, 64  // constant offset 64 (1 * 64)
                                                   // scalar offset *= bytes/element
s_add_u32 s[sgprScalarGlobalReadOffsetA+4], s[sgprScalarGlobalReadOffsetA+0], s[sgprScalarGlobalReadOffsetA+3] // [4] = 1*StrideA0I + 64
s_add_u32 s[sgprScalarGlobalReadOffsetA+5], s[sgprScalarGlobalReadOffsetA+1], s[sgprScalarGlobalReadOffsetA+3] // [5] = 2*StrideA0I + 64
s_add_u32 s[sgprScalarGlobalReadOffsetA+6], s[sgprScalarGlobalReadOffsetA+2], s[sgprScalarGlobalReadOffsetA+3] // [6] = 3*StrideA0I + 64

/* global read addresses: final offsets b */
/* Invokes GLOBAL_OFFSET_B (macro defined outside this chunk), then fills seven scalar   */
/* offsets: [k] = StrideB1J * 32 * (k + 1) for k = 0..6.                                 */
GLOBAL_OFFSET_B vgprGlobalReadOffsetB+0,  3,  2, 6 // gROB_0_0_0_0
s_mul_i32 s[sgprScalarGlobalReadOffsetB+0], s[sgprStrideB1J], 32 // compute offset diff (scaled tileDim)
                                                   // scalar offset *= bytes/element (multiplier is 1, do nothing)
s_mul_i32 s[sgprScalarGlobalReadOffsetB+1], s[sgprStrideB1J], 64 // compute offset diff (scaled tileDim)
                                                   // scalar offset *= bytes/element (multiplier is 1, do nothing)
s_mul_i32 s[sgprScalarGlobalReadOffsetB+2], s[sgprStrideB1J], 96 // compute offset diff (scaled tileDim)
                                                   // scalar offset *= bytes/element (multiplier is 1, do nothing)
s_mul_i32 s[sgprScalarGlobalReadOffsetB+3], s[sgprStrideB1J], 128 // compute offset diff (scaled tileDim)
                                                   // scalar offset *= bytes/element (multiplier is 1, do nothing)
s_mul_i32 s[sgprScalarGlobalReadOffsetB+4], s[sgprStrideB1J], 160 // compute offset diff (scaled tileDim)
                                                   // scalar offset *= bytes/element (multiplier is 1, do nothing)
s_mul_i32 s[sgprScalarGlobalReadOffsetB+5], s[sgprStrideB1J], 192 // compute offset diff (scaled tileDim)
                                                   // scalar offset *= bytes/element (multiplier is 1, do nothing)
s_mul_i32 s[sgprScalarGlobalReadOffsetB+6], s[sgprStrideB1J], 224 // compute offset diff (scaled tileDim)
                                                   // scalar offset *= bytes/element (multiplier is 1, do nothing)

/* global read addresses: addresses a */
/* Builds the 4-dword buffer descriptor SrdA and the 64-bit ShadowLimitA.                */
/*   tileStart (s[83:82]) = (WorkGroup0 * 256) * StrideA0I + gsuOffset                   */
/*   gsuOffset (s[81:80]) = 128 * GSUSumIdx                    when GSU bit 0x8000 clear */
/*                        = 128 * (loop iterations owned by lower GSUSumIdx) when set    */
/*   ShadowLimitA = 1 + constStrideAL*(SizeL-1) + StrideA0I*(SizeI-1) - tileStart + 16   */
/*   SrdA+2 = ShadowLimitA.lo if ShadowLimitA.hi == 0, else BufferLimit                  */
/*   SrdA[1:0] = AddressA + tileStart + StrideAK * WorkGroup2                            */
/*   SrdA+3 = Srd127_96                                                                  */
/* constStrideAL, BufferLimit and Srd127_96 are symbols defined outside this chunk.      */
/* The GSUC path also overwrites LoopCounterL, GSUSumIdx+1, v0, v1 and forces EXEC to    */
/* all-ones. s80/s81 are scratch throughout.                                             */
/* max read offset = size[n] * stride[n-1] */
s_mul_hi_u32 s83, s[sgprWorkGroup0], 256           // WorkGroup[01] * MT (NOTE(review): s83 is overwritten two lines below before being read)
s_mul_i32 s82, s[sgprWorkGroup0], 256              // WorkGroup[01] * MT
s_mul_hi_u32 s83, s82, s[sgprStrideA0I]            // tlu=0, scaled tile-offset by stride (high half; must read s82 before the next line overwrites it)
s_mul_i32 s82, s82, s[sgprStrideA0I]               // tlu=0, scaled tile-offset by stride
s_and_b32 s80, s[sgprGSU], 0x8000                  // SCC = (GSUC == 1) ?
s_cbranch_scc1 label_GSUC_A                        // branch if GSUC == 1
s_mul_hi_u32 s81, 128, s[sgprGSUSumIdx]            // gsuOffset = DepthU*GSUSumIdx
s_mul_i32 s80, 128, s[sgprGSUSumIdx]               // gsuOffset = DepthU*GSUSumIdx
s_branch label_GSUC_A_End
label_GSUC_A:
s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum], 7 // s[LoopCounterL] = s[sgprSizesSum] / 128
s_and_b32 s[sgprGSUSumIdx+1], s[sgprGSU], 0x3fff   // GSUSumIdx+1 = GSU value (divisor)
v_cvt_f32_u32 v0, s[sgprGSUSumIdx+1]               // v0 = float(GSU)
v_rcp_iflag_f32 v0, v0                             // v0 = 1.0 / GSU
v_cvt_f32_u32 v1, s[sgprLoopCounterL]              // v1 = float(LoopCounterL)
v_mul_f32 v0, v0, v1                               // v0 = LoopCounterL * (1.0 / GSU)
v_cvt_u32_f32 v0, v0                               // v0 = quotient estimate as u32
v_mul_u32_u24 v1, v0, s[sgprGSUSumIdx+1]           // v1 = estimate * GSU (24-bit multiply)
v_sub_u32 v1, s[sgprLoopCounterL], v1              // v1 = LoopCounterL - estimate * GSU (remainder candidate)
v_cmpx_eq_u32 exec, v1, s[sgprGSUSumIdx+1]         // EXEC = lanes where the remainder candidate equals GSU
v_add_u32 v0, 1, v0                                // in those lanes: quotient += 1
v_mov_b32 v1, 0                                    // in those lanes: remainder = 0
s_mov_b64 exec, -1                                 // re-enable all lanes
v_readfirstlane_b32 s[sgprLoopCounterL], v0        // quotient
v_readfirstlane_b32 s[sgprGSUSumIdx+1], v1         // remainder
s_mul_i32 s81, s[sgprLoopCounterL], s[sgprGSUSumIdx] // quotient*GSUSumIdx
s_add_u32 s80, 1, s[sgprLoopCounterL]              // quotient+1
s_add_u32 s81, s81, s[sgprGSUSumIdx+1]             // quotient*GSUSumIdx+remainder
s_mul_i32 s80, s80, s[sgprGSUSumIdx]               // (quotient+1)*GSUSumIdx
s_cmp_lt_u32 s[sgprGSUSumIdx], s[sgprGSUSumIdx+1]  // gsuSumIdx < numIterPerWgRemainder
s_cselect_b32 s80, s80, s81                        // (quotient+1)*GSUSumIdx if needed
s_mul_hi_u32 s81, s80, 128                         // gsuOffset = DepthU*accumulatedNumOfLoopCounterL (high half first: the next line overwrites s80)
s_mul_i32 s80, s80, 128                            // gsuOffset = DepthU*accumulatedNumOfLoopCounterL
label_GSUC_A_End:
s_add_u32 s82, s82, s80                            // accum GsuOffset term to tilestart
s_addc_u32 s83, s83, s81                           // accum GsuOffset term to tilestart
s_mov_b32 s[sgprShadowLimitA+0], 1                 // Init tensor size
s_mov_b32 s[sgprShadowLimitA+1], 0                 // init tensor size
s_sub_u32 s80, s[sgprSizeL], 1                     // (size-1)
s_mul_hi_u32 s81, constStrideAL, s80               // stride x (size-1)
s_mul_i32 s80, constStrideAL, s80                  // stride x (size-1)
s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s80 // sum tensor size
s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s81 // sum tensor size
s_sub_u32 s80, s[sgprSizeI], 1                     // (size-1)
s_mul_hi_u32 s81, s[sgprStrideA0I], s80            // stride x (size-1)
s_mul_i32 s80, s[sgprStrideA0I], s80               // stride x (size-1)
s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s80 // sum tensor size
s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s81 // sum tensor size
s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s82 // sub tileStart
s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s83 // sub tileStart
                                                   // Set limit to use bytes (byte is 1, do nothing)
s_add_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], 16 // extend limit for pre-pad
s_addc_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], 0 // extend limit for pre-pad
s_cmp_eq_u32 s[sgprShadowLimitA+1], 0              // are we within 2^32?
s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32
s_mul_hi_u32 s81, s[sgprStrideAK], s[sgprWorkGroup2] // Stride*WG
s_mul_i32 s80, s[sgprStrideAK], s[sgprWorkGroup2]  // Stride*WG
s_add_u32 s82, s82, s80                            // accum wg term to tilestart
s_addc_u32 s83, s83, s81                           // accum wg term to tilestart
                                                   // tileStart *= BPE (multiplier is 1, do nothing)
s_add_u32 s[sgprSrdA+0], s[sgprAddressA+0], s82    // SRD base = Address+ tileStart0
s_addc_u32 s[sgprSrdA+1], s[sgprAddressA+1], s83   // SRD base = Address+ tileStart1
s_mov_b32 s[sgprSrdA+3], Srd127_96                 // Set bits 127_96 in SRD

/* global read addresses: addresses b */
/* max read offset = size[n] * stride[n-1] */
/*
 * Builds the buffer resource descriptor s[sgprSrdB+0..3]:
 *   tileStart s[82:83] = low32(WorkGroup1*256) * StrideB1J + gsuOffset
 *   ShadowLimitB       = 1 + (SizeL-1)*constStrideBL + (SizeJ-1)*StrideB1J - tileStart + 16
 *   SrdB base          = AddressB + tileStart + StrideBK*WorkGroup2
 *   SrdB+2             = ShadowLimitB low dword when its high dword is 0, else BufferLimit
 * Registers written as scratch: s80-s83; the GSUC path also writes v0, v1, exec,
 * s[sgprLoopCounterL] and s[sgprGSUSumIdx+1].
 */
s_mul_hi_u32 s83, s[sgprWorkGroup1], 256           // WorkGroup[01] * MT (high half; s83 is overwritten two instructions below before being read)
s_mul_i32 s82, s[sgprWorkGroup1], 256              // s82 = low32(WorkGroup1 * 256)
s_mul_hi_u32 s83, s82, s[sgprStrideB1J]            // tlu=0, scaled tile-offset by stride (high dword)
s_mul_i32 s82, s82, s[sgprStrideB1J]               // tlu=0, scaled tile-offset by stride (low dword)
s_and_b32 s80, s[sgprGSU], 0x8000                  // SCC = 1 when bit 0x8000 of GSU is set (GSUC == 1)
s_cbranch_scc1 label_GSUC_B                        // branch if GSUC == 1
s_mul_hi_u32 s81, 128, s[sgprGSUSumIdx]            // gsuOffset = DepthU*GSUSumIdx (high dword)
s_mul_i32 s80, 128, s[sgprGSUSumIdx]               // gsuOffset = DepthU*GSUSumIdx (low dword)
s_branch label_GSUC_B_End
label_GSUC_B:
// GSUC path: split numIter = SizesSum/128 across GSU = (s[sgprGSU] & 0x3fff) parts.
// The division uses a float reciprocal estimate followed by a single +1 correction.
s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum], 7 // s[LoopCounterL] = s[sgprSizesSum] / 128
s_and_b32 s[sgprGSUSumIdx+1], s[sgprGSU], 0x3fff   // divisor = GSU (low 14 bits of s[sgprGSU])
v_cvt_f32_u32 v0, s[sgprGSUSumIdx+1]               // v0 = float(divisor)
v_rcp_iflag_f32 v0, v0                             // v0 = reciprocal of divisor
v_cvt_f32_u32 v1, s[sgprLoopCounterL]              // v1 = float(numIter)
v_mul_f32 v0, v0, v1                               // v0 = numIter * (1/divisor)
v_cvt_u32_f32 v0, v0                               // v0 = quotient estimate (unsigned int)
v_mul_u32_u24 v1, v0, s[sgprGSUSumIdx+1]           // v1 = quotient * divisor (24-bit multiply)
v_sub_u32 v1, s[sgprLoopCounterL], v1              // v1 = numIter - quotient*divisor = remainder estimate
v_cmpx_eq_u32 exec, v1, s[sgprGSUSumIdx+1]         // exec = lanes where remainder == divisor (estimate one too low)
v_add_u32 v0, 1, v0                                // quotient += 1 (only in lanes left enabled by the compare)
v_mov_b32 v1, 0                                    // remainder = 0 (only in lanes left enabled by the compare)
s_mov_b64 exec, -1                                 // re-enable all lanes
v_readfirstlane_b32 s[sgprLoopCounterL], v0        // quotient
v_readfirstlane_b32 s[sgprGSUSumIdx+1], v1         // remainder
s_mul_i32 s81, s[sgprLoopCounterL], s[sgprGSUSumIdx] // quotient*GSUSumIdx
s_add_u32 s80, 1, s[sgprLoopCounterL]              // quotient+1
s_add_u32 s81, s81, s[sgprGSUSumIdx+1]             // quotient*GSUSumIdx+remainder
s_mul_i32 s80, s80, s[sgprGSUSumIdx]               // (quotient+1)*GSUSumIdx
s_cmp_lt_u32 s[sgprGSUSumIdx], s[sgprGSUSumIdx+1]  // gsuSumIdx < numIterPerWgRemainder
s_cselect_b32 s80, s80, s81                        // SCC ? (quotient+1)*GSUSumIdx : quotient*GSUSumIdx+remainder
s_mul_hi_u32 s81, s80, 128                         // gsuOffset = DepthU*accumulatedNumOfLoopCounterL (high dword)
s_mul_i32 s80, s80, 128                            // gsuOffset = DepthU*accumulatedNumOfLoopCounterL (low dword)
label_GSUC_B_End:
// Both paths arrive here with the 64-bit gsuOffset in s[80:81].
s_add_u32 s82, s82, s80                            // accum GsuOffset term to tilestart
s_addc_u32 s83, s83, s81                           // accum GsuOffset term to tilestart
s_mov_b32 s[sgprShadowLimitB+0], 1                 // Init tensor size
s_mov_b32 s[sgprShadowLimitB+1], 0                 // init tensor size
s_sub_u32 s80, s[sgprSizeL], 1                     // (size-1)
s_mul_hi_u32 s81, constStrideBL, s80               // stride x (size-1)
s_mul_i32 s80, constStrideBL, s80                  // stride x (size-1)
s_add_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s80 // sum tensor size
s_addc_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s81 // sum tensor size
s_sub_u32 s80, s[sgprSizeJ], 1                     // (size-1)
s_mul_hi_u32 s81, s[sgprStrideB1J], s80            // stride x (size-1)
s_mul_i32 s80, s[sgprStrideB1J], s80               // stride x (size-1)
s_add_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s80 // sum tensor size
s_addc_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s81 // sum tensor size
s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s82 // sub tileStart
s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s83 // sub tileStart
                                                   // Set limit to use bytes (byte is 1, do nothing)
s_add_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], 16 // extend limit for pre-pad
s_addc_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], 0 // extend limit for pre-pad
s_cmp_eq_u32 s[sgprShadowLimitB+1], 0              // are we within 2^32?
s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32
// The WorkGroup2 term is added to tileStart only after ShadowLimitB was derived from it.
s_mul_hi_u32 s81, s[sgprStrideBK], s[sgprWorkGroup2] // Stride*WG
s_mul_i32 s80, s[sgprStrideBK], s[sgprWorkGroup2]  // Stride*WG
s_add_u32 s82, s82, s80                            // accum wg term to tilestart
s_addc_u32 s83, s83, s81                           // accum wg term to tilestart
                                                   // tileStart *= BPE (multiplier is 1, do nothing)
s_add_u32 s[sgprSrdB+0], s[sgprAddressB+0], s82    // SRD base = Address+ tileStart0
s_addc_u32 s[sgprSrdB+1], s[sgprAddressB+1], s83   // SRD base = Address+ tileStart1
s_mov_b32 s[sgprSrdB+3], Srd127_96                 // Set bits 127_96 in SRD

/* global read addresses: increments a */
/* s[sgprGlobalReadIncsA+0] = DepthU*BpeAGR when bit 0x8000 of s[sgprGSU] is set,
   otherwise (s[sgprGSU] & 0x3fff) * DepthU*BpeAGR. */
s_and_b32 s81, s[sgprGSU], 0x3fff                  // s81 = GSU (low 14 bits of s[sgprGSU])
s_mul_i32 s81, s81, DepthU*BpeAGR                  // GSU*DepthU*Bpe
s_and_b32 s80, s[sgprGSU], 0x8000                  // SCC = (GSUC == 1) ? ; only SCC is consumed below
s_cselect_b32 s[sgprGlobalReadIncsA+0], DepthU*BpeAGR, s81 // incrA (unrollIdx): SCC ? DepthU*Bpe : GSU*DepthU*Bpe

/* global read addresses: increments b */
/* s[sgprGlobalReadIncsB+0] = DepthU*BpeBGR when bit 0x8000 of s[sgprGSU] is set,
   otherwise (s[sgprGSU] & 0x3fff) * DepthU*BpeBGR. */
s_and_b32 s81, s[sgprGSU], 0x3fff                  // s81 = GSU (low 14 bits of s[sgprGSU])
s_mul_i32 s81, s81, DepthU*BpeBGR                  // GSU*DepthU*Bpe
s_and_b32 s80, s[sgprGSU], 0x8000                  // SCC = (GSUC == 1) ? ; only SCC is consumed below
s_cselect_b32 s[sgprGlobalReadIncsB+0], DepthU*BpeBGR, s81 // incrB (unrollIdx): SCC ? DepthU*Bpe : GSU*DepthU*Bpe
/* declare loop num iterations */
/*
 * s[sgprLoopCounterL] = SizesSum / 128. When GSU = (s[sgprGSU] & 0x3fff) is not 1
 * the count is divided by GSU (float reciprocal estimate with a single +1
 * correction) and workgroups whose GSUSumIdx is below the remainder take one
 * extra iteration. The result is also copied to s[sgprOrigLoopCounter].
 * Registers written as scratch: s80, v0, v1, exec, s[sgprGSUSumIdx+1].
 */
s_lshr_b32 s[sgprLoopCounterL], s[sgprSizesSum+0], 7 // s[sgprLoopCounterL] = s[sgprSizesSum+0] / 128
s_and_b32 s80, s[sgprGSU], 0x3fff                  // s80 = GSU (low 14 bits of s[sgprGSU])
s_cmp_eq_u32 s80, 1                                // GSU == 1 ?
s_cbranch_scc1 label_GSU_1                         // branch if GSU == 1
s_and_b32 s[sgprGSUSumIdx+1], s[sgprGSU], 0x3fff   // divisor = GSU
v_cvt_f32_u32 v0, s[sgprGSUSumIdx+1]               // v0 = float(divisor)
v_rcp_iflag_f32 v0, v0                             // v0 = reciprocal of divisor
v_cvt_f32_u32 v1, s[sgprLoopCounterL]              // v1 = float(numIter)
v_mul_f32 v0, v0, v1                               // v0 = numIter * (1/divisor)
v_cvt_u32_f32 v0, v0                               // v0 = quotient estimate (unsigned int)
v_mul_u32_u24 v1, v0, s[sgprGSUSumIdx+1]           // v1 = quotient * divisor (24-bit multiply)
v_sub_u32 v1, s[sgprLoopCounterL], v1              // v1 = numIter - quotient*divisor = remainder estimate
v_cmpx_eq_u32 exec, v1, s[sgprGSUSumIdx+1]         // exec = lanes where remainder == divisor (estimate one too low)
v_add_u32 v0, 1, v0                                // quotient += 1 (only in lanes left enabled by the compare)
v_mov_b32 v1, 0                                    // remainder = 0 (only in lanes left enabled by the compare)
s_mov_b64 exec, -1                                 // re-enable all lanes
v_readfirstlane_b32 s[sgprLoopCounterL], v0        // quotient
v_readfirstlane_b32 s[sgprGSUSumIdx+1], v1         // remainder
s_add_u32 s80, 1, s[sgprLoopCounterL]              // tmp<-numIterMyWg+1
s_cmp_lt_u32 s[sgprGSUSumIdx], s[sgprGSUSumIdx+1]  // gsuSumIdx < numIterPerWgRemainder
s_cmov_b32 s[sgprLoopCounterL], s80                // numIterMyWg++ if needed (move happens only when SCC = 1)
label_GSU_1:
s_mov_b32 s[sgprOrigLoopCounter], s[sgprLoopCounterL] // copy loop counter
/*
 * StaggerU setup. s[sgprStaggerU] is unpacked into three fields:
 *   bits 0..7   -> kept in s[sgprStaggerU]; initial stagger count
 *   bits 8..12  -> s82; used as a left-shift amount ("StaggerUStride" in the comments below)
 *   bits 13..15 -> s83 (left in place, not shifted); mapping selector compared
 *                  against 0x0 / 0x2000 / 0x4000 / 0x6000 / 0x8000
 * Result: s[sgprStaggerUIter] = ((staggerU - 1, or 0 when staggerU == 0) & staggerInput) << s82,
 * where staggerU is the initial count halved until (staggerU << s82) <= OrigLoopCounter.
 * Registers written as scratch: s80-s83.
 */
s_and_b32 s82, s[sgprStaggerU], 0x1f00             // isolate bits 8..12
s_lshr_b32 s82, s82, 0x8                           // s82 = shift amount
s_and_b32 s83, s[sgprStaggerU], 0xe000             // s83 = mapping selector (bits 13..15, unshifted)
s_and_b32 s[sgprStaggerU], s[sgprStaggerU], 0xff   // keep only the stagger count
s_mov_b32 s80, s[sgprStaggerU]                     // init staggerU
label_beginStaggerUIter:
// Halve s80 until (s80 << s82) fits in the loop count; s80 == 0 always exits.
s_lshl_b32 s81, s80, s82                           // shift by StaggerUStride
s_cmp_ge_u32 s[sgprOrigLoopCounter], s81           // loopCount >= current shift Count
s_cbranch_scc1 label_endStaggerUIter               // jump to end
s_lshr_b32 s80, s80, 1                             // step down to smaller stagger
s_branch label_beginStaggerUIter                   // jump to begin
label_endStaggerUIter:
s_sub_u32 s81, s80, 1                              // staggerU mask
s_cmp_ge_u32 s80, 1                                // if current staggerU >= 1
s_cselect_b32 s[sgprStaggerUIter], s81, 0          // set Mask (0 when staggerU == 0)
// Select staggerInput (s80) from the mapping selector in s83.
// NOTE(review): each s_cbranch_scc1 below jumps to the NEXT mapping label when the
// compare MATCHES. Following only the branches visible in this section:
//   s83 == 0 -> label_StaggerUMapping_1, the 0x2000 compare fails, s80 = WorkGroup1
//   s83 != 0 -> s80 = WorkGroup0
//   label_StaggerUMapping_2 .. label_StaggerUMapping_4 are not reached.
// The label_StaggerUMapping_3 path would also accumulate onto the s80 value left by
// the stagger loop above. Confirm against the kernel generator whether
// s_cbranch_scc0 was intended before changing anything here.
s_cmp_eq_u32 s83, 0x0
s_cbranch_scc1 label_StaggerUMapping_1
s_mov_b32 s80, s[sgprWorkGroup0]                   // staggerInput = WorkGroup0
s_branch label_staggerInputEnd
label_StaggerUMapping_1:
s_cmp_eq_u32 s83, 0x2000
s_cbranch_scc1 label_StaggerUMapping_2
s_mov_b32 s80, s[sgprWorkGroup1]                   // staggerInput = WorkGroup1
s_branch label_staggerInputEnd
label_StaggerUMapping_2:
s_cmp_eq_u32 s83, 0x4000
s_cbranch_scc1 label_StaggerUMapping_3
s_mov_b32 s80, -0x1                                // staggerInput = all ones (mask passes through unchanged)
s_branch label_staggerInputEnd
label_StaggerUMapping_3:
s_cmp_eq_u32 s83, 0x6000
s_cbranch_scc1 label_StaggerUMapping_4
s_mul_i32 s81, s[sgprNumWorkGroups0], s[sgprWorkGroup1] // s81 = NumWorkGroups0 * WorkGroup1
s_add_u32 s80, s80, s81                            // s80 += NumWorkGroups0 * WorkGroup1 (s80 is not re-initialised on this path)
s_add_u32 s80, s80, s[sgprWorkGroup0]              // s80 += WorkGroup0
s_branch label_staggerInputEnd
label_StaggerUMapping_4:
s_cmp_eq_u32 s83, 0x8000
s_cbranch_scc1 label_staggerInputEnd
s_mov_b32 s80, -0x1                                // staggerInput = all ones
s_branch label_staggerInputEnd                     // branch to the immediately following label
label_staggerInputEnd:
s_and_b32 s[sgprStaggerUIter], s[sgprStaggerUIter], s80 // Compute actual stagger start for this tile
s_lshl_b32 s[sgprStaggerUIter], s[sgprStaggerUIter], s82 // shift by StaggerUStride

/* SRDs += (StaggerUIter) * GlobalReadIncsA+0 */
/* Also computes the 64-bit wrap increment
   WrapUA = GlobalReadIncsA - LoopCounterL*GlobalReadIncsA,
   moves ShadowLimitA by the same stagger offset and refreshes SrdA+2.
   Registers written as scratch: s80, s81. */
s_mul_hi_i32 s81, s[sgprStaggerUIter], s[sgprGlobalReadIncsA+0] //  stagger byte offset (signed high dword)
s_mul_i32 s80, s[sgprStaggerUIter], s[sgprGlobalReadIncsA+0] //  stagger byte offset (low dword)
s_mul_hi_i32 s[sgprWrapUA+1], s[sgprLoopCounterL], s[sgprGlobalReadIncsA+0] // Number of bytes accessed by the unroll loop
s_mul_i32 s[sgprWrapUA+0], s[sgprLoopCounterL], s[sgprGlobalReadIncsA+0] // Number of bytes accessed by the unroll loop
s_sub_u32 s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0], s[sgprWrapUA+0] // remove one iteration: WrapUA = inc - loopBytes (low)
s_subb_u32 s[sgprWrapUA+1], 0, s[sgprWrapUA+1]     // remove one iteration: high dword with borrow
s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s80        // gra SRD += inc(lower)
s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s81       // gra SRD += inc(upper)
s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s80 // limit -= inc (lower)
s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s81 // limit -= inc (upper)
s_cmp_eq_u32 s[sgprShadowLimitA+1], 0              // are we within 2^32?
s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32

/* SRDs += (StaggerUIter) * GlobalReadIncsB+0 */
/* Same sequence as for A above, applied to SrdB / ShadowLimitB / WrapUB. */
s_mul_hi_i32 s81, s[sgprStaggerUIter], s[sgprGlobalReadIncsB+0] //  stagger byte offset (signed high dword)
s_mul_i32 s80, s[sgprStaggerUIter], s[sgprGlobalReadIncsB+0] //  stagger byte offset (low dword)
s_mul_hi_i32 s[sgprWrapUB+1], s[sgprLoopCounterL], s[sgprGlobalReadIncsB+0] // Number of bytes accessed by the unroll loop
s_mul_i32 s[sgprWrapUB+0], s[sgprLoopCounterL], s[sgprGlobalReadIncsB+0] // Number of bytes accessed by the unroll loop
s_sub_u32 s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0], s[sgprWrapUB+0] // remove one iteration: WrapUB = inc - loopBytes (low)
s_subb_u32 s[sgprWrapUB+1], 0, s[sgprWrapUB+1]     // remove one iteration: high dword with borrow
s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s80        // gra SRD += inc(lower)
s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s81       // gra SRD += inc(upper)
s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s80 // limit -= inc (lower)
s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s81 // limit -= inc (upper)
s_cmp_eq_u32 s[sgprShadowLimitB+1], 0              // are we within 2^32?
s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32
s_add_u32 s[sgprStaggerUIter], s[sgprStaggerUIter], 2 // StaggerUIter += 2 (generator comment read "Subtract (PGR-1)", but the instruction adds 2); StaggerUIter now contains target iteration to wrap
/* local read addresses: init pointers a */

/* localReadInitPointers */
/* local read addresses: init pointers b */

/* localReadInitPointers */

/* prefetch: global -> local */
/* Skipped entirely when s[sgprLoopCounterL] == 0. Otherwise issues 8 dwordx4
   buffer loads for B into v[vgprG2LB+0 .. +31] and 8 for A, alternating between
   the vgprValuA_X0_I0 and vgprValuA_X2_I0 register groups (16 VGPRs each).
   All loads use a single per-lane offset VGPR; the individual loads differ only
   in the scalar offset operand (0 or s[sgprScalarGlobalReadOffset{A,B}+n]). */
s_cmp_eq_u32 s[sgprLoopCounterL], 0                // at last iteration?
s_cbranch_scc1 label_ShadowInitStart               // skip to ShadowInitStart iter b/c numIter==0
buffer_load_dwordx4 v[vgprG2LB+0:vgprG2LB+0+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0 // G -> Reg 0_0_0_0
buffer_load_dwordx4 v[vgprG2LB+4:vgprG2LB+4+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:0 // G -> Reg 0_0_1_0
buffer_load_dwordx4 v[vgprG2LB+8:vgprG2LB+8+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:0 // G -> Reg 0_0_2_0
buffer_load_dwordx4 v[vgprG2LB+12:vgprG2LB+12+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:0 // G -> Reg 0_0_3_0
buffer_load_dwordx4 v[vgprG2LB+16:vgprG2LB+16+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:0 // G -> Reg 0_0_4_0
buffer_load_dwordx4 v[vgprG2LB+20:vgprG2LB+20+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:0 // G -> Reg 0_0_5_0
buffer_load_dwordx4 v[vgprG2LB+24:vgprG2LB+24+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:0 // G -> Reg 0_0_6_0
buffer_load_dwordx4 v[vgprG2LB+28:vgprG2LB+28+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:0 // G -> Reg 0_0_7_0
// Alias the A destination symbols to their "_0" variants (defined outside this section).
.set vgprValuA_X0_I0, vgprValuA_X0_I0_0
.set vgprValuA_X2_I0, vgprValuA_X2_I0_0
// A loads: the X0 group takes scalar offsets 0, A+0, A+1, A+2; the X2 group takes A+3 .. A+6.
buffer_load_dwordx4 v[vgprValuA_X0_I0+0:vgprValuA_X0_I0+0+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0 // G -> Reg 0_0_0_0
buffer_load_dwordx4 v[vgprValuA_X2_I0+0:vgprValuA_X2_I0+0+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:0 // G -> Reg 0_0_1_0
buffer_load_dwordx4 v[vgprValuA_X0_I0+4:vgprValuA_X0_I0+4+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:0 // G -> Reg 0_0_1_0
buffer_load_dwordx4 v[vgprValuA_X2_I0+4:vgprValuA_X2_I0+4+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:0 // G -> Reg 0_0_1_0
buffer_load_dwordx4 v[vgprValuA_X0_I0+8:vgprValuA_X0_I0+8+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:0 // G -> Reg 0_0_2_0
buffer_load_dwordx4 v[vgprValuA_X2_I0+8:vgprValuA_X2_I0+8+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:0 // G -> Reg 0_0_1_0
buffer_load_dwordx4 v[vgprValuA_X0_I0+12:vgprValuA_X0_I0+12+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:0 // G -> Reg 0_0_3_0
buffer_load_dwordx4 v[vgprValuA_X2_I0+12:vgprValuA_X2_I0+12+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:0 // G -> Reg 0_0_1_0

/* global read inc A loopL */
/* Advances SrdA by one step: the 64-bit WrapUA when
   s[sgprStaggerUIter] == s[sgprLoopCounterL] + 1, otherwise GlobalReadIncsA
   (high dword 0). ShadowLimitA is reduced by the same amount and SrdA+2 is
   refreshed. Registers written as scratch: s80-s82. */
s_add_u32 s82, s[sgprLoopCounterL], 1              // remove pf(1): s82 = LoopCounterL + 1
s_cmp_eq_u32 s[sgprStaggerUIter], s82              // Is this wrapIter? (pf)
s_cselect_b32 s80, s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- wrap ? WrapUA.lo : GlobalReadIncsA
s_cselect_b32 s81, s[sgprWrapUA+1], 0              // incUpper <- wrap ? WrapUA.hi : 0
s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s80        // gra SRD += inc(lower)
s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s81       // gra SRD += inc(upper)
s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s80 // limit -= inc (lower)
s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s81 // limit -= inc (upper)
s_cmp_eq_u32 s[sgprShadowLimitA+1], 0              // are we within 2^32?
s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32

/* global read inc B loopL */
/* Same sequence as for A above, applied to SrdB / ShadowLimitB / WrapUB. */
s_add_u32 s82, s[sgprLoopCounterL], 1              // remove pf(1): s82 = LoopCounterL + 1
s_cmp_eq_u32 s[sgprStaggerUIter], s82              // Is this wrapIter? (pf)
s_cselect_b32 s80, s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- wrap ? WrapUB.lo : GlobalReadIncsB
s_cselect_b32 s81, s[sgprWrapUB+1], 0              // incUpper <- wrap ? WrapUB.hi : 0
s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s80        // gra SRD += inc(lower)
s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s81       // gra SRD += inc(upper)
s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s80 // limit -= inc (lower)
s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s81 // limit -= inc (upper)
s_cmp_eq_u32 s[sgprShadowLimitB+1], 0              // are we within 2^32?
s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32

/******************************************/
/* End setupNewTile                       */
/******************************************/
/*
 * Builds the output descriptors s[sgprSrdC+0..3] and s[sgprSrdD+0..3]:
 *   SrdC base = AddressC + ((MT1*WorkGroup1*StrideC1J) << GSULog2BpeC)
 *                        + ((WorkGroup2*StrideCK)      << GSULog2BpeC)
 *   SrdD base = AddressD + ((MT1*WorkGroup1*StrideD1J) << GSULog2BpeD)
 *                        + ((WorkGroup2*StrideDK)      << GSULog2BpeD)
 *   and, when GSU = (s[sgprGSU] & 0x3fff) is not 1, SrdD base additionally gets
 *   the per-GSUSumIdx offset computed at the end of this section.
 *   SrdC+2 / SrdD+2 = 0x80000000, SrdC+3 / SrdD+3 = Srd127_96.
 * Registers written as scratch: s80-s83.
 */
label_ShadowInitStart:
// NOTE(review): the two base-address moves for each of D and C below are
// overwritten by the s_add_u32/s_addc_u32 pairs further down before being read
// anywhere in this straight-line section.
s_mov_b32 s[sgprSrdD+0], s[sgprAddressD+0]         // init SRD base address (lower)
s_mov_b32 s[sgprSrdD+1], s[sgprAddressD+1]         // init SRD base address (upper) + other fields
s_mov_b32 s[sgprSrdD+2], 0x80000000
s_mov_b32 s[sgprSrdD+3], Srd127_96                 // Set bits 127_96 in post-loop SRD

s_mov_b32 s[sgprSrdC+0], s[sgprAddressC+0]         // init SRD base address (lower)
s_mov_b32 s[sgprSrdC+1], s[sgprAddressC+1]         // init SRD base address (upper) + other fields
s_mov_b32 s[sgprSrdC+2], 0x80000000
s_mov_b32 s[sgprSrdC+3], Srd127_96                 // Set bits 127_96 in post-loop SRD


s_mul_i32 s82, MT1, s[sgprWorkGroup1]              // <- wg1*MT1
s_mul_hi_u32 s81, s82, s[sgprStrideC1J]            // ScaleC s82 by Stride
s_mul_i32 s80, s82, s[sgprStrideC1J]               // ScaleC s82 by Stride
s_lshl_b64 s[80:81], s[80:81], s[sgprGSULog2BpeC]  // scale by bpe (shift left by s[sgprGSULog2BpeC])
s_add_u32 s[sgprSrdC+0], s[sgprAddressC+0], s80    // add lo to SRD
s_addc_u32 s[sgprSrdC+1], s[sgprAddressC+1], s81   // add hi to SRD
s_mul_hi_u32 s81, s82, s[sgprStrideD1J]            // ScaleD s82 by Stride
s_mul_i32 s80, s82, s[sgprStrideD1J]               // ScaleD s82 by Stride
s_lshl_b64 s[80:81], s[80:81], s[sgprGSULog2BpeD]  // scale by bpe (shift left by s[sgprGSULog2BpeD])
s_add_u32 s[sgprSrdD+0], s[sgprAddressD+0], s80    // add lo to SRD
s_addc_u32 s[sgprSrdD+1], s[sgprAddressD+1], s81   // add hi to SRD

s_mul_hi_u32 s81, s[sgprWorkGroup2], s[sgprStrideCK] // ScaleC s[sgprWorkGroup2] by Stride
s_mul_i32 s80, s[sgprWorkGroup2], s[sgprStrideCK]  // ScaleC s[sgprWorkGroup2] by Stride
s_lshl_b64 s[80:81], s[80:81], s[sgprGSULog2BpeC]  // scale by bpe (shift left by s[sgprGSULog2BpeC])
s_add_u32 s[sgprSrdC+0], s[sgprSrdC+0], s80        // add lo to SRD
s_addc_u32 s[sgprSrdC+1], s[sgprSrdC+1], s81       // add hi to SRD
s_mul_hi_u32 s81, s[sgprWorkGroup2], s[sgprStrideDK] // ScaleD s[sgprWorkGroup2] by Stride
s_mul_i32 s80, s[sgprWorkGroup2], s[sgprStrideDK]  // ScaleD s[sgprWorkGroup2] by Stride
s_lshl_b64 s[80:81], s[80:81], s[sgprGSULog2BpeD]  // scale by bpe (shift left by s[sgprGSULog2BpeD])
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s80        // add lo to SRD
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], s81       // add hi to SRD

s_and_b32 s80, s[sgprGSU], 0x3fff                  // s80 = GSU (low 14 bits of s[sgprGSU])
s_cmp_eq_u32 s80, 1                                // GSU == 1 ?
s_cbranch_scc1 label_GSU_2                         // branch if GSU == 1
// GSU Output Buffer offset: (Free0 + (Free1-1)*StrideC1J + (Free2-1)*StrideCK) * GSUSumIdx * 4 bytes
// GSUSumIdx is multiplied into each term separately; the final shift by 2 applies the factor of 4.
s_mul_hi_u32 s81, s[sgprSizesFree+0], s[sgprGSUSumIdx] // Free0 * GSUSumIdx (high dword)
s_mul_i32 s80, s[sgprSizesFree+0], s[sgprGSUSumIdx] // Free0 * GSUSumIdx (low dword)
s_sub_u32 s82, s[sgprSizesFree+1], 1               // Free1 - 1
s_mul_i32 s82, s82, s[sgprGSUSumIdx]               // (Free1 - 1) * GSUSumIdx (low 32 bits only)
s_mul_hi_u32 s83, s82, s[sgprStrideC1J]            // ... * StrideC1J (high dword)
s_mul_i32 s82, s82, s[sgprStrideC1J]               // ... * StrideC1J (low dword)
s_add_u32 s80, s80, s82                            // accumulate Free1 term (low)
s_addc_u32 s81, s81, s83                           // accumulate Free1 term (high)
s_sub_u32 s82, s[sgprSizesFree+2], 1               // Free2 - 1
s_mul_i32 s82, s82, s[sgprGSUSumIdx]               // (Free2 - 1) * GSUSumIdx (low 32 bits only)
s_mul_hi_u32 s83, s82, s[sgprStrideCK]             // ... * StrideCK (high dword)
s_mul_i32 s82, s82, s[sgprStrideCK]                // ... * StrideCK (low dword)
s_add_u32 s80, s80, s82                            // accumulate Free2 term (low)
s_addc_u32 s81, s81, s83                           // accumulate Free2 term (high)
s_lshl_b64 s[80:81], s[80:81], 2                   // scale by bpe (fixed shift of 2, i.e. x4)
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s80        // add lo GSU offset to SRD
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], s81       // add hi GSU offset to SRD
label_GSU_2:
// Rebind these register symbols to UNDEF (defined outside this section);
// presumably this makes any later use of them fail or stand out - TODO confirm.
.set sgprGSULog2BpeC, UNDEF
.set sgprAddressC, UNDEF
.set sgprAddressD, UNDEF

/* initC: remove ValuC vgpr buffer [0...0) from pool */

/* initC: remove acc vgpr buffer [0...256) from pool */

/* initC: remove ValuA/B vgpr buffer [0...160) from pool */
v_accvgpr_write acc0, 0x0                          // initC
v_accvgpr_write acc1, 0x0                          // initC
v_accvgpr_write acc2, 0x0                          // initC
v_accvgpr_write acc3, 0x0                          // initC
v_accvgpr_write acc4, 0x0                          // initC
v_accvgpr_write acc5, 0x0                          // initC
v_accvgpr_write acc6, 0x0                          // initC
v_accvgpr_write acc7, 0x0                          // initC
v_accvgpr_write acc8, 0x0                          // initC
v_accvgpr_write acc9, 0x0                          // initC
v_accvgpr_write acc10, 0x0                         // initC
v_accvgpr_write acc11, 0x0                         // initC
v_accvgpr_write acc12, 0x0                         // initC
v_accvgpr_write acc13, 0x0                         // initC
v_accvgpr_write acc14, 0x0                         // initC
v_accvgpr_write acc15, 0x0                         // initC
v_accvgpr_write acc16, 0x0                         // initC
v_accvgpr_write acc17, 0x0                         // initC
v_accvgpr_write acc18, 0x0                         // initC
v_accvgpr_write acc19, 0x0                         // initC
v_accvgpr_write acc20, 0x0                         // initC
v_accvgpr_write acc21, 0x0                         // initC
v_accvgpr_write acc22, 0x0                         // initC
v_accvgpr_write acc23, 0x0                         // initC
v_accvgpr_write acc24, 0x0                         // initC
v_accvgpr_write acc25, 0x0                         // initC
v_accvgpr_write acc26, 0x0                         // initC
v_accvgpr_write acc27, 0x0                         // initC
v_accvgpr_write acc28, 0x0                         // initC
v_accvgpr_write acc29, 0x0                         // initC
v_accvgpr_write acc30, 0x0                         // initC
v_accvgpr_write acc31, 0x0                         // initC
v_accvgpr_write acc32, 0x0                         // initC
v_accvgpr_write acc33, 0x0                         // initC
v_accvgpr_write acc34, 0x0                         // initC
v_accvgpr_write acc35, 0x0                         // initC
v_accvgpr_write acc36, 0x0                         // initC
v_accvgpr_write acc37, 0x0                         // initC
v_accvgpr_write acc38, 0x0                         // initC
v_accvgpr_write acc39, 0x0                         // initC
v_accvgpr_write acc40, 0x0                         // initC
v_accvgpr_write acc41, 0x0                         // initC
v_accvgpr_write acc42, 0x0                         // initC
v_accvgpr_write acc43, 0x0                         // initC
v_accvgpr_write acc44, 0x0                         // initC
v_accvgpr_write acc45, 0x0                         // initC
v_accvgpr_write acc46, 0x0                         // initC
v_accvgpr_write acc47, 0x0                         // initC
v_accvgpr_write acc48, 0x0                         // initC
v_accvgpr_write acc49, 0x0                         // initC
v_accvgpr_write acc50, 0x0                         // initC
v_accvgpr_write acc51, 0x0                         // initC
v_accvgpr_write acc52, 0x0                         // initC
v_accvgpr_write acc53, 0x0                         // initC
v_accvgpr_write acc54, 0x0                         // initC
v_accvgpr_write acc55, 0x0                         // initC
v_accvgpr_write acc56, 0x0                         // initC
v_accvgpr_write acc57, 0x0                         // initC
v_accvgpr_write acc58, 0x0                         // initC
v_accvgpr_write acc59, 0x0                         // initC
v_accvgpr_write acc60, 0x0                         // initC
v_accvgpr_write acc61, 0x0                         // initC
v_accvgpr_write acc62, 0x0                         // initC
v_accvgpr_write acc63, 0x0                         // initC
v_accvgpr_write acc64, 0x0                         // initC
v_accvgpr_write acc65, 0x0                         // initC
v_accvgpr_write acc66, 0x0                         // initC
v_accvgpr_write acc67, 0x0                         // initC
v_accvgpr_write acc68, 0x0                         // initC
v_accvgpr_write acc69, 0x0                         // initC
v_accvgpr_write acc70, 0x0                         // initC
v_accvgpr_write acc71, 0x0                         // initC
v_accvgpr_write acc72, 0x0                         // initC
v_accvgpr_write acc73, 0x0                         // initC
v_accvgpr_write acc74, 0x0                         // initC
v_accvgpr_write acc75, 0x0                         // initC
v_accvgpr_write acc76, 0x0                         // initC
v_accvgpr_write acc77, 0x0                         // initC
v_accvgpr_write acc78, 0x0                         // initC
v_accvgpr_write acc79, 0x0                         // initC
v_accvgpr_write acc80, 0x0                         // initC
v_accvgpr_write acc81, 0x0                         // initC
v_accvgpr_write acc82, 0x0                         // initC
v_accvgpr_write acc83, 0x0                         // initC
v_accvgpr_write acc84, 0x0                         // initC
v_accvgpr_write acc85, 0x0                         // initC
v_accvgpr_write acc86, 0x0                         // initC
v_accvgpr_write acc87, 0x0                         // initC
v_accvgpr_write acc88, 0x0                         // initC
v_accvgpr_write acc89, 0x0                         // initC
v_accvgpr_write acc90, 0x0                         // initC
v_accvgpr_write acc91, 0x0                         // initC
v_accvgpr_write acc92, 0x0                         // initC
v_accvgpr_write acc93, 0x0                         // initC
v_accvgpr_write acc94, 0x0                         // initC
v_accvgpr_write acc95, 0x0                         // initC
v_accvgpr_write acc96, 0x0                         // initC
v_accvgpr_write acc97, 0x0                         // initC
v_accvgpr_write acc98, 0x0                         // initC
v_accvgpr_write acc99, 0x0                         // initC
v_accvgpr_write acc100, 0x0                        // initC
v_accvgpr_write acc101, 0x0                        // initC
v_accvgpr_write acc102, 0x0                        // initC
v_accvgpr_write acc103, 0x0                        // initC
v_accvgpr_write acc104, 0x0                        // initC
v_accvgpr_write acc105, 0x0                        // initC
v_accvgpr_write acc106, 0x0                        // initC
v_accvgpr_write acc107, 0x0                        // initC
v_accvgpr_write acc108, 0x0                        // initC
v_accvgpr_write acc109, 0x0                        // initC
v_accvgpr_write acc110, 0x0                        // initC
v_accvgpr_write acc111, 0x0                        // initC
v_accvgpr_write acc112, 0x0                        // initC
v_accvgpr_write acc113, 0x0                        // initC
v_accvgpr_write acc114, 0x0                        // initC
v_accvgpr_write acc115, 0x0                        // initC
v_accvgpr_write acc116, 0x0                        // initC
v_accvgpr_write acc117, 0x0                        // initC
v_accvgpr_write acc118, 0x0                        // initC
v_accvgpr_write acc119, 0x0                        // initC
v_accvgpr_write acc120, 0x0                        // initC
v_accvgpr_write acc121, 0x0                        // initC
v_accvgpr_write acc122, 0x0                        // initC
v_accvgpr_write acc123, 0x0                        // initC
v_accvgpr_write acc124, 0x0                        // initC
v_accvgpr_write acc125, 0x0                        // initC
v_accvgpr_write acc126, 0x0                        // initC
v_accvgpr_write acc127, 0x0                        // initC
v_accvgpr_write acc128, 0x0                        // initC
v_accvgpr_write acc129, 0x0                        // initC
v_accvgpr_write acc130, 0x0                        // initC
v_accvgpr_write acc131, 0x0                        // initC
v_accvgpr_write acc132, 0x0                        // initC
v_accvgpr_write acc133, 0x0                        // initC
v_accvgpr_write acc134, 0x0                        // initC
v_accvgpr_write acc135, 0x0                        // initC
v_accvgpr_write acc136, 0x0                        // initC
v_accvgpr_write acc137, 0x0                        // initC
v_accvgpr_write acc138, 0x0                        // initC
v_accvgpr_write acc139, 0x0                        // initC
v_accvgpr_write acc140, 0x0                        // initC
v_accvgpr_write acc141, 0x0                        // initC
v_accvgpr_write acc142, 0x0                        // initC
v_accvgpr_write acc143, 0x0                        // initC
v_accvgpr_write acc144, 0x0                        // initC
v_accvgpr_write acc145, 0x0                        // initC
v_accvgpr_write acc146, 0x0                        // initC
v_accvgpr_write acc147, 0x0                        // initC
v_accvgpr_write acc148, 0x0                        // initC
v_accvgpr_write acc149, 0x0                        // initC
v_accvgpr_write acc150, 0x0                        // initC
v_accvgpr_write acc151, 0x0                        // initC
v_accvgpr_write acc152, 0x0                        // initC
v_accvgpr_write acc153, 0x0                        // initC
v_accvgpr_write acc154, 0x0                        // initC
v_accvgpr_write acc155, 0x0                        // initC
v_accvgpr_write acc156, 0x0                        // initC
v_accvgpr_write acc157, 0x0                        // initC
v_accvgpr_write acc158, 0x0                        // initC
v_accvgpr_write acc159, 0x0                        // initC
v_accvgpr_write acc160, 0x0                        // initC
v_accvgpr_write acc161, 0x0                        // initC
v_accvgpr_write acc162, 0x0                        // initC
v_accvgpr_write acc163, 0x0                        // initC
v_accvgpr_write acc164, 0x0                        // initC
v_accvgpr_write acc165, 0x0                        // initC
v_accvgpr_write acc166, 0x0                        // initC
v_accvgpr_write acc167, 0x0                        // initC
v_accvgpr_write acc168, 0x0                        // initC
v_accvgpr_write acc169, 0x0                        // initC
v_accvgpr_write acc170, 0x0                        // initC
v_accvgpr_write acc171, 0x0                        // initC
v_accvgpr_write acc172, 0x0                        // initC
v_accvgpr_write acc173, 0x0                        // initC
v_accvgpr_write acc174, 0x0                        // initC
v_accvgpr_write acc175, 0x0                        // initC
v_accvgpr_write acc176, 0x0                        // initC
v_accvgpr_write acc177, 0x0                        // initC
v_accvgpr_write acc178, 0x0                        // initC
v_accvgpr_write acc179, 0x0                        // initC
v_accvgpr_write acc180, 0x0                        // initC
v_accvgpr_write acc181, 0x0                        // initC
v_accvgpr_write acc182, 0x0                        // initC
v_accvgpr_write acc183, 0x0                        // initC
v_accvgpr_write acc184, 0x0                        // initC
v_accvgpr_write acc185, 0x0                        // initC
v_accvgpr_write acc186, 0x0                        // initC
v_accvgpr_write acc187, 0x0                        // initC
v_accvgpr_write acc188, 0x0                        // initC
v_accvgpr_write acc189, 0x0                        // initC
v_accvgpr_write acc190, 0x0                        // initC
v_accvgpr_write acc191, 0x0                        // initC
v_accvgpr_write acc192, 0x0                        // initC
v_accvgpr_write acc193, 0x0                        // initC
v_accvgpr_write acc194, 0x0                        // initC
v_accvgpr_write acc195, 0x0                        // initC
v_accvgpr_write acc196, 0x0                        // initC
v_accvgpr_write acc197, 0x0                        // initC
v_accvgpr_write acc198, 0x0                        // initC
v_accvgpr_write acc199, 0x0                        // initC
v_accvgpr_write acc200, 0x0                        // initC
v_accvgpr_write acc201, 0x0                        // initC
v_accvgpr_write acc202, 0x0                        // initC
v_accvgpr_write acc203, 0x0                        // initC
v_accvgpr_write acc204, 0x0                        // initC
v_accvgpr_write acc205, 0x0                        // initC
v_accvgpr_write acc206, 0x0                        // initC
v_accvgpr_write acc207, 0x0                        // initC
v_accvgpr_write acc208, 0x0                        // initC
v_accvgpr_write acc209, 0x0                        // initC
v_accvgpr_write acc210, 0x0                        // initC
v_accvgpr_write acc211, 0x0                        // initC
v_accvgpr_write acc212, 0x0                        // initC
v_accvgpr_write acc213, 0x0                        // initC
v_accvgpr_write acc214, 0x0                        // initC
v_accvgpr_write acc215, 0x0                        // initC
v_accvgpr_write acc216, 0x0                        // initC
v_accvgpr_write acc217, 0x0                        // initC
v_accvgpr_write acc218, 0x0                        // initC
v_accvgpr_write acc219, 0x0                        // initC
v_accvgpr_write acc220, 0x0                        // initC
v_accvgpr_write acc221, 0x0                        // initC
v_accvgpr_write acc222, 0x0                        // initC
v_accvgpr_write acc223, 0x0                        // initC
v_accvgpr_write acc224, 0x0                        // initC
v_accvgpr_write acc225, 0x0                        // initC
v_accvgpr_write acc226, 0x0                        // initC
v_accvgpr_write acc227, 0x0                        // initC
v_accvgpr_write acc228, 0x0                        // initC
v_accvgpr_write acc229, 0x0                        // initC
v_accvgpr_write acc230, 0x0                        // initC
v_accvgpr_write acc231, 0x0                        // initC
v_accvgpr_write acc232, 0x0                        // initC
v_accvgpr_write acc233, 0x0                        // initC
v_accvgpr_write acc234, 0x0                        // initC
v_accvgpr_write acc235, 0x0                        // initC
v_accvgpr_write acc236, 0x0                        // initC
v_accvgpr_write acc237, 0x0                        // initC
v_accvgpr_write acc238, 0x0                        // initC
v_accvgpr_write acc239, 0x0                        // initC
v_accvgpr_write acc240, 0x0                        // initC
v_accvgpr_write acc241, 0x0                        // initC
v_accvgpr_write acc242, 0x0                        // initC
v_accvgpr_write acc243, 0x0                        // initC
v_accvgpr_write acc244, 0x0                        // initC
v_accvgpr_write acc245, 0x0                        // initC
v_accvgpr_write acc246, 0x0                        // initC
v_accvgpr_write acc247, 0x0                        // initC
v_accvgpr_write acc248, 0x0                        // initC
v_accvgpr_write acc249, 0x0                        // initC
v_accvgpr_write acc250, 0x0                        // initC
v_accvgpr_write acc251, 0x0                        // initC
v_accvgpr_write acc252, 0x0                        // initC
v_accvgpr_write acc253, 0x0                        // initC
v_accvgpr_write acc254, 0x0                        // initC
v_accvgpr_write acc255, 0x0                        // initC
/* Early-out when the unroll loop has no iterations:                        */
/*   LoopCounterL == 0 -> long jump to label_PrefetchGlobalLastIterEnd      */
/*   LoopCounterL != 0 -> fall through to label_NoBranch_IVWDN609MP1IS66Z_0 */
/* The jump is built from s_getpc_b64 + 64-bit add + s_setpc_b64 instead of */
/* a plain s_cbranch. On the jump path s[28:29], s30 and SCC are overwritten. */
s_cmp_eq_u32 s[sgprLoopCounterL], 0                // SCC = (LoopCounterL == 0)

/* after InitC, skip to end of prefetch last iter if numIter==0 */
s_cbranch_scc0 label_NoBranch_IVWDN609MP1IS66Z_0   // SCC == 0 (counter non-zero): skip the long jump below
s_getpc_b64 s[28:29]                               // s[28:29] = addr of next instr
// NOTE(review): the label is used as an immediate here; presumably it resolves to a
// PC-relative distance and the +0x4 accounts for instruction placement -- confirm.
s_add_i32 s30, label_PrefetchGlobalLastIterEnd, 0x4 // s30 = target branch offset
s_add_u32 s28, s28, s30                            // low dword: pc.lo += offset (sets carry in SCC)
s_addc_u32 s29, s29, 0                             // high dword: pc.hi += carry
s_setpc_b64 s[28:29]                               // branch to label_PrefetchGlobalLastIterEnd
label_NoBranch_IVWDN609MP1IS66Z_0:
/* Prologue local write: copy the 32 VGPRs vgprG2LB+0..31 to LDS as eight   */
/* 16-byte ds_write_b128 stores, all based at vgprLocalWriteAddrB, with     */
/* byte offsets k*4160 for k = 0..7 (LSPB = 4160 per the offset comments).  */
// NOTE(review): the global loads that fill vgprG2LB are issued before this chunk;
// vmcnt(8) presumably leaves 8 younger loads in flight -- confirm against the prefetch code.
s_waitcnt vmcnt(8)                                 // wait for global read B (at most 8 vector-memory ops still outstanding)

/* local write a */
/* (no instructions emitted for A in this section) */

/* local write b */
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+0:vgprG2LB+0+3] offset:0 // lwoB_0_0_0_0 = (0*LSCB)*(MT1J+PAD) + (0*LSPB) = 0
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+4:vgprG2LB+4+3] offset:4160 // lwoB_0_0_1_0 = (0*LSCB)*(MT1J+PAD) + (1*LSPB) = 4160
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+8:vgprG2LB+8+3] offset:8320 // lwoB_0_0_2_0 = (0*LSCB)*(MT1J+PAD) + (2*LSPB) = 8320
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+12:vgprG2LB+12+3] offset:12480 // lwoB_0_0_3_0 = (0*LSCB)*(MT1J+PAD) + (3*LSPB) = 12480
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+16:vgprG2LB+16+3] offset:16640 // lwoB_0_0_4_0 = (0*LSCB)*(MT1J+PAD) + (4*LSPB) = 16640
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+20:vgprG2LB+20+3] offset:20800 // lwoB_0_0_5_0 = (0*LSCB)*(MT1J+PAD) + (5*LSPB) = 20800
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+24:vgprG2LB+24+3] offset:24960 // lwoB_0_0_6_0 = (0*LSCB)*(MT1J+PAD) + (6*LSPB) = 24960
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+28:vgprG2LB+28+3] offset:29120 // lwoB_0_0_7_0 = (0*LSCB)*(MT1J+PAD) + (7*LSPB) = 29120

/* local write swap a */
/* (no instructions emitted) */

/* local write swap b */
/* (no instructions emitted) */

/* Second global prefetch of B (PGR=2). Skipped entirely when LoopCounterL == 1. */
/* The eight buffer_load_dwordx4 refill vgprG2LB+0..31, the same registers the   */
/* preceding ds_write_b128 sequence stored to LDS. All loads share the vector    */
/* offset vgprGlobalReadOffsetB+0; the first uses scalar offset 0 and the rest   */
/* use sgprScalarGlobalReadOffsetB+0..6.                                         */
s_cmp_eq_u32 s[sgprLoopCounterL], 0x1              // SCC = (LoopCounterL == 1): PGR=2 but only 1 loop
s_cbranch_scc1 label_skipPGR2_0                    // only one iteration: nothing further to prefetch
buffer_load_dwordx4 v[vgprG2LB+0:vgprG2LB+0+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0 // G -> Reg 0_0_0_0
buffer_load_dwordx4 v[vgprG2LB+4:vgprG2LB+4+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:0 // G -> Reg 0_0_1_0
buffer_load_dwordx4 v[vgprG2LB+8:vgprG2LB+8+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:0 // G -> Reg 0_0_2_0
buffer_load_dwordx4 v[vgprG2LB+12:vgprG2LB+12+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:0 // G -> Reg 0_0_3_0
buffer_load_dwordx4 v[vgprG2LB+16:vgprG2LB+16+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:0 // G -> Reg 0_0_4_0
buffer_load_dwordx4 v[vgprG2LB+20:vgprG2LB+20+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:0 // G -> Reg 0_0_5_0
buffer_load_dwordx4 v[vgprG2LB+24:vgprG2LB+24+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:0 // G -> Reg 0_0_6_0
buffer_load_dwordx4 v[vgprG2LB+28:vgprG2LB+28+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:0 // G -> Reg 0_0_7_0
label_skipPGR2_0:
s_waitcnt lgkmcnt(0)                               // 0prefetch wait for local write: all outstanding LDS ops complete
// Skip force waitcnt0
s_barrier                                          // workgroup-wide sync before the local reads that follow

/* local read prefetch a */
/* (no instructions emitted for A in this section) */

/* local read prefetch b */
/* Prefetch B from LDS into vgprValuB_X0_I0+0..63: sixteen ds_read_b128     */
/* (16 bytes / 4 VGPRs each), all based at vgprLocalReadAddrB, with byte    */
/* offsets 128*e for e = 0..15 (e is the eIdx shown in each line comment).  */
ds_read_b128 v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprLocalReadAddrB] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+4:vgprValuB_X0_I0+4+3], v[vgprLocalReadAddrB] offset:128 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprLocalReadAddrB] offset:256 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+12:vgprValuB_X0_I0+12+3], v[vgprLocalReadAddrB] offset:384 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprLocalReadAddrB] offset:512 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+20:vgprValuB_X0_I0+20+3], v[vgprLocalReadAddrB] offset:640 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprLocalReadAddrB] offset:768 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+28:vgprValuB_X0_I0+28+3], v[vgprLocalReadAddrB] offset:896 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprLocalReadAddrB] offset:1024 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=8 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+36:vgprValuB_X0_I0+36+3], v[vgprLocalReadAddrB] offset:1152 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=9 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprLocalReadAddrB] offset:1280 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=10 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+44:vgprValuB_X0_I0+44+3], v[vgprLocalReadAddrB] offset:1408 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=11 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprLocalReadAddrB] offset:1536 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=12 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+52:vgprValuB_X0_I0+52+3], v[vgprLocalReadAddrB] offset:1664 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=13 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprLocalReadAddrB] offset:1792 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=14 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b128 v[vgprValuB_X0_I0+60:vgprValuB_X0_I0+60+3], v[vgprLocalReadAddrB] offset:1920 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=15 rIdx=0 oIdx=0 buffer=0 iui=0

/* local read inc a */
/* N/A, lro->64 */
/* (no address-increment instruction is emitted; the next local read offset is 64) */
/* self.localReadDoCntA 1 self.localReadDoCntB 1 */

/* local read inc b */
/* N/A, lro->64 */
/* (no address-increment instruction is emitted; the next local read offset is 64) */
/* self.localReadDoCntA 1 self.localReadDoCntB 1 */

/* Bind the generic A-operand names used by the MFMAs to the "_0" register sets. */
// NOTE(review): the loop body that follows loads A from global memory into the "_1" sets
// (vgprValuA_X0_I0_1 / vgprValuA_X2_I0_1), which suggests these aliases are re-pointed
// later to alternate between two register sets -- confirm at the loop tail.
.set vgprValuA_X0_I0, vgprValuA_X0_I0_0
.set vgprValuA_X2_I0, vgprValuA_X2_I0_0

/******************************************/
/* Unrolled Loop(s) - Begin               */
/******************************************/
/* Loop-entry dispatch on LoopCounterL:                                  */
/*   == 1  -> label_LoopEndL_odd_NoLoadLoop                              */
/*   <= 2  -> label_LoopEndL_even  (unsigned compare)                    */
/*   else  -> fall through into label_LoopBeginL                         */
label_openLoopL:
s_cmp_eq_u32 s[sgprLoopCounterL], 0x1              // SCC = (LoopCounterL == 1)
s_cbranch_scc1 label_LoopEndL_odd_NoLoadLoop       // exactly one iteration left: go to the odd / no-load tail
s_cmp_le_u32 s[sgprLoopCounterL], 0x2              // SCC = (LoopCounterL <= 2), unsigned
s_cbranch_scc1 label_LoopEndL_even                 // do not enter LoopL
label_LoopBeginL:

/******************************************/
/* Unrolled Loop 1/2 - Begin              */
/******************************************/

s_waitcnt vmcnt(8)

/* Begin Each Unroll: Check VGPR.checkin for INT8 LW */

/* iter 0 */
/*  grEndMfmaIndex:18, lwStartMfmaIndex:35, lwEndMfmaIndex:223  */
/*  numMfmaForLR:30, syncPlrMfmaIndex:225  */
/*  mfmaIndex:0  */
s_waitcnt lgkmcnt(0)                               // wait for prior local read local write old=0, new=0 newLW=0 newLR=0
v_mfma_f32_16x16x32_fp8_fp8 acc[0:3], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[0:3] // left value = acc[0+0:3+0]
/*  mfmaIndex:1  */
buffer_load_dwordx4 v[vgprValuA_X0_I0_1+0:vgprValuA_X0_I0_1+0+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0 // G -> Reg 0_0_0_0
buffer_load_dwordx4 v[vgprValuA_X2_I0_1+0:vgprValuA_X2_I0_1+0+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:0 // G -> Reg 0_0_1_0
/* global read inc B loopL */
s_cmp_eq_u32 s[sgprLoopCounterL], s[sgprStaggerUIter] // Is this the wrapIter?
v_mfma_f32_16x16x32_fp8_fp8 acc[4:7], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[4:7] // left value = acc[4+0:7+0]
/*  mfmaIndex:2  */
ds_read_b128 v[vgprValuB_X2_I0+0:vgprValuB_X2_I0+0+3], v[vgprLocalReadAddrB] offset:64 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0
s_cselect_b32 s80, s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- ?
v_mfma_f32_16x16x32_fp8_fp8 acc[8:11], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[8:11] // left value = acc[8+0:11+0]
/*  mfmaIndex:3  */
s_cselect_b32 s81, s[sgprWrapUB+1], 0              // incUpper <- ?
v_mfma_f32_16x16x32_fp8_fp8 acc[12:15], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[12:15] // left value = acc[12+0:15+0]
/*  mfmaIndex:4  */
s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s80        // gra SRD += inc(lower)
v_mfma_f32_16x16x32_fp8_fp8 acc[16:19], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[16:19] // left value = acc[16+0:19+0]
/*  mfmaIndex:5  */
ds_read_b128 v[vgprValuB_X2_I0+4:vgprValuB_X2_I0+4+3], v[vgprLocalReadAddrB] offset:192 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=2 iui=0
s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s81       // gra SRD += inc(upper)
v_mfma_f32_16x16x32_fp8_fp8 acc[20:23], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[20:23] // left value = acc[20+0:23+0]
/*  mfmaIndex:6  */
s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s80 // limit -= inc)
v_mfma_f32_16x16x32_fp8_fp8 acc[24:27], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[24:27] // left value = acc[24+0:27+0]
/*  mfmaIndex:7  */
s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s81 // limit -= inc)
v_mfma_f32_16x16x32_fp8_fp8 acc[28:31], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[28:31] // left value = acc[28+0:31+0]
/*  mfmaIndex:8  */
ds_read_b128 v[vgprValuB_X2_I0+8:vgprValuB_X2_I0+8+3], v[vgprLocalReadAddrB] offset:320 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=2 iui=0
s_cmp_eq_u32 s[sgprShadowLimitB+1], 0              // are we within 2^32?
v_mfma_f32_16x16x32_fp8_fp8 acc[32:35], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[32:35] // left value = acc[32+0:35+0]
/*  mfmaIndex:9  */
s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32
v_mfma_f32_16x16x32_fp8_fp8 acc[36:39], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[36:39] // left value = acc[36+0:39+0]
/*  mfmaIndex:10  */
v_mfma_f32_16x16x32_fp8_fp8 acc[40:43], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[40:43] // left value = acc[40+0:43+0]
/*  mfmaIndex:11  */
ds_read_b128 v[vgprValuB_X2_I0+12:vgprValuB_X2_I0+12+3], v[vgprLocalReadAddrB] offset:448 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[44:47], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[44:47] // left value = acc[44+0:47+0]
/*  mfmaIndex:12  */
v_mfma_f32_16x16x32_fp8_fp8 acc[48:51], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[48:51] // left value = acc[48+0:51+0]
/*  mfmaIndex:13  */
v_mfma_f32_16x16x32_fp8_fp8 acc[52:55], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[52:55] // left value = acc[52+0:55+0]
/*  mfmaIndex:14  */
ds_read_b128 v[vgprValuB_X2_I0+16:vgprValuB_X2_I0+16+3], v[vgprLocalReadAddrB] offset:576 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[56:59], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[56:59] // left value = acc[56+0:59+0]
/*  mfmaIndex:15  */
v_mfma_f32_16x16x32_fp8_fp8 acc[60:63], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[60:63] // left value = acc[60+0:63+0]
/*  mfmaIndex:16  */
v_mfma_f32_16x16x32_fp8_fp8 acc[64:67], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[64:67] // left value = acc[64+0:67+0]
/*  mfmaIndex:17  */
ds_read_b128 v[vgprValuB_X2_I0+20:vgprValuB_X2_I0+20+3], v[vgprLocalReadAddrB] offset:704 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[68:71], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[68:71] // left value = acc[68+0:71+0]
/*  mfmaIndex:18  */
v_mfma_f32_16x16x32_fp8_fp8 acc[72:75], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[72:75] // left value = acc[72+0:75+0]
/*  mfmaIndex:19  */
v_mfma_f32_16x16x32_fp8_fp8 acc[76:79], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[76:79] // left value = acc[76+0:79+0]
/*  mfmaIndex:20  */
ds_read_b128 v[vgprValuB_X2_I0+24:vgprValuB_X2_I0+24+3], v[vgprLocalReadAddrB] offset:832 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[80:83], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[80:83] // left value = acc[80+0:83+0]
/*  mfmaIndex:21  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[84:87], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[84:87] // left value = acc[84+0:87+0]
/*  mfmaIndex:22  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[88:91], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[88:91] // left value = acc[88+0:91+0]
/*  mfmaIndex:23  */
/* localReadsVacancy: latencyLeft 2 */
ds_read_b128 v[vgprValuB_X2_I0+28:vgprValuB_X2_I0+28+3], v[vgprLocalReadAddrB] offset:960 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[92:95], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[92:95] // left value = acc[92+0:95+0]
/*  mfmaIndex:24  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[96:99], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[96:99] // left value = acc[96+0:99+0]
/*  mfmaIndex:25  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[100:103], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[100:103] // left value = acc[100+0:103+0]
/*  mfmaIndex:26  */
/* localReadsVacancy: latencyLeft 2 */
ds_read_b128 v[vgprValuB_X2_I0+32:vgprValuB_X2_I0+32+3], v[vgprLocalReadAddrB] offset:1088 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=8 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[104:107], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[104:107] // left value = acc[104+0:107+0]
/*  mfmaIndex:27  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[108:111], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[108:111] // left value = acc[108+0:111+0]
/*  mfmaIndex:28  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[112:115], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[112:115] // left value = acc[112+0:115+0]
/*  mfmaIndex:29  */
/* localReadsVacancy: latencyLeft 2 */
ds_read_b128 v[vgprValuB_X2_I0+36:vgprValuB_X2_I0+36+3], v[vgprLocalReadAddrB] offset:1216 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=9 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[116:119], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[116:119] // left value = acc[116+0:119+0]
/*  mfmaIndex:30  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[120:123], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[120:123] // left value = acc[120+0:123+0]
/*  mfmaIndex:31  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[124:127], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[124:127] // left value = acc[124+0:127+0]
/*  mfmaIndex:32  */
/* localReadsVacancy: latencyLeft 2 */
buffer_load_dwordx4 v[vgprValuA_X0_I0_1+4:vgprValuA_X0_I0_1+4+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:0 // G -> Reg 0_0_1_0
buffer_load_dwordx4 v[vgprValuA_X2_I0_1+4:vgprValuA_X2_I0_1+4+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:0 // G -> Reg 0_0_1_0
v_mfma_f32_16x16x32_fp8_fp8 acc[128:131], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[128:131] // left value = acc[128+0:131+0]
/*  mfmaIndex:33  */
/* localReadsVacancy: latencyLeft 2 */
ds_read_b128 v[vgprValuB_X2_I0+40:vgprValuB_X2_I0+40+3], v[vgprLocalReadAddrB] offset:1344 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=10 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[132:135], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[132:135] // left value = acc[132+0:135+0]
/*  mfmaIndex:34  */
v_mfma_f32_16x16x32_fp8_fp8 acc[136:139], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[136:139] // left value = acc[136+0:139+0]
/*  mfmaIndex:35  */
v_mfma_f32_16x16x32_fp8_fp8 acc[140:143], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[140:143] // left value = acc[140+0:143+0]
/*  mfmaIndex:36  */
ds_read_b128 v[vgprValuB_X2_I0+44:vgprValuB_X2_I0+44+3], v[vgprLocalReadAddrB] offset:1472 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=11 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[144:147], v[vgprValuB_X0_I0+36+0+0:vgprValuB_X0_I0+36+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[144:147] // left value = acc[144+0:147+0]
/*  mfmaIndex:37  */
v_mfma_f32_16x16x32_fp8_fp8 acc[148:151], v[vgprValuB_X0_I0+36+0+0:vgprValuB_X0_I0+36+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[148:151] // left value = acc[148+0:151+0]
/*  mfmaIndex:38  */
v_mfma_f32_16x16x32_fp8_fp8 acc[152:155], v[vgprValuB_X0_I0+36+0+0:vgprValuB_X0_I0+36+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[152:155] // left value = acc[152+0:155+0]
/*  mfmaIndex:39  */
ds_read_b128 v[vgprValuB_X2_I0+48:vgprValuB_X2_I0+48+3], v[vgprLocalReadAddrB] offset:1600 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=12 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[156:159], v[vgprValuB_X0_I0+36+0+0:vgprValuB_X0_I0+36+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[156:159] // left value = acc[156+0:159+0]
/*  mfmaIndex:40  */
v_mfma_f32_16x16x32_fp8_fp8 acc[160:163], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[160:163] // left value = acc[160+0:163+0]
/*  mfmaIndex:41  */
v_mfma_f32_16x16x32_fp8_fp8 acc[164:167], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[164:167] // left value = acc[164+0:167+0]
/*  mfmaIndex:42  */
ds_read_b128 v[vgprValuB_X2_I0+52:vgprValuB_X2_I0+52+3], v[vgprLocalReadAddrB] offset:1728 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=13 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[168:171], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[168:171] // left value = acc[168+0:171+0]
/*  mfmaIndex:43  */
v_mfma_f32_16x16x32_fp8_fp8 acc[172:175], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[172:175] // left value = acc[172+0:175+0]
/*  mfmaIndex:44  */
v_mfma_f32_16x16x32_fp8_fp8 acc[176:179], v[vgprValuB_X0_I0+44+0+0:vgprValuB_X0_I0+44+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[176:179] // left value = acc[176+0:179+0]
/*  mfmaIndex:45  */
ds_read_b128 v[vgprValuB_X2_I0+56:vgprValuB_X2_I0+56+3], v[vgprLocalReadAddrB] offset:1856 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=14 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[180:183], v[vgprValuB_X0_I0+44+0+0:vgprValuB_X0_I0+44+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[180:183] // left value = acc[180+0:183+0]
/*  mfmaIndex:46  */
v_mfma_f32_16x16x32_fp8_fp8 acc[184:187], v[vgprValuB_X0_I0+44+0+0:vgprValuB_X0_I0+44+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[184:187] // left value = acc[184+0:187+0]
/*  mfmaIndex:47  */
v_mfma_f32_16x16x32_fp8_fp8 acc[188:191], v[vgprValuB_X0_I0+44+0+0:vgprValuB_X0_I0+44+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[188:191] // left value = acc[188+0:191+0]
/*  mfmaIndex:48  */
ds_read_b128 v[vgprValuB_X2_I0+60:vgprValuB_X2_I0+60+3], v[vgprLocalReadAddrB] offset:1984 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=15 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[192:195], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[192:195] // left value = acc[192+0:195+0]
/*  mfmaIndex:49  */
v_mfma_f32_16x16x32_fp8_fp8 acc[196:199], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[196:199] // left value = acc[196+0:199+0]
/*  mfmaIndex:50  */
v_mfma_f32_16x16x32_fp8_fp8 acc[200:203], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[200:203] // left value = acc[200+0:203+0]
/*  mfmaIndex:51  */
v_mfma_f32_16x16x32_fp8_fp8 acc[204:207], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[204:207] // left value = acc[204+0:207+0]
/*  mfmaIndex:52  */
v_mfma_f32_16x16x32_fp8_fp8 acc[208:211], v[vgprValuB_X0_I0+52+0+0:vgprValuB_X0_I0+52+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[208:211] // left value = acc[208+0:211+0]
/*  mfmaIndex:53  */
v_mfma_f32_16x16x32_fp8_fp8 acc[212:215], v[vgprValuB_X0_I0+52+0+0:vgprValuB_X0_I0+52+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[212:215] // left value = acc[212+0:215+0]
/*  mfmaIndex:54  */
v_mfma_f32_16x16x32_fp8_fp8 acc[216:219], v[vgprValuB_X0_I0+52+0+0:vgprValuB_X0_I0+52+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[216:219] // left value = acc[216+0:219+0]
/*  mfmaIndex:55  */
v_mfma_f32_16x16x32_fp8_fp8 acc[220:223], v[vgprValuB_X0_I0+52+0+0:vgprValuB_X0_I0+52+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[220:223] // left value = acc[220+0:223+0]
/*  mfmaIndex:56  */
v_mfma_f32_16x16x32_fp8_fp8 acc[224:227], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[224:227] // left value = acc[224+0:227+0]
/*  mfmaIndex:57  */
v_mfma_f32_16x16x32_fp8_fp8 acc[228:231], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[228:231] // left value = acc[228+0:231+0]
/*  mfmaIndex:58  */
v_mfma_f32_16x16x32_fp8_fp8 acc[232:235], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[232:235] // left value = acc[232+0:235+0]
/*  mfmaIndex:59  */
v_mfma_f32_16x16x32_fp8_fp8 acc[236:239], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[236:239] // left value = acc[236+0:239+0]
/*  mfmaIndex:60  */
v_mfma_f32_16x16x32_fp8_fp8 acc[240:243], v[vgprValuB_X0_I0+60+0+0:vgprValuB_X0_I0+60+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[240:243] // left value = acc[240+0:243+0]
/*  mfmaIndex:61  */
v_mfma_f32_16x16x32_fp8_fp8 acc[244:247], v[vgprValuB_X0_I0+60+0+0:vgprValuB_X0_I0+60+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[244:247] // left value = acc[244+0:247+0]
/*  mfmaIndex:62  */
/* 1 LDS buffer: read-sync-write */
s_waitcnt lgkmcnt(0)
s_barrier
v_mfma_f32_16x16x32_fp8_fp8 acc[248:251], v[vgprValuB_X0_I0+60+0+0:vgprValuB_X0_I0+60+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[248:251] // left value = acc[248+0:251+0]
/*  mfmaIndex:63  */
s_waitcnt vmcnt(11)                                // wait for global read before writing to local
v_mfma_f32_16x16x32_fp8_fp8 acc[252:255], v[vgprValuB_X0_I0+60+0+0:vgprValuB_X0_I0+60+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[252:255] // left value = acc[252+0:255+0]
/* numPrefetchIter=0 */
/* dataAtIterA=-1 numReadsIterA=1 skipReadsIterA=1 readsPerIterA=4 */
/* dataAtIterB=-1 numReadsIterB=1 skipReadsIterB=1 readsPerIterB=16 */

/* iter 1 */
/*  grEndMfmaIndex:18, lwStartMfmaIndex:35, lwEndMfmaIndex:223  */
/*  numMfmaForLR:30, syncPlrMfmaIndex:225  */
/*  mfmaIndex:64  */
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+0:vgprG2LB+0+3] offset:0 // lwoB_0_0_0_0 = (0*LSCB)*(MT1J+PAD) + (0*LSPB) = 0
v_mfma_f32_16x16x32_fp8_fp8 acc[0:3], v[vgprValuB_X0_I0+0+2+0:vgprValuB_X0_I0+0+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[0:3] // left value = acc[0+0:3+0]
/*  mfmaIndex:65  */
buffer_load_dwordx4 v[vgprValuA_X0_I0_1+8:vgprValuA_X0_I0_1+8+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:0 // G -> Reg 0_0_2_0
buffer_load_dwordx4 v[vgprValuA_X2_I0_1+8:vgprValuA_X2_I0_1+8+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:0 // G -> Reg 0_0_1_0
v_mfma_f32_16x16x32_fp8_fp8 acc[4:7], v[vgprValuB_X0_I0+0+2+0:vgprValuB_X0_I0+0+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[4:7] // left value = acc[4+0:7+0]
/*  mfmaIndex:66  */
v_mfma_f32_16x16x32_fp8_fp8 acc[8:11], v[vgprValuB_X0_I0+0+2+0:vgprValuB_X0_I0+0+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[8:11] // left value = acc[8+0:11+0]
/*  mfmaIndex:67  */
v_mfma_f32_16x16x32_fp8_fp8 acc[12:15], v[vgprValuB_X0_I0+0+2+0:vgprValuB_X0_I0+0+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[12:15] // left value = acc[12+0:15+0]
/*  mfmaIndex:68  */
v_mfma_f32_16x16x32_fp8_fp8 acc[16:19], v[vgprValuB_X0_I0+4+2+0:vgprValuB_X0_I0+4+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[16:19] // left value = acc[16+0:19+0]
/*  mfmaIndex:69  */
v_mfma_f32_16x16x32_fp8_fp8 acc[20:23], v[vgprValuB_X0_I0+4+2+0:vgprValuB_X0_I0+4+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[20:23] // left value = acc[20+0:23+0]
/*  mfmaIndex:70  */
v_mfma_f32_16x16x32_fp8_fp8 acc[24:27], v[vgprValuB_X0_I0+4+2+0:vgprValuB_X0_I0+4+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[24:27] // left value = acc[24+0:27+0]
/*  mfmaIndex:71  */
v_mfma_f32_16x16x32_fp8_fp8 acc[28:31], v[vgprValuB_X0_I0+4+2+0:vgprValuB_X0_I0+4+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[28:31] // left value = acc[28+0:31+0]
/*  mfmaIndex:72  */
v_mfma_f32_16x16x32_fp8_fp8 acc[32:35], v[vgprValuB_X0_I0+8+2+0:vgprValuB_X0_I0+8+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[32:35] // left value = acc[32+0:35+0]
/*  mfmaIndex:73  */
v_mfma_f32_16x16x32_fp8_fp8 acc[36:39], v[vgprValuB_X0_I0+8+2+0:vgprValuB_X0_I0+8+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[36:39] // left value = acc[36+0:39+0]
/*  mfmaIndex:74  */
v_mfma_f32_16x16x32_fp8_fp8 acc[40:43], v[vgprValuB_X0_I0+8+2+0:vgprValuB_X0_I0+8+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[40:43] // left value = acc[40+0:43+0]
/*  mfmaIndex:75  */
v_mfma_f32_16x16x32_fp8_fp8 acc[44:47], v[vgprValuB_X0_I0+8+2+0:vgprValuB_X0_I0+8+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[44:47] // left value = acc[44+0:47+0]
/*  mfmaIndex:76  */
v_mfma_f32_16x16x32_fp8_fp8 acc[48:51], v[vgprValuB_X0_I0+12+2+0:vgprValuB_X0_I0+12+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[48:51] // left value = acc[48+0:51+0]
/*  mfmaIndex:77  */
v_mfma_f32_16x16x32_fp8_fp8 acc[52:55], v[vgprValuB_X0_I0+12+2+0:vgprValuB_X0_I0+12+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[52:55] // left value = acc[52+0:55+0]
/*  mfmaIndex:78  */
s_waitcnt vmcnt(12)                                // wait for global read before writing to local
v_mfma_f32_16x16x32_fp8_fp8 acc[56:59], v[vgprValuB_X0_I0+12+2+0:vgprValuB_X0_I0+12+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[56:59] // left value = acc[56+0:59+0]
/*  mfmaIndex:79  */
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+4:vgprG2LB+4+3] offset:4160 // lwoB_0_0_1_0 = (0*LSCB)*(MT1J+PAD) + (1*LSPB) = 4160
v_mfma_f32_16x16x32_fp8_fp8 acc[60:63], v[vgprValuB_X0_I0+12+2+0:vgprValuB_X0_I0+12+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[60:63] // left value = acc[60+0:63+0]
/*  mfmaIndex:80  */
v_mfma_f32_16x16x32_fp8_fp8 acc[64:67], v[vgprValuB_X0_I0+16+2+0:vgprValuB_X0_I0+16+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[64:67] // left value = acc[64+0:67+0]
/*  mfmaIndex:81  */
v_mfma_f32_16x16x32_fp8_fp8 acc[68:71], v[vgprValuB_X0_I0+16+2+0:vgprValuB_X0_I0+16+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[68:71] // left value = acc[68+0:71+0]
/*  mfmaIndex:82  */
v_mfma_f32_16x16x32_fp8_fp8 acc[72:75], v[vgprValuB_X0_I0+16+2+0:vgprValuB_X0_I0+16+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[72:75] // left value = acc[72+0:75+0]
/*  mfmaIndex:83  */
v_mfma_f32_16x16x32_fp8_fp8 acc[76:79], v[vgprValuB_X0_I0+16+2+0:vgprValuB_X0_I0+16+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[76:79] // left value = acc[76+0:79+0]
/*  mfmaIndex:84  */
v_mfma_f32_16x16x32_fp8_fp8 acc[80:83], v[vgprValuB_X0_I0+20+2+0:vgprValuB_X0_I0+20+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[80:83] // left value = acc[80+0:83+0]
/*  mfmaIndex:85  */
v_mfma_f32_16x16x32_fp8_fp8 acc[84:87], v[vgprValuB_X0_I0+20+2+0:vgprValuB_X0_I0+20+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[84:87] // left value = acc[84+0:87+0]
/*  mfmaIndex:86  */
v_mfma_f32_16x16x32_fp8_fp8 acc[88:91], v[vgprValuB_X0_I0+20+2+0:vgprValuB_X0_I0+20+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[88:91] // left value = acc[88+0:91+0]
/*  mfmaIndex:87  */
v_mfma_f32_16x16x32_fp8_fp8 acc[92:95], v[vgprValuB_X0_I0+20+2+0:vgprValuB_X0_I0+20+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[92:95] // left value = acc[92+0:95+0]
/*  mfmaIndex:88  */
v_mfma_f32_16x16x32_fp8_fp8 acc[96:99], v[vgprValuB_X0_I0+24+2+0:vgprValuB_X0_I0+24+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[96:99] // left value = acc[96+0:99+0]
/*  mfmaIndex:89  */
v_mfma_f32_16x16x32_fp8_fp8 acc[100:103], v[vgprValuB_X0_I0+24+2+0:vgprValuB_X0_I0+24+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[100:103] // left value = acc[100+0:103+0]
/*  mfmaIndex:90  */
v_mfma_f32_16x16x32_fp8_fp8 acc[104:107], v[vgprValuB_X0_I0+24+2+0:vgprValuB_X0_I0+24+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[104:107] // left value = acc[104+0:107+0]
/*  mfmaIndex:91  */
v_mfma_f32_16x16x32_fp8_fp8 acc[108:111], v[vgprValuB_X0_I0+24+2+0:vgprValuB_X0_I0+24+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[108:111] // left value = acc[108+0:111+0]
/*  mfmaIndex:92  */
v_mfma_f32_16x16x32_fp8_fp8 acc[112:115], v[vgprValuB_X0_I0+28+2+0:vgprValuB_X0_I0+28+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[112:115] // left value = acc[112+0:115+0]
/*  mfmaIndex:93  */
v_mfma_f32_16x16x32_fp8_fp8 acc[116:119], v[vgprValuB_X0_I0+28+2+0:vgprValuB_X0_I0+28+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[116:119] // left value = acc[116+0:119+0]
/*  mfmaIndex:94  */
s_waitcnt vmcnt(11)                                // wait for global read before writing to local
v_mfma_f32_16x16x32_fp8_fp8 acc[120:123], v[vgprValuB_X0_I0+28+2+0:vgprValuB_X0_I0+28+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[120:123] // left value = acc[120+0:123+0]
/*  mfmaIndex:95  */
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+8:vgprG2LB+8+3] offset:8320 // lwoB_0_0_2_0 = (0*LSCB)*(MT1J+PAD) + (2*LSPB) = 8320
v_mfma_f32_16x16x32_fp8_fp8 acc[124:127], v[vgprValuB_X0_I0+28+2+0:vgprValuB_X0_I0+28+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[124:127] // left value = acc[124+0:127+0]
/*  mfmaIndex:96  */
/* Last two A global reads visible in iter 1. They are issued BEFORE the     */
/* sgprSrdA update below, so they use the SRD as it stands at this point.    */
/* NOTE(review): both loads carry the same "0_0_1_0" tag although they use   */
/* different scalar offsets (+2 / +6); the tags look stale - confirm.        */
buffer_load_dwordx4 v[vgprValuA_X0_I0_1+12:vgprValuA_X0_I0_1+12+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:0 // G -> Reg 0_0_1_0
buffer_load_dwordx4 v[vgprValuA_X2_I0_1+12:vgprValuA_X2_I0_1+12+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:0 // G -> Reg 0_0_1_0
/* global read inc A loopL */
/* Advances the A buffer descriptor for the next loop iteration, one scalar  */
/* instruction per MFMA slot. The sequence is order-dependent through SCC:   */
/*   1. s_cmp_eq_u32 sets SCC = (LoopCounterL == StaggerUIter)               */
/*   2. two s_cselect_b32 pick the 64-bit increment s[81:80]:                */
/*        SCC=1 -> WrapUA[1:0], SCC=0 -> {0, GlobalReadIncsA+0}              */
/*   3. s_add_u32 / s_addc_u32   : SrdA[1:0]         += s[81:80] (carry via SCC)  */
/*   4. s_sub_u32 / s_subb_u32   : ShadowLimitA[1:0] -= s[81:80] (borrow via SCC) */
/*   5. s_cmp_eq_u32 + s_cselect : SrdA+2 = (ShadowLimitA+1 == 0)            */
/*                                 ? ShadowLimitA+0 : BufferLimit            */
/* The interleaved v_mfma instructions are vector ops and do not write SCC,  */
/* so the SCC producer/consumer pairs survive the interleaving. Do not       */
/* insert any SCC-writing scalar instruction between them.                   */
/* s80/s81 are raw-numbered scratch SGPRs.                                   */
/* NOTE(review): assumes no symbolic SGPR that is live here aliases s80/s81 - confirm. */
s_cmp_eq_u32 s[sgprLoopCounterL], s[sgprStaggerUIter] // Is this the wrapIter?
v_mfma_f32_16x16x32_fp8_fp8 acc[128:131], v[vgprValuB_X0_I0+32+2+0:vgprValuB_X0_I0+32+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[128:131] // left value = acc[128+0:131+0]
/*  mfmaIndex:97  */
s_cselect_b32 s80, s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- wrap ? WrapUA+0 : GlobalReadIncsA+0
v_mfma_f32_16x16x32_fp8_fp8 acc[132:135], v[vgprValuB_X0_I0+32+2+0:vgprValuB_X0_I0+32+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[132:135] // left value = acc[132+0:135+0]
/*  mfmaIndex:98  */
s_cselect_b32 s81, s[sgprWrapUA+1], 0              // incUpper <- wrap ? WrapUA+1 : 0 (still uses SCC from the s_cmp above)
v_mfma_f32_16x16x32_fp8_fp8 acc[136:139], v[vgprValuB_X0_I0+32+2+0:vgprValuB_X0_I0+32+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[136:139] // left value = acc[136+0:139+0]
/*  mfmaIndex:99  */
s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s80        // gra SRD += inc(lower); carry-out left in SCC
v_mfma_f32_16x16x32_fp8_fp8 acc[140:143], v[vgprValuB_X0_I0+32+2+0:vgprValuB_X0_I0+32+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[140:143] // left value = acc[140+0:143+0]
/*  mfmaIndex:100  */
s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s81       // gra SRD += inc(upper) + carry
v_mfma_f32_16x16x32_fp8_fp8 acc[144:147], v[vgprValuB_X0_I0+36+2+0:vgprValuB_X0_I0+36+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[144:147] // left value = acc[144+0:147+0]
/*  mfmaIndex:101  */
s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s80 // limit -= inc(lower); borrow-out left in SCC
v_mfma_f32_16x16x32_fp8_fp8 acc[148:151], v[vgprValuB_X0_I0+36+2+0:vgprValuB_X0_I0+36+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[148:151] // left value = acc[148+0:151+0]
/*  mfmaIndex:102  */
s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s81 // limit -= inc(upper) + borrow
v_mfma_f32_16x16x32_fp8_fp8 acc[152:155], v[vgprValuB_X0_I0+36+2+0:vgprValuB_X0_I0+36+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[152:155] // left value = acc[152+0:155+0]
/*  mfmaIndex:103  */
s_cmp_eq_u32 s[sgprShadowLimitA+1], 0              // are we within 2^32?
v_mfma_f32_16x16x32_fp8_fp8 acc[156:159], v[vgprValuB_X0_I0+36+2+0:vgprValuB_X0_I0+36+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[156:159] // left value = acc[156+0:159+0]
/*  mfmaIndex:104  */
s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32
v_mfma_f32_16x16x32_fp8_fp8 acc[160:163], v[vgprValuB_X0_I0+40+2+0:vgprValuB_X0_I0+40+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[160:163] // left value = acc[160+0:163+0]
/*  mfmaIndex:105  */
v_mfma_f32_16x16x32_fp8_fp8 acc[164:167], v[vgprValuB_X0_I0+40+2+0:vgprValuB_X0_I0+40+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[164:167] // left value = acc[164+0:167+0]
/*  mfmaIndex:106  */
v_mfma_f32_16x16x32_fp8_fp8 acc[168:171], v[vgprValuB_X0_I0+40+2+0:vgprValuB_X0_I0+40+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[168:171] // left value = acc[168+0:171+0]
/*  mfmaIndex:107  */
v_mfma_f32_16x16x32_fp8_fp8 acc[172:175], v[vgprValuB_X0_I0+40+2+0:vgprValuB_X0_I0+40+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[172:175] // left value = acc[172+0:175+0]
/*  mfmaIndex:108  */
v_mfma_f32_16x16x32_fp8_fp8 acc[176:179], v[vgprValuB_X0_I0+44+2+0:vgprValuB_X0_I0+44+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[176:179] // left value = acc[176+0:179+0]
/*  mfmaIndex:109  */
v_mfma_f32_16x16x32_fp8_fp8 acc[180:183], v[vgprValuB_X0_I0+44+2+0:vgprValuB_X0_I0+44+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[180:183] // left value = acc[180+0:183+0]
/*  mfmaIndex:110  */
s_waitcnt vmcnt(12)                                // wait for global read before writing to local
v_mfma_f32_16x16x32_fp8_fp8 acc[184:187], v[vgprValuB_X0_I0+44+2+0:vgprValuB_X0_I0+44+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[184:187] // left value = acc[184+0:187+0]
/*  mfmaIndex:111  */
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+12:vgprG2LB+12+3] offset:12480 // lwoB_0_0_3_0 = (0*LSCB)*(MT1J+PAD) + (3*LSPB) = 12480
v_mfma_f32_16x16x32_fp8_fp8 acc[188:191], v[vgprValuB_X0_I0+44+2+0:vgprValuB_X0_I0+44+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[188:191] // left value = acc[188+0:191+0]
/*  mfmaIndex:112  */
v_mfma_f32_16x16x32_fp8_fp8 acc[192:195], v[vgprValuB_X0_I0+48+2+0:vgprValuB_X0_I0+48+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[192:195] // left value = acc[192+0:195+0]
/*  mfmaIndex:113  */
v_mfma_f32_16x16x32_fp8_fp8 acc[196:199], v[vgprValuB_X0_I0+48+2+0:vgprValuB_X0_I0+48+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[196:199] // left value = acc[196+0:199+0]
/*  mfmaIndex:114  */
v_mfma_f32_16x16x32_fp8_fp8 acc[200:203], v[vgprValuB_X0_I0+48+2+0:vgprValuB_X0_I0+48+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[200:203] // left value = acc[200+0:203+0]
/*  mfmaIndex:115  */
v_mfma_f32_16x16x32_fp8_fp8 acc[204:207], v[vgprValuB_X0_I0+48+2+0:vgprValuB_X0_I0+48+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[204:207] // left value = acc[204+0:207+0]
/*  mfmaIndex:116  */
v_mfma_f32_16x16x32_fp8_fp8 acc[208:211], v[vgprValuB_X0_I0+52+2+0:vgprValuB_X0_I0+52+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[208:211] // left value = acc[208+0:211+0]
/*  mfmaIndex:117  */
v_mfma_f32_16x16x32_fp8_fp8 acc[212:215], v[vgprValuB_X0_I0+52+2+0:vgprValuB_X0_I0+52+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[212:215] // left value = acc[212+0:215+0]
/*  mfmaIndex:118  */
v_mfma_f32_16x16x32_fp8_fp8 acc[216:219], v[vgprValuB_X0_I0+52+2+0:vgprValuB_X0_I0+52+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[216:219] // left value = acc[216+0:219+0]
/*  mfmaIndex:119  */
v_mfma_f32_16x16x32_fp8_fp8 acc[220:223], v[vgprValuB_X0_I0+52+2+0:vgprValuB_X0_I0+52+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[220:223] // left value = acc[220+0:223+0]
/*  mfmaIndex:120  */
v_mfma_f32_16x16x32_fp8_fp8 acc[224:227], v[vgprValuB_X0_I0+56+2+0:vgprValuB_X0_I0+56+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[224:227] // left value = acc[224+0:227+0]
/*  mfmaIndex:121  */
v_mfma_f32_16x16x32_fp8_fp8 acc[228:231], v[vgprValuB_X0_I0+56+2+0:vgprValuB_X0_I0+56+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[228:231] // left value = acc[228+0:231+0]
/*  mfmaIndex:122  */
v_mfma_f32_16x16x32_fp8_fp8 acc[232:235], v[vgprValuB_X0_I0+56+2+0:vgprValuB_X0_I0+56+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[232:235] // left value = acc[232+0:235+0]
/*  mfmaIndex:123  */
v_mfma_f32_16x16x32_fp8_fp8 acc[236:239], v[vgprValuB_X0_I0+56+2+0:vgprValuB_X0_I0+56+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[236:239] // left value = acc[236+0:239+0]
/*  mfmaIndex:124  */
v_mfma_f32_16x16x32_fp8_fp8 acc[240:243], v[vgprValuB_X0_I0+60+2+0:vgprValuB_X0_I0+60+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[240:243] // left value = acc[240+0:243+0]
/*  mfmaIndex:125  */
v_mfma_f32_16x16x32_fp8_fp8 acc[244:247], v[vgprValuB_X0_I0+60+2+0:vgprValuB_X0_I0+60+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[244:247] // left value = acc[244+0:247+0]
/*  mfmaIndex:126  */
s_waitcnt vmcnt(11)                                // wait for global read before writing to local
v_mfma_f32_16x16x32_fp8_fp8 acc[248:251], v[vgprValuB_X0_I0+60+2+0:vgprValuB_X0_I0+60+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[248:251] // left value = acc[248+0:251+0]
/*  mfmaIndex:127  */
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+16:vgprG2LB+16+3] offset:16640 // lwoB_0_0_4_0 = (0*LSCB)*(MT1J+PAD) + (4*LSPB) = 16640
v_mfma_f32_16x16x32_fp8_fp8 acc[252:255], v[vgprValuB_X0_I0+60+2+0:vgprValuB_X0_I0+60+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[252:255] // left value = acc[252+0:255+0]
/* numPrefetchIter=0 */
/* dataAtIterA=-1 numReadsIterA=1 skipReadsIterA=1 readsPerIterA=4 */
/* dataAtIterB=-1 numReadsIterB=1 skipReadsIterB=1 readsPerIterB=16 */

/* iter 2 (reset local read pointers iteration)  (swap local read pointers iteration)  */
/*  grEndMfmaIndex:18, lwStartMfmaIndex:35, lwEndMfmaIndex:223  */
/*  numMfmaForLR:30, syncPlrMfmaIndex:225  */
/*  mfmaIndex:128  */
buffer_load_dwordx4 v[vgprG2LB+0:vgprG2LB+0+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0 // G -> Reg 0_0_0_0
v_mfma_f32_16x16x32_fp8_fp8 acc[0:3], v[vgprValuB_X2_I0+0+0+0:vgprValuB_X2_I0+0+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[0:3] // left value = acc[0+0:3+0]
/*  mfmaIndex:129  */
v_mfma_f32_16x16x32_fp8_fp8 acc[4:7], v[vgprValuB_X2_I0+0+0+0:vgprValuB_X2_I0+0+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[4:7] // left value = acc[4+0:7+0]
/*  mfmaIndex:130  */
v_mfma_f32_16x16x32_fp8_fp8 acc[8:11], v[vgprValuB_X2_I0+0+0+0:vgprValuB_X2_I0+0+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[8:11] // left value = acc[8+0:11+0]
/*  mfmaIndex:131  */
v_mfma_f32_16x16x32_fp8_fp8 acc[12:15], v[vgprValuB_X2_I0+0+0+0:vgprValuB_X2_I0+0+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[12:15] // left value = acc[12+0:15+0]
/*  mfmaIndex:132  */
v_mfma_f32_16x16x32_fp8_fp8 acc[16:19], v[vgprValuB_X2_I0+4+0+0:vgprValuB_X2_I0+4+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[16:19] // left value = acc[16+0:19+0]
/*  mfmaIndex:133  */
v_mfma_f32_16x16x32_fp8_fp8 acc[20:23], v[vgprValuB_X2_I0+4+0+0:vgprValuB_X2_I0+4+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[20:23] // left value = acc[20+0:23+0]
/*  mfmaIndex:134  */
v_mfma_f32_16x16x32_fp8_fp8 acc[24:27], v[vgprValuB_X2_I0+4+0+0:vgprValuB_X2_I0+4+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[24:27] // left value = acc[24+0:27+0]
/*  mfmaIndex:135  */
v_mfma_f32_16x16x32_fp8_fp8 acc[28:31], v[vgprValuB_X2_I0+4+0+0:vgprValuB_X2_I0+4+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[28:31] // left value = acc[28+0:31+0]
/*  mfmaIndex:136  */
v_mfma_f32_16x16x32_fp8_fp8 acc[32:35], v[vgprValuB_X2_I0+8+0+0:vgprValuB_X2_I0+8+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[32:35] // left value = acc[32+0:35+0]
/*  mfmaIndex:137  */
v_mfma_f32_16x16x32_fp8_fp8 acc[36:39], v[vgprValuB_X2_I0+8+0+0:vgprValuB_X2_I0+8+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[36:39] // left value = acc[36+0:39+0]
/*  mfmaIndex:138  */
v_mfma_f32_16x16x32_fp8_fp8 acc[40:43], v[vgprValuB_X2_I0+8+0+0:vgprValuB_X2_I0+8+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[40:43] // left value = acc[40+0:43+0]
/*  mfmaIndex:139  */
v_mfma_f32_16x16x32_fp8_fp8 acc[44:47], v[vgprValuB_X2_I0+8+0+0:vgprValuB_X2_I0+8+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[44:47] // left value = acc[44+0:47+0]
/*  mfmaIndex:140  */
v_mfma_f32_16x16x32_fp8_fp8 acc[48:51], v[vgprValuB_X2_I0+12+0+0:vgprValuB_X2_I0+12+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[48:51] // left value = acc[48+0:51+0]
/*  mfmaIndex:141  */
v_mfma_f32_16x16x32_fp8_fp8 acc[52:55], v[vgprValuB_X2_I0+12+0+0:vgprValuB_X2_I0+12+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[52:55] // left value = acc[52+0:55+0]
/*  mfmaIndex:142  */
s_waitcnt vmcnt(11)                                // wait for global read before writing to local
v_mfma_f32_16x16x32_fp8_fp8 acc[56:59], v[vgprValuB_X2_I0+12+0+0:vgprValuB_X2_I0+12+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[56:59] // left value = acc[56+0:59+0]
/*  mfmaIndex:143  */
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+20:vgprG2LB+20+3] offset:20800 // lwoB_0_0_5_0 = (0*LSCB)*(MT1J+PAD) + (5*LSPB) = 20800
v_mfma_f32_16x16x32_fp8_fp8 acc[60:63], v[vgprValuB_X2_I0+12+0+0:vgprValuB_X2_I0+12+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[60:63] // left value = acc[60+0:63+0]
/*  mfmaIndex:144  */
buffer_load_dwordx4 v[vgprG2LB+4:vgprG2LB+4+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:0 // G -> Reg 0_0_1_0
v_mfma_f32_16x16x32_fp8_fp8 acc[64:67], v[vgprValuB_X2_I0+16+0+0:vgprValuB_X2_I0+16+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[64:67] // left value = acc[64+0:67+0]
/*  mfmaIndex:145  */
v_mfma_f32_16x16x32_fp8_fp8 acc[68:71], v[vgprValuB_X2_I0+16+0+0:vgprValuB_X2_I0+16+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[68:71] // left value = acc[68+0:71+0]
/*  mfmaIndex:146  */
v_mfma_f32_16x16x32_fp8_fp8 acc[72:75], v[vgprValuB_X2_I0+16+0+0:vgprValuB_X2_I0+16+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[72:75] // left value = acc[72+0:75+0]
/*  mfmaIndex:147  */
/* sched write - iter 2 writesPerItem=1 */
v_mfma_f32_16x16x32_fp8_fp8 acc[76:79], v[vgprValuB_X2_I0+16+0+0:vgprValuB_X2_I0+16+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[76:79] // left value = acc[76+0:79+0]
/*  mfmaIndex:148  */
v_mfma_f32_16x16x32_fp8_fp8 acc[80:83], v[vgprValuB_X2_I0+20+0+0:vgprValuB_X2_I0+20+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[80:83] // left value = acc[80+0:83+0]
/*  mfmaIndex:149  */
v_mfma_f32_16x16x32_fp8_fp8 acc[84:87], v[vgprValuB_X2_I0+20+0+0:vgprValuB_X2_I0+20+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[84:87] // left value = acc[84+0:87+0]
/*  mfmaIndex:150  */
v_mfma_f32_16x16x32_fp8_fp8 acc[88:91], v[vgprValuB_X2_I0+20+0+0:vgprValuB_X2_I0+20+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[88:91] // left value = acc[88+0:91+0]
/*  mfmaIndex:151  */
v_mfma_f32_16x16x32_fp8_fp8 acc[92:95], v[vgprValuB_X2_I0+20+0+0:vgprValuB_X2_I0+20+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[92:95] // left value = acc[92+0:95+0]
/*  mfmaIndex:152  */
v_mfma_f32_16x16x32_fp8_fp8 acc[96:99], v[vgprValuB_X2_I0+24+0+0:vgprValuB_X2_I0+24+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[96:99] // left value = acc[96+0:99+0]
/*  mfmaIndex:153  */
v_mfma_f32_16x16x32_fp8_fp8 acc[100:103], v[vgprValuB_X2_I0+24+0+0:vgprValuB_X2_I0+24+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[100:103] // left value = acc[100+0:103+0]
/*  mfmaIndex:154  */
v_mfma_f32_16x16x32_fp8_fp8 acc[104:107], v[vgprValuB_X2_I0+24+0+0:vgprValuB_X2_I0+24+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[104:107] // left value = acc[104+0:107+0]
/*  mfmaIndex:155  */
v_mfma_f32_16x16x32_fp8_fp8 acc[108:111], v[vgprValuB_X2_I0+24+0+0:vgprValuB_X2_I0+24+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[108:111] // left value = acc[108+0:111+0]
/*  mfmaIndex:156  */
v_mfma_f32_16x16x32_fp8_fp8 acc[112:115], v[vgprValuB_X2_I0+28+0+0:vgprValuB_X2_I0+28+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[112:115] // left value = acc[112+0:115+0]
/*  mfmaIndex:157  */
v_mfma_f32_16x16x32_fp8_fp8 acc[116:119], v[vgprValuB_X2_I0+28+0+0:vgprValuB_X2_I0+28+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[116:119] // left value = acc[116+0:119+0]
/*  mfmaIndex:158  */
s_waitcnt vmcnt(11)                                // wait for global read before writing to local
v_mfma_f32_16x16x32_fp8_fp8 acc[120:123], v[vgprValuB_X2_I0+28+0+0:vgprValuB_X2_I0+28+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[120:123] // left value = acc[120+0:123+0]
/*  mfmaIndex:159  */
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+24:vgprG2LB+24+3] offset:24960 // lwoB_0_0_6_0 = (0*LSCB)*(MT1J+PAD) + (6*LSPB) = 24960
v_mfma_f32_16x16x32_fp8_fp8 acc[124:127], v[vgprValuB_X2_I0+28+0+0:vgprValuB_X2_I0+28+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[124:127] // left value = acc[124+0:127+0]
/*  mfmaIndex:160  */
buffer_load_dwordx4 v[vgprG2LB+8:vgprG2LB+8+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:0 // G -> Reg 0_0_2_0
v_mfma_f32_16x16x32_fp8_fp8 acc[128:131], v[vgprValuB_X2_I0+32+0+0:vgprValuB_X2_I0+32+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[128:131] // left value = acc[128+0:131+0]
/*  mfmaIndex:161  */
v_mfma_f32_16x16x32_fp8_fp8 acc[132:135], v[vgprValuB_X2_I0+32+0+0:vgprValuB_X2_I0+32+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[132:135] // left value = acc[132+0:135+0]
/*  mfmaIndex:162  */
v_mfma_f32_16x16x32_fp8_fp8 acc[136:139], v[vgprValuB_X2_I0+32+0+0:vgprValuB_X2_I0+32+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[136:139] // left value = acc[136+0:139+0]
/*  mfmaIndex:163  */
v_mfma_f32_16x16x32_fp8_fp8 acc[140:143], v[vgprValuB_X2_I0+32+0+0:vgprValuB_X2_I0+32+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[140:143] // left value = acc[140+0:143+0]
/*  mfmaIndex:164  */
v_mfma_f32_16x16x32_fp8_fp8 acc[144:147], v[vgprValuB_X2_I0+36+0+0:vgprValuB_X2_I0+36+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[144:147] // left value = acc[144+0:147+0]
/*  mfmaIndex:165  */
v_mfma_f32_16x16x32_fp8_fp8 acc[148:151], v[vgprValuB_X2_I0+36+0+0:vgprValuB_X2_I0+36+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[148:151] // left value = acc[148+0:151+0]
/*  mfmaIndex:166  */
v_mfma_f32_16x16x32_fp8_fp8 acc[152:155], v[vgprValuB_X2_I0+36+0+0:vgprValuB_X2_I0+36+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[152:155] // left value = acc[152+0:155+0]
/*  mfmaIndex:167  */
v_mfma_f32_16x16x32_fp8_fp8 acc[156:159], v[vgprValuB_X2_I0+36+0+0:vgprValuB_X2_I0+36+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[156:159] // left value = acc[156+0:159+0]
/*  mfmaIndex:168  */
v_mfma_f32_16x16x32_fp8_fp8 acc[160:163], v[vgprValuB_X2_I0+40+0+0:vgprValuB_X2_I0+40+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[160:163] // left value = acc[160+0:163+0]
/*  mfmaIndex:169  */
v_mfma_f32_16x16x32_fp8_fp8 acc[164:167], v[vgprValuB_X2_I0+40+0+0:vgprValuB_X2_I0+40+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[164:167] // left value = acc[164+0:167+0]
/*  mfmaIndex:170  */
v_mfma_f32_16x16x32_fp8_fp8 acc[168:171], v[vgprValuB_X2_I0+40+0+0:vgprValuB_X2_I0+40+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[168:171] // left value = acc[168+0:171+0]
/*  mfmaIndex:171  */
v_mfma_f32_16x16x32_fp8_fp8 acc[172:175], v[vgprValuB_X2_I0+40+0+0:vgprValuB_X2_I0+40+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[172:175] // left value = acc[172+0:175+0]
/*  mfmaIndex:172  */
v_mfma_f32_16x16x32_fp8_fp8 acc[176:179], v[vgprValuB_X2_I0+44+0+0:vgprValuB_X2_I0+44+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[176:179] // left value = acc[176+0:179+0]
/*  mfmaIndex:173  */
v_mfma_f32_16x16x32_fp8_fp8 acc[180:183], v[vgprValuB_X2_I0+44+0+0:vgprValuB_X2_I0+44+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[180:183] // left value = acc[180+0:183+0]
/*  mfmaIndex:174  */
s_waitcnt vmcnt(11)                                // wait for global read before writing to local
v_mfma_f32_16x16x32_fp8_fp8 acc[184:187], v[vgprValuB_X2_I0+44+0+0:vgprValuB_X2_I0+44+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[184:187] // left value = acc[184+0:187+0]
/*  mfmaIndex:175  */
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+28:vgprG2LB+28+3] offset:29120 // lwoB_0_0_7_0 = (0*LSCB)*(MT1J+PAD) + (7*LSPB) = 29120
v_mfma_f32_16x16x32_fp8_fp8 acc[188:191], v[vgprValuB_X2_I0+44+0+0:vgprValuB_X2_I0+44+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[188:191] // left value = acc[188+0:191+0]
/*  mfmaIndex:176  */
buffer_load_dwordx4 v[vgprG2LB+12:vgprG2LB+12+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:0 // G -> Reg 0_0_3_0
v_mfma_f32_16x16x32_fp8_fp8 acc[192:195], v[vgprValuB_X2_I0+48+0+0:vgprValuB_X2_I0+48+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[192:195] // left value = acc[192+0:195+0]
/*  mfmaIndex:177  */
v_mfma_f32_16x16x32_fp8_fp8 acc[196:199], v[vgprValuB_X2_I0+48+0+0:vgprValuB_X2_I0+48+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[196:199] // left value = acc[196+0:199+0]
/*  mfmaIndex:178  */
v_mfma_f32_16x16x32_fp8_fp8 acc[200:203], v[vgprValuB_X2_I0+48+0+0:vgprValuB_X2_I0+48+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[200:203] // left value = acc[200+0:203+0]
/*  mfmaIndex:179  */
v_mfma_f32_16x16x32_fp8_fp8 acc[204:207], v[vgprValuB_X2_I0+48+0+0:vgprValuB_X2_I0+48+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[204:207] // left value = acc[204+0:207+0]
/*  mfmaIndex:180  */
v_mfma_f32_16x16x32_fp8_fp8 acc[208:211], v[vgprValuB_X2_I0+52+0+0:vgprValuB_X2_I0+52+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[208:211] // left value = acc[208+0:211+0]
/*  mfmaIndex:181  */
v_mfma_f32_16x16x32_fp8_fp8 acc[212:215], v[vgprValuB_X2_I0+52+0+0:vgprValuB_X2_I0+52+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[212:215] // left value = acc[212+0:215+0]
/*  mfmaIndex:182  */
v_mfma_f32_16x16x32_fp8_fp8 acc[216:219], v[vgprValuB_X2_I0+52+0+0:vgprValuB_X2_I0+52+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[216:219] // left value = acc[216+0:219+0]
/*  mfmaIndex:183  */
v_mfma_f32_16x16x32_fp8_fp8 acc[220:223], v[vgprValuB_X2_I0+52+0+0:vgprValuB_X2_I0+52+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[220:223] // left value = acc[220+0:223+0]
/*  mfmaIndex:184  */
v_mfma_f32_16x16x32_fp8_fp8 acc[224:227], v[vgprValuB_X2_I0+56+0+0:vgprValuB_X2_I0+56+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[224:227] // left value = acc[224+0:227+0]
/*  mfmaIndex:185  */
v_mfma_f32_16x16x32_fp8_fp8 acc[228:231], v[vgprValuB_X2_I0+56+0+0:vgprValuB_X2_I0+56+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[228:231] // left value = acc[228+0:231+0]
/*  mfmaIndex:186  */
v_mfma_f32_16x16x32_fp8_fp8 acc[232:235], v[vgprValuB_X2_I0+56+0+0:vgprValuB_X2_I0+56+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[232:235] // left value = acc[232+0:235+0]
/*  mfmaIndex:187  */
v_mfma_f32_16x16x32_fp8_fp8 acc[236:239], v[vgprValuB_X2_I0+56+0+0:vgprValuB_X2_I0+56+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[236:239] // left value = acc[236+0:239+0]
/*  mfmaIndex:188  */
v_mfma_f32_16x16x32_fp8_fp8 acc[240:243], v[vgprValuB_X2_I0+60+0+0:vgprValuB_X2_I0+60+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[240:243] // left value = acc[240+0:243+0]
/*  mfmaIndex:189  */
v_mfma_f32_16x16x32_fp8_fp8 acc[244:247], v[vgprValuB_X2_I0+60+0+0:vgprValuB_X2_I0+60+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[244:247] // left value = acc[244+0:247+0]
/*  mfmaIndex:190  */
v_mfma_f32_16x16x32_fp8_fp8 acc[248:251], v[vgprValuB_X2_I0+60+0+0:vgprValuB_X2_I0+60+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[248:251] // left value = acc[248+0:251+0]
/*  mfmaIndex:191  */

/* The four generator sections below (swap offsets a/b, init pointers a/b)   */
/* contain no instructions at this point in the schedule, although the       */
/* iter 2 header labels this as the reset/swap local-read-pointers           */
/* iteration. Only the section markers remain.                               */
/* NOTE(review): presumably no local-read pointer update is needed for this  */
/* configuration - confirm against the generator settings.                   */

/* local read swap offsets a */

/* local read swap offsets b */

/* local read init pointers a */

/* localReadInitPointers */

/* local read init pointers b */

/* localReadInitPointers */
/* Final MFMA of iter 2 (mfmaIndex 191): accumulates into acc[252:255], the last accumulator quad. */
v_mfma_f32_16x16x32_fp8_fp8 acc[252:255], v[vgprValuB_X2_I0+60+0+0:vgprValuB_X2_I0+60+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[252:255] // left value = acc[252+0:255+0]
/* numPrefetchIter=0 */
/* dataAtIterA=0 numReadsIterA=1 skipReadsIterA=0 readsPerIterA=4 */
/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=0 readsPerIterB=16 */

/* iter 3 (swap and reset local write pointers iteration)  */
/*  grEndMfmaIndex:18, lwStartMfmaIndex:35, lwEndMfmaIndex:223  */
/*  numMfmaForLR:30, syncPlrMfmaIndex:225  */
/*  mfmaIndex:192  */
buffer_load_dwordx4 v[vgprG2LB+16:vgprG2LB+16+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:0 // G -> Reg 0_0_4_0
v_mfma_f32_16x16x32_fp8_fp8 acc[0:3], v[vgprValuB_X2_I0+0+2+0:vgprValuB_X2_I0+0+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[0:3] // left value = acc[0+0:3+0]
/*  mfmaIndex:193  */
v_mfma_f32_16x16x32_fp8_fp8 acc[4:7], v[vgprValuB_X2_I0+0+2+0:vgprValuB_X2_I0+0+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[4:7] // left value = acc[4+0:7+0]
/*  mfmaIndex:194  */
v_mfma_f32_16x16x32_fp8_fp8 acc[8:11], v[vgprValuB_X2_I0+0+2+0:vgprValuB_X2_I0+0+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[8:11] // left value = acc[8+0:11+0]
/*  mfmaIndex:195  */
v_mfma_f32_16x16x32_fp8_fp8 acc[12:15], v[vgprValuB_X2_I0+0+2+0:vgprValuB_X2_I0+0+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[12:15] // left value = acc[12+0:15+0]
/*  mfmaIndex:196  */
v_mfma_f32_16x16x32_fp8_fp8 acc[16:19], v[vgprValuB_X2_I0+4+2+0:vgprValuB_X2_I0+4+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[16:19] // left value = acc[16+0:19+0]
/*  mfmaIndex:197  */
v_mfma_f32_16x16x32_fp8_fp8 acc[20:23], v[vgprValuB_X2_I0+4+2+0:vgprValuB_X2_I0+4+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[20:23] // left value = acc[20+0:23+0]
/*  mfmaIndex:198  */
v_mfma_f32_16x16x32_fp8_fp8 acc[24:27], v[vgprValuB_X2_I0+4+2+0:vgprValuB_X2_I0+4+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[24:27] // left value = acc[24+0:27+0]
/*  mfmaIndex:199  */
v_mfma_f32_16x16x32_fp8_fp8 acc[28:31], v[vgprValuB_X2_I0+4+2+0:vgprValuB_X2_I0+4+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[28:31] // left value = acc[28+0:31+0]
/*  mfmaIndex:200  */
v_mfma_f32_16x16x32_fp8_fp8 acc[32:35], v[vgprValuB_X2_I0+8+2+0:vgprValuB_X2_I0+8+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[32:35] // left value = acc[32+0:35+0]
/*  mfmaIndex:201  */
v_mfma_f32_16x16x32_fp8_fp8 acc[36:39], v[vgprValuB_X2_I0+8+2+0:vgprValuB_X2_I0+8+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[36:39] // left value = acc[36+0:39+0]
/*  mfmaIndex:202  */
v_mfma_f32_16x16x32_fp8_fp8 acc[40:43], v[vgprValuB_X2_I0+8+2+0:vgprValuB_X2_I0+8+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[40:43] // left value = acc[40+0:43+0]
/*  mfmaIndex:203  */
v_mfma_f32_16x16x32_fp8_fp8 acc[44:47], v[vgprValuB_X2_I0+8+2+0:vgprValuB_X2_I0+8+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[44:47] // left value = acc[44+0:47+0]
/*  mfmaIndex:204  */
v_mfma_f32_16x16x32_fp8_fp8 acc[48:51], v[vgprValuB_X2_I0+12+2+0:vgprValuB_X2_I0+12+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[48:51] // left value = acc[48+0:51+0]
/*  mfmaIndex:205  */
v_mfma_f32_16x16x32_fp8_fp8 acc[52:55], v[vgprValuB_X2_I0+12+2+0:vgprValuB_X2_I0+12+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[52:55] // left value = acc[52+0:55+0]
/*  mfmaIndex:206  */
v_mfma_f32_16x16x32_fp8_fp8 acc[56:59], v[vgprValuB_X2_I0+12+2+0:vgprValuB_X2_I0+12+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[56:59] // left value = acc[56+0:59+0]
/*  mfmaIndex:207  */
v_mfma_f32_16x16x32_fp8_fp8 acc[60:63], v[vgprValuB_X2_I0+12+2+0:vgprValuB_X2_I0+12+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[60:63] // left value = acc[60+0:63+0]
/*  mfmaIndex:208  */
buffer_load_dwordx4 v[vgprG2LB+20:vgprG2LB+20+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:0 // G -> Reg 0_0_5_0
v_mfma_f32_16x16x32_fp8_fp8 acc[64:67], v[vgprValuB_X2_I0+16+2+0:vgprValuB_X2_I0+16+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[64:67] // left value = acc[64+0:67+0]
/*  mfmaIndex:209  */
v_mfma_f32_16x16x32_fp8_fp8 acc[68:71], v[vgprValuB_X2_I0+16+2+0:vgprValuB_X2_I0+16+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[68:71] // left value = acc[68+0:71+0]
/*  mfmaIndex:210  */
v_mfma_f32_16x16x32_fp8_fp8 acc[72:75], v[vgprValuB_X2_I0+16+2+0:vgprValuB_X2_I0+16+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[72:75] // left value = acc[72+0:75+0]
/*  mfmaIndex:211  */
v_mfma_f32_16x16x32_fp8_fp8 acc[76:79], v[vgprValuB_X2_I0+16+2+0:vgprValuB_X2_I0+16+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[76:79] // left value = acc[76+0:79+0]
/*  mfmaIndex:212  */
v_mfma_f32_16x16x32_fp8_fp8 acc[80:83], v[vgprValuB_X2_I0+20+2+0:vgprValuB_X2_I0+20+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[80:83] // left value = acc[80+0:83+0]
/*  mfmaIndex:213  */
v_mfma_f32_16x16x32_fp8_fp8 acc[84:87], v[vgprValuB_X2_I0+20+2+0:vgprValuB_X2_I0+20+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[84:87] // left value = acc[84+0:87+0]
/*  mfmaIndex:214  */
v_mfma_f32_16x16x32_fp8_fp8 acc[88:91], v[vgprValuB_X2_I0+20+2+0:vgprValuB_X2_I0+20+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[88:91] // left value = acc[88+0:91+0]
/*  mfmaIndex:215  */
v_mfma_f32_16x16x32_fp8_fp8 acc[92:95], v[vgprValuB_X2_I0+20+2+0:vgprValuB_X2_I0+20+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[92:95] // left value = acc[92+0:95+0]
/*  mfmaIndex:216  */
v_mfma_f32_16x16x32_fp8_fp8 acc[96:99], v[vgprValuB_X2_I0+24+2+0:vgprValuB_X2_I0+24+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[96:99] // left value = acc[96+0:99+0]
/*  mfmaIndex:217  */
v_mfma_f32_16x16x32_fp8_fp8 acc[100:103], v[vgprValuB_X2_I0+24+2+0:vgprValuB_X2_I0+24+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[100:103] // left value = acc[100+0:103+0]
/*  mfmaIndex:218  */
v_mfma_f32_16x16x32_fp8_fp8 acc[104:107], v[vgprValuB_X2_I0+24+2+0:vgprValuB_X2_I0+24+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[104:107] // left value = acc[104+0:107+0]
/*  mfmaIndex:219  */
v_mfma_f32_16x16x32_fp8_fp8 acc[108:111], v[vgprValuB_X2_I0+24+2+0:vgprValuB_X2_I0+24+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[108:111] // left value = acc[108+0:111+0]
/*  mfmaIndex:220  */
v_mfma_f32_16x16x32_fp8_fp8 acc[112:115], v[vgprValuB_X2_I0+28+2+0:vgprValuB_X2_I0+28+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[112:115] // left value = acc[112+0:115+0]
/*  mfmaIndex:221  */
v_mfma_f32_16x16x32_fp8_fp8 acc[116:119], v[vgprValuB_X2_I0+28+2+0:vgprValuB_X2_I0+28+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[116:119] // left value = acc[116+0:119+0]
/*  mfmaIndex:222  */

/* local write swap offsets a */

/* local write swap offsets b */
/* Synchronisation point: first drain this wave's LGKM counter, then rendezvous at s_barrier. */
/* The ds_read_b128 sequence into vgprValuB_X0_I0 is issued only after this barrier. */
s_waitcnt lgkmcnt(0)                               // wait for local write: block until lgkmcnt reaches 0
// Skip force waitcnt0
s_barrier
v_mfma_f32_16x16x32_fp8_fp8 acc[120:123], v[vgprValuB_X2_I0+28+2+0:vgprValuB_X2_I0+28+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[120:123] // left value = acc[120+0:123+0]
/*  mfmaIndex:223  */
ds_read_b128 v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprLocalReadAddrB] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[124:127], v[vgprValuB_X2_I0+28+2+0:vgprValuB_X2_I0+28+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[124:127] // left value = acc[124+0:127+0]
/*  mfmaIndex:224  */
buffer_load_dwordx4 v[vgprG2LB+24:vgprG2LB+24+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:0 // G -> Reg 0_0_6_0
v_mfma_f32_16x16x32_fp8_fp8 acc[128:131], v[vgprValuB_X2_I0+32+2+0:vgprValuB_X2_I0+32+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[128:131] // left value = acc[128+0:131+0]
/*  mfmaIndex:225  */
ds_read_b128 v[vgprValuB_X0_I0+4:vgprValuB_X0_I0+4+3], v[vgprLocalReadAddrB] offset:128 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[132:135], v[vgprValuB_X2_I0+32+2+0:vgprValuB_X2_I0+32+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[132:135] // left value = acc[132+0:135+0]
/*  mfmaIndex:226  */
v_mfma_f32_16x16x32_fp8_fp8 acc[136:139], v[vgprValuB_X2_I0+32+2+0:vgprValuB_X2_I0+32+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[136:139] // left value = acc[136+0:139+0]
/*  mfmaIndex:227  */
ds_read_b128 v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprLocalReadAddrB] offset:256 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[140:143], v[vgprValuB_X2_I0+32+2+0:vgprValuB_X2_I0+32+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[140:143] // left value = acc[140+0:143+0]
/*  mfmaIndex:228  */
v_mfma_f32_16x16x32_fp8_fp8 acc[144:147], v[vgprValuB_X2_I0+36+2+0:vgprValuB_X2_I0+36+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[144:147] // left value = acc[144+0:147+0]
/*  mfmaIndex:229  */
ds_read_b128 v[vgprValuB_X0_I0+12:vgprValuB_X0_I0+12+3], v[vgprLocalReadAddrB] offset:384 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[148:151], v[vgprValuB_X2_I0+36+2+0:vgprValuB_X2_I0+36+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[148:151] // left value = acc[148+0:151+0]
/*  mfmaIndex:230  */
v_mfma_f32_16x16x32_fp8_fp8 acc[152:155], v[vgprValuB_X2_I0+36+2+0:vgprValuB_X2_I0+36+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[152:155] // left value = acc[152+0:155+0]
/*  mfmaIndex:231  */
ds_read_b128 v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprLocalReadAddrB] offset:512 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[156:159], v[vgprValuB_X2_I0+36+2+0:vgprValuB_X2_I0+36+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[156:159] // left value = acc[156+0:159+0]
/*  mfmaIndex:232  */
v_mfma_f32_16x16x32_fp8_fp8 acc[160:163], v[vgprValuB_X2_I0+40+2+0:vgprValuB_X2_I0+40+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[160:163] // left value = acc[160+0:163+0]
/*  mfmaIndex:233  */
ds_read_b128 v[vgprValuB_X0_I0+20:vgprValuB_X0_I0+20+3], v[vgprLocalReadAddrB] offset:640 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[164:167], v[vgprValuB_X2_I0+40+2+0:vgprValuB_X2_I0+40+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[164:167] // left value = acc[164+0:167+0]
/*  mfmaIndex:234  */
v_mfma_f32_16x16x32_fp8_fp8 acc[168:171], v[vgprValuB_X2_I0+40+2+0:vgprValuB_X2_I0+40+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[168:171] // left value = acc[168+0:171+0]
/*  mfmaIndex:235  */
ds_read_b128 v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprLocalReadAddrB] offset:768 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[172:175], v[vgprValuB_X2_I0+40+2+0:vgprValuB_X2_I0+40+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[172:175] // left value = acc[172+0:175+0]
/*  mfmaIndex:236  */
v_mfma_f32_16x16x32_fp8_fp8 acc[176:179], v[vgprValuB_X2_I0+44+2+0:vgprValuB_X2_I0+44+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[176:179] // left value = acc[176+0:179+0]
/*  mfmaIndex:237  */
ds_read_b128 v[vgprValuB_X0_I0+28:vgprValuB_X0_I0+28+3], v[vgprLocalReadAddrB] offset:896 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[180:183], v[vgprValuB_X2_I0+44+2+0:vgprValuB_X2_I0+44+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[180:183] // left value = acc[180+0:183+0]
/*  mfmaIndex:238  */
v_mfma_f32_16x16x32_fp8_fp8 acc[184:187], v[vgprValuB_X2_I0+44+2+0:vgprValuB_X2_I0+44+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[184:187] // left value = acc[184+0:187+0]
/*  mfmaIndex:239  */
ds_read_b128 v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprLocalReadAddrB] offset:1024 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=8 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[188:191], v[vgprValuB_X2_I0+44+2+0:vgprValuB_X2_I0+44+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[188:191] // left value = acc[188+0:191+0]
/*  mfmaIndex:240  */
buffer_load_dwordx4 v[vgprG2LB+28:vgprG2LB+28+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:0 // G -> Reg 0_0_7_0
v_mfma_f32_16x16x32_fp8_fp8 acc[192:195], v[vgprValuB_X2_I0+48+2+0:vgprValuB_X2_I0+48+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[192:195] // left value = acc[192+0:195+0]
/*  mfmaIndex:241  */
v_mfma_f32_16x16x32_fp8_fp8 acc[196:199], v[vgprValuB_X2_I0+48+2+0:vgprValuB_X2_I0+48+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[196:199] // left value = acc[196+0:199+0]
/*  mfmaIndex:242  */
ds_read_b128 v[vgprValuB_X0_I0+36:vgprValuB_X0_I0+36+3], v[vgprLocalReadAddrB] offset:1152 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=9 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[200:203], v[vgprValuB_X2_I0+48+2+0:vgprValuB_X2_I0+48+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[200:203] // left value = acc[200+0:203+0]
/*  mfmaIndex:243  */
v_mfma_f32_16x16x32_fp8_fp8 acc[204:207], v[vgprValuB_X2_I0+48+2+0:vgprValuB_X2_I0+48+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[204:207] // left value = acc[204+0:207+0]
/*  mfmaIndex:244  */
ds_read_b128 v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprLocalReadAddrB] offset:1280 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=10 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[208:211], v[vgprValuB_X2_I0+52+2+0:vgprValuB_X2_I0+52+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[208:211] // left value = acc[208+0:211+0]
/*  mfmaIndex:245  */
v_mfma_f32_16x16x32_fp8_fp8 acc[212:215], v[vgprValuB_X2_I0+52+2+0:vgprValuB_X2_I0+52+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[212:215] // left value = acc[212+0:215+0]
/*  mfmaIndex:246  */
ds_read_b128 v[vgprValuB_X0_I0+44:vgprValuB_X0_I0+44+3], v[vgprLocalReadAddrB] offset:1408 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=11 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[216:219], v[vgprValuB_X2_I0+52+2+0:vgprValuB_X2_I0+52+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[216:219] // left value = acc[216+0:219+0]
/*  mfmaIndex:247  */
v_mfma_f32_16x16x32_fp8_fp8 acc[220:223], v[vgprValuB_X2_I0+52+2+0:vgprValuB_X2_I0+52+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[220:223] // left value = acc[220+0:223+0]
/*  mfmaIndex:248  */
ds_read_b128 v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprLocalReadAddrB] offset:1536 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=12 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[224:227], v[vgprValuB_X2_I0+56+2+0:vgprValuB_X2_I0+56+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[224:227] // left value = acc[224+0:227+0]
/*  mfmaIndex:249  */
v_mfma_f32_16x16x32_fp8_fp8 acc[228:231], v[vgprValuB_X2_I0+56+2+0:vgprValuB_X2_I0+56+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[228:231] // left value = acc[228+0:231+0]
/*  mfmaIndex:250  */
ds_read_b128 v[vgprValuB_X0_I0+52:vgprValuB_X0_I0+52+3], v[vgprLocalReadAddrB] offset:1664 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=13 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[232:235], v[vgprValuB_X2_I0+56+2+0:vgprValuB_X2_I0+56+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[232:235] // left value = acc[232+0:235+0]
/*  mfmaIndex:251  */
v_mfma_f32_16x16x32_fp8_fp8 acc[236:239], v[vgprValuB_X2_I0+56+2+0:vgprValuB_X2_I0+56+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[236:239] // left value = acc[236+0:239+0]
/*  mfmaIndex:252  */
ds_read_b128 v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprLocalReadAddrB] offset:1792 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=14 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[240:243], v[vgprValuB_X2_I0+60+2+0:vgprValuB_X2_I0+60+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[240:243] // left value = acc[240+0:243+0]
/*  mfmaIndex:253  */
v_mfma_f32_16x16x32_fp8_fp8 acc[244:247], v[vgprValuB_X2_I0+60+2+0:vgprValuB_X2_I0+60+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[244:247] // left value = acc[244+0:247+0]
/*  mfmaIndex:254  */
ds_read_b128 v[vgprValuB_X0_I0+60:vgprValuB_X0_I0+60+3], v[vgprLocalReadAddrB] offset:1920 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=15 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[248:251], v[vgprValuB_X2_I0+60+2+0:vgprValuB_X2_I0+60+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[248:251] // left value = acc[248+0:251+0]
/*  mfmaIndex:255  */
v_mfma_f32_16x16x32_fp8_fp8 acc[252:255], v[vgprValuB_X2_I0+60+2+0:vgprValuB_X2_I0+60+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[252:255] // left value = acc[252+0:255+0]
/* numPrefetchIter=1 */
/* dataAtIterA=0 numReadsIterA=1 skipReadsIterA=1 readsPerIterA=4 */
/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=1 readsPerIterB=16 */

/******************************************/
/* Unrolled Loop - End                    */
/******************************************/

/* closeLoop loopL finalLoop=1 tailLoop=0 */
/* Decrement the loop counter; if the decremented value equals 2, leave through label_LoopEndL_odd, */
/* otherwise fall through into Unrolled Loop 2/2. */
s_sub_u32 s[sgprLoopCounterL], s[sgprLoopCounterL], 1 // dec counterL
s_cmp_eq_i32 s[sgprLoopCounterL], 0x2                 // SCC = (counterL == 2). NOTE(review): this comment used to read "counterL==1" although the immediate is 0x2 - confirm the intended exit count
s_cbranch_scc1 label_LoopEndL_odd          // taken when SCC is set: exit to label_LoopEndL_odd

/******************************************/
/* Unrolled Loop 2/2 - Begin              */
/******************************************/

/* Rebind the A operand symbols to the "_1" register sets for this loop half. */
/* The MFMAs below read A through vgprValuA_X0_I0 / vgprValuA_X2_I0, while the buffer_load */
/* instructions of this half write the "_0" sets (vgprValuA_X0_I0_0 / vgprValuA_X2_I0_0). */
.set vgprValuA_X0_I0, vgprValuA_X0_I0_1
.set vgprValuA_X2_I0, vgprValuA_X2_I0_1

/* Block until at most 8 vector-memory operations are still outstanding. */
/* NOTE(review): presumably this guarantees the A data for the "_1" sets has landed before the first MFMA reads it - confirm against the load order of loop 1/2. */
s_waitcnt vmcnt(8)

/* Begin Each Unroll: Check VGPR.checkin for INT8 LW */

/* iter 0 */
/*  grEndMfmaIndex:18, lwStartMfmaIndex:35, lwEndMfmaIndex:223  */
/*  numMfmaForLR:30, syncPlrMfmaIndex:225  */
/*  mfmaIndex:0  */
s_waitcnt lgkmcnt(0)                               // wait for prior local read local write old=0, new=0 newLW=0 newLR=0
v_mfma_f32_16x16x32_fp8_fp8 acc[0:3], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[0:3] // left value = acc[0+0:3+0]
/*  mfmaIndex:1  */
/* Global reads of A straight into MFMA operand registers. Destinations are the "_0" register sets, */
/* not the "_1" sets that vgprValuA_X0_I0 / vgprValuA_X2_I0 are bound to in this loop half. */
buffer_load_dwordx4 v[vgprValuA_X0_I0_0+0:vgprValuA_X0_I0_0+0+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0 // G -> Reg 0_0_0_0
buffer_load_dwordx4 v[vgprValuA_X2_I0_0+0:vgprValuA_X2_I0_0+0+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:0 // G -> Reg 0_0_1_0
/* global read inc B loopL */
/* Scalar sequence, interleaved one instruction per MFMA slot: */
/*   1. SCC = (LoopCounterL == StaggerUIter) */
/*   2. s81:s80 = SCC ? WrapUB[1:0] : {0, GlobalReadIncsB[0]} */
/*   3. SrdB[1:0] += s81:s80 (64-bit add with carry) */
/*   4. ShadowLimitB[1:0] -= s81:s80 (64-bit subtract with borrow) */
/*   5. SrdB[2] = (ShadowLimitB[1] == 0) ? ShadowLimitB[0] : BufferLimit */
/* The order of the scalar instructions relative to each other must be kept: each pair communicates through SCC, */
/* and nothing placed between the members of a pair (MFMA, ds_read) writes SCC. */
s_cmp_eq_u32 s[sgprLoopCounterL], s[sgprStaggerUIter] // Is this the wrapIter? SCC stays live for the next two s_cselect
v_mfma_f32_16x16x32_fp8_fp8 acc[4:7], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[4:7] // left value = acc[4+0:7+0]
/*  mfmaIndex:2  */
ds_read_b128 v[vgprValuB_X2_I0+0:vgprValuB_X2_I0+0+3], v[vgprLocalReadAddrB] offset:64 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0
s_cselect_b32 s80, s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower = wrapIter ? WrapUB[0] : GlobalReadIncsB[0]
v_mfma_f32_16x16x32_fp8_fp8 acc[8:11], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[8:11] // left value = acc[8+0:11+0]
/*  mfmaIndex:3  */
s_cselect_b32 s81, s[sgprWrapUB+1], 0              // incUpper = wrapIter ? WrapUB[1] : 0 (same SCC as incLower)
v_mfma_f32_16x16x32_fp8_fp8 acc[12:15], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[12:15] // left value = acc[12+0:15+0]
/*  mfmaIndex:4  */
s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s80        // gra SRD += inc(lower); carry-out goes to SCC
v_mfma_f32_16x16x32_fp8_fp8 acc[16:19], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[16:19] // left value = acc[16+0:19+0]
/*  mfmaIndex:5  */
ds_read_b128 v[vgprValuB_X2_I0+4:vgprValuB_X2_I0+4+3], v[vgprLocalReadAddrB] offset:192 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=2 iui=0
s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s81       // gra SRD += inc(upper) + carry from the lower add
v_mfma_f32_16x16x32_fp8_fp8 acc[20:23], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[20:23] // left value = acc[20+0:23+0]
/*  mfmaIndex:6  */
s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s80 // limit -= inc(lower); borrow-out goes to SCC
v_mfma_f32_16x16x32_fp8_fp8 acc[24:27], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[24:27] // left value = acc[24+0:27+0]
/*  mfmaIndex:7  */
s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s81 // limit -= inc(upper) + borrow from the lower subtract
v_mfma_f32_16x16x32_fp8_fp8 acc[28:31], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[28:31] // left value = acc[28+0:31+0]
/*  mfmaIndex:8  */
ds_read_b128 v[vgprValuB_X2_I0+8:vgprValuB_X2_I0+8+3], v[vgprLocalReadAddrB] offset:320 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=2 iui=0
s_cmp_eq_u32 s[sgprShadowLimitB+1], 0              // are we within 2^32? (upper dword of the shadow limit is zero)
v_mfma_f32_16x16x32_fp8_fp8 acc[32:35], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[32:35] // left value = acc[32+0:35+0]
/*  mfmaIndex:9  */
s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32, else use BufferLimit
v_mfma_f32_16x16x32_fp8_fp8 acc[36:39], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[36:39] // left value = acc[36+0:39+0]
/*  mfmaIndex:10  */
v_mfma_f32_16x16x32_fp8_fp8 acc[40:43], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[40:43] // left value = acc[40+0:43+0]
/*  mfmaIndex:11  */
ds_read_b128 v[vgprValuB_X2_I0+12:vgprValuB_X2_I0+12+3], v[vgprLocalReadAddrB] offset:448 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[44:47], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[44:47] // left value = acc[44+0:47+0]
/*  mfmaIndex:12  */
v_mfma_f32_16x16x32_fp8_fp8 acc[48:51], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[48:51] // left value = acc[48+0:51+0]
/*  mfmaIndex:13  */
v_mfma_f32_16x16x32_fp8_fp8 acc[52:55], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[52:55] // left value = acc[52+0:55+0]
/*  mfmaIndex:14  */
ds_read_b128 v[vgprValuB_X2_I0+16:vgprValuB_X2_I0+16+3], v[vgprLocalReadAddrB] offset:576 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[56:59], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[56:59] // left value = acc[56+0:59+0]
/*  mfmaIndex:15  */
v_mfma_f32_16x16x32_fp8_fp8 acc[60:63], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[60:63] // left value = acc[60+0:63+0]
/*  mfmaIndex:16  */
v_mfma_f32_16x16x32_fp8_fp8 acc[64:67], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[64:67] // left value = acc[64+0:67+0]
/*  mfmaIndex:17  */
ds_read_b128 v[vgprValuB_X2_I0+20:vgprValuB_X2_I0+20+3], v[vgprLocalReadAddrB] offset:704 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[68:71], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[68:71] // left value = acc[68+0:71+0]
/*  mfmaIndex:18  */
v_mfma_f32_16x16x32_fp8_fp8 acc[72:75], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[72:75] // left value = acc[72+0:75+0]
/*  mfmaIndex:19  */
v_mfma_f32_16x16x32_fp8_fp8 acc[76:79], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[76:79] // left value = acc[76+0:79+0]
/*  mfmaIndex:20  */
ds_read_b128 v[vgprValuB_X2_I0+24:vgprValuB_X2_I0+24+3], v[vgprLocalReadAddrB] offset:832 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[80:83], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[80:83] // left value = acc[80+0:83+0]
/*  mfmaIndex:21  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[84:87], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[84:87] // left value = acc[84+0:87+0]
/*  mfmaIndex:22  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[88:91], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[88:91] // left value = acc[88+0:91+0]
/*  mfmaIndex:23  */
/* localReadsVacancy: latencyLeft 2 */
ds_read_b128 v[vgprValuB_X2_I0+28:vgprValuB_X2_I0+28+3], v[vgprLocalReadAddrB] offset:960 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[92:95], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[92:95] // left value = acc[92+0:95+0]
/*  mfmaIndex:24  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[96:99], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[96:99] // left value = acc[96+0:99+0]
/*  mfmaIndex:25  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[100:103], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[100:103] // left value = acc[100+0:103+0]
/*  mfmaIndex:26  */
/* localReadsVacancy: latencyLeft 2 */
ds_read_b128 v[vgprValuB_X2_I0+32:vgprValuB_X2_I0+32+3], v[vgprLocalReadAddrB] offset:1088 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=8 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[104:107], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[104:107] // left value = acc[104+0:107+0]
/*  mfmaIndex:27  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[108:111], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[108:111] // left value = acc[108+0:111+0]
/*  mfmaIndex:28  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[112:115], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[112:115] // left value = acc[112+0:115+0]
/*  mfmaIndex:29  */
/* localReadsVacancy: latencyLeft 2 */
ds_read_b128 v[vgprValuB_X2_I0+36:vgprValuB_X2_I0+36+3], v[vgprLocalReadAddrB] offset:1216 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=9 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[116:119], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[116:119] // left value = acc[116+0:119+0]
/*  mfmaIndex:30  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[120:123], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[120:123] // left value = acc[120+0:123+0]
/*  mfmaIndex:31  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[124:127], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[124:127] // left value = acc[124+0:127+0]
/*  mfmaIndex:32  */
/* localReadsVacancy: latencyLeft 2 */
/* Second pair of A global reads into the "_0" register sets (dwords +4..+7 of each set). */
/* NOTE(review): both loads below carry the tag "G -> Reg 0_0_1_0", the same tag as an earlier load in this loop half; the tags look copy-pasted - confirm the intended indices. */
buffer_load_dwordx4 v[vgprValuA_X0_I0_0+4:vgprValuA_X0_I0_0+4+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:0 // G -> Reg 0_0_1_0
buffer_load_dwordx4 v[vgprValuA_X2_I0_0+4:vgprValuA_X2_I0_0+4+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:0 // G -> Reg 0_0_1_0
v_mfma_f32_16x16x32_fp8_fp8 acc[128:131], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[128:131] // left value = acc[128+0:131+0]
/*  mfmaIndex:33  */
/* localReadsVacancy: latencyLeft 2 */
ds_read_b128 v[vgprValuB_X2_I0+40:vgprValuB_X2_I0+40+3], v[vgprLocalReadAddrB] offset:1344 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=10 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[132:135], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[132:135] // left value = acc[132+0:135+0]
/*  mfmaIndex:34  */
v_mfma_f32_16x16x32_fp8_fp8 acc[136:139], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[136:139] // left value = acc[136+0:139+0]
/*  mfmaIndex:35  */
v_mfma_f32_16x16x32_fp8_fp8 acc[140:143], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[140:143] // left value = acc[140+0:143+0]
/*  mfmaIndex:36  */
ds_read_b128 v[vgprValuB_X2_I0+44:vgprValuB_X2_I0+44+3], v[vgprLocalReadAddrB] offset:1472 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=11 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[144:147], v[vgprValuB_X0_I0+36+0+0:vgprValuB_X0_I0+36+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[144:147] // left value = acc[144+0:147+0]
/*  mfmaIndex:37  */
v_mfma_f32_16x16x32_fp8_fp8 acc[148:151], v[vgprValuB_X0_I0+36+0+0:vgprValuB_X0_I0+36+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[148:151] // left value = acc[148+0:151+0]
/*  mfmaIndex:38  */
v_mfma_f32_16x16x32_fp8_fp8 acc[152:155], v[vgprValuB_X0_I0+36+0+0:vgprValuB_X0_I0+36+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[152:155] // left value = acc[152+0:155+0]
/*  mfmaIndex:39  */
ds_read_b128 v[vgprValuB_X2_I0+48:vgprValuB_X2_I0+48+3], v[vgprLocalReadAddrB] offset:1600 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=12 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[156:159], v[vgprValuB_X0_I0+36+0+0:vgprValuB_X0_I0+36+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[156:159] // left value = acc[156+0:159+0]
/*  mfmaIndex:40  */
v_mfma_f32_16x16x32_fp8_fp8 acc[160:163], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[160:163] // left value = acc[160+0:163+0]
/*  mfmaIndex:41  */
v_mfma_f32_16x16x32_fp8_fp8 acc[164:167], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[164:167] // left value = acc[164+0:167+0]
/*  mfmaIndex:42  */
ds_read_b128 v[vgprValuB_X2_I0+52:vgprValuB_X2_I0+52+3], v[vgprLocalReadAddrB] offset:1728 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=13 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[168:171], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[168:171] // left value = acc[168+0:171+0]
/*  mfmaIndex:43  */
v_mfma_f32_16x16x32_fp8_fp8 acc[172:175], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[172:175] // left value = acc[172+0:175+0]
/*  mfmaIndex:44  */
v_mfma_f32_16x16x32_fp8_fp8 acc[176:179], v[vgprValuB_X0_I0+44+0+0:vgprValuB_X0_I0+44+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[176:179] // left value = acc[176+0:179+0]
/*  mfmaIndex:45  */
ds_read_b128 v[vgprValuB_X2_I0+56:vgprValuB_X2_I0+56+3], v[vgprLocalReadAddrB] offset:1856 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=14 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[180:183], v[vgprValuB_X0_I0+44+0+0:vgprValuB_X0_I0+44+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[180:183] // left value = acc[180+0:183+0]
/*  mfmaIndex:46  */
v_mfma_f32_16x16x32_fp8_fp8 acc[184:187], v[vgprValuB_X0_I0+44+0+0:vgprValuB_X0_I0+44+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[184:187] // left value = acc[184+0:187+0]
/*  mfmaIndex:47  */
v_mfma_f32_16x16x32_fp8_fp8 acc[188:191], v[vgprValuB_X0_I0+44+0+0:vgprValuB_X0_I0+44+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[188:191] // left value = acc[188+0:191+0]
/*  mfmaIndex:48  */
ds_read_b128 v[vgprValuB_X2_I0+60:vgprValuB_X2_I0+60+3], v[vgprLocalReadAddrB] offset:1984 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=15 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[192:195], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[192:195] // left value = acc[192+0:195+0]
/*  mfmaIndex:49  */
v_mfma_f32_16x16x32_fp8_fp8 acc[196:199], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[196:199] // left value = acc[196+0:199+0]
/*  mfmaIndex:50  */
v_mfma_f32_16x16x32_fp8_fp8 acc[200:203], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[200:203] // left value = acc[200+0:203+0]
/*  mfmaIndex:51  */
v_mfma_f32_16x16x32_fp8_fp8 acc[204:207], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[204:207] // left value = acc[204+0:207+0]
/*  mfmaIndex:52  */
v_mfma_f32_16x16x32_fp8_fp8 acc[208:211], v[vgprValuB_X0_I0+52+0+0:vgprValuB_X0_I0+52+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[208:211] // left value = acc[208+0:211+0]
/*  mfmaIndex:53  */
v_mfma_f32_16x16x32_fp8_fp8 acc[212:215], v[vgprValuB_X0_I0+52+0+0:vgprValuB_X0_I0+52+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[212:215] // left value = acc[212+0:215+0]
/*  mfmaIndex:54  */
v_mfma_f32_16x16x32_fp8_fp8 acc[216:219], v[vgprValuB_X0_I0+52+0+0:vgprValuB_X0_I0+52+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[216:219] // left value = acc[216+0:219+0]
/*  mfmaIndex:55  */
v_mfma_f32_16x16x32_fp8_fp8 acc[220:223], v[vgprValuB_X0_I0+52+0+0:vgprValuB_X0_I0+52+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[220:223] // left value = acc[220+0:223+0]
/*  mfmaIndex:56  */
v_mfma_f32_16x16x32_fp8_fp8 acc[224:227], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[224:227] // left value = acc[224+0:227+0]
/*  mfmaIndex:57  */
v_mfma_f32_16x16x32_fp8_fp8 acc[228:231], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[228:231] // left value = acc[228+0:231+0]
/*  mfmaIndex:58  */
v_mfma_f32_16x16x32_fp8_fp8 acc[232:235], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[232:235] // left value = acc[232+0:235+0]
/*  mfmaIndex:59  */
v_mfma_f32_16x16x32_fp8_fp8 acc[236:239], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[236:239] // left value = acc[236+0:239+0]
/*  mfmaIndex:60  */
v_mfma_f32_16x16x32_fp8_fp8 acc[240:243], v[vgprValuB_X0_I0+60+0+0:vgprValuB_X0_I0+60+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[240:243] // left value = acc[240+0:243+0]
/*  mfmaIndex:61  */
v_mfma_f32_16x16x32_fp8_fp8 acc[244:247], v[vgprValuB_X0_I0+60+0+0:vgprValuB_X0_I0+60+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[244:247] // left value = acc[244+0:247+0]
/*  mfmaIndex:62  */
/* 1 LDS buffer: read-sync-write */
s_waitcnt lgkmcnt(0)
s_barrier
v_mfma_f32_16x16x32_fp8_fp8 acc[248:251], v[vgprValuB_X0_I0+60+0+0:vgprValuB_X0_I0+60+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[248:251] // left value = acc[248+0:251+0]
/*  mfmaIndex:63  */
s_waitcnt vmcnt(11)                                // wait for global read before writing to local
v_mfma_f32_16x16x32_fp8_fp8 acc[252:255], v[vgprValuB_X0_I0+60+0+0:vgprValuB_X0_I0+60+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[252:255] // left value = acc[252+0:255+0]
/* numPrefetchIter=0 */
/* dataAtIterA=-1 numReadsIterA=1 skipReadsIterA=1 readsPerIterA=4 */
/* dataAtIterB=-1 numReadsIterB=1 skipReadsIterB=1 readsPerIterB=16 */

/* iter 1 */
/*  grEndMfmaIndex:18, lwStartMfmaIndex:35, lwEndMfmaIndex:223  */
/*  numMfmaForLR:30, syncPlrMfmaIndex:225  */
/*
 * Unrolled-loop iteration 1, MFMAs 64-79: B register groups +0..+12 against
 * A register groups +0..+12, accumulating into acc[0:63].
 * Operand layout used by every MFMA in this iteration:
 *   - first source  = 2-VGPR slice of vgprValuB_X0_I0 at group offset j (0,4,8,...)
 *   - second source = 2-VGPR slice of vgprValuA_X0_I0 at group offset i (0,4,8,12)
 *   - accumulator   = acc[4*j+i : 4*j+i+3], read and written in place
 * The "+2" term selects the upper register pair of each 4-VGPR group; the
 * preceding iteration's MFMAs used the "+0" pair of the same X0 buffers.
 * LDS stores and global loads are interleaved between MFMAs; do not reorder.
 */
/*  mfmaIndex:64  */
/* Store G2LB+0..3 (16 bytes per lane) to LDS at LocalWriteAddrB + 0. The vmcnt(11) wait just before this iteration precedes this store. */
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+0:vgprG2LB+0+3] offset:0 // lwoB_0_0_0_0 = (0*LSCB)*(MT1J+PAD) + (0*LSPB) = 0
v_mfma_f32_16x16x32_fp8_fp8 acc[0:3], v[vgprValuB_X0_I0+0+2+0:vgprValuB_X0_I0+0+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[0:3] // left value = acc[0+0:3+0]
/*  mfmaIndex:65  */
/*
 * Two 16-byte buffer loads of A written directly to VGPRs (no LDS store for A
 * appears in this chunk). Destinations are the +8..+11 groups of
 * vgprValuA_X0_I0_0 and vgprValuA_X2_I0_0.
 * NOTE(review): the MFMAs below still read vgprValuA_X0_I0+8+2; the relationship
 * between the vgprValuA_X0_I0_0 and vgprValuA_X0_I0 symbols is defined outside
 * this chunk - confirm the load destination cannot overwrite live operands.
 * NOTE(review): the trailing "G -> Reg" tags on these two lines (0_0_2_0 vs
 * 0_0_1_0) do not follow one numbering scheme; treat them as labels only.
 */
buffer_load_dwordx4 v[vgprValuA_X0_I0_0+8:vgprValuA_X0_I0_0+8+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:0 // G -> Reg 0_0_2_0
buffer_load_dwordx4 v[vgprValuA_X2_I0_0+8:vgprValuA_X2_I0_0+8+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:0 // G -> Reg 0_0_1_0
v_mfma_f32_16x16x32_fp8_fp8 acc[4:7], v[vgprValuB_X0_I0+0+2+0:vgprValuB_X0_I0+0+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[4:7] // left value = acc[4+0:7+0]
/*  mfmaIndex:66  */
v_mfma_f32_16x16x32_fp8_fp8 acc[8:11], v[vgprValuB_X0_I0+0+2+0:vgprValuB_X0_I0+0+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[8:11] // left value = acc[8+0:11+0]
/*  mfmaIndex:67  */
v_mfma_f32_16x16x32_fp8_fp8 acc[12:15], v[vgprValuB_X0_I0+0+2+0:vgprValuB_X0_I0+0+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[12:15] // left value = acc[12+0:15+0]
/*  mfmaIndex:68  */
v_mfma_f32_16x16x32_fp8_fp8 acc[16:19], v[vgprValuB_X0_I0+4+2+0:vgprValuB_X0_I0+4+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[16:19] // left value = acc[16+0:19+0]
/*  mfmaIndex:69  */
v_mfma_f32_16x16x32_fp8_fp8 acc[20:23], v[vgprValuB_X0_I0+4+2+0:vgprValuB_X0_I0+4+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[20:23] // left value = acc[20+0:23+0]
/*  mfmaIndex:70  */
v_mfma_f32_16x16x32_fp8_fp8 acc[24:27], v[vgprValuB_X0_I0+4+2+0:vgprValuB_X0_I0+4+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[24:27] // left value = acc[24+0:27+0]
/*  mfmaIndex:71  */
v_mfma_f32_16x16x32_fp8_fp8 acc[28:31], v[vgprValuB_X0_I0+4+2+0:vgprValuB_X0_I0+4+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[28:31] // left value = acc[28+0:31+0]
/*  mfmaIndex:72  */
v_mfma_f32_16x16x32_fp8_fp8 acc[32:35], v[vgprValuB_X0_I0+8+2+0:vgprValuB_X0_I0+8+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[32:35] // left value = acc[32+0:35+0]
/*  mfmaIndex:73  */
v_mfma_f32_16x16x32_fp8_fp8 acc[36:39], v[vgprValuB_X0_I0+8+2+0:vgprValuB_X0_I0+8+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[36:39] // left value = acc[36+0:39+0]
/*  mfmaIndex:74  */
v_mfma_f32_16x16x32_fp8_fp8 acc[40:43], v[vgprValuB_X0_I0+8+2+0:vgprValuB_X0_I0+8+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[40:43] // left value = acc[40+0:43+0]
/*  mfmaIndex:75  */
v_mfma_f32_16x16x32_fp8_fp8 acc[44:47], v[vgprValuB_X0_I0+8+2+0:vgprValuB_X0_I0+8+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[44:47] // left value = acc[44+0:47+0]
/*  mfmaIndex:76  */
v_mfma_f32_16x16x32_fp8_fp8 acc[48:51], v[vgprValuB_X0_I0+12+2+0:vgprValuB_X0_I0+12+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[48:51] // left value = acc[48+0:51+0]
/*  mfmaIndex:77  */
v_mfma_f32_16x16x32_fp8_fp8 acc[52:55], v[vgprValuB_X0_I0+12+2+0:vgprValuB_X0_I0+12+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[52:55] // left value = acc[52+0:55+0]
/*  mfmaIndex:78  */
/*
 * Stall until at most 12 vector-memory operations are outstanding, one MFMA ahead
 * of the LDS store that consumes G2LB+4..7. The count is consistent with the
 * vmcnt(11) used before the previous store: one older load retired, two A loads
 * issued since (11 - 1 + 2 = 12). Any change to the loads above must update it.
 */
s_waitcnt vmcnt(12)                                // wait for global read before writing to local
v_mfma_f32_16x16x32_fp8_fp8 acc[56:59], v[vgprValuB_X0_I0+12+2+0:vgprValuB_X0_I0+12+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[56:59] // left value = acc[56+0:59+0]
/*  mfmaIndex:79  */
/* Second B chunk: G2LB+4..7 to LDS at LocalWriteAddrB + 4160 (1 * LSPB per the generated formula). */
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+4:vgprG2LB+4+3] offset:4160 // lwoB_0_0_1_0 = (0*LSCB)*(MT1J+PAD) + (1*LSPB) = 4160
v_mfma_f32_16x16x32_fp8_fp8 acc[60:63], v[vgprValuB_X0_I0+12+2+0:vgprValuB_X0_I0+12+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[60:63] // left value = acc[60+0:63+0]
/*  mfmaIndex:80  */
/*
 * Iteration 1, MFMAs 80-95: B register groups +16..+28 against A groups +0..+12,
 * accumulating in place into acc[64:127]. Operands are still the "+2" register
 * pair of the X0 buffers (first source = ValuB slice, second source = ValuA slice).
 * One vmcnt wait and one LDS store of B are interleaved near the end.
 */
v_mfma_f32_16x16x32_fp8_fp8 acc[64:67], v[vgprValuB_X0_I0+16+2+0:vgprValuB_X0_I0+16+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[64:67] // left value = acc[64+0:67+0]
/*  mfmaIndex:81  */
v_mfma_f32_16x16x32_fp8_fp8 acc[68:71], v[vgprValuB_X0_I0+16+2+0:vgprValuB_X0_I0+16+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[68:71] // left value = acc[68+0:71+0]
/*  mfmaIndex:82  */
v_mfma_f32_16x16x32_fp8_fp8 acc[72:75], v[vgprValuB_X0_I0+16+2+0:vgprValuB_X0_I0+16+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[72:75] // left value = acc[72+0:75+0]
/*  mfmaIndex:83  */
v_mfma_f32_16x16x32_fp8_fp8 acc[76:79], v[vgprValuB_X0_I0+16+2+0:vgprValuB_X0_I0+16+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[76:79] // left value = acc[76+0:79+0]
/*  mfmaIndex:84  */
v_mfma_f32_16x16x32_fp8_fp8 acc[80:83], v[vgprValuB_X0_I0+20+2+0:vgprValuB_X0_I0+20+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[80:83] // left value = acc[80+0:83+0]
/*  mfmaIndex:85  */
v_mfma_f32_16x16x32_fp8_fp8 acc[84:87], v[vgprValuB_X0_I0+20+2+0:vgprValuB_X0_I0+20+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[84:87] // left value = acc[84+0:87+0]
/*  mfmaIndex:86  */
v_mfma_f32_16x16x32_fp8_fp8 acc[88:91], v[vgprValuB_X0_I0+20+2+0:vgprValuB_X0_I0+20+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[88:91] // left value = acc[88+0:91+0]
/*  mfmaIndex:87  */
v_mfma_f32_16x16x32_fp8_fp8 acc[92:95], v[vgprValuB_X0_I0+20+2+0:vgprValuB_X0_I0+20+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[92:95] // left value = acc[92+0:95+0]
/*  mfmaIndex:88  */
v_mfma_f32_16x16x32_fp8_fp8 acc[96:99], v[vgprValuB_X0_I0+24+2+0:vgprValuB_X0_I0+24+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[96:99] // left value = acc[96+0:99+0]
/*  mfmaIndex:89  */
v_mfma_f32_16x16x32_fp8_fp8 acc[100:103], v[vgprValuB_X0_I0+24+2+0:vgprValuB_X0_I0+24+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[100:103] // left value = acc[100+0:103+0]
/*  mfmaIndex:90  */
v_mfma_f32_16x16x32_fp8_fp8 acc[104:107], v[vgprValuB_X0_I0+24+2+0:vgprValuB_X0_I0+24+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[104:107] // left value = acc[104+0:107+0]
/*  mfmaIndex:91  */
v_mfma_f32_16x16x32_fp8_fp8 acc[108:111], v[vgprValuB_X0_I0+24+2+0:vgprValuB_X0_I0+24+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[108:111] // left value = acc[108+0:111+0]
/*  mfmaIndex:92  */
v_mfma_f32_16x16x32_fp8_fp8 acc[112:115], v[vgprValuB_X0_I0+28+2+0:vgprValuB_X0_I0+28+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[112:115] // left value = acc[112+0:115+0]
/*  mfmaIndex:93  */
v_mfma_f32_16x16x32_fp8_fp8 acc[116:119], v[vgprValuB_X0_I0+28+2+0:vgprValuB_X0_I0+28+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[116:119] // left value = acc[116+0:119+0]
/*  mfmaIndex:94  */
/* Stall until at most 11 vector-memory operations are outstanding; issued one MFMA ahead of the store that reads G2LB+8..11. */
s_waitcnt vmcnt(11)                                // wait for global read before writing to local
v_mfma_f32_16x16x32_fp8_fp8 acc[120:123], v[vgprValuB_X0_I0+28+2+0:vgprValuB_X0_I0+28+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[120:123] // left value = acc[120+0:123+0]
/*  mfmaIndex:95  */
/* Third B chunk: G2LB+8..11 to LDS at LocalWriteAddrB + 8320 (2 * 4160). */
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+8:vgprG2LB+8+3] offset:8320 // lwoB_0_0_2_0 = (0*LSCB)*(MT1J+PAD) + (2*LSPB) = 8320
v_mfma_f32_16x16x32_fp8_fp8 acc[124:127], v[vgprValuB_X0_I0+28+2+0:vgprValuB_X0_I0+28+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[124:127] // left value = acc[124+0:127+0]
/*  mfmaIndex:96  */
/*
 * Iteration 1, MFMAs 96-111 (B groups +32..+44, acc[128:191]) interleaved with:
 *   1. two more direct-to-VGPR loads of A (destination groups +12..+15),
 *   2. the scalar update of the A buffer descriptor (SrdA) for the next K step,
 *   3. one vmcnt wait and one LDS store of B.
 * The two loads below are issued before SrdA is modified, so they use the
 * pre-increment descriptor.
 */
buffer_load_dwordx4 v[vgprValuA_X0_I0_0+12:vgprValuA_X0_I0_0+12+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:0 // G -> Reg 0_0_1_0
buffer_load_dwordx4 v[vgprValuA_X2_I0_0+12:vgprValuA_X2_I0_0+12+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:0 // G -> Reg 0_0_1_0
/* global read inc A loopL */
/*
 * Scalar sequence, one instruction per MFMA slot. The instructions communicate
 * through SCC, so their relative order is fixed even though MFMAs sit between
 * them (MFMA does not write SCC):
 *   s_cmp_eq   -> SCC = (LoopCounterL == StaggerUIter)
 *   s_cselect  -> s80/s81 = SCC ? WrapUA[0]/WrapUA[1] : GlobalReadIncsA[0]/0
 *   s_add/addc -> 64-bit SrdA base += s81:s80 (s_add overwrites SCC with carry,
 *                 so both selects must come before it)
 *   s_sub/subb -> 64-bit ShadowLimitA -= s81:s80 (borrow passed in SCC)
 *   s_cmp_eq / s_cselect -> SrdA+2 = (ShadowLimitA[1] == 0) ? ShadowLimitA[0] : BufferLimit
 * s80 and s81 are used as raw temporaries here.
 */
s_cmp_eq_u32 s[sgprLoopCounterL], s[sgprStaggerUIter] // Is this the wrapIter?
v_mfma_f32_16x16x32_fp8_fp8 acc[128:131], v[vgprValuB_X0_I0+32+2+0:vgprValuB_X0_I0+32+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[128:131] // left value = acc[128+0:131+0]
/*  mfmaIndex:97  */
s_cselect_b32 s80, s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- ?
v_mfma_f32_16x16x32_fp8_fp8 acc[132:135], v[vgprValuB_X0_I0+32+2+0:vgprValuB_X0_I0+32+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[132:135] // left value = acc[132+0:135+0]
/*  mfmaIndex:98  */
s_cselect_b32 s81, s[sgprWrapUA+1], 0              // incUpper <- ?
v_mfma_f32_16x16x32_fp8_fp8 acc[136:139], v[vgprValuB_X0_I0+32+2+0:vgprValuB_X0_I0+32+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[136:139] // left value = acc[136+0:139+0]
/*  mfmaIndex:99  */
s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s80        // gra SRD += inc(lower)
v_mfma_f32_16x16x32_fp8_fp8 acc[140:143], v[vgprValuB_X0_I0+32+2+0:vgprValuB_X0_I0+32+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[140:143] // left value = acc[140+0:143+0]
/*  mfmaIndex:100  */
s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s81       // gra SRD += inc(upper)
v_mfma_f32_16x16x32_fp8_fp8 acc[144:147], v[vgprValuB_X0_I0+36+2+0:vgprValuB_X0_I0+36+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[144:147] // left value = acc[144+0:147+0]
/*  mfmaIndex:101  */
s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s80 // limit -= inc (lower)
v_mfma_f32_16x16x32_fp8_fp8 acc[148:151], v[vgprValuB_X0_I0+36+2+0:vgprValuB_X0_I0+36+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[148:151] // left value = acc[148+0:151+0]
/*  mfmaIndex:102  */
s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s81 // limit -= inc (upper, with borrow)
v_mfma_f32_16x16x32_fp8_fp8 acc[152:155], v[vgprValuB_X0_I0+36+2+0:vgprValuB_X0_I0+36+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[152:155] // left value = acc[152+0:155+0]
/*  mfmaIndex:103  */
s_cmp_eq_u32 s[sgprShadowLimitA+1], 0              // are we within 2^32?
v_mfma_f32_16x16x32_fp8_fp8 acc[156:159], v[vgprValuB_X0_I0+36+2+0:vgprValuB_X0_I0+36+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[156:159] // left value = acc[156+0:159+0]
/*  mfmaIndex:104  */
s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32
v_mfma_f32_16x16x32_fp8_fp8 acc[160:163], v[vgprValuB_X0_I0+40+2+0:vgprValuB_X0_I0+40+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[160:163] // left value = acc[160+0:163+0]
/*  mfmaIndex:105  */
v_mfma_f32_16x16x32_fp8_fp8 acc[164:167], v[vgprValuB_X0_I0+40+2+0:vgprValuB_X0_I0+40+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[164:167] // left value = acc[164+0:167+0]
/*  mfmaIndex:106  */
v_mfma_f32_16x16x32_fp8_fp8 acc[168:171], v[vgprValuB_X0_I0+40+2+0:vgprValuB_X0_I0+40+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[168:171] // left value = acc[168+0:171+0]
/*  mfmaIndex:107  */
v_mfma_f32_16x16x32_fp8_fp8 acc[172:175], v[vgprValuB_X0_I0+40+2+0:vgprValuB_X0_I0+40+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[172:175] // left value = acc[172+0:175+0]
/*  mfmaIndex:108  */
v_mfma_f32_16x16x32_fp8_fp8 acc[176:179], v[vgprValuB_X0_I0+44+2+0:vgprValuB_X0_I0+44+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[176:179] // left value = acc[176+0:179+0]
/*  mfmaIndex:109  */
v_mfma_f32_16x16x32_fp8_fp8 acc[180:183], v[vgprValuB_X0_I0+44+2+0:vgprValuB_X0_I0+44+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[180:183] // left value = acc[180+0:183+0]
/*  mfmaIndex:110  */
/* Count is 12 rather than 11 here, consistent with the two A loads issued at the top of this section still being in flight. */
s_waitcnt vmcnt(12)                                // wait for global read before writing to local
v_mfma_f32_16x16x32_fp8_fp8 acc[184:187], v[vgprValuB_X0_I0+44+2+0:vgprValuB_X0_I0+44+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[184:187] // left value = acc[184+0:187+0]
/*  mfmaIndex:111  */
/* Fourth B chunk: G2LB+12..15 to LDS at LocalWriteAddrB + 12480 (3 * 4160). */
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+12:vgprG2LB+12+3] offset:12480 // lwoB_0_0_3_0 = (0*LSCB)*(MT1J+PAD) + (3*LSPB) = 12480
v_mfma_f32_16x16x32_fp8_fp8 acc[188:191], v[vgprValuB_X0_I0+44+2+0:vgprValuB_X0_I0+44+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[188:191] // left value = acc[188+0:191+0]
/*  mfmaIndex:112  */
/*
 * Iteration 1, MFMAs 112-127: B register groups +48..+60 against A groups
 * +0..+12, accumulating in place into acc[192:255]. This finishes the pass over
 * the "+2" register pair of the X0 operand buffers. The last two MFMA slots
 * carry a vmcnt wait and the LDS store of the fifth B chunk.
 */
v_mfma_f32_16x16x32_fp8_fp8 acc[192:195], v[vgprValuB_X0_I0+48+2+0:vgprValuB_X0_I0+48+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[192:195] // left value = acc[192+0:195+0]
/*  mfmaIndex:113  */
v_mfma_f32_16x16x32_fp8_fp8 acc[196:199], v[vgprValuB_X0_I0+48+2+0:vgprValuB_X0_I0+48+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[196:199] // left value = acc[196+0:199+0]
/*  mfmaIndex:114  */
v_mfma_f32_16x16x32_fp8_fp8 acc[200:203], v[vgprValuB_X0_I0+48+2+0:vgprValuB_X0_I0+48+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[200:203] // left value = acc[200+0:203+0]
/*  mfmaIndex:115  */
v_mfma_f32_16x16x32_fp8_fp8 acc[204:207], v[vgprValuB_X0_I0+48+2+0:vgprValuB_X0_I0+48+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[204:207] // left value = acc[204+0:207+0]
/*  mfmaIndex:116  */
v_mfma_f32_16x16x32_fp8_fp8 acc[208:211], v[vgprValuB_X0_I0+52+2+0:vgprValuB_X0_I0+52+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[208:211] // left value = acc[208+0:211+0]
/*  mfmaIndex:117  */
v_mfma_f32_16x16x32_fp8_fp8 acc[212:215], v[vgprValuB_X0_I0+52+2+0:vgprValuB_X0_I0+52+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[212:215] // left value = acc[212+0:215+0]
/*  mfmaIndex:118  */
v_mfma_f32_16x16x32_fp8_fp8 acc[216:219], v[vgprValuB_X0_I0+52+2+0:vgprValuB_X0_I0+52+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[216:219] // left value = acc[216+0:219+0]
/*  mfmaIndex:119  */
v_mfma_f32_16x16x32_fp8_fp8 acc[220:223], v[vgprValuB_X0_I0+52+2+0:vgprValuB_X0_I0+52+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[220:223] // left value = acc[220+0:223+0]
/*  mfmaIndex:120  */
v_mfma_f32_16x16x32_fp8_fp8 acc[224:227], v[vgprValuB_X0_I0+56+2+0:vgprValuB_X0_I0+56+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[224:227] // left value = acc[224+0:227+0]
/*  mfmaIndex:121  */
v_mfma_f32_16x16x32_fp8_fp8 acc[228:231], v[vgprValuB_X0_I0+56+2+0:vgprValuB_X0_I0+56+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[228:231] // left value = acc[228+0:231+0]
/*  mfmaIndex:122  */
v_mfma_f32_16x16x32_fp8_fp8 acc[232:235], v[vgprValuB_X0_I0+56+2+0:vgprValuB_X0_I0+56+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[232:235] // left value = acc[232+0:235+0]
/*  mfmaIndex:123  */
v_mfma_f32_16x16x32_fp8_fp8 acc[236:239], v[vgprValuB_X0_I0+56+2+0:vgprValuB_X0_I0+56+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[236:239] // left value = acc[236+0:239+0]
/*  mfmaIndex:124  */
v_mfma_f32_16x16x32_fp8_fp8 acc[240:243], v[vgprValuB_X0_I0+60+2+0:vgprValuB_X0_I0+60+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[240:243] // left value = acc[240+0:243+0]
/*  mfmaIndex:125  */
v_mfma_f32_16x16x32_fp8_fp8 acc[244:247], v[vgprValuB_X0_I0+60+2+0:vgprValuB_X0_I0+60+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[244:247] // left value = acc[244+0:247+0]
/*  mfmaIndex:126  */
/* Stall until at most 11 vector-memory operations are outstanding; issued one MFMA ahead of the store that reads G2LB+16..19. */
s_waitcnt vmcnt(11)                                // wait for global read before writing to local
v_mfma_f32_16x16x32_fp8_fp8 acc[248:251], v[vgprValuB_X0_I0+60+2+0:vgprValuB_X0_I0+60+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[248:251] // left value = acc[248+0:251+0]
/*  mfmaIndex:127  */
/* Fifth B chunk: G2LB+16..19 to LDS at LocalWriteAddrB + 16640 (4 * 4160). */
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+16:vgprG2LB+16+3] offset:16640 // lwoB_0_0_4_0 = (0*LSCB)*(MT1J+PAD) + (4*LSPB) = 16640
v_mfma_f32_16x16x32_fp8_fp8 acc[252:255], v[vgprValuB_X0_I0+60+2+0:vgprValuB_X0_I0+60+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[252:255] // left value = acc[252+0:255+0]
/* Generator bookkeeping for iteration 1 (no instructions emitted for these three lines). */
/* numPrefetchIter=0 */
/* dataAtIterA=-1 numReadsIterA=1 skipReadsIterA=1 readsPerIterA=4 */
/* dataAtIterB=-1 numReadsIterB=1 skipReadsIterB=1 readsPerIterB=16 */

/* iter 2 (reset local read pointers iteration)  (swap local read pointers iteration)  */
/*  grEndMfmaIndex:18, lwStartMfmaIndex:35, lwEndMfmaIndex:223  */
/*  numMfmaForLR:30, syncPlrMfmaIndex:225  */
/*
 * Unrolled-loop iteration 2, MFMAs 128-143: B register groups +0..+12 against
 * A groups +0..+12, accumulating in place into acc[0:63].
 * Operands switch from the X0 buffers to vgprValuB_X2_I0 / vgprValuA_X2_I0 and
 * use the "+0" register pair of each 4-VGPR group. Accumulator layout is the
 * same as in the earlier iterations: B group offset j, A group offset i ->
 * acc[4*j+i : 4*j+i+3].
 * Global loads of B for a later pass and LDS stores of already-loaded B data
 * are interleaved between MFMAs; do not reorder.
 */
/*  mfmaIndex:128  */
/*
 * First B global load of this iteration: 16 bytes per lane into G2LB+0..3 with a
 * literal scalar offset of 0. G2LB+0..3 was stored to LDS by the ds_write at
 * offset 0 at the start of iteration 1, so the register group is being refilled.
 */
buffer_load_dwordx4 v[vgprG2LB+0:vgprG2LB+0+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0 // G -> Reg 0_0_0_0
v_mfma_f32_16x16x32_fp8_fp8 acc[0:3], v[vgprValuB_X2_I0+0+0+0:vgprValuB_X2_I0+0+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[0:3] // left value = acc[0+0:3+0]
/*  mfmaIndex:129  */
v_mfma_f32_16x16x32_fp8_fp8 acc[4:7], v[vgprValuB_X2_I0+0+0+0:vgprValuB_X2_I0+0+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[4:7] // left value = acc[4+0:7+0]
/*  mfmaIndex:130  */
v_mfma_f32_16x16x32_fp8_fp8 acc[8:11], v[vgprValuB_X2_I0+0+0+0:vgprValuB_X2_I0+0+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[8:11] // left value = acc[8+0:11+0]
/*  mfmaIndex:131  */
v_mfma_f32_16x16x32_fp8_fp8 acc[12:15], v[vgprValuB_X2_I0+0+0+0:vgprValuB_X2_I0+0+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[12:15] // left value = acc[12+0:15+0]
/*  mfmaIndex:132  */
v_mfma_f32_16x16x32_fp8_fp8 acc[16:19], v[vgprValuB_X2_I0+4+0+0:vgprValuB_X2_I0+4+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[16:19] // left value = acc[16+0:19+0]
/*  mfmaIndex:133  */
v_mfma_f32_16x16x32_fp8_fp8 acc[20:23], v[vgprValuB_X2_I0+4+0+0:vgprValuB_X2_I0+4+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[20:23] // left value = acc[20+0:23+0]
/*  mfmaIndex:134  */
v_mfma_f32_16x16x32_fp8_fp8 acc[24:27], v[vgprValuB_X2_I0+4+0+0:vgprValuB_X2_I0+4+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[24:27] // left value = acc[24+0:27+0]
/*  mfmaIndex:135  */
v_mfma_f32_16x16x32_fp8_fp8 acc[28:31], v[vgprValuB_X2_I0+4+0+0:vgprValuB_X2_I0+4+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[28:31] // left value = acc[28+0:31+0]
/*  mfmaIndex:136  */
v_mfma_f32_16x16x32_fp8_fp8 acc[32:35], v[vgprValuB_X2_I0+8+0+0:vgprValuB_X2_I0+8+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[32:35] // left value = acc[32+0:35+0]
/*  mfmaIndex:137  */
v_mfma_f32_16x16x32_fp8_fp8 acc[36:39], v[vgprValuB_X2_I0+8+0+0:vgprValuB_X2_I0+8+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[36:39] // left value = acc[36+0:39+0]
/*  mfmaIndex:138  */
v_mfma_f32_16x16x32_fp8_fp8 acc[40:43], v[vgprValuB_X2_I0+8+0+0:vgprValuB_X2_I0+8+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[40:43] // left value = acc[40+0:43+0]
/*  mfmaIndex:139  */
v_mfma_f32_16x16x32_fp8_fp8 acc[44:47], v[vgprValuB_X2_I0+8+0+0:vgprValuB_X2_I0+8+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[44:47] // left value = acc[44+0:47+0]
/*  mfmaIndex:140  */
v_mfma_f32_16x16x32_fp8_fp8 acc[48:51], v[vgprValuB_X2_I0+12+0+0:vgprValuB_X2_I0+12+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[48:51] // left value = acc[48+0:51+0]
/*  mfmaIndex:141  */
v_mfma_f32_16x16x32_fp8_fp8 acc[52:55], v[vgprValuB_X2_I0+12+0+0:vgprValuB_X2_I0+12+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[52:55] // left value = acc[52+0:55+0]
/*  mfmaIndex:142  */
/*
 * Stall until at most 11 vector-memory operations are outstanding. The count
 * stays at 11 (same as the previous wait) because exactly one new load has been
 * issued since then (11 - 1 + 1 = 11).
 */
s_waitcnt vmcnt(11)                                // wait for global read before writing to local
v_mfma_f32_16x16x32_fp8_fp8 acc[56:59], v[vgprValuB_X2_I0+12+0+0:vgprValuB_X2_I0+12+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[56:59] // left value = acc[56+0:59+0]
/*  mfmaIndex:143  */
/* Sixth B chunk: G2LB+20..23 to LDS at LocalWriteAddrB + 20800 (5 * 4160). */
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+20:vgprG2LB+20+3] offset:20800 // lwoB_0_0_5_0 = (0*LSCB)*(MT1J+PAD) + (5*LSPB) = 20800
v_mfma_f32_16x16x32_fp8_fp8 acc[60:63], v[vgprValuB_X2_I0+12+0+0:vgprValuB_X2_I0+12+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[60:63] // left value = acc[60+0:63+0]
/*  mfmaIndex:144  */
/*
 * Iteration 2, MFMAs 144-159: B register groups +16..+28 against A groups
 * +0..+12 (X2 operand buffers, "+0" register pair), accumulating in place into
 * acc[64:127]. Carries one B global load, one vmcnt wait and one LDS store.
 */
/* Refill G2LB+4..7 from global memory (scalar offset ScalarGlobalReadOffsetB+0); that group was stored to LDS at offset 4160 during iteration 1. */
buffer_load_dwordx4 v[vgprG2LB+4:vgprG2LB+4+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:0 // G -> Reg 0_0_1_0
v_mfma_f32_16x16x32_fp8_fp8 acc[64:67], v[vgprValuB_X2_I0+16+0+0:vgprValuB_X2_I0+16+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[64:67] // left value = acc[64+0:67+0]
/*  mfmaIndex:145  */
v_mfma_f32_16x16x32_fp8_fp8 acc[68:71], v[vgprValuB_X2_I0+16+0+0:vgprValuB_X2_I0+16+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[68:71] // left value = acc[68+0:71+0]
/*  mfmaIndex:146  */
v_mfma_f32_16x16x32_fp8_fp8 acc[72:75], v[vgprValuB_X2_I0+16+0+0:vgprValuB_X2_I0+16+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[72:75] // left value = acc[72+0:75+0]
/*  mfmaIndex:147  */
/* sched write - iter 2 writesPerItem=1 */
v_mfma_f32_16x16x32_fp8_fp8 acc[76:79], v[vgprValuB_X2_I0+16+0+0:vgprValuB_X2_I0+16+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[76:79] // left value = acc[76+0:79+0]
/*  mfmaIndex:148  */
v_mfma_f32_16x16x32_fp8_fp8 acc[80:83], v[vgprValuB_X2_I0+20+0+0:vgprValuB_X2_I0+20+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[80:83] // left value = acc[80+0:83+0]
/*  mfmaIndex:149  */
v_mfma_f32_16x16x32_fp8_fp8 acc[84:87], v[vgprValuB_X2_I0+20+0+0:vgprValuB_X2_I0+20+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[84:87] // left value = acc[84+0:87+0]
/*  mfmaIndex:150  */
v_mfma_f32_16x16x32_fp8_fp8 acc[88:91], v[vgprValuB_X2_I0+20+0+0:vgprValuB_X2_I0+20+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[88:91] // left value = acc[88+0:91+0]
/*  mfmaIndex:151  */
v_mfma_f32_16x16x32_fp8_fp8 acc[92:95], v[vgprValuB_X2_I0+20+0+0:vgprValuB_X2_I0+20+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[92:95] // left value = acc[92+0:95+0]
/*  mfmaIndex:152  */
v_mfma_f32_16x16x32_fp8_fp8 acc[96:99], v[vgprValuB_X2_I0+24+0+0:vgprValuB_X2_I0+24+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[96:99] // left value = acc[96+0:99+0]
/*  mfmaIndex:153  */
v_mfma_f32_16x16x32_fp8_fp8 acc[100:103], v[vgprValuB_X2_I0+24+0+0:vgprValuB_X2_I0+24+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[100:103] // left value = acc[100+0:103+0]
/*  mfmaIndex:154  */
v_mfma_f32_16x16x32_fp8_fp8 acc[104:107], v[vgprValuB_X2_I0+24+0+0:vgprValuB_X2_I0+24+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[104:107] // left value = acc[104+0:107+0]
/*  mfmaIndex:155  */
v_mfma_f32_16x16x32_fp8_fp8 acc[108:111], v[vgprValuB_X2_I0+24+0+0:vgprValuB_X2_I0+24+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[108:111] // left value = acc[108+0:111+0]
/*  mfmaIndex:156  */
v_mfma_f32_16x16x32_fp8_fp8 acc[112:115], v[vgprValuB_X2_I0+28+0+0:vgprValuB_X2_I0+28+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[112:115] // left value = acc[112+0:115+0]
/*  mfmaIndex:157  */
v_mfma_f32_16x16x32_fp8_fp8 acc[116:119], v[vgprValuB_X2_I0+28+0+0:vgprValuB_X2_I0+28+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[116:119] // left value = acc[116+0:119+0]
/*  mfmaIndex:158  */
/* Stall until at most 11 vector-memory operations are outstanding; issued one MFMA ahead of the store that reads G2LB+24..27. */
s_waitcnt vmcnt(11)                                // wait for global read before writing to local
v_mfma_f32_16x16x32_fp8_fp8 acc[120:123], v[vgprValuB_X2_I0+28+0+0:vgprValuB_X2_I0+28+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[120:123] // left value = acc[120+0:123+0]
/*  mfmaIndex:159  */
/* Seventh B chunk: G2LB+24..27 to LDS at LocalWriteAddrB + 24960 (6 * 4160). */
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+24:vgprG2LB+24+3] offset:24960 // lwoB_0_0_6_0 = (0*LSCB)*(MT1J+PAD) + (6*LSPB) = 24960
v_mfma_f32_16x16x32_fp8_fp8 acc[124:127], v[vgprValuB_X2_I0+28+0+0:vgprValuB_X2_I0+28+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[124:127] // left value = acc[124+0:127+0]
/*  mfmaIndex:160  */
/*
 * Iteration 2, MFMAs 160-175: B register groups +32..+44 against A groups
 * +0..+12 (X2 operand buffers, "+0" register pair), accumulating in place into
 * acc[128:191]. Carries one B global load, one vmcnt wait and one LDS store.
 */
/* Refill G2LB+8..11 from global memory (scalar offset ScalarGlobalReadOffsetB+1); that group was stored to LDS at offset 8320 during iteration 1. */
buffer_load_dwordx4 v[vgprG2LB+8:vgprG2LB+8+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:0 // G -> Reg 0_0_2_0
v_mfma_f32_16x16x32_fp8_fp8 acc[128:131], v[vgprValuB_X2_I0+32+0+0:vgprValuB_X2_I0+32+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[128:131] // left value = acc[128+0:131+0]
/*  mfmaIndex:161  */
v_mfma_f32_16x16x32_fp8_fp8 acc[132:135], v[vgprValuB_X2_I0+32+0+0:vgprValuB_X2_I0+32+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[132:135] // left value = acc[132+0:135+0]
/*  mfmaIndex:162  */
v_mfma_f32_16x16x32_fp8_fp8 acc[136:139], v[vgprValuB_X2_I0+32+0+0:vgprValuB_X2_I0+32+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[136:139] // left value = acc[136+0:139+0]
/*  mfmaIndex:163  */
v_mfma_f32_16x16x32_fp8_fp8 acc[140:143], v[vgprValuB_X2_I0+32+0+0:vgprValuB_X2_I0+32+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[140:143] // left value = acc[140+0:143+0]
/*  mfmaIndex:164  */
v_mfma_f32_16x16x32_fp8_fp8 acc[144:147], v[vgprValuB_X2_I0+36+0+0:vgprValuB_X2_I0+36+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[144:147] // left value = acc[144+0:147+0]
/*  mfmaIndex:165  */
v_mfma_f32_16x16x32_fp8_fp8 acc[148:151], v[vgprValuB_X2_I0+36+0+0:vgprValuB_X2_I0+36+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[148:151] // left value = acc[148+0:151+0]
/*  mfmaIndex:166  */
v_mfma_f32_16x16x32_fp8_fp8 acc[152:155], v[vgprValuB_X2_I0+36+0+0:vgprValuB_X2_I0+36+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[152:155] // left value = acc[152+0:155+0]
/*  mfmaIndex:167  */
v_mfma_f32_16x16x32_fp8_fp8 acc[156:159], v[vgprValuB_X2_I0+36+0+0:vgprValuB_X2_I0+36+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[156:159] // left value = acc[156+0:159+0]
/*  mfmaIndex:168  */
v_mfma_f32_16x16x32_fp8_fp8 acc[160:163], v[vgprValuB_X2_I0+40+0+0:vgprValuB_X2_I0+40+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[160:163] // left value = acc[160+0:163+0]
/*  mfmaIndex:169  */
v_mfma_f32_16x16x32_fp8_fp8 acc[164:167], v[vgprValuB_X2_I0+40+0+0:vgprValuB_X2_I0+40+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[164:167] // left value = acc[164+0:167+0]
/*  mfmaIndex:170  */
v_mfma_f32_16x16x32_fp8_fp8 acc[168:171], v[vgprValuB_X2_I0+40+0+0:vgprValuB_X2_I0+40+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[168:171] // left value = acc[168+0:171+0]
/*  mfmaIndex:171  */
v_mfma_f32_16x16x32_fp8_fp8 acc[172:175], v[vgprValuB_X2_I0+40+0+0:vgprValuB_X2_I0+40+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[172:175] // left value = acc[172+0:175+0]
/*  mfmaIndex:172  */
v_mfma_f32_16x16x32_fp8_fp8 acc[176:179], v[vgprValuB_X2_I0+44+0+0:vgprValuB_X2_I0+44+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[176:179] // left value = acc[176+0:179+0]
/*  mfmaIndex:173  */
v_mfma_f32_16x16x32_fp8_fp8 acc[180:183], v[vgprValuB_X2_I0+44+0+0:vgprValuB_X2_I0+44+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[180:183] // left value = acc[180+0:183+0]
/*  mfmaIndex:174  */
/* Stall until at most 11 vector-memory operations are outstanding; issued one MFMA ahead of the store that reads G2LB+28..31. */
s_waitcnt vmcnt(11)                                // wait for global read before writing to local
v_mfma_f32_16x16x32_fp8_fp8 acc[184:187], v[vgprValuB_X2_I0+44+0+0:vgprValuB_X2_I0+44+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[184:187] // left value = acc[184+0:187+0]
/*  mfmaIndex:175  */
/*
 * Eighth B chunk: G2LB+28..31 to LDS at LocalWriteAddrB + 29120 (7 * 4160).
 * 29120 + 4160 = 33280, the value of .amdhsa_group_segment_fixed_size in the
 * kernel descriptor, so this is the highest 4160-byte stride slot used.
 */
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+28:vgprG2LB+28+3] offset:29120 // lwoB_0_0_7_0 = (0*LSCB)*(MT1J+PAD) + (7*LSPB) = 29120
v_mfma_f32_16x16x32_fp8_fp8 acc[188:191], v[vgprValuB_X2_I0+44+0+0:vgprValuB_X2_I0+44+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[188:191] // left value = acc[188+0:191+0]
/*  mfmaIndex:176  */
/*
 * Iteration 2, MFMAs 176-191: B register groups +48..+60 against A groups
 * +0..+12 (X2 operand buffers, "+0" register pair), accumulating in place into
 * acc[192:255]. Only one memory operation is interleaved here (a B global
 * load); there is no vmcnt wait or LDS store in this stretch.
 */
/* Refill G2LB+12..15 from global memory (scalar offset ScalarGlobalReadOffsetB+2); that group was stored to LDS at offset 12480 during iteration 1. */
buffer_load_dwordx4 v[vgprG2LB+12:vgprG2LB+12+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:0 // G -> Reg 0_0_3_0
v_mfma_f32_16x16x32_fp8_fp8 acc[192:195], v[vgprValuB_X2_I0+48+0+0:vgprValuB_X2_I0+48+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[192:195] // left value = acc[192+0:195+0]
/*  mfmaIndex:177  */
v_mfma_f32_16x16x32_fp8_fp8 acc[196:199], v[vgprValuB_X2_I0+48+0+0:vgprValuB_X2_I0+48+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[196:199] // left value = acc[196+0:199+0]
/*  mfmaIndex:178  */
v_mfma_f32_16x16x32_fp8_fp8 acc[200:203], v[vgprValuB_X2_I0+48+0+0:vgprValuB_X2_I0+48+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[200:203] // left value = acc[200+0:203+0]
/*  mfmaIndex:179  */
v_mfma_f32_16x16x32_fp8_fp8 acc[204:207], v[vgprValuB_X2_I0+48+0+0:vgprValuB_X2_I0+48+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[204:207] // left value = acc[204+0:207+0]
/*  mfmaIndex:180  */
v_mfma_f32_16x16x32_fp8_fp8 acc[208:211], v[vgprValuB_X2_I0+52+0+0:vgprValuB_X2_I0+52+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[208:211] // left value = acc[208+0:211+0]
/*  mfmaIndex:181  */
v_mfma_f32_16x16x32_fp8_fp8 acc[212:215], v[vgprValuB_X2_I0+52+0+0:vgprValuB_X2_I0+52+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[212:215] // left value = acc[212+0:215+0]
/*  mfmaIndex:182  */
v_mfma_f32_16x16x32_fp8_fp8 acc[216:219], v[vgprValuB_X2_I0+52+0+0:vgprValuB_X2_I0+52+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[216:219] // left value = acc[216+0:219+0]
/*  mfmaIndex:183  */
v_mfma_f32_16x16x32_fp8_fp8 acc[220:223], v[vgprValuB_X2_I0+52+0+0:vgprValuB_X2_I0+52+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[220:223] // left value = acc[220+0:223+0]
/*  mfmaIndex:184  */
v_mfma_f32_16x16x32_fp8_fp8 acc[224:227], v[vgprValuB_X2_I0+56+0+0:vgprValuB_X2_I0+56+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[224:227] // left value = acc[224+0:227+0]
/*  mfmaIndex:185  */
v_mfma_f32_16x16x32_fp8_fp8 acc[228:231], v[vgprValuB_X2_I0+56+0+0:vgprValuB_X2_I0+56+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[228:231] // left value = acc[228+0:231+0]
/*  mfmaIndex:186  */
v_mfma_f32_16x16x32_fp8_fp8 acc[232:235], v[vgprValuB_X2_I0+56+0+0:vgprValuB_X2_I0+56+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[232:235] // left value = acc[232+0:235+0]
/*  mfmaIndex:187  */
v_mfma_f32_16x16x32_fp8_fp8 acc[236:239], v[vgprValuB_X2_I0+56+0+0:vgprValuB_X2_I0+56+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[236:239] // left value = acc[236+0:239+0]
/*  mfmaIndex:188  */
v_mfma_f32_16x16x32_fp8_fp8 acc[240:243], v[vgprValuB_X2_I0+60+0+0:vgprValuB_X2_I0+60+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[240:243] // left value = acc[240+0:243+0]
/*  mfmaIndex:189  */
v_mfma_f32_16x16x32_fp8_fp8 acc[244:247], v[vgprValuB_X2_I0+60+0+0:vgprValuB_X2_I0+60+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[244:247] // left value = acc[244+0:247+0]
/*  mfmaIndex:190  */
v_mfma_f32_16x16x32_fp8_fp8 acc[248:251], v[vgprValuB_X2_I0+60+0+0:vgprValuB_X2_I0+60+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[248:251] // left value = acc[248+0:251+0]
/*  mfmaIndex:191  */

/*
 * The four section markers below (swap offsets a/b, init pointers a/b) are
 * generator placeholders: no instructions were emitted under any of them in
 * this kernel, so the local-read pointer swap/reset announced in the iteration
 * header costs nothing here.
 */
/* local read swap offsets a */

/* local read swap offsets b */

/* local read init pointers a */

/* localReadInitPointers */

/* local read init pointers b */

/* localReadInitPointers */
/* Last MFMA of iteration 2 (index 191), completing acc[252:255]. */
v_mfma_f32_16x16x32_fp8_fp8 acc[252:255], v[vgprValuB_X2_I0+60+0+0:vgprValuB_X2_I0+60+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[252:255] // left value = acc[252+0:255+0]
/* Generator bookkeeping for iteration 2 (no instructions emitted for these three lines). */
/* numPrefetchIter=0 */
/* dataAtIterA=0 numReadsIterA=1 skipReadsIterA=0 readsPerIterA=4 */
/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=0 readsPerIterB=16 */

/* iter 3 (swap and reset local write pointers iteration)  */
/*  grEndMfmaIndex:18, lwStartMfmaIndex:35, lwEndMfmaIndex:223  */
/*  numMfmaForLR:30, syncPlrMfmaIndex:225  */
/*
 * First part of iteration 3: MFMAs 192..221, i.e. accumulator slices
 * acc[0:3] .. acc[116:119].  Every MFMA below has the form
 *     acc[d:d+3] = B_pair * A_pair + acc[d:d+3]
 * (destination and addend are the same acc range, so it accumulates in place).
 *
 * Operand pattern visible in this block:
 *   - B operand: vgprValuB_X2_I0 + {0,4,8,...,28} + 2, advanced by 4 after
 *     every group of four MFMAs.
 *   - A operand: vgprValuA_X2_I0 + {0,4,8,12} + 2, cycled inside each group.
 *   - Both use the "+2" dword pair of each 4-register group.
 *
 * Two buffer_load_dwordx4 instructions are interleaved, one in front of
 * MFMA 192 and one in front of MFMA 208.  They fill vgprG2LB+16..+19 and
 * vgprG2LB+20..+23 from the B buffer descriptor (sgprSrdB) using the per-lane
 * offset in vgprGlobalReadOffsetB (offen) plus a scalar offset.  Nothing in
 * this block waits on vmcnt, so the loaded registers are not consumed here.
 *
 * NOTE(review): the grEnd/lwStart/lwEnd/syncPlr indices above are scheduler
 * bookkeeping emitted by the generator; their exact meaning is not derivable
 * from this block.
 */
/*  mfmaIndex:192  */
/* B global load: 4 dwords per lane -> vgprG2LB+16..+19, scalar offset sgprScalarGlobalReadOffsetB+3 */
buffer_load_dwordx4 v[vgprG2LB+16:vgprG2LB+16+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:0 // G -> Reg 0_0_4_0
v_mfma_f32_16x16x32_fp8_fp8 acc[0:3], v[vgprValuB_X2_I0+0+2+0:vgprValuB_X2_I0+0+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[0:3] // left value = acc[0+0:3+0]
/*  mfmaIndex:193  */
v_mfma_f32_16x16x32_fp8_fp8 acc[4:7], v[vgprValuB_X2_I0+0+2+0:vgprValuB_X2_I0+0+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[4:7] // left value = acc[4+0:7+0]
/*  mfmaIndex:194  */
v_mfma_f32_16x16x32_fp8_fp8 acc[8:11], v[vgprValuB_X2_I0+0+2+0:vgprValuB_X2_I0+0+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[8:11] // left value = acc[8+0:11+0]
/*  mfmaIndex:195  */
v_mfma_f32_16x16x32_fp8_fp8 acc[12:15], v[vgprValuB_X2_I0+0+2+0:vgprValuB_X2_I0+0+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[12:15] // left value = acc[12+0:15+0]
/*  mfmaIndex:196  */
v_mfma_f32_16x16x32_fp8_fp8 acc[16:19], v[vgprValuB_X2_I0+4+2+0:vgprValuB_X2_I0+4+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[16:19] // left value = acc[16+0:19+0]
/*  mfmaIndex:197  */
v_mfma_f32_16x16x32_fp8_fp8 acc[20:23], v[vgprValuB_X2_I0+4+2+0:vgprValuB_X2_I0+4+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[20:23] // left value = acc[20+0:23+0]
/*  mfmaIndex:198  */
v_mfma_f32_16x16x32_fp8_fp8 acc[24:27], v[vgprValuB_X2_I0+4+2+0:vgprValuB_X2_I0+4+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[24:27] // left value = acc[24+0:27+0]
/*  mfmaIndex:199  */
v_mfma_f32_16x16x32_fp8_fp8 acc[28:31], v[vgprValuB_X2_I0+4+2+0:vgprValuB_X2_I0+4+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[28:31] // left value = acc[28+0:31+0]
/*  mfmaIndex:200  */
v_mfma_f32_16x16x32_fp8_fp8 acc[32:35], v[vgprValuB_X2_I0+8+2+0:vgprValuB_X2_I0+8+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[32:35] // left value = acc[32+0:35+0]
/*  mfmaIndex:201  */
v_mfma_f32_16x16x32_fp8_fp8 acc[36:39], v[vgprValuB_X2_I0+8+2+0:vgprValuB_X2_I0+8+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[36:39] // left value = acc[36+0:39+0]
/*  mfmaIndex:202  */
v_mfma_f32_16x16x32_fp8_fp8 acc[40:43], v[vgprValuB_X2_I0+8+2+0:vgprValuB_X2_I0+8+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[40:43] // left value = acc[40+0:43+0]
/*  mfmaIndex:203  */
v_mfma_f32_16x16x32_fp8_fp8 acc[44:47], v[vgprValuB_X2_I0+8+2+0:vgprValuB_X2_I0+8+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[44:47] // left value = acc[44+0:47+0]
/*  mfmaIndex:204  */
v_mfma_f32_16x16x32_fp8_fp8 acc[48:51], v[vgprValuB_X2_I0+12+2+0:vgprValuB_X2_I0+12+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[48:51] // left value = acc[48+0:51+0]
/*  mfmaIndex:205  */
v_mfma_f32_16x16x32_fp8_fp8 acc[52:55], v[vgprValuB_X2_I0+12+2+0:vgprValuB_X2_I0+12+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[52:55] // left value = acc[52+0:55+0]
/*  mfmaIndex:206  */
v_mfma_f32_16x16x32_fp8_fp8 acc[56:59], v[vgprValuB_X2_I0+12+2+0:vgprValuB_X2_I0+12+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[56:59] // left value = acc[56+0:59+0]
/*  mfmaIndex:207  */
v_mfma_f32_16x16x32_fp8_fp8 acc[60:63], v[vgprValuB_X2_I0+12+2+0:vgprValuB_X2_I0+12+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[60:63] // left value = acc[60+0:63+0]
/*  mfmaIndex:208  */
/* B global load: 4 dwords per lane -> vgprG2LB+20..+23, scalar offset sgprScalarGlobalReadOffsetB+4 */
buffer_load_dwordx4 v[vgprG2LB+20:vgprG2LB+20+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:0 // G -> Reg 0_0_5_0
v_mfma_f32_16x16x32_fp8_fp8 acc[64:67], v[vgprValuB_X2_I0+16+2+0:vgprValuB_X2_I0+16+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[64:67] // left value = acc[64+0:67+0]
/*  mfmaIndex:209  */
v_mfma_f32_16x16x32_fp8_fp8 acc[68:71], v[vgprValuB_X2_I0+16+2+0:vgprValuB_X2_I0+16+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[68:71] // left value = acc[68+0:71+0]
/*  mfmaIndex:210  */
v_mfma_f32_16x16x32_fp8_fp8 acc[72:75], v[vgprValuB_X2_I0+16+2+0:vgprValuB_X2_I0+16+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[72:75] // left value = acc[72+0:75+0]
/*  mfmaIndex:211  */
v_mfma_f32_16x16x32_fp8_fp8 acc[76:79], v[vgprValuB_X2_I0+16+2+0:vgprValuB_X2_I0+16+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[76:79] // left value = acc[76+0:79+0]
/*  mfmaIndex:212  */
v_mfma_f32_16x16x32_fp8_fp8 acc[80:83], v[vgprValuB_X2_I0+20+2+0:vgprValuB_X2_I0+20+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[80:83] // left value = acc[80+0:83+0]
/*  mfmaIndex:213  */
v_mfma_f32_16x16x32_fp8_fp8 acc[84:87], v[vgprValuB_X2_I0+20+2+0:vgprValuB_X2_I0+20+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[84:87] // left value = acc[84+0:87+0]
/*  mfmaIndex:214  */
v_mfma_f32_16x16x32_fp8_fp8 acc[88:91], v[vgprValuB_X2_I0+20+2+0:vgprValuB_X2_I0+20+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[88:91] // left value = acc[88+0:91+0]
/*  mfmaIndex:215  */
v_mfma_f32_16x16x32_fp8_fp8 acc[92:95], v[vgprValuB_X2_I0+20+2+0:vgprValuB_X2_I0+20+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[92:95] // left value = acc[92+0:95+0]
/*  mfmaIndex:216  */
v_mfma_f32_16x16x32_fp8_fp8 acc[96:99], v[vgprValuB_X2_I0+24+2+0:vgprValuB_X2_I0+24+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[96:99] // left value = acc[96+0:99+0]
/*  mfmaIndex:217  */
v_mfma_f32_16x16x32_fp8_fp8 acc[100:103], v[vgprValuB_X2_I0+24+2+0:vgprValuB_X2_I0+24+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[100:103] // left value = acc[100+0:103+0]
/*  mfmaIndex:218  */
v_mfma_f32_16x16x32_fp8_fp8 acc[104:107], v[vgprValuB_X2_I0+24+2+0:vgprValuB_X2_I0+24+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[104:107] // left value = acc[104+0:107+0]
/*  mfmaIndex:219  */
v_mfma_f32_16x16x32_fp8_fp8 acc[108:111], v[vgprValuB_X2_I0+24+2+0:vgprValuB_X2_I0+24+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[108:111] // left value = acc[108+0:111+0]
/*  mfmaIndex:220  */
v_mfma_f32_16x16x32_fp8_fp8 acc[112:115], v[vgprValuB_X2_I0+28+2+0:vgprValuB_X2_I0+28+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[112:115] // left value = acc[112+0:115+0]
/*  mfmaIndex:221  */
v_mfma_f32_16x16x32_fp8_fp8 acc[116:119], v[vgprValuB_X2_I0+28+2+0:vgprValuB_X2_I0+28+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[116:119] // left value = acc[116+0:119+0]
/*  mfmaIndex:222  */

/*
 * Synchronization point of iteration 3.
 *   1. s_waitcnt lgkmcnt(0) stalls this wave until its outstanding
 *      LDS/scalar-memory (LGKM) counter reaches zero.
 *   2. s_barrier then holds the wave until the other waves of the workgroup
 *      reach the same barrier.
 *   3. Only after that does the first ds_read_b128 from LDS into the
 *      vgprValuB_X0_I0 registers get issued.
 * The two MFMAs in this block still read the vgprValuB_X2_I0 / vgprValuA_X2_I0
 * registers, so the ds_read destination is not an operand of these MFMAs.
 *
 * NOTE(review): the generated comment says the wait is for local (LDS)
 * writes; the writes themselves are issued outside this block - confirm
 * against the earlier iterations.  The two "swap offsets" headings below are
 * empty here: no instruction is emitted for them.
 */
/* local write swap offsets a */

/* local write swap offsets b */
s_waitcnt lgkmcnt(0)                               // 3wait for local write
// Skip force waitcnt0
s_barrier
v_mfma_f32_16x16x32_fp8_fp8 acc[120:123], v[vgprValuB_X2_I0+28+2+0:vgprValuB_X2_I0+28+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[120:123] // left value = acc[120+0:123+0]
/*  mfmaIndex:223  */
/* first of 16 LDS reads refilling vgprValuB_X0_I0 (byte offset 0 from vgprLocalReadAddrB) */
ds_read_b128 v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprLocalReadAddrB] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[124:127], v[vgprValuB_X2_I0+28+2+0:vgprValuB_X2_I0+28+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[124:127] // left value = acc[124+0:127+0]
/*
 * Second part of iteration 3: MFMAs 224..255 (acc[128:131] .. acc[252:255]),
 * with the same operand pattern as the first part (B operand
 * vgprValuB_X2_I0 + {32..60} + 2, A operand vgprValuA_X2_I0 + {0,4,8,12} + 2,
 * in-place accumulation).
 *
 * Interleaved with the MFMAs:
 *   - 15 ds_read_b128 instructions that fill vgprValuB_X0_I0+4 .. +63 from
 *     LDS at vgprLocalReadAddrB, byte offsets 128..1920 in steps of 128
 *     (offset 0 was read just before this block).  The destination is the X0
 *     buffer while the MFMAs read the X2 buffer, so the reads do not
 *     overwrite any operand used in this iteration.
 *   - 2 buffer_load_dwordx4 instructions into vgprG2LB+24..+27 and
 *     vgprG2LB+28..+31, using scalar offsets sgprScalarGlobalReadOffsetB+5
 *     and +6.
 * No s_waitcnt appears in this block, so neither the LDS reads nor the global
 * loads are waited on here; the consumer must wait before using them.
 */
/*  mfmaIndex:224  */
/* B global load: 4 dwords per lane -> vgprG2LB+24..+27 */
buffer_load_dwordx4 v[vgprG2LB+24:vgprG2LB+24+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:0 // G -> Reg 0_0_6_0
v_mfma_f32_16x16x32_fp8_fp8 acc[128:131], v[vgprValuB_X2_I0+32+2+0:vgprValuB_X2_I0+32+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[128:131] // left value = acc[128+0:131+0]
/*  mfmaIndex:225  */
ds_read_b128 v[vgprValuB_X0_I0+4:vgprValuB_X0_I0+4+3], v[vgprLocalReadAddrB] offset:128 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[132:135], v[vgprValuB_X2_I0+32+2+0:vgprValuB_X2_I0+32+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[132:135] // left value = acc[132+0:135+0]
/*  mfmaIndex:226  */
v_mfma_f32_16x16x32_fp8_fp8 acc[136:139], v[vgprValuB_X2_I0+32+2+0:vgprValuB_X2_I0+32+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[136:139] // left value = acc[136+0:139+0]
/*  mfmaIndex:227  */
ds_read_b128 v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprLocalReadAddrB] offset:256 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[140:143], v[vgprValuB_X2_I0+32+2+0:vgprValuB_X2_I0+32+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[140:143] // left value = acc[140+0:143+0]
/*  mfmaIndex:228  */
v_mfma_f32_16x16x32_fp8_fp8 acc[144:147], v[vgprValuB_X2_I0+36+2+0:vgprValuB_X2_I0+36+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[144:147] // left value = acc[144+0:147+0]
/*  mfmaIndex:229  */
ds_read_b128 v[vgprValuB_X0_I0+12:vgprValuB_X0_I0+12+3], v[vgprLocalReadAddrB] offset:384 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[148:151], v[vgprValuB_X2_I0+36+2+0:vgprValuB_X2_I0+36+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[148:151] // left value = acc[148+0:151+0]
/*  mfmaIndex:230  */
v_mfma_f32_16x16x32_fp8_fp8 acc[152:155], v[vgprValuB_X2_I0+36+2+0:vgprValuB_X2_I0+36+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[152:155] // left value = acc[152+0:155+0]
/*  mfmaIndex:231  */
ds_read_b128 v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprLocalReadAddrB] offset:512 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[156:159], v[vgprValuB_X2_I0+36+2+0:vgprValuB_X2_I0+36+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[156:159] // left value = acc[156+0:159+0]
/*  mfmaIndex:232  */
v_mfma_f32_16x16x32_fp8_fp8 acc[160:163], v[vgprValuB_X2_I0+40+2+0:vgprValuB_X2_I0+40+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[160:163] // left value = acc[160+0:163+0]
/*  mfmaIndex:233  */
ds_read_b128 v[vgprValuB_X0_I0+20:vgprValuB_X0_I0+20+3], v[vgprLocalReadAddrB] offset:640 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[164:167], v[vgprValuB_X2_I0+40+2+0:vgprValuB_X2_I0+40+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[164:167] // left value = acc[164+0:167+0]
/*  mfmaIndex:234  */
v_mfma_f32_16x16x32_fp8_fp8 acc[168:171], v[vgprValuB_X2_I0+40+2+0:vgprValuB_X2_I0+40+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[168:171] // left value = acc[168+0:171+0]
/*  mfmaIndex:235  */
ds_read_b128 v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprLocalReadAddrB] offset:768 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[172:175], v[vgprValuB_X2_I0+40+2+0:vgprValuB_X2_I0+40+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[172:175] // left value = acc[172+0:175+0]
/*  mfmaIndex:236  */
v_mfma_f32_16x16x32_fp8_fp8 acc[176:179], v[vgprValuB_X2_I0+44+2+0:vgprValuB_X2_I0+44+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[176:179] // left value = acc[176+0:179+0]
/*  mfmaIndex:237  */
ds_read_b128 v[vgprValuB_X0_I0+28:vgprValuB_X0_I0+28+3], v[vgprLocalReadAddrB] offset:896 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[180:183], v[vgprValuB_X2_I0+44+2+0:vgprValuB_X2_I0+44+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[180:183] // left value = acc[180+0:183+0]
/*  mfmaIndex:238  */
v_mfma_f32_16x16x32_fp8_fp8 acc[184:187], v[vgprValuB_X2_I0+44+2+0:vgprValuB_X2_I0+44+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[184:187] // left value = acc[184+0:187+0]
/*  mfmaIndex:239  */
ds_read_b128 v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprLocalReadAddrB] offset:1024 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=8 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[188:191], v[vgprValuB_X2_I0+44+2+0:vgprValuB_X2_I0+44+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[188:191] // left value = acc[188+0:191+0]
/*  mfmaIndex:240  */
/* B global load: 4 dwords per lane -> vgprG2LB+28..+31 */
buffer_load_dwordx4 v[vgprG2LB+28:vgprG2LB+28+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:0 // G -> Reg 0_0_7_0
v_mfma_f32_16x16x32_fp8_fp8 acc[192:195], v[vgprValuB_X2_I0+48+2+0:vgprValuB_X2_I0+48+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[192:195] // left value = acc[192+0:195+0]
/*  mfmaIndex:241  */
ds_read_b128 v[vgprValuB_X0_I0+36:vgprValuB_X0_I0+36+3], v[vgprLocalReadAddrB] offset:1152 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=9 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[196:199], v[vgprValuB_X2_I0+48+2+0:vgprValuB_X2_I0+48+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[196:199] // left value = acc[196+0:199+0]
/*  mfmaIndex:242  */
v_mfma_f32_16x16x32_fp8_fp8 acc[200:203], v[vgprValuB_X2_I0+48+2+0:vgprValuB_X2_I0+48+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[200:203] // left value = acc[200+0:203+0]
/*  mfmaIndex:243  */
ds_read_b128 v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprLocalReadAddrB] offset:1280 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=10 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[204:207], v[vgprValuB_X2_I0+48+2+0:vgprValuB_X2_I0+48+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[204:207] // left value = acc[204+0:207+0]
/*  mfmaIndex:244  */
v_mfma_f32_16x16x32_fp8_fp8 acc[208:211], v[vgprValuB_X2_I0+52+2+0:vgprValuB_X2_I0+52+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[208:211] // left value = acc[208+0:211+0]
/*  mfmaIndex:245  */
ds_read_b128 v[vgprValuB_X0_I0+44:vgprValuB_X0_I0+44+3], v[vgprLocalReadAddrB] offset:1408 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=11 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[212:215], v[vgprValuB_X2_I0+52+2+0:vgprValuB_X2_I0+52+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[212:215] // left value = acc[212+0:215+0]
/*  mfmaIndex:246  */
v_mfma_f32_16x16x32_fp8_fp8 acc[216:219], v[vgprValuB_X2_I0+52+2+0:vgprValuB_X2_I0+52+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[216:219] // left value = acc[216+0:219+0]
/*  mfmaIndex:247  */
ds_read_b128 v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprLocalReadAddrB] offset:1536 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=12 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[220:223], v[vgprValuB_X2_I0+52+2+0:vgprValuB_X2_I0+52+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[220:223] // left value = acc[220+0:223+0]
/*  mfmaIndex:248  */
v_mfma_f32_16x16x32_fp8_fp8 acc[224:227], v[vgprValuB_X2_I0+56+2+0:vgprValuB_X2_I0+56+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[224:227] // left value = acc[224+0:227+0]
/*  mfmaIndex:249  */
ds_read_b128 v[vgprValuB_X0_I0+52:vgprValuB_X0_I0+52+3], v[vgprLocalReadAddrB] offset:1664 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=13 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[228:231], v[vgprValuB_X2_I0+56+2+0:vgprValuB_X2_I0+56+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[228:231] // left value = acc[228+0:231+0]
/*  mfmaIndex:250  */
v_mfma_f32_16x16x32_fp8_fp8 acc[232:235], v[vgprValuB_X2_I0+56+2+0:vgprValuB_X2_I0+56+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[232:235] // left value = acc[232+0:235+0]
/*  mfmaIndex:251  */
ds_read_b128 v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprLocalReadAddrB] offset:1792 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=14 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[236:239], v[vgprValuB_X2_I0+56+2+0:vgprValuB_X2_I0+56+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[236:239] // left value = acc[236+0:239+0]
/*  mfmaIndex:252  */
v_mfma_f32_16x16x32_fp8_fp8 acc[240:243], v[vgprValuB_X2_I0+60+2+0:vgprValuB_X2_I0+60+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[240:243] // left value = acc[240+0:243+0]
/*  mfmaIndex:253  */
/* last LDS read of the X0 refill (byte offset 1920) */
ds_read_b128 v[vgprValuB_X0_I0+60:vgprValuB_X0_I0+60+3], v[vgprLocalReadAddrB] offset:1920 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=15 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[244:247], v[vgprValuB_X2_I0+60+2+0:vgprValuB_X2_I0+60+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[244:247] // left value = acc[244+0:247+0]
/*  mfmaIndex:254  */
v_mfma_f32_16x16x32_fp8_fp8 acc[248:251], v[vgprValuB_X2_I0+60+2+0:vgprValuB_X2_I0+60+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[248:251] // left value = acc[248+0:251+0]
/*  mfmaIndex:255  */
v_mfma_f32_16x16x32_fp8_fp8 acc[252:255], v[vgprValuB_X2_I0+60+2+0:vgprValuB_X2_I0+60+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[252:255] // left value = acc[252+0:255+0]
/* Generator bookkeeping for this iteration (not instructions): */
/* numPrefetchIter=1 */
/* dataAtIterA=0 numReadsIterA=1 skipReadsIterA=1 readsPerIterA=4 */
/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=1 readsPerIterB=16 */

/*
 * Rebind the A-operand register aliases to the "_0" register set.
 * .set is an assemble-time directive: it changes what vgprValuA_X0_I0 and
 * vgprValuA_X2_I0 resolve to for the instructions that follow in the file;
 * it emits no machine code and is not re-evaluated per loop trip.
 * NOTE(review): this presumably pairs with a "_1" binding earlier in the
 * loop body (A operands ping-ponging between two register sets) - confirm.
 */
.set vgprValuA_X0_I0, vgprValuA_X0_I0_0
.set vgprValuA_X2_I0, vgprValuA_X2_I0_0

/******************************************/
/* Unrolled Loop - End                    */
/******************************************/

/* closeLoop loopL finalLoop=1 tailLoop=0 */
/*
 * Loop back-edge: decrement the trip counter, then compare it with 2.
 * s_cmp_eq_i32 sets SCC=1 only when the counter equals 2, and
 * s_cbranch_scc0 branches when SCC==0, so the loop repeats while the
 * decremented counter is not 2 and falls through to label_LoopEndL_even
 * once it reaches 2.
 * NOTE(review): the remaining iterations are presumably handled by the code
 * after the label - confirm against the loop prologue/epilogue.
 */
s_sub_u32 s[sgprLoopCounterL], s[sgprLoopCounterL], 1 // dec counterL
s_cmp_eq_i32 s[sgprLoopCounterL], 0x2              // counterL==2
s_cbranch_scc0 label_LoopBeginL                    // restart LoopL
label_LoopEndL_even:

/* Before NLL: Check VGPR.checkin for INT8 LW */

/******************************************/
/* Ord. NoGlobalLoadLoop - Begin          */
/******************************************/
s_waitcnt vmcnt(8)

/* iter 0 */
/*  grEndMfmaIndex:18, lwStartMfmaIndex:35, lwEndMfmaIndex:223  */
/*  numMfmaForLR:30, syncPlrMfmaIndex:225  */
/*  mfmaIndex:0  */
s_waitcnt lgkmcnt(0)                               // wait for prior local read local write old=0, new=0 newLW=0 newLR=0
v_mfma_f32_16x16x32_fp8_fp8 acc[0:3], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[0:3] // left value = acc[0+0:3+0]
/*  mfmaIndex:1  */
buffer_load_dwordx4 v[vgprValuA_X0_I0_1+0:vgprValuA_X0_I0_1+0+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0 // G -> Reg 0_0_0_0
buffer_load_dwordx4 v[vgprValuA_X2_I0_1+0:vgprValuA_X2_I0_1+0+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:0 // G -> Reg 0_0_1_0
/* global read inc B loopL */
s_cmp_eq_u32 s[sgprLoopCounterL], s[sgprStaggerUIter] // Is this the wrapIter?
v_mfma_f32_16x16x32_fp8_fp8 acc[4:7], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[4:7] // left value = acc[4+0:7+0]
/*  mfmaIndex:2  */
ds_read_b128 v[vgprValuB_X2_I0+0:vgprValuB_X2_I0+0+3], v[vgprLocalReadAddrB] offset:64 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0
s_cselect_b32 s80, s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- ?
v_mfma_f32_16x16x32_fp8_fp8 acc[8:11], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[8:11] // left value = acc[8+0:11+0]
/*  mfmaIndex:3  */
s_cselect_b32 s81, s[sgprWrapUB+1], 0              // incUpper <- ?
v_mfma_f32_16x16x32_fp8_fp8 acc[12:15], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[12:15] // left value = acc[12+0:15+0]
/*  mfmaIndex:4  */
s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s80        // gra SRD += inc(lower)
v_mfma_f32_16x16x32_fp8_fp8 acc[16:19], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[16:19] // left value = acc[16+0:19+0]
/*  mfmaIndex:5  */
ds_read_b128 v[vgprValuB_X2_I0+4:vgprValuB_X2_I0+4+3], v[vgprLocalReadAddrB] offset:192 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=2 iui=0
s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s81       // gra SRD += inc(upper)
v_mfma_f32_16x16x32_fp8_fp8 acc[20:23], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[20:23] // left value = acc[20+0:23+0]
/*  mfmaIndex:6  */
s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s80 // limit -= inc)
v_mfma_f32_16x16x32_fp8_fp8 acc[24:27], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[24:27] // left value = acc[24+0:27+0]
/*  mfmaIndex:7  */
s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s81 // limit -= inc)
v_mfma_f32_16x16x32_fp8_fp8 acc[28:31], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[28:31] // left value = acc[28+0:31+0]
/*  mfmaIndex:8  */
ds_read_b128 v[vgprValuB_X2_I0+8:vgprValuB_X2_I0+8+3], v[vgprLocalReadAddrB] offset:320 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=2 iui=0
s_cmp_eq_u32 s[sgprShadowLimitB+1], 0              // are we within 2^32?
v_mfma_f32_16x16x32_fp8_fp8 acc[32:35], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[32:35] // left value = acc[32+0:35+0]
/*  mfmaIndex:9  */
s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32
v_mfma_f32_16x16x32_fp8_fp8 acc[36:39], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[36:39] // left value = acc[36+0:39+0]
/*  mfmaIndex:10  */
v_mfma_f32_16x16x32_fp8_fp8 acc[40:43], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[40:43] // left value = acc[40+0:43+0]
/*  mfmaIndex:11  */
ds_read_b128 v[vgprValuB_X2_I0+12:vgprValuB_X2_I0+12+3], v[vgprLocalReadAddrB] offset:448 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[44:47], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[44:47] // left value = acc[44+0:47+0]
/*  mfmaIndex:12  */
v_mfma_f32_16x16x32_fp8_fp8 acc[48:51], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[48:51] // left value = acc[48+0:51+0]
/*  mfmaIndex:13  */
v_mfma_f32_16x16x32_fp8_fp8 acc[52:55], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[52:55] // left value = acc[52+0:55+0]
/*  mfmaIndex:14  */
ds_read_b128 v[vgprValuB_X2_I0+16:vgprValuB_X2_I0+16+3], v[vgprLocalReadAddrB] offset:576 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[56:59], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[56:59] // left value = acc[56+0:59+0]
/*  mfmaIndex:15  */
v_mfma_f32_16x16x32_fp8_fp8 acc[60:63], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[60:63] // left value = acc[60+0:63+0]
/*  mfmaIndex:16  */
v_mfma_f32_16x16x32_fp8_fp8 acc[64:67], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[64:67] // left value = acc[64+0:67+0]
/*  mfmaIndex:17  */
ds_read_b128 v[vgprValuB_X2_I0+20:vgprValuB_X2_I0+20+3], v[vgprLocalReadAddrB] offset:704 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[68:71], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[68:71] // left value = acc[68+0:71+0]
/*  mfmaIndex:18  */
v_mfma_f32_16x16x32_fp8_fp8 acc[72:75], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[72:75] // left value = acc[72+0:75+0]
/*  mfmaIndex:19  */
v_mfma_f32_16x16x32_fp8_fp8 acc[76:79], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[76:79] // left value = acc[76+0:79+0]
/*  mfmaIndex:20  */
ds_read_b128 v[vgprValuB_X2_I0+24:vgprValuB_X2_I0+24+3], v[vgprLocalReadAddrB] offset:832 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[80:83], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[80:83] // left value = acc[80+0:83+0]
/*  mfmaIndex:21  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[84:87], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[84:87] // left value = acc[84+0:87+0]
/*  mfmaIndex:22  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[88:91], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[88:91] // left value = acc[88+0:91+0]
/*  mfmaIndex:23  */
/* localReadsVacancy: latencyLeft 2 */
ds_read_b128 v[vgprValuB_X2_I0+28:vgprValuB_X2_I0+28+3], v[vgprLocalReadAddrB] offset:960 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[92:95], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[92:95] // left value = acc[92+0:95+0]
/*  mfmaIndex:24  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[96:99], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[96:99] // left value = acc[96+0:99+0]
/*  mfmaIndex:25  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[100:103], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[100:103] // left value = acc[100+0:103+0]
/*  mfmaIndex:26  */
/* localReadsVacancy: latencyLeft 2 */
ds_read_b128 v[vgprValuB_X2_I0+32:vgprValuB_X2_I0+32+3], v[vgprLocalReadAddrB] offset:1088 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=8 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[104:107], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[104:107] // left value = acc[104+0:107+0]
/*  mfmaIndex:27  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[108:111], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[108:111] // left value = acc[108+0:111+0]
/*  mfmaIndex:28  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[112:115], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[112:115] // left value = acc[112+0:115+0]
/*  mfmaIndex:29  */
/* localReadsVacancy: latencyLeft 2 */
ds_read_b128 v[vgprValuB_X2_I0+36:vgprValuB_X2_I0+36+3], v[vgprLocalReadAddrB] offset:1216 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=9 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[116:119], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[116:119] // left value = acc[116+0:119+0]
/*  mfmaIndex:30  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[120:123], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[120:123] // left value = acc[120+0:123+0]
/*  mfmaIndex:31  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[124:127], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[124:127] // left value = acc[124+0:127+0]
/*  mfmaIndex:32  */
/* localReadsVacancy: latencyLeft 2 */
buffer_load_dwordx4 v[vgprValuA_X0_I0_1+4:vgprValuA_X0_I0_1+4+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:0 // G -> Reg 0_0_1_0
buffer_load_dwordx4 v[vgprValuA_X2_I0_1+4:vgprValuA_X2_I0_1+4+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:0 // G -> Reg 0_0_1_0
v_mfma_f32_16x16x32_fp8_fp8 acc[128:131], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[128:131] // left value = acc[128+0:131+0]
/*  mfmaIndex:33  */
/* localReadsVacancy: latencyLeft 2 */
ds_read_b128 v[vgprValuB_X2_I0+40:vgprValuB_X2_I0+40+3], v[vgprLocalReadAddrB] offset:1344 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=10 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[132:135], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[132:135] // left value = acc[132+0:135+0]
/*  mfmaIndex:34  */
v_mfma_f32_16x16x32_fp8_fp8 acc[136:139], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[136:139] // left value = acc[136+0:139+0]
/*  mfmaIndex:35  */
v_mfma_f32_16x16x32_fp8_fp8 acc[140:143], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[140:143] // left value = acc[140+0:143+0]
/*  mfmaIndex:36  */
ds_read_b128 v[vgprValuB_X2_I0+44:vgprValuB_X2_I0+44+3], v[vgprLocalReadAddrB] offset:1472 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=11 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[144:147], v[vgprValuB_X0_I0+36+0+0:vgprValuB_X0_I0+36+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[144:147] // left value = acc[144+0:147+0]
/*  mfmaIndex:37  */
v_mfma_f32_16x16x32_fp8_fp8 acc[148:151], v[vgprValuB_X0_I0+36+0+0:vgprValuB_X0_I0+36+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[148:151] // left value = acc[148+0:151+0]
/*  mfmaIndex:38  */
v_mfma_f32_16x16x32_fp8_fp8 acc[152:155], v[vgprValuB_X0_I0+36+0+0:vgprValuB_X0_I0+36+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[152:155] // left value = acc[152+0:155+0]
/*  mfmaIndex:39  */
ds_read_b128 v[vgprValuB_X2_I0+48:vgprValuB_X2_I0+48+3], v[vgprLocalReadAddrB] offset:1600 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=12 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[156:159], v[vgprValuB_X0_I0+36+0+0:vgprValuB_X0_I0+36+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[156:159] // left value = acc[156+0:159+0]
/*  mfmaIndex:40  */
v_mfma_f32_16x16x32_fp8_fp8 acc[160:163], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[160:163] // left value = acc[160+0:163+0]
/*  mfmaIndex:41  */
v_mfma_f32_16x16x32_fp8_fp8 acc[164:167], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[164:167] // left value = acc[164+0:167+0]
/*  mfmaIndex:42  */
ds_read_b128 v[vgprValuB_X2_I0+52:vgprValuB_X2_I0+52+3], v[vgprLocalReadAddrB] offset:1728 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=13 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[168:171], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[168:171] // left value = acc[168+0:171+0]
/*  mfmaIndex:43  */
v_mfma_f32_16x16x32_fp8_fp8 acc[172:175], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[172:175] // left value = acc[172+0:175+0]
/*  mfmaIndex:44  */
v_mfma_f32_16x16x32_fp8_fp8 acc[176:179], v[vgprValuB_X0_I0+44+0+0:vgprValuB_X0_I0+44+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[176:179] // left value = acc[176+0:179+0]
/*  mfmaIndex:45  */
ds_read_b128 v[vgprValuB_X2_I0+56:vgprValuB_X2_I0+56+3], v[vgprLocalReadAddrB] offset:1856 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=14 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[180:183], v[vgprValuB_X0_I0+44+0+0:vgprValuB_X0_I0+44+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[180:183] // left value = acc[180+0:183+0]
/*  mfmaIndex:46  */
v_mfma_f32_16x16x32_fp8_fp8 acc[184:187], v[vgprValuB_X0_I0+44+0+0:vgprValuB_X0_I0+44+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[184:187] // left value = acc[184+0:187+0]
/*  mfmaIndex:47  */
v_mfma_f32_16x16x32_fp8_fp8 acc[188:191], v[vgprValuB_X0_I0+44+0+0:vgprValuB_X0_I0+44+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[188:191] // left value = acc[188+0:191+0]
/*  mfmaIndex:48  */
ds_read_b128 v[vgprValuB_X2_I0+60:vgprValuB_X2_I0+60+3], v[vgprLocalReadAddrB] offset:1984 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=15 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[192:195], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[192:195] // left value = acc[192+0:195+0]
/*  mfmaIndex:49  */
v_mfma_f32_16x16x32_fp8_fp8 acc[196:199], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[196:199] // left value = acc[196+0:199+0]
/*  mfmaIndex:50  */
v_mfma_f32_16x16x32_fp8_fp8 acc[200:203], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[200:203] // left value = acc[200+0:203+0]
/*  mfmaIndex:51  */
v_mfma_f32_16x16x32_fp8_fp8 acc[204:207], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[204:207] // left value = acc[204+0:207+0]
/*  mfmaIndex:52  */
v_mfma_f32_16x16x32_fp8_fp8 acc[208:211], v[vgprValuB_X0_I0+52+0+0:vgprValuB_X0_I0+52+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[208:211] // left value = acc[208+0:211+0]
/*  mfmaIndex:53  */
v_mfma_f32_16x16x32_fp8_fp8 acc[212:215], v[vgprValuB_X0_I0+52+0+0:vgprValuB_X0_I0+52+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[212:215] // left value = acc[212+0:215+0]
/*  mfmaIndex:54  */
v_mfma_f32_16x16x32_fp8_fp8 acc[216:219], v[vgprValuB_X0_I0+52+0+0:vgprValuB_X0_I0+52+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[216:219] // left value = acc[216+0:219+0]
/*  mfmaIndex:55  */
v_mfma_f32_16x16x32_fp8_fp8 acc[220:223], v[vgprValuB_X0_I0+52+0+0:vgprValuB_X0_I0+52+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[220:223] // left value = acc[220+0:223+0]
/*  mfmaIndex:56  */
v_mfma_f32_16x16x32_fp8_fp8 acc[224:227], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[224:227] // left value = acc[224+0:227+0]
/*  mfmaIndex:57  */
v_mfma_f32_16x16x32_fp8_fp8 acc[228:231], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[228:231] // left value = acc[228+0:231+0]
/*  mfmaIndex:58  */
v_mfma_f32_16x16x32_fp8_fp8 acc[232:235], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[232:235] // left value = acc[232+0:235+0]
/*  mfmaIndex:59  */
v_mfma_f32_16x16x32_fp8_fp8 acc[236:239], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[236:239] // left value = acc[236+0:239+0]
/*  mfmaIndex:60  */
v_mfma_f32_16x16x32_fp8_fp8 acc[240:243], v[vgprValuB_X0_I0+60+0+0:vgprValuB_X0_I0+60+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[240:243] // left value = acc[240+0:243+0]
/*  mfmaIndex:61  */
v_mfma_f32_16x16x32_fp8_fp8 acc[244:247], v[vgprValuB_X0_I0+60+0+0:vgprValuB_X0_I0+60+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[244:247] // left value = acc[244+0:247+0]
/*  mfmaIndex:62  */
/* 1 LDS buffer: read-sync-write */
/* The local reads (ds_read_b128) issued earlier in this iteration and the local */
/* writes (ds_write_b128) issued in the following iterations target LDS, so the  */
/* reads must be finished in every wave before any wave starts writing.          */
/* lgkmcnt(0): block until this wave has no LDS/scalar-memory operation pending. */
s_waitcnt lgkmcnt(0)
/* Workgroup barrier: no wave continues until all waves have reached this point. */
s_barrier
v_mfma_f32_16x16x32_fp8_fp8 acc[248:251], v[vgprValuB_X0_I0+60+0+0:vgprValuB_X0_I0+60+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[248:251] // left value = acc[248+0:251+0]
/*  mfmaIndex:63  */
/* Block until at most 11 vector-memory operations are still outstanding.  The   */
/* first ds_write_b128 after this wait stores v[vgprG2LB+0:vgprG2LB+3] to LDS.   */
s_waitcnt vmcnt(11)                                // wait for global read before writing to local
v_mfma_f32_16x16x32_fp8_fp8 acc[252:255], v[vgprValuB_X0_I0+60+0+0:vgprValuB_X0_I0+60+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[252:255] // left value = acc[252+0:255+0]
/* numPrefetchIter=0 */
/* dataAtIterA=-1 numReadsIterA=1 skipReadsIterA=1 readsPerIterA=4 */
/* dataAtIterB=-1 numReadsIterB=1 skipReadsIterB=1 readsPerIterB=16 */

/* iter 1 */
/*  grEndMfmaIndex:18, lwStartMfmaIndex:35, lwEndMfmaIndex:223  */
/*  numMfmaForLR:30, syncPlrMfmaIndex:225  */
/* Iteration 1 of the unrolled loop body (mfmaIndex 64..127).                     */
/* Every MFMA here reads the VGPR pair at offset +2 inside each 4-register A/B    */
/* group (the visible part of iteration 0 used offset +0) and accumulates in      */
/* place: destination and third source are the same acc[] range.                  */
/* Indexing: for B group b (0,4,...,60) and A group a (0,4,8,12) the accumulator  */
/* range is acc[4*b+a : 4*b+a+3]; B is the outer index, A the inner one.          */
/* Interleaved with the MFMAs are ds_write_b128 stores of vgprG2LB to LDS and     */
/* buffer_load_dwordx4 loads of A into the vgprValuA_X0_I0_1 / vgprValuA_X2_I0_1  */
/* register sets.  This part covers mfmaIndex 64..95 (acc[0:127]).                */
/*  mfmaIndex:64  */
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+0:vgprG2LB+0+3] offset:0 // lwoB_0_0_0_0 = (0*LSCB)*(MT1J+PAD) + (0*LSPB) = 0
v_mfma_f32_16x16x32_fp8_fp8 acc[0:3], v[vgprValuB_X0_I0+0+2+0:vgprValuB_X0_I0+0+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[0:3] // left value = acc[0+0:3+0]
/*  mfmaIndex:65  */
/* A load pair, destination offset +8 in the X0 and X2 register sets; the two     */
/* loads differ only in the scalar offset (ScalarGlobalReadOffsetA+1 vs +5).      */
buffer_load_dwordx4 v[vgprValuA_X0_I0_1+8:vgprValuA_X0_I0_1+8+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:0 // G -> Reg 0_0_1_0
buffer_load_dwordx4 v[vgprValuA_X2_I0_1+8:vgprValuA_X2_I0_1+8+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:0 // G -> Reg 0_0_1_0
v_mfma_f32_16x16x32_fp8_fp8 acc[4:7], v[vgprValuB_X0_I0+0+2+0:vgprValuB_X0_I0+0+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[4:7] // left value = acc[4+0:7+0]
/*  mfmaIndex:66  */
v_mfma_f32_16x16x32_fp8_fp8 acc[8:11], v[vgprValuB_X0_I0+0+2+0:vgprValuB_X0_I0+0+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[8:11] // left value = acc[8+0:11+0]
/*  mfmaIndex:67  */
v_mfma_f32_16x16x32_fp8_fp8 acc[12:15], v[vgprValuB_X0_I0+0+2+0:vgprValuB_X0_I0+0+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[12:15] // left value = acc[12+0:15+0]
/*  mfmaIndex:68  */
v_mfma_f32_16x16x32_fp8_fp8 acc[16:19], v[vgprValuB_X0_I0+4+2+0:vgprValuB_X0_I0+4+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[16:19] // left value = acc[16+0:19+0]
/*  mfmaIndex:69  */
v_mfma_f32_16x16x32_fp8_fp8 acc[20:23], v[vgprValuB_X0_I0+4+2+0:vgprValuB_X0_I0+4+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[20:23] // left value = acc[20+0:23+0]
/*  mfmaIndex:70  */
v_mfma_f32_16x16x32_fp8_fp8 acc[24:27], v[vgprValuB_X0_I0+4+2+0:vgprValuB_X0_I0+4+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[24:27] // left value = acc[24+0:27+0]
/*  mfmaIndex:71  */
v_mfma_f32_16x16x32_fp8_fp8 acc[28:31], v[vgprValuB_X0_I0+4+2+0:vgprValuB_X0_I0+4+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[28:31] // left value = acc[28+0:31+0]
/*  mfmaIndex:72  */
v_mfma_f32_16x16x32_fp8_fp8 acc[32:35], v[vgprValuB_X0_I0+8+2+0:vgprValuB_X0_I0+8+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[32:35] // left value = acc[32+0:35+0]
/*  mfmaIndex:73  */
v_mfma_f32_16x16x32_fp8_fp8 acc[36:39], v[vgprValuB_X0_I0+8+2+0:vgprValuB_X0_I0+8+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[36:39] // left value = acc[36+0:39+0]
/*  mfmaIndex:74  */
v_mfma_f32_16x16x32_fp8_fp8 acc[40:43], v[vgprValuB_X0_I0+8+2+0:vgprValuB_X0_I0+8+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[40:43] // left value = acc[40+0:43+0]
/*  mfmaIndex:75  */
v_mfma_f32_16x16x32_fp8_fp8 acc[44:47], v[vgprValuB_X0_I0+8+2+0:vgprValuB_X0_I0+8+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[44:47] // left value = acc[44+0:47+0]
/*  mfmaIndex:76  */
v_mfma_f32_16x16x32_fp8_fp8 acc[48:51], v[vgprValuB_X0_I0+12+2+0:vgprValuB_X0_I0+12+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[48:51] // left value = acc[48+0:51+0]
/*  mfmaIndex:77  */
v_mfma_f32_16x16x32_fp8_fp8 acc[52:55], v[vgprValuB_X0_I0+12+2+0:vgprValuB_X0_I0+12+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[52:55] // left value = acc[52+0:55+0]
/*  mfmaIndex:78  */
/* Threshold is 12 here versus 11 at the wait before the G2LB+0 write: two more   */
/* buffer loads were issued since then.  Presumably 11 + 2 new loads - 1 older    */
/* load that must now have returned - TODO confirm against the load issue order.  */
s_waitcnt vmcnt(12)                                // wait for global read before writing to local
v_mfma_f32_16x16x32_fp8_fp8 acc[56:59], v[vgprValuB_X0_I0+12+2+0:vgprValuB_X0_I0+12+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[56:59] // left value = acc[56+0:59+0]
/*  mfmaIndex:79  */
/* LDS write of the second staged B quad; byte offset is 1*LSPB = 4160.           */
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+4:vgprG2LB+4+3] offset:4160 // lwoB_0_0_1_0 = (0*LSCB)*(MT1J+PAD) + (1*LSPB) = 4160
v_mfma_f32_16x16x32_fp8_fp8 acc[60:63], v[vgprValuB_X0_I0+12+2+0:vgprValuB_X0_I0+12+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[60:63] // left value = acc[60+0:63+0]
/*  mfmaIndex:80  */
v_mfma_f32_16x16x32_fp8_fp8 acc[64:67], v[vgprValuB_X0_I0+16+2+0:vgprValuB_X0_I0+16+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[64:67] // left value = acc[64+0:67+0]
/*  mfmaIndex:81  */
v_mfma_f32_16x16x32_fp8_fp8 acc[68:71], v[vgprValuB_X0_I0+16+2+0:vgprValuB_X0_I0+16+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[68:71] // left value = acc[68+0:71+0]
/*  mfmaIndex:82  */
v_mfma_f32_16x16x32_fp8_fp8 acc[72:75], v[vgprValuB_X0_I0+16+2+0:vgprValuB_X0_I0+16+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[72:75] // left value = acc[72+0:75+0]
/*  mfmaIndex:83  */
v_mfma_f32_16x16x32_fp8_fp8 acc[76:79], v[vgprValuB_X0_I0+16+2+0:vgprValuB_X0_I0+16+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[76:79] // left value = acc[76+0:79+0]
/*  mfmaIndex:84  */
v_mfma_f32_16x16x32_fp8_fp8 acc[80:83], v[vgprValuB_X0_I0+20+2+0:vgprValuB_X0_I0+20+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[80:83] // left value = acc[80+0:83+0]
/*  mfmaIndex:85  */
v_mfma_f32_16x16x32_fp8_fp8 acc[84:87], v[vgprValuB_X0_I0+20+2+0:vgprValuB_X0_I0+20+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[84:87] // left value = acc[84+0:87+0]
/*  mfmaIndex:86  */
v_mfma_f32_16x16x32_fp8_fp8 acc[88:91], v[vgprValuB_X0_I0+20+2+0:vgprValuB_X0_I0+20+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[88:91] // left value = acc[88+0:91+0]
/*  mfmaIndex:87  */
v_mfma_f32_16x16x32_fp8_fp8 acc[92:95], v[vgprValuB_X0_I0+20+2+0:vgprValuB_X0_I0+20+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[92:95] // left value = acc[92+0:95+0]
/*  mfmaIndex:88  */
v_mfma_f32_16x16x32_fp8_fp8 acc[96:99], v[vgprValuB_X0_I0+24+2+0:vgprValuB_X0_I0+24+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[96:99] // left value = acc[96+0:99+0]
/*  mfmaIndex:89  */
v_mfma_f32_16x16x32_fp8_fp8 acc[100:103], v[vgprValuB_X0_I0+24+2+0:vgprValuB_X0_I0+24+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[100:103] // left value = acc[100+0:103+0]
/*  mfmaIndex:90  */
v_mfma_f32_16x16x32_fp8_fp8 acc[104:107], v[vgprValuB_X0_I0+24+2+0:vgprValuB_X0_I0+24+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[104:107] // left value = acc[104+0:107+0]
/*  mfmaIndex:91  */
v_mfma_f32_16x16x32_fp8_fp8 acc[108:111], v[vgprValuB_X0_I0+24+2+0:vgprValuB_X0_I0+24+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[108:111] // left value = acc[108+0:111+0]
/*  mfmaIndex:92  */
v_mfma_f32_16x16x32_fp8_fp8 acc[112:115], v[vgprValuB_X0_I0+28+2+0:vgprValuB_X0_I0+28+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[112:115] // left value = acc[112+0:115+0]
/*  mfmaIndex:93  */
v_mfma_f32_16x16x32_fp8_fp8 acc[116:119], v[vgprValuB_X0_I0+28+2+0:vgprValuB_X0_I0+28+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[116:119] // left value = acc[116+0:119+0]
/*  mfmaIndex:94  */
/* No load was issued since the previous wait, so the threshold drops by one.     */
s_waitcnt vmcnt(11)                                // wait for global read before writing to local
v_mfma_f32_16x16x32_fp8_fp8 acc[120:123], v[vgprValuB_X0_I0+28+2+0:vgprValuB_X0_I0+28+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[120:123] // left value = acc[120+0:123+0]
/*  mfmaIndex:95  */
/* LDS write of the third staged B quad; byte offset is 2*LSPB = 8320.            */
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+8:vgprG2LB+8+3] offset:8320 // lwoB_0_0_2_0 = (0*LSCB)*(MT1J+PAD) + (2*LSPB) = 8320
v_mfma_f32_16x16x32_fp8_fp8 acc[124:127], v[vgprValuB_X0_I0+28+2+0:vgprValuB_X0_I0+28+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[124:127] // left value = acc[124+0:127+0]
/*  mfmaIndex:96  */
/* A load pair, destination offset +12 in the X0 and X2 register sets (scalar     */
/* offsets ScalarGlobalReadOffsetA+2 and +6).  Both loads are issued before the   */
/* SrdA update below, so they still use the un-incremented descriptor.            */
buffer_load_dwordx4 v[vgprValuA_X0_I0_1+12:vgprValuA_X0_I0_1+12+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:0 // G -> Reg 0_0_1_0
buffer_load_dwordx4 v[vgprValuA_X2_I0_1+12:vgprValuA_X2_I0_1+12+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:0 // G -> Reg 0_0_1_0
/* global read inc A loopL */
/* Scalar sequence that advances the A buffer descriptor for the next loop trip,  */
/* one SALU instruction slotted between consecutive MFMAs.  s80/s81 hold the      */
/* 64-bit increment.  Each SCC producer below is consumed before the next SCC     */
/* writer; the interleaved v_mfma instructions do not write SCC.                  */
/*   1. SCC = (LoopCounterL == StaggerUIter)                                      */
/*   2. s[80:81] = SCC ? WrapUA[0:1] : {GlobalReadIncsA+0, 0}                     */
/*   3. SrdA[0:1]         += s[80:81]   (s_add_u32 / s_addc_u32, carry via SCC)   */
/*   4. ShadowLimitA[0:1] -= s[80:81]   (s_sub_u32 / s_subb_u32, borrow via SCC)  */
/*   5. SrdA+2 = (ShadowLimitA+1 == 0) ? ShadowLimitA+0 : BufferLimit             */
s_cmp_eq_u32 s[sgprLoopCounterL], s[sgprStaggerUIter] // Is this the wrapIter?
v_mfma_f32_16x16x32_fp8_fp8 acc[128:131], v[vgprValuB_X0_I0+32+2+0:vgprValuB_X0_I0+32+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[128:131] // left value = acc[128+0:131+0]
/*  mfmaIndex:97  */
s_cselect_b32 s80, s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower <- ?
v_mfma_f32_16x16x32_fp8_fp8 acc[132:135], v[vgprValuB_X0_I0+32+2+0:vgprValuB_X0_I0+32+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[132:135] // left value = acc[132+0:135+0]
/*  mfmaIndex:98  */
s_cselect_b32 s81, s[sgprWrapUA+1], 0              // incUpper <- ?
v_mfma_f32_16x16x32_fp8_fp8 acc[136:139], v[vgprValuB_X0_I0+32+2+0:vgprValuB_X0_I0+32+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[136:139] // left value = acc[136+0:139+0]
/*  mfmaIndex:99  */
s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s80        // gra SRD += inc(lower)
v_mfma_f32_16x16x32_fp8_fp8 acc[140:143], v[vgprValuB_X0_I0+32+2+0:vgprValuB_X0_I0+32+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[140:143] // left value = acc[140+0:143+0]
/*  mfmaIndex:100  */
s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s81       // gra SRD += inc(upper)
v_mfma_f32_16x16x32_fp8_fp8 acc[144:147], v[vgprValuB_X0_I0+36+2+0:vgprValuB_X0_I0+36+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[144:147] // left value = acc[144+0:147+0]
/*  mfmaIndex:101  */
s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s80 // limit -= inc (lower)
v_mfma_f32_16x16x32_fp8_fp8 acc[148:151], v[vgprValuB_X0_I0+36+2+0:vgprValuB_X0_I0+36+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[148:151] // left value = acc[148+0:151+0]
/*  mfmaIndex:102  */
s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s81 // limit -= inc (upper, with borrow)
v_mfma_f32_16x16x32_fp8_fp8 acc[152:155], v[vgprValuB_X0_I0+36+2+0:vgprValuB_X0_I0+36+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[152:155] // left value = acc[152+0:155+0]
/*  mfmaIndex:103  */
s_cmp_eq_u32 s[sgprShadowLimitA+1], 0              // are we within 2^32?
v_mfma_f32_16x16x32_fp8_fp8 acc[156:159], v[vgprValuB_X0_I0+36+2+0:vgprValuB_X0_I0+36+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[156:159] // left value = acc[156+0:159+0]
/*  mfmaIndex:104  */
s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32
/* Remainder of iteration 1 (acc[160:255]).  Same pattern as the MFMAs above:     */
/* B group is the outer index (40..60 here), A group (0,4,8,12) the inner one,    */
/* both read at pair offset +2, accumulating in place into acc[4*b+a : 4*b+a+3].  */
v_mfma_f32_16x16x32_fp8_fp8 acc[160:163], v[vgprValuB_X0_I0+40+2+0:vgprValuB_X0_I0+40+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[160:163] // left value = acc[160+0:163+0]
/*  mfmaIndex:105  */
v_mfma_f32_16x16x32_fp8_fp8 acc[164:167], v[vgprValuB_X0_I0+40+2+0:vgprValuB_X0_I0+40+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[164:167] // left value = acc[164+0:167+0]
/*  mfmaIndex:106  */
v_mfma_f32_16x16x32_fp8_fp8 acc[168:171], v[vgprValuB_X0_I0+40+2+0:vgprValuB_X0_I0+40+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[168:171] // left value = acc[168+0:171+0]
/*  mfmaIndex:107  */
v_mfma_f32_16x16x32_fp8_fp8 acc[172:175], v[vgprValuB_X0_I0+40+2+0:vgprValuB_X0_I0+40+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[172:175] // left value = acc[172+0:175+0]
/*  mfmaIndex:108  */
v_mfma_f32_16x16x32_fp8_fp8 acc[176:179], v[vgprValuB_X0_I0+44+2+0:vgprValuB_X0_I0+44+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[176:179] // left value = acc[176+0:179+0]
/*  mfmaIndex:109  */
v_mfma_f32_16x16x32_fp8_fp8 acc[180:183], v[vgprValuB_X0_I0+44+2+0:vgprValuB_X0_I0+44+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[180:183] // left value = acc[180+0:183+0]
/*  mfmaIndex:110  */
/* Threshold 12: one higher than the preceding vmcnt(11) because the A load pair  */
/* at destination offset +12 was issued in between (presumably 11 + 2 - 1).       */
s_waitcnt vmcnt(12)                                // wait for global read before writing to local
v_mfma_f32_16x16x32_fp8_fp8 acc[184:187], v[vgprValuB_X0_I0+44+2+0:vgprValuB_X0_I0+44+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[184:187] // left value = acc[184+0:187+0]
/*  mfmaIndex:111  */
/* LDS write of the fourth staged B quad; byte offset is 3*LSPB = 12480.          */
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+12:vgprG2LB+12+3] offset:12480 // lwoB_0_0_3_0 = (0*LSCB)*(MT1J+PAD) + (3*LSPB) = 12480
v_mfma_f32_16x16x32_fp8_fp8 acc[188:191], v[vgprValuB_X0_I0+44+2+0:vgprValuB_X0_I0+44+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[188:191] // left value = acc[188+0:191+0]
/*  mfmaIndex:112  */
v_mfma_f32_16x16x32_fp8_fp8 acc[192:195], v[vgprValuB_X0_I0+48+2+0:vgprValuB_X0_I0+48+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[192:195] // left value = acc[192+0:195+0]
/*  mfmaIndex:113  */
v_mfma_f32_16x16x32_fp8_fp8 acc[196:199], v[vgprValuB_X0_I0+48+2+0:vgprValuB_X0_I0+48+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[196:199] // left value = acc[196+0:199+0]
/*  mfmaIndex:114  */
v_mfma_f32_16x16x32_fp8_fp8 acc[200:203], v[vgprValuB_X0_I0+48+2+0:vgprValuB_X0_I0+48+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[200:203] // left value = acc[200+0:203+0]
/*  mfmaIndex:115  */
v_mfma_f32_16x16x32_fp8_fp8 acc[204:207], v[vgprValuB_X0_I0+48+2+0:vgprValuB_X0_I0+48+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[204:207] // left value = acc[204+0:207+0]
/*  mfmaIndex:116  */
v_mfma_f32_16x16x32_fp8_fp8 acc[208:211], v[vgprValuB_X0_I0+52+2+0:vgprValuB_X0_I0+52+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[208:211] // left value = acc[208+0:211+0]
/*  mfmaIndex:117  */
v_mfma_f32_16x16x32_fp8_fp8 acc[212:215], v[vgprValuB_X0_I0+52+2+0:vgprValuB_X0_I0+52+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[212:215] // left value = acc[212+0:215+0]
/*  mfmaIndex:118  */
v_mfma_f32_16x16x32_fp8_fp8 acc[216:219], v[vgprValuB_X0_I0+52+2+0:vgprValuB_X0_I0+52+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[216:219] // left value = acc[216+0:219+0]
/*  mfmaIndex:119  */
v_mfma_f32_16x16x32_fp8_fp8 acc[220:223], v[vgprValuB_X0_I0+52+2+0:vgprValuB_X0_I0+52+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[220:223] // left value = acc[220+0:223+0]
/*  mfmaIndex:120  */
v_mfma_f32_16x16x32_fp8_fp8 acc[224:227], v[vgprValuB_X0_I0+56+2+0:vgprValuB_X0_I0+56+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[224:227] // left value = acc[224+0:227+0]
/*  mfmaIndex:121  */
v_mfma_f32_16x16x32_fp8_fp8 acc[228:231], v[vgprValuB_X0_I0+56+2+0:vgprValuB_X0_I0+56+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[228:231] // left value = acc[228+0:231+0]
/*  mfmaIndex:122  */
v_mfma_f32_16x16x32_fp8_fp8 acc[232:235], v[vgprValuB_X0_I0+56+2+0:vgprValuB_X0_I0+56+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[232:235] // left value = acc[232+0:235+0]
/*  mfmaIndex:123  */
v_mfma_f32_16x16x32_fp8_fp8 acc[236:239], v[vgprValuB_X0_I0+56+2+0:vgprValuB_X0_I0+56+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[236:239] // left value = acc[236+0:239+0]
/*  mfmaIndex:124  */
v_mfma_f32_16x16x32_fp8_fp8 acc[240:243], v[vgprValuB_X0_I0+60+2+0:vgprValuB_X0_I0+60+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[240:243] // left value = acc[240+0:243+0]
/*  mfmaIndex:125  */
v_mfma_f32_16x16x32_fp8_fp8 acc[244:247], v[vgprValuB_X0_I0+60+2+0:vgprValuB_X0_I0+60+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[244:247] // left value = acc[244+0:247+0]
/*  mfmaIndex:126  */
/* No load issued since the previous wait, so the threshold drops back to 11.     */
/* NOTE(review): around this iteration the thresholds go 11 -> 12 -> 11 -> 12 ->  */
/* 11, i.e. -1 per LDS write and +2 per A load pair.  The three waits that follow */
/* in iter 2 (before the G2LB+20/+24/+28 writes) all stay at vmcnt(11) although   */
/* no buffer_load is issued between them - confirm that threshold is sufficient.  */
s_waitcnt vmcnt(11)                                // wait for global read before writing to local
v_mfma_f32_16x16x32_fp8_fp8 acc[248:251], v[vgprValuB_X0_I0+60+2+0:vgprValuB_X0_I0+60+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[248:251] // left value = acc[248+0:251+0]
/*  mfmaIndex:127  */
/* LDS write of the fifth staged B quad; byte offset is 4*LSPB = 16640.           */
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+16:vgprG2LB+16+3] offset:16640 // lwoB_0_0_4_0 = (0*LSCB)*(MT1J+PAD) + (4*LSPB) = 16640
v_mfma_f32_16x16x32_fp8_fp8 acc[252:255], v[vgprValuB_X0_I0+60+2+0:vgprValuB_X0_I0+60+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[252:255] // left value = acc[252+0:255+0]
/* numPrefetchIter=0 */
/* dataAtIterA=-1 numReadsIterA=1 skipReadsIterA=1 readsPerIterA=4 */
/* dataAtIterB=-1 numReadsIterB=1 skipReadsIterB=1 readsPerIterB=16 */

/* iter 2 (reset local read pointers iteration)  (swap local read pointers iteration)  */
/*  grEndMfmaIndex:18, lwStartMfmaIndex:35, lwEndMfmaIndex:223  */
/*  numMfmaForLR:30, syncPlrMfmaIndex:225  */
/*  mfmaIndex:128  */
v_mfma_f32_16x16x32_fp8_fp8 acc[0:3], v[vgprValuB_X2_I0+0+0+0:vgprValuB_X2_I0+0+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[0:3] // left value = acc[0+0:3+0]
/*  mfmaIndex:129  */
v_mfma_f32_16x16x32_fp8_fp8 acc[4:7], v[vgprValuB_X2_I0+0+0+0:vgprValuB_X2_I0+0+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[4:7] // left value = acc[4+0:7+0]
/*  mfmaIndex:130  */
v_mfma_f32_16x16x32_fp8_fp8 acc[8:11], v[vgprValuB_X2_I0+0+0+0:vgprValuB_X2_I0+0+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[8:11] // left value = acc[8+0:11+0]
/*  mfmaIndex:131  */
v_mfma_f32_16x16x32_fp8_fp8 acc[12:15], v[vgprValuB_X2_I0+0+0+0:vgprValuB_X2_I0+0+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[12:15] // left value = acc[12+0:15+0]
/*  mfmaIndex:132  */
v_mfma_f32_16x16x32_fp8_fp8 acc[16:19], v[vgprValuB_X2_I0+4+0+0:vgprValuB_X2_I0+4+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[16:19] // left value = acc[16+0:19+0]
/*  mfmaIndex:133  */
v_mfma_f32_16x16x32_fp8_fp8 acc[20:23], v[vgprValuB_X2_I0+4+0+0:vgprValuB_X2_I0+4+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[20:23] // left value = acc[20+0:23+0]
/*  mfmaIndex:134  */
v_mfma_f32_16x16x32_fp8_fp8 acc[24:27], v[vgprValuB_X2_I0+4+0+0:vgprValuB_X2_I0+4+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[24:27] // left value = acc[24+0:27+0]
/*  mfmaIndex:135  */
v_mfma_f32_16x16x32_fp8_fp8 acc[28:31], v[vgprValuB_X2_I0+4+0+0:vgprValuB_X2_I0+4+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[28:31] // left value = acc[28+0:31+0]
/*  mfmaIndex:136  */
v_mfma_f32_16x16x32_fp8_fp8 acc[32:35], v[vgprValuB_X2_I0+8+0+0:vgprValuB_X2_I0+8+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[32:35] // left value = acc[32+0:35+0]
/*  mfmaIndex:137  */
v_mfma_f32_16x16x32_fp8_fp8 acc[36:39], v[vgprValuB_X2_I0+8+0+0:vgprValuB_X2_I0+8+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[36:39] // left value = acc[36+0:39+0]
/*  mfmaIndex:138  */
v_mfma_f32_16x16x32_fp8_fp8 acc[40:43], v[vgprValuB_X2_I0+8+0+0:vgprValuB_X2_I0+8+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[40:43] // left value = acc[40+0:43+0]
/*  mfmaIndex:139  */
v_mfma_f32_16x16x32_fp8_fp8 acc[44:47], v[vgprValuB_X2_I0+8+0+0:vgprValuB_X2_I0+8+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[44:47] // left value = acc[44+0:47+0]
/*  mfmaIndex:140  */
v_mfma_f32_16x16x32_fp8_fp8 acc[48:51], v[vgprValuB_X2_I0+12+0+0:vgprValuB_X2_I0+12+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[48:51] // left value = acc[48+0:51+0]
/*  mfmaIndex:141  */
v_mfma_f32_16x16x32_fp8_fp8 acc[52:55], v[vgprValuB_X2_I0+12+0+0:vgprValuB_X2_I0+12+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[52:55] // left value = acc[52+0:55+0]
/*  mfmaIndex:142  */
s_waitcnt vmcnt(11)                                // wait for global read before writing to local
v_mfma_f32_16x16x32_fp8_fp8 acc[56:59], v[vgprValuB_X2_I0+12+0+0:vgprValuB_X2_I0+12+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[56:59] // left value = acc[56+0:59+0]
/*  mfmaIndex:143  */
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+20:vgprG2LB+20+3] offset:20800 // lwoB_0_0_5_0 = (0*LSCB)*(MT1J+PAD) + (5*LSPB) = 20800
v_mfma_f32_16x16x32_fp8_fp8 acc[60:63], v[vgprValuB_X2_I0+12+0+0:vgprValuB_X2_I0+12+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[60:63] // left value = acc[60+0:63+0]
/*  mfmaIndex:144  */
v_mfma_f32_16x16x32_fp8_fp8 acc[64:67], v[vgprValuB_X2_I0+16+0+0:vgprValuB_X2_I0+16+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[64:67] // left value = acc[64+0:67+0]
/*  mfmaIndex:145  */
v_mfma_f32_16x16x32_fp8_fp8 acc[68:71], v[vgprValuB_X2_I0+16+0+0:vgprValuB_X2_I0+16+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[68:71] // left value = acc[68+0:71+0]
/*  mfmaIndex:146  */
v_mfma_f32_16x16x32_fp8_fp8 acc[72:75], v[vgprValuB_X2_I0+16+0+0:vgprValuB_X2_I0+16+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[72:75] // left value = acc[72+0:75+0]
/*  mfmaIndex:147  */
/* sched write - iter 2 writesPerItem=1 */
v_mfma_f32_16x16x32_fp8_fp8 acc[76:79], v[vgprValuB_X2_I0+16+0+0:vgprValuB_X2_I0+16+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[76:79] // left value = acc[76+0:79+0]
/*  mfmaIndex:148  */
v_mfma_f32_16x16x32_fp8_fp8 acc[80:83], v[vgprValuB_X2_I0+20+0+0:vgprValuB_X2_I0+20+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[80:83] // left value = acc[80+0:83+0]
/*  mfmaIndex:149  */
v_mfma_f32_16x16x32_fp8_fp8 acc[84:87], v[vgprValuB_X2_I0+20+0+0:vgprValuB_X2_I0+20+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[84:87] // left value = acc[84+0:87+0]
/*  mfmaIndex:150  */
v_mfma_f32_16x16x32_fp8_fp8 acc[88:91], v[vgprValuB_X2_I0+20+0+0:vgprValuB_X2_I0+20+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[88:91] // left value = acc[88+0:91+0]
/*  mfmaIndex:151  */
v_mfma_f32_16x16x32_fp8_fp8 acc[92:95], v[vgprValuB_X2_I0+20+0+0:vgprValuB_X2_I0+20+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[92:95] // left value = acc[92+0:95+0]
/*  mfmaIndex:152  */
v_mfma_f32_16x16x32_fp8_fp8 acc[96:99], v[vgprValuB_X2_I0+24+0+0:vgprValuB_X2_I0+24+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[96:99] // left value = acc[96+0:99+0]
/*  mfmaIndex:153  */
v_mfma_f32_16x16x32_fp8_fp8 acc[100:103], v[vgprValuB_X2_I0+24+0+0:vgprValuB_X2_I0+24+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[100:103] // left value = acc[100+0:103+0]
/*  mfmaIndex:154  */
v_mfma_f32_16x16x32_fp8_fp8 acc[104:107], v[vgprValuB_X2_I0+24+0+0:vgprValuB_X2_I0+24+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[104:107] // left value = acc[104+0:107+0]
/*  mfmaIndex:155  */
v_mfma_f32_16x16x32_fp8_fp8 acc[108:111], v[vgprValuB_X2_I0+24+0+0:vgprValuB_X2_I0+24+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[108:111] // left value = acc[108+0:111+0]
/*  mfmaIndex:156  */
v_mfma_f32_16x16x32_fp8_fp8 acc[112:115], v[vgprValuB_X2_I0+28+0+0:vgprValuB_X2_I0+28+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[112:115] // left value = acc[112+0:115+0]
/*  mfmaIndex:157  */
v_mfma_f32_16x16x32_fp8_fp8 acc[116:119], v[vgprValuB_X2_I0+28+0+0:vgprValuB_X2_I0+28+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[116:119] // left value = acc[116+0:119+0]
/*  mfmaIndex:158  */
s_waitcnt vmcnt(11)                                // wait for global read before writing to local
v_mfma_f32_16x16x32_fp8_fp8 acc[120:123], v[vgprValuB_X2_I0+28+0+0:vgprValuB_X2_I0+28+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[120:123] // left value = acc[120+0:123+0]
/*  mfmaIndex:159  */
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+24:vgprG2LB+24+3] offset:24960 // lwoB_0_0_6_0 = (0*LSCB)*(MT1J+PAD) + (6*LSPB) = 24960
v_mfma_f32_16x16x32_fp8_fp8 acc[124:127], v[vgprValuB_X2_I0+28+0+0:vgprValuB_X2_I0+28+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[124:127] // left value = acc[124+0:127+0]
/*  mfmaIndex:160  */
v_mfma_f32_16x16x32_fp8_fp8 acc[128:131], v[vgprValuB_X2_I0+32+0+0:vgprValuB_X2_I0+32+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[128:131] // left value = acc[128+0:131+0]
/*  mfmaIndex:161  */
v_mfma_f32_16x16x32_fp8_fp8 acc[132:135], v[vgprValuB_X2_I0+32+0+0:vgprValuB_X2_I0+32+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[132:135] // left value = acc[132+0:135+0]
/*  mfmaIndex:162  */
v_mfma_f32_16x16x32_fp8_fp8 acc[136:139], v[vgprValuB_X2_I0+32+0+0:vgprValuB_X2_I0+32+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[136:139] // left value = acc[136+0:139+0]
/*  mfmaIndex:163  */
v_mfma_f32_16x16x32_fp8_fp8 acc[140:143], v[vgprValuB_X2_I0+32+0+0:vgprValuB_X2_I0+32+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[140:143] // left value = acc[140+0:143+0]
/*  mfmaIndex:164  */
v_mfma_f32_16x16x32_fp8_fp8 acc[144:147], v[vgprValuB_X2_I0+36+0+0:vgprValuB_X2_I0+36+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[144:147] // left value = acc[144+0:147+0]
/*  mfmaIndex:165  */
v_mfma_f32_16x16x32_fp8_fp8 acc[148:151], v[vgprValuB_X2_I0+36+0+0:vgprValuB_X2_I0+36+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[148:151] // left value = acc[148+0:151+0]
/*  mfmaIndex:166  */
v_mfma_f32_16x16x32_fp8_fp8 acc[152:155], v[vgprValuB_X2_I0+36+0+0:vgprValuB_X2_I0+36+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[152:155] // left value = acc[152+0:155+0]
/*  mfmaIndex:167  */
v_mfma_f32_16x16x32_fp8_fp8 acc[156:159], v[vgprValuB_X2_I0+36+0+0:vgprValuB_X2_I0+36+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[156:159] // left value = acc[156+0:159+0]
/*  mfmaIndex:168  */
v_mfma_f32_16x16x32_fp8_fp8 acc[160:163], v[vgprValuB_X2_I0+40+0+0:vgprValuB_X2_I0+40+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[160:163] // left value = acc[160+0:163+0]
/*  mfmaIndex:169  */
v_mfma_f32_16x16x32_fp8_fp8 acc[164:167], v[vgprValuB_X2_I0+40+0+0:vgprValuB_X2_I0+40+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[164:167] // left value = acc[164+0:167+0]
/*  mfmaIndex:170  */
v_mfma_f32_16x16x32_fp8_fp8 acc[168:171], v[vgprValuB_X2_I0+40+0+0:vgprValuB_X2_I0+40+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[168:171] // left value = acc[168+0:171+0]
/*  mfmaIndex:171  */
v_mfma_f32_16x16x32_fp8_fp8 acc[172:175], v[vgprValuB_X2_I0+40+0+0:vgprValuB_X2_I0+40+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[172:175] // left value = acc[172+0:175+0]
/*  mfmaIndex:172  */
v_mfma_f32_16x16x32_fp8_fp8 acc[176:179], v[vgprValuB_X2_I0+44+0+0:vgprValuB_X2_I0+44+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[176:179] // left value = acc[176+0:179+0]
/*  mfmaIndex:173  */
v_mfma_f32_16x16x32_fp8_fp8 acc[180:183], v[vgprValuB_X2_I0+44+0+0:vgprValuB_X2_I0+44+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[180:183] // left value = acc[180+0:183+0]
/*  mfmaIndex:174  */
// Global-read -> LDS hand-off for B. Stall until at most 11 vector-memory
// operations are still outstanding, then store the four staged dwords
// vgprG2LB+28..+31 to LDS at byte offset 29120 from vgprLocalWriteAddrB.
// NOTE(review): the loads that fill vgprG2LB are issued outside this part of
// the file; presumably 11 is the count that guarantees the load feeding
// vgprG2LB+28 has returned -- confirm if the global-read schedule changes.
s_waitcnt vmcnt(11)                                // wait for global read before writing to local
// One MFMA is issued between the wait and the LDS write.
v_mfma_f32_16x16x32_fp8_fp8 acc[184:187], v[vgprValuB_X2_I0+44+0+0:vgprValuB_X2_I0+44+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[184:187] // left value = acc[184+0:187+0]
/*  mfmaIndex:175  */
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+28:vgprG2LB+28+3] offset:29120 // lwoB_0_0_7_0 = (0*LSCB)*(MT1J+PAD) + (7*LSPB) = 29120
v_mfma_f32_16x16x32_fp8_fp8 acc[188:191], v[vgprValuB_X2_I0+44+0+0:vgprValuB_X2_I0+44+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[188:191] // left value = acc[188+0:191+0]
/*  mfmaIndex:176  */
v_mfma_f32_16x16x32_fp8_fp8 acc[192:195], v[vgprValuB_X2_I0+48+0+0:vgprValuB_X2_I0+48+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[192:195] // left value = acc[192+0:195+0]
/*  mfmaIndex:177  */
v_mfma_f32_16x16x32_fp8_fp8 acc[196:199], v[vgprValuB_X2_I0+48+0+0:vgprValuB_X2_I0+48+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[196:199] // left value = acc[196+0:199+0]
/*  mfmaIndex:178  */
v_mfma_f32_16x16x32_fp8_fp8 acc[200:203], v[vgprValuB_X2_I0+48+0+0:vgprValuB_X2_I0+48+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[200:203] // left value = acc[200+0:203+0]
/*  mfmaIndex:179  */
v_mfma_f32_16x16x32_fp8_fp8 acc[204:207], v[vgprValuB_X2_I0+48+0+0:vgprValuB_X2_I0+48+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[204:207] // left value = acc[204+0:207+0]
/*  mfmaIndex:180  */
v_mfma_f32_16x16x32_fp8_fp8 acc[208:211], v[vgprValuB_X2_I0+52+0+0:vgprValuB_X2_I0+52+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[208:211] // left value = acc[208+0:211+0]
/*  mfmaIndex:181  */
v_mfma_f32_16x16x32_fp8_fp8 acc[212:215], v[vgprValuB_X2_I0+52+0+0:vgprValuB_X2_I0+52+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[212:215] // left value = acc[212+0:215+0]
/*  mfmaIndex:182  */
v_mfma_f32_16x16x32_fp8_fp8 acc[216:219], v[vgprValuB_X2_I0+52+0+0:vgprValuB_X2_I0+52+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[216:219] // left value = acc[216+0:219+0]
/*  mfmaIndex:183  */
v_mfma_f32_16x16x32_fp8_fp8 acc[220:223], v[vgprValuB_X2_I0+52+0+0:vgprValuB_X2_I0+52+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[220:223] // left value = acc[220+0:223+0]
/*  mfmaIndex:184  */
v_mfma_f32_16x16x32_fp8_fp8 acc[224:227], v[vgprValuB_X2_I0+56+0+0:vgprValuB_X2_I0+56+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[224:227] // left value = acc[224+0:227+0]
/*  mfmaIndex:185  */
v_mfma_f32_16x16x32_fp8_fp8 acc[228:231], v[vgprValuB_X2_I0+56+0+0:vgprValuB_X2_I0+56+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[228:231] // left value = acc[228+0:231+0]
/*  mfmaIndex:186  */
v_mfma_f32_16x16x32_fp8_fp8 acc[232:235], v[vgprValuB_X2_I0+56+0+0:vgprValuB_X2_I0+56+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[232:235] // left value = acc[232+0:235+0]
/*  mfmaIndex:187  */
v_mfma_f32_16x16x32_fp8_fp8 acc[236:239], v[vgprValuB_X2_I0+56+0+0:vgprValuB_X2_I0+56+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[236:239] // left value = acc[236+0:239+0]
/*  mfmaIndex:188  */
v_mfma_f32_16x16x32_fp8_fp8 acc[240:243], v[vgprValuB_X2_I0+60+0+0:vgprValuB_X2_I0+60+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[240:243] // left value = acc[240+0:243+0]
/*  mfmaIndex:189  */
v_mfma_f32_16x16x32_fp8_fp8 acc[244:247], v[vgprValuB_X2_I0+60+0+0:vgprValuB_X2_I0+60+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[244:247] // left value = acc[244+0:247+0]
/*  mfmaIndex:190  */
v_mfma_f32_16x16x32_fp8_fp8 acc[248:251], v[vgprValuB_X2_I0+60+0+0:vgprValuB_X2_I0+60+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[248:251] // left value = acc[248+0:251+0]
/*  mfmaIndex:191  */

/* local read swap offsets a */

/* local read swap offsets b */

/* local read init pointers a */

/* localReadInitPointers */

/* local read init pointers b */

/* localReadInitPointers */
v_mfma_f32_16x16x32_fp8_fp8 acc[252:255], v[vgprValuB_X2_I0+60+0+0:vgprValuB_X2_I0+60+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[252:255] // left value = acc[252+0:255+0]
/* numPrefetchIter=0 */
/* dataAtIterA=0 numReadsIterA=1 skipReadsIterA=0 readsPerIterA=4 */
/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=0 readsPerIterB=16 */

/* iter 3 (swap and reset local write pointers iteration)  */
/*  grEndMfmaIndex:18, lwStartMfmaIndex:35, lwEndMfmaIndex:223  */
/*  numMfmaForLR:30, syncPlrMfmaIndex:225  */
/*  mfmaIndex:192  */
v_mfma_f32_16x16x32_fp8_fp8 acc[0:3], v[vgprValuB_X2_I0+0+2+0:vgprValuB_X2_I0+0+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[0:3] // left value = acc[0+0:3+0]
/*  mfmaIndex:193  */
v_mfma_f32_16x16x32_fp8_fp8 acc[4:7], v[vgprValuB_X2_I0+0+2+0:vgprValuB_X2_I0+0+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[4:7] // left value = acc[4+0:7+0]
/*  mfmaIndex:194  */
v_mfma_f32_16x16x32_fp8_fp8 acc[8:11], v[vgprValuB_X2_I0+0+2+0:vgprValuB_X2_I0+0+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[8:11] // left value = acc[8+0:11+0]
/*  mfmaIndex:195  */
v_mfma_f32_16x16x32_fp8_fp8 acc[12:15], v[vgprValuB_X2_I0+0+2+0:vgprValuB_X2_I0+0+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[12:15] // left value = acc[12+0:15+0]
/*  mfmaIndex:196  */
v_mfma_f32_16x16x32_fp8_fp8 acc[16:19], v[vgprValuB_X2_I0+4+2+0:vgprValuB_X2_I0+4+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[16:19] // left value = acc[16+0:19+0]
/*  mfmaIndex:197  */
v_mfma_f32_16x16x32_fp8_fp8 acc[20:23], v[vgprValuB_X2_I0+4+2+0:vgprValuB_X2_I0+4+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[20:23] // left value = acc[20+0:23+0]
/*  mfmaIndex:198  */
v_mfma_f32_16x16x32_fp8_fp8 acc[24:27], v[vgprValuB_X2_I0+4+2+0:vgprValuB_X2_I0+4+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[24:27] // left value = acc[24+0:27+0]
/*  mfmaIndex:199  */
v_mfma_f32_16x16x32_fp8_fp8 acc[28:31], v[vgprValuB_X2_I0+4+2+0:vgprValuB_X2_I0+4+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[28:31] // left value = acc[28+0:31+0]
/*  mfmaIndex:200  */
v_mfma_f32_16x16x32_fp8_fp8 acc[32:35], v[vgprValuB_X2_I0+8+2+0:vgprValuB_X2_I0+8+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[32:35] // left value = acc[32+0:35+0]
/*  mfmaIndex:201  */
v_mfma_f32_16x16x32_fp8_fp8 acc[36:39], v[vgprValuB_X2_I0+8+2+0:vgprValuB_X2_I0+8+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[36:39] // left value = acc[36+0:39+0]
/*  mfmaIndex:202  */
v_mfma_f32_16x16x32_fp8_fp8 acc[40:43], v[vgprValuB_X2_I0+8+2+0:vgprValuB_X2_I0+8+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[40:43] // left value = acc[40+0:43+0]
/*  mfmaIndex:203  */
v_mfma_f32_16x16x32_fp8_fp8 acc[44:47], v[vgprValuB_X2_I0+8+2+0:vgprValuB_X2_I0+8+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[44:47] // left value = acc[44+0:47+0]
/*  mfmaIndex:204  */
v_mfma_f32_16x16x32_fp8_fp8 acc[48:51], v[vgprValuB_X2_I0+12+2+0:vgprValuB_X2_I0+12+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[48:51] // left value = acc[48+0:51+0]
/*  mfmaIndex:205  */
v_mfma_f32_16x16x32_fp8_fp8 acc[52:55], v[vgprValuB_X2_I0+12+2+0:vgprValuB_X2_I0+12+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[52:55] // left value = acc[52+0:55+0]
/*  mfmaIndex:206  */
v_mfma_f32_16x16x32_fp8_fp8 acc[56:59], v[vgprValuB_X2_I0+12+2+0:vgprValuB_X2_I0+12+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[56:59] // left value = acc[56+0:59+0]
/*  mfmaIndex:207  */
v_mfma_f32_16x16x32_fp8_fp8 acc[60:63], v[vgprValuB_X2_I0+12+2+0:vgprValuB_X2_I0+12+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[60:63] // left value = acc[60+0:63+0]
/*  mfmaIndex:208  */
v_mfma_f32_16x16x32_fp8_fp8 acc[64:67], v[vgprValuB_X2_I0+16+2+0:vgprValuB_X2_I0+16+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[64:67] // left value = acc[64+0:67+0]
/*  mfmaIndex:209  */
v_mfma_f32_16x16x32_fp8_fp8 acc[68:71], v[vgprValuB_X2_I0+16+2+0:vgprValuB_X2_I0+16+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[68:71] // left value = acc[68+0:71+0]
/*  mfmaIndex:210  */
v_mfma_f32_16x16x32_fp8_fp8 acc[72:75], v[vgprValuB_X2_I0+16+2+0:vgprValuB_X2_I0+16+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[72:75] // left value = acc[72+0:75+0]
/*  mfmaIndex:211  */
v_mfma_f32_16x16x32_fp8_fp8 acc[76:79], v[vgprValuB_X2_I0+16+2+0:vgprValuB_X2_I0+16+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[76:79] // left value = acc[76+0:79+0]
/*  mfmaIndex:212  */
v_mfma_f32_16x16x32_fp8_fp8 acc[80:83], v[vgprValuB_X2_I0+20+2+0:vgprValuB_X2_I0+20+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[80:83] // left value = acc[80+0:83+0]
/*  mfmaIndex:213  */
v_mfma_f32_16x16x32_fp8_fp8 acc[84:87], v[vgprValuB_X2_I0+20+2+0:vgprValuB_X2_I0+20+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[84:87] // left value = acc[84+0:87+0]
/*  mfmaIndex:214  */
v_mfma_f32_16x16x32_fp8_fp8 acc[88:91], v[vgprValuB_X2_I0+20+2+0:vgprValuB_X2_I0+20+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[88:91] // left value = acc[88+0:91+0]
/*  mfmaIndex:215  */
v_mfma_f32_16x16x32_fp8_fp8 acc[92:95], v[vgprValuB_X2_I0+20+2+0:vgprValuB_X2_I0+20+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[92:95] // left value = acc[92+0:95+0]
/*  mfmaIndex:216  */
v_mfma_f32_16x16x32_fp8_fp8 acc[96:99], v[vgprValuB_X2_I0+24+2+0:vgprValuB_X2_I0+24+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[96:99] // left value = acc[96+0:99+0]
/*  mfmaIndex:217  */
v_mfma_f32_16x16x32_fp8_fp8 acc[100:103], v[vgprValuB_X2_I0+24+2+0:vgprValuB_X2_I0+24+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[100:103] // left value = acc[100+0:103+0]
/*  mfmaIndex:218  */
v_mfma_f32_16x16x32_fp8_fp8 acc[104:107], v[vgprValuB_X2_I0+24+2+0:vgprValuB_X2_I0+24+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[104:107] // left value = acc[104+0:107+0]
/*  mfmaIndex:219  */
v_mfma_f32_16x16x32_fp8_fp8 acc[108:111], v[vgprValuB_X2_I0+24+2+0:vgprValuB_X2_I0+24+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[108:111] // left value = acc[108+0:111+0]
/*  mfmaIndex:220  */
v_mfma_f32_16x16x32_fp8_fp8 acc[112:115], v[vgprValuB_X2_I0+28+2+0:vgprValuB_X2_I0+28+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[112:115] // left value = acc[112+0:115+0]
/*  mfmaIndex:221  */
v_mfma_f32_16x16x32_fp8_fp8 acc[116:119], v[vgprValuB_X2_I0+28+2+0:vgprValuB_X2_I0+28+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[116:119] // left value = acc[116+0:119+0]
/*  mfmaIndex:222  */

/* local write swap offsets a */

/* local write swap offsets b */
// Workgroup sync point between the LDS writes and the LDS reads of B.
// lgkmcnt(0) drains this wave's outstanding LDS (lgkm-counted) operations,
// and s_barrier then holds the wave until the other waves of the workgroup
// reach the same point. The ds_read_b128 instructions that follow load
// vgprValuB_X0_I0 from LDS, so they must not be issued before the earlier
// ds_write_b128 stores have completed.
s_waitcnt lgkmcnt(0)                               // 3wait for local write
// Skip force waitcnt0
s_barrier
v_mfma_f32_16x16x32_fp8_fp8 acc[120:123], v[vgprValuB_X2_I0+28+2+0:vgprValuB_X2_I0+28+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[120:123] // left value = acc[120+0:123+0]
/*  mfmaIndex:223  */
ds_read_b128 v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprLocalReadAddrB] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[124:127], v[vgprValuB_X2_I0+28+2+0:vgprValuB_X2_I0+28+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[124:127] // left value = acc[124+0:127+0]
/*  mfmaIndex:224  */
v_mfma_f32_16x16x32_fp8_fp8 acc[128:131], v[vgprValuB_X2_I0+32+2+0:vgprValuB_X2_I0+32+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[128:131] // left value = acc[128+0:131+0]
/*  mfmaIndex:225  */
ds_read_b128 v[vgprValuB_X0_I0+4:vgprValuB_X0_I0+4+3], v[vgprLocalReadAddrB] offset:128 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[132:135], v[vgprValuB_X2_I0+32+2+0:vgprValuB_X2_I0+32+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[132:135] // left value = acc[132+0:135+0]
/*  mfmaIndex:226  */
v_mfma_f32_16x16x32_fp8_fp8 acc[136:139], v[vgprValuB_X2_I0+32+2+0:vgprValuB_X2_I0+32+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[136:139] // left value = acc[136+0:139+0]
/*  mfmaIndex:227  */
ds_read_b128 v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprLocalReadAddrB] offset:256 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[140:143], v[vgprValuB_X2_I0+32+2+0:vgprValuB_X2_I0+32+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[140:143] // left value = acc[140+0:143+0]
/*  mfmaIndex:228  */
v_mfma_f32_16x16x32_fp8_fp8 acc[144:147], v[vgprValuB_X2_I0+36+2+0:vgprValuB_X2_I0+36+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[144:147] // left value = acc[144+0:147+0]
/*  mfmaIndex:229  */
ds_read_b128 v[vgprValuB_X0_I0+12:vgprValuB_X0_I0+12+3], v[vgprLocalReadAddrB] offset:384 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[148:151], v[vgprValuB_X2_I0+36+2+0:vgprValuB_X2_I0+36+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[148:151] // left value = acc[148+0:151+0]
/*  mfmaIndex:230  */
v_mfma_f32_16x16x32_fp8_fp8 acc[152:155], v[vgprValuB_X2_I0+36+2+0:vgprValuB_X2_I0+36+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[152:155] // left value = acc[152+0:155+0]
/*  mfmaIndex:231  */
ds_read_b128 v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprLocalReadAddrB] offset:512 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[156:159], v[vgprValuB_X2_I0+36+2+0:vgprValuB_X2_I0+36+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[156:159] // left value = acc[156+0:159+0]
/*  mfmaIndex:232  */
v_mfma_f32_16x16x32_fp8_fp8 acc[160:163], v[vgprValuB_X2_I0+40+2+0:vgprValuB_X2_I0+40+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[160:163] // left value = acc[160+0:163+0]
/*  mfmaIndex:233  */
ds_read_b128 v[vgprValuB_X0_I0+20:vgprValuB_X0_I0+20+3], v[vgprLocalReadAddrB] offset:640 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[164:167], v[vgprValuB_X2_I0+40+2+0:vgprValuB_X2_I0+40+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[164:167] // left value = acc[164+0:167+0]
/*  mfmaIndex:234  */
v_mfma_f32_16x16x32_fp8_fp8 acc[168:171], v[vgprValuB_X2_I0+40+2+0:vgprValuB_X2_I0+40+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[168:171] // left value = acc[168+0:171+0]
/*  mfmaIndex:235  */
ds_read_b128 v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprLocalReadAddrB] offset:768 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[172:175], v[vgprValuB_X2_I0+40+2+0:vgprValuB_X2_I0+40+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[172:175] // left value = acc[172+0:175+0]
/*  mfmaIndex:236  */
v_mfma_f32_16x16x32_fp8_fp8 acc[176:179], v[vgprValuB_X2_I0+44+2+0:vgprValuB_X2_I0+44+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[176:179] // left value = acc[176+0:179+0]
/*  mfmaIndex:237  */
ds_read_b128 v[vgprValuB_X0_I0+28:vgprValuB_X0_I0+28+3], v[vgprLocalReadAddrB] offset:896 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[180:183], v[vgprValuB_X2_I0+44+2+0:vgprValuB_X2_I0+44+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[180:183] // left value = acc[180+0:183+0]
/*  mfmaIndex:238  */
v_mfma_f32_16x16x32_fp8_fp8 acc[184:187], v[vgprValuB_X2_I0+44+2+0:vgprValuB_X2_I0+44+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[184:187] // left value = acc[184+0:187+0]
/*  mfmaIndex:239  */
ds_read_b128 v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprLocalReadAddrB] offset:1024 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=8 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[188:191], v[vgprValuB_X2_I0+44+2+0:vgprValuB_X2_I0+44+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[188:191] // left value = acc[188+0:191+0]
/*  mfmaIndex:240  */
v_mfma_f32_16x16x32_fp8_fp8 acc[192:195], v[vgprValuB_X2_I0+48+2+0:vgprValuB_X2_I0+48+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[192:195] // left value = acc[192+0:195+0]
/*  mfmaIndex:241  */
v_mfma_f32_16x16x32_fp8_fp8 acc[196:199], v[vgprValuB_X2_I0+48+2+0:vgprValuB_X2_I0+48+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[196:199] // left value = acc[196+0:199+0]
/*  mfmaIndex:242  */
ds_read_b128 v[vgprValuB_X0_I0+36:vgprValuB_X0_I0+36+3], v[vgprLocalReadAddrB] offset:1152 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=9 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[200:203], v[vgprValuB_X2_I0+48+2+0:vgprValuB_X2_I0+48+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[200:203] // left value = acc[200+0:203+0]
/*  mfmaIndex:243  */
v_mfma_f32_16x16x32_fp8_fp8 acc[204:207], v[vgprValuB_X2_I0+48+2+0:vgprValuB_X2_I0+48+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[204:207] // left value = acc[204+0:207+0]
/*  mfmaIndex:244  */
ds_read_b128 v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprLocalReadAddrB] offset:1280 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=10 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[208:211], v[vgprValuB_X2_I0+52+2+0:vgprValuB_X2_I0+52+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[208:211] // left value = acc[208+0:211+0]
/*  mfmaIndex:245  */
v_mfma_f32_16x16x32_fp8_fp8 acc[212:215], v[vgprValuB_X2_I0+52+2+0:vgprValuB_X2_I0+52+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[212:215] // left value = acc[212+0:215+0]
/*  mfmaIndex:246  */
ds_read_b128 v[vgprValuB_X0_I0+44:vgprValuB_X0_I0+44+3], v[vgprLocalReadAddrB] offset:1408 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=11 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[216:219], v[vgprValuB_X2_I0+52+2+0:vgprValuB_X2_I0+52+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[216:219] // left value = acc[216+0:219+0]
/*  mfmaIndex:247  */
v_mfma_f32_16x16x32_fp8_fp8 acc[220:223], v[vgprValuB_X2_I0+52+2+0:vgprValuB_X2_I0+52+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[220:223] // left value = acc[220+0:223+0]
/*  mfmaIndex:248  */
ds_read_b128 v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprLocalReadAddrB] offset:1536 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=12 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[224:227], v[vgprValuB_X2_I0+56+2+0:vgprValuB_X2_I0+56+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[224:227] // left value = acc[224+0:227+0]
/*  mfmaIndex:249  */
v_mfma_f32_16x16x32_fp8_fp8 acc[228:231], v[vgprValuB_X2_I0+56+2+0:vgprValuB_X2_I0+56+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[228:231] // left value = acc[228+0:231+0]
/*  mfmaIndex:250  */
ds_read_b128 v[vgprValuB_X0_I0+52:vgprValuB_X0_I0+52+3], v[vgprLocalReadAddrB] offset:1664 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=13 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[232:235], v[vgprValuB_X2_I0+56+2+0:vgprValuB_X2_I0+56+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[232:235] // left value = acc[232+0:235+0]
/*  mfmaIndex:251  */
v_mfma_f32_16x16x32_fp8_fp8 acc[236:239], v[vgprValuB_X2_I0+56+2+0:vgprValuB_X2_I0+56+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[236:239] // left value = acc[236+0:239+0]
/*  mfmaIndex:252  */
ds_read_b128 v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprLocalReadAddrB] offset:1792 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=14 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[240:243], v[vgprValuB_X2_I0+60+2+0:vgprValuB_X2_I0+60+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[240:243] // left value = acc[240+0:243+0]
/*  mfmaIndex:253  */
v_mfma_f32_16x16x32_fp8_fp8 acc[244:247], v[vgprValuB_X2_I0+60+2+0:vgprValuB_X2_I0+60+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[244:247] // left value = acc[244+0:247+0]
/*  mfmaIndex:254  */
ds_read_b128 v[vgprValuB_X0_I0+60:vgprValuB_X0_I0+60+3], v[vgprLocalReadAddrB] offset:1920 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=15 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[248:251], v[vgprValuB_X2_I0+60+2+0:vgprValuB_X2_I0+60+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[248:251] // left value = acc[248+0:251+0]
/*  mfmaIndex:255  */
v_mfma_f32_16x16x32_fp8_fp8 acc[252:255], v[vgprValuB_X2_I0+60+2+0:vgprValuB_X2_I0+60+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[252:255] // left value = acc[252+0:255+0]
/* numPrefetchIter=1 */
/* dataAtIterA=0 numReadsIterA=1 skipReadsIterA=1 readsPerIterA=4 */
/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=1 readsPerIterB=16 */

// Assembly-time rebinding of the A-operand register aliases: from here on
// vgprValuA_X0_I0 and vgprValuA_X2_I0 evaluate to the "_1" symbols
// (those symbols are defined outside this part of the file).
.set vgprValuA_X0_I0, vgprValuA_X0_I0_1
.set vgprValuA_X2_I0, vgprValuA_X2_I0_1

/******************************************/
/* Ord. NoLoadLoop - Begin                */
/******************************************/
// Wait for every outstanding vector-memory operation to complete before the
// no-load loop body starts.
s_waitcnt vmcnt(0)

// Same two assignments as just above, repeated. Nothing reassigns either
// symbol in between, so this pair does not change their values.
.set vgprValuA_X0_I0, vgprValuA_X0_I0_1
.set vgprValuA_X2_I0, vgprValuA_X2_I0_1

/* iter 0 (last unrolled loop) */
/*  grEndMfmaIndex:0, lwStartMfmaIndex:191, lwEndMfmaIndex:191  */
/*  numMfmaForLR:30, syncPlrMfmaIndex:225  */
/*  mfmaIndex:0  */
// The first MFMA of the no-load loop consumes vgprValuB_X0_I0, which was
// filled by the ds_read_b128 sequence at the end of the preceding iteration.
// lgkmcnt(0) ensures all of those LDS reads have landed before it issues.
s_waitcnt lgkmcnt(0)                               // wait for prior local read local write old=0, new=0 newLW=0 newLR=0
v_mfma_f32_16x16x32_fp8_fp8 acc[0:3], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[0:3] // left value = acc[0+0:3+0]
/*  mfmaIndex:1  */
// Begin refilling the X2 B-operand buffer (buffer=2) from LDS at byte offset 64.
// These reads are interleaved with the MFMAs that are still consuming the X0 buffer.
ds_read_b128 v[vgprValuB_X2_I0+0:vgprValuB_X2_I0+0+3], v[vgprLocalReadAddrB] offset:64 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[4:7], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[4:7] // left value = acc[4+0:7+0]
/*  mfmaIndex:2  */
v_mfma_f32_16x16x32_fp8_fp8 acc[8:11], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[8:11] // left value = acc[8+0:11+0]
/*  mfmaIndex:3  */
v_mfma_f32_16x16x32_fp8_fp8 acc[12:15], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[12:15] // left value = acc[12+0:15+0]
/*  mfmaIndex:4  */
v_mfma_f32_16x16x32_fp8_fp8 acc[16:19], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[16:19] // left value = acc[16+0:19+0]
/*  mfmaIndex:5  */
v_mfma_f32_16x16x32_fp8_fp8 acc[20:23], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[20:23] // left value = acc[20+0:23+0]
/*  mfmaIndex:6  */
v_mfma_f32_16x16x32_fp8_fp8 acc[24:27], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[24:27] // left value = acc[24+0:27+0]
/*  mfmaIndex:7  */
v_mfma_f32_16x16x32_fp8_fp8 acc[28:31], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[28:31] // left value = acc[28+0:31+0]
/*  mfmaIndex:8  */
ds_read_b128 v[vgprValuB_X2_I0+4:vgprValuB_X2_I0+4+3], v[vgprLocalReadAddrB] offset:192 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[32:35], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[32:35] // left value = acc[32+0:35+0]
/*  mfmaIndex:9  */
v_mfma_f32_16x16x32_fp8_fp8 acc[36:39], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[36:39] // left value = acc[36+0:39+0]
/*  mfmaIndex:10  */
v_mfma_f32_16x16x32_fp8_fp8 acc[40:43], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[40:43] // left value = acc[40+0:43+0]
/*  mfmaIndex:11  */
v_mfma_f32_16x16x32_fp8_fp8 acc[44:47], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[44:47] // left value = acc[44+0:47+0]
/*  mfmaIndex:12  */
v_mfma_f32_16x16x32_fp8_fp8 acc[48:51], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[48:51] // left value = acc[48+0:51+0]
/*  mfmaIndex:13  */
v_mfma_f32_16x16x32_fp8_fp8 acc[52:55], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[52:55] // left value = acc[52+0:55+0]
/*  mfmaIndex:14  */
v_mfma_f32_16x16x32_fp8_fp8 acc[56:59], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[56:59] // left value = acc[56+0:59+0]
/*  mfmaIndex:15  */
ds_read_b128 v[vgprValuB_X2_I0+8:vgprValuB_X2_I0+8+3], v[vgprLocalReadAddrB] offset:320 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[60:63], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[60:63] // left value = acc[60+0:63+0]
/*  mfmaIndex:16  */
v_mfma_f32_16x16x32_fp8_fp8 acc[64:67], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[64:67] // left value = acc[64+0:67+0]
/*  mfmaIndex:17  */
v_mfma_f32_16x16x32_fp8_fp8 acc[68:71], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[68:71] // left value = acc[68+0:71+0]
/*  mfmaIndex:18  */
v_mfma_f32_16x16x32_fp8_fp8 acc[72:75], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[72:75] // left value = acc[72+0:75+0]
/*  mfmaIndex:19  */
v_mfma_f32_16x16x32_fp8_fp8 acc[76:79], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[76:79] // left value = acc[76+0:79+0]
/*  mfmaIndex:20  */
v_mfma_f32_16x16x32_fp8_fp8 acc[80:83], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[80:83] // left value = acc[80+0:83+0]
/*  mfmaIndex:21  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[84:87], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[84:87] // left value = acc[84+0:87+0]
/*  mfmaIndex:22  */
/* localReadsVacancy: latencyLeft 2 */
ds_read_b128 v[vgprValuB_X2_I0+12:vgprValuB_X2_I0+12+3], v[vgprLocalReadAddrB] offset:448 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[88:91], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[88:91] // left value = acc[88+0:91+0]
/*  mfmaIndex:23  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[92:95], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[92:95] // left value = acc[92+0:95+0]
/*  mfmaIndex:24  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[96:99], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[96:99] // left value = acc[96+0:99+0]
/*  mfmaIndex:25  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[100:103], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[100:103] // left value = acc[100+0:103+0]
/*  mfmaIndex:26  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[104:107], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[104:107] // left value = acc[104+0:107+0]
/*  mfmaIndex:27  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[108:111], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[108:111] // left value = acc[108+0:111+0]
/*  mfmaIndex:28  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[112:115], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[112:115] // left value = acc[112+0:115+0]
/*  mfmaIndex:29  */
/* localReadsVacancy: latencyLeft 2 */
ds_read_b128 v[vgprValuB_X2_I0+16:vgprValuB_X2_I0+16+3], v[vgprLocalReadAddrB] offset:576 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[116:119], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[116:119] // left value = acc[116+0:119+0]
/*  mfmaIndex:30  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[120:123], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[120:123] // left value = acc[120+0:123+0]
/*  mfmaIndex:31  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[124:127], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[124:127] // left value = acc[124+0:127+0]
/*  mfmaIndex:32  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[128:131], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[128:131] // left value = acc[128+0:131+0]
/*  mfmaIndex:33  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[132:135], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[132:135] // left value = acc[132+0:135+0]
/*  mfmaIndex:34  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[136:139], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[136:139] // left value = acc[136+0:139+0]
/*  mfmaIndex:35  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[140:143], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[140:143] // left value = acc[140+0:143+0]
/*  mfmaIndex:36  */
/* localReadsVacancy: latencyLeft 2 */
ds_read_b128 v[vgprValuB_X2_I0+20:vgprValuB_X2_I0+20+3], v[vgprLocalReadAddrB] offset:704 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[144:147], v[vgprValuB_X0_I0+36+0+0:vgprValuB_X0_I0+36+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[144:147] // left value = acc[144+0:147+0]
/*  mfmaIndex:37  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[148:151], v[vgprValuB_X0_I0+36+0+0:vgprValuB_X0_I0+36+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[148:151] // left value = acc[148+0:151+0]
/*  mfmaIndex:38  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[152:155], v[vgprValuB_X0_I0+36+0+0:vgprValuB_X0_I0+36+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[152:155] // left value = acc[152+0:155+0]
/*  mfmaIndex:39  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[156:159], v[vgprValuB_X0_I0+36+0+0:vgprValuB_X0_I0+36+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[156:159] // left value = acc[156+0:159+0]
/*  mfmaIndex:40  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[160:163], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[160:163] // left value = acc[160+0:163+0]
/*  mfmaIndex:41  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[164:167], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[164:167] // left value = acc[164+0:167+0]
/*  mfmaIndex:42  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[168:171], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[168:171] // left value = acc[168+0:171+0]
/*  mfmaIndex:43  */
/* localReadsVacancy: latencyLeft 2 */
ds_read_b128 v[vgprValuB_X2_I0+24:vgprValuB_X2_I0+24+3], v[vgprLocalReadAddrB] offset:832 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[172:175], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[172:175] // left value = acc[172+0:175+0]
/*  mfmaIndex:44  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[176:179], v[vgprValuB_X0_I0+44+0+0:vgprValuB_X0_I0+44+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[176:179] // left value = acc[176+0:179+0]
/*  mfmaIndex:45  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[180:183], v[vgprValuB_X0_I0+44+0+0:vgprValuB_X0_I0+44+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[180:183] // left value = acc[180+0:183+0]
/*  mfmaIndex:46  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[184:187], v[vgprValuB_X0_I0+44+0+0:vgprValuB_X0_I0+44+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[184:187] // left value = acc[184+0:187+0]
/*  mfmaIndex:47  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[188:191], v[vgprValuB_X0_I0+44+0+0:vgprValuB_X0_I0+44+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[188:191] // left value = acc[188+0:191+0]
/*  mfmaIndex:48  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[192:195], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[192:195] // left value = acc[192+0:195+0]
/*  mfmaIndex:49  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[196:199], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[196:199] // left value = acc[196+0:199+0]
/*  mfmaIndex:50  */
/* localReadsVacancy: latencyLeft 2 */
ds_read_b128 v[vgprValuB_X2_I0+28:vgprValuB_X2_I0+28+3], v[vgprLocalReadAddrB] offset:960 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[200:203], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[200:203] // left value = acc[200+0:203+0]
/*  mfmaIndex:51  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[204:207], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[204:207] // left value = acc[204+0:207+0]
/*  mfmaIndex:52  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[208:211], v[vgprValuB_X0_I0+52+0+0:vgprValuB_X0_I0+52+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[208:211] // left value = acc[208+0:211+0]
/*  mfmaIndex:53  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[212:215], v[vgprValuB_X0_I0+52+0+0:vgprValuB_X0_I0+52+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[212:215] // left value = acc[212+0:215+0]
/*  mfmaIndex:54  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[216:219], v[vgprValuB_X0_I0+52+0+0:vgprValuB_X0_I0+52+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[216:219] // left value = acc[216+0:219+0]
/*  mfmaIndex:55  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[220:223], v[vgprValuB_X0_I0+52+0+0:vgprValuB_X0_I0+52+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[220:223] // left value = acc[220+0:223+0]
/*  mfmaIndex:56  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[224:227], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[224:227] // left value = acc[224+0:227+0]
/*  mfmaIndex:57  */
/* localReadsVacancy: latencyLeft 2 */
ds_read_b128 v[vgprValuB_X2_I0+32:vgprValuB_X2_I0+32+3], v[vgprLocalReadAddrB] offset:1088 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=8 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[228:231], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[228:231] // left value = acc[228+0:231+0]
/*  mfmaIndex:58  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[232:235], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[232:235] // left value = acc[232+0:235+0]
/*  mfmaIndex:59  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[236:239], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[236:239] // left value = acc[236+0:239+0]
/*  mfmaIndex:60  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[240:243], v[vgprValuB_X0_I0+60+0+0:vgprValuB_X0_I0+60+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[240:243] // left value = acc[240+0:243+0]
/*  mfmaIndex:61  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[244:247], v[vgprValuB_X0_I0+60+0+0:vgprValuB_X0_I0+60+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[244:247] // left value = acc[244+0:247+0]
/*  mfmaIndex:62  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[248:251], v[vgprValuB_X0_I0+60+0+0:vgprValuB_X0_I0+60+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[248:251] // left value = acc[248+0:251+0]
/*  mfmaIndex:63  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[252:255], v[vgprValuB_X0_I0+60+0+0:vgprValuB_X0_I0+60+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[252:255] // left value = acc[252+0:255+0]
/* numPrefetchIter=0 */
/* dataAtIterA=-1 numReadsIterA=1 skipReadsIterA=1 readsPerIterA=4 */
/* dataAtIterB=-1 numReadsIterB=1 skipReadsIterB=1 readsPerIterB=16 */

/* iter 1 (last unrolled loop) */
/*
 * Summary of this section (iter 1):
 *  - 64 v_mfma_f32_16x16x32_fp8_fp8 instructions (mfmaIndex 64..127), one per
 *    4-register accumulator group acc[0:3] .. acc[252:255]. Each names the
 *    same acc range as destination and as addend, i.e. accumulates in place.
 *  - Source operands are the X0 value registers at sub-offset +2 (the
 *    two-register slice +2..+3 of each 4-register group). The ValuB group
 *    advances by 4 every four MFMAs (0,4,..,60); the ValuA group cycles
 *    through 0,4,8,12.
 *  - Seven ds_read_b128 loads are interleaved, one ahead of every 7th MFMA
 *    (mfmaIndex 64,71,..,106). They fill vgprValuB_X2_I0+36 .. +63 from
 *    vgprLocalReadAddrB at offsets 1216..1984 in steps of 128.
 *  - No MFMA in this section names an X2 register and no s_waitcnt is issued
 *    here, so completion of these loads has to be waited on by later code
 *    before the X2 values are used.
 */
/*  grEndMfmaIndex:0, lwStartMfmaIndex:191, lwEndMfmaIndex:191  */
/*  numMfmaForLR:30, syncPlrMfmaIndex:225  */
/*  mfmaIndex:64  */
/* localReadsVacancy: latencyLeft 2 */
ds_read_b128 v[vgprValuB_X2_I0+36:vgprValuB_X2_I0+36+3], v[vgprLocalReadAddrB] offset:1216 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=9 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[0:3], v[vgprValuB_X0_I0+0+2+0:vgprValuB_X0_I0+0+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[0:3] // left value = acc[0+0:3+0]
/*  mfmaIndex:65  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[4:7], v[vgprValuB_X0_I0+0+2+0:vgprValuB_X0_I0+0+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[4:7] // left value = acc[4+0:7+0]
/*  mfmaIndex:66  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[8:11], v[vgprValuB_X0_I0+0+2+0:vgprValuB_X0_I0+0+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[8:11] // left value = acc[8+0:11+0]
/*  mfmaIndex:67  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[12:15], v[vgprValuB_X0_I0+0+2+0:vgprValuB_X0_I0+0+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[12:15] // left value = acc[12+0:15+0]
/*  mfmaIndex:68  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[16:19], v[vgprValuB_X0_I0+4+2+0:vgprValuB_X0_I0+4+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[16:19] // left value = acc[16+0:19+0]
/*  mfmaIndex:69  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[20:23], v[vgprValuB_X0_I0+4+2+0:vgprValuB_X0_I0+4+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[20:23] // left value = acc[20+0:23+0]
/*  mfmaIndex:70  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[24:27], v[vgprValuB_X0_I0+4+2+0:vgprValuB_X0_I0+4+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[24:27] // left value = acc[24+0:27+0]
/*  mfmaIndex:71  */
/* localReadsVacancy: latencyLeft 2 */
ds_read_b128 v[vgprValuB_X2_I0+40:vgprValuB_X2_I0+40+3], v[vgprLocalReadAddrB] offset:1344 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=10 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[28:31], v[vgprValuB_X0_I0+4+2+0:vgprValuB_X0_I0+4+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[28:31] // left value = acc[28+0:31+0]
/*  mfmaIndex:72  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[32:35], v[vgprValuB_X0_I0+8+2+0:vgprValuB_X0_I0+8+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[32:35] // left value = acc[32+0:35+0]
/*  mfmaIndex:73  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[36:39], v[vgprValuB_X0_I0+8+2+0:vgprValuB_X0_I0+8+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[36:39] // left value = acc[36+0:39+0]
/*  mfmaIndex:74  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[40:43], v[vgprValuB_X0_I0+8+2+0:vgprValuB_X0_I0+8+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[40:43] // left value = acc[40+0:43+0]
/*  mfmaIndex:75  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[44:47], v[vgprValuB_X0_I0+8+2+0:vgprValuB_X0_I0+8+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[44:47] // left value = acc[44+0:47+0]
/*  mfmaIndex:76  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[48:51], v[vgprValuB_X0_I0+12+2+0:vgprValuB_X0_I0+12+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[48:51] // left value = acc[48+0:51+0]
/*  mfmaIndex:77  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[52:55], v[vgprValuB_X0_I0+12+2+0:vgprValuB_X0_I0+12+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[52:55] // left value = acc[52+0:55+0]
/*  mfmaIndex:78  */
/* localReadsVacancy: latencyLeft 2 */
ds_read_b128 v[vgprValuB_X2_I0+44:vgprValuB_X2_I0+44+3], v[vgprLocalReadAddrB] offset:1472 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=11 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[56:59], v[vgprValuB_X0_I0+12+2+0:vgprValuB_X0_I0+12+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[56:59] // left value = acc[56+0:59+0]
/*  mfmaIndex:79  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[60:63], v[vgprValuB_X0_I0+12+2+0:vgprValuB_X0_I0+12+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[60:63] // left value = acc[60+0:63+0]
/*  mfmaIndex:80  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[64:67], v[vgprValuB_X0_I0+16+2+0:vgprValuB_X0_I0+16+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[64:67] // left value = acc[64+0:67+0]
/*  mfmaIndex:81  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[68:71], v[vgprValuB_X0_I0+16+2+0:vgprValuB_X0_I0+16+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[68:71] // left value = acc[68+0:71+0]
/*  mfmaIndex:82  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[72:75], v[vgprValuB_X0_I0+16+2+0:vgprValuB_X0_I0+16+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[72:75] // left value = acc[72+0:75+0]
/*  mfmaIndex:83  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[76:79], v[vgprValuB_X0_I0+16+2+0:vgprValuB_X0_I0+16+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[76:79] // left value = acc[76+0:79+0]
/*  mfmaIndex:84  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[80:83], v[vgprValuB_X0_I0+20+2+0:vgprValuB_X0_I0+20+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[80:83] // left value = acc[80+0:83+0]
/*  mfmaIndex:85  */
/* localReadsVacancy: latencyLeft 2 */
ds_read_b128 v[vgprValuB_X2_I0+48:vgprValuB_X2_I0+48+3], v[vgprLocalReadAddrB] offset:1600 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=12 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[84:87], v[vgprValuB_X0_I0+20+2+0:vgprValuB_X0_I0+20+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[84:87] // left value = acc[84+0:87+0]
/*  mfmaIndex:86  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[88:91], v[vgprValuB_X0_I0+20+2+0:vgprValuB_X0_I0+20+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[88:91] // left value = acc[88+0:91+0]
/*  mfmaIndex:87  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[92:95], v[vgprValuB_X0_I0+20+2+0:vgprValuB_X0_I0+20+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[92:95] // left value = acc[92+0:95+0]
/*  mfmaIndex:88  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[96:99], v[vgprValuB_X0_I0+24+2+0:vgprValuB_X0_I0+24+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[96:99] // left value = acc[96+0:99+0]
/*  mfmaIndex:89  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[100:103], v[vgprValuB_X0_I0+24+2+0:vgprValuB_X0_I0+24+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[100:103] // left value = acc[100+0:103+0]
/*  mfmaIndex:90  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[104:107], v[vgprValuB_X0_I0+24+2+0:vgprValuB_X0_I0+24+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[104:107] // left value = acc[104+0:107+0]
/*  mfmaIndex:91  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[108:111], v[vgprValuB_X0_I0+24+2+0:vgprValuB_X0_I0+24+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[108:111] // left value = acc[108+0:111+0]
/*  mfmaIndex:92  */
/* localReadsVacancy: latencyLeft 2 */
ds_read_b128 v[vgprValuB_X2_I0+52:vgprValuB_X2_I0+52+3], v[vgprLocalReadAddrB] offset:1728 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=13 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[112:115], v[vgprValuB_X0_I0+28+2+0:vgprValuB_X0_I0+28+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[112:115] // left value = acc[112+0:115+0]
/*  mfmaIndex:93  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[116:119], v[vgprValuB_X0_I0+28+2+0:vgprValuB_X0_I0+28+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[116:119] // left value = acc[116+0:119+0]
/*  mfmaIndex:94  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[120:123], v[vgprValuB_X0_I0+28+2+0:vgprValuB_X0_I0+28+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[120:123] // left value = acc[120+0:123+0]
/*  mfmaIndex:95  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[124:127], v[vgprValuB_X0_I0+28+2+0:vgprValuB_X0_I0+28+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[124:127] // left value = acc[124+0:127+0]
/*  mfmaIndex:96  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[128:131], v[vgprValuB_X0_I0+32+2+0:vgprValuB_X0_I0+32+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[128:131] // left value = acc[128+0:131+0]
/*  mfmaIndex:97  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[132:135], v[vgprValuB_X0_I0+32+2+0:vgprValuB_X0_I0+32+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[132:135] // left value = acc[132+0:135+0]
/*  mfmaIndex:98  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[136:139], v[vgprValuB_X0_I0+32+2+0:vgprValuB_X0_I0+32+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[136:139] // left value = acc[136+0:139+0]
/*  mfmaIndex:99  */
/* localReadsVacancy: latencyLeft 2 */
ds_read_b128 v[vgprValuB_X2_I0+56:vgprValuB_X2_I0+56+3], v[vgprLocalReadAddrB] offset:1856 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=14 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[140:143], v[vgprValuB_X0_I0+32+2+0:vgprValuB_X0_I0+32+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[140:143] // left value = acc[140+0:143+0]
/*  mfmaIndex:100  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[144:147], v[vgprValuB_X0_I0+36+2+0:vgprValuB_X0_I0+36+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[144:147] // left value = acc[144+0:147+0]
/*  mfmaIndex:101  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[148:151], v[vgprValuB_X0_I0+36+2+0:vgprValuB_X0_I0+36+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[148:151] // left value = acc[148+0:151+0]
/*  mfmaIndex:102  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[152:155], v[vgprValuB_X0_I0+36+2+0:vgprValuB_X0_I0+36+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[152:155] // left value = acc[152+0:155+0]
/*  mfmaIndex:103  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[156:159], v[vgprValuB_X0_I0+36+2+0:vgprValuB_X0_I0+36+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[156:159] // left value = acc[156+0:159+0]
/*  mfmaIndex:104  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[160:163], v[vgprValuB_X0_I0+40+2+0:vgprValuB_X0_I0+40+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[160:163] // left value = acc[160+0:163+0]
/*  mfmaIndex:105  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[164:167], v[vgprValuB_X0_I0+40+2+0:vgprValuB_X0_I0+40+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[164:167] // left value = acc[164+0:167+0]
/*  mfmaIndex:106  */
/* localReadsVacancy: latencyLeft 2 */
ds_read_b128 v[vgprValuB_X2_I0+60:vgprValuB_X2_I0+60+3], v[vgprLocalReadAddrB] offset:1984 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=15 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[168:171], v[vgprValuB_X0_I0+40+2+0:vgprValuB_X0_I0+40+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[168:171] // left value = acc[168+0:171+0]
/*  mfmaIndex:107  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[172:175], v[vgprValuB_X0_I0+40+2+0:vgprValuB_X0_I0+40+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[172:175] // left value = acc[172+0:175+0]
/*  mfmaIndex:108  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[176:179], v[vgprValuB_X0_I0+44+2+0:vgprValuB_X0_I0+44+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[176:179] // left value = acc[176+0:179+0]
/*  mfmaIndex:109  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[180:183], v[vgprValuB_X0_I0+44+2+0:vgprValuB_X0_I0+44+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[180:183] // left value = acc[180+0:183+0]
/*  mfmaIndex:110  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[184:187], v[vgprValuB_X0_I0+44+2+0:vgprValuB_X0_I0+44+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[184:187] // left value = acc[184+0:187+0]
/*  mfmaIndex:111  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[188:191], v[vgprValuB_X0_I0+44+2+0:vgprValuB_X0_I0+44+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[188:191] // left value = acc[188+0:191+0]
/*  mfmaIndex:112  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[192:195], v[vgprValuB_X0_I0+48+2+0:vgprValuB_X0_I0+48+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[192:195] // left value = acc[192+0:195+0]
/*  mfmaIndex:113  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[196:199], v[vgprValuB_X0_I0+48+2+0:vgprValuB_X0_I0+48+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[196:199] // left value = acc[196+0:199+0]
/*  mfmaIndex:114  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[200:203], v[vgprValuB_X0_I0+48+2+0:vgprValuB_X0_I0+48+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[200:203] // left value = acc[200+0:203+0]
/*  mfmaIndex:115  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[204:207], v[vgprValuB_X0_I0+48+2+0:vgprValuB_X0_I0+48+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[204:207] // left value = acc[204+0:207+0]
/*  mfmaIndex:116  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[208:211], v[vgprValuB_X0_I0+52+2+0:vgprValuB_X0_I0+52+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[208:211] // left value = acc[208+0:211+0]
/*  mfmaIndex:117  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[212:215], v[vgprValuB_X0_I0+52+2+0:vgprValuB_X0_I0+52+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[212:215] // left value = acc[212+0:215+0]
/*  mfmaIndex:118  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[216:219], v[vgprValuB_X0_I0+52+2+0:vgprValuB_X0_I0+52+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[216:219] // left value = acc[216+0:219+0]
/*  mfmaIndex:119  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[220:223], v[vgprValuB_X0_I0+52+2+0:vgprValuB_X0_I0+52+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[220:223] // left value = acc[220+0:223+0]
/*  mfmaIndex:120  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[224:227], v[vgprValuB_X0_I0+56+2+0:vgprValuB_X0_I0+56+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[224:227] // left value = acc[224+0:227+0]
/*  mfmaIndex:121  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[228:231], v[vgprValuB_X0_I0+56+2+0:vgprValuB_X0_I0+56+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[228:231] // left value = acc[228+0:231+0]
/*  mfmaIndex:122  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[232:235], v[vgprValuB_X0_I0+56+2+0:vgprValuB_X0_I0+56+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[232:235] // left value = acc[232+0:235+0]
/*  mfmaIndex:123  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[236:239], v[vgprValuB_X0_I0+56+2+0:vgprValuB_X0_I0+56+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[236:239] // left value = acc[236+0:239+0]
/*  mfmaIndex:124  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[240:243], v[vgprValuB_X0_I0+60+2+0:vgprValuB_X0_I0+60+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[240:243] // left value = acc[240+0:243+0]
/*  mfmaIndex:125  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[244:247], v[vgprValuB_X0_I0+60+2+0:vgprValuB_X0_I0+60+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[244:247] // left value = acc[244+0:247+0]
/*  mfmaIndex:126  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[248:251], v[vgprValuB_X0_I0+60+2+0:vgprValuB_X0_I0+60+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[248:251] // left value = acc[248+0:251+0]
/*  mfmaIndex:127  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[252:255], v[vgprValuB_X0_I0+60+2+0:vgprValuB_X0_I0+60+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[252:255] // left value = acc[252+0:255+0]
/* numPrefetchIter=0 */
/* dataAtIterA=-1 numReadsIterA=1 skipReadsIterA=1 readsPerIterA=4 */
/* dataAtIterB=-1 numReadsIterB=1 skipReadsIterB=1 readsPerIterB=16 */

/* iter 2 (last unrolled loop) */
/*  grEndMfmaIndex:0, lwStartMfmaIndex:191, lwEndMfmaIndex:191  */
/*  numMfmaForLR:30, syncPlrMfmaIndex:225  */
/*  mfmaIndex:128  */
/* localReadsVacancy: latencyLeft 2 */
s_waitcnt lgkmcnt(0)                               // wait for prior local read local write old=0, new=0 newLW=0 newLR=0
v_mfma_f32_16x16x32_fp8_fp8 acc[0:3], v[vgprValuB_X2_I0+0+0+0:vgprValuB_X2_I0+0+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[0:3] // left value = acc[0+0:3+0]
/*  mfmaIndex:129  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[4:7], v[vgprValuB_X2_I0+0+0+0:vgprValuB_X2_I0+0+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[4:7] // left value = acc[4+0:7+0]
/*  mfmaIndex:130  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[8:11], v[vgprValuB_X2_I0+0+0+0:vgprValuB_X2_I0+0+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[8:11] // left value = acc[8+0:11+0]
/*  mfmaIndex:131  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[12:15], v[vgprValuB_X2_I0+0+0+0:vgprValuB_X2_I0+0+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[12:15] // left value = acc[12+0:15+0]
/*  mfmaIndex:132  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[16:19], v[vgprValuB_X2_I0+4+0+0:vgprValuB_X2_I0+4+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[16:19] // left value = acc[16+0:19+0]
/*  mfmaIndex:133  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[20:23], v[vgprValuB_X2_I0+4+0+0:vgprValuB_X2_I0+4+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[20:23] // left value = acc[20+0:23+0]
/*  mfmaIndex:134  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[24:27], v[vgprValuB_X2_I0+4+0+0:vgprValuB_X2_I0+4+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[24:27] // left value = acc[24+0:27+0]
/*  mfmaIndex:135  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[28:31], v[vgprValuB_X2_I0+4+0+0:vgprValuB_X2_I0+4+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[28:31] // left value = acc[28+0:31+0]
/*  mfmaIndex:136  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[32:35], v[vgprValuB_X2_I0+8+0+0:vgprValuB_X2_I0+8+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[32:35] // left value = acc[32+0:35+0]
/*  mfmaIndex:137  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[36:39], v[vgprValuB_X2_I0+8+0+0:vgprValuB_X2_I0+8+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[36:39] // left value = acc[36+0:39+0]
/*  mfmaIndex:138  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[40:43], v[vgprValuB_X2_I0+8+0+0:vgprValuB_X2_I0+8+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[40:43] // left value = acc[40+0:43+0]
/*  mfmaIndex:139  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[44:47], v[vgprValuB_X2_I0+8+0+0:vgprValuB_X2_I0+8+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[44:47] // left value = acc[44+0:47+0]
/*  mfmaIndex:140  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[48:51], v[vgprValuB_X2_I0+12+0+0:vgprValuB_X2_I0+12+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[48:51] // left value = acc[48+0:51+0]
/*  mfmaIndex:141  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[52:55], v[vgprValuB_X2_I0+12+0+0:vgprValuB_X2_I0+12+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[52:55] // left value = acc[52+0:55+0]
/*  mfmaIndex:142  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[56:59], v[vgprValuB_X2_I0+12+0+0:vgprValuB_X2_I0+12+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[56:59] // left value = acc[56+0:59+0]
/*  mfmaIndex:143  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[60:63], v[vgprValuB_X2_I0+12+0+0:vgprValuB_X2_I0+12+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[60:63] // left value = acc[60+0:63+0]
/*  mfmaIndex:144  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[64:67], v[vgprValuB_X2_I0+16+0+0:vgprValuB_X2_I0+16+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[64:67] // left value = acc[64+0:67+0]
/*  mfmaIndex:145  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[68:71], v[vgprValuB_X2_I0+16+0+0:vgprValuB_X2_I0+16+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[68:71] // left value = acc[68+0:71+0]
/*  mfmaIndex:146  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[72:75], v[vgprValuB_X2_I0+16+0+0:vgprValuB_X2_I0+16+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[72:75] // left value = acc[72+0:75+0]
/*  mfmaIndex:147  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[76:79], v[vgprValuB_X2_I0+16+0+0:vgprValuB_X2_I0+16+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[76:79] // left value = acc[76+0:79+0]
/*  mfmaIndex:148  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[80:83], v[vgprValuB_X2_I0+20+0+0:vgprValuB_X2_I0+20+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[80:83] // left value = acc[80+0:83+0]
/*  mfmaIndex:149  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[84:87], v[vgprValuB_X2_I0+20+0+0:vgprValuB_X2_I0+20+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[84:87] // left value = acc[84+0:87+0]
/*  mfmaIndex:150  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[88:91], v[vgprValuB_X2_I0+20+0+0:vgprValuB_X2_I0+20+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[88:91] // left value = acc[88+0:91+0]
/*  mfmaIndex:151  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[92:95], v[vgprValuB_X2_I0+20+0+0:vgprValuB_X2_I0+20+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[92:95] // left value = acc[92+0:95+0]
/*  mfmaIndex:152  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[96:99], v[vgprValuB_X2_I0+24+0+0:vgprValuB_X2_I0+24+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[96:99] // left value = acc[96+0:99+0]
/*  mfmaIndex:153  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[100:103], v[vgprValuB_X2_I0+24+0+0:vgprValuB_X2_I0+24+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[100:103] // left value = acc[100+0:103+0]
/*  mfmaIndex:154  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[104:107], v[vgprValuB_X2_I0+24+0+0:vgprValuB_X2_I0+24+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[104:107] // left value = acc[104+0:107+0]
/*  mfmaIndex:155  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[108:111], v[vgprValuB_X2_I0+24+0+0:vgprValuB_X2_I0+24+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[108:111] // left value = acc[108+0:111+0]
/*  mfmaIndex:156  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[112:115], v[vgprValuB_X2_I0+28+0+0:vgprValuB_X2_I0+28+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[112:115] // left value = acc[112+0:115+0]
/*  mfmaIndex:157  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[116:119], v[vgprValuB_X2_I0+28+0+0:vgprValuB_X2_I0+28+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[116:119] // left value = acc[116+0:119+0]
/*  mfmaIndex:158  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[120:123], v[vgprValuB_X2_I0+28+0+0:vgprValuB_X2_I0+28+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[120:123] // left value = acc[120+0:123+0]
/*  mfmaIndex:159  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[124:127], v[vgprValuB_X2_I0+28+0+0:vgprValuB_X2_I0+28+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[124:127] // left value = acc[124+0:127+0]
/*  mfmaIndex:160  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[128:131], v[vgprValuB_X2_I0+32+0+0:vgprValuB_X2_I0+32+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[128:131] // left value = acc[128+0:131+0]
/*  mfmaIndex:161  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[132:135], v[vgprValuB_X2_I0+32+0+0:vgprValuB_X2_I0+32+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[132:135] // left value = acc[132+0:135+0]
/*  mfmaIndex:162  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[136:139], v[vgprValuB_X2_I0+32+0+0:vgprValuB_X2_I0+32+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[136:139] // left value = acc[136+0:139+0]
/*  mfmaIndex:163  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[140:143], v[vgprValuB_X2_I0+32+0+0:vgprValuB_X2_I0+32+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[140:143] // left value = acc[140+0:143+0]
/*  mfmaIndex:164  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[144:147], v[vgprValuB_X2_I0+36+0+0:vgprValuB_X2_I0+36+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[144:147] // left value = acc[144+0:147+0]
/*  mfmaIndex:165  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[148:151], v[vgprValuB_X2_I0+36+0+0:vgprValuB_X2_I0+36+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[148:151] // left value = acc[148+0:151+0]
/*  mfmaIndex:166  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[152:155], v[vgprValuB_X2_I0+36+0+0:vgprValuB_X2_I0+36+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[152:155] // left value = acc[152+0:155+0]
/*  mfmaIndex:167  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[156:159], v[vgprValuB_X2_I0+36+0+0:vgprValuB_X2_I0+36+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[156:159] // left value = acc[156+0:159+0]
/*  mfmaIndex:168  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[160:163], v[vgprValuB_X2_I0+40+0+0:vgprValuB_X2_I0+40+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[160:163] // left value = acc[160+0:163+0]
/*  mfmaIndex:169  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[164:167], v[vgprValuB_X2_I0+40+0+0:vgprValuB_X2_I0+40+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[164:167] // left value = acc[164+0:167+0]
/*  mfmaIndex:170  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[168:171], v[vgprValuB_X2_I0+40+0+0:vgprValuB_X2_I0+40+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[168:171] // left value = acc[168+0:171+0]
/*  mfmaIndex:171  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[172:175], v[vgprValuB_X2_I0+40+0+0:vgprValuB_X2_I0+40+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[172:175] // left value = acc[172+0:175+0]
/*  mfmaIndex:172  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[176:179], v[vgprValuB_X2_I0+44+0+0:vgprValuB_X2_I0+44+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[176:179] // left value = acc[176+0:179+0]
/*  mfmaIndex:173  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[180:183], v[vgprValuB_X2_I0+44+0+0:vgprValuB_X2_I0+44+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[180:183] // left value = acc[180+0:183+0]
/*  mfmaIndex:174  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[184:187], v[vgprValuB_X2_I0+44+0+0:vgprValuB_X2_I0+44+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[184:187] // left value = acc[184+0:187+0]
/*  mfmaIndex:175  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[188:191], v[vgprValuB_X2_I0+44+0+0:vgprValuB_X2_I0+44+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[188:191] // left value = acc[188+0:191+0]
/*  mfmaIndex:176  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[192:195], v[vgprValuB_X2_I0+48+0+0:vgprValuB_X2_I0+48+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[192:195] // left value = acc[192+0:195+0]
/*  mfmaIndex:177  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[196:199], v[vgprValuB_X2_I0+48+0+0:vgprValuB_X2_I0+48+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[196:199] // left value = acc[196+0:199+0]
/*  mfmaIndex:178  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[200:203], v[vgprValuB_X2_I0+48+0+0:vgprValuB_X2_I0+48+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[200:203] // left value = acc[200+0:203+0]
/*  mfmaIndex:179  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[204:207], v[vgprValuB_X2_I0+48+0+0:vgprValuB_X2_I0+48+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[204:207] // left value = acc[204+0:207+0]
/*  mfmaIndex:180  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[208:211], v[vgprValuB_X2_I0+52+0+0:vgprValuB_X2_I0+52+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[208:211] // left value = acc[208+0:211+0]
/*  mfmaIndex:181  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[212:215], v[vgprValuB_X2_I0+52+0+0:vgprValuB_X2_I0+52+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[212:215] // left value = acc[212+0:215+0]
/*  mfmaIndex:182  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[216:219], v[vgprValuB_X2_I0+52+0+0:vgprValuB_X2_I0+52+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[216:219] // left value = acc[216+0:219+0]
/*  mfmaIndex:183  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[220:223], v[vgprValuB_X2_I0+52+0+0:vgprValuB_X2_I0+52+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[220:223] // left value = acc[220+0:223+0]
/*  mfmaIndex:184  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[224:227], v[vgprValuB_X2_I0+56+0+0:vgprValuB_X2_I0+56+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[224:227] // left value = acc[224+0:227+0]
/*  mfmaIndex:185  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[228:231], v[vgprValuB_X2_I0+56+0+0:vgprValuB_X2_I0+56+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[228:231] // left value = acc[228+0:231+0]
/*  mfmaIndex:186  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[232:235], v[vgprValuB_X2_I0+56+0+0:vgprValuB_X2_I0+56+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[232:235] // left value = acc[232+0:235+0]
/*  mfmaIndex:187  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[236:239], v[vgprValuB_X2_I0+56+0+0:vgprValuB_X2_I0+56+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[236:239] // left value = acc[236+0:239+0]
/*  mfmaIndex:188  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[240:243], v[vgprValuB_X2_I0+60+0+0:vgprValuB_X2_I0+60+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[240:243] // left value = acc[240+0:243+0]
/*  mfmaIndex:189  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[244:247], v[vgprValuB_X2_I0+60+0+0:vgprValuB_X2_I0+60+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[244:247] // left value = acc[244+0:247+0]
/*  mfmaIndex:190  */
/* schedule remaining localreads for 1LDSB */
/* localReadsVacancy: latencyLeft 2 */
/* 1 LDS buffer: read-sync-write */
v_mfma_f32_16x16x32_fp8_fp8 acc[248:251], v[vgprValuB_X2_I0+60+0+0:vgprValuB_X2_I0+60+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[248:251] // left value = acc[248+0:251+0]
/*  mfmaIndex:191  */
v_mfma_f32_16x16x32_fp8_fp8 acc[252:255], v[vgprValuB_X2_I0+60+0+0:vgprValuB_X2_I0+60+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[252:255] // left value = acc[252+0:255+0]
/* numPrefetchIter=0 */
/* dataAtIterA=0 numReadsIterA=1 skipReadsIterA=0 readsPerIterA=4 */
/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=0 readsPerIterB=16 */

/* iter 3 (last unrolled loop) */
/*
 * Purpose: final unrolled iteration of this loop body. It issues 64
 * v_mfma_f32_16x16x32_fp8_fp8 instructions (mfmaIndex 192..255) that
 * accumulate in place into acc[0:255], four accumulator registers per MFMA
 * (destination and last source operand are the same acc range).
 *
 * Operand pattern, as written below:
 *   - first source  = v[vgprValuB_X2_I0 + 4*j + 2 : +1],  j = 0..15
 *   - second source = v[vgprValuA_X2_I0 + 4*i + 2 : +1],  i = 0..3
 *   - accumulator   = acc[16*j + 4*i : 16*j + 4*i + 3]
 * j is the outer index and i the inner one, so each B register pair is
 * reused for four consecutive MFMAs before moving to the next.
 *
 * The "+2" term selects the upper two VGPRs of each 4-VGPR group; the
 * MFMAs immediately preceding this section (mfmaIndex <= 191) use the
 * same X2 symbols with "+0", i.e. the lower two VGPRs of each group.
 *
 * No loads, waits or scalar instructions are interleaved in this iteration;
 * every instruction up to the closing s_branch is an MFMA.
 *
 * NOTE(review): the grEndMfmaIndex / lwStartMfmaIndex / numMfmaForLR /
 * syncPlrMfmaIndex lines are annotations emitted with the code, presumably
 * the generator's scheduling bookkeeping -- they are comments only and do
 * not affect assembly.
 */
/*  grEndMfmaIndex:0, lwStartMfmaIndex:191, lwEndMfmaIndex:191  */
/*  numMfmaForLR:30, syncPlrMfmaIndex:225  */
/*  mfmaIndex:192  */
v_mfma_f32_16x16x32_fp8_fp8 acc[0:3], v[vgprValuB_X2_I0+0+2+0:vgprValuB_X2_I0+0+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[0:3] // left value = acc[0+0:3+0]
/*  mfmaIndex:193  */
v_mfma_f32_16x16x32_fp8_fp8 acc[4:7], v[vgprValuB_X2_I0+0+2+0:vgprValuB_X2_I0+0+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[4:7] // left value = acc[4+0:7+0]
/*  mfmaIndex:194  */
v_mfma_f32_16x16x32_fp8_fp8 acc[8:11], v[vgprValuB_X2_I0+0+2+0:vgprValuB_X2_I0+0+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[8:11] // left value = acc[8+0:11+0]
/*  mfmaIndex:195  */
v_mfma_f32_16x16x32_fp8_fp8 acc[12:15], v[vgprValuB_X2_I0+0+2+0:vgprValuB_X2_I0+0+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[12:15] // left value = acc[12+0:15+0]
/*  mfmaIndex:196  */
v_mfma_f32_16x16x32_fp8_fp8 acc[16:19], v[vgprValuB_X2_I0+4+2+0:vgprValuB_X2_I0+4+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[16:19] // left value = acc[16+0:19+0]
/*  mfmaIndex:197  */
v_mfma_f32_16x16x32_fp8_fp8 acc[20:23], v[vgprValuB_X2_I0+4+2+0:vgprValuB_X2_I0+4+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[20:23] // left value = acc[20+0:23+0]
/*  mfmaIndex:198  */
v_mfma_f32_16x16x32_fp8_fp8 acc[24:27], v[vgprValuB_X2_I0+4+2+0:vgprValuB_X2_I0+4+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[24:27] // left value = acc[24+0:27+0]
/*  mfmaIndex:199  */
v_mfma_f32_16x16x32_fp8_fp8 acc[28:31], v[vgprValuB_X2_I0+4+2+0:vgprValuB_X2_I0+4+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[28:31] // left value = acc[28+0:31+0]
/*  mfmaIndex:200  */
v_mfma_f32_16x16x32_fp8_fp8 acc[32:35], v[vgprValuB_X2_I0+8+2+0:vgprValuB_X2_I0+8+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[32:35] // left value = acc[32+0:35+0]
/*  mfmaIndex:201  */
v_mfma_f32_16x16x32_fp8_fp8 acc[36:39], v[vgprValuB_X2_I0+8+2+0:vgprValuB_X2_I0+8+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[36:39] // left value = acc[36+0:39+0]
/*  mfmaIndex:202  */
v_mfma_f32_16x16x32_fp8_fp8 acc[40:43], v[vgprValuB_X2_I0+8+2+0:vgprValuB_X2_I0+8+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[40:43] // left value = acc[40+0:43+0]
/*  mfmaIndex:203  */
v_mfma_f32_16x16x32_fp8_fp8 acc[44:47], v[vgprValuB_X2_I0+8+2+0:vgprValuB_X2_I0+8+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[44:47] // left value = acc[44+0:47+0]
/*  mfmaIndex:204  */
v_mfma_f32_16x16x32_fp8_fp8 acc[48:51], v[vgprValuB_X2_I0+12+2+0:vgprValuB_X2_I0+12+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[48:51] // left value = acc[48+0:51+0]
/*  mfmaIndex:205  */
v_mfma_f32_16x16x32_fp8_fp8 acc[52:55], v[vgprValuB_X2_I0+12+2+0:vgprValuB_X2_I0+12+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[52:55] // left value = acc[52+0:55+0]
/*  mfmaIndex:206  */
v_mfma_f32_16x16x32_fp8_fp8 acc[56:59], v[vgprValuB_X2_I0+12+2+0:vgprValuB_X2_I0+12+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[56:59] // left value = acc[56+0:59+0]
/*  mfmaIndex:207  */
v_mfma_f32_16x16x32_fp8_fp8 acc[60:63], v[vgprValuB_X2_I0+12+2+0:vgprValuB_X2_I0+12+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[60:63] // left value = acc[60+0:63+0]
/*  mfmaIndex:208  */
v_mfma_f32_16x16x32_fp8_fp8 acc[64:67], v[vgprValuB_X2_I0+16+2+0:vgprValuB_X2_I0+16+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[64:67] // left value = acc[64+0:67+0]
/*  mfmaIndex:209  */
v_mfma_f32_16x16x32_fp8_fp8 acc[68:71], v[vgprValuB_X2_I0+16+2+0:vgprValuB_X2_I0+16+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[68:71] // left value = acc[68+0:71+0]
/*  mfmaIndex:210  */
v_mfma_f32_16x16x32_fp8_fp8 acc[72:75], v[vgprValuB_X2_I0+16+2+0:vgprValuB_X2_I0+16+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[72:75] // left value = acc[72+0:75+0]
/*  mfmaIndex:211  */
v_mfma_f32_16x16x32_fp8_fp8 acc[76:79], v[vgprValuB_X2_I0+16+2+0:vgprValuB_X2_I0+16+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[76:79] // left value = acc[76+0:79+0]
/*  mfmaIndex:212  */
v_mfma_f32_16x16x32_fp8_fp8 acc[80:83], v[vgprValuB_X2_I0+20+2+0:vgprValuB_X2_I0+20+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[80:83] // left value = acc[80+0:83+0]
/*  mfmaIndex:213  */
v_mfma_f32_16x16x32_fp8_fp8 acc[84:87], v[vgprValuB_X2_I0+20+2+0:vgprValuB_X2_I0+20+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[84:87] // left value = acc[84+0:87+0]
/*  mfmaIndex:214  */
v_mfma_f32_16x16x32_fp8_fp8 acc[88:91], v[vgprValuB_X2_I0+20+2+0:vgprValuB_X2_I0+20+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[88:91] // left value = acc[88+0:91+0]
/*  mfmaIndex:215  */
v_mfma_f32_16x16x32_fp8_fp8 acc[92:95], v[vgprValuB_X2_I0+20+2+0:vgprValuB_X2_I0+20+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[92:95] // left value = acc[92+0:95+0]
/*  mfmaIndex:216  */
v_mfma_f32_16x16x32_fp8_fp8 acc[96:99], v[vgprValuB_X2_I0+24+2+0:vgprValuB_X2_I0+24+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[96:99] // left value = acc[96+0:99+0]
/*  mfmaIndex:217  */
v_mfma_f32_16x16x32_fp8_fp8 acc[100:103], v[vgprValuB_X2_I0+24+2+0:vgprValuB_X2_I0+24+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[100:103] // left value = acc[100+0:103+0]
/*  mfmaIndex:218  */
v_mfma_f32_16x16x32_fp8_fp8 acc[104:107], v[vgprValuB_X2_I0+24+2+0:vgprValuB_X2_I0+24+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[104:107] // left value = acc[104+0:107+0]
/*  mfmaIndex:219  */
v_mfma_f32_16x16x32_fp8_fp8 acc[108:111], v[vgprValuB_X2_I0+24+2+0:vgprValuB_X2_I0+24+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[108:111] // left value = acc[108+0:111+0]
/*  mfmaIndex:220  */
v_mfma_f32_16x16x32_fp8_fp8 acc[112:115], v[vgprValuB_X2_I0+28+2+0:vgprValuB_X2_I0+28+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[112:115] // left value = acc[112+0:115+0]
/*  mfmaIndex:221  */
v_mfma_f32_16x16x32_fp8_fp8 acc[116:119], v[vgprValuB_X2_I0+28+2+0:vgprValuB_X2_I0+28+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[116:119] // left value = acc[116+0:119+0]
/*  mfmaIndex:222  */
v_mfma_f32_16x16x32_fp8_fp8 acc[120:123], v[vgprValuB_X2_I0+28+2+0:vgprValuB_X2_I0+28+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[120:123] // left value = acc[120+0:123+0]
/*  mfmaIndex:223  */
v_mfma_f32_16x16x32_fp8_fp8 acc[124:127], v[vgprValuB_X2_I0+28+2+0:vgprValuB_X2_I0+28+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[124:127] // left value = acc[124+0:127+0]
/*  mfmaIndex:224  */
v_mfma_f32_16x16x32_fp8_fp8 acc[128:131], v[vgprValuB_X2_I0+32+2+0:vgprValuB_X2_I0+32+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[128:131] // left value = acc[128+0:131+0]
/*  mfmaIndex:225  */
v_mfma_f32_16x16x32_fp8_fp8 acc[132:135], v[vgprValuB_X2_I0+32+2+0:vgprValuB_X2_I0+32+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[132:135] // left value = acc[132+0:135+0]
/*  mfmaIndex:226  */
v_mfma_f32_16x16x32_fp8_fp8 acc[136:139], v[vgprValuB_X2_I0+32+2+0:vgprValuB_X2_I0+32+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[136:139] // left value = acc[136+0:139+0]
/*  mfmaIndex:227  */
v_mfma_f32_16x16x32_fp8_fp8 acc[140:143], v[vgprValuB_X2_I0+32+2+0:vgprValuB_X2_I0+32+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[140:143] // left value = acc[140+0:143+0]
/*  mfmaIndex:228  */
v_mfma_f32_16x16x32_fp8_fp8 acc[144:147], v[vgprValuB_X2_I0+36+2+0:vgprValuB_X2_I0+36+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[144:147] // left value = acc[144+0:147+0]
/*  mfmaIndex:229  */
v_mfma_f32_16x16x32_fp8_fp8 acc[148:151], v[vgprValuB_X2_I0+36+2+0:vgprValuB_X2_I0+36+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[148:151] // left value = acc[148+0:151+0]
/*  mfmaIndex:230  */
v_mfma_f32_16x16x32_fp8_fp8 acc[152:155], v[vgprValuB_X2_I0+36+2+0:vgprValuB_X2_I0+36+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[152:155] // left value = acc[152+0:155+0]
/*  mfmaIndex:231  */
v_mfma_f32_16x16x32_fp8_fp8 acc[156:159], v[vgprValuB_X2_I0+36+2+0:vgprValuB_X2_I0+36+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[156:159] // left value = acc[156+0:159+0]
/*  mfmaIndex:232  */
v_mfma_f32_16x16x32_fp8_fp8 acc[160:163], v[vgprValuB_X2_I0+40+2+0:vgprValuB_X2_I0+40+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[160:163] // left value = acc[160+0:163+0]
/*  mfmaIndex:233  */
v_mfma_f32_16x16x32_fp8_fp8 acc[164:167], v[vgprValuB_X2_I0+40+2+0:vgprValuB_X2_I0+40+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[164:167] // left value = acc[164+0:167+0]
/*  mfmaIndex:234  */
v_mfma_f32_16x16x32_fp8_fp8 acc[168:171], v[vgprValuB_X2_I0+40+2+0:vgprValuB_X2_I0+40+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[168:171] // left value = acc[168+0:171+0]
/*  mfmaIndex:235  */
v_mfma_f32_16x16x32_fp8_fp8 acc[172:175], v[vgprValuB_X2_I0+40+2+0:vgprValuB_X2_I0+40+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[172:175] // left value = acc[172+0:175+0]
/*  mfmaIndex:236  */
v_mfma_f32_16x16x32_fp8_fp8 acc[176:179], v[vgprValuB_X2_I0+44+2+0:vgprValuB_X2_I0+44+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[176:179] // left value = acc[176+0:179+0]
/*  mfmaIndex:237  */
v_mfma_f32_16x16x32_fp8_fp8 acc[180:183], v[vgprValuB_X2_I0+44+2+0:vgprValuB_X2_I0+44+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[180:183] // left value = acc[180+0:183+0]
/*  mfmaIndex:238  */
v_mfma_f32_16x16x32_fp8_fp8 acc[184:187], v[vgprValuB_X2_I0+44+2+0:vgprValuB_X2_I0+44+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[184:187] // left value = acc[184+0:187+0]
/*  mfmaIndex:239  */
v_mfma_f32_16x16x32_fp8_fp8 acc[188:191], v[vgprValuB_X2_I0+44+2+0:vgprValuB_X2_I0+44+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[188:191] // left value = acc[188+0:191+0]
/*  mfmaIndex:240  */
v_mfma_f32_16x16x32_fp8_fp8 acc[192:195], v[vgprValuB_X2_I0+48+2+0:vgprValuB_X2_I0+48+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[192:195] // left value = acc[192+0:195+0]
/*  mfmaIndex:241  */
v_mfma_f32_16x16x32_fp8_fp8 acc[196:199], v[vgprValuB_X2_I0+48+2+0:vgprValuB_X2_I0+48+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[196:199] // left value = acc[196+0:199+0]
/*  mfmaIndex:242  */
v_mfma_f32_16x16x32_fp8_fp8 acc[200:203], v[vgprValuB_X2_I0+48+2+0:vgprValuB_X2_I0+48+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[200:203] // left value = acc[200+0:203+0]
/*  mfmaIndex:243  */
v_mfma_f32_16x16x32_fp8_fp8 acc[204:207], v[vgprValuB_X2_I0+48+2+0:vgprValuB_X2_I0+48+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[204:207] // left value = acc[204+0:207+0]
/*  mfmaIndex:244  */
v_mfma_f32_16x16x32_fp8_fp8 acc[208:211], v[vgprValuB_X2_I0+52+2+0:vgprValuB_X2_I0+52+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[208:211] // left value = acc[208+0:211+0]
/*  mfmaIndex:245  */
v_mfma_f32_16x16x32_fp8_fp8 acc[212:215], v[vgprValuB_X2_I0+52+2+0:vgprValuB_X2_I0+52+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[212:215] // left value = acc[212+0:215+0]
/*  mfmaIndex:246  */
v_mfma_f32_16x16x32_fp8_fp8 acc[216:219], v[vgprValuB_X2_I0+52+2+0:vgprValuB_X2_I0+52+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[216:219] // left value = acc[216+0:219+0]
/*  mfmaIndex:247  */
v_mfma_f32_16x16x32_fp8_fp8 acc[220:223], v[vgprValuB_X2_I0+52+2+0:vgprValuB_X2_I0+52+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[220:223] // left value = acc[220+0:223+0]
/*  mfmaIndex:248  */
v_mfma_f32_16x16x32_fp8_fp8 acc[224:227], v[vgprValuB_X2_I0+56+2+0:vgprValuB_X2_I0+56+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[224:227] // left value = acc[224+0:227+0]
/*  mfmaIndex:249  */
v_mfma_f32_16x16x32_fp8_fp8 acc[228:231], v[vgprValuB_X2_I0+56+2+0:vgprValuB_X2_I0+56+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[228:231] // left value = acc[228+0:231+0]
/*  mfmaIndex:250  */
v_mfma_f32_16x16x32_fp8_fp8 acc[232:235], v[vgprValuB_X2_I0+56+2+0:vgprValuB_X2_I0+56+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[232:235] // left value = acc[232+0:235+0]
/*  mfmaIndex:251  */
v_mfma_f32_16x16x32_fp8_fp8 acc[236:239], v[vgprValuB_X2_I0+56+2+0:vgprValuB_X2_I0+56+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[236:239] // left value = acc[236+0:239+0]
/*  mfmaIndex:252  */
v_mfma_f32_16x16x32_fp8_fp8 acc[240:243], v[vgprValuB_X2_I0+60+2+0:vgprValuB_X2_I0+60+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[240:243] // left value = acc[240+0:243+0]
/*  mfmaIndex:253  */
v_mfma_f32_16x16x32_fp8_fp8 acc[244:247], v[vgprValuB_X2_I0+60+2+0:vgprValuB_X2_I0+60+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[244:247] // left value = acc[244+0:247+0]
/*  mfmaIndex:254  */
v_mfma_f32_16x16x32_fp8_fp8 acc[248:251], v[vgprValuB_X2_I0+60+2+0:vgprValuB_X2_I0+60+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[248:251] // left value = acc[248+0:251+0]
/*  mfmaIndex:255  */
v_mfma_f32_16x16x32_fp8_fp8 acc[252:255], v[vgprValuB_X2_I0+60+2+0:vgprValuB_X2_I0+60+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[252:255] // left value = acc[252+0:255+0]
/* numPrefetchIter=0 */
/* dataAtIterA=0 numReadsIterA=1 skipReadsIterA=0 readsPerIterA=4 */
/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=0 readsPerIterB=16 */

/* Unconditional jump to label_PrefetchGlobalLastIterEnd (defined outside */
/* this section), which skips the label_LoopEndL_odd path that follows.   */
s_branch label_PrefetchGlobalLastIterEnd

label_LoopEndL_odd:
/*
 * Entry point for the code path that follows; the section just above ends
 * with an unconditional s_branch, so execution reaches this label only by
 * an explicit branch to it.
 * NOTE(review): the branch site is not in this part of the file; the name
 * suggests it is taken when the main loop exits on an odd iteration --
 * confirm at the branch.
 *
 * The two .set directives rebind the un-suffixed A-operand symbols to the
 * "_1" register bases, so every later use of vgprValuA_X0_I0 and
 * vgprValuA_X2_I0 assembles against vgprValuA_X0_I0_1 / vgprValuA_X2_I0_1.
 * In the loop that follows, the MFMAs read A through the un-suffixed
 * names while the buffer_loads write to the "_0" symbols.
 * NOTE(review): presumably "_0" and "_1" are two distinct VGPR sets used
 * as alternating buffers -- verify against the symbol definitions.
 */
.set vgprValuA_X0_I0, vgprValuA_X0_I0_1
.set vgprValuA_X2_I0, vgprValuA_X2_I0_1

/* Before NLL: Check VGPR.checkin for INT8 LW */

/******************************************/
/* Ord. NoGlobalLoadLoop - Begin          */
/******************************************/
/* Stall until at most 8 vector-memory operations are still outstanding.  */
/* NOTE(review): which earlier loads this guards is not visible here.     */
s_waitcnt vmcnt(8)

/* iter 0 */
/*  grEndMfmaIndex:18, lwStartMfmaIndex:35, lwEndMfmaIndex:223  */
/*  numMfmaForLR:30, syncPlrMfmaIndex:225  */
/*  mfmaIndex:0  */
s_waitcnt lgkmcnt(0)                               // wait for prior local read local write old=0, new=0 newLW=0 newLR=0
v_mfma_f32_16x16x32_fp8_fp8 acc[0:3], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[0:3] // left value = acc[0+0:3+0]
/*  mfmaIndex:1  */
buffer_load_dwordx4 v[vgprValuA_X0_I0_0+0:vgprValuA_X0_I0_0+0+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0 // G -> Reg 0_0_0_0
buffer_load_dwordx4 v[vgprValuA_X2_I0_0+0:vgprValuA_X2_I0_0+0+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:0 // G -> Reg 0_0_1_0
/* global read inc B loopL */
s_cmp_eq_u32 s[sgprLoopCounterL], s[sgprStaggerUIter] // Is this the wrapIter?
v_mfma_f32_16x16x32_fp8_fp8 acc[4:7], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[4:7] // left value = acc[4+0:7+0]
/*  mfmaIndex:2  */
ds_read_b128 v[vgprValuB_X2_I0+0:vgprValuB_X2_I0+0+3], v[vgprLocalReadAddrB] offset:64 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0
s_cselect_b32 s80, s[sgprWrapUB+0], s[sgprGlobalReadIncsB+0] // incLower <- ?
v_mfma_f32_16x16x32_fp8_fp8 acc[8:11], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[8:11] // left value = acc[8+0:11+0]
/*  mfmaIndex:3  */
s_cselect_b32 s81, s[sgprWrapUB+1], 0              // incUpper <- ?
v_mfma_f32_16x16x32_fp8_fp8 acc[12:15], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[12:15] // left value = acc[12+0:15+0]
/*  mfmaIndex:4  */
s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s80        // gra SRD += inc(lower)
v_mfma_f32_16x16x32_fp8_fp8 acc[16:19], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[16:19] // left value = acc[16+0:19+0]
/*  mfmaIndex:5  */
ds_read_b128 v[vgprValuB_X2_I0+4:vgprValuB_X2_I0+4+3], v[vgprLocalReadAddrB] offset:192 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=2 iui=0
s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s81       // gra SRD += inc(upper)
v_mfma_f32_16x16x32_fp8_fp8 acc[20:23], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[20:23] // left value = acc[20+0:23+0]
/*  mfmaIndex:6  */
s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s80 // limit -= inc)
v_mfma_f32_16x16x32_fp8_fp8 acc[24:27], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[24:27] // left value = acc[24+0:27+0]
/*  mfmaIndex:7  */
s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s81 // limit -= inc)
v_mfma_f32_16x16x32_fp8_fp8 acc[28:31], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[28:31] // left value = acc[28+0:31+0]
/*  mfmaIndex:8  */
ds_read_b128 v[vgprValuB_X2_I0+8:vgprValuB_X2_I0+8+3], v[vgprLocalReadAddrB] offset:320 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=2 iui=0
s_cmp_eq_u32 s[sgprShadowLimitB+1], 0              // are we within 2^32?
v_mfma_f32_16x16x32_fp8_fp8 acc[32:35], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[32:35] // left value = acc[32+0:35+0]
/*  mfmaIndex:9  */
s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32
v_mfma_f32_16x16x32_fp8_fp8 acc[36:39], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[36:39] // left value = acc[36+0:39+0]
/*  mfmaIndex:10  */
v_mfma_f32_16x16x32_fp8_fp8 acc[40:43], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[40:43] // left value = acc[40+0:43+0]
/*  mfmaIndex:11  */
ds_read_b128 v[vgprValuB_X2_I0+12:vgprValuB_X2_I0+12+3], v[vgprLocalReadAddrB] offset:448 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[44:47], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[44:47] // left value = acc[44+0:47+0]
/*  mfmaIndex:12  */
v_mfma_f32_16x16x32_fp8_fp8 acc[48:51], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[48:51] // left value = acc[48+0:51+0]
/*  mfmaIndex:13  */
v_mfma_f32_16x16x32_fp8_fp8 acc[52:55], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[52:55] // left value = acc[52+0:55+0]
/*  mfmaIndex:14  */
ds_read_b128 v[vgprValuB_X2_I0+16:vgprValuB_X2_I0+16+3], v[vgprLocalReadAddrB] offset:576 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[56:59], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[56:59] // left value = acc[56+0:59+0]
/*  mfmaIndex:15  */
v_mfma_f32_16x16x32_fp8_fp8 acc[60:63], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[60:63] // left value = acc[60+0:63+0]
/*  mfmaIndex:16  */
v_mfma_f32_16x16x32_fp8_fp8 acc[64:67], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[64:67] // left value = acc[64+0:67+0]
/*  mfmaIndex:17  */
ds_read_b128 v[vgprValuB_X2_I0+20:vgprValuB_X2_I0+20+3], v[vgprLocalReadAddrB] offset:704 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[68:71], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[68:71] // left value = acc[68+0:71+0]
/*  mfmaIndex:18  */
v_mfma_f32_16x16x32_fp8_fp8 acc[72:75], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[72:75] // left value = acc[72+0:75+0]
/*  mfmaIndex:19  */
v_mfma_f32_16x16x32_fp8_fp8 acc[76:79], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[76:79] // left value = acc[76+0:79+0]
/*  mfmaIndex:20  */
ds_read_b128 v[vgprValuB_X2_I0+24:vgprValuB_X2_I0+24+3], v[vgprLocalReadAddrB] offset:832 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[80:83], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[80:83] // left value = acc[80+0:83+0]
/*  mfmaIndex:21  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[84:87], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[84:87] // left value = acc[84+0:87+0]
/*  mfmaIndex:22  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[88:91], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[88:91] // left value = acc[88+0:91+0]
/*  mfmaIndex:23  */
/* localReadsVacancy: latencyLeft 2 */
ds_read_b128 v[vgprValuB_X2_I0+28:vgprValuB_X2_I0+28+3], v[vgprLocalReadAddrB] offset:960 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[92:95], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[92:95] // left value = acc[92+0:95+0]
/*  mfmaIndex:24  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[96:99], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[96:99] // left value = acc[96+0:99+0]
/*  mfmaIndex:25  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[100:103], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[100:103] // left value = acc[100+0:103+0]
/*  mfmaIndex:26  */
/* localReadsVacancy: latencyLeft 2 */
ds_read_b128 v[vgprValuB_X2_I0+32:vgprValuB_X2_I0+32+3], v[vgprLocalReadAddrB] offset:1088 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=8 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[104:107], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[104:107] // left value = acc[104+0:107+0]
/*  mfmaIndex:27  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[108:111], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[108:111] // left value = acc[108+0:111+0]
/*  mfmaIndex:28  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[112:115], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[112:115] // left value = acc[112+0:115+0]
/*  mfmaIndex:29  */
/* localReadsVacancy: latencyLeft 2 */
ds_read_b128 v[vgprValuB_X2_I0+36:vgprValuB_X2_I0+36+3], v[vgprLocalReadAddrB] offset:1216 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=9 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[116:119], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[116:119] // left value = acc[116+0:119+0]
/*  mfmaIndex:30  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[120:123], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[120:123] // left value = acc[120+0:123+0]
/*  mfmaIndex:31  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[124:127], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[124:127] // left value = acc[124+0:127+0]
/*  mfmaIndex:32  */
/* localReadsVacancy: latencyLeft 2 */
buffer_load_dwordx4 v[vgprValuA_X0_I0_0+4:vgprValuA_X0_I0_0+4+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:0 // G -> Reg 0_0_1_0
buffer_load_dwordx4 v[vgprValuA_X2_I0_0+4:vgprValuA_X2_I0_0+4+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:0 // G -> Reg 0_0_1_0
v_mfma_f32_16x16x32_fp8_fp8 acc[128:131], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[128:131] // left value = acc[128+0:131+0]
/*  mfmaIndex:33  */
/* localReadsVacancy: latencyLeft 2 */
ds_read_b128 v[vgprValuB_X2_I0+40:vgprValuB_X2_I0+40+3], v[vgprLocalReadAddrB] offset:1344 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=10 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[132:135], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[132:135] // left value = acc[132+0:135+0]
/*  mfmaIndex:34  */
v_mfma_f32_16x16x32_fp8_fp8 acc[136:139], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[136:139] // left value = acc[136+0:139+0]
/*  mfmaIndex:35  */
v_mfma_f32_16x16x32_fp8_fp8 acc[140:143], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[140:143] // left value = acc[140+0:143+0]
/*  mfmaIndex:36  */
ds_read_b128 v[vgprValuB_X2_I0+44:vgprValuB_X2_I0+44+3], v[vgprLocalReadAddrB] offset:1472 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=11 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[144:147], v[vgprValuB_X0_I0+36+0+0:vgprValuB_X0_I0+36+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[144:147] // left value = acc[144+0:147+0]
/*  mfmaIndex:37  */
v_mfma_f32_16x16x32_fp8_fp8 acc[148:151], v[vgprValuB_X0_I0+36+0+0:vgprValuB_X0_I0+36+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[148:151] // left value = acc[148+0:151+0]
/*  mfmaIndex:38  */
v_mfma_f32_16x16x32_fp8_fp8 acc[152:155], v[vgprValuB_X0_I0+36+0+0:vgprValuB_X0_I0+36+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[152:155] // left value = acc[152+0:155+0]
/*  mfmaIndex:39  */
ds_read_b128 v[vgprValuB_X2_I0+48:vgprValuB_X2_I0+48+3], v[vgprLocalReadAddrB] offset:1600 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=12 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[156:159], v[vgprValuB_X0_I0+36+0+0:vgprValuB_X0_I0+36+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[156:159] // left value = acc[156+0:159+0]
/*  mfmaIndex:40  */
v_mfma_f32_16x16x32_fp8_fp8 acc[160:163], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[160:163] // left value = acc[160+0:163+0]
/*  mfmaIndex:41  */
v_mfma_f32_16x16x32_fp8_fp8 acc[164:167], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[164:167] // left value = acc[164+0:167+0]
/*  mfmaIndex:42  */
ds_read_b128 v[vgprValuB_X2_I0+52:vgprValuB_X2_I0+52+3], v[vgprLocalReadAddrB] offset:1728 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=13 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[168:171], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[168:171] // left value = acc[168+0:171+0]
/*  mfmaIndex:43  */
v_mfma_f32_16x16x32_fp8_fp8 acc[172:175], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[172:175] // left value = acc[172+0:175+0]
/*  mfmaIndex:44  */
v_mfma_f32_16x16x32_fp8_fp8 acc[176:179], v[vgprValuB_X0_I0+44+0+0:vgprValuB_X0_I0+44+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[176:179] // left value = acc[176+0:179+0]
/*  mfmaIndex:45  */
ds_read_b128 v[vgprValuB_X2_I0+56:vgprValuB_X2_I0+56+3], v[vgprLocalReadAddrB] offset:1856 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=14 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[180:183], v[vgprValuB_X0_I0+44+0+0:vgprValuB_X0_I0+44+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[180:183] // left value = acc[180+0:183+0]
/*  mfmaIndex:46  */
v_mfma_f32_16x16x32_fp8_fp8 acc[184:187], v[vgprValuB_X0_I0+44+0+0:vgprValuB_X0_I0+44+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[184:187] // left value = acc[184+0:187+0]
/*  mfmaIndex:47  */
v_mfma_f32_16x16x32_fp8_fp8 acc[188:191], v[vgprValuB_X0_I0+44+0+0:vgprValuB_X0_I0+44+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[188:191] // left value = acc[188+0:191+0]
/*  mfmaIndex:48  */
ds_read_b128 v[vgprValuB_X2_I0+60:vgprValuB_X2_I0+60+3], v[vgprLocalReadAddrB] offset:1984 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=15 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[192:195], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[192:195] // left value = acc[192+0:195+0]
/*  mfmaIndex:49  */
v_mfma_f32_16x16x32_fp8_fp8 acc[196:199], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[196:199] // left value = acc[196+0:199+0]
/*  mfmaIndex:50  */
v_mfma_f32_16x16x32_fp8_fp8 acc[200:203], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[200:203] // left value = acc[200+0:203+0]
/*  mfmaIndex:51  */
v_mfma_f32_16x16x32_fp8_fp8 acc[204:207], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[204:207] // left value = acc[204+0:207+0]
/*  mfmaIndex:52  */
v_mfma_f32_16x16x32_fp8_fp8 acc[208:211], v[vgprValuB_X0_I0+52+0+0:vgprValuB_X0_I0+52+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[208:211] // left value = acc[208+0:211+0]
/*  mfmaIndex:53  */
v_mfma_f32_16x16x32_fp8_fp8 acc[212:215], v[vgprValuB_X0_I0+52+0+0:vgprValuB_X0_I0+52+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[212:215] // left value = acc[212+0:215+0]
/*  mfmaIndex:54  */
v_mfma_f32_16x16x32_fp8_fp8 acc[216:219], v[vgprValuB_X0_I0+52+0+0:vgprValuB_X0_I0+52+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[216:219] // left value = acc[216+0:219+0]
/*  mfmaIndex:55  */
v_mfma_f32_16x16x32_fp8_fp8 acc[220:223], v[vgprValuB_X0_I0+52+0+0:vgprValuB_X0_I0+52+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[220:223] // left value = acc[220+0:223+0]
/*  mfmaIndex:56  */
v_mfma_f32_16x16x32_fp8_fp8 acc[224:227], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[224:227] // left value = acc[224+0:227+0]
/*  mfmaIndex:57  */
v_mfma_f32_16x16x32_fp8_fp8 acc[228:231], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[228:231] // left value = acc[228+0:231+0]
/*  mfmaIndex:58  */
v_mfma_f32_16x16x32_fp8_fp8 acc[232:235], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[232:235] // left value = acc[232+0:235+0]
/*  mfmaIndex:59  */
v_mfma_f32_16x16x32_fp8_fp8 acc[236:239], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[236:239] // left value = acc[236+0:239+0]
/*  mfmaIndex:60  */
v_mfma_f32_16x16x32_fp8_fp8 acc[240:243], v[vgprValuB_X0_I0+60+0+0:vgprValuB_X0_I0+60+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[240:243] // left value = acc[240+0:243+0]
/*  mfmaIndex:61  */
v_mfma_f32_16x16x32_fp8_fp8 acc[244:247], v[vgprValuB_X0_I0+60+0+0:vgprValuB_X0_I0+60+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[244:247] // left value = acc[244+0:247+0]
/*  mfmaIndex:62  */
/* 1 LDS buffer: read-sync-write */
/* The ds_read_b128 instructions issued earlier in this iteration and the */
/* ds_write_b128 instructions that begin in iter 1 use the same (single) LDS */
/* buffer, so all local reads must finish before any local write starts. */
/* lgkmcnt(0): wait until this wave has no outstanding LGKM-counted operations */
/* (this covers the ds_read_b128 instructions above). */
s_waitcnt lgkmcnt(0)
/* Workgroup barrier: no wave proceeds to the local writes until every wave */
/* has reached this point, i.e. has completed its own local reads. */
s_barrier
v_mfma_f32_16x16x32_fp8_fp8 acc[248:251], v[vgprValuB_X0_I0+60+0+0:vgprValuB_X0_I0+60+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[248:251] // left value = acc[248+0:251+0]
/*  mfmaIndex:63  */
/* vmcnt(11): allow at most 11 vector-memory operations to remain outstanding. */
/* NOTE(review): presumably this guarantees the global load that filled */
/* vgprG2LB+0..3 has returned before the ds_write_b128 that follows in iter 1; */
/* that load is issued outside this part of the file - confirm the count there. */
s_waitcnt vmcnt(11)                                // wait for global read before writing to local
v_mfma_f32_16x16x32_fp8_fp8 acc[252:255], v[vgprValuB_X0_I0+60+0+0:vgprValuB_X0_I0+60+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[252:255] // left value = acc[252+0:255+0]
/* numPrefetchIter=0 */
/* dataAtIterA=-1 numReadsIterA=1 skipReadsIterA=1 readsPerIterA=4 */
/* dataAtIterB=-1 numReadsIterB=1 skipReadsIterB=1 readsPerIterB=16 */

/* iter 1 */
/*  grEndMfmaIndex:18, lwStartMfmaIndex:35, lwEndMfmaIndex:223  */
/*  numMfmaForLR:30, syncPlrMfmaIndex:225  */
/*  mfmaIndex:64  */
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+0:vgprG2LB+0+3] offset:0 // lwoB_0_0_0_0 = (0*LSCB)*(MT1J+PAD) + (0*LSPB) = 0
v_mfma_f32_16x16x32_fp8_fp8 acc[0:3], v[vgprValuB_X0_I0+0+2+0:vgprValuB_X0_I0+0+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[0:3] // left value = acc[0+0:3+0]
/*  mfmaIndex:65  */
buffer_load_dwordx4 v[vgprValuA_X0_I0_0+8:vgprValuA_X0_I0_0+8+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:0 // G -> Reg 0_0_1_0
buffer_load_dwordx4 v[vgprValuA_X2_I0_0+8:vgprValuA_X2_I0_0+8+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:0 // G -> Reg 0_0_1_0
v_mfma_f32_16x16x32_fp8_fp8 acc[4:7], v[vgprValuB_X0_I0+0+2+0:vgprValuB_X0_I0+0+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[4:7] // left value = acc[4+0:7+0]
/*  mfmaIndex:66  */
v_mfma_f32_16x16x32_fp8_fp8 acc[8:11], v[vgprValuB_X0_I0+0+2+0:vgprValuB_X0_I0+0+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[8:11] // left value = acc[8+0:11+0]
/*  mfmaIndex:67  */
v_mfma_f32_16x16x32_fp8_fp8 acc[12:15], v[vgprValuB_X0_I0+0+2+0:vgprValuB_X0_I0+0+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[12:15] // left value = acc[12+0:15+0]
/*  mfmaIndex:68  */
v_mfma_f32_16x16x32_fp8_fp8 acc[16:19], v[vgprValuB_X0_I0+4+2+0:vgprValuB_X0_I0+4+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[16:19] // left value = acc[16+0:19+0]
/*  mfmaIndex:69  */
v_mfma_f32_16x16x32_fp8_fp8 acc[20:23], v[vgprValuB_X0_I0+4+2+0:vgprValuB_X0_I0+4+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[20:23] // left value = acc[20+0:23+0]
/*  mfmaIndex:70  */
v_mfma_f32_16x16x32_fp8_fp8 acc[24:27], v[vgprValuB_X0_I0+4+2+0:vgprValuB_X0_I0+4+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[24:27] // left value = acc[24+0:27+0]
/*  mfmaIndex:71  */
v_mfma_f32_16x16x32_fp8_fp8 acc[28:31], v[vgprValuB_X0_I0+4+2+0:vgprValuB_X0_I0+4+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[28:31] // left value = acc[28+0:31+0]
/*  mfmaIndex:72  */
v_mfma_f32_16x16x32_fp8_fp8 acc[32:35], v[vgprValuB_X0_I0+8+2+0:vgprValuB_X0_I0+8+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[32:35] // left value = acc[32+0:35+0]
/*  mfmaIndex:73  */
v_mfma_f32_16x16x32_fp8_fp8 acc[36:39], v[vgprValuB_X0_I0+8+2+0:vgprValuB_X0_I0+8+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[36:39] // left value = acc[36+0:39+0]
/*  mfmaIndex:74  */
v_mfma_f32_16x16x32_fp8_fp8 acc[40:43], v[vgprValuB_X0_I0+8+2+0:vgprValuB_X0_I0+8+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[40:43] // left value = acc[40+0:43+0]
/*  mfmaIndex:75  */
v_mfma_f32_16x16x32_fp8_fp8 acc[44:47], v[vgprValuB_X0_I0+8+2+0:vgprValuB_X0_I0+8+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[44:47] // left value = acc[44+0:47+0]
/*  mfmaIndex:76  */
v_mfma_f32_16x16x32_fp8_fp8 acc[48:51], v[vgprValuB_X0_I0+12+2+0:vgprValuB_X0_I0+12+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[48:51] // left value = acc[48+0:51+0]
/*  mfmaIndex:77  */
v_mfma_f32_16x16x32_fp8_fp8 acc[52:55], v[vgprValuB_X0_I0+12+2+0:vgprValuB_X0_I0+12+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[52:55] // left value = acc[52+0:55+0]
/*  mfmaIndex:78  */
s_waitcnt vmcnt(12)                                // wait for global read before writing to local
v_mfma_f32_16x16x32_fp8_fp8 acc[56:59], v[vgprValuB_X0_I0+12+2+0:vgprValuB_X0_I0+12+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[56:59] // left value = acc[56+0:59+0]
/*  mfmaIndex:79  */
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+4:vgprG2LB+4+3] offset:4160 // lwoB_0_0_1_0 = (0*LSCB)*(MT1J+PAD) + (1*LSPB) = 4160
v_mfma_f32_16x16x32_fp8_fp8 acc[60:63], v[vgprValuB_X0_I0+12+2+0:vgprValuB_X0_I0+12+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[60:63] // left value = acc[60+0:63+0]
/*  mfmaIndex:80  */
v_mfma_f32_16x16x32_fp8_fp8 acc[64:67], v[vgprValuB_X0_I0+16+2+0:vgprValuB_X0_I0+16+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[64:67] // left value = acc[64+0:67+0]
/*  mfmaIndex:81  */
v_mfma_f32_16x16x32_fp8_fp8 acc[68:71], v[vgprValuB_X0_I0+16+2+0:vgprValuB_X0_I0+16+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[68:71] // left value = acc[68+0:71+0]
/*  mfmaIndex:82  */
v_mfma_f32_16x16x32_fp8_fp8 acc[72:75], v[vgprValuB_X0_I0+16+2+0:vgprValuB_X0_I0+16+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[72:75] // left value = acc[72+0:75+0]
/*  mfmaIndex:83  */
v_mfma_f32_16x16x32_fp8_fp8 acc[76:79], v[vgprValuB_X0_I0+16+2+0:vgprValuB_X0_I0+16+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[76:79] // left value = acc[76+0:79+0]
/*  mfmaIndex:84  */
v_mfma_f32_16x16x32_fp8_fp8 acc[80:83], v[vgprValuB_X0_I0+20+2+0:vgprValuB_X0_I0+20+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[80:83] // left value = acc[80+0:83+0]
/*  mfmaIndex:85  */
v_mfma_f32_16x16x32_fp8_fp8 acc[84:87], v[vgprValuB_X0_I0+20+2+0:vgprValuB_X0_I0+20+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[84:87] // left value = acc[84+0:87+0]
/*  mfmaIndex:86  */
v_mfma_f32_16x16x32_fp8_fp8 acc[88:91], v[vgprValuB_X0_I0+20+2+0:vgprValuB_X0_I0+20+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[88:91] // left value = acc[88+0:91+0]
/*  mfmaIndex:87  */
v_mfma_f32_16x16x32_fp8_fp8 acc[92:95], v[vgprValuB_X0_I0+20+2+0:vgprValuB_X0_I0+20+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[92:95] // left value = acc[92+0:95+0]
/*  mfmaIndex:88  */
v_mfma_f32_16x16x32_fp8_fp8 acc[96:99], v[vgprValuB_X0_I0+24+2+0:vgprValuB_X0_I0+24+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[96:99] // left value = acc[96+0:99+0]
/*  mfmaIndex:89  */
v_mfma_f32_16x16x32_fp8_fp8 acc[100:103], v[vgprValuB_X0_I0+24+2+0:vgprValuB_X0_I0+24+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[100:103] // left value = acc[100+0:103+0]
/*  mfmaIndex:90  */
v_mfma_f32_16x16x32_fp8_fp8 acc[104:107], v[vgprValuB_X0_I0+24+2+0:vgprValuB_X0_I0+24+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[104:107] // left value = acc[104+0:107+0]
/*  mfmaIndex:91  */
v_mfma_f32_16x16x32_fp8_fp8 acc[108:111], v[vgprValuB_X0_I0+24+2+0:vgprValuB_X0_I0+24+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[108:111] // left value = acc[108+0:111+0]
/*  mfmaIndex:92  */
v_mfma_f32_16x16x32_fp8_fp8 acc[112:115], v[vgprValuB_X0_I0+28+2+0:vgprValuB_X0_I0+28+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[112:115] // left value = acc[112+0:115+0]
/*  mfmaIndex:93  */
v_mfma_f32_16x16x32_fp8_fp8 acc[116:119], v[vgprValuB_X0_I0+28+2+0:vgprValuB_X0_I0+28+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[116:119] // left value = acc[116+0:119+0]
/*  mfmaIndex:94  */
s_waitcnt vmcnt(11)                                // wait for global read before writing to local
v_mfma_f32_16x16x32_fp8_fp8 acc[120:123], v[vgprValuB_X0_I0+28+2+0:vgprValuB_X0_I0+28+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[120:123] // left value = acc[120+0:123+0]
/*  mfmaIndex:95  */
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+8:vgprG2LB+8+3] offset:8320 // lwoB_0_0_2_0 = (0*LSCB)*(MT1J+PAD) + (2*LSPB) = 8320
v_mfma_f32_16x16x32_fp8_fp8 acc[124:127], v[vgprValuB_X0_I0+28+2+0:vgprValuB_X0_I0+28+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[124:127] // left value = acc[124+0:127+0]
/*  mfmaIndex:96  */
buffer_load_dwordx4 v[vgprValuA_X0_I0_0+12:vgprValuA_X0_I0_0+12+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:0 // G -> Reg 0_0_1_0
buffer_load_dwordx4 v[vgprValuA_X2_I0_0+12:vgprValuA_X2_I0_0+12+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:0 // G -> Reg 0_0_1_0
/* global read inc A loopL */
/* Advances the A buffer descriptor base (sgprSrdA+0/+1) by a 64-bit increment */
/* and shrinks the 64-bit shadow limit (sgprShadowLimitA+0/+1) by the same amount. */
/* The increment is sgprWrapUA when LoopCounterL == StaggerUIter, otherwise */
/* GlobalReadIncsA+0 with a zero upper half. s80/s81 hold the increment (lo/hi). */
/* The scalar ops are spread between MFMAs; the MFMAs are vector ops and do not */
/* write SCC, so each SCC producer below still reaches its consumer intact. */
/* Ordering constraints: */
/*   - both s_cselect reads of SCC must precede s_add_u32, which overwrites SCC */
/*   - s_addc_u32 consumes the carry from s_add_u32; s_subb_u32 consumes the */
/*     borrow from s_sub_u32 */
/*   - the buffer_load_dwordx4 pair issued just above uses the SRD base before */
/*     it is advanced here */
s_cmp_eq_u32 s[sgprLoopCounterL], s[sgprStaggerUIter] // SCC = (LoopCounterL == StaggerUIter), i.e. is this the wrapIter?
v_mfma_f32_16x16x32_fp8_fp8 acc[128:131], v[vgprValuB_X0_I0+32+2+0:vgprValuB_X0_I0+32+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[128:131] // left value = acc[128+0:131+0]
/*  mfmaIndex:97  */
s_cselect_b32 s80, s[sgprWrapUA+0], s[sgprGlobalReadIncsA+0] // incLower = SCC ? WrapUA[0] : GlobalReadIncsA[0]
v_mfma_f32_16x16x32_fp8_fp8 acc[132:135], v[vgprValuB_X0_I0+32+2+0:vgprValuB_X0_I0+32+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[132:135] // left value = acc[132+0:135+0]
/*  mfmaIndex:98  */
s_cselect_b32 s81, s[sgprWrapUA+1], 0              // incUpper = SCC ? WrapUA[1] : 0
v_mfma_f32_16x16x32_fp8_fp8 acc[136:139], v[vgprValuB_X0_I0+32+2+0:vgprValuB_X0_I0+32+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[136:139] // left value = acc[136+0:139+0]
/*  mfmaIndex:99  */
s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s80        // gra SRD += inc(lower); SCC = carry out
v_mfma_f32_16x16x32_fp8_fp8 acc[140:143], v[vgprValuB_X0_I0+32+2+0:vgprValuB_X0_I0+32+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[140:143] // left value = acc[140+0:143+0]
/*  mfmaIndex:100  */
s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s81       // gra SRD += inc(upper) + carry
v_mfma_f32_16x16x32_fp8_fp8 acc[144:147], v[vgprValuB_X0_I0+36+2+0:vgprValuB_X0_I0+36+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[144:147] // left value = acc[144+0:147+0]
/*  mfmaIndex:101  */
s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s80 // limit -= inc(lower); SCC = borrow out
v_mfma_f32_16x16x32_fp8_fp8 acc[148:151], v[vgprValuB_X0_I0+36+2+0:vgprValuB_X0_I0+36+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[148:151] // left value = acc[148+0:151+0]
/*  mfmaIndex:102  */
s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s81 // limit -= inc(upper) + borrow
v_mfma_f32_16x16x32_fp8_fp8 acc[152:155], v[vgprValuB_X0_I0+36+2+0:vgprValuB_X0_I0+36+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[152:155] // left value = acc[152+0:155+0]
/*  mfmaIndex:103  */
s_cmp_eq_u32 s[sgprShadowLimitA+1], 0              // SCC = (upper half of shadow limit == 0), i.e. are we within 2^32?
v_mfma_f32_16x16x32_fp8_fp8 acc[156:159], v[vgprValuB_X0_I0+36+2+0:vgprValuB_X0_I0+36+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[156:159] // left value = acc[156+0:159+0]
/*  mfmaIndex:104  */
/* NOTE(review): sgprSrdA+2 is presumably the descriptor's limit word; BufferLimit is defined outside this part of the file - confirm. */
s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32, else use BufferLimit
v_mfma_f32_16x16x32_fp8_fp8 acc[160:163], v[vgprValuB_X0_I0+40+2+0:vgprValuB_X0_I0+40+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[160:163] // left value = acc[160+0:163+0]
/*  mfmaIndex:105  */
v_mfma_f32_16x16x32_fp8_fp8 acc[164:167], v[vgprValuB_X0_I0+40+2+0:vgprValuB_X0_I0+40+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[164:167] // left value = acc[164+0:167+0]
/*  mfmaIndex:106  */
v_mfma_f32_16x16x32_fp8_fp8 acc[168:171], v[vgprValuB_X0_I0+40+2+0:vgprValuB_X0_I0+40+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[168:171] // left value = acc[168+0:171+0]
/*  mfmaIndex:107  */
v_mfma_f32_16x16x32_fp8_fp8 acc[172:175], v[vgprValuB_X0_I0+40+2+0:vgprValuB_X0_I0+40+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[172:175] // left value = acc[172+0:175+0]
/*  mfmaIndex:108  */
v_mfma_f32_16x16x32_fp8_fp8 acc[176:179], v[vgprValuB_X0_I0+44+2+0:vgprValuB_X0_I0+44+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[176:179] // left value = acc[176+0:179+0]
/*  mfmaIndex:109  */
v_mfma_f32_16x16x32_fp8_fp8 acc[180:183], v[vgprValuB_X0_I0+44+2+0:vgprValuB_X0_I0+44+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[180:183] // left value = acc[180+0:183+0]
/*  mfmaIndex:110  */
s_waitcnt vmcnt(12)                                // wait for global read before writing to local
v_mfma_f32_16x16x32_fp8_fp8 acc[184:187], v[vgprValuB_X0_I0+44+2+0:vgprValuB_X0_I0+44+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[184:187] // left value = acc[184+0:187+0]
/*  mfmaIndex:111  */
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+12:vgprG2LB+12+3] offset:12480 // lwoB_0_0_3_0 = (0*LSCB)*(MT1J+PAD) + (3*LSPB) = 12480
v_mfma_f32_16x16x32_fp8_fp8 acc[188:191], v[vgprValuB_X0_I0+44+2+0:vgprValuB_X0_I0+44+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[188:191] // left value = acc[188+0:191+0]
/*  mfmaIndex:112  */
v_mfma_f32_16x16x32_fp8_fp8 acc[192:195], v[vgprValuB_X0_I0+48+2+0:vgprValuB_X0_I0+48+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[192:195] // left value = acc[192+0:195+0]
/*  mfmaIndex:113  */
v_mfma_f32_16x16x32_fp8_fp8 acc[196:199], v[vgprValuB_X0_I0+48+2+0:vgprValuB_X0_I0+48+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[196:199] // left value = acc[196+0:199+0]
/*  mfmaIndex:114  */
v_mfma_f32_16x16x32_fp8_fp8 acc[200:203], v[vgprValuB_X0_I0+48+2+0:vgprValuB_X0_I0+48+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[200:203] // left value = acc[200+0:203+0]
/*  mfmaIndex:115  */
v_mfma_f32_16x16x32_fp8_fp8 acc[204:207], v[vgprValuB_X0_I0+48+2+0:vgprValuB_X0_I0+48+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[204:207] // left value = acc[204+0:207+0]
/*  mfmaIndex:116  */
v_mfma_f32_16x16x32_fp8_fp8 acc[208:211], v[vgprValuB_X0_I0+52+2+0:vgprValuB_X0_I0+52+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[208:211] // left value = acc[208+0:211+0]
/*  mfmaIndex:117  */
v_mfma_f32_16x16x32_fp8_fp8 acc[212:215], v[vgprValuB_X0_I0+52+2+0:vgprValuB_X0_I0+52+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[212:215] // left value = acc[212+0:215+0]
/*  mfmaIndex:118  */
v_mfma_f32_16x16x32_fp8_fp8 acc[216:219], v[vgprValuB_X0_I0+52+2+0:vgprValuB_X0_I0+52+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[216:219] // left value = acc[216+0:219+0]
/*  mfmaIndex:119  */
v_mfma_f32_16x16x32_fp8_fp8 acc[220:223], v[vgprValuB_X0_I0+52+2+0:vgprValuB_X0_I0+52+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[220:223] // left value = acc[220+0:223+0]
/*  mfmaIndex:120  */
v_mfma_f32_16x16x32_fp8_fp8 acc[224:227], v[vgprValuB_X0_I0+56+2+0:vgprValuB_X0_I0+56+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[224:227] // left value = acc[224+0:227+0]
/*  mfmaIndex:121  */
v_mfma_f32_16x16x32_fp8_fp8 acc[228:231], v[vgprValuB_X0_I0+56+2+0:vgprValuB_X0_I0+56+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[228:231] // left value = acc[228+0:231+0]
/*  mfmaIndex:122  */
v_mfma_f32_16x16x32_fp8_fp8 acc[232:235], v[vgprValuB_X0_I0+56+2+0:vgprValuB_X0_I0+56+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[232:235] // left value = acc[232+0:235+0]
/*  mfmaIndex:123  */
v_mfma_f32_16x16x32_fp8_fp8 acc[236:239], v[vgprValuB_X0_I0+56+2+0:vgprValuB_X0_I0+56+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[236:239] // left value = acc[236+0:239+0]
/*  mfmaIndex:124  */
v_mfma_f32_16x16x32_fp8_fp8 acc[240:243], v[vgprValuB_X0_I0+60+2+0:vgprValuB_X0_I0+60+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[240:243] // left value = acc[240+0:243+0]
/*  mfmaIndex:125  */
v_mfma_f32_16x16x32_fp8_fp8 acc[244:247], v[vgprValuB_X0_I0+60+2+0:vgprValuB_X0_I0+60+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[244:247] // left value = acc[244+0:247+0]
/*  mfmaIndex:126  */
s_waitcnt vmcnt(11)                                // wait for global read before writing to local
v_mfma_f32_16x16x32_fp8_fp8 acc[248:251], v[vgprValuB_X0_I0+60+2+0:vgprValuB_X0_I0+60+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[248:251] // left value = acc[248+0:251+0]
/*  mfmaIndex:127  */
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+16:vgprG2LB+16+3] offset:16640 // lwoB_0_0_4_0 = (0*LSCB)*(MT1J+PAD) + (4*LSPB) = 16640
v_mfma_f32_16x16x32_fp8_fp8 acc[252:255], v[vgprValuB_X0_I0+60+2+0:vgprValuB_X0_I0+60+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[252:255] // left value = acc[252+0:255+0]
/* numPrefetchIter=0 */
/* dataAtIterA=-1 numReadsIterA=1 skipReadsIterA=1 readsPerIterA=4 */
/* dataAtIterB=-1 numReadsIterB=1 skipReadsIterB=1 readsPerIterB=16 */

/* iter 2 (reset local read pointers iteration)  (swap local read pointers iteration)  */
/*  grEndMfmaIndex:18, lwStartMfmaIndex:35, lwEndMfmaIndex:223  */
/*  numMfmaForLR:30, syncPlrMfmaIndex:225  */
/*  mfmaIndex:128  */
v_mfma_f32_16x16x32_fp8_fp8 acc[0:3], v[vgprValuB_X2_I0+0+0+0:vgprValuB_X2_I0+0+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[0:3] // left value = acc[0+0:3+0]
/*  mfmaIndex:129  */
v_mfma_f32_16x16x32_fp8_fp8 acc[4:7], v[vgprValuB_X2_I0+0+0+0:vgprValuB_X2_I0+0+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[4:7] // left value = acc[4+0:7+0]
/*  mfmaIndex:130  */
v_mfma_f32_16x16x32_fp8_fp8 acc[8:11], v[vgprValuB_X2_I0+0+0+0:vgprValuB_X2_I0+0+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[8:11] // left value = acc[8+0:11+0]
/*  mfmaIndex:131  */
v_mfma_f32_16x16x32_fp8_fp8 acc[12:15], v[vgprValuB_X2_I0+0+0+0:vgprValuB_X2_I0+0+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[12:15] // left value = acc[12+0:15+0]
/*  mfmaIndex:132  */
v_mfma_f32_16x16x32_fp8_fp8 acc[16:19], v[vgprValuB_X2_I0+4+0+0:vgprValuB_X2_I0+4+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[16:19] // left value = acc[16+0:19+0]
/*  mfmaIndex:133  */
v_mfma_f32_16x16x32_fp8_fp8 acc[20:23], v[vgprValuB_X2_I0+4+0+0:vgprValuB_X2_I0+4+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[20:23] // left value = acc[20+0:23+0]
/*  mfmaIndex:134  */
v_mfma_f32_16x16x32_fp8_fp8 acc[24:27], v[vgprValuB_X2_I0+4+0+0:vgprValuB_X2_I0+4+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[24:27] // left value = acc[24+0:27+0]
/*  mfmaIndex:135  */
v_mfma_f32_16x16x32_fp8_fp8 acc[28:31], v[vgprValuB_X2_I0+4+0+0:vgprValuB_X2_I0+4+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[28:31] // left value = acc[28+0:31+0]
/*  mfmaIndex:136  */
v_mfma_f32_16x16x32_fp8_fp8 acc[32:35], v[vgprValuB_X2_I0+8+0+0:vgprValuB_X2_I0+8+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[32:35] // left value = acc[32+0:35+0]
/*  mfmaIndex:137  */
v_mfma_f32_16x16x32_fp8_fp8 acc[36:39], v[vgprValuB_X2_I0+8+0+0:vgprValuB_X2_I0+8+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[36:39] // left value = acc[36+0:39+0]
/*  mfmaIndex:138  */
v_mfma_f32_16x16x32_fp8_fp8 acc[40:43], v[vgprValuB_X2_I0+8+0+0:vgprValuB_X2_I0+8+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[40:43] // left value = acc[40+0:43+0]
/*  mfmaIndex:139  */
v_mfma_f32_16x16x32_fp8_fp8 acc[44:47], v[vgprValuB_X2_I0+8+0+0:vgprValuB_X2_I0+8+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[44:47] // left value = acc[44+0:47+0]
/*  mfmaIndex:140  */
v_mfma_f32_16x16x32_fp8_fp8 acc[48:51], v[vgprValuB_X2_I0+12+0+0:vgprValuB_X2_I0+12+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[48:51] // left value = acc[48+0:51+0]
/*  mfmaIndex:141  */
v_mfma_f32_16x16x32_fp8_fp8 acc[52:55], v[vgprValuB_X2_I0+12+0+0:vgprValuB_X2_I0+12+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[52:55] // left value = acc[52+0:55+0]
/*  mfmaIndex:142  */
s_waitcnt vmcnt(11)                                // wait for global read before writing to local
v_mfma_f32_16x16x32_fp8_fp8 acc[56:59], v[vgprValuB_X2_I0+12+0+0:vgprValuB_X2_I0+12+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[56:59] // left value = acc[56+0:59+0]
/*  mfmaIndex:143  */
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+20:vgprG2LB+20+3] offset:20800 // lwoB_0_0_5_0 = (0*LSCB)*(MT1J+PAD) + (5*LSPB) = 20800
v_mfma_f32_16x16x32_fp8_fp8 acc[60:63], v[vgprValuB_X2_I0+12+0+0:vgprValuB_X2_I0+12+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[60:63] // left value = acc[60+0:63+0]
/*  mfmaIndex:144  */
v_mfma_f32_16x16x32_fp8_fp8 acc[64:67], v[vgprValuB_X2_I0+16+0+0:vgprValuB_X2_I0+16+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[64:67] // left value = acc[64+0:67+0]
/*  mfmaIndex:145  */
v_mfma_f32_16x16x32_fp8_fp8 acc[68:71], v[vgprValuB_X2_I0+16+0+0:vgprValuB_X2_I0+16+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[68:71] // left value = acc[68+0:71+0]
/*  mfmaIndex:146  */
v_mfma_f32_16x16x32_fp8_fp8 acc[72:75], v[vgprValuB_X2_I0+16+0+0:vgprValuB_X2_I0+16+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[72:75] // left value = acc[72+0:75+0]
/*  mfmaIndex:147  */
/* sched write - iter 2 writesPerItem=1 */
v_mfma_f32_16x16x32_fp8_fp8 acc[76:79], v[vgprValuB_X2_I0+16+0+0:vgprValuB_X2_I0+16+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[76:79] // left value = acc[76+0:79+0]
/*  mfmaIndex:148  */
v_mfma_f32_16x16x32_fp8_fp8 acc[80:83], v[vgprValuB_X2_I0+20+0+0:vgprValuB_X2_I0+20+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[80:83] // left value = acc[80+0:83+0]
/*  mfmaIndex:149  */
v_mfma_f32_16x16x32_fp8_fp8 acc[84:87], v[vgprValuB_X2_I0+20+0+0:vgprValuB_X2_I0+20+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[84:87] // left value = acc[84+0:87+0]
/*  mfmaIndex:150  */
v_mfma_f32_16x16x32_fp8_fp8 acc[88:91], v[vgprValuB_X2_I0+20+0+0:vgprValuB_X2_I0+20+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[88:91] // left value = acc[88+0:91+0]
/*  mfmaIndex:151  */
v_mfma_f32_16x16x32_fp8_fp8 acc[92:95], v[vgprValuB_X2_I0+20+0+0:vgprValuB_X2_I0+20+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[92:95] // left value = acc[92+0:95+0]
/*  mfmaIndex:152  */
v_mfma_f32_16x16x32_fp8_fp8 acc[96:99], v[vgprValuB_X2_I0+24+0+0:vgprValuB_X2_I0+24+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[96:99] // left value = acc[96+0:99+0]
/*  mfmaIndex:153  */
v_mfma_f32_16x16x32_fp8_fp8 acc[100:103], v[vgprValuB_X2_I0+24+0+0:vgprValuB_X2_I0+24+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[100:103] // left value = acc[100+0:103+0]
/*  mfmaIndex:154  */
v_mfma_f32_16x16x32_fp8_fp8 acc[104:107], v[vgprValuB_X2_I0+24+0+0:vgprValuB_X2_I0+24+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[104:107] // left value = acc[104+0:107+0]
/*  mfmaIndex:155  */
v_mfma_f32_16x16x32_fp8_fp8 acc[108:111], v[vgprValuB_X2_I0+24+0+0:vgprValuB_X2_I0+24+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[108:111] // left value = acc[108+0:111+0]
/*  mfmaIndex:156  */
v_mfma_f32_16x16x32_fp8_fp8 acc[112:115], v[vgprValuB_X2_I0+28+0+0:vgprValuB_X2_I0+28+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[112:115] // left value = acc[112+0:115+0]
/*  mfmaIndex:157  */
v_mfma_f32_16x16x32_fp8_fp8 acc[116:119], v[vgprValuB_X2_I0+28+0+0:vgprValuB_X2_I0+28+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[116:119] // left value = acc[116+0:119+0]
/*  mfmaIndex:158  */
s_waitcnt vmcnt(11)                                // wait for global read before writing to local
v_mfma_f32_16x16x32_fp8_fp8 acc[120:123], v[vgprValuB_X2_I0+28+0+0:vgprValuB_X2_I0+28+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[120:123] // left value = acc[120+0:123+0]
/*  mfmaIndex:159  */
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+24:vgprG2LB+24+3] offset:24960 // lwoB_0_0_6_0 = (0*LSCB)*(MT1J+PAD) + (6*LSPB) = 24960
v_mfma_f32_16x16x32_fp8_fp8 acc[124:127], v[vgprValuB_X2_I0+28+0+0:vgprValuB_X2_I0+28+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[124:127] // left value = acc[124+0:127+0]
/*  mfmaIndex:160  */
v_mfma_f32_16x16x32_fp8_fp8 acc[128:131], v[vgprValuB_X2_I0+32+0+0:vgprValuB_X2_I0+32+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[128:131] // left value = acc[128+0:131+0]
/*  mfmaIndex:161  */
v_mfma_f32_16x16x32_fp8_fp8 acc[132:135], v[vgprValuB_X2_I0+32+0+0:vgprValuB_X2_I0+32+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[132:135] // left value = acc[132+0:135+0]
/*  mfmaIndex:162  */
v_mfma_f32_16x16x32_fp8_fp8 acc[136:139], v[vgprValuB_X2_I0+32+0+0:vgprValuB_X2_I0+32+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[136:139] // left value = acc[136+0:139+0]
/*  mfmaIndex:163  */
v_mfma_f32_16x16x32_fp8_fp8 acc[140:143], v[vgprValuB_X2_I0+32+0+0:vgprValuB_X2_I0+32+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[140:143] // left value = acc[140+0:143+0]
/*  mfmaIndex:164  */
v_mfma_f32_16x16x32_fp8_fp8 acc[144:147], v[vgprValuB_X2_I0+36+0+0:vgprValuB_X2_I0+36+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[144:147] // left value = acc[144+0:147+0]
/*  mfmaIndex:165  */
v_mfma_f32_16x16x32_fp8_fp8 acc[148:151], v[vgprValuB_X2_I0+36+0+0:vgprValuB_X2_I0+36+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[148:151] // left value = acc[148+0:151+0]
/*  mfmaIndex:166  */
v_mfma_f32_16x16x32_fp8_fp8 acc[152:155], v[vgprValuB_X2_I0+36+0+0:vgprValuB_X2_I0+36+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[152:155] // left value = acc[152+0:155+0]
/*  mfmaIndex:167  */
v_mfma_f32_16x16x32_fp8_fp8 acc[156:159], v[vgprValuB_X2_I0+36+0+0:vgprValuB_X2_I0+36+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[156:159] // left value = acc[156+0:159+0]
/*  mfmaIndex:168  */
v_mfma_f32_16x16x32_fp8_fp8 acc[160:163], v[vgprValuB_X2_I0+40+0+0:vgprValuB_X2_I0+40+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[160:163] // left value = acc[160+0:163+0]
/*  mfmaIndex:169  */
v_mfma_f32_16x16x32_fp8_fp8 acc[164:167], v[vgprValuB_X2_I0+40+0+0:vgprValuB_X2_I0+40+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[164:167] // left value = acc[164+0:167+0]
/*  mfmaIndex:170  */
v_mfma_f32_16x16x32_fp8_fp8 acc[168:171], v[vgprValuB_X2_I0+40+0+0:vgprValuB_X2_I0+40+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[168:171] // left value = acc[168+0:171+0]
/*  mfmaIndex:171  */
v_mfma_f32_16x16x32_fp8_fp8 acc[172:175], v[vgprValuB_X2_I0+40+0+0:vgprValuB_X2_I0+40+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[172:175] // left value = acc[172+0:175+0]
/*  mfmaIndex:172  */
v_mfma_f32_16x16x32_fp8_fp8 acc[176:179], v[vgprValuB_X2_I0+44+0+0:vgprValuB_X2_I0+44+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[176:179] // left value = acc[176+0:179+0]
/*  mfmaIndex:173  */
v_mfma_f32_16x16x32_fp8_fp8 acc[180:183], v[vgprValuB_X2_I0+44+0+0:vgprValuB_X2_I0+44+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[180:183] // left value = acc[180+0:183+0]
/*  mfmaIndex:174  */
s_waitcnt vmcnt(11)                                // wait for global read before writing to local
v_mfma_f32_16x16x32_fp8_fp8 acc[184:187], v[vgprValuB_X2_I0+44+0+0:vgprValuB_X2_I0+44+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[184:187] // left value = acc[184+0:187+0]
/*  mfmaIndex:175  */
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+28:vgprG2LB+28+3] offset:29120 // lwoB_0_0_7_0 = (0*LSCB)*(MT1J+PAD) + (7*LSPB) = 29120
v_mfma_f32_16x16x32_fp8_fp8 acc[188:191], v[vgprValuB_X2_I0+44+0+0:vgprValuB_X2_I0+44+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[188:191] // left value = acc[188+0:191+0]
/*  mfmaIndex:176  */
v_mfma_f32_16x16x32_fp8_fp8 acc[192:195], v[vgprValuB_X2_I0+48+0+0:vgprValuB_X2_I0+48+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[192:195] // left value = acc[192+0:195+0]
/*  mfmaIndex:177  */
v_mfma_f32_16x16x32_fp8_fp8 acc[196:199], v[vgprValuB_X2_I0+48+0+0:vgprValuB_X2_I0+48+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[196:199] // left value = acc[196+0:199+0]
/*  mfmaIndex:178  */
v_mfma_f32_16x16x32_fp8_fp8 acc[200:203], v[vgprValuB_X2_I0+48+0+0:vgprValuB_X2_I0+48+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[200:203] // left value = acc[200+0:203+0]
/*  mfmaIndex:179  */
v_mfma_f32_16x16x32_fp8_fp8 acc[204:207], v[vgprValuB_X2_I0+48+0+0:vgprValuB_X2_I0+48+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[204:207] // left value = acc[204+0:207+0]
/*  mfmaIndex:180  */
v_mfma_f32_16x16x32_fp8_fp8 acc[208:211], v[vgprValuB_X2_I0+52+0+0:vgprValuB_X2_I0+52+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[208:211] // left value = acc[208+0:211+0]
/*  mfmaIndex:181  */
v_mfma_f32_16x16x32_fp8_fp8 acc[212:215], v[vgprValuB_X2_I0+52+0+0:vgprValuB_X2_I0+52+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[212:215] // left value = acc[212+0:215+0]
/*  mfmaIndex:182  */
v_mfma_f32_16x16x32_fp8_fp8 acc[216:219], v[vgprValuB_X2_I0+52+0+0:vgprValuB_X2_I0+52+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[216:219] // left value = acc[216+0:219+0]
/*  mfmaIndex:183  */
v_mfma_f32_16x16x32_fp8_fp8 acc[220:223], v[vgprValuB_X2_I0+52+0+0:vgprValuB_X2_I0+52+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[220:223] // left value = acc[220+0:223+0]
/*  mfmaIndex:184  */
v_mfma_f32_16x16x32_fp8_fp8 acc[224:227], v[vgprValuB_X2_I0+56+0+0:vgprValuB_X2_I0+56+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[224:227] // left value = acc[224+0:227+0]
/*  mfmaIndex:185  */
v_mfma_f32_16x16x32_fp8_fp8 acc[228:231], v[vgprValuB_X2_I0+56+0+0:vgprValuB_X2_I0+56+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[228:231] // left value = acc[228+0:231+0]
/*  mfmaIndex:186  */
v_mfma_f32_16x16x32_fp8_fp8 acc[232:235], v[vgprValuB_X2_I0+56+0+0:vgprValuB_X2_I0+56+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[232:235] // left value = acc[232+0:235+0]
/*  mfmaIndex:187  */
v_mfma_f32_16x16x32_fp8_fp8 acc[236:239], v[vgprValuB_X2_I0+56+0+0:vgprValuB_X2_I0+56+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[236:239] // left value = acc[236+0:239+0]
/*  mfmaIndex:188  */
v_mfma_f32_16x16x32_fp8_fp8 acc[240:243], v[vgprValuB_X2_I0+60+0+0:vgprValuB_X2_I0+60+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[240:243] // left value = acc[240+0:243+0]
/*  mfmaIndex:189  */
v_mfma_f32_16x16x32_fp8_fp8 acc[244:247], v[vgprValuB_X2_I0+60+0+0:vgprValuB_X2_I0+60+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[244:247] // left value = acc[244+0:247+0]
/*  mfmaIndex:190  */
v_mfma_f32_16x16x32_fp8_fp8 acc[248:251], v[vgprValuB_X2_I0+60+0+0:vgprValuB_X2_I0+60+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[248:251] // left value = acc[248+0:251+0]
/*  mfmaIndex:191  */

/* local read swap offsets a */

/* local read swap offsets b */

/* local read init pointers a */

/* localReadInitPointers */

/* local read init pointers b */

/* localReadInitPointers */
v_mfma_f32_16x16x32_fp8_fp8 acc[252:255], v[vgprValuB_X2_I0+60+0+0:vgprValuB_X2_I0+60+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[252:255] // left value = acc[252+0:255+0]
/* numPrefetchIter=0 */
/* dataAtIterA=0 numReadsIterA=1 skipReadsIterA=0 readsPerIterA=4 */
/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=0 readsPerIterB=16 */

/* iter 3 (swap and reset local write pointers iteration)  */
/*  grEndMfmaIndex:18, lwStartMfmaIndex:35, lwEndMfmaIndex:223  */
/*  numMfmaForLR:30, syncPlrMfmaIndex:225  */
/* Compute-only stretch of iter 3: mfmaIndex 192..221 (30 MFMAs). No LDS, */
/* global-memory, waitcnt or barrier instruction is issued in this stretch. */
/* Operand pattern visible below:                                         */
/*   - every MFMA accumulates in place (srcC and dst are the same acc[] quad) */
/*   - both sources come from the X2 symbols at sub-offset +2, i.e. the   */
/*     register pair +2:+3 of each 4-VGPR group (iter 2 above used +0:+1) */
/*   - the B group offset advances by 4 after every four MFMAs while the  */
/*     A group offset cycles 0,4,8,12                                     */
/*   - accumulator base = 4*(B group offset) + (A group offset)           */
/*  mfmaIndex:192  */
v_mfma_f32_16x16x32_fp8_fp8 acc[0:3], v[vgprValuB_X2_I0+0+2+0:vgprValuB_X2_I0+0+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[0:3] // left value = acc[0+0:3+0]
/*  mfmaIndex:193  */
v_mfma_f32_16x16x32_fp8_fp8 acc[4:7], v[vgprValuB_X2_I0+0+2+0:vgprValuB_X2_I0+0+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[4:7] // left value = acc[4+0:7+0]
/*  mfmaIndex:194  */
v_mfma_f32_16x16x32_fp8_fp8 acc[8:11], v[vgprValuB_X2_I0+0+2+0:vgprValuB_X2_I0+0+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[8:11] // left value = acc[8+0:11+0]
/*  mfmaIndex:195  */
v_mfma_f32_16x16x32_fp8_fp8 acc[12:15], v[vgprValuB_X2_I0+0+2+0:vgprValuB_X2_I0+0+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[12:15] // left value = acc[12+0:15+0]
/*  mfmaIndex:196  */
v_mfma_f32_16x16x32_fp8_fp8 acc[16:19], v[vgprValuB_X2_I0+4+2+0:vgprValuB_X2_I0+4+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[16:19] // left value = acc[16+0:19+0]
/*  mfmaIndex:197  */
v_mfma_f32_16x16x32_fp8_fp8 acc[20:23], v[vgprValuB_X2_I0+4+2+0:vgprValuB_X2_I0+4+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[20:23] // left value = acc[20+0:23+0]
/*  mfmaIndex:198  */
v_mfma_f32_16x16x32_fp8_fp8 acc[24:27], v[vgprValuB_X2_I0+4+2+0:vgprValuB_X2_I0+4+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[24:27] // left value = acc[24+0:27+0]
/*  mfmaIndex:199  */
v_mfma_f32_16x16x32_fp8_fp8 acc[28:31], v[vgprValuB_X2_I0+4+2+0:vgprValuB_X2_I0+4+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[28:31] // left value = acc[28+0:31+0]
/*  mfmaIndex:200  */
v_mfma_f32_16x16x32_fp8_fp8 acc[32:35], v[vgprValuB_X2_I0+8+2+0:vgprValuB_X2_I0+8+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[32:35] // left value = acc[32+0:35+0]
/*  mfmaIndex:201  */
v_mfma_f32_16x16x32_fp8_fp8 acc[36:39], v[vgprValuB_X2_I0+8+2+0:vgprValuB_X2_I0+8+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[36:39] // left value = acc[36+0:39+0]
/*  mfmaIndex:202  */
v_mfma_f32_16x16x32_fp8_fp8 acc[40:43], v[vgprValuB_X2_I0+8+2+0:vgprValuB_X2_I0+8+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[40:43] // left value = acc[40+0:43+0]
/*  mfmaIndex:203  */
v_mfma_f32_16x16x32_fp8_fp8 acc[44:47], v[vgprValuB_X2_I0+8+2+0:vgprValuB_X2_I0+8+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[44:47] // left value = acc[44+0:47+0]
/*  mfmaIndex:204  */
v_mfma_f32_16x16x32_fp8_fp8 acc[48:51], v[vgprValuB_X2_I0+12+2+0:vgprValuB_X2_I0+12+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[48:51] // left value = acc[48+0:51+0]
/*  mfmaIndex:205  */
v_mfma_f32_16x16x32_fp8_fp8 acc[52:55], v[vgprValuB_X2_I0+12+2+0:vgprValuB_X2_I0+12+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[52:55] // left value = acc[52+0:55+0]
/*  mfmaIndex:206  */
v_mfma_f32_16x16x32_fp8_fp8 acc[56:59], v[vgprValuB_X2_I0+12+2+0:vgprValuB_X2_I0+12+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[56:59] // left value = acc[56+0:59+0]
/*  mfmaIndex:207  */
v_mfma_f32_16x16x32_fp8_fp8 acc[60:63], v[vgprValuB_X2_I0+12+2+0:vgprValuB_X2_I0+12+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[60:63] // left value = acc[60+0:63+0]
/*  mfmaIndex:208  */
v_mfma_f32_16x16x32_fp8_fp8 acc[64:67], v[vgprValuB_X2_I0+16+2+0:vgprValuB_X2_I0+16+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[64:67] // left value = acc[64+0:67+0]
/*  mfmaIndex:209  */
v_mfma_f32_16x16x32_fp8_fp8 acc[68:71], v[vgprValuB_X2_I0+16+2+0:vgprValuB_X2_I0+16+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[68:71] // left value = acc[68+0:71+0]
/*  mfmaIndex:210  */
v_mfma_f32_16x16x32_fp8_fp8 acc[72:75], v[vgprValuB_X2_I0+16+2+0:vgprValuB_X2_I0+16+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[72:75] // left value = acc[72+0:75+0]
/*  mfmaIndex:211  */
v_mfma_f32_16x16x32_fp8_fp8 acc[76:79], v[vgprValuB_X2_I0+16+2+0:vgprValuB_X2_I0+16+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[76:79] // left value = acc[76+0:79+0]
/*  mfmaIndex:212  */
v_mfma_f32_16x16x32_fp8_fp8 acc[80:83], v[vgprValuB_X2_I0+20+2+0:vgprValuB_X2_I0+20+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[80:83] // left value = acc[80+0:83+0]
/*  mfmaIndex:213  */
v_mfma_f32_16x16x32_fp8_fp8 acc[84:87], v[vgprValuB_X2_I0+20+2+0:vgprValuB_X2_I0+20+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[84:87] // left value = acc[84+0:87+0]
/*  mfmaIndex:214  */
v_mfma_f32_16x16x32_fp8_fp8 acc[88:91], v[vgprValuB_X2_I0+20+2+0:vgprValuB_X2_I0+20+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[88:91] // left value = acc[88+0:91+0]
/*  mfmaIndex:215  */
v_mfma_f32_16x16x32_fp8_fp8 acc[92:95], v[vgprValuB_X2_I0+20+2+0:vgprValuB_X2_I0+20+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[92:95] // left value = acc[92+0:95+0]
/*  mfmaIndex:216  */
v_mfma_f32_16x16x32_fp8_fp8 acc[96:99], v[vgprValuB_X2_I0+24+2+0:vgprValuB_X2_I0+24+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[96:99] // left value = acc[96+0:99+0]
/*  mfmaIndex:217  */
v_mfma_f32_16x16x32_fp8_fp8 acc[100:103], v[vgprValuB_X2_I0+24+2+0:vgprValuB_X2_I0+24+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[100:103] // left value = acc[100+0:103+0]
/*  mfmaIndex:218  */
v_mfma_f32_16x16x32_fp8_fp8 acc[104:107], v[vgprValuB_X2_I0+24+2+0:vgprValuB_X2_I0+24+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[104:107] // left value = acc[104+0:107+0]
/*  mfmaIndex:219  */
v_mfma_f32_16x16x32_fp8_fp8 acc[108:111], v[vgprValuB_X2_I0+24+2+0:vgprValuB_X2_I0+24+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[108:111] // left value = acc[108+0:111+0]
/*  mfmaIndex:220  */
v_mfma_f32_16x16x32_fp8_fp8 acc[112:115], v[vgprValuB_X2_I0+28+2+0:vgprValuB_X2_I0+28+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[112:115] // left value = acc[112+0:115+0]
/*  mfmaIndex:221  */
v_mfma_f32_16x16x32_fp8_fp8 acc[116:119], v[vgprValuB_X2_I0+28+2+0:vgprValuB_X2_I0+28+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[116:119] // left value = acc[116+0:119+0]
/*  mfmaIndex:222  */
/* Second stretch of iter 3 (mfmaIndex 222..255).                          */
/* 1) Synchronise: drain the LGKM counter, then s_barrier, so the LDS      */
/*    writes issued earlier in the loop are complete for every wave in the */
/*    workgroup before any of the local reads below is issued.             */
/* 2) Interleave the remaining 34 MFMAs (still reading the X2 operand      */
/*    symbols, sub-offset +2) with 16 ds_read_b128 that load               */
/*    vgprValuB_X0_I0+0 .. +63 from vgprLocalReadAddrB at byte offsets     */
/*    0,128,...,1920. The reads target the X0 symbol while the MFMAs       */
/*    consume X2; this presumes the two symbols name disjoint VGPR ranges  */
/*    (their definitions are outside this view - TODO confirm).            */
/* No s_waitcnt follows these reads in this stretch; the NoLoadLoop that   */
/* comes next issues s_waitcnt lgkmcnt(0) before its first MFMA.           */

/* local write swap offsets a */

/* local write swap offsets b */
s_waitcnt lgkmcnt(0)                               // wait for local write (all outstanding LGKM-counted ops) before the barrier
// Skip force waitcnt0
s_barrier
v_mfma_f32_16x16x32_fp8_fp8 acc[120:123], v[vgprValuB_X2_I0+28+2+0:vgprValuB_X2_I0+28+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[120:123] // left value = acc[120+0:123+0]
/*  mfmaIndex:223  */
ds_read_b128 v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+3], v[vgprLocalReadAddrB] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[124:127], v[vgprValuB_X2_I0+28+2+0:vgprValuB_X2_I0+28+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[124:127] // left value = acc[124+0:127+0]
/*  mfmaIndex:224  */
v_mfma_f32_16x16x32_fp8_fp8 acc[128:131], v[vgprValuB_X2_I0+32+2+0:vgprValuB_X2_I0+32+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[128:131] // left value = acc[128+0:131+0]
/*  mfmaIndex:225  */
ds_read_b128 v[vgprValuB_X0_I0+4:vgprValuB_X0_I0+4+3], v[vgprLocalReadAddrB] offset:128 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[132:135], v[vgprValuB_X2_I0+32+2+0:vgprValuB_X2_I0+32+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[132:135] // left value = acc[132+0:135+0]
/*  mfmaIndex:226  */
v_mfma_f32_16x16x32_fp8_fp8 acc[136:139], v[vgprValuB_X2_I0+32+2+0:vgprValuB_X2_I0+32+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[136:139] // left value = acc[136+0:139+0]
/*  mfmaIndex:227  */
ds_read_b128 v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+3], v[vgprLocalReadAddrB] offset:256 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[140:143], v[vgprValuB_X2_I0+32+2+0:vgprValuB_X2_I0+32+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[140:143] // left value = acc[140+0:143+0]
/*  mfmaIndex:228  */
v_mfma_f32_16x16x32_fp8_fp8 acc[144:147], v[vgprValuB_X2_I0+36+2+0:vgprValuB_X2_I0+36+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[144:147] // left value = acc[144+0:147+0]
/*  mfmaIndex:229  */
ds_read_b128 v[vgprValuB_X0_I0+12:vgprValuB_X0_I0+12+3], v[vgprLocalReadAddrB] offset:384 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[148:151], v[vgprValuB_X2_I0+36+2+0:vgprValuB_X2_I0+36+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[148:151] // left value = acc[148+0:151+0]
/*  mfmaIndex:230  */
v_mfma_f32_16x16x32_fp8_fp8 acc[152:155], v[vgprValuB_X2_I0+36+2+0:vgprValuB_X2_I0+36+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[152:155] // left value = acc[152+0:155+0]
/*  mfmaIndex:231  */
ds_read_b128 v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+3], v[vgprLocalReadAddrB] offset:512 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[156:159], v[vgprValuB_X2_I0+36+2+0:vgprValuB_X2_I0+36+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[156:159] // left value = acc[156+0:159+0]
/*  mfmaIndex:232  */
v_mfma_f32_16x16x32_fp8_fp8 acc[160:163], v[vgprValuB_X2_I0+40+2+0:vgprValuB_X2_I0+40+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[160:163] // left value = acc[160+0:163+0]
/*  mfmaIndex:233  */
ds_read_b128 v[vgprValuB_X0_I0+20:vgprValuB_X0_I0+20+3], v[vgprLocalReadAddrB] offset:640 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[164:167], v[vgprValuB_X2_I0+40+2+0:vgprValuB_X2_I0+40+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[164:167] // left value = acc[164+0:167+0]
/*  mfmaIndex:234  */
v_mfma_f32_16x16x32_fp8_fp8 acc[168:171], v[vgprValuB_X2_I0+40+2+0:vgprValuB_X2_I0+40+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[168:171] // left value = acc[168+0:171+0]
/*  mfmaIndex:235  */
ds_read_b128 v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+3], v[vgprLocalReadAddrB] offset:768 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[172:175], v[vgprValuB_X2_I0+40+2+0:vgprValuB_X2_I0+40+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[172:175] // left value = acc[172+0:175+0]
/*  mfmaIndex:236  */
v_mfma_f32_16x16x32_fp8_fp8 acc[176:179], v[vgprValuB_X2_I0+44+2+0:vgprValuB_X2_I0+44+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[176:179] // left value = acc[176+0:179+0]
/*  mfmaIndex:237  */
ds_read_b128 v[vgprValuB_X0_I0+28:vgprValuB_X0_I0+28+3], v[vgprLocalReadAddrB] offset:896 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[180:183], v[vgprValuB_X2_I0+44+2+0:vgprValuB_X2_I0+44+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[180:183] // left value = acc[180+0:183+0]
/*  mfmaIndex:238  */
v_mfma_f32_16x16x32_fp8_fp8 acc[184:187], v[vgprValuB_X2_I0+44+2+0:vgprValuB_X2_I0+44+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[184:187] // left value = acc[184+0:187+0]
/*  mfmaIndex:239  */
ds_read_b128 v[vgprValuB_X0_I0+32:vgprValuB_X0_I0+32+3], v[vgprLocalReadAddrB] offset:1024 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=8 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[188:191], v[vgprValuB_X2_I0+44+2+0:vgprValuB_X2_I0+44+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[188:191] // left value = acc[188+0:191+0]
/*  mfmaIndex:240  */
v_mfma_f32_16x16x32_fp8_fp8 acc[192:195], v[vgprValuB_X2_I0+48+2+0:vgprValuB_X2_I0+48+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[192:195] // left value = acc[192+0:195+0]
/*  mfmaIndex:241  */
v_mfma_f32_16x16x32_fp8_fp8 acc[196:199], v[vgprValuB_X2_I0+48+2+0:vgprValuB_X2_I0+48+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[196:199] // left value = acc[196+0:199+0]
/*  mfmaIndex:242  */
ds_read_b128 v[vgprValuB_X0_I0+36:vgprValuB_X0_I0+36+3], v[vgprLocalReadAddrB] offset:1152 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=9 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[200:203], v[vgprValuB_X2_I0+48+2+0:vgprValuB_X2_I0+48+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[200:203] // left value = acc[200+0:203+0]
/*  mfmaIndex:243  */
v_mfma_f32_16x16x32_fp8_fp8 acc[204:207], v[vgprValuB_X2_I0+48+2+0:vgprValuB_X2_I0+48+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[204:207] // left value = acc[204+0:207+0]
/*  mfmaIndex:244  */
ds_read_b128 v[vgprValuB_X0_I0+40:vgprValuB_X0_I0+40+3], v[vgprLocalReadAddrB] offset:1280 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=10 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[208:211], v[vgprValuB_X2_I0+52+2+0:vgprValuB_X2_I0+52+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[208:211] // left value = acc[208+0:211+0]
/*  mfmaIndex:245  */
v_mfma_f32_16x16x32_fp8_fp8 acc[212:215], v[vgprValuB_X2_I0+52+2+0:vgprValuB_X2_I0+52+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[212:215] // left value = acc[212+0:215+0]
/*  mfmaIndex:246  */
ds_read_b128 v[vgprValuB_X0_I0+44:vgprValuB_X0_I0+44+3], v[vgprLocalReadAddrB] offset:1408 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=11 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[216:219], v[vgprValuB_X2_I0+52+2+0:vgprValuB_X2_I0+52+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[216:219] // left value = acc[216+0:219+0]
/*  mfmaIndex:247  */
v_mfma_f32_16x16x32_fp8_fp8 acc[220:223], v[vgprValuB_X2_I0+52+2+0:vgprValuB_X2_I0+52+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[220:223] // left value = acc[220+0:223+0]
/*  mfmaIndex:248  */
ds_read_b128 v[vgprValuB_X0_I0+48:vgprValuB_X0_I0+48+3], v[vgprLocalReadAddrB] offset:1536 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=12 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[224:227], v[vgprValuB_X2_I0+56+2+0:vgprValuB_X2_I0+56+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[224:227] // left value = acc[224+0:227+0]
/*  mfmaIndex:249  */
v_mfma_f32_16x16x32_fp8_fp8 acc[228:231], v[vgprValuB_X2_I0+56+2+0:vgprValuB_X2_I0+56+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[228:231] // left value = acc[228+0:231+0]
/*  mfmaIndex:250  */
ds_read_b128 v[vgprValuB_X0_I0+52:vgprValuB_X0_I0+52+3], v[vgprLocalReadAddrB] offset:1664 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=13 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[232:235], v[vgprValuB_X2_I0+56+2+0:vgprValuB_X2_I0+56+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[232:235] // left value = acc[232+0:235+0]
/*  mfmaIndex:251  */
v_mfma_f32_16x16x32_fp8_fp8 acc[236:239], v[vgprValuB_X2_I0+56+2+0:vgprValuB_X2_I0+56+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[236:239] // left value = acc[236+0:239+0]
/*  mfmaIndex:252  */
ds_read_b128 v[vgprValuB_X0_I0+56:vgprValuB_X0_I0+56+3], v[vgprLocalReadAddrB] offset:1792 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=14 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[240:243], v[vgprValuB_X2_I0+60+2+0:vgprValuB_X2_I0+60+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[240:243] // left value = acc[240+0:243+0]
/*  mfmaIndex:253  */
v_mfma_f32_16x16x32_fp8_fp8 acc[244:247], v[vgprValuB_X2_I0+60+2+0:vgprValuB_X2_I0+60+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[244:247] // left value = acc[244+0:247+0]
/*  mfmaIndex:254  */
ds_read_b128 v[vgprValuB_X0_I0+60:vgprValuB_X0_I0+60+3], v[vgprLocalReadAddrB] offset:1920 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=15 rIdx=0 oIdx=0 buffer=0 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[248:251], v[vgprValuB_X2_I0+60+2+0:vgprValuB_X2_I0+60+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[248:251] // left value = acc[248+0:251+0]
/*  mfmaIndex:255  */
v_mfma_f32_16x16x32_fp8_fp8 acc[252:255], v[vgprValuB_X2_I0+60+2+0:vgprValuB_X2_I0+60+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[252:255] // left value = acc[252+0:255+0]
/* numPrefetchIter=1 */
/* dataAtIterA=0 numReadsIterA=1 skipReadsIterA=1 readsPerIterA=4 */
/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=1 readsPerIterB=16 */

/* Rebind the generic A-operand symbols to the `_0` register-set symbols.   */
/* These are assembler-time assignments (no instruction is emitted); every  */
/* later textual use of vgprValuA_X0_I0 / vgprValuA_X2_I0 resolves to the   */
/* `_0` values until another .set changes them. The `_0` symbols are        */
/* defined outside this view.                                               */
.set vgprValuA_X0_I0, vgprValuA_X0_I0_0
.set vgprValuA_X2_I0, vgprValuA_X2_I0_0

/* NOTE(review): the branch target is the label on the very next code line, */
/* so at run time this is equivalent to falling through. Presumably emitted */
/* by the generator for symmetry with another loop-exit path - confirm      */
/* before removing, since deleting it shifts all following code addresses.  */
s_branch label_LoopEndL_odd_NoLoadLoop

/* Entry point of the no-load tail that follows this loop body copy. */
label_LoopEndL_odd_NoLoadLoop:

/******************************************/
/* Ord. NoLoadLoop - Begin                */
/******************************************/
s_waitcnt vmcnt(0)

/* iter 0 (last unrolled loop) */
/*  grEndMfmaIndex:0, lwStartMfmaIndex:191, lwEndMfmaIndex:191  */
/*  numMfmaForLR:30, syncPlrMfmaIndex:225  */
/*  mfmaIndex:0  */
s_waitcnt lgkmcnt(0)                               // wait for prior local read local write old=0, new=0 newLW=0 newLR=0
v_mfma_f32_16x16x32_fp8_fp8 acc[0:3], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[0:3] // left value = acc[0+0:3+0]
/*  mfmaIndex:1  */
ds_read_b128 v[vgprValuB_X2_I0+0:vgprValuB_X2_I0+0+3], v[vgprLocalReadAddrB] offset:64 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[4:7], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[4:7] // left value = acc[4+0:7+0]
/*  mfmaIndex:2  */
v_mfma_f32_16x16x32_fp8_fp8 acc[8:11], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[8:11] // left value = acc[8+0:11+0]
/*  mfmaIndex:3  */
v_mfma_f32_16x16x32_fp8_fp8 acc[12:15], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[12:15] // left value = acc[12+0:15+0]
/*  mfmaIndex:4  */
v_mfma_f32_16x16x32_fp8_fp8 acc[16:19], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[16:19] // left value = acc[16+0:19+0]
/*  mfmaIndex:5  */
v_mfma_f32_16x16x32_fp8_fp8 acc[20:23], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[20:23] // left value = acc[20+0:23+0]
/*  mfmaIndex:6  */
v_mfma_f32_16x16x32_fp8_fp8 acc[24:27], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[24:27] // left value = acc[24+0:27+0]
/*  mfmaIndex:7  */
v_mfma_f32_16x16x32_fp8_fp8 acc[28:31], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[28:31] // left value = acc[28+0:31+0]
/*  mfmaIndex:8  */
ds_read_b128 v[vgprValuB_X2_I0+4:vgprValuB_X2_I0+4+3], v[vgprLocalReadAddrB] offset:192 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[32:35], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[32:35] // left value = acc[32+0:35+0]
/*  mfmaIndex:9  */
v_mfma_f32_16x16x32_fp8_fp8 acc[36:39], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[36:39] // left value = acc[36+0:39+0]
/*  mfmaIndex:10  */
v_mfma_f32_16x16x32_fp8_fp8 acc[40:43], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[40:43] // left value = acc[40+0:43+0]
/*  mfmaIndex:11  */
v_mfma_f32_16x16x32_fp8_fp8 acc[44:47], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[44:47] // left value = acc[44+0:47+0]
/*  mfmaIndex:12  */
v_mfma_f32_16x16x32_fp8_fp8 acc[48:51], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[48:51] // left value = acc[48+0:51+0]
/*  mfmaIndex:13  */
v_mfma_f32_16x16x32_fp8_fp8 acc[52:55], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[52:55] // left value = acc[52+0:55+0]
/*  mfmaIndex:14  */
v_mfma_f32_16x16x32_fp8_fp8 acc[56:59], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[56:59] // left value = acc[56+0:59+0]
/*  mfmaIndex:15  */
ds_read_b128 v[vgprValuB_X2_I0+8:vgprValuB_X2_I0+8+3], v[vgprLocalReadAddrB] offset:320 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[60:63], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[60:63] // left value = acc[60+0:63+0]
/*  mfmaIndex:16  */
v_mfma_f32_16x16x32_fp8_fp8 acc[64:67], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[64:67] // left value = acc[64+0:67+0]
/*  mfmaIndex:17  */
v_mfma_f32_16x16x32_fp8_fp8 acc[68:71], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[68:71] // left value = acc[68+0:71+0]
/*  mfmaIndex:18  */
v_mfma_f32_16x16x32_fp8_fp8 acc[72:75], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[72:75] // left value = acc[72+0:75+0]
/*  mfmaIndex:19  */
v_mfma_f32_16x16x32_fp8_fp8 acc[76:79], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[76:79] // left value = acc[76+0:79+0]
/*  mfmaIndex:20  */
v_mfma_f32_16x16x32_fp8_fp8 acc[80:83], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[80:83] // left value = acc[80+0:83+0]
/*  mfmaIndex:21  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[84:87], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[84:87] // left value = acc[84+0:87+0]
/*  mfmaIndex:22  */
/* localReadsVacancy: latencyLeft 2 */
ds_read_b128 v[vgprValuB_X2_I0+12:vgprValuB_X2_I0+12+3], v[vgprLocalReadAddrB] offset:448 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[88:91], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[88:91] // left value = acc[88+0:91+0]
/*  mfmaIndex:23  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[92:95], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[92:95] // left value = acc[92+0:95+0]
/*  mfmaIndex:24  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[96:99], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[96:99] // left value = acc[96+0:99+0]
/*  mfmaIndex:25  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[100:103], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[100:103] // left value = acc[100+0:103+0]
/*  mfmaIndex:26  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[104:107], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[104:107] // left value = acc[104+0:107+0]
/*  mfmaIndex:27  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[108:111], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[108:111] // left value = acc[108+0:111+0]
/*  mfmaIndex:28  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[112:115], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[112:115] // left value = acc[112+0:115+0]
/*  mfmaIndex:29  */
/* localReadsVacancy: latencyLeft 2 */
ds_read_b128 v[vgprValuB_X2_I0+16:vgprValuB_X2_I0+16+3], v[vgprLocalReadAddrB] offset:576 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[116:119], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[116:119] // left value = acc[116+0:119+0]
/*  mfmaIndex:30  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[120:123], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[120:123] // left value = acc[120+0:123+0]
/*  mfmaIndex:31  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[124:127], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[124:127] // left value = acc[124+0:127+0]
/*  mfmaIndex:32  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[128:131], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[128:131] // left value = acc[128+0:131+0]
/*  mfmaIndex:33  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[132:135], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[132:135] // left value = acc[132+0:135+0]
/*  mfmaIndex:34  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[136:139], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[136:139] // left value = acc[136+0:139+0]
/*  mfmaIndex:35  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[140:143], v[vgprValuB_X0_I0+32+0+0:vgprValuB_X0_I0+32+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[140:143] // left value = acc[140+0:143+0]
/*  mfmaIndex:36  */
/* localReadsVacancy: latencyLeft 2 */
ds_read_b128 v[vgprValuB_X2_I0+20:vgprValuB_X2_I0+20+3], v[vgprLocalReadAddrB] offset:704 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[144:147], v[vgprValuB_X0_I0+36+0+0:vgprValuB_X0_I0+36+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[144:147] // left value = acc[144+0:147+0]
/*  mfmaIndex:37  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[148:151], v[vgprValuB_X0_I0+36+0+0:vgprValuB_X0_I0+36+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[148:151] // left value = acc[148+0:151+0]
/*  mfmaIndex:38  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[152:155], v[vgprValuB_X0_I0+36+0+0:vgprValuB_X0_I0+36+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[152:155] // left value = acc[152+0:155+0]
/*  mfmaIndex:39  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[156:159], v[vgprValuB_X0_I0+36+0+0:vgprValuB_X0_I0+36+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[156:159] // left value = acc[156+0:159+0]
/*  mfmaIndex:40  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[160:163], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[160:163] // left value = acc[160+0:163+0]
/*  mfmaIndex:41  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[164:167], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[164:167] // left value = acc[164+0:167+0]
/*  mfmaIndex:42  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[168:171], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[168:171] // left value = acc[168+0:171+0]
/*  mfmaIndex:43  */
/* localReadsVacancy: latencyLeft 2 */
ds_read_b128 v[vgprValuB_X2_I0+24:vgprValuB_X2_I0+24+3], v[vgprLocalReadAddrB] offset:832 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[172:175], v[vgprValuB_X0_I0+40+0+0:vgprValuB_X0_I0+40+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[172:175] // left value = acc[172+0:175+0]
/*  mfmaIndex:44  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[176:179], v[vgprValuB_X0_I0+44+0+0:vgprValuB_X0_I0+44+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[176:179] // left value = acc[176+0:179+0]
/*  mfmaIndex:45  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[180:183], v[vgprValuB_X0_I0+44+0+0:vgprValuB_X0_I0+44+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[180:183] // left value = acc[180+0:183+0]
/*  mfmaIndex:46  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[184:187], v[vgprValuB_X0_I0+44+0+0:vgprValuB_X0_I0+44+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[184:187] // left value = acc[184+0:187+0]
/*  mfmaIndex:47  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[188:191], v[vgprValuB_X0_I0+44+0+0:vgprValuB_X0_I0+44+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[188:191] // left value = acc[188+0:191+0]
/*  mfmaIndex:48  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[192:195], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[192:195] // left value = acc[192+0:195+0]
/*  mfmaIndex:49  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[196:199], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[196:199] // left value = acc[196+0:199+0]
/*  mfmaIndex:50  */
/* localReadsVacancy: latencyLeft 2 */
ds_read_b128 v[vgprValuB_X2_I0+28:vgprValuB_X2_I0+28+3], v[vgprLocalReadAddrB] offset:960 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[200:203], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[200:203] // left value = acc[200+0:203+0]
/*  mfmaIndex:51  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[204:207], v[vgprValuB_X0_I0+48+0+0:vgprValuB_X0_I0+48+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[204:207] // left value = acc[204+0:207+0]
/*  mfmaIndex:52  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[208:211], v[vgprValuB_X0_I0+52+0+0:vgprValuB_X0_I0+52+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[208:211] // left value = acc[208+0:211+0]
/*  mfmaIndex:53  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[212:215], v[vgprValuB_X0_I0+52+0+0:vgprValuB_X0_I0+52+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[212:215] // left value = acc[212+0:215+0]
/*  mfmaIndex:54  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[216:219], v[vgprValuB_X0_I0+52+0+0:vgprValuB_X0_I0+52+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[216:219] // left value = acc[216+0:219+0]
/*  mfmaIndex:55  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[220:223], v[vgprValuB_X0_I0+52+0+0:vgprValuB_X0_I0+52+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[220:223] // left value = acc[220+0:223+0]
/*  mfmaIndex:56  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[224:227], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[224:227] // left value = acc[224+0:227+0]
/*  mfmaIndex:57  */
/* localReadsVacancy: latencyLeft 2 */
ds_read_b128 v[vgprValuB_X2_I0+32:vgprValuB_X2_I0+32+3], v[vgprLocalReadAddrB] offset:1088 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=8 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[228:231], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[228:231] // left value = acc[228+0:231+0]
/*  mfmaIndex:58  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[232:235], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[232:235] // left value = acc[232+0:235+0]
/*  mfmaIndex:59  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[236:239], v[vgprValuB_X0_I0+56+0+0:vgprValuB_X0_I0+56+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[236:239] // left value = acc[236+0:239+0]
/*  mfmaIndex:60  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[240:243], v[vgprValuB_X0_I0+60+0+0:vgprValuB_X0_I0+60+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[240:243] // left value = acc[240+0:243+0]
/*  mfmaIndex:61  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[244:247], v[vgprValuB_X0_I0+60+0+0:vgprValuB_X0_I0+60+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[244:247] // left value = acc[244+0:247+0]
/*  mfmaIndex:62  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[248:251], v[vgprValuB_X0_I0+60+0+0:vgprValuB_X0_I0+60+0+0+1], v[vgprValuA_X0_I0+8+0+0:vgprValuA_X0_I0+8+0+0+1], acc[248:251] // left value = acc[248+0:251+0]
/*  mfmaIndex:63  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[252:255], v[vgprValuB_X0_I0+60+0+0:vgprValuB_X0_I0+60+0+0+1], v[vgprValuA_X0_I0+12+0+0:vgprValuA_X0_I0+12+0+0+1], acc[252:255] // left value = acc[252+0:255+0]
/* numPrefetchIter=0 */
/* dataAtIterA=-1 numReadsIterA=1 skipReadsIterA=1 readsPerIterA=4 */
/* dataAtIterB=-1 numReadsIterB=1 skipReadsIterB=1 readsPerIterB=16 */

/* iter 1 (last unrolled loop) */
/*  grEndMfmaIndex:0, lwStartMfmaIndex:191, lwEndMfmaIndex:191  */
/*  numMfmaForLR:30, syncPlrMfmaIndex:225  */
/*
 * Summary of this iteration (mfmaIndex 64..127):
 *  - 64 v_mfma_f32_16x16x32_fp8_fp8 instructions, each accumulating in place
 *    (dst == srcC) into one 4-register accumulator group; together they
 *    update acc[0:255], every group exactly once.
 *  - Operand selection, with j = 0..15 (outer) and i = 0..3 (inner):
 *        dst/srcC        = acc[16*j + 4*i : 16*j + 4*i + 3]
 *        1st src (B pair) = v[vgprValuB_X0_I0 + 4*j + 2 : +1]
 *        2nd src (A pair) = v[vgprValuA_X0_I0 + 4*i + 2 : +1]
 *    The "+2" selects the upper register pair of each stride-4 group; the
 *    MFMAs emitted immediately before this iteration use the same groups
 *    at "+0".
 *  - Seven ds_read_b128 are interleaved, one every 7 MFMAs (before mfmaIndex
 *    64, 71, 78, 85, 92, 99, 106). They fill vgprValuB_X2_I0+36 .. +63 from
 *    vgprLocalReadAddrB at offsets 1216..1984 (step 128). No MFMA in this
 *    iteration names a vgprValuB_X2_I0 register, and no s_waitcnt is issued
 *    here; the next s_waitcnt lgkmcnt(0) sits at the top of iter 2.
 *  - NOTE(review): the physical register numbers behind the X0/X2 symbols
 *    are defined elsewhere in the file; confirm they do not overlap before
 *    moving any ds_read relative to the MFMAs.
 */
/*  mfmaIndex:64  */
/* localReadsVacancy: latencyLeft 2 */
/* LDS read 1/7 of this iteration: 16 bytes per lane -> vgprValuB_X2_I0+36..+39 (not consumed in this iteration) */
ds_read_b128 v[vgprValuB_X2_I0+36:vgprValuB_X2_I0+36+3], v[vgprLocalReadAddrB] offset:1216 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=9 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[0:3], v[vgprValuB_X0_I0+0+2+0:vgprValuB_X0_I0+0+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[0:3] // left value = acc[0+0:3+0]
/*  mfmaIndex:65  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[4:7], v[vgprValuB_X0_I0+0+2+0:vgprValuB_X0_I0+0+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[4:7] // left value = acc[4+0:7+0]
/*  mfmaIndex:66  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[8:11], v[vgprValuB_X0_I0+0+2+0:vgprValuB_X0_I0+0+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[8:11] // left value = acc[8+0:11+0]
/*  mfmaIndex:67  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[12:15], v[vgprValuB_X0_I0+0+2+0:vgprValuB_X0_I0+0+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[12:15] // left value = acc[12+0:15+0]
/*  mfmaIndex:68  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[16:19], v[vgprValuB_X0_I0+4+2+0:vgprValuB_X0_I0+4+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[16:19] // left value = acc[16+0:19+0]
/*  mfmaIndex:69  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[20:23], v[vgprValuB_X0_I0+4+2+0:vgprValuB_X0_I0+4+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[20:23] // left value = acc[20+0:23+0]
/*  mfmaIndex:70  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[24:27], v[vgprValuB_X0_I0+4+2+0:vgprValuB_X0_I0+4+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[24:27] // left value = acc[24+0:27+0]
/*  mfmaIndex:71  */
/* localReadsVacancy: latencyLeft 2 */
/* LDS read 2/7 of this iteration: 16 bytes per lane -> vgprValuB_X2_I0+40..+43 (not consumed in this iteration) */
ds_read_b128 v[vgprValuB_X2_I0+40:vgprValuB_X2_I0+40+3], v[vgprLocalReadAddrB] offset:1344 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=10 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[28:31], v[vgprValuB_X0_I0+4+2+0:vgprValuB_X0_I0+4+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[28:31] // left value = acc[28+0:31+0]
/*  mfmaIndex:72  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[32:35], v[vgprValuB_X0_I0+8+2+0:vgprValuB_X0_I0+8+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[32:35] // left value = acc[32+0:35+0]
/*  mfmaIndex:73  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[36:39], v[vgprValuB_X0_I0+8+2+0:vgprValuB_X0_I0+8+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[36:39] // left value = acc[36+0:39+0]
/*  mfmaIndex:74  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[40:43], v[vgprValuB_X0_I0+8+2+0:vgprValuB_X0_I0+8+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[40:43] // left value = acc[40+0:43+0]
/*  mfmaIndex:75  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[44:47], v[vgprValuB_X0_I0+8+2+0:vgprValuB_X0_I0+8+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[44:47] // left value = acc[44+0:47+0]
/*  mfmaIndex:76  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[48:51], v[vgprValuB_X0_I0+12+2+0:vgprValuB_X0_I0+12+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[48:51] // left value = acc[48+0:51+0]
/*  mfmaIndex:77  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[52:55], v[vgprValuB_X0_I0+12+2+0:vgprValuB_X0_I0+12+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[52:55] // left value = acc[52+0:55+0]
/*  mfmaIndex:78  */
/* localReadsVacancy: latencyLeft 2 */
/* LDS read 3/7 of this iteration: 16 bytes per lane -> vgprValuB_X2_I0+44..+47 (not consumed in this iteration) */
ds_read_b128 v[vgprValuB_X2_I0+44:vgprValuB_X2_I0+44+3], v[vgprLocalReadAddrB] offset:1472 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=11 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[56:59], v[vgprValuB_X0_I0+12+2+0:vgprValuB_X0_I0+12+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[56:59] // left value = acc[56+0:59+0]
/*  mfmaIndex:79  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[60:63], v[vgprValuB_X0_I0+12+2+0:vgprValuB_X0_I0+12+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[60:63] // left value = acc[60+0:63+0]
/*  mfmaIndex:80  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[64:67], v[vgprValuB_X0_I0+16+2+0:vgprValuB_X0_I0+16+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[64:67] // left value = acc[64+0:67+0]
/*  mfmaIndex:81  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[68:71], v[vgprValuB_X0_I0+16+2+0:vgprValuB_X0_I0+16+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[68:71] // left value = acc[68+0:71+0]
/*  mfmaIndex:82  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[72:75], v[vgprValuB_X0_I0+16+2+0:vgprValuB_X0_I0+16+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[72:75] // left value = acc[72+0:75+0]
/*  mfmaIndex:83  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[76:79], v[vgprValuB_X0_I0+16+2+0:vgprValuB_X0_I0+16+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[76:79] // left value = acc[76+0:79+0]
/*  mfmaIndex:84  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[80:83], v[vgprValuB_X0_I0+20+2+0:vgprValuB_X0_I0+20+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[80:83] // left value = acc[80+0:83+0]
/*  mfmaIndex:85  */
/* localReadsVacancy: latencyLeft 2 */
/* LDS read 4/7 of this iteration: 16 bytes per lane -> vgprValuB_X2_I0+48..+51 (not consumed in this iteration) */
ds_read_b128 v[vgprValuB_X2_I0+48:vgprValuB_X2_I0+48+3], v[vgprLocalReadAddrB] offset:1600 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=12 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[84:87], v[vgprValuB_X0_I0+20+2+0:vgprValuB_X0_I0+20+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[84:87] // left value = acc[84+0:87+0]
/*  mfmaIndex:86  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[88:91], v[vgprValuB_X0_I0+20+2+0:vgprValuB_X0_I0+20+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[88:91] // left value = acc[88+0:91+0]
/*  mfmaIndex:87  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[92:95], v[vgprValuB_X0_I0+20+2+0:vgprValuB_X0_I0+20+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[92:95] // left value = acc[92+0:95+0]
/*  mfmaIndex:88  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[96:99], v[vgprValuB_X0_I0+24+2+0:vgprValuB_X0_I0+24+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[96:99] // left value = acc[96+0:99+0]
/*  mfmaIndex:89  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[100:103], v[vgprValuB_X0_I0+24+2+0:vgprValuB_X0_I0+24+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[100:103] // left value = acc[100+0:103+0]
/*  mfmaIndex:90  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[104:107], v[vgprValuB_X0_I0+24+2+0:vgprValuB_X0_I0+24+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[104:107] // left value = acc[104+0:107+0]
/*  mfmaIndex:91  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[108:111], v[vgprValuB_X0_I0+24+2+0:vgprValuB_X0_I0+24+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[108:111] // left value = acc[108+0:111+0]
/*  mfmaIndex:92  */
/* localReadsVacancy: latencyLeft 2 */
/* LDS read 5/7 of this iteration: 16 bytes per lane -> vgprValuB_X2_I0+52..+55 (not consumed in this iteration) */
ds_read_b128 v[vgprValuB_X2_I0+52:vgprValuB_X2_I0+52+3], v[vgprLocalReadAddrB] offset:1728 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=13 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[112:115], v[vgprValuB_X0_I0+28+2+0:vgprValuB_X0_I0+28+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[112:115] // left value = acc[112+0:115+0]
/*  mfmaIndex:93  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[116:119], v[vgprValuB_X0_I0+28+2+0:vgprValuB_X0_I0+28+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[116:119] // left value = acc[116+0:119+0]
/*  mfmaIndex:94  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[120:123], v[vgprValuB_X0_I0+28+2+0:vgprValuB_X0_I0+28+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[120:123] // left value = acc[120+0:123+0]
/*  mfmaIndex:95  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[124:127], v[vgprValuB_X0_I0+28+2+0:vgprValuB_X0_I0+28+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[124:127] // left value = acc[124+0:127+0]
/*  mfmaIndex:96  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[128:131], v[vgprValuB_X0_I0+32+2+0:vgprValuB_X0_I0+32+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[128:131] // left value = acc[128+0:131+0]
/*  mfmaIndex:97  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[132:135], v[vgprValuB_X0_I0+32+2+0:vgprValuB_X0_I0+32+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[132:135] // left value = acc[132+0:135+0]
/*  mfmaIndex:98  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[136:139], v[vgprValuB_X0_I0+32+2+0:vgprValuB_X0_I0+32+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[136:139] // left value = acc[136+0:139+0]
/*  mfmaIndex:99  */
/* localReadsVacancy: latencyLeft 2 */
/* LDS read 6/7 of this iteration: 16 bytes per lane -> vgprValuB_X2_I0+56..+59 (not consumed in this iteration) */
ds_read_b128 v[vgprValuB_X2_I0+56:vgprValuB_X2_I0+56+3], v[vgprLocalReadAddrB] offset:1856 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=14 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[140:143], v[vgprValuB_X0_I0+32+2+0:vgprValuB_X0_I0+32+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[140:143] // left value = acc[140+0:143+0]
/*  mfmaIndex:100  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[144:147], v[vgprValuB_X0_I0+36+2+0:vgprValuB_X0_I0+36+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[144:147] // left value = acc[144+0:147+0]
/*  mfmaIndex:101  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[148:151], v[vgprValuB_X0_I0+36+2+0:vgprValuB_X0_I0+36+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[148:151] // left value = acc[148+0:151+0]
/*  mfmaIndex:102  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[152:155], v[vgprValuB_X0_I0+36+2+0:vgprValuB_X0_I0+36+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[152:155] // left value = acc[152+0:155+0]
/*  mfmaIndex:103  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[156:159], v[vgprValuB_X0_I0+36+2+0:vgprValuB_X0_I0+36+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[156:159] // left value = acc[156+0:159+0]
/*  mfmaIndex:104  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[160:163], v[vgprValuB_X0_I0+40+2+0:vgprValuB_X0_I0+40+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[160:163] // left value = acc[160+0:163+0]
/*  mfmaIndex:105  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[164:167], v[vgprValuB_X0_I0+40+2+0:vgprValuB_X0_I0+40+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[164:167] // left value = acc[164+0:167+0]
/*  mfmaIndex:106  */
/* localReadsVacancy: latencyLeft 2 */
/* LDS read 7/7 of this iteration: 16 bytes per lane -> vgprValuB_X2_I0+60..+63 (not consumed in this iteration) */
ds_read_b128 v[vgprValuB_X2_I0+60:vgprValuB_X2_I0+60+3], v[vgprLocalReadAddrB] offset:1984 // L -> Reg lro=64 swapByteOffset=0 ti=256 vIdx=0 eIdx=15 rIdx=0 oIdx=0 buffer=2 iui=0
v_mfma_f32_16x16x32_fp8_fp8 acc[168:171], v[vgprValuB_X0_I0+40+2+0:vgprValuB_X0_I0+40+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[168:171] // left value = acc[168+0:171+0]
/*  mfmaIndex:107  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[172:175], v[vgprValuB_X0_I0+40+2+0:vgprValuB_X0_I0+40+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[172:175] // left value = acc[172+0:175+0]
/*  mfmaIndex:108  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[176:179], v[vgprValuB_X0_I0+44+2+0:vgprValuB_X0_I0+44+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[176:179] // left value = acc[176+0:179+0]
/*  mfmaIndex:109  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[180:183], v[vgprValuB_X0_I0+44+2+0:vgprValuB_X0_I0+44+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[180:183] // left value = acc[180+0:183+0]
/*  mfmaIndex:110  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[184:187], v[vgprValuB_X0_I0+44+2+0:vgprValuB_X0_I0+44+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[184:187] // left value = acc[184+0:187+0]
/*  mfmaIndex:111  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[188:191], v[vgprValuB_X0_I0+44+2+0:vgprValuB_X0_I0+44+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[188:191] // left value = acc[188+0:191+0]
/*  mfmaIndex:112  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[192:195], v[vgprValuB_X0_I0+48+2+0:vgprValuB_X0_I0+48+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[192:195] // left value = acc[192+0:195+0]
/*  mfmaIndex:113  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[196:199], v[vgprValuB_X0_I0+48+2+0:vgprValuB_X0_I0+48+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[196:199] // left value = acc[196+0:199+0]
/*  mfmaIndex:114  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[200:203], v[vgprValuB_X0_I0+48+2+0:vgprValuB_X0_I0+48+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[200:203] // left value = acc[200+0:203+0]
/*  mfmaIndex:115  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[204:207], v[vgprValuB_X0_I0+48+2+0:vgprValuB_X0_I0+48+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[204:207] // left value = acc[204+0:207+0]
/*  mfmaIndex:116  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[208:211], v[vgprValuB_X0_I0+52+2+0:vgprValuB_X0_I0+52+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[208:211] // left value = acc[208+0:211+0]
/*  mfmaIndex:117  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[212:215], v[vgprValuB_X0_I0+52+2+0:vgprValuB_X0_I0+52+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[212:215] // left value = acc[212+0:215+0]
/*  mfmaIndex:118  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[216:219], v[vgprValuB_X0_I0+52+2+0:vgprValuB_X0_I0+52+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[216:219] // left value = acc[216+0:219+0]
/*  mfmaIndex:119  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[220:223], v[vgprValuB_X0_I0+52+2+0:vgprValuB_X0_I0+52+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[220:223] // left value = acc[220+0:223+0]
/*  mfmaIndex:120  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[224:227], v[vgprValuB_X0_I0+56+2+0:vgprValuB_X0_I0+56+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[224:227] // left value = acc[224+0:227+0]
/*  mfmaIndex:121  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[228:231], v[vgprValuB_X0_I0+56+2+0:vgprValuB_X0_I0+56+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[228:231] // left value = acc[228+0:231+0]
/*  mfmaIndex:122  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[232:235], v[vgprValuB_X0_I0+56+2+0:vgprValuB_X0_I0+56+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[232:235] // left value = acc[232+0:235+0]
/*  mfmaIndex:123  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[236:239], v[vgprValuB_X0_I0+56+2+0:vgprValuB_X0_I0+56+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[236:239] // left value = acc[236+0:239+0]
/*  mfmaIndex:124  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[240:243], v[vgprValuB_X0_I0+60+2+0:vgprValuB_X0_I0+60+2+0+1], v[vgprValuA_X0_I0+0+2+0:vgprValuA_X0_I0+0+2+0+1], acc[240:243] // left value = acc[240+0:243+0]
/*  mfmaIndex:125  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[244:247], v[vgprValuB_X0_I0+60+2+0:vgprValuB_X0_I0+60+2+0+1], v[vgprValuA_X0_I0+4+2+0:vgprValuA_X0_I0+4+2+0+1], acc[244:247] // left value = acc[244+0:247+0]
/*  mfmaIndex:126  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[248:251], v[vgprValuB_X0_I0+60+2+0:vgprValuB_X0_I0+60+2+0+1], v[vgprValuA_X0_I0+8+2+0:vgprValuA_X0_I0+8+2+0+1], acc[248:251] // left value = acc[248+0:251+0]
/*  mfmaIndex:127  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[252:255], v[vgprValuB_X0_I0+60+2+0:vgprValuB_X0_I0+60+2+0+1], v[vgprValuA_X0_I0+12+2+0:vgprValuA_X0_I0+12+2+0+1], acc[252:255] // left value = acc[252+0:255+0]
/* numPrefetchIter=0 */
/* dataAtIterA=-1 numReadsIterA=1 skipReadsIterA=1 readsPerIterA=4 */
/* dataAtIterB=-1 numReadsIterB=1 skipReadsIterB=1 readsPerIterB=16 */

/* iter 2 (last unrolled loop) */
/*  grEndMfmaIndex:0, lwStartMfmaIndex:191, lwEndMfmaIndex:191  */
/*  numMfmaForLR:30, syncPlrMfmaIndex:225  */
/*  mfmaIndex:128  */
/* localReadsVacancy: latencyLeft 2 */
s_waitcnt lgkmcnt(0)                               // wait for prior local read local write old=0, new=0 newLW=0 newLR=0
v_mfma_f32_16x16x32_fp8_fp8 acc[0:3], v[vgprValuB_X2_I0+0+0+0:vgprValuB_X2_I0+0+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[0:3] // left value = acc[0+0:3+0]
/*  mfmaIndex:129  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[4:7], v[vgprValuB_X2_I0+0+0+0:vgprValuB_X2_I0+0+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[4:7] // left value = acc[4+0:7+0]
/*  mfmaIndex:130  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[8:11], v[vgprValuB_X2_I0+0+0+0:vgprValuB_X2_I0+0+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[8:11] // left value = acc[8+0:11+0]
/*  mfmaIndex:131  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[12:15], v[vgprValuB_X2_I0+0+0+0:vgprValuB_X2_I0+0+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[12:15] // left value = acc[12+0:15+0]
/*  mfmaIndex:132  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[16:19], v[vgprValuB_X2_I0+4+0+0:vgprValuB_X2_I0+4+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[16:19] // left value = acc[16+0:19+0]
/*  mfmaIndex:133  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[20:23], v[vgprValuB_X2_I0+4+0+0:vgprValuB_X2_I0+4+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[20:23] // left value = acc[20+0:23+0]
/*  mfmaIndex:134  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[24:27], v[vgprValuB_X2_I0+4+0+0:vgprValuB_X2_I0+4+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[24:27] // left value = acc[24+0:27+0]
/*  mfmaIndex:135  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[28:31], v[vgprValuB_X2_I0+4+0+0:vgprValuB_X2_I0+4+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[28:31] // left value = acc[28+0:31+0]
/*  mfmaIndex:136  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[32:35], v[vgprValuB_X2_I0+8+0+0:vgprValuB_X2_I0+8+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[32:35] // left value = acc[32+0:35+0]
/*  mfmaIndex:137  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[36:39], v[vgprValuB_X2_I0+8+0+0:vgprValuB_X2_I0+8+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[36:39] // left value = acc[36+0:39+0]
/*  mfmaIndex:138  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[40:43], v[vgprValuB_X2_I0+8+0+0:vgprValuB_X2_I0+8+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[40:43] // left value = acc[40+0:43+0]
/*  mfmaIndex:139  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[44:47], v[vgprValuB_X2_I0+8+0+0:vgprValuB_X2_I0+8+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[44:47] // left value = acc[44+0:47+0]
/*  mfmaIndex:140  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[48:51], v[vgprValuB_X2_I0+12+0+0:vgprValuB_X2_I0+12+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[48:51] // left value = acc[48+0:51+0]
/*  mfmaIndex:141  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[52:55], v[vgprValuB_X2_I0+12+0+0:vgprValuB_X2_I0+12+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[52:55] // left value = acc[52+0:55+0]
/*  mfmaIndex:142  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[56:59], v[vgprValuB_X2_I0+12+0+0:vgprValuB_X2_I0+12+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[56:59] // left value = acc[56+0:59+0]
/*  mfmaIndex:143  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[60:63], v[vgprValuB_X2_I0+12+0+0:vgprValuB_X2_I0+12+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[60:63] // left value = acc[60+0:63+0]
/*  mfmaIndex:144  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[64:67], v[vgprValuB_X2_I0+16+0+0:vgprValuB_X2_I0+16+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[64:67] // left value = acc[64+0:67+0]
/*  mfmaIndex:145  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[68:71], v[vgprValuB_X2_I0+16+0+0:vgprValuB_X2_I0+16+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[68:71] // left value = acc[68+0:71+0]
/*  mfmaIndex:146  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[72:75], v[vgprValuB_X2_I0+16+0+0:vgprValuB_X2_I0+16+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[72:75] // left value = acc[72+0:75+0]
/*  mfmaIndex:147  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[76:79], v[vgprValuB_X2_I0+16+0+0:vgprValuB_X2_I0+16+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[76:79] // left value = acc[76+0:79+0]
/*  mfmaIndex:148  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[80:83], v[vgprValuB_X2_I0+20+0+0:vgprValuB_X2_I0+20+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[80:83] // left value = acc[80+0:83+0]
/*  mfmaIndex:149  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[84:87], v[vgprValuB_X2_I0+20+0+0:vgprValuB_X2_I0+20+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[84:87] // left value = acc[84+0:87+0]
/*  mfmaIndex:150  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[88:91], v[vgprValuB_X2_I0+20+0+0:vgprValuB_X2_I0+20+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[88:91] // left value = acc[88+0:91+0]
/*  mfmaIndex:151  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[92:95], v[vgprValuB_X2_I0+20+0+0:vgprValuB_X2_I0+20+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[92:95] // left value = acc[92+0:95+0]
/*  mfmaIndex:152  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[96:99], v[vgprValuB_X2_I0+24+0+0:vgprValuB_X2_I0+24+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[96:99] // left value = acc[96+0:99+0]
/*  mfmaIndex:153  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[100:103], v[vgprValuB_X2_I0+24+0+0:vgprValuB_X2_I0+24+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[100:103] // left value = acc[100+0:103+0]
/*  mfmaIndex:154  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[104:107], v[vgprValuB_X2_I0+24+0+0:vgprValuB_X2_I0+24+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[104:107] // left value = acc[104+0:107+0]
/*  mfmaIndex:155  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[108:111], v[vgprValuB_X2_I0+24+0+0:vgprValuB_X2_I0+24+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[108:111] // left value = acc[108+0:111+0]
/*  mfmaIndex:156  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[112:115], v[vgprValuB_X2_I0+28+0+0:vgprValuB_X2_I0+28+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[112:115] // left value = acc[112+0:115+0]
/*  mfmaIndex:157  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[116:119], v[vgprValuB_X2_I0+28+0+0:vgprValuB_X2_I0+28+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[116:119] // left value = acc[116+0:119+0]
/*  mfmaIndex:158  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[120:123], v[vgprValuB_X2_I0+28+0+0:vgprValuB_X2_I0+28+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[120:123] // left value = acc[120+0:123+0]
/*  mfmaIndex:159  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[124:127], v[vgprValuB_X2_I0+28+0+0:vgprValuB_X2_I0+28+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[124:127] // left value = acc[124+0:127+0]
/*  mfmaIndex:160  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[128:131], v[vgprValuB_X2_I0+32+0+0:vgprValuB_X2_I0+32+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[128:131] // left value = acc[128+0:131+0]
/*  mfmaIndex:161  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[132:135], v[vgprValuB_X2_I0+32+0+0:vgprValuB_X2_I0+32+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[132:135] // left value = acc[132+0:135+0]
/*  mfmaIndex:162  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[136:139], v[vgprValuB_X2_I0+32+0+0:vgprValuB_X2_I0+32+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[136:139] // left value = acc[136+0:139+0]
/*  mfmaIndex:163  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[140:143], v[vgprValuB_X2_I0+32+0+0:vgprValuB_X2_I0+32+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[140:143] // left value = acc[140+0:143+0]
/*  mfmaIndex:164  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[144:147], v[vgprValuB_X2_I0+36+0+0:vgprValuB_X2_I0+36+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[144:147] // left value = acc[144+0:147+0]
/*  mfmaIndex:165  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[148:151], v[vgprValuB_X2_I0+36+0+0:vgprValuB_X2_I0+36+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[148:151] // left value = acc[148+0:151+0]
/*  mfmaIndex:166  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[152:155], v[vgprValuB_X2_I0+36+0+0:vgprValuB_X2_I0+36+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[152:155] // left value = acc[152+0:155+0]
/*  mfmaIndex:167  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[156:159], v[vgprValuB_X2_I0+36+0+0:vgprValuB_X2_I0+36+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[156:159] // left value = acc[156+0:159+0]
/*  mfmaIndex:168  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[160:163], v[vgprValuB_X2_I0+40+0+0:vgprValuB_X2_I0+40+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[160:163] // left value = acc[160+0:163+0]
/*  mfmaIndex:169  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[164:167], v[vgprValuB_X2_I0+40+0+0:vgprValuB_X2_I0+40+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[164:167] // left value = acc[164+0:167+0]
/*  mfmaIndex:170  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[168:171], v[vgprValuB_X2_I0+40+0+0:vgprValuB_X2_I0+40+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[168:171] // left value = acc[168+0:171+0]
/*  mfmaIndex:171  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[172:175], v[vgprValuB_X2_I0+40+0+0:vgprValuB_X2_I0+40+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[172:175] // left value = acc[172+0:175+0]
/*  mfmaIndex:172  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[176:179], v[vgprValuB_X2_I0+44+0+0:vgprValuB_X2_I0+44+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[176:179] // left value = acc[176+0:179+0]
/*  mfmaIndex:173  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[180:183], v[vgprValuB_X2_I0+44+0+0:vgprValuB_X2_I0+44+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[180:183] // left value = acc[180+0:183+0]
/*  mfmaIndex:174  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[184:187], v[vgprValuB_X2_I0+44+0+0:vgprValuB_X2_I0+44+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[184:187] // left value = acc[184+0:187+0]
/*  mfmaIndex:175  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[188:191], v[vgprValuB_X2_I0+44+0+0:vgprValuB_X2_I0+44+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[188:191] // left value = acc[188+0:191+0]
/*  mfmaIndex:176  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[192:195], v[vgprValuB_X2_I0+48+0+0:vgprValuB_X2_I0+48+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[192:195] // left value = acc[192+0:195+0]
/*  mfmaIndex:177  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[196:199], v[vgprValuB_X2_I0+48+0+0:vgprValuB_X2_I0+48+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[196:199] // left value = acc[196+0:199+0]
/*  mfmaIndex:178  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[200:203], v[vgprValuB_X2_I0+48+0+0:vgprValuB_X2_I0+48+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[200:203] // left value = acc[200+0:203+0]
/*  mfmaIndex:179  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[204:207], v[vgprValuB_X2_I0+48+0+0:vgprValuB_X2_I0+48+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[204:207] // left value = acc[204+0:207+0]
/*  mfmaIndex:180  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[208:211], v[vgprValuB_X2_I0+52+0+0:vgprValuB_X2_I0+52+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[208:211] // left value = acc[208+0:211+0]
/*  mfmaIndex:181  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[212:215], v[vgprValuB_X2_I0+52+0+0:vgprValuB_X2_I0+52+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[212:215] // left value = acc[212+0:215+0]
/*  mfmaIndex:182  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[216:219], v[vgprValuB_X2_I0+52+0+0:vgprValuB_X2_I0+52+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[216:219] // left value = acc[216+0:219+0]
/*  mfmaIndex:183  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[220:223], v[vgprValuB_X2_I0+52+0+0:vgprValuB_X2_I0+52+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[220:223] // left value = acc[220+0:223+0]
/*  mfmaIndex:184  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[224:227], v[vgprValuB_X2_I0+56+0+0:vgprValuB_X2_I0+56+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[224:227] // left value = acc[224+0:227+0]
/*  mfmaIndex:185  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[228:231], v[vgprValuB_X2_I0+56+0+0:vgprValuB_X2_I0+56+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[228:231] // left value = acc[228+0:231+0]
/*  mfmaIndex:186  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[232:235], v[vgprValuB_X2_I0+56+0+0:vgprValuB_X2_I0+56+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[232:235] // left value = acc[232+0:235+0]
/*  mfmaIndex:187  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[236:239], v[vgprValuB_X2_I0+56+0+0:vgprValuB_X2_I0+56+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[236:239] // left value = acc[236+0:239+0]
/*  mfmaIndex:188  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[240:243], v[vgprValuB_X2_I0+60+0+0:vgprValuB_X2_I0+60+0+0+1], v[vgprValuA_X2_I0+0+0+0:vgprValuA_X2_I0+0+0+0+1], acc[240:243] // left value = acc[240+0:243+0]
/*  mfmaIndex:189  */
/* localReadsVacancy: latencyLeft 2 */
v_mfma_f32_16x16x32_fp8_fp8 acc[244:247], v[vgprValuB_X2_I0+60+0+0:vgprValuB_X2_I0+60+0+0+1], v[vgprValuA_X2_I0+4+0+0:vgprValuA_X2_I0+4+0+0+1], acc[244:247] // left value = acc[244+0:247+0]
/*  mfmaIndex:190  */
/* schedule remaining localreads for 1LDSB */
/* localReadsVacancy: latencyLeft 2 */
/* 1 LDS buffer: read-sync-write */
v_mfma_f32_16x16x32_fp8_fp8 acc[248:251], v[vgprValuB_X2_I0+60+0+0:vgprValuB_X2_I0+60+0+0+1], v[vgprValuA_X2_I0+8+0+0:vgprValuA_X2_I0+8+0+0+1], acc[248:251] // left value = acc[248+0:251+0]
/*  mfmaIndex:191  */
v_mfma_f32_16x16x32_fp8_fp8 acc[252:255], v[vgprValuB_X2_I0+60+0+0:vgprValuB_X2_I0+60+0+0+1], v[vgprValuA_X2_I0+12+0+0:vgprValuA_X2_I0+12+0+0+1], acc[252:255] // left value = acc[252+0:255+0]
/* numPrefetchIter=0 */
/* dataAtIterA=0 numReadsIterA=1 skipReadsIterA=0 readsPerIterA=4 */
/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=0 readsPerIterB=16 */

/* iter 3 (last unrolled loop) */
/*
 * 64 back-to-back v_mfma_f32_16x16x32_fp8_fp8 instructions (mfmaIndex 192..255).
 * Each one accumulates in place into one 4-register tile, and together they
 * touch acc[0:255] exactly once.
 * For tile t = 0..63 (destination and accumulate input are both acc[4t:4t+3]):
 *   first source  = v[vgprValuB_X2_I0 + 4*(t/4) + 2 : ... +1]  (changes every 4 MFMAs)
 *   second source = v[vgprValuA_X2_I0 + 4*(t%4) + 2 : ... +1]  (cycles 0,4,8,12)
 * The "+2" term picks the register pair at offset 2 within each 4-register
 * stride; the MFMA sequence immediately before this one uses "+0" with the
 * same strides.
 * No other instructions are interleaved in this sequence.
 */
/*  grEndMfmaIndex:0, lwStartMfmaIndex:191, lwEndMfmaIndex:191  */
/*  numMfmaForLR:30, syncPlrMfmaIndex:225  */
/*  mfmaIndex:192  */
v_mfma_f32_16x16x32_fp8_fp8 acc[0:3], v[vgprValuB_X2_I0+0+2+0:vgprValuB_X2_I0+0+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[0:3] // left value = acc[0+0:3+0]
/*  mfmaIndex:193  */
v_mfma_f32_16x16x32_fp8_fp8 acc[4:7], v[vgprValuB_X2_I0+0+2+0:vgprValuB_X2_I0+0+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[4:7] // left value = acc[4+0:7+0]
/*  mfmaIndex:194  */
v_mfma_f32_16x16x32_fp8_fp8 acc[8:11], v[vgprValuB_X2_I0+0+2+0:vgprValuB_X2_I0+0+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[8:11] // left value = acc[8+0:11+0]
/*  mfmaIndex:195  */
v_mfma_f32_16x16x32_fp8_fp8 acc[12:15], v[vgprValuB_X2_I0+0+2+0:vgprValuB_X2_I0+0+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[12:15] // left value = acc[12+0:15+0]
/*  mfmaIndex:196  */
v_mfma_f32_16x16x32_fp8_fp8 acc[16:19], v[vgprValuB_X2_I0+4+2+0:vgprValuB_X2_I0+4+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[16:19] // left value = acc[16+0:19+0]
/*  mfmaIndex:197  */
v_mfma_f32_16x16x32_fp8_fp8 acc[20:23], v[vgprValuB_X2_I0+4+2+0:vgprValuB_X2_I0+4+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[20:23] // left value = acc[20+0:23+0]
/*  mfmaIndex:198  */
v_mfma_f32_16x16x32_fp8_fp8 acc[24:27], v[vgprValuB_X2_I0+4+2+0:vgprValuB_X2_I0+4+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[24:27] // left value = acc[24+0:27+0]
/*  mfmaIndex:199  */
v_mfma_f32_16x16x32_fp8_fp8 acc[28:31], v[vgprValuB_X2_I0+4+2+0:vgprValuB_X2_I0+4+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[28:31] // left value = acc[28+0:31+0]
/*  mfmaIndex:200  */
v_mfma_f32_16x16x32_fp8_fp8 acc[32:35], v[vgprValuB_X2_I0+8+2+0:vgprValuB_X2_I0+8+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[32:35] // left value = acc[32+0:35+0]
/*  mfmaIndex:201  */
v_mfma_f32_16x16x32_fp8_fp8 acc[36:39], v[vgprValuB_X2_I0+8+2+0:vgprValuB_X2_I0+8+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[36:39] // left value = acc[36+0:39+0]
/*  mfmaIndex:202  */
v_mfma_f32_16x16x32_fp8_fp8 acc[40:43], v[vgprValuB_X2_I0+8+2+0:vgprValuB_X2_I0+8+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[40:43] // left value = acc[40+0:43+0]
/*  mfmaIndex:203  */
v_mfma_f32_16x16x32_fp8_fp8 acc[44:47], v[vgprValuB_X2_I0+8+2+0:vgprValuB_X2_I0+8+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[44:47] // left value = acc[44+0:47+0]
/*  mfmaIndex:204  */
v_mfma_f32_16x16x32_fp8_fp8 acc[48:51], v[vgprValuB_X2_I0+12+2+0:vgprValuB_X2_I0+12+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[48:51] // left value = acc[48+0:51+0]
/*  mfmaIndex:205  */
v_mfma_f32_16x16x32_fp8_fp8 acc[52:55], v[vgprValuB_X2_I0+12+2+0:vgprValuB_X2_I0+12+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[52:55] // left value = acc[52+0:55+0]
/*  mfmaIndex:206  */
v_mfma_f32_16x16x32_fp8_fp8 acc[56:59], v[vgprValuB_X2_I0+12+2+0:vgprValuB_X2_I0+12+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[56:59] // left value = acc[56+0:59+0]
/*  mfmaIndex:207  */
v_mfma_f32_16x16x32_fp8_fp8 acc[60:63], v[vgprValuB_X2_I0+12+2+0:vgprValuB_X2_I0+12+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[60:63] // left value = acc[60+0:63+0]
/*  mfmaIndex:208  */
v_mfma_f32_16x16x32_fp8_fp8 acc[64:67], v[vgprValuB_X2_I0+16+2+0:vgprValuB_X2_I0+16+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[64:67] // left value = acc[64+0:67+0]
/*  mfmaIndex:209  */
v_mfma_f32_16x16x32_fp8_fp8 acc[68:71], v[vgprValuB_X2_I0+16+2+0:vgprValuB_X2_I0+16+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[68:71] // left value = acc[68+0:71+0]
/*  mfmaIndex:210  */
v_mfma_f32_16x16x32_fp8_fp8 acc[72:75], v[vgprValuB_X2_I0+16+2+0:vgprValuB_X2_I0+16+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[72:75] // left value = acc[72+0:75+0]
/*  mfmaIndex:211  */
v_mfma_f32_16x16x32_fp8_fp8 acc[76:79], v[vgprValuB_X2_I0+16+2+0:vgprValuB_X2_I0+16+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[76:79] // left value = acc[76+0:79+0]
/*  mfmaIndex:212  */
v_mfma_f32_16x16x32_fp8_fp8 acc[80:83], v[vgprValuB_X2_I0+20+2+0:vgprValuB_X2_I0+20+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[80:83] // left value = acc[80+0:83+0]
/*  mfmaIndex:213  */
v_mfma_f32_16x16x32_fp8_fp8 acc[84:87], v[vgprValuB_X2_I0+20+2+0:vgprValuB_X2_I0+20+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[84:87] // left value = acc[84+0:87+0]
/*  mfmaIndex:214  */
v_mfma_f32_16x16x32_fp8_fp8 acc[88:91], v[vgprValuB_X2_I0+20+2+0:vgprValuB_X2_I0+20+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[88:91] // left value = acc[88+0:91+0]
/*  mfmaIndex:215  */
v_mfma_f32_16x16x32_fp8_fp8 acc[92:95], v[vgprValuB_X2_I0+20+2+0:vgprValuB_X2_I0+20+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[92:95] // left value = acc[92+0:95+0]
/*  mfmaIndex:216  */
v_mfma_f32_16x16x32_fp8_fp8 acc[96:99], v[vgprValuB_X2_I0+24+2+0:vgprValuB_X2_I0+24+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[96:99] // left value = acc[96+0:99+0]
/*  mfmaIndex:217  */
v_mfma_f32_16x16x32_fp8_fp8 acc[100:103], v[vgprValuB_X2_I0+24+2+0:vgprValuB_X2_I0+24+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[100:103] // left value = acc[100+0:103+0]
/*  mfmaIndex:218  */
v_mfma_f32_16x16x32_fp8_fp8 acc[104:107], v[vgprValuB_X2_I0+24+2+0:vgprValuB_X2_I0+24+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[104:107] // left value = acc[104+0:107+0]
/*  mfmaIndex:219  */
v_mfma_f32_16x16x32_fp8_fp8 acc[108:111], v[vgprValuB_X2_I0+24+2+0:vgprValuB_X2_I0+24+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[108:111] // left value = acc[108+0:111+0]
/*  mfmaIndex:220  */
v_mfma_f32_16x16x32_fp8_fp8 acc[112:115], v[vgprValuB_X2_I0+28+2+0:vgprValuB_X2_I0+28+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[112:115] // left value = acc[112+0:115+0]
/*  mfmaIndex:221  */
v_mfma_f32_16x16x32_fp8_fp8 acc[116:119], v[vgprValuB_X2_I0+28+2+0:vgprValuB_X2_I0+28+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[116:119] // left value = acc[116+0:119+0]
/*  mfmaIndex:222  */
v_mfma_f32_16x16x32_fp8_fp8 acc[120:123], v[vgprValuB_X2_I0+28+2+0:vgprValuB_X2_I0+28+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[120:123] // left value = acc[120+0:123+0]
/*  mfmaIndex:223  */
v_mfma_f32_16x16x32_fp8_fp8 acc[124:127], v[vgprValuB_X2_I0+28+2+0:vgprValuB_X2_I0+28+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[124:127] // left value = acc[124+0:127+0]
/*  mfmaIndex:224  */
v_mfma_f32_16x16x32_fp8_fp8 acc[128:131], v[vgprValuB_X2_I0+32+2+0:vgprValuB_X2_I0+32+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[128:131] // left value = acc[128+0:131+0]
/*  mfmaIndex:225  */
v_mfma_f32_16x16x32_fp8_fp8 acc[132:135], v[vgprValuB_X2_I0+32+2+0:vgprValuB_X2_I0+32+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[132:135] // left value = acc[132+0:135+0]
/*  mfmaIndex:226  */
v_mfma_f32_16x16x32_fp8_fp8 acc[136:139], v[vgprValuB_X2_I0+32+2+0:vgprValuB_X2_I0+32+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[136:139] // left value = acc[136+0:139+0]
/*  mfmaIndex:227  */
v_mfma_f32_16x16x32_fp8_fp8 acc[140:143], v[vgprValuB_X2_I0+32+2+0:vgprValuB_X2_I0+32+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[140:143] // left value = acc[140+0:143+0]
/*  mfmaIndex:228  */
v_mfma_f32_16x16x32_fp8_fp8 acc[144:147], v[vgprValuB_X2_I0+36+2+0:vgprValuB_X2_I0+36+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[144:147] // left value = acc[144+0:147+0]
/*  mfmaIndex:229  */
v_mfma_f32_16x16x32_fp8_fp8 acc[148:151], v[vgprValuB_X2_I0+36+2+0:vgprValuB_X2_I0+36+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[148:151] // left value = acc[148+0:151+0]
/*  mfmaIndex:230  */
v_mfma_f32_16x16x32_fp8_fp8 acc[152:155], v[vgprValuB_X2_I0+36+2+0:vgprValuB_X2_I0+36+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[152:155] // left value = acc[152+0:155+0]
/*  mfmaIndex:231  */
v_mfma_f32_16x16x32_fp8_fp8 acc[156:159], v[vgprValuB_X2_I0+36+2+0:vgprValuB_X2_I0+36+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[156:159] // left value = acc[156+0:159+0]
/*  mfmaIndex:232  */
v_mfma_f32_16x16x32_fp8_fp8 acc[160:163], v[vgprValuB_X2_I0+40+2+0:vgprValuB_X2_I0+40+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[160:163] // left value = acc[160+0:163+0]
/*  mfmaIndex:233  */
v_mfma_f32_16x16x32_fp8_fp8 acc[164:167], v[vgprValuB_X2_I0+40+2+0:vgprValuB_X2_I0+40+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[164:167] // left value = acc[164+0:167+0]
/*  mfmaIndex:234  */
v_mfma_f32_16x16x32_fp8_fp8 acc[168:171], v[vgprValuB_X2_I0+40+2+0:vgprValuB_X2_I0+40+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[168:171] // left value = acc[168+0:171+0]
/*  mfmaIndex:235  */
v_mfma_f32_16x16x32_fp8_fp8 acc[172:175], v[vgprValuB_X2_I0+40+2+0:vgprValuB_X2_I0+40+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[172:175] // left value = acc[172+0:175+0]
/*  mfmaIndex:236  */
v_mfma_f32_16x16x32_fp8_fp8 acc[176:179], v[vgprValuB_X2_I0+44+2+0:vgprValuB_X2_I0+44+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[176:179] // left value = acc[176+0:179+0]
/*  mfmaIndex:237  */
v_mfma_f32_16x16x32_fp8_fp8 acc[180:183], v[vgprValuB_X2_I0+44+2+0:vgprValuB_X2_I0+44+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[180:183] // left value = acc[180+0:183+0]
/*  mfmaIndex:238  */
v_mfma_f32_16x16x32_fp8_fp8 acc[184:187], v[vgprValuB_X2_I0+44+2+0:vgprValuB_X2_I0+44+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[184:187] // left value = acc[184+0:187+0]
/*  mfmaIndex:239  */
v_mfma_f32_16x16x32_fp8_fp8 acc[188:191], v[vgprValuB_X2_I0+44+2+0:vgprValuB_X2_I0+44+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[188:191] // left value = acc[188+0:191+0]
/*  mfmaIndex:240  */
v_mfma_f32_16x16x32_fp8_fp8 acc[192:195], v[vgprValuB_X2_I0+48+2+0:vgprValuB_X2_I0+48+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[192:195] // left value = acc[192+0:195+0]
/*  mfmaIndex:241  */
v_mfma_f32_16x16x32_fp8_fp8 acc[196:199], v[vgprValuB_X2_I0+48+2+0:vgprValuB_X2_I0+48+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[196:199] // left value = acc[196+0:199+0]
/*  mfmaIndex:242  */
v_mfma_f32_16x16x32_fp8_fp8 acc[200:203], v[vgprValuB_X2_I0+48+2+0:vgprValuB_X2_I0+48+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[200:203] // left value = acc[200+0:203+0]
/*  mfmaIndex:243  */
v_mfma_f32_16x16x32_fp8_fp8 acc[204:207], v[vgprValuB_X2_I0+48+2+0:vgprValuB_X2_I0+48+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[204:207] // left value = acc[204+0:207+0]
/*  mfmaIndex:244  */
v_mfma_f32_16x16x32_fp8_fp8 acc[208:211], v[vgprValuB_X2_I0+52+2+0:vgprValuB_X2_I0+52+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[208:211] // left value = acc[208+0:211+0]
/*  mfmaIndex:245  */
v_mfma_f32_16x16x32_fp8_fp8 acc[212:215], v[vgprValuB_X2_I0+52+2+0:vgprValuB_X2_I0+52+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[212:215] // left value = acc[212+0:215+0]
/*  mfmaIndex:246  */
v_mfma_f32_16x16x32_fp8_fp8 acc[216:219], v[vgprValuB_X2_I0+52+2+0:vgprValuB_X2_I0+52+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[216:219] // left value = acc[216+0:219+0]
/*  mfmaIndex:247  */
v_mfma_f32_16x16x32_fp8_fp8 acc[220:223], v[vgprValuB_X2_I0+52+2+0:vgprValuB_X2_I0+52+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[220:223] // left value = acc[220+0:223+0]
/*  mfmaIndex:248  */
v_mfma_f32_16x16x32_fp8_fp8 acc[224:227], v[vgprValuB_X2_I0+56+2+0:vgprValuB_X2_I0+56+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[224:227] // left value = acc[224+0:227+0]
/*  mfmaIndex:249  */
v_mfma_f32_16x16x32_fp8_fp8 acc[228:231], v[vgprValuB_X2_I0+56+2+0:vgprValuB_X2_I0+56+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[228:231] // left value = acc[228+0:231+0]
/*  mfmaIndex:250  */
v_mfma_f32_16x16x32_fp8_fp8 acc[232:235], v[vgprValuB_X2_I0+56+2+0:vgprValuB_X2_I0+56+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[232:235] // left value = acc[232+0:235+0]
/*  mfmaIndex:251  */
v_mfma_f32_16x16x32_fp8_fp8 acc[236:239], v[vgprValuB_X2_I0+56+2+0:vgprValuB_X2_I0+56+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[236:239] // left value = acc[236+0:239+0]
/*  mfmaIndex:252  */
v_mfma_f32_16x16x32_fp8_fp8 acc[240:243], v[vgprValuB_X2_I0+60+2+0:vgprValuB_X2_I0+60+2+0+1], v[vgprValuA_X2_I0+0+2+0:vgprValuA_X2_I0+0+2+0+1], acc[240:243] // left value = acc[240+0:243+0]
/*  mfmaIndex:253  */
v_mfma_f32_16x16x32_fp8_fp8 acc[244:247], v[vgprValuB_X2_I0+60+2+0:vgprValuB_X2_I0+60+2+0+1], v[vgprValuA_X2_I0+4+2+0:vgprValuA_X2_I0+4+2+0+1], acc[244:247] // left value = acc[244+0:247+0]
/*  mfmaIndex:254  */
v_mfma_f32_16x16x32_fp8_fp8 acc[248:251], v[vgprValuB_X2_I0+60+2+0:vgprValuB_X2_I0+60+2+0+1], v[vgprValuA_X2_I0+8+2+0:vgprValuA_X2_I0+8+2+0+1], acc[248:251] // left value = acc[248+0:251+0]
/*  mfmaIndex:255  */
v_mfma_f32_16x16x32_fp8_fp8 acc[252:255], v[vgprValuB_X2_I0+60+2+0:vgprValuB_X2_I0+60+2+0+1], v[vgprValuA_X2_I0+12+2+0:vgprValuA_X2_I0+12+2+0+1], acc[252:255] // left value = acc[252+0:255+0]
/* numPrefetchIter=0 */
/* dataAtIterA=0 numReadsIterA=1 skipReadsIterA=0 readsPerIterA=4 */
/* dataAtIterB=0 numReadsIterB=1 skipReadsIterB=0 readsPerIterB=16 */

/* Branch target placed directly after the final MFMA sequence above. */
label_PrefetchGlobalLastIterEnd:
/*
 * Rebind two assembler symbols: from here on, vgprValuA_X0_I0 and
 * vgprValuA_X2_I0 take the values of their "_1" counterparts. These are
 * assemble-time symbol assignments only; no instruction is emitted.
 * NOTE(review): presumably this restores the register-index bases that the
 * code after this point expects -- confirm against where the "_1" symbols
 * are defined (outside this section).
 */
.set vgprValuA_X0_I0, vgprValuA_X0_I0_1
.set vgprValuA_X2_I0, vgprValuA_X2_I0_1

/******************************************/
/* Tail Loop                              */
/******************************************/

/* Tail: add ValuA/B vgpr buffer [0...160) to pool */

/* local write reset offsets a */

/* local write reset offsets b */

// Tail-loop trip count.
// numIterL = LOCAL_SPLITU * min(sizeL % LOCAL_DEPTHU, DEPTHU / LOCAL_SPLITU)
// The count starts as sizeL % 128 and is then forced to 0 unless s[sgprGSUSumIdx] equals the
// index chosen by one of the two paths below; a zero count skips the tail loop entirely.
// Scratch: s28-s30, v0, v1. s[sgprGSUSumIdx+1] is overwritten on the GSUC path.
s_and_b32 s[sgprLoopCounterL], 127, s[sgprSizesSum+0] // s[sgprLoopCounterL] = s[sgprSizesSum+0] % 128
s_and_b32 s28, s[sgprGSU], 0x8000                  // SCC = (GSUC == 1) ? (tests bit 15 of s[sgprGSU])
s_cbranch_scc1 label_GSUC_TL                       // branch if GSUC == 1
s_cmp_lg_u32 s[sgprGSUSumIdx], s[sgprGSUSumIdx+1]  // SCC = (gsuSumIdx != numIterPerWgRemainder)
s_cmov_b32 s[sgprLoopCounterL], 0x0                // numIter=0 if gsuSumIdx != numIterPerWgRemainder
s_branch label_GSUC_TL_End
label_GSUC_TL:
// GSUC path: quotient/remainder of (sizeL / 128) by GSU, computed on the VALU with a float
// reciprocal estimate followed by a one-step correction, then read back into SGPRs.
s_lshr_b32 s29, s[sgprSizesSum], 7                 // s29 = s[sgprSizesSum] / 128
s_and_b32 s30, s[sgprGSU], 0x3fff                  // Restore GSU (low 14 bits of s[sgprGSU])
v_cvt_f32_u32 v0, s30                              // v0 = float(s30)
v_rcp_iflag_f32 v0, v0                             // v0 = 1.0 / float(s30)
v_cvt_f32_u32 v1, s29                              // v1 = float(s29)
v_mul_f32 v0, v0, v1                               // v0 = float(s29) * (1.0 / float(s30))
v_cvt_u32_f32 v0, v0                               // v0 = quotient estimate (float -> u32)
v_mul_u32_u24 v1, v0, s30                          // v1 = quotient * s30 (24-bit multiply)
v_sub_u32 v1, s29, v1                              // v1 = s29 - quotient * s30 (remainder estimate)
v_cmpx_eq_u32 exec, v1, s30                        // keep only lanes where remainder == s30 (quotient is one too small)
v_add_u32 v0, 1, v0                                // in those lanes: quotient += 1
v_mov_b32 v1, 0                                    // in those lanes: remainder = 0
s_mov_b64 exec, -1                                 // re-enable all lanes
v_readfirstlane_b32 s28, v0                        // quotient
v_readfirstlane_b32 s[sgprGSUSumIdx+1], v1         // remainder
s_sub_u32 s29, s30, 1                              // GSU-1
s_cmp_eq_u32 s28, 0                                // quotient == 0
s_cselect_b32 s28, s[sgprGSUSumIdx+1], s29         // lastWg = (quotient==0) ? numIterPerWgRemainder : GSU-1
s_cmp_lg_u32 s[sgprGSUSumIdx], s28                 // SCC = (gsuSumIdx != lastWg)
s_cmov_b32 s[sgprLoopCounterL], 0x0                // numIter=0 if gsuSumIdx != lastWg
label_GSUC_TL_End:
s_cmp_eq_u32 s[sgprLoopCounterL], 0x0              // numIterL == 0
s_mov_b32 s[sgprOrigLoopCounter], 0                // repurpose to count each localRead increment (s_mov leaves SCC intact for the branch)
s_cbranch_scc1 label_SkipTailLoopL                 // skip to end of tail loop b/c numIter==0

/* remove stagger offsets for tail loop */
/* For A and then B: form the signed 64-bit value S = (3 - StaggerUIter) * GlobalReadIncs, */
/* subtract WrapU, add the result to the 64-bit SRD base and subtract it from the 64-bit   */
/* shadow limit, then refresh SRD word 2 from the shadow limit. Scratch: s28 (low), s29 (high). */
s_sub_i32 s28, 3, s[sgprStaggerUIter]              // s28 = 3 - StaggerUIter
s_mul_hi_i32 s29, s28, s[sgprGlobalReadIncsA+0]    // start offset S in bytes (high 32 bits)
s_mul_i32 s28, s28, s[sgprGlobalReadIncsA+0]       // start offset S in bytes (low 32 bits)
s_sub_u32 s28, s28, s[sgprWrapUA]                  // S - WrapU (low, sets borrow)
s_subb_u32 s29, s29, s[sgprWrapUA+1]               // S - WrapU (high, consumes borrow)
s_add_u32 s[sgprSrdA+0], s[sgprSrdA+0], s28        // gra SRD += inc(lower)
s_addc_u32 s[sgprSrdA+1], s[sgprSrdA+1], s29       // gra SRD += inc(upper)
s_sub_u32 s[sgprShadowLimitA+0], s[sgprShadowLimitA+0], s28 // limit -= inc (lower)
s_subb_u32 s[sgprShadowLimitA+1], s[sgprShadowLimitA+1], s29 // limit -= inc (upper)
s_cmp_eq_u32 s[sgprShadowLimitA+1], 0              // are we within 2^32?
s_cselect_b32 s[sgprSrdA+2], s[sgprShadowLimitA+0], BufferLimit // Move shadow to real if we are within 2^32, else use BufferLimit
s_sub_i32 s28, 3, s[sgprStaggerUIter]              // s28 = 3 - StaggerUIter (recomputed for B)
s_mul_hi_i32 s29, s28, s[sgprGlobalReadIncsB+0]    // start offset S in bytes (high 32 bits)
s_mul_i32 s28, s28, s[sgprGlobalReadIncsB+0]       // start offset S in bytes (low 32 bits)
s_sub_u32 s28, s28, s[sgprWrapUB]                  // S - WrapU (low, sets borrow)
s_subb_u32 s29, s29, s[sgprWrapUB+1]               // S - WrapU (high, consumes borrow)
s_add_u32 s[sgprSrdB+0], s[sgprSrdB+0], s28        // gra SRD += inc(lower)
s_addc_u32 s[sgprSrdB+1], s[sgprSrdB+1], s29       // gra SRD += inc(upper)
s_sub_u32 s[sgprShadowLimitB+0], s[sgprShadowLimitB+0], s28 // limit -= inc (lower)
s_subb_u32 s[sgprShadowLimitB+1], s[sgprShadowLimitB+1], s29 // limit -= inc (upper)
s_cmp_eq_u32 s[sgprShadowLimitB+1], 0              // are we within 2^32?
s_cselect_b32 s[sgprSrdB+2], s[sgprShadowLimitB+0], BufferLimit // Move shadow to real if we are within 2^32, else use BufferLimit

/* Recalc global read offsets */
/* Rebuilds v[vgprGlobalReadOffsetA] (per-lane offset) and s[sgprScalarGlobalReadOffsetA+0..6] */
/* (per-load scalar offsets) consumed by the tail-loop global reads of A that follow.          */
/* Scratch: v0, v1. Every operand is comma-separated, matching the rest of the file.           */
v_and_b32 v1, 63, v[vgprSerial]                    // 0. thread id in wave: wtid = tid % wavelength(64)
v_and_b32 v0, 15, v1                               // 1. M offset: mIdx = wtid % MI_M(16)
v_mul_lo_u32 v0, s[sgprStrideA0I], v0              // 1. M offset: mOffset = mIdx * mStride(k)
v_lshlrev_b32 v0, 0x2, v0                          // 2. apply VectorWidth: bnOffset = bnOffset * vw(4)
v_and_b32 v1, 63, v[vgprSerial]                    // 3. thread id in wave: wtid = tid % wavelength(64)
v_lshrrev_b32 v1, 4, v1                            // 4. K offset: kIdx = wtid / (MIN(16) * MIBB(1))
v_lshlrev_b32 v1, 0x3, v1                          // 4. K offset: lrKOffset = kIdx * mStride(8)
v_add_u32 v0, v1, v0                               // 5. offset in wave: lrOffset = bnOffset + lrKOffset
v_lshrrev_b32 v1, 6, v[vgprSerial]                 // 6. wave offset in M dimen: wtid = tid / dividedForWaveId(64)
v_and_b32 v1, 3, v1                                // 6. wave offset in M dimen: wtid0 = wtid % num1DWaves(4)
v_mul_lo_u32 v1, s[sgprStrideA0I], v1              // 6. wave offset in M dimen: wOffset = wtid0 * s[sgprStrideA0I]
v_lshlrev_b32 v1, 0x6, v1                          // 6. wave offset in M dimen: wOffset = wOffset * 16 * vw(4)
v_add_u32 v[vgprGlobalReadOffsetA], v1, v0         // 7. final local read offset: flrOffset = lrOffset + WOffset
v_add_u32 v[vgprGlobalReadOffsetA], 0x10, v[vgprGlobalReadOffsetA]  // add prepad for pointer shift
                                                                    // offset *= bytes/element

s_mul_i32 s[sgprScalarGlobalReadOffsetA+0], s[sgprStrideA0I], 1 // compute offset diff (scaled tileDim)
                                                                // scalar offset *= bytes/element
s_mul_i32 s[sgprScalarGlobalReadOffsetA+1], s[sgprStrideA0I], 2 // compute offset diff (scaled tileDim)
                                                                // scalar offset *= bytes/element
s_mul_i32 s[sgprScalarGlobalReadOffsetA+2], s[sgprStrideA0I], 3 // compute offset diff (scaled tileDim)
                                                                // scalar offset *= bytes/element
s_mov_b32 s[sgprScalarGlobalReadOffsetA+3], 32    // compute offset diff (scaled tileDim): constant 1 * 32, folded
                                                  // scalar offset *= bytes/element
s_add_u32 s[sgprScalarGlobalReadOffsetA+4], s[sgprScalarGlobalReadOffsetA+0], s[sgprScalarGlobalReadOffsetA+3] // +4 = +0 + +3
s_add_u32 s[sgprScalarGlobalReadOffsetA+5], s[sgprScalarGlobalReadOffsetA+1], s[sgprScalarGlobalReadOffsetA+3] // +5 = +1 + +3
s_add_u32 s[sgprScalarGlobalReadOffsetA+6], s[sgprScalarGlobalReadOffsetA+2], s[sgprScalarGlobalReadOffsetA+3] // +6 = +2 + +3

/* Update M0 for DTLDS */

/* global read A */
/* Tail-loop byte-wise read of A into v[vgprValuA_X0_I0+0], +1 (scalar offset 0) and +8, +9 */
/* (scalar offset s[sgprScalarGlobalReadOffsetA+3]). Each destination dword is assembled    */
/* from four one-byte loads: components 0 and 4 load straight into the destination, the     */
/* other components load into scratch VGPRs (within v0-v14) and are OR'ed in afterwards;    */
/* odd-numbered components are shifted left by 8 first. All 16 loads are issued before any  */
/* pack, so each s_waitcnt vmcnt(N) below waits until at most N loads are still outstanding */
/* before the pack step that consumes the next result.                                      */
/* NOTE(review): the OR packing relies on the unused 16-bit half of every d16 / d16_hi load */
/* result being zero - confirm for the target.                                              */
/* g2l=0, load component 0 */
buffer_load_ubyte_d16 v[vgprValuA_X0_I0+0], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:0 // load one buffer value
/* g2l=0, load component 1 */
buffer_load_ubyte_d16 v0, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:1 // load one buffer value
/* g2l=0, load component 2 */
buffer_load_ubyte_d16_hi v1, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:2 // load one buffer value
/* g2l=0, load component 3 */
buffer_load_ubyte_d16_hi v2, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:3 // load one buffer value
/* g2l=0, load component 4 */
buffer_load_ubyte_d16 v[vgprValuA_X0_I0+1], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:4 // load one buffer value
/* g2l=0, load component 5 */
buffer_load_ubyte_d16 v4, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:5 // load one buffer value
/* g2l=0, load component 6 */
buffer_load_ubyte_d16_hi v5, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:6 // load one buffer value
/* g2l=0, load component 7 */
buffer_load_ubyte_d16_hi v6, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:7 // load one buffer value

/* g2l=0, load component 0 */
buffer_load_ubyte_d16 v[vgprValuA_X0_I0+8], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:0 // load one buffer value
/* g2l=0, load component 1 */
buffer_load_ubyte_d16 v8, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:1 // load one buffer value
/* g2l=0, load component 2 */
buffer_load_ubyte_d16_hi v9, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:2 // load one buffer value
/* g2l=0, load component 3 */
buffer_load_ubyte_d16_hi v10, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:3 // load one buffer value
/* g2l=0, load component 4 */
buffer_load_ubyte_d16 v[vgprValuA_X0_I0+9], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:4 // load one buffer value
/* g2l=0, load component 5 */
buffer_load_ubyte_d16 v12, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:5 // load one buffer value
/* g2l=0, load component 6 */
buffer_load_ubyte_d16_hi v13, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:6 // load one buffer value
/* g2l=0, load component 7 */
buffer_load_ubyte_d16_hi v14, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:7 // load one buffer value

/* pack: wait for each byte in issue order and merge it into its destination dword */
s_waitcnt vmcnt(14)
v_lshlrev_b32 v0, 0x8, v0                          // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X0_I0+0], v[vgprValuA_X0_I0+0], v0      // pack a sub 8-bit with dest
s_waitcnt vmcnt(13)
v_or_b32 v[vgprValuA_X0_I0+0+0], v[vgprValuA_X0_I0+0+0], v1      // pack a sub 8-bit with dest
s_waitcnt vmcnt(12)
v_lshlrev_b32 v2, 0x8, v2                          // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X0_I0+0+0], v[vgprValuA_X0_I0+0+0], v2      // pack a sub 8-bit with dest
s_waitcnt vmcnt(10)
v_lshlrev_b32 v4, 0x8, v4                          // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X0_I0+0+1], v[vgprValuA_X0_I0+0+1], v4      // pack a sub 8-bit with dest
s_waitcnt vmcnt(9)
v_or_b32 v[vgprValuA_X0_I0+0+1], v[vgprValuA_X0_I0+0+1], v5      // pack a sub 8-bit with dest
s_waitcnt vmcnt(8)
v_lshlrev_b32 v6, 0x8, v6                          // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X0_I0+0+1], v[vgprValuA_X0_I0+0+1], v6      // pack a sub 8-bit with dest
s_waitcnt vmcnt(6)
v_lshlrev_b32 v8, 0x8, v8                          // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X0_I0+8], v[vgprValuA_X0_I0+8], v8      // pack a sub 8-bit with dest
s_waitcnt vmcnt(5)
v_or_b32 v[vgprValuA_X0_I0+8], v[vgprValuA_X0_I0+8], v9      // pack a sub 8-bit with dest
s_waitcnt vmcnt(4)
v_lshlrev_b32 v10, 0x8, v10                        // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X0_I0+8], v[vgprValuA_X0_I0+8], v10     // pack a sub 8-bit with dest
s_waitcnt vmcnt(2)
v_lshlrev_b32 v12, 0x8, v12                        // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X0_I0+9], v[vgprValuA_X0_I0+9], v12     // pack a sub 8-bit with dest
s_waitcnt vmcnt(1)
v_or_b32 v[vgprValuA_X0_I0+9], v[vgprValuA_X0_I0+9], v13     // pack a sub 8-bit with dest
s_waitcnt vmcnt(0)
v_lshlrev_b32 v14, 0x8, v14                        // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X0_I0+9], v[vgprValuA_X0_I0+9], v14     // pack a sub 8-bit with dest

/* Tail-loop byte-wise read of A into v[vgprValuA_X2_I0+0], +1 (scalar offset 0, immediate */
/* offsets 64..71) and +8, +9 (s[sgprScalarGlobalReadOffsetA+3] after it is advanced by 64, */
/* immediate offsets 0..7). 16 one-byte loads are issued, then packed four bytes per dword; */
/* each s_waitcnt vmcnt(N) waits until at most N loads are still outstanding.               */
/* g2l=0, load component 0 */
buffer_load_ubyte_d16 v[vgprValuA_X2_I0+0], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:64 // load one buffer value
/* g2l=0, load component 1 */
buffer_load_ubyte_d16 v0, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:65 // load one buffer value
/* g2l=0, load component 2 */
buffer_load_ubyte_d16_hi v1, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:66 // load one buffer value
/* g2l=0, load component 3 */
buffer_load_ubyte_d16_hi v2, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:67 // load one buffer value
/* g2l=0, load component 4 */
buffer_load_ubyte_d16 v[vgprValuA_X2_I0+1], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:68 // load one buffer value
/* g2l=0, load component 5 */
buffer_load_ubyte_d16 v4, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:69 // load one buffer value
/* g2l=0, load component 6 */
buffer_load_ubyte_d16_hi v5, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:70 // load one buffer value
/* g2l=0, load component 7 */
buffer_load_ubyte_d16_hi v6, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], 0 offen offset:71 // load one buffer value

/* advance scalar offset +3 by 64 in place; the loads below use the new value */
s_add_u32 s[sgprScalarGlobalReadOffsetA+3], s[sgprScalarGlobalReadOffsetA+3], 64

/* g2l=0, load component 0 */
buffer_load_ubyte_d16 v[vgprValuA_X2_I0+8], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:0 // load one buffer value
/* g2l=0, load component 1 */
buffer_load_ubyte_d16 v8, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:1 // load one buffer value
/* g2l=0, load component 2 */
buffer_load_ubyte_d16_hi v9, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:2 // load one buffer value
/* g2l=0, load component 3 */
buffer_load_ubyte_d16_hi v10, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:3 // load one buffer value
/* g2l=0, load component 4 */
buffer_load_ubyte_d16 v[vgprValuA_X2_I0+9], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:4 // load one buffer value
/* g2l=0, load component 5 */
buffer_load_ubyte_d16 v12, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:5 // load one buffer value
/* g2l=0, load component 6 */
buffer_load_ubyte_d16_hi v13, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:6 // load one buffer value
/* g2l=0, load component 7 */
buffer_load_ubyte_d16_hi v14, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+3] offen offset:7 // load one buffer value

/* pack: wait for each byte in issue order and merge it into its destination dword */
s_waitcnt vmcnt(14)
v_lshlrev_b32 v0, 0x8, v0                          // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X2_I0+0], v[vgprValuA_X2_I0+0], v0      // pack a sub 8-bit with dest
s_waitcnt vmcnt(13)
v_or_b32 v[vgprValuA_X2_I0+0+0], v[vgprValuA_X2_I0+0+0], v1      // pack a sub 8-bit with dest
s_waitcnt vmcnt(12)
v_lshlrev_b32 v2, 0x8, v2                          // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X2_I0+0+0], v[vgprValuA_X2_I0+0+0], v2      // pack a sub 8-bit with dest
s_waitcnt vmcnt(10)
v_lshlrev_b32 v4, 0x8, v4                          // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X2_I0+0+1], v[vgprValuA_X2_I0+0+1], v4      // pack a sub 8-bit with dest
s_waitcnt vmcnt(9)
v_or_b32 v[vgprValuA_X2_I0+0+1], v[vgprValuA_X2_I0+0+1], v5      // pack a sub 8-bit with dest
s_waitcnt vmcnt(8)
v_lshlrev_b32 v6, 0x8, v6                          // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X2_I0+0+1], v[vgprValuA_X2_I0+0+1], v6      // pack a sub 8-bit with dest
s_waitcnt vmcnt(6)
v_lshlrev_b32 v8, 0x8, v8                          // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X2_I0+8], v[vgprValuA_X2_I0+8], v8      // pack a sub 8-bit with dest
s_waitcnt vmcnt(5)
v_or_b32 v[vgprValuA_X2_I0+8], v[vgprValuA_X2_I0+8], v9      // pack a sub 8-bit with dest
s_waitcnt vmcnt(4)
v_lshlrev_b32 v10, 0x8, v10                        // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X2_I0+8], v[vgprValuA_X2_I0+8], v10     // pack a sub 8-bit with dest
s_waitcnt vmcnt(2)
v_lshlrev_b32 v12, 0x8, v12                        // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X2_I0+9], v[vgprValuA_X2_I0+9], v12     // pack a sub 8-bit with dest
s_waitcnt vmcnt(1)
v_or_b32 v[vgprValuA_X2_I0+9], v[vgprValuA_X2_I0+9], v13     // pack a sub 8-bit with dest
s_waitcnt vmcnt(0)
v_lshlrev_b32 v14, 0x8, v14                        // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X2_I0+9], v[vgprValuA_X2_I0+9], v14     // pack a sub 8-bit with dest

/* Tail-loop byte-wise read of A into v[vgprValuA_X0_I0+2], +3 (scalar offset               */
/* s[sgprScalarGlobalReadOffsetA+0]) and +10, +11 (scalar offset                            */
/* s[sgprScalarGlobalReadOffsetA+4]). 16 one-byte loads are issued, then packed four bytes  */
/* per dword; each s_waitcnt vmcnt(N) waits until at most N loads are still outstanding.    */
/* g2l=0, load component 0 */
buffer_load_ubyte_d16 v[vgprValuA_X0_I0+2], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:0 // load one buffer value
/* g2l=0, load component 1 */
buffer_load_ubyte_d16 v0, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:1 // load one buffer value
/* g2l=0, load component 2 */
buffer_load_ubyte_d16_hi v1, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:2 // load one buffer value
/* g2l=0, load component 3 */
buffer_load_ubyte_d16_hi v2, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:3 // load one buffer value
/* g2l=0, load component 4 */
buffer_load_ubyte_d16 v[vgprValuA_X0_I0+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:4 // load one buffer value
/* g2l=0, load component 5 */
buffer_load_ubyte_d16 v4, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:5 // load one buffer value
/* g2l=0, load component 6 */
buffer_load_ubyte_d16_hi v5, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:6 // load one buffer value
/* g2l=0, load component 7 */
buffer_load_ubyte_d16_hi v6, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:7 // load one buffer value

/* g2l=0, load component 0 */
buffer_load_ubyte_d16 v[vgprValuA_X0_I0+10], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:0 // load one buffer value
/* g2l=0, load component 1 */
buffer_load_ubyte_d16 v8, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:1 // load one buffer value
/* g2l=0, load component 2 */
buffer_load_ubyte_d16_hi v9, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:2 // load one buffer value
/* g2l=0, load component 3 */
buffer_load_ubyte_d16_hi v10, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:3 // load one buffer value
/* g2l=0, load component 4 */
buffer_load_ubyte_d16 v[vgprValuA_X0_I0+11], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:4 // load one buffer value
/* g2l=0, load component 5 */
buffer_load_ubyte_d16 v12, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:5 // load one buffer value
/* g2l=0, load component 6 */
buffer_load_ubyte_d16_hi v13, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:6 // load one buffer value
/* g2l=0, load component 7 */
buffer_load_ubyte_d16_hi v14, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:7 // load one buffer value

/* pack: wait for each byte in issue order and merge it into its destination dword */
s_waitcnt vmcnt(14)
v_lshlrev_b32 v0, 0x8, v0                          // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X0_I0+2], v[vgprValuA_X0_I0+2], v0      // pack a sub 8-bit with dest
s_waitcnt vmcnt(13)
v_or_b32 v[vgprValuA_X0_I0+2], v[vgprValuA_X0_I0+2], v1      // pack a sub 8-bit with dest
s_waitcnt vmcnt(12)
v_lshlrev_b32 v2, 0x8, v2                          // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X0_I0+2], v[vgprValuA_X0_I0+2], v2      // pack a sub 8-bit with dest
s_waitcnt vmcnt(10)
v_lshlrev_b32 v4, 0x8, v4                          // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X0_I0+3], v[vgprValuA_X0_I0+3], v4      // pack a sub 8-bit with dest
s_waitcnt vmcnt(9)
v_or_b32 v[vgprValuA_X0_I0+3], v[vgprValuA_X0_I0+3], v5      // pack a sub 8-bit with dest
s_waitcnt vmcnt(8)
v_lshlrev_b32 v6, 0x8, v6                          // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X0_I0+3], v[vgprValuA_X0_I0+3], v6      // pack a sub 8-bit with dest
s_waitcnt vmcnt(6)
v_lshlrev_b32 v8, 0x8, v8                          // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X0_I0+10], v[vgprValuA_X0_I0+10], v8      // pack a sub 8-bit with dest
s_waitcnt vmcnt(5)
v_or_b32 v[vgprValuA_X0_I0+10], v[vgprValuA_X0_I0+10], v9      // pack a sub 8-bit with dest
s_waitcnt vmcnt(4)
v_lshlrev_b32 v10, 0x8, v10                        // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X0_I0+10], v[vgprValuA_X0_I0+10], v10     // pack a sub 8-bit with dest
s_waitcnt vmcnt(2)
v_lshlrev_b32 v12, 0x8, v12                        // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X0_I0+11], v[vgprValuA_X0_I0+11], v12     // pack a sub 8-bit with dest
s_waitcnt vmcnt(1)
v_or_b32 v[vgprValuA_X0_I0+11], v[vgprValuA_X0_I0+11], v13     // pack a sub 8-bit with dest
s_waitcnt vmcnt(0)
v_lshlrev_b32 v14, 0x8, v14                        // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X0_I0+11], v[vgprValuA_X0_I0+11], v14     // pack a sub 8-bit with dest

/* Tail-loop byte-wise read of A into v[vgprValuA_X2_I0+2], +3 and +10, +11. Scalar offsets */
/* +0 and +4 are each advanced by 64 in place before the loads that use them. 16 one-byte   */
/* loads are issued, then packed four bytes per dword; each s_waitcnt vmcnt(N) waits until  */
/* at most N loads are still outstanding.                                                   */
s_add_u32 s[sgprScalarGlobalReadOffsetA+0], s[sgprScalarGlobalReadOffsetA+0], 64

/* g2l=0, load component 0 */
buffer_load_ubyte_d16 v[vgprValuA_X2_I0+2], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:0 // load one buffer value
/* g2l=0, load component 1 */
buffer_load_ubyte_d16 v0, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:1 // load one buffer value
/* g2l=0, load component 2 */
buffer_load_ubyte_d16_hi v1, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:2 // load one buffer value
/* g2l=0, load component 3 */
buffer_load_ubyte_d16_hi v2, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:3 // load one buffer value
/* g2l=0, load component 4 */
buffer_load_ubyte_d16 v[vgprValuA_X2_I0+3], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:4 // load one buffer value
/* g2l=0, load component 5 */
buffer_load_ubyte_d16 v4, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:5 // load one buffer value
/* g2l=0, load component 6 */
buffer_load_ubyte_d16_hi v5, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:6 // load one buffer value
/* g2l=0, load component 7 */
buffer_load_ubyte_d16_hi v6, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+0] offen offset:7 // load one buffer value

s_add_u32 s[sgprScalarGlobalReadOffsetA+4], s[sgprScalarGlobalReadOffsetA+4], 64

/* g2l=0, load component 0 */
buffer_load_ubyte_d16 v[vgprValuA_X2_I0+10], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:0 // load one buffer value
/* g2l=0, load component 1 */
buffer_load_ubyte_d16 v8, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:1 // load one buffer value
/* g2l=0, load component 2 */
buffer_load_ubyte_d16_hi v9, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:2 // load one buffer value
/* g2l=0, load component 3 */
buffer_load_ubyte_d16_hi v10, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:3 // load one buffer value
/* g2l=0, load component 4 */
buffer_load_ubyte_d16 v[vgprValuA_X2_I0+11], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:4 // load one buffer value
/* g2l=0, load component 5 */
buffer_load_ubyte_d16 v12, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:5 // load one buffer value
/* g2l=0, load component 6 */
buffer_load_ubyte_d16_hi v13, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:6 // load one buffer value
/* g2l=0, load component 7 */
buffer_load_ubyte_d16_hi v14, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+4] offen offset:7 // load one buffer value

/* pack: wait for each byte in issue order and merge it into its destination dword */
s_waitcnt vmcnt(14)
v_lshlrev_b32 v0, 0x8, v0                          // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X2_I0+2], v[vgprValuA_X2_I0+2], v0      // pack a sub 8-bit with dest
s_waitcnt vmcnt(13)
v_or_b32 v[vgprValuA_X2_I0+0+2], v[vgprValuA_X2_I0+0+2], v1      // pack a sub 8-bit with dest
s_waitcnt vmcnt(12)
v_lshlrev_b32 v2, 0x8, v2                          // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X2_I0+0+2], v[vgprValuA_X2_I0+0+2], v2      // pack a sub 8-bit with dest
s_waitcnt vmcnt(10)
v_lshlrev_b32 v4, 0x8, v4                          // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X2_I0+0+3], v[vgprValuA_X2_I0+0+3], v4      // pack a sub 8-bit with dest
s_waitcnt vmcnt(9)
v_or_b32 v[vgprValuA_X2_I0+0+3], v[vgprValuA_X2_I0+0+3], v5      // pack a sub 8-bit with dest
s_waitcnt vmcnt(8)
v_lshlrev_b32 v6, 0x8, v6                          // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X2_I0+0+3], v[vgprValuA_X2_I0+0+3], v6      // pack a sub 8-bit with dest
s_waitcnt vmcnt(6)
v_lshlrev_b32 v8, 0x8, v8                          // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X2_I0+10], v[vgprValuA_X2_I0+10], v8      // pack a sub 8-bit with dest
s_waitcnt vmcnt(5)
v_or_b32 v[vgprValuA_X2_I0+10], v[vgprValuA_X2_I0+10], v9      // pack a sub 8-bit with dest
s_waitcnt vmcnt(4)
v_lshlrev_b32 v10, 0x8, v10                        // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X2_I0+10], v[vgprValuA_X2_I0+10], v10     // pack a sub 8-bit with dest
s_waitcnt vmcnt(2)
v_lshlrev_b32 v12, 0x8, v12                        // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X2_I0+11], v[vgprValuA_X2_I0+11], v12     // pack a sub 8-bit with dest
s_waitcnt vmcnt(1)
v_or_b32 v[vgprValuA_X2_I0+11], v[vgprValuA_X2_I0+11], v13     // pack a sub 8-bit with dest
s_waitcnt vmcnt(0)
v_lshlrev_b32 v14, 0x8, v14                        // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X2_I0+11], v[vgprValuA_X2_I0+11], v14     // pack a sub 8-bit with dest

/* Tail-loop byte-wise read of A into v[vgprValuA_X0_I0+4], +5 (scalar offset               */
/* s[sgprScalarGlobalReadOffsetA+1]) and +12, +13 (scalar offset                            */
/* s[sgprScalarGlobalReadOffsetA+5]). 16 one-byte loads are issued, then packed four bytes  */
/* per dword; each s_waitcnt vmcnt(N) waits until at most N loads are still outstanding.    */
/* g2l=0, load component 0 */
buffer_load_ubyte_d16 v[vgprValuA_X0_I0+4], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:0 // load one buffer value
/* g2l=0, load component 1 */
buffer_load_ubyte_d16 v0, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:1 // load one buffer value
/* g2l=0, load component 2 */
buffer_load_ubyte_d16_hi v1, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:2 // load one buffer value
/* g2l=0, load component 3 */
buffer_load_ubyte_d16_hi v2, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:3 // load one buffer value
/* g2l=0, load component 4 */
buffer_load_ubyte_d16 v[vgprValuA_X0_I0+5], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:4 // load one buffer value
/* g2l=0, load component 5 */
buffer_load_ubyte_d16 v4, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:5 // load one buffer value
/* g2l=0, load component 6 */
buffer_load_ubyte_d16_hi v5, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:6 // load one buffer value
/* g2l=0, load component 7 */
buffer_load_ubyte_d16_hi v6, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:7 // load one buffer value

/* g2l=0, load component 0 */
buffer_load_ubyte_d16 v[vgprValuA_X0_I0+12], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:0 // load one buffer value
/* g2l=0, load component 1 */
buffer_load_ubyte_d16 v8, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:1 // load one buffer value
/* g2l=0, load component 2 */
buffer_load_ubyte_d16_hi v9, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:2 // load one buffer value
/* g2l=0, load component 3 */
buffer_load_ubyte_d16_hi v10, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:3 // load one buffer value
/* g2l=0, load component 4 */
buffer_load_ubyte_d16 v[vgprValuA_X0_I0+13], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:4 // load one buffer value
/* g2l=0, load component 5 */
buffer_load_ubyte_d16 v12, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:5 // load one buffer value
/* g2l=0, load component 6 */
buffer_load_ubyte_d16_hi v13, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:6 // load one buffer value
/* g2l=0, load component 7 */
buffer_load_ubyte_d16_hi v14, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:7 // load one buffer value

/* pack: wait for each byte in issue order and merge it into its destination dword */
s_waitcnt vmcnt(14)
v_lshlrev_b32 v0, 0x8, v0                          // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X0_I0+4], v[vgprValuA_X0_I0+4], v0      // pack a sub 8-bit with dest
s_waitcnt vmcnt(13)
v_or_b32 v[vgprValuA_X0_I0+4], v[vgprValuA_X0_I0+4], v1      // pack a sub 8-bit with dest
s_waitcnt vmcnt(12)
v_lshlrev_b32 v2, 0x8, v2                          // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X0_I0+4], v[vgprValuA_X0_I0+4], v2      // pack a sub 8-bit with dest
s_waitcnt vmcnt(10)
v_lshlrev_b32 v4, 0x8, v4                          // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X0_I0+5], v[vgprValuA_X0_I0+5], v4      // pack a sub 8-bit with dest
s_waitcnt vmcnt(9)
v_or_b32 v[vgprValuA_X0_I0+5], v[vgprValuA_X0_I0+5], v5      // pack a sub 8-bit with dest
s_waitcnt vmcnt(8)
v_lshlrev_b32 v6, 0x8, v6                          // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X0_I0+5], v[vgprValuA_X0_I0+5], v6      // pack a sub 8-bit with dest
s_waitcnt vmcnt(6)
v_lshlrev_b32 v8, 0x8, v8                          // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X0_I0+12], v[vgprValuA_X0_I0+12], v8      // pack a sub 8-bit with dest
s_waitcnt vmcnt(5)
v_or_b32 v[vgprValuA_X0_I0+12], v[vgprValuA_X0_I0+12], v9      // pack a sub 8-bit with dest
s_waitcnt vmcnt(4)
v_lshlrev_b32 v10, 0x8, v10                        // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X0_I0+12], v[vgprValuA_X0_I0+12], v10     // pack a sub 8-bit with dest
s_waitcnt vmcnt(2)
v_lshlrev_b32 v12, 0x8, v12                        // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X0_I0+13], v[vgprValuA_X0_I0+13], v12     // pack a sub 8-bit with dest
s_waitcnt vmcnt(1)
v_or_b32 v[vgprValuA_X0_I0+13], v[vgprValuA_X0_I0+13], v13     // pack a sub 8-bit with dest
s_waitcnt vmcnt(0)
v_lshlrev_b32 v14, 0x8, v14                        // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X0_I0+13], v[vgprValuA_X0_I0+13], v14     // pack a sub 8-bit with dest

/*
 * Byte-wise global read of A into vgprValuA_X2_I0+4, +5 (scalar offset A+1)
 * and vgprValuA_X2_I0+12, +13 (scalar offset A+5).
 *
 * Each scalar offset is advanced by 64 in place, then eight single-byte
 * loads (offset:0..7) are issued against it, so 16 loads are in flight
 * before the first wait.
 *
 * Per destination dword (components c0..c3):
 *   c0 -> destination VGPR directly          (buffer_load_ubyte_d16)
 *   c1 -> temp (d16),    shifted left 8, OR'd into destination
 *   c2 -> temp (d16_hi), OR'd into destination unshifted
 *   c3 -> temp (d16_hi), shifted left 8, OR'd into destination
 *
 * Each s_waitcnt vmcnt(N) precedes the first use of a temp; N is the
 * number of loads issued after the one that fills that temp. The counts
 * 15/11/7/3 do not appear because c0 of each dword is covered by the wait
 * for c1.
 *
 * NOTE(review): d16 / d16_hi loads target only one 16-bit half of the
 * register, and the OR-based packing is only correct if the other half of
 * each destination and temp is zero at pack time. Whether the untouched
 * half is preserved or zeroed is target dependent -- confirm.
 * NOTE(review): the s_add_u32 updates are not undone inside this span --
 * confirm the later code expects the advanced offsets.
 */
s_add_u32 s[sgprScalarGlobalReadOffsetA+1], s[sgprScalarGlobalReadOffsetA+1], 64

/* g2l=0, load component 0 */
buffer_load_ubyte_d16 v[vgprValuA_X2_I0+4], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:0 // load one buffer value
/* g2l=0, load component 1 */
buffer_load_ubyte_d16 v0, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:1 // load one buffer value
/* g2l=0, load component 2 */
buffer_load_ubyte_d16_hi v1, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:2 // load one buffer value
/* g2l=0, load component 3 */
buffer_load_ubyte_d16_hi v2, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:3 // load one buffer value
/* g2l=0, load component 4 */
buffer_load_ubyte_d16 v[vgprValuA_X2_I0+5], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:4 // load one buffer value
/* g2l=0, load component 5 */
buffer_load_ubyte_d16 v4, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:5 // load one buffer value
/* g2l=0, load component 6 */
buffer_load_ubyte_d16_hi v5, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:6 // load one buffer value
/* g2l=0, load component 7 */
buffer_load_ubyte_d16_hi v6, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+1] offen offset:7 // load one buffer value

s_add_u32 s[sgprScalarGlobalReadOffsetA+5], s[sgprScalarGlobalReadOffsetA+5], 64

/* g2l=0, load component 0 */
buffer_load_ubyte_d16 v[vgprValuA_X2_I0+12], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:0 // load one buffer value
/* g2l=0, load component 1 */
buffer_load_ubyte_d16 v8, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:1 // load one buffer value
/* g2l=0, load component 2 */
buffer_load_ubyte_d16_hi v9, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:2 // load one buffer value
/* g2l=0, load component 3 */
buffer_load_ubyte_d16_hi v10, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:3 // load one buffer value
/* g2l=0, load component 4 */
buffer_load_ubyte_d16 v[vgprValuA_X2_I0+13], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:4 // load one buffer value
/* g2l=0, load component 5 */
buffer_load_ubyte_d16 v12, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:5 // load one buffer value
/* g2l=0, load component 6 */
buffer_load_ubyte_d16_hi v13, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:6 // load one buffer value
/* g2l=0, load component 7 */
buffer_load_ubyte_d16_hi v14, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+5] offen offset:7 // load one buffer value

/* pack vgprValuA_X2_I0+4 from temps v0, v1, v2 */
s_waitcnt vmcnt(14)
v_lshlrev_b32 v0, 0x8, v0                          // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X2_I0+4], v[vgprValuA_X2_I0+4], v0      // pack a sub 8-bit with dest
s_waitcnt vmcnt(13)
v_or_b32 v[vgprValuA_X2_I0+0+4], v[vgprValuA_X2_I0+0+4], v1      // pack a sub 8-bit with dest
s_waitcnt vmcnt(12)
v_lshlrev_b32 v2, 0x8, v2                          // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X2_I0+0+4], v[vgprValuA_X2_I0+0+4], v2      // pack a sub 8-bit with dest
/* pack vgprValuA_X2_I0+5 from temps v4, v5, v6 */
s_waitcnt vmcnt(10)
v_lshlrev_b32 v4, 0x8, v4                          // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X2_I0+0+5], v[vgprValuA_X2_I0+0+5], v4      // pack a sub 8-bit with dest
s_waitcnt vmcnt(9)
v_or_b32 v[vgprValuA_X2_I0+0+5], v[vgprValuA_X2_I0+0+5], v5      // pack a sub 8-bit with dest
s_waitcnt vmcnt(8)
v_lshlrev_b32 v6, 0x8, v6                          // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X2_I0+0+5], v[vgprValuA_X2_I0+0+5], v6      // pack a sub 8-bit with dest
/* pack vgprValuA_X2_I0+12 from temps v8, v9, v10 */
s_waitcnt vmcnt(6)
v_lshlrev_b32 v8, 0x8, v8                          // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X2_I0+12], v[vgprValuA_X2_I0+12], v8      // pack a sub 8-bit with dest
s_waitcnt vmcnt(5)
v_or_b32 v[vgprValuA_X2_I0+12], v[vgprValuA_X2_I0+12], v9      // pack a sub 8-bit with dest
s_waitcnt vmcnt(4)
v_lshlrev_b32 v10, 0x8, v10                        // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X2_I0+12], v[vgprValuA_X2_I0+12], v10     // pack a sub 8-bit with dest
/* pack vgprValuA_X2_I0+13 from temps v12, v13, v14 */
s_waitcnt vmcnt(2)
v_lshlrev_b32 v12, 0x8, v12                        // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X2_I0+13], v[vgprValuA_X2_I0+13], v12     // pack a sub 8-bit with dest
s_waitcnt vmcnt(1)
v_or_b32 v[vgprValuA_X2_I0+13], v[vgprValuA_X2_I0+13], v13     // pack a sub 8-bit with dest
s_waitcnt vmcnt(0)
v_lshlrev_b32 v14, 0x8, v14                        // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X2_I0+13], v[vgprValuA_X2_I0+13], v14     // pack a sub 8-bit with dest

/*
 * Byte-wise global read of A into vgprValuA_X0_I0+6, +7 (scalar offset A+2)
 * and vgprValuA_X0_I0+14, +15 (scalar offset A+6).
 *
 * Eight single-byte loads (offset:0..7) are issued against each scalar
 * offset, so 16 loads are in flight before the first wait. The scalar
 * offsets are used as-is here; no s_add_u32 is applied in this group.
 *
 * Per destination dword: component 0 is loaded straight into the
 * destination; components 1..3 land in temps (v0/v1/v2, v4/v5/v6,
 * v8/v9/v10, v12/v13/v14) and are OR'd in, with components 1 and 3
 * shifted left by 8 first.
 *
 * Each s_waitcnt vmcnt(N) precedes the first use of a temp; N is the
 * number of loads issued after the one that fills that temp. The final
 * vmcnt(0) means no load from this group is outstanding when it ends.
 *
 * NOTE(review): the OR-based packing is only correct if the 16-bit half
 * not written by each d16 / d16_hi load is zero -- confirm for this target.
 */
/* g2l=0, load component 0 */
buffer_load_ubyte_d16 v[vgprValuA_X0_I0+6], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:0 // load one buffer value
/* g2l=0, load component 1 */
buffer_load_ubyte_d16 v0, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:1 // load one buffer value
/* g2l=0, load component 2 */
buffer_load_ubyte_d16_hi v1, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:2 // load one buffer value
/* g2l=0, load component 3 */
buffer_load_ubyte_d16_hi v2, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:3 // load one buffer value
/* g2l=0, load component 4 */
buffer_load_ubyte_d16 v[vgprValuA_X0_I0+7], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:4 // load one buffer value
/* g2l=0, load component 5 */
buffer_load_ubyte_d16 v4, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:5 // load one buffer value
/* g2l=0, load component 6 */
buffer_load_ubyte_d16_hi v5, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:6 // load one buffer value
/* g2l=0, load component 7 */
buffer_load_ubyte_d16_hi v6, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:7 // load one buffer value

/* g2l=0, load component 0 */
buffer_load_ubyte_d16 v[vgprValuA_X0_I0+14], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:0 // load one buffer value
/* g2l=0, load component 1 */
buffer_load_ubyte_d16 v8, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:1 // load one buffer value
/* g2l=0, load component 2 */
buffer_load_ubyte_d16_hi v9, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:2 // load one buffer value
/* g2l=0, load component 3 */
buffer_load_ubyte_d16_hi v10, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:3 // load one buffer value
/* g2l=0, load component 4 */
buffer_load_ubyte_d16 v[vgprValuA_X0_I0+15], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:4 // load one buffer value
/* g2l=0, load component 5 */
buffer_load_ubyte_d16 v12, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:5 // load one buffer value
/* g2l=0, load component 6 */
buffer_load_ubyte_d16_hi v13, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:6 // load one buffer value
/* g2l=0, load component 7 */
buffer_load_ubyte_d16_hi v14, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:7 // load one buffer value

/* pack vgprValuA_X0_I0+6 from temps v0, v1, v2 */
s_waitcnt vmcnt(14)
v_lshlrev_b32 v0, 0x8, v0                          // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X0_I0+6], v[vgprValuA_X0_I0+6], v0      // pack a sub 8-bit with dest
s_waitcnt vmcnt(13)
v_or_b32 v[vgprValuA_X0_I0+6], v[vgprValuA_X0_I0+6], v1      // pack a sub 8-bit with dest
s_waitcnt vmcnt(12)
v_lshlrev_b32 v2, 0x8, v2                          // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X0_I0+6], v[vgprValuA_X0_I0+6], v2      // pack a sub 8-bit with dest
/* pack vgprValuA_X0_I0+7 from temps v4, v5, v6 */
s_waitcnt vmcnt(10)
v_lshlrev_b32 v4, 0x8, v4                          // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X0_I0+7], v[vgprValuA_X0_I0+7], v4      // pack a sub 8-bit with dest
s_waitcnt vmcnt(9)
v_or_b32 v[vgprValuA_X0_I0+7], v[vgprValuA_X0_I0+7], v5      // pack a sub 8-bit with dest
s_waitcnt vmcnt(8)
v_lshlrev_b32 v6, 0x8, v6                          // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X0_I0+7], v[vgprValuA_X0_I0+7], v6      // pack a sub 8-bit with dest
/* pack vgprValuA_X0_I0+14 from temps v8, v9, v10 */
s_waitcnt vmcnt(6)
v_lshlrev_b32 v8, 0x8, v8                          // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X0_I0+14], v[vgprValuA_X0_I0+14], v8      // pack a sub 8-bit with dest
s_waitcnt vmcnt(5)
v_or_b32 v[vgprValuA_X0_I0+14], v[vgprValuA_X0_I0+14], v9      // pack a sub 8-bit with dest
s_waitcnt vmcnt(4)
v_lshlrev_b32 v10, 0x8, v10                        // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X0_I0+14], v[vgprValuA_X0_I0+14], v10     // pack a sub 8-bit with dest
/* pack vgprValuA_X0_I0+15 from temps v12, v13, v14 */
s_waitcnt vmcnt(2)
v_lshlrev_b32 v12, 0x8, v12                        // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X0_I0+15], v[vgprValuA_X0_I0+15], v12     // pack a sub 8-bit with dest
s_waitcnt vmcnt(1)
v_or_b32 v[vgprValuA_X0_I0+15], v[vgprValuA_X0_I0+15], v13     // pack a sub 8-bit with dest
s_waitcnt vmcnt(0)
v_lshlrev_b32 v14, 0x8, v14                        // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X0_I0+15], v[vgprValuA_X0_I0+15], v14     // pack a sub 8-bit with dest

/*
 * Byte-wise global read of A into vgprValuA_X2_I0+6, +7 (scalar offset A+2)
 * and vgprValuA_X2_I0+14, +15 (scalar offset A+6).
 *
 * Each scalar offset is advanced by 64 in place, then eight single-byte
 * loads (offset:0..7) are issued against it, so 16 loads are in flight
 * before the first wait.
 *
 * Per destination dword: component 0 is loaded straight into the
 * destination; components 1..3 land in temps (v0/v1/v2, v4/v5/v6,
 * v8/v9/v10, v12/v13/v14) and are OR'd in, with components 1 and 3
 * shifted left by 8 first.
 *
 * Each s_waitcnt vmcnt(N) precedes the first use of a temp; N is the
 * number of loads issued after the one that fills that temp. The final
 * vmcnt(0) means no load from this group is outstanding when it ends.
 *
 * NOTE(review): the OR-based packing is only correct if the 16-bit half
 * not written by each d16 / d16_hi load is zero -- confirm for this target.
 * NOTE(review): the s_add_u32 updates are not undone inside this span --
 * confirm the later code expects the advanced offsets.
 */
s_add_u32 s[sgprScalarGlobalReadOffsetA+2], s[sgprScalarGlobalReadOffsetA+2], 64

/* g2l=0, load component 0 */
buffer_load_ubyte_d16 v[vgprValuA_X2_I0+6], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:0 // load one buffer value
/* g2l=0, load component 1 */
buffer_load_ubyte_d16 v0, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:1 // load one buffer value
/* g2l=0, load component 2 */
buffer_load_ubyte_d16_hi v1, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:2 // load one buffer value
/* g2l=0, load component 3 */
buffer_load_ubyte_d16_hi v2, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:3 // load one buffer value
/* g2l=0, load component 4 */
buffer_load_ubyte_d16 v[vgprValuA_X2_I0+7], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:4 // load one buffer value
/* g2l=0, load component 5 */
buffer_load_ubyte_d16 v4, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:5 // load one buffer value
/* g2l=0, load component 6 */
buffer_load_ubyte_d16_hi v5, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:6 // load one buffer value
/* g2l=0, load component 7 */
buffer_load_ubyte_d16_hi v6, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+2] offen offset:7 // load one buffer value

s_add_u32 s[sgprScalarGlobalReadOffsetA+6], s[sgprScalarGlobalReadOffsetA+6], 64

/* g2l=0, load component 0 */
buffer_load_ubyte_d16 v[vgprValuA_X2_I0+14], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:0 // load one buffer value
/* g2l=0, load component 1 */
buffer_load_ubyte_d16 v8, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:1 // load one buffer value
/* g2l=0, load component 2 */
buffer_load_ubyte_d16_hi v9, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:2 // load one buffer value
/* g2l=0, load component 3 */
buffer_load_ubyte_d16_hi v10, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:3 // load one buffer value
/* g2l=0, load component 4 */
buffer_load_ubyte_d16 v[vgprValuA_X2_I0+15], v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:4 // load one buffer value
/* g2l=0, load component 5 */
buffer_load_ubyte_d16 v12, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:5 // load one buffer value
/* g2l=0, load component 6 */
buffer_load_ubyte_d16_hi v13, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:6 // load one buffer value
/* g2l=0, load component 7 */
buffer_load_ubyte_d16_hi v14, v[vgprGlobalReadOffsetA+0], s[sgprSrdA:sgprSrdA+3], s[sgprScalarGlobalReadOffsetA+6] offen offset:7 // load one buffer value

/* pack vgprValuA_X2_I0+6 from temps v0, v1, v2 */
s_waitcnt vmcnt(14)
v_lshlrev_b32 v0, 0x8, v0                          // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X2_I0+6], v[vgprValuA_X2_I0+6], v0      // pack a sub 8-bit with dest
s_waitcnt vmcnt(13)
v_or_b32 v[vgprValuA_X2_I0+0+6], v[vgprValuA_X2_I0+0+6], v1      // pack a sub 8-bit with dest
s_waitcnt vmcnt(12)
v_lshlrev_b32 v2, 0x8, v2                          // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X2_I0+0+6], v[vgprValuA_X2_I0+0+6], v2      // pack a sub 8-bit with dest
/* pack vgprValuA_X2_I0+7 from temps v4, v5, v6 */
s_waitcnt vmcnt(10)
v_lshlrev_b32 v4, 0x8, v4                          // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X2_I0+0+7], v[vgprValuA_X2_I0+0+7], v4      // pack a sub 8-bit with dest
s_waitcnt vmcnt(9)
v_or_b32 v[vgprValuA_X2_I0+0+7], v[vgprValuA_X2_I0+0+7], v5      // pack a sub 8-bit with dest
s_waitcnt vmcnt(8)
v_lshlrev_b32 v6, 0x8, v6                          // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X2_I0+0+7], v[vgprValuA_X2_I0+0+7], v6      // pack a sub 8-bit with dest
/* pack vgprValuA_X2_I0+14 from temps v8, v9, v10 */
s_waitcnt vmcnt(6)
v_lshlrev_b32 v8, 0x8, v8                          // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X2_I0+14], v[vgprValuA_X2_I0+14], v8      // pack a sub 8-bit with dest
s_waitcnt vmcnt(5)
v_or_b32 v[vgprValuA_X2_I0+14], v[vgprValuA_X2_I0+14], v9      // pack a sub 8-bit with dest
s_waitcnt vmcnt(4)
v_lshlrev_b32 v10, 0x8, v10                        // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X2_I0+14], v[vgprValuA_X2_I0+14], v10     // pack a sub 8-bit with dest
/* pack vgprValuA_X2_I0+15 from temps v12, v13, v14 */
s_waitcnt vmcnt(2)
v_lshlrev_b32 v12, 0x8, v12                        // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X2_I0+15], v[vgprValuA_X2_I0+15], v12     // pack a sub 8-bit with dest
s_waitcnt vmcnt(1)
v_or_b32 v[vgprValuA_X2_I0+15], v[vgprValuA_X2_I0+15], v13     // pack a sub 8-bit with dest
s_waitcnt vmcnt(0)
v_lshlrev_b32 v14, 0x8, v14                        // shift left to higher 8 bits
v_or_b32 v[vgprValuA_X2_I0+15], v[vgprValuA_X2_I0+15], v14     // pack a sub 8-bit with dest

/* Update M0 for DTLDS */

/* global read B */
/*
 * g2l=0: byte-wise global read of B into vgprG2LB+0 .. vgprG2LB+3.
 *
 * Sixteen single-byte loads (offset:0..15) are issued from
 * vgprGlobalReadOffsetB+0 with a literal scalar offset of 0. Components
 * 0, 4, 8 and 12 go straight into the four destination dwords; the other
 * twelve land in temps v0..v2, v4..v6, v8..v10 and v12..v14.
 *
 * Per destination dword the temps are OR'd in, with components 1 and 3 of
 * that dword (d16 and d16_hi loads respectively) shifted left by 8 first.
 *
 * Each s_waitcnt vmcnt(N) precedes the first use of a temp; N is the
 * number of loads issued after the one that fills that temp. The final
 * vmcnt(0) means no load from this group is outstanding when it ends.
 *
 * NOTE(review): the OR-based packing is only correct if the 16-bit half
 * not written by each d16 / d16_hi load is zero -- confirm for this target.
 */
/* g2l=0, load component 0 */
buffer_load_ubyte_d16 v[vgprG2LB+0+0], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:0 // load one buffer value
/* g2l=0, load component 1 */
buffer_load_ubyte_d16 v0, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:1 // load one buffer value
/* g2l=0, load component 2 */
buffer_load_ubyte_d16_hi v1, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:2 // load one buffer value
/* g2l=0, load component 3 */
buffer_load_ubyte_d16_hi v2, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:3 // load one buffer value
/* g2l=0, load component 4 */
buffer_load_ubyte_d16 v[vgprG2LB+0+1], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:4 // load one buffer value
/* g2l=0, load component 5 */
buffer_load_ubyte_d16 v4, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:5 // load one buffer value
/* g2l=0, load component 6 */
buffer_load_ubyte_d16_hi v5, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:6 // load one buffer value
/* g2l=0, load component 7 */
buffer_load_ubyte_d16_hi v6, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:7 // load one buffer value
/* g2l=0, load component 8 */
buffer_load_ubyte_d16 v[vgprG2LB+0+2], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:8 // load one buffer value
/* g2l=0, load component 9 */
buffer_load_ubyte_d16 v8, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:9 // load one buffer value
/* g2l=0, load component 10 */
buffer_load_ubyte_d16_hi v9, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:10 // load one buffer value
/* g2l=0, load component 11 */
buffer_load_ubyte_d16_hi v10, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:11 // load one buffer value
/* g2l=0, load component 12 */
buffer_load_ubyte_d16 v[vgprG2LB+0+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:12 // load one buffer value
/* g2l=0, load component 13 */
buffer_load_ubyte_d16 v12, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:13 // load one buffer value
/* g2l=0, load component 14 */
buffer_load_ubyte_d16_hi v13, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:14 // load one buffer value
/* g2l=0, load component 15 */
buffer_load_ubyte_d16_hi v14, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], 0 offen offset:15 // load one buffer value
/* pack vgprG2LB+0+0 from temps v0, v1, v2 */
s_waitcnt vmcnt(14)
v_lshlrev_b32 v0, 0x8, v0                          // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+0+0], v[vgprG2LB+0+0], v0      // pack a sub 8-bit with dest
s_waitcnt vmcnt(13)
v_or_b32 v[vgprG2LB+0+0], v[vgprG2LB+0+0], v1      // pack a sub 8-bit with dest
s_waitcnt vmcnt(12)
v_lshlrev_b32 v2, 0x8, v2                          // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+0+0], v[vgprG2LB+0+0], v2      // pack a sub 8-bit with dest
/* pack vgprG2LB+0+1 from temps v4, v5, v6 */
s_waitcnt vmcnt(10)
v_lshlrev_b32 v4, 0x8, v4                          // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+0+1], v[vgprG2LB+0+1], v4      // pack a sub 8-bit with dest
s_waitcnt vmcnt(9)
v_or_b32 v[vgprG2LB+0+1], v[vgprG2LB+0+1], v5      // pack a sub 8-bit with dest
s_waitcnt vmcnt(8)
v_lshlrev_b32 v6, 0x8, v6                          // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+0+1], v[vgprG2LB+0+1], v6      // pack a sub 8-bit with dest
/* pack vgprG2LB+0+2 from temps v8, v9, v10 */
s_waitcnt vmcnt(6)
v_lshlrev_b32 v8, 0x8, v8                          // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+0+2], v[vgprG2LB+0+2], v8      // pack a sub 8-bit with dest
s_waitcnt vmcnt(5)
v_or_b32 v[vgprG2LB+0+2], v[vgprG2LB+0+2], v9      // pack a sub 8-bit with dest
s_waitcnt vmcnt(4)
v_lshlrev_b32 v10, 0x8, v10                        // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+0+2], v[vgprG2LB+0+2], v10     // pack a sub 8-bit with dest
/* pack vgprG2LB+0+3 from temps v12, v13, v14 */
s_waitcnt vmcnt(2)
v_lshlrev_b32 v12, 0x8, v12                        // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+0+3], v[vgprG2LB+0+3], v12     // pack a sub 8-bit with dest
s_waitcnt vmcnt(1)
v_or_b32 v[vgprG2LB+0+3], v[vgprG2LB+0+3], v13     // pack a sub 8-bit with dest
s_waitcnt vmcnt(0)
v_lshlrev_b32 v14, 0x8, v14                        // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+0+3], v[vgprG2LB+0+3], v14     // pack a sub 8-bit with dest
/*
 * g2l=4: byte-wise global read of B into vgprG2LB+4 .. vgprG2LB+7.
 *
 * Sixteen single-byte loads (offset:0..15) are issued from
 * vgprGlobalReadOffsetB+0 with scalar offset sgprScalarGlobalReadOffsetB+0.
 * Components 0, 4, 8 and 12 go straight into the four destination dwords;
 * the other twelve land in temps v0..v2, v4..v6, v8..v10 and v12..v14.
 *
 * Per destination dword the temps are OR'd in, with components 1 and 3 of
 * that dword shifted left by 8 first.
 *
 * Each s_waitcnt vmcnt(N) precedes the first use of a temp; N is the
 * number of loads issued after the one that fills that temp. The final
 * vmcnt(0) means no load from this group is outstanding when it ends.
 *
 * NOTE(review): the OR-based packing is only correct if the 16-bit half
 * not written by each d16 / d16_hi load is zero -- confirm for this target.
 */
/* g2l=4, load component 0 */
buffer_load_ubyte_d16 v[vgprG2LB+4+0], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:0 // load one buffer value
/* g2l=4, load component 1 */
buffer_load_ubyte_d16 v0, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:1 // load one buffer value
/* g2l=4, load component 2 */
buffer_load_ubyte_d16_hi v1, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:2 // load one buffer value
/* g2l=4, load component 3 */
buffer_load_ubyte_d16_hi v2, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:3 // load one buffer value
/* g2l=4, load component 4 */
buffer_load_ubyte_d16 v[vgprG2LB+4+1], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:4 // load one buffer value
/* g2l=4, load component 5 */
buffer_load_ubyte_d16 v4, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:5 // load one buffer value
/* g2l=4, load component 6 */
buffer_load_ubyte_d16_hi v5, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:6 // load one buffer value
/* g2l=4, load component 7 */
buffer_load_ubyte_d16_hi v6, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:7 // load one buffer value
/* g2l=4, load component 8 */
buffer_load_ubyte_d16 v[vgprG2LB+4+2], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:8 // load one buffer value
/* g2l=4, load component 9 */
buffer_load_ubyte_d16 v8, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:9 // load one buffer value
/* g2l=4, load component 10 */
buffer_load_ubyte_d16_hi v9, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:10 // load one buffer value
/* g2l=4, load component 11 */
buffer_load_ubyte_d16_hi v10, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:11 // load one buffer value
/* g2l=4, load component 12 */
buffer_load_ubyte_d16 v[vgprG2LB+4+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:12 // load one buffer value
/* g2l=4, load component 13 */
buffer_load_ubyte_d16 v12, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:13 // load one buffer value
/* g2l=4, load component 14 */
buffer_load_ubyte_d16_hi v13, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:14 // load one buffer value
/* g2l=4, load component 15 */
buffer_load_ubyte_d16_hi v14, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+0] offen offset:15 // load one buffer value
/* pack vgprG2LB+4+0 from temps v0, v1, v2 */
s_waitcnt vmcnt(14)
v_lshlrev_b32 v0, 0x8, v0                          // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+4+0], v[vgprG2LB+4+0], v0      // pack a sub 8-bit with dest
s_waitcnt vmcnt(13)
v_or_b32 v[vgprG2LB+4+0], v[vgprG2LB+4+0], v1      // pack a sub 8-bit with dest
s_waitcnt vmcnt(12)
v_lshlrev_b32 v2, 0x8, v2                          // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+4+0], v[vgprG2LB+4+0], v2      // pack a sub 8-bit with dest
/* pack vgprG2LB+4+1 from temps v4, v5, v6 */
s_waitcnt vmcnt(10)
v_lshlrev_b32 v4, 0x8, v4                          // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+4+1], v[vgprG2LB+4+1], v4      // pack a sub 8-bit with dest
s_waitcnt vmcnt(9)
v_or_b32 v[vgprG2LB+4+1], v[vgprG2LB+4+1], v5      // pack a sub 8-bit with dest
s_waitcnt vmcnt(8)
v_lshlrev_b32 v6, 0x8, v6                          // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+4+1], v[vgprG2LB+4+1], v6      // pack a sub 8-bit with dest
/* pack vgprG2LB+4+2 from temps v8, v9, v10 */
s_waitcnt vmcnt(6)
v_lshlrev_b32 v8, 0x8, v8                          // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+4+2], v[vgprG2LB+4+2], v8      // pack a sub 8-bit with dest
s_waitcnt vmcnt(5)
v_or_b32 v[vgprG2LB+4+2], v[vgprG2LB+4+2], v9      // pack a sub 8-bit with dest
s_waitcnt vmcnt(4)
v_lshlrev_b32 v10, 0x8, v10                        // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+4+2], v[vgprG2LB+4+2], v10     // pack a sub 8-bit with dest
/* pack vgprG2LB+4+3 from temps v12, v13, v14 */
s_waitcnt vmcnt(2)
v_lshlrev_b32 v12, 0x8, v12                        // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+4+3], v[vgprG2LB+4+3], v12     // pack a sub 8-bit with dest
s_waitcnt vmcnt(1)
v_or_b32 v[vgprG2LB+4+3], v[vgprG2LB+4+3], v13     // pack a sub 8-bit with dest
s_waitcnt vmcnt(0)
v_lshlrev_b32 v14, 0x8, v14                        // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+4+3], v[vgprG2LB+4+3], v14     // pack a sub 8-bit with dest
/*
 * g2l=8: byte-wise global read of B into vgprG2LB+8 .. vgprG2LB+11.
 *
 * Sixteen single-byte loads (offset:0..15) are issued from
 * vgprGlobalReadOffsetB+0 with scalar offset sgprScalarGlobalReadOffsetB+1.
 * Components 0, 4, 8 and 12 go straight into the four destination dwords;
 * the other twelve land in temps v0..v2, v4..v6, v8..v10 and v12..v14.
 *
 * Per destination dword the temps are OR'd in, with components 1 and 3 of
 * that dword shifted left by 8 first.
 *
 * Each s_waitcnt vmcnt(N) precedes the first use of a temp; N is the
 * number of loads issued after the one that fills that temp. The final
 * vmcnt(0) means no load from this group is outstanding when it ends.
 *
 * NOTE(review): the OR-based packing is only correct if the 16-bit half
 * not written by each d16 / d16_hi load is zero -- confirm for this target.
 */
/* g2l=8, load component 0 */
buffer_load_ubyte_d16 v[vgprG2LB+8+0], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:0 // load one buffer value
/* g2l=8, load component 1 */
buffer_load_ubyte_d16 v0, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:1 // load one buffer value
/* g2l=8, load component 2 */
buffer_load_ubyte_d16_hi v1, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:2 // load one buffer value
/* g2l=8, load component 3 */
buffer_load_ubyte_d16_hi v2, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:3 // load one buffer value
/* g2l=8, load component 4 */
buffer_load_ubyte_d16 v[vgprG2LB+8+1], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:4 // load one buffer value
/* g2l=8, load component 5 */
buffer_load_ubyte_d16 v4, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:5 // load one buffer value
/* g2l=8, load component 6 */
buffer_load_ubyte_d16_hi v5, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:6 // load one buffer value
/* g2l=8, load component 7 */
buffer_load_ubyte_d16_hi v6, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:7 // load one buffer value
/* g2l=8, load component 8 */
buffer_load_ubyte_d16 v[vgprG2LB+8+2], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:8 // load one buffer value
/* g2l=8, load component 9 */
buffer_load_ubyte_d16 v8, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:9 // load one buffer value
/* g2l=8, load component 10 */
buffer_load_ubyte_d16_hi v9, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:10 // load one buffer value
/* g2l=8, load component 11 */
buffer_load_ubyte_d16_hi v10, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:11 // load one buffer value
/* g2l=8, load component 12 */
buffer_load_ubyte_d16 v[vgprG2LB+8+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:12 // load one buffer value
/* g2l=8, load component 13 */
buffer_load_ubyte_d16 v12, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:13 // load one buffer value
/* g2l=8, load component 14 */
buffer_load_ubyte_d16_hi v13, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:14 // load one buffer value
/* g2l=8, load component 15 */
buffer_load_ubyte_d16_hi v14, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+1] offen offset:15 // load one buffer value
/* pack vgprG2LB+8+0 from temps v0, v1, v2 */
s_waitcnt vmcnt(14)
v_lshlrev_b32 v0, 0x8, v0                          // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+8+0], v[vgprG2LB+8+0], v0      // pack a sub 8-bit with dest
s_waitcnt vmcnt(13)
v_or_b32 v[vgprG2LB+8+0], v[vgprG2LB+8+0], v1      // pack a sub 8-bit with dest
s_waitcnt vmcnt(12)
v_lshlrev_b32 v2, 0x8, v2                          // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+8+0], v[vgprG2LB+8+0], v2      // pack a sub 8-bit with dest
/* pack vgprG2LB+8+1 from temps v4, v5, v6 */
s_waitcnt vmcnt(10)
v_lshlrev_b32 v4, 0x8, v4                          // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+8+1], v[vgprG2LB+8+1], v4      // pack a sub 8-bit with dest
s_waitcnt vmcnt(9)
v_or_b32 v[vgprG2LB+8+1], v[vgprG2LB+8+1], v5      // pack a sub 8-bit with dest
s_waitcnt vmcnt(8)
v_lshlrev_b32 v6, 0x8, v6                          // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+8+1], v[vgprG2LB+8+1], v6      // pack a sub 8-bit with dest
/* pack vgprG2LB+8+2 from temps v8, v9, v10 */
s_waitcnt vmcnt(6)
v_lshlrev_b32 v8, 0x8, v8                          // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+8+2], v[vgprG2LB+8+2], v8      // pack a sub 8-bit with dest
s_waitcnt vmcnt(5)
v_or_b32 v[vgprG2LB+8+2], v[vgprG2LB+8+2], v9      // pack a sub 8-bit with dest
s_waitcnt vmcnt(4)
v_lshlrev_b32 v10, 0x8, v10                        // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+8+2], v[vgprG2LB+8+2], v10     // pack a sub 8-bit with dest
/* pack vgprG2LB+8+3 from temps v12, v13, v14 */
s_waitcnt vmcnt(2)
v_lshlrev_b32 v12, 0x8, v12                        // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+8+3], v[vgprG2LB+8+3], v12     // pack a sub 8-bit with dest
s_waitcnt vmcnt(1)
v_or_b32 v[vgprG2LB+8+3], v[vgprG2LB+8+3], v13     // pack a sub 8-bit with dest
s_waitcnt vmcnt(0)
v_lshlrev_b32 v14, 0x8, v14                        // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+8+3], v[vgprG2LB+8+3], v14     // pack a sub 8-bit with dest
/*
 * g2l=12: byte-wise global read of B into vgprG2LB+12 .. vgprG2LB+15.
 *
 * Sixteen single-byte loads (offset:0..15) are issued from
 * vgprGlobalReadOffsetB+0 with scalar offset sgprScalarGlobalReadOffsetB+2.
 * Components 0, 4, 8 and 12 go straight into the four destination dwords;
 * the other twelve land in temps v0..v2, v4..v6, v8..v10 and v12..v14.
 *
 * Per destination dword the temps are OR'd in, with components 1 and 3 of
 * that dword shifted left by 8 first.
 *
 * Each s_waitcnt vmcnt(N) precedes the first use of a temp; N is the
 * number of loads issued after the one that fills that temp. The final
 * vmcnt(0) means no load from this group is outstanding when it ends.
 *
 * NOTE(review): the OR-based packing is only correct if the 16-bit half
 * not written by each d16 / d16_hi load is zero -- confirm for this target.
 */
/* g2l=12, load component 0 */
buffer_load_ubyte_d16 v[vgprG2LB+12+0], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:0 // load one buffer value
/* g2l=12, load component 1 */
buffer_load_ubyte_d16 v0, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:1 // load one buffer value
/* g2l=12, load component 2 */
buffer_load_ubyte_d16_hi v1, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:2 // load one buffer value
/* g2l=12, load component 3 */
buffer_load_ubyte_d16_hi v2, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:3 // load one buffer value
/* g2l=12, load component 4 */
buffer_load_ubyte_d16 v[vgprG2LB+12+1], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:4 // load one buffer value
/* g2l=12, load component 5 */
buffer_load_ubyte_d16 v4, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:5 // load one buffer value
/* g2l=12, load component 6 */
buffer_load_ubyte_d16_hi v5, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:6 // load one buffer value
/* g2l=12, load component 7 */
buffer_load_ubyte_d16_hi v6, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:7 // load one buffer value
/* g2l=12, load component 8 */
buffer_load_ubyte_d16 v[vgprG2LB+12+2], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:8 // load one buffer value
/* g2l=12, load component 9 */
buffer_load_ubyte_d16 v8, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:9 // load one buffer value
/* g2l=12, load component 10 */
buffer_load_ubyte_d16_hi v9, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:10 // load one buffer value
/* g2l=12, load component 11 */
buffer_load_ubyte_d16_hi v10, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:11 // load one buffer value
/* g2l=12, load component 12 */
buffer_load_ubyte_d16 v[vgprG2LB+12+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:12 // load one buffer value
/* g2l=12, load component 13 */
buffer_load_ubyte_d16 v12, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:13 // load one buffer value
/* g2l=12, load component 14 */
buffer_load_ubyte_d16_hi v13, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:14 // load one buffer value
/* g2l=12, load component 15 */
buffer_load_ubyte_d16_hi v14, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+2] offen offset:15 // load one buffer value
/* pack vgprG2LB+12+0 from temps v0, v1, v2 */
s_waitcnt vmcnt(14)
v_lshlrev_b32 v0, 0x8, v0                          // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+12+0], v[vgprG2LB+12+0], v0    // pack a sub 8-bit with dest
s_waitcnt vmcnt(13)
v_or_b32 v[vgprG2LB+12+0], v[vgprG2LB+12+0], v1    // pack a sub 8-bit with dest
s_waitcnt vmcnt(12)
v_lshlrev_b32 v2, 0x8, v2                          // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+12+0], v[vgprG2LB+12+0], v2    // pack a sub 8-bit with dest
/* pack vgprG2LB+12+1 from temps v4, v5, v6 */
s_waitcnt vmcnt(10)
v_lshlrev_b32 v4, 0x8, v4                          // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+12+1], v[vgprG2LB+12+1], v4    // pack a sub 8-bit with dest
s_waitcnt vmcnt(9)
v_or_b32 v[vgprG2LB+12+1], v[vgprG2LB+12+1], v5    // pack a sub 8-bit with dest
s_waitcnt vmcnt(8)
v_lshlrev_b32 v6, 0x8, v6                          // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+12+1], v[vgprG2LB+12+1], v6    // pack a sub 8-bit with dest
/* pack vgprG2LB+12+2 from temps v8, v9, v10 */
s_waitcnt vmcnt(6)
v_lshlrev_b32 v8, 0x8, v8                          // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+12+2], v[vgprG2LB+12+2], v8    // pack a sub 8-bit with dest
s_waitcnt vmcnt(5)
v_or_b32 v[vgprG2LB+12+2], v[vgprG2LB+12+2], v9    // pack a sub 8-bit with dest
s_waitcnt vmcnt(4)
v_lshlrev_b32 v10, 0x8, v10                        // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+12+2], v[vgprG2LB+12+2], v10   // pack a sub 8-bit with dest
/* pack vgprG2LB+12+3 from temps v12, v13, v14 */
s_waitcnt vmcnt(2)
v_lshlrev_b32 v12, 0x8, v12                        // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+12+3], v[vgprG2LB+12+3], v12   // pack a sub 8-bit with dest
s_waitcnt vmcnt(1)
v_or_b32 v[vgprG2LB+12+3], v[vgprG2LB+12+3], v13   // pack a sub 8-bit with dest
s_waitcnt vmcnt(0)
v_lshlrev_b32 v14, 0x8, v14                        // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+12+3], v[vgprG2LB+12+3], v14   // pack a sub 8-bit with dest
/*
 * Global read of B, G2L slot 16: gather 16 single bytes (buffer offsets 0..15,
 * scalar offset s[sgprScalarGlobalReadOffsetB+3]) and merge them into the four
 * dwords v[vgprG2LB+16+0 .. vgprG2LB+16+3].
 *
 * For destination dword k (components 4k .. 4k+3):
 *   component 4k+0 : buffer_load_ubyte_d16    straight into the destination
 *   component 4k+1 : buffer_load_ubyte_d16    into a temp, shifted left 8, ORed in
 *   component 4k+2 : buffer_load_ubyte_d16_hi into a temp, ORed in as loaded
 *   component 4k+3 : buffer_load_ubyte_d16_hi into a temp, shifted left 8, ORed in
 * Temps used: v0-v2, v4-v6, v8-v10, v12-v14 (overwritten here).
 *
 * All 16 loads are issued back to back; each s_waitcnt vmcnt(N) below lets the
 * wave continue once at most N of them are still outstanding, i.e. once the
 * component needed by the next pack step has returned. The thresholds rely on
 * the vmcnt(0) that closes the preceding group, so these 16 loads are the only
 * ones in flight.
 *
 * NOTE(review): the ORs assume every bit of a temp outside the loaded byte is
 * zero -- confirm the d16 / d16_hi unused-half behaviour for the target.
 */
/* g2l=16, load component 0 */
buffer_load_ubyte_d16 v[vgprG2LB+16+0], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:0 // load one buffer value
/* g2l=16, load component 1 */
buffer_load_ubyte_d16 v0, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:1 // load one buffer value
/* g2l=16, load component 2 */
buffer_load_ubyte_d16_hi v1, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:2 // load one buffer value
/* g2l=16, load component 3 */
buffer_load_ubyte_d16_hi v2, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:3 // load one buffer value
/* g2l=16, load component 4 */
buffer_load_ubyte_d16 v[vgprG2LB+16+1], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:4 // load one buffer value
/* g2l=16, load component 5 */
buffer_load_ubyte_d16 v4, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:5 // load one buffer value
/* g2l=16, load component 6 */
buffer_load_ubyte_d16_hi v5, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:6 // load one buffer value
/* g2l=16, load component 7 */
buffer_load_ubyte_d16_hi v6, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:7 // load one buffer value
/* g2l=16, load component 8 */
buffer_load_ubyte_d16 v[vgprG2LB+16+2], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:8 // load one buffer value
/* g2l=16, load component 9 */
buffer_load_ubyte_d16 v8, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:9 // load one buffer value
/* g2l=16, load component 10 */
buffer_load_ubyte_d16_hi v9, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:10 // load one buffer value
/* g2l=16, load component 11 */
buffer_load_ubyte_d16_hi v10, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:11 // load one buffer value
/* g2l=16, load component 12 */
buffer_load_ubyte_d16 v[vgprG2LB+16+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:12 // load one buffer value
/* g2l=16, load component 13 */
buffer_load_ubyte_d16 v12, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:13 // load one buffer value
/* g2l=16, load component 14 */
buffer_load_ubyte_d16_hi v13, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:14 // load one buffer value
/* g2l=16, load component 15 */
buffer_load_ubyte_d16_hi v14, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+3] offen offset:15 // load one buffer value
/* pack dword 0: components 0-3 */
s_waitcnt vmcnt(14)                                // components 0 and 1 have returned
v_lshlrev_b32 v0, 0x8, v0                          // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+16+0], v[vgprG2LB+16+0], v0    // pack a sub 8-bit with dest
s_waitcnt vmcnt(13)                                // component 2 has returned
v_or_b32 v[vgprG2LB+16+0], v[vgprG2LB+16+0], v1    // pack a sub 8-bit with dest
s_waitcnt vmcnt(12)                                // component 3 has returned
v_lshlrev_b32 v2, 0x8, v2                          // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+16+0], v[vgprG2LB+16+0], v2    // pack a sub 8-bit with dest
/* pack dword 1: components 4-7 */
s_waitcnt vmcnt(10)                                // components 4 and 5 have returned
v_lshlrev_b32 v4, 0x8, v4                          // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+16+1], v[vgprG2LB+16+1], v4    // pack a sub 8-bit with dest
s_waitcnt vmcnt(9)                                 // component 6 has returned
v_or_b32 v[vgprG2LB+16+1], v[vgprG2LB+16+1], v5    // pack a sub 8-bit with dest
s_waitcnt vmcnt(8)                                 // component 7 has returned
v_lshlrev_b32 v6, 0x8, v6                          // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+16+1], v[vgprG2LB+16+1], v6    // pack a sub 8-bit with dest
/* pack dword 2: components 8-11 */
s_waitcnt vmcnt(6)                                 // components 8 and 9 have returned
v_lshlrev_b32 v8, 0x8, v8                          // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+16+2], v[vgprG2LB+16+2], v8    // pack a sub 8-bit with dest
s_waitcnt vmcnt(5)                                 // component 10 has returned
v_or_b32 v[vgprG2LB+16+2], v[vgprG2LB+16+2], v9    // pack a sub 8-bit with dest
s_waitcnt vmcnt(4)                                 // component 11 has returned
v_lshlrev_b32 v10, 0x8, v10                        // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+16+2], v[vgprG2LB+16+2], v10   // pack a sub 8-bit with dest
/* pack dword 3: components 12-15 */
s_waitcnt vmcnt(2)                                 // components 12 and 13 have returned
v_lshlrev_b32 v12, 0x8, v12                        // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+16+3], v[vgprG2LB+16+3], v12   // pack a sub 8-bit with dest
s_waitcnt vmcnt(1)                                 // component 14 has returned
v_or_b32 v[vgprG2LB+16+3], v[vgprG2LB+16+3], v13   // pack a sub 8-bit with dest
s_waitcnt vmcnt(0)                                 // component 15 has returned; nothing left in flight
v_lshlrev_b32 v14, 0x8, v14                        // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+16+3], v[vgprG2LB+16+3], v14   // pack a sub 8-bit with dest
/*
 * Global read of B, G2L slot 20: gather 16 single bytes (buffer offsets 0..15,
 * scalar offset s[sgprScalarGlobalReadOffsetB+4]) and merge them into the four
 * dwords v[vgprG2LB+20+0 .. vgprG2LB+20+3].
 *
 * For destination dword k (components 4k .. 4k+3):
 *   component 4k+0 : buffer_load_ubyte_d16    straight into the destination
 *   component 4k+1 : buffer_load_ubyte_d16    into a temp, shifted left 8, ORed in
 *   component 4k+2 : buffer_load_ubyte_d16_hi into a temp, ORed in as loaded
 *   component 4k+3 : buffer_load_ubyte_d16_hi into a temp, shifted left 8, ORed in
 * Temps used: v0-v2, v4-v6, v8-v10, v12-v14 (overwritten here).
 *
 * The s_waitcnt vmcnt(N) thresholds release each pack step once the component
 * it consumes has returned; they rely on the vmcnt(0) that closes the
 * preceding group, so these 16 loads are the only ones in flight.
 *
 * NOTE(review): the ORs assume every bit of a temp outside the loaded byte is
 * zero -- confirm the d16 / d16_hi unused-half behaviour for the target.
 */
/* g2l=20, load component 0 */
buffer_load_ubyte_d16 v[vgprG2LB+20+0], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:0 // load one buffer value
/* g2l=20, load component 1 */
buffer_load_ubyte_d16 v0, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:1 // load one buffer value
/* g2l=20, load component 2 */
buffer_load_ubyte_d16_hi v1, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:2 // load one buffer value
/* g2l=20, load component 3 */
buffer_load_ubyte_d16_hi v2, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:3 // load one buffer value
/* g2l=20, load component 4 */
buffer_load_ubyte_d16 v[vgprG2LB+20+1], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:4 // load one buffer value
/* g2l=20, load component 5 */
buffer_load_ubyte_d16 v4, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:5 // load one buffer value
/* g2l=20, load component 6 */
buffer_load_ubyte_d16_hi v5, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:6 // load one buffer value
/* g2l=20, load component 7 */
buffer_load_ubyte_d16_hi v6, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:7 // load one buffer value
/* g2l=20, load component 8 */
buffer_load_ubyte_d16 v[vgprG2LB+20+2], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:8 // load one buffer value
/* g2l=20, load component 9 */
buffer_load_ubyte_d16 v8, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:9 // load one buffer value
/* g2l=20, load component 10 */
buffer_load_ubyte_d16_hi v9, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:10 // load one buffer value
/* g2l=20, load component 11 */
buffer_load_ubyte_d16_hi v10, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:11 // load one buffer value
/* g2l=20, load component 12 */
buffer_load_ubyte_d16 v[vgprG2LB+20+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:12 // load one buffer value
/* g2l=20, load component 13 */
buffer_load_ubyte_d16 v12, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:13 // load one buffer value
/* g2l=20, load component 14 */
buffer_load_ubyte_d16_hi v13, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:14 // load one buffer value
/* g2l=20, load component 15 */
buffer_load_ubyte_d16_hi v14, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+4] offen offset:15 // load one buffer value
/* pack dword 0: components 0-3 */
s_waitcnt vmcnt(14)                                // components 0 and 1 have returned
v_lshlrev_b32 v0, 0x8, v0                          // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+20+0], v[vgprG2LB+20+0], v0    // pack a sub 8-bit with dest
s_waitcnt vmcnt(13)                                // component 2 has returned
v_or_b32 v[vgprG2LB+20+0], v[vgprG2LB+20+0], v1    // pack a sub 8-bit with dest
s_waitcnt vmcnt(12)                                // component 3 has returned
v_lshlrev_b32 v2, 0x8, v2                          // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+20+0], v[vgprG2LB+20+0], v2    // pack a sub 8-bit with dest
/* pack dword 1: components 4-7 */
s_waitcnt vmcnt(10)                                // components 4 and 5 have returned
v_lshlrev_b32 v4, 0x8, v4                          // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+20+1], v[vgprG2LB+20+1], v4    // pack a sub 8-bit with dest
s_waitcnt vmcnt(9)                                 // component 6 has returned
v_or_b32 v[vgprG2LB+20+1], v[vgprG2LB+20+1], v5    // pack a sub 8-bit with dest
s_waitcnt vmcnt(8)                                 // component 7 has returned
v_lshlrev_b32 v6, 0x8, v6                          // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+20+1], v[vgprG2LB+20+1], v6    // pack a sub 8-bit with dest
/* pack dword 2: components 8-11 */
s_waitcnt vmcnt(6)                                 // components 8 and 9 have returned
v_lshlrev_b32 v8, 0x8, v8                          // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+20+2], v[vgprG2LB+20+2], v8    // pack a sub 8-bit with dest
s_waitcnt vmcnt(5)                                 // component 10 has returned
v_or_b32 v[vgprG2LB+20+2], v[vgprG2LB+20+2], v9    // pack a sub 8-bit with dest
s_waitcnt vmcnt(4)                                 // component 11 has returned
v_lshlrev_b32 v10, 0x8, v10                        // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+20+2], v[vgprG2LB+20+2], v10   // pack a sub 8-bit with dest
/* pack dword 3: components 12-15 */
s_waitcnt vmcnt(2)                                 // components 12 and 13 have returned
v_lshlrev_b32 v12, 0x8, v12                        // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+20+3], v[vgprG2LB+20+3], v12   // pack a sub 8-bit with dest
s_waitcnt vmcnt(1)                                 // component 14 has returned
v_or_b32 v[vgprG2LB+20+3], v[vgprG2LB+20+3], v13   // pack a sub 8-bit with dest
s_waitcnt vmcnt(0)                                 // component 15 has returned; nothing left in flight
v_lshlrev_b32 v14, 0x8, v14                        // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+20+3], v[vgprG2LB+20+3], v14   // pack a sub 8-bit with dest
/*
 * Global read of B, G2L slot 24: gather 16 single bytes (buffer offsets 0..15,
 * scalar offset s[sgprScalarGlobalReadOffsetB+5]) and merge them into the four
 * dwords v[vgprG2LB+24+0 .. vgprG2LB+24+3].
 *
 * For destination dword k (components 4k .. 4k+3):
 *   component 4k+0 : buffer_load_ubyte_d16    straight into the destination
 *   component 4k+1 : buffer_load_ubyte_d16    into a temp, shifted left 8, ORed in
 *   component 4k+2 : buffer_load_ubyte_d16_hi into a temp, ORed in as loaded
 *   component 4k+3 : buffer_load_ubyte_d16_hi into a temp, shifted left 8, ORed in
 * Temps used: v0-v2, v4-v6, v8-v10, v12-v14 (overwritten here).
 *
 * The s_waitcnt vmcnt(N) thresholds release each pack step once the component
 * it consumes has returned; they rely on the vmcnt(0) that closes the
 * preceding group, so these 16 loads are the only ones in flight.
 *
 * NOTE(review): the ORs assume every bit of a temp outside the loaded byte is
 * zero -- confirm the d16 / d16_hi unused-half behaviour for the target.
 */
/* g2l=24, load component 0 */
buffer_load_ubyte_d16 v[vgprG2LB+24+0], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:0 // load one buffer value
/* g2l=24, load component 1 */
buffer_load_ubyte_d16 v0, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:1 // load one buffer value
/* g2l=24, load component 2 */
buffer_load_ubyte_d16_hi v1, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:2 // load one buffer value
/* g2l=24, load component 3 */
buffer_load_ubyte_d16_hi v2, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:3 // load one buffer value
/* g2l=24, load component 4 */
buffer_load_ubyte_d16 v[vgprG2LB+24+1], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:4 // load one buffer value
/* g2l=24, load component 5 */
buffer_load_ubyte_d16 v4, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:5 // load one buffer value
/* g2l=24, load component 6 */
buffer_load_ubyte_d16_hi v5, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:6 // load one buffer value
/* g2l=24, load component 7 */
buffer_load_ubyte_d16_hi v6, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:7 // load one buffer value
/* g2l=24, load component 8 */
buffer_load_ubyte_d16 v[vgprG2LB+24+2], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:8 // load one buffer value
/* g2l=24, load component 9 */
buffer_load_ubyte_d16 v8, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:9 // load one buffer value
/* g2l=24, load component 10 */
buffer_load_ubyte_d16_hi v9, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:10 // load one buffer value
/* g2l=24, load component 11 */
buffer_load_ubyte_d16_hi v10, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:11 // load one buffer value
/* g2l=24, load component 12 */
buffer_load_ubyte_d16 v[vgprG2LB+24+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:12 // load one buffer value
/* g2l=24, load component 13 */
buffer_load_ubyte_d16 v12, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:13 // load one buffer value
/* g2l=24, load component 14 */
buffer_load_ubyte_d16_hi v13, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:14 // load one buffer value
/* g2l=24, load component 15 */
buffer_load_ubyte_d16_hi v14, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+5] offen offset:15 // load one buffer value
/* pack dword 0: components 0-3 */
s_waitcnt vmcnt(14)                                // components 0 and 1 have returned
v_lshlrev_b32 v0, 0x8, v0                          // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+24+0], v[vgprG2LB+24+0], v0    // pack a sub 8-bit with dest
s_waitcnt vmcnt(13)                                // component 2 has returned
v_or_b32 v[vgprG2LB+24+0], v[vgprG2LB+24+0], v1    // pack a sub 8-bit with dest
s_waitcnt vmcnt(12)                                // component 3 has returned
v_lshlrev_b32 v2, 0x8, v2                          // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+24+0], v[vgprG2LB+24+0], v2    // pack a sub 8-bit with dest
/* pack dword 1: components 4-7 */
s_waitcnt vmcnt(10)                                // components 4 and 5 have returned
v_lshlrev_b32 v4, 0x8, v4                          // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+24+1], v[vgprG2LB+24+1], v4    // pack a sub 8-bit with dest
s_waitcnt vmcnt(9)                                 // component 6 has returned
v_or_b32 v[vgprG2LB+24+1], v[vgprG2LB+24+1], v5    // pack a sub 8-bit with dest
s_waitcnt vmcnt(8)                                 // component 7 has returned
v_lshlrev_b32 v6, 0x8, v6                          // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+24+1], v[vgprG2LB+24+1], v6    // pack a sub 8-bit with dest
/* pack dword 2: components 8-11 */
s_waitcnt vmcnt(6)                                 // components 8 and 9 have returned
v_lshlrev_b32 v8, 0x8, v8                          // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+24+2], v[vgprG2LB+24+2], v8    // pack a sub 8-bit with dest
s_waitcnt vmcnt(5)                                 // component 10 has returned
v_or_b32 v[vgprG2LB+24+2], v[vgprG2LB+24+2], v9    // pack a sub 8-bit with dest
s_waitcnt vmcnt(4)                                 // component 11 has returned
v_lshlrev_b32 v10, 0x8, v10                        // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+24+2], v[vgprG2LB+24+2], v10   // pack a sub 8-bit with dest
/* pack dword 3: components 12-15 */
s_waitcnt vmcnt(2)                                 // components 12 and 13 have returned
v_lshlrev_b32 v12, 0x8, v12                        // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+24+3], v[vgprG2LB+24+3], v12   // pack a sub 8-bit with dest
s_waitcnt vmcnt(1)                                 // component 14 has returned
v_or_b32 v[vgprG2LB+24+3], v[vgprG2LB+24+3], v13   // pack a sub 8-bit with dest
s_waitcnt vmcnt(0)                                 // component 15 has returned; nothing left in flight
v_lshlrev_b32 v14, 0x8, v14                        // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+24+3], v[vgprG2LB+24+3], v14   // pack a sub 8-bit with dest
/*
 * Global read of B, G2L slot 28: gather 16 single bytes (buffer offsets 0..15,
 * scalar offset s[sgprScalarGlobalReadOffsetB+6]) and merge them into the four
 * dwords v[vgprG2LB+28+0 .. vgprG2LB+28+3].
 *
 * For destination dword k (components 4k .. 4k+3):
 *   component 4k+0 : buffer_load_ubyte_d16    straight into the destination
 *   component 4k+1 : buffer_load_ubyte_d16    into a temp, shifted left 8, ORed in
 *   component 4k+2 : buffer_load_ubyte_d16_hi into a temp, ORed in as loaded
 *   component 4k+3 : buffer_load_ubyte_d16_hi into a temp, shifted left 8, ORed in
 * Temps used: v0-v2, v4-v6, v8-v10, v12-v14 (overwritten here).
 *
 * The s_waitcnt vmcnt(N) thresholds release each pack step once the component
 * it consumes has returned; they rely on the vmcnt(0) that closes the
 * preceding group, so these 16 loads are the only ones in flight.
 *
 * NOTE(review): the ORs assume every bit of a temp outside the loaded byte is
 * zero -- confirm the d16 / d16_hi unused-half behaviour for the target.
 */
/* g2l=28, load component 0 */
buffer_load_ubyte_d16 v[vgprG2LB+28+0], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:0 // load one buffer value
/* g2l=28, load component 1 */
buffer_load_ubyte_d16 v0, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:1 // load one buffer value
/* g2l=28, load component 2 */
buffer_load_ubyte_d16_hi v1, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:2 // load one buffer value
/* g2l=28, load component 3 */
buffer_load_ubyte_d16_hi v2, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:3 // load one buffer value
/* g2l=28, load component 4 */
buffer_load_ubyte_d16 v[vgprG2LB+28+1], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:4 // load one buffer value
/* g2l=28, load component 5 */
buffer_load_ubyte_d16 v4, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:5 // load one buffer value
/* g2l=28, load component 6 */
buffer_load_ubyte_d16_hi v5, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:6 // load one buffer value
/* g2l=28, load component 7 */
buffer_load_ubyte_d16_hi v6, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:7 // load one buffer value
/* g2l=28, load component 8 */
buffer_load_ubyte_d16 v[vgprG2LB+28+2], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:8 // load one buffer value
/* g2l=28, load component 9 */
buffer_load_ubyte_d16 v8, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:9 // load one buffer value
/* g2l=28, load component 10 */
buffer_load_ubyte_d16_hi v9, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:10 // load one buffer value
/* g2l=28, load component 11 */
buffer_load_ubyte_d16_hi v10, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:11 // load one buffer value
/* g2l=28, load component 12 */
buffer_load_ubyte_d16 v[vgprG2LB+28+3], v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:12 // load one buffer value
/* g2l=28, load component 13 */
buffer_load_ubyte_d16 v12, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:13 // load one buffer value
/* g2l=28, load component 14 */
buffer_load_ubyte_d16_hi v13, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:14 // load one buffer value
/* g2l=28, load component 15 */
buffer_load_ubyte_d16_hi v14, v[vgprGlobalReadOffsetB+0], s[sgprSrdB:sgprSrdB+3], s[sgprScalarGlobalReadOffsetB+6] offen offset:15 // load one buffer value
/* pack dword 0: components 0-3 */
s_waitcnt vmcnt(14)                                // components 0 and 1 have returned
v_lshlrev_b32 v0, 0x8, v0                          // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+28+0], v[vgprG2LB+28+0], v0    // pack a sub 8-bit with dest
s_waitcnt vmcnt(13)                                // component 2 has returned
v_or_b32 v[vgprG2LB+28+0], v[vgprG2LB+28+0], v1    // pack a sub 8-bit with dest
s_waitcnt vmcnt(12)                                // component 3 has returned
v_lshlrev_b32 v2, 0x8, v2                          // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+28+0], v[vgprG2LB+28+0], v2    // pack a sub 8-bit with dest
/* pack dword 1: components 4-7 */
s_waitcnt vmcnt(10)                                // components 4 and 5 have returned
v_lshlrev_b32 v4, 0x8, v4                          // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+28+1], v[vgprG2LB+28+1], v4    // pack a sub 8-bit with dest
s_waitcnt vmcnt(9)                                 // component 6 has returned
v_or_b32 v[vgprG2LB+28+1], v[vgprG2LB+28+1], v5    // pack a sub 8-bit with dest
s_waitcnt vmcnt(8)                                 // component 7 has returned
v_lshlrev_b32 v6, 0x8, v6                          // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+28+1], v[vgprG2LB+28+1], v6    // pack a sub 8-bit with dest
/* pack dword 2: components 8-11 */
s_waitcnt vmcnt(6)                                 // components 8 and 9 have returned
v_lshlrev_b32 v8, 0x8, v8                          // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+28+2], v[vgprG2LB+28+2], v8    // pack a sub 8-bit with dest
s_waitcnt vmcnt(5)                                 // component 10 has returned
v_or_b32 v[vgprG2LB+28+2], v[vgprG2LB+28+2], v9    // pack a sub 8-bit with dest
s_waitcnt vmcnt(4)                                 // component 11 has returned
v_lshlrev_b32 v10, 0x8, v10                        // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+28+2], v[vgprG2LB+28+2], v10   // pack a sub 8-bit with dest
/* pack dword 3: components 12-15 */
s_waitcnt vmcnt(2)                                 // components 12 and 13 have returned
v_lshlrev_b32 v12, 0x8, v12                        // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+28+3], v[vgprG2LB+28+3], v12   // pack a sub 8-bit with dest
s_waitcnt vmcnt(1)                                 // component 14 has returned
v_or_b32 v[vgprG2LB+28+3], v[vgprG2LB+28+3], v13   // pack a sub 8-bit with dest
s_waitcnt vmcnt(0)                                 // component 15 has returned; nothing left in flight
v_lshlrev_b32 v14, 0x8, v14                        // shift left to higher 8 bits
v_or_b32 v[vgprG2LB+28+3], v[vgprG2LB+28+3], v14   // pack a sub 8-bit with dest
/*
 * Store the B data held in v[vgprG2LB+0 .. vgprG2LB+31] to LDS.
 *
 * Every global read must have completed before the registers are stored, and
 * the s_barrier holds the workgroup until all of its waves reach this point
 * before any of them writes LDS (presumably so no wave is still reading LDS
 * contents from an earlier phase -- confirm against the preceding code).
 */
s_waitcnt vmcnt(0)                                 // all global reads complete (the pack sequence just above already ended on vmcnt(0))
// Skip force waitcnt0
s_barrier

/* local write a */
/* no instructions are emitted here for A (kernel name ends in DTVA -- presumably A bypasses LDS; confirm) */

/* local write b */
/*
 * Eight 128-bit stores, one per group of four G2LB registers. Each lane writes
 * 16 bytes at v[vgprLocalWriteAddrB] + k*4160 for k = 0..7; 8 * 4160 = 33280,
 * which equals .amdhsa_group_segment_fixed_size in the kernel descriptor.
 */
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+0:vgprG2LB+0+3] offset:0 // lwoB_0_0_0_0 = (0*LSCB)*(MT1J+PAD) + (0*LSPB) = 0
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+4:vgprG2LB+4+3] offset:4160 // lwoB_0_0_1_0 = (0*LSCB)*(MT1J+PAD) + (1*LSPB) = 4160
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+8:vgprG2LB+8+3] offset:8320 // lwoB_0_0_2_0 = (0*LSCB)*(MT1J+PAD) + (2*LSPB) = 8320
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+12:vgprG2LB+12+3] offset:12480 // lwoB_0_0_3_0 = (0*LSCB)*(MT1J+PAD) + (3*LSPB) = 12480
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+16:vgprG2LB+16+3] offset:16640 // lwoB_0_0_4_0 = (0*LSCB)*(MT1J+PAD) + (4*LSPB) = 16640
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+20:vgprG2LB+20+3] offset:20800 // lwoB_0_0_5_0 = (0*LSCB)*(MT1J+PAD) + (5*LSPB) = 20800
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+24:vgprG2LB+24+3] offset:24960 // lwoB_0_0_6_0 = (0*LSCB)*(MT1J+PAD) + (6*LSPB) = 24960
ds_write_b128 v[vgprLocalWriteAddrB], v[vgprG2LB+28:vgprG2LB+28+3] offset:29120 // lwoB_0_0_7_0 = (0*LSCB)*(MT1J+PAD) + (7*LSPB) = 29120

/* Recalc local read offsets */
/*
 * Rebuilds v[vgprLocalReadAddrA] and v[vgprLocalReadAddrB] from v[vgprSerial].
 * With lane = serial & 63 and wave = serial >> 6, the instructions compute:
 *
 *   lro0 (v0) = (lane & 15) * 512  + (lane >> 4) * 8 + (wave & 3) * 8192
 *   lro1 (v1) = (lane & 15) * 2048 + (lane >> 4) * 8
 *   lsu       = (serial >> 8) * 128
 *   LocalReadAddrA = lro0 + lsu;  LocalReadAddrA += (LocalReadAddrA >> 9)  * 32
 *   LocalReadAddrB = lro1 + lsu;  LocalReadAddrB += (LocalReadAddrB >> 11) * 32
 *
 * Scratch registers v0-v3 and s8 are overwritten. This arithmetic is pure
 * VALU/SALU work; the closing s_waitcnt lgkmcnt(0) + s_barrier is what orders
 * the earlier LDS writes ahead of the local reads that follow.
 */
/* lr0I */
v_and_b32 v1, 63, v[vgprSerial]                    // 0. thread id in wave: wtid = tid % wavelength(64)
v_and_b32 v0, 15, v1                               // 1. N offset: nIdx = wtid % MI_N(16)
v_lshlrev_b32 v0, 0x7, v0                          // 1. N offset: nOffset = nIdx * nStride(128)
/* Skip. 2. block offset: bnOffset = 0 when num1DBlocks = 1 */
v_lshlrev_b32 v0, 0x2, v0                          // 4. apply VectorWidth: bnOffset = bnOffset * vw(4)
v_and_b32 v1, 63, v[vgprSerial]                    // 5. thread id in wave: wtid = tid % wavelength(64)
v_lshrrev_b32 v1, 4, v1                            // 5. K offset: kIdx = wtid / (MIN(16) * MIBB(1))
v_lshlrev_b32 v1, 0x3, v1                          // 5. K offset: lrKOffset = kIdx * mStride(8)
v_add_u32 v0, v1, v0                               // 6. offset in wave: lrOffset = bnOffset + lrKOffset
v_lshrrev_b32 v1, 6, v[vgprSerial]                 // 7. wave id: wtid = tid / dividedForWaveId(64)
v_and_b32 v1, 3, v1                                // 7. wave offset in M dimen: wtid0 = wtid % num1DWaves(4) (mask with 3)
v_lshlrev_b32 v1, 0xd, v1                          // 7. wave offset in M dimen: wOffset = wtid0 * W0Stride(8192)
v_add_u32 v0, v1, v0                               // 7. final local read offset: flrOffset = lrOffset + WOffset
/* lr1J */
/* same shape as lr0I but with vw(16) and without a per-wave offset term */
v_and_b32 v2, 63, v[vgprSerial]                    // 0. thread id in wave: wtid = tid % wavelength(64)
v_and_b32 v1, 15, v2                               // 1. N offset: nIdx = wtid % MI_N(16)
v_lshlrev_b32 v1, 0x7, v1                          // 1. N offset: nOffset = nIdx * nStride(128)
/* Skip. 2. block offset: bnOffset = 0 when num1DBlocks = 1 */
v_lshlrev_b32 v1, 0x4, v1                          // 4. apply VectorWidth: bnOffset = bnOffset * vw(16)
v_and_b32 v2, 63, v[vgprSerial]                    // 5. thread id in wave: wtid = tid % wavelength(64)
v_lshrrev_b32 v2, 4, v2                            // 5. K offset: kIdx = wtid / (MIN(16) * MIBB(1))
v_lshlrev_b32 v2, 0x3, v2                          // 5. K offset: lrKOffset = kIdx * mStride(8)
v_add_u32 v1, v2, v1                               // 6. offset in wave: lrOffset = bnOffset + lrKOffset
/* final address for A: lro0 (v0) + LSU offset, then padding */
v_lshrrev_b32 v2, 6, v[vgprSerial]                 // v2 = v[vgprSerial] / 64
v_lshrrev_b32 v2, 2, v2                            // LSU offset: Get LSU wave_id (= serial / 256)
s_mov_b32 s8, 128                                  // LSU offset: stride = lsuStride(128) when umlds==True
v_mul_lo_u32 v2, s8, v2                            // LSU offset: lsuoffset = wave_id*lsuStride*(MT0+PAD)
v_add_u32 v[vgprLocalReadAddrA], v2, v0            // Final Offset: offset = (lro0+lsuoffset)*bpeDS(1)
v_lshrrev_b32 v3, 9, v[vgprLocalReadAddrA]         // Final Offset: padding 32 per block 512
v_lshlrev_b32 v3, 0x5, v3                          // Final Offset: padding 32 per block 512
v_add_u32 v[vgprLocalReadAddrA], v3, v[vgprLocalReadAddrA] // Final Offset: add padding 32 per block 512
/* N/A */
/* final address for B: lro1 (v1) + LSU offset, then padding */
v_lshrrev_b32 v0, 6, v[vgprSerial]                 // v0 = v[vgprSerial] / 64
v_lshrrev_b32 v0, 2, v0                            // LSU offset: Get LSU wave_id (= serial / 256)
s_mov_b32 s8, 128                                  // LSU offset: stride = lsuStride(128) when umlds==True
v_mul_lo_u32 v0, s8, v0                            // LSU offset: lsuoffset = wave_id*lsuStride*(MT1+PAD)
v_add_u32 v[vgprLocalReadAddrB], v0, v1            // Final Offset: offset = (lro1+lsuoffset)*bpeDS(1)
v_lshrrev_b32 v2, 11, v[vgprLocalReadAddrB]        // Final Offset: padding 32 per block 2048
v_lshlrev_b32 v2, 0x5, v2                          // Final Offset: padding 32 per block 2048
v_add_u32 v[vgprLocalReadAddrB], v2, v[vgprLocalReadAddrB] // Final Offset: add padding 32 per block 2048
s_waitcnt lgkmcnt(0)                               // 5wait for local write: this wave's outstanding LDS operations have completed
// Skip force waitcnt0
s_barrier                                          // every wave in the workgroup has passed its lgkmcnt(0) wait before any continues

/* local read reset offsets a */

/* local read reset offsets b */

/* local read init pointers a */

/* localReadInitPointers */

/* local read init pointers b */

/* localReadInitPointers */

/* tail loop: macs */
/* Tail-loop entry: load this iteration's B operands from LDS.
 * Issues 16 ds_read_b64 from v[vgprLocalReadAddrB] at byte offsets 0, 128, ..., 1920
 * into the register pairs ValuB_X0_I0+0 .. +30, then advances LocalReadAddrB by 32
 * and waits for all of the reads to land (lgkmcnt(0)).
 * No LDS read for A is issued in this block.
 * Clobbers s8 and vcc (carry-out of the address add).
 */
label_TailLoopBeginL:

/* Tail: remove ValuA/B vgpr buffer [0...160) from pool */

/* Tail: add address/G2L vgpr [160...230) to pool */

/* local read a */

/* local read b */
ds_read_b64 v[vgprValuB_X0_I0+0:vgprValuB_X0_I0+0+1], v[vgprLocalReadAddrB] offset:0 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=0 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b64 v[vgprValuB_X0_I0+2:vgprValuB_X0_I0+2+1], v[vgprLocalReadAddrB] offset:128 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=1 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b64 v[vgprValuB_X0_I0+4:vgprValuB_X0_I0+4+1], v[vgprLocalReadAddrB] offset:256 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=2 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b64 v[vgprValuB_X0_I0+6:vgprValuB_X0_I0+6+1], v[vgprLocalReadAddrB] offset:384 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=3 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b64 v[vgprValuB_X0_I0+8:vgprValuB_X0_I0+8+1], v[vgprLocalReadAddrB] offset:512 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=4 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b64 v[vgprValuB_X0_I0+10:vgprValuB_X0_I0+10+1], v[vgprLocalReadAddrB] offset:640 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=5 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b64 v[vgprValuB_X0_I0+12:vgprValuB_X0_I0+12+1], v[vgprLocalReadAddrB] offset:768 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=6 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b64 v[vgprValuB_X0_I0+14:vgprValuB_X0_I0+14+1], v[vgprLocalReadAddrB] offset:896 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=7 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b64 v[vgprValuB_X0_I0+16:vgprValuB_X0_I0+16+1], v[vgprLocalReadAddrB] offset:1024 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=8 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b64 v[vgprValuB_X0_I0+18:vgprValuB_X0_I0+18+1], v[vgprLocalReadAddrB] offset:1152 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=9 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b64 v[vgprValuB_X0_I0+20:vgprValuB_X0_I0+20+1], v[vgprLocalReadAddrB] offset:1280 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=10 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b64 v[vgprValuB_X0_I0+22:vgprValuB_X0_I0+22+1], v[vgprLocalReadAddrB] offset:1408 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=11 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b64 v[vgprValuB_X0_I0+24:vgprValuB_X0_I0+24+1], v[vgprLocalReadAddrB] offset:1536 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=12 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b64 v[vgprValuB_X0_I0+26:vgprValuB_X0_I0+26+1], v[vgprLocalReadAddrB] offset:1664 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=13 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b64 v[vgprValuB_X0_I0+28:vgprValuB_X0_I0+28+1], v[vgprLocalReadAddrB] offset:1792 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=14 rIdx=0 oIdx=0 buffer=0 iui=0
ds_read_b64 v[vgprValuB_X0_I0+30:vgprValuB_X0_I0+30+1], v[vgprLocalReadAddrB] offset:1920 // L -> Reg lro=0 swapByteOffset=0 ti=256 vIdx=0 eIdx=15 rIdx=0 oIdx=0 buffer=0 iui=0

/* local read inc a */
s_mov_b32 s8, 0x20                                 // inc (NOTE(review): no consumer before s8 is rewritten below; no A address add follows)

/* local read inc b */
s_mov_b32 s8, 0x20                                 // inc
v_add_co_u32 v[vgprLocalReadAddrB], vcc, s8, v[vgprLocalReadAddrB] // lrB += 32 (bpeDS)
s_waitcnt lgkmcnt(0)                               // wait for local read: all ds_read_b64 above have returned
/* Tail K-bound mask, step 1: zero whole lanes that lie past the remaining K.
 *   v160    = (((serial & 63) >> 4) << 3)   -> per-lane K index, one of 0, 8, 16, 24
 *   s[28:29] = lanes where v160 >= LoopCounterL (signed compare)
 * v_cndmask_b32 dst, src0, src1, mask selects src1 for lanes whose mask bit is set,
 * so every masked lane gets 0 written to ValuA_X0_I0+0..7 and ValuB_X0_I0+0..31.
 * Clobbers v160 and s[28:29].
 */
v_and_b32 v160, 63, v[vgprSerial]                  // v160 = v[vgprSerial] % 64
v_lshrrev_b32 v160, 4, v160                        // v160 = v160 / 16
v_lshlrev_b32 v160, 0x3, v160                      // v160 = v160 * 8
v_cmp_ge_i32 s[28:29], v160, s[sgprLoopCounterL]   // check K index >= Size L
v_cndmask_b32 v[vgprValuA_X0_I0+0+0], v[vgprValuA_X0_I0+0+0], 0x0, s[28:29] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+2+0], v[vgprValuA_X0_I0+2+0], 0x0, s[28:29] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+4+0], v[vgprValuA_X0_I0+4+0], 0x0, s[28:29] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+6+0], v[vgprValuA_X0_I0+6+0], 0x0, s[28:29] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+0+1], v[vgprValuA_X0_I0+0+1], 0x0, s[28:29] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+2+1], v[vgprValuA_X0_I0+2+1], 0x0, s[28:29] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+4+1], v[vgprValuA_X0_I0+4+1], 0x0, s[28:29] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuA_X0_I0+6+1], v[vgprValuA_X0_I0+6+1], 0x0, s[28:29] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+0+0], v[vgprValuB_X0_I0+0+0], 0x0, s[28:29] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+2+0], v[vgprValuB_X0_I0+2+0], 0x0, s[28:29] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+4+0], v[vgprValuB_X0_I0+4+0], 0x0, s[28:29] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+6+0], v[vgprValuB_X0_I0+6+0], 0x0, s[28:29] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+8+0], v[vgprValuB_X0_I0+8+0], 0x0, s[28:29] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+10+0], v[vgprValuB_X0_I0+10+0], 0x0, s[28:29] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+12+0], v[vgprValuB_X0_I0+12+0], 0x0, s[28:29] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+14+0], v[vgprValuB_X0_I0+14+0], 0x0, s[28:29] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+16+0], v[vgprValuB_X0_I0+16+0], 0x0, s[28:29] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+18+0], v[vgprValuB_X0_I0+18+0], 0x0, s[28:29] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+20+0], v[vgprValuB_X0_I0+20+0], 0x0, s[28:29] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+22+0], v[vgprValuB_X0_I0+22+0], 0x0, s[28:29] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+24+0], v[vgprValuB_X0_I0+24+0], 0x0, s[28:29] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+26+0], v[vgprValuB_X0_I0+26+0], 0x0, s[28:29] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+28+0], v[vgprValuB_X0_I0+28+0], 0x0, s[28:29] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+30+0], v[vgprValuB_X0_I0+30+0], 0x0, s[28:29] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+0+1], v[vgprValuB_X0_I0+0+1], 0x0, s[28:29] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+2+1], v[vgprValuB_X0_I0+2+1], 0x0, s[28:29] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+4+1], v[vgprValuB_X0_I0+4+1], 0x0, s[28:29] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+6+1], v[vgprValuB_X0_I0+6+1], 0x0, s[28:29] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+8+1], v[vgprValuB_X0_I0+8+1], 0x0, s[28:29] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+10+1], v[vgprValuB_X0_I0+10+1], 0x0, s[28:29] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+12+1], v[vgprValuB_X0_I0+12+1], 0x0, s[28:29] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+14+1], v[vgprValuB_X0_I0+14+1], 0x0, s[28:29] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+16+1], v[vgprValuB_X0_I0+16+1], 0x0, s[28:29] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+18+1], v[vgprValuB_X0_I0+18+1], 0x0, s[28:29] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+20+1], v[vgprValuB_X0_I0+20+1], 0x0, s[28:29] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+22+1], v[vgprValuB_X0_I0+22+1], 0x0, s[28:29] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+24+1], v[vgprValuB_X0_I0+24+1], 0x0, s[28:29] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+26+1], v[vgprValuB_X0_I0+26+1], 0x0, s[28:29] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+28+1], v[vgprValuB_X0_I0+28+1], 0x0, s[28:29] // set 0 if K_idx >= sizeL
v_cndmask_b32 v[vgprValuB_X0_I0+30+1], v[vgprValuB_X0_I0+30+1], 0x0, s[28:29] // set 0 if K_idx >= sizeL
/* Tail K-bound mask, step 2: trim the lane that straddles the end of K.
 * On entry v160 is the per-lane K index left by the preceding instructions
 * (a multiple of 8).
 *   v160     = LoopCounterL - v160                  (elements left for this lane)
 *   s[28:29] = lanes where v160 < 8                 (signed compare)
 *   s30      = (8 - (LoopCounterL & 7)) * 8         (shift count in bits)
 * For each of the 4 A pairs and 16 B pairs, the 64-bit value is shifted left by
 * s30 bits into v[162:163] and written back only in the masked lanes; the shift
 * fills the vacated low bits with zeros, and A and B get the same shift.
 * Lanes whose distance is <= 0 are also in the mask, but their registers were
 * already zeroed, so the shifted value is still 0. The same holds whenever
 * LoopCounterL is a multiple of 8 (s30 == 64): the distance is then a multiple
 * of 8, so every masked lane has distance <= 0.
 * Clobbers v160, v[162:163], s[28:29], s30 and SCC.
 */
v_sub_u32 v160, s[sgprLoopCounterL], v160          // get distance between size and k index
v_cmp_lt_i32 s[28:29], v160, 8                     // set partial 0 if distance less than input per thread
s_and_b32 s30, s[sgprLoopCounterL], 7              // get inputs for edge thread
s_sub_u32 s30, 8, s30                              // use shift to fill 0 for outside element
s_lshl_b32 s30, s30, 3                             // use shift to fill 0 for outside element (elements -> bits)
v_lshlrev_b64 v[162:163], s30, v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1]
v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+0], v[vgprValuA_X0_I0+0+0+0+0], v162, s[28:29]
v_cndmask_b32 v[vgprValuA_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+0+0+0+1], v163, s[28:29]
v_lshlrev_b64 v[162:163], s30, v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1]
v_cndmask_b32 v[vgprValuA_X0_I0+2+0+0+0], v[vgprValuA_X0_I0+2+0+0+0], v162, s[28:29]
v_cndmask_b32 v[vgprValuA_X0_I0+2+0+0+1], v[vgprValuA_X0_I0+2+0+0+1], v163, s[28:29]
v_lshlrev_b64 v[162:163], s30, v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1]
v_cndmask_b32 v[vgprValuA_X0_I0+4+0+0+0], v[vgprValuA_X0_I0+4+0+0+0], v162, s[28:29]
v_cndmask_b32 v[vgprValuA_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+4+0+0+1], v163, s[28:29]
v_lshlrev_b64 v[162:163], s30, v[vgprValuA_X0_I0+6+0+0:vgprValuA_X0_I0+6+0+0+1]
v_cndmask_b32 v[vgprValuA_X0_I0+6+0+0+0], v[vgprValuA_X0_I0+6+0+0+0], v162, s[28:29]
v_cndmask_b32 v[vgprValuA_X0_I0+6+0+0+1], v[vgprValuA_X0_I0+6+0+0+1], v163, s[28:29]
v_lshlrev_b64 v[162:163], s30, v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1]
v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+0], v[vgprValuB_X0_I0+0+0+0+0], v162, s[28:29]
v_cndmask_b32 v[vgprValuB_X0_I0+0+0+0+1], v[vgprValuB_X0_I0+0+0+0+1], v163, s[28:29]
v_lshlrev_b64 v[162:163], s30, v[vgprValuB_X0_I0+2+0+0:vgprValuB_X0_I0+2+0+0+1]
v_cndmask_b32 v[vgprValuB_X0_I0+2+0+0+0], v[vgprValuB_X0_I0+2+0+0+0], v162, s[28:29]
v_cndmask_b32 v[vgprValuB_X0_I0+2+0+0+1], v[vgprValuB_X0_I0+2+0+0+1], v163, s[28:29]
v_lshlrev_b64 v[162:163], s30, v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1]
v_cndmask_b32 v[vgprValuB_X0_I0+4+0+0+0], v[vgprValuB_X0_I0+4+0+0+0], v162, s[28:29]
v_cndmask_b32 v[vgprValuB_X0_I0+4+0+0+1], v[vgprValuB_X0_I0+4+0+0+1], v163, s[28:29]
v_lshlrev_b64 v[162:163], s30, v[vgprValuB_X0_I0+6+0+0:vgprValuB_X0_I0+6+0+0+1]
v_cndmask_b32 v[vgprValuB_X0_I0+6+0+0+0], v[vgprValuB_X0_I0+6+0+0+0], v162, s[28:29]
v_cndmask_b32 v[vgprValuB_X0_I0+6+0+0+1], v[vgprValuB_X0_I0+6+0+0+1], v163, s[28:29]
v_lshlrev_b64 v[162:163], s30, v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1]
v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+0], v[vgprValuB_X0_I0+8+0+0+0], v162, s[28:29]
v_cndmask_b32 v[vgprValuB_X0_I0+8+0+0+1], v[vgprValuB_X0_I0+8+0+0+1], v163, s[28:29]
v_lshlrev_b64 v[162:163], s30, v[vgprValuB_X0_I0+10+0+0:vgprValuB_X0_I0+10+0+0+1]
v_cndmask_b32 v[vgprValuB_X0_I0+10+0+0+0], v[vgprValuB_X0_I0+10+0+0+0], v162, s[28:29]
v_cndmask_b32 v[vgprValuB_X0_I0+10+0+0+1], v[vgprValuB_X0_I0+10+0+0+1], v163, s[28:29]
v_lshlrev_b64 v[162:163], s30, v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1]
v_cndmask_b32 v[vgprValuB_X0_I0+12+0+0+0], v[vgprValuB_X0_I0+12+0+0+0], v162, s[28:29]
v_cndmask_b32 v[vgprValuB_X0_I0+12+0+0+1], v[vgprValuB_X0_I0+12+0+0+1], v163, s[28:29]
v_lshlrev_b64 v[162:163], s30, v[vgprValuB_X0_I0+14+0+0:vgprValuB_X0_I0+14+0+0+1]
v_cndmask_b32 v[vgprValuB_X0_I0+14+0+0+0], v[vgprValuB_X0_I0+14+0+0+0], v162, s[28:29]
v_cndmask_b32 v[vgprValuB_X0_I0+14+0+0+1], v[vgprValuB_X0_I0+14+0+0+1], v163, s[28:29]
v_lshlrev_b64 v[162:163], s30, v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1]
v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+0], v[vgprValuB_X0_I0+16+0+0+0], v162, s[28:29]
v_cndmask_b32 v[vgprValuB_X0_I0+16+0+0+1], v[vgprValuB_X0_I0+16+0+0+1], v163, s[28:29]
v_lshlrev_b64 v[162:163], s30, v[vgprValuB_X0_I0+18+0+0:vgprValuB_X0_I0+18+0+0+1]
v_cndmask_b32 v[vgprValuB_X0_I0+18+0+0+0], v[vgprValuB_X0_I0+18+0+0+0], v162, s[28:29]
v_cndmask_b32 v[vgprValuB_X0_I0+18+0+0+1], v[vgprValuB_X0_I0+18+0+0+1], v163, s[28:29]
v_lshlrev_b64 v[162:163], s30, v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1]
v_cndmask_b32 v[vgprValuB_X0_I0+20+0+0+0], v[vgprValuB_X0_I0+20+0+0+0], v162, s[28:29]
v_cndmask_b32 v[vgprValuB_X0_I0+20+0+0+1], v[vgprValuB_X0_I0+20+0+0+1], v163, s[28:29]
v_lshlrev_b64 v[162:163], s30, v[vgprValuB_X0_I0+22+0+0:vgprValuB_X0_I0+22+0+0+1]
v_cndmask_b32 v[vgprValuB_X0_I0+22+0+0+0], v[vgprValuB_X0_I0+22+0+0+0], v162, s[28:29]
v_cndmask_b32 v[vgprValuB_X0_I0+22+0+0+1], v[vgprValuB_X0_I0+22+0+0+1], v163, s[28:29]
v_lshlrev_b64 v[162:163], s30, v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1]
v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+0], v[vgprValuB_X0_I0+24+0+0+0], v162, s[28:29]
v_cndmask_b32 v[vgprValuB_X0_I0+24+0+0+1], v[vgprValuB_X0_I0+24+0+0+1], v163, s[28:29]
v_lshlrev_b64 v[162:163], s30, v[vgprValuB_X0_I0+26+0+0:vgprValuB_X0_I0+26+0+0+1]
v_cndmask_b32 v[vgprValuB_X0_I0+26+0+0+0], v[vgprValuB_X0_I0+26+0+0+0], v162, s[28:29]
v_cndmask_b32 v[vgprValuB_X0_I0+26+0+0+1], v[vgprValuB_X0_I0+26+0+0+1], v163, s[28:29]
v_lshlrev_b64 v[162:163], s30, v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1]
v_cndmask_b32 v[vgprValuB_X0_I0+28+0+0+0], v[vgprValuB_X0_I0+28+0+0+0], v162, s[28:29]
v_cndmask_b32 v[vgprValuB_X0_I0+28+0+0+1], v[vgprValuB_X0_I0+28+0+0+1], v163, s[28:29]
v_lshlrev_b64 v[162:163], s30, v[vgprValuB_X0_I0+30+0+0:vgprValuB_X0_I0+30+0+0+1]
v_cndmask_b32 v[vgprValuB_X0_I0+30+0+0+0], v[vgprValuB_X0_I0+30+0+0+0], v162, s[28:29]
v_cndmask_b32 v[vgprValuB_X0_I0+30+0+0+1], v[vgprValuB_X0_I0+30+0+0+1], v163, s[28:29]
s_nop 1                                            // NOTE(review): presumably hazard padding between the VALU writes above and the MFMA reads that follow - confirm against the ISA hazard table
/* Tail-loop MACs: 64 v_mfma_f32_16x16x32_fp8_fp8 instructions.
 * Each of the 16 B register pairs (ValuB_X0_I0+0, +2, ..., +30) is combined with
 * each of the 4 A register pairs (ValuA_X0_I0+0, +2, +4, +6). B is passed as the
 * first source operand and A as the second.
 * Every instruction accumulates in place into its own 4-register group:
 *   acc[16*b + 4*a : 16*b + 4*a + 3]   for b = 0..15, a = 0..3
 * so together they cover acc[0:255].
 */
v_mfma_f32_16x16x32_fp8_fp8 acc[0:3], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[0:3] // left value = acc[0+0:3+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[4:7], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], acc[4:7] // left value = acc[4+0:7+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[8:11], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[8:11] // left value = acc[8+0:11+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[12:15], v[vgprValuB_X0_I0+0+0+0:vgprValuB_X0_I0+0+0+0+1], v[vgprValuA_X0_I0+6+0+0:vgprValuA_X0_I0+6+0+0+1], acc[12:15] // left value = acc[12+0:15+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[16:19], v[vgprValuB_X0_I0+2+0+0:vgprValuB_X0_I0+2+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[16:19] // left value = acc[16+0:19+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[20:23], v[vgprValuB_X0_I0+2+0+0:vgprValuB_X0_I0+2+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], acc[20:23] // left value = acc[20+0:23+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[24:27], v[vgprValuB_X0_I0+2+0+0:vgprValuB_X0_I0+2+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[24:27] // left value = acc[24+0:27+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[28:31], v[vgprValuB_X0_I0+2+0+0:vgprValuB_X0_I0+2+0+0+1], v[vgprValuA_X0_I0+6+0+0:vgprValuA_X0_I0+6+0+0+1], acc[28:31] // left value = acc[28+0:31+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[32:35], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[32:35] // left value = acc[32+0:35+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[36:39], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], acc[36:39] // left value = acc[36+0:39+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[40:43], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[40:43] // left value = acc[40+0:43+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[44:47], v[vgprValuB_X0_I0+4+0+0:vgprValuB_X0_I0+4+0+0+1], v[vgprValuA_X0_I0+6+0+0:vgprValuA_X0_I0+6+0+0+1], acc[44:47] // left value = acc[44+0:47+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[48:51], v[vgprValuB_X0_I0+6+0+0:vgprValuB_X0_I0+6+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[48:51] // left value = acc[48+0:51+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[52:55], v[vgprValuB_X0_I0+6+0+0:vgprValuB_X0_I0+6+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], acc[52:55] // left value = acc[52+0:55+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[56:59], v[vgprValuB_X0_I0+6+0+0:vgprValuB_X0_I0+6+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[56:59] // left value = acc[56+0:59+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[60:63], v[vgprValuB_X0_I0+6+0+0:vgprValuB_X0_I0+6+0+0+1], v[vgprValuA_X0_I0+6+0+0:vgprValuA_X0_I0+6+0+0+1], acc[60:63] // left value = acc[60+0:63+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[64:67], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[64:67] // left value = acc[64+0:67+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[68:71], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], acc[68:71] // left value = acc[68+0:71+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[72:75], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[72:75] // left value = acc[72+0:75+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[76:79], v[vgprValuB_X0_I0+8+0+0:vgprValuB_X0_I0+8+0+0+1], v[vgprValuA_X0_I0+6+0+0:vgprValuA_X0_I0+6+0+0+1], acc[76:79] // left value = acc[76+0:79+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[80:83], v[vgprValuB_X0_I0+10+0+0:vgprValuB_X0_I0+10+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[80:83] // left value = acc[80+0:83+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[84:87], v[vgprValuB_X0_I0+10+0+0:vgprValuB_X0_I0+10+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], acc[84:87] // left value = acc[84+0:87+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[88:91], v[vgprValuB_X0_I0+10+0+0:vgprValuB_X0_I0+10+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[88:91] // left value = acc[88+0:91+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[92:95], v[vgprValuB_X0_I0+10+0+0:vgprValuB_X0_I0+10+0+0+1], v[vgprValuA_X0_I0+6+0+0:vgprValuA_X0_I0+6+0+0+1], acc[92:95] // left value = acc[92+0:95+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[96:99], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[96:99] // left value = acc[96+0:99+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[100:103], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], acc[100:103] // left value = acc[100+0:103+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[104:107], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[104:107] // left value = acc[104+0:107+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[108:111], v[vgprValuB_X0_I0+12+0+0:vgprValuB_X0_I0+12+0+0+1], v[vgprValuA_X0_I0+6+0+0:vgprValuA_X0_I0+6+0+0+1], acc[108:111] // left value = acc[108+0:111+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[112:115], v[vgprValuB_X0_I0+14+0+0:vgprValuB_X0_I0+14+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[112:115] // left value = acc[112+0:115+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[116:119], v[vgprValuB_X0_I0+14+0+0:vgprValuB_X0_I0+14+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], acc[116:119] // left value = acc[116+0:119+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[120:123], v[vgprValuB_X0_I0+14+0+0:vgprValuB_X0_I0+14+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[120:123] // left value = acc[120+0:123+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[124:127], v[vgprValuB_X0_I0+14+0+0:vgprValuB_X0_I0+14+0+0+1], v[vgprValuA_X0_I0+6+0+0:vgprValuA_X0_I0+6+0+0+1], acc[124:127] // left value = acc[124+0:127+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[128:131], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[128:131] // left value = acc[128+0:131+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[132:135], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], acc[132:135] // left value = acc[132+0:135+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[136:139], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[136:139] // left value = acc[136+0:139+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[140:143], v[vgprValuB_X0_I0+16+0+0:vgprValuB_X0_I0+16+0+0+1], v[vgprValuA_X0_I0+6+0+0:vgprValuA_X0_I0+6+0+0+1], acc[140:143] // left value = acc[140+0:143+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[144:147], v[vgprValuB_X0_I0+18+0+0:vgprValuB_X0_I0+18+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[144:147] // left value = acc[144+0:147+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[148:151], v[vgprValuB_X0_I0+18+0+0:vgprValuB_X0_I0+18+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], acc[148:151] // left value = acc[148+0:151+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[152:155], v[vgprValuB_X0_I0+18+0+0:vgprValuB_X0_I0+18+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[152:155] // left value = acc[152+0:155+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[156:159], v[vgprValuB_X0_I0+18+0+0:vgprValuB_X0_I0+18+0+0+1], v[vgprValuA_X0_I0+6+0+0:vgprValuA_X0_I0+6+0+0+1], acc[156:159] // left value = acc[156+0:159+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[160:163], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[160:163] // left value = acc[160+0:163+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[164:167], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], acc[164:167] // left value = acc[164+0:167+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[168:171], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[168:171] // left value = acc[168+0:171+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[172:175], v[vgprValuB_X0_I0+20+0+0:vgprValuB_X0_I0+20+0+0+1], v[vgprValuA_X0_I0+6+0+0:vgprValuA_X0_I0+6+0+0+1], acc[172:175] // left value = acc[172+0:175+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[176:179], v[vgprValuB_X0_I0+22+0+0:vgprValuB_X0_I0+22+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[176:179] // left value = acc[176+0:179+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[180:183], v[vgprValuB_X0_I0+22+0+0:vgprValuB_X0_I0+22+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], acc[180:183] // left value = acc[180+0:183+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[184:187], v[vgprValuB_X0_I0+22+0+0:vgprValuB_X0_I0+22+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[184:187] // left value = acc[184+0:187+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[188:191], v[vgprValuB_X0_I0+22+0+0:vgprValuB_X0_I0+22+0+0+1], v[vgprValuA_X0_I0+6+0+0:vgprValuA_X0_I0+6+0+0+1], acc[188:191] // left value = acc[188+0:191+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[192:195], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[192:195] // left value = acc[192+0:195+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[196:199], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], acc[196:199] // left value = acc[196+0:199+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[200:203], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[200:203] // left value = acc[200+0:203+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[204:207], v[vgprValuB_X0_I0+24+0+0:vgprValuB_X0_I0+24+0+0+1], v[vgprValuA_X0_I0+6+0+0:vgprValuA_X0_I0+6+0+0+1], acc[204:207] // left value = acc[204+0:207+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[208:211], v[vgprValuB_X0_I0+26+0+0:vgprValuB_X0_I0+26+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[208:211] // left value = acc[208+0:211+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[212:215], v[vgprValuB_X0_I0+26+0+0:vgprValuB_X0_I0+26+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], acc[212:215] // left value = acc[212+0:215+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[216:219], v[vgprValuB_X0_I0+26+0+0:vgprValuB_X0_I0+26+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[216:219] // left value = acc[216+0:219+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[220:223], v[vgprValuB_X0_I0+26+0+0:vgprValuB_X0_I0+26+0+0+1], v[vgprValuA_X0_I0+6+0+0:vgprValuA_X0_I0+6+0+0+1], acc[220:223] // left value = acc[220+0:223+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[224:227], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[224:227] // left value = acc[224+0:227+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[228:231], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], acc[228:231] // left value = acc[228+0:231+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[232:235], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[232:235] // left value = acc[232+0:235+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[236:239], v[vgprValuB_X0_I0+28+0+0:vgprValuB_X0_I0+28+0+0+1], v[vgprValuA_X0_I0+6+0+0:vgprValuA_X0_I0+6+0+0+1], acc[236:239] // left value = acc[236+0:239+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[240:243], v[vgprValuB_X0_I0+30+0+0:vgprValuB_X0_I0+30+0+0+1], v[vgprValuA_X0_I0+0+0+0:vgprValuA_X0_I0+0+0+0+1], acc[240:243] // left value = acc[240+0:243+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[244:247], v[vgprValuB_X0_I0+30+0+0:vgprValuB_X0_I0+30+0+0+1], v[vgprValuA_X0_I0+2+0+0:vgprValuA_X0_I0+2+0+0+1], acc[244:247] // left value = acc[244+0:247+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[248:251], v[vgprValuB_X0_I0+30+0+0:vgprValuB_X0_I0+30+0+0+1], v[vgprValuA_X0_I0+4+0+0:vgprValuA_X0_I0+4+0+0+1], acc[248:251] // left value = acc[248+0:251+0]
v_mfma_f32_16x16x32_fp8_fp8 acc[252:255], v[vgprValuB_X0_I0+30+0+0:vgprValuB_X0_I0+30+0+0+1], v[vgprValuA_X0_I0+6+0+0:vgprValuA_X0_I0+6+0+0+1], acc[252:255] // left value = acc[252+0:255+0]

/* Rotate the A operand registers down by 8 for the next tail iteration:
 *   ValuA_X0_I0+0..7   <- ValuA_X0_I0+8..15
 *   ValuA_X0_I0+8..15  <- ValuA_X2_I0+0..7
 *   ValuA_X2_I0+0..7   <- ValuA_X2_I0+8..15
 * Each destination is read as a source before it is overwritten, so the copy
 * order matters. The tail-loop MFMAs only read ValuA_X0_I0+0..7.
 * NOTE(review): presumably each group of 8 registers holds one 32-deep K slice
 * of A that was loaded straight into VGPRs - confirm against the global-read code.
 */
v_mov_b32 v[vgprValuA_X0_I0+0+0], v[vgprValuA_X0_I0+8+0]
v_mov_b32 v[vgprValuA_X0_I0+1+0], v[vgprValuA_X0_I0+9+0]
v_mov_b32 v[vgprValuA_X0_I0+2+0], v[vgprValuA_X0_I0+10+0]
v_mov_b32 v[vgprValuA_X0_I0+3+0], v[vgprValuA_X0_I0+11+0]
v_mov_b32 v[vgprValuA_X0_I0+4+0], v[vgprValuA_X0_I0+12+0]
v_mov_b32 v[vgprValuA_X0_I0+5+0], v[vgprValuA_X0_I0+13+0]
v_mov_b32 v[vgprValuA_X0_I0+6+0], v[vgprValuA_X0_I0+14+0]
v_mov_b32 v[vgprValuA_X0_I0+7+0], v[vgprValuA_X0_I0+15+0]
v_mov_b32 v[vgprValuA_X0_I0+8+0], v[vgprValuA_X2_I0+0+0]
v_mov_b32 v[vgprValuA_X0_I0+9+0], v[vgprValuA_X2_I0+1+0]
v_mov_b32 v[vgprValuA_X0_I0+10+0], v[vgprValuA_X2_I0+2+0]
v_mov_b32 v[vgprValuA_X0_I0+11+0], v[vgprValuA_X2_I0+3+0]
v_mov_b32 v[vgprValuA_X0_I0+12+0], v[vgprValuA_X2_I0+4+0]
v_mov_b32 v[vgprValuA_X0_I0+13+0], v[vgprValuA_X2_I0+5+0]
v_mov_b32 v[vgprValuA_X0_I0+14+0], v[vgprValuA_X2_I0+6+0]
v_mov_b32 v[vgprValuA_X0_I0+15+0], v[vgprValuA_X2_I0+7+0]
v_mov_b32 v[vgprValuA_X2_I0+0+0], v[vgprValuA_X2_I0+8+0]
v_mov_b32 v[vgprValuA_X2_I0+1+0], v[vgprValuA_X2_I0+9+0]
v_mov_b32 v[vgprValuA_X2_I0+2+0], v[vgprValuA_X2_I0+10+0]
v_mov_b32 v[vgprValuA_X2_I0+3+0], v[vgprValuA_X2_I0+11+0]
v_mov_b32 v[vgprValuA_X2_I0+4+0], v[vgprValuA_X2_I0+12+0]
v_mov_b32 v[vgprValuA_X2_I0+5+0], v[vgprValuA_X2_I0+13+0]
v_mov_b32 v[vgprValuA_X2_I0+6+0], v[vgprValuA_X2_I0+14+0]
v_mov_b32 v[vgprValuA_X2_I0+7+0], v[vgprValuA_X2_I0+15+0]

/* closeLoop loopL finalLoop=1 tailLoop=1 */
/* Each tail iteration consumes 0x20 (32) K elements: LoopCounterL counts down
 * by 32 and OrigLoopCounter counts up by 32. The loop repeats while the signed
 * LoopCounterL is still > 0 and falls through to the end labels otherwise.
 */
s_sub_i32 s[sgprLoopCounterL], s[sgprLoopCounterL], 0x20 // dec counterL (tailLoop)
s_add_u32 s[sgprOrigLoopCounter], s[sgprOrigLoopCounter], 0x20 // inc counterL
s_cmp_le_i32 s[sgprLoopCounterL], 0x0              // counterL<=0
s_cbranch_scc0 label_TailLoopBeginL                // restart LoopL (taken when counterL > 0)
label_TailLoopEndL:
label_SkipTailLoopL:

/* Tail: remove address/G2L [160...230) from pool */
/* End of summation: retire the main-loop SGPR names and load the epilogue
 * arguments.
 *
 * The .set lines are assemble-time only. The first group points the main-loop
 * symbols at UNDEF; the second group assigns s48..s60 to the epilogue arguments.
 * NOTE(review): UNDEF is defined outside this excerpt; presumably a sentinel
 * that makes any later use of a retired name fail or stand out - confirm.
 *
 * Runtime behaviour: when (sgprGSU & 0x3fff) == 1, s[48:60] are loaded from the
 * kernel-argument buffer, using one of two offset layouts selected by
 * ArgType == 2. When it is not 1, the loads are skipped entirely.
 * No s_waitcnt is issued in this block, so the loaded SGPRs must not be read
 * until a later lgkmcnt wait. Clobbers s8 and SCC.
 */
label_Summation_End_T9OQ8US7MJS7S5B0_0:
/* endSummation: add vgpr [0...230) to pool */
.set sgprWGM, UNDEF
.set sgprLoopCounterL, UNDEF
.set sgprOrigLoopCounter, UNDEF
.set sgprAddressA, UNDEF
.set sgprAddressB, UNDEF
.set sgprStridesA, UNDEF
.set sgprStridesB, UNDEF
.set sgprStaggerUIter, UNDEF
.set sgprSrdA, UNDEF
.set sgprSrdB, UNDEF
.set sgprShadowLimitA, UNDEF
.set sgprShadowLimitB, UNDEF
.set sgprWrapUA, UNDEF
.set sgprWrapUB, UNDEF
.set sgprGlobalReadIncsA, UNDEF
.set sgprGlobalReadIncsB, UNDEF
.set sgprScalarGlobalReadOffsetA, UNDEF
.set sgprScalarGlobalReadOffsetB, UNDEF
/* load store sgprs */
.set sgprAddressScaleA, 48
.set sgprAddressScaleB, 50
.set sgprAddressScaleAlphaVec, 52
.set sgprAddressBias, 54
.set sgprBiasType, 56
.set sgprBiasStride, 57
.set sgpractivationAlpha, 58
.set sgpractivationBeta, 59
.set sgprActivationType, 60
s_and_b32 s8, s[sgprGSU], 0x3fff                   // Restore GSU (keep the low 14 bits)
s_cmp_eq_u32 s8, 1                                 // GSU == 1 ?
s_cbranch_scc0 label_GSU_4                         // branch if GSU != 1
/* Check if custom structure pointer is null */
s_cmp_eq_u32 s[sgprArgType], 2                     // ArgType == 2 ?
s_cbranch_scc1 label_LoadExternalEpilogueStruct    // branch if ArgType == 2
// ArgType != 2: the 13 dwords are contiguous at kernarg offsets 0x58..0x8b
s_load_dwordx8 s[48:55], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x58 // AddressScaleA, AddressScaleB, AddressScaleAlphaVec, AddressBias
s_load_dwordx4 s[56:59], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x78 // BiasType, BiasStride, activationAlpha, activationBeta
s_load_dword s60, s[sgprKernArgAddress:sgprKernArgAddress+1], 0x88 // ActivationType
s_branch label_LoadExternalEpilogueStructEnd
label_LoadExternalEpilogueStruct:
// ArgType == 2: same destination registers, read from non-contiguous offsets
s_load_dwordx4 s[48:51], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x70 // AddressScaleA, AddressScaleB
s_load_dwordx4 s[52:55], s[sgprKernArgAddress:sgprKernArgAddress+1], 0x90 // AddressScaleAlphaVec, AddressBias
s_load_dwordx2 s[56:57], s[sgprKernArgAddress:sgprKernArgAddress+1], 0xa0 // BiasType, BiasStride
s_load_dwordx2 s[58:59], s[sgprKernArgAddress:sgprKernArgAddress+1], 0xb8 // activationAlpha, activationBeta
s_load_dword s60, s[sgprKernArgAddress:sgprKernArgAddress+1], 0xc0 // ActivationType
label_LoadExternalEpilogueStructEnd:
label_GSU_4:
.set sgprSrdScaleA, 28
.set sgprSrdScaleB, 32
.set sgprSrdScaleAlphaVec, 40
.set sgprSrdBias, 64

/* Mapping of Acc register -> C Vgpr register */

/* not-LocalSplitU: global write indices */
/* computeStoreVgprs */
/* Per-lane output coordinates for the global write.
 * With wave = serial / 64 and lane = serial % 64:
 *   v1 (coord 1, tile-local) = (16 * (wave / 4) + 4 * (lane / 16)) << 4
 *   v2 = v1 * StrideC1J,  v3 = v1 * StrideD1J   (computed from the tile-local v1,
 *                                                before the workgroup offset is added)
 *   v0 (coord 0) = ((serial % 16) + 16 * (wave % 4)) << 2, plus 256 * WorkGroup0
 *   v1 (coord 1) += 256 * WorkGroup1
 * Clobbers v4, v5 and s8.
 */
v_lshrrev_b32 v4, 6, v[vgprSerial]                 // v4 = v[vgprSerial] / 64
v_lshrrev_b32 v5, 2, v4                            // v5 = v4 / 4
v_mul_lo_u32 v5, 0x10, v5                          // wave coordination offset 1
v_and_b32 v1, 63, v[vgprSerial]                    // v1 = v[vgprSerial] % 64
v_lshrrev_b32 v1, 4, v1                            // v1 = v1 / 16
v_lshlrev_b32 v1, 0x2, v1                          // thread0 * continuous_output
v_add_lshl_u32 v1, v5, v1, 4                       // coordination 1 = vwB *(wave_id1 + tid1), i.e. (v5 + v1) << 4
v_mul_lo_u32 v2, v1, s[sgprStrideC1J]              //  offset 1 (C): tile-local coord1 * StrideC1J
v_mul_lo_u32 v3, v1, s[sgprStrideD1J]              //  offset 1 (D): tile-local coord1 * StrideD1J
v_and_b32 v0, 3, v4                                // v0 = v4 % 4
v_mul_lo_u32 v0, 0x10, v0                          // wave coordination offset 0
v_and_b32 v5, 15, v[vgprSerial]                    // v5 = v[vgprSerial] % 16
v_add_lshl_u32 v0, v5, v0, 2                       // coordination 0 = vwA * (wave_id0 + tid0), i.e. (v5 + v0) << 2
s_mul_i32 s8, 256, s[sgprWorkGroup0]               // wgp0 * MT0
v_add_u32 v0, s8, v0                               // coord 0 = tile-local coord 0 + 256 * WorkGroup0
s_mul_i32 s8, 256, s[sgprWorkGroup1]               // wgp1 * MT1
v_add_u32 v1, s8, v1                               // coord 1 = tile-local coord 1 + 256 * WorkGroup1

/* not-LocalSplitU: global write */

/******************************************/
/* Global Write Elements                  */
/******************************************/
/* Global-write dispatch.
 * 1. Wait for the outstanding scalar loads (lgkmcnt(0)).
 * 2. If (sgprGSU & 0x3fff) == 1, branch to label_GSU_5 (defined outside this excerpt).
 * 3. Otherwise compute the remainder of the problem size against the 256-wide
 *    tile for the last workgroup in each dimension:
 *      rMT0 = (WorkGroup0 >= NumWorkGroups0 - 1) ? SizeI % 256 : 0
 *      rMT1 = (WorkGroup1 >= NumWorkGroups1 - 1) ? SizeJ % 256 : 0
 *    A non-zero rMT0 jumps to label_GW_B0_E1_M, a non-zero rMT1 to
 *    label_GW_B0_E1_N, and otherwise execution falls through to the code that
 *    follows this block.
 * The .set ... UNDEF lines are assemble-time only: they apply to all text that
 * follows them in the file, independent of the runtime branch above.
 * Clobbers s8, s28, s29 and SCC.
 */
s_waitcnt lgkmcnt(0)                               // wait for 52 bytes of kern args.
s_and_b32 s8, s[sgprGSU], 0x3fff                   // Restore GSU
s_cmp_eq_u32 s8, 1                                 // GSU == 1 ?
s_cbranch_scc1 label_GSU_5                         // branch if GSU == 1
.set sgprAddressScaleA, UNDEF
.set sgprSrdScaleA, UNDEF
.set sgprAddressScaleB, UNDEF
.set sgprSrdScaleB, UNDEF
.set sgprAddressScaleAlphaVec, UNDEF
.set sgprSrdScaleAlphaVec, UNDEF
s_and_b32 s28, 255, s[sgprSizeI]                   // s28 = s[sgprSizeI] % 256
s_add_u32 s29, -0x1, s[sgprNumWorkGroups0]         // s29 = nwg0 - 1
s_cmp_ge_u32 s[sgprWorkGroup0], s29                // wg0 >= nwg0-1 ?
s_cselect_b32 s28, s28, 0                          // set rMT0
s_cmpk_gt_u32 s28, 0x0                             // rMT0 > 0
s_cbranch_scc1 label_GW_B0_E1_M                    // jump if edges required
s_and_b32 s28, 255, s[sgprSizeJ]                   // s28 = s[sgprSizeJ] % 256
s_add_u32 s29, -0x1, s[sgprNumWorkGroups1]         // s29 = nwg1 - 1
s_cmp_ge_u32 s[sgprWorkGroup1], s29                // wg1 >= nwg1-1
s_cselect_b32 s28, s28, 0                          // set rMT1
s_cmpk_gt_u32 s28, 0x0                             // rMT1 > 0
s_cbranch_scc1 label_GW_B0_E1_N                    // jump if edges required
label_GW_B0_E0:
/* Entry of the no-edge write path: reached by fall-through when neither
   edge branch above was taken. This part of batch #0 builds the per-lane
   byte offset in v10 and copies 64 accumulator registers into VGPRs. */

/* edge=0, allocate 2 sgpr. perBatchTmpS=2 perBatchMaskS=0 perElementMaskS=0 elementsPerBatch=16 */
/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 factorDim=0 */

/******************************************/
/* Global Write Batch #0 (d1,d0,vc1,vc0) = */
/*    (0,0,0,0:vw4); (0,0,1,0:vw4); (0,0,2,0:vw4); (0,0,3,0:vw4); (0,0,4,0:vw4); (0,0,5,0:vw4); (0,0,6,0:vw4); (0,0,7,0:vw4); (0,0,8,0:vw4); (0,0,9,0:vw4); (0,0,10,0:vw4); (0,0,11,0:vw4); (0,0,12,0:vw4); (0,0,13,0:vw4); (0,0,14,0:vw4); (0,0,15,0:vw4) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
/* (d1,vc1,d0,vc0)=(0,0,0,0) */
/* (d1,vc1,d0,vc0)=(0,1,0,0) */
/* (d1,vc1,d0,vc0)=(0,2,0,0) */
/* (d1,vc1,d0,vc0)=(0,3,0,0) */
/* (d1,vc1,d0,vc0)=(0,4,0,0) */
/* (d1,vc1,d0,vc0)=(0,5,0,0) */
/* (d1,vc1,d0,vc0)=(0,6,0,0) */
/* (d1,vc1,d0,vc0)=(0,7,0,0) */
/* (d1,vc1,d0,vc0)=(0,8,0,0) */
/* (d1,vc1,d0,vc0)=(0,9,0,0) */
/* (d1,vc1,d0,vc0)=(0,10,0,0) */
/* (d1,vc1,d0,vc0)=(0,11,0,0) */
/* (d1,vc1,d0,vc0)=(0,12,0,0) */
/* (d1,vc1,d0,vc0)=(0,13,0,0) */
/* (d1,vc1,d0,vc0)=(0,14,0,0) */
/* (d1,vc1,d0,vc0)=(0,15,0,0) */
// v10 = (v3 + v0) << 2. v3 was set to v1 * StrideD1J and v0 to the dim-0
// coordinate just before this section, so v10 is an element index scaled by 4.
// v10 is the only VGPR offset used by every store of this batch.
v_add_lshl_u32 v10, v3, v0, 0x2                    // optSingleColVgpr scaleToBpe: sharedAddrVgpr <- cinRowPtr + coord0, scaled by BPE. BSHERE:coord0=0, coord0Vgpr=0
// Accumulator read-out: v[vgprValuC+12+i] = acc[4*i] for i = 0..63, i.e.
// every fourth accumulator starting at acc0, packed into 64 consecutive VGPRs.
v_accvgpr_read_b32 v[vgprValuC+12], acc0           // copy acc to vreg[0]
v_accvgpr_read_b32 v[vgprValuC+13], acc4           // copy acc to vreg[1]
v_accvgpr_read_b32 v[vgprValuC+14], acc8           // copy acc to vreg[2]
v_accvgpr_read_b32 v[vgprValuC+15], acc12          // copy acc to vreg[3]
v_accvgpr_read_b32 v[vgprValuC+16], acc16          // copy acc to vreg[4]
v_accvgpr_read_b32 v[vgprValuC+17], acc20          // copy acc to vreg[5]
v_accvgpr_read_b32 v[vgprValuC+18], acc24          // copy acc to vreg[6]
v_accvgpr_read_b32 v[vgprValuC+19], acc28          // copy acc to vreg[7]
v_accvgpr_read_b32 v[vgprValuC+20], acc32          // copy acc to vreg[8]
v_accvgpr_read_b32 v[vgprValuC+21], acc36          // copy acc to vreg[9]
v_accvgpr_read_b32 v[vgprValuC+22], acc40          // copy acc to vreg[10]
v_accvgpr_read_b32 v[vgprValuC+23], acc44          // copy acc to vreg[11]
v_accvgpr_read_b32 v[vgprValuC+24], acc48          // copy acc to vreg[12]
v_accvgpr_read_b32 v[vgprValuC+25], acc52          // copy acc to vreg[13]
v_accvgpr_read_b32 v[vgprValuC+26], acc56          // copy acc to vreg[14]
v_accvgpr_read_b32 v[vgprValuC+27], acc60          // copy acc to vreg[15]
v_accvgpr_read_b32 v[vgprValuC+28], acc64          // copy acc to vreg[16]
v_accvgpr_read_b32 v[vgprValuC+29], acc68          // copy acc to vreg[17]
v_accvgpr_read_b32 v[vgprValuC+30], acc72          // copy acc to vreg[18]
v_accvgpr_read_b32 v[vgprValuC+31], acc76          // copy acc to vreg[19]
v_accvgpr_read_b32 v[vgprValuC+32], acc80          // copy acc to vreg[20]
v_accvgpr_read_b32 v[vgprValuC+33], acc84          // copy acc to vreg[21]
v_accvgpr_read_b32 v[vgprValuC+34], acc88          // copy acc to vreg[22]
v_accvgpr_read_b32 v[vgprValuC+35], acc92          // copy acc to vreg[23]
v_accvgpr_read_b32 v[vgprValuC+36], acc96          // copy acc to vreg[24]
v_accvgpr_read_b32 v[vgprValuC+37], acc100         // copy acc to vreg[25]
v_accvgpr_read_b32 v[vgprValuC+38], acc104         // copy acc to vreg[26]
v_accvgpr_read_b32 v[vgprValuC+39], acc108         // copy acc to vreg[27]
v_accvgpr_read_b32 v[vgprValuC+40], acc112         // copy acc to vreg[28]
v_accvgpr_read_b32 v[vgprValuC+41], acc116         // copy acc to vreg[29]
v_accvgpr_read_b32 v[vgprValuC+42], acc120         // copy acc to vreg[30]
v_accvgpr_read_b32 v[vgprValuC+43], acc124         // copy acc to vreg[31]
v_accvgpr_read_b32 v[vgprValuC+44], acc128         // copy acc to vreg[32]
v_accvgpr_read_b32 v[vgprValuC+45], acc132         // copy acc to vreg[33]
v_accvgpr_read_b32 v[vgprValuC+46], acc136         // copy acc to vreg[34]
v_accvgpr_read_b32 v[vgprValuC+47], acc140         // copy acc to vreg[35]
v_accvgpr_read_b32 v[vgprValuC+48], acc144         // copy acc to vreg[36]
v_accvgpr_read_b32 v[vgprValuC+49], acc148         // copy acc to vreg[37]
v_accvgpr_read_b32 v[vgprValuC+50], acc152         // copy acc to vreg[38]
v_accvgpr_read_b32 v[vgprValuC+51], acc156         // copy acc to vreg[39]
v_accvgpr_read_b32 v[vgprValuC+52], acc160         // copy acc to vreg[40]
v_accvgpr_read_b32 v[vgprValuC+53], acc164         // copy acc to vreg[41]
v_accvgpr_read_b32 v[vgprValuC+54], acc168         // copy acc to vreg[42]
v_accvgpr_read_b32 v[vgprValuC+55], acc172         // copy acc to vreg[43]
v_accvgpr_read_b32 v[vgprValuC+56], acc176         // copy acc to vreg[44]
v_accvgpr_read_b32 v[vgprValuC+57], acc180         // copy acc to vreg[45]
v_accvgpr_read_b32 v[vgprValuC+58], acc184         // copy acc to vreg[46]
v_accvgpr_read_b32 v[vgprValuC+59], acc188         // copy acc to vreg[47]
v_accvgpr_read_b32 v[vgprValuC+60], acc192         // copy acc to vreg[48]
v_accvgpr_read_b32 v[vgprValuC+61], acc196         // copy acc to vreg[49]
v_accvgpr_read_b32 v[vgprValuC+62], acc200         // copy acc to vreg[50]
v_accvgpr_read_b32 v[vgprValuC+63], acc204         // copy acc to vreg[51]
v_accvgpr_read_b32 v[vgprValuC+64], acc208         // copy acc to vreg[52]
v_accvgpr_read_b32 v[vgprValuC+65], acc212         // copy acc to vreg[53]
v_accvgpr_read_b32 v[vgprValuC+66], acc216         // copy acc to vreg[54]
v_accvgpr_read_b32 v[vgprValuC+67], acc220         // copy acc to vreg[55]
v_accvgpr_read_b32 v[vgprValuC+68], acc224         // copy acc to vreg[56]
v_accvgpr_read_b32 v[vgprValuC+69], acc228         // copy acc to vreg[57]
v_accvgpr_read_b32 v[vgprValuC+70], acc232         // copy acc to vreg[58]
v_accvgpr_read_b32 v[vgprValuC+71], acc236         // copy acc to vreg[59]
v_accvgpr_read_b32 v[vgprValuC+72], acc240         // copy acc to vreg[60]
v_accvgpr_read_b32 v[vgprValuC+73], acc244         // copy acc to vreg[61]
v_accvgpr_read_b32 v[vgprValuC+74], acc248         // copy acc to vreg[62]
v_accvgpr_read_b32 v[vgprValuC+75], acc252         // copy acc to vreg[63]

/* rC *= alpha batchElements=[(0, 0, 0, 0), (0, 0, 1, 0), (0, 0, 2, 0), (0, 0, 3, 0), (0, 0, 4, 0), (0, 0, 5, 0), (0, 0, 6, 0), (0, 0, 7, 0), (0, 0, 8, 0), (0, 0, 9, 0), (0, 0, 10, 0), (0, 0, 11, 0), (0, 0, 12, 0), (0, 0, 13, 0), (0, 0, 14, 0), (0, 0, 15, 0)] */

/* apply mask, calc new C and issue writes */
/* Batch #0 write-out: 16 buffer_store_dwordx4 instructions, each writing four
   consecutive VGPRs (v12..v75 in total) at the per-lane offset v10.
   The first store uses the D descriptor as it stands; before each of the
   remaining 15 stores the 64-bit base held in SrdD[0:1] is advanced by
   StrideD1J << 2 bytes, so consecutive stores land one row apart. The
   descriptor is modified in place and is NOT restored afterwards.
   NOTE(review): the stores name v[12:15]..v[72:75] literally, whereas the
   accumulator copies above target v[vgprValuC+12..75]; the two agree only if
   vgprValuC == 0 -- TODO confirm against the symbol definition.
   NOTE(review): v7, v8 and v9 are loaded here but no instruction in the
   visible part of this batch reads them; they may be consumed elsewhere. */
v_mov_b32 v7, 0xffff0000                           // mask for pack two bfloat16 element to 32bit
v_mov_b32 v8, 0x7fff0000                           // fp32 Nan
v_mov_b32 v9, 0x7fff                               // rounding bias for bfloat16
buffer_store_dwordx4 v[12:15], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D (row 0 of the batch: no SRD step precedes it)
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: s12 = StrideD1J << 2 (recomputed before every step)
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper), carry from the add above
buffer_store_dwordx4 v[16:19], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[20:23], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[24:27], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[28:31], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[32:35], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[36:39], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[40:43], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[44:47], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[48:51], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[52:55], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[56:59], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[60:63], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[64:67], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[68:71], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[72:75], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D (last row of the batch)
// The instructions that follow overwrite the same VGPRs with the next batch.
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 factorDim=0 */

/******************************************/
/* Global Write Batch #1 (d1,d0,vc1,vc0) = */
/*    (0,0,16,0:vw4); (0,0,17,0:vw4); (0,0,18,0:vw4); (0,0,19,0:vw4); (0,0,20,0:vw4); (0,0,21,0:vw4); (0,0,22,0:vw4); (0,0,23,0:vw4); (0,0,24,0:vw4); (0,0,25,0:vw4); (0,0,26,0:vw4); (0,0,27,0:vw4); (0,0,28,0:vw4); (0,0,29,0:vw4); (0,0,30,0:vw4); (0,0,31,0:vw4) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
/* (d1,vc1,d0,vc0)=(0,16,0,0) */
/* (d1,vc1,d0,vc0)=(0,17,0,0) */
/* (d1,vc1,d0,vc0)=(0,18,0,0) */
/* (d1,vc1,d0,vc0)=(0,19,0,0) */
/* (d1,vc1,d0,vc0)=(0,20,0,0) */
/* (d1,vc1,d0,vc0)=(0,21,0,0) */
/* (d1,vc1,d0,vc0)=(0,22,0,0) */
/* (d1,vc1,d0,vc0)=(0,23,0,0) */
/* (d1,vc1,d0,vc0)=(0,24,0,0) */
/* (d1,vc1,d0,vc0)=(0,25,0,0) */
/* (d1,vc1,d0,vc0)=(0,26,0,0) */
/* (d1,vc1,d0,vc0)=(0,27,0,0) */
/* (d1,vc1,d0,vc0)=(0,28,0,0) */
/* (d1,vc1,d0,vc0)=(0,29,0,0) */
/* (d1,vc1,d0,vc0)=(0,30,0,0) */
/* (d1,vc1,d0,vc0)=(0,31,0,0) */
// Batch #1 accumulator read-out: v[vgprValuC+12+i] = acc[4*i + 1] for
// i = 0..63, i.e. every fourth accumulator starting at acc1. The destination
// VGPRs are the same 64 registers batch #0 used, so this overwrites that data
// (the s_nop issued after batch #0's last store covers the required wait state).
v_accvgpr_read_b32 v[vgprValuC+12], acc1           // copy acc to vreg[64]
v_accvgpr_read_b32 v[vgprValuC+13], acc5           // copy acc to vreg[65]
v_accvgpr_read_b32 v[vgprValuC+14], acc9           // copy acc to vreg[66]
v_accvgpr_read_b32 v[vgprValuC+15], acc13          // copy acc to vreg[67]
v_accvgpr_read_b32 v[vgprValuC+16], acc17          // copy acc to vreg[68]
v_accvgpr_read_b32 v[vgprValuC+17], acc21          // copy acc to vreg[69]
v_accvgpr_read_b32 v[vgprValuC+18], acc25          // copy acc to vreg[70]
v_accvgpr_read_b32 v[vgprValuC+19], acc29          // copy acc to vreg[71]
v_accvgpr_read_b32 v[vgprValuC+20], acc33          // copy acc to vreg[72]
v_accvgpr_read_b32 v[vgprValuC+21], acc37          // copy acc to vreg[73]
v_accvgpr_read_b32 v[vgprValuC+22], acc41          // copy acc to vreg[74]
v_accvgpr_read_b32 v[vgprValuC+23], acc45          // copy acc to vreg[75]
v_accvgpr_read_b32 v[vgprValuC+24], acc49          // copy acc to vreg[76]
v_accvgpr_read_b32 v[vgprValuC+25], acc53          // copy acc to vreg[77]
v_accvgpr_read_b32 v[vgprValuC+26], acc57          // copy acc to vreg[78]
v_accvgpr_read_b32 v[vgprValuC+27], acc61          // copy acc to vreg[79]
v_accvgpr_read_b32 v[vgprValuC+28], acc65          // copy acc to vreg[80]
v_accvgpr_read_b32 v[vgprValuC+29], acc69          // copy acc to vreg[81]
v_accvgpr_read_b32 v[vgprValuC+30], acc73          // copy acc to vreg[82]
v_accvgpr_read_b32 v[vgprValuC+31], acc77          // copy acc to vreg[83]
v_accvgpr_read_b32 v[vgprValuC+32], acc81          // copy acc to vreg[84]
v_accvgpr_read_b32 v[vgprValuC+33], acc85          // copy acc to vreg[85]
v_accvgpr_read_b32 v[vgprValuC+34], acc89          // copy acc to vreg[86]
v_accvgpr_read_b32 v[vgprValuC+35], acc93          // copy acc to vreg[87]
v_accvgpr_read_b32 v[vgprValuC+36], acc97          // copy acc to vreg[88]
v_accvgpr_read_b32 v[vgprValuC+37], acc101         // copy acc to vreg[89]
v_accvgpr_read_b32 v[vgprValuC+38], acc105         // copy acc to vreg[90]
v_accvgpr_read_b32 v[vgprValuC+39], acc109         // copy acc to vreg[91]
v_accvgpr_read_b32 v[vgprValuC+40], acc113         // copy acc to vreg[92]
v_accvgpr_read_b32 v[vgprValuC+41], acc117         // copy acc to vreg[93]
v_accvgpr_read_b32 v[vgprValuC+42], acc121         // copy acc to vreg[94]
v_accvgpr_read_b32 v[vgprValuC+43], acc125         // copy acc to vreg[95]
v_accvgpr_read_b32 v[vgprValuC+44], acc129         // copy acc to vreg[96]
v_accvgpr_read_b32 v[vgprValuC+45], acc133         // copy acc to vreg[97]
v_accvgpr_read_b32 v[vgprValuC+46], acc137         // copy acc to vreg[98]
v_accvgpr_read_b32 v[vgprValuC+47], acc141         // copy acc to vreg[99]
v_accvgpr_read_b32 v[vgprValuC+48], acc145         // copy acc to vreg[100]
v_accvgpr_read_b32 v[vgprValuC+49], acc149         // copy acc to vreg[101]
v_accvgpr_read_b32 v[vgprValuC+50], acc153         // copy acc to vreg[102]
v_accvgpr_read_b32 v[vgprValuC+51], acc157         // copy acc to vreg[103]
v_accvgpr_read_b32 v[vgprValuC+52], acc161         // copy acc to vreg[104]
v_accvgpr_read_b32 v[vgprValuC+53], acc165         // copy acc to vreg[105]
v_accvgpr_read_b32 v[vgprValuC+54], acc169         // copy acc to vreg[106]
v_accvgpr_read_b32 v[vgprValuC+55], acc173         // copy acc to vreg[107]
v_accvgpr_read_b32 v[vgprValuC+56], acc177         // copy acc to vreg[108]
v_accvgpr_read_b32 v[vgprValuC+57], acc181         // copy acc to vreg[109]
v_accvgpr_read_b32 v[vgprValuC+58], acc185         // copy acc to vreg[110]
v_accvgpr_read_b32 v[vgprValuC+59], acc189         // copy acc to vreg[111]
v_accvgpr_read_b32 v[vgprValuC+60], acc193         // copy acc to vreg[112]
v_accvgpr_read_b32 v[vgprValuC+61], acc197         // copy acc to vreg[113]
v_accvgpr_read_b32 v[vgprValuC+62], acc201         // copy acc to vreg[114]
v_accvgpr_read_b32 v[vgprValuC+63], acc205         // copy acc to vreg[115]
v_accvgpr_read_b32 v[vgprValuC+64], acc209         // copy acc to vreg[116]
v_accvgpr_read_b32 v[vgprValuC+65], acc213         // copy acc to vreg[117]
v_accvgpr_read_b32 v[vgprValuC+66], acc217         // copy acc to vreg[118]
v_accvgpr_read_b32 v[vgprValuC+67], acc221         // copy acc to vreg[119]
v_accvgpr_read_b32 v[vgprValuC+68], acc225         // copy acc to vreg[120]
v_accvgpr_read_b32 v[vgprValuC+69], acc229         // copy acc to vreg[121]
v_accvgpr_read_b32 v[vgprValuC+70], acc233         // copy acc to vreg[122]
v_accvgpr_read_b32 v[vgprValuC+71], acc237         // copy acc to vreg[123]
v_accvgpr_read_b32 v[vgprValuC+72], acc241         // copy acc to vreg[124]
v_accvgpr_read_b32 v[vgprValuC+73], acc245         // copy acc to vreg[125]
v_accvgpr_read_b32 v[vgprValuC+74], acc249         // copy acc to vreg[126]
v_accvgpr_read_b32 v[vgprValuC+75], acc253         // copy acc to vreg[127]

/* rC *= alpha batchElements=[(0, 0, 16, 0), (0, 0, 17, 0), (0, 0, 18, 0), (0, 0, 19, 0), (0, 0, 20, 0), (0, 0, 21, 0), (0, 0, 22, 0), (0, 0, 23, 0), (0, 0, 24, 0), (0, 0, 25, 0), (0, 0, 26, 0), (0, 0, 27, 0), (0, 0, 28, 0), (0, 0, 29, 0), (0, 0, 30, 0), (0, 0, 31, 0)] */

/* apply mask, calc new C and issue writes */
/* Batch #1 write-out: 16 buffer_store_dwordx4 instructions over v12..v75 at
   the per-lane offset v10. Unlike batch #0, an SRD step precedes EVERY store
   here (16 steps), so the first store of this batch lands one row after the
   last store of batch #0. Each step adds StrideD1J << 2 to the 64-bit base in
   SrdD[0:1]; the descriptor is modified in place and not restored.
   NOTE(review): the stores name v[12:15]..v[72:75] literally, whereas the
   accumulator copies target v[vgprValuC+12..75]; the two agree only if
   vgprValuC == 0 -- TODO confirm against the symbol definition.
   NOTE(review): v7, v8 and v9 are reloaded with the same constants as in
   batch #0 and are not read by any instruction in this batch. */
v_mov_b32 v7, 0xffff0000                           // mask for pack two bfloat16 element to 32bit
v_mov_b32 v8, 0x7fff0000                           // fp32 Nan
v_mov_b32 v9, 0x7fff                               // rounding bias for bfloat16
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: s12 = StrideD1J << 2 (recomputed before every step)
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper), carry from the add above
buffer_store_dwordx4 v[12:15], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D (first row of this batch)
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[16:19], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[20:23], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[24:27], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[28:31], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[32:35], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[36:39], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[40:43], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[44:47], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[48:51], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[52:55], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[56:59], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[60:63], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[64:67], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[68:71], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: Scale by BPE
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[72:75], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D (last row of the batch)
// The instructions that follow overwrite the same VGPRs with the next batch.
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 factorDim=0 */

/******************************************/
/* Global Write Batch #2 (d1,d0,vc1,vc0) = */
/*    (0,0,32,0:vw4); (0,0,33,0:vw4); (0,0,34,0:vw4); (0,0,35,0:vw4); (0,0,36,0:vw4); (0,0,37,0:vw4); (0,0,38,0:vw4); (0,0,39,0:vw4); (0,0,40,0:vw4); (0,0,41,0:vw4); (0,0,42,0:vw4); (0,0,43,0:vw4); (0,0,44,0:vw4); (0,0,45,0:vw4); (0,0,46,0:vw4); (0,0,47,0:vw4) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
/* (d1,vc1,d0,vc0)=(0,32,0,0) */
/* (d1,vc1,d0,vc0)=(0,33,0,0) */
/* (d1,vc1,d0,vc0)=(0,34,0,0) */
/* (d1,vc1,d0,vc0)=(0,35,0,0) */
/* (d1,vc1,d0,vc0)=(0,36,0,0) */
/* (d1,vc1,d0,vc0)=(0,37,0,0) */
/* (d1,vc1,d0,vc0)=(0,38,0,0) */
/* (d1,vc1,d0,vc0)=(0,39,0,0) */
/* (d1,vc1,d0,vc0)=(0,40,0,0) */
/* (d1,vc1,d0,vc0)=(0,41,0,0) */
/* (d1,vc1,d0,vc0)=(0,42,0,0) */
/* (d1,vc1,d0,vc0)=(0,43,0,0) */
/* (d1,vc1,d0,vc0)=(0,44,0,0) */
/* (d1,vc1,d0,vc0)=(0,45,0,0) */
/* (d1,vc1,d0,vc0)=(0,46,0,0) */
/* (d1,vc1,d0,vc0)=(0,47,0,0) */
v_accvgpr_read_b32 v[vgprValuC+12], acc2           // copy acc to vreg[128]
v_accvgpr_read_b32 v[vgprValuC+13], acc6           // copy acc to vreg[129]
v_accvgpr_read_b32 v[vgprValuC+14], acc10          // copy acc to vreg[130]
v_accvgpr_read_b32 v[vgprValuC+15], acc14          // copy acc to vreg[131]
v_accvgpr_read_b32 v[vgprValuC+16], acc18          // copy acc to vreg[132]
v_accvgpr_read_b32 v[vgprValuC+17], acc22          // copy acc to vreg[133]
v_accvgpr_read_b32 v[vgprValuC+18], acc26          // copy acc to vreg[134]
v_accvgpr_read_b32 v[vgprValuC+19], acc30          // copy acc to vreg[135]
v_accvgpr_read_b32 v[vgprValuC+20], acc34          // copy acc to vreg[136]
v_accvgpr_read_b32 v[vgprValuC+21], acc38          // copy acc to vreg[137]
v_accvgpr_read_b32 v[vgprValuC+22], acc42          // copy acc to vreg[138]
v_accvgpr_read_b32 v[vgprValuC+23], acc46          // copy acc to vreg[139]
v_accvgpr_read_b32 v[vgprValuC+24], acc50          // copy acc to vreg[140]
v_accvgpr_read_b32 v[vgprValuC+25], acc54          // copy acc to vreg[141]
v_accvgpr_read_b32 v[vgprValuC+26], acc58          // copy acc to vreg[142]
v_accvgpr_read_b32 v[vgprValuC+27], acc62          // copy acc to vreg[143]
v_accvgpr_read_b32 v[vgprValuC+28], acc66          // copy acc to vreg[144]
v_accvgpr_read_b32 v[vgprValuC+29], acc70          // copy acc to vreg[145]
v_accvgpr_read_b32 v[vgprValuC+30], acc74          // copy acc to vreg[146]
v_accvgpr_read_b32 v[vgprValuC+31], acc78          // copy acc to vreg[147]
v_accvgpr_read_b32 v[vgprValuC+32], acc82          // copy acc to vreg[148]
v_accvgpr_read_b32 v[vgprValuC+33], acc86          // copy acc to vreg[149]
v_accvgpr_read_b32 v[vgprValuC+34], acc90          // copy acc to vreg[150]
v_accvgpr_read_b32 v[vgprValuC+35], acc94          // copy acc to vreg[151]
v_accvgpr_read_b32 v[vgprValuC+36], acc98          // copy acc to vreg[152]
v_accvgpr_read_b32 v[vgprValuC+37], acc102         // copy acc to vreg[153]
v_accvgpr_read_b32 v[vgprValuC+38], acc106         // copy acc to vreg[154]
v_accvgpr_read_b32 v[vgprValuC+39], acc110         // copy acc to vreg[155]
v_accvgpr_read_b32 v[vgprValuC+40], acc114         // copy acc to vreg[156]
v_accvgpr_read_b32 v[vgprValuC+41], acc118         // copy acc to vreg[157]
v_accvgpr_read_b32 v[vgprValuC+42], acc122         // copy acc to vreg[158]
v_accvgpr_read_b32 v[vgprValuC+43], acc126         // copy acc to vreg[159]
v_accvgpr_read_b32 v[vgprValuC+44], acc130         // copy acc to vreg[160]
v_accvgpr_read_b32 v[vgprValuC+45], acc134         // copy acc to vreg[161]
v_accvgpr_read_b32 v[vgprValuC+46], acc138         // copy acc to vreg[162]
v_accvgpr_read_b32 v[vgprValuC+47], acc142         // copy acc to vreg[163]
v_accvgpr_read_b32 v[vgprValuC+48], acc146         // copy acc to vreg[164]
v_accvgpr_read_b32 v[vgprValuC+49], acc150         // copy acc to vreg[165]
v_accvgpr_read_b32 v[vgprValuC+50], acc154         // copy acc to vreg[166]
v_accvgpr_read_b32 v[vgprValuC+51], acc158         // copy acc to vreg[167]
v_accvgpr_read_b32 v[vgprValuC+52], acc162         // copy acc to vreg[168]
v_accvgpr_read_b32 v[vgprValuC+53], acc166         // copy acc to vreg[169]
v_accvgpr_read_b32 v[vgprValuC+54], acc170         // copy acc to vreg[170]
v_accvgpr_read_b32 v[vgprValuC+55], acc174         // copy acc to vreg[171]
v_accvgpr_read_b32 v[vgprValuC+56], acc178         // copy acc to vreg[172]
v_accvgpr_read_b32 v[vgprValuC+57], acc182         // copy acc to vreg[173]
v_accvgpr_read_b32 v[vgprValuC+58], acc186         // copy acc to vreg[174]
v_accvgpr_read_b32 v[vgprValuC+59], acc190         // copy acc to vreg[175]
v_accvgpr_read_b32 v[vgprValuC+60], acc194         // copy acc to vreg[176]
v_accvgpr_read_b32 v[vgprValuC+61], acc198         // copy acc to vreg[177]
v_accvgpr_read_b32 v[vgprValuC+62], acc202         // copy acc to vreg[178]
v_accvgpr_read_b32 v[vgprValuC+63], acc206         // copy acc to vreg[179]
v_accvgpr_read_b32 v[vgprValuC+64], acc210         // copy acc to vreg[180]
v_accvgpr_read_b32 v[vgprValuC+65], acc214         // copy acc to vreg[181]
v_accvgpr_read_b32 v[vgprValuC+66], acc218         // copy acc to vreg[182]
v_accvgpr_read_b32 v[vgprValuC+67], acc222         // copy acc to vreg[183]
v_accvgpr_read_b32 v[vgprValuC+68], acc226         // copy acc to vreg[184]
v_accvgpr_read_b32 v[vgprValuC+69], acc230         // copy acc to vreg[185]
v_accvgpr_read_b32 v[vgprValuC+70], acc234         // copy acc to vreg[186]
v_accvgpr_read_b32 v[vgprValuC+71], acc238         // copy acc to vreg[187]
v_accvgpr_read_b32 v[vgprValuC+72], acc242         // copy acc to vreg[188]
v_accvgpr_read_b32 v[vgprValuC+73], acc246         // copy acc to vreg[189]
v_accvgpr_read_b32 v[vgprValuC+74], acc250         // copy acc to vreg[190]
v_accvgpr_read_b32 v[vgprValuC+75], acc254         // copy acc to vreg[191]

/* rC *= alpha batchElements=[(0, 0, 32, 0), (0, 0, 33, 0), (0, 0, 34, 0), (0, 0, 35, 0), (0, 0, 36, 0), (0, 0, 37, 0), (0, 0, 38, 0), (0, 0, 39, 0), (0, 0, 40, 0), (0, 0, 41, 0), (0, 0, 42, 0), (0, 0, 43, 0), (0, 0, 44, 0), (0, 0, 45, 0), (0, 0, 46, 0), (0, 0, 47, 0)] */

/* apply mask, calc new C and issue writes */
/*
 * Write phase of this batch: 16 rows, one buffer_store_dwordx4 per row.
 *  - Before every store (including the first one) s[sgprSrdD+0] / s[sgprSrdD+1] are
 *    advanced by one row (add + add-with-carry), so the same per-lane offset in v10
 *    is reused unchanged for all 16 rows.
 *  - The row increment (StrideD1J << 2) does not change between rows, so it is
 *    computed once into s12 instead of being recomputed in front of every store.
 *  - Row r stores v[12+4*r : 15+4*r].
 * NOTE(review): v7-v9 are loaded with bf16 pack/round constants, but no instruction
 * in this write sequence reads them - confirm whether later code relies on them.
 */
v_mov_b32 v7, 0xffff0000                           // mask for pack two bfloat16 element to 32bit
v_mov_b32 v8, 0x7fff0000                           // fp32 Nan
v_mov_b32 v9, 0x7fff                               // rounding bias for bfloat16
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: row increment = StrideD1J << 2 (scale by BPE), shared by all rows
/* row 0 of this batch */
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[12:15], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* row 1 of this batch */
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[16:19], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* row 2 of this batch */
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[20:23], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* row 3 of this batch */
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[24:27], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* row 4 of this batch */
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[28:31], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* row 5 of this batch */
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[32:35], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* row 6 of this batch */
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[36:39], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* row 7 of this batch */
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[40:43], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* row 8 of this batch */
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[44:47], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* row 9 of this batch */
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[48:51], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* row 10 of this batch */
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[52:55], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* row 11 of this batch */
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[56:59], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* row 12 of this batch */
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[60:63], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* row 13 of this batch */
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[64:67], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* row 14 of this batch */
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[68:71], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* row 15 of this batch */
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[72:75], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 factorDim=0 */

/******************************************/
/* Global Write Batch #3 (d1,d0,vc1,vc0) = */
/*    (0,0,48,0:vw4); (0,0,49,0:vw4); (0,0,50,0:vw4); (0,0,51,0:vw4); (0,0,52,0:vw4); (0,0,53,0:vw4); (0,0,54,0:vw4); (0,0,55,0:vw4); (0,0,56,0:vw4); (0,0,57,0:vw4); (0,0,58,0:vw4); (0,0,59,0:vw4); (0,0,60,0:vw4); (0,0,61,0:vw4); (0,0,62,0:vw4); (0,0,63,0:vw4) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
/* No instructions are emitted per element here; only the element tuples are listed. */
/* (d1,vc1,d0,vc0)=(0,48,0,0) */
/* (d1,vc1,d0,vc0)=(0,49,0,0) */
/* (d1,vc1,d0,vc0)=(0,50,0,0) */
/* (d1,vc1,d0,vc0)=(0,51,0,0) */
/* (d1,vc1,d0,vc0)=(0,52,0,0) */
/* (d1,vc1,d0,vc0)=(0,53,0,0) */
/* (d1,vc1,d0,vc0)=(0,54,0,0) */
/* (d1,vc1,d0,vc0)=(0,55,0,0) */
/* (d1,vc1,d0,vc0)=(0,56,0,0) */
/* (d1,vc1,d0,vc0)=(0,57,0,0) */
/* (d1,vc1,d0,vc0)=(0,58,0,0) */
/* (d1,vc1,d0,vc0)=(0,59,0,0) */
/* (d1,vc1,d0,vc0)=(0,60,0,0) */
/* (d1,vc1,d0,vc0)=(0,61,0,0) */
/* (d1,vc1,d0,vc0)=(0,62,0,0) */
/* (d1,vc1,d0,vc0)=(0,63,0,0) */
/*
 * Copy 64 accumulator registers into VGPRs for this batch.
 *  - Sources are every 4th accumulator starting at acc3: acc3, acc7, ..., acc255.
 *  - Destinations are consecutive: v[vgprValuC+12] ... v[vgprValuC+75].
 *  - Each group of 4 consecutive destinations is written out by one
 *    buffer_store_dwordx4 in the write phase that follows (16 stores in total).
 * NOTE(review): the stores that follow address the data as v[12:75], which matches
 * these destinations only if vgprValuC is 0 - presumably so; confirm against its .set.
 */
v_accvgpr_read_b32 v[vgprValuC+12], acc3           // copy acc to vreg[192]
v_accvgpr_read_b32 v[vgprValuC+13], acc7           // copy acc to vreg[193]
v_accvgpr_read_b32 v[vgprValuC+14], acc11          // copy acc to vreg[194]
v_accvgpr_read_b32 v[vgprValuC+15], acc15          // copy acc to vreg[195]
v_accvgpr_read_b32 v[vgprValuC+16], acc19          // copy acc to vreg[196]
v_accvgpr_read_b32 v[vgprValuC+17], acc23          // copy acc to vreg[197]
v_accvgpr_read_b32 v[vgprValuC+18], acc27          // copy acc to vreg[198]
v_accvgpr_read_b32 v[vgprValuC+19], acc31          // copy acc to vreg[199]
v_accvgpr_read_b32 v[vgprValuC+20], acc35          // copy acc to vreg[200]
v_accvgpr_read_b32 v[vgprValuC+21], acc39          // copy acc to vreg[201]
v_accvgpr_read_b32 v[vgprValuC+22], acc43          // copy acc to vreg[202]
v_accvgpr_read_b32 v[vgprValuC+23], acc47          // copy acc to vreg[203]
v_accvgpr_read_b32 v[vgprValuC+24], acc51          // copy acc to vreg[204]
v_accvgpr_read_b32 v[vgprValuC+25], acc55          // copy acc to vreg[205]
v_accvgpr_read_b32 v[vgprValuC+26], acc59          // copy acc to vreg[206]
v_accvgpr_read_b32 v[vgprValuC+27], acc63          // copy acc to vreg[207]
v_accvgpr_read_b32 v[vgprValuC+28], acc67          // copy acc to vreg[208]
v_accvgpr_read_b32 v[vgprValuC+29], acc71          // copy acc to vreg[209]
v_accvgpr_read_b32 v[vgprValuC+30], acc75          // copy acc to vreg[210]
v_accvgpr_read_b32 v[vgprValuC+31], acc79          // copy acc to vreg[211]
v_accvgpr_read_b32 v[vgprValuC+32], acc83          // copy acc to vreg[212]
v_accvgpr_read_b32 v[vgprValuC+33], acc87          // copy acc to vreg[213]
v_accvgpr_read_b32 v[vgprValuC+34], acc91          // copy acc to vreg[214]
v_accvgpr_read_b32 v[vgprValuC+35], acc95          // copy acc to vreg[215]
v_accvgpr_read_b32 v[vgprValuC+36], acc99          // copy acc to vreg[216]
v_accvgpr_read_b32 v[vgprValuC+37], acc103         // copy acc to vreg[217]
v_accvgpr_read_b32 v[vgprValuC+38], acc107         // copy acc to vreg[218]
v_accvgpr_read_b32 v[vgprValuC+39], acc111         // copy acc to vreg[219]
v_accvgpr_read_b32 v[vgprValuC+40], acc115         // copy acc to vreg[220]
v_accvgpr_read_b32 v[vgprValuC+41], acc119         // copy acc to vreg[221]
v_accvgpr_read_b32 v[vgprValuC+42], acc123         // copy acc to vreg[222]
v_accvgpr_read_b32 v[vgprValuC+43], acc127         // copy acc to vreg[223]
v_accvgpr_read_b32 v[vgprValuC+44], acc131         // copy acc to vreg[224]
v_accvgpr_read_b32 v[vgprValuC+45], acc135         // copy acc to vreg[225]
v_accvgpr_read_b32 v[vgprValuC+46], acc139         // copy acc to vreg[226]
v_accvgpr_read_b32 v[vgprValuC+47], acc143         // copy acc to vreg[227]
v_accvgpr_read_b32 v[vgprValuC+48], acc147         // copy acc to vreg[228]
v_accvgpr_read_b32 v[vgprValuC+49], acc151         // copy acc to vreg[229]
v_accvgpr_read_b32 v[vgprValuC+50], acc155         // copy acc to vreg[230]
v_accvgpr_read_b32 v[vgprValuC+51], acc159         // copy acc to vreg[231]
v_accvgpr_read_b32 v[vgprValuC+52], acc163         // copy acc to vreg[232]
v_accvgpr_read_b32 v[vgprValuC+53], acc167         // copy acc to vreg[233]
v_accvgpr_read_b32 v[vgprValuC+54], acc171         // copy acc to vreg[234]
v_accvgpr_read_b32 v[vgprValuC+55], acc175         // copy acc to vreg[235]
v_accvgpr_read_b32 v[vgprValuC+56], acc179         // copy acc to vreg[236]
v_accvgpr_read_b32 v[vgprValuC+57], acc183         // copy acc to vreg[237]
v_accvgpr_read_b32 v[vgprValuC+58], acc187         // copy acc to vreg[238]
v_accvgpr_read_b32 v[vgprValuC+59], acc191         // copy acc to vreg[239]
v_accvgpr_read_b32 v[vgprValuC+60], acc195         // copy acc to vreg[240]
v_accvgpr_read_b32 v[vgprValuC+61], acc199         // copy acc to vreg[241]
v_accvgpr_read_b32 v[vgprValuC+62], acc203         // copy acc to vreg[242]
v_accvgpr_read_b32 v[vgprValuC+63], acc207         // copy acc to vreg[243]
v_accvgpr_read_b32 v[vgprValuC+64], acc211         // copy acc to vreg[244]
v_accvgpr_read_b32 v[vgprValuC+65], acc215         // copy acc to vreg[245]
v_accvgpr_read_b32 v[vgprValuC+66], acc219         // copy acc to vreg[246]
v_accvgpr_read_b32 v[vgprValuC+67], acc223         // copy acc to vreg[247]
v_accvgpr_read_b32 v[vgprValuC+68], acc227         // copy acc to vreg[248]
v_accvgpr_read_b32 v[vgprValuC+69], acc231         // copy acc to vreg[249]
v_accvgpr_read_b32 v[vgprValuC+70], acc235         // copy acc to vreg[250]
v_accvgpr_read_b32 v[vgprValuC+71], acc239         // copy acc to vreg[251]
v_accvgpr_read_b32 v[vgprValuC+72], acc243         // copy acc to vreg[252]
v_accvgpr_read_b32 v[vgprValuC+73], acc247         // copy acc to vreg[253]
v_accvgpr_read_b32 v[vgprValuC+74], acc251         // copy acc to vreg[254]
v_accvgpr_read_b32 v[vgprValuC+75], acc255         // copy acc to vreg[255]

/* rC *= alpha batchElements=[(0, 0, 48, 0), (0, 0, 49, 0), (0, 0, 50, 0), (0, 0, 51, 0), (0, 0, 52, 0), (0, 0, 53, 0), (0, 0, 54, 0), (0, 0, 55, 0), (0, 0, 56, 0), (0, 0, 57, 0), (0, 0, 58, 0), (0, 0, 59, 0), (0, 0, 60, 0), (0, 0, 61, 0), (0, 0, 62, 0), (0, 0, 63, 0)] */

/* apply mask, calc new C and issue writes */
/*
 * Write phase of the last non-edge batch: 16 rows, one buffer_store_dwordx4 per row,
 * followed by a branch to label_GW_End.
 *  - Before every store (including the first one) s[sgprSrdD+0] / s[sgprSrdD+1] are
 *    advanced by one row (add + add-with-carry), so the same per-lane offset in v10
 *    is reused unchanged for all 16 rows.
 *  - The row increment (StrideD1J << 2) does not change between rows, so it is
 *    computed once into s12 instead of being recomputed in front of every store.
 *  - Row r stores v[12+4*r : 15+4*r].
 * NOTE(review): v7-v9 are loaded with bf16 pack/round constants, but no instruction
 * in this write sequence reads them - confirm whether code at label_GW_End relies on them.
 */
v_mov_b32 v7, 0xffff0000                           // mask for pack two bfloat16 element to 32bit
v_mov_b32 v8, 0x7fff0000                           // fp32 Nan
v_mov_b32 v9, 0x7fff                               // rounding bias for bfloat16
s_lshl_b32 s12, s[sgprStrideD1J], 2                // incToNextRow: row increment = StrideD1J << 2 (scale by BPE), shared by all rows
/* row 0 of this batch */
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[12:15], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* row 1 of this batch */
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[16:19], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* row 2 of this batch */
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[20:23], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* row 3 of this batch */
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[24:27], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* row 4 of this batch */
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[28:31], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* row 5 of this batch */
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[32:35], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* row 6 of this batch */
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[36:39], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* row 7 of this batch */
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[40:43], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* row 8 of this batch */
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[44:47], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* row 9 of this batch */
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[48:51], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* row 10 of this batch */
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[52:55], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* row 11 of this batch */
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[56:59], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* row 12 of this batch */
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[60:63], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* row 13 of this batch */
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[64:67], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* row 14 of this batch */
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[68:71], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* row 15 of this batch */
s_add_u32 s[sgprSrdD+0], s[sgprSrdD+0], s12        // incToNextRow: gra SRD += inc(lower)
s_addc_u32 s[sgprSrdD+1], s[sgprSrdD+1], 0         // incToNextRow: gra SRD += inc(upper)
buffer_store_dwordx4 v[72:75], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
s_branch label_GW_End                              // jump to end
label_GW_B0_E1_N:

/* edge=1, allocate 6 sgpr. perBatchTmpS=4 perBatchMaskS=2 perElementMaskS=0 elementsPerBatch=16 */
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #0 (d1,d0,vc1,vc0) = */
/*    (0,0,0,0:vw4); (0,0,1,0:vw4); (0,0,2,0:vw4); (0,0,3,0:vw4); (0,0,4,0:vw4); (0,0,5,0:vw4); (0,0,6,0:vw4); (0,0,7,0:vw4); (0,0,8,0:vw4); (0,0,9,0:vw4); (0,0,10,0:vw4); (0,0,11,0:vw4); (0,0,12,0:vw4); (0,0,13,0:vw4); (0,0,14,0:vw4); (0,0,15,0:vw4) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
/*
 * Address phase of the edge batch: produces one per-lane D offset for each of the 16
 * rows, in v10, v11, v20-v23, v40-v43, v60-v63, v80, v81 (in row order).
 * Per row:
 *   in1    = coord1 (v1) < SizeJ
 *   mask   = in0 && in1                              -> s[32:33]
 *   offset = (coutRowPtrD (v3) + coord0 (v0)) << 2
 *   offset = mask ? offset : BufferOOB               (v_cndmask keeps the offset where mask is set)
 * Between rows v1 is incremented by 1 and the row pointers v2 / v3 are advanced by
 * StrideC1J / StrideD1J.
 * in0 = coord0 (v0) < SizeI is the same for every row: nothing in this phase writes
 * v0 or s[28:29], so it is evaluated once up front and reused. This assumes
 * s[sgprSizeI] does not alias s[28:33] or vcc, which the per-row recomputation it
 * replaces also required.
 * v2 (cinRowPtr) is still advanced each row although nothing in this phase reads it.
 * NOTE(review): BufferOOB is defined outside this section; presumably an offset that
 * makes the buffer access fall out of range so masked lanes are dropped - confirm.
 */
v_mov_b32 v82, BufferOOB
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // in0: coord0 < size0 (row-invariant, reused by all 16 rows)
/* (d1,vc1,d0,vc0)=(0,0,0,0) */
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v10, v3, v0, 0x2                    // scaleToBpe: (coutRowPtrD + coord0) << 2 = D byte offset
v_cndmask_b32 v10, v82, v10, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,1,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v11, v3, v0, 0x2                    // scaleToBpe: (coutRowPtrD + coord0) << 2 = D byte offset
v_cndmask_b32 v11, v82, v11, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,2,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v20, v3, v0, 0x2                    // scaleToBpe: (coutRowPtrD + coord0) << 2 = D byte offset
v_cndmask_b32 v20, v82, v20, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,3,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v21, v3, v0, 0x2                    // scaleToBpe: (coutRowPtrD + coord0) << 2 = D byte offset
v_cndmask_b32 v21, v82, v21, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,4,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v22, v3, v0, 0x2                    // scaleToBpe: (coutRowPtrD + coord0) << 2 = D byte offset
v_cndmask_b32 v22, v82, v22, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,5,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v23, v3, v0, 0x2                    // scaleToBpe: (coutRowPtrD + coord0) << 2 = D byte offset
v_cndmask_b32 v23, v82, v23, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,6,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v40, v3, v0, 0x2                    // scaleToBpe: (coutRowPtrD + coord0) << 2 = D byte offset
v_cndmask_b32 v40, v82, v40, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,7,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v41, v3, v0, 0x2                    // scaleToBpe: (coutRowPtrD + coord0) << 2 = D byte offset
v_cndmask_b32 v41, v82, v41, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,8,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v42, v3, v0, 0x2                    // scaleToBpe: (coutRowPtrD + coord0) << 2 = D byte offset
v_cndmask_b32 v42, v82, v42, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,9,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v43, v3, v0, 0x2                    // scaleToBpe: (coutRowPtrD + coord0) << 2 = D byte offset
v_cndmask_b32 v43, v82, v43, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,10,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v60, v3, v0, 0x2                    // scaleToBpe: (coutRowPtrD + coord0) << 2 = D byte offset
v_cndmask_b32 v60, v82, v60, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,11,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v61, v3, v0, 0x2                    // scaleToBpe: (coutRowPtrD + coord0) << 2 = D byte offset
v_cndmask_b32 v61, v82, v61, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,12,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v62, v3, v0, 0x2                    // scaleToBpe: (coutRowPtrD + coord0) << 2 = D byte offset
v_cndmask_b32 v62, v82, v62, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,13,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v63, v3, v0, 0x2                    // scaleToBpe: (coutRowPtrD + coord0) << 2 = D byte offset
v_cndmask_b32 v63, v82, v63, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,14,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v80, v3, v0, 0x2                    // scaleToBpe: (coutRowPtrD + coord0) << 2 = D byte offset
v_cndmask_b32 v80, v82, v80, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,15,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v81, v3, v0, 0x2                    // scaleToBpe: (coutRowPtrD + coord0) << 2 = D byte offset
v_cndmask_b32 v81, v82, v81, s[32:33]              // LDD clip if OOB. offset
v_accvgpr_read_b32 v[vgprValuC+12], acc0           // copy acc to vreg[0]
v_accvgpr_read_b32 v[vgprValuC+13], acc4           // copy acc to vreg[1]
v_accvgpr_read_b32 v[vgprValuC+14], acc8           // copy acc to vreg[2]
v_accvgpr_read_b32 v[vgprValuC+15], acc12          // copy acc to vreg[3]
v_accvgpr_read_b32 v[vgprValuC+16], acc16          // copy acc to vreg[4]
v_accvgpr_read_b32 v[vgprValuC+17], acc20          // copy acc to vreg[5]
v_accvgpr_read_b32 v[vgprValuC+18], acc24          // copy acc to vreg[6]
v_accvgpr_read_b32 v[vgprValuC+19], acc28          // copy acc to vreg[7]
v_accvgpr_read_b32 v[vgprValuC+24], acc32          // copy acc to vreg[8]
v_accvgpr_read_b32 v[vgprValuC+25], acc36          // copy acc to vreg[9]
v_accvgpr_read_b32 v[vgprValuC+26], acc40          // copy acc to vreg[10]
v_accvgpr_read_b32 v[vgprValuC+27], acc44          // copy acc to vreg[11]
v_accvgpr_read_b32 v[vgprValuC+28], acc48          // copy acc to vreg[12]
v_accvgpr_read_b32 v[vgprValuC+29], acc52          // copy acc to vreg[13]
v_accvgpr_read_b32 v[vgprValuC+30], acc56          // copy acc to vreg[14]
v_accvgpr_read_b32 v[vgprValuC+31], acc60          // copy acc to vreg[15]
v_accvgpr_read_b32 v[vgprValuC+32], acc64          // copy acc to vreg[16]
v_accvgpr_read_b32 v[vgprValuC+33], acc68          // copy acc to vreg[17]
v_accvgpr_read_b32 v[vgprValuC+34], acc72          // copy acc to vreg[18]
v_accvgpr_read_b32 v[vgprValuC+35], acc76          // copy acc to vreg[19]
v_accvgpr_read_b32 v[vgprValuC+36], acc80          // copy acc to vreg[20]
v_accvgpr_read_b32 v[vgprValuC+37], acc84          // copy acc to vreg[21]
v_accvgpr_read_b32 v[vgprValuC+38], acc88          // copy acc to vreg[22]
v_accvgpr_read_b32 v[vgprValuC+39], acc92          // copy acc to vreg[23]
v_accvgpr_read_b32 v[vgprValuC+44], acc96          // copy acc to vreg[24]
v_accvgpr_read_b32 v[vgprValuC+45], acc100         // copy acc to vreg[25]
v_accvgpr_read_b32 v[vgprValuC+46], acc104         // copy acc to vreg[26]
v_accvgpr_read_b32 v[vgprValuC+47], acc108         // copy acc to vreg[27]
v_accvgpr_read_b32 v[vgprValuC+48], acc112         // copy acc to vreg[28]
v_accvgpr_read_b32 v[vgprValuC+49], acc116         // copy acc to vreg[29]
v_accvgpr_read_b32 v[vgprValuC+50], acc120         // copy acc to vreg[30]
v_accvgpr_read_b32 v[vgprValuC+51], acc124         // copy acc to vreg[31]
v_accvgpr_read_b32 v[vgprValuC+52], acc128         // copy acc to vreg[32]
v_accvgpr_read_b32 v[vgprValuC+53], acc132         // copy acc to vreg[33]
v_accvgpr_read_b32 v[vgprValuC+54], acc136         // copy acc to vreg[34]
v_accvgpr_read_b32 v[vgprValuC+55], acc140         // copy acc to vreg[35]
v_accvgpr_read_b32 v[vgprValuC+56], acc144         // copy acc to vreg[36]
v_accvgpr_read_b32 v[vgprValuC+57], acc148         // copy acc to vreg[37]
v_accvgpr_read_b32 v[vgprValuC+58], acc152         // copy acc to vreg[38]
v_accvgpr_read_b32 v[vgprValuC+59], acc156         // copy acc to vreg[39]
v_accvgpr_read_b32 v[vgprValuC+64], acc160         // copy acc to vreg[40]
v_accvgpr_read_b32 v[vgprValuC+65], acc164         // copy acc to vreg[41]
v_accvgpr_read_b32 v[vgprValuC+66], acc168         // copy acc to vreg[42]
v_accvgpr_read_b32 v[vgprValuC+67], acc172         // copy acc to vreg[43]
v_accvgpr_read_b32 v[vgprValuC+68], acc176         // copy acc to vreg[44]
v_accvgpr_read_b32 v[vgprValuC+69], acc180         // copy acc to vreg[45]
v_accvgpr_read_b32 v[vgprValuC+70], acc184         // copy acc to vreg[46]
v_accvgpr_read_b32 v[vgprValuC+71], acc188         // copy acc to vreg[47]
v_accvgpr_read_b32 v[vgprValuC+72], acc192         // copy acc to vreg[48]
v_accvgpr_read_b32 v[vgprValuC+73], acc196         // copy acc to vreg[49]
v_accvgpr_read_b32 v[vgprValuC+74], acc200         // copy acc to vreg[50]
v_accvgpr_read_b32 v[vgprValuC+75], acc204         // copy acc to vreg[51]
v_accvgpr_read_b32 v[vgprValuC+76], acc208         // copy acc to vreg[52]
v_accvgpr_read_b32 v[vgprValuC+77], acc212         // copy acc to vreg[53]
v_accvgpr_read_b32 v[vgprValuC+78], acc216         // copy acc to vreg[54]
v_accvgpr_read_b32 v[vgprValuC+79], acc220         // copy acc to vreg[55]
v_accvgpr_read_b32 v[vgprValuC+84], acc224         // copy acc to vreg[56]
v_accvgpr_read_b32 v[vgprValuC+85], acc228         // copy acc to vreg[57]
v_accvgpr_read_b32 v[vgprValuC+86], acc232         // copy acc to vreg[58]
v_accvgpr_read_b32 v[vgprValuC+87], acc236         // copy acc to vreg[59]
v_accvgpr_read_b32 v[vgprValuC+88], acc240         // copy acc to vreg[60]
v_accvgpr_read_b32 v[vgprValuC+89], acc244         // copy acc to vreg[61]
v_accvgpr_read_b32 v[vgprValuC+90], acc248         // copy acc to vreg[62]
v_accvgpr_read_b32 v[vgprValuC+91], acc252         // copy acc to vreg[63]

/* rC *= alpha batchElements=[(0, 0, 0, 0), (0, 0, 1, 0), (0, 0, 2, 0), (0, 0, 3, 0), (0, 0, 4, 0), (0, 0, 5, 0), (0, 0, 6, 0), (0, 0, 7, 0), (0, 0, 8, 0), (0, 0, 9, 0), (0, 0, 10, 0), (0, 0, 11, 0), (0, 0, 12, 0), (0, 0, 13, 0), (0, 0, 14, 0), (0, 0, 15, 0)] */
/* No instruction follows the marker above: the value VGPRs reach the stores */
/* below without being modified anywhere in this span.                       */

/* apply mask, calc new C and issue writes */
/* Store phase for batch elements vc1 = 0..15.                                */
/*  - v7/v8/v9 are loaded with three constants.                               */
/*    NOTE(review): none of the instructions in this span reads v7, v8 or v9. */
/*  - 16 buffer_store_dwordx4 follow, one per batch element. Each writes four */
/*    consecutive VGPRs through the descriptor s[sgprSrdD:sgprSrdD+3], using  */
/*    the single VGPR operand as the per-lane offset (offen, offset:0).       */
/*  - The offset VGPRs (v10, v11, v20-v23, v40-v43, v60-v63, v80, v81) and    */
/*    the data VGPRs are produced before this span; presumably each offset    */
/*    was already clipped for out-of-range lanes -- confirm against the       */
/*    preceding address-setup code.                                           */
v_mov_b32 v7, 0xffff0000                           // mask for pack two bfloat16 element to 32bit
v_mov_b32 v8, 0x7fff0000                           // fp32 Nan
v_mov_b32 v9, 0x7fff                               // rounding bias for bfloat16
buffer_store_dwordx4 v[12:15], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[16:19], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[24:27], v20, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[28:31], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[32:35], v22, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[36:39], v23, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[44:47], v40, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[48:51], v41, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[52:55], v42, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[56:59], v43, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[64:67], v60, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[68:71], v61, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[72:75], v62, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[76:79], v63, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[84:87], v80, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[88:91], v81, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
// Keep this s_nop directly after the last store: it provides the wait state named in its comment.
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #1 (d1,d0,vc1,vc0) = */
/*    (0,0,16,0:vw4); (0,0,17,0:vw4); (0,0,18,0:vw4); (0,0,19,0:vw4); (0,0,20,0:vw4); (0,0,21,0:vw4); (0,0,22,0:vw4); (0,0,23,0:vw4); (0,0,24,0:vw4); (0,0,25,0:vw4); (0,0,26,0:vw4); (0,0,27,0:vw4); (0,0,28,0:vw4); (0,0,29,0:vw4); (0,0,30,0:vw4); (0,0,31,0:vw4) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
/* Address/mask setup for batch #1 (rows vc1 = 16..31). No loads are issued   */
/* in this span. The same 8-instruction group is emitted once per row:        */
/*   1. v1 (coord1) += 1; the carry-out written to vcc is not read here.      */
/*   2. v2 (cinRowPtr) += StrideC1J. v2 is only advanced, never read, in this */
/*      span.                                                                 */
/*   3. v3 (coutRowPtrD) += StrideD1J.                                        */
/*   4. s[28:29] = lanes with v0 (coord0) < SizeI.                            */
/*   5. s[32:33] = lanes with v1 (coord1) < SizeJ, then ANDed with s[28:29].  */
/*   6. offset VGPR = (v3 + v0) << 2.                                         */
/*   7. offset VGPR is kept for lanes set in s[32:33]; other lanes get v82.   */
/* Offset VGPR per row: v10, v11, v20-v23, v40-v43, v60-v63, v80, v81.        */
/* The per-row groups are order-dependent: each one consumes the v1 and v3    */
/* values left by the previous one.                                           */
/* NOTE(review): step 4 is re-evaluated for every row although neither v0 nor */
/* s[sgprSizeI] is written anywhere in this span.                             */
/* NOTE(review): BufferOOB is defined outside this view; presumably an offset */
/* that the buffer bounds check rejects -- confirm at its definition.         */
v_mov_b32 v82, BufferOOB
/* (d1,vc1,d0,vc0)=(0,16,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 (v1) += 1: step to the next row; carry-out goes to vcc

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v10, v3, v0, 0x2                    // v10 = (coutRowPtrD + coord0) << 2: D offset scaled by 4
v_cndmask_b32 v10, v82, v10, s[32:33]              // keep v10 where in0 && in1, else BufferOOB from v82
/* (d1,vc1,d0,vc0)=(0,17,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 (v1) += 1: step to the next row; carry-out goes to vcc

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v11, v3, v0, 0x2                    // v11 = (coutRowPtrD + coord0) << 2: D offset scaled by 4
v_cndmask_b32 v11, v82, v11, s[32:33]              // keep v11 where in0 && in1, else BufferOOB from v82
/* (d1,vc1,d0,vc0)=(0,18,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 (v1) += 1: step to the next row; carry-out goes to vcc

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v20, v3, v0, 0x2                    // v20 = (coutRowPtrD + coord0) << 2: D offset scaled by 4
v_cndmask_b32 v20, v82, v20, s[32:33]              // keep v20 where in0 && in1, else BufferOOB from v82
/* (d1,vc1,d0,vc0)=(0,19,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 (v1) += 1: step to the next row; carry-out goes to vcc

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v21, v3, v0, 0x2                    // v21 = (coutRowPtrD + coord0) << 2: D offset scaled by 4
v_cndmask_b32 v21, v82, v21, s[32:33]              // keep v21 where in0 && in1, else BufferOOB from v82
/* (d1,vc1,d0,vc0)=(0,20,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 (v1) += 1: step to the next row; carry-out goes to vcc

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v22, v3, v0, 0x2                    // v22 = (coutRowPtrD + coord0) << 2: D offset scaled by 4
v_cndmask_b32 v22, v82, v22, s[32:33]              // keep v22 where in0 && in1, else BufferOOB from v82
/* (d1,vc1,d0,vc0)=(0,21,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 (v1) += 1: step to the next row; carry-out goes to vcc

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v23, v3, v0, 0x2                    // v23 = (coutRowPtrD + coord0) << 2: D offset scaled by 4
v_cndmask_b32 v23, v82, v23, s[32:33]              // keep v23 where in0 && in1, else BufferOOB from v82
/* (d1,vc1,d0,vc0)=(0,22,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 (v1) += 1: step to the next row; carry-out goes to vcc

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v40, v3, v0, 0x2                    // v40 = (coutRowPtrD + coord0) << 2: D offset scaled by 4
v_cndmask_b32 v40, v82, v40, s[32:33]              // keep v40 where in0 && in1, else BufferOOB from v82
/* (d1,vc1,d0,vc0)=(0,23,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 (v1) += 1: step to the next row; carry-out goes to vcc

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v41, v3, v0, 0x2                    // v41 = (coutRowPtrD + coord0) << 2: D offset scaled by 4
v_cndmask_b32 v41, v82, v41, s[32:33]              // keep v41 where in0 && in1, else BufferOOB from v82
/* (d1,vc1,d0,vc0)=(0,24,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 (v1) += 1: step to the next row; carry-out goes to vcc

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v42, v3, v0, 0x2                    // v42 = (coutRowPtrD + coord0) << 2: D offset scaled by 4
v_cndmask_b32 v42, v82, v42, s[32:33]              // keep v42 where in0 && in1, else BufferOOB from v82
/* (d1,vc1,d0,vc0)=(0,25,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 (v1) += 1: step to the next row; carry-out goes to vcc

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v43, v3, v0, 0x2                    // v43 = (coutRowPtrD + coord0) << 2: D offset scaled by 4
v_cndmask_b32 v43, v82, v43, s[32:33]              // keep v43 where in0 && in1, else BufferOOB from v82
/* (d1,vc1,d0,vc0)=(0,26,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 (v1) += 1: step to the next row; carry-out goes to vcc

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v60, v3, v0, 0x2                    // v60 = (coutRowPtrD + coord0) << 2: D offset scaled by 4
v_cndmask_b32 v60, v82, v60, s[32:33]              // keep v60 where in0 && in1, else BufferOOB from v82
/* (d1,vc1,d0,vc0)=(0,27,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 (v1) += 1: step to the next row; carry-out goes to vcc

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v61, v3, v0, 0x2                    // v61 = (coutRowPtrD + coord0) << 2: D offset scaled by 4
v_cndmask_b32 v61, v82, v61, s[32:33]              // keep v61 where in0 && in1, else BufferOOB from v82
/* (d1,vc1,d0,vc0)=(0,28,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 (v1) += 1: step to the next row; carry-out goes to vcc

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v62, v3, v0, 0x2                    // v62 = (coutRowPtrD + coord0) << 2: D offset scaled by 4
v_cndmask_b32 v62, v82, v62, s[32:33]              // keep v62 where in0 && in1, else BufferOOB from v82
/* (d1,vc1,d0,vc0)=(0,29,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 (v1) += 1: step to the next row; carry-out goes to vcc

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v63, v3, v0, 0x2                    // v63 = (coutRowPtrD + coord0) << 2: D offset scaled by 4
v_cndmask_b32 v63, v82, v63, s[32:33]              // keep v63 where in0 && in1, else BufferOOB from v82
/* (d1,vc1,d0,vc0)=(0,30,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 (v1) += 1: step to the next row; carry-out goes to vcc

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v80, v3, v0, 0x2                    // v80 = (coutRowPtrD + coord0) << 2: D offset scaled by 4
v_cndmask_b32 v80, v82, v80, s[32:33]              // keep v80 where in0 && in1, else BufferOOB from v82
/* (d1,vc1,d0,vc0)=(0,31,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 (v1) += 1: step to the next row; carry-out goes to vcc

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v81, v3, v0, 0x2                    // v81 = (coutRowPtrD + coord0) << 2: D offset scaled by 4
v_cndmask_b32 v81, v82, v81, s[32:33]              // keep v81 where in0 && in1, else BufferOOB from v82
/* Stage the accumulator values for batch #1 into ordinary VGPRs.             */
/*  - Sources are acc1, acc5, ..., acc253: every fourth accumulator register, */
/*    starting at acc1 (64 reads in total).                                   */
/*  - Destinations are vgprValuC+12..19, +24..39, +44..59, +64..79, +84..91.  */
/*    The skipped indices (+20..23, +40..43, +60..63, +80..83) are never      */
/*    written here.                                                           */
/*  - NOTE(review): the stores that consume these values name v[12:15],       */
/*    v[16:19], ... literally, which lines up with these destinations only if */
/*    vgprValuC is 0 -- confirm at the symbol definition. Under that          */
/*    assumption the skipped indices are the registers used for the per-row   */
/*    store offsets and the BufferOOB constant (v82).                         */
/*  - The vreg[N] numbers in the per-line comments count 64..127.             */
v_accvgpr_read_b32 v[vgprValuC+12], acc1           // copy acc to vreg[64]
v_accvgpr_read_b32 v[vgprValuC+13], acc5           // copy acc to vreg[65]
v_accvgpr_read_b32 v[vgprValuC+14], acc9           // copy acc to vreg[66]
v_accvgpr_read_b32 v[vgprValuC+15], acc13          // copy acc to vreg[67]
v_accvgpr_read_b32 v[vgprValuC+16], acc17          // copy acc to vreg[68]
v_accvgpr_read_b32 v[vgprValuC+17], acc21          // copy acc to vreg[69]
v_accvgpr_read_b32 v[vgprValuC+18], acc25          // copy acc to vreg[70]
v_accvgpr_read_b32 v[vgprValuC+19], acc29          // copy acc to vreg[71]
v_accvgpr_read_b32 v[vgprValuC+24], acc33          // copy acc to vreg[72]
v_accvgpr_read_b32 v[vgprValuC+25], acc37          // copy acc to vreg[73]
v_accvgpr_read_b32 v[vgprValuC+26], acc41          // copy acc to vreg[74]
v_accvgpr_read_b32 v[vgprValuC+27], acc45          // copy acc to vreg[75]
v_accvgpr_read_b32 v[vgprValuC+28], acc49          // copy acc to vreg[76]
v_accvgpr_read_b32 v[vgprValuC+29], acc53          // copy acc to vreg[77]
v_accvgpr_read_b32 v[vgprValuC+30], acc57          // copy acc to vreg[78]
v_accvgpr_read_b32 v[vgprValuC+31], acc61          // copy acc to vreg[79]
v_accvgpr_read_b32 v[vgprValuC+32], acc65          // copy acc to vreg[80]
v_accvgpr_read_b32 v[vgprValuC+33], acc69          // copy acc to vreg[81]
v_accvgpr_read_b32 v[vgprValuC+34], acc73          // copy acc to vreg[82]
v_accvgpr_read_b32 v[vgprValuC+35], acc77          // copy acc to vreg[83]
v_accvgpr_read_b32 v[vgprValuC+36], acc81          // copy acc to vreg[84]
v_accvgpr_read_b32 v[vgprValuC+37], acc85          // copy acc to vreg[85]
v_accvgpr_read_b32 v[vgprValuC+38], acc89          // copy acc to vreg[86]
v_accvgpr_read_b32 v[vgprValuC+39], acc93          // copy acc to vreg[87]
v_accvgpr_read_b32 v[vgprValuC+44], acc97          // copy acc to vreg[88]
v_accvgpr_read_b32 v[vgprValuC+45], acc101         // copy acc to vreg[89]
v_accvgpr_read_b32 v[vgprValuC+46], acc105         // copy acc to vreg[90]
v_accvgpr_read_b32 v[vgprValuC+47], acc109         // copy acc to vreg[91]
v_accvgpr_read_b32 v[vgprValuC+48], acc113         // copy acc to vreg[92]
v_accvgpr_read_b32 v[vgprValuC+49], acc117         // copy acc to vreg[93]
v_accvgpr_read_b32 v[vgprValuC+50], acc121         // copy acc to vreg[94]
v_accvgpr_read_b32 v[vgprValuC+51], acc125         // copy acc to vreg[95]
v_accvgpr_read_b32 v[vgprValuC+52], acc129         // copy acc to vreg[96]
v_accvgpr_read_b32 v[vgprValuC+53], acc133         // copy acc to vreg[97]
v_accvgpr_read_b32 v[vgprValuC+54], acc137         // copy acc to vreg[98]
v_accvgpr_read_b32 v[vgprValuC+55], acc141         // copy acc to vreg[99]
v_accvgpr_read_b32 v[vgprValuC+56], acc145         // copy acc to vreg[100]
v_accvgpr_read_b32 v[vgprValuC+57], acc149         // copy acc to vreg[101]
v_accvgpr_read_b32 v[vgprValuC+58], acc153         // copy acc to vreg[102]
v_accvgpr_read_b32 v[vgprValuC+59], acc157         // copy acc to vreg[103]
v_accvgpr_read_b32 v[vgprValuC+64], acc161         // copy acc to vreg[104]
v_accvgpr_read_b32 v[vgprValuC+65], acc165         // copy acc to vreg[105]
v_accvgpr_read_b32 v[vgprValuC+66], acc169         // copy acc to vreg[106]
v_accvgpr_read_b32 v[vgprValuC+67], acc173         // copy acc to vreg[107]
v_accvgpr_read_b32 v[vgprValuC+68], acc177         // copy acc to vreg[108]
v_accvgpr_read_b32 v[vgprValuC+69], acc181         // copy acc to vreg[109]
v_accvgpr_read_b32 v[vgprValuC+70], acc185         // copy acc to vreg[110]
v_accvgpr_read_b32 v[vgprValuC+71], acc189         // copy acc to vreg[111]
v_accvgpr_read_b32 v[vgprValuC+72], acc193         // copy acc to vreg[112]
v_accvgpr_read_b32 v[vgprValuC+73], acc197         // copy acc to vreg[113]
v_accvgpr_read_b32 v[vgprValuC+74], acc201         // copy acc to vreg[114]
v_accvgpr_read_b32 v[vgprValuC+75], acc205         // copy acc to vreg[115]
v_accvgpr_read_b32 v[vgprValuC+76], acc209         // copy acc to vreg[116]
v_accvgpr_read_b32 v[vgprValuC+77], acc213         // copy acc to vreg[117]
v_accvgpr_read_b32 v[vgprValuC+78], acc217         // copy acc to vreg[118]
v_accvgpr_read_b32 v[vgprValuC+79], acc221         // copy acc to vreg[119]
v_accvgpr_read_b32 v[vgprValuC+84], acc225         // copy acc to vreg[120]
v_accvgpr_read_b32 v[vgprValuC+85], acc229         // copy acc to vreg[121]
v_accvgpr_read_b32 v[vgprValuC+86], acc233         // copy acc to vreg[122]
v_accvgpr_read_b32 v[vgprValuC+87], acc237         // copy acc to vreg[123]
v_accvgpr_read_b32 v[vgprValuC+88], acc241         // copy acc to vreg[124]
v_accvgpr_read_b32 v[vgprValuC+89], acc245         // copy acc to vreg[125]
v_accvgpr_read_b32 v[vgprValuC+90], acc249         // copy acc to vreg[126]
v_accvgpr_read_b32 v[vgprValuC+91], acc253         // copy acc to vreg[127]

/* rC *= alpha batchElements=[(0, 0, 16, 0), (0, 0, 17, 0), (0, 0, 18, 0), (0, 0, 19, 0), (0, 0, 20, 0), (0, 0, 21, 0), (0, 0, 22, 0), (0, 0, 23, 0), (0, 0, 24, 0), (0, 0, 25, 0), (0, 0, 26, 0), (0, 0, 27, 0), (0, 0, 28, 0), (0, 0, 29, 0), (0, 0, 30, 0), (0, 0, 31, 0)] */
/* No instruction follows the marker above: the value VGPRs reach the stores */
/* below without being modified anywhere in this span.                       */

/* apply mask, calc new C and issue writes */
/* Store phase for batch elements vc1 = 16..31.                               */
/*  - v7/v8/v9 are loaded with three constants.                               */
/*    NOTE(review): none of the instructions in this span reads v7, v8 or v9. */
/*  - 16 buffer_store_dwordx4 follow, one per batch element. Each writes four */
/*    consecutive VGPRs through the descriptor s[sgprSrdD:sgprSrdD+3], using  */
/*    the single VGPR operand as the per-lane offset (offen, offset:0).       */
/*  - Offset VGPRs, in store order: v10, v11, v20-v23, v40-v43, v60-v63, v80, */
/*    v81. Data VGPR quads, in store order: v[12:19], v[24:39], v[44:59],     */
/*    v[64:79], v[84:91]. Both sets are produced before this span.            */
v_mov_b32 v7, 0xffff0000                           // mask for pack two bfloat16 element to 32bit
v_mov_b32 v8, 0x7fff0000                           // fp32 Nan
v_mov_b32 v9, 0x7fff                               // rounding bias for bfloat16
buffer_store_dwordx4 v[12:15], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[16:19], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[24:27], v20, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[28:31], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[32:35], v22, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[36:39], v23, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[44:47], v40, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[48:51], v41, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[52:55], v42, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[56:59], v43, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[64:67], v60, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[68:71], v61, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[72:75], v62, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[76:79], v63, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[84:87], v80, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[88:91], v81, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
// Keep this s_nop directly after the last store: it provides the wait state named in its comment.
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #2 (d1,d0,vc1,vc0) = */
/*    (0,0,32,0:vw4); (0,0,33,0:vw4); (0,0,34,0:vw4); (0,0,35,0:vw4); (0,0,36,0:vw4); (0,0,37,0:vw4); (0,0,38,0:vw4); (0,0,39,0:vw4); (0,0,40,0:vw4); (0,0,41,0:vw4); (0,0,42,0:vw4); (0,0,43,0:vw4); (0,0,44,0:vw4); (0,0,45,0:vw4); (0,0,46,0:vw4); (0,0,47,0:vw4) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
v_mov_b32 v82, BufferOOB
/* (d1,vc1,d0,vc0)=(0,32,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v10, v3, v0, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v10, v82, v10, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,33,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v11, v3, v0, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v11, v82, v11, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,34,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v20, v3, v0, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v20, v82, v20, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,35,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v21, v3, v0, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v21, v82, v21, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,36,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v22, v3, v0, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v22, v82, v22, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,37,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v23, v3, v0, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v23, v82, v23, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,38,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v40, v3, v0, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v40, v82, v40, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,39,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v41, v3, v0, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v41, v82, v41, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,40,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v42, v3, v0, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v42, v82, v42, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,41,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v43, v3, v0, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v43, v82, v43, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,42,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v60, v3, v0, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v60, v82, v60, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,43,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v61, v3, v0, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v61, v82, v61, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,44,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v62, v3, v0, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v62, v82, v62, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,45,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v63, v3, v0, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v63, v82, v63, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,46,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v80, v3, v0, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v80, v82, v80, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,47,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v81, v3, v0, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v81, v82, v81, s[32:33]              // LDD clip if OOB. offset
/* Batch #2 accumulator read-back.
 * Copies 64 accumulator registers (acc2, acc6, ..., acc254: every fourth
 * accumulator starting at acc2) into VGPRs, four consecutive destinations per
 * output row, so that each group of four can be written by one
 * buffer_store_dwordx4 below.
 * Destination indices run +12..+19, +24..+39, +44..+59, +64..+79, +84..+91;
 * the gaps (+20..+23, +40..+43, +60..+63, +80..+83) are skipped because the
 * stores that follow use v20-v23, v40-v43, v60-v63, v80 and v81 as offsets.
 * NOTE(review): the stores address the data as literal v[12:15], v[16:19], ...
 * while these reads use vgprValuC+N; that only lines up if vgprValuC == 0.
 * The definition of vgprValuC is not visible here - confirm.
 * NOTE(review): the "vreg[N]" numbers in the trailing comments do not match
 * the destination VGPR numbers; they look like the generator's logical element
 * index - confirm. */
v_accvgpr_read_b32 v[vgprValuC+12], acc2           // copy acc to vreg[128]
v_accvgpr_read_b32 v[vgprValuC+13], acc6           // copy acc to vreg[129]
v_accvgpr_read_b32 v[vgprValuC+14], acc10          // copy acc to vreg[130]
v_accvgpr_read_b32 v[vgprValuC+15], acc14          // copy acc to vreg[131]
v_accvgpr_read_b32 v[vgprValuC+16], acc18          // copy acc to vreg[132]
v_accvgpr_read_b32 v[vgprValuC+17], acc22          // copy acc to vreg[133]
v_accvgpr_read_b32 v[vgprValuC+18], acc26          // copy acc to vreg[134]
v_accvgpr_read_b32 v[vgprValuC+19], acc30          // copy acc to vreg[135]
v_accvgpr_read_b32 v[vgprValuC+24], acc34          // copy acc to vreg[136]
v_accvgpr_read_b32 v[vgprValuC+25], acc38          // copy acc to vreg[137]
v_accvgpr_read_b32 v[vgprValuC+26], acc42          // copy acc to vreg[138]
v_accvgpr_read_b32 v[vgprValuC+27], acc46          // copy acc to vreg[139]
v_accvgpr_read_b32 v[vgprValuC+28], acc50          // copy acc to vreg[140]
v_accvgpr_read_b32 v[vgprValuC+29], acc54          // copy acc to vreg[141]
v_accvgpr_read_b32 v[vgprValuC+30], acc58          // copy acc to vreg[142]
v_accvgpr_read_b32 v[vgprValuC+31], acc62          // copy acc to vreg[143]
v_accvgpr_read_b32 v[vgprValuC+32], acc66          // copy acc to vreg[144]
v_accvgpr_read_b32 v[vgprValuC+33], acc70          // copy acc to vreg[145]
v_accvgpr_read_b32 v[vgprValuC+34], acc74          // copy acc to vreg[146]
v_accvgpr_read_b32 v[vgprValuC+35], acc78          // copy acc to vreg[147]
v_accvgpr_read_b32 v[vgprValuC+36], acc82          // copy acc to vreg[148]
v_accvgpr_read_b32 v[vgprValuC+37], acc86          // copy acc to vreg[149]
v_accvgpr_read_b32 v[vgprValuC+38], acc90          // copy acc to vreg[150]
v_accvgpr_read_b32 v[vgprValuC+39], acc94          // copy acc to vreg[151]
v_accvgpr_read_b32 v[vgprValuC+44], acc98          // copy acc to vreg[152]
v_accvgpr_read_b32 v[vgprValuC+45], acc102         // copy acc to vreg[153]
v_accvgpr_read_b32 v[vgprValuC+46], acc106         // copy acc to vreg[154]
v_accvgpr_read_b32 v[vgprValuC+47], acc110         // copy acc to vreg[155]
v_accvgpr_read_b32 v[vgprValuC+48], acc114         // copy acc to vreg[156]
v_accvgpr_read_b32 v[vgprValuC+49], acc118         // copy acc to vreg[157]
v_accvgpr_read_b32 v[vgprValuC+50], acc122         // copy acc to vreg[158]
v_accvgpr_read_b32 v[vgprValuC+51], acc126         // copy acc to vreg[159]
v_accvgpr_read_b32 v[vgprValuC+52], acc130         // copy acc to vreg[160]
v_accvgpr_read_b32 v[vgprValuC+53], acc134         // copy acc to vreg[161]
v_accvgpr_read_b32 v[vgprValuC+54], acc138         // copy acc to vreg[162]
v_accvgpr_read_b32 v[vgprValuC+55], acc142         // copy acc to vreg[163]
v_accvgpr_read_b32 v[vgprValuC+56], acc146         // copy acc to vreg[164]
v_accvgpr_read_b32 v[vgprValuC+57], acc150         // copy acc to vreg[165]
v_accvgpr_read_b32 v[vgprValuC+58], acc154         // copy acc to vreg[166]
v_accvgpr_read_b32 v[vgprValuC+59], acc158         // copy acc to vreg[167]
v_accvgpr_read_b32 v[vgprValuC+64], acc162         // copy acc to vreg[168]
v_accvgpr_read_b32 v[vgprValuC+65], acc166         // copy acc to vreg[169]
v_accvgpr_read_b32 v[vgprValuC+66], acc170         // copy acc to vreg[170]
v_accvgpr_read_b32 v[vgprValuC+67], acc174         // copy acc to vreg[171]
v_accvgpr_read_b32 v[vgprValuC+68], acc178         // copy acc to vreg[172]
v_accvgpr_read_b32 v[vgprValuC+69], acc182         // copy acc to vreg[173]
v_accvgpr_read_b32 v[vgprValuC+70], acc186         // copy acc to vreg[174]
v_accvgpr_read_b32 v[vgprValuC+71], acc190         // copy acc to vreg[175]
v_accvgpr_read_b32 v[vgprValuC+72], acc194         // copy acc to vreg[176]
v_accvgpr_read_b32 v[vgprValuC+73], acc198         // copy acc to vreg[177]
v_accvgpr_read_b32 v[vgprValuC+74], acc202         // copy acc to vreg[178]
v_accvgpr_read_b32 v[vgprValuC+75], acc206         // copy acc to vreg[179]
v_accvgpr_read_b32 v[vgprValuC+76], acc210         // copy acc to vreg[180]
v_accvgpr_read_b32 v[vgprValuC+77], acc214         // copy acc to vreg[181]
v_accvgpr_read_b32 v[vgprValuC+78], acc218         // copy acc to vreg[182]
v_accvgpr_read_b32 v[vgprValuC+79], acc222         // copy acc to vreg[183]
v_accvgpr_read_b32 v[vgprValuC+84], acc226         // copy acc to vreg[184]
v_accvgpr_read_b32 v[vgprValuC+85], acc230         // copy acc to vreg[185]
v_accvgpr_read_b32 v[vgprValuC+86], acc234         // copy acc to vreg[186]
v_accvgpr_read_b32 v[vgprValuC+87], acc238         // copy acc to vreg[187]
v_accvgpr_read_b32 v[vgprValuC+88], acc242         // copy acc to vreg[188]
v_accvgpr_read_b32 v[vgprValuC+89], acc246         // copy acc to vreg[189]
v_accvgpr_read_b32 v[vgprValuC+90], acc250         // copy acc to vreg[190]
v_accvgpr_read_b32 v[vgprValuC+91], acc254         // copy acc to vreg[191]

/* rC *= alpha batchElements=[(0, 0, 32, 0), (0, 0, 33, 0), (0, 0, 34, 0), (0, 0, 35, 0), (0, 0, 36, 0), (0, 0, 37, 0), (0, 0, 38, 0), (0, 0, 39, 0), (0, 0, 40, 0), (0, 0, 41, 0), (0, 0, 42, 0), (0, 0, 43, 0), (0, 0, 44, 0), (0, 0, 45, 0), (0, 0, 46, 0), (0, 0, 47, 0)] */
/* Note: no multiply instruction is emitted under the heading above; the values
 * read back from the accumulators are stored unmodified by this block. */

/* apply mask, calc new C and issue writes */
/* Batch #2 stores: 16 buffer_store_dwordx4, one per output row (vc1 = 32..47).
 * Each store writes four consecutive VGPRs through the D resource descriptor
 * s[sgprSrdD:sgprSrdD+3], using a per-row VGPR offset (offen) and no immediate
 * offset. Rows that failed the bounds test had their offset replaced by v82 in
 * the preceding address code; presumably that value (BufferOOB) makes the
 * buffer hardware drop the write - confirm against the BufferOOB definition.
 * NOTE(review): v7, v8 and v9 are loaded with bfloat16 helper constants but are
 * not read by any store in this block - possibly leftover from a bf16
 * conversion path; confirm before removing. */
v_mov_b32 v7, 0xffff0000                           // mask for pack two bfloat16 element to 32bit
v_mov_b32 v8, 0x7fff0000                           // fp32 Nan
v_mov_b32 v9, 0x7fff                               // rounding bias for bfloat16
buffer_store_dwordx4 v[12:15], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[16:19], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[24:27], v20, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[28:31], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[32:35], v22, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[36:39], v23, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[44:47], v40, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[48:51], v41, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[52:55], v42, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[56:59], v43, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[64:67], v60, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[68:71], v61, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[72:75], v62, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[76:79], v63, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[84:87], v80, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[88:91], v81, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
/* The next batch immediately overwrites v82 and the offset/data VGPRs used
 * above, hence the wait state below. */
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #3 (d1,d0,vc1,vc0) = */
/*    (0,0,48,0:vw4); (0,0,49,0:vw4); (0,0,50,0:vw4); (0,0,51,0:vw4); (0,0,52,0:vw4); (0,0,53,0:vw4); (0,0,54,0:vw4); (0,0,55,0:vw4); (0,0,56,0:vw4); (0,0,57,0:vw4); (0,0,58,0:vw4); (0,0,59,0:vw4); (0,0,60,0:vw4); (0,0,61,0:vw4); (0,0,62,0:vw4); (0,0,63,0:vw4) */
/******************************************/

/* calc coords and apply mask (no loads are issued in this batch) */
/* Batch #3 address setup for 16 output rows (vc1 = 48..63).
 * Registers as used by the code below:
 *   v0        coord0 (not modified in this batch)
 *   v1        coord1, incremented by 1 per row
 *   v2        C-input row pointer, advanced by StrideC1J per row; it is not read
 *             by any instruction in this batch
 *   v3        D-output row pointer, advanced by StrideD1J per row
 *   v82       clip value BufferOOB, substituted for out-of-bounds rows
 *   s[28:29]  lane mask coord0 < SizeI (recomputed every row although neither
 *             operand changes inside this batch)
 *   s[32:33]  lane mask (coord0 < SizeI) && (coord1 < SizeJ)
 * Per-row result: the store offset (v3 + v0) << 2, or BufferOOB if the lane is
 * out of bounds, written to v10, v11, v20-v23, v40-v43, v60-v63, v80, v81.
 * NOTE(review): each row later stores four dwords (vw4) but only the first
 * column (v0) is bounds-checked here; presumably this path is taken only when
 * that is sufficient - confirm at the branch that selects it. */
v_mov_b32 v82, BufferOOB
/* (d1,vc1,d0,vc0)=(0,48,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1: advance to next row (vcc receives the carry-out)

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v10, v3, v0, 0x2                    // scaleToBpe: (coutRowPtrD + coord0) << 2 = byte offset for the D store
v_cndmask_b32 v10, v82, v10, s[32:33]              // keep offset where in-bounds, else BufferOOB (v82)
/* (d1,vc1,d0,vc0)=(0,49,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1: advance to next row (vcc receives the carry-out)

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v11, v3, v0, 0x2                    // scaleToBpe: (coutRowPtrD + coord0) << 2 = byte offset for the D store
v_cndmask_b32 v11, v82, v11, s[32:33]              // keep offset where in-bounds, else BufferOOB (v82)
/* (d1,vc1,d0,vc0)=(0,50,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1: advance to next row (vcc receives the carry-out)

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v20, v3, v0, 0x2                    // scaleToBpe: (coutRowPtrD + coord0) << 2 = byte offset for the D store
v_cndmask_b32 v20, v82, v20, s[32:33]              // keep offset where in-bounds, else BufferOOB (v82)
/* (d1,vc1,d0,vc0)=(0,51,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1: advance to next row (vcc receives the carry-out)

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v21, v3, v0, 0x2                    // scaleToBpe: (coutRowPtrD + coord0) << 2 = byte offset for the D store
v_cndmask_b32 v21, v82, v21, s[32:33]              // keep offset where in-bounds, else BufferOOB (v82)
/* (d1,vc1,d0,vc0)=(0,52,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1: advance to next row (vcc receives the carry-out)

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v22, v3, v0, 0x2                    // scaleToBpe: (coutRowPtrD + coord0) << 2 = byte offset for the D store
v_cndmask_b32 v22, v82, v22, s[32:33]              // keep offset where in-bounds, else BufferOOB (v82)
/* (d1,vc1,d0,vc0)=(0,53,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1: advance to next row (vcc receives the carry-out)

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v23, v3, v0, 0x2                    // scaleToBpe: (coutRowPtrD + coord0) << 2 = byte offset for the D store
v_cndmask_b32 v23, v82, v23, s[32:33]              // keep offset where in-bounds, else BufferOOB (v82)
/* (d1,vc1,d0,vc0)=(0,54,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1: advance to next row (vcc receives the carry-out)

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v40, v3, v0, 0x2                    // scaleToBpe: (coutRowPtrD + coord0) << 2 = byte offset for the D store
v_cndmask_b32 v40, v82, v40, s[32:33]              // keep offset where in-bounds, else BufferOOB (v82)
/* (d1,vc1,d0,vc0)=(0,55,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1: advance to next row (vcc receives the carry-out)

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v41, v3, v0, 0x2                    // scaleToBpe: (coutRowPtrD + coord0) << 2 = byte offset for the D store
v_cndmask_b32 v41, v82, v41, s[32:33]              // keep offset where in-bounds, else BufferOOB (v82)
/* (d1,vc1,d0,vc0)=(0,56,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1: advance to next row (vcc receives the carry-out)

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v42, v3, v0, 0x2                    // scaleToBpe: (coutRowPtrD + coord0) << 2 = byte offset for the D store
v_cndmask_b32 v42, v82, v42, s[32:33]              // keep offset where in-bounds, else BufferOOB (v82)
/* (d1,vc1,d0,vc0)=(0,57,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1: advance to next row (vcc receives the carry-out)

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v43, v3, v0, 0x2                    // scaleToBpe: (coutRowPtrD + coord0) << 2 = byte offset for the D store
v_cndmask_b32 v43, v82, v43, s[32:33]              // keep offset where in-bounds, else BufferOOB (v82)
/* (d1,vc1,d0,vc0)=(0,58,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1: advance to next row (vcc receives the carry-out)

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v60, v3, v0, 0x2                    // scaleToBpe: (coutRowPtrD + coord0) << 2 = byte offset for the D store
v_cndmask_b32 v60, v82, v60, s[32:33]              // keep offset where in-bounds, else BufferOOB (v82)
/* (d1,vc1,d0,vc0)=(0,59,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1: advance to next row (vcc receives the carry-out)

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v61, v3, v0, 0x2                    // scaleToBpe: (coutRowPtrD + coord0) << 2 = byte offset for the D store
v_cndmask_b32 v61, v82, v61, s[32:33]              // keep offset where in-bounds, else BufferOOB (v82)
/* (d1,vc1,d0,vc0)=(0,60,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1: advance to next row (vcc receives the carry-out)

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v62, v3, v0, 0x2                    // scaleToBpe: (coutRowPtrD + coord0) << 2 = byte offset for the D store
v_cndmask_b32 v62, v82, v62, s[32:33]              // keep offset where in-bounds, else BufferOOB (v82)
/* (d1,vc1,d0,vc0)=(0,61,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1: advance to next row (vcc receives the carry-out)

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v63, v3, v0, 0x2                    // scaleToBpe: (coutRowPtrD + coord0) << 2 = byte offset for the D store
v_cndmask_b32 v63, v82, v63, s[32:33]              // keep offset where in-bounds, else BufferOOB (v82)
/* (d1,vc1,d0,vc0)=(0,62,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1: advance to next row (vcc receives the carry-out)

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v80, v3, v0, 0x2                    // scaleToBpe: (coutRowPtrD + coord0) << 2 = byte offset for the D store
v_cndmask_b32 v80, v82, v80, s[32:33]              // keep offset where in-bounds, else BufferOOB (v82)
/* (d1,vc1,d0,vc0)=(0,63,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1: advance to next row (vcc receives the carry-out)

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v81, v3, v0, 0x2                    // scaleToBpe: (coutRowPtrD + coord0) << 2 = byte offset for the D store
v_cndmask_b32 v81, v82, v81, s[32:33]              // keep offset where in-bounds, else BufferOOB (v82)
/* Batch #3 accumulator read-back.
 * Copies 64 accumulator registers (acc3, acc7, ..., acc255: every fourth
 * accumulator starting at acc3) into VGPRs, four consecutive destinations per
 * output row, so that each group of four can be written by one
 * buffer_store_dwordx4 below.
 * Destination indices run +12..+19, +24..+39, +44..+59, +64..+79, +84..+91;
 * the gaps (+20..+23, +40..+43, +60..+63, +80..+83) are skipped because this
 * batch keeps its store offsets in v20-v23, v40-v43, v60-v63, v80, v81 and the
 * BufferOOB clip value in v82.
 * NOTE(review): the stores address the data as literal v[12:15], v[16:19], ...
 * while these reads use vgprValuC+N; that only lines up if vgprValuC == 0.
 * The definition of vgprValuC is not visible here - confirm.
 * NOTE(review): the "vreg[N]" numbers in the trailing comments do not match
 * the destination VGPR numbers; they look like the generator's logical element
 * index - confirm. */
v_accvgpr_read_b32 v[vgprValuC+12], acc3           // copy acc to vreg[192]
v_accvgpr_read_b32 v[vgprValuC+13], acc7           // copy acc to vreg[193]
v_accvgpr_read_b32 v[vgprValuC+14], acc11          // copy acc to vreg[194]
v_accvgpr_read_b32 v[vgprValuC+15], acc15          // copy acc to vreg[195]
v_accvgpr_read_b32 v[vgprValuC+16], acc19          // copy acc to vreg[196]
v_accvgpr_read_b32 v[vgprValuC+17], acc23          // copy acc to vreg[197]
v_accvgpr_read_b32 v[vgprValuC+18], acc27          // copy acc to vreg[198]
v_accvgpr_read_b32 v[vgprValuC+19], acc31          // copy acc to vreg[199]
v_accvgpr_read_b32 v[vgprValuC+24], acc35          // copy acc to vreg[200]
v_accvgpr_read_b32 v[vgprValuC+25], acc39          // copy acc to vreg[201]
v_accvgpr_read_b32 v[vgprValuC+26], acc43          // copy acc to vreg[202]
v_accvgpr_read_b32 v[vgprValuC+27], acc47          // copy acc to vreg[203]
v_accvgpr_read_b32 v[vgprValuC+28], acc51          // copy acc to vreg[204]
v_accvgpr_read_b32 v[vgprValuC+29], acc55          // copy acc to vreg[205]
v_accvgpr_read_b32 v[vgprValuC+30], acc59          // copy acc to vreg[206]
v_accvgpr_read_b32 v[vgprValuC+31], acc63          // copy acc to vreg[207]
v_accvgpr_read_b32 v[vgprValuC+32], acc67          // copy acc to vreg[208]
v_accvgpr_read_b32 v[vgprValuC+33], acc71          // copy acc to vreg[209]
v_accvgpr_read_b32 v[vgprValuC+34], acc75          // copy acc to vreg[210]
v_accvgpr_read_b32 v[vgprValuC+35], acc79          // copy acc to vreg[211]
v_accvgpr_read_b32 v[vgprValuC+36], acc83          // copy acc to vreg[212]
v_accvgpr_read_b32 v[vgprValuC+37], acc87          // copy acc to vreg[213]
v_accvgpr_read_b32 v[vgprValuC+38], acc91          // copy acc to vreg[214]
v_accvgpr_read_b32 v[vgprValuC+39], acc95          // copy acc to vreg[215]
v_accvgpr_read_b32 v[vgprValuC+44], acc99          // copy acc to vreg[216]
v_accvgpr_read_b32 v[vgprValuC+45], acc103         // copy acc to vreg[217]
v_accvgpr_read_b32 v[vgprValuC+46], acc107         // copy acc to vreg[218]
v_accvgpr_read_b32 v[vgprValuC+47], acc111         // copy acc to vreg[219]
v_accvgpr_read_b32 v[vgprValuC+48], acc115         // copy acc to vreg[220]
v_accvgpr_read_b32 v[vgprValuC+49], acc119         // copy acc to vreg[221]
v_accvgpr_read_b32 v[vgprValuC+50], acc123         // copy acc to vreg[222]
v_accvgpr_read_b32 v[vgprValuC+51], acc127         // copy acc to vreg[223]
v_accvgpr_read_b32 v[vgprValuC+52], acc131         // copy acc to vreg[224]
v_accvgpr_read_b32 v[vgprValuC+53], acc135         // copy acc to vreg[225]
v_accvgpr_read_b32 v[vgprValuC+54], acc139         // copy acc to vreg[226]
v_accvgpr_read_b32 v[vgprValuC+55], acc143         // copy acc to vreg[227]
v_accvgpr_read_b32 v[vgprValuC+56], acc147         // copy acc to vreg[228]
v_accvgpr_read_b32 v[vgprValuC+57], acc151         // copy acc to vreg[229]
v_accvgpr_read_b32 v[vgprValuC+58], acc155         // copy acc to vreg[230]
v_accvgpr_read_b32 v[vgprValuC+59], acc159         // copy acc to vreg[231]
v_accvgpr_read_b32 v[vgprValuC+64], acc163         // copy acc to vreg[232]
v_accvgpr_read_b32 v[vgprValuC+65], acc167         // copy acc to vreg[233]
v_accvgpr_read_b32 v[vgprValuC+66], acc171         // copy acc to vreg[234]
v_accvgpr_read_b32 v[vgprValuC+67], acc175         // copy acc to vreg[235]
v_accvgpr_read_b32 v[vgprValuC+68], acc179         // copy acc to vreg[236]
v_accvgpr_read_b32 v[vgprValuC+69], acc183         // copy acc to vreg[237]
v_accvgpr_read_b32 v[vgprValuC+70], acc187         // copy acc to vreg[238]
v_accvgpr_read_b32 v[vgprValuC+71], acc191         // copy acc to vreg[239]
v_accvgpr_read_b32 v[vgprValuC+72], acc195         // copy acc to vreg[240]
v_accvgpr_read_b32 v[vgprValuC+73], acc199         // copy acc to vreg[241]
v_accvgpr_read_b32 v[vgprValuC+74], acc203         // copy acc to vreg[242]
v_accvgpr_read_b32 v[vgprValuC+75], acc207         // copy acc to vreg[243]
v_accvgpr_read_b32 v[vgprValuC+76], acc211         // copy acc to vreg[244]
v_accvgpr_read_b32 v[vgprValuC+77], acc215         // copy acc to vreg[245]
v_accvgpr_read_b32 v[vgprValuC+78], acc219         // copy acc to vreg[246]
v_accvgpr_read_b32 v[vgprValuC+79], acc223         // copy acc to vreg[247]
v_accvgpr_read_b32 v[vgprValuC+84], acc227         // copy acc to vreg[248]
v_accvgpr_read_b32 v[vgprValuC+85], acc231         // copy acc to vreg[249]
v_accvgpr_read_b32 v[vgprValuC+86], acc235         // copy acc to vreg[250]
v_accvgpr_read_b32 v[vgprValuC+87], acc239         // copy acc to vreg[251]
v_accvgpr_read_b32 v[vgprValuC+88], acc243         // copy acc to vreg[252]
v_accvgpr_read_b32 v[vgprValuC+89], acc247         // copy acc to vreg[253]
v_accvgpr_read_b32 v[vgprValuC+90], acc251         // copy acc to vreg[254]
v_accvgpr_read_b32 v[vgprValuC+91], acc255         // copy acc to vreg[255]

/* rC *= alpha batchElements=[(0, 0, 48, 0), (0, 0, 49, 0), (0, 0, 50, 0), (0, 0, 51, 0), (0, 0, 52, 0), (0, 0, 53, 0), (0, 0, 54, 0), (0, 0, 55, 0), (0, 0, 56, 0), (0, 0, 57, 0), (0, 0, 58, 0), (0, 0, 59, 0), (0, 0, 60, 0), (0, 0, 61, 0), (0, 0, 62, 0), (0, 0, 63, 0)] */
/* Note: no multiply instruction is emitted under the heading above; the values
 * read back from the accumulators are stored unmodified by this block. */

/* apply mask, calc new C and issue writes */
/* Batch #3 stores: 16 buffer_store_dwordx4, one per output row (vc1 = 48..63).
 * Each store writes four consecutive VGPRs through the D resource descriptor
 * s[sgprSrdD:sgprSrdD+3], using a per-row VGPR offset (offen) and no immediate
 * offset. Rows that failed the bounds test carry the BufferOOB value as their
 * offset; presumably that makes the buffer hardware drop the write - confirm
 * against the BufferOOB definition.
 * NOTE(review): v7, v8 and v9 are loaded with bfloat16 helper constants but are
 * not read by any instruction between here and the branch to label_GW_End -
 * possibly leftover from a bf16 conversion path; confirm before removing. */
v_mov_b32 v7, 0xffff0000                           // mask for pack two bfloat16 element to 32bit
v_mov_b32 v8, 0x7fff0000                           // fp32 Nan
v_mov_b32 v9, 0x7fff                               // rounding bias for bfloat16
buffer_store_dwordx4 v[12:15], v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[16:19], v11, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[24:27], v20, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[28:31], v21, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[32:35], v22, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[36:39], v23, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[44:47], v40, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[48:51], v41, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[52:55], v42, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[56:59], v43, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[64:67], v60, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[68:71], v61, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[72:75], v62, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[76:79], v63, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[84:87], v80, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dwordx4 v[88:91], v81, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
/* Last batch of this write path: skip the code that follows
 * (label_GW_B0_E1_M begins immediately after) and continue at label_GW_End. */
s_branch label_GW_End                              // jump to end
label_GW_B0_E1_M:

/*
 * Edge (bounds-checked) global write path, batch #0.
 *
 * The batch writes 16 single-dword elements of D: 4 consecutive coord1 rows
 * (vc1 = 0..3) times 4 consecutive coord0 columns (vc0 = 0..3).
 * For every element the code:
 *   1. builds a per-lane mask (coord0 < SizeI) && (coord1 < SizeJ) in s[32:33],
 *      using s[28:29] as scratch for the coord0 test;
 *   2. computes the byte offset (v3 + coord0) << 2;
 *   3. replaces the offset with BufferOOB (v42) in lanes where the mask is clear.
 * Afterwards 16 accumulator registers are copied to VGPRs and stored with
 * buffer_store_dword through the sgprSrdD resource descriptor.
 *
 * Register roles as used in this batch:
 *   v0  coord0 (read only)            v1  coord1 (incremented once per row)
 *   v2  cinRowPtr (advanced per row, never read in this batch)
 *   v3  coutRowPtrD (advanced per row by StrideD1J)
 *   v4  scratch: coord0 + vc0
 *   v10,v12,...,v40  store offsets    v11,v13,...,v41  store data
 *   v42 BufferOOB constant
 * NOTE(review): "B0" in the label presumably means beta == 0 (no C read is
 * issued here) - confirm against the generator.
 */

/* edge=1, allocate 6 sgpr. perBatchTmpS=4 perBatchMaskS=2 perElementMaskS=0 elementsPerBatch=16 */
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #0 (d1,d0,vc1,vc0) = */
/*    (0,0,0,0:vw1); (0,0,0,1:vw1); (0,0,0,2:vw1); (0,0,0,3:vw1); (0,0,1,0:vw1); (0,0,1,1:vw1); (0,0,1,2:vw1); (0,0,1,3:vw1); (0,0,2,0:vw1); (0,0,2,1:vw1); (0,0,2,2:vw1); (0,0,2,3:vw1); (0,0,3,0:vw1); (0,0,3,1:vw1); (0,0,3,2:vw1); (0,0,3,3:vw1) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
/* No loads are issued in this batch; only store offsets are computed. */
/* v42 holds the offset substituted for out-of-bounds lanes. Presumably BufferOOB */
/* is large enough that the buffer range check drops the store - confirm at its definition. */
v_mov_b32 v42, BufferOOB
/* (d1,vc1,d0,vc0)=(0,0,0,0) */
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1 (per-lane store mask)
v_add_lshl_u32 v10, v3, v0, 0x2                    // offset = (coutRowPtrD + coord0) << 2, consumed by store D
v_cndmask_b32 v10, v42, v10, s[32:33]              // LDD clip: masked-in lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,0,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1 = coord0 + vc0 (vc0=1)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1 (per-lane store mask)
v_add_lshl_u32 v12, v3, v4, 0x2                    // offset = (coutRowPtrD + coord0.1) << 2, consumed by store D
v_cndmask_b32 v12, v42, v12, s[32:33]              // LDD clip: masked-in lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,0,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1 = coord0 + vc0 (vc0=2)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1 (per-lane store mask)
v_add_lshl_u32 v14, v3, v4, 0x2                    // offset = (coutRowPtrD + coord0.1) << 2, consumed by store D
v_cndmask_b32 v14, v42, v14, s[32:33]              // LDD clip: masked-in lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,0,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1 = coord0 + vc0 (vc0=3)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1 (per-lane store mask)
v_add_lshl_u32 v16, v3, v4, 0x2                    // offset = (coutRowPtrD + coord0.1) << 2, consumed by store D
v_cndmask_b32 v16, v42, v16, s[32:33]              // LDD clip: masked-in lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,1,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1 += 1, step to the next row

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row (v2 is not read in this batch)
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1 (per-lane store mask)
v_add_lshl_u32 v18, v3, v0, 0x2                    // offset = (coutRowPtrD + coord0) << 2, consumed by store D
v_cndmask_b32 v18, v42, v18, s[32:33]              // LDD clip: masked-in lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,1,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1 = coord0 + vc0 (vc0=1)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1 (per-lane store mask)
v_add_lshl_u32 v20, v3, v4, 0x2                    // offset = (coutRowPtrD + coord0.1) << 2, consumed by store D
v_cndmask_b32 v20, v42, v20, s[32:33]              // LDD clip: masked-in lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,1,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1 = coord0 + vc0 (vc0=2)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1 (per-lane store mask)
v_add_lshl_u32 v22, v3, v4, 0x2                    // offset = (coutRowPtrD + coord0.1) << 2, consumed by store D
v_cndmask_b32 v22, v42, v22, s[32:33]              // LDD clip: masked-in lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,1,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1 = coord0 + vc0 (vc0=3)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1 (per-lane store mask)
v_add_lshl_u32 v24, v3, v4, 0x2                    // offset = (coutRowPtrD + coord0.1) << 2, consumed by store D
v_cndmask_b32 v24, v42, v24, s[32:33]              // LDD clip: masked-in lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,2,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1 += 1, step to the next row

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row (v2 is not read in this batch)
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1 (per-lane store mask)
v_add_lshl_u32 v26, v3, v0, 0x2                    // offset = (coutRowPtrD + coord0) << 2, consumed by store D
v_cndmask_b32 v26, v42, v26, s[32:33]              // LDD clip: masked-in lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,2,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1 = coord0 + vc0 (vc0=1)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1 (per-lane store mask)
v_add_lshl_u32 v28, v3, v4, 0x2                    // offset = (coutRowPtrD + coord0.1) << 2, consumed by store D
v_cndmask_b32 v28, v42, v28, s[32:33]              // LDD clip: masked-in lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,2,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1 = coord0 + vc0 (vc0=2)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1 (per-lane store mask)
v_add_lshl_u32 v30, v3, v4, 0x2                    // offset = (coutRowPtrD + coord0.1) << 2, consumed by store D
v_cndmask_b32 v30, v42, v30, s[32:33]              // LDD clip: masked-in lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,2,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1 = coord0 + vc0 (vc0=3)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1 (per-lane store mask)
v_add_lshl_u32 v32, v3, v4, 0x2                    // offset = (coutRowPtrD + coord0.1) << 2, consumed by store D
v_cndmask_b32 v32, v42, v32, s[32:33]              // LDD clip: masked-in lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,3,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1 += 1, step to the next row

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row (v2 is not read in this batch)
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1 (per-lane store mask)
v_add_lshl_u32 v34, v3, v0, 0x2                    // offset = (coutRowPtrD + coord0) << 2, consumed by store D
v_cndmask_b32 v34, v42, v34, s[32:33]              // LDD clip: masked-in lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,3,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1 = coord0 + vc0 (vc0=1)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1 (per-lane store mask)
v_add_lshl_u32 v36, v3, v4, 0x2                    // offset = (coutRowPtrD + coord0.1) << 2, consumed by store D
v_cndmask_b32 v36, v42, v36, s[32:33]              // LDD clip: masked-in lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,3,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1 = coord0 + vc0 (vc0=2)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1 (per-lane store mask)
v_add_lshl_u32 v38, v3, v4, 0x2                    // offset = (coutRowPtrD + coord0.1) << 2, consumed by store D
v_cndmask_b32 v38, v42, v38, s[32:33]              // LDD clip: masked-in lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,3,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1 = coord0 + vc0 (vc0=3)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1 (per-lane store mask)
v_add_lshl_u32 v40, v3, v4, 0x2                    // offset = (coutRowPtrD + coord0.1) << 2, consumed by store D
v_cndmask_b32 v40, v42, v40, s[32:33]              // LDD clip: masked-in lanes keep offset, others get BufferOOB

/* Copy the 16 results of this batch out of the accumulator file; the acc index advances by 4 per element. */
/* NOTE(review): the stores below name v11..v41 literally, so these reads rely on vgprValuC being 0 - confirm at its definition. */
v_accvgpr_read_b32 v[vgprValuC+11], acc0           // copy acc to vreg[0]
v_accvgpr_read_b32 v[vgprValuC+13], acc4           // copy acc to vreg[1]
v_accvgpr_read_b32 v[vgprValuC+15], acc8           // copy acc to vreg[2]
v_accvgpr_read_b32 v[vgprValuC+17], acc12          // copy acc to vreg[3]
v_accvgpr_read_b32 v[vgprValuC+19], acc16          // copy acc to vreg[4]
v_accvgpr_read_b32 v[vgprValuC+21], acc20          // copy acc to vreg[5]
v_accvgpr_read_b32 v[vgprValuC+23], acc24          // copy acc to vreg[6]
v_accvgpr_read_b32 v[vgprValuC+25], acc28          // copy acc to vreg[7]
v_accvgpr_read_b32 v[vgprValuC+27], acc32          // copy acc to vreg[8]
v_accvgpr_read_b32 v[vgprValuC+29], acc36          // copy acc to vreg[9]
v_accvgpr_read_b32 v[vgprValuC+31], acc40          // copy acc to vreg[10]
v_accvgpr_read_b32 v[vgprValuC+33], acc44          // copy acc to vreg[11]
v_accvgpr_read_b32 v[vgprValuC+35], acc48          // copy acc to vreg[12]
v_accvgpr_read_b32 v[vgprValuC+37], acc52          // copy acc to vreg[13]
v_accvgpr_read_b32 v[vgprValuC+39], acc56          // copy acc to vreg[14]
v_accvgpr_read_b32 v[vgprValuC+41], acc60          // copy acc to vreg[15]

/* rC *= alpha batchElements=[(0, 0, 0, 0), (0, 0, 0, 1), (0, 0, 0, 2), (0, 0, 0, 3), (0, 0, 1, 0), (0, 0, 1, 1), (0, 0, 1, 2), (0, 0, 1, 3), (0, 0, 2, 0), (0, 0, 2, 1), (0, 0, 2, 2), (0, 0, 2, 3), (0, 0, 3, 0), (0, 0, 3, 1), (0, 0, 3, 2), (0, 0, 3, 3)] */
/* No instructions are emitted for the alpha step in this batch. */

/* apply mask, calc new C and issue writes */
/* NOTE(review): v7-v9 are loaded with bfloat16 conversion constants, but no instruction in this batch reads them; */
/* the stores below write the 32-bit accumulator values unchanged, and the next batch reloads v7-v9. */
v_mov_b32 v7, 0xffff0000                           // mask for pack two bfloat16 element to 32bit
v_mov_b32 v8, 0x7fff0000                           // fp32 Nan
v_mov_b32 v9, 0x7fff                               // rounding bias for bfloat16
/* One dword store per element; data in odd registers v11..v41, clipped byte offset in the preceding even register. */
buffer_store_dword v11, v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v13, v12, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v15, v14, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v17, v16, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v19, v18, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v21, v20, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v23, v22, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v25, v24, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v27, v26, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v29, v28, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v31, v30, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v33, v32, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v35, v34, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v37, v36, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v39, v38, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v41, v40, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_nop 0                                            // 1 wait state before following code rewrites the store VGPRs (generator note cites dwordx4; stores here are dword)
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #1 (d1,d0,vc1,vc0) = */
/*    (0,0,4,0:vw1); (0,0,4,1:vw1); (0,0,4,2:vw1); (0,0,4,3:vw1); (0,0,5,0:vw1); (0,0,5,1:vw1); (0,0,5,2:vw1); (0,0,5,3:vw1); (0,0,6,0:vw1); (0,0,6,1:vw1); (0,0,6,2:vw1); (0,0,6,3:vw1); (0,0,7,0:vw1); (0,0,7,1:vw1); (0,0,7,2:vw1); (0,0,7,3:vw1) */
/******************************************/

/*
 * Bounds-checked write of 16 single-dword elements of D: rows vc1 = 4..7,
 * columns vc0 = 0..3. Every row starts by incrementing coord1 (v1) and
 * advancing the row pointers v2 (cinRowPtr) and v3 (coutRowPtrD).
 * Per element: s[32:33] = (coord0 < SizeI) && (coord1 < SizeJ), with s[28:29]
 * as scratch; byte offset = (v3 + coord0) << 2; lanes outside the mask get
 * BufferOOB (v42) as their offset. Offsets land in v10,v12,...,v40 and the
 * data read from the accumulators lands in v11,v13,...,v41.
 * v2 is advanced but never read in this batch.
 */

/* calc coords, apply mask, and issue loads (if necessary) */
/* No loads are issued in this batch; only store offsets are computed. */
/* v42 holds the offset substituted for out-of-bounds lanes. Presumably BufferOOB */
/* is large enough that the buffer range check drops the store - confirm at its definition. */
v_mov_b32 v42, BufferOOB
/* (d1,vc1,d0,vc0)=(0,4,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1 += 1, step to the next row

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row (v2 is not read in this batch)
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1 (per-lane store mask)
v_add_lshl_u32 v10, v3, v0, 0x2                    // offset = (coutRowPtrD + coord0) << 2, consumed by store D
v_cndmask_b32 v10, v42, v10, s[32:33]              // LDD clip: masked-in lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,4,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1 = coord0 + vc0 (vc0=1)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1 (per-lane store mask)
v_add_lshl_u32 v12, v3, v4, 0x2                    // offset = (coutRowPtrD + coord0.1) << 2, consumed by store D
v_cndmask_b32 v12, v42, v12, s[32:33]              // LDD clip: masked-in lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,4,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1 = coord0 + vc0 (vc0=2)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1 (per-lane store mask)
v_add_lshl_u32 v14, v3, v4, 0x2                    // offset = (coutRowPtrD + coord0.1) << 2, consumed by store D
v_cndmask_b32 v14, v42, v14, s[32:33]              // LDD clip: masked-in lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,4,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1 = coord0 + vc0 (vc0=3)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1 (per-lane store mask)
v_add_lshl_u32 v16, v3, v4, 0x2                    // offset = (coutRowPtrD + coord0.1) << 2, consumed by store D
v_cndmask_b32 v16, v42, v16, s[32:33]              // LDD clip: masked-in lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,5,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1 += 1, step to the next row

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row (v2 is not read in this batch)
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1 (per-lane store mask)
v_add_lshl_u32 v18, v3, v0, 0x2                    // offset = (coutRowPtrD + coord0) << 2, consumed by store D
v_cndmask_b32 v18, v42, v18, s[32:33]              // LDD clip: masked-in lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,5,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1 = coord0 + vc0 (vc0=1)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1 (per-lane store mask)
v_add_lshl_u32 v20, v3, v4, 0x2                    // offset = (coutRowPtrD + coord0.1) << 2, consumed by store D
v_cndmask_b32 v20, v42, v20, s[32:33]              // LDD clip: masked-in lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,5,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1 = coord0 + vc0 (vc0=2)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1 (per-lane store mask)
v_add_lshl_u32 v22, v3, v4, 0x2                    // offset = (coutRowPtrD + coord0.1) << 2, consumed by store D
v_cndmask_b32 v22, v42, v22, s[32:33]              // LDD clip: masked-in lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,5,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1 = coord0 + vc0 (vc0=3)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1 (per-lane store mask)
v_add_lshl_u32 v24, v3, v4, 0x2                    // offset = (coutRowPtrD + coord0.1) << 2, consumed by store D
v_cndmask_b32 v24, v42, v24, s[32:33]              // LDD clip: masked-in lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,6,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1 += 1, step to the next row

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row (v2 is not read in this batch)
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1 (per-lane store mask)
v_add_lshl_u32 v26, v3, v0, 0x2                    // offset = (coutRowPtrD + coord0) << 2, consumed by store D
v_cndmask_b32 v26, v42, v26, s[32:33]              // LDD clip: masked-in lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,6,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1 = coord0 + vc0 (vc0=1)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1 (per-lane store mask)
v_add_lshl_u32 v28, v3, v4, 0x2                    // offset = (coutRowPtrD + coord0.1) << 2, consumed by store D
v_cndmask_b32 v28, v42, v28, s[32:33]              // LDD clip: masked-in lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,6,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1 = coord0 + vc0 (vc0=2)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1 (per-lane store mask)
v_add_lshl_u32 v30, v3, v4, 0x2                    // offset = (coutRowPtrD + coord0.1) << 2, consumed by store D
v_cndmask_b32 v30, v42, v30, s[32:33]              // LDD clip: masked-in lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,6,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1 = coord0 + vc0 (vc0=3)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1 (per-lane store mask)
v_add_lshl_u32 v32, v3, v4, 0x2                    // offset = (coutRowPtrD + coord0.1) << 2, consumed by store D
v_cndmask_b32 v32, v42, v32, s[32:33]              // LDD clip: masked-in lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,7,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1 += 1, step to the next row

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row (v2 is not read in this batch)
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1 (per-lane store mask)
v_add_lshl_u32 v34, v3, v0, 0x2                    // offset = (coutRowPtrD + coord0) << 2, consumed by store D
v_cndmask_b32 v34, v42, v34, s[32:33]              // LDD clip: masked-in lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,7,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1 = coord0 + vc0 (vc0=1)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1 (per-lane store mask)
v_add_lshl_u32 v36, v3, v4, 0x2                    // offset = (coutRowPtrD + coord0.1) << 2, consumed by store D
v_cndmask_b32 v36, v42, v36, s[32:33]              // LDD clip: masked-in lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,7,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1 = coord0 + vc0 (vc0=2)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1 (per-lane store mask)
v_add_lshl_u32 v38, v3, v4, 0x2                    // offset = (coutRowPtrD + coord0.1) << 2, consumed by store D
v_cndmask_b32 v38, v42, v38, s[32:33]              // LDD clip: masked-in lanes keep offset, others get BufferOOB
/* (d1,vc1,d0,vc0)=(0,7,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1 = coord0 + vc0 (vc0=3)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1 (per-lane store mask)
v_add_lshl_u32 v40, v3, v4, 0x2                    // offset = (coutRowPtrD + coord0.1) << 2, consumed by store D
v_cndmask_b32 v40, v42, v40, s[32:33]              // LDD clip: masked-in lanes keep offset, others get BufferOOB

/* Copy the 16 results of this batch out of the accumulator file (acc64..acc124, index step 4). */
/* NOTE(review): the stores below name v11..v41 literally, so these reads rely on vgprValuC being 0 - confirm at its definition. */
v_accvgpr_read_b32 v[vgprValuC+11], acc64          // copy acc to vreg[16]
v_accvgpr_read_b32 v[vgprValuC+13], acc68          // copy acc to vreg[17]
v_accvgpr_read_b32 v[vgprValuC+15], acc72          // copy acc to vreg[18]
v_accvgpr_read_b32 v[vgprValuC+17], acc76          // copy acc to vreg[19]
v_accvgpr_read_b32 v[vgprValuC+19], acc80          // copy acc to vreg[20]
v_accvgpr_read_b32 v[vgprValuC+21], acc84          // copy acc to vreg[21]
v_accvgpr_read_b32 v[vgprValuC+23], acc88          // copy acc to vreg[22]
v_accvgpr_read_b32 v[vgprValuC+25], acc92          // copy acc to vreg[23]
v_accvgpr_read_b32 v[vgprValuC+27], acc96          // copy acc to vreg[24]
v_accvgpr_read_b32 v[vgprValuC+29], acc100         // copy acc to vreg[25]
v_accvgpr_read_b32 v[vgprValuC+31], acc104         // copy acc to vreg[26]
v_accvgpr_read_b32 v[vgprValuC+33], acc108         // copy acc to vreg[27]
v_accvgpr_read_b32 v[vgprValuC+35], acc112         // copy acc to vreg[28]
v_accvgpr_read_b32 v[vgprValuC+37], acc116         // copy acc to vreg[29]
v_accvgpr_read_b32 v[vgprValuC+39], acc120         // copy acc to vreg[30]
v_accvgpr_read_b32 v[vgprValuC+41], acc124         // copy acc to vreg[31]

/* rC *= alpha batchElements=[(0, 0, 4, 0), (0, 0, 4, 1), (0, 0, 4, 2), (0, 0, 4, 3), (0, 0, 5, 0), (0, 0, 5, 1), (0, 0, 5, 2), (0, 0, 5, 3), (0, 0, 6, 0), (0, 0, 6, 1), (0, 0, 6, 2), (0, 0, 6, 3), (0, 0, 7, 0), (0, 0, 7, 1), (0, 0, 7, 2), (0, 0, 7, 3)] */
/* No instructions are emitted for the alpha step in this batch. */

/* apply mask, calc new C and issue writes */
/* NOTE(review): v7-v9 are loaded with bfloat16 conversion constants, but no instruction in this batch reads them; */
/* the stores below write the 32-bit accumulator values unchanged. */
v_mov_b32 v7, 0xffff0000                           // mask for pack two bfloat16 element to 32bit
v_mov_b32 v8, 0x7fff0000                           // fp32 Nan
v_mov_b32 v9, 0x7fff                               // rounding bias for bfloat16
/* One dword store per element; data in odd registers v11..v41, clipped byte offset in the preceding even register. */
buffer_store_dword v11, v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v13, v12, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v15, v14, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v17, v16, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v19, v18, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v21, v20, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v23, v22, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v25, v24, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v27, v26, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v29, v28, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v31, v30, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v33, v32, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v35, v34, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v37, v36, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v39, v38, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v41, v40, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_nop 0                                            // 1 wait state before following code rewrites the store VGPRs (generator note cites dwordx4; stores here are dword)
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #2 (d1,d0,vc1,vc0) = */
/*    (0,0,8,0:vw1); (0,0,8,1:vw1); (0,0,8,2:vw1); (0,0,8,3:vw1); (0,0,9,0:vw1); (0,0,9,1:vw1); (0,0,9,2:vw1); (0,0,9,3:vw1); (0,0,10,0:vw1); (0,0,10,1:vw1); (0,0,10,2:vw1); (0,0,10,3:vw1); (0,0,11,0:vw1); (0,0,11,1:vw1); (0,0,11,2:vw1); (0,0,11,3:vw1) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
v_mov_b32 v42, BufferOOB
/* (d1,vc1,d0,vc0)=(0,8,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v10, v3, v0, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v10, v42, v10, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,8,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v12, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v12, v42, v12, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,8,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v14, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v14, v42, v14, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,8,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v16, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v16, v42, v16, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,9,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v18, v3, v0, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v18, v42, v18, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,9,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v20, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v20, v42, v20, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,9,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v22, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v22, v42, v22, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,9,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v24, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v24, v42, v24, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,10,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v26, v3, v0, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v26, v42, v26, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,10,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v28, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v28, v42, v28, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,10,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v30, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v30, v42, v30, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,10,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v32, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v32, v42, v32, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,11,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v34, v3, v0, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v34, v42, v34, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,11,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v36, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v36, v42, v36, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,11,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v38, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v38, v42, v38, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,11,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v40, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v40, v42, v40, s[32:33]              // LDD clip if OOB. offset
v_accvgpr_read_b32 v[vgprValuC+11], acc128         // copy acc to vreg[32]
v_accvgpr_read_b32 v[vgprValuC+13], acc132         // copy acc to vreg[33]
v_accvgpr_read_b32 v[vgprValuC+15], acc136         // copy acc to vreg[34]
v_accvgpr_read_b32 v[vgprValuC+17], acc140         // copy acc to vreg[35]
v_accvgpr_read_b32 v[vgprValuC+19], acc144         // copy acc to vreg[36]
v_accvgpr_read_b32 v[vgprValuC+21], acc148         // copy acc to vreg[37]
v_accvgpr_read_b32 v[vgprValuC+23], acc152         // copy acc to vreg[38]
v_accvgpr_read_b32 v[vgprValuC+25], acc156         // copy acc to vreg[39]
v_accvgpr_read_b32 v[vgprValuC+27], acc160         // copy acc to vreg[40]
v_accvgpr_read_b32 v[vgprValuC+29], acc164         // copy acc to vreg[41]
v_accvgpr_read_b32 v[vgprValuC+31], acc168         // copy acc to vreg[42]
v_accvgpr_read_b32 v[vgprValuC+33], acc172         // copy acc to vreg[43]
v_accvgpr_read_b32 v[vgprValuC+35], acc176         // copy acc to vreg[44]
v_accvgpr_read_b32 v[vgprValuC+37], acc180         // copy acc to vreg[45]
v_accvgpr_read_b32 v[vgprValuC+39], acc184         // copy acc to vreg[46]
v_accvgpr_read_b32 v[vgprValuC+41], acc188         // copy acc to vreg[47]

/* rC *= alpha batchElements=[(0, 0, 8, 0), (0, 0, 8, 1), (0, 0, 8, 2), (0, 0, 8, 3), (0, 0, 9, 0), (0, 0, 9, 1), (0, 0, 9, 2), (0, 0, 9, 3), (0, 0, 10, 0), (0, 0, 10, 1), (0, 0, 10, 2), (0, 0, 10, 3), (0, 0, 11, 0), (0, 0, 11, 1), (0, 0, 11, 2), (0, 0, 11, 3)] */

/* apply mask, calc new C and issue writes */
v_mov_b32 v7, 0xffff0000                           // mask for pack two bfloat16 element to 32bit
v_mov_b32 v8, 0x7fff0000                           // fp32 Nan
v_mov_b32 v9, 0x7fff                               // rounding bias for bfloat16
buffer_store_dword v11, v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v13, v12, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v15, v14, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v17, v16, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v19, v18, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v21, v20, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v23, v22, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v25, v24, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v27, v26, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v29, v28, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v31, v30, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v33, v32, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v35, v34, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v37, v36, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v39, v38, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v41, v40, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #3 (d1,d0,vc1,vc0) = */
/*    (0,0,12,0:vw1); (0,0,12,1:vw1); (0,0,12,2:vw1); (0,0,12,3:vw1); (0,0,13,0:vw1); (0,0,13,1:vw1); (0,0,13,2:vw1); (0,0,13,3:vw1); (0,0,14,0:vw1); (0,0,14,1:vw1); (0,0,14,2:vw1); (0,0,14,3:vw1); (0,0,15,0:vw1); (0,0,15,1:vw1); (0,0,15,2:vw1); (0,0,15,3:vw1) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
/*
 * Address setup for the 16 elements of this batch: rows vc1=12..15, columns vc0=0..3.
 * No loads are issued in this block; it only produces one store offset per element.
 * Registers as used below:
 *   v0  = coord0 for vc0=0 (only read here); v4 = scratch holding v0 + vc0
 *   v1  = coord1, advanced by 1 at the start of each row
 *   v2  = cinRowPtr, advanced by StrideC1J per row (updated but not read in this block)
 *   v3  = coutRowPtrD, advanced by StrideD1J per row
 *   v42 = BufferOOB, substituted for the offset of every out-of-range element
 *   s[28:29], s[32:33] = per-lane bounds masks, recomputed for every element
 * Results: offsets in v10, v12, ..., v40 (one even-numbered VGPR per element).
 * NOTE(review): every v_add_co_u32 also writes a carry-out to vcc that nothing in this block reads.
 * NOTE(review): the coord1 < SizeJ compare is re-issued per element although v1 only changes once per row.
 */
v_mov_b32 v42, BufferOOB                           // fallback offset for out-of-range lanes (value defined outside this chunk)
/* (d1,vc1,d0,vc0)=(0,12,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v10, v3, v0, 0x2                    // byte offset into D = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v10, v42, v10, s[32:33]              // keep offset when in bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,12,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v12, v3, v4, 0x2                    // byte offset into D = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v12, v42, v12, s[32:33]              // keep offset when in bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,12,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v14, v3, v4, 0x2                    // byte offset into D = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v14, v42, v14, s[32:33]              // keep offset when in bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,12,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v16, v3, v4, 0x2                    // byte offset into D = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v16, v42, v16, s[32:33]              // keep offset when in bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,13,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v18, v3, v0, 0x2                    // byte offset into D = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v18, v42, v18, s[32:33]              // keep offset when in bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,13,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v20, v3, v4, 0x2                    // byte offset into D = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v20, v42, v20, s[32:33]              // keep offset when in bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,13,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v22, v3, v4, 0x2                    // byte offset into D = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v22, v42, v22, s[32:33]              // keep offset when in bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,13,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v24, v3, v4, 0x2                    // byte offset into D = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v24, v42, v24, s[32:33]              // keep offset when in bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,14,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v26, v3, v0, 0x2                    // byte offset into D = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v26, v42, v26, s[32:33]              // keep offset when in bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,14,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v28, v3, v4, 0x2                    // byte offset into D = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v28, v42, v28, s[32:33]              // keep offset when in bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,14,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v30, v3, v4, 0x2                    // byte offset into D = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v30, v42, v30, s[32:33]              // keep offset when in bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,14,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v32, v3, v4, 0x2                    // byte offset into D = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v32, v42, v32, s[32:33]              // keep offset when in bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,15,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v34, v3, v0, 0x2                    // byte offset into D = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v34, v42, v34, s[32:33]              // keep offset when in bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,15,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v36, v3, v4, 0x2                    // byte offset into D = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v36, v42, v36, s[32:33]              // keep offset when in bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,15,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v38, v3, v4, 0x2                    // byte offset into D = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v38, v42, v38, s[32:33]              // keep offset when in bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,15,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v40, v3, v4, 0x2                    // byte offset into D = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v40, v42, v40, s[32:33]              // keep offset when in bounds, else BufferOOB (v42)
/*
 * Copy this batch's 16 results out of the accumulator file: acc192, acc196, ..., acc252
 * (every 4th AccVGPR) go to v[vgprValuC+11], +13, ..., +41 (odd offsets only).
 * NOTE(review): the stores after this block name v11, v13, ..., v41 directly, so this
 * presumably relies on vgprValuC == 0 - confirm against the symbol definition.
 * The vreg[N] number in each trailing comment appears to be the generator's running element index.
 */
v_accvgpr_read_b32 v[vgprValuC+11], acc192         // copy acc to vreg[48]
v_accvgpr_read_b32 v[vgprValuC+13], acc196         // copy acc to vreg[49]
v_accvgpr_read_b32 v[vgprValuC+15], acc200         // copy acc to vreg[50]
v_accvgpr_read_b32 v[vgprValuC+17], acc204         // copy acc to vreg[51]
v_accvgpr_read_b32 v[vgprValuC+19], acc208         // copy acc to vreg[52]
v_accvgpr_read_b32 v[vgprValuC+21], acc212         // copy acc to vreg[53]
v_accvgpr_read_b32 v[vgprValuC+23], acc216         // copy acc to vreg[54]
v_accvgpr_read_b32 v[vgprValuC+25], acc220         // copy acc to vreg[55]
v_accvgpr_read_b32 v[vgprValuC+27], acc224         // copy acc to vreg[56]
v_accvgpr_read_b32 v[vgprValuC+29], acc228         // copy acc to vreg[57]
v_accvgpr_read_b32 v[vgprValuC+31], acc232         // copy acc to vreg[58]
v_accvgpr_read_b32 v[vgprValuC+33], acc236         // copy acc to vreg[59]
v_accvgpr_read_b32 v[vgprValuC+35], acc240         // copy acc to vreg[60]
v_accvgpr_read_b32 v[vgprValuC+37], acc244         // copy acc to vreg[61]
v_accvgpr_read_b32 v[vgprValuC+39], acc248         // copy acc to vreg[62]
v_accvgpr_read_b32 v[vgprValuC+41], acc252         // copy acc to vreg[63]

/* rC *= alpha batchElements=[(0, 0, 12, 0), (0, 0, 12, 1), (0, 0, 12, 2), (0, 0, 12, 3), (0, 0, 13, 0), (0, 0, 13, 1), (0, 0, 13, 2), (0, 0, 13, 3), (0, 0, 14, 0), (0, 0, 14, 1), (0, 0, 14, 2), (0, 0, 14, 3), (0, 0, 15, 0), (0, 0, 15, 1), (0, 0, 15, 2), (0, 0, 15, 3)] */

/* apply mask, calc new C and issue writes */
/*
 * Issue the 16 stores of this batch through the D buffer descriptor s[sgprSrdD:sgprSrdD+3].
 * Each store writes one dword: data comes from an odd VGPR (v11..v41) and the per-lane
 * byte offset ("offen") from the even VGPR just below it (v10..v40).
 * No masking or arithmetic on the data happens in this block; out-of-range elements were
 * already redirected to BufferOOB in the offset registers.
 * NOTE(review): v7, v8 and v9 are loaded with bfloat16 packing constants but are not read
 * by any instruction in this block - confirm whether a later consumer exists before removing.
 */
v_mov_b32 v7, 0xffff0000                           // mask for pack two bfloat16 element to 32bit
v_mov_b32 v8, 0x7fff0000                           // fp32 Nan
v_mov_b32 v9, 0x7fff                               // rounding bias for bfloat16
buffer_store_dword v11, v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v13, v12, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v15, v14, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v17, v16, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v19, v18, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v21, v20, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v23, v22, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v25, v24, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v27, v26, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v29, v28, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v31, v30, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v33, v32, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v35, v34, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v37, v36, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v39, v38, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v41, v40, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_nop 0                                            // 1 wait state before a following inst overwrites vgprs held by the previous stores (generator text said dwordx4; the stores here are dword)
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #4 (d1,d0,vc1,vc0) = */
/*    (0,0,16,0:vw1); (0,0,16,1:vw1); (0,0,16,2:vw1); (0,0,16,3:vw1); (0,0,17,0:vw1); (0,0,17,1:vw1); (0,0,17,2:vw1); (0,0,17,3:vw1); (0,0,18,0:vw1); (0,0,18,1:vw1); (0,0,18,2:vw1); (0,0,18,3:vw1); (0,0,19,0:vw1); (0,0,19,1:vw1); (0,0,19,2:vw1); (0,0,19,3:vw1) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
/*
 * Address setup for the 16 elements of this batch: rows vc1=16..19, columns vc0=0..3.
 * No loads are issued in this block; it only produces one store offset per element.
 * Registers as used below:
 *   v0  = coord0 for vc0=0 (only read here); v4 = scratch holding v0 + vc0
 *   v1  = coord1, advanced by 1 at the start of each row
 *   v2  = cinRowPtr, advanced by StrideC1J per row (updated but not read in this block)
 *   v3  = coutRowPtrD, advanced by StrideD1J per row
 *   v42 = BufferOOB, substituted for the offset of every out-of-range element
 *   s[28:29], s[32:33] = per-lane bounds masks, recomputed for every element
 * Results: offsets in v10, v12, ..., v40 (one even-numbered VGPR per element).
 * NOTE(review): every v_add_co_u32 also writes a carry-out to vcc that nothing in this block reads.
 * NOTE(review): the coord1 < SizeJ compare is re-issued per element although v1 only changes once per row.
 */
v_mov_b32 v42, BufferOOB                           // fallback offset for out-of-range lanes (value defined outside this chunk)
/* (d1,vc1,d0,vc0)=(0,16,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v10, v3, v0, 0x2                    // byte offset into D = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v10, v42, v10, s[32:33]              // keep offset when in bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,16,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v12, v3, v4, 0x2                    // byte offset into D = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v12, v42, v12, s[32:33]              // keep offset when in bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,16,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v14, v3, v4, 0x2                    // byte offset into D = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v14, v42, v14, s[32:33]              // keep offset when in bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,16,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v16, v3, v4, 0x2                    // byte offset into D = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v16, v42, v16, s[32:33]              // keep offset when in bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,17,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v18, v3, v0, 0x2                    // byte offset into D = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v18, v42, v18, s[32:33]              // keep offset when in bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,17,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v20, v3, v4, 0x2                    // byte offset into D = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v20, v42, v20, s[32:33]              // keep offset when in bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,17,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v22, v3, v4, 0x2                    // byte offset into D = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v22, v42, v22, s[32:33]              // keep offset when in bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,17,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v24, v3, v4, 0x2                    // byte offset into D = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v24, v42, v24, s[32:33]              // keep offset when in bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,18,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v26, v3, v0, 0x2                    // byte offset into D = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v26, v42, v26, s[32:33]              // keep offset when in bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,18,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v28, v3, v4, 0x2                    // byte offset into D = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v28, v42, v28, s[32:33]              // keep offset when in bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,18,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v30, v3, v4, 0x2                    // byte offset into D = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v30, v42, v30, s[32:33]              // keep offset when in bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,18,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v32, v3, v4, 0x2                    // byte offset into D = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v32, v42, v32, s[32:33]              // keep offset when in bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,19,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v34, v3, v0, 0x2                    // byte offset into D = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v34, v42, v34, s[32:33]              // keep offset when in bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,19,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v36, v3, v4, 0x2                    // byte offset into D = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v36, v42, v36, s[32:33]              // keep offset when in bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,19,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v38, v3, v4, 0x2                    // byte offset into D = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v38, v42, v38, s[32:33]              // keep offset when in bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,19,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v40, v3, v4, 0x2                    // byte offset into D = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v40, v42, v40, s[32:33]              // keep offset when in bounds, else BufferOOB (v42)
/*
 * Copy this batch's 16 results out of the accumulator file: acc1, acc5, ..., acc61
 * (every 4th AccVGPR, starting at index 1) go to v[vgprValuC+11], +13, ..., +41 (odd offsets only).
 * NOTE(review): the stores after this block name v11, v13, ..., v41 directly, so this
 * presumably relies on vgprValuC == 0 - confirm against the symbol definition.
 * The vreg[N] number in each trailing comment appears to be the generator's running element index.
 */
v_accvgpr_read_b32 v[vgprValuC+11], acc1           // copy acc to vreg[64]
v_accvgpr_read_b32 v[vgprValuC+13], acc5           // copy acc to vreg[65]
v_accvgpr_read_b32 v[vgprValuC+15], acc9           // copy acc to vreg[66]
v_accvgpr_read_b32 v[vgprValuC+17], acc13          // copy acc to vreg[67]
v_accvgpr_read_b32 v[vgprValuC+19], acc17          // copy acc to vreg[68]
v_accvgpr_read_b32 v[vgprValuC+21], acc21          // copy acc to vreg[69]
v_accvgpr_read_b32 v[vgprValuC+23], acc25          // copy acc to vreg[70]
v_accvgpr_read_b32 v[vgprValuC+25], acc29          // copy acc to vreg[71]
v_accvgpr_read_b32 v[vgprValuC+27], acc33          // copy acc to vreg[72]
v_accvgpr_read_b32 v[vgprValuC+29], acc37          // copy acc to vreg[73]
v_accvgpr_read_b32 v[vgprValuC+31], acc41          // copy acc to vreg[74]
v_accvgpr_read_b32 v[vgprValuC+33], acc45          // copy acc to vreg[75]
v_accvgpr_read_b32 v[vgprValuC+35], acc49          // copy acc to vreg[76]
v_accvgpr_read_b32 v[vgprValuC+37], acc53          // copy acc to vreg[77]
v_accvgpr_read_b32 v[vgprValuC+39], acc57          // copy acc to vreg[78]
v_accvgpr_read_b32 v[vgprValuC+41], acc61          // copy acc to vreg[79]

/* rC *= alpha batchElements=[(0, 0, 16, 0), (0, 0, 16, 1), (0, 0, 16, 2), (0, 0, 16, 3), (0, 0, 17, 0), (0, 0, 17, 1), (0, 0, 17, 2), (0, 0, 17, 3), (0, 0, 18, 0), (0, 0, 18, 1), (0, 0, 18, 2), (0, 0, 18, 3), (0, 0, 19, 0), (0, 0, 19, 1), (0, 0, 19, 2), (0, 0, 19, 3)] */

/* apply mask, calc new C and issue writes */
/*
 * Issue the 16 stores of this batch through the D buffer descriptor s[sgprSrdD:sgprSrdD+3].
 * Each store writes one dword: data comes from an odd VGPR (v11..v41) and the per-lane
 * byte offset ("offen") from the even VGPR just below it (v10..v40).
 * No masking or arithmetic on the data happens in this block; out-of-range elements were
 * already redirected to BufferOOB in the offset registers.
 * NOTE(review): v7, v8 and v9 are loaded with bfloat16 packing constants but are not read
 * by any instruction in this block - confirm whether a later consumer exists before removing.
 */
v_mov_b32 v7, 0xffff0000                           // mask for pack two bfloat16 element to 32bit
v_mov_b32 v8, 0x7fff0000                           // fp32 Nan
v_mov_b32 v9, 0x7fff                               // rounding bias for bfloat16
buffer_store_dword v11, v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v13, v12, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v15, v14, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v17, v16, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v19, v18, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v21, v20, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v23, v22, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v25, v24, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v27, v26, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v29, v28, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v31, v30, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v33, v32, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v35, v34, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v37, v36, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v39, v38, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v41, v40, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_nop 0                                            // 1 wait state before a following inst overwrites vgprs held by the previous stores (generator text said dwordx4; the stores here are dword)
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #5 (d1,d0,vc1,vc0) = */
/*    (0,0,20,0:vw1); (0,0,20,1:vw1); (0,0,20,2:vw1); (0,0,20,3:vw1); (0,0,21,0:vw1); (0,0,21,1:vw1); (0,0,21,2:vw1); (0,0,21,3:vw1); (0,0,22,0:vw1); (0,0,22,1:vw1); (0,0,22,2:vw1); (0,0,22,3:vw1); (0,0,23,0:vw1); (0,0,23,1:vw1); (0,0,23,2:vw1); (0,0,23,3:vw1) */
/******************************************/
/*
 * What this batch does (as visible in the instructions below):
 *   - Handles 16 output elements: 4 rows (vc1=20..23) x 4 columns (vc0=0..3).
 *   - For each row: coord1 (v1) is incremented by 1 and the row pointers
 *     v2 (cinRowPtr) and v3 (coutRowPtrD) are advanced by StrideC1J / StrideD1J.
 *   - For each element: a per-lane in-bounds mask (coord0 < SizeI && coord1 < SizeJ)
 *     is built in s[32:33]; the store offset (v3 + coord0) << 2 is computed and
 *     replaced by BufferOOB (v42) for lanes that are out of bounds.
 *   - 16 accumulator registers (acc65, acc69, ..., acc125) are copied to VGPRs
 *     and written with buffer_store_dword through the D resource descriptor.
 *
 * Registers used in this batch:
 *   v0        coord0 (read only here)
 *   v1        coord1 (incremented once per row)
 *   v2        cinRowPtr (updated per row, not read in this batch)
 *   v3        coutRowPtrD (updated per row)
 *   v4        temporary: coord0 + vc0
 *   v42       BufferOOB offset used for masked-off lanes
 *   v10..v40  (even) store offsets; v11..v41 (odd) store data
 *   s[28:29]  in0 mask / s[32:33] in1 mask, then in0 && in1
 *
 * NOTE(review): the accumulator reads target v[vgprValuC+N] while the stores use
 * literal vN; this presumably relies on vgprValuC == 0 — confirm against the
 * symbol definition elsewhere in the file.
 */

/* calc coords, apply mask, and issue loads (if necessary) */
/* No load instructions are emitted in this batch; only offsets and masks are computed. */
v_mov_b32 v42, BufferOOB                           // v42 = BufferOOB: offset substituted for out-of-bounds lanes
/* (d1,vc1,d0,vc0)=(0,20,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 (v1) += 1: step to row vc1=20; the vcc carry-out is not consumed in this batch

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC: cinRowPtr (v2) += StrideC1J; v2 is not read in this batch
v_add_u32 v3, v3, s[sgprStrideD1J]                 // coutRowPtrD (v3) += StrideD1J: move D row pointer to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // s[28:29] = coord0 < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1 (per-lane in-bounds mask)
v_add_lshl_u32 v10, v3, v0, 0x2                    // v10 = (coutRowPtrD + coord0) << 2: offset for the D store
v_cndmask_b32 v10, v42, v10, s[32:33]              // keep v10 for in-bounds lanes, otherwise use BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,20,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // v4 = coord0 + 1 (column vc0=1)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 1) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1
v_add_lshl_u32 v12, v3, v4, 0x2                    // v12 = (coutRowPtrD + coord0 + 1) << 2: offset for the D store
v_cndmask_b32 v12, v42, v12, s[32:33]              // keep v12 for in-bounds lanes, otherwise use BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,20,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // v4 = coord0 + 2 (column vc0=2)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 2) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1
v_add_lshl_u32 v14, v3, v4, 0x2                    // v14 = (coutRowPtrD + coord0 + 2) << 2: offset for the D store
v_cndmask_b32 v14, v42, v14, s[32:33]              // keep v14 for in-bounds lanes, otherwise use BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,20,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // v4 = coord0 + 3 (column vc0=3)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 3) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1
v_add_lshl_u32 v16, v3, v4, 0x2                    // v16 = (coutRowPtrD + coord0 + 3) << 2: offset for the D store
v_cndmask_b32 v16, v42, v16, s[32:33]              // keep v16 for in-bounds lanes, otherwise use BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,21,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 (v1) += 1: step to row vc1=21

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC: cinRowPtr (v2) += StrideC1J; v2 is not read in this batch
v_add_u32 v3, v3, s[sgprStrideD1J]                 // coutRowPtrD (v3) += StrideD1J: move D row pointer to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // s[28:29] = coord0 < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1
v_add_lshl_u32 v18, v3, v0, 0x2                    // v18 = (coutRowPtrD + coord0) << 2: offset for the D store
v_cndmask_b32 v18, v42, v18, s[32:33]              // keep v18 for in-bounds lanes, otherwise use BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,21,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // v4 = coord0 + 1 (column vc0=1)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 1) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1
v_add_lshl_u32 v20, v3, v4, 0x2                    // v20 = (coutRowPtrD + coord0 + 1) << 2: offset for the D store
v_cndmask_b32 v20, v42, v20, s[32:33]              // keep v20 for in-bounds lanes, otherwise use BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,21,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // v4 = coord0 + 2 (column vc0=2)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 2) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1
v_add_lshl_u32 v22, v3, v4, 0x2                    // v22 = (coutRowPtrD + coord0 + 2) << 2: offset for the D store
v_cndmask_b32 v22, v42, v22, s[32:33]              // keep v22 for in-bounds lanes, otherwise use BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,21,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // v4 = coord0 + 3 (column vc0=3)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 3) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1
v_add_lshl_u32 v24, v3, v4, 0x2                    // v24 = (coutRowPtrD + coord0 + 3) << 2: offset for the D store
v_cndmask_b32 v24, v42, v24, s[32:33]              // keep v24 for in-bounds lanes, otherwise use BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,22,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 (v1) += 1: step to row vc1=22

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC: cinRowPtr (v2) += StrideC1J; v2 is not read in this batch
v_add_u32 v3, v3, s[sgprStrideD1J]                 // coutRowPtrD (v3) += StrideD1J: move D row pointer to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // s[28:29] = coord0 < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1
v_add_lshl_u32 v26, v3, v0, 0x2                    // v26 = (coutRowPtrD + coord0) << 2: offset for the D store
v_cndmask_b32 v26, v42, v26, s[32:33]              // keep v26 for in-bounds lanes, otherwise use BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,22,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // v4 = coord0 + 1 (column vc0=1)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 1) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1
v_add_lshl_u32 v28, v3, v4, 0x2                    // v28 = (coutRowPtrD + coord0 + 1) << 2: offset for the D store
v_cndmask_b32 v28, v42, v28, s[32:33]              // keep v28 for in-bounds lanes, otherwise use BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,22,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // v4 = coord0 + 2 (column vc0=2)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 2) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1
v_add_lshl_u32 v30, v3, v4, 0x2                    // v30 = (coutRowPtrD + coord0 + 2) << 2: offset for the D store
v_cndmask_b32 v30, v42, v30, s[32:33]              // keep v30 for in-bounds lanes, otherwise use BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,22,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // v4 = coord0 + 3 (column vc0=3)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 3) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1
v_add_lshl_u32 v32, v3, v4, 0x2                    // v32 = (coutRowPtrD + coord0 + 3) << 2: offset for the D store
v_cndmask_b32 v32, v42, v32, s[32:33]              // keep v32 for in-bounds lanes, otherwise use BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,23,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 (v1) += 1: step to row vc1=23

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC: cinRowPtr (v2) += StrideC1J; v2 is not read in this batch
v_add_u32 v3, v3, s[sgprStrideD1J]                 // coutRowPtrD (v3) += StrideD1J: move D row pointer to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // s[28:29] = coord0 < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1
v_add_lshl_u32 v34, v3, v0, 0x2                    // v34 = (coutRowPtrD + coord0) << 2: offset for the D store
v_cndmask_b32 v34, v42, v34, s[32:33]              // keep v34 for in-bounds lanes, otherwise use BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,23,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // v4 = coord0 + 1 (column vc0=1)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 1) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1
v_add_lshl_u32 v36, v3, v4, 0x2                    // v36 = (coutRowPtrD + coord0 + 1) << 2: offset for the D store
v_cndmask_b32 v36, v42, v36, s[32:33]              // keep v36 for in-bounds lanes, otherwise use BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,23,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // v4 = coord0 + 2 (column vc0=2)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 2) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1
v_add_lshl_u32 v38, v3, v4, 0x2                    // v38 = (coutRowPtrD + coord0 + 2) << 2: offset for the D store
v_cndmask_b32 v38, v42, v38, s[32:33]              // keep v38 for in-bounds lanes, otherwise use BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,23,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // v4 = coord0 + 3 (column vc0=3)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 3) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1
v_add_lshl_u32 v40, v3, v4, 0x2                    // v40 = (coutRowPtrD + coord0 + 3) << 2: offset for the D store
v_cndmask_b32 v40, v42, v40, s[32:33]              // keep v40 for in-bounds lanes, otherwise use BufferOOB (v42)
/* Copy the 16 accumulator values for this batch (acc65..acc125, step 4) into VGPRs. */
v_accvgpr_read_b32 v[vgprValuC+11], acc65          // copy acc65 to vreg[80]
v_accvgpr_read_b32 v[vgprValuC+13], acc69          // copy acc69 to vreg[81]
v_accvgpr_read_b32 v[vgprValuC+15], acc73          // copy acc73 to vreg[82]
v_accvgpr_read_b32 v[vgprValuC+17], acc77          // copy acc77 to vreg[83]
v_accvgpr_read_b32 v[vgprValuC+19], acc81          // copy acc81 to vreg[84]
v_accvgpr_read_b32 v[vgprValuC+21], acc85          // copy acc85 to vreg[85]
v_accvgpr_read_b32 v[vgprValuC+23], acc89          // copy acc89 to vreg[86]
v_accvgpr_read_b32 v[vgprValuC+25], acc93          // copy acc93 to vreg[87]
v_accvgpr_read_b32 v[vgprValuC+27], acc97          // copy acc97 to vreg[88]
v_accvgpr_read_b32 v[vgprValuC+29], acc101         // copy acc101 to vreg[89]
v_accvgpr_read_b32 v[vgprValuC+31], acc105         // copy acc105 to vreg[90]
v_accvgpr_read_b32 v[vgprValuC+33], acc109         // copy acc109 to vreg[91]
v_accvgpr_read_b32 v[vgprValuC+35], acc113         // copy acc113 to vreg[92]
v_accvgpr_read_b32 v[vgprValuC+37], acc117         // copy acc117 to vreg[93]
v_accvgpr_read_b32 v[vgprValuC+39], acc121         // copy acc121 to vreg[94]
v_accvgpr_read_b32 v[vgprValuC+41], acc125         // copy acc125 to vreg[95]

/* rC *= alpha batchElements=[(0, 0, 20, 0), (0, 0, 20, 1), (0, 0, 20, 2), (0, 0, 20, 3), (0, 0, 21, 0), (0, 0, 21, 1), (0, 0, 21, 2), (0, 0, 21, 3), (0, 0, 22, 0), (0, 0, 22, 1), (0, 0, 22, 2), (0, 0, 22, 3), (0, 0, 23, 0), (0, 0, 23, 1), (0, 0, 23, 2), (0, 0, 23, 3)] */
/* No scaling instructions are emitted for this step in this batch; the values read from the accumulators are stored unchanged. */

/* apply mask, calc new C and issue writes */
v_mov_b32 v7, 0xffff0000                           // mask used to pack two bfloat16 elements into 32 bits (v7 is not read in this batch)
v_mov_b32 v8, 0x7fff0000                           // fp32 NaN pattern (v8 is not read in this batch)
v_mov_b32 v9, 0x7fff                               // rounding bias for bfloat16 (v9 is not read in this batch)
buffer_store_dword v11, v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: data v11 at offset v10, element (vc1,vc0)=(20,0)
buffer_store_dword v13, v12, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: data v13 at offset v12, element (20,1)
buffer_store_dword v15, v14, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: data v15 at offset v14, element (20,2)
buffer_store_dword v17, v16, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: data v17 at offset v16, element (20,3)
buffer_store_dword v19, v18, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: data v19 at offset v18, element (21,0)
buffer_store_dword v21, v20, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: data v21 at offset v20, element (21,1)
buffer_store_dword v23, v22, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: data v23 at offset v22, element (21,2)
buffer_store_dword v25, v24, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: data v25 at offset v24, element (21,3)
buffer_store_dword v27, v26, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: data v27 at offset v26, element (22,0)
buffer_store_dword v29, v28, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: data v29 at offset v28, element (22,1)
buffer_store_dword v31, v30, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: data v31 at offset v30, element (22,2)
buffer_store_dword v33, v32, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: data v33 at offset v32, element (22,3)
buffer_store_dword v35, v34, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: data v35 at offset v34, element (23,0)
buffer_store_dword v37, v36, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: data v37 at offset v36, element (23,1)
buffer_store_dword v39, v38, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: data v39 at offset v38, element (23,2)
buffer_store_dword v41, v40, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: data v41 at offset v40, element (23,3)
s_nop 0                                            // 1 wait state before later instructions overwrite VGPRs held by the stores above. NOTE(review): the original note said "dwordx4" but these are buffer_store_dword — confirm the nop is still required
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #6 (d1,d0,vc1,vc0) = */
/*    (0,0,24,0:vw1); (0,0,24,1:vw1); (0,0,24,2:vw1); (0,0,24,3:vw1); (0,0,25,0:vw1); (0,0,25,1:vw1); (0,0,25,2:vw1); (0,0,25,3:vw1); (0,0,26,0:vw1); (0,0,26,1:vw1); (0,0,26,2:vw1); (0,0,26,3:vw1); (0,0,27,0:vw1); (0,0,27,1:vw1); (0,0,27,2:vw1); (0,0,27,3:vw1) */
/******************************************/
/*
 * What this batch does (as visible in the instructions below):
 *   - Handles 16 output elements: 4 rows (vc1=24..27) x 4 columns (vc0=0..3).
 *   - For each row: coord1 (v1) is incremented by 1 and the row pointers
 *     v2 (cinRowPtr) and v3 (coutRowPtrD) are advanced by StrideC1J / StrideD1J.
 *   - For each element: a per-lane in-bounds mask (coord0 < SizeI && coord1 < SizeJ)
 *     is built in s[32:33]; the store offset (v3 + coord0) << 2 is computed and
 *     replaced by BufferOOB (v42) for lanes that are out of bounds.
 *   - 16 accumulator registers (acc129, acc133, ..., acc189) are copied to VGPRs
 *     and written with buffer_store_dword through the D resource descriptor.
 *
 * Registers used in this batch:
 *   v0        coord0 (read only here)
 *   v1        coord1 (incremented once per row)
 *   v2        cinRowPtr (updated per row, not read in this batch)
 *   v3        coutRowPtrD (updated per row)
 *   v4        temporary: coord0 + vc0
 *   v42       BufferOOB offset used for masked-off lanes
 *   v10..v40  (even) store offsets; v11..v41 (odd) store data
 *   s[28:29]  in0 mask / s[32:33] in1 mask, then in0 && in1
 *
 * NOTE(review): the accumulator reads target v[vgprValuC+N] while the stores use
 * literal vN; this presumably relies on vgprValuC == 0 — confirm against the
 * symbol definition elsewhere in the file.
 */

/* calc coords, apply mask, and issue loads (if necessary) */
/* No load instructions are emitted in this batch; only offsets and masks are computed. */
v_mov_b32 v42, BufferOOB                           // v42 = BufferOOB: offset substituted for out-of-bounds lanes
/* (d1,vc1,d0,vc0)=(0,24,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 (v1) += 1: step to row vc1=24; the vcc carry-out is not consumed in this batch

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC: cinRowPtr (v2) += StrideC1J; v2 is not read in this batch
v_add_u32 v3, v3, s[sgprStrideD1J]                 // coutRowPtrD (v3) += StrideD1J: move D row pointer to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // s[28:29] = coord0 < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1 (per-lane in-bounds mask)
v_add_lshl_u32 v10, v3, v0, 0x2                    // v10 = (coutRowPtrD + coord0) << 2: offset for the D store
v_cndmask_b32 v10, v42, v10, s[32:33]              // keep v10 for in-bounds lanes, otherwise use BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,24,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // v4 = coord0 + 1 (column vc0=1)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 1) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1
v_add_lshl_u32 v12, v3, v4, 0x2                    // v12 = (coutRowPtrD + coord0 + 1) << 2: offset for the D store
v_cndmask_b32 v12, v42, v12, s[32:33]              // keep v12 for in-bounds lanes, otherwise use BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,24,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // v4 = coord0 + 2 (column vc0=2)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 2) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1
v_add_lshl_u32 v14, v3, v4, 0x2                    // v14 = (coutRowPtrD + coord0 + 2) << 2: offset for the D store
v_cndmask_b32 v14, v42, v14, s[32:33]              // keep v14 for in-bounds lanes, otherwise use BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,24,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // v4 = coord0 + 3 (column vc0=3)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 3) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1
v_add_lshl_u32 v16, v3, v4, 0x2                    // v16 = (coutRowPtrD + coord0 + 3) << 2: offset for the D store
v_cndmask_b32 v16, v42, v16, s[32:33]              // keep v16 for in-bounds lanes, otherwise use BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,25,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 (v1) += 1: step to row vc1=25

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC: cinRowPtr (v2) += StrideC1J; v2 is not read in this batch
v_add_u32 v3, v3, s[sgprStrideD1J]                 // coutRowPtrD (v3) += StrideD1J: move D row pointer to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // s[28:29] = coord0 < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1
v_add_lshl_u32 v18, v3, v0, 0x2                    // v18 = (coutRowPtrD + coord0) << 2: offset for the D store
v_cndmask_b32 v18, v42, v18, s[32:33]              // keep v18 for in-bounds lanes, otherwise use BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,25,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // v4 = coord0 + 1 (column vc0=1)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 1) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1
v_add_lshl_u32 v20, v3, v4, 0x2                    // v20 = (coutRowPtrD + coord0 + 1) << 2: offset for the D store
v_cndmask_b32 v20, v42, v20, s[32:33]              // keep v20 for in-bounds lanes, otherwise use BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,25,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // v4 = coord0 + 2 (column vc0=2)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 2) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1
v_add_lshl_u32 v22, v3, v4, 0x2                    // v22 = (coutRowPtrD + coord0 + 2) << 2: offset for the D store
v_cndmask_b32 v22, v42, v22, s[32:33]              // keep v22 for in-bounds lanes, otherwise use BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,25,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // v4 = coord0 + 3 (column vc0=3)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 3) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1
v_add_lshl_u32 v24, v3, v4, 0x2                    // v24 = (coutRowPtrD + coord0 + 3) << 2: offset for the D store
v_cndmask_b32 v24, v42, v24, s[32:33]              // keep v24 for in-bounds lanes, otherwise use BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,26,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 (v1) += 1: step to row vc1=26

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC: cinRowPtr (v2) += StrideC1J; v2 is not read in this batch
v_add_u32 v3, v3, s[sgprStrideD1J]                 // coutRowPtrD (v3) += StrideD1J: move D row pointer to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // s[28:29] = coord0 < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1
v_add_lshl_u32 v26, v3, v0, 0x2                    // v26 = (coutRowPtrD + coord0) << 2: offset for the D store
v_cndmask_b32 v26, v42, v26, s[32:33]              // keep v26 for in-bounds lanes, otherwise use BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,26,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // v4 = coord0 + 1 (column vc0=1)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 1) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1
v_add_lshl_u32 v28, v3, v4, 0x2                    // v28 = (coutRowPtrD + coord0 + 1) << 2: offset for the D store
v_cndmask_b32 v28, v42, v28, s[32:33]              // keep v28 for in-bounds lanes, otherwise use BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,26,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // v4 = coord0 + 2 (column vc0=2)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 2) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1
v_add_lshl_u32 v30, v3, v4, 0x2                    // v30 = (coutRowPtrD + coord0 + 2) << 2: offset for the D store
v_cndmask_b32 v30, v42, v30, s[32:33]              // keep v30 for in-bounds lanes, otherwise use BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,26,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // v4 = coord0 + 3 (column vc0=3)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 3) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1
v_add_lshl_u32 v32, v3, v4, 0x2                    // v32 = (coutRowPtrD + coord0 + 3) << 2: offset for the D store
v_cndmask_b32 v32, v42, v32, s[32:33]              // keep v32 for in-bounds lanes, otherwise use BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,27,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 (v1) += 1: step to row vc1=27

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC: cinRowPtr (v2) += StrideC1J; v2 is not read in this batch
v_add_u32 v3, v3, s[sgprStrideD1J]                 // coutRowPtrD (v3) += StrideD1J: move D row pointer to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // s[28:29] = coord0 < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1
v_add_lshl_u32 v34, v3, v0, 0x2                    // v34 = (coutRowPtrD + coord0) << 2: offset for the D store
v_cndmask_b32 v34, v42, v34, s[32:33]              // keep v34 for in-bounds lanes, otherwise use BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,27,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // v4 = coord0 + 1 (column vc0=1)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 1) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1
v_add_lshl_u32 v36, v3, v4, 0x2                    // v36 = (coutRowPtrD + coord0 + 1) << 2: offset for the D store
v_cndmask_b32 v36, v42, v36, s[32:33]              // keep v36 for in-bounds lanes, otherwise use BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,27,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // v4 = coord0 + 2 (column vc0=2)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 2) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1
v_add_lshl_u32 v38, v3, v4, 0x2                    // v38 = (coutRowPtrD + coord0 + 2) << 2: offset for the D store
v_cndmask_b32 v38, v42, v38, s[32:33]              // keep v38 for in-bounds lanes, otherwise use BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,27,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // v4 = coord0 + 3 (column vc0=3)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 3) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1
v_add_lshl_u32 v40, v3, v4, 0x2                    // v40 = (coutRowPtrD + coord0 + 3) << 2: offset for the D store
v_cndmask_b32 v40, v42, v40, s[32:33]              // keep v40 for in-bounds lanes, otherwise use BufferOOB (v42)
/* Copy the 16 accumulator values for this batch (acc129..acc189, step 4) into VGPRs. */
v_accvgpr_read_b32 v[vgprValuC+11], acc129         // copy acc129 to vreg[96]
v_accvgpr_read_b32 v[vgprValuC+13], acc133         // copy acc133 to vreg[97]
v_accvgpr_read_b32 v[vgprValuC+15], acc137         // copy acc137 to vreg[98]
v_accvgpr_read_b32 v[vgprValuC+17], acc141         // copy acc141 to vreg[99]
v_accvgpr_read_b32 v[vgprValuC+19], acc145         // copy acc145 to vreg[100]
v_accvgpr_read_b32 v[vgprValuC+21], acc149         // copy acc149 to vreg[101]
v_accvgpr_read_b32 v[vgprValuC+23], acc153         // copy acc153 to vreg[102]
v_accvgpr_read_b32 v[vgprValuC+25], acc157         // copy acc157 to vreg[103]
v_accvgpr_read_b32 v[vgprValuC+27], acc161         // copy acc161 to vreg[104]
v_accvgpr_read_b32 v[vgprValuC+29], acc165         // copy acc165 to vreg[105]
v_accvgpr_read_b32 v[vgprValuC+31], acc169         // copy acc169 to vreg[106]
v_accvgpr_read_b32 v[vgprValuC+33], acc173         // copy acc173 to vreg[107]
v_accvgpr_read_b32 v[vgprValuC+35], acc177         // copy acc177 to vreg[108]
v_accvgpr_read_b32 v[vgprValuC+37], acc181         // copy acc181 to vreg[109]
v_accvgpr_read_b32 v[vgprValuC+39], acc185         // copy acc185 to vreg[110]
v_accvgpr_read_b32 v[vgprValuC+41], acc189         // copy acc189 to vreg[111]

/* rC *= alpha batchElements=[(0, 0, 24, 0), (0, 0, 24, 1), (0, 0, 24, 2), (0, 0, 24, 3), (0, 0, 25, 0), (0, 0, 25, 1), (0, 0, 25, 2), (0, 0, 25, 3), (0, 0, 26, 0), (0, 0, 26, 1), (0, 0, 26, 2), (0, 0, 26, 3), (0, 0, 27, 0), (0, 0, 27, 1), (0, 0, 27, 2), (0, 0, 27, 3)] */
/* No scaling instructions are emitted for this step in this batch; the values read from the accumulators are stored unchanged. */

/* apply mask, calc new C and issue writes */
v_mov_b32 v7, 0xffff0000                           // mask used to pack two bfloat16 elements into 32 bits (v7 is not read in this batch)
v_mov_b32 v8, 0x7fff0000                           // fp32 NaN pattern (v8 is not read in this batch)
v_mov_b32 v9, 0x7fff                               // rounding bias for bfloat16 (v9 is not read in this batch)
buffer_store_dword v11, v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: data v11 at offset v10, element (vc1,vc0)=(24,0)
buffer_store_dword v13, v12, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: data v13 at offset v12, element (24,1)
buffer_store_dword v15, v14, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: data v15 at offset v14, element (24,2)
buffer_store_dword v17, v16, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: data v17 at offset v16, element (24,3)
buffer_store_dword v19, v18, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: data v19 at offset v18, element (25,0)
buffer_store_dword v21, v20, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: data v21 at offset v20, element (25,1)
buffer_store_dword v23, v22, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: data v23 at offset v22, element (25,2)
buffer_store_dword v25, v24, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: data v25 at offset v24, element (25,3)
buffer_store_dword v27, v26, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: data v27 at offset v26, element (26,0)
buffer_store_dword v29, v28, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: data v29 at offset v28, element (26,1)
buffer_store_dword v31, v30, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: data v31 at offset v30, element (26,2)
buffer_store_dword v33, v32, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: data v33 at offset v32, element (26,3)
buffer_store_dword v35, v34, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: data v35 at offset v34, element (27,0)
buffer_store_dword v37, v36, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: data v37 at offset v36, element (27,1)
buffer_store_dword v39, v38, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: data v39 at offset v38, element (27,2)
buffer_store_dword v41, v40, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: data v41 at offset v40, element (27,3)
s_nop 0                                            // 1 wait state before later instructions overwrite VGPRs held by the stores above. NOTE(review): the original note said "dwordx4" but these are buffer_store_dword — confirm the nop is still required
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #7 (d1,d0,vc1,vc0) = */
/*    (0,0,28,0:vw1); (0,0,28,1:vw1); (0,0,28,2:vw1); (0,0,28,3:vw1); (0,0,29,0:vw1); (0,0,29,1:vw1); (0,0,29,2:vw1); (0,0,29,3:vw1); (0,0,30,0:vw1); (0,0,30,1:vw1); (0,0,30,2:vw1); (0,0,30,3:vw1); (0,0,31,0:vw1); (0,0,31,1:vw1); (0,0,31,2:vw1); (0,0,31,3:vw1) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
v_mov_b32 v42, BufferOOB
/* (d1,vc1,d0,vc0)=(0,28,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v10, v3, v0, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v10, v42, v10, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,28,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v12, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v12, v42, v12, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,28,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v14, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v14, v42, v14, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,28,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v16, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v16, v42, v16, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,29,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v18, v3, v0, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v18, v42, v18, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,29,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v20, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v20, v42, v20, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,29,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v22, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v22, v42, v22, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,29,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v24, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v24, v42, v24, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,30,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v26, v3, v0, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v26, v42, v26, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,30,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v28, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v28, v42, v28, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,30,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v30, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v30, v42, v30, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,30,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v32, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v32, v42, v32, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,31,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v34, v3, v0, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v34, v42, v34, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,31,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v36, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v36, v42, v36, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,31,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v38, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v38, v42, v38, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,31,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v40, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v40, v42, v40, s[32:33]              // LDD clip if OOB. offset
v_accvgpr_read_b32 v[vgprValuC+11], acc193         // copy acc to vreg[112]
v_accvgpr_read_b32 v[vgprValuC+13], acc197         // copy acc to vreg[113]
v_accvgpr_read_b32 v[vgprValuC+15], acc201         // copy acc to vreg[114]
v_accvgpr_read_b32 v[vgprValuC+17], acc205         // copy acc to vreg[115]
v_accvgpr_read_b32 v[vgprValuC+19], acc209         // copy acc to vreg[116]
v_accvgpr_read_b32 v[vgprValuC+21], acc213         // copy acc to vreg[117]
v_accvgpr_read_b32 v[vgprValuC+23], acc217         // copy acc to vreg[118]
v_accvgpr_read_b32 v[vgprValuC+25], acc221         // copy acc to vreg[119]
v_accvgpr_read_b32 v[vgprValuC+27], acc225         // copy acc to vreg[120]
v_accvgpr_read_b32 v[vgprValuC+29], acc229         // copy acc to vreg[121]
v_accvgpr_read_b32 v[vgprValuC+31], acc233         // copy acc to vreg[122]
v_accvgpr_read_b32 v[vgprValuC+33], acc237         // copy acc to vreg[123]
v_accvgpr_read_b32 v[vgprValuC+35], acc241         // copy acc to vreg[124]
v_accvgpr_read_b32 v[vgprValuC+37], acc245         // copy acc to vreg[125]
v_accvgpr_read_b32 v[vgprValuC+39], acc249         // copy acc to vreg[126]
v_accvgpr_read_b32 v[vgprValuC+41], acc253         // copy acc to vreg[127]

/* rC *= alpha batchElements=[(0, 0, 28, 0), (0, 0, 28, 1), (0, 0, 28, 2), (0, 0, 28, 3), (0, 0, 29, 0), (0, 0, 29, 1), (0, 0, 29, 2), (0, 0, 29, 3), (0, 0, 30, 0), (0, 0, 30, 1), (0, 0, 30, 2), (0, 0, 30, 3), (0, 0, 31, 0), (0, 0, 31, 1), (0, 0, 31, 2), (0, 0, 31, 3)] */

/* apply mask, calc new C and issue writes */
v_mov_b32 v7, 0xffff0000                           // mask for pack two bfloat16 element to 32bit
v_mov_b32 v8, 0x7fff0000                           // fp32 Nan
v_mov_b32 v9, 0x7fff                               // rounding bias for bfloat16
buffer_store_dword v11, v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v13, v12, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v15, v14, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v17, v16, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v19, v18, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v21, v20, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v23, v22, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v25, v24, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v27, v26, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v29, v28, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v31, v30, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v33, v32, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v35, v34, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v37, v36, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v39, v38, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v41, v40, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #8 (d1,d0,vc1,vc0) = */
/*    (0,0,32,0:vw1); (0,0,32,1:vw1); (0,0,32,2:vw1); (0,0,32,3:vw1); (0,0,33,0:vw1); (0,0,33,1:vw1); (0,0,33,2:vw1); (0,0,33,3:vw1); (0,0,34,0:vw1); (0,0,34,1:vw1); (0,0,34,2:vw1); (0,0,34,3:vw1); (0,0,35,0:vw1); (0,0,35,1:vw1); (0,0,35,2:vw1); (0,0,35,3:vw1) */
/******************************************/
/* Summary of this batch (16 elements = 4 rows x 4 columns), as written below: */
/*   v0  = base coord0, v4 = scratch for coord0 + vc0, v1 = coord1 (incremented once per row) */
/*   v3  = coutRowPtrD, advanced by StrideD1J per row; v2 = cinRowPtr, advanced by StrideC1J per row */
/*   v42 = BufferOOB, substituted for the offset of any element with coord0 >= SizeI or coord1 >= SizeJ */
/*   offsets land in v10,v12,...,v40; values are copied from acc2,acc6,...,acc62 into v[vgprValuC+11..41] */
/*   and stored as v11,v13,...,v41 (so vgprValuC is presumably 0 here - TODO confirm against the symbol definition) */
/*   s[28:29] and s[32:33] are reused as temporary lane masks for every element */

/* calc coords, apply mask, and issue loads (if necessary) */
v_mov_b32 v42, BufferOOB                           // v42 = BufferOOB, selected below for out-of-range elements
/* (d1,vc1,d0,vc0)=(0,32,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1: step to the next row (vc1=32)

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row (v2 is not read elsewhere in this batch)
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v10, v3, v0, 0x2                    // v10 = (coutRowPtrD + coord0) << 2: offset used by store D
v_cndmask_b32 v10, v42, v10, s[32:33]              // keep offset where in-bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,32,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // v4 = coord0 + 1 (vc0=1)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v12, v3, v4, 0x2                    // v12 = (coutRowPtrD + coord0) << 2: offset used by store D
v_cndmask_b32 v12, v42, v12, s[32:33]              // keep offset where in-bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,32,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // v4 = coord0 + 2 (vc0=2)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v14, v3, v4, 0x2                    // v14 = (coutRowPtrD + coord0) << 2: offset used by store D
v_cndmask_b32 v14, v42, v14, s[32:33]              // keep offset where in-bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,32,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // v4 = coord0 + 3 (vc0=3)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v16, v3, v4, 0x2                    // v16 = (coutRowPtrD + coord0) << 2: offset used by store D
v_cndmask_b32 v16, v42, v16, s[32:33]              // keep offset where in-bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,33,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1: step to the next row (vc1=33)

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row (v2 is not read elsewhere in this batch)
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v18, v3, v0, 0x2                    // v18 = (coutRowPtrD + coord0) << 2: offset used by store D
v_cndmask_b32 v18, v42, v18, s[32:33]              // keep offset where in-bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,33,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // v4 = coord0 + 1 (vc0=1)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v20, v3, v4, 0x2                    // v20 = (coutRowPtrD + coord0) << 2: offset used by store D
v_cndmask_b32 v20, v42, v20, s[32:33]              // keep offset where in-bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,33,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // v4 = coord0 + 2 (vc0=2)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v22, v3, v4, 0x2                    // v22 = (coutRowPtrD + coord0) << 2: offset used by store D
v_cndmask_b32 v22, v42, v22, s[32:33]              // keep offset where in-bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,33,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // v4 = coord0 + 3 (vc0=3)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v24, v3, v4, 0x2                    // v24 = (coutRowPtrD + coord0) << 2: offset used by store D
v_cndmask_b32 v24, v42, v24, s[32:33]              // keep offset where in-bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,34,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1: step to the next row (vc1=34)

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row (v2 is not read elsewhere in this batch)
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v26, v3, v0, 0x2                    // v26 = (coutRowPtrD + coord0) << 2: offset used by store D
v_cndmask_b32 v26, v42, v26, s[32:33]              // keep offset where in-bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,34,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // v4 = coord0 + 1 (vc0=1)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v28, v3, v4, 0x2                    // v28 = (coutRowPtrD + coord0) << 2: offset used by store D
v_cndmask_b32 v28, v42, v28, s[32:33]              // keep offset where in-bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,34,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // v4 = coord0 + 2 (vc0=2)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v30, v3, v4, 0x2                    // v30 = (coutRowPtrD + coord0) << 2: offset used by store D
v_cndmask_b32 v30, v42, v30, s[32:33]              // keep offset where in-bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,34,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // v4 = coord0 + 3 (vc0=3)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v32, v3, v4, 0x2                    // v32 = (coutRowPtrD + coord0) << 2: offset used by store D
v_cndmask_b32 v32, v42, v32, s[32:33]              // keep offset where in-bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,35,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1: step to the next row (vc1=35)

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row (v2 is not read elsewhere in this batch)
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v34, v3, v0, 0x2                    // v34 = (coutRowPtrD + coord0) << 2: offset used by store D
v_cndmask_b32 v34, v42, v34, s[32:33]              // keep offset where in-bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,35,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // v4 = coord0 + 1 (vc0=1)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v36, v3, v4, 0x2                    // v36 = (coutRowPtrD + coord0) << 2: offset used by store D
v_cndmask_b32 v36, v42, v36, s[32:33]              // keep offset where in-bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,35,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // v4 = coord0 + 2 (vc0=2)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v38, v3, v4, 0x2                    // v38 = (coutRowPtrD + coord0) << 2: offset used by store D
v_cndmask_b32 v38, v42, v38, s[32:33]              // keep offset where in-bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,35,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // v4 = coord0 + 3 (vc0=3)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v40, v3, v4, 0x2                    // v40 = (coutRowPtrD + coord0) << 2: offset used by store D
v_cndmask_b32 v40, v42, v40, s[32:33]              // keep offset where in-bounds, else BufferOOB (v42)
/* Copy this batch's 16 accumulators (acc2, acc6, ..., acc62: stride 4) into the odd VGPRs paired with the offsets above */
v_accvgpr_read_b32 v[vgprValuC+11], acc2           // copy acc to vreg[128]
v_accvgpr_read_b32 v[vgprValuC+13], acc6           // copy acc to vreg[129]
v_accvgpr_read_b32 v[vgprValuC+15], acc10          // copy acc to vreg[130]
v_accvgpr_read_b32 v[vgprValuC+17], acc14          // copy acc to vreg[131]
v_accvgpr_read_b32 v[vgprValuC+19], acc18          // copy acc to vreg[132]
v_accvgpr_read_b32 v[vgprValuC+21], acc22          // copy acc to vreg[133]
v_accvgpr_read_b32 v[vgprValuC+23], acc26          // copy acc to vreg[134]
v_accvgpr_read_b32 v[vgprValuC+25], acc30          // copy acc to vreg[135]
v_accvgpr_read_b32 v[vgprValuC+27], acc34          // copy acc to vreg[136]
v_accvgpr_read_b32 v[vgprValuC+29], acc38          // copy acc to vreg[137]
v_accvgpr_read_b32 v[vgprValuC+31], acc42          // copy acc to vreg[138]
v_accvgpr_read_b32 v[vgprValuC+33], acc46          // copy acc to vreg[139]
v_accvgpr_read_b32 v[vgprValuC+35], acc50          // copy acc to vreg[140]
v_accvgpr_read_b32 v[vgprValuC+37], acc54          // copy acc to vreg[141]
v_accvgpr_read_b32 v[vgprValuC+39], acc58          // copy acc to vreg[142]
v_accvgpr_read_b32 v[vgprValuC+41], acc62          // copy acc to vreg[143]

/* rC *= alpha batchElements=[(0, 0, 32, 0), (0, 0, 32, 1), (0, 0, 32, 2), (0, 0, 32, 3), (0, 0, 33, 0), (0, 0, 33, 1), (0, 0, 33, 2), (0, 0, 33, 3), (0, 0, 34, 0), (0, 0, 34, 1), (0, 0, 34, 2), (0, 0, 34, 3), (0, 0, 35, 0), (0, 0, 35, 1), (0, 0, 35, 2), (0, 0, 35, 3)] */
/* NOTE(review): no multiply instructions are emitted for this step in this batch; the values are stored as read from the accumulators */

/* apply mask, calc new C and issue writes */
/* NOTE(review): v7-v9 are written here but not read by any instruction in this batch - confirm whether a later consumer exists */
v_mov_b32 v7, 0xffff0000                           // mask for pack two bfloat16 element to 32bit
v_mov_b32 v8, 0x7fff0000                           // fp32 Nan
v_mov_b32 v9, 0x7fff                               // rounding bias for bfloat16
/* One dword store per element: data in the odd VGPR, offset (or BufferOOB) in the preceding even VGPR */
buffer_store_dword v11, v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v13, v12, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v15, v14, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v17, v16, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v19, v18, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v21, v20, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v23, v22, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v25, v24, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v27, v26, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v29, v28, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v31, v30, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v33, v32, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v35, v34, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v37, v36, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v39, v38, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v41, v40, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_nop 0                                            // 1 wait state before the next inst writes vgprs held by a previous store. NOTE(review): generator text said dwordx4, but the stores above are buffer_store_dword
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #9 (d1,d0,vc1,vc0) = */
/*    (0,0,36,0:vw1); (0,0,36,1:vw1); (0,0,36,2:vw1); (0,0,36,3:vw1); (0,0,37,0:vw1); (0,0,37,1:vw1); (0,0,37,2:vw1); (0,0,37,3:vw1); (0,0,38,0:vw1); (0,0,38,1:vw1); (0,0,38,2:vw1); (0,0,38,3:vw1); (0,0,39,0:vw1); (0,0,39,1:vw1); (0,0,39,2:vw1); (0,0,39,3:vw1) */
/******************************************/
/* Summary of this batch (16 elements = 4 rows x 4 columns), as written below: */
/*   v0  = base coord0, v4 = scratch for coord0 + vc0, v1 = coord1 (incremented once per row) */
/*   v3  = coutRowPtrD, advanced by StrideD1J per row; v2 = cinRowPtr, advanced by StrideC1J per row */
/*   v42 = BufferOOB, substituted for the offset of any element with coord0 >= SizeI or coord1 >= SizeJ */
/*   offsets land in v10,v12,...,v40; values are copied from acc66,acc70,...,acc126 into v[vgprValuC+11..41] */
/*   and stored as v11,v13,...,v41 (so vgprValuC is presumably 0 here - TODO confirm against the symbol definition) */
/*   s[28:29] and s[32:33] are reused as temporary lane masks for every element */

/* calc coords, apply mask, and issue loads (if necessary) */
v_mov_b32 v42, BufferOOB                           // v42 = BufferOOB, selected below for out-of-range elements
/* (d1,vc1,d0,vc0)=(0,36,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1: step to the next row (vc1=36)

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row (v2 is not read elsewhere in this batch)
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v10, v3, v0, 0x2                    // v10 = (coutRowPtrD + coord0) << 2: offset used by store D
v_cndmask_b32 v10, v42, v10, s[32:33]              // keep offset where in-bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,36,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // v4 = coord0 + 1 (vc0=1)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v12, v3, v4, 0x2                    // v12 = (coutRowPtrD + coord0) << 2: offset used by store D
v_cndmask_b32 v12, v42, v12, s[32:33]              // keep offset where in-bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,36,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // v4 = coord0 + 2 (vc0=2)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v14, v3, v4, 0x2                    // v14 = (coutRowPtrD + coord0) << 2: offset used by store D
v_cndmask_b32 v14, v42, v14, s[32:33]              // keep offset where in-bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,36,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // v4 = coord0 + 3 (vc0=3)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v16, v3, v4, 0x2                    // v16 = (coutRowPtrD + coord0) << 2: offset used by store D
v_cndmask_b32 v16, v42, v16, s[32:33]              // keep offset where in-bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,37,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1: step to the next row (vc1=37)

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row (v2 is not read elsewhere in this batch)
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v18, v3, v0, 0x2                    // v18 = (coutRowPtrD + coord0) << 2: offset used by store D
v_cndmask_b32 v18, v42, v18, s[32:33]              // keep offset where in-bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,37,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // v4 = coord0 + 1 (vc0=1)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v20, v3, v4, 0x2                    // v20 = (coutRowPtrD + coord0) << 2: offset used by store D
v_cndmask_b32 v20, v42, v20, s[32:33]              // keep offset where in-bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,37,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // v4 = coord0 + 2 (vc0=2)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v22, v3, v4, 0x2                    // v22 = (coutRowPtrD + coord0) << 2: offset used by store D
v_cndmask_b32 v22, v42, v22, s[32:33]              // keep offset where in-bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,37,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // v4 = coord0 + 3 (vc0=3)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v24, v3, v4, 0x2                    // v24 = (coutRowPtrD + coord0) << 2: offset used by store D
v_cndmask_b32 v24, v42, v24, s[32:33]              // keep offset where in-bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,38,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1: step to the next row (vc1=38)

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row (v2 is not read elsewhere in this batch)
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v26, v3, v0, 0x2                    // v26 = (coutRowPtrD + coord0) << 2: offset used by store D
v_cndmask_b32 v26, v42, v26, s[32:33]              // keep offset where in-bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,38,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // v4 = coord0 + 1 (vc0=1)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v28, v3, v4, 0x2                    // v28 = (coutRowPtrD + coord0) << 2: offset used by store D
v_cndmask_b32 v28, v42, v28, s[32:33]              // keep offset where in-bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,38,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // v4 = coord0 + 2 (vc0=2)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v30, v3, v4, 0x2                    // v30 = (coutRowPtrD + coord0) << 2: offset used by store D
v_cndmask_b32 v30, v42, v30, s[32:33]              // keep offset where in-bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,38,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // v4 = coord0 + 3 (vc0=3)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v32, v3, v4, 0x2                    // v32 = (coutRowPtrD + coord0) << 2: offset used by store D
v_cndmask_b32 v32, v42, v32, s[32:33]              // keep offset where in-bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,39,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1: step to the next row (vc1=39)

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row (v2 is not read elsewhere in this batch)
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v34, v3, v0, 0x2                    // v34 = (coutRowPtrD + coord0) << 2: offset used by store D
v_cndmask_b32 v34, v42, v34, s[32:33]              // keep offset where in-bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,39,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // v4 = coord0 + 1 (vc0=1)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v36, v3, v4, 0x2                    // v36 = (coutRowPtrD + coord0) << 2: offset used by store D
v_cndmask_b32 v36, v42, v36, s[32:33]              // keep offset where in-bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,39,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // v4 = coord0 + 2 (vc0=2)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v38, v3, v4, 0x2                    // v38 = (coutRowPtrD + coord0) << 2: offset used by store D
v_cndmask_b32 v38, v42, v38, s[32:33]              // keep offset where in-bounds, else BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,39,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // v4 = coord0 + 3 (vc0=3)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v40, v3, v4, 0x2                    // v40 = (coutRowPtrD + coord0) << 2: offset used by store D
v_cndmask_b32 v40, v42, v40, s[32:33]              // keep offset where in-bounds, else BufferOOB (v42)
/* Copy this batch's 16 accumulators (acc66, acc70, ..., acc126: stride 4) into the odd VGPRs paired with the offsets above */
v_accvgpr_read_b32 v[vgprValuC+11], acc66          // copy acc to vreg[144]
v_accvgpr_read_b32 v[vgprValuC+13], acc70          // copy acc to vreg[145]
v_accvgpr_read_b32 v[vgprValuC+15], acc74          // copy acc to vreg[146]
v_accvgpr_read_b32 v[vgprValuC+17], acc78          // copy acc to vreg[147]
v_accvgpr_read_b32 v[vgprValuC+19], acc82          // copy acc to vreg[148]
v_accvgpr_read_b32 v[vgprValuC+21], acc86          // copy acc to vreg[149]
v_accvgpr_read_b32 v[vgprValuC+23], acc90          // copy acc to vreg[150]
v_accvgpr_read_b32 v[vgprValuC+25], acc94          // copy acc to vreg[151]
v_accvgpr_read_b32 v[vgprValuC+27], acc98          // copy acc to vreg[152]
v_accvgpr_read_b32 v[vgprValuC+29], acc102         // copy acc to vreg[153]
v_accvgpr_read_b32 v[vgprValuC+31], acc106         // copy acc to vreg[154]
v_accvgpr_read_b32 v[vgprValuC+33], acc110         // copy acc to vreg[155]
v_accvgpr_read_b32 v[vgprValuC+35], acc114         // copy acc to vreg[156]
v_accvgpr_read_b32 v[vgprValuC+37], acc118         // copy acc to vreg[157]
v_accvgpr_read_b32 v[vgprValuC+39], acc122         // copy acc to vreg[158]
v_accvgpr_read_b32 v[vgprValuC+41], acc126         // copy acc to vreg[159]

/* rC *= alpha batchElements=[(0, 0, 36, 0), (0, 0, 36, 1), (0, 0, 36, 2), (0, 0, 36, 3), (0, 0, 37, 0), (0, 0, 37, 1), (0, 0, 37, 2), (0, 0, 37, 3), (0, 0, 38, 0), (0, 0, 38, 1), (0, 0, 38, 2), (0, 0, 38, 3), (0, 0, 39, 0), (0, 0, 39, 1), (0, 0, 39, 2), (0, 0, 39, 3)] */
/* NOTE(review): no multiply instructions are emitted for this step in this batch; the values are stored as read from the accumulators */

/* apply mask, calc new C and issue writes */
/* NOTE(review): v7-v9 are written here but not read by any instruction in this batch - confirm whether a later consumer exists */
v_mov_b32 v7, 0xffff0000                           // mask for pack two bfloat16 element to 32bit
v_mov_b32 v8, 0x7fff0000                           // fp32 Nan
v_mov_b32 v9, 0x7fff                               // rounding bias for bfloat16
/* One dword store per element: data in the odd VGPR, offset (or BufferOOB) in the preceding even VGPR */
buffer_store_dword v11, v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v13, v12, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v15, v14, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v17, v16, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v19, v18, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v21, v20, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v23, v22, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v25, v24, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v27, v26, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v29, v28, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v31, v30, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v33, v32, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v35, v34, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v37, v36, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v39, v38, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v41, v40, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_nop 0                                            // 1 wait state before the next inst writes vgprs held by a previous store. NOTE(review): generator text said dwordx4, but the stores above are buffer_store_dword
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #10 (d1,d0,vc1,vc0) = */
/*    (0,0,40,0:vw1); (0,0,40,1:vw1); (0,0,40,2:vw1); (0,0,40,3:vw1); (0,0,41,0:vw1); (0,0,41,1:vw1); (0,0,41,2:vw1); (0,0,41,3:vw1); (0,0,42,0:vw1); (0,0,42,1:vw1); (0,0,42,2:vw1); (0,0,42,3:vw1); (0,0,43,0:vw1); (0,0,43,1:vw1); (0,0,43,2:vw1); (0,0,43,3:vw1) */
/******************************************/
/*
 * Purpose: write 16 accumulator values (4 rows vc1=40..43 x 4 elements vc0=0..3)
 * to D, one dword per element, with per-lane bounds clipping.
 *
 * Register roles as used in this batch:
 *   v0        coord0 of element vc0=0 (read only here)
 *   v1        coord1, incremented by 1 at the start of every row
 *   v2        cinRowPtr, advanced by StrideC1J per row; not otherwise read in this batch
 *   v3        coutRowPtrD, advanced by StrideD1J per row; base of every store offset
 *   v4        scratch: coord0 + vc0 for vc0 = 1..3
 *   v42       BufferOOB, the offset substituted for out-of-bounds lanes
 *   v10,v12,...,v40   store offsets (even registers)
 *   v11,v13,...,v41   store data read by the buffer stores (odd registers)
 *   s[28:29], s[32:33]  lane-mask temporaries, overwritten for every element
 *
 * Phases: (1) compute 16 clipped offsets, (2) copy 16 values out of the
 * accumulator registers, (3) issue 16 buffer_store_dword instructions.
 * Instruction order matters: v1/v2/v3 are running accumulators and the SGPR
 * mask pairs and v4 are reused by each element.
 */

/* calc coords, apply mask, and issue loads (if necessary) */
/* Note: no loads are issued in this batch; only offsets and masks are computed. */
v_mov_b32 v42, BufferOOB                           // v42 = BufferOOB (symbol defined outside this chunk; presumably an offset the buffer range check rejects - confirm)
/* (d1,vc1,d0,vc0)=(0,40,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 (v1) += 1: step to the next row; the carry written to vcc is not consumed in this batch

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC: advance cinRowPtr (v2) by StrideC1J
v_add_u32 v3, v3, s[sgprStrideD1J]                 // advance coutRowPtrD (v3) by StrideD1J to the next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // s[28:29] = per-lane (coord0 < SizeI)
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = per-lane (coord1 < SizeJ)
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1
v_add_lshl_u32 v10, v3, v0, 0x2                    // v10 = (coutRowPtrD + coord0) << 2: store offset into D
v_cndmask_b32 v10, v42, v10, s[32:33]              // keep v10 where the mask is set, otherwise use BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,40,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // v4 = coord0 + 1
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (v4 < SizeI)
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = (coord1 < SizeJ); recomputed because s[32:33] was overwritten
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v12, v3, v4, 0x2                    // v12 = (coutRowPtrD + v4) << 2
v_cndmask_b32 v12, v42, v12, s[32:33]              // out-of-bounds lanes get BufferOOB
/* (d1,vc1,d0,vc0)=(0,40,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // v4 = coord0 + 2
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (v4 < SizeI)
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = (coord1 < SizeJ)
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v14, v3, v4, 0x2                    // v14 = (coutRowPtrD + v4) << 2
v_cndmask_b32 v14, v42, v14, s[32:33]              // out-of-bounds lanes get BufferOOB
/* (d1,vc1,d0,vc0)=(0,40,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // v4 = coord0 + 3
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (v4 < SizeI)
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = (coord1 < SizeJ)
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v16, v3, v4, 0x2                    // v16 = (coutRowPtrD + v4) << 2
v_cndmask_b32 v16, v42, v16, s[32:33]              // out-of-bounds lanes get BufferOOB
/* (d1,vc1,d0,vc0)=(0,41,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 (v1) += 1: next row

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC: advance cinRowPtr (v2) by StrideC1J
v_add_u32 v3, v3, s[sgprStrideD1J]                 // advance coutRowPtrD (v3) by StrideD1J
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // s[28:29] = (coord0 < SizeI)
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = (coord1 < SizeJ)
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v18, v3, v0, 0x2                    // v18 = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v18, v42, v18, s[32:33]              // out-of-bounds lanes get BufferOOB
/* (d1,vc1,d0,vc0)=(0,41,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // v4 = coord0 + 1
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (v4 < SizeI)
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = (coord1 < SizeJ)
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v20, v3, v4, 0x2                    // v20 = (coutRowPtrD + v4) << 2
v_cndmask_b32 v20, v42, v20, s[32:33]              // out-of-bounds lanes get BufferOOB
/* (d1,vc1,d0,vc0)=(0,41,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // v4 = coord0 + 2
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (v4 < SizeI)
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = (coord1 < SizeJ)
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v22, v3, v4, 0x2                    // v22 = (coutRowPtrD + v4) << 2
v_cndmask_b32 v22, v42, v22, s[32:33]              // out-of-bounds lanes get BufferOOB
/* (d1,vc1,d0,vc0)=(0,41,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // v4 = coord0 + 3
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (v4 < SizeI)
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = (coord1 < SizeJ)
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v24, v3, v4, 0x2                    // v24 = (coutRowPtrD + v4) << 2
v_cndmask_b32 v24, v42, v24, s[32:33]              // out-of-bounds lanes get BufferOOB
/* (d1,vc1,d0,vc0)=(0,42,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 (v1) += 1: next row

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC: advance cinRowPtr (v2) by StrideC1J
v_add_u32 v3, v3, s[sgprStrideD1J]                 // advance coutRowPtrD (v3) by StrideD1J
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // s[28:29] = (coord0 < SizeI)
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = (coord1 < SizeJ)
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v26, v3, v0, 0x2                    // v26 = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v26, v42, v26, s[32:33]              // out-of-bounds lanes get BufferOOB
/* (d1,vc1,d0,vc0)=(0,42,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // v4 = coord0 + 1
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (v4 < SizeI)
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = (coord1 < SizeJ)
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v28, v3, v4, 0x2                    // v28 = (coutRowPtrD + v4) << 2
v_cndmask_b32 v28, v42, v28, s[32:33]              // out-of-bounds lanes get BufferOOB
/* (d1,vc1,d0,vc0)=(0,42,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // v4 = coord0 + 2
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (v4 < SizeI)
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = (coord1 < SizeJ)
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v30, v3, v4, 0x2                    // v30 = (coutRowPtrD + v4) << 2
v_cndmask_b32 v30, v42, v30, s[32:33]              // out-of-bounds lanes get BufferOOB
/* (d1,vc1,d0,vc0)=(0,42,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // v4 = coord0 + 3
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (v4 < SizeI)
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = (coord1 < SizeJ)
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v32, v3, v4, 0x2                    // v32 = (coutRowPtrD + v4) << 2
v_cndmask_b32 v32, v42, v32, s[32:33]              // out-of-bounds lanes get BufferOOB
/* (d1,vc1,d0,vc0)=(0,43,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 (v1) += 1: next row

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC: advance cinRowPtr (v2) by StrideC1J
v_add_u32 v3, v3, s[sgprStrideD1J]                 // advance coutRowPtrD (v3) by StrideD1J
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // s[28:29] = (coord0 < SizeI)
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = (coord1 < SizeJ)
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v34, v3, v0, 0x2                    // v34 = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v34, v42, v34, s[32:33]              // out-of-bounds lanes get BufferOOB
/* (d1,vc1,d0,vc0)=(0,43,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // v4 = coord0 + 1
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (v4 < SizeI)
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = (coord1 < SizeJ)
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v36, v3, v4, 0x2                    // v36 = (coutRowPtrD + v4) << 2
v_cndmask_b32 v36, v42, v36, s[32:33]              // out-of-bounds lanes get BufferOOB
/* (d1,vc1,d0,vc0)=(0,43,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // v4 = coord0 + 2
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (v4 < SizeI)
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = (coord1 < SizeJ)
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v38, v3, v4, 0x2                    // v38 = (coutRowPtrD + v4) << 2
v_cndmask_b32 v38, v42, v38, s[32:33]              // out-of-bounds lanes get BufferOOB
/* (d1,vc1,d0,vc0)=(0,43,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // v4 = coord0 + 3
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (v4 < SizeI)
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = (coord1 < SizeJ)
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v40, v3, v4, 0x2                    // v40 = (coutRowPtrD + v4) << 2
v_cndmask_b32 v40, v42, v40, s[32:33]              // out-of-bounds lanes get BufferOOB
/*
 * Copy this batch's 16 results out of the accumulator registers (acc130..acc190,
 * every 4th register). NOTE(review): the stores below read v11, v13, ..., v41,
 * which are these destinations only if vgprValuC == 0; vgprValuC is defined
 * outside this chunk - confirm.
 */
v_accvgpr_read_b32 v[vgprValuC+11], acc130         // acc130 -> v[vgprValuC+11] (generator element 160)
v_accvgpr_read_b32 v[vgprValuC+13], acc134         // acc134 -> v[vgprValuC+13] (generator element 161)
v_accvgpr_read_b32 v[vgprValuC+15], acc138         // acc138 -> v[vgprValuC+15] (generator element 162)
v_accvgpr_read_b32 v[vgprValuC+17], acc142         // acc142 -> v[vgprValuC+17] (generator element 163)
v_accvgpr_read_b32 v[vgprValuC+19], acc146         // acc146 -> v[vgprValuC+19] (generator element 164)
v_accvgpr_read_b32 v[vgprValuC+21], acc150         // acc150 -> v[vgprValuC+21] (generator element 165)
v_accvgpr_read_b32 v[vgprValuC+23], acc154         // acc154 -> v[vgprValuC+23] (generator element 166)
v_accvgpr_read_b32 v[vgprValuC+25], acc158         // acc158 -> v[vgprValuC+25] (generator element 167)
v_accvgpr_read_b32 v[vgprValuC+27], acc162         // acc162 -> v[vgprValuC+27] (generator element 168)
v_accvgpr_read_b32 v[vgprValuC+29], acc166         // acc166 -> v[vgprValuC+29] (generator element 169)
v_accvgpr_read_b32 v[vgprValuC+31], acc170         // acc170 -> v[vgprValuC+31] (generator element 170)
v_accvgpr_read_b32 v[vgprValuC+33], acc174         // acc174 -> v[vgprValuC+33] (generator element 171)
v_accvgpr_read_b32 v[vgprValuC+35], acc178         // acc178 -> v[vgprValuC+35] (generator element 172)
v_accvgpr_read_b32 v[vgprValuC+37], acc182         // acc182 -> v[vgprValuC+37] (generator element 173)
v_accvgpr_read_b32 v[vgprValuC+39], acc186         // acc186 -> v[vgprValuC+39] (generator element 174)
v_accvgpr_read_b32 v[vgprValuC+41], acc190         // acc190 -> v[vgprValuC+41] (generator element 175)

/* rC *= alpha batchElements=[(0, 0, 40, 0), (0, 0, 40, 1), (0, 0, 40, 2), (0, 0, 40, 3), (0, 0, 41, 0), (0, 0, 41, 1), (0, 0, 41, 2), (0, 0, 41, 3), (0, 0, 42, 0), (0, 0, 42, 1), (0, 0, 42, 2), (0, 0, 42, 3), (0, 0, 43, 0), (0, 0, 43, 1), (0, 0, 43, 2), (0, 0, 43, 3)] */
/* Note: no instructions are emitted for the alpha step in this batch; values are stored as read from the accumulators. */

/* apply mask, calc new C and issue writes */
/* v7/v8/v9 are loaded with the generator's bfloat16 helper constants but are not read by any instruction in this batch. */
v_mov_b32 v7, 0xffff0000                           // generator: mask for packing two bfloat16 elements into 32 bits
v_mov_b32 v8, 0x7fff0000                           // generator: value labelled "fp32 Nan"
v_mov_b32 v9, 0x7fff                               // generator: rounding bias for bfloat16
/* One dword per element: data in the odd register, clipped offset (offen) in the preceding even register, descriptor s[sgprSrdD:sgprSrdD+3]. */
buffer_store_dword v11, v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: element (40,0)
buffer_store_dword v13, v12, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: element (40,1)
buffer_store_dword v15, v14, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: element (40,2)
buffer_store_dword v17, v16, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: element (40,3)
buffer_store_dword v19, v18, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: element (41,0)
buffer_store_dword v21, v20, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: element (41,1)
buffer_store_dword v23, v22, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: element (41,2)
buffer_store_dword v25, v24, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: element (41,3)
buffer_store_dword v27, v26, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: element (42,0)
buffer_store_dword v29, v28, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: element (42,1)
buffer_store_dword v31, v30, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: element (42,2)
buffer_store_dword v33, v32, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: element (42,3)
buffer_store_dword v35, v34, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: element (43,0)
buffer_store_dword v37, v36, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: element (43,1)
buffer_store_dword v39, v38, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: element (43,2)
buffer_store_dword v41, v40, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: element (43,3)
s_nop 0                                            // 1 wait state before later code overwrites VGPRs used by the stores above (generator text says dwordx4; the stores here are single-dword)
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #11 (d1,d0,vc1,vc0) = */
/*    (0,0,44,0:vw1); (0,0,44,1:vw1); (0,0,44,2:vw1); (0,0,44,3:vw1); (0,0,45,0:vw1); (0,0,45,1:vw1); (0,0,45,2:vw1); (0,0,45,3:vw1); (0,0,46,0:vw1); (0,0,46,1:vw1); (0,0,46,2:vw1); (0,0,46,3:vw1); (0,0,47,0:vw1); (0,0,47,1:vw1); (0,0,47,2:vw1); (0,0,47,3:vw1) */
/******************************************/
/*
 * Purpose: write 16 accumulator values (4 rows vc1=44..47 x 4 elements vc0=0..3)
 * to D, one dword per element, with per-lane bounds clipping.
 *
 * Register roles as used in this batch:
 *   v0        coord0 of element vc0=0 (read only here)
 *   v1        coord1, incremented by 1 at the start of every row
 *   v2        cinRowPtr, advanced by StrideC1J per row; not otherwise read in this batch
 *   v3        coutRowPtrD, advanced by StrideD1J per row; base of every store offset
 *   v4        scratch: coord0 + vc0 for vc0 = 1..3
 *   v42       BufferOOB, the offset substituted for out-of-bounds lanes
 *   v10,v12,...,v40   store offsets (even registers)
 *   v11,v13,...,v41   store data read by the buffer stores (odd registers)
 *   s[28:29], s[32:33]  lane-mask temporaries, overwritten for every element
 *
 * Phases: (1) compute 16 clipped offsets, (2) copy 16 values out of the
 * accumulator registers, (3) issue 16 buffer_store_dword instructions.
 * Instruction order matters: v1/v2/v3 are running accumulators and the SGPR
 * mask pairs and v4 are reused by each element.
 */

/* calc coords, apply mask, and issue loads (if necessary) */
/* Note: no loads are issued in this batch; only offsets and masks are computed. */
v_mov_b32 v42, BufferOOB                           // v42 = BufferOOB (symbol defined outside this chunk; presumably an offset the buffer range check rejects - confirm)
/* (d1,vc1,d0,vc0)=(0,44,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 (v1) += 1: step to the next row; the carry written to vcc is not consumed in this batch

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC: advance cinRowPtr (v2) by StrideC1J
v_add_u32 v3, v3, s[sgprStrideD1J]                 // advance coutRowPtrD (v3) by StrideD1J to the next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // s[28:29] = per-lane (coord0 < SizeI)
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = per-lane (coord1 < SizeJ)
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in0 && in1
v_add_lshl_u32 v10, v3, v0, 0x2                    // v10 = (coutRowPtrD + coord0) << 2: store offset into D
v_cndmask_b32 v10, v42, v10, s[32:33]              // keep v10 where the mask is set, otherwise use BufferOOB (v42)
/* (d1,vc1,d0,vc0)=(0,44,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // v4 = coord0 + 1
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (v4 < SizeI)
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = (coord1 < SizeJ); recomputed because s[32:33] was overwritten
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v12, v3, v4, 0x2                    // v12 = (coutRowPtrD + v4) << 2
v_cndmask_b32 v12, v42, v12, s[32:33]              // out-of-bounds lanes get BufferOOB
/* (d1,vc1,d0,vc0)=(0,44,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // v4 = coord0 + 2
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (v4 < SizeI)
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = (coord1 < SizeJ)
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v14, v3, v4, 0x2                    // v14 = (coutRowPtrD + v4) << 2
v_cndmask_b32 v14, v42, v14, s[32:33]              // out-of-bounds lanes get BufferOOB
/* (d1,vc1,d0,vc0)=(0,44,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // v4 = coord0 + 3
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (v4 < SizeI)
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = (coord1 < SizeJ)
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v16, v3, v4, 0x2                    // v16 = (coutRowPtrD + v4) << 2
v_cndmask_b32 v16, v42, v16, s[32:33]              // out-of-bounds lanes get BufferOOB
/* (d1,vc1,d0,vc0)=(0,45,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 (v1) += 1: next row

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC: advance cinRowPtr (v2) by StrideC1J
v_add_u32 v3, v3, s[sgprStrideD1J]                 // advance coutRowPtrD (v3) by StrideD1J
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // s[28:29] = (coord0 < SizeI)
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = (coord1 < SizeJ)
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v18, v3, v0, 0x2                    // v18 = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v18, v42, v18, s[32:33]              // out-of-bounds lanes get BufferOOB
/* (d1,vc1,d0,vc0)=(0,45,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // v4 = coord0 + 1
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (v4 < SizeI)
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = (coord1 < SizeJ)
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v20, v3, v4, 0x2                    // v20 = (coutRowPtrD + v4) << 2
v_cndmask_b32 v20, v42, v20, s[32:33]              // out-of-bounds lanes get BufferOOB
/* (d1,vc1,d0,vc0)=(0,45,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // v4 = coord0 + 2
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (v4 < SizeI)
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = (coord1 < SizeJ)
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v22, v3, v4, 0x2                    // v22 = (coutRowPtrD + v4) << 2
v_cndmask_b32 v22, v42, v22, s[32:33]              // out-of-bounds lanes get BufferOOB
/* (d1,vc1,d0,vc0)=(0,45,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // v4 = coord0 + 3
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (v4 < SizeI)
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = (coord1 < SizeJ)
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v24, v3, v4, 0x2                    // v24 = (coutRowPtrD + v4) << 2
v_cndmask_b32 v24, v42, v24, s[32:33]              // out-of-bounds lanes get BufferOOB
/* (d1,vc1,d0,vc0)=(0,46,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 (v1) += 1: next row

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC: advance cinRowPtr (v2) by StrideC1J
v_add_u32 v3, v3, s[sgprStrideD1J]                 // advance coutRowPtrD (v3) by StrideD1J
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // s[28:29] = (coord0 < SizeI)
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = (coord1 < SizeJ)
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v26, v3, v0, 0x2                    // v26 = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v26, v42, v26, s[32:33]              // out-of-bounds lanes get BufferOOB
/* (d1,vc1,d0,vc0)=(0,46,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // v4 = coord0 + 1
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (v4 < SizeI)
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = (coord1 < SizeJ)
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v28, v3, v4, 0x2                    // v28 = (coutRowPtrD + v4) << 2
v_cndmask_b32 v28, v42, v28, s[32:33]              // out-of-bounds lanes get BufferOOB
/* (d1,vc1,d0,vc0)=(0,46,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // v4 = coord0 + 2
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (v4 < SizeI)
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = (coord1 < SizeJ)
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v30, v3, v4, 0x2                    // v30 = (coutRowPtrD + v4) << 2
v_cndmask_b32 v30, v42, v30, s[32:33]              // out-of-bounds lanes get BufferOOB
/* (d1,vc1,d0,vc0)=(0,46,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // v4 = coord0 + 3
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (v4 < SizeI)
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = (coord1 < SizeJ)
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v32, v3, v4, 0x2                    // v32 = (coutRowPtrD + v4) << 2
v_cndmask_b32 v32, v42, v32, s[32:33]              // out-of-bounds lanes get BufferOOB
/* (d1,vc1,d0,vc0)=(0,47,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 (v1) += 1: next row

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC: advance cinRowPtr (v2) by StrideC1J
v_add_u32 v3, v3, s[sgprStrideD1J]                 // advance coutRowPtrD (v3) by StrideD1J
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // s[28:29] = (coord0 < SizeI)
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = (coord1 < SizeJ)
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v34, v3, v0, 0x2                    // v34 = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v34, v42, v34, s[32:33]              // out-of-bounds lanes get BufferOOB
/* (d1,vc1,d0,vc0)=(0,47,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // v4 = coord0 + 1
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (v4 < SizeI)
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = (coord1 < SizeJ)
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v36, v3, v4, 0x2                    // v36 = (coutRowPtrD + v4) << 2
v_cndmask_b32 v36, v42, v36, s[32:33]              // out-of-bounds lanes get BufferOOB
/* (d1,vc1,d0,vc0)=(0,47,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // v4 = coord0 + 2
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (v4 < SizeI)
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = (coord1 < SizeJ)
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v38, v3, v4, 0x2                    // v38 = (coutRowPtrD + v4) << 2
v_cndmask_b32 v38, v42, v38, s[32:33]              // out-of-bounds lanes get BufferOOB
/* (d1,vc1,d0,vc0)=(0,47,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // v4 = coord0 + 3
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (v4 < SizeI)
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = (coord1 < SizeJ)
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v40, v3, v4, 0x2                    // v40 = (coutRowPtrD + v4) << 2
v_cndmask_b32 v40, v42, v40, s[32:33]              // out-of-bounds lanes get BufferOOB
/*
 * Copy this batch's 16 results out of the accumulator registers (acc194..acc254,
 * every 4th register). NOTE(review): the stores below read v11, v13, ..., v41,
 * which are these destinations only if vgprValuC == 0; vgprValuC is defined
 * outside this chunk - confirm.
 */
v_accvgpr_read_b32 v[vgprValuC+11], acc194         // acc194 -> v[vgprValuC+11] (generator element 176)
v_accvgpr_read_b32 v[vgprValuC+13], acc198         // acc198 -> v[vgprValuC+13] (generator element 177)
v_accvgpr_read_b32 v[vgprValuC+15], acc202         // acc202 -> v[vgprValuC+15] (generator element 178)
v_accvgpr_read_b32 v[vgprValuC+17], acc206         // acc206 -> v[vgprValuC+17] (generator element 179)
v_accvgpr_read_b32 v[vgprValuC+19], acc210         // acc210 -> v[vgprValuC+19] (generator element 180)
v_accvgpr_read_b32 v[vgprValuC+21], acc214         // acc214 -> v[vgprValuC+21] (generator element 181)
v_accvgpr_read_b32 v[vgprValuC+23], acc218         // acc218 -> v[vgprValuC+23] (generator element 182)
v_accvgpr_read_b32 v[vgprValuC+25], acc222         // acc222 -> v[vgprValuC+25] (generator element 183)
v_accvgpr_read_b32 v[vgprValuC+27], acc226         // acc226 -> v[vgprValuC+27] (generator element 184)
v_accvgpr_read_b32 v[vgprValuC+29], acc230         // acc230 -> v[vgprValuC+29] (generator element 185)
v_accvgpr_read_b32 v[vgprValuC+31], acc234         // acc234 -> v[vgprValuC+31] (generator element 186)
v_accvgpr_read_b32 v[vgprValuC+33], acc238         // acc238 -> v[vgprValuC+33] (generator element 187)
v_accvgpr_read_b32 v[vgprValuC+35], acc242         // acc242 -> v[vgprValuC+35] (generator element 188)
v_accvgpr_read_b32 v[vgprValuC+37], acc246         // acc246 -> v[vgprValuC+37] (generator element 189)
v_accvgpr_read_b32 v[vgprValuC+39], acc250         // acc250 -> v[vgprValuC+39] (generator element 190)
v_accvgpr_read_b32 v[vgprValuC+41], acc254         // acc254 -> v[vgprValuC+41] (generator element 191)

/* rC *= alpha batchElements=[(0, 0, 44, 0), (0, 0, 44, 1), (0, 0, 44, 2), (0, 0, 44, 3), (0, 0, 45, 0), (0, 0, 45, 1), (0, 0, 45, 2), (0, 0, 45, 3), (0, 0, 46, 0), (0, 0, 46, 1), (0, 0, 46, 2), (0, 0, 46, 3), (0, 0, 47, 0), (0, 0, 47, 1), (0, 0, 47, 2), (0, 0, 47, 3)] */
/* Note: no instructions are emitted for the alpha step in this batch; values are stored as read from the accumulators. */

/* apply mask, calc new C and issue writes */
/* v7/v8/v9 are loaded with the generator's bfloat16 helper constants but are not read by any instruction in this batch. */
v_mov_b32 v7, 0xffff0000                           // generator: mask for packing two bfloat16 elements into 32 bits
v_mov_b32 v8, 0x7fff0000                           // generator: value labelled "fp32 Nan"
v_mov_b32 v9, 0x7fff                               // generator: rounding bias for bfloat16
/* One dword per element: data in the odd register, clipped offset (offen) in the preceding even register, descriptor s[sgprSrdD:sgprSrdD+3]. */
buffer_store_dword v11, v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: element (44,0)
buffer_store_dword v13, v12, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: element (44,1)
buffer_store_dword v15, v14, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: element (44,2)
buffer_store_dword v17, v16, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: element (44,3)
buffer_store_dword v19, v18, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: element (45,0)
buffer_store_dword v21, v20, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: element (45,1)
buffer_store_dword v23, v22, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: element (45,2)
buffer_store_dword v25, v24, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: element (45,3)
buffer_store_dword v27, v26, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: element (46,0)
buffer_store_dword v29, v28, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: element (46,1)
buffer_store_dword v31, v30, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: element (46,2)
buffer_store_dword v33, v32, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: element (46,3)
buffer_store_dword v35, v34, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: element (47,0)
buffer_store_dword v37, v36, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: element (47,1)
buffer_store_dword v39, v38, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: element (47,2)
buffer_store_dword v41, v40, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D: element (47,3)
s_nop 0                                            // 1 wait state before later code overwrites VGPRs used by the stores above (generator text says dwordx4; the stores here are single-dword)
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #12 (d1,d0,vc1,vc0) = */
/*    (0,0,48,0:vw1); (0,0,48,1:vw1); (0,0,48,2:vw1); (0,0,48,3:vw1); (0,0,49,0:vw1); (0,0,49,1:vw1); (0,0,49,2:vw1); (0,0,49,3:vw1); (0,0,50,0:vw1); (0,0,50,1:vw1); (0,0,50,2:vw1); (0,0,50,3:vw1); (0,0,51,0:vw1); (0,0,51,1:vw1); (0,0,51,2:vw1); (0,0,51,3:vw1) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
v_mov_b32 v42, BufferOOB
/* (d1,vc1,d0,vc0)=(0,48,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v10, v3, v0, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v10, v42, v10, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,48,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v12, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v12, v42, v12, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,48,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v14, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v14, v42, v14, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,48,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v16, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v16, v42, v16, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,49,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v18, v3, v0, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v18, v42, v18, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,49,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v20, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v20, v42, v20, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,49,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v22, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v22, v42, v22, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,49,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v24, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v24, v42, v24, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,50,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v26, v3, v0, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v26, v42, v26, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,50,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v28, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v28, v42, v28, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,50,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v30, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v30, v42, v30, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,50,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v32, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v32, v42, v32, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,51,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v34, v3, v0, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v34, v42, v34, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,51,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v36, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v36, v42, v36, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,51,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v38, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v38, v42, v38, s[32:33]              // LDD clip if OOB. offset
/* (d1,vc1,d0,vc0)=(0,51,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: coord0 += d0*sg0*VW + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v40, v3, v4, 0x2                    // scaleToBpe: accumulate d0 lower and *= bpe into Cin addr
v_cndmask_b32 v40, v42, v40, s[32:33]              // LDD clip if OOB. offset
v_accvgpr_read_b32 v[vgprValuC+11], acc3           // copy acc to vreg[192]
v_accvgpr_read_b32 v[vgprValuC+13], acc7           // copy acc to vreg[193]
v_accvgpr_read_b32 v[vgprValuC+15], acc11          // copy acc to vreg[194]
v_accvgpr_read_b32 v[vgprValuC+17], acc15          // copy acc to vreg[195]
v_accvgpr_read_b32 v[vgprValuC+19], acc19          // copy acc to vreg[196]
v_accvgpr_read_b32 v[vgprValuC+21], acc23          // copy acc to vreg[197]
v_accvgpr_read_b32 v[vgprValuC+23], acc27          // copy acc to vreg[198]
v_accvgpr_read_b32 v[vgprValuC+25], acc31          // copy acc to vreg[199]
v_accvgpr_read_b32 v[vgprValuC+27], acc35          // copy acc to vreg[200]
v_accvgpr_read_b32 v[vgprValuC+29], acc39          // copy acc to vreg[201]
v_accvgpr_read_b32 v[vgprValuC+31], acc43          // copy acc to vreg[202]
v_accvgpr_read_b32 v[vgprValuC+33], acc47          // copy acc to vreg[203]
v_accvgpr_read_b32 v[vgprValuC+35], acc51          // copy acc to vreg[204]
v_accvgpr_read_b32 v[vgprValuC+37], acc55          // copy acc to vreg[205]
v_accvgpr_read_b32 v[vgprValuC+39], acc59          // copy acc to vreg[206]
v_accvgpr_read_b32 v[vgprValuC+41], acc63          // copy acc to vreg[207]

/* rC *= alpha batchElements=[(0, 0, 48, 0), (0, 0, 48, 1), (0, 0, 48, 2), (0, 0, 48, 3), (0, 0, 49, 0), (0, 0, 49, 1), (0, 0, 49, 2), (0, 0, 49, 3), (0, 0, 50, 0), (0, 0, 50, 1), (0, 0, 50, 2), (0, 0, 50, 3), (0, 0, 51, 0), (0, 0, 51, 1), (0, 0, 51, 2), (0, 0, 51, 3)] */

/* apply mask, calc new C and issue writes */
v_mov_b32 v7, 0xffff0000                           // mask for pack two bfloat16 element to 32bit
v_mov_b32 v8, 0x7fff0000                           // fp32 Nan
v_mov_b32 v9, 0x7fff                               // rounding bias for bfloat16
buffer_store_dword v11, v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v13, v12, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v15, v14, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v17, v16, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v19, v18, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v21, v20, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v23, v22, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v25, v24, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v27, v26, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v29, v28, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v31, v30, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v33, v32, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v35, v34, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v37, v36, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v39, v38, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v41, v40, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_nop 0                                            // 1 wait state required when next inst writes vgprs held by previous dwordx4 store inst
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #13 (d1,d0,vc1,vc0) = */
/*    (0,0,52,0:vw1); (0,0,52,1:vw1); (0,0,52,2:vw1); (0,0,52,3:vw1); (0,0,53,0:vw1); (0,0,53,1:vw1); (0,0,53,2:vw1); (0,0,53,3:vw1); (0,0,54,0:vw1); (0,0,54,1:vw1); (0,0,54,2:vw1); (0,0,54,3:vw1); (0,0,55,0:vw1); (0,0,55,1:vw1); (0,0,55,2:vw1); (0,0,55,3:vw1) */
/******************************************/
/* Batch overview: 16 elements = 4 rows (vc1 = 52..55) x 4 consecutive coord0 positions (vc0 = 0..3). */
/*   1. Per element, build a byte offset into D in an even VGPR (v10, v12, ..., v40): */
/*      offset = (v3 + coord0) << 2, replaced by BufferOOB (v42) when coord0 >= SizeI or coord1 >= SizeJ. */
/*   2. Copy 16 accumulators (acc67, acc71, ..., acc127; stride 4) into v[vgprValuC+11], +13, ..., +41. */
/*   3. Issue 16 buffer_store_dword through the SrdD descriptor (data = odd VGPR, offset = even VGPR). */
/* Register roles as used below: v0 = coord0 base, v1 = coord1 (incremented once per row), */
/* v2 = cinRowPtr, v3 = coutRowPtrD, v4 = scratch (coord0 + vc0), s[28:29] and s[32:33] = bounds masks. */
/* NOTE(review): the stores read literal v11..v41 while the accumulator copies write v[vgprValuC+11..41]; */
/* these match only if vgprValuC == 0 - assumed, confirm against the symbol definition earlier in the file. */

/* calc coords, apply mask, and issue loads (if necessary) */
/* No load instructions are emitted in this batch; only store offsets are computed. */
v_mov_b32 v42, BufferOOB
/* (d1,vc1,d0,vc0)=(0,52,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1: step to the next row (vcc carry-out is not read in this batch)

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // cinRowPtr (v2) += StrideC1J; v2 is not read in this batch
v_add_u32 v3, v3, s[sgprStrideD1J]                 // coutRowPtrD (v3) += StrideD1J
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // s[28:29] = coord0 < SizeI (unsigned)
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ (unsigned)
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in bounds on both axes
v_add_lshl_u32 v10, v3, v0, 0x2                    // v10 = (coutRowPtrD + coord0) << 2: byte offset of the dword in D
v_cndmask_b32 v10, v42, v10, s[32:33]              // in bounds ? keep offset : BufferOOB
/* (d1,vc1,d0,vc0)=(0,52,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // v4 = coord0 + 1 (scratch)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 1) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ (recomputed; s[32:33] was overwritten above)
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v12, v3, v4, 0x2                    // v12 = (coutRowPtrD + coord0 + 1) << 2
v_cndmask_b32 v12, v42, v12, s[32:33]              // in bounds ? keep offset : BufferOOB
/* (d1,vc1,d0,vc0)=(0,52,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // v4 = coord0 + 2 (scratch)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 2) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v14, v3, v4, 0x2                    // v14 = (coutRowPtrD + coord0 + 2) << 2
v_cndmask_b32 v14, v42, v14, s[32:33]              // in bounds ? keep offset : BufferOOB
/* (d1,vc1,d0,vc0)=(0,52,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // v4 = coord0 + 3 (scratch)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 3) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v16, v3, v4, 0x2                    // v16 = (coutRowPtrD + coord0 + 3) << 2
v_cndmask_b32 v16, v42, v16, s[32:33]              // in bounds ? keep offset : BufferOOB
/* (d1,vc1,d0,vc0)=(0,53,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1: step to the next row

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // cinRowPtr (v2) += StrideC1J; v2 is not read in this batch
v_add_u32 v3, v3, s[sgprStrideD1J]                 // coutRowPtrD (v3) += StrideD1J
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // s[28:29] = coord0 < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v18, v3, v0, 0x2                    // v18 = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v18, v42, v18, s[32:33]              // in bounds ? keep offset : BufferOOB
/* (d1,vc1,d0,vc0)=(0,53,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // v4 = coord0 + 1 (scratch)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 1) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v20, v3, v4, 0x2                    // v20 = (coutRowPtrD + coord0 + 1) << 2
v_cndmask_b32 v20, v42, v20, s[32:33]              // in bounds ? keep offset : BufferOOB
/* (d1,vc1,d0,vc0)=(0,53,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // v4 = coord0 + 2 (scratch)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 2) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v22, v3, v4, 0x2                    // v22 = (coutRowPtrD + coord0 + 2) << 2
v_cndmask_b32 v22, v42, v22, s[32:33]              // in bounds ? keep offset : BufferOOB
/* (d1,vc1,d0,vc0)=(0,53,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // v4 = coord0 + 3 (scratch)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 3) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v24, v3, v4, 0x2                    // v24 = (coutRowPtrD + coord0 + 3) << 2
v_cndmask_b32 v24, v42, v24, s[32:33]              // in bounds ? keep offset : BufferOOB
/* (d1,vc1,d0,vc0)=(0,54,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1: step to the next row

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // cinRowPtr (v2) += StrideC1J; v2 is not read in this batch
v_add_u32 v3, v3, s[sgprStrideD1J]                 // coutRowPtrD (v3) += StrideD1J
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // s[28:29] = coord0 < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v26, v3, v0, 0x2                    // v26 = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v26, v42, v26, s[32:33]              // in bounds ? keep offset : BufferOOB
/* (d1,vc1,d0,vc0)=(0,54,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // v4 = coord0 + 1 (scratch)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 1) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v28, v3, v4, 0x2                    // v28 = (coutRowPtrD + coord0 + 1) << 2
v_cndmask_b32 v28, v42, v28, s[32:33]              // in bounds ? keep offset : BufferOOB
/* (d1,vc1,d0,vc0)=(0,54,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // v4 = coord0 + 2 (scratch)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 2) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v30, v3, v4, 0x2                    // v30 = (coutRowPtrD + coord0 + 2) << 2
v_cndmask_b32 v30, v42, v30, s[32:33]              // in bounds ? keep offset : BufferOOB
/* (d1,vc1,d0,vc0)=(0,54,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // v4 = coord0 + 3 (scratch)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 3) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v32, v3, v4, 0x2                    // v32 = (coutRowPtrD + coord0 + 3) << 2
v_cndmask_b32 v32, v42, v32, s[32:33]              // in bounds ? keep offset : BufferOOB
/* (d1,vc1,d0,vc0)=(0,55,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1: step to the next row

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // cinRowPtr (v2) += StrideC1J; v2 is not read in this batch
v_add_u32 v3, v3, s[sgprStrideD1J]                 // coutRowPtrD (v3) += StrideD1J
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // s[28:29] = coord0 < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v34, v3, v0, 0x2                    // v34 = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v34, v42, v34, s[32:33]              // in bounds ? keep offset : BufferOOB
/* (d1,vc1,d0,vc0)=(0,55,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // v4 = coord0 + 1 (scratch)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 1) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v36, v3, v4, 0x2                    // v36 = (coutRowPtrD + coord0 + 1) << 2
v_cndmask_b32 v36, v42, v36, s[32:33]              // in bounds ? keep offset : BufferOOB
/* (d1,vc1,d0,vc0)=(0,55,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // v4 = coord0 + 2 (scratch)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 2) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v38, v3, v4, 0x2                    // v38 = (coutRowPtrD + coord0 + 2) << 2
v_cndmask_b32 v38, v42, v38, s[32:33]              // in bounds ? keep offset : BufferOOB
/* (d1,vc1,d0,vc0)=(0,55,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // v4 = coord0 + 3 (scratch)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 3) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v40, v3, v4, 0x2                    // v40 = (coutRowPtrD + coord0 + 3) << 2
v_cndmask_b32 v40, v42, v40, s[32:33]              // in bounds ? keep offset : BufferOOB
/* Copy this batch's 16 results out of the accumulator file: acc67..acc127 in steps of 4, */
/* one per store-data register, in the same order as the offsets computed above. */
v_accvgpr_read_b32 v[vgprValuC+11], acc67          // copy acc to vreg[208]
v_accvgpr_read_b32 v[vgprValuC+13], acc71          // copy acc to vreg[209]
v_accvgpr_read_b32 v[vgprValuC+15], acc75          // copy acc to vreg[210]
v_accvgpr_read_b32 v[vgprValuC+17], acc79          // copy acc to vreg[211]
v_accvgpr_read_b32 v[vgprValuC+19], acc83          // copy acc to vreg[212]
v_accvgpr_read_b32 v[vgprValuC+21], acc87          // copy acc to vreg[213]
v_accvgpr_read_b32 v[vgprValuC+23], acc91          // copy acc to vreg[214]
v_accvgpr_read_b32 v[vgprValuC+25], acc95          // copy acc to vreg[215]
v_accvgpr_read_b32 v[vgprValuC+27], acc99          // copy acc to vreg[216]
v_accvgpr_read_b32 v[vgprValuC+29], acc103         // copy acc to vreg[217]
v_accvgpr_read_b32 v[vgprValuC+31], acc107         // copy acc to vreg[218]
v_accvgpr_read_b32 v[vgprValuC+33], acc111         // copy acc to vreg[219]
v_accvgpr_read_b32 v[vgprValuC+35], acc115         // copy acc to vreg[220]
v_accvgpr_read_b32 v[vgprValuC+37], acc119         // copy acc to vreg[221]
v_accvgpr_read_b32 v[vgprValuC+39], acc123         // copy acc to vreg[222]
v_accvgpr_read_b32 v[vgprValuC+41], acc127         // copy acc to vreg[223]

/* rC *= alpha batchElements=[(0, 0, 52, 0), (0, 0, 52, 1), (0, 0, 52, 2), (0, 0, 52, 3), (0, 0, 53, 0), (0, 0, 53, 1), (0, 0, 53, 2), (0, 0, 53, 3), (0, 0, 54, 0), (0, 0, 54, 1), (0, 0, 54, 2), (0, 0, 54, 3), (0, 0, 55, 0), (0, 0, 55, 1), (0, 0, 55, 2), (0, 0, 55, 3)] */
/* Note: no scaling instructions follow the marker above in this batch; the values are stored as read. */

/* apply mask, calc new C and issue writes */
/* NOTE(review): v7, v8 and v9 are loaded with bfloat16 conversion constants but are not read by any */
/* instruction in this batch (the stores below write the 32-bit values unchanged) - confirm before removing. */
v_mov_b32 v7, 0xffff0000                           // mask for pack two bfloat16 element to 32bit
v_mov_b32 v8, 0x7fff0000                           // fp32 Nan
v_mov_b32 v9, 0x7fff                               // rounding bias for bfloat16
/* One dword store per element: data = odd VGPR, per-lane byte offset (offen) = preceding even VGPR. */
buffer_store_dword v11, v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v13, v12, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v15, v14, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v17, v16, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v19, v18, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v21, v20, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v23, v22, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v25, v24, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v27, v26, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v29, v28, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v31, v30, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v33, v32, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v35, v34, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v37, v36, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v39, v38, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v41, v40, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_nop 0                                            // 1 wait state before a following inst overwrites vgprs still held by the stores above (generator text said dwordx4; these stores are dword)
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #14 (d1,d0,vc1,vc0) = */
/*    (0,0,56,0:vw1); (0,0,56,1:vw1); (0,0,56,2:vw1); (0,0,56,3:vw1); (0,0,57,0:vw1); (0,0,57,1:vw1); (0,0,57,2:vw1); (0,0,57,3:vw1); (0,0,58,0:vw1); (0,0,58,1:vw1); (0,0,58,2:vw1); (0,0,58,3:vw1); (0,0,59,0:vw1); (0,0,59,1:vw1); (0,0,59,2:vw1); (0,0,59,3:vw1) */
/******************************************/
/* Batch overview: 16 elements = 4 rows (vc1 = 56..59) x 4 consecutive coord0 positions (vc0 = 0..3). */
/*   1. Per element, build a byte offset into D in an even VGPR (v10, v12, ..., v40): */
/*      offset = (v3 + coord0) << 2, replaced by BufferOOB (v42) when coord0 >= SizeI or coord1 >= SizeJ. */
/*   2. Copy 16 accumulators (acc131, acc135, ..., acc191; stride 4) into v[vgprValuC+11], +13, ..., +41. */
/*   3. Issue 16 buffer_store_dword through the SrdD descriptor (data = odd VGPR, offset = even VGPR). */
/* Register roles as used below: v0 = coord0 base, v1 = coord1 (incremented once per row), */
/* v2 = cinRowPtr, v3 = coutRowPtrD, v4 = scratch (coord0 + vc0), s[28:29] and s[32:33] = bounds masks. */
/* NOTE(review): the stores read literal v11..v41 while the accumulator copies write v[vgprValuC+11..41]; */
/* these match only if vgprValuC == 0 - assumed, confirm against the symbol definition earlier in the file. */

/* calc coords, apply mask, and issue loads (if necessary) */
/* No load instructions are emitted in this batch; only store offsets are computed. */
v_mov_b32 v42, BufferOOB
/* (d1,vc1,d0,vc0)=(0,56,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1: step to the next row (vcc carry-out is not read in this batch)

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // cinRowPtr (v2) += StrideC1J; v2 is not read in this batch
v_add_u32 v3, v3, s[sgprStrideD1J]                 // coutRowPtrD (v3) += StrideD1J
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // s[28:29] = coord0 < SizeI (unsigned)
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ (unsigned)
s_and_b64 s[32:33], s[28:29], s[32:33]             // s[32:33] = in bounds on both axes
v_add_lshl_u32 v10, v3, v0, 0x2                    // v10 = (coutRowPtrD + coord0) << 2: byte offset of the dword in D
v_cndmask_b32 v10, v42, v10, s[32:33]              // in bounds ? keep offset : BufferOOB
/* (d1,vc1,d0,vc0)=(0,56,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // v4 = coord0 + 1 (scratch)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 1) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ (recomputed; s[32:33] was overwritten above)
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v12, v3, v4, 0x2                    // v12 = (coutRowPtrD + coord0 + 1) << 2
v_cndmask_b32 v12, v42, v12, s[32:33]              // in bounds ? keep offset : BufferOOB
/* (d1,vc1,d0,vc0)=(0,56,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // v4 = coord0 + 2 (scratch)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 2) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v14, v3, v4, 0x2                    // v14 = (coutRowPtrD + coord0 + 2) << 2
v_cndmask_b32 v14, v42, v14, s[32:33]              // in bounds ? keep offset : BufferOOB
/* (d1,vc1,d0,vc0)=(0,56,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // v4 = coord0 + 3 (scratch)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 3) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v16, v3, v4, 0x2                    // v16 = (coutRowPtrD + coord0 + 3) << 2
v_cndmask_b32 v16, v42, v16, s[32:33]              // in bounds ? keep offset : BufferOOB
/* (d1,vc1,d0,vc0)=(0,57,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1: step to the next row

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // cinRowPtr (v2) += StrideC1J; v2 is not read in this batch
v_add_u32 v3, v3, s[sgprStrideD1J]                 // coutRowPtrD (v3) += StrideD1J
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // s[28:29] = coord0 < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v18, v3, v0, 0x2                    // v18 = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v18, v42, v18, s[32:33]              // in bounds ? keep offset : BufferOOB
/* (d1,vc1,d0,vc0)=(0,57,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // v4 = coord0 + 1 (scratch)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 1) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v20, v3, v4, 0x2                    // v20 = (coutRowPtrD + coord0 + 1) << 2
v_cndmask_b32 v20, v42, v20, s[32:33]              // in bounds ? keep offset : BufferOOB
/* (d1,vc1,d0,vc0)=(0,57,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // v4 = coord0 + 2 (scratch)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 2) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v22, v3, v4, 0x2                    // v22 = (coutRowPtrD + coord0 + 2) << 2
v_cndmask_b32 v22, v42, v22, s[32:33]              // in bounds ? keep offset : BufferOOB
/* (d1,vc1,d0,vc0)=(0,57,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // v4 = coord0 + 3 (scratch)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 3) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v24, v3, v4, 0x2                    // v24 = (coutRowPtrD + coord0 + 3) << 2
v_cndmask_b32 v24, v42, v24, s[32:33]              // in bounds ? keep offset : BufferOOB
/* (d1,vc1,d0,vc0)=(0,58,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1: step to the next row

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // cinRowPtr (v2) += StrideC1J; v2 is not read in this batch
v_add_u32 v3, v3, s[sgprStrideD1J]                 // coutRowPtrD (v3) += StrideD1J
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // s[28:29] = coord0 < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v26, v3, v0, 0x2                    // v26 = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v26, v42, v26, s[32:33]              // in bounds ? keep offset : BufferOOB
/* (d1,vc1,d0,vc0)=(0,58,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // v4 = coord0 + 1 (scratch)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 1) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v28, v3, v4, 0x2                    // v28 = (coutRowPtrD + coord0 + 1) << 2
v_cndmask_b32 v28, v42, v28, s[32:33]              // in bounds ? keep offset : BufferOOB
/* (d1,vc1,d0,vc0)=(0,58,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // v4 = coord0 + 2 (scratch)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 2) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v30, v3, v4, 0x2                    // v30 = (coutRowPtrD + coord0 + 2) << 2
v_cndmask_b32 v30, v42, v30, s[32:33]              // in bounds ? keep offset : BufferOOB
/* (d1,vc1,d0,vc0)=(0,58,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // v4 = coord0 + 3 (scratch)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 3) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v32, v3, v4, 0x2                    // v32 = (coutRowPtrD + coord0 + 3) << 2
v_cndmask_b32 v32, v42, v32, s[32:33]              // in bounds ? keep offset : BufferOOB
/* (d1,vc1,d0,vc0)=(0,59,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1 += 1: step to the next row

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // cinRowPtr (v2) += StrideC1J; v2 is not read in this batch
v_add_u32 v3, v3, s[sgprStrideD1J]                 // coutRowPtrD (v3) += StrideD1J
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // s[28:29] = coord0 < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v34, v3, v0, 0x2                    // v34 = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v34, v42, v34, s[32:33]              // in bounds ? keep offset : BufferOOB
/* (d1,vc1,d0,vc0)=(0,59,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // v4 = coord0 + 1 (scratch)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 1) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v36, v3, v4, 0x2                    // v36 = (coutRowPtrD + coord0 + 1) << 2
v_cndmask_b32 v36, v42, v36, s[32:33]              // in bounds ? keep offset : BufferOOB
/* (d1,vc1,d0,vc0)=(0,59,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // v4 = coord0 + 2 (scratch)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 2) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v38, v3, v4, 0x2                    // v38 = (coutRowPtrD + coord0 + 2) << 2
v_cndmask_b32 v38, v42, v38, s[32:33]              // in bounds ? keep offset : BufferOOB
/* (d1,vc1,d0,vc0)=(0,59,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // v4 = coord0 + 3 (scratch)
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // s[28:29] = (coord0 + 3) < SizeI
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // s[32:33] = coord1 < SizeJ
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v40, v3, v4, 0x2                    // v40 = (coutRowPtrD + coord0 + 3) << 2
v_cndmask_b32 v40, v42, v40, s[32:33]              // in bounds ? keep offset : BufferOOB
/* Copy this batch's 16 results out of the accumulator file: acc131..acc191 in steps of 4, */
/* one per store-data register, in the same order as the offsets computed above. */
v_accvgpr_read_b32 v[vgprValuC+11], acc131         // copy acc to vreg[224]
v_accvgpr_read_b32 v[vgprValuC+13], acc135         // copy acc to vreg[225]
v_accvgpr_read_b32 v[vgprValuC+15], acc139         // copy acc to vreg[226]
v_accvgpr_read_b32 v[vgprValuC+17], acc143         // copy acc to vreg[227]
v_accvgpr_read_b32 v[vgprValuC+19], acc147         // copy acc to vreg[228]
v_accvgpr_read_b32 v[vgprValuC+21], acc151         // copy acc to vreg[229]
v_accvgpr_read_b32 v[vgprValuC+23], acc155         // copy acc to vreg[230]
v_accvgpr_read_b32 v[vgprValuC+25], acc159         // copy acc to vreg[231]
v_accvgpr_read_b32 v[vgprValuC+27], acc163         // copy acc to vreg[232]
v_accvgpr_read_b32 v[vgprValuC+29], acc167         // copy acc to vreg[233]
v_accvgpr_read_b32 v[vgprValuC+31], acc171         // copy acc to vreg[234]
v_accvgpr_read_b32 v[vgprValuC+33], acc175         // copy acc to vreg[235]
v_accvgpr_read_b32 v[vgprValuC+35], acc179         // copy acc to vreg[236]
v_accvgpr_read_b32 v[vgprValuC+37], acc183         // copy acc to vreg[237]
v_accvgpr_read_b32 v[vgprValuC+39], acc187         // copy acc to vreg[238]
v_accvgpr_read_b32 v[vgprValuC+41], acc191         // copy acc to vreg[239]

/* rC *= alpha batchElements=[(0, 0, 56, 0), (0, 0, 56, 1), (0, 0, 56, 2), (0, 0, 56, 3), (0, 0, 57, 0), (0, 0, 57, 1), (0, 0, 57, 2), (0, 0, 57, 3), (0, 0, 58, 0), (0, 0, 58, 1), (0, 0, 58, 2), (0, 0, 58, 3), (0, 0, 59, 0), (0, 0, 59, 1), (0, 0, 59, 2), (0, 0, 59, 3)] */
/* Note: no scaling instructions follow the marker above in this batch; the values are stored as read. */

/* apply mask, calc new C and issue writes */
/* NOTE(review): v7, v8 and v9 are loaded with bfloat16 conversion constants but are not read by any */
/* instruction in this batch (the stores below write the 32-bit values unchanged) - confirm before removing. */
v_mov_b32 v7, 0xffff0000                           // mask for pack two bfloat16 element to 32bit
v_mov_b32 v8, 0x7fff0000                           // fp32 Nan
v_mov_b32 v9, 0x7fff                               // rounding bias for bfloat16
/* One dword store per element: data = odd VGPR, per-lane byte offset (offen) = preceding even VGPR. */
buffer_store_dword v11, v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v13, v12, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v15, v14, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v17, v16, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v19, v18, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v21, v20, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v23, v22, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v25, v24, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v27, v26, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v29, v28, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v31, v30, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v33, v32, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v35, v34, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v37, v36, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v39, v38, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v41, v40, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_nop 0                                            // 1 wait state before a following inst overwrites vgprs still held by the stores above (generator text said dwordx4; these stores are dword)
/* optSingleColVgpr=0 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Edge_Mask optSrdIncForRow=0 factorDim=0 */

/******************************************/
/* Global Write Edge Batch #15 (d1,d0,vc1,vc0) = */
/*    (0,0,60,0:vw1); (0,0,60,1:vw1); (0,0,60,2:vw1); (0,0,60,3:vw1); (0,0,61,0:vw1); (0,0,61,1:vw1); (0,0,61,2:vw1); (0,0,61,3:vw1); (0,0,62,0:vw1); (0,0,62,1:vw1); (0,0,62,2:vw1); (0,0,62,3:vw1); (0,0,63,0:vw1); (0,0,63,1:vw1); (0,0,63,2:vw1); (0,0,63,3:vw1) */
/******************************************/
/* Overview of this batch (rows vc1=60..63, four columns vc0=0..3 per row):   */
/*   1. For each row: bump coord1 (v1) by one and advance the row pointers    */
/*      v2/v3 by StrideC1J/StrideD1J.                                         */
/*   2. For each element: build the mask (coord0 < SizeI) && (coord1 < SizeJ) */
/*      in s[32:33], compute the byte offset (v3 + coord0) << 2 into an even  */
/*      VGPR v10..v40, and replace it with BufferOOB (v42) when the mask is   */
/*      clear.                                                                */
/*   3. Copy 16 accumulator values (acc195..acc255, step 4) into the odd      */
/*      VGPRs and store each as one dword through SrdD.                       */
/* s[28:29], s[32:33], vcc and v4 are scratch and are overwritten per element.*/
/* NOTE(review): the BufferOOB offset presumably falls outside the SrdD range */
/* so the store is discarded by the buffer bounds check - confirm where       */
/* BufferOOB and SrdD are defined.                                            */

/* calc coords, apply mask, and issue loads (if necessary) */
v_mov_b32 v42, BufferOOB                           // v42 = offset substituted for masked-off elements
/* (d1,vc1,d0,vc0)=(0,60,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row (v2 is not read elsewhere in this batch)
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v10, v3, v0, 0x2                    // store offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v10, v42, v10, s[32:33]              // keep offset where in-range, else BufferOOB
/* (d1,vc1,d0,vc0)=(0,60,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: v4 = coord0 + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v12, v3, v4, 0x2                    // store offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v12, v42, v12, s[32:33]              // keep offset where in-range, else BufferOOB
/* (d1,vc1,d0,vc0)=(0,60,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: v4 = coord0 + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v14, v3, v4, 0x2                    // store offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v14, v42, v14, s[32:33]              // keep offset where in-range, else BufferOOB
/* (d1,vc1,d0,vc0)=(0,60,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: v4 = coord0 + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v16, v3, v4, 0x2                    // store offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v16, v42, v16, s[32:33]              // keep offset where in-range, else BufferOOB
/* (d1,vc1,d0,vc0)=(0,61,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v18, v3, v0, 0x2                    // store offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v18, v42, v18, s[32:33]              // keep offset where in-range, else BufferOOB
/* (d1,vc1,d0,vc0)=(0,61,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: v4 = coord0 + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v20, v3, v4, 0x2                    // store offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v20, v42, v20, s[32:33]              // keep offset where in-range, else BufferOOB
/* (d1,vc1,d0,vc0)=(0,61,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: v4 = coord0 + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v22, v3, v4, 0x2                    // store offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v22, v42, v22, s[32:33]              // keep offset where in-range, else BufferOOB
/* (d1,vc1,d0,vc0)=(0,61,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: v4 = coord0 + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v24, v3, v4, 0x2                    // store offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v24, v42, v24, s[32:33]              // keep offset where in-range, else BufferOOB
/* (d1,vc1,d0,vc0)=(0,62,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v26, v3, v0, 0x2                    // store offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v26, v42, v26, s[32:33]              // keep offset where in-range, else BufferOOB
/* (d1,vc1,d0,vc0)=(0,62,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: v4 = coord0 + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v28, v3, v4, 0x2                    // store offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v28, v42, v28, s[32:33]              // keep offset where in-range, else BufferOOB
/* (d1,vc1,d0,vc0)=(0,62,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: v4 = coord0 + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v30, v3, v4, 0x2                    // store offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v30, v42, v30, s[32:33]              // keep offset where in-range, else BufferOOB
/* (d1,vc1,d0,vc0)=(0,62,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: v4 = coord0 + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v32, v3, v4, 0x2                    // store offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v32, v42, v32, s[32:33]              // keep offset where in-range, else BufferOOB
/* (d1,vc1,d0,vc0)=(0,63,0,0) */
v_add_co_u32 v1, vcc, v1, 1                        // coord1.1: coord1Vgpr += d1*sg1*VW + vc1

/* Fix for UseInitialStridesCD, emitAddressSetupCode */
v_add_u32 v2, v2, s[sgprStrideC1J]                 // ROWINC- Move cinRowPtr to next row
v_add_u32 v3, v3, s[sgprStrideD1J]                 // Move coutRowPtrD to next row
v_cmp_lt_u32 s[28:29], v0, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v34, v3, v0, 0x2                    // store offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v34, v42, v34, s[32:33]              // keep offset where in-range, else BufferOOB
/* (d1,vc1,d0,vc0)=(0,63,0,1) */
v_add_co_u32 v4, vcc, v0, 1                        // coord0.1: v4 = coord0 + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v36, v3, v4, 0x2                    // store offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v36, v42, v36, s[32:33]              // keep offset where in-range, else BufferOOB
/* (d1,vc1,d0,vc0)=(0,63,0,2) */
v_add_co_u32 v4, vcc, v0, 2                        // coord0.1: v4 = coord0 + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v38, v3, v4, 0x2                    // store offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v38, v42, v38, s[32:33]              // keep offset where in-range, else BufferOOB
/* (d1,vc1,d0,vc0)=(0,63,0,3) */
v_add_co_u32 v4, vcc, v0, 3                        // coord0.1: v4 = coord0 + vc0
v_cmp_lt_u32 s[28:29], v4, s[sgprSizeI]            // coord0 < size0
v_cmp_lt_u32 s[32:33], v1, s[sgprSizeJ]            // coord1 < size1
s_and_b64 s[32:33], s[28:29], s[32:33]             // in0 && in1
v_add_lshl_u32 v40, v3, v4, 0x2                    // store offset = (coutRowPtrD + coord0) << 2
v_cndmask_b32 v40, v42, v40, s[32:33]              // keep offset where in-range, else BufferOOB
/* Move the 16 results of this batch from accumulator registers into VGPRs.   */
/* NOTE(review): the stores below name v11..v41 directly; that matches these  */
/* destinations only if vgprValuC == 0 (defined outside this chunk) - confirm.*/
v_accvgpr_read_b32 v[vgprValuC+11], acc195         // copy acc to vreg[240]
v_accvgpr_read_b32 v[vgprValuC+13], acc199         // copy acc to vreg[241]
v_accvgpr_read_b32 v[vgprValuC+15], acc203         // copy acc to vreg[242]
v_accvgpr_read_b32 v[vgprValuC+17], acc207         // copy acc to vreg[243]
v_accvgpr_read_b32 v[vgprValuC+19], acc211         // copy acc to vreg[244]
v_accvgpr_read_b32 v[vgprValuC+21], acc215         // copy acc to vreg[245]
v_accvgpr_read_b32 v[vgprValuC+23], acc219         // copy acc to vreg[246]
v_accvgpr_read_b32 v[vgprValuC+25], acc223         // copy acc to vreg[247]
v_accvgpr_read_b32 v[vgprValuC+27], acc227         // copy acc to vreg[248]
v_accvgpr_read_b32 v[vgprValuC+29], acc231         // copy acc to vreg[249]
v_accvgpr_read_b32 v[vgprValuC+31], acc235         // copy acc to vreg[250]
v_accvgpr_read_b32 v[vgprValuC+33], acc239         // copy acc to vreg[251]
v_accvgpr_read_b32 v[vgprValuC+35], acc243         // copy acc to vreg[252]
v_accvgpr_read_b32 v[vgprValuC+37], acc247         // copy acc to vreg[253]
v_accvgpr_read_b32 v[vgprValuC+39], acc251         // copy acc to vreg[254]
v_accvgpr_read_b32 v[vgprValuC+41], acc255         // copy acc to vreg[255]

/* rC *= alpha batchElements=[(0, 0, 60, 0), (0, 0, 60, 1), (0, 0, 60, 2), (0, 0, 60, 3), (0, 0, 61, 0), (0, 0, 61, 1), (0, 0, 61, 2), (0, 0, 61, 3), (0, 0, 62, 0), (0, 0, 62, 1), (0, 0, 62, 2), (0, 0, 62, 3), (0, 0, 63, 0), (0, 0, 63, 1), (0, 0, 63, 2), (0, 0, 63, 3)] */
/* (no alpha multiply is emitted for this batch: the values are stored as read from the accumulators) */

/* apply mask, calc new C and issue writes */
/* The three constants below are loaded but v7/v8/v9 are not read by any later instruction in this batch. */
v_mov_b32 v7, 0xffff0000                           // mask for pack two bfloat16 element to 32bit
v_mov_b32 v8, 0x7fff0000                           // fp32 Nan
v_mov_b32 v9, 0x7fff                               // rounding bias for bfloat16
buffer_store_dword v11, v10, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v13, v12, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v15, v14, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v17, v16, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v19, v18, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v21, v20, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v23, v22, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v25, v24, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v27, v26, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v29, v28, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v31, v30, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v33, v32, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v35, v34, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v37, v36, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v39, v38, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
buffer_store_dword v41, v40, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D
s_nop 0                                            // one wait state after the stores (generator emits this for the dwordx4 store-data hazard; the stores here are single-dword)
s_branch label_GW_End                              // jump to end (label_GW_End is the very next instruction)
label_GW_End:
/* Long jump to label_KernelEnd: take the current PC, add a 32-bit offset     */
/* derived from the label, and write the 64-bit result back to the PC.        */
/* s[28:29] and s30 are used as scratch.                                      */
/* The high word only receives the carry from the low-word add, so this       */
/* sequence is valid for forward (non-negative) offsets only; presumably      */
/* label_KernelEnd lies later in the file - confirm.                          */
/* NOTE(review): the +0x4 presumably compensates for the assembler resolving  */
/* the label literal relative to the literal's own address (4 bytes past the  */
/* PC returned by s_getpc_b64) - confirm against the LLVM AMDGPU fixup rules. */
s_getpc_b64 s[28:29]                               // addr of next instr
s_add_i32 s30, label_KernelEnd, 0x4                // target branch offset
s_add_u32 s28, s28, s30                            // add target branch offset (low word, sets carry)
s_addc_u32 s29, s29, 0                             // add high and carry
s_setpc_b64 s[28:29]                               // branch to label_KernelEnd
label_GSU_5:
/* Build four buffer resource descriptors (SRDs) used by the vector loads     */
/* that follow: ScaleAlphaVec, ScaleA, ScaleB and Bias.                       */
/* For each one:                                                              */
/*   dword0/1 <- the 64-bit base address                                      */
/*   dword3   <- Srd127_96 (constant defined outside this chunk)              */
/*   dword2   <- 0 when the base address is null, otherwise an element count  */
/*               (SizeI for ScaleAlphaVec/ScaleA, SizeJ for ScaleB, s8 for    */
/*               Bias).                                                       */
/* The three scale SRDs then have dword2 multiplied by 4 here; the Bias SRD   */
/* is left unscaled in this block.                                            */
/* The .set lines bind temporary names to fixed SGPR indices for this region. */
.set sgprAddressScaleA, 48
.set sgprSrdScaleA, 28
.set sgprAddressScaleB, 50
.set sgprSrdScaleB, 32
.set sgprAddressScaleAlphaVec, 52
.set sgprSrdScaleAlphaVec, 40
/* --- ScaleAlphaVec SRD --- */
s_mov_b32 s[sgprSrdScaleAlphaVec+0], s[sgprAddressScaleAlphaVec+0] // init SRD base address (lower)
s_mov_b32 s[sgprSrdScaleAlphaVec+1], s[sgprAddressScaleAlphaVec+1] // init SRD base address (upper) + other fields
s_mov_b32 s[sgprSrdScaleAlphaVec+3], Srd127_96     // Set bits 127_96 in post-loop SRD
s_cmp_eq_u64 s[sgprAddressScaleAlphaVec:sgprAddressScaleAlphaVec+1], 0 // s[AddressScaleAlphaVec] == 0 ?
s_cbranch_scc0 label_ScaleAlphaVecAddrValid        // branch if s[AddressScaleAlphaVec] != 0
s_mov_b32 s[sgprSrdScaleAlphaVec+2], 0             // null pointer: zero-sized buffer
s_branch label_ScaleAlphaVecAddrValid_End
label_ScaleAlphaVecAddrValid:
s_mov_b32 s[sgprSrdScaleAlphaVec+2], s[sgprSizeI]  // valid pointer: SizeI elements
label_ScaleAlphaVecAddrValid_End:

s_mul_i32 s[sgprSrdScaleAlphaVec+2], 0x4, s[sgprSrdScaleAlphaVec+2] // ScaleAlphaVec scaled by BPE
/* --- ScaleA SRD --- */
s_mov_b32 s[sgprSrdScaleA+0], s[sgprAddressScaleA+0] // init SRD base address (lower)
s_mov_b32 s[sgprSrdScaleA+1], s[sgprAddressScaleA+1] // init SRD base address (upper) + other fields
s_mov_b32 s[sgprSrdScaleA+3], Srd127_96            // Set bits 127_96 in post-loop SRD
s_cmp_eq_u64 s[sgprAddressScaleA:sgprAddressScaleA+1], 0 // s[AddressScaleA] == 0 ?
s_cbranch_scc0 label_ScaleAVecAddrValid            // branch if s[AddressScaleA] != 0
s_mov_b32 s[sgprSrdScaleA+2], 0                    // null pointer: zero-sized buffer
s_branch label_ScaleAVecAddrValid_End
label_ScaleAVecAddrValid:
s_mov_b32 s[sgprSrdScaleA+2], s[sgprSizeI]         // valid pointer: SizeI elements
label_ScaleAVecAddrValid_End:

/* --- ScaleB SRD --- */
s_mov_b32 s[sgprSrdScaleB+0], s[sgprAddressScaleB+0] // init SRD base address (lower)
s_mov_b32 s[sgprSrdScaleB+1], s[sgprAddressScaleB+1] // init SRD base address (upper) + other fields
s_mov_b32 s[sgprSrdScaleB+3], Srd127_96            // Set bits 127_96 in post-loop SRD
s_cmp_eq_u64 s[sgprAddressScaleB:sgprAddressScaleB+1], 0 // s[AddressScaleB] == 0 ?
s_cbranch_scc0 label_ScaleBVecAddrValid            // branch if s[AddressScaleB] != 0
s_mov_b32 s[sgprSrdScaleB+2], 0                    // null pointer: zero-sized buffer
s_branch label_ScaleBVecAddrValid_End
label_ScaleBVecAddrValid:
s_mov_b32 s[sgprSrdScaleB+2], s[sgprSizeJ]         // valid pointer: SizeJ elements
label_ScaleBVecAddrValid_End:

s_mul_i32 s[sgprSrdScaleA+2], 0x4, s[sgprSrdScaleA+2] // ScaleAVec scaled by BPE
s_mul_i32 s[sgprSrdScaleB+2], 0x4, s[sgprSrdScaleB+2] // ScaleBVec scaled by BPE
/* --- Bias SRD: element count s8 = BiasStride * (WorkGroup2 + 1), or SizeI when that product is 0 --- */
s_add_u32 s8, s[sgprWorkGroup2], 0x1
s_mul_i32 s8, s[sgprBiasStride], s8                // stride * (wg+1)
s_cmp_eq_u32 s8, 0x0                               // stride * (wg+1) == 0 ?
s_cselect_b32 s8, s[sgprSizeI], s8                 // yes -> use SizeI, no -> keep the product
s_mov_b32 s[sgprSrdBias+0], s[sgprAddressBias+0]   // init SRD base address (lower)
s_mov_b32 s[sgprSrdBias+1], s[sgprAddressBias+1]   // init SRD base address (upper) + other fields
s_mov_b32 s[sgprSrdBias+3], Srd127_96              // Set bits 127_96 in post-loop SRD
s_cmp_eq_u64 s[sgprAddressBias:sgprAddressBias+1], 0 // s[AddressBias] == 0 ?
s_cbranch_scc0 label_BiasAddrValid                 // branch if s[AddressBias] != 0
s_mov_b32 s[sgprSrdBias+2], 0                      // null pointer: zero-sized buffer
s_branch label_BiasAddrValid_End
label_BiasAddrValid:
s_mov_b32 s[sgprSrdBias+2], s8                     // valid pointer: s8 elements (not yet multiplied by element size)
label_BiasAddrValid_End:

label_Load_Biasf32_0:
/* BiasType == 0 path: each work-item loads one 32-bit bias element and one   */
/* element each of ScaleAlphaVec, ScaleA and ScaleB, then writes them to LDS  */
/* at byte address (vgprSerial << 2) plus 0 / 1024 / 2048 / 3072.             */
/* Global indices: 256*WorkGroup0 + vgprSerial for bias (plus                 */
/* BiasStride*WorkGroup2), ScaleAlphaVec and ScaleA; 256*WorkGroup1 +         */
/* vgprSerial for ScaleB.                                                     */
/* Any scale vector whose SRD size (dword2) is 0 is replaced by 1.0.          */
/* Scratch: s68, v4..v12. The Address* SGPR pairs of the three scale vectors  */
/* are overwritten with lane masks; their values were already copied into     */
/* the SRDs at label_GSU_5.                                                   */
s_cmpk_lg_u32 s[sgprBiasType], 0                   // BiasType != 0
s_cbranch_scc1 label_Load_Biasbf16_0               // Branch if true

/******************************************/
/* Read vector to LDS                     */
/******************************************/
s_mul_i32 s68, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_add_u32 v12, s68, v[vgprSerial]                  // coord 0 = wgp0 * MT0 + thread offset
s_mul_i32 s[sgprSrdBias+2], 0x4, s[sgprSrdBias+2]  // scaled by BPE
s_mul_i32 s68, s[sgprBiasStride], s[sgprWorkGroup2] // Stride * WG
v_add_u32 v8, s68, v12                             // coord 0 = wgp0 * MT0 + thread offset + Stride * WG
v_lshlrev_b32 v8, 0x2, v8                          // Global bias address scaled by BPE
v_lshlrev_b32 v9, 0x2, v12                         // Global scaleAlpha address scaled by BPE
v_lshlrev_b32 v10, 0x2, v12                        // Global scaleA address scaled by BPE
s_mul_i32 s68, 256, s[sgprWorkGroup1]              // wgp1 * MT1
v_add_u32 v12, s68, v[vgprSerial]                  // coord 1 = wgp1 * MT1 + thread offset
v_lshlrev_b32 v11, 0x2, v12                        // Global scaleB address scaled by BPE
buffer_load_dword v4, v8, s[sgprSrdBias:sgprSrdBias+3], 0 offen offset:0 // Load Bias
buffer_load_dword v5, v9, s[sgprSrdScaleAlphaVec:sgprSrdScaleAlphaVec+3], 0 offen offset:0 // Load ScaleAlphaVec
buffer_load_dword v6, v10, s[sgprSrdScaleA:sgprSrdScaleA+3], 0 offen offset:0 // Load ScaleA
buffer_load_dword v7, v11, s[sgprSrdScaleB:sgprSrdScaleB+3], 0 offen offset:0 // Load ScaleB
v_lshlrev_b32 v12, 0x2, v[vgprSerial]              // Local address scaled by BPE
s_barrier                                          // workgroup barrier before the LDS writes below; the loads are waited on by the s_waitcnt lines, not by this barrier (presumably guards earlier LDS users - confirm)
s_waitcnt vmcnt(3)                                 // at most 3 vector-memory ops outstanding: oldest of the four loads (bias) is done
ds_write_b32 v12, v4 offset:0                      // store bias
v_cmp_gt_u32 s[sgprAddressScaleAlphaVec:sgprAddressScaleAlphaVec+1], s[sgprSrdScaleAlphaVec+2], 0 // mask = (ScaleAlphaVec SRD size > 0)
s_waitcnt vmcnt(2)                                 // wait for global load
v_cndmask_b32 v5, 1.0, v5, s[sgprAddressScaleAlphaVec:sgprAddressScaleAlphaVec+1] // keep loaded value where mask is set, else 1.0
ds_write_b32 v12, v5 offset:1024                   // store scaleAlpha
v_cmp_gt_u32 s[sgprAddressScaleA:sgprAddressScaleA+1], s[sgprSrdScaleA+2], 0 // mask = (ScaleA SRD size > 0)
s_waitcnt vmcnt(1)                                 // wait for global load
v_cndmask_b32 v6, 1.0, v6, s[sgprAddressScaleA:sgprAddressScaleA+1] // keep loaded value where mask is set, else 1.0
ds_write_b32 v12, v6 offset:2048                   // store scaleA
v_cmp_gt_u32 s[sgprAddressScaleB:sgprAddressScaleB+1], s[sgprSrdScaleB+2], 0 // mask = (ScaleB SRD size > 0)
s_waitcnt vmcnt(0)                                 // wait for global load
v_cndmask_b32 v7, 1.0, v7, s[sgprAddressScaleB:sgprAddressScaleB+1] // keep loaded value where mask is set, else 1.0
ds_write_b32 v12, v7 offset:3072                   // store scaleB
s_branch label_Load_Bias_End                       // Branch to load bias end
label_Load_Biasbf16_0:
/* BiasType == 7 path: same staging as the 32-bit path, except that the bias  */
/* element is 2 bytes wide. It is loaded with a 16-bit load, indexed with a   */
/* shift of 1, and then shifted left by 16 so the 16 loaded bits become the   */
/* upper half of a 32-bit value before being written to LDS.                  */
/* The three scale vectors are still loaded as 32-bit elements.               */
/* NOTE(review): for any BiasType other than 0 or 7 control jumps straight to */
/* label_Load_Bias_End, so neither the bias nor the three scale vectors are   */
/* written to LDS on that route. Presumably the host never selects another    */
/* BiasType for this kernel - confirm.                                        */
s_cmpk_lg_u32 s[sgprBiasType], 7                   // BiasType != 7
s_cbranch_scc1 label_Load_Bias_End                 // Branch if true

/******************************************/
/* Read vector to LDS                     */
/******************************************/
s_mul_i32 s68, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_add_u32 v12, s68, v[vgprSerial]                  // coord 0 = wgp0 * MT0 + thread offset
s_mul_i32 s[sgprSrdBias+2], 0x2, s[sgprSrdBias+2]  // scaled by BPE
s_mul_i32 s68, s[sgprBiasStride], s[sgprWorkGroup2] // Stride * WG
v_add_u32 v8, s68, v12                             // coord 0 = wgp0 * MT0 + thread offset + Stride * WG
v_lshlrev_b32 v8, 0x1, v8                          // Global bias address scaled by BPE
v_lshlrev_b32 v9, 0x2, v12                         // Global scaleAlpha address scaled by BPE
v_lshlrev_b32 v10, 0x2, v12                        // Global scaleA address scaled by BPE
s_mul_i32 s68, 256, s[sgprWorkGroup1]              // wgp1 * MT1
v_add_u32 v12, s68, v[vgprSerial]                  // coord 1 = wgp1 * MT1 + thread offset
v_lshlrev_b32 v11, 0x2, v12                        // Global scaleB address scaled by BPE
buffer_load_short_d16 v4, v8, s[sgprSrdBias:sgprSrdBias+3], 0 offen offset:0 // Load Bias
buffer_load_dword v5, v9, s[sgprSrdScaleAlphaVec:sgprSrdScaleAlphaVec+3], 0 offen offset:0 // Load ScaleAlphaVec
buffer_load_dword v6, v10, s[sgprSrdScaleA:sgprSrdScaleA+3], 0 offen offset:0 // Load ScaleA
buffer_load_dword v7, v11, s[sgprSrdScaleB:sgprSrdScaleB+3], 0 offen offset:0 // Load ScaleB
v_lshlrev_b32 v12, 0x2, v[vgprSerial]              // Local address scaled by BPE
s_barrier                                          // workgroup barrier before the LDS writes below; the loads are waited on by the s_waitcnt lines, not by this barrier (presumably guards earlier LDS users - confirm)
s_waitcnt vmcnt(3)                                 // at most 3 vector-memory ops outstanding: oldest of the four loads (bias) is done
v_lshlrev_b32 v4, 16, v4                           // cvt bf16 to fp32: move the loaded 16 bits into the upper half
ds_write_b32 v12, v4 offset:0                      // store bias
v_cmp_gt_u32 s[sgprAddressScaleAlphaVec:sgprAddressScaleAlphaVec+1], s[sgprSrdScaleAlphaVec+2], 0 // mask = (ScaleAlphaVec SRD size > 0)
s_waitcnt vmcnt(2)                                 // wait for global load
v_cndmask_b32 v5, 1.0, v5, s[sgprAddressScaleAlphaVec:sgprAddressScaleAlphaVec+1] // keep loaded value where mask is set, else 1.0
ds_write_b32 v12, v5 offset:1024                   // store scaleAlpha
v_cmp_gt_u32 s[sgprAddressScaleA:sgprAddressScaleA+1], s[sgprSrdScaleA+2], 0 // mask = (ScaleA SRD size > 0)
s_waitcnt vmcnt(1)                                 // wait for global load
v_cndmask_b32 v6, 1.0, v6, s[sgprAddressScaleA:sgprAddressScaleA+1] // keep loaded value where mask is set, else 1.0
ds_write_b32 v12, v6 offset:2048                   // store scaleA
v_cmp_gt_u32 s[sgprAddressScaleB:sgprAddressScaleB+1], s[sgprSrdScaleB+2], 0 // mask = (ScaleB SRD size > 0)
s_waitcnt vmcnt(0)                                 // wait for global load
v_cndmask_b32 v7, 1.0, v7, s[sgprAddressScaleB:sgprAddressScaleB+1] // keep loaded value where mask is set, else 1.0
ds_write_b32 v12, v7 offset:3072                   // store scaleB
s_branch label_Load_Bias_End                       // Branch to load bias end (the label is the very next line)
label_Load_Bias_End:
/* Select the global-write variant.                                           */
/*   1. Release the temporary SGPR names bound at label_GSU_5.                */
/*   2. Beta != 0            -> label_GW_Beta_1.                              */
/*   3. Beta == 0 and this workgroup is the last (or beyond) along dim 0 with */
/*      SizeI % 256 != 0     -> long jump to label_GW_B0_E1_M_1.              */
/*   4. Same test along dim 1 with SizeJ -> long jump to label_GW_B0_E1_N_1.  */
/*   5. Otherwise fall through to label_GW_B0_E0_1.                           */
/* Scratch: s30, s31, s32. The long jumps add only the carry to the high PC   */
/* word, so they are valid for forward targets only; presumably both edge     */
/* labels lie later in the file - confirm.                                    */
.set sgprAddressScaleA, UNDEF
.set sgprSrdScaleA, UNDEF
.set sgprAddressScaleB, UNDEF
.set sgprSrdScaleB, UNDEF
.set sgprAddressScaleAlphaVec, UNDEF
.set sgprSrdScaleAlphaVec, UNDEF
s_cmpk_eq_u32 s[sgprBeta], 0x0                     // Beta == 0 (compares the raw 32-bit value)
s_cbranch_scc0 label_GW_Beta_1                     // Branch if Beta is not zero

s_and_b32 s30, 255, s[sgprSizeI]                   // s30 = s[sgprSizeI] % 256
s_add_u32 s31, -0x1, s[sgprNumWorkGroups0]         // s31 = nwg0 - 1
s_cmp_ge_u32 s[sgprWorkGroup0], s31                // wg0 >= nwg0-1 ?
s_cselect_b32 s30, s30, 0                          // set rMT0 (remainder only for the last workgroup, else 0)
s_cmpk_gt_u32 s30, 0x0                             // rMT0 > 0
s_cbranch_scc0 label_NoBranch_L22R6YMJ6STQ4WFV_0   // Only branch on scc1
// jump if edges required
s_getpc_b64 s[30:31]                               // addr of next instr
s_add_i32 s32, label_GW_B0_E1_M_1, 0x4             // target branch offset
s_add_u32 s30, s30, s32                            // add target branch offset
s_addc_u32 s31, s31, 0                             // add high and carry
s_setpc_b64 s[30:31]                               // branch to label_GW_B0_E1_M_1
label_NoBranch_L22R6YMJ6STQ4WFV_0:
s_and_b32 s30, 255, s[sgprSizeJ]                   // s30 = s[sgprSizeJ] % 256
s_add_u32 s31, -0x1, s[sgprNumWorkGroups1]         // s31 = nwg1 - 1
s_cmp_ge_u32 s[sgprWorkGroup1], s31                // wg1 >= nwg1-1
s_cselect_b32 s30, s30, 0                          // set rMT1 (remainder only for the last workgroup, else 0)
s_cmpk_gt_u32 s30, 0x0                             // rMT1 > 0
s_cbranch_scc0 label_NoBranch_76TEE7GOZM6QIIAP_0   // Only branch on scc1
// jump if edges required
s_getpc_b64 s[30:31]                               // addr of next instr
s_add_i32 s32, label_GW_B0_E1_N_1, 0x4             // target branch offset
s_add_u32 s30, s30, s32                            // add target branch offset
s_addc_u32 s31, s31, 0                             // add high and carry
s_setpc_b64 s[30:31]                               // branch to label_GW_B0_E1_N_1
label_NoBranch_76TEE7GOZM6QIIAP_0:
label_GW_B0_E0_1:
/* Beta == 0, no-edge path: choose the activation routine.                    */
/* ActivationType 1..7, 9 and 10 each select a matching label_Activation_*    */
/* routine; every other value (including 0 and 8) falls through to the        */
/* "None" routine.                                                            */
/* Each stub computes the 64-bit address of its routine into s[12:13] (s8 is  */
/* scratch) and then joins label_ActivationSetPCAddrEnd_5. Nothing in this    */
/* block jumps to s[12:13]; presumably a later s_setpc/s_swappc uses it -     */
/* confirm.                                                                   */
/* The address add propagates only the carry into the high word, so the       */
/* routines are presumably located later in the file than these stubs.        */
s_cmpk_eq_u32 s[sgprActivationType], 1             // activationType == 1
s_cbranch_scc1 label_To_Activation_Abs_VW4_beta_0_edge_0 // Branch if true
s_cmpk_eq_u32 s[sgprActivationType], 2             // activationType == 2
s_cbranch_scc1 label_To_Activation_Clippedrelu_VW4_beta_0_edge_0 // Branch if true
s_cmpk_eq_u32 s[sgprActivationType], 3             // activationType == 3
s_cbranch_scc1 label_To_Activation_Gelu_VW4_beta_0_edge_0 // Branch if true
s_cmpk_eq_u32 s[sgprActivationType], 4             // activationType == 4
s_cbranch_scc1 label_To_Activation_Leakyrelu_VW4_beta_0_edge_0 // Branch if true
s_cmpk_eq_u32 s[sgprActivationType], 5             // activationType == 5
s_cbranch_scc1 label_To_Activation_Relu_VW4_beta_0_edge_0 // Branch if true
s_cmpk_eq_u32 s[sgprActivationType], 6             // activationType == 6
s_cbranch_scc1 label_To_Activation_Sigmoid_VW4_beta_0_edge_0 // Branch if true
s_cmpk_eq_u32 s[sgprActivationType], 7             // activationType == 7
s_cbranch_scc1 label_To_Activation_Tanh_VW4_beta_0_edge_0 // Branch if true
s_cmpk_eq_u32 s[sgprActivationType], 9             // activationType == 9
s_cbranch_scc1 label_To_Activation_Geluscaling_VW4_beta_0_edge_0 // Branch if true
s_cmpk_eq_u32 s[sgprActivationType], 10            // activationType == 10
s_cbranch_scc1 label_To_Activation_Silu_VW4_beta_0_edge_0 // Branch if true
label_To_Activation_None_VW4_beta_0_edge_0:
s_getpc_b64 s[12:13]                               // addr of next instr
s_add_i32 s8, label_Activation_None_VW4, 0x4       // target branch offset
s_add_u32 s12, s12, s8                             // add target branch offset
s_addc_u32 s13, s13, 0                             // add high and carry
s_branch label_ActivationSetPCAddrEnd_5
label_To_Activation_Abs_VW4_beta_0_edge_0:
s_getpc_b64 s[12:13]                               // addr of next instr
s_add_i32 s8, label_Activation_Abs_VW4, 0x4        // target branch offset
s_add_u32 s12, s12, s8                             // add target branch offset
s_addc_u32 s13, s13, 0                             // add high and carry
s_branch label_ActivationSetPCAddrEnd_5
label_To_Activation_Clippedrelu_VW4_beta_0_edge_0:
s_getpc_b64 s[12:13]                               // addr of next instr
s_add_i32 s8, label_Activation_Clippedrelu_VW4, 0x4 // target branch offset
s_add_u32 s12, s12, s8                             // add target branch offset
s_addc_u32 s13, s13, 0                             // add high and carry
s_branch label_ActivationSetPCAddrEnd_5
label_To_Activation_Gelu_VW4_beta_0_edge_0:
s_getpc_b64 s[12:13]                               // addr of next instr
s_add_i32 s8, label_Activation_Gelu_VW4, 0x4       // target branch offset
s_add_u32 s12, s12, s8                             // add target branch offset
s_addc_u32 s13, s13, 0                             // add high and carry
s_branch label_ActivationSetPCAddrEnd_5
label_To_Activation_Leakyrelu_VW4_beta_0_edge_0:
s_getpc_b64 s[12:13]                               // addr of next instr
s_add_i32 s8, label_Activation_Leakyrelu_VW4, 0x4  // target branch offset
s_add_u32 s12, s12, s8                             // add target branch offset
s_addc_u32 s13, s13, 0                             // add high and carry
s_branch label_ActivationSetPCAddrEnd_5
label_To_Activation_Relu_VW4_beta_0_edge_0:
s_getpc_b64 s[12:13]                               // addr of next instr
s_add_i32 s8, label_Activation_Relu_VW4, 0x4       // target branch offset
s_add_u32 s12, s12, s8                             // add target branch offset
s_addc_u32 s13, s13, 0                             // add high and carry
s_branch label_ActivationSetPCAddrEnd_5
label_To_Activation_Sigmoid_VW4_beta_0_edge_0:
s_getpc_b64 s[12:13]                               // addr of next instr
s_add_i32 s8, label_Activation_Sigmoid_VW4, 0x4    // target branch offset
s_add_u32 s12, s12, s8                             // add target branch offset
s_addc_u32 s13, s13, 0                             // add high and carry
s_branch label_ActivationSetPCAddrEnd_5
label_To_Activation_Tanh_VW4_beta_0_edge_0:
s_getpc_b64 s[12:13]                               // addr of next instr
s_add_i32 s8, label_Activation_Tanh_VW4, 0x4       // target branch offset
s_add_u32 s12, s12, s8                             // add target branch offset
s_addc_u32 s13, s13, 0                             // add high and carry
s_branch label_ActivationSetPCAddrEnd_5
label_To_Activation_Geluscaling_VW4_beta_0_edge_0:
s_getpc_b64 s[12:13]                               // addr of next instr
s_add_i32 s8, label_Activation_Geluscaling_VW4, 0x4 // target branch offset
s_add_u32 s12, s12, s8                             // add target branch offset
s_addc_u32 s13, s13, 0                             // add high and carry
s_branch label_ActivationSetPCAddrEnd_5
label_To_Activation_Silu_VW4_beta_0_edge_0:
s_getpc_b64 s[12:13]                               // addr of next instr
s_add_i32 s8, label_Activation_Silu_VW4, 0x4       // target branch offset
s_add_u32 s12, s12, s8                             // add target branch offset
s_addc_u32 s13, s13, 0                             // add high and carry
s_branch label_ActivationSetPCAddrEnd_5            // target is the very next line
label_ActivationSetPCAddrEnd_5:

/* edge=0, allocate 2 sgpr. perBatchTmpS=2 perBatchMaskS=0 perElementMaskS=0 elementsPerBatch=12 */
/* optSingleColVgpr=1 optSharedColVgpr=0 optSGPRUsage=BufferLoad_Mask optSrdIncForRow=1 factorDim=0 */

/******************************************/
/* Global Write Batch #0 (d1,d0,vc1,vc0) = */
/*    (0,0,0,0:vw4); (0,0,1,0:vw4); (0,0,2,0:vw4); (0,0,3,0:vw4); (0,0,4,0:vw4); (0,0,5,0:vw4); (0,0,6,0:vw4); (0,0,7,0:vw4); (0,0,8,0:vw4); (0,0,9,0:vw4); (0,0,10,0:vw4); (0,0,11,0:vw4) */
/******************************************/

/* calc coords, apply mask, and issue loads (if necessary) */
/* Issues 15 LDS reads, in this order:
 *   1.     Bias          ds_read_b128 -> v[20:23]   (LDS byte offset    0 + v15)
 *   2.     ScaleAlphaVec ds_read_b128 -> v[32:35]   (LDS byte offset 1024 + v15)
 *   3.     ScaleAVec     ds_read_b128 -> v[24:27]   (LDS byte offset 2048 + v15)
 *   4..15. ScaleBVec     ds_read_b32  -> v28, v30, v44, v46, v56, v58,
 *                                        v68, v70, v80, v82, v92, v94
 * The s_waitcnt lgkmcnt(N) values later in this batch are computed against
 * exactly this issue order, so the reads must not be reordered.
 * Each ScaleBVec value lands in an even register; the following odd
 * register is filled by a v_mov_b32 right before the packed multiply.
 * Also produces v13, the byte offset used by the D stores of this batch. */
/* (d1,vc1,d0,vc0)=(0,0,0,0) */
s_mul_i32 s30, 256, s[sgprWorkGroup0]              // wgp0 * MT0
v_sub_u32 v15, v0, s30                             // v15 = v0 - wgp0*MT0 (v0 is coord0 per the v13 computation below)
v_lshlrev_b32 v15, 0x2, v15                        // Bias address scaled by BPE
s_waitcnt lgkmcnt(0)                               // Wait for LDS write
s_barrier                                          // LDS write barrier
ds_read_b128 v[20:23], v15 offset:0                // load Bias
v_add_u32 v18, 1024, v15                           // add ScaleAlphaVec offset (1)
ds_read_b128 v[32:35], v18 offset:0                // load scaleAlpha
v_add_u32 v16, 2048, v15                           // add ScaleAVec offset
ds_read_b128 v[24:27], v16 offset:0                // load scaleA
s_mul_i32 s30, 256, s[sgprWorkGroup1]              // wgp1 * MT1
v_sub_u32 v17, v1, s30                             // v17 = v1 - wgp1*MT1; NOTE(review): v1 presumably holds coord1 - confirm
v_lshlrev_b32 v17, 0x2, v17                        // ScaleBVec address scaled by BPE
v_add_u32 v17, 3072, v17                           // add ScaleBVec lds offset
ds_read_b32 v28, v17 offset:0                      // load scaleB
/* (d1,vc1,d0,vc0)=(0,1,0,0) */
ds_read_b32 v30, v17 offset:4                      // load scaleB
/* (d1,vc1,d0,vc0)=(0,2,0,0) */
ds_read_b32 v44, v17 offset:8                      // load scaleB
/* (d1,vc1,d0,vc0)=(0,3,0,0) */
ds_read_b32 v46, v17 offset:12                     // load scaleB
/* (d1,vc1,d0,vc0)=(0,4,0,0) */
ds_read_b32 v56, v17 offset:16                     // load scaleB
/* (d1,vc1,d0,vc0)=(0,5,0,0) */
ds_read_b32 v58, v17 offset:20                     // load scaleB
/* (d1,vc1,d0,vc0)=(0,6,0,0) */
ds_read_b32 v68, v17 offset:24                     // load scaleB
/* (d1,vc1,d0,vc0)=(0,7,0,0) */
ds_read_b32 v70, v17 offset:28                     // load scaleB
/* (d1,vc1,d0,vc0)=(0,8,0,0) */
ds_read_b32 v80, v17 offset:32                     // load scaleB
/* (d1,vc1,d0,vc0)=(0,9,0,0) */
ds_read_b32 v82, v17 offset:36                     // load scaleB
/* (d1,vc1,d0,vc0)=(0,10,0,0) */
ds_read_b32 v92, v17 offset:40                     // load scaleB
/* (d1,vc1,d0,vc0)=(0,11,0,0) */
ds_read_b32 v94, v17 offset:44                     // load scaleB
v_add_lshl_u32 v13, v3, v0, 0x1                    // optSingleColVgpr: v13 = (v3 + v0) << 1, i.e. cinRowPtr + coord0 scaled to bytes (2 bytes per element); coord0 = 0, coord0Vgpr = v0
/* Copy 48 accumulator values (acc0, acc4, ..., acc188: every 4th AccVGPR)
 * into ordinary VGPRs for the epilogue arithmetic. They land in six groups
 * of eight: vgprValuC+36..43, +48..55, +60..67, +72..79, +84..91, +96..103.
 * Each group of eight holds two vw4 elements of this batch (e.g. +36..39
 * and +40..43). The four-register gaps between groups (44..47, 56..59,
 * 68..71, 80..83, 92..95) are not written here; their even registers are
 * the ScaleBVec load destinations issued earlier in this batch.
 * NOTE(review): that overlap assumes vgprValuC == 0, which the mixed use of
 * v36 and v[vgprValuC+36] in this batch suggests - confirm. */
v_accvgpr_read_b32 v[vgprValuC+36], acc0           // copy acc to vreg[0]
v_accvgpr_read_b32 v[vgprValuC+37], acc4           // copy acc to vreg[1]
v_accvgpr_read_b32 v[vgprValuC+38], acc8           // copy acc to vreg[2]
v_accvgpr_read_b32 v[vgprValuC+39], acc12          // copy acc to vreg[3]
v_accvgpr_read_b32 v[vgprValuC+40], acc16          // copy acc to vreg[4]
v_accvgpr_read_b32 v[vgprValuC+41], acc20          // copy acc to vreg[5]
v_accvgpr_read_b32 v[vgprValuC+42], acc24          // copy acc to vreg[6]
v_accvgpr_read_b32 v[vgprValuC+43], acc28          // copy acc to vreg[7]
v_accvgpr_read_b32 v[vgprValuC+48], acc32          // copy acc to vreg[8]
v_accvgpr_read_b32 v[vgprValuC+49], acc36          // copy acc to vreg[9]
v_accvgpr_read_b32 v[vgprValuC+50], acc40          // copy acc to vreg[10]
v_accvgpr_read_b32 v[vgprValuC+51], acc44          // copy acc to vreg[11]
v_accvgpr_read_b32 v[vgprValuC+52], acc48          // copy acc to vreg[12]
v_accvgpr_read_b32 v[vgprValuC+53], acc52          // copy acc to vreg[13]
v_accvgpr_read_b32 v[vgprValuC+54], acc56          // copy acc to vreg[14]
v_accvgpr_read_b32 v[vgprValuC+55], acc60          // copy acc to vreg[15]
v_accvgpr_read_b32 v[vgprValuC+60], acc64          // copy acc to vreg[16]
v_accvgpr_read_b32 v[vgprValuC+61], acc68          // copy acc to vreg[17]
v_accvgpr_read_b32 v[vgprValuC+62], acc72          // copy acc to vreg[18]
v_accvgpr_read_b32 v[vgprValuC+63], acc76          // copy acc to vreg[19]
v_accvgpr_read_b32 v[vgprValuC+64], acc80          // copy acc to vreg[20]
v_accvgpr_read_b32 v[vgprValuC+65], acc84          // copy acc to vreg[21]
v_accvgpr_read_b32 v[vgprValuC+66], acc88          // copy acc to vreg[22]
v_accvgpr_read_b32 v[vgprValuC+67], acc92          // copy acc to vreg[23]
v_accvgpr_read_b32 v[vgprValuC+72], acc96          // copy acc to vreg[24]
v_accvgpr_read_b32 v[vgprValuC+73], acc100         // copy acc to vreg[25]
v_accvgpr_read_b32 v[vgprValuC+74], acc104         // copy acc to vreg[26]
v_accvgpr_read_b32 v[vgprValuC+75], acc108         // copy acc to vreg[27]
v_accvgpr_read_b32 v[vgprValuC+76], acc112         // copy acc to vreg[28]
v_accvgpr_read_b32 v[vgprValuC+77], acc116         // copy acc to vreg[29]
v_accvgpr_read_b32 v[vgprValuC+78], acc120         // copy acc to vreg[30]
v_accvgpr_read_b32 v[vgprValuC+79], acc124         // copy acc to vreg[31]
v_accvgpr_read_b32 v[vgprValuC+84], acc128         // copy acc to vreg[32]
v_accvgpr_read_b32 v[vgprValuC+85], acc132         // copy acc to vreg[33]
v_accvgpr_read_b32 v[vgprValuC+86], acc136         // copy acc to vreg[34]
v_accvgpr_read_b32 v[vgprValuC+87], acc140         // copy acc to vreg[35]
v_accvgpr_read_b32 v[vgprValuC+88], acc144         // copy acc to vreg[36]
v_accvgpr_read_b32 v[vgprValuC+89], acc148         // copy acc to vreg[37]
v_accvgpr_read_b32 v[vgprValuC+90], acc152         // copy acc to vreg[38]
v_accvgpr_read_b32 v[vgprValuC+91], acc156         // copy acc to vreg[39]
v_accvgpr_read_b32 v[vgprValuC+96], acc160         // copy acc to vreg[40]
v_accvgpr_read_b32 v[vgprValuC+97], acc164         // copy acc to vreg[41]
v_accvgpr_read_b32 v[vgprValuC+98], acc168         // copy acc to vreg[42]
v_accvgpr_read_b32 v[vgprValuC+99], acc172         // copy acc to vreg[43]
v_accvgpr_read_b32 v[vgprValuC+100], acc176        // copy acc to vreg[44]
v_accvgpr_read_b32 v[vgprValuC+101], acc180        // copy acc to vreg[45]
v_accvgpr_read_b32 v[vgprValuC+102], acc184        // copy acc to vreg[46]
v_accvgpr_read_b32 v[vgprValuC+103], acc188        // copy acc to vreg[47]

/* rC *= alpha batchElements=[(0, 0, 0, 0), (0, 0, 1, 0), (0, 0, 2, 0), (0, 0, 3, 0), (0, 0, 4, 0), (0, 0, 5, 0), (0, 0, 6, 0), (0, 0, 7, 0), (0, 0, 8, 0), (0, 0, 9, 0), (0, 0, 10, 0), (0, 0, 11, 0)] */
/* Multiplies, in place, each of the 48 fp32 values of this batch
 * (vgprValuC+36..43, +48..55, +60..67, +72..79, +84..91, +96..103) by the
 * scalar s[sgprAlpha]. No LDS data is needed here, so this runs before the
 * first s_waitcnt lgkmcnt of the batch. */
v_mul_f32 v[vgprValuC+36], s[sgprAlpha], v[vgprValuC+36] // *= alpha
v_mul_f32 v[vgprValuC+37], s[sgprAlpha], v[vgprValuC+37] // *= alpha
v_mul_f32 v[vgprValuC+38], s[sgprAlpha], v[vgprValuC+38] // *= alpha
v_mul_f32 v[vgprValuC+39], s[sgprAlpha], v[vgprValuC+39] // *= alpha
v_mul_f32 v[vgprValuC+40], s[sgprAlpha], v[vgprValuC+40] // *= alpha
v_mul_f32 v[vgprValuC+41], s[sgprAlpha], v[vgprValuC+41] // *= alpha
v_mul_f32 v[vgprValuC+42], s[sgprAlpha], v[vgprValuC+42] // *= alpha
v_mul_f32 v[vgprValuC+43], s[sgprAlpha], v[vgprValuC+43] // *= alpha
v_mul_f32 v[vgprValuC+48], s[sgprAlpha], v[vgprValuC+48] // *= alpha
v_mul_f32 v[vgprValuC+49], s[sgprAlpha], v[vgprValuC+49] // *= alpha
v_mul_f32 v[vgprValuC+50], s[sgprAlpha], v[vgprValuC+50] // *= alpha
v_mul_f32 v[vgprValuC+51], s[sgprAlpha], v[vgprValuC+51] // *= alpha
v_mul_f32 v[vgprValuC+52], s[sgprAlpha], v[vgprValuC+52] // *= alpha
v_mul_f32 v[vgprValuC+53], s[sgprAlpha], v[vgprValuC+53] // *= alpha
v_mul_f32 v[vgprValuC+54], s[sgprAlpha], v[vgprValuC+54] // *= alpha
v_mul_f32 v[vgprValuC+55], s[sgprAlpha], v[vgprValuC+55] // *= alpha
v_mul_f32 v[vgprValuC+60], s[sgprAlpha], v[vgprValuC+60] // *= alpha
v_mul_f32 v[vgprValuC+61], s[sgprAlpha], v[vgprValuC+61] // *= alpha
v_mul_f32 v[vgprValuC+62], s[sgprAlpha], v[vgprValuC+62] // *= alpha
v_mul_f32 v[vgprValuC+63], s[sgprAlpha], v[vgprValuC+63] // *= alpha
v_mul_f32 v[vgprValuC+64], s[sgprAlpha], v[vgprValuC+64] // *= alpha
v_mul_f32 v[vgprValuC+65], s[sgprAlpha], v[vgprValuC+65] // *= alpha
v_mul_f32 v[vgprValuC+66], s[sgprAlpha], v[vgprValuC+66] // *= alpha
v_mul_f32 v[vgprValuC+67], s[sgprAlpha], v[vgprValuC+67] // *= alpha
v_mul_f32 v[vgprValuC+72], s[sgprAlpha], v[vgprValuC+72] // *= alpha
v_mul_f32 v[vgprValuC+73], s[sgprAlpha], v[vgprValuC+73] // *= alpha
v_mul_f32 v[vgprValuC+74], s[sgprAlpha], v[vgprValuC+74] // *= alpha
v_mul_f32 v[vgprValuC+75], s[sgprAlpha], v[vgprValuC+75] // *= alpha
v_mul_f32 v[vgprValuC+76], s[sgprAlpha], v[vgprValuC+76] // *= alpha
v_mul_f32 v[vgprValuC+77], s[sgprAlpha], v[vgprValuC+77] // *= alpha
v_mul_f32 v[vgprValuC+78], s[sgprAlpha], v[vgprValuC+78] // *= alpha
v_mul_f32 v[vgprValuC+79], s[sgprAlpha], v[vgprValuC+79] // *= alpha
v_mul_f32 v[vgprValuC+84], s[sgprAlpha], v[vgprValuC+84] // *= alpha
v_mul_f32 v[vgprValuC+85], s[sgprAlpha], v[vgprValuC+85] // *= alpha
v_mul_f32 v[vgprValuC+86], s[sgprAlpha], v[vgprValuC+86] // *= alpha
v_mul_f32 v[vgprValuC+87], s[sgprAlpha], v[vgprValuC+87] // *= alpha
v_mul_f32 v[vgprValuC+88], s[sgprAlpha], v[vgprValuC+88] // *= alpha
v_mul_f32 v[vgprValuC+89], s[sgprAlpha], v[vgprValuC+89] // *= alpha
v_mul_f32 v[vgprValuC+90], s[sgprAlpha], v[vgprValuC+90] // *= alpha
v_mul_f32 v[vgprValuC+91], s[sgprAlpha], v[vgprValuC+91] // *= alpha
v_mul_f32 v[vgprValuC+96], s[sgprAlpha], v[vgprValuC+96] // *= alpha
v_mul_f32 v[vgprValuC+97], s[sgprAlpha], v[vgprValuC+97] // *= alpha
v_mul_f32 v[vgprValuC+98], s[sgprAlpha], v[vgprValuC+98] // *= alpha
v_mul_f32 v[vgprValuC+99], s[sgprAlpha], v[vgprValuC+99] // *= alpha
v_mul_f32 v[vgprValuC+100], s[sgprAlpha], v[vgprValuC+100] // *= alpha
v_mul_f32 v[vgprValuC+101], s[sgprAlpha], v[vgprValuC+101] // *= alpha
v_mul_f32 v[vgprValuC+102], s[sgprAlpha], v[vgprValuC+102] // *= alpha
v_mul_f32 v[vgprValuC+103], s[sgprAlpha], v[vgprValuC+103] // *= alpha

/* apply mask, calc new C and issue writes */
/* v10, v11 and v12 are the constants of the fp32 -> bf16 conversion below;
 * they are set once here and only read by the conversion code. */
v_mov_b32 v10, 0xffff0000                          // mask for pack two bfloat16 element to 32bit
v_mov_b32 v11, 0x7fff0000                          // fp32 Nan
v_mov_b32 v12, 0x7fff                              // rounding bias for bfloat16

/* Element (0,0,0,0): four fp32 values in v[vgprValuC+36 .. vgprValuC+39].
 *   1. multiply by ScaleAVec v[24:27], by ScaleBVec v28 (broadcast to all
 *      four values via the v28/v29 pair) and by ScaleAlphaVec v[32:35]
 *   2. add Bias v[20:23]; the sums go to v[4:7]
 *   3. call the activation routine whose address is in s[12:13]
 *   4. copy v[4:7] back, convert each value to bf16 (round to nearest even,
 *      NaN replaced by the constant in v11), pack pairs into v36/v37
 *   5. store the 8 packed bytes to D at byte offset v13
 * s[30:31] and v9 are scratch. The wait below leaves at most 11 of the 15
 * LDS reads outstanding, i.e. the first four issued reads have returned. */
s_waitcnt lgkmcnt(11)                              // lgkmcnt(11) = 15 - 1 (bias) - 1 (scaleAVec) - 1 (scaleBVec) - 1 (scaleAlphaVec) (interleaved)
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], v[24:25], v[vgprValuC+36:vgprValuC+36+1] // *= ScaleAVMulPK(24)(0)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], v[26:27], v[vgprValuC+38:vgprValuC+38+1] // *= ScaleAVMulPK(24)(2)
v_mov_b32 v29, v28                                 // copy dataScaleB to dataScaleB+1
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], v[28:29], v[vgprValuC+36:vgprValuC+36+1] // *= ScaleBVMulPK(28)(0)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], v[28:29], v[vgprValuC+38:vgprValuC+38+1] // *= ScaleBVMulPK(28)(2)
v_pk_mul_f32 v[vgprValuC+36:vgprValuC+36+1], v[32:33], v[vgprValuC+36:vgprValuC+36+1] // *= ScaleAlphaVecVMulPK(32)(0)
v_pk_mul_f32 v[vgprValuC+38:vgprValuC+38+1], v[34:35], v[vgprValuC+38:vgprValuC+38+1] // *= ScaleAlphaVecVMulPK(32)(2)
v_pk_add_f32 v[4:5], v[20:21], v[vgprValuC+36:vgprValuC+36+1] // C += bias
v_pk_add_f32 v[6:7], v[22:23], v[vgprValuC+38:vgprValuC+38+1] // C += bias
s_swappc_b64 s[28:29], s[12:13]                    // jump to the activation routine, return address saved in s[28:29]; NOTE(review): routine not visible here, presumably updates v[4:7] in place and returns through s[28:29] - confirm
v_mov_b32 v36, v4                                  // copy the four results back into the element's registers
v_mov_b32 v37, v5
v_mov_b32 v38, v6
v_mov_b32 v39, v7
v_cmp_u_f32 s[30:31], v[vgprValuC+36], v[vgprValuC+36] // check Nan
v_bfe_u32 v9, v[vgprValuC+36], 16, 1               // Non-Nan case: store lsb of bf16
v_add3_u32 v9, v[vgprValuC+36], v9, v12            // Non-Nan case: add lsb and the increment for rounding
v_cndmask_b32 v[vgprValuC+36], v9, v11, s[30:31]   // NaN lanes take v11, all other lanes take the rounded value
v_lshrrev_b32 v[vgprValuC+36], 16, v[vgprValuC+36] // convert C to bf16
v_cmp_u_f32 s[30:31], v[vgprValuC+37], v[vgprValuC+37] // check Nan
v_bfe_u32 v9, v[vgprValuC+37], 16, 1               // Non-Nan case: store lsb of bf16
v_add3_u32 v9, v[vgprValuC+37], v9, v12            // Non-Nan case: add lsb and the increment for rounding
v_cndmask_b32 v[vgprValuC+37], v9, v11, s[30:31]   // NaN lanes take v11; no shift here, the bf16 stays in the upper 16 bits for packing
v_and_or_b32 v36, v[vgprValuC+37], v10, v[vgprValuC+36] // pack two bf16 to dword
v_cmp_u_f32 s[30:31], v[vgprValuC+38], v[vgprValuC+38] // check Nan
v_bfe_u32 v9, v[vgprValuC+38], 16, 1               // Non-Nan case: store lsb of bf16
v_add3_u32 v9, v[vgprValuC+38], v9, v12            // Non-Nan case: add lsb and the increment for rounding
v_cndmask_b32 v[vgprValuC+38], v9, v11, s[30:31]   // NaN lanes take v11, all other lanes take the rounded value
v_lshrrev_b32 v[vgprValuC+38], 16, v[vgprValuC+38] // convert C to bf16
v_cmp_u_f32 s[30:31], v[vgprValuC+39], v[vgprValuC+39] // check Nan
v_bfe_u32 v9, v[vgprValuC+39], 16, 1               // Non-Nan case: store lsb of bf16
v_add3_u32 v9, v[vgprValuC+39], v9, v12            // Non-Nan case: add lsb and the increment for rounding
v_cndmask_b32 v[vgprValuC+39], v9, v11, s[30:31]   // NaN lanes take v11; no shift here, the bf16 stays in the upper 16 bits for packing
v_and_or_b32 v37, v[vgprValuC+39], v10, v[vgprValuC+38] // pack two bf16 to dword
buffer_store_dwordx2 v[36:37], v13, s[sgprSrdD:sgprSrdD+3], 0 offen offset:0 // store D

s_waitcnt lgkmcnt(10)                              // lgkmcnt(10) = 15 - 1 (bias) - 1 (scaleAVec) - 2 (scaleBVec) - 1 (scaleAlphaVec) (interleaved)
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], v[24:25], v[vgprValuC+40:vgprValuC+40+1] // *= ScaleAVMulPK(24)(0)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], v[26:27], v[vgprValuC+42:vgprValuC+42+1] // *= ScaleAVMulPK(24)(2)
v_mov_b32 v31, v30                                 // copy dataScaleB to dataScaleB+1
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], v[30:31], v[vgprValuC+40:vgprValuC+40+1] // *= ScaleBVMulPK(30)(0)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], v[30:31], v[vgprValuC+42:vgprValuC+42+1] // *= ScaleBVMulPK(30)(2)
v_pk_mul_f32 v[vgprValuC+40:vgprValuC+40+1], v[32:33], v[vgprValuC+40:vgprValuC+40+1] // *= ScaleAlphaVecVMulPK(32)(0)
v_pk_mul_f32 v[vgprValuC+42:vgprValuC+42+1], v[34:35], v[vgprValuC+42:vgprValuC+42+1] // *= ScaleAlphaVecVMulPK(32)(2)
v_pk_add_f32 v[4:5], v[20:21], v[vgprValuC+40:vgprValuC+40+1] // C += bias
v_pk_add_f32 v[6:7], v[22:23], v[vgprValuC+42:vgprValuC+42+1] // C += bias
s_swappc_b64 s[28:29], s[12:13]
v_mov_b32 v40, v4
v_mov_b32 v41, v5
v_mov_b32 v42, v6
v_mov_b32 v43, v7
v_cmp_u_f32 s[30:31], v[vgprValuC+40], v[vgprValuC+40] // check Nan
v_bfe_u32 v9, v[vgprValuC+40], 16, 1               // Non-Nan case: store lsb of bf16
v_add3_u32 v9, v[vgprValuC+40], v9, v12            // Non-Nan case: add lsb and the increment for rounding
v_cndmask_b32 v[vgprValuC+40], v9, v11, s[30:31]
v_lshrrev_b32 v[vgprValuC+40], 16, v[vgprValuC+40] // convert C to bf16
v_cmp_u_f32 s[30:31], v[vgprValuC+41], v[vgprValuC+41] // check Nan
v_bfe_u32 v9, v[vgprValuC+41], 16, 1               // Non-Nan case: store lsb of bf16
v_add3_u32 v9, v[vgprValuC+41], v9, v12            // Non-Nan case: add lsb and the increment for rounding
v_cndmask_b32 v[vgprValuC+41], v9, v11, s[30:31]
v_and_or_b32 v40, v[vgprValuC+41], v10, v[vgprValuC+40] // pack two bf16 to dword
v_cmp_u_f32 s[30:31], v[vgprValuC+42], v[vgprValuC+42] // check Nan
v_bfe_u32 v9, v[vgprValuC+42], 16, 1               // Non-Nan case: store lsb of bf16
v_add3_u32 v9, v[vgprValuC+42], v9, v12            // Non-Nan case: add lsb and the increment for rounding
v_cndmask_b32 v[vgprValuC+42], v9, v11, s[30:31]
v_lshrrev_b32 v[vgprValuC+42], 16, v[vgprValuC+42] // convert C to bf16
v_cmp_u_f32 s[30:31], v[vgprValuC+43], v[vgprValuC+43] // check Nan
v_bfe_u32 v9, v[vgprValuC+43],