/*******************************************************************************
 *
 * MIT License
 *
 * Copyright (c) 2020-2021 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 *******************************************************************************/
; generated by igemm_codegen.py (92dd200fb253b1c95091a20a463ed95fb9ce9d13)
;
.include "igemm_wrw_gtcx2_nhwc_fp16_utils.inc"

;----------------------------------------------------------
; starting of kernel igemm_wrw_gtcx2_nhwc_fp16_bx0_ex1_bt256x256x32_wt32x32x8_ws2x2_wr2x2_ta1x4x1x8_1x8x1x32_tb1x4x1x8_1x8x1x32_vs1_gkgs
; tensor_layout              : 'nhwc'
; gemm_m_per_block           : 256
; gemm_n_per_block           : 256
; gemm_k_per_block           : 32
; wave_tile_m                : 32
; wave_step_m                : 2
; wave_repeat_m              : 2
; wave_tile_n                : 32
; wave_step_n                : 2
; wave_repeat_n              : 2
; wave_tile_k                : 8
; tensor_a_thread_lengths    : [1, 4, 1, 8]
; tensor_a_cluster_lengths   : [1, 8, 1, 32]
; tensor_b_thread_lengths    : [1, 4, 1, 8]
; tensor_b_cluster_lengths   : [1, 8, 1, 32]
; direction                  : 'wrw'
; precision                  : 'fp16'
; nxb                        : 0
; nxe                        : 1
; gemm_k_global_split        : 1
; vector_store               : 1
; 
; block_size                 : 256
; lds_total                  : 36864
; lds_buffer_num             : 1
; 
.set k_p_in, 0
.set k_p_wei, 8
.set k_p_out, 16
.set k_hi, 24
.set k_wi, 28
.set k_n, 32
.set k_k, 36
.set k_c, 40
.set k_ho, 44
.set k_wo, 48
.set k_stride_h, 52
.set k_stride_w, 56
.set k_dilation_h, 60
.set k_dilation_w, 64
.set k_pad_h, 68
.set k_pad_w, 72
.set k_y, 76
.set k_x, 80
.set k_gemm_k_global_split, 84
.set k_group, 88
.set k_pack_0, 92
.set k_end, 96

.set s_ka, 0
.set s_bx, 2
.set s_by, 3
.set s_bz, 4
.set s_p_in, 8
.set s_p_wei, 12
.set s_p_out, 16
.set s_hi, 20
.set s_wi, 21
.set s_n, 22
.set s_k, 23
.set s_c, 24
.set s_ho, 25
.set s_wo, 26
.set s_stride_h, 27
.set s_stride_w, 28
.set s_dilation_h, 29
.set s_dilation_w, 30
.set s_pad_h, 31
.set s_pad_w, 32
.set s_y, 33
.set s_x, 34
.set s_gemmk_split, 35
.set s_group, 36
.set s_gemmk_per_wg, 37
.set s_ho_x_stride_h, 38
.set s_wo_x_stride_w, 39
.set s_in_stride_wi, 40
.set s_in_stride_hi, 41
.set s_in_stride_n, 42
.set s_out_stride_wo, 43
.set s_out_stride_ho, 44
.set s_out_stride_n, 45
.set s_wei_stride_k, 46
.set s_ec_padded, 47
.set s_in_stride_n_n, 48
.set s_out_stride_n_n, 49
.set s_move_slice_n, 50
.set s_move_slice_n_dsho, 51
.set s_move_slice_n_dswo, 52
.set s_dim_b, 53
.set s_dim_e, 54
.set s_block_gtc_ie, 55
.set s_block_gtc_ik, 56
.set s_block_gtc_iec, 57
.set s_block_gtc_in, 58
.set s_block_gtc_ig, 59
.set s_knum, 1
.set s_gemm_k_num_n1, 0
.set s_gemm_k_num_dsho, 60
.set s_gemm_k_num_dswo, 61
.set s_kitr, 3
.set s_in_offset, 62
.set s_out_offset, 64
.set s_sub_n, 66
.set s_in_stride_move_n, 67
.set s_out_stride_move_n, 68
.set s_k_padded, 69
.set s_c_padded, 70
.set s_out_move_step, 71
.set s_tmp, 72
.set s_end, 78

.set v_c, 0  ; coalescing:32, needed:0, resuable:78
.set v_a, 0
.set v_b, 16
.set v_gld_a, 32
.set v_gld_b, 48
.set v_sst_a_os, 64
.set v_sst_b_os, 65
.set v_sld_a_os, 66
.set v_sld_b_os, 67
.set v_in_ihi, 68
.set v_in_iwi, 69
.set v_in_os, 70
.set v_in_os_base, 71
.set v_out_iho, 72
.set v_out_iwo, 73
.set v_in_ihi_max, 74
.set v_in_iwi_max, 75
.set v_gtc_in, 76
.set v_out_os, 77
.set v_in_flag, 78
.set v_co_sst, 79
.set v_co_sld, 80
.set v_wei_os, 81
.set v_wei_c_flag, 82
.set v_gtc_iec, 83
.set v_wei_ie, 84
.set v_gtc_ic, 85
.set v_gtc_ie, 86
.set v_gtc_ik, 87
.set v_gtc_inb_a, 88
.set v_gemm_in, 89
.set v_gemm_im, 90
.set v_wei_ic, 91
.set v_wei_iec, 92
.set v_co_sub_m_index, 93
.set v_co_sub_n_index, 94
.set v_cur_k, 95
.set v_tmp, 96
.set v_end, 360

.set a_c, 0
.set a_end, 256

.text
.globl igemm_wrw_gtcx2_nhwc_fp16_bx0_ex1_bt256x256x32_wt32x32x8_ws2x2_wr2x2_ta1x4x1x8_1x8x1x32_tb1x4x1x8_1x8x1x32_vs1_gkgs
.p2align 8
.type igemm_wrw_gtcx2_nhwc_fp16_bx0_ex1_bt256x256x32_wt32x32x8_ws2x2_wr2x2_ta1x4x1x8_1x8x1x32_tb1x4x1x8_1x8x1x32_vs1_gkgs,@function
igemm_wrw_gtcx2_nhwc_fp16_bx0_ex1_bt256x256x32_wt32x32x8_ws2x2_wr2x2_ta1x4x1x8_1x8x1x32_tb1x4x1x8_1x8x1x32_vs1_gkgs:
    s_load_dwordx2  s[s_p_in+0:s_p_in+1],       s[s_ka+0:s_ka+1],    0+k_p_in
    s_load_dwordx2  s[s_p_wei+0:s_p_wei+1],      s[s_ka+0:s_ka+1],    0+k_p_wei
    s_load_dwordx2  s[s_p_out+0:s_p_out+1],      s[s_ka+0:s_ka+1],    0+k_p_out
    s_load_dwordx16 s[s_hi+0:s_hi+15],        s[s_ka+0:s_ka+1],    0+k_hi
    s_load_dwordx2  s[s_group+0:s_group+1],      s[s_ka+0:s_ka+1],    0+k_group

    ; input, thread(1,nb,1,c): 1x4x1x8, cluster(1,nb,1,ec): 1x8x1x32
    v_mov_b32 v[v_tmp], v0
    v_and_b32 v[v_gtc_iec], 31, v[v_tmp]
    v_lshlrev_b32 v[v_gtc_iec], 3, v[v_gtc_iec]
    v_lshrrev_b32 v[v_tmp], 5, v[v_tmp]
    v_and_b32 v[v_gtc_inb_a], 7, v[v_tmp]

    ; output, thread(1,nb,1,k): 1x4x1x8, cluster(1,nb,1,k) 1x8x1x32
    v_mov_b32 v[v_tmp], v0
    v_and_b32 v[v_gtc_ik], 31, v[v_tmp]
    v_lshlrev_b32 v[v_gtc_ik], 3, v[v_gtc_ik]

    s_mov_b32 s[s_p_in+3], 0x27000
    s_mov_b32 s[s_p_wei+2], 0xffffffff
    s_mov_b32 s[s_p_wei+3], 0x27000
    s_mov_b32 s[s_p_out+3], 0x27000
    s_waitcnt lgkmcnt(0)

    ; calculate index
    ; s_lshr_b32 s[s_sub_n], s[s_n], s[s_gemmk_split]
    s_mul_i32 s[s_in_stride_wi], s[s_c], s[s_group]
    s_mul_i32 s[s_tmp+2], s[s_wi], s[s_in_stride_wi]
    s_mul_i32 s[s_in_stride_n], s[s_hi], s[s_tmp+2]
    s_mul_i32 s[s_dim_e], s[s_x], s[s_y]
    s_mul_i32 s[s_wei_stride_k], s[s_c], s[s_dim_e]
    s_mul_i32 s[s_out_stride_wo], s[s_k], s[s_group]
    s_mul_i32 s[s_tmp+2], s[s_wo], s[s_out_stride_wo]
    s_mul_i32 s[s_out_stride_n], s[s_ho], s[s_tmp+2]
    s_mul_i32 s[s_dim_b], s[s_ho], s[s_wo]
    s_mul_i32  s[s_tmp], s[s_n], s[s_in_stride_n]
    s_mul_i32  s[s_tmp+1], s[s_n], s[s_out_stride_n]
    s_lshl_b32 s[s_tmp+4], s[s_tmp], 1
    s_lshl_b32 s[s_tmp+5], s[s_tmp+1], 1
    s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+4]
    s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+4]
    s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp]
    s_addc_u32 s[s_p_in+1], s[s_p_in+1], s[s_tmp+1]
    s_mul_i32 s[s_tmp], s[s_by], s[s_tmp+5]
    s_mul_hi_u32 s[s_tmp+1], s[s_by], s[s_tmp+5]
    s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp]
    s_addc_u32 s[s_p_out+1], s[s_p_out+1], s[s_tmp+1]
    ; compute start point
    s_mul_i32 s[s_sub_n], s[s_bz], s[s_gemmk_per_wg]
    v_add_u32 v[v_gtc_inb_a], v[v_gtc_inb_a], s[s_sub_n]
    ; n1b transform
    .v_u32_div_rem_vs v_tmp+4, v_gtc_in, v_gtc_inb_a, s_dim_b, v_tmp, s_tmp
    .v_u32_div_rem_vs v_out_iwo, v_out_iho, v_tmp+4, s_wo, v_tmp, s_tmp
    v_lshlrev_b32 v[v_gtc_in], 2, v[v_gtc_in]

    ; pad gemm_m if needed
    s_add_u32 s[s_tmp], 255, s[s_k]
    s_lshr_b32 s[s_tmp], s[s_tmp], 8
    s_lshl_b32 s[s_k_padded], s[s_tmp], 8
    ; pad c
    s_add_u32 s[s_tmp], 7, s[s_c]
    s_lshr_b32 s[s_tmp], s[s_tmp], 3
    s_lshl_b32 s[s_c_padded], s[s_tmp], 3
    ; pad ec
    s_mul_i32 s[s_ec_padded], s[s_c_padded], s[s_dim_e]
    s_add_u32 s[s_tmp], 255, s[s_ec_padded]
    s_lshr_b32 s[s_tmp], s[s_tmp], 8
    s_lshl_b32 s[s_ec_padded], s[s_tmp], 8

    ; add block i_n
    ; gemm_m_per_block:256, gemm_n_per_block:256
    s_lshr_b32 s[0], s[s_ec_padded], 8
    s_lshr_b32 s[s_tmp], s[s_k_padded], 8
    s_mul_i32 s[1], s[0], s[s_tmp]
    ;s_lshl_b32 s[s_tmp+3], 1, s[s_gemmk_split]
    ;s_sub_u32 s[s_tmp+3], s[s_tmp+3], 1
    ;s_and_b32 s[s_block_gtc_in], s[s_bx], s[s_tmp+3]
    ;s_mul_i32 s[s_block_gtc_in], s[s_block_gtc_in], s[s_sub_n]
    ;s_lshr_b32 s[s_bx], s[s_bx], s[s_gemmk_split]
    .v_u32_div_rem_ss s_tmp+4, s_block_gtc_ig, s_bx, 1, v_tmp+5, v_tmp, s_tmp
    s_mov_b32 s[s_bx], s[s_tmp+4]
    .v_u32_div_rem_ss s_tmp+4, s_tmp+5, s_bx, 0, v_tmp+5, v_tmp, s_tmp
    ; s_tmp+4:block_gtc_in, s_tmp+5:block_gtc_im
    s_lshl_b32 s[s_block_gtc_ik], s[s_tmp+5], 8
    s_lshl_b32 s[s_block_gtc_iec], s[s_tmp+4], 8

    ; config for output and input range
    s_mul_i32 s[s_p_out+2], s[s_n], s[s_out_stride_n]
    s_lshl_b32 s[s_p_out+2], s[s_p_out+2], 1
    s_mul_i32 s[s_p_in+2], s[s_n], s[s_in_stride_n]
    s_lshl_b32 s[s_p_in+2], s[s_p_in+2], 1
    ; ec transform
    v_add_u32 v[v_tmp+5], s[s_block_gtc_iec], v[v_gtc_iec]
    .v_u32_div_rem_vs v_gtc_ic, v_gtc_ie, v_tmp+5, s_c_padded, v_tmp, s_tmp
    .v_u32_div_rem_vs v_tmp+5, v_tmp+6, v_gtc_ie, s_x, v_tmp, s_tmp
    ; v_tmp_5: v_wei_ix, v_tmp_6: v_wei_iy

    v_mul_u32_u24 v[v_tmp], s[s_dilation_h], v[v_tmp+6]
    v_mul_u32_u24 v[v_tmp+1], s[s_dilation_w], v[v_tmp+5]
    v_sub_i32 v[v_tmp+6], v[v_tmp], s[s_pad_h]
    v_sub_i32 v[v_tmp+5], v[v_tmp+1], s[s_pad_w]
    s_mul_i32 s[s_ho_x_stride_h], s[s_ho], s[s_stride_h]
    s_mul_i32 s[s_wo_x_stride_w], s[s_wo], s[s_stride_w]
    v_add_i32 v[v_in_ihi_max], v[v_tmp+6], s[s_ho_x_stride_h]
    v_add_i32 v[v_in_iwi_max], v[v_tmp+5], s[s_wo_x_stride_w]
    ; ihi = iho * s_stride_h + iy * s_dilation_h - s_pad_h,   here make sure iy <- iy * s_dilation_h - s_pad_h before hand
    ; iwi = iwo * s_stride_w + ix * s_dilation_w - s_pad_w,   here make sure ix <- ix * s_dilation_w - s_pad_w before hand
    v_mad_u32_u24 v[v_in_ihi], s[s_stride_h], v[v_out_iho], v[v_tmp+6]
    v_mad_u32_u24 v[v_in_iwi], s[s_stride_w], v[v_out_iwo], v[v_tmp+5]
    ; calculate input offset
    s_lshl_b32 s[s_block_gtc_ig], s[s_block_gtc_ig], 1
    s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_c]
    s_sub_u32 s[s_p_in+2], s[s_p_in+2], s[s_tmp]
    s_add_u32 s[s_p_in], s[s_p_in], s[s_tmp]

    ;v_add_u32 v[v_tmp], v[v_gtc_in], s[s_block_gtc_in]
    v_mul_lo_u32 v[v_tmp], s[s_in_stride_n], v[v_gtc_in]
    v_add_lshl_u32 v[v_in_os_base], v[v_tmp], v[v_gtc_ic], 1
    ; from hi, wi, os_base, compute final offset
    v_mad_u32_u24 v[v_tmp], s[s_wi], v[v_in_ihi], v[v_in_iwi]
    v_mul_lo_u32 v[v_tmp], v[v_tmp], s[s_in_stride_wi]
    v_lshl_add_u32 v[v_in_os], v[v_tmp], 1, v[v_in_os_base]
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi]
    v_cndmask_b32 v[v_in_flag], 0, 1, vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc

    s_lshl_b32 s[s_in_stride_n], s[s_in_stride_n], 1

    s_mul_i32 s[s_tmp], s[s_in_stride_n], 2
    s_mov_b32 s[s_in_offset+0], s[s_tmp]
    s_mul_i32 s[s_tmp], s[s_in_stride_n], 3
    s_mov_b32 s[s_in_offset+1], s[s_tmp]
    ; load input
    .v_clear_nc v_gld_b, 16
    v_cmpx_eq_u32 vcc, 1, v[v_in_flag]
    buffer_load_dwordx4 v[v_gld_b+0:v_gld_b+0+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:0
    buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_stride_n] offen offset:0
    buffer_load_dwordx4 v[v_gld_b+8:v_gld_b+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset+0] offen offset:0
    buffer_load_dwordx4 v[v_gld_b+12:v_gld_b+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset+1] offen offset:0
    s_mov_b64 exec, -1

    ; calculate out offset
    s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_k]
    s_sub_u32 s[s_p_out+2], s[s_p_out+2], s[s_tmp]
    s_add_u32 s[s_p_out], s[s_p_out], s[s_tmp]

    v_add_u32 v[v_cur_k], s[s_block_gtc_ik], v[v_gtc_ik]
    ;s_mul_i32 s[s_tmp], s[s_out_stride_n], s[s_block_gtc_in]
    ;v_add_lshl_u32 v[v_tmp+1], v[v_cur_k], s[s_tmp], 1
    v_mul_lo_u32 v[v_tmp], v[v_gtc_inb_a], s[s_out_stride_wo]
    v_add_lshl_u32 v[v_out_os], v[v_tmp], v[v_cur_k], 1

    ; supplement for v_gtc_in
    v_lshrrev_b32 v[v_gtc_in], 2, v[v_gtc_in]
    v_mul_lo_u32 v[v_tmp], v[v_gtc_in], 6
    v_mul_lo_u32 v[v_tmp], v[v_tmp], s[s_out_stride_n]
    v_add_i32 v[v_out_os], v[v_out_os], v[v_tmp]
    s_lshl_b32 s[s_out_stride_n], s[s_out_stride_n], 1

    s_mul_i32 s[s_tmp], s[s_out_stride_n], 2
    s_mov_b32 s[s_out_offset+0], s[s_tmp]
    s_mul_i32 s[s_tmp], s[s_out_stride_n], 3
    s_mov_b32 s[s_out_offset+1], s[s_tmp]
    ; load output
    buffer_load_dwordx4 v[v_gld_a+0:v_gld_a+0+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:0
    buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_stride_n] offen offset:0
    buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset+0] offen offset:0
    buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset+1] offen offset:0

    v_mov_b32 v[v_tmp+5], v0
    ; xdlops mapping, get source matrix gemm index, k_pack:4, v_pack:1, k_pack_per_thread:1
    v_and_b32 v[v_gemm_in], 31, v[v_tmp+5]           ; block_n index 
    v_and_b32 v[v_gemm_im], 31, v[v_tmp+5]           ; block_m index 
    v_lshlrev_b32 v[v_gemm_in], 2, v[v_gemm_in]   ; shift left k_pack:4
    v_lshlrev_b32 v[v_gemm_im], 2, v[v_gemm_im]   ; shift left k_pack:4
    v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5]
    v_and_b32 v[v_tmp + 0], 1, v[v_tmp+5]          ; block_k_per_wave index
    v_and_b32 v[v_tmp + 1], 0, v[v_tmp + 0]   ; and k_pack_per_thread:1
    v_lshrrev_b32 v[v_tmp + 0], 0, v[v_tmp + 0] ; shift right k_pack_per_thread:1
    v_lshl_or_b32 v[v_gemm_in],  v[v_tmp + 1], 2, v[v_gemm_in]  ; or lanegroup_k_per_thread:4
    v_lshl_or_b32 v[v_gemm_im],  v[v_tmp + 1], 2, v[v_gemm_im]  ; or lanegroup_k_per_thread:4
    v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 0], 10, v[v_gemm_in]
    v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 0], 10, v[v_gemm_im]
    v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5]
    v_and_b32 v[v_tmp + 2], 1, v[v_tmp+5]  ; waves_per_n index
    v_lshl_or_b32 v[v_gemm_in], v[v_tmp + 2], 8, v[v_gemm_in]
    v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5]
    v_and_b32 v[v_tmp + 3], 1, v[v_tmp+5]  ; waves_per_m index
    v_lshl_or_b32 v[v_gemm_im], v[v_tmp + 3], 8, v[v_gemm_im]

    v_mov_b32 v[v_tmp+5], v0
    ; xdlops mapping, get dst matrix gemm index
    v_and_b32 v[v_tmp+0], 31, v[v_tmp+5]
    v_lshrrev_b32 v[v_tmp+5], 5, v[v_tmp+5]
    v_and_b32 v[v_tmp+1], 1, v[v_tmp+5]
    v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5]
    v_mov_b32 v[v_co_sst], v[v_tmp+0]
    v_lshlrev_b32 v[v_co_sld], 2, v[v_tmp+1]
    v_and_b32 v[v_tmp+0], 1, v[v_tmp+5]
    v_lshrrev_b32 v[v_tmp+5], 1, v[v_tmp+5]
    v_and_b32 v[v_tmp+1], 1, v[v_tmp+5]
    v_lshl_or_b32 v[v_co_sst], v[v_tmp+0], 6, v[v_co_sst]
    v_lshl_or_b32 v[v_co_sld], v[v_tmp+1], 6, v[v_co_sld]

    ; LDS store, in: 1,nb,1,ec: 1x4x1x8, 1x8x1x32
    v_sub_i32 v[v_gtc_inb_a], v[v_gtc_inb_a], s[s_sub_n]
    v_lshlrev_b32 v[v_tmp+2], 2,  v[v_gtc_iec]
    v_lshrrev_b32 v[v_tmp+1], 0,  v[v_gtc_inb_a]
    v_lshl_add_u32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2]
    v_and_b32 v[v_tmp+2], 0, v[v_gtc_inb_a]
    v_lshl_or_b32 v[v_tmp], v[v_tmp+2], 2, v[v_tmp]
    v_lshlrev_b32 v[v_sst_b_os], 1, v[v_tmp]
    v_add_u32 v[v_sst_b_os], 16384, v[v_sst_b_os]
    v_lshrrev_b32 v[v_tmp], 7, v[v_sst_b_os]
    v_lshlrev_b32 v[v_tmp], 4, v[v_tmp]
    v_add_u32 v[v_sst_b_os], v[v_tmp], v[v_sst_b_os]

    ; LDS store, out: 1,nb,1,k: 1x4x1x8, 1x8x1x32
    v_lshlrev_b32 v[v_tmp+2], 2,  v[v_gtc_ik]
    v_lshrrev_b32 v[v_tmp+1], 0,  v[v_gtc_inb_a]
    v_lshl_add_u32 v[v_tmp], v[v_tmp+1], 10, v[v_tmp+2]
    v_and_b32 v[v_tmp+2], 0, v[v_gtc_inb_a]
    v_lshl_or_b32 v[v_tmp], v[v_tmp+2], 2, v[v_tmp]
    v_lshlrev_b32 v[v_sst_a_os], 1, v[v_tmp]

    v_lshrrev_b32 v[v_tmp], 7, v[v_sst_a_os]
    v_lshlrev_b32 v[v_tmp], 4, v[v_tmp]
    v_add_u32 v[v_sst_a_os], v[v_tmp], v[v_sst_a_os]

    ; LDS load
    v_lshlrev_b32 v[v_sld_b_os], 1, v[v_gemm_in]
    v_lshlrev_b32 v[v_sld_a_os], 1, v[v_gemm_im]
    v_add_u32 v[v_sld_b_os], 16384, v[v_sld_b_os]

    v_lshrrev_b32 v[v_tmp], 7, v[v_sld_b_os]
    v_lshlrev_b32 v[v_tmp], 4, v[v_tmp]
    v_add_u32 v[v_sld_b_os], v[v_tmp], v[v_sld_b_os]

    v_lshrrev_b32 v[v_tmp], 7, v[v_sld_a_os]
    v_lshlrev_b32 v[v_tmp], 4, v[v_tmp]
    v_add_u32 v[v_sld_a_os], v[v_tmp], v[v_sld_a_os]

    v_mov_b32 v[v_gemm_in], v[v_co_sst]
    v_mov_b32 v[v_gemm_im], v[v_co_sld]
    ; init_co_lds_offset for xdlops
    v_lshrrev_b32 v[v_tmp], 2, v[v_gemm_im]
    v_and_b32 v[v_tmp],  1 v[v_tmp]   ; thread id of lanegroup_m_per_cluster
    v_lshlrev_b32 v[v_co_sst], 2, v[v_tmp]
    v_lshrrev_b32 v[v_tmp+2], 6, v[v_gemm_im]  ; thread id of waves_per_m
    v_lshl_or_b32 v[v_co_sst], v[v_tmp+2], 4, v[v_co_sst]
    v_lshrrev_b32 v[v_tmp], 2, v[v_co_sst]
    v_lshlrev_b32 v[v_tmp+1], 2, v[v_gemm_in]   ; implicit transpose with m granularity:4 while store
    v_lshl_or_b32 v[v_co_sst], v[v_tmp], 10, v[v_tmp+1]
    v_lshlrev_b32 v[v_co_sst], 2, v[v_co_sst]
    v_lshlrev_b32 v[v_co_sld], 4, v[0]
    ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x256 sub_m_index:[0]
    ; g_mr:2, g_ms:2, g_mw:1, g_mb:2, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:2, l_mt:4 | n_mc:2, n_ml:1, n_mv:2
    ; nd_stride:[4, 2, 1, 4, 1, 2, 2, 1]
    v_mov_b32 v[v_co_sub_m_index], 0
    ; init_co_sub_n_index xdlops
    v_and_b32 v[v_co_sub_n_index], 255, v[0]

    ; weight offset
    s_mul_i32 s[s_block_gtc_ig], s[s_block_gtc_ig], 2
    s_mul_i32 s[s_tmp+2], s[s_k], s[s_wei_stride_k]
    s_mul_i32 s[s_tmp], s[s_block_gtc_ig], s[s_tmp+2]
    s_mul_hi_u32 s[s_tmp+1], s[s_block_gtc_ig], s[s_tmp+2]
    s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp]
    s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1]

    s_lshl_b32 s[s_tmp+3], s[s_block_gtc_ik], 2
    s_mul_i32 s[s_tmp], s[s_wei_stride_k], s[s_tmp+3]
    s_mul_hi_u32 s[s_tmp+1], s[s_wei_stride_k], s[s_tmp+3]
    s_add_u32 s[s_p_wei], s[s_p_wei], s[s_tmp]
    s_addc_u32 s[s_p_wei+1], s[s_p_wei+1], s[s_tmp+1]

    ; compute v_co_sub_n_index along ec : 256
    v_and_b32 v[v_wei_iec], 255, v[v_co_sub_n_index]     ; => EC

    ; compute wei_ic and set wei_flag
    v_add_u32 v[v_wei_iec], v[v_wei_iec], s[s_block_gtc_iec]
    .v_u32_div_rem_vs v_wei_ic, v_wei_ie, v_wei_iec, s_c_padded, v_tmp, s_tmp
    v_cmp_gt_u32 vcc, s[s_c], v[v_wei_ic]
    v_cndmask_b32 v[v_wei_c_flag],  0, 1, vcc
    v_cmp_gt_u32 vcc, s[s_dim_e], v[v_wei_ie]
    v_cndmask_b32 v[v_wei_c_flag],  0, v[v_wei_c_flag], vcc

    ; compute wei offset
    v_mad_u32_u24 v[v_wei_os], s[s_c], v[v_wei_ie], v[v_wei_ic]
    ; add i_k
    v_mul_lo_u32 v[v_tmp], s[s_wei_stride_k], v[v_co_sub_m_index]
    v_add_u32 v[v_wei_os], v[v_wei_os], v[v_tmp]
    v_lshlrev_b32 v[v_wei_os], 2, v[v_wei_os]
    ; move slice step for output tensor
    ; move slice stride
    s_mov_b32 s[0], 8
    .v_u32_div_rem_ss s_tmp+4, s_move_slice_n, 0, s_dim_b, v_tmp+4, v_tmp, s_tmp
    .v_u32_div_rem_ss s_move_slice_n_dswo, s_move_slice_n_dsho, s_tmp+4, s_wo, v_tmp+4, v_tmp, s_tmp

    ; move slice step for output tensor
    s_lshl_b32 s[s_tmp], s[s_tmp+4], 1
    s_mul_i32 s[s_out_move_step], s[s_k], s[s_tmp]
    s_mul_i32 s[s_out_move_step], s[s_group], s[s_out_move_step]

    s_lshl_b32 s[s_move_slice_n], s[s_move_slice_n], 2

    ; convert dswo and dsho to dswi and dshi, dswi=dswo*stride_w, dshi=dsho*stride_h
    s_mul_i32 s[s_move_slice_n_dswo], s[s_move_slice_n_dswo], s[s_stride_w]
    s_mul_i32 s[s_move_slice_n_dsho], s[s_move_slice_n_dsho], s[s_stride_h]
    
    
    s_mul_i32 s[s_in_stride_n_n], s[s_move_slice_n], s[s_in_stride_n]
    s_mul_i32 s[s_out_stride_n_n], s[s_move_slice_n], s[s_out_stride_n]
    s_lshl_b32 s[s_in_stride_move_n], s[s_in_stride_n], 2
    s_mul_i32 s[s_out_stride_move_n], s[s_out_stride_n], 3
    s_lshl_b32 s[s_wei_stride_k], s[s_wei_stride_k], 2
    s_lshl_b32 s[s_knum], s[s_gemmk_per_wg], 2
    s_add_i32 s[s_knum], s[s_knum], 31
    s_lshr_b32 s[s_knum], s[s_knum], 5
    s_lshl_b32 s[s_knum], s[s_knum], 5

    ; start MFMA loop, 32x32 wave tile with 2x2 repeat, 2x2 step, k_pack:4
    s_waitcnt vmcnt(4)
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_b]
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_b+4]
    v_pack_b32_f16 v[v_tmp], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp], v[v_gld_b], v[v_gld_b+4]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_b+8]
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_b+12]
    v_pack_b32_f16 v[v_tmp+1], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+1], v[v_gld_b+8], v[v_gld_b+12]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_b] src0_sel:WORD_1
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_b+4] src0_sel:WORD_1
    v_pack_b32_f16 v[v_tmp+2], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+2], v[v_gld_b], v[v_gld_b+4] op_sel:[1, 1]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_b+8] src0_sel:WORD_1
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_b+12] src0_sel:WORD_1
    v_pack_b32_f16 v[v_tmp+3], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+3], v[v_gld_b+8], v[v_gld_b+12] op_sel:[1, 1]
    .endif
    ds_write_b128 v[v_sst_b_os], v[v_tmp:v_tmp+3] 
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_b+1]
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_b+5]
    v_pack_b32_f16 v[v_tmp], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp], v[v_gld_b+1], v[v_gld_b+5]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_b+9]
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_b+13]
    v_pack_b32_f16 v[v_tmp+1], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+1], v[v_gld_b+9], v[v_gld_b+13]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_b+1] src0_sel:WORD_1
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_b+5] src0_sel:WORD_1
    v_pack_b32_f16 v[v_tmp+2], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+2], v[v_gld_b+1], v[v_gld_b+5] op_sel:[1, 1]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_b+9] src0_sel:WORD_1
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_b+13] src0_sel:WORD_1
    v_pack_b32_f16 v[v_tmp+3], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+3], v[v_gld_b+9], v[v_gld_b+13] op_sel:[1, 1]
    .endif
    ds_write_b128 v[v_sst_b_os], v[v_tmp:v_tmp+3] offset:16
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_b+2]
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_b+6]
    v_pack_b32_f16 v[v_tmp], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp], v[v_gld_b+2], v[v_gld_b+6]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_b+10]
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_b+14]
    v_pack_b32_f16 v[v_tmp+1], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+1], v[v_gld_b+10], v[v_gld_b+14]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_b+2] src0_sel:WORD_1
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_b+6] src0_sel:WORD_1
    v_pack_b32_f16 v[v_tmp+2], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+2], v[v_gld_b+2], v[v_gld_b+6] op_sel:[1, 1]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_b+10] src0_sel:WORD_1
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_b+14] src0_sel:WORD_1
    v_pack_b32_f16 v[v_tmp+3], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+3], v[v_gld_b+10], v[v_gld_b+14] op_sel:[1, 1]
    .endif
    ds_write_b128 v[v_sst_b_os], v[v_tmp:v_tmp+3] offset:32
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_b+3]
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_b+7]
    v_pack_b32_f16 v[v_tmp], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp], v[v_gld_b+3], v[v_gld_b+7]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_b+11]
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_b+15]
    v_pack_b32_f16 v[v_tmp+1], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+1], v[v_gld_b+11], v[v_gld_b+15]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_b+3] src0_sel:WORD_1
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_b+7] src0_sel:WORD_1
    v_pack_b32_f16 v[v_tmp+2], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+2], v[v_gld_b+3], v[v_gld_b+7] op_sel:[1, 1]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_b+11] src0_sel:WORD_1
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_b+15] src0_sel:WORD_1
    v_pack_b32_f16 v[v_tmp+3], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+3], v[v_gld_b+11], v[v_gld_b+15] op_sel:[1, 1]
    .endif
    ds_write_b128 v[v_sst_b_os], v[v_tmp:v_tmp+3] offset:48

    s_waitcnt vmcnt(0)
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_a]
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_a+4]
    v_pack_b32_f16 v[v_tmp], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp], v[v_gld_a], v[v_gld_a+4]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_a+8]
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_a+12]
    v_pack_b32_f16 v[v_tmp+1], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+1], v[v_gld_a+8], v[v_gld_a+12]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_a] src0_sel:WORD_1
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_a+4] src0_sel:WORD_1
    v_pack_b32_f16 v[v_tmp+2], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+2], v[v_gld_a], v[v_gld_a+4] op_sel:[1, 1]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_a+8] src0_sel:WORD_1
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_a+12] src0_sel:WORD_1
    v_pack_b32_f16 v[v_tmp+3], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+3], v[v_gld_a+8], v[v_gld_a+12] op_sel:[1, 1]
    .endif
    ds_write_b128 v[v_sst_a_os], v[v_tmp:v_tmp+3] 
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_a+1]
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_a+5]
    v_pack_b32_f16 v[v_tmp], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp], v[v_gld_a+1], v[v_gld_a+5]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_a+9]
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_a+13]
    v_pack_b32_f16 v[v_tmp+1], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+1], v[v_gld_a+9], v[v_gld_a+13]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_a+1] src0_sel:WORD_1
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_a+5] src0_sel:WORD_1
    v_pack_b32_f16 v[v_tmp+2], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+2], v[v_gld_a+1], v[v_gld_a+5] op_sel:[1, 1]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_a+9] src0_sel:WORD_1
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_a+13] src0_sel:WORD_1
    v_pack_b32_f16 v[v_tmp+3], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+3], v[v_gld_a+9], v[v_gld_a+13] op_sel:[1, 1]
    .endif
    ds_write_b128 v[v_sst_a_os], v[v_tmp:v_tmp+3] offset:16
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_a+2]
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_a+6]
    v_pack_b32_f16 v[v_tmp], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp], v[v_gld_a+2], v[v_gld_a+6]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_a+10]
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_a+14]
    v_pack_b32_f16 v[v_tmp+1], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+1], v[v_gld_a+10], v[v_gld_a+14]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_a+2] src0_sel:WORD_1
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_a+6] src0_sel:WORD_1
    v_pack_b32_f16 v[v_tmp+2], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+2], v[v_gld_a+2], v[v_gld_a+6] op_sel:[1, 1]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_a+10] src0_sel:WORD_1
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_a+14] src0_sel:WORD_1
    v_pack_b32_f16 v[v_tmp+3], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+3], v[v_gld_a+10], v[v_gld_a+14] op_sel:[1, 1]
    .endif
    ds_write_b128 v[v_sst_a_os], v[v_tmp:v_tmp+3] offset:32
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_a+3]
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_a+7]
    v_pack_b32_f16 v[v_tmp], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp], v[v_gld_a+3], v[v_gld_a+7]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_a+11]
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_a+15]
    v_pack_b32_f16 v[v_tmp+1], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+1], v[v_gld_a+11], v[v_gld_a+15]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_a+3] src0_sel:WORD_1
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_a+7] src0_sel:WORD_1
    v_pack_b32_f16 v[v_tmp+2], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+2], v[v_gld_a+3], v[v_gld_a+7] op_sel:[1, 1]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_a+11] src0_sel:WORD_1
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_a+15] src0_sel:WORD_1
    v_pack_b32_f16 v[v_tmp+3], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+3], v[v_gld_a+11], v[v_gld_a+15] op_sel:[1, 1]
    .endif
    ds_write_b128 v[v_sst_a_os], v[v_tmp:v_tmp+3] offset:48

    .v_clear_acc_c a_c, 256
    ; make sure acc WAR harzard, at least 1 nop for src_c
    s_sub_i32 s[s_kitr], s[s_knum], 32
    s_cmp_gt_i32 s[s_kitr], 0
    s_cbranch_scc0 L_igemm_wrw_gtcx2_nhwc_fp16_bx0_ex1_bt256x256x32_wt32x32x8_ws2x2_wr2x2_ta1x4x1x8_1x8x1x32_tb1x4x1x8_1x8x1x32_vs1_gkgs_mfma_end

    v_add_u32 v[v_in_iwi], s[s_move_slice_n_dswo], v[v_in_iwi]
    v_cmpx_le_i32 vcc, v[v_in_iwi_max], v[v_in_iwi]
    v_subrev_u32 v[v_in_iwi], s[s_wo_x_stride_w], v[v_in_iwi]
    v_add_u32 v[v_in_ihi], s[s_stride_h], v[v_in_ihi]
    s_mov_b64 exec, -1

    v_add_u32 v[v_in_ihi], s[s_move_slice_n_dsho], v[v_in_ihi]
    v_cmpx_le_i32 vcc, v[v_in_ihi_max], v[v_in_ihi]
    v_subrev_u32 v[v_in_ihi], s[s_ho_x_stride_h], v[v_in_ihi]
    v_add_u32 v[v_in_os_base], s[s_in_stride_move_n], v[v_in_os_base]
    v_add_u32 v[v_out_os], v[v_out_os], s[s_out_stride_move_n]
    s_mov_b64 exec, -1

    v_add_u32 v[v_in_os_base], s[s_in_stride_n_n], v[v_in_os_base]
    v_add_u32 v[v_out_os], s[s_out_stride_n_n], v[v_out_os]
    ; from hi, wi, os_base, compute final offset
    v_mad_u32_u24 v[v_tmp], s[s_wi], v[v_in_ihi], v[v_in_iwi]
    v_mul_lo_u32 v[v_tmp], v[v_tmp], s[s_in_stride_wi]
    v_lshl_add_u32 v[v_in_os], v[v_tmp], 1, v[v_in_os_base]
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi]
    v_cndmask_b32 v[v_in_flag], 0, 1, vcc
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi]
    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc
    v_add_u32 v[v_out_os], v[v_out_os], s[s_out_move_step]
    s_waitcnt lgkmcnt(0)
    s_barrier
    ds_read2_b64 v[v_a+0:v_a+3], v[v_sld_a_os], offset0:0, offset1:36
    ds_read2_b64 v[v_b+0:v_b+3], v[v_sld_b_os], offset0:0, offset1:36
    ds_read2_b64 v[v_b+4+0:v_b+4+3], v[v_sld_b_os], offset0:144, offset1:180
    ds_read2_b64 v[v_a+4+0:v_a+4+3], v[v_sld_a_os], offset0:144, offset1:180
L_igemm_wrw_gtcx2_nhwc_fp16_bx0_ex1_bt256x256x32_wt32x32x8_ws2x2_wr2x2_ta1x4x1x8_1x8x1x32_tb1x4x1x8_1x8x1x32_vs1_gkgs_mfma_body:
    ; do fma accumulate with unroll 32
    s_waitcnt lgkmcnt(2)
    v_mfma_f32_32x32x8_16f_m a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15]     ; repeat:0x0, step:0x0, num_a_c:16
    .v_clear_nc v_gld_b, 16
    v_mfma_f32_32x32x8_16f_m a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47]     ; repeat:0x0, step:1x0, num_a_c:16
    v_cmpx_eq_u32 vcc, 1, v[v_in_flag]
    buffer_load_dwordx4 v[v_gld_b+0:v_gld_b+0+3], v[v_in_os], s[s_p_in:s_p_in+3], 0 offen offset:0
    buffer_load_dwordx4 v[v_gld_b+4:v_gld_b+4+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_stride_n] offen offset:0
    buffer_load_dwordx4 v[v_gld_b+8:v_gld_b+8+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset+0] offen offset:0
    buffer_load_dwordx4 v[v_gld_b+12:v_gld_b+12+3], v[v_in_os], s[s_p_in:s_p_in+3], s[s_in_offset+1] offen offset:0
    s_mov_b64 exec, -1
    v_mfma_f32_32x32x8_16f_m a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31]     ; repeat:0x0, step:0x1, num_a_c:16
    buffer_load_dwordx4 v[v_gld_a+0:v_gld_a+0+3], v[v_out_os], s[s_p_out:s_p_out+3], 0 offen offset:0
    v_mfma_f32_32x32x8_16f_m a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63]     ; repeat:0x0, step:1x1, num_a_c:16
    buffer_load_dwordx4 v[v_gld_a+4:v_gld_a+4+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_stride_n] offen offset:0
    v_add_u32 v[v_tmp+4], 4608, v[v_sld_a_os]
    ds_read2_b64 v[v_a+8+0:v_a+8+3], v[v_tmp+4], offset0:0, offset1:36 ; load i_k:1 into local buffer 1, repeat 0
    v_add_u32 v[v_tmp+5], 4608, v[v_sld_b_os]
    ds_read2_b64 v[v_b+8+0:v_b+8+3], v[v_tmp+5], offset0:0, offset1:36 ; load i_k:1 into local buffer 1, repeat 0
    s_waitcnt lgkmcnt(3)
    v_mfma_f32_32x32x8_16f_m a[a_c+64:a_c+79], v[v_a+0:v_a+1], v[v_b+4:v_b+5], a[a_c+64:a_c+79]     ; repeat:0x1, step:0x0, num_a_c:16
    buffer_load_dwordx4 v[v_gld_a+8:v_gld_a+8+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset+0] offen offset:0
    v_mfma_f32_32x32x8_16f_m a[a_c+96:a_c+111], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+96:a_c+111]     ; repeat:0x1, step:1x0, num_a_c:16
    buffer_load_dwordx4 v[v_gld_a+12:v_gld_a+12+3], v[v_out_os], s[s_p_out:s_p_out+3], s[s_out_offset+1] offen offset:0
    v_mfma_f32_32x32x8_16f_m a[a_c+80:a_c+95], v[v_a+0:v_a+1], v[v_b+6:v_b+7], a[a_c+80:a_c+95]     ; repeat:0x1, step:0x1, num_a_c:16
    v_add_u32 v[v_in_iwi], s[s_move_slice_n_dswo], v[v_in_iwi]
    v_mfma_f32_32x32x8_16f_m a[a_c+112:a_c+127], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+112:a_c+127]     ; repeat:0x1, step:1x1, num_a_c:16
    v_cmpx_le_i32 vcc, v[v_in_iwi_max], v[v_in_iwi]
    v_subrev_u32 v[v_in_iwi], s[s_wo_x_stride_w], v[v_in_iwi]
    v_add_u32 v[v_in_ihi], s[s_stride_h], v[v_in_ihi]
    s_mov_b64 exec, -1
    ds_read2_b64 v[v_b+12+0:v_b+12+3], v[v_tmp+5], offset0:144, offset1:180 ; load i_k:1 into local buffer 1, repeat 1
    ds_read2_b64 v[v_a+12+0:v_a+12+3], v[v_tmp+4], offset0:144, offset1:180 ; load i_k:1 into local buffer 1, repeat 1
    s_waitcnt lgkmcnt(4)
    v_mfma_f32_32x32x8_16f_m a[a_c+128:a_c+143], v[v_a+4:v_a+5], v[v_b+0:v_b+1], a[a_c+128:a_c+143]     ; repeat:1x0, step:0x0, num_a_c:16
    v_add_u32 v[v_in_ihi], s[s_move_slice_n_dsho], v[v_in_ihi]
    v_mfma_f32_32x32x8_16f_m a[a_c+160:a_c+175], v[v_a+6:v_a+7], v[v_b+0:v_b+1], a[a_c+160:a_c+175]     ; repeat:1x0, step:1x0, num_a_c:16
    v_cmpx_le_i32 vcc, v[v_in_ihi_max], v[v_in_ihi]
    v_subrev_u32 v[v_in_ihi], s[s_ho_x_stride_h], v[v_in_ihi]
    v_add_u32 v[v_in_os_base], s[s_in_stride_move_n], v[v_in_os_base]
    v_add_u32 v[v_out_os], v[v_out_os], s[s_out_stride_move_n]
    s_mov_b64 exec, -1
    v_mfma_f32_32x32x8_16f_m a[a_c+144:a_c+159], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+144:a_c+159]     ; repeat:1x0, step:0x1, num_a_c:16
    v_add_u32 v[v_in_os_base], s[s_in_stride_n_n], v[v_in_os_base]
    v_mfma_f32_32x32x8_16f_m a[a_c+176:a_c+191], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+176:a_c+191]     ; repeat:1x0, step:1x1, num_a_c:16
    v_add_u32 v[v_out_os], s[s_out_stride_n_n], v[v_out_os]
    v_add_u32 v[v_tmp+4], 4608, v[v_tmp+4]
    ds_read2_b64 v[v_a+0:v_a+3], v[v_tmp+4], offset0:0, offset1:36 ; load i_k:2 into local buffer 0, repeat 0
    v_add_u32 v[v_tmp+5], 4608, v[v_tmp+5]
    ds_read2_b64 v[v_b+0:v_b+3], v[v_tmp+5], offset0:0, offset1:36 ; load i_k:2 into local buffer 0, repeat 0
    v_mfma_f32_32x32x8_16f_m a[a_c+192:a_c+207], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+192:a_c+207]     ; repeat:1x1, step:0x0, num_a_c:16
    v_mad_u32_u24 v[v_tmp], s[s_wi], v[v_in_ihi], v[v_in_iwi]
    v_mfma_f32_32x32x8_16f_m a[a_c+224:a_c+239], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+224:a_c+239]     ; repeat:1x1, step:1x0, num_a_c:16
    v_mul_lo_u32 v[v_tmp], v[v_tmp], s[s_in_stride_wi]
    v_mfma_f32_32x32x8_16f_m a[a_c+208:a_c+223], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+208:a_c+223]     ; repeat:1x1, step:0x1, num_a_c:16
    v_lshl_add_u32 v[v_in_os], v[v_tmp], 1, v[v_in_os_base]
    v_mfma_f32_32x32x8_16f_m a[a_c+240:a_c+255], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+240:a_c+255]     ; repeat:1x1, step:1x1, num_a_c:16
    v_cmp_gt_u32 vcc, s[s_hi], v[v_in_ihi]
    ds_read2_b64 v[v_b+4+0:v_b+4+3], v[v_tmp+5], offset0:144, offset1:180 ; load i_k:2 into local buffer 0, repeat 1
    s_waitcnt lgkmcnt(5)
    v_mfma_f32_32x32x8_16f_m a[a_c+0:a_c+15], v[v_a+8:v_a+9], v[v_b+8:v_b+9], a[a_c+0:a_c+15]     ; repeat:0x0, step:0x0, num_a_c:16
    v_cndmask_b32 v[v_in_flag], 0, 1, vcc
    v_mfma_f32_32x32x8_16f_m a[a_c+32:a_c+47], v[v_a+10:v_a+11], v[v_b+8:v_b+9], a[a_c+32:a_c+47]     ; repeat:0x0, step:1x0, num_a_c:16
    v_cmp_gt_u32 vcc, s[s_wi], v[v_in_iwi]
    v_mfma_f32_32x32x8_16f_m a[a_c+16:a_c+31], v[v_a+8:v_a+9], v[v_b+10:v_b+11], a[a_c+16:a_c+31]     ; repeat:0x0, step:0x1, num_a_c:16
    v_cndmask_b32 v[v_in_flag], 0, v[v_in_flag], vcc
    v_mfma_f32_32x32x8_16f_m a[a_c+48:a_c+63], v[v_a+10:v_a+11], v[v_b+10:v_b+11], a[a_c+48:a_c+63]     ; repeat:0x0, step:1x1, num_a_c:16
    v_add_u32 v[v_out_os], v[v_out_os], s[s_out_move_step]
    ds_read2_b64 v[v_a+4+0:v_a+4+3], v[v_tmp+4], offset0:144, offset1:180 ; load i_k:2 into local buffer 0, repeat 1
    s_waitcnt lgkmcnt(5)
    v_mfma_f32_32x32x8_16f_m a[a_c+64:a_c+79], v[v_a+8:v_a+9], v[v_b+12:v_b+13], a[a_c+64:a_c+79]     ; repeat:0x1, step:0x0, num_a_c:16
    
    v_mfma_f32_32x32x8_16f_m a[a_c+96:a_c+111], v[v_a+10:v_a+11], v[v_b+12:v_b+13], a[a_c+96:a_c+111]     ; repeat:0x1, step:1x0, num_a_c:16
    
    v_mfma_f32_32x32x8_16f_m a[a_c+80:a_c+95], v[v_a+8:v_a+9], v[v_b+14:v_b+15], a[a_c+80:a_c+95]     ; repeat:0x1, step:0x1, num_a_c:16
    
    v_mfma_f32_32x32x8_16f_m a[a_c+112:a_c+127], v[v_a+10:v_a+11], v[v_b+14:v_b+15], a[a_c+112:a_c+127]     ; repeat:0x1, step:1x1, num_a_c:16
    
    v_add_u32 v[v_tmp+4], 4608, v[v_tmp+4]
    ds_read2_b64 v[v_a+8+0:v_a+8+3], v[v_tmp+4], offset0:0, offset1:36 ; load i_k:3 into local buffer 1, repeat 0
    s_waitcnt lgkmcnt(5)
    v_mfma_f32_32x32x8_16f_m a[a_c+128:a_c+143], v[v_a+12:v_a+13], v[v_b+8:v_b+9], a[a_c+128:a_c+143]     ; repeat:1x0, step:0x0, num_a_c:16
    
    v_mfma_f32_32x32x8_16f_m a[a_c+160:a_c+175], v[v_a+14:v_a+15], v[v_b+8:v_b+9], a[a_c+160:a_c+175]     ; repeat:1x0, step:1x0, num_a_c:16
    
    v_mfma_f32_32x32x8_16f_m a[a_c+144:a_c+159], v[v_a+12:v_a+13], v[v_b+10:v_b+11], a[a_c+144:a_c+159]     ; repeat:1x0, step:0x1, num_a_c:16
    
    v_mfma_f32_32x32x8_16f_m a[a_c+176:a_c+191], v[v_a+14:v_a+15], v[v_b+10:v_b+11], a[a_c+176:a_c+191]     ; repeat:1x0, step:1x1, num_a_c:16
    
    v_add_u32 v[v_tmp+5], 4608, v[v_tmp+5]
    ds_read2_b64 v[v_b+8+0:v_b+8+3], v[v_tmp+5], offset0:0, offset1:36 ; load i_k:3 into local buffer 1, repeat 0
    v_mfma_f32_32x32x8_16f_m a[a_c+192:a_c+207], v[v_a+12:v_a+13], v[v_b+12:v_b+13], a[a_c+192:a_c+207]     ; repeat:1x1, step:0x0, num_a_c:16
    
    v_mfma_f32_32x32x8_16f_m a[a_c+224:a_c+239], v[v_a+14:v_a+15], v[v_b+12:v_b+13], a[a_c+224:a_c+239]     ; repeat:1x1, step:1x0, num_a_c:16
    
    v_mfma_f32_32x32x8_16f_m a[a_c+208:a_c+223], v[v_a+12:v_a+13], v[v_b+14:v_b+15], a[a_c+208:a_c+223]     ; repeat:1x1, step:0x1, num_a_c:16
    
    v_mfma_f32_32x32x8_16f_m a[a_c+240:a_c+255], v[v_a+14:v_a+15], v[v_b+14:v_b+15], a[a_c+240:a_c+255]     ; repeat:1x1, step:1x1, num_a_c:16
    
    ds_read2_b64 v[v_b+12+0:v_b+12+3], v[v_tmp+5], offset0:144, offset1:180 ; load i_k:3 into local buffer 1, repeat 1
    ds_read2_b64 v[v_a+12+0:v_a+12+3], v[v_tmp+4], offset0:144, offset1:180 ; load i_k:3 into local buffer 1, repeat 1
    s_waitcnt lgkmcnt(6)
    v_mfma_f32_32x32x8_16f_m a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15]     ; repeat:0x0, step:0x0, num_a_c:16
    
    v_mfma_f32_32x32x8_16f_m a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47]     ; repeat:0x0, step:1x0, num_a_c:16
    
    v_mfma_f32_32x32x8_16f_m a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31]     ; repeat:0x0, step:0x1, num_a_c:16
    
    v_mfma_f32_32x32x8_16f_m a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63]     ; repeat:0x0, step:1x1, num_a_c:16
    
    s_waitcnt lgkmcnt(5)
    v_mfma_f32_32x32x8_16f_m a[a_c+64:a_c+79], v[v_a+0:v_a+1], v[v_b+4:v_b+5], a[a_c+64:a_c+79]     ; repeat:0x1, step:0x0, num_a_c:16
    
    v_mfma_f32_32x32x8_16f_m a[a_c+96:a_c+111], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+96:a_c+111]     ; repeat:0x1, step:1x0, num_a_c:16
    
    v_mfma_f32_32x32x8_16f_m a[a_c+80:a_c+95], v[v_a+0:v_a+1], v[v_b+6:v_b+7], a[a_c+80:a_c+95]     ; repeat:0x1, step:0x1, num_a_c:16
    
    v_mfma_f32_32x32x8_16f_m a[a_c+112:a_c+127], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+112:a_c+127]     ; repeat:0x1, step:1x1, num_a_c:16
    
    s_waitcnt lgkmcnt(4)
    v_mfma_f32_32x32x8_16f_m a[a_c+128:a_c+143], v[v_a+4:v_a+5], v[v_b+0:v_b+1], a[a_c+128:a_c+143]     ; repeat:1x0, step:0x0, num_a_c:16
    
    v_mfma_f32_32x32x8_16f_m a[a_c+160:a_c+175], v[v_a+6:v_a+7], v[v_b+0:v_b+1], a[a_c+160:a_c+175]     ; repeat:1x0, step:1x0, num_a_c:16
    
    v_mfma_f32_32x32x8_16f_m a[a_c+144:a_c+159], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+144:a_c+159]     ; repeat:1x0, step:0x1, num_a_c:16
    
    v_mfma_f32_32x32x8_16f_m a[a_c+176:a_c+191], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+176:a_c+191]     ; repeat:1x0, step:1x1, num_a_c:16
    
    s_waitcnt lgkmcnt(0)
    s_barrier
    s_waitcnt vmcnt(4)
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_b]
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_b+4]
    v_pack_b32_f16 v[v_tmp], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp], v[v_gld_b], v[v_gld_b+4]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_b+8]
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_b+12]
    v_pack_b32_f16 v[v_tmp+1], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+1], v[v_gld_b+8], v[v_gld_b+12]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_b] src0_sel:WORD_1
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_b+4] src0_sel:WORD_1
    v_pack_b32_f16 v[v_tmp+2], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+2], v[v_gld_b], v[v_gld_b+4] op_sel:[1, 1]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_b+8] src0_sel:WORD_1
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_b+12] src0_sel:WORD_1
    v_pack_b32_f16 v[v_tmp+3], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+3], v[v_gld_b+8], v[v_gld_b+12] op_sel:[1, 1]
    .endif
    ds_write_b128 v[v_sst_b_os], v[v_tmp:v_tmp+3]
    v_mfma_f32_32x32x8_16f_m a[a_c+192:a_c+207], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+192:a_c+207]     ; repeat:1x1, step:0x0, num_a_c:16
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_b+1]
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_b+5]
    v_pack_b32_f16 v[v_tmp], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp], v[v_gld_b+1], v[v_gld_b+5]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_b+9]
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_b+13]
    v_pack_b32_f16 v[v_tmp+1], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+1], v[v_gld_b+9], v[v_gld_b+13]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_b+1] src0_sel:WORD_1
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_b+5] src0_sel:WORD_1
    v_pack_b32_f16 v[v_tmp+2], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+2], v[v_gld_b+1], v[v_gld_b+5] op_sel:[1, 1]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_b+9] src0_sel:WORD_1
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_b+13] src0_sel:WORD_1
    v_pack_b32_f16 v[v_tmp+3], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+3], v[v_gld_b+9], v[v_gld_b+13] op_sel:[1, 1]
    .endif
    ds_write_b128 v[v_sst_b_os], v[v_tmp:v_tmp+3] offset:16
    v_mfma_f32_32x32x8_16f_m a[a_c+224:a_c+239], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+224:a_c+239]     ; repeat:1x1, step:1x0, num_a_c:16
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_b+2]
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_b+6]
    v_pack_b32_f16 v[v_tmp], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp], v[v_gld_b+2], v[v_gld_b+6]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_b+10]
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_b+14]
    v_pack_b32_f16 v[v_tmp+1], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+1], v[v_gld_b+10], v[v_gld_b+14]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_b+2] src0_sel:WORD_1
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_b+6] src0_sel:WORD_1
    v_pack_b32_f16 v[v_tmp+2], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+2], v[v_gld_b+2], v[v_gld_b+6] op_sel:[1, 1]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_b+10] src0_sel:WORD_1
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_b+14] src0_sel:WORD_1
    v_pack_b32_f16 v[v_tmp+3], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+3], v[v_gld_b+10], v[v_gld_b+14] op_sel:[1, 1]
    .endif
    ds_write_b128 v[v_sst_b_os], v[v_tmp:v_tmp+3] offset:32
    v_mfma_f32_32x32x8_16f_m a[a_c+208:a_c+223], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+208:a_c+223]     ; repeat:1x1, step:0x1, num_a_c:16
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_b+3]
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_b+7]
    v_pack_b32_f16 v[v_tmp], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp], v[v_gld_b+3], v[v_gld_b+7]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_b+11]
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_b+15]
    v_pack_b32_f16 v[v_tmp+1], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+1], v[v_gld_b+11], v[v_gld_b+15]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_b+3] src0_sel:WORD_1
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_b+7] src0_sel:WORD_1
    v_pack_b32_f16 v[v_tmp+2], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+2], v[v_gld_b+3], v[v_gld_b+7] op_sel:[1, 1]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_b+11] src0_sel:WORD_1
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_b+15] src0_sel:WORD_1
    v_pack_b32_f16 v[v_tmp+3], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+3], v[v_gld_b+11], v[v_gld_b+15] op_sel:[1, 1]
    .endif
    ds_write_b128 v[v_sst_b_os], v[v_tmp:v_tmp+3] offset:48
    v_mfma_f32_32x32x8_16f_m a[a_c+240:a_c+255], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+240:a_c+255]     ; repeat:1x1, step:1x1, num_a_c:16
    s_waitcnt vmcnt(0)
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_a]
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_a+4]
    v_pack_b32_f16 v[v_tmp], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp], v[v_gld_a], v[v_gld_a+4]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_a+8]
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_a+12]
    v_pack_b32_f16 v[v_tmp+1], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+1], v[v_gld_a+8], v[v_gld_a+12]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_a] src0_sel:WORD_1
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_a+4] src0_sel:WORD_1
    v_pack_b32_f16 v[v_tmp+2], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+2], v[v_gld_a], v[v_gld_a+4] op_sel:[1, 1]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_a+8] src0_sel:WORD_1
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_a+12] src0_sel:WORD_1
    v_pack_b32_f16 v[v_tmp+3], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+3], v[v_gld_a+8], v[v_gld_a+12] op_sel:[1, 1]
    .endif
    ds_write_b128 v[v_sst_a_os], v[v_tmp:v_tmp+3]
    v_mfma_f32_32x32x8_16f_m a[a_c+0:a_c+15], v[v_a+8:v_a+9], v[v_b+8:v_b+9], a[a_c+0:a_c+15]     ; repeat:0x0, step:0x0, num_a_c:16
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_a+1]
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_a+5]
    v_pack_b32_f16 v[v_tmp], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp], v[v_gld_a+1], v[v_gld_a+5]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_a+9]
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_a+13]
    v_pack_b32_f16 v[v_tmp+1], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+1], v[v_gld_a+9], v[v_gld_a+13]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_a+1] src0_sel:WORD_1
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_a+5] src0_sel:WORD_1
    v_pack_b32_f16 v[v_tmp+2], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+2], v[v_gld_a+1], v[v_gld_a+5] op_sel:[1, 1]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_a+9] src0_sel:WORD_1
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_a+13] src0_sel:WORD_1
    v_pack_b32_f16 v[v_tmp+3], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+3], v[v_gld_a+9], v[v_gld_a+13] op_sel:[1, 1]
    .endif
    ds_write_b128 v[v_sst_a_os], v[v_tmp:v_tmp+3] offset:16
    v_mfma_f32_32x32x8_16f_m a[a_c+32:a_c+47], v[v_a+10:v_a+11], v[v_b+8:v_b+9], a[a_c+32:a_c+47]     ; repeat:0x0, step:1x0, num_a_c:16
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_a+2]
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_a+6]
    v_pack_b32_f16 v[v_tmp], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp], v[v_gld_a+2], v[v_gld_a+6]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_a+10]
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_a+14]
    v_pack_b32_f16 v[v_tmp+1], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+1], v[v_gld_a+10], v[v_gld_a+14]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_a+2] src0_sel:WORD_1
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_a+6] src0_sel:WORD_1
    v_pack_b32_f16 v[v_tmp+2], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+2], v[v_gld_a+2], v[v_gld_a+6] op_sel:[1, 1]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_a+10] src0_sel:WORD_1
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_a+14] src0_sel:WORD_1
    v_pack_b32_f16 v[v_tmp+3], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+3], v[v_gld_a+10], v[v_gld_a+14] op_sel:[1, 1]
    .endif
    ds_write_b128 v[v_sst_a_os], v[v_tmp:v_tmp+3] offset:32
    v_mfma_f32_32x32x8_16f_m a[a_c+16:a_c+31], v[v_a+8:v_a+9], v[v_b+10:v_b+11], a[a_c+16:a_c+31]     ; repeat:0x0, step:0x1, num_a_c:16
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_a+3]
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_a+7]
    v_pack_b32_f16 v[v_tmp], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp], v[v_gld_a+3], v[v_gld_a+7]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_a+11]
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_a+15]
    v_pack_b32_f16 v[v_tmp+1], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+1], v[v_gld_a+11], v[v_gld_a+15]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_a+3] src0_sel:WORD_1
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_a+7] src0_sel:WORD_1
    v_pack_b32_f16 v[v_tmp+2], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+2], v[v_gld_a+3], v[v_gld_a+7] op_sel:[1, 1]
    .endif
    .if igemm_wrw_fp16_alt_impl == 1
    v_cvt_f32_f16 v[v_tmp+6], v[v_gld_a+11] src0_sel:WORD_1
    v_cvt_f32_f16 v[v_tmp+6+1], v[v_gld_a+15] src0_sel:WORD_1
    v_pack_b32_f16 v[v_tmp+3], v[v_tmp+6], v[v_tmp+6+1]  op_sel:[1, 1]
    .else
    v_pack_b32_f16 v[v_tmp+3], v[v_gld_a+11], v[v_gld_a+15] op_sel:[1, 1]
    .endif
    ds_write_b128 v[v_sst_a_os], v[v_tmp:v_tmp+3] offset:48
    v_mfma_f32_32x32x8_16f_m a[a_c+48:a_c+63], v[v_a+10:v_a+11], v[v_b+10:v_b+11], a[a_c+48:a_c+63]     ; repeat:0x0, step:1x1, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+64:a_c+79], v[v_a+8:v_a+9], v[v_b+12:v_b+13], a[a_c+64:a_c+79]     ; repeat:0x1, step:0x0, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+96:a_c+111], v[v_a+10:v_a+11], v[v_b+12:v_b+13], a[a_c+96:a_c+111]     ; repeat:0x1, step:1x0, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+80:a_c+95], v[v_a+8:v_a+9], v[v_b+14:v_b+15], a[a_c+80:a_c+95]     ; repeat:0x1, step:0x1, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+112:a_c+127], v[v_a+10:v_a+11], v[v_b+14:v_b+15], a[a_c+112:a_c+127]     ; repeat:0x1, step:1x1, num_a_c:16
    s_sub_i32 s[s_kitr], s[s_kitr], 32
    s_cmp_gt_i32 s[s_kitr], 0
    s_cbranch_scc0 L_igemm_wrw_gtcx2_nhwc_fp16_bx0_ex1_bt256x256x32_wt32x32x8_ws2x2_wr2x2_ta1x4x1x8_1x8x1x32_tb1x4x1x8_1x8x1x32_vs1_gkgs_mfma_finishing
    s_waitcnt lgkmcnt(0)
    s_barrier
    ds_read2_b64 v[v_a+0:v_a+3], v[v_sld_a_os], offset0:0, offset1:36
    ds_read2_b64 v[v_b+0:v_b+3], v[v_sld_b_os], offset0:0, offset1:36
    v_mfma_f32_32x32x8_16f_m a[a_c+128:a_c+143], v[v_a+12:v_a+13], v[v_b+8:v_b+9], a[a_c+128:a_c+143]     ; repeat:1x0, step:0x0, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+160:a_c+175], v[v_a+14:v_a+15], v[v_b+8:v_b+9], a[a_c+160:a_c+175]     ; repeat:1x0, step:1x0, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+144:a_c+159], v[v_a+12:v_a+13], v[v_b+10:v_b+11], a[a_c+144:a_c+159]     ; repeat:1x0, step:0x1, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+176:a_c+191], v[v_a+14:v_a+15], v[v_b+10:v_b+11], a[a_c+176:a_c+191]     ; repeat:1x0, step:1x1, num_a_c:16
    ds_read2_b64 v[v_b+4+0:v_b+4+3], v[v_sld_b_os], offset0:144, offset1:180
    ds_read2_b64 v[v_a+4+0:v_a+4+3], v[v_sld_a_os], offset0:144, offset1:180
    v_mfma_f32_32x32x8_16f_m a[a_c+192:a_c+207], v[v_a+12:v_a+13], v[v_b+12:v_b+13], a[a_c+192:a_c+207]     ; repeat:1x1, step:0x0, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+224:a_c+239], v[v_a+14:v_a+15], v[v_b+12:v_b+13], a[a_c+224:a_c+239]     ; repeat:1x1, step:1x0, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+208:a_c+223], v[v_a+12:v_a+13], v[v_b+14:v_b+15], a[a_c+208:a_c+223]     ; repeat:1x1, step:0x1, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+240:a_c+255], v[v_a+14:v_a+15], v[v_b+14:v_b+15], a[a_c+240:a_c+255]     ; repeat:1x1, step:1x1, num_a_c:16
    s_branch L_igemm_wrw_gtcx2_nhwc_fp16_bx0_ex1_bt256x256x32_wt32x32x8_ws2x2_wr2x2_ta1x4x1x8_1x8x1x32_tb1x4x1x8_1x8x1x32_vs1_gkgs_mfma_body
L_igemm_wrw_gtcx2_nhwc_fp16_bx0_ex1_bt256x256x32_wt32x32x8_ws2x2_wr2x2_ta1x4x1x8_1x8x1x32_tb1x4x1x8_1x8x1x32_vs1_gkgs_mfma_finishing:
    v_mfma_f32_32x32x8_16f_m a[a_c+128:a_c+143], v[v_a+12:v_a+13], v[v_b+8:v_b+9], a[a_c+128:a_c+143]     ; repeat:1x0, step:0x0, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+160:a_c+175], v[v_a+14:v_a+15], v[v_b+8:v_b+9], a[a_c+160:a_c+175]     ; repeat:1x0, step:1x0, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+144:a_c+159], v[v_a+12:v_a+13], v[v_b+10:v_b+11], a[a_c+144:a_c+159]     ; repeat:1x0, step:0x1, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+176:a_c+191], v[v_a+14:v_a+15], v[v_b+10:v_b+11], a[a_c+176:a_c+191]     ; repeat:1x0, step:1x1, num_a_c:16

    v_mfma_f32_32x32x8_16f_m a[a_c+192:a_c+207], v[v_a+12:v_a+13], v[v_b+12:v_b+13], a[a_c+192:a_c+207]     ; repeat:1x1, step:0x0, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+224:a_c+239], v[v_a+14:v_a+15], v[v_b+12:v_b+13], a[a_c+224:a_c+239]     ; repeat:1x1, step:1x0, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+208:a_c+223], v[v_a+12:v_a+13], v[v_b+14:v_b+15], a[a_c+208:a_c+223]     ; repeat:1x1, step:0x1, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+240:a_c+255], v[v_a+14:v_a+15], v[v_b+14:v_b+15], a[a_c+240:a_c+255]     ; repeat:1x1, step:1x1, num_a_c:16

L_igemm_wrw_gtcx2_nhwc_fp16_bx0_ex1_bt256x256x32_wt32x32x8_ws2x2_wr2x2_ta1x4x1x8_1x8x1x32_tb1x4x1x8_1x8x1x32_vs1_gkgs_mfma_end:
    s_waitcnt lgkmcnt(0)
    s_barrier
    ds_read2_b64 v[v_a+0:v_a+3], v[v_sld_a_os], offset0:0, offset1:36
    ds_read2_b64 v[v_b+0:v_b+3], v[v_sld_b_os], offset0:0, offset1:36
    ds_read2_b64 v[v_b+4+0:v_b+4+3], v[v_sld_b_os], offset0:144, offset1:180
    ds_read2_b64 v[v_a+4+0:v_a+4+3], v[v_sld_a_os], offset0:144, offset1:180
    ; k iteration : 0
    s_waitcnt lgkmcnt(2)
    v_mfma_f32_32x32x8_16f_m a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15]     ; repeat:0x0, step:0x0, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47]     ; repeat:0x0, step:1x0, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31]     ; repeat:0x0, step:0x1, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63]     ; repeat:0x0, step:1x1, num_a_c:16
    v_add_u32 v[v_tmp+4], 4608, v[v_sld_a_os]
    ds_read2_b64 v[v_a+8+0:v_a+8+3], v[v_tmp+4], offset0:0, offset1:36 ; load i_k:1 into local buffer 1, repeat 0
    v_add_u32 v[v_tmp+5], 4608, v[v_sld_b_os]
    ds_read2_b64 v[v_b+8+0:v_b+8+3], v[v_tmp+5], offset0:0, offset1:36 ; load i_k:1 into local buffer 1, repeat 0

    s_waitcnt lgkmcnt(3)
    v_mfma_f32_32x32x8_16f_m a[a_c+64:a_c+79], v[v_a+0:v_a+1], v[v_b+4:v_b+5], a[a_c+64:a_c+79]     ; repeat:0x1, step:0x0, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+96:a_c+111], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+96:a_c+111]     ; repeat:0x1, step:1x0, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+80:a_c+95], v[v_a+0:v_a+1], v[v_b+6:v_b+7], a[a_c+80:a_c+95]     ; repeat:0x1, step:0x1, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+112:a_c+127], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+112:a_c+127]     ; repeat:0x1, step:1x1, num_a_c:16
    ds_read2_b64 v[v_b+12+0:v_b+12+3], v[v_tmp+5], offset0:144, offset1:180 ; load i_k:1 into local buffer 1, repeat 1
    ds_read2_b64 v[v_a+12+0:v_a+12+3], v[v_tmp+4], offset0:144, offset1:180 ; load i_k:1 into local buffer 1, repeat 1

    s_waitcnt lgkmcnt(4)
    v_mfma_f32_32x32x8_16f_m a[a_c+128:a_c+143], v[v_a+4:v_a+5], v[v_b+0:v_b+1], a[a_c+128:a_c+143]     ; repeat:1x0, step:0x0, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+160:a_c+175], v[v_a+6:v_a+7], v[v_b+0:v_b+1], a[a_c+160:a_c+175]     ; repeat:1x0, step:1x0, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+144:a_c+159], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+144:a_c+159]     ; repeat:1x0, step:0x1, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+176:a_c+191], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+176:a_c+191]     ; repeat:1x0, step:1x1, num_a_c:16
    v_add_u32 v[v_tmp+4], 4608, v[v_tmp+4]
    ds_read2_b64 v[v_a+0:v_a+3], v[v_tmp+4], offset0:0, offset1:36 ; load i_k:2 into local buffer 0, repeat 0
    v_add_u32 v[v_tmp+5], 4608, v[v_tmp+5]
    ds_read2_b64 v[v_b+0:v_b+3], v[v_tmp+5], offset0:0, offset1:36 ; load i_k:2 into local buffer 0, repeat 0

    v_mfma_f32_32x32x8_16f_m a[a_c+192:a_c+207], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+192:a_c+207]     ; repeat:1x1, step:0x0, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+224:a_c+239], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+224:a_c+239]     ; repeat:1x1, step:1x0, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+208:a_c+223], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+208:a_c+223]     ; repeat:1x1, step:0x1, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+240:a_c+255], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+240:a_c+255]     ; repeat:1x1, step:1x1, num_a_c:16
    ds_read2_b64 v[v_b+4+0:v_b+4+3], v[v_tmp+5], offset0:144, offset1:180 ; load i_k:2 into local buffer 0, repeat 1

    ; k iteration : 1
    s_waitcnt lgkmcnt(5)
    v_mfma_f32_32x32x8_16f_m a[a_c+0:a_c+15], v[v_a+8:v_a+9], v[v_b+8:v_b+9], a[a_c+0:a_c+15]     ; repeat:0x0, step:0x0, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+32:a_c+47], v[v_a+10:v_a+11], v[v_b+8:v_b+9], a[a_c+32:a_c+47]     ; repeat:0x0, step:1x0, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+16:a_c+31], v[v_a+8:v_a+9], v[v_b+10:v_b+11], a[a_c+16:a_c+31]     ; repeat:0x0, step:0x1, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+48:a_c+63], v[v_a+10:v_a+11], v[v_b+10:v_b+11], a[a_c+48:a_c+63]     ; repeat:0x0, step:1x1, num_a_c:16
    ds_read2_b64 v[v_a+4+0:v_a+4+3], v[v_tmp+4], offset0:144, offset1:180 ; load i_k:2 into local buffer 0, repeat 1

    s_waitcnt lgkmcnt(5)
    v_mfma_f32_32x32x8_16f_m a[a_c+64:a_c+79], v[v_a+8:v_a+9], v[v_b+12:v_b+13], a[a_c+64:a_c+79]     ; repeat:0x1, step:0x0, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+96:a_c+111], v[v_a+10:v_a+11], v[v_b+12:v_b+13], a[a_c+96:a_c+111]     ; repeat:0x1, step:1x0, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+80:a_c+95], v[v_a+8:v_a+9], v[v_b+14:v_b+15], a[a_c+80:a_c+95]     ; repeat:0x1, step:0x1, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+112:a_c+127], v[v_a+10:v_a+11], v[v_b+14:v_b+15], a[a_c+112:a_c+127]     ; repeat:0x1, step:1x1, num_a_c:16
    v_add_u32 v[v_tmp+4], 4608, v[v_tmp+4]
    ds_read2_b64 v[v_a+8+0:v_a+8+3], v[v_tmp+4], offset0:0, offset1:36 ; load i_k:3 into local buffer 1, repeat 0

    s_waitcnt lgkmcnt(5)
    v_mfma_f32_32x32x8_16f_m a[a_c+128:a_c+143], v[v_a+12:v_a+13], v[v_b+8:v_b+9], a[a_c+128:a_c+143]     ; repeat:1x0, step:0x0, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+160:a_c+175], v[v_a+14:v_a+15], v[v_b+8:v_b+9], a[a_c+160:a_c+175]     ; repeat:1x0, step:1x0, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+144:a_c+159], v[v_a+12:v_a+13], v[v_b+10:v_b+11], a[a_c+144:a_c+159]     ; repeat:1x0, step:0x1, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+176:a_c+191], v[v_a+14:v_a+15], v[v_b+10:v_b+11], a[a_c+176:a_c+191]     ; repeat:1x0, step:1x1, num_a_c:16
    v_add_u32 v[v_tmp+5], 4608, v[v_tmp+5]
    ds_read2_b64 v[v_b+8+0:v_b+8+3], v[v_tmp+5], offset0:0, offset1:36 ; load i_k:3 into local buffer 1, repeat 0

    v_mfma_f32_32x32x8_16f_m a[a_c+192:a_c+207], v[v_a+12:v_a+13], v[v_b+12:v_b+13], a[a_c+192:a_c+207]     ; repeat:1x1, step:0x0, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+224:a_c+239], v[v_a+14:v_a+15], v[v_b+12:v_b+13], a[a_c+224:a_c+239]     ; repeat:1x1, step:1x0, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+208:a_c+223], v[v_a+12:v_a+13], v[v_b+14:v_b+15], a[a_c+208:a_c+223]     ; repeat:1x1, step:0x1, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+240:a_c+255], v[v_a+14:v_a+15], v[v_b+14:v_b+15], a[a_c+240:a_c+255]     ; repeat:1x1, step:1x1, num_a_c:16
    ds_read2_b64 v[v_b+12+0:v_b+12+3], v[v_tmp+5], offset0:144, offset1:180 ; load i_k:3 into local buffer 1, repeat 1
    ds_read2_b64 v[v_a+12+0:v_a+12+3], v[v_tmp+4], offset0:144, offset1:180 ; load i_k:3 into local buffer 1, repeat 1
    s_waitcnt lgkmcnt(6)
    v_mfma_f32_32x32x8_16f_m a[a_c+0:a_c+15], v[v_a+0:v_a+1], v[v_b+0:v_b+1], a[a_c+0:a_c+15]     ; repeat:0x0, step:0x0, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+32:a_c+47], v[v_a+2:v_a+3], v[v_b+0:v_b+1], a[a_c+32:a_c+47]     ; repeat:0x0, step:1x0, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+16:a_c+31], v[v_a+0:v_a+1], v[v_b+2:v_b+3], a[a_c+16:a_c+31]     ; repeat:0x0, step:0x1, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+48:a_c+63], v[v_a+2:v_a+3], v[v_b+2:v_b+3], a[a_c+48:a_c+63]     ; repeat:0x0, step:1x1, num_a_c:16

    s_waitcnt lgkmcnt(5)
    v_mfma_f32_32x32x8_16f_m a[a_c+64:a_c+79], v[v_a+0:v_a+1], v[v_b+4:v_b+5], a[a_c+64:a_c+79]     ; repeat:0x1, step:0x0, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+96:a_c+111], v[v_a+2:v_a+3], v[v_b+4:v_b+5], a[a_c+96:a_c+111]     ; repeat:0x1, step:1x0, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+80:a_c+95], v[v_a+0:v_a+1], v[v_b+6:v_b+7], a[a_c+80:a_c+95]     ; repeat:0x1, step:0x1, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+112:a_c+127], v[v_a+2:v_a+3], v[v_b+6:v_b+7], a[a_c+112:a_c+127]     ; repeat:0x1, step:1x1, num_a_c:16

    s_waitcnt lgkmcnt(4)
    v_mfma_f32_32x32x8_16f_m a[a_c+128:a_c+143], v[v_a+4:v_a+5], v[v_b+0:v_b+1], a[a_c+128:a_c+143]     ; repeat:1x0, step:0x0, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+160:a_c+175], v[v_a+6:v_a+7], v[v_b+0:v_b+1], a[a_c+160:a_c+175]     ; repeat:1x0, step:1x0, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+144:a_c+159], v[v_a+4:v_a+5], v[v_b+2:v_b+3], a[a_c+144:a_c+159]     ; repeat:1x0, step:0x1, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+176:a_c+191], v[v_a+6:v_a+7], v[v_b+2:v_b+3], a[a_c+176:a_c+191]     ; repeat:1x0, step:1x1, num_a_c:16

    ; k iteration : 30
    v_mfma_f32_32x32x8_16f_m a[a_c+192:a_c+207], v[v_a+4:v_a+5], v[v_b+4:v_b+5], a[a_c+192:a_c+207]     ; repeat:1x1, step:0x0, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+224:a_c+239], v[v_a+6:v_a+7], v[v_b+4:v_b+5], a[a_c+224:a_c+239]     ; repeat:1x1, step:1x0, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+208:a_c+223], v[v_a+4:v_a+5], v[v_b+6:v_b+7], a[a_c+208:a_c+223]     ; repeat:1x1, step:0x1, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+240:a_c+255], v[v_a+6:v_a+7], v[v_b+6:v_b+7], a[a_c+240:a_c+255]     ; repeat:1x1, step:1x1, num_a_c:16
    ; k iteration : 31
    s_waitcnt lgkmcnt(2)
    v_mfma_f32_32x32x8_16f_m a[a_c+0:a_c+15], v[v_a+8:v_a+9], v[v_b+8:v_b+9], a[a_c+0:a_c+15]     ; repeat:0x0, step:0x0, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+32:a_c+47], v[v_a+10:v_a+11], v[v_b+8:v_b+9], a[a_c+32:a_c+47]     ; repeat:0x0, step:1x0, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+16:a_c+31], v[v_a+8:v_a+9], v[v_b+10:v_b+11], a[a_c+16:a_c+31]     ; repeat:0x0, step:0x1, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+48:a_c+63], v[v_a+10:v_a+11], v[v_b+10:v_b+11], a[a_c+48:a_c+63]     ; repeat:0x0, step:1x1, num_a_c:16

    s_waitcnt lgkmcnt(1)
    v_mfma_f32_32x32x8_16f_m a[a_c+64:a_c+79], v[v_a+8:v_a+9], v[v_b+12:v_b+13], a[a_c+64:a_c+79]     ; repeat:0x1, step:0x0, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+96:a_c+111], v[v_a+10:v_a+11], v[v_b+12:v_b+13], a[a_c+96:a_c+111]     ; repeat:0x1, step:1x0, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+80:a_c+95], v[v_a+8:v_a+9], v[v_b+14:v_b+15], a[a_c+80:a_c+95]     ; repeat:0x1, step:0x1, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+112:a_c+127], v[v_a+10:v_a+11], v[v_b+14:v_b+15], a[a_c+112:a_c+127]     ; repeat:0x1, step:1x1, num_a_c:16

    s_waitcnt lgkmcnt(0)
    v_mfma_f32_32x32x8_16f_m a[a_c+128:a_c+143], v[v_a+12:v_a+13], v[v_b+8:v_b+9], a[a_c+128:a_c+143]     ; repeat:1x0, step:0x0, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+160:a_c+175], v[v_a+14:v_a+15], v[v_b+8:v_b+9], a[a_c+160:a_c+175]     ; repeat:1x0, step:1x0, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+144:a_c+159], v[v_a+12:v_a+13], v[v_b+10:v_b+11], a[a_c+144:a_c+159]     ; repeat:1x0, step:0x1, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+176:a_c+191], v[v_a+14:v_a+15], v[v_b+10:v_b+11], a[a_c+176:a_c+191]     ; repeat:1x0, step:1x1, num_a_c:16

    v_mfma_f32_32x32x8_16f_m a[a_c+192:a_c+207], v[v_a+12:v_a+13], v[v_b+12:v_b+13], a[a_c+192:a_c+207]     ; repeat:1x1, step:0x0, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+224:a_c+239], v[v_a+14:v_a+15], v[v_b+12:v_b+13], a[a_c+224:a_c+239]     ; repeat:1x1, step:1x0, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+208:a_c+223], v[v_a+12:v_a+13], v[v_b+14:v_b+15], a[a_c+208:a_c+223]     ; repeat:1x1, step:0x1, num_a_c:16
    v_mfma_f32_32x32x8_16f_m a[a_c+240:a_c+255], v[v_a+14:v_a+15], v[v_b+14:v_b+15], a[a_c+240:a_c+255]     ; repeat:1x1, step:1x1, num_a_c:16

    s_nop 15
    s_nop 2
    ; coalescing store, mapping:mt_m:256, mt_n:256, wt_m:32, wt_n:32, ws:4, r_m:2, r_n:2, s_m:2, s_n:2 | 32x32x8, lanegroup_m_tcbw:4x2x4x1, lanegroup_n_tcbw:1x32x1x1
    ; coalescing_groups:8, num_dword_per_group:32
    ; init_co_sub_m_index xdlops, block_size:256, macro-tile:256x256 sub_m_index:[0]
    ; g_mr:2, g_ms:2, g_mw:1, g_mb:2, g_mt:1 | l_mr:1, l_ms:1, l_mw:1, l_mb:2, l_mt:4 | n_mc:2, n_ml:1, n_mv:2
    ; nd_stride:[2, 1, 4, 1, 2, 2, 1]
    ; start group 0, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 0
    s_barrier
    v_accvgpr_read_b32 v[v_c], a[a_c]
    v_accvgpr_read_b32 v[v_c+1], a[a_c+1]
    v_accvgpr_read_b32 v[v_c+2], a[a_c+2]
    v_accvgpr_read_b32 v[v_c+3], a[a_c+3]
    ds_write_b128 v[v_co_sst], v[v_c:v_c+3]    ; idword:0(0,0),  0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+4], a[a_c+16]
    v_accvgpr_read_b32 v[v_c+5], a[a_c+17]
    v_accvgpr_read_b32 v[v_c+6], a[a_c+18]
    v_accvgpr_read_b32 v[v_c+7], a[a_c+19]
    ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512   ; idword:32(0,32),  0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c+8], a[a_c+64]
    v_accvgpr_read_b32 v[v_c+9], a[a_c+65]
    v_accvgpr_read_b32 v[v_c+10], a[a_c+66]
    v_accvgpr_read_b32 v[v_c+11], a[a_c+67]
    ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048   ; idword:128(0,128),  0x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+12], a[a_c+80]
    v_accvgpr_read_b32 v[v_c+13], a[a_c+81]
    v_accvgpr_read_b32 v[v_c+14], a[a_c+82]
    v_accvgpr_read_b32 v[v_c+15], a[a_c+83]
    ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:2560   ; idword:160(0,160),  0x160 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c], a[a_c+4]
    v_accvgpr_read_b32 v[v_c+1], a[a_c+5]
    v_accvgpr_read_b32 v[v_c+2], a[a_c+6]
    v_accvgpr_read_b32 v[v_c+3], a[a_c+7]
    ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:8192   ; idword:512(2,0),  2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+4], a[a_c+20]
    v_accvgpr_read_b32 v[v_c+5], a[a_c+21]
    v_accvgpr_read_b32 v[v_c+6], a[a_c+22]
    v_accvgpr_read_b32 v[v_c+7], a[a_c+23]
    ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:8704   ; idword:544(2,32),  2x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c+8], a[a_c+68]
    v_accvgpr_read_b32 v[v_c+9], a[a_c+69]
    v_accvgpr_read_b32 v[v_c+10], a[a_c+70]
    v_accvgpr_read_b32 v[v_c+11], a[a_c+71]
    ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:10240   ; idword:640(2,128),  2x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+12], a[a_c+84]
    v_accvgpr_read_b32 v[v_c+13], a[a_c+85]
    v_accvgpr_read_b32 v[v_c+14], a[a_c+86]
    v_accvgpr_read_b32 v[v_c+15], a[a_c+87]
    ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:10752   ; idword:672(2,160),  2x160 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:1, i_nw:0
    s_mov_b32 s[s_tmp], 0   ; i_m:0(i_m0:0,i_m1:0)
    v_add_u32 v[v_cur_k], s[s_block_gtc_ik], v[v_co_sub_m_index]
    v_mov_b32 v[v_tmp], v[v_cur_k]
    s_waitcnt lgkmcnt(0)
    s_barrier
    ;   load from lds, i_ssgroup:0, num_sld_per_ssgroup:4
    ds_read_b128 v[v_c:v_c+3], v[v_co_sld] 
    ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096
    ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192
    ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288
    v_cmpx_eq_u32 vcc, 1, v[v_wei_c_flag]
    ;   store to global, m index start from 0, m0:0, m1:0
    s_waitcnt lgkmcnt(3)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mov_b32 s[s_tmp], s[s_wei_stride_k]   ; i_m:1(i_m0:0,i_m1:1)
    v_add_u32 v[v_tmp], 1, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 2, s[s_wei_stride_k]   ; i_m:2(i_m0:0,i_m1:2)
    v_add_u32 v[v_tmp], 2, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 3, s[s_wei_stride_k]   ; i_m:3(i_m0:0,i_m1:3)
    v_add_u32 v[v_tmp], 3, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 4, s[s_wei_stride_k]   ; i_m:4(i_m0:0,i_m1:4)
    v_add_u32 v[v_tmp], 4, v[v_cur_k]
    s_waitcnt lgkmcnt(2)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 5, s[s_wei_stride_k]   ; i_m:5(i_m0:0,i_m1:5)
    v_add_u32 v[v_tmp], 5, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 6, s[s_wei_stride_k]   ; i_m:6(i_m0:0,i_m1:6)
    v_add_u32 v[v_tmp], 6, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 7, s[s_wei_stride_k]   ; i_m:7(i_m0:0,i_m1:7)
    v_add_u32 v[v_tmp], 7, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 8, s[s_wei_stride_k]   ; i_m:8(i_m0:0,i_m1:8)
    v_add_u32 v[v_tmp], 8, v[v_cur_k]
    s_waitcnt lgkmcnt(1)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+8], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 9, s[s_wei_stride_k]   ; i_m:9(i_m0:0,i_m1:9)
    v_add_u32 v[v_tmp], 9, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+9], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 10, s[s_wei_stride_k]   ; i_m:10(i_m0:0,i_m1:10)
    v_add_u32 v[v_tmp], 10, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+10], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 11, s[s_wei_stride_k]   ; i_m:11(i_m0:0,i_m1:11)
    v_add_u32 v[v_tmp], 11, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+11], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 12, s[s_wei_stride_k]   ; i_m:12(i_m0:0,i_m1:12)
    v_add_u32 v[v_tmp], 12, v[v_cur_k]
    s_waitcnt lgkmcnt(0)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+12], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 13, s[s_wei_stride_k]   ; i_m:13(i_m0:0,i_m1:13)
    v_add_u32 v[v_tmp], 13, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+13], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 14, s[s_wei_stride_k]   ; i_m:14(i_m0:0,i_m1:14)
    v_add_u32 v[v_tmp], 14, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+14], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 15, s[s_wei_stride_k]   ; i_m:15(i_m0:0,i_m1:15)
    v_add_u32 v[v_tmp], 15, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+15], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 64, s[s_wei_stride_k]   ; i_m:64(i_m0:0,i_m1:64)
    v_add_u32 v[v_tmp], 64, v[v_cur_k]
    s_mov_b64 exec, -1
    ;   load from lds, i_ssgroup:1, num_sld_per_ssgroup:4
    ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384
    ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480
    ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576
    ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672
    v_cmpx_eq_u32 vcc, 1, v[v_wei_c_flag]
    ;   store to global, m index start from 0, m0:0, m1:0
    s_waitcnt lgkmcnt(3)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 65, s[s_wei_stride_k]   ; i_m:65(i_m0:0,i_m1:65)
    v_add_u32 v[v_tmp], 65, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 66, s[s_wei_stride_k]   ; i_m:66(i_m0:0,i_m1:66)
    v_add_u32 v[v_tmp], 66, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 67, s[s_wei_stride_k]   ; i_m:67(i_m0:0,i_m1:67)
    v_add_u32 v[v_tmp], 67, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 68, s[s_wei_stride_k]   ; i_m:68(i_m0:0,i_m1:68)
    v_add_u32 v[v_tmp], 68, v[v_cur_k]
    s_waitcnt lgkmcnt(2)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 69, s[s_wei_stride_k]   ; i_m:69(i_m0:0,i_m1:69)
    v_add_u32 v[v_tmp], 69, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 70, s[s_wei_stride_k]   ; i_m:70(i_m0:0,i_m1:70)
    v_add_u32 v[v_tmp], 70, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 71, s[s_wei_stride_k]   ; i_m:71(i_m0:0,i_m1:71)
    v_add_u32 v[v_tmp], 71, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 72, s[s_wei_stride_k]   ; i_m:72(i_m0:0,i_m1:72)
    v_add_u32 v[v_tmp], 72, v[v_cur_k]
    s_waitcnt lgkmcnt(1)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+8], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 73, s[s_wei_stride_k]   ; i_m:73(i_m0:0,i_m1:73)
    v_add_u32 v[v_tmp], 73, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+9], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 74, s[s_wei_stride_k]   ; i_m:74(i_m0:0,i_m1:74)
    v_add_u32 v[v_tmp], 74, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+10], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 75, s[s_wei_stride_k]   ; i_m:75(i_m0:0,i_m1:75)
    v_add_u32 v[v_tmp], 75, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+11], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 76, s[s_wei_stride_k]   ; i_m:76(i_m0:0,i_m1:76)
    v_add_u32 v[v_tmp], 76, v[v_cur_k]
    s_waitcnt lgkmcnt(0)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+12], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 77, s[s_wei_stride_k]   ; i_m:77(i_m0:0,i_m1:77)
    v_add_u32 v[v_tmp], 77, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+13], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 78, s[s_wei_stride_k]   ; i_m:78(i_m0:0,i_m1:78)
    v_add_u32 v[v_tmp], 78, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+14], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 79, s[s_wei_stride_k]   ; i_m:79(i_m0:0,i_m1:79)
    v_add_u32 v[v_tmp], 79, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+15], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mov_b64 exec, -1
    ; start group 1, i_g_mr:0, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 16
    s_barrier
    v_accvgpr_read_b32 v[v_c], a[a_c+8]
    v_accvgpr_read_b32 v[v_c+1], a[a_c+9]
    v_accvgpr_read_b32 v[v_c+2], a[a_c+10]
    v_accvgpr_read_b32 v[v_c+3], a[a_c+11]
    ds_write_b128 v[v_co_sst], v[v_c:v_c+3]    ; idword:0(0,0),  0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+4], a[a_c+24]
    v_accvgpr_read_b32 v[v_c+5], a[a_c+25]
    v_accvgpr_read_b32 v[v_c+6], a[a_c+26]
    v_accvgpr_read_b32 v[v_c+7], a[a_c+27]
    ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512   ; idword:32(0,32),  0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c+8], a[a_c+72]
    v_accvgpr_read_b32 v[v_c+9], a[a_c+73]
    v_accvgpr_read_b32 v[v_c+10], a[a_c+74]
    v_accvgpr_read_b32 v[v_c+11], a[a_c+75]
    ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048   ; idword:128(0,128),  0x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+12], a[a_c+88]
    v_accvgpr_read_b32 v[v_c+13], a[a_c+89]
    v_accvgpr_read_b32 v[v_c+14], a[a_c+90]
    v_accvgpr_read_b32 v[v_c+15], a[a_c+91]
    ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:2560   ; idword:160(0,160),  0x160 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c], a[a_c+12]
    v_accvgpr_read_b32 v[v_c+1], a[a_c+13]
    v_accvgpr_read_b32 v[v_c+2], a[a_c+14]
    v_accvgpr_read_b32 v[v_c+3], a[a_c+15]
    ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:8192   ; idword:512(2,0),  2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+4], a[a_c+28]
    v_accvgpr_read_b32 v[v_c+5], a[a_c+29]
    v_accvgpr_read_b32 v[v_c+6], a[a_c+30]
    v_accvgpr_read_b32 v[v_c+7], a[a_c+31]
    ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:8704   ; idword:544(2,32),  2x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c+8], a[a_c+76]
    v_accvgpr_read_b32 v[v_c+9], a[a_c+77]
    v_accvgpr_read_b32 v[v_c+10], a[a_c+78]
    v_accvgpr_read_b32 v[v_c+11], a[a_c+79]
    ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:10240   ; idword:640(2,128),  2x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+12], a[a_c+92]
    v_accvgpr_read_b32 v[v_c+13], a[a_c+93]
    v_accvgpr_read_b32 v[v_c+14], a[a_c+94]
    v_accvgpr_read_b32 v[v_c+15], a[a_c+95]
    ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:10752   ; idword:672(2,160),  2x160 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:1, i_nw:0
    s_mul_i32 s[s_tmp], 16, s[s_wei_stride_k]   ; i_m:16(i_m0:0,i_m1:16)
    v_add_u32 v[v_tmp], 16, v[v_cur_k]
    s_waitcnt lgkmcnt(0)
    s_barrier
    ;   load from lds, i_ssgroup:0, num_sld_per_ssgroup:4
    ds_read_b128 v[v_c:v_c+3], v[v_co_sld] 
    ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096
    ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192
    ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288
    v_cmpx_eq_u32 vcc, 1, v[v_wei_c_flag]
    ;   store to global, m index start from 16, m0:0, m1:16
    s_waitcnt lgkmcnt(3)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 17, s[s_wei_stride_k]   ; i_m:17(i_m0:0,i_m1:17)
    v_add_u32 v[v_tmp], 17, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 18, s[s_wei_stride_k]   ; i_m:18(i_m0:0,i_m1:18)
    v_add_u32 v[v_tmp], 18, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 19, s[s_wei_stride_k]   ; i_m:19(i_m0:0,i_m1:19)
    v_add_u32 v[v_tmp], 19, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 20, s[s_wei_stride_k]   ; i_m:20(i_m0:0,i_m1:20)
    v_add_u32 v[v_tmp], 20, v[v_cur_k]
    s_waitcnt lgkmcnt(2)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 21, s[s_wei_stride_k]   ; i_m:21(i_m0:0,i_m1:21)
    v_add_u32 v[v_tmp], 21, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 22, s[s_wei_stride_k]   ; i_m:22(i_m0:0,i_m1:22)
    v_add_u32 v[v_tmp], 22, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 23, s[s_wei_stride_k]   ; i_m:23(i_m0:0,i_m1:23)
    v_add_u32 v[v_tmp], 23, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 24, s[s_wei_stride_k]   ; i_m:24(i_m0:0,i_m1:24)
    v_add_u32 v[v_tmp], 24, v[v_cur_k]
    s_waitcnt lgkmcnt(1)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+8], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 25, s[s_wei_stride_k]   ; i_m:25(i_m0:0,i_m1:25)
    v_add_u32 v[v_tmp], 25, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+9], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 26, s[s_wei_stride_k]   ; i_m:26(i_m0:0,i_m1:26)
    v_add_u32 v[v_tmp], 26, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+10], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 27, s[s_wei_stride_k]   ; i_m:27(i_m0:0,i_m1:27)
    v_add_u32 v[v_tmp], 27, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+11], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 28, s[s_wei_stride_k]   ; i_m:28(i_m0:0,i_m1:28)
    v_add_u32 v[v_tmp], 28, v[v_cur_k]
    s_waitcnt lgkmcnt(0)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+12], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 29, s[s_wei_stride_k]   ; i_m:29(i_m0:0,i_m1:29)
    v_add_u32 v[v_tmp], 29, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+13], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 30, s[s_wei_stride_k]   ; i_m:30(i_m0:0,i_m1:30)
    v_add_u32 v[v_tmp], 30, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+14], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 31, s[s_wei_stride_k]   ; i_m:31(i_m0:0,i_m1:31)
    v_add_u32 v[v_tmp], 31, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+15], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 80, s[s_wei_stride_k]   ; i_m:80(i_m0:0,i_m1:80)
    v_add_u32 v[v_tmp], 80, v[v_cur_k]
    s_mov_b64 exec, -1
    ;   load from lds, i_ssgroup:1, num_sld_per_ssgroup:4
    ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384
    ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480
    ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576
    ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672
    v_cmpx_eq_u32 vcc, 1, v[v_wei_c_flag]
    ;   store to global, m index start from 16, m0:0, m1:16
    s_waitcnt lgkmcnt(3)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 81, s[s_wei_stride_k]   ; i_m:81(i_m0:0,i_m1:81)
    v_add_u32 v[v_tmp], 81, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 82, s[s_wei_stride_k]   ; i_m:82(i_m0:0,i_m1:82)
    v_add_u32 v[v_tmp], 82, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 83, s[s_wei_stride_k]   ; i_m:83(i_m0:0,i_m1:83)
    v_add_u32 v[v_tmp], 83, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 84, s[s_wei_stride_k]   ; i_m:84(i_m0:0,i_m1:84)
    v_add_u32 v[v_tmp], 84, v[v_cur_k]
    s_waitcnt lgkmcnt(2)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 85, s[s_wei_stride_k]   ; i_m:85(i_m0:0,i_m1:85)
    v_add_u32 v[v_tmp], 85, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 86, s[s_wei_stride_k]   ; i_m:86(i_m0:0,i_m1:86)
    v_add_u32 v[v_tmp], 86, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 87, s[s_wei_stride_k]   ; i_m:87(i_m0:0,i_m1:87)
    v_add_u32 v[v_tmp], 87, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 88, s[s_wei_stride_k]   ; i_m:88(i_m0:0,i_m1:88)
    v_add_u32 v[v_tmp], 88, v[v_cur_k]
    s_waitcnt lgkmcnt(1)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+8], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 89, s[s_wei_stride_k]   ; i_m:89(i_m0:0,i_m1:89)
    v_add_u32 v[v_tmp], 89, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+9], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 90, s[s_wei_stride_k]   ; i_m:90(i_m0:0,i_m1:90)
    v_add_u32 v[v_tmp], 90, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+10], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 91, s[s_wei_stride_k]   ; i_m:91(i_m0:0,i_m1:91)
    v_add_u32 v[v_tmp], 91, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+11], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 92, s[s_wei_stride_k]   ; i_m:92(i_m0:0,i_m1:92)
    v_add_u32 v[v_tmp], 92, v[v_cur_k]
    s_waitcnt lgkmcnt(0)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+12], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 93, s[s_wei_stride_k]   ; i_m:93(i_m0:0,i_m1:93)
    v_add_u32 v[v_tmp], 93, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+13], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 94, s[s_wei_stride_k]   ; i_m:94(i_m0:0,i_m1:94)
    v_add_u32 v[v_tmp], 94, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+14], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 95, s[s_wei_stride_k]   ; i_m:95(i_m0:0,i_m1:95)
    v_add_u32 v[v_tmp], 95, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+15], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mov_b64 exec, -1
    ; start group 2, i_g_mr:0, i_g_ms:1, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 32
    s_barrier
    v_accvgpr_read_b32 v[v_c], a[a_c+32]
    v_accvgpr_read_b32 v[v_c+1], a[a_c+33]
    v_accvgpr_read_b32 v[v_c+2], a[a_c+34]
    v_accvgpr_read_b32 v[v_c+3], a[a_c+35]
    ds_write_b128 v[v_co_sst], v[v_c:v_c+3]    ; idword:0(0,0),  0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+4], a[a_c+48]
    v_accvgpr_read_b32 v[v_c+5], a[a_c+49]
    v_accvgpr_read_b32 v[v_c+6], a[a_c+50]
    v_accvgpr_read_b32 v[v_c+7], a[a_c+51]
    ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512   ; idword:32(0,32),  0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c+8], a[a_c+96]
    v_accvgpr_read_b32 v[v_c+9], a[a_c+97]
    v_accvgpr_read_b32 v[v_c+10], a[a_c+98]
    v_accvgpr_read_b32 v[v_c+11], a[a_c+99]
    ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048   ; idword:128(0,128),  0x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+12], a[a_c+112]
    v_accvgpr_read_b32 v[v_c+13], a[a_c+113]
    v_accvgpr_read_b32 v[v_c+14], a[a_c+114]
    v_accvgpr_read_b32 v[v_c+15], a[a_c+115]
    ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:2560   ; idword:160(0,160),  0x160 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c], a[a_c+36]
    v_accvgpr_read_b32 v[v_c+1], a[a_c+37]
    v_accvgpr_read_b32 v[v_c+2], a[a_c+38]
    v_accvgpr_read_b32 v[v_c+3], a[a_c+39]
    ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:8192   ; idword:512(2,0),  2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+4], a[a_c+52]
    v_accvgpr_read_b32 v[v_c+5], a[a_c+53]
    v_accvgpr_read_b32 v[v_c+6], a[a_c+54]
    v_accvgpr_read_b32 v[v_c+7], a[a_c+55]
    ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:8704   ; idword:544(2,32),  2x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c+8], a[a_c+100]
    v_accvgpr_read_b32 v[v_c+9], a[a_c+101]
    v_accvgpr_read_b32 v[v_c+10], a[a_c+102]
    v_accvgpr_read_b32 v[v_c+11], a[a_c+103]
    ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:10240   ; idword:640(2,128),  2x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+12], a[a_c+116]
    v_accvgpr_read_b32 v[v_c+13], a[a_c+117]
    v_accvgpr_read_b32 v[v_c+14], a[a_c+118]
    v_accvgpr_read_b32 v[v_c+15], a[a_c+119]
    ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:10752   ; idword:672(2,160),  2x160 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:1, i_nw:0
    s_mul_i32 s[s_tmp], 32, s[s_wei_stride_k]   ; i_m:32(i_m0:0,i_m1:32)
    v_add_u32 v[v_tmp], 32, v[v_cur_k]
    s_waitcnt lgkmcnt(0)
    s_barrier
    ;   load from lds, i_ssgroup:0, num_sld_per_ssgroup:4
    ds_read_b128 v[v_c:v_c+3], v[v_co_sld] 
    ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096
    ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192
    ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288
    v_cmpx_eq_u32 vcc, 1, v[v_wei_c_flag]
    ;   store to global, m index start from 32, m0:0, m1:32
    s_waitcnt lgkmcnt(3)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 33, s[s_wei_stride_k]   ; i_m:33(i_m0:0,i_m1:33)
    v_add_u32 v[v_tmp], 33, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 34, s[s_wei_stride_k]   ; i_m:34(i_m0:0,i_m1:34)
    v_add_u32 v[v_tmp], 34, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 35, s[s_wei_stride_k]   ; i_m:35(i_m0:0,i_m1:35)
    v_add_u32 v[v_tmp], 35, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 36, s[s_wei_stride_k]   ; i_m:36(i_m0:0,i_m1:36)
    v_add_u32 v[v_tmp], 36, v[v_cur_k]
    s_waitcnt lgkmcnt(2)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 37, s[s_wei_stride_k]   ; i_m:37(i_m0:0,i_m1:37)
    v_add_u32 v[v_tmp], 37, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 38, s[s_wei_stride_k]   ; i_m:38(i_m0:0,i_m1:38)
    v_add_u32 v[v_tmp], 38, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 39, s[s_wei_stride_k]   ; i_m:39(i_m0:0,i_m1:39)
    v_add_u32 v[v_tmp], 39, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 40, s[s_wei_stride_k]   ; i_m:40(i_m0:0,i_m1:40)
    v_add_u32 v[v_tmp], 40, v[v_cur_k]
    s_waitcnt lgkmcnt(1)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+8], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 41, s[s_wei_stride_k]   ; i_m:41(i_m0:0,i_m1:41)
    v_add_u32 v[v_tmp], 41, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+9], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 42, s[s_wei_stride_k]   ; i_m:42(i_m0:0,i_m1:42)
    v_add_u32 v[v_tmp], 42, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+10], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 43, s[s_wei_stride_k]   ; i_m:43(i_m0:0,i_m1:43)
    v_add_u32 v[v_tmp], 43, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+11], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 44, s[s_wei_stride_k]   ; i_m:44(i_m0:0,i_m1:44)
    v_add_u32 v[v_tmp], 44, v[v_cur_k]
    s_waitcnt lgkmcnt(0)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+12], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 45, s[s_wei_stride_k]   ; i_m:45(i_m0:0,i_m1:45)
    v_add_u32 v[v_tmp], 45, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+13], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 46, s[s_wei_stride_k]   ; i_m:46(i_m0:0,i_m1:46)
    v_add_u32 v[v_tmp], 46, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+14], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 47, s[s_wei_stride_k]   ; i_m:47(i_m0:0,i_m1:47)
    v_add_u32 v[v_tmp], 47, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+15], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 96, s[s_wei_stride_k]   ; i_m:96(i_m0:0,i_m1:96)
    v_add_u32 v[v_tmp], 96, v[v_cur_k]
    s_mov_b64 exec, -1
    ;   load from lds, i_ssgroup:1, num_sld_per_ssgroup:4
    ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384
    ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480
    ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576
    ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672
    v_cmpx_eq_u32 vcc, 1, v[v_wei_c_flag]
    ;   store to global, m index start from 32, m0:0, m1:32
    s_waitcnt lgkmcnt(3)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 97, s[s_wei_stride_k]   ; i_m:97(i_m0:0,i_m1:97)
    v_add_u32 v[v_tmp], 97, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 98, s[s_wei_stride_k]   ; i_m:98(i_m0:0,i_m1:98)
    v_add_u32 v[v_tmp], 98, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 99, s[s_wei_stride_k]   ; i_m:99(i_m0:0,i_m1:99)
    v_add_u32 v[v_tmp], 99, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 100, s[s_wei_stride_k]   ; i_m:100(i_m0:0,i_m1:100)
    v_add_u32 v[v_tmp], 100, v[v_cur_k]
    s_waitcnt lgkmcnt(2)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 101, s[s_wei_stride_k]   ; i_m:101(i_m0:0,i_m1:101)
    v_add_u32 v[v_tmp], 101, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 102, s[s_wei_stride_k]   ; i_m:102(i_m0:0,i_m1:102)
    v_add_u32 v[v_tmp], 102, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 103, s[s_wei_stride_k]   ; i_m:103(i_m0:0,i_m1:103)
    v_add_u32 v[v_tmp], 103, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 104, s[s_wei_stride_k]   ; i_m:104(i_m0:0,i_m1:104)
    v_add_u32 v[v_tmp], 104, v[v_cur_k]
    s_waitcnt lgkmcnt(1)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+8], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 105, s[s_wei_stride_k]   ; i_m:105(i_m0:0,i_m1:105)
    v_add_u32 v[v_tmp], 105, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+9], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 106, s[s_wei_stride_k]   ; i_m:106(i_m0:0,i_m1:106)
    v_add_u32 v[v_tmp], 106, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+10], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 107, s[s_wei_stride_k]   ; i_m:107(i_m0:0,i_m1:107)
    v_add_u32 v[v_tmp], 107, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+11], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 108, s[s_wei_stride_k]   ; i_m:108(i_m0:0,i_m1:108)
    v_add_u32 v[v_tmp], 108, v[v_cur_k]
    s_waitcnt lgkmcnt(0)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+12], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 109, s[s_wei_stride_k]   ; i_m:109(i_m0:0,i_m1:109)
    v_add_u32 v[v_tmp], 109, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+13], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 110, s[s_wei_stride_k]   ; i_m:110(i_m0:0,i_m1:110)
    v_add_u32 v[v_tmp], 110, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+14], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 111, s[s_wei_stride_k]   ; i_m:111(i_m0:0,i_m1:111)
    v_add_u32 v[v_tmp], 111, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+15], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mov_b64 exec, -1
    ; start group 3, i_g_mr:0, i_g_ms:1, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 48
    s_barrier
    v_accvgpr_read_b32 v[v_c], a[a_c+40]
    v_accvgpr_read_b32 v[v_c+1], a[a_c+41]
    v_accvgpr_read_b32 v[v_c+2], a[a_c+42]
    v_accvgpr_read_b32 v[v_c+3], a[a_c+43]
    ds_write_b128 v[v_co_sst], v[v_c:v_c+3]    ; idword:0(0,0),  0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+4], a[a_c+56]
    v_accvgpr_read_b32 v[v_c+5], a[a_c+57]
    v_accvgpr_read_b32 v[v_c+6], a[a_c+58]
    v_accvgpr_read_b32 v[v_c+7], a[a_c+59]
    ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512   ; idword:32(0,32),  0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c+8], a[a_c+104]
    v_accvgpr_read_b32 v[v_c+9], a[a_c+105]
    v_accvgpr_read_b32 v[v_c+10], a[a_c+106]
    v_accvgpr_read_b32 v[v_c+11], a[a_c+107]
    ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048   ; idword:128(0,128),  0x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+12], a[a_c+120]
    v_accvgpr_read_b32 v[v_c+13], a[a_c+121]
    v_accvgpr_read_b32 v[v_c+14], a[a_c+122]
    v_accvgpr_read_b32 v[v_c+15], a[a_c+123]
    ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:2560   ; idword:160(0,160),  0x160 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c], a[a_c+44]
    v_accvgpr_read_b32 v[v_c+1], a[a_c+45]
    v_accvgpr_read_b32 v[v_c+2], a[a_c+46]
    v_accvgpr_read_b32 v[v_c+3], a[a_c+47]
    ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:8192   ; idword:512(2,0),  2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+4], a[a_c+60]
    v_accvgpr_read_b32 v[v_c+5], a[a_c+61]
    v_accvgpr_read_b32 v[v_c+6], a[a_c+62]
    v_accvgpr_read_b32 v[v_c+7], a[a_c+63]
    ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:8704   ; idword:544(2,32),  2x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c+8], a[a_c+108]
    v_accvgpr_read_b32 v[v_c+9], a[a_c+109]
    v_accvgpr_read_b32 v[v_c+10], a[a_c+110]
    v_accvgpr_read_b32 v[v_c+11], a[a_c+111]
    ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:10240   ; idword:640(2,128),  2x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+12], a[a_c+124]
    v_accvgpr_read_b32 v[v_c+13], a[a_c+125]
    v_accvgpr_read_b32 v[v_c+14], a[a_c+126]
    v_accvgpr_read_b32 v[v_c+15], a[a_c+127]
    ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:10752   ; idword:672(2,160),  2x160 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:1, i_nw:0
    s_mul_i32 s[s_tmp], 48, s[s_wei_stride_k]   ; i_m:48(i_m0:0,i_m1:48)
    v_add_u32 v[v_tmp], 48, v[v_cur_k]
    s_waitcnt lgkmcnt(0)
    s_barrier
    ;   load from lds, i_ssgroup:0, num_sld_per_ssgroup:4
    ds_read_b128 v[v_c:v_c+3], v[v_co_sld] 
    ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096
    ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192
    ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288
    v_cmpx_eq_u32 vcc, 1, v[v_wei_c_flag]
    ;   store to global, m index start from 48, m0:0, m1:48
    s_waitcnt lgkmcnt(3)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 49, s[s_wei_stride_k]   ; i_m:49(i_m0:0,i_m1:49)
    v_add_u32 v[v_tmp], 49, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 50, s[s_wei_stride_k]   ; i_m:50(i_m0:0,i_m1:50)
    v_add_u32 v[v_tmp], 50, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 51, s[s_wei_stride_k]   ; i_m:51(i_m0:0,i_m1:51)
    v_add_u32 v[v_tmp], 51, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 52, s[s_wei_stride_k]   ; i_m:52(i_m0:0,i_m1:52)
    v_add_u32 v[v_tmp], 52, v[v_cur_k]
    s_waitcnt lgkmcnt(2)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 53, s[s_wei_stride_k]   ; i_m:53(i_m0:0,i_m1:53)
    v_add_u32 v[v_tmp], 53, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 54, s[s_wei_stride_k]   ; i_m:54(i_m0:0,i_m1:54)
    v_add_u32 v[v_tmp], 54, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 55, s[s_wei_stride_k]   ; i_m:55(i_m0:0,i_m1:55)
    v_add_u32 v[v_tmp], 55, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 56, s[s_wei_stride_k]   ; i_m:56(i_m0:0,i_m1:56)
    v_add_u32 v[v_tmp], 56, v[v_cur_k]
    s_waitcnt lgkmcnt(1)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+8], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 57, s[s_wei_stride_k]   ; i_m:57(i_m0:0,i_m1:57)
    v_add_u32 v[v_tmp], 57, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+9], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 58, s[s_wei_stride_k]   ; i_m:58(i_m0:0,i_m1:58)
    v_add_u32 v[v_tmp], 58, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+10], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 59, s[s_wei_stride_k]   ; i_m:59(i_m0:0,i_m1:59)
    v_add_u32 v[v_tmp], 59, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+11], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 60, s[s_wei_stride_k]   ; i_m:60(i_m0:0,i_m1:60)
    v_add_u32 v[v_tmp], 60, v[v_cur_k]
    s_waitcnt lgkmcnt(0)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+12], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 61, s[s_wei_stride_k]   ; i_m:61(i_m0:0,i_m1:61)
    v_add_u32 v[v_tmp], 61, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+13], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 62, s[s_wei_stride_k]   ; i_m:62(i_m0:0,i_m1:62)
    v_add_u32 v[v_tmp], 62, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+14], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 63, s[s_wei_stride_k]   ; i_m:63(i_m0:0,i_m1:63)
    v_add_u32 v[v_tmp], 63, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+15], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 112, s[s_wei_stride_k]   ; i_m:112(i_m0:0,i_m1:112)
    v_add_u32 v[v_tmp], 112, v[v_cur_k]
    s_mov_b64 exec, -1
    ;   load from lds, i_ssgroup:1, num_sld_per_ssgroup:4
    ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384
    ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480
    ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576
    ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672
    v_cmpx_eq_u32 vcc, 1, v[v_wei_c_flag]
    ;   store to global, m index start from 48, m0:0, m1:48
    s_waitcnt lgkmcnt(3)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 113, s[s_wei_stride_k]   ; i_m:113(i_m0:0,i_m1:113)
    v_add_u32 v[v_tmp], 113, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 114, s[s_wei_stride_k]   ; i_m:114(i_m0:0,i_m1:114)
    v_add_u32 v[v_tmp], 114, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 115, s[s_wei_stride_k]   ; i_m:115(i_m0:0,i_m1:115)
    v_add_u32 v[v_tmp], 115, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 116, s[s_wei_stride_k]   ; i_m:116(i_m0:0,i_m1:116)
    v_add_u32 v[v_tmp], 116, v[v_cur_k]
    s_waitcnt lgkmcnt(2)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 117, s[s_wei_stride_k]   ; i_m:117(i_m0:0,i_m1:117)
    v_add_u32 v[v_tmp], 117, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 118, s[s_wei_stride_k]   ; i_m:118(i_m0:0,i_m1:118)
    v_add_u32 v[v_tmp], 118, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 119, s[s_wei_stride_k]   ; i_m:119(i_m0:0,i_m1:119)
    v_add_u32 v[v_tmp], 119, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 120, s[s_wei_stride_k]   ; i_m:120(i_m0:0,i_m1:120)
    v_add_u32 v[v_tmp], 120, v[v_cur_k]
    s_waitcnt lgkmcnt(1)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+8], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 121, s[s_wei_stride_k]   ; i_m:121(i_m0:0,i_m1:121)
    v_add_u32 v[v_tmp], 121, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+9], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 122, s[s_wei_stride_k]   ; i_m:122(i_m0:0,i_m1:122)
    v_add_u32 v[v_tmp], 122, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+10], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 123, s[s_wei_stride_k]   ; i_m:123(i_m0:0,i_m1:123)
    v_add_u32 v[v_tmp], 123, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+11], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 124, s[s_wei_stride_k]   ; i_m:124(i_m0:0,i_m1:124)
    v_add_u32 v[v_tmp], 124, v[v_cur_k]
    s_waitcnt lgkmcnt(0)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+12], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 125, s[s_wei_stride_k]   ; i_m:125(i_m0:0,i_m1:125)
    v_add_u32 v[v_tmp], 125, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+13], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 126, s[s_wei_stride_k]   ; i_m:126(i_m0:0,i_m1:126)
    v_add_u32 v[v_tmp], 126, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+14], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 127, s[s_wei_stride_k]   ; i_m:127(i_m0:0,i_m1:127)
    v_add_u32 v[v_tmp], 127, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+15], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mov_b64 exec, -1
    ; start group 4, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 128
    s_barrier
    v_accvgpr_read_b32 v[v_c], a[a_c+128]
    v_accvgpr_read_b32 v[v_c+1], a[a_c+129]
    v_accvgpr_read_b32 v[v_c+2], a[a_c+130]
    v_accvgpr_read_b32 v[v_c+3], a[a_c+131]
    ds_write_b128 v[v_co_sst], v[v_c:v_c+3]    ; idword:0(0,0),  0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+4], a[a_c+144]
    v_accvgpr_read_b32 v[v_c+5], a[a_c+145]
    v_accvgpr_read_b32 v[v_c+6], a[a_c+146]
    v_accvgpr_read_b32 v[v_c+7], a[a_c+147]
    ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512   ; idword:32(0,32),  0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c+8], a[a_c+192]
    v_accvgpr_read_b32 v[v_c+9], a[a_c+193]
    v_accvgpr_read_b32 v[v_c+10], a[a_c+194]
    v_accvgpr_read_b32 v[v_c+11], a[a_c+195]
    ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048   ; idword:128(0,128),  0x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+12], a[a_c+208]
    v_accvgpr_read_b32 v[v_c+13], a[a_c+209]
    v_accvgpr_read_b32 v[v_c+14], a[a_c+210]
    v_accvgpr_read_b32 v[v_c+15], a[a_c+211]
    ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:2560   ; idword:160(0,160),  0x160 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c], a[a_c+132]
    v_accvgpr_read_b32 v[v_c+1], a[a_c+133]
    v_accvgpr_read_b32 v[v_c+2], a[a_c+134]
    v_accvgpr_read_b32 v[v_c+3], a[a_c+135]
    ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:8192   ; idword:512(2,0),  2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+4], a[a_c+148]
    v_accvgpr_read_b32 v[v_c+5], a[a_c+149]
    v_accvgpr_read_b32 v[v_c+6], a[a_c+150]
    v_accvgpr_read_b32 v[v_c+7], a[a_c+151]
    ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:8704   ; idword:544(2,32),  2x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c+8], a[a_c+196]
    v_accvgpr_read_b32 v[v_c+9], a[a_c+197]
    v_accvgpr_read_b32 v[v_c+10], a[a_c+198]
    v_accvgpr_read_b32 v[v_c+11], a[a_c+199]
    ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:10240   ; idword:640(2,128),  2x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+12], a[a_c+212]
    v_accvgpr_read_b32 v[v_c+13], a[a_c+213]
    v_accvgpr_read_b32 v[v_c+14], a[a_c+214]
    v_accvgpr_read_b32 v[v_c+15], a[a_c+215]
    ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:10752   ; idword:672(2,160),  2x160 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:1, i_nw:0
    s_mul_i32 s[s_tmp], 128, s[s_wei_stride_k]   ; i_m:128(i_m0:0,i_m1:128)
    v_add_u32 v[v_tmp], 128, v[v_cur_k]
    s_waitcnt lgkmcnt(0)
    s_barrier
    ;   load from lds, i_ssgroup:0, num_sld_per_ssgroup:4
    ds_read_b128 v[v_c:v_c+3], v[v_co_sld] 
    ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096
    ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192
    ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288
    v_cmpx_eq_u32 vcc, 1, v[v_wei_c_flag]
    ;   store to global, m index start from 128, m0:0, m1:128
    s_waitcnt lgkmcnt(3)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 129, s[s_wei_stride_k]   ; i_m:129(i_m0:0,i_m1:129)
    v_add_u32 v[v_tmp], 129, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 130, s[s_wei_stride_k]   ; i_m:130(i_m0:0,i_m1:130)
    v_add_u32 v[v_tmp], 130, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 131, s[s_wei_stride_k]   ; i_m:131(i_m0:0,i_m1:131)
    v_add_u32 v[v_tmp], 131, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 132, s[s_wei_stride_k]   ; i_m:132(i_m0:0,i_m1:132)
    v_add_u32 v[v_tmp], 132, v[v_cur_k]
    s_waitcnt lgkmcnt(2)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 133, s[s_wei_stride_k]   ; i_m:133(i_m0:0,i_m1:133)
    v_add_u32 v[v_tmp], 133, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 134, s[s_wei_stride_k]   ; i_m:134(i_m0:0,i_m1:134)
    v_add_u32 v[v_tmp], 134, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 135, s[s_wei_stride_k]   ; i_m:135(i_m0:0,i_m1:135)
    v_add_u32 v[v_tmp], 135, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 136, s[s_wei_stride_k]   ; i_m:136(i_m0:0,i_m1:136)
    v_add_u32 v[v_tmp], 136, v[v_cur_k]
    s_waitcnt lgkmcnt(1)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+8], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 137, s[s_wei_stride_k]   ; i_m:137(i_m0:0,i_m1:137)
    v_add_u32 v[v_tmp], 137, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+9], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 138, s[s_wei_stride_k]   ; i_m:138(i_m0:0,i_m1:138)
    v_add_u32 v[v_tmp], 138, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+10], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 139, s[s_wei_stride_k]   ; i_m:139(i_m0:0,i_m1:139)
    v_add_u32 v[v_tmp], 139, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+11], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 140, s[s_wei_stride_k]   ; i_m:140(i_m0:0,i_m1:140)
    v_add_u32 v[v_tmp], 140, v[v_cur_k]
    s_waitcnt lgkmcnt(0)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+12], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 141, s[s_wei_stride_k]   ; i_m:141(i_m0:0,i_m1:141)
    v_add_u32 v[v_tmp], 141, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+13], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 142, s[s_wei_stride_k]   ; i_m:142(i_m0:0,i_m1:142)
    v_add_u32 v[v_tmp], 142, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+14], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 143, s[s_wei_stride_k]   ; i_m:143(i_m0:0,i_m1:143)
    v_add_u32 v[v_tmp], 143, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+15], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 192, s[s_wei_stride_k]   ; i_m:192(i_m0:0,i_m1:192)
    v_add_u32 v[v_tmp], 192, v[v_cur_k]
    s_mov_b64 exec, -1
    ;   load from lds, i_ssgroup:1, num_sld_per_ssgroup:4
    ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384
    ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480
    ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576
    ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672
    v_cmpx_eq_u32 vcc, 1, v[v_wei_c_flag]
    ;   store to global, m index start from 128, m0:0, m1:128
    s_waitcnt lgkmcnt(3)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 193, s[s_wei_stride_k]   ; i_m:193(i_m0:0,i_m1:193)
    v_add_u32 v[v_tmp], 193, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 194, s[s_wei_stride_k]   ; i_m:194(i_m0:0,i_m1:194)
    v_add_u32 v[v_tmp], 194, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 195, s[s_wei_stride_k]   ; i_m:195(i_m0:0,i_m1:195)
    v_add_u32 v[v_tmp], 195, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 196, s[s_wei_stride_k]   ; i_m:196(i_m0:0,i_m1:196)
    v_add_u32 v[v_tmp], 196, v[v_cur_k]
    s_waitcnt lgkmcnt(2)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 197, s[s_wei_stride_k]   ; i_m:197(i_m0:0,i_m1:197)
    v_add_u32 v[v_tmp], 197, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 198, s[s_wei_stride_k]   ; i_m:198(i_m0:0,i_m1:198)
    v_add_u32 v[v_tmp], 198, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 199, s[s_wei_stride_k]   ; i_m:199(i_m0:0,i_m1:199)
    v_add_u32 v[v_tmp], 199, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 200, s[s_wei_stride_k]   ; i_m:200(i_m0:0,i_m1:200)
    v_add_u32 v[v_tmp], 200, v[v_cur_k]
    s_waitcnt lgkmcnt(1)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+8], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 201, s[s_wei_stride_k]   ; i_m:201(i_m0:0,i_m1:201)
    v_add_u32 v[v_tmp], 201, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+9], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 202, s[s_wei_stride_k]   ; i_m:202(i_m0:0,i_m1:202)
    v_add_u32 v[v_tmp], 202, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+10], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 203, s[s_wei_stride_k]   ; i_m:203(i_m0:0,i_m1:203)
    v_add_u32 v[v_tmp], 203, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+11], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 204, s[s_wei_stride_k]   ; i_m:204(i_m0:0,i_m1:204)
    v_add_u32 v[v_tmp], 204, v[v_cur_k]
    s_waitcnt lgkmcnt(0)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+12], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 205, s[s_wei_stride_k]   ; i_m:205(i_m0:0,i_m1:205)
    v_add_u32 v[v_tmp], 205, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+13], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 206, s[s_wei_stride_k]   ; i_m:206(i_m0:0,i_m1:206)
    v_add_u32 v[v_tmp], 206, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+14], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 207, s[s_wei_stride_k]   ; i_m:207(i_m0:0,i_m1:207)
    v_add_u32 v[v_tmp], 207, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+15], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mov_b64 exec, -1
    ; start group 5, i_g_mr:1, i_g_ms:0, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 144
    s_barrier
    v_accvgpr_read_b32 v[v_c], a[a_c+136]
    v_accvgpr_read_b32 v[v_c+1], a[a_c+137]
    v_accvgpr_read_b32 v[v_c+2], a[a_c+138]
    v_accvgpr_read_b32 v[v_c+3], a[a_c+139]
    ds_write_b128 v[v_co_sst], v[v_c:v_c+3]    ; idword:0(0,0),  0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+4], a[a_c+152]
    v_accvgpr_read_b32 v[v_c+5], a[a_c+153]
    v_accvgpr_read_b32 v[v_c+6], a[a_c+154]
    v_accvgpr_read_b32 v[v_c+7], a[a_c+155]
    ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512   ; idword:32(0,32),  0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c+8], a[a_c+200]
    v_accvgpr_read_b32 v[v_c+9], a[a_c+201]
    v_accvgpr_read_b32 v[v_c+10], a[a_c+202]
    v_accvgpr_read_b32 v[v_c+11], a[a_c+203]
    ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048   ; idword:128(0,128),  0x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+12], a[a_c+216]
    v_accvgpr_read_b32 v[v_c+13], a[a_c+217]
    v_accvgpr_read_b32 v[v_c+14], a[a_c+218]
    v_accvgpr_read_b32 v[v_c+15], a[a_c+219]
    ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:2560   ; idword:160(0,160),  0x160 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c], a[a_c+140]
    v_accvgpr_read_b32 v[v_c+1], a[a_c+141]
    v_accvgpr_read_b32 v[v_c+2], a[a_c+142]
    v_accvgpr_read_b32 v[v_c+3], a[a_c+143]
    ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:8192   ; idword:512(2,0),  2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+4], a[a_c+156]
    v_accvgpr_read_b32 v[v_c+5], a[a_c+157]
    v_accvgpr_read_b32 v[v_c+6], a[a_c+158]
    v_accvgpr_read_b32 v[v_c+7], a[a_c+159]
    ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:8704   ; idword:544(2,32),  2x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c+8], a[a_c+204]
    v_accvgpr_read_b32 v[v_c+9], a[a_c+205]
    v_accvgpr_read_b32 v[v_c+10], a[a_c+206]
    v_accvgpr_read_b32 v[v_c+11], a[a_c+207]
    ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:10240   ; idword:640(2,128),  2x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+12], a[a_c+220]
    v_accvgpr_read_b32 v[v_c+13], a[a_c+221]
    v_accvgpr_read_b32 v[v_c+14], a[a_c+222]
    v_accvgpr_read_b32 v[v_c+15], a[a_c+223]
    ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:10752   ; idword:672(2,160),  2x160 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:1, i_nw:0
    s_mul_i32 s[s_tmp], 144, s[s_wei_stride_k]   ; i_m:144(i_m0:0,i_m1:144)
    v_add_u32 v[v_tmp], 144, v[v_cur_k]
    s_waitcnt lgkmcnt(0)
    s_barrier
    ;   load from lds, i_ssgroup:0, num_sld_per_ssgroup:4
    ds_read_b128 v[v_c:v_c+3], v[v_co_sld] 
    ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096
    ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192
    ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288
    v_cmpx_eq_u32 vcc, 1, v[v_wei_c_flag]
    ;   store to global, m index start from 144, m0:0, m1:144
    s_waitcnt lgkmcnt(3)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 145, s[s_wei_stride_k]   ; i_m:145(i_m0:0,i_m1:145)
    v_add_u32 v[v_tmp], 145, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 146, s[s_wei_stride_k]   ; i_m:146(i_m0:0,i_m1:146)
    v_add_u32 v[v_tmp], 146, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 147, s[s_wei_stride_k]   ; i_m:147(i_m0:0,i_m1:147)
    v_add_u32 v[v_tmp], 147, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 148, s[s_wei_stride_k]   ; i_m:148(i_m0:0,i_m1:148)
    v_add_u32 v[v_tmp], 148, v[v_cur_k]
    s_waitcnt lgkmcnt(2)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 149, s[s_wei_stride_k]   ; i_m:149(i_m0:0,i_m1:149)
    v_add_u32 v[v_tmp], 149, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 150, s[s_wei_stride_k]   ; i_m:150(i_m0:0,i_m1:150)
    v_add_u32 v[v_tmp], 150, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 151, s[s_wei_stride_k]   ; i_m:151(i_m0:0,i_m1:151)
    v_add_u32 v[v_tmp], 151, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 152, s[s_wei_stride_k]   ; i_m:152(i_m0:0,i_m1:152)
    v_add_u32 v[v_tmp], 152, v[v_cur_k]
    s_waitcnt lgkmcnt(1)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+8], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 153, s[s_wei_stride_k]   ; i_m:153(i_m0:0,i_m1:153)
    v_add_u32 v[v_tmp], 153, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+9], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 154, s[s_wei_stride_k]   ; i_m:154(i_m0:0,i_m1:154)
    v_add_u32 v[v_tmp], 154, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+10], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 155, s[s_wei_stride_k]   ; i_m:155(i_m0:0,i_m1:155)
    v_add_u32 v[v_tmp], 155, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+11], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 156, s[s_wei_stride_k]   ; i_m:156(i_m0:0,i_m1:156)
    v_add_u32 v[v_tmp], 156, v[v_cur_k]
    s_waitcnt lgkmcnt(0)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+12], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 157, s[s_wei_stride_k]   ; i_m:157(i_m0:0,i_m1:157)
    v_add_u32 v[v_tmp], 157, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+13], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 158, s[s_wei_stride_k]   ; i_m:158(i_m0:0,i_m1:158)
    v_add_u32 v[v_tmp], 158, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+14], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 159, s[s_wei_stride_k]   ; i_m:159(i_m0:0,i_m1:159)
    v_add_u32 v[v_tmp], 159, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+15], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 208, s[s_wei_stride_k]   ; i_m:208(i_m0:0,i_m1:208)
    v_add_u32 v[v_tmp], 208, v[v_cur_k]
    s_mov_b64 exec, -1
    ;   load from lds, i_ssgroup:1, num_sld_per_ssgroup:4
    ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384
    ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480
    ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576
    ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672
    v_cmpx_eq_u32 vcc, 1, v[v_wei_c_flag]
    ;   store to global, m index start from 144, m0:0, m1:144
    s_waitcnt lgkmcnt(3)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 209, s[s_wei_stride_k]   ; i_m:209(i_m0:0,i_m1:209)
    v_add_u32 v[v_tmp], 209, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 210, s[s_wei_stride_k]   ; i_m:210(i_m0:0,i_m1:210)
    v_add_u32 v[v_tmp], 210, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 211, s[s_wei_stride_k]   ; i_m:211(i_m0:0,i_m1:211)
    v_add_u32 v[v_tmp], 211, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 212, s[s_wei_stride_k]   ; i_m:212(i_m0:0,i_m1:212)
    v_add_u32 v[v_tmp], 212, v[v_cur_k]
    s_waitcnt lgkmcnt(2)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 213, s[s_wei_stride_k]   ; i_m:213(i_m0:0,i_m1:213)
    v_add_u32 v[v_tmp], 213, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 214, s[s_wei_stride_k]   ; i_m:214(i_m0:0,i_m1:214)
    v_add_u32 v[v_tmp], 214, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 215, s[s_wei_stride_k]   ; i_m:215(i_m0:0,i_m1:215)
    v_add_u32 v[v_tmp], 215, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 216, s[s_wei_stride_k]   ; i_m:216(i_m0:0,i_m1:216)
    v_add_u32 v[v_tmp], 216, v[v_cur_k]
    s_waitcnt lgkmcnt(1)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+8], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 217, s[s_wei_stride_k]   ; i_m:217(i_m0:0,i_m1:217)
    v_add_u32 v[v_tmp], 217, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+9], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 218, s[s_wei_stride_k]   ; i_m:218(i_m0:0,i_m1:218)
    v_add_u32 v[v_tmp], 218, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+10], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 219, s[s_wei_stride_k]   ; i_m:219(i_m0:0,i_m1:219)
    v_add_u32 v[v_tmp], 219, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+11], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 220, s[s_wei_stride_k]   ; i_m:220(i_m0:0,i_m1:220)
    v_add_u32 v[v_tmp], 220, v[v_cur_k]
    s_waitcnt lgkmcnt(0)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+12], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 221, s[s_wei_stride_k]   ; i_m:221(i_m0:0,i_m1:221)
    v_add_u32 v[v_tmp], 221, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+13], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 222, s[s_wei_stride_k]   ; i_m:222(i_m0:0,i_m1:222)
    v_add_u32 v[v_tmp], 222, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+14], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 223, s[s_wei_stride_k]   ; i_m:223(i_m0:0,i_m1:223)
    v_add_u32 v[v_tmp], 223, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+15], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mov_b64 exec, -1
    ; start group 6, i_g_mr:1, i_g_ms:1, i_g_mw:0, i_g_mb:0, i_g_mt:0, m index start from 160
    s_barrier
    v_accvgpr_read_b32 v[v_c], a[a_c+160]
    v_accvgpr_read_b32 v[v_c+1], a[a_c+161]
    v_accvgpr_read_b32 v[v_c+2], a[a_c+162]
    v_accvgpr_read_b32 v[v_c+3], a[a_c+163]
    ds_write_b128 v[v_co_sst], v[v_c:v_c+3]    ; idword:0(0,0),  0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+4], a[a_c+176]
    v_accvgpr_read_b32 v[v_c+5], a[a_c+177]
    v_accvgpr_read_b32 v[v_c+6], a[a_c+178]
    v_accvgpr_read_b32 v[v_c+7], a[a_c+179]
    ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512   ; idword:32(0,32),  0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c+8], a[a_c+224]
    v_accvgpr_read_b32 v[v_c+9], a[a_c+225]
    v_accvgpr_read_b32 v[v_c+10], a[a_c+226]
    v_accvgpr_read_b32 v[v_c+11], a[a_c+227]
    ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048   ; idword:128(0,128),  0x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+12], a[a_c+240]
    v_accvgpr_read_b32 v[v_c+13], a[a_c+241]
    v_accvgpr_read_b32 v[v_c+14], a[a_c+242]
    v_accvgpr_read_b32 v[v_c+15], a[a_c+243]
    ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:2560   ; idword:160(0,160),  0x160 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c], a[a_c+164]
    v_accvgpr_read_b32 v[v_c+1], a[a_c+165]
    v_accvgpr_read_b32 v[v_c+2], a[a_c+166]
    v_accvgpr_read_b32 v[v_c+3], a[a_c+167]
    ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:8192   ; idword:512(2,0),  2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+4], a[a_c+180]
    v_accvgpr_read_b32 v[v_c+5], a[a_c+181]
    v_accvgpr_read_b32 v[v_c+6], a[a_c+182]
    v_accvgpr_read_b32 v[v_c+7], a[a_c+183]
    ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:8704   ; idword:544(2,32),  2x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c+8], a[a_c+228]
    v_accvgpr_read_b32 v[v_c+9], a[a_c+229]
    v_accvgpr_read_b32 v[v_c+10], a[a_c+230]
    v_accvgpr_read_b32 v[v_c+11], a[a_c+231]
    ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:10240   ; idword:640(2,128),  2x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+12], a[a_c+244]
    v_accvgpr_read_b32 v[v_c+13], a[a_c+245]
    v_accvgpr_read_b32 v[v_c+14], a[a_c+246]
    v_accvgpr_read_b32 v[v_c+15], a[a_c+247]
    ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:10752   ; idword:672(2,160),  2x160 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:1, i_nw:0
    s_mul_i32 s[s_tmp], 160, s[s_wei_stride_k]   ; i_m:160(i_m0:0,i_m1:160)
    v_add_u32 v[v_tmp], 160, v[v_cur_k]
    s_waitcnt lgkmcnt(0)
    s_barrier
    ;   load from lds, i_ssgroup:0, num_sld_per_ssgroup:4
    ds_read_b128 v[v_c:v_c+3], v[v_co_sld] 
    ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096
    ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192
    ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288
    v_cmpx_eq_u32 vcc, 1, v[v_wei_c_flag]
    ;   store to global, m index start from 160, m0:0, m1:160
    s_waitcnt lgkmcnt(3)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 161, s[s_wei_stride_k]   ; i_m:161(i_m0:0,i_m1:161)
    v_add_u32 v[v_tmp], 161, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 162, s[s_wei_stride_k]   ; i_m:162(i_m0:0,i_m1:162)
    v_add_u32 v[v_tmp], 162, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 163, s[s_wei_stride_k]   ; i_m:163(i_m0:0,i_m1:163)
    v_add_u32 v[v_tmp], 163, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 164, s[s_wei_stride_k]   ; i_m:164(i_m0:0,i_m1:164)
    v_add_u32 v[v_tmp], 164, v[v_cur_k]
    s_waitcnt lgkmcnt(2)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 165, s[s_wei_stride_k]   ; i_m:165(i_m0:0,i_m1:165)
    v_add_u32 v[v_tmp], 165, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 166, s[s_wei_stride_k]   ; i_m:166(i_m0:0,i_m1:166)
    v_add_u32 v[v_tmp], 166, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 167, s[s_wei_stride_k]   ; i_m:167(i_m0:0,i_m1:167)
    v_add_u32 v[v_tmp], 167, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 168, s[s_wei_stride_k]   ; i_m:168(i_m0:0,i_m1:168)
    v_add_u32 v[v_tmp], 168, v[v_cur_k]
    s_waitcnt lgkmcnt(1)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+8], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 169, s[s_wei_stride_k]   ; i_m:169(i_m0:0,i_m1:169)
    v_add_u32 v[v_tmp], 169, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+9], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 170, s[s_wei_stride_k]   ; i_m:170(i_m0:0,i_m1:170)
    v_add_u32 v[v_tmp], 170, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+10], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 171, s[s_wei_stride_k]   ; i_m:171(i_m0:0,i_m1:171)
    v_add_u32 v[v_tmp], 171, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+11], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 172, s[s_wei_stride_k]   ; i_m:172(i_m0:0,i_m1:172)
    v_add_u32 v[v_tmp], 172, v[v_cur_k]
    s_waitcnt lgkmcnt(0)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+12], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 173, s[s_wei_stride_k]   ; i_m:173(i_m0:0,i_m1:173)
    v_add_u32 v[v_tmp], 173, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+13], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 174, s[s_wei_stride_k]   ; i_m:174(i_m0:0,i_m1:174)
    v_add_u32 v[v_tmp], 174, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+14], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 175, s[s_wei_stride_k]   ; i_m:175(i_m0:0,i_m1:175)
    v_add_u32 v[v_tmp], 175, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+15], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 224, s[s_wei_stride_k]   ; i_m:224(i_m0:0,i_m1:224)
    v_add_u32 v[v_tmp], 224, v[v_cur_k]
    s_mov_b64 exec, -1
    ;   load from lds, i_ssgroup:1, num_sld_per_ssgroup:4
    ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384
    ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480
    ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576
    ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672
    v_cmpx_eq_u32 vcc, 1, v[v_wei_c_flag]
    ;   store to global, m index start from 160, m0:0, m1:160
    s_waitcnt lgkmcnt(3)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 225, s[s_wei_stride_k]   ; i_m:225(i_m0:0,i_m1:225)
    v_add_u32 v[v_tmp], 225, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 226, s[s_wei_stride_k]   ; i_m:226(i_m0:0,i_m1:226)
    v_add_u32 v[v_tmp], 226, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 227, s[s_wei_stride_k]   ; i_m:227(i_m0:0,i_m1:227)
    v_add_u32 v[v_tmp], 227, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 228, s[s_wei_stride_k]   ; i_m:228(i_m0:0,i_m1:228)
    v_add_u32 v[v_tmp], 228, v[v_cur_k]
    s_waitcnt lgkmcnt(2)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 229, s[s_wei_stride_k]   ; i_m:229(i_m0:0,i_m1:229)
    v_add_u32 v[v_tmp], 229, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 230, s[s_wei_stride_k]   ; i_m:230(i_m0:0,i_m1:230)
    v_add_u32 v[v_tmp], 230, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 231, s[s_wei_stride_k]   ; i_m:231(i_m0:0,i_m1:231)
    v_add_u32 v[v_tmp], 231, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 232, s[s_wei_stride_k]   ; i_m:232(i_m0:0,i_m1:232)
    v_add_u32 v[v_tmp], 232, v[v_cur_k]
    s_waitcnt lgkmcnt(1)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+8], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 233, s[s_wei_stride_k]   ; i_m:233(i_m0:0,i_m1:233)
    v_add_u32 v[v_tmp], 233, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+9], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 234, s[s_wei_stride_k]   ; i_m:234(i_m0:0,i_m1:234)
    v_add_u32 v[v_tmp], 234, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+10], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 235, s[s_wei_stride_k]   ; i_m:235(i_m0:0,i_m1:235)
    v_add_u32 v[v_tmp], 235, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+11], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 236, s[s_wei_stride_k]   ; i_m:236(i_m0:0,i_m1:236)
    v_add_u32 v[v_tmp], 236, v[v_cur_k]
    s_waitcnt lgkmcnt(0)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+12], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 237, s[s_wei_stride_k]   ; i_m:237(i_m0:0,i_m1:237)
    v_add_u32 v[v_tmp], 237, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+13], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 238, s[s_wei_stride_k]   ; i_m:238(i_m0:0,i_m1:238)
    v_add_u32 v[v_tmp], 238, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+14], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 239, s[s_wei_stride_k]   ; i_m:239(i_m0:0,i_m1:239)
    v_add_u32 v[v_tmp], 239, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+15], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mov_b64 exec, -1
    ; start group 7, i_g_mr:1, i_g_ms:1, i_g_mw:0, i_g_mb:1, i_g_mt:0, m index start from 176
    s_barrier
    v_accvgpr_read_b32 v[v_c], a[a_c+168]
    v_accvgpr_read_b32 v[v_c+1], a[a_c+169]
    v_accvgpr_read_b32 v[v_c+2], a[a_c+170]
    v_accvgpr_read_b32 v[v_c+3], a[a_c+171]
    ds_write_b128 v[v_co_sst], v[v_c:v_c+3]    ; idword:0(0,0),  0x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+4], a[a_c+184]
    v_accvgpr_read_b32 v[v_c+5], a[a_c+185]
    v_accvgpr_read_b32 v[v_c+6], a[a_c+186]
    v_accvgpr_read_b32 v[v_c+7], a[a_c+187]
    ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:512   ; idword:32(0,32),  0x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:0, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c+8], a[a_c+232]
    v_accvgpr_read_b32 v[v_c+9], a[a_c+233]
    v_accvgpr_read_b32 v[v_c+10], a[a_c+234]
    v_accvgpr_read_b32 v[v_c+11], a[a_c+235]
    ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:2048   ; idword:128(0,128),  0x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+12], a[a_c+248]
    v_accvgpr_read_b32 v[v_c+13], a[a_c+249]
    v_accvgpr_read_b32 v[v_c+14], a[a_c+250]
    v_accvgpr_read_b32 v[v_c+15], a[a_c+251]
    ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:2560   ; idword:160(0,160),  0x160 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:0  x  i_nr:1, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c], a[a_c+172]
    v_accvgpr_read_b32 v[v_c+1], a[a_c+173]
    v_accvgpr_read_b32 v[v_c+2], a[a_c+174]
    v_accvgpr_read_b32 v[v_c+3], a[a_c+175]
    ds_write_b128 v[v_co_sst], v[v_c:v_c+3] offset:8192   ; idword:512(2,0),  2x0 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+4], a[a_c+188]
    v_accvgpr_read_b32 v[v_c+5], a[a_c+189]
    v_accvgpr_read_b32 v[v_c+6], a[a_c+190]
    v_accvgpr_read_b32 v[v_c+7], a[a_c+191]
    ds_write_b128 v[v_co_sst], v[v_c+4:v_c+4+3] offset:8704   ; idword:544(2,32),  2x32 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:0, i_ns:1, i_nw:0
    v_accvgpr_read_b32 v[v_c+8], a[a_c+236]
    v_accvgpr_read_b32 v[v_c+9], a[a_c+237]
    v_accvgpr_read_b32 v[v_c+10], a[a_c+238]
    v_accvgpr_read_b32 v[v_c+11], a[a_c+239]
    ds_write_b128 v[v_co_sst], v[v_c+8:v_c+8+3] offset:10240   ; idword:640(2,128),  2x128 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:0, i_nw:0
    v_accvgpr_read_b32 v[v_c+12], a[a_c+252]
    v_accvgpr_read_b32 v[v_c+13], a[a_c+253]
    v_accvgpr_read_b32 v[v_c+14], a[a_c+254]
    v_accvgpr_read_b32 v[v_c+15], a[a_c+255]
    ds_write_b128 v[v_co_sst], v[v_c+12:v_c+12+3] offset:10752   ; idword:672(2,160),  2x160 | /4, i_mr:0, i_ms:0, i_mw:0, i_mb:1  x  i_nr:1, i_ns:1, i_nw:0
    s_mul_i32 s[s_tmp], 176, s[s_wei_stride_k]   ; i_m:176(i_m0:0,i_m1:176)
    v_add_u32 v[v_tmp], 176, v[v_cur_k]
    s_waitcnt lgkmcnt(0)
    s_barrier
    ;   load from lds, i_ssgroup:0, num_sld_per_ssgroup:4
    ds_read_b128 v[v_c:v_c+3], v[v_co_sld] 
    ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:4096
    ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:8192
    ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:12288
    v_cmpx_eq_u32 vcc, 1, v[v_wei_c_flag]
    ;   store to global, m index start from 176, m0:0, m1:176
    s_waitcnt lgkmcnt(3)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 177, s[s_wei_stride_k]   ; i_m:177(i_m0:0,i_m1:177)
    v_add_u32 v[v_tmp], 177, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 178, s[s_wei_stride_k]   ; i_m:178(i_m0:0,i_m1:178)
    v_add_u32 v[v_tmp], 178, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 179, s[s_wei_stride_k]   ; i_m:179(i_m0:0,i_m1:179)
    v_add_u32 v[v_tmp], 179, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 180, s[s_wei_stride_k]   ; i_m:180(i_m0:0,i_m1:180)
    v_add_u32 v[v_tmp], 180, v[v_cur_k]
    s_waitcnt lgkmcnt(2)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 181, s[s_wei_stride_k]   ; i_m:181(i_m0:0,i_m1:181)
    v_add_u32 v[v_tmp], 181, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 182, s[s_wei_stride_k]   ; i_m:182(i_m0:0,i_m1:182)
    v_add_u32 v[v_tmp], 182, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 183, s[s_wei_stride_k]   ; i_m:183(i_m0:0,i_m1:183)
    v_add_u32 v[v_tmp], 183, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 184, s[s_wei_stride_k]   ; i_m:184(i_m0:0,i_m1:184)
    v_add_u32 v[v_tmp], 184, v[v_cur_k]
    s_waitcnt lgkmcnt(1)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+8], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 185, s[s_wei_stride_k]   ; i_m:185(i_m0:0,i_m1:185)
    v_add_u32 v[v_tmp], 185, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+9], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 186, s[s_wei_stride_k]   ; i_m:186(i_m0:0,i_m1:186)
    v_add_u32 v[v_tmp], 186, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+10], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 187, s[s_wei_stride_k]   ; i_m:187(i_m0:0,i_m1:187)
    v_add_u32 v[v_tmp], 187, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+11], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 188, s[s_wei_stride_k]   ; i_m:188(i_m0:0,i_m1:188)
    v_add_u32 v[v_tmp], 188, v[v_cur_k]
    s_waitcnt lgkmcnt(0)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+12], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 189, s[s_wei_stride_k]   ; i_m:189(i_m0:0,i_m1:189)
    v_add_u32 v[v_tmp], 189, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+13], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 190, s[s_wei_stride_k]   ; i_m:190(i_m0:0,i_m1:190)
    v_add_u32 v[v_tmp], 190, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+14], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 191, s[s_wei_stride_k]   ; i_m:191(i_m0:0,i_m1:191)
    v_add_u32 v[v_tmp], 191, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+15], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 240, s[s_wei_stride_k]   ; i_m:240(i_m0:0,i_m1:240)
    v_add_u32 v[v_tmp], 240, v[v_cur_k]
    s_mov_b64 exec, -1
    ;   load from lds, i_ssgroup:1, num_sld_per_ssgroup:4
    ds_read_b128 v[v_c:v_c+3], v[v_co_sld] offset:16384
    ds_read_b128 v[v_c+4:v_c+4+3], v[v_co_sld] offset:20480
    ds_read_b128 v[v_c+8:v_c+8+3], v[v_co_sld] offset:24576
    ds_read_b128 v[v_c+12:v_c+12+3], v[v_co_sld] offset:28672
    v_cmpx_eq_u32 vcc, 1, v[v_wei_c_flag]
    ;   store to global, m index start from 176, m0:0, m1:176
    s_waitcnt lgkmcnt(3)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 241, s[s_wei_stride_k]   ; i_m:241(i_m0:0,i_m1:241)
    v_add_u32 v[v_tmp], 241, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+1], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 242, s[s_wei_stride_k]   ; i_m:242(i_m0:0,i_m1:242)
    v_add_u32 v[v_tmp], 242, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+2], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 243, s[s_wei_stride_k]   ; i_m:243(i_m0:0,i_m1:243)
    v_add_u32 v[v_tmp], 243, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+3], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 244, s[s_wei_stride_k]   ; i_m:244(i_m0:0,i_m1:244)
    v_add_u32 v[v_tmp], 244, v[v_cur_k]
    s_waitcnt lgkmcnt(2)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+4], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 245, s[s_wei_stride_k]   ; i_m:245(i_m0:0,i_m1:245)
    v_add_u32 v[v_tmp], 245, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+5], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 246, s[s_wei_stride_k]   ; i_m:246(i_m0:0,i_m1:246)
    v_add_u32 v[v_tmp], 246, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+6], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 247, s[s_wei_stride_k]   ; i_m:247(i_m0:0,i_m1:247)
    v_add_u32 v[v_tmp], 247, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+7], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 248, s[s_wei_stride_k]   ; i_m:248(i_m0:0,i_m1:248)
    v_add_u32 v[v_tmp], 248, v[v_cur_k]
    s_waitcnt lgkmcnt(1)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+8], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 249, s[s_wei_stride_k]   ; i_m:249(i_m0:0,i_m1:249)
    v_add_u32 v[v_tmp], 249, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+9], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 250, s[s_wei_stride_k]   ; i_m:250(i_m0:0,i_m1:250)
    v_add_u32 v[v_tmp], 250, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+10], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 251, s[s_wei_stride_k]   ; i_m:251(i_m0:0,i_m1:251)
    v_add_u32 v[v_tmp], 251, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+11], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 252, s[s_wei_stride_k]   ; i_m:252(i_m0:0,i_m1:252)
    v_add_u32 v[v_tmp], 252, v[v_cur_k]
    s_waitcnt lgkmcnt(0)
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+12], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 253, s[s_wei_stride_k]   ; i_m:253(i_m0:0,i_m1:253)
    v_add_u32 v[v_tmp], 253, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+13], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 254, s[s_wei_stride_k]   ; i_m:254(i_m0:0,i_m1:254)
    v_add_u32 v[v_tmp], 254, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+14], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mul_i32 s[s_tmp], 255, s[s_wei_stride_k]   ; i_m:255(i_m0:0,i_m1:255)
    v_add_u32 v[v_tmp], 255, v[v_cur_k]
    v_cmp_gt_u32 vcc, s[s_k], v[v_tmp]
    s_and_saveexec_b64 s[s_tmp+4:s_tmp+5], vcc
    buffer_atomic_add_f32 v[v_c+15], v[v_wei_os], s[s_p_wei:s_p_wei+3], s[s_tmp] offen offset:0
    s_or_b64 exec, exec, s[s_tmp+4:s_tmp+5]
    s_mov_b64 exec, -1

L_igemm_wrw_gtcx2_nhwc_fp16_bx0_ex1_bt256x256x32_wt32x32x8_ws2x2_wr2x2_ta1x4x1x8_1x8x1x32_tb1x4x1x8_1x8x1x32_vs1_gkgs_out:
    s_endpgm
.rodata
.p2align 6
.amdhsa_kernel igemm_wrw_gtcx2_nhwc_fp16_bx0_ex1_bt256x256x32_wt32x32x8_ws2x2_wr2x2_ta1x4x1x8_1x8x1x32_tb1x4x1x8_1x8x1x32_vs1_gkgs
    .amdhsa_group_segment_fixed_size 36864
    .amdhsa_user_sgpr_kernarg_segment_ptr 1
    .amdhsa_system_sgpr_workgroup_id_x 1
    .amdhsa_system_sgpr_workgroup_id_y 1
    .amdhsa_system_sgpr_workgroup_id_z 1
    .amdhsa_system_vgpr_workitem_id 0
    .amdhsa_next_free_vgpr 360
    .amdhsa_next_free_sgpr 78
    .amdhsa_ieee_mode 0
    .amdhsa_dx10_clamp 0
    .amdhsa_tg_split 0
    .amdhsa_accum_offset 104
.end_amdhsa_kernel

.amdgpu_metadata
---
amdhsa.version: [ 1, 0 ]
amdhsa.kernels:
  - .name: igemm_wrw_gtcx2_nhwc_fp16_bx0_ex1_bt256x256x32_wt32x32x8_ws2x2_wr2x2_ta1x4x1x8_1x8x1x32_tb1x4x1x8_1x8x1x32_vs1_gkgs
    .symbol: igemm_wrw_gtcx2_nhwc_fp16_bx0_ex1_bt256x256x32_wt32x32x8_ws2x2_wr2x2_ta1x4x1x8_1x8x1x32_tb1x4x1x8_1x8x1x32_vs1_gkgs.kd
    .sgpr_count: 84
    .vgpr_count: 360
    .kernarg_segment_align: 8
    .kernarg_segment_size: 96
    .group_segment_fixed_size: 36864
    .private_segment_fixed_size: 0
    .wavefront_size: 64
    .reqd_workgroup_size : [256, 1, 1]
    .max_flat_workgroup_size: 256
    .args:
    - { .name: p_in      , .size: 8, .offset:   0, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: false}
    - { .name: p_wei     , .size: 8, .offset:   8, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
    - { .name: p_out     , .size: 8, .offset:  16, .value_kind: global_buffer, .value_type: f32, .address_space: global, .is_const: true}
    - { .name: hi        , .size: 4, .offset:  24, .value_kind: by_value, .value_type: i32}
    - { .name: wi        , .size: 4, .offset:  28, .value_kind: by_value, .value_type: i32}
    - { .name: n_         , .size: 4, .offset:  32, .value_kind: by_value, .value_type: i32}
    - { .name: k         , .size: 4, .offset:  36, .value_kind: by_value, .value_type: i32}
    - { .name: c         , .size: 4, .offset:  40, .value_kind: by_value, .value_type: i32}
    - { .name: ho        , .size: 4, .offset:  44, .value_kind: by_value, .value_type: i32}
    - { .name: wo        , .size: 4, .offset:  48, .value_kind: by_value, .value_type: i32}
    - { .name: stride_h  , .size: 4, .offset:  52, .value_kind: by_value, .value_type: i32}
    - { .name: stride_w  , .size: 4, .offset:  56, .value_kind: by_value, .value_type: i32}
    - { .name: dilation_h, .size: 4, .offset:  60, .value_kind: by_value, .value_type: i32}
    - { .name: dilation_w, .size: 4, .offset:  64, .value_kind: by_value, .value_type: i32}
    - { .name: pad_h     , .size: 4, .offset:  68, .value_kind: by_value, .value_type: i32}
    - { .name: pad_w     , .size: 4, .offset:  72, .value_kind: by_value, .value_type: i32}
    - { .name: y_         , .size: 4, .offset:  76, .value_kind: by_value, .value_type: i32}
    - { .name: x         , .size: 4, .offset:  80, .value_kind: by_value, .value_type: i32}
    - { .name: gemm_k_global_split, .size: 4, .offset:  84, .value_kind: by_value, .value_type: i32}
    - { .name: group     , .size: 4, .offset:  88, .value_kind: by_value, .value_type: i32}
    - { .name: __pack_0  , .size: 4, .offset:  92, .value_kind: by_value, .value_type: i32}
...
.end_amdgpu_metadata
