//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-33961263
// Cuda compilation tools, release 12.4, V12.4.99
// Based on NVVM 7.0.1
//
.version 8.4
.target sm_89
.address_size 64
// .globl gemm_4bit_kernel_f32
// _ZZ26gemm_4bit_tiled_kernel_f32E6x_tile has been demoted
// _ZZ26gemm_4bit_tiled_kernel_f32E6w_tile has been demoted
// _ZZ39gemm_ternary_multibase_tiled_kernel_f32E6x_tile has been demoted
// _ZZ39gemm_ternary_multibase_tiled_kernel_f32E8s_shared has been demoted
// ---------------------------------------------------------------------------
// gemm_4bit_kernel_f32
//
// One thread computes one f32 output element: a dot product between a row of
// f32 inputs and a column of packed 4-bit weights that are dequantised on the
// fly (nibble - 8, times an f32 scale).
//
// Parameters, as used by the code below:
//   param_0 : f32 array, read at [ctaid.y * param_5 + k]
//   param_1 : byte array, read at [col * (param_5 / 2) + k / 2];
//             low nibble is element 2*i, high nibble is element 2*i + 1
//   param_2 : f32 array, read at [col * ceil(param_5 / param_7) + k / param_7]
//   param_3 : f32 array, written at [ctaid.y * param_6 + col]
//   param_4 : exclusive bound on ctaid.y
//   param_5 : reduction length
//   param_6 : exclusive bound on col
//   param_7 : divisor used to pick the scale for element k
// NOTE(review): these look like x, packed weights, scales, out, M, K, N and
// group_size - names are presumed, confirm against the CUDA source.
// NOTE(review): param_7 is used as a divisor with no visible zero check.
// ---------------------------------------------------------------------------
.visible .entry gemm_4bit_kernel_f32(
.param .u64 gemm_4bit_kernel_f32_param_0,
.param .u64 gemm_4bit_kernel_f32_param_1,
.param .u64 gemm_4bit_kernel_f32_param_2,
.param .u64 gemm_4bit_kernel_f32_param_3,
.param .u32 gemm_4bit_kernel_f32_param_4,
.param .u32 gemm_4bit_kernel_f32_param_5,
.param .u32 gemm_4bit_kernel_f32_param_6,
.param .u32 gemm_4bit_kernel_f32_param_7
)
{
.reg .pred %p<14>;
.reg .b16 %rs<21>;
.reg .f32 %f<75>;
.reg .b32 %r<82>;
.reg .b64 %rd<78>;
// Load parameters and convert the three input pointers to global addresses.
ld.param.u64 %rd21, [gemm_4bit_kernel_f32_param_0];
ld.param.u64 %rd22, [gemm_4bit_kernel_f32_param_1];
ld.param.u64 %rd23, [gemm_4bit_kernel_f32_param_2];
ld.param.u64 %rd20, [gemm_4bit_kernel_f32_param_3];
ld.param.u32 %r21, [gemm_4bit_kernel_f32_param_4];
ld.param.u32 %r18, [gemm_4bit_kernel_f32_param_5];
ld.param.u32 %r19, [gemm_4bit_kernel_f32_param_6];
ld.param.u32 %r20, [gemm_4bit_kernel_f32_param_7];
cvta.to.global.u64 %rd1, %rd21;
cvta.to.global.u64 %rd2, %rd23;
cvta.to.global.u64 %rd3, %rd22;
// %r1 = col = ctaid.x * ntid.x + tid.x ; %r2 = row = ctaid.y.
mov.u32 %r22, %ntid.x;
mov.u32 %r23, %ctaid.x;
mov.u32 %r24, %tid.x;
mad.lo.s32 %r1, %r23, %r22, %r24;
setp.ge.s32 %p1, %r1, %r19;
mov.u32 %r2, %ctaid.y;
setp.ge.s32 %p2, %r2, %r21;
or.pred %p3, %p2, %p1;
// Out-of-range threads exit without writing anything.
@%p3 bra $L__BB0_19;
// %r3 = param_5 / 2 (signed, truncating): number of packed bytes per column.
shr.u32 %r25, %r18, 31;
add.s32 %r26, %r18, %r25;
shr.s32 %r3, %r26, 1;
// %r29 = (param_5 + param_7 - 1) / param_7: number of scales per column.
add.s32 %r27, %r18, %r20;
add.s32 %r28, %r27, -1;
div.s32 %r29, %r28, %r20;
// Base element offsets: %rd4 into param_1, %rd5 into param_0, %rd6 into param_2.
mul.lo.s32 %r30, %r3, %r1;
cvt.s64.s32 %rd4, %r30;
mul.lo.s32 %r31, %r2, %r18;
cvt.s64.s32 %rd5, %r31;
mul.lo.s32 %r32, %r29, %r1;
cvt.s64.s32 %rd6, %r32;
// With fewer than 2 elements there is no packed byte to process: store 0.
setp.lt.s32 %p4, %r18, 2;
mov.f32 %f72, 0f00000000;
@%p4 bra $L__BB0_18;
// %r81 = byte count mod 4 (handled by the tail loop at $L__BB0_15).
// If the byte count is below 4, skip the unrolled loop entirely.
add.s32 %r34, %r3, -1;
and.b32 %r81, %r3, 3;
setp.lt.u32 %p5, %r34, 3;
mov.f32 %f72, 0f00000000;
mov.u32 %r79, 0;
@%p5 bra $L__BB0_13;
// Unrolled-loop setup:
//   %r77 = -(bytes handled by the unrolled loop), counts up to 0 in steps of 4
//   %rd74 = &param_0[row base] + 16 bytes ; %rd75 = &param_1[column base]
//   %r76 = element index of the current byte's low nibble, plus 2
//   %r79 = bytes consumed so far
sub.s32 %r77, %r81, %r3;
shl.b64 %rd24, %rd5, 2;
add.s64 %rd25, %rd1, %rd24;
add.s64 %rd74, %rd25, 16;
add.s64 %rd75, %rd3, %rd4;
mov.f32 %f72, 0f00000000;
mov.u32 %r79, 0;
mov.u32 %r76, 2;
// Main loop: 4 packed bytes (8 elements) per iteration.
$L__BB0_4:
// Byte 0: elements %r76-2 (low nibble) and %r76-1 (high nibble).
add.s32 %r37, %r76, -2;
add.s32 %r38, %r76, -1;
div.s32 %r39, %r37, %r20;
div.s32 %r40, %r38, %r20;
cvt.s64.s32 %rd26, %r39;
add.s64 %rd27, %rd26, %rd6;
shl.b64 %rd28, %rd27, 2;
add.s64 %rd29, %rd2, %rd28;
cvt.s64.s32 %rd30, %r40;
add.s64 %rd31, %rd30, %rd6;
shl.b64 %rd32, %rd31, 2;
add.s64 %rd33, %rd2, %rd32;
ld.global.nc.u8 %rs1, [%rd75];
and.b16 %rs2, %rs1, 240;
and.b16 %rs3, %rs1, 15;
// Dequantise: (nibble - 8) * scale.
cvt.u32.u16 %r41, %rs3;
add.s32 %r42, %r41, -8;
cvt.rn.f32.s32 %f30, %r42;
ld.global.nc.f32 %f31, [%rd29];
mul.f32 %f32, %f31, %f30;
shr.u16 %rs4, %rs2, 4;
cvt.u32.u16 %r43, %rs4;
add.s32 %r44, %r43, -8;
cvt.rn.f32.s32 %f33, %r44;
ld.global.nc.f32 %f34, [%rd33];
mul.f32 %f2, %f34, %f33;
ld.global.nc.f32 %f35, [%rd74+-16];
fma.rn.f32 %f66, %f35, %f32, %f72;
// The high-nibble term is only accumulated when its element index < param_5.
setp.ge.s32 %p6, %r38, %r18;
@%p6 bra $L__BB0_6;
ld.global.nc.f32 %f36, [%rd74+-12];
fma.rn.f32 %f66, %f2, %f36, %f66;
$L__BB0_6:
// Byte 1: elements %r76 and %r76+1.
add.s32 %r45, %r76, 1;
div.s32 %r46, %r45, %r20;
div.s32 %r47, %r76, %r20;
cvt.s64.s32 %rd34, %r47;
add.s64 %rd35, %rd34, %rd6;
shl.b64 %rd36, %rd35, 2;
add.s64 %rd37, %rd2, %rd36;
cvt.s64.s32 %rd38, %r46;
add.s64 %rd39, %rd38, %rd6;
shl.b64 %rd40, %rd39, 2;
add.s64 %rd41, %rd2, %rd40;
ld.global.nc.u8 %rs5, [%rd75+1];
and.b16 %rs6, %rs5, 240;
and.b16 %rs7, %rs5, 15;
cvt.u32.u16 %r48, %rs7;
add.s32 %r49, %r48, -8;
cvt.rn.f32.s32 %f37, %r49;
ld.global.nc.f32 %f38, [%rd37];
mul.f32 %f39, %f38, %f37;
shr.u16 %rs8, %rs6, 4;
cvt.u32.u16 %r50, %rs8;
add.s32 %r51, %r50, -8;
cvt.rn.f32.s32 %f40, %r51;
ld.global.nc.f32 %f41, [%rd41];
mul.f32 %f6, %f41, %f40;
// %rd11 = %rd74 - 16, so [%rd11+16] and [%rd11+24] below are [%rd74+0] and [%rd74+8].
add.s64 %rd11, %rd74, -16;
ld.global.nc.f32 %f42, [%rd74+-8];
fma.rn.f32 %f67, %f42, %f39, %f66;
setp.ge.s32 %p7, %r45, %r18;
@%p7 bra $L__BB0_8;
ld.global.nc.f32 %f43, [%rd74+-4];
fma.rn.f32 %f67, %f6, %f43, %f67;
$L__BB0_8:
// Byte 2: elements %r76+2 and %r76+3.
add.s32 %r52, %r76, 2;
add.s32 %r53, %r76, 3;
div.s32 %r54, %r52, %r20;
div.s32 %r55, %r53, %r20;
cvt.s64.s32 %rd42, %r54;
add.s64 %rd43, %rd42, %rd6;
shl.b64 %rd44, %rd43, 2;
add.s64 %rd45, %rd2, %rd44;
cvt.s64.s32 %rd46, %r55;
add.s64 %rd47, %rd46, %rd6;
shl.b64 %rd48, %rd47, 2;
add.s64 %rd49, %rd2, %rd48;
ld.global.nc.u8 %rs9, [%rd75+2];
and.b16 %rs10, %rs9, 240;
and.b16 %rs11, %rs9, 15;
cvt.u32.u16 %r56, %rs11;
add.s32 %r57, %r56, -8;
cvt.rn.f32.s32 %f44, %r57;
ld.global.nc.f32 %f45, [%rd45];
mul.f32 %f46, %f45, %f44;
shr.u16 %rs12, %rs10, 4;
cvt.u32.u16 %r58, %rs12;
add.s32 %r59, %r58, -8;
cvt.rn.f32.s32 %f47, %r59;
ld.global.nc.f32 %f48, [%rd49];
mul.f32 %f10, %f48, %f47;
ld.global.nc.f32 %f49, [%rd11+16];
fma.rn.f32 %f68, %f49, %f46, %f67;
setp.ge.s32 %p8, %r53, %r18;
@%p8 bra $L__BB0_10;
ld.global.nc.f32 %f50, [%rd74+4];
fma.rn.f32 %f68, %f10, %f50, %f68;
$L__BB0_10:
// Byte 3: elements %r76+4 and %r76+5.
add.s32 %r60, %r76, 4;
add.s32 %r61, %r76, 5;
div.s32 %r62, %r60, %r20;
div.s32 %r63, %r61, %r20;
cvt.s64.s32 %rd50, %r62;
add.s64 %rd51, %rd50, %rd6;
shl.b64 %rd52, %rd51, 2;
add.s64 %rd53, %rd2, %rd52;
cvt.s64.s32 %rd54, %r63;
add.s64 %rd55, %rd54, %rd6;
shl.b64 %rd56, %rd55, 2;
add.s64 %rd57, %rd2, %rd56;
ld.global.nc.u8 %rs13, [%rd75+3];
and.b16 %rs14, %rs13, 240;
and.b16 %rs15, %rs13, 15;
cvt.u32.u16 %r64, %rs15;
add.s32 %r65, %r64, -8;
cvt.rn.f32.s32 %f51, %r65;
ld.global.nc.f32 %f52, [%rd53];
mul.f32 %f53, %f52, %f51;
shr.u16 %rs16, %rs14, 4;
cvt.u32.u16 %r66, %rs16;
add.s32 %r67, %r66, -8;
cvt.rn.f32.s32 %f54, %r67;
ld.global.nc.f32 %f55, [%rd57];
mul.f32 %f14, %f55, %f54;
ld.global.nc.f32 %f56, [%rd11+24];
fma.rn.f32 %f72, %f56, %f53, %f68;
setp.ge.s32 %p9, %r61, %r18;
@%p9 bra $L__BB0_12;
ld.global.nc.f32 %f57, [%rd74+12];
fma.rn.f32 %f72, %f14, %f57, %f72;
$L__BB0_12:
// Advance by 4 bytes / 8 f32 inputs and loop until %r77 reaches 0.
add.s64 %rd75, %rd75, 4;
add.s32 %r79, %r79, 4;
add.s64 %rd74, %rd74, 32;
add.s32 %r76, %r76, 8;
add.s32 %r77, %r77, 4;
setp.ne.s32 %p10, %r77, 0;
@%p10 bra $L__BB0_4;
$L__BB0_13:
// Tail: process the remaining %r81 (0..3) bytes one at a time.
setp.eq.s32 %p11, %r81, 0;
@%p11 bra $L__BB0_18;
// %r80 = element index of the next low nibble (2 * bytes consumed).
// %rd77 = &param_0[row base + %r80] ; %rd76 = &param_1[column base + %r79].
shl.b32 %r80, %r79, 1;
cvt.s64.s32 %rd58, %r80;
add.s64 %rd59, %rd5, %rd58;
shl.b64 %rd60, %rd59, 2;
add.s64 %rd77, %rd1, %rd60;
cvt.s64.s32 %rd61, %r79;
add.s64 %rd62, %rd61, %rd4;
add.s64 %rd76, %rd3, %rd62;
$L__BB0_15:
.pragma "nounroll";
add.s32 %r68, %r80, 1;
div.s32 %r69, %r68, %r20;
div.s32 %r70, %r80, %r20;
cvt.s64.s32 %rd63, %r70;
add.s64 %rd64, %rd63, %rd6;
shl.b64 %rd65, %rd64, 2;
add.s64 %rd66, %rd2, %rd65;
cvt.s64.s32 %rd67, %r69;
add.s64 %rd68, %rd67, %rd6;
shl.b64 %rd69, %rd68, 2;
add.s64 %rd70, %rd2, %rd69;
ld.global.nc.u8 %rs17, [%rd76];
and.b16 %rs18, %rs17, 240;
and.b16 %rs19, %rs17, 15;
cvt.u32.u16 %r71, %rs19;
add.s32 %r72, %r71, -8;
cvt.rn.f32.s32 %f58, %r72;
ld.global.nc.f32 %f59, [%rd66];
mul.f32 %f60, %f59, %f58;
shr.u16 %rs20, %rs18, 4;
cvt.u32.u16 %r73, %rs20;
add.s32 %r74, %r73, -8;
cvt.rn.f32.s32 %f61, %r74;
ld.global.nc.f32 %f62, [%rd70];
mul.f32 %f21, %f62, %f61;
ld.global.nc.f32 %f63, [%rd77];
fma.rn.f32 %f72, %f63, %f60, %f72;
setp.ge.s32 %p12, %r68, %r18;
@%p12 bra $L__BB0_17;
ld.global.nc.f32 %f64, [%rd77+4];
fma.rn.f32 %f72, %f21, %f64, %f72;
$L__BB0_17:
add.s64 %rd77, %rd77, 8;
add.s32 %r80, %r80, 2;
add.s64 %rd76, %rd76, 1;
add.s32 %r81, %r81, -1;
setp.ne.s32 %p13, %r81, 0;
@%p13 bra $L__BB0_15;
$L__BB0_18:
// Store the accumulator to param_3[row * param_6 + col].
mad.lo.s32 %r75, %r2, %r19, %r1;
cvta.to.global.u64 %rd71, %rd20;
mul.wide.s32 %rd72, %r75, 4;
add.s64 %rd73, %rd71, %rd72;
st.global.f32 [%rd73], %f72;
$L__BB0_19:
ret;
}
// .globl gemm_4bit_tiled_kernel_f32
// ---------------------------------------------------------------------------
// gemm_4bit_tiled_kernel_f32
//
// Shared-memory variant of the 4-bit dot product. The reduction dimension is
// walked in tiles of 64 elements: each tile of f32 inputs is staged in x_tile,
// each thread dequantises its own column's weights into a private 256-byte
// row of w_tile, and the two are multiplied after a barrier.
//
// Parameters, as used by the code below:
//   param_0 : f32 array, read at [ctaid.y * param_5 + k]
//   param_1 : byte array, read at [col * (param_5 / 2) + k / 2]
//   param_2 : f32 array, read at [col * ceil(param_5 / param_7) + k / param_7]
//   param_3 : f32 array, written at [ctaid.y * param_6 + col]
//   param_4 : exclusive bound on ctaid.y
//   param_5 : reduction length
//   param_6 : exclusive bound on col
//   param_7 : divisor used to pick the scale for element k
// col = ctaid.x * 16 + tid.x. The w_tile row is addressed by tid.x * 256 bytes
// inside a 4096-byte buffer.
// NOTE(review): that addressing presumably requires blockDim.x <= 16 - confirm
// against the launch configuration.
// ---------------------------------------------------------------------------
.visible .entry gemm_4bit_tiled_kernel_f32(
.param .u64 gemm_4bit_tiled_kernel_f32_param_0,
.param .u64 gemm_4bit_tiled_kernel_f32_param_1,
.param .u64 gemm_4bit_tiled_kernel_f32_param_2,
.param .u64 gemm_4bit_tiled_kernel_f32_param_3,
.param .u32 gemm_4bit_tiled_kernel_f32_param_4,
.param .u32 gemm_4bit_tiled_kernel_f32_param_5,
.param .u32 gemm_4bit_tiled_kernel_f32_param_6,
.param .u32 gemm_4bit_tiled_kernel_f32_param_7
)
{
.reg .pred %p<28>;
.reg .b16 %rs<5>;
.reg .f32 %f<43>;
.reg .b32 %r<93>;
.reg .b64 %rd<22>;
// demoted variable
.shared .align 4 .b8 _ZZ26gemm_4bit_tiled_kernel_f32E6x_tile[256];
// demoted variable
.shared .align 4 .b8 _ZZ26gemm_4bit_tiled_kernel_f32E6w_tile[4096];
ld.param.u64 %rd6, [gemm_4bit_tiled_kernel_f32_param_0];
ld.param.u64 %rd7, [gemm_4bit_tiled_kernel_f32_param_1];
ld.param.u64 %rd8, [gemm_4bit_tiled_kernel_f32_param_2];
ld.param.u64 %rd9, [gemm_4bit_tiled_kernel_f32_param_3];
ld.param.u32 %r38, [gemm_4bit_tiled_kernel_f32_param_4];
ld.param.u32 %r35, [gemm_4bit_tiled_kernel_f32_param_5];
ld.param.u32 %r36, [gemm_4bit_tiled_kernel_f32_param_6];
ld.param.u32 %r37, [gemm_4bit_tiled_kernel_f32_param_7];
// %r2 = col = ctaid.x * 16 + tid.x ; %r3 = row = ctaid.y.
mov.u32 %r39, %ctaid.x;
shl.b32 %r40, %r39, 4;
mov.u32 %r1, %tid.x;
add.s32 %r2, %r40, %r1;
mov.u32 %r3, %ctaid.y;
// The row check depends only on ctaid.y, so it is the same for every thread
// in the block. The col check is deferred until after the barrier loop.
setp.ge.s32 %p2, %r3, %r38;
@%p2 bra $L__BB1_23;
// %r4 = param_5 / 2 (signed, truncating): packed bytes per column.
shr.u32 %r41, %r35, 31;
add.s32 %r42, %r35, %r41;
shr.s32 %r4, %r42, 1;
setp.lt.s32 %p3, %r35, 1;
mov.f32 %f41, 0f00000000;
@%p3 bra $L__BB1_21;
// %r5 = row * param_5 ; %rd1 = col * (param_5 / 2) ;
// %rd2 = col * ceil(param_5 / param_7).
mul.lo.s32 %r5, %r3, %r35;
mul.lo.s32 %r45, %r4, %r2;
cvt.s64.s32 %rd1, %r45;
add.s32 %r46, %r35, %r37;
add.s32 %r47, %r46, -1;
div.s32 %r48, %r47, %r37;
mul.lo.s32 %r49, %r48, %r2;
cvt.s64.s32 %rd2, %r49;
// %r52 = shared address of this thread's w_tile row (tid.x * 256 bytes);
// %r6 = that address + 8, the base pointer for the unrolled dot-product loop.
shl.b32 %r50, %r1, 8;
mov.u32 %r51, _ZZ26gemm_4bit_tiled_kernel_f32E6w_tile;
add.s32 %r52, %r51, %r50;
add.s32 %r6, %r52, 8;
cvta.to.global.u64 %rd3, %rd6;
cvta.to.global.u64 %rd4, %rd8;
cvta.to.global.u64 %rd5, %rd7;
mov.f32 %f41, 0f00000000;
mov.u32 %r83, 0;
mov.u32 %r84, %r83;
// Outer loop over tiles; %r84 (and its twin %r83) is the tile's first element index.
$L__BB1_3:
// %r9 = unsigned max(%r83 - param_5, -64); its negation is used below as the
// number of elements in this tile (at most 64).
sub.s32 %r53, %r83, %r35;
max.u32 %r9, %r53, -64;
// Stage the inputs: threads with tid.x <= 63 copy param_0 elements into
// x_tile, striding by 16 within the tile and stopping at param_5.
add.s32 %r54, %r1, %r84;
setp.ge.s32 %p4, %r54, %r35;
setp.gt.s32 %p5, %r1, 63;
or.pred %p6, %p5, %p4;
@%p6 bra $L__BB1_6;
add.s32 %r10, %r84, %r5;
mov.u32 %r85, %r1;
$L__BB1_5:
add.s32 %r55, %r10, %r85;
mul.wide.s32 %rd10, %r55, 4;
add.s64 %rd11, %rd3, %rd10;
ld.global.nc.f32 %f14, [%rd11];
shl.b32 %r56, %r85, 2;
mov.u32 %r57, _ZZ26gemm_4bit_tiled_kernel_f32E6x_tile;
add.s32 %r58, %r57, %r56;
st.shared.f32 [%r58], %f14;
add.s32 %r12, %r85, 16;
add.s32 %r59, %r12, %r84;
setp.lt.s32 %p7, %r59, %r35;
setp.lt.s32 %p8, %r85, 48;
and.pred %p9, %p8, %p7;
mov.u32 %r85, %r12;
@%p9 bra $L__BB1_5;
$L__BB1_6:
// %p1 = (col < param_6) && (tile base < param_5); %p12 = !%p1.
// Threads with %p12 set skip the weight load and the compute phase but
// still execute both barriers.
setp.lt.s32 %p10, %r2, %r36;
setp.lt.s32 %p11, %r84, %r35;
and.pred %p1, %p10, %p11;
not.pred %p12, %p1;
@%p12 bra $L__BB1_12;
// Dequantise this column's weights for the tile, two elements per iteration:
// %r87 = index within the tile (0, 2, 4, ...), %r86 = element index.
mov.u32 %r87, 0;
mov.u32 %r86, %r84;
$L__BB1_8:
// Packed byte index = element index / 2; skip if it is >= param_5 / 2.
shr.u32 %r15, %r86, 1;
setp.ge.s32 %p13, %r15, %r4;
@%p13 bra $L__BB1_11;
cvt.u64.u32 %rd12, %r15;
add.s64 %rd13, %rd12, %rd1;
add.s64 %rd14, %rd5, %rd13;
div.s32 %r61, %r86, %r37;
cvt.s64.s32 %rd15, %r61;
add.s64 %rd16, %rd15, %rd2;
shl.b64 %rd17, %rd16, 2;
add.s64 %rd18, %rd4, %rd17;
ld.global.nc.u8 %rs1, [%rd14];
// Low nibble: (nibble - 8) * scale -> w_tile[tid.x][%r87].
and.b16 %rs2, %rs1, 15;
cvt.u32.u16 %r62, %rs2;
add.s32 %r63, %r62, -8;
cvt.rn.f32.s32 %f15, %r63;
ld.global.nc.f32 %f2, [%rd18];
mul.f32 %f16, %f2, %f15;
shl.b32 %r67, %r87, 2;
add.s32 %r16, %r52, %r67;
st.shared.f32 [%r16], %f16;
// High nibble is stored only if it is still inside the tile and below param_5.
// It reuses the scale %f2 that was looked up for the low nibble's element.
add.s32 %r68, %r87, 1;
setp.gt.u32 %p14, %r68, 63;
add.s32 %r69, %r86, 1;
setp.ge.s32 %p15, %r69, %r35;
or.pred %p16, %p14, %p15;
@%p16 bra $L__BB1_11;
and.b16 %rs3, %rs1, 240;
shr.u16 %rs4, %rs3, 4;
cvt.u32.u16 %r70, %rs4;
add.s32 %r71, %r70, -8;
cvt.rn.f32.s32 %f17, %r71;
mul.f32 %f18, %f2, %f17;
st.shared.f32 [%r16+4], %f18;
$L__BB1_11:
add.s32 %r87, %r87, 2;
setp.lt.u32 %p17, %r87, 64;
add.s32 %r86, %r87, %r84;
setp.lt.s32 %p18, %r86, %r35;
and.pred %p19, %p17, %p18;
@%p19 bra $L__BB1_8;
$L__BB1_12:
// Barrier: both tiles are fully written before any thread reads them.
bar.sync 0;
@%p12 bra $L__BB1_20;
// Dot product over the tile. count = -%r9; %r19 = count & 3 is the remainder
// handled after the 4x-unrolled loop; %r92 is the running element index.
neg.s32 %r73, %r9;
mov.u32 %r92, 0;
and.b32 %r19, %r73, 3;
// Skip the unrolled loop when count < 4.
setp.gt.u32 %p21, %r9, -4;
@%p21 bra $L__BB1_16;
// %r90 = count - %r19 (a multiple of 4), counted down by 4 per iteration.
// %r88 walks x_tile; %r89 walks this thread's w_tile row, offset by +8 bytes.
add.s32 %r76, %r9, %r19;
neg.s32 %r90, %r76;
mov.u32 %r92, 0;
mov.u32 %r88, _ZZ26gemm_4bit_tiled_kernel_f32E6x_tile;
mov.u32 %r89, %r6;
$L__BB1_15:
ld.shared.f32 %f20, [%r89+-8];
ld.shared.f32 %f21, [%r88];
fma.rn.f32 %f22, %f21, %f20, %f41;
ld.shared.f32 %f23, [%r89+-4];
ld.shared.f32 %f24, [%r88+4];
fma.rn.f32 %f25, %f24, %f23, %f22;
ld.shared.f32 %f26, [%r89];
ld.shared.f32 %f27, [%r88+8];
fma.rn.f32 %f28, %f27, %f26, %f25;
ld.shared.f32 %f29, [%r89+4];
ld.shared.f32 %f30, [%r88+12];
fma.rn.f32 %f41, %f30, %f29, %f28;
add.s32 %r92, %r92, 4;
add.s32 %r89, %r89, 16;
add.s32 %r88, %r88, 16;
add.s32 %r90, %r90, -4;
setp.ne.s32 %p22, %r90, 0;
@%p22 bra $L__BB1_15;
$L__BB1_16:
// Remainder: up to 3 more elements, starting at index %r92.
setp.eq.s32 %p23, %r19, 0;
@%p23 bra $L__BB1_20;
shl.b32 %r77, %r92, 2;
mov.u32 %r78, _ZZ26gemm_4bit_tiled_kernel_f32E6x_tile;
add.s32 %r31, %r78, %r77;
add.s32 %r32, %r52, %r77;
ld.shared.f32 %f31, [%r32];
ld.shared.f32 %f32, [%r31];
fma.rn.f32 %f41, %f32, %f31, %f41;
setp.eq.s32 %p24, %r19, 1;
@%p24 bra $L__BB1_20;
ld.shared.f32 %f33, [%r32+4];
ld.shared.f32 %f34, [%r31+4];
fma.rn.f32 %f41, %f34, %f33, %f41;
setp.eq.s32 %p25, %r19, 2;
@%p25 bra $L__BB1_20;
ld.shared.f32 %f35, [%r32+8];
ld.shared.f32 %f36, [%r31+8];
fma.rn.f32 %f41, %f36, %f35, %f41;
$L__BB1_20:
// Barrier: all reads of the tiles finish before the next iteration overwrites them.
bar.sync 0;
add.s32 %r84, %r84, 64;
setp.lt.s32 %p26, %r84, %r35;
add.s32 %r83, %r83, 64;
@%p26 bra $L__BB1_3;
$L__BB1_21:
// Only threads with col < param_6 write a result: param_3[row * param_6 + col].
setp.ge.s32 %p27, %r2, %r36;
@%p27 bra $L__BB1_23;
mad.lo.s32 %r82, %r3, %r36, %r2;
cvta.to.global.u64 %rd19, %rd9;
mul.wide.s32 %rd20, %r82, 4;
add.s64 %rd21, %rd19, %rd20;
st.global.f32 [%rd21], %f41;
$L__BB1_23:
ret;
}
// .globl gemm_ternary_multibase_kernel_f32
// ---------------------------------------------------------------------------
// gemm_ternary_multibase_kernel_f32
//
// One thread computes one f32 output element from f32 inputs and packed 2-bit
// weights. Each weight byte holds four 2-bit fields; each field is decoded as
// (field - 1) and multiplied by one of four consecutive inputs. There are
// param_8 weight bytes per packed position, each with its own f32 scale.
//
// Parameters, as used by the code below:
//   param_0 : f32 array, read at [ctaid.y * param_5 + 4 * kp + j], j = 0..3
//   param_1 : byte array, read at [(col * param_7 + kp) * param_8 + b]
//   param_2 : f32 array of per-base scales; up to 8 entries are copied to local memory
//   param_3 : f32 array, written at [ctaid.y * param_6 + col]
//   param_4 : exclusive bound on ctaid.y
//   param_5 : exclusive bound on the input element index (beyond it, 0 is used)
//   param_6 : exclusive bound on col
//   param_7 : number of packed positions kp iterated per output
//   param_8 : number of bases b iterated per packed position
// NOTE(review): the prologue copies at most 8 scales into the 32-byte local
// depot, but the inner loops index that buffer by b < param_8 with no visible
// clamp. It looks like param_8 > 8 would read past the depot - confirm that
// callers guarantee param_8 <= 8.
// ---------------------------------------------------------------------------
.visible .entry gemm_ternary_multibase_kernel_f32(
.param .u64 gemm_ternary_multibase_kernel_f32_param_0,
.param .u64 gemm_ternary_multibase_kernel_f32_param_1,
.param .u64 gemm_ternary_multibase_kernel_f32_param_2,
.param .u64 gemm_ternary_multibase_kernel_f32_param_3,
.param .u32 gemm_ternary_multibase_kernel_f32_param_4,
.param .u32 gemm_ternary_multibase_kernel_f32_param_5,
.param .u32 gemm_ternary_multibase_kernel_f32_param_6,
.param .u32 gemm_ternary_multibase_kernel_f32_param_7,
.param .u32 gemm_ternary_multibase_kernel_f32_param_8
)
{
// 32-byte, 16-byte-aligned local buffer holding up to 8 f32 scales.
.local .align 16 .b8 __local_depot2[32];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<24>;
.reg .b16 %rs<57>;
.reg .f32 %f<137>;
.reg .b32 %r<94>;
.reg .b64 %rd<32>;
mov.u64 %SPL, __local_depot2;
ld.param.u64 %rd16, [gemm_ternary_multibase_kernel_f32_param_0];
ld.param.u64 %rd18, [gemm_ternary_multibase_kernel_f32_param_1];
ld.param.u64 %rd19, [gemm_ternary_multibase_kernel_f32_param_2];
ld.param.u64 %rd17, [gemm_ternary_multibase_kernel_f32_param_3];
ld.param.u32 %r20, [gemm_ternary_multibase_kernel_f32_param_4];
ld.param.u32 %r16, [gemm_ternary_multibase_kernel_f32_param_5];
ld.param.u32 %r17, [gemm_ternary_multibase_kernel_f32_param_6];
ld.param.u32 %r18, [gemm_ternary_multibase_kernel_f32_param_7];
ld.param.u32 %r19, [gemm_ternary_multibase_kernel_f32_param_8];
cvta.to.global.u64 %rd1, %rd19;
cvta.to.global.u64 %rd2, %rd18;
// %rd3 = address of the local scale buffer.
add.u64 %rd3, %SPL, 0;
// %r1 = col = ctaid.x * ntid.x + tid.x ; %r2 = row = ctaid.y.
mov.u32 %r21, %ntid.x;
mov.u32 %r22, %ctaid.x;
mov.u32 %r23, %tid.x;
mad.lo.s32 %r1, %r22, %r21, %r23;
setp.ge.s32 %p1, %r1, %r17;
mov.u32 %r2, %ctaid.y;
setp.ge.s32 %p2, %r2, %r20;
or.pred %p3, %p2, %p1;
// Out-of-range threads exit without writing anything.
@%p3 bra $L__BB2_30;
// Copy min(param_8, 8) scales from param_2 into the local buffer, one at a
// time, with an exit check after each entry.
setp.lt.s32 %p4, %r19, 1;
@%p4 bra $L__BB2_10;
ld.global.nc.f32 %f19, [%rd1];
st.local.f32 [%rd3], %f19;
setp.eq.s32 %p5, %r19, 1;
@%p5 bra $L__BB2_10;
ld.global.nc.f32 %f20, [%rd1+4];
st.local.f32 [%rd3+4], %f20;
setp.eq.s32 %p6, %r19, 2;
@%p6 bra $L__BB2_10;
ld.global.nc.f32 %f21, [%rd1+8];
st.local.f32 [%rd3+8], %f21;
setp.lt.s32 %p7, %r19, 4;
@%p7 bra $L__BB2_10;
ld.global.nc.f32 %f22, [%rd1+12];
st.local.f32 [%rd3+12], %f22;
setp.eq.s32 %p8, %r19, 4;
@%p8 bra $L__BB2_10;
ld.global.nc.f32 %f23, [%rd1+16];
st.local.f32 [%rd3+16], %f23;
setp.lt.s32 %p9, %r19, 6;
@%p9 bra $L__BB2_10;
ld.global.nc.f32 %f24, [%rd1+20];
st.local.f32 [%rd3+20], %f24;
setp.eq.s32 %p10, %r19, 6;
@%p10 bra $L__BB2_10;
ld.global.nc.f32 %f25, [%rd1+24];
st.local.f32 [%rd3+24], %f25;
setp.lt.s32 %p11, %r19, 8;
@%p11 bra $L__BB2_10;
ld.global.nc.f32 %f26, [%rd1+28];
st.local.f32 [%rd3+28], %f26;
$L__BB2_10:
// If param_7 < 1 or param_8 < 1 there is nothing to accumulate: store 0.
setp.lt.s32 %p12, %r18, 1;
mov.f32 %f135, 0f00000000;
@%p12 bra $L__BB2_29;
@%p4 bra $L__BB2_29;
// %r4 = param_8 & 3 (bases left for the tail), %r5 = param_8 - %r4 (bases
// handled 4 at a time), %rd6 = row * param_5, %r6 = col * param_7.
add.s32 %r3, %r19, -1;
and.b32 %r4, %r19, 3;
sub.s32 %r5, %r19, %r4;
mul.lo.s32 %r25, %r2, %r16;
cvt.s64.s32 %rd6, %r25;
cvta.to.global.u64 %rd7, %rd16;
mul.lo.s32 %r6, %r1, %r18;
mov.f32 %f29, 0f00000000;
mov.u32 %r90, 0;
mov.f32 %f135, %f29;
// Outer loop over packed positions kp = %r90; %f135 is the accumulator.
$L__BB2_13:
// Load the four inputs for this position (%f128..%f131); any whose element
// index 4*kp + j is >= param_5 stays 0.
shl.b32 %r8, %r90, 2;
setp.ge.s32 %p14, %r8, %r16;
cvt.s64.s32 %rd21, %r8;
add.s64 %rd22, %rd21, %rd6;
shl.b64 %rd23, %rd22, 2;
add.s64 %rd8, %rd7, %rd23;
mov.f32 %f128, %f29;
@%p14 bra $L__BB2_15;
ld.global.nc.f32 %f128, [%rd8];
$L__BB2_15:
add.s32 %r26, %r8, 1;
setp.ge.s32 %p15, %r26, %r16;
mov.f32 %f130, 0f00000000;
mov.f32 %f129, %f130;
@%p15 bra $L__BB2_17;
ld.global.nc.f32 %f129, [%rd8+4];
$L__BB2_17:
add.s32 %r27, %r8, 2;
setp.ge.s32 %p16, %r27, %r16;
@%p16 bra $L__BB2_19;
ld.global.nc.f32 %f130, [%rd8+8];
$L__BB2_19:
add.s32 %r28, %r8, 3;
setp.ge.s32 %p17, %r28, %r16;
mov.f32 %f131, 0f00000000;
@%p17 bra $L__BB2_21;
ld.global.nc.f32 %f131, [%rd8+12];
$L__BB2_21:
// %r9 = (col * param_7 + kp) * param_8: byte offset of base 0 for this position.
add.s32 %r30, %r90, %r6;
mul.lo.s32 %r9, %r30, %r19;
// Fewer than 4 bases: go straight to the tail.
setp.lt.u32 %p18, %r3, 3;
mov.u32 %r93, 0;
@%p18 bra $L__BB2_24;
// 4x-unrolled loop over bases: %rd30 walks the weight bytes, %rd31 walks the
// local scales (16 bytes per iteration), %r92 counts remaining bases, %r93 = b.
cvt.s64.s32 %rd24, %r9;
add.s64 %rd30, %rd2, %rd24;
mov.u32 %r93, 0;
mov.u64 %rd31, %rd3;
mov.u32 %r92, %r5;
$L__BB2_23:
// Base b+0: decode bits [1:0], [3:2], [5:4], [7:6] as (field - 1).
ld.global.nc.u8 %rs1, [%rd30];
and.b16 %rs3, %rs1, 3;
cvt.u32.u16 %r32, %rs3;
add.s32 %r33, %r32, -1;
cvt.rn.f32.s32 %f35, %r33;
shr.u16 %rs4, %rs1, 2;
and.b16 %rs5, %rs4, 3;
cvt.u32.u16 %r34, %rs5;
add.s32 %r35, %r34, -1;
cvt.rn.f32.s32 %f36, %r35;
shr.u16 %rs6, %rs1, 4;
and.b16 %rs7, %rs6, 3;
cvt.u32.u16 %r36, %rs7;
add.s32 %r37, %r36, -1;
cvt.rn.f32.s32 %f37, %r37;
shr.u16 %rs8, %rs1, 6;
cvt.u32.u16 %r38, %rs8;
add.s32 %r39, %r38, -1;
cvt.rn.f32.s32 %f38, %r39;
// Load four consecutive scales; %f39..%f42 serve bases b+0..b+3.
ld.local.v4.f32 {%f39, %f40, %f41, %f42}, [%rd31];
// acc += scale * (input_j * weight_j) for j = 0..3.
mul.f32 %f47, %f128, %f35;
fma.rn.f32 %f48, %f39, %f47, %f135;
mul.f32 %f49, %f129, %f36;
fma.rn.f32 %f50, %f39, %f49, %f48;
mul.f32 %f51, %f130, %f37;
fma.rn.f32 %f52, %f39, %f51, %f50;
mul.f32 %f53, %f131, %f38;
fma.rn.f32 %f54, %f39, %f53, %f52;
// Base b+1.
ld.global.nc.u8 %rs9, [%rd30+1];
and.b16 %rs11, %rs9, 3;
cvt.u32.u16 %r40, %rs11;
add.s32 %r41, %r40, -1;
cvt.rn.f32.s32 %f55, %r41;
shr.u16 %rs12, %rs9, 2;
and.b16 %rs13, %rs12, 3;
cvt.u32.u16 %r42, %rs13;
add.s32 %r43, %r42, -1;
cvt.rn.f32.s32 %f56, %r43;
shr.u16 %rs14, %rs9, 4;
and.b16 %rs15, %rs14, 3;
cvt.u32.u16 %r44, %rs15;
add.s32 %r45, %r44, -1;
cvt.rn.f32.s32 %f57, %r45;
shr.u16 %rs16, %rs9, 6;
cvt.u32.u16 %r46, %rs16;
add.s32 %r47, %r46, -1;
cvt.rn.f32.s32 %f58, %r47;
mul.f32 %f59, %f128, %f55;
fma.rn.f32 %f60, %f40, %f59, %f54;
mul.f32 %f61, %f129, %f56;
fma.rn.f32 %f62, %f40, %f61, %f60;
mul.f32 %f63, %f130, %f57;
fma.rn.f32 %f64, %f40, %f63, %f62;
mul.f32 %f65, %f131, %f58;
fma.rn.f32 %f66, %f40, %f65, %f64;
// Base b+2.
ld.global.nc.u8 %rs17, [%rd30+2];
and.b16 %rs19, %rs17, 3;
cvt.u32.u16 %r48, %rs19;
add.s32 %r49, %r48, -1;
cvt.rn.f32.s32 %f67, %r49;
shr.u16 %rs20, %rs17, 2;
and.b16 %rs21, %rs20, 3;
cvt.u32.u16 %r50, %rs21;
add.s32 %r51, %r50, -1;
cvt.rn.f32.s32 %f68, %r51;
shr.u16 %rs22, %rs17, 4;
and.b16 %rs23, %rs22, 3;
cvt.u32.u16 %r52, %rs23;
add.s32 %r53, %r52, -1;
cvt.rn.f32.s32 %f69, %r53;
shr.u16 %rs24, %rs17, 6;
cvt.u32.u16 %r54, %rs24;
add.s32 %r55, %r54, -1;
cvt.rn.f32.s32 %f70, %r55;
mul.f32 %f71, %f128, %f67;
fma.rn.f32 %f72, %f41, %f71, %f66;
mul.f32 %f73, %f129, %f68;
fma.rn.f32 %f74, %f41, %f73, %f72;
mul.f32 %f75, %f130, %f69;
fma.rn.f32 %f76, %f41, %f75, %f74;
mul.f32 %f77, %f131, %f70;
fma.rn.f32 %f78, %f41, %f77, %f76;
// Base b+3.
ld.global.nc.u8 %rs25, [%rd30+3];
and.b16 %rs27, %rs25, 3;
cvt.u32.u16 %r56, %rs27;
add.s32 %r57, %r56, -1;
cvt.rn.f32.s32 %f79, %r57;
shr.u16 %rs28, %rs25, 2;
and.b16 %rs29, %rs28, 3;
cvt.u32.u16 %r58, %rs29;
add.s32 %r59, %r58, -1;
cvt.rn.f32.s32 %f80, %r59;
shr.u16 %rs30, %rs25, 4;
and.b16 %rs31, %rs30, 3;
cvt.u32.u16 %r60, %rs31;
add.s32 %r61, %r60, -1;
cvt.rn.f32.s32 %f81, %r61;
shr.u16 %rs32, %rs25, 6;
cvt.u32.u16 %r62, %rs32;
add.s32 %r63, %r62, -1;
cvt.rn.f32.s32 %f82, %r63;
mul.f32 %f83, %f128, %f79;
fma.rn.f32 %f84, %f42, %f83, %f78;
mul.f32 %f85, %f129, %f80;
fma.rn.f32 %f86, %f42, %f85, %f84;
mul.f32 %f87, %f130, %f81;
fma.rn.f32 %f88, %f42, %f87, %f86;
mul.f32 %f89, %f131, %f82;
fma.rn.f32 %f135, %f42, %f89, %f88;
// Advance by 4 bases and loop until %r92 reaches 0.
add.s32 %r93, %r93, 4;
add.s64 %rd31, %rd31, 16;
add.s64 %rd30, %rd30, 4;
add.s32 %r92, %r92, -4;
setp.ne.s32 %p19, %r92, 0;
@%p19 bra $L__BB2_23;
$L__BB2_24:
// Tail: the remaining %r4 (0..3) bases, starting at b = %r93.
setp.eq.s32 %p20, %r4, 0;
@%p20 bra $L__BB2_28;
setp.eq.s32 %p21, %r4, 1;
// %rd14 = &param_1[%r9 + b] ; %rd15 (set below) = &local_scales[b].
add.s32 %r64, %r93, %r9;
cvt.s64.s32 %rd25, %r64;
add.s64 %rd14, %rd2, %rd25;
ld.global.nc.u8 %rs33, [%rd14];
and.b16 %rs35, %rs33, 3;
cvt.u32.u16 %r65, %rs35;
add.s32 %r66, %r65, -1;
cvt.rn.f32.s32 %f90, %r66;
shr.u16 %rs36, %rs33, 2;
and.b16 %rs37, %rs36, 3;
cvt.u32.u16 %r67, %rs37;
add.s32 %r68, %r67, -1;
cvt.rn.f32.s32 %f91, %r68;
shr.u16 %rs38, %rs33, 4;
and.b16 %rs39, %rs38, 3;
cvt.u32.u16 %r69, %rs39;
add.s32 %r70, %r69, -1;
cvt.rn.f32.s32 %f92, %r70;
shr.u16 %rs40, %rs33, 6;
cvt.u32.u16 %r71, %rs40;
add.s32 %r72, %r71, -1;
cvt.rn.f32.s32 %f93, %r72;
mul.wide.s32 %rd26, %r93, 4;
add.s64 %rd15, %rd3, %rd26;
mul.f32 %f94, %f128, %f90;
ld.local.f32 %f95, [%rd15];
fma.rn.f32 %f96, %f95, %f94, %f135;
mul.f32 %f97, %f129, %f91;
fma.rn.f32 %f98, %f95, %f97, %f96;
mul.f32 %f99, %f130, %f92;
fma.rn.f32 %f100, %f95, %f99, %f98;
mul.f32 %f101, %f131, %f93;
fma.rn.f32 %f135, %f95, %f101, %f100;
@%p21 bra $L__BB2_28;
// Second tail base.
setp.eq.s32 %p22, %r4, 2;
ld.global.nc.u8 %rs41, [%rd14+1];
and.b16 %rs43, %rs41, 3;
cvt.u32.u16 %r73, %rs43;
add.s32 %r74, %r73, -1;
cvt.rn.f32.s32 %f102, %r74;
shr.u16 %rs44, %rs41, 2;
and.b16 %rs45, %rs44, 3;
cvt.u32.u16 %r75, %rs45;
add.s32 %r76, %r75, -1;
cvt.rn.f32.s32 %f103, %r76;
shr.u16 %rs46, %rs41, 4;
and.b16 %rs47, %rs46, 3;
cvt.u32.u16 %r77, %rs47;
add.s32 %r78, %r77, -1;
cvt.rn.f32.s32 %f104, %r78;
shr.u16 %rs48, %rs41, 6;
cvt.u32.u16 %r79, %rs48;
add.s32 %r80, %r79, -1;
cvt.rn.f32.s32 %f105, %r80;
mul.f32 %f106, %f128, %f102;
ld.local.f32 %f107, [%rd15+4];
fma.rn.f32 %f108, %f107, %f106, %f135;
mul.f32 %f109, %f129, %f103;
fma.rn.f32 %f110, %f107, %f109, %f108;
mul.f32 %f111, %f130, %f104;
fma.rn.f32 %f112, %f107, %f111, %f110;
mul.f32 %f113, %f131, %f105;
fma.rn.f32 %f135, %f107, %f113, %f112;
@%p22 bra $L__BB2_28;
// Third tail base.
ld.global.nc.u8 %rs49, [%rd14+2];
and.b16 %rs51, %rs49, 3;
cvt.u32.u16 %r81, %rs51;
add.s32 %r82, %r81, -1;
cvt.rn.f32.s32 %f114, %r82;
shr.u16 %rs52, %rs49, 2;
and.b16 %rs53, %rs52, 3;
cvt.u32.u16 %r83, %rs53;
add.s32 %r84, %r83, -1;
cvt.rn.f32.s32 %f115, %r84;
shr.u16 %rs54, %rs49, 4;
and.b16 %rs55, %rs54, 3;
cvt.u32.u16 %r85, %rs55;
add.s32 %r86, %r85, -1;
cvt.rn.f32.s32 %f116, %r86;
shr.u16 %rs56, %rs49, 6;
cvt.u32.u16 %r87, %rs56;
add.s32 %r88, %r87, -1;
cvt.rn.f32.s32 %f117, %r88;
mul.f32 %f118, %f128, %f114;
ld.local.f32 %f119, [%rd15+8];
fma.rn.f32 %f120, %f119, %f118, %f135;
mul.f32 %f121, %f129, %f115;
fma.rn.f32 %f122, %f119, %f121, %f120;
mul.f32 %f123, %f130, %f116;
fma.rn.f32 %f124, %f119, %f123, %f122;
mul.f32 %f125, %f131, %f117;
fma.rn.f32 %f135, %f119, %f125, %f124;
$L__BB2_28:
// Next packed position; loop while kp < param_7.
add.s32 %r90, %r90, 1;
setp.lt.s32 %p23, %r90, %r18;
@%p23 bra $L__BB2_13;
$L__BB2_29:
// Store the accumulator to param_3[row * param_6 + col].
mad.lo.s32 %r89, %r2, %r17, %r1;
cvta.to.global.u64 %rd27, %rd17;
mul.wide.s32 %rd28, %r89, 4;
add.s64 %rd29, %rd27, %rd28;
st.global.f32 [%rd29], %f135;
$L__BB2_30:
ret;
}
// .globl gemm_4bit_vectorized_kernel_f32
.visible .entry gemm_4bit_vectorized_kernel_f32(
.param .u64 gemm_4bit_vectorized_kernel_f32_param_0,
.param .u64 gemm_4bit_vectorized_kernel_f32_param_1,
.param .u64 gemm_4bit_vectorized_kernel_f32_param_2,
.param .u64 gemm_4bit_vectorized_kernel_f32_param_3,
.param .u32 gemm_4bit_vectorized_kernel_f32_param_4,
.param .u32 gemm_4bit_vectorized_kernel_f32_param_5,
.param .u32 gemm_4bit_vectorized_kernel_f32_param_6,
.param .u32 gemm_4bit_vectorized_kernel_f32_param_7
)
{
.reg .pred %p<24>;
.reg .b16 %rs<91>;
.reg .f32 %f<301>;
.reg .b32 %r<239>;
.reg .b64 %rd<133>;
ld.param.u64 %rd29, [gemm_4bit_vectorized_kernel_f32_param_0];
ld.param.u64 %rd30, [gemm_4bit_vectorized_kernel_f32_param_1];
ld.param.u64 %rd31, [gemm_4bit_vectorized_kernel_f32_param_2];
ld.param.u64 %rd28, [gemm_4bit_vectorized_kernel_f32_param_3];
ld.param.u32 %r45, [gemm_4bit_vectorized_kernel_f32_param_4];
ld.param.u32 %r42, [gemm_4bit_vectorized_kernel_f32_param_5];
ld.param.u32 %r43, [gemm_4bit_vectorized_kernel_f32_param_6];
ld.param.u32 %r44, [gemm_4bit_vectorized_kernel_f32_param_7];
cvta.to.global.u64 %rd1, %rd29;
cvta.to.global.u64 %rd2, %rd31;
cvta.to.global.u64 %rd3, %rd30;
mov.u32 %r46, %ntid.x;
mov.u32 %r47, %ctaid.x;
mov.u32 %r48, %tid.x;
mad.lo.s32 %r1, %r47, %r46, %r48;
setp.ge.s32 %p1, %r1, %r43;
mov.u32 %r2, %ctaid.y;
setp.ge.s32 %p2, %r2, %r45;
or.pred %p3, %p2, %p1;
@%p3 bra $L__BB3_36;
shr.u32 %r49, %r42, 31;
add.s32 %r50, %r42, %r49;
shr.s32 %r51, %r50, 1;
add.s32 %r52, %r42, %r44;
add.s32 %r53, %r52, -1;
div.s32 %r54, %r53, %r44;
mul.lo.s32 %r55, %r2, %r42;
cvt.s64.s32 %rd4, %r55;
mul.lo.s32 %r56, %r51, %r1;
cvt.s64.s32 %rd5, %r56;
mul.lo.s32 %r57, %r54, %r1;
cvt.s64.s32 %rd6, %r57;
shr.s32 %r58, %r42, 31;
shr.u32 %r59, %r58, 29;
add.s32 %r60, %r42, %r59;
shr.s32 %r3, %r60, 3;
setp.lt.s32 %p4, %r42, 8;
mov.f32 %f295, 0f00000000;
@%p4 bra $L__BB3_18;
shl.b64 %rd32, %rd4, 2;
add.s64 %rd7, %rd1, %rd32;
add.s32 %r62, %r3, -1;
and.b32 %r4, %r3, 3;
setp.lt.u32 %p5, %r62, 3;
mov.f32 %f295, 0f00000000;
mov.u32 %r230, 0;
@%p5 bra $L__BB3_13;
sub.s32 %r229, %r3, %r4;
add.s64 %rd127, %rd3, %rd5;
mov.f32 %f295, 0f00000000;
mov.u32 %r230, 0;
$L__BB3_4:
.pragma "nounroll";
shl.b32 %r64, %r230, 1;
mul.wide.s32 %rd33, %r64, 16;
add.s64 %rd10, %rd7, %rd33;
ld.global.nc.v4.f32 {%f77, %f78, %f79, %f80}, [%rd10];
ld.global.nc.v4.f32 {%f85, %f86, %f87, %f88}, [%rd10+16];
ld.global.nc.u8 %rs1, [%rd127+2];
ld.global.nc.u8 %rs2, [%rd127+3];
shl.b32 %r65, %r230, 3;
div.s32 %r66, %r65, %r44;
cvt.s64.s32 %rd34, %r66;
add.s64 %rd35, %rd34, %rd6;
shl.b64 %rd36, %rd35, 2;
add.s64 %rd37, %rd2, %rd36;
ld.global.nc.u8 %rs11, [%rd127];
and.b16 %rs12, %rs11, 240;
and.b16 %rs13, %rs11, 15;
cvt.u32.u16 %r67, %rs13;
add.s32 %r68, %r67, -8;
cvt.rn.f32.s32 %f89, %r68;
shr.u16 %rs14, %rs12, 4;
cvt.u32.u16 %r69, %rs14;
add.s32 %r70, %r69, -8;
cvt.rn.f32.s32 %f90, %r70;
ld.global.nc.u8 %rs15, [%rd127+1];
and.b16 %rs16, %rs15, 240;
and.b16 %rs17, %rs15, 15;
cvt.u32.u16 %r71, %rs17;
add.s32 %r72, %r71, -8;
cvt.rn.f32.s32 %f91, %r72;
shr.u16 %rs18, %rs16, 4;
cvt.u32.u16 %r73, %rs18;
add.s32 %r74, %r73, -8;
cvt.rn.f32.s32 %f92, %r74;
ld.global.nc.f32 %f282, [%rd37];
mul.f32 %f93, %f282, %f89;
fma.rn.f32 %f94, %f77, %f93, %f295;
mul.f32 %f95, %f282, %f90;
fma.rn.f32 %f96, %f78, %f95, %f94;
mul.f32 %f97, %f282, %f91;
fma.rn.f32 %f98, %f79, %f97, %f96;
mul.f32 %f99, %f282, %f92;
fma.rn.f32 %f7, %f80, %f99, %f98;
or.b32 %r75, %r65, 4;
div.s32 %r8, %r75, %r44;
setp.eq.s32 %p6, %r8, %r66;
@%p6 bra $L__BB3_6;
cvt.s64.s32 %rd38, %r8;
add.s64 %rd39, %rd38, %rd6;
shl.b64 %rd40, %rd39, 2;
add.s64 %rd41, %rd2, %rd40;
ld.global.nc.f32 %f282, [%rd41];
$L__BB3_6:
and.b16 %rs19, %rs1, 240;
cvt.u32.u16 %r76, %rs1;
and.b32 %r77, %r76, 15;
add.s32 %r78, %r77, -8;
cvt.rn.f32.s32 %f100, %r78;
shr.u16 %rs20, %rs19, 4;
cvt.u32.u16 %r79, %rs20;
add.s32 %r80, %r79, -8;
cvt.rn.f32.s32 %f101, %r80;
and.b16 %rs21, %rs2, 240;
cvt.u32.u16 %r81, %rs2;
and.b32 %r82, %r81, 15;
add.s32 %r83, %r82, -8;
cvt.rn.f32.s32 %f102, %r83;
shr.u16 %rs22, %rs21, 4;
cvt.u32.u16 %r84, %rs22;
add.s32 %r85, %r84, -8;
cvt.rn.f32.s32 %f103, %r85;
mul.f32 %f104, %f282, %f100;
fma.rn.f32 %f105, %f85, %f104, %f7;
mul.f32 %f106, %f282, %f101;
fma.rn.f32 %f107, %f86, %f106, %f105;
mul.f32 %f108, %f282, %f102;
fma.rn.f32 %f109, %f87, %f108, %f107;
mul.f32 %f110, %f282, %f103;
fma.rn.f32 %f111, %f88, %f110, %f109;
ld.global.nc.v4.f32 {%f112, %f113, %f114, %f115}, [%rd10+32];
ld.global.nc.v4.f32 {%f120, %f121, %f122, %f123}, [%rd10+48];
ld.global.nc.u8 %rs3, [%rd127+6];
ld.global.nc.u8 %rs4, [%rd127+7];
add.s32 %r87, %r65, 8;
div.s32 %r88, %r87, %r44;
cvt.s64.s32 %rd42, %r88;
add.s64 %rd43, %rd42, %rd6;
shl.b64 %rd44, %rd43, 2;
add.s64 %rd45, %rd2, %rd44;
ld.global.nc.u8 %rs23, [%rd127+4];
and.b16 %rs24, %rs23, 240;
and.b16 %rs25, %rs23, 15;
cvt.u32.u16 %r89, %rs25;
add.s32 %r90, %r89, -8;
cvt.rn.f32.s32 %f124, %r90;
shr.u16 %rs26, %rs24, 4;
cvt.u32.u16 %r91, %rs26;
add.s32 %r92, %r91, -8;
cvt.rn.f32.s32 %f125, %r92;
ld.global.nc.u8 %rs27, [%rd127+5];
and.b16 %rs28, %rs27, 240;
and.b16 %rs29, %rs27, 15;
cvt.u32.u16 %r93, %rs29;
add.s32 %r94, %r93, -8;
cvt.rn.f32.s32 %f126, %r94;
shr.u16 %rs30, %rs28, 4;
cvt.u32.u16 %r95, %rs30;
add.s32 %r96, %r95, -8;
cvt.rn.f32.s32 %f127, %r96;
ld.global.nc.f32 %f283, [%rd45];
mul.f32 %f128, %f283, %f124;
fma.rn.f32 %f129, %f112, %f128, %f111;
mul.f32 %f130, %f283, %f125;
fma.rn.f32 %f131, %f113, %f130, %f129;
mul.f32 %f132, %f283, %f126;
fma.rn.f32 %f133, %f114, %f132, %f131;
mul.f32 %f134, %f283, %f127;
fma.rn.f32 %f15, %f115, %f134, %f133;
or.b32 %r97, %r87, 4;
div.s32 %r9, %r97, %r44;
setp.eq.s32 %p7, %r9, %r88;
@%p7 bra $L__BB3_8;
cvt.s64.s32 %rd46, %r9;
add.s64 %rd47, %rd46, %rd6;
shl.b64 %rd48, %rd47, 2;
add.s64 %rd49, %rd2, %rd48;
ld.global.nc.f32 %f283, [%rd49];
$L__BB3_8:
and.b16 %rs31, %rs3, 240;
cvt.u32.u16 %r98, %rs3;
and.b32 %r99, %r98, 15;
add.s32 %r100, %r99, -8;
cvt.rn.f32.s32 %f135, %r100;
shr.u16 %rs32, %rs31, 4;
cvt.u32.u16 %r101, %rs32;
add.s32 %r102, %r101, -8;
cvt.rn.f32.s32 %f136, %r102;
and.b16 %rs33, %rs4, 240;
cvt.u32.u16 %r103, %rs4;
and.b32 %r104, %r103, 15;
add.s32 %r105, %r104, -8;
cvt.rn.f32.s32 %f137, %r105;
shr.u16 %rs34, %rs33, 4;
cvt.u32.u16 %r106, %rs34;
add.s32 %r107, %r106, -8;
cvt.rn.f32.s32 %f138, %r107;
mul.f32 %f139, %f283, %f135;
fma.rn.f32 %f140, %f120, %f139, %f15;
mul.f32 %f141, %f283, %f136;
fma.rn.f32 %f142, %f121, %f141, %f140;
mul.f32 %f143, %f283, %f137;
fma.rn.f32 %f144, %f122, %f143, %f142;
mul.f32 %f145, %f283, %f138;
fma.rn.f32 %f146, %f123, %f145, %f144;
ld.global.nc.v4.f32 {%f147, %f148, %f149, %f150}, [%rd10+64];
ld.global.nc.v4.f32 {%f155, %f156, %f157, %f158}, [%rd10+80];
ld.global.nc.u8 %rs5, [%rd127+10];
ld.global.nc.u8 %rs6, [%rd127+11];
add.s32 %r109, %r65, 16;
div.s32 %r110, %r109, %r44;
cvt.s64.s32 %rd50, %r110;
add.s64 %rd51, %rd50, %rd6;
shl.b64 %rd52, %rd51, 2;
add.s64 %rd53, %rd2, %rd52;
ld.global.nc.u8 %rs35, [%rd127+8];
and.b16 %rs36, %rs35, 240;
and.b16 %rs37, %rs35, 15;
cvt.u32.u16 %r111, %rs37;
add.s32 %r112, %r111, -8;
cvt.rn.f32.s32 %f159, %r112;
shr.u16 %rs38, %rs36, 4;
cvt.u32.u16 %r113, %rs38;
add.s32 %r114, %r113, -8;
cvt.rn.f32.s32 %f160, %r114;
ld.global.nc.u8 %rs39, [%rd127+9];
and.b16 %rs40, %rs39, 240;
and.b16 %rs41, %rs39, 15;
cvt.u32.u16 %r115, %rs41;
add.s32 %r116, %r115, -8;
cvt.rn.f32.s32 %f161, %r116;
shr.u16 %rs42, %rs40, 4;
cvt.u32.u16 %r117, %rs42;
add.s32 %r118, %r117, -8;
cvt.rn.f32.s32 %f162, %r118;
ld.global.nc.f32 %f284, [%rd53];
mul.f32 %f163, %f284, %f159;
fma.rn.f32 %f164, %f147, %f163, %f146;
mul.f32 %f165, %f284, %f160;
fma.rn.f32 %f166, %f148, %f165, %f164;
mul.f32 %f167, %f284, %f161;
fma.rn.f32 %f168, %f149, %f167, %f166;
mul.f32 %f169, %f284, %f162;
fma.rn.f32 %f23, %f150, %f169, %f168;
or.b32 %r119, %r109, 4;
div.s32 %r10, %r119, %r44;
setp.eq.s32 %p8, %r10, %r110;
@%p8 bra $L__BB3_10;
cvt.s64.s32 %rd54, %r10;
add.s64 %rd55, %rd54, %rd6;
shl.b64 %rd56, %rd55, 2;
add.s64 %rd57, %rd2, %rd56;
ld.global.nc.f32 %f284, [%rd57];
$L__BB3_10:
and.b16 %rs43, %rs5, 240;
cvt.u32.u16 %r120, %rs5;
and.b32 %r121, %r120, 15;
add.s32 %r122, %r121, -8;
cvt.rn.f32.s32 %f170, %r122;
shr.u16 %rs44, %rs43, 4;
cvt.u32.u16 %r123, %rs44;
add.s32 %r124, %r123, -8;
cvt.rn.f32.s32 %f171, %r124;
and.b16 %rs45, %rs6, 240;
cvt.u32.u16 %r125, %rs6;
and.b32 %r126, %r125, 15;
add.s32 %r127, %r126, -8;
cvt.rn.f32.s32 %f172, %r127;
shr.u16 %rs46, %rs45, 4;
cvt.u32.u16 %r128, %rs46;
add.s32 %r129, %r128, -8;
cvt.rn.f32.s32 %f173, %r129;
mul.f32 %f174, %f284, %f170;
fma.rn.f32 %f175, %f155, %f174, %f23;
mul.f32 %f176, %f284, %f171;
fma.rn.f32 %f177, %f156, %f176, %f175;
mul.f32 %f178, %f284, %f172;
fma.rn.f32 %f179, %f157, %f178, %f177;
mul.f32 %f180, %f284, %f173;
fma.rn.f32 %f181, %f158, %f180, %f179;
ld.global.nc.v4.f32 {%f182, %f183, %f184, %f185}, [%rd10+96];
ld.global.nc.v4.f32 {%f190, %f191, %f192, %f193}, [%rd10+112];
ld.global.nc.u8 %rs7, [%rd127+14];
ld.global.nc.u8 %rs8, [%rd127+15];
add.s32 %r131, %r65, 24;
div.s32 %r132, %r131, %r44;
cvt.s64.s32 %rd58, %r132;
add.s64 %rd59, %rd58, %rd6;
shl.b64 %rd60, %rd59, 2;
add.s64 %rd61, %rd2, %rd60;
ld.global.nc.u8 %rs47, [%rd127+12];
and.b16 %rs48, %rs47, 240;
and.b16 %rs49, %rs47, 15;
cvt.u32.u16 %r133, %rs49;
add.s32 %r134, %r133, -8;
cvt.rn.f32.s32 %f194, %r134;
shr.u16 %rs50, %rs48, 4;
cvt.u32.u16 %r135, %rs50;
add.s32 %r136, %r135, -8;
cvt.rn.f32.s32 %f195, %r136;
ld.global.nc.u8 %rs51, [%rd127+13];
and.b16 %rs52, %rs51, 240;
and.b16 %rs53, %rs51, 15;
cvt.u32.u16 %r137, %rs53;
add.s32 %r138, %r137, -8;
cvt.rn.f32.s32 %f196, %r138;
shr.u16 %rs54, %rs52, 4;
cvt.u32.u16 %r139, %rs54;
add.s32 %r140, %r139, -8;
cvt.rn.f32.s32 %f197, %r140;
ld.global.nc.f32 %f285, [%rd61];
mul.f32 %f198, %f285, %f194;
fma.rn.f32 %f199, %f182, %f198, %f181;
mul.f32 %f200, %f285, %f195;
fma.rn.f32 %f201, %f183, %f200, %f199;
mul.f32 %f202, %f285, %f196;
fma.rn.f32 %f203, %f184, %f202, %f201;
mul.f32 %f204, %f285, %f197;
fma.rn.f32 %f31, %f185, %f204, %f203;
or.b32 %r141, %r131, 4;
div.s32 %r11, %r141, %r44;
setp.eq.s32 %p9, %r11, %r132;
@%p9 bra $L__BB3_12;
cvt.s64.s32 %rd62, %r11;
add.s64 %rd63, %rd62, %rd6;
shl.b64 %rd64, %rd63, 2;
add.s64 %rd65, %rd2, %rd64;
ld.global.nc.f32 %f285, [%rd65];
$L__BB3_12:
add.s64 %rd127, %rd127, 16;
and.b16 %rs55, %rs7, 240;
cvt.u32.u16 %r142, %rs7;
and.b32 %r143, %r142, 15;
add.s32 %r144, %r143, -8;
cvt.rn.f32.s32 %f205, %r144;
shr.u16 %rs56, %rs55, 4;
cvt.u32.u16 %r145, %rs56;
add.s32 %r146, %r145, -8;
cvt.rn.f32.s32 %f206, %r146;
and.b16 %rs57, %rs8, 240;
cvt.u32.u16 %r147, %rs8;
and.b32 %r148, %r147, 15;
add.s32 %r149, %r148, -8;
cvt.rn.f32.s32 %f207, %r149;
shr.u16 %rs58, %rs57, 4;
cvt.u32.u16 %r150, %rs58;
add.s32 %r151, %r150, -8;
cvt.rn.f32.s32 %f208, %r151;
mul.f32 %f209, %f285, %f205;
fma.rn.f32 %f210, %f190, %f209, %f31;
mul.f32 %f211, %f285, %f206;
fma.rn.f32 %f212, %f191, %f211, %f210;
mul.f32 %f213, %f285, %f207;
fma.rn.f32 %f214, %f192, %f213, %f212;
mul.f32 %f215, %f285, %f208;
fma.rn.f32 %f295, %f193, %f215, %f214;
add.s32 %r230, %r230, 4;
add.s32 %r229, %r229, -4;
setp.ne.s32 %p10, %r229, 0;
@%p10 bra $L__BB3_4;
$L__BB3_13:
setp.eq.s32 %p11, %r4, 0;
@%p11 bra $L__BB3_18;
shl.b32 %r152, %r230, 3;
or.b32 %r234, %r152, 4;
shl.b32 %r153, %r230, 2;
cvt.s64.s32 %rd66, %r153;
add.s64 %rd67, %rd5, %rd66;
add.s64 %rd128, %rd3, %rd67;
or.b32 %r233, %r153, 1;
shl.b32 %r232, %r230, 1;
neg.s32 %r231, %r4;
$L__BB3_15:
.pragma "nounroll";
mul.wide.s32 %rd68, %r232, 16;
add.s64 %rd69, %rd7, %rd68;
ld.global.nc.v4.f32 {%f216, %f217, %f218, %f219}, [%rd69];
ld.global.nc.v4.f32 {%f224, %f225, %f226, %f227}, [%rd69+16];
cvt.s64.s32 %rd70, %r233;
add.s64 %rd71, %rd70, %rd5;
add.s64 %rd72, %rd3, %rd71;
ld.global.nc.u8 %rs9, [%rd128+2];
ld.global.nc.u8 %rs10, [%rd128+3];
add.s32 %r154, %r234, -4;
div.s32 %r155, %r154, %r44;
cvt.s64.s32 %rd73, %r155;
add.s64 %rd74, %rd73, %rd6;
shl.b64 %rd75, %rd74, 2;
add.s64 %rd76, %rd2, %rd75;
ld.global.nc.u8 %rs59, [%rd128];
and.b16 %rs60, %rs59, 240;
and.b16 %rs61, %rs59, 15;
cvt.u32.u16 %r156, %rs61;
add.s32 %r157, %r156, -8;
cvt.rn.f32.s32 %f228, %r157;
shr.u16 %rs62, %rs60, 4;
cvt.u32.u16 %r158, %rs62;
add.s32 %r159, %r158, -8;
cvt.rn.f32.s32 %f229, %r159;
ld.global.nc.u8 %rs63, [%rd72];
and.b16 %rs64, %rs63, 240;
and.b16 %rs65, %rs63, 15;
cvt.u32.u16 %r160, %rs65;
add.s32 %r161, %r160, -8;
cvt.rn.f32.s32 %f230, %r161;
shr.u16 %rs66, %rs64, 4;
cvt.u32.u16 %r162, %rs66;
add.s32 %r163, %r162, -8;
cvt.rn.f32.s32 %f231, %r163;
ld.global.nc.f32 %f289, [%rd76];
mul.f32 %f232, %f289, %f228;
fma.rn.f32 %f233, %f216, %f232, %f295;
mul.f32 %f234, %f289, %f229;
fma.rn.f32 %f235, %f217, %f234, %f233;
mul.f32 %f236, %f289, %f230;
fma.rn.f32 %f237, %f218, %f236, %f235;
mul.f32 %f238, %f289, %f231;
fma.rn.f32 %f43, %f219, %f238, %f237;
div.s32 %r23, %r234, %r44;
setp.eq.s32 %p12, %r23, %r155;
@%p12 bra $L__BB3_17;
cvt.s64.s32 %rd77, %r23;
add.s64 %rd78, %rd77, %rd6;
shl.b64 %rd79, %rd78, 2;
add.s64 %rd80, %rd2, %rd79;
ld.global.nc.f32 %f289, [%rd80];
$L__BB3_17:
and.b16 %rs67, %rs9, 240;
cvt.u32.u16 %r164, %rs9;
and.b32 %r165, %r164, 15;
add.s32 %r166, %r165, -8;
cvt.rn.f32.s32 %f239, %r166;
shr.u16 %rs68, %rs67, 4;
cvt.u32.u16 %r167, %rs68;
add.s32 %r168, %r167, -8;
cvt.rn.f32.s32 %f240, %r168;
and.b16 %rs69, %rs10, 240;
cvt.u32.u16 %r169, %rs10;
and.b32 %r170, %r169, 15;
add.s32 %r171, %r170, -8;
cvt.rn.f32.s32 %f241, %r171;
shr.u16 %rs70, %rs69, 4;
cvt.u32.u16 %r172, %rs70;
add.s32 %r173, %r172, -8;
cvt.rn.f32.s32 %f242, %r173;
mul.f32 %f243, %f289, %f239;
fma.rn.f32 %f244, %f224, %f243, %f43;
mul.f32 %f245, %f289, %f240;
fma.rn.f32 %f246, %f225, %f245, %f244;
mul.f32 %f247, %f289, %f241;
fma.rn.f32 %f248, %f226, %f247, %f246;
mul.f32 %f249, %f289, %f242;
fma.rn.f32 %f295, %f227, %f249, %f248;
add.s32 %r234, %r234, 8;
add.s64 %rd128, %rd128, 4;
add.s32 %r233, %r233, 4;
add.s32 %r232, %r232, 2;
add.s32 %r231, %r231, 1;
setp.ne.s32 %p13, %r231, 0;
@%p13 bra $L__BB3_15;
$L__BB3_18:
shl.b32 %r237, %r3, 3;
setp.ge.s32 %p14, %r237, %r42;
@%p14 bra $L__BB3_35;
not.b32 %r174, %r237;
add.s32 %r29, %r174, %r42;
shr.u32 %r175, %r29, 1;
add.s32 %r176, %r175, 1;
and.b32 %r236, %r176, 3;
setp.eq.s32 %p15, %r236, 0;
@%p15 bra $L__BB3_24;
cvt.u32.u64 %r177, %rd4;
cvt.s64.s32 %rd81, %r237;
add.s64 %rd82, %rd81, %rd4;
shl.b64 %rd83, %rd82, 2;
add.s64 %rd130, %rd1, %rd83;
add.s32 %r178, %r237, %r177;
add.s32 %r179, %r178, 1;
mul.wide.s32 %rd84, %r179, 4;
add.s64 %rd129, %rd1, %rd84;
$L__BB3_21:
.pragma "nounroll";
shr.u32 %r180, %r237, 31;
add.s32 %r181, %r237, %r180;
shr.s32 %r182, %r181, 1;
cvt.s64.s32 %rd85, %r182;
add.s64 %rd86, %rd85, %rd5;
add.s64 %rd87, %rd3, %rd86;
div.s32 %r183, %r237, %r44;
cvt.s64.s32 %rd88, %r183;
add.s64 %rd89, %rd88, %rd6;
shl.b64 %rd90, %rd89, 2;
add.s64 %rd91, %rd2, %rd90;
ld.global.nc.u8 %rs71, [%rd87];
and.b16 %rs72, %rs71, 240;
and.b16 %rs73, %rs71, 15;
cvt.u32.u16 %r184, %rs73;
add.s32 %r185, %r184, -8;
cvt.rn.f32.s32 %f251, %r185;
ld.global.nc.f32 %f252, [%rd91];
mul.f32 %f253, %f252, %f251;
shr.u16 %rs74, %rs72, 4;
cvt.u32.u16 %r186, %rs74;
add.s32 %r187, %r186, -8;
cvt.rn.f32.s32 %f254, %r187;
mul.f32 %f49, %f252, %f254;
ld.global.nc.f32 %f255, [%rd130];
fma.rn.f32 %f295, %f255, %f253, %f295;
add.s32 %r188, %r237, 1;
setp.ge.s32 %p16, %r188, %r42;
@%p16 bra $L__BB3_23;
ld.global.nc.f32 %f256, [%rd129];
fma.rn.f32 %f295, %f256, %f49, %f295;
$L__BB3_23:
add.s32 %r237, %r237, 2;
add.s64 %rd130, %rd130, 8;
add.s64 %rd129, %rd129, 8;
add.s32 %r236, %r236, -1;
setp.ne.s32 %p17, %r236, 0;
@%p17 bra $L__BB3_21;
$L__BB3_24:
setp.lt.u32 %p18, %r29, 6;
@%p18 bra $L__BB3_35;
cvt.u32.u64 %r189, %rd4;
add.s32 %r190, %r237, %r189;
add.s32 %r191, %r190, 1;
mul.wide.s32 %rd92, %r191, 4;
add.s64 %rd131, %rd1, %rd92;
cvt.s64.s32 %rd93, %r237;
add.s64 %rd94, %rd93, %rd4;
shl.b64 %rd95, %rd94, 2;
add.s64 %rd132, %rd1, %rd95;
$L__BB3_26:
shr.u32 %r192, %r237, 31;
add.s32 %r193, %r237, %r192;
shr.s32 %r194, %r193, 1;
cvt.s64.s32 %rd96, %r194;
add.s64 %rd97, %rd96, %rd5;
add.s64 %rd98, %rd3, %rd97;
div.s32 %r195, %r237, %r44;
cvt.s64.s32 %rd99, %r195;
add.s64 %rd100, %rd99, %rd6;
shl.b64 %rd101, %rd100, 2;
add.s64 %rd102, %rd2, %rd101;
ld.global.nc.u8 %rs75, [%rd98];
and.b16 %rs76, %rs75, 240;
and.b16 %rs77, %rs75, 15;
cvt.u32.u16 %r196, %rs77;
add.s32 %r197, %r196, -8;
cvt.rn.f32.s32 %f257, %r197;
ld.global.nc.f32 %f258, [%rd102];
mul.f32 %f259, %f258, %f257;
shr.u16 %rs78, %rs76, 4;
cvt.u32.u16 %r198, %rs78;
add.s32 %r199, %r198, -8;
cvt.rn.f32.s32 %f260, %r199;
mul.f32 %f56, %f258, %f260;
ld.global.nc.f32 %f261, [%rd132];
fma.rn.f32 %f296, %f261, %f259, %f295;
add.s32 %r37, %r237, 1;
setp.ge.s32 %p19, %r37, %r42;
@%p19 bra $L__BB3_28;
ld.global.nc.f32 %f262, [%rd131];
fma.rn.f32 %f296, %f262, %f56, %f296;
$L__BB3_28:
add.s32 %r200, %r37, 1;
shr.u32 %r201, %r200, 31;
add.s32 %r202, %r200, %r201;
shr.s32 %r203, %r202, 1;
cvt.s64.s32 %rd103, %r203;
add.s64 %rd104, %rd103, %rd5;
add.s64 %rd105, %rd3, %rd104;
div.s32 %r204, %r200, %r44;
cvt.s64.s32 %rd106, %r204;
add.s64 %rd107, %rd106, %rd6;
shl.b64 %rd108, %rd107, 2;
add.s64 %rd109, %rd2, %rd108;
ld.global.nc.u8 %rs79, [%rd105];
and.b16 %rs80, %rs79, 240;
and.b16 %rs81, %rs79, 15;
cvt.u32.u16 %r205, %rs81;
add.s32 %r206, %r205, -8;
cvt.rn.f32.s32 %f263, %r206;
ld.global.nc.f32 %f264, [%rd109];
mul.f32 %f265, %f264, %f263;
shr.u16 %rs82, %rs80, 4;
cvt.u32.u16 %r207, %rs82;
add.s32 %r208, %r207, -8;
cvt.rn.f32.s32 %f266, %r208;
mul.f32 %f60, %f264, %f266;
ld.global.nc.f32 %f267, [%rd132+8];
fma.rn.f32 %f297, %f267, %f265, %f296;
add.s32 %r38, %r37, 2;
setp.ge.s32 %p20, %r38, %r42;
@%p20 bra $L__BB3_30;
ld.global.nc.f32 %f268, [%rd131+8];
fma.rn.f32 %f297, %f268, %f60, %f297;
$L__BB3_30:
add.s32 %r209, %r38, 1;
shr.u32 %r210, %r209, 31;
add.s32 %r211, %r209, %r210;
shr.s32 %r212, %r211, 1;
cvt.s64.s32 %rd110, %r212;
add.s64 %rd111, %rd110, %rd5;
add.s64 %rd112, %rd3, %rd111;
div.s32 %r213, %r209, %r44;
cvt.s64.s32 %rd113, %r213;
add.s64 %rd114, %rd113, %rd6;
shl.b64 %rd115, %rd114, 2;
add.s64 %rd116, %rd2, %rd115;
ld.global.nc.u8 %rs83, [%rd112];
and.b16 %rs84, %rs83, 240;
and.b16 %rs85, %rs83, 15;
cvt.u32.u16 %r214, %rs85;
add.s32 %r215, %r214, -8;
cvt.rn.f32.s32 %f269, %r215;
ld.global.nc.f32 %f270, [%rd116];
mul.f32 %f271, %f270, %f269;
shr.u16 %rs86, %rs84, 4;
cvt.u32.u16 %r216, %rs86;
add.s32 %r217, %r216, -8;
cvt.rn.f32.s32 %f272, %r217;
mul.f32 %f64, %f270, %f272;
ld.global.nc.f32 %f273, [%rd132+16];
fma.rn.f32 %f298, %f273, %f271, %f297;
add.s32 %r39, %r38, 2;
setp.ge.s32 %p21, %r39, %r42;
@%p21 bra $L__BB3_32;
ld.global.nc.f32 %f274, [%rd131+16];
fma.rn.f32 %f298, %f274, %f64, %f298;
$L__BB3_32:
add.s32 %r218, %r39, 1;
shr.u32 %r219, %r218, 31;
add.s32 %r220, %r218, %r219;
shr.s32 %r221, %r220, 1;
cvt.s64.s32 %rd117, %r221;
add.s64 %rd118, %rd117, %rd5;
add.s64 %rd119, %rd3, %rd118;
div.s32 %r222, %r218, %r44;
cvt.s64.s32 %rd120, %r222;
add.s64 %rd121, %rd120, %rd6;
shl.b64 %rd122, %rd121, 2;
add.s64 %rd123, %rd2, %rd122;
ld.global.nc.u8 %rs87, [%rd119];
and.b16 %rs88, %rs87, 240;
and.b16 %rs89, %rs87, 15;
cvt.u32.u16 %r223, %rs89;
add.s32 %r224, %r223, -8;
cvt.rn.f32.s32 %f275, %r224;
ld.global.nc.f32 %f276, [%rd123];
mul.f32 %f277, %f276, %f275;
shr.u16 %rs90, %rs88, 4;
cvt.u32.u16 %r225, %rs90;
add.s32 %r226, %r225, -8;
cvt.rn.f32.s32 %f278, %r226;
mul.f32 %f68, %f276, %f278;
ld.global.nc.f32 %f279, [%rd132+24];
fma.rn.f32 %f295, %f279, %f277, %f298;
add.s32 %r40, %r39, 2;
setp.ge.s32 %p22, %r40, %r42;
@%p22 bra $L__BB3_34;
ld.global.nc.f32 %f280, [%rd131+24];
fma.rn.f32 %f295, %f280, %f68, %f295;
$L__BB3_34:
add.s64 %rd132, %rd132, 32;
add.s64 %rd131, %rd131, 32;
add.s32 %r237, %r40, 1;
setp.lt.s32 %p23, %r237, %r42;
@%p23 bra $L__BB3_26;
$L__BB3_35:
mad.lo.s32 %r227, %r2, %r43, %r1;
cvta.to.global.u64 %rd124, %rd28;
mul.wide.s32 %rd125, %r227, 4;
add.s64 %rd126, %rd124, %rd125;
st.global.f32 [%rd126], %f295;
$L__BB3_36:
ret;
}
// .globl gemm_ternary_multibase_vectorized_kernel_f32
.visible .entry gemm_ternary_multibase_vectorized_kernel_f32(
.param .u64 gemm_ternary_multibase_vectorized_kernel_f32_param_0,
.param .u64 gemm_ternary_multibase_vectorized_kernel_f32_param_1,
.param .u64 gemm_ternary_multibase_vectorized_kernel_f32_param_2,
.param .u64 gemm_ternary_multibase_vectorized_kernel_f32_param_3,
.param .u32 gemm_ternary_multibase_vectorized_kernel_f32_param_4,
.param .u32 gemm_ternary_multibase_vectorized_kernel_f32_param_5,
.param .u32 gemm_ternary_multibase_vectorized_kernel_f32_param_6,
.param .u32 gemm_ternary_multibase_vectorized_kernel_f32_param_7,
.param .u32 gemm_ternary_multibase_vectorized_kernel_f32_param_8
)
{
.reg .pred %p<54>;
.reg .b16 %rs<321>;
.reg .f32 %f<577>;
.reg .b32 %r<359>;
.reg .b64 %rd<44>;
ld.param.u64 %rd23, [gemm_ternary_multibase_vectorized_kernel_f32_param_0];
ld.param.u64 %rd24, [gemm_ternary_multibase_vectorized_kernel_f32_param_1];
ld.param.u64 %rd25, [gemm_ternary_multibase_vectorized_kernel_f32_param_2];
ld.param.u64 %rd22, [gemm_ternary_multibase_vectorized_kernel_f32_param_3];
ld.param.u32 %r16, [gemm_ternary_multibase_vectorized_kernel_f32_param_4];
ld.param.u32 %r12, [gemm_ternary_multibase_vectorized_kernel_f32_param_5];
ld.param.u32 %r13, [gemm_ternary_multibase_vectorized_kernel_f32_param_6];
ld.param.u32 %r14, [gemm_ternary_multibase_vectorized_kernel_f32_param_7];
ld.param.u32 %r15, [gemm_ternary_multibase_vectorized_kernel_f32_param_8];
cvta.to.global.u64 %rd1, %rd25;
cvta.to.global.u64 %rd2, %rd24;
cvta.to.global.u64 %rd3, %rd23;
mov.u32 %r17, %ntid.x;
mov.u32 %r18, %ctaid.x;
mov.u32 %r19, %tid.x;
mad.lo.s32 %r1, %r18, %r17, %r19;
setp.ge.s32 %p1, %r1, %r13;
mov.u32 %r2, %ctaid.y;
setp.ge.s32 %p2, %r2, %r16;
or.pred %p3, %p2, %p1;
@%p3 bra $L__BB4_58;
setp.lt.s32 %p4, %r15, 1;
@%p4 bra $L__BB4_10;
ld.global.nc.f32 %f566, [%rd1];
setp.eq.s32 %p5, %r15, 1;
@%p5 bra $L__BB4_10;
ld.global.nc.f32 %f565, [%rd1+4];
setp.eq.s32 %p6, %r15, 2;
@%p6 bra $L__BB4_10;
ld.global.nc.f32 %f564, [%rd1+8];
setp.lt.s32 %p7, %r15, 4;
@%p7 bra $L__BB4_10;
ld.global.nc.f32 %f563, [%rd1+12];
setp.eq.s32 %p8, %r15, 4;
@%p8 bra $L__BB4_10;
ld.global.nc.f32 %f562, [%rd1+16];
setp.lt.s32 %p9, %r15, 6;
@%p9 bra $L__BB4_10;
ld.global.nc.f32 %f561, [%rd1+20];
setp.eq.s32 %p10, %r15, 6;
@%p10 bra $L__BB4_10;
ld.global.nc.f32 %f560, [%rd1+24];
setp.lt.s32 %p11, %r15, 8;
@%p11 bra $L__BB4_10;
ld.global.nc.f32 %f559, [%rd1+28];
$L__BB4_10:
mul.lo.s32 %r20, %r2, %r12;
cvt.s64.s32 %rd4, %r20;
mul.lo.s32 %r21, %r1, %r14;
mul.lo.s32 %r22, %r21, %r15;
cvt.s64.s32 %rd5, %r22;
setp.lt.s32 %p12, %r14, 1;
mov.f32 %f575, 0f00000000;
or.pred %p14, %p12, %p4;
@%p14 bra $L__BB4_57;
add.s32 %r24, %r14, -1;
and.b32 %r358, %r14, 3;
setp.lt.u32 %p15, %r24, 3;
mov.f32 %f575, 0f00000000;
mov.u32 %r357, 0;
@%p15 bra $L__BB4_46;
sub.s32 %r356, %r14, %r358;
shl.b64 %rd26, %rd4, 2;
add.s64 %rd6, %rd3, %rd26;
mov.f32 %f575, 0f00000000;
mov.u32 %r357, 0;
$L__BB4_13:
.pragma "nounroll";
mul.wide.s32 %rd27, %r357, 16;
add.s64 %rd7, %rd6, %rd27;
ld.global.nc.v4.f32 {%f99, %f100, %f101, %f102}, [%rd7];
mul.lo.s32 %r26, %r357, %r15;
cvt.s64.s32 %rd28, %r26;
add.s64 %rd29, %rd28, %rd5;
add.s64 %rd8, %rd2, %rd29;
ld.global.nc.u8 %rs1, [%rd8];
and.b16 %rs3, %rs1, 3;
cvt.u32.u16 %r27, %rs3;
add.s32 %r28, %r27, -1;
cvt.rn.f32.s32 %f103, %r28;
shr.u16 %rs4, %rs1, 2;
and.b16 %rs5, %rs4, 3;
cvt.u32.u16 %r29, %rs5;
add.s32 %r30, %r29, -1;
cvt.rn.f32.s32 %f104, %r30;
shr.u16 %rs6, %rs1, 4;
and.b16 %rs7, %rs6, 3;
cvt.u32.u16 %r31, %rs7;
add.s32 %r32, %r31, -1;
cvt.rn.f32.s32 %f105, %r32;
shr.u16 %rs8, %rs1, 6;
cvt.u32.u16 %r33, %rs8;
add.s32 %r34, %r33, -1;
cvt.rn.f32.s32 %f106, %r34;
mul.f32 %f107, %f99, %f103;
fma.rn.f32 %f108, %f107, %f566, %f575;
mul.f32 %f109, %f100, %f104;
fma.rn.f32 %f110, %f109, %f566, %f108;
mul.f32 %f111, %f101, %f105;
fma.rn.f32 %f112, %f111, %f566, %f110;
mul.f32 %f113, %f102, %f106;
fma.rn.f32 %f568, %f113, %f566, %f112;
setp.lt.s32 %p16, %r15, 2;
@%p16 bra $L__BB4_21;
setp.eq.s32 %p17, %r15, 2;
ld.global.nc.u8 %rs9, [%rd8+1];
and.b16 %rs11, %rs9, 3;
cvt.u32.u16 %r35, %rs11;
add.s32 %r36, %r35, -1;
cvt.rn.f32.s32 %f114, %r36;
shr.u16 %rs12, %rs9, 2;
and.b16 %rs13, %rs12, 3;
cvt.u32.u16 %r37, %rs13;
add.s32 %r38, %r37, -1;
cvt.rn.f32.s32 %f115, %r38;
shr.u16 %rs14, %rs9, 4;
and.b16 %rs15, %rs14, 3;
cvt.u32.u16 %r39, %rs15;
add.s32 %r40, %r39, -1;
cvt.rn.f32.s32 %f116, %r40;
shr.u16 %rs16, %rs9, 6;
cvt.u32.u16 %r41, %rs16;
add.s32 %r42, %r41, -1;
cvt.rn.f32.s32 %f117, %r42;
mul.f32 %f118, %f99, %f114;
fma.rn.f32 %f119, %f118, %f565, %f568;
mul.f32 %f120, %f100, %f115;
fma.rn.f32 %f121, %f120, %f565, %f119;
mul.f32 %f122, %f101, %f116;
fma.rn.f32 %f123, %f122, %f565, %f121;
mul.f32 %f124, %f102, %f117;
fma.rn.f32 %f568, %f124, %f565, %f123;
@%p17 bra $L__BB4_21;
setp.lt.s32 %p18, %r15, 4;
ld.global.nc.u8 %rs17, [%rd8+2];
and.b16 %rs19, %rs17, 3;
cvt.u32.u16 %r43, %rs19;
add.s32 %r44, %r43, -1;
cvt.rn.f32.s32 %f125, %r44;
shr.u16 %rs20, %rs17, 2;
and.b16 %rs21, %rs20, 3;
cvt.u32.u16 %r45, %rs21;
add.s32 %r46, %r45, -1;
cvt.rn.f32.s32 %f126, %r46;
shr.u16 %rs22, %rs17, 4;
and.b16 %rs23, %rs22, 3;
cvt.u32.u16 %r47, %rs23;
add.s32 %r48, %r47, -1;
cvt.rn.f32.s32 %f127, %r48;
shr.u16 %rs24, %rs17, 6;
cvt.u32.u16 %r49, %rs24;
add.s32 %r50, %r49, -1;
cvt.rn.f32.s32 %f128, %r50;
mul.f32 %f129, %f99, %f125;
fma.rn.f32 %f130, %f129, %f564, %f568;
mul.f32 %f131, %f100, %f126;
fma.rn.f32 %f132, %f131, %f564, %f130;
mul.f32 %f133, %f101, %f127;
fma.rn.f32 %f134, %f133, %f564, %f132;
mul.f32 %f135, %f102, %f128;
fma.rn.f32 %f568, %f135, %f564, %f134;
@%p18 bra $L__BB4_21;
setp.eq.s32 %p19, %r15, 4;
ld.global.nc.u8 %rs25, [%rd8+3];
and.b16 %rs27, %rs25, 3;
cvt.u32.u16 %r51, %rs27;
add.s32 %r52, %r51, -1;
cvt.rn.f32.s32 %f136, %r52;
shr.u16 %rs28, %rs25, 2;
and.b16 %rs29, %rs28, 3;
cvt.u32.u16 %r53, %rs29;
add.s32 %r54, %r53, -1;
cvt.rn.f32.s32 %f137, %r54;
shr.u16 %rs30, %rs25, 4;
and.b16 %rs31, %rs30, 3;
cvt.u32.u16 %r55, %rs31;
add.s32 %r56, %r55, -1;
cvt.rn.f32.s32 %f138, %r56;
shr.u16 %rs32, %rs25, 6;
cvt.u32.u16 %r57, %rs32;
add.s32 %r58, %r57, -1;
cvt.rn.f32.s32 %f139, %r58;
mul.f32 %f140, %f99, %f136;
fma.rn.f32 %f141, %f140, %f563, %f568;
mul.f32 %f142, %f100, %f137;
fma.rn.f32 %f143, %f142, %f563, %f141;
mul.f32 %f144, %f101, %f138;
fma.rn.f32 %f145, %f144, %f563, %f143;
mul.f32 %f146, %f102, %f139;
fma.rn.f32 %f568, %f146, %f563, %f145;
@%p19 bra $L__BB4_21;
setp.lt.s32 %p20, %r15, 6;
ld.global.nc.u8 %rs33, [%rd8+4];
and.b16 %rs35, %rs33, 3;
cvt.u32.u16 %r59, %rs35;
add.s32 %r60, %r59, -1;
cvt.rn.f32.s32 %f147, %r60;
shr.u16 %rs36, %rs33, 2;
and.b16 %rs37, %rs36, 3;
cvt.u32.u16 %r61, %rs37;
add.s32 %r62, %r61, -1;
cvt.rn.f32.s32 %f148, %r62;
shr.u16 %rs38, %rs33, 4;
and.b16 %rs39, %rs38, 3;
cvt.u32.u16 %r63, %rs39;
add.s32 %r64, %r63, -1;
cvt.rn.f32.s32 %f149, %r64;
shr.u16 %rs40, %rs33, 6;
cvt.u32.u16 %r65, %rs40;
add.s32 %r66, %r65, -1;
cvt.rn.f32.s32 %f150, %r66;
mul.f32 %f151, %f99, %f147;
fma.rn.f32 %f152, %f151, %f562, %f568;
mul.f32 %f153, %f100, %f148;
fma.rn.f32 %f154, %f153, %f562, %f152;
mul.f32 %f155, %f101, %f149;
fma.rn.f32 %f156, %f155, %f562, %f154;
mul.f32 %f157, %f102, %f150;
fma.rn.f32 %f568, %f157, %f562, %f156;
@%p20 bra $L__BB4_21;
setp.eq.s32 %p21, %r15, 6;
ld.global.nc.u8 %rs41, [%rd8+5];
and.b16 %rs43, %rs41, 3;
cvt.u32.u16 %r67, %rs43;
add.s32 %r68, %r67, -1;
cvt.rn.f32.s32 %f158, %r68;
shr.u16 %rs44, %rs41, 2;
and.b16 %rs45, %rs44, 3;
cvt.u32.u16 %r69, %rs45;
add.s32 %r70, %r69, -1;
cvt.rn.f32.s32 %f159, %r70;
shr.u16 %rs46, %rs41, 4;
and.b16 %rs47, %rs46, 3;
cvt.u32.u16 %r71, %rs47;
add.s32 %r72, %r71, -1;
cvt.rn.f32.s32 %f160, %r72;
shr.u16 %rs48, %rs41, 6;
cvt.u32.u16 %r73, %rs48;
add.s32 %r74, %r73, -1;
cvt.rn.f32.s32 %f161, %r74;
mul.f32 %f162, %f99, %f158;
fma.rn.f32 %f163, %f162, %f561, %f568;
mul.f32 %f164, %f100, %f159;
fma.rn.f32 %f165, %f164, %f561, %f163;
mul.f32 %f166, %f101, %f160;
fma.rn.f32 %f167, %f166, %f561, %f165;
mul.f32 %f168, %f102, %f161;
fma.rn.f32 %f568, %f168, %f561, %f167;
@%p21 bra $L__BB4_21;
setp.lt.s32 %p22, %r15, 8;
ld.global.nc.u8 %rs49, [%rd8+6];
and.b16 %rs51, %rs49, 3;
cvt.u32.u16 %r75, %rs51;
add.s32 %r76, %r75, -1;
cvt.rn.f32.s32 %f169, %r76;
shr.u16 %rs52, %rs49, 2;
and.b16 %rs53, %rs52, 3;
cvt.u32.u16 %r77, %rs53;
add.s32 %r78, %r77, -1;
cvt.rn.f32.s32 %f170, %r78;
shr.u16 %rs54, %rs49, 4;
and.b16 %rs55, %rs54, 3;
cvt.u32.u16 %r79, %rs55;
add.s32 %r80, %r79, -1;
cvt.rn.f32.s32 %f171, %r80;
shr.u16 %rs56, %rs49, 6;
cvt.u32.u16 %r81, %rs56;
add.s32 %r82, %r81, -1;
cvt.rn.f32.s32 %f172, %r82;
mul.f32 %f173, %f99, %f169;
fma.rn.f32 %f174, %f173, %f560, %f568;
mul.f32 %f175, %f100, %f170;
fma.rn.f32 %f176, %f175, %f560, %f174;
mul.f32 %f177, %f101, %f171;
fma.rn.f32 %f178, %f177, %f560, %f176;
mul.f32 %f179, %f102, %f172;
fma.rn.f32 %f568, %f179, %f560, %f178;
@%p22 bra $L__BB4_21;
ld.global.nc.u8 %rs57, [%rd8+7];
and.b16 %rs59, %rs57, 3;
cvt.u32.u16 %r83, %rs59;
add.s32 %r84, %r83, -1;
cvt.rn.f32.s32 %f180, %r84;
shr.u16 %rs60, %rs57, 2;
and.b16 %rs61, %rs60, 3;
cvt.u32.u16 %r85, %rs61;
add.s32 %r86, %r85, -1;
cvt.rn.f32.s32 %f181, %r86;
shr.u16 %rs62, %rs57, 4;
and.b16 %rs63, %rs62, 3;
cvt.u32.u16 %r87, %rs63;
add.s32 %r88, %r87, -1;
cvt.rn.f32.s32 %f182, %r88;
shr.u16 %rs64, %rs57, 6;
cvt.u32.u16 %r89, %rs64;
add.s32 %r90, %r89, -1;
cvt.rn.f32.s32 %f183, %r90;
mul.f32 %f184, %f99, %f180;
fma.rn.f32 %f185, %f184, %f559, %f568;
mul.f32 %f186, %f100, %f181;
fma.rn.f32 %f187, %f186, %f559, %f185;
mul.f32 %f188, %f101, %f182;
fma.rn.f32 %f189, %f188, %f559, %f187;
mul.f32 %f190, %f102, %f183;
fma.rn.f32 %f568, %f190, %f559, %f189;
$L__BB4_21:
ld.global.nc.v4.f32 {%f191, %f192, %f193, %f194}, [%rd7+16];
add.s32 %r91, %r357, 1;
mul.lo.s32 %r92, %r91, %r15;
cvt.s64.s32 %rd30, %r92;
add.s64 %rd31, %rd30, %rd5;
add.s64 %rd10, %rd2, %rd31;
ld.global.nc.u8 %rs65, [%rd10];
and.b16 %rs67, %rs65, 3;
cvt.u32.u16 %r93, %rs67;
add.s32 %r94, %r93, -1;
cvt.rn.f32.s32 %f195, %r94;
shr.u16 %rs68, %rs65, 2;
and.b16 %rs69, %rs68, 3;
cvt.u32.u16 %r95, %rs69;
add.s32 %r96, %r95, -1;
cvt.rn.f32.s32 %f196, %r96;
shr.u16 %rs70, %rs65, 4;
and.b16 %rs71, %rs70, 3;
cvt.u32.u16 %r97, %rs71;
add.s32 %r98, %r97, -1;
cvt.rn.f32.s32 %f197, %r98;
shr.u16 %rs72, %rs65, 6;
cvt.u32.u16 %r99, %rs72;
add.s32 %r100, %r99, -1;
cvt.rn.f32.s32 %f198, %r100;
mul.f32 %f199, %f191, %f195;
fma.rn.f32 %f200, %f199, %f566, %f568;
mul.f32 %f201, %f192, %f196;
fma.rn.f32 %f202, %f201, %f566, %f200;
mul.f32 %f203, %f193, %f197;
fma.rn.f32 %f204, %f203, %f566, %f202;
mul.f32 %f205, %f194, %f198;
fma.rn.f32 %f569, %f205, %f566, %f204;
@%p16 bra $L__BB4_29;
setp.eq.s32 %p24, %r15, 2;
ld.global.nc.u8 %rs73, [%rd10+1];
and.b16 %rs75, %rs73, 3;
cvt.u32.u16 %r101, %rs75;
add.s32 %r102, %r101, -1;
cvt.rn.f32.s32 %f206, %r102;
shr.u16 %rs76, %rs73, 2;
and.b16 %rs77, %rs76, 3;
cvt.u32.u16 %r103, %rs77;
add.s32 %r104, %r103, -1;
cvt.rn.f32.s32 %f207, %r104;
shr.u16 %rs78, %rs73, 4;
and.b16 %rs79, %rs78, 3;
cvt.u32.u16 %r105, %rs79;
add.s32 %r106, %r105, -1;
cvt.rn.f32.s32 %f208, %r106;
shr.u16 %rs80, %rs73, 6;
cvt.u32.u16 %r107, %rs80;
add.s32 %r108, %r107, -1;
cvt.rn.f32.s32 %f209, %r108;
mul.f32 %f210, %f191, %f206;
fma.rn.f32 %f211, %f210, %f565, %f569;
mul.f32 %f212, %f192, %f207;
fma.rn.f32 %f213, %f212, %f565, %f211;
mul.f32 %f214, %f193, %f208;
fma.rn.f32 %f215, %f214, %f565, %f213;
mul.f32 %f216, %f194, %f209;
fma.rn.f32 %f569, %f216, %f565, %f215;
@%p24 bra $L__BB4_29;
setp.lt.s32 %p25, %r15, 4;
ld.global.nc.u8 %rs81, [%rd10+2];
and.b16 %rs83, %rs81, 3;
cvt.u32.u16 %r109, %rs83;
add.s32 %r110, %r109, -1;
cvt.rn.f32.s32 %f217, %r110;
shr.u16 %rs84, %rs81, 2;
and.b16 %rs85, %rs84, 3;
cvt.u32.u16 %r111, %rs85;
add.s32 %r112, %r111, -1;
cvt.rn.f32.s32 %f218, %r112;
shr.u16 %rs86, %rs81, 4;
and.b16 %rs87, %rs86, 3;
cvt.u32.u16 %r113, %rs87;
add.s32 %r114, %r113, -1;
cvt.rn.f32.s32 %f219, %r114;
shr.u16 %rs88, %rs81, 6;
cvt.u32.u16 %r115, %rs88;
add.s32 %r116, %r115, -1;
cvt.rn.f32.s32 %f220, %r116;
mul.f32 %f221, %f191, %f217;
fma.rn.f32 %f222, %f221, %f564, %f569;
mul.f32 %f223, %f192, %f218;
fma.rn.f32 %f224, %f223, %f564, %f222;
mul.f32 %f225, %f193, %f219;
fma.rn.f32 %f226, %f225, %f564, %f224;
mul.f32 %f227, %f194, %f220;
fma.rn.f32 %f569, %f227, %f564, %f226;
@%p25 bra $L__BB4_29;
setp.eq.s32 %p26, %r15, 4;
ld.global.nc.u8 %rs89, [%rd10+3];
and.b16 %rs91, %rs89, 3;
cvt.u32.u16 %r117, %rs91;
add.s32 %r118, %r117, -1;
cvt.rn.f32.s32 %f228, %r118;
shr.u16 %rs92, %rs89, 2;
and.b16 %rs93, %rs92, 3;
cvt.u32.u16 %r119, %rs93;
add.s32 %r120, %r119, -1;
cvt.rn.f32.s32 %f229, %r120;
shr.u16 %rs94, %rs89, 4;
and.b16 %rs95, %rs94, 3;
cvt.u32.u16 %r121, %rs95;
add.s32 %r122, %r121, -1;
cvt.rn.f32.s32 %f230, %r122;
shr.u16 %rs96, %rs89, 6;
cvt.u32.u16 %r123, %rs96;
add.s32 %r124, %r123, -1;
cvt.rn.f32.s32 %f231, %r124;
mul.f32 %f232, %f191, %f228;
fma.rn.f32 %f233, %f232, %f563, %f569;
mul.f32 %f234, %f192, %f229;
fma.rn.f32 %f235, %f234, %f563, %f233;
mul.f32 %f236, %f193, %f230;
fma.rn.f32 %f237, %f236, %f563, %f235;
mul.f32 %f238, %f194, %f231;
fma.rn.f32 %f569, %f238, %f563, %f237;
@%p26 bra $L__BB4_29;
setp.lt.s32 %p27, %r15, 6;
ld.global.nc.u8 %rs97, [%rd10+4];
and.b16 %rs99, %rs97, 3;
cvt.u32.u16 %r125, %rs99;
add.s32 %r126, %r125, -1;
cvt.rn.f32.s32 %f239, %r126;
shr.u16 %rs100, %rs97, 2;
and.b16 %rs101, %rs100, 3;
cvt.u32.u16 %r127, %rs101;
add.s32 %r128, %r127, -1;
cvt.rn.f32.s32 %f240, %r128;
shr.u16 %rs102, %rs97, 4;
and.b16 %rs103, %rs102, 3;
cvt.u32.u16 %r129, %rs103;
add.s32 %r130, %r129, -1;
cvt.rn.f32.s32 %f241, %r130;
shr.u16 %rs104, %rs97, 6;
cvt.u32.u16 %r131, %rs104;
add.s32 %r132, %r131, -1;
cvt.rn.f32.s32 %f242, %r132;
mul.f32 %f243, %f191, %f239;
fma.rn.f32 %f244, %f243, %f562, %f569;
mul.f32 %f245, %f192, %f240;
fma.rn.f32 %f246, %f245, %f562, %f244;
mul.f32 %f247, %f193, %f241;
fma.rn.f32 %f248, %f247, %f562, %f246;
mul.f32 %f249, %f194, %f242;
fma.rn.f32 %f569, %f249, %f562, %f248;
@%p27 bra $L__BB4_29;
setp.eq.s32 %p28, %r15, 6;
ld.global.nc.u8 %rs105, [%rd10+5];
and.b16 %rs107, %rs105, 3;
cvt.u32.u16 %r133, %rs107;
add.s32 %r134, %r133, -1;
cvt.rn.f32.s32 %f250, %r134;
shr.u16 %rs108, %rs105, 2;
and.b16 %rs109, %rs108, 3;
cvt.u32.u16 %r135, %rs109;
add.s32 %r136, %r135, -1;
cvt.rn.f32.s32 %f251, %r136;
shr.u16 %rs110, %rs105, 4;
and.b16 %rs111, %rs110, 3;
cvt.u32.u16 %r137, %rs111;
add.s32 %r138, %r137, -1;
cvt.rn.f32.s32 %f252, %r138;
shr.u16 %rs112, %rs105, 6;
cvt.u32.u16 %r139, %rs112;
add.s32 %r140, %r139, -1;
cvt.rn.f32.s32 %f253, %r140;
mul.f32 %f254, %f191, %f250;
fma.rn.f32 %f255, %f254, %f561, %f569;
mul.f32 %f256, %f192, %f251;
fma.rn.f32 %f257, %f256, %f561, %f255;
mul.f32 %f258, %f193, %f252;
fma.rn.f32 %f259, %f258, %f561, %f257;
mul.f32 %f260, %f194, %f253;
fma.rn.f32 %f569, %f260, %f561, %f259;
@%p28 bra $L__BB4_29;
setp.lt.s32 %p29, %r15, 8;
ld.global.nc.u8 %rs113, [%rd10+6];
and.b16 %rs115, %rs113, 3;
cvt.u32.u16 %r141, %rs115;
add.s32 %r142, %r141, -1;
cvt.rn.f32.s32 %f261, %r142;
shr.u16 %rs116, %rs113, 2;
and.b16 %rs117, %rs116, 3;
cvt.u32.u16 %r143, %rs117;
add.s32 %r144, %r143, -1;
cvt.rn.f32.s32 %f262, %r144;
shr.u16 %rs118, %rs113, 4;
and.b16 %rs119, %rs118, 3;
cvt.u32.u16 %r145, %rs119;
add.s32 %r146, %r145, -1;
cvt.rn.f32.s32 %f263, %r146;
shr.u16 %rs120, %rs113, 6;
cvt.u32.u16 %r147, %rs120;
add.s32 %r148, %r147, -1;
cvt.rn.f32.s32 %f264, %r148;
mul.f32 %f265, %f191, %f261;
fma.rn.f32 %f266, %f265, %f560, %f569;
mul.f32 %f267, %f192, %f262;
fma.rn.f32 %f268, %f267, %f560, %f266;
mul.f32 %f269, %f193, %f263;
fma.rn.f32 %f270, %f269, %f560, %f268;
mul.f32 %f271, %f194, %f264;
fma.rn.f32 %f569, %f271, %f560, %f270;
@%p29 bra $L__BB4_29;
ld.global.nc.u8 %rs121, [%rd10+7];
and.b16 %rs123, %rs121, 3;
cvt.u32.u16 %r149, %rs123;
add.s32 %r150, %r149, -1;
cvt.rn.f32.s32 %f272, %r150;
shr.u16 %rs124, %rs121, 2;
and.b16 %rs125, %rs124, 3;
cvt.u32.u16 %r151, %rs125;
add.s32 %r152, %r151, -1;
cvt.rn.f32.s32 %f273, %r152;
shr.u16 %rs126, %rs121, 4;
and.b16 %rs127, %rs126, 3;
cvt.u32.u16 %r153, %rs127;
add.s32 %r154, %r153, -1;
cvt.rn.f32.s32 %f274, %r154;
shr.u16 %rs128, %rs121, 6;
cvt.u32.u16 %r155, %rs128;
add.s32 %r156, %r155, -1;
cvt.rn.f32.s32 %f275, %r156;
mul.f32 %f276, %f191, %f272;
fma.rn.f32 %f277, %f276, %f559, %f569;
mul.f32 %f278, %f192, %f273;
fma.rn.f32 %f279, %f278, %f559, %f277;
mul.f32 %f280, %f193, %f274;
fma.rn.f32 %f281, %f280, %f559, %f279;
mul.f32 %f282, %f194, %f275;
fma.rn.f32 %f569, %f282, %f559, %f281;
$L__BB4_29:
ld.global.nc.v4.f32 {%f283, %f284, %f285, %f286}, [%rd7+32];
add.s32 %r157, %r357, 2;
mul.lo.s32 %r158, %r157, %r15;
cvt.s64.s32 %rd32, %r158;
add.s64 %rd33, %rd32, %rd5;
add.s64 %rd11, %rd2, %rd33;
ld.global.nc.u8 %rs129, [%rd11];
and.b16 %rs131, %rs129, 3;
cvt.u32.u16 %r159, %rs131;
add.s32 %r160, %r159, -1;
cvt.rn.f32.s32 %f287, %r160;
shr.u16 %rs132, %rs129, 2;
and.b16 %rs133, %rs132, 3;
cvt.u32.u16 %r161, %rs133;
add.s32 %r162, %r161, -1;
cvt.rn.f32.s32 %f288, %r162;
shr.u16 %rs134, %rs129, 4;
and.b16 %rs135, %rs134, 3;
cvt.u32.u16 %r163, %rs135;
add.s32 %r164, %r163, -1;
cvt.rn.f32.s32 %f289, %r164;
shr.u16 %rs136, %rs129, 6;
cvt.u32.u16 %r165, %rs136;
add.s32 %r166, %r165, -1;
cvt.rn.f32.s32 %f290, %r166;
mul.f32 %f291, %f283, %f287;
fma.rn.f32 %f292, %f291, %f566, %f569;
mul.f32 %f293, %f284, %f288;
fma.rn.f32 %f294, %f293, %f566, %f292;
mul.f32 %f295, %f285, %f289;
fma.rn.f32 %f296, %f295, %f566, %f294;
mul.f32 %f297, %f286, %f290;
fma.rn.f32 %f570, %f297, %f566, %f296;
@%p16 bra $L__BB4_37;
setp.eq.s32 %p31, %r15, 2;
ld.global.nc.u8 %rs137, [%rd11+1];
and.b16 %rs139, %rs137, 3;
cvt.u32.u16 %r167, %rs139;
add.s32 %r168, %r167, -1;
cvt.rn.f32.s32 %f298, %r168;
shr.u16 %rs140, %rs137, 2;
and.b16 %rs141, %rs140, 3;
cvt.u32.u16 %r169, %rs141;
add.s32 %r170, %r169, -1;
cvt.rn.f32.s32 %f299, %r170;
shr.u16 %rs142, %rs137, 4;
and.b16 %rs143, %rs142, 3;
cvt.u32.u16 %r171, %rs143;
add.s32 %r172, %r171, -1;
cvt.rn.f32.s32 %f300, %r172;
shr.u16 %rs144, %rs137, 6;
cvt.u32.u16 %r173, %rs144;
add.s32 %r174, %r173, -1;
cvt.rn.f32.s32 %f301, %r174;
mul.f32 %f302, %f283, %f298;
fma.rn.f32 %f303, %f302, %f565, %f570;
mul.f32 %f304, %f284, %f299;
fma.rn.f32 %f305, %f304, %f565, %f303;
mul.f32 %f306, %f285, %f300;
fma.rn.f32 %f307, %f306, %f565, %f305;
mul.f32 %f308, %f286, %f301;
fma.rn.f32 %f570, %f308, %f565, %f307;
@%p31 bra $L__BB4_37;
setp.lt.s32 %p32, %r15, 4;
ld.global.nc.u8 %rs145, [%rd11+2];
and.b16 %rs147, %rs145, 3;
cvt.u32.u16 %r175, %rs147;
add.s32 %r176, %r175, -1;
cvt.rn.f32.s32 %f309, %r176;
shr.u16 %rs148, %rs145, 2;
and.b16 %rs149, %rs148, 3;
cvt.u32.u16 %r177, %rs149;
add.s32 %r178, %r177, -1;
cvt.rn.f32.s32 %f310, %r178;
shr.u16 %rs150, %rs145, 4;
and.b16 %rs151, %rs150, 3;
cvt.u32.u16 %r179, %rs151;
add.s32 %r180, %r179, -1;
cvt.rn.f32.s32 %f311, %r180;
shr.u16 %rs152, %rs145, 6;
cvt.u32.u16 %r181, %rs152;
add.s32 %r182, %r181, -1;
cvt.rn.f32.s32 %f312, %r182;
mul.f32 %f313, %f283, %f309;
fma.rn.f32 %f314, %f313, %f564, %f570;
mul.f32 %f315, %f284, %f310;
fma.rn.f32 %f316, %f315, %f564, %f314;
mul.f32 %f317, %f285, %f311;
fma.rn.f32 %f318, %f317, %f564, %f316;
mul.f32 %f319, %f286, %f312;
fma.rn.f32 %f570, %f319, %f564, %f318;
@%p32 bra $L__BB4_37;
setp.eq.s32 %p33, %r15, 4;
ld.global.nc.u8 %rs153, [%rd11+3];
and.b16 %rs155, %rs153, 3;
cvt.u32.u16 %r183, %rs155;
add.s32 %r184, %r183, -1;
cvt.rn.f32.s32 %f320, %r184;
shr.u16 %rs156, %rs153, 2;
and.b16 %rs157, %rs156, 3;
cvt.u32.u16 %r185, %rs157;
add.s32 %r186, %r185, -1;
cvt.rn.f32.s32 %f321, %r186;
shr.u16 %rs158, %rs153, 4;
and.b16 %rs159, %rs158, 3;
cvt.u32.u16 %r187, %rs159;
add.s32 %r188, %r187, -1;
cvt.rn.f32.s32 %f322, %r188;
shr.u16 %rs160, %rs153, 6;
cvt.u32.u16 %r189, %rs160;
add.s32 %r190, %r189, -1;
cvt.rn.f32.s32 %f323, %r190;
mul.f32 %f324, %f283, %f320;
fma.rn.f32 %f325, %f324, %f563, %f570;
mul.f32 %f326, %f284, %f321;
fma.rn.f32 %f327, %f326, %f563, %f325;
mul.f32 %f328, %f285, %f322;
fma.rn.f32 %f329, %f328, %f563, %f327;
mul.f32 %f330, %f286, %f323;
fma.rn.f32 %f570, %f330, %f563, %f329;
@%p33 bra $L__BB4_37;
setp.lt.s32 %p34, %r15, 6;
ld.global.nc.u8 %rs161, [%rd11+4];
and.b16 %rs163, %rs161, 3;
cvt.u32.u16 %r191, %rs163;
add.s32 %r192, %r191, -1;
cvt.rn.f32.s32 %f331, %r192;
shr.u16 %rs164, %rs161, 2;
and.b16 %rs165, %rs164, 3;
cvt.u32.u16 %r193, %rs165;
add.s32 %r194, %r193, -1;
cvt.rn.f32.s32 %f332, %r194;
shr.u16 %rs166, %rs161, 4;
and.b16 %rs167, %rs166, 3;
cvt.u32.u16 %r195, %rs167;
add.s32 %r196, %r195, -1;
cvt.rn.f32.s32 %f333, %r196;
shr.u16 %rs168, %rs161, 6;
cvt.u32.u16 %r197, %rs168;
add.s32 %r198, %r197, -1;
cvt.rn.f32.s32 %f334, %r198;
mul.f32 %f335, %f283, %f331;
fma.rn.f32 %f336, %f335, %f562, %f570;
mul.f32 %f337, %f284, %f332;
fma.rn.f32 %f338, %f337, %f562, %f336;
mul.f32 %f339, %f285, %f333;
fma.rn.f32 %f340, %f339, %f562, %f338;
mul.f32 %f341, %f286, %f334;
fma.rn.f32 %f570, %f341, %f562, %f340;
@%p34 bra $L__BB4_37;
setp.eq.s32 %p35, %r15, 6;
ld.global.nc.u8 %rs169, [%rd11+5];
and.b16 %rs171, %rs169, 3;
cvt.u32.u16 %r199, %rs171;
add.s32 %r200, %r199, -1;
cvt.rn.f32.s32 %f342, %r200;
shr.u16 %rs172, %rs169, 2;
and.b16 %rs173, %rs172, 3;
cvt.u32.u16 %r201, %rs173;
add.s32 %r202, %r201, -1;
cvt.rn.f32.s32 %f343, %r202;
shr.u16 %rs174, %rs169, 4;
and.b16 %rs175, %rs174, 3;
cvt.u32.u16 %r203, %rs175;
add.s32 %r204, %r203, -1;
cvt.rn.f32.s32 %f344, %r204;
shr.u16 %rs176, %rs169, 6;
cvt.u32.u16 %r205, %rs176;
add.s32 %r206, %r205, -1;
cvt.rn.f32.s32 %f345, %r206;
mul.f32 %f346, %f283, %f342;
fma.rn.f32 %f347, %f346, %f561, %f570;
mul.f32 %f348, %f284, %f343;
fma.rn.f32 %f349, %f348, %f561, %f347;
mul.f32 %f350, %f285, %f344;
fma.rn.f32 %f351, %f350, %f561, %f349;
mul.f32 %f352, %f286, %f345;
fma.rn.f32 %f570, %f352, %f561, %f351;
@%p35 bra $L__BB4_37;
setp.lt.s32 %p36, %r15, 8;
ld.global.nc.u8 %rs177, [%rd11+6];
and.b16 %rs179, %rs177, 3;
cvt.u32.u16 %r207, %rs179;
add.s32 %r208, %r207, -1;
cvt.rn.f32.s32 %f353, %r208;
shr.u16 %rs180, %rs177, 2;
and.b16 %rs181, %rs180, 3;
cvt.u32.u16 %r209, %rs181;
add.s32 %r210, %r209, -1;
cvt.rn.f32.s32 %f354, %r210;
shr.u16 %rs182, %rs177, 4;
and.b16 %rs183, %rs182, 3;
cvt.u32.u16 %r211, %rs183;
add.s32 %r212, %r211, -1;
cvt.rn.f32.s32 %f355, %r212;
shr.u16 %rs184, %rs177, 6;
cvt.u32.u16 %r213, %rs184;
add.s32 %r214, %r213, -1;
cvt.rn.f32.s32 %f356, %r214;
mul.f32 %f357, %f283, %f353;
fma.rn.f32 %f358, %f357, %f560, %f570;
mul.f32 %f359, %f284, %f354;
fma.rn.f32 %f360, %f359, %f560, %f358;
mul.f32 %f361, %f285, %f355;
fma.rn.f32 %f362, %f361, %f560, %f360;
mul.f32 %f363, %f286, %f356;
fma.rn.f32 %f570, %f363, %f560, %f362;
@%p36 bra $L__BB4_37;
ld.global.nc.u8 %rs185, [%rd11+7];
and.b16 %rs187, %rs185, 3;
cvt.u32.u16 %r215, %rs187;
add.s32 %r216, %r215, -1;
cvt.rn.f32.s32 %f364, %r216;
shr.u16 %rs188, %rs185, 2;
and.b16 %rs189, %rs188, 3;
cvt.u32.u16 %r217, %rs189;
add.s32 %r218, %r217, -1;
cvt.rn.f32.s32 %f365, %r218;
shr.u16 %rs190, %rs185, 4;
and.b16 %rs191, %rs190, 3;
cvt.u32.u16 %r219, %rs191;
add.s32 %r220, %r219, -1;
cvt.rn.f32.s32 %f366, %r220;
shr.u16 %rs192, %rs185, 6;
cvt.u32.u16 %r221, %rs192;
add.s32 %r222, %r221, -1;
cvt.rn.f32.s32 %f367, %r222;
mul.f32 %f368, %f283, %f364;
fma.rn.f32 %f369, %f368, %f559, %f570;
mul.f32 %f370, %f284, %f365;
fma.rn.f32 %f371, %f370, %f559, %f369;
mul.f32 %f372, %f285, %f366;
fma.rn.f32 %f373, %f372, %f559, %f371;
mul.f32 %f374, %f286, %f367;
fma.rn.f32 %f570, %f374, %f559, %f373;
$L__BB4_37:
ld.global.nc.v4.f32 {%f375, %f376, %f377, %f378}, [%rd7+48];
add.s32 %r223, %r357, 3;
mul.lo.s32 %r224, %r223, %r15;
cvt.s64.s32 %rd34, %r224;
add.s64 %rd35, %rd34, %rd5;
add.s64 %rd12, %rd2, %rd35;
ld.global.nc.u8 %rs193, [%rd12];
and.b16 %rs195, %rs193, 3;
cvt.u32.u16 %r225, %rs195;
add.s32 %r226, %r225, -1;
cvt.rn.f32.s32 %f379, %r226;
shr.u16 %rs196, %rs193, 2;
and.b16 %rs197, %rs196, 3;
cvt.u32.u16 %r227, %rs197;
add.s32 %r228, %r227, -1;
cvt.rn.f32.s32 %f380, %r228;
shr.u16 %rs198, %rs193, 4;
and.b16 %rs199, %rs198, 3;
cvt.u32.u16 %r229, %rs199;
add.s32 %r230, %r229, -1;
cvt.rn.f32.s32 %f381, %r230;
shr.u16 %rs200, %rs193, 6;
cvt.u32.u16 %r231, %rs200;
add.s32 %r232, %r231, -1;
cvt.rn.f32.s32 %f382, %r232;
mul.f32 %f383, %f375, %f379;
fma.rn.f32 %f384, %f383, %f566, %f570;
mul.f32 %f385, %f376, %f380;
fma.rn.f32 %f386, %f385, %f566, %f384;
mul.f32 %f387, %f377, %f381;
fma.rn.f32 %f388, %f387, %f566, %f386;
mul.f32 %f389, %f378, %f382;
fma.rn.f32 %f575, %f389, %f566, %f388;
@%p16 bra $L__BB4_45;
setp.eq.s32 %p38, %r15, 2;
ld.global.nc.u8 %rs201, [%rd12+1];
and.b16 %rs203, %rs201, 3;
cvt.u32.u16 %r233, %rs203;
add.s32 %r234, %r233, -1;
cvt.rn.f32.s32 %f390, %r234;
shr.u16 %rs204, %rs201, 2;
and.b16 %rs205, %rs204, 3;
cvt.u32.u16 %r235, %rs205;
add.s32 %r236, %r235, -1;
cvt.rn.f32.s32 %f391, %r236;
shr.u16 %rs206, %rs201, 4;
and.b16 %rs207, %rs206, 3;
cvt.u32.u16 %r237, %rs207;
add.s32 %r238, %r237, -1;
cvt.rn.f32.s32 %f392, %r238;
shr.u16 %rs208, %rs201, 6;
cvt.u32.u16 %r239, %rs208;
add.s32 %r240, %r239, -1;
cvt.rn.f32.s32 %f393, %r240;
mul.f32 %f394, %f375, %f390;
fma.rn.f32 %f395, %f394, %f565, %f575;
mul.f32 %f396, %f376, %f391;
fma.rn.f32 %f397, %f396, %f565, %f395;
mul.f32 %f398, %f377, %f392;
fma.rn.f32 %f399, %f398, %f565, %f397;
mul.f32 %f400, %f378, %f393;
fma.rn.f32 %f575, %f400, %f565, %f399;
@%p38 bra $L__BB4_45;
setp.lt.s32 %p39, %r15, 4;
ld.global.nc.u8 %rs209, [%rd12+2];
and.b16 %rs211, %rs209, 3;
cvt.u32.u16 %r241, %rs211;
add.s32 %r242, %r241, -1;
cvt.rn.f32.s32 %f401, %r242;
shr.u16 %rs212, %rs209, 2;
and.b16 %rs213, %rs212, 3;
cvt.u32.u16 %r243, %rs213;
add.s32 %r244, %r243, -1;
cvt.rn.f32.s32 %f402, %r244;
shr.u16 %rs214, %rs209, 4;
and.b16 %rs215, %rs214, 3;
cvt.u32.u16 %r245, %rs215;
add.s32 %r246, %r245, -1;
cvt.rn.f32.s32 %f403, %r246;
shr.u16 %rs216, %rs209, 6;
cvt.u32.u16 %r247, %rs216;
add.s32 %r248, %r247, -1;
cvt.rn.f32.s32 %f404, %r248;
mul.f32 %f405, %f375, %f401;
fma.rn.f32 %f406, %f405, %f564, %f575;
mul.f32 %f407, %f376, %f402;
fma.rn.f32 %f408, %f407, %f564, %f406;
mul.f32 %f409, %f377, %f403;
fma.rn.f32 %f410, %f409, %f564, %f408;
mul.f32 %f411, %f378, %f404;
fma.rn.f32 %f575, %f411, %f564, %f410;
@%p39 bra $L__BB4_45;
setp.eq.s32 %p40, %r15, 4;
ld.global.nc.u8 %rs217, [%rd12+3];
and.b16 %rs219, %rs217, 3;
cvt.u32.u16 %r249, %rs219;
add.s32 %r250, %r249, -1;
cvt.rn.f32.s32 %f412, %r250;
shr.u16 %rs220, %rs217, 2;
and.b16 %rs221, %rs220, 3;
cvt.u32.u16 %r251, %rs221;
add.s32 %r252, %r251, -1;
cvt.rn.f32.s32 %f413, %r252;
shr.u16 %rs222, %rs217, 4;
and.b16 %rs223, %rs222, 3;
cvt.u32.u16 %r253, %rs223;
add.s32 %r254, %r253, -1;
cvt.rn.f32.s32 %f414, %r254;
shr.u16 %rs224, %rs217, 6;
cvt.u32.u16 %r255, %rs224;
add.s32 %r256, %r255, -1;
cvt.rn.f32.s32 %f415, %r256;
mul.f32 %f416, %f375, %f412;
fma.rn.f32 %f417, %f416, %f563, %f575;
mul.f32 %f418, %f376, %f413;
fma.rn.f32 %f419, %f418, %f563, %f417;
mul.f32 %f420, %f377, %f414;
fma.rn.f32 %f421, %f420, %f563, %f419;
mul.f32 %f422, %f378, %f415;
fma.rn.f32 %f575, %f422, %f563, %f421;
@%p40 bra $L__BB4_45;
setp.lt.s32 %p41, %r15, 6;
ld.global.nc.u8 %rs225, [%rd12+4];
and.b16 %rs227, %rs225, 3;
cvt.u32.u16 %r257, %rs227;
add.s32 %r258, %r257, -1;
cvt.rn.f32.s32 %f423, %r258;
shr.u16 %rs228, %rs225, 2;
and.b16 %rs229, %rs228, 3;
cvt.u32.u16 %r259, %rs229;
add.s32 %r260, %r259, -1;
cvt.rn.f32.s32 %f424, %r260;
shr.u16 %rs230, %rs225, 4;
and.b16 %rs231, %rs230, 3;
cvt.u32.u16 %r261, %rs231;
add.s32 %r262, %r261, -1;
cvt.rn.f32.s32 %f425, %r262;
shr.u16 %rs232, %rs225, 6;
cvt.u32.u16 %r263, %rs232;
add.s32 %r264, %r263, -1;
cvt.rn.f32.s32 %f426, %r264;
mul.f32 %f427, %f375, %f423;
fma.rn.f32 %f428, %f427, %f562, %f575;
mul.f32 %f429, %f376, %f424;
fma.rn.f32 %f430, %f429, %f562, %f428;
mul.f32 %f431, %f377, %f425;
fma.rn.f32 %f432, %f431, %f562, %f430;
mul.f32 %f433, %f378, %f426;
fma.rn.f32 %f575, %f433, %f562, %f432;
@%p41 bra $L__BB4_45;
setp.eq.s32 %p42, %r15, 6;
ld.global.nc.u8 %rs233, [%rd12+5];
and.b16 %rs235, %rs233, 3;
cvt.u32.u16 %r265, %rs235;
add.s32 %r266, %r265, -1;
cvt.rn.f32.s32 %f434, %r266;
shr.u16 %rs236, %rs233, 2;
and.b16 %rs237, %rs236, 3;
cvt.u32.u16 %r267, %rs237;
add.s32 %r268, %r267, -1;
cvt.rn.f32.s32 %f435, %r268;
shr.u16 %rs238, %rs233, 4;
and.b16 %rs239, %rs238, 3;
cvt.u32.u16 %r269, %rs239;
add.s32 %r270, %r269, -1;
cvt.rn.f32.s32 %f436, %r270;
shr.u16 %rs240, %rs233, 6;
cvt.u32.u16 %r271, %rs240;
add.s32 %r272, %r271, -1;
cvt.rn.f32.s32 %f437, %r272;
mul.f32 %f438, %f375, %f434;
fma.rn.f32 %f439, %f438, %f561, %f575;
mul.f32 %f440, %f376, %f435;
fma.rn.f32 %f441, %f440, %f561, %f439;
mul.f32 %f442, %f377, %f436;
fma.rn.f32 %f443, %f442, %f561, %f441;
mul.f32 %f444, %f378, %f437;
fma.rn.f32 %f575, %f444, %f561, %f443;
@%p42 bra $L__BB4_45;
setp.lt.s32 %p43, %r15, 8;
ld.global.nc.u8 %rs241, [%rd12+6];
and.b16 %rs243, %rs241, 3;
cvt.u32.u16 %r273, %rs243;
add.s32 %r274, %r273, -1;
cvt.rn.f32.s32 %f445, %r274;
shr.u16 %rs244, %rs241, 2;
and.b16 %rs245, %rs244, 3;
cvt.u32.u16 %r275, %rs245;
add.s32 %r276, %r275, -1;
cvt.rn.f32.s32 %f446, %r276;
shr.u16 %rs246, %rs241, 4;
and.b16 %rs247, %rs246, 3;
cvt.u32.u16 %r277, %rs247;
add.s32 %r278, %r277, -1;
cvt.rn.f32.s32 %f447, %r278;
shr.u16 %rs248, %rs241, 6;
cvt.u32.u16 %r279, %rs248;
add.s32 %r280, %r279, -1;
cvt.rn.f32.s32 %f448, %r280;
mul.f32 %f449, %f375, %f445;
fma.rn.f32 %f450, %f449, %f560, %f575;
mul.f32 %f451, %f376, %f446;
fma.rn.f32 %f452, %f451, %f560, %f450;
mul.f32 %f453, %f377, %f447;
fma.rn.f32 %f454, %f453, %f560, %f452;
mul.f32 %f455, %f378, %f448;
fma.rn.f32 %f575, %f455, %f560, %f454;
@%p43 bra $L__BB4_45;
ld.global.nc.u8 %rs249, [%rd12+7];
and.b16 %rs251, %rs249, 3;
cvt.u32.u16 %r281, %rs251;
add.s32 %r282, %r281, -1;
cvt.rn.f32.s32 %f456, %r282;
shr.u16 %rs252, %rs249, 2;
and.b16 %rs253, %rs252, 3;
cvt.u32.u16 %r283, %rs253;
add.s32 %r284, %r283, -1;
cvt.rn.f32.s32 %f457, %r284;
shr.u16 %rs254, %rs249, 4;
and.b16 %rs255, %rs254, 3;
cvt.u32.u16 %r285, %rs255;
add.s32 %r286, %r285, -1;
cvt.rn.f32.s32 %f458, %r286;
shr.u16 %rs256, %rs249, 6;
cvt.u32.u16 %r287, %rs256;
add.s32 %r288, %r287, -1;
cvt.rn.f32.s32 %f459, %r288;
mul.f32 %f460, %f375, %f456;
fma.rn.f32 %f461, %f460, %f559, %f575;
mul.f32 %f462, %f376, %f457;
fma.rn.f32 %f463, %f462, %f559, %f461;
mul.f32 %f464, %f377, %f458;
fma.rn.f32 %f465, %f464, %f559, %f463;
mul.f32 %f466, %f378, %f459;
fma.rn.f32 %f575, %f466, %f559, %f465;
$L__BB4_45:
add.s32 %r357, %r357, 4;
add.s32 %r356, %r356, -4;
setp.ne.s32 %p44, %r356, 0;
@%p44 bra $L__BB4_13;
$L__BB4_46:
setp.eq.s32 %p45, %r358, 0;
@%p45 bra $L__BB4_57;
mul.lo.s32 %r289, %r357, %r15;
add.s64 %rd43, %rd2, %rd5;
cvt.s64.s32 %rd14, %r15;
cvt.s64.s32 %rd15, %r289;
mul.wide.s32 %rd36, %r357, 4;
add.s64 %rd37, %rd36, %rd4;
shl.b64 %rd38, %rd37, 2;
add.s64 %rd42, %rd3, %rd38;
$L__BB4_48:
.pragma "nounroll";
ld.global.nc.v4.f32 {%f467, %f468, %f469, %f470}, [%rd42];
add.s64 %rd19, %rd43, %rd15;
ld.global.nc.u8 %rs257, [%rd19];
and.b16 %rs259, %rs257, 3;
cvt.u32.u16 %r290, %rs259;
add.s32 %r291, %r290, -1;
cvt.rn.f32.s32 %f471, %r291;
shr.u16 %rs260, %rs257, 2;
and.b16 %rs261, %rs260, 3;
cvt.u32.u16 %r292, %rs261;
add.s32 %r293, %r292, -1;
cvt.rn.f32.s32 %f472, %r293;
shr.u16 %rs262, %rs257, 4;
and.b16 %rs263, %rs262, 3;
cvt.u32.u16 %r294, %rs263;
add.s32 %r295, %r294, -1;
cvt.rn.f32.s32 %f473, %r295;
shr.u16 %rs264, %rs257, 6;
cvt.u32.u16 %r296, %rs264;
add.s32 %r297, %r296, -1;
cvt.rn.f32.s32 %f474, %r297;
mul.f32 %f475, %f467, %f471;
fma.rn.f32 %f476, %f475, %f566, %f575;
mul.f32 %f477, %f468, %f472;
fma.rn.f32 %f478, %f477, %f566, %f476;
mul.f32 %f479, %f469, %f473;
fma.rn.f32 %f480, %f479, %f566, %f478;
mul.f32 %f481, %f470, %f474;
fma.rn.f32 %f575, %f481, %f566, %f480;
setp.lt.s32 %p46, %r15, 2;
@%p46 bra $L__BB4_56;
setp.eq.s32 %p47, %r15, 2;
ld.global.nc.u8 %rs265, [%rd19+1];
and.b16 %rs267, %rs265, 3;
cvt.u32.u16 %r298, %rs267;
add.s32 %r299, %r298, -1;
cvt.rn.f32.s32 %f482, %r299;
shr.u16 %rs268, %rs265, 2;
and.b16 %rs269, %rs268, 3;
cvt.u32.u16 %r300, %rs269;
add.s32 %r301, %r300, -1;
cvt.rn.f32.s32 %f483, %r301;
shr.u16 %rs270, %rs265, 4;
and.b16 %rs271, %rs270, 3;
cvt.u32.u16 %r302, %rs271;
add.s32 %r303, %r302, -1;
cvt.rn.f32.s32 %f484, %r303;
shr.u16 %rs272, %rs265, 6;
cvt.u32.u16 %r304, %rs272;
add.s32 %r305, %r304, -1;
cvt.rn.f32.s32 %f485, %r305;
mul.f32 %f486, %f467, %f482;
fma.rn.f32 %f487, %f486, %f565, %f575;
mul.f32 %f488, %f468, %f483;
fma.rn.f32 %f489, %f488, %f565, %f487;
mul.f32 %f490, %f469, %f484;
fma.rn.f32 %f491, %f490, %f565, %f489;
mul.f32 %f492, %f470, %f485;
fma.rn.f32 %f575, %f492, %f565, %f491;
@%p47 bra $L__BB4_56;
setp.lt.s32 %p48, %r15, 4;
ld.global.nc.u8 %rs273, [%rd19+2];
and.b16 %rs275, %rs273, 3;
cvt.u32.u16 %r306, %rs275;
add.s32 %r307, %r306, -1;
cvt.rn.f32.s32 %f493, %r307;
shr.u16 %rs276, %rs273, 2;
and.b16 %rs277, %rs276, 3;
cvt.u32.u16 %r308, %rs277;
add.s32 %r309, %r308, -1;
cvt.rn.f32.s32 %f494, %r309;
shr.u16 %rs278, %rs273, 4;
and.b16 %rs279, %rs278, 3;
cvt.u32.u16 %r310, %rs279;
add.s32 %r311, %r310, -1;
cvt.rn.f32.s32 %f495, %r311;
shr.u16 %rs280, %rs273, 6;
cvt.u32.u16 %r312, %rs280;
add.s32 %r313, %r312, -1;
cvt.rn.f32.s32 %f496, %r313;
mul.f32 %f497, %f467, %f493;
fma.rn.f32 %f498, %f497, %f564, %f575;
mul.f32 %f499, %f468, %f494;
fma.rn.f32 %f500, %f499, %f564, %f498;
mul.f32 %f501, %f469, %f495;
fma.rn.f32 %f502, %f501, %f564, %f500;
mul.f32 %f503, %f470, %f496;
fma.rn.f32 %f575, %f503, %f564, %f502;
@%p48 bra $L__BB4_56;
setp.eq.s32 %p49, %r15, 4;
ld.global.nc.u8 %rs281, [%rd19+3];
and.b16 %rs283, %rs281, 3;
cvt.u32.u16 %r314, %rs283;
add.s32 %r315, %r314, -1;
cvt.rn.f32.s32 %f504, %r315;
shr.u16 %rs284, %rs281, 2;
and.b16 %rs285, %rs284, 3;
cvt.u32.u16 %r316, %rs285;
add.s32 %r317, %r316, -1;
cvt.rn.f32.s32 %f505, %r317;
shr.u16 %rs286, %rs281, 4;
and.b16 %rs287, %rs286, 3;
cvt.u32.u16 %r318, %rs287;
add.s32 %r319, %r318, -1;
cvt.rn.f32.s32 %f506, %r319;
shr.u16 %rs288, %rs281, 6;
cvt.u32.u16 %r320, %rs288;
add.s32 %r321, %r320, -1;
cvt.rn.f32.s32 %f507, %r321;
mul.f32 %f508, %f467, %f504;
fma.rn.f32 %f509, %f508, %f563, %f575;
mul.f32 %f510, %f468, %f505;
fma.rn.f32 %f511, %f510, %f563, %f509;
mul.f32 %f512, %f469, %f506;
fma.rn.f32 %f513, %f512, %f563, %f511;
mul.f32 %f514, %f470, %f507;
fma.rn.f32 %f575, %f514, %f563, %f513;
@%p49 bra $L__BB4_56;
setp.lt.s32 %p50, %r15, 6;
ld.global.nc.u8 %rs289, [%rd19+4];
and.b16 %rs291, %rs289, 3;
cvt.u32.u16 %r322, %rs291;
add.s32 %r323, %r322, -1;
cvt.rn.f32.s32 %f515, %r323;
shr.u16 %rs292, %rs289, 2;
and.b16 %rs293, %rs292, 3;
cvt.u32.u16 %r324, %rs293;
add.s32 %r325, %r324, -1;
cvt.rn.f32.s32 %f516, %r325;
shr.u16 %rs294, %rs289, 4;
and.b16 %rs295, %rs294, 3;
cvt.u32.u16 %r326, %rs295;
add.s32 %r327, %r326, -1;
cvt.rn.f32.s32 %f517, %r327;
shr.u16 %rs296, %rs289, 6;
cvt.u32.u16 %r328, %rs296;
add.s32 %r329, %r328, -1;
cvt.rn.f32.s32 %f518, %r329;
mul.f32 %f519, %f467, %f515;
fma.rn.f32 %f520, %f519, %f562, %f575;
mul.f32 %f521, %f468, %f516;
fma.rn.f32 %f522, %f521, %f562, %f520;
mul.f32 %f523, %f469, %f517;
fma.rn.f32 %f524, %f523, %f562, %f522;
mul.f32 %f525, %f470, %f518;
fma.rn.f32 %f575, %f525, %f562, %f524;
@%p50 bra $L__BB4_56;
setp.eq.s32 %p51, %r15, 6;
ld.global.nc.u8 %rs297, [%rd19+5];
and.b16 %rs299, %rs297, 3;
cvt.u32.u16 %r330, %rs299;
add.s32 %r331, %r330, -1;
cvt.rn.f32.s32 %f526, %r331;
shr.u16 %rs300, %rs297, 2;
and.b16 %rs301, %rs300, 3;
cvt.u32.u16 %r332, %rs301;
add.s32 %r333, %r332, -1;
cvt.rn.f32.s32 %f527, %r333;
shr.u16 %rs302, %rs297, 4;
and.b16 %rs303, %rs302, 3;
cvt.u32.u16 %r334, %rs303;
add.s32 %r335, %r334, -1;
cvt.rn.f32.s32 %f528, %r335;
shr.u16 %rs304, %rs297, 6;
cvt.u32.u16 %r336, %rs304;
add.s32 %r337, %r336, -1;
cvt.rn.f32.s32 %f529, %r337;
mul.f32 %f530, %f467, %f526;
fma.rn.f32 %f531, %f530, %f561, %f575;
mul.f32 %f532, %f468, %f527;
fma.rn.f32 %f533, %f532, %f561, %f531;
mul.f32 %f534, %f469, %f528;
fma.rn.f32 %f535, %f534, %f561, %f533;
mul.f32 %f536, %f470, %f529;
fma.rn.f32 %f575, %f536, %f561, %f535;
@%p51 bra $L__BB4_56;
setp.lt.s32 %p52, %r15, 8;
ld.global.nc.u8 %rs305, [%rd19+6];
and.b16 %rs307, %rs305, 3;
cvt.u32.u16 %r338, %rs307;
add.s32 %r339, %r338, -1;
cvt.rn.f32.s32 %f537, %r339;
shr.u16 %rs308, %rs305, 2;
and.b16 %rs309, %rs308, 3;
cvt.u32.u16 %r340, %rs309;
add.s32 %r341, %r340, -1;
cvt.rn.f32.s32 %f538, %r341;
shr.u16 %rs310, %rs305, 4;
and.b16 %rs311, %rs310, 3;
cvt.u32.u16 %r342, %rs311;
add.s32 %r343, %r342, -1;
cvt.rn.f32.s32 %f539, %r343;
shr.u16 %rs312, %rs305, 6;
cvt.u32.u16 %r344, %rs312;
add.s32 %r345, %r344, -1;
cvt.rn.f32.s32 %f540, %r345;
mul.f32 %f541, %f467, %f537;
fma.rn.f32 %f542, %f541, %f560, %f575;
mul.f32 %f543, %f468, %f538;
fma.rn.f32 %f544, %f543, %f560, %f542;
mul.f32 %f545, %f469, %f539;
fma.rn.f32 %f546, %f545, %f560, %f544;
mul.f32 %f547, %f470, %f540;
fma.rn.f32 %f575, %f547, %f560, %f546;
@%p52 bra $L__BB4_56;
ld.global.nc.u8 %rs313, [%rd19+7];
and.b16 %rs315, %rs313, 3;
cvt.u32.u16 %r346, %rs315;
add.s32 %r347, %r346, -1;
cvt.rn.f32.s32 %f548, %r347;
shr.u16 %rs316, %rs313, 2;
and.b16 %rs317, %rs316, 3;
cvt.u32.u16 %r348, %rs317;
add.s32 %r349, %r348, -1;
cvt.rn.f32.s32 %f549, %r349;
shr.u16 %rs318, %rs313, 4;
and.b16 %rs319, %rs318, 3;
cvt.u32.u16 %r350, %rs319;
add.s32 %r351, %r350, -1;
cvt.rn.f32.s32 %f550, %r351;
shr.u16 %rs320, %rs313, 6;
cvt.u32.u16 %r352, %rs320;
add.s32 %r353, %r352, -1;
cvt.rn.f32.s32 %f551, %r353;
mul.f32 %f552, %f467, %f548;
fma.rn.f32 %f553, %f552, %f559, %f575;
mul.f32 %f554, %f468, %f549;
fma.rn.f32 %f555, %f554, %f559, %f553;
mul.f32 %f556, %f469, %f550;
fma.rn.f32 %f557, %f556, %f559, %f555;
mul.f32 %f558, %f470, %f551;
fma.rn.f32 %f575, %f558, %f559, %f557;
$L__BB4_56:
add.s64 %rd43, %rd43, %rd14;
add.s64 %rd42, %rd42, 16;
add.s32 %r358, %r358, -1;
setp.ne.s32 %p53, %r358, 0;
@%p53 bra $L__BB4_48;
$L__BB4_57:
mad.lo.s32 %r354, %r2, %r13, %r1;
cvta.to.global.u64 %rd39, %rd22;
mul.wide.s32 %rd40, %r354, 4;
add.s64 %rd41, %rd39, %rd40;
st.global.f32 [%rd41], %f575;
$L__BB4_58:
ret;
}
// .globl gemm_ternary_multibase_tiled_kernel_f32
//
// gemm_ternary_multibase_tiled_kernel_f32
//
// Each thread accumulates one f32 value and, if its global x index is in
// range, stores it to param_3[ctaid.y * param_6 + (ctaid.x * ntid.x + tid.x)].
// The k dimension is processed in tiles of 64 f32 inputs staged in shared
// memory; the packed operand is read four k-elements (one byte) at a time.
//
// Parameters (roles as established by their use in the body):
//   param_0 (u64): global pointer, read as f32 at index ctaid.y*param_5 + k.
//   param_1 (u64): global pointer, read as bytes. Each byte carries four
//                  2-bit fields (bits 0-1, 2-3, 4-5, 6-7); every field is
//                  decoded as (field - 1) and converted to f32.
//   param_2 (u64): global pointer to f32 factors; the first
//                  min(param_8, 8) of them are cached in shared memory.
//   param_3 (u64): global pointer to the f32 output.
//   param_4 (u32): exclusive bound on ctaid.y; blocks at or beyond it exit.
//   param_5 (u32): extent of the k loop and row stride (in f32) of param_0.
//   param_6 (u32): exclusive bound on the global x index and row stride
//                  (in f32) of param_3.
//   param_7 (u32): number of 4-element groups per x index in param_1; also
//                  clamps the per-tile group loop.
//   param_8 (u32): bytes stored per group in param_1 (address stride); only
//                  the first min(param_8, 8) bytes of a group are consumed.
//
// NOTE(review): the kernel name suggests ternary weights combined over
// several scaled "bases"; the decode (field - 1) yields -1..2, so this
// presumably relies on field value 3 never being stored -- confirm against
// the packing code.
.visible .entry gemm_ternary_multibase_tiled_kernel_f32(
.param .u64 gemm_ternary_multibase_tiled_kernel_f32_param_0,
.param .u64 gemm_ternary_multibase_tiled_kernel_f32_param_1,
.param .u64 gemm_ternary_multibase_tiled_kernel_f32_param_2,
.param .u64 gemm_ternary_multibase_tiled_kernel_f32_param_3,
.param .u32 gemm_ternary_multibase_tiled_kernel_f32_param_4,
.param .u32 gemm_ternary_multibase_tiled_kernel_f32_param_5,
.param .u32 gemm_ternary_multibase_tiled_kernel_f32_param_6,
.param .u32 gemm_ternary_multibase_tiled_kernel_f32_param_7,
.param .u32 gemm_ternary_multibase_tiled_kernel_f32_param_8
)
{
.reg .pred %p<50>;
.reg .b16 %rs<57>;
.reg .f32 %f<136>;
.reg .b32 %r<129>;
.reg .b64 %rd<22>;
// demoted variable
// x_tile: 256 bytes = 64 f32 slots holding the current tile of param_0.
.shared .align 4 .b8 _ZZ39gemm_ternary_multibase_tiled_kernel_f32E6x_tile[256];
// demoted variable
// s_shared: 32 bytes = 8 f32 slots holding the cached factors from param_2.
.shared .align 4 .b8 _ZZ39gemm_ternary_multibase_tiled_kernel_f32E8s_shared[32];
ld.param.u64 %rd7, [gemm_ternary_multibase_tiled_kernel_f32_param_0];
ld.param.u64 %rd10, [gemm_ternary_multibase_tiled_kernel_f32_param_1];
ld.param.u64 %rd8, [gemm_ternary_multibase_tiled_kernel_f32_param_2];
ld.param.u64 %rd9, [gemm_ternary_multibase_tiled_kernel_f32_param_3];
ld.param.u32 %r35, [gemm_ternary_multibase_tiled_kernel_f32_param_4];
ld.param.u32 %r31, [gemm_ternary_multibase_tiled_kernel_f32_param_5];
ld.param.u32 %r32, [gemm_ternary_multibase_tiled_kernel_f32_param_6];
ld.param.u32 %r33, [gemm_ternary_multibase_tiled_kernel_f32_param_7];
ld.param.u32 %r34, [gemm_ternary_multibase_tiled_kernel_f32_param_8];
cvta.to.global.u64 %rd1, %rd10;
// %r1 = ntid.x, %r2 = tid.x, %r3 = global x index of this thread,
// %r4 = ctaid.y (row handled by this block).
mov.u32 %r1, %ntid.x;
mov.u32 %r36, %ctaid.x;
mov.u32 %r2, %tid.x;
mad.lo.s32 %r3, %r36, %r1, %r2;
mov.u32 %r4, %ctaid.y;
// Exit when the block's row is out of range. ctaid.y is the same for every
// thread of the block, so either all threads leave here or none do, and the
// barriers below are never reached by only part of a block.
setp.ge.s32 %p1, %r4, %r35;
@%p1 bra $L__BB5_31;
// Threads with tid.x < param_8 and tid.x <= 7 each copy one f32 factor
// from param_2[tid.x] into s_shared[tid.x].
setp.ge.u32 %p2, %r2, %r34;
setp.gt.u32 %p3, %r2, 7;
or.pred %p4, %p3, %p2;
@%p4 bra $L__BB5_3;
cvta.to.global.u64 %rd11, %rd8;
mul.wide.u32 %rd12, %r2, 4;
add.s64 %rd13, %rd11, %rd12;
ld.global.nc.f32 %f21, [%rd13];
shl.b32 %r37, %r2, 2;
mov.u32 %r38, _ZZ39gemm_ternary_multibase_tiled_kernel_f32E8s_shared;
add.s32 %r39, %r38, %r37;
st.shared.f32 [%r39], %f21;
$L__BB5_3:
// Barrier after the s_shared stores, before any thread reads them.
bar.sync 0;
// With param_5 < 1 there is nothing to accumulate; %f133 stays 0.0.
setp.lt.s32 %p5, %r31, 1;
mov.f32 %f133, 0f00000000;
@%p5 bra $L__BB5_29;
// Loop-invariant setup:
//   %r5 = ctaid.y * param_5      (row offset into param_0, in f32)
//   %r6 = x index * param_7      (group offset into param_1)
//   %r7 = max.u32(-param_8, -8)  = -min(param_8, 8) whenever param_8 >= 1
//         (the param_8 < 1 case is branched around before %r7 is used)
//   %r8 = min(param_8, 8) & 3    (bytes left over after 4-way unrolling)
//   %r9 = min(param_8, 8) - %r8  (bytes handled by the unrolled loop)
mul.lo.s32 %r5, %r4, %r31;
mul.lo.s32 %r6, %r3, %r33;
neg.s32 %r41, %r34;
mov.u32 %r122, 0;
max.u32 %r7, %r41, -8;
neg.s32 %r42, %r7;
and.b32 %r8, %r42, 3;
add.s32 %r43, %r7, %r8;
neg.s32 %r9, %r43;
cvta.to.global.u64 %rd2, %rd7;
// %f133 is the running accumulator for this thread's output element.
mov.f32 %f133, 0f00000000;
$L__BB5_5:
// Outer loop over k tiles: %r122 is the first k of the current 64-wide tile.
// A thread takes part in staging only if tid.x <= 63 and %r122 + tid.x is
// still below param_5.
add.s32 %r44, %r2, %r122;
setp.ge.s32 %p6, %r44, %r31;
setp.gt.s32 %p7, %r2, 63;
or.pred %p8, %p7, %p6;
@%p8 bra $L__BB5_8;
add.s32 %r11, %r122, %r5;
mov.u32 %r123, %r2;
$L__BB5_7:
// Staging loop: x_tile[i] = param_0[ctaid.y*param_5 + %r122 + i] for
// i = tid.x, tid.x + ntid.x, ... while i < 64 and %r122 + i < param_5.
add.s32 %r45, %r11, %r123;
mul.wide.s32 %rd14, %r45, 4;
add.s64 %rd15, %rd2, %rd14;
ld.global.nc.f32 %f24, [%rd15];
shl.b32 %r46, %r123, 2;
mov.u32 %r47, _ZZ39gemm_ternary_multibase_tiled_kernel_f32E6x_tile;
add.s32 %r48, %r47, %r46;
st.shared.f32 [%r48], %f24;
add.s32 %r123, %r123, %r1;
setp.lt.s32 %p9, %r123, 64;
add.s32 %r49, %r123, %r122;
setp.lt.s32 %p10, %r49, %r31;
and.pred %p11, %p9, %p10;
@%p11 bra $L__BB5_7;
$L__BB5_8:
// The range predicate is computed before the barrier and branched on after
// it: threads whose x index is >= param_6 skip the compute but still execute
// this barrier and the one at the end of the tile.
setp.ge.s32 %p12, %r3, %r32;
bar.sync 0;
@%p12 bra $L__BB5_28;
// Group range for this tile: %r124 = %r122 >> 2 is the first group,
// %r15 = min((%r122 + 67) >> 2, param_7) is the exclusive end.
shr.u32 %r124, %r122, 2;
add.s32 %r50, %r122, 67;
shr.u32 %r51, %r50, 2;
min.s32 %r15, %r51, %r33;
setp.ge.s32 %p13, %r124, %r15;
@%p13 bra $L__BB5_28;
// param_8 < 1 means no bytes per group: nothing to do for this tile.
// %r16 = exclusive upper k bound of the tile.
setp.lt.s32 %p14, %r34, 1;
add.s32 %r16, %r122, 64;
@%p14 bra $L__BB5_28;
$L__BB5_11:
// Per-group loop. %r18 = first k of group %r124; %r19 = shared address of
// x_tile[%r18 - %r122]. The four inputs %f126..%f129 are fetched one by one;
// any k outside [%r122, %r16) or >= param_5 leaves the default 0.0.
shl.b32 %r18, %r124, 2;
setp.lt.s32 %p15, %r18, %r122;
setp.ge.s32 %p16, %r18, %r16;
or.pred %p17, %p16, %p15;
setp.ge.s32 %p18, %r18, %r31;
sub.s32 %r52, %r18, %r122;
shl.b32 %r53, %r52, 2;
mov.u32 %r54, _ZZ39gemm_ternary_multibase_tiled_kernel_f32E6x_tile;
add.s32 %r19, %r54, %r53;
mov.f32 %f127, 0f00000000;
or.pred %p19, %p18, %p17;
mov.f32 %f126, %f127;
@%p19 bra $L__BB5_13;
ld.shared.f32 %f126, [%r19];
$L__BB5_13:
// Input for k = %r18 + 1.
add.s32 %r55, %r18, 1;
setp.lt.u32 %p20, %r55, %r122;
setp.ge.u32 %p21, %r55, %r16;
or.pred %p22, %p21, %p20;
setp.ge.s32 %p23, %r55, %r31;
or.pred %p24, %p23, %p22;
@%p24 bra $L__BB5_15;
ld.shared.f32 %f127, [%r19+4];
$L__BB5_15:
// Input for k = %r18 + 2.
add.s32 %r56, %r18, 2;
setp.lt.u32 %p25, %r56, %r122;
setp.ge.u32 %p26, %r56, %r16;
or.pred %p27, %p26, %p25;
setp.ge.s32 %p28, %r56, %r31;
mov.f32 %f129, 0f00000000;
or.pred %p29, %p28, %p27;
mov.f32 %f128, %f129;
@%p29 bra $L__BB5_17;
ld.shared.f32 %f128, [%r19+8];
$L__BB5_17:
// Input for k = %r18 + 3.
add.s32 %r57, %r18, 3;
setp.lt.u32 %p30, %r57, %r122;
setp.ge.u32 %p31, %r57, %r16;
or.pred %p32, %p31, %p30;
setp.ge.s32 %p33, %r57, %r31;
or.pred %p34, %p33, %p32;
@%p34 bra $L__BB5_19;
ld.shared.f32 %f129, [%r19+12];
$L__BB5_19:
// If all four inputs compare equal to zero, skip this group without reading
// param_1 at all.
setp.eq.f32 %p35, %f127, 0f00000000;
setp.eq.f32 %p36, %f126, 0f00000000;
and.pred %p37, %p36, %p35;
setp.eq.f32 %p38, %f128, 0f00000000;
and.pred %p39, %p37, %p38;
setp.eq.f32 %p40, %f129, 0f00000000;
and.pred %p41, %p39, %p40;
@%p41 bra $L__BB5_27;
// %r20 = (group + x_index*param_7) * param_8 = byte offset in param_1 of the
// first byte belonging to this group. %r128 counts bytes already consumed.
// %r7 > -4 (unsigned) means fewer than 4 bytes to consume, so the unrolled
// loop is skipped and only the remainder code runs.
setp.gt.u32 %p42, %r7, -4;
add.s32 %r59, %r124, %r6;
mul.lo.s32 %r20, %r59, %r34;
mov.u32 %r128, 0;
@%p42 bra $L__BB5_23;
// Unrolled-loop cursors: %rd21 walks the bytes of param_1, %r125 walks
// s_shared, %r126 counts down from %r9 in steps of 4.
cvt.s64.s32 %rd16, %r20;
add.s64 %rd21, %rd1, %rd16;
mov.u32 %r128, 0;
mov.u32 %r125, _ZZ39gemm_ternary_multibase_tiled_kernel_f32E8s_shared;
mov.u32 %r126, %r9;
$L__BB5_22:
// 4-way unrolled body. For each byte j: decode the four 2-bit fields as
// w_i = float(field_i - 1), load s = s_shared[j], and add
// s*(x0*w0) + s*(x1*w1) + s*(x2*w2) + s*(x3*w3) to the accumulator
// (one mul followed by three fma, then a separate add).
// -- byte at offset +0 --
ld.global.nc.u8 %rs1, [%rd21];
and.b16 %rs3, %rs1, 3;
cvt.u32.u16 %r62, %rs3;
add.s32 %r63, %r62, -1;
cvt.rn.f32.s32 %f30, %r63;
shr.u16 %rs4, %rs1, 2;
and.b16 %rs5, %rs4, 3;
cvt.u32.u16 %r64, %rs5;
add.s32 %r65, %r64, -1;
cvt.rn.f32.s32 %f31, %r65;
shr.u16 %rs6, %rs1, 4;
and.b16 %rs7, %rs6, 3;
cvt.u32.u16 %r66, %rs7;
add.s32 %r67, %r66, -1;
cvt.rn.f32.s32 %f32, %r67;
shr.u16 %rs8, %rs1, 6;
cvt.u32.u16 %r68, %rs8;
add.s32 %r69, %r68, -1;
cvt.rn.f32.s32 %f33, %r69;
mul.f32 %f34, %f126, %f30;
ld.shared.f32 %f35, [%r125];
mul.f32 %f36, %f127, %f31;
mul.f32 %f37, %f35, %f36;
fma.rn.f32 %f38, %f35, %f34, %f37;
mul.f32 %f39, %f128, %f32;
fma.rn.f32 %f40, %f35, %f39, %f38;
mul.f32 %f41, %f129, %f33;
fma.rn.f32 %f42, %f35, %f41, %f40;
add.f32 %f43, %f133, %f42;
// -- byte at offset +1 --
ld.global.nc.u8 %rs9, [%rd21+1];
and.b16 %rs11, %rs9, 3;
cvt.u32.u16 %r70, %rs11;
add.s32 %r71, %r70, -1;
cvt.rn.f32.s32 %f44, %r71;
shr.u16 %rs12, %rs9, 2;
and.b16 %rs13, %rs12, 3;
cvt.u32.u16 %r72, %rs13;
add.s32 %r73, %r72, -1;
cvt.rn.f32.s32 %f45, %r73;
shr.u16 %rs14, %rs9, 4;
and.b16 %rs15, %rs14, 3;
cvt.u32.u16 %r74, %rs15;
add.s32 %r75, %r74, -1;
cvt.rn.f32.s32 %f46, %r75;
shr.u16 %rs16, %rs9, 6;
cvt.u32.u16 %r76, %rs16;
add.s32 %r77, %r76, -1;
cvt.rn.f32.s32 %f47, %r77;
mul.f32 %f48, %f126, %f44;
ld.shared.f32 %f49, [%r125+4];
mul.f32 %f50, %f127, %f45;
mul.f32 %f51, %f49, %f50;
fma.rn.f32 %f52, %f49, %f48, %f51;
mul.f32 %f53, %f128, %f46;
fma.rn.f32 %f54, %f49, %f53, %f52;
mul.f32 %f55, %f129, %f47;
fma.rn.f32 %f56, %f49, %f55, %f54;
add.f32 %f57, %f43, %f56;
// -- byte at offset +2 --
ld.global.nc.u8 %rs17, [%rd21+2];
and.b16 %rs19, %rs17, 3;
cvt.u32.u16 %r78, %rs19;
add.s32 %r79, %r78, -1;
cvt.rn.f32.s32 %f58, %r79;
shr.u16 %rs20, %rs17, 2;
and.b16 %rs21, %rs20, 3;
cvt.u32.u16 %r80, %rs21;
add.s32 %r81, %r80, -1;
cvt.rn.f32.s32 %f59, %r81;
shr.u16 %rs22, %rs17, 4;
and.b16 %rs23, %rs22, 3;
cvt.u32.u16 %r82, %rs23;
add.s32 %r83, %r82, -1;
cvt.rn.f32.s32 %f60, %r83;
shr.u16 %rs24, %rs17, 6;
cvt.u32.u16 %r84, %rs24;
add.s32 %r85, %r84, -1;
cvt.rn.f32.s32 %f61, %r85;
mul.f32 %f62, %f126, %f58;
ld.shared.f32 %f63, [%r125+8];
mul.f32 %f64, %f127, %f59;
mul.f32 %f65, %f63, %f64;
fma.rn.f32 %f66, %f63, %f62, %f65;
mul.f32 %f67, %f128, %f60;
fma.rn.f32 %f68, %f63, %f67, %f66;
mul.f32 %f69, %f129, %f61;
fma.rn.f32 %f70, %f63, %f69, %f68;
add.f32 %f71, %f57, %f70;
// -- byte at offset +3 --
ld.global.nc.u8 %rs25, [%rd21+3];
and.b16 %rs27, %rs25, 3;
cvt.u32.u16 %r86, %rs27;
add.s32 %r87, %r86, -1;
cvt.rn.f32.s32 %f72, %r87;
shr.u16 %rs28, %rs25, 2;
and.b16 %rs29, %rs28, 3;
cvt.u32.u16 %r88, %rs29;
add.s32 %r89, %r88, -1;
cvt.rn.f32.s32 %f73, %r89;
shr.u16 %rs30, %rs25, 4;
and.b16 %rs31, %rs30, 3;
cvt.u32.u16 %r90, %rs31;
add.s32 %r91, %r90, -1;
cvt.rn.f32.s32 %f74, %r91;
shr.u16 %rs32, %rs25, 6;
cvt.u32.u16 %r92, %rs32;
add.s32 %r93, %r92, -1;
cvt.rn.f32.s32 %f75, %r93;
mul.f32 %f76, %f126, %f72;
ld.shared.f32 %f77, [%r125+12];
mul.f32 %f78, %f127, %f73;
mul.f32 %f79, %f77, %f78;
fma.rn.f32 %f80, %f77, %f76, %f79;
mul.f32 %f81, %f128, %f74;
fma.rn.f32 %f82, %f77, %f81, %f80;
mul.f32 %f83, %f129, %f75;
fma.rn.f32 %f84, %f77, %f83, %f82;
add.f32 %f133, %f71, %f84;
// Advance all cursors by four bytes / four factors and loop until %r126 == 0.
add.s32 %r128, %r128, 4;
add.s64 %rd21, %rd21, 4;
add.s32 %r125, %r125, 16;
add.s32 %r126, %r126, -4;
setp.ne.s32 %p43, %r126, 0;
@%p43 bra $L__BB5_22;
$L__BB5_23:
// Remainder: %r8 (0..3) bytes are left. %rd6 points at byte %r128 of this
// group in param_1 and %r28 at s_shared[%r128]; the three copies below run
// for %r8 >= 1, >= 2 and == 3 respectively, with the same arithmetic as the
// unrolled body.
setp.eq.s32 %p44, %r8, 0;
@%p44 bra $L__BB5_27;
setp.eq.s32 %p45, %r8, 1;
add.s32 %r94, %r128, %r20;
cvt.s64.s32 %rd17, %r94;
add.s64 %rd6, %rd1, %rd17;
ld.global.nc.u8 %rs33, [%rd6];
and.b16 %rs35, %rs33, 3;
cvt.u32.u16 %r95, %rs35;
add.s32 %r96, %r95, -1;
cvt.rn.f32.s32 %f85, %r96;
shr.u16 %rs36, %rs33, 2;
and.b16 %rs37, %rs36, 3;
cvt.u32.u16 %r97, %rs37;
add.s32 %r98, %r97, -1;
cvt.rn.f32.s32 %f86, %r98;
shr.u16 %rs38, %rs33, 4;
and.b16 %rs39, %rs38, 3;
cvt.u32.u16 %r99, %rs39;
add.s32 %r100, %r99, -1;
cvt.rn.f32.s32 %f87, %r100;
shr.u16 %rs40, %rs33, 6;
cvt.u32.u16 %r101, %rs40;
add.s32 %r102, %r101, -1;
cvt.rn.f32.s32 %f88, %r102;
mul.f32 %f89, %f126, %f85;
shl.b32 %r103, %r128, 2;
mov.u32 %r104, _ZZ39gemm_ternary_multibase_tiled_kernel_f32E8s_shared;
add.s32 %r28, %r104, %r103;
ld.shared.f32 %f90, [%r28];
mul.f32 %f91, %f127, %f86;
mul.f32 %f92, %f90, %f91;
fma.rn.f32 %f93, %f90, %f89, %f92;
mul.f32 %f94, %f128, %f87;
fma.rn.f32 %f95, %f90, %f94, %f93;
mul.f32 %f96, %f129, %f88;
fma.rn.f32 %f97, %f90, %f96, %f95;
add.f32 %f133, %f133, %f97;
@%p45 bra $L__BB5_27;
// Second leftover byte.
setp.eq.s32 %p46, %r8, 2;
ld.global.nc.u8 %rs41, [%rd6+1];
and.b16 %rs43, %rs41, 3;
cvt.u32.u16 %r105, %rs43;
add.s32 %r106, %r105, -1;
cvt.rn.f32.s32 %f98, %r106;
shr.u16 %rs44, %rs41, 2;
and.b16 %rs45, %rs44, 3;
cvt.u32.u16 %r107, %rs45;
add.s32 %r108, %r107, -1;
cvt.rn.f32.s32 %f99, %r108;
shr.u16 %rs46, %rs41, 4;
and.b16 %rs47, %rs46, 3;
cvt.u32.u16 %r109, %rs47;
add.s32 %r110, %r109, -1;
cvt.rn.f32.s32 %f100, %r110;
shr.u16 %rs48, %rs41, 6;
cvt.u32.u16 %r111, %rs48;
add.s32 %r112, %r111, -1;
cvt.rn.f32.s32 %f101, %r112;
mul.f32 %f102, %f126, %f98;
ld.shared.f32 %f103, [%r28+4];
mul.f32 %f104, %f127, %f99;
mul.f32 %f105, %f103, %f104;
fma.rn.f32 %f106, %f103, %f102, %f105;
mul.f32 %f107, %f128, %f100;
fma.rn.f32 %f108, %f103, %f107, %f106;
mul.f32 %f109, %f129, %f101;
fma.rn.f32 %f110, %f103, %f109, %f108;
add.f32 %f133, %f133, %f110;
@%p46 bra $L__BB5_27;
// Third leftover byte.
ld.global.nc.u8 %rs49, [%rd6+2];
and.b16 %rs51, %rs49, 3;
cvt.u32.u16 %r113, %rs51;
add.s32 %r114, %r113, -1;
cvt.rn.f32.s32 %f111, %r114;
shr.u16 %rs52, %rs49, 2;
and.b16 %rs53, %rs52, 3;
cvt.u32.u16 %r115, %rs53;
add.s32 %r116, %r115, -1;
cvt.rn.f32.s32 %f112, %r116;
shr.u16 %rs54, %rs49, 4;
and.b16 %rs55, %rs54, 3;
cvt.u32.u16 %r117, %rs55;
add.s32 %r118, %r117, -1;
cvt.rn.f32.s32 %f113, %r118;
shr.u16 %rs56, %rs49, 6;
cvt.u32.u16 %r119, %rs56;
add.s32 %r120, %r119, -1;
cvt.rn.f32.s32 %f114, %r120;
mul.f32 %f115, %f126, %f111;
ld.shared.f32 %f116, [%r28+8];
mul.f32 %f117, %f127, %f112;
mul.f32 %f118, %f116, %f117;
fma.rn.f32 %f119, %f116, %f115, %f118;
mul.f32 %f120, %f128, %f113;
fma.rn.f32 %f121, %f116, %f120, %f119;
mul.f32 %f122, %f129, %f114;
fma.rn.f32 %f123, %f116, %f122, %f121;
add.f32 %f133, %f133, %f123;
$L__BB5_27:
// Next group of four k-elements within the current tile.
add.s32 %r124, %r124, 1;
setp.lt.s32 %p47, %r124, %r15;
@%p47 bra $L__BB5_11;
$L__BB5_28:
// Barrier before x_tile is overwritten by the next tile's staging loop;
// then advance %r122 by one tile and continue while it is below param_5.
bar.sync 0;
add.s32 %r122, %r122, 64;
setp.lt.s32 %p48, %r122, %r31;
@%p48 bra $L__BB5_5;
$L__BB5_29:
// Threads whose x index is below param_6 store the accumulator to
// param_3[ctaid.y * param_6 + x index]; the others just return.
setp.ge.s32 %p49, %r3, %r32;
@%p49 bra $L__BB5_31;
mad.lo.s32 %r121, %r4, %r32, %r3;
cvta.to.global.u64 %rd18, %rd9;
mul.wide.s32 %rd19, %r121, 4;
add.s64 %rd20, %rd18, %rd19;
st.global.f32 [%rd20], %f133;
$L__BB5_31:
ret;
}