//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-33961263
// Cuda compilation tools, release 12.4, V12.4.99
// Based on NVVM 7.0.1
//
.version 8.4
.target sm_89
.address_size 64
// .globl adaptive_gemm_n3_kernel_f32
// ---------------------------------------------------------------------------
// adaptive_gemm_n3_kernel_f32
//
// Each thread produces one f32 output element.
//   row = %ctaid.y
//   col = %ctaid.x * %ntid.x + %tid.x
// Threads with row >= M or col >= N return without storing anything.
//
// Parameters (M, K, N are names used only in these comments):
//   param_0 (u64): pointer to f32 input; element (row, k) is read from
//                  index row*K + k.
//   param_1 (u64): pointer to packed bytes, read as three consecutive planes
//                  of (K/4)*N bytes each; byte i of column `col` in plane p
//                  is at p*(K/4)*N + col*(K/4) + i.
//   param_2 (u64): pointer to three f32 coefficients c0, c1, c2 (one per plane).
//   param_3 (u64): pointer to f32 output; the result is stored at row*N + col.
//   param_4 (u32): M, row bound.
//   param_5 (u32): K, inner length. Only the first 4*(K/4) elements are
//                  used; for K < 4 the stored result is 0.0.
//   param_6 (u32): N, column bound.
//
// Decoding: every packed byte holds four 2-bit fields. Field j (j = 0..3)
// decodes to bit(2j) - bit(2j+1), i.e. a value in {-1, 0, +1}.
// For each of the four positions the effective weight is
//   w = c0*v(plane0) + c1*v(plane1) + c2*v(plane2)
// and the accumulator is updated as acc = fma(input, w, acc).
//
// All element/byte offsets are formed as 64-bit products (mul.wide.s32) so
// that (K/4)*N, 2*(K/4)*N, (K/4)*col, row*K and row*N + col cannot wrap
// around at 32 bits for large problem sizes.
// ---------------------------------------------------------------------------
.visible .entry adaptive_gemm_n3_kernel_f32(
    .param .u64 adaptive_gemm_n3_kernel_f32_param_0,
    .param .u64 adaptive_gemm_n3_kernel_f32_param_1,
    .param .u64 adaptive_gemm_n3_kernel_f32_param_2,
    .param .u64 adaptive_gemm_n3_kernel_f32_param_3,
    .param .u32 adaptive_gemm_n3_kernel_f32_param_4,
    .param .u32 adaptive_gemm_n3_kernel_f32_param_5,
    .param .u32 adaptive_gemm_n3_kernel_f32_param_6
)
{
    .reg .pred %p<6>;
    .reg .b16 %rs<49>;
    .reg .f32 %f<42>;
    .reg .b32 %r<58>;
    .reg .b64 %rd<34>;

    // Load kernel arguments.
    ld.param.u64 %rd13, [adaptive_gemm_n3_kernel_f32_param_0];   // input pointer
    ld.param.u64 %rd14, [adaptive_gemm_n3_kernel_f32_param_1];   // packed-byte pointer
    ld.param.u64 %rd15, [adaptive_gemm_n3_kernel_f32_param_2];   // coefficient pointer
    ld.param.u64 %rd16, [adaptive_gemm_n3_kernel_f32_param_3];   // output pointer
    ld.param.u32 %r8, [adaptive_gemm_n3_kernel_f32_param_4];     // M
    ld.param.u32 %r6, [adaptive_gemm_n3_kernel_f32_param_5];     // K
    ld.param.u32 %r7, [adaptive_gemm_n3_kernel_f32_param_6];     // N

    // col = ctaid.x * ntid.x + tid.x ; row = ctaid.y
    mov.u32 %r9, %ntid.x;
    mov.u32 %r10, %ctaid.x;
    mov.u32 %r11, %tid.x;
    mad.lo.s32 %r1, %r10, %r9, %r11;
    setp.ge.s32 %p1, %r1, %r7;
    mov.u32 %r2, %ctaid.y;
    setp.ge.s32 %p2, %r2, %r8;
    or.pred %p3, %p2, %p1;
    @%p3 bra $L__BB0_5;                  // out of range: nothing to do

    // %r3 = K / 4 (signed division, rounding toward zero).
    shr.s32 %r12, %r6, 31;
    shr.u32 %r13, %r12, 30;
    add.s32 %r14, %r6, %r13;
    shr.s32 %r3, %r14, 2;
    setp.lt.s32 %p4, %r6, 4;
    mov.f32 %f41, 0f00000000;            // acc = 0.0
    @%p4 bra $L__BB0_4;                  // K < 4: store the zero accumulator

    cvta.to.global.u64 %rd17, %rd14;     // packed-byte base
    cvta.to.global.u64 %rd18, %rd15;     // coefficient base

    // 64-bit byte offsets into the packed planes (no 32-bit wrap-around).
    mul.wide.s32 %rd22, %r3, %r7;        // plane stride  = (K/4) * N
    mul.wide.s32 %rd19, %r3, %r1;        // column offset = (K/4) * col

    // Loop-invariant coefficients c0, c1, c2.
    ld.global.nc.f32 %f1, [%rd18];
    ld.global.nc.f32 %f2, [%rd18+4];
    ld.global.nc.f32 %f3, [%rd18+8];

    shl.b64 %rd20, %rd22, 1;             // 2 * plane stride
    add.s64 %rd21, %rd20, %rd19;
    add.s64 %rd33, %rd17, %rd21;         // plane 2 cursor
    add.s64 %rd23, %rd22, %rd19;
    add.s64 %rd32, %rd17, %rd23;         // plane 1 cursor
    add.s64 %rd31, %rd17, %rd19;         // plane 0 cursor

    // Input cursor: &input[row*K] + 8 bytes, so the four loads of one
    // iteration use displacements -8, -4, 0 and +4.
    cvta.to.global.u64 %rd24, %rd13;
    mul.wide.s32 %rd1, %r2, %r6;         // row * K in 64-bit
    shl.b64 %rd25, %rd1, 2;              // * sizeof(f32)
    add.s64 %rd26, %rd24, %rd25;
    add.s64 %rd30, %rd26, 8;
    mov.f32 %f41, 0f00000000;
    mov.u32 %r57, 0;                     // i = 0

$L__BB0_3:
    // One iteration consumes one byte from each plane and four inputs.
    ld.global.nc.u8 %rs1, [%rd31];       // plane 0 byte
    and.b16 %rs3, %rs1, 2;
    ld.global.nc.u8 %rs4, [%rd32];       // plane 1 byte
    and.b16 %rs6, %rs4, 2;
    ld.global.nc.u8 %rs7, [%rd33];       // plane 2 byte
    and.b16 %rs9, %rs7, 2;

    // ---- field 0: bit0 - bit1 ----
    and.b16 %rs10, %rs1, 1;
    cvt.u32.u16 %r20, %rs10;
    shr.u16 %rs11, %rs3, 1;
    cvt.u32.u16 %r21, %rs11;
    sub.s32 %r22, %r20, %r21;
    cvt.rn.f32.s32 %f9, %r22;            // v(plane0)
    and.b16 %rs12, %rs4, 1;
    cvt.u32.u16 %r23, %rs12;
    shr.u16 %rs13, %rs6, 1;
    cvt.u32.u16 %r24, %rs13;
    sub.s32 %r25, %r23, %r24;
    cvt.rn.f32.s32 %f10, %r25;           // v(plane1)
    mul.f32 %f11, %f2, %f10;
    fma.rn.f32 %f12, %f1, %f9, %f11;
    and.b16 %rs14, %rs7, 1;
    cvt.u32.u16 %r26, %rs14;
    shr.u16 %rs15, %rs9, 1;
    cvt.u32.u16 %r27, %rs15;
    sub.s32 %r28, %r26, %r27;
    cvt.rn.f32.s32 %f13, %r28;           // v(plane2)
    fma.rn.f32 %f14, %f3, %f13, %f12;    // w = c0*v0 + c1*v1 + c2*v2
    ld.global.nc.f32 %f15, [%rd30+-8];   // input[4i + 0]
    fma.rn.f32 %f16, %f15, %f14, %f41;   // acc += input * w

    // ---- field 1: bit2 - bit3 ----
    shr.u16 %rs16, %rs1, 3;
    shr.u16 %rs17, %rs1, 2;
    shr.u16 %rs18, %rs4, 3;
    shr.u16 %rs19, %rs4, 2;
    shr.u16 %rs20, %rs7, 3;
    shr.u16 %rs21, %rs7, 2;
    and.b16 %rs22, %rs17, 1;
    cvt.u32.u16 %r29, %rs22;
    and.b16 %rs23, %rs16, 1;
    cvt.u32.u16 %r30, %rs23;
    sub.s32 %r31, %r29, %r30;
    cvt.rn.f32.s32 %f17, %r31;
    and.b16 %rs24, %rs19, 1;
    cvt.u32.u16 %r32, %rs24;
    and.b16 %rs25, %rs18, 1;
    cvt.u32.u16 %r33, %rs25;
    sub.s32 %r34, %r32, %r33;
    cvt.rn.f32.s32 %f18, %r34;
    mul.f32 %f19, %f2, %f18;
    fma.rn.f32 %f20, %f1, %f17, %f19;
    and.b16 %rs26, %rs21, 1;
    cvt.u32.u16 %r35, %rs26;
    and.b16 %rs27, %rs20, 1;
    cvt.u32.u16 %r36, %rs27;
    sub.s32 %r37, %r35, %r36;
    cvt.rn.f32.s32 %f21, %r37;
    fma.rn.f32 %f22, %f3, %f21, %f20;
    ld.global.nc.f32 %f23, [%rd30+-4];   // input[4i + 1]
    fma.rn.f32 %f24, %f23, %f22, %f16;

    // ---- field 2: bit4 - bit5 ----
    shr.u16 %rs28, %rs1, 5;
    shr.u16 %rs29, %rs1, 4;
    shr.u16 %rs30, %rs4, 5;
    shr.u16 %rs31, %rs4, 4;
    shr.u16 %rs32, %rs7, 5;
    shr.u16 %rs33, %rs7, 4;
    and.b16 %rs34, %rs29, 1;
    cvt.u32.u16 %r38, %rs34;
    and.b16 %rs35, %rs28, 1;
    cvt.u32.u16 %r39, %rs35;
    sub.s32 %r40, %r38, %r39;
    cvt.rn.f32.s32 %f25, %r40;
    and.b16 %rs36, %rs31, 1;
    cvt.u32.u16 %r41, %rs36;
    and.b16 %rs37, %rs30, 1;
    cvt.u32.u16 %r42, %rs37;
    sub.s32 %r43, %r41, %r42;
    cvt.rn.f32.s32 %f26, %r43;
    mul.f32 %f27, %f2, %f26;
    fma.rn.f32 %f28, %f1, %f25, %f27;
    and.b16 %rs38, %rs33, 1;
    cvt.u32.u16 %r44, %rs38;
    and.b16 %rs39, %rs32, 1;
    cvt.u32.u16 %r45, %rs39;
    sub.s32 %r46, %r44, %r45;
    cvt.rn.f32.s32 %f29, %r46;
    fma.rn.f32 %f30, %f3, %f29, %f28;
    ld.global.nc.f32 %f31, [%rd30];      // input[4i + 2]
    fma.rn.f32 %f32, %f31, %f30, %f24;

    // ---- field 3: bit6 - bit7 ----
    // The byte was zero-extended by ld.u8, so ">> 7" already isolates bit 7.
    shr.u16 %rs40, %rs1, 6;
    shr.u16 %rs41, %rs4, 6;
    shr.u16 %rs42, %rs7, 6;
    and.b16 %rs43, %rs40, 1;
    cvt.u32.u16 %r47, %rs43;
    shr.u16 %rs44, %rs1, 7;
    cvt.u32.u16 %r48, %rs44;
    sub.s32 %r49, %r47, %r48;
    cvt.rn.f32.s32 %f33, %r49;
    and.b16 %rs45, %rs41, 1;
    cvt.u32.u16 %r50, %rs45;
    shr.u16 %rs46, %rs4, 7;
    cvt.u32.u16 %r51, %rs46;
    sub.s32 %r52, %r50, %r51;
    cvt.rn.f32.s32 %f34, %r52;
    mul.f32 %f35, %f2, %f34;
    fma.rn.f32 %f36, %f1, %f33, %f35;
    and.b16 %rs47, %rs42, 1;
    cvt.u32.u16 %r53, %rs47;
    shr.u16 %rs48, %rs7, 7;
    cvt.u32.u16 %r54, %rs48;
    sub.s32 %r55, %r53, %r54;
    cvt.rn.f32.s32 %f37, %r55;
    fma.rn.f32 %f38, %f3, %f37, %f36;
    ld.global.nc.f32 %f39, [%rd30+4];    // input[4i + 3]
    fma.rn.f32 %f41, %f39, %f38, %f32;   // acc carried into next iteration

    // Advance: one byte per plane, four f32 inputs (16 bytes).
    add.s64 %rd33, %rd33, 1;
    add.s64 %rd32, %rd32, 1;
    add.s64 %rd31, %rd31, 1;
    add.s64 %rd30, %rd30, 16;
    add.s32 %r57, %r57, 1;
    setp.lt.s32 %p5, %r57, %r3;          // loop while i < K/4
    @%p5 bra $L__BB0_3;

$L__BB0_4:
    // output[row*N + col] = acc, with the index formed in 64-bit.
    mul.wide.s32 %rd2, %r2, %r7;         // row * N
    cvt.s64.s32 %rd3, %r1;               // col
    add.s64 %rd4, %rd2, %rd3;
    cvta.to.global.u64 %rd27, %rd16;
    shl.b64 %rd28, %rd4, 2;              // * sizeof(f32)
    add.s64 %rd29, %rd27, %rd28;
    st.global.f32 [%rd29], %f41;

$L__BB0_5:
    ret;
}