// cortex_rust 0.6.0
//
// High-performance LLM inference with 4-bit quantization and Test-Time Training (TTT)
// Documentation
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-33961263
// Cuda compilation tools, release 12.4, V12.4.99
// Based on NVVM 7.0.1
//

.version 8.4
.target sm_89
.address_size 64

	// .globl	adaptive_gemm_n3_kernel_f32

.visible .entry adaptive_gemm_n3_kernel_f32(
	.param .u64 adaptive_gemm_n3_kernel_f32_param_0,
	.param .u64 adaptive_gemm_n3_kernel_f32_param_1,
	.param .u64 adaptive_gemm_n3_kernel_f32_param_2,
	.param .u64 adaptive_gemm_n3_kernel_f32_param_3,
	.param .u32 adaptive_gemm_n3_kernel_f32_param_4,
	.param .u32 adaptive_gemm_n3_kernel_f32_param_5,
	.param .u32 adaptive_gemm_n3_kernel_f32_param_6
)
{
	.reg .pred 	%p<6>;
	.reg .b16 	%rs<49>;
	.reg .f32 	%f<42>;
	.reg .b32 	%r<58>;
	.reg .b64 	%rd<34>;


	ld.param.u64 	%rd13, [adaptive_gemm_n3_kernel_f32_param_0];
	ld.param.u64 	%rd14, [adaptive_gemm_n3_kernel_f32_param_1];
	ld.param.u64 	%rd15, [adaptive_gemm_n3_kernel_f32_param_2];
	ld.param.u64 	%rd16, [adaptive_gemm_n3_kernel_f32_param_3];
	ld.param.u32 	%r8, [adaptive_gemm_n3_kernel_f32_param_4];
	ld.param.u32 	%r6, [adaptive_gemm_n3_kernel_f32_param_5];
	ld.param.u32 	%r7, [adaptive_gemm_n3_kernel_f32_param_6];
	mov.u32 	%r9, %ntid.x;
	mov.u32 	%r10, %ctaid.x;
	mov.u32 	%r11, %tid.x;
	mad.lo.s32 	%r1, %r10, %r9, %r11;
	setp.ge.s32 	%p1, %r1, %r7;
	mov.u32 	%r2, %ctaid.y;
	setp.ge.s32 	%p2, %r2, %r8;
	or.pred  	%p3, %p2, %p1;
	@%p3 bra 	$L__BB0_5;

	shr.s32 	%r12, %r6, 31;
	shr.u32 	%r13, %r12, 30;
	add.s32 	%r14, %r6, %r13;
	shr.s32 	%r3, %r14, 2;
	setp.lt.s32 	%p4, %r6, 4;
	mov.f32 	%f41, 0f00000000;
	@%p4 bra 	$L__BB0_4;

	cvta.to.global.u64 	%rd17, %rd14;
	cvta.to.global.u64 	%rd18, %rd15;
	mul.lo.s32 	%r16, %r3, %r7;
	mul.lo.s32 	%r17, %r3, %r1;
	cvt.s64.s32 	%rd19, %r17;
	ld.global.nc.f32 	%f1, [%rd18];
	ld.global.nc.f32 	%f2, [%rd18+4];
	ld.global.nc.f32 	%f3, [%rd18+8];
	shl.b32 	%r18, %r16, 1;
	cvt.s64.s32 	%rd20, %r18;
	add.s64 	%rd21, %rd20, %rd19;
	add.s64 	%rd33, %rd17, %rd21;
	cvt.s64.s32 	%rd22, %r16;
	add.s64 	%rd23, %rd22, %rd19;
	add.s64 	%rd32, %rd17, %rd23;
	add.s64 	%rd31, %rd17, %rd19;
	mul.lo.s32 	%r19, %r2, %r6;
	cvta.to.global.u64 	%rd24, %rd13;
	mul.wide.s32 	%rd25, %r19, 4;
	add.s64 	%rd26, %rd24, %rd25;
	add.s64 	%rd30, %rd26, 8;
	mov.f32 	%f41, 0f00000000;
	mov.u32 	%r57, 0;

$L__BB0_3:
	ld.global.nc.u8 	%rs1, [%rd31];
	and.b16  	%rs3, %rs1, 2;
	ld.global.nc.u8 	%rs4, [%rd32];
	and.b16  	%rs6, %rs4, 2;
	ld.global.nc.u8 	%rs7, [%rd33];
	and.b16  	%rs9, %rs7, 2;
	and.b16  	%rs10, %rs1, 1;
	cvt.u32.u16 	%r20, %rs10;
	shr.u16 	%rs11, %rs3, 1;
	cvt.u32.u16 	%r21, %rs11;
	sub.s32 	%r22, %r20, %r21;
	cvt.rn.f32.s32 	%f9, %r22;
	and.b16  	%rs12, %rs4, 1;
	cvt.u32.u16 	%r23, %rs12;
	shr.u16 	%rs13, %rs6, 1;
	cvt.u32.u16 	%r24, %rs13;
	sub.s32 	%r25, %r23, %r24;
	cvt.rn.f32.s32 	%f10, %r25;
	mul.f32 	%f11, %f2, %f10;
	fma.rn.f32 	%f12, %f1, %f9, %f11;
	and.b16  	%rs14, %rs7, 1;
	cvt.u32.u16 	%r26, %rs14;
	shr.u16 	%rs15, %rs9, 1;
	cvt.u32.u16 	%r27, %rs15;
	sub.s32 	%r28, %r26, %r27;
	cvt.rn.f32.s32 	%f13, %r28;
	fma.rn.f32 	%f14, %f3, %f13, %f12;
	ld.global.nc.f32 	%f15, [%rd30+-8];
	fma.rn.f32 	%f16, %f15, %f14, %f41;
	shr.u16 	%rs16, %rs1, 3;
	shr.u16 	%rs17, %rs1, 2;
	shr.u16 	%rs18, %rs4, 3;
	shr.u16 	%rs19, %rs4, 2;
	shr.u16 	%rs20, %rs7, 3;
	shr.u16 	%rs21, %rs7, 2;
	and.b16  	%rs22, %rs17, 1;
	cvt.u32.u16 	%r29, %rs22;
	and.b16  	%rs23, %rs16, 1;
	cvt.u32.u16 	%r30, %rs23;
	sub.s32 	%r31, %r29, %r30;
	cvt.rn.f32.s32 	%f17, %r31;
	and.b16  	%rs24, %rs19, 1;
	cvt.u32.u16 	%r32, %rs24;
	and.b16  	%rs25, %rs18, 1;
	cvt.u32.u16 	%r33, %rs25;
	sub.s32 	%r34, %r32, %r33;
	cvt.rn.f32.s32 	%f18, %r34;
	mul.f32 	%f19, %f2, %f18;
	fma.rn.f32 	%f20, %f1, %f17, %f19;
	and.b16  	%rs26, %rs21, 1;
	cvt.u32.u16 	%r35, %rs26;
	and.b16  	%rs27, %rs20, 1;
	cvt.u32.u16 	%r36, %rs27;
	sub.s32 	%r37, %r35, %r36;
	cvt.rn.f32.s32 	%f21, %r37;
	fma.rn.f32 	%f22, %f3, %f21, %f20;
	ld.global.nc.f32 	%f23, [%rd30+-4];
	fma.rn.f32 	%f24, %f23, %f22, %f16;
	shr.u16 	%rs28, %rs1, 5;
	shr.u16 	%rs29, %rs1, 4;
	shr.u16 	%rs30, %rs4, 5;
	shr.u16 	%rs31, %rs4, 4;
	shr.u16 	%rs32, %rs7, 5;
	shr.u16 	%rs33, %rs7, 4;
	and.b16  	%rs34, %rs29, 1;
	cvt.u32.u16 	%r38, %rs34;
	and.b16  	%rs35, %rs28, 1;
	cvt.u32.u16 	%r39, %rs35;
	sub.s32 	%r40, %r38, %r39;
	cvt.rn.f32.s32 	%f25, %r40;
	and.b16  	%rs36, %rs31, 1;
	cvt.u32.u16 	%r41, %rs36;
	and.b16  	%rs37, %rs30, 1;
	cvt.u32.u16 	%r42, %rs37;
	sub.s32 	%r43, %r41, %r42;
	cvt.rn.f32.s32 	%f26, %r43;
	mul.f32 	%f27, %f2, %f26;
	fma.rn.f32 	%f28, %f1, %f25, %f27;
	and.b16  	%rs38, %rs33, 1;
	cvt.u32.u16 	%r44, %rs38;
	and.b16  	%rs39, %rs32, 1;
	cvt.u32.u16 	%r45, %rs39;
	sub.s32 	%r46, %r44, %r45;
	cvt.rn.f32.s32 	%f29, %r46;
	fma.rn.f32 	%f30, %f3, %f29, %f28;
	ld.global.nc.f32 	%f31, [%rd30];
	fma.rn.f32 	%f32, %f31, %f30, %f24;
	shr.u16 	%rs40, %rs1, 6;
	shr.u16 	%rs41, %rs4, 6;
	shr.u16 	%rs42, %rs7, 6;
	and.b16  	%rs43, %rs40, 1;
	cvt.u32.u16 	%r47, %rs43;
	shr.u16 	%rs44, %rs1, 7;
	cvt.u32.u16 	%r48, %rs44;
	sub.s32 	%r49, %r47, %r48;
	cvt.rn.f32.s32 	%f33, %r49;
	and.b16  	%rs45, %rs41, 1;
	cvt.u32.u16 	%r50, %rs45;
	shr.u16 	%rs46, %rs4, 7;
	cvt.u32.u16 	%r51, %rs46;
	sub.s32 	%r52, %r50, %r51;
	cvt.rn.f32.s32 	%f34, %r52;
	mul.f32 	%f35, %f2, %f34;
	fma.rn.f32 	%f36, %f1, %f33, %f35;
	and.b16  	%rs47, %rs42, 1;
	cvt.u32.u16 	%r53, %rs47;
	shr.u16 	%rs48, %rs7, 7;
	cvt.u32.u16 	%r54, %rs48;
	sub.s32 	%r55, %r53, %r54;
	cvt.rn.f32.s32 	%f37, %r55;
	fma.rn.f32 	%f38, %f3, %f37, %f36;
	ld.global.nc.f32 	%f39, [%rd30+4];
	fma.rn.f32 	%f41, %f39, %f38, %f32;
	add.s64 	%rd33, %rd33, 1;
	add.s64 	%rd32, %rd32, 1;
	add.s64 	%rd31, %rd31, 1;
	add.s64 	%rd30, %rd30, 16;
	add.s32 	%r57, %r57, 1;
	setp.lt.s32 	%p5, %r57, %r3;
	@%p5 bra 	$L__BB0_3;

$L__BB0_4:
	mad.lo.s32 	%r56, %r2, %r7, %r1;
	cvta.to.global.u64 	%rd27, %rd16;
	mul.wide.s32 	%rd28, %r56, 4;
	add.s64 	%rd29, %rd27, %rd28;
	st.global.f32 	[%rd29], %f41;

$L__BB0_5:
	ret;

}