//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-33961263
// Cuda compilation tools, release 12.4, V12.4.99
// Based on NVVM 7.0.1
//
.version 8.4
.target sm_89
.address_size 64
// .globl rms_norm_kernel_f32
// _ZZ19rms_norm_kernel_f32E10shared_sum has been demoted
// _ZZ18softmax_kernel_f32E10shared_max has been demoted
// _ZZ18softmax_kernel_f32E10shared_sum has been demoted
//-----------------------------------------------------------------------
// rms_norm_kernel_f32 — RMS normalization of a matrix, one CTA per row.
// Effective C signature (inferred from the loads/stores below):
//   rms_norm_kernel_f32(const float *in,      // param_0 -> %rd1 (read-only)
//                       const float *weight,  // param_1 -> %rd4, length dim
//                       float       *out,     // param_2 -> %rd3
//                       int          dim,     // param_3 -> %r13, row length
//                       float        eps,     // param_4 -> %f5
//                       int          nrows)   // param_5 -> %r14, CTA bound
// Algorithm: each thread accumulates a block-stride partial sum of squares
// over its row, the partials are tree-reduced in shared memory, then
// out[i] = in[i] * rsqrt(sum/dim + eps) * weight[i].
// shared_sum is 1024 bytes = 256 floats indexed by tid*4, so the kernel
// assumes blockDim.x <= 256 and a power-of-two blockDim.x for the halving
// reduction — NOTE(review): confirm at the launch site.
//-----------------------------------------------------------------------
.visible .entry rms_norm_kernel_f32(
.param .u64 rms_norm_kernel_f32_param_0,
.param .u64 rms_norm_kernel_f32_param_1,
.param .u64 rms_norm_kernel_f32_param_2,
.param .u32 rms_norm_kernel_f32_param_3,
.param .f32 rms_norm_kernel_f32_param_4,
.param .u32 rms_norm_kernel_f32_param_5
)
{
.reg .pred %p<9>;
.reg .f32 %f<22>;
.reg .b32 %r<23>;
.reg .b64 %rd<19>;
// demoted variable (function-scope __shared__ array hoisted here by NVVM)
.shared .align 4 .b8 _ZZ19rms_norm_kernel_f32E10shared_sum[1024];
ld.param.u64 %rd7, [rms_norm_kernel_f32_param_0];
ld.param.u64 %rd5, [rms_norm_kernel_f32_param_1];
ld.param.u64 %rd6, [rms_norm_kernel_f32_param_2];
ld.param.u32 %r13, [rms_norm_kernel_f32_param_3];
ld.param.f32 %f5, [rms_norm_kernel_f32_param_4];
ld.param.u32 %r14, [rms_norm_kernel_f32_param_5];
cvta.to.global.u64 %rd1, %rd7;     // %rd1 = global pointer to input
mov.u32 %r1, %ctaid.x;             // %r1 = row index (one block per row)
setp.ge.s32 %p1, %r1, %r14;
@%p1 bra $L__BB0_11;               // row >= nrows: nothing to do
mul.lo.s32 %r15, %r1, %r13;
cvt.s64.s32 %rd2, %r15;            // %rd2 = row * dim (element offset of this row)
mov.u32 %r22, %tid.x;
mov.f32 %f21, 0f00000000;          // thread-local sum of squares = 0.0f
setp.lt.s32 %p2, %r22, %r13;
@%p2 bra $L__BB0_2;
bra.uni $L__BB0_4;                 // tid >= dim: skip accumulation, partial stays 0
$L__BB0_2:
mov.u32 %r3, %ntid.x;              // stride = blockDim.x
mov.u32 %r20, %r22;                // %r20 = current column, starts at tid
// --- Phase 1: block-stride accumulation of sum(x^2) over the row ---
$L__BB0_3:
cvt.s64.s32 %rd8, %r20;
add.s64 %rd9, %rd8, %rd2;
shl.b64 %rd10, %rd9, 2;            // byte offset = (row*dim + col) * sizeof(float)
add.s64 %rd11, %rd1, %rd10;
ld.global.nc.f32 %f8, [%rd11];     // x = in[row*dim + col] (non-coherent/read-only path)
fma.rn.f32 %f21, %f8, %f8, %f21;   // sum += x*x
add.s32 %r20, %r20, %r3;           // col += blockDim.x
setp.lt.s32 %p3, %r20, %r13;
@%p3 bra $L__BB0_3;
// --- Phase 2: shared-memory tree reduction of the partial sums ---
$L__BB0_4:
shl.b32 %r16, %r22, 2;
mov.u32 %r17, _ZZ19rms_norm_kernel_f32E10shared_sum;
add.s32 %r6, %r17, %r16;           // %r6 = &shared_sum[tid]
st.shared.f32 [%r6], %f21;         // publish this thread's partial sum
bar.sync 0;
mov.u32 %r7, %ntid.x;
shr.u32 %r21, %r7, 1;              // s = blockDim.x / 2
setp.eq.s32 %p4, %r21, 0;
@%p4 bra $L__BB0_8;                // single-thread block: reduction already done
$L__BB0_5:
setp.ge.u32 %p5, %r22, %r21;
@%p5 bra $L__BB0_7;                // only threads with tid < s add
shl.b32 %r18, %r21, 2;
add.s32 %r19, %r6, %r18;           // partner = &shared_sum[tid + s]
ld.shared.f32 %f9, [%r6];
ld.shared.f32 %f10, [%r19];
add.f32 %f11, %f10, %f9;
st.shared.f32 [%r6], %f11;         // shared_sum[tid] += shared_sum[tid + s]
$L__BB0_7:
bar.sync 0;                        // every thread syncs each halving step
shr.u32 %r21, %r21, 1;             // s /= 2
setp.ne.s32 %p6, %r21, 0;
@%p6 bra $L__BB0_5;
// --- Phase 3: out = in * rsqrt(mean + eps) * weight ---
$L__BB0_8:
setp.ge.s32 %p7, %r22, %r13;
@%p7 bra $L__BB0_11;               // tid >= dim: no output elements to write
ld.shared.f32 %f12, [_ZZ19rms_norm_kernel_f32E10shared_sum];  // total = shared_sum[0]
cvta.to.global.u64 %rd3, %rd6;     // %rd3 = output pointer
cvta.to.global.u64 %rd4, %rd5;     // %rd4 = weight pointer
cvt.rn.f32.s32 %f13, %r13;
div.rn.f32 %f14, %f12, %f13;       // mean = total / dim
add.f32 %f15, %f14, %f5;           // mean + eps
rsqrt.approx.f32 %f4, %f15;        // scale = 1/sqrt(mean + eps), approximate
$L__BB0_10:
cvt.s64.s32 %rd12, %r22;           // %r22 still holds the thread's column (untouched above)
add.s64 %rd13, %rd12, %rd2;
shl.b64 %rd14, %rd13, 2;
add.s64 %rd15, %rd1, %rd14;
ld.global.nc.f32 %f16, [%rd15];    // x = in[row*dim + col]
mul.f32 %f17, %f4, %f16;           // x * scale
mul.wide.s32 %rd16, %r22, 4;
add.s64 %rd17, %rd4, %rd16;
ld.global.nc.f32 %f18, [%rd17];    // w = weight[col] (same weight for every row)
mul.f32 %f19, %f17, %f18;
add.s64 %rd18, %rd3, %rd14;
st.global.f32 [%rd18], %f19;       // out[row*dim + col] = x * scale * w
add.s32 %r22, %r22, %r7;           // col += blockDim.x
setp.lt.s32 %p8, %r22, %r13;
@%p8 bra $L__BB0_10;
$L__BB0_11:
ret;
}
// .globl softmax_kernel_f32
//-----------------------------------------------------------------------
// softmax_kernel_f32 — numerically stable row-wise softmax, one CTA per row.
// Effective C signature (inferred from the loads/stores below):
//   softmax_kernel_f32(const float *in,   // param_0 -> %rd1 (read-only)
//                      float       *out,  // param_1 -> %rd2 (written, then re-read)
//                      int          cols, // param_2 -> %r18, row length
//                      int          rows) // param_3 -> %r19, CTA bound
// Three phases:
//   1. block-stride max over the row, tree-reduced in shared_max
//   2. out[i] = exp(in[i] - rowmax), partial sums tree-reduced in shared_sum
//   3. out[i] *= 1/sum (in place)
// Both shared arrays are 1024 bytes = 256 floats indexed by tid*4, so the
// kernel assumes blockDim.x <= 256 and a power-of-two blockDim.x for the
// halving reductions — NOTE(review): confirm at the launch site.
//-----------------------------------------------------------------------
.visible .entry softmax_kernel_f32(
.param .u64 softmax_kernel_f32_param_0,
.param .u64 softmax_kernel_f32_param_1,
.param .u32 softmax_kernel_f32_param_2,
.param .u32 softmax_kernel_f32_param_3
)
{
.reg .pred %p<14>;
.reg .f32 %f<45>;
.reg .b32 %r<36>;
.reg .b64 %rd<19>;
// demoted variable (function-scope __shared__ array hoisted here by NVVM)
.shared .align 4 .b8 _ZZ18softmax_kernel_f32E10shared_max[1024];
// demoted variable
.shared .align 4 .b8 _ZZ18softmax_kernel_f32E10shared_sum[1024];
ld.param.u64 %rd4, [softmax_kernel_f32_param_0];
ld.param.u64 %rd5, [softmax_kernel_f32_param_1];
ld.param.u32 %r18, [softmax_kernel_f32_param_2];
ld.param.u32 %r19, [softmax_kernel_f32_param_3];
cvta.to.global.u64 %rd1, %rd4;     // %rd1 = input pointer
cvta.to.global.u64 %rd2, %rd5;     // %rd2 = output pointer
mov.u32 %r1, %ctaid.x;             // %r1 = row index (one block per row)
setp.ge.s32 %p1, %r1, %r19;
@%p1 bra $L__BB1_19;               // row >= rows: nothing to do
mul.lo.s32 %r20, %r1, %r18;
cvt.s64.s32 %rd3, %r20;            // %rd3 = row * cols (element offset of this row)
mov.u32 %r35, %tid.x;
mov.f32 %f42, 0fFF7FFFFF;          // running max = -FLT_MAX
setp.lt.s32 %p2, %r35, %r18;
@%p2 bra $L__BB1_2;
bra.uni $L__BB1_4;                 // tid >= cols: skip, max stays -FLT_MAX
$L__BB1_2:
mov.u32 %r3, %ntid.x;              // stride = blockDim.x
mov.u32 %r31, %r35;                // %r31 = current column, starts at tid
// --- Phase 1: block-stride max over the row ---
$L__BB1_3:
cvt.s64.s32 %rd6, %r31;
add.s64 %rd7, %rd6, %rd3;
shl.b64 %rd8, %rd7, 2;
add.s64 %rd9, %rd1, %rd8;
ld.global.nc.f32 %f11, [%rd9];     // x = in[row*cols + col]
max.f32 %f42, %f42, %f11;          // running max = max(running max, x)
add.s32 %r31, %r31, %r3;
setp.lt.s32 %p3, %r31, %r18;
@%p3 bra $L__BB1_3;
// --- Shared-memory tree reduction of per-thread maxima ---
$L__BB1_4:
shl.b32 %r21, %r35, 2;
mov.u32 %r22, _ZZ18softmax_kernel_f32E10shared_max;
add.s32 %r6, %r22, %r21;           // %r6 = &shared_max[tid]
st.shared.f32 [%r6], %f42;
bar.sync 0;
mov.u32 %r7, %ntid.x;
shr.u32 %r34, %r7, 1;              // s = blockDim.x / 2 (%r34 reused for the sum reduction later)
setp.eq.s32 %p4, %r34, 0;          // %p4 = "blockDim.x == 1", reused at the second reduction
@%p4 bra $L__BB1_9;
mov.u32 %r32, %r34;
$L__BB1_6:
setp.ge.u32 %p5, %r35, %r32;
@%p5 bra $L__BB1_8;                // only threads with tid < s combine
ld.shared.f32 %f12, [%r6];
shl.b32 %r23, %r32, 2;
add.s32 %r24, %r6, %r23;           // partner = &shared_max[tid + s]
ld.shared.f32 %f13, [%r24];
max.f32 %f14, %f12, %f13;
st.shared.f32 [%r6], %f14;         // shared_max[tid] = max of the pair
$L__BB1_8:
bar.sync 0;
shr.u32 %r32, %r32, 1;             // s /= 2
setp.ne.s32 %p6, %r32, 0;
@%p6 bra $L__BB1_6;
// --- Phase 2: out[i] = exp(in[i] - rowmax); accumulate partial sum ---
$L__BB1_9:
setp.ge.s32 %p7, %r35, %r18;       // %p7 = "tid >= cols", reused before Phase 3
mov.f32 %f44, 0f00000000;          // partial sum of exponentials = 0.0f
@%p7 bra $L__BB1_12;
mov.f32 %f44, 0f00000000;
ld.shared.f32 %f4, [_ZZ18softmax_kernel_f32E10shared_max];  // rowmax = shared_max[0]
mov.u32 %r33, %r35;
$L__BB1_11:
cvt.s64.s32 %rd10, %r33;
add.s64 %rd11, %rd10, %rd3;
shl.b64 %rd12, %rd11, 2;
add.s64 %rd13, %rd1, %rd12;
ld.global.nc.f32 %f17, [%rd13];    // x = in[row*cols + col]
sub.f32 %f18, %f17, %f4;           // t = x - rowmax (keeps exp from overflowing)
// ---- inlined expf(t) expansion (compiler-generated): exp(t) = 2^j * 2^r ----
mov.f32 %f19, 0f3F000000;          // 0.5f
mov.f32 %f20, 0f3BBB989D;          // small slope used to form a saturating range clamp
fma.rn.f32 %f21, %f18, %f20, %f19;
cvt.sat.f32.f32 %f22, %f21;        // clamp to [0,1] — limits j to the representable range
mov.f32 %f23, 0f4B400001;          // 12582913.0f: add-big-constant round-to-integer trick
mov.f32 %f24, 0f437C0000;          // 252.0f
fma.rm.f32 %f25, %f22, %f24, %f23; // %f25 low bits now hold the integer scale j
add.f32 %f26, %f25, 0fCB40007F;    // recover j as a float
neg.f32 %f27, %f26;
mov.f32 %f28, 0f3FB8AA3B;          // log2(e) ~= 1.4426950f (high part)
fma.rn.f32 %f29, %f18, %f28, %f27; // r = t*log2(e) - j
mov.f32 %f30, 0f32A57060;          // ~1.93e-8 — presumably the low-order bits of log2(e)
fma.rn.f32 %f31, %f18, %f30, %f29;
mov.b32 %r25, %f25;
shl.b32 %r26, %r25, 23;            // construct 2^j by placing j in the FP32 exponent field
mov.b32 %f32, %r26;
ex2.approx.ftz.f32 %f33, %f31;     // 2^r
mul.f32 %f34, %f33, %f32;          // e = 2^j * 2^r = exp(t)
// ---- end expf expansion ----
add.s64 %rd14, %rd2, %rd12;
st.global.f32 [%rd14], %f34;       // out[row*cols + col] = exp(x - rowmax)
add.f32 %f44, %f44, %f34;          // partial sum += e
add.s32 %r33, %r33, %r7;           // col += blockDim.x
setp.lt.s32 %p8, %r33, %r18;
@%p8 bra $L__BB1_11;
// --- Shared-memory tree reduction of the partial sums ---
$L__BB1_12:
mov.u32 %r28, _ZZ18softmax_kernel_f32E10shared_sum;
add.s32 %r13, %r28, %r21;          // %r13 = &shared_sum[tid]
st.shared.f32 [%r13], %f44;
bar.sync 0;
@%p4 bra $L__BB1_16;               // reuses %p4: single-thread block skips the reduction
$L__BB1_13:
setp.ge.u32 %p10, %r35, %r34;
@%p10 bra $L__BB1_15;
shl.b32 %r29, %r34, 2;
add.s32 %r30, %r13, %r29;          // partner = &shared_sum[tid + s]
ld.shared.f32 %f35, [%r13];
ld.shared.f32 %f36, [%r30];
add.f32 %f37, %f36, %f35;
st.shared.f32 [%r13], %f37;        // shared_sum[tid] += shared_sum[tid + s]
$L__BB1_15:
bar.sync 0;
shr.u32 %r34, %r34, 1;             // s /= 2
setp.ne.s32 %p11, %r34, 0;
@%p11 bra $L__BB1_13;
// --- Phase 3: normalize in place: out[i] *= 1/sum ---
$L__BB1_16:
@%p7 bra $L__BB1_19;               // reuses %p7: tid >= cols writes nothing
ld.shared.f32 %f38, [_ZZ18softmax_kernel_f32E10shared_sum];  // total = shared_sum[0]
rcp.rn.f32 %f8, %f38;              // inv = 1/total (round-to-nearest reciprocal)
$L__BB1_18:
cvt.s64.s32 %rd15, %r35;           // %r35 still holds the thread's column here
add.s64 %rd16, %rd15, %rd3;
shl.b64 %rd17, %rd16, 2;
add.s64 %rd18, %rd2, %rd17;
ld.global.f32 %f39, [%rd18];       // e = out[row*cols + col] written in Phase 2
mul.f32 %f40, %f8, %f39;
st.global.f32 [%rd18], %f40;       // out = e / sum
add.s32 %r35, %r35, %r7;           // col += blockDim.x
setp.lt.s32 %p13, %r35, %r18;
@%p13 bra $L__BB1_18;
$L__BB1_19:
ret;
}
// .globl rope_kernel_f32
//-----------------------------------------------------------------------
// rope_kernel_f32 — rotary position embedding, one thread per element pair.
// Effective C signature (inferred from the loads/stores below):
//   rope_kernel_f32(float       *t0,     // param_0 -> %rd1, rotated in place
//                   float       *t1,     // param_1 -> %rd2, rotated in place
//                   const float *tab_c,  // param_2 -> %rd5, coefficient table (read-only)
//                   const float *tab_s,  // param_3 -> %rd8, coefficient table (read-only)
//                   int p4, int p5,      // params 4,5 -> %r6, %r7
//                   int p6,              // param_6 -> %r3 (index wraps modulo this)
//                   int dim,             // param_7 -> %r4, row stride; pairs span dim/2
//                   int pos_off)         // param_8 -> %r5, added to the wrapped index
// Each thread rotates the pair (v[i], v[i + dim/2]) in BOTH t0 and t1:
//   v[i]       = c*v[i]       - s*v[i+dim/2]
//   v[i+dim/2] = s*v[i]_old   + c*v[i+dim/2]_old
// where c = tab_c[(idx%p6 + pos_off)*dim + pair] and s = tab_s[same index].
// NOTE(review): t0/t1 are presumably query/key tensors and tab_c/tab_s the
// precomputed cos/sin tables; p4*p5*p6 presumably counts rows — confirm
// against the host-side launcher.
//-----------------------------------------------------------------------
.visible .entry rope_kernel_f32(
.param .u64 rope_kernel_f32_param_0,
.param .u64 rope_kernel_f32_param_1,
.param .u64 rope_kernel_f32_param_2,
.param .u64 rope_kernel_f32_param_3,
.param .u32 rope_kernel_f32_param_4,
.param .u32 rope_kernel_f32_param_5,
.param .u32 rope_kernel_f32_param_6,
.param .u32 rope_kernel_f32_param_7,
.param .u32 rope_kernel_f32_param_8
)
{
.reg .pred %p<2>;
.reg .f32 %f<17>;
.reg .b32 %r<23>;
.reg .b64 %rd<18>;
ld.param.u64 %rd1, [rope_kernel_f32_param_0];
ld.param.u64 %rd2, [rope_kernel_f32_param_1];
ld.param.u64 %rd3, [rope_kernel_f32_param_2];
ld.param.u64 %rd4, [rope_kernel_f32_param_3];
ld.param.u32 %r6, [rope_kernel_f32_param_4];
ld.param.u32 %r7, [rope_kernel_f32_param_5];
ld.param.u32 %r3, [rope_kernel_f32_param_6];
ld.param.u32 %r5, [rope_kernel_f32_param_8];
ld.param.u32 %r4, [rope_kernel_f32_param_7];
mov.u32 %r8, %ntid.x;
mov.u32 %r9, %ctaid.x;
mov.u32 %r10, %tid.x;
mad.lo.s32 %r1, %r9, %r8, %r10;    // gid = blockIdx.x*blockDim.x + threadIdx.x
mul.lo.s32 %r11, %r7, %r6;         // p5 * p4
mul.lo.s32 %r12, %r11, %r3;        // p5 * p4 * p6 (total rows)
shr.u32 %r13, %r4, 31;             // signed-div-by-2 idiom: add sign bit...
add.s32 %r14, %r4, %r13;
shr.s32 %r2, %r14, 1;              // half = dim / 2 (rounded toward zero)
mul.lo.s32 %r15, %r12, %r2;        // total element pairs to process
setp.ge.s32 %p1, %r1, %r15;
@%p1 bra $L__BB2_2;                // gid out of range: done
cvta.to.global.u64 %rd5, %rd3;     // %rd5 = tab_c (global)
div.s32 %r16, %r1, %r2;            // row  = gid / half
rem.s32 %r17, %r16, %r3;           // idx  = row % p6
add.s32 %r18, %r17, %r5;           // pos  = idx + pos_off
mul.lo.s32 %r19, %r16, %r2;
sub.s32 %r20, %r1, %r19;           // pair = gid % half
mad.lo.s32 %r21, %r18, %r4, %r20;  // table index = pos*dim + pair
mul.wide.s32 %rd6, %r21, 4;
add.s64 %rd7, %rd5, %rd6;          // &tab_c[pos*dim + pair]
cvta.to.global.u64 %rd8, %rd4;
add.s64 %rd9, %rd8, %rd6;          // &tab_s[pos*dim + pair]
mad.lo.s32 %r22, %r16, %r4, %r20;  // data index = row*dim + pair
cvta.to.global.u64 %rd10, %rd1;
mul.wide.s32 %rd11, %r22, 4;
add.s64 %rd12, %rd10, %rd11;       // &t0[row*dim + pair]        (first of the pair)
mul.wide.s32 %rd13, %r2, 4;
add.s64 %rd14, %rd12, %rd13;       // &t0[row*dim + pair + half] (second of the pair)
ld.global.f32 %f1, [%rd12];        // a = t0[i]
ld.global.nc.f32 %f2, [%rd7];      // c = tab_c[...]
mul.f32 %f3, %f2, %f1;             // c*a
ld.global.f32 %f4, [%rd14];        // b = t0[i + half]
ld.global.nc.f32 %f5, [%rd9];      // s = tab_s[...]
mul.f32 %f6, %f5, %f4;             // s*b
sub.f32 %f7, %f3, %f6;
st.global.f32 [%rd12], %f7;        // t0[i] = c*a - s*b
mul.f32 %f8, %f2, %f4;             // c*b
fma.rn.f32 %f9, %f5, %f1, %f8;     // s*a + c*b
st.global.f32 [%rd14], %f9;        // t0[i + half] = s*a + c*b
// Same rotation, same (c, s), applied to the second tensor t1.
cvta.to.global.u64 %rd15, %rd2;
add.s64 %rd16, %rd15, %rd11;       // &t1[row*dim + pair]
add.s64 %rd17, %rd16, %rd13;       // &t1[row*dim + pair + half]
ld.global.f32 %f10, [%rd16];       // a' = t1[i]
mul.f32 %f11, %f2, %f10;           // c*a'
ld.global.f32 %f12, [%rd17];       // b' = t1[i + half]
mul.f32 %f13, %f5, %f12;           // s*b'
sub.f32 %f14, %f11, %f13;
st.global.f32 [%rd16], %f14;       // t1[i] = c*a' - s*b'
mul.f32 %f15, %f2, %f12;           // c*b'
fma.rn.f32 %f16, %f5, %f10, %f15;  // s*a' + c*b'
st.global.f32 [%rd17], %f16;       // t1[i + half] = s*a' + c*b'
$L__BB2_2:
ret;
}
// .globl silu_kernel_f32
//-----------------------------------------------------------------------
// silu_kernel_f32 — elementwise SiLU (swish): out[i] = x / (1 + exp(-x)).
// Effective C signature (inferred from the loads/stores below):
//   silu_kernel_f32(const float *in,   // param_0 -> %rd3 (read-only)
//                   float       *out,  // param_1 -> %rd6
//                   int          n)    // param_2 -> %r2, element count
// One thread per element, bounds-checked against n; no loop, so the launch
// must cover all n elements.
//-----------------------------------------------------------------------
.visible .entry silu_kernel_f32(
.param .u64 silu_kernel_f32_param_0,
.param .u64 silu_kernel_f32_param_1,
.param .u32 silu_kernel_f32_param_2
)
{
.reg .pred %p<2>;
.reg .f32 %f<20>;
.reg .b32 %r<8>;
.reg .b64 %rd<8>;
ld.param.u64 %rd1, [silu_kernel_f32_param_0];
ld.param.u64 %rd2, [silu_kernel_f32_param_1];
ld.param.u32 %r2, [silu_kernel_f32_param_2];
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r3, %r4, %r5;     // gid = blockIdx.x*blockDim.x + threadIdx.x
setp.ge.s32 %p1, %r1, %r2;
@%p1 bra $L__BB3_2;                // gid >= n: nothing to do
cvta.to.global.u64 %rd3, %rd1;
mul.wide.s32 %rd4, %r1, 4;
add.s64 %rd5, %rd3, %rd4;
ld.global.nc.f32 %f1, [%rd5];      // x = in[gid]
neg.f32 %f2, %f1;                  // t = -x (we need exp(-x))
// ---- inlined expf(t) expansion (compiler-generated): exp(t) = 2^j * 2^r ----
mov.f32 %f3, 0f3F000000;           // 0.5f
mov.f32 %f4, 0f3BBB989D;           // small slope used to form a saturating range clamp
fma.rn.f32 %f5, %f2, %f4, %f3;
cvt.sat.f32.f32 %f6, %f5;          // clamp to [0,1] — limits j to the representable range
mov.f32 %f7, 0f4B400001;           // 12582913.0f: add-big-constant round-to-integer trick
mov.f32 %f8, 0f437C0000;           // 252.0f
fma.rm.f32 %f9, %f6, %f8, %f7;     // %f9 low bits now hold the integer scale j
add.f32 %f10, %f9, 0fCB40007F;     // recover j as a float
neg.f32 %f11, %f10;
mov.f32 %f12, 0f3FB8AA3B;          // log2(e) ~= 1.4426950f (high part)
fma.rn.f32 %f13, %f2, %f12, %f11;  // r = t*log2(e) - j
mov.f32 %f14, 0f32A57060;          // ~1.93e-8 — presumably the low-order bits of log2(e)
fma.rn.f32 %f15, %f2, %f14, %f13;
mov.b32 %r6, %f9;
shl.b32 %r7, %r6, 23;              // construct 2^j by placing j in the FP32 exponent field
mov.b32 %f16, %r7;
ex2.approx.ftz.f32 %f17, %f15;     // 2^r
// ---- end expf expansion ----
fma.rn.f32 %f18, %f17, %f16, 0f3F800000;  // d = exp(-x) + 1.0f
div.rn.f32 %f19, %f1, %f18;        // silu = x / (1 + exp(-x))
cvta.to.global.u64 %rd6, %rd2;
add.s64 %rd7, %rd6, %rd4;
st.global.f32 [%rd7], %f19;        // out[gid] = silu
$L__BB3_2:
ret;
}
// .globl fused_silu_mul_kernel_f32
//-----------------------------------------------------------------------
// fused_silu_mul_kernel_f32 — elementwise out[i] = silu(a[i]) * b[i],
// where silu(x) = x / (1 + exp(-x)). Fuses the SiLU gate with the
// multiply so the intermediate never touches global memory (the common
// SwiGLU feed-forward pattern — NOTE(review): confirm against callers).
// Effective C signature (inferred from the loads/stores below):
//   fused_silu_mul_kernel_f32(const float *a,    // param_0 -> %rd4 (gated input)
//                             const float *b,    // param_1 -> %rd7 (multiplier)
//                             float       *out,  // param_2 -> %rd9
//                             int          n)    // param_3 -> %r2
// One thread per element, bounds-checked against n; no loop, so the launch
// must cover all n elements.
//-----------------------------------------------------------------------
.visible .entry fused_silu_mul_kernel_f32(
.param .u64 fused_silu_mul_kernel_f32_param_0,
.param .u64 fused_silu_mul_kernel_f32_param_1,
.param .u64 fused_silu_mul_kernel_f32_param_2,
.param .u32 fused_silu_mul_kernel_f32_param_3
)
{
.reg .pred %p<2>;
.reg .f32 %f<22>;
.reg .b32 %r<8>;
.reg .b64 %rd<11>;
ld.param.u64 %rd1, [fused_silu_mul_kernel_f32_param_0];
ld.param.u64 %rd2, [fused_silu_mul_kernel_f32_param_1];
ld.param.u64 %rd3, [fused_silu_mul_kernel_f32_param_2];
ld.param.u32 %r2, [fused_silu_mul_kernel_f32_param_3];
mov.u32 %r3, %ctaid.x;
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %tid.x;
mad.lo.s32 %r1, %r3, %r4, %r5;     // gid = blockIdx.x*blockDim.x + threadIdx.x
setp.ge.s32 %p1, %r1, %r2;
@%p1 bra $L__BB4_2;                // gid >= n: nothing to do
cvta.to.global.u64 %rd4, %rd1;
mul.wide.s32 %rd5, %r1, 4;
add.s64 %rd6, %rd4, %rd5;
ld.global.nc.f32 %f1, [%rd6];      // x = a[gid]
neg.f32 %f2, %f1;                  // t = -x (we need exp(-x))
// ---- inlined expf(t) expansion (compiler-generated): exp(t) = 2^j * 2^r ----
mov.f32 %f3, 0f3F000000;           // 0.5f
mov.f32 %f4, 0f3BBB989D;           // small slope used to form a saturating range clamp
fma.rn.f32 %f5, %f2, %f4, %f3;
cvt.sat.f32.f32 %f6, %f5;          // clamp to [0,1] — limits j to the representable range
mov.f32 %f7, 0f4B400001;           // 12582913.0f: add-big-constant round-to-integer trick
mov.f32 %f8, 0f437C0000;           // 252.0f
fma.rm.f32 %f9, %f6, %f8, %f7;     // %f9 low bits now hold the integer scale j
add.f32 %f10, %f9, 0fCB40007F;     // recover j as a float
neg.f32 %f11, %f10;
mov.f32 %f12, 0f3FB8AA3B;          // log2(e) ~= 1.4426950f (high part)
fma.rn.f32 %f13, %f2, %f12, %f11;  // r = t*log2(e) - j
mov.f32 %f14, 0f32A57060;          // ~1.93e-8 — presumably the low-order bits of log2(e)
fma.rn.f32 %f15, %f2, %f14, %f13;
mov.b32 %r6, %f9;
shl.b32 %r7, %r6, 23;              // construct 2^j by placing j in the FP32 exponent field
mov.b32 %f16, %r7;
ex2.approx.ftz.f32 %f17, %f15;     // 2^r
// ---- end expf expansion ----
fma.rn.f32 %f18, %f17, %f16, 0f3F800000;  // d = exp(-x) + 1.0f
div.rn.f32 %f19, %f1, %f18;        // silu = x / (1 + exp(-x))
cvta.to.global.u64 %rd7, %rd2;
add.s64 %rd8, %rd7, %rd5;
ld.global.nc.f32 %f20, [%rd8];     // g = b[gid]
mul.f32 %f21, %f20, %f19;          // result = g * silu
cvta.to.global.u64 %rd9, %rd3;
add.s64 %rd10, %rd9, %rd5;
st.global.f32 [%rd10], %f21;       // out[gid] = result
$L__BB4_2:
ret;
}