//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-33961263
// Cuda compilation tools, release 12.4, V12.4.99
// Based on NVVM 7.0.1
//
// PTX ISA version the module is written against.
.version 8.4
// Target architecture for this module.
.target sm_89
// Addresses are 64 bits wide.
.address_size 64
// .globl reshape_and_cache_kernel_f32
// Two unsized, 16-byte-aligned byte arrays in the .shared state space. Their
// extent is not fixed anywhere in this file. Within this file:
//   shared_mem - referenced only by paged_attention_v1_kernel_f32
//   shared     - referenced only by attention_kernel_f32
// Both kernels issue ld.shared.v4.f32 at multiples of 16 bytes from the array
// base, which relies on the .align 16 given here.
.extern .shared .align 16 .b8 shared_mem[];
.extern .shared .align 16 .b8 shared[];
// reshape_and_cache_kernel_f32
//
// One CTA per %ctaid.x ("token"). The CTA reads a signed 64-bit slot number
// from param_4[%ctaid.x], splits it into
//     block_idx = slot / param_8        block_off = slot % param_8
// and, for every i in [0, n) with n = param_6 * param_7 (threads stride by
// %ntid.x), copies
//     param_2[(block_idx*n + i)*param_8 + block_off] = param_0[%ctaid.x*n + i]
//     param_3[(block_idx*n + i)*param_8 + block_off] = param_1[%ctaid.x*n + i]
//
// Parameters (roles as used below; quoted names are guesses - TODO confirm
// against the CUDA source):
//   param_0  f32*  read-only source            ("key")
//   param_1  f32*  read-only source            ("value")
//   param_2  f32*  destination for param_0     ("key_cache")
//   param_3  f32*  destination for param_1     ("value_cache")
//   param_4  s64*  one slot number per %ctaid.x; negative => CTA does nothing
//   param_5  s32   exclusive bound on %ctaid.x ("num_tokens")
//   param_6  s32   \ n = param_6 * param_7 elements are copied per CTA
//   param_7  s32   / ("num_heads", "head_size")
//   param_8  s32   divisor for the slot number ("block_size")
//
// Change from the compiler-emitted version: the element indices are now
// formed in 64-bit registers. Previously they were computed with 32-bit
// mul/mad (and the 64-bit block_idx was truncated to 32 bits) before being
// widened, so any tensor with 2^31 or more elements produced a wrapped,
// possibly negative, offset. Results are identical whenever the old 32-bit
// index did not overflow.
//
// NOTE(review): param_8 == 0 is not checked before the divisions below -
// confirm the host never passes 0.
.visible .entry reshape_and_cache_kernel_f32(
.param .u64 reshape_and_cache_kernel_f32_param_0,
.param .u64 reshape_and_cache_kernel_f32_param_1,
.param .u64 reshape_and_cache_kernel_f32_param_2,
.param .u64 reshape_and_cache_kernel_f32_param_3,
.param .u64 reshape_and_cache_kernel_f32_param_4,
.param .u32 reshape_and_cache_kernel_f32_param_5,
.param .u32 reshape_and_cache_kernel_f32_param_6,
.param .u32 reshape_and_cache_kernel_f32_param_7,
.param .u32 reshape_and_cache_kernel_f32_param_8
)
{
.reg .pred %p<6>;
.reg .f32 %f<3>;
.reg .b32 %r<24>;
.reg .b64 %rd<39>;
ld.param.u64 %rd13, [reshape_and_cache_kernel_f32_param_0];
ld.param.u64 %rd14, [reshape_and_cache_kernel_f32_param_1];
ld.param.u64 %rd15, [reshape_and_cache_kernel_f32_param_2];
ld.param.u64 %rd16, [reshape_and_cache_kernel_f32_param_3];
ld.param.u64 %rd17, [reshape_and_cache_kernel_f32_param_4];
ld.param.u32 %r13, [reshape_and_cache_kernel_f32_param_5];
ld.param.u32 %r10, [reshape_and_cache_kernel_f32_param_6];
ld.param.u32 %r11, [reshape_and_cache_kernel_f32_param_7];
ld.param.u32 %r12, [reshape_and_cache_kernel_f32_param_8];
// Guard: CTAs with %ctaid.x >= param_5 have nothing to do.
mov.u32 %r1, %ctaid.x;
setp.ge.s32 %p1, %r1, %r13;
@%p1 bra $L__BB0_8;
// slot = param_4[%ctaid.x] (8-byte entries); a negative slot skips the CTA.
cvta.to.global.u64 %rd18, %rd17;
mul.wide.s32 %rd19, %r1, 8;
add.s64 %rd20, %rd18, %rd19;
ld.global.nc.u64 %rd1, [%rd20];
setp.lt.s64 %p2, %rd1, 0;
@%p2 bra $L__BB0_8;
// %rd2 = sign-extended param_8. If neither slot nor divisor has any of the
// upper 32 bits set, take the cheaper 32-bit unsigned divide at $L__BB0_4.
cvt.s64.s32 %rd2, %r12;
or.b64 %rd21, %rd1, %rd2;
and.b64 %rd22, %rd21, -4294967296;
setp.eq.s64 %p3, %rd22, 0;
@%p3 bra $L__BB0_4;
// 64-bit path: %rd30 = slot / param_8, %rd31 = slot - %rd30 * param_8.
div.s64 %rd30, %rd1, %rd2;
mul.lo.s64 %rd23, %rd30, %rd2;
sub.s64 %rd31, %rd1, %rd23;
bra.uni $L__BB0_5;
$L__BB0_4:
// 32-bit path: same quotient/remainder, zero-extended back to 64 bits.
cvt.u32.u64 %r14, %rd2;
cvt.u32.u64 %r15, %rd1;
div.u32 %r16, %r15, %r14;
mul.lo.s32 %r17, %r16, %r14;
sub.s32 %r18, %r15, %r17;
cvt.u64.u32 %rd30, %r16;
cvt.u64.u32 %rd31, %r18;
$L__BB0_5:
// n = param_7 * param_6; threads whose first index is already >= n exit.
mov.u32 %r23, %tid.x;
mul.lo.s32 %r3, %r11, %r10;
setp.ge.s32 %p4, %r23, %r3;
@%p4 bra $L__BB0_8;
mov.u32 %r6, %ntid.x;
cvta.to.global.u64 %rd9, %rd16;
cvta.to.global.u64 %rd10, %rd14;
cvta.to.global.u64 %rd11, %rd15;
cvta.to.global.u64 %rd12, %rd13;
// Loop-invariant 64-bit bases:
//   %rd32 = n * %ctaid.x   (first source element of this CTA)
//   %rd34 = block_idx * n  (destination row base before scaling by param_8)
mul.wide.s32 %rd32, %r3, %r1;
cvt.s64.s32 %rd33, %r3;
mul.lo.s64 %rd34, %rd30, %rd33;
$L__BB0_7:
// i = %r23. Source byte offset = (n*%ctaid.x + i) * 4.
cvt.s64.s32 %rd35, %r23;
add.s64 %rd36, %rd35, %rd32;
shl.b64 %rd24, %rd36, 2;
add.s64 %rd25, %rd12, %rd24;
ld.global.nc.f32 %f1, [%rd25];
// Destination byte offset = ((block_idx*n + i) * param_8 + block_off) * 4.
add.s64 %rd37, %rd35, %rd34;
mad.lo.s64 %rd38, %rd37, %rd2, %rd31;
shl.b64 %rd26, %rd38, 2;
add.s64 %rd27, %rd11, %rd26;
st.global.f32 [%rd27], %f1;
// Same source/destination offsets for the param_1 -> param_3 copy.
add.s64 %rd28, %rd10, %rd24;
ld.global.nc.f32 %f2, [%rd28];
add.s64 %rd29, %rd9, %rd26;
st.global.f32 [%rd29], %f2;
// Advance by the CTA width and repeat while i < n.
add.s32 %r23, %r23, %r6;
setp.lt.s32 %p5, %r23, %r3;
@%p5 bra $L__BB0_7;
$L__BB0_8:
ret;
}
// .globl paged_attention_v1_kernel_f32
// paged_attention_v1_kernel_f32
//
// One CTA per (%ctaid.x, %ctaid.y) pair. With
//     D = param_10, B = param_11, n = param_5[%ctaid.x],
//     h_kv = (%ctaid.y * param_9) / param_8,
//     row  = (%ctaid.x * param_8 + %ctaid.y) * D
// the CTA performs these steps, separated by bar.sync:
//   1. shared_mem[d]     = param_1[row + d]                      for d in [0, D)
//   2. shared_mem[D + t] = param_6 * sum_d shared_mem[d] * param_2[...]
//                                                                for t in [0, n)
//      while tracking a per-thread maximum, then a shuffle max-reduction
//   3. shared_mem[D + t] = exp(shared_mem[D + t] - max), accumulate a sum,
//      then a shuffle sum-reduction
//   4. shared_mem[D + t] /= sum
//   5. param_0[row + d]  = sum_t shared_mem[D + t] * param_3[...] for d in [0, D)
// In steps 2 and 5 the param_2 / param_3 element for position t is located via
// blk = param_4[%ctaid.x * param_12 + t / B]:
//     step 2: (((blk*param_9 + h_kv) * D + d) * B) + t % B
//     step 5: same expression, read from param_3
//
// Parameters (roles as used below; quoted names are guesses - TODO confirm
// against the CUDA source):
//   param_0   f32*  output
//   param_1   f32*  read-only input vector per (x, y)   ("query")
//   param_2   f32*  read-only, used in step 2           ("key cache")
//   param_3   f32*  read-only, used in step 5           ("value cache")
//   param_4   32-bit int*  read-only index table        ("block table")
//   param_5   32-bit int*  read-only, n per %ctaid.x    ("context lengths")
//   param_6   f32   multiplier on each dot product      ("scale")
//   param_7   s32   exclusive bound on %ctaid.x
//   param_8   s32   exclusive bound on %ctaid.y         ("num_heads")
//   param_9   s32   see h_kv above                      ("num_kv_heads")
//   param_10  s32   D                                   ("head_size")
//   param_11  s32   B                                   ("block_size")
//   param_12  s32   row stride of param_4
//
// NOTE(review): all element indices are formed with 32-bit mul/mad before being
// widened by mul.wide.s32, so they wrap for tensors of 2^31 or more elements.
// NOTE(review): both reductions below use shfl.sync only, which exchanges data
// among the lanes of one warp; there is no cross-warp combination step in this
// kernel. If it is launched with more than 32 threads per CTA, each warp would
// normalise with its own max and sum - confirm the launch configuration.
.visible .entry paged_attention_v1_kernel_f32(
.param .u64 paged_attention_v1_kernel_f32_param_0,
.param .u64 paged_attention_v1_kernel_f32_param_1,
.param .u64 paged_attention_v1_kernel_f32_param_2,
.param .u64 paged_attention_v1_kernel_f32_param_3,
.param .u64 paged_attention_v1_kernel_f32_param_4,
.param .u64 paged_attention_v1_kernel_f32_param_5,
.param .f32 paged_attention_v1_kernel_f32_param_6,
.param .u32 paged_attention_v1_kernel_f32_param_7,
.param .u32 paged_attention_v1_kernel_f32_param_8,
.param .u32 paged_attention_v1_kernel_f32_param_9,
.param .u32 paged_attention_v1_kernel_f32_param_10,
.param .u32 paged_attention_v1_kernel_f32_param_11,
.param .u32 paged_attention_v1_kernel_f32_param_12
)
{
.reg .pred %p<41>;
.reg .f32 %f<133>;
.reg .b32 %r<229>;
.reg .b64 %rd<63>;
ld.param.u64 %rd9, [paged_attention_v1_kernel_f32_param_0];
ld.param.u64 %rd7, [paged_attention_v1_kernel_f32_param_1];
ld.param.u64 %rd10, [paged_attention_v1_kernel_f32_param_2];
ld.param.u64 %rd11, [paged_attention_v1_kernel_f32_param_3];
ld.param.u64 %rd12, [paged_attention_v1_kernel_f32_param_4];
ld.param.u64 %rd8, [paged_attention_v1_kernel_f32_param_5];
ld.param.f32 %f28, [paged_attention_v1_kernel_f32_param_6];
ld.param.u32 %r61, [paged_attention_v1_kernel_f32_param_7];
ld.param.u32 %r56, [paged_attention_v1_kernel_f32_param_8];
ld.param.u32 %r57, [paged_attention_v1_kernel_f32_param_9];
ld.param.u32 %r58, [paged_attention_v1_kernel_f32_param_10];
ld.param.u32 %r59, [paged_attention_v1_kernel_f32_param_11];
ld.param.u32 %r60, [paged_attention_v1_kernel_f32_param_12];
// Global-space pointers: %rd1 = param_2, %rd2 = param_3, %rd3 = param_4,
// %rd4 = param_0.
cvta.to.global.u64 %rd1, %rd10;
cvta.to.global.u64 %rd2, %rd11;
cvta.to.global.u64 %rd3, %rd12;
cvta.to.global.u64 %rd4, %rd9;
// Guard: exit if %ctaid.x >= param_7 or %ctaid.y >= param_8.
mov.u32 %r1, %ctaid.x;
setp.ge.s32 %p3, %r1, %r61;
mov.u32 %r2, %ctaid.y;
setp.ge.s32 %p4, %r2, %r56;
or.pred %p5, %p3, %p4;
@%p5 bra $L__BB1_36;
// n (%r3) = param_5[%ctaid.x]; n == 0 exits without writing any output.
cvta.to.global.u64 %rd13, %rd8;
mul.wide.s32 %rd14, %r1, 4;
add.s64 %rd15, %rd13, %rd14;
ld.global.nc.u32 %r3, [%rd15];
setp.eq.s32 %p6, %r3, 0;
@%p6 bra $L__BB1_36;
// %r4 = h_kv = (%ctaid.y * param_9) / param_8
// %r5 = row  = (%ctaid.x * param_8 + %ctaid.y) * D
mul.lo.s32 %r62, %r2, %r57;
div.s32 %r4, %r62, %r56;
mad.lo.s32 %r63, %r1, %r56, %r2;
mul.lo.s32 %r5, %r63, %r58;
// %p7 = (tid.x >= D). It is tested again after the last barrier to decide
// whether this thread produces any output element.
mov.u32 %r223, %tid.x;
setp.ge.s32 %p7, %r223, %r58;
@%p7 bra $L__BB1_5;
mov.u32 %r7, %ntid.x;
cvta.to.global.u64 %rd5, %rd7;
mov.u32 %r215, %r223;
$L__BB1_4:
// Step 1: shared_mem[d] = param_1[row + d], d = tid.x, tid.x + ntid.x, ...
add.s32 %r64, %r215, %r5;
mul.wide.s32 %rd16, %r64, 4;
add.s64 %rd17, %rd5, %rd16;
ld.global.nc.f32 %f29, [%rd17];
shl.b32 %r65, %r215, 2;
mov.u32 %r66, shared_mem;
add.s32 %r67, %r66, %r65;
st.shared.f32 [%r67], %f29;
add.s32 %r215, %r215, %r7;
setp.lt.s32 %p8, %r215, %r58;
@%p8 bra $L__BB1_4;
$L__BB1_5:
bar.sync 0;
// Step 2. %p9 = (tid.x >= n); threads with no position to process skip the
// per-position loops of steps 2-4 but still take part in the shuffles.
// %f126 starts as 0fFF7FFFFF, the most negative finite f32.
setp.ge.s32 %p9, %r223, %r3;
mov.f32 %f126, 0fFF7FFFFF;
@%p9 bra $L__BB1_18;
mov.u32 %r10, %ntid.x;
// D > 0 takes the dot-product path; otherwise the degenerate path at $L__BB1_7.
setp.gt.s32 %p10, %r58, 0;
@%p10 bra $L__BB1_9;
bra.uni $L__BB1_7;
$L__BB1_9:
// Loop invariants: %r13 = D-1, %r14 = D & 3 (remainder count),
// %r15 = D - (D & 3) (trip count of the 4x unrolled loop),
// %rd6 = B * 4 (byte stride between consecutive d in param_2),
// %r16 = %ctaid.x * param_12 (row start in param_4).
add.s32 %r13, %r58, -1;
and.b32 %r14, %r58, 3;
sub.s32 %r15, %r58, %r14;
mul.wide.s32 %rd6, %r59, 4;
mul.lo.s32 %r16, %r1, %r60;
mov.f32 %f126, 0fFF7FFFFF;
mov.u32 %r217, %r223;
$L__BB1_10:
// Per position t = %r217: %r73 = t / B, %r18 = t % B,
// blk = param_4[%r16 + t / B], %r19 = (blk * param_9 + h_kv) * D.
div.s32 %r73, %r217, %r59;
mul.lo.s32 %r74, %r73, %r59;
sub.s32 %r18, %r217, %r74;
add.s32 %r75, %r73, %r16;
mul.wide.s32 %rd18, %r75, 4;
add.s64 %rd19, %rd3, %rd18;
ld.global.nc.u32 %r76, [%rd19];
mad.lo.s32 %r77, %r76, %r57, %r4;
mul.lo.s32 %r19, %r77, %r58;
// D < 4 (unsigned test on D-1): skip the unrolled loop entirely.
setp.lt.u32 %p12, %r13, 3;
mov.f32 %f125, 0f00000000;
mov.u32 %r220, 0;
@%p12 bra $L__BB1_13;
mov.f32 %f125, 0f00000000;
mov.u32 %r220, 0;
mov.u32 %r219, %r15;
$L__BB1_12:
// 4x unrolled dot product. d = %r220 is a multiple of 4 here, so the v4
// shared load sits at a multiple of 16 bytes from shared_mem. The four
// param_2 elements are B floats apart (%rd6 bytes).
add.s32 %r79, %r220, %r19;
mad.lo.s32 %r80, %r79, %r59, %r18;
shl.b32 %r81, %r220, 2;
mov.u32 %r82, shared_mem;
add.s32 %r83, %r82, %r81;
ld.shared.v4.f32 {%f36, %f37, %f38, %f39}, [%r83];
mul.wide.s32 %rd20, %r80, 4;
add.s64 %rd21, %rd1, %rd20;
ld.global.nc.f32 %f44, [%rd21];
fma.rn.f32 %f45, %f36, %f44, %f125;
add.s64 %rd22, %rd21, %rd6;
ld.global.nc.f32 %f46, [%rd22];
fma.rn.f32 %f47, %f37, %f46, %f45;
add.s64 %rd23, %rd22, %rd6;
ld.global.nc.f32 %f48, [%rd23];
fma.rn.f32 %f49, %f38, %f48, %f47;
add.s64 %rd24, %rd23, %rd6;
ld.global.nc.f32 %f50, [%rd24];
fma.rn.f32 %f125, %f39, %f50, %f49;
add.s32 %r220, %r220, 4;
add.s32 %r219, %r219, -4;
setp.ne.s32 %p13, %r219, 0;
@%p13 bra $L__BB1_12;
$L__BB1_13:
// Remainder: up to three more terms, selected by D & 3.
setp.eq.s32 %p14, %r14, 0;
@%p14 bra $L__BB1_17;
setp.eq.s32 %p15, %r14, 1;
add.s32 %r25, %r220, %r19;
mad.lo.s32 %r84, %r25, %r59, %r18;
mul.wide.s32 %rd25, %r84, 4;
add.s64 %rd26, %rd1, %rd25;
ld.global.nc.f32 %f51, [%rd26];
shl.b32 %r85, %r220, 2;
mov.u32 %r86, shared_mem;
add.s32 %r26, %r86, %r85;
ld.shared.f32 %f52, [%r26];
fma.rn.f32 %f125, %f52, %f51, %f125;
@%p15 bra $L__BB1_17;
setp.eq.s32 %p16, %r14, 2;
add.s32 %r87, %r25, 1;
mad.lo.s32 %r88, %r87, %r59, %r18;
mul.wide.s32 %rd27, %r88, 4;
add.s64 %rd28, %rd1, %rd27;
ld.global.nc.f32 %f53, [%rd28];
ld.shared.f32 %f54, [%r26+4];
fma.rn.f32 %f125, %f54, %f53, %f125;
@%p16 bra $L__BB1_17;
add.s32 %r89, %r25, 2;
mad.lo.s32 %r90, %r89, %r59, %r18;
mul.wide.s32 %rd29, %r90, 4;
add.s64 %rd30, %rd1, %rd29;
ld.global.nc.f32 %f55, [%rd30];
ld.shared.f32 %f56, [%r26+8];
fma.rn.f32 %f125, %f56, %f55, %f125;
$L__BB1_17:
// score = dot * param_6; store at shared_mem[D + t]; update this thread's
// running max; advance t by ntid.x while t < n.
add.s32 %r91, %r217, %r58;
shl.b32 %r92, %r91, 2;
mov.u32 %r93, shared_mem;
add.s32 %r94, %r93, %r92;
mul.f32 %f57, %f125, %f28;
st.shared.f32 [%r94], %f57;
max.f32 %f126, %f126, %f57;
add.s32 %r217, %r217, %r10;
setp.lt.s32 %p17, %r217, %r3;
@%p17 bra $L__BB1_10;
bra.uni $L__BB1_18;
$L__BB1_7:
// Degenerate path (D <= 0): every score is param_6 * 0.0.
mul.f32 %f1, %f28, 0f00000000;
mov.f32 %f126, 0fFF7FFFFF;
mov.u32 %r70, shared_mem;
mov.u32 %r216, %r223;
$L__BB1_8:
add.s32 %r68, %r216, %r58;
shl.b32 %r69, %r68, 2;
add.s32 %r71, %r70, %r69;
st.shared.f32 [%r71], %f1;
max.f32 %f126, %f126, %f1;
add.s32 %r216, %r216, %r10;
setp.lt.s32 %p11, %r216, %r3;
@%p11 bra $L__BB1_8;
$L__BB1_18:
bar.sync 0;
// Max reduction via shfl.sync.down with lane offsets 16, 8, 4, 2, 1
// (member mask %r99 = -1, clamp %r97 = 31), each followed by max.f32.
// shfl.sync.idx with source lane %r113 = 0 then gives every lane the value
// held by lane 0 (%r28). The operand registers %r96-%r99, %r102, %r105,
// %r110 and %r113 are reused by the sum reduction further down.
mov.b32 %r95, %f126;
mov.u32 %r96, 2;
mov.u32 %r97, 31;
mov.u32 %r98, 16;
mov.u32 %r99, -1;
shfl.sync.down.b32 %r100|%p18, %r95, %r98, %r97, %r99;
mov.b32 %f59, %r100;
max.f32 %f60, %f126, %f59;
mov.b32 %r101, %f60;
mov.u32 %r102, 8;
shfl.sync.down.b32 %r103|%p19, %r101, %r102, %r97, %r99;
mov.b32 %f61, %r103;
max.f32 %f62, %f60, %f61;
mov.b32 %r104, %f62;
mov.u32 %r105, 4;
shfl.sync.down.b32 %r106|%p20, %r104, %r105, %r97, %r99;
mov.b32 %f63, %r106;
max.f32 %f64, %f62, %f63;
mov.b32 %r107, %f64;
shfl.sync.down.b32 %r108|%p21, %r107, %r96, %r97, %r99;
mov.b32 %f65, %r108;
max.f32 %f66, %f64, %f65;
mov.b32 %r109, %f66;
mov.u32 %r110, 1;
shfl.sync.down.b32 %r111|%p22, %r109, %r110, %r97, %r99;
mov.b32 %f67, %r111;
max.f32 %f68, %f66, %f67;
mov.b32 %r112, %f68;
mov.u32 %r113, 0;
shfl.sync.idx.b32 %r28|%p1, %r112, %r113, %r97, %r99;
// Step 3. %f128 = this thread's partial sum of exponentials.
mov.f32 %f128, 0f00000000;
@%p9 bra $L__BB1_21;
mov.u32 %r29, %ntid.x;
mov.b32 %f15, %r28;
mov.f32 %f128, 0f00000000;
mov.u32 %r221, %r223;
$L__BB1_20:
// x = shared_mem[D + t] - max. The fma / cvt.sat / shl-by-23 /
// ex2.approx.ftz sequence that follows appears to be the compiler's inline
// expansion of expf(x) (0f3FB8AA3B is log2(e)); the result %f87 overwrites
// the score and is added to the partial sum.
add.s32 %r114, %r221, %r58;
shl.b32 %r115, %r114, 2;
mov.u32 %r116, shared_mem;
add.s32 %r117, %r116, %r115;
ld.shared.f32 %f70, [%r117];
sub.f32 %f71, %f70, %f15;
mov.f32 %f72, 0f3F000000;
mov.f32 %f73, 0f3BBB989D;
fma.rn.f32 %f74, %f71, %f73, %f72;
cvt.sat.f32.f32 %f75, %f74;
mov.f32 %f76, 0f4B400001;
mov.f32 %f77, 0f437C0000;
fma.rm.f32 %f78, %f75, %f77, %f76;
add.f32 %f79, %f78, 0fCB40007F;
neg.f32 %f80, %f79;
mov.f32 %f81, 0f3FB8AA3B;
fma.rn.f32 %f82, %f71, %f81, %f80;
mov.f32 %f83, 0f32A57060;
fma.rn.f32 %f84, %f71, %f83, %f82;
mov.b32 %r118, %f78;
shl.b32 %r119, %r118, 23;
mov.b32 %f85, %r119;
ex2.approx.ftz.f32 %f86, %f84;
mul.f32 %f87, %f86, %f85;
st.shared.f32 [%r117], %f87;
add.f32 %f128, %f128, %f87;
add.s32 %r221, %r221, %r29;
setp.lt.s32 %p24, %r221, %r3;
@%p24 bra $L__BB1_20;
$L__BB1_21:
bar.sync 0;
// Sum reduction: same shuffle pattern as the max reduction (offsets 16, 8,
// 4, 2, 1, then broadcast from lane 0 into %r32), combining with add.f32.
mov.b32 %r120, %f128;
shfl.sync.down.b32 %r125|%p25, %r120, %r98, %r97, %r99;
mov.b32 %f88, %r125;
add.f32 %f89, %f128, %f88;
mov.b32 %r126, %f89;
shfl.sync.down.b32 %r128|%p26, %r126, %r102, %r97, %r99;
mov.b32 %f90, %r128;
add.f32 %f91, %f89, %f90;
mov.b32 %r129, %f91;
shfl.sync.down.b32 %r131|%p27, %r129, %r105, %r97, %r99;
mov.b32 %f92, %r131;
add.f32 %f93, %f91, %f92;
mov.b32 %r132, %f93;
shfl.sync.down.b32 %r133|%p28, %r132, %r96, %r97, %r99;
mov.b32 %f94, %r133;
add.f32 %f95, %f93, %f94;
mov.b32 %r134, %f95;
shfl.sync.down.b32 %r136|%p29, %r134, %r110, %r97, %r99;
mov.b32 %f96, %r136;
add.f32 %f97, %f95, %f96;
mov.b32 %r137, %f97;
shfl.sync.idx.b32 %r32|%p2, %r137, %r113, %r97, %r99;
// Step 4: divide every stored exponential by the broadcast sum.
@%p9 bra $L__BB1_24;
mov.u32 %r33, %ntid.x;
mov.b32 %f19, %r32;
mov.u32 %r222, %r223;
$L__BB1_23:
add.s32 %r139, %r222, %r58;
shl.b32 %r140, %r139, 2;
mov.u32 %r141, shared_mem;
add.s32 %r142, %r141, %r140;
ld.shared.f32 %f98, [%r142];
div.rn.f32 %f99, %f98, %f19;
st.shared.f32 [%r142], %f99;
add.s32 %r222, %r222, %r33;
setp.lt.s32 %p31, %r222, %r3;
@%p31 bra $L__BB1_23;
$L__BB1_24:
bar.sync 0;
// Step 5. Threads with tid.x >= D (%p7) produce no output element.
@%p7 bra $L__BB1_36;
// n > 0 takes the weighted-sum path. Because n == 0 already exited above,
// the zero-fill path at $L__BB1_26 is reached only when n is negative.
setp.gt.s32 %p33, %r3, 0;
mov.u32 %r36, %ntid.x;
mul.lo.s32 %r37, %r1, %r60;
@%p33 bra $L__BB1_27;
bra.uni $L__BB1_26;
$L__BB1_27:
// Loop invariants: %r40 = n-1, %r41 = n & 3, %r42 = n - (n & 3),
// %r146 = shared_mem base (also used by the remainder code below),
// %r43 = &shared_mem[D] + 8 bytes, so that offsets -8, -4, 0, +4 address
// four consecutive weights.
add.s32 %r40, %r3, -1;
and.b32 %r41, %r3, 3;
sub.s32 %r42, %r3, %r41;
shl.b32 %r145, %r58, 2;
mov.u32 %r146, shared_mem;
add.s32 %r147, %r146, %r145;
add.s32 %r43, %r147, 8;
$L__BB1_28:
// Per output element d = %r223: accumulate over all positions t = %r228.
setp.lt.u32 %p35, %r40, 3;
mov.f32 %f132, 0f00000000;
mov.u32 %r228, 0;
@%p35 bra $L__BB1_31;
mov.f32 %f132, 0f00000000;
mov.u32 %r228, 0;
mov.u32 %r225, %r43;
mov.u32 %r227, %r42;
$L__BB1_30:
// 4x unrolled over t, t+1, t+2, t+3. Each term re-reads
// blk = param_4[%r37 + t / B] and loads
// param_3[((blk*param_9 + h_kv)*D + d)*B + t % B], weighted by
// shared_mem[D + t].
// -- term t
div.s32 %r150, %r228, %r59;
add.s32 %r151, %r150, %r37;
mul.wide.s32 %rd33, %r151, 4;
add.s64 %rd34, %rd3, %rd33;
ld.global.nc.u32 %r152, [%rd34];
mad.lo.s32 %r153, %r152, %r57, %r4;
mad.lo.s32 %r154, %r153, %r58, %r223;
mul.lo.s32 %r155, %r150, %r59;
sub.s32 %r156, %r228, %r155;
mad.lo.s32 %r157, %r154, %r59, %r156;
mul.wide.s32 %rd35, %r157, 4;
add.s64 %rd36, %rd2, %rd35;
ld.global.nc.f32 %f103, [%rd36];
ld.shared.f32 %f104, [%r225+-8];
fma.rn.f32 %f105, %f104, %f103, %f132;
// -- term t+1
add.s32 %r158, %r228, 1;
div.s32 %r159, %r158, %r59;
mul.lo.s32 %r160, %r159, %r59;
sub.s32 %r161, %r158, %r160;
add.s32 %r162, %r159, %r37;
mul.wide.s32 %rd37, %r162, 4;
add.s64 %rd38, %rd3, %rd37;
ld.global.nc.u32 %r163, [%rd38];
mad.lo.s32 %r164, %r163, %r57, %r4;
mad.lo.s32 %r165, %r164, %r58, %r223;
mad.lo.s32 %r166, %r165, %r59, %r161;
mul.wide.s32 %rd39, %r166, 4;
add.s64 %rd40, %rd2, %rd39;
ld.global.nc.f32 %f106, [%rd40];
ld.shared.f32 %f107, [%r225+-4];
fma.rn.f32 %f108, %f107, %f106, %f105;
// -- term t+2
add.s32 %r167, %r228, 2;
div.s32 %r168, %r167, %r59;
mul.lo.s32 %r169, %r168, %r59;
sub.s32 %r170, %r167, %r169;
add.s32 %r171, %r168, %r37;
mul.wide.s32 %rd41, %r171, 4;
add.s64 %rd42, %rd3, %rd41;
ld.global.nc.u32 %r172, [%rd42];
mad.lo.s32 %r173, %r172, %r57, %r4;
mad.lo.s32 %r174, %r173, %r58, %r223;
mad.lo.s32 %r175, %r174, %r59, %r170;
mul.wide.s32 %rd43, %r175, 4;
add.s64 %rd44, %rd2, %rd43;
ld.global.nc.f32 %f109, [%rd44];
ld.shared.f32 %f110, [%r225];
fma.rn.f32 %f111, %f110, %f109, %f108;
// -- term t+3
add.s32 %r176, %r228, 3;
div.s32 %r177, %r176, %r59;
mul.lo.s32 %r178, %r177, %r59;
sub.s32 %r179, %r176, %r178;
add.s32 %r180, %r177, %r37;
mul.wide.s32 %rd45, %r180, 4;
add.s64 %rd46, %rd3, %rd45;
ld.global.nc.u32 %r181, [%rd46];
mad.lo.s32 %r182, %r181, %r57, %r4;
mad.lo.s32 %r183, %r182, %r58, %r223;
mad.lo.s32 %r184, %r183, %r59, %r179;
mul.wide.s32 %rd47, %r184, 4;
add.s64 %rd48, %rd2, %rd47;
ld.global.nc.f32 %f112, [%rd48];
ld.shared.f32 %f113, [%r225+4];
fma.rn.f32 %f132, %f113, %f112, %f111;
// Advance t by 4 and the shared pointer by 16 bytes.
add.s32 %r228, %r228, 4;
add.s32 %r225, %r225, 16;
add.s32 %r227, %r227, -4;
setp.ne.s32 %p36, %r227, 0;
@%p36 bra $L__BB1_30;
$L__BB1_31:
// Remainder: up to three more positions, selected by n & 3. %r54 is the
// shared address of the weight for the first remaining position.
setp.eq.s32 %p37, %r41, 0;
@%p37 bra $L__BB1_35;
setp.eq.s32 %p38, %r41, 1;
div.s32 %r185, %r228, %r59;
add.s32 %r186, %r185, %r37;
mul.wide.s32 %rd49, %r186, 4;
add.s64 %rd50, %rd3, %rd49;
ld.global.nc.u32 %r187, [%rd50];
mad.lo.s32 %r188, %r187, %r57, %r4;
mad.lo.s32 %r189, %r188, %r58, %r223;
mul.lo.s32 %r190, %r185, %r59;
sub.s32 %r191, %r228, %r190;
mad.lo.s32 %r192, %r189, %r59, %r191;
add.s32 %r193, %r228, %r58;
shl.b32 %r194, %r193, 2;
add.s32 %r54, %r146, %r194;
mul.wide.s32 %rd51, %r192, 4;
add.s64 %rd52, %rd2, %rd51;
ld.global.nc.f32 %f114, [%rd52];
ld.shared.f32 %f115, [%r54];
fma.rn.f32 %f132, %f115, %f114, %f132;
@%p38 bra $L__BB1_35;
add.s32 %r196, %r228, 1;
setp.eq.s32 %p39, %r41, 2;
div.s32 %r197, %r196, %r59;
add.s32 %r198, %r197, %r37;
mul.wide.s32 %rd53, %r198, 4;
add.s64 %rd54, %rd3, %rd53;
ld.global.nc.u32 %r199, [%rd54];
mad.lo.s32 %r200, %r199, %r57, %r4;
mad.lo.s32 %r201, %r200, %r58, %r223;
mul.lo.s32 %r202, %r197, %r59;
sub.s32 %r203, %r196, %r202;
mad.lo.s32 %r204, %r201, %r59, %r203;
mul.wide.s32 %rd55, %r204, 4;
add.s64 %rd56, %rd2, %rd55;
ld.global.nc.f32 %f116, [%rd56];
ld.shared.f32 %f117, [%r54+4];
fma.rn.f32 %f132, %f117, %f116, %f132;
@%p39 bra $L__BB1_35;
add.s32 %r205, %r228, 2;
div.s32 %r206, %r205, %r59;
add.s32 %r207, %r206, %r37;
mul.wide.s32 %rd57, %r207, 4;
add.s64 %rd58, %rd3, %rd57;
ld.global.nc.u32 %r208, [%rd58];
mad.lo.s32 %r209, %r208, %r57, %r4;
mad.lo.s32 %r210, %r209, %r58, %r223;
mul.lo.s32 %r211, %r206, %r59;
sub.s32 %r212, %r205, %r211;
mad.lo.s32 %r213, %r210, %r59, %r212;
mul.wide.s32 %rd59, %r213, 4;
add.s64 %rd60, %rd2, %rd59;
ld.global.nc.f32 %f118, [%rd60];
ld.shared.f32 %f119, [%r54+8];
fma.rn.f32 %f132, %f119, %f118, %f132;
$L__BB1_35:
// param_0[row + d] = accumulated sum; advance d by ntid.x while d < D.
add.s32 %r214, %r223, %r5;
mul.wide.s32 %rd61, %r214, 4;
add.s64 %rd62, %rd4, %rd61;
st.global.f32 [%rd62], %f132;
add.s32 %r223, %r223, %r36;
setp.lt.s32 %p40, %r223, %r58;
@%p40 bra $L__BB1_28;
bra.uni $L__BB1_36;
$L__BB1_26:
// Zero-fill path: param_0[row + d] = 0 for d = tid.x, tid.x + ntid.x, ... < D.
add.s32 %r143, %r223, %r5;
mul.wide.s32 %rd31, %r143, 4;
add.s64 %rd32, %rd4, %rd31;
mov.u32 %r144, 0;
st.global.u32 [%rd32], %r144;
add.s32 %r223, %r223, %r36;
setp.lt.s32 %p34, %r223, %r58;
@%p34 bra $L__BB1_26;
$L__BB1_36:
ret;
}
// .globl attention_kernel_f32
// attention_kernel_f32
//
// One CTA per (%ctaid.x, %ctaid.y, %ctaid.z) = (q, h, b). With
//     S = param_6, H = param_7, D = param_8,
//     base = ((b*S)*H + h) * D          (%r4)
//     qrow = q*H*D + base               (%r5)
// element (position j, component d) of param_1/2/3 sits at j*H*D + base + d.
// Steps:
//   1. shared[j] = param_4 * sum_d param_1[qrow + d] * param_2[j*H*D + base + d]
//      for j in [0, q], with a per-thread running max
//   2. shared[j] = bit pattern 0xFF7FFFFF (most negative finite f32)
//      for j in (q, S)
//   3. shared[j] = exp(shared[j] - max) for j in [0, q], with a per-thread sum
//   4. a halving-stride loop that adds shared[] entries into the sum register
//   5. shared[j] /= sum for j in [0, q]
//   6. param_0[qrow + d] = sum_{j<=q} shared[j] * param_3[j*H*D + base + d]
//
// Parameters (roles as used below; quoted names are guesses - TODO confirm
// against the CUDA source):
//   param_0  f32*  output
//   param_1  f32*  read-only                    ("query")
//   param_2  f32*  read-only                    ("key")
//   param_3  f32*  read-only                    ("value")
//   param_4  f32   multiplier on each dot product ("scale")
//   param_5  s32   exclusive bound on %ctaid.z  ("batch")
//   param_6  s32   S, exclusive bound on %ctaid.x ("seq_len")
//   param_7  s32   H, exclusive bound on %ctaid.y ("num_heads")
//   param_8  s32   D                            ("head_size")
//
// NOTE(review): the value subtracted before exp in step 3 (%f108) is only
// ever updated from scores this thread computed itself; no instruction in
// this kernel combines it across threads. With more than one thread per CTA,
// threads would exponentiate against different maxima - confirm intent.
// NOTE(review): step 4 adds shared[tid + stride] (score entries, including
// the 0xFF7FFFFF entries written in step 2 when tid + stride > q) into the
// per-thread register %f112; partial sums held by other threads are never
// read. This does not look like a conventional sum reduction - verify
// against the CUDA source.
// NOTE(review): all element indices are formed with 32-bit mul/mad before
// being widened, so they wrap for tensors of 2^31 or more elements.
.visible .entry attention_kernel_f32(
.param .u64 attention_kernel_f32_param_0,
.param .u64 attention_kernel_f32_param_1,
.param .u64 attention_kernel_f32_param_2,
.param .u64 attention_kernel_f32_param_3,
.param .f32 attention_kernel_f32_param_4,
.param .u32 attention_kernel_f32_param_5,
.param .u32 attention_kernel_f32_param_6,
.param .u32 attention_kernel_f32_param_7,
.param .u32 attention_kernel_f32_param_8
)
{
.reg .pred %p<35>;
.reg .f32 %f<118>;
.reg .b32 %r<114>;
.reg .b64 %rd<33>;
ld.param.u64 %rd8, [attention_kernel_f32_param_0];
ld.param.u64 %rd9, [attention_kernel_f32_param_1];
ld.param.u64 %rd10, [attention_kernel_f32_param_2];
ld.param.u64 %rd11, [attention_kernel_f32_param_3];
ld.param.f32 %f30, [attention_kernel_f32_param_4];
ld.param.u32 %r56, [attention_kernel_f32_param_5];
ld.param.u32 %r53, [attention_kernel_f32_param_6];
ld.param.u32 %r54, [attention_kernel_f32_param_7];
ld.param.u32 %r55, [attention_kernel_f32_param_8];
// Global-space pointers: %rd1 = param_0, %rd2 = param_3, %rd3 = param_2,
// %rd4 = param_1.
cvta.to.global.u64 %rd1, %rd8;
cvta.to.global.u64 %rd2, %rd11;
cvta.to.global.u64 %rd3, %rd10;
cvta.to.global.u64 %rd4, %rd9;
// Guard: exit if b >= param_5, h >= H or q >= S.
mov.u32 %r1, %ctaid.z;
setp.ge.s32 %p1, %r1, %r56;
mov.u32 %r2, %ctaid.y;
setp.ge.s32 %p2, %r2, %r54;
or.pred %p3, %p1, %p2;
mov.u32 %r3, %ctaid.x;
setp.ge.s32 %p4, %r3, %r53;
or.pred %p5, %p4, %p3;
@%p5 bra $L__BB2_39;
// %r4 = base = ((b*S)*H + h) * D; %r5 = qrow = q*H*D + base.
mul.lo.s32 %r57, %r1, %r53;
mad.lo.s32 %r58, %r57, %r54, %r2;
mul.lo.s32 %r4, %r58, %r55;
mul.lo.s32 %r59, %r3, %r54;
mad.lo.s32 %r5, %r59, %r55, %r4;
// %p6 = (tid.x > q): this thread has no position j <= q to process. The
// predicate is tested again before steps 3 and 5.
// %f108 starts as 0fFF7FFFFF, the most negative finite f32.
mov.u32 %r109, %tid.x;
setp.gt.s32 %p6, %r109, %r3;
mov.f32 %f108, 0fFF7FFFFF;
@%p6 bra $L__BB2_14;
mov.u32 %r7, %ntid.x;
// D > 0 takes the dot-product path; otherwise the degenerate path at $L__BB2_3.
setp.gt.s32 %p7, %r55, 0;
@%p7 bra $L__BB2_5;
bra.uni $L__BB2_3;
$L__BB2_5:
// Loop invariants: %r10 = D-1, %r11 = D & 3, %r12 = D - (D & 3), %r13 = D*H.
add.s32 %r10, %r55, -1;
and.b32 %r11, %r55, 3;
sub.s32 %r12, %r55, %r11;
mul.lo.s32 %r13, %r55, %r54;
mov.f32 %f108, 0fFF7FFFFF;
mov.u32 %r101, %r109;
$L__BB2_6:
// Step 1, per position j = %r101: %r15 = j*H*D + base.
mad.lo.s32 %r15, %r13, %r101, %r4;
// D < 4 (unsigned test on D-1): skip the unrolled loop.
setp.lt.u32 %p9, %r10, 3;
mov.f32 %f107, 0f00000000;
mov.u32 %r104, 0;
@%p9 bra $L__BB2_9;
mov.f32 %f107, 0f00000000;
mov.u32 %r104, 0;
mov.u32 %r103, %r12;
$L__BB2_8:
// 4x unrolled dot product over d = %r104 .. %r104+3:
// %rd13 -> param_1[qrow + d], %rd15 -> param_2[j*H*D + base + d].
add.s32 %r65, %r104, %r5;
mul.wide.s32 %rd12, %r65, 4;
add.s64 %rd13, %rd4, %rd12;
add.s32 %r66, %r15, %r104;
mul.wide.s32 %rd14, %r66, 4;
add.s64 %rd15, %rd3, %rd14;
ld.global.nc.f32 %f37, [%rd15];
ld.global.nc.f32 %f38, [%rd13];
fma.rn.f32 %f39, %f38, %f37, %f107;
ld.global.nc.f32 %f40, [%rd15+4];
ld.global.nc.f32 %f41, [%rd13+4];
fma.rn.f32 %f42, %f41, %f40, %f39;
ld.global.nc.f32 %f43, [%rd15+8];
ld.global.nc.f32 %f44, [%rd13+8];
fma.rn.f32 %f45, %f44, %f43, %f42;
ld.global.nc.f32 %f46, [%rd15+12];
ld.global.nc.f32 %f47, [%rd13+12];
fma.rn.f32 %f107, %f47, %f46, %f45;
add.s32 %r104, %r104, 4;
add.s32 %r103, %r103, -4;
setp.ne.s32 %p10, %r103, 0;
@%p10 bra $L__BB2_8;
$L__BB2_9:
// Remainder: up to three more terms, selected by D & 3.
setp.eq.s32 %p11, %r11, 0;
@%p11 bra $L__BB2_13;
setp.eq.s32 %p12, %r11, 1;
add.s32 %r67, %r104, %r5;
mul.wide.s32 %rd16, %r67, 4;
add.s64 %rd5, %rd4, %rd16;
add.s32 %r68, %r15, %r104;
mul.wide.s32 %rd17, %r68, 4;
add.s64 %rd6, %rd3, %rd17;
ld.global.nc.f32 %f48, [%rd6];
ld.global.nc.f32 %f49, [%rd5];
fma.rn.f32 %f107, %f49, %f48, %f107;
@%p12 bra $L__BB2_13;
setp.eq.s32 %p13, %r11, 2;
ld.global.nc.f32 %f50, [%rd6+4];
ld.global.nc.f32 %f51, [%rd5+4];
fma.rn.f32 %f107, %f51, %f50, %f107;
@%p13 bra $L__BB2_13;
ld.global.nc.f32 %f52, [%rd6+8];
ld.global.nc.f32 %f53, [%rd5+8];
fma.rn.f32 %f107, %f53, %f52, %f107;
$L__BB2_13:
// score = dot * param_4; store at shared[j]; update this thread's running
// max; advance j by ntid.x while j <= q.
shl.b32 %r69, %r101, 2;
mov.u32 %r70, shared;
add.s32 %r71, %r70, %r69;
mul.f32 %f54, %f107, %f30;
st.shared.f32 [%r71], %f54;
max.f32 %f108, %f108, %f54;
add.s32 %r101, %r101, %r7;
setp.le.s32 %p14, %r101, %r3;
@%p14 bra $L__BB2_6;
bra.uni $L__BB2_14;
$L__BB2_3:
// Degenerate path (D <= 0): every score is param_4 * 0.0.
mul.f32 %f1, %f30, 0f00000000;
mov.f32 %f108, 0fFF7FFFFF;
mov.u32 %r61, shared;
mov.u32 %r100, %r109;
$L__BB2_4:
shl.b32 %r60, %r100, 2;
add.s32 %r62, %r61, %r60;
st.shared.f32 [%r62], %f1;
max.f32 %f108, %f108, %f1;
add.s32 %r100, %r100, %r7;
setp.gt.s32 %p8, %r100, %r3;
@%p8 bra $L__BB2_14;
bra.uni $L__BB2_4;
$L__BB2_14:
// Step 2: for j = q+1+tid.x, stepping by ntid.x while j < S, store the bit
// pattern -8388609 (= 0xFF7FFFFF, the most negative finite f32) to shared[j].
// %r22 = q + 1 is reused in step 6 as the number of positions to sum.
add.s32 %r22, %r3, 1;
add.s32 %r105, %r22, %r109;
setp.ge.s32 %p15, %r105, %r53;
@%p15 bra $L__BB2_17;
mov.u32 %r24, %ntid.x;
$L__BB2_16:
shl.b32 %r72, %r105, 2;
mov.u32 %r73, shared;
add.s32 %r74, %r73, %r72;
mov.u32 %r75, -8388609;
st.shared.u32 [%r74], %r75;
add.s32 %r105, %r105, %r24;
setp.lt.s32 %p16, %r105, %r53;
@%p16 bra $L__BB2_16;
$L__BB2_17:
bar.sync 0;
// Step 3. %f112 = this thread's sum of exponentials.
mov.f32 %f112, 0f00000000;
@%p6 bra $L__BB2_20;
mov.u32 %r27, %ntid.x;
mov.u32 %r77, shared;
mov.u32 %r106, %r109;
$L__BB2_19:
// x = shared[j] - %f108 (this thread's own running max). The fma / cvt.sat /
// shl-by-23 / ex2.approx.ftz sequence that follows appears to be the
// compiler's inline expansion of expf(x) (0f3FB8AA3B is log2(e)); the result
// %f73 overwrites shared[j] and is added to %f112.
shl.b32 %r76, %r106, 2;
add.s32 %r78, %r77, %r76;
ld.shared.f32 %f56, [%r78];
sub.f32 %f57, %f56, %f108;
mov.f32 %f58, 0f3F000000;
mov.f32 %f59, 0f3BBB989D;
fma.rn.f32 %f60, %f57, %f59, %f58;
cvt.sat.f32.f32 %f61, %f60;
mov.f32 %f62, 0f4B400001;
mov.f32 %f63, 0f437C0000;
fma.rm.f32 %f64, %f61, %f63, %f62;
add.f32 %f65, %f64, 0fCB40007F;
neg.f32 %f66, %f65;
mov.f32 %f67, 0f3FB8AA3B;
fma.rn.f32 %f68, %f57, %f67, %f66;
mov.f32 %f69, 0f32A57060;
fma.rn.f32 %f70, %f57, %f69, %f68;
mov.b32 %r79, %f64;
shl.b32 %r80, %r79, 23;
mov.b32 %f71, %r80;
ex2.approx.ftz.f32 %f72, %f70;
mul.f32 %f73, %f72, %f71;
st.shared.f32 [%r78], %f73;
add.f32 %f112, %f112, %f73;
add.s32 %r106, %r106, %r27;
setp.gt.s32 %p18, %r106, %r3;
@%p18 bra $L__BB2_20;
bra.uni $L__BB2_19;
$L__BB2_20:
bar.sync 0;
// Step 4: stride = ntid.x / 2, halved each iteration until it reaches 0.
// %r30 = ntid.x is also the loop step in steps 5 and 6.
mov.u32 %r30, %ntid.x;
shr.u32 %r107, %r30, 1;
setp.eq.s32 %p19, %r107, 0;
@%p19 bra $L__BB2_24;
$L__BB2_21:
// If tid.x < stride and tid.x + stride < S (both compared unsigned), add
// shared[tid.x + stride] to this thread's %f112. Nothing is written back to
// shared memory inside this loop.
add.s32 %r33, %r107, %r109;
setp.ge.u32 %p20, %r33, %r53;
setp.ge.u32 %p21, %r109, %r107;
or.pred %p22, %p21, %p20;
@%p22 bra $L__BB2_23;
shl.b32 %r81, %r33, 2;
mov.u32 %r82, shared;
add.s32 %r83, %r82, %r81;
ld.shared.f32 %f75, [%r83];
add.f32 %f112, %f112, %f75;
$L__BB2_23:
bar.sync 0;
shr.u32 %r107, %r107, 1;
setp.ne.s32 %p23, %r107, 0;
@%p23 bra $L__BB2_21;
$L__BB2_24:
// Step 5: shared[j] /= %f112 for j = tid.x, tid.x + ntid.x, ... <= q.
@%p6 bra $L__BB2_27;
mov.u32 %r85, shared;
mov.u32 %r108, %r109;
$L__BB2_26:
shl.b32 %r84, %r108, 2;
add.s32 %r86, %r85, %r84;
ld.shared.f32 %f76, [%r86];
div.rn.f32 %f77, %f76, %f112;
st.shared.f32 [%r86], %f77;
add.s32 %r108, %r108, %r30;
setp.le.s32 %p25, %r108, %r3;
@%p25 bra $L__BB2_26;
$L__BB2_27:
bar.sync 0;
// Step 6. Threads with tid.x >= D produce no output element.
setp.ge.s32 %p26, %r109, %r55;
@%p26 bra $L__BB2_39;
// q < 0 (signed) selects the zero-fill path at $L__BB2_38.
// %r37 = D*H = element stride between consecutive positions j.
setp.lt.s32 %p27, %r3, 0;
mul.lo.s32 %r37, %r55, %r54;
@%p27 bra $L__BB2_38;
// Loop invariants: %r38 = (q+1) & 3, %r39 = (q+1) - %r38,
// %rd7 = D*H*4 (byte stride between positions).
and.b32 %r38, %r22, 3;
sub.s32 %r39, %r22, %r38;
mul.wide.s32 %rd7, %r37, 4;
$L__BB2_30:
// Per output element d = %r109: %r41 = d + base. Skip the unrolled loop
// when q < 3, i.e. fewer than four positions.
add.s32 %r41, %r109, %r4;
setp.lt.u32 %p28, %r3, 3;
mov.f32 %f117, 0f00000000;
mov.u32 %r112, 0;
@%p28 bra $L__BB2_33;
mov.f32 %f117, 0f00000000;
mov.u32 %r112, 0;
mov.u32 %r111, %r39;
$L__BB2_32:
// 4x unrolled over j = %r112 .. %r112+3. j is a multiple of 4 here, so the
// v4 shared load sits at a multiple of 16 bytes from the array base. The
// four param_3 elements are D*H floats apart (%rd7 bytes).
shl.b32 %r90, %r112, 2;
mov.u32 %r91, shared;
add.s32 %r92, %r91, %r90;
ld.shared.v4.f32 {%f81, %f82, %f83, %f84}, [%r92];
mad.lo.s32 %r93, %r37, %r112, %r41;
mul.wide.s32 %rd18, %r93, 4;
add.s64 %rd19, %rd2, %rd18;
ld.global.nc.f32 %f89, [%rd19];
fma.rn.f32 %f90, %f81, %f89, %f117;
add.s64 %rd20, %rd19, %rd7;
ld.global.nc.f32 %f91, [%rd20];
fma.rn.f32 %f92, %f82, %f91, %f90;
add.s64 %rd21, %rd20, %rd7;
ld.global.nc.f32 %f93, [%rd21];
fma.rn.f32 %f94, %f83, %f93, %f92;
add.s64 %rd22, %rd21, %rd7;
ld.global.nc.f32 %f95, [%rd22];
fma.rn.f32 %f117, %f84, %f95, %f94;
add.s32 %r112, %r112, 4;
add.s32 %r111, %r111, -4;
setp.ne.s32 %p29, %r111, 0;
@%p29 bra $L__BB2_32;
$L__BB2_33:
// Remainder: up to three more positions, selected by (q+1) & 3.
setp.eq.s32 %p30, %r38, 0;
@%p30 bra $L__BB2_37;
setp.eq.s32 %p31, %r38, 1;
mad.lo.s32 %r47, %r37, %r112, %r41;
mul.wide.s32 %rd23, %r47, 4;
add.s64 %rd24, %rd2, %rd23;
ld.global.nc.f32 %f96, [%rd24];
shl.b32 %r94, %r112, 2;
mov.u32 %r95, shared;
add.s32 %r48, %r95, %r94;
ld.shared.f32 %f97, [%r48];
fma.rn.f32 %f117, %f97, %f96, %f117;
@%p31 bra $L__BB2_37;
setp.eq.s32 %p32, %r38, 2;
add.s32 %r49, %r47, %r37;
mul.wide.s32 %rd25, %r49, 4;
add.s64 %rd26, %rd2, %rd25;
ld.global.nc.f32 %f98, [%rd26];
ld.shared.f32 %f99, [%r48+4];
fma.rn.f32 %f117, %f99, %f98, %f117;
@%p32 bra $L__BB2_37;
add.s32 %r96, %r49, %r37;
mul.wide.s32 %rd27, %r96, 4;
add.s64 %rd28, %rd2, %rd27;
ld.global.nc.f32 %f100, [%rd28];
ld.shared.f32 %f101, [%r48+8];
fma.rn.f32 %f117, %f101, %f100, %f117;
$L__BB2_37:
// param_0[qrow + d] = accumulated sum; advance d by ntid.x while d < D.
add.s32 %r97, %r109, %r5;
mul.wide.s32 %rd29, %r97, 4;
add.s64 %rd30, %rd1, %rd29;
st.global.f32 [%rd30], %f117;
add.s32 %r109, %r109, %r30;
setp.lt.s32 %p33, %r109, %r55;
@%p33 bra $L__BB2_30;
bra.uni $L__BB2_39;
$L__BB2_38:
// Zero-fill path: param_0[qrow + d] = 0 for d = tid.x, tid.x + ntid.x, ... < D.
add.s32 %r98, %r109, %r5;
mul.wide.s32 %rd31, %r98, 4;
add.s64 %rd32, %rd1, %rd31;
mov.u32 %r99, 0;
st.global.u32 [%rd32], %r99;
add.s32 %r109, %r109, %r30;
setp.lt.s32 %p34, %r109, %r55;
@%p34 bra $L__BB2_38;
$L__BB2_39:
ret;
}