//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-36037853
// Cuda compilation tools, release 12.9, V12.9.86
// Based on NVVM 7.0.1
//
.version 8.8
.target sm_80
.address_size 64
// .globl fused_attention_fwd_f32
//
// fused_attention_fwd_f32
// -----------------------
// Each thread produces one output row of length D = param_9:
//
//     out[row, :] = sum_j  w_j * C[j, :]  /  sum_j w_j
//     w_j         = exp( scale * dot(A[row, :], B[j, :]) - max_j(...) )
//
// i.e. a numerically-stabilised softmax over scaled dot products, applied to
// the rows of a third matrix. The score for every j is computed twice (once
// for the max, once for the weights); nothing is cached between passes.
//
// Parameters (as used by the code below):
//   param_0  u64  A: input read at the per-thread row offset %rd5
//                 (presumably the query tensor - confirm against the caller)
//   param_1  u64  B: input read at slab offset + j * D
//                 (presumably the key tensor - confirm)
//   param_2  u64  C: input read at the same offsets as B
//                 (presumably the value tensor - confirm)
//   param_3  u64  out: written and re-read at the same row offset as A
//   param_4  f32  scale multiplied onto every dot product
//   param_5  u32  outermost extent (presumably batch count - confirm)
//   param_6  u32  middle extent (presumably head count - confirm)
//   param_7  u32  rows per (param_5, param_6) slab of A / out
//   param_8  u32  rows per slab of B / C (upper bound of the j loop)
//   param_9  u32  D, the row length shared by A, B, C and out
//   param_10 u32  non-zero: stop the j loop once j exceeds the thread's row
//                 index inside its slab (causal-style mask); zero: visit all
//                 param_8 rows
//
// Work distribution: global thread id = ctaid.x * ntid.x + tid.x; threads with
// id >= param_5 * param_6 * param_7 return immediately.
//
// NOTE(review): all element offsets are formed with 32-bit mul.lo.s32 and only
// then zero-extended to 64 bits, so they wrap if a product exceeds 2^32
// elements.
// NOTE(review): this is compiler-generated PTX; presumably any real change
// belongs in the CUDA source it was built from.
.visible .entry fused_attention_fwd_f32(
.param .u64 fused_attention_fwd_f32_param_0,
.param .u64 fused_attention_fwd_f32_param_1,
.param .u64 fused_attention_fwd_f32_param_2,
.param .u64 fused_attention_fwd_f32_param_3,
.param .f32 fused_attention_fwd_f32_param_4,
.param .u32 fused_attention_fwd_f32_param_5,
.param .u32 fused_attention_fwd_f32_param_6,
.param .u32 fused_attention_fwd_f32_param_7,
.param .u32 fused_attention_fwd_f32_param_8,
.param .u32 fused_attention_fwd_f32_param_9,
.param .u32 fused_attention_fwd_f32_param_10
)
{
.reg .pred %p<67>;
.reg .f32 %f<244>;
.reg .b32 %r<173>;
.reg .b64 %rd<122>;
// ---- Load parameters -------------------------------------------------------
// %f59 = scale, %r88/%r83/%r84 = outer/middle/row extents,
// %r85 = B/C row count, %r86 = D, %r87 = mask flag.
ld.param.u64 %rd24, [fused_attention_fwd_f32_param_0];
ld.param.u64 %rd25, [fused_attention_fwd_f32_param_1];
ld.param.u64 %rd26, [fused_attention_fwd_f32_param_2];
ld.param.u64 %rd27, [fused_attention_fwd_f32_param_3];
ld.param.f32 %f59, [fused_attention_fwd_f32_param_4];
ld.param.u32 %r88, [fused_attention_fwd_f32_param_5];
ld.param.u32 %r83, [fused_attention_fwd_f32_param_6];
ld.param.u32 %r84, [fused_attention_fwd_f32_param_7];
ld.param.u32 %r85, [fused_attention_fwd_f32_param_8];
ld.param.u32 %r86, [fused_attention_fwd_f32_param_9];
ld.param.u32 %r87, [fused_attention_fwd_f32_param_10];
// Global-space base pointers: %rd1 = out, %rd2 = C, %rd3 = B, %rd4 = A.
cvta.to.global.u64 %rd1, %rd27;
cvta.to.global.u64 %rd2, %rd26;
cvta.to.global.u64 %rd3, %rd25;
cvta.to.global.u64 %rd4, %rd24;
// ---- Thread id and bounds check --------------------------------------------
// %r1 = ctaid.x * ntid.x + tid.x; exit if %r1 >= param_5 * param_6 * param_7.
mov.u32 %r89, %ntid.x;
mov.u32 %r90, %ctaid.x;
mov.u32 %r91, %tid.x;
mad.lo.s32 %r1, %r90, %r89, %r91;
mul.lo.s32 %r92, %r83, %r88;
mul.lo.s32 %r93, %r92, %r84;
setp.ge.u32 %p1, %r1, %r93;
@%p1 bra $L__BB0_85;
// ---- Decompose the flat id -------------------------------------------------
// %r95 = id / (param_6 * param_7)          (outermost index)
// %r98 = (id % (param_6 * param_7)) / param_7   (middle index)
// %r2  = remainder                          (row index inside the slab)
// The divisors are non-zero here: a zero extent makes the bound above 0 and
// every thread has already exited.
mul.lo.s32 %r94, %r84, %r83;
div.u32 %r95, %r1, %r94;
mul.lo.s32 %r96, %r95, %r94;
sub.s32 %r97, %r1, %r96;
div.u32 %r98, %r97, %r84;
mul.lo.s32 %r99, %r98, %r84;
sub.s32 %r2, %r97, %r99;
// %r100 = slab number = outer * param_6 + middle.
// %r102 = slab * param_7 * D  (element offset of the slab in A / out)
// %r3   = slab * param_8 * D  (element offset of the slab in B / C)
// %rd5  = %r102 + row * D     (element offset of this thread's row in A / out)
mad.lo.s32 %r100, %r95, %r83, %r98;
mul.lo.s32 %r101, %r86, %r84;
mul.lo.s32 %r102, %r101, %r100;
mul.lo.s32 %r103, %r86, %r85;
mul.lo.s32 %r3, %r103, %r100;
cvt.u64.u32 %rd28, %r102;
mul.lo.s32 %r104, %r2, %r86;
cvt.u64.u32 %rd29, %r104;
add.s64 %rd5, %rd28, %rd29;
// ============================================================================
// PASS 1: running maximum of the scaled scores, kept in %f229.
// %f229 starts at 0fFF7FFFFF (the most negative finite f32).
// %p2 (param_8 == 0) skips the pass entirely and is reused before pass 3.
// ============================================================================
setp.eq.s32 %p2, %r85, 0;
mov.f32 %f229, 0fFF7FFFFF;
@%p2 bra $L__BB0_29;
// Select the loop variant: mask flag == 0 -> BB0_17, D == 0 -> BB0_14.
setp.eq.s32 %p3, %r87, 0;
cvt.u64.u32 %rd6, %r3;
@%p3 bra $L__BB0_17;
setp.eq.s32 %p4, %r86, 0;
@%p4 bra $L__BB0_14;
// Masked variant, D > 0.
// %r5 = D & 3 (tail length), %r6 = D - %r5 (elements handled 4 at a time).
add.s32 %r4, %r86, -1;
and.b32 %r5, %r86, 3;
sub.s32 %r6, %r86, %r5;
mov.f32 %f229, 0fFF7FFFFF;
mov.u32 %r137, 0;
// Outer loop over j = %r137; stops as soon as j > row index (%r2).
$L__BB0_5:
setp.gt.u32 %p5, %r137, %r2;
@%p5 bra $L__BB0_29;
// %p6: D - 1 < 3, i.e. D < 4, so the 4-way unrolled loop is skipped.
// %rd7 = element offset of row j in B.
setp.lt.u32 %p6, %r4, 3;
mul.lo.s32 %r107, %r137, %r86;
cvt.u64.u32 %rd30, %r107;
add.s64 %rd7, %rd30, %rd6;
mov.f32 %f221, 0f00000000;
mov.u32 %r140, 0;
@%p6 bra $L__BB0_9;
mov.f32 %f221, 0f00000000;
mov.u32 %r140, 0;
mov.u32 %r139, %r6;
// Dot product A[row] . B[j], four elements per iteration, accumulated in
// %f221 with fused multiply-add. %r140 counts up, %r139 counts down to 0.
$L__BB0_8:
cvt.u64.u32 %rd31, %r140;
add.s64 %rd32, %rd5, %rd31;
shl.b64 %rd33, %rd32, 2;
add.s64 %rd34, %rd4, %rd33;
add.s64 %rd35, %rd7, %rd31;
shl.b64 %rd36, %rd35, 2;
add.s64 %rd37, %rd3, %rd36;
ld.global.nc.f32 %f65, [%rd37];
ld.global.nc.f32 %f66, [%rd34];
fma.rn.ftz.f32 %f67, %f66, %f65, %f221;
ld.global.nc.f32 %f68, [%rd37+4];
ld.global.nc.f32 %f69, [%rd34+4];
fma.rn.ftz.f32 %f70, %f69, %f68, %f67;
ld.global.nc.f32 %f71, [%rd37+8];
ld.global.nc.f32 %f72, [%rd34+8];
fma.rn.ftz.f32 %f73, %f72, %f71, %f70;
ld.global.nc.f32 %f74, [%rd37+12];
ld.global.nc.f32 %f75, [%rd34+12];
fma.rn.ftz.f32 %f221, %f75, %f74, %f73;
add.s32 %r140, %r140, 4;
add.s32 %r139, %r139, -4;
setp.ne.s32 %p7, %r139, 0;
@%p7 bra $L__BB0_8;
// Tail of the dot product: the remaining D & 3 (0..3) elements.
$L__BB0_9:
setp.eq.s32 %p8, %r5, 0;
@%p8 bra $L__BB0_13;
setp.eq.s32 %p9, %r5, 1;
cvt.u64.u32 %rd38, %r140;
add.s64 %rd39, %rd5, %rd38;
shl.b64 %rd40, %rd39, 2;
add.s64 %rd8, %rd4, %rd40;
add.s64 %rd41, %rd7, %rd38;
shl.b64 %rd42, %rd41, 2;
add.s64 %rd9, %rd3, %rd42;
ld.global.nc.f32 %f76, [%rd9];
ld.global.nc.f32 %f77, [%rd8];
fma.rn.ftz.f32 %f221, %f77, %f76, %f221;
@%p9 bra $L__BB0_13;
setp.eq.s32 %p10, %r5, 2;
ld.global.nc.f32 %f78, [%rd9+4];
ld.global.nc.f32 %f79, [%rd8+4];
fma.rn.ftz.f32 %f221, %f79, %f78, %f221;
@%p10 bra $L__BB0_13;
ld.global.nc.f32 %f80, [%rd9+8];
ld.global.nc.f32 %f81, [%rd8+8];
fma.rn.ftz.f32 %f221, %f81, %f80, %f221;
// score = dot * scale; %f229 = max(%f229, score); next j while j < param_8.
$L__BB0_13:
mul.ftz.f32 %f82, %f221, %f59;
setp.gt.ftz.f32 %p11, %f82, %f229;
selp.f32 %f229, %f82, %f229, %p11;
add.s32 %r137, %r137, 1;
setp.lt.u32 %p12, %r137, %r85;
@%p12 bra $L__BB0_5;
bra.uni $L__BB0_29;
// Unmasked variant of pass 1 (mask flag == 0): same computation as above but
// every j in [0, param_8) is visited. D == 0 is handled at BB0_27.
$L__BB0_17:
setp.eq.s32 %p16, %r86, 0;
@%p16 bra $L__BB0_27;
add.s32 %r16, %r86, -1;
and.b32 %r17, %r86, 3;
sub.s32 %r18, %r86, %r17;
mov.f32 %f229, 0fFF7FFFFF;
mov.u32 %r110, 0;
mov.u32 %r142, %r110;
// Outer loop over j = %r142; %rd10 = element offset of row j in B.
$L__BB0_19:
mul.lo.s32 %r112, %r142, %r86;
cvt.u64.u32 %rd43, %r112;
add.s64 %rd10, %rd43, %rd6;
setp.lt.u32 %p17, %r16, 3;
mov.f32 %f227, 0f00000000;
mov.u32 %r145, %r110;
@%p17 bra $L__BB0_22;
mov.f32 %f227, 0f00000000;
mov.u32 %r145, 0;
mov.u32 %r144, %r18;
// 4-way unrolled dot product into %f227.
$L__BB0_21:
cvt.u64.u32 %rd44, %r145;
add.s64 %rd45, %rd5, %rd44;
shl.b64 %rd46, %rd45, 2;
add.s64 %rd47, %rd4, %rd46;
add.s64 %rd48, %rd10, %rd44;
shl.b64 %rd49, %rd48, 2;
add.s64 %rd50, %rd3, %rd49;
ld.global.nc.f32 %f88, [%rd50];
ld.global.nc.f32 %f89, [%rd47];
fma.rn.ftz.f32 %f90, %f89, %f88, %f227;
ld.global.nc.f32 %f91, [%rd50+4];
ld.global.nc.f32 %f92, [%rd47+4];
fma.rn.ftz.f32 %f93, %f92, %f91, %f90;
ld.global.nc.f32 %f94, [%rd50+8];
ld.global.nc.f32 %f95, [%rd47+8];
fma.rn.ftz.f32 %f96, %f95, %f94, %f93;
ld.global.nc.f32 %f97, [%rd50+12];
ld.global.nc.f32 %f98, [%rd47+12];
fma.rn.ftz.f32 %f227, %f98, %f97, %f96;
add.s32 %r145, %r145, 4;
add.s32 %r144, %r144, -4;
setp.ne.s32 %p18, %r144, 0;
@%p18 bra $L__BB0_21;
// Dot-product tail (D & 3 elements).
$L__BB0_22:
setp.eq.s32 %p19, %r17, 0;
@%p19 bra $L__BB0_26;
setp.eq.s32 %p20, %r17, 1;
cvt.u64.u32 %rd51, %r145;
add.s64 %rd52, %rd5, %rd51;
shl.b64 %rd53, %rd52, 2;
add.s64 %rd11, %rd4, %rd53;
add.s64 %rd54, %rd10, %rd51;
shl.b64 %rd55, %rd54, 2;
add.s64 %rd12, %rd3, %rd55;
ld.global.nc.f32 %f99, [%rd12];
ld.global.nc.f32 %f100, [%rd11];
fma.rn.ftz.f32 %f227, %f100, %f99, %f227;
@%p20 bra $L__BB0_26;
setp.eq.s32 %p21, %r17, 2;
ld.global.nc.f32 %f101, [%rd12+4];
ld.global.nc.f32 %f102, [%rd11+4];
fma.rn.ftz.f32 %f227, %f102, %f101, %f227;
@%p21 bra $L__BB0_26;
ld.global.nc.f32 %f103, [%rd12+8];
ld.global.nc.f32 %f104, [%rd11+8];
fma.rn.ftz.f32 %f227, %f104, %f103, %f227;
// Scale, fold into the running max, advance j.
$L__BB0_26:
mul.ftz.f32 %f105, %f227, %f59;
setp.gt.ftz.f32 %p22, %f105, %f229;
selp.f32 %f229, %f105, %f229, %p22;
add.s32 %r142, %r142, 1;
setp.lt.u32 %p23, %r142, %r85;
@%p23 bra $L__BB0_19;
bra.uni $L__BB0_29;
// Masked variant with D == 0: every dot product is empty, so each score is
// scale * 0.0 (%f11). The loop still runs per j and honours the mask.
$L__BB0_14:
mul.ftz.f32 %f11, %f59, 0f00000000;
mov.f32 %f229, 0fFF7FFFFF;
mov.u32 %r141, 0;
$L__BB0_15:
setp.gt.u32 %p13, %r141, %r2;
@%p13 bra $L__BB0_29;
setp.gt.ftz.f32 %p14, %f11, %f229;
selp.f32 %f229, %f11, %f229, %p14;
add.s32 %r141, %r141, 1;
setp.lt.u32 %p15, %r141, %r85;
@%p15 bra $L__BB0_15;
bra.uni $L__BB0_29;
// Unmasked variant with D == 0: same as above without the mask test.
$L__BB0_27:
mul.ftz.f32 %f24, %f59, 0f00000000;
mov.f32 %f229, 0fFF7FFFFF;
mov.u32 %r146, 0;
$L__BB0_28:
setp.gt.ftz.f32 %p24, %f24, %f229;
selp.f32 %f229, %f24, %f229, %p24;
add.s32 %r146, %r146, 1;
setp.lt.u32 %p25, %r146, %r85;
@%p25 bra $L__BB0_28;
// ============================================================================
// PASS 2: zero this thread's output row out[%rd5 .. %rd5 + D).
// %p26 (D == 0) skips it and is reused by the later passes.
// ============================================================================
$L__BB0_29:
setp.eq.s32 %p26, %r86, 0;
@%p26 bra $L__BB0_35;
add.s32 %r116, %r86, -1;
and.b32 %r151, %r86, 3;
setp.lt.u32 %p27, %r116, 3;
mov.u32 %r149, 0;
@%p27 bra $L__BB0_33;
sub.s32 %r148, %r86, %r151;
mov.u32 %r117, 0;
mov.u32 %r149, %r117;
// Four zero stores per iteration (the 32-bit pattern 0 is +0.0f).
$L__BB0_32:
cvt.u64.u32 %rd56, %r149;
add.s64 %rd57, %rd5, %rd56;
shl.b64 %rd58, %rd57, 2;
add.s64 %rd59, %rd1, %rd58;
st.global.u32 [%rd59], %r117;
st.global.u32 [%rd59+4], %r117;
st.global.u32 [%rd59+8], %r117;
st.global.u32 [%rd59+12], %r117;
add.s32 %r149, %r149, 4;
add.s32 %r148, %r148, -4;
setp.ne.s32 %p28, %r148, 0;
@%p28 bra $L__BB0_32;
// Tail: one zero store per remaining element (%r151 = D & 3 counts down).
$L__BB0_33:
setp.eq.s32 %p29, %r151, 0;
@%p29 bra $L__BB0_35;
$L__BB0_34:
.pragma "nounroll";
cvt.u64.u32 %rd60, %r149;
add.s64 %rd61, %rd5, %rd60;
shl.b64 %rd62, %rd61, 2;
add.s64 %rd63, %rd1, %rd62;
mov.u32 %r119, 0;
st.global.u32 [%rd63], %r119;
add.s32 %r149, %r149, 1;
add.s32 %r151, %r151, -1;
setp.ne.s32 %p30, %r151, 0;
@%p30 bra $L__BB0_34;
// ============================================================================
// PASS 3: for every j, recompute the score, form the weight
//     w = ex2.approx((score - max) * log2(e))   (an approximate e^(score-max))
// add it to the running sum %f242, and accumulate out[row] += w * C[j].
// The accumulation is a read-modify-write on the output row in global memory.
// %f243 (reciprocal of the sum) defaults to 0. If param_8 == 0 (%p2) the
// pass is skipped with sum = 0.
// ============================================================================
$L__BB0_35:
mov.f32 %f243, 0f00000000;
mov.f32 %f242, %f243;
@%p2 bra $L__BB0_77;
// Variant selection mirrors pass 1:
// mask flag == 0 -> BB0_58; masked with D == 0 -> BB0_55.
setp.eq.s32 %p32, %r87, 0;
cvt.u64.u32 %rd13, %r3;
add.s32 %r39, %r86, -1;
@%p32 bra $L__BB0_58;
@%p26 bra $L__BB0_55;
and.b32 %r40, %r86, 3;
sub.s32 %r41, %r86, %r40;
mov.f32 %f242, 0f00000000;
mov.u32 %r152, 0;
// Masked variant, D > 0. Outer loop over j = %r152; stops once j > row index.
$L__BB0_39:
setp.gt.u32 %p34, %r152, %r2;
@%p34 bra $L__BB0_77;
// %p35 (D < 4) is reused below to skip the unrolled accumulate loop as well.
// %rd14 = element offset of row j in B and in C.
setp.lt.u32 %p35, %r39, 3;
mul.lo.s32 %r122, %r152, %r86;
cvt.u64.u32 %rd64, %r122;
add.s64 %rd14, %rd64, %rd13;
mov.f32 %f234, 0f00000000;
mov.u32 %r155, 0;
@%p35 bra $L__BB0_43;
mov.f32 %f234, 0f00000000;
mov.u32 %r155, 0;
mov.u32 %r154, %r41;
// Recompute dot(A[row], B[j]) into %f234, four elements per iteration.
$L__BB0_42:
cvt.u64.u32 %rd65, %r155;
add.s64 %rd66, %rd5, %rd65;
shl.b64 %rd67, %rd66, 2;
add.s64 %rd68, %rd4, %rd67;
add.s64 %rd69, %rd14, %rd65;
shl.b64 %rd70, %rd69, 2;
add.s64 %rd71, %rd3, %rd70;
ld.global.nc.f32 %f112, [%rd71];
ld.global.nc.f32 %f113, [%rd68];
fma.rn.ftz.f32 %f114, %f113, %f112, %f234;
ld.global.nc.f32 %f115, [%rd71+4];
ld.global.nc.f32 %f116, [%rd68+4];
fma.rn.ftz.f32 %f117, %f116, %f115, %f114;
ld.global.nc.f32 %f118, [%rd71+8];
ld.global.nc.f32 %f119, [%rd68+8];
fma.rn.ftz.f32 %f120, %f119, %f118, %f117;
ld.global.nc.f32 %f121, [%rd71+12];
ld.global.nc.f32 %f122, [%rd68+12];
fma.rn.ftz.f32 %f234, %f122, %f121, %f120;
add.s32 %r155, %r155, 4;
add.s32 %r154, %r154, -4;
setp.ne.s32 %p36, %r154, 0;
@%p36 bra $L__BB0_42;
// Dot-product tail. %p37 (D & 3 == 0) is reused at BB0_50.
$L__BB0_43:
setp.eq.s32 %p37, %r40, 0;
@%p37 bra $L__BB0_47;
setp.eq.s32 %p38, %r40, 1;
cvt.u64.u32 %rd72, %r155;
add.s64 %rd73, %rd5, %rd72;
shl.b64 %rd74, %rd73, 2;
add.s64 %rd15, %rd4, %rd74;
add.s64 %rd75, %rd14, %rd72;
shl.b64 %rd76, %rd75, 2;
add.s64 %rd16, %rd3, %rd76;
ld.global.nc.f32 %f123, [%rd16];
ld.global.nc.f32 %f124, [%rd15];
fma.rn.ftz.f32 %f234, %f124, %f123, %f234;
@%p38 bra $L__BB0_47;
setp.eq.s32 %p39, %r40, 2;
ld.global.nc.f32 %f125, [%rd16+4];
ld.global.nc.f32 %f126, [%rd15+4];
fma.rn.ftz.f32 %f234, %f126, %f125, %f234;
@%p39 bra $L__BB0_47;
ld.global.nc.f32 %f127, [%rd16+8];
ld.global.nc.f32 %f128, [%rd15+8];
fma.rn.ftz.f32 %f234, %f128, %f127, %f234;
// Weight: %f37 = ex2((dot * scale - max) * 0f3FB8AA3B), where 0f3FB8AA3B is
// log2(e) ~= 1.442695 as f32. Running sum: %f242 += %f37.
$L__BB0_47:
mul.ftz.f32 %f129, %f234, %f59;
sub.ftz.f32 %f130, %f129, %f229;
mul.ftz.f32 %f131, %f130, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f37, %f131;
add.ftz.f32 %f242, %f242, %f37;
mov.u32 %r158, 0;
@%p35 bra $L__BB0_50;
mov.u32 %r158, 0;
mov.u32 %r157, %r41;
// out[row, i] = %f37 * C[j, i] + out[row, i], four elements per iteration.
$L__BB0_49:
cvt.u64.u32 %rd77, %r158;
add.s64 %rd78, %rd14, %rd77;
shl.b64 %rd79, %rd78, 2;
add.s64 %rd80, %rd2, %rd79;
ld.global.nc.f32 %f132, [%rd80];
add.s64 %rd81, %rd5, %rd77;
shl.b64 %rd82, %rd81, 2;
add.s64 %rd83, %rd1, %rd82;
ld.global.f32 %f133, [%rd83];
fma.rn.ftz.f32 %f134, %f37, %f132, %f133;
st.global.f32 [%rd83], %f134;
ld.global.f32 %f135, [%rd83+4];
ld.global.nc.f32 %f136, [%rd80+4];
fma.rn.ftz.f32 %f137, %f37, %f136, %f135;
st.global.f32 [%rd83+4], %f137;
ld.global.f32 %f138, [%rd83+8];
ld.global.nc.f32 %f139, [%rd80+8];
fma.rn.ftz.f32 %f140, %f37, %f139, %f138;
st.global.f32 [%rd83+8], %f140;
ld.global.nc.f32 %f141, [%rd80+12];
ld.global.f32 %f142, [%rd83+12];
fma.rn.ftz.f32 %f143, %f37, %f141, %f142;
st.global.f32 [%rd83+12], %f143;
add.s32 %r158, %r158, 4;
add.s32 %r157, %r157, -4;
setp.ne.s32 %p41, %r157, 0;
@%p41 bra $L__BB0_49;
// Accumulate tail (D & 3 elements).
$L__BB0_50:
@%p37 bra $L__BB0_54;
setp.eq.s32 %p43, %r40, 1;
cvt.u64.u32 %rd84, %r158;
add.s64 %rd85, %rd14, %rd84;
shl.b64 %rd86, %rd85, 2;
add.s64 %rd17, %rd2, %rd86;
ld.global.nc.f32 %f144, [%rd17];
add.s64 %rd87, %rd5, %rd84;
shl.b64 %rd88, %rd87, 2;
add.s64 %rd18, %rd1, %rd88;
ld.global.f32 %f145, [%rd18];
fma.rn.ftz.f32 %f146, %f37, %f144, %f145;
st.global.f32 [%rd18], %f146;
@%p43 bra $L__BB0_54;
setp.eq.s32 %p44, %r40, 2;
ld.global.nc.f32 %f147, [%rd17+4];
ld.global.f32 %f148, [%rd18+4];
fma.rn.ftz.f32 %f149, %f37, %f147, %f148;
st.global.f32 [%rd18+4], %f149;
@%p44 bra $L__BB0_54;
ld.global.nc.f32 %f150, [%rd17+8];
ld.global.f32 %f151, [%rd18+8];
fma.rn.ftz.f32 %f152, %f37, %f150, %f151;
st.global.f32 [%rd18+8], %f152;
// Next j while j < param_8.
$L__BB0_54:
add.s32 %r152, %r152, 1;
setp.lt.u32 %p45, %r152, %r85;
@%p45 bra $L__BB0_39;
bra.uni $L__BB0_77;
// Unmasked variant of pass 3 (mask flag == 0). D == 0 goes to BB0_75.
$L__BB0_58:
@%p26 bra $L__BB0_75;
and.b32 %r56, %r86, 3;
sub.s32 %r57, %r86, %r56;
mov.f32 %f157, 0f00000000;
mov.u32 %r127, 0;
mov.u32 %r160, %r127;
mov.f32 %f242, %f157;
// Outer loop over j = %r160; %rd19 = element offset of row j in B and C.
// %p49 (D < 4) is reused to skip both unrolled loops.
$L__BB0_60:
mul.lo.s32 %r129, %r160, %r86;
cvt.u64.u32 %rd89, %r129;
add.s64 %rd19, %rd89, %rd13;
setp.lt.u32 %p49, %r39, 3;
mov.u32 %r163, %r127;
mov.f32 %f240, %f157;
@%p49 bra $L__BB0_63;
mov.f32 %f240, 0f00000000;
mov.u32 %r163, 0;
mov.u32 %r162, %r57;
// Recompute dot(A[row], B[j]) into %f240, four elements per iteration.
$L__BB0_62:
cvt.u64.u32 %rd90, %r163;
add.s64 %rd91, %rd5, %rd90;
shl.b64 %rd92, %rd91, 2;
add.s64 %rd93, %rd4, %rd92;
add.s64 %rd94, %rd19, %rd90;
shl.b64 %rd95, %rd94, 2;
add.s64 %rd96, %rd3, %rd95;
ld.global.nc.f32 %f161, [%rd96];
ld.global.nc.f32 %f162, [%rd93];
fma.rn.ftz.f32 %f163, %f162, %f161, %f240;
ld.global.nc.f32 %f164, [%rd96+4];
ld.global.nc.f32 %f165, [%rd93+4];
fma.rn.ftz.f32 %f166, %f165, %f164, %f163;
ld.global.nc.f32 %f167, [%rd96+8];
ld.global.nc.f32 %f168, [%rd93+8];
fma.rn.ftz.f32 %f169, %f168, %f167, %f166;
ld.global.nc.f32 %f170, [%rd96+12];
ld.global.nc.f32 %f171, [%rd93+12];
fma.rn.ftz.f32 %f240, %f171, %f170, %f169;
add.s32 %r163, %r163, 4;
add.s32 %r162, %r162, -4;
setp.ne.s32 %p50, %r162, 0;
@%p50 bra $L__BB0_62;
// Dot-product tail. %p51 (D & 3 == 0) is reused at BB0_70.
$L__BB0_63:
setp.eq.s32 %p51, %r56, 0;
@%p51 bra $L__BB0_67;
setp.eq.s32 %p52, %r56, 1;
cvt.u64.u32 %rd97, %r163;
add.s64 %rd98, %rd5, %rd97;
shl.b64 %rd99, %rd98, 2;
add.s64 %rd20, %rd4, %rd99;
add.s64 %rd100, %rd19, %rd97;
shl.b64 %rd101, %rd100, 2;
add.s64 %rd21, %rd3, %rd101;
ld.global.nc.f32 %f172, [%rd21];
ld.global.nc.f32 %f173, [%rd20];
fma.rn.ftz.f32 %f240, %f173, %f172, %f240;
@%p52 bra $L__BB0_67;
setp.eq.s32 %p53, %r56, 2;
ld.global.nc.f32 %f174, [%rd21+4];
ld.global.nc.f32 %f175, [%rd20+4];
fma.rn.ftz.f32 %f240, %f175, %f174, %f240;
@%p53 bra $L__BB0_67;
ld.global.nc.f32 %f176, [%rd21+8];
ld.global.nc.f32 %f177, [%rd20+8];
fma.rn.ftz.f32 %f240, %f177, %f176, %f240;
// Weight %f51 = ex2((dot * scale - max) * log2(e)); running sum %f242 += %f51.
$L__BB0_67:
mul.ftz.f32 %f178, %f240, %f59;
sub.ftz.f32 %f179, %f178, %f229;
mul.ftz.f32 %f180, %f179, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f51, %f180;
add.ftz.f32 %f242, %f242, %f51;
mov.u32 %r166, 0;
@%p49 bra $L__BB0_70;
mov.u32 %r166, 0;
mov.u32 %r165, %r57;
// out[row, i] = %f51 * C[j, i] + out[row, i], four elements per iteration.
$L__BB0_69:
cvt.u64.u32 %rd102, %r166;
add.s64 %rd103, %rd19, %rd102;
shl.b64 %rd104, %rd103, 2;
add.s64 %rd105, %rd2, %rd104;
ld.global.nc.f32 %f181, [%rd105];
add.s64 %rd106, %rd5, %rd102;
shl.b64 %rd107, %rd106, 2;
add.s64 %rd108, %rd1, %rd107;
ld.global.f32 %f182, [%rd108];
fma.rn.ftz.f32 %f183, %f51, %f181, %f182;
st.global.f32 [%rd108], %f183;
ld.global.f32 %f184, [%rd108+4];
ld.global.nc.f32 %f185, [%rd105+4];
fma.rn.ftz.f32 %f186, %f51, %f185, %f184;
st.global.f32 [%rd108+4], %f186;
ld.global.f32 %f187, [%rd108+8];
ld.global.nc.f32 %f188, [%rd105+8];
fma.rn.ftz.f32 %f189, %f51, %f188, %f187;
st.global.f32 [%rd108+8], %f189;
ld.global.nc.f32 %f190, [%rd105+12];
ld.global.f32 %f191, [%rd108+12];
fma.rn.ftz.f32 %f192, %f51, %f190, %f191;
st.global.f32 [%rd108+12], %f192;
add.s32 %r166, %r166, 4;
add.s32 %r165, %r165, -4;
setp.ne.s32 %p55, %r165, 0;
@%p55 bra $L__BB0_69;
// Accumulate tail (D & 3 elements).
$L__BB0_70:
@%p51 bra $L__BB0_74;
setp.eq.s32 %p57, %r56, 1;
cvt.u64.u32 %rd109, %r166;
add.s64 %rd110, %rd19, %rd109;
shl.b64 %rd111, %rd110, 2;
add.s64 %rd22, %rd2, %rd111;
ld.global.nc.f32 %f193, [%rd22];
add.s64 %rd112, %rd5, %rd109;
shl.b64 %rd113, %rd112, 2;
add.s64 %rd23, %rd1, %rd113;
ld.global.f32 %f194, [%rd23];
fma.rn.ftz.f32 %f195, %f51, %f193, %f194;
st.global.f32 [%rd23], %f195;
@%p57 bra $L__BB0_74;
setp.eq.s32 %p58, %r56, 2;
ld.global.nc.f32 %f196, [%rd22+4];
ld.global.f32 %f197, [%rd23+4];
fma.rn.ftz.f32 %f198, %f51, %f196, %f197;
st.global.f32 [%rd23+4], %f198;
@%p58 bra $L__BB0_74;
ld.global.nc.f32 %f199, [%rd22+8];
ld.global.f32 %f200, [%rd23+8];
fma.rn.ftz.f32 %f201, %f51, %f199, %f200;
st.global.f32 [%rd23+8], %f201;
// Next j while j < param_8.
$L__BB0_74:
add.s32 %r160, %r160, 1;
setp.lt.u32 %p59, %r160, %r85;
@%p59 bra $L__BB0_60;
bra.uni $L__BB0_77;
// Masked variant with D == 0: there is no output row to update, so only the
// weight sum is formed. The weight %f156 is the same for every j and is
// computed once before the loop.
$L__BB0_55:
mul.ftz.f32 %f154, %f59, 0f00000000;
mov.f32 %f242, 0f00000000;
sub.ftz.f32 %f155, %f154, %f229;
mul.ftz.f32 %f39, %f155, 0f3FB8AA3B;
mov.u32 %r159, 0;
ex2.approx.ftz.f32 %f156, %f39;
$L__BB0_56:
setp.gt.u32 %p46, %r159, %r2;
@%p46 bra $L__BB0_77;
add.ftz.f32 %f242, %f242, %f156;
add.s32 %r159, %r159, 1;
setp.lt.u32 %p47, %r159, %r85;
@%p47 bra $L__BB0_56;
bra.uni $L__BB0_77;
// Unmasked variant with D == 0: as above, without the mask test.
$L__BB0_75:
mul.ftz.f32 %f203, %f59, 0f00000000;
mov.f32 %f242, 0f00000000;
sub.ftz.f32 %f204, %f203, %f229;
mul.ftz.f32 %f205, %f204, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f53, %f205;
mov.u32 %r167, 0;
$L__BB0_76:
add.ftz.f32 %f242, %f242, %f53;
add.s32 %r167, %r167, 1;
setp.lt.u32 %p60, %r167, %r85;
@%p60 bra $L__BB0_76;
// ============================================================================
// PASS 4: normalise. %f243 = rcp.approx(sum) when sum > 0. If the sum is
// <= 0 or NaN (setp.leu) %f243 keeps its 0.0 default, so the row is
// multiplied by zero instead of being divided by a non-positive sum.
// ============================================================================
$L__BB0_77:
setp.leu.ftz.f32 %p61, %f242, 0f00000000;
@%p61 bra $L__BB0_79;
rcp.approx.ftz.f32 %f243, %f242;
// Nothing to scale when D == 0 (%p26).
$L__BB0_79:
@%p26 bra $L__BB0_85;
add.s32 %r135, %r86, -1;
and.b32 %r172, %r86, 3;
setp.lt.u32 %p63, %r135, 3;
mov.u32 %r170, 0;
@%p63 bra $L__BB0_83;
sub.s32 %r169, %r86, %r172;
mov.u32 %r170, 0;
// out[row, i] *= %f243, four elements per iteration.
$L__BB0_82:
cvt.u64.u32 %rd114, %r170;
add.s64 %rd115, %rd5, %rd114;
shl.b64 %rd116, %rd115, 2;
add.s64 %rd117, %rd1, %rd116;
ld.global.f32 %f207, [%rd117];
mul.ftz.f32 %f208, %f243, %f207;
st.global.f32 [%rd117], %f208;
ld.global.f32 %f209, [%rd117+4];
mul.ftz.f32 %f210, %f243, %f209;
st.global.f32 [%rd117+4], %f210;
ld.global.f32 %f211, [%rd117+8];
mul.ftz.f32 %f212, %f243, %f211;
st.global.f32 [%rd117+8], %f212;
ld.global.f32 %f213, [%rd117+12];
mul.ftz.f32 %f214, %f243, %f213;
st.global.f32 [%rd117+12], %f214;
add.s32 %r170, %r170, 4;
add.s32 %r169, %r169, -4;
setp.ne.s32 %p64, %r169, 0;
@%p64 bra $L__BB0_82;
// Scaling tail: one element per iteration for the remaining D & 3 entries.
$L__BB0_83:
setp.eq.s32 %p65, %r172, 0;
@%p65 bra $L__BB0_85;
$L__BB0_84:
.pragma "nounroll";
cvt.u64.u32 %rd118, %r170;
add.s64 %rd119, %rd5, %rd118;
shl.b64 %rd120, %rd119, 2;
add.s64 %rd121, %rd1, %rd120;
ld.global.f32 %f215, [%rd121];
mul.ftz.f32 %f216, %f243, %f215;
st.global.f32 [%rd121], %f216;
add.s32 %r170, %r170, 1;
add.s32 %r172, %r172, -1;
setp.ne.s32 %p66, %r172, 0;
@%p66 bra $L__BB0_84;
// Common exit: out-of-range threads and finished threads both land here.
$L__BB0_85:
ret;
}
// .globl fused_attn_decode_f32
.visible .entry fused_attn_decode_f32(
.param .u64 fused_attn_decode_f32_param_0,
.param .u64 fused_attn_decode_f32_param_1,
.param .u64 fused_attn_decode_f32_param_2,
.param .u64 fused_attn_decode_f32_param_3,
.param .u32 fused_attn_decode_f32_param_4,
.param .u32 fused_attn_decode_f32_param_5,
.param .u32 fused_attn_decode_f32_param_6,
.param .u32 fused_attn_decode_f32_param_7,
.param .u32 fused_attn_decode_f32_param_8,
.param .f32 fused_attn_decode_f32_param_9
)
{
.reg .pred %p<177>;
.reg .f32 %f<435>;
.reg .b32 %r<161>;
.reg .b64 %rd<98>;
ld.param.u64 %rd27, [fused_attn_decode_f32_param_0];
ld.param.u64 %rd25, [fused_attn_decode_f32_param_1];
ld.param.u64 %rd26, [fused_attn_decode_f32_param_2];
ld.param.u64 %rd28, [fused_attn_decode_f32_param_3];
ld.param.u32 %r40, [fused_attn_decode_f32_param_4];
ld.param.u32 %r41, [fused_attn_decode_f32_param_5];
ld.param.u32 %r42, [fused_attn_decode_f32_param_6];
ld.param.u32 %r43, [fused_attn_decode_f32_param_7];
ld.param.u32 %r44, [fused_attn_decode_f32_param_8];
cvta.to.global.u64 %rd1, %rd27;
cvta.to.global.u64 %rd2, %rd28;
mov.u32 %r1, %ctaid.x;
setp.ge.u32 %p2, %r1, %r41;
mov.u32 %r2, %tid.x;
setp.gt.u32 %p3, %r2, 31;
or.pred %p4, %p3, %p2;
@%p4 bra $L__BB1_196;
add.s32 %r3, %r43, 31;
shr.u32 %r4, %r3, 5;
mul.lo.s32 %r5, %r1, %r43;
setp.eq.s32 %p5, %r4, 0;
setp.ge.u32 %p6, %r2, %r43;
mov.f32 %f336, 0f00000000;
or.pred %p7, %p6, %p5;
mov.f32 %f335, %f336;
@%p7 bra $L__BB1_3;
add.s32 %r45, %r2, %r5;
mul.wide.u32 %rd29, %r45, 4;
add.s64 %rd30, %rd1, %rd29;
ld.global.nc.f32 %f335, [%rd30];
$L__BB1_3:
setp.lt.u32 %p8, %r3, 64;
@%p8 bra $L__BB1_6;
add.s32 %r6, %r2, 32;
setp.ge.u32 %p9, %r6, %r43;
@%p9 bra $L__BB1_6;
add.s32 %r46, %r6, %r5;
mul.wide.u32 %rd31, %r46, 4;
add.s64 %rd32, %rd1, %rd31;
ld.global.nc.f32 %f336, [%rd32];
$L__BB1_6:
setp.lt.u32 %p10, %r3, 96;
mov.f32 %f338, 0f00000000;
mov.f32 %f337, %f338;
@%p10 bra $L__BB1_9;
add.s32 %r7, %r2, 64;
setp.ge.u32 %p11, %r7, %r43;
@%p11 bra $L__BB1_9;
add.s32 %r47, %r7, %r5;
mul.wide.u32 %rd33, %r47, 4;
add.s64 %rd34, %rd1, %rd33;
ld.global.nc.f32 %f337, [%rd34];
$L__BB1_9:
setp.lt.u32 %p12, %r3, 128;
@%p12 bra $L__BB1_12;
add.s32 %r8, %r2, 96;
setp.ge.u32 %p13, %r8, %r43;
@%p13 bra $L__BB1_12;
add.s32 %r48, %r8, %r5;
mul.wide.u32 %rd35, %r48, 4;
add.s64 %rd36, %rd1, %rd35;
ld.global.nc.f32 %f338, [%rd36];
$L__BB1_12:
setp.lt.u32 %p14, %r3, 160;
mov.f32 %f340, 0f00000000;
mov.f32 %f339, %f340;
@%p14 bra $L__BB1_15;
add.s32 %r9, %r2, 128;
setp.ge.u32 %p15, %r9, %r43;
@%p15 bra $L__BB1_15;
add.s32 %r49, %r9, %r5;
mul.wide.u32 %rd37, %r49, 4;
add.s64 %rd38, %rd1, %rd37;
ld.global.nc.f32 %f339, [%rd38];
$L__BB1_15:
setp.lt.u32 %p16, %r3, 192;
@%p16 bra $L__BB1_18;
add.s32 %r10, %r2, 160;
setp.ge.u32 %p17, %r10, %r43;
@%p17 bra $L__BB1_18;
add.s32 %r50, %r10, %r5;
mul.wide.u32 %rd39, %r50, 4;
add.s64 %rd40, %rd1, %rd39;
ld.global.nc.f32 %f340, [%rd40];
$L__BB1_18:
setp.lt.u32 %p18, %r3, 224;
mov.f32 %f342, 0f00000000;
mov.f32 %f341, %f342;
@%p18 bra $L__BB1_21;
add.s32 %r11, %r2, 192;
setp.ge.u32 %p19, %r11, %r43;
@%p19 bra $L__BB1_21;
add.s32 %r51, %r11, %r5;
mul.wide.u32 %rd41, %r51, 4;
add.s64 %rd42, %rd1, %rd41;
ld.global.nc.f32 %f341, [%rd42];
$L__BB1_21:
setp.lt.u32 %p20, %r3, 256;
@%p20 bra $L__BB1_24;
add.s32 %r12, %r2, 224;
setp.ge.u32 %p21, %r12, %r43;
@%p21 bra $L__BB1_24;
add.s32 %r52, %r12, %r5;
mul.wide.u32 %rd43, %r52, 4;
add.s64 %rd44, %rd1, %rd43;
ld.global.nc.f32 %f342, [%rd44];
$L__BB1_24:
setp.lt.u32 %p22, %r3, 288;
mov.f32 %f344, 0f00000000;
mov.f32 %f343, %f344;
@%p22 bra $L__BB1_27;
add.s32 %r13, %r2, 256;
setp.ge.u32 %p23, %r13, %r43;
@%p23 bra $L__BB1_27;
add.s32 %r53, %r13, %r5;
mul.wide.u32 %rd45, %r53, 4;
add.s64 %rd46, %rd1, %rd45;
ld.global.nc.f32 %f343, [%rd46];
$L__BB1_27:
setp.lt.u32 %p24, %r3, 320;
@%p24 bra $L__BB1_30;
add.s32 %r14, %r2, 288;
setp.ge.u32 %p25, %r14, %r43;
@%p25 bra $L__BB1_30;
add.s32 %r54, %r14, %r5;
mul.wide.u32 %rd47, %r54, 4;
add.s64 %rd48, %rd1, %rd47;
ld.global.nc.f32 %f344, [%rd48];
$L__BB1_30:
setp.lt.u32 %p26, %r3, 352;
mov.f32 %f346, 0f00000000;
mov.f32 %f345, %f346;
@%p26 bra $L__BB1_33;
add.s32 %r15, %r2, 320;
setp.ge.u32 %p27, %r15, %r43;
@%p27 bra $L__BB1_33;
add.s32 %r55, %r15, %r5;
mul.wide.u32 %rd49, %r55, 4;
add.s64 %rd50, %rd1, %rd49;
ld.global.nc.f32 %f345, [%rd50];
$L__BB1_33:
setp.lt.u32 %p28, %r3, 384;
@%p28 bra $L__BB1_36;
add.s32 %r16, %r2, 352;
setp.ge.u32 %p29, %r16, %r43;
@%p29 bra $L__BB1_36;
add.s32 %r56, %r16, %r5;
mul.wide.u32 %rd51, %r56, 4;
add.s64 %rd52, %rd1, %rd51;
ld.global.nc.f32 %f346, [%rd52];
$L__BB1_36:
setp.lt.u32 %p30, %r3, 416;
mov.f32 %f348, 0f00000000;
mov.f32 %f347, %f348;
@%p30 bra $L__BB1_39;
add.s32 %r17, %r2, 384;
setp.ge.u32 %p31, %r17, %r43;
@%p31 bra $L__BB1_39;
add.s32 %r57, %r17, %r5;
mul.wide.u32 %rd53, %r57, 4;
add.s64 %rd54, %rd1, %rd53;
ld.global.nc.f32 %f347, [%rd54];
$L__BB1_39:
setp.lt.u32 %p32, %r3, 448;
@%p32 bra $L__BB1_42;
add.s32 %r18, %r2, 416;
setp.ge.u32 %p33, %r18, %r43;
@%p33 bra $L__BB1_42;
add.s32 %r58, %r18, %r5;
mul.wide.u32 %rd55, %r58, 4;
add.s64 %rd56, %rd1, %rd55;
ld.global.nc.f32 %f348, [%rd56];
$L__BB1_42:
setp.lt.u32 %p34, %r3, 480;
mov.f32 %f350, 0f00000000;
mov.f32 %f349, %f350;
@%p34 bra $L__BB1_45;
add.s32 %r19, %r2, 448;
setp.ge.u32 %p35, %r19, %r43;
@%p35 bra $L__BB1_45;
add.s32 %r59, %r19, %r5;
mul.wide.u32 %rd57, %r59, 4;
add.s64 %rd58, %rd1, %rd57;
ld.global.nc.f32 %f349, [%rd58];
$L__BB1_45:
setp.lt.u32 %p36, %r3, 512;
@%p36 bra $L__BB1_48;
add.s32 %r20, %r2, 480;
setp.ge.u32 %p37, %r20, %r43;
@%p37 bra $L__BB1_48;
add.s32 %r60, %r20, %r5;
mul.wide.u32 %rd59, %r60, 4;
add.s64 %rd60, %rd1, %rd59;
ld.global.nc.f32 %f350, [%rd60];
$L__BB1_48:
setp.gt.u32 %p38, %r40, %r44;
setp.ne.s32 %p39, %r44, 0;
and.pred %p40, %p38, %p39;
sub.s32 %r61, %r40, %r44;
selp.b32 %r160, %r61, 0, %p40;
setp.ge.u32 %p41, %r160, %r40;
mov.f32 %f416, 0f00000000;
mov.f32 %f414, %f416;
mov.f32 %f412, %f416;
mov.f32 %f410, %f416;
mov.f32 %f408, %f416;
mov.f32 %f406, %f416;
mov.f32 %f404, %f416;
mov.f32 %f402, %f416;
mov.f32 %f400, %f416;
mov.f32 %f398, %f416;
mov.f32 %f396, %f416;
mov.f32 %f394, %f416;
mov.f32 %f392, %f416;
mov.f32 %f390, %f416;
mov.f32 %f388, %f416;
mov.f32 %f386, %f416;
mov.f32 %f433, %f416;
@%p41 bra $L__BB1_147;
setp.lt.u32 %p42, %r2, %r43;
div.u32 %r62, %r41, %r42;
div.u32 %r63, %r1, %r62;
mul.lo.s32 %r64, %r63, %r43;
cvt.u64.u32 %rd3, %r64;
setp.ne.s32 %p43, %r4, 0;
and.pred %p1, %p43, %p42;
cvt.u64.u32 %rd4, %r2;
add.s32 %r65, %r2, 32;
cvt.u64.u32 %rd5, %r65;
add.s32 %r66, %r2, 64;
cvt.u64.u32 %rd6, %r66;
add.s32 %r67, %r2, 96;
cvt.u64.u32 %rd7, %r67;
add.s32 %r68, %r2, 128;
cvt.u64.u32 %rd8, %r68;
add.s32 %r69, %r2, 160;
cvt.u64.u32 %rd9, %r69;
add.s32 %r70, %r2, 192;
cvt.u64.u32 %rd10, %r70;
add.s32 %r71, %r2, 224;
cvt.u64.u32 %rd11, %r71;
add.s32 %r72, %r2, 256;
cvt.u64.u32 %rd12, %r72;
add.s32 %r73, %r2, 288;
cvt.u64.u32 %rd13, %r73;
add.s32 %r74, %r2, 320;
cvt.u64.u32 %rd14, %r74;
add.s32 %r75, %r2, 352;
cvt.u64.u32 %rd15, %r75;
add.s32 %r76, %r2, 384;
cvt.u64.u32 %rd16, %r76;
add.s32 %r77, %r2, 416;
cvt.u64.u32 %rd17, %r77;
add.s32 %r78, %r2, 448;
cvt.u64.u32 %rd18, %r78;
add.s32 %r79, %r2, 480;
cvt.u64.u32 %rd19, %r79;
cvta.to.global.u64 %rd20, %rd25;
cvta.to.global.u64 %rd21, %rd26;
mul.lo.s32 %r22, %r43, %r42;
mov.f32 %f368, 0fFF7FFFFF;
mov.f32 %f235, 0f00000000;
not.pred %p44, %p1;
mov.f32 %f416, %f235;
mov.f32 %f414, %f235;
mov.f32 %f412, %f235;
mov.f32 %f410, %f235;
mov.f32 %f408, %f235;
mov.f32 %f406, %f235;
mov.f32 %f404, %f235;
mov.f32 %f402, %f235;
mov.f32 %f400, %f235;
mov.f32 %f398, %f235;
mov.f32 %f396, %f235;
mov.f32 %f394, %f235;
mov.f32 %f392, %f235;
mov.f32 %f390, %f235;
mov.f32 %f388, %f235;
mov.f32 %f386, %f235;
mov.f32 %f433, %f235;
$L__BB1_50:
mul.lo.s32 %r80, %r22, %r160;
cvt.u64.u32 %rd61, %r80;
add.s64 %rd22, %rd61, %rd3;
add.s64 %rd62, %rd22, %rd4;
shl.b64 %rd63, %rd62, 2;
add.s64 %rd23, %rd20, %rd63;
mov.f32 %f370, %f235;
@%p44 bra $L__BB1_52;
ld.global.nc.f32 %f238, [%rd23];
fma.rn.ftz.f32 %f370, %f335, %f238, 0f00000000;
$L__BB1_52:
cvt.u32.u64 %r81, %rd5;
setp.ge.u32 %p45, %r81, %r43;
or.pred %p47, %p45, %p8;
@%p47 bra $L__BB1_54;
ld.global.nc.f32 %f239, [%rd23+128];
fma.rn.ftz.f32 %f370, %f336, %f239, %f370;
$L__BB1_54:
cvt.u32.u64 %r82, %rd6;
setp.ge.u32 %p48, %r82, %r43;
or.pred %p50, %p48, %p10;
@%p50 bra $L__BB1_56;
ld.global.nc.f32 %f240, [%rd23+256];
fma.rn.ftz.f32 %f370, %f337, %f240, %f370;
$L__BB1_56:
cvt.u32.u64 %r83, %rd7;
setp.ge.u32 %p51, %r83, %r43;
or.pred %p53, %p51, %p12;
@%p53 bra $L__BB1_58;
ld.global.nc.f32 %f241, [%rd23+384];
fma.rn.ftz.f32 %f370, %f338, %f241, %f370;
$L__BB1_58:
cvt.u32.u64 %r84, %rd8;
setp.ge.u32 %p54, %r84, %r43;
or.pred %p56, %p54, %p14;
@%p56 bra $L__BB1_60;
ld.global.nc.f32 %f242, [%rd23+512];
fma.rn.ftz.f32 %f370, %f339, %f242, %f370;
$L__BB1_60:
cvt.u32.u64 %r85, %rd9;
setp.ge.u32 %p57, %r85, %r43;
or.pred %p59, %p57, %p16;
@%p59 bra $L__BB1_62;
ld.global.nc.f32 %f243, [%rd23+640];
fma.rn.ftz.f32 %f370, %f340, %f243, %f370;
$L__BB1_62:
cvt.u32.u64 %r86, %rd10;
setp.ge.u32 %p60, %r86, %r43;
or.pred %p62, %p60, %p18;
@%p62 bra $L__BB1_64;
ld.global.nc.f32 %f244, [%rd23+768];
fma.rn.ftz.f32 %f370, %f341, %f244, %f370;
$L__BB1_64:
cvt.u32.u64 %r87, %rd11;
setp.ge.u32 %p63, %r87, %r43;
or.pred %p65, %p63, %p20;
@%p65 bra $L__BB1_66;
ld.global.nc.f32 %f245, [%rd23+896];
fma.rn.ftz.f32 %f370, %f342, %f245, %f370;
$L__BB1_66:
cvt.u32.u64 %r88, %rd12;
setp.ge.u32 %p66, %r88, %r43;
or.pred %p68, %p66, %p22;
@%p68 bra $L__BB1_68;
ld.global.nc.f32 %f246, [%rd23+1024];
fma.rn.ftz.f32 %f370, %f343, %f246, %f370;
$L__BB1_68:
cvt.u32.u64 %r89, %rd13;
setp.ge.u32 %p69, %r89, %r43;
or.pred %p71, %p69, %p24;
@%p71 bra $L__BB1_70;
ld.global.nc.f32 %f247, [%rd23+1152];
fma.rn.ftz.f32 %f370, %f344, %f247, %f370;
$L__BB1_70:
cvt.u32.u64 %r90, %rd14;
setp.ge.u32 %p72, %r90, %r43;
or.pred %p74, %p72, %p26;
@%p74 bra $L__BB1_72;
ld.global.nc.f32 %f248, [%rd23+1280];
fma.rn.ftz.f32 %f370, %f345, %f248, %f370;
$L__BB1_72:
cvt.u32.u64 %r91, %rd15;
setp.ge.u32 %p75, %r91, %r43;
or.pred %p77, %p75, %p28;
@%p77 bra $L__BB1_74;
ld.global.nc.f32 %f249, [%rd23+1408];
fma.rn.ftz.f32 %f370, %f346, %f249, %f370;
$L__BB1_74:
cvt.u32.u64 %r92, %rd16;
setp.ge.u32 %p78, %r92, %r43;
or.pred %p80, %p78, %p30;
@%p80 bra $L__BB1_76;
ld.global.nc.f32 %f250, [%rd23+1536];
fma.rn.ftz.f32 %f370, %f347, %f250, %f370;
$L__BB1_76:
cvt.u32.u64 %r93, %rd17;
setp.ge.u32 %p81, %r93, %r43;
or.pred %p83, %p81, %p32;
@%p83 bra $L__BB1_78;
ld.global.nc.f32 %f251, [%rd23+1664];
fma.rn.ftz.f32 %f370, %f348, %f251, %f370;
$L__BB1_78:
cvt.u32.u64 %r94, %rd18;
setp.ge.u32 %p84, %r94, %r43;
or.pred %p86, %p84, %p34;
@%p86 bra $L__BB1_80;
ld.global.nc.f32 %f252, [%rd23+1792];
fma.rn.ftz.f32 %f370, %f349, %f252, %f370;
$L__BB1_80:
cvt.u32.u64 %r95, %rd19;
setp.ge.u32 %p87, %r95, %r43;
or.pred %p89, %p87, %p36;
@%p89 bra $L__BB1_82;
ld.global.nc.f32 %f253, [%rd23+1920];
fma.rn.ftz.f32 %f370, %f350, %f253, %f370;
$L__BB1_82:
ld.param.f32 %f334, [fused_attn_decode_f32_param_9];
mov.b32 %r96, %f370;
mov.u32 %r97, 31;
mov.u32 %r98, 16;
mov.u32 %r99, -1;
shfl.sync.bfly.b32 %r100|%p90, %r96, %r98, %r97, %r99;
mov.b32 %f254, %r100;
add.ftz.f32 %f255, %f370, %f254;
mov.b32 %r101, %f255;
mov.u32 %r102, 8;
shfl.sync.bfly.b32 %r103|%p91, %r101, %r102, %r97, %r99;
mov.b32 %f256, %r103;
add.ftz.f32 %f257, %f255, %f256;
mov.b32 %r104, %f257;
mov.u32 %r105, 4;
shfl.sync.bfly.b32 %r106|%p92, %r104, %r105, %r97, %r99;
mov.b32 %f258, %r106;
add.ftz.f32 %f259, %f257, %f258;
mov.b32 %r107, %f259;
mov.u32 %r108, 2;
shfl.sync.bfly.b32 %r109|%p93, %r107, %r108, %r97, %r99;
mov.b32 %f260, %r109;
add.ftz.f32 %f261, %f259, %f260;
mov.b32 %r110, %f261;
mov.u32 %r111, 1;
shfl.sync.bfly.b32 %r112|%p94, %r110, %r111, %r97, %r99;
mov.b32 %f262, %r112;
add.ftz.f32 %f263, %f261, %f262;
mul.ftz.f32 %f264, %f263, %f334;
max.ftz.f32 %f83, %f368, %f264;
sub.ftz.f32 %f265, %f368, %f83;
mul.ftz.f32 %f266, %f265, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f84, %f266;
sub.ftz.f32 %f267, %f264, %f83;
mul.ftz.f32 %f268, %f267, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f85, %f268;
add.s64 %rd24, %rd21, %rd63;
@%p5 bra $L__BB1_86;
mov.f32 %f385, 0f00000000;
@%p6 bra $L__BB1_85;
ld.global.nc.f32 %f385, [%rd24];
$L__BB1_85:
mul.ftz.f32 %f270, %f84, %f386;
fma.rn.ftz.f32 %f386, %f85, %f385, %f270;
$L__BB1_86:
@%p8 bra $L__BB1_90;
cvt.u32.u64 %r159, %rd5;
setp.ge.u32 %p176, %r159, %r43;
mov.f32 %f387, 0f00000000;
@%p176 bra $L__BB1_89;
ld.global.nc.f32 %f387, [%rd24+128];
$L__BB1_89:
mul.ftz.f32 %f272, %f84, %f388;
fma.rn.ftz.f32 %f388, %f85, %f387, %f272;
$L__BB1_90:
@%p10 bra $L__BB1_94;
cvt.u32.u64 %r158, %rd6;
setp.ge.u32 %p175, %r158, %r43;
mov.f32 %f389, 0f00000000;
@%p175 bra $L__BB1_93;
ld.global.nc.f32 %f389, [%rd24+256];
$L__BB1_93:
mul.ftz.f32 %f274, %f84, %f390;
fma.rn.ftz.f32 %f390, %f85, %f389, %f274;
$L__BB1_94:
@%p12 bra $L__BB1_98;
cvt.u32.u64 %r157, %rd7;
setp.ge.u32 %p174, %r157, %r43;
mov.f32 %f391, 0f00000000;
@%p174 bra $L__BB1_97;
ld.global.nc.f32 %f391, [%rd24+384];
$L__BB1_97:
mul.ftz.f32 %f276, %f84, %f392;
fma.rn.ftz.f32 %f392, %f85, %f391, %f276;
$L__BB1_98:
@%p14 bra $L__BB1_102;
cvt.u32.u64 %r156, %rd8;
setp.ge.u32 %p173, %r156, %r43;
mov.f32 %f393, 0f00000000;
@%p173 bra $L__BB1_101;
ld.global.nc.f32 %f393, [%rd24+512];
$L__BB1_101:
mul.ftz.f32 %f278, %f84, %f394;
fma.rn.ftz.f32 %f394, %f85, %f393, %f278;
$L__BB1_102:
@%p16 bra $L__BB1_106;
cvt.u32.u64 %r155, %rd9;
setp.ge.u32 %p172, %r155, %r43;
mov.f32 %f395, 0f00000000;
@%p172 bra $L__BB1_105;
ld.global.nc.f32 %f395, [%rd24+640];
$L__BB1_105:
mul.ftz.f32 %f280, %f84, %f396;
fma.rn.ftz.f32 %f396, %f85, %f395, %f280;
$L__BB1_106:
@%p18 bra $L__BB1_110;
cvt.u32.u64 %r154, %rd10;
setp.ge.u32 %p171, %r154, %r43;
mov.f32 %f397, 0f00000000;
@%p171 bra $L__BB1_109;
ld.global.nc.f32 %f397, [%rd24+768];
$L__BB1_109:
mul.ftz.f32 %f282, %f84, %f398;
fma.rn.ftz.f32 %f398, %f85, %f397, %f282;
$L__BB1_110:
@%p20 bra $L__BB1_114;
cvt.u32.u64 %r153, %rd11;
setp.ge.u32 %p170, %r153, %r43;
mov.f32 %f399, 0f00000000;
@%p170 bra $L__BB1_113;
ld.global.nc.f32 %f399, [%rd24+896];
$L__BB1_113:
mul.ftz.f32 %f284, %f84, %f400;
fma.rn.ftz.f32 %f400, %f85, %f399, %f284;
$L__BB1_114:
@%p22 bra $L__BB1_118;
cvt.u32.u64 %r152, %rd12;
setp.ge.u32 %p169, %r152, %r43;
mov.f32 %f401, 0f00000000;
@%p169 bra $L__BB1_117;
ld.global.nc.f32 %f401, [%rd24+1024];
$L__BB1_117:
mul.ftz.f32 %f286, %f84, %f402;
fma.rn.ftz.f32 %f402, %f85, %f401, %f286;
$L__BB1_118:
@%p24 bra $L__BB1_122;
cvt.u32.u64 %r151, %rd13;
setp.ge.u32 %p168, %r151, %r43;
mov.f32 %f403, 0f00000000;
@%p168 bra $L__BB1_121;
ld.global.nc.f32 %f403, [%rd24+1152];
$L__BB1_121:
mul.ftz.f32 %f288, %f84, %f404;
fma.rn.ftz.f32 %f404, %f85, %f403, %f288;
$L__BB1_122:
@%p26 bra $L__BB1_126;
cvt.u32.u64 %r150, %rd14;
setp.ge.u32 %p167, %r150, %r43;
mov.f32 %f405, 0f00000000;
@%p167 bra $L__BB1_125;
ld.global.nc.f32 %f405, [%rd24+1280];
$L__BB1_125:
mul.ftz.f32 %f290, %f84, %f406;
fma.rn.ftz.f32 %f406, %f85, %f405, %f290;
$L__BB1_126:
@%p28 bra $L__BB1_130;
cvt.u32.u64 %r149, %rd15;
setp.ge.u32 %p166, %r149, %r43;
mov.f32 %f407, 0f00000000;
@%p166 bra $L__BB1_129;
ld.global.nc.f32 %f407, [%rd24+1408];
$L__BB1_129:
mul.ftz.f32 %f292, %f84, %f408;
fma.rn.ftz.f32 %f408, %f85, %f407, %f292;
$L__BB1_130:
@%p30 bra $L__BB1_134;
cvt.u32.u64 %r148, %rd16;
setp.ge.u32 %p165, %r148, %r43;
mov.f32 %f409, 0f00000000;
@%p165 bra $L__BB1_133;
ld.global.nc.f32 %f409, [%rd24+1536];
$L__BB1_133:
mul.ftz.f32 %f294, %f84, %f410;
fma.rn.ftz.f32 %f410, %f85, %f409, %f294;
$L__BB1_134:
@%p32 bra $L__BB1_138;
cvt.u32.u64 %r147, %rd17;
setp.ge.u32 %p164, %r147, %r43;
mov.f32 %f411, 0f00000000;
@%p164 bra $L__BB1_137;
ld.global.nc.f32 %f411, [%rd24+1664];
$L__BB1_137:
mul.ftz.f32 %f296, %f84, %f412;
fma.rn.ftz.f32 %f412, %f85, %f411, %f296;
$L__BB1_138:
@%p34 bra $L__BB1_142;
cvt.u32.u64 %r146, %rd18;
setp.ge.u32 %p163, %r146, %r43;
mov.f32 %f413, 0f00000000;
@%p163 bra $L__BB1_141;
ld.global.nc.f32 %f413, [%rd24+1792];
$L__BB1_141:
mul.ftz.f32 %f298, %f84, %f414;
fma.rn.ftz.f32 %f414, %f85, %f413, %f298;
$L__BB1_142:
@%p36 bra $L__BB1_146;
cvt.u32.u64 %r145, %rd19;
setp.ge.u32 %p162, %r145, %r43;
mov.f32 %f415, 0f00000000;
@%p162 bra $L__BB1_145;
ld.global.nc.f32 %f415, [%rd24+1920];
$L__BB1_145:
mul.ftz.f32 %f300, %f84, %f416;
fma.rn.ftz.f32 %f416, %f85, %f415, %f300;
$L__BB1_146:
ld.param.u32 %r144, [fused_attn_decode_f32_param_4];
fma.rn.ftz.f32 %f433, %f433, %f84, %f85;
add.s32 %r160, %r160, 1;
setp.lt.u32 %p127, %r160, %r144;
mov.f32 %f368, %f83;
@%p127 bra $L__BB1_50;
$L__BB1_147:
mov.f32 %f434, 0f00000000;
setp.leu.ftz.f32 %p128, %f433, 0f00000000;
@%p128 bra $L__BB1_149;
rcp.approx.ftz.f32 %f434, %f433;
$L__BB1_149:
or.pred %p131, %p5, %p6;
@%p131 bra $L__BB1_151;
mul.ftz.f32 %f302, %f434, %f386;
add.s32 %r128, %r2, %r5;
mul.wide.u32 %rd66, %r128, 4;
add.s64 %rd67, %rd2, %rd66;
st.global.f32 [%rd67], %f302;
$L__BB1_151:
@%p8 bra $L__BB1_154;
add.s32 %r25, %r2, 32;
setp.ge.u32 %p133, %r25, %r43;
@%p133 bra $L__BB1_154;
mul.ftz.f32 %f303, %f434, %f388;
add.s32 %r129, %r25, %r5;
mul.wide.u32 %rd68, %r129, 4;
add.s64 %rd69, %rd2, %rd68;
st.global.f32 [%rd69], %f303;
$L__BB1_154:
@%p10 bra $L__BB1_157;
add.s32 %r26, %r2, 64;
setp.ge.u32 %p135, %r26, %r43;
@%p135 bra $L__BB1_157;
mul.ftz.f32 %f304, %f434, %f390;
add.s32 %r130, %r26, %r5;
mul.wide.u32 %rd70, %r130, 4;
add.s64 %rd71, %rd2, %rd70;
st.global.f32 [%rd71], %f304;
$L__BB1_157:
@%p12 bra $L__BB1_160;
add.s32 %r27, %r2, 96;
setp.ge.u32 %p137, %r27, %r43;
@%p137 bra $L__BB1_160;
mul.ftz.f32 %f305, %f434, %f392;
add.s32 %r131, %r27, %r5;
mul.wide.u32 %rd72, %r131, 4;
add.s64 %rd73, %rd2, %rd72;
st.global.f32 [%rd73], %f305;
$L__BB1_160:
@%p14 bra $L__BB1_163;
add.s32 %r28, %r2, 128;
setp.ge.u32 %p139, %r28, %r43;
@%p139 bra $L__BB1_163;
mul.ftz.f32 %f306, %f434, %f394;
add.s32 %r132, %r28, %r5;
mul.wide.u32 %rd74, %r132, 4;
add.s64 %rd75, %rd2, %rd74;
st.global.f32 [%rd75], %f306;
$L__BB1_163:
@%p16 bra $L__BB1_166;
add.s32 %r29, %r2, 160;
setp.ge.u32 %p141, %r29, %r43;
@%p141 bra $L__BB1_166;
mul.ftz.f32 %f307, %f434, %f396;
add.s32 %r133, %r29, %r5;
mul.wide.u32 %rd76, %r133, 4;
add.s64 %rd77, %rd2, %rd76;
st.global.f32 [%rd77], %f307;
$L__BB1_166:
@%p18 bra $L__BB1_169;
add.s32 %r30, %r2, 192;
setp.ge.u32 %p143, %r30, %r43;
@%p143 bra $L__BB1_169;
mul.ftz.f32 %f308, %f434, %f398;
add.s32 %r134, %r30, %r5;
mul.wide.u32 %rd78, %r134, 4;
add.s64 %rd79, %rd2, %rd78;
st.global.f32 [%rd79], %f308;
$L__BB1_169:
@%p20 bra $L__BB1_172;
add.s32 %r31, %r2, 224;
setp.ge.u32 %p145, %r31, %r43;
@%p145 bra $L__BB1_172;
mul.ftz.f32 %f309, %f434, %f400;
add.s32 %r135, %r31, %r5;
mul.wide.u32 %rd80, %r135, 4;
add.s64 %rd81, %rd2, %rd80;
st.global.f32 [%rd81], %f309;
$L__BB1_172:
@%p22 bra $L__BB1_175;
add.s32 %r32, %r2, 256;
setp.ge.u32 %p147, %r32, %r43;
@%p147 bra $L__BB1_175;
mul.ftz.f32 %f310, %f434, %f402;
add.s32 %r136, %r32, %r5;
mul.wide.u32 %rd82, %r136, 4;
add.s64 %rd83, %rd2, %rd82;
st.global.f32 [%rd83], %f310;
$L__BB1_175:
@%p24 bra $L__BB1_178;
add.s32 %r33, %r2, 288;
setp.ge.u32 %p149, %r33, %r43;
@%p149 bra $L__BB1_178;
mul.ftz.f32 %f311, %f434, %f404;
add.s32 %r137, %r33, %r5;
mul.wide.u32 %rd84, %r137, 4;
add.s64 %rd85, %rd2, %rd84;
st.global.f32 [%rd85], %f311;
$L__BB1_178:
@%p26 bra $L__BB1_181;
add.s32 %r34, %r2, 320;
setp.ge.u32 %p151, %r34, %r43;
@%p151 bra $L__BB1_181;
mul.ftz.f32 %f312, %f434, %f406;
add.s32 %r138, %r34, %r5;
mul.wide.u32 %rd86, %r138, 4;
add.s64 %rd87, %rd2, %rd86;
st.global.f32 [%rd87], %f312;
$L__BB1_181:
@%p28 bra $L__BB1_184;
add.s32 %r35, %r2, 352;
setp.ge.u32 %p153, %r35, %r43;
@%p153 bra $L__BB1_184;
mul.ftz.f32 %f313, %f434, %f408;
add.s32 %r139, %r35, %r5;
mul.wide.u32 %rd88, %r139, 4;
add.s64 %rd89, %rd2, %rd88;
st.global.f32 [%rd89], %f313;
$L__BB1_184:
@%p30 bra $L__BB1_187;
add.s32 %r36, %r2, 384;
setp.ge.u32 %p155, %r36, %r43;
@%p155 bra $L__BB1_187;
mul.ftz.f32 %f314, %f434, %f410;
add.s32 %r140, %r36, %r5;
mul.wide.u32 %rd90, %r140, 4;
add.s64 %rd91, %rd2, %rd90;
st.global.f32 [%rd91], %f314;
$L__BB1_187:
@%p32 bra $L__BB1_190;
add.s32 %r37, %r2, 416;
setp.ge.u32 %p157, %r37, %r43;
@%p157 bra $L__BB1_190;
mul.ftz.f32 %f315, %f434, %f412;
add.s32 %r141, %r37, %r5;
mul.wide.u32 %rd92, %r141, 4;
add.s64 %rd93, %rd2, %rd92;
st.global.f32 [%rd93], %f315;
$L__BB1_190:
@%p34 bra $L__BB1_193;
add.s32 %r38, %r2, 448;
setp.ge.u32 %p159, %r38, %r43;
@%p159 bra $L__BB1_193;
mul.ftz.f32 %f316, %f434, %f414;
add.s32 %r142, %r38, %r5;
mul.wide.u32 %rd94, %r142, 4;
add.s64 %rd95, %rd2, %rd94;
st.global.f32 [%rd95], %f316;
$L__BB1_193:
@%p36 bra $L__BB1_196;
add.s32 %r39, %r2, 480;
setp.ge.u32 %p161, %r39, %r43;
@%p161 bra $L__BB1_196;
mul.ftz.f32 %f317, %f434, %f416;
add.s32 %r143, %r39, %r5;
mul.wide.u32 %rd96, %r143, 4;
add.s64 %rd97, %rd2, %rd96;
st.global.f32 [%rd97], %f317;
$L__BB1_196:
ret;
}
// .globl fused_attn_prefill_f32
//
// fused_attn_prefill_f32
// ----------------------
// Streaming softmax-weighted sum. Each CTA handles one (row, head) pair taken
// from %ctaid.x, walks a range of positions, and for every position:
//   1. dots a vector held in registers (read once from param_0) with a row of
//      param_1, reducing the partial sums across the warp with shfl.sync.bfly;
//   2. scales the dot product by param_11 and folds it into a running maximum
//      and a running exp-sum;
//   3. accumulates the matching row of param_2 weighted by exp(score - max).
// After the loop the accumulators are multiplied by 1/sum and stored to param_3.
//
// Parameters, described by how this code uses them:
//   param_0  u64  generic pointer, read-only (ld.global.nc); element offset
//                 %r7 + tid + 32*k.                 presumably Q - TODO confirm
//   param_1  u64  generic pointer, read-only inside the loop; dotted with the
//                 param_0 values.                   presumably K - TODO confirm
//   param_2  u64  generic pointer, read-only inside the loop; same offsets as
//                 param_1.                          presumably V - TODO confirm
//   param_3  u64  generic pointer, written at the same offsets as param_0.
//   param_4  u32  kernel returns when ctaid.x >= param_4 * param_6.
//   param_5  u32  upper clamp on the loop end (%r4).
//   param_6  u32  divisor splitting ctaid.x into row = ctaid.x / param_6 and
//                 head = ctaid.x % param_6.   presumably n_heads - TODO confirm
//   param_7  u32  group size is param_6 / param_7; row stride of param_1/2 is
//                 param_7 * param_8.       presumably n_kv_heads - TODO confirm
//   param_8  u32  vector length per head; processed in chunks of 32.
//   param_9  u32  added to row + 1 to form the loop end.
//                                       presumably a position offset - confirm
//   param_10 u32  when non-zero and smaller than the loop end, only the last
//                 param_10 positions are visited (window).
//   param_11 f32  multiplier applied to every reduced dot product.
//
// NOTE(review): %tid.x is used directly as the element index and the reduction
// uses a full-warp member mask, so this looks like it expects exactly 32
// threads per CTA - confirm against the launch configuration.
// NOTE(review): only 16 chunks (tid + 0 .. tid + 480) are unrolled; with 32
// threads, elements at index >= 512 are not processed.
.visible .entry fused_attn_prefill_f32(
.param .u64 fused_attn_prefill_f32_param_0,
.param .u64 fused_attn_prefill_f32_param_1,
.param .u64 fused_attn_prefill_f32_param_2,
.param .u64 fused_attn_prefill_f32_param_3,
.param .u32 fused_attn_prefill_f32_param_4,
.param .u32 fused_attn_prefill_f32_param_5,
.param .u32 fused_attn_prefill_f32_param_6,
.param .u32 fused_attn_prefill_f32_param_7,
.param .u32 fused_attn_prefill_f32_param_8,
.param .u32 fused_attn_prefill_f32_param_9,
.param .u32 fused_attn_prefill_f32_param_10,
.param .f32 fused_attn_prefill_f32_param_11
)
{
.reg .pred %p<175>;
.reg .f32 %f<435>;
.reg .b32 %r<171>;
.reg .b64 %rd<98>;
// ---- Parameter loads -------------------------------------------------------
// %rd27 = param_0, %rd25 = param_1, %rd26 = param_2, %rd28 = param_3.
ld.param.u64 %rd27, [fused_attn_prefill_f32_param_0];
ld.param.u64 %rd25, [fused_attn_prefill_f32_param_1];
ld.param.u64 %rd26, [fused_attn_prefill_f32_param_2];
ld.param.u64 %rd28, [fused_attn_prefill_f32_param_3];
ld.param.u32 %r48, [fused_attn_prefill_f32_param_4];
ld.param.u32 %r42, [fused_attn_prefill_f32_param_5];
ld.param.u32 %r43, [fused_attn_prefill_f32_param_6];
ld.param.u32 %r44, [fused_attn_prefill_f32_param_7];
ld.param.u32 %r45, [fused_attn_prefill_f32_param_8];
ld.param.u32 %r46, [fused_attn_prefill_f32_param_9];
ld.param.u32 %r47, [fused_attn_prefill_f32_param_10];
// %rd1 = global address of param_0 (input), %rd2 = global address of param_3 (output).
cvta.to.global.u64 %rd1, %rd27;
cvta.to.global.u64 %rd2, %rd28;
mov.u32 %r1, %tid.x;
// ---- Bounds guard: return if ctaid.x >= param_6 * param_4 -------------------
mul.lo.s32 %r49, %r43, %r48;
mov.u32 %r2, %ctaid.x;
setp.ge.u32 %p2, %r2, %r49;
@%p2 bra $L__BB2_196;
// ---- Index decomposition ---------------------------------------------------
// %r50 = ctaid.x / param_6 (row), %r3 = ctaid.x % param_6 (head).
div.u32 %r50, %r2, %r43;
mul.lo.s32 %r51, %r50, %r43;
sub.s32 %r3, %r2, %r51;
// %r4 = min(param_9 + row + 1, param_5): exclusive end of the position loop.
add.s32 %r52, %r46, %r50;
add.s32 %r53, %r52, 1;
min.u32 %r4, %r53, %r42;
// %r5 = param_8 + 31, %r6 = %r5 >> 5: number of 32-wide chunks of the vector.
// The "%r5 < 32*(k+1)" tests below mean "chunk k does not exist".
add.s32 %r5, %r45, 31;
shr.u32 %r6, %r5, 5;
// %r7 = head * param_8 + row * (param_8 * param_6): element offset of this
// CTA's vector in both param_0 and param_3 (32-bit arithmetic).
mul.lo.s32 %r54, %r45, %r43;
mul.lo.s32 %r55, %r50, %r54;
mad.lo.s32 %r7, %r3, %r45, %r55;
// ---- Load the param_0 vector into %f335..%f350 -----------------------------
// Register %f(335+k) holds element tid + 32*k, or 0.0 when chunk k is absent
// or the element index is >= param_8.
// %p3 = no chunks at all, %p4 = tid >= param_8; both are reused later.
setp.eq.s32 %p3, %r6, 0;
setp.ge.u32 %p4, %r1, %r45;
mov.f32 %f336, 0f00000000;
or.pred %p5, %p4, %p3;
mov.f32 %f335, %f336;
@%p5 bra $L__BB2_3;
add.s32 %r56, %r7, %r1;
mul.wide.u32 %rd29, %r56, 4;
add.s64 %rd30, %rd1, %rd29;
ld.global.nc.f32 %f335, [%rd30];
$L__BB2_3:
// Chunk 1 (element tid + 32); %p6 = chunk 1 absent.
setp.lt.u32 %p6, %r5, 64;
@%p6 bra $L__BB2_6;
add.s32 %r8, %r1, 32;
setp.ge.u32 %p7, %r8, %r45;
@%p7 bra $L__BB2_6;
add.s32 %r57, %r7, %r8;
mul.wide.u32 %rd31, %r57, 4;
add.s64 %rd32, %rd1, %rd31;
ld.global.nc.f32 %f336, [%rd32];
$L__BB2_6:
// Chunk 2 (element tid + 64); %p8 = chunk 2 absent.
setp.lt.u32 %p8, %r5, 96;
mov.f32 %f338, 0f00000000;
mov.f32 %f337, %f338;
@%p8 bra $L__BB2_9;
add.s32 %r9, %r1, 64;
setp.ge.u32 %p9, %r9, %r45;
@%p9 bra $L__BB2_9;
add.s32 %r58, %r7, %r9;
mul.wide.u32 %rd33, %r58, 4;
add.s64 %rd34, %rd1, %rd33;
ld.global.nc.f32 %f337, [%rd34];
$L__BB2_9:
// Chunk 3 (element tid + 96); %p10 = chunk 3 absent.
setp.lt.u32 %p10, %r5, 128;
@%p10 bra $L__BB2_12;
add.s32 %r10, %r1, 96;
setp.ge.u32 %p11, %r10, %r45;
@%p11 bra $L__BB2_12;
add.s32 %r59, %r7, %r10;
mul.wide.u32 %rd35, %r59, 4;
add.s64 %rd36, %rd1, %rd35;
ld.global.nc.f32 %f338, [%rd36];
$L__BB2_12:
// Chunk 4 (element tid + 128); %p12 = chunk 4 absent.
setp.lt.u32 %p12, %r5, 160;
mov.f32 %f340, 0f00000000;
mov.f32 %f339, %f340;
@%p12 bra $L__BB2_15;
add.s32 %r11, %r1, 128;
setp.ge.u32 %p13, %r11, %r45;
@%p13 bra $L__BB2_15;
add.s32 %r60, %r7, %r11;
mul.wide.u32 %rd37, %r60, 4;
add.s64 %rd38, %rd1, %rd37;
ld.global.nc.f32 %f339, [%rd38];
$L__BB2_15:
// Chunk 5 (element tid + 160); %p14 = chunk 5 absent.
setp.lt.u32 %p14, %r5, 192;
@%p14 bra $L__BB2_18;
add.s32 %r12, %r1, 160;
setp.ge.u32 %p15, %r12, %r45;
@%p15 bra $L__BB2_18;
add.s32 %r61, %r7, %r12;
mul.wide.u32 %rd39, %r61, 4;
add.s64 %rd40, %rd1, %rd39;
ld.global.nc.f32 %f340, [%rd40];
$L__BB2_18:
// Chunk 6 (element tid + 192); %p16 = chunk 6 absent.
setp.lt.u32 %p16, %r5, 224;
mov.f32 %f342, 0f00000000;
mov.f32 %f341, %f342;
@%p16 bra $L__BB2_21;
add.s32 %r13, %r1, 192;
setp.ge.u32 %p17, %r13, %r45;
@%p17 bra $L__BB2_21;
add.s32 %r62, %r7, %r13;
mul.wide.u32 %rd41, %r62, 4;
add.s64 %rd42, %rd1, %rd41;
ld.global.nc.f32 %f341, [%rd42];
$L__BB2_21:
// Chunk 7 (element tid + 224); %p18 = chunk 7 absent.
setp.lt.u32 %p18, %r5, 256;
@%p18 bra $L__BB2_24;
add.s32 %r14, %r1, 224;
setp.ge.u32 %p19, %r14, %r45;
@%p19 bra $L__BB2_24;
add.s32 %r63, %r7, %r14;
mul.wide.u32 %rd43, %r63, 4;
add.s64 %rd44, %rd1, %rd43;
ld.global.nc.f32 %f342, [%rd44];
$L__BB2_24:
// Chunk 8 (element tid + 256); %p20 = chunk 8 absent.
setp.lt.u32 %p20, %r5, 288;
mov.f32 %f344, 0f00000000;
mov.f32 %f343, %f344;
@%p20 bra $L__BB2_27;
add.s32 %r15, %r1, 256;
setp.ge.u32 %p21, %r15, %r45;
@%p21 bra $L__BB2_27;
add.s32 %r64, %r7, %r15;
mul.wide.u32 %rd45, %r64, 4;
add.s64 %rd46, %rd1, %rd45;
ld.global.nc.f32 %f343, [%rd46];
$L__BB2_27:
// Chunk 9 (element tid + 288); %p22 = chunk 9 absent.
setp.lt.u32 %p22, %r5, 320;
@%p22 bra $L__BB2_30;
add.s32 %r16, %r1, 288;
setp.ge.u32 %p23, %r16, %r45;
@%p23 bra $L__BB2_30;
add.s32 %r65, %r7, %r16;
mul.wide.u32 %rd47, %r65, 4;
add.s64 %rd48, %rd1, %rd47;
ld.global.nc.f32 %f344, [%rd48];
$L__BB2_30:
// Chunk 10 (element tid + 320); %p24 = chunk 10 absent.
setp.lt.u32 %p24, %r5, 352;
mov.f32 %f346, 0f00000000;
mov.f32 %f345, %f346;
@%p24 bra $L__BB2_33;
add.s32 %r17, %r1, 320;
setp.ge.u32 %p25, %r17, %r45;
@%p25 bra $L__BB2_33;
add.s32 %r66, %r7, %r17;
mul.wide.u32 %rd49, %r66, 4;
add.s64 %rd50, %rd1, %rd49;
ld.global.nc.f32 %f345, [%rd50];
$L__BB2_33:
// Chunk 11 (element tid + 352); %p26 = chunk 11 absent.
setp.lt.u32 %p26, %r5, 384;
@%p26 bra $L__BB2_36;
add.s32 %r18, %r1, 352;
setp.ge.u32 %p27, %r18, %r45;
@%p27 bra $L__BB2_36;
add.s32 %r67, %r7, %r18;
mul.wide.u32 %rd51, %r67, 4;
add.s64 %rd52, %rd1, %rd51;
ld.global.nc.f32 %f346, [%rd52];
$L__BB2_36:
// Chunk 12 (element tid + 384); %p28 = chunk 12 absent.
setp.lt.u32 %p28, %r5, 416;
mov.f32 %f348, 0f00000000;
mov.f32 %f347, %f348;
@%p28 bra $L__BB2_39;
add.s32 %r19, %r1, 384;
setp.ge.u32 %p29, %r19, %r45;
@%p29 bra $L__BB2_39;
add.s32 %r68, %r7, %r19;
mul.wide.u32 %rd53, %r68, 4;
add.s64 %rd54, %rd1, %rd53;
ld.global.nc.f32 %f347, [%rd54];
$L__BB2_39:
// Chunk 13 (element tid + 416); %p30 = chunk 13 absent.
setp.lt.u32 %p30, %r5, 448;
@%p30 bra $L__BB2_42;
add.s32 %r20, %r1, 416;
setp.ge.u32 %p31, %r20, %r45;
@%p31 bra $L__BB2_42;
add.s32 %r69, %r7, %r20;
mul.wide.u32 %rd55, %r69, 4;
add.s64 %rd56, %rd1, %rd55;
ld.global.nc.f32 %f348, [%rd56];
$L__BB2_42:
// Chunk 14 (element tid + 448); %p32 = chunk 14 absent.
setp.lt.u32 %p32, %r5, 480;
mov.f32 %f350, 0f00000000;
mov.f32 %f349, %f350;
@%p32 bra $L__BB2_45;
add.s32 %r21, %r1, 448;
setp.ge.u32 %p33, %r21, %r45;
@%p33 bra $L__BB2_45;
add.s32 %r70, %r7, %r21;
mul.wide.u32 %rd57, %r70, 4;
add.s64 %rd58, %rd1, %rd57;
ld.global.nc.f32 %f349, [%rd58];
$L__BB2_45:
// Chunk 15 (element tid + 480); %p34 = chunk 15 absent.
setp.lt.u32 %p34, %r5, 512;
@%p34 bra $L__BB2_48;
add.s32 %r22, %r1, 480;
setp.ge.u32 %p35, %r22, %r45;
@%p35 bra $L__BB2_48;
add.s32 %r71, %r7, %r22;
mul.wide.u32 %rd59, %r71, 4;
add.s64 %rd60, %rd1, %rd59;
ld.global.nc.f32 %f350, [%rd60];
$L__BB2_48:
// ---- Iteration range --------------------------------------------------------
// %r170 (loop counter) starts at %r4 - param_10 when param_10 != 0 and
// %r4 > param_10, otherwise at 0. The loop runs while %r170 < %r4.
setp.gt.u32 %p36, %r4, %r47;
setp.ne.s32 %p37, %r47, 0;
and.pred %p38, %p37, %p36;
sub.s32 %r72, %r4, %r47;
selp.b32 %r170, %r72, 0, %p38;
setp.ge.u32 %p39, %r170, %r4;
// Zero the 16 output accumulators (%f386, %f388, ..., %f416) and the running
// exp-sum %f433 so the empty-range path below reaches the epilogue with zeros.
mov.f32 %f416, 0f00000000;
mov.f32 %f414, %f416;
mov.f32 %f412, %f416;
mov.f32 %f410, %f416;
mov.f32 %f408, %f416;
mov.f32 %f406, %f416;
mov.f32 %f404, %f416;
mov.f32 %f402, %f416;
mov.f32 %f400, %f416;
mov.f32 %f398, %f416;
mov.f32 %f396, %f416;
mov.f32 %f394, %f416;
mov.f32 %f392, %f416;
mov.f32 %f390, %f416;
mov.f32 %f388, %f416;
mov.f32 %f386, %f416;
mov.f32 %f433, %f416;
@%p39 bra $L__BB2_147;
// ---- Loop-invariant setup ---------------------------------------------------
// %p1 = chunk 0 exists and tid < param_8 (this lane owns a chunk-0 element).
setp.lt.u32 %p40, %r1, %r45;
// Group mapping: %r73 = param_6 / param_7, %r74 = head / %r73,
// %rd3 = %r74 * param_8 = element offset of the group inside one row of
// param_1 / param_2.
// NOTE(review): both divisors come straight from parameters with no zero
// check visible here - confirm the host validates them.
div.u32 %r73, %r43, %r44;
div.u32 %r74, %r3, %r73;
mul.lo.s32 %r75, %r74, %r45;
cvt.u64.u32 %rd3, %r75;
setp.ne.s32 %p41, %r6, 0;
and.pred %p1, %p41, %p40;
// %rd4..%rd19 = tid + 32*k (k = 0..15) widened to 64 bits; they are narrowed
// back to 32 bits for the per-chunk bounds tests inside the loop.
cvt.u64.u32 %rd4, %r1;
add.s32 %r76, %r1, 32;
cvt.u64.u32 %rd5, %r76;
add.s32 %r77, %r1, 64;
cvt.u64.u32 %rd6, %r77;
add.s32 %r78, %r1, 96;
cvt.u64.u32 %rd7, %r78;
add.s32 %r79, %r1, 128;
cvt.u64.u32 %rd8, %r79;
add.s32 %r80, %r1, 160;
cvt.u64.u32 %rd9, %r80;
add.s32 %r81, %r1, 192;
cvt.u64.u32 %rd10, %r81;
add.s32 %r82, %r1, 224;
cvt.u64.u32 %rd11, %r82;
add.s32 %r83, %r1, 256;
cvt.u64.u32 %rd12, %r83;
add.s32 %r84, %r1, 288;
cvt.u64.u32 %rd13, %r84;
add.s32 %r85, %r1, 320;
cvt.u64.u32 %rd14, %r85;
add.s32 %r86, %r1, 352;
cvt.u64.u32 %rd15, %r86;
add.s32 %r87, %r1, 384;
cvt.u64.u32 %rd16, %r87;
add.s32 %r88, %r1, 416;
cvt.u64.u32 %rd17, %r88;
add.s32 %r89, %r1, 448;
cvt.u64.u32 %rd18, %r89;
add.s32 %r90, %r1, 480;
cvt.u64.u32 %rd19, %r90;
// %rd20 = global address of param_1, %rd21 = global address of param_2.
cvta.to.global.u64 %rd20, %rd25;
cvta.to.global.u64 %rd21, %rd26;
// %r24 = param_8 * param_7: row stride (in elements) of param_1 / param_2.
mul.lo.s32 %r24, %r45, %r44;
// %f368 = running maximum, seeded with 0fFF7FFFFF (-FLT_MAX).
mov.f32 %f368, 0fFF7FFFFF;
mov.f32 %f235, 0f00000000;
not.pred %p42, %p1;
// Accumulators and exp-sum are zeroed a second time here; the values are the
// same as the zeroing before the range check above.
mov.f32 %f416, %f235;
mov.f32 %f414, %f235;
mov.f32 %f412, %f235;
mov.f32 %f410, %f235;
mov.f32 %f408, %f235;
mov.f32 %f406, %f235;
mov.f32 %f404, %f235;
mov.f32 %f402, %f235;
mov.f32 %f400, %f235;
mov.f32 %f398, %f235;
mov.f32 %f396, %f235;
mov.f32 %f394, %f235;
mov.f32 %f392, %f235;
mov.f32 %f390, %f235;
mov.f32 %f388, %f235;
mov.f32 %f386, %f235;
mov.f32 %f433, %f235;
// ============================================================================
// Main loop: one iteration per position %r170 in [start, %r4).
// ============================================================================
$L__BB2_50:
// Row address: %rd63 = 4 * (zext32(%r24 * %r170) + %rd3 + tid);
// %rd23 = param_1 + %rd63 (and %rd24 = param_2 + %rd63, formed further down).
// NOTE(review): the product %r24 * %r170 is truncated to 32 bits by mul.lo
// before it is widened, so it wraps once stride * position reaches 2^32 -
// confirm buffer sizes upstream keep it below that.
mul.lo.s32 %r91, %r24, %r170;
cvt.u64.u32 %rd61, %r91;
add.s64 %rd22, %rd61, %rd3;
add.s64 %rd62, %rd22, %rd4;
shl.b64 %rd63, %rd62, 2;
add.s64 %rd23, %rd20, %rd63;
// ---- Per-lane partial dot product in %f370 ---------------------------------
// Chunk k contributes %f(335+k) * param_1[row][tid + 32*k]; consecutive chunks
// are 128 bytes (32 floats) apart. A chunk is skipped when it is absent or
// the element index is >= param_8.
mov.f32 %f370, %f235;
@%p42 bra $L__BB2_52;
ld.global.nc.f32 %f238, [%rd23];
fma.rn.ftz.f32 %f370, %f335, %f238, 0f00000000;
$L__BB2_52:
cvt.u32.u64 %r92, %rd5;
setp.ge.u32 %p43, %r92, %r45;
or.pred %p45, %p43, %p6;
@%p45 bra $L__BB2_54;
ld.global.nc.f32 %f239, [%rd23+128];
fma.rn.ftz.f32 %f370, %f336, %f239, %f370;
$L__BB2_54:
cvt.u32.u64 %r93, %rd6;
setp.ge.u32 %p46, %r93, %r45;
or.pred %p48, %p46, %p8;
@%p48 bra $L__BB2_56;
ld.global.nc.f32 %f240, [%rd23+256];
fma.rn.ftz.f32 %f370, %f337, %f240, %f370;
$L__BB2_56:
cvt.u32.u64 %r94, %rd7;
setp.ge.u32 %p49, %r94, %r45;
or.pred %p51, %p49, %p10;
@%p51 bra $L__BB2_58;
ld.global.nc.f32 %f241, [%rd23+384];
fma.rn.ftz.f32 %f370, %f338, %f241, %f370;
$L__BB2_58:
cvt.u32.u64 %r95, %rd8;
setp.ge.u32 %p52, %r95, %r45;
or.pred %p54, %p52, %p12;
@%p54 bra $L__BB2_60;
ld.global.nc.f32 %f242, [%rd23+512];
fma.rn.ftz.f32 %f370, %f339, %f242, %f370;
$L__BB2_60:
cvt.u32.u64 %r96, %rd9;
setp.ge.u32 %p55, %r96, %r45;
or.pred %p57, %p55, %p14;
@%p57 bra $L__BB2_62;
ld.global.nc.f32 %f243, [%rd23+640];
fma.rn.ftz.f32 %f370, %f340, %f243, %f370;
$L__BB2_62:
cvt.u32.u64 %r97, %rd10;
setp.ge.u32 %p58, %r97, %r45;
or.pred %p60, %p58, %p16;
@%p60 bra $L__BB2_64;
ld.global.nc.f32 %f244, [%rd23+768];
fma.rn.ftz.f32 %f370, %f341, %f244, %f370;
$L__BB2_64:
cvt.u32.u64 %r98, %rd11;
setp.ge.u32 %p61, %r98, %r45;
or.pred %p63, %p61, %p18;
@%p63 bra $L__BB2_66;
ld.global.nc.f32 %f245, [%rd23+896];
fma.rn.ftz.f32 %f370, %f342, %f245, %f370;
$L__BB2_66:
cvt.u32.u64 %r99, %rd12;
setp.ge.u32 %p64, %r99, %r45;
or.pred %p66, %p64, %p20;
@%p66 bra $L__BB2_68;
ld.global.nc.f32 %f246, [%rd23+1024];
fma.rn.ftz.f32 %f370, %f343, %f246, %f370;
$L__BB2_68:
cvt.u32.u64 %r100, %rd13;
setp.ge.u32 %p67, %r100, %r45;
or.pred %p69, %p67, %p22;
@%p69 bra $L__BB2_70;
ld.global.nc.f32 %f247, [%rd23+1152];
fma.rn.ftz.f32 %f370, %f344, %f247, %f370;
$L__BB2_70:
cvt.u32.u64 %r101, %rd14;
setp.ge.u32 %p70, %r101, %r45;
or.pred %p72, %p70, %p24;
@%p72 bra $L__BB2_72;
ld.global.nc.f32 %f248, [%rd23+1280];
fma.rn.ftz.f32 %f370, %f345, %f248, %f370;
$L__BB2_72:
cvt.u32.u64 %r102, %rd15;
setp.ge.u32 %p73, %r102, %r45;
or.pred %p75, %p73, %p26;
@%p75 bra $L__BB2_74;
ld.global.nc.f32 %f249, [%rd23+1408];
fma.rn.ftz.f32 %f370, %f346, %f249, %f370;
$L__BB2_74:
cvt.u32.u64 %r103, %rd16;
setp.ge.u32 %p76, %r103, %r45;
or.pred %p78, %p76, %p28;
@%p78 bra $L__BB2_76;
ld.global.nc.f32 %f250, [%rd23+1536];
fma.rn.ftz.f32 %f370, %f347, %f250, %f370;
$L__BB2_76:
cvt.u32.u64 %r104, %rd17;
setp.ge.u32 %p79, %r104, %r45;
or.pred %p81, %p79, %p30;
@%p81 bra $L__BB2_78;
ld.global.nc.f32 %f251, [%rd23+1664];
fma.rn.ftz.f32 %f370, %f348, %f251, %f370;
$L__BB2_78:
cvt.u32.u64 %r105, %rd18;
setp.ge.u32 %p82, %r105, %r45;
or.pred %p84, %p82, %p32;
@%p84 bra $L__BB2_80;
ld.global.nc.f32 %f252, [%rd23+1792];
fma.rn.ftz.f32 %f370, %f349, %f252, %f370;
$L__BB2_80:
cvt.u32.u64 %r106, %rd19;
setp.ge.u32 %p85, %r106, %r45;
or.pred %p87, %p85, %p34;
@%p87 bra $L__BB2_82;
ld.global.nc.f32 %f253, [%rd23+1920];
fma.rn.ftz.f32 %f370, %f350, %f253, %f370;
$L__BB2_82:
// param_11 (score multiplier) is reloaded on every iteration although it is
// loop-invariant.
ld.param.f32 %f334, [fused_attn_prefill_f32_param_11];
// ---- Warp-wide sum of %f370 -------------------------------------------------
// Butterfly shuffle with lane masks 16, 8, 4, 2, 1; clamp operand 31 and
// member mask 0xFFFFFFFF (%r110 = -1), so all 32 lanes are named as
// participants. After the last step %f263 is the full dot product.
mov.b32 %r107, %f370;
mov.u32 %r108, 31;
mov.u32 %r109, 16;
mov.u32 %r110, -1;
shfl.sync.bfly.b32 %r111|%p88, %r107, %r109, %r108, %r110;
mov.b32 %f254, %r111;
add.ftz.f32 %f255, %f370, %f254;
mov.b32 %r112, %f255;
mov.u32 %r113, 8;
shfl.sync.bfly.b32 %r114|%p89, %r112, %r113, %r108, %r110;
mov.b32 %f256, %r114;
add.ftz.f32 %f257, %f255, %f256;
mov.b32 %r115, %f257;
mov.u32 %r116, 4;
shfl.sync.bfly.b32 %r117|%p90, %r115, %r116, %r108, %r110;
mov.b32 %f258, %r117;
add.ftz.f32 %f259, %f257, %f258;
mov.b32 %r118, %f259;
mov.u32 %r119, 2;
shfl.sync.bfly.b32 %r120|%p91, %r118, %r119, %r108, %r110;
mov.b32 %f260, %r120;
add.ftz.f32 %f261, %f259, %f260;
mov.b32 %r121, %f261;
mov.u32 %r122, 1;
shfl.sync.bfly.b32 %r123|%p92, %r121, %r122, %r108, %r110;
mov.b32 %f262, %r123;
add.ftz.f32 %f263, %f261, %f262;
// ---- Running-max softmax update ---------------------------------------------
// %f264 = score = dot * param_11
// %f83  = new max = max(old max %f368, score)
// %f84  = exp(old max - new max)  rescale factor for earlier contributions
// %f85  = exp(score  - new max)   weight of this position
// exp(x) is evaluated as ex2.approx(x * 0f3FB8AA3B), where 0f3FB8AA3B is
// log2(e) in fp32.
mul.ftz.f32 %f264, %f263, %f334;
max.ftz.f32 %f83, %f368, %f264;
sub.ftz.f32 %f265, %f368, %f83;
mul.ftz.f32 %f266, %f265, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f84, %f266;
sub.ftz.f32 %f267, %f264, %f83;
mul.ftz.f32 %f268, %f267, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f85, %f268;
// %rd24 = address of this lane's chunk-0 element in the param_2 row.
add.s64 %rd24, %rd21, %rd63;
// ---- Weighted accumulation of the param_2 row -------------------------------
// For every chunk k that exists: acc_k = %f84 * acc_k + %f85 * v_k, where v_k
// is param_2[row][tid + 32*k] or 0.0 when that index is >= param_8 (the
// accumulator is still rescaled in that case). Absent chunks are skipped.
@%p3 bra $L__BB2_86;
mov.f32 %f385, 0f00000000;
@%p4 bra $L__BB2_85;
ld.global.nc.f32 %f385, [%rd24];
$L__BB2_85:
mul.ftz.f32 %f270, %f84, %f386;
fma.rn.ftz.f32 %f386, %f85, %f385, %f270;
$L__BB2_86:
@%p6 bra $L__BB2_90;
cvt.u32.u64 %r169, %rd5;
setp.ge.u32 %p174, %r169, %r45;
mov.f32 %f387, 0f00000000;
@%p174 bra $L__BB2_89;
ld.global.nc.f32 %f387, [%rd24+128];
$L__BB2_89:
mul.ftz.f32 %f272, %f84, %f388;
fma.rn.ftz.f32 %f388, %f85, %f387, %f272;
$L__BB2_90:
@%p8 bra $L__BB2_94;
cvt.u32.u64 %r168, %rd6;
setp.ge.u32 %p173, %r168, %r45;
mov.f32 %f389, 0f00000000;
@%p173 bra $L__BB2_93;
ld.global.nc.f32 %f389, [%rd24+256];
$L__BB2_93:
mul.ftz.f32 %f274, %f84, %f390;
fma.rn.ftz.f32 %f390, %f85, %f389, %f274;
$L__BB2_94:
@%p10 bra $L__BB2_98;
cvt.u32.u64 %r167, %rd7;
setp.ge.u32 %p172, %r167, %r45;
mov.f32 %f391, 0f00000000;
@%p172 bra $L__BB2_97;
ld.global.nc.f32 %f391, [%rd24+384];
$L__BB2_97:
mul.ftz.f32 %f276, %f84, %f392;
fma.rn.ftz.f32 %f392, %f85, %f391, %f276;
$L__BB2_98:
@%p12 bra $L__BB2_102;
cvt.u32.u64 %r166, %rd8;
setp.ge.u32 %p171, %r166, %r45;
mov.f32 %f393, 0f00000000;
@%p171 bra $L__BB2_101;
ld.global.nc.f32 %f393, [%rd24+512];
$L__BB2_101:
mul.ftz.f32 %f278, %f84, %f394;
fma.rn.ftz.f32 %f394, %f85, %f393, %f278;
$L__BB2_102:
@%p14 bra $L__BB2_106;
cvt.u32.u64 %r165, %rd9;
setp.ge.u32 %p170, %r165, %r45;
mov.f32 %f395, 0f00000000;
@%p170 bra $L__BB2_105;
ld.global.nc.f32 %f395, [%rd24+640];
$L__BB2_105:
mul.ftz.f32 %f280, %f84, %f396;
fma.rn.ftz.f32 %f396, %f85, %f395, %f280;
$L__BB2_106:
@%p16 bra $L__BB2_110;
cvt.u32.u64 %r164, %rd10;
setp.ge.u32 %p169, %r164, %r45;
mov.f32 %f397, 0f00000000;
@%p169 bra $L__BB2_109;
ld.global.nc.f32 %f397, [%rd24+768];
$L__BB2_109:
mul.ftz.f32 %f282, %f84, %f398;
fma.rn.ftz.f32 %f398, %f85, %f397, %f282;
$L__BB2_110:
@%p18 bra $L__BB2_114;
cvt.u32.u64 %r163, %rd11;
setp.ge.u32 %p168, %r163, %r45;
mov.f32 %f399, 0f00000000;
@%p168 bra $L__BB2_113;
ld.global.nc.f32 %f399, [%rd24+896];
$L__BB2_113:
mul.ftz.f32 %f284, %f84, %f400;
fma.rn.ftz.f32 %f400, %f85, %f399, %f284;
$L__BB2_114:
@%p20 bra $L__BB2_118;
cvt.u32.u64 %r162, %rd12;
setp.ge.u32 %p167, %r162, %r45;
mov.f32 %f401, 0f00000000;
@%p167 bra $L__BB2_117;
ld.global.nc.f32 %f401, [%rd24+1024];
$L__BB2_117:
mul.ftz.f32 %f286, %f84, %f402;
fma.rn.ftz.f32 %f402, %f85, %f401, %f286;
$L__BB2_118:
@%p22 bra $L__BB2_122;
cvt.u32.u64 %r161, %rd13;
setp.ge.u32 %p166, %r161, %r45;
mov.f32 %f403, 0f00000000;
@%p166 bra $L__BB2_121;
ld.global.nc.f32 %f403, [%rd24+1152];
$L__BB2_121:
mul.ftz.f32 %f288, %f84, %f404;
fma.rn.ftz.f32 %f404, %f85, %f403, %f288;
$L__BB2_122:
@%p24 bra $L__BB2_126;
cvt.u32.u64 %r160, %rd14;
setp.ge.u32 %p165, %r160, %r45;
mov.f32 %f405, 0f00000000;
@%p165 bra $L__BB2_125;
ld.global.nc.f32 %f405, [%rd24+1280];
$L__BB2_125:
mul.ftz.f32 %f290, %f84, %f406;
fma.rn.ftz.f32 %f406, %f85, %f405, %f290;
$L__BB2_126:
@%p26 bra $L__BB2_130;
cvt.u32.u64 %r159, %rd15;
setp.ge.u32 %p164, %r159, %r45;
mov.f32 %f407, 0f00000000;
@%p164 bra $L__BB2_129;
ld.global.nc.f32 %f407, [%rd24+1408];
$L__BB2_129:
mul.ftz.f32 %f292, %f84, %f408;
fma.rn.ftz.f32 %f408, %f85, %f407, %f292;
$L__BB2_130:
@%p28 bra $L__BB2_134;
cvt.u32.u64 %r158, %rd16;
setp.ge.u32 %p163, %r158, %r45;
mov.f32 %f409, 0f00000000;
@%p163 bra $L__BB2_133;
ld.global.nc.f32 %f409, [%rd24+1536];
$L__BB2_133:
mul.ftz.f32 %f294, %f84, %f410;
fma.rn.ftz.f32 %f410, %f85, %f409, %f294;
$L__BB2_134:
@%p30 bra $L__BB2_138;
cvt.u32.u64 %r157, %rd17;
setp.ge.u32 %p162, %r157, %r45;
mov.f32 %f411, 0f00000000;
@%p162 bra $L__BB2_137;
ld.global.nc.f32 %f411, [%rd24+1664];
$L__BB2_137:
mul.ftz.f32 %f296, %f84, %f412;
fma.rn.ftz.f32 %f412, %f85, %f411, %f296;
$L__BB2_138:
@%p32 bra $L__BB2_142;
cvt.u32.u64 %r156, %rd18;
setp.ge.u32 %p161, %r156, %r45;
mov.f32 %f413, 0f00000000;
@%p161 bra $L__BB2_141;
ld.global.nc.f32 %f413, [%rd24+1792];
$L__BB2_141:
mul.ftz.f32 %f298, %f84, %f414;
fma.rn.ftz.f32 %f414, %f85, %f413, %f298;
$L__BB2_142:
@%p34 bra $L__BB2_146;
cvt.u32.u64 %r155, %rd19;
setp.ge.u32 %p160, %r155, %r45;
mov.f32 %f415, 0f00000000;
@%p160 bra $L__BB2_145;
ld.global.nc.f32 %f415, [%rd24+1920];
$L__BB2_145:
mul.ftz.f32 %f300, %f84, %f416;
fma.rn.ftz.f32 %f416, %f85, %f415, %f300;
$L__BB2_146:
// Running exp-sum: %f433 = %f433 * %f84 + %f85. Then advance the position,
// carry the new maximum into %f368 and loop while %r170 < %r4.
fma.rn.ftz.f32 %f433, %f433, %f84, %f85;
add.s32 %r170, %r170, 1;
setp.lt.u32 %p125, %r170, %r4;
mov.f32 %f368, %f83;
@%p125 bra $L__BB2_50;
// ============================================================================
// Epilogue: normalise and store.
// ============================================================================
$L__BB2_147:
// %f434 = 1 / exp-sum (approximate reciprocal) when the sum is > 0. It stays
// 0.0 when the sum is <= 0 or NaN (setp.leu is also true for unordered), so an
// empty range stores zeros rather than dividing by zero.
mov.f32 %f434, 0f00000000;
setp.leu.ftz.f32 %p126, %f433, 0f00000000;
@%p126 bra $L__BB2_149;
rcp.approx.ftz.f32 %f434, %f433;
$L__BB2_149:
// For each chunk k that exists and whose element index tid + 32*k is
// < param_8: param_3[%r7 + tid + 32*k] = %f434 * acc_k.
or.pred %p129, %p3, %p4;
@%p129 bra $L__BB2_151;
mul.ftz.f32 %f302, %f434, %f386;
add.s32 %r139, %r7, %r1;
mul.wide.u32 %rd66, %r139, 4;
add.s64 %rd67, %rd2, %rd66;
st.global.f32 [%rd67], %f302;
$L__BB2_151:
@%p6 bra $L__BB2_154;
add.s32 %r27, %r1, 32;
setp.ge.u32 %p131, %r27, %r45;
@%p131 bra $L__BB2_154;
mul.ftz.f32 %f303, %f434, %f388;
add.s32 %r140, %r7, %r27;
mul.wide.u32 %rd68, %r140, 4;
add.s64 %rd69, %rd2, %rd68;
st.global.f32 [%rd69], %f303;
$L__BB2_154:
@%p8 bra $L__BB2_157;
add.s32 %r28, %r1, 64;
setp.ge.u32 %p133, %r28, %r45;
@%p133 bra $L__BB2_157;
mul.ftz.f32 %f304, %f434, %f390;
add.s32 %r141, %r7, %r28;
mul.wide.u32 %rd70, %r141, 4;
add.s64 %rd71, %rd2, %rd70;
st.global.f32 [%rd71], %f304;
$L__BB2_157:
@%p10 bra $L__BB2_160;
add.s32 %r29, %r1, 96;
setp.ge.u32 %p135, %r29, %r45;
@%p135 bra $L__BB2_160;
mul.ftz.f32 %f305, %f434, %f392;
add.s32 %r142, %r7, %r29;
mul.wide.u32 %rd72, %r142, 4;
add.s64 %rd73, %rd2, %rd72;
st.global.f32 [%rd73], %f305;
$L__BB2_160:
@%p12 bra $L__BB2_163;
add.s32 %r30, %r1, 128;
setp.ge.u32 %p137, %r30, %r45;
@%p137 bra $L__BB2_163;
mul.ftz.f32 %f306, %f434, %f394;
add.s32 %r143, %r7, %r30;
mul.wide.u32 %rd74, %r143, 4;
add.s64 %rd75, %rd2, %rd74;
st.global.f32 [%rd75], %f306;
$L__BB2_163:
@%p14 bra $L__BB2_166;
add.s32 %r31, %r1, 160;
setp.ge.u32 %p139, %r31, %r45;
@%p139 bra $L__BB2_166;
mul.ftz.f32 %f307, %f434, %f396;
add.s32 %r144, %r7, %r31;
mul.wide.u32 %rd76, %r144, 4;
add.s64 %rd77, %rd2, %rd76;
st.global.f32 [%rd77], %f307;
$L__BB2_166:
@%p16 bra $L__BB2_169;
add.s32 %r32, %r1, 192;
setp.ge.u32 %p141, %r32, %r45;
@%p141 bra $L__BB2_169;
mul.ftz.f32 %f308, %f434, %f398;
add.s32 %r145, %r7, %r32;
mul.wide.u32 %rd78, %r145, 4;
add.s64 %rd79, %rd2, %rd78;
st.global.f32 [%rd79], %f308;
$L__BB2_169:
@%p18 bra $L__BB2_172;
add.s32 %r33, %r1, 224;
setp.ge.u32 %p143, %r33, %r45;
@%p143 bra $L__BB2_172;
mul.ftz.f32 %f309, %f434, %f400;
add.s32 %r146, %r7, %r33;
mul.wide.u32 %rd80, %r146, 4;
add.s64 %rd81, %rd2, %rd80;
st.global.f32 [%rd81], %f309;
$L__BB2_172:
@%p20 bra $L__BB2_175;
add.s32 %r34, %r1, 256;
setp.ge.u32 %p145, %r34, %r45;
@%p145 bra $L__BB2_175;
mul.ftz.f32 %f310, %f434, %f402;
add.s32 %r147, %r7, %r34;
mul.wide.u32 %rd82, %r147, 4;
add.s64 %rd83, %rd2, %rd82;
st.global.f32 [%rd83], %f310;
$L__BB2_175:
@%p22 bra $L__BB2_178;
add.s32 %r35, %r1, 288;
setp.ge.u32 %p147, %r35, %r45;
@%p147 bra $L__BB2_178;
mul.ftz.f32 %f311, %f434, %f404;
add.s32 %r148, %r7, %r35;
mul.wide.u32 %rd84, %r148, 4;
add.s64 %rd85, %rd2, %rd84;
st.global.f32 [%rd85], %f311;
$L__BB2_178:
@%p24 bra $L__BB2_181;
add.s32 %r36, %r1, 320;
setp.ge.u32 %p149, %r36, %r45;
@%p149 bra $L__BB2_181;
mul.ftz.f32 %f312, %f434, %f406;
add.s32 %r149, %r7, %r36;
mul.wide.u32 %rd86, %r149, 4;
add.s64 %rd87, %rd2, %rd86;
st.global.f32 [%rd87], %f312;
$L__BB2_181:
@%p26 bra $L__BB2_184;
add.s32 %r37, %r1, 352;
setp.ge.u32 %p151, %r37, %r45;
@%p151 bra $L__BB2_184;
mul.ftz.f32 %f313, %f434, %f408;
add.s32 %r150, %r7, %r37;
mul.wide.u32 %rd88, %r150, 4;
add.s64 %rd89, %rd2, %rd88;
st.global.f32 [%rd89], %f313;
$L__BB2_184:
@%p28 bra $L__BB2_187;
add.s32 %r38, %r1, 384;
setp.ge.u32 %p153, %r38, %r45;
@%p153 bra $L__BB2_187;
mul.ftz.f32 %f314, %f434, %f410;
add.s32 %r151, %r7, %r38;
mul.wide.u32 %rd90, %r151, 4;
add.s64 %rd91, %rd2, %rd90;
st.global.f32 [%rd91], %f314;
$L__BB2_187:
@%p30 bra $L__BB2_190;
add.s32 %r39, %r1, 416;
setp.ge.u32 %p155, %r39, %r45;
@%p155 bra $L__BB2_190;
mul.ftz.f32 %f315, %f434, %f412;
add.s32 %r152, %r7, %r39;
mul.wide.u32 %rd92, %r152, 4;
add.s64 %rd93, %rd2, %rd92;
st.global.f32 [%rd93], %f315;
$L__BB2_190:
@%p32 bra $L__BB2_193;
add.s32 %r40, %r1, 448;
setp.ge.u32 %p157, %r40, %r45;
@%p157 bra $L__BB2_193;
mul.ftz.f32 %f316, %f434, %f414;
add.s32 %r153, %r7, %r40;
mul.wide.u32 %rd94, %r153, 4;
add.s64 %rd95, %rd2, %rd94;
st.global.f32 [%rd95], %f316;
$L__BB2_193:
@%p34 bra $L__BB2_196;
add.s32 %r41, %r1, 480;
setp.ge.u32 %p159, %r41, %r45;
@%p159 bra $L__BB2_196;
mul.ftz.f32 %f317, %f434, %f416;
add.s32 %r154, %r7, %r41;
mul.wide.u32 %rd96, %r154, 4;
add.s64 %rd97, %rd2, %rd96;
st.global.f32 [%rd97], %f317;
$L__BB2_196:
ret;
}
// .globl fused_attention_bwd_f32
.visible .entry fused_attention_bwd_f32(
.param .u64 fused_attention_bwd_f32_param_0,
.param .u64 fused_attention_bwd_f32_param_1,
.param .u64 fused_attention_bwd_f32_param_2,
.param .u64 fused_attention_bwd_f32_param_3,
.param .u64 fused_attention_bwd_f32_param_4,
.param .u64 fused_attention_bwd_f32_param_5,
.param .u64 fused_attention_bwd_f32_param_6,
.param .u64 fused_attention_bwd_f32_param_7,
.param .f32 fused_attention_bwd_f32_param_8,
.param .u32 fused_attention_bwd_f32_param_9,
.param .u32 fused_attention_bwd_f32_param_10,
.param .u32 fused_attention_bwd_f32_param_11,
.param .u32 fused_attention_bwd_f32_param_12,
.param .u32 fused_attention_bwd_f32_param_13,
.param .u32 fused_attention_bwd_f32_param_14
)
{
.reg .pred %p<56>;
.reg .f32 %f<291>;
.reg .b32 %r<144>;
.reg .b64 %rd<142>;
ld.param.u64 %rd28, [fused_attention_bwd_f32_param_0];
ld.param.u64 %rd29, [fused_attention_bwd_f32_param_1];
ld.param.u64 %rd30, [fused_attention_bwd_f32_param_2];
ld.param.u64 %rd31, [fused_attention_bwd_f32_param_3];
ld.param.u64 %rd32, [fused_attention_bwd_f32_param_4];
ld.param.u64 %rd33, [fused_attention_bwd_f32_param_5];
ld.param.u64 %rd34, [fused_attention_bwd_f32_param_6];
ld.param.u64 %rd35, [fused_attention_bwd_f32_param_7];
ld.param.f32 %f65, [fused_attention_bwd_f32_param_8];
ld.param.u32 %r72, [fused_attention_bwd_f32_param_9];
ld.param.u32 %r67, [fused_attention_bwd_f32_param_10];
ld.param.u32 %r68, [fused_attention_bwd_f32_param_11];
ld.param.u32 %r69, [fused_attention_bwd_f32_param_12];
ld.param.u32 %r70, [fused_attention_bwd_f32_param_13];
ld.param.u32 %r71, [fused_attention_bwd_f32_param_14];
cvta.to.global.u64 %rd1, %rd31;
cvta.to.global.u64 %rd2, %rd30;
cvta.to.global.u64 %rd3, %rd34;
cvta.to.global.u64 %rd4, %rd28;
cvta.to.global.u64 %rd5, %rd33;
cvta.to.global.u64 %rd6, %rd29;
cvta.to.global.u64 %rd7, %rd35;
cvta.to.global.u64 %rd8, %rd32;
mov.u32 %r73, %ntid.x;
mov.u32 %r74, %ctaid.x;
mov.u32 %r75, %tid.x;
mad.lo.s32 %r1, %r74, %r73, %r75;
mul.lo.s32 %r76, %r67, %r72;
mul.lo.s32 %r77, %r76, %r68;
setp.ge.u32 %p1, %r1, %r77;
@%p1 bra $L__BB3_67;
mul.lo.s32 %r78, %r68, %r67;
div.u32 %r79, %r1, %r78;
mul.lo.s32 %r80, %r79, %r78;
sub.s32 %r81, %r1, %r80;
div.u32 %r82, %r81, %r68;
mul.lo.s32 %r83, %r82, %r68;
sub.s32 %r84, %r81, %r83;
mad.lo.s32 %r85, %r79, %r67, %r82;
mul.lo.s32 %r86, %r70, %r68;
mul.lo.s32 %r87, %r86, %r85;
mul.lo.s32 %r88, %r70, %r69;
mul.lo.s32 %r2, %r88, %r85;
cvt.u64.u32 %rd36, %r87;
mul.lo.s32 %r89, %r84, %r70;
cvt.u64.u32 %rd37, %r89;
add.s64 %rd9, %rd36, %rd37;
add.s32 %r90, %r84, 1;
setp.lt.u32 %p2, %r90, %r69;
setp.ne.s32 %p3, %r71, 0;
and.pred %p4, %p3, %p2;
selp.b32 %r3, %r90, %r69, %p4;
setp.eq.s32 %p5, %r3, 0;
mov.f32 %f266, 0fFF7FFFFF;
@%p5 bra $L__BB3_17;
setp.eq.s32 %p6, %r70, 0;
@%p6 bra $L__BB3_12;
add.s32 %r4, %r70, -1;
and.b32 %r5, %r70, 3;
sub.s32 %r6, %r70, %r5;
cvt.u64.u32 %rd10, %r2;
mov.f32 %f266, 0fFF7FFFFF;
mov.u32 %r91, 0;
mov.u32 %r117, %r91;
$L__BB3_4:
mul.lo.s32 %r93, %r117, %r70;
cvt.u64.u32 %rd38, %r93;
add.s64 %rd11, %rd38, %rd10;
setp.lt.u32 %p7, %r4, 3;
mov.f32 %f261, 0f00000000;
mov.u32 %r120, %r91;
@%p7 bra $L__BB3_7;
mov.f32 %f261, 0f00000000;
mov.u32 %r120, 0;
mov.u32 %r119, %r6;
$L__BB3_6:
cvt.u64.u32 %rd39, %r120;
add.s64 %rd40, %rd9, %rd39;
shl.b64 %rd41, %rd40, 2;
add.s64 %rd42, %rd4, %rd41;
add.s64 %rd43, %rd11, %rd39;
shl.b64 %rd44, %rd43, 2;
add.s64 %rd45, %rd6, %rd44;
ld.global.nc.f32 %f71, [%rd45];
ld.global.nc.f32 %f72, [%rd42];
fma.rn.ftz.f32 %f73, %f72, %f71, %f261;
ld.global.nc.f32 %f74, [%rd45+4];
ld.global.nc.f32 %f75, [%rd42+4];
fma.rn.ftz.f32 %f76, %f75, %f74, %f73;
ld.global.nc.f32 %f77, [%rd45+8];
ld.global.nc.f32 %f78, [%rd42+8];
fma.rn.ftz.f32 %f79, %f78, %f77, %f76;
ld.global.nc.f32 %f80, [%rd45+12];
ld.global.nc.f32 %f81, [%rd42+12];
fma.rn.ftz.f32 %f261, %f81, %f80, %f79;
add.s32 %r120, %r120, 4;
add.s32 %r119, %r119, -4;
setp.ne.s32 %p8, %r119, 0;
@%p8 bra $L__BB3_6;
$L__BB3_7:
setp.eq.s32 %p9, %r5, 0;
@%p9 bra $L__BB3_11;
setp.eq.s32 %p10, %r5, 1;
cvt.u64.u32 %rd46, %r120;
add.s64 %rd47, %rd9, %rd46;
shl.b64 %rd48, %rd47, 2;
add.s64 %rd12, %rd4, %rd48;
add.s64 %rd49, %rd11, %rd46;
shl.b64 %rd50, %rd49, 2;
add.s64 %rd13, %rd6, %rd50;
ld.global.nc.f32 %f82, [%rd13];
ld.global.nc.f32 %f83, [%rd12];
fma.rn.ftz.f32 %f261, %f83, %f82, %f261;
@%p10 bra $L__BB3_11;
setp.eq.s32 %p11, %r5, 2;
ld.global.nc.f32 %f84, [%rd13+4];
ld.global.nc.f32 %f85, [%rd12+4];
fma.rn.ftz.f32 %f261, %f85, %f84, %f261;
@%p11 bra $L__BB3_11;
ld.global.nc.f32 %f86, [%rd13+8];
ld.global.nc.f32 %f87, [%rd12+8];
fma.rn.ftz.f32 %f261, %f87, %f86, %f261;
$L__BB3_11:
mul.ftz.f32 %f88, %f261, %f65;
setp.gt.ftz.f32 %p12, %f88, %f266;
selp.f32 %f266, %f88, %f266, %p12;
add.s32 %r117, %r117, 1;
setp.lt.u32 %p13, %r117, %r3;
@%p13 bra $L__BB3_4;
bra.uni $L__BB3_17;
$L__BB3_12:
mul.ftz.f32 %f11, %f65, 0f00000000;
and.b32 %r122, %r3, 3;
add.s32 %r95, %r3, -1;
setp.lt.u32 %p14, %r95, 3;
mov.f32 %f266, 0fFF7FFFFF;
@%p14 bra $L__BB3_15;
sub.s32 %r121, %r3, %r122;
mov.f32 %f266, 0fFF7FFFFF;
$L__BB3_14:
setp.gt.ftz.f32 %p15, %f11, %f266;
selp.f32 %f266, %f11, %f266, %p15;
add.s32 %r121, %r121, -4;
setp.ne.s32 %p16, %r121, 0;
@%p16 bra $L__BB3_14;
$L__BB3_15:
setp.eq.s32 %p17, %r122, 0;
@%p17 bra $L__BB3_17;
$L__BB3_16:
.pragma "nounroll";
setp.gt.ftz.f32 %p18, %f11, %f266;
selp.f32 %f266, %f11, %f266, %p18;
add.s32 %r122, %r122, -1;
setp.ne.s32 %p19, %r122, 0;
@%p19 bra $L__BB3_16;
$L__BB3_17:
mov.f32 %f277, 0f00000000;
mov.f32 %f276, %f277;
@%p5 bra $L__BB3_34;
setp.eq.s32 %p21, %r70, 0;
@%p21 bra $L__BB3_28;
add.s32 %r20, %r70, -1;
and.b32 %r21, %r70, 3;
sub.s32 %r22, %r70, %r21;
cvt.u64.u32 %rd14, %r2;
mov.f32 %f93, 0f00000000;
mov.u32 %r96, 0;
mov.u32 %r123, %r96;
mov.f32 %f276, %f93;
$L__BB3_20:
mul.lo.s32 %r98, %r123, %r70;
cvt.u64.u32 %rd51, %r98;
add.s64 %rd15, %rd51, %rd14;
setp.lt.u32 %p22, %r20, 3;
mov.u32 %r126, %r96;
mov.f32 %f271, %f93;
@%p22 bra $L__BB3_23;
mov.f32 %f271, 0f00000000;
mov.u32 %r126, 0;
mov.u32 %r125, %r22;
$L__BB3_22:
cvt.u64.u32 %rd52, %r126;
add.s64 %rd53, %rd9, %rd52;
shl.b64 %rd54, %rd53, 2;
add.s64 %rd55, %rd4, %rd54;
add.s64 %rd56, %rd15, %rd52;
shl.b64 %rd57, %rd56, 2;
add.s64 %rd58, %rd6, %rd57;
ld.global.nc.f32 %f97, [%rd58];
ld.global.nc.f32 %f98, [%rd55];
fma.rn.ftz.f32 %f99, %f98, %f97, %f271;
ld.global.nc.f32 %f100, [%rd58+4];
ld.global.nc.f32 %f101, [%rd55+4];
fma.rn.ftz.f32 %f102, %f101, %f100, %f99;
ld.global.nc.f32 %f103, [%rd58+8];
ld.global.nc.f32 %f104, [%rd55+8];
fma.rn.ftz.f32 %f105, %f104, %f103, %f102;
ld.global.nc.f32 %f106, [%rd58+12];
ld.global.nc.f32 %f107, [%rd55+12];
fma.rn.ftz.f32 %f271, %f107, %f106, %f105;
add.s32 %r126, %r126, 4;
add.s32 %r125, %r125, -4;
setp.ne.s32 %p23, %r125, 0;
@%p23 bra $L__BB3_22;
$L__BB3_23:
setp.eq.s32 %p24, %r21, 0;
@%p24 bra $L__BB3_27;
setp.eq.s32 %p25, %r21, 1;
cvt.u64.u32 %rd59, %r126;
add.s64 %rd60, %rd9, %rd59;
shl.b64 %rd61, %rd60, 2;
add.s64 %rd16, %rd4, %rd61;
add.s64 %rd62, %rd15, %rd59;
shl.b64 %rd63, %rd62, 2;
add.s64 %rd17, %rd6, %rd63;
ld.global.nc.f32 %f108, [%rd17];
ld.global.nc.f32 %f109, [%rd16];
fma.rn.ftz.f32 %f271, %f109, %f108, %f271;
@%p25 bra $L__BB3_27;
setp.eq.s32 %p26, %r21, 2;
ld.global.nc.f32 %f110, [%rd17+4];
ld.global.nc.f32 %f111, [%rd16+4];
fma.rn.ftz.f32 %f271, %f111, %f110, %f271;
@%p26 bra $L__BB3_27;
ld.global.nc.f32 %f112, [%rd17+8];
ld.global.nc.f32 %f113, [%rd16+8];
fma.rn.ftz.f32 %f271, %f113, %f112, %f271;
$L__BB3_27:
mul.ftz.f32 %f114, %f271, %f65;
sub.ftz.f32 %f115, %f114, %f266;
mul.ftz.f32 %f116, %f115, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f117, %f116;
add.ftz.f32 %f276, %f276, %f117;
add.s32 %r123, %r123, 1;
setp.lt.u32 %p27, %r123, %r3;
@%p27 bra $L__BB3_20;
bra.uni $L__BB3_34;
$L__BB3_28:
add.s32 %r100, %r3, -1;
and.b32 %r128, %r3, 3;
setp.lt.u32 %p28, %r100, 3;
mov.f32 %f276, 0f00000000;
@%p28 bra $L__BB3_31;
sub.s32 %r127, %r3, %r128;
mul.ftz.f32 %f121, %f65, 0f00000000;
mov.f32 %f276, 0f00000000;
sub.ftz.f32 %f122, %f121, %f266;
mul.ftz.f32 %f123, %f122, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f29, %f123;
$L__BB3_30:
add.ftz.f32 %f124, %f276, %f29;
add.ftz.f32 %f125, %f124, %f29;
add.ftz.f32 %f126, %f125, %f29;
add.ftz.f32 %f276, %f126, %f29;
add.s32 %r127, %r127, -4;
setp.ne.s32 %p29, %r127, 0;
@%p29 bra $L__BB3_30;
$L__BB3_31:
setp.eq.s32 %p30, %r128, 0;
@%p30 bra $L__BB3_34;
mul.ftz.f32 %f127, %f65, 0f00000000;
sub.ftz.f32 %f128, %f127, %f266;
mul.ftz.f32 %f129, %f128, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f34, %f129;
$L__BB3_33:
.pragma "nounroll";
add.ftz.f32 %f276, %f276, %f34;
add.s32 %r128, %r128, -1;
setp.ne.s32 %p31, %r128, 0;
@%p31 bra $L__BB3_33;
$L__BB3_34:
setp.leu.ftz.f32 %p32, %f276, 0f00000000;
@%p32 bra $L__BB3_36;
rcp.approx.ftz.f32 %f277, %f276;
$L__BB3_36:
setp.eq.s32 %p33, %r70, 0;
mov.f32 %f282, 0f00000000;
@%p33 bra $L__BB3_42;
add.s32 %r102, %r70, -1;
and.b32 %r133, %r70, 3;
setp.lt.u32 %p34, %r102, 3;
mov.f32 %f282, 0f00000000;
mov.u32 %r131, 0;
@%p34 bra $L__BB3_40;
sub.s32 %r130, %r70, %r133;
mov.f32 %f282, 0f00000000;
mov.u32 %r131, 0;
$L__BB3_39:
cvt.u64.u32 %rd64, %r131;
add.s64 %rd65, %rd9, %rd64;
shl.b64 %rd66, %rd65, 2;
add.s64 %rd67, %rd8, %rd66;
add.s64 %rd68, %rd1, %rd66;
ld.global.nc.f32 %f135, [%rd68];
ld.global.nc.f32 %f136, [%rd67];
fma.rn.ftz.f32 %f137, %f136, %f135, %f282;
ld.global.nc.f32 %f138, [%rd68+4];
ld.global.nc.f32 %f139, [%rd67+4];
fma.rn.ftz.f32 %f140, %f139, %f138, %f137;
ld.global.nc.f32 %f141, [%rd68+8];
ld.global.nc.f32 %f142, [%rd67+8];
fma.rn.ftz.f32 %f143, %f142, %f141, %f140;
ld.global.nc.f32 %f144, [%rd68+12];
ld.global.nc.f32 %f145, [%rd67+12];
fma.rn.ftz.f32 %f282, %f145, %f144, %f143;
add.s32 %r131, %r131, 4;
add.s32 %r130, %r130, -4;
setp.ne.s32 %p35, %r130, 0;
@%p35 bra $L__BB3_39;
$L__BB3_40:
setp.eq.s32 %p36, %r133, 0;
@%p36 bra $L__BB3_42;
$L__BB3_41:
.pragma "nounroll";
cvt.u64.u32 %rd69, %r131;
add.s64 %rd70, %rd9, %rd69;
shl.b64 %rd71, %rd70, 2;
add.s64 %rd72, %rd8, %rd71;
add.s64 %rd73, %rd1, %rd71;
ld.global.nc.f32 %f146, [%rd73];
ld.global.nc.f32 %f147, [%rd72];
fma.rn.ftz.f32 %f282, %f147, %f146, %f282;
add.s32 %r131, %r131, 1;
add.s32 %r133, %r133, -1;
setp.ne.s32 %p37, %r133, 0;
@%p37 bra $L__BB3_41;
$L__BB3_42:
@%p5 bra $L__BB3_67;
@%p33 bra $L__BB3_67;
add.s32 %r47, %r70, -1;
and.b32 %r48, %r70, 3;
sub.s32 %r49, %r70, %r48;
cvt.u64.u32 %rd18, %r2;
mov.u32 %r104, 0;
mov.u32 %r134, %r104;
$L__BB3_45:
mul.lo.s32 %r106, %r134, %r70;
cvt.u64.u32 %rd74, %r106;
add.s64 %rd19, %rd74, %rd18;
setp.lt.u32 %p40, %r47, 3;
mov.f32 %f286, 0f00000000;
mov.u32 %r137, %r104;
@%p40 bra $L__BB3_48;
mov.f32 %f286, 0f00000000;
mov.u32 %r137, 0;
mov.u32 %r136, %r49;
$L__BB3_47:
cvt.u64.u32 %rd75, %r137;
add.s64 %rd76, %rd9, %rd75;
shl.b64 %rd77, %rd76, 2;
add.s64 %rd78, %rd4, %rd77;
add.s64 %rd79, %rd19, %rd75;
shl.b64 %rd80, %rd79, 2;
add.s64 %rd81, %rd6, %rd80;
ld.global.nc.f32 %f151, [%rd81];
ld.global.nc.f32 %f152, [%rd78];
fma.rn.ftz.f32 %f153, %f152, %f151, %f286;
ld.global.nc.f32 %f154, [%rd81+4];
ld.global.nc.f32 %f155, [%rd78+4];
fma.rn.ftz.f32 %f156, %f155, %f154, %f153;
ld.global.nc.f32 %f157, [%rd81+8];
ld.global.nc.f32 %f158, [%rd78+8];
fma.rn.ftz.f32 %f159, %f158, %f157, %f156;
ld.global.nc.f32 %f160, [%rd81+12];
ld.global.nc.f32 %f161, [%rd78+12];
fma.rn.ftz.f32 %f286, %f161, %f160, %f159;
add.s32 %r137, %r137, 4;
add.s32 %r136, %r136, -4;
setp.ne.s32 %p41, %r136, 0;
@%p41 bra $L__BB3_47;
$L__BB3_48:
setp.eq.s32 %p42, %r48, 0;
@%p42 bra $L__BB3_52;
setp.eq.s32 %p43, %r48, 1;
cvt.u64.u32 %rd82, %r137;
add.s64 %rd83, %rd9, %rd82;
shl.b64 %rd84, %rd83, 2;
add.s64 %rd20, %rd4, %rd84;
add.s64 %rd85, %rd19, %rd82;
shl.b64 %rd86, %rd85, 2;
add.s64 %rd21, %rd6, %rd86;
ld.global.nc.f32 %f162, [%rd21];
ld.global.nc.f32 %f163, [%rd20];
fma.rn.ftz.f32 %f286, %f163, %f162, %f286;
@%p43 bra $L__BB3_52;
setp.eq.s32 %p44, %r48, 2;
ld.global.nc.f32 %f164, [%rd21+4];
ld.global.nc.f32 %f165, [%rd20+4];
fma.rn.ftz.f32 %f286, %f165, %f164, %f286;
@%p44 bra $L__BB3_52;
ld.global.nc.f32 %f166, [%rd21+8];
ld.global.nc.f32 %f167, [%rd20+8];
fma.rn.ftz.f32 %f286, %f167, %f166, %f286;
$L__BB3_52:
mul.ftz.f32 %f170, %f286, %f65;
sub.ftz.f32 %f171, %f170, %f266;
mul.ftz.f32 %f172, %f171, 0f3FB8AA3B;
ex2.approx.ftz.f32 %f173, %f172;
mul.ftz.f32 %f55, %f277, %f173;
mov.f32 %f290, 0f00000000;
mov.u32 %r140, 0;
@%p40 bra $L__BB3_55;
mov.f32 %f290, 0f00000000;
mov.u32 %r140, 0;
mov.u32 %r139, %r49;
$L__BB3_54:
cvt.u64.u32 %rd87, %r140;
add.s64 %rd88, %rd9, %rd87;
shl.b64 %rd89, %rd88, 2;
add.s64 %rd90, %rd8, %rd89;
add.s64 %rd91, %rd19, %rd87;
shl.b64 %rd92, %rd91, 2;
add.s64 %rd93, %rd2, %rd92;
ld.global.nc.f32 %f175, [%rd93];
ld.global.nc.f32 %f176, [%rd90];
fma.rn.ftz.f32 %f177, %f176, %f175, %f290;
ld.global.nc.f32 %f178, [%rd93+4];
ld.global.nc.f32 %f179, [%rd90+4];
fma.rn.ftz.f32 %f180, %f179, %f178, %f177;
ld.global.nc.f32 %f181, [%rd93+8];
ld.global.nc.f32 %f182, [%rd90+8];
fma.rn.ftz.f32 %f183, %f182, %f181, %f180;
ld.global.nc.f32 %f184, [%rd93+12];
ld.global.nc.f32 %f185, [%rd90+12];
fma.rn.ftz.f32 %f290, %f185, %f184, %f183;
add.s32 %r140, %r140, 4;
add.s32 %r139, %r139, -4;
setp.ne.s32 %p46, %r139, 0;
@%p46 bra $L__BB3_54;
$L__BB3_55:
@%p42 bra $L__BB3_59;
setp.eq.s32 %p48, %r48, 1;
cvt.u64.u32 %rd94, %r140;
add.s64 %rd95, %rd9, %rd94;
shl.b64 %rd96, %rd95, 2;
add.s64 %rd22, %rd8, %rd96;
add.s64 %rd97, %rd19, %rd94;
shl.b64 %rd98, %rd97, 2;
add.s64 %rd23, %rd2, %rd98;
ld.global.nc.f32 %f186, [%rd23];
ld.global.nc.f32 %f187, [%rd22];
fma.rn.ftz.f32 %f290, %f187, %f186, %f290;
@%p48 bra $L__BB3_59;
setp.eq.s32 %p49, %r48, 2;
ld.global.nc.f32 %f188, [%rd23+4];
ld.global.nc.f32 %f189, [%rd22+4];
fma.rn.ftz.f32 %f290, %f189, %f188, %f290;
@%p49 bra $L__BB3_59;
ld.global.nc.f32 %f190, [%rd23+8];
ld.global.nc.f32 %f191, [%rd22+8];
fma.rn.ftz.f32 %f290, %f191, %f190, %f290;
$L__BB3_59:
sub.ftz.f32 %f192, %f290, %f282;
mul.ftz.f32 %f193, %f55, %f192;
mul.ftz.f32 %f64, %f193, %f65;
mov.u32 %r143, 0;
@%p40 bra $L__BB3_62;
mov.u32 %r143, 0;
mov.u32 %r142, %r49;
$L__BB3_61:
cvt.u64.u32 %rd99, %r143;
add.s64 %rd100, %rd19, %rd99;
shl.b64 %rd101, %rd100, 2;
add.s64 %rd102, %rd7, %rd101;
add.s64 %rd103, %rd9, %rd99;
shl.b64 %rd104, %rd103, 2;
add.s64 %rd105, %rd8, %rd104;
ld.global.nc.f32 %f194, [%rd105];
mul.ftz.f32 %f195, %f55, %f194;
atom.global.add.f32 %f196, [%rd102], %f195;
add.s64 %rd106, %rd6, %rd101;
ld.global.nc.f32 %f197, [%rd106];
add.s64 %rd107, %rd5, %rd104;
ld.global.f32 %f198, [%rd107];
fma.rn.ftz.f32 %f199, %f64, %f197, %f198;
st.global.f32 [%rd107], %f199;
add.s64 %rd108, %rd3, %rd101;
add.s64 %rd109, %rd4, %rd104;
ld.global.nc.f32 %f200, [%rd109];
mul.ftz.f32 %f201, %f64, %f200;
atom.global.add.f32 %f202, [%rd108], %f201;
add.s32 %r112, %r143, 1;
cvt.u64.u32 %rd110, %r112;
add.s64 %rd111, %rd19, %rd110;
shl.b64 %rd112, %rd111, 2;
add.s64 %rd113, %rd7, %rd112;
ld.global.nc.f32 %f203, [%rd105+4];
mul.ftz.f32 %f204, %f55, %f203;
atom.global.add.f32 %f205, [%rd113], %f204;
ld.global.nc.f32 %f206, [%rd106+4];
ld.global.f32 %f207, [%rd107+4];
fma.rn.ftz.f32 %f208, %f64, %f206, %f207;
st.global.f32 [%rd107+4], %f208;
add.s64 %rd114, %rd3, %rd112;
ld.global.nc.f32 %f209, [%rd109+4];
mul.ftz.f32 %f210, %f64, %f209;
atom.global.add.f32 %f211, [%rd114], %f210;
add.s32 %r113, %r143, 2;
cvt.u64.u32 %rd115, %r113;
add.s64 %rd116, %rd19, %rd115;
shl.b64 %rd117, %rd116, 2;
add.s64 %rd118, %rd7, %rd117;
ld.global.nc.f32 %f212, [%rd105+8];
mul.ftz.f32 %f213, %f55, %f212;
atom.global.add.f32 %f214, [%rd118], %f213;
ld.global.nc.f32 %f215, [%rd106+8];
ld.global.f32 %f216, [%rd107+8];
fma.rn.ftz.f32 %f217, %f64, %f215, %f216;
st.global.f32 [%rd107+8], %f217;
add.s64 %rd119, %rd3, %rd117;
ld.global.nc.f32 %f218, [%rd109+8];
mul.ftz.f32 %f219, %f64, %f218;
atom.global.add.f32 %f220, [%rd119], %f219;
add.s32 %r114, %r143, 3;
cvt.u64.u32 %rd120, %r114;
add.s64 %rd121, %rd19, %rd120;
shl.b64 %rd122, %rd121, 2;
add.s64 %rd123, %rd7, %rd122;
ld.global.nc.f32 %f221, [%rd105+12];
mul.ftz.f32 %f222, %f55, %f221;
atom.global.add.f32 %f223, [%rd123], %f222;
ld.global.nc.f32 %f224, [%rd106+12];
ld.global.f32 %f225, [%rd107+12];
fma.rn.ftz.f32 %f226, %f64, %f224, %f225;
st.global.f32 [%rd107+12], %f226;
add.s64 %rd124, %rd3, %rd122;
ld.global.nc.f32 %f227, [%rd109+12];
mul.ftz.f32 %f228, %f64, %f227;
atom.global.add.f32 %f229, [%rd124], %f228;
add.s32 %r143, %r143, 4;
add.s32 %r142, %r142, -4;
setp.ne.s32 %p51, %r142, 0;
@%p51 bra $L__BB3_61;
$L__BB3_62:
@%p42 bra $L__BB3_66;
setp.eq.s32 %p53, %r48, 1;
cvt.u64.u32 %rd125, %r143;
add.s64 %rd126, %rd19, %rd125;
shl.b64 %rd127, %rd126, 2;
add.s64 %rd128, %rd7, %rd127;
add.s64 %rd129, %rd9, %rd125;
shl.b64 %rd130, %rd129, 2;
add.s64 %rd24, %rd8, %rd130;
ld.global.nc.f32 %f230, [%rd24];
mul.ftz.f32 %f231, %f55, %f230;
atom.global.add.f32 %f232, [%rd128], %f231;
add.s64 %rd25, %rd6, %rd127;
ld.global.nc.f32 %f233, [%rd25];
add.s64 %rd26, %rd5, %rd130;
ld.global.f32 %f234, [%rd26];
fma.rn.ftz.f32 %f235, %f64, %f233, %f234;
st.global.f32 [%rd26], %f235;
add.s64 %rd131, %rd3, %rd127;
add.s64 %rd27, %rd4, %rd130;
ld.global.nc.f32 %f236, [%rd27];
mul.ftz.f32 %f237, %f64, %f236;
atom.global.add.f32 %f238, [%rd131], %f237;
@%p53 bra $L__BB3_66;
setp.eq.s32 %p54, %r48, 2;
add.s32 %r115, %r143, 1;
cvt.u64.u32 %rd132, %r115;
add.s64 %rd133, %rd19, %rd132;
shl.b64 %rd134, %rd133, 2;
add.s64 %rd135, %rd7, %rd134;
ld.global.nc.f32 %f239, [%rd24+4];
mul.ftz.f32 %f240, %f55, %f239;
atom.global.add.f32 %f241, [%rd135], %f240;
ld.global.nc.f32 %f242, [%rd25+4];
ld.global.f32 %f243, [%rd26+4];
fma.rn.ftz.f32 %f244, %f64, %f242, %f243;
st.global.f32 [%rd26+4], %f244;
add.s64 %rd136, %rd3, %rd134;
ld.global.nc.f32 %f245, [%rd27+4];
mul.ftz.f32 %f246, %f64, %f245;
atom.global.add.f32 %f247, [%rd136], %f246;
@%p54 bra $L__BB3_66;
add.s32 %r116, %r143, 2;
cvt.u64.u32 %rd137, %r116;
add.s64 %rd138, %rd19, %rd137;
shl.b64 %rd139, %rd138, 2;
add.s64 %rd140, %rd7, %rd139;
ld.global.nc.f32 %f248, [%rd24+8];
mul.ftz.f32 %f249, %f55, %f248;
atom.global.add.f32 %f250, [%rd140], %f249;
ld.global.nc.f32 %f251, [%rd25+8];
ld.global.f32 %f252, [%rd26+8];
fma.rn.ftz.f32 %f253, %f64, %f251, %f252;
st.global.f32 [%rd26+8], %f253;
add.s64 %rd141, %rd3, %rd139;
ld.global.nc.f32 %f254, [%rd27+8];
mul.ftz.f32 %f255, %f64, %f254;
atom.global.add.f32 %f256, [%rd141], %f255;
$L__BB3_66:
add.s32 %r134, %r134, 1;
setp.lt.u32 %p55, %r134, %r3;
@%p55 bra $L__BB3_45;
$L__BB3_67:
ret;
}