//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35059454
// Cuda compilation tools, release 12.6, V12.6.85
// Based on NVVM 7.0.1
//
.version 8.5
.target sm_50
.address_size 64
// .globl fused_attention_fwd_f32
// fused_attention_fwd_f32
//
// Each thread handles one output row of param_9 floats. For its row the
// thread (1) finds the maximum of scale * dot(A_row, B_row[j]) over j,
// (2) zeroes the output row, (3) accumulates w_j * C_row[j] into the output
// row with w_j = exp(scale * dot_j - max) while summing the w_j, and
// (4) multiplies the output row by the reciprocal of that sum.
// Presumably A/B/C are Q/K/V of a softmax attention - confirm against the
// CUDA source / host launcher.
//
// Parameters (roles are read off the address arithmetic below):
//   param_0  u64  pointer, f32, read-only (ld.global.nc); indexed by the
//                 thread's own row offset (presumably Q).
//   param_1  u64  pointer, f32, read-only; indexed by group base + j*param_9
//                 (presumably K).
//   param_2  u64  pointer, f32, read-only; same indexing as param_1
//                 (presumably V).
//   param_3  u64  pointer, f32 output; written, re-read and rewritten in place.
//   param_4  f32  factor multiplied into every dot product.
//   param_5  u32  outermost extent of the thread index space (presumably batch).
//   param_6  u32  middle extent (presumably heads).
//   param_7  u32  innermost extent / rows per group in param_0 and param_3
//                 (presumably query length).
//   param_8  u32  number of rows j iterated in param_1 / param_2 per group
//                 (presumably key/value length).
//   param_9  u32  row length in f32 elements (presumably head dimension).
//   param_10 u32  flag; when nonzero the j loops stop as soon as j exceeds the
//                 thread's innermost index (presumably causal masking).
//
// NOTE(review): element offsets are formed with 32-bit multiplies and only
// then zero-extended to 64 bits, so they wrap modulo 2^32 for very large
// tensors.
.visible .entry fused_attention_fwd_f32(
.param .u64 fused_attention_fwd_f32_param_0,
.param .u64 fused_attention_fwd_f32_param_1,
.param .u64 fused_attention_fwd_f32_param_2,
.param .u64 fused_attention_fwd_f32_param_3,
.param .f32 fused_attention_fwd_f32_param_4,
.param .u32 fused_attention_fwd_f32_param_5,
.param .u32 fused_attention_fwd_f32_param_6,
.param .u32 fused_attention_fwd_f32_param_7,
.param .u32 fused_attention_fwd_f32_param_8,
.param .u32 fused_attention_fwd_f32_param_9,
.param .u32 fused_attention_fwd_f32_param_10
)
{
.reg .pred %p<66>;
.reg .f32 %f<279>;
.reg .b32 %r<175>;
.reg .b64 %rd<122>;
// ---- Load parameters -------------------------------------------------------
// %f56 = scale, %r86/%r81/%r82 = index-space extents, %r83 = row count j,
// %r84 = row length, %r85 = early-stop flag.
ld.param.u64 %rd24, [fused_attention_fwd_f32_param_0];
ld.param.u64 %rd25, [fused_attention_fwd_f32_param_1];
ld.param.u64 %rd26, [fused_attention_fwd_f32_param_2];
ld.param.u64 %rd27, [fused_attention_fwd_f32_param_3];
ld.param.f32 %f56, [fused_attention_fwd_f32_param_4];
ld.param.u32 %r86, [fused_attention_fwd_f32_param_5];
ld.param.u32 %r81, [fused_attention_fwd_f32_param_6];
ld.param.u32 %r82, [fused_attention_fwd_f32_param_7];
ld.param.u32 %r83, [fused_attention_fwd_f32_param_8];
ld.param.u32 %r84, [fused_attention_fwd_f32_param_9];
ld.param.u32 %r85, [fused_attention_fwd_f32_param_10];
// Global-space pointers: %rd1 = output (param_3), %rd2 = param_2,
// %rd3 = param_1, %rd4 = param_0.
cvta.to.global.u64 %rd1, %rd27;
cvta.to.global.u64 %rd2, %rd26;
cvta.to.global.u64 %rd3, %rd25;
cvta.to.global.u64 %rd4, %rd24;
// ---- Thread index and bounds check -----------------------------------------
// %r1 = ctaid.x * ntid.x + tid.x; threads at or beyond
// param_5 * param_6 * param_7 return immediately.
mov.u32 %r87, %ntid.x;
mov.u32 %r88, %ctaid.x;
mov.u32 %r89, %tid.x;
mad.lo.s32 %r1, %r88, %r87, %r89;
mul.lo.s32 %r90, %r81, %r86;
mul.lo.s32 %r91, %r90, %r82;
setp.ge.u32 %p1, %r1, %r91;
@%p1 bra $L__BB0_83;
// ---- Decompose the flat index ----------------------------------------------
// %r93 = %r1 / (param_7 * param_6), %r96 = middle index, %r2 = innermost
// index (%r1 mod param_7). The divisors cannot be zero here: a zero extent
// makes the bound above zero, so every thread has already exited.
mul.lo.s32 %r92, %r82, %r81;
div.u32 %r93, %r1, %r92;
mul.lo.s32 %r94, %r93, %r92;
sub.s32 %r95, %r1, %r94;
div.u32 %r96, %r95, %r82;
mul.lo.s32 %r97, %r96, %r82;
sub.s32 %r2, %r95, %r97;
// %r98 = group index (%r93 * param_6 + %r96).
// %r100 = group base in param_0/param_3 (group * param_7 * param_9).
// %r3   = group base in param_1/param_2 (group * param_8 * param_9).
// %rd5  = element offset of this thread's row in param_0 and param_3.
mad.lo.s32 %r98, %r93, %r81, %r96;
mul.lo.s32 %r99, %r84, %r82;
mul.lo.s32 %r100, %r99, %r98;
mul.lo.s32 %r101, %r84, %r83;
mul.lo.s32 %r3, %r101, %r98;
cvt.u64.u32 %rd28, %r100;
mul.lo.s32 %r102, %r2, %r84;
cvt.u64.u32 %rd29, %r102;
add.s64 %rd5, %rd28, %rd29;
// ---- Pass 1: running maximum of the scaled dot products --------------------
// %f265 starts at 0fFF7FFFFF (-FLT_MAX). %p2 (param_8 == 0) skips the pass
// and is reused later to skip the accumulation pass.
setp.eq.s32 %p2, %r83, 0;
mov.f32 %f265, 0fFF7FFFFF;
@%p2 bra $L__BB0_29;
// Dispatch on the flag (param_10) and on param_9 == 0:
//   flag != 0, len != 0 -> $L__BB0_5     flag == 0, len != 0 -> $L__BB0_19
//   flag != 0, len == 0 -> $L__BB0_14    flag == 0, len == 0 -> $L__BB0_27
setp.eq.s32 %p3, %r85, 0;
cvt.u64.u32 %rd6, %r3;
@%p3 bra $L__BB0_17;
setp.eq.s32 %p4, %r84, 0;
@%p4 bra $L__BB0_14;
// %r5 = len & 3 (remainder count), %r6 = len - %r5 (elements handled by the
// 4x-unrolled loop), %r140 = j.
add.s32 %r4, %r84, -1;
and.b32 %r5, %r84, 3;
sub.s32 %r6, %r84, %r5;
mov.f32 %f265, 0fFF7FFFFF;
mov.u32 %r140, 0;
// Flag-set variant of the max loop: leaves as soon as j > %r2.
$L__BB0_5:
setp.gt.u32 %p5, %r140, %r2;
@%p5 bra $L__BB0_29;
// %p6: len < 4, so the unrolled loop is skipped. %rd7 = element offset of
// row j in param_1; %f257 = dot-product accumulator.
setp.lt.u32 %p6, %r4, 3;
mul.lo.s32 %r105, %r140, %r84;
cvt.u64.u32 %rd30, %r105;
add.s64 %rd7, %rd30, %rd6;
mov.f32 %f257, 0f00000000;
mov.u32 %r143, 0;
@%p6 bra $L__BB0_9;
mov.f32 %f257, 0f00000000;
mov.u32 %r143, 0;
mov.u32 %r142, %r6;
// Dot product, four elements per iteration (%r143 = element index,
// %r142 = elements left for this loop).
$L__BB0_8:
cvt.u64.u32 %rd31, %r143;
add.s64 %rd32, %rd5, %rd31;
shl.b64 %rd33, %rd32, 2;
add.s64 %rd34, %rd4, %rd33;
add.s64 %rd35, %rd7, %rd31;
shl.b64 %rd36, %rd35, 2;
add.s64 %rd37, %rd3, %rd36;
ld.global.nc.f32 %f62, [%rd37];
ld.global.nc.f32 %f63, [%rd34];
fma.rn.f32 %f64, %f63, %f62, %f257;
ld.global.nc.f32 %f65, [%rd37+4];
ld.global.nc.f32 %f66, [%rd34+4];
fma.rn.f32 %f67, %f66, %f65, %f64;
ld.global.nc.f32 %f68, [%rd37+8];
ld.global.nc.f32 %f69, [%rd34+8];
fma.rn.f32 %f70, %f69, %f68, %f67;
ld.global.nc.f32 %f71, [%rd37+12];
ld.global.nc.f32 %f72, [%rd34+12];
fma.rn.f32 %f257, %f72, %f71, %f70;
add.s32 %r143, %r143, 4;
add.s32 %r142, %r142, -4;
setp.ne.s32 %p7, %r142, 0;
@%p7 bra $L__BB0_8;
// Remainder: the last len & 3 elements (0 to 3), fully peeled.
$L__BB0_9:
setp.eq.s32 %p8, %r5, 0;
@%p8 bra $L__BB0_13;
setp.eq.s32 %p9, %r5, 1;
cvt.u64.u32 %rd38, %r143;
add.s64 %rd39, %rd5, %rd38;
shl.b64 %rd40, %rd39, 2;
add.s64 %rd8, %rd4, %rd40;
add.s64 %rd41, %rd7, %rd38;
shl.b64 %rd42, %rd41, 2;
add.s64 %rd9, %rd3, %rd42;
ld.global.nc.f32 %f73, [%rd9];
ld.global.nc.f32 %f74, [%rd8];
fma.rn.f32 %f257, %f74, %f73, %f257;
@%p9 bra $L__BB0_13;
setp.eq.s32 %p10, %r5, 2;
ld.global.nc.f32 %f75, [%rd9+4];
ld.global.nc.f32 %f76, [%rd8+4];
fma.rn.f32 %f257, %f76, %f75, %f257;
@%p10 bra $L__BB0_13;
ld.global.nc.f32 %f77, [%rd9+8];
ld.global.nc.f32 %f78, [%rd8+8];
fma.rn.f32 %f257, %f78, %f77, %f257;
// Scale the dot product and fold it into the running maximum. setp.gt is
// false for NaN, so a NaN score never replaces the current maximum.
$L__BB0_13:
mul.f32 %f79, %f257, %f56;
setp.gt.f32 %p11, %f79, %f265;
selp.f32 %f265, %f79, %f265, %p11;
add.s32 %r140, %r140, 1;
setp.lt.u32 %p12, %r140, %r83;
@%p12 bra $L__BB0_5;
bra.uni $L__BB0_29;
// Flag-clear variant of the max loop: same dot product and max update, but
// j always runs over all param_8 rows.
$L__BB0_17:
setp.eq.s32 %p16, %r84, 0;
@%p16 bra $L__BB0_27;
add.s32 %r16, %r84, -1;
and.b32 %r17, %r84, 3;
sub.s32 %r18, %r84, %r17;
mov.f32 %f265, 0fFF7FFFFF;
mov.u32 %r108, 0;
mov.u32 %r145, %r108;
// %r145 = j, %rd10 = element offset of row j in param_1, %f263 = accumulator.
$L__BB0_19:
mul.lo.s32 %r110, %r145, %r84;
cvt.u64.u32 %rd43, %r110;
add.s64 %rd10, %rd43, %rd6;
setp.lt.u32 %p17, %r16, 3;
mov.f32 %f263, 0f00000000;
mov.u32 %r148, %r108;
@%p17 bra $L__BB0_22;
mov.f32 %f263, 0f00000000;
mov.u32 %r148, 0;
mov.u32 %r147, %r18;
// Dot product, four elements per iteration.
$L__BB0_21:
cvt.u64.u32 %rd44, %r148;
add.s64 %rd45, %rd5, %rd44;
shl.b64 %rd46, %rd45, 2;
add.s64 %rd47, %rd4, %rd46;
add.s64 %rd48, %rd10, %rd44;
shl.b64 %rd49, %rd48, 2;
add.s64 %rd50, %rd3, %rd49;
ld.global.nc.f32 %f85, [%rd50];
ld.global.nc.f32 %f86, [%rd47];
fma.rn.f32 %f87, %f86, %f85, %f263;
ld.global.nc.f32 %f88, [%rd50+4];
ld.global.nc.f32 %f89, [%rd47+4];
fma.rn.f32 %f90, %f89, %f88, %f87;
ld.global.nc.f32 %f91, [%rd50+8];
ld.global.nc.f32 %f92, [%rd47+8];
fma.rn.f32 %f93, %f92, %f91, %f90;
ld.global.nc.f32 %f94, [%rd50+12];
ld.global.nc.f32 %f95, [%rd47+12];
fma.rn.f32 %f263, %f95, %f94, %f93;
add.s32 %r148, %r148, 4;
add.s32 %r147, %r147, -4;
setp.ne.s32 %p18, %r147, 0;
@%p18 bra $L__BB0_21;
// Remainder: the last len & 3 elements.
$L__BB0_22:
setp.eq.s32 %p19, %r17, 0;
@%p19 bra $L__BB0_26;
setp.eq.s32 %p20, %r17, 1;
cvt.u64.u32 %rd51, %r148;
add.s64 %rd52, %rd5, %rd51;
shl.b64 %rd53, %rd52, 2;
add.s64 %rd11, %rd4, %rd53;
add.s64 %rd54, %rd10, %rd51;
shl.b64 %rd55, %rd54, 2;
add.s64 %rd12, %rd3, %rd55;
ld.global.nc.f32 %f96, [%rd12];
ld.global.nc.f32 %f97, [%rd11];
fma.rn.f32 %f263, %f97, %f96, %f263;
@%p20 bra $L__BB0_26;
setp.eq.s32 %p21, %r17, 2;
ld.global.nc.f32 %f98, [%rd12+4];
ld.global.nc.f32 %f99, [%rd11+4];
fma.rn.f32 %f263, %f99, %f98, %f263;
@%p21 bra $L__BB0_26;
ld.global.nc.f32 %f100, [%rd12+8];
ld.global.nc.f32 %f101, [%rd11+8];
fma.rn.f32 %f263, %f101, %f100, %f263;
// Scale and update the running maximum.
$L__BB0_26:
mul.f32 %f102, %f263, %f56;
setp.gt.f32 %p22, %f102, %f265;
selp.f32 %f265, %f102, %f265, %p22;
add.s32 %r145, %r145, 1;
setp.lt.u32 %p23, %r145, %r83;
@%p23 bra $L__BB0_19;
bra.uni $L__BB0_29;
// Zero-length rows with the flag set: every score is scale * 0.0 (%f11);
// the loop only replays the max update under the j <= %r2 bound.
$L__BB0_14:
mul.f32 %f11, %f56, 0f00000000;
mov.f32 %f265, 0fFF7FFFFF;
mov.u32 %r144, 0;
$L__BB0_15:
setp.gt.u32 %p13, %r144, %r2;
@%p13 bra $L__BB0_29;
setp.gt.f32 %p14, %f11, %f265;
selp.f32 %f265, %f11, %f265, %p14;
add.s32 %r144, %r144, 1;
setp.lt.u32 %p15, %r144, %r83;
@%p15 bra $L__BB0_15;
bra.uni $L__BB0_29;
// Zero-length rows with the flag clear: same constant score (%f24), j runs
// over all param_8 rows.
$L__BB0_27:
mul.f32 %f24, %f56, 0f00000000;
mov.f32 %f265, 0fFF7FFFFF;
mov.u32 %r149, 0;
$L__BB0_28:
setp.gt.f32 %p24, %f24, %f265;
selp.f32 %f265, %f24, %f265, %p24;
add.s32 %r149, %r149, 1;
setp.lt.u32 %p25, %r149, %r83;
@%p25 bra $L__BB0_28;
// ---- Pass 2: zero this thread's output row ---------------------------------
// %p26 (param_9 == 0) is reused by every later phase. The stores write the
// u32 value 0, which is the bit pattern of 0.0f.
$L__BB0_29:
setp.eq.s32 %p26, %r84, 0;
@%p26 bra $L__BB0_35;
add.s32 %r114, %r84, -1;
and.b32 %r154, %r84, 3;
setp.lt.u32 %p27, %r114, 3;
mov.u32 %r152, 0;
@%p27 bra $L__BB0_33;
sub.s32 %r151, %r84, %r154;
mov.u32 %r115, 0;
mov.u32 %r152, %r115;
// Four stores per iteration; %r152 = element index.
$L__BB0_32:
cvt.u64.u32 %rd56, %r152;
add.s64 %rd57, %rd5, %rd56;
shl.b64 %rd58, %rd57, 2;
add.s64 %rd59, %rd1, %rd58;
st.global.u32 [%rd59], %r115;
st.global.u32 [%rd59+4], %r115;
st.global.u32 [%rd59+8], %r115;
st.global.u32 [%rd59+12], %r115;
add.s32 %r152, %r152, 4;
add.s32 %r151, %r151, -4;
setp.ne.s32 %p28, %r151, 0;
@%p28 bra $L__BB0_32;
// Remainder: one store per iteration for the last len & 3 elements.
$L__BB0_33:
setp.eq.s32 %p29, %r154, 0;
@%p29 bra $L__BB0_35;
$L__BB0_34:
.pragma "nounroll";
cvt.u64.u32 %rd60, %r152;
add.s64 %rd61, %rd5, %rd60;
shl.b64 %rd62, %rd61, 2;
add.s64 %rd63, %rd1, %rd62;
mov.u32 %r117, 0;
st.global.u32 [%rd63], %r117;
add.s32 %r152, %r152, 1;
add.s32 %r154, %r154, -1;
setp.ne.s32 %p30, %r154, 0;
@%p30 bra $L__BB0_34;
// ---- Pass 3: exponential weights and weighted accumulation -----------------
// %f277 = running sum of weights, %f278 = its reciprocal (stays 0.0 unless
// set at the rcp below). With param_8 == 0 (%p2) the pass is skipped.
$L__BB0_35:
mov.f32 %f278, 0f00000000;
mov.f32 %f277, %f278;
@%p2 bra $L__BB0_75;
// %rd13 = group base in param_1/param_2, %r39 = len - 1 (len < 4 test).
setp.eq.s32 %p32, %r85, 0;
cvt.u64.u32 %rd13, %r3;
add.s32 %r39, %r84, -1;
@%p32 bra $L__BB0_56;
and.b32 %r40, %r84, 3;
sub.s32 %r41, %r84, %r40;
mov.f32 %f277, 0f00000000;
mov.u32 %r155, 0;
// Flag-set variant: %r155 = j, loop leaves as soon as j > %r2.
$L__BB0_38:
setp.gt.u32 %p33, %r155, %r2;
@%p33 bra $L__BB0_75;
// %rd14 = element offset of row j (shared by the param_1 and param_2 reads).
mul.lo.s32 %r119, %r155, %r84;
cvt.u64.u32 %rd64, %r119;
add.s64 %rd14, %rd64, %rd13;
mov.f32 %f270, 0f00000000;
// len == 0: the dot product stays 0.0.
@%p26 bra $L__BB0_47;
setp.lt.u32 %p35, %r39, 3;
mov.f32 %f270, 0f00000000;
mov.u32 %r158, 0;
@%p35 bra $L__BB0_43;
mov.f32 %f270, 0f00000000;
mov.u32 %r158, 0;
mov.u32 %r157, %r41;
// Recompute the dot product for row j (not kept from pass 1), 4x unrolled.
$L__BB0_42:
cvt.u64.u32 %rd65, %r158;
add.s64 %rd66, %rd5, %rd65;
shl.b64 %rd67, %rd66, 2;
add.s64 %rd68, %rd4, %rd67;
add.s64 %rd69, %rd14, %rd65;
shl.b64 %rd70, %rd69, 2;
add.s64 %rd71, %rd3, %rd70;
ld.global.nc.f32 %f110, [%rd71];
ld.global.nc.f32 %f111, [%rd68];
fma.rn.f32 %f112, %f111, %f110, %f270;
ld.global.nc.f32 %f113, [%rd71+4];
ld.global.nc.f32 %f114, [%rd68+4];
fma.rn.f32 %f115, %f114, %f113, %f112;
ld.global.nc.f32 %f116, [%rd71+8];
ld.global.nc.f32 %f117, [%rd68+8];
fma.rn.f32 %f118, %f117, %f116, %f115;
ld.global.nc.f32 %f119, [%rd71+12];
ld.global.nc.f32 %f120, [%rd68+12];
fma.rn.f32 %f270, %f120, %f119, %f118;
add.s32 %r158, %r158, 4;
add.s32 %r157, %r157, -4;
setp.ne.s32 %p36, %r157, 0;
@%p36 bra $L__BB0_42;
// Dot-product remainder (len & 3 elements).
$L__BB0_43:
setp.eq.s32 %p37, %r40, 0;
@%p37 bra $L__BB0_47;
setp.eq.s32 %p38, %r40, 1;
cvt.u64.u32 %rd72, %r158;
add.s64 %rd73, %rd5, %rd72;
shl.b64 %rd74, %rd73, 2;
add.s64 %rd15, %rd4, %rd74;
add.s64 %rd75, %rd14, %rd72;
shl.b64 %rd76, %rd75, 2;
add.s64 %rd16, %rd3, %rd76;
ld.global.nc.f32 %f121, [%rd16];
ld.global.nc.f32 %f122, [%rd15];
fma.rn.f32 %f270, %f122, %f121, %f270;
@%p38 bra $L__BB0_47;
setp.eq.s32 %p39, %r40, 2;
ld.global.nc.f32 %f123, [%rd16+4];
ld.global.nc.f32 %f124, [%rd15+4];
fma.rn.f32 %f270, %f124, %f123, %f270;
@%p39 bra $L__BB0_47;
ld.global.nc.f32 %f125, [%rd16+8];
ld.global.nc.f32 %f126, [%rd15+8];
fma.rn.f32 %f270, %f126, %f125, %f270;
// Weight for row j: x = scale * dot - max (%f128), then an exponential of x
// built from ex2.approx on a reduced argument (x * 0f3FB8AA3B, about
// log2(e), minus an integer part, plus a small correction term) times a
// float whose bits are %f135's low bits shifted into the exponent field.
// Presumably the toolchain's inline expansion of expf() - confirm against
// the CUDA source. %f37 = weight, %f277 += weight.
$L__BB0_47:
mul.f32 %f127, %f270, %f56;
sub.f32 %f128, %f127, %f265;
mov.f32 %f129, 0f3F000000;
mov.f32 %f130, 0f3BBB989D;
fma.rn.f32 %f131, %f128, %f130, %f129;
cvt.sat.f32.f32 %f132, %f131;
mov.f32 %f133, 0f4B400001;
mov.f32 %f134, 0f437C0000;
fma.rm.f32 %f135, %f132, %f134, %f133;
add.f32 %f136, %f135, 0fCB40007F;
neg.f32 %f137, %f136;
mov.f32 %f138, 0f3FB8AA3B;
fma.rn.f32 %f139, %f128, %f138, %f137;
mov.f32 %f140, 0f32A57060;
fma.rn.f32 %f141, %f128, %f140, %f139;
mov.b32 %r122, %f135;
shl.b32 %r123, %r122, 23;
mov.b32 %f142, %r123;
ex2.approx.ftz.f32 %f143, %f141;
mul.f32 %f37, %f143, %f142;
add.f32 %f277, %f277, %f37;
// len == 0: nothing to accumulate into the output row.
@%p26 bra $L__BB0_55;
setp.lt.u32 %p41, %r39, 3;
mov.u32 %r161, 0;
@%p41 bra $L__BB0_51;
mov.u32 %r161, 0;
mov.u32 %r160, %r41;
// out[d] += weight * param_2[row j][d], 4x unrolled. The running output is
// kept in global memory: each element is loaded, updated and stored back.
$L__BB0_50:
cvt.u64.u32 %rd77, %r161;
add.s64 %rd78, %rd14, %rd77;
shl.b64 %rd79, %rd78, 2;
add.s64 %rd80, %rd2, %rd79;
ld.global.nc.f32 %f144, [%rd80];
add.s64 %rd81, %rd5, %rd77;
shl.b64 %rd82, %rd81, 2;
add.s64 %rd83, %rd1, %rd82;
ld.global.f32 %f145, [%rd83];
fma.rn.f32 %f146, %f37, %f144, %f145;
st.global.f32 [%rd83], %f146;
ld.global.f32 %f147, [%rd83+4];
ld.global.nc.f32 %f148, [%rd80+4];
fma.rn.f32 %f149, %f37, %f148, %f147;
st.global.f32 [%rd83+4], %f149;
ld.global.f32 %f150, [%rd83+8];
ld.global.nc.f32 %f151, [%rd80+8];
fma.rn.f32 %f152, %f37, %f151, %f150;
st.global.f32 [%rd83+8], %f152;
ld.global.nc.f32 %f153, [%rd80+12];
ld.global.f32 %f154, [%rd83+12];
fma.rn.f32 %f155, %f37, %f153, %f154;
st.global.f32 [%rd83+12], %f155;
add.s32 %r161, %r161, 4;
add.s32 %r160, %r160, -4;
setp.ne.s32 %p42, %r160, 0;
@%p42 bra $L__BB0_50;
// Accumulation remainder (len & 3 elements).
$L__BB0_51:
setp.eq.s32 %p43, %r40, 0;
@%p43 bra $L__BB0_55;
setp.eq.s32 %p44, %r40, 1;
cvt.u64.u32 %rd84, %r161;
add.s64 %rd85, %rd14, %rd84;
shl.b64 %rd86, %rd85, 2;
add.s64 %rd17, %rd2, %rd86;
ld.global.nc.f32 %f156, [%rd17];
add.s64 %rd87, %rd5, %rd84;
shl.b64 %rd88, %rd87, 2;
add.s64 %rd18, %rd1, %rd88;
ld.global.f32 %f157, [%rd18];
fma.rn.f32 %f158, %f37, %f156, %f157;
st.global.f32 [%rd18], %f158;
@%p44 bra $L__BB0_55;
setp.eq.s32 %p45, %r40, 2;
ld.global.nc.f32 %f159, [%rd17+4];
ld.global.f32 %f160, [%rd18+4];
fma.rn.f32 %f161, %f37, %f159, %f160;
st.global.f32 [%rd18+4], %f161;
@%p45 bra $L__BB0_55;
ld.global.nc.f32 %f162, [%rd17+8];
ld.global.f32 %f163, [%rd18+8];
fma.rn.f32 %f164, %f37, %f162, %f163;
st.global.f32 [%rd18+8], %f164;
// Next j for the flag-set variant.
$L__BB0_55:
add.s32 %r155, %r155, 1;
setp.lt.u32 %p46, %r155, %r83;
@%p46 bra $L__BB0_38;
bra.uni $L__BB0_75;
// Flag-clear variant of pass 3. Zero-length rows are handled separately at
// $L__BB0_73; otherwise j runs over all param_8 rows.
$L__BB0_56:
@%p26 bra $L__BB0_73;
and.b32 %r54, %r84, 3;
sub.s32 %r55, %r84, %r54;
mov.f32 %f165, 0f00000000;
mov.u32 %r126, 0;
mov.u32 %r162, %r126;
mov.f32 %f277, %f165;
// %r162 = j, %rd19 = element offset of row j. %p48 (len < 4) is set here and
// reused for the accumulation loop further down.
$L__BB0_58:
mul.lo.s32 %r128, %r162, %r84;
cvt.u64.u32 %rd89, %r128;
add.s64 %rd19, %rd89, %rd13;
setp.lt.u32 %p48, %r39, 3;
mov.u32 %r165, %r126;
mov.f32 %f275, %f165;
@%p48 bra $L__BB0_61;
mov.f32 %f275, 0f00000000;
mov.u32 %r165, 0;
mov.u32 %r164, %r55;
// Recompute the dot product for row j, 4x unrolled.
$L__BB0_60:
cvt.u64.u32 %rd90, %r165;
add.s64 %rd91, %rd5, %rd90;
shl.b64 %rd92, %rd91, 2;
add.s64 %rd93, %rd4, %rd92;
add.s64 %rd94, %rd19, %rd90;
shl.b64 %rd95, %rd94, 2;
add.s64 %rd96, %rd3, %rd95;
ld.global.nc.f32 %f169, [%rd96];
ld.global.nc.f32 %f170, [%rd93];
fma.rn.f32 %f171, %f170, %f169, %f275;
ld.global.nc.f32 %f172, [%rd96+4];
ld.global.nc.f32 %f173, [%rd93+4];
fma.rn.f32 %f174, %f173, %f172, %f171;
ld.global.nc.f32 %f175, [%rd96+8];
ld.global.nc.f32 %f176, [%rd93+8];
fma.rn.f32 %f177, %f176, %f175, %f174;
ld.global.nc.f32 %f178, [%rd96+12];
ld.global.nc.f32 %f179, [%rd93+12];
fma.rn.f32 %f275, %f179, %f178, %f177;
add.s32 %r165, %r165, 4;
add.s32 %r164, %r164, -4;
setp.ne.s32 %p49, %r164, 0;
@%p49 bra $L__BB0_60;
// Dot-product remainder. %p50 (len & 3 == 0) is reused at $L__BB0_68.
$L__BB0_61:
setp.eq.s32 %p50, %r54, 0;
@%p50 bra $L__BB0_65;
setp.eq.s32 %p51, %r54, 1;
cvt.u64.u32 %rd97, %r165;
add.s64 %rd98, %rd5, %rd97;
shl.b64 %rd99, %rd98, 2;
add.s64 %rd20, %rd4, %rd99;
add.s64 %rd100, %rd19, %rd97;
shl.b64 %rd101, %rd100, 2;
add.s64 %rd21, %rd3, %rd101;
ld.global.nc.f32 %f180, [%rd21];
ld.global.nc.f32 %f181, [%rd20];
fma.rn.f32 %f275, %f181, %f180, %f275;
@%p51 bra $L__BB0_65;
setp.eq.s32 %p52, %r54, 2;
ld.global.nc.f32 %f182, [%rd21+4];
ld.global.nc.f32 %f183, [%rd20+4];
fma.rn.f32 %f275, %f183, %f182, %f275;
@%p52 bra $L__BB0_65;
ld.global.nc.f32 %f184, [%rd21+8];
ld.global.nc.f32 %f185, [%rd20+8];
fma.rn.f32 %f275, %f185, %f184, %f275;
// Weight for row j: same exponential sequence as at $L__BB0_47, applied to
// x = scale * dot - max (%f187). %f48 = weight, %f277 += weight.
$L__BB0_65:
mul.f32 %f186, %f275, %f56;
sub.f32 %f187, %f186, %f265;
mov.f32 %f188, 0f3F000000;
mov.f32 %f189, 0f3BBB989D;
fma.rn.f32 %f190, %f187, %f189, %f188;
cvt.sat.f32.f32 %f191, %f190;
mov.f32 %f192, 0f4B400001;
mov.f32 %f193, 0f437C0000;
fma.rm.f32 %f194, %f191, %f193, %f192;
add.f32 %f195, %f194, 0fCB40007F;
neg.f32 %f196, %f195;
mov.f32 %f197, 0f3FB8AA3B;
fma.rn.f32 %f198, %f187, %f197, %f196;
mov.f32 %f199, 0f32A57060;
fma.rn.f32 %f200, %f187, %f199, %f198;
mov.b32 %r131, %f194;
shl.b32 %r132, %r131, 23;
mov.b32 %f201, %r132;
ex2.approx.ftz.f32 %f202, %f200;
mul.f32 %f48, %f202, %f201;
add.f32 %f277, %f277, %f48;
mov.u32 %r168, 0;
@%p48 bra $L__BB0_68;
mov.u32 %r168, 0;
mov.u32 %r167, %r55;
// out[d] += weight * param_2[row j][d], 4x unrolled, read-modify-write on
// global memory.
$L__BB0_67:
cvt.u64.u32 %rd102, %r168;
add.s64 %rd103, %rd19, %rd102;
shl.b64 %rd104, %rd103, 2;
add.s64 %rd105, %rd2, %rd104;
ld.global.nc.f32 %f203, [%rd105];
add.s64 %rd106, %rd5, %rd102;
shl.b64 %rd107, %rd106, 2;
add.s64 %rd108, %rd1, %rd107;
ld.global.f32 %f204, [%rd108];
fma.rn.f32 %f205, %f48, %f203, %f204;
st.global.f32 [%rd108], %f205;
ld.global.f32 %f206, [%rd108+4];
ld.global.nc.f32 %f207, [%rd105+4];
fma.rn.f32 %f208, %f48, %f207, %f206;
st.global.f32 [%rd108+4], %f208;
ld.global.f32 %f209, [%rd108+8];
ld.global.nc.f32 %f210, [%rd105+8];
fma.rn.f32 %f211, %f48, %f210, %f209;
st.global.f32 [%rd108+8], %f211;
ld.global.nc.f32 %f212, [%rd105+12];
ld.global.f32 %f213, [%rd108+12];
fma.rn.f32 %f214, %f48, %f212, %f213;
st.global.f32 [%rd108+12], %f214;
add.s32 %r168, %r168, 4;
add.s32 %r167, %r167, -4;
setp.ne.s32 %p54, %r167, 0;
@%p54 bra $L__BB0_67;
// Accumulation remainder (len & 3 elements).
$L__BB0_68:
@%p50 bra $L__BB0_72;
setp.eq.s32 %p56, %r54, 1;
cvt.u64.u32 %rd109, %r168;
add.s64 %rd110, %rd19, %rd109;
shl.b64 %rd111, %rd110, 2;
add.s64 %rd22, %rd2, %rd111;
ld.global.nc.f32 %f215, [%rd22];
add.s64 %rd112, %rd5, %rd109;
shl.b64 %rd113, %rd112, 2;
add.s64 %rd23, %rd1, %rd113;
ld.global.f32 %f216, [%rd23];
fma.rn.f32 %f217, %f48, %f215, %f216;
st.global.f32 [%rd23], %f217;
@%p56 bra $L__BB0_72;
setp.eq.s32 %p57, %r54, 2;
ld.global.nc.f32 %f218, [%rd22+4];
ld.global.f32 %f219, [%rd23+4];
fma.rn.f32 %f220, %f48, %f218, %f219;
st.global.f32 [%rd23+4], %f220;
@%p57 bra $L__BB0_72;
ld.global.nc.f32 %f221, [%rd22+8];
ld.global.f32 %f222, [%rd23+8];
fma.rn.f32 %f223, %f48, %f221, %f222;
st.global.f32 [%rd23+8], %f223;
// Next j for the flag-clear variant.
$L__BB0_72:
add.s32 %r162, %r162, 1;
setp.lt.u32 %p58, %r162, %r83;
@%p58 bra $L__BB0_58;
bra.uni $L__BB0_75;
// Flag clear and zero-length rows: the weight is the same for every j, so it
// is computed once (%f50, from x = scale * 0.0 - max) and added to the sum
// param_8 times. No output elements exist to update.
$L__BB0_73:
mul.f32 %f225, %f56, 0f00000000;
mov.f32 %f277, 0f00000000;
sub.f32 %f226, %f225, %f265;
mov.f32 %f227, 0f3F000000;
mov.f32 %f228, 0f3BBB989D;
fma.rn.f32 %f229, %f226, %f228, %f227;
cvt.sat.f32.f32 %f230, %f229;
mov.f32 %f231, 0f4B400001;
mov.f32 %f232, 0f437C0000;
fma.rm.f32 %f233, %f230, %f232, %f231;
add.f32 %f234, %f233, 0fCB40007F;
neg.f32 %f235, %f234;
mov.f32 %f236, 0f3FB8AA3B;
fma.rn.f32 %f237, %f226, %f236, %f235;
mov.f32 %f238, 0f32A57060;
fma.rn.f32 %f239, %f226, %f238, %f237;
mov.b32 %r135, %f233;
shl.b32 %r136, %r135, 23;
mov.b32 %f240, %r136;
ex2.approx.ftz.f32 %f241, %f239;
mul.f32 %f50, %f241, %f240;
mov.u32 %r169, 0;
$L__BB0_74:
add.f32 %f277, %f277, %f50;
add.s32 %r169, %r169, 1;
setp.lt.u32 %p59, %r169, %r83;
@%p59 bra $L__BB0_74;
// ---- Pass 4: normalise the output row --------------------------------------
// %f278 = 1 / sum only when sum > 0. setp.leu is also true for NaN, so a
// non-positive or NaN sum leaves %f278 at 0.0 and the row is scaled to zero.
$L__BB0_75:
setp.leu.f32 %p60, %f277, 0f00000000;
@%p60 bra $L__BB0_77;
rcp.rn.f32 %f278, %f277;
$L__BB0_77:
// len == 0: no output elements to scale.
@%p26 bra $L__BB0_83;
add.s32 %r138, %r84, -1;
and.b32 %r174, %r84, 3;
setp.lt.u32 %p62, %r138, 3;
mov.u32 %r172, 0;
@%p62 bra $L__BB0_81;
sub.s32 %r171, %r84, %r174;
mov.u32 %r172, 0;
// out[d] *= %f278, four elements per iteration.
$L__BB0_80:
cvt.u64.u32 %rd114, %r172;
add.s64 %rd115, %rd5, %rd114;
shl.b64 %rd116, %rd115, 2;
add.s64 %rd117, %rd1, %rd116;
ld.global.f32 %f243, [%rd117];
mul.f32 %f244, %f278, %f243;
st.global.f32 [%rd117], %f244;
ld.global.f32 %f245, [%rd117+4];
mul.f32 %f246, %f278, %f245;
st.global.f32 [%rd117+4], %f246;
ld.global.f32 %f247, [%rd117+8];
mul.f32 %f248, %f278, %f247;
st.global.f32 [%rd117+8], %f248;
ld.global.f32 %f249, [%rd117+12];
mul.f32 %f250, %f278, %f249;
st.global.f32 [%rd117+12], %f250;
add.s32 %r172, %r172, 4;
add.s32 %r171, %r171, -4;
setp.ne.s32 %p63, %r171, 0;
@%p63 bra $L__BB0_80;
// Remainder: one element per iteration for the last len & 3 elements.
$L__BB0_81:
setp.eq.s32 %p64, %r174, 0;
@%p64 bra $L__BB0_83;
$L__BB0_82:
.pragma "nounroll";
cvt.u64.u32 %rd118, %r172;
add.s64 %rd119, %rd5, %rd118;
shl.b64 %rd120, %rd119, 2;
add.s64 %rd121, %rd1, %rd120;
ld.global.f32 %f251, [%rd121];
mul.f32 %f252, %f278, %f251;
st.global.f32 [%rd121], %f252;
add.s32 %r172, %r172, 1;
add.s32 %r174, %r174, -1;
setp.ne.s32 %p65, %r174, 0;
@%p65 bra $L__BB0_82;
// Common exit: out-of-range threads and finished threads.
$L__BB0_83:
ret;
}
// .globl fused_attention_bwd_f32
.visible .entry fused_attention_bwd_f32(
.param .u64 fused_attention_bwd_f32_param_0,
.param .u64 fused_attention_bwd_f32_param_1,
.param .u64 fused_attention_bwd_f32_param_2,
.param .u64 fused_attention_bwd_f32_param_3,
.param .u64 fused_attention_bwd_f32_param_4,
.param .u64 fused_attention_bwd_f32_param_5,
.param .u64 fused_attention_bwd_f32_param_6,
.param .u64 fused_attention_bwd_f32_param_7,
.param .f32 fused_attention_bwd_f32_param_8,
.param .u32 fused_attention_bwd_f32_param_9,
.param .u32 fused_attention_bwd_f32_param_10,
.param .u32 fused_attention_bwd_f32_param_11,
.param .u32 fused_attention_bwd_f32_param_12,
.param .u32 fused_attention_bwd_f32_param_13,
.param .u32 fused_attention_bwd_f32_param_14
)
{
.reg .pred %p<56>;
.reg .f32 %f<346>;
.reg .b32 %r<152>;
.reg .b64 %rd<142>;
ld.param.u64 %rd28, [fused_attention_bwd_f32_param_0];
ld.param.u64 %rd29, [fused_attention_bwd_f32_param_1];
ld.param.u64 %rd30, [fused_attention_bwd_f32_param_2];
ld.param.u64 %rd31, [fused_attention_bwd_f32_param_3];
ld.param.u64 %rd32, [fused_attention_bwd_f32_param_4];
ld.param.u64 %rd33, [fused_attention_bwd_f32_param_5];
ld.param.u64 %rd34, [fused_attention_bwd_f32_param_6];
ld.param.u64 %rd35, [fused_attention_bwd_f32_param_7];
ld.param.f32 %f65, [fused_attention_bwd_f32_param_8];
ld.param.u32 %r72, [fused_attention_bwd_f32_param_9];
ld.param.u32 %r67, [fused_attention_bwd_f32_param_10];
ld.param.u32 %r68, [fused_attention_bwd_f32_param_11];
ld.param.u32 %r69, [fused_attention_bwd_f32_param_12];
ld.param.u32 %r70, [fused_attention_bwd_f32_param_13];
ld.param.u32 %r71, [fused_attention_bwd_f32_param_14];
cvta.to.global.u64 %rd1, %rd31;
cvta.to.global.u64 %rd2, %rd30;
cvta.to.global.u64 %rd3, %rd34;
cvta.to.global.u64 %rd4, %rd28;
cvta.to.global.u64 %rd5, %rd33;
cvta.to.global.u64 %rd6, %rd29;
cvta.to.global.u64 %rd7, %rd35;
cvta.to.global.u64 %rd8, %rd32;
mov.u32 %r73, %ntid.x;
mov.u32 %r74, %ctaid.x;
mov.u32 %r75, %tid.x;
mad.lo.s32 %r1, %r74, %r73, %r75;
mul.lo.s32 %r76, %r67, %r72;
mul.lo.s32 %r77, %r76, %r68;
setp.ge.u32 %p1, %r1, %r77;
@%p1 bra $L__BB1_67;
mul.lo.s32 %r78, %r68, %r67;
div.u32 %r79, %r1, %r78;
mul.lo.s32 %r80, %r79, %r78;
sub.s32 %r81, %r1, %r80;
div.u32 %r82, %r81, %r68;
mul.lo.s32 %r83, %r82, %r68;
sub.s32 %r84, %r81, %r83;
mad.lo.s32 %r85, %r79, %r67, %r82;
mul.lo.s32 %r86, %r70, %r68;
mul.lo.s32 %r87, %r86, %r85;
mul.lo.s32 %r88, %r70, %r69;
mul.lo.s32 %r2, %r88, %r85;
cvt.u64.u32 %rd36, %r87;
mul.lo.s32 %r89, %r84, %r70;
cvt.u64.u32 %rd37, %r89;
add.s64 %rd9, %rd36, %rd37;
add.s32 %r90, %r84, 1;
setp.lt.u32 %p2, %r90, %r69;
setp.ne.s32 %p3, %r71, 0;
and.pred %p4, %p3, %p2;
selp.b32 %r3, %r90, %r69, %p4;
setp.eq.s32 %p5, %r3, 0;
mov.f32 %f321, 0fFF7FFFFF;
@%p5 bra $L__BB1_17;
setp.eq.s32 %p6, %r70, 0;
@%p6 bra $L__BB1_12;
add.s32 %r4, %r70, -1;
and.b32 %r5, %r70, 3;
sub.s32 %r6, %r70, %r5;
cvt.u64.u32 %rd10, %r2;
mov.f32 %f321, 0fFF7FFFFF;
mov.u32 %r91, 0;
mov.u32 %r125, %r91;
$L__BB1_4:
mul.lo.s32 %r93, %r125, %r70;
cvt.u64.u32 %rd38, %r93;
add.s64 %rd11, %rd38, %rd10;
setp.lt.u32 %p7, %r4, 3;
mov.f32 %f316, 0f00000000;
mov.u32 %r128, %r91;
@%p7 bra $L__BB1_7;
mov.f32 %f316, 0f00000000;
mov.u32 %r128, 0;
mov.u32 %r127, %r6;
$L__BB1_6:
cvt.u64.u32 %rd39, %r128;
add.s64 %rd40, %rd9, %rd39;
shl.b64 %rd41, %rd40, 2;
add.s64 %rd42, %rd4, %rd41;
add.s64 %rd43, %rd11, %rd39;
shl.b64 %rd44, %rd43, 2;
add.s64 %rd45, %rd6, %rd44;
ld.global.nc.f32 %f71, [%rd45];
ld.global.nc.f32 %f72, [%rd42];
fma.rn.f32 %f73, %f72, %f71, %f316;
ld.global.nc.f32 %f74, [%rd45+4];
ld.global.nc.f32 %f75, [%rd42+4];
fma.rn.f32 %f76, %f75, %f74, %f73;
ld.global.nc.f32 %f77, [%rd45+8];
ld.global.nc.f32 %f78, [%rd42+8];
fma.rn.f32 %f79, %f78, %f77, %f76;
ld.global.nc.f32 %f80, [%rd45+12];
ld.global.nc.f32 %f81, [%rd42+12];
fma.rn.f32 %f316, %f81, %f80, %f79;
add.s32 %r128, %r128, 4;
add.s32 %r127, %r127, -4;
setp.ne.s32 %p8, %r127, 0;
@%p8 bra $L__BB1_6;
$L__BB1_7:
setp.eq.s32 %p9, %r5, 0;
@%p9 bra $L__BB1_11;
setp.eq.s32 %p10, %r5, 1;
cvt.u64.u32 %rd46, %r128;
add.s64 %rd47, %rd9, %rd46;
shl.b64 %rd48, %rd47, 2;
add.s64 %rd12, %rd4, %rd48;
add.s64 %rd49, %rd11, %rd46;
shl.b64 %rd50, %rd49, 2;
add.s64 %rd13, %rd6, %rd50;
ld.global.nc.f32 %f82, [%rd13];
ld.global.nc.f32 %f83, [%rd12];
fma.rn.f32 %f316, %f83, %f82, %f316;
@%p10 bra $L__BB1_11;
setp.eq.s32 %p11, %r5, 2;
ld.global.nc.f32 %f84, [%rd13+4];
ld.global.nc.f32 %f85, [%rd12+4];
fma.rn.f32 %f316, %f85, %f84, %f316;
@%p11 bra $L__BB1_11;
ld.global.nc.f32 %f86, [%rd13+8];
ld.global.nc.f32 %f87, [%rd12+8];
fma.rn.f32 %f316, %f87, %f86, %f316;
$L__BB1_11:
mul.f32 %f88, %f316, %f65;
setp.gt.f32 %p12, %f88, %f321;
selp.f32 %f321, %f88, %f321, %p12;
add.s32 %r125, %r125, 1;
setp.lt.u32 %p13, %r125, %r3;
@%p13 bra $L__BB1_4;
bra.uni $L__BB1_17;
$L__BB1_12:
mul.f32 %f11, %f65, 0f00000000;
and.b32 %r130, %r3, 3;
add.s32 %r95, %r3, -1;
setp.lt.u32 %p14, %r95, 3;
mov.f32 %f321, 0fFF7FFFFF;
@%p14 bra $L__BB1_15;
sub.s32 %r129, %r3, %r130;
mov.f32 %f321, 0fFF7FFFFF;
$L__BB1_14:
setp.gt.f32 %p15, %f11, %f321;
selp.f32 %f321, %f11, %f321, %p15;
add.s32 %r129, %r129, -4;
setp.ne.s32 %p16, %r129, 0;
@%p16 bra $L__BB1_14;
$L__BB1_15:
setp.eq.s32 %p17, %r130, 0;
@%p17 bra $L__BB1_17;
$L__BB1_16:
.pragma "nounroll";
setp.gt.f32 %p18, %f11, %f321;
selp.f32 %f321, %f11, %f321, %p18;
add.s32 %r130, %r130, -1;
setp.ne.s32 %p19, %r130, 0;
@%p19 bra $L__BB1_16;
$L__BB1_17:
mov.f32 %f332, 0f00000000;
mov.f32 %f331, %f332;
@%p5 bra $L__BB1_34;
setp.eq.s32 %p21, %r70, 0;
@%p21 bra $L__BB1_28;
add.s32 %r20, %r70, -1;
and.b32 %r21, %r70, 3;
sub.s32 %r22, %r70, %r21;
cvt.u64.u32 %rd14, %r2;
mov.f32 %f93, 0f00000000;
mov.u32 %r96, 0;
mov.u32 %r131, %r96;
mov.f32 %f331, %f93;
$L__BB1_20:
mul.lo.s32 %r98, %r131, %r70;
cvt.u64.u32 %rd51, %r98;
add.s64 %rd15, %rd51, %rd14;
setp.lt.u32 %p22, %r20, 3;
mov.u32 %r134, %r96;
mov.f32 %f326, %f93;
@%p22 bra $L__BB1_23;
mov.f32 %f326, 0f00000000;
mov.u32 %r134, 0;
mov.u32 %r133, %r22;
$L__BB1_22:
cvt.u64.u32 %rd52, %r134;
add.s64 %rd53, %rd9, %rd52;
shl.b64 %rd54, %rd53, 2;
add.s64 %rd55, %rd4, %rd54;
add.s64 %rd56, %rd15, %rd52;
shl.b64 %rd57, %rd56, 2;
add.s64 %rd58, %rd6, %rd57;
ld.global.nc.f32 %f97, [%rd58];
ld.global.nc.f32 %f98, [%rd55];
fma.rn.f32 %f99, %f98, %f97, %f326;
ld.global.nc.f32 %f100, [%rd58+4];
ld.global.nc.f32 %f101, [%rd55+4];
fma.rn.f32 %f102, %f101, %f100, %f99;
ld.global.nc.f32 %f103, [%rd58+8];
ld.global.nc.f32 %f104, [%rd55+8];
fma.rn.f32 %f105, %f104, %f103, %f102;
ld.global.nc.f32 %f106, [%rd58+12];
ld.global.nc.f32 %f107, [%rd55+12];
fma.rn.f32 %f326, %f107, %f106, %f105;
add.s32 %r134, %r134, 4;
add.s32 %r133, %r133, -4;
setp.ne.s32 %p23, %r133, 0;
@%p23 bra $L__BB1_22;
$L__BB1_23:
setp.eq.s32 %p24, %r21, 0;
@%p24 bra $L__BB1_27;
setp.eq.s32 %p25, %r21, 1;
cvt.u64.u32 %rd59, %r134;
add.s64 %rd60, %rd9, %rd59;
shl.b64 %rd61, %rd60, 2;
add.s64 %rd16, %rd4, %rd61;
add.s64 %rd62, %rd15, %rd59;
shl.b64 %rd63, %rd62, 2;
add.s64 %rd17, %rd6, %rd63;
ld.global.nc.f32 %f108, [%rd17];
ld.global.nc.f32 %f109, [%rd16];
fma.rn.f32 %f326, %f109, %f108, %f326;
@%p25 bra $L__BB1_27;
setp.eq.s32 %p26, %r21, 2;
ld.global.nc.f32 %f110, [%rd17+4];
ld.global.nc.f32 %f111, [%rd16+4];
fma.rn.f32 %f326, %f111, %f110, %f326;
@%p26 bra $L__BB1_27;
ld.global.nc.f32 %f112, [%rd17+8];
ld.global.nc.f32 %f113, [%rd16+8];
fma.rn.f32 %f326, %f113, %f112, %f326;
$L__BB1_27:
mul.f32 %f114, %f326, %f65;
sub.f32 %f115, %f114, %f321;
mov.f32 %f116, 0f3F000000;
mov.f32 %f117, 0f3BBB989D;
fma.rn.f32 %f118, %f115, %f117, %f116;
cvt.sat.f32.f32 %f119, %f118;
mov.f32 %f120, 0f4B400001;
mov.f32 %f121, 0f437C0000;
fma.rm.f32 %f122, %f119, %f121, %f120;
add.f32 %f123, %f122, 0fCB40007F;
neg.f32 %f124, %f123;
mov.f32 %f125, 0f3FB8AA3B;
fma.rn.f32 %f126, %f115, %f125, %f124;
mov.f32 %f127, 0f32A57060;
fma.rn.f32 %f128, %f115, %f127, %f126;
mov.b32 %r100, %f122;
shl.b32 %r101, %r100, 23;
mov.b32 %f129, %r101;
ex2.approx.ftz.f32 %f130, %f128;
fma.rn.f32 %f331, %f130, %f129, %f331;
add.s32 %r131, %r131, 1;
setp.lt.u32 %p27, %r131, %r3;
@%p27 bra $L__BB1_20;
bra.uni $L__BB1_34;
$L__BB1_28:
add.s32 %r102, %r3, -1;
and.b32 %r136, %r3, 3;
setp.lt.u32 %p28, %r102, 3;
mov.f32 %f331, 0f00000000;
@%p28 bra $L__BB1_31;
sub.s32 %r135, %r3, %r136;
mul.f32 %f134, %f65, 0f00000000;
mov.f32 %f331, 0f00000000;
sub.f32 %f135, %f134, %f321;
mov.f32 %f136, 0f3F000000;
mov.f32 %f137, 0f3BBB989D;
fma.rn.f32 %f138, %f135, %f137, %f136;
cvt.sat.f32.f32 %f139, %f138;
mov.f32 %f140, 0f4B400001;
mov.f32 %f141, 0f437C0000;
fma.rm.f32 %f142, %f139, %f141, %f140;
add.f32 %f143, %f142, 0fCB40007F;
neg.f32 %f144, %f143;
mov.f32 %f145, 0f3FB8AA3B;
fma.rn.f32 %f146, %f135, %f145, %f144;
mov.f32 %f147, 0f32A57060;
fma.rn.f32 %f148, %f135, %f147, %f146;
mov.b32 %r103, %f142;
shl.b32 %r104, %r103, 23;
mov.b32 %f149, %r104;
ex2.approx.ftz.f32 %f150, %f148;
mul.f32 %f29, %f150, %f149;
$L__BB1_30:
add.f32 %f151, %f331, %f29;
add.f32 %f152, %f151, %f29;
add.f32 %f153, %f152, %f29;
add.f32 %f331, %f153, %f29;
add.s32 %r135, %r135, -4;
setp.ne.s32 %p29, %r135, 0;
@%p29 bra $L__BB1_30;
$L__BB1_31:
setp.eq.s32 %p30, %r136, 0;
@%p30 bra $L__BB1_34;
mul.f32 %f154, %f65, 0f00000000;
sub.f32 %f155, %f154, %f321;
mov.f32 %f156, 0f3F000000;
mov.f32 %f157, 0f3BBB989D;
fma.rn.f32 %f158, %f155, %f157, %f156;
cvt.sat.f32.f32 %f159, %f158;
mov.f32 %f160, 0f4B400001;
mov.f32 %f161, 0f437C0000;
fma.rm.f32 %f162, %f159, %f161, %f160;
add.f32 %f163, %f162, 0fCB40007F;
neg.f32 %f164, %f163;
mov.f32 %f165, 0f3FB8AA3B;
fma.rn.f32 %f166, %f155, %f165, %f164;
mov.f32 %f167, 0f32A57060;
fma.rn.f32 %f168, %f155, %f167, %f166;
mov.b32 %r105, %f162;
shl.b32 %r106, %r105, 23;
mov.b32 %f169, %r106;
ex2.approx.ftz.f32 %f170, %f168;
mul.f32 %f34, %f170, %f169;
$L__BB1_33:
.pragma "nounroll";
add.f32 %f331, %f331, %f34;
add.s32 %r136, %r136, -1;
setp.ne.s32 %p31, %r136, 0;
@%p31 bra $L__BB1_33;
$L__BB1_34:
setp.leu.f32 %p32, %f331, 0f00000000;
@%p32 bra $L__BB1_36;
rcp.rn.f32 %f332, %f331;
$L__BB1_36:
setp.eq.s32 %p33, %r70, 0;
mov.f32 %f337, 0f00000000;
@%p33 bra $L__BB1_42;
add.s32 %r108, %r70, -1;
and.b32 %r141, %r70, 3;
setp.lt.u32 %p34, %r108, 3;
mov.f32 %f337, 0f00000000;
mov.u32 %r139, 0;
@%p34 bra $L__BB1_40;
sub.s32 %r138, %r70, %r141;
mov.f32 %f337, 0f00000000;
mov.u32 %r139, 0;
$L__BB1_39:
cvt.u64.u32 %rd64, %r139;
add.s64 %rd65, %rd9, %rd64;
shl.b64 %rd66, %rd65, 2;
add.s64 %rd67, %rd8, %rd66;
add.s64 %rd68, %rd1, %rd66;
ld.global.nc.f32 %f176, [%rd68];
ld.global.nc.f32 %f177, [%rd67];
fma.rn.f32 %f178, %f177, %f176, %f337;
ld.global.nc.f32 %f179, [%rd68+4];
ld.global.nc.f32 %f180, [%rd67+4];
fma.rn.f32 %f181, %f180, %f179, %f178;
ld.global.nc.f32 %f182, [%rd68+8];
ld.global.nc.f32 %f183, [%rd67+8];
fma.rn.f32 %f184, %f183, %f182, %f181;
ld.global.nc.f32 %f185, [%rd68+12];
ld.global.nc.f32 %f186, [%rd67+12];
fma.rn.f32 %f337, %f186, %f185, %f184;
add.s32 %r139, %r139, 4;
add.s32 %r138, %r138, -4;
setp.ne.s32 %p35, %r138, 0;
@%p35 bra $L__BB1_39;
$L__BB1_40:
setp.eq.s32 %p36, %r141, 0;
@%p36 bra $L__BB1_42;
$L__BB1_41:
.pragma "nounroll";
cvt.u64.u32 %rd69, %r139;
add.s64 %rd70, %rd9, %rd69;
shl.b64 %rd71, %rd70, 2;
add.s64 %rd72, %rd8, %rd71;
add.s64 %rd73, %rd1, %rd71;
ld.global.nc.f32 %f187, [%rd73];
ld.global.nc.f32 %f188, [%rd72];
fma.rn.f32 %f337, %f188, %f187, %f337;
add.s32 %r139, %r139, 1;
add.s32 %r141, %r141, -1;
setp.ne.s32 %p37, %r141, 0;
@%p37 bra $L__BB1_41;
$L__BB1_42:
@%p5 bra $L__BB1_67;
@%p33 bra $L__BB1_67;
add.s32 %r47, %r70, -1;
and.b32 %r48, %r70, 3;
sub.s32 %r49, %r70, %r48;
cvt.u64.u32 %rd18, %r2;
mov.u32 %r110, 0;
mov.u32 %r142, %r110;
$L__BB1_45:
mul.lo.s32 %r112, %r142, %r70;
cvt.u64.u32 %rd74, %r112;
add.s64 %rd19, %rd74, %rd18;
setp.lt.u32 %p40, %r47, 3;
mov.f32 %f341, 0f00000000;
mov.u32 %r145, %r110;
@%p40 bra $L__BB1_48;
mov.f32 %f341, 0f00000000;
mov.u32 %r145, 0;
mov.u32 %r144, %r49;
$L__BB1_47:
cvt.u64.u32 %rd75, %r145;
add.s64 %rd76, %rd9, %rd75;
shl.b64 %rd77, %rd76, 2;
add.s64 %rd78, %rd4, %rd77;
add.s64 %rd79, %rd19, %rd75;
shl.b64 %rd80, %rd79, 2;
add.s64 %rd81, %rd6, %rd80;
ld.global.nc.f32 %f192, [%rd81];
ld.global.nc.f32 %f193, [%rd78];
fma.rn.f32 %f194, %f193, %f192, %f341;
ld.global.nc.f32 %f195, [%rd81+4];
ld.global.nc.f32 %f196, [%rd78+4];
fma.rn.f32 %f197, %f196, %f195, %f194;
ld.global.nc.f32 %f198, [%rd81+8];
ld.global.nc.f32 %f199, [%rd78+8];
fma.rn.f32 %f200, %f199, %f198, %f197;
ld.global.nc.f32 %f201, [%rd81+12];
ld.global.nc.f32 %f202, [%rd78+12];
fma.rn.f32 %f341, %f202, %f201, %f200;
add.s32 %r145, %r145, 4;
add.s32 %r144, %r144, -4;
setp.ne.s32 %p41, %r144, 0;
@%p41 bra $L__BB1_47;
$L__BB1_48:
setp.eq.s32 %p42, %r48, 0;
@%p42 bra $L__BB1_52;
setp.eq.s32 %p43, %r48, 1;
cvt.u64.u32 %rd82, %r145;
add.s64 %rd83, %rd9, %rd82;
shl.b64 %rd84, %rd83, 2;
add.s64 %rd20, %rd4, %rd84;
add.s64 %rd85, %rd19, %rd82;
shl.b64 %rd86, %rd85, 2;
add.s64 %rd21, %rd6, %rd86;
ld.global.nc.f32 %f203, [%rd21];
ld.global.nc.f32 %f204, [%rd20];
fma.rn.f32 %f341, %f204, %f203, %f341;
@%p43 bra $L__BB1_52;
setp.eq.s32 %p44, %r48, 2;
ld.global.nc.f32 %f205, [%rd21+4];
ld.global.nc.f32 %f206, [%rd20+4];
fma.rn.f32 %f341, %f206, %f205, %f341;
@%p44 bra $L__BB1_52;
ld.global.nc.f32 %f207, [%rd21+8];
ld.global.nc.f32 %f208, [%rd20+8];
fma.rn.f32 %f341, %f208, %f207, %f341;
$L__BB1_52:
mul.f32 %f211, %f341, %f65;
sub.f32 %f212, %f211, %f321;
mov.f32 %f213, 0f3F000000;
mov.f32 %f214, 0f3BBB989D;
fma.rn.f32 %f215, %f212, %f214, %f213;
cvt.sat.f32.f32 %f216, %f215;
mov.f32 %f217, 0f4B400001;
mov.f32 %f218, 0f437C0000;
fma.rm.f32 %f219, %f216, %f218, %f217;
add.f32 %f220, %f219, 0fCB40007F;
neg.f32 %f221, %f220;
mov.f32 %f222, 0f3FB8AA3B;
fma.rn.f32 %f223, %f212, %f222, %f221;
mov.f32 %f224, 0f32A57060;
fma.rn.f32 %f225, %f212, %f224, %f223;
mov.b32 %r115, %f219;
shl.b32 %r116, %r115, 23;
mov.b32 %f226, %r116;
ex2.approx.ftz.f32 %f227, %f225;
mul.f32 %f228, %f227, %f226;
mul.f32 %f55, %f332, %f228;
mov.f32 %f345, 0f00000000;
mov.u32 %r148, 0;
@%p40 bra $L__BB1_55;
mov.f32 %f345, 0f00000000;
mov.u32 %r148, 0;
mov.u32 %r147, %r49;
$L__BB1_54:
cvt.u64.u32 %rd87, %r148;
add.s64 %rd88, %rd9, %rd87;
shl.b64 %rd89, %rd88, 2;
add.s64 %rd90, %rd8, %rd89;
add.s64 %rd91, %rd19, %rd87;
shl.b64 %rd92, %rd91, 2;
add.s64 %rd93, %rd2, %rd92;
ld.global.nc.f32 %f230, [%rd93];
ld.global.nc.f32 %f231, [%rd90];
fma.rn.f32 %f232, %f231, %f230, %f345;
ld.global.nc.f32 %f233, [%rd93+4];
ld.global.nc.f32 %f234, [%rd90+4];
fma.rn.f32 %f235, %f234, %f233, %f232;
ld.global.nc.f32 %f236, [%rd93+8];
ld.global.nc.f32 %f237, [%rd90+8];
fma.rn.f32 %f238, %f237, %f236, %f235;
ld.global.nc.f32 %f239, [%rd93+12];
ld.global.nc.f32 %f240, [%rd90+12];
fma.rn.f32 %f345, %f240, %f239, %f238;
add.s32 %r148, %r148, 4;
add.s32 %r147, %r147, -4;
setp.ne.s32 %p46, %r147, 0;
@%p46 bra $L__BB1_54;
$L__BB1_55:
@%p42 bra $L__BB1_59;
setp.eq.s32 %p48, %r48, 1;
cvt.u64.u32 %rd94, %r148;
add.s64 %rd95, %rd9, %rd94;
shl.b64 %rd96, %rd95, 2;
add.s64 %rd22, %rd8, %rd96;
add.s64 %rd97, %rd19, %rd94;
shl.b64 %rd98, %rd97, 2;
add.s64 %rd23, %rd2, %rd98;
ld.global.nc.f32 %f241, [%rd23];
ld.global.nc.f32 %f242, [%rd22];
fma.rn.f32 %f345, %f242, %f241, %f345;
@%p48 bra $L__BB1_59;
setp.eq.s32 %p49, %r48, 2;
ld.global.nc.f32 %f243, [%rd23+4];
ld.global.nc.f32 %f244, [%rd22+4];
fma.rn.f32 %f345, %f244, %f243, %f345;
@%p49 bra $L__BB1_59;
ld.global.nc.f32 %f245, [%rd23+8];
ld.global.nc.f32 %f246, [%rd22+8];
fma.rn.f32 %f345, %f246, %f245, %f345;
$L__BB1_59:
sub.f32 %f247, %f345, %f337;
mul.f32 %f248, %f55, %f247;
mul.f32 %f64, %f248, %f65;
mov.u32 %r151, 0;
@%p40 bra $L__BB1_62;
mov.u32 %r151, 0;
mov.u32 %r150, %r49;
$L__BB1_61:
cvt.u64.u32 %rd99, %r151;
add.s64 %rd100, %rd19, %rd99;
shl.b64 %rd101, %rd100, 2;
add.s64 %rd102, %rd7, %rd101;
add.s64 %rd103, %rd9, %rd99;
shl.b64 %rd104, %rd103, 2;
add.s64 %rd105, %rd8, %rd104;
ld.global.nc.f32 %f249, [%rd105];
mul.f32 %f250, %f55, %f249;
atom.global.add.f32 %f251, [%rd102], %f250;
add.s64 %rd106, %rd6, %rd101;
ld.global.nc.f32 %f252, [%rd106];
add.s64 %rd107, %rd5, %rd104;
ld.global.f32 %f253, [%rd107];
fma.rn.f32 %f254, %f64, %f252, %f253;
st.global.f32 [%rd107], %f254;
add.s64 %rd108, %rd3, %rd101;
add.s64 %rd109, %rd4, %rd104;
ld.global.nc.f32 %f255, [%rd109];
mul.f32 %f256, %f64, %f255;
atom.global.add.f32 %f257, [%rd108], %f256;
add.s32 %r120, %r151, 1;
cvt.u64.u32 %rd110, %r120;
add.s64 %rd111, %rd19, %rd110;
shl.b64 %rd112, %rd111, 2;
add.s64 %rd113, %rd7, %rd112;
ld.global.nc.f32 %f258, [%rd105+4];
mul.f32 %f259, %f55, %f258;
atom.global.add.f32 %f260, [%rd113], %f259;
ld.global.nc.f32 %f261, [%rd106+4];
ld.global.f32 %f262, [%rd107+4];
fma.rn.f32 %f263, %f64, %f261, %f262;
st.global.f32 [%rd107+4], %f263;
add.s64 %rd114, %rd3, %rd112;
ld.global.nc.f32 %f264, [%rd109+4];
mul.f32 %f265, %f64, %f264;
atom.global.add.f32 %f266, [%rd114], %f265;
add.s32 %r121, %r151, 2;
cvt.u64.u32 %rd115, %r121;
add.s64 %rd116, %rd19, %rd115;
shl.b64 %rd117, %rd116, 2;
add.s64 %rd118, %rd7, %rd117;
ld.global.nc.f32 %f267, [%rd105+8];
mul.f32 %f268, %f55, %f267;
atom.global.add.f32 %f269, [%rd118], %f268;
ld.global.nc.f32 %f270, [%rd106+8];
ld.global.f32 %f271, [%rd107+8];
fma.rn.f32 %f272, %f64, %f270, %f271;
st.global.f32 [%rd107+8], %f272;
add.s64 %rd119, %rd3, %rd117;
ld.global.nc.f32 %f273, [%rd109+8];
mul.f32 %f274, %f64, %f273;
atom.global.add.f32 %f275, [%rd119], %f274;
add.s32 %r122, %r151, 3;
cvt.u64.u32 %rd120, %r122;
add.s64 %rd121, %rd19, %rd120;
shl.b64 %rd122, %rd121, 2;
add.s64 %rd123, %rd7, %rd122;
ld.global.nc.f32 %f276, [%rd105+12];
mul.f32 %f277, %f55, %f276;
atom.global.add.f32 %f278, [%rd123], %f277;
ld.global.nc.f32 %f279, [%rd106+12];
ld.global.f32 %f280, [%rd107+12];
fma.rn.f32 %f281, %f64, %f279, %f280;
st.global.f32 [%rd107+12], %f281;
add.s64 %rd124, %rd3, %rd122;
ld.global.nc.f32 %f282, [%rd109+12];
mul.f32 %f283, %f64, %f282;
atom.global.add.f32 %f284, [%rd124], %f283;
add.s32 %r151, %r151, 4;
add.s32 %r150, %r150, -4;
setp.ne.s32 %p51, %r150, 0;
@%p51 bra $L__BB1_61;
$L__BB1_62:
@%p42 bra $L__BB1_66;
setp.eq.s32 %p53, %r48, 1;
cvt.u64.u32 %rd125, %r151;
add.s64 %rd126, %rd19, %rd125;
shl.b64 %rd127, %rd126, 2;
add.s64 %rd128, %rd7, %rd127;
add.s64 %rd129, %rd9, %rd125;
shl.b64 %rd130, %rd129, 2;
add.s64 %rd24, %rd8, %rd130;
ld.global.nc.f32 %f285, [%rd24];
mul.f32 %f286, %f55, %f285;
atom.global.add.f32 %f287, [%rd128], %f286;
add.s64 %rd25, %rd6, %rd127;
ld.global.nc.f32 %f288, [%rd25];
add.s64 %rd26, %rd5, %rd130;
ld.global.f32 %f289, [%rd26];
fma.rn.f32 %f290, %f64, %f288, %f289;
st.global.f32 [%rd26], %f290;
add.s64 %rd131, %rd3, %rd127;
add.s64 %rd27, %rd4, %rd130;
ld.global.nc.f32 %f291, [%rd27];
mul.f32 %f292, %f64, %f291;
atom.global.add.f32 %f293, [%rd131], %f292;
@%p53 bra $L__BB1_66;
setp.eq.s32 %p54, %r48, 2;
add.s32 %r123, %r151, 1;
cvt.u64.u32 %rd132, %r123;
add.s64 %rd133, %rd19, %rd132;
shl.b64 %rd134, %rd133, 2;
add.s64 %rd135, %rd7, %rd134;
ld.global.nc.f32 %f294, [%rd24+4];
mul.f32 %f295, %f55, %f294;
atom.global.add.f32 %f296, [%rd135], %f295;
ld.global.nc.f32 %f297, [%rd25+4];
ld.global.f32 %f298, [%rd26+4];
fma.rn.f32 %f299, %f64, %f297, %f298;
st.global.f32 [%rd26+4], %f299;
add.s64 %rd136, %rd3, %rd134;
ld.global.nc.f32 %f300, [%rd27+4];
mul.f32 %f301, %f64, %f300;
atom.global.add.f32 %f302, [%rd136], %f301;
@%p54 bra $L__BB1_66;
add.s32 %r124, %r151, 2;
cvt.u64.u32 %rd137, %r124;
add.s64 %rd138, %rd19, %rd137;
shl.b64 %rd139, %rd138, 2;
add.s64 %rd140, %rd7, %rd139;
ld.global.nc.f32 %f303, [%rd24+8];
mul.f32 %f304, %f55, %f303;
atom.global.add.f32 %f305, [%rd140], %f304;
ld.global.nc.f32 %f306, [%rd25+8];
ld.global.f32 %f307, [%rd26+8];
fma.rn.f32 %f308, %f64, %f306, %f307;
st.global.f32 [%rd26+8], %f308;
add.s64 %rd141, %rd3, %rd139;
ld.global.nc.f32 %f309, [%rd27+8];
mul.f32 %f310, %f64, %f309;
atom.global.add.f32 %f311, [%rd141], %f310;
$L__BB1_66:
add.s32 %r142, %r142, 1;
setp.lt.u32 %p55, %r142, %r3;
@%p55 bra $L__BB1_45;
$L__BB1_67:
ret;
}