//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35059454
// Cuda compilation tools, release 12.6, V12.6.85
// Based on NVVM 7.0.1
//
.version 8.5
.target sm_75
.address_size 64
// .globl maxpool2d_fwd_f32
// maxpool2d_fwd_f32
// One thread per output element. Each thread scans a 2-D window of the f32
// input, keeps the largest value seen, and writes that value plus the flat
// input index at which it was found.
//
// Parameters:
//   param_0  address of the f32 input buffer (read with ld.global.nc)
//   param_1  address of the f32 output buffer, written at [thread id]
//   param_2  address of the u32 index buffer, written at [thread id]
//   param_3  address of a table of u32 words p[0..7] (byte offsets 0..28)
//   param_4, param_5, param_6
//            extents used to split the thread id as
//            ((n * param_4 + c) * param_5 + oh) * param_6 + ow
//            (presumably channels, output height, output width -- confirm
//            against the launching code)
//   param_7  number of active threads; ids >= param_7 return immediately
//
// Table words as used below:
//   p[0]  exclusive upper bound of the row coordinate; also the per-plane row count
//   p[1]  exclusive upper bound of the column coordinate; also the row pitch
//   p[2]  trip count of the outer (row) loop
//   p[3]  trip count of the inner (column) loop
//   p[4], p[6]  first row  = oh * p[4] - p[6]
//   p[5], p[7]  first col  = ow * p[5] - p[7]
//   (presumably in_h, in_w, kernel_h, kernel_w, stride_h, stride_w, pad_h,
//   pad_w -- the names are an assumption, the arithmetic is not)
//
// NOTE(review): the running maximum is seeded with 0fFF7FFFFD, a finite value
// two ULPs above -FLT_MAX (0fFF7FFFFF), and candidates are accepted only with
// setp.gt. Inputs that are <= the seed (including -inf) or NaN are therefore
// never selected; in that case the seed and index -1 are stored.
.visible .entry maxpool2d_fwd_f32(
.param .u64 maxpool2d_fwd_f32_param_0,
.param .u64 maxpool2d_fwd_f32_param_1,
.param .u64 maxpool2d_fwd_f32_param_2,
.param .u64 maxpool2d_fwd_f32_param_3,
.param .u32 maxpool2d_fwd_f32_param_4,
.param .u32 maxpool2d_fwd_f32_param_5,
.param .u32 maxpool2d_fwd_f32_param_6,
.param .u32 maxpool2d_fwd_f32_param_7
)
{
.reg .pred %p<41>;
.reg .f32 %f<44>;
.reg .b32 %r<100>;
.reg .b64 %rd<27>;
// Fetch the kernel arguments.
ld.param.u64 %rd6, [maxpool2d_fwd_f32_param_0];
ld.param.u64 %rd4, [maxpool2d_fwd_f32_param_1];
ld.param.u64 %rd5, [maxpool2d_fwd_f32_param_2];
ld.param.u64 %rd7, [maxpool2d_fwd_f32_param_3];
ld.param.u32 %r47, [maxpool2d_fwd_f32_param_4];
ld.param.u32 %r48, [maxpool2d_fwd_f32_param_5];
ld.param.u32 %r49, [maxpool2d_fwd_f32_param_6];
ld.param.u32 %r50, [maxpool2d_fwd_f32_param_7];
// %rd1 = input base, %rd2 = table base, both as global-space addresses.
cvta.to.global.u64 %rd1, %rd6;
cvta.to.global.u64 %rd2, %rd7;
// %r1 = ctaid.x * ntid.x + tid.x : flat thread id.
mov.u32 %r51, %ntid.x;
mov.u32 %r52, %ctaid.x;
mov.u32 %r53, %tid.x;
mad.lo.s32 %r1, %r52, %r51, %r53;
// Threads at or beyond param_7 have nothing to do.
setp.ge.u32 %p1, %r1, %r50;
// %f34 = running maximum, seeded with the finite sentinel described above.
mov.f32 %f34, 0fFF7FFFFD;
@%p1 bra $L__BB0_27;
// %r2 = p[0], %r3 = p[1], %r4 = p[3], %r5 = p[2].
ld.global.nc.u32 %r2, [%rd2];
ld.global.nc.u32 %r3, [%rd2+4];
ld.global.nc.u32 %r4, [%rd2+12];
ld.global.nc.u32 %r5, [%rd2+8];
// A zero outer trip count means an empty window: store the seed and index -1.
setp.eq.s32 %p2, %r5, 0;
// %r89 = index of the running maximum, -1 while nothing has been selected.
mov.u32 %r89, -1;
@%p2 bra $L__BB0_26;
// %r6 = first column of the window = (tid % param_6) * p[5] - p[7].
ld.global.nc.u32 %r56, [%rd2+20];
rem.u32 %r57, %r1, %r49;
mul.lo.s32 %r58, %r57, %r56;
ld.global.nc.u32 %r59, [%rd2+28];
sub.s32 %r6, %r58, %r59;
// A zero inner trip count is an empty window as well.
setp.eq.s32 %p3, %r4, 0;
@%p3 bra $L__BB0_26;
// %r62 = param_6 * param_5 : number of output elements per plane.
mul.lo.s32 %r62, %r49, %r48;
ld.global.nc.u32 %r63, [%rd2+16];
ld.global.nc.u32 %r64, [%rd2+24];
// %r7 = p[3] - 1 is compared against 3 later to decide whether the 4x
// unrolled loop runs at all.
add.s32 %r7, %r4, -1;
mov.u32 %r89, -1;
// %r8 = p[3] % 4 (leftover columns), %r9 = p[3] rounded down to a multiple of 4.
and.b32 %r8, %r4, 3;
sub.s32 %r9, %r4, %r8;
// %r69 = n * param_4 + c, rebuilt from the thread id; %r10 = %r69 * p[0] is
// the row number at which this plane starts in the input.
div.u32 %r65, %r1, %r62;
rem.u32 %r66, %r65, %r47;
mul.lo.s32 %r67, %r62, %r47;
div.u32 %r68, %r1, %r67;
mad.lo.s32 %r69, %r68, %r47, %r66;
mul.lo.s32 %r10, %r69, %r2;
// %r11 = first row of the window = ((tid / param_6) % param_5) * p[4] - p[6].
div.u32 %r70, %r1, %r49;
rem.u32 %r71, %r70, %r48;
mul.lo.s32 %r72, %r71, %r63;
sub.s32 %r11, %r72, %r64;
mov.f32 %f34, 0fFF7FFFFD;
// %r84 = outer loop counter (window row), 0 .. p[2]-1.
mov.u32 %r84, 0;
$L__BB0_4:
// %r14 = absolute row; rows outside [0, p[0]) are skipped entirely.
add.s32 %r14, %r11, %r84;
setp.lt.s32 %p4, %r14, 0;
setp.ge.s32 %p5, %r14, %r2;
or.pred %p6, %p4, %p5;
@%p6 bra $L__BB0_25;
// Fewer than 4 columns in the window: go straight to the remainder code.
setp.lt.u32 %p7, %r7, 3;
// %r15 = (plane row base + row) * p[1] : flat index of column 0 in this row.
add.s32 %r75, %r14, %r10;
mul.lo.s32 %r15, %r75, %r3;
// %r94 = inner loop counter (window column).
mov.u32 %r94, 0;
@%p7 bra $L__BB0_16;
mov.u32 %r94, 0;
// %r88 counts down the columns still to be handled by the unrolled loop.
mov.u32 %r88, %r9;
// Inner loop, unrolled by 4. Every tap does the same thing: check that the
// column lies in [0, p[1]), load input[flat index] (index taken as unsigned
// 32-bit, scaled to a 64-bit byte offset), and if the value is strictly
// greater than the running maximum replace both the maximum and its index.
$L__BB0_7:
// Tap 0: column %r19 = %r6 + %r94.
add.s32 %r19, %r6, %r94;
setp.lt.s32 %p8, %r19, 0;
setp.ge.s32 %p9, %r19, %r3;
or.pred %p10, %p8, %p9;
@%p10 bra $L__BB0_9;
add.s32 %r77, %r19, %r15;
mul.wide.u32 %rd8, %r77, 4;
add.s64 %rd9, %rd1, %rd8;
ld.global.nc.f32 %f24, [%rd9];
setp.gt.f32 %p11, %f24, %f34;
selp.f32 %f34, %f24, %f34, %p11;
selp.b32 %r89, %r77, %r89, %p11;
$L__BB0_9:
// Tap 1: column %r19 + 1.
add.s32 %r22, %r19, 1;
setp.lt.s32 %p12, %r22, 0;
setp.ge.s32 %p13, %r22, %r3;
or.pred %p14, %p12, %p13;
@%p14 bra $L__BB0_11;
add.s32 %r78, %r22, %r15;
mul.wide.u32 %rd10, %r78, 4;
add.s64 %rd11, %rd1, %rd10;
ld.global.nc.f32 %f25, [%rd11];
setp.gt.f32 %p15, %f25, %f34;
selp.f32 %f34, %f25, %f34, %p15;
selp.b32 %r89, %r78, %r89, %p15;
$L__BB0_11:
// Tap 2: column %r19 + 2.
add.s32 %r25, %r19, 2;
setp.lt.s32 %p16, %r25, 0;
setp.ge.s32 %p17, %r25, %r3;
or.pred %p18, %p16, %p17;
@%p18 bra $L__BB0_13;
add.s32 %r79, %r25, %r15;
mul.wide.u32 %rd12, %r79, 4;
add.s64 %rd13, %rd1, %rd12;
ld.global.nc.f32 %f26, [%rd13];
setp.gt.f32 %p19, %f26, %f34;
selp.f32 %f34, %f26, %f34, %p19;
selp.b32 %r89, %r79, %r89, %p19;
$L__BB0_13:
// Tap 3: column %r19 + 3.
add.s32 %r28, %r19, 3;
setp.lt.s32 %p20, %r28, 0;
setp.ge.s32 %p21, %r28, %r3;
or.pred %p22, %p20, %p21;
@%p22 bra $L__BB0_15;
add.s32 %r80, %r28, %r15;
mul.wide.u32 %rd14, %r80, 4;
add.s64 %rd15, %rd1, %rd14;
ld.global.nc.f32 %f27, [%rd15];
setp.gt.f32 %p23, %f27, %f34;
selp.f32 %f34, %f27, %f34, %p23;
selp.b32 %r89, %r80, %r89, %p23;
$L__BB0_15:
// Advance four columns; repeat until the multiple-of-4 part is consumed.
add.s32 %r94, %r94, 4;
add.s32 %r88, %r88, -4;
setp.ne.s32 %p24, %r88, 0;
@%p24 bra $L__BB0_7;
$L__BB0_16:
// Remainder: handle the last p[3] % 4 columns (0 to 3 taps), same per-tap logic.
setp.eq.s32 %p25, %r8, 0;
@%p25 bra $L__BB0_25;
// Remainder tap 0.
add.s32 %r36, %r6, %r94;
setp.lt.s32 %p26, %r36, 0;
setp.ge.s32 %p27, %r36, %r3;
or.pred %p28, %p26, %p27;
@%p28 bra $L__BB0_19;
add.s32 %r81, %r36, %r15;
mul.wide.u32 %rd16, %r81, 4;
add.s64 %rd17, %rd1, %rd16;
ld.global.nc.f32 %f28, [%rd17];
setp.gt.f32 %p29, %f28, %f34;
selp.f32 %f34, %f28, %f34, %p29;
selp.b32 %r89, %r81, %r89, %p29;
$L__BB0_19:
setp.eq.s32 %p30, %r8, 1;
@%p30 bra $L__BB0_25;
// Remainder tap 1.
add.s32 %r39, %r36, 1;
setp.lt.s32 %p31, %r39, 0;
setp.ge.s32 %p32, %r39, %r3;
or.pred %p33, %p31, %p32;
@%p33 bra $L__BB0_22;
add.s32 %r82, %r39, %r15;
mul.wide.u32 %rd18, %r82, 4;
add.s64 %rd19, %rd1, %rd18;
ld.global.nc.f32 %f29, [%rd19];
setp.gt.f32 %p34, %f29, %f34;
selp.f32 %f34, %f29, %f34, %p34;
selp.b32 %r89, %r82, %r89, %p34;
$L__BB0_22:
setp.eq.s32 %p35, %r8, 2;
@%p35 bra $L__BB0_25;
// Remainder tap 2.
add.s32 %r42, %r36, 2;
setp.lt.s32 %p36, %r42, 0;
setp.ge.s32 %p37, %r42, %r3;
or.pred %p38, %p36, %p37;
@%p38 bra $L__BB0_25;
add.s32 %r83, %r42, %r15;
mul.wide.u32 %rd20, %r83, 4;
add.s64 %rd21, %rd1, %rd20;
ld.global.nc.f32 %f30, [%rd21];
setp.gt.f32 %p39, %f30, %f34;
selp.f32 %f34, %f30, %f34, %p39;
selp.b32 %r89, %r83, %r89, %p39;
$L__BB0_25:
// Next window row; loop while the counter is below p[2].
add.s32 %r84, %r84, 1;
setp.lt.u32 %p40, %r84, %r5;
@%p40 bra $L__BB0_4;
$L__BB0_26:
// Store the maximum to param_1[tid] and its flat input index to param_2[tid].
cvta.to.global.u64 %rd22, %rd4;
mul.wide.u32 %rd23, %r1, 4;
add.s64 %rd24, %rd22, %rd23;
st.global.f32 [%rd24], %f34;
cvta.to.global.u64 %rd25, %rd5;
add.s64 %rd26, %rd25, %rd23;
st.global.u32 [%rd26], %r89;
$L__BB0_27:
ret;
}
// .globl maxpool2d_bwd_f32
// maxpool2d_bwd_f32
// One thread per element i < param_3. The thread reads a signed 32-bit index
// from param_1[i]; if the index is non-negative it atomically adds the f32
// value param_0[i] to param_2[index]. Negative indices are skipped.
//
// Parameters:
//   param_0  address of the f32 source values (read with ld.global.nc)
//   param_1  address of the 32-bit target indices (read with ld.global.nc)
//   param_2  address of the f32 accumulation buffer (atom.global.add.f32)
//   param_3  number of elements to process
//   (presumably upstream gradient, argmax indices recorded by the forward
//   kernel, and input gradient -- confirm against the launching code)
.visible .entry maxpool2d_bwd_f32(
    .param .u64 maxpool2d_bwd_f32_param_0,
    .param .u64 maxpool2d_bwd_f32_param_1,
    .param .u64 maxpool2d_bwd_f32_param_2,
    .param .u32 maxpool2d_bwd_f32_param_3
)
{
    .reg .pred  %p_in_range, %p_no_target;
    .reg .f32   %f_val, %f_prev;
    .reg .b32   %r_blk, %r_dim, %r_thr, %r_gid, %r_count, %r_target;
    .reg .b64   %rd_src_gen, %rd_idx_gen, %rd_dst_gen;
    .reg .b64   %rd_src, %rd_idx, %rd_dst;
    .reg .b64   %rd_elem_off, %rd_target_off;
    .reg .b64   %rd_src_addr, %rd_idx_addr, %rd_dst_addr;

    // Flat thread id = ctaid.x * ntid.x + tid.x.
    mov.u32             %r_blk, %ctaid.x;
    mov.u32             %r_dim, %ntid.x;
    mov.u32             %r_thr, %tid.x;
    mad.lo.s32          %r_gid, %r_blk, %r_dim, %r_thr;

    // Only threads below the element count take part.
    ld.param.u32        %r_count, [maxpool2d_bwd_f32_param_3];
    setp.lt.u32         %p_in_range, %r_gid, %r_count;
    @!%p_in_range bra   $L__maxpool_bwd_exit;

    // Byte offset of element i; shared by the index read and the value read.
    mul.wide.u32        %rd_elem_off, %r_gid, 4;

    // Read the target index; a negative value means there is nothing to add.
    ld.param.u64        %rd_idx_gen, [maxpool2d_bwd_f32_param_1];
    cvta.to.global.u64  %rd_idx, %rd_idx_gen;
    add.s64             %rd_idx_addr, %rd_idx, %rd_elem_off;
    ld.global.nc.u32    %r_target, [%rd_idx_addr];
    setp.lt.s32         %p_no_target, %r_target, 0;
    @%p_no_target bra   $L__maxpool_bwd_exit;

    // Read the value this thread contributes.
    ld.param.u64        %rd_src_gen, [maxpool2d_bwd_f32_param_0];
    cvta.to.global.u64  %rd_src, %rd_src_gen;
    add.s64             %rd_src_addr, %rd_src, %rd_elem_off;
    ld.global.nc.f32    %f_val, [%rd_src_addr];

    // Atomic accumulate: several threads may name the same target index.
    // The index is sign-extended before scaling, matching its signed test above.
    ld.param.u64        %rd_dst_gen, [maxpool2d_bwd_f32_param_2];
    cvta.to.global.u64  %rd_dst, %rd_dst_gen;
    mul.wide.s32        %rd_target_off, %r_target, 4;
    add.s64             %rd_dst_addr, %rd_dst, %rd_target_off;
    atom.global.add.f32 %f_prev, [%rd_dst_addr], %f_val;

$L__maxpool_bwd_exit:
    ret;
}
// .globl avgpool2d_fwd_f32
// avgpool2d_fwd_f32
// One thread per output element. Each thread sums the in-bounds f32 input
// values of a 2-D window, counts how many were in bounds, and stores
// sum / divisor to the output.
//
// Parameters:
//   param_0  address of the f32 input buffer (read with ld.global.nc)
//   param_1  address of the f32 output buffer, written at [thread id]
//   param_2  address of a table of u32 words p[0..8] (byte offsets 0..32)
//   param_3, param_4, param_5
//            extents used to split the thread id as
//            ((n * param_3 + c) * param_4 + oh) * param_5 + ow
//            (presumably channels, output height, output width -- confirm)
//   param_6  number of active threads; ids >= param_6 return immediately
//
// Table words as used below:
//   p[0]  exclusive upper bound of the row coordinate; also the per-plane row count
//   p[1]  exclusive upper bound of the column coordinate; also the row pitch
//   p[2]  trip count of the outer (row) loop
//   p[3]  trip count of the inner (column) loop
//   p[4], p[6]  first row = oh * p[4] - p[6]
//   p[5], p[7]  first col = ow * p[5] - p[7]
//   p[8]  divisor selector: 0 -> divide by the in-bounds count (1.0 if the
//         count is 0); non-zero -> divide by p[3] * p[2]
//   (presumably in_h, in_w, kernel_h, kernel_w, stride_h, stride_w, pad_h,
//   pad_w, count_include_pad -- names are an assumption)
//
// NOTE(review): with p[8] != 0 and p[2] * p[3] == 0 the stored result is
// 0.0 / 0.0.
.visible .entry avgpool2d_fwd_f32(
.param .u64 avgpool2d_fwd_f32_param_0,
.param .u64 avgpool2d_fwd_f32_param_1,
.param .u64 avgpool2d_fwd_f32_param_2,
.param .u32 avgpool2d_fwd_f32_param_3,
.param .u32 avgpool2d_fwd_f32_param_4,
.param .u32 avgpool2d_fwd_f32_param_5,
.param .u32 avgpool2d_fwd_f32_param_6
)
{
.reg .pred %p<36>;
.reg .f32 %f<49>;
.reg .b32 %r<102>;
.reg .b64 %rd<24>;
// Fetch the kernel arguments.
ld.param.u64 %rd5, [avgpool2d_fwd_f32_param_0];
ld.param.u64 %rd4, [avgpool2d_fwd_f32_param_1];
ld.param.u64 %rd6, [avgpool2d_fwd_f32_param_2];
ld.param.u32 %r48, [avgpool2d_fwd_f32_param_3];
ld.param.u32 %r49, [avgpool2d_fwd_f32_param_4];
ld.param.u32 %r50, [avgpool2d_fwd_f32_param_5];
ld.param.u32 %r51, [avgpool2d_fwd_f32_param_6];
// %rd1 = input base, %rd2 = table base, both as global-space addresses.
cvta.to.global.u64 %rd1, %rd5;
cvta.to.global.u64 %rd2, %rd6;
// %r1 = ctaid.x * ntid.x + tid.x : flat thread id.
mov.u32 %r52, %ntid.x;
mov.u32 %r53, %ctaid.x;
mov.u32 %r54, %tid.x;
mad.lo.s32 %r1, %r53, %r52, %r54;
// Threads at or beyond param_6 have nothing to do.
setp.ge.u32 %p1, %r1, %r51;
// %f39 = running sum.
mov.f32 %f39, 0f00000000;
@%p1 bra $L__BB2_27;
// %r2 = p[0], %r3 = p[1], %r4 = p[3], %r5 = p[8], %r6 = p[2].
ld.global.nc.u32 %r2, [%rd2];
ld.global.nc.u32 %r3, [%rd2+4];
ld.global.nc.u32 %r4, [%rd2+12];
ld.global.nc.u32 %r5, [%rd2+32];
ld.global.nc.u32 %r6, [%rd2+8];
// A zero outer trip count means an empty window: skip straight to the divide.
setp.eq.s32 %p2, %r6, 0;
// %r91 = number of in-bounds window elements accumulated so far.
mov.u32 %r91, 0;
@%p2 bra $L__BB2_26;
// %r7 = first column of the window = (tid % param_5) * p[5] - p[7].
ld.global.nc.u32 %r57, [%rd2+20];
rem.u32 %r58, %r1, %r50;
mul.lo.s32 %r59, %r58, %r57;
ld.global.nc.u32 %r60, [%rd2+28];
sub.s32 %r7, %r59, %r60;
// A zero inner trip count is an empty window as well.
setp.eq.s32 %p3, %r4, 0;
@%p3 bra $L__BB2_26;
// %r63 = param_5 * param_4 : number of output elements per plane.
mul.lo.s32 %r63, %r50, %r49;
ld.global.nc.u32 %r64, [%rd2+16];
ld.global.nc.u32 %r65, [%rd2+24];
// %r8 = p[3] - 1 (tested against 3 to decide whether the unrolled loop runs),
// %r9 = p[3] % 4 (leftover columns), %r10 = p[3] rounded down to a multiple of 4.
add.s32 %r8, %r4, -1;
and.b32 %r9, %r4, 3;
sub.s32 %r10, %r4, %r9;
// %r70 = n * param_3 + c, rebuilt from the thread id; %r11 = %r70 * p[0] is
// the row number at which this plane starts in the input.
div.u32 %r66, %r1, %r63;
rem.u32 %r67, %r66, %r48;
mul.lo.s32 %r68, %r63, %r48;
div.u32 %r69, %r1, %r68;
mad.lo.s32 %r70, %r69, %r48, %r67;
mul.lo.s32 %r11, %r70, %r2;
// %r12 = first row of the window = ((tid / param_5) % param_4) * p[4] - p[6].
div.u32 %r71, %r1, %r50;
rem.u32 %r72, %r71, %r49;
mul.lo.s32 %r73, %r72, %r64;
sub.s32 %r12, %r73, %r65;
mov.f32 %f39, 0f00000000;
// %r86 = outer loop counter (window row), 0 .. p[2]-1.
mov.u32 %r86, 0;
mov.u32 %r91, %r86;
$L__BB2_4:
// %r15 = absolute row; rows outside [0, p[0]) contribute nothing.
add.s32 %r15, %r12, %r86;
setp.lt.s32 %p4, %r15, 0;
setp.ge.s32 %p5, %r15, %r2;
or.pred %p6, %p4, %p5;
@%p6 bra $L__BB2_25;
// Fewer than 4 columns in the window: go straight to the remainder code.
setp.lt.u32 %p7, %r8, 3;
// %r16 = (plane row base + row) * p[1] : flat index of column 0 in this row.
add.s32 %r76, %r15, %r11;
mul.lo.s32 %r16, %r76, %r3;
// %r96 = inner loop counter (window column).
mov.u32 %r96, 0;
@%p7 bra $L__BB2_16;
mov.u32 %r96, 0;
// %r90 counts down the columns still to be handled by the unrolled loop.
mov.u32 %r90, %r10;
// Inner loop, unrolled by 4. Every tap: check that the column lies in
// [0, p[1]), load input[flat index], add it to the sum, bump the count.
$L__BB2_7:
// Tap 0: column %r20 = %r7 + %r96.
add.s32 %r20, %r7, %r96;
setp.lt.s32 %p8, %r20, 0;
setp.ge.s32 %p9, %r20, %r3;
or.pred %p10, %p8, %p9;
@%p10 bra $L__BB2_9;
add.s32 %r78, %r20, %r16;
mul.wide.u32 %rd7, %r78, 4;
add.s64 %rd8, %rd1, %rd7;
ld.global.nc.f32 %f24, [%rd8];
add.f32 %f39, %f39, %f24;
add.s32 %r91, %r91, 1;
$L__BB2_9:
// Tap 1: column %r20 + 1.
add.s32 %r23, %r20, 1;
setp.lt.s32 %p11, %r23, 0;
setp.ge.s32 %p12, %r23, %r3;
or.pred %p13, %p11, %p12;
@%p13 bra $L__BB2_11;
add.s32 %r79, %r23, %r16;
mul.wide.u32 %rd9, %r79, 4;
add.s64 %rd10, %rd1, %rd9;
ld.global.nc.f32 %f25, [%rd10];
add.f32 %f39, %f39, %f25;
add.s32 %r91, %r91, 1;
$L__BB2_11:
// Tap 2: column %r20 + 2.
add.s32 %r26, %r20, 2;
setp.lt.s32 %p14, %r26, 0;
setp.ge.s32 %p15, %r26, %r3;
or.pred %p16, %p14, %p15;
@%p16 bra $L__BB2_13;
add.s32 %r80, %r26, %r16;
mul.wide.u32 %rd11, %r80, 4;
add.s64 %rd12, %rd1, %rd11;
ld.global.nc.f32 %f26, [%rd12];
add.f32 %f39, %f39, %f26;
add.s32 %r91, %r91, 1;
$L__BB2_13:
// Tap 3: column %r20 + 3.
add.s32 %r29, %r20, 3;
setp.lt.s32 %p17, %r29, 0;
setp.ge.s32 %p18, %r29, %r3;
or.pred %p19, %p17, %p18;
@%p19 bra $L__BB2_15;
add.s32 %r81, %r29, %r16;
mul.wide.u32 %rd13, %r81, 4;
add.s64 %rd14, %rd1, %rd13;
ld.global.nc.f32 %f27, [%rd14];
add.f32 %f39, %f39, %f27;
add.s32 %r91, %r91, 1;
$L__BB2_15:
// Advance four columns; repeat until the multiple-of-4 part is consumed.
add.s32 %r96, %r96, 4;
add.s32 %r90, %r90, -4;
setp.ne.s32 %p20, %r90, 0;
@%p20 bra $L__BB2_7;
$L__BB2_16:
// Remainder: handle the last p[3] % 4 columns (0 to 3 taps), same per-tap logic.
setp.eq.s32 %p21, %r9, 0;
@%p21 bra $L__BB2_25;
// Remainder tap 0.
add.s32 %r37, %r7, %r96;
setp.lt.s32 %p22, %r37, 0;
setp.ge.s32 %p23, %r37, %r3;
or.pred %p24, %p22, %p23;
@%p24 bra $L__BB2_19;
add.s32 %r82, %r37, %r16;
mul.wide.u32 %rd15, %r82, 4;
add.s64 %rd16, %rd1, %rd15;
ld.global.nc.f32 %f28, [%rd16];
add.f32 %f39, %f39, %f28;
add.s32 %r91, %r91, 1;
$L__BB2_19:
setp.eq.s32 %p25, %r9, 1;
@%p25 bra $L__BB2_25;
// Remainder tap 1.
add.s32 %r40, %r37, 1;
setp.lt.s32 %p26, %r40, 0;
setp.ge.s32 %p27, %r40, %r3;
or.pred %p28, %p26, %p27;
@%p28 bra $L__BB2_22;
add.s32 %r83, %r40, %r16;
mul.wide.u32 %rd17, %r83, 4;
add.s64 %rd18, %rd1, %rd17;
ld.global.nc.f32 %f29, [%rd18];
add.f32 %f39, %f39, %f29;
add.s32 %r91, %r91, 1;
$L__BB2_22:
setp.eq.s32 %p29, %r9, 2;
@%p29 bra $L__BB2_25;
// Remainder tap 2.
add.s32 %r43, %r37, 2;
setp.lt.s32 %p30, %r43, 0;
setp.ge.s32 %p31, %r43, %r3;
or.pred %p32, %p30, %p31;
@%p32 bra $L__BB2_25;
add.s32 %r84, %r43, %r16;
mul.wide.u32 %rd19, %r84, 4;
add.s64 %rd20, %rd1, %rd19;
ld.global.nc.f32 %f30, [%rd20];
add.f32 %f39, %f39, %f30;
add.s32 %r91, %r91, 1;
$L__BB2_25:
// Next window row; loop while the counter is below p[2].
add.s32 %r86, %r86, 1;
setp.lt.u32 %p33, %r86, %r6;
@%p33 bra $L__BB2_4;
$L__BB2_26:
// Choose the divisor:
//   %f31 = float(p[3] * p[2])            full window size
//   %f33 = count == 0 ? 1.0 : float(count)
//   %f34 = (p[8] == 0) ? %f33 : %f31
mul.lo.s32 %r85, %r4, %r6;
cvt.rn.f32.u32 %f31, %r85;
cvt.rn.f32.u32 %f32, %r91;
setp.eq.s32 %p34, %r91, 0;
selp.f32 %f33, 0f3F800000, %f32, %p34;
setp.eq.s32 %p35, %r5, 0;
selp.f32 %f34, %f33, %f31, %p35;
// output[tid] = sum / divisor (round-to-nearest division).
div.rn.f32 %f35, %f39, %f34;
cvta.to.global.u64 %rd21, %rd4;
mul.wide.u32 %rd22, %r1, 4;
add.s64 %rd23, %rd21, %rd22;
st.global.f32 [%rd23], %f35;
$L__BB2_27:
ret;
}
// .globl avgpool2d_bwd_f32
// avgpool2d_bwd_f32
// One thread per element i < param_6. Works in two passes over the same 2-D
// window:
//   pass 1 counts how many window positions fall inside the bounds
//          (no memory access);
//   pass 2 atomically adds param_0[i] / divisor to every in-bounds position
//          of the param_1 buffer.
//
// Parameters:
//   param_0  address of the f32 source values, read at [thread id]
//   param_1  address of the f32 accumulation buffer (atom.global.add.f32)
//   param_2  address of a table of u32 words p[0..8] (byte offsets 0..32)
//   param_3, param_4, param_5
//            extents used to split the thread id as
//            ((n * param_3 + c) * param_4 + oh) * param_5 + ow
//            (presumably channels, output height, output width -- confirm)
//   param_6  number of active threads; ids >= param_6 return immediately
//
// Table words as used below:
//   p[0]  exclusive upper bound of the row coordinate; also the per-plane row count
//   p[1]  exclusive upper bound of the column coordinate; also the row pitch
//   p[2]  trip count of the outer (row) loop
//   p[3]  trip count of the inner (column) loop
//   p[4], p[6]  first row = oh * p[4] - p[6]
//   p[5], p[7]  first col = ow * p[5] - p[7]
//   p[8]  divisor selector: 0 -> in-bounds count (1.0 if the count is 0);
//         non-zero -> p[3] * p[2]
//   (presumably in_h, in_w, kernel_h, kernel_w, stride_h, stride_w, pad_h,
//   pad_w, count_include_pad -- names are an assumption)
.visible .entry avgpool2d_bwd_f32(
.param .u64 avgpool2d_bwd_f32_param_0,
.param .u64 avgpool2d_bwd_f32_param_1,
.param .u64 avgpool2d_bwd_f32_param_2,
.param .u32 avgpool2d_bwd_f32_param_3,
.param .u32 avgpool2d_bwd_f32_param_4,
.param .u32 avgpool2d_bwd_f32_param_5,
.param .u32 avgpool2d_bwd_f32_param_6
)
{
.reg .pred %p<68>;
.reg .f32 %f<14>;
.reg .b32 %r<127>;
.reg .b64 %rd<23>;
// Fetch the kernel arguments.
ld.param.u64 %rd2, [avgpool2d_bwd_f32_param_0];
ld.param.u64 %rd4, [avgpool2d_bwd_f32_param_1];
ld.param.u64 %rd3, [avgpool2d_bwd_f32_param_2];
ld.param.u32 %r59, [avgpool2d_bwd_f32_param_3];
ld.param.u32 %r60, [avgpool2d_bwd_f32_param_4];
ld.param.u32 %r61, [avgpool2d_bwd_f32_param_5];
ld.param.u32 %r62, [avgpool2d_bwd_f32_param_6];
// %rd1 = accumulation buffer base as a global-space address.
cvta.to.global.u64 %rd1, %rd4;
// %r1 = ctaid.x * ntid.x + tid.x : flat thread id.
mov.u32 %r63, %ntid.x;
mov.u32 %r64, %ctaid.x;
mov.u32 %r65, %tid.x;
mad.lo.s32 %r1, %r64, %r63, %r65;
// Threads at or beyond param_6 have nothing to do.
setp.ge.u32 %p1, %r1, %r62;
@%p1 bra $L__BB3_38;
// Load the whole table: %r2 = p[0], %r3 = p[1], %r4 = p[3], %r5 = p[4],
// %r6 = p[5], %r7 = p[6], %r8 = p[7], %r9 = p[8], %r12 = p[2].
cvta.to.global.u64 %rd5, %rd3;
ld.global.nc.u32 %r2, [%rd5];
ld.global.nc.u32 %r3, [%rd5+4];
ld.global.nc.u32 %r4, [%rd5+12];
ld.global.nc.u32 %r5, [%rd5+16];
ld.global.nc.u32 %r6, [%rd5+20];
ld.global.nc.u32 %r7, [%rd5+24];
ld.global.nc.u32 %r8, [%rd5+28];
ld.global.nc.u32 %r9, [%rd5+32];
// %r10 = tid % param_5 (ow), computed as tid - (tid / param_5) * param_5;
// %r11 = (tid / param_5) % param_4 (oh).
div.u32 %r67, %r1, %r61;
mul.lo.s32 %r68, %r67, %r61;
sub.s32 %r10, %r1, %r68;
rem.u32 %r11, %r67, %r60;
ld.global.nc.u32 %r12, [%rd5+8];
// %p2 (p[2] == 0) is reused after the divisor computation to exit early.
setp.eq.s32 %p2, %r12, 0;
// %r121 = number of in-bounds window positions (pass 1 result).
mov.u32 %r121, 0;
@%p2 bra $L__BB3_13;
// %r13 = first column of the window = ow * p[5] - p[7].
mul.lo.s32 %r70, %r10, %r6;
sub.s32 %r13, %r70, %r8;
setp.eq.s32 %p3, %r4, 0;
@%p3 bra $L__BB3_13;
// %r14 = p[3] - 1, %r15 = p[3] % 4, %r16 = p[3] rounded down to a multiple of 4.
add.s32 %r14, %r4, -1;
and.b32 %r15, %r4, 3;
sub.s32 %r16, %r4, %r15;
// %r17 = first row of the window = oh * p[4] - p[6].
mul.lo.s32 %r73, %r11, %r5;
sub.s32 %r17, %r73, %r7;
// ---- Pass 1: count in-bounds positions. %r113 = window row counter. ----
mov.u32 %r113, 0;
mov.u32 %r121, %r113;
$L__BB3_4:
// Rows outside [0, p[0]) contribute no positions.
add.s32 %r74, %r17, %r113;
setp.lt.s32 %p4, %r74, 0;
setp.ge.s32 %p5, %r74, %r2;
or.pred %p6, %p4, %p5;
@%p6 bra $L__BB3_12;
// Fewer than 4 columns in the window: go straight to the remainder code.
setp.lt.u32 %p7, %r14, 3;
mov.u32 %r119, 0;
@%p7 bra $L__BB3_8;
mov.u32 %r119, 0;
mov.u32 %r117, %r16;
// Column loop unrolled by 4, branch-free: each tap adds 1 to the count when
// its column lies in [0, p[1]) and 0 otherwise.
$L__BB3_7:
add.s32 %r78, %r13, %r119;
setp.gt.s32 %p8, %r78, -1;
setp.lt.s32 %p9, %r78, %r3;
and.pred %p10, %p9, %p8;
selp.u32 %r79, 1, 0, %p10;
add.s32 %r80, %r121, %r79;
add.s32 %r81, %r78, 1;
setp.gt.s32 %p11, %r81, -1;
setp.lt.s32 %p12, %r81, %r3;
and.pred %p13, %p12, %p11;
selp.u32 %r82, 1, 0, %p13;
add.s32 %r83, %r80, %r82;
add.s32 %r84, %r78, 2;
setp.gt.s32 %p14, %r84, -1;
setp.lt.s32 %p15, %r84, %r3;
and.pred %p16, %p15, %p14;
selp.u32 %r85, 1, 0, %p16;
add.s32 %r86, %r83, %r85;
add.s32 %r87, %r78, 3;
setp.gt.s32 %p17, %r87, -1;
setp.lt.s32 %p18, %r87, %r3;
and.pred %p19, %p18, %p17;
selp.u32 %r88, 1, 0, %p19;
add.s32 %r121, %r86, %r88;
add.s32 %r119, %r119, 4;
add.s32 %r117, %r117, -4;
setp.ne.s32 %p20, %r117, 0;
@%p20 bra $L__BB3_7;
$L__BB3_8:
// Remainder: the last p[3] % 4 columns (0 to 3 taps).
setp.eq.s32 %p21, %r15, 0;
@%p21 bra $L__BB3_12;
setp.eq.s32 %p22, %r15, 1;
add.s32 %r29, %r13, %r119;
setp.gt.s32 %p23, %r29, -1;
setp.lt.s32 %p24, %r29, %r3;
and.pred %p25, %p24, %p23;
selp.u32 %r89, 1, 0, %p25;
add.s32 %r121, %r121, %r89;
@%p22 bra $L__BB3_12;
setp.eq.s32 %p26, %r15, 2;
add.s32 %r90, %r29, 1;
setp.gt.s32 %p27, %r90, -1;
setp.lt.s32 %p28, %r90, %r3;
and.pred %p29, %p28, %p27;
selp.u32 %r91, 1, 0, %p29;
add.s32 %r121, %r121, %r91;
@%p26 bra $L__BB3_12;
add.s32 %r92, %r29, 2;
setp.gt.s32 %p30, %r92, -1;
setp.lt.s32 %p31, %r92, %r3;
and.pred %p32, %p31, %p30;
selp.u32 %r93, 1, 0, %p32;
add.s32 %r121, %r121, %r93;
$L__BB3_12:
// Next window row; loop while the counter is below p[2].
add.s32 %r113, %r113, 1;
setp.lt.u32 %p33, %r113, %r12;
@%p33 bra $L__BB3_4;
$L__BB3_13:
// Choose the divisor:
//   %f2 = float(p[3] * p[2])             full window size
//   %f4 = count == 0 ? 1.0 : float(count)
//   %f5 = (p[8] == 0) ? %f4 : %f2
mul.lo.s32 %r94, %r4, %r12;
cvt.rn.f32.u32 %f2, %r94;
cvt.rn.f32.u32 %f3, %r121;
setp.eq.s32 %p34, %r121, 0;
selp.f32 %f4, 0f3F800000, %f3, %p34;
setp.eq.s32 %p35, %r9, 0;
selp.f32 %f5, %f4, %f2, %p35;
// %f1 = source[tid] / divisor : the value added to every in-bounds position.
cvta.to.global.u64 %rd6, %rd2;
mul.wide.u32 %rd7, %r1, 4;
add.s64 %rd8, %rd6, %rd7;
ld.global.nc.f32 %f6, [%rd8];
div.rn.f32 %f1, %f6, %f5;
// Empty window (p[2] == 0): nothing to scatter.
@%p2 bra $L__BB3_38;
// Recompute the window origin (%r36 = first row, %r37 = first column) and
// %r38 = n * param_3 + c from the thread id.
mul.lo.s32 %r95, %r61, %r60;
mul.lo.s32 %r96, %r11, %r5;
sub.s32 %r36, %r96, %r7;
mul.lo.s32 %r97, %r10, %r6;
sub.s32 %r37, %r97, %r8;
setp.eq.s32 %p37, %r4, 0;
div.u32 %r98, %r1, %r95;
rem.u32 %r99, %r98, %r59;
mul.lo.s32 %r100, %r95, %r59;
div.u32 %r101, %r1, %r100;
mad.lo.s32 %r38, %r101, %r59, %r99;
// Empty window (p[3] == 0): nothing to scatter.
@%p37 bra $L__BB3_38;
add.s32 %r39, %r4, -1;
and.b32 %r40, %r4, 3;
sub.s32 %r41, %r4, %r40;
// %r42 = row number at which this plane starts in the accumulation buffer.
mul.lo.s32 %r42, %r38, %r2;
// ---- Pass 2: scatter. %r123 = window row counter. ----
mov.u32 %r123, 0;
$L__BB3_16:
// Rows outside [0, p[0]) receive nothing.
add.s32 %r44, %r36, %r123;
setp.lt.s32 %p38, %r44, 0;
setp.ge.s32 %p39, %r44, %r2;
or.pred %p40, %p38, %p39;
@%p40 bra $L__BB3_37;
// Fewer than 4 columns in the window: go straight to the remainder code.
setp.lt.u32 %p41, %r39, 3;
// %r45 = (plane row base + row) * p[1] : flat index of column 0 in this row.
add.s32 %r104, %r44, %r42;
mul.lo.s32 %r45, %r104, %r3;
mov.u32 %r126, 0;
@%p41 bra $L__BB3_28;
mov.u32 %r126, 0;
mov.u32 %r125, %r41;
// Column loop unrolled by 4. Every tap: if the column lies in [0, p[1]),
// atomically add %f1 at the flat index (taken as unsigned 32-bit and scaled
// to a 64-bit byte offset). The value returned by the atomic is unused.
$L__BB3_19:
// Tap 0: column %r48 = %r37 + %r126.
add.s32 %r48, %r37, %r126;
setp.lt.s32 %p42, %r48, 0;
setp.ge.s32 %p43, %r48, %r3;
or.pred %p44, %p42, %p43;
@%p44 bra $L__BB3_21;
add.s32 %r106, %r48, %r45;
mul.wide.u32 %rd9, %r106, 4;
add.s64 %rd10, %rd1, %rd9;
atom.global.add.f32 %f7, [%rd10], %f1;
$L__BB3_21:
// Tap 1: column %r48 + 1.
add.s32 %r49, %r48, 1;
setp.lt.s32 %p45, %r49, 0;
setp.ge.s32 %p46, %r49, %r3;
or.pred %p47, %p45, %p46;
@%p47 bra $L__BB3_23;
add.s32 %r107, %r49, %r45;
mul.wide.u32 %rd11, %r107, 4;
add.s64 %rd12, %rd1, %rd11;
atom.global.add.f32 %f8, [%rd12], %f1;
$L__BB3_23:
// Tap 2: column %r48 + 2.
add.s32 %r50, %r48, 2;
setp.lt.s32 %p48, %r50, 0;
setp.ge.s32 %p49, %r50, %r3;
or.pred %p50, %p48, %p49;
@%p50 bra $L__BB3_25;
add.s32 %r108, %r50, %r45;
mul.wide.u32 %rd13, %r108, 4;
add.s64 %rd14, %rd1, %rd13;
atom.global.add.f32 %f9, [%rd14], %f1;
$L__BB3_25:
// Tap 3: column %r48 + 3.
add.s32 %r51, %r48, 3;
setp.lt.s32 %p51, %r51, 0;
setp.ge.s32 %p52, %r51, %r3;
or.pred %p53, %p51, %p52;
@%p53 bra $L__BB3_27;
add.s32 %r109, %r51, %r45;
mul.wide.u32 %rd15, %r109, 4;
add.s64 %rd16, %rd1, %rd15;
atom.global.add.f32 %f10, [%rd16], %f1;
$L__BB3_27:
// Advance four columns; repeat until the multiple-of-4 part is consumed.
add.s32 %r126, %r126, 4;
add.s32 %r125, %r125, -4;
setp.ne.s32 %p54, %r125, 0;
@%p54 bra $L__BB3_19;
$L__BB3_28:
// Remainder: the last p[3] % 4 columns (0 to 3 taps), same per-tap logic.
setp.eq.s32 %p55, %r40, 0;
@%p55 bra $L__BB3_37;
// Remainder tap 0.
add.s32 %r55, %r37, %r126;
setp.lt.s32 %p56, %r55, 0;
setp.ge.s32 %p57, %r55, %r3;
or.pred %p58, %p56, %p57;
@%p58 bra $L__BB3_31;
add.s32 %r110, %r55, %r45;
mul.wide.u32 %rd17, %r110, 4;
add.s64 %rd18, %rd1, %rd17;
atom.global.add.f32 %f11, [%rd18], %f1;
$L__BB3_31:
setp.eq.s32 %p59, %r40, 1;
@%p59 bra $L__BB3_37;
// Remainder tap 1.
add.s32 %r56, %r55, 1;
setp.lt.s32 %p60, %r56, 0;
setp.ge.s32 %p61, %r56, %r3;
or.pred %p62, %p60, %p61;
@%p62 bra $L__BB3_34;
add.s32 %r111, %r56, %r45;
mul.wide.u32 %rd19, %r111, 4;
add.s64 %rd20, %rd1, %rd19;
atom.global.add.f32 %f12, [%rd20], %f1;
$L__BB3_34:
setp.eq.s32 %p63, %r40, 2;
@%p63 bra $L__BB3_37;
// Remainder tap 2.
add.s32 %r57, %r55, 2;
setp.lt.s32 %p64, %r57, 0;
setp.ge.s32 %p65, %r57, %r3;
or.pred %p66, %p64, %p65;
@%p66 bra $L__BB3_37;
add.s32 %r112, %r57, %r45;
mul.wide.u32 %rd21, %r112, 4;
add.s64 %rd22, %rd1, %rd21;
atom.global.add.f32 %f13, [%rd22], %f1;
$L__BB3_37:
// Next window row; loop while the counter is below p[2].
add.s32 %r123, %r123, 1;
setp.lt.u32 %p67, %r123, %r12;
@%p67 bra $L__BB3_16;
$L__BB3_38:
ret;
}