//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35059454
// Cuda compilation tools, release 12.6, V12.6.85
// Based on NVVM 7.0.1
//
.version 8.5
.target sm_75
.address_size 64
// .globl lstm_gates_f32
//
// lstm_gates_f32 -- element-wise LSTM-style cell update, one thread per element.
//
// Parameter roles below are inferred from how the code uses each argument
// (SOURCE carries no parameter names -- confirm against the host-side launcher):
//   param_0 (u64) : f32 gate pre-activations, read at four offsets per element
//   param_1 (u64) : f32 array read at the flat index i (used like c_prev)
//   param_2 (u64) : f32 output, written with  s(g3) * tanh(c_new)   (used like h)
//   param_3 (u64) : f32 output, written with  c_new = s(g1)*param_1[i] + s(g0)*tanh(g2)
//   param_4 (u32) : H, the divisor that splits i into (row, column)
//   param_5 (u32) : element count; threads with i >= param_5 return immediately
// where s(x) = 1 / (1 + exp(-x)).
//
// Indexing: i = ctaid.x*ntid.x + tid.x,  b = i / H,  j = i - b*H,
//           gate k (k = 0..3) is read from param_0[(4*b + k)*H + j].
//
// NOTE(review): all index arithmetic is 32-bit and wraps silently; H == 0 is
// not guarded before div.u32.
.visible .entry lstm_gates_f32(
.param .u64 lstm_gates_f32_param_0,
.param .u64 lstm_gates_f32_param_1,
.param .u64 lstm_gates_f32_param_2,
.param .u64 lstm_gates_f32_param_3,
.param .u32 lstm_gates_f32_param_4,
.param .u32 lstm_gates_f32_param_5
)
{
.reg .pred %p<6>;
.reg .f32 %f<103>;
.reg .b32 %r<33>;
.reg .b64 %rd<23>;
ld.param.u64 %rd2, [lstm_gates_f32_param_0];
ld.param.u64 %rd3, [lstm_gates_f32_param_1];
ld.param.u64 %rd4, [lstm_gates_f32_param_2];
ld.param.u64 %rd5, [lstm_gates_f32_param_3];
ld.param.u32 %r2, [lstm_gates_f32_param_4];
ld.param.u32 %r3, [lstm_gates_f32_param_5];
// %r1 = i = ctaid.x * ntid.x + tid.x; unsigned bounds check against param_5.
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %ctaid.x;
mov.u32 %r6, %tid.x;
mad.lo.s32 %r1, %r5, %r4, %r6;
setp.ge.u32 %p1, %r1, %r3;
@%p1 bra $L__BB0_8;
cvta.to.global.u64 %rd6, %rd2;
// %r7 = b = i / H, %r10 = j = i - b*H, %r11 = (4*b)*H + j = index of gate 0.
div.u32 %r7, %r1, %r2;
shl.b32 %r8, %r7, 2;
mul.lo.s32 %r9, %r7, %r2;
sub.s32 %r10, %r1, %r9;
mad.lo.s32 %r11, %r8, %r2, %r10;
// %rd8 / %rd10 / %rd12 / %rd14 = addresses of gates 0..3 (each H elements apart).
mul.wide.u32 %rd7, %r11, 4;
add.s64 %rd8, %rd6, %rd7;
add.s32 %r12, %r11, %r2;
mul.wide.u32 %rd9, %r12, 4;
add.s64 %rd10, %rd6, %rd9;
add.s32 %r13, %r12, %r2;
mul.wide.u32 %rd11, %r13, 4;
add.s64 %rd12, %rd6, %rd11;
add.s32 %r14, %r13, %r2;
mul.wide.u32 %rd13, %r14, 4;
add.s64 %rd14, %rd6, %rd13;
// %f1 = gate 3; loaded here but not consumed until $L__BB0_4.
ld.global.nc.f32 %f1, [%rd14];
// ---- %f2 = 1 + exp(-g0) -------------------------------------------------
// The sequence below looks like the compiler's inlined expf expansion:
// a clamped/rounded exponent estimate (%f23), a reduced argument built with
// 0f3FB8AA3B (~log2(e)) plus a small correction term, ex2.approx on the
// reduced argument, and a scale factor made by shifting %f23's bits left by 23.
ld.global.nc.f32 %f15, [%rd8];
neg.f32 %f16, %f15;
mov.f32 %f17, 0f3F000000;
mov.f32 %f18, 0f3BBB989D;
fma.rn.f32 %f19, %f16, %f18, %f17;
cvt.sat.f32.f32 %f20, %f19;
mov.f32 %f21, 0f4B400001;
mov.f32 %f22, 0f437C0000;
fma.rm.f32 %f23, %f20, %f22, %f21;
add.f32 %f24, %f23, 0fCB40007F;
neg.f32 %f25, %f24;
mov.f32 %f26, 0f3FB8AA3B;
fma.rn.f32 %f27, %f16, %f26, %f25;
mov.f32 %f28, 0f32A57060;
fma.rn.f32 %f29, %f16, %f28, %f27;
mov.b32 %r15, %f23;
shl.b32 %r16, %r15, 23;
mov.b32 %f30, %r16;
ex2.approx.ftz.f32 %f31, %f29;
// 0f3F800000 = 1.0, so %f2 = exp-term + 1.
fma.rn.f32 %f2, %f31, %f30, 0f3F800000;
// ---- %f3 = 1 + exp(-g1): same expansion, reusing the constants above ----
ld.global.nc.f32 %f32, [%rd10];
neg.f32 %f33, %f32;
fma.rn.f32 %f34, %f33, %f18, %f17;
cvt.sat.f32.f32 %f35, %f34;
fma.rm.f32 %f36, %f35, %f22, %f21;
add.f32 %f37, %f36, 0fCB40007F;
neg.f32 %f38, %f37;
fma.rn.f32 %f39, %f33, %f26, %f38;
fma.rn.f32 %f40, %f33, %f28, %f39;
mov.b32 %r17, %f36;
shl.b32 %r18, %r17, 23;
mov.b32 %f41, %r18;
ex2.approx.ftz.f32 %f42, %f40;
fma.rn.f32 %f3, %f42, %f41, 0f3F800000;
// ---- %f101 = tanh(g2) (looks like the inlined tanhf expansion) ----------
// 0f3F19999A = 0.6. setp.ltu is also true for NaN, so NaN inputs take the
// polynomial branch.
ld.global.nc.f32 %f4, [%rd12];
abs.f32 %f5, %f4;
setp.ltu.f32 %p2, %f5, 0f3F19999A;
@%p2 bra $L__BB0_3;
bra.uni $L__BB0_2;
$L__BB0_3:
// Small-|x| branch: %f101 = x + x * (x^2 * P(x^2)), P evaluated by Horner FMAs.
mul.f32 %f51, %f4, %f4;
mov.f32 %f52, 0fBD563CAE;
mov.f32 %f53, 0f3C80F082;
fma.rn.f32 %f54, %f53, %f51, %f52;
mov.f32 %f55, 0f3E085941;
fma.rn.f32 %f56, %f54, %f51, %f55;
mov.f32 %f57, 0fBEAAA9ED;
fma.rn.f32 %f58, %f56, %f51, %f57;
mov.f32 %f59, 0f00000000;
fma.rn.f32 %f60, %f58, %f51, %f59;
fma.rn.f32 %f101, %f60, %f4, %f4;
bra.uni $L__BB0_4;
$L__BB0_2:
// Large-|x| branch: 1 - 2 / (2^(|x| * 0f4038AA3B) + 1), forced to 1.0 once
// |x| >= 0f41102CB4 (~9.01); the sign bit of x is then OR-ed back in.
mul.f32 %f43, %f5, 0f4038AA3B;
ex2.approx.ftz.f32 %f44, %f43;
add.f32 %f45, %f44, 0f3F800000;
mov.f32 %f46, 0f3F800000;
rcp.approx.ftz.f32 %f47, %f45;
mov.f32 %f48, 0fC0000000;
fma.rn.f32 %f49, %f47, %f48, %f46;
setp.ge.f32 %p3, %f5, 0f41102CB4;
selp.f32 %f50, 0f3F800000, %f49, %p3;
mov.b32 %r19, %f50;
mov.b32 %r20, %f4;
and.b32 %r21, %r20, -2147483648;
or.b32 %r22, %r21, %r19;
mov.b32 %f101, %r22;
$L__BB0_4:
// ---- %f9 = 1 + exp(-g3), using %f1 loaded at the top --------------------
neg.f32 %f61, %f1;
mov.f32 %f62, 0f3F000000;
mov.f32 %f63, 0f3BBB989D;
fma.rn.f32 %f64, %f61, %f63, %f62;
cvt.sat.f32.f32 %f65, %f64;
mov.f32 %f66, 0f4B400001;
mov.f32 %f67, 0f437C0000;
fma.rm.f32 %f68, %f65, %f67, %f66;
add.f32 %f69, %f68, 0fCB40007F;
neg.f32 %f70, %f69;
mov.f32 %f71, 0f3FB8AA3B;
fma.rn.f32 %f72, %f61, %f71, %f70;
mov.f32 %f73, 0f32A57060;
fma.rn.f32 %f74, %f61, %f73, %f72;
mov.b32 %r23, %f68;
shl.b32 %r24, %r23, 23;
mov.b32 %f75, %r24;
ex2.approx.ftz.f32 %f76, %f74;
fma.rn.f32 %f9, %f76, %f75, 0f3F800000;
// ---- c_new = s(g1) * param_1[i] + s(g0) * tanh(g2); stored to param_3[i] -
cvt.u64.u32 %rd1, %r1;
cvta.to.global.u64 %rd15, %rd3;
mul.wide.u32 %rd16, %r1, 4;
add.s64 %rd17, %rd15, %rd16;
ld.global.nc.f32 %f77, [%rd17];
rcp.rn.f32 %f78, %f3;
mul.f32 %f79, %f78, %f77;
rcp.rn.f32 %f80, %f2;
fma.rn.f32 %f10, %f80, %f101, %f79;
cvta.to.global.u64 %rd18, %rd5;
add.s64 %rd19, %rd18, %rd16;
st.global.f32 [%rd19], %f10;
// ---- %f102 = tanh(c_new): same two-branch expansion as above ------------
abs.f32 %f11, %f10;
setp.ltu.f32 %p4, %f11, 0f3F19999A;
@%p4 bra $L__BB0_6;
bra.uni $L__BB0_5;
$L__BB0_6:
mul.f32 %f89, %f10, %f10;
mov.f32 %f90, 0fBD563CAE;
mov.f32 %f91, 0f3C80F082;
fma.rn.f32 %f92, %f91, %f89, %f90;
mov.f32 %f93, 0f3E085941;
fma.rn.f32 %f94, %f92, %f89, %f93;
mov.f32 %f95, 0fBEAAA9ED;
fma.rn.f32 %f96, %f94, %f89, %f95;
mov.f32 %f97, 0f00000000;
fma.rn.f32 %f98, %f96, %f89, %f97;
fma.rn.f32 %f102, %f98, %f10, %f10;
bra.uni $L__BB0_7;
$L__BB0_5:
mul.f32 %f81, %f11, 0f4038AA3B;
ex2.approx.ftz.f32 %f82, %f81;
add.f32 %f83, %f82, 0f3F800000;
mov.f32 %f84, 0f3F800000;
rcp.approx.ftz.f32 %f85, %f83;
mov.f32 %f86, 0fC0000000;
fma.rn.f32 %f87, %f85, %f86, %f84;
setp.ge.f32 %p5, %f11, 0f41102CB4;
selp.f32 %f88, 0f3F800000, %f87, %p5;
mov.b32 %r29, %f88;
mov.b32 %r30, %f10;
and.b32 %r31, %r30, -2147483648;
or.b32 %r32, %r31, %r29;
mov.b32 %f102, %r32;
$L__BB0_7:
// ---- param_2[i] = s(g3) * tanh(c_new) -----------------------------------
cvta.to.global.u64 %rd20, %rd4;
shl.b64 %rd21, %rd1, 2;
add.s64 %rd22, %rd20, %rd21;
rcp.rn.f32 %f99, %f9;
mul.f32 %f100, %f99, %f102;
st.global.f32 [%rd22], %f100;
$L__BB0_8:
ret;
}
// .globl gru_gates_f32
//
// gru_gates_f32 -- element-wise GRU-style state update, one thread per element.
//
// Parameter roles are inferred from usage (no names in SOURCE -- confirm with
// the host-side launcher):
//   param_0 (u64) : f32 gate pre-activations A, three slots per element
//   param_1 (u64) : f32 gate pre-activations B, same layout as A
//   param_2 (u64) : f32 array read at the flat index i (used like h_prev)
//   param_3 (u64) : f32 output written at the flat index i
//   param_4 (u32) : H, the divisor that splits i into (row, column)
//   param_5 (u32) : element count; threads with i >= param_5 return immediately
//
// With b = i / H, j = i - b*H and slot k at index (3*b + k)*H + j:
//   r   = 1 / (1 + exp(-(A0 + B0)))
//   z   = 1 / (1 + exp(-(A1 + B1)))
//   n   = tanh(A2 + r * B2)
//   out = (1 - z) * n + z * param_2[i]
//
// NOTE(review): index arithmetic is 32-bit; H == 0 is not guarded before div.u32.
.visible .entry gru_gates_f32(
.param .u64 gru_gates_f32_param_0,
.param .u64 gru_gates_f32_param_1,
.param .u64 gru_gates_f32_param_2,
.param .u64 gru_gates_f32_param_3,
.param .u32 gru_gates_f32_param_4,
.param .u32 gru_gates_f32_param_5
)
{
.reg .pred %p<4>;
.reg .f32 %f<68>;
.reg .b32 %r<22>;
.reg .b64 %rd<21>;
ld.param.u64 %rd1, [gru_gates_f32_param_0];
ld.param.u64 %rd2, [gru_gates_f32_param_1];
ld.param.u64 %rd3, [gru_gates_f32_param_2];
ld.param.u64 %rd4, [gru_gates_f32_param_3];
ld.param.u32 %r2, [gru_gates_f32_param_4];
ld.param.u32 %r3, [gru_gates_f32_param_5];
// %r1 = i = ctaid.x * ntid.x + tid.x; unsigned bounds check against param_5.
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %ctaid.x;
mov.u32 %r6, %tid.x;
mad.lo.s32 %r1, %r5, %r4, %r6;
setp.ge.u32 %p1, %r1, %r3;
@%p1 bra $L__BB1_5;
cvta.to.global.u64 %rd5, %rd2;
cvta.to.global.u64 %rd6, %rd1;
// %r8 = b = i / H, %r10 = j = i - b*H, %r11 = 3*H*b + j = index of slot 0.
mul.lo.s32 %r7, %r2, 3;
div.u32 %r8, %r1, %r2;
mul.lo.s32 %r9, %r8, %r2;
sub.s32 %r10, %r1, %r9;
mad.lo.s32 %r11, %r7, %r8, %r10;
// %rd8/%rd10/%rd12 = slot 0/1/2 addresses in A (param_0);
// %rd13/%rd14/%rd15 = the same byte offsets applied to B (param_1).
mul.wide.u32 %rd7, %r11, 4;
add.s64 %rd8, %rd6, %rd7;
add.s32 %r12, %r11, %r2;
mul.wide.u32 %rd9, %r12, 4;
add.s64 %rd10, %rd6, %rd9;
add.s32 %r13, %r12, %r2;
mul.wide.u32 %rd11, %r13, 4;
add.s64 %rd12, %rd6, %rd11;
add.s64 %rd13, %rd5, %rd7;
add.s64 %rd14, %rd5, %rd9;
add.s64 %rd15, %rd5, %rd11;
// ---- %f27 = r = 1 / (1 + exp(-(A0 + B0))) --------------------------------
// The exp part looks like the compiler's inlined expf expansion (exponent
// estimate in %f17, reduced argument via 0f3FB8AA3B ~ log2(e), ex2.approx,
// power-of-two scale from shl 23).
ld.global.nc.f32 %f7, [%rd13];
ld.global.nc.f32 %f8, [%rd8];
add.f32 %f9, %f8, %f7;
neg.f32 %f10, %f9;
mov.f32 %f11, 0f3F000000;
mov.f32 %f12, 0f3BBB989D;
fma.rn.f32 %f13, %f10, %f12, %f11;
cvt.sat.f32.f32 %f14, %f13;
mov.f32 %f15, 0f4B400001;
mov.f32 %f16, 0f437C0000;
fma.rm.f32 %f17, %f14, %f16, %f15;
add.f32 %f18, %f17, 0fCB40007F;
neg.f32 %f19, %f18;
mov.f32 %f20, 0f3FB8AA3B;
fma.rn.f32 %f21, %f10, %f20, %f19;
mov.f32 %f22, 0f32A57060;
fma.rn.f32 %f23, %f10, %f22, %f21;
mov.b32 %r14, %f17;
shl.b32 %r15, %r14, 23;
mov.b32 %f24, %r15;
ex2.approx.ftz.f32 %f25, %f23;
fma.rn.f32 %f26, %f25, %f24, 0f3F800000;
rcp.rn.f32 %f27, %f26;
// ---- %f1 = 1 + exp(-(A1 + B1)); its reciprocal (z) is taken at $L__BB1_4 --
ld.global.nc.f32 %f28, [%rd14];
ld.global.nc.f32 %f29, [%rd10];
add.f32 %f30, %f29, %f28;
neg.f32 %f31, %f30;
fma.rn.f32 %f32, %f31, %f12, %f11;
cvt.sat.f32.f32 %f33, %f32;
fma.rm.f32 %f34, %f33, %f16, %f15;
add.f32 %f35, %f34, 0fCB40007F;
neg.f32 %f36, %f35;
fma.rn.f32 %f37, %f31, %f20, %f36;
fma.rn.f32 %f38, %f31, %f22, %f37;
mov.b32 %r16, %f34;
shl.b32 %r17, %r16, 23;
mov.b32 %f39, %r17;
ex2.approx.ftz.f32 %f40, %f38;
fma.rn.f32 %f1, %f40, %f39, 0f3F800000;
// ---- %f2 = B2 * r + A2; %f67 = tanh(%f2) ----------------------------------
ld.global.nc.f32 %f41, [%rd15];
ld.global.nc.f32 %f42, [%rd12];
fma.rn.f32 %f2, %f41, %f27, %f42;
// 0f3F19999A = 0.6; setp.ltu is also true for NaN (polynomial branch).
abs.f32 %f3, %f2;
setp.ltu.f32 %p2, %f3, 0f3F19999A;
@%p2 bra $L__BB1_3;
bra.uni $L__BB1_2;
$L__BB1_3:
// Small-|x| branch: x + x * (x^2 * P(x^2)) via Horner FMAs.
mul.f32 %f51, %f2, %f2;
mov.f32 %f52, 0fBD563CAE;
mov.f32 %f53, 0f3C80F082;
fma.rn.f32 %f54, %f53, %f51, %f52;
mov.f32 %f55, 0f3E085941;
fma.rn.f32 %f56, %f54, %f51, %f55;
mov.f32 %f57, 0fBEAAA9ED;
fma.rn.f32 %f58, %f56, %f51, %f57;
mov.f32 %f59, 0f00000000;
fma.rn.f32 %f60, %f58, %f51, %f59;
fma.rn.f32 %f67, %f60, %f2, %f2;
bra.uni $L__BB1_4;
$L__BB1_2:
// Large-|x| branch: 1 - 2 / (2^(|x| * 0f4038AA3B) + 1), forced to 1.0 once
// |x| >= 0f41102CB4 (~9.01); the sign bit of x is OR-ed back in.
mul.f32 %f43, %f3, 0f4038AA3B;
ex2.approx.ftz.f32 %f44, %f43;
add.f32 %f45, %f44, 0f3F800000;
mov.f32 %f46, 0f3F800000;
rcp.approx.ftz.f32 %f47, %f45;
mov.f32 %f48, 0fC0000000;
fma.rn.f32 %f49, %f47, %f48, %f46;
setp.ge.f32 %p3, %f3, 0f41102CB4;
selp.f32 %f50, 0f3F800000, %f49, %p3;
mov.b32 %r18, %f50;
mov.b32 %r19, %f2;
and.b32 %r20, %r19, -2147483648;
or.b32 %r21, %r20, %r18;
mov.b32 %f67, %r21;
$L__BB1_4:
// ---- out = (1 - z) * n + z * param_2[i], with z = 1 / %f1 -----------------
rcp.rn.f32 %f61, %f1;
mov.f32 %f62, 0f3F800000;
sub.f32 %f63, %f62, %f61;
cvta.to.global.u64 %rd16, %rd3;
mul.wide.u32 %rd17, %r1, 4;
add.s64 %rd18, %rd16, %rd17;
ld.global.nc.f32 %f64, [%rd18];
mul.f32 %f65, %f61, %f64;
fma.rn.f32 %f66, %f63, %f67, %f65;
cvta.to.global.u64 %rd19, %rd4;
add.s64 %rd20, %rd19, %rd17;
st.global.f32 [%rd20], %f66;
$L__BB1_5:
ret;
}
// .globl lstm_gates_backward_f32
//
// lstm_gates_backward_f32 -- element-wise backward pass matching the
// four-gate layout of lstm_gates_f32, one thread per element.
//
// Parameter roles are inferred from usage (no names in SOURCE -- confirm with
// the host-side launcher):
//   param_0 (u64) : f32 gate pre-activations g0..g3 (read)
//   param_1 (u64) : f32, read at i, multiplies the gate-1 gradient (used like c_prev)
//   param_2 (u64) : f32, read at i, tanh is applied to it       (used like c_new)
//   param_3 (u64) : f32, read at i  (used like the gradient w.r.t. h)   -> dh
//   param_4 (u64) : f32, read at i  (used like an incoming gradient on c) -> dc_in
//   param_5 (u64) : f32 output, four values per element at the gate indices
//   param_6 (u64) : f32 output at i
//   param_7 (u32) : H, the divisor that splits i into (row, column)
//   param_8 (u32) : element count; threads with i >= param_8 return immediately
//
// With a = 1/(1+exp(-g0)), f = 1/(1+exp(-g1)), g = tanh(g2), o = 1/(1+exp(-g3)),
// t = tanh(param_2[i]) the kernel computes
//   dc         = o * dh * (1 - t*t) + dc_in
//   param_5[0] = (1 - a) * a * (g * dc)
//   param_5[1] = (1 - f) * f * (param_1[i] * dc)
//   param_5[2] = (1 - g*g) * (a * dc)
//   param_5[3] = (1 - o) * o * (t * dh)
//   param_6[i] = f * dc
//
// NOTE(review): index arithmetic is 32-bit; H == 0 is not guarded before div.u32.
.visible .entry lstm_gates_backward_f32(
.param .u64 lstm_gates_backward_f32_param_0,
.param .u64 lstm_gates_backward_f32_param_1,
.param .u64 lstm_gates_backward_f32_param_2,
.param .u64 lstm_gates_backward_f32_param_3,
.param .u64 lstm_gates_backward_f32_param_4,
.param .u64 lstm_gates_backward_f32_param_5,
.param .u64 lstm_gates_backward_f32_param_6,
.param .u32 lstm_gates_backward_f32_param_7,
.param .u32 lstm_gates_backward_f32_param_8
)
{
.reg .pred %p<6>;
.reg .f32 %f<125>;
.reg .b32 %r<29>;
.reg .b64 %rd<43>;
ld.param.u64 %rd6, [lstm_gates_backward_f32_param_0];
ld.param.u64 %rd7, [lstm_gates_backward_f32_param_1];
ld.param.u64 %rd8, [lstm_gates_backward_f32_param_2];
ld.param.u64 %rd9, [lstm_gates_backward_f32_param_3];
ld.param.u64 %rd10, [lstm_gates_backward_f32_param_4];
ld.param.u64 %rd11, [lstm_gates_backward_f32_param_5];
ld.param.u64 %rd12, [lstm_gates_backward_f32_param_6];
ld.param.u32 %r2, [lstm_gates_backward_f32_param_7];
ld.param.u32 %r3, [lstm_gates_backward_f32_param_8];
// %r1 = i = ctaid.x * ntid.x + tid.x; unsigned bounds check against param_8.
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %ctaid.x;
mov.u32 %r6, %tid.x;
mad.lo.s32 %r1, %r5, %r4, %r6;
setp.ge.u32 %p1, %r1, %r3;
@%p1 bra $L__BB2_8;
cvta.to.global.u64 %rd13, %rd6;
// %r7 = b = i / H, %r10 = j = i - b*H, %r11 = (4*b)*H + j = index of gate 0.
div.u32 %r7, %r1, %r2;
shl.b32 %r8, %r7, 2;
mul.lo.s32 %r9, %r7, %r2;
sub.s32 %r10, %r1, %r9;
mad.lo.s32 %r11, %r8, %r2, %r10;
// %rd1..%rd4 keep the zero-extended gate indices for the param_5 stores at
// the end; %rd15/%rd17/%rd19/%rd21 are the gate 0..3 read addresses.
cvt.u64.u32 %rd1, %r11;
mul.wide.u32 %rd14, %r11, 4;
add.s64 %rd15, %rd13, %rd14;
add.s32 %r12, %r11, %r2;
cvt.u64.u32 %rd2, %r12;
mul.wide.u32 %rd16, %r12, 4;
add.s64 %rd17, %rd13, %rd16;
add.s32 %r13, %r12, %r2;
cvt.u64.u32 %rd3, %r13;
mul.wide.u32 %rd18, %r13, 4;
add.s64 %rd19, %rd13, %rd18;
add.s32 %r14, %r13, %r2;
cvt.u64.u32 %rd4, %r14;
mul.wide.u32 %rd20, %r14, 4;
add.s64 %rd21, %rd13, %rd20;
// %f1 = g3; loaded here, consumed at $L__BB2_4.
ld.global.nc.f32 %f1, [%rd21];
// ---- %f2 = 1 + exp(-g0) (looks like the inlined expf expansion) ----------
ld.global.nc.f32 %f15, [%rd15];
neg.f32 %f16, %f15;
mov.f32 %f17, 0f3F000000;
mov.f32 %f18, 0f3BBB989D;
fma.rn.f32 %f19, %f16, %f18, %f17;
cvt.sat.f32.f32 %f20, %f19;
mov.f32 %f21, 0f4B400001;
mov.f32 %f22, 0f437C0000;
fma.rm.f32 %f23, %f20, %f22, %f21;
add.f32 %f24, %f23, 0fCB40007F;
neg.f32 %f25, %f24;
mov.f32 %f26, 0f3FB8AA3B;
fma.rn.f32 %f27, %f16, %f26, %f25;
mov.f32 %f28, 0f32A57060;
fma.rn.f32 %f29, %f16, %f28, %f27;
mov.b32 %r15, %f23;
shl.b32 %r16, %r15, 23;
mov.b32 %f30, %r16;
ex2.approx.ftz.f32 %f31, %f29;
fma.rn.f32 %f2, %f31, %f30, 0f3F800000;
// ---- %f3 = 1 + exp(-g1) ---------------------------------------------------
ld.global.nc.f32 %f32, [%rd17];
neg.f32 %f33, %f32;
fma.rn.f32 %f34, %f33, %f18, %f17;
cvt.sat.f32.f32 %f35, %f34;
fma.rm.f32 %f36, %f35, %f22, %f21;
add.f32 %f37, %f36, 0fCB40007F;
neg.f32 %f38, %f37;
fma.rn.f32 %f39, %f33, %f26, %f38;
fma.rn.f32 %f40, %f33, %f28, %f39;
mov.b32 %r17, %f36;
shl.b32 %r18, %r17, 23;
mov.b32 %f41, %r18;
ex2.approx.ftz.f32 %f42, %f40;
fma.rn.f32 %f3, %f42, %f41, 0f3F800000;
// ---- %f123 = g = tanh(g2) (looks like the inlined tanhf expansion) --------
// 0f3F19999A = 0.6; setp.ltu is also true for NaN (polynomial branch).
ld.global.nc.f32 %f4, [%rd19];
abs.f32 %f5, %f4;
setp.ltu.f32 %p2, %f5, 0f3F19999A;
@%p2 bra $L__BB2_3;
bra.uni $L__BB2_2;
$L__BB2_3:
// Small-|x| branch: x + x * (x^2 * P(x^2)) via Horner FMAs.
mul.f32 %f51, %f4, %f4;
mov.f32 %f52, 0fBD563CAE;
mov.f32 %f53, 0f3C80F082;
fma.rn.f32 %f54, %f53, %f51, %f52;
mov.f32 %f55, 0f3E085941;
fma.rn.f32 %f56, %f54, %f51, %f55;
mov.f32 %f57, 0fBEAAA9ED;
fma.rn.f32 %f58, %f56, %f51, %f57;
mov.f32 %f59, 0f00000000;
fma.rn.f32 %f60, %f58, %f51, %f59;
fma.rn.f32 %f123, %f60, %f4, %f4;
bra.uni $L__BB2_4;
$L__BB2_2:
// Large-|x| branch: 1 - 2 / (2^(|x| * 0f4038AA3B) + 1), forced to 1.0 once
// |x| >= 0f41102CB4 (~9.01); the sign bit of x is OR-ed back in.
mul.f32 %f43, %f5, 0f4038AA3B;
ex2.approx.ftz.f32 %f44, %f43;
add.f32 %f45, %f44, 0f3F800000;
mov.f32 %f46, 0f3F800000;
rcp.approx.ftz.f32 %f47, %f45;
mov.f32 %f48, 0fC0000000;
fma.rn.f32 %f49, %f47, %f48, %f46;
setp.ge.f32 %p3, %f5, 0f41102CB4;
selp.f32 %f50, 0f3F800000, %f49, %p3;
mov.b32 %r19, %f50;
mov.b32 %r20, %f4;
and.b32 %r21, %r20, -2147483648;
or.b32 %r22, %r21, %r19;
mov.b32 %f123, %r22;
$L__BB2_4:
// ---- %f9 = 1 + exp(-g3) ---------------------------------------------------
neg.f32 %f61, %f1;
mov.f32 %f62, 0f3F000000;
mov.f32 %f63, 0f3BBB989D;
fma.rn.f32 %f64, %f61, %f63, %f62;
cvt.sat.f32.f32 %f65, %f64;
mov.f32 %f66, 0f4B400001;
mov.f32 %f67, 0f437C0000;
fma.rm.f32 %f68, %f65, %f67, %f66;
add.f32 %f69, %f68, 0fCB40007F;
neg.f32 %f70, %f69;
mov.f32 %f71, 0f3FB8AA3B;
fma.rn.f32 %f72, %f61, %f71, %f70;
mov.f32 %f73, 0f32A57060;
fma.rn.f32 %f74, %f61, %f73, %f72;
mov.b32 %r23, %f68;
shl.b32 %r24, %r23, 23;
mov.b32 %f75, %r24;
ex2.approx.ftz.f32 %f76, %f74;
fma.rn.f32 %f9, %f76, %f75, 0f3F800000;
// ---- %f124 = t = tanh(param_2[i]) -----------------------------------------
cvt.u64.u32 %rd5, %r1;
cvta.to.global.u64 %rd22, %rd8;
mul.wide.u32 %rd23, %r1, 4;
add.s64 %rd24, %rd22, %rd23;
ld.global.nc.f32 %f10, [%rd24];
abs.f32 %f11, %f10;
setp.ltu.f32 %p4, %f11, 0f3F19999A;
@%p4 bra $L__BB2_6;
bra.uni $L__BB2_5;
$L__BB2_6:
mul.f32 %f85, %f10, %f10;
mov.f32 %f86, 0fBD563CAE;
mov.f32 %f87, 0f3C80F082;
fma.rn.f32 %f88, %f87, %f85, %f86;
mov.f32 %f89, 0f3E085941;
fma.rn.f32 %f90, %f88, %f85, %f89;
mov.f32 %f91, 0fBEAAA9ED;
fma.rn.f32 %f92, %f90, %f85, %f91;
mov.f32 %f93, 0f00000000;
fma.rn.f32 %f94, %f92, %f85, %f93;
fma.rn.f32 %f124, %f94, %f10, %f10;
bra.uni $L__BB2_7;
$L__BB2_5:
mul.f32 %f77, %f11, 0f4038AA3B;
ex2.approx.ftz.f32 %f78, %f77;
add.f32 %f79, %f78, 0f3F800000;
mov.f32 %f80, 0f3F800000;
rcp.approx.ftz.f32 %f81, %f79;
mov.f32 %f82, 0fC0000000;
fma.rn.f32 %f83, %f81, %f82, %f80;
setp.ge.f32 %p5, %f11, 0f41102CB4;
selp.f32 %f84, 0f3F800000, %f83, %p5;
mov.b32 %r25, %f84;
mov.b32 %r26, %f10;
and.b32 %r27, %r26, -2147483648;
or.b32 %r28, %r27, %r25;
mov.b32 %f124, %r28;
$L__BB2_7:
cvta.to.global.u64 %rd25, %rd11;
// %f95 = a, %f97 = f, %f98 = o (reciprocals of the 1 + exp(-g) terms);
// %f96 = 1.0 is reused for every "1 - x" below.
rcp.rn.f32 %f95, %f2;
mov.f32 %f96, 0f3F800000;
rcp.rn.f32 %f97, %f3;
rcp.rn.f32 %f98, %f9;
// %rd27 = i * 4, the byte offset shared by all flat-indexed arrays.
cvta.to.global.u64 %rd26, %rd9;
shl.b64 %rd27, %rd5, 2;
add.s64 %rd28, %rd26, %rd27;
// %f99 = dh = param_3[i]; %f104 = dc = o*dh*(1 - t*t) + param_4[i].
ld.global.nc.f32 %f99, [%rd28];
mul.f32 %f100, %f98, %f99;
mul.f32 %f101, %f124, %f124;
sub.f32 %f102, %f96, %f101;
cvta.to.global.u64 %rd29, %rd10;
add.s64 %rd30, %rd29, %rd27;
ld.global.nc.f32 %f103, [%rd30];
fma.rn.f32 %f104, %f100, %f102, %f103;
// %f108 = (1 - a) * a * (g * dc)            -> gate-0 gradient
mul.f32 %f105, %f123, %f104;
mul.f32 %f106, %f95, %f105;
sub.f32 %f107, %f96, %f95;
mul.f32 %f108, %f107, %f106;
// %f113 = (1 - f) * f * (param_1[i] * dc)   -> gate-1 gradient
cvta.to.global.u64 %rd31, %rd7;
add.s64 %rd32, %rd31, %rd27;
ld.global.nc.f32 %f109, [%rd32];
mul.f32 %f110, %f109, %f104;
mul.f32 %f111, %f97, %f110;
sub.f32 %f112, %f96, %f97;
mul.f32 %f113, %f112, %f111;
// %f117 = (1 - g*g) * (a * dc)              -> gate-2 gradient
mul.f32 %f114, %f95, %f104;
mul.f32 %f115, %f123, %f123;
sub.f32 %f116, %f96, %f115;
mul.f32 %f117, %f116, %f114;
// %f121 = (1 - o) * o * (t * dh)            -> gate-3 gradient
mul.f32 %f118, %f124, %f99;
mul.f32 %f119, %f98, %f118;
sub.f32 %f120, %f96, %f98;
mul.f32 %f121, %f120, %f119;
// Store the four gate gradients into param_5 at the same indices the gates
// were read from in param_0.
shl.b64 %rd33, %rd1, 2;
add.s64 %rd34, %rd25, %rd33;
st.global.f32 [%rd34], %f108;
shl.b64 %rd35, %rd2, 2;
add.s64 %rd36, %rd25, %rd35;
st.global.f32 [%rd36], %f113;
shl.b64 %rd37, %rd3, 2;
add.s64 %rd38, %rd25, %rd37;
st.global.f32 [%rd38], %f117;
shl.b64 %rd39, %rd4, 2;
add.s64 %rd40, %rd25, %rd39;
st.global.f32 [%rd40], %f121;
// param_6[i] = f * dc.
mul.f32 %f122, %f97, %f104;
cvta.to.global.u64 %rd41, %rd12;
add.s64 %rd42, %rd41, %rd27;
st.global.f32 [%rd42], %f122;
$L__BB2_8:
ret;
}
// .globl gru_gates_backward_f32
//
// gru_gates_backward_f32 -- element-wise backward pass matching the
// three-slot layout of gru_gates_f32, one thread per element.
//
// Parameter roles are inferred from usage (no names in SOURCE -- confirm with
// the host-side launcher):
//   param_0 (u64) : f32 gate pre-activations A, three slots per element (read)
//   param_1 (u64) : f32 gate pre-activations B, same layout (read)
//   param_2 (u64) : f32, read at i (used like h_prev)
//   param_3 (u64) : f32, read at i (used like the gradient of the output) -> dh
//   param_4 (u64) : f32 output, three values per element (gradient for A)
//   param_5 (u64) : f32 output, three values per element (gradient for B)
//   param_6 (u64) : f32 output at i
//   param_7 (u32) : H, the divisor that splits i into (row, column)
//   param_8 (u32) : element count; threads with i >= param_8 return immediately
//
// With r = 1/(1+exp(-(A0+B0))), z = 1/(1+exp(-(A1+B1))), n = tanh(A2 + r*B2):
//   dn = (1 - n*n) * ((1 - z) * dh)
//   dz = (1 - z) * (z * (dh * (param_2[i] - n)))
//   dr = (1 - r) * (r * (B2 * dn))
//   param_4 slots = { dr, dz, dn }
//   param_5 slots = { dr, dz, r * dn }
//   param_6[i]    = z * dh
//
// NOTE(review): param_6 receives only the z * dh term in this kernel; whether
// further contributions are added elsewhere cannot be seen from here.
// NOTE(review): index arithmetic is 32-bit; H == 0 is not guarded before div.u32.
.visible .entry gru_gates_backward_f32(
.param .u64 gru_gates_backward_f32_param_0,
.param .u64 gru_gates_backward_f32_param_1,
.param .u64 gru_gates_backward_f32_param_2,
.param .u64 gru_gates_backward_f32_param_3,
.param .u64 gru_gates_backward_f32_param_4,
.param .u64 gru_gates_backward_f32_param_5,
.param .u64 gru_gates_backward_f32_param_6,
.param .u32 gru_gates_backward_f32_param_7,
.param .u32 gru_gates_backward_f32_param_8
)
{
.reg .pred %p<4>;
.reg .f32 %f<81>;
.reg .b32 %r<22>;
.reg .b64 %rd<40>;
ld.param.u64 %rd4, [gru_gates_backward_f32_param_0];
ld.param.u64 %rd5, [gru_gates_backward_f32_param_1];
ld.param.u64 %rd6, [gru_gates_backward_f32_param_2];
ld.param.u64 %rd7, [gru_gates_backward_f32_param_3];
ld.param.u64 %rd8, [gru_gates_backward_f32_param_4];
ld.param.u64 %rd9, [gru_gates_backward_f32_param_5];
ld.param.u64 %rd10, [gru_gates_backward_f32_param_6];
ld.param.u32 %r2, [gru_gates_backward_f32_param_7];
ld.param.u32 %r3, [gru_gates_backward_f32_param_8];
// %r1 = i = ctaid.x * ntid.x + tid.x; unsigned bounds check against param_8.
mov.u32 %r4, %ntid.x;
mov.u32 %r5, %ctaid.x;
mov.u32 %r6, %tid.x;
mad.lo.s32 %r1, %r5, %r4, %r6;
setp.ge.u32 %p1, %r1, %r3;
@%p1 bra $L__BB3_5;
cvta.to.global.u64 %rd11, %rd5;
cvta.to.global.u64 %rd12, %rd4;
// %r8 = b = i / H, %r10 = j = i - b*H, %r11 = 3*H*b + j = index of slot 0.
mul.lo.s32 %r7, %r2, 3;
div.u32 %r8, %r1, %r2;
mul.lo.s32 %r9, %r8, %r2;
sub.s32 %r10, %r1, %r9;
mad.lo.s32 %r11, %r7, %r8, %r10;
// %rd1..%rd3 keep the zero-extended slot indices for the output stores;
// %rd14/%rd16/%rd18 address A (param_0), %rd19/%rd20/%rd21 address B (param_1).
cvt.u64.u32 %rd1, %r11;
mul.wide.u32 %rd13, %r11, 4;
add.s64 %rd14, %rd12, %rd13;
add.s32 %r12, %r11, %r2;
cvt.u64.u32 %rd2, %r12;
mul.wide.u32 %rd15, %r12, 4;
add.s64 %rd16, %rd12, %rd15;
add.s32 %r13, %r12, %r2;
cvt.u64.u32 %rd3, %r13;
mul.wide.u32 %rd17, %r13, 4;
add.s64 %rd18, %rd12, %rd17;
add.s64 %rd19, %rd11, %rd13;
add.s64 %rd20, %rd11, %rd15;
add.s64 %rd21, %rd11, %rd17;
// ---- %f1 = r = 1 / (1 + exp(-(A0 + B0))) ----------------------------------
// The exp part looks like the compiler's inlined expf expansion.
ld.global.nc.f32 %f9, [%rd19];
ld.global.nc.f32 %f10, [%rd14];
add.f32 %f11, %f10, %f9;
neg.f32 %f12, %f11;
mov.f32 %f13, 0f3F000000;
mov.f32 %f14, 0f3BBB989D;
fma.rn.f32 %f15, %f12, %f14, %f13;
cvt.sat.f32.f32 %f16, %f15;
mov.f32 %f17, 0f4B400001;
mov.f32 %f18, 0f437C0000;
fma.rm.f32 %f19, %f16, %f18, %f17;
add.f32 %f20, %f19, 0fCB40007F;
neg.f32 %f21, %f20;
mov.f32 %f22, 0f3FB8AA3B;
fma.rn.f32 %f23, %f12, %f22, %f21;
mov.f32 %f24, 0f32A57060;
fma.rn.f32 %f25, %f12, %f24, %f23;
mov.b32 %r14, %f19;
shl.b32 %r15, %r14, 23;
mov.b32 %f26, %r15;
ex2.approx.ftz.f32 %f27, %f25;
fma.rn.f32 %f28, %f27, %f26, 0f3F800000;
rcp.rn.f32 %f1, %f28;
// ---- %f2 = 1 + exp(-(A1 + B1)); its reciprocal (z) is taken at $L__BB3_4 --
ld.global.nc.f32 %f29, [%rd20];
ld.global.nc.f32 %f30, [%rd16];
add.f32 %f31, %f30, %f29;
neg.f32 %f32, %f31;
fma.rn.f32 %f33, %f32, %f14, %f13;
cvt.sat.f32.f32 %f34, %f33;
fma.rm.f32 %f35, %f34, %f18, %f17;
add.f32 %f36, %f35, 0fCB40007F;
neg.f32 %f37, %f36;
fma.rn.f32 %f38, %f32, %f22, %f37;
fma.rn.f32 %f39, %f32, %f24, %f38;
mov.b32 %r16, %f35;
shl.b32 %r17, %r16, 23;
mov.b32 %f40, %r17;
ex2.approx.ftz.f32 %f41, %f39;
fma.rn.f32 %f2, %f41, %f40, 0f3F800000;
// ---- %f3 = B2 (kept for dr); %f4 = B2 * r + A2; %f80 = n = tanh(%f4) ------
ld.global.nc.f32 %f3, [%rd21];
ld.global.nc.f32 %f42, [%rd18];
fma.rn.f32 %f4, %f3, %f1, %f42;
// 0f3F19999A = 0.6; setp.ltu is also true for NaN (polynomial branch).
abs.f32 %f5, %f4;
setp.ltu.f32 %p2, %f5, 0f3F19999A;
@%p2 bra $L__BB3_3;
bra.uni $L__BB3_2;
$L__BB3_3:
// Small-|x| branch: x + x * (x^2 * P(x^2)) via Horner FMAs.
mul.f32 %f51, %f4, %f4;
mov.f32 %f52, 0fBD563CAE;
mov.f32 %f53, 0f3C80F082;
fma.rn.f32 %f54, %f53, %f51, %f52;
mov.f32 %f55, 0f3E085941;
fma.rn.f32 %f56, %f54, %f51, %f55;
mov.f32 %f57, 0fBEAAA9ED;
fma.rn.f32 %f58, %f56, %f51, %f57;
mov.f32 %f59, 0f00000000;
fma.rn.f32 %f60, %f58, %f51, %f59;
fma.rn.f32 %f80, %f60, %f4, %f4;
bra.uni $L__BB3_4;
$L__BB3_2:
// Large-|x| branch: 1 - 2 / (2^(|x| * 0f4038AA3B) + 1), forced to 1.0 once
// |x| >= 0f41102CB4 (~9.01); the sign bit of x is OR-ed back in.
mul.f32 %f43, %f5, 0f4038AA3B;
ex2.approx.ftz.f32 %f44, %f43;
add.f32 %f45, %f44, 0f3F800000;
mov.f32 %f46, 0f3F800000;
rcp.approx.ftz.f32 %f47, %f45;
mov.f32 %f48, 0fC0000000;
fma.rn.f32 %f49, %f47, %f48, %f46;
setp.ge.f32 %p3, %f5, 0f41102CB4;
selp.f32 %f50, 0f3F800000, %f49, %p3;
mov.b32 %r18, %f50;
mov.b32 %r19, %f4;
and.b32 %r20, %r19, -2147483648;
or.b32 %r21, %r20, %r18;
mov.b32 %f80, %r21;
$L__BB3_4:
cvta.to.global.u64 %rd22, %rd9;
cvta.to.global.u64 %rd23, %rd8;
// %f61 = z; %f62 = 1.0 is reused for every "1 - x" below.
rcp.rn.f32 %f61, %f2;
mov.f32 %f62, 0f3F800000;
// %rd25 = i * 4, the byte offset shared by the flat-indexed arrays.
cvta.to.global.u64 %rd24, %rd6;
mul.wide.u32 %rd25, %r1, 4;
add.s64 %rd26, %rd24, %rd25;
cvta.to.global.u64 %rd27, %rd7;
add.s64 %rd28, %rd27, %rd25;
// %f63 = param_2[i], %f65 = dh = param_3[i], %f66 = dh * (param_2[i] - n).
ld.global.nc.f32 %f63, [%rd26];
sub.f32 %f64, %f63, %f80;
ld.global.nc.f32 %f65, [%rd28];
mul.f32 %f66, %f65, %f64;
sub.f32 %f67, %f62, %f61;
mul.f32 %f68, %f67, %f65;
// param_6[i] = z * dh.
mul.f32 %f69, %f61, %f65;
cvta.to.global.u64 %rd29, %rd10;
add.s64 %rd30, %rd29, %rd25;
st.global.f32 [%rd30], %f69;
// %f72 = dn = (1 - n*n) * ((1 - z) * dh)
mul.f32 %f70, %f80, %f80;
sub.f32 %f71, %f62, %f70;
mul.f32 %f72, %f71, %f68;
// %f74 = dz = (1 - z) * (z * (dh * (param_2[i] - n)))
mul.f32 %f73, %f61, %f66;
mul.f32 %f74, %f67, %f73;
// %f78 = dr = (1 - r) * (r * (B2 * dn))
mul.f32 %f75, %f3, %f72;
mul.f32 %f76, %f1, %f75;
sub.f32 %f77, %f62, %f1;
mul.f32 %f78, %f77, %f76;
// param_4 slots 0/1/2 = dr, dz, dn.
shl.b64 %rd31, %rd1, 2;
add.s64 %rd32, %rd23, %rd31;
st.global.f32 [%rd32], %f78;
shl.b64 %rd33, %rd2, 2;
add.s64 %rd34, %rd23, %rd33;
st.global.f32 [%rd34], %f74;
shl.b64 %rd35, %rd3, 2;
add.s64 %rd36, %rd23, %rd35;
st.global.f32 [%rd36], %f72;
// param_5 slots 0/1 repeat dr and dz; slot 2 is r * dn (the B2 term is
// multiplied by r in the forward expression B2 * r + A2).
add.s64 %rd37, %rd22, %rd31;
st.global.f32 [%rd37], %f78;
add.s64 %rd38, %rd22, %rd33;
st.global.f32 [%rd38], %f74;
mul.f32 %f79, %f1, %f72;
add.s64 %rd39, %rd22, %rd35;
st.global.f32 [%rd39], %f79;
$L__BB3_5:
ret;
}
// .globl batchnorm_stats_f32
//
// batchnorm_stats_f32 -- accumulate per-channel sum and sum of squares,
// one thread per input element.
//
// Parameter roles are inferred from usage (no names in SOURCE -- confirm with
// the host-side launcher):
//   param_0 (u64) : f32 input, read at the flat index i
//   param_1 (u64) : f32 accumulator, receives  += x      at index c
//   param_2 (u64) : f32 accumulator, receives  += x * x  at index c
//   param_3 (u32) : outer extent (only used in the element-count product)
//   param_4 (u32) : C, the modulus that yields the channel index
//   param_5 (u32) : inner extent, the divisor applied to i
// with i = ctaid.x*ntid.x + tid.x and c = (i / param_5) % param_4.
// Threads with i >= param_3 * param_4 * param_5 return immediately.
//
// The kernel only adds into param_1 / param_2; it never clears them, so the
// caller is presumably expected to zero both arrays before launch.
//
// The additions use red.global.add.f32 (atomic reduction without a return
// value) because the previous value at the accumulator is never needed.
//
// NOTE(review): the element-count product is computed with 32-bit mul.lo and
// wraps silently for tensors of 2^32 elements or more.
.visible .entry batchnorm_stats_f32(
.param .u64 batchnorm_stats_f32_param_0,
.param .u64 batchnorm_stats_f32_param_1,
.param .u64 batchnorm_stats_f32_param_2,
.param .u32 batchnorm_stats_f32_param_3,
.param .u32 batchnorm_stats_f32_param_4,
.param .u32 batchnorm_stats_f32_param_5
)
{
.reg .pred %p<2>;
.reg .f32 %f<3>;
.reg .b32 %r<12>;
.reg .b64 %rd<12>;
ld.param.u64 %rd1, [batchnorm_stats_f32_param_0];
ld.param.u64 %rd2, [batchnorm_stats_f32_param_1];
ld.param.u64 %rd3, [batchnorm_stats_f32_param_2];
ld.param.u32 %r4, [batchnorm_stats_f32_param_3];
ld.param.u32 %r2, [batchnorm_stats_f32_param_4];
ld.param.u32 %r3, [batchnorm_stats_f32_param_5];
// %r1 = i = ctaid.x * ntid.x + tid.x
mov.u32 %r5, %ctaid.x;
mov.u32 %r6, %ntid.x;
mov.u32 %r7, %tid.x;
mad.lo.s32 %r1, %r5, %r6, %r7;
// %r9 = param_4 * param_3 * param_5 = element count; unsigned bounds check.
// A zero extent makes the count zero, so every thread exits before the
// div/rem below can see a zero divisor.
mul.lo.s32 %r8, %r2, %r4;
mul.lo.s32 %r9, %r8, %r3;
setp.ge.u32 %p1, %r1, %r9;
@%p1 bra $L__BB4_2;
cvta.to.global.u64 %rd4, %rd1;
// %r11 = c = (i / param_5) % param_4
div.u32 %r10, %r1, %r3;
rem.u32 %r11, %r10, %r2;
// %f1 = x = param_0[i]
mul.wide.u32 %rd5, %r1, 4;
add.s64 %rd6, %rd4, %rd5;
ld.global.nc.f32 %f1, [%rd6];
// param_1[c] += x
cvta.to.global.u64 %rd7, %rd2;
mul.wide.u32 %rd8, %r11, 4;
add.s64 %rd9, %rd7, %rd8;
red.global.add.f32 [%rd9], %f1;
// param_2[c] += x * x
cvta.to.global.u64 %rd10, %rd3;
add.s64 %rd11, %rd10, %rd8;
mul.f32 %f2, %f1, %f1;
red.global.add.f32 [%rd11], %f2;
$L__BB4_2:
ret;
}
// .globl batchnorm_norm_f32
//
// batchnorm_norm_f32 -- per-element normalise-and-affine, one thread per element.
//
//   out[i] = rsqrt(p2[c] + eps) * ((in[i] - p1[c]) * p3[c]) + p4[c]
//   i = ctaid.x*ntid.x + tid.x,   c = (i / param_8) % param_7
//
// Parameter roles are inferred from the arithmetic above (no names in SOURCE
// -- confirm with the host-side launcher):
//   param_0 (u64) : f32 input                        (in)
//   param_1 (u64) : f32 per-channel value subtracted (p1, presumably the mean)
//   param_2 (u64) : f32 per-channel value under rsqrt (p2, presumably the variance)
//   param_3 (u64) : f32 per-channel multiplier       (p3)
//   param_4 (u64) : f32 per-channel addend           (p4)
//   param_5 (u64) : f32 output                       (out)
//   param_6 (f32) : eps added to p2 before rsqrt
//   param_7 (u32) : channel modulus
//   param_8 (u32) : inner extent (divisor applied to i)
//   param_9 (u32) : element count; threads with i >= param_9 return immediately
//
// NOTE(review): param_7 / param_8 equal to zero are not guarded before
// rem.u32 / div.u32.
.visible .entry batchnorm_norm_f32(
.param .u64 batchnorm_norm_f32_param_0,
.param .u64 batchnorm_norm_f32_param_1,
.param .u64 batchnorm_norm_f32_param_2,
.param .u64 batchnorm_norm_f32_param_3,
.param .u64 batchnorm_norm_f32_param_4,
.param .u64 batchnorm_norm_f32_param_5,
.param .f32 batchnorm_norm_f32_param_6,
.param .u32 batchnorm_norm_f32_param_7,
.param .u32 batchnorm_norm_f32_param_8,
.param .u32 batchnorm_norm_f32_param_9
)
{
.reg .pred %p_oob;
.reg .f32 %eps, %var, %var_eps, %inv_std, %mean, %x, %centered, %scale, %scaled, %shift, %y;
.reg .b32 %n_ch, %inner, %total, %bix, %bdim, %tix, %gid, %outer, %ch;
.reg .b64 %in_gen, %mean_gen, %var_gen, %scale_gen, %shift_gen, %out_gen;
.reg .b64 %in_base, %mean_base, %var_base, %scale_base, %shift_base, %out_base;
.reg .b64 %elem_off, %ch_off;
.reg .b64 %x_addr, %mean_addr, %var_addr, %scale_addr, %shift_addr, %y_addr;

// Kernel arguments.
ld.param.u64 %in_gen, [batchnorm_norm_f32_param_0];
ld.param.u64 %mean_gen, [batchnorm_norm_f32_param_1];
ld.param.u64 %var_gen, [batchnorm_norm_f32_param_2];
ld.param.u64 %scale_gen, [batchnorm_norm_f32_param_3];
ld.param.u64 %shift_gen, [batchnorm_norm_f32_param_4];
ld.param.u64 %out_gen, [batchnorm_norm_f32_param_5];
ld.param.f32 %eps, [batchnorm_norm_f32_param_6];
ld.param.u32 %n_ch, [batchnorm_norm_f32_param_7];
ld.param.u32 %inner, [batchnorm_norm_f32_param_8];
ld.param.u32 %total, [batchnorm_norm_f32_param_9];

// gid = ctaid.x * ntid.x + tid.x; leave if past the element count.
mov.u32 %bix, %ctaid.x;
mov.u32 %bdim, %ntid.x;
mov.u32 %tix, %tid.x;
mad.lo.s32 %gid, %bix, %bdim, %tix;
setp.ge.u32 %p_oob, %gid, %total;
@%p_oob bra $L__norm_done;

// ch = (gid / inner) % n_ch; byte offsets for the element and its channel.
div.u32 %outer, %gid, %inner;
rem.u32 %ch, %outer, %n_ch;
mul.wide.u32 %elem_off, %gid, 4;
mul.wide.u32 %ch_off, %ch, 4;

// Convert every pointer to the global space and form the six addresses.
cvta.to.global.u64 %in_base, %in_gen;
cvta.to.global.u64 %mean_base, %mean_gen;
cvta.to.global.u64 %var_base, %var_gen;
cvta.to.global.u64 %scale_base, %scale_gen;
cvta.to.global.u64 %shift_base, %shift_gen;
cvta.to.global.u64 %out_base, %out_gen;
add.s64 %x_addr, %in_base, %elem_off;
add.s64 %mean_addr, %mean_base, %ch_off;
add.s64 %var_addr, %var_base, %ch_off;
add.s64 %scale_addr, %scale_base, %ch_off;
add.s64 %shift_addr, %shift_base, %ch_off;
add.s64 %y_addr, %out_base, %elem_off;

// inv_std = rsqrt(var + eps)
ld.global.nc.f32 %var, [%var_addr];
add.f32 %var_eps, %var, %eps;
rsqrt.approx.f32 %inv_std, %var_eps;

// scaled = (x - mean) * scale
ld.global.nc.f32 %mean, [%mean_addr];
ld.global.nc.f32 %x, [%x_addr];
sub.f32 %centered, %x, %mean;
ld.global.nc.f32 %scale, [%scale_addr];
mul.f32 %scaled, %centered, %scale;

// y = inv_std * scaled + shift
ld.global.nc.f32 %shift, [%shift_addr];
fma.rn.f32 %y, %inv_std, %scaled, %shift;
st.global.f32 [%y_addr], %y;
$L__norm_done:
ret;
}