//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-33961263
// Cuda compilation tools, release 12.4, V12.4.99
// Based on NVVM 7.0.1
//
.version 8.4
.target sm_89
.address_size 64
// .globl bit_linear_forward
//-----------------------------------------------------------------------
// bit_linear_forward — compiler-generated PTX (NVVM, sm_89).
// Comments added for review; instruction stream is untouched.
//
// Each thread computes ONE output element of a ternary-weight linear layer:
//   acc = sum_k ( code==1 ? +in[x*K + k] : code==2 ? -in[x*K + k] : 0 )
//   out[x * dimY + y] = acc * scale
// Weights are packed four 2-bit codes per byte, low bits first:
//   bits[1:0] -> weight 4j+0, [3:2] -> 4j+1, [5:4] -> 4j+2, [7:6] -> 4j+3.
//
// Parameters:
//   param_0 (u64): float* input   — read-only (loaded via ld.global.nc)
//   param_1 (u64): uint8* weights — packed 2-bit codes, read-only
//   param_2 (u64): float* output
//   param_3 (f32): scale          — multiplies the accumulated sum at the end
//   param_4 (u32): bound for the x coordinate (ctaid.x/tid.x)
//   param_5 (u32): K — inner length; input row of x is K floats,
//                  weight row of y is K/4 bytes
//   param_6 (u32): bound for the y coordinate; also the x-stride of output
//
// NOTE(review): only floor(K/4)*4 weights are consumed (byte count is the
// truncated quotient K/4); if K % 4 != 0 the trailing codes are silently
// dropped — confirm K is a multiple of 4 upstream.
//-----------------------------------------------------------------------
.visible .entry bit_linear_forward(
.param .u64 bit_linear_forward_param_0,
.param .u64 bit_linear_forward_param_1,
.param .u64 bit_linear_forward_param_2,
.param .f32 bit_linear_forward_param_3,
.param .u32 bit_linear_forward_param_4,
.param .u32 bit_linear_forward_param_5,
.param .u32 bit_linear_forward_param_6
)
{
// Virtual register pools (ptxas performs physical allocation).
.reg .pred %p<32>;
.reg .b16 %rs<4>;
.reg .f32 %f<75>;
.reg .b32 %r<62>;
.reg .b64 %rd<17>;
// ---- Load kernel parameters ----
ld.param.u64 %rd4, [bit_linear_forward_param_0];   // input float*
ld.param.u64 %rd5, [bit_linear_forward_param_1];   // packed weight bytes*
ld.param.u64 %rd3, [bit_linear_forward_param_2];   // output float*
ld.param.f32 %f7, [bit_linear_forward_param_3];    // scale
ld.param.u32 %r15, [bit_linear_forward_param_4];   // x bound
ld.param.u32 %r13, [bit_linear_forward_param_5];   // K
ld.param.u32 %r14, [bit_linear_forward_param_6];   // y bound / output x-stride
cvta.to.global.u64 %rd1, %rd4;          // input:   generic -> global space
cvta.to.global.u64 %rd2, %rd5;          // weights: generic -> global space
// ---- 2-D thread coordinates ----
mov.u32 %r16, %ntid.y;
mov.u32 %r17, %ctaid.y;
mov.u32 %r18, %tid.y;
mad.lo.s32 %r1, %r17, %r16, %r18;       // y = ctaid.y * ntid.y + tid.y
mov.u32 %r19, %ntid.x;
mov.u32 %r20, %ctaid.x;
mov.u32 %r21, %tid.x;
mad.lo.s32 %r2, %r20, %r19, %r21;       // x = ctaid.x * ntid.x + tid.x
// ---- Bounds check: exit if y >= param_6 or x >= param_4 ----
setp.ge.s32 %p1, %r1, %r14;
setp.ge.s32 %p2, %r2, %r15;
or.pred %p3, %p2, %p1;
@%p3 bra $L__BB0_8;
mul.lo.s32 %r3, %r2, %r13;              // r3 = x * K  (input row base, in floats)
// r4 = K / 4, signed division by shifts: add (K<0 ? 3 : 0) then >> 2.
shr.s32 %r22, %r13, 31;                 // sign mask of K
shr.u32 %r23, %r22, 30;                 // 3 if K < 0, else 0
add.s32 %r24, %r13, %r23;
shr.s32 %r4, %r24, 2;                   // r4 = bytes per weight row (K/4)
mul.lo.s32 %r5, %r4, %r1;               // r5 = y * (K/4)  (weight row byte base)
setp.lt.s32 %p4, %r13, 4;               // K < 4: nothing to decode
mov.f32 %f72, 0f00000000;               // acc = 0.0f
@%p4 bra $L__BB0_7;
// ---- Loop setup: main loop is unrolled x2 (2 bytes = 8 weights/iter) ----
and.b32 %r6, %r4, 1;                    // r6 = (K/4) & 1: one leftover byte?
and.b32 %r26, %r13, -4;
setp.eq.s32 %p5, %r26, 4;               // exactly one byte of weights:
mov.f32 %f72, 0f00000000;
mov.u32 %r61, 0;                        // r61 = byte index within the row
@%p5 bra $L__BB0_5;                     //   skip the unrolled loop entirely
sub.s32 %r60, %r4, %r6;                 // r60 = even trip count (pairs of bytes)
mov.f32 %f72, 0f00000000;               // acc = 0.0f (re-materialized)
mov.u32 %r61, 0;                        // byte index = 0
$L__BB0_4:
// ======== unrolled iteration: weight bytes r61 and r61+1 ========
add.s32 %r28, %r61, %r5;                // byte offset into weight array
cvt.s64.s32 %rd6, %r28;
add.s64 %rd7, %rd2, %rd6;
ld.global.nc.u8 %rs1, [%rd7];           // byte 0 of the pair
cvt.u32.u16 %r29, %rs1;
and.b32 %r30, %r29, 255;
shl.b32 %r31, %r61, 2;                  // 4 input floats per weight byte
add.s32 %r32, %r31, %r3;                // input element index = x*K + 4*byte
// -- byte 0, code bits[1:0]: acc' = c==1 ? acc+in : c==2 ? acc-in : acc --
and.b32 %r33, %r29, 3;
mul.wide.s32 %rd8, %r32, 4;             // float index -> byte offset
add.s64 %rd9, %rd1, %rd8;               // &input[x*K + 4*byte]
setp.eq.s32 %p6, %r33, 1;
ld.global.nc.f32 %f12, [%rd9];
add.f32 %f13, %f72, %f12;
setp.eq.s32 %p7, %r33, 2;
sub.f32 %f14, %f72, %f12;
selp.f32 %f15, %f14, %f72, %p7;
selp.f32 %f16, %f13, %f15, %p6;
// -- byte 0, code bits[3:2] --
shr.u32 %r34, %r30, 2;
and.b32 %r35, %r34, 3;
setp.eq.s32 %p8, %r35, 1;
ld.global.nc.f32 %f17, [%rd9+4];
add.f32 %f18, %f16, %f17;
setp.eq.s32 %p9, %r35, 2;
sub.f32 %f19, %f16, %f17;
selp.f32 %f20, %f19, %f16, %p9;
selp.f32 %f21, %f18, %f20, %p8;
// -- byte 0, code bits[5:4] --
shr.u32 %r36, %r30, 4;
and.b32 %r37, %r36, 3;
setp.eq.s32 %p10, %r37, 1;
ld.global.nc.f32 %f22, [%rd9+8];
add.f32 %f23, %f21, %f22;
setp.eq.s32 %p11, %r37, 2;
sub.f32 %f24, %f21, %f22;
selp.f32 %f25, %f24, %f21, %p11;
selp.f32 %f26, %f23, %f25, %p10;
// -- byte 0, code bits[7:6] (shift alone isolates 2 bits; no mask needed) --
shr.u32 %r38, %r30, 6;
setp.eq.s32 %p12, %r38, 1;
ld.global.nc.f32 %f27, [%rd9+12];
add.f32 %f28, %f26, %f27;
setp.eq.s32 %p13, %r38, 2;
sub.f32 %f29, %f26, %f27;
selp.f32 %f30, %f29, %f26, %p13;
selp.f32 %f31, %f28, %f30, %p12;
// -- byte 1 of the pair: same decode against input offsets +16..+28 --
ld.global.nc.u8 %rs2, [%rd7+1];
cvt.u32.u16 %r39, %rs2;
and.b32 %r40, %r39, 255;
// code bits[1:0]
and.b32 %r41, %r39, 3;
setp.eq.s32 %p14, %r41, 1;
ld.global.nc.f32 %f32, [%rd9+16];
add.f32 %f33, %f31, %f32;
setp.eq.s32 %p15, %r41, 2;
sub.f32 %f34, %f31, %f32;
selp.f32 %f35, %f34, %f31, %p15;
selp.f32 %f36, %f33, %f35, %p14;
// code bits[3:2]
shr.u32 %r42, %r40, 2;
and.b32 %r43, %r42, 3;
setp.eq.s32 %p16, %r43, 1;
ld.global.nc.f32 %f37, [%rd9+20];
add.f32 %f38, %f36, %f37;
setp.eq.s32 %p17, %r43, 2;
sub.f32 %f39, %f36, %f37;
selp.f32 %f40, %f39, %f36, %p17;
selp.f32 %f41, %f38, %f40, %p16;
// code bits[5:4]
shr.u32 %r44, %r40, 4;
and.b32 %r45, %r44, 3;
setp.eq.s32 %p18, %r45, 1;
ld.global.nc.f32 %f42, [%rd9+24];
add.f32 %f43, %f41, %f42;
setp.eq.s32 %p19, %r45, 2;
sub.f32 %f44, %f41, %f42;
selp.f32 %f45, %f44, %f41, %p19;
selp.f32 %f46, %f43, %f45, %p18;
// code bits[7:6]
shr.u32 %r46, %r40, 6;
setp.eq.s32 %p20, %r46, 1;
ld.global.nc.f32 %f47, [%rd9+28];
add.f32 %f48, %f46, %f47;
setp.eq.s32 %p21, %r46, 2;
sub.f32 %f49, %f46, %f47;
selp.f32 %f50, %f49, %f46, %p21;
selp.f32 %f72, %f48, %f50, %p20;        // acc updated for 8 weights
// advance two bytes; loop while pairs remain
add.s32 %r61, %r61, 2;
add.s32 %r60, %r60, -2;
setp.ne.s32 %p22, %r60, 0;
@%p22 bra $L__BB0_4;
$L__BB0_5:
// ---- Remainder: one final byte when (K/4) is odd ----
setp.eq.s32 %p23, %r6, 0;
@%p23 bra $L__BB0_7;                    // no leftover byte
add.s32 %r47, %r61, %r5;                // byte offset of last weight byte
cvt.s64.s32 %rd10, %r47;
add.s64 %rd11, %rd2, %rd10;
ld.global.nc.u8 %rs3, [%rd11];
cvt.u32.u16 %r48, %rs3;
and.b32 %r49, %r48, 255;
shl.b32 %r50, %r61, 2;
add.s32 %r51, %r50, %r3;                // input element index = x*K + 4*byte
// code bits[1:0]
and.b32 %r52, %r48, 3;
mul.wide.s32 %rd12, %r51, 4;
add.s64 %rd13, %rd1, %rd12;
setp.eq.s32 %p24, %r52, 1;
ld.global.nc.f32 %f51, [%rd13];
add.f32 %f52, %f72, %f51;
setp.eq.s32 %p25, %r52, 2;
sub.f32 %f53, %f72, %f51;
selp.f32 %f54, %f53, %f72, %p25;
selp.f32 %f55, %f52, %f54, %p24;
// code bits[3:2]
shr.u32 %r53, %r49, 2;
and.b32 %r54, %r53, 3;
setp.eq.s32 %p26, %r54, 1;
ld.global.nc.f32 %f56, [%rd13+4];
add.f32 %f57, %f55, %f56;
setp.eq.s32 %p27, %r54, 2;
sub.f32 %f58, %f55, %f56;
selp.f32 %f59, %f58, %f55, %p27;
selp.f32 %f60, %f57, %f59, %p26;
// code bits[5:4]
shr.u32 %r55, %r49, 4;
and.b32 %r56, %r55, 3;
setp.eq.s32 %p28, %r56, 1;
ld.global.nc.f32 %f61, [%rd13+8];
add.f32 %f62, %f60, %f61;
setp.eq.s32 %p29, %r56, 2;
sub.f32 %f63, %f60, %f61;
selp.f32 %f64, %f63, %f60, %p29;
selp.f32 %f65, %f62, %f64, %p28;
// code bits[7:6]
shr.u32 %r57, %r49, 6;
setp.eq.s32 %p30, %r57, 1;
ld.global.nc.f32 %f66, [%rd13+12];
add.f32 %f67, %f65, %f66;
setp.eq.s32 %p31, %r57, 2;
sub.f32 %f68, %f65, %f66;
selp.f32 %f69, %f68, %f65, %p31;
selp.f32 %f72, %f67, %f69, %p30;
$L__BB0_7:
// ---- Scale and store: output[x * param_6 + y] = acc * scale ----
mad.lo.s32 %r58, %r2, %r14, %r1;        // linear output index
cvta.to.global.u64 %rd14, %rd3;
mul.wide.s32 %rd15, %r58, 4;            // float index -> byte offset
add.s64 %rd16, %rd14, %rd15;
mul.f32 %f70, %f72, %f7;
st.global.f32 [%rd16], %f70;
$L__BB0_8:
ret;
}