//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-36037853
// Cuda compilation tools, release 12.9, V12.9.86
// Based on NVVM 7.0.1
//
.version 8.8
.target sm_80
.address_size 64
// .globl q6k_gemv_f32
// q6k_gemv_f32: every group of 32 consecutive threads (tid.x >> 5) produces one
// f32 output element: the dot product of one packed 6-bit-quantized row with an
// f32 input vector. The 32 lanes split each 256-element block between them
// (8 elements per lane), sum their partials with warp shuffles, and lane 0
// stores the result.
//
// Parameters (roles as used by the code below):
//   param_0 (u64): generic address of the packed byte data, read in 210-byte blocks.
//   param_1 (u64): generic address of the f32 input vector.
//   param_2 (u64): generic address of the f32 output, one element per row index.
//   param_3 (u32): exclusive upper bound on the row index; groups at or past it exit.
//   param_4 (u32): element count per row. Only param_4 >> 8 (whole 256-element
//                  blocks) is used, so a remainder below 256 is ignored.
//
// Layout of one 210-byte block as accessed here (presumably ggml's Q6_K block
// format, going by the kernel name -- TODO confirm against the producer):
//   bytes   0..127  low 4 bits of each quant, two quants per byte
//   bytes 128..191  high 2 bits of each quant, four quants per byte
//   bytes 192..207  signed 8-bit sub-block scales
//   bytes 208..209  16-bit half-precision super-scale, low byte first
//
// NOTE(review): the shuffles below use member mask 0xffffffff, so they appear to
// assume ntid.x is a multiple of 32 (no partial warp) -- confirm at the launch site.
.visible .entry q6k_gemv_f32(
.param .u64 q6k_gemv_f32_param_0,
.param .u64 q6k_gemv_f32_param_1,
.param .u64 q6k_gemv_f32_param_2,
.param .u32 q6k_gemv_f32_param_3,
.param .u32 q6k_gemv_f32_param_4
)
{
.reg .pred %p<15>;
.reg .b16 %rs<31>;
.reg .f32 %f<66>;
.reg .b32 %r<127>;
.reg .b64 %rd<22>;
// Load arguments: %rd4 = packed data, %rd5 = input vector, %rd6 = output,
// %r17 = row bound, %r16 = elements per row.
ld.param.u64 %rd4, [q6k_gemv_f32_param_0];
ld.param.u64 %rd5, [q6k_gemv_f32_param_1];
ld.param.u64 %rd6, [q6k_gemv_f32_param_2];
ld.param.u32 %r17, [q6k_gemv_f32_param_3];
ld.param.u32 %r16, [q6k_gemv_f32_param_4];
// %r2 = row index = (ntid.x / 32) * ctaid.x + (tid.x / 32).
mov.u32 %r1, %tid.x;
shr.u32 %r18, %r1, 5;
mov.u32 %r19, %ntid.x;
shr.u32 %r20, %r19, 5;
mov.u32 %r21, %ctaid.x;
mad.lo.s32 %r2, %r20, %r21, %r18;
// Row index out of range: return without touching memory. The index depends only
// on tid.x >> 5, so all 32 threads of a group take the same branch.
setp.ge.u32 %p1, %r2, %r17;
@%p1 bra $L__BB0_15;
// %r22 = blocks per row (param_4 / 256).
// %rd1 = byte offset of this row's first block = (blocks_per_row * 210) * row.
// The blocks_per_row * 210 product is 32-bit; only the multiply by the row index
// is widened to 64 bits.
shr.u32 %r22, %r16, 8;
mul.lo.s32 %r23, %r22, 210;
mul.wide.u32 %rd1, %r23, %r2;
// No whole block in the row: skip the loop with a zero accumulator, so the
// reduction and store below write 0.0.
setp.eq.s32 %p2, %r22, 0;
mov.f32 %f65, 0f00000000;
@%p2 bra $L__BB0_13;
// %r25 / %rd2 = lane id (tid.x & 31), %f65 = running partial sum,
// %r123 = block counter, %rd8 / %rd16 = global-space data / input pointers.
and.b32 %r25, %r1, 31;
cvt.u64.u32 %rd2, %r25;
mov.f32 %f65, 0f00000000;
mov.u32 %r123, 0;
cvta.to.global.u64 %rd8, %rd4;
cvta.to.global.u64 %rd16, %rd5;
// ---- Per-block loop: one iteration per 256-element, 210-byte block ----
$L__BB0_3:
// %rd3 = byte offset of the current block, %rd9 = its address.
mul.lo.s32 %r26, %r123, 210;
cvt.u64.u32 %rd7, %r26;
add.s64 %rd3, %rd1, %rd7;
add.s64 %rd9, %rd8, %rd3;
// Assemble the 16-bit super-scale from bytes 208 (low) and 209 (high) and split
// it into half-precision fields: %r4 = sign bit, %rs2 = 5-bit exponent,
// %r124 = 10-bit mantissa. prmt selector 30212 (0x7604) yields
// byte208 | (byte209 << 8) with the upper two bytes zero.
ld.global.nc.u8 %rs3, [%rd9+208];
cvt.u32.u16 %r27, %rs3;
and.b32 %r28, %r27, 255;
ld.global.nc.u8 %rs4, [%rd9+209];
cvt.u32.u16 %r29, %rs4;
and.b32 %r30, %r29, 128;
prmt.b32 %r31, %r29, %r28, 30212;
cvt.u16.u32 %rs1, %r31;
shr.u32 %r4, %r30, 7;
shr.u16 %rs5, %rs1, 10;
and.b16 %rs2, %rs5, 31;
and.b32 %r124, %r31, 1023;
// Dispatch on the exponent field: 0 -> zero/subnormal, 31 -> inf/NaN,
// anything else -> normal number.
setp.eq.s16 %p3, %rs2, 0;
@%p3 bra $L__BB0_7;
setp.eq.s16 %p4, %rs2, 31;
@%p4 bra $L__BB0_6;
bra.uni $L__BB0_5;
$L__BB0_6:
// Exponent 31: mantissa 0 gives a signed infinity (0x7F800000 | sign);
// any other mantissa gives the fixed pattern 0x7FC00000 (a NaN; sign and
// payload are not carried over).
and.b16 %rs7, %rs1, 1023;
setp.eq.s16 %p5, %rs7, 0;
shl.b32 %r37, %r4, 31;
or.b32 %r38, %r37, 2139095040;
selp.b32 %r126, %r38, 2143289344, %p5;
bra.uni $L__BB0_12;
$L__BB0_7:
// Exponent 0: mantissa 0 is a signed zero, otherwise the value is subnormal.
and.b16 %rs8, %rs1, 1023;
setp.eq.s16 %p6, %rs8, 0;
@%p6 bra $L__BB0_11;
// Subnormal: shift the mantissa left until its leading 1 reaches bit 10,
// lowering the unbiased exponent (starting from -14) by one per shift.
mov.u32 %r125, -14;
$L__BB0_9:
shl.b32 %r10, %r124, 1;
add.s32 %r125, %r125, -1;
and.b32 %r40, %r124, 512;
setp.eq.s32 %p7, %r40, 0;
mov.u32 %r124, %r10;
@%p7 bra $L__BB0_9;
// Rebuild the f32 bits: (exponent << 23) + 0x3F800000 applies the bias of 127;
// the mask 0x7FC000 keeps mantissa bits 14..22 and drops the implicit leading 1
// that the shift moved to bit 23.
shl.b32 %r41, %r125, 23;
add.s32 %r42, %r41, 1065353216;
shl.b32 %r43, %r10, 13;
and.b32 %r44, %r43, 8372224;
shl.b32 %r45, %r4, 31;
or.b32 %r46, %r44, %r45;
or.b32 %r126, %r46, %r42;
bra.uni $L__BB0_12;
$L__BB0_5:
// Normal number: re-bias the exponent (+112 = 127 - 15), widen the mantissa
// from 10 to 23 bits (<< 13), and place the sign in bit 31.
add.s16 %rs6, %rs2, 112;
cvt.u32.u16 %r32, %rs6;
shl.b32 %r33, %r32, 23;
shl.b32 %r34, %r124, 13;
shl.b32 %r35, %r4, 31;
or.b32 %r36, %r34, %r35;
or.b32 %r126, %r36, %r33;
bra.uni $L__BB0_12;
$L__BB0_11:
// Signed zero.
shl.b32 %r126, %r4, 31;
$L__BB0_12:
// %r126 now holds the f32 bit pattern of the block's super-scale.
// %r49 = (block << 8) | lane: index of this lane's first input element
// (prmt selector 8452 = 0x2104).
prmt.b32 %r49, %r123, %r25, 8452;
// %rd12 = block address + lane.
add.s64 %rd11, %rd3, %rd2;
add.s64 %rd12, %rd8, %rd11;
// ---- First half of the block: elements lane, lane+32, lane+64, lane+96 ----
// Load two low-nibble bytes (offsets lane and lane+32) and the byte holding
// four 2-bit high parts (offset 128+lane).
ld.global.nc.u8 %rs9, [%rd12];
cvt.u32.u16 %r50, %rs9;
and.b32 %r51, %r50, 240;
ld.global.nc.u8 %rs10, [%rd12+32];
cvt.u32.u16 %r52, %rs10;
and.b32 %r53, %r52, 240;
ld.global.nc.u8 %rs11, [%rd12+128];
cvt.u32.u16 %r54, %rs11;
and.b32 %r55, %r54, 252;
// Rebuild four 6-bit quants (low nibble | 2 high bits << 4) and subtract 32 so
// the unsigned range 0..63 becomes -32..31.
// %r59: low nibble of byte[lane],     high bits 0..1
and.b32 %r56, %r50, 15;
and.b32 %r57, %r54, 3;
bfi.b32 %r58, %r57, %r56, 4, 2;
add.s32 %r59, %r58, -32;
// %r64: low nibble of byte[lane+32],  high bits 2..3
and.b32 %r60, %r52, 15;
shr.u32 %r61, %r55, 2;
and.b32 %r62, %r61, 3;
bfi.b32 %r63, %r62, %r60, 4, 2;
add.s32 %r64, %r63, -32;
// %r68: high nibble of byte[lane],    high bits 4..5
shr.u32 %r65, %r51, 4;
and.b32 %r66, %r54, 48;
or.b32 %r67, %r66, %r65;
add.s32 %r68, %r67, -32;
// %r72: high nibble of byte[lane+32], high bits 6..7
shr.u32 %r69, %r53, 4;
and.b32 %r70, %r61, 48;
or.b32 %r71, %r70, %r69;
add.s32 %r72, %r71, -32;
// Scale index = lane / 16 (0 or 1). Load the signed scale bytes at offsets
// 192, 194, 196, 198 (+ index) and multiply each by the super-scale.
shr.u32 %r73, %r25, 4;
cvt.u64.u32 %rd13, %r73;
add.s64 %rd14, %rd3, %rd13;
add.s64 %rd15, %rd8, %rd14;
ld.global.nc.u8 %rs12, [%rd15+192];
cvt.s16.s8 %rs13, %rs12;
cvt.rn.f32.s16 %f7, %rs13;
mov.b32 %f8, %r126;
mul.ftz.f32 %f9, %f8, %f7;
ld.global.nc.u8 %rs14, [%rd15+194];
cvt.s16.s8 %rs15, %rs14;
cvt.rn.f32.s16 %f10, %rs15;
mul.ftz.f32 %f11, %f8, %f10;
ld.global.nc.u8 %rs16, [%rd15+196];
cvt.s16.s8 %rs17, %rs16;
cvt.rn.f32.s16 %f12, %rs17;
mul.ftz.f32 %f13, %f8, %f12;
ld.global.nc.u8 %rs18, [%rd15+198];
cvt.s16.s8 %rs19, %rs18;
cvt.rn.f32.s16 %f14, %rs19;
mul.ftz.f32 %f15, %f8, %f14;
// %rd18 = address of input element (block * 256 + lane). The four products use
// byte offsets 0, 128, 256, 384, i.e. elements +0, +32, +64, +96.
mul.wide.u32 %rd17, %r49, 4;
add.s64 %rd18, %rd16, %rd17;
// acc += input * (super-scale * scale * quant), chained through %f19..%f31.
cvt.rn.f32.s32 %f16, %r59;
mul.ftz.f32 %f17, %f9, %f16;
ld.global.nc.f32 %f18, [%rd18];
fma.rn.ftz.f32 %f19, %f18, %f17, %f65;
cvt.rn.f32.s32 %f20, %r64;
mul.ftz.f32 %f21, %f11, %f20;
ld.global.nc.f32 %f22, [%rd18+128];
fma.rn.ftz.f32 %f23, %f22, %f21, %f19;
cvt.rn.f32.s32 %f24, %r68;
mul.ftz.f32 %f25, %f13, %f24;
ld.global.nc.f32 %f26, [%rd18+256];
fma.rn.ftz.f32 %f27, %f25, %f26, %f23;
cvt.rn.f32.s32 %f28, %r72;
mul.ftz.f32 %f29, %f15, %f28;
ld.global.nc.f32 %f30, [%rd18+384];
fma.rn.ftz.f32 %f31, %f29, %f30, %f27;
// ---- Second half of the block: elements lane+128, +160, +192, +224 ----
// Same decode as above with the low-nibble bytes at offsets lane+64 and lane+96
// and the high-bit byte at offset 160+lane.
ld.global.nc.u8 %rs20, [%rd12+64];
cvt.u32.u16 %r74, %rs20;
and.b32 %r75, %r74, 240;
ld.global.nc.u8 %rs21, [%rd12+96];
cvt.u32.u16 %r76, %rs21;
and.b32 %r77, %r76, 240;
ld.global.nc.u8 %rs22, [%rd12+160];
cvt.u32.u16 %r78, %rs22;
and.b32 %r79, %r78, 252;
// %r83: low nibble of byte[lane+64],  high bits 0..1
and.b32 %r80, %r74, 15;
and.b32 %r81, %r78, 3;
bfi.b32 %r82, %r81, %r80, 4, 2;
add.s32 %r83, %r82, -32;
// %r88: low nibble of byte[lane+96],  high bits 2..3
and.b32 %r84, %r76, 15;
shr.u32 %r85, %r79, 2;
and.b32 %r86, %r85, 3;
bfi.b32 %r87, %r86, %r84, 4, 2;
add.s32 %r88, %r87, -32;
// %r92: high nibble of byte[lane+64], high bits 4..5
shr.u32 %r89, %r75, 4;
and.b32 %r90, %r78, 48;
or.b32 %r91, %r90, %r89;
add.s32 %r92, %r91, -32;
// %r96: high nibble of byte[lane+96], high bits 6..7
shr.u32 %r93, %r77, 4;
and.b32 %r94, %r85, 48;
or.b32 %r95, %r94, %r93;
add.s32 %r96, %r95, -32;
// Scales for the second half: offsets 200, 202, 204, 206 (+ lane / 16).
ld.global.nc.u8 %rs23, [%rd15+200];
cvt.s16.s8 %rs24, %rs23;
cvt.rn.f32.s16 %f32, %rs24;
mul.ftz.f32 %f33, %f8, %f32;
ld.global.nc.u8 %rs25, [%rd15+202];
cvt.s16.s8 %rs26, %rs25;
cvt.rn.f32.s16 %f34, %rs26;
mul.ftz.f32 %f35, %f8, %f34;
ld.global.nc.u8 %rs27, [%rd15+204];
cvt.s16.s8 %rs28, %rs27;
cvt.rn.f32.s16 %f36, %rs28;
mul.ftz.f32 %f37, %f8, %f36;
ld.global.nc.u8 %rs29, [%rd15+206];
cvt.s16.s8 %rs30, %rs29;
cvt.rn.f32.s16 %f38, %rs30;
mul.ftz.f32 %f39, %f8, %f38;
// Input byte offsets 512, 640, 768, 896 = elements +128, +160, +192, +224.
// The last FMA writes the accumulator %f65 for the next iteration.
cvt.rn.f32.s32 %f40, %r83;
mul.ftz.f32 %f41, %f33, %f40;
ld.global.nc.f32 %f42, [%rd18+512];
fma.rn.ftz.f32 %f43, %f42, %f41, %f31;
cvt.rn.f32.s32 %f44, %r88;
mul.ftz.f32 %f45, %f35, %f44;
ld.global.nc.f32 %f46, [%rd18+640];
fma.rn.ftz.f32 %f47, %f46, %f45, %f43;
cvt.rn.f32.s32 %f48, %r92;
mul.ftz.f32 %f49, %f37, %f48;
ld.global.nc.f32 %f50, [%rd18+768];
fma.rn.ftz.f32 %f51, %f49, %f50, %f47;
cvt.rn.f32.s32 %f52, %r96;
mul.ftz.f32 %f53, %f39, %f52;
ld.global.nc.f32 %f54, [%rd18+896];
fma.rn.ftz.f32 %f65, %f53, %f54, %f51;
// Next block; loop while block < blocks_per_row.
add.s32 %r123, %r123, 1;
setp.lt.u32 %p8, %r123, %r22;
@%p8 bra $L__BB0_3;
$L__BB0_13:
// ---- Warp reduction: butterfly shuffles with lane masks 16, 8, 4, 2, 1 ----
// Each step adds the partial sum of the lane whose id differs in one bit.
// %r99 = 31 is the shuffle's clamp operand and %r101 = -1 is the member mask
// naming all 32 lanes.
mov.b32 %r98, %f65;
mov.u32 %r99, 31;
mov.u32 %r100, 16;
mov.u32 %r101, -1;
shfl.sync.bfly.b32 %r102|%p9, %r98, %r100, %r99, %r101;
mov.b32 %f55, %r102;
add.ftz.f32 %f56, %f65, %f55;
mov.b32 %r103, %f56;
mov.u32 %r104, 8;
shfl.sync.bfly.b32 %r105|%p10, %r103, %r104, %r99, %r101;
mov.b32 %f57, %r105;
add.ftz.f32 %f58, %f56, %f57;
mov.b32 %r106, %f58;
mov.u32 %r107, 4;
shfl.sync.bfly.b32 %r108|%p11, %r106, %r107, %r99, %r101;
mov.b32 %f59, %r108;
add.ftz.f32 %f60, %f58, %f59;
mov.b32 %r109, %f60;
mov.u32 %r110, 2;
shfl.sync.bfly.b32 %r111|%p12, %r109, %r110, %r99, %r101;
mov.b32 %f61, %r111;
add.ftz.f32 %f62, %f60, %f61;
mov.b32 %r112, %f62;
mov.u32 %r113, 1;
shfl.sync.bfly.b32 %r114|%p13, %r112, %r113, %r99, %r101;
mov.b32 %f63, %r114;
add.ftz.f32 %f4, %f62, %f63;
// Only lane 0 writes the row's result: output[row] = %f4.
and.b32 %r116, %r1, 31;
setp.ne.s32 %p14, %r116, 0;
@%p14 bra $L__BB0_15;
cvta.to.global.u64 %rd19, %rd6;
mul.wide.u32 %rd20, %r2, 4;
add.s64 %rd21, %rd19, %rd20;
st.global.f32 [%rd21], %f4;
$L__BB0_15:
ret;
}
// .globl q6k_gemm_f32
.visible .entry q6k_gemm_f32(
.param .u64 q6k_gemm_f32_param_0,
.param .u64 q6k_gemm_f32_param_1,
.param .u64 q6k_gemm_f32_param_2,
.param .u32 q6k_gemm_f32_param_3,
.param .u32 q6k_gemm_f32_param_4,
.param .u32 q6k_gemm_f32_param_5
)
{
.reg .pred %p<9>;
.reg .b16 %rs<232>;
.reg .f32 %f<1064>;
.reg .b32 %r<1531>;
.reg .b64 %rd<18>;
ld.param.u64 %rd4, [q6k_gemm_f32_param_0];
ld.param.u64 %rd5, [q6k_gemm_f32_param_1];
ld.param.u64 %rd6, [q6k_gemm_f32_param_2];
ld.param.u32 %r18, [q6k_gemm_f32_param_3];
ld.param.u32 %r16, [q6k_gemm_f32_param_4];
ld.param.u32 %r17, [q6k_gemm_f32_param_5];
mov.u32 %r19, %ctaid.x;
mov.u32 %r20, %ntid.x;
mov.u32 %r21, %tid.x;
mad.lo.s32 %r1, %r19, %r20, %r21;
mul.lo.s32 %r22, %r16, %r18;
setp.ge.u32 %p1, %r1, %r22;
@%p1 bra $L__BB1_14;
shr.u32 %r2, %r17, 8;
setp.eq.s32 %p2, %r2, 0;
mov.f32 %f1063, 0f00000000;
@%p2 bra $L__BB1_13;
div.u32 %r24, %r1, %r16;
mul.wide.u32 %rd1, %r24, %r17;
mul.lo.s32 %r25, %r2, 210;
mul.lo.s32 %r26, %r24, %r16;
sub.s32 %r27, %r1, %r26;
mul.wide.u32 %rd2, %r27, %r25;
mov.u32 %r1527, 0;
mov.f32 %f1063, 0f00000000;
cvta.to.global.u64 %rd9, %rd4;
cvta.to.global.u64 %rd12, %rd5;
$L__BB1_3:
mul.lo.s32 %r28, %r1527, 210;
cvt.u64.u32 %rd7, %r28;
add.s64 %rd8, %rd2, %rd7;
add.s64 %rd3, %rd9, %rd8;
ld.global.nc.u8 %rs3, [%rd3+208];
cvt.u32.u16 %r29, %rs3;
and.b32 %r30, %r29, 255;
ld.global.nc.u8 %rs4, [%rd3+209];
cvt.u32.u16 %r31, %rs4;
and.b32 %r32, %r31, 128;
prmt.b32 %r33, %r31, %r30, 30212;
cvt.u16.u32 %rs5, %r33;
shr.u32 %r4, %r32, 7;
shr.u16 %rs6, %rs5, 10;
and.b16 %rs1, %rs6, 31;
and.b16 %rs2, %rs5, 1023;
and.b32 %r1528, %r33, 1023;
setp.eq.s16 %p3, %rs1, 0;
@%p3 bra $L__BB1_7;
setp.eq.s16 %p4, %rs1, 31;
@%p4 bra $L__BB1_6;
bra.uni $L__BB1_5;
$L__BB1_6:
setp.eq.s16 %p5, %rs2, 0;
shl.b32 %r39, %r4, 31;
or.b32 %r40, %r39, 2139095040;
selp.b32 %r1530, %r40, 2143289344, %p5;
bra.uni $L__BB1_12;
$L__BB1_7:
setp.eq.s16 %p6, %rs2, 0;
@%p6 bra $L__BB1_11;
mov.u32 %r1529, -14;
$L__BB1_9:
shl.b32 %r10, %r1528, 1;
add.s32 %r1529, %r1529, -1;
and.b32 %r42, %r1528, 512;
setp.eq.s32 %p7, %r42, 0;
mov.u32 %r1528, %r10;
@%p7 bra $L__BB1_9;
shl.b32 %r43, %r1529, 23;
add.s32 %r44, %r43, 1065353216;
shl.b32 %r45, %r10, 13;
and.b32 %r46, %r45, 8372224;
shl.b32 %r47, %r4, 31;
or.b32 %r48, %r46, %r47;
or.b32 %r1530, %r48, %r44;
bra.uni $L__BB1_12;
$L__BB1_5:
add.s16 %rs7, %rs1, 112;
cvt.u32.u16 %r34, %rs7;
shl.b32 %r35, %r34, 23;
shl.b32 %r36, %r1528, 13;
shl.b32 %r37, %r4, 31;
or.b32 %r38, %r36, %r37;
or.b32 %r1530, %r38, %r35;
bra.uni $L__BB1_12;
$L__BB1_11:
shl.b32 %r1530, %r4, 31;
$L__BB1_12:
ld.global.nc.u8 %rs8, [%rd3];
cvt.u32.u16 %r49, %rs8;
and.b32 %r50, %r49, 240;
ld.global.nc.u8 %rs9, [%rd3+32];
cvt.u32.u16 %r51, %rs9;
and.b32 %r52, %r51, 240;
ld.global.nc.u8 %rs10, [%rd3+128];
cvt.u32.u16 %r53, %rs10;
and.b32 %r54, %r53, 252;
and.b32 %r55, %r49, 15;
and.b32 %r56, %r53, 3;
bfi.b32 %r57, %r56, %r55, 4, 2;
add.s32 %r58, %r57, -32;
and.b32 %r59, %r51, 15;
shr.u32 %r60, %r54, 2;
and.b32 %r61, %r60, 3;
bfi.b32 %r62, %r61, %r59, 4, 2;
add.s32 %r63, %r62, -32;
shr.u32 %r64, %r50, 4;
and.b32 %r65, %r53, 48;
or.b32 %r66, %r65, %r64;
add.s32 %r67, %r66, -32;
shr.u32 %r68, %r52, 4;
and.b32 %r69, %r60, 48;
or.b32 %r70, %r69, %r68;
add.s32 %r71, %r70, -32;
ld.global.nc.u8 %rs11, [%rd3+192];
cvt.s16.s8 %rs12, %rs11;
cvt.rn.f32.s16 %f6, %rs12;
mov.b32 %f7, %r1530;
mul.ftz.f32 %f8, %f7, %f6;
ld.global.nc.u8 %rs13, [%rd3+194];
cvt.s16.s8 %rs14, %rs13;
cvt.rn.f32.s16 %f9, %rs14;
mul.ftz.f32 %f10, %f7, %f9;
ld.global.nc.u8 %rs15, [%rd3+196];
cvt.s16.s8 %rs16, %rs15;
cvt.rn.f32.s16 %f11, %rs16;
mul.ftz.f32 %f12, %f7, %f11;
ld.global.nc.u8 %rs17, [%rd3+198];
cvt.s16.s8 %rs18, %rs17;
cvt.rn.f32.s16 %f13, %rs18;
mul.ftz.f32 %f14, %f7, %f13;
shl.b32 %r72, %r1527, 8;
cvt.u64.u32 %rd10, %r72;
add.s64 %rd11, %rd1, %rd10;
shl.b64 %rd13, %rd11, 2;
add.s64 %rd14, %rd12, %rd13;
cvt.rn.f32.s32 %f15, %r58;
mul.ftz.f32 %f16, %f8, %f15;
ld.global.nc.f32 %f17, [%rd14];
fma.rn.ftz.f32 %f18, %f17, %f16, %f1063;
cvt.rn.f32.s32 %f19, %r63;
mul.ftz.f32 %f20, %f10, %f19;
ld.global.nc.f32 %f21, [%rd14+128];
fma.rn.ftz.f32 %f22, %f21, %f20, %f18;
cvt.rn.f32.s32 %f23, %r67;
mul.ftz.f32 %f24, %f12, %f23;
ld.global.nc.f32 %f25, [%rd14+256];
fma.rn.ftz.f32 %f26, %f24, %f25, %f22;
cvt.rn.f32.s32 %f27, %r71;
mul.ftz.f32 %f28, %f14, %f27;
ld.global.nc.f32 %f29, [%rd14+384];
fma.rn.ftz.f32 %f30, %f28, %f29, %f26;
ld.global.nc.u8 %rs19, [%rd3+1];
cvt.u32.u16 %r73, %rs19;
and.b32 %r74, %r73, 240;
ld.global.nc.u8 %rs20, [%rd3+33];
cvt.u32.u16 %r75, %rs20;
and.b32 %r76, %r75, 240;
ld.global.nc.u8 %rs21, [%rd3+129];
cvt.u32.u16 %r77, %rs21;
and.b32 %r78, %r77, 252;
and.b32 %r79, %r73, 15;
and.b32 %r80, %r77, 3;
bfi.b32 %r81, %r80, %r79, 4, 2;
add.s32 %r82, %r81, -32;
and.b32 %r83, %r75, 15;
shr.u32 %r84, %r78, 2;
and.b32 %r85, %r84, 3;
bfi.b32 %r86, %r85, %r83, 4, 2;
add.s32 %r87, %r86, -32;
shr.u32 %r88, %r74, 4;
and.b32 %r89, %r77, 48;
or.b32 %r90, %r89, %r88;
add.s32 %r91, %r90, -32;
shr.u32 %r92, %r76, 4;
and.b32 %r93, %r84, 48;
or.b32 %r94, %r93, %r92;
add.s32 %r95, %r94, -32;
cvt.rn.f32.s32 %f31, %r82;
mul.ftz.f32 %f32, %f8, %f31;
ld.global.nc.f32 %f33, [%rd14+4];
fma.rn.ftz.f32 %f34, %f33, %f32, %f30;
cvt.rn.f32.s32 %f35, %r87;
mul.ftz.f32 %f36, %f10, %f35;
ld.global.nc.f32 %f37, [%rd14+132];
fma.rn.ftz.f32 %f38, %f37, %f36, %f34;
cvt.rn.f32.s32 %f39, %r91;
mul.ftz.f32 %f40, %f12, %f39;
ld.global.nc.f32 %f41, [%rd14+260];
fma.rn.ftz.f32 %f42, %f40, %f41, %f38;
cvt.rn.f32.s32 %f43, %r95;
mul.ftz.f32 %f44, %f14, %f43;
ld.global.nc.f32 %f45, [%rd14+388];
fma.rn.ftz.f32 %f46, %f44, %f45, %f42;
ld.global.nc.u8 %rs22, [%rd3+2];
cvt.u32.u16 %r96, %rs22;
and.b32 %r97, %r96, 240;
ld.global.nc.u8 %rs23, [%rd3+34];
cvt.u32.u16 %r98, %rs23;
and.b32 %r99, %r98, 240;
ld.global.nc.u8 %rs24, [%rd3+130];
cvt.u32.u16 %r100, %rs24;
and.b32 %r101, %r100, 252;
and.b32 %r102, %r96, 15;
and.b32 %r103, %r100, 3;
bfi.b32 %r104, %r103, %r102, 4, 2;
add.s32 %r105, %r104, -32;
and.b32 %r106, %r98, 15;
shr.u32 %r107, %r101, 2;
and.b32 %r108, %r107, 3;
bfi.b32 %r109, %r108, %r106, 4, 2;
add.s32 %r110, %r109, -32;
shr.u32 %r111, %r97, 4;
and.b32 %r112, %r100, 48;
or.b32 %r113, %r112, %r111;
add.s32 %r114, %r113, -32;
shr.u32 %r115, %r99, 4;
and.b32 %r116, %r107, 48;
or.b32 %r117, %r116, %r115;
add.s32 %r118, %r117, -32;
cvt.rn.f32.s32 %f47, %r105;
mul.ftz.f32 %f48, %f8, %f47;
ld.global.nc.f32 %f49, [%rd14+8];
fma.rn.ftz.f32 %f50, %f49, %f48, %f46;
cvt.rn.f32.s32 %f51, %r110;
mul.ftz.f32 %f52, %f10, %f51;
ld.global.nc.f32 %f53, [%rd14+136];
fma.rn.ftz.f32 %f54, %f53, %f52, %f50;
cvt.rn.f32.s32 %f55, %r114;
mul.ftz.f32 %f56, %f12, %f55;
ld.global.nc.f32 %f57, [%rd14+264];
fma.rn.ftz.f32 %f58, %f56, %f57, %f54;
cvt.rn.f32.s32 %f59, %r118;
mul.ftz.f32 %f60, %f14, %f59;
ld.global.nc.f32 %f61, [%rd14+392];
fma.rn.ftz.f32 %f62, %f60, %f61, %f58;
ld.global.nc.u8 %rs25, [%rd3+3];
cvt.u32.u16 %r119, %rs25;
and.b32 %r120, %r119, 240;
ld.global.nc.u8 %rs26, [%rd3+35];
cvt.u32.u16 %r121, %rs26;
and.b32 %r122, %r121, 240;
ld.global.nc.u8 %rs27, [%rd3+131];
cvt.u32.u16 %r123, %rs27;
and.b32 %r124, %r123, 252;
and.b32 %r125, %r119, 15;
and.b32 %r126, %r123, 3;
bfi.b32 %r127, %r126, %r125, 4, 2;
add.s32 %r128, %r127, -32;
and.b32 %r129, %r121, 15;
shr.u32 %r130, %r124, 2;
and.b32 %r131, %r130, 3;
bfi.b32 %r132, %r131, %r129, 4, 2;
add.s32 %r133, %r132, -32;
shr.u32 %r134, %r120, 4;
and.b32 %r135, %r123, 48;
or.b32 %r136, %r135, %r134;
add.s32 %r137, %r136, -32;
shr.u32 %r138, %r122, 4;
and.b32 %r139, %r130, 48;
or.b32 %r140, %r139, %r138;
add.s32 %r141, %r140, -32;
cvt.rn.f32.s32 %f63, %r128;
mul.ftz.f32 %f64, %f8, %f63;
ld.global.nc.f32 %f65, [%rd14+12];
fma.rn.ftz.f32 %f66, %f65, %f64, %f62;
cvt.rn.f32.s32 %f67, %r133;
mul.ftz.f32 %f68, %f10, %f67;
ld.global.nc.f32 %f69, [%rd14+140];
fma.rn.ftz.f32 %f70, %f69, %f68, %f66;
cvt.rn.f32.s32 %f71, %r137;
mul.ftz.f32 %f72, %f12, %f71;
ld.global.nc.f32 %f73, [%rd14+268];
fma.rn.ftz.f32 %f74, %f72, %f73, %f70;
cvt.rn.f32.s32 %f75, %r141;
mul.ftz.f32 %f76, %f14, %f75;
ld.global.nc.f32 %f77, [%rd14+396];
fma.rn.ftz.f32 %f78, %f76, %f77, %f74;
ld.global.nc.u8 %rs28, [%rd3+4];
cvt.u32.u16 %r142, %rs28;
and.b32 %r143, %r142, 240;
ld.global.nc.u8 %rs29, [%rd3+36];
cvt.u32.u16 %r144, %rs29;
and.b32 %r145, %r144, 240;
ld.global.nc.u8 %rs30, [%rd3+132];
cvt.u32.u16 %r146, %rs30;
and.b32 %r147, %r146, 252;
and.b32 %r148, %r142, 15;
and.b32 %r149, %r146, 3;
bfi.b32 %r150, %r149, %r148, 4, 2;
add.s32 %r151, %r150, -32;
and.b32 %r152, %r144, 15;
shr.u32 %r153, %r147, 2;
and.b32 %r154, %r153, 3;
bfi.b32 %r155, %r154, %r152, 4, 2;
add.s32 %r156, %r155, -32;
shr.u32 %r157, %r143, 4;
and.b32 %r158, %r146, 48;
or.b32 %r159, %r158, %r157;
add.s32 %r160, %r159, -32;
shr.u32 %r161, %r145, 4;
and.b32 %r162, %r153, 48;
or.b32 %r163, %r162, %r161;
add.s32 %r164, %r163, -32;
cvt.rn.f32.s32 %f79, %r151;
mul.ftz.f32 %f80, %f8, %f79;
ld.global.nc.f32 %f81, [%rd14+16];
fma.rn.ftz.f32 %f82, %f81, %f80, %f78;
cvt.rn.f32.s32 %f83, %r156;
mul.ftz.f32 %f84, %f10, %f83;
ld.global.nc.f32 %f85, [%rd14+144];
fma.rn.ftz.f32 %f86, %f85, %f84, %f82;
cvt.rn.f32.s32 %f87, %r160;
mul.ftz.f32 %f88, %f12, %f87;
ld.global.nc.f32 %f89, [%rd14+272];
fma.rn.ftz.f32 %f90, %f88, %f89, %f86;
cvt.rn.f32.s32 %f91, %r164;
mul.ftz.f32 %f92, %f14, %f91;
ld.global.nc.f32 %f93, [%rd14+400];
fma.rn.ftz.f32 %f94, %f92, %f93, %f90;
ld.global.nc.u8 %rs31, [%rd3+5];
cvt.u32.u16 %r165, %rs31;
and.b32 %r166, %r165, 240;
ld.global.nc.u8 %rs32, [%rd3+37];
cvt.u32.u16 %r167, %rs32;
and.b32 %r168, %r167, 240;
ld.global.nc.u8 %rs33, [%rd3+133];
cvt.u32.u16 %r169, %rs33;
and.b32 %r170, %r169, 252;
and.b32 %r171, %r165, 15;
and.b32 %r172, %r169, 3;
bfi.b32 %r173, %r172, %r171, 4, 2;
add.s32 %r174, %r173, -32;
and.b32 %r175, %r167, 15;
shr.u32 %r176, %r170, 2;
and.b32 %r177, %r176, 3;
bfi.b32 %r178, %r177, %r175, 4, 2;
add.s32 %r179, %r178, -32;
shr.u32 %r180, %r166, 4;
and.b32 %r181, %r169, 48;
or.b32 %r182, %r181, %r180;
add.s32 %r183, %r182, -32;
shr.u32 %r184, %r168, 4;
and.b32 %r185, %r176, 48;
or.b32 %r186, %r185, %r184;
add.s32 %r187, %r186, -32;
cvt.rn.f32.s32 %f95, %r174;
mul.ftz.f32 %f96, %f8, %f95;
ld.global.nc.f32 %f97, [%rd14+20];
fma.rn.ftz.f32 %f98, %f97, %f96, %f94;
cvt.rn.f32.s32 %f99, %r179;
mul.ftz.f32 %f100, %f10, %f99;
ld.global.nc.f32 %f101, [%rd14+148];
fma.rn.ftz.f32 %f102, %f101, %f100, %f98;
cvt.rn.f32.s32 %f103, %r183;
mul.ftz.f32 %f104, %f12, %f103;
ld.global.nc.f32 %f105, [%rd14+276];
fma.rn.ftz.f32 %f106, %f104, %f105, %f102;
cvt.rn.f32.s32 %f107, %r187;
mul.ftz.f32 %f108, %f14, %f107;
ld.global.nc.f32 %f109, [%rd14+404];
fma.rn.ftz.f32 %f110, %f108, %f109, %f106;
ld.global.nc.u8 %rs34, [%rd3+6];
cvt.u32.u16 %r188, %rs34;
and.b32 %r189, %r188, 240;
ld.global.nc.u8 %rs35, [%rd3+38];
cvt.u32.u16 %r190, %rs35;
and.b32 %r191, %r190, 240;
ld.global.nc.u8 %rs36, [%rd3+134];
cvt.u32.u16 %r192, %rs36;
and.b32 %r193, %r192, 252;
and.b32 %r194, %r188, 15;
and.b32 %r195, %r192, 3;
bfi.b32 %r196, %r195, %r194, 4, 2;
add.s32 %r197, %r196, -32;
and.b32 %r198, %r190, 15;
shr.u32 %r199, %r193, 2;
and.b32 %r200, %r199, 3;
bfi.b32 %r201, %r200, %r198, 4, 2;
add.s32 %r202, %r201, -32;
shr.u32 %r203, %r189, 4;
and.b32 %r204, %r192, 48;
or.b32 %r205, %r204, %r203;
add.s32 %r206, %r205, -32;
shr.u32 %r207, %r191, 4;
and.b32 %r208, %r199, 48;
or.b32 %r209, %r208, %r207;
add.s32 %r210, %r209, -32;
cvt.rn.f32.s32 %f111, %r197;
mul.ftz.f32 %f112, %f8, %f111;
ld.global.nc.f32 %f113, [%rd14+24];
fma.rn.ftz.f32 %f114, %f113, %f112, %f110;
cvt.rn.f32.s32 %f115, %r202;
mul.ftz.f32 %f116, %f10, %f115;
ld.global.nc.f32 %f117, [%rd14+152];
fma.rn.ftz.f32 %f118, %f117, %f116, %f114;
cvt.rn.f32.s32 %f119, %r206;
mul.ftz.f32 %f120, %f12, %f119;
ld.global.nc.f32 %f121, [%rd14+280];
fma.rn.ftz.f32 %f122, %f120, %f121, %f118;
cvt.rn.f32.s32 %f123, %r210;
mul.ftz.f32 %f124, %f14, %f123;
ld.global.nc.f32 %f125, [%rd14+408];
fma.rn.ftz.f32 %f126, %f124, %f125, %f122;
ld.global.nc.u8 %rs37, [%rd3+7];
cvt.u32.u16 %r211, %rs37;
and.b32 %r212, %r211, 240;
ld.global.nc.u8 %rs38, [%rd3+39];
cvt.u32.u16 %r213, %rs38;
and.b32 %r214, %r213, 240;
ld.global.nc.u8 %rs39, [%rd3+135];
cvt.u32.u16 %r215, %rs39;
and.b32 %r216, %r215, 252;
and.b32 %r217, %r211, 15;
and.b32 %r218, %r215, 3;
bfi.b32 %r219, %r218, %r217, 4, 2;
add.s32 %r220, %r219, -32;
and.b32 %r221, %r213, 15;
shr.u32 %r222, %r216, 2;
and.b32 %r223, %r222, 3;
bfi.b32 %r224, %r223, %r221, 4, 2;
add.s32 %r225, %r224, -32;
shr.u32 %r226, %r212, 4;
and.b32 %r227, %r215, 48;
or.b32 %r228, %r227, %r226;
add.s32 %r229, %r228, -32;
shr.u32 %r230, %r214, 4;
and.b32 %r231, %r222, 48;
or.b32 %r232, %r231, %r230;
add.s32 %r233, %r232, -32;
cvt.rn.f32.s32 %f127, %r220;
mul.ftz.f32 %f128, %f8, %f127;
ld.global.nc.f32 %f129, [%rd14+28];
fma.rn.ftz.f32 %f130, %f129, %f128, %f126;
cvt.rn.f32.s32 %f131, %r225;
mul.ftz.f32 %f132, %f10, %f131;
ld.global.nc.f32 %f133, [%rd14+156];
fma.rn.ftz.f32 %f134, %f133, %f132, %f130;
cvt.rn.f32.s32 %f135, %r229;
mul.ftz.f32 %f136, %f12, %f135;
ld.global.nc.f32 %f137, [%rd14+284];
fma.rn.ftz.f32 %f138, %f136, %f137, %f134;
cvt.rn.f32.s32 %f139, %r233;
mul.ftz.f32 %f140, %f14, %f139;
ld.global.nc.f32 %f141, [%rd14+412];
fma.rn.ftz.f32 %f142, %f140, %f141, %f138;
ld.global.nc.u8 %rs40, [%rd3+8];
cvt.u32.u16 %r234, %rs40;
and.b32 %r235, %r234, 240;
ld.global.nc.u8 %rs41, [%rd3+40];
cvt.u32.u16 %r236, %rs41;
and.b32 %r237, %r236, 240;
ld.global.nc.u8 %rs42, [%rd3+136];
cvt.u32.u16 %r238, %rs42;
and.b32 %r239, %r238, 252;
and.b32 %r240, %r234, 15;
and.b32 %r241, %r238, 3;
bfi.b32 %r242, %r241, %r240, 4, 2;
add.s32 %r243, %r242, -32;
and.b32 %r244, %r236, 15;
shr.u32 %r245, %r239, 2;
and.b32 %r246, %r245, 3;
bfi.b32 %r247, %r246, %r244, 4, 2;
add.s32 %r248, %r247, -32;
shr.u32 %r249, %r235, 4;
and.b32 %r250, %r238, 48;
or.b32 %r251, %r250, %r249;
add.s32 %r252, %r251, -32;
shr.u32 %r253, %r237, 4;
and.b32 %r254, %r245, 48;
or.b32 %r255, %r254, %r253;
add.s32 %r256, %r255, -32;
cvt.rn.f32.s32 %f143, %r243;
mul.ftz.f32 %f144, %f8, %f143;
ld.global.nc.f32 %f145, [%rd14+32];
fma.rn.ftz.f32 %f146, %f145, %f144, %f142;
cvt.rn.f32.s32 %f147, %r248;
mul.ftz.f32 %f148, %f10, %f147;
ld.global.nc.f32 %f149, [%rd14+160];
fma.rn.ftz.f32 %f150, %f149, %f148, %f146;
cvt.rn.f32.s32 %f151, %r252;
mul.ftz.f32 %f152, %f12, %f151;
ld.global.nc.f32 %f153, [%rd14+288];
fma.rn.ftz.f32 %f154, %f152, %f153, %f150;
cvt.rn.f32.s32 %f155, %r256;
mul.ftz.f32 %f156, %f14, %f155;
ld.global.nc.f32 %f157, [%rd14+416];
fma.rn.ftz.f32 %f158, %f156, %f157, %f154;
ld.global.nc.u8 %rs43, [%rd3+9];
cvt.u32.u16 %r257, %rs43;
and.b32 %r258, %r257, 240;
ld.global.nc.u8 %rs44, [%rd3+41];
cvt.u32.u16 %r259, %rs44;
and.b32 %r260, %r259, 240;
ld.global.nc.u8 %rs45, [%rd3+137];
cvt.u32.u16 %r261, %rs45;
and.b32 %r262, %r261, 252;
and.b32 %r263, %r257, 15;
and.b32 %r264, %r261, 3;
bfi.b32 %r265, %r264, %r263, 4, 2;
add.s32 %r266, %r265, -32;
and.b32 %r267, %r259, 15;
shr.u32 %r268, %r262, 2;
and.b32 %r269, %r268, 3;
bfi.b32 %r270, %r269, %r267, 4, 2;
add.s32 %r271, %r270, -32;
shr.u32 %r272, %r258, 4;
and.b32 %r273, %r261, 48;
or.b32 %r274, %r273, %r272;
add.s32 %r275, %r274, -32;
shr.u32 %r276, %r260, 4;
and.b32 %r277, %r268, 48;
or.b32 %r278, %r277, %r276;
add.s32 %r279, %r278, -32;
cvt.rn.f32.s32 %f159, %r266;
mul.ftz.f32 %f160, %f8, %f159;
ld.global.nc.f32 %f161, [%rd14+36];
fma.rn.ftz.f32 %f162, %f161, %f160, %f158;
cvt.rn.f32.s32 %f163, %r271;
mul.ftz.f32 %f164, %f10, %f163;
ld.global.nc.f32 %f165, [%rd14+164];
fma.rn.ftz.f32 %f166, %f165, %f164, %f162;
cvt.rn.f32.s32 %f167, %r275;
mul.ftz.f32 %f168, %f12, %f167;
ld.global.nc.f32 %f169, [%rd14+292];
fma.rn.ftz.f32 %f170, %f168, %f169, %f166;
cvt.rn.f32.s32 %f171, %r279;
mul.ftz.f32 %f172, %f14, %f171;
ld.global.nc.f32 %f173, [%rd14+420];
fma.rn.ftz.f32 %f174, %f172, %f173, %f170;
ld.global.nc.u8 %rs46, [%rd3+10];
cvt.u32.u16 %r280, %rs46;
and.b32 %r281, %r280, 240;
ld.global.nc.u8 %rs47, [%rd3+42];
cvt.u32.u16 %r282, %rs47;
and.b32 %r283, %r282, 240;
ld.global.nc.u8 %rs48, [%rd3+138];
cvt.u32.u16 %r284, %rs48;
and.b32 %r285, %r284, 252;
and.b32 %r286, %r280, 15;
and.b32 %r287, %r284, 3;
bfi.b32 %r288, %r287, %r286, 4, 2;
add.s32 %r289, %r288, -32;
and.b32 %r290, %r282, 15;
shr.u32 %r291, %r285, 2;
and.b32 %r292, %r291, 3;
bfi.b32 %r293, %r292, %r290, 4, 2;
add.s32 %r294, %r293, -32;
shr.u32 %r295, %r281, 4;
and.b32 %r296, %r284, 48;
or.b32 %r297, %r296, %r295;
add.s32 %r298, %r297, -32;
shr.u32 %r299, %r283, 4;
and.b32 %r300, %r291, 48;
or.b32 %r301, %r300, %r299;
add.s32 %r302, %r301, -32;
cvt.rn.f32.s32 %f175, %r289;
mul.ftz.f32 %f176, %f8, %f175;
ld.global.nc.f32 %f177, [%rd14+40];
fma.rn.ftz.f32 %f178, %f177, %f176, %f174;
cvt.rn.f32.s32 %f179, %r294;
mul.ftz.f32 %f180, %f10, %f179;
ld.global.nc.f32 %f181, [%rd14+168];
fma.rn.ftz.f32 %f182, %f181, %f180, %f178;
cvt.rn.f32.s32 %f183, %r298;
mul.ftz.f32 %f184, %f12, %f183;
ld.global.nc.f32 %f185, [%rd14+296];
fma.rn.ftz.f32 %f186, %f184, %f185, %f182;
cvt.rn.f32.s32 %f187, %r302;
mul.ftz.f32 %f188, %f14, %f187;
ld.global.nc.f32 %f189, [%rd14+424];
fma.rn.ftz.f32 %f190, %f188, %f189, %f186;
ld.global.nc.u8 %rs49, [%rd3+11];
cvt.u32.u16 %r303, %rs49;
and.b32 %r304, %r303, 240;
ld.global.nc.u8 %rs50, [%rd3+43];
cvt.u32.u16 %r305, %rs50;
and.b32 %r306, %r305, 240;
ld.global.nc.u8 %rs51, [%rd3+139];
cvt.u32.u16 %r307, %rs51;
and.b32 %r308, %r307, 252;
and.b32 %r309, %r303, 15;
and.b32 %r310, %r307, 3;
bfi.b32 %r311, %r310, %r309, 4, 2;
add.s32 %r312, %r311, -32;
and.b32 %r313, %r305, 15;
shr.u32 %r314, %r308, 2;
and.b32 %r315, %r314, 3;
bfi.b32 %r316, %r315, %r313, 4, 2;
add.s32 %r317, %r316, -32;
shr.u32 %r318, %r304, 4;
and.b32 %r319, %r307, 48;
or.b32 %r320, %r319, %r318;
add.s32 %r321, %r320, -32;
shr.u32 %r322, %r306, 4;
and.b32 %r323, %r314, 48;
or.b32 %r324, %r323, %r322;
add.s32 %r325, %r324, -32;
cvt.rn.f32.s32 %f191, %r312;
mul.ftz.f32 %f192, %f8, %f191;
ld.global.nc.f32 %f193, [%rd14+44];
fma.rn.ftz.f32 %f194, %f193, %f192, %f190;
cvt.rn.f32.s32 %f195, %r317;
mul.ftz.f32 %f196, %f10, %f195;
ld.global.nc.f32 %f197, [%rd14+172];
fma.rn.ftz.f32 %f198, %f197, %f196, %f194;
cvt.rn.f32.s32 %f199, %r321;
mul.ftz.f32 %f200, %f12, %f199;
ld.global.nc.f32 %f201, [%rd14+300];
fma.rn.ftz.f32 %f202, %f200, %f201, %f198;
cvt.rn.f32.s32 %f203, %r325;
mul.ftz.f32 %f204, %f14, %f203;
ld.global.nc.f32 %f205, [%rd14+428];
fma.rn.ftz.f32 %f206, %f204, %f205, %f202;
ld.global.nc.u8 %rs52, [%rd3+12];
cvt.u32.u16 %r326, %rs52;
and.b32 %r327, %r326, 240;
ld.global.nc.u8 %rs53, [%rd3+44];
cvt.u32.u16 %r328, %rs53;
and.b32 %r329, %r328, 240;
ld.global.nc.u8 %rs54, [%rd3+140];
cvt.u32.u16 %r330, %rs54;
and.b32 %r331, %r330, 252;
and.b32 %r332, %r326, 15;
and.b32 %r333, %r330, 3;
bfi.b32 %r334, %r333, %r332, 4, 2;
add.s32 %r335, %r334, -32;
and.b32 %r336, %r328, 15;
shr.u32 %r337, %r331, 2;
and.b32 %r338, %r337, 3;
bfi.b32 %r339, %r338, %r336, 4, 2;
add.s32 %r340, %r339, -32;
shr.u32 %r341, %r327, 4;
and.b32 %r342, %r330, 48;
or.b32 %r343, %r342, %r341;
add.s32 %r344, %r343, -32;
shr.u32 %r345, %r329, 4;
and.b32 %r346, %r337, 48;
or.b32 %r347, %r346, %r345;
add.s32 %r348, %r347, -32;
cvt.rn.f32.s32 %f207, %r335;
mul.ftz.f32 %f208, %f8, %f207;
ld.global.nc.f32 %f209, [%rd14+48];
fma.rn.ftz.f32 %f210, %f209, %f208, %f206;
cvt.rn.f32.s32 %f211, %r340;
mul.ftz.f32 %f212, %f10, %f211;
ld.global.nc.f32 %f213, [%rd14+176];
fma.rn.ftz.f32 %f214, %f213, %f212, %f210;
cvt.rn.f32.s32 %f215, %r344;
mul.ftz.f32 %f216, %f12, %f215;
ld.global.nc.f32 %f217, [%rd14+304];
fma.rn.ftz.f32 %f218, %f216, %f217, %f214;
cvt.rn.f32.s32 %f219, %r348;
mul.ftz.f32 %f220, %f14, %f219;
ld.global.nc.f32 %f221, [%rd14+432];
fma.rn.ftz.f32 %f222, %f220, %f221, %f218;
ld.global.nc.u8 %rs55, [%rd3+13];
cvt.u32.u16 %r349, %rs55;
and.b32 %r350, %r349, 240;
ld.global.nc.u8 %rs56, [%rd3+45];
cvt.u32.u16 %r351, %rs56;
and.b32 %r352, %r351, 240;
ld.global.nc.u8 %rs57, [%rd3+141];
cvt.u32.u16 %r353, %rs57;
and.b32 %r354, %r353, 252;
and.b32 %r355, %r349, 15;
and.b32 %r356, %r353, 3;
bfi.b32 %r357, %r356, %r355, 4, 2;
add.s32 %r358, %r357, -32;
and.b32 %r359, %r351, 15;
shr.u32 %r360, %r354, 2;
and.b32 %r361, %r360, 3;
bfi.b32 %r362, %r361, %r359, 4, 2;
add.s32 %r363, %r362, -32;
shr.u32 %r364, %r350, 4;
and.b32 %r365, %r353, 48;
or.b32 %r366, %r365, %r364;
add.s32 %r367, %r366, -32;
shr.u32 %r368, %r352, 4;
and.b32 %r369, %r360, 48;
or.b32 %r370, %r369, %r368;
add.s32 %r371, %r370, -32;
cvt.rn.f32.s32 %f223, %r358;
mul.ftz.f32 %f224, %f8, %f223;
ld.global.nc.f32 %f225, [%rd14+52];
fma.rn.ftz.f32 %f226, %f225, %f224, %f222;
cvt.rn.f32.s32 %f227, %r363;
mul.ftz.f32 %f228, %f10, %f227;
ld.global.nc.f32 %f229, [%rd14+180];
fma.rn.ftz.f32 %f230, %f229, %f228, %f226;
cvt.rn.f32.s32 %f231, %r367;
mul.ftz.f32 %f232, %f12, %f231;
ld.global.nc.f32 %f233, [%rd14+308];
fma.rn.ftz.f32 %f234, %f232, %f233, %f230;
cvt.rn.f32.s32 %f235, %r371;
mul.ftz.f32 %f236, %f14, %f235;
ld.global.nc.f32 %f237, [%rd14+436];
fma.rn.ftz.f32 %f238, %f236, %f237, %f234;
ld.global.nc.u8 %rs58, [%rd3+14];
cvt.u32.u16 %r372, %rs58;
and.b32 %r373, %r372, 240;
ld.global.nc.u8 %rs59, [%rd3+46];
cvt.u32.u16 %r374, %rs59;
and.b32 %r375, %r374, 240;
ld.global.nc.u8 %rs60, [%rd3+142];
cvt.u32.u16 %r376, %rs60;
and.b32 %r377, %r376, 252;
and.b32 %r378, %r372, 15;
and.b32 %r379, %r376, 3;
bfi.b32 %r380, %r379, %r378, 4, 2;
add.s32 %r381, %r380, -32;
and.b32 %r382, %r374, 15;
shr.u32 %r383, %r377, 2;
and.b32 %r384, %r383, 3;
bfi.b32 %r385, %r384, %r382, 4, 2;
add.s32 %r386, %r385, -32;
shr.u32 %r387, %r373, 4;
and.b32 %r388, %r376, 48;
or.b32 %r389, %r388, %r387;
add.s32 %r390, %r389, -32;
shr.u32 %r391, %r375, 4;
and.b32 %r392, %r383, 48;
or.b32 %r393, %r392, %r391;
add.s32 %r394, %r393, -32;
cvt.rn.f32.s32 %f239, %r381;
mul.ftz.f32 %f240, %f8, %f239;
ld.global.nc.f32 %f241, [%rd14+56];
fma.rn.ftz.f32 %f242, %f241, %f240, %f238;
cvt.rn.f32.s32 %f243, %r386;
mul.ftz.f32 %f244, %f10, %f243;
ld.global.nc.f32 %f245, [%rd14+184];
fma.rn.ftz.f32 %f246, %f245, %f244, %f242;
cvt.rn.f32.s32 %f247, %r390;
mul.ftz.f32 %f248, %f12, %f247;
ld.global.nc.f32 %f249, [%rd14+312];
fma.rn.ftz.f32 %f250, %f248, %f249, %f246;
cvt.rn.f32.s32 %f251, %r394;
mul.ftz.f32 %f252, %f14, %f251;
ld.global.nc.f32 %f253, [%rd14+440];
fma.rn.ftz.f32 %f254, %f252, %f253, %f250;
ld.global.nc.u8 %rs61, [%rd3+15];
cvt.u32.u16 %r395, %rs61;
and.b32 %r396, %r395, 240;
ld.global.nc.u8 %rs62, [%rd3+47];
cvt.u32.u16 %r397, %rs62;
and.b32 %r398, %r397, 240;
ld.global.nc.u8 %rs63, [%rd3+143];
cvt.u32.u16 %r399, %rs63;
and.b32 %r400, %r399, 252;
and.b32 %r401, %r395, 15;
and.b32 %r402, %r399, 3;
bfi.b32 %r403, %r402, %r401, 4, 2;
add.s32 %r404, %r403, -32;
and.b32 %r405, %r397, 15;
shr.u32 %r406, %r400, 2;
and.b32 %r407, %r406, 3;
bfi.b32 %r408, %r407, %r405, 4, 2;
add.s32 %r409, %r408, -32;
shr.u32 %r410, %r396, 4;
and.b32 %r411, %r399, 48;
or.b32 %r412, %r411, %r410;
add.s32 %r413, %r412, -32;
shr.u32 %r414, %r398, 4;
and.b32 %r415, %r406, 48;
or.b32 %r416, %r415, %r414;
add.s32 %r417, %r416, -32;
cvt.rn.f32.s32 %f255, %r404;
mul.ftz.f32 %f256, %f8, %f255;
ld.global.nc.f32 %f257, [%rd14+60];
fma.rn.ftz.f32 %f258, %f257, %f256, %f254;
cvt.rn.f32.s32 %f259, %r409;
mul.ftz.f32 %f260, %f10, %f259;
ld.global.nc.f32 %f261, [%rd14+188];
fma.rn.ftz.f32 %f262, %f261, %f260, %f258;
cvt.rn.f32.s32 %f263, %r413;
mul.ftz.f32 %f264, %f12, %f263;
ld.global.nc.f32 %f265, [%rd14+316];
fma.rn.ftz.f32 %f266, %f264, %f265, %f262;
cvt.rn.f32.s32 %f267, %r417;
mul.ftz.f32 %f268, %f14, %f267;
ld.global.nc.f32 %f269, [%rd14+444];
fma.rn.ftz.f32 %f270, %f268, %f269, %f266;
ld.global.nc.u8 %rs64, [%rd3+16];
cvt.u32.u16 %r418, %rs64;
and.b32 %r419, %r418, 240;
ld.global.nc.u8 %rs65, [%rd3+48];
cvt.u32.u16 %r420, %rs65;
and.b32 %r421, %r420, 240;
ld.global.nc.u8 %rs66, [%rd3+144];
cvt.u32.u16 %r422, %rs66;
and.b32 %r423, %r422, 252;
and.b32 %r424, %r418, 15;
and.b32 %r425, %r422, 3;
bfi.b32 %r426, %r425, %r424, 4, 2;
add.s32 %r427, %r426, -32;
and.b32 %r428, %r420, 15;
shr.u32 %r429, %r423, 2;
and.b32 %r430, %r429, 3;
bfi.b32 %r431, %r430, %r428, 4, 2;
add.s32 %r432, %r431, -32;
shr.u32 %r433, %r419, 4;
and.b32 %r434, %r422, 48;
or.b32 %r435, %r434, %r433;
add.s32 %r436, %r435, -32;
shr.u32 %r437, %r421, 4;
and.b32 %r438, %r429, 48;
or.b32 %r439, %r438, %r437;
add.s32 %r440, %r439, -32;
ld.global.nc.u8 %rs67, [%rd3+193];
cvt.s16.s8 %rs68, %rs67;
cvt.rn.f32.s16 %f271, %rs68;
mul.ftz.f32 %f272, %f7, %f271;
ld.global.nc.u8 %rs69, [%rd3+195];
cvt.s16.s8 %rs70, %rs69;
cvt.rn.f32.s16 %f273, %rs70;
mul.ftz.f32 %f274, %f7, %f273;
ld.global.nc.u8 %rs71, [%rd3+197];
cvt.s16.s8 %rs72, %rs71;
cvt.rn.f32.s16 %f275, %rs72;
mul.ftz.f32 %f276, %f7, %f275;
ld.global.nc.u8 %rs73, [%rd3+199];
cvt.s16.s8 %rs74, %rs73;
cvt.rn.f32.s16 %f277, %rs74;
mul.ftz.f32 %f278, %f7, %f277;
cvt.rn.f32.s32 %f279, %r427;
mul.ftz.f32 %f280, %f272, %f279;
ld.global.nc.f32 %f281, [%rd14+64];
fma.rn.ftz.f32 %f282, %f281, %f280, %f270;
cvt.rn.f32.s32 %f283, %r432;
mul.ftz.f32 %f284, %f274, %f283;
ld.global.nc.f32 %f285, [%rd14+192];
fma.rn.ftz.f32 %f286, %f285, %f284, %f282;
cvt.rn.f32.s32 %f287, %r436;
mul.ftz.f32 %f288, %f276, %f287;
ld.global.nc.f32 %f289, [%rd14+320];
fma.rn.ftz.f32 %f290, %f288, %f289, %f286;
cvt.rn.f32.s32 %f291, %r440;
mul.ftz.f32 %f292, %f278, %f291;
ld.global.nc.f32 %f293, [%rd14+448];
fma.rn.ftz.f32 %f294, %f292, %f293, %f290;
ld.global.nc.u8 %rs75, [%rd3+17];
cvt.u32.u16 %r441, %rs75;
and.b32 %r442, %r441, 240;
ld.global.nc.u8 %rs76, [%rd3+49];
cvt.u32.u16 %r443, %rs76;
and.b32 %r444, %r443, 240;
ld.global.nc.u8 %rs77, [%rd3+145];
cvt.u32.u16 %r445, %rs77;
and.b32 %r446, %r445, 252;
and.b32 %r447, %r441, 15;
and.b32 %r448, %r445, 3;
bfi.b32 %r449, %r448, %r447, 4, 2;
add.s32 %r450, %r449, -32;
and.b32 %r451, %r443, 15;
shr.u32 %r452, %r446, 2;
and.b32 %r453, %r452, 3;
bfi.b32 %r454, %r453, %r451, 4, 2;
add.s32 %r455, %r454, -32;
shr.u32 %r456, %r442, 4;
and.b32 %r457, %r445, 48;
or.b32 %r458, %r457, %r456;
add.s32 %r459, %r458, -32;
shr.u32 %r460, %r444, 4;
and.b32 %r461, %r452, 48;
or.b32 %r462, %r461, %r460;
add.s32 %r463, %r462, -32;
cvt.rn.f32.s32 %f295, %r450;
mul.ftz.f32 %f296, %f272, %f295;
ld.global.nc.f32 %f297, [%rd14+68];
fma.rn.ftz.f32 %f298, %f297, %f296, %f294;
cvt.rn.f32.s32 %f299, %r455;
mul.ftz.f32 %f300, %f274, %f299;
ld.global.nc.f32 %f301, [%rd14+196];
fma.rn.ftz.f32 %f302, %f301, %f300, %f298;
cvt.rn.f32.s32 %f303, %r459;
mul.ftz.f32 %f304, %f276, %f303;
ld.global.nc.f32 %f305, [%rd14+324];
fma.rn.ftz.f32 %f306, %f304, %f305, %f302;
cvt.rn.f32.s32 %f307, %r463;
mul.ftz.f32 %f308, %f278, %f307;
ld.global.nc.f32 %f309, [%rd14+452];
fma.rn.ftz.f32 %f310, %f308, %f309, %f306;
ld.global.nc.u8 %rs78, [%rd3+18];
cvt.u32.u16 %r464, %rs78;
and.b32 %r465, %r464, 240;
ld.global.nc.u8 %rs79, [%rd3+50];
cvt.u32.u16 %r466, %rs79;
and.b32 %r467, %r466, 240;
ld.global.nc.u8 %rs80, [%rd3+146];
cvt.u32.u16 %r468, %rs80;
and.b32 %r469, %r468, 252;
and.b32 %r470, %r464, 15;
and.b32 %r471, %r468, 3;
bfi.b32 %r472, %r471, %r470, 4, 2;
add.s32 %r473, %r472, -32;
and.b32 %r474, %r466, 15;
shr.u32 %r475, %r469, 2;
and.b32 %r476, %r475, 3;
bfi.b32 %r477, %r476, %r474, 4, 2;
add.s32 %r478, %r477, -32;
shr.u32 %r479, %r465, 4;
and.b32 %r480, %r468, 48;
or.b32 %r481, %r480, %r479;
add.s32 %r482, %r481, -32;
shr.u32 %r483, %r467, 4;
and.b32 %r484, %r475, 48;
or.b32 %r485, %r484, %r483;
add.s32 %r486, %r485, -32;
cvt.rn.f32.s32 %f311, %r473;
mul.ftz.f32 %f312, %f272, %f311;
ld.global.nc.f32 %f313, [%rd14+72];
fma.rn.ftz.f32 %f314, %f313, %f312, %f310;
cvt.rn.f32.s32 %f315, %r478;
mul.ftz.f32 %f316, %f274, %f315;
ld.global.nc.f32 %f317, [%rd14+200];
fma.rn.ftz.f32 %f318, %f317, %f316, %f314;
cvt.rn.f32.s32 %f319, %r482;
mul.ftz.f32 %f320, %f276, %f319;
ld.global.nc.f32 %f321, [%rd14+328];
fma.rn.ftz.f32 %f322, %f320, %f321, %f318;
cvt.rn.f32.s32 %f323, %r486;
mul.ftz.f32 %f324, %f278, %f323;
ld.global.nc.f32 %f325, [%rd14+456];
fma.rn.ftz.f32 %f326, %f324, %f325, %f322;
ld.global.nc.u8 %rs81, [%rd3+19];
cvt.u32.u16 %r487, %rs81;
and.b32 %r488, %r487, 240;
ld.global.nc.u8 %rs82, [%rd3+51];
cvt.u32.u16 %r489, %rs82;
and.b32 %r490, %r489, 240;
ld.global.nc.u8 %rs83, [%rd3+147];
cvt.u32.u16 %r491, %rs83;
and.b32 %r492, %r491, 252;
and.b32 %r493, %r487, 15;
and.b32 %r494, %r491, 3;
bfi.b32 %r495, %r494, %r493, 4, 2;
add.s32 %r496, %r495, -32;
and.b32 %r497, %r489, 15;
shr.u32 %r498, %r492, 2;
and.b32 %r499, %r498, 3;
bfi.b32 %r500, %r499, %r497, 4, 2;
add.s32 %r501, %r500, -32;
shr.u32 %r502, %r488, 4;
and.b32 %r503, %r491, 48;
or.b32 %r504, %r503, %r502;
add.s32 %r505, %r504, -32;
shr.u32 %r506, %r490, 4;
and.b32 %r507, %r498, 48;
or.b32 %r508, %r507, %r506;
add.s32 %r509, %r508, -32;
cvt.rn.f32.s32 %f327, %r496;
mul.ftz.f32 %f328, %f272, %f327;
ld.global.nc.f32 %f329, [%rd14+76];
fma.rn.ftz.f32 %f330, %f329, %f328, %f326;
cvt.rn.f32.s32 %f331, %r501;
mul.ftz.f32 %f332, %f274, %f331;
ld.global.nc.f32 %f333, [%rd14+204];
fma.rn.ftz.f32 %f334, %f333, %f332, %f330;
cvt.rn.f32.s32 %f335, %r505;
mul.ftz.f32 %f336, %f276, %f335;
ld.global.nc.f32 %f337, [%rd14+332];
fma.rn.ftz.f32 %f338, %f336, %f337, %f334;
cvt.rn.f32.s32 %f339, %r509;
mul.ftz.f32 %f340, %f278, %f339;
ld.global.nc.f32 %f341, [%rd14+460];
fma.rn.ftz.f32 %f342, %f340, %f341, %f338;
ld.global.nc.u8 %rs84, [%rd3+20];
cvt.u32.u16 %r510, %rs84;
and.b32 %r511, %r510, 240;
ld.global.nc.u8 %rs85, [%rd3+52];
cvt.u32.u16 %r512, %rs85;
and.b32 %r513, %r512, 240;
ld.global.nc.u8 %rs86, [%rd3+148];
cvt.u32.u16 %r514, %rs86;
and.b32 %r515, %r514, 252;
and.b32 %r516, %r510, 15;
and.b32 %r517, %r514, 3;
bfi.b32 %r518, %r517, %r516, 4, 2;
add.s32 %r519, %r518, -32;
and.b32 %r520, %r512, 15;
shr.u32 %r521, %r515, 2;
and.b32 %r522, %r521, 3;
bfi.b32 %r523, %r522, %r520, 4, 2;
add.s32 %r524, %r523, -32;
shr.u32 %r525, %r511, 4;
and.b32 %r526, %r514, 48;
or.b32 %r527, %r526, %r525;
add.s32 %r528, %r527, -32;
shr.u32 %r529, %r513, 4;
and.b32 %r530, %r521, 48;
or.b32 %r531, %r530, %r529;
add.s32 %r532, %r531, -32;
cvt.rn.f32.s32 %f343, %r519;
mul.ftz.f32 %f344, %f272, %f343;
ld.global.nc.f32 %f345, [%rd14+80];
fma.rn.ftz.f32 %f346, %f345, %f344, %f342;
cvt.rn.f32.s32 %f347, %r524;
mul.ftz.f32 %f348, %f274, %f347;
ld.global.nc.f32 %f349, [%rd14+208];
fma.rn.ftz.f32 %f350, %f349, %f348, %f346;
cvt.rn.f32.s32 %f351, %r528;
mul.ftz.f32 %f352, %f276, %f351;
ld.global.nc.f32 %f353, [%rd14+336];
fma.rn.ftz.f32 %f354, %f352, %f353, %f350;
cvt.rn.f32.s32 %f355, %r532;
mul.ftz.f32 %f356, %f278, %f355;
ld.global.nc.f32 %f357, [%rd14+464];
fma.rn.ftz.f32 %f358, %f356, %f357, %f354;
ld.global.nc.u8 %rs87, [%rd3+21];
cvt.u32.u16 %r533, %rs87;
and.b32 %r534, %r533, 240;
ld.global.nc.u8 %rs88, [%rd3+53];
cvt.u32.u16 %r535, %rs88;
and.b32 %r536, %r535, 240;
ld.global.nc.u8 %rs89, [%rd3+149];
cvt.u32.u16 %r537, %rs89;
and.b32 %r538, %r537, 252;
and.b32 %r539, %r533, 15;
and.b32 %r540, %r537, 3;
bfi.b32 %r541, %r540, %r539, 4, 2;
add.s32 %r542, %r541, -32;
and.b32 %r543, %r535, 15;
shr.u32 %r544, %r538, 2;
and.b32 %r545, %r544, 3;
bfi.b32 %r546, %r545, %r543, 4, 2;
add.s32 %r547, %r546, -32;
shr.u32 %r548, %r534, 4;
and.b32 %r549, %r537, 48;
or.b32 %r550, %r549, %r548;
add.s32 %r551, %r550, -32;
shr.u32 %r552, %r536, 4;
and.b32 %r553, %r544, 48;
or.b32 %r554, %r553, %r552;
add.s32 %r555, %r554, -32;
cvt.rn.f32.s32 %f359, %r542;
mul.ftz.f32 %f360, %f272, %f359;
ld.global.nc.f32 %f361, [%rd14+84];
fma.rn.ftz.f32 %f362, %f361, %f360, %f358;
cvt.rn.f32.s32 %f363, %r547;
mul.ftz.f32 %f364, %f274, %f363;
ld.global.nc.f32 %f365, [%rd14+212];
fma.rn.ftz.f32 %f366, %f365, %f364, %f362;
cvt.rn.f32.s32 %f367, %r551;
mul.ftz.f32 %f368, %f276, %f367;
ld.global.nc.f32 %f369, [%rd14+340];
fma.rn.ftz.f32 %f370, %f368, %f369, %f366;
cvt.rn.f32.s32 %f371, %r555;
mul.ftz.f32 %f372, %f278, %f371;
ld.global.nc.f32 %f373, [%rd14+468];
fma.rn.ftz.f32 %f374, %f372, %f373, %f370;
ld.global.nc.u8 %rs90, [%rd3+22];
cvt.u32.u16 %r556, %rs90;
and.b32 %r557, %r556, 240;
ld.global.nc.u8 %rs91, [%rd3+54];
cvt.u32.u16 %r558, %rs91;
and.b32 %r559, %r558, 240;
ld.global.nc.u8 %rs92, [%rd3+150];
cvt.u32.u16 %r560, %rs92;
and.b32 %r561, %r560, 252;
and.b32 %r562, %r556, 15;
and.b32 %r563, %r560, 3;
bfi.b32 %r564, %r563, %r562, 4, 2;
add.s32 %r565, %r564, -32;
and.b32 %r566, %r558, 15;
shr.u32 %r567, %r561, 2;
and.b32 %r568, %r567, 3;
bfi.b32 %r569, %r568, %r566, 4, 2;
add.s32 %r570, %r569, -32;
shr.u32 %r571, %r557, 4;
and.b32 %r572, %r560, 48;
or.b32 %r573, %r572, %r571;
add.s32 %r574, %r573, -32;
shr.u32 %r575, %r559, 4;
and.b32 %r576, %r567, 48;
or.b32 %r577, %r576, %r575;
add.s32 %r578, %r577, -32;
cvt.rn.f32.s32 %f375, %r565;
mul.ftz.f32 %f376, %f272, %f375;
ld.global.nc.f32 %f377, [%rd14+88];
fma.rn.ftz.f32 %f378, %f377, %f376, %f374;
cvt.rn.f32.s32 %f379, %r570;
mul.ftz.f32 %f380, %f274, %f379;
ld.global.nc.f32 %f381, [%rd14+216];
fma.rn.ftz.f32 %f382, %f381, %f380, %f378;
cvt.rn.f32.s32 %f383, %r574;
mul.ftz.f32 %f384, %f276, %f383;
ld.global.nc.f32 %f385, [%rd14+344];
fma.rn.ftz.f32 %f386, %f384, %f385, %f382;
cvt.rn.f32.s32 %f387, %r578;
mul.ftz.f32 %f388, %f278, %f387;
ld.global.nc.f32 %f389, [%rd14+472];
fma.rn.ftz.f32 %f390, %f388, %f389, %f386;
ld.global.nc.u8 %rs93, [%rd3+23];
cvt.u32.u16 %r579, %rs93;
and.b32 %r580, %r579, 240;
ld.global.nc.u8 %rs94, [%rd3+55];
cvt.u32.u16 %r581, %rs94;
and.b32 %r582, %r581, 240;
ld.global.nc.u8 %rs95, [%rd3+151];
cvt.u32.u16 %r583, %rs95;
and.b32 %r584, %r583, 252;
and.b32 %r585, %r579, 15;
and.b32 %r586, %r583, 3;
bfi.b32 %r587, %r586, %r585, 4, 2;
add.s32 %r588, %r587, -32;
and.b32 %r589, %r581, 15;
shr.u32 %r590, %r584, 2;
and.b32 %r591, %r590, 3;
bfi.b32 %r592, %r591, %r589, 4, 2;
add.s32 %r593, %r592, -32;
shr.u32 %r594, %r580, 4;
and.b32 %r595, %r583, 48;
or.b32 %r596, %r595, %r594;
add.s32 %r597, %r596, -32;
shr.u32 %r598, %r582, 4;
and.b32 %r599, %r590, 48;
or.b32 %r600, %r599, %r598;
add.s32 %r601, %r600, -32;
cvt.rn.f32.s32 %f391, %r588;
mul.ftz.f32 %f392, %f272, %f391;
ld.global.nc.f32 %f393, [%rd14+92];
fma.rn.ftz.f32 %f394, %f393, %f392, %f390;
cvt.rn.f32.s32 %f395, %r593;
mul.ftz.f32 %f396, %f274, %f395;
ld.global.nc.f32 %f397, [%rd14+220];
fma.rn.ftz.f32 %f398, %f397, %f396, %f394;
cvt.rn.f32.s32 %f399, %r597;
mul.ftz.f32 %f400, %f276, %f399;
ld.global.nc.f32 %f401, [%rd14+348];
fma.rn.ftz.f32 %f402, %f400, %f401, %f398;
cvt.rn.f32.s32 %f403, %r601;
mul.ftz.f32 %f404, %f278, %f403;
ld.global.nc.f32 %f405, [%rd14+476];
fma.rn.ftz.f32 %f406, %f404, %f405, %f402;
ld.global.nc.u8 %rs96, [%rd3+24];
cvt.u32.u16 %r602, %rs96;
and.b32 %r603, %r602, 240;
ld.global.nc.u8 %rs97, [%rd3+56];
cvt.u32.u16 %r604, %rs97;
and.b32 %r605, %r604, 240;
ld.global.nc.u8 %rs98, [%rd3+152];
cvt.u32.u16 %r606, %rs98;
and.b32 %r607, %r606, 252;
and.b32 %r608, %r602, 15;
and.b32 %r609, %r606, 3;
bfi.b32 %r610, %r609, %r608, 4, 2;
add.s32 %r611, %r610, -32;
and.b32 %r612, %r604, 15;
shr.u32 %r613, %r607, 2;
and.b32 %r614, %r613, 3;
bfi.b32 %r615, %r614, %r612, 4, 2;
add.s32 %r616, %r615, -32;
shr.u32 %r617, %r603, 4;
and.b32 %r618, %r606, 48;
or.b32 %r619, %r618, %r617;
add.s32 %r620, %r619, -32;
shr.u32 %r621, %r605, 4;
and.b32 %r622, %r613, 48;
or.b32 %r623, %r622, %r621;
add.s32 %r624, %r623, -32;
cvt.rn.f32.s32 %f407, %r611;
mul.ftz.f32 %f408, %f272, %f407;
ld.global.nc.f32 %f409, [%rd14+96];
fma.rn.ftz.f32 %f410, %f409, %f408, %f406;
cvt.rn.f32.s32 %f411, %r616;
mul.ftz.f32 %f412, %f274, %f411;
ld.global.nc.f32 %f413, [%rd14+224];
fma.rn.ftz.f32 %f414, %f413, %f412, %f410;
cvt.rn.f32.s32 %f415, %r620;
mul.ftz.f32 %f416, %f276, %f415;
ld.global.nc.f32 %f417, [%rd14+352];
fma.rn.ftz.f32 %f418, %f416, %f417, %f414;
cvt.rn.f32.s32 %f419, %r624;
mul.ftz.f32 %f420, %f278, %f419;
ld.global.nc.f32 %f421, [%rd14+480];
fma.rn.ftz.f32 %f422, %f420, %f421, %f418;
ld.global.nc.u8 %rs99, [%rd3+25];
cvt.u32.u16 %r625, %rs99;
and.b32 %r626, %r625, 240;
ld.global.nc.u8 %rs100, [%rd3+57];
cvt.u32.u16 %r627, %rs100;
and.b32 %r628, %r627, 240;
ld.global.nc.u8 %rs101, [%rd3+153];
cvt.u32.u16 %r629, %rs101;
and.b32 %r630, %r629, 252;
and.b32 %r631, %r625, 15;
and.b32 %r632, %r629, 3;
bfi.b32 %r633, %r632, %r631, 4, 2;
add.s32 %r634, %r633, -32;
and.b32 %r635, %r627, 15;
shr.u32 %r636, %r630, 2;
and.b32 %r637, %r636, 3;
bfi.b32 %r638, %r637, %r635, 4, 2;
add.s32 %r639, %r638, -32;
shr.u32 %r640, %r626, 4;
and.b32 %r641, %r629, 48;
or.b32 %r642, %r641, %r640;
add.s32 %r643, %r642, -32;
shr.u32 %r644, %r628, 4;
and.b32 %r645, %r636, 48;
or.b32 %r646, %r645, %r644;
add.s32 %r647, %r646, -32;
cvt.rn.f32.s32 %f423, %r634;
mul.ftz.f32 %f424, %f272, %f423;
ld.global.nc.f32 %f425, [%rd14+100];
fma.rn.ftz.f32 %f426, %f425, %f424, %f422;
cvt.rn.f32.s32 %f427, %r639;
mul.ftz.f32 %f428, %f274, %f427;
ld.global.nc.f32 %f429, [%rd14+228];
fma.rn.ftz.f32 %f430, %f429, %f428, %f426;
cvt.rn.f32.s32 %f431, %r643;
mul.ftz.f32 %f432, %f276, %f431;
ld.global.nc.f32 %f433, [%rd14+356];
fma.rn.ftz.f32 %f434, %f432, %f433, %f430;
cvt.rn.f32.s32 %f435, %r647;
mul.ftz.f32 %f436, %f278, %f435;
ld.global.nc.f32 %f437, [%rd14+484];
fma.rn.ftz.f32 %f438, %f436, %f437, %f434;
ld.global.nc.u8 %rs102, [%rd3+26];
cvt.u32.u16 %r648, %rs102;
and.b32 %r649, %r648, 240;
ld.global.nc.u8 %rs103, [%rd3+58];
cvt.u32.u16 %r650, %rs103;
and.b32 %r651, %r650, 240;
ld.global.nc.u8 %rs104, [%rd3+154];
cvt.u32.u16 %r652, %rs104;
and.b32 %r653, %r652, 252;
and.b32 %r654, %r648, 15;
and.b32 %r655, %r652, 3;
bfi.b32 %r656, %r655, %r654, 4, 2;
add.s32 %r657, %r656, -32;
and.b32 %r658, %r650, 15;
shr.u32 %r659, %r653, 2;
and.b32 %r660, %r659, 3;
bfi.b32 %r661, %r660, %r658, 4, 2;
add.s32 %r662, %r661, -32;
shr.u32 %r663, %r649, 4;
and.b32 %r664, %r652, 48;
or.b32 %r665, %r664, %r663;
add.s32 %r666, %r665, -32;
shr.u32 %r667, %r651, 4;
and.b32 %r668, %r659, 48;
or.b32 %r669, %r668, %r667;
add.s32 %r670, %r669, -32;
cvt.rn.f32.s32 %f439, %r657;
mul.ftz.f32 %f440, %f272, %f439;
ld.global.nc.f32 %f441, [%rd14+104];
fma.rn.ftz.f32 %f442, %f441, %f440, %f438;
cvt.rn.f32.s32 %f443, %r662;
mul.ftz.f32 %f444, %f274, %f443;
ld.global.nc.f32 %f445, [%rd14+232];
fma.rn.ftz.f32 %f446, %f445, %f444, %f442;
cvt.rn.f32.s32 %f447, %r666;
mul.ftz.f32 %f448, %f276, %f447;
ld.global.nc.f32 %f449, [%rd14+360];
fma.rn.ftz.f32 %f450, %f448, %f449, %f446;
cvt.rn.f32.s32 %f451, %r670;
mul.ftz.f32 %f452, %f278, %f451;
ld.global.nc.f32 %f453, [%rd14+488];
fma.rn.ftz.f32 %f454, %f452, %f453, %f450;
ld.global.nc.u8 %rs105, [%rd3+27];
cvt.u32.u16 %r671, %rs105;
and.b32 %r672, %r671, 240;
ld.global.nc.u8 %rs106, [%rd3+59];
cvt.u32.u16 %r673, %rs106;
and.b32 %r674, %r673, 240;
ld.global.nc.u8 %rs107, [%rd3+155];
cvt.u32.u16 %r675, %rs107;
and.b32 %r676, %r675, 252;
and.b32 %r677, %r671, 15;
and.b32 %r678, %r675, 3;
bfi.b32 %r679, %r678, %r677, 4, 2;
add.s32 %r680, %r679, -32;
and.b32 %r681, %r673, 15;
shr.u32 %r682, %r676, 2;
and.b32 %r683, %r682, 3;
bfi.b32 %r684, %r683, %r681, 4, 2;
add.s32 %r685, %r684, -32;
shr.u32 %r686, %r672, 4;
and.b32 %r687, %r675, 48;
or.b32 %r688, %r687, %r686;
add.s32 %r689, %r688, -32;
shr.u32 %r690, %r674, 4;
and.b32 %r691, %r682, 48;
or.b32 %r692, %r691, %r690;
add.s32 %r693, %r692, -32;
cvt.rn.f32.s32 %f455, %r680;
mul.ftz.f32 %f456, %f272, %f455;
ld.global.nc.f32 %f457, [%rd14+108];
fma.rn.ftz.f32 %f458, %f457, %f456, %f454;
cvt.rn.f32.s32 %f459, %r685;
mul.ftz.f32 %f460, %f274, %f459;
ld.global.nc.f32 %f461, [%rd14+236];
fma.rn.ftz.f32 %f462, %f461, %f460, %f458;
cvt.rn.f32.s32 %f463, %r689;
mul.ftz.f32 %f464, %f276, %f463;
ld.global.nc.f32 %f465, [%rd14+364];
fma.rn.ftz.f32 %f466, %f464, %f465, %f462;
cvt.rn.f32.s32 %f467, %r693;
mul.ftz.f32 %f468, %f278, %f467;
ld.global.nc.f32 %f469, [%rd14+492];
fma.rn.ftz.f32 %f470, %f468, %f469, %f466;
ld.global.nc.u8 %rs108, [%rd3+28];
cvt.u32.u16 %r694, %rs108;
and.b32 %r695, %r694, 240;
ld.global.nc.u8 %rs109, [%rd3+60];
cvt.u32.u16 %r696, %rs109;
and.b32 %r697, %r696, 240;
ld.global.nc.u8 %rs110, [%rd3+156];
cvt.u32.u16 %r698, %rs110;
and.b32 %r699, %r698, 252;
and.b32 %r700, %r694, 15;
and.b32 %r701, %r698, 3;
bfi.b32 %r702, %r701, %r700, 4, 2;
add.s32 %r703, %r702, -32;
and.b32 %r704, %r696, 15;
shr.u32 %r705, %r699, 2;
and.b32 %r706, %r705, 3;
bfi.b32 %r707, %r706, %r704, 4, 2;
add.s32 %r708, %r707, -32;
shr.u32 %r709, %r695, 4;
and.b32 %r710, %r698, 48;
or.b32 %r711, %r710, %r709;
add.s32 %r712, %r711, -32;
shr.u32 %r713, %r697, 4;
and.b32 %r714, %r705, 48;
or.b32 %r715, %r714, %r713;
add.s32 %r716, %r715, -32;
cvt.rn.f32.s32 %f471, %r703;
mul.ftz.f32 %f472, %f272, %f471;
ld.global.nc.f32 %f473, [%rd14+112];
fma.rn.ftz.f32 %f474, %f473, %f472, %f470;
cvt.rn.f32.s32 %f475, %r708;
mul.ftz.f32 %f476, %f274, %f475;
ld.global.nc.f32 %f477, [%rd14+240];
fma.rn.ftz.f32 %f478, %f477, %f476, %f474;
cvt.rn.f32.s32 %f479, %r712;
mul.ftz.f32 %f480, %f276, %f479;
ld.global.nc.f32 %f481, [%rd14+368];
fma.rn.ftz.f32 %f482, %f480, %f481, %f478;
cvt.rn.f32.s32 %f483, %r716;
mul.ftz.f32 %f484, %f278, %f483;
ld.global.nc.f32 %f485, [%rd14+496];
fma.rn.ftz.f32 %f486, %f484, %f485, %f482;
ld.global.nc.u8 %rs111, [%rd3+29];
cvt.u32.u16 %r717, %rs111;
and.b32 %r718, %r717, 240;
ld.global.nc.u8 %rs112, [%rd3+61];
cvt.u32.u16 %r719, %rs112;
and.b32 %r720, %r719, 240;
ld.global.nc.u8 %rs113, [%rd3+157];
cvt.u32.u16 %r721, %rs113;
and.b32 %r722, %r721, 252;
and.b32 %r723, %r717, 15;
and.b32 %r724, %r721, 3;
bfi.b32 %r725, %r724, %r723, 4, 2;
add.s32 %r726, %r725, -32;
and.b32 %r727, %r719, 15;
shr.u32 %r728, %r722, 2;
and.b32 %r729, %r728, 3;
bfi.b32 %r730, %r729, %r727, 4, 2;
add.s32 %r731, %r730, -32;
shr.u32 %r732, %r718, 4;
and.b32 %r733, %r721, 48;
or.b32 %r734, %r733, %r732;
add.s32 %r735, %r734, -32;
shr.u32 %r736, %r720, 4;
and.b32 %r737, %r728, 48;
or.b32 %r738, %r737, %r736;
add.s32 %r739, %r738, -32;
cvt.rn.f32.s32 %f487, %r726;
mul.ftz.f32 %f488, %f272, %f487;
ld.global.nc.f32 %f489, [%rd14+116];
fma.rn.ftz.f32 %f490, %f489, %f488, %f486;
cvt.rn.f32.s32 %f491, %r731;
mul.ftz.f32 %f492, %f274, %f491;
ld.global.nc.f32 %f493, [%rd14+244];
fma.rn.ftz.f32 %f494, %f493, %f492, %f490;
cvt.rn.f32.s32 %f495, %r735;
mul.ftz.f32 %f496, %f276, %f495;
ld.global.nc.f32 %f497, [%rd14+372];
fma.rn.ftz.f32 %f498, %f496, %f497, %f494;
cvt.rn.f32.s32 %f499, %r739;
mul.ftz.f32 %f500, %f278, %f499;
ld.global.nc.f32 %f501, [%rd14+500];
fma.rn.ftz.f32 %f502, %f500, %f501, %f498;
ld.global.nc.u8 %rs114, [%rd3+30];
cvt.u32.u16 %r740, %rs114;
and.b32 %r741, %r740, 240;
ld.global.nc.u8 %rs115, [%rd3+62];
cvt.u32.u16 %r742, %rs115;
and.b32 %r743, %r742, 240;
ld.global.nc.u8 %rs116, [%rd3+158];
cvt.u32.u16 %r744, %rs116;
and.b32 %r745, %r744, 252;
and.b32 %r746, %r740, 15;
and.b32 %r747, %r744, 3;
bfi.b32 %r748, %r747, %r746, 4, 2;
add.s32 %r749, %r748, -32;
and.b32 %r750, %r742, 15;
shr.u32 %r751, %r745, 2;
and.b32 %r752, %r751, 3;
bfi.b32 %r753, %r752, %r750, 4, 2;
add.s32 %r754, %r753, -32;
shr.u32 %r755, %r741, 4;
and.b32 %r756, %r744, 48;
or.b32 %r757, %r756, %r755;
add.s32 %r758, %r757, -32;
shr.u32 %r759, %r743, 4;
and.b32 %r760, %r751, 48;
or.b32 %r761, %r760, %r759;
add.s32 %r762, %r761, -32;
cvt.rn.f32.s32 %f503, %r749;
mul.ftz.f32 %f504, %f272, %f503;
ld.global.nc.f32 %f505, [%rd14+120];
fma.rn.ftz.f32 %f506, %f505, %f504, %f502;
cvt.rn.f32.s32 %f507, %r754;
mul.ftz.f32 %f508, %f274, %f507;
ld.global.nc.f32 %f509, [%rd14+248];
fma.rn.ftz.f32 %f510, %f509, %f508, %f506;
cvt.rn.f32.s32 %f511, %r758;
mul.ftz.f32 %f512, %f276, %f511;
ld.global.nc.f32 %f513, [%rd14+376];
fma.rn.ftz.f32 %f514, %f512, %f513, %f510;
cvt.rn.f32.s32 %f515, %r762;
mul.ftz.f32 %f516, %f278, %f515;
ld.global.nc.f32 %f517, [%rd14+504];
fma.rn.ftz.f32 %f518, %f516, %f517, %f514;
ld.global.nc.u8 %rs117, [%rd3+31];
cvt.u32.u16 %r763, %rs117;
and.b32 %r764, %r763, 240;
ld.global.nc.u8 %rs118, [%rd3+63];
cvt.u32.u16 %r765, %rs118;
and.b32 %r766, %r765, 240;
ld.global.nc.u8 %rs119, [%rd3+159];
cvt.u32.u16 %r767, %rs119;
and.b32 %r768, %r767, 252;
and.b32 %r769, %r763, 15;
and.b32 %r770, %r767, 3;
bfi.b32 %r771, %r770, %r769, 4, 2;
add.s32 %r772, %r771, -32;
and.b32 %r773, %r765, 15;
shr.u32 %r774, %r768, 2;
and.b32 %r775, %r774, 3;
bfi.b32 %r776, %r775, %r773, 4, 2;
add.s32 %r777, %r776, -32;
shr.u32 %r778, %r764, 4;
and.b32 %r779, %r767, 48;
or.b32 %r780, %r779, %r778;
add.s32 %r781, %r780, -32;
shr.u32 %r782, %r766, 4;
and.b32 %r783, %r774, 48;
or.b32 %r784, %r783, %r782;
add.s32 %r785, %r784, -32;
cvt.rn.f32.s32 %f519, %r772;
mul.ftz.f32 %f520, %f272, %f519;
ld.global.nc.f32 %f521, [%rd14+124];
fma.rn.ftz.f32 %f522, %f521, %f520, %f518;
cvt.rn.f32.s32 %f523, %r777;
mul.ftz.f32 %f524, %f274, %f523;
ld.global.nc.f32 %f525, [%rd14+252];
fma.rn.ftz.f32 %f526, %f525, %f524, %f522;
cvt.rn.f32.s32 %f527, %r781;
mul.ftz.f32 %f528, %f276, %f527;
ld.global.nc.f32 %f529, [%rd14+380];
fma.rn.ftz.f32 %f530, %f528, %f529, %f526;
cvt.rn.f32.s32 %f531, %r785;
mul.ftz.f32 %f532, %f278, %f531;
ld.global.nc.f32 %f533, [%rd14+508];
fma.rn.ftz.f32 %f534, %f532, %f533, %f530;
ld.global.nc.u8 %rs120, [%rd3+64];
cvt.u32.u16 %r786, %rs120;
and.b32 %r787, %r786, 240;
ld.global.nc.u8 %rs121, [%rd3+96];
cvt.u32.u16 %r788, %rs121;
and.b32 %r789, %r788, 240;
ld.global.nc.u8 %rs122, [%rd3+160];
cvt.u32.u16 %r790, %rs122;
and.b32 %r791, %r790, 252;
and.b32 %r792, %r786, 15;
and.b32 %r793, %r790, 3;
bfi.b32 %r794, %r793, %r792, 4, 2;
add.s32 %r795, %r794, -32;
and.b32 %r796, %r788, 15;
shr.u32 %r797, %r791, 2;
and.b32 %r798, %r797, 3;
bfi.b32 %r799, %r798, %r796, 4, 2;
add.s32 %r800, %r799, -32;
shr.u32 %r801, %r787, 4;
and.b32 %r802, %r790, 48;
or.b32 %r803, %r802, %r801;
add.s32 %r804, %r803, -32;
shr.u32 %r805, %r789, 4;
and.b32 %r806, %r797, 48;
or.b32 %r807, %r806, %r805;
add.s32 %r808, %r807, -32;
ld.global.nc.u8 %rs123, [%rd3+200];
cvt.s16.s8 %rs124, %rs123;
cvt.rn.f32.s16 %f535, %rs124;
mul.ftz.f32 %f536, %f7, %f535;
ld.global.nc.u8 %rs125, [%rd3+202];
cvt.s16.s8 %rs126, %rs125;
cvt.rn.f32.s16 %f537, %rs126;
mul.ftz.f32 %f538, %f7, %f537;
ld.global.nc.u8 %rs127, [%rd3+204];
cvt.s16.s8 %rs128, %rs127;
cvt.rn.f32.s16 %f539, %rs128;
mul.ftz.f32 %f540, %f7, %f539;
ld.global.nc.u8 %rs129, [%rd3+206];
cvt.s16.s8 %rs130, %rs129;
cvt.rn.f32.s16 %f541, %rs130;
mul.ftz.f32 %f542, %f7, %f541;
cvt.rn.f32.s32 %f543, %r795;
mul.ftz.f32 %f544, %f536, %f543;
ld.global.nc.f32 %f545, [%rd14+512];
fma.rn.ftz.f32 %f546, %f545, %f544, %f534;
cvt.rn.f32.s32 %f547, %r800;
mul.ftz.f32 %f548, %f538, %f547;
ld.global.nc.f32 %f549, [%rd14+640];
fma.rn.ftz.f32 %f550, %f549, %f548, %f546;
cvt.rn.f32.s32 %f551, %r804;
mul.ftz.f32 %f552, %f540, %f551;
ld.global.nc.f32 %f553, [%rd14+768];
fma.rn.ftz.f32 %f554, %f552, %f553, %f550;
cvt.rn.f32.s32 %f555, %r808;
mul.ftz.f32 %f556, %f542, %f555;
ld.global.nc.f32 %f557, [%rd14+896];
fma.rn.ftz.f32 %f558, %f556, %f557, %f554;
ld.global.nc.u8 %rs131, [%rd3+65];
cvt.u32.u16 %r809, %rs131;
and.b32 %r810, %r809, 240;
ld.global.nc.u8 %rs132, [%rd3+97];
cvt.u32.u16 %r811, %rs132;
and.b32 %r812, %r811, 240;
ld.global.nc.u8 %rs133, [%rd3+161];
cvt.u32.u16 %r813, %rs133;
and.b32 %r814, %r813, 252;
and.b32 %r815, %r809, 15;
and.b32 %r816, %r813, 3;
bfi.b32 %r817, %r816, %r815, 4, 2;
add.s32 %r818, %r817, -32;
and.b32 %r819, %r811, 15;
shr.u32 %r820, %r814, 2;
and.b32 %r821, %r820, 3;
bfi.b32 %r822, %r821, %r819, 4, 2;
add.s32 %r823, %r822, -32;
shr.u32 %r824, %r810, 4;
and.b32 %r825, %r813, 48;
or.b32 %r826, %r825, %r824;
add.s32 %r827, %r826, -32;
shr.u32 %r828, %r812, 4;
and.b32 %r829, %r820, 48;
or.b32 %r830, %r829, %r828;
add.s32 %r831, %r830, -32;
cvt.rn.f32.s32 %f559, %r818;
mul.ftz.f32 %f560, %f536, %f559;
ld.global.nc.f32 %f561, [%rd14+516];
fma.rn.ftz.f32 %f562, %f561, %f560, %f558;
cvt.rn.f32.s32 %f563, %r823;
mul.ftz.f32 %f564, %f538, %f563;
ld.global.nc.f32 %f565, [%rd14+644];
fma.rn.ftz.f32 %f566, %f565, %f564, %f562;
cvt.rn.f32.s32 %f567, %r827;
mul.ftz.f32 %f568, %f540, %f567;
ld.global.nc.f32 %f569, [%rd14+772];
fma.rn.ftz.f32 %f570, %f568, %f569, %f566;
cvt.rn.f32.s32 %f571, %r831;
mul.ftz.f32 %f572, %f542, %f571;
ld.global.nc.f32 %f573, [%rd14+900];
fma.rn.ftz.f32 %f574, %f572, %f573, %f570;
ld.global.nc.u8 %rs134, [%rd3+66];
cvt.u32.u16 %r832, %rs134;
and.b32 %r833, %r832, 240;
ld.global.nc.u8 %rs135, [%rd3+98];
cvt.u32.u16 %r834, %rs135;
and.b32 %r835, %r834, 240;
ld.global.nc.u8 %rs136, [%rd3+162];
cvt.u32.u16 %r836, %rs136;
and.b32 %r837, %r836, 252;
and.b32 %r838, %r832, 15;
and.b32 %r839, %r836, 3;
bfi.b32 %r840, %r839, %r838, 4, 2;
add.s32 %r841, %r840, -32;
and.b32 %r842, %r834, 15;
shr.u32 %r843, %r837, 2;
and.b32 %r844, %r843, 3;
bfi.b32 %r845, %r844, %r842, 4, 2;
add.s32 %r846, %r845, -32;
shr.u32 %r847, %r833, 4;
and.b32 %r848, %r836, 48;
or.b32 %r849, %r848, %r847;
add.s32 %r850, %r849, -32;
shr.u32 %r851, %r835, 4;
and.b32 %r852, %r843, 48;
or.b32 %r853, %r852, %r851;
add.s32 %r854, %r853, -32;
cvt.rn.f32.s32 %f575, %r841;
mul.ftz.f32 %f576, %f536, %f575;
ld.global.nc.f32 %f577, [%rd14+520];
fma.rn.ftz.f32 %f578, %f577, %f576, %f574;
cvt.rn.f32.s32 %f579, %r846;
mul.ftz.f32 %f580, %f538, %f579;
ld.global.nc.f32 %f581, [%rd14+648];
fma.rn.ftz.f32 %f582, %f581, %f580, %f578;
cvt.rn.f32.s32 %f583, %r850;
mul.ftz.f32 %f584, %f540, %f583;
ld.global.nc.f32 %f585, [%rd14+776];
fma.rn.ftz.f32 %f586, %f584, %f585, %f582;
cvt.rn.f32.s32 %f587, %r854;
mul.ftz.f32 %f588, %f542, %f587;
ld.global.nc.f32 %f589, [%rd14+904];
fma.rn.ftz.f32 %f590, %f588, %f589, %f586;
ld.global.nc.u8 %rs137, [%rd3+67];
cvt.u32.u16 %r855, %rs137;
and.b32 %r856, %r855, 240;
ld.global.nc.u8 %rs138, [%rd3+99];
cvt.u32.u16 %r857, %rs138;
and.b32 %r858, %r857, 240;
ld.global.nc.u8 %rs139, [%rd3+163];
cvt.u32.u16 %r859, %rs139;
and.b32 %r860, %r859, 252;
and.b32 %r861, %r855, 15;
and.b32 %r862, %r859, 3;
bfi.b32 %r863, %r862, %r861, 4, 2;
add.s32 %r864, %r863, -32;
and.b32 %r865, %r857, 15;
shr.u32 %r866, %r860, 2;
and.b32 %r867, %r866, 3;
bfi.b32 %r868, %r867, %r865, 4, 2;
add.s32 %r869, %r868, -32;
shr.u32 %r870, %r856, 4;
and.b32 %r871, %r859, 48;
or.b32 %r872, %r871, %r870;
add.s32 %r873, %r872, -32;
shr.u32 %r874, %r858, 4;
and.b32 %r875, %r866, 48;
or.b32 %r876, %r875, %r874;
add.s32 %r877, %r876, -32;
cvt.rn.f32.s32 %f591, %r864;
mul.ftz.f32 %f592, %f536, %f591;
ld.global.nc.f32 %f593, [%rd14+524];
fma.rn.ftz.f32 %f594, %f593, %f592, %f590;
cvt.rn.f32.s32 %f595, %r869;
mul.ftz.f32 %f596, %f538, %f595;
ld.global.nc.f32 %f597, [%rd14+652];
fma.rn.ftz.f32 %f598, %f597, %f596, %f594;
cvt.rn.f32.s32 %f599, %r873;
mul.ftz.f32 %f600, %f540, %f599;
ld.global.nc.f32 %f601, [%rd14+780];
fma.rn.ftz.f32 %f602, %f600, %f601, %f598;
cvt.rn.f32.s32 %f603, %r877;
mul.ftz.f32 %f604, %f542, %f603;
ld.global.nc.f32 %f605, [%rd14+908];
fma.rn.ftz.f32 %f606, %f604, %f605, %f602;
ld.global.nc.u8 %rs140, [%rd3+68];
cvt.u32.u16 %r878, %rs140;
and.b32 %r879, %r878, 240;
ld.global.nc.u8 %rs141, [%rd3+100];
cvt.u32.u16 %r880, %rs141;
and.b32 %r881, %r880, 240;
ld.global.nc.u8 %rs142, [%rd3+164];
cvt.u32.u16 %r882, %rs142;
and.b32 %r883, %r882, 252;
and.b32 %r884, %r878, 15;
and.b32 %r885, %r882, 3;
bfi.b32 %r886, %r885, %r884, 4, 2;
add.s32 %r887, %r886, -32;
and.b32 %r888, %r880, 15;
shr.u32 %r889, %r883, 2;
and.b32 %r890, %r889, 3;
bfi.b32 %r891, %r890, %r888, 4, 2;
add.s32 %r892, %r891, -32;
shr.u32 %r893, %r879, 4;
and.b32 %r894, %r882, 48;
or.b32 %r895, %r894, %r893;
add.s32 %r896, %r895, -32;
shr.u32 %r897, %r881, 4;
and.b32 %r898, %r889, 48;
or.b32 %r899, %r898, %r897;
add.s32 %r900, %r899, -32;
cvt.rn.f32.s32 %f607, %r887;
mul.ftz.f32 %f608, %f536, %f607;
ld.global.nc.f32 %f609, [%rd14+528];
fma.rn.ftz.f32 %f610, %f609, %f608, %f606;
cvt.rn.f32.s32 %f611, %r892;
mul.ftz.f32 %f612, %f538, %f611;
ld.global.nc.f32 %f613, [%rd14+656];
fma.rn.ftz.f32 %f614, %f613, %f612, %f610;
cvt.rn.f32.s32 %f615, %r896;
mul.ftz.f32 %f616, %f540, %f615;
ld.global.nc.f32 %f617, [%rd14+784];
fma.rn.ftz.f32 %f618, %f616, %f617, %f614;
cvt.rn.f32.s32 %f619, %r900;
mul.ftz.f32 %f620, %f542, %f619;
ld.global.nc.f32 %f621, [%rd14+912];
fma.rn.ftz.f32 %f622, %f620, %f621, %f618;
ld.global.nc.u8 %rs143, [%rd3+69];
cvt.u32.u16 %r901, %rs143;
and.b32 %r902, %r901, 240;
ld.global.nc.u8 %rs144, [%rd3+101];
cvt.u32.u16 %r903, %rs144;
and.b32 %r904, %r903, 240;
ld.global.nc.u8 %rs145, [%rd3+165];
cvt.u32.u16 %r905, %rs145;
and.b32 %r906, %r905, 252;
and.b32 %r907, %r901, 15;
and.b32 %r908, %r905, 3;
bfi.b32 %r909, %r908, %r907, 4, 2;
add.s32 %r910, %r909, -32;
and.b32 %r911, %r903, 15;
shr.u32 %r912, %r906, 2;
and.b32 %r913, %r912, 3;
bfi.b32 %r914, %r913, %r911, 4, 2;
add.s32 %r915, %r914, -32;
shr.u32 %r916, %r902, 4;
and.b32 %r917, %r905, 48;
or.b32 %r918, %r917, %r916;
add.s32 %r919, %r918, -32;
shr.u32 %r920, %r904, 4;
and.b32 %r921, %r912, 48;
or.b32 %r922, %r921, %r920;
add.s32 %r923, %r922, -32;
cvt.rn.f32.s32 %f623, %r910;
mul.ftz.f32 %f624, %f536, %f623;
ld.global.nc.f32 %f625, [%rd14+532];
fma.rn.ftz.f32 %f626, %f625, %f624, %f622;
cvt.rn.f32.s32 %f627, %r915;
mul.ftz.f32 %f628, %f538, %f627;
ld.global.nc.f32 %f629, [%rd14+660];
fma.rn.ftz.f32 %f630, %f629, %f628, %f626;
cvt.rn.f32.s32 %f631, %r919;
mul.ftz.f32 %f632, %f540, %f631;
ld.global.nc.f32 %f633, [%rd14+788];
fma.rn.ftz.f32 %f634, %f632, %f633, %f630;
cvt.rn.f32.s32 %f635, %r923;
mul.ftz.f32 %f636, %f542, %f635;
ld.global.nc.f32 %f637, [%rd14+916];
fma.rn.ftz.f32 %f638, %f636, %f637, %f634;
ld.global.nc.u8 %rs146, [%rd3+70];
cvt.u32.u16 %r924, %rs146;
and.b32 %r925, %r924, 240;
ld.global.nc.u8 %rs147, [%rd3+102];
cvt.u32.u16 %r926, %rs147;
and.b32 %r927, %r926, 240;
ld.global.nc.u8 %rs148, [%rd3+166];
cvt.u32.u16 %r928, %rs148;
and.b32 %r929, %r928, 252;
and.b32 %r930, %r924, 15;
and.b32 %r931, %r928, 3;
bfi.b32 %r932, %r931, %r930, 4, 2;
add.s32 %r933, %r932, -32;
and.b32 %r934, %r926, 15;
shr.u32 %r935, %r929, 2;
and.b32 %r936, %r935, 3;
bfi.b32 %r937, %r936, %r934, 4, 2;
add.s32 %r938, %r937, -32;
shr.u32 %r939, %r925, 4;
and.b32 %r940, %r928, 48;
or.b32 %r941, %r940, %r939;
add.s32 %r942, %r941, -32;
shr.u32 %r943, %r927, 4;
and.b32 %r944, %r935, 48;
or.b32 %r945, %r944, %r943;
add.s32 %r946, %r945, -32;
cvt.rn.f32.s32 %f639, %r933;
mul.ftz.f32 %f640, %f536, %f639;
ld.global.nc.f32 %f641, [%rd14+536];
fma.rn.ftz.f32 %f642, %f641, %f640, %f638;
cvt.rn.f32.s32 %f643, %r938;
mul.ftz.f32 %f644, %f538, %f643;
ld.global.nc.f32 %f645, [%rd14+664];
fma.rn.ftz.f32 %f646, %f645, %f644, %f642;
cvt.rn.f32.s32 %f647, %r942;
mul.ftz.f32 %f648, %f540, %f647;
ld.global.nc.f32 %f649, [%rd14+792];
fma.rn.ftz.f32 %f650, %f648, %f649, %f646;
cvt.rn.f32.s32 %f651, %r946;
mul.ftz.f32 %f652, %f542, %f651;
ld.global.nc.f32 %f653, [%rd14+920];
fma.rn.ftz.f32 %f654, %f652, %f653, %f650;
ld.global.nc.u8 %rs149, [%rd3+71];
cvt.u32.u16 %r947, %rs149;
and.b32 %r948, %r947, 240;
ld.global.nc.u8 %rs150, [%rd3+103];
cvt.u32.u16 %r949, %rs150;
and.b32 %r950, %r949, 240;
ld.global.nc.u8 %rs151, [%rd3+167];
cvt.u32.u16 %r951, %rs151;
and.b32 %r952, %r951, 252;
and.b32 %r953, %r947, 15;
and.b32 %r954, %r951, 3;
bfi.b32 %r955, %r954, %r953, 4, 2;
add.s32 %r956, %r955, -32;
and.b32 %r957, %r949, 15;
shr.u32 %r958, %r952, 2;
and.b32 %r959, %r958, 3;
bfi.b32 %r960, %r959, %r957, 4, 2;
add.s32 %r961, %r960, -32;
shr.u32 %r962, %r948, 4;
and.b32 %r963, %r951, 48;
or.b32 %r964, %r963, %r962;
add.s32 %r965, %r964, -32;
shr.u32 %r966, %r950, 4;
and.b32 %r967, %r958, 48;
or.b32 %r968, %r967, %r966;
add.s32 %r969, %r968, -32;
cvt.rn.f32.s32 %f655, %r956;
mul.ftz.f32 %f656, %f536, %f655;
ld.global.nc.f32 %f657, [%rd14+540];
fma.rn.ftz.f32 %f658, %f657, %f656, %f654;
cvt.rn.f32.s32 %f659, %r961;
mul.ftz.f32 %f660, %f538, %f659;
ld.global.nc.f32 %f661, [%rd14+668];
fma.rn.ftz.f32 %f662, %f661, %f660, %f658;
cvt.rn.f32.s32 %f663, %r965;
mul.ftz.f32 %f664, %f540, %f663;
ld.global.nc.f32 %f665, [%rd14+796];
fma.rn.ftz.f32 %f666, %f664, %f665, %f662;
cvt.rn.f32.s32 %f667, %r969;
mul.ftz.f32 %f668, %f542, %f667;
ld.global.nc.f32 %f669, [%rd14+924];
fma.rn.ftz.f32 %f670, %f668, %f669, %f666;
ld.global.nc.u8 %rs152, [%rd3+72];
cvt.u32.u16 %r970, %rs152;
and.b32 %r971, %r970, 240;
ld.global.nc.u8 %rs153, [%rd3+104];
cvt.u32.u16 %r972, %rs153;
and.b32 %r973, %r972, 240;
ld.global.nc.u8 %rs154, [%rd3+168];
cvt.u32.u16 %r974, %rs154;
and.b32 %r975, %r974, 252;
and.b32 %r976, %r970, 15;
and.b32 %r977, %r974, 3;
bfi.b32 %r978, %r977, %r976, 4, 2;
add.s32 %r979, %r978, -32;
and.b32 %r980, %r972, 15;
shr.u32 %r981, %r975, 2;
and.b32 %r982, %r981, 3;
bfi.b32 %r983, %r982, %r980, 4, 2;
add.s32 %r984, %r983, -32;
shr.u32 %r985, %r971, 4;
and.b32 %r986, %r974, 48;
or.b32 %r987, %r986, %r985;
add.s32 %r988, %r987, -32;
shr.u32 %r989, %r973, 4;
and.b32 %r990, %r981, 48;
or.b32 %r991, %r990, %r989;
add.s32 %r992, %r991, -32;
cvt.rn.f32.s32 %f671, %r979;
mul.ftz.f32 %f672, %f536, %f671;
ld.global.nc.f32 %f673, [%rd14+544];
fma.rn.ftz.f32 %f674, %f673, %f672, %f670;
cvt.rn.f32.s32 %f675, %r984;
mul.ftz.f32 %f676, %f538, %f675;
ld.global.nc.f32 %f677, [%rd14+672];
fma.rn.ftz.f32 %f678, %f677, %f676, %f674;
cvt.rn.f32.s32 %f679, %r988;
mul.ftz.f32 %f680, %f540, %f679;
ld.global.nc.f32 %f681, [%rd14+800];
fma.rn.ftz.f32 %f682, %f680, %f681, %f678;
cvt.rn.f32.s32 %f683, %r992;
mul.ftz.f32 %f684, %f542, %f683;
ld.global.nc.f32 %f685, [%rd14+928];
fma.rn.ftz.f32 %f686, %f684, %f685, %f682;
ld.global.nc.u8 %rs155, [%rd3+73];
cvt.u32.u16 %r993, %rs155;
and.b32 %r994, %r993, 240;
ld.global.nc.u8 %rs156, [%rd3+105];
cvt.u32.u16 %r995, %rs156;
and.b32 %r996, %r995, 240;
ld.global.nc.u8 %rs157, [%rd3+169];
cvt.u32.u16 %r997, %rs157;
and.b32 %r998, %r997, 252;
and.b32 %r999, %r993, 15;
and.b32 %r1000, %r997, 3;
bfi.b32 %r1001, %r1000, %r999, 4, 2;
add.s32 %r1002, %r1001, -32;
and.b32 %r1003, %r995, 15;
shr.u32 %r1004, %r998, 2;
and.b32 %r1005, %r1004, 3;
bfi.b32 %r1006, %r1005, %r1003, 4, 2;
add.s32 %r1007, %r1006, -32;
shr.u32 %r1008, %r994, 4;
and.b32 %r1009, %r997, 48;
or.b32 %r1010, %r1009, %r1008;
add.s32 %r1011, %r1010, -32;
shr.u32 %r1012, %r996, 4;
and.b32 %r1013, %r1004, 48;
or.b32 %r1014, %r1013, %r1012;
add.s32 %r1015, %r1014, -32;
cvt.rn.f32.s32 %f687, %r1002;
mul.ftz.f32 %f688, %f536, %f687;
ld.global.nc.f32 %f689, [%rd14+548];
fma.rn.ftz.f32 %f690, %f689, %f688, %f686;
cvt.rn.f32.s32 %f691, %r1007;
mul.ftz.f32 %f692, %f538, %f691;
ld.global.nc.f32 %f693, [%rd14+676];
fma.rn.ftz.f32 %f694, %f693, %f692, %f690;
cvt.rn.f32.s32 %f695, %r1011;
mul.ftz.f32 %f696, %f540, %f695;
ld.global.nc.f32 %f697, [%rd14+804];
fma.rn.ftz.f32 %f698, %f696, %f697, %f694;
cvt.rn.f32.s32 %f699, %r1015;
mul.ftz.f32 %f700, %f542, %f699;
ld.global.nc.f32 %f701, [%rd14+932];
fma.rn.ftz.f32 %f702, %f700, %f701, %f698;
ld.global.nc.u8 %rs158, [%rd3+74];
cvt.u32.u16 %r1016, %rs158;
and.b32 %r1017, %r1016, 240;
ld.global.nc.u8 %rs159, [%rd3+106];
cvt.u32.u16 %r1018, %rs159;
and.b32 %r1019, %r1018, 240;
ld.global.nc.u8 %rs160, [%rd3+170];
cvt.u32.u16 %r1020, %rs160;
and.b32 %r1021, %r1020, 252;
and.b32 %r1022, %r1016, 15;
and.b32 %r1023, %r1020, 3;
bfi.b32 %r1024, %r1023, %r1022, 4, 2;
add.s32 %r1025, %r1024, -32;
and.b32 %r1026, %r1018, 15;
shr.u32 %r1027, %r1021, 2;
and.b32 %r1028, %r1027, 3;
bfi.b32 %r1029, %r1028, %r1026, 4, 2;
add.s32 %r1030, %r1029, -32;
shr.u32 %r1031, %r1017, 4;
and.b32 %r1032, %r1020, 48;
or.b32 %r1033, %r1032, %r1031;
add.s32 %r1034, %r1033, -32;
shr.u32 %r1035, %r1019, 4;
and.b32 %r1036, %r1027, 48;
or.b32 %r1037, %r1036, %r1035;
add.s32 %r1038, %r1037, -32;
cvt.rn.f32.s32 %f703, %r1025;
mul.ftz.f32 %f704, %f536, %f703;
ld.global.nc.f32 %f705, [%rd14+552];
fma.rn.ftz.f32 %f706, %f705, %f704, %f702;
cvt.rn.f32.s32 %f707, %r1030;
mul.ftz.f32 %f708, %f538, %f707;
ld.global.nc.f32 %f709, [%rd14+680];
fma.rn.ftz.f32 %f710, %f709, %f708, %f706;
cvt.rn.f32.s32 %f711, %r1034;
mul.ftz.f32 %f712, %f540, %f711;
ld.global.nc.f32 %f713, [%rd14+808];
fma.rn.ftz.f32 %f714, %f712, %f713, %f710;
cvt.rn.f32.s32 %f715, %r1038;
mul.ftz.f32 %f716, %f542, %f715;
ld.global.nc.f32 %f717, [%rd14+936];
fma.rn.ftz.f32 %f718, %f716, %f717, %f714;
ld.global.nc.u8 %rs161, [%rd3+75];
cvt.u32.u16 %r1039, %rs161;
and.b32 %r1040, %r1039, 240;
ld.global.nc.u8 %rs162, [%rd3+107];
cvt.u32.u16 %r1041, %rs162;
and.b32 %r1042, %r1041, 240;
ld.global.nc.u8 %rs163, [%rd3+171];
cvt.u32.u16 %r1043, %rs163;
and.b32 %r1044, %r1043, 252;
and.b32 %r1045, %r1039, 15;
and.b32 %r1046, %r1043, 3;
bfi.b32 %r1047, %r1046, %r1045, 4, 2;
add.s32 %r1048, %r1047, -32;
and.b32 %r1049, %r1041, 15;
shr.u32 %r1050, %r1044, 2;
and.b32 %r1051, %r1050, 3;
bfi.b32 %r1052, %r1051, %r1049, 4, 2;
add.s32 %r1053, %r1052, -32;
shr.u32 %r1054, %r1040, 4;
and.b32 %r1055, %r1043, 48;
or.b32 %r1056, %r1055, %r1054;
add.s32 %r1057, %r1056, -32;
shr.u32 %r1058, %r1042, 4;
and.b32 %r1059, %r1050, 48;
or.b32 %r1060, %r1059, %r1058;
add.s32 %r1061, %r1060, -32;
cvt.rn.f32.s32 %f719, %r1048;
mul.ftz.f32 %f720, %f536, %f719;
ld.global.nc.f32 %f721, [%rd14+556];
fma.rn.ftz.f32 %f722, %f721, %f720, %f718;
cvt.rn.f32.s32 %f723, %r1053;
mul.ftz.f32 %f724, %f538, %f723;
ld.global.nc.f32 %f725, [%rd14+684];
fma.rn.ftz.f32 %f726, %f725, %f724, %f722;
cvt.rn.f32.s32 %f727, %r1057;
mul.ftz.f32 %f728, %f540, %f727;
ld.global.nc.f32 %f729, [%rd14+812];
fma.rn.ftz.f32 %f730, %f728, %f729, %f726;
cvt.rn.f32.s32 %f731, %r1061;
mul.ftz.f32 %f732, %f542, %f731;
ld.global.nc.f32 %f733, [%rd14+940];
fma.rn.ftz.f32 %f734, %f732, %f733, %f730;
ld.global.nc.u8 %rs164, [%rd3+76];
cvt.u32.u16 %r1062, %rs164;
and.b32 %r1063, %r1062, 240;
ld.global.nc.u8 %rs165, [%rd3+108];
cvt.u32.u16 %r1064, %rs165;
and.b32 %r1065, %r1064, 240;
ld.global.nc.u8 %rs166, [%rd3+172];
cvt.u32.u16 %r1066, %rs166;
and.b32 %r1067, %r1066, 252;
and.b32 %r1068, %r1062, 15;
and.b32 %r1069, %r1066, 3;
bfi.b32 %r1070, %r1069, %r1068, 4, 2;
add.s32 %r1071, %r1070, -32;
and.b32 %r1072, %r1064, 15;
shr.u32 %r1073, %r1067, 2;
and.b32 %r1074, %r1073, 3;
bfi.b32 %r1075, %r1074, %r1072, 4, 2;
add.s32 %r1076, %r1075, -32;
shr.u32 %r1077, %r1063, 4;
and.b32 %r1078, %r1066, 48;
or.b32 %r1079, %r1078, %r1077;
add.s32 %r1080, %r1079, -32;
shr.u32 %r1081, %r1065, 4;
and.b32 %r1082, %r1073, 48;
or.b32 %r1083, %r1082, %r1081;
add.s32 %r1084, %r1083, -32;
cvt.rn.f32.s32 %f735, %r1071;
mul.ftz.f32 %f736, %f536, %f735;
ld.global.nc.f32 %f737, [%rd14+560];
fma.rn.ftz.f32 %f738, %f737, %f736, %f734;
cvt.rn.f32.s32 %f739, %r1076;
mul.ftz.f32 %f740, %f538, %f739;
ld.global.nc.f32 %f741, [%rd14+688];
fma.rn.ftz.f32 %f742, %f741, %f740, %f738;
cvt.rn.f32.s32 %f743, %r1080;
mul.ftz.f32 %f744, %f540, %f743;
ld.global.nc.f32 %f745, [%rd14+816];
fma.rn.ftz.f32 %f746, %f744, %f745, %f742;
cvt.rn.f32.s32 %f747, %r1084;
mul.ftz.f32 %f748, %f542, %f747;
ld.global.nc.f32 %f749, [%rd14+944];
fma.rn.ftz.f32 %f750, %f748, %f749, %f746;
ld.global.nc.u8 %rs167, [%rd3+77];
cvt.u32.u16 %r1085, %rs167;
and.b32 %r1086, %r1085, 240;
ld.global.nc.u8 %rs168, [%rd3+109];
cvt.u32.u16 %r1087, %rs168;
and.b32 %r1088, %r1087, 240;
ld.global.nc.u8 %rs169, [%rd3+173];
cvt.u32.u16 %r1089, %rs169;
and.b32 %r1090, %r1089, 252;
and.b32 %r1091, %r1085, 15;
and.b32 %r1092, %r1089, 3;
bfi.b32 %r1093, %r1092, %r1091, 4, 2;
add.s32 %r1094, %r1093, -32;
and.b32 %r1095, %r1087, 15;
shr.u32 %r1096, %r1090, 2;
and.b32 %r1097, %r1096, 3;
bfi.b32 %r1098, %r1097, %r1095, 4, 2;
add.s32 %r1099, %r1098, -32;
shr.u32 %r1100, %r1086, 4;
and.b32 %r1101, %r1089, 48;
or.b32 %r1102, %r1101, %r1100;
add.s32 %r1103, %r1102, -32;
shr.u32 %r1104, %r1088, 4;
and.b32 %r1105, %r1096, 48;
or.b32 %r1106, %r1105, %r1104;
add.s32 %r1107, %r1106, -32;
cvt.rn.f32.s32 %f751, %r1094;
mul.ftz.f32 %f752, %f536, %f751;
ld.global.nc.f32 %f753, [%rd14+564];
fma.rn.ftz.f32 %f754, %f753, %f752, %f750;
cvt.rn.f32.s32 %f755, %r1099;
mul.ftz.f32 %f756, %f538, %f755;
ld.global.nc.f32 %f757, [%rd14+692];
fma.rn.ftz.f32 %f758, %f757, %f756, %f754;
cvt.rn.f32.s32 %f759, %r1103;
mul.ftz.f32 %f760, %f540, %f759;
ld.global.nc.f32 %f761, [%rd14+820];
fma.rn.ftz.f32 %f762, %f760, %f761, %f758;
cvt.rn.f32.s32 %f763, %r1107;
mul.ftz.f32 %f764, %f542, %f763;
ld.global.nc.f32 %f765, [%rd14+948];
fma.rn.ftz.f32 %f766, %f764, %f765, %f762;
ld.global.nc.u8 %rs170, [%rd3+78];
cvt.u32.u16 %r1108, %rs170;
and.b32 %r1109, %r1108, 240;
ld.global.nc.u8 %rs171, [%rd3+110];
cvt.u32.u16 %r1110, %rs171;
and.b32 %r1111, %r1110, 240;
ld.global.nc.u8 %rs172, [%rd3+174];
cvt.u32.u16 %r1112, %rs172;
and.b32 %r1113, %r1112, 252;
and.b32 %r1114, %r1108, 15;
and.b32 %r1115, %r1112, 3;
bfi.b32 %r1116, %r1115, %r1114, 4, 2;
add.s32 %r1117, %r1116, -32;
and.b32 %r1118, %r1110, 15;
shr.u32 %r1119, %r1113, 2;
and.b32 %r1120, %r1119, 3;
bfi.b32 %r1121, %r1120, %r1118, 4, 2;
add.s32 %r1122, %r1121, -32;
shr.u32 %r1123, %r1109, 4;
and.b32 %r1124, %r1112, 48;
or.b32 %r1125, %r1124, %r1123;
add.s32 %r1126, %r1125, -32;
shr.u32 %r1127, %r1111, 4;
and.b32 %r1128, %r1119, 48;
or.b32 %r1129, %r1128, %r1127;
add.s32 %r1130, %r1129, -32;
cvt.rn.f32.s32 %f767, %r1117;
mul.ftz.f32 %f768, %f536, %f767;
ld.global.nc.f32 %f769, [%rd14+568];
fma.rn.ftz.f32 %f770, %f769, %f768, %f766;
cvt.rn.f32.s32 %f771, %r1122;
mul.ftz.f32 %f772, %f538, %f771;
ld.global.nc.f32 %f773, [%rd14+696];
fma.rn.ftz.f32 %f774, %f773, %f772, %f770;
cvt.rn.f32.s32 %f775, %r1126;
mul.ftz.f32 %f776, %f540, %f775;
ld.global.nc.f32 %f777, [%rd14+824];
fma.rn.ftz.f32 %f778, %f776, %f777, %f774;
cvt.rn.f32.s32 %f779, %r1130;
mul.ftz.f32 %f780, %f542, %f779;
ld.global.nc.f32 %f781, [%rd14+952];
fma.rn.ftz.f32 %f782, %f780, %f781, %f778;
ld.global.nc.u8 %rs173, [%rd3+79];
cvt.u32.u16 %r1131, %rs173;
and.b32 %r1132, %r1131, 240;
ld.global.nc.u8 %rs174, [%rd3+111];
cvt.u32.u16 %r1133, %rs174;
and.b32 %r1134, %r1133, 240;
ld.global.nc.u8 %rs175, [%rd3+175];
cvt.u32.u16 %r1135, %rs175;
and.b32 %r1136, %r1135, 252;
and.b32 %r1137, %r1131, 15;
and.b32 %r1138, %r1135, 3;
bfi.b32 %r1139, %r1138, %r1137, 4, 2;
add.s32 %r1140, %r1139, -32;
and.b32 %r1141, %r1133, 15;
shr.u32 %r1142, %r1136, 2;
and.b32 %r1143, %r1142, 3;
bfi.b32 %r1144, %r1143, %r1141, 4, 2;
add.s32 %r1145, %r1144, -32;
shr.u32 %r1146, %r1132, 4;
and.b32 %r1147, %r1135, 48;
or.b32 %r1148, %r1147, %r1146;
add.s32 %r1149, %r1148, -32;
shr.u32 %r1150, %r1134, 4;
and.b32 %r1151, %r1142, 48;
or.b32 %r1152, %r1151, %r1150;
add.s32 %r1153, %r1152, -32;
cvt.rn.f32.s32 %f783, %r1140;
mul.ftz.f32 %f784, %f536, %f783;
ld.global.nc.f32 %f785, [%rd14+572];
fma.rn.ftz.f32 %f786, %f785, %f784, %f782;
cvt.rn.f32.s32 %f787, %r1145;
mul.ftz.f32 %f788, %f538, %f787;
ld.global.nc.f32 %f789, [%rd14+700];
fma.rn.ftz.f32 %f790, %f789, %f788, %f786;
cvt.rn.f32.s32 %f791, %r1149;
mul.ftz.f32 %f792, %f540, %f791;
ld.global.nc.f32 %f793, [%rd14+828];
fma.rn.ftz.f32 %f794, %f792, %f793, %f790;
cvt.rn.f32.s32 %f795, %r1153;
mul.ftz.f32 %f796, %f542, %f795;
ld.global.nc.f32 %f797, [%rd14+956];
fma.rn.ftz.f32 %f798, %f796, %f797, %f794;
ld.global.nc.u8 %rs176, [%rd3+80];
cvt.u32.u16 %r1154, %rs176;
and.b32 %r1155, %r1154, 240;
ld.global.nc.u8 %rs177, [%rd3+112];
cvt.u32.u16 %r1156, %rs177;
and.b32 %r1157, %r1156, 240;
ld.global.nc.u8 %rs178, [%rd3+176];
cvt.u32.u16 %r1158, %rs178;
and.b32 %r1159, %r1158, 252;
and.b32 %r1160, %r1154, 15;
and.b32 %r1161, %r1158, 3;
bfi.b32 %r1162, %r1161, %r1160, 4, 2;
add.s32 %r1163, %r1162, -32;
and.b32 %r1164, %r1156, 15;
shr.u32 %r1165, %r1159, 2;
and.b32 %r1166, %r1165, 3;
bfi.b32 %r1167, %r1166, %r1164, 4, 2;
add.s32 %r1168, %r1167, -32;
shr.u32 %r1169, %r1155, 4;
and.b32 %r1170, %r1158, 48;
or.b32 %r1171, %r1170, %r1169;
add.s32 %r1172, %r1171, -32;
shr.u32 %r1173, %r1157, 4;
and.b32 %r1174, %r1165, 48;
or.b32 %r1175, %r1174, %r1173;
add.s32 %r1176, %r1175, -32;
ld.global.nc.u8 %rs179, [%rd3+201];
cvt.s16.s8 %rs180, %rs179;
cvt.rn.f32.s16 %f799, %rs180;
mul.ftz.f32 %f800, %f7, %f799;
ld.global.nc.u8 %rs181, [%rd3+203];
cvt.s16.s8 %rs182, %rs181;
cvt.rn.f32.s16 %f801, %rs182;
mul.ftz.f32 %f802, %f7, %f801;
ld.global.nc.u8 %rs183, [%rd3+205];
cvt.s16.s8 %rs184, %rs183;
cvt.rn.f32.s16 %f803, %rs184;
mul.ftz.f32 %f804, %f7, %f803;
ld.global.nc.u8 %rs185, [%rd3+207];
cvt.s16.s8 %rs186, %rs185;
cvt.rn.f32.s16 %f805, %rs186;
mul.ftz.f32 %f806, %f7, %f805;
cvt.rn.f32.s32 %f807, %r1163;
mul.ftz.f32 %f808, %f800, %f807;
ld.global.nc.f32 %f809, [%rd14+576];
fma.rn.ftz.f32 %f810, %f809, %f808, %f798;
cvt.rn.f32.s32 %f811, %r1168;
mul.ftz.f32 %f812, %f802, %f811;
ld.global.nc.f32 %f813, [%rd14+704];
fma.rn.ftz.f32 %f814, %f813, %f812, %f810;
cvt.rn.f32.s32 %f815, %r1172;
mul.ftz.f32 %f816, %f804, %f815;
ld.global.nc.f32 %f817, [%rd14+832];
fma.rn.ftz.f32 %f818, %f816, %f817, %f814;
cvt.rn.f32.s32 %f819, %r1176;
mul.ftz.f32 %f820, %f806, %f819;
ld.global.nc.f32 %f821, [%rd14+960];
fma.rn.ftz.f32 %f822, %f820, %f821, %f818;
ld.global.nc.u8 %rs187, [%rd3+81];
cvt.u32.u16 %r1177, %rs187;
and.b32 %r1178, %r1177, 240;
ld.global.nc.u8 %rs188, [%rd3+113];
cvt.u32.u16 %r1179, %rs188;
and.b32 %r1180, %r1179, 240;
ld.global.nc.u8 %rs189, [%rd3+177];
cvt.u32.u16 %r1181, %rs189;
and.b32 %r1182, %r1181, 252;
and.b32 %r1183, %r1177, 15;
and.b32 %r1184, %r1181, 3;
bfi.b32 %r1185, %r1184, %r1183, 4, 2;
add.s32 %r1186, %r1185, -32;
and.b32 %r1187, %r1179, 15;
shr.u32 %r1188, %r1182, 2;
and.b32 %r1189, %r1188, 3;
bfi.b32 %r1190, %r1189, %r1187, 4, 2;
add.s32 %r1191, %r1190, -32;
shr.u32 %r1192, %r1178, 4;
and.b32 %r1193, %r1181, 48;
or.b32 %r1194, %r1193, %r1192;
add.s32 %r1195, %r1194, -32;
shr.u32 %r1196, %r1180, 4;
and.b32 %r1197, %r1188, 48;
or.b32 %r1198, %r1197, %r1196;
add.s32 %r1199, %r1198, -32;
cvt.rn.f32.s32 %f823, %r1186;
mul.ftz.f32 %f824, %f800, %f823;
ld.global.nc.f32 %f825, [%rd14+580];
fma.rn.ftz.f32 %f826, %f825, %f824, %f822;
cvt.rn.f32.s32 %f827, %r1191;
mul.ftz.f32 %f828, %f802, %f827;
ld.global.nc.f32 %f829, [%rd14+708];
fma.rn.ftz.f32 %f830, %f829, %f828, %f826;
cvt.rn.f32.s32 %f831, %r1195;
mul.ftz.f32 %f832, %f804, %f831;
ld.global.nc.f32 %f833, [%rd14+836];
fma.rn.ftz.f32 %f834, %f832, %f833, %f830;
cvt.rn.f32.s32 %f835, %r1199;
mul.ftz.f32 %f836, %f806, %f835;
ld.global.nc.f32 %f837, [%rd14+964];
fma.rn.ftz.f32 %f838, %f836, %f837, %f834;
ld.global.nc.u8 %rs190, [%rd3+82];
cvt.u32.u16 %r1200, %rs190;
and.b32 %r1201, %r1200, 240;
ld.global.nc.u8 %rs191, [%rd3+114];
cvt.u32.u16 %r1202, %rs191;
and.b32 %r1203, %r1202, 240;
ld.global.nc.u8 %rs192, [%rd3+178];
cvt.u32.u16 %r1204, %rs192;
and.b32 %r1205, %r1204, 252;
and.b32 %r1206, %r1200, 15;
and.b32 %r1207, %r1204, 3;
bfi.b32 %r1208, %r1207, %r1206, 4, 2;
add.s32 %r1209, %r1208, -32;
and.b32 %r1210, %r1202, 15;
shr.u32 %r1211, %r1205, 2;
and.b32 %r1212, %r1211, 3;
bfi.b32 %r1213, %r1212, %r1210, 4, 2;
add.s32 %r1214, %r1213, -32;
shr.u32 %r1215, %r1201, 4;
and.b32 %r1216, %r1204, 48;
or.b32 %r1217, %r1216, %r1215;
add.s32 %r1218, %r1217, -32;
shr.u32 %r1219, %r1203, 4;
and.b32 %r1220, %r1211, 48;
or.b32 %r1221, %r1220, %r1219;
add.s32 %r1222, %r1221, -32;
cvt.rn.f32.s32 %f839, %r1209;
mul.ftz.f32 %f840, %f800, %f839;
ld.global.nc.f32 %f841, [%rd14+584];
fma.rn.ftz.f32 %f842, %f841, %f840, %f838;
cvt.rn.f32.s32 %f843, %r1214;
mul.ftz.f32 %f844, %f802, %f843;
ld.global.nc.f32 %f845, [%rd14+712];
fma.rn.ftz.f32 %f846, %f845, %f844, %f842;
cvt.rn.f32.s32 %f847, %r1218;
mul.ftz.f32 %f848, %f804, %f847;
ld.global.nc.f32 %f849, [%rd14+840];
fma.rn.ftz.f32 %f850, %f848, %f849, %f846;
cvt.rn.f32.s32 %f851, %r1222;
mul.ftz.f32 %f852, %f806, %f851;
ld.global.nc.f32 %f853, [%rd14+968];
fma.rn.ftz.f32 %f854, %f852, %f853, %f850;
ld.global.nc.u8 %rs193, [%rd3+83];
cvt.u32.u16 %r1223, %rs193;
and.b32 %r1224, %r1223, 240;
ld.global.nc.u8 %rs194, [%rd3+115];
cvt.u32.u16 %r1225, %rs194;
and.b32 %r1226, %r1225, 240;
ld.global.nc.u8 %rs195, [%rd3+179];
cvt.u32.u16 %r1227, %rs195;
and.b32 %r1228, %r1227, 252;
and.b32 %r1229, %r1223, 15;
and.b32 %r1230, %r1227, 3;
bfi.b32 %r1231, %r1230, %r1229, 4, 2;
add.s32 %r1232, %r1231, -32;
and.b32 %r1233, %r1225, 15;
shr.u32 %r1234, %r1228, 2;
and.b32 %r1235, %r1234, 3;
bfi.b32 %r1236, %r1235, %r1233, 4, 2;
add.s32 %r1237, %r1236, -32;
shr.u32 %r1238, %r1224, 4;
and.b32 %r1239, %r1227, 48;
or.b32 %r1240, %r1239, %r1238;
add.s32 %r1241, %r1240, -32;
shr.u32 %r1242, %r1226, 4;
and.b32 %r1243, %r1234, 48;
or.b32 %r1244, %r1243, %r1242;
add.s32 %r1245, %r1244, -32;
cvt.rn.f32.s32 %f855, %r1232;
mul.ftz.f32 %f856, %f800, %f855;
ld.global.nc.f32 %f857, [%rd14+588];
fma.rn.ftz.f32 %f858, %f857, %f856, %f854;
cvt.rn.f32.s32 %f859, %r1237;
mul.ftz.f32 %f860, %f802, %f859;
ld.global.nc.f32 %f861, [%rd14+716];
fma.rn.ftz.f32 %f862, %f861, %f860, %f858;
cvt.rn.f32.s32 %f863, %r1241;
mul.ftz.f32 %f864, %f804, %f863;
ld.global.nc.f32 %f865, [%rd14+844];
fma.rn.ftz.f32 %f866, %f864, %f865, %f862;
cvt.rn.f32.s32 %f867, %r1245;
mul.ftz.f32 %f868, %f806, %f867;
ld.global.nc.f32 %f869, [%rd14+972];
fma.rn.ftz.f32 %f870, %f868, %f869, %f866;
ld.global.nc.u8 %rs196, [%rd3+84];
cvt.u32.u16 %r1246, %rs196;
and.b32 %r1247, %r1246, 240;
ld.global.nc.u8 %rs197, [%rd3+116];
cvt.u32.u16 %r1248, %rs197;
and.b32 %r1249, %r1248, 240;
ld.global.nc.u8 %rs198, [%rd3+180];
cvt.u32.u16 %r1250, %rs198;
and.b32 %r1251, %r1250, 252;
and.b32 %r1252, %r1246, 15;
and.b32 %r1253, %r1250, 3;
bfi.b32 %r1254, %r1253, %r1252, 4, 2;
add.s32 %r1255, %r1254, -32;
and.b32 %r1256, %r1248, 15;
shr.u32 %r1257, %r1251, 2;
and.b32 %r1258, %r1257, 3;
bfi.b32 %r1259, %r1258, %r1256, 4, 2;
add.s32 %r1260, %r1259, -32;
shr.u32 %r1261, %r1247, 4;
and.b32 %r1262, %r1250, 48;
or.b32 %r1263, %r1262, %r1261;
add.s32 %r1264, %r1263, -32;
shr.u32 %r1265, %r1249, 4;
and.b32 %r1266, %r1257, 48;
or.b32 %r1267, %r1266, %r1265;
add.s32 %r1268, %r1267, -32;
cvt.rn.f32.s32 %f871, %r1255;
mul.ftz.f32 %f872, %f800, %f871;
ld.global.nc.f32 %f873, [%rd14+592];
fma.rn.ftz.f32 %f874, %f873, %f872, %f870;
cvt.rn.f32.s32 %f875, %r1260;
mul.ftz.f32 %f876, %f802, %f875;
ld.global.nc.f32 %f877, [%rd14+720];
fma.rn.ftz.f32 %f878, %f877, %f876, %f874;
cvt.rn.f32.s32 %f879, %r1264;
mul.ftz.f32 %f880, %f804, %f879;
ld.global.nc.f32 %f881, [%rd14+848];
fma.rn.ftz.f32 %f882, %f880, %f881, %f878;
cvt.rn.f32.s32 %f883, %r1268;
mul.ftz.f32 %f884, %f806, %f883;
ld.global.nc.f32 %f885, [%rd14+976];
fma.rn.ftz.f32 %f886, %f884, %f885, %f882;
ld.global.nc.u8 %rs199, [%rd3+85];
cvt.u32.u16 %r1269, %rs199;
and.b32 %r1270, %r1269, 240;
ld.global.nc.u8 %rs200, [%rd3+117];
cvt.u32.u16 %r1271, %rs200;
and.b32 %r1272, %r1271, 240;
ld.global.nc.u8 %rs201, [%rd3+181];
cvt.u32.u16 %r1273, %rs201;
and.b32 %r1274, %r1273, 252;
and.b32 %r1275, %r1269, 15;
and.b32 %r1276, %r1273, 3;
bfi.b32 %r1277, %r1276, %r1275, 4, 2;
add.s32 %r1278, %r1277, -32;
and.b32 %r1279, %r1271, 15;
shr.u32 %r1280, %r1274, 2;
and.b32 %r1281, %r1280, 3;
bfi.b32 %r1282, %r1281, %r1279, 4, 2;
add.s32 %r1283, %r1282, -32;
shr.u32 %r1284, %r1270, 4;
and.b32 %r1285, %r1273, 48;
or.b32 %r1286, %r1285, %r1284;
add.s32 %r1287, %r1286, -32;
shr.u32 %r1288, %r1272, 4;
and.b32 %r1289, %r1280, 48;
or.b32 %r1290, %r1289, %r1288;
add.s32 %r1291, %r1290, -32;
cvt.rn.f32.s32 %f887, %r1278;
mul.ftz.f32 %f888, %f800, %f887;
ld.global.nc.f32 %f889, [%rd14+596];
fma.rn.ftz.f32 %f890, %f889, %f888, %f886;
cvt.rn.f32.s32 %f891, %r1283;
mul.ftz.f32 %f892, %f802, %f891;
ld.global.nc.f32 %f893, [%rd14+724];
fma.rn.ftz.f32 %f894, %f893, %f892, %f890;
cvt.rn.f32.s32 %f895, %r1287;
mul.ftz.f32 %f896, %f804, %f895;
ld.global.nc.f32 %f897, [%rd14+852];
fma.rn.ftz.f32 %f898, %f896, %f897, %f894;
cvt.rn.f32.s32 %f899, %r1291;
mul.ftz.f32 %f900, %f806, %f899;
ld.global.nc.f32 %f901, [%rd14+980];
fma.rn.ftz.f32 %f902, %f900, %f901, %f898;
ld.global.nc.u8 %rs202, [%rd3+86];
cvt.u32.u16 %r1292, %rs202;
and.b32 %r1293, %r1292, 240;
ld.global.nc.u8 %rs203, [%rd3+118];
cvt.u32.u16 %r1294, %rs203;
and.b32 %r1295, %r1294, 240;
ld.global.nc.u8 %rs204, [%rd3+182];
cvt.u32.u16 %r1296, %rs204;
and.b32 %r1297, %r1296, 252;
and.b32 %r1298, %r1292, 15;
and.b32 %r1299, %r1296, 3;
bfi.b32 %r1300, %r1299, %r1298, 4, 2;
add.s32 %r1301, %r1300, -32;
and.b32 %r1302, %r1294, 15;
shr.u32 %r1303, %r1297, 2;
and.b32 %r1304, %r1303, 3;
bfi.b32 %r1305, %r1304, %r1302, 4, 2;
add.s32 %r1306, %r1305, -32;
shr.u32 %r1307, %r1293, 4;
and.b32 %r1308, %r1296, 48;
or.b32 %r1309, %r1308, %r1307;
add.s32 %r1310, %r1309, -32;
shr.u32 %r1311, %r1295, 4;
and.b32 %r1312, %r1303, 48;
or.b32 %r1313, %r1312, %r1311;
add.s32 %r1314, %r1313, -32;
cvt.rn.f32.s32 %f903, %r1301;
mul.ftz.f32 %f904, %f800, %f903;
ld.global.nc.f32 %f905, [%rd14+600];
fma.rn.ftz.f32 %f906, %f905, %f904, %f902;
cvt.rn.f32.s32 %f907, %r1306;
mul.ftz.f32 %f908, %f802, %f907;
ld.global.nc.f32 %f909, [%rd14+728];
fma.rn.ftz.f32 %f910, %f909, %f908, %f906;
cvt.rn.f32.s32 %f911, %r1310;
mul.ftz.f32 %f912, %f804, %f911;
ld.global.nc.f32 %f913, [%rd14+856];
fma.rn.ftz.f32 %f914, %f912, %f913, %f910;
cvt.rn.f32.s32 %f915, %r1314;
mul.ftz.f32 %f916, %f806, %f915;
ld.global.nc.f32 %f917, [%rd14+984];
fma.rn.ftz.f32 %f918, %f916, %f917, %f914;
ld.global.nc.u8 %rs205, [%rd3+87];
cvt.u32.u16 %r1315, %rs205;
and.b32 %r1316, %r1315, 240;
ld.global.nc.u8 %rs206, [%rd3+119];
cvt.u32.u16 %r1317, %rs206;
and.b32 %r1318, %r1317, 240;
ld.global.nc.u8 %rs207, [%rd3+183];
cvt.u32.u16 %r1319, %rs207;
and.b32 %r1320, %r1319, 252;
and.b32 %r1321, %r1315, 15;
and.b32 %r1322, %r1319, 3;
bfi.b32 %r1323, %r1322, %r1321, 4, 2;
add.s32 %r1324, %r1323, -32;
and.b32 %r1325, %r1317, 15;
shr.u32 %r1326, %r1320, 2;
and.b32 %r1327, %r1326, 3;
bfi.b32 %r1328, %r1327, %r1325, 4, 2;
add.s32 %r1329, %r1328, -32;
shr.u32 %r1330, %r1316, 4;
and.b32 %r1331, %r1319, 48;
or.b32 %r1332, %r1331, %r1330;
add.s32 %r1333, %r1332, -32;
shr.u32 %r1334, %r1318, 4;
and.b32 %r1335, %r1326, 48;
or.b32 %r1336, %r1335, %r1334;
add.s32 %r1337, %r1336, -32;
cvt.rn.f32.s32 %f919, %r1324;
mul.ftz.f32 %f920, %f800, %f919;
ld.global.nc.f32 %f921, [%rd14+604];
fma.rn.ftz.f32 %f922, %f921, %f920, %f918;
cvt.rn.f32.s32 %f923, %r1329;
mul.ftz.f32 %f924, %f802, %f923;
ld.global.nc.f32 %f925, [%rd14+732];
fma.rn.ftz.f32 %f926, %f925, %f924, %f922;
cvt.rn.f32.s32 %f927, %r1333;
mul.ftz.f32 %f928, %f804, %f927;
ld.global.nc.f32 %f929, [%rd14+860];
fma.rn.ftz.f32 %f930, %f928, %f929, %f926;
cvt.rn.f32.s32 %f931, %r1337;
mul.ftz.f32 %f932, %f806, %f931;
ld.global.nc.f32 %f933, [%rd14+988];
fma.rn.ftz.f32 %f934, %f932, %f933, %f930;
ld.global.nc.u8 %rs208, [%rd3+88];
cvt.u32.u16 %r1338, %rs208;
and.b32 %r1339, %r1338, 240;
ld.global.nc.u8 %rs209, [%rd3+120];
cvt.u32.u16 %r1340, %rs209;
and.b32 %r1341, %r1340, 240;
ld.global.nc.u8 %rs210, [%rd3+184];
cvt.u32.u16 %r1342, %rs210;
and.b32 %r1343, %r1342, 252;
and.b32 %r1344, %r1338, 15;
and.b32 %r1345, %r1342, 3;
bfi.b32 %r1346, %r1345, %r1344, 4, 2;
add.s32 %r1347, %r1346, -32;
and.b32 %r1348, %r1340, 15;
shr.u32 %r1349, %r1343, 2;
and.b32 %r1350, %r1349, 3;
bfi.b32 %r1351, %r1350, %r1348, 4, 2;
add.s32 %r1352, %r1351, -32;
shr.u32 %r1353, %r1339, 4;
and.b32 %r1354, %r1342, 48;
or.b32 %r1355, %r1354, %r1353;
add.s32 %r1356, %r1355, -32;
shr.u32 %r1357, %r1341, 4;
and.b32 %r1358, %r1349, 48;
or.b32 %r1359, %r1358, %r1357;
add.s32 %r1360, %r1359, -32;
cvt.rn.f32.s32 %f935, %r1347;
mul.ftz.f32 %f936, %f800, %f935;
ld.global.nc.f32 %f937, [%rd14+608];
fma.rn.ftz.f32 %f938, %f937, %f936, %f934;
cvt.rn.f32.s32 %f939, %r1352;
mul.ftz.f32 %f940, %f802, %f939;
ld.global.nc.f32 %f941, [%rd14+736];
fma.rn.ftz.f32 %f942, %f941, %f940, %f938;
cvt.rn.f32.s32 %f943, %r1356;
mul.ftz.f32 %f944, %f804, %f943;
ld.global.nc.f32 %f945, [%rd14+864];
fma.rn.ftz.f32 %f946, %f944, %f945, %f942;
cvt.rn.f32.s32 %f947, %r1360;
mul.ftz.f32 %f948, %f806, %f947;
ld.global.nc.f32 %f949, [%rd14+992];
fma.rn.ftz.f32 %f950, %f948, %f949, %f946;
ld.global.nc.u8 %rs211, [%rd3+89];
cvt.u32.u16 %r1361, %rs211;
and.b32 %r1362, %r1361, 240;
ld.global.nc.u8 %rs212, [%rd3+121];
cvt.u32.u16 %r1363, %rs212;
and.b32 %r1364, %r1363, 240;
ld.global.nc.u8 %rs213, [%rd3+185];
cvt.u32.u16 %r1365, %rs213;
and.b32 %r1366, %r1365, 252;
and.b32 %r1367, %r1361, 15;
and.b32 %r1368, %r1365, 3;
bfi.b32 %r1369, %r1368, %r1367, 4, 2;
add.s32 %r1370, %r1369, -32;
and.b32 %r1371, %r1363, 15;
shr.u32 %r1372, %r1366, 2;
and.b32 %r1373, %r1372, 3;
bfi.b32 %r1374, %r1373, %r1371, 4, 2;
add.s32 %r1375, %r1374, -32;
shr.u32 %r1376, %r1362, 4;
and.b32 %r1377, %r1365, 48;
or.b32 %r1378, %r1377, %r1376;
add.s32 %r1379, %r1378, -32;
shr.u32 %r1380, %r1364, 4;
and.b32 %r1381, %r1372, 48;
or.b32 %r1382, %r1381, %r1380;
add.s32 %r1383, %r1382, -32;
cvt.rn.f32.s32 %f951, %r1370;
mul.ftz.f32 %f952, %f800, %f951;
ld.global.nc.f32 %f953, [%rd14+612];
fma.rn.ftz.f32 %f954, %f953, %f952, %f950;
cvt.rn.f32.s32 %f955, %r1375;
mul.ftz.f32 %f956, %f802, %f955;
ld.global.nc.f32 %f957, [%rd14+740];
fma.rn.ftz.f32 %f958, %f957, %f956, %f954;
cvt.rn.f32.s32 %f959, %r1379;
mul.ftz.f32 %f960, %f804, %f959;
ld.global.nc.f32 %f961, [%rd14+868];
fma.rn.ftz.f32 %f962, %f960, %f961, %f958;
cvt.rn.f32.s32 %f963, %r1383;
mul.ftz.f32 %f964, %f806, %f963;
ld.global.nc.f32 %f965, [%rd14+996];
fma.rn.ftz.f32 %f966, %f964, %f965, %f962;
ld.global.nc.u8 %rs214, [%rd3+90];
cvt.u32.u16 %r1384, %rs214;
and.b32 %r1385, %r1384, 240;
ld.global.nc.u8 %rs215, [%rd3+122];
cvt.u32.u16 %r1386, %rs215;
and.b32 %r1387, %r1386, 240;
ld.global.nc.u8 %rs216, [%rd3+186];
cvt.u32.u16 %r1388, %rs216;
and.b32 %r1389, %r1388, 252;
and.b32 %r1390, %r1384, 15;
and.b32 %r1391, %r1388, 3;
bfi.b32 %r1392, %r1391, %r1390, 4, 2;
add.s32 %r1393, %r1392, -32;
and.b32 %r1394, %r1386, 15;
shr.u32 %r1395, %r1389, 2;
and.b32 %r1396, %r1395, 3;
bfi.b32 %r1397, %r1396, %r1394, 4, 2;
add.s32 %r1398, %r1397, -32;
shr.u32 %r1399, %r1385, 4;
and.b32 %r1400, %r1388, 48;
or.b32 %r1401, %r1400, %r1399;
add.s32 %r1402, %r1401, -32;
shr.u32 %r1403, %r1387, 4;
and.b32 %r1404, %r1395, 48;
or.b32 %r1405, %r1404, %r1403;
add.s32 %r1406, %r1405, -32;
cvt.rn.f32.s32 %f967, %r1393;
mul.ftz.f32 %f968, %f800, %f967;
ld.global.nc.f32 %f969, [%rd14+616];
fma.rn.ftz.f32 %f970, %f969, %f968, %f966;
cvt.rn.f32.s32 %f971, %r1398;
mul.ftz.f32 %f972, %f802, %f971;
ld.global.nc.f32 %f973, [%rd14+744];
fma.rn.ftz.f32 %f974, %f973, %f972, %f970;
cvt.rn.f32.s32 %f975, %r1402;
mul.ftz.f32 %f976, %f804, %f975;
ld.global.nc.f32 %f977, [%rd14+872];
fma.rn.ftz.f32 %f978, %f976, %f977, %f974;
cvt.rn.f32.s32 %f979, %r1406;
mul.ftz.f32 %f980, %f806, %f979;
ld.global.nc.f32 %f981, [%rd14+1000];
fma.rn.ftz.f32 %f982, %f980, %f981, %f978;
ld.global.nc.u8 %rs217, [%rd3+91];
cvt.u32.u16 %r1407, %rs217;
and.b32 %r1408, %r1407, 240;
ld.global.nc.u8 %rs218, [%rd3+123];
cvt.u32.u16 %r1409, %rs218;
and.b32 %r1410, %r1409, 240;
ld.global.nc.u8 %rs219, [%rd3+187];
cvt.u32.u16 %r1411, %rs219;
and.b32 %r1412, %r1411, 252;
and.b32 %r1413, %r1407, 15;
and.b32 %r1414, %r1411, 3;
bfi.b32 %r1415, %r1414, %r1413, 4, 2;
add.s32 %r1416, %r1415, -32;
and.b32 %r1417, %r1409, 15;
shr.u32 %r1418, %r1412, 2;
and.b32 %r1419, %r1418, 3;
bfi.b32 %r1420, %r1419, %r1417, 4, 2;
add.s32 %r1421, %r1420, -32;
shr.u32 %r1422, %r1408, 4;
and.b32 %r1423, %r1411, 48;
or.b32 %r1424, %r1423, %r1422;
add.s32 %r1425, %r1424, -32;
shr.u32 %r1426, %r1410, 4;
and.b32 %r1427, %r1418, 48;
or.b32 %r1428, %r1427, %r1426;
add.s32 %r1429, %r1428, -32;
cvt.rn.f32.s32 %f983, %r1416;
mul.ftz.f32 %f984, %f800, %f983;
ld.global.nc.f32 %f985, [%rd14+620];
fma.rn.ftz.f32 %f986, %f985, %f984, %f982;
cvt.rn.f32.s32 %f987, %r1421;
mul.ftz.f32 %f988, %f802, %f987;
ld.global.nc.f32 %f989, [%rd14+748];
fma.rn.ftz.f32 %f990, %f989, %f988, %f986;
cvt.rn.f32.s32 %f991, %r1425;
mul.ftz.f32 %f992, %f804, %f991;
ld.global.nc.f32 %f993, [%rd14+876];
fma.rn.ftz.f32 %f994, %f992, %f993, %f990;
cvt.rn.f32.s32 %f995, %r1429;
mul.ftz.f32 %f996, %f806, %f995;
ld.global.nc.f32 %f997, [%rd14+1004];
fma.rn.ftz.f32 %f998, %f996, %f997, %f994;
ld.global.nc.u8 %rs220, [%rd3+92];
cvt.u32.u16 %r1430, %rs220;
and.b32 %r1431, %r1430, 240;
ld.global.nc.u8 %rs221, [%rd3+124];
cvt.u32.u16 %r1432, %rs221;
and.b32 %r1433, %r1432, 240;
ld.global.nc.u8 %rs222, [%rd3+188];
cvt.u32.u16 %r1434, %rs222;
and.b32 %r1435, %r1434, 252;
and.b32 %r1436, %r1430, 15;
and.b32 %r1437, %r1434, 3;
bfi.b32 %r1438, %r1437, %r1436, 4, 2;
add.s32 %r1439, %r1438, -32;
and.b32 %r1440, %r1432, 15;
shr.u32 %r1441, %r1435, 2;
and.b32 %r1442, %r1441, 3;
bfi.b32 %r1443, %r1442, %r1440, 4, 2;
add.s32 %r1444, %r1443, -32;
shr.u32 %r1445, %r1431, 4;
and.b32 %r1446, %r1434, 48;
or.b32 %r1447, %r1446, %r1445;
add.s32 %r1448, %r1447, -32;
shr.u32 %r1449, %r1433, 4;
and.b32 %r1450, %r1441, 48;
or.b32 %r1451, %r1450, %r1449;
add.s32 %r1452, %r1451, -32;
cvt.rn.f32.s32 %f999, %r1439;
mul.ftz.f32 %f1000, %f800, %f999;
ld.global.nc.f32 %f1001, [%rd14+624];
fma.rn.ftz.f32 %f1002, %f1001, %f1000, %f998;
cvt.rn.f32.s32 %f1003, %r1444;
mul.ftz.f32 %f1004, %f802, %f1003;
ld.global.nc.f32 %f1005, [%rd14+752];
fma.rn.ftz.f32 %f1006, %f1005, %f1004, %f1002;
cvt.rn.f32.s32 %f1007, %r1448;
mul.ftz.f32 %f1008, %f804, %f1007;
ld.global.nc.f32 %f1009, [%rd14+880];
fma.rn.ftz.f32 %f1010, %f1008, %f1009, %f1006;
cvt.rn.f32.s32 %f1011, %r1452;
mul.ftz.f32 %f1012, %f806, %f1011;
ld.global.nc.f32 %f1013, [%rd14+1008];
fma.rn.ftz.f32 %f1014, %f1012, %f1013, %f1010;
ld.global.nc.u8 %rs223, [%rd3+93];
cvt.u32.u16 %r1453, %rs223;
and.b32 %r1454, %r1453, 240;
ld.global.nc.u8 %rs224, [%rd3+125];
cvt.u32.u16 %r1455, %rs224;
and.b32 %r1456, %r1455, 240;
ld.global.nc.u8 %rs225, [%rd3+189];
cvt.u32.u16 %r1457, %rs225;
and.b32 %r1458, %r1457, 252;
and.b32 %r1459, %r1453, 15;
and.b32 %r1460, %r1457, 3;
bfi.b32 %r1461, %r1460, %r1459, 4, 2;
add.s32 %r1462, %r1461, -32;
and.b32 %r1463, %r1455, 15;
shr.u32 %r1464, %r1458, 2;
and.b32 %r1465, %r1464, 3;
bfi.b32 %r1466, %r1465, %r1463, 4, 2;
add.s32 %r1467, %r1466, -32;
shr.u32 %r1468, %r1454, 4;
and.b32 %r1469, %r1457, 48;
or.b32 %r1470, %r1469, %r1468;
add.s32 %r1471, %r1470, -32;
shr.u32 %r1472, %r1456, 4;
and.b32 %r1473, %r1464, 48;
or.b32 %r1474, %r1473, %r1472;
add.s32 %r1475, %r1474, -32;
cvt.rn.f32.s32 %f1015, %r1462;
mul.ftz.f32 %f1016, %f800, %f1015;
ld.global.nc.f32 %f1017, [%rd14+628];
fma.rn.ftz.f32 %f1018, %f1017, %f1016, %f1014;
cvt.rn.f32.s32 %f1019, %r1467;
mul.ftz.f32 %f1020, %f802, %f1019;
ld.global.nc.f32 %f1021, [%rd14+756];
fma.rn.ftz.f32 %f1022, %f1021, %f1020, %f1018;
cvt.rn.f32.s32 %f1023, %r1471;
mul.ftz.f32 %f1024, %f804, %f1023;
ld.global.nc.f32 %f1025, [%rd14+884];
fma.rn.ftz.f32 %f1026, %f1024, %f1025, %f1022;
cvt.rn.f32.s32 %f1027, %r1475;
mul.ftz.f32 %f1028, %f806, %f1027;
ld.global.nc.f32 %f1029, [%rd14+1012];
fma.rn.ftz.f32 %f1030, %f1028, %f1029, %f1026;
ld.global.nc.u8 %rs226, [%rd3+94];
cvt.u32.u16 %r1476, %rs226;
and.b32 %r1477, %r1476, 240;
ld.global.nc.u8 %rs227, [%rd3+126];
cvt.u32.u16 %r1478, %rs227;
and.b32 %r1479, %r1478, 240;
ld.global.nc.u8 %rs228, [%rd3+190];
cvt.u32.u16 %r1480, %rs228;
and.b32 %r1481, %r1480, 252;
and.b32 %r1482, %r1476, 15;
and.b32 %r1483, %r1480, 3;
bfi.b32 %r1484, %r1483, %r1482, 4, 2;
add.s32 %r1485, %r1484, -32;
and.b32 %r1486, %r1478, 15;
shr.u32 %r1487, %r1481, 2;
and.b32 %r1488, %r1487, 3;
bfi.b32 %r1489, %r1488, %r1486, 4, 2;
add.s32 %r1490, %r1489, -32;
shr.u32 %r1491, %r1477, 4;
and.b32 %r1492, %r1480, 48;
or.b32 %r1493, %r1492, %r1491;
add.s32 %r1494, %r1493, -32;
shr.u32 %r1495, %r1479, 4;
and.b32 %r1496, %r1487, 48;
or.b32 %r1497, %r1496, %r1495;
add.s32 %r1498, %r1497, -32;
cvt.rn.f32.s32 %f1031, %r1485;
mul.ftz.f32 %f1032, %f800, %f1031;
ld.global.nc.f32 %f1033, [%rd14+632];
fma.rn.ftz.f32 %f1034, %f1033, %f1032, %f1030;
cvt.rn.f32.s32 %f1035, %r1490;
mul.ftz.f32 %f1036, %f802, %f1035;
ld.global.nc.f32 %f1037, [%rd14+760];
fma.rn.ftz.f32 %f1038, %f1037, %f1036, %f1034;
cvt.rn.f32.s32 %f1039, %r1494;
mul.ftz.f32 %f1040, %f804, %f1039;
ld.global.nc.f32 %f1041, [%rd14+888];
fma.rn.ftz.f32 %f1042, %f1040, %f1041, %f1038;
cvt.rn.f32.s32 %f1043, %r1498;
mul.ftz.f32 %f1044, %f806, %f1043;
ld.global.nc.f32 %f1045, [%rd14+1016];
fma.rn.ftz.f32 %f1046, %f1044, %f1045, %f1042;
ld.global.nc.u8 %rs229, [%rd3+95];
cvt.u32.u16 %r1499, %rs229;
and.b32 %r1500, %r1499, 240;
ld.global.nc.u8 %rs230, [%rd3+127];
cvt.u32.u16 %r1501, %rs230;
and.b32 %r1502, %r1501, 240;
ld.global.nc.u8 %rs231, [%rd3+191];
cvt.u32.u16 %r1503, %rs231;
and.b32 %r1504, %r1503, 252;
and.b32 %r1505, %r1499, 15;
and.b32 %r1506, %r1503, 3;
bfi.b32 %r1507, %r1506, %r1505, 4, 2;
add.s32 %r1508, %r1507, -32;
and.b32 %r1509, %r1501, 15;
shr.u32 %r1510, %r1504, 2;
and.b32 %r1511, %r1510, 3;
bfi.b32 %r1512, %r1511, %r1509, 4, 2;
add.s32 %r1513, %r1512, -32;
shr.u32 %r1514, %r1500, 4;
and.b32 %r1515, %r1503, 48;
or.b32 %r1516, %r1515, %r1514;
add.s32 %r1517, %r1516, -32;
shr.u32 %r1518, %r1502, 4;
and.b32 %r1519, %r1510, 48;
or.b32 %r1520, %r1519, %r1518;
add.s32 %r1521, %r1520, -32;
cvt.rn.f32.s32 %f1047, %r1508;
mul.ftz.f32 %f1048, %f800, %f1047;
ld.global.nc.f32 %f1049, [%rd14+636];
fma.rn.ftz.f32 %f1050, %f1049, %f1048, %f1046;
cvt.rn.f32.s32 %f1051, %r1513;
mul.ftz.f32 %f1052, %f802, %f1051;
ld.global.nc.f32 %f1053, [%rd14+764];
fma.rn.ftz.f32 %f1054, %f1053, %f1052, %f1050;
cvt.rn.f32.s32 %f1055, %r1517;
mul.ftz.f32 %f1056, %f804, %f1055;
ld.global.nc.f32 %f1057, [%rd14+892];
fma.rn.ftz.f32 %f1058, %f1056, %f1057, %f1054;
cvt.rn.f32.s32 %f1059, %r1521;
mul.ftz.f32 %f1060, %f806, %f1059;
ld.global.nc.f32 %f1061, [%rd14+1020];
fma.rn.ftz.f32 %f1063, %f1060, %f1061, %f1058;
add.s32 %r1527, %r1527, 1;
setp.lt.u32 %p8, %r1527, %r2;
@%p8 bra $L__BB1_3;
$L__BB1_13:
cvta.to.global.u64 %rd15, %rd6;
mul.wide.u32 %rd16, %r1, 4;
add.s64 %rd17, %rd15, %rd16;
st.global.f32 [%rd17], %f1063;
$L__BB1_14:
ret;
}