//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: UNKNOWN
// Cuda compilation tools, release 13.2, V13.2.78
// Based on NVVM 7.0.1
//
// Module-level PTX directives (emitted by the compiler; do not hand-edit).
// PTX ISA version this module is written against.
.version 9.2
// Target architecture the module is compiled for.
.target sm_75
// Addresses are 64 bits wide throughout the module.
.address_size 64
// .globl j2k_inverse_dwt_single
// 52-byte, 4-byte-aligned table in the .global state space.
// The mangled name has the Itanium-ABI local-entity form (_ZZ...E...), i.e. a
// static named MEL_EXP declared inside mel_decode_more_runs(MelDecoder&).
// The initializer lists only 49 of the 52 bytes; PTX zero-fills the remaining
// three, so the final 4-byte group is {5, 0, 0, 0}.
// Read as thirteen little-endian 32-bit words the contents are:
//   0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5
// NOTE(review): the 32-bit element width is inferred from the .align 4 and the
// byte pattern only -- confirm against the CUDA source that declares MEL_EXP.
.global .align 4 .b8 _ZZ20mel_decode_more_runsR10MelDecoderE7MEL_EXP[52] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0, 5};
// _ZZ48j2k_idwt_interleave_horizontal_53_multiE11row_samples has been demoted
// _ZZ48j2k_idwt_interleave_horizontal_97_multiE11row_samples has been demoted
// _ZZ35j2k_idwt_vertical_53_multiE14column_samples has been demoted
// _ZZ35j2k_idwt_vertical_97_multiE14column_samples has been demoted
// _ZZ41j2k_idwt_vertical_97_multi_cols4E14column_samples has been demoted
.visible .entry j2k_inverse_dwt_single(
.param .u64 j2k_inverse_dwt_single_param_0,
.param .u64 j2k_inverse_dwt_single_param_1,
.param .u64 j2k_inverse_dwt_single_param_2,
.param .u64 j2k_inverse_dwt_single_param_3,
.param .u64 j2k_inverse_dwt_single_param_4,
.param .u64 j2k_inverse_dwt_single_param_5
)
{
.reg .pred %p<202>;
.reg .f32 %f<209>;
.reg .b32 %r<479>;
.reg .b64 %rd<157>;
ld.param.u64 %rd22, [j2k_inverse_dwt_single_param_0];
ld.param.u64 %rd23, [j2k_inverse_dwt_single_param_1];
ld.param.u64 %rd24, [j2k_inverse_dwt_single_param_2];
ld.param.u64 %rd25, [j2k_inverse_dwt_single_param_3];
ld.param.u64 %rd27, [j2k_inverse_dwt_single_param_4];
ld.param.u64 %rd26, [j2k_inverse_dwt_single_param_5];
cvta.to.global.u64 %rd1, %rd27;
mov.u32 %r226, %tid.x;
mov.u32 %r227, %ctaid.x;
or.b32 %r228, %r226, %r227;
setp.ne.s32 %p13, %r228, 0;
@%p13 bra $L__BB0_146;
cvta.to.global.u64 %rd2, %rd26;
ld.global.u32 %r1, [%rd2+16];
ld.global.u32 %r2, [%rd2+32];
ld.global.u32 %r3, [%rd2+48];
ld.global.u32 %r4, [%rd2+64];
ld.global.u32 %r5, [%rd2+80];
ld.global.u32 %r6, [%rd2+8];
ld.global.u32 %r7, [%rd2];
sub.s32 %r8, %r6, %r7;
ld.global.u32 %r9, [%rd2+12];
ld.global.u32 %r10, [%rd2+4];
sub.s32 %r11, %r9, %r10;
setp.le.u32 %p14, %r9, %r10;
@%p14 bra $L__BB0_63;
ld.global.u32 %r12, [%rd2+20];
ld.global.u32 %r13, [%rd2+36];
ld.global.u32 %r14, [%rd2+52];
ld.global.u32 %r15, [%rd2+68];
ld.global.u32 %r19, [%rd2+28];
ld.global.u32 %r18, [%rd2+24];
add.s32 %r20, %r7, 1;
shr.u32 %r21, %r20, 1;
sub.s32 %r22, %r1, %r21;
ld.global.u32 %r26, [%rd2+44];
ld.global.u32 %r25, [%rd2+40];
shr.u32 %r229, %r7, 1;
sub.s32 %r27, %r2, %r229;
ld.global.u32 %r31, [%rd2+60];
ld.global.u32 %r30, [%rd2+56];
sub.s32 %r32, %r3, %r21;
ld.global.u32 %r36, [%rd2+76];
ld.global.u32 %r35, [%rd2+72];
sub.s32 %r37, %r4, %r229;
setp.le.u32 %p15, %r6, %r7;
@%p15 bra $L__BB0_63;
add.s32 %r230, %r10, 1;
shr.u32 %r231, %r230, 1;
shr.u32 %r232, %r10, 1;
not.b32 %r233, %r7;
add.s32 %r38, %r6, %r233;
and.b32 %r39, %r8, 3;
and.b32 %r40, %r7, 1;
setp.eq.b32 %p1, %r40, 1;
and.b32 %r41, %r20, 1;
add.s32 %r42, %r7, 2;
shr.u32 %r234, %r42, 1;
add.s32 %r43, %r22, %r234;
setp.eq.b32 %p2, %r41, 1;
add.s32 %r44, %r27, %r21;
add.s32 %r45, %r32, %r234;
add.s32 %r46, %r37, %r21;
add.s32 %r47, %r7, 3;
shr.u32 %r235, %r47, 1;
add.s32 %r48, %r22, %r235;
add.s32 %r49, %r27, %r234;
add.s32 %r50, %r32, %r235;
add.s32 %r51, %r37, %r234;
sub.s32 %r52, %r15, %r232;
sub.s32 %r53, %r14, %r232;
sub.s32 %r54, %r13, %r231;
sub.s32 %r55, %r12, %r231;
mov.u32 %r411, %r10;
$L__BB0_4:
and.b32 %r57, %r411, 1;
add.s32 %r236, %r411, 1;
shr.u32 %r237, %r236, 1;
add.s32 %r58, %r55, %r237;
sub.s32 %r238, %r411, %r10;
mul.lo.s32 %r59, %r238, %r8;
sub.s32 %r60, %r59, %r7;
add.s32 %r61, %r54, %r237;
setp.eq.b32 %p4, %r57, 1;
shr.u32 %r239, %r411, 1;
add.s32 %r62, %r53, %r239;
add.s32 %r63, %r52, %r239;
setp.eq.s32 %p16, %r39, 0;
mov.u32 %r430, %r7;
@%p16 bra $L__BB0_23;
or.pred %p19, %p1, %p4;
mov.pred %p20, 0;
xor.pred %p21, %p19, %p20;
not.pred %p22, %p21;
mov.u64 %rd150, %rd22;
mov.u32 %r412, %r1;
mov.u32 %r413, %r12;
mov.u32 %r414, %r18;
mov.u32 %r415, %r19;
mov.u32 %r416, %r1;
mov.u32 %r417, %r58;
@%p22 bra $L__BB0_8;
setp.eq.s32 %p23, %r57, 0;
and.pred %p24, %p23, %p1;
mov.u64 %rd150, %rd23;
mov.u32 %r412, %r2;
mov.u32 %r413, %r13;
mov.u32 %r414, %r25;
mov.u32 %r415, %r26;
mov.u32 %r416, %r2;
mov.u32 %r417, %r61;
@%p24 bra $L__BB0_8;
setp.eq.s32 %p25, %r40, 0;
and.pred %p26, %p25, %p4;
selp.b64 %rd150, %rd24, %rd25, %p26;
selp.b32 %r414, %r30, %r35, %p26;
selp.b32 %r413, %r14, %r15, %p26;
selp.b32 %r412, %r3, %r4, %p26;
selp.b32 %r415, %r31, %r36, %p26;
selp.b32 %r416, %r3, %r4, %p26;
selp.b32 %r417, %r62, %r63, %p26;
$L__BB0_8:
setp.lt.u32 %p27, %r416, %r412;
setp.le.u32 %p28, %r414, %r416;
or.pred %p29, %p27, %p28;
setp.lt.u32 %p30, %r417, %r413;
or.pred %p31, %p29, %p30;
setp.le.u32 %p32, %r415, %r417;
or.pred %p33, %p32, %p31;
mov.f32 %f202, 0f00000000;
@%p33 bra $L__BB0_10;
cvta.to.global.u64 %rd28, %rd150;
sub.s32 %r242, %r414, %r412;
sub.s32 %r243, %r417, %r413;
sub.s32 %r244, %r416, %r412;
mad.lo.s32 %r245, %r243, %r242, %r244;
mul.wide.u32 %rd29, %r245, 4;
add.s64 %rd30, %rd28, %rd29;
ld.global.f32 %f202, [%rd30];
$L__BB0_10:
mul.wide.u32 %rd31, %r59, 4;
add.s64 %rd32, %rd1, %rd31;
st.global.f32 [%rd32], %f202;
setp.eq.s32 %p34, %r39, 1;
mov.u32 %r430, %r20;
@%p34 bra $L__BB0_23;
or.pred %p37, %p2, %p4;
mov.pred %p38, 0;
xor.pred %p39, %p37, %p38;
not.pred %p40, %p39;
mov.u64 %rd151, %rd22;
mov.u32 %r418, %r1;
mov.u32 %r419, %r12;
mov.u32 %r420, %r18;
mov.u32 %r421, %r19;
mov.u32 %r422, %r43;
mov.u32 %r423, %r58;
@%p40 bra $L__BB0_14;
setp.eq.s32 %p41, %r57, 0;
and.pred %p42, %p41, %p2;
mov.u64 %rd151, %rd23;
mov.u32 %r418, %r2;
mov.u32 %r419, %r13;
mov.u32 %r420, %r25;
mov.u32 %r421, %r26;
mov.u32 %r422, %r44;
mov.u32 %r423, %r61;
@%p42 bra $L__BB0_14;
setp.eq.s32 %p43, %r41, 0;
and.pred %p44, %p43, %p4;
selp.b64 %rd151, %rd24, %rd25, %p44;
selp.b32 %r420, %r30, %r35, %p44;
selp.b32 %r419, %r14, %r15, %p44;
selp.b32 %r418, %r3, %r4, %p44;
selp.b32 %r421, %r31, %r36, %p44;
selp.b32 %r422, %r45, %r46, %p44;
selp.b32 %r423, %r62, %r63, %p44;
$L__BB0_14:
setp.lt.u32 %p45, %r422, %r418;
setp.le.u32 %p46, %r420, %r422;
or.pred %p47, %p45, %p46;
setp.lt.u32 %p48, %r423, %r419;
or.pred %p49, %p47, %p48;
setp.le.u32 %p50, %r421, %r423;
or.pred %p51, %p50, %p49;
mov.f32 %f203, 0f00000000;
@%p51 bra $L__BB0_16;
cvta.to.global.u64 %rd33, %rd151;
sub.s32 %r248, %r420, %r418;
sub.s32 %r249, %r423, %r419;
sub.s32 %r250, %r422, %r418;
mad.lo.s32 %r251, %r249, %r248, %r250;
mul.wide.u32 %rd34, %r251, 4;
add.s64 %rd35, %rd33, %rd34;
ld.global.f32 %f203, [%rd35];
$L__BB0_16:
add.s32 %r252, %r59, 1;
mul.wide.u32 %rd36, %r252, 4;
add.s64 %rd37, %rd1, %rd36;
st.global.f32 [%rd37], %f203;
setp.eq.s32 %p52, %r39, 2;
mov.u32 %r430, %r42;
@%p52 bra $L__BB0_23;
and.b32 %r254, %r42, 1;
setp.eq.b32 %p54, %r254, 1;
or.pred %p55, %p54, %p4;
mov.pred %p56, 0;
xor.pred %p57, %p55, %p56;
not.pred %p58, %p57;
mov.u64 %rd152, %rd22;
mov.u32 %r424, %r1;
mov.u32 %r425, %r12;
mov.u32 %r426, %r18;
mov.u32 %r427, %r19;
mov.u32 %r428, %r48;
mov.u32 %r429, %r58;
@%p58 bra $L__BB0_20;
setp.eq.s32 %p59, %r57, 0;
and.pred %p60, %p59, %p1;
mov.u64 %rd152, %rd23;
mov.u32 %r424, %r2;
mov.u32 %r425, %r13;
mov.u32 %r426, %r25;
mov.u32 %r427, %r26;
mov.u32 %r428, %r49;
mov.u32 %r429, %r61;
@%p60 bra $L__BB0_20;
setp.eq.s32 %p61, %r40, 0;
and.pred %p62, %p61, %p4;
selp.b64 %rd152, %rd24, %rd25, %p62;
selp.b32 %r426, %r30, %r35, %p62;
selp.b32 %r425, %r14, %r15, %p62;
selp.b32 %r424, %r3, %r4, %p62;
selp.b32 %r427, %r31, %r36, %p62;
selp.b32 %r428, %r50, %r51, %p62;
selp.b32 %r429, %r62, %r63, %p62;
$L__BB0_20:
setp.lt.u32 %p63, %r428, %r424;
setp.le.u32 %p64, %r426, %r428;
or.pred %p65, %p63, %p64;
setp.lt.u32 %p66, %r429, %r425;
or.pred %p67, %p65, %p66;
setp.le.u32 %p68, %r427, %r429;
or.pred %p69, %p68, %p67;
mov.f32 %f204, 0f00000000;
@%p69 bra $L__BB0_22;
cvta.to.global.u64 %rd38, %rd152;
sub.s32 %r255, %r426, %r424;
sub.s32 %r256, %r429, %r425;
sub.s32 %r257, %r428, %r424;
mad.lo.s32 %r258, %r256, %r255, %r257;
mul.wide.u32 %rd39, %r258, 4;
add.s64 %rd40, %rd38, %rd39;
ld.global.f32 %f204, [%rd40];
$L__BB0_22:
add.s32 %r259, %r59, 2;
mul.wide.u32 %rd41, %r259, 4;
add.s64 %rd42, %rd1, %rd41;
st.global.f32 [%rd42], %f204;
mov.u32 %r430, %r47;
$L__BB0_23:
setp.lt.u32 %p70, %r38, 3;
@%p70 bra $L__BB0_62;
and.b32 %r260, %r430, 1;
or.b32 %r261, %r57, %r260;
setp.eq.s32 %p71, %r57, 0;
setp.eq.b32 %p72, %r260, 1;
setp.eq.s32 %p73, %r260, 0;
and.pred %p5, %p71, %p72;
and.pred %p6, %p73, %p4;
setp.eq.s32 %p7, %r261, 0;
and.pred %p8, %p72, %p4;
$L__BB0_25:
and.b32 %r263, %r430, 1;
setp.eq.b32 %p75, %r263, 1;
or.pred %p76, %p75, %p4;
mov.pred %p77, 0;
xor.pred %p78, %p76, %p77;
not.pred %p79, %p78;
@%p79 bra $L__BB0_31;
bra.uni $L__BB0_26;
$L__BB0_31:
add.s32 %r268, %r430, 1;
shr.u32 %r269, %r268, 1;
add.s32 %r436, %r22, %r269;
mov.u64 %rd153, %rd22;
mov.u32 %r432, %r1;
mov.u32 %r433, %r12;
mov.u32 %r434, %r18;
mov.u32 %r435, %r19;
mov.u32 %r437, %r58;
bra.uni $L__BB0_32;
$L__BB0_26:
@%p5 bra $L__BB0_30;
bra.uni $L__BB0_27;
$L__BB0_30:
shr.u32 %r267, %r430, 1;
add.s32 %r436, %r27, %r267;
mov.u64 %rd153, %rd23;
mov.u32 %r432, %r2;
mov.u32 %r433, %r13;
mov.u32 %r434, %r25;
mov.u32 %r435, %r26;
mov.u32 %r437, %r61;
bra.uni $L__BB0_32;
$L__BB0_27:
@%p6 bra $L__BB0_29;
bra.uni $L__BB0_28;
$L__BB0_29:
add.s32 %r265, %r430, 1;
shr.u32 %r266, %r265, 1;
add.s32 %r436, %r32, %r266;
mov.u64 %rd153, %rd24;
mov.u32 %r432, %r3;
mov.u32 %r433, %r14;
mov.u32 %r434, %r30;
mov.u32 %r435, %r31;
mov.u32 %r437, %r62;
bra.uni $L__BB0_32;
$L__BB0_28:
shr.u32 %r264, %r430, 1;
add.s32 %r436, %r37, %r264;
mov.u64 %rd153, %rd25;
mov.u32 %r432, %r4;
mov.u32 %r433, %r15;
mov.u32 %r434, %r35;
mov.u32 %r435, %r36;
mov.u32 %r437, %r63;
$L__BB0_32:
setp.lt.u32 %p80, %r436, %r432;
setp.le.u32 %p81, %r434, %r436;
or.pred %p82, %p80, %p81;
setp.lt.u32 %p83, %r437, %r433;
or.pred %p84, %p82, %p83;
setp.le.u32 %p85, %r435, %r437;
or.pred %p86, %p85, %p84;
mov.f32 %f205, 0f00000000;
@%p86 bra $L__BB0_34;
cvta.to.global.u64 %rd43, %rd153;
sub.s32 %r270, %r434, %r432;
sub.s32 %r271, %r437, %r433;
sub.s32 %r272, %r436, %r432;
mad.lo.s32 %r273, %r271, %r270, %r272;
mul.wide.u32 %rd44, %r273, 4;
add.s64 %rd45, %rd43, %rd44;
ld.global.f32 %f205, [%rd45];
$L__BB0_34:
add.s32 %r274, %r60, %r430;
mul.wide.u32 %rd46, %r274, 4;
add.s64 %rd47, %rd1, %rd46;
st.global.f32 [%rd47], %f205;
add.s32 %r124, %r430, 1;
and.b32 %r276, %r124, 1;
setp.eq.b32 %p88, %r276, 1;
or.pred %p89, %p88, %p4;
mov.pred %p90, 0;
xor.pred %p91, %p89, %p90;
not.pred %p92, %p91;
@%p92 bra $L__BB0_40;
bra.uni $L__BB0_35;
$L__BB0_40:
add.s32 %r281, %r430, 2;
shr.u32 %r282, %r281, 1;
add.s32 %r442, %r22, %r282;
mov.u64 %rd154, %rd22;
mov.u32 %r438, %r1;
mov.u32 %r439, %r12;
mov.u32 %r440, %r18;
mov.u32 %r441, %r19;
mov.u32 %r443, %r58;
bra.uni $L__BB0_41;
$L__BB0_35:
@%p7 bra $L__BB0_39;
bra.uni $L__BB0_36;
$L__BB0_39:
add.s32 %r406, %r430, 1;
shr.u32 %r280, %r406, 1;
add.s32 %r442, %r27, %r280;
mov.u64 %rd154, %rd23;
mov.u32 %r438, %r2;
mov.u32 %r439, %r13;
mov.u32 %r440, %r25;
mov.u32 %r441, %r26;
mov.u32 %r443, %r61;
bra.uni $L__BB0_41;
$L__BB0_36:
@%p8 bra $L__BB0_38;
bra.uni $L__BB0_37;
$L__BB0_38:
add.s32 %r278, %r430, 2;
shr.u32 %r279, %r278, 1;
add.s32 %r442, %r32, %r279;
mov.u64 %rd154, %rd24;
mov.u32 %r438, %r3;
mov.u32 %r439, %r14;
mov.u32 %r440, %r30;
mov.u32 %r441, %r31;
mov.u32 %r443, %r62;
bra.uni $L__BB0_41;
$L__BB0_37:
add.s32 %r405, %r430, 1;
shr.u32 %r277, %r405, 1;
add.s32 %r442, %r37, %r277;
mov.u64 %rd154, %rd25;
mov.u32 %r438, %r4;
mov.u32 %r439, %r15;
mov.u32 %r440, %r35;
mov.u32 %r441, %r36;
mov.u32 %r443, %r63;
$L__BB0_41:
setp.lt.u32 %p93, %r442, %r438;
setp.le.u32 %p94, %r440, %r442;
or.pred %p95, %p93, %p94;
setp.lt.u32 %p96, %r443, %r439;
or.pred %p97, %p95, %p96;
setp.le.u32 %p98, %r441, %r443;
or.pred %p99, %p98, %p97;
mov.f32 %f206, 0f00000000;
@%p99 bra $L__BB0_43;
cvta.to.global.u64 %rd48, %rd154;
sub.s32 %r283, %r440, %r438;
sub.s32 %r284, %r443, %r439;
sub.s32 %r285, %r442, %r438;
mad.lo.s32 %r286, %r284, %r283, %r285;
mul.wide.u32 %rd49, %r286, 4;
add.s64 %rd50, %rd48, %rd49;
ld.global.f32 %f206, [%rd50];
$L__BB0_43:
add.s32 %r402, %r430, 1;
add.s32 %r287, %r60, %r402;
mul.wide.u32 %rd51, %r287, 4;
add.s64 %rd52, %rd1, %rd51;
st.global.f32 [%rd52], %f206;
add.s32 %r138, %r430, 2;
and.b32 %r289, %r138, 1;
setp.eq.b32 %p101, %r289, 1;
or.pred %p102, %p101, %p4;
mov.pred %p103, 0;
xor.pred %p104, %p102, %p103;
not.pred %p105, %p104;
@%p105 bra $L__BB0_49;
bra.uni $L__BB0_44;
$L__BB0_49:
add.s32 %r294, %r430, 3;
shr.u32 %r295, %r294, 1;
add.s32 %r448, %r22, %r295;
mov.u64 %rd155, %rd22;
mov.u32 %r444, %r1;
mov.u32 %r445, %r12;
mov.u32 %r446, %r18;
mov.u32 %r447, %r19;
mov.u32 %r449, %r58;
bra.uni $L__BB0_50;
$L__BB0_44:
@%p5 bra $L__BB0_48;
bra.uni $L__BB0_45;
$L__BB0_48:
add.s32 %r408, %r430, 2;
shr.u32 %r293, %r408, 1;
add.s32 %r448, %r27, %r293;
mov.u64 %rd155, %rd23;
mov.u32 %r444, %r2;
mov.u32 %r445, %r13;
mov.u32 %r446, %r25;
mov.u32 %r447, %r26;
mov.u32 %r449, %r61;
bra.uni $L__BB0_50;
$L__BB0_45:
@%p6 bra $L__BB0_47;
bra.uni $L__BB0_46;
$L__BB0_47:
add.s32 %r291, %r430, 3;
shr.u32 %r292, %r291, 1;
add.s32 %r448, %r32, %r292;
mov.u64 %rd155, %rd24;
mov.u32 %r444, %r3;
mov.u32 %r445, %r14;
mov.u32 %r446, %r30;
mov.u32 %r447, %r31;
mov.u32 %r449, %r62;
bra.uni $L__BB0_50;
$L__BB0_46:
add.s32 %r407, %r430, 2;
shr.u32 %r290, %r407, 1;
add.s32 %r448, %r37, %r290;
mov.u64 %rd155, %rd25;
mov.u32 %r444, %r4;
mov.u32 %r445, %r15;
mov.u32 %r446, %r35;
mov.u32 %r447, %r36;
mov.u32 %r449, %r63;
$L__BB0_50:
setp.lt.u32 %p106, %r448, %r444;
setp.le.u32 %p107, %r446, %r448;
or.pred %p108, %p106, %p107;
setp.lt.u32 %p109, %r449, %r445;
or.pred %p110, %p108, %p109;
setp.le.u32 %p111, %r447, %r449;
or.pred %p112, %p111, %p110;
mov.f32 %f207, 0f00000000;
@%p112 bra $L__BB0_52;
cvta.to.global.u64 %rd53, %rd155;
sub.s32 %r296, %r446, %r444;
sub.s32 %r297, %r449, %r445;
sub.s32 %r298, %r448, %r444;
mad.lo.s32 %r299, %r297, %r296, %r298;
mul.wide.u32 %rd54, %r299, 4;
add.s64 %rd55, %rd53, %rd54;
ld.global.f32 %f207, [%rd55];
$L__BB0_52:
add.s32 %r403, %r430, 2;
add.s32 %r300, %r60, %r403;
mul.wide.u32 %rd56, %r300, 4;
add.s64 %rd57, %rd1, %rd56;
st.global.f32 [%rd57], %f207;
add.s32 %r152, %r430, 3;
and.b32 %r302, %r152, 1;
setp.eq.b32 %p114, %r302, 1;
or.pred %p115, %p114, %p4;
mov.pred %p116, 0;
xor.pred %p117, %p115, %p116;
not.pred %p118, %p117;
@%p118 bra $L__BB0_58;
bra.uni $L__BB0_53;
$L__BB0_58:
add.s32 %r307, %r430, 4;
shr.u32 %r308, %r307, 1;
add.s32 %r454, %r22, %r308;
mov.u64 %rd156, %rd22;
mov.u32 %r450, %r1;
mov.u32 %r451, %r12;
mov.u32 %r452, %r18;
mov.u32 %r453, %r19;
mov.u32 %r455, %r58;
bra.uni $L__BB0_59;
$L__BB0_53:
@%p7 bra $L__BB0_57;
bra.uni $L__BB0_54;
$L__BB0_57:
add.s32 %r410, %r430, 3;
shr.u32 %r306, %r410, 1;
add.s32 %r454, %r27, %r306;
mov.u64 %rd156, %rd23;
mov.u32 %r450, %r2;
mov.u32 %r451, %r13;
mov.u32 %r452, %r25;
mov.u32 %r453, %r26;
mov.u32 %r455, %r61;
bra.uni $L__BB0_59;
$L__BB0_54:
@%p8 bra $L__BB0_56;
bra.uni $L__BB0_55;
$L__BB0_56:
add.s32 %r304, %r430, 4;
shr.u32 %r305, %r304, 1;
add.s32 %r454, %r32, %r305;
mov.u64 %rd156, %rd24;
mov.u32 %r450, %r3;
mov.u32 %r451, %r14;
mov.u32 %r452, %r30;
mov.u32 %r453, %r31;
mov.u32 %r455, %r62;
bra.uni $L__BB0_59;
$L__BB0_55:
add.s32 %r409, %r430, 3;
shr.u32 %r303, %r409, 1;
add.s32 %r454, %r37, %r303;
mov.u64 %rd156, %rd25;
mov.u32 %r450, %r4;
mov.u32 %r451, %r15;
mov.u32 %r452, %r35;
mov.u32 %r453, %r36;
mov.u32 %r455, %r63;
$L__BB0_59:
setp.lt.u32 %p119, %r454, %r450;
setp.le.u32 %p120, %r452, %r454;
or.pred %p121, %p119, %p120;
setp.lt.u32 %p122, %r455, %r451;
or.pred %p123, %p121, %p122;
setp.le.u32 %p124, %r453, %r455;
or.pred %p125, %p124, %p123;
mov.f32 %f208, 0f00000000;
@%p125 bra $L__BB0_61;
cvta.to.global.u64 %rd58, %rd156;
sub.s32 %r309, %r452, %r450;
sub.s32 %r310, %r455, %r451;
sub.s32 %r311, %r454, %r450;
mad.lo.s32 %r312, %r310, %r309, %r311;
mul.wide.u32 %rd59, %r312, 4;
add.s64 %rd60, %rd58, %rd59;
ld.global.f32 %f208, [%rd60];
$L__BB0_61:
add.s32 %r404, %r430, 3;
add.s32 %r313, %r60, %r404;
mul.wide.u32 %rd61, %r313, 4;
add.s64 %rd62, %rd1, %rd61;
st.global.f32 [%rd62], %f208;
add.s32 %r430, %r430, 4;
setp.lt.u32 %p126, %r430, %r6;
@%p126 bra $L__BB0_25;
$L__BB0_62:
setp.lt.u32 %p127, %r236, %r9;
mov.u32 %r411, %r236;
@%p127 bra $L__BB0_4;
$L__BB0_63:
setp.eq.s32 %p128, %r8, 0;
setp.eq.s32 %p129, %r11, 0;
or.pred %p130, %p128, %p129;
@%p130 bra $L__BB0_146;
and.b32 %r168, %r7, 1;
setp.eq.s32 %p131, %r168, 0;
mov.u32 %r456, 0;
xor.b32 %r169, %r168, 1;
selp.f32 %f15, 0f3F9D7658, 0f3F5019C3, %p131;
selp.f32 %f16, 0f3F5019C3, 0f3F9D7658, %p131;
add.s32 %r315, %r8, %r8;
add.s32 %r316, %r315, -3;
setp.gt.u32 %p132, %r8, 1;
selp.b32 %r317, 1, %r316, %p132;
cvt.u64.u32 %rd13, %r317;
selp.b32 %r170, 2, 1, %p131;
mov.u32 %r318, 2;
add.s32 %r171, %r170, 1;
add.s32 %r319, %r8, -1;
cvt.u64.u32 %rd14, %r319;
and.b32 %r172, %r8, 1;
xor.b32 %r320, %r172, 1;
setp.eq.s32 %p133, %r320, %r168;
and.pred %p9, %p132, %p133;
setp.gt.u32 %p134, %r319, 1;
sub.s32 %r321, %r318, %r8;
add.s32 %r322, %r8, -2;
selp.b32 %r323, %r322, %r321, %p134;
cvt.u64.u32 %rd15, %r323;
cvt.u64.u32 %rd16, %r322;
setp.eq.b32 %p135, %r168, 1;
and.b32 %r324, %r319, 1;
setp.eq.s32 %p136, %r324, %r168;
and.pred %p10, %p132, %p136;
selp.b32 %r173, 2, 1, %p135;
add.s32 %r174, %r173, 1;
setp.eq.s32 %p137, %r172, %r168;
and.pred %p11, %p132, %p137;
setp.eq.s32 %p138, %r324, %r169;
and.pred %p12, %p132, %p138;
shl.b64 %rd64, %rd13, 2;
shl.b64 %rd65, %rd14, 2;
shl.b64 %rd66, %rd15, 2;
shl.b64 %rd67, %rd16, 2;
not.pred %p144, %p9;
not.pred %p148, %p11;
$L__BB0_65:
mul.lo.s32 %r325, %r456, %r8;
mul.wide.u32 %rd63, %r325, 4;
add.s64 %rd17, %rd1, %rd63;
setp.eq.s32 %p139, %r8, 1;
@%p139 bra $L__BB0_114;
bra.uni $L__BB0_66;
$L__BB0_114:
@%p131 bra $L__BB0_116;
ld.global.f32 %f152, [%rd17];
mul.rn.f32 %f153, %f152, 0f3F000000;
st.global.f32 [%rd17], %f153;
bra.uni $L__BB0_116;
$L__BB0_66:
setp.ne.s32 %p140, %r5, 0;
add.s64 %rd18, %rd17, %rd64;
add.s64 %rd19, %rd17, %rd65;
add.s64 %rd20, %rd17, %rd66;
add.s64 %rd21, %rd17, %rd67;
@%p140 bra $L__BB0_81;
bra.uni $L__BB0_67;
$L__BB0_81:
setp.lt.u32 %p149, %r8, 2;
@%p149 bra $L__BB0_84;
mov.u32 %r461, 0;
$L__BB0_83:
mul.wide.u32 %rd80, %r461, 4;
add.s64 %rd81, %rd17, %rd80;
ld.global.f32 %f74, [%rd81];
mul.rn.f32 %f75, %f15, %f74;
st.global.f32 [%rd81], %f75;
ld.global.f32 %f76, [%rd81+4];
mul.rn.f32 %f77, %f16, %f76;
st.global.f32 [%rd81+4], %f77;
add.s32 %r185, %r461, 2;
add.s32 %r329, %r461, 3;
setp.lt.u32 %p150, %r329, %r8;
mov.u32 %r461, %r185;
@%p150 bra $L__BB0_83;
$L__BB0_84:
setp.eq.s32 %p151, %r172, 0;
@%p151 bra $L__BB0_86;
ld.global.f32 %f78, [%rd19];
mul.rn.f32 %f79, %f15, %f78;
st.global.f32 [%rd19], %f79;
$L__BB0_86:
setp.ne.s32 %p152, %r168, 0;
@%p152 bra $L__BB0_88;
ld.global.f32 %f80, [%rd18];
ld.global.f32 %f81, [%rd17+4];
add.rn.f32 %f82, %f81, %f80;
ld.global.f32 %f83, [%rd17];
mov.f32 %f84, 0fBEE31355;
fma.rn.f32 %f85, %f82, %f84, %f83;
st.global.f32 [%rd17], %f85;
$L__BB0_88:
setp.ge.u32 %p153, %r171, %r8;
@%p153 bra $L__BB0_91;
mov.u32 %r462, %r171;
mov.u32 %r463, %r170;
$L__BB0_90:
add.s32 %r330, %r463, -1;
mul.wide.u32 %rd82, %r330, 4;
add.s64 %rd83, %rd17, %rd82;
mul.wide.u32 %rd84, %r462, 4;
add.s64 %rd85, %rd17, %rd84;
ld.global.f32 %f86, [%rd85];
ld.global.f32 %f87, [%rd83];
add.rn.f32 %f88, %f87, %f86;
mul.wide.u32 %rd86, %r463, 4;
add.s64 %rd87, %rd17, %rd86;
ld.global.f32 %f89, [%rd87];
mov.f32 %f90, 0fBEE31355;
fma.rn.f32 %f91, %f88, %f90, %f89;
st.global.f32 [%rd87], %f91;
add.s32 %r188, %r463, 2;
add.s32 %r462, %r463, 3;
setp.lt.u32 %p154, %r462, %r8;
mov.u32 %r463, %r188;
@%p154 bra $L__BB0_90;
$L__BB0_91:
not.pred %p155, %p10;
@%p155 bra $L__BB0_93;
ld.global.f32 %f92, [%rd20];
ld.global.f32 %f93, [%rd21];
add.rn.f32 %f94, %f92, %f93;
ld.global.f32 %f95, [%rd19];
mov.f32 %f96, 0fBEE31355;
fma.rn.f32 %f97, %f94, %f96, %f95;
st.global.f32 [%rd19], %f97;
$L__BB0_93:
setp.ne.s32 %p156, %r169, 0;
@%p156 bra $L__BB0_95;
ld.global.f32 %f98, [%rd18];
ld.global.f32 %f99, [%rd17+4];
add.rn.f32 %f100, %f99, %f98;
ld.global.f32 %f101, [%rd17];
mov.f32 %f102, 0fBF620676;
fma.rn.f32 %f103, %f100, %f102, %f101;
st.global.f32 [%rd17], %f103;
$L__BB0_95:
setp.ge.u32 %p157, %r174, %r8;
@%p157 bra $L__BB0_98;
mov.u32 %r464, %r174;
mov.u32 %r465, %r173;
$L__BB0_97:
add.s32 %r331, %r465, -1;
mul.wide.u32 %rd88, %r331, 4;
add.s64 %rd89, %rd17, %rd88;
mul.wide.u32 %rd90, %r464, 4;
add.s64 %rd91, %rd17, %rd90;
ld.global.f32 %f104, [%rd91];
ld.global.f32 %f105, [%rd89];
add.rn.f32 %f106, %f105, %f104;
mul.wide.u32 %rd92, %r465, 4;
add.s64 %rd93, %rd17, %rd92;
ld.global.f32 %f107, [%rd93];
mov.f32 %f108, 0fBF620676;
fma.rn.f32 %f109, %f106, %f108, %f107;
st.global.f32 [%rd93], %f109;
add.s32 %r192, %r465, 2;
add.s32 %r464, %r465, 3;
setp.lt.u32 %p158, %r464, %r8;
mov.u32 %r465, %r192;
@%p158 bra $L__BB0_97;
$L__BB0_98:
not.pred %p159, %p12;
@%p159 bra $L__BB0_100;
ld.global.f32 %f110, [%rd20];
ld.global.f32 %f111, [%rd21];
add.rn.f32 %f112, %f110, %f111;
ld.global.f32 %f113, [%rd19];
mov.f32 %f114, 0fBF620676;
fma.rn.f32 %f115, %f112, %f114, %f113;
st.global.f32 [%rd19], %f115;
$L__BB0_100:
@%p152 bra $L__BB0_102;
ld.global.f32 %f116, [%rd18];
ld.global.f32 %f117, [%rd17+4];
add.rn.f32 %f118, %f117, %f116;
ld.global.f32 %f119, [%rd17];
mov.f32 %f120, 0f3D5901AE;
fma.rn.f32 %f121, %f118, %f120, %f119;
st.global.f32 [%rd17], %f121;
$L__BB0_102:
@%p153 bra $L__BB0_105;
mov.u32 %r466, %r171;
mov.u32 %r467, %r170;
$L__BB0_104:
add.s32 %r332, %r467, -1;
mul.wide.u32 %rd94, %r332, 4;
add.s64 %rd95, %rd17, %rd94;
mul.wide.u32 %rd96, %r466, 4;
add.s64 %rd97, %rd17, %rd96;
ld.global.f32 %f122, [%rd97];
ld.global.f32 %f123, [%rd95];
add.rn.f32 %f124, %f123, %f122;
mul.wide.u32 %rd98, %r467, 4;
add.s64 %rd99, %rd17, %rd98;
ld.global.f32 %f125, [%rd99];
mov.f32 %f126, 0f3D5901AE;
fma.rn.f32 %f127, %f124, %f126, %f125;
st.global.f32 [%rd99], %f127;
add.s32 %r196, %r467, 2;
add.s32 %r466, %r467, 3;
setp.lt.u32 %p162, %r466, %r8;
mov.u32 %r467, %r196;
@%p162 bra $L__BB0_104;
$L__BB0_105:
@%p155 bra $L__BB0_107;
ld.global.f32 %f128, [%rd20];
ld.global.f32 %f129, [%rd21];
add.rn.f32 %f130, %f128, %f129;
ld.global.f32 %f131, [%rd19];
mov.f32 %f132, 0f3D5901AE;
fma.rn.f32 %f133, %f130, %f132, %f131;
st.global.f32 [%rd19], %f133;
$L__BB0_107:
@%p156 bra $L__BB0_109;
ld.global.f32 %f134, [%rd18];
ld.global.f32 %f135, [%rd17+4];
add.rn.f32 %f136, %f135, %f134;
ld.global.f32 %f137, [%rd17];
mov.f32 %f138, 0f3FCB0673;
fma.rn.f32 %f139, %f136, %f138, %f137;
st.global.f32 [%rd17], %f139;
$L__BB0_109:
@%p157 bra $L__BB0_112;
mov.u32 %r468, %r174;
mov.u32 %r469, %r173;
$L__BB0_111:
add.s32 %r333, %r469, -1;
mul.wide.u32 %rd100, %r333, 4;
add.s64 %rd101, %rd17, %rd100;
mul.wide.u32 %rd102, %r468, 4;
add.s64 %rd103, %rd17, %rd102;
ld.global.f32 %f140, [%rd103];
ld.global.f32 %f141, [%rd101];
add.rn.f32 %f142, %f141, %f140;
mul.wide.u32 %rd104, %r469, 4;
add.s64 %rd105, %rd17, %rd104;
ld.global.f32 %f143, [%rd105];
mov.f32 %f144, 0f3FCB0673;
fma.rn.f32 %f145, %f142, %f144, %f143;
st.global.f32 [%rd105], %f145;
add.s32 %r200, %r469, 2;
add.s32 %r468, %r469, 3;
setp.lt.u32 %p166, %r468, %r8;
mov.u32 %r469, %r200;
@%p166 bra $L__BB0_111;
$L__BB0_112:
@%p159 bra $L__BB0_116;
ld.global.f32 %f146, [%rd20];
ld.global.f32 %f147, [%rd21];
add.rn.f32 %f148, %f146, %f147;
ld.global.f32 %f149, [%rd19];
mov.f32 %f150, 0f3FCB0673;
fma.rn.f32 %f151, %f148, %f150, %f149;
st.global.f32 [%rd19], %f151;
bra.uni $L__BB0_116;
$L__BB0_67:
setp.ne.s32 %p141, %r168, 0;
@%p141 bra $L__BB0_69;
ld.global.f32 %f26, [%rd17];
ld.global.f32 %f27, [%rd18];
ld.global.f32 %f28, [%rd17+4];
add.rn.f32 %f29, %f28, %f27;
mov.f32 %f30, 0f3F000000;
mov.f32 %f31, 0f3E800000;
fma.rn.f32 %f32, %f29, %f31, %f30;
cvt.rmi.f32.f32 %f33, %f32;
sub.rn.f32 %f34, %f26, %f33;
st.global.f32 [%rd17], %f34;
$L__BB0_69:
setp.ge.u32 %p142, %r171, %r8;
@%p142 bra $L__BB0_72;
mov.u32 %r457, %r171;
mov.u32 %r458, %r170;
$L__BB0_71:
mul.wide.u32 %rd68, %r458, 4;
add.s64 %rd69, %rd17, %rd68;
add.s32 %r326, %r458, -1;
mul.wide.u32 %rd70, %r326, 4;
add.s64 %rd71, %rd17, %rd70;
mul.wide.u32 %rd72, %r457, 4;
add.s64 %rd73, %rd17, %rd72;
ld.global.f32 %f35, [%rd73];
ld.global.f32 %f36, [%rd71];
add.rn.f32 %f37, %f36, %f35;
mov.f32 %f38, 0f3F000000;
mov.f32 %f39, 0f3E800000;
fma.rn.f32 %f40, %f37, %f39, %f38;
cvt.rmi.f32.f32 %f41, %f40;
ld.global.f32 %f42, [%rd69];
sub.rn.f32 %f43, %f42, %f41;
st.global.f32 [%rd69], %f43;
add.s32 %r178, %r458, 2;
add.s32 %r457, %r458, 3;
setp.lt.u32 %p143, %r457, %r8;
mov.u32 %r458, %r178;
@%p143 bra $L__BB0_71;
$L__BB0_72:
@%p144 bra $L__BB0_74;
ld.global.f32 %f44, [%rd19];
ld.global.f32 %f45, [%rd21];
ld.global.f32 %f46, [%rd20];
add.rn.f32 %f47, %f46, %f45;
mov.f32 %f48, 0f3F000000;
mov.f32 %f49, 0f3E800000;
fma.rn.f32 %f50, %f47, %f49, %f48;
cvt.rmi.f32.f32 %f51, %f50;
sub.rn.f32 %f52, %f44, %f51;
st.global.f32 [%rd19], %f52;
$L__BB0_74:
setp.ne.s32 %p145, %r169, 0;
@%p145 bra $L__BB0_76;
ld.global.f32 %f53, [%rd17];
ld.global.f32 %f54, [%rd18];
ld.global.f32 %f55, [%rd17+4];
add.rn.f32 %f56, %f55, %f54;
mul.rn.f32 %f57, %f56, 0f3F000000;
cvt.rmi.f32.f32 %f58, %f57;
add.rn.f32 %f59, %f53, %f58;
st.global.f32 [%rd17], %f59;
$L__BB0_76:
setp.ge.u32 %p146, %r174, %r8;
@%p146 bra $L__BB0_79;
mov.u32 %r459, %r174;
mov.u32 %r460, %r173;
$L__BB0_78:
mul.wide.u32 %rd74, %r460, 4;
add.s64 %rd75, %rd17, %rd74;
add.s32 %r327, %r460, -1;
mul.wide.u32 %rd76, %r327, 4;
add.s64 %rd77, %rd17, %rd76;
mul.wide.u32 %rd78, %r459, 4;
add.s64 %rd79, %rd17, %rd78;
ld.global.f32 %f60, [%rd79];
ld.global.f32 %f61, [%rd77];
add.rn.f32 %f62, %f61, %f60;
mul.rn.f32 %f63, %f62, 0f3F000000;
cvt.rmi.f32.f32 %f64, %f63;
ld.global.f32 %f65, [%rd75];
add.rn.f32 %f66, %f65, %f64;
st.global.f32 [%rd75], %f66;
add.s32 %r182, %r460, 2;
add.s32 %r459, %r460, 3;
setp.lt.u32 %p147, %r459, %r8;
mov.u32 %r460, %r182;
@%p147 bra $L__BB0_78;
$L__BB0_79:
@%p148 bra $L__BB0_116;
ld.global.f32 %f67, [%rd19];
ld.global.f32 %f68, [%rd21];
ld.global.f32 %f69, [%rd20];
add.rn.f32 %f70, %f69, %f68;
mul.rn.f32 %f71, %f70, 0f3F000000;
cvt.rmi.f32.f32 %f72, %f71;
add.rn.f32 %f73, %f67, %f72;
st.global.f32 [%rd19], %f73;
$L__BB0_116:
add.s32 %r456, %r456, 1;
setp.lt.u32 %p169, %r456, %r11;
@%p169 bra $L__BB0_65;
and.b32 %r203, %r10, 1;
setp.eq.s32 %p170, %r203, 0;
mov.u32 %r470, 0;
xor.b32 %r204, %r203, 1;
selp.f32 %f17, 0f3F9D7658, 0f3F5019C3, %p170;
selp.f32 %f18, 0f3F5019C3, 0f3F9D7658, %p170;
and.b32 %r205, %r11, 1;
add.s32 %r335, %r11, %r11;
add.s32 %r206, %r335, -3;
add.s32 %r336, %r11, -1;
mul.lo.s32 %r207, %r336, %r8;
$L__BB0_118:
setp.eq.s32 %p171, %r11, 1;
@%p171 bra $L__BB0_143;
bra.uni $L__BB0_119;
$L__BB0_143:
@%p170 bra $L__BB0_145;
mul.wide.u32 %rd148, %r470, 4;
add.s64 %rd149, %rd1, %rd148;
ld.global.f32 %f200, [%rd149];
mul.rn.f32 %f201, %f200, 0f3F000000;
st.global.f32 [%rd149], %f201;
bra.uni $L__BB0_145;
$L__BB0_119:
setp.ne.s32 %p172, %r5, 0;
@%p172 bra $L__BB0_126;
bra.uni $L__BB0_120;
$L__BB0_126:
setp.lt.u32 %p181, %r11, 2;
@%p181 bra $L__BB0_129;
mov.u32 %r474, 0;
mov.u32 %r473, 1;
$L__BB0_128:
mad.lo.s32 %r359, %r474, %r8, %r470;
mul.wide.u32 %rd118, %r359, 4;
add.s64 %rd119, %rd1, %rd118;
ld.global.f32 %f170, [%rd119];
mul.rn.f32 %f171, %f17, %f170;
st.global.f32 [%rd119], %f171;
mad.lo.s32 %r360, %r473, %r8, %r470;
mul.wide.u32 %rd120, %r360, 4;
add.s64 %rd121, %rd1, %rd120;
ld.global.f32 %f172, [%rd121];
mul.rn.f32 %f173, %f18, %f172;
st.global.f32 [%rd121], %f173;
add.s32 %r215, %r474, 2;
add.s32 %r473, %r474, 3;
setp.lt.u32 %p182, %r473, %r11;
mov.u32 %r474, %r215;
@%p182 bra $L__BB0_128;
$L__BB0_129:
setp.eq.s32 %p183, %r205, 0;
@%p183 bra $L__BB0_131;
add.s32 %r361, %r470, %r207;
mul.wide.u32 %rd122, %r361, 4;
add.s64 %rd123, %rd1, %rd122;
ld.global.f32 %f174, [%rd123];
mul.rn.f32 %f175, %f17, %f174;
st.global.f32 [%rd123], %f175;
$L__BB0_131:
setp.ge.u32 %p184, %r203, %r11;
@%p184 bra $L__BB0_134;
mov.u32 %r475, %r203;
$L__BB0_133:
mov.u32 %r362, 1;
sub.s32 %r363, %r362, %r475;
add.s32 %r364, %r475, -1;
setp.gt.u32 %p185, %r475, 1;
selp.b32 %r365, %r364, %r363, %p185;
add.s32 %r366, %r475, 1;
setp.lt.u32 %p186, %r366, %r11;
sub.s32 %r367, %r206, %r475;
selp.b32 %r368, %r366, %r367, %p186;
mad.lo.s32 %r369, %r475, %r8, %r470;
mad.lo.s32 %r370, %r365, %r8, %r470;
mul.wide.u32 %rd124, %r370, 4;
add.s64 %rd125, %rd1, %rd124;
mad.lo.s32 %r371, %r368, %r8, %r470;
mul.wide.u32 %rd126, %r371, 4;
add.s64 %rd127, %rd1, %rd126;
ld.global.f32 %f176, [%rd127];
ld.global.f32 %f177, [%rd125];
add.rn.f32 %f178, %f177, %f176;
mul.wide.u32 %rd128, %r369, 4;
add.s64 %rd129, %rd1, %rd128;
ld.global.f32 %f179, [%rd129];
mov.f32 %f180, 0fBEE31355;
fma.rn.f32 %f181, %f178, %f180, %f179;
st.global.f32 [%rd129], %f181;
add.s32 %r475, %r475, 2;
setp.lt.u32 %p187, %r475, %r11;
@%p187 bra $L__BB0_133;
$L__BB0_134:
setp.ge.u32 %p188, %r204, %r11;
@%p188 bra $L__BB0_137;
mov.u32 %r476, %r204;
$L__BB0_136:
mov.u32 %r372, 1;
sub.s32 %r373, %r372, %r476;
add.s32 %r374, %r476, -1;
setp.gt.u32 %p189, %r476, 1;
selp.b32 %r375, %r374, %r373, %p189;
add.s32 %r376, %r476, 1;
setp.lt.u32 %p190, %r376, %r11;
sub.s32 %r377, %r206, %r476;
selp.b32 %r378, %r376, %r377, %p190;
mad.lo.s32 %r379, %r476, %r8, %r470;
mad.lo.s32 %r380, %r375, %r8, %r470;
mul.wide.u32 %rd130, %r380, 4;
add.s64 %rd131, %rd1, %rd130;
mad.lo.s32 %r381, %r378, %r8, %r470;
mul.wide.u32 %rd132, %r381, 4;
add.s64 %rd133, %rd1, %rd132;
ld.global.f32 %f182, [%rd133];
ld.global.f32 %f183, [%rd131];
add.rn.f32 %f184, %f183, %f182;
mul.wide.u32 %rd134, %r379, 4;
add.s64 %rd135, %rd1, %rd134;
ld.global.f32 %f185, [%rd135];
mov.f32 %f186, 0fBF620676;
fma.rn.f32 %f187, %f184, %f186, %f185;
st.global.f32 [%rd135], %f187;
add.s32 %r476, %r476, 2;
setp.lt.u32 %p191, %r476, %r11;
@%p191 bra $L__BB0_136;
$L__BB0_137:
@%p184 bra $L__BB0_140;
mov.u32 %r477, %r203;
$L__BB0_139:
mov.u32 %r382, 1;
sub.s32 %r383, %r382, %r477;
add.s32 %r384, %r477, -1;
setp.gt.u32 %p193, %r477, 1;
selp.b32 %r385, %r384, %r383, %p193;
add.s32 %r386, %r477, 1;
setp.lt.u32 %p194, %r386, %r11;
sub.s32 %r387, %r206, %r477;
selp.b32 %r388, %r386, %r387, %p194;
mad.lo.s32 %r389, %r477, %r8, %r470;
mad.lo.s32 %r390, %r385, %r8, %r470;
mul.wide.u32 %rd136, %r390, 4;
add.s64 %rd137, %rd1, %rd136;
mad.lo.s32 %r391, %r388, %r8, %r470;
mul.wide.u32 %rd138, %r391, 4;
add.s64 %rd139, %rd1, %rd138;
ld.global.f32 %f188, [%rd139];
ld.global.f32 %f189, [%rd137];
add.rn.f32 %f190, %f189, %f188;
mul.wide.u32 %rd140, %r389, 4;
add.s64 %rd141, %rd1, %rd140;
ld.global.f32 %f191, [%rd141];
mov.f32 %f192, 0f3D5901AE;
fma.rn.f32 %f193, %f190, %f192, %f191;
st.global.f32 [%rd141], %f193;
add.s32 %r477, %r477, 2;
setp.lt.u32 %p195, %r477, %r11;
@%p195 bra $L__BB0_139;
$L__BB0_140:
@%p188 bra $L__BB0_145;
mov.u32 %r478, %r204;
$L__BB0_142:
mov.u32 %r392, 1;
sub.s32 %r393, %r392, %r478;
add.s32 %r394, %r478, -1;
setp.gt.u32 %p197, %r478, 1;
selp.b32 %r395, %r394, %r393, %p197;
add.s32 %r396, %r478, 1;
setp.lt.u32 %p198, %r396, %r11;
sub.s32 %r397, %r206, %r478;
selp.b32 %r398, %r396, %r397, %p198;
mad.lo.s32 %r399, %r478, %r8, %r470;
mad.lo.s32 %r400, %r395, %r8, %r470;
mul.wide.u32 %rd142, %r400, 4;
add.s64 %rd143, %rd1, %rd142;
mad.lo.s32 %r401, %r398, %r8, %r470;
mul.wide.u32 %rd144, %r401, 4;
add.s64 %rd145, %rd1, %rd144;
ld.global.f32 %f194, [%rd145];
ld.global.f32 %f195, [%rd143];
add.rn.f32 %f196, %f195, %f194;
mul.wide.u32 %rd146, %r399, 4;
add.s64 %rd147, %rd1, %rd146;
ld.global.f32 %f197, [%rd147];
mov.f32 %f198, 0f3FCB0673;
fma.rn.f32 %f199, %f196, %f198, %f197;
st.global.f32 [%rd147], %f199;
add.s32 %r478, %r478, 2;
setp.lt.u32 %p199, %r478, %r11;
@%p199 bra $L__BB0_142;
bra.uni $L__BB0_145;
$L__BB0_120:
setp.ge.u32 %p173, %r203, %r11;
@%p173 bra $L__BB0_123;
mov.u32 %r471, %r203;
$L__BB0_122:
mov.u32 %r337, 1;
sub.s32 %r338, %r337, %r471;
add.s32 %r339, %r471, -1;
setp.gt.u32 %p174, %r471, 1;
selp.b32 %r340, %r339, %r338, %p174;
add.s32 %r341, %r471, 1;
setp.lt.u32 %p175, %r341, %r11;
sub.s32 %r342, %r206, %r471;
selp.b32 %r343, %r341, %r342, %p175;
mad.lo.s32 %r344, %r471, %r8, %r470;
mad.lo.s32 %r345, %r340, %r8, %r470;
mul.wide.u32 %rd106, %r345, 4;
add.s64 %rd107, %rd1, %rd106;
mad.lo.s32 %r346, %r343, %r8, %r470;
mul.wide.u32 %rd108, %r346, 4;
add.s64 %rd109, %rd1, %rd108;
mul.wide.u32 %rd110, %r344, 4;
add.s64 %rd111, %rd1, %rd110;
ld.global.f32 %f154, [%rd109];
ld.global.f32 %f155, [%rd107];
add.rn.f32 %f156, %f155, %f154;
mov.f32 %f157, 0f3F000000;
mov.f32 %f158, 0f3E800000;
fma.rn.f32 %f159, %f156, %f158, %f157;
cvt.rmi.f32.f32 %f160, %f159;
ld.global.f32 %f161, [%rd111];
sub.rn.f32 %f162, %f161, %f160;
st.global.f32 [%rd111], %f162;
add.s32 %r471, %r471, 2;
setp.lt.u32 %p176, %r471, %r11;
@%p176 bra $L__BB0_122;
$L__BB0_123:
setp.ge.u32 %p177, %r204, %r11;
@%p177 bra $L__BB0_145;
mov.u32 %r472, %r204;
$L__BB0_125:
mov.u32 %r347, 1;
sub.s32 %r348, %r347, %r472;
add.s32 %r349, %r472, -1;
setp.gt.u32 %p178, %r472, 1;
selp.b32 %r350, %r349, %r348, %p178;
add.s32 %r351, %r472, 1;
setp.lt.u32 %p179, %r351, %r11;
sub.s32 %r352, %r206, %r472;
selp.b32 %r353, %r351, %r352, %p179;
mad.lo.s32 %r354, %r472, %r8, %r470;
mad.lo.s32 %r355, %r350, %r8, %r470;
mul.wide.u32 %rd112, %r355, 4;
add.s64 %rd113, %rd1, %rd112;
mad.lo.s32 %r356, %r353, %r8, %r470;
mul.wide.u32 %rd114, %r356, 4;
add.s64 %rd115, %rd1, %rd114;
mul.wide.u32 %rd116, %r354, 4;
add.s64 %rd117, %rd1, %rd116;
ld.global.f32 %f163, [%rd115];
ld.global.f32 %f164, [%rd113];
add.rn.f32 %f165, %f164, %f163;
mul.rn.f32 %f166, %f165, 0f3F000000;
cvt.rmi.f32.f32 %f167, %f166;
ld.global.f32 %f168, [%rd117];
add.rn.f32 %f169, %f168, %f167;
st.global.f32 [%rd117], %f169;
add.s32 %r472, %r472, 2;
setp.lt.u32 %p180, %r472, %r11;
@%p180 bra $L__BB0_125;
$L__BB0_145:
add.s32 %r470, %r470, 1;
setp.lt.u32 %p201, %r470, %r8;
@%p201 bra $L__BB0_118;
$L__BB0_146:
ret;
}
// .globl j2k_idwt_interleave
// ---------------------------------------------------------------------------
// Kernel j2k_idwt_interleave
//
// One thread per output sample. The thread derives a local (x, y) from the
// 2-D launch grid, picks one of four source buffers from the parity of the
// absolute coordinate, fetches the matching f32 sample (0.0 when the computed
// position is outside that source's rectangle) and stores it to the output.
// NOTE(review): the four sources are presumably the LL/HL/LH/HH sub-bands of
// one wavelet level (inferred from the kernel name) - confirm against the
// CUDA source.
//
// Parameters (64-bit addresses, each converted with cvta.to.global before use):
//   param_0  source read when x and y are both even
//   param_1  source read when x is odd and y is even
//   param_2  source read when x is even and y is odd
//   param_3  source read when x and y are both odd
//   param_4  output; element index = y_local * width + x_local
//   param_5  descriptor of u32 words:
//              +0 x0, +4 y0, +8 x1, +12 y1   (width = x1-x0, height = y1-y0)
//              +16, +32, +48, +64            {x0, y0, x1, y1} rectangle of
//                                            source 0, 1, 2, 3 respectively
// All index comparisons are unsigned; element offsets are 32-bit values
// zero-extended by mul.wide.u32.
// ---------------------------------------------------------------------------
.visible .entry j2k_idwt_interleave(
.param .u64 j2k_idwt_interleave_param_0,
.param .u64 j2k_idwt_interleave_param_1,
.param .u64 j2k_idwt_interleave_param_2,
.param .u64 j2k_idwt_interleave_param_3,
.param .u64 j2k_idwt_interleave_param_4,
.param .u64 j2k_idwt_interleave_param_5
)
{
.reg .pred %p<23>;
.reg .f32 %f<5>;
.reg .b32 %r<90>;
.reg .b64 %rd<16>;
// %rd3/%rd4/%rd5/%rd15 = sources 0..3, %rd7 = output, %rd8 = descriptor.
ld.param.u64 %rd3, [j2k_idwt_interleave_param_0];
ld.param.u64 %rd4, [j2k_idwt_interleave_param_1];
ld.param.u64 %rd5, [j2k_idwt_interleave_param_2];
ld.param.u64 %rd15, [j2k_idwt_interleave_param_3];
ld.param.u64 %rd7, [j2k_idwt_interleave_param_4];
ld.param.u64 %rd8, [j2k_idwt_interleave_param_5];
cvta.to.global.u64 %rd1, %rd8;
// Pre-load the y-origin of each source rectangle (%r1, %r2, %r3, %r84).
// %r84 and %rd15 already hold the source-3 values, so the "both odd" path
// below does not need to reassign them.
ld.global.u32 %r1, [%rd1+20];
ld.global.u32 %r2, [%rd1+36];
ld.global.u32 %r3, [%rd1+52];
ld.global.u32 %r84, [%rd1+68];
// %r6 = width, %r52 = height of the output rectangle; %r5 = x0, %r88 = y0.
ld.global.u32 %r50, [%rd1+8];
ld.global.u32 %r5, [%rd1];
sub.s32 %r6, %r50, %r5;
ld.global.u32 %r51, [%rd1+12];
ld.global.u32 %r88, [%rd1+4];
sub.s32 %r52, %r51, %r88;
// %r8 = local x = ctaid.x * ntid.x + tid.x
mov.u32 %r53, %ntid.x;
mov.u32 %r54, %ctaid.x;
mov.u32 %r55, %tid.x;
mad.lo.s32 %r8, %r54, %r53, %r55;
// %r9 = local y = ctaid.y * ntid.y + tid.y
mov.u32 %r56, %ntid.y;
mov.u32 %r57, %ctaid.y;
mov.u32 %r58, %tid.y;
mad.lo.s32 %r9, %r57, %r56, %r58;
// Threads outside the width x height rectangle do nothing.
setp.ge.u32 %p1, %r8, %r6;
setp.ge.u32 %p2, %r9, %r52;
or.pred %p3, %p1, %p2;
@%p3 bra $L__BB1_11;
// %r10 = absolute x, %r11 = x parity; %r89 = absolute y, %r13 = y parity.
add.s32 %r10, %r5, %r8;
and.b32 %r11, %r10, 1;
add.s32 %r89, %r88, %r9;
and.b32 %r13, %r89, 1;
// %p9 = (x even AND y even): neither parity bit is set.
setp.eq.b32 %p4, %r11, 1;
setp.eq.b32 %p5, %r13, 1;
or.pred %p6, %p5, %p4;
mov.pred %p7, 0;
xor.pred %p8, %p6, %p7;
not.pred %p9, %p8;
@%p9 bra $L__BB1_7;
bra.uni $L__BB1_2;
$L__BB1_7:
// x even, y even -> source 0 (param_0), rectangle at desc+16..+28.
// %r83 = rect x0, %r85 = rect x1, %r86 = rect y1, %r84 = rect y0.
ld.global.u32 %r86, [%rd1+28];
ld.global.u32 %r85, [%rd1+24];
ld.global.u32 %r83, [%rd1+16];
// column %r87 = rect.x0 + ((x + 1) >> 1) - ((x0 + 1) >> 1)
add.s32 %r70, %r10, 1;
shr.u32 %r71, %r70, 1;
add.s32 %r72, %r5, 1;
shr.u32 %r73, %r72, 1;
sub.s32 %r74, %r71, %r73;
add.s32 %r87, %r74, %r83;
// Bias y0 and y by one so the shared ">> 1" at $L__BB1_8 rounds up.
add.s32 %r88, %r88, 1;
add.s32 %r89, %r89, 1;
mov.u64 %rd15, %rd3;
mov.u32 %r84, %r1;
bra.uni $L__BB1_8;
$L__BB1_2:
// %p12 = (y even AND x odd)
setp.eq.s32 %p10, %r13, 0;
setp.ne.s32 %p11, %r11, 0;
and.pred %p12, %p10, %p11;
@%p12 bra $L__BB1_6;
bra.uni $L__BB1_3;
$L__BB1_6:
// x odd, y even -> source 1 (param_1), rectangle at desc+32..+44.
ld.global.u32 %r86, [%rd1+44];
ld.global.u32 %r85, [%rd1+40];
ld.global.u32 %r83, [%rd1+32];
// column %r87 = rect.x0 + (x >> 1) - (x0 >> 1)
shr.u32 %r67, %r5, 1;
shr.u32 %r68, %r10, 1;
sub.s32 %r69, %r68, %r67;
add.s32 %r87, %r69, %r83;
// y is halved rounding up, as in the source-0 path.
add.s32 %r88, %r88, 1;
add.s32 %r89, %r89, 1;
mov.u64 %rd15, %rd4;
mov.u32 %r84, %r2;
bra.uni $L__BB1_8;
$L__BB1_3:
// %p15 = (x even AND y odd)
setp.ne.s32 %p13, %r13, 0;
setp.eq.s32 %p14, %r11, 0;
and.pred %p15, %p14, %p13;
@%p15 bra $L__BB1_5;
bra.uni $L__BB1_4;
$L__BB1_5:
// x even, y odd -> source 2 (param_2), rectangle at desc+48..+60.
ld.global.u32 %r86, [%rd1+60];
ld.global.u32 %r85, [%rd1+56];
ld.global.u32 %r83, [%rd1+48];
// column rounds up; y keeps its unbiased value, so it is halved rounding down.
add.s32 %r62, %r10, 1;
shr.u32 %r63, %r62, 1;
add.s32 %r64, %r5, 1;
shr.u32 %r65, %r64, 1;
sub.s32 %r66, %r63, %r65;
add.s32 %r87, %r66, %r83;
mov.u64 %rd15, %rd5;
mov.u32 %r84, %r3;
bra.uni $L__BB1_8;
$L__BB1_4:
// x odd, y odd -> source 3 (param_3, still in %rd15), rectangle at
// desc+64..+76 (%r84 was pre-loaded from desc+68). Both axes round down.
ld.global.u32 %r86, [%rd1+76];
ld.global.u32 %r85, [%rd1+72];
ld.global.u32 %r83, [%rd1+64];
shr.u32 %r59, %r5, 1;
shr.u32 %r60, %r10, 1;
sub.s32 %r61, %r60, %r59;
add.s32 %r87, %r61, %r83;
$L__BB1_8:
// Shared tail. row %r46 = rect.y0 - (y0' >> 1) + (y' >> 1), where y0'/y' are
// the possibly +1-biased values prepared by the selected path.
shr.u32 %r75, %r88, 1;
sub.s32 %r76, %r84, %r75;
shr.u32 %r77, %r89, 1;
add.s32 %r46, %r76, %r77;
// %p22 = (column, row) is outside [rect.x0, rect.x1) x [rect.y0, rect.y1).
setp.lt.u32 %p16, %r87, %r83;
setp.le.u32 %p17, %r85, %r87;
or.pred %p18, %p16, %p17;
setp.lt.u32 %p19, %r46, %r84;
or.pred %p20, %p18, %p19;
setp.le.u32 %p21, %r86, %r46;
or.pred %p22, %p21, %p20;
// Default result is 0.0 when the position is outside the source rectangle.
mov.f32 %f4, 0f00000000;
@%p22 bra $L__BB1_10;
// In range: element index = (row - rect.y0) * (rect.x1 - rect.x0)
//                           + (column - rect.x0)
cvta.to.global.u64 %rd9, %rd15;
sub.s32 %r78, %r85, %r83;
sub.s32 %r79, %r46, %r84;
sub.s32 %r80, %r87, %r83;
mad.lo.s32 %r81, %r79, %r78, %r80;
mul.wide.u32 %rd10, %r81, 4;
add.s64 %rd11, %rd9, %rd10;
ld.global.f32 %f4, [%rd11];
$L__BB1_10:
// output[y_local * width + x_local] = sample
mad.lo.s32 %r82, %r6, %r9, %r8;
cvta.to.global.u64 %rd12, %rd7;
mul.wide.u32 %rd13, %r82, 4;
add.s64 %rd14, %rd12, %rd13;
st.global.f32 [%rd14], %f4;
$L__BB1_11:
ret;
}
// .globl j2k_idwt_interleave_multi
// ---------------------------------------------------------------------------
// Kernel j2k_idwt_interleave_multi
//
// Batched variant: ctaid.z selects a 128-byte job record from the array at
// param_0; within the job, one thread per output sample
// (x = ctaid.x*ntid.x+tid.x, y = ctaid.y*ntid.y+tid.y). The thread selects a
// source by the parity of its absolute coordinate, reads the f32 sample (0.0
// if the computed position is outside that source's rectangle) and stores it
// at dest[y_local * width + x_local].
// NOTE(review): the four sources are presumably the LL/HL/LH/HH sub-bands
// (inferred from the kernel name) - confirm against the CUDA source.
//
// Parameter:
//   param_0  address of the job-record array (converted with cvta.to.global)
//
// Job record fields as read here (byte offsets):
//   +0, +8, +16, +24   u64 source pointers 0..3, used for
//                      (x even,y even), (x odd,y even), (x even,y odd),
//                      (x odd,y odd)
//   +32                u64 destination pointer
//   +40 {x0, y0}, +48 {x1, y1}       output rectangle (u32 pairs)
//   +56 / +64          source 0 rectangle {x0, y0} / {x1, y1}
//   +72 / +80          source 1 rectangle
//   +88 / +96          source 2 rectangle
//   +104 / +112        source 3 rectangle
// Sample loads and the final store use ld.f32 / st.f32 without a state space,
// i.e. the pointers stored in the record are used as generic addresses.
// ---------------------------------------------------------------------------
.visible .entry j2k_idwt_interleave_multi(
.param .u64 j2k_idwt_interleave_multi_param_0
)
{
.reg .pred %p<23>;
.reg .f32 %f<5>;
.reg .b32 %r<119>;
.reg .b64 %rd<17>;
ld.param.u64 %rd8, [j2k_idwt_interleave_multi_param_0];
cvta.to.global.u64 %rd9, %rd8;
// %rd11 = &jobs[ctaid.z] (128 bytes per record); %rd1 = %rd11 + 112, the
// base used for the negative-offset loads below.
mov.u32 %r58, %ctaid.z;
mul.wide.u32 %rd10, %r58, 128;
add.s64 %rd11, %rd9, %rd10;
add.s64 %rd1, %rd11, 112;
// Rectangle origins of source 3 (%r112,%r113), 2 (%r61,%r62), 1 (%r63,%r64)
// and 0 (%r65,%r66); output end (%r67,%r68) and origin (%r71,%r117).
ld.global.v2.u32 {%r112, %r113}, [%rd11+104];
ld.global.v2.u32 {%r61, %r62}, [%rd11+88];
ld.global.v2.u32 {%r63, %r64}, [%rd11+72];
ld.global.v2.u32 {%r65, %r66}, [%rd11+56];
ld.global.v2.u32 {%r67, %r68}, [%rd11+48];
ld.global.v2.u32 {%r71, %r117}, [%rd11+40];
// %r15 = width, %r73 = height.
sub.s32 %r15, %r67, %r71;
sub.s32 %r73, %r68, %r117;
// %r16 = local x, %r17 = local y.
mov.u32 %r74, %ntid.x;
mov.u32 %r75, %ctaid.x;
mov.u32 %r76, %tid.x;
mad.lo.s32 %r16, %r75, %r74, %r76;
mov.u32 %r77, %ntid.y;
mov.u32 %r78, %ctaid.y;
mov.u32 %r79, %tid.y;
mad.lo.s32 %r17, %r78, %r77, %r79;
// Threads outside the output rectangle exit (unsigned compare).
setp.ge.u32 %p1, %r16, %r15;
setp.ge.u32 %p2, %r17, %r73;
or.pred %p3, %p2, %p1;
@%p3 bra $L__BB2_11;
// %rd2 = destination pointer (record + 32).
ld.global.u64 %rd2, [%rd1+-80];
// %r18 = absolute x, %r19 = x parity; %r118 = absolute y, %r21 = y parity.
add.s32 %r18, %r71, %r16;
and.b32 %r19, %r18, 1;
add.s32 %r118, %r117, %r17;
and.b32 %r21, %r118, 1;
// %p9 = (x even AND y even)
setp.eq.b32 %p4, %r21, 1;
setp.eq.b32 %p5, %r19, 1;
or.pred %p6, %p5, %p4;
mov.pred %p7, 0;
xor.pred %p8, %p6, %p7;
not.pred %p9, %p8;
@%p9 bra $L__BB2_7;
bra.uni $L__BB2_2;
$L__BB2_7:
// x even, y even -> source 0: rectangle end from record+64, pointer from
// record+0. column %r116 = rect.x0 - ((x0 + 1) >> 1) + ((x + 1) >> 1).
ld.global.v2.u32 {%r114, %r115}, [%rd1+-48];
ld.global.u64 %rd16, [%rd1+-112];
add.s32 %r99, %r18, 1;
shr.u32 %r100, %r99, 1;
add.s32 %r101, %r71, 1;
shr.u32 %r102, %r101, 1;
sub.s32 %r103, %r65, %r102;
add.s32 %r116, %r103, %r100;
// Bias y0 and y by one so the shared ">> 1" at $L__BB2_8 rounds up.
add.s32 %r117, %r117, 1;
add.s32 %r118, %r118, 1;
mov.u32 %r112, %r65;
mov.u32 %r113, %r66;
bra.uni $L__BB2_8;
$L__BB2_2:
// %p12 = (y even AND x odd)
setp.eq.s32 %p10, %r21, 0;
setp.ne.s32 %p11, %r19, 0;
and.pred %p12, %p10, %p11;
@%p12 bra $L__BB2_6;
bra.uni $L__BB2_3;
$L__BB2_6:
// x odd, y even -> source 1: rectangle end from record+80, pointer from
// record+8. column = rect.x0 - (x0 >> 1) + (x >> 1); y rounds up.
ld.global.v2.u32 {%r114, %r115}, [%rd1+-32];
ld.global.u64 %rd16, [%rd1+-104];
shr.u32 %r94, %r71, 1;
sub.s32 %r95, %r63, %r94;
shr.u32 %r96, %r18, 1;
add.s32 %r116, %r95, %r96;
add.s32 %r117, %r117, 1;
add.s32 %r118, %r118, 1;
mov.u32 %r112, %r63;
mov.u32 %r113, %r64;
bra.uni $L__BB2_8;
$L__BB2_3:
// %p15 = (x even AND y odd)
setp.ne.s32 %p13, %r21, 0;
setp.eq.s32 %p14, %r19, 0;
and.pred %p15, %p14, %p13;
@%p15 bra $L__BB2_5;
bra.uni $L__BB2_4;
$L__BB2_5:
// x even, y odd -> source 2: rectangle end from record+96, pointer from
// record+16. column rounds up; y is left unbiased, so it rounds down.
ld.global.v2.u32 {%r114, %r115}, [%rd1+-16];
ld.global.u64 %rd16, [%rd1+-96];
add.s32 %r87, %r18, 1;
shr.u32 %r88, %r87, 1;
add.s32 %r89, %r71, 1;
shr.u32 %r90, %r89, 1;
sub.s32 %r91, %r61, %r90;
add.s32 %r116, %r91, %r88;
mov.u32 %r112, %r61;
mov.u32 %r113, %r62;
bra.uni $L__BB2_8;
$L__BB2_4:
// x odd, y odd -> source 3: rectangle end from record+112, pointer from
// record+24; origin (%r112, %r113) was loaded up front. Both axes round down.
ld.global.v2.u32 {%r114, %r115}, [%rd1];
ld.global.u64 %rd16, [%rd1+-88];
shr.u32 %r82, %r71, 1;
sub.s32 %r83, %r112, %r82;
shr.u32 %r84, %r18, 1;
add.s32 %r116, %r83, %r84;
$L__BB2_8:
// Shared tail. row %r54 = rect.y0 - (y0' >> 1) + (y' >> 1).
shr.u32 %r104, %r117, 1;
sub.s32 %r105, %r113, %r104;
shr.u32 %r106, %r118, 1;
add.s32 %r54, %r105, %r106;
// %p22 = (column, row) is outside [rect.x0, rect.x1) x [rect.y0, rect.y1).
setp.lt.u32 %p16, %r116, %r112;
setp.le.u32 %p17, %r114, %r116;
or.pred %p18, %p16, %p17;
setp.lt.u32 %p19, %r54, %r113;
or.pred %p20, %p18, %p19;
setp.le.u32 %p21, %r115, %r54;
or.pred %p22, %p21, %p20;
// Default result is 0.0 when the position is outside the source rectangle.
mov.f32 %f4, 0f00000000;
@%p22 bra $L__BB2_10;
// In range: index = (row - rect.y0) * (rect.x1 - rect.x0) + (column - rect.x0)
sub.s32 %r107, %r114, %r112;
sub.s32 %r108, %r54, %r113;
sub.s32 %r109, %r116, %r112;
mad.lo.s32 %r110, %r108, %r107, %r109;
mul.wide.u32 %rd12, %r110, 4;
add.s64 %rd13, %rd16, %rd12;
ld.f32 %f4, [%rd13];
$L__BB2_10:
// dest[y_local * width + x_local] = sample
mad.lo.s32 %r111, %r15, %r17, %r16;
mul.wide.u32 %rd14, %r111, 4;
add.s64 %rd15, %rd2, %rd14;
st.f32 [%rd15], %f4;
$L__BB2_11:
ret;
}
// .globl j2k_idwt_interleave_horizontal_multi
// ---------------------------------------------------------------------------
// Kernel j2k_idwt_interleave_horizontal_multi
//
// One thread per output ROW. ctaid.y selects a 128-byte job record from the
// array at param_0; ctaid.x*ntid.x+tid.x selects the row inside that job.
// Each thread works alone on its row (no barriers, no shared memory):
//
//   Phase 1 (gather): for every x in the row, select one of four sources by
//     the parity of the absolute (x, y), read its f32 sample (0.0 if the
//     computed position is outside that source's rectangle) and store it to
//     dest[row * width + x]. The loop is unrolled by two, plus a remainder.
//   Phase 2 (lift): filter the row in place.
//     - width == 1: the sample is halved when x0 is odd, otherwise untouched.
//     - record+120 == 0: two steps built on floor(): "a -= floor(0.25*(l+r)
//       + 0.5)" on samples at even absolute x, then "a += floor(0.5*(l+r))"
//       on samples at odd absolute x.
//     - record+120 != 0: per-sample scaling (~1.230174 / ~0.812893) followed
//       by four "a += c*(l+r)" steps with c ~ -0.443507, -0.882911,
//       +0.052980, +1.586134.
//     At the row ends the missing neighbour is replaced by a mirrored sample.
// NOTE(review): these look like the JPEG 2000 reversible 5/3 and irreversible
// 9/7 inverse lifting schemes with whole-sample symmetric extension - confirm
// against the CUDA source / the standard.
//
// Parameter:
//   param_0  address of the job-record array (converted with cvta.to.global)
//
// Job record fields as read here (byte offsets):
//   +0, +8, +16, +24   u64 source pointers 0..3, used for
//                      (x even,y even), (x odd,y even), (x even,y odd),
//                      (x odd,y odd)
//   +32                u64 destination pointer
//   +40 {x0, y0}, +48 {x1, y1}       output rectangle (u32 pairs)
//   +56 / +64, +72 / +80, +88 / +96, +104 / +112
//                      rectangle {x0, y0} / {x1, y1} of source 0, 1, 2, 3
//   +120               u32 filter selector (0 = floor-based integer lifting)
// Sample accesses use ld.f32 / st.f32 without a state space (generic
// addressing of the pointers stored in the record).
//
// FIX relative to the compiler output: a job with width == 0 now returns
// immediately. Previously it skipped the gather but still entered the lifting
// code, where the boundary step computed the mirror index 2*width-3
// (0xFFFFFFFD), read row[0xFFFFFFFD] and row[1] and wrote row[0] - all outside
// the (empty) row. This file is compiler-generated (see file header); apply
// the same guard in the originating source so it survives regeneration.
// ---------------------------------------------------------------------------
.visible .entry j2k_idwt_interleave_horizontal_multi(
.param .u64 j2k_idwt_interleave_horizontal_multi_param_0
)
{
.reg .pred %p<118>;
.reg .f32 %f<143>;
.reg .b32 %r<311>;
.reg .b64 %rd<95>;
ld.param.u64 %rd13, [j2k_idwt_interleave_horizontal_multi_param_0];
cvta.to.global.u64 %rd14, %rd13;
// %rd16 = &jobs[ctaid.y]; %rd1 = %rd16 + 120 (base for negative-offset loads).
mov.u32 %r143, %ctaid.y;
mul.wide.u32 %rd15, %r143, 128;
add.s64 %rd16, %rd14, %rd15;
add.s64 %rd1, %rd16, 120;
// Rectangle origins of source 3 (%r291,%r292), 2 (%r146,%r147),
// 1 (%r148,%r149), 0 (%r150,%r151); output end (%r152,%r153) and origin
// (%r155 = x0, %r296 = y0); source pointers 3..0 in %rd94, %rd3, %rd4, %rd5.
ld.global.v2.u32 {%r291, %r292}, [%rd16+104];
ld.global.v2.u32 {%r146, %r147}, [%rd16+88];
ld.global.v2.u32 {%r148, %r149}, [%rd16+72];
ld.global.v2.u32 {%r150, %r151}, [%rd16+56];
ld.global.v2.u32 {%r152, %r153}, [%rd16+48];
ld.global.v2.u32 {%r155, %r296}, [%rd16+40];
ld.global.u64 %rd94, [%rd16+24];
ld.global.u64 %rd3, [%rd16+16];
ld.global.u64 %rd4, [%rd16+8];
ld.global.u64 %rd5, [%rd16];
// %r16 = width, %r157 = height, %r17 = this thread's local row.
sub.s32 %r16, %r152, %r155;
sub.s32 %r157, %r153, %r296;
mov.u32 %r158, %ntid.x;
mov.u32 %r159, %ctaid.x;
mov.u32 %r160, %tid.x;
mad.lo.s32 %r17, %r159, %r158, %r160;
// Rows past the bottom of the rectangle exit.
setp.ge.u32 %p9, %r17, %r157;
@%p9 bra $L__BB3_81;
// %r18 = filter selector (record+120); %r297 = absolute y, %r20 = y parity.
ld.global.u32 %r18, [%rd1];
add.s32 %r297, %r296, %r17;
and.b32 %r20, %r297, 1;
// %rd6 = &dest[row * width] (destination pointer from record+32).
mul.lo.s32 %r161, %r16, %r17;
ld.global.u64 %rd17, [%rd1+-88];
mul.wide.u32 %rd18, %r161, 4;
add.s64 %rd6, %rd17, %rd18;
// Empty row: nothing to gather and nothing to filter, so leave the kernel.
// (Was "bra $L__BB3_34", which ran the lifting code on a zero-length row.)
setp.eq.s32 %p10, %r16, 0;
@%p10 bra $L__BB3_81;
// ---- Phase 1 setup: hoist everything that does not depend on x ----
// Rectangle ends of source 3 (%r293,%r294), 2 (%r165,%r166),
// 1 (%r167,%r168), 0 (%r169,%r170). %r290 = local x, starts at 0.
ld.global.v2.u32 {%r293, %r294}, [%rd1+-8];
mov.u32 %r290, 0;
ld.global.v2.u32 {%r165, %r166}, [%rd1+-24];
ld.global.v2.u32 {%r167, %r168}, [%rd1+-40];
ld.global.v2.u32 {%r169, %r170}, [%rd1+-56];
// Column bases: %r25/%r37 = src0/src2 x0 - ((x0+1)>>1) (round-up halving),
// %r32/%r42 = src1/src3 x0 - (x0>>1) (round-down halving).
// %r26 = y0+1, %r27 = y+1: biased values used where y is halved rounding up.
add.s32 %r171, %r155, 1;
shr.u32 %r172, %r171, 1;
sub.s32 %r25, %r150, %r172;
add.s32 %r26, %r296, 1;
add.s32 %r27, %r297, 1;
shr.u32 %r173, %r155, 1;
sub.s32 %r32, %r148, %r173;
setp.ne.s32 %p1, %r20, 0;
sub.s32 %r37, %r146, %r172;
sub.s32 %r42, %r291, %r173;
// %r43 = width & 1 (is there a remainder sample?). width == 1 (x1 == x0+1)
// skips the pairwise loop entirely.
and.b32 %r43, %r16, 1;
setp.eq.s32 %p11, %r152, %r171;
@%p11 bra $L__BB3_23;
// Loop-invariant parity predicates. Local x of the first sample in each pair
// is even, so its absolute parity equals the parity of x0:
//   %p2 = y even && x0 odd      %p3 = x0 even && y odd
//   %p4 = y even && x0 even     %p5 = x0 odd  && y odd
// %r275 = width rounded down to even = number of samples handled in pairs.
and.b32 %r175, %r155, 1;
or.b32 %r176, %r20, %r175;
setp.eq.s32 %p12, %r20, 0;
mov.u32 %r290, 0;
sub.s32 %r275, %r16, %r43;
setp.eq.b32 %p13, %r175, 1;
setp.eq.s32 %p14, %r175, 0;
and.pred %p2, %p12, %p13;
and.pred %p3, %p14, %p1;
setp.eq.s32 %p4, %r176, 0;
and.pred %p5, %p13, %p1;
$L__BB3_4:
// ---- Pair loop, first sample: absolute x = %r47 ----
add.s32 %r47, %r290, %r155;
and.b32 %r177, %r47, 1;
setp.eq.b32 %p15, %r177, 1;
setp.eq.b32 %p16, %r20, 1;
or.pred %p17, %p15, %p16;
mov.pred %p18, 0;
xor.pred %p19, %p17, %p18;
not.pred %p20, %p19;
// %p20 = (x even && y even)
@%p20 bra $L__BB3_10;
bra.uni $L__BB3_5;
$L__BB3_10:
// Source 0: column rounds up, row rounds up (biased y values).
// Selected state: %rd92 = pointer, %r276/%r277 = rect x0/y0,
// %r278/%r279 = rect x1/y1, %r280 = column, %r281/%r282 = y0'/y'.
add.s32 %r183, %r47, 1;
shr.u32 %r184, %r183, 1;
add.s32 %r280, %r25, %r184;
mov.u64 %rd92, %rd5;
mov.u32 %r276, %r150;
mov.u32 %r277, %r151;
mov.u32 %r278, %r169;
mov.u32 %r279, %r170;
mov.u32 %r281, %r26;
mov.u32 %r282, %r27;
bra.uni $L__BB3_11;
$L__BB3_5:
@%p2 bra $L__BB3_9;
bra.uni $L__BB3_6;
$L__BB3_9:
// Source 1 (x odd, y even): column rounds down, row rounds up.
shr.u32 %r182, %r47, 1;
add.s32 %r280, %r32, %r182;
mov.u64 %rd92, %rd4;
mov.u32 %r276, %r148;
mov.u32 %r277, %r149;
mov.u32 %r278, %r167;
mov.u32 %r279, %r168;
mov.u32 %r281, %r26;
mov.u32 %r282, %r27;
bra.uni $L__BB3_11;
$L__BB3_6:
@%p3 bra $L__BB3_8;
bra.uni $L__BB3_7;
$L__BB3_8:
// Source 2 (x even, y odd): column rounds up, row rounds down.
add.s32 %r180, %r47, 1;
shr.u32 %r181, %r180, 1;
add.s32 %r280, %r37, %r181;
mov.u64 %rd92, %rd3;
mov.u32 %r276, %r146;
mov.u32 %r277, %r147;
mov.u32 %r278, %r165;
mov.u32 %r279, %r166;
mov.u32 %r281, %r296;
mov.u32 %r282, %r297;
bra.uni $L__BB3_11;
$L__BB3_7:
// Source 3 (x odd, y odd): column and row round down.
shr.u32 %r179, %r47, 1;
add.s32 %r280, %r42, %r179;
mov.u64 %rd92, %rd94;
mov.u32 %r276, %r291;
mov.u32 %r277, %r292;
mov.u32 %r278, %r293;
mov.u32 %r279, %r294;
mov.u32 %r281, %r296;
mov.u32 %r282, %r297;
$L__BB3_11:
// row %r60 = rect.y0 - (y0' >> 1) + (y' >> 1); then the rectangle test.
shr.u32 %r185, %r281, 1;
sub.s32 %r186, %r277, %r185;
shr.u32 %r187, %r282, 1;
add.s32 %r60, %r186, %r187;
setp.lt.u32 %p21, %r280, %r276;
setp.le.u32 %p22, %r278, %r280;
or.pred %p23, %p21, %p22;
setp.lt.u32 %p24, %r60, %r277;
or.pred %p25, %p23, %p24;
setp.le.u32 %p26, %r279, %r60;
or.pred %p27, %p26, %p25;
// 0.0 when outside the source rectangle.
mov.f32 %f140, 0f00000000;
@%p27 bra $L__BB3_13;
// index = (row - rect.y0) * (rect.x1 - rect.x0) + (column - rect.x0)
sub.s32 %r188, %r278, %r276;
sub.s32 %r189, %r60, %r277;
sub.s32 %r190, %r280, %r276;
mad.lo.s32 %r191, %r189, %r188, %r190;
mul.wide.u32 %rd19, %r191, 4;
add.s64 %rd20, %rd92, %rd19;
ld.f32 %f140, [%rd20];
$L__BB3_13:
// row[x_local] = first sample; %rd8 is reused for the second store below.
mul.wide.u32 %rd21, %r290, 4;
add.s64 %rd8, %rd6, %rd21;
st.f32 [%rd8], %f140;
// ---- Pair loop, second sample: absolute x = %r64 = %r47 + 1 ----
add.s32 %r64, %r47, 1;
and.b32 %r193, %r64, 1;
setp.eq.b32 %p28, %r193, 1;
or.pred %p30, %p28, %p16;
mov.pred %p31, 0;
xor.pred %p32, %p30, %p31;
not.pred %p33, %p32;
// %p33 = (x+1 even && y even)
@%p33 bra $L__BB3_19;
bra.uni $L__BB3_14;
$L__BB3_19:
// Source 0 for the second sample.
add.s32 %r199, %r64, 1;
shr.u32 %r200, %r199, 1;
add.s32 %r287, %r25, %r200;
mov.u64 %rd93, %rd5;
mov.u32 %r283, %r150;
mov.u32 %r284, %r151;
mov.u32 %r285, %r169;
mov.u32 %r286, %r170;
mov.u32 %r288, %r26;
mov.u32 %r289, %r27;
bra.uni $L__BB3_20;
$L__BB3_14:
// x0 even && y even means x+1 is odd with y even -> source 1.
@%p4 bra $L__BB3_18;
bra.uni $L__BB3_15;
$L__BB3_18:
shr.u32 %r198, %r64, 1;
add.s32 %r287, %r32, %r198;
mov.u64 %rd93, %rd4;
mov.u32 %r283, %r148;
mov.u32 %r284, %r149;
mov.u32 %r285, %r167;
mov.u32 %r286, %r168;
mov.u32 %r288, %r26;
mov.u32 %r289, %r27;
bra.uni $L__BB3_20;
$L__BB3_15:
// x0 odd && y odd means x+1 is even with y odd -> source 2.
@%p5 bra $L__BB3_17;
bra.uni $L__BB3_16;
$L__BB3_17:
add.s32 %r196, %r64, 1;
shr.u32 %r197, %r196, 1;
add.s32 %r287, %r37, %r197;
mov.u64 %rd93, %rd3;
mov.u32 %r283, %r146;
mov.u32 %r284, %r147;
mov.u32 %r285, %r165;
mov.u32 %r286, %r166;
mov.u32 %r288, %r296;
mov.u32 %r289, %r297;
bra.uni $L__BB3_20;
$L__BB3_16:
// Otherwise x+1 and y are both odd -> source 3.
shr.u32 %r195, %r64, 1;
add.s32 %r287, %r42, %r195;
mov.u64 %rd93, %rd94;
mov.u32 %r283, %r291;
mov.u32 %r284, %r292;
mov.u32 %r285, %r293;
mov.u32 %r286, %r294;
mov.u32 %r288, %r296;
mov.u32 %r289, %r297;
$L__BB3_20:
// Same row computation, rectangle test and fetch as for the first sample.
shr.u32 %r201, %r288, 1;
sub.s32 %r202, %r284, %r201;
shr.u32 %r203, %r289, 1;
add.s32 %r77, %r202, %r203;
setp.lt.u32 %p34, %r287, %r283;
setp.le.u32 %p35, %r285, %r287;
or.pred %p36, %p34, %p35;
setp.lt.u32 %p37, %r77, %r284;
or.pred %p38, %p36, %p37;
setp.le.u32 %p39, %r286, %r77;
or.pred %p40, %p39, %p38;
mov.f32 %f141, 0f00000000;
@%p40 bra $L__BB3_22;
sub.s32 %r204, %r285, %r283;
sub.s32 %r205, %r77, %r284;
sub.s32 %r206, %r287, %r283;
mad.lo.s32 %r207, %r205, %r204, %r206;
mul.wide.u32 %rd22, %r207, 4;
add.s64 %rd23, %rd93, %rd22;
ld.f32 %f141, [%rd23];
$L__BB3_22:
// row[x_local + 1] = second sample; advance by two until the paired count
// in %r275 is exhausted.
st.f32 [%rd8+4], %f141;
add.s32 %r290, %r290, 2;
add.s32 %r275, %r275, -2;
setp.ne.s32 %p41, %r275, 0;
@%p41 bra $L__BB3_4;
$L__BB3_23:
// ---- Remainder: one trailing sample when width is odd ----
setp.eq.s32 %p42, %r43, 0;
@%p42 bra $L__BB3_34;
add.s32 %r84, %r290, %r155;
and.b32 %r85, %r84, 1;
setp.eq.b32 %p43, %r85, 1;
setp.eq.b32 %p44, %r20, 1;
or.pred %p45, %p43, %p44;
mov.pred %p46, 0;
xor.pred %p47, %p45, %p46;
not.pred %p48, %p47;
// %p48 = (x even && y even)
@%p48 bra $L__BB3_30;
bra.uni $L__BB3_25;
$L__BB3_30:
// Source 0. From here on the source-3 registers (%rd94, %r291..%r297) are
// overwritten with the selected source; they are not needed again afterwards.
add.s32 %r213, %r84, 1;
shr.u32 %r214, %r213, 1;
add.s32 %r295, %r25, %r214;
mov.u64 %rd94, %rd5;
mov.u32 %r291, %r150;
mov.u32 %r292, %r151;
mov.u32 %r293, %r169;
mov.u32 %r294, %r170;
mov.u32 %r296, %r26;
mov.u32 %r297, %r27;
bra.uni $L__BB3_31;
$L__BB3_25:
// %p51 = (y even && x odd) -> source 1
setp.ne.s32 %p49, %r85, 0;
setp.eq.s32 %p50, %r20, 0;
and.pred %p51, %p50, %p49;
@%p51 bra $L__BB3_29;
bra.uni $L__BB3_26;
$L__BB3_29:
shr.u32 %r212, %r84, 1;
add.s32 %r295, %r32, %r212;
mov.u64 %rd94, %rd4;
mov.u32 %r291, %r148;
mov.u32 %r292, %r149;
mov.u32 %r293, %r167;
mov.u32 %r294, %r168;
mov.u32 %r296, %r26;
mov.u32 %r297, %r27;
bra.uni $L__BB3_31;
$L__BB3_26:
// %p53 = (x even && y odd) -> source 2; y values stay unbiased.
setp.eq.s32 %p52, %r85, 0;
and.pred %p53, %p52, %p1;
@%p53 bra $L__BB3_28;
bra.uni $L__BB3_27;
$L__BB3_28:
add.s32 %r210, %r84, 1;
shr.u32 %r211, %r210, 1;
add.s32 %r295, %r37, %r211;
mov.u64 %rd94, %rd3;
mov.u32 %r291, %r146;
mov.u32 %r292, %r147;
mov.u32 %r293, %r165;
mov.u32 %r294, %r166;
bra.uni $L__BB3_31;
$L__BB3_27:
// Source 3: registers already hold its pointer and rectangle.
shr.u32 %r209, %r84, 1;
add.s32 %r295, %r42, %r209;
$L__BB3_31:
shr.u32 %r215, %r296, 1;
sub.s32 %r216, %r292, %r215;
shr.u32 %r217, %r297, 1;
add.s32 %r98, %r216, %r217;
setp.lt.u32 %p54, %r295, %r291;
setp.le.u32 %p55, %r293, %r295;
or.pred %p56, %p54, %p55;
setp.lt.u32 %p57, %r98, %r292;
or.pred %p58, %p56, %p57;
setp.le.u32 %p59, %r294, %r98;
or.pred %p60, %p59, %p58;
mov.f32 %f142, 0f00000000;
@%p60 bra $L__BB3_33;
sub.s32 %r218, %r293, %r291;
sub.s32 %r219, %r98, %r292;
sub.s32 %r220, %r295, %r291;
mad.lo.s32 %r221, %r219, %r218, %r220;
mul.wide.u32 %rd24, %r221, 4;
add.s64 %rd25, %rd94, %rd24;
ld.f32 %f142, [%rd25];
$L__BB3_33:
mul.wide.u32 %rd26, %r290, 4;
add.s64 %rd27, %rd6, %rd26;
st.f32 [%rd27], %f142;
$L__BB3_34:
// ==== Phase 2: in-place horizontal lifting of the gathered row ====
// %r102 = x0 & 1. width == 1 is a special case handled at $L__BB3_79.
setp.eq.s32 %p61, %r16, 1;
and.b32 %r102, %r155, 1;
@%p61 bra $L__BB3_79;
bra.uni $L__BB3_35;
$L__BB3_79:
// Single-sample row: unchanged when x0 is even, multiplied by 0.5 when odd.
setp.eq.s32 %p117, %r102, 0;
@%p117 bra $L__BB3_81;
ld.f32 %f138, [%rd6];
mul.rn.f32 %f139, %f138, 0f3F000000;
st.f32 [%rd6], %f139;
bra.uni $L__BB3_81;
$L__BB3_35:
// %p62 = selector is 0 (floor-based path at $L__BB3_67).
// %r103 = (x0 & 1) ^ 1; %r104 = width-1, %rd11 = &row[width-1];
// %r105 = width-2, %rd12 = &row[width-2].
setp.eq.s32 %p62, %r18, 0;
xor.b32 %r103, %r102, 1;
add.s32 %r104, %r16, -1;
mul.wide.u32 %rd28, %r104, 4;
add.s64 %rd11, %rd6, %rd28;
add.s32 %r105, %r16, -2;
mul.wide.u32 %rd29, %r105, 4;
add.s64 %rd12, %rd6, %rd29;
@%p62 bra $L__BB3_67;
// ---- Float path (selector != 0) ----
// Scaling: %f7 multiplies samples at even local index, %f8 those at odd local
// index. With x0 even: %f7 ~ 1.230174 (0f3F9D7658), %f8 ~ 0.812893
// (0f3F5019C3); with x0 odd the two factors are swapped.
setp.eq.s32 %p6, %r102, 0;
selp.f32 %f7, 0f3F9D7658, 0f3F5019C3, %p6;
setp.lt.u32 %p63, %r16, 2;
@%p63 bra $L__BB3_39;
selp.f32 %f8, 0f3F5019C3, 0f3F9D7658, %p6;
mov.u32 %r298, 0;
$L__BB3_38:
// Scale row[i] and row[i+1]; continue while the next pair fits (i+3 < width).
mul.wide.u32 %rd30, %r298, 4;
add.s64 %rd31, %rd6, %rd30;
ld.f32 %f12, [%rd31];
mul.rn.f32 %f13, %f7, %f12;
st.f32 [%rd31], %f13;
ld.f32 %f14, [%rd31+4];
mul.rn.f32 %f15, %f8, %f14;
st.f32 [%rd31+4], %f15;
add.s32 %r107, %r298, 2;
add.s32 %r223, %r298, 3;
setp.lt.u32 %p64, %r223, %r16;
mov.u32 %r298, %r107;
@%p64 bra $L__BB3_38;
$L__BB3_39:
// Odd width: the last sample (even local index) still needs its %f7 scaling.
and.b32 %r224, %r16, 1;
setp.eq.b32 %p65, %r224, 1;
mov.pred %p66, 0;
xor.pred %p67, %p65, %p66;
not.pred %p68, %p67;
@%p68 bra $L__BB3_41;
ld.f32 %f16, [%rd11];
mul.rn.f32 %f17, %f7, %f16;
st.f32 [%rd11], %f17;
$L__BB3_41:
// Step 1 (c ~ -0.443507, 0fBEE31355) on samples at even absolute x.
// Left edge, only when x0 is even: row[0] += c * (row[1] + row[m]) with
// m = 1 when width > 1 (the left neighbour is mirrored onto row[1]).
setp.ne.s32 %p69, %r102, 0;
@%p69 bra $L__BB3_43;
setp.gt.u32 %p70, %r16, 1;
add.s32 %r225, %r16, %r16;
add.s32 %r226, %r225, -3;
selp.b32 %r227, 1, %r226, %p70;
mul.wide.u32 %rd32, %r227, 4;
add.s64 %rd33, %rd6, %rd32;
ld.f32 %f18, [%rd33];
ld.f32 %f19, [%rd6+4];
add.rn.f32 %f20, %f19, %f18;
ld.f32 %f21, [%rd6];
mov.f32 %f22, 0fBEE31355;
fma.rn.f32 %f23, %f20, %f22, %f21;
st.f32 [%rd6], %f23;
$L__BB3_43:
// Interior start index %r304 = 2 (x0 even) or 1 (x0 odd); %r303 = %r304 + 1.
// %p72 (no interior sample) and %r303/%r304 are reused by step 3.
selp.b32 %r304, 2, 1, %p6;
add.s32 %r303, %r304, 1;
setp.ge.u32 %p72, %r303, %r16;
@%p72 bra $L__BB3_46;
mov.u32 %r299, %r303;
mov.u32 %r300, %r304;
$L__BB3_45:
// row[i] += c * (row[i-1] + row[i+1]); i advances by 2 while i+1 < width.
add.s32 %r228, %r300, -1;
mul.wide.u32 %rd34, %r228, 4;
add.s64 %rd35, %rd6, %rd34;
mul.wide.u32 %rd36, %r299, 4;
add.s64 %rd37, %rd6, %rd36;
ld.f32 %f24, [%rd37];
ld.f32 %f25, [%rd35];
add.rn.f32 %f26, %f25, %f24;
mul.wide.u32 %rd38, %r300, 4;
add.s64 %rd39, %rd6, %rd38;
ld.f32 %f27, [%rd39];
mov.f32 %f28, 0fBEE31355;
fma.rn.f32 %f29, %f26, %f28, %f27;
st.f32 [%rd39], %f29;
add.s32 %r112, %r300, 2;
add.s32 %r299, %r300, 3;
setp.lt.u32 %p73, %r299, %r16;
mov.u32 %r300, %r112;
@%p73 bra $L__BB3_45;
$L__BB3_46:
// Right edge: %p7 = width > 1 and the last sample sits at even absolute x
// ((width-1) & 1 == x0 & 1). Its missing right neighbour is replaced by
// row[width-2]: row[width-1] += c * (row[left] + row[width-2]), where
// left = width-2 when width-1 > 1, else 2-width. %p76 = !%p7, reused in step 3.
setp.gt.u32 %p74, %r16, 1;
and.b32 %r115, %r104, 1;
setp.eq.s32 %p75, %r115, %r102;
and.pred %p7, %p74, %p75;
not.pred %p76, %p7;
@%p76 bra $L__BB3_48;
setp.gt.u32 %p77, %r104, 1;
mov.u32 %r229, 2;
sub.s32 %r230, %r229, %r16;
selp.b32 %r232, %r105, %r230, %p77;
mul.wide.u32 %rd40, %r232, 4;
add.s64 %rd41, %rd6, %rd40;
ld.f32 %f30, [%rd12];
ld.f32 %f31, [%rd41];
add.rn.f32 %f32, %f31, %f30;
ld.f32 %f33, [%rd11];
mov.f32 %f34, 0fBEE31355;
fma.rn.f32 %f35, %f32, %f34, %f33;
st.f32 [%rd11], %f35;
$L__BB3_48:
// Step 2 (c ~ -0.882911, 0fBF620676) on samples at odd absolute x.
// Left edge only when x0 is odd (%r103 == 0). %p78 is reused by step 4.
setp.ne.s32 %p78, %r103, 0;
@%p78 bra $L__BB3_50;
add.s32 %r233, %r16, %r16;
add.s32 %r234, %r233, -3;
selp.b32 %r235, 1, %r234, %p74;
mul.wide.u32 %rd42, %r235, 4;
add.s64 %rd43, %rd6, %rd42;
ld.f32 %f36, [%rd43];
ld.f32 %f37, [%rd6+4];
add.rn.f32 %f38, %f37, %f36;
ld.f32 %f39, [%rd6];
mov.f32 %f40, 0fBF620676;
fma.rn.f32 %f41, %f38, %f40, %f39;
st.f32 [%rd6], %f41;
$L__BB3_50:
// Interior start %r306 = 2 (x0 odd) or 1 (x0 even); %r305 = %r306 + 1.
// %p81 and %r305/%r306 are reused by step 4.
setp.eq.s32 %p80, %r103, 0;
selp.b32 %r306, 2, 1, %p80;
add.s32 %r305, %r306, 1;
setp.ge.u32 %p81, %r305, %r16;
@%p81 bra $L__BB3_53;
mov.u32 %r301, %r305;
mov.u32 %r302, %r306;
$L__BB3_52:
add.s32 %r236, %r302, -1;
mul.wide.u32 %rd44, %r236, 4;
add.s64 %rd45, %rd6, %rd44;
mul.wide.u32 %rd46, %r301, 4;
add.s64 %rd47, %rd6, %rd46;
ld.f32 %f42, [%rd47];
ld.f32 %f43, [%rd45];
add.rn.f32 %f44, %f43, %f42;
mul.wide.u32 %rd48, %r302, 4;
add.s64 %rd49, %rd6, %rd48;
ld.f32 %f45, [%rd49];
mov.f32 %f46, 0fBF620676;
fma.rn.f32 %f47, %f44, %f46, %f45;
st.f32 [%rd49], %f47;
add.s32 %r120, %r302, 2;
add.s32 %r301, %r302, 3;
setp.lt.u32 %p82, %r301, %r16;
mov.u32 %r302, %r120;
@%p82 bra $L__BB3_52;
$L__BB3_53:
// Right edge for step 2: %p8 = width > 1 and the last sample sits at odd
// absolute x. %p85 = !%p8, reused by step 4.
setp.eq.s32 %p84, %r115, %r103;
and.pred %p8, %p74, %p84;
not.pred %p85, %p8;
@%p85 bra $L__BB3_55;
setp.gt.u32 %p86, %r104, 1;
mov.u32 %r237, 2;
sub.s32 %r238, %r237, %r16;
selp.b32 %r240, %r105, %r238, %p86;
mul.wide.u32 %rd50, %r240, 4;
add.s64 %rd51, %rd6, %rd50;
ld.f32 %f48, [%rd12];
ld.f32 %f49, [%rd51];
add.rn.f32 %f50, %f49, %f48;
ld.f32 %f51, [%rd11];
mov.f32 %f52, 0fBF620676;
fma.rn.f32 %f53, %f50, %f52, %f51;
st.f32 [%rd11], %f53;
$L__BB3_55:
// Step 3 (c ~ +0.052980, 0f3D5901AE): same sample set as step 1
// (left edge / interior / right edge guarded by %p69, %p72, %p76).
@%p69 bra $L__BB3_57;
add.s32 %r241, %r16, %r16;
add.s32 %r242, %r241, -3;
selp.b32 %r243, 1, %r242, %p74;
mul.wide.u32 %rd52, %r243, 4;
add.s64 %rd53, %rd6, %rd52;
ld.f32 %f54, [%rd53];
ld.f32 %f55, [%rd6+4];
add.rn.f32 %f56, %f55, %f54;
ld.f32 %f57, [%rd6];
mov.f32 %f58, 0f3D5901AE;
fma.rn.f32 %f59, %f56, %f58, %f57;
st.f32 [%rd6], %f59;
$L__BB3_57:
@%p72 bra $L__BB3_59;
$L__BB3_58:
add.s32 %r244, %r304, -1;
mul.wide.u32 %rd54, %r244, 4;
add.s64 %rd55, %rd6, %rd54;
mul.wide.u32 %rd56, %r303, 4;
add.s64 %rd57, %rd6, %rd56;
ld.f32 %f60, [%rd57];
ld.f32 %f61, [%rd55];
add.rn.f32 %f62, %f61, %f60;
mul.wide.u32 %rd58, %r304, 4;
add.s64 %rd59, %rd6, %rd58;
ld.f32 %f63, [%rd59];
mov.f32 %f64, 0f3D5901AE;
fma.rn.f32 %f65, %f62, %f64, %f63;
st.f32 [%rd59], %f65;
add.s32 %r124, %r304, 2;
add.s32 %r303, %r304, 3;
setp.lt.u32 %p90, %r303, %r16;
mov.u32 %r304, %r124;
@%p90 bra $L__BB3_58;
$L__BB3_59:
@%p76 bra $L__BB3_61;
setp.gt.u32 %p92, %r104, 1;
mov.u32 %r245, 2;
sub.s32 %r246, %r245, %r16;
selp.b32 %r248, %r105, %r246, %p92;
mul.wide.u32 %rd60, %r248, 4;
add.s64 %rd61, %rd6, %rd60;
ld.f32 %f66, [%rd12];
ld.f32 %f67, [%rd61];
add.rn.f32 %f68, %f67, %f66;
ld.f32 %f69, [%rd11];
mov.f32 %f70, 0f3D5901AE;
fma.rn.f32 %f71, %f68, %f70, %f69;
st.f32 [%rd11], %f71;
$L__BB3_61:
// Step 4 (c ~ +1.586134, 0f3FCB0673): same sample set as step 2
// (guarded by %p78, %p81, %p85).
@%p78 bra $L__BB3_63;
add.s32 %r249, %r16, %r16;
add.s32 %r250, %r249, -3;
selp.b32 %r251, 1, %r250, %p74;
mul.wide.u32 %rd62, %r251, 4;
add.s64 %rd63, %rd6, %rd62;
ld.f32 %f72, [%rd63];
ld.f32 %f73, [%rd6+4];
add.rn.f32 %f74, %f73, %f72;
ld.f32 %f75, [%rd6];
mov.f32 %f76, 0f3FCB0673;
fma.rn.f32 %f77, %f74, %f76, %f75;
st.f32 [%rd6], %f77;
$L__BB3_63:
@%p81 bra $L__BB3_65;
$L__BB3_64:
add.s32 %r252, %r306, -1;
mul.wide.u32 %rd64, %r252, 4;
add.s64 %rd65, %rd6, %rd64;
mul.wide.u32 %rd66, %r305, 4;
add.s64 %rd67, %rd6, %rd66;
ld.f32 %f78, [%rd67];
ld.f32 %f79, [%rd65];
add.rn.f32 %f80, %f79, %f78;
mul.wide.u32 %rd68, %r306, 4;
add.s64 %rd69, %rd6, %rd68;
ld.f32 %f81, [%rd69];
mov.f32 %f82, 0f3FCB0673;
fma.rn.f32 %f83, %f80, %f82, %f81;
st.f32 [%rd69], %f83;
add.s32 %r128, %r306, 2;
add.s32 %r305, %r306, 3;
setp.lt.u32 %p96, %r305, %r16;
mov.u32 %r306, %r128;
@%p96 bra $L__BB3_64;
$L__BB3_65:
@%p85 bra $L__BB3_81;
setp.gt.u32 %p98, %r104, 1;
mov.u32 %r253, 2;
sub.s32 %r254, %r253, %r16;
selp.b32 %r256, %r105, %r254, %p98;
mul.wide.u32 %rd70, %r256, 4;
add.s64 %rd71, %rd6, %rd70;
ld.f32 %f84, [%rd12];
ld.f32 %f85, [%rd71];
add.rn.f32 %f86, %f85, %f84;
ld.f32 %f87, [%rd11];
mov.f32 %f88, 0f3FCB0673;
fma.rn.f32 %f89, %f86, %f88, %f87;
st.f32 [%rd11], %f89;
bra.uni $L__BB3_81;
$L__BB3_67:
// ---- Floor-based path (selector == 0) ----
// Step A on samples at even absolute x:
//   row[i] -= floor(0.25 * (row[i-1] + row[i+1]) + 0.5)
// (fma with 0f3E800000 = 0.25 and 0f3F000000 = 0.5, then cvt.rmi rounds
// toward minus infinity). Left edge only when x0 is even.
setp.ne.s32 %p99, %r102, 0;
@%p99 bra $L__BB3_69;
setp.gt.u32 %p100, %r16, 1;
add.s32 %r257, %r16, %r16;
add.s32 %r258, %r257, -3;
selp.b32 %r259, 1, %r258, %p100;
mul.wide.u32 %rd72, %r259, 4;
add.s64 %rd73, %rd6, %rd72;
ld.f32 %f90, [%rd73];
ld.f32 %f91, [%rd6+4];
add.rn.f32 %f92, %f91, %f90;
mov.f32 %f93, 0f3F000000;
mov.f32 %f94, 0f3E800000;
fma.rn.f32 %f95, %f92, %f94, %f93;
cvt.rmi.f32.f32 %f96, %f95;
ld.f32 %f97, [%rd6];
sub.rn.f32 %f98, %f97, %f96;
st.f32 [%rd6], %f98;
$L__BB3_69:
// Interior of step A: i starts at 2 (x0 even) or 1 (x0 odd), stride 2,
// while i+1 < width.
setp.eq.s32 %p101, %r102, 0;
selp.b32 %r308, 2, 1, %p101;
add.s32 %r307, %r308, 1;
setp.ge.u32 %p102, %r307, %r16;
@%p102 bra $L__BB3_71;
$L__BB3_70:
mul.wide.u32 %rd74, %r308, 4;
add.s64 %rd75, %rd6, %rd74;
add.s32 %r260, %r308, -1;
mul.wide.u32 %rd76, %r260, 4;
add.s64 %rd77, %rd6, %rd76;
mul.wide.u32 %rd78, %r307, 4;
add.s64 %rd79, %rd6, %rd78;
ld.f32 %f99, [%rd79];
ld.f32 %f100, [%rd77];
add.rn.f32 %f101, %f100, %f99;
mov.f32 %f102, 0f3F000000;
mov.f32 %f103, 0f3E800000;
fma.rn.f32 %f104, %f101, %f103, %f102;
cvt.rmi.f32.f32 %f105, %f104;
ld.f32 %f106, [%rd75];
sub.rn.f32 %f107, %f106, %f105;
st.f32 [%rd75], %f107;
add.s32 %r134, %r308, 2;
add.s32 %r307, %r308, 3;
setp.lt.u32 %p103, %r307, %r16;
mov.u32 %r308, %r134;
@%p103 bra $L__BB3_70;
$L__BB3_71:
// Right edge of step A: skipped when width < 2 or when the last sample is
// not at even absolute x (((width & 1) ^ 1) != (x0 & 1)). The missing right
// neighbour is replaced by row[width-2].
setp.lt.u32 %p104, %r16, 2;
and.b32 %r136, %r16, 1;
xor.b32 %r261, %r136, 1;
setp.ne.s32 %p105, %r261, %r102;
or.pred %p106, %p104, %p105;
@%p106 bra $L__BB3_73;
setp.gt.u32 %p107, %r104, 1;
mov.u32 %r263, 2;
sub.s32 %r264, %r263, %r16;
selp.b32 %r266, %r105, %r264, %p107;
mul.wide.u32 %rd80, %r266, 4;
add.s64 %rd81, %rd6, %rd80;
ld.f32 %f108, [%rd12];
ld.f32 %f109, [%rd81];
add.rn.f32 %f110, %f109, %f108;
mov.f32 %f111, 0f3F000000;
mov.f32 %f112, 0f3E800000;
fma.rn.f32 %f113, %f110, %f112, %f111;
cvt.rmi.f32.f32 %f114, %f113;
ld.f32 %f115, [%rd11];
sub.rn.f32 %f116, %f115, %f114;
st.f32 [%rd11], %f116;
$L__BB3_73:
// Step B on samples at odd absolute x:
//   row[i] += floor(0.5 * (row[i-1] + row[i+1]))
// Left edge only when x0 is odd (%r103 == 0).
setp.ne.s32 %p108, %r103, 0;
@%p108 bra $L__BB3_75;
setp.gt.u32 %p109, %r16, 1;
add.s32 %r267, %r16, %r16;
add.s32 %r268, %r267, -3;
selp.b32 %r269, 1, %r268, %p109;
mul.wide.u32 %rd82, %r269, 4;
add.s64 %rd83, %rd6, %rd82;
ld.f32 %f117, [%rd83];
ld.f32 %f118, [%rd6+4];
add.rn.f32 %f119, %f118, %f117;
mul.rn.f32 %f120, %f119, 0f3F000000;
cvt.rmi.f32.f32 %f121, %f120;
ld.f32 %f122, [%rd6];
add.rn.f32 %f123, %f122, %f121;
st.f32 [%rd6], %f123;
$L__BB3_75:
// Interior of step B: i starts at 2 (x0 odd) or 1 (x0 even), stride 2.
setp.eq.s32 %p110, %r103, 0;
selp.b32 %r310, 2, 1, %p110;
add.s32 %r309, %r310, 1;
setp.ge.u32 %p111, %r309, %r16;
@%p111 bra $L__BB3_77;
$L__BB3_76:
mul.wide.u32 %rd84, %r310, 4;
add.s64 %rd85, %rd6, %rd84;
add.s32 %r270, %r310, -1;
mul.wide.u32 %rd86, %r270, 4;
add.s64 %rd87, %rd6, %rd86;
mul.wide.u32 %rd88, %r309, 4;
add.s64 %rd89, %rd6, %rd88;
ld.f32 %f124, [%rd89];
ld.f32 %f125, [%rd87];
add.rn.f32 %f126, %f125, %f124;
mul.rn.f32 %f127, %f126, 0f3F000000;
cvt.rmi.f32.f32 %f128, %f127;
ld.f32 %f129, [%rd85];
add.rn.f32 %f130, %f129, %f128;
st.f32 [%rd85], %f130;
add.s32 %r141, %r310, 2;
add.s32 %r309, %r310, 3;
setp.lt.u32 %p112, %r309, %r16;
mov.u32 %r310, %r141;
@%p112 bra $L__BB3_76;
$L__BB3_77:
// Right edge of step B: skipped when width < 2 or when the last sample is
// not at odd absolute x ((width & 1) != (x0 & 1)).
setp.ne.s32 %p114, %r136, %r102;
or.pred %p115, %p104, %p114;
@%p115 bra $L__BB3_81;
setp.gt.u32 %p116, %r104, 1;
mov.u32 %r271, 2;
sub.s32 %r272, %r271, %r16;
selp.b32 %r273, %r105, %r272, %p116;
mul.wide.u32 %rd90, %r273, 4;
add.s64 %rd91, %rd6, %rd90;
ld.f32 %f131, [%rd12];
ld.f32 %f132, [%rd91];
add.rn.f32 %f133, %f132, %f131;
mul.rn.f32 %f134, %f133, 0f3F000000;
cvt.rmi.f32.f32 %f135, %f134;
ld.f32 %f136, [%rd11];
add.rn.f32 %f137, %f136, %f135;
st.f32 [%rd11], %f137;
$L__BB3_81:
ret;
}
// .globl j2k_idwt_interleave_horizontal_53_multi
.visible .entry j2k_idwt_interleave_horizontal_53_multi(
.param .u64 j2k_idwt_interleave_horizontal_53_multi_param_0
)
{
.reg .pred %p<39>;
.reg .f32 %f<25>;
.reg .b32 %r<150>;
.reg .b64 %rd<19>;
// demoted variable
.shared .align 4 .b8 _ZZ48j2k_idwt_interleave_horizontal_53_multiE11row_samples[2048];
ld.param.u64 %rd8, [j2k_idwt_interleave_horizontal_53_multi_param_0];
cvta.to.global.u64 %rd9, %rd8;
mov.u32 %r1, %tid.x;
mov.u32 %r61, %ctaid.y;
mul.wide.u32 %rd10, %r61, 128;
add.s64 %rd11, %rd9, %rd10;
add.s64 %rd1, %rd11, 112;
ld.global.v2.u32 {%r143, %r144}, [%rd11+104];
ld.global.v2.u32 {%r64, %r65}, [%rd11+88];
ld.global.v2.u32 {%r66, %r67}, [%rd11+72];
ld.global.v2.u32 {%r68, %r69}, [%rd11+56];
ld.global.v2.u32 {%r70, %r71}, [%rd11+48];
ld.global.v2.u32 {%r74, %r148}, [%rd11+40];
ld.global.u64 %rd2, [%rd11+32];
sub.s32 %r16, %r70, %r74;
sub.s32 %r76, %r71, %r148;
mov.u32 %r17, %ctaid.x;
setp.ge.u32 %p1, %r17, %r76;
@%p1 bra $L__BB4_23;
setp.ge.u32 %p2, %r1, %r16;
shl.b32 %r77, %r1, 2;
mov.u32 %r78, _ZZ48j2k_idwt_interleave_horizontal_53_multiE11row_samples;
add.s32 %r18, %r78, %r77;
@%p2 bra $L__BB4_12;
add.s32 %r19, %r74, %r1;
and.b32 %r20, %r19, 1;
add.s32 %r149, %r148, %r17;
and.b32 %r22, %r149, 1;
setp.eq.b32 %p3, %r22, 1;
setp.eq.b32 %p4, %r20, 1;
or.pred %p5, %p4, %p3;
mov.pred %p6, 0;
xor.pred %p7, %p5, %p6;
not.pred %p8, %p7;
@%p8 bra $L__BB4_8;
bra.uni $L__BB4_3;
$L__BB4_8:
ld.global.v2.u32 {%r145, %r146}, [%rd1+-48];
ld.global.u64 %rd18, [%rd1+-112];
add.s32 %r98, %r19, 1;
shr.u32 %r99, %r98, 1;
add.s32 %r100, %r74, 1;
shr.u32 %r101, %r100, 1;
sub.s32 %r102, %r68, %r101;
add.s32 %r147, %r102, %r99;
add.s32 %r148, %r148, 1;
add.s32 %r149, %r149, 1;
mov.u32 %r143, %r68;
mov.u32 %r144, %r69;
bra.uni $L__BB4_9;
$L__BB4_3:
setp.eq.s32 %p9, %r22, 0;
setp.ne.s32 %p10, %r20, 0;
and.pred %p11, %p9, %p10;
@%p11 bra $L__BB4_7;
bra.uni $L__BB4_4;
$L__BB4_7:
ld.global.v2.u32 {%r145, %r146}, [%rd1+-32];
ld.global.u64 %rd18, [%rd1+-104];
shr.u32 %r93, %r74, 1;
sub.s32 %r94, %r66, %r93;
shr.u32 %r95, %r19, 1;
add.s32 %r147, %r94, %r95;
add.s32 %r148, %r148, 1;
add.s32 %r149, %r149, 1;
mov.u32 %r143, %r66;
mov.u32 %r144, %r67;
bra.uni $L__BB4_9;
$L__BB4_4:
setp.ne.s32 %p12, %r22, 0;
setp.eq.s32 %p13, %r20, 0;
and.pred %p14, %p13, %p12;
@%p14 bra $L__BB4_6;
bra.uni $L__BB4_5;
$L__BB4_6:
ld.global.v2.u32 {%r145, %r146}, [%rd1+-16];
ld.global.u64 %rd18, [%rd1+-96];
add.s32 %r86, %r19, 1;
shr.u32 %r87, %r86, 1;
add.s32 %r88, %r74, 1;
shr.u32 %r89, %r88, 1;
sub.s32 %r90, %r64, %r89;
add.s32 %r147, %r90, %r87;
mov.u32 %r143, %r64;
mov.u32 %r144, %r65;
bra.uni $L__BB4_9;
$L__BB4_5:
ld.global.v2.u32 {%r145, %r146}, [%rd1];
ld.global.u64 %rd18, [%rd1+-88];
shr.u32 %r81, %r74, 1;
sub.s32 %r82, %r143, %r81;
shr.u32 %r83, %r19, 1;
add.s32 %r147, %r82, %r83;
$L__BB4_9:
shr.u32 %r103, %r148, 1;
sub.s32 %r104, %r144, %r103;
shr.u32 %r105, %r149, 1;
add.s32 %r55, %r104, %r105;
setp.lt.u32 %p15, %r147, %r143;
setp.le.u32 %p16, %r145, %r147;
or.pred %p17, %p15, %p16;
setp.lt.u32 %p18, %r55, %r144;
or.pred %p19, %p17, %p18;
setp.le.u32 %p20, %r146, %r55;
or.pred %p21, %p20, %p19;
mov.f32 %f24, 0f00000000;
@%p21 bra $L__BB4_11;
sub.s32 %r106, %r145, %r143;
sub.s32 %r107, %r55, %r144;
sub.s32 %r108, %r147, %r143;
mad.lo.s32 %r109, %r107, %r106, %r108;
mul.wide.u32 %rd12, %r109, 4;
add.s64 %rd13, %rd18, %rd12;
ld.f32 %f24, [%rd13];
$L__BB4_11:
st.shared.f32 [%r18], %f24;
$L__BB4_12:
bar.sync 0;
setp.eq.s32 %p22, %r16, 1;
@%p22 bra $L__BB4_19;
bra.uni $L__BB4_13;
$L__BB4_19:
setp.ne.s32 %p34, %r1, 0;
and.b32 %r142, %r74, 1;
setp.eq.b32 %p35, %r142, 1;
not.pred %p36, %p35;
or.pred %p37, %p34, %p36;
@%p37 bra $L__BB4_21;
ld.shared.f32 %f21, [_ZZ48j2k_idwt_interleave_horizontal_53_multiE11row_samples];
mul.rn.f32 %f22, %f21, 0f3F000000;
st.shared.f32 [_ZZ48j2k_idwt_interleave_horizontal_53_multiE11row_samples], %f22;
$L__BB4_21:
bar.sync 0;
@%p34 bra $L__BB4_23;
ld.shared.f32 %f23, [_ZZ48j2k_idwt_interleave_horizontal_53_multiE11row_samples];
mul.wide.u32 %rd16, %r17, 4;
add.s64 %rd17, %rd2, %rd16;
st.f32 [%rd17], %f23;
bra.uni $L__BB4_23;
$L__BB4_13:
and.b32 %r59, %r74, 1;
and.b32 %r60, %r1, 1;
setp.ne.s32 %p24, %r60, %r59;
or.pred %p25, %p2, %p24;
@%p25 bra $L__BB4_15;
setp.gt.u32 %p26, %r1, 1;
mov.u32 %r110, 1;
sub.s32 %r111, %r110, %r1;
add.s32 %r112, %r1, -1;
selp.b32 %r113, %r112, %r111, %p26;
add.s32 %r114, %r1, 1;
setp.lt.u32 %p27, %r114, %r16;
mov.u32 %r115, -3;
sub.s32 %r116, %r115, %r1;
add.s32 %r117, %r116, %r16;
add.s32 %r118, %r117, %r16;
selp.b32 %r119, %r114, %r118, %p27;
shl.b32 %r120, %r113, 2;
add.s32 %r122, %r78, %r120;
shl.b32 %r123, %r119, 2;
add.s32 %r124, %r78, %r123;
ld.shared.f32 %f4, [%r124];
ld.shared.f32 %f5, [%r122];
add.rn.f32 %f6, %f5, %f4;
mov.f32 %f7, 0f3F000000;
mov.f32 %f8, 0f3E800000;
fma.rn.f32 %f9, %f6, %f8, %f7;
cvt.rmi.f32.f32 %f10, %f9;
ld.shared.f32 %f11, [%r18];
sub.rn.f32 %f12, %f11, %f10;
st.shared.f32 [%r18], %f12;
$L__BB4_15:
bar.sync 0;
xor.b32 %r125, %r59, 1;
setp.ne.s32 %p29, %r60, %r125;
or.pred %p30, %p2, %p29;
@%p30 bra $L__BB4_17;
setp.gt.u32 %p31, %r1, 1;
mov.u32 %r126, 1;
sub.s32 %r127, %r126, %r1;
add.s32 %r128, %r1, -1;
selp.b32 %r129, %r128, %r127, %p31;
add.s32 %r130, %r1, 1;
setp.lt.u32 %p32, %r130, %r16;
mov.u32 %r131, -3;
sub.s32 %r132, %r131, %r1;
add.s32 %r133, %r132, %r16;
add.s32 %r134, %r133, %r16;
selp.b32 %r135, %r130, %r134, %p32;
shl.b32 %r136, %r129, 2;
add.s32 %r138, %r78, %r136;
shl.b32 %r139, %r135, 2;
add.s32 %r140, %r78, %r139;
ld.shared.f32 %f13, [%r140];
ld.shared.f32 %f14, [%r138];
add.rn.f32 %f15, %f14, %f13;
mul.rn.f32 %f16, %f15, 0f3F000000;
cvt.rmi.f32.f32 %f17, %f16;
ld.shared.f32 %f18, [%r18];
add.rn.f32 %f19, %f18, %f17;
st.shared.f32 [%r18], %f19;
$L__BB4_17:
bar.sync 0;
@%p2 bra $L__BB4_23;
ld.shared.f32 %f20, [%r18];
mad.lo.s32 %r141, %r16, %r17, %r1;
mul.wide.u32 %rd14, %r141, 4;
add.s64 %rd15, %rd2, %rd14;
st.f32 [%rd15], %f20;
$L__BB4_23:
ret;
}
// .globl j2k_idwt_interleave_horizontal_97_multi
// ---------------------------------------------------------------------------
// j2k_idwt_interleave_horizontal_97_multi
//
// Work mapping: %ctaid.y selects a 128-byte job record, %ctaid.x selects the
// output row inside that job, %tid.x selects the output column. Every thread
// gathers one sub-band sample into the shared row buffer (interleave), then
// the block runs one scaling pass and four lifting passes in place, each
// separated by bar.sync, and finally stores the row to the output buffer.
//
// param_0: global pointer to an array of 128-byte job records. Byte offsets
//          read by this kernel:
//            +0, +8, +16, +24 : 64-bit source pointers, one per parity class
//            +32              : 64-bit output pointer
//            +40 / +48        : u32 pairs; width  = [+48].x - [+40].x
//                                          rows   = [+48].y - [+40].y
//            +56/+64, +72/+80, +88/+96, +104/+112 :
//                               u32 pairs used as (start, end) coordinates of
//                               the four source rectangles
// NOTE(review): field meanings are inferred from how the values are used
//               below; confirm against the host-side record definition.
// NOTE(review): the shared buffer is 2048 bytes (512 floats) and is indexed
//               by %tid.x with no clamp to 512; each thread handles exactly
//               one column. Presumably the launch guarantees
//               width <= blockDim.x <= 512 - confirm.
// ---------------------------------------------------------------------------
.visible .entry j2k_idwt_interleave_horizontal_97_multi(
.param .u64 j2k_idwt_interleave_horizontal_97_multi_param_0
)
{
.reg .pred %p<50>;
.reg .f32 %f<38>;
.reg .b32 %r<181>;
.reg .b64 %rd<19>;
// demoted variable
.shared .align 4 .b8 _ZZ48j2k_idwt_interleave_horizontal_97_multiE11row_samples[2048];
// %rd11 = &records[%ctaid.y]; %rd1 = %rd11 + 112, so the negative offsets
// used further down are still fields of the same record.
ld.param.u64 %rd8, [j2k_idwt_interleave_horizontal_97_multi_param_0];
cvta.to.global.u64 %rd9, %rd8;
mov.u32 %r1, %tid.x;
mov.u32 %r61, %ctaid.y;
mul.wide.u32 %rd10, %r61, 128;
add.s64 %rd11, %rd9, %rd10;
add.s64 %rd1, %rd11, 112;
// Preload the start coordinates of the four source rectangles, the output
// rectangle (%r74,%r179)-(%r70,%r71) and the output pointer %rd2.
ld.global.v2.u32 {%r174, %r175}, [%rd11+104];
ld.global.v2.u32 {%r64, %r65}, [%rd11+88];
ld.global.v2.u32 {%r66, %r67}, [%rd11+72];
ld.global.v2.u32 {%r68, %r69}, [%rd11+56];
ld.global.v2.u32 {%r70, %r71}, [%rd11+48];
ld.global.v2.u32 {%r74, %r179}, [%rd11+40];
ld.global.u64 %rd2, [%rd11+32];
// %r16 = row width, %r76 = row count, %r17 = this block's row.
sub.s32 %r16, %r70, %r74;
sub.s32 %r76, %r71, %r179;
mov.u32 %r17, %ctaid.x;
// Whole block exits when its row is past the end (uniform: depends only on
// %ctaid, so no thread is left waiting at a later barrier).
setp.ge.u32 %p3, %r17, %r76;
@%p3 bra $L__BB5_29;
// %p4 = thread is beyond the row width; %r18 = &row_samples[tid].
setp.ge.u32 %p4, %r1, %r16;
shl.b32 %r77, %r1, 2;
mov.u32 %r78, _ZZ48j2k_idwt_interleave_horizontal_97_multiE11row_samples;
add.s32 %r18, %r78, %r77;
// Out-of-range threads skip the gather but still join every barrier.
@%p4 bra $L__BB5_12;
// Absolute coordinates: %r19 = x0 + tid, %r180 = y0 + row. Their low bits
// (%r20, %r22) choose which of the four sources supplies this sample.
add.s32 %r19, %r74, %r1;
and.b32 %r20, %r19, 1;
add.s32 %r180, %r179, %r17;
and.b32 %r22, %r180, 1;
setp.eq.b32 %p5, %r22, 1;
setp.eq.b32 %p6, %r20, 1;
or.pred %p7, %p6, %p5;
mov.pred %p8, 0;
xor.pred %p9, %p7, %p8;
not.pred %p10, %p9;
@%p10 bra $L__BB5_8;
bra.uni $L__BB5_3;
// x even, y even: source pointer at +0, rectangle start +56 / end +64.
// Column in source = start.x - ceil(x0/2) + ceil(x/2). %r179/%r180 are
// bumped by one so that the shared row computation at $L__BB5_9 also
// rounds up.
$L__BB5_8:
ld.global.v2.u32 {%r176, %r177}, [%rd1+-48];
ld.global.u64 %rd18, [%rd1+-112];
add.s32 %r98, %r19, 1;
shr.u32 %r99, %r98, 1;
add.s32 %r100, %r74, 1;
shr.u32 %r101, %r100, 1;
sub.s32 %r102, %r68, %r101;
add.s32 %r178, %r102, %r99;
add.s32 %r179, %r179, 1;
add.s32 %r180, %r180, 1;
mov.u32 %r174, %r68;
mov.u32 %r175, %r69;
bra.uni $L__BB5_9;
$L__BB5_3:
setp.eq.s32 %p11, %r22, 0;
setp.ne.s32 %p12, %r20, 0;
and.pred %p13, %p11, %p12;
@%p13 bra $L__BB5_7;
bra.uni $L__BB5_4;
// x odd, y even: source pointer at +8, rectangle start +72 / end +80.
// Column uses floor halves; the row still rounds up (bump of %r179/%r180).
$L__BB5_7:
ld.global.v2.u32 {%r176, %r177}, [%rd1+-32];
ld.global.u64 %rd18, [%rd1+-104];
shr.u32 %r93, %r74, 1;
sub.s32 %r94, %r66, %r93;
shr.u32 %r95, %r19, 1;
add.s32 %r178, %r94, %r95;
add.s32 %r179, %r179, 1;
add.s32 %r180, %r180, 1;
mov.u32 %r174, %r66;
mov.u32 %r175, %r67;
bra.uni $L__BB5_9;
$L__BB5_4:
setp.ne.s32 %p14, %r22, 0;
setp.eq.s32 %p15, %r20, 0;
and.pred %p16, %p15, %p14;
@%p16 bra $L__BB5_6;
bra.uni $L__BB5_5;
// x even, y odd: source pointer at +16, rectangle start +88 / end +96.
// Column rounds up; the row uses floor halves (no bump).
$L__BB5_6:
ld.global.v2.u32 {%r176, %r177}, [%rd1+-16];
ld.global.u64 %rd18, [%rd1+-96];
add.s32 %r86, %r19, 1;
shr.u32 %r87, %r86, 1;
add.s32 %r88, %r74, 1;
shr.u32 %r89, %r88, 1;
sub.s32 %r90, %r64, %r89;
add.s32 %r178, %r90, %r87;
mov.u32 %r174, %r64;
mov.u32 %r175, %r65;
bra.uni $L__BB5_9;
// x odd, y odd: source pointer at +24, rectangle start +104 (already in
// %r174/%r175) / end +112. Floor halves in both directions.
$L__BB5_5:
ld.global.v2.u32 {%r176, %r177}, [%rd1];
ld.global.u64 %rd18, [%rd1+-88];
shr.u32 %r81, %r74, 1;
sub.s32 %r82, %r174, %r81;
shr.u32 %r83, %r19, 1;
add.s32 %r178, %r82, %r83;
// Common tail of the gather. (%r178, %r55) is the sample position in the
// source's own coordinates; (%r174,%r175)-(%r176,%r177) is its rectangle.
// Positions outside the rectangle contribute 0.0 instead of being loaded.
$L__BB5_9:
shr.u32 %r103, %r179, 1;
sub.s32 %r104, %r175, %r103;
shr.u32 %r105, %r180, 1;
add.s32 %r55, %r104, %r105;
setp.lt.u32 %p17, %r178, %r174;
setp.le.u32 %p18, %r176, %r178;
or.pred %p19, %p17, %p18;
setp.lt.u32 %p20, %r55, %r175;
or.pred %p21, %p19, %p20;
setp.le.u32 %p22, %r177, %r55;
or.pred %p23, %p22, %p21;
mov.f32 %f37, 0f00000000;
@%p23 bra $L__BB5_11;
// Row-major index inside the source rectangle:
//   (row - start.y) * (end.x - start.x) + (col - start.x)
// The load uses generic addressing (no state-space qualifier).
sub.s32 %r106, %r176, %r174;
sub.s32 %r107, %r55, %r175;
sub.s32 %r108, %r178, %r174;
mad.lo.s32 %r109, %r107, %r106, %r108;
mul.wide.u32 %rd12, %r109, 4;
add.s64 %rd13, %rd18, %rd12;
ld.f32 %f37, [%rd13];
$L__BB5_11:
st.shared.f32 [%r18], %f37;
// Gather complete for the whole row once every thread passes this barrier.
$L__BB5_12:
bar.sync 0;
setp.eq.s32 %p24, %r16, 1;
@%p24 bra $L__BB5_25;
bra.uni $L__BB5_13;
// Width == 1 special case: no lifting. Thread 0 halves the sample when x0
// is odd, then writes it to output[row].
$L__BB5_25:
setp.ne.s32 %p45, %r1, 0;
and.b32 %r173, %r74, 1;
setp.eq.b32 %p46, %r173, 1;
not.pred %p47, %p46;
or.pred %p48, %p45, %p47;
@%p48 bra $L__BB5_27;
ld.shared.f32 %f34, [_ZZ48j2k_idwt_interleave_horizontal_97_multiE11row_samples];
mul.rn.f32 %f35, %f34, 0f3F000000;
st.shared.f32 [_ZZ48j2k_idwt_interleave_horizontal_97_multiE11row_samples], %f35;
$L__BB5_27:
bar.sync 0;
@%p45 bra $L__BB5_29;
ld.shared.f32 %f36, [_ZZ48j2k_idwt_interleave_horizontal_97_multiE11row_samples];
mul.wide.u32 %rd16, %r17, 4;
add.s64 %rd17, %rd2, %rd16;
st.f32 [%rd17], %f36;
bra.uni $L__BB5_29;
// General case. %p25 = thread owns a column; %r59 = parity of x0.
$L__BB5_13:
setp.lt.u32 %p25, %r1, %r16;
and.b32 %r59, %r74, 1;
@%p25 bra $L__BB5_14;
bra.uni $L__BB5_15;
// Scaling pass. 0f3F9D7658 ~= 1.2301741 and 0f3F5019C3 ~= 0.8128931 (its
// reciprocal). With x0 even, even columns get the larger factor and odd
// columns the smaller one; with x0 odd the roles swap.
$L__BB5_14:
setp.eq.s32 %p26, %r59, 0;
selp.f32 %f4, 0f3F5019C3, 0f3F9D7658, %p26;
selp.f32 %f5, 0f3F9D7658, 0f3F5019C3, %p26;
and.b32 %r110, %r1, 1;
setp.eq.b32 %p27, %r110, 1;
selp.f32 %f6, %f4, %f5, %p27;
ld.shared.f32 %f7, [%r18];
mul.rn.f32 %f8, %f6, %f7;
st.shared.f32 [%r18], %f8;
// Lifting pass 1: columns whose parity equals that of x0 (predicate %p1).
//   s[i] += c * (s[left] + s[right]),  c = 0fBEE31355 ~= -0.4435069
// Neighbours are mirrored at the row ends:
//   left  = (i > 1)         ? i - 1 : 1 - i
//   right = (i + 1 < width) ? i + 1 : 2*width - 3 - i
// Only the opposite-parity neighbours are read, and they are not written in
// this pass, so the barrier between passes is sufficient.
$L__BB5_15:
bar.sync 0;
and.b32 %r60, %r1, 1;
setp.eq.s32 %p29, %r60, %r59;
and.pred %p1, %p25, %p29;
not.pred %p30, %p1;
@%p30 bra $L__BB5_17;
setp.gt.u32 %p31, %r1, 1;
mov.u32 %r111, 1;
sub.s32 %r112, %r111, %r1;
add.s32 %r113, %r1, -1;
selp.b32 %r114, %r113, %r112, %p31;
add.s32 %r115, %r1, 1;
setp.lt.u32 %p32, %r115, %r16;
mov.u32 %r116, -3;
sub.s32 %r117, %r116, %r1;
add.s32 %r118, %r117, %r16;
add.s32 %r119, %r118, %r16;
selp.b32 %r120, %r115, %r119, %p32;
shl.b32 %r121, %r114, 2;
add.s32 %r123, %r78, %r121;
shl.b32 %r124, %r120, 2;
add.s32 %r125, %r78, %r124;
ld.shared.f32 %f9, [%r125];
ld.shared.f32 %f10, [%r123];
add.rn.f32 %f11, %f10, %f9;
ld.shared.f32 %f12, [%r18];
mov.f32 %f13, 0fBEE31355;
fma.rn.f32 %f14, %f11, %f13, %f12;
st.shared.f32 [%r18], %f14;
// Lifting pass 2: columns of the other parity (predicate %p2), same
// mirrored-neighbour update with c = 0fBF620676 ~= -0.8829110.
$L__BB5_17:
bar.sync 0;
xor.b32 %r126, %r59, 1;
setp.eq.s32 %p34, %r60, %r126;
and.pred %p2, %p25, %p34;
not.pred %p35, %p2;
@%p35 bra $L__BB5_19;
setp.gt.u32 %p36, %r1, 1;
mov.u32 %r127, 1;
sub.s32 %r128, %r127, %r1;
add.s32 %r129, %r1, -1;
selp.b32 %r130, %r129, %r128, %p36;
add.s32 %r131, %r1, 1;
setp.lt.u32 %p37, %r131, %r16;
mov.u32 %r132, -3;
sub.s32 %r133, %r132, %r1;
add.s32 %r134, %r133, %r16;
add.s32 %r135, %r134, %r16;
selp.b32 %r136, %r131, %r135, %p37;
shl.b32 %r137, %r130, 2;
add.s32 %r139, %r78, %r137;
shl.b32 %r140, %r136, 2;
add.s32 %r141, %r78, %r140;
ld.shared.f32 %f15, [%r141];
ld.shared.f32 %f16, [%r139];
add.rn.f32 %f17, %f16, %f15;
ld.shared.f32 %f18, [%r18];
mov.f32 %f19, 0fBF620676;
fma.rn.f32 %f20, %f17, %f19, %f18;
st.shared.f32 [%r18], %f20;
// Lifting pass 3: same columns as pass 1 (reuses %p30),
// c = 0f3D5901AE ~= +0.0529801.
$L__BB5_19:
bar.sync 0;
@%p30 bra $L__BB5_21;
setp.gt.u32 %p39, %r1, 1;
mov.u32 %r142, 1;
sub.s32 %r143, %r142, %r1;
add.s32 %r144, %r1, -1;
selp.b32 %r145, %r144, %r143, %p39;
add.s32 %r146, %r1, 1;
setp.lt.u32 %p40, %r146, %r16;
mov.u32 %r147, -3;
sub.s32 %r148, %r147, %r1;
add.s32 %r149, %r148, %r16;
add.s32 %r150, %r149, %r16;
selp.b32 %r151, %r146, %r150, %p40;
shl.b32 %r152, %r145, 2;
add.s32 %r154, %r78, %r152;
shl.b32 %r155, %r151, 2;
add.s32 %r156, %r78, %r155;
ld.shared.f32 %f21, [%r156];
ld.shared.f32 %f22, [%r154];
add.rn.f32 %f23, %f22, %f21;
ld.shared.f32 %f24, [%r18];
mov.f32 %f25, 0f3D5901AE;
fma.rn.f32 %f26, %f23, %f25, %f24;
st.shared.f32 [%r18], %f26;
// Lifting pass 4: same columns as pass 2 (reuses %p35),
// c = 0f3FCB0673 ~= +1.5861343.
$L__BB5_21:
bar.sync 0;
@%p35 bra $L__BB5_23;
setp.gt.u32 %p42, %r1, 1;
mov.u32 %r157, 1;
sub.s32 %r158, %r157, %r1;
add.s32 %r159, %r1, -1;
selp.b32 %r160, %r159, %r158, %p42;
add.s32 %r161, %r1, 1;
setp.lt.u32 %p43, %r161, %r16;
mov.u32 %r162, -3;
sub.s32 %r163, %r162, %r1;
add.s32 %r164, %r163, %r16;
add.s32 %r165, %r164, %r16;
selp.b32 %r166, %r161, %r165, %p43;
shl.b32 %r167, %r160, 2;
add.s32 %r169, %r78, %r167;
shl.b32 %r170, %r166, 2;
add.s32 %r171, %r78, %r170;
ld.shared.f32 %f27, [%r171];
ld.shared.f32 %f28, [%r169];
add.rn.f32 %f29, %f28, %f27;
ld.shared.f32 %f30, [%r18];
mov.f32 %f31, 0f3FCB0673;
fma.rn.f32 %f32, %f29, %f31, %f30;
st.shared.f32 [%r18], %f32;
// Write-back: output[row * width + tid] = s[tid] for in-range threads.
// The index is a 32-bit product, zero-extended before scaling by 4.
$L__BB5_23:
bar.sync 0;
@%p4 bra $L__BB5_29;
ld.shared.f32 %f33, [%r18];
mad.lo.s32 %r172, %r16, %r17, %r1;
mul.wide.u32 %rd14, %r172, 4;
add.s64 %rd15, %rd2, %rd14;
st.f32 [%rd15], %f33;
$L__BB5_29:
ret;
}
// .globl j2k_idwt_horizontal
// ---------------------------------------------------------------------------
// j2k_idwt_horizontal
//
// In-place horizontal inverse lifting of one row per thread. The global
// thread index (%ctaid.x * %ntid.x + %tid.x) is the row number.
//
// param_0: global pointer to f32 samples, row-major, `width` floats per row.
// param_1: global pointer to a descriptor. Fields read here:
//            +0  x0, +4  y0, +8  x1, +12 y1   (u32)
//              width = x1 - x0, rows = y1 - y0
//            +80 u32 selector: 0 -> integer-rounded two-pass path,
//                              non-zero -> four-pass path with scaling
// NOTE(review): field names are inferred from use; confirm against the
//               host-side descriptor.
//
// Fix relative to the generated code: rows of width 0 now return
// immediately. Previously only width == 1 was special-cased, so width == 0
// reached the boundary steps, which form the mirror index 2*width-3
// (0xFFFFFFFD after wrap-around) and then load from that offset and store to
// row[0] of an empty row. One extra predicate (%p62) is declared for the
// guard; behaviour for every width >= 1 is unchanged.
// ---------------------------------------------------------------------------
.visible .entry j2k_idwt_horizontal(
.param .u64 j2k_idwt_horizontal_param_0,
.param .u64 j2k_idwt_horizontal_param_1
)
{
.reg .pred %p<63>;
.reg .f32 %f<131>;
.reg .b32 %r<103>;
.reg .b64 %rd<73>;
ld.param.u64 %rd5, [j2k_idwt_horizontal_param_0];
ld.param.u64 %rd6, [j2k_idwt_horizontal_param_1];
cvta.to.global.u64 %rd1, %rd6;
// %r1 = x0, %r2 = width, %r32 = row count, %r3 = this thread's row.
ld.global.u32 %r29, [%rd1+8];
ld.global.u32 %r1, [%rd1];
sub.s32 %r2, %r29, %r1;
ld.global.u32 %r30, [%rd1+12];
ld.global.u32 %r31, [%rd1+4];
sub.s32 %r32, %r30, %r31;
mov.u32 %r33, %ntid.x;
mov.u32 %r34, %ctaid.x;
mov.u32 %r35, %tid.x;
mad.lo.s32 %r3, %r34, %r33, %r35;
setp.ge.u32 %p4, %r3, %r32;
@%p4 bra $L__BB6_48;
// Empty row: nothing to transform, and the mirror-index arithmetic below
// is only valid for width >= 2.
setp.eq.s32 %p62, %r2, 0;
@%p62 bra $L__BB6_48;
// %rd2 = &data[row * width] (32-bit product, zero-extended);
// %r4 = parity of x0.
cvta.to.global.u64 %rd7, %rd5;
mul.lo.s32 %r36, %r2, %r3;
mul.wide.u32 %rd8, %r36, 4;
add.s64 %rd2, %rd7, %rd8;
and.b32 %r4, %r1, 1;
setp.eq.s32 %p5, %r2, 1;
@%p5 bra $L__BB6_46;
bra.uni $L__BB6_2;
// Width == 1: the single sample is halved when x0 is odd, otherwise left
// untouched.
$L__BB6_46:
setp.eq.s32 %p61, %r4, 0;
@%p61 bra $L__BB6_48;
ld.global.f32 %f129, [%rd2];
mul.rn.f32 %f130, %f129, 0f3F000000;
st.global.f32 [%rd2], %f130;
bra.uni $L__BB6_48;
// Width >= 2. %r5 = opposite parity of x0, %rd3 = &row[width-1],
// %rd4 = &row[width-2]. The selector at +80 picks the code path.
$L__BB6_2:
ld.global.u32 %r37, [%rd1+80];
setp.eq.s32 %p6, %r37, 0;
xor.b32 %r5, %r4, 1;
add.s32 %r6, %r2, -1;
mul.wide.u32 %rd9, %r6, 4;
add.s64 %rd3, %rd2, %rd9;
add.s32 %r7, %r2, -2;
mul.wide.u32 %rd10, %r7, 4;
add.s64 %rd4, %rd2, %rd10;
@%p6 bra $L__BB6_34;
// ---- Four-pass path ------------------------------------------------------
// Scaling: 0f3F9D7658 ~= 1.2301741, 0f3F5019C3 ~= 0.8128931. %f1 scales
// even local indices, %f2 odd ones; which constant is which depends on the
// parity of x0. The loop handles index pairs (i-1, i) for i = 1, 3, ...
setp.eq.s32 %p1, %r4, 0;
selp.f32 %f1, 0f3F9D7658, 0f3F5019C3, %p1;
setp.lt.u32 %p7, %r2, 2;
@%p7 bra $L__BB6_6;
selp.f32 %f2, 0f3F5019C3, 0f3F9D7658, %p1;
mov.u32 %r96, 1;
$L__BB6_5:
add.s32 %r39, %r96, -1;
mul.wide.u32 %rd11, %r39, 4;
add.s64 %rd12, %rd2, %rd11;
ld.global.f32 %f3, [%rd12];
mul.rn.f32 %f4, %f1, %f3;
st.global.f32 [%rd12], %f4;
ld.global.f32 %f5, [%rd12+4];
mul.rn.f32 %f6, %f2, %f5;
st.global.f32 [%rd12+4], %f6;
add.s32 %r96, %r96, 2;
setp.lt.u32 %p8, %r96, %r2;
@%p8 bra $L__BB6_5;
// Odd width: the last sample was not covered by the pair loop.
$L__BB6_6:
and.b32 %r40, %r2, 1;
setp.eq.b32 %p9, %r40, 1;
mov.pred %p10, 0;
xor.pred %p11, %p9, %p10;
not.pred %p12, %p11;
@%p12 bra $L__BB6_8;
ld.global.f32 %f7, [%rd3];
mul.rn.f32 %f8, %f1, %f7;
st.global.f32 [%rd3], %f8;
// Pass 1 (c = 0fBEE31355 ~= -0.4435069) on local indices whose parity
// equals that of x0. Each pass has three pieces:
//   left edge  : row[0] when it belongs to the pass; its missing left
//                neighbour is mirrored (index 1 for width > 1)
//   interior   : row[i-1] += c * (row[i-2] + row[i]), i stepping by 2
//   right edge : row[width-1] when it belongs to the pass; its missing
//                right neighbour is mirrored to row[width-2]
$L__BB6_8:
setp.ne.s32 %p13, %r4, 0;
@%p13 bra $L__BB6_10;
setp.gt.u32 %p14, %r2, 1;
add.s32 %r41, %r2, %r2;
add.s32 %r42, %r41, -3;
selp.b32 %r43, 1, %r42, %p14;
mul.wide.u32 %rd13, %r43, 4;
add.s64 %rd14, %rd2, %rd13;
ld.global.f32 %f9, [%rd14];
ld.global.f32 %f10, [%rd2+4];
add.rn.f32 %f11, %f10, %f9;
ld.global.f32 %f12, [%rd2];
mov.f32 %f13, 0fBEE31355;
fma.rn.f32 %f14, %f11, %f13, %f12;
st.global.f32 [%rd2], %f14;
// %r99 = first interior loop index for the x0-parity passes (3 when x0 is
// even, 2 when odd); %p16 records whether that loop is empty. Both are
// reused by pass 3.
$L__BB6_10:
selp.b32 %r99, 3, 2, %p1;
setp.ge.u32 %p16, %r99, %r2;
@%p16 bra $L__BB6_13;
mov.u32 %r97, %r99;
$L__BB6_12:
add.s32 %r44, %r97, -2;
mul.wide.u32 %rd15, %r44, 4;
add.s64 %rd16, %rd2, %rd15;
mul.wide.u32 %rd17, %r97, 4;
add.s64 %rd18, %rd2, %rd17;
ld.global.f32 %f15, [%rd18];
ld.global.f32 %f16, [%rd16];
add.rn.f32 %f17, %f16, %f15;
add.s32 %r45, %r97, -1;
mul.wide.u32 %rd19, %r45, 4;
add.s64 %rd20, %rd2, %rd19;
ld.global.f32 %f18, [%rd20];
mov.f32 %f19, 0fBEE31355;
fma.rn.f32 %f20, %f17, %f19, %f18;
st.global.f32 [%rd20], %f20;
add.s32 %r97, %r97, 2;
setp.lt.u32 %p17, %r97, %r2;
@%p17 bra $L__BB6_12;
// Right edge of pass 1. %p2 = (width > 1) && parity(width-1) == parity(x0);
// its negation %p20 is reused by pass 3.
$L__BB6_13:
setp.gt.u32 %p18, %r2, 1;
and.b32 %r14, %r6, 1;
setp.eq.s32 %p19, %r14, %r4;
and.pred %p2, %p18, %p19;
not.pred %p20, %p2;
@%p20 bra $L__BB6_15;
setp.gt.u32 %p21, %r6, 1;
mov.u32 %r46, 2;
sub.s32 %r47, %r46, %r2;
selp.b32 %r49, %r7, %r47, %p21;
mul.wide.u32 %rd21, %r49, 4;
add.s64 %rd22, %rd2, %rd21;
ld.global.f32 %f21, [%rd4];
ld.global.f32 %f22, [%rd22];
add.rn.f32 %f23, %f22, %f21;
ld.global.f32 %f24, [%rd3];
mov.f32 %f25, 0fBEE31355;
fma.rn.f32 %f26, %f23, %f25, %f24;
st.global.f32 [%rd3], %f26;
// Pass 2 (c = 0fBF620676 ~= -0.8829110) on the opposite parity.
// Left edge applies when x0 is odd (%r5 == 0).
$L__BB6_15:
setp.ne.s32 %p22, %r5, 0;
@%p22 bra $L__BB6_17;
add.s32 %r50, %r2, %r2;
add.s32 %r51, %r50, -3;
selp.b32 %r52, 1, %r51, %p18;
mul.wide.u32 %rd23, %r52, 4;
add.s64 %rd24, %rd2, %rd23;
ld.global.f32 %f27, [%rd24];
ld.global.f32 %f28, [%rd2+4];
add.rn.f32 %f29, %f28, %f27;
ld.global.f32 %f30, [%rd2];
mov.f32 %f31, 0fBF620676;
fma.rn.f32 %f32, %f29, %f31, %f30;
st.global.f32 [%rd2], %f32;
// %r100 / %p25: interior loop start and emptiness for the opposite-parity
// passes; reused by pass 4.
$L__BB6_17:
setp.eq.s32 %p24, %r5, 0;
selp.b32 %r100, 3, 2, %p24;
setp.ge.u32 %p25, %r100, %r2;
@%p25 bra $L__BB6_20;
mov.u32 %r98, %r100;
$L__BB6_19:
add.s32 %r53, %r98, -2;
mul.wide.u32 %rd25, %r53, 4;
add.s64 %rd26, %rd2, %rd25;
mul.wide.u32 %rd27, %r98, 4;
add.s64 %rd28, %rd2, %rd27;
ld.global.f32 %f33, [%rd28];
ld.global.f32 %f34, [%rd26];
add.rn.f32 %f35, %f34, %f33;
add.s32 %r54, %r98, -1;
mul.wide.u32 %rd29, %r54, 4;
add.s64 %rd30, %rd2, %rd29;
ld.global.f32 %f36, [%rd30];
mov.f32 %f37, 0fBF620676;
fma.rn.f32 %f38, %f35, %f37, %f36;
st.global.f32 [%rd30], %f38;
add.s32 %r98, %r98, 2;
setp.lt.u32 %p26, %r98, %r2;
@%p26 bra $L__BB6_19;
// Right edge of pass 2. %p3 = (width > 1) && parity(width-1) != parity(x0);
// its negation %p29 is reused by pass 4.
$L__BB6_20:
setp.eq.s32 %p28, %r14, %r5;
and.pred %p3, %p18, %p28;
not.pred %p29, %p3;
@%p29 bra $L__BB6_22;
setp.gt.u32 %p30, %r6, 1;
mov.u32 %r55, 2;
sub.s32 %r56, %r55, %r2;
selp.b32 %r58, %r7, %r56, %p30;
mul.wide.u32 %rd31, %r58, 4;
add.s64 %rd32, %rd2, %rd31;
ld.global.f32 %f39, [%rd4];
ld.global.f32 %f40, [%rd32];
add.rn.f32 %f41, %f40, %f39;
ld.global.f32 %f42, [%rd3];
mov.f32 %f43, 0fBF620676;
fma.rn.f32 %f44, %f41, %f43, %f42;
st.global.f32 [%rd3], %f44;
// Pass 3 (c = 0f3D5901AE ~= +0.0529801): same samples as pass 1.
$L__BB6_22:
@%p13 bra $L__BB6_24;
add.s32 %r59, %r2, %r2;
add.s32 %r60, %r59, -3;
selp.b32 %r61, 1, %r60, %p18;
mul.wide.u32 %rd33, %r61, 4;
add.s64 %rd34, %rd2, %rd33;
ld.global.f32 %f45, [%rd34];
ld.global.f32 %f46, [%rd2+4];
add.rn.f32 %f47, %f46, %f45;
ld.global.f32 %f48, [%rd2];
mov.f32 %f49, 0f3D5901AE;
fma.rn.f32 %f50, %f47, %f49, %f48;
st.global.f32 [%rd2], %f50;
// Interior of pass 3 iterates on %r99 itself (no copy needed any more).
$L__BB6_24:
@%p16 bra $L__BB6_26;
$L__BB6_25:
add.s32 %r62, %r99, -2;
mul.wide.u32 %rd35, %r62, 4;
add.s64 %rd36, %rd2, %rd35;
mul.wide.u32 %rd37, %r99, 4;
add.s64 %rd38, %rd2, %rd37;
ld.global.f32 %f51, [%rd38];
ld.global.f32 %f52, [%rd36];
add.rn.f32 %f53, %f52, %f51;
add.s32 %r63, %r99, -1;
mul.wide.u32 %rd39, %r63, 4;
add.s64 %rd40, %rd2, %rd39;
ld.global.f32 %f54, [%rd40];
mov.f32 %f55, 0f3D5901AE;
fma.rn.f32 %f56, %f53, %f55, %f54;
st.global.f32 [%rd40], %f56;
add.s32 %r99, %r99, 2;
setp.lt.u32 %p34, %r99, %r2;
@%p34 bra $L__BB6_25;
$L__BB6_26:
@%p20 bra $L__BB6_28;
setp.gt.u32 %p36, %r6, 1;
mov.u32 %r64, 2;
sub.s32 %r65, %r64, %r2;
selp.b32 %r67, %r7, %r65, %p36;
mul.wide.u32 %rd41, %r67, 4;
add.s64 %rd42, %rd2, %rd41;
ld.global.f32 %f57, [%rd4];
ld.global.f32 %f58, [%rd42];
add.rn.f32 %f59, %f58, %f57;
ld.global.f32 %f60, [%rd3];
mov.f32 %f61, 0f3D5901AE;
fma.rn.f32 %f62, %f59, %f61, %f60;
st.global.f32 [%rd3], %f62;
// Pass 4 (c = 0f3FCB0673 ~= +1.5861343): same samples as pass 2.
$L__BB6_28:
@%p22 bra $L__BB6_30;
add.s32 %r68, %r2, %r2;
add.s32 %r69, %r68, -3;
selp.b32 %r70, 1, %r69, %p18;
mul.wide.u32 %rd43, %r70, 4;
add.s64 %rd44, %rd2, %rd43;
ld.global.f32 %f63, [%rd44];
ld.global.f32 %f64, [%rd2+4];
add.rn.f32 %f65, %f64, %f63;
ld.global.f32 %f66, [%rd2];
mov.f32 %f67, 0f3FCB0673;
fma.rn.f32 %f68, %f65, %f67, %f66;
st.global.f32 [%rd2], %f68;
$L__BB6_30:
@%p25 bra $L__BB6_32;
$L__BB6_31:
add.s32 %r71, %r100, -2;
mul.wide.u32 %rd45, %r71, 4;
add.s64 %rd46, %rd2, %rd45;
mul.wide.u32 %rd47, %r100, 4;
add.s64 %rd48, %rd2, %rd47;
ld.global.f32 %f69, [%rd48];
ld.global.f32 %f70, [%rd46];
add.rn.f32 %f71, %f70, %f69;
add.s32 %r72, %r100, -1;
mul.wide.u32 %rd49, %r72, 4;
add.s64 %rd50, %rd2, %rd49;
ld.global.f32 %f72, [%rd50];
mov.f32 %f73, 0f3FCB0673;
fma.rn.f32 %f74, %f71, %f73, %f72;
st.global.f32 [%rd50], %f74;
add.s32 %r100, %r100, 2;
setp.lt.u32 %p40, %r100, %r2;
@%p40 bra $L__BB6_31;
$L__BB6_32:
@%p29 bra $L__BB6_48;
setp.gt.u32 %p42, %r6, 1;
mov.u32 %r73, 2;
sub.s32 %r74, %r73, %r2;
selp.b32 %r76, %r7, %r74, %p42;
mul.wide.u32 %rd51, %r76, 4;
add.s64 %rd52, %rd2, %rd51;
ld.global.f32 %f75, [%rd4];
ld.global.f32 %f76, [%rd52];
add.rn.f32 %f77, %f76, %f75;
ld.global.f32 %f78, [%rd3];
mov.f32 %f79, 0f3FCB0673;
fma.rn.f32 %f80, %f77, %f79, %f78;
st.global.f32 [%rd3], %f80;
bra.uni $L__BB6_48;
// ---- Two-pass integer-rounded path (selector == 0) ------------------------
// Pass A, samples with the parity of x0:
//   row[i] -= floor(0.25 * (left + right) + 0.5)
// (cvt.rmi rounds toward minus infinity). Same left-edge / interior /
// right-edge split and mirroring as above.
$L__BB6_34:
setp.ne.s32 %p43, %r4, 0;
@%p43 bra $L__BB6_36;
setp.gt.u32 %p44, %r2, 1;
add.s32 %r77, %r2, %r2;
add.s32 %r78, %r77, -3;
selp.b32 %r79, 1, %r78, %p44;
mul.wide.u32 %rd53, %r79, 4;
add.s64 %rd54, %rd2, %rd53;
ld.global.f32 %f81, [%rd54];
ld.global.f32 %f82, [%rd2+4];
add.rn.f32 %f83, %f82, %f81;
mov.f32 %f84, 0f3F000000;
mov.f32 %f85, 0f3E800000;
fma.rn.f32 %f86, %f83, %f85, %f84;
cvt.rmi.f32.f32 %f87, %f86;
ld.global.f32 %f88, [%rd2];
sub.rn.f32 %f89, %f88, %f87;
st.global.f32 [%rd2], %f89;
$L__BB6_36:
setp.eq.s32 %p45, %r4, 0;
selp.b32 %r101, 3, 2, %p45;
setp.ge.u32 %p46, %r101, %r2;
@%p46 bra $L__BB6_38;
$L__BB6_37:
add.s32 %r80, %r101, -1;
mul.wide.u32 %rd55, %r80, 4;
add.s64 %rd56, %rd2, %rd55;
add.s32 %r81, %r101, -2;
mul.wide.u32 %rd57, %r81, 4;
add.s64 %rd58, %rd2, %rd57;
mul.wide.u32 %rd59, %r101, 4;
add.s64 %rd60, %rd2, %rd59;
ld.global.f32 %f90, [%rd60];
ld.global.f32 %f91, [%rd58];
add.rn.f32 %f92, %f91, %f90;
mov.f32 %f93, 0f3F000000;
mov.f32 %f94, 0f3E800000;
fma.rn.f32 %f95, %f92, %f94, %f93;
cvt.rmi.f32.f32 %f96, %f95;
ld.global.f32 %f97, [%rd56];
sub.rn.f32 %f98, %f97, %f96;
st.global.f32 [%rd56], %f98;
add.s32 %r101, %r101, 2;
setp.lt.u32 %p47, %r101, %r2;
@%p47 bra $L__BB6_37;
// Right edge of pass A: taken when parity(width-1) == parity(x0), written
// here as ((width & 1) ^ 1) == (x0 & 1).
$L__BB6_38:
setp.lt.u32 %p48, %r2, 2;
and.b32 %r25, %r2, 1;
xor.b32 %r82, %r25, 1;
setp.ne.s32 %p49, %r82, %r4;
or.pred %p50, %p48, %p49;
@%p50 bra $L__BB6_40;
setp.gt.u32 %p51, %r6, 1;
mov.u32 %r84, 2;
sub.s32 %r85, %r84, %r2;
selp.b32 %r87, %r7, %r85, %p51;
mul.wide.u32 %rd61, %r87, 4;
add.s64 %rd62, %rd2, %rd61;
ld.global.f32 %f99, [%rd4];
ld.global.f32 %f100, [%rd62];
add.rn.f32 %f101, %f100, %f99;
mov.f32 %f102, 0f3F000000;
mov.f32 %f103, 0f3E800000;
fma.rn.f32 %f104, %f101, %f103, %f102;
cvt.rmi.f32.f32 %f105, %f104;
ld.global.f32 %f106, [%rd3];
sub.rn.f32 %f107, %f106, %f105;
st.global.f32 [%rd3], %f107;
// Pass B, samples of the opposite parity:
//   row[i] += floor(0.5 * (left + right))
$L__BB6_40:
setp.ne.s32 %p52, %r5, 0;
@%p52 bra $L__BB6_42;
setp.gt.u32 %p53, %r2, 1;
add.s32 %r88, %r2, %r2;
add.s32 %r89, %r88, -3;
selp.b32 %r90, 1, %r89, %p53;
mul.wide.u32 %rd63, %r90, 4;
add.s64 %rd64, %rd2, %rd63;
ld.global.f32 %f108, [%rd64];
ld.global.f32 %f109, [%rd2+4];
add.rn.f32 %f110, %f109, %f108;
mul.rn.f32 %f111, %f110, 0f3F000000;
cvt.rmi.f32.f32 %f112, %f111;
ld.global.f32 %f113, [%rd2];
add.rn.f32 %f114, %f113, %f112;
st.global.f32 [%rd2], %f114;
$L__BB6_42:
setp.eq.s32 %p54, %r5, 0;
selp.b32 %r102, 3, 2, %p54;
setp.ge.u32 %p55, %r102, %r2;
@%p55 bra $L__BB6_44;
$L__BB6_43:
add.s32 %r91, %r102, -1;
mul.wide.u32 %rd65, %r91, 4;
add.s64 %rd66, %rd2, %rd65;
add.s32 %r92, %r102, -2;
mul.wide.u32 %rd67, %r92, 4;
add.s64 %rd68, %rd2, %rd67;
mul.wide.u32 %rd69, %r102, 4;
add.s64 %rd70, %rd2, %rd69;
ld.global.f32 %f115, [%rd70];
ld.global.f32 %f116, [%rd68];
add.rn.f32 %f117, %f116, %f115;
mul.rn.f32 %f118, %f117, 0f3F000000;
cvt.rmi.f32.f32 %f119, %f118;
ld.global.f32 %f120, [%rd66];
add.rn.f32 %f121, %f120, %f119;
st.global.f32 [%rd66], %f121;
add.s32 %r102, %r102, 2;
setp.lt.u32 %p56, %r102, %r2;
@%p56 bra $L__BB6_43;
// Right edge of pass B: taken when (width & 1) == (x0 & 1), i.e. the last
// sample has the opposite parity to x0.
$L__BB6_44:
setp.ne.s32 %p58, %r25, %r4;
or.pred %p59, %p48, %p58;
@%p59 bra $L__BB6_48;
setp.gt.u32 %p60, %r6, 1;
mov.u32 %r93, 2;
sub.s32 %r94, %r93, %r2;
selp.b32 %r95, %r7, %r94, %p60;
mul.wide.u32 %rd71, %r95, 4;
add.s64 %rd72, %rd2, %rd71;
ld.global.f32 %f122, [%rd4];
ld.global.f32 %f123, [%rd72];
add.rn.f32 %f124, %f123, %f122;
mul.rn.f32 %f125, %f124, 0f3F000000;
cvt.rmi.f32.f32 %f126, %f125;
ld.global.f32 %f127, [%rd3];
add.rn.f32 %f128, %f127, %f126;
st.global.f32 [%rd3], %f128;
$L__BB6_48:
ret;
}
// .globl j2k_idwt_horizontal_53
// ---------------------------------------------------------------------------
// j2k_idwt_horizontal_53
//
// In-place horizontal inverse lifting with integer rounding, one row per
// thread (row = %ctaid.x * %ntid.x + %tid.x).
//
// param_0: global pointer to f32 samples, row-major, `width` floats per row.
// param_1: global pointer to a descriptor; u32 fields read here:
//            +0 x0, +4 y0, +8 x1, +12 y1  (width = x1 - x0, rows = y1 - y0)
// NOTE(review): field names are inferred from use; confirm against the
//               host-side descriptor.
//
// Two passes over the row:
//   pass A (samples with the parity of x0):
//       row[i] -= floor(0.25 * (left + right) + 0.5)
//   pass B (samples of the opposite parity):
//       row[i] += floor(0.5 * (left + right))
// Neighbours missing at either end of the row are mirrored back inside it.
//
// Fix relative to the generated code: rows of width 0 now return
// immediately. Previously width 0 fell through to the left-edge step, which
// forms the mirror index 2*width-3 (0xFFFFFFFD after wrap-around), loads
// from that offset and then stores to row[0] of an empty row. One extra
// predicate (%p22) is declared for the guard; behaviour for width >= 1 is
// unchanged.
// ---------------------------------------------------------------------------
.visible .entry j2k_idwt_horizontal_53(
.param .u64 j2k_idwt_horizontal_53_param_0,
.param .u64 j2k_idwt_horizontal_53_param_1
)
{
.reg .pred %p<23>;
.reg .f32 %f<51>;
.reg .b32 %r<43>;
.reg .b64 %rd<31>;
ld.param.u64 %rd4, [j2k_idwt_horizontal_53_param_0];
ld.param.u64 %rd5, [j2k_idwt_horizontal_53_param_1];
cvta.to.global.u64 %rd6, %rd5;
// %r1 = x0, %r2 = width, %r17 = row count, %r3 = this thread's row.
ld.global.u32 %r14, [%rd6+8];
ld.global.u32 %r1, [%rd6];
sub.s32 %r2, %r14, %r1;
ld.global.u32 %r15, [%rd6+12];
ld.global.u32 %r16, [%rd6+4];
sub.s32 %r17, %r15, %r16;
mov.u32 %r18, %ntid.x;
mov.u32 %r19, %ctaid.x;
mov.u32 %r20, %tid.x;
mad.lo.s32 %r3, %r19, %r18, %r20;
setp.ge.u32 %p1, %r3, %r17;
@%p1 bra $L__BB7_16;
// Empty row: nothing to transform, and the mirror-index arithmetic below
// is only valid for width >= 2.
setp.eq.s32 %p22, %r2, 0;
@%p22 bra $L__BB7_16;
// %rd1 = &data[row * width] (32-bit product, zero-extended);
// %r4 = parity of x0.
cvta.to.global.u64 %rd7, %rd4;
mul.lo.s32 %r21, %r2, %r3;
mul.wide.u32 %rd8, %r21, 4;
add.s64 %rd1, %rd7, %rd8;
and.b32 %r4, %r1, 1;
setp.eq.s32 %p2, %r2, 1;
@%p2 bra $L__BB7_14;
bra.uni $L__BB7_2;
// Width == 1: the single sample is halved when x0 is odd, otherwise left
// untouched.
$L__BB7_14:
setp.eq.s32 %p21, %r4, 0;
@%p21 bra $L__BB7_16;
ld.global.f32 %f49, [%rd1];
mul.rn.f32 %f50, %f49, 0f3F000000;
st.global.f32 [%rd1], %f50;
bra.uni $L__BB7_16;
// Pass A, left edge: row[0] belongs to the pass when x0 is even. Its left
// neighbour is mirrored to index 1 (width > 1 always holds here).
$L__BB7_2:
setp.ne.s32 %p3, %r4, 0;
@%p3 bra $L__BB7_4;
setp.gt.u32 %p4, %r2, 1;
add.s32 %r22, %r2, %r2;
add.s32 %r23, %r22, -3;
selp.b32 %r24, 1, %r23, %p4;
mul.wide.u32 %rd9, %r24, 4;
add.s64 %rd10, %rd1, %rd9;
ld.global.f32 %f1, [%rd10];
ld.global.f32 %f2, [%rd1+4];
add.rn.f32 %f3, %f2, %f1;
mov.f32 %f4, 0f3F000000;
mov.f32 %f5, 0f3E800000;
fma.rn.f32 %f6, %f3, %f5, %f4;
cvt.rmi.f32.f32 %f7, %f6;
ld.global.f32 %f8, [%rd1];
sub.rn.f32 %f9, %f8, %f7;
st.global.f32 [%rd1], %f9;
// Pass A, interior: %r41 starts at 3 (x0 even) or 2 (x0 odd) and steps by
// 2; each iteration updates row[%r41-1] from row[%r41-2] and row[%r41].
$L__BB7_4:
setp.eq.s32 %p5, %r4, 0;
selp.b32 %r41, 3, 2, %p5;
setp.ge.u32 %p6, %r41, %r2;
@%p6 bra $L__BB7_6;
$L__BB7_5:
add.s32 %r25, %r41, -1;
mul.wide.u32 %rd11, %r25, 4;
add.s64 %rd12, %rd1, %rd11;
add.s32 %r26, %r41, -2;
mul.wide.u32 %rd13, %r26, 4;
add.s64 %rd14, %rd1, %rd13;
mul.wide.u32 %rd15, %r41, 4;
add.s64 %rd16, %rd1, %rd15;
ld.global.f32 %f10, [%rd16];
ld.global.f32 %f11, [%rd14];
add.rn.f32 %f12, %f11, %f10;
mov.f32 %f13, 0f3F000000;
mov.f32 %f14, 0f3E800000;
fma.rn.f32 %f15, %f12, %f14, %f13;
cvt.rmi.f32.f32 %f16, %f15;
ld.global.f32 %f17, [%rd12];
sub.rn.f32 %f18, %f17, %f16;
st.global.f32 [%rd12], %f18;
add.s32 %r41, %r41, 2;
setp.lt.u32 %p7, %r41, %r2;
@%p7 bra $L__BB7_5;
// Pass A, right edge: %rd2 = &row[width-1], %rd3 = &row[width-2]. Taken
// when ((width & 1) ^ 1) == (x0 & 1), i.e. the last sample has the parity
// of x0; its missing right neighbour is mirrored to row[width-2].
$L__BB7_6:
setp.lt.u32 %p8, %r2, 2;
and.b32 %r8, %r2, 1;
xor.b32 %r27, %r8, 1;
setp.ne.s32 %p9, %r27, %r4;
add.s32 %r9, %r2, -1;
mul.wide.u32 %rd17, %r9, 4;
add.s64 %rd2, %rd1, %rd17;
add.s32 %r10, %r2, -2;
mul.wide.u32 %rd18, %r10, 4;
add.s64 %rd3, %rd1, %rd18;
or.pred %p10, %p8, %p9;
@%p10 bra $L__BB7_8;
setp.gt.u32 %p11, %r9, 1;
mov.u32 %r29, 2;
sub.s32 %r30, %r29, %r2;
selp.b32 %r32, %r10, %r30, %p11;
mul.wide.u32 %rd19, %r32, 4;
add.s64 %rd20, %rd1, %rd19;
ld.global.f32 %f19, [%rd3];
ld.global.f32 %f20, [%rd20];
add.rn.f32 %f21, %f20, %f19;
mov.f32 %f22, 0f3F000000;
mov.f32 %f23, 0f3E800000;
fma.rn.f32 %f24, %f21, %f23, %f22;
cvt.rmi.f32.f32 %f25, %f24;
ld.global.f32 %f26, [%rd2];
sub.rn.f32 %f27, %f26, %f25;
st.global.f32 [%rd2], %f27;
// Pass B, left edge: row[0] belongs to this pass when x0 is odd, so the
// step is skipped when %p5 (x0 even) holds.
$L__BB7_8:
@%p5 bra $L__BB7_10;
setp.gt.u32 %p13, %r2, 1;
add.s32 %r33, %r2, %r2;
add.s32 %r34, %r33, -3;
selp.b32 %r35, 1, %r34, %p13;
mul.wide.u32 %rd21, %r35, 4;
add.s64 %rd22, %rd1, %rd21;
ld.global.f32 %f28, [%rd22];
ld.global.f32 %f29, [%rd1+4];
add.rn.f32 %f30, %f29, %f28;
mul.rn.f32 %f31, %f30, 0f3F000000;
cvt.rmi.f32.f32 %f32, %f31;
ld.global.f32 %f33, [%rd1];
add.rn.f32 %f34, %f33, %f32;
st.global.f32 [%rd1], %f34;
// Pass B, interior: %r42 starts at 3 (x0 odd) or 2 (x0 even).
$L__BB7_10:
selp.b32 %r42, 3, 2, %p3;
setp.ge.u32 %p15, %r42, %r2;
@%p15 bra $L__BB7_12;
$L__BB7_11:
add.s32 %r36, %r42, -1;
mul.wide.u32 %rd23, %r36, 4;
add.s64 %rd24, %rd1, %rd23;
add.s32 %r37, %r42, -2;
mul.wide.u32 %rd25, %r37, 4;
add.s64 %rd26, %rd1, %rd25;
mul.wide.u32 %rd27, %r42, 4;
add.s64 %rd28, %rd1, %rd27;
ld.global.f32 %f35, [%rd28];
ld.global.f32 %f36, [%rd26];
add.rn.f32 %f37, %f36, %f35;
mul.rn.f32 %f38, %f37, 0f3F000000;
cvt.rmi.f32.f32 %f39, %f38;
ld.global.f32 %f40, [%rd24];
add.rn.f32 %f41, %f40, %f39;
st.global.f32 [%rd24], %f41;
add.s32 %r42, %r42, 2;
setp.lt.u32 %p16, %r42, %r2;
@%p16 bra $L__BB7_11;
// Pass B, right edge: taken when (width & 1) == (x0 & 1), i.e. the last
// sample has the opposite parity to x0.
$L__BB7_12:
setp.ne.s32 %p18, %r8, %r4;
or.pred %p19, %p8, %p18;
@%p19 bra $L__BB7_16;
setp.gt.u32 %p20, %r9, 1;
mov.u32 %r38, 2;
sub.s32 %r39, %r38, %r2;
selp.b32 %r40, %r10, %r39, %p20;
mul.wide.u32 %rd29, %r40, 4;
add.s64 %rd30, %rd1, %rd29;
ld.global.f32 %f42, [%rd3];
ld.global.f32 %f43, [%rd30];
add.rn.f32 %f44, %f43, %f42;
mul.rn.f32 %f45, %f44, 0f3F000000;
cvt.rmi.f32.f32 %f46, %f45;
ld.global.f32 %f47, [%rd2];
add.rn.f32 %f48, %f47, %f46;
st.global.f32 [%rd2], %f48;
$L__BB7_16:
ret;
}
// .globl j2k_idwt_horizontal_97
// ---------------------------------------------------------------------------
// j2k_idwt_horizontal_97
//
// In-place horizontal inverse lifting in floating point, one row per thread
// (row = %ctaid.x * %ntid.x + %tid.x).
//
// param_0: global pointer to f32 samples, row-major, `width` floats per row.
// param_1: global pointer to a descriptor; u32 fields read here:
//            +0 x0, +4 y0, +8 x1, +12 y1  (width = x1 - x0, rows = y1 - y0)
// NOTE(review): field names are inferred from use; confirm against the
//               host-side descriptor.
//
// Sequence per row: one scaling pass, then four lifting passes of the form
//   row[i] += c * (left + right)
// alternating between samples with the parity of x0 (passes 1 and 3) and
// the opposite parity (passes 2 and 4). Neighbours missing at either end
// of the row are mirrored back inside it.
//   pass 1: c = 0fBEE31355 ~= -0.4435069
//   pass 2: c = 0fBF620676 ~= -0.8829110
//   pass 3: c = 0f3D5901AE ~= +0.0529801
//   pass 4: c = 0f3FCB0673 ~= +1.5861343
//
// Fix relative to the generated code: rows of width 0 now return
// immediately. Previously width 0 only skipped the scaling loop and then
// reached the boundary steps, which form the mirror index 2*width-3
// (0xFFFFFFFD after wrap-around), load from that offset and store to row[0]
// of an empty row. One extra predicate (%p43) is declared for the guard;
// behaviour for width >= 1 is unchanged.
// ---------------------------------------------------------------------------
.visible .entry j2k_idwt_horizontal_97(
.param .u64 j2k_idwt_horizontal_97_param_0,
.param .u64 j2k_idwt_horizontal_97_param_1
)
{
.reg .pred %p<44>;
.reg .f32 %f<83>;
.reg .b32 %r<72>;
.reg .b64 %rd<53>;
ld.param.u64 %rd4, [j2k_idwt_horizontal_97_param_0];
ld.param.u64 %rd5, [j2k_idwt_horizontal_97_param_1];
cvta.to.global.u64 %rd6, %rd5;
// %r1 = x0, %r2 = width, %r24 = row count, %r3 = this thread's row.
ld.global.u32 %r21, [%rd6+8];
ld.global.u32 %r1, [%rd6];
sub.s32 %r2, %r21, %r1;
ld.global.u32 %r22, [%rd6+12];
ld.global.u32 %r23, [%rd6+4];
sub.s32 %r24, %r22, %r23;
mov.u32 %r25, %ntid.x;
mov.u32 %r26, %ctaid.x;
mov.u32 %r27, %tid.x;
mad.lo.s32 %r3, %r26, %r25, %r27;
setp.ge.u32 %p4, %r3, %r24;
@%p4 bra $L__BB8_35;
// Empty row: nothing to transform, and the mirror-index arithmetic below
// is only valid for width >= 2.
setp.eq.s32 %p43, %r2, 0;
@%p43 bra $L__BB8_35;
// %rd1 = &data[row * width] (32-bit product, zero-extended);
// %r4 = parity of x0.
cvta.to.global.u64 %rd7, %rd4;
mul.lo.s32 %r28, %r2, %r3;
mul.wide.u32 %rd8, %r28, 4;
add.s64 %rd1, %rd7, %rd8;
and.b32 %r4, %r1, 1;
setp.eq.s32 %p5, %r2, 1;
@%p5 bra $L__BB8_33;
bra.uni $L__BB8_2;
// Width == 1: the single sample is halved when x0 is odd, otherwise left
// untouched.
$L__BB8_33:
setp.eq.s32 %p42, %r4, 0;
@%p42 bra $L__BB8_35;
ld.global.f32 %f81, [%rd1];
mul.rn.f32 %f82, %f81, 0f3F000000;
st.global.f32 [%rd1], %f82;
bra.uni $L__BB8_35;
// Scaling pass. 0f3F9D7658 ~= 1.2301741, 0f3F5019C3 ~= 0.8128931. %f1
// scales even local indices, %f2 odd ones; which constant is which depends
// on the parity of x0. The width == 0 test below can no longer fire after
// the guard above; it is kept so the rest of the body stays as generated.
$L__BB8_2:
xor.b32 %r5, %r4, 1;
setp.eq.s32 %p1, %r4, 0;
selp.f32 %f1, 0f3F9D7658, 0f3F5019C3, %p1;
setp.eq.s32 %p6, %r2, 0;
@%p6 bra $L__BB8_5;
selp.f32 %f2, 0f3F5019C3, 0f3F9D7658, %p1;
mov.u32 %r67, 1;
// Handles index pairs (i-1, i) for i = 1, 3, 5, ... while i < width.
$L__BB8_4:
add.s32 %r30, %r67, -1;
mul.wide.u32 %rd9, %r30, 4;
add.s64 %rd10, %rd1, %rd9;
ld.global.f32 %f3, [%rd10];
mul.rn.f32 %f4, %f1, %f3;
st.global.f32 [%rd10], %f4;
ld.global.f32 %f5, [%rd10+4];
mul.rn.f32 %f6, %f2, %f5;
st.global.f32 [%rd10+4], %f6;
add.s32 %r67, %r67, 2;
setp.lt.u32 %p7, %r67, %r2;
@%p7 bra $L__BB8_4;
// Odd width: the last sample (%rd2 = &row[width-1]) was not covered by the
// pair loop.
$L__BB8_5:
and.b32 %r31, %r2, 1;
setp.eq.b32 %p8, %r31, 1;
mov.pred %p9, 0;
xor.pred %p10, %p8, %p9;
not.pred %p11, %p10;
add.s32 %r8, %r2, -1;
mul.wide.u32 %rd11, %r8, 4;
add.s64 %rd2, %rd1, %rd11;
@%p11 bra $L__BB8_7;
ld.global.f32 %f7, [%rd2];
mul.rn.f32 %f8, %f1, %f7;
st.global.f32 [%rd2], %f8;
// Pass 1, left edge: row[0] belongs to the pass when x0 is even; its left
// neighbour is mirrored to index 1.
$L__BB8_7:
setp.ne.s32 %p12, %r4, 0;
@%p12 bra $L__BB8_9;
setp.gt.u32 %p13, %r2, 1;
add.s32 %r32, %r2, %r2;
add.s32 %r33, %r32, -3;
selp.b32 %r34, 1, %r33, %p13;
mul.wide.u32 %rd12, %r34, 4;
add.s64 %rd13, %rd1, %rd12;
ld.global.f32 %f9, [%rd13];
ld.global.f32 %f10, [%rd1+4];
add.rn.f32 %f11, %f10, %f9;
ld.global.f32 %f12, [%rd1];
mov.f32 %f13, 0fBEE31355;
fma.rn.f32 %f14, %f11, %f13, %f12;
st.global.f32 [%rd1], %f14;
// Pass 1, interior. %r70 (loop start: 3 for even x0, 2 for odd) and %p15
// (loop empty) are reused by pass 3.
$L__BB8_9:
selp.b32 %r70, 3, 2, %p1;
setp.ge.u32 %p15, %r70, %r2;
@%p15 bra $L__BB8_12;
mov.u32 %r68, %r70;
$L__BB8_11:
add.s32 %r35, %r68, -2;
mul.wide.u32 %rd14, %r35, 4;
add.s64 %rd15, %rd1, %rd14;
mul.wide.u32 %rd16, %r68, 4;
add.s64 %rd17, %rd1, %rd16;
ld.global.f32 %f15, [%rd17];
ld.global.f32 %f16, [%rd15];
add.rn.f32 %f17, %f16, %f15;
add.s32 %r36, %r68, -1;
mul.wide.u32 %rd18, %r36, 4;
add.s64 %rd19, %rd1, %rd18;
ld.global.f32 %f18, [%rd19];
mov.f32 %f19, 0fBEE31355;
fma.rn.f32 %f20, %f17, %f19, %f18;
st.global.f32 [%rd19], %f20;
add.s32 %r68, %r68, 2;
setp.lt.u32 %p16, %r68, %r2;
@%p16 bra $L__BB8_11;
// Pass 1, right edge. %p2 = (width > 1) && parity(width-1) == parity(x0);
// %rd3 = &row[width-2] is the mirrored right neighbour. %p19 (= !%p2) is
// reused by pass 3.
$L__BB8_12:
setp.gt.u32 %p17, %r2, 1;
and.b32 %r12, %r8, 1;
setp.eq.s32 %p18, %r12, %r4;
and.pred %p2, %p17, %p18;
add.s32 %r13, %r2, -2;
mul.wide.u32 %rd20, %r13, 4;
add.s64 %rd3, %rd1, %rd20;
not.pred %p19, %p2;
@%p19 bra $L__BB8_14;
setp.gt.u32 %p20, %r8, 1;
mov.u32 %r37, 2;
sub.s32 %r38, %r37, %r2;
selp.b32 %r40, %r13, %r38, %p20;
mul.wide.u32 %rd21, %r40, 4;
add.s64 %rd22, %rd1, %rd21;
ld.global.f32 %f21, [%rd3];
ld.global.f32 %f22, [%rd22];
add.rn.f32 %f23, %f22, %f21;
ld.global.f32 %f24, [%rd2];
mov.f32 %f25, 0fBEE31355;
fma.rn.f32 %f26, %f23, %f25, %f24;
st.global.f32 [%rd2], %f26;
// Pass 2, left edge: applies when x0 is odd (%r5 == 0).
$L__BB8_14:
setp.ne.s32 %p21, %r5, 0;
@%p21 bra $L__BB8_16;
add.s32 %r41, %r2, %r2;
add.s32 %r42, %r41, -3;
selp.b32 %r43, 1, %r42, %p17;
mul.wide.u32 %rd23, %r43, 4;
add.s64 %rd24, %rd1, %rd23;
ld.global.f32 %f27, [%rd24];
ld.global.f32 %f28, [%rd1+4];
add.rn.f32 %f29, %f28, %f27;
ld.global.f32 %f30, [%rd1];
mov.f32 %f31, 0fBF620676;
fma.rn.f32 %f32, %f29, %f31, %f30;
st.global.f32 [%rd1], %f32;
// Pass 2, interior. %r71 / %p24 are reused by pass 4.
$L__BB8_16:
setp.eq.s32 %p23, %r5, 0;
selp.b32 %r71, 3, 2, %p23;
setp.ge.u32 %p24, %r71, %r2;
@%p24 bra $L__BB8_19;
mov.u32 %r69, %r71;
$L__BB8_18:
add.s32 %r44, %r69, -2;
mul.wide.u32 %rd25, %r44, 4;
add.s64 %rd26, %rd1, %rd25;
mul.wide.u32 %rd27, %r69, 4;
add.s64 %rd28, %rd1, %rd27;
ld.global.f32 %f33, [%rd28];
ld.global.f32 %f34, [%rd26];
add.rn.f32 %f35, %f34, %f33;
add.s32 %r45, %r69, -1;
mul.wide.u32 %rd29, %r45, 4;
add.s64 %rd30, %rd1, %rd29;
ld.global.f32 %f36, [%rd30];
mov.f32 %f37, 0fBF620676;
fma.rn.f32 %f38, %f35, %f37, %f36;
st.global.f32 [%rd30], %f38;
add.s32 %r69, %r69, 2;
setp.lt.u32 %p25, %r69, %r2;
@%p25 bra $L__BB8_18;
// Pass 2, right edge. %p3 = (width > 1) && parity(width-1) != parity(x0);
// %p28 (= !%p3) is reused by pass 4.
$L__BB8_19:
setp.eq.s32 %p27, %r12, %r5;
and.pred %p3, %p17, %p27;
not.pred %p28, %p3;
@%p28 bra $L__BB8_21;
setp.gt.u32 %p29, %r8, 1;
mov.u32 %r46, 2;
sub.s32 %r47, %r46, %r2;
selp.b32 %r49, %r13, %r47, %p29;
mul.wide.u32 %rd31, %r49, 4;
add.s64 %rd32, %rd1, %rd31;
ld.global.f32 %f39, [%rd3];
ld.global.f32 %f40, [%rd32];
add.rn.f32 %f41, %f40, %f39;
ld.global.f32 %f42, [%rd2];
mov.f32 %f43, 0fBF620676;
fma.rn.f32 %f44, %f41, %f43, %f42;
st.global.f32 [%rd2], %f44;
// Pass 3: same samples as pass 1, left edge first.
$L__BB8_21:
@%p12 bra $L__BB8_23;
add.s32 %r50, %r2, %r2;
add.s32 %r51, %r50, -3;
selp.b32 %r52, 1, %r51, %p17;
mul.wide.u32 %rd33, %r52, 4;
add.s64 %rd34, %rd1, %rd33;
ld.global.f32 %f45, [%rd34];
ld.global.f32 %f46, [%rd1+4];
add.rn.f32 %f47, %f46, %f45;
ld.global.f32 %f48, [%rd1];
mov.f32 %f49, 0f3D5901AE;
fma.rn.f32 %f50, %f47, %f49, %f48;
st.global.f32 [%rd1], %f50;
// Pass 3, interior: iterates on %r70 itself (no copy needed any more).
$L__BB8_23:
@%p15 bra $L__BB8_25;
$L__BB8_24:
add.s32 %r53, %r70, -2;
mul.wide.u32 %rd35, %r53, 4;
add.s64 %rd36, %rd1, %rd35;
mul.wide.u32 %rd37, %r70, 4;
add.s64 %rd38, %rd1, %rd37;
ld.global.f32 %f51, [%rd38];
ld.global.f32 %f52, [%rd36];
add.rn.f32 %f53, %f52, %f51;
add.s32 %r54, %r70, -1;
mul.wide.u32 %rd39, %r54, 4;
add.s64 %rd40, %rd1, %rd39;
ld.global.f32 %f54, [%rd40];
mov.f32 %f55, 0f3D5901AE;
fma.rn.f32 %f56, %f53, %f55, %f54;
st.global.f32 [%rd40], %f56;
add.s32 %r70, %r70, 2;
setp.lt.u32 %p33, %r70, %r2;
@%p33 bra $L__BB8_24;
// Pass 3, right edge.
$L__BB8_25:
@%p19 bra $L__BB8_27;
setp.gt.u32 %p35, %r8, 1;
mov.u32 %r55, 2;
sub.s32 %r56, %r55, %r2;
selp.b32 %r58, %r13, %r56, %p35;
mul.wide.u32 %rd41, %r58, 4;
add.s64 %rd42, %rd1, %rd41;
ld.global.f32 %f57, [%rd3];
ld.global.f32 %f58, [%rd42];
add.rn.f32 %f59, %f58, %f57;
ld.global.f32 %f60, [%rd2];
mov.f32 %f61, 0f3D5901AE;
fma.rn.f32 %f62, %f59, %f61, %f60;
st.global.f32 [%rd2], %f62;
// Pass 4: same samples as pass 2, left edge first.
$L__BB8_27:
@%p21 bra $L__BB8_29;
add.s32 %r59, %r2, %r2;
add.s32 %r60, %r59, -3;
selp.b32 %r61, 1, %r60, %p17;
mul.wide.u32 %rd43, %r61, 4;
add.s64 %rd44, %rd1, %rd43;
ld.global.f32 %f63, [%rd44];
ld.global.f32 %f64, [%rd1+4];
add.rn.f32 %f65, %f64, %f63;
ld.global.f32 %f66, [%rd1];
mov.f32 %f67, 0f3FCB0673;
fma.rn.f32 %f68, %f65, %f67, %f66;
st.global.f32 [%rd1], %f68;
// Pass 4, interior: iterates on %r71 itself.
$L__BB8_29:
@%p24 bra $L__BB8_31;
$L__BB8_30:
add.s32 %r62, %r71, -2;
mul.wide.u32 %rd45, %r62, 4;
add.s64 %rd46, %rd1, %rd45;
mul.wide.u32 %rd47, %r71, 4;
add.s64 %rd48, %rd1, %rd47;
ld.global.f32 %f69, [%rd48];
ld.global.f32 %f70, [%rd46];
add.rn.f32 %f71, %f70, %f69;
add.s32 %r63, %r71, -1;
mul.wide.u32 %rd49, %r63, 4;
add.s64 %rd50, %rd1, %rd49;
ld.global.f32 %f72, [%rd50];
mov.f32 %f73, 0f3FCB0673;
fma.rn.f32 %f74, %f71, %f73, %f72;
st.global.f32 [%rd50], %f74;
add.s32 %r71, %r71, 2;
setp.lt.u32 %p39, %r71, %r2;
@%p39 bra $L__BB8_30;
// Pass 4, right edge.
$L__BB8_31:
@%p28 bra $L__BB8_35;
setp.gt.u32 %p41, %r8, 1;
mov.u32 %r64, 2;
sub.s32 %r65, %r64, %r2;
selp.b32 %r66, %r13, %r65, %p41;
mul.wide.u32 %rd51, %r66, 4;
add.s64 %rd52, %rd1, %rd51;
ld.global.f32 %f75, [%rd3];
ld.global.f32 %f76, [%rd52];
add.rn.f32 %f77, %f76, %f75;
ld.global.f32 %f78, [%rd2];
mov.f32 %f79, 0f3FCB0673;
fma.rn.f32 %f80, %f77, %f79, %f78;
st.global.f32 [%rd2], %f80;
$L__BB8_35:
ret;
}
// .globl j2k_idwt_horizontal_multi
// j2k_idwt_horizontal_multi -- in-place inverse lifting along rows, one thread per row.
//
// param_0 : pointer to an array of 128-byte job records; %ctaid.y selects the record.
//           Fields read (byte offsets from the start of the record):
//             +32  u64       pointer to the f32 samples (accessed with generic ld/st)
//             +40  u32 pair  {%r34, %r35}  (presumably x0, y0 -- TODO confirm with host struct)
//             +48  u32 pair  {%r30, %r31}  (presumably x1, y1 -- TODO confirm with host struct)
//             +120 u32       mode flag
// Row length n = %r30 - %r34; row count = %r31 - %r35. Thread ctaid.x*ntid.x + tid.x
// owns the row with that index, stored contiguously at element offset n * row.
// flag != 0 : float lifting with the same constants and pass order as the kernel
//             j2k_idwt_vertical_97 in this file.
// flag == 0 : floor-based lifting with the same arithmetic as j2k_idwt_vertical_53.
// Neighbours that fall outside the row are replaced by mirrored in-row indices.
//
// Hand-added relative to the compiler output: an early exit for n == 0 (see below).
// NOTE(review): this file is generated; the same guard belongs in the CUDA source.
.visible .entry j2k_idwt_horizontal_multi(
.param .u64 j2k_idwt_horizontal_multi_param_0
)
{
.reg .pred %p<62>;
.reg .f32 %f<131>;
.reg .b32 %r<108>;
.reg .b64 %rd<75>;
// record = param_0 + ctaid.y * 128
ld.param.u64 %rd5, [j2k_idwt_horizontal_multi_param_0];
cvta.to.global.u64 %rd6, %rd5;
mov.u32 %r29, %ctaid.y;
mul.wide.u32 %rd7, %r29, 128;
add.s64 %rd8, %rd6, %rd7;
// %rd1 = &record[+120] (mode flag); the data pointer is later read as [%rd1-88] = record+32.
add.s64 %rd1, %rd8, 120;
ld.global.v2.u32 {%r30, %r31}, [%rd8+48];
ld.global.v2.u32 {%r34, %r35}, [%rd8+40];
// %r2 = n (row length), %r37 = number of rows.
sub.s32 %r2, %r30, %r34;
sub.s32 %r37, %r31, %r35;
// %r3 = global thread index along x = row handled by this thread.
mov.u32 %r38, %ntid.x;
mov.u32 %r39, %ctaid.x;
mov.u32 %r40, %tid.x;
mad.lo.s32 %r3, %r39, %r38, %r40;
// Threads beyond the last row do nothing.
setp.ge.u32 %p4, %r3, %r37;
@%p4 bra $L__BB9_48;
// Guard: an empty row has nothing to transform. Without this exit the boundary
// blocks below select index 2n-3 (wraps to 0xFFFFFFFD for n == 0) and also store
// to x[0], i.e. they touch memory outside the (empty) row.
setp.eq.s32 %p0, %r2, 0;
@%p0 bra $L__BB9_48;
// %rd2 = &x[0] for this row = data pointer + 4 * (n * row)  (32-bit product).
mul.lo.s32 %r41, %r2, %r3;
ld.global.u64 %rd9, [%rd1+-88];
mul.wide.u32 %rd10, %r41, 4;
add.s64 %rd2, %rd9, %rd10;
// %r4 = parity of the first u32 at record+40.
and.b32 %r4, %r34, 1;
// Single-sample rows are handled separately.
setp.eq.s32 %p5, %r2, 1;
@%p5 bra $L__BB9_46;
bra.uni $L__BB9_2;
$L__BB9_46:
// n == 1: leave the sample untouched for parity 0, halve it for parity 1.
setp.eq.s32 %p61, %r4, 0;
@%p61 bra $L__BB9_48;
ld.f32 %f129, [%rd2];
mul.rn.f32 %f130, %f129, 0f3F000000;
st.f32 [%rd2], %f130;
bra.uni $L__BB9_48;
$L__BB9_2:
// Common setup: %p6 = (flag == 0), %r5 = parity ^ 1,
// %r6 = n-1, %rd3 = &x[n-1], %r7 = n-2, %rd4 = &x[n-2].
ld.global.u32 %r42, [%rd1];
setp.eq.s32 %p6, %r42, 0;
xor.b32 %r5, %r4, 1;
add.s32 %r6, %r2, -1;
mul.wide.u32 %rd11, %r6, 4;
add.s64 %rd3, %rd2, %rd11;
add.s32 %r7, %r2, -2;
mul.wide.u32 %rd12, %r7, 4;
add.s64 %rd4, %rd2, %rd12;
// flag == 0 -> floor-based path at $L__BB9_34.
@%p6 bra $L__BB9_34;
// ---- Float path ----
// Scaling factors: 0f3F9D7658 ~= 1.230174, 0f3F5019C3 ~= 0.812893.
// %f1 multiplies even indices, %f2 odd indices; the two swap when parity == 1.
setp.eq.s32 %p1, %r4, 0;
selp.f32 %f1, 0f3F9D7658, 0f3F5019C3, %p1;
setp.lt.u32 %p7, %r2, 2;
@%p7 bra $L__BB9_6;
selp.f32 %f2, 0f3F5019C3, 0f3F9D7658, %p1;
mov.u32 %r101, 1;
$L__BB9_5:
// k = %r101 (1, 3, 5, ...): x[k-1] *= %f1; x[k] *= %f2; while k < n.
add.s32 %r44, %r101, -1;
mul.wide.u32 %rd13, %r44, 4;
add.s64 %rd14, %rd2, %rd13;
ld.f32 %f3, [%rd14];
mul.rn.f32 %f4, %f1, %f3;
st.f32 [%rd14], %f4;
ld.f32 %f5, [%rd14+4];
mul.rn.f32 %f6, %f2, %f5;
st.f32 [%rd14+4], %f6;
add.s32 %r101, %r101, 2;
setp.lt.u32 %p8, %r101, %r2;
@%p8 bra $L__BB9_5;
$L__BB9_6:
// Odd n: the last sample x[n-1] has an even index and was not covered by the pair loop.
and.b32 %r45, %r2, 1;
setp.eq.b32 %p9, %r45, 1;
mov.pred %p10, 0;
xor.pred %p11, %p9, %p10;
not.pred %p12, %p11;
@%p12 bra $L__BB9_8;
ld.f32 %f7, [%rd3];
mul.rn.f32 %f8, %f1, %f7;
st.f32 [%rd3], %f8;
$L__BB9_8:
// Pass 1 (coefficient 0fBEE31355 ~= -0.443507): updates indices i with (i & 1) == parity,
// x[i] += c * (x[left] + x[right]).
// First sample (only when parity == 0): neighbours are x[1] and x[n > 1 ? 1 : 2n-3].
setp.ne.s32 %p13, %r4, 0;
@%p13 bra $L__BB9_10;
setp.gt.u32 %p14, %r2, 1;
add.s32 %r46, %r2, %r2;
add.s32 %r47, %r46, -3;
selp.b32 %r48, 1, %r47, %p14;
mul.wide.u32 %rd15, %r48, 4;
add.s64 %rd16, %rd2, %rd15;
ld.f32 %f9, [%rd16];
ld.f32 %f10, [%rd2+4];
add.rn.f32 %f11, %f10, %f9;
ld.f32 %f12, [%rd2];
mov.f32 %f13, 0fBEE31355;
fma.rn.f32 %f14, %f11, %f13, %f12;
st.f32 [%rd2], %f14;
$L__BB9_10:
// Interior loop start k = %r104: 3 when parity == 0, else 2. %p16 (k >= n) is reused by pass 3.
selp.b32 %r104, 3, 2, %p1;
setp.ge.u32 %p16, %r104, %r2;
@%p16 bra $L__BB9_13;
mov.u32 %r102, %r104;
$L__BB9_12:
// x[k-1] += c * (x[k-2] + x[k]); k += 2 while k < n.
add.s32 %r49, %r102, -2;
mul.wide.u32 %rd17, %r49, 4;
add.s64 %rd18, %rd2, %rd17;
mul.wide.u32 %rd19, %r102, 4;
add.s64 %rd20, %rd2, %rd19;
ld.f32 %f15, [%rd20];
ld.f32 %f16, [%rd18];
add.rn.f32 %f17, %f16, %f15;
add.s32 %r50, %r102, -1;
mul.wide.u32 %rd21, %r50, 4;
add.s64 %rd22, %rd2, %rd21;
ld.f32 %f18, [%rd22];
mov.f32 %f19, 0fBEE31355;
fma.rn.f32 %f20, %f17, %f19, %f18;
st.f32 [%rd22], %f20;
add.s32 %r102, %r102, 2;
setp.lt.u32 %p17, %r102, %r2;
@%p17 bra $L__BB9_12;
$L__BB9_13:
// Last sample x[n-1]: handled here when n > 1 and ((n-1) & 1) == parity (%p2; %p20 = !%p2).
// Neighbours: x[n-2] and x[(n-1) > 1 ? n-2 : 2-n]  (2-n is 0 when n == 2).
setp.gt.u32 %p18, %r2, 1;
and.b32 %r14, %r6, 1;
setp.eq.s32 %p19, %r14, %r4;
and.pred %p2, %p18, %p19;
not.pred %p20, %p2;
@%p20 bra $L__BB9_15;
setp.gt.u32 %p21, %r6, 1;
mov.u32 %r51, 2;
sub.s32 %r52, %r51, %r2;
selp.b32 %r54, %r7, %r52, %p21;
mul.wide.u32 %rd23, %r54, 4;
add.s64 %rd24, %rd2, %rd23;
ld.f32 %f21, [%rd4];
ld.f32 %f22, [%rd24];
add.rn.f32 %f23, %f22, %f21;
ld.f32 %f24, [%rd3];
mov.f32 %f25, 0fBEE31355;
fma.rn.f32 %f26, %f23, %f25, %f24;
st.f32 [%rd3], %f26;
$L__BB9_15:
// Pass 2 (coefficient 0fBF620676 ~= -0.882911): same shape as pass 1, but on indices
// with (i & 1) == parity ^ 1. %p22 is reused by pass 4.
setp.ne.s32 %p22, %r5, 0;
@%p22 bra $L__BB9_17;
add.s32 %r55, %r2, %r2;
add.s32 %r56, %r55, -3;
selp.b32 %r57, 1, %r56, %p18;
mul.wide.u32 %rd25, %r57, 4;
add.s64 %rd26, %rd2, %rd25;
ld.f32 %f27, [%rd26];
ld.f32 %f28, [%rd2+4];
add.rn.f32 %f29, %f28, %f27;
ld.f32 %f30, [%rd2];
mov.f32 %f31, 0fBF620676;
fma.rn.f32 %f32, %f29, %f31, %f30;
st.f32 [%rd2], %f32;
$L__BB9_17:
// Interior loop start k = %r105: 3 when (parity ^ 1) == 0, else 2. %p25 is reused by pass 4.
setp.eq.s32 %p24, %r5, 0;
selp.b32 %r105, 3, 2, %p24;
setp.ge.u32 %p25, %r105, %r2;
@%p25 bra $L__BB9_20;
mov.u32 %r103, %r105;
$L__BB9_19:
add.s32 %r58, %r103, -2;
mul.wide.u32 %rd27, %r58, 4;
add.s64 %rd28, %rd2, %rd27;
mul.wide.u32 %rd29, %r103, 4;
add.s64 %rd30, %rd2, %rd29;
ld.f32 %f33, [%rd30];
ld.f32 %f34, [%rd28];
add.rn.f32 %f35, %f34, %f33;
add.s32 %r59, %r103, -1;
mul.wide.u32 %rd31, %r59, 4;
add.s64 %rd32, %rd2, %rd31;
ld.f32 %f36, [%rd32];
mov.f32 %f37, 0fBF620676;
fma.rn.f32 %f38, %f35, %f37, %f36;
st.f32 [%rd32], %f38;
add.s32 %r103, %r103, 2;
setp.lt.u32 %p26, %r103, %r2;
@%p26 bra $L__BB9_19;
$L__BB9_20:
// Last sample for pass 2: n > 1 and ((n-1) & 1) == parity ^ 1 (%p3; %p29 = !%p3).
setp.eq.s32 %p28, %r14, %r5;
and.pred %p3, %p18, %p28;
not.pred %p29, %p3;
@%p29 bra $L__BB9_22;
setp.gt.u32 %p30, %r6, 1;
mov.u32 %r60, 2;
sub.s32 %r61, %r60, %r2;
selp.b32 %r63, %r7, %r61, %p30;
mul.wide.u32 %rd33, %r63, 4;
add.s64 %rd34, %rd2, %rd33;
ld.f32 %f39, [%rd4];
ld.f32 %f40, [%rd34];
add.rn.f32 %f41, %f40, %f39;
ld.f32 %f42, [%rd3];
mov.f32 %f43, 0fBF620676;
fma.rn.f32 %f44, %f41, %f43, %f42;
st.f32 [%rd3], %f44;
$L__BB9_22:
// Pass 3 (coefficient 0f3D5901AE ~= 0.052980): same index set and predicates as pass 1.
@%p13 bra $L__BB9_24;
add.s32 %r64, %r2, %r2;
add.s32 %r65, %r64, -3;
selp.b32 %r66, 1, %r65, %p18;
mul.wide.u32 %rd35, %r66, 4;
add.s64 %rd36, %rd2, %rd35;
ld.f32 %f45, [%rd36];
ld.f32 %f46, [%rd2+4];
add.rn.f32 %f47, %f46, %f45;
ld.f32 %f48, [%rd2];
mov.f32 %f49, 0f3D5901AE;
fma.rn.f32 %f50, %f47, %f49, %f48;
st.f32 [%rd2], %f50;
$L__BB9_24:
@%p16 bra $L__BB9_26;
$L__BB9_25:
// Here %r104 itself is the loop counter (still holds its pass-1 start value).
add.s32 %r67, %r104, -2;
mul.wide.u32 %rd37, %r67, 4;
add.s64 %rd38, %rd2, %rd37;
mul.wide.u32 %rd39, %r104, 4;
add.s64 %rd40, %rd2, %rd39;
ld.f32 %f51, [%rd40];
ld.f32 %f52, [%rd38];
add.rn.f32 %f53, %f52, %f51;
add.s32 %r68, %r104, -1;
mul.wide.u32 %rd41, %r68, 4;
add.s64 %rd42, %rd2, %rd41;
ld.f32 %f54, [%rd42];
mov.f32 %f55, 0f3D5901AE;
fma.rn.f32 %f56, %f53, %f55, %f54;
st.f32 [%rd42], %f56;
add.s32 %r104, %r104, 2;
setp.lt.u32 %p34, %r104, %r2;
@%p34 bra $L__BB9_25;
$L__BB9_26:
@%p20 bra $L__BB9_28;
setp.gt.u32 %p36, %r6, 1;
mov.u32 %r69, 2;
sub.s32 %r70, %r69, %r2;
selp.b32 %r72, %r7, %r70, %p36;
mul.wide.u32 %rd43, %r72, 4;
add.s64 %rd44, %rd2, %rd43;
ld.f32 %f57, [%rd4];
ld.f32 %f58, [%rd44];
add.rn.f32 %f59, %f58, %f57;
ld.f32 %f60, [%rd3];
mov.f32 %f61, 0f3D5901AE;
fma.rn.f32 %f62, %f59, %f61, %f60;
st.f32 [%rd3], %f62;
$L__BB9_28:
// Pass 4 (coefficient 0f3FCB0673 ~= 1.586134): same index set and predicates as pass 2.
@%p22 bra $L__BB9_30;
add.s32 %r73, %r2, %r2;
add.s32 %r74, %r73, -3;
selp.b32 %r75, 1, %r74, %p18;
mul.wide.u32 %rd45, %r75, 4;
add.s64 %rd46, %rd2, %rd45;
ld.f32 %f63, [%rd46];
ld.f32 %f64, [%rd2+4];
add.rn.f32 %f65, %f64, %f63;
ld.f32 %f66, [%rd2];
mov.f32 %f67, 0f3FCB0673;
fma.rn.f32 %f68, %f65, %f67, %f66;
st.f32 [%rd2], %f68;
$L__BB9_30:
@%p25 bra $L__BB9_32;
$L__BB9_31:
// Here %r105 itself is the loop counter (still holds its pass-2 start value).
add.s32 %r76, %r105, -2;
mul.wide.u32 %rd47, %r76, 4;
add.s64 %rd48, %rd2, %rd47;
mul.wide.u32 %rd49, %r105, 4;
add.s64 %rd50, %rd2, %rd49;
ld.f32 %f69, [%rd50];
ld.f32 %f70, [%rd48];
add.rn.f32 %f71, %f70, %f69;
add.s32 %r77, %r105, -1;
mul.wide.u32 %rd51, %r77, 4;
add.s64 %rd52, %rd2, %rd51;
ld.f32 %f72, [%rd52];
mov.f32 %f73, 0f3FCB0673;
fma.rn.f32 %f74, %f71, %f73, %f72;
st.f32 [%rd52], %f74;
add.s32 %r105, %r105, 2;
setp.lt.u32 %p40, %r105, %r2;
@%p40 bra $L__BB9_31;
$L__BB9_32:
@%p29 bra $L__BB9_48;
setp.gt.u32 %p42, %r6, 1;
mov.u32 %r78, 2;
sub.s32 %r79, %r78, %r2;
selp.b32 %r81, %r7, %r79, %p42;
mul.wide.u32 %rd53, %r81, 4;
add.s64 %rd54, %rd2, %rd53;
ld.f32 %f75, [%rd4];
ld.f32 %f76, [%rd54];
add.rn.f32 %f77, %f76, %f75;
ld.f32 %f78, [%rd3];
mov.f32 %f79, 0f3FCB0673;
fma.rn.f32 %f80, %f77, %f79, %f78;
st.f32 [%rd3], %f80;
bra.uni $L__BB9_48;
$L__BB9_34:
// ---- Floor-based path (flag == 0) ----
// Step 1 on indices with (i & 1) == parity:
//   x[i] -= floor((x[left] + x[right]) * 0.25 + 0.5)   (cvt.rmi = round toward -inf).
// First sample, only when parity == 0.
setp.ne.s32 %p43, %r4, 0;
@%p43 bra $L__BB9_36;
setp.gt.u32 %p44, %r2, 1;
add.s32 %r82, %r2, %r2;
add.s32 %r83, %r82, -3;
selp.b32 %r84, 1, %r83, %p44;
mul.wide.u32 %rd55, %r84, 4;
add.s64 %rd56, %rd2, %rd55;
ld.f32 %f81, [%rd56];
ld.f32 %f82, [%rd2+4];
add.rn.f32 %f83, %f82, %f81;
mov.f32 %f84, 0f3F000000;
mov.f32 %f85, 0f3E800000;
fma.rn.f32 %f86, %f83, %f85, %f84;
cvt.rmi.f32.f32 %f87, %f86;
ld.f32 %f88, [%rd2];
sub.rn.f32 %f89, %f88, %f87;
st.f32 [%rd2], %f89;
$L__BB9_36:
// Interior loop: k = %r106 starts at 3 (parity 0) or 2 (parity 1); updates x[k-1].
setp.eq.s32 %p45, %r4, 0;
selp.b32 %r106, 3, 2, %p45;
setp.ge.u32 %p46, %r106, %r2;
@%p46 bra $L__BB9_38;
$L__BB9_37:
add.s32 %r85, %r106, -1;
mul.wide.u32 %rd57, %r85, 4;
add.s64 %rd58, %rd2, %rd57;
add.s32 %r86, %r106, -2;
mul.wide.u32 %rd59, %r86, 4;
add.s64 %rd60, %rd2, %rd59;
mul.wide.u32 %rd61, %r106, 4;
add.s64 %rd62, %rd2, %rd61;
ld.f32 %f90, [%rd62];
ld.f32 %f91, [%rd60];
add.rn.f32 %f92, %f91, %f90;
mov.f32 %f93, 0f3F000000;
mov.f32 %f94, 0f3E800000;
fma.rn.f32 %f95, %f92, %f94, %f93;
cvt.rmi.f32.f32 %f96, %f95;
ld.f32 %f97, [%rd58];
sub.rn.f32 %f98, %f97, %f96;
st.f32 [%rd58], %f98;
add.s32 %r106, %r106, 2;
setp.lt.u32 %p47, %r106, %r2;
@%p47 bra $L__BB9_37;
$L__BB9_38:
// Last sample for step 1: skipped when n < 2 or ((n & 1) ^ 1) != parity,
// i.e. when x[n-1] is not in this step's index set.
setp.lt.u32 %p48, %r2, 2;
and.b32 %r25, %r2, 1;
xor.b32 %r87, %r25, 1;
setp.ne.s32 %p49, %r87, %r4;
or.pred %p50, %p48, %p49;
@%p50 bra $L__BB9_40;
setp.gt.u32 %p51, %r6, 1;
mov.u32 %r89, 2;
sub.s32 %r90, %r89, %r2;
selp.b32 %r92, %r7, %r90, %p51;
mul.wide.u32 %rd63, %r92, 4;
add.s64 %rd64, %rd2, %rd63;
ld.f32 %f99, [%rd4];
ld.f32 %f100, [%rd64];
add.rn.f32 %f101, %f100, %f99;
mov.f32 %f102, 0f3F000000;
mov.f32 %f103, 0f3E800000;
fma.rn.f32 %f104, %f101, %f103, %f102;
cvt.rmi.f32.f32 %f105, %f104;
ld.f32 %f106, [%rd3];
sub.rn.f32 %f107, %f106, %f105;
st.f32 [%rd3], %f107;
$L__BB9_40:
// Step 2 on indices with (i & 1) == parity ^ 1:
//   x[i] += floor((x[left] + x[right]) * 0.5).
// First sample, only when (parity ^ 1) == 0.
setp.ne.s32 %p52, %r5, 0;
@%p52 bra $L__BB9_42;
setp.gt.u32 %p53, %r2, 1;
add.s32 %r93, %r2, %r2;
add.s32 %r94, %r93, -3;
selp.b32 %r95, 1, %r94, %p53;
mul.wide.u32 %rd65, %r95, 4;
add.s64 %rd66, %rd2, %rd65;
ld.f32 %f108, [%rd66];
ld.f32 %f109, [%rd2+4];
add.rn.f32 %f110, %f109, %f108;
mul.rn.f32 %f111, %f110, 0f3F000000;
cvt.rmi.f32.f32 %f112, %f111;
ld.f32 %f113, [%rd2];
add.rn.f32 %f114, %f113, %f112;
st.f32 [%rd2], %f114;
$L__BB9_42:
setp.eq.s32 %p54, %r5, 0;
selp.b32 %r107, 3, 2, %p54;
setp.ge.u32 %p55, %r107, %r2;
@%p55 bra $L__BB9_44;
$L__BB9_43:
add.s32 %r96, %r107, -1;
mul.wide.u32 %rd67, %r96, 4;
add.s64 %rd68, %rd2, %rd67;
add.s32 %r97, %r107, -2;
mul.wide.u32 %rd69, %r97, 4;
add.s64 %rd70, %rd2, %rd69;
mul.wide.u32 %rd71, %r107, 4;
add.s64 %rd72, %rd2, %rd71;
ld.f32 %f115, [%rd72];
ld.f32 %f116, [%rd70];
add.rn.f32 %f117, %f116, %f115;
mul.rn.f32 %f118, %f117, 0f3F000000;
cvt.rmi.f32.f32 %f119, %f118;
ld.f32 %f120, [%rd68];
add.rn.f32 %f121, %f120, %f119;
st.f32 [%rd68], %f121;
add.s32 %r107, %r107, 2;
setp.lt.u32 %p56, %r107, %r2;
@%p56 bra $L__BB9_43;
$L__BB9_44:
// Last sample for step 2: skipped when n < 2 or (n & 1) != parity.
setp.ne.s32 %p58, %r25, %r4;
or.pred %p59, %p48, %p58;
@%p59 bra $L__BB9_48;
setp.gt.u32 %p60, %r6, 1;
mov.u32 %r98, 2;
sub.s32 %r99, %r98, %r2;
selp.b32 %r100, %r7, %r99, %p60;
mul.wide.u32 %rd73, %r100, 4;
add.s64 %rd74, %rd2, %rd73;
ld.f32 %f122, [%rd4];
ld.f32 %f123, [%rd74];
add.rn.f32 %f124, %f123, %f122;
mul.rn.f32 %f125, %f124, 0f3F000000;
cvt.rmi.f32.f32 %f126, %f125;
ld.f32 %f127, [%rd3];
add.rn.f32 %f128, %f127, %f126;
st.f32 [%rd3], %f128;
$L__BB9_48:
ret;
}
// .globl j2k_idwt_vertical
// j2k_idwt_vertical -- in-place inverse lifting along columns, one thread per column.
//
// param_0 : global pointer to f32 samples; element (row i, column c) is at index i*W + c.
// param_1 : global pointer to a u32 descriptor. Fields read (byte offsets):
//             +0, +4, +8, +12 (presumably x0, y0, x1, y1 -- TODO confirm) and +80 (mode flag).
//           W = [+8] - [+0] (number of columns), H = [+12] - [+4] (column length),
//           parity = [+4] & 1.
// Thread ctaid.x*ntid.x + tid.x owns column c; threads with c >= W exit.
// flag != 0 : float lifting (same constants as j2k_idwt_vertical_97 in this file).
// flag == 0 : floor-based lifting (same arithmetic as j2k_idwt_vertical_53 in this file).
// Every lifting loop walks i = start, start+2, ... < H and uses neighbours
//   left  = (i > 1)     ? i-1 : 1-i
//   right = (i+1 < H)   ? i+1 : 2H-3-i
// All element indices are computed in 32 bits before being widened to byte offsets.
.visible .entry j2k_idwt_vertical(
.param .u64 j2k_idwt_vertical_param_0,
.param .u64 j2k_idwt_vertical_param_1
)
{
.reg .pred %p<36>;
.reg .f32 %f<51>;
.reg .b32 %r<208>;
.reg .b64 %rd<49>;
ld.param.u64 %rd3, [j2k_idwt_vertical_param_0];
ld.param.u64 %rd4, [j2k_idwt_vertical_param_1];
cvta.to.global.u64 %rd1, %rd3;
cvta.to.global.u64 %rd2, %rd4;
// %r1 = desc[+4], %r2 = desc[+8], %r3 = desc[+0]; %r4 = W.
ld.global.u32 %r1, [%rd2+4];
ld.global.u32 %r2, [%rd2+8];
ld.global.u32 %r3, [%rd2];
sub.s32 %r4, %r2, %r3;
// %r7 = column index c = ctaid.x * ntid.x + tid.x.
mov.u32 %r92, %ntid.x;
mov.u32 %r93, %ctaid.x;
mul.lo.s32 %r5, %r93, %r92;
mov.u32 %r6, %tid.x;
add.s32 %r7, %r5, %r6;
setp.ge.u32 %p2, %r7, %r4;
@%p2 bra $L__BB10_28;
// %r8 = desc[+12]; %r9 = H; %r195 = parity (later reused as a loop counter).
ld.global.u32 %r8, [%rd2+12];
sub.s32 %r9, %r8, %r1;
setp.eq.s32 %p3, %r9, 1;
and.b32 %r195, %r1, 1;
@%p3 bra $L__BB10_26;
bra.uni $L__BB10_2;
$L__BB10_26:
// H == 1: leave the sample untouched for parity 0, halve it for parity 1.
setp.eq.s32 %p35, %r195, 0;
@%p35 bra $L__BB10_28;
mul.wide.u32 %rd47, %r7, 4;
add.s64 %rd48, %rd1, %rd47;
ld.global.f32 %f49, [%rd48];
mul.rn.f32 %f50, %f49, 0f3F000000;
st.global.f32 [%rd48], %f50;
bra.uni $L__BB10_28;
$L__BB10_2:
// %p4 = (flag == 0) selects the floor-based path; %r199 = parity ^ 1.
ld.global.u32 %r94, [%rd2+80];
setp.eq.s32 %p4, %r94, 0;
xor.b32 %r199, %r195, 1;
@%p4 bra $L__BB10_20;
// ---- Float path ----
// Scaling factors: 0f3F9D7658 ~= 1.230174, 0f3F5019C3 ~= 0.812893.
// %f1 multiplies even rows, %f2 odd rows; the two swap when parity == 1.
setp.eq.s32 %p1, %r195, 0;
selp.f32 %f1, 0f3F9D7658, 0f3F5019C3, %p1;
setp.lt.u32 %p5, %r9, 2;
@%p5 bra $L__BB10_6;
// %r181 = index of (row 0, c); %r183 = index of (row 1, c) = c + W; step %r13 = 2*W.
add.s32 %r96, %r6, %r2;
add.s32 %r97, %r96, %r5;
sub.s32 %r183, %r97, %r3;
shl.b32 %r98, %r3, 1;
mov.u32 %r182, 1;
shl.b32 %r99, %r2, 1;
sub.s32 %r13, %r99, %r98;
selp.f32 %f2, 0f3F5019C3, 0f3F9D7658, %p1;
mov.u32 %r181, %r7;
$L__BB10_5:
// Scale one even/odd row pair, then advance two rows; %r182 counts 1, 3, 5, ... < H.
mul.wide.u32 %rd5, %r181, 4;
add.s64 %rd6, %rd1, %rd5;
ld.global.f32 %f3, [%rd6];
mul.rn.f32 %f4, %f1, %f3;
st.global.f32 [%rd6], %f4;
mul.wide.u32 %rd7, %r183, 4;
add.s64 %rd8, %rd1, %rd7;
ld.global.f32 %f5, [%rd8];
mul.rn.f32 %f6, %f2, %f5;
st.global.f32 [%rd8], %f6;
add.s32 %r183, %r183, %r13;
add.s32 %r181, %r181, %r13;
add.s32 %r182, %r182, 2;
setp.lt.u32 %p6, %r182, %r9;
@%p6 bra $L__BB10_5;
$L__BB10_6:
// Odd H: the last row (H-1, even index) was not covered by the pair loop.
and.b32 %r100, %r9, 1;
setp.eq.b32 %p7, %r100, 1;
mov.pred %p8, 0;
xor.pred %p9, %p7, %p8;
not.pred %p10, %p9;
@%p10 bra $L__BB10_8;
add.s32 %r101, %r9, -1;
mad.lo.s32 %r102, %r101, %r4, %r7;
mul.wide.u32 %rd9, %r102, 4;
add.s64 %rd10, %rd1, %rd9;
ld.global.f32 %f7, [%rd10];
mul.rn.f32 %f8, %f1, %f7;
st.global.f32 [%rd10], %f8;
$L__BB10_8:
// Pass 1 (coefficient 0fBEE31355 ~= -0.443507), rows i = parity, parity+2, ...
// %p11 (parity >= H) is reused to skip pass 3.
setp.ge.u32 %p11, %r195, %r9;
@%p11 bra $L__BB10_11;
// Loop state: %r187 = i, %r186 = 2H-3-i (mirrored right neighbour),
// %r185 = -i (so %r185+1 = 1-i), %r184 = i*W + c, %r23 = 2*W.
shl.b32 %r103, %r8, 1;
add.s32 %r104, %r103, -3;
sub.s32 %r105, %r104, %r195;
shl.b32 %r106, %r1, 1;
sub.s32 %r186, %r105, %r106;
neg.s32 %r185, %r195;
mad.lo.s32 %r184, %r4, %r195, %r7;
shl.b32 %r107, %r3, 1;
shl.b32 %r108, %r2, 1;
sub.s32 %r23, %r108, %r107;
mov.u32 %r187, %r195;
$L__BB10_10:
// x[i] += c1 * (x[left] + x[right]).
add.s32 %r109, %r185, 1;
add.s32 %r110, %r187, -1;
setp.gt.u32 %p12, %r187, 1;
selp.b32 %r111, %r110, %r109, %p12;
add.s32 %r112, %r187, 1;
setp.lt.u32 %p13, %r112, %r9;
selp.b32 %r113, %r112, %r186, %p13;
mad.lo.s32 %r114, %r111, %r4, %r7;
mul.wide.u32 %rd11, %r114, 4;
add.s64 %rd12, %rd1, %rd11;
mad.lo.s32 %r115, %r113, %r4, %r7;
mul.wide.u32 %rd13, %r115, 4;
add.s64 %rd14, %rd1, %rd13;
ld.global.f32 %f9, [%rd14];
ld.global.f32 %f10, [%rd12];
add.rn.f32 %f11, %f10, %f9;
mul.wide.u32 %rd15, %r184, 4;
add.s64 %rd16, %rd1, %rd15;
ld.global.f32 %f12, [%rd16];
mov.f32 %f13, 0fBEE31355;
fma.rn.f32 %f14, %f11, %f13, %f12;
st.global.f32 [%rd16], %f14;
add.s32 %r186, %r186, -2;
add.s32 %r185, %r185, -2;
add.s32 %r184, %r184, %r23;
add.s32 %r187, %r187, 2;
setp.lt.u32 %p14, %r187, %r9;
@%p14 bra $L__BB10_10;
$L__BB10_11:
// Pass 2 (coefficient 0fBF620676 ~= -0.882911), rows i = parity^1, +2, ...
// %p15 (parity^1 >= H) is reused to skip pass 4.
setp.ge.u32 %p15, %r199, %r9;
@%p15 bra $L__BB10_14;
shl.b32 %r116, %r8, 1;
add.s32 %r117, %r116, -3;
sub.s32 %r118, %r117, %r199;
shl.b32 %r119, %r1, 1;
sub.s32 %r190, %r118, %r119;
neg.s32 %r189, %r199;
mad.lo.s32 %r188, %r4, %r199, %r7;
shl.b32 %r120, %r3, 1;
shl.b32 %r121, %r2, 1;
sub.s32 %r35, %r121, %r120;
mov.u32 %r191, %r199;
$L__BB10_13:
add.s32 %r122, %r189, 1;
add.s32 %r123, %r191, -1;
setp.gt.u32 %p16, %r191, 1;
selp.b32 %r124, %r123, %r122, %p16;
add.s32 %r125, %r191, 1;
setp.lt.u32 %p17, %r125, %r9;
selp.b32 %r126, %r125, %r190, %p17;
mad.lo.s32 %r127, %r124, %r4, %r7;
mul.wide.u32 %rd17, %r127, 4;
add.s64 %rd18, %rd1, %rd17;
mad.lo.s32 %r128, %r126, %r4, %r7;
mul.wide.u32 %rd19, %r128, 4;
add.s64 %rd20, %rd1, %rd19;
ld.global.f32 %f15, [%rd20];
ld.global.f32 %f16, [%rd18];
add.rn.f32 %f17, %f16, %f15;
mul.wide.u32 %rd21, %r188, 4;
add.s64 %rd22, %rd1, %rd21;
ld.global.f32 %f18, [%rd22];
mov.f32 %f19, 0fBF620676;
fma.rn.f32 %f20, %f17, %f19, %f18;
st.global.f32 [%rd22], %f20;
add.s32 %r190, %r190, -2;
add.s32 %r189, %r189, -2;
add.s32 %r188, %r188, %r35;
add.s32 %r191, %r191, 2;
setp.lt.u32 %p18, %r191, %r9;
@%p18 bra $L__BB10_13;
$L__BB10_14:
// Pass 3 (coefficient 0f3D5901AE ~= 0.052980), rows i = parity, parity+2, ...
// From here on %r195 itself is the row counter.
@%p11 bra $L__BB10_17;
shl.b32 %r129, %r8, 1;
add.s32 %r130, %r129, -3;
sub.s32 %r131, %r130, %r195;
shl.b32 %r132, %r1, 1;
sub.s32 %r194, %r131, %r132;
neg.s32 %r193, %r195;
mad.lo.s32 %r192, %r4, %r195, %r7;
shl.b32 %r133, %r3, 1;
shl.b32 %r134, %r2, 1;
sub.s32 %r47, %r134, %r133;
$L__BB10_16:
add.s32 %r135, %r193, 1;
add.s32 %r136, %r195, -1;
setp.gt.u32 %p20, %r195, 1;
selp.b32 %r137, %r136, %r135, %p20;
add.s32 %r138, %r195, 1;
setp.lt.u32 %p21, %r138, %r9;
selp.b32 %r139, %r138, %r194, %p21;
mad.lo.s32 %r140, %r137, %r4, %r7;
mul.wide.u32 %rd23, %r140, 4;
add.s64 %rd24, %rd1, %rd23;
mad.lo.s32 %r141, %r139, %r4, %r7;
mul.wide.u32 %rd25, %r141, 4;
add.s64 %rd26, %rd1, %rd25;
ld.global.f32 %f21, [%rd26];
ld.global.f32 %f22, [%rd24];
add.rn.f32 %f23, %f22, %f21;
mul.wide.u32 %rd27, %r192, 4;
add.s64 %rd28, %rd1, %rd27;
ld.global.f32 %f24, [%rd28];
mov.f32 %f25, 0f3D5901AE;
fma.rn.f32 %f26, %f23, %f25, %f24;
st.global.f32 [%rd28], %f26;
add.s32 %r194, %r194, -2;
add.s32 %r193, %r193, -2;
add.s32 %r192, %r192, %r47;
add.s32 %r195, %r195, 2;
setp.lt.u32 %p22, %r195, %r9;
@%p22 bra $L__BB10_16;
$L__BB10_17:
// Pass 4 (coefficient 0f3FCB0673 ~= 1.586134), rows i = parity^1, +2, ...
// %r199 itself is the row counter.
@%p15 bra $L__BB10_28;
shl.b32 %r142, %r8, 1;
add.s32 %r143, %r142, -3;
sub.s32 %r144, %r143, %r199;
shl.b32 %r145, %r1, 1;
sub.s32 %r198, %r144, %r145;
neg.s32 %r197, %r199;
mad.lo.s32 %r196, %r4, %r199, %r7;
shl.b32 %r146, %r3, 1;
shl.b32 %r147, %r2, 1;
sub.s32 %r59, %r147, %r146;
$L__BB10_19:
add.s32 %r148, %r197, 1;
add.s32 %r149, %r199, -1;
setp.gt.u32 %p24, %r199, 1;
selp.b32 %r150, %r149, %r148, %p24;
add.s32 %r151, %r199, 1;
setp.lt.u32 %p25, %r151, %r9;
selp.b32 %r152, %r151, %r198, %p25;
mad.lo.s32 %r153, %r150, %r4, %r7;
mul.wide.u32 %rd29, %r153, 4;
add.s64 %rd30, %rd1, %rd29;
mad.lo.s32 %r154, %r152, %r4, %r7;
mul.wide.u32 %rd31, %r154, 4;
add.s64 %rd32, %rd1, %rd31;
ld.global.f32 %f27, [%rd32];
ld.global.f32 %f28, [%rd30];
add.rn.f32 %f29, %f28, %f27;
mul.wide.u32 %rd33, %r196, 4;
add.s64 %rd34, %rd1, %rd33;
ld.global.f32 %f30, [%rd34];
mov.f32 %f31, 0f3FCB0673;
fma.rn.f32 %f32, %f29, %f31, %f30;
st.global.f32 [%rd34], %f32;
add.s32 %r198, %r198, -2;
add.s32 %r197, %r197, -2;
add.s32 %r196, %r196, %r59;
add.s32 %r199, %r199, 2;
setp.lt.u32 %p26, %r199, %r9;
@%p26 bra $L__BB10_19;
bra.uni $L__BB10_28;
$L__BB10_20:
// ---- Floor-based path (flag == 0) ----
// Step 1, rows i = parity, parity+2, ...:
//   x[i] -= floor((x[left] + x[right]) * 0.25 + 0.5)   (cvt.rmi = round toward -inf).
// %r195 is the row counter.
setp.ge.u32 %p27, %r195, %r9;
@%p27 bra $L__BB10_23;
shl.b32 %r155, %r8, 1;
add.s32 %r156, %r155, -3;
sub.s32 %r157, %r156, %r195;
shl.b32 %r158, %r1, 1;
sub.s32 %r202, %r157, %r158;
neg.s32 %r201, %r195;
mad.lo.s32 %r200, %r4, %r195, %r7;
shl.b32 %r159, %r3, 1;
shl.b32 %r160, %r2, 1;
sub.s32 %r71, %r160, %r159;
$L__BB10_22:
add.s32 %r161, %r201, 1;
add.s32 %r162, %r195, -1;
setp.gt.u32 %p28, %r195, 1;
selp.b32 %r163, %r162, %r161, %p28;
add.s32 %r164, %r195, 1;
setp.lt.u32 %p29, %r164, %r9;
selp.b32 %r165, %r164, %r202, %p29;
mad.lo.s32 %r166, %r163, %r4, %r7;
mul.wide.u32 %rd35, %r166, 4;
add.s64 %rd36, %rd1, %rd35;
mad.lo.s32 %r167, %r165, %r4, %r7;
mul.wide.u32 %rd37, %r167, 4;
add.s64 %rd38, %rd1, %rd37;
mul.wide.u32 %rd39, %r200, 4;
add.s64 %rd40, %rd1, %rd39;
ld.global.f32 %f33, [%rd38];
ld.global.f32 %f34, [%rd36];
add.rn.f32 %f35, %f34, %f33;
mov.f32 %f36, 0f3F000000;
mov.f32 %f37, 0f3E800000;
fma.rn.f32 %f38, %f35, %f37, %f36;
cvt.rmi.f32.f32 %f39, %f38;
ld.global.f32 %f40, [%rd40];
sub.rn.f32 %f41, %f40, %f39;
st.global.f32 [%rd40], %f41;
add.s32 %r202, %r202, -2;
add.s32 %r201, %r201, -2;
add.s32 %r200, %r200, %r71;
add.s32 %r195, %r195, 2;
setp.lt.u32 %p30, %r195, %r9;
@%p30 bra $L__BB10_22;
$L__BB10_23:
// Step 2, rows i = parity^1, +2, ...:
//   x[i] += floor((x[left] + x[right]) * 0.5).
// %r199 is the row counter.
setp.ge.u32 %p31, %r199, %r9;
@%p31 bra $L__BB10_28;
shl.b32 %r168, %r8, 1;
add.s32 %r169, %r168, -3;
sub.s32 %r170, %r169, %r199;
shl.b32 %r171, %r1, 1;
sub.s32 %r206, %r170, %r171;
neg.s32 %r205, %r199;
mad.lo.s32 %r204, %r4, %r199, %r7;
shl.b32 %r172, %r3, 1;
shl.b32 %r173, %r2, 1;
sub.s32 %r83, %r173, %r172;
$L__BB10_25:
add.s32 %r174, %r205, 1;
add.s32 %r175, %r199, -1;
setp.gt.u32 %p32, %r199, 1;
selp.b32 %r176, %r175, %r174, %p32;
add.s32 %r177, %r199, 1;
setp.lt.u32 %p33, %r177, %r9;
selp.b32 %r178, %r177, %r206, %p33;
mad.lo.s32 %r179, %r176, %r4, %r7;
mul.wide.u32 %rd41, %r179, 4;
add.s64 %rd42, %rd1, %rd41;
mad.lo.s32 %r180, %r178, %r4, %r7;
mul.wide.u32 %rd43, %r180, 4;
add.s64 %rd44, %rd1, %rd43;
mul.wide.u32 %rd45, %r204, 4;
add.s64 %rd46, %rd1, %rd45;
ld.global.f32 %f42, [%rd44];
ld.global.f32 %f43, [%rd42];
add.rn.f32 %f44, %f43, %f42;
mul.rn.f32 %f45, %f44, 0f3F000000;
cvt.rmi.f32.f32 %f46, %f45;
ld.global.f32 %f47, [%rd46];
add.rn.f32 %f48, %f47, %f46;
st.global.f32 [%rd46], %f48;
add.s32 %r206, %r206, -2;
add.s32 %r205, %r205, -2;
add.s32 %r204, %r204, %r83;
add.s32 %r199, %r199, 2;
setp.lt.u32 %p34, %r199, %r9;
@%p34 bra $L__BB10_25;
$L__BB10_28:
ret;
}
// .globl j2k_idwt_vertical_53
// j2k_idwt_vertical_53 -- in-place floor-based inverse lifting along columns,
// one thread per column.
//
// param_0 : global pointer to f32 samples; element (row i, column c) is at index i*W + c.
// param_1 : global pointer to a u32 descriptor. Fields read (byte offsets):
//             +0, +4, +8, +12 (presumably x0, y0, x1, y1 -- TODO confirm).
//           W = [+8] - [+0] (number of columns), H = [+12] - [+4] (column length),
//           parity = [+4] & 1.
// Thread ctaid.x*ntid.x + tid.x owns column c; threads with c >= W exit.
// Both loops walk i = start, start+2, ... < H and use neighbours
//   left  = (i > 1)   ? i-1 : 1-i
//   right = (i+1 < H) ? i+1 : 2H-3-i
.visible .entry j2k_idwt_vertical_53(
.param .u64 j2k_idwt_vertical_53_param_0,
.param .u64 j2k_idwt_vertical_53_param_1
)
{
.reg .pred %p<12>;
.reg .f32 %f<19>;
.reg .b32 %r<71>;
.reg .b64 %rd<19>;
ld.param.u64 %rd3, [j2k_idwt_vertical_53_param_0];
ld.param.u64 %rd4, [j2k_idwt_vertical_53_param_1];
cvta.to.global.u64 %rd1, %rd3;
cvta.to.global.u64 %rd2, %rd4;
// %r1 = desc[+4], %r2 = desc[+8], %r3 = desc[+0]; %r4 = W.
ld.global.u32 %r1, [%rd2+4];
ld.global.u32 %r2, [%rd2+8];
ld.global.u32 %r3, [%rd2];
sub.s32 %r4, %r2, %r3;
// %r5 = column index c.
mov.u32 %r34, %ntid.x;
mov.u32 %r35, %ctaid.x;
mov.u32 %r36, %tid.x;
mad.lo.s32 %r5, %r35, %r34, %r36;
setp.ge.u32 %p1, %r5, %r4;
@%p1 bra $L__BB11_10;
// %r6 = desc[+12]; %r7 = H; %r66 = parity (also the row counter of step 1).
ld.global.u32 %r6, [%rd2+12];
sub.s32 %r7, %r6, %r1;
setp.eq.s32 %p2, %r7, 1;
and.b32 %r66, %r1, 1;
@%p2 bra $L__BB11_8;
bra.uni $L__BB11_2;
$L__BB11_8:
// H == 1: leave the sample untouched for parity 0, halve it for parity 1.
setp.eq.s32 %p11, %r66, 0;
@%p11 bra $L__BB11_10;
mul.wide.u32 %rd17, %r5, 4;
add.s64 %rd18, %rd1, %rd17;
ld.global.f32 %f17, [%rd18];
mul.rn.f32 %f18, %f17, 0f3F000000;
st.global.f32 [%rd18], %f18;
bra.uni $L__BB11_10;
$L__BB11_2:
// %r70 = parity ^ 1 (row counter of step 2).
// Step 1, rows i = parity, parity+2, ...:
//   x[i] -= floor((x[left] + x[right]) * 0.25 + 0.5)   (cvt.rmi = round toward -inf).
xor.b32 %r70, %r66, 1;
setp.ge.u32 %p3, %r66, %r7;
@%p3 bra $L__BB11_5;
// Loop state: %r65 = 2H-3-i, %r64 = -i, %r63 = i*W + c, %r13 = 2*W.
shl.b32 %r37, %r6, 1;
add.s32 %r38, %r37, -3;
sub.s32 %r39, %r38, %r66;
shl.b32 %r40, %r1, 1;
sub.s32 %r65, %r39, %r40;
neg.s32 %r64, %r66;
mad.lo.s32 %r63, %r4, %r66, %r5;
shl.b32 %r41, %r3, 1;
shl.b32 %r42, %r2, 1;
sub.s32 %r13, %r42, %r41;
$L__BB11_4:
add.s32 %r43, %r64, 1;
add.s32 %r44, %r66, -1;
setp.gt.u32 %p4, %r66, 1;
selp.b32 %r45, %r44, %r43, %p4;
add.s32 %r46, %r66, 1;
setp.lt.u32 %p5, %r46, %r7;
selp.b32 %r47, %r46, %r65, %p5;
mad.lo.s32 %r48, %r45, %r4, %r5;
mul.wide.u32 %rd5, %r48, 4;
add.s64 %rd6, %rd1, %rd5;
mad.lo.s32 %r49, %r47, %r4, %r5;
mul.wide.u32 %rd7, %r49, 4;
add.s64 %rd8, %rd1, %rd7;
mul.wide.u32 %rd9, %r63, 4;
add.s64 %rd10, %rd1, %rd9;
ld.global.f32 %f1, [%rd8];
ld.global.f32 %f2, [%rd6];
add.rn.f32 %f3, %f2, %f1;
mov.f32 %f4, 0f3F000000;
mov.f32 %f5, 0f3E800000;
fma.rn.f32 %f6, %f3, %f5, %f4;
cvt.rmi.f32.f32 %f7, %f6;
ld.global.f32 %f8, [%rd10];
sub.rn.f32 %f9, %f8, %f7;
st.global.f32 [%rd10], %f9;
add.s32 %r65, %r65, -2;
add.s32 %r64, %r64, -2;
add.s32 %r63, %r63, %r13;
add.s32 %r66, %r66, 2;
setp.lt.u32 %p6, %r66, %r7;
@%p6 bra $L__BB11_4;
$L__BB11_5:
// Step 2, rows i = parity^1, +2, ...:
//   x[i] += floor((x[left] + x[right]) * 0.5).
setp.ge.u32 %p7, %r70, %r7;
@%p7 bra $L__BB11_10;
// Loop state: %r69 = 2H-3-i, %r68 = -i, %r67 = i*W + c, %r25 = 2*W.
shl.b32 %r50, %r6, 1;
add.s32 %r51, %r50, -3;
sub.s32 %r52, %r51, %r70;
shl.b32 %r53, %r1, 1;
sub.s32 %r69, %r52, %r53;
neg.s32 %r68, %r70;
mad.lo.s32 %r67, %r4, %r70, %r5;
shl.b32 %r54, %r3, 1;
shl.b32 %r55, %r2, 1;
sub.s32 %r25, %r55, %r54;
$L__BB11_7:
add.s32 %r56, %r68, 1;
add.s32 %r57, %r70, -1;
setp.gt.u32 %p8, %r70, 1;
selp.b32 %r58, %r57, %r56, %p8;
add.s32 %r59, %r70, 1;
setp.lt.u32 %p9, %r59, %r7;
selp.b32 %r60, %r59, %r69, %p9;
mad.lo.s32 %r61, %r58, %r4, %r5;
mul.wide.u32 %rd11, %r61, 4;
add.s64 %rd12, %rd1, %rd11;
mad.lo.s32 %r62, %r60, %r4, %r5;
mul.wide.u32 %rd13, %r62, 4;
add.s64 %rd14, %rd1, %rd13;
mul.wide.u32 %rd15, %r67, 4;
add.s64 %rd16, %rd1, %rd15;
ld.global.f32 %f10, [%rd14];
ld.global.f32 %f11, [%rd12];
add.rn.f32 %f12, %f11, %f10;
mul.rn.f32 %f13, %f12, 0f3F000000;
cvt.rmi.f32.f32 %f14, %f13;
ld.global.f32 %f15, [%rd16];
add.rn.f32 %f16, %f15, %f14;
st.global.f32 [%rd16], %f16;
add.s32 %r69, %r69, -2;
add.s32 %r68, %r68, -2;
add.s32 %r67, %r67, %r25;
add.s32 %r70, %r70, 2;
setp.lt.u32 %p10, %r70, %r7;
@%p10 bra $L__BB11_7;
$L__BB11_10:
ret;
}
// .globl j2k_idwt_vertical_97
// j2k_idwt_vertical_97 -- in-place float inverse lifting along columns,
// one thread per column.
//
// param_0 : global pointer to f32 samples; element (row i, column c) is at index i*W + c.
// param_1 : global pointer to a u32 descriptor. Fields read (byte offsets):
//             +0, +4, +8, +12 (presumably x0, y0, x1, y1 -- TODO confirm).
//           W = [+8] - [+0] (number of columns), H = [+12] - [+4] (column length),
//           parity = [+4] & 1.
// Thread ctaid.x*ntid.x + tid.x owns column c; threads with c >= W exit.
// Order of work: per-row scaling, then four lifting passes
//   x[i] += coeff * (x[left] + x[right])
// with coeff ~= -0.443507, -0.882911, 0.052980, 1.586134, alternating between rows
// i = parity, parity+2, ... and rows i = parity^1, +2, ...
// Neighbours: left = (i > 1) ? i-1 : 1-i;  right = (i+1 < H) ? i+1 : 2H-3-i.
.visible .entry j2k_idwt_vertical_97(
.param .u64 j2k_idwt_vertical_97_param_0,
.param .u64 j2k_idwt_vertical_97_param_1
)
{
.reg .pred %p<27>;
.reg .f32 %f<35>;
.reg .b32 %r<149>;
.reg .b64 %rd<37>;
ld.param.u64 %rd3, [j2k_idwt_vertical_97_param_0];
ld.param.u64 %rd4, [j2k_idwt_vertical_97_param_1];
cvta.to.global.u64 %rd1, %rd3;
cvta.to.global.u64 %rd2, %rd4;
// %r1 = desc[+4], %r2 = desc[+8], %r3 = desc[+0]; %r4 = W.
ld.global.u32 %r1, [%rd2+4];
ld.global.u32 %r2, [%rd2+8];
ld.global.u32 %r3, [%rd2];
sub.s32 %r4, %r2, %r3;
// %r7 = column index c = ctaid.x * ntid.x + tid.x.
mov.u32 %r68, %ntid.x;
mov.u32 %r69, %ctaid.x;
mul.lo.s32 %r5, %r69, %r68;
mov.u32 %r6, %tid.x;
add.s32 %r7, %r5, %r6;
setp.ge.u32 %p2, %r7, %r4;
@%p2 bra $L__BB12_21;
// %r8 = desc[+12]; %r9 = H; %r144 = parity (later reused as a row counter).
ld.global.u32 %r8, [%rd2+12];
sub.s32 %r9, %r8, %r1;
setp.eq.s32 %p3, %r9, 1;
and.b32 %r144, %r1, 1;
@%p3 bra $L__BB12_19;
bra.uni $L__BB12_2;
$L__BB12_19:
// H == 1: leave the sample untouched for parity 0, halve it for parity 1.
setp.eq.s32 %p26, %r144, 0;
@%p26 bra $L__BB12_21;
mul.wide.u32 %rd35, %r7, 4;
add.s64 %rd36, %rd1, %rd35;
ld.global.f32 %f33, [%rd36];
mul.rn.f32 %f34, %f33, 0f3F000000;
st.global.f32 [%rd36], %f34;
bra.uni $L__BB12_21;
$L__BB12_2:
// %r148 = parity ^ 1.
// Scaling factors: 0f3F9D7658 ~= 1.230174, 0f3F5019C3 ~= 0.812893.
// %f1 multiplies even rows, %f2 odd rows; the two swap when parity == 1.
xor.b32 %r148, %r144, 1;
setp.eq.s32 %p1, %r144, 0;
selp.f32 %f1, 0f3F9D7658, 0f3F5019C3, %p1;
// H == 1 was handled above, so H == 0 is the only remaining case with no row pair to scale.
setp.eq.s32 %p4, %r9, 0;
@%p4 bra $L__BB12_5;
// %r130 = index of (row 0, c); %r132 = index of (row 1, c) = c + W; step %r13 = 2*W.
add.s32 %r71, %r6, %r2;
add.s32 %r72, %r71, %r5;
sub.s32 %r132, %r72, %r3;
shl.b32 %r73, %r3, 1;
mov.u32 %r131, 1;
shl.b32 %r74, %r2, 1;
sub.s32 %r13, %r74, %r73;
selp.f32 %f2, 0f3F5019C3, 0f3F9D7658, %p1;
mov.u32 %r130, %r7;
$L__BB12_4:
// Scale one even/odd row pair, then advance two rows; %r131 counts 1, 3, 5, ... < H.
mul.wide.u32 %rd5, %r130, 4;
add.s64 %rd6, %rd1, %rd5;
ld.global.f32 %f3, [%rd6];
mul.rn.f32 %f4, %f1, %f3;
st.global.f32 [%rd6], %f4;
mul.wide.u32 %rd7, %r132, 4;
add.s64 %rd8, %rd1, %rd7;
ld.global.f32 %f5, [%rd8];
mul.rn.f32 %f6, %f2, %f5;
st.global.f32 [%rd8], %f6;
add.s32 %r132, %r132, %r13;
add.s32 %r130, %r130, %r13;
add.s32 %r131, %r131, 2;
setp.lt.u32 %p5, %r131, %r9;
@%p5 bra $L__BB12_4;
$L__BB12_5:
// Odd H: the last row (H-1, even index) was not covered by the pair loop.
and.b32 %r75, %r9, 1;
setp.eq.b32 %p6, %r75, 1;
mov.pred %p7, 0;
xor.pred %p8, %p6, %p7;
not.pred %p9, %p8;
@%p9 bra $L__BB12_7;
add.s32 %r76, %r9, -1;
mad.lo.s32 %r77, %r76, %r4, %r7;
mul.wide.u32 %rd9, %r77, 4;
add.s64 %rd10, %rd1, %rd9;
ld.global.f32 %f7, [%rd10];
mul.rn.f32 %f8, %f1, %f7;
st.global.f32 [%rd10], %f8;
$L__BB12_7:
// Pass 1 (coefficient 0fBEE31355 ~= -0.443507), rows i = parity, parity+2, ...
// %p10 (parity >= H) is reused to skip pass 3.
setp.ge.u32 %p10, %r144, %r9;
@%p10 bra $L__BB12_10;
// Loop state: %r136 = i, %r135 = 2H-3-i, %r134 = -i, %r133 = i*W + c, %r23 = 2*W.
shl.b32 %r78, %r8, 1;
add.s32 %r79, %r78, -3;
sub.s32 %r80, %r79, %r144;
shl.b32 %r81, %r1, 1;
sub.s32 %r135, %r80, %r81;
neg.s32 %r134, %r144;
mad.lo.s32 %r133, %r4, %r144, %r7;
shl.b32 %r82, %r3, 1;
shl.b32 %r83, %r2, 1;
sub.s32 %r23, %r83, %r82;
mov.u32 %r136, %r144;
$L__BB12_9:
add.s32 %r84, %r134, 1;
add.s32 %r85, %r136, -1;
setp.gt.u32 %p11, %r136, 1;
selp.b32 %r86, %r85, %r84, %p11;
add.s32 %r87, %r136, 1;
setp.lt.u32 %p12, %r87, %r9;
selp.b32 %r88, %r87, %r135, %p12;
mad.lo.s32 %r89, %r86, %r4, %r7;
mul.wide.u32 %rd11, %r89, 4;
add.s64 %rd12, %rd1, %rd11;
mad.lo.s32 %r90, %r88, %r4, %r7;
mul.wide.u32 %rd13, %r90, 4;
add.s64 %rd14, %rd1, %rd13;
ld.global.f32 %f9, [%rd14];
ld.global.f32 %f10, [%rd12];
add.rn.f32 %f11, %f10, %f9;
mul.wide.u32 %rd15, %r133, 4;
add.s64 %rd16, %rd1, %rd15;
ld.global.f32 %f12, [%rd16];
mov.f32 %f13, 0fBEE31355;
fma.rn.f32 %f14, %f11, %f13, %f12;
st.global.f32 [%rd16], %f14;
add.s32 %r135, %r135, -2;
add.s32 %r134, %r134, -2;
add.s32 %r133, %r133, %r23;
add.s32 %r136, %r136, 2;
setp.lt.u32 %p13, %r136, %r9;
@%p13 bra $L__BB12_9;
$L__BB12_10:
// Pass 2 (coefficient 0fBF620676 ~= -0.882911), rows i = parity^1, +2, ...
// %p14 (parity^1 >= H) is reused to skip pass 4.
setp.ge.u32 %p14, %r148, %r9;
@%p14 bra $L__BB12_13;
shl.b32 %r91, %r8, 1;
add.s32 %r92, %r91, -3;
sub.s32 %r93, %r92, %r148;
shl.b32 %r94, %r1, 1;
sub.s32 %r139, %r93, %r94;
neg.s32 %r138, %r148;
mad.lo.s32 %r137, %r4, %r148, %r7;
shl.b32 %r95, %r3, 1;
shl.b32 %r96, %r2, 1;
sub.s32 %r35, %r96, %r95;
mov.u32 %r140, %r148;
$L__BB12_12:
add.s32 %r97, %r138, 1;
add.s32 %r98, %r140, -1;
setp.gt.u32 %p15, %r140, 1;
selp.b32 %r99, %r98, %r97, %p15;
add.s32 %r100, %r140, 1;
setp.lt.u32 %p16, %r100, %r9;
selp.b32 %r101, %r100, %r139, %p16;
mad.lo.s32 %r102, %r99, %r4, %r7;
mul.wide.u32 %rd17, %r102, 4;
add.s64 %rd18, %rd1, %rd17;
mad.lo.s32 %r103, %r101, %r4, %r7;
mul.wide.u32 %rd19, %r103, 4;
add.s64 %rd20, %rd1, %rd19;
ld.global.f32 %f15, [%rd20];
ld.global.f32 %f16, [%rd18];
add.rn.f32 %f17, %f16, %f15;
mul.wide.u32 %rd21, %r137, 4;
add.s64 %rd22, %rd1, %rd21;
ld.global.f32 %f18, [%rd22];
mov.f32 %f19, 0fBF620676;
fma.rn.f32 %f20, %f17, %f19, %f18;
st.global.f32 [%rd22], %f20;
add.s32 %r139, %r139, -2;
add.s32 %r138, %r138, -2;
add.s32 %r137, %r137, %r35;
add.s32 %r140, %r140, 2;
setp.lt.u32 %p17, %r140, %r9;
@%p17 bra $L__BB12_12;
$L__BB12_13:
// Pass 3 (coefficient 0f3D5901AE ~= 0.052980), rows i = parity, parity+2, ...
// %r144 itself is the row counter here.
@%p10 bra $L__BB12_16;
shl.b32 %r104, %r8, 1;
add.s32 %r105, %r104, -3;
sub.s32 %r106, %r105, %r144;
shl.b32 %r107, %r1, 1;
sub.s32 %r143, %r106, %r107;
neg.s32 %r142, %r144;
mad.lo.s32 %r141, %r4, %r144, %r7;
shl.b32 %r108, %r3, 1;
shl.b32 %r109, %r2, 1;
sub.s32 %r47, %r109, %r108;
$L__BB12_15:
add.s32 %r110, %r142, 1;
add.s32 %r111, %r144, -1;
setp.gt.u32 %p19, %r144, 1;
selp.b32 %r112, %r111, %r110, %p19;
add.s32 %r113, %r144, 1;
setp.lt.u32 %p20, %r113, %r9;
selp.b32 %r114, %r113, %r143, %p20;
mad.lo.s32 %r115, %r112, %r4, %r7;
mul.wide.u32 %rd23, %r115, 4;
add.s64 %rd24, %rd1, %rd23;
mad.lo.s32 %r116, %r114, %r4, %r7;
mul.wide.u32 %rd25, %r116, 4;
add.s64 %rd26, %rd1, %rd25;
ld.global.f32 %f21, [%rd26];
ld.global.f32 %f22, [%rd24];
add.rn.f32 %f23, %f22, %f21;
mul.wide.u32 %rd27, %r141, 4;
add.s64 %rd28, %rd1, %rd27;
ld.global.f32 %f24, [%rd28];
mov.f32 %f25, 0f3D5901AE;
fma.rn.f32 %f26, %f23, %f25, %f24;
st.global.f32 [%rd28], %f26;
add.s32 %r143, %r143, -2;
add.s32 %r142, %r142, -2;
add.s32 %r141, %r141, %r47;
add.s32 %r144, %r144, 2;
setp.lt.u32 %p21, %r144, %r9;
@%p21 bra $L__BB12_15;
$L__BB12_16:
// Pass 4 (coefficient 0f3FCB0673 ~= 1.586134), rows i = parity^1, +2, ...
// %r148 itself is the row counter here.
@%p14 bra $L__BB12_21;
shl.b32 %r117, %r8, 1;
add.s32 %r118, %r117, -3;
sub.s32 %r119, %r118, %r148;
shl.b32 %r120, %r1, 1;
sub.s32 %r147, %r119, %r120;
neg.s32 %r146, %r148;
mad.lo.s32 %r145, %r4, %r148, %r7;
shl.b32 %r121, %r3, 1;
shl.b32 %r122, %r2, 1;
sub.s32 %r59, %r122, %r121;
$L__BB12_18:
add.s32 %r123, %r146, 1;
add.s32 %r124, %r148, -1;
setp.gt.u32 %p23, %r148, 1;
selp.b32 %r125, %r124, %r123, %p23;
add.s32 %r126, %r148, 1;
setp.lt.u32 %p24, %r126, %r9;
selp.b32 %r127, %r126, %r147, %p24;
mad.lo.s32 %r128, %r125, %r4, %r7;
mul.wide.u32 %rd29, %r128, 4;
add.s64 %rd30, %rd1, %rd29;
mad.lo.s32 %r129, %r127, %r4, %r7;
mul.wide.u32 %rd31, %r129, 4;
add.s64 %rd32, %rd1, %rd31;
ld.global.f32 %f27, [%rd32];
ld.global.f32 %f28, [%rd30];
add.rn.f32 %f29, %f28, %f27;
mul.wide.u32 %rd33, %r145, 4;
add.s64 %rd34, %rd1, %rd33;
ld.global.f32 %f30, [%rd34];
mov.f32 %f31, 0f3FCB0673;
fma.rn.f32 %f32, %f29, %f31, %f30;
st.global.f32 [%rd34], %f32;
add.s32 %r147, %r147, -2;
add.s32 %r146, %r146, -2;
add.s32 %r145, %r145, %r59;
add.s32 %r148, %r148, 2;
setp.lt.u32 %p25, %r148, %r9;
@%p25 bra $L__BB12_18;
$L__BB12_21:
ret;
}
// .globl j2k_idwt_vertical_multi
// j2k_idwt_vertical_multi
// In-place vertical lifting over one column of a float array, one thread per
// column, operating directly on memory (no shared staging).
//
// param_0: global pointer to an array of 128-byte records; the record used is
//          selected by %ctaid.y. Fields read from the record (byte offsets):
//            +32  u64  data pointer (dereferenced with generic ld/st)
//            +40  u32  %r93, +44 u32 %r94
//            +48  u32  %r3,  +52 u32 %r8
//            +120 u32  mode flag (0 selects the floor-based path at $L__BB13_20)
// Derived values:
//   %r4 = %r3 - %r93   columns per row, also the row stride in elements
//   %r9 = %r8 - %r94   number of rows
//   %r7 = %ctaid.x * %ntid.x + %tid.x   column handled by this thread
//   %r198 = %r94 & 1, %r202 = %r198 ^ 1   starting rows of the two interleaved sets
// NOTE(review): presumably (%r93,%r94)/(%r3,%r8) are the tile's (x0,y0)/(x1,y1)
// and the two paths are the JPEG 2000 9/7 and 5/3 inverse transforms - confirm
// against the CUDA source.
//
// Neighbour rows are mirrored at the borders in every lifting loop:
//   previous = i-1 when i > 1, otherwise 1-i
//   next     = i+1 when i+1 < rows, otherwise 2*rows-3-i
.visible .entry j2k_idwt_vertical_multi(
.param .u64 j2k_idwt_vertical_multi_param_0
)
{
.reg .pred %p<36>;
.reg .f32 %f<51>;
.reg .b32 %r<211>;
.reg .b64 %rd<51>;
// %rd6 = record for this %ctaid.y; %rd1 = %rd6 + 120 is the base for later loads.
ld.param.u64 %rd3, [j2k_idwt_vertical_multi_param_0];
cvta.to.global.u64 %rd4, %rd3;
mov.u32 %r92, %ctaid.y;
mul.wide.u32 %rd5, %r92, 128;
add.s64 %rd6, %rd4, %rd5;
add.s64 %rd1, %rd6, 120;
ld.global.v2.u32 {%r93, %r94}, [%rd6+40];
ld.global.u32 %r3, [%rd6+48];
sub.s32 %r4, %r3, %r93;
mov.u32 %r95, %ntid.x;
mov.u32 %r96, %ctaid.x;
mul.lo.s32 %r5, %r96, %r95;
mov.u32 %r6, %tid.x;
add.s32 %r7, %r5, %r6;
// Threads past the last column have nothing to do.
setp.ge.u32 %p2, %r7, %r4;
@%p2 bra $L__BB13_28;
// [%rd1-88] is record offset +32 (data pointer); [%rd1-68] is offset +52.
ld.global.u64 %rd2, [%rd1+-88];
ld.global.u32 %r8, [%rd1+-68];
sub.s32 %r9, %r8, %r94;
setp.eq.s32 %p3, %r9, 1;
and.b32 %r198, %r94, 1;
@%p3 bra $L__BB13_26;
bra.uni $L__BB13_2;
$L__BB13_26:
// Single-row case: the lone sample is halved only when %r94 is odd.
setp.eq.s32 %p35, %r198, 0;
@%p35 bra $L__BB13_28;
mul.wide.u32 %rd49, %r7, 4;
add.s64 %rd50, %rd2, %rd49;
ld.f32 %f49, [%rd50];
mul.rn.f32 %f50, %f49, 0f3F000000;
st.f32 [%rd50], %f50;
bra.uni $L__BB13_28;
$L__BB13_2:
// Mode flag at record offset +120: zero -> floor-based path ($L__BB13_20),
// non-zero -> scaled four-step path below.
ld.global.u32 %r97, [%rd1];
setp.eq.s32 %p4, %r97, 0;
xor.b32 %r202, %r198, 1;
@%p4 bra $L__BB13_20;
// ---- Scaling pass -------------------------------------------------------
// %f1 scales rows 0,2,4,...; %f2 scales rows 1,3,5,...
// 0f3F9D7658 is about 1.230174 and 0f3F5019C3 is about 0.812893; which one
// goes to which row set depends on the parity %r198.
setp.eq.s32 %p1, %r198, 0;
selp.f32 %f1, 0f3F9D7658, 0f3F5019C3, %p1;
setp.lt.u32 %p5, %r9, 2;
@%p5 bra $L__BB13_6;
// %r184 indexes the even row, %r186 the following odd row (one stride later);
// both advance by two rows (%r13 = 2 * %r4) per iteration.
add.s32 %r99, %r6, %r3;
add.s32 %r100, %r99, %r5;
sub.s32 %r186, %r100, %r93;
shl.b32 %r101, %r93, 1;
mov.u32 %r185, 1;
shl.b32 %r102, %r3, 1;
sub.s32 %r13, %r102, %r101;
selp.f32 %f2, 0f3F5019C3, 0f3F9D7658, %p1;
mov.u32 %r184, %r7;
$L__BB13_5:
mul.wide.u32 %rd7, %r184, 4;
add.s64 %rd8, %rd2, %rd7;
ld.f32 %f3, [%rd8];
mul.rn.f32 %f4, %f1, %f3;
st.f32 [%rd8], %f4;
mul.wide.u32 %rd9, %r186, 4;
add.s64 %rd10, %rd2, %rd9;
ld.f32 %f5, [%rd10];
mul.rn.f32 %f6, %f2, %f5;
st.f32 [%rd10], %f6;
add.s32 %r186, %r186, %r13;
add.s32 %r184, %r184, %r13;
add.s32 %r185, %r185, 2;
setp.lt.u32 %p6, %r185, %r9;
@%p6 bra $L__BB13_5;
$L__BB13_6:
// With an odd row count the last row (rows-1) is not covered by the pair
// loop above; scale it here with %f1.
and.b32 %r103, %r9, 1;
setp.eq.b32 %p7, %r103, 1;
mov.pred %p8, 0;
xor.pred %p9, %p7, %p8;
not.pred %p10, %p9;
@%p10 bra $L__BB13_8;
add.s32 %r104, %r9, -1;
mad.lo.s32 %r105, %r104, %r4, %r7;
mul.wide.u32 %rd11, %r105, 4;
add.s64 %rd12, %rd2, %rd11;
ld.f32 %f7, [%rd12];
mul.rn.f32 %f8, %f1, %f7;
st.f32 [%rd12], %f8;
$L__BB13_8:
// ---- Lifting step 1: rows %r198, %r198+2, ... ---------------------------
// row[i] += (prev + next) * 0fBEE31355 (about -0.443507)
// %p11 is reused by step 3 to skip the same row set.
setp.ge.u32 %p11, %r198, %r9;
@%p11 bra $L__BB13_11;
// %r189 tracks the mirrored "next" index 2*rows-3-i, %r188 tracks -i,
// %r187 is the element index of row i in this column, %r23 = two row strides.
shl.b32 %r106, %r8, 1;
add.s32 %r107, %r106, -3;
sub.s32 %r108, %r107, %r198;
shl.b32 %r109, %r94, 1;
sub.s32 %r189, %r108, %r109;
neg.s32 %r188, %r198;
mad.lo.s32 %r187, %r4, %r198, %r7;
shl.b32 %r110, %r93, 1;
shl.b32 %r111, %r3, 1;
sub.s32 %r23, %r111, %r110;
mov.u32 %r190, %r198;
$L__BB13_10:
add.s32 %r112, %r188, 1;
add.s32 %r113, %r190, -1;
setp.gt.u32 %p12, %r190, 1;
selp.b32 %r114, %r113, %r112, %p12;
add.s32 %r115, %r190, 1;
setp.lt.u32 %p13, %r115, %r9;
selp.b32 %r116, %r115, %r189, %p13;
mad.lo.s32 %r117, %r114, %r4, %r7;
mul.wide.u32 %rd13, %r117, 4;
add.s64 %rd14, %rd2, %rd13;
mad.lo.s32 %r118, %r116, %r4, %r7;
mul.wide.u32 %rd15, %r118, 4;
add.s64 %rd16, %rd2, %rd15;
ld.f32 %f9, [%rd16];
ld.f32 %f10, [%rd14];
add.rn.f32 %f11, %f10, %f9;
mul.wide.u32 %rd17, %r187, 4;
add.s64 %rd18, %rd2, %rd17;
ld.f32 %f12, [%rd18];
mov.f32 %f13, 0fBEE31355;
fma.rn.f32 %f14, %f11, %f13, %f12;
st.f32 [%rd18], %f14;
add.s32 %r189, %r189, -2;
add.s32 %r188, %r188, -2;
add.s32 %r187, %r187, %r23;
add.s32 %r190, %r190, 2;
setp.lt.u32 %p14, %r190, %r9;
@%p14 bra $L__BB13_10;
$L__BB13_11:
// ---- Lifting step 2: rows %r202, %r202+2, ... ---------------------------
// row[i] += (prev + next) * 0fBF620676 (about -0.882911)
// %p15 is reused by step 4 to skip the same row set.
setp.ge.u32 %p15, %r202, %r9;
@%p15 bra $L__BB13_14;
shl.b32 %r119, %r8, 1;
add.s32 %r120, %r119, -3;
sub.s32 %r121, %r120, %r202;
shl.b32 %r122, %r94, 1;
sub.s32 %r193, %r121, %r122;
neg.s32 %r192, %r202;
mad.lo.s32 %r191, %r4, %r202, %r7;
shl.b32 %r123, %r93, 1;
shl.b32 %r124, %r3, 1;
sub.s32 %r35, %r124, %r123;
mov.u32 %r194, %r202;
$L__BB13_13:
add.s32 %r125, %r192, 1;
add.s32 %r126, %r194, -1;
setp.gt.u32 %p16, %r194, 1;
selp.b32 %r127, %r126, %r125, %p16;
add.s32 %r128, %r194, 1;
setp.lt.u32 %p17, %r128, %r9;
selp.b32 %r129, %r128, %r193, %p17;
mad.lo.s32 %r130, %r127, %r4, %r7;
mul.wide.u32 %rd19, %r130, 4;
add.s64 %rd20, %rd2, %rd19;
mad.lo.s32 %r131, %r129, %r4, %r7;
mul.wide.u32 %rd21, %r131, 4;
add.s64 %rd22, %rd2, %rd21;
ld.f32 %f15, [%rd22];
ld.f32 %f16, [%rd20];
add.rn.f32 %f17, %f16, %f15;
mul.wide.u32 %rd23, %r191, 4;
add.s64 %rd24, %rd2, %rd23;
ld.f32 %f18, [%rd24];
mov.f32 %f19, 0fBF620676;
fma.rn.f32 %f20, %f17, %f19, %f18;
st.f32 [%rd24], %f20;
add.s32 %r193, %r193, -2;
add.s32 %r192, %r192, -2;
add.s32 %r191, %r191, %r35;
add.s32 %r194, %r194, 2;
setp.lt.u32 %p18, %r194, %r9;
@%p18 bra $L__BB13_13;
$L__BB13_14:
// ---- Lifting step 3: rows %r198, %r198+2, ... ---------------------------
// row[i] += (prev + next) * 0f3D5901AE (about 0.052980)
// This loop advances %r198 itself; its original value is not needed again
// on this path.
@%p11 bra $L__BB13_17;
shl.b32 %r132, %r8, 1;
add.s32 %r133, %r132, -3;
sub.s32 %r134, %r133, %r198;
shl.b32 %r135, %r94, 1;
sub.s32 %r197, %r134, %r135;
neg.s32 %r196, %r198;
mad.lo.s32 %r195, %r4, %r198, %r7;
shl.b32 %r136, %r93, 1;
shl.b32 %r137, %r3, 1;
sub.s32 %r47, %r137, %r136;
$L__BB13_16:
add.s32 %r138, %r196, 1;
add.s32 %r139, %r198, -1;
setp.gt.u32 %p20, %r198, 1;
selp.b32 %r140, %r139, %r138, %p20;
add.s32 %r141, %r198, 1;
setp.lt.u32 %p21, %r141, %r9;
selp.b32 %r142, %r141, %r197, %p21;
mad.lo.s32 %r143, %r140, %r4, %r7;
mul.wide.u32 %rd25, %r143, 4;
add.s64 %rd26, %rd2, %rd25;
mad.lo.s32 %r144, %r142, %r4, %r7;
mul.wide.u32 %rd27, %r144, 4;
add.s64 %rd28, %rd2, %rd27;
ld.f32 %f21, [%rd28];
ld.f32 %f22, [%rd26];
add.rn.f32 %f23, %f22, %f21;
mul.wide.u32 %rd29, %r195, 4;
add.s64 %rd30, %rd2, %rd29;
ld.f32 %f24, [%rd30];
mov.f32 %f25, 0f3D5901AE;
fma.rn.f32 %f26, %f23, %f25, %f24;
st.f32 [%rd30], %f26;
add.s32 %r197, %r197, -2;
add.s32 %r196, %r196, -2;
add.s32 %r195, %r195, %r47;
add.s32 %r198, %r198, 2;
setp.lt.u32 %p22, %r198, %r9;
@%p22 bra $L__BB13_16;
$L__BB13_17:
// ---- Lifting step 4: rows %r202, %r202+2, ... ---------------------------
// row[i] += (prev + next) * 0f3FCB0673 (about 1.586134)
// This loop advances %r202 itself.
@%p15 bra $L__BB13_28;
shl.b32 %r145, %r8, 1;
add.s32 %r146, %r145, -3;
sub.s32 %r147, %r146, %r202;
shl.b32 %r148, %r94, 1;
sub.s32 %r201, %r147, %r148;
neg.s32 %r200, %r202;
mad.lo.s32 %r199, %r4, %r202, %r7;
shl.b32 %r149, %r93, 1;
shl.b32 %r150, %r3, 1;
sub.s32 %r59, %r150, %r149;
$L__BB13_19:
add.s32 %r151, %r200, 1;
add.s32 %r152, %r202, -1;
setp.gt.u32 %p24, %r202, 1;
selp.b32 %r153, %r152, %r151, %p24;
add.s32 %r154, %r202, 1;
setp.lt.u32 %p25, %r154, %r9;
selp.b32 %r155, %r154, %r201, %p25;
mad.lo.s32 %r156, %r153, %r4, %r7;
mul.wide.u32 %rd31, %r156, 4;
add.s64 %rd32, %rd2, %rd31;
mad.lo.s32 %r157, %r155, %r4, %r7;
mul.wide.u32 %rd33, %r157, 4;
add.s64 %rd34, %rd2, %rd33;
ld.f32 %f27, [%rd34];
ld.f32 %f28, [%rd32];
add.rn.f32 %f29, %f28, %f27;
mul.wide.u32 %rd35, %r199, 4;
add.s64 %rd36, %rd2, %rd35;
ld.f32 %f30, [%rd36];
mov.f32 %f31, 0f3FCB0673;
fma.rn.f32 %f32, %f29, %f31, %f30;
st.f32 [%rd36], %f32;
add.s32 %r201, %r201, -2;
add.s32 %r200, %r200, -2;
add.s32 %r199, %r199, %r59;
add.s32 %r202, %r202, 2;
setp.lt.u32 %p26, %r202, %r9;
@%p26 bra $L__BB13_19;
bra.uni $L__BB13_28;
$L__BB13_20:
// ==== Floor-based path (mode flag == 0) ==================================
// Step A, rows %r198, %r198+2, ...:
//   row[i] -= floor((prev + next) * 0.25 + 0.5)
// cvt.rmi rounds toward minus infinity, i.e. floor.
setp.ge.u32 %p27, %r198, %r9;
@%p27 bra $L__BB13_23;
shl.b32 %r158, %r8, 1;
add.s32 %r159, %r158, -3;
sub.s32 %r160, %r159, %r198;
shl.b32 %r161, %r94, 1;
sub.s32 %r205, %r160, %r161;
neg.s32 %r204, %r198;
mad.lo.s32 %r203, %r4, %r198, %r7;
shl.b32 %r162, %r93, 1;
shl.b32 %r163, %r3, 1;
sub.s32 %r71, %r163, %r162;
$L__BB13_22:
add.s32 %r164, %r204, 1;
add.s32 %r165, %r198, -1;
setp.gt.u32 %p28, %r198, 1;
selp.b32 %r166, %r165, %r164, %p28;
add.s32 %r167, %r198, 1;
setp.lt.u32 %p29, %r167, %r9;
selp.b32 %r168, %r167, %r205, %p29;
mad.lo.s32 %r169, %r166, %r4, %r7;
mul.wide.u32 %rd37, %r169, 4;
add.s64 %rd38, %rd2, %rd37;
mad.lo.s32 %r170, %r168, %r4, %r7;
mul.wide.u32 %rd39, %r170, 4;
add.s64 %rd40, %rd2, %rd39;
mul.wide.u32 %rd41, %r203, 4;
add.s64 %rd42, %rd2, %rd41;
ld.f32 %f33, [%rd40];
ld.f32 %f34, [%rd38];
add.rn.f32 %f35, %f34, %f33;
mov.f32 %f36, 0f3F000000;
mov.f32 %f37, 0f3E800000;
fma.rn.f32 %f38, %f35, %f37, %f36;
cvt.rmi.f32.f32 %f39, %f38;
ld.f32 %f40, [%rd42];
sub.rn.f32 %f41, %f40, %f39;
st.f32 [%rd42], %f41;
add.s32 %r205, %r205, -2;
add.s32 %r204, %r204, -2;
add.s32 %r203, %r203, %r71;
add.s32 %r198, %r198, 2;
setp.lt.u32 %p30, %r198, %r9;
@%p30 bra $L__BB13_22;
$L__BB13_23:
// Step B, rows %r202, %r202+2, ...:
//   row[i] += floor((prev + next) * 0.5)
setp.ge.u32 %p31, %r202, %r9;
@%p31 bra $L__BB13_28;
shl.b32 %r171, %r8, 1;
add.s32 %r172, %r171, -3;
sub.s32 %r173, %r172, %r202;
shl.b32 %r174, %r94, 1;
sub.s32 %r209, %r173, %r174;
neg.s32 %r208, %r202;
mad.lo.s32 %r207, %r4, %r202, %r7;
shl.b32 %r175, %r93, 1;
shl.b32 %r176, %r3, 1;
sub.s32 %r83, %r176, %r175;
$L__BB13_25:
add.s32 %r177, %r208, 1;
add.s32 %r178, %r202, -1;
setp.gt.u32 %p32, %r202, 1;
selp.b32 %r179, %r178, %r177, %p32;
add.s32 %r180, %r202, 1;
setp.lt.u32 %p33, %r180, %r9;
selp.b32 %r181, %r180, %r209, %p33;
mad.lo.s32 %r182, %r179, %r4, %r7;
mul.wide.u32 %rd43, %r182, 4;
add.s64 %rd44, %rd2, %rd43;
mad.lo.s32 %r183, %r181, %r4, %r7;
mul.wide.u32 %rd45, %r183, 4;
add.s64 %rd46, %rd2, %rd45;
mul.wide.u32 %rd47, %r207, 4;
add.s64 %rd48, %rd2, %rd47;
ld.f32 %f42, [%rd46];
ld.f32 %f43, [%rd44];
add.rn.f32 %f44, %f43, %f42;
mul.rn.f32 %f45, %f44, 0f3F000000;
cvt.rmi.f32.f32 %f46, %f45;
ld.f32 %f47, [%rd48];
add.rn.f32 %f48, %f47, %f46;
st.f32 [%rd48], %f48;
add.s32 %r209, %r209, -2;
add.s32 %r208, %r208, -2;
add.s32 %r207, %r207, %r83;
add.s32 %r202, %r202, 2;
setp.lt.u32 %p34, %r202, %r9;
@%p34 bra $L__BB13_25;
$L__BB13_28:
ret;
}
// .globl j2k_idwt_vertical_53_multi
// j2k_idwt_vertical_53_multi
// Floor-based two-step vertical lifting of one column, staged in shared memory.
// One thread block per column (%ctaid.x), one thread per row (%tid.x).
//
// param_0: global pointer to an array of 128-byte records, indexed by %ctaid.y.
//          Fields read (byte offsets): +32 u64 data pointer (generic ld/st),
//          +40 {%r14,%r15}, +48 {%r10,%r11}.
// Derived: %r3 = %r10 - %r14 (columns / row stride), %r4 = %r11 - %r15 (rows).
//
// The shared staging buffer holds 2048 bytes = 512 floats and is indexed by
// row. A column with more than 512 rows is therefore rejected up front; without
// that check the neighbour reads and the staging stores would run past the end
// of the buffer. The check is uniform across the block (it depends only on the
// record), so no thread reaches a bar.sync that others skip. Oversized columns
// are left untouched, the same policy the 4-column 9/7 kernel applies with its
// own 256-row limit.
//
// Border handling for the two neighbours of row i:
//   previous = i-1 when i > 1, otherwise 1-i
//   next     = i+1 when i+1 < rows, otherwise 2*rows-3-i
.visible .entry j2k_idwt_vertical_53_multi(
.param .u64 j2k_idwt_vertical_53_multi_param_0
)
{
.reg .pred %p<20>;
.reg .f32 %f<22>;
.reg .b32 %r<52>;
.reg .b64 %rd<10>;
// demoted variable
.shared .align 4 .b8 _ZZ35j2k_idwt_vertical_53_multiE14column_samples[2048];
ld.param.u64 %rd3, [j2k_idwt_vertical_53_multi_param_0];
cvta.to.global.u64 %rd4, %rd3;
mov.u32 %r1, %tid.x;
mov.u32 %r9, %ctaid.y;
mul.wide.u32 %rd5, %r9, 128;
add.s64 %rd6, %rd4, %rd5;
ld.global.v2.u32 {%r10, %r11}, [%rd6+48];
ld.global.v2.u32 {%r14, %r15}, [%rd6+40];
ld.global.u64 %rd1, [%rd6+32];
sub.s32 %r3, %r10, %r14;
sub.s32 %r4, %r11, %r15;
// Capacity guard: 2048 bytes / 4 bytes per float = 512 rows at most.
// %p19 is covered by the existing %p<20> declaration and is otherwise unused.
setp.gt.u32 %p19, %r4, 512;
@%p19 bra $L__BB14_14;
mov.u32 %r5, %ctaid.x;
// Blocks beyond the last column exit as a whole.
setp.ge.u32 %p1, %r5, %r3;
@%p1 bra $L__BB14_14;
// %p2: this thread has no row. %rd2: address of element (row %r1, column %r5).
// %r6: shared-memory address of this thread's staged sample.
setp.ge.u32 %p2, %r1, %r4;
mad.lo.s32 %r17, %r3, %r1, %r5;
mul.wide.u32 %rd7, %r17, 4;
add.s64 %rd2, %rd1, %rd7;
shl.b32 %r18, %r1, 2;
mov.u32 %r19, _ZZ35j2k_idwt_vertical_53_multiE14column_samples;
add.s32 %r6, %r19, %r18;
@%p2 bra $L__BB14_3;
ld.f32 %f1, [%rd2];
st.shared.f32 [%r6], %f1;
$L__BB14_3:
bar.sync 0;
setp.eq.s32 %p3, %r4, 1;
@%p3 bra $L__BB14_10;
bra.uni $L__BB14_4;
$L__BB14_10:
// Single-row column: thread 0 halves the sample when %r15 is odd, then
// writes sample 0 back to column %r5 of row 0.
setp.ne.s32 %p15, %r1, 0;
and.b32 %r51, %r15, 1;
setp.eq.b32 %p16, %r51, 1;
not.pred %p17, %p16;
or.pred %p18, %p15, %p17;
@%p18 bra $L__BB14_12;
ld.shared.f32 %f19, [_ZZ35j2k_idwt_vertical_53_multiE14column_samples];
mul.rn.f32 %f20, %f19, 0f3F000000;
st.shared.f32 [_ZZ35j2k_idwt_vertical_53_multiE14column_samples], %f20;
$L__BB14_12:
bar.sync 0;
@%p15 bra $L__BB14_14;
ld.shared.f32 %f21, [_ZZ35j2k_idwt_vertical_53_multiE14column_samples];
mul.wide.u32 %rd8, %r5, 4;
add.s64 %rd9, %rd1, %rd8;
st.f32 [%rd9], %f21;
bra.uni $L__BB14_14;
$L__BB14_4:
// Step A - rows whose parity equals (%r15 & 1):
//   s[i] -= floor((prev + next) * 0.25 + 0.5)
and.b32 %r7, %r15, 1;
and.b32 %r8, %r1, 1;
setp.ne.s32 %p5, %r8, %r7;
or.pred %p6, %p2, %p5;
@%p6 bra $L__BB14_6;
setp.gt.u32 %p7, %r1, 1;
mov.u32 %r20, 1;
sub.s32 %r21, %r20, %r1;
add.s32 %r22, %r1, -1;
selp.b32 %r23, %r22, %r21, %p7;
add.s32 %r24, %r1, 1;
setp.lt.u32 %p8, %r24, %r4;
mov.u32 %r25, -3;
sub.s32 %r26, %r25, %r1;
add.s32 %r27, %r26, %r4;
add.s32 %r28, %r27, %r4;
selp.b32 %r29, %r24, %r28, %p8;
shl.b32 %r30, %r23, 2;
add.s32 %r32, %r19, %r30;
shl.b32 %r33, %r29, 2;
add.s32 %r34, %r19, %r33;
ld.shared.f32 %f2, [%r34];
ld.shared.f32 %f3, [%r32];
add.rn.f32 %f4, %f3, %f2;
mov.f32 %f5, 0f3F000000;
mov.f32 %f6, 0f3E800000;
fma.rn.f32 %f7, %f4, %f6, %f5;
cvt.rmi.f32.f32 %f8, %f7;
ld.shared.f32 %f9, [%r6];
sub.rn.f32 %f10, %f9, %f8;
st.shared.f32 [%r6], %f10;
$L__BB14_6:
// Step B reads the rows step A just wrote, hence the barrier.
bar.sync 0;
// Step B - rows of the opposite parity:
//   s[i] += floor((prev + next) * 0.5)
xor.b32 %r35, %r7, 1;
setp.ne.s32 %p10, %r8, %r35;
or.pred %p11, %p2, %p10;
@%p11 bra $L__BB14_8;
setp.gt.u32 %p12, %r1, 1;
mov.u32 %r36, 1;
sub.s32 %r37, %r36, %r1;
add.s32 %r38, %r1, -1;
selp.b32 %r39, %r38, %r37, %p12;
add.s32 %r40, %r1, 1;
setp.lt.u32 %p13, %r40, %r4;
mov.u32 %r41, -3;
sub.s32 %r42, %r41, %r1;
add.s32 %r43, %r42, %r4;
add.s32 %r44, %r43, %r4;
selp.b32 %r45, %r40, %r44, %p13;
shl.b32 %r46, %r39, 2;
add.s32 %r48, %r19, %r46;
shl.b32 %r49, %r45, 2;
add.s32 %r50, %r19, %r49;
ld.shared.f32 %f11, [%r50];
ld.shared.f32 %f12, [%r48];
add.rn.f32 %f13, %f12, %f11;
mul.rn.f32 %f14, %f13, 0f3F000000;
cvt.rmi.f32.f32 %f15, %f14;
ld.shared.f32 %f16, [%r6];
add.rn.f32 %f17, %f16, %f15;
st.shared.f32 [%r6], %f17;
$L__BB14_8:
bar.sync 0;
// Write the finished sample back; threads without a row skip the store.
@%p2 bra $L__BB14_14;
ld.shared.f32 %f18, [%r6];
st.f32 [%rd2], %f18;
$L__BB14_14:
ret;
}
// .globl j2k_idwt_vertical_97_multi
// j2k_idwt_vertical_97_multi
// Scaled four-step vertical lifting of one column, staged in shared memory.
// One thread block per column (%ctaid.x), one thread per row (%tid.x).
//
// param_0: global pointer to an array of 128-byte records, indexed by %ctaid.y.
//          Fields read (byte offsets): +32 u64 data pointer (generic ld/st),
//          +40 {%r14,%r15}, +48 {%r10,%r11}.
// Derived: %r3 = %r10 - %r14 (columns / row stride), %r4 = %r11 - %r15 (rows).
//
// The shared staging buffer holds 2048 bytes = 512 floats and is indexed by
// row. A column with more than 512 rows is therefore rejected up front; without
// that check the neighbour reads and the staging stores would run past the end
// of the buffer. The check is uniform across the block (it depends only on the
// record), so every bar.sync below is still reached by all threads or by none.
// Oversized columns are left untouched, the same policy the 4-column kernel
// applies with its own 256-row limit.
//
// Border handling for the two neighbours of row i:
//   previous = i-1 when i > 1, otherwise 1-i
//   next     = i+1 when i+1 < rows, otherwise 2*rows-3-i
.visible .entry j2k_idwt_vertical_97_multi(
.param .u64 j2k_idwt_vertical_97_multi_param_0
)
{
.reg .pred %p<31>;
.reg .f32 %f<35>;
.reg .b32 %r<83>;
.reg .b64 %rd<10>;
// demoted variable
.shared .align 4 .b8 _ZZ35j2k_idwt_vertical_97_multiE14column_samples[2048];
ld.param.u64 %rd3, [j2k_idwt_vertical_97_multi_param_0];
cvta.to.global.u64 %rd4, %rd3;
mov.u32 %r1, %tid.x;
mov.u32 %r9, %ctaid.y;
mul.wide.u32 %rd5, %r9, 128;
add.s64 %rd6, %rd4, %rd5;
ld.global.v2.u32 {%r10, %r11}, [%rd6+48];
ld.global.v2.u32 {%r14, %r15}, [%rd6+40];
ld.global.u64 %rd1, [%rd6+32];
sub.s32 %r3, %r10, %r14;
sub.s32 %r4, %r11, %r15;
// Capacity guard: 2048 bytes / 4 bytes per float = 512 rows at most.
// %p30 is covered by the existing %p<31> declaration and is otherwise unused.
setp.gt.u32 %p30, %r4, 512;
@%p30 bra $L__BB15_20;
mov.u32 %r5, %ctaid.x;
// Blocks beyond the last column exit as a whole.
setp.ge.u32 %p3, %r5, %r3;
@%p3 bra $L__BB15_20;
// %p4: this thread has no row. %rd2: address of element (row %r1, column %r5).
// %r6: shared-memory address of this thread's staged sample.
setp.ge.u32 %p4, %r1, %r4;
mad.lo.s32 %r17, %r3, %r1, %r5;
mul.wide.u32 %rd7, %r17, 4;
add.s64 %rd2, %rd1, %rd7;
shl.b32 %r18, %r1, 2;
mov.u32 %r19, _ZZ35j2k_idwt_vertical_97_multiE14column_samples;
add.s32 %r6, %r19, %r18;
@%p4 bra $L__BB15_3;
ld.f32 %f1, [%rd2];
st.shared.f32 [%r6], %f1;
$L__BB15_3:
bar.sync 0;
setp.eq.s32 %p5, %r4, 1;
@%p5 bra $L__BB15_16;
bra.uni $L__BB15_4;
$L__BB15_16:
// Single-row column: thread 0 halves the sample when %r15 is odd, then
// writes sample 0 back to column %r5 of row 0.
setp.ne.s32 %p26, %r1, 0;
and.b32 %r82, %r15, 1;
setp.eq.b32 %p27, %r82, 1;
not.pred %p28, %p27;
or.pred %p29, %p26, %p28;
@%p29 bra $L__BB15_18;
ld.shared.f32 %f32, [_ZZ35j2k_idwt_vertical_97_multiE14column_samples];
mul.rn.f32 %f33, %f32, 0f3F000000;
st.shared.f32 [_ZZ35j2k_idwt_vertical_97_multiE14column_samples], %f33;
$L__BB15_18:
bar.sync 0;
@%p26 bra $L__BB15_20;
ld.shared.f32 %f34, [_ZZ35j2k_idwt_vertical_97_multiE14column_samples];
mul.wide.u32 %rd8, %r5, 4;
add.s64 %rd9, %rd1, %rd8;
st.f32 [%rd9], %f34;
bra.uni $L__BB15_20;
$L__BB15_4:
// %p6: this thread owns a row. %r7: parity of %r15.
setp.lt.u32 %p6, %r1, %r4;
and.b32 %r7, %r15, 1;
@%p6 bra $L__BB15_5;
bra.uni $L__BB15_6;
$L__BB15_5:
// Scaling pass. Odd rows take %f2, even rows take %f3; which of
// 0f3F5019C3 (about 0.812893) and 0f3F9D7658 (about 1.230174) each one
// holds depends on the parity %r7.
setp.eq.s32 %p7, %r7, 0;
selp.f32 %f2, 0f3F5019C3, 0f3F9D7658, %p7;
selp.f32 %f3, 0f3F9D7658, 0f3F5019C3, %p7;
and.b32 %r20, %r1, 1;
setp.eq.b32 %p8, %r20, 1;
selp.f32 %f4, %f2, %f3, %p8;
ld.shared.f32 %f5, [%r6];
mul.rn.f32 %f6, %f4, %f5;
st.shared.f32 [%r6], %f6;
$L__BB15_6:
bar.sync 0;
// Step 1 - rows with parity %r7:
//   s[i] += (prev + next) * 0fBEE31355 (about -0.443507)
// %p11 (= not %p1) is reused to gate step 3.
and.b32 %r8, %r1, 1;
setp.eq.s32 %p10, %r8, %r7;
and.pred %p1, %p6, %p10;
not.pred %p11, %p1;
@%p11 bra $L__BB15_8;
setp.gt.u32 %p12, %r1, 1;
mov.u32 %r21, 1;
sub.s32 %r22, %r21, %r1;
add.s32 %r23, %r1, -1;
selp.b32 %r24, %r23, %r22, %p12;
add.s32 %r25, %r1, 1;
setp.lt.u32 %p13, %r25, %r4;
mov.u32 %r26, -3;
sub.s32 %r27, %r26, %r1;
add.s32 %r28, %r27, %r4;
add.s32 %r29, %r28, %r4;
selp.b32 %r30, %r25, %r29, %p13;
shl.b32 %r31, %r24, 2;
add.s32 %r33, %r19, %r31;
shl.b32 %r34, %r30, 2;
add.s32 %r35, %r19, %r34;
ld.shared.f32 %f7, [%r35];
ld.shared.f32 %f8, [%r33];
add.rn.f32 %f9, %f8, %f7;
ld.shared.f32 %f10, [%r6];
mov.f32 %f11, 0fBEE31355;
fma.rn.f32 %f12, %f9, %f11, %f10;
st.shared.f32 [%r6], %f12;
$L__BB15_8:
bar.sync 0;
// Step 2 - rows with the opposite parity:
//   s[i] += (prev + next) * 0fBF620676 (about -0.882911)
// %p16 (= not %p2) is reused to gate step 4.
xor.b32 %r36, %r7, 1;
setp.eq.s32 %p15, %r8, %r36;
and.pred %p2, %p6, %p15;
not.pred %p16, %p2;
@%p16 bra $L__BB15_10;
setp.gt.u32 %p17, %r1, 1;
mov.u32 %r37, 1;
sub.s32 %r38, %r37, %r1;
add.s32 %r39, %r1, -1;
selp.b32 %r40, %r39, %r38, %p17;
add.s32 %r41, %r1, 1;
setp.lt.u32 %p18, %r41, %r4;
mov.u32 %r42, -3;
sub.s32 %r43, %r42, %r1;
add.s32 %r44, %r43, %r4;
add.s32 %r45, %r44, %r4;
selp.b32 %r46, %r41, %r45, %p18;
shl.b32 %r47, %r40, 2;
add.s32 %r49, %r19, %r47;
shl.b32 %r50, %r46, 2;
add.s32 %r51, %r19, %r50;
ld.shared.f32 %f13, [%r51];
ld.shared.f32 %f14, [%r49];
add.rn.f32 %f15, %f14, %f13;
ld.shared.f32 %f16, [%r6];
mov.f32 %f17, 0fBF620676;
fma.rn.f32 %f18, %f15, %f17, %f16;
st.shared.f32 [%r6], %f18;
$L__BB15_10:
bar.sync 0;
// Step 3 - same rows as step 1:
//   s[i] += (prev + next) * 0f3D5901AE (about 0.052980)
@%p11 bra $L__BB15_12;
setp.gt.u32 %p20, %r1, 1;
mov.u32 %r52, 1;
sub.s32 %r53, %r52, %r1;
add.s32 %r54, %r1, -1;
selp.b32 %r55, %r54, %r53, %p20;
add.s32 %r56, %r1, 1;
setp.lt.u32 %p21, %r56, %r4;
mov.u32 %r57, -3;
sub.s32 %r58, %r57, %r1;
add.s32 %r59, %r58, %r4;
add.s32 %r60, %r59, %r4;
selp.b32 %r61, %r56, %r60, %p21;
shl.b32 %r62, %r55, 2;
add.s32 %r64, %r19, %r62;
shl.b32 %r65, %r61, 2;
add.s32 %r66, %r19, %r65;
ld.shared.f32 %f19, [%r66];
ld.shared.f32 %f20, [%r64];
add.rn.f32 %f21, %f20, %f19;
ld.shared.f32 %f22, [%r6];
mov.f32 %f23, 0f3D5901AE;
fma.rn.f32 %f24, %f21, %f23, %f22;
st.shared.f32 [%r6], %f24;
$L__BB15_12:
bar.sync 0;
// Step 4 - same rows as step 2:
//   s[i] += (prev + next) * 0f3FCB0673 (about 1.586134)
@%p16 bra $L__BB15_14;
setp.gt.u32 %p23, %r1, 1;
mov.u32 %r67, 1;
sub.s32 %r68, %r67, %r1;
add.s32 %r69, %r1, -1;
selp.b32 %r70, %r69, %r68, %p23;
add.s32 %r71, %r1, 1;
setp.lt.u32 %p24, %r71, %r4;
mov.u32 %r72, -3;
sub.s32 %r73, %r72, %r1;
add.s32 %r74, %r73, %r4;
add.s32 %r75, %r74, %r4;
selp.b32 %r76, %r71, %r75, %p24;
shl.b32 %r77, %r70, 2;
add.s32 %r79, %r19, %r77;
shl.b32 %r80, %r76, 2;
add.s32 %r81, %r19, %r80;
ld.shared.f32 %f25, [%r81];
ld.shared.f32 %f26, [%r79];
add.rn.f32 %f27, %f26, %f25;
ld.shared.f32 %f28, [%r6];
mov.f32 %f29, 0f3FCB0673;
fma.rn.f32 %f30, %f27, %f29, %f28;
st.shared.f32 [%r6], %f30;
$L__BB15_14:
bar.sync 0;
// Write the finished sample back; threads without a row skip the store.
@%p4 bra $L__BB15_20;
ld.shared.f32 %f31, [%r6];
st.f32 [%rd2], %f31;
$L__BB15_20:
ret;
}
// .globl j2k_idwt_vertical_97_multi_cols4
// j2k_idwt_vertical_97_multi_cols4
// Scaled four-step vertical lifting, four adjacent columns per thread block,
// staged in shared memory.
//   row    = %tid.y                      (%r1)
//   column = %ctaid.x * 4 + %tid.x       (%r3)
//
// param_0: global pointer to an array of 128-byte records, indexed by %ctaid.y.
//          Fields read (byte offsets): +32 u64 data pointer (generic ld/st),
//          +40 {%r18,%r19}, +48 {%r14,%r15}.
// Derived: %r5 = %r14 - %r18 (columns / row stride), %r6 = %r15 - %r19 (rows).
//
// Shared layout: 4096 bytes, addressed as row*16 + %tid.x*4, i.e. four floats
// per row. The kernel returns immediately when the row count exceeds 256,
// which is the number of 16-byte rows that fit in 4096 bytes.
//
// Border handling for the two neighbours of row i:
//   previous = i-1 when i > 1, otherwise 1-i
//   next     = i+1 when i+1 < rows, otherwise 2*rows-3-i
.visible .entry j2k_idwt_vertical_97_multi_cols4(
.param .u64 j2k_idwt_vertical_97_multi_cols4_param_0
)
{
.reg .pred %p<30>;
.reg .f32 %f<35>;
.reg .b32 %r<102>;
.reg .b64 %rd<10>;
// demoted variable
.shared .align 4 .b8 _ZZ41j2k_idwt_vertical_97_multi_cols4E14column_samples[4096];
ld.param.u64 %rd3, [j2k_idwt_vertical_97_multi_cols4_param_0];
cvta.to.global.u64 %rd4, %rd3;
mov.u32 %r1, %tid.y;
mov.u32 %r11, %ctaid.x;
shl.b32 %r12, %r11, 2;
mov.u32 %r2, %tid.x;
add.s32 %r3, %r12, %r2;
mov.u32 %r13, %ctaid.y;
mul.wide.u32 %rd5, %r13, 128;
add.s64 %rd6, %rd4, %rd5;
ld.global.v2.u32 {%r14, %r15}, [%rd6+48];
ld.global.v2.u32 {%r18, %r19}, [%rd6+40];
ld.global.u64 %rd1, [%rd6+32];
sub.s32 %r5, %r14, %r18;
sub.s32 %r6, %r15, %r19;
// Capacity guard; the condition depends only on the record, so the whole
// block takes the same branch.
setp.gt.u32 %p4, %r6, 256;
@%p4 bra $L__BB16_20;
// %p1: this thread maps to a real element (column and row both in range).
// %rd2: address of that element. %r7: its slot in shared memory.
// %p7 = not %p1 gates the load, the single-row path and the final store.
setp.lt.u32 %p5, %r3, %r5;
setp.lt.u32 %p6, %r1, %r6;
and.pred %p1, %p5, %p6;
mad.lo.s32 %r21, %r5, %r1, %r3;
mul.wide.u32 %rd7, %r21, 4;
add.s64 %rd2, %rd1, %rd7;
shl.b32 %r22, %r1, 4;
mov.u32 %r23, _ZZ41j2k_idwt_vertical_97_multi_cols4E14column_samples;
add.s32 %r24, %r23, %r22;
shl.b32 %r25, %r2, 2;
add.s32 %r7, %r24, %r25;
not.pred %p7, %p1;
@%p7 bra $L__BB16_3;
ld.f32 %f1, [%rd2];
st.shared.f32 [%r7], %f1;
$L__BB16_3:
bar.sync 0;
// %r8: parity of %r19.
and.b32 %r8, %r19, 1;
setp.eq.s32 %p8, %r6, 1;
@%p8 bra $L__BB16_16;
bra.uni $L__BB16_4;
$L__BB16_16:
// Single-row case: %r10 addresses row 0 of this thread's column. The sample
// is halved when %r19 is odd, then written back to element %r3 of row 0.
setp.eq.s32 %p26, %r8, 0;
add.s32 %r10, %r23, %r25;
or.pred %p28, %p26, %p7;
@%p28 bra $L__BB16_18;
ld.shared.f32 %f32, [%r10];
mul.rn.f32 %f33, %f32, 0f3F000000;
st.shared.f32 [%r10], %f33;
$L__BB16_18:
bar.sync 0;
@%p7 bra $L__BB16_20;
ld.shared.f32 %f34, [%r10];
mul.wide.u32 %rd8, %r3, 4;
add.s64 %rd9, %rd1, %rd8;
st.f32 [%rd9], %f34;
bra.uni $L__BB16_20;
$L__BB16_4:
@%p1 bra $L__BB16_5;
bra.uni $L__BB16_6;
$L__BB16_5:
// Scaling pass. Odd rows take %f2, even rows take %f3; which of
// 0f3F5019C3 (about 0.812893) and 0f3F9D7658 (about 1.230174) each one
// holds depends on the parity %r8.
setp.eq.s32 %p9, %r8, 0;
selp.f32 %f2, 0f3F5019C3, 0f3F9D7658, %p9;
selp.f32 %f3, 0f3F9D7658, 0f3F5019C3, %p9;
and.b32 %r26, %r1, 1;
setp.eq.b32 %p10, %r26, 1;
selp.f32 %f4, %f2, %f3, %p10;
ld.shared.f32 %f5, [%r7];
mul.rn.f32 %f6, %f4, %f5;
st.shared.f32 [%r7], %f6;
$L__BB16_6:
and.b32 %r9, %r1, 1;
bar.sync 0;
// Step 1 - rows with parity %r8:
//   s[i] += (prev + next) * 0fBEE31355 (about -0.443507)
// %p12 (= not %p2) is reused to gate step 3.
setp.eq.s32 %p11, %r9, %r8;
and.pred %p2, %p11, %p1;
not.pred %p12, %p2;
@%p12 bra $L__BB16_8;
setp.gt.u32 %p13, %r1, 1;
mov.u32 %r27, 1;
sub.s32 %r28, %r27, %r1;
add.s32 %r29, %r1, -1;
selp.b32 %r30, %r29, %r28, %p13;
add.s32 %r31, %r1, 1;
setp.lt.u32 %p14, %r31, %r6;
mov.u32 %r32, -3;
sub.s32 %r33, %r32, %r1;
add.s32 %r34, %r33, %r6;
add.s32 %r35, %r34, %r6;
selp.b32 %r36, %r31, %r35, %p14;
// Neighbour addresses: row index * 16 bytes plus this thread's column slot.
shl.b32 %r37, %r30, 4;
add.s32 %r39, %r23, %r37;
add.s32 %r41, %r39, %r25;
shl.b32 %r42, %r36, 4;
add.s32 %r43, %r23, %r42;
add.s32 %r44, %r43, %r25;
ld.shared.f32 %f7, [%r44];
ld.shared.f32 %f8, [%r41];
add.rn.f32 %f9, %f8, %f7;
ld.shared.f32 %f10, [%r7];
mov.f32 %f11, 0fBEE31355;
fma.rn.f32 %f12, %f9, %f11, %f10;
st.shared.f32 [%r7], %f12;
$L__BB16_8:
bar.sync 0;
// Step 2 - rows with the opposite parity:
//   s[i] += (prev + next) * 0fBF620676 (about -0.882911)
// %p16 (= not %p3) is reused to gate step 4.
xor.b32 %r45, %r8, 1;
setp.eq.s32 %p15, %r9, %r45;
and.pred %p3, %p1, %p15;
not.pred %p16, %p3;
@%p16 bra $L__BB16_10;
setp.gt.u32 %p17, %r1, 1;
mov.u32 %r46, 1;
sub.s32 %r47, %r46, %r1;
add.s32 %r48, %r1, -1;
selp.b32 %r49, %r48, %r47, %p17;
add.s32 %r50, %r1, 1;
setp.lt.u32 %p18, %r50, %r6;
mov.u32 %r51, -3;
sub.s32 %r52, %r51, %r1;
add.s32 %r53, %r52, %r6;
add.s32 %r54, %r53, %r6;
selp.b32 %r55, %r50, %r54, %p18;
shl.b32 %r56, %r49, 4;
add.s32 %r58, %r23, %r56;
add.s32 %r60, %r58, %r25;
shl.b32 %r61, %r55, 4;
add.s32 %r62, %r23, %r61;
add.s32 %r63, %r62, %r25;
ld.shared.f32 %f13, [%r63];
ld.shared.f32 %f14, [%r60];
add.rn.f32 %f15, %f14, %f13;
ld.shared.f32 %f16, [%r7];
mov.f32 %f17, 0fBF620676;
fma.rn.f32 %f18, %f15, %f17, %f16;
st.shared.f32 [%r7], %f18;
$L__BB16_10:
bar.sync 0;
// Step 3 - same rows as step 1:
//   s[i] += (prev + next) * 0f3D5901AE (about 0.052980)
@%p12 bra $L__BB16_12;
setp.gt.u32 %p20, %r1, 1;
mov.u32 %r64, 1;
sub.s32 %r65, %r64, %r1;
add.s32 %r66, %r1, -1;
selp.b32 %r67, %r66, %r65, %p20;
add.s32 %r68, %r1, 1;
setp.lt.u32 %p21, %r68, %r6;
mov.u32 %r69, -3;
sub.s32 %r70, %r69, %r1;
add.s32 %r71, %r70, %r6;
add.s32 %r72, %r71, %r6;
selp.b32 %r73, %r68, %r72, %p21;
shl.b32 %r74, %r67, 4;
add.s32 %r76, %r23, %r74;
add.s32 %r78, %r76, %r25;
shl.b32 %r79, %r73, 4;
add.s32 %r80, %r23, %r79;
add.s32 %r81, %r80, %r25;
ld.shared.f32 %f19, [%r81];
ld.shared.f32 %f20, [%r78];
add.rn.f32 %f21, %f20, %f19;
ld.shared.f32 %f22, [%r7];
mov.f32 %f23, 0f3D5901AE;
fma.rn.f32 %f24, %f21, %f23, %f22;
st.shared.f32 [%r7], %f24;
$L__BB16_12:
bar.sync 0;
// Step 4 - same rows as step 2:
//   s[i] += (prev + next) * 0f3FCB0673 (about 1.586134)
@%p16 bra $L__BB16_14;
setp.gt.u32 %p23, %r1, 1;
mov.u32 %r82, 1;
sub.s32 %r83, %r82, %r1;
add.s32 %r84, %r1, -1;
selp.b32 %r85, %r84, %r83, %p23;
add.s32 %r86, %r1, 1;
setp.lt.u32 %p24, %r86, %r6;
mov.u32 %r87, -3;
sub.s32 %r88, %r87, %r1;
add.s32 %r89, %r88, %r6;
add.s32 %r90, %r89, %r6;
selp.b32 %r91, %r86, %r90, %p24;
shl.b32 %r92, %r85, 4;
add.s32 %r94, %r23, %r92;
add.s32 %r96, %r94, %r25;
shl.b32 %r97, %r91, 4;
add.s32 %r98, %r23, %r97;
add.s32 %r99, %r98, %r25;
ld.shared.f32 %f25, [%r99];
ld.shared.f32 %f26, [%r96];
add.rn.f32 %f27, %f26, %f25;
ld.shared.f32 %f28, [%r7];
mov.f32 %f29, 0f3FCB0673;
fma.rn.f32 %f30, %f27, %f29, %f28;
st.shared.f32 [%r7], %f30;
$L__BB16_14:
bar.sync 0;
// Write the finished sample back; threads without an element skip the store.
@%p7 bra $L__BB16_20;
ld.shared.f32 %f31, [%r7];
st.f32 [%rd2], %f31;
$L__BB16_20:
ret;
}
// .globl j2k_store_gray8
// j2k_store_gray8
// Converts one float sample per thread into an 8-bit output value.
//   param_0: global pointer to the float source array
//   param_1: global pointer to the byte destination array
//   param_2: global pointer to a descriptor; byte offsets used below:
//     +0  source row pitch     +4  source column origin   +8  source row origin
//     +12 elements per row     +16 number of rows
//     +20 dest row pitch       +28 dest column origin     +32 dest row origin
//     +36 f32 added to the sample before rounding
//     +40 u32 bit depth of the incoming sample
// Rounding is "half away from zero": add a 0.5 carrying the value's sign with
// round-toward-zero, then truncate.
.visible .entry j2k_store_gray8(
.param .u64 j2k_store_gray8_param_0,
.param .u64 j2k_store_gray8_param_1,
.param .u64 j2k_store_gray8_param_2
)
{
.reg .pred %oob, %is8, %over15, %mask_ok;
.reg .f32 %half, %src_val, %offset, %shifted, %shalf, %nudged, %whole, %floor0;
.reg .f32 %pix_f, %span_raw, %span, %capped, %ratio, %scaled, %sc_half, %sc_nudged, %sc_whole;
.reg .b32 %rows, %cols, %total, %bdim, %bid, %lane, %gid, %row, %col;
.reg .b32 %src_row0, %src_row, %src_pitch, %src_col0, %src_base, %src_idx;
.reg .b32 %dst_row0, %dst_row, %dst_pitch, %dst_col0, %dst_base, %dst_idx;
.reg .b32 %depth, %ones, %himask, %lomask, %maxcode, %pix;
.reg .b64 %desc, %src, %dst, %src_addr, %dst_off, %dst_addr;
// Flat element id versus rows * elements-per-row.
ld.param.u64 %desc, [j2k_store_gray8_param_2];
cvta.to.global.u64 %desc, %desc;
ld.global.u32 %cols, [%desc+12];
ld.global.u32 %rows, [%desc+16];
mul.lo.s32 %total, %rows, %cols;
mov.u32 %bid, %ctaid.x;
mov.u32 %bdim, %ntid.x;
mov.u32 %lane, %tid.x;
mad.lo.s32 %gid, %bid, %bdim, %lane;
setp.ge.u32 %oob, %gid, %total;
@%oob bra $L_gray8_exit;
// Split the flat id into row and column.
div.u32 %row, %gid, %cols;
rem.u32 %col, %gid, %cols;
// Source element index = (row + row origin) * pitch + column origin + col.
ld.global.u32 %src_pitch, [%desc];
ld.global.u32 %src_col0, [%desc+4];
ld.global.u32 %src_row0, [%desc+8];
add.s32 %src_row, %row, %src_row0;
mad.lo.s32 %src_base, %src_row, %src_pitch, %src_col0;
add.s32 %src_idx, %src_base, %col;
// Destination element index, same formula with the destination fields.
ld.global.u32 %dst_pitch, [%desc+20];
ld.global.u32 %dst_col0, [%desc+28];
ld.global.u32 %dst_row0, [%desc+32];
add.s32 %dst_row, %row, %dst_row0;
mad.lo.s32 %dst_base, %dst_row, %dst_pitch, %dst_col0;
add.s32 %dst_idx, %dst_base, %col;
// Fetch the sample, add the descriptor offset, round half away from zero.
ld.param.u64 %src, [j2k_store_gray8_param_0];
cvta.to.global.u64 %src, %src;
mad.wide.u32 %src_addr, %src_idx, 4, %src;
ld.global.f32 %src_val, [%src_addr];
ld.global.f32 %offset, [%desc+36];
add.rn.f32 %shifted, %offset, %src_val;
mov.f32 %half, 0f3F000000;
copysign.f32 %shalf, %shifted, %half;
add.rz.f32 %nudged, %shifted, %shalf;
cvt.rzi.f32.f32 %whole, %nudged;
// Both depth paths start by clamping at zero.
max.f32 %floor0, %whole, 0f00000000;
ld.global.u32 %depth, [%desc+40];
setp.eq.s32 %is8, %depth, 8;
@!%is8 bra $L_gray8_rescale;
// Depth 8: clamp to 255 and convert.
min.f32 %pix_f, %floor0, 0f437F0000;
cvt.rzi.u32.f32 %pix, %pix_f;
bra.uni $L_gray8_emit;
$L_gray8_rescale:
// Other depths: full scale is 65535 above 15 bits, otherwise (1 << depth) - 1
// with a floor of 1; clamp to it, normalise, scale to 255 and round again.
setp.gt.u32 %over15, %depth, 15;
mov.u32 %ones, -1;
shl.b32 %himask, %ones, %depth;
setp.lt.u32 %mask_ok, %himask, -2;
not.b32 %lomask, %himask;
selp.b32 %maxcode, %lomask, 1, %mask_ok;
cvt.rn.f32.u32 %span_raw, %maxcode;
selp.f32 %span, 0f477FFF00, %span_raw, %over15;
min.f32 %capped, %floor0, %span;
div.rn.f32 %ratio, %capped, %span;
mul.rn.f32 %scaled, %ratio, 0f437F0000;
copysign.f32 %sc_half, %scaled, %half;
add.rz.f32 %sc_nudged, %scaled, %sc_half;
cvt.rzi.f32.f32 %sc_whole, %sc_nudged;
cvt.rzi.u32.f32 %pix, %sc_whole;
$L_gray8_emit:
// One byte per destination element.
ld.param.u64 %dst, [j2k_store_gray8_param_1];
cvta.to.global.u64 %dst, %dst;
cvt.u64.u32 %dst_off, %dst_idx;
add.s64 %dst_addr, %dst, %dst_off;
st.global.u8 [%dst_addr], %pix;
$L_gray8_exit:
ret;
}
// .globl j2k_store_gray16
// j2k_store_gray16
// Converts one float sample per thread into a 16-bit output value.
//   param_0: global pointer to the float source array
//   param_1: global pointer to the 16-bit destination array
//   param_2: global pointer to a descriptor; byte offsets used below:
//     +0  source row pitch     +4  source column origin   +8  source row origin
//     +12 elements per row     +16 number of rows
//     +20 dest row pitch       +28 dest column origin     +32 dest row origin
//     +36 f32 added to the sample before rounding
//     +40 u32 bit depth of the incoming sample
// Rounding is "half away from zero": add a 0.5 carrying the value's sign with
// round-toward-zero, then truncate.
.visible .entry j2k_store_gray16(
.param .u64 j2k_store_gray16_param_0,
.param .u64 j2k_store_gray16_param_1,
.param .u64 j2k_store_gray16_param_2
)
{
.reg .pred %oob, %wide16, %nodepth;
.reg .f32 %half, %src_val, %offset, %shifted, %shalf, %nudged, %whole, %floor0;
.reg .f32 %pix_f, %span, %capped, %ratio, %scaled, %sc_half, %sc_nudged, %sc_whole;
.reg .b32 %rows, %cols, %total, %bdim, %bid, %lane, %gid, %row, %col;
.reg .b32 %src_row0, %src_row, %src_pitch, %src_col0, %src_base, %src_idx;
.reg .b32 %dst_row0, %dst_row, %dst_pitch, %dst_col0, %dst_base, %dst_idx;
.reg .b32 %depth, %ones, %himask, %lomask, %pick, %maxcode, %pix;
.reg .b64 %desc, %src, %dst, %src_addr, %dst_addr;
// Flat element id versus rows * elements-per-row.
ld.param.u64 %desc, [j2k_store_gray16_param_2];
cvta.to.global.u64 %desc, %desc;
ld.global.u32 %cols, [%desc+12];
ld.global.u32 %rows, [%desc+16];
mul.lo.s32 %total, %rows, %cols;
mov.u32 %bid, %ctaid.x;
mov.u32 %bdim, %ntid.x;
mov.u32 %lane, %tid.x;
mad.lo.s32 %gid, %bid, %bdim, %lane;
setp.ge.u32 %oob, %gid, %total;
@%oob bra $L_gray16_exit;
// Split the flat id into row and column.
div.u32 %row, %gid, %cols;
rem.u32 %col, %gid, %cols;
// Source element index = (row + row origin) * pitch + column origin + col.
ld.global.u32 %src_pitch, [%desc];
ld.global.u32 %src_col0, [%desc+4];
ld.global.u32 %src_row0, [%desc+8];
add.s32 %src_row, %row, %src_row0;
mad.lo.s32 %src_base, %src_row, %src_pitch, %src_col0;
add.s32 %src_idx, %src_base, %col;
// Destination element index, same formula with the destination fields.
ld.global.u32 %dst_pitch, [%desc+20];
ld.global.u32 %dst_col0, [%desc+28];
ld.global.u32 %dst_row0, [%desc+32];
add.s32 %dst_row, %row, %dst_row0;
mad.lo.s32 %dst_base, %dst_row, %dst_pitch, %dst_col0;
add.s32 %dst_idx, %dst_base, %col;
// Fetch the sample, add the descriptor offset, round half away from zero.
ld.param.u64 %src, [j2k_store_gray16_param_0];
cvta.to.global.u64 %src, %src;
mad.wide.u32 %src_addr, %src_idx, 4, %src;
ld.global.f32 %src_val, [%src_addr];
ld.global.f32 %offset, [%desc+36];
add.rn.f32 %shifted, %offset, %src_val;
mov.f32 %half, 0f3F000000;
copysign.f32 %shalf, %shifted, %half;
add.rz.f32 %nudged, %shifted, %shalf;
cvt.rzi.f32.f32 %whole, %nudged;
// Both depth paths start by clamping at zero.
max.f32 %floor0, %whole, 0f00000000;
ld.global.u32 %depth, [%desc+40];
setp.gt.u32 %wide16, %depth, 15;
@!%wide16 bra $L_gray16_rescale;
// Depth above 15: clamp to 65535 and convert.
min.f32 %pix_f, %floor0, 0f477FFF00;
cvt.rzi.u32.f32 %pix, %pix_f;
bra.uni $L_gray16_emit;
$L_gray16_rescale:
// Depth 0..15: full scale is (1 << depth) - 1 with a floor of 1; clamp to it,
// normalise, scale to 65535 and round again.
setp.eq.s32 %nodepth, %depth, 0;
mov.u32 %ones, -1;
shl.b32 %himask, %ones, %depth;
not.b32 %lomask, %himask;
selp.b32 %pick, 1, %lomask, %nodepth;
max.u32 %maxcode, %pick, 1;
cvt.rn.f32.u32 %span, %maxcode;
min.f32 %capped, %floor0, %span;
div.rn.f32 %ratio, %capped, %span;
mul.rn.f32 %scaled, %ratio, 0f477FFF00;
copysign.f32 %sc_half, %scaled, %half;
add.rz.f32 %sc_nudged, %scaled, %sc_half;
cvt.rzi.f32.f32 %sc_whole, %sc_nudged;
cvt.rzi.u32.f32 %pix, %sc_whole;
$L_gray16_emit:
// Two bytes per destination element.
ld.param.u64 %dst, [j2k_store_gray16_param_1];
cvta.to.global.u64 %dst, %dst;
mad.wide.u32 %dst_addr, %dst_idx, 2, %dst;
st.global.u16 [%dst_addr], %pix;
$L_gray16_exit:
ret;
}
// .globl j2k_inverse_mct
// j2k_inverse_mct
// Per-element inverse three-component transform, in place, one thread per
// element.
//   param_0..param_2: global pointers to the three float planes (c0, c1, c2)
//   param_3: global pointer to a descriptor:
//     +0  u32 element count
//     +4  u32 mode (0 = floor-based path, otherwise the multiply-add path)
//     +8, +12, +16  f32 values added to output 0, 1 and 2 before storing
// Mode 0:      o1 = c0 - floor((c1 + c2) * 0.25); o0 = c2 + o1; o2 = c1 + o1
// Otherwise:   o0 = c0 + c2 * 1.402
//              o1 = (c0 + c1 * -0.344136) + c2 * -0.714136
//              o2 = c0 + c1 * 1.772
// Multiplies and adds are kept as separate .rn operations (no fusing).
.visible .entry j2k_inverse_mct(
.param .u64 j2k_inverse_mct_param_0,
.param .u64 j2k_inverse_mct_param_1,
.param .u64 j2k_inverse_mct_param_2,
.param .u64 j2k_inverse_mct_param_3
)
{
.reg .pred %past_end, %floor_mode;
.reg .f32 %c0, %c1, %c2, %bias0, %bias1, %bias2;
.reg .f32 %o0, %o1, %o2, %res0, %res1, %res2;
.reg .f32 %sum12, %quarter, %floored, %term_a, %term_b, %term_c, %term_d, %partial;
.reg .b32 %bdim, %bid, %lane, %gid, %count, %mode;
.reg .b64 %cfg, %plane0, %plane1, %plane2, %byteoff, %ptr0, %ptr1, %ptr2;
// Element id and bounds check against the descriptor's count.
ld.param.u64 %cfg, [j2k_inverse_mct_param_3];
cvta.to.global.u64 %cfg, %cfg;
mov.u32 %bid, %ctaid.x;
mov.u32 %bdim, %ntid.x;
mov.u32 %lane, %tid.x;
mad.lo.s32 %gid, %bid, %bdim, %lane;
ld.global.u32 %count, [%cfg];
setp.ge.u32 %past_end, %gid, %count;
@%past_end bra $L_mct_exit;
// Same byte offset into each of the three planes.
ld.param.u64 %plane0, [j2k_inverse_mct_param_0];
ld.param.u64 %plane1, [j2k_inverse_mct_param_1];
ld.param.u64 %plane2, [j2k_inverse_mct_param_2];
cvta.to.global.u64 %plane0, %plane0;
cvta.to.global.u64 %plane1, %plane1;
cvta.to.global.u64 %plane2, %plane2;
mul.wide.u32 %byteoff, %gid, 4;
add.s64 %ptr0, %plane0, %byteoff;
add.s64 %ptr1, %plane1, %byteoff;
add.s64 %ptr2, %plane2, %byteoff;
ld.global.f32 %c0, [%ptr0];
ld.global.f32 %c1, [%ptr1];
ld.global.f32 %c2, [%ptr2];
ld.global.f32 %bias0, [%cfg+8];
ld.global.f32 %bias1, [%cfg+12];
ld.global.f32 %bias2, [%cfg+16];
ld.global.u32 %mode, [%cfg+4];
setp.eq.s32 %floor_mode, %mode, 0;
@!%floor_mode bra $L_mct_weighted;
// Mode 0: floor-based path (cvt.rmi rounds toward minus infinity).
add.rn.f32 %sum12, %c1, %c2;
mul.rn.f32 %quarter, %sum12, 0f3E800000;
cvt.rmi.f32.f32 %floored, %quarter;
sub.rn.f32 %o1, %c0, %floored;
add.rn.f32 %o0, %c2, %o1;
add.rn.f32 %o2, %c1, %o1;
bra.uni $L_mct_store;
$L_mct_weighted:
// Non-zero mode: weighted sums of c1 and c2 added to c0.
mul.rn.f32 %term_a, %c2, 0f3FB374BC;
add.rn.f32 %o0, %c0, %term_a;
mul.rn.f32 %term_b, %c1, 0fBEB031CF;
add.rn.f32 %partial, %c0, %term_b;
mul.rn.f32 %term_c, %c2, 0fBF36D1E1;
add.rn.f32 %o1, %partial, %term_c;
mul.rn.f32 %term_d, %c1, 0f3FE2D0E5;
add.rn.f32 %o2, %c0, %term_d;
$L_mct_store:
// Add the per-output descriptor value and write each plane back.
add.rn.f32 %res0, %bias0, %o0;
st.global.f32 [%ptr0], %res0;
add.rn.f32 %res1, %bias1, %o1;
st.global.f32 [%ptr1], %res1;
add.rn.f32 %res2, %bias2, %o2;
st.global.f32 [%ptr2], %res2;
$L_mct_exit:
ret;
}
// .globl j2k_store_rgb8
// j2k_store_rgb8
// Converts three float samples per thread into interleaved 8-bit output,
// three or four bytes per pixel.
//   param_0..param_2: global pointers to the three float source arrays
//   param_3: global pointer to the byte destination array
//   param_4: global pointer to a descriptor. %rd1 = descriptor + 36, and the
//            fields below are listed by their offset from %rd1 as used in the
//            code:
//     -36/-32/-28  row pitch of source 0/1/2
//     -24/-16/-8   column origin of source 0/1/2
//     -20/-12/-4   row origin of source 0/1/2
//     +0  elements per row     +4  number of rows
//     +8  dest row pitch       +16 dest column origin   +20 dest row origin
//     +24/+28/+32  f32 added to channel 0/1/2 before rounding
//     +36/+40/+44  u32 bit depth of channel 0/1/2
//     +48 u32: 0 -> 3 bytes per pixel; non-zero -> 4 bytes per pixel with the
//              fourth byte written as 255
// Each channel is rounded half away from zero (copysign 0.5, add.rz,
// cvt.rzi), clamped at zero, then either clamped to 255 (depth == 8) or
// rescaled from its full-scale value to 255 and rounded again.
.visible .entry j2k_store_rgb8(
.param .u64 j2k_store_rgb8_param_0,
.param .u64 j2k_store_rgb8_param_1,
.param .u64 j2k_store_rgb8_param_2,
.param .u64 j2k_store_rgb8_param_3,
.param .u64 j2k_store_rgb8_param_4
)
{
.reg .pred %p<13>;
.reg .b16 %rs<2>;
.reg .f32 %f<67>;
.reg .b32 %r<68>;
.reg .b64 %rd<29>;
ld.param.u64 %rd2, [j2k_store_rgb8_param_0];
ld.param.u64 %rd3, [j2k_store_rgb8_param_1];
ld.param.u64 %rd4, [j2k_store_rgb8_param_2];
ld.param.u64 %rd5, [j2k_store_rgb8_param_3];
ld.param.u64 %rd6, [j2k_store_rgb8_param_4];
cvta.to.global.u64 %rd7, %rd6;
add.s64 %rd1, %rd7, 36;
// %r2 = flat element id; exit when it is not below rows * elements-per-row.
ld.global.u32 %r19, [%rd7+40];
ld.global.u32 %r1, [%rd7+36];
mul.lo.s32 %r20, %r19, %r1;
mov.u32 %r21, %ntid.x;
mov.u32 %r22, %ctaid.x;
mov.u32 %r23, %tid.x;
mad.lo.s32 %r2, %r22, %r21, %r23;
setp.ge.u32 %p1, %r2, %r20;
@%p1 bra $L__BB20_12;
// Depths (%r4, %r3) and offsets (%f2, %f1) of channels 1 and 2 are loaded
// up front; channel 0's are loaded where they are used.
ld.global.u32 %r3, [%rd1+44];
ld.global.u32 %r4, [%rd1+40];
ld.global.f32 %f1, [%rd1+32];
ld.global.f32 %f2, [%rd1+28];
// %r24 = row, %r26 = column of the flat id.
div.u32 %r24, %r2, %r1;
mul.lo.s32 %r25, %r24, %r1;
sub.s32 %r26, %r2, %r25;
// Source 0 element index -> %r32.
ld.global.u32 %r27, [%rd1+-20];
add.s32 %r28, %r24, %r27;
ld.global.u32 %r29, [%rd1+-36];
ld.global.u32 %r30, [%rd1+-24];
mad.lo.s32 %r31, %r28, %r29, %r30;
add.s32 %r32, %r31, %r26;
// Source 1 element index -> %r5.
ld.global.u32 %r33, [%rd1+-12];
add.s32 %r34, %r24, %r33;
ld.global.u32 %r35, [%rd1+-32];
ld.global.u32 %r36, [%rd1+-16];
mad.lo.s32 %r37, %r34, %r35, %r36;
add.s32 %r5, %r37, %r26;
// Source 2 element index -> %r6.
ld.global.u32 %r38, [%rd1+-4];
add.s32 %r39, %r24, %r38;
ld.global.u32 %r40, [%rd1+-28];
ld.global.u32 %r41, [%rd1+-8];
mad.lo.s32 %r42, %r39, %r40, %r41;
add.s32 %r6, %r42, %r26;
// Bytes per pixel: 3 when the flag at +48 is zero, otherwise 4.
// %p2 is reused at the end to skip the fourth byte.
ld.global.u32 %r7, [%rd1+48];
setp.eq.s32 %p2, %r7, 0;
selp.b32 %r43, 3, 4, %p2;
// %r8 = destination pixel index * bytes per pixel = byte offset of byte 0.
ld.global.u32 %r44, [%rd1+20];
add.s32 %r45, %r24, %r44;
ld.global.u32 %r46, [%rd1+8];
ld.global.u32 %r47, [%rd1+16];
mad.lo.s32 %r48, %r45, %r46, %r47;
add.s32 %r49, %r48, %r26;
mul.lo.s32 %r8, %r49, %r43;
// ---- Channel 0 ----------------------------------------------------------
// Load, add the offset at +24, round half away from zero -> %f3.
cvta.to.global.u64 %rd8, %rd2;
mul.wide.u32 %rd9, %r32, 4;
add.s64 %rd10, %rd8, %rd9;
ld.global.f32 %f6, [%rd10];
ld.global.f32 %f7, [%rd1+24];
add.rn.f32 %f8, %f7, %f6;
mov.f32 %f9, 0f3F000000;
copysign.f32 %f10, %f8, %f9;
add.rz.f32 %f11, %f8, %f10;
cvt.rzi.f32.f32 %f3, %f11;
ld.global.u32 %r9, [%rd1+36];
setp.eq.s32 %p3, %r9, 8;
@%p3 bra $L__BB20_3;
bra.uni $L__BB20_2;
$L__BB20_3:
// Depth 8: clamp to [0, 255].
mov.f32 %f23, 0f00000000;
max.f32 %f24, %f3, %f23;
mov.f32 %f25, 0f437F0000;
min.f32 %f26, %f24, %f25;
cvt.rzi.u32.f32 %r65, %f26;
bra.uni $L__BB20_4;
$L__BB20_2:
// Other depths: full scale %f13 is 65535 above 15 bits, otherwise
// (1 << depth) - 1 with a floor of 1; clamp, normalise, scale to 255, round.
setp.gt.u32 %p4, %r9, 15;
mov.u32 %r50, -1;
shl.b32 %r51, %r50, %r9;
setp.lt.u32 %p5, %r51, -2;
not.b32 %r52, %r51;
selp.b32 %r53, %r52, 1, %p5;
cvt.rn.f32.u32 %f12, %r53;
selp.f32 %f13, 0f477FFF00, %f12, %p4;
mov.f32 %f14, 0f00000000;
max.f32 %f15, %f3, %f14;
min.f32 %f16, %f15, %f13;
div.rn.f32 %f17, %f16, %f13;
mul.rn.f32 %f18, %f17, 0f437F0000;
copysign.f32 %f20, %f18, %f9;
add.rz.f32 %f21, %f18, %f20;
cvt.rzi.f32.f32 %f22, %f21;
cvt.rzi.u32.f32 %r65, %f22;
$L__BB20_4:
// Store byte 0. %rd11 (destination base) is reused for bytes 1..3 below.
cvta.to.global.u64 %rd11, %rd5;
cvt.u64.u32 %rd12, %r8;
add.s64 %rd13, %rd11, %rd12;
st.global.u8 [%rd13], %r65;
// ---- Channel 1 ----------------------------------------------------------
// Same sequence with offset %f2 and depth %r4 -> %f4.
cvta.to.global.u64 %rd14, %rd3;
mul.wide.u32 %rd15, %r5, 4;
add.s64 %rd16, %rd14, %rd15;
ld.global.f32 %f27, [%rd16];
add.rn.f32 %f28, %f2, %f27;
mov.f32 %f29, 0f3F000000;
copysign.f32 %f30, %f28, %f29;
add.rz.f32 %f31, %f28, %f30;
cvt.rzi.f32.f32 %f4, %f31;
setp.eq.s32 %p6, %r4, 8;
@%p6 bra $L__BB20_6;
bra.uni $L__BB20_5;
$L__BB20_6:
mov.f32 %f43, 0f00000000;
max.f32 %f44, %f4, %f43;
mov.f32 %f45, 0f437F0000;
min.f32 %f46, %f44, %f45;
cvt.rzi.u32.f32 %r66, %f46;
bra.uni $L__BB20_7;
$L__BB20_5:
setp.gt.u32 %p7, %r4, 15;
mov.u32 %r54, -1;
shl.b32 %r55, %r54, %r4;
setp.lt.u32 %p8, %r55, -2;
not.b32 %r56, %r55;
selp.b32 %r57, %r56, 1, %p8;
cvt.rn.f32.u32 %f32, %r57;
selp.f32 %f33, 0f477FFF00, %f32, %p7;
mov.f32 %f34, 0f00000000;
max.f32 %f35, %f4, %f34;
min.f32 %f36, %f35, %f33;
div.rn.f32 %f37, %f36, %f33;
mul.rn.f32 %f38, %f37, 0f437F0000;
copysign.f32 %f40, %f38, %f29;
add.rz.f32 %f41, %f38, %f40;
cvt.rzi.f32.f32 %f42, %f41;
cvt.rzi.u32.f32 %r66, %f42;
$L__BB20_7:
// Store byte 1.
add.s32 %r58, %r8, 1;
cvt.u64.u32 %rd17, %r58;
add.s64 %rd19, %rd11, %rd17;
st.global.u8 [%rd19], %r66;
// ---- Channel 2 ----------------------------------------------------------
// Same sequence with offset %f1 and depth %r3 -> %f5.
cvta.to.global.u64 %rd20, %rd4;
mul.wide.u32 %rd21, %r6, 4;
add.s64 %rd22, %rd20, %rd21;
ld.global.f32 %f47, [%rd22];
add.rn.f32 %f48, %f1, %f47;
mov.f32 %f49, 0f3F000000;
copysign.f32 %f50, %f48, %f49;
add.rz.f32 %f51, %f48, %f50;
cvt.rzi.f32.f32 %f5, %f51;
setp.eq.s32 %p9, %r3, 8;
@%p9 bra $L__BB20_9;
bra.uni $L__BB20_8;
$L__BB20_9:
mov.f32 %f63, 0f00000000;
max.f32 %f64, %f5, %f63;
mov.f32 %f65, 0f437F0000;
min.f32 %f66, %f64, %f65;
cvt.rzi.u32.f32 %r67, %f66;
bra.uni $L__BB20_10;
$L__BB20_8:
setp.gt.u32 %p10, %r3, 15;
mov.u32 %r59, -1;
shl.b32 %r60, %r59, %r3;
setp.lt.u32 %p11, %r60, -2;
not.b32 %r61, %r60;
selp.b32 %r62, %r61, 1, %p11;
cvt.rn.f32.u32 %f52, %r62;
selp.f32 %f53, 0f477FFF00, %f52, %p10;
mov.f32 %f54, 0f00000000;
max.f32 %f55, %f5, %f54;
min.f32 %f56, %f55, %f53;
div.rn.f32 %f57, %f56, %f53;
mul.rn.f32 %f58, %f57, 0f437F0000;
copysign.f32 %f60, %f58, %f49;
add.rz.f32 %f61, %f58, %f60;
cvt.rzi.f32.f32 %f62, %f61;
cvt.rzi.u32.f32 %r67, %f62;
$L__BB20_10:
// Store byte 2.
add.s32 %r63, %r8, 2;
cvt.u64.u32 %rd23, %r63;
add.s64 %rd25, %rd11, %rd23;
st.global.u8 [%rd25], %r67;
// Three-byte pixels are finished; four-byte pixels get a constant 255.
@%p2 bra $L__BB20_12;
add.s32 %r64, %r8, 3;
cvt.u64.u32 %rd26, %r64;
add.s64 %rd28, %rd11, %rd26;
mov.u16 %rs1, 255;
st.global.u8 [%rd28], %rs1;
$L__BB20_12:
ret;
}
// .globl j2k_store_rgb16
// j2k_store_rgb16
// Each thread handles one pixel: it reads one f32 sample from each of three
// input planes, adds a per-channel f32 offset, rounds, clamps, rescales to the
// 16-bit range and stores the three results as interleaved u16 values.
//   param_0..param_2 : f32 input planes (indexed with a 4-byte stride)
//   param_3          : u16 output buffer (indexed with a 2-byte stride)
//   param_4          : parameter block of 32-bit fields; the code addresses it
//                      as %rd1 = block + 36. Field meanings below are inferred
//                      from how each value is used - TODO confirm against the
//                      host-side struct definition.
.visible .entry j2k_store_rgb16(
.param .u64 j2k_store_rgb16_param_0,
.param .u64 j2k_store_rgb16_param_1,
.param .u64 j2k_store_rgb16_param_2,
.param .u64 j2k_store_rgb16_param_3,
.param .u64 j2k_store_rgb16_param_4
)
{
.reg .pred %p<10>;
.reg .b16 %rs<2>;
.reg .f32 %f<64>;
.reg .b32 %r<71>;
.reg .b64 %rd<29>;
ld.param.u64 %rd2, [j2k_store_rgb16_param_0];
ld.param.u64 %rd3, [j2k_store_rgb16_param_1];
ld.param.u64 %rd4, [j2k_store_rgb16_param_2];
ld.param.u64 %rd5, [j2k_store_rgb16_param_3];
ld.param.u64 %rd6, [j2k_store_rgb16_param_4];
cvta.to.global.u64 %rd7, %rd6;
add.s64 %rd1, %rd7, 36;
// %r1 = block[36] is the divisor that splits the linear index into row/column
// (presumably the region width); block[40] is presumably the height.
ld.global.u32 %r19, [%rd7+40];
ld.global.u32 %r1, [%rd7+36];
mul.lo.s32 %r20, %r19, %r1;
// %r2 = ctaid.x * ntid.x + tid.x; threads at or beyond block[36]*block[40] exit.
mov.u32 %r21, %ntid.x;
mov.u32 %r22, %ctaid.x;
mov.u32 %r23, %tid.x;
mad.lo.s32 %r2, %r22, %r21, %r23;
setp.ge.u32 %p1, %r2, %r20;
@%p1 bra $L__BB21_12;
// %r3 / %r4 are used below as the shift counts (presumably bit depths) for
// channels 2 / 1; %f1 / %f2 are the additive offsets for channels 2 / 1.
ld.global.u32 %r3, [%rd1+44];
ld.global.u32 %r4, [%rd1+40];
ld.global.f32 %f1, [%rd1+32];
ld.global.f32 %f2, [%rd1+28];
// %r24 = row = index / %r1, %r26 = column = index % %r1.
div.u32 %r24, %r2, %r1;
mul.lo.s32 %r25, %r24, %r1;
sub.s32 %r26, %r2, %r25;
// Plane 0 element index: (row + [%rd1-20]) * [%rd1-36] + [%rd1-24] + column.
ld.global.u32 %r27, [%rd1+-20];
add.s32 %r28, %r24, %r27;
ld.global.u32 %r29, [%rd1+-36];
ld.global.u32 %r30, [%rd1+-24];
mad.lo.s32 %r31, %r28, %r29, %r30;
add.s32 %r32, %r31, %r26;
// Plane 1 element index: (row + [%rd1-12]) * [%rd1-32] + [%rd1-16] + column.
ld.global.u32 %r33, [%rd1+-12];
add.s32 %r34, %r24, %r33;
ld.global.u32 %r35, [%rd1+-32];
ld.global.u32 %r36, [%rd1+-16];
mad.lo.s32 %r37, %r34, %r35, %r36;
add.s32 %r5, %r37, %r26;
// Plane 2 element index: (row + [%rd1-4]) * [%rd1-28] + [%rd1-8] + column.
ld.global.u32 %r38, [%rd1+-4];
add.s32 %r39, %r24, %r38;
ld.global.u32 %r40, [%rd1+-28];
ld.global.u32 %r41, [%rd1+-8];
mad.lo.s32 %r42, %r39, %r40, %r41;
add.s32 %r6, %r42, %r26;
// %p2 = ([%rd1+48] == 0). Channels per output pixel: 3 when set, otherwise 4
// (the 4th value is written at the end of the kernel).
ld.global.u32 %r7, [%rd1+48];
setp.eq.s32 %p2, %r7, 0;
selp.b32 %r43, 3, 4, %p2;
// Output pixel index: (row + [%rd1+20]) * [%rd1+8] + [%rd1+16] + column,
// then scaled by the channel count to get the first u16 slot (%r8).
ld.global.u32 %r44, [%rd1+20];
add.s32 %r45, %r24, %r44;
ld.global.u32 %r46, [%rd1+8];
ld.global.u32 %r47, [%rd1+16];
mad.lo.s32 %r48, %r45, %r46, %r47;
add.s32 %r49, %r48, %r26;
mul.lo.s32 %r8, %r49, %r43;
// ---- Channel 0: sample from plane 0 plus offset [%rd1+24] ----
cvta.to.global.u64 %rd8, %rd2;
mul.wide.u32 %rd9, %r32, 4;
add.s64 %rd10, %rd8, %rd9;
ld.global.f32 %f6, [%rd10];
ld.global.f32 %f7, [%rd1+24];
add.rn.f32 %f8, %f7, %f6;
// Round half away from zero: add 0.5 carrying the value's sign (copysign),
// using round-toward-zero addition, then truncate to an integral float.
mov.f32 %f9, 0f3F000000;
copysign.f32 %f10, %f8, %f9;
add.rz.f32 %f11, %f8, %f10;
cvt.rzi.f32.f32 %f3, %f11;
// %r9 = [%rd1+36], shift count for channel 0 (presumably its bit depth).
ld.global.u32 %r9, [%rd1+36];
setp.gt.u32 %p3, %r9, 15;
@%p3 bra $L__BB21_3;
bra.uni $L__BB21_2;
$L__BB21_3:
// Shift count > 15: clamp to [0, 65535] (0f477FFF00) and convert, no rescale.
mov.f32 %f22, 0f00000000;
max.f32 %f23, %f3, %f22;
mov.f32 %f24, 0f477FFF00;
min.f32 %f25, %f23, %f24;
cvt.rzi.u32.f32 %r68, %f25;
bra.uni $L__BB21_4;
$L__BB21_2:
// Shift count <= 15: maxval = (1 << count) - 1, forced to at least 1;
// clamp to [0, maxval], scale by 65535 / maxval, round as above, convert.
setp.eq.s32 %p4, %r9, 0;
mov.u32 %r50, -1;
shl.b32 %r51, %r50, %r9;
not.b32 %r52, %r51;
selp.b32 %r53, 1, %r52, %p4;
max.u32 %r54, %r53, 1;
cvt.rn.f32.u32 %f12, %r54;
mov.f32 %f13, 0f00000000;
max.f32 %f14, %f3, %f13;
min.f32 %f15, %f14, %f12;
div.rn.f32 %f16, %f15, %f12;
mul.rn.f32 %f17, %f16, 0f477FFF00;
copysign.f32 %f19, %f17, %f9;
add.rz.f32 %f20, %f17, %f19;
cvt.rzi.f32.f32 %f21, %f20;
cvt.rzi.u32.f32 %r68, %f21;
$L__BB21_4:
// Store channel 0 at u16 slot %r8 (the low 16 bits of %r68 are written).
cvta.to.global.u64 %rd11, %rd5;
mul.wide.u32 %rd12, %r8, 2;
add.s64 %rd13, %rd11, %rd12;
st.global.u16 [%rd13], %r68;
// ---- Channel 1: sample from plane 1 plus offset %f2, shift count %r4 ----
cvta.to.global.u64 %rd14, %rd3;
mul.wide.u32 %rd15, %r5, 4;
add.s64 %rd16, %rd14, %rd15;
ld.global.f32 %f26, [%rd16];
add.rn.f32 %f27, %f2, %f26;
mov.f32 %f28, 0f3F000000;
copysign.f32 %f29, %f27, %f28;
add.rz.f32 %f30, %f27, %f29;
cvt.rzi.f32.f32 %f4, %f30;
setp.gt.u32 %p5, %r4, 15;
@%p5 bra $L__BB21_6;
bra.uni $L__BB21_5;
$L__BB21_6:
// Same clamp-only path as channel 0.
mov.f32 %f41, 0f00000000;
max.f32 %f42, %f4, %f41;
mov.f32 %f43, 0f477FFF00;
min.f32 %f44, %f42, %f43;
cvt.rzi.u32.f32 %r69, %f44;
bra.uni $L__BB21_7;
$L__BB21_5:
// Same clamp-and-rescale path as channel 0.
setp.eq.s32 %p6, %r4, 0;
mov.u32 %r55, -1;
shl.b32 %r56, %r55, %r4;
not.b32 %r57, %r56;
selp.b32 %r58, 1, %r57, %p6;
max.u32 %r59, %r58, 1;
cvt.rn.f32.u32 %f31, %r59;
mov.f32 %f32, 0f00000000;
max.f32 %f33, %f4, %f32;
min.f32 %f34, %f33, %f31;
div.rn.f32 %f35, %f34, %f31;
mul.rn.f32 %f36, %f35, 0f477FFF00;
copysign.f32 %f38, %f36, %f28;
add.rz.f32 %f39, %f36, %f38;
cvt.rzi.f32.f32 %f40, %f39;
cvt.rzi.u32.f32 %r69, %f40;
$L__BB21_7:
// Store channel 1 at u16 slot %r8 + 1.
add.s32 %r60, %r8, 1;
mul.wide.u32 %rd18, %r60, 2;
add.s64 %rd19, %rd11, %rd18;
st.global.u16 [%rd19], %r69;
// ---- Channel 2: sample from plane 2 plus offset %f1, shift count %r3 ----
cvta.to.global.u64 %rd20, %rd4;
mul.wide.u32 %rd21, %r6, 4;
add.s64 %rd22, %rd20, %rd21;
ld.global.f32 %f45, [%rd22];
add.rn.f32 %f46, %f1, %f45;
mov.f32 %f47, 0f3F000000;
copysign.f32 %f48, %f46, %f47;
add.rz.f32 %f49, %f46, %f48;
cvt.rzi.f32.f32 %f5, %f49;
setp.gt.u32 %p7, %r3, 15;
@%p7 bra $L__BB21_9;
bra.uni $L__BB21_8;
$L__BB21_9:
// Same clamp-only path as channel 0.
mov.f32 %f60, 0f00000000;
max.f32 %f61, %f5, %f60;
mov.f32 %f62, 0f477FFF00;
min.f32 %f63, %f61, %f62;
cvt.rzi.u32.f32 %r70, %f63;
bra.uni $L__BB21_10;
$L__BB21_8:
// Same clamp-and-rescale path as channel 0.
setp.eq.s32 %p8, %r3, 0;
mov.u32 %r61, -1;
shl.b32 %r62, %r61, %r3;
not.b32 %r63, %r62;
selp.b32 %r64, 1, %r63, %p8;
max.u32 %r65, %r64, 1;
cvt.rn.f32.u32 %f50, %r65;
mov.f32 %f51, 0f00000000;
max.f32 %f52, %f5, %f51;
min.f32 %f53, %f52, %f50;
div.rn.f32 %f54, %f53, %f50;
mul.rn.f32 %f55, %f54, 0f477FFF00;
copysign.f32 %f57, %f55, %f47;
add.rz.f32 %f58, %f55, %f57;
cvt.rzi.f32.f32 %f59, %f58;
cvt.rzi.u32.f32 %r70, %f59;
$L__BB21_10:
// Store channel 2 at u16 slot %r8 + 2.
add.s32 %r66, %r8, 2;
mul.wide.u32 %rd24, %r66, 2;
add.s64 %rd25, %rd11, %rd24;
st.global.u16 [%rd25], %r70;
// 3-channel output (%p2 set): done. Otherwise write 0xFFFF into slot %r8 + 3
// (presumably an opaque alpha value).
@%p2 bra $L__BB21_12;
add.s32 %r67, %r8, 3;
mul.wide.u32 %rd27, %r67, 2;
add.s64 %rd28, %rd11, %rd27;
mov.u16 %rs1, -1;
st.global.u16 [%rd28], %rs1;
$L__BB21_12:
ret;
}
// .globl j2k_store_rgb8_mct
// j2k_store_rgb8_mct
// Each thread handles one pixel: it reads one f32 sample from each of three
// input planes, combines them with one of two three-component transforms
// (selected by a flag in the parameter block), adds a per-channel f32 offset,
// rounds, clamps, rescales to the 8-bit range and stores interleaved bytes.
//   param_0..param_2 : f32 input planes (indexed with a 4-byte stride)
//   param_3          : u8 output buffer
//   param_4          : parameter block of 32-bit fields; the code addresses it
//                      as %rd2 = block + 36. Field meanings below are inferred
//                      from how each value is used - TODO confirm against the
//                      host-side struct definition.
.visible .entry j2k_store_rgb8_mct(
.param .u64 j2k_store_rgb8_mct_param_0,
.param .u64 j2k_store_rgb8_mct_param_1,
.param .u64 j2k_store_rgb8_mct_param_2,
.param .u64 j2k_store_rgb8_mct_param_3,
.param .u64 j2k_store_rgb8_mct_param_4
)
{
.reg .pred %p<14>;
.reg .b16 %rs<2>;
.reg .f32 %f<87>;
.reg .b32 %r<69>;
.reg .b64 %rd<26>;
ld.param.u64 %rd3, [j2k_store_rgb8_mct_param_0];
ld.param.u64 %rd4, [j2k_store_rgb8_mct_param_1];
ld.param.u64 %rd5, [j2k_store_rgb8_mct_param_2];
ld.param.u64 %rd6, [j2k_store_rgb8_mct_param_3];
ld.param.u64 %rd7, [j2k_store_rgb8_mct_param_4];
cvta.to.global.u64 %rd1, %rd6;
cvta.to.global.u64 %rd8, %rd7;
add.s64 %rd2, %rd8, 36;
// %r1 / %r2 / %r3 = block[72] / [76] / [80]: shift counts (presumably bit
// depths) used for output channels 0 / 1 / 2.
ld.global.u32 %r1, [%rd8+72];
ld.global.u32 %r2, [%rd8+76];
ld.global.u32 %r3, [%rd8+80];
// %r4 = block[36] is the divisor that splits the linear index into row/column
// (presumably the region width); block[40] is presumably the height.
ld.global.u32 %r17, [%rd8+40];
ld.global.u32 %r4, [%rd8+36];
mul.lo.s32 %r18, %r17, %r4;
// %r5 = ctaid.x * ntid.x + tid.x; threads at or beyond block[36]*block[40] exit.
mov.u32 %r19, %ntid.x;
mov.u32 %r20, %ctaid.x;
mov.u32 %r21, %tid.x;
mad.lo.s32 %r5, %r20, %r19, %r21;
setp.ge.u32 %p1, %r5, %r18;
@%p1 bra $L__BB22_15;
// %f1 / %f2 / %f3: additive offsets applied to output channels 0 / 1 / 2.
ld.global.f32 %f1, [%rd2+24];
ld.global.f32 %f2, [%rd2+28];
ld.global.f32 %f3, [%rd2+32];
// %r22 = row = index / %r4, %r24 = column = index % %r4.
div.u32 %r22, %r5, %r4;
mul.lo.s32 %r23, %r22, %r4;
sub.s32 %r24, %r5, %r23;
// Plane 0 element index: (row + [%rd2-20]) * [%rd2-36] + [%rd2-24] + column.
ld.global.u32 %r25, [%rd2+-20];
add.s32 %r26, %r22, %r25;
ld.global.u32 %r27, [%rd2+-36];
ld.global.u32 %r28, [%rd2+-24];
mad.lo.s32 %r29, %r26, %r27, %r28;
add.s32 %r30, %r29, %r24;
// Plane 1 element index: (row + [%rd2-12]) * [%rd2-32] + [%rd2-16] + column.
ld.global.u32 %r31, [%rd2+-12];
add.s32 %r32, %r22, %r31;
ld.global.u32 %r33, [%rd2+-32];
ld.global.u32 %r34, [%rd2+-16];
mad.lo.s32 %r35, %r32, %r33, %r34;
add.s32 %r36, %r35, %r24;
// Plane 2 element index: (row + [%rd2-4]) * [%rd2-28] + [%rd2-8] + column.
ld.global.u32 %r37, [%rd2+-4];
add.s32 %r38, %r22, %r37;
ld.global.u32 %r39, [%rd2+-28];
ld.global.u32 %r40, [%rd2+-8];
mad.lo.s32 %r41, %r38, %r39, %r40;
add.s32 %r42, %r41, %r24;
// %p2 = ([%rd2+48] == 0). Channels per output pixel: 3 when set, otherwise 4
// (the 4th byte is written at the end of the kernel).
ld.global.u32 %r6, [%rd2+48];
setp.eq.s32 %p2, %r6, 0;
selp.b32 %r43, 3, 4, %p2;
// Output pixel index: (row + [%rd2+20]) * [%rd2+8] + [%rd2+16] + column,
// then scaled by the channel count to get the first byte offset (%r7).
ld.global.u32 %r44, [%rd2+20];
add.s32 %r45, %r22, %r44;
ld.global.u32 %r46, [%rd2+8];
ld.global.u32 %r47, [%rd2+16];
mad.lo.s32 %r48, %r45, %r46, %r47;
add.s32 %r49, %r48, %r24;
mul.lo.s32 %r7, %r49, %r43;
// Load the three input samples: %f4 (plane 0), %f5 (plane 1), %f6 (plane 2).
cvta.to.global.u64 %rd9, %rd3;
mul.wide.u32 %rd10, %r30, 4;
add.s64 %rd11, %rd9, %rd10;
ld.global.f32 %f4, [%rd11];
cvta.to.global.u64 %rd12, %rd4;
mul.wide.u32 %rd13, %r36, 4;
add.s64 %rd14, %rd12, %rd13;
ld.global.f32 %f5, [%rd14];
cvta.to.global.u64 %rd15, %rd5;
mul.wide.u32 %rd16, %r42, 4;
add.s64 %rd17, %rd15, %rd16;
ld.global.f32 %f6, [%rd17];
// Transform selector at [%rd2+52]: zero takes the floor-based path at
// $L__BB22_3, nonzero falls through to the multiply-based path.
ld.global.u32 %r50, [%rd2+52];
setp.eq.s32 %p3, %r50, 0;
@%p3 bra $L__BB22_3;
// Multiply-based path (constants are approximately 1.402, -0.344136,
// -0.714136 and 1.772; this looks like the JPEG 2000 irreversible inverse
// colour transform - TODO confirm):
//   %f85 = f4 + 1.402 * f6
//   %f84 = f4 - 0.344136 * f5 - 0.714136 * f6
//   %f86 = f4 + 1.772 * f5
mul.rn.f32 %f19, %f6, 0f3FB374BC;
add.rn.f32 %f85, %f4, %f19;
mul.rn.f32 %f20, %f5, 0fBEB031CF;
add.rn.f32 %f21, %f4, %f20;
mul.rn.f32 %f22, %f6, 0fBF36D1E1;
add.rn.f32 %f84, %f21, %f22;
mul.rn.f32 %f23, %f5, 0f3FE2D0E5;
add.rn.f32 %f86, %f4, %f23;
bra.uni $L__BB22_4;
$L__BB22_3:
// Floor-based path (looks like the reversible inverse colour transform -
// TODO confirm):
//   %f84 = f4 - floor((f5 + f6) * 0.25)
//   %f85 = f6 + %f84
//   %f86 = f5 + %f84
add.rn.f32 %f24, %f5, %f6;
mul.rn.f32 %f25, %f24, 0f3E800000;
cvt.rmi.f32.f32 %f26, %f25;
sub.rn.f32 %f84, %f4, %f26;
add.rn.f32 %f85, %f6, %f84;
add.rn.f32 %f86, %f5, %f84;
$L__BB22_4:
// ---- Output channel 0: %f85 plus offset %f1, shift count %r1 ----
// Round half away from zero: add 0.5 carrying the value's sign (copysign),
// using round-toward-zero addition, then truncate to an integral float.
add.rn.f32 %f27, %f1, %f85;
mov.f32 %f28, 0f3F000000;
copysign.f32 %f29, %f27, %f28;
add.rz.f32 %f30, %f27, %f29;
cvt.rzi.f32.f32 %f16, %f30;
setp.eq.s32 %p4, %r1, 8;
@%p4 bra $L__BB22_6;
bra.uni $L__BB22_5;
$L__BB22_6:
// Shift count == 8: clamp to [0, 255] (0f437F0000) and convert, no rescale.
mov.f32 %f42, 0f00000000;
max.f32 %f43, %f16, %f42;
mov.f32 %f44, 0f437F0000;
min.f32 %f45, %f43, %f44;
cvt.rzi.u32.f32 %r66, %f45;
bra.uni $L__BB22_7;
$L__BB22_5:
// Shift count != 8: maxval = 65535.0 when count > 15, otherwise
// (1 << count) - 1 forced to at least 1; clamp to [0, maxval], scale by
// 255 / maxval, round as above, convert.
setp.gt.u32 %p5, %r1, 15;
mov.u32 %r51, -1;
shl.b32 %r52, %r51, %r1;
setp.lt.u32 %p6, %r52, -2;
not.b32 %r53, %r52;
selp.b32 %r54, %r53, 1, %p6;
cvt.rn.f32.u32 %f31, %r54;
selp.f32 %f32, 0f477FFF00, %f31, %p5;
mov.f32 %f33, 0f00000000;
max.f32 %f34, %f16, %f33;
min.f32 %f35, %f34, %f32;
div.rn.f32 %f36, %f35, %f32;
mul.rn.f32 %f37, %f36, 0f437F0000;
copysign.f32 %f39, %f37, %f28;
add.rz.f32 %f40, %f37, %f39;
cvt.rzi.f32.f32 %f41, %f40;
cvt.rzi.u32.f32 %r66, %f41;
$L__BB22_7:
// Store channel 0 at byte offset %r7 (the low 8 bits of %r66 are written).
cvt.u64.u32 %rd18, %r7;
add.s64 %rd19, %rd1, %rd18;
st.global.u8 [%rd19], %r66;
// ---- Output channel 1: %f84 plus offset %f2, shift count %r2 ----
add.rn.f32 %f46, %f2, %f84;
mov.f32 %f47, 0f3F000000;
copysign.f32 %f48, %f46, %f47;
add.rz.f32 %f49, %f46, %f48;
cvt.rzi.f32.f32 %f17, %f49;
setp.eq.s32 %p7, %r2, 8;
@%p7 bra $L__BB22_9;
bra.uni $L__BB22_8;
$L__BB22_9:
// Same clamp-only path as channel 0.
mov.f32 %f61, 0f00000000;
max.f32 %f62, %f17, %f61;
mov.f32 %f63, 0f437F0000;
min.f32 %f64, %f62, %f63;
cvt.rzi.u32.f32 %r67, %f64;
bra.uni $L__BB22_10;
$L__BB22_8:
// Same clamp-and-rescale path as channel 0.
setp.gt.u32 %p8, %r2, 15;
mov.u32 %r55, -1;
shl.b32 %r56, %r55, %r2;
setp.lt.u32 %p9, %r56, -2;
not.b32 %r57, %r56;
selp.b32 %r58, %r57, 1, %p9;
cvt.rn.f32.u32 %f50, %r58;
selp.f32 %f51, 0f477FFF00, %f50, %p8;
mov.f32 %f52, 0f00000000;
max.f32 %f53, %f17, %f52;
min.f32 %f54, %f53, %f51;
div.rn.f32 %f55, %f54, %f51;
mul.rn.f32 %f56, %f55, 0f437F0000;
copysign.f32 %f58, %f56, %f47;
add.rz.f32 %f59, %f56, %f58;
cvt.rzi.f32.f32 %f60, %f59;
cvt.rzi.u32.f32 %r67, %f60;
$L__BB22_10:
// Store channel 1 at byte offset %r7 + 1.
add.s32 %r59, %r7, 1;
cvt.u64.u32 %rd20, %r59;
add.s64 %rd21, %rd1, %rd20;
st.global.u8 [%rd21], %r67;
// ---- Output channel 2: %f86 plus offset %f3, shift count %r3 ----
add.rn.f32 %f65, %f3, %f86;
mov.f32 %f66, 0f3F000000;
copysign.f32 %f67, %f65, %f66;
add.rz.f32 %f68, %f65, %f67;
cvt.rzi.f32.f32 %f18, %f68;
setp.eq.s32 %p10, %r3, 8;
@%p10 bra $L__BB22_12;
bra.uni $L__BB22_11;
$L__BB22_12:
// Same clamp-only path as channel 0.
mov.f32 %f80, 0f00000000;
max.f32 %f81, %f18, %f80;
mov.f32 %f82, 0f437F0000;
min.f32 %f83, %f81, %f82;
cvt.rzi.u32.f32 %r68, %f83;
bra.uni $L__BB22_13;
$L__BB22_11:
// Same clamp-and-rescale path as channel 0.
setp.gt.u32 %p11, %r3, 15;
mov.u32 %r60, -1;
shl.b32 %r61, %r60, %r3;
setp.lt.u32 %p12, %r61, -2;
not.b32 %r62, %r61;
selp.b32 %r63, %r62, 1, %p12;
cvt.rn.f32.u32 %f69, %r63;
selp.f32 %f70, 0f477FFF00, %f69, %p11;
mov.f32 %f71, 0f00000000;
max.f32 %f72, %f18, %f71;
min.f32 %f73, %f72, %f70;
div.rn.f32 %f74, %f73, %f70;
mul.rn.f32 %f75, %f74, 0f437F0000;
copysign.f32 %f77, %f75, %f66;
add.rz.f32 %f78, %f75, %f77;
cvt.rzi.f32.f32 %f79, %f78;
cvt.rzi.u32.f32 %r68, %f79;
$L__BB22_13:
// Store channel 2 at byte offset %r7 + 2.
add.s32 %r64, %r7, 2;
cvt.u64.u32 %rd22, %r64;
add.s64 %rd23, %rd1, %rd22;
st.global.u8 [%rd23], %r68;
// 3-channel output (%p2 set): done. Otherwise write 255 at byte offset
// %r7 + 3 (presumably an opaque alpha value).
@%p2 bra $L__BB22_15;
add.s32 %r65, %r7, 3;
cvt.u64.u32 %rd24, %r65;
add.s64 %rd25, %rd1, %rd24;
mov.u16 %rs1, 255;
st.global.u8 [%rd25], %rs1;
$L__BB22_15:
ret;
}
// .globl j2k_store_rgb8_mct_batch
// j2k_store_rgb8_mct_batch
// Batched three-plane to interleaved-u8 store. param_0 points to an array of
// 128-byte descriptors; %ctaid.y selects the descriptor and the x dimension
// selects the pixel. For its pixel each thread reads one f32 sample from each
// of three planes, applies one of two three-component transforms, adds a
// per-channel offset, rounds, clamps, rescales to 8 bits and stores bytes.
// The plane and output pointers are read from the descriptor and dereferenced
// with generic-address ld/st. The code addresses the descriptor as
// %rd1 = descriptor + 112. Field meanings below are inferred from how each
// value is used - TODO confirm against the host-side struct definition.
.visible .entry j2k_store_rgb8_mct_batch(
.param .u64 j2k_store_rgb8_mct_batch_param_0
)
{
.reg .pred %p<14>;
.reg .b16 %rs<2>;
.reg .f32 %f<89>;
.reg .b32 %r<84>;
.reg .b64 %rd<24>;
ld.param.u64 %rd3, [j2k_store_rgb8_mct_batch_param_0];
cvta.to.global.u64 %rd4, %rd3;
// %rd6 = &descriptor[ctaid.y] (128 bytes each).
mov.u32 %r17, %ctaid.y;
mul.wide.u32 %rd5, %r17, 128;
add.s64 %rd6, %rd4, %rd5;
add.s64 %rd1, %rd6, 112;
// Shift counts (presumably bit depths): %r18 / %r19 / %r1 are used for output
// channels 0 / 1 / 2.
ld.global.u32 %r1, [%rd6+112];
ld.global.v2.u32 {%r18, %r19}, [%rd6+104];
// %rd2 = output byte pointer stored at descriptor + 24.
ld.global.u64 %rd2, [%rd6+24];
// %r4 = descriptor[68] is the divisor that splits the linear index into
// row/column (presumably the width); descriptor[72] is presumably the height.
ld.global.u32 %r4, [%rd6+68];
ld.global.u32 %r20, [%rd6+72];
mul.lo.s32 %r21, %r4, %r20;
// %r5 = ctaid.x * ntid.x + tid.x; threads at or beyond the product exit.
mov.u32 %r22, %ntid.x;
mov.u32 %r23, %ctaid.x;
mov.u32 %r24, %tid.x;
mad.lo.s32 %r5, %r23, %r22, %r24;
setp.ge.u32 %p1, %r5, %r21;
@%p1 bra $L__BB23_15;
// Additive offsets: %f3 for channel 0, %f19 / %f20 for channels 1 / 2.
ld.global.v2.f32 {%f19, %f20}, [%rd1+-16];
ld.global.f32 %f3, [%rd1+-20];
// Paired 32-bit loads of the plane addressing fields (descriptor + 32 .. + 79).
ld.global.v2.u32 {%r25, %r26}, [%rd1+-80];
ld.global.v2.u32 {%r29, %r30}, [%rd1+-72];
ld.global.v2.u32 {%r33, %r34}, [%rd1+-64];
ld.global.v2.u32 {%r37, %r38}, [%rd1+-56];
// The compiler-emitted used_bytes_mask pragmas mark which bytes of the next
// vector load are consumed: only %r41 of the first pair and only %r45 of the
// second pair are read below.
.pragma "used_bytes_mask 15";
ld.global.v2.u32 {%r41, %r42}, [%rd1+-48];
.pragma "used_bytes_mask 240";
ld.global.v2.u32 {%r44, %r45}, [%rd1+-40];
// %r47 = row = index / %r4, %r49 = column = index % %r4.
div.u32 %r47, %r5, %r4;
mul.lo.s32 %r48, %r47, %r4;
sub.s32 %r49, %r5, %r48;
// Plane 0 element index: (row + %r33) * %r25 + %r30 + column.
add.s32 %r50, %r47, %r33;
mad.lo.s32 %r51, %r50, %r25, %r30;
add.s32 %r52, %r51, %r49;
// Plane 1 element index: (row + %r37) * %r26 + %r34 + column.
add.s32 %r53, %r47, %r37;
mad.lo.s32 %r54, %r53, %r26, %r34;
add.s32 %r55, %r54, %r49;
// Plane 2 element index: (row + %r41) * %r29 + %r38 + column.
add.s32 %r56, %r47, %r41;
mad.lo.s32 %r57, %r56, %r29, %r38;
add.s32 %r58, %r57, %r49;
// %p2 = ([%rd1+4] == 0). Channels per output pixel: 3 when set, otherwise 4
// (the 4th byte is written at the end of the kernel).
ld.global.u32 %r6, [%rd1+4];
setp.eq.s32 %p2, %r6, 0;
selp.b32 %r59, 3, 4, %p2;
// Output pixel index: (row + [%rd1-24]) * %r45 + [%rd1-28] + column,
// then scaled by the channel count to get the first byte offset (%r7).
ld.global.u32 %r60, [%rd1+-24];
add.s32 %r61, %r47, %r60;
ld.global.u32 %r62, [%rd1+-28];
mad.lo.s32 %r63, %r61, %r45, %r62;
add.s32 %r64, %r63, %r49;
mul.lo.s32 %r7, %r64, %r59;
// Plane base pointers sit at descriptor + 0 / + 8 / + 16; the samples are
// loaded through them with generic addressing: %f4, %f5, %f6.
ld.global.u64 %rd7, [%rd1+-112];
mul.wide.u32 %rd8, %r52, 4;
add.s64 %rd9, %rd7, %rd8;
ld.f32 %f4, [%rd9];
ld.global.u64 %rd10, [%rd1+-104];
mul.wide.u32 %rd11, %r55, 4;
add.s64 %rd12, %rd10, %rd11;
ld.f32 %f5, [%rd12];
ld.global.u64 %rd13, [%rd1+-96];
mul.wide.u32 %rd14, %r58, 4;
add.s64 %rd15, %rd13, %rd14;
ld.f32 %f6, [%rd15];
// Transform selector at [%rd1+8]: zero takes the floor-based path at
// $L__BB23_3, nonzero falls through to the multiply-based path.
ld.global.u32 %r65, [%rd1+8];
setp.eq.s32 %p3, %r65, 0;
@%p3 bra $L__BB23_3;
// Multiply-based path (constants are approximately 1.402, -0.344136,
// -0.714136 and 1.772; this looks like the JPEG 2000 irreversible inverse
// colour transform - TODO confirm):
//   %f87 = f4 + 1.402 * f6
//   %f86 = f4 - 0.344136 * f5 - 0.714136 * f6
//   %f88 = f4 + 1.772 * f5
mul.rn.f32 %f21, %f6, 0f3FB374BC;
add.rn.f32 %f87, %f4, %f21;
mul.rn.f32 %f22, %f5, 0fBEB031CF;
add.rn.f32 %f23, %f4, %f22;
mul.rn.f32 %f24, %f6, 0fBF36D1E1;
add.rn.f32 %f86, %f23, %f24;
mul.rn.f32 %f25, %f5, 0f3FE2D0E5;
add.rn.f32 %f88, %f4, %f25;
bra.uni $L__BB23_4;
$L__BB23_3:
// Floor-based path (looks like the reversible inverse colour transform -
// TODO confirm):
//   %f86 = f4 - floor((f5 + f6) * 0.25)
//   %f87 = f6 + %f86
//   %f88 = f5 + %f86
add.rn.f32 %f26, %f5, %f6;
mul.rn.f32 %f27, %f26, 0f3E800000;
cvt.rmi.f32.f32 %f28, %f27;
sub.rn.f32 %f86, %f4, %f28;
add.rn.f32 %f87, %f6, %f86;
add.rn.f32 %f88, %f5, %f86;
$L__BB23_4:
// ---- Output channel 0: %f87 plus offset %f3, shift count %r18 ----
// Round half away from zero: add 0.5 carrying the value's sign (copysign),
// using round-toward-zero addition, then truncate to an integral float.
add.rn.f32 %f29, %f3, %f87;
mov.f32 %f30, 0f3F000000;
copysign.f32 %f31, %f29, %f30;
add.rz.f32 %f32, %f29, %f31;
cvt.rzi.f32.f32 %f16, %f32;
setp.eq.s32 %p4, %r18, 8;
@%p4 bra $L__BB23_6;
bra.uni $L__BB23_5;
$L__BB23_6:
// Shift count == 8: clamp to [0, 255] (0f437F0000) and convert, no rescale.
mov.f32 %f44, 0f00000000;
max.f32 %f45, %f16, %f44;
mov.f32 %f46, 0f437F0000;
min.f32 %f47, %f45, %f46;
cvt.rzi.u32.f32 %r81, %f47;
bra.uni $L__BB23_7;
$L__BB23_5:
// Shift count != 8: maxval = 65535.0 when count > 15, otherwise
// (1 << count) - 1 forced to at least 1; clamp to [0, maxval], scale by
// 255 / maxval, round as above, convert.
setp.gt.u32 %p5, %r18, 15;
mov.u32 %r66, -1;
shl.b32 %r67, %r66, %r18;
setp.lt.u32 %p6, %r67, -2;
not.b32 %r68, %r67;
selp.b32 %r69, %r68, 1, %p6;
cvt.rn.f32.u32 %f33, %r69;
selp.f32 %f34, 0f477FFF00, %f33, %p5;
mov.f32 %f35, 0f00000000;
max.f32 %f36, %f16, %f35;
min.f32 %f37, %f36, %f34;
div.rn.f32 %f38, %f37, %f34;
mul.rn.f32 %f39, %f38, 0f437F0000;
copysign.f32 %f41, %f39, %f30;
add.rz.f32 %f42, %f39, %f41;
cvt.rzi.f32.f32 %f43, %f42;
cvt.rzi.u32.f32 %r81, %f43;
$L__BB23_7:
// Store channel 0 at byte offset %r7 (generic-address store, low 8 bits).
cvt.u64.u32 %rd16, %r7;
add.s64 %rd17, %rd2, %rd16;
st.u8 [%rd17], %r81;
// ---- Output channel 1: %f86 plus offset %f19, shift count %r19 ----
add.rn.f32 %f48, %f19, %f86;
mov.f32 %f49, 0f3F000000;
copysign.f32 %f50, %f48, %f49;
add.rz.f32 %f51, %f48, %f50;
cvt.rzi.f32.f32 %f17, %f51;
setp.eq.s32 %p7, %r19, 8;
@%p7 bra $L__BB23_9;
bra.uni $L__BB23_8;
$L__BB23_9:
// Same clamp-only path as channel 0.
mov.f32 %f63, 0f00000000;
max.f32 %f64, %f17, %f63;
mov.f32 %f65, 0f437F0000;
min.f32 %f66, %f64, %f65;
cvt.rzi.u32.f32 %r82, %f66;
bra.uni $L__BB23_10;
$L__BB23_8:
// Same clamp-and-rescale path as channel 0.
setp.gt.u32 %p8, %r19, 15;
mov.u32 %r70, -1;
shl.b32 %r71, %r70, %r19;
setp.lt.u32 %p9, %r71, -2;
not.b32 %r72, %r71;
selp.b32 %r73, %r72, 1, %p9;
cvt.rn.f32.u32 %f52, %r73;
selp.f32 %f53, 0f477FFF00, %f52, %p8;
mov.f32 %f54, 0f00000000;
max.f32 %f55, %f17, %f54;
min.f32 %f56, %f55, %f53;
div.rn.f32 %f57, %f56, %f53;
mul.rn.f32 %f58, %f57, 0f437F0000;
copysign.f32 %f60, %f58, %f49;
add.rz.f32 %f61, %f58, %f60;
cvt.rzi.f32.f32 %f62, %f61;
cvt.rzi.u32.f32 %r82, %f62;
$L__BB23_10:
// Store channel 1 at byte offset %r7 + 1.
add.s32 %r74, %r7, 1;
cvt.u64.u32 %rd18, %r74;
add.s64 %rd19, %rd2, %rd18;
st.u8 [%rd19], %r82;
// ---- Output channel 2: %f88 plus offset %f20, shift count %r1 ----
add.rn.f32 %f67, %f20, %f88;
mov.f32 %f68, 0f3F000000;
copysign.f32 %f69, %f67, %f68;
add.rz.f32 %f70, %f67, %f69;
cvt.rzi.f32.f32 %f18, %f70;
setp.eq.s32 %p10, %r1, 8;
@%p10 bra $L__BB23_12;
bra.uni $L__BB23_11;
$L__BB23_12:
// Same clamp-only path as channel 0.
mov.f32 %f82, 0f00000000;
max.f32 %f83, %f18, %f82;
mov.f32 %f84, 0f437F0000;
min.f32 %f85, %f83, %f84;
cvt.rzi.u32.f32 %r83, %f85;
bra.uni $L__BB23_13;
$L__BB23_11:
// Same clamp-and-rescale path as channel 0.
setp.gt.u32 %p11, %r1, 15;
mov.u32 %r75, -1;
shl.b32 %r76, %r75, %r1;
setp.lt.u32 %p12, %r76, -2;
not.b32 %r77, %r76;
selp.b32 %r78, %r77, 1, %p12;
cvt.rn.f32.u32 %f71, %r78;
selp.f32 %f72, 0f477FFF00, %f71, %p11;
mov.f32 %f73, 0f00000000;
max.f32 %f74, %f18, %f73;
min.f32 %f75, %f74, %f72;
div.rn.f32 %f76, %f75, %f72;
mul.rn.f32 %f77, %f76, 0f437F0000;
copysign.f32 %f79, %f77, %f68;
add.rz.f32 %f80, %f77, %f79;
cvt.rzi.f32.f32 %f81, %f80;
cvt.rzi.u32.f32 %r83, %f81;
$L__BB23_13:
// Store channel 2 at byte offset %r7 + 2.
add.s32 %r79, %r7, 2;
cvt.u64.u32 %rd20, %r79;
add.s64 %rd21, %rd2, %rd20;
st.u8 [%rd21], %r83;
// 3-channel output (%p2 set): done. Otherwise write 255 at byte offset
// %r7 + 3 (presumably an opaque alpha value).
@%p2 bra $L__BB23_15;
add.s32 %r80, %r7, 3;
cvt.u64.u32 %rd22, %r80;
add.s64 %rd23, %rd2, %rd22;
mov.u16 %rs1, 255;
st.u8 [%rd23], %rs1;
$L__BB23_15:
ret;
}
// .globl j2k_store_rgb16_mct
// j2k_store_rgb16_mct
// Each thread handles one pixel: it reads one f32 sample from each of three
// input planes, combines them with one of two three-component transforms
// (selected by a flag in the parameter block), adds a per-channel f32 offset,
// rounds, clamps, rescales to the 16-bit range and stores interleaved u16s.
//   param_0..param_2 : f32 input planes (indexed with a 4-byte stride)
//   param_3          : u16 output buffer (indexed with a 2-byte stride)
//   param_4          : parameter block of 32-bit fields; the code addresses it
//                      as %rd2 = block + 36. Field meanings below are inferred
//                      from how each value is used - TODO confirm against the
//                      host-side struct definition.
.visible .entry j2k_store_rgb16_mct(
.param .u64 j2k_store_rgb16_mct_param_0,
.param .u64 j2k_store_rgb16_mct_param_1,
.param .u64 j2k_store_rgb16_mct_param_2,
.param .u64 j2k_store_rgb16_mct_param_3,
.param .u64 j2k_store_rgb16_mct_param_4
)
{
.reg .pred %p<11>;
.reg .b16 %rs<2>;
.reg .f32 %f<84>;
.reg .b32 %r<72>;
.reg .b64 %rd<26>;
ld.param.u64 %rd3, [j2k_store_rgb16_mct_param_0];
ld.param.u64 %rd4, [j2k_store_rgb16_mct_param_1];
ld.param.u64 %rd5, [j2k_store_rgb16_mct_param_2];
ld.param.u64 %rd6, [j2k_store_rgb16_mct_param_3];
ld.param.u64 %rd7, [j2k_store_rgb16_mct_param_4];
cvta.to.global.u64 %rd1, %rd6;
cvta.to.global.u64 %rd8, %rd7;
add.s64 %rd2, %rd8, 36;
// %r1 / %r2 / %r3 = block[72] / [76] / [80]: shift counts (presumably bit
// depths) used for output channels 0 / 1 / 2.
ld.global.u32 %r1, [%rd8+72];
ld.global.u32 %r2, [%rd8+76];
ld.global.u32 %r3, [%rd8+80];
// %r4 = block[36] is the divisor that splits the linear index into row/column
// (presumably the region width); block[40] is presumably the height.
ld.global.u32 %r17, [%rd8+40];
ld.global.u32 %r4, [%rd8+36];
mul.lo.s32 %r18, %r17, %r4;
// %r5 = ctaid.x * ntid.x + tid.x; threads at or beyond block[36]*block[40] exit.
mov.u32 %r19, %ntid.x;
mov.u32 %r20, %ctaid.x;
mov.u32 %r21, %tid.x;
mad.lo.s32 %r5, %r20, %r19, %r21;
setp.ge.u32 %p1, %r5, %r18;
@%p1 bra $L__BB24_15;
// %f1 / %f2 / %f3: additive offsets applied to output channels 0 / 1 / 2.
ld.global.f32 %f1, [%rd2+24];
ld.global.f32 %f2, [%rd2+28];
ld.global.f32 %f3, [%rd2+32];
// %r22 = row = index / %r4, %r24 = column = index % %r4.
div.u32 %r22, %r5, %r4;
mul.lo.s32 %r23, %r22, %r4;
sub.s32 %r24, %r5, %r23;
// Plane 0 element index: (row + [%rd2-20]) * [%rd2-36] + [%rd2-24] + column.
ld.global.u32 %r25, [%rd2+-20];
add.s32 %r26, %r22, %r25;
ld.global.u32 %r27, [%rd2+-36];
ld.global.u32 %r28, [%rd2+-24];
mad.lo.s32 %r29, %r26, %r27, %r28;
add.s32 %r30, %r29, %r24;
// Plane 1 element index: (row + [%rd2-12]) * [%rd2-32] + [%rd2-16] + column.
ld.global.u32 %r31, [%rd2+-12];
add.s32 %r32, %r22, %r31;
ld.global.u32 %r33, [%rd2+-32];
ld.global.u32 %r34, [%rd2+-16];
mad.lo.s32 %r35, %r32, %r33, %r34;
add.s32 %r36, %r35, %r24;
// Plane 2 element index: (row + [%rd2-4]) * [%rd2-28] + [%rd2-8] + column.
ld.global.u32 %r37, [%rd2+-4];
add.s32 %r38, %r22, %r37;
ld.global.u32 %r39, [%rd2+-28];
ld.global.u32 %r40, [%rd2+-8];
mad.lo.s32 %r41, %r38, %r39, %r40;
add.s32 %r42, %r41, %r24;
// %p2 = ([%rd2+48] == 0). Channels per output pixel: 3 when set, otherwise 4
// (the 4th value is written at the end of the kernel).
ld.global.u32 %r6, [%rd2+48];
setp.eq.s32 %p2, %r6, 0;
selp.b32 %r43, 3, 4, %p2;
// Output pixel index: (row + [%rd2+20]) * [%rd2+8] + [%rd2+16] + column,
// then scaled by the channel count to get the first u16 slot (%r7).
ld.global.u32 %r44, [%rd2+20];
add.s32 %r45, %r22, %r44;
ld.global.u32 %r46, [%rd2+8];
ld.global.u32 %r47, [%rd2+16];
mad.lo.s32 %r48, %r45, %r46, %r47;
add.s32 %r49, %r48, %r24;
mul.lo.s32 %r7, %r49, %r43;
// Load the three input samples: %f4 (plane 0), %f5 (plane 1), %f6 (plane 2).
cvta.to.global.u64 %rd9, %rd3;
mul.wide.u32 %rd10, %r30, 4;
add.s64 %rd11, %rd9, %rd10;
ld.global.f32 %f4, [%rd11];
cvta.to.global.u64 %rd12, %rd4;
mul.wide.u32 %rd13, %r36, 4;
add.s64 %rd14, %rd12, %rd13;
ld.global.f32 %f5, [%rd14];
cvta.to.global.u64 %rd15, %rd5;
mul.wide.u32 %rd16, %r42, 4;
add.s64 %rd17, %rd15, %rd16;
ld.global.f32 %f6, [%rd17];
// Transform selector at [%rd2+52]: zero takes the floor-based path at
// $L__BB24_3, nonzero falls through to the multiply-based path.
ld.global.u32 %r50, [%rd2+52];
setp.eq.s32 %p3, %r50, 0;
@%p3 bra $L__BB24_3;
// Multiply-based path (constants are approximately 1.402, -0.344136,
// -0.714136 and 1.772; this looks like the JPEG 2000 irreversible inverse
// colour transform - TODO confirm):
//   %f82 = f4 + 1.402 * f6
//   %f81 = f4 - 0.344136 * f5 - 0.714136 * f6
//   %f83 = f4 + 1.772 * f5
mul.rn.f32 %f19, %f6, 0f3FB374BC;
add.rn.f32 %f82, %f4, %f19;
mul.rn.f32 %f20, %f5, 0fBEB031CF;
add.rn.f32 %f21, %f4, %f20;
mul.rn.f32 %f22, %f6, 0fBF36D1E1;
add.rn.f32 %f81, %f21, %f22;
mul.rn.f32 %f23, %f5, 0f3FE2D0E5;
add.rn.f32 %f83, %f4, %f23;
bra.uni $L__BB24_4;
$L__BB24_3:
// Floor-based path (looks like the reversible inverse colour transform -
// TODO confirm):
//   %f81 = f4 - floor((f5 + f6) * 0.25)
//   %f82 = f6 + %f81
//   %f83 = f5 + %f81
add.rn.f32 %f24, %f5, %f6;
mul.rn.f32 %f25, %f24, 0f3E800000;
cvt.rmi.f32.f32 %f26, %f25;
sub.rn.f32 %f81, %f4, %f26;
add.rn.f32 %f82, %f6, %f81;
add.rn.f32 %f83, %f5, %f81;
$L__BB24_4:
// ---- Output channel 0: %f82 plus offset %f1, shift count %r1 ----
// Round half away from zero: add 0.5 carrying the value's sign (copysign),
// using round-toward-zero addition, then truncate to an integral float.
add.rn.f32 %f27, %f1, %f82;
mov.f32 %f28, 0f3F000000;
copysign.f32 %f29, %f27, %f28;
add.rz.f32 %f30, %f27, %f29;
cvt.rzi.f32.f32 %f16, %f30;
setp.gt.u32 %p4, %r1, 15;
@%p4 bra $L__BB24_6;
bra.uni $L__BB24_5;
$L__BB24_6:
// Shift count > 15: clamp to [0, 65535] (0f477FFF00) and convert, no rescale.
mov.f32 %f41, 0f00000000;
max.f32 %f42, %f16, %f41;
mov.f32 %f43, 0f477FFF00;
min.f32 %f44, %f42, %f43;
cvt.rzi.u32.f32 %r69, %f44;
bra.uni $L__BB24_7;
$L__BB24_5:
// Shift count <= 15: maxval = (1 << count) - 1, forced to at least 1;
// clamp to [0, maxval], scale by 65535 / maxval, round as above, convert.
setp.eq.s32 %p5, %r1, 0;
mov.u32 %r51, -1;
shl.b32 %r52, %r51, %r1;
not.b32 %r53, %r52;
selp.b32 %r54, 1, %r53, %p5;
max.u32 %r55, %r54, 1;
cvt.rn.f32.u32 %f31, %r55;
mov.f32 %f32, 0f00000000;
max.f32 %f33, %f16, %f32;
min.f32 %f34, %f33, %f31;
div.rn.f32 %f35, %f34, %f31;
mul.rn.f32 %f36, %f35, 0f477FFF00;
copysign.f32 %f38, %f36, %f28;
add.rz.f32 %f39, %f36, %f38;
cvt.rzi.f32.f32 %f40, %f39;
cvt.rzi.u32.f32 %r69, %f40;
$L__BB24_7:
// Store channel 0 at u16 slot %r7 (the low 16 bits of %r69 are written).
mul.wide.u32 %rd18, %r7, 2;
add.s64 %rd19, %rd1, %rd18;
st.global.u16 [%rd19], %r69;
// ---- Output channel 1: %f81 plus offset %f2, shift count %r2 ----
add.rn.f32 %f45, %f2, %f81;
mov.f32 %f46, 0f3F000000;
copysign.f32 %f47, %f45, %f46;
add.rz.f32 %f48, %f45, %f47;
cvt.rzi.f32.f32 %f17, %f48;
setp.gt.u32 %p6, %r2, 15;
@%p6 bra $L__BB24_9;
bra.uni $L__BB24_8;
$L__BB24_9:
// Same clamp-only path as channel 0.
mov.f32 %f59, 0f00000000;
max.f32 %f60, %f17, %f59;
mov.f32 %f61, 0f477FFF00;
min.f32 %f62, %f60, %f61;
cvt.rzi.u32.f32 %r70, %f62;
bra.uni $L__BB24_10;
$L__BB24_8:
// Same clamp-and-rescale path as channel 0.
setp.eq.s32 %p7, %r2, 0;
mov.u32 %r56, -1;
shl.b32 %r57, %r56, %r2;
not.b32 %r58, %r57;
selp.b32 %r59, 1, %r58, %p7;
max.u32 %r60, %r59, 1;
cvt.rn.f32.u32 %f49, %r60;
mov.f32 %f50, 0f00000000;
max.f32 %f51, %f17, %f50;
min.f32 %f52, %f51, %f49;
div.rn.f32 %f53, %f52, %f49;
mul.rn.f32 %f54, %f53, 0f477FFF00;
copysign.f32 %f56, %f54, %f46;
add.rz.f32 %f57, %f54, %f56;
cvt.rzi.f32.f32 %f58, %f57;
cvt.rzi.u32.f32 %r70, %f58;
$L__BB24_10:
// Store channel 1 at u16 slot %r7 + 1.
add.s32 %r61, %r7, 1;
mul.wide.u32 %rd20, %r61, 2;
add.s64 %rd21, %rd1, %rd20;
st.global.u16 [%rd21], %r70;
// ---- Output channel 2: %f83 plus offset %f3, shift count %r3 ----
add.rn.f32 %f63, %f3, %f83;
mov.f32 %f64, 0f3F000000;
copysign.f32 %f65, %f63, %f64;
add.rz.f32 %f66, %f63, %f65;
cvt.rzi.f32.f32 %f18, %f66;
setp.gt.u32 %p8, %r3, 15;
@%p8 bra $L__BB24_12;
bra.uni $L__BB24_11;
$L__BB24_12:
// Same clamp-only path as channel 0.
mov.f32 %f77, 0f00000000;
max.f32 %f78, %f18, %f77;
mov.f32 %f79, 0f477FFF00;
min.f32 %f80, %f78, %f79;
cvt.rzi.u32.f32 %r71, %f80;
bra.uni $L__BB24_13;
$L__BB24_11:
// Same clamp-and-rescale path as channel 0.
setp.eq.s32 %p9, %r3, 0;
mov.u32 %r62, -1;
shl.b32 %r63, %r62, %r3;
not.b32 %r64, %r63;
selp.b32 %r65, 1, %r64, %p9;
max.u32 %r66, %r65, 1;
cvt.rn.f32.u32 %f67, %r66;
mov.f32 %f68, 0f00000000;
max.f32 %f69, %f18, %f68;
min.f32 %f70, %f69, %f67;
div.rn.f32 %f71, %f70, %f67;
mul.rn.f32 %f72, %f71, 0f477FFF00;
copysign.f32 %f74, %f72, %f64;
add.rz.f32 %f75, %f72, %f74;
cvt.rzi.f32.f32 %f76, %f75;
cvt.rzi.u32.f32 %r71, %f76;
$L__BB24_13:
// Store channel 2 at u16 slot %r7 + 2.
add.s32 %r67, %r7, 2;
mul.wide.u32 %rd22, %r67, 2;
add.s64 %rd23, %rd1, %rd22;
st.global.u16 [%rd23], %r71;
// 3-channel output (%p2 set): done. Otherwise write 0xFFFF into slot %r7 + 3
// (presumably an opaque alpha value).
@%p2 bra $L__BB24_15;
add.s32 %r68, %r7, 3;
mul.wide.u32 %rd24, %r68, 2;
add.s64 %rd25, %rd1, %rd24;
mov.u16 %rs1, -1;
st.global.u16 [%rd25], %rs1;
$L__BB24_15:
ret;
}
// .globl j2k_htj2k_decode_codeblocks
.visible .entry j2k_htj2k_decode_codeblocks(
.param .u64 j2k_htj2k_decode_codeblocks_param_0,
.param .u64 j2k_htj2k_decode_codeblocks_param_1,
.param .u64 j2k_htj2k_decode_codeblocks_param_2,
.param .u64 j2k_htj2k_decode_codeblocks_param_3,
.param .u64 j2k_htj2k_decode_codeblocks_param_4,
.param .u64 j2k_htj2k_decode_codeblocks_param_5,
.param .u64 j2k_htj2k_decode_codeblocks_param_6,
.param .u64 j2k_htj2k_decode_codeblocks_param_7,
.param .u32 j2k_htj2k_decode_codeblocks_param_8
)
{
.local .align 16 .b8 __local_depot25[7920];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<617>;
.reg .b16 %rs<1432>;
.reg .b32 %r<2898>;
.reg .b64 %rd<660>;
mov.u64 %SPL, __local_depot25;
ld.param.u64 %rd136, [ j2k_htj2k_decode_codeblocks_param_0];
ld.param.u64 %rd137, [ j2k_htj2k_decode_codeblocks_param_1];
ld.param.u64 %rd130, [ j2k_htj2k_decode_codeblocks_param_2];
ld.param.u64 %rd135, [ j2k_htj2k_decode_codeblocks_param_7];
ld.param.u32 %r1130, [ j2k_htj2k_decode_codeblocks_param_8];
cvta.to.global.u64 %rd1, %rd136;
cvta.to.global.u64 %rd2, %rd137;
mov.u32 %r1131, %ntid.x;
mov.u32 %r1132, %ctaid.x;
mov.u32 %r1133, %tid.x;
mad.lo.s32 %r1, %r1132, %r1131, %r1133;
setp.ge.u32 %p1, %r1, %r1130;
@%p1 bra $L__BB25_541;
cvta.to.global.u64 %rd138, %rd135;
cvta.to.global.u64 %rd139, %rd130;
mul.wide.u32 %rd140, %r1, 52;
add.s64 %rd141, %rd139, %rd140;
ld.global.u32 %r2, [%rd141+4];
ld.global.u32 %r3, [%rd141+8];
ld.global.u32 %r4, [%rd141+12];
ld.global.u32 %r5, [%rd141+16];
ld.global.u32 %r2885, [%rd141+20];
ld.global.u32 %r7, [%rd141+24];
ld.global.u32 %r8, [%rd141+28];
ld.global.u32 %r1134, [%rd141+32];
ld.global.u32 %r9, [%rd141+36];
ld.global.u32 %r10, [%rd141+40];
ld.global.u32 %r11, [%rd141+48];
ld.global.u32 %rd4, [%rd141];
mul.wide.u32 %rd142, %r1, 16;
add.s64 %rd5, %rd138, %rd142;
mov.u32 %r1135, 0;
st.global.u32 [%rd5], %r1135;
st.global.u32 [%rd5+4], %r1135;
st.global.u32 [%rd5+8], %r1135;
st.global.u32 [%rd5+12], %r1135;
setp.gt.u32 %p2, %r1134, 1;
setp.eq.s32 %p3, %r2885, 0;
and.pred %p4, %p3, %p2;
selp.b32 %r12, 1, %r1134, %p4;
setp.eq.s32 %p5, %r2, 0;
setp.eq.s32 %p6, %r3, 0;
or.pred %p7, %p5, %p6;
@%p7 bra $L__BB25_541;
setp.gt.u32 %p8, %r2, 256;
setp.gt.u32 %p9, %r3, 256;
or.pred %p10, %p8, %p9;
mul.lo.s32 %r1136, %r3, %r2;
setp.gt.u32 %p11, %r1136, 4096;
or.pred %p12, %p10, %p11;
@%p12 bra $L__BB25_540;
bra.uni $L__BB25_3;
$L__BB25_540:
mov.u32 %r2303, 2;
st.global.u32 [%rd5], %r2303;
mov.u32 %r2304, 1;
st.global.u32 [%rd5+4], %r2304;
mov.u32 %r2305, 0;
st.global.u32 [%rd5+8], %r2305;
st.global.u32 [%rd5+12], %r2305;
bra.uni $L__BB25_541;
$L__BB25_3:
add.s32 %r1137, %r8, -1;
setp.gt.u32 %p13, %r1137, 30;
@%p13 bra $L__BB25_539;
bra.uni $L__BB25_4;
$L__BB25_539:
mov.u32 %r2300, 1;
st.global.u32 [%rd5], %r2300;
mov.u32 %r2301, 2;
st.global.u32 [%rd5+4], %r2301;
mov.u32 %r2302, 0;
st.global.u32 [%rd5+8], %r2302;
st.global.u32 [%rd5+12], %r2302;
bra.uni $L__BB25_541;
$L__BB25_4:
setp.gt.u32 %p14, %r12, 3;
setp.gt.u32 %p15, %r7, 29;
or.pred %p16, %p15, %p14;
@%p16 bra $L__BB25_538;
bra.uni $L__BB25_5;
$L__BB25_538:
mov.u32 %r2297, 1;
st.global.u32 [%rd5], %r2297;
mov.u32 %r2298, 3;
st.global.u32 [%rd5+4], %r2298;
mov.u32 %r2299, 0;
st.global.u32 [%rd5+8], %r2299;
st.global.u32 [%rd5+12], %r2299;
bra.uni $L__BB25_541;
$L__BB25_5:
cvt.u64.u32 %rd594, %r5;
cvt.u32.u64 %r1138, %rd594;
setp.gt.u32 %p17, %r12, 1;
setp.eq.s32 %p18, %r7, 29;
and.pred %p19, %p18, %p17;
selp.b32 %r13, 1, %r12, %p19;
setp.lt.u32 %p20, %r1138, 2;
add.s32 %r1139, %r2885, %r1138;
setp.lt.u32 %p21, %r4, %r1139;
or.pred %p22, %p20, %p21;
@%p22 bra $L__BB25_537;
bra.uni $L__BB25_6;
$L__BB25_537:
mov.u32 %r2294, 1;
st.global.u32 [%rd5], %r2294;
mov.u32 %r2295, 4;
st.global.u32 [%rd5+4], %r2295;
mov.u32 %r2296, 0;
st.global.u32 [%rd5+8], %r2296;
st.global.u32 [%rd5+12], %r2296;
bra.uni $L__BB25_541;
$L__BB25_6:
add.s32 %r1141, %r1138, -1;
cvt.u64.u32 %rd143, %r1141;
add.s64 %rd144, %rd143, %rd4;
add.s64 %rd145, %rd1, %rd144;
ld.global.u8 %rs667, [%rd145];
mul.wide.u16 %r1142, %rs667, 16;
add.s32 %r1143, %r1138, -2;
cvt.u64.u32 %rd146, %r1143;
add.s64 %rd147, %rd146, %rd4;
add.s64 %rd148, %rd1, %rd147;
ld.global.u8 %rs1, [%rd148];
and.b16 %rs668, %rs1, 15;
cvt.u32.u16 %r1144, %rs668;
or.b32 %r14, %r1142, %r1144;
setp.lt.u32 %p23, %r1138, %r14;
add.s32 %r15, %r14, -2;
setp.gt.u32 %p24, %r15, 4077;
or.pred %p25, %p23, %p24;
@%p25 bra $L__BB25_536;
bra.uni $L__BB25_7;
$L__BB25_536:
mov.u32 %r2291, 1;
st.global.u32 [%rd5], %r2291;
mov.u32 %r2292, 5;
st.global.u32 [%rd5+4], %r2292;
mov.u32 %r2293, 0;
st.global.u32 [%rd5+8], %r2293;
st.global.u32 [%rd5+12], %r2293;
bra.uni $L__BB25_541;
$L__BB25_7:
add.s32 %r1145, %r3, 1;
shr.u32 %r1146, %r1145, 1;
add.s32 %r1147, %r2, 9;
and.b32 %r1148, %r1147, -8;
setp.gt.u32 %p26, %r1148, 264;
add.s32 %r1149, %r1146, 1;
mul.lo.s32 %r1150, %r1149, %r1148;
setp.gt.u32 %p27, %r1150, 3096;
or.pred %p28, %p26, %p27;
@%p28 bra $L__BB25_535;
bra.uni $L__BB25_8;
$L__BB25_535:
mov.u32 %r2288, 2;
st.global.u32 [%rd5], %r2288;
mov.u32 %r2289, 6;
st.global.u32 [%rd5+4], %r2289;
mov.u32 %r2290, 0;
st.global.u32 [%rd5+8], %r2290;
st.global.u32 [%rd5+12], %r2290;
bra.uni $L__BB25_541;
$L__BB25_8:
and.b16 %rs1031, %rs1, 15;
cvt.u32.u16 %r2312, %rs1031;
mul.wide.u16 %r2311, %rs667, 16;
or.b32 %r2310, %r2311, %r2312;
sub.s32 %r16, %r5, %r2310;
add.s32 %r2390, %r2310, -1;
mov.u64 %rd596, 0;
mov.u32 %r2480, 0;
mov.u16 %rs1104, 0;
mov.u64 %rd151, _ZZ20mel_decode_more_runsR10MelDecoderE7MEL_EXP;
mov.u32 %r2389, %r16;
mov.u16 %rs1105, %rs1104;
mov.u16 %rs1139, %rs1104;
mov.u32 %r2362, %r2480;
$L__BB25_9:
setp.gt.u32 %p30, %r2362, 7;
@%p30 bra $L__BB25_54;
mul.wide.u32 %rd150, %r2480, 4;
add.s64 %rd152, %rd151, %rd150;
ld.global.nc.u32 %r22, [%rd152];
and.b16 %rs672, %rs1139, 255;
setp.ne.s16 %p31, %rs672, 0;
mov.u16 %rs1039, %rs1139;
@%p31 bra $L__BB25_14;
setp.eq.s32 %p32, %r2390, 0;
mov.u16 %rs1036, 255;
@%p32 bra $L__BB25_13;
cvt.u64.u32 %rd153, %r2389;
add.s64 %rd154, %rd153, %rd4;
add.s64 %rd155, %rd1, %rd154;
ld.global.u8 %rs1036, [%rd155];
$L__BB25_13:
setp.ne.s32 %p34, %r2390, 0;
selp.u32 %r1153, 1, 0, %p34;
add.s32 %r2389, %r2389, %r1153;
add.s32 %r1154, %r2390, -1;
selp.b32 %r2390, 0, %r1154, %p32;
setp.eq.s32 %p35, %r2390, 0;
or.b16 %rs674, %rs1036, 15;
selp.b16 %rs1105, %rs674, %rs1036, %p35;
and.b16 %rs675, %rs1105, 255;
mov.u16 %rs676, 8;
sub.s16 %rs1039, %rs676, %rs1104;
setp.eq.s16 %p36, %rs675, 255;
selp.u16 %rs1104, 1, 0, %p36;
$L__BB25_14:
add.s16 %rs15, %rs1039, -1;
cvt.u32.u16 %r1155, %rs15;
and.b32 %r1156, %r1155, 255;
mov.u32 %r1157, 1;
shl.b32 %r1158, %r1157, %r1156;
cvt.u32.u16 %r1159, %rs1105;
and.b32 %r1160, %r1158, %r1159;
and.b32 %r27, %r1160, 255;
add.s16 %rs1139, %rs1039, -1;
setp.eq.s32 %p37, %r27, 0;
@%p37 bra $L__BB25_16;
add.s32 %r1161, %r2480, 1;
min.u32 %r2357, %r1161, 12;
mov.u32 %r1162, -1;
shl.b32 %r1163, %r1162, %r22;
shl.b32 %r1164, %r1163, 1;
xor.b32 %r2358, %r1164, -2;
bra.uni $L__BB25_53;
$L__BB25_16:
cvt.u64.u32 %rd593, %r2480;
add.s64 %rd156, %rd593, -3;
setp.gt.u64 %p38, %rd156, 9;
mov.u32 %r2354, 0;
@%p38 bra $L__BB25_52;
add.s16 %rs1139, %rs1039, -1;
max.u32 %r30, %r22, 1;
add.s32 %r1168, %r30, -1;
setp.lt.u32 %p39, %r1168, 3;
mov.u32 %r2354, 0;
@%p39 bra $L__BB25_36;
and.b32 %r2306, %r30, 3;
add.s16 %rs1139, %rs1039, -1;
sub.s32 %r2326, %r30, %r2306;
mov.u32 %r2354, 0;
$L__BB25_19:
and.b16 %rs678, %rs1139, 255;
setp.ne.s16 %p40, %rs678, 0;
@%p40 bra $L__BB25_23;
setp.eq.s32 %p41, %r2390, 0;
mov.u16 %rs1043, 255;
@%p41 bra $L__BB25_22;
cvt.u64.u32 %rd157, %r2389;
add.s64 %rd158, %rd157, %rd4;
add.s64 %rd159, %rd1, %rd158;
ld.global.u8 %rs1043, [%rd159];
$L__BB25_22:
setp.ne.s32 %p43, %r2390, 0;
selp.u32 %r1170, 1, 0, %p43;
add.s32 %r2389, %r2389, %r1170;
add.s32 %r1171, %r2390, -1;
selp.b32 %r2390, 0, %r1171, %p41;
setp.eq.s32 %p44, %r2390, 0;
or.b16 %rs680, %rs1043, 15;
selp.b16 %rs1105, %rs680, %rs1043, %p44;
and.b16 %rs681, %rs1105, 255;
mov.u16 %rs682, 8;
sub.s16 %rs1139, %rs682, %rs1104;
setp.eq.s16 %p45, %rs681, 255;
selp.u16 %rs1104, 1, 0, %p45;
$L__BB25_23:
add.s16 %rs1050, %rs1139, -1;
and.b16 %rs683, %rs1050, 255;
cvt.u32.u16 %r1172, %rs1050;
and.b32 %r1173, %r1172, 255;
cvt.u32.u16 %r1174, %rs1105;
and.b32 %r2332, %r1174, 255;
shr.u32 %r1175, %r2332, %r1173;
and.b32 %r1176, %r1175, 1;
bfi.b32 %r42, %r2354, %r1176, 1, 31;
setp.ne.s16 %p46, %rs683, 0;
@%p46 bra $L__BB25_27;
setp.eq.s32 %p47, %r2390, 0;
mov.u16 %rs1047, 255;
@%p47 bra $L__BB25_26;
cvt.u64.u32 %rd160, %r2389;
add.s64 %rd161, %rd160, %rd4;
add.s64 %rd162, %rd1, %rd161;
ld.global.u8 %rs1047, [%rd162];
$L__BB25_26:
setp.ne.s32 %p49, %r2390, 0;
selp.u32 %r1177, 1, 0, %p49;
add.s32 %r2389, %r2389, %r1177;
add.s32 %r1178, %r2390, -1;
selp.b32 %r2390, 0, %r1178, %p47;
setp.eq.s32 %p50, %r2390, 0;
or.b16 %rs685, %rs1047, 15;
selp.b16 %rs1105, %rs685, %rs1047, %p50;
and.b16 %rs686, %rs1105, 255;
mov.u16 %rs687, 8;
sub.s16 %rs1050, %rs687, %rs1104;
setp.eq.s16 %p51, %rs686, 255;
selp.u16 %rs1104, 1, 0, %p51;
cvt.u32.u16 %r1179, %rs1105;
and.b32 %r2332, %r1179, 255;
$L__BB25_27:
add.s16 %rs1054, %rs1050, -1;
and.b16 %rs688, %rs1054, 255;
cvt.u32.u16 %r1180, %rs1054;
and.b32 %r1181, %r1180, 255;
shr.u32 %r1182, %r2332, %r1181;
and.b32 %r1183, %r1182, 1;
bfi.b32 %r49, %r42, %r1183, 1, 31;
setp.ne.s16 %p52, %rs688, 0;
@%p52 bra $L__BB25_31;
setp.eq.s32 %p53, %r2390, 0;
mov.u16 %rs1051, 255;
@%p53 bra $L__BB25_30;
cvt.u64.u32 %rd163, %r2389;
add.s64 %rd164, %rd163, %rd4;
add.s64 %rd165, %rd1, %rd164;
ld.global.u8 %rs1051, [%rd165];
$L__BB25_30:
setp.ne.s32 %p55, %r2390, 0;
selp.u32 %r1184, 1, 0, %p55;
add.s32 %r2389, %r2389, %r1184;
add.s32 %r1185, %r2390, -1;
selp.b32 %r2390, 0, %r1185, %p53;
setp.eq.s32 %p56, %r2390, 0;
or.b16 %rs690, %rs1051, 15;
selp.b16 %rs1105, %rs690, %rs1051, %p56;
and.b16 %rs691, %rs1105, 255;
mov.u16 %rs692, 8;
sub.s16 %rs1054, %rs692, %rs1104;
setp.eq.s16 %p57, %rs691, 255;
selp.u16 %rs1104, 1, 0, %p57;
cvt.u32.u16 %r1186, %rs1105;
and.b32 %r2332, %r1186, 255;
$L__BB25_31:
add.s16 %rs1058, %rs1054, -1;
and.b16 %rs693, %rs1058, 255;
cvt.u32.u16 %r1187, %rs1058;
and.b32 %r1188, %r1187, 255;
shr.u32 %r1189, %r2332, %r1188;
and.b32 %r1190, %r1189, 1;
bfi.b32 %r56, %r49, %r1190, 1, 31;
setp.ne.s16 %p58, %rs693, 0;
@%p58 bra $L__BB25_35;
setp.eq.s32 %p59, %r2390, 0;
mov.u16 %rs1055, 255;
@%p59 bra $L__BB25_34;
cvt.u64.u32 %rd166, %r2389;
add.s64 %rd167, %rd166, %rd4;
add.s64 %rd168, %rd1, %rd167;
ld.global.u8 %rs1055, [%rd168];
$L__BB25_34:
setp.ne.s32 %p61, %r2390, 0;
selp.u32 %r1191, 1, 0, %p61;
add.s32 %r2389, %r2389, %r1191;
add.s32 %r1192, %r2390, -1;
selp.b32 %r2390, 0, %r1192, %p59;
setp.eq.s32 %p62, %r2390, 0;
or.b16 %rs695, %rs1055, 15;
selp.b16 %rs1105, %rs695, %rs1055, %p62;
and.b16 %rs696, %rs1105, 255;
mov.u16 %rs697, 8;
sub.s16 %rs1058, %rs697, %rs1104;
setp.eq.s16 %p63, %rs696, 255;
selp.u16 %rs1104, 1, 0, %p63;
cvt.u32.u16 %r1193, %rs1105;
and.b32 %r2332, %r1193, 255;
$L__BB25_35:
add.s16 %rs1139, %rs1058, -1;
cvt.u32.u16 %r1194, %rs1139;
and.b32 %r1195, %r1194, 255;
shr.u32 %r1196, %r2332, %r1195;
and.b32 %r1197, %r1196, 1;
bfi.b32 %r2354, %r56, %r1197, 1, 31;
add.s32 %r2326, %r2326, -4;
setp.ne.s32 %p64, %r2326, 0;
@%p64 bra $L__BB25_19;
$L__BB25_36:
and.b32 %r2307, %r30, 3;
setp.eq.s32 %p65, %r2307, 0;
@%p65 bra $L__BB25_52;
and.b16 %rs698, %rs1139, 255;
setp.ne.s16 %p66, %rs698, 0;
@%p66 bra $L__BB25_41;
setp.eq.s32 %p67, %r2390, 0;
mov.u16 %rs1065, 255;
@%p67 bra $L__BB25_40;
cvt.u64.u32 %rd169, %r2389;
add.s64 %rd170, %rd169, %rd4;
add.s64 %rd171, %rd1, %rd170;
ld.global.u8 %rs1065, [%rd171];
$L__BB25_40:
setp.ne.s32 %p69, %r2390, 0;
selp.u32 %r1198, 1, 0, %p69;
add.s32 %r2389, %r2389, %r1198;
add.s32 %r1199, %r2390, -1;
selp.b32 %r2390, 0, %r1199, %p67;
setp.eq.s32 %p70, %r2390, 0;
or.b16 %rs700, %rs1065, 15;
selp.b16 %rs1105, %rs700, %rs1065, %p70;
and.b16 %rs701, %rs1105, 255;
mov.u16 %rs702, 8;
sub.s16 %rs1139, %rs702, %rs1104;
setp.eq.s16 %p71, %rs701, 255;
selp.u16 %rs1104, 1, 0, %p71;
$L__BB25_41:
and.b32 %r2308, %r30, 3;
add.s16 %rs1139, %rs1139, -1;
cvt.u32.u16 %r1200, %rs1139;
and.b32 %r1201, %r1200, 255;
cvt.u32.u16 %r1202, %rs1105;
and.b32 %r2349, %r1202, 255;
shr.u32 %r1203, %r2349, %r1201;
and.b32 %r1204, %r1203, 1;
bfi.b32 %r2354, %r2354, %r1204, 1, 31;
setp.eq.s32 %p72, %r2308, 1;
@%p72 bra $L__BB25_52;
and.b16 %rs703, %rs1139, 255;
setp.ne.s16 %p73, %rs703, 0;
@%p73 bra $L__BB25_46;
setp.eq.s32 %p74, %r2390, 0;
mov.u16 %rs1069, 255;
@%p74 bra $L__BB25_45;
cvt.u64.u32 %rd172, %r2389;
add.s64 %rd173, %rd172, %rd4;
add.s64 %rd174, %rd1, %rd173;
ld.global.u8 %rs1069, [%rd174];
$L__BB25_45:
setp.ne.s32 %p76, %r2390, 0;
selp.u32 %r1205, 1, 0, %p76;
add.s32 %r2389, %r2389, %r1205;
add.s32 %r1206, %r2390, -1;
selp.b32 %r2390, 0, %r1206, %p74;
setp.eq.s32 %p77, %r2390, 0;
or.b16 %rs705, %rs1069, 15;
selp.b16 %rs1105, %rs705, %rs1069, %p77;
and.b16 %rs706, %rs1105, 255;
mov.u16 %rs707, 8;
sub.s16 %rs1139, %rs707, %rs1104;
setp.eq.s16 %p78, %rs706, 255;
selp.u16 %rs1104, 1, 0, %p78;
cvt.u32.u16 %r1207, %rs1105;
and.b32 %r2349, %r1207, 255;
$L__BB25_46:
and.b32 %r2309, %r30, 3;
add.s16 %rs1139, %rs1139, -1;
cvt.u32.u16 %r1208, %rs1139;
and.b32 %r1209, %r1208, 255;
shr.u32 %r1210, %r2349, %r1209;
and.b32 %r1211, %r1210, 1;
bfi.b32 %r2354, %r2354, %r1211, 1, 31;
setp.eq.s32 %p79, %r2309, 2;
@%p79 bra $L__BB25_52;
and.b16 %rs708, %rs1139, 255;
setp.ne.s16 %p80, %rs708, 0;
@%p80 bra $L__BB25_51;
setp.eq.s32 %p81, %r2390, 0;
mov.u16 %rs1073, 255;
@%p81 bra $L__BB25_50;
cvt.u64.u32 %rd175, %r2389;
add.s64 %rd176, %rd175, %rd4;
add.s64 %rd177, %rd1, %rd176;
ld.global.u8 %rs1073, [%rd177];
$L__BB25_50:
setp.ne.s32 %p83, %r2390, 0;
selp.u32 %r1212, 1, 0, %p83;
add.s32 %r2389, %r2389, %r1212;
add.s32 %r1213, %r2390, -1;
selp.b32 %r2390, 0, %r1213, %p81;
setp.eq.s32 %p84, %r2390, 0;
or.b16 %rs710, %rs1073, 15;
selp.b16 %rs1105, %rs710, %rs1073, %p84;
and.b16 %rs711, %rs1105, 255;
mov.u16 %rs712, 8;
sub.s16 %rs1139, %rs712, %rs1104;
setp.eq.s16 %p85, %rs711, 255;
selp.u16 %rs1104, 1, 0, %p85;
cvt.u32.u16 %r1214, %rs1105;
and.b32 %r2349, %r1214, 255;
$L__BB25_51:
add.s16 %rs1139, %rs1139, -1;
cvt.u32.u16 %r1215, %rs1139;
and.b32 %r1216, %r1215, 255;
shr.u32 %r1217, %r2349, %r1216;
and.b32 %r1218, %r1217, 1;
bfi.b32 %r2354, %r2354, %r1218, 1, 31;
$L__BB25_52:
shl.b32 %r1219, %r2354, 1;
or.b32 %r2358, %r1219, 1;
add.s32 %r1220, %r2480, -1;
setp.eq.s32 %p86, %r2480, 0;
selp.b32 %r2357, 0, %r1220, %p86;
$L__BB25_53:
mul.lo.s32 %r1221, %r2362, 7;
cvt.u64.u32 %rd178, %r2358;
shl.b64 %rd179, %rd178, %r1221;
or.b64 %rd596, %rd179, %rd596;
setp.ne.s32 %p87, %r2480, 12;
setp.ne.s32 %p88, %r27, 0;
or.pred %p89, %p87, %p88;
add.s32 %r2362, %r2362, 1;
setp.lt.u32 %p90, %r2362, 8;
or.pred %p91, %p90, %p89;
mov.u32 %r2480, %r2357;
@%p91 bra $L__BB25_9;
$L__BB25_54:
and.b16 %rs1032, %rs1, 15;
cvt.u32.u16 %r2316, %rs1032;
mul.wide.u16 %r2315, %rs667, 16;
or.b32 %r2314, %r2315, %r2316;
add.s32 %r2559, %r2314, -2;
setp.gt.u16 %p616, %rs1, 143;
selp.u16 %rs1271, 1, 0, %p616;
shr.u16 %rs1025, %rs1, 4;
ld.param.u64 %rd589, [ j2k_htj2k_decode_codeblocks_param_3];
ld.param.u64 %rd588, [ j2k_htj2k_decode_codeblocks_param_5];
add.s32 %r2481, %r2362, -1;
shr.u64 %rd606, %rd596, 7;
cvt.u32.u64 %r1224, %rd596;
and.b32 %r2477, %r1224, 127;
cvt.u64.u16 %rd615, %rs1025;
and.b64 %rd180, %rd615, 7;
setp.eq.s64 %p92, %rd180, 7;
selp.b32 %r2560, 3, 4, %p92;
add.s32 %r2558, %r5, -3;
cvta.to.global.u64 %rd12, %rd588;
cvta.to.global.u64 %rd13, %rd589;
add.u64 %rd14, %SPL, 0;
mov.u32 %r2363, 0;
mov.u32 %r2364, %r2363;
$L__BB25_55:
setp.gt.u32 %p93, %r2560, 31;
@%p93 bra $L__BB25_59;
$L__BB25_56:
setp.eq.s32 %p94, %r2559, 0;
mov.u16 %rs1091, 0;
@%p94 bra $L__BB25_58;
cvt.s64.s32 %rd182, %r2558;
add.s64 %rd183, %rd182, %rd4;
add.s64 %rd184, %rd1, %rd183;
ld.global.u8 %rs1091, [%rd184];
$L__BB25_58:
setp.ne.s32 %p96, %r2559, 0;
selp.b32 %r1225, -1, 0, %p96;
add.s32 %r2558, %r2558, %r1225;
add.s32 %r1226, %r2559, -1;
selp.b32 %r2559, 0, %r1226, %p94;
and.b16 %rs714, %rs1091, 255;
and.b16 %rs715, %rs1091, 127;
setp.eq.s16 %p97, %rs715, 127;
and.b16 %rs716, %rs1271, 255;
setp.ne.s16 %p98, %rs716, 0;
and.pred %p99, %p98, %p97;
selp.b32 %r1227, 7, 8, %p99;
cvt.u64.u16 %rd185, %rs1091;
and.b64 %rd186, %rd185, 255;
shl.b64 %rd187, %rd186, %r2560;
or.b64 %rd615, %rd187, %rd615;
add.s32 %r2560, %r1227, %r2560;
setp.gt.u16 %p100, %rs714, 143;
selp.u16 %rs1271, 1, 0, %p100;
setp.lt.u32 %p101, %r2560, 33;
@%p101 bra $L__BB25_56;
$L__BB25_59:
cvt.u32.u64 %r1228, %rd615;
and.b32 %r1229, %r1228, 127;
add.s32 %r1230, %r1229, %r2363;
mul.wide.u32 %rd188, %r1230, 2;
add.s64 %rd189, %rd13, %rd188;
ld.global.u16 %r2430, [%rd189];
setp.ne.s32 %p102, %r2363, 0;
@%p102 bra $L__BB25_109;
add.s32 %r129, %r2477, -2;
setp.eq.s32 %p103, %r129, -1;
selp.b32 %r2430, %r2430, 0, %p103;
setp.gt.s32 %p104, %r2477, 1;
mov.u32 %r2477, %r129;
@%p104 bra $L__BB25_109;
setp.ne.s32 %p105, %r2481, 0;
@%p105 bra $L__BB25_108;
mov.u32 %r2481, 0;
$L__BB25_63:
setp.gt.u32 %p106, %r2481, 7;
@%p106 bra $L__BB25_108;
cvt.u64.u32 %rd21, %r2480;
mul.wide.u32 %rd190, %r2480, 4;
add.s64 %rd192, %rd151, %rd190;
ld.global.nc.u32 %r135, [%rd192];
and.b16 %rs717, %rs1139, 255;
setp.ne.s16 %p107, %rs717, 0;
@%p107 bra $L__BB25_68;
setp.eq.s32 %p108, %r2390, 0;
mov.u16 %rs1096, 255;
@%p108 bra $L__BB25_67;
cvt.u64.u32 %rd193, %r2389;
add.s64 %rd194, %rd193, %rd4;
add.s64 %rd195, %rd1, %rd194;
ld.global.u8 %rs1096, [%rd195];
$L__BB25_67:
setp.ne.s32 %p110, %r2390, 0;
selp.u32 %r1232, 1, 0, %p110;
add.s32 %r2389, %r2389, %r1232;
add.s32 %r1233, %r2390, -1;
selp.b32 %r2390, 0, %r1233, %p108;
setp.eq.s32 %p111, %r2390, 0;
or.b16 %rs719, %rs1096, 15;
selp.b16 %rs1105, %rs719, %rs1096, %p111;
and.b16 %rs720, %rs1105, 255;
mov.u16 %rs721, 8;
sub.s16 %rs1139, %rs721, %rs1104;
setp.eq.s16 %p112, %rs720, 255;
selp.u16 %rs1104, 1, 0, %p112;
$L__BB25_68:
add.s16 %rs1139, %rs1139, -1;
cvt.u32.u16 %r1234, %rs1139;
and.b32 %r1235, %r1234, 255;
mov.u32 %r1236, 1;
shl.b32 %r1237, %r1236, %r1235;
cvt.u32.u16 %r1238, %rs1105;
and.b32 %r1239, %r1237, %r1238;
and.b32 %r140, %r1239, 255;
setp.eq.s32 %p113, %r140, 0;
@%p113 bra $L__BB25_70;
add.s32 %r1240, %r2480, 1;
min.u32 %r2419, %r1240, 12;
mov.u32 %r1241, -1;
shl.b32 %r1242, %r1241, %r135;
shl.b32 %r1243, %r1242, 1;
xor.b32 %r2420, %r1243, -2;
bra.uni $L__BB25_107;
$L__BB25_70:
add.s64 %rd196, %rd21, -3;
setp.gt.u64 %p114, %rd196, 9;
mov.u32 %r2416, 0;
@%p114 bra $L__BB25_106;
max.u32 %r143, %r135, 1;
add.s32 %r1247, %r143, -1;
and.b32 %r144, %r143, 3;
setp.lt.u32 %p115, %r1247, 3;
mov.u32 %r2416, 0;
@%p115 bra $L__BB25_90;
sub.s32 %r2388, %r143, %r144;
mov.u32 %r2416, 0;
$L__BB25_73:
and.b16 %rs723, %rs1139, 255;
setp.ne.s16 %p116, %rs723, 0;
@%p116 bra $L__BB25_77;
setp.eq.s32 %p117, %r2390, 0;
mov.u16 %rs1103, 255;
@%p117 bra $L__BB25_76;
cvt.u64.u32 %rd197, %r2389;
add.s64 %rd198, %rd197, %rd4;
add.s64 %rd199, %rd1, %rd198;
ld.global.u8 %rs1103, [%rd199];
$L__BB25_76:
setp.ne.s32 %p119, %r2390, 0;
selp.u32 %r1249, 1, 0, %p119;
add.s32 %r2389, %r2389, %r1249;
add.s32 %r1250, %r2390, -1;
selp.b32 %r2390, 0, %r1250, %p117;
setp.eq.s32 %p120, %r2390, 0;
or.b16 %rs725, %rs1103, 15;
selp.b16 %rs1105, %rs725, %rs1103, %p120;
and.b16 %rs726, %rs1105, 255;
mov.u16 %rs727, 8;
sub.s16 %rs1139, %rs727, %rs1104;
setp.eq.s16 %p121, %rs726, 255;
selp.u16 %rs1104, 1, 0, %p121;
$L__BB25_77:
add.s16 %rs1110, %rs1139, -1;
and.b16 %rs728, %rs1110, 255;
cvt.u32.u16 %r1251, %rs1110;
and.b32 %r1252, %r1251, 255;
cvt.u32.u16 %r1253, %rs1105;
and.b32 %r2394, %r1253, 255;
shr.u32 %r1254, %r2394, %r1252;
and.b32 %r1255, %r1254, 1;
bfi.b32 %r155, %r2416, %r1255, 1, 31;
setp.ne.s16 %p122, %rs728, 0;
@%p122 bra $L__BB25_81;
setp.eq.s32 %p123, %r2390, 0;
mov.u16 %rs1107, 255;
@%p123 bra $L__BB25_80;
cvt.u64.u32 %rd200, %r2389;
add.s64 %rd201, %rd200, %rd4;
add.s64 %rd202, %rd1, %rd201;
ld.global.u8 %rs1107, [%rd202];
$L__BB25_80:
setp.ne.s32 %p125, %r2390, 0;
selp.u32 %r1256, 1, 0, %p125;
add.s32 %r2389, %r2389, %r1256;
add.s32 %r1257, %r2390, -1;
selp.b32 %r2390, 0, %r1257, %p123;
setp.eq.s32 %p126, %r2390, 0;
or.b16 %rs730, %rs1107, 15;
selp.b16 %rs1105, %rs730, %rs1107, %p126;
and.b16 %rs731, %rs1105, 255;
mov.u16 %rs732, 8;
sub.s16 %rs1110, %rs732, %rs1104;
setp.eq.s16 %p127, %rs731, 255;
selp.u16 %rs1104, 1, 0, %p127;
cvt.u32.u16 %r1258, %rs1105;
and.b32 %r2394, %r1258, 255;
$L__BB25_81:
add.s16 %rs1114, %rs1110, -1;
and.b16 %rs733, %rs1114, 255;
cvt.u32.u16 %r1259, %rs1114;
and.b32 %r1260, %r1259, 255;
shr.u32 %r1261, %r2394, %r1260;
and.b32 %r1262, %r1261, 1;
bfi.b32 %r162, %r155, %r1262, 1, 31;
setp.ne.s16 %p128, %rs733, 0;
@%p128 bra $L__BB25_85;
setp.eq.s32 %p129, %r2390, 0;
mov.u16 %rs1111, 255;
@%p129 bra $L__BB25_84;
cvt.u64.u32 %rd203, %r2389;
add.s64 %rd204, %rd203, %rd4;
add.s64 %rd205, %rd1, %rd204;
ld.global.u8 %rs1111, [%rd205];
$L__BB25_84:
setp.ne.s32 %p131, %r2390, 0;
selp.u32 %r1263, 1, 0, %p131;
add.s32 %r2389, %r2389, %r1263;
add.s32 %r1264, %r2390, -1;
selp.b32 %r2390, 0, %r1264, %p129;
setp.eq.s32 %p132, %r2390, 0;
or.b16 %rs735, %rs1111, 15;
selp.b16 %rs1105, %rs735, %rs1111, %p132;
and.b16 %rs736, %rs1105, 255;
mov.u16 %rs737, 8;
sub.s16 %rs1114, %rs737, %rs1104;
setp.eq.s16 %p133, %rs736, 255;
selp.u16 %rs1104, 1, 0, %p133;
cvt.u32.u16 %r1265, %rs1105;
and.b32 %r2394, %r1265, 255;
$L__BB25_85:
add.s16 %rs1118, %rs1114, -1;
and.b16 %rs738, %rs1118, 255;
cvt.u32.u16 %r1266, %rs1118;
and.b32 %r1267, %r1266, 255;
shr.u32 %r1268, %r2394, %r1267;
and.b32 %r1269, %r1268, 1;
bfi.b32 %r169, %r162, %r1269, 1, 31;
setp.ne.s16 %p134, %rs738, 0;
@%p134 bra $L__BB25_89;
setp.eq.s32 %p135, %r2390, 0;
mov.u16 %rs1115, 255;
@%p135 bra $L__BB25_88;
cvt.u64.u32 %rd206, %r2389;
add.s64 %rd207, %rd206, %rd4;
add.s64 %rd208, %rd1, %rd207;
ld.global.u8 %rs1115, [%rd208];
$L__BB25_88:
setp.ne.s32 %p137, %r2390, 0;
selp.u32 %r1270, 1, 0, %p137;
add.s32 %r2389, %r2389, %r1270;
add.s32 %r1271, %r2390, -1;
selp.b32 %r2390, 0, %r1271, %p135;
setp.eq.s32 %p138, %r2390, 0;
or.b16 %rs740, %rs1115, 15;
selp.b16 %rs1105, %rs740, %rs1115, %p138;
and.b16 %rs741, %rs1105, 255;
mov.u16 %rs742, 8;
sub.s16 %rs1118, %rs742, %rs1104;
setp.eq.s16 %p139, %rs741, 255;
selp.u16 %rs1104, 1, 0, %p139;
cvt.u32.u16 %r1272, %rs1105;
and.b32 %r2394, %r1272, 255;
$L__BB25_89:
add.s16 %rs1139, %rs1118, -1;
cvt.u32.u16 %r1273, %rs1139;
and.b32 %r1274, %r1273, 255;
shr.u32 %r1275, %r2394, %r1274;
and.b32 %r1276, %r1275, 1;
bfi.b32 %r2416, %r169, %r1276, 1, 31;
add.s32 %r2388, %r2388, -4;
setp.ne.s32 %p140, %r2388, 0;
@%p140 bra $L__BB25_73;
$L__BB25_90:
setp.eq.s32 %p141, %r144, 0;
@%p141 bra $L__BB25_106;
and.b16 %rs743, %rs1139, 255;
setp.ne.s16 %p142, %rs743, 0;
@%p142 bra $L__BB25_95;
setp.eq.s32 %p143, %r2390, 0;
mov.u16 %rs1125, 255;
@%p143 bra $L__BB25_94;
cvt.u64.u32 %rd209, %r2389;
add.s64 %rd210, %rd209, %rd4;
add.s64 %rd211, %rd1, %rd210;
ld.global.u8 %rs1125, [%rd211];
$L__BB25_94:
setp.ne.s32 %p145, %r2390, 0;
selp.u32 %r1277, 1, 0, %p145;
add.s32 %r2389, %r2389, %r1277;
add.s32 %r1278, %r2390, -1;
selp.b32 %r2390, 0, %r1278, %p143;
setp.eq.s32 %p146, %r2390, 0;
or.b16 %rs745, %rs1125, 15;
selp.b16 %rs1105, %rs745, %rs1125, %p146;
and.b16 %rs746, %rs1105, 255;
mov.u16 %rs747, 8;
sub.s16 %rs1139, %rs747, %rs1104;
setp.eq.s16 %p147, %rs746, 255;
selp.u16 %rs1104, 1, 0, %p147;
$L__BB25_95:
add.s16 %rs1139, %rs1139, -1;
cvt.u32.u16 %r1279, %rs1139;
and.b32 %r1280, %r1279, 255;
cvt.u32.u16 %r1281, %rs1105;
and.b32 %r2411, %r1281, 255;
shr.u32 %r1282, %r2411, %r1280;
and.b32 %r1283, %r1282, 1;
bfi.b32 %r2416, %r2416, %r1283, 1, 31;
setp.eq.s32 %p148, %r144, 1;
@%p148 bra $L__BB25_106;
and.b16 %rs748, %rs1139, 255;
setp.ne.s16 %p149, %rs748, 0;
@%p149 bra $L__BB25_100;
setp.eq.s32 %p150, %r2390, 0;
mov.u16 %rs1129, 255;
@%p150 bra $L__BB25_99;
cvt.u64.u32 %rd212, %r2389;
add.s64 %rd213, %rd212, %rd4;
add.s64 %rd214, %rd1, %rd213;
ld.global.u8 %rs1129, [%rd214];
$L__BB25_99:
setp.ne.s32 %p152, %r2390, 0;
selp.u32 %r1284, 1, 0, %p152;
add.s32 %r2389, %r2389, %r1284;
add.s32 %r1285, %r2390, -1;
selp.b32 %r2390, 0, %r1285, %p150;
setp.eq.s32 %p153, %r2390, 0;
or.b16 %rs750, %rs1129, 15;
selp.b16 %rs1105, %rs750, %rs1129, %p153;
and.b16 %rs751, %rs1105, 255;
mov.u16 %rs752, 8;
sub.s16 %rs1139, %rs752, %rs1104;
setp.eq.s16 %p154, %rs751, 255;
selp.u16 %rs1104, 1, 0, %p154;
cvt.u32.u16 %r1286, %rs1105;
and.b32 %r2411, %r1286, 255;
$L__BB25_100:
add.s16 %rs1139, %rs1139, -1;
cvt.u32.u16 %r1287, %rs1139;
and.b32 %r1288, %r1287, 255;
shr.u32 %r1289, %r2411, %r1288;
and.b32 %r1290, %r1289, 1;
bfi.b32 %r2416, %r2416, %r1290, 1, 31;
setp.eq.s32 %p155, %r144, 2;
@%p155 bra $L__BB25_106;
and.b16 %rs753, %rs1139, 255;
setp.ne.s16 %p156, %rs753, 0;
@%p156 bra $L__BB25_105;
setp.eq.s32 %p157, %r2390, 0;
mov.u16 %rs1133, 255;
@%p157 bra $L__BB25_104;
cvt.u64.u32 %rd215, %r2389;
add.s64 %rd216, %rd215, %rd4;
add.s64 %rd217, %rd1, %rd216;
ld.global.u8 %rs1133, [%rd217];
$L__BB25_104:
setp.ne.s32 %p159, %r2390, 0;
selp.u32 %r1291, 1, 0, %p159;
add.s32 %r2389, %r2389, %r1291;
add.s32 %r1292, %r2390, -1;
selp.b32 %r2390, 0, %r1292, %p157;
setp.eq.s32 %p160, %r2390, 0;
or.b16 %rs755, %rs1133, 15;
selp.b16 %rs1105, %rs755, %rs1133, %p160;
and.b16 %rs756, %rs1105, 255;
mov.u16 %rs757, 8;
sub.s16 %rs1139, %rs757, %rs1104;
setp.eq.s16 %p161, %rs756, 255;
selp.u16 %rs1104, 1, 0, %p161;
cvt.u32.u16 %r1293, %rs1105;
and.b32 %r2411, %r1293, 255;
$L__BB25_105:
add.s16 %rs1139, %rs1139, -1;
cvt.u32.u16 %r1294, %rs1139;
and.b32 %r1295, %r1294, 255;
shr.u32 %r1296, %r2411, %r1295;
and.b32 %r1297, %r1296, 1;
bfi.b32 %r2416, %r2416, %r1297, 1, 31;
$L__BB25_106:
shl.b32 %r1298, %r2416, 1;
or.b32 %r2420, %r1298, 1;
add.s32 %r1299, %r2480, -1;
setp.eq.s32 %p162, %r2480, 0;
selp.b32 %r2419, 0, %r1299, %p162;
$L__BB25_107:
mul.lo.s32 %r1300, %r2481, 7;
cvt.u64.u32 %rd218, %r2420;
shl.b64 %rd219, %rd218, %r1300;
or.b64 %rd606, %rd219, %rd606;
setp.ne.s32 %p163, %r2480, 12;
setp.ne.s32 %p164, %r140, 0;
or.pred %p165, %p163, %p164;
add.s32 %r2481, %r2481, 1;
setp.lt.u32 %p166, %r2481, 8;
or.pred %p167, %p166, %p165;
mov.u32 %r2480, %r2419;
@%p167 bra $L__BB25_63;
$L__BB25_108:
cvt.u32.u64 %r1301, %rd606;
and.b32 %r2477, %r1301, 127;
shr.u64 %rd606, %rd606, 7;
add.s32 %r2481, %r2481, -1;
$L__BB25_109:
mul.wide.u32 %rd220, %r2364, 2;
add.s64 %rd26, %rd14, %rd220;
st.local.u16 [%rd26], %r2430;
shl.b32 %r1302, %r2430, 3;
and.b32 %r1303, %r1302, 128;
shl.b32 %r1304, %r2430, 2;
and.b32 %r1305, %r1304, 896;
or.b32 %r1306, %r1303, %r1305;
and.b32 %r1307, %r2430, 7;
shr.u64 %rd27, %rd615, %r1307;
sub.s32 %r226, %r2560, %r1307;
cvt.u32.u64 %r1308, %rd27;
and.b32 %r1309, %r1308, 127;
or.b32 %r1310, %r1309, %r1306;
mul.wide.u32 %rd221, %r1310, 2;
add.s64 %rd222, %rd13, %rd221;
ld.global.u16 %r2482, [%rd222];
setp.ne.s32 %p168, %r1306, 0;
add.s32 %r228, %r2364, 2;
setp.ge.u32 %p169, %r228, %r2;
or.pred %p170, %p169, %p168;
@%p170 bra $L__BB25_159;
add.s32 %r229, %r2477, -2;
setp.eq.s32 %p171, %r229, -1;
selp.b32 %r2482, %r2482, 0, %p171;
setp.gt.s32 %p172, %r2477, 1;
mov.u32 %r2477, %r229;
@%p172 bra $L__BB25_159;
setp.ne.s32 %p173, %r2481, 0;
@%p173 bra $L__BB25_158;
mov.u32 %r2481, 0;
$L__BB25_113:
setp.gt.u32 %p174, %r2481, 7;
@%p174 bra $L__BB25_158;
cvt.u64.u32 %rd29, %r2480;
mul.wide.u32 %rd223, %r2480, 4;
add.s64 %rd225, %rd151, %rd223;
ld.global.nc.u32 %r235, [%rd225];
and.b16 %rs758, %rs1139, 255;
setp.ne.s16 %p175, %rs758, 0;
@%p175 bra $L__BB25_118;
setp.eq.s32 %p176, %r2390, 0;
mov.u16 %rs1152, 255;
@%p176 bra $L__BB25_117;
cvt.u64.u32 %rd226, %r2389;
add.s64 %rd227, %rd226, %rd4;
add.s64 %rd228, %rd1, %rd227;
ld.global.u8 %rs1152, [%rd228];
$L__BB25_117:
setp.ne.s32 %p178, %r2390, 0;
selp.u32 %r1312, 1, 0, %p178;
add.s32 %r2389, %r2389, %r1312;
add.s32 %r1313, %r2390, -1;
selp.b32 %r2390, 0, %r1313, %p176;
setp.eq.s32 %p179, %r2390, 0;
or.b16 %rs760, %rs1152, 15;
selp.b16 %rs1105, %rs760, %rs1152, %p179;
and.b16 %rs761, %rs1105, 255;
mov.u16 %rs762, 8;
sub.s16 %rs1139, %rs762, %rs1104;
setp.eq.s16 %p180, %rs761, 255;
selp.u16 %rs1104, 1, 0, %p180;
$L__BB25_118:
add.s16 %rs1139, %rs1139, -1;
cvt.u32.u16 %r1314, %rs1139;
and.b32 %r1315, %r1314, 255;
mov.u32 %r1316, 1;
shl.b32 %r1317, %r1316, %r1315;
cvt.u32.u16 %r1318, %rs1105;
and.b32 %r1319, %r1317, %r1318;
and.b32 %r240, %r1319, 255;
setp.eq.s32 %p181, %r240, 0;
@%p181 bra $L__BB25_120;
add.s32 %r1320, %r2480, 1;
min.u32 %r2471, %r1320, 12;
mov.u32 %r1321, -1;
shl.b32 %r1322, %r1321, %r235;
shl.b32 %r1323, %r1322, 1;
xor.b32 %r2472, %r1323, -2;
bra.uni $L__BB25_157;
$L__BB25_120:
add.s64 %rd229, %rd29, -3;
setp.gt.u64 %p182, %rd229, 9;
mov.u32 %r2468, 0;
@%p182 bra $L__BB25_156;
max.u32 %r243, %r235, 1;
add.s32 %r1327, %r243, -1;
and.b32 %r244, %r243, 3;
setp.lt.u32 %p183, %r1327, 3;
mov.u32 %r2468, 0;
@%p183 bra $L__BB25_140;
sub.s32 %r2440, %r243, %r244;
mov.u32 %r2468, 0;
$L__BB25_123:
and.b16 %rs764, %rs1139, 255;
setp.ne.s16 %p184, %rs764, 0;
@%p184 bra $L__BB25_127;
setp.eq.s32 %p185, %r2390, 0;
mov.u16 %rs1159, 255;
@%p185 bra $L__BB25_126;
cvt.u64.u32 %rd230, %r2389;
add.s64 %rd231, %rd230, %rd4;
add.s64 %rd232, %rd1, %rd231;
ld.global.u8 %rs1159, [%rd232];
$L__BB25_126:
setp.ne.s32 %p187, %r2390, 0;
selp.u32 %r1329, 1, 0, %p187;
add.s32 %r2389, %r2389, %r1329;
add.s32 %r1330, %r2390, -1;
selp.b32 %r2390, 0, %r1330, %p185;
setp.eq.s32 %p188, %r2390, 0;
or.b16 %rs766, %rs1159, 15;
selp.b16 %rs1105, %rs766, %rs1159, %p188;
and.b16 %rs767, %rs1105, 255;
mov.u16 %rs768, 8;
sub.s16 %rs1139, %rs768, %rs1104;
setp.eq.s16 %p189, %rs767, 255;
selp.u16 %rs1104, 1, 0, %p189;
$L__BB25_127:
add.s16 %rs1166, %rs1139, -1;
and.b16 %rs769, %rs1166, 255;
cvt.u32.u16 %r1331, %rs1166;
and.b32 %r1332, %r1331, 255;
cvt.u32.u16 %r1333, %rs1105;
and.b32 %r2446, %r1333, 255;
shr.u32 %r1334, %r2446, %r1332;
and.b32 %r1335, %r1334, 1;
bfi.b32 %r255, %r2468, %r1335, 1, 31;
setp.ne.s16 %p190, %rs769, 0;
@%p190 bra $L__BB25_131;
setp.eq.s32 %p191, %r2390, 0;
mov.u16 %rs1163, 255;
@%p191 bra $L__BB25_130;
cvt.u64.u32 %rd233, %r2389;
add.s64 %rd234, %rd233, %rd4;
add.s64 %rd235, %rd1, %rd234;
ld.global.u8 %rs1163, [%rd235];
$L__BB25_130:
setp.ne.s32 %p193, %r2390, 0;
selp.u32 %r1336, 1, 0, %p193;
add.s32 %r2389, %r2389, %r1336;
add.s32 %r1337, %r2390, -1;
selp.b32 %r2390, 0, %r1337, %p191;
setp.eq.s32 %p194, %r2390, 0;
or.b16 %rs771, %rs1163, 15;
selp.b16 %rs1105, %rs771, %rs1163, %p194;
and.b16 %rs772, %rs1105, 255;
mov.u16 %rs773, 8;
sub.s16 %rs1166, %rs773, %rs1104;
setp.eq.s16 %p195, %rs772, 255;
selp.u16 %rs1104, 1, 0, %p195;
cvt.u32.u16 %r1338, %rs1105;
and.b32 %r2446, %r1338, 255;
$L__BB25_131:
add.s16 %rs1170, %rs1166, -1;
and.b16 %rs774, %rs1170, 255;
cvt.u32.u16 %r1339, %rs1170;
and.b32 %r1340, %r1339, 255;
shr.u32 %r1341, %r2446, %r1340;
and.b32 %r1342, %r1341, 1;
bfi.b32 %r262, %r255, %r1342, 1, 31;
setp.ne.s16 %p196, %rs774, 0;
@%p196 bra $L__BB25_135;
setp.eq.s32 %p197, %r2390, 0;
mov.u16 %rs1167, 255;
@%p197 bra $L__BB25_134;
cvt.u64.u32 %rd236, %r2389;
add.s64 %rd237, %rd236, %rd4;
add.s64 %rd238, %rd1, %rd237;
ld.global.u8 %rs1167, [%rd238];
$L__BB25_134:
setp.ne.s32 %p199, %r2390, 0;
selp.u32 %r1343, 1, 0, %p199;
add.s32 %r2389, %r2389, %r1343;
add.s32 %r1344, %r2390, -1;
selp.b32 %r2390, 0, %r1344, %p197;
setp.eq.s32 %p200, %r2390, 0;
or.b16 %rs776, %rs1167, 15;
selp.b16 %rs1105, %rs776, %rs1167, %p200;
and.b16 %rs777, %rs1105, 255;
mov.u16 %rs778, 8;
sub.s16 %rs1170, %rs778, %rs1104;
setp.eq.s16 %p201, %rs777, 255;
selp.u16 %rs1104, 1, 0, %p201;
cvt.u32.u16 %r1345, %rs1105;
and.b32 %r2446, %r1345, 255;
$L__BB25_135:
add.s16 %rs1174, %rs1170, -1;
and.b16 %rs779, %rs1174, 255;
cvt.u32.u16 %r1346, %rs1174;
and.b32 %r1347, %r1346, 255;
shr.u32 %r1348, %r2446, %r1347;
and.b32 %r1349, %r1348, 1;
bfi.b32 %r269, %r262, %r1349, 1, 31;
setp.ne.s16 %p202, %rs779, 0;
@%p202 bra $L__BB25_139;
setp.eq.s32 %p203, %r2390, 0;
mov.u16 %rs1171, 255;
@%p203 bra $L__BB25_138;
cvt.u64.u32 %rd239, %r2389;
add.s64 %rd240, %rd239, %rd4;
add.s64 %rd241, %rd1, %rd240;
ld.global.u8 %rs1171, [%rd241];
$L__BB25_138:
setp.ne.s32 %p205, %r2390, 0;
selp.u32 %r1350, 1, 0, %p205;
add.s32 %r2389, %r2389, %r1350;
add.s32 %r1351, %r2390, -1;
selp.b32 %r2390, 0, %r1351, %p203;
setp.eq.s32 %p206, %r2390, 0;
or.b16 %rs781, %rs1171, 15;
selp.b16 %rs1105, %rs781, %rs1171, %p206;
and.b16 %rs782, %rs1105, 255;
mov.u16 %rs783, 8;
sub.s16 %rs1174, %rs783, %rs1104;
setp.eq.s16 %p207, %rs782, 255;
selp.u16 %rs1104, 1, 0, %p207;
cvt.u32.u16 %r1352, %rs1105;
and.b32 %r2446, %r1352, 255;
$L__BB25_139:
add.s16 %rs1139, %rs1174, -1;
cvt.u32.u16 %r1353, %rs1139;
and.b32 %r1354, %r1353, 255;
shr.u32 %r1355, %r2446, %r1354;
and.b32 %r1356, %r1355, 1;
bfi.b32 %r2468, %r269, %r1356, 1, 31;
add.s32 %r2440, %r2440, -4;
setp.ne.s32 %p208, %r2440, 0;
@%p208 bra $L__BB25_123;
$L__BB25_140:
setp.eq.s32 %p209, %r244, 0;
@%p209 bra $L__BB25_156;
and.b16 %rs784, %rs1139, 255;
setp.ne.s16 %p210, %rs784, 0;
@%p210 bra $L__BB25_145;
setp.eq.s32 %p211, %r2390, 0;
mov.u16 %rs1181, 255;
@%p211 bra $L__BB25_144;
cvt.u64.u32 %rd242, %r2389;
add.s64 %rd243, %rd242, %rd4;
add.s64 %rd244, %rd1, %rd243;
ld.global.u8 %rs1181, [%rd244];
$L__BB25_144:
setp.ne.s32 %p213, %r2390, 0;
selp.u32 %r1357, 1, 0, %p213;
add.s32 %r2389, %r2389, %r1357;
add.s32 %r1358, %r2390, -1;
selp.b32 %r2390, 0, %r1358, %p211;
setp.eq.s32 %p214, %r2390, 0;
or.b16 %rs786, %rs1181, 15;
selp.b16 %rs1105, %rs786, %rs1181, %p214;
and.b16 %rs787, %rs1105, 255;
mov.u16 %rs788, 8;
sub.s16 %rs1139, %rs788, %rs1104;
setp.eq.s16 %p215, %rs787, 255;
selp.u16 %rs1104, 1, 0, %p215;
$L__BB25_145:
add.s16 %rs1139, %rs1139, -1;
cvt.u32.u16 %r1359, %rs1139;
and.b32 %r1360, %r1359, 255;
cvt.u32.u16 %r1361, %rs1105;
and.b32 %r2463, %r1361, 255;
shr.u32 %r1362, %r2463, %r1360;
and.b32 %r1363, %r1362, 1;
bfi.b32 %r2468, %r2468, %r1363, 1, 31;
setp.eq.s32 %p216, %r244, 1;
@%p216 bra $L__BB25_156;
and.b16 %rs789, %rs1139, 255;
setp.ne.s16 %p217, %rs789, 0;
@%p217 bra $L__BB25_150;
setp.eq.s32 %p218, %r2390, 0;
mov.u16 %rs1185, 255;
@%p218 bra $L__BB25_149;
cvt.u64.u32 %rd245, %r2389;
add.s64 %rd246, %rd245, %rd4;
add.s64 %rd247, %rd1, %rd246;
ld.global.u8 %rs1185, [%rd247];
$L__BB25_149:
setp.ne.s32 %p220, %r2390, 0;
selp.u32 %r1364, 1, 0, %p220;
add.s32 %r2389, %r2389, %r1364;
add.s32 %r1365, %r2390, -1;
selp.b32 %r2390, 0, %r1365, %p218;
setp.eq.s32 %p221, %r2390, 0;
or.b16 %rs791, %rs1185, 15;
selp.b16 %rs1105, %rs791, %rs1185, %p221;
and.b16 %rs792, %rs1105, 255;
mov.u16 %rs793, 8;
sub.s16 %rs1139, %rs793, %rs1104;
setp.eq.s16 %p222, %rs792, 255;
selp.u16 %rs1104, 1, 0, %p222;
cvt.u32.u16 %r1366, %rs1105;
and.b32 %r2463, %r1366, 255;
$L__BB25_150:
add.s16 %rs1139, %rs1139, -1;
cvt.u32.u16 %r1367, %rs1139;
and.b32 %r1368, %r1367, 255;
shr.u32 %r1369, %r2463, %r1368;
and.b32 %r1370, %r1369, 1;
bfi.b32 %r2468, %r2468, %r1370, 1, 31;
setp.eq.s32 %p223, %r244, 2;
@%p223 bra $L__BB25_156;
and.b16 %rs794, %rs1139, 255;
setp.ne.s16 %p224, %rs794, 0;
@%p224 bra $L__BB25_155;
setp.eq.s32 %p225, %r2390, 0;
mov.u16 %rs1189, 255;
@%p225 bra $L__BB25_154;
cvt.u64.u32 %rd248, %r2389;
add.s64 %rd249, %rd248, %rd4;
add.s64 %rd250, %rd1, %rd249;
ld.global.u8 %rs1189, [%rd250];
$L__BB25_154:
setp.ne.s32 %p227, %r2390, 0;
selp.u32 %r1371, 1, 0, %p227;
add.s32 %r2389, %r2389, %r1371;
add.s32 %r1372, %r2390, -1;
selp.b32 %r2390, 0, %r1372, %p225;
setp.eq.s32 %p228, %r2390, 0;
or.b16 %rs796, %rs1189, 15;
selp.b16 %rs1105, %rs796, %rs1189, %p228;
and.b16 %rs797, %rs1105, 255;
mov.u16 %rs798, 8;
sub.s16 %rs1139, %rs798, %rs1104;
setp.eq.s16 %p229, %rs797, 255;
selp.u16 %rs1104, 1, 0, %p229;
cvt.u32.u16 %r1373, %rs1105;
and.b32 %r2463, %r1373, 255;
$L__BB25_155:
add.s16 %rs1139, %rs1139, -1;
cvt.u32.u16 %r1374, %rs1139;
and.b32 %r1375, %r1374, 255;
shr.u32 %r1376, %r2463, %r1375;
and.b32 %r1377, %r1376, 1;
bfi.b32 %r2468, %r2468, %r1377, 1, 31;
$L__BB25_156:
shl.b32 %r1378, %r2468, 1;
or.b32 %r2472, %r1378, 1;
add.s32 %r1379, %r2480, -1;
setp.eq.s32 %p230, %r2480, 0;
selp.b32 %r2471, 0, %r1379, %p230;
$L__BB25_157:
mul.lo.s32 %r1380, %r2481, 7;
cvt.u64.u32 %rd251, %r2472;
shl.b64 %rd252, %rd251, %r1380;
or.b64 %rd606, %rd252, %rd606;
setp.ne.s32 %p231, %r2480, 12;
setp.ne.s32 %p232, %r240, 0;
or.pred %p233, %p231, %p232;
add.s32 %r2481, %r2481, 1;
setp.lt.u32 %p234, %r2481, 8;
or.pred %p235, %p234, %p233;
mov.u32 %r2480, %r2471;
@%p235 bra $L__BB25_113;
$L__BB25_158:
cvt.u32.u64 %r1381, %rd606;
and.b32 %r2477, %r1381, 127;
shr.u64 %rd606, %rd606, 7;
add.s32 %r2481, %r2481, -1;
$L__BB25_159:
setp.lt.u32 %p236, %r228, %r2;
selp.b32 %r326, %r2482, 0, %p236;
st.local.u16 [%rd26+4], %r326;
and.b32 %r1383, %r1302, 64;
shl.b32 %r1384, %r326, 4;
and.b32 %r1385, %r1384, 128;
or.b32 %r2534, %r1385, %r1383;
setp.ne.s32 %p237, %r2534, 192;
@%p237 bra $L__BB25_209;
add.s32 %r328, %r2477, -2;
setp.eq.s32 %p238, %r328, -1;
selp.b32 %r2534, 256, 192, %p238;
setp.gt.s32 %p239, %r2477, 1;
mov.u32 %r2477, %r328;
@%p239 bra $L__BB25_209;
setp.ne.s32 %p240, %r2481, 0;
@%p240 bra $L__BB25_208;
mov.u32 %r2481, 0;
$L__BB25_163:
setp.gt.u32 %p241, %r2481, 7;
@%p241 bra $L__BB25_208;
cvt.u64.u32 %rd35, %r2480;
mul.wide.u32 %rd253, %r2480, 4;
add.s64 %rd255, %rd151, %rd253;
ld.global.nc.u32 %r334, [%rd255];
and.b16 %rs799, %rs1139, 255;
setp.ne.s16 %p242, %rs799, 0;
@%p242 bra $L__BB25_168;
setp.eq.s32 %p243, %r2390, 0;
mov.u16 %rs1208, 255;
@%p243 bra $L__BB25_167;
cvt.u64.u32 %rd256, %r2389;
add.s64 %rd257, %rd256, %rd4;
add.s64 %rd258, %rd1, %rd257;
ld.global.u8 %rs1208, [%rd258];
$L__BB25_167:
setp.ne.s32 %p245, %r2390, 0;
selp.u32 %r1387, 1, 0, %p245;
add.s32 %r2389, %r2389, %r1387;
add.s32 %r1388, %r2390, -1;
selp.b32 %r2390, 0, %r1388, %p243;
setp.eq.s32 %p246, %r2390, 0;
or.b16 %rs801, %rs1208, 15;
selp.b16 %rs1105, %rs801, %rs1208, %p246;
and.b16 %rs802, %rs1105, 255;
mov.u16 %rs803, 8;
sub.s16 %rs1139, %rs803, %rs1104;
setp.eq.s16 %p247, %rs802, 255;
selp.u16 %rs1104, 1, 0, %p247;
$L__BB25_168:
add.s16 %rs1139, %rs1139, -1;
cvt.u32.u16 %r1389, %rs1139;
and.b32 %r1390, %r1389, 255;
mov.u32 %r1391, 1;
shl.b32 %r1392, %r1391, %r1390;
cvt.u32.u16 %r1393, %rs1105;
and.b32 %r1394, %r1392, %r1393;
and.b32 %r339, %r1394, 255;
setp.eq.s32 %p248, %r339, 0;
@%p248 bra $L__BB25_170;
add.s32 %r1395, %r2480, 1;
min.u32 %r2523, %r1395, 12;
mov.u32 %r1396, -1;
shl.b32 %r1397, %r1396, %r334;
shl.b32 %r1398, %r1397, 1;
xor.b32 %r2524, %r1398, -2;
bra.uni $L__BB25_207;
$L__BB25_170:
add.s64 %rd259, %rd35, -3;
setp.gt.u64 %p249, %rd259, 9;
mov.u32 %r2520, 0;
@%p249 bra $L__BB25_206;
max.u32 %r342, %r334, 1;
add.s32 %r1402, %r342, -1;
and.b32 %r343, %r342, 3;
setp.lt.u32 %p250, %r1402, 3;
mov.u32 %r2520, 0;
@%p250 bra $L__BB25_190;
sub.s32 %r2492, %r342, %r343;
mov.u32 %r2520, 0;
$L__BB25_173:
and.b16 %rs805, %rs1139, 255;
setp.ne.s16 %p251, %rs805, 0;
@%p251 bra $L__BB25_177;
setp.eq.s32 %p252, %r2390, 0;
mov.u16 %rs1215, 255;
@%p252 bra $L__BB25_176;
cvt.u64.u32 %rd260, %r2389;
add.s64 %rd261, %rd260, %rd4;
add.s64 %rd262, %rd1, %rd261;
ld.global.u8 %rs1215, [%rd262];
$L__BB25_176:
setp.ne.s32 %p254, %r2390, 0;
selp.u32 %r1404, 1, 0, %p254;
add.s32 %r2389, %r2389, %r1404;
add.s32 %r1405, %r2390, -1;
selp.b32 %r2390, 0, %r1405, %p252;
setp.eq.s32 %p255, %r2390, 0;
or.b16 %rs807, %rs1215, 15;
selp.b16 %rs1105, %rs807, %rs1215, %p255;
and.b16 %rs808, %rs1105, 255;
mov.u16 %rs809, 8;
sub.s16 %rs1139, %rs809, %rs1104;
setp.eq.s16 %p256, %rs808, 255;
selp.u16 %rs1104, 1, 0, %p256;
$L__BB25_177:
add.s16 %rs1222, %rs1139, -1;
and.b16 %rs810, %rs1222, 255;
cvt.u32.u16 %r1406, %rs1222;
and.b32 %r1407, %r1406, 255;
cvt.u32.u16 %r1408, %rs1105;
and.b32 %r2498, %r1408, 255;
shr.u32 %r1409, %r2498, %r1407;
and.b32 %r1410, %r1409, 1;
bfi.b32 %r354, %r2520, %r1410, 1, 31;
setp.ne.s16 %p257, %rs810, 0;
@%p257 bra $L__BB25_181;
setp.eq.s32 %p258, %r2390, 0;
mov.u16 %rs1219, 255;
@%p258 bra $L__BB25_180;
cvt.u64.u32 %rd263, %r2389;
add.s64 %rd264, %rd263, %rd4;
add.s64 %rd265, %rd1, %rd264;
ld.global.u8 %rs1219, [%rd265];
$L__BB25_180:
setp.ne.s32 %p260, %r2390, 0;
selp.u32 %r1411, 1, 0, %p260;
add.s32 %r2389, %r2389, %r1411;
add.s32 %r1412, %r2390, -1;
selp.b32 %r2390, 0, %r1412, %p258;
setp.eq.s32 %p261, %r2390, 0;
or.b16 %rs812, %rs1219, 15;
selp.b16 %rs1105, %rs812, %rs1219, %p261;
and.b16 %rs813, %rs1105, 255;
mov.u16 %rs814, 8;
sub.s16 %rs1222, %rs814, %rs1104;
setp.eq.s16 %p262, %rs813, 255;
selp.u16 %rs1104, 1, 0, %p262;
cvt.u32.u16 %r1413, %rs1105;
and.b32 %r2498, %r1413, 255;
$L__BB25_181:
add.s16 %rs1226, %rs1222, -1;
and.b16 %rs815, %rs1226, 255;
cvt.u32.u16 %r1414, %rs1226;
and.b32 %r1415, %r1414, 255;
shr.u32 %r1416, %r2498, %r1415;
and.b32 %r1417, %r1416, 1;
bfi.b32 %r361, %r354, %r1417, 1, 31;
setp.ne.s16 %p263, %rs815, 0;
@%p263 bra $L__BB25_185;
setp.eq.s32 %p264, %r2390, 0;
mov.u16 %rs1223, 255;
@%p264 bra $L__BB25_184;
cvt.u64.u32 %rd266, %r2389;
add.s64 %rd267, %rd266, %rd4;
add.s64 %rd268, %rd1, %rd267;
ld.global.u8 %rs1223, [%rd268];
$L__BB25_184:
setp.ne.s32 %p266, %r2390, 0;
selp.u32 %r1418, 1, 0, %p266;
add.s32 %r2389, %r2389, %r1418;
add.s32 %r1419, %r2390, -1;
selp.b32 %r2390, 0, %r1419, %p264;
setp.eq.s32 %p267, %r2390, 0;
or.b16 %rs817, %rs1223, 15;
selp.b16 %rs1105, %rs817, %rs1223, %p267;
and.b16 %rs818, %rs1105, 255;
mov.u16 %rs819, 8;
sub.s16 %rs1226, %rs819, %rs1104;
setp.eq.s16 %p268, %rs818, 255;
selp.u16 %rs1104, 1, 0, %p268;
cvt.u32.u16 %r1420, %rs1105;
and.b32 %r2498, %r1420, 255;
$L__BB25_185:
add.s16 %rs1230, %rs1226, -1;
and.b16 %rs820, %rs1230, 255;
cvt.u32.u16 %r1421, %rs1230;
and.b32 %r1422, %r1421, 255;
shr.u32 %r1423, %r2498, %r1422;
and.b32 %r1424, %r1423, 1;
bfi.b32 %r368, %r361, %r1424, 1, 31;
setp.ne.s16 %p269, %rs820, 0;
@%p269 bra $L__BB25_189;
setp.eq.s32 %p270, %r2390, 0;
mov.u16 %rs1227, 255;
@%p270 bra $L__BB25_188;
cvt.u64.u32 %rd269, %r2389;
add.s64 %rd270, %rd269, %rd4;
add.s64 %rd271, %rd1, %rd270;
ld.global.u8 %rs1227, [%rd271];
$L__BB25_188:
setp.ne.s32 %p272, %r2390, 0;
selp.u32 %r1425, 1, 0, %p272;
add.s32 %r2389, %r2389, %r1425;
add.s32 %r1426, %r2390, -1;
selp.b32 %r2390, 0, %r1426, %p270;
setp.eq.s32 %p273, %r2390, 0;
or.b16 %rs822, %rs1227, 15;
selp.b16 %rs1105, %rs822, %rs1227, %p273;
and.b16 %rs823, %rs1105, 255;
mov.u16 %rs824, 8;
sub.s16 %rs1230, %rs824, %rs1104;
setp.eq.s16 %p274, %rs823, 255;
selp.u16 %rs1104, 1, 0, %p274;
cvt.u32.u16 %r1427, %rs1105;
and.b32 %r2498, %r1427, 255;
$L__BB25_189:
add.s16 %rs1139, %rs1230, -1;
cvt.u32.u16 %r1428, %rs1139;
and.b32 %r1429, %r1428, 255;
shr.u32 %r1430, %r2498, %r1429;
and.b32 %r1431, %r1430, 1;
bfi.b32 %r2520, %r368, %r1431, 1, 31;
add.s32 %r2492, %r2492, -4;
setp.ne.s32 %p275, %r2492, 0;
@%p275 bra $L__BB25_173;
$L__BB25_190:
setp.eq.s32 %p276, %r343, 0;
@%p276 bra $L__BB25_206;
and.b16 %rs825, %rs1139, 255;
setp.ne.s16 %p277, %rs825, 0;
@%p277 bra $L__BB25_195;
setp.eq.s32 %p278, %r2390, 0;
mov.u16 %rs1237, 255;
@%p278 bra $L__BB25_194;
cvt.u64.u32 %rd272, %r2389;
add.s64 %rd273, %rd272, %rd4;
add.s64 %rd274, %rd1, %rd273;
ld.global.u8 %rs1237, [%rd274];
$L__BB25_194:
setp.ne.s32 %p280, %r2390, 0;
selp.u32 %r1432, 1, 0, %p280;
add.s32 %r2389, %r2389, %r1432;
add.s32 %r1433, %r2390, -1;
selp.b32 %r2390, 0, %r1433, %p278;
setp.eq.s32 %p281, %r2390, 0;
or.b16 %rs827, %rs1237, 15;
selp.b16 %rs1105, %rs827, %rs1237, %p281;
and.b16 %rs828, %rs1105, 255;
mov.u16 %rs829, 8;
sub.s16 %rs1139, %rs829, %rs1104;
setp.eq.s16 %p282, %rs828, 255;
selp.u16 %rs1104, 1, 0, %p282;
$L__BB25_195:
add.s16 %rs1139, %rs1139, -1;
cvt.u32.u16 %r1434, %rs1139;
and.b32 %r1435, %r1434, 255;
cvt.u32.u16 %r1436, %rs1105;
and.b32 %r2515, %r1436, 255;
shr.u32 %r1437, %r2515, %r1435;
and.b32 %r1438, %r1437, 1;
bfi.b32 %r2520, %r2520, %r1438, 1, 31;
setp.eq.s32 %p283, %r343, 1;
@%p283 bra $L__BB25_206;
and.b16 %rs830, %rs1139, 255;
setp.ne.s16 %p284, %rs830, 0;
@%p284 bra $L__BB25_200;
setp.eq.s32 %p285, %r2390, 0;
mov.u16 %rs1241, 255;
@%p285 bra $L__BB25_199;
cvt.u64.u32 %rd275, %r2389;
add.s64 %rd276, %rd275, %rd4;
add.s64 %rd277, %rd1, %rd276;
ld.global.u8 %rs1241, [%rd277];
$L__BB25_199:
setp.ne.s32 %p287, %r2390, 0;
selp.u32 %r1439, 1, 0, %p287;
add.s32 %r2389, %r2389, %r1439;
add.s32 %r1440, %r2390, -1;
selp.b32 %r2390, 0, %r1440, %p285;
setp.eq.s32 %p288, %r2390, 0;
or.b16 %rs832, %rs1241, 15;
selp.b16 %rs1105, %rs832, %rs1241, %p288;
and.b16 %rs833, %rs1105, 255;
mov.u16 %rs834, 8;
sub.s16 %rs1139, %rs834, %rs1104;
setp.eq.s16 %p289, %rs833, 255;
selp.u16 %rs1104, 1, 0, %p289;
cvt.u32.u16 %r1441, %rs1105;
and.b32 %r2515, %r1441, 255;
$L__BB25_200:
add.s16 %rs1139, %rs1139, -1;
cvt.u32.u16 %r1442, %rs1139;
and.b32 %r1443, %r1442, 255;
shr.u32 %r1444, %r2515, %r1443;
and.b32 %r1445, %r1444, 1;
bfi.b32 %r2520, %r2520, %r1445, 1, 31;
setp.eq.s32 %p290, %r343, 2;
@%p290 bra $L__BB25_206;
and.b16 %rs835, %rs1139, 255;
setp.ne.s16 %p291, %rs835, 0;
@%p291 bra $L__BB25_205;
setp.eq.s32 %p292, %r2390, 0;
mov.u16 %rs1245, 255;
@%p292 bra $L__BB25_204;
cvt.u64.u32 %rd278, %r2389;
add.s64 %rd279, %rd278, %rd4;
add.s64 %rd280, %rd1, %rd279;
ld.global.u8 %rs1245, [%rd280];
$L__BB25_204:
setp.ne.s32 %p294, %r2390, 0;
selp.u32 %r1446, 1, 0, %p294;
add.s32 %r2389, %r2389, %r1446;
add.s32 %r1447, %r2390, -1;
selp.b32 %r2390, 0, %r1447, %p292;
setp.eq.s32 %p295, %r2390, 0;
or.b16 %rs837, %rs1245, 15;
selp.b16 %rs1105, %rs837, %rs1245, %p295;
and.b16 %rs838, %rs1105, 255;
mov.u16 %rs839, 8;
sub.s16 %rs1139, %rs839, %rs1104;
setp.eq.s16 %p296, %rs838, 255;
selp.u16 %rs1104, 1, 0, %p296;
cvt.u32.u16 %r1448, %rs1105;
and.b32 %r2515, %r1448, 255;
$L__BB25_205:
add.s16 %rs1139, %rs1139, -1;
cvt.u32.u16 %r1449, %rs1139;
and.b32 %r1450, %r1449, 255;
shr.u32 %r1451, %r2515, %r1450;
and.b32 %r1452, %r1451, 1;
bfi.b32 %r2520, %r2520, %r1452, 1, 31;
// Finish the "bit clear" path of the run decoder: turn the bits accumulated in
// %r2520 into a queue entry, append it to the packed queue in %rd606, and then
// pop one entry once the producing loop is done.
$L__BB25_206:
// Entry value = (%r2520 << 1) | 1.
shl.b32 %r1453, %r2520, 1;
or.b32 %r2524, %r1453, 1;
// Next state index = %r2480 - 1, clamped so it never goes below 0.
add.s32 %r1454, %r2480, -1;
setp.eq.s32 %p297, %r2480, 0;
selp.b32 %r2523, 0, %r1454, %p297;
$L__BB25_207:
// OR the entry into %rd606 at bit offset 7 * %r2481 (the current entry count).
mul.lo.s32 %r1455, %r2481, 7;
cvt.u64.u32 %rd281, %r2524;
shl.b64 %rd282, %rd281, %r1455;
or.b64 %rd606, %rd282, %rd606;
// %p300 = (%r2480 != 12) || (%r339 != 0). The compare uses the state index
// from BEFORE the update below; %r339 is defined outside this excerpt.
setp.ne.s32 %p298, %r2480, 12;
setp.ne.s32 %p299, %r339, 0;
or.pred %p300, %p298, %p299;
add.s32 %r2481, %r2481, 1;
// Loop back while the incremented count is below 8 or %p300 holds.
setp.lt.u32 %p301, %r2481, 8;
or.pred %p302, %p301, %p300;
// Commit the next state index chosen on the path that reached this label.
mov.u32 %r2480, %r2523;
@%p302 bra $L__BB25_163;
$L__BB25_208:
// Pop one entry: low 7 bits of %rd606 go to %r2477, the queue shifts right by
// 7 and the entry count drops by one.
cvt.u32.u64 %r1456, %rd606;
and.b32 %r2477, %r1456, 127;
shr.u64 %rd606, %rd606, 7;
add.s32 %r2481, %r2481, -1;
// Consume a table-driven code from the bit buffer, store two derived 16-bit
// values into the local array at %rd26, and compute the lookup context for the
// next iteration. NOTE(review): the field layout below is read off the masks
// and shifts only; presumably this is the HTJ2K u-offset (UVLC) decode for the
// first line pair - confirm against the CUDA source.
$L__BB25_209:
// Drop the number of bits given by the low 3 bits of %r326 from %rd27.
and.b32 %r1457, %r326, 7;
shr.u64 %rd283, %rd27, %r1457;
// Table index = %r2534 + next 6 buffer bits; entries are 16 bits wide (%rd12 base).
cvt.u32.u64 %r1458, %rd283;
and.b32 %r1459, %r1458, 63;
add.s32 %r1460, %r2534, %r1459;
mul.wide.u32 %rd284, %r1460, 2;
add.s64 %rd285, %rd12, %rd284;
ld.global.u16 %r1461, [%rd285];
// Entry bits 0..2: number of further bits to drop from the buffer.
and.b32 %r1462, %r1461, 7;
shr.u64 %rd286, %rd283, %r1462;
// Running bit count: %r226 minus both amounts dropped so far.
sub.s32 %r1463, %r226, %r1457;
sub.s32 %r1464, %r1463, %r1462;
cvt.u32.u64 %r1465, %rd286;
// Entry bits 3..6: length n of the trailing bit field; %r1471 = low n buffer bits.
shr.u32 %r1466, %r1461, 3;
and.b32 %r1467, %r1466, 15;
mov.u32 %r1468, -1;
shl.b32 %r1469, %r1468, %r1467;
not.b32 %r1470, %r1469;
and.b32 %r1471, %r1465, %r1470;
// Updated buffer and bit count after removing those n bits.
shr.u64 %rd615, %rd286, %r1467;
sub.s32 %r2560, %r1464, %r1467;
// Entry bits 7..9 -> k (split point inside the field); bits 10..12 -> first base value.
shr.u32 %r1472, %r1461, 7;
and.b32 %r1473, %r1472, 7;
shr.u32 %r1474, %r1461, 10;
and.b32 %r1475, %r1474, 7;
// ~(255 << k) clears bits k..k+7 of the field, keeping its low k bits.
mov.u32 %r1476, 255;
shl.b32 %r1477, %r1476, %r1473;
not.b32 %r1478, %r1477;
and.b32 %r1479, %r1471, %r1478;
// First result = base(bits 10..12) + low part + 1, stored at local offset +2.
add.s32 %r1480, %r1475, %r1479;
add.s32 %r1481, %r1480, 1;
st.local.u16 [%rd26+2], %r1481;
// Second result = entry bits 13..15 + (field >> k) + 1, stored at local offset +6.
shr.u32 %r1482, %r1461, 13;
shr.u32 %r1483, %r1471, %r1473;
add.s32 %r1484, %r1482, %r1483;
add.s32 %r1485, %r1484, 1;
st.local.u16 [%rd26+6], %r1485;
// Advance the column counter by 4 and test it against %r2.
add.s32 %r2364, %r2364, 4;
setp.lt.u32 %p303, %r2364, %r2;
// Next context %r2363 = ((%r326 << 2) & 0x380) | ((%r326 << 3) & 0x80).
shl.b32 %r1486, %r326, 2;
and.b32 %r1487, %r1486, 896;
shl.b32 %r1488, %r326, 3;
and.b32 %r1489, %r1488, 128;
or.b32 %r2363, %r1489, %r1487;
@%p303 bra $L__BB25_55;
mul.wide.u32 %rd289, %r2364, 2;
add.s64 %rd290, %rd14, %rd289;
mov.u16 %rs840, 0;
st.local.v2.u16 [%rd290], {%rs840, %rs840};
setp.lt.u32 %p304, %r3, 3;
@%p304 bra $L__BB25_319;
ld.param.u64 %rd592, [ j2k_htj2k_decode_codeblocks_param_4];
ld.param.u64 %rd590, [ j2k_htj2k_decode_codeblocks_param_6];
cvta.to.global.u64 %rd41, %rd590;
cvta.to.global.u64 %rd42, %rd592;
mov.u32 %r2535, 2;
$L__BB25_212:
shr.u32 %r1494, %r2535, 1;
mul.lo.s32 %r438, %r1494, %r1148;
sub.s32 %r439, %r438, %r1148;
mov.u32 %r2544, 0;
mov.u32 %r2545, %r2544;
mov.u32 %r2546, %r438;
$L__BB25_213:
sub.s32 %r1495, %r2546, %r438;
add.s32 %r451, %r1495, %r439;
mul.wide.u32 %rd291, %r451, 2;
add.s64 %rd47, %rd14, %rd291;
ld.local.u16 %r1496, [%rd47];
shl.b32 %r1497, %r1496, 2;
and.b32 %r1498, %r1497, 640;
or.b32 %r1499, %r2545, %r1498;
add.s32 %r1500, %r451, 2;
mul.wide.u32 %rd292, %r1500, 2;
add.s64 %rd48, %rd14, %rd292;
ld.local.u16 %r1501, [%rd48];
shl.b32 %r1502, %r1501, 4;
and.b32 %r1503, %r1502, 512;
or.b32 %r452, %r1499, %r1503;
// Refill the 64-bit bit buffer %rd615 until it holds at least 33 bits.
// Bytes are taken walking DOWNWARD through memory (%r2558 is decremented);
// %r2559 counts the bytes still available. The refill is skipped entirely
// when more than 31 bits are already buffered.
setp.gt.u32 %p305, %r2560, 31;
@%p305 bra $L__BB25_217;
$L__BB25_214:
// Default to a zero byte when the source is exhausted (%r2559 == 0).
setp.eq.s32 %p306, %r2559, 0;
mov.u16 %rs1270, 0;
@%p306 bra $L__BB25_216;
// Byte address = %rd1 + %rd4 + sign-extended %r2558.
cvt.s64.s32 %rd293, %r2558;
add.s64 %rd294, %rd293, %rd4;
add.s64 %rd295, %rd1, %rd294;
ld.global.u8 %rs1270, [%rd295];
$L__BB25_216:
// Step the position back by one only if a byte was actually available.
setp.ne.s32 %p308, %r2559, 0;
selp.b32 %r1504, -1, 0, %p308;
add.s32 %r2558, %r2558, %r1504;
// Remaining-byte count decrements but stays at 0 once exhausted.
add.s32 %r1505, %r2559, -1;
selp.b32 %r2559, 0, %r1505, %p306;
and.b16 %rs842, %rs1270, 255;
// Only 7 bits are counted for this byte when the flag left by the previous
// byte (%rs1271) is set AND this byte's low 7 bits are all ones; else 8.
and.b16 %rs843, %rs1270, 127;
setp.eq.s16 %p309, %rs843, 127;
and.b16 %rs844, %rs1271, 255;
setp.ne.s16 %p310, %rs844, 0;
and.pred %p311, %p310, %p309;
selp.b32 %r1506, 7, 8, %p311;
// Insert the byte above the bits already buffered.
cvt.u64.u16 %rd296, %rs1270;
and.b64 %rd297, %rd296, 255;
shl.b64 %rd298, %rd297, %r2560;
or.b64 %rd615, %rd298, %rd615;
add.s32 %r2560, %r1506, %r2560;
// Flag for the next byte: set when this byte is greater than 143 (0x8F).
setp.gt.u16 %p312, %rs842, 143;
selp.u16 %rs1271, 1, 0, %p312;
// Keep reading while fewer than 33 bits are buffered.
setp.lt.u32 %p313, %r2560, 33;
@%p313 bra $L__BB25_214;
$L__BB25_217:
cvt.u32.u64 %r1507, %rd615;
and.b32 %r1508, %r1507, 127;
add.s32 %r1509, %r1508, %r452;
mul.wide.u32 %rd299, %r1509, 2;
add.s64 %rd300, %rd42, %rd299;
ld.global.u16 %r2612, [%rd300];
setp.ne.s32 %p314, %r452, 0;
@%p314 bra $L__BB25_267;
add.s32 %r463, %r2477, -2;
setp.eq.s32 %p315, %r463, -1;
selp.b32 %r2612, %r2612, 0, %p315;
setp.gt.s32 %p316, %r2477, 1;
mov.u32 %r2477, %r463;
@%p316 bra $L__BB25_267;
setp.ne.s32 %p317, %r2481, 0;
@%p317 bra $L__BB25_266;
mov.u32 %r2481, 0;
// Head of the loop that produces queue entries (at most 8 are held at once).
// Reads one bit from a forward byte stream; if the bit is set, the entry is
// derived from a table exponent, otherwise control goes to $L__BB25_228.
// NOTE(review): the table at %rd151 is presumably the MEL exponent table
// declared at the top of the file - confirm where %rd151 is initialised.
$L__BB25_221:
// Stop producing once 8 entries are queued.
setp.gt.u32 %p318, %r2481, 7;
@%p318 bra $L__BB25_266;
// %r469 = 32-bit table entry indexed by the current state %r2480.
cvt.u64.u32 %rd53, %r2480;
mul.wide.u32 %rd301, %r2480, 4;
add.s64 %rd303, %rd151, %rd301;
ld.global.nc.u32 %r469, [%rd303];
// %rs1139 = bits left in the current byte; fetch a new byte only at zero.
and.b16 %rs845, %rs1139, 255;
setp.ne.s16 %p319, %rs845, 0;
@%p319 bra $L__BB25_226;
// Default byte is 0xFF when no source bytes remain (%r2390 == 0).
setp.eq.s32 %p320, %r2390, 0;
mov.u16 %rs1275, 255;
@%p320 bra $L__BB25_225;
// Byte address = %rd1 + %rd4 + %r2389.
cvt.u64.u32 %rd304, %r2389;
add.s64 %rd305, %rd304, %rd4;
add.s64 %rd306, %rd1, %rd305;
ld.global.u8 %rs1275, [%rd306];
$L__BB25_225:
// Advance the position only if a byte was available; clamp the remaining count at 0.
setp.ne.s32 %p322, %r2390, 0;
selp.u32 %r1511, 1, 0, %p322;
add.s32 %r2389, %r2389, %r1511;
add.s32 %r1512, %r2390, -1;
selp.b32 %r2390, 0, %r1512, %p320;
// When no bytes remain after this one, force the byte's low nibble to 0xF.
setp.eq.s32 %p323, %r2390, 0;
or.b16 %rs847, %rs1275, 15;
selp.b16 %rs1105, %rs847, %rs1275, %p323;
and.b16 %rs848, %rs1105, 255;
// Usable bits in this byte = 8 minus the flag left by the previous byte.
mov.u16 %rs849, 8;
sub.s16 %rs1139, %rs849, %rs1104;
// Flag for the next byte: set when this byte equals 0xFF.
setp.eq.s16 %p324, %rs848, 255;
selp.u16 %rs1104, 1, 0, %p324;
$L__BB25_226:
// Take the next bit, most-significant first: %r474 = byte & (1 << (bits_left - 1)).
add.s16 %rs1139, %rs1139, -1;
cvt.u32.u16 %r1513, %rs1139;
and.b32 %r1514, %r1513, 255;
mov.u32 %r1515, 1;
shl.b32 %r1516, %r1515, %r1514;
cvt.u32.u16 %r1517, %rs1105;
and.b32 %r1518, %r1516, %r1517;
and.b32 %r474, %r1518, 255;
setp.eq.s32 %p325, %r474, 0;
@%p325 bra $L__BB25_228;
// Bit set: next state = min(state + 1, 12).
add.s32 %r1519, %r2480, 1;
min.u32 %r2601, %r1519, 12;
// Entry = ((-1 << e) << 1) ^ -2, which equals ((1 << e) - 1) << 1 for e = %r469.
mov.u32 %r1520, -1;
shl.b32 %r1521, %r1520, %r469;
shl.b32 %r1522, %r1521, 1;
xor.b32 %r2602, %r1522, -2;
bra.uni $L__BB25_265;
$L__BB25_228:
add.s64 %rd307, %rd53, -3;
setp.gt.u64 %p326, %rd307, 9;
mov.u32 %r2598, 0;
@%p326 bra $L__BB25_264;
max.u32 %r477, %r469, 1;
add.s32 %r1526, %r477, -1;
and.b32 %r478, %r477, 3;
setp.lt.u32 %p327, %r1526, 3;
mov.u32 %r2598, 0;
@%p327 bra $L__BB25_248;
sub.s32 %r2570, %r477, %r478;
mov.u32 %r2598, 0;
$L__BB25_231:
and.b16 %rs851, %rs1139, 255;
setp.ne.s16 %p328, %rs851, 0;
@%p328 bra $L__BB25_235;
setp.eq.s32 %p329, %r2390, 0;
mov.u16 %rs1282, 255;
@%p329 bra $L__BB25_234;
cvt.u64.u32 %rd308, %r2389;
add.s64 %rd309, %rd308, %rd4;
add.s64 %rd310, %rd1, %rd309;
ld.global.u8 %rs1282, [%rd310];
$L__BB25_234:
setp.ne.s32 %p331, %r2390, 0;
selp.u32 %r1528, 1, 0, %p331;
add.s32 %r2389, %r2389, %r1528;
add.s32 %r1529, %r2390, -1;
selp.b32 %r2390, 0, %r1529, %p329;
setp.eq.s32 %p332, %r2390, 0;
or.b16 %rs853, %rs1282, 15;
selp.b16 %rs1105, %rs853, %rs1282, %p332;
and.b16 %rs854, %rs1105, 255;
mov.u16 %rs855, 8;
sub.s16 %rs1139, %rs855, %rs1104;
setp.eq.s16 %p333, %rs854, 255;
selp.u16 %rs1104, 1, 0, %p333;
$L__BB25_235:
add.s16 %rs1289, %rs1139, -1;
and.b16 %rs856, %rs1289, 255;
cvt.u32.u16 %r1530, %rs1289;
and.b32 %r1531, %r1530, 255;
cvt.u32.u16 %r1532, %rs1105;
and.b32 %r2576, %r1532, 255;
shr.u32 %r1533, %r2576, %r1531;
and.b32 %r1534, %r1533, 1;
bfi.b32 %r489, %r2598, %r1534, 1, 31;
setp.ne.s16 %p334, %rs856, 0;
@%p334 bra $L__BB25_239;
setp.eq.s32 %p335, %r2390, 0;
mov.u16 %rs1286, 255;
@%p335 bra $L__BB25_238;
cvt.u64.u32 %rd311, %r2389;
add.s64 %rd312, %rd311, %rd4;
add.s64 %rd313, %rd1, %rd312;
ld.global.u8 %rs1286, [%rd313];
$L__BB25_238:
setp.ne.s32 %p337, %r2390, 0;
selp.u32 %r1535, 1, 0, %p337;
add.s32 %r2389, %r2389, %r1535;
add.s32 %r1536, %r2390, -1;
selp.b32 %r2390, 0, %r1536, %p335;
setp.eq.s32 %p338, %r2390, 0;
or.b16 %rs858, %rs1286, 15;
selp.b16 %rs1105, %rs858, %rs1286, %p338;
and.b16 %rs859, %rs1105, 255;
mov.u16 %rs860, 8;
sub.s16 %rs1289, %rs860, %rs1104;
setp.eq.s16 %p339, %rs859, 255;
selp.u16 %rs1104, 1, 0, %p339;
cvt.u32.u16 %r1537, %rs1105;
and.b32 %r2576, %r1537, 255;
$L__BB25_239:
add.s16 %rs1293, %rs1289, -1;
and.b16 %rs861, %rs1293, 255;
cvt.u32.u16 %r1538, %rs1293;
and.b32 %r1539, %r1538, 255;
shr.u32 %r1540, %r2576, %r1539;
and.b32 %r1541, %r1540, 1;
bfi.b32 %r496, %r489, %r1541, 1, 31;
setp.ne.s16 %p340, %rs861, 0;
@%p340 bra $L__BB25_243;
setp.eq.s32 %p341, %r2390, 0;
mov.u16 %rs1290, 255;
@%p341 bra $L__BB25_242;
cvt.u64.u32 %rd314, %r2389;
add.s64 %rd315, %rd314, %rd4;
add.s64 %rd316, %rd1, %rd315;
ld.global.u8 %rs1290, [%rd316];
$L__BB25_242:
setp.ne.s32 %p343, %r2390, 0;
selp.u32 %r1542, 1, 0, %p343;
add.s32 %r2389, %r2389, %r1542;
add.s32 %r1543, %r2390, -1;
selp.b32 %r2390, 0, %r1543, %p341;
setp.eq.s32 %p344, %r2390, 0;
or.b16 %rs863, %rs1290, 15;
selp.b16 %rs1105, %rs863, %rs1290, %p344;
and.b16 %rs864, %rs1105, 255;
mov.u16 %rs865, 8;
sub.s16 %rs1293, %rs865, %rs1104;
setp.eq.s16 %p345, %rs864, 255;
selp.u16 %rs1104, 1, 0, %p345;
cvt.u32.u16 %r1544, %rs1105;
and.b32 %r2576, %r1544, 255;
$L__BB25_243:
add.s16 %rs1297, %rs1293, -1;
and.b16 %rs866, %rs1297, 255;
cvt.u32.u16 %r1545, %rs1297;
and.b32 %r1546, %r1545, 255;
shr.u32 %r1547, %r2576, %r1546;
and.b32 %r1548, %r1547, 1;
bfi.b32 %r503, %r496, %r1548, 1, 31;
setp.ne.s16 %p346, %rs866, 0;
@%p346 bra $L__BB25_247;
setp.eq.s32 %p347, %r2390, 0;
mov.u16 %rs1294, 255;
@%p347 bra $L__BB25_246;
cvt.u64.u32 %rd317, %r2389;
add.s64 %rd318, %rd317, %rd4;
add.s64 %rd319, %rd1, %rd318;
ld.global.u8 %rs1294, [%rd319];
$L__BB25_246:
setp.ne.s32 %p349, %r2390, 0;
selp.u32 %r1549, 1, 0, %p349;
add.s32 %r2389, %r2389, %r1549;
add.s32 %r1550, %r2390, -1;
selp.b32 %r2390, 0, %r1550, %p347;
setp.eq.s32 %p350, %r2390, 0;
or.b16 %rs868, %rs1294, 15;
selp.b16 %rs1105, %rs868, %rs1294, %p350;
and.b16 %rs869, %rs1105, 255;
mov.u16 %rs870, 8;
sub.s16 %rs1297, %rs870, %rs1104;
setp.eq.s16 %p351, %rs869, 255;
selp.u16 %rs1104, 1, 0, %p351;
cvt.u32.u16 %r1551, %rs1105;
and.b32 %r2576, %r1551, 255;
$L__BB25_247:
add.s16 %rs1139, %rs1297, -1;
cvt.u32.u16 %r1552, %rs1139;
and.b32 %r1553, %r1552, 255;
shr.u32 %r1554, %r2576, %r1553;
and.b32 %r1555, %r1554, 1;
bfi.b32 %r2598, %r503, %r1555, 1, 31;
add.s32 %r2570, %r2570, -4;
setp.ne.s32 %p352, %r2570, 0;
@%p352 bra $L__BB25_231;
$L__BB25_248:
setp.eq.s32 %p353, %r478, 0;
@%p353 bra $L__BB25_264;
and.b16 %rs871, %rs1139, 255;
setp.ne.s16 %p354, %rs871, 0;
@%p354 bra $L__BB25_253;
setp.eq.s32 %p355, %r2390, 0;
mov.u16 %rs1304, 255;
@%p355 bra $L__BB25_252;
cvt.u64.u32 %rd320, %r2389;
add.s64 %rd321, %rd320, %rd4;
add.s64 %rd322, %rd1, %rd321;
ld.global.u8 %rs1304, [%rd322];
$L__BB25_252:
setp.ne.s32 %p357, %r2390, 0;
selp.u32 %r1556, 1, 0, %p357;
add.s32 %r2389, %r2389, %r1556;
add.s32 %r1557, %r2390, -1;
selp.b32 %r2390, 0, %r1557, %p355;
setp.eq.s32 %p358, %r2390, 0;
or.b16 %rs873, %rs1304, 15;
selp.b16 %rs1105, %rs873, %rs1304, %p358;
and.b16 %rs874, %rs1105, 255;
mov.u16 %rs875, 8;
sub.s16 %rs1139, %rs875, %rs1104;
setp.eq.s16 %p359, %rs874, 255;
selp.u16 %rs1104, 1, 0, %p359;
$L__BB25_253:
add.s16 %rs1139, %rs1139, -1;
cvt.u32.u16 %r1558, %rs1139;
and.b32 %r1559, %r1558, 255;
cvt.u32.u16 %r1560, %rs1105;
and.b32 %r2593, %r1560, 255;
shr.u32 %r1561, %r2593, %r1559;
and.b32 %r1562, %r1561, 1;
bfi.b32 %r2598, %r2598, %r1562, 1, 31;
setp.eq.s32 %p360, %r478, 1;
@%p360 bra $L__BB25_264;
and.b16 %rs876, %rs1139, 255;
setp.ne.s16 %p361, %rs876, 0;
@%p361 bra $L__BB25_258;
setp.eq.s32 %p362, %r2390, 0;
mov.u16 %rs1308, 255;
@%p362 bra $L__BB25_257;
cvt.u64.u32 %rd323, %r2389;
add.s64 %rd324, %rd323, %rd4;
add.s64 %rd325, %rd1, %rd324;
ld.global.u8 %rs1308, [%rd325];
$L__BB25_257:
setp.ne.s32 %p364, %r2390, 0;
selp.u32 %r1563, 1, 0, %p364;
add.s32 %r2389, %r2389, %r1563;
add.s32 %r1564, %r2390, -1;
selp.b32 %r2390, 0, %r1564, %p362;
setp.eq.s32 %p365, %r2390, 0;
or.b16 %rs878, %rs1308, 15;
selp.b16 %rs1105, %rs878, %rs1308, %p365;
and.b16 %rs879, %rs1105, 255;
mov.u16 %rs880, 8;
sub.s16 %rs1139, %rs880, %rs1104;
setp.eq.s16 %p366, %rs879, 255;
selp.u16 %rs1104, 1, 0, %p366;
cvt.u32.u16 %r1565, %rs1105;
and.b32 %r2593, %r1565, 255;
$L__BB25_258:
add.s16 %rs1139, %rs1139, -1;
cvt.u32.u16 %r1566, %rs1139;
and.b32 %r1567, %r1566, 255;
shr.u32 %r1568, %r2593, %r1567;
and.b32 %r1569, %r1568, 1;
bfi.b32 %r2598, %r2598, %r1569, 1, 31;
setp.eq.s32 %p367, %r478, 2;
@%p367 bra $L__BB25_264;
and.b16 %rs881, %rs1139, 255;
setp.ne.s16 %p368, %rs881, 0;
@%p368 bra $L__BB25_263;
setp.eq.s32 %p369, %r2390, 0;
mov.u16 %rs1312, 255;
@%p369 bra $L__BB25_262;
cvt.u64.u32 %rd326, %r2389;
add.s64 %rd327, %rd326, %rd4;
add.s64 %rd328, %rd1, %rd327;
ld.global.u8 %rs1312, [%rd328];
$L__BB25_262:
setp.ne.s32 %p371, %r2390, 0;
selp.u32 %r1570, 1, 0, %p371;
add.s32 %r2389, %r2389, %r1570;
add.s32 %r1571, %r2390, -1;
selp.b32 %r2390, 0, %r1571, %p369;
setp.eq.s32 %p372, %r2390, 0;
or.b16 %rs883, %rs1312, 15;
selp.b16 %rs1105, %rs883, %rs1312, %p372;
and.b16 %rs884, %rs1105, 255;
mov.u16 %rs885, 8;
sub.s16 %rs1139, %rs885, %rs1104;
setp.eq.s16 %p373, %rs884, 255;
selp.u16 %rs1104, 1, 0, %p373;
cvt.u32.u16 %r1572, %rs1105;
and.b32 %r2593, %r1572, 255;
$L__BB25_263:
add.s16 %rs1139, %rs1139, -1;
cvt.u32.u16 %r1573, %rs1139;
and.b32 %r1574, %r1573, 255;
shr.u32 %r1575, %r2593, %r1574;
and.b32 %r1576, %r1575, 1;
bfi.b32 %r2598, %r2598, %r1576, 1, 31;
$L__BB25_264:
shl.b32 %r1577, %r2598, 1;
or.b32 %r2602, %r1577, 1;
add.s32 %r1578, %r2480, -1;
setp.eq.s32 %p374, %r2480, 0;
selp.b32 %r2601, 0, %r1578, %p374;
$L__BB25_265:
mul.lo.s32 %r1579, %r2481, 7;
cvt.u64.u32 %rd329, %r2602;
shl.b64 %rd330, %rd329, %r1579;
or.b64 %rd606, %rd330, %rd606;
setp.ne.s32 %p375, %r2480, 12;
setp.ne.s32 %p376, %r474, 0;
or.pred %p377, %p375, %p376;
add.s32 %r2481, %r2481, 1;
setp.lt.u32 %p378, %r2481, 8;
or.pred %p379, %p378, %p377;
mov.u32 %r2480, %r2601;
@%p379 bra $L__BB25_221;
$L__BB25_266:
cvt.u32.u64 %r1580, %rd606;
and.b32 %r2477, %r1580, 127;
shr.u64 %rd606, %rd606, 7;
add.s32 %r2481, %r2481, -1;
$L__BB25_267:
mul.wide.u32 %rd331, %r2546, 2;
add.s64 %rd332, %rd14, %rd331;
st.local.u16 [%rd332], %r2612;
shl.b32 %r1581, %r2612, 2;
shl.b32 %r1582, %r2612, 1;
or.b32 %r1583, %r1581, %r1582;
and.b32 %r1584, %r1583, 256;
ld.local.u16 %r1585, [%rd47];
and.b32 %r1586, %r1585, 128;
or.b32 %r1587, %r1584, %r1586;
ld.local.u16 %r1588, [%rd48];
shl.b32 %r1589, %r1588, 2;
and.b32 %r1590, %r1589, 640;
or.b32 %r1591, %r1587, %r1590;
add.s32 %r1592, %r451, 4;
mul.wide.u32 %rd333, %r1592, 2;
add.s64 %rd334, %rd14, %rd333;
ld.local.u16 %r1593, [%rd334];
shl.b32 %r1594, %r1593, 4;
and.b32 %r1595, %r1594, 512;
or.b32 %r1596, %r1591, %r1595;
and.b32 %r1597, %r2612, 7;
shr.u64 %rd58, %rd615, %r1597;
sub.s32 %r560, %r2560, %r1597;
cvt.u32.u64 %r1598, %rd58;
and.b32 %r1599, %r1598, 127;
or.b32 %r1600, %r1596, %r1599;
mul.wide.u32 %rd335, %r1600, 2;
add.s64 %rd336, %rd42, %rd335;
ld.global.u16 %r2664, [%rd336];
setp.ne.s32 %p380, %r1596, 0;
add.s32 %r562, %r2544, 2;
setp.ge.u32 %p381, %r562, %r2;
or.pred %p382, %p381, %p380;
@%p382 bra $L__BB25_317;
add.s32 %r563, %r2477, -2;
setp.eq.s32 %p383, %r563, -1;
selp.b32 %r2664, %r2664, 0, %p383;
setp.gt.s32 %p384, %r2477, 1;
mov.u32 %r2477, %r563;
@%p384 bra $L__BB25_317;
setp.ne.s32 %p385, %r2481, 0;
@%p385 bra $L__BB25_316;
mov.u32 %r2481, 0;
$L__BB25_271:
setp.gt.u32 %p386, %r2481, 7;
@%p386 bra $L__BB25_316;
cvt.u64.u32 %rd60, %r2480;
mul.wide.u32 %rd337, %r2480, 4;
add.s64 %rd339, %rd151, %rd337;
ld.global.nc.u32 %r569, [%rd339];
and.b16 %rs886, %rs1139, 255;
setp.ne.s16 %p387, %rs886, 0;
@%p387 bra $L__BB25_276;
setp.eq.s32 %p388, %r2390, 0;
mov.u16 %rs1331, 255;
@%p388 bra $L__BB25_275;
cvt.u64.u32 %rd340, %r2389;
add.s64 %rd341, %rd340, %rd4;
add.s64 %rd342, %rd1, %rd341;
ld.global.u8 %rs1331, [%rd342];
$L__BB25_275:
setp.ne.s32 %p390, %r2390, 0;
selp.u32 %r1602, 1, 0, %p390;
add.s32 %r2389, %r2389, %r1602;
add.s32 %r1603, %r2390, -1;
selp.b32 %r2390, 0, %r1603, %p388;
setp.eq.s32 %p391, %r2390, 0;
or.b16 %rs888, %rs1331, 15;
selp.b16 %rs1105, %rs888, %rs1331, %p391;
and.b16 %rs889, %rs1105, 255;
mov.u16 %rs890, 8;
sub.s16 %rs1139, %rs890, %rs1104;
setp.eq.s16 %p392, %rs889, 255;
selp.u16 %rs1104, 1, 0, %p392;
$L__BB25_276:
add.s16 %rs1139, %rs1139, -1;
cvt.u32.u16 %r1604, %rs1139;
and.b32 %r1605, %r1604, 255;
mov.u32 %r1606, 1;
shl.b32 %r1607, %r1606, %r1605;
cvt.u32.u16 %r1608, %rs1105;
and.b32 %r1609, %r1607, %r1608;
and.b32 %r574, %r1609, 255;
setp.eq.s32 %p393, %r574, 0;
@%p393 bra $L__BB25_278;
add.s32 %r1610, %r2480, 1;
min.u32 %r2653, %r1610, 12;
mov.u32 %r1611, -1;
shl.b32 %r1612, %r1611, %r569;
shl.b32 %r1613, %r1612, 1;
xor.b32 %r2654, %r1613, -2;
bra.uni $L__BB25_315;
$L__BB25_278:
add.s64 %rd343, %rd60, -3;
setp.gt.u64 %p394, %rd343, 9;
mov.u32 %r2650, 0;
@%p394 bra $L__BB25_314;
max.u32 %r577, %r569, 1;
add.s32 %r1617, %r577, -1;
and.b32 %r578, %r577, 3;
setp.lt.u32 %p395, %r1617, 3;
mov.u32 %r2650, 0;
@%p395 bra $L__BB25_298;
sub.s32 %r2622, %r577, %r578;
mov.u32 %r2650, 0;
$L__BB25_281:
and.b16 %rs892, %rs1139, 255;
setp.ne.s16 %p396, %rs892, 0;
@%p396 bra $L__BB25_285;
setp.eq.s32 %p397, %r2390, 0;
mov.u16 %rs1338, 255;
@%p397 bra $L__BB25_284;
cvt.u64.u32 %rd344, %r2389;
add.s64 %rd345, %rd344, %rd4;
add.s64 %rd346, %rd1, %rd345;
ld.global.u8 %rs1338, [%rd346];
$L__BB25_284:
setp.ne.s32 %p399, %r2390, 0;
selp.u32 %r1619, 1, 0, %p399;
add.s32 %r2389, %r2389, %r1619;
add.s32 %r1620, %r2390, -1;
selp.b32 %r2390, 0, %r1620, %p397;
setp.eq.s32 %p400, %r2390, 0;
or.b16 %rs894, %rs1338, 15;
selp.b16 %rs1105, %rs894, %rs1338, %p400;
and.b16 %rs895, %rs1105, 255;
mov.u16 %rs896, 8;
sub.s16 %rs1139, %rs896, %rs1104;
setp.eq.s16 %p401, %rs895, 255;
selp.u16 %rs1104, 1, 0, %p401;
$L__BB25_285:
add.s16 %rs1345, %rs1139, -1;
and.b16 %rs897, %rs1345, 255;
cvt.u32.u16 %r1621, %rs1345;
and.b32 %r1622, %r1621, 255;
cvt.u32.u16 %r1623, %rs1105;
and.b32 %r2628, %r1623, 255;
shr.u32 %r1624, %r2628, %r1622;
and.b32 %r1625, %r1624, 1;
bfi.b32 %r589, %r2650, %r1625, 1, 31;
setp.ne.s16 %p402, %rs897, 0;
@%p402 bra $L__BB25_289;
setp.eq.s32 %p403, %r2390, 0;
mov.u16 %rs1342, 255;
@%p403 bra $L__BB25_288;
cvt.u64.u32 %rd347, %r2389;
add.s64 %rd348, %rd347, %rd4;
add.s64 %rd349, %rd1, %rd348;
ld.global.u8 %rs1342, [%rd349];
$L__BB25_288:
setp.ne.s32 %p405, %r2390, 0;
selp.u32 %r1626, 1, 0, %p405;
add.s32 %r2389, %r2389, %r1626;
add.s32 %r1627, %r2390, -1;
selp.b32 %r2390, 0, %r1627, %p403;
setp.eq.s32 %p406, %r2390, 0;
or.b16 %rs899, %rs1342, 15;
selp.b16 %rs1105, %rs899, %rs1342, %p406;
and.b16 %rs900, %rs1105, 255;
mov.u16 %rs901, 8;
sub.s16 %rs1345, %rs901, %rs1104;
setp.eq.s16 %p407, %rs900, 255;
selp.u16 %rs1104, 1, 0, %p407;
cvt.u32.u16 %r1628, %rs1105;
and.b32 %r2628, %r1628, 255;
$L__BB25_289:
add.s16 %rs1349, %rs1345, -1;
and.b16 %rs902, %rs1349, 255;
cvt.u32.u16 %r1629, %rs1349;
and.b32 %r1630, %r1629, 255;
shr.u32 %r1631, %r2628, %r1630;
and.b32 %r1632, %r1631, 1;
bfi.b32 %r596, %r589, %r1632, 1, 31;
setp.ne.s16 %p408, %rs902, 0;
@%p408 bra $L__BB25_293;
setp.eq.s32 %p409, %r2390, 0;
mov.u16 %rs1346, 255;
@%p409 bra $L__BB25_292;
cvt.u64.u32 %rd350, %r2389;
add.s64 %rd351, %rd350, %rd4;
add.s64 %rd352, %rd1, %rd351;
ld.global.u8 %rs1346, [%rd352];
$L__BB25_292:
setp.ne.s32 %p411, %r2390, 0;
selp.u32 %r1633, 1, 0, %p411;
add.s32 %r2389, %r2389, %r1633;
add.s32 %r1634, %r2390, -1;
selp.b32 %r2390, 0, %r1634, %p409;
setp.eq.s32 %p412, %r2390, 0;
or.b16 %rs904, %rs1346, 15;
selp.b16 %rs1105, %rs904, %rs1346, %p412;
and.b16 %rs905, %rs1105, 255;
mov.u16 %rs906, 8;
sub.s16 %rs1349, %rs906, %rs1104;
setp.eq.s16 %p413, %rs905, 255;
selp.u16 %rs1104, 1, 0, %p413;
cvt.u32.u16 %r1635, %rs1105;
and.b32 %r2628, %r1635, 255;
$L__BB25_293:
add.s16 %rs1353, %rs1349, -1;
and.b16 %rs907, %rs1353, 255;
cvt.u32.u16 %r1636, %rs1353;
and.b32 %r1637, %r1636, 255;
shr.u32 %r1638, %r2628, %r1637;
and.b32 %r1639, %r1638, 1;
bfi.b32 %r603, %r596, %r1639, 1, 31;
setp.ne.s16 %p414, %rs907, 0;
@%p414 bra $L__BB25_297;
setp.eq.s32 %p415, %r2390, 0;
mov.u16 %rs1350, 255;
@%p415 bra $L__BB25_296;
cvt.u64.u32 %rd353, %r2389;
add.s64 %rd354, %rd353, %rd4;
add.s64 %rd355, %rd1, %rd354;
ld.global.u8 %rs1350, [%rd355];
$L__BB25_296:
setp.ne.s32 %p417, %r2390, 0;
selp.u32 %r1640, 1, 0, %p417;
add.s32 %r2389, %r2389, %r1640;
add.s32 %r1641, %r2390, -1;
selp.b32 %r2390, 0, %r1641, %p415;
setp.eq.s32 %p418, %r2390, 0;
or.b16 %rs909, %rs1350, 15;
selp.b16 %rs1105, %rs909, %rs1350, %p418;
and.b16 %rs910, %rs1105, 255;
mov.u16 %rs911, 8;
sub.s16 %rs1353, %rs911, %rs1104;
setp.eq.s16 %p419, %rs910, 255;
selp.u16 %rs1104, 1, 0, %p419;
cvt.u32.u16 %r1642, %rs1105;
and.b32 %r2628, %r1642, 255;
$L__BB25_297:
add.s16 %rs1139, %rs1353, -1;
cvt.u32.u16 %r1643, %rs1139;
and.b32 %r1644, %r1643, 255;
shr.u32 %r1645, %r2628, %r1644;
and.b32 %r1646, %r1645, 1;
bfi.b32 %r2650, %r603, %r1646, 1, 31;
add.s32 %r2622, %r2622, -4;
setp.ne.s32 %p420, %r2622, 0;
@%p420 bra $L__BB25_281;
$L__BB25_298:
setp.eq.s32 %p421, %r578, 0;
@%p421 bra $L__BB25_314;
and.b16 %rs912, %rs1139, 255;
setp.ne.s16 %p422, %rs912, 0;
@%p422 bra $L__BB25_303;
setp.eq.s32 %p423, %r2390, 0;
mov.u16 %rs1360, 255;
@%p423 bra $L__BB25_302;
cvt.u64.u32 %rd356, %r2389;
add.s64 %rd357, %rd356, %rd4;
add.s64 %rd358, %rd1, %rd357;
ld.global.u8 %rs1360, [%rd358];
$L__BB25_302:
setp.ne.s32 %p425, %r2390, 0;
selp.u32 %r1647, 1, 0, %p425;
add.s32 %r2389, %r2389, %r1647;
add.s32 %r1648, %r2390, -1;
selp.b32 %r2390, 0, %r1648, %p423;
setp.eq.s32 %p426, %r2390, 0;
or.b16 %rs914, %rs1360, 15;
selp.b16 %rs1105, %rs914, %rs1360, %p426;
and.b16 %rs915, %rs1105, 255;
mov.u16 %rs916, 8;
sub.s16 %rs1139, %rs916, %rs1104;
setp.eq.s16 %p427, %rs915, 255;
selp.u16 %rs1104, 1, 0, %p427;
$L__BB25_303:
add.s16 %rs1139, %rs1139, -1;
cvt.u32.u16 %r1649, %rs1139;
and.b32 %r1650, %r1649, 255;
cvt.u32.u16 %r1651, %rs1105;
and.b32 %r2645, %r1651, 255;
shr.u32 %r1652, %r2645, %r1650;
and.b32 %r1653, %r1652, 1;
bfi.b32 %r2650, %r2650, %r1653, 1, 31;
setp.eq.s32 %p428, %r578, 1;
@%p428 bra $L__BB25_314;
and.b16 %rs917, %rs1139, 255;
setp.ne.s16 %p429, %rs917, 0;
@%p429 bra $L__BB25_308;
setp.eq.s32 %p430, %r2390, 0;
mov.u16 %rs1364, 255;
@%p430 bra $L__BB25_307;
cvt.u64.u32 %rd359, %r2389;
add.s64 %rd360, %rd359, %rd4;
add.s64 %rd361, %rd1, %rd360;
ld.global.u8 %rs1364, [%rd361];
$L__BB25_307:
setp.ne.s32 %p432, %r2390, 0;
selp.u32 %r1654, 1, 0, %p432;
add.s32 %r2389, %r2389, %r1654;
add.s32 %r1655, %r2390, -1;
selp.b32 %r2390, 0, %r1655, %p430;
setp.eq.s32 %p433, %r2390, 0;
or.b16 %rs919, %rs1364, 15;
selp.b16 %rs1105, %rs919, %rs1364, %p433;
and.b16 %rs920, %rs1105, 255;
mov.u16 %rs921, 8;
sub.s16 %rs1139, %rs921, %rs1104;
setp.eq.s16 %p434, %rs920, 255;
selp.u16 %rs1104, 1, 0, %p434;
cvt.u32.u16 %r1656, %rs1105;
and.b32 %r2645, %r1656, 255;
$L__BB25_308:
add.s16 %rs1139, %rs1139, -1;
cvt.u32.u16 %r1657, %rs1139;
and.b32 %r1658, %r1657, 255;
shr.u32 %r1659, %r2645, %r1658;
and.b32 %r1660, %r1659, 1;
bfi.b32 %r2650, %r2650, %r1660, 1, 31;
setp.eq.s32 %p435, %r578, 2;
@%p435 bra $L__BB25_314;
and.b16 %rs922, %rs1139, 255;
setp.ne.s16 %p436, %rs922, 0;
@%p436 bra $L__BB25_313;
setp.eq.s32 %p437, %r2390, 0;
mov.u16 %rs1368, 255;
@%p437 bra $L__BB25_312;
cvt.u64.u32 %rd362, %r2389;
add.s64 %rd363, %rd362, %rd4;
add.s64 %rd364, %rd1, %rd363;
ld.global.u8 %rs1368, [%rd364];
$L__BB25_312:
setp.ne.s32 %p439, %r2390, 0;
selp.u32 %r1661, 1, 0, %p439;
add.s32 %r2389, %r2389, %r1661;
add.s32 %r1662, %r2390, -1;
selp.b32 %r2390, 0, %r1662, %p437;
setp.eq.s32 %p440, %r2390, 0;
or.b16 %rs924, %rs1368, 15;
selp.b16 %rs1105, %rs924, %rs1368, %p440;
and.b16 %rs925, %rs1105, 255;
mov.u16 %rs926, 8;
sub.s16 %rs1139, %rs926, %rs1104;
setp.eq.s16 %p441, %rs925, 255;
selp.u16 %rs1104, 1, 0, %p441;
cvt.u32.u16 %r1663, %rs1105;
and.b32 %r2645, %r1663, 255;
$L__BB25_313:
add.s16 %rs1139, %rs1139, -1;
cvt.u32.u16 %r1664, %rs1139;
and.b32 %r1665, %r1664, 255;
shr.u32 %r1666, %r2645, %r1665;
and.b32 %r1667, %r1666, 1;
bfi.b32 %r2650, %r2650, %r1667, 1, 31;
$L__BB25_314:
shl.b32 %r1668, %r2650, 1;
or.b32 %r2654, %r1668, 1;
add.s32 %r1669, %r2480, -1;
setp.eq.s32 %p442, %r2480, 0;
selp.b32 %r2653, 0, %r1669, %p442;
$L__BB25_315:
mul.lo.s32 %r1670, %r2481, 7;
cvt.u64.u32 %rd365, %r2654;
shl.b64 %rd366, %rd365, %r1670;
or.b64 %rd606, %rd366, %rd606;
setp.ne.s32 %p443, %r2480, 12;
setp.ne.s32 %p444, %r574, 0;
or.pred %p445, %p443, %p444;
add.s32 %r2481, %r2481, 1;
setp.lt.u32 %p446, %r2481, 8;
or.pred %p447, %p446, %p445;
mov.u32 %r2480, %r2653;
@%p447 bra $L__BB25_271;
$L__BB25_316:
cvt.u32.u64 %r1671, %rd606;
and.b32 %r2477, %r1671, 127;
shr.u64 %rd606, %rd606, 7;
add.s32 %r2481, %r2481, -1;
$L__BB25_317:
setp.lt.u32 %p448, %r562, %r2;
selp.b32 %r1672, %r2664, 0, %p448;
add.s32 %r1673, %r2546, 2;
mul.wide.u32 %rd367, %r1673, 2;
add.s64 %rd368, %rd14, %rd367;
st.local.u16 [%rd368], %r1672;
shl.b32 %r1674, %r1672, 2;
shl.b32 %r1675, %r1672, 1;
or.b32 %r1676, %r1674, %r1675;
and.b32 %r1677, %r1676, 256;
ld.local.u16 %r1678, [%rd48];
and.b32 %r1679, %r1678, 128;
or.b32 %r2545, %r1677, %r1679;
and.b32 %r1680, %r1672, 7;
shr.u64 %rd369, %rd58, %r1680;
sub.s32 %r1681, %r560, %r1680;
cvt.u32.u64 %r1682, %rd369;
shl.b32 %r1683, %r2612, 3;
and.b32 %r1684, %r1683, 64;
shl.b32 %r1685, %r1672, 4;
and.b32 %r1686, %r1685, 128;
or.b32 %r1687, %r1686, %r1684;
and.b32 %r1688, %r1682, 63;
or.b32 %r1689, %r1688, %r1687;
mul.wide.u32 %rd370, %r1689, 2;
add.s64 %rd371, %rd41, %rd370;
ld.global.u16 %r1690, [%rd371];
and.b32 %r1691, %r1690, 7;
shr.u64 %rd372, %rd369, %r1691;
sub.s32 %r1692, %r1681, %r1691;
cvt.u32.u64 %r1693, %rd372;
shr.u32 %r1694, %r1690, 3;
and.b32 %r1695, %r1694, 15;
mov.u32 %r1696, -1;
shl.b32 %r1697, %r1696, %r1695;
not.b32 %r1698, %r1697;
and.b32 %r1699, %r1693, %r1698;
shr.u64 %rd615, %rd372, %r1695;
sub.s32 %r2560, %r1692, %r1695;
shr.u32 %r1700, %r1690, 7;
and.b32 %r1701, %r1700, 7;
shr.u32 %r1702, %r1690, 10;
and.b32 %r1703, %r1702, 7;
mov.u32 %r1704, 255;
shl.b32 %r1705, %r1704, %r1701;
not.b32 %r1706, %r1705;
and.b32 %r1707, %r1699, %r1706;
add.s32 %r1708, %r1707, %r1703;
add.s32 %r1709, %r2546, 1;
mul.wide.u32 %rd373, %r1709, 2;
add.s64 %rd374, %rd14, %rd373;
st.local.u16 [%rd374], %r1708;
shr.u32 %r1710, %r1690, 13;
shr.u32 %r1711, %r1699, %r1701;
add.s32 %r1712, %r1711, %r1710;
add.s32 %r1713, %r2546, 3;
mul.wide.u32 %rd375, %r1713, 2;
add.s64 %rd376, %rd14, %rd375;
st.local.u16 [%rd376], %r1712;
add.s32 %r2546, %r2546, 4;
add.s32 %r2544, %r2544, 4;
setp.lt.u32 %p449, %r2544, %r2;
@%p449 bra $L__BB25_213;
mul.wide.u32 %rd377, %r2546, 2;
add.s64 %rd378, %rd14, %rd377;
mov.u16 %rs927, 0;
st.local.v2.u16 [%rd378], {%rs927, %rs927};
add.s32 %r2535, %r2535, 2;
setp.lt.u32 %p450, %r2535, %r3;
@%p450 bra $L__BB25_212;
// Guard executed before the loop set up at $L__BB25_320: derive a shift base
// from %r7 and reject inputs whose half-width does not fit the fixed bound.
$L__BB25_319:
// %r665 = 30 - %r7.
mov.u32 %r1714, 30;
sub.s32 %r665, %r1714, %r7;
// %r1717 = ((%r2 + 1) >> 1) + 2; values above 130 take the failure path.
add.s32 %r1715, %r2, 1;
shr.u32 %r1716, %r1715, 1;
add.s32 %r1717, %r1716, 2;
setp.gt.u32 %p451, %r1717, 130;
@%p451 bra $L__BB25_534;
bra.uni $L__BB25_320;
$L__BB25_534:
// Failure path: write the four 32-bit words {2, 12, 0, 0} at %rd5 and leave
// via $L__BB25_541. NOTE(review): presumably a status record whose first two
// words are status/error codes - confirm the struct layout in the host code.
mov.u32 %r2281, 2;
st.global.u32 [%rd5], %r2281;
mov.u32 %r2282, 12;
st.global.u32 [%rd5+4], %r2282;
mov.u32 %r2283, 0;
st.global.u32 [%rd5+8], %r2283;
st.global.u32 [%rd5+12], %r2283;
bra.uni $L__BB25_541;
$L__BB25_320:
add.s32 %r666, %r7, 2;
add.s32 %r667, %r665, -1;
mov.u32 %r2665, 0;
mov.u64 %rd643, 0;
mov.u16 %rs1411, 0;
mov.u32 %r2666, %r2665;
mov.u32 %r2667, %r2665;
mov.u32 %r2668, %r10;
mov.u32 %r2735, %r2665;
mov.u32 %r2732, %r2665;
$L__BB25_321:
mov.u32 %r670, %r2667;
mul.wide.u32 %rd380, %r2666, 2;
add.s64 %rd381, %rd14, %rd380;
ld.local.u16 %r674, [%rd381];
ld.local.u16 %r675, [%rd381+2];
setp.lt.u32 %p452, %r666, %r675;
@%p452 bra $L__BB25_533;
and.b32 %r1724, %r674, 16;
setp.eq.s32 %p453, %r1724, 0;
mov.u32 %r2685, 0;
mov.u32 %r2677, %r2685;
@%p453 bra $L__BB25_328;
setp.gt.u32 %p454, %r2735, 31;
@%p454 bra $L__BB25_327;
$L__BB25_324:
setp.ge.u32 %p455, %r2732, %r16;
mov.u16 %rs1386, 255;
@%p455 bra $L__BB25_326;
add.s32 %r678, %r2732, 1;
cvt.u64.u32 %rd382, %r2732;
add.s64 %rd383, %rd382, %rd4;
add.s64 %rd384, %rd1, %rd383;
ld.global.u8 %rs1386, [%rd384];
mov.u32 %r2732, %r678;
$L__BB25_326:
and.b16 %rs930, %rs1386, 255;
cvt.u64.u16 %rd385, %rs1386;
and.b64 %rd386, %rd385, 255;
shl.b64 %rd387, %rd386, %r2735;
or.b64 %rd643, %rd387, %rd643;
add.s32 %r1725, %r2735, 8;
cvt.u32.u16 %r1726, %rs1411;
cvt.s32.s8 %r1727, %r1726;
sub.s32 %r2735, %r1725, %r1727;
setp.eq.s16 %p456, %rs930, 255;
selp.u16 %rs1411, 1, 0, %p456;
setp.lt.u32 %p457, %r2735, 33;
@%p457 bra $L__BB25_324;
$L__BB25_327:
shr.u32 %r1728, %r674, 12;
and.b32 %r1729, %r1728, 1;
sub.s32 %r1730, %r675, %r1729;
shr.u64 %rd70, %rd643, %r1730;
sub.s32 %r2735, %r2735, %r1730;
cvt.u32.u64 %r1731, %rd643;
shl.b32 %r1732, %r1731, 31;
setp.eq.s32 %p458, %r1730, 0;
mov.u32 %r1733, -1;
shl.b32 %r1734, %r1733, %r1730;
not.b32 %r1735, %r1734;
selp.b32 %r1736, 0, %r1735, %p458;
and.b32 %r1737, %r1736, %r1731;
shr.u32 %r1738, %r674, 8;
and.b32 %r1739, %r1738, 1;
shl.b32 %r1740, %r1739, %r1730;
or.b32 %r1741, %r1740, %r1737;
or.b32 %r1742, %r1741, 1;
add.s32 %r1743, %r1742, 2;
shl.b32 %r1744, %r1743, %r667;
or.b32 %r2677, %r1744, %r1732;
mov.u64 %rd643, %rd70;
$L__BB25_328:
mul.wide.u32 %rd388, %r2668, 4;
add.s64 %rd389, %rd2, %rd388;
st.global.u32 [%rd389], %r2677;
and.b32 %r1747, %r674, 32;
setp.eq.s32 %p459, %r1747, 0;
mov.u32 %r2686, %r2685;
@%p459 bra $L__BB25_334;
setp.gt.u32 %p460, %r2735, 31;
@%p460 bra $L__BB25_333;
$L__BB25_330:
setp.ge.u32 %p461, %r2732, %r16;
mov.u16 %rs1390, 255;
@%p461 bra $L__BB25_332;
add.s32 %r690, %r2732, 1;
cvt.u64.u32 %rd390, %r2732;
add.s64 %rd391, %rd390, %rd4;
add.s64 %rd392, %rd1, %rd391;
ld.global.u8 %rs1390, [%rd392];
mov.u32 %r2732, %r690;
$L__BB25_332:
and.b16 %rs932, %rs1390, 255;
cvt.u64.u16 %rd393, %rs1390;
and.b64 %rd394, %rd393, 255;
shl.b64 %rd395, %rd394, %r2735;
or.b64 %rd643, %rd395, %rd643;
add.s32 %r1748, %r2735, 8;
cvt.u32.u16 %r1749, %rs1411;
cvt.s32.s8 %r1750, %r1749;
sub.s32 %r2735, %r1748, %r1750;
setp.eq.s16 %p462, %rs932, 255;
selp.u16 %rs1411, 1, 0, %p462;
setp.lt.u32 %p463, %r2735, 33;
@%p463 bra $L__BB25_330;
$L__BB25_333:
shr.u32 %r1751, %r674, 13;
and.b32 %r1752, %r1751, 1;
sub.s32 %r1753, %r675, %r1752;
shr.u64 %rd75, %rd643, %r1753;
sub.s32 %r2735, %r2735, %r1753;
cvt.u32.u64 %r1754, %rd643;
shl.b32 %r1755, %r1754, 31;
setp.eq.s32 %p464, %r1753, 0;
mov.u32 %r1756, -1;
shl.b32 %r1757, %r1756, %r1753;
not.b32 %r1758, %r1757;
selp.b32 %r1759, 0, %r1758, %p464;
and.b32 %r1760, %r1759, %r1754;
shr.u32 %r1761, %r674, 9;
and.b32 %r1762, %r1761, 1;
shl.b32 %r1763, %r1762, %r1753;
or.b32 %r1764, %r1763, %r1760;
or.b32 %r2686, %r1764, 1;
add.s32 %r1765, %r2686, 2;
shl.b32 %r1766, %r1765, %r667;
or.b32 %r2685, %r1766, %r1755;
mov.u64 %rd643, %rd75;
$L__BB25_334:
setp.lt.u32 %p465, %r3, 2;
@%p465 bra $L__BB25_336;
add.s32 %r1767, %r2668, %r9;
mul.wide.u32 %rd396, %r1767, 4;
add.s64 %rd397, %rd2, %rd396;
st.global.u32 [%rd397], %r2685;
$L__BB25_336:
or.b32 %r1768, %r2686, %r2665;
add.u64 %rd77, %SPL, 6192;
mul.wide.u32 %rd399, %r670, 4;
add.s64 %rd400, %rd77, %rd399;
st.local.u32 [%rd400], %r1768;
add.s32 %r702, %r2668, 1;
add.s32 %r1769, %r2666, 1;
setp.lt.u32 %p466, %r1769, %r2;
@%p466 bra $L__BB25_338;
bra.uni $L__BB25_337;
$L__BB25_338:
and.b32 %r1772, %r674, 64;
setp.eq.s32 %p467, %r1772, 0;
mov.u32 %r2702, 0;
mov.u32 %r2694, %r2702;
@%p467 bra $L__BB25_344;
setp.gt.u32 %p468, %r2735, 31;
@%p468 bra $L__BB25_343;
$L__BB25_340:
setp.ge.u32 %p469, %r2732, %r16;
mov.u16 %rs1394, 255;
@%p469 bra $L__BB25_342;
add.s32 %r705, %r2732, 1;
cvt.u64.u32 %rd401, %r2732;
add.s64 %rd402, %rd401, %rd4;
add.s64 %rd403, %rd1, %rd402;
ld.global.u8 %rs1394, [%rd403];
mov.u32 %r2732, %r705;
$L__BB25_342:
and.b16 %rs934, %rs1394, 255;
cvt.u64.u16 %rd404, %rs1394;
and.b64 %rd405, %rd404, 255;
shl.b64 %rd406, %rd405, %r2735;
or.b64 %rd643, %rd406, %rd643;
add.s32 %r1773, %r2735, 8;
cvt.u32.u16 %r1774, %rs1411;
cvt.s32.s8 %r1775, %r1774;
sub.s32 %r2735, %r1773, %r1775;
setp.eq.s16 %p470, %rs934, 255;
selp.u16 %rs1411, 1, 0, %p470;
setp.lt.u32 %p471, %r2735, 33;
@%p471 bra $L__BB25_340;
$L__BB25_343:
shr.u32 %r1776, %r674, 14;
and.b32 %r1777, %r1776, 1;
sub.s32 %r1778, %r675, %r1777;
shr.u64 %rd81, %rd643, %r1778;
sub.s32 %r2735, %r2735, %r1778;
cvt.u32.u64 %r1779, %rd643;
shl.b32 %r1780, %r1779, 31;
setp.eq.s32 %p472, %r1778, 0;
mov.u32 %r1781, -1;
shl.b32 %r1782, %r1781, %r1778;
not.b32 %r1783, %r1782;
selp.b32 %r1784, 0, %r1783, %p472;
and.b32 %r1785, %r1784, %r1779;
shr.u32 %r1786, %r674, 10;
and.b32 %r1787, %r1786, 1;
shl.b32 %r1788, %r1787, %r1778;
or.b32 %r1789, %r1788, %r1785;
or.b32 %r1790, %r1789, 1;
add.s32 %r1791, %r1790, 2;
shl.b32 %r1792, %r1791, %r667;
or.b32 %r2694, %r1792, %r1780;
mov.u64 %rd643, %rd81;
$L__BB25_344:
mul.wide.u32 %rd407, %r702, 4;
add.s64 %rd408, %rd2, %rd407;
st.global.u32 [%rd408], %r2694;
and.b32 %r1795, %r674, 128;
setp.eq.s32 %p473, %r1795, 0;
mov.u32 %r2665, %r2702;
@%p473 bra $L__BB25_350;
setp.gt.u32 %p474, %r2735, 31;
@%p474 bra $L__BB25_349;
$L__BB25_346:
setp.ge.u32 %p475, %r2732, %r16;
mov.u16 %rs1398, 255;
@%p475 bra $L__BB25_348;
add.s32 %r717, %r2732, 1;
cvt.u64.u32 %rd409, %r2732;
add.s64 %rd410, %rd409, %rd4;
add.s64 %rd411, %rd1, %rd410;
ld.global.u8 %rs1398, [%rd411];
mov.u32 %r2732, %r717;
$L__BB25_348:
and.b16 %rs936, %rs1398, 255;
cvt.u64.u16 %rd412, %rs1398;
and.b64 %rd413, %rd412, 255;
shl.b64 %rd414, %rd413, %r2735;
or.b64 %rd643, %rd414, %rd643;
add.s32 %r1796, %r2735, 8;
cvt.u32.u16 %r1797, %rs1411;
cvt.s32.s8 %r1798, %r1797;
sub.s32 %r2735, %r1796, %r1798;
setp.eq.s16 %p476, %rs936, 255;
selp.u16 %rs1411, 1, 0, %p476;
setp.lt.u32 %p477, %r2735, 33;
@%p477 bra $L__BB25_346;
$L__BB25_349:
shr.u32 %r1799, %r674, 15;
sub.s32 %r1800, %r675, %r1799;
shr.u64 %rd86, %rd643, %r1800;
sub.s32 %r2735, %r2735, %r1800;
cvt.u32.u64 %r1801, %rd643;
shl.b32 %r1802, %r1801, 31;
setp.eq.s32 %p478, %r1800, 0;
mov.u32 %r1803, -1;
shl.b32 %r1804, %r1803, %r1800;
not.b32 %r1805, %r1804;
selp.b32 %r1806, 0, %r1805, %p478;
and.b32 %r1807, %r1806, %r1801;
shr.u32 %r1808, %r674, 11;
and.b32 %r1809, %r1808, 1;
shl.b32 %r1810, %r1809, %r1800;
or.b32 %r1811, %r1810, %r1807;
or.b32 %r2665, %r1811, 1;
add.s32 %r1812, %r2665, 2;
shl.b32 %r1813, %r1812, %r667;
or.b32 %r2702, %r1813, %r1802;
mov.u64 %rd643, %rd86;
$L__BB25_350:
@%p465 bra $L__BB25_352;
add.s32 %r1814, %r702, %r9;
mul.wide.u32 %rd415, %r1814, 4;
add.s64 %rd416, %rd2, %rd415;
st.global.u32 [%rd416], %r2702;
$L__BB25_352:
add.s32 %r2668, %r2668, 2;
add.s32 %r2667, %r670, 1;
add.s32 %r2666, %r2666, 2;
setp.lt.u32 %p480, %r2666, %r2;
@%p480 bra $L__BB25_321;
bra.uni $L__BB25_353;
// Error exit taken from the top of the $L__BB25_321 loop when the second
// 16-bit value loaded for the current entry (%r675) is greater than
// %r666 = %r7 + 2 (unsigned compare).
// Stores the four 32-bit words {1, 13, 0, 0} at [%rd5 .. %rd5+12] and jumps
// to the shared `ret` at $L__BB25_541.
// NOTE(review): presumably a {status, code, detail, detail} error record --
// field meanings are not visible here; confirm.
$L__BB25_533:
mov.u32 %r2274, 1;
st.global.u32 [%rd5], %r2274;
mov.u32 %r2275, 13;
st.global.u32 [%rd5+4], %r2275;
mov.u32 %r2276, 0;
st.global.u32 [%rd5+8], %r2276;
st.global.u32 [%rd5+12], %r2276;
bra.uni $L__BB25_541;
$L__BB25_337:
mov.u32 %r2665, 0;
$L__BB25_353:
add.s32 %r1815, %r670, 1;
mul.wide.u32 %rd419, %r1815, 4;
add.s64 %rd420, %rd77, %rd419;
st.local.u32 [%rd420], %r2665;
@%p304 bra $L__BB25_388;
mov.u32 %r2708, 2;
$L__BB25_355:
shr.u32 %r1821, %r2708, 1;
mul.lo.s32 %r2712, %r1821, %r1148;
mad.lo.s32 %r2714, %r2708, %r9, %r10;
add.s32 %r741, %r2708, 1;
ld.local.u32 %r2711, [%rd77];
mov.u32 %r2713, 0;
mov.u32 %r2715, %r2713;
mov.u32 %r2716, %r2713;
$L__BB25_356:
mul.wide.u32 %rd421, %r2712, 2;
add.s64 %rd422, %rd14, %rd421;
ld.local.v2.u16 {%rs937, %rs938}, [%rd422];
cvt.u32.u16 %r751, %rs937;
cvt.u32.u16 %r1822, %rs938;
and.b32 %r1823, %r751, 240;
add.s32 %r1824, %r1823, 240;
and.b32 %r1825, %r1824, %r1823;
add.s32 %r752, %r2713, 1;
mul.wide.u32 %rd423, %r752, 4;
add.s64 %rd91, %rd77, %rd423;
ld.local.u32 %r753, [%rd91];
or.b32 %r1826, %r2711, %r753;
or.b32 %r1827, %r1826, 2;
clz.b32 %r1828, %r1827;
xor.b32 %r1829, %r1828, 31;
setp.eq.s32 %p482, %r1825, 0;
selp.b32 %r1830, 1, %r1829, %p482;
add.s32 %r754, %r1830, %r1822;
setp.gt.u32 %p483, %r754, %r666;
@%p483 bra $L__BB25_532;
and.b32 %r1832, %r751, 16;
setp.eq.s32 %p484, %r1832, 0;
mov.u32 %r2733, 0;
mov.u32 %r2725, %r2733;
@%p484 bra $L__BB25_363;
setp.gt.u32 %p485, %r2735, 31;
@%p485 bra $L__BB25_362;
$L__BB25_359:
setp.ge.u32 %p486, %r2732, %r16;
mov.u16 %rs1405, 255;
@%p486 bra $L__BB25_361;
add.s32 %r757, %r2732, 1;
cvt.u64.u32 %rd424, %r2732;
add.s64 %rd425, %rd424, %rd4;
add.s64 %rd426, %rd1, %rd425;
ld.global.u8 %rs1405, [%rd426];
mov.u32 %r2732, %r757;
$L__BB25_361:
and.b16 %rs942, %rs1405, 255;
cvt.u64.u16 %rd427, %rs1405;
and.b64 %rd428, %rd427, 255;
shl.b64 %rd429, %rd428, %r2735;
or.b64 %rd643, %rd429, %rd643;
add.s32 %r1833, %r2735, 8;
cvt.u32.u16 %r1834, %rs1411;
cvt.s32.s8 %r1835, %r1834;
sub.s32 %r2735, %r1833, %r1835;
setp.eq.s16 %p487, %rs942, 255;
selp.u16 %rs1411, 1, 0, %p487;
setp.lt.u32 %p488, %r2735, 33;
@%p488 bra $L__BB25_359;
$L__BB25_362:
shr.u32 %r1836, %r751, 12;
and.b32 %r1837, %r1836, 1;
sub.s32 %r1838, %r754, %r1837;
shr.u64 %rd95, %rd643, %r1838;
sub.s32 %r2735, %r2735, %r1838;
cvt.u32.u64 %r1839, %rd643;
shl.b32 %r1840, %r1839, 31;
setp.eq.s32 %p489, %r1838, 0;
mov.u32 %r1841, -1;
shl.b32 %r1842, %r1841, %r1838;
not.b32 %r1843, %r1842;
selp.b32 %r1844, 0, %r1843, %p489;
and.b32 %r1845, %r1844, %r1839;
shr.u32 %r1846, %r751, 8;
and.b32 %r1847, %r1846, 1;
shl.b32 %r1848, %r1847, %r1838;
or.b32 %r1849, %r1848, %r1845;
or.b32 %r1850, %r1849, 1;
add.s32 %r1851, %r1850, 2;
shl.b32 %r1852, %r1851, %r667;
or.b32 %r2725, %r1852, %r1840;
mov.u64 %rd643, %rd95;
$L__BB25_363:
mul.wide.u32 %rd430, %r2714, 4;
add.s64 %rd431, %rd2, %rd430;
st.global.u32 [%rd431], %r2725;
and.b32 %r1855, %r751, 32;
setp.eq.s32 %p490, %r1855, 0;
mov.u32 %r2734, %r2733;
@%p490 bra $L__BB25_369;
setp.gt.u32 %p491, %r2735, 31;
@%p491 bra $L__BB25_368;
$L__BB25_365:
setp.ge.u32 %p492, %r2732, %r16;
mov.u16 %rs1409, 255;
@%p492 bra $L__BB25_367;
add.s32 %r769, %r2732, 1;
cvt.u64.u32 %rd432, %r2732;
add.s64 %rd433, %rd432, %rd4;
add.s64 %rd434, %rd1, %rd433;
ld.global.u8 %rs1409, [%rd434];
mov.u32 %r2732, %r769;
$L__BB25_367:
and.b16 %rs944, %rs1409, 255;
cvt.u64.u16 %rd435, %rs1409;
and.b64 %rd436, %rd435, 255;
shl.b64 %rd437, %rd436, %r2735;
or.b64 %rd643, %rd437, %rd643;
add.s32 %r1856, %r2735, 8;
cvt.u32.u16 %r1857, %rs1411;
cvt.s32.s8 %r1858, %r1857;
sub.s32 %r2735, %r1856, %r1858;
setp.eq.s16 %p493, %rs944, 255;
selp.u16 %rs1411, 1, 0, %p493;
setp.lt.u32 %p494, %r2735, 33;
@%p494 bra $L__BB25_365;
$L__BB25_368:
shr.u32 %r1859, %r751, 13;
and.b32 %r1860, %r1859, 1;
sub.s32 %r1861, %r754, %r1860;
shr.u64 %rd100, %rd643, %r1861;
sub.s32 %r2735, %r2735, %r1861;
cvt.u32.u64 %r1862, %rd643;
shl.b32 %r1863, %r1862, 31;
setp.eq.s32 %p495, %r1861, 0;
mov.u32 %r1864, -1;
shl.b32 %r1865, %r1864, %r1861;
not.b32 %r1866, %r1865;
selp.b32 %r1867, 0, %r1866, %p495;
and.b32 %r1868, %r1867, %r1862;
shr.u32 %r1869, %r751, 9;
and.b32 %r1870, %r1869, 1;
shl.b32 %r1871, %r1870, %r1861;
or.b32 %r1872, %r1871, %r1868;
or.b32 %r2734, %r1872, 1;
add.s32 %r1873, %r2734, 2;
shl.b32 %r1874, %r1873, %r667;
or.b32 %r2733, %r1874, %r1863;
mov.u64 %rd643, %rd100;
$L__BB25_369:
setp.ge.u32 %p496, %r741, %r3;
@%p496 bra $L__BB25_371;
add.s32 %r1875, %r2714, %r9;
mul.wide.u32 %rd438, %r1875, 4;
add.s64 %rd439, %rd2, %rd438;
st.global.u32 [%rd439], %r2733;
$L__BB25_371:
or.b32 %r1877, %r2734, %r2715;
mul.wide.u32 %rd440, %r2713, 4;
add.s64 %rd441, %rd77, %rd440;
st.local.u32 [%rd441], %r1877;
add.s32 %r781, %r2714, 1;
add.s32 %r1878, %r2716, 1;
setp.ge.u32 %p497, %r1878, %r2;
mov.u32 %r2715, 0;
@%p497 bra $L__BB25_387;
and.b32 %r1880, %r751, 64;
setp.eq.s32 %p498, %r1880, 0;
mov.u32 %r2750, 0;
mov.u32 %r2742, %r2750;
@%p498 bra $L__BB25_378;
setp.gt.u32 %p499, %r2735, 31;
@%p499 bra $L__BB25_377;
$L__BB25_374:
setp.ge.u32 %p500, %r2732, %r16;
mov.u16 %rs1413, 255;
@%p500 bra $L__BB25_376;
add.s32 %r784, %r2732, 1;
cvt.u64.u32 %rd442, %r2732;
add.s64 %rd443, %rd442, %rd4;
add.s64 %rd444, %rd1, %rd443;
ld.global.u8 %rs1413, [%rd444];
mov.u32 %r2732, %r784;
$L__BB25_376:
and.b16 %rs946, %rs1413, 255;
cvt.u64.u16 %rd445, %rs1413;
and.b64 %rd446, %rd445, 255;
shl.b64 %rd447, %rd446, %r2735;
or.b64 %rd643, %rd447, %rd643;
add.s32 %r1881, %r2735, 8;
cvt.u32.u16 %r1882, %rs1411;
cvt.s32.s8 %r1883, %r1882;
sub.s32 %r2735, %r1881, %r1883;
setp.eq.s16 %p501, %rs946, 255;
selp.u16 %rs1411, 1, 0, %p501;
setp.lt.u32 %p502, %r2735, 33;
@%p502 bra $L__BB25_374;
$L__BB25_377:
shr.u32 %r1884, %r751, 14;
and.b32 %r1885, %r1884, 1;
sub.s32 %r1886, %r754, %r1885;
shr.u64 %rd105, %rd643, %r1886;
sub.s32 %r2735, %r2735, %r1886;
cvt.u32.u64 %r1887, %rd643;
shl.b32 %r1888, %r1887, 31;
setp.eq.s32 %p503, %r1886, 0;
mov.u32 %r1889, -1;
shl.b32 %r1890, %r1889, %r1886;
not.b32 %r1891, %r1890;
selp.b32 %r1892, 0, %r1891, %p503;
and.b32 %r1893, %r1892, %r1887;
shr.u32 %r1894, %r751, 10;
and.b32 %r1895, %r1894, 1;
shl.b32 %r1896, %r1895, %r1886;
or.b32 %r1897, %r1896, %r1893;
or.b32 %r1898, %r1897, 1;
add.s32 %r1899, %r1898, 2;
shl.b32 %r1900, %r1899, %r667;
or.b32 %r2742, %r1900, %r1888;
mov.u64 %rd643, %rd105;
$L__BB25_378:
mul.wide.u32 %rd448, %r781, 4;
add.s64 %rd449, %rd2, %rd448;
st.global.u32 [%rd449], %r2742;
and.b32 %r1903, %r751, 128;
setp.eq.s32 %p504, %r1903, 0;
mov.u32 %r2715, %r2750;
@%p504 bra $L__BB25_384;
setp.gt.u32 %p505, %r2735, 31;
@%p505 bra $L__BB25_383;
$L__BB25_380:
setp.ge.u32 %p506, %r2732, %r16;
mov.u16 %rs1417, 255;
@%p506 bra $L__BB25_382;
add.s32 %r796, %r2732, 1;
cvt.u64.u32 %rd450, %r2732;
add.s64 %rd451, %rd450, %rd4;
add.s64 %rd452, %rd1, %rd451;
ld.global.u8 %rs1417, [%rd452];
mov.u32 %r2732, %r796;
$L__BB25_382:
and.b16 %rs948, %rs1417, 255;
cvt.u64.u16 %rd453, %rs1417;
and.b64 %rd454, %rd453, 255;
shl.b64 %rd455, %rd454, %r2735;
or.b64 %rd643, %rd455, %rd643;
add.s32 %r1904, %r2735, 8;
cvt.u32.u16 %r1905, %rs1411;
cvt.s32.s8 %r1906, %r1905;
sub.s32 %r2735, %r1904, %r1906;
setp.eq.s16 %p507, %rs948, 255;
selp.u16 %rs1411, 1, 0, %p507;
setp.lt.u32 %p508, %r2735, 33;
@%p508 bra $L__BB25_380;
$L__BB25_383:
shr.u32 %r1907, %r751, 15;
sub.s32 %r1908, %r754, %r1907;
shr.u64 %rd110, %rd643, %r1908;
sub.s32 %r2735, %r2735, %r1908;
cvt.u32.u64 %r1909, %rd643;
shl.b32 %r1910, %r1909, 31;
setp.eq.s32 %p509, %r1908, 0;
mov.u32 %r1911, -1;
shl.b32 %r1912, %r1911, %r1908;
not.b32 %r1913, %r1912;
selp.b32 %r1914, 0, %r1913, %p509;
and.b32 %r1915, %r1914, %r1909;
shr.u32 %r1916, %r751, 11;
and.b32 %r1917, %r1916, 1;
shl.b32 %r1918, %r1917, %r1908;
or.b32 %r1919, %r1918, %r1915;
or.b32 %r2715, %r1919, 1;
add.s32 %r1920, %r2715, 2;
shl.b32 %r1921, %r1920, %r667;
or.b32 %r2750, %r1921, %r1910;
mov.u64 %rd643, %rd110;
$L__BB25_384:
@%p496 bra $L__BB25_386;
add.s32 %r1922, %r781, %r9;
mul.wide.u32 %rd456, %r1922, 4;
add.s64 %rd457, %rd2, %rd456;
st.global.u32 [%rd457], %r2750;
$L__BB25_386:
add.s32 %r2714, %r2714, 2;
add.s32 %r2712, %r2712, 2;
add.s32 %r2716, %r2716, 2;
setp.lt.u32 %p511, %r2716, %r2;
mov.u32 %r2711, %r753;
mov.u32 %r2713, %r752;
@%p511 bra $L__BB25_356;
$L__BB25_387:
st.local.u32 [%rd91], %r2715;
add.s32 %r2708, %r2708, 2;
setp.lt.u32 %p512, %r2708, %r3;
@%p512 bra $L__BB25_355;
// Entry to the next phase. If %r13 < 2 (unsigned) there is nothing more to do
// and the kernel returns through $L__BB25_541.
// NOTE(review): %r13 is set outside this view; presumably a pass count --
// confirm.
//
// Otherwise derive the sizes used by the following phase:
//   %r815 = (%r3 + 3) >> 2
//   %r817 = (%r2 + 3) >> 2          (%r816 = %r2 + 3 is kept for a later test)
//   %r818 = (%r817 + 9) & 0x7FFFFFF8   -> a multiple of 8, used as a row stride
//   %r819 = %r817 + 8
// and reject the input when %r818 > 72 or (%r815 + 1) * %r818 > 528.
// NOTE(review): 72 and 528 are presumably element capacities of local scratch
// buffers -- confirm against the CUDA source.
$L__BB25_388:
setp.lt.u32 %p513, %r13, 2;
@%p513 bra $L__BB25_541;
add.s32 %r1923, %r3, 3;
shr.u32 %r815, %r1923, 2;
add.s32 %r1924, %r815, 1;
add.s32 %r816, %r2, 3;
shr.u32 %r817, %r816, 2;
add.s32 %r1925, %r817, 9;
and.b32 %r818, %r1925, 2147483640;
add.s32 %r819, %r817, 8;
setp.gt.u32 %p514, %r818, 72;
mul.lo.s32 %r1926, %r1924, %r818;
setp.gt.u32 %p515, %r1926, 528;
or.pred %p516, %p514, %p515;
@%p516 bra $L__BB25_531;
bra.uni $L__BB25_390;
// Error exit for the size check above: store {2, 15, 0, 0} at
// [%rd5 .. %rd5+12], then return.
$L__BB25_531:
mov.u32 %r2260, 2;
st.global.u32 [%rd5], %r2260;
mov.u32 %r2261, 15;
st.global.u32 [%rd5+4], %r2261;
mov.u32 %r2262, 0;
st.global.u32 [%rd5+8], %r2262;
st.global.u32 [%rd5+12], %r2262;
bra.uni $L__BB25_541;
// Error exit taken from the $L__BB25_356 loop when %r754 > %r666 (unsigned):
// store {1, 14, 0, 0} at [%rd5 .. %rd5+12] and fall through to the return.
$L__BB25_532:
mov.u32 %r2267, 1;
st.global.u32 [%rd5], %r2267;
mov.u32 %r2268, 14;
st.global.u32 [%rd5+4], %r2268;
mov.u32 %r2269, 0;
st.global.u32 [%rd5+8], %r2269;
st.global.u32 [%rd5+12], %r2269;
// Shared exit; every error stub in this region ends up here.
$L__BB25_541:
ret;
// Second limit check: %r819 (= %r817 + 8) must not exceed 72; otherwise take
// the error exit below. On success control continues at $L__BB25_391.
$L__BB25_390:
setp.gt.u32 %p517, %r819, 72;
@%p517 bra $L__BB25_530;
bra.uni $L__BB25_391;
// Error exit: store {2, 16, 0, 0} at [%rd5 .. %rd5+12] and jump to the
// shared `ret` at $L__BB25_541.
$L__BB25_530:
mov.u32 %r2253, 2;
st.global.u32 [%rd5], %r2253;
mov.u32 %r2254, 16;
st.global.u32 [%rd5+4], %r2254;
mov.u32 %r2255, 0;
st.global.u32 [%rd5+8], %r2255;
st.global.u32 [%rd5+12], %r2255;
bra.uni $L__BB25_541;
// Packing pass: reads 16-bit entries from the local array at %rd14 and writes
// one packed 16-bit word per group of four source entries into the local
// array at %rd113 (= %SPL + 6720).
//
// Outer loop ($L__BB25_392): %r2756 = 0, 4, 8, ... while %r2756 < %r3.
//   source start index      %r2758 = (%r2756 >> 1) * %r1148
//   destination start index %r2757 = (%r2756 >> 2) * %r818
// Inner loop ($L__BB25_393): %r2759 = 0, 4, 8, ... while %r2759 < %r2.
//
// Each inner iteration takes bits 4..7 of four source entries and places them
// in the output word as follows (source bits -> output bits):
//   entry [%r2758]              : 4-5 -> 0-1,   6-7 -> 4-5
//   entry [%r2758 + 2]          : 4-5 -> 8-9,   6-7 -> 12-13
//   entry [%r2758 + %r1148]     : 4-5 -> 2-3,   6-7 -> 6-7
//   entry [%r2758 + %r1148 + 2] : 4-5 -> 10-11, 6-7 -> 14-15
// NOTE(review): %r1148 looks like the row stride of the %rd14 array and the
// packed word like a 4x4 block of per-sample flags -- both are assumptions,
// confirm against the CUDA source.
$L__BB25_391:
add.u64 %rd113, %SPL, 6720;
mov.u32 %r1927, 0;
mov.u32 %r2756, %r1927;
$L__BB25_392:
shr.u32 %r1930, %r2756, 1;
mul.lo.s32 %r2758, %r1930, %r1148;
shr.u32 %r1931, %r2756, 2;
mul.lo.s32 %r2757, %r1931, %r818;
mov.u32 %r2759, %r1927;
$L__BB25_393:
// First entry: (x & 0x30) >> 4 and (x & 0xC0) >> 2.
mul.wide.u32 %rd459, %r2758, 2;
add.s64 %rd460, %rd14, %rd459;
ld.local.u16 %rs949, [%rd460];
and.b16 %rs950, %rs949, 48;
shr.u16 %rs951, %rs950, 4;
and.b16 %rs952, %rs949, 192;
shr.u16 %rs953, %rs952, 2;
// Second entry (index + 2): (x << 4) & 0x300 and (x << 6) & 0x3000.
add.s32 %r1932, %r2758, 2;
mul.wide.u32 %rd461, %r1932, 2;
add.s64 %rd462, %rd14, %rd461;
ld.local.u16 %rs954, [%rd462];
shl.b16 %rs955, %rs954, 4;
and.b16 %rs956, %rs955, 768;
shl.b16 %rs957, %rs954, 6;
and.b16 %rs958, %rs957, 12288;
// Third entry (index + %r1148): (x & 0x30) >> 2 and x & 0xC0.
add.s32 %r1933, %r2758, %r1148;
mul.wide.u32 %rd463, %r1933, 2;
add.s64 %rd464, %rd14, %rd463;
ld.local.u16 %rs959, [%rd464];
and.b16 %rs960, %rs959, 48;
shr.u16 %rs961, %rs960, 2;
and.b16 %rs962, %rs959, 192;
// Fourth entry (index + %r1148 + 2): (x << 6) & 0xC00 and (x << 8) & 0xC000
// (-16384 is 0xC000 as a signed 16-bit immediate).
add.s32 %r1934, %r1933, 2;
mul.wide.u32 %rd465, %r1934, 2;
add.s64 %rd466, %rd14, %rd465;
ld.local.u16 %rs963, [%rd466];
shl.b16 %rs964, %rs963, 6;
and.b16 %rs965, %rs964, 3072;
shl.b16 %rs966, %rs963, 8;
and.b16 %rs967, %rs966, -16384;
// OR the eight disjoint 2-bit fields together and store the packed word.
or.b16 %rs968, %rs951, %rs953;
or.b16 %rs969, %rs968, %rs958;
or.b16 %rs970, %rs969, %rs956;
or.b16 %rs971, %rs970, %rs962;
or.b16 %rs972, %rs971, %rs961;
or.b16 %rs973, %rs972, %rs967;
or.b16 %rs974, %rs973, %rs965;
mul.wide.u32 %rd467, %r2757, 2;
add.s64 %rd468, %rd113, %rd467;
st.local.u16 [%rd468], %rs974;
// Advance: 4 source entries, 1 destination word, counter += 4.
add.s32 %r2758, %r2758, 4;
add.s32 %r2757, %r2757, 1;
add.s32 %r2759, %r2759, 4;
setp.lt.u32 %p518, %r2759, %r2;
@%p518 bra $L__BB25_393;
// After the inner loop, write one zero word just past the last packed word of
// this destination row, then advance the outer counter by 4.
mul.wide.u32 %rd469, %r2757, 2;
add.s64 %rd470, %rd113, %rd469;
mov.u16 %rs975, 0;
st.local.u16 [%rd470], %rs975;
add.s32 %r2756, %r2756, 4;
setp.lt.u32 %p519, %r2756, %r3;
@%p519 bra $L__BB25_392;
// Zero-fill %r832 = %r817 + 1 consecutive 16-bit words of the local array at
// %rd113, starting at element index %r831 = %r818 * %r815.
// The fill is split into a 4-wide vector-store loop followed by a scalar
// remainder loop that handles the last %r832 & 3 words.
mul.lo.s32 %r831, %r818, %r815;
add.s32 %r832, %r817, 1;
and.b32 %r2871, %r832, 3;
// %r816 < 12 means %r817 (= %r816 >> 2) < 3, i.e. fewer than 4 words in
// total, so the 4-wide loop is skipped.
setp.lt.u32 %p520, %r816, 12;
mov.u32 %r2762, 0;
@%p520 bra $L__BB25_398;
// %r2761 = word count rounded down to a multiple of 4; %r2762 = running offset.
sub.s32 %r2761, %r832, %r2871;
mov.u32 %r2762, 0;
$L__BB25_397:
add.s32 %r1937, %r2762, %r831;
mul.wide.u32 %rd471, %r1937, 2;
add.s64 %rd472, %rd113, %rd471;
mov.u16 %rs976, 0;
st.local.v4.u16 [%rd472], {%rs976, %rs976, %rs976, %rs976};
add.s32 %r2762, %r2762, 4;
add.s32 %r2761, %r2761, -4;
setp.ne.s32 %p521, %r2761, 0;
@%p521 bra $L__BB25_397;
$L__BB25_398:
// Remainder: skip entirely when the count is a multiple of 4.
setp.eq.s32 %p522, %r2871, 0;
@%p522 bra $L__BB25_401;
mov.u32 %r2764, %r2871;
$L__BB25_400:
.pragma "nounroll";
add.s32 %r1938, %r2762, %r831;
mul.wide.u32 %rd473, %r1938, 2;
add.s64 %rd474, %rd113, %rd473;
mov.u16 %rs977, 0;
st.local.u16 [%rd474], %rs977;
add.s32 %r2762, %r2762, 1;
add.s32 %r2764, %r2764, -1;
setp.ne.s32 %p523, %r2764, 0;
@%p523 bra $L__BB25_400;
// Zero-fill the first %r819 (= %r817 + 8) 16-bit words of the local array at
// %rd114 (= %SPL + 7776).
//   %r2768 = %r819 & 3        -> words left for the scalar remainder loop
//   %r2766 = %r819 - %r2768   -> words handled four at a time
// The 4-wide loop has no zero-trip guard. That is safe here because
// %r819 >= 8 (it is %r817 + 8 with %r817 obtained from a right shift by 2),
// so %r2766 is a nonzero multiple of 4.
$L__BB25_401:
and.b32 %r2768, %r819, 3;
sub.s32 %r2766, %r819, %r2768;
add.u64 %rd114, %SPL, 7776;
mov.u32 %r2767, 0;
$L__BB25_402:
mul.wide.u32 %rd476, %r2767, 2;
add.s64 %rd477, %rd114, %rd476;
mov.u16 %rs978, 0;
st.local.u16 [%rd477], %rs978;
st.local.u16 [%rd477+2], %rs978;
st.local.u16 [%rd477+4], %rs978;
st.local.u16 [%rd477+6], %rs978;
add.s32 %r2767, %r2767, 4;
add.s32 %r2766, %r2766, -4;
setp.ne.s32 %p524, %r2766, 0;
@%p524 bra $L__BB25_402;
// Scalar remainder: one word per iteration, skipped when %r819 is a multiple
// of 4. %r2767 continues from where the 4-wide loop stopped.
setp.eq.s32 %p525, %r2768, 0;
@%p525 bra $L__BB25_405;
$L__BB25_404:
.pragma "nounroll";
mul.wide.u32 %rd478, %r2767, 2;
add.s64 %rd479, %rd114, %rd478;
mov.u16 %rs979, 0;
st.local.u16 [%rd479], %rs979;
add.s32 %r2767, %r2767, 1;
add.s32 %r2768, %r2768, -1;
setp.ne.s32 %p526, %r2768, 0;
@%p526 bra $L__BB25_404;
$L__BB25_405:
cvt.u64.u32 %rd591, %r5;
add.s64 %rd115, %rd591, %rd4;
add.s32 %r854, %r665, -2;
mov.u32 %r1943, 3;
shl.b32 %r855, %r1943, %r854;
shl.b32 %r856, %r9, 1;
mul.lo.s32 %r857, %r9, 3;
mov.u16 %rs1426, 0;
mov.u32 %r1942, 0;
mov.u64 %rd654, 0;
mov.u32 %r2769, %r1942;
mov.u32 %r2778, %r1942;
mov.u32 %r2779, %r1942;
$L__BB25_406:
sub.s32 %r1946, %r3, %r2769;
setp.lt.u32 %p527, %r1946, 4;
setp.lt.u32 %p528, %r1946, 3;
setp.lt.u32 %p529, %r1946, 2;
selp.b32 %r1947, 4369, 13107, %p529;
selp.b32 %r1948, %r1947, 30583, %p528;
selp.b32 %r861, %r1948, 65535, %p527;
shr.u32 %r1949, %r2769, 2;
mul.lo.s32 %r862, %r1949, %r818;
add.s32 %r863, %r862, %r818;
mad.lo.s32 %r864, %r2769, %r9, %r10;
mov.u32 %r2772, %r1942;
mov.u32 %r2773, %r1942;
$L__BB25_407:
add.s32 %r1951, %r2772, 4;
sub.s32 %r1952, %r1951, %r2;
mov.u32 %r2802, 0;
max.s32 %r1953, %r1952, 0;
shl.b32 %r1954, %r1953, 2;
shr.u32 %r1955, %r861, %r1954;
shr.u32 %r869, %r2772, 2;
mul.wide.u32 %rd482, %r869, 2;
add.s64 %rd118, %rd114, %rd482;
ld.local.u16 %rs981, [%rd118];
ld.local.u16 %rs982, [%rd118+2];
mov.b32 %r1956, {%rs981, %rs982};
add.s32 %r1957, %r863, %r869;
mul.wide.u32 %rd483, %r1957, 2;
add.s64 %rd484, %rd113, %rd483;
ld.local.u16 %rs983, [%rd484];
add.s32 %r1958, %r1957, 1;
mul.wide.u32 %rd485, %r1958, 2;
add.s64 %rd486, %rd113, %rd485;
ld.local.u16 %rs984, [%rd486];
mov.b32 %r1959, {%rs983, %rs984};
and.b32 %r1960, %r1956, -2004318072;
shr.u32 %r1961, %r1960, 3;
shl.b32 %r1962, %r1959, 3;
and.b32 %r1963, %r1962, -2004318072;
setp.eq.s32 %p530, %r11, 0;
selp.b32 %r1964, %r1963, 0, %p530;
or.b32 %r870, %r1964, %r1961;
add.s32 %r1965, %r869, %r862;
mul.wide.u32 %rd487, %r1965, 2;
add.s64 %rd488, %rd113, %rd487;
ld.local.u16 %rs985, [%rd488];
add.s32 %r1966, %r1965, 1;
mul.wide.u32 %rd489, %r1966, 2;
add.s64 %rd490, %rd113, %rd489;
ld.local.u16 %rs986, [%rd490];
mov.b32 %r871, {%rs985, %rs986};
shl.b32 %r1967, %r871, 1;
and.b32 %r1968, %r1967, -286331154;
or.b32 %r1969, %r1968, %r871;
and.b32 %r1970, %r871, -286331154;
shr.u32 %r1971, %r1970, 1;
or.b32 %r1972, %r1969, %r1971;
or.b32 %r1973, %r1972, %r870;
shl.b32 %r1974, %r1973, 4;
shr.u32 %r1975, %r1973, 4;
shr.u32 %r1976, %r2773, 12;
or.b32 %r1977, %r1973, %r1976;
or.b32 %r1978, %r1977, %r1974;
or.b32 %r1979, %r1978, %r1975;
not.b32 %r1980, %r871;
and.b32 %r872, %r1955, %r1980;
and.b32 %r873, %r872, %r1979;
setp.eq.s32 %p531, %r873, 0;
@%p531 bra $L__BB25_486;
setp.gt.u32 %p532, %r2779, 31;
@%p532 bra $L__BB25_412;
$L__BB25_409:
setp.ge.u32 %p533, %r2778, %r2885;
mov.u16 %rs1424, 0;
@%p533 bra $L__BB25_411;
add.s32 %r876, %r2778, 1;
cvt.u64.u32 %rd491, %r2778;
add.s64 %rd492, %rd115, %rd491;
add.s64 %rd493, %rd1, %rd492;
ld.global.u8 %rs1424, [%rd493];
mov.u32 %r2778, %r876;
$L__BB25_411:
and.b16 %rs988, %rs1424, 255;
cvt.u64.u16 %rd494, %rs1424;
and.b64 %rd495, %rd494, 255;
shl.b64 %rd496, %rd495, %r2779;
or.b64 %rd654, %rd496, %rd654;
cvt.u32.u16 %r1981, %rs1426;
cvt.s32.s8 %r1982, %r1981;
mov.u32 %r1983, 8;
sub.s32 %r1984, %r1983, %r1982;
add.s32 %r2779, %r1984, %r2779;
setp.eq.s16 %p534, %rs988, 255;
selp.u16 %rs1426, 1, 0, %p534;
setp.lt.u32 %p535, %r2779, 33;
@%p535 bra $L__BB25_409;
$L__BB25_412:
cvt.u32.u64 %r2803, %rd654;
and.b32 %r1986, %r873, 15;
setp.eq.s32 %p536, %r1986, 0;
mov.u32 %r2804, 0;
mov.u32 %r2802, %r873;
@%p536 bra $L__BB25_421;
and.b32 %r1988, %r873, 1;
setp.eq.b32 %p537, %r1988, 1;
mov.pred %p538, 0;
xor.pred %p539, %p537, %p538;
not.pred %p540, %p539;
mov.u32 %r2804, 0;
mov.u32 %r2802, %r873;
@%p540 bra $L__BB25_415;
and.b32 %r1990, %r873, -2;
and.b32 %r1991, %r2803, 1;
neg.s32 %r1992, %r1991;
mov.u32 %r2804, 1;
and.b32 %r1993, %r1992, %r872;
and.b32 %r1994, %r1993, 51;
or.b32 %r2802, %r1994, %r1990;
shr.u32 %r2803, %r2803, 1;
$L__BB25_415:
and.b32 %r1995, %r2802, 2;
setp.eq.s32 %p541, %r1995, 0;
@%p541 bra $L__BB25_417;
and.b32 %r1996, %r2802, -3;
and.b32 %r1997, %r2803, 1;
neg.s32 %r1998, %r1997;
and.b32 %r1999, %r1998, %r872;
and.b32 %r2000, %r1999, 118;
or.b32 %r2802, %r2000, %r1996;
shr.u32 %r2803, %r2803, 1;
add.s32 %r2804, %r2804, 1;
$L__BB25_417:
and.b32 %r2001, %r2802, 4;
setp.eq.s32 %p542, %r2001, 0;
@%p542 bra $L__BB25_419;
and.b32 %r2002, %r2802, -5;
and.b32 %r2003, %r2803, 1;
neg.s32 %r2004, %r2003;
and.b32 %r2005, %r2004, %r872;
and.b32 %r2006, %r2005, 236;
or.b32 %r2802, %r2006, %r2002;
shr.u32 %r2803, %r2803, 1;
add.s32 %r2804, %r2804, 1;
$L__BB25_419:
and.b32 %r2007, %r2802, 8;
setp.eq.s32 %p543, %r2007, 0;
@%p543 bra $L__BB25_421;
and.b32 %r2008, %r2802, -9;
and.b32 %r2009, %r2803, 1;
neg.s32 %r2010, %r2009;
and.b32 %r2011, %r2010, %r872;
and.b32 %r2012, %r2011, 200;
or.b32 %r2802, %r2012, %r2008;
shr.u32 %r2803, %r2803, 1;
add.s32 %r2804, %r2804, 1;
$L__BB25_421:
and.b32 %r2013, %r2802, 240;
setp.eq.s32 %p544, %r2013, 0;
@%p544 bra $L__BB25_430;
and.b32 %r2014, %r2802, 16;
setp.eq.s32 %p545, %r2014, 0;
@%p545 bra $L__BB25_424;
and.b32 %r2015, %r2802, -17;
and.b32 %r2016, %r2803, 1;
neg.s32 %r2017, %r2016;
and.b32 %r2018, %r2017, %r872;
and.b32 %r2019, %r2018, 816;
or.b32 %r2802, %r2019, %r2015;
shr.u32 %r2803, %r2803, 1;
add.s32 %r2804, %r2804, 1;
$L__BB25_424:
and.b32 %r2020, %r2802, 32;
setp.eq.s32 %p546, %r2020, 0;
@%p546 bra $L__BB25_426;
and.b32 %r2021, %r2802, -33;
and.b32 %r2022, %r2803, 1;
neg.s32 %r2023, %r2022;
and.b32 %r2024, %r2023, %r872;
and.b32 %r2025, %r2024, 1888;
or.b32 %r2802, %r2025, %r2021;
shr.u32 %r2803, %r2803, 1;
add.s32 %r2804, %r2804, 1;
$L__BB25_426:
and.b32 %r2026, %r2802, 64;
setp.eq.s32 %p547, %r2026, 0;
@%p547 bra $L__BB25_428;
and.b32 %r2027, %r2802, -65;
and.b32 %r2028, %r2803, 1;
neg.s32 %r2029, %r2028;
and.b32 %r2030, %r2029, %r872;
and.b32 %r2031, %r2030, 3776;
or.b32 %r2802, %r2031, %r2027;
shr.u32 %r2803, %r2803, 1;
add.s32 %r2804, %r2804, 1;
$L__BB25_428:
and.b32 %r2032, %r2802, 128;
setp.eq.s32 %p548, %r2032, 0;
@%p548 bra $L__BB25_430;
and.b32 %r2033, %r2802, -129;
and.b32 %r2034, %r2803, 1;
neg.s32 %r2035, %r2034;
and.b32 %r2036, %r2035, %r872;
and.b32 %r2037, %r2036, 3200;
or.b32 %r2802, %r2037, %r2033;
shr.u32 %r2803, %r2803, 1;
add.s32 %r2804, %r2804, 1;
$L__BB25_430:
and.b32 %r2038, %r2802, 3840;
setp.eq.s32 %p549, %r2038, 0;
@%p549 bra $L__BB25_439;
and.b32 %r2039, %r2802, 256;
setp.eq.s32 %p550, %r2039, 0;
@%p550 bra $L__BB25_433;
and.b32 %r2040, %r2802, -257;
and.b32 %r2041, %r2803, 1;
neg.s32 %r2042, %r2041;
and.b32 %r2043, %r2042, %r872;
and.b32 %r2044, %r2043, 13056;
or.b32 %r2802, %r2044, %r2040;
shr.u32 %r2803, %r2803, 1;
add.s32 %r2804, %r2804, 1;
$L__BB25_433:
and.b32 %r2045, %r2802, 512;
setp.eq.s32 %p551, %r2045, 0;
@%p551 bra $L__BB25_435;
and.b32 %r2046, %r2802, -513;
and.b32 %r2047, %r2803, 1;
neg.s32 %r2048, %r2047;
and.b32 %r2049, %r2048, %r872;
and.b32 %r2050, %r2049, 30208;
or.b32 %r2802, %r2050, %r2046;
shr.u32 %r2803, %r2803, 1;
add.s32 %r2804, %r2804, 1;
$L__BB25_435:
and.b32 %r2051, %r2802, 1024;
setp.eq.s32 %p552, %r2051, 0;
@%p552 bra $L__BB25_437;
and.b32 %r2052, %r2802, -1025;
and.b32 %r2053, %r2803, 1;
neg.s32 %r2054, %r2053;
and.b32 %r2055, %r2054, %r872;
and.b32 %r2056, %r2055, 60416;
or.b32 %r2802, %r2056, %r2052;
shr.u32 %r2803, %r2803, 1;
add.s32 %r2804, %r2804, 1;
$L__BB25_437:
and.b32 %r2057, %r2802, 2048;
setp.eq.s32 %p553, %r2057, 0;
@%p553 bra $L__BB25_439;
and.b32 %r2058, %r2802, -2049;
and.b32 %r2059, %r2803, 1;
neg.s32 %r2060, %r2059;
and.b32 %r2061, %r2060, %r872;
and.b32 %r2062, %r2061, 51200;
or.b32 %r2802, %r2062, %r2058;
shr.u32 %r2803, %r2803, 1;
add.s32 %r2804, %r2804, 1;
$L__BB25_439:
and.b32 %r2063, %r2802, 61440;
setp.eq.s32 %p554, %r2063, 0;
@%p554 bra $L__BB25_448;
and.b32 %r2064, %r2802, 4096;
setp.eq.s32 %p555, %r2064, 0;
@%p555 bra $L__BB25_442;
and.b32 %r2065, %r2802, -4097;
and.b32 %r2066, %r2803, 1;
neg.s32 %r2067, %r2066;
and.b32 %r2068, %r2067, %r872;
and.b32 %r2069, %r2068, 208896;
or.b32 %r2802, %r2069, %r2065;
shr.u32 %r2803, %r2803, 1;
add.s32 %r2804, %r2804, 1;
$L__BB25_442:
and.b32 %r2070, %r2802, 8192;
setp.eq.s32 %p556, %r2070, 0;
@%p556 bra $L__BB25_444;
and.b32 %r2071, %r2802, -8193;
and.b32 %r2072, %r2803, 1;
neg.s32 %r2073, %r2072;
and.b32 %r2074, %r2073, %r872;
and.b32 %r2075, %r2074, 483328;
or.b32 %r2802, %r2075, %r2071;
shr.u32 %r2803, %r2803, 1;
add.s32 %r2804, %r2804, 1;
$L__BB25_444:
and.b32 %r2076, %r2802, 16384;
setp.eq.s32 %p557, %r2076, 0;
@%p557 bra $L__BB25_446;
and.b32 %r2077, %r2802, -16385;
and.b32 %r2078, %r2803, 1;
neg.s32 %r2079, %r2078;
and.b32 %r2080, %r2079, %r872;
and.b32 %r2081, %r2080, 966656;
or.b32 %r2802, %r2081, %r2077;
shr.u32 %r2803, %r2803, 1;
add.s32 %r2804, %r2804, 1;
$L__BB25_446:
cvt.u16.u32 %rs989, %r2802;
setp.gt.s16 %p558, %rs989, -1;
@%p558 bra $L__BB25_448;
and.b32 %r2082, %r2802, -32769;
and.b32 %r2083, %r2803, 1;
neg.s32 %r2084, %r2083;
and.b32 %r2085, %r2084, %r872;
and.b32 %r2086, %r2085, 819200;
or.b32 %r2802, %r2086, %r2082;
shr.u32 %r2803, %r2803, 1;
add.s32 %r2804, %r2804, 1;
$L__BB25_448:
setp.eq.s32 %p559, %r2802, 0;
@%p559 bra $L__BB25_485;
add.s32 %r977, %r864, %r2772;
and.b32 %r2087, %r2802, 15;
setp.eq.s32 %p560, %r2087, 0;
@%p560 bra $L__BB25_458;
and.b32 %r2088, %r2802, 1;
setp.eq.b32 %p561, %r2088, 1;
mov.pred %p562, 0;
xor.pred %p563, %p561, %p562;
not.pred %p564, %p563;
@%p564 bra $L__BB25_452;
shl.b32 %r2089, %r2803, 31;
or.b32 %r2090, %r2089, %r855;
mul.wide.u32 %rd497, %r977, 4;
add.s64 %rd498, %rd2, %rd497;
st.global.u32 [%rd498], %r2090;
shr.u32 %r2803, %r2803, 1;
add.s32 %r2804, %r2804, 1;
$L__BB25_452:
and.b32 %r2091, %r2802, 2;
setp.eq.s32 %p565, %r2091, 0;
@%p565 bra $L__BB25_454;
shl.b32 %r2092, %r2803, 31;
or.b32 %r2093, %r2092, %r855;
add.s32 %r2094, %r977, %r9;
mul.wide.u32 %rd499, %r2094, 4;
add.s64 %rd500, %rd2, %rd499;
st.global.u32 [%rd500], %r2093;
shr.u32 %r2803, %r2803, 1;
add.s32 %r2804, %r2804, 1;
$L__BB25_454:
and.b32 %r2095, %r2802, 4;
setp.eq.s32 %p566, %r2095, 0;
@%p566 bra $L__BB25_456;
shl.b32 %r2096, %r2803, 31;
or.b32 %r2097, %r2096, %r855;
add.s32 %r2098, %r977, %r856;
mul.wide.u32 %rd501, %r2098, 4;
add.s64 %rd502, %rd2, %rd501;
st.global.u32 [%rd502], %r2097;
shr.u32 %r2803, %r2803, 1;
add.s32 %r2804, %r2804, 1;
$L__BB25_456:
and.b32 %r2099, %r2802, 8;
setp.eq.s32 %p567, %r2099, 0;
@%p567 bra $L__BB25_458;
shl.b32 %r2100, %r2803, 31;
or.b32 %r2101, %r2100, %r855;
add.s32 %r2102, %r977, %r857;
mul.wide.u32 %rd503, %r2102, 4;
add.s64 %rd504, %rd2, %rd503;
st.global.u32 [%rd504], %r2101;
shr.u32 %r2803, %r2803, 1;
add.s32 %r2804, %r2804, 1;
$L__BB25_458:
add.s32 %r994, %r977, 1;
and.b32 %r2103, %r2802, 240;
setp.eq.s32 %p568, %r2103, 0;
@%p568 bra $L__BB25_467;
and.b32 %r2104, %r2802, 16;
setp.eq.s32 %p569, %r2104, 0;
@%p569 bra $L__BB25_461;
shl.b32 %r2105, %r2803, 31;
or.b32 %r2106, %r2105, %r855;
mul.wide.u32 %rd505, %r994, 4;
add.s64 %rd506, %rd2, %rd505;
st.global.u32 [%rd506], %r2106;
shr.u32 %r2803, %r2803, 1;
add.s32 %r2804, %r2804, 1;
$L__BB25_461:
and.b32 %r2107, %r2802, 32;
setp.eq.s32 %p570, %r2107, 0;
@%p570 bra $L__BB25_463;
shl.b32 %r2108, %r2803, 31;
or.b32 %r2109, %r2108, %r855;
add.s32 %r2110, %r994, %r9;
mul.wide.u32 %rd507, %r2110, 4;
add.s64 %rd508, %rd2, %rd507;
st.global.u32 [%rd508], %r2109;
shr.u32 %r2803, %r2803, 1;
add.s32 %r2804, %r2804, 1;
$L__BB25_463:
and.b32 %r2111, %r2802, 64;
setp.eq.s32 %p571, %r2111, 0;
@%p571 bra $L__BB25_465;
shl.b32 %r2112, %r2803, 31;
or.b32 %r2113, %r2112, %r855;
add.s32 %r2114, %r994, %r856;
mul.wide.u32 %rd509, %r2114, 4;
add.s64 %rd510, %rd2, %rd509;
st.global.u32 [%rd510], %r2113;
shr.u32 %r2803, %r2803, 1;
add.s32 %r2804, %r2804, 1;
$L__BB25_465:
and.b32 %r2115, %r2802, 128;
setp.eq.s32 %p572, %r2115, 0;
@%p572 bra $L__BB25_467;
shl.b32 %r2116, %r2803, 31;
or.b32 %r2117, %r2116, %r855;
add.s32 %r2118, %r994, %r857;
mul.wide.u32 %rd511, %r2118, 4;
add.s64 %rd512, %rd2, %rd511;
st.global.u32 [%rd512], %r2117;
shr.u32 %r2803, %r2803, 1;
add.s32 %r2804, %r2804, 1;
$L__BB25_467:
add.s32 %r1011, %r977, 2;
and.b32 %r2119, %r2802, 3840;
setp.eq.s32 %p573, %r2119, 0;
@%p573 bra $L__BB25_476;
and.b32 %r2120, %r2802, 256;
setp.eq.s32 %p574, %r2120, 0;
@%p574 bra $L__BB25_470;
shl.b32 %r2121, %r2803, 31;
or.b32 %r2122, %r2121, %r855;
mul.wide.u32 %rd513, %r1011, 4;
add.s64 %rd514, %rd2, %rd513;
st.global.u32 [%rd514], %r2122;
shr.u32 %r2803, %r2803, 1;
add.s32 %r2804, %r2804, 1;
$L__BB25_470:
and.b32 %r2123, %r2802, 512;
setp.eq.s32 %p575, %r2123, 0;
@%p575 bra $L__BB25_472;
shl.b32 %r2124, %r2803, 31;
or.b32 %r2125, %r2124, %r855;
add.s32 %r2126, %r1011, %r9;
mul.wide.u32 %rd515, %r2126, 4;
add.s64 %rd516, %rd2, %rd515;
st.global.u32 [%rd516], %r2125;
shr.u32 %r2803, %r2803, 1;
add.s32 %r2804, %r2804, 1;
$L__BB25_472:
and.b32 %r2127, %r2802, 1024;
setp.eq.s32 %p576, %r2127, 0;
@%p576 bra $L__BB25_474;
shl.b32 %r2128, %r2803, 31;
or.b32 %r2129, %r2128, %r855;
add.s32 %r2130, %r1011, %r856;
mul.wide.u32 %rd517, %r2130, 4;
add.s64 %rd518, %rd2, %rd517;
st.global.u32 [%rd518], %r2129;
shr.u32 %r2803, %r2803, 1;
add.s32 %r2804, %r2804, 1;
$L__BB25_474:
and.b32 %r2131, %r2802, 2048;
setp.eq.s32 %p577, %r2131, 0;
@%p577 bra $L__BB25_476;
shl.b32 %r2132, %r2803, 31;
or.b32 %r2133, %r2132, %r855;
add.s32 %r2134, %r1011, %r857;
mul.wide.u32 %rd519, %r2134, 4;
add.s64 %rd520, %rd2, %rd519;
st.global.u32 [%rd520], %r2133;
shr.u32 %r2803, %r2803, 1;
add.s32 %r2804, %r2804, 1;
$L__BB25_476:
add.s32 %r1028, %r977, 3;
and.b32 %r2135, %r2802, 61440;
setp.eq.s32 %p578, %r2135, 0;
@%p578 bra $L__BB25_485;
and.b32 %r2136, %r2802, 4096;
setp.eq.s32 %p579, %r2136, 0;
@%p579 bra $L__BB25_479;
shl.b32 %r2137, %r2803, 31;
or.b32 %r2138, %r2137, %r855;
mul.wide.u32 %rd521, %r1028, 4;
add.s64 %rd522, %rd2, %rd521;
st.global.u32 [%rd522], %r2138;
shr.u32 %r2803, %r2803, 1;
add.s32 %r2804, %r2804, 1;
$L__BB25_479:
and.b32 %r2139, %r2802, 8192;
setp.eq.s32 %p580, %r2139, 0;
@%p580 bra $L__BB25_481;
shl.b32 %r2140, %r2803, 31;
or.b32 %r2141, %r2140, %r855;
add.s32 %r2142, %r1028, %r9;
mul.wide.u32 %rd523, %r2142, 4;
add.s64 %rd524, %rd2, %rd523;
st.global.u32 [%rd524], %r2141;
shr.u32 %r2803, %r2803, 1;
add.s32 %r2804, %r2804, 1;
$L__BB25_481:
and.b32 %r2143, %r2802, 16384;
setp.eq.s32 %p581, %r2143, 0;
@%p581 bra $L__BB25_483;
shl.b32 %r2144, %r2803, 31;
or.b32 %r2145, %r2144, %r855;
add.s32 %r2146, %r1028, %r856;
mul.wide.u32 %rd525, %r2146, 4;
add.s64 %rd526, %rd2, %rd525;
st.global.u32 [%rd526], %r2145;
shr.u32 %r2803, %r2803, 1;
add.s32 %r2804, %r2804, 1;
$L__BB25_483:
cvt.u16.u32 %rs990, %r2802;
setp.gt.s16 %p582, %rs990, -1;
@%p582 bra $L__BB25_485;
shl.b32 %r2147, %r2803, 31;
or.b32 %r2148, %r2147, %r855;
add.s32 %r2149, %r1028, %r857;
mul.wide.u32 %rd527, %r2149, 4;
add.s64 %rd528, %rd2, %rd527;
st.global.u32 [%rd528], %r2148;
add.s32 %r2804, %r2804, 1;
$L__BB25_485:
shr.u64 %rd654, %rd654, %r2804;
sub.s32 %r2779, %r2779, %r2804;
$L__BB25_486:
or.b32 %r1047, %r2802, %r871;
st.local.u16 [%rd118], %r1047;
add.s32 %r2150, %r869, 1;
setp.ge.u32 %p583, %r2150, %r819;
@%p583 bra $L__BB25_488;
shr.u32 %r2151, %r1047, 16;
st.local.u16 [%rd118+2], %r2151;
$L__BB25_488:
shl.b32 %r2152, %r1047, 1;
and.b32 %r2153, %r2152, 57344;
and.b32 %r2154, %r1047, 57344;
shr.u32 %r2155, %r2154, 1;
or.b32 %r2156, %r1047, %r870;
and.b32 %r2157, %r2156, 61440;
or.b32 %r2158, %r2157, %r2153;
or.b32 %r2773, %r2158, %r2155;
setp.lt.u32 %p584, %r1951, %r2;
mov.u32 %r2772, %r1951;
@%p584 bra $L__BB25_407;
add.s32 %r2769, %r2769, 4;
setp.gt.u32 %p585, %r3, %r2769;
@%p585 bra $L__BB25_406;
setp.lt.u32 %p586, %r13, 3;
@%p586 bra $L__BB25_541;
mov.u32 %r2159, 0;
mov.u32 %r2863, %r2159;
$L__BB25_492:
shr.u32 %r2161, %r2863, 1;
mul.lo.s32 %r2865, %r2161, %r1148;
shr.u32 %r2162, %r2863, 2;
mul.lo.s32 %r2864, %r2162, %r818;
mov.u32 %r2866, %r2159;
$L__BB25_493:
mul.wide.u32 %rd529, %r2865, 2;
add.s64 %rd530, %rd14, %rd529;
ld.local.u16 %rs991, [%rd530];
and.b16 %rs992, %rs991, 48;
shr.u16 %rs993, %rs992, 4;
and.b16 %rs994, %rs991, 192;
shr.u16 %rs995, %rs994, 2;
add.s32 %r2163, %r2865, 2;
mul.wide.u32 %rd531, %r2163, 2;
add.s64 %rd532, %rd14, %rd531;
ld.local.u16 %rs996, [%rd532];
shl.b16 %rs997, %rs996, 4;
and.b16 %rs998, %rs997, 768;
shl.b16 %rs999, %rs996, 6;
and.b16 %rs1000, %rs999, 12288;
add.s32 %r2164, %r2865, %r1148;
mul.wide.u32 %rd533, %r2164, 2;
add.s64 %rd534, %rd14, %rd533;
ld.local.u16 %rs1001, [%rd534];
and.b16 %rs1002, %rs1001, 48;
shr.u16 %rs1003, %rs1002, 2;
and.b16 %rs1004, %rs1001, 192;
add.s32 %r2165, %r2164, 2;
mul.wide.u32 %rd535, %r2165, 2;
add.s64 %rd536, %rd14, %rd535;
ld.local.u16 %rs1005, [%rd536];
shl.b16 %rs1006, %rs1005, 6;
and.b16 %rs1007, %rs1006, 3072;
shl.b16 %rs1008, %rs1005, 8;
and.b16 %rs1009, %rs1008, -16384;
or.b16 %rs1010, %rs993, %rs995;
or.b16 %rs1011, %rs1010, %rs1000;
or.b16 %rs1012, %rs1011, %rs998;
or.b16 %rs1013, %rs1012, %rs1004;
or.b16 %rs1014, %rs1013, %rs1003;
or.b16 %rs1015, %rs1014, %rs1009;
or.b16 %rs1016, %rs1015, %rs1007;
mul.wide.u32 %rd537, %r2864, 2;
add.s64 %rd538, %rd113, %rd537;
st.local.u16 [%rd538], %rs1016;
add.s32 %r2865, %r2865, 4;
add.s32 %r2864, %r2864, 1;
add.s32 %r2866, %r2866, 4;
setp.lt.u32 %p587, %r2866, %r2;
@%p587 bra $L__BB25_493;
mul.wide.u32 %rd539, %r2864, 2;
add.s64 %rd540, %rd113, %rd539;
mov.u16 %rs1017, 0;
st.local.u16 [%rd540], %rs1017;
add.s32 %r2863, %r2863, 4;
setp.lt.u32 %p588, %r2863, %r3;
@%p588 bra $L__BB25_492;
mov.u32 %r2869, 0;
@%p520 bra $L__BB25_498;
sub.s32 %r2868, %r832, %r2871;
mov.u32 %r2869, 0;
$L__BB25_497:
add.s32 %r2168, %r2869, %r831;
mul.wide.u32 %rd541, %r2168, 2;
add.s64 %rd542, %rd113, %rd541;
mov.u16 %rs1018, 0;
st.local.v4.u16 [%rd542], {%rs1018, %rs1018, %rs1018, %rs1018};
add.s32 %r2869, %r2869, 4;
add.s32 %r2868, %r2868, -4;
setp.ne.s32 %p590, %r2868, 0;
@%p590 bra $L__BB25_497;
$L__BB25_498:
@%p522 bra $L__BB25_500;
$L__BB25_499:
.pragma "nounroll";
add.s32 %r2169, %r2869, %r831;
mul.wide.u32 %rd543, %r2169, 2;
add.s64 %rd544, %rd113, %rd543;
mov.u16 %rs1019, 0;
st.local.u16 [%rd544], %rs1019;
add.s32 %r2869, %r2869, 1;
add.s32 %r2871, %r2871, -1;
setp.ne.s32 %p592, %r2871, 0;
@%p592 bra $L__BB25_499;
$L__BB25_500:
add.s32 %r2172, %r5, %r2885;
add.s32 %r2886, %r2172, -1;
mov.u32 %r2173, 1;
shl.b32 %r1072, %r2173, %r854;
mov.u16 %rs1431, 1;
mov.u32 %r2171, 0;
mov.u64 %rd659, 0;
mov.u32 %r2872, %r2171;
mov.u32 %r2884, %r2171;
$L__BB25_501:
shr.u32 %r2175, %r2872, 2;
mul.lo.s32 %r2877, %r2175, %r818;
mad.lo.s32 %r1078, %r2872, %r9, %r10;
mov.u32 %r2876, %r2171;
$L__BB25_502:
setp.gt.u32 %p593, %r2884, 31;
@%p593 bra $L__BB25_507;
mov.u32 %r2882, %r2885;
$L__BB25_504:
setp.eq.s32 %p594, %r2882, 0;
mov.u16 %rs1430, 0;
@%p594 bra $L__BB25_506;
cvt.s64.s32 %rd546, %r2886;
add.s64 %rd547, %rd546, %rd4;
add.s64 %rd548, %rd1, %rd547;
ld.global.u8 %rs1430, [%rd548];
$L__BB25_506:
add.s32 %r2176, %r2882, -1;
selp.b32 %r2885, 0, %r2176, %p594;
setp.ne.s32 %p596, %r2882, 0;
selp.b32 %r2177, -1, 0, %p596;
add.s32 %r2886, %r2886, %r2177;
and.b16 %rs1022, %rs1430, 255;
and.b16 %rs1023, %rs1430, 127;
setp.eq.s16 %p597, %rs1023, 127;
and.b16 %rs1024, %rs1431, 255;
setp.ne.s16 %p598, %rs1024, 0;
and.pred %p599, %p598, %p597;
selp.b32 %r2178, 7, 8, %p599;
cvt.u64.u16 %rd549, %rs1430;
and.b64 %rd550, %rd549, 255;
shl.b64 %rd551, %rd550, %r2884;
or.b64 %rd659, %rd551, %rd659;
add.s32 %r2884, %r2178, %r2884;
setp.gt.u16 %p600, %rs1022, 143;
selp.u16 %rs1431, 1, 0, %p600;
setp.lt.u32 %p601, %r2884, 33;
mov.u32 %r2882, %r2885;
@%p601 bra $L__BB25_504;
$L__BB25_507:
mul.wide.u32 %rd552, %r2877, 2;
add.s64 %rd553, %rd113, %rd552;
ld.local.u32 %r1093, [%rd553];
setp.eq.s32 %p602, %r1093, 0;
@%p602 bra $L__BB25_528;
cvt.u32.u64 %r2897, %rd659;
add.s32 %r1095, %r1078, %r2876;
mov.u32 %r2889, 15;
mov.u32 %r2887, 0;
$L__BB25_509:
and.b32 %r2181, %r2889, %r1093;
setp.eq.s32 %p603, %r2181, 0;
@%p603 bra $L__BB25_518;
add.s32 %r1099, %r1095, %r2887;
and.b32 %r1100, %r2889, 286331137;
and.b32 %r2182, %r1100, %r1093;
setp.eq.s32 %p604, %r2182, 0;
@%p604 bra $L__BB25_512;
not.b32 %r2183, %r2897;
and.b32 %r2184, %r2183, 1;
shl.b32 %r2185, %r2184, %r667;
or.b32 %r2186, %r2185, %r1072;
mul.wide.u32 %rd554, %r1099, 4;
add.s64 %rd555, %rd2, %rd554;
ld.global.u32 %r2187, [%rd555];
xor.b32 %r2188, %r2187, %r2186;
st.global.u32 [%rd555], %r2188;
shr.u32 %r2897, %r2897, 1;
$L__BB25_512:
add.s32 %r1103, %r1099, %r9;
shl.b32 %r2189, %r1100, 1;
and.b32 %r2190, %r2189, %r1093;
setp.eq.s32 %p605, %r2190, 0;
@%p605 bra $L__BB25_514;
not.b32 %r2191, %r2897;
and.b32 %r2192, %r2191, 1;
shl.b32 %r2193, %r2192, %r667;
or.b32 %r2194, %r2193, %r1072;
mul.wide.u32 %rd556, %r1103, 4;
add.s64 %rd557, %rd2, %rd556;
ld.global.u32 %r2195, [%rd557];
xor.b32 %r2196, %r2195, %r2194;
st.global.u32 [%rd557], %r2196;
shr.u32 %r2897, %r2897, 1;
$L__BB25_514:
add.s32 %r1106, %r1103, %r9;
shl.b32 %r2197, %r1100, 2;
and.b32 %r2198, %r2197, %r1093;
setp.eq.s32 %p606, %r2198, 0;
@%p606 bra $L__BB25_516;
not.b32 %r2199, %r2897;
and.b32 %r2200, %r2199, 1;
shl.b32 %r2201, %r2200, %r667;
or.b32 %r2202, %r2201, %r1072;
mul.wide.u32 %rd558, %r1106, 4;
add.s64 %rd559, %rd2, %rd558;
ld.global.u32 %r2203, [%rd559];
xor.b32 %r2204, %r2203, %r2202;
st.global.u32 [%rd559], %r2204;
shr.u32 %r2897, %r2897, 1;
$L__BB25_516:
shl.b32 %r2205, %r1100, 3;
and.b32 %r2206, %r2205, %r1093;
setp.eq.s32 %p607, %r2206, 0;
@%p607 bra $L__BB25_518;
not.b32 %r2207, %r2897;
and.b32 %r2208, %r2207, 1;
shl.b32 %r2209, %r2208, %r667;
or.b32 %r2210, %r2209, %r1072;
add.s32 %r2211, %r1106, %r9;
mul.wide.u32 %rd560, %r2211, 4;
add.s64 %rd561, %rd2, %rd560;
ld.global.u32 %r2212, [%rd561];
xor.b32 %r2213, %r2212, %r2210;
st.global.u32 [%rd561], %r2213;
shr.u32 %r2897, %r2897, 1;
$L__BB25_518:
shl.b32 %r1111, %r2889, 4;
and.b32 %r2214, %r1111, %r1093;
setp.eq.s32 %p608, %r2214, 0;
@%p608 bra $L__BB25_527;
add.s32 %r2215, %r1095, %r2887;
add.s32 %r1112, %r2215, 1;
and.b32 %r1113, %r1111, 286330896;
and.b32 %r2216, %r1113, %r1093;
setp.eq.s32 %p609, %r2216, 0;
@%p609 bra $L__BB25_521;
not.b32 %r2217, %r2897;
and.b32 %r2218, %r2217, 1;
shl.b32 %r2219, %r2218, %r667;
or.b32 %r2220, %r2219, %r1072;
mul.wide.u32 %rd562, %r1112, 4;
add.s64 %rd563, %rd2, %rd562;
ld.global.u32 %r2221, [%rd563];
xor.b32 %r2222, %r2221, %r2220;
st.global.u32 [%rd563], %r2222;
shr.u32 %r2897, %r2897, 1;
$L__BB25_521:
add.s32 %r1116, %r1112, %r9;
shl.b32 %r2223, %r1113, 1;
and.b32 %r2224, %r2223, %r1093;
setp.eq.s32 %p610, %r2224, 0;
@%p610 bra $L__BB25_523;
not.b32 %r2225, %r2897;
and.b32 %r2226, %r2225, 1;
shl.b32 %r2227, %r2226, %r667;
or.b32 %r2228, %r2227, %r1072;
mul.wide.u32 %rd564, %r1116, 4;
add.s64 %rd565, %rd2, %rd564;
ld.global.u32 %r2229, [%rd565];
xor.b32 %r2230, %r2229, %r2228;
st.global.u32 [%rd565], %r2230;
shr.u32 %r2897, %r2897, 1;
$L__BB25_523:
add.s32 %r1119, %r1116, %r9;
shl.b32 %r2231, %r1113, 2;
and.b32 %r2232, %r2231, %r1093;
setp.eq.s32 %p611, %r2232, 0;
@%p611 bra $L__BB25_525;
not.b32 %r2233, %r2897;
and.b32 %r2234, %r2233, 1;
shl.b32 %r2235, %r2234, %r667;
or.b32 %r2236, %r2235, %r1072;
mul.wide.u32 %rd566, %r1119, 4;
add.s64 %rd567, %rd2, %rd566;
ld.global.u32 %r2237, [%rd567];
xor.b32 %r2238, %r2237, %r2236;
st.global.u32 [%rd567], %r2238;
shr.u32 %r2897, %r2897, 1;
$L__BB25_525:
shl.b32 %r2239, %r1113, 3;
and.b32 %r2240, %r2239, %r1093;
setp.eq.s32 %p612, %r2240, 0;
@%p612 bra $L__BB25_527;
not.b32 %r2241, %r2897;
and.b32 %r2242, %r2241, 1;
shl.b32 %r2243, %r2242, %r667;
or.b32 %r2244, %r2243, %r1072;
add.s32 %r2245, %r1119, %r9;
mul.wide.u32 %rd568, %r2245, 4;
add.s64 %rd569, %rd2, %rd568;
ld.global.u32 %r2246, [%rd569];
xor.b32 %r2247, %r2246, %r2244;
st.global.u32 [%rd569], %r2247;
shr.u32 %r2897, %r2897, 1;
$L__BB25_527:
shl.b32 %r2889, %r2889, 8;
add.s32 %r2887, %r2887, 2;
setp.ne.s32 %p613, %r2887, 8;
@%p613 bra $L__BB25_509;
$L__BB25_528:
popc.b32 %r2248, %r1093;
shr.u64 %rd659, %rd659, %r2248;
sub.s32 %r2884, %r2884, %r2248;
add.s32 %r2876, %r2876, 8;
setp.lt.u32 %p614, %r2876, %r2;
add.s32 %r2877, %r2877, 2;
@%p614 bra $L__BB25_502;
add.s32 %r2872, %r2872, 4;
setp.lt.u32 %p615, %r2872, %r3;
@%p615 bra $L__BB25_501;
bra.uni $L__BB25_541;
}
// .globl j2k_htj2k_decode_codeblocks_multi
.visible .entry j2k_htj2k_decode_codeblocks_multi(
.param .u64 j2k_htj2k_decode_codeblocks_multi_param_0,
.param .u64 j2k_htj2k_decode_codeblocks_multi_param_1,
.param .u64 j2k_htj2k_decode_codeblocks_multi_param_2,
.param .u64 j2k_htj2k_decode_codeblocks_multi_param_3,
.param .u64 j2k_htj2k_decode_codeblocks_multi_param_4,
.param .u64 j2k_htj2k_decode_codeblocks_multi_param_5,
.param .u64 j2k_htj2k_decode_codeblocks_multi_param_6,
.param .u32 j2k_htj2k_decode_codeblocks_multi_param_7
)
{
.local .align 16 .b8 __local_depot26[7920];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<617>;
.reg .b16 %rs<1432>;
.reg .b32 %r<2913>;
.reg .b64 %rd<657>;
mov.u64 %SPL, __local_depot26;
ld.param.u64 %rd135, [ j2k_htj2k_decode_codeblocks_multi_param_0];
ld.param.u64 %rd129, [ j2k_htj2k_decode_codeblocks_multi_param_1];
ld.param.u64 %rd134, [ j2k_htj2k_decode_codeblocks_multi_param_6];
ld.param.u32 %r1136, [ j2k_htj2k_decode_codeblocks_multi_param_7];
cvta.to.global.u64 %rd1, %rd135;
mov.u32 %r1137, %ntid.x;
mov.u32 %r1138, %ctaid.x;
mov.u32 %r1139, %tid.x;
mad.lo.s32 %r1, %r1138, %r1137, %r1139;
setp.ge.u32 %p1, %r1, %r1136;
@%p1 bra $L__BB26_541;
cvta.to.global.u64 %rd136, %rd134;
cvta.to.global.u64 %rd137, %rd129;
mul.wide.u32 %rd138, %r1, 64;
add.s64 %rd139, %rd137, %rd138;
ld.global.u64 %rd140, [%rd139];
cvta.to.global.u64 %rd2, %rd140;
ld.global.v2.u32 {%r1140, %r1141}, [%rd139+8];
mov.u32 %r1143, 0;
ld.global.v2.u32 {%r1144, %r1145}, [%rd139+16];
ld.global.v2.u32 {%r1146, %r2900}, [%rd139+24];
ld.global.v2.u32 {%r1148, %r1149}, [%rd139+32];
ld.global.v2.u32 {%r1150, %r1151}, [%rd139+40];
ld.global.u32 %r14, [%rd139+48];
ld.global.u32 %r15, [%rd139+56];
cvt.u64.u32 %rd3, %r1140;
mul.wide.u32 %rd141, %r1, 16;
add.s64 %rd4, %rd136, %rd141;
st.global.u32 [%rd4], %r1143;
st.global.u32 [%rd4+4], %r1143;
st.global.u32 [%rd4+8], %r1143;
st.global.u32 [%rd4+12], %r1143;
setp.gt.u32 %p2, %r1150, 1;
setp.eq.s32 %p3, %r2900, 0;
and.pred %p4, %p3, %p2;
selp.b32 %r16, 1, %r1150, %p4;
setp.eq.s32 %p5, %r1141, 0;
setp.eq.s32 %p6, %r1144, 0;
or.pred %p7, %p5, %p6;
@%p7 bra $L__BB26_541;
setp.gt.u32 %p8, %r1141, 256;
setp.gt.u32 %p9, %r1144, 256;
or.pred %p10, %p8, %p9;
mul.lo.s32 %r1153, %r1144, %r1141;
setp.gt.u32 %p11, %r1153, 4096;
or.pred %p12, %p10, %p11;
@%p12 bra $L__BB26_540;
bra.uni $L__BB26_3;
$L__BB26_540:
mov.u32 %r2318, 2;
st.global.u32 [%rd4], %r2318;
mov.u32 %r2319, 1;
st.global.u32 [%rd4+4], %r2319;
mov.u32 %r2320, 0;
st.global.u32 [%rd4+8], %r2320;
st.global.u32 [%rd4+12], %r2320;
bra.uni $L__BB26_541;
$L__BB26_3:
add.s32 %r1154, %r1149, -1;
setp.gt.u32 %p13, %r1154, 30;
@%p13 bra $L__BB26_539;
bra.uni $L__BB26_4;
$L__BB26_539:
mov.u32 %r2315, 1;
st.global.u32 [%rd4], %r2315;
mov.u32 %r2316, 2;
st.global.u32 [%rd4+4], %r2316;
mov.u32 %r2317, 0;
st.global.u32 [%rd4+8], %r2317;
st.global.u32 [%rd4+12], %r2317;
bra.uni $L__BB26_541;
$L__BB26_4:
setp.gt.u32 %p14, %r16, 3;
setp.gt.u32 %p15, %r1148, 29;
or.pred %p16, %p15, %p14;
@%p16 bra $L__BB26_538;
bra.uni $L__BB26_5;
$L__BB26_538:
mov.u32 %r2312, 1;
st.global.u32 [%rd4], %r2312;
mov.u32 %r2313, 3;
st.global.u32 [%rd4+4], %r2313;
mov.u32 %r2314, 0;
st.global.u32 [%rd4+8], %r2314;
st.global.u32 [%rd4+12], %r2314;
bra.uni $L__BB26_541;
$L__BB26_5:
setp.eq.s32 %p17, %r1148, 29;
setp.gt.u32 %p18, %r16, 1;
and.pred %p19, %p17, %p18;
selp.b32 %r17, 1, %r16, %p19;
add.s32 %r1155, %r2900, %r1146;
setp.lt.u32 %p20, %r1145, %r1155;
setp.lt.u32 %p21, %r1146, 2;
or.pred %p22, %p21, %p20;
@%p22 bra $L__BB26_537;
bra.uni $L__BB26_6;
$L__BB26_537:
mov.u32 %r2309, 1;
st.global.u32 [%rd4], %r2309;
mov.u32 %r2310, 4;
st.global.u32 [%rd4+4], %r2310;
mov.u32 %r2311, 0;
st.global.u32 [%rd4+8], %r2311;
st.global.u32 [%rd4+12], %r2311;
bra.uni $L__BB26_541;
$L__BB26_6:
add.s32 %r1156, %r1146, -1;
cvt.u64.u32 %rd142, %r1156;
add.s64 %rd143, %rd142, %rd3;
add.s64 %rd144, %rd1, %rd143;
ld.global.u8 %rs667, [%rd144];
mul.wide.u16 %r1157, %rs667, 16;
add.s32 %r1158, %r1146, -2;
cvt.u64.u32 %rd145, %r1158;
add.s64 %rd146, %rd145, %rd3;
add.s64 %rd147, %rd1, %rd146;
ld.global.u8 %rs1, [%rd147];
and.b16 %rs668, %rs1, 15;
cvt.u32.u16 %r1159, %rs668;
or.b32 %r18, %r1157, %r1159;
setp.lt.u32 %p23, %r1146, %r18;
add.s32 %r19, %r18, -2;
setp.gt.u32 %p24, %r19, 4077;
or.pred %p25, %p23, %p24;
@%p25 bra $L__BB26_536;
bra.uni $L__BB26_7;
$L__BB26_536:
mov.u32 %r2306, 1;
st.global.u32 [%rd4], %r2306;
mov.u32 %r2307, 5;
st.global.u32 [%rd4+4], %r2307;
mov.u32 %r2308, 0;
st.global.u32 [%rd4+8], %r2308;
st.global.u32 [%rd4+12], %r2308;
bra.uni $L__BB26_541;
$L__BB26_7:
add.s32 %r1160, %r1144, 1;
shr.u32 %r1161, %r1160, 1;
add.s32 %r1162, %r1141, 9;
and.b32 %r1163, %r1162, -8;
setp.gt.u32 %p26, %r1163, 264;
add.s32 %r1164, %r1161, 1;
mul.lo.s32 %r1165, %r1164, %r1163;
setp.gt.u32 %p27, %r1165, 3096;
or.pred %p28, %p26, %p27;
@%p28 bra $L__BB26_535;
bra.uni $L__BB26_8;
$L__BB26_535:
mov.u32 %r2303, 2;
st.global.u32 [%rd4], %r2303;
mov.u32 %r2304, 6;
st.global.u32 [%rd4+4], %r2304;
mov.u32 %r2305, 0;
st.global.u32 [%rd4+8], %r2305;
st.global.u32 [%rd4+12], %r2305;
bra.uni $L__BB26_541;
$L__BB26_8:
and.b16 %rs1031, %rs1, 15;
cvt.u32.u16 %r2327, %rs1031;
mul.wide.u16 %r2326, %rs667, 16;
or.b32 %r2325, %r2326, %r2327;
mov.u64 %rd593, 0;
mov.u32 %r2495, 0;
sub.s32 %r20, %r1146, %r2325;
add.s32 %r2405, %r2325, -1;
mov.u16 %rs1104, 0;
mov.u64 %rd150, _ZZ20mel_decode_more_runsR10MelDecoderE7MEL_EXP;
mov.u32 %r2404, %r20;
mov.u16 %rs1105, %rs1104;
mov.u16 %rs1139, %rs1104;
mov.u32 %r2377, %r2495;
$L__BB26_9:
setp.gt.u32 %p30, %r2377, 7;
@%p30 bra $L__BB26_54;
mul.wide.u32 %rd149, %r2495, 4;
add.s64 %rd151, %rd150, %rd149;
ld.global.nc.u32 %r26, [%rd151];
and.b16 %rs672, %rs1139, 255;
setp.ne.s16 %p31, %rs672, 0;
mov.u16 %rs1039, %rs1139;
@%p31 bra $L__BB26_14;
setp.eq.s32 %p32, %r2405, 0;
mov.u16 %rs1036, 255;
@%p32 bra $L__BB26_13;
cvt.u64.u32 %rd152, %r2404;
add.s64 %rd153, %rd152, %rd3;
add.s64 %rd154, %rd1, %rd153;
ld.global.u8 %rs1036, [%rd154];
$L__BB26_13:
setp.ne.s32 %p34, %r2405, 0;
selp.u32 %r1168, 1, 0, %p34;
add.s32 %r2404, %r2404, %r1168;
add.s32 %r1169, %r2405, -1;
selp.b32 %r2405, 0, %r1169, %p32;
setp.eq.s32 %p35, %r2405, 0;
or.b16 %rs674, %rs1036, 15;
selp.b16 %rs1105, %rs674, %rs1036, %p35;
and.b16 %rs675, %rs1105, 255;
mov.u16 %rs676, 8;
sub.s16 %rs1039, %rs676, %rs1104;
setp.eq.s16 %p36, %rs675, 255;
selp.u16 %rs1104, 1, 0, %p36;
$L__BB26_14:
add.s16 %rs15, %rs1039, -1;
cvt.u32.u16 %r1170, %rs15;
and.b32 %r1171, %r1170, 255;
mov.u32 %r1172, 1;
shl.b32 %r1173, %r1172, %r1171;
cvt.u32.u16 %r1174, %rs1105;
and.b32 %r1175, %r1173, %r1174;
and.b32 %r31, %r1175, 255;
add.s16 %rs1139, %rs1039, -1;
setp.eq.s32 %p37, %r31, 0;
@%p37 bra $L__BB26_16;
add.s32 %r1176, %r2495, 1;
min.u32 %r2372, %r1176, 12;
mov.u32 %r1177, -1;
shl.b32 %r1178, %r1177, %r26;
shl.b32 %r1179, %r1178, 1;
xor.b32 %r2373, %r1179, -2;
bra.uni $L__BB26_53;
$L__BB26_16:
cvt.u64.u32 %rd591, %r2495;
add.s64 %rd155, %rd591, -3;
setp.gt.u64 %p38, %rd155, 9;
mov.u32 %r2369, 0;
@%p38 bra $L__BB26_52;
add.s16 %rs1139, %rs1039, -1;
max.u32 %r34, %r26, 1;
add.s32 %r1183, %r34, -1;
setp.lt.u32 %p39, %r1183, 3;
mov.u32 %r2369, 0;
@%p39 bra $L__BB26_36;
and.b32 %r2321, %r34, 3;
add.s16 %rs1139, %rs1039, -1;
sub.s32 %r2341, %r34, %r2321;
mov.u32 %r2369, 0;
$L__BB26_19:
and.b16 %rs678, %rs1139, 255;
setp.ne.s16 %p40, %rs678, 0;
@%p40 bra $L__BB26_23;
setp.eq.s32 %p41, %r2405, 0;
mov.u16 %rs1043, 255;
@%p41 bra $L__BB26_22;
cvt.u64.u32 %rd156, %r2404;
add.s64 %rd157, %rd156, %rd3;
add.s64 %rd158, %rd1, %rd157;
ld.global.u8 %rs1043, [%rd158];
$L__BB26_22:
setp.ne.s32 %p43, %r2405, 0;
selp.u32 %r1185, 1, 0, %p43;
add.s32 %r2404, %r2404, %r1185;
add.s32 %r1186, %r2405, -1;
selp.b32 %r2405, 0, %r1186, %p41;
setp.eq.s32 %p44, %r2405, 0;
or.b16 %rs680, %rs1043, 15;
selp.b16 %rs1105, %rs680, %rs1043, %p44;
and.b16 %rs681, %rs1105, 255;
mov.u16 %rs682, 8;
sub.s16 %rs1139, %rs682, %rs1104;
setp.eq.s16 %p45, %rs681, 255;
selp.u16 %rs1104, 1, 0, %p45;
$L__BB26_23:
add.s16 %rs1050, %rs1139, -1;
and.b16 %rs683, %rs1050, 255;
cvt.u32.u16 %r1187, %rs1050;
and.b32 %r1188, %r1187, 255;
cvt.u32.u16 %r1189, %rs1105;
and.b32 %r2347, %r1189, 255;
shr.u32 %r1190, %r2347, %r1188;
and.b32 %r1191, %r1190, 1;
bfi.b32 %r46, %r2369, %r1191, 1, 31;
setp.ne.s16 %p46, %rs683, 0;
@%p46 bra $L__BB26_27;
setp.eq.s32 %p47, %r2405, 0;
mov.u16 %rs1047, 255;
@%p47 bra $L__BB26_26;
cvt.u64.u32 %rd159, %r2404;
add.s64 %rd160, %rd159, %rd3;
add.s64 %rd161, %rd1, %rd160;
ld.global.u8 %rs1047, [%rd161];
$L__BB26_26:
setp.ne.s32 %p49, %r2405, 0;
selp.u32 %r1192, 1, 0, %p49;
add.s32 %r2404, %r2404, %r1192;
add.s32 %r1193, %r2405, -1;
selp.b32 %r2405, 0, %r1193, %p47;
setp.eq.s32 %p50, %r2405, 0;
or.b16 %rs685, %rs1047, 15;
selp.b16 %rs1105, %rs685, %rs1047, %p50;
and.b16 %rs686, %rs1105, 255;
mov.u16 %rs687, 8;
sub.s16 %rs1050, %rs687, %rs1104;
setp.eq.s16 %p51, %rs686, 255;
selp.u16 %rs1104, 1, 0, %p51;
cvt.u32.u16 %r1194, %rs1105;
and.b32 %r2347, %r1194, 255;
$L__BB26_27:
add.s16 %rs1054, %rs1050, -1;
and.b16 %rs688, %rs1054, 255;
cvt.u32.u16 %r1195, %rs1054;
and.b32 %r1196, %r1195, 255;
shr.u32 %r1197, %r2347, %r1196;
and.b32 %r1198, %r1197, 1;
bfi.b32 %r53, %r46, %r1198, 1, 31;
setp.ne.s16 %p52, %rs688, 0;
@%p52 bra $L__BB26_31;
setp.eq.s32 %p53, %r2405, 0;
mov.u16 %rs1051, 255;
@%p53 bra $L__BB26_30;
cvt.u64.u32 %rd162, %r2404;
add.s64 %rd163, %rd162, %rd3;
add.s64 %rd164, %rd1, %rd163;
ld.global.u8 %rs1051, [%rd164];
$L__BB26_30:
setp.ne.s32 %p55, %r2405, 0;
selp.u32 %r1199, 1, 0, %p55;
add.s32 %r2404, %r2404, %r1199;
add.s32 %r1200, %r2405, -1;
selp.b32 %r2405, 0, %r1200, %p53;
setp.eq.s32 %p56, %r2405, 0;
or.b16 %rs690, %rs1051, 15;
selp.b16 %rs1105, %rs690, %rs1051, %p56;
and.b16 %rs691, %rs1105, 255;
mov.u16 %rs692, 8;
sub.s16 %rs1054, %rs692, %rs1104;
setp.eq.s16 %p57, %rs691, 255;
selp.u16 %rs1104, 1, 0, %p57;
cvt.u32.u16 %r1201, %rs1105;
and.b32 %r2347, %r1201, 255;
$L__BB26_31:
add.s16 %rs1058, %rs1054, -1;
and.b16 %rs693, %rs1058, 255;
cvt.u32.u16 %r1202, %rs1058;
and.b32 %r1203, %r1202, 255;
shr.u32 %r1204, %r2347, %r1203;
and.b32 %r1205, %r1204, 1;
bfi.b32 %r60, %r53, %r1205, 1, 31;
setp.ne.s16 %p58, %rs693, 0;
@%p58 bra $L__BB26_35;
setp.eq.s32 %p59, %r2405, 0;
mov.u16 %rs1055, 255;
@%p59 bra $L__BB26_34;
cvt.u64.u32 %rd165, %r2404;
add.s64 %rd166, %rd165, %rd3;
add.s64 %rd167, %rd1, %rd166;
ld.global.u8 %rs1055, [%rd167];
$L__BB26_34:
setp.ne.s32 %p61, %r2405, 0;
selp.u32 %r1206, 1, 0, %p61;
add.s32 %r2404, %r2404, %r1206;
add.s32 %r1207, %r2405, -1;
selp.b32 %r2405, 0, %r1207, %p59;
setp.eq.s32 %p62, %r2405, 0;
or.b16 %rs695, %rs1055, 15;
selp.b16 %rs1105, %rs695, %rs1055, %p62;
and.b16 %rs696, %rs1105, 255;
mov.u16 %rs697, 8;
sub.s16 %rs1058, %rs697, %rs1104;
setp.eq.s16 %p63, %rs696, 255;
selp.u16 %rs1104, 1, 0, %p63;
cvt.u32.u16 %r1208, %rs1105;
and.b32 %r2347, %r1208, 255;
$L__BB26_35:
add.s16 %rs1139, %rs1058, -1;
cvt.u32.u16 %r1209, %rs1139;
and.b32 %r1210, %r1209, 255;
shr.u32 %r1211, %r2347, %r1210;
and.b32 %r1212, %r1211, 1;
bfi.b32 %r2369, %r60, %r1212, 1, 31;
add.s32 %r2341, %r2341, -4;
setp.ne.s32 %p64, %r2341, 0;
@%p64 bra $L__BB26_19;
$L__BB26_36:
and.b32 %r2322, %r34, 3;
setp.eq.s32 %p65, %r2322, 0;
@%p65 bra $L__BB26_52;
and.b16 %rs698, %rs1139, 255;
setp.ne.s16 %p66, %rs698, 0;
@%p66 bra $L__BB26_41;
setp.eq.s32 %p67, %r2405, 0;
mov.u16 %rs1065, 255;
@%p67 bra $L__BB26_40;
cvt.u64.u32 %rd168, %r2404;
add.s64 %rd169, %rd168, %rd3;
add.s64 %rd170, %rd1, %rd169;
ld.global.u8 %rs1065, [%rd170];
$L__BB26_40:
setp.ne.s32 %p69, %r2405, 0;
selp.u32 %r1213, 1, 0, %p69;
add.s32 %r2404, %r2404, %r1213;
add.s32 %r1214, %r2405, -1;
selp.b32 %r2405, 0, %r1214, %p67;
setp.eq.s32 %p70, %r2405, 0;
or.b16 %rs700, %rs1065, 15;
selp.b16 %rs1105, %rs700, %rs1065, %p70;
and.b16 %rs701, %rs1105, 255;
mov.u16 %rs702, 8;
sub.s16 %rs1139, %rs702, %rs1104;
setp.eq.s16 %p71, %rs701, 255;
selp.u16 %rs1104, 1, 0, %p71;
$L__BB26_41:
and.b32 %r2323, %r34, 3;
add.s16 %rs1139, %rs1139, -1;
cvt.u32.u16 %r1215, %rs1139;
and.b32 %r1216, %r1215, 255;
cvt.u32.u16 %r1217, %rs1105;
and.b32 %r2364, %r1217, 255;
shr.u32 %r1218, %r2364, %r1216;
and.b32 %r1219, %r1218, 1;
bfi.b32 %r2369, %r2369, %r1219, 1, 31;
setp.eq.s32 %p72, %r2323, 1;
@%p72 bra $L__BB26_52;
and.b16 %rs703, %rs1139, 255;
setp.ne.s16 %p73, %rs703, 0;
@%p73 bra $L__BB26_46;
setp.eq.s32 %p74, %r2405, 0;
mov.u16 %rs1069, 255;
@%p74 bra $L__BB26_45;
cvt.u64.u32 %rd171, %r2404;
add.s64 %rd172, %rd171, %rd3;
add.s64 %rd173, %rd1, %rd172;
ld.global.u8 %rs1069, [%rd173];
$L__BB26_45:
setp.ne.s32 %p76, %r2405, 0;
selp.u32 %r1220, 1, 0, %p76;
add.s32 %r2404, %r2404, %r1220;
add.s32 %r1221, %r2405, -1;
selp.b32 %r2405, 0, %r1221, %p74;
setp.eq.s32 %p77, %r2405, 0;
or.b16 %rs705, %rs1069, 15;
selp.b16 %rs1105, %rs705, %rs1069, %p77;
and.b16 %rs706, %rs1105, 255;
mov.u16 %rs707, 8;
sub.s16 %rs1139, %rs707, %rs1104;
setp.eq.s16 %p78, %rs706, 255;
selp.u16 %rs1104, 1, 0, %p78;
cvt.u32.u16 %r1222, %rs1105;
and.b32 %r2364, %r1222, 255;
$L__BB26_46:
and.b32 %r2324, %r34, 3;
add.s16 %rs1139, %rs1139, -1;
cvt.u32.u16 %r1223, %rs1139;
and.b32 %r1224, %r1223, 255;
shr.u32 %r1225, %r2364, %r1224;
and.b32 %r1226, %r1225, 1;
bfi.b32 %r2369, %r2369, %r1226, 1, 31;
setp.eq.s32 %p79, %r2324, 2;
@%p79 bra $L__BB26_52;
and.b16 %rs708, %rs1139, 255;
setp.ne.s16 %p80, %rs708, 0;
@%p80 bra $L__BB26_51;
setp.eq.s32 %p81, %r2405, 0;
mov.u16 %rs1073, 255;
@%p81 bra $L__BB26_50;
cvt.u64.u32 %rd174, %r2404;
add.s64 %rd175, %rd174, %rd3;
add.s64 %rd176, %rd1, %rd175;
ld.global.u8 %rs1073, [%rd176];
$L__BB26_50:
setp.ne.s32 %p83, %r2405, 0;
selp.u32 %r1227, 1, 0, %p83;
add.s32 %r2404, %r2404, %r1227;
add.s32 %r1228, %r2405, -1;
selp.b32 %r2405, 0, %r1228, %p81;
setp.eq.s32 %p84, %r2405, 0;
or.b16 %rs710, %rs1073, 15;
selp.b16 %rs1105, %rs710, %rs1073, %p84;
and.b16 %rs711, %rs1105, 255;
mov.u16 %rs712, 8;
sub.s16 %rs1139, %rs712, %rs1104;
setp.eq.s16 %p85, %rs711, 255;
selp.u16 %rs1104, 1, 0, %p85;
cvt.u32.u16 %r1229, %rs1105;
and.b32 %r2364, %r1229, 255;
$L__BB26_51:
add.s16 %rs1139, %rs1139, -1;
cvt.u32.u16 %r1230, %rs1139;
and.b32 %r1231, %r1230, 255;
shr.u32 %r1232, %r2364, %r1231;
and.b32 %r1233, %r1232, 1;
bfi.b32 %r2369, %r2369, %r1233, 1, 31;
$L__BB26_52:
shl.b32 %r1234, %r2369, 1;
or.b32 %r2373, %r1234, 1;
add.s32 %r1235, %r2495, -1;
setp.eq.s32 %p86, %r2495, 0;
selp.b32 %r2372, 0, %r1235, %p86;
$L__BB26_53:
mul.lo.s32 %r1236, %r2377, 7;
cvt.u64.u32 %rd177, %r2373;
shl.b64 %rd178, %rd177, %r1236;
or.b64 %rd593, %rd178, %rd593;
setp.ne.s32 %p87, %r2495, 12;
setp.ne.s32 %p88, %r31, 0;
or.pred %p89, %p87, %p88;
add.s32 %r2377, %r2377, 1;
setp.lt.u32 %p90, %r2377, 8;
or.pred %p91, %p90, %p89;
mov.u32 %r2495, %r2372;
@%p91 bra $L__BB26_9;
$L__BB26_54:
and.b16 %rs1032, %rs1, 15;
cvt.u32.u16 %r2331, %rs1032;
mul.wide.u16 %r2330, %rs667, 16;
or.b32 %r2329, %r2330, %r2331;
add.s32 %r2574, %r2329, -2;
setp.gt.u16 %p616, %rs1, 143;
selp.u16 %rs1271, 1, 0, %p616;
ld.param.u64 %rd590, [ j2k_htj2k_decode_codeblocks_multi_param_4];
shr.u16 %rs1025, %rs1, 4;
ld.param.u64 %rd587, [ j2k_htj2k_decode_codeblocks_multi_param_2];
add.s32 %r2496, %r2377, -1;
shr.u64 %rd603, %rd593, 7;
cvt.u32.u64 %r1239, %rd593;
and.b32 %r2492, %r1239, 127;
cvt.u64.u16 %rd612, %rs1025;
and.b64 %rd179, %rd612, 7;
setp.eq.s64 %p92, %rd179, 7;
selp.b32 %r2575, 3, 4, %p92;
mov.u32 %r2378, 0;
add.s32 %r2573, %r1146, -3;
cvta.to.global.u64 %rd11, %rd590;
cvta.to.global.u64 %rd12, %rd587;
add.u64 %rd13, %SPL, 0;
mov.u32 %r2379, %r2378;
$L__BB26_55:
setp.gt.u32 %p93, %r2575, 31;
@%p93 bra $L__BB26_59;
$L__BB26_56:
setp.eq.s32 %p94, %r2574, 0;
mov.u16 %rs1091, 0;
@%p94 bra $L__BB26_58;
cvt.s64.s32 %rd181, %r2573;
add.s64 %rd182, %rd181, %rd3;
add.s64 %rd183, %rd1, %rd182;
ld.global.u8 %rs1091, [%rd183];
$L__BB26_58:
setp.ne.s32 %p96, %r2574, 0;
selp.b32 %r1240, -1, 0, %p96;
add.s32 %r2573, %r2573, %r1240;
add.s32 %r1241, %r2574, -1;
selp.b32 %r2574, 0, %r1241, %p94;
and.b16 %rs714, %rs1091, 255;
and.b16 %rs715, %rs1091, 127;
setp.eq.s16 %p97, %rs715, 127;
and.b16 %rs716, %rs1271, 255;
setp.ne.s16 %p98, %rs716, 0;
and.pred %p99, %p98, %p97;
selp.b32 %r1242, 7, 8, %p99;
cvt.u64.u16 %rd184, %rs1091;
and.b64 %rd185, %rd184, 255;
shl.b64 %rd186, %rd185, %r2575;
or.b64 %rd612, %rd186, %rd612;
add.s32 %r2575, %r1242, %r2575;
setp.gt.u16 %p100, %rs714, 143;
selp.u16 %rs1271, 1, 0, %p100;
setp.lt.u32 %p101, %r2575, 33;
@%p101 bra $L__BB26_56;
$L__BB26_59:
cvt.u32.u64 %r1243, %rd612;
and.b32 %r1244, %r1243, 127;
add.s32 %r1245, %r1244, %r2378;
mul.wide.u32 %rd187, %r1245, 2;
add.s64 %rd188, %rd12, %rd187;
ld.global.u16 %r2445, [%rd188];
setp.ne.s32 %p102, %r2378, 0;
@%p102 bra $L__BB26_109;
add.s32 %r133, %r2492, -2;
setp.eq.s32 %p103, %r133, -1;
selp.b32 %r2445, %r2445, 0, %p103;
setp.gt.s32 %p104, %r2492, 1;
mov.u32 %r2492, %r133;
@%p104 bra $L__BB26_109;
setp.ne.s32 %p105, %r2496, 0;
@%p105 bra $L__BB26_108;
mov.u32 %r2496, 0;
$L__BB26_63:
setp.gt.u32 %p106, %r2496, 7;
@%p106 bra $L__BB26_108;
cvt.u64.u32 %rd20, %r2495;
mul.wide.u32 %rd189, %r2495, 4;
add.s64 %rd191, %rd150, %rd189;
ld.global.nc.u32 %r139, [%rd191];
and.b16 %rs717, %rs1139, 255;
setp.ne.s16 %p107, %rs717, 0;
@%p107 bra $L__BB26_68;
setp.eq.s32 %p108, %r2405, 0;
mov.u16 %rs1096, 255;
@%p108 bra $L__BB26_67;
cvt.u64.u32 %rd192, %r2404;
add.s64 %rd193, %rd192, %rd3;
add.s64 %rd194, %rd1, %rd193;
ld.global.u8 %rs1096, [%rd194];
$L__BB26_67:
setp.ne.s32 %p110, %r2405, 0;
selp.u32 %r1247, 1, 0, %p110;
add.s32 %r2404, %r2404, %r1247;
add.s32 %r1248, %r2405, -1;
selp.b32 %r2405, 0, %r1248, %p108;
setp.eq.s32 %p111, %r2405, 0;
or.b16 %rs719, %rs1096, 15;
selp.b16 %rs1105, %rs719, %rs1096, %p111;
and.b16 %rs720, %rs1105, 255;
mov.u16 %rs721, 8;
sub.s16 %rs1139, %rs721, %rs1104;
setp.eq.s16 %p112, %rs720, 255;
selp.u16 %rs1104, 1, 0, %p112;
$L__BB26_68:
add.s16 %rs1139, %rs1139, -1;
cvt.u32.u16 %r1249, %rs1139;
and.b32 %r1250, %r1249, 255;
mov.u32 %r1251, 1;
shl.b32 %r1252, %r1251, %r1250;
cvt.u32.u16 %r1253, %rs1105;
and.b32 %r1254, %r1252, %r1253;
and.b32 %r144, %r1254, 255;
setp.eq.s32 %p113, %r144, 0;
@%p113 bra $L__BB26_70;
add.s32 %r1255, %r2495, 1;
min.u32 %r2434, %r1255, 12;
mov.u32 %r1256, -1;
shl.b32 %r1257, %r1256, %r139;
shl.b32 %r1258, %r1257, 1;
xor.b32 %r2435, %r1258, -2;
bra.uni $L__BB26_107;
$L__BB26_70:
add.s64 %rd195, %rd20, -3;
setp.gt.u64 %p114, %rd195, 9;
mov.u32 %r2431, 0;
@%p114 bra $L__BB26_106;
max.u32 %r147, %r139, 1;
add.s32 %r1262, %r147, -1;
and.b32 %r148, %r147, 3;
setp.lt.u32 %p115, %r1262, 3;
mov.u32 %r2431, 0;
@%p115 bra $L__BB26_90;
sub.s32 %r2403, %r147, %r148;
mov.u32 %r2431, 0;
$L__BB26_73:
and.b16 %rs723, %rs1139, 255;
setp.ne.s16 %p116, %rs723, 0;
@%p116 bra $L__BB26_77;
setp.eq.s32 %p117, %r2405, 0;
mov.u16 %rs1103, 255;
@%p117 bra $L__BB26_76;
cvt.u64.u32 %rd196, %r2404;
add.s64 %rd197, %rd196, %rd3;
add.s64 %rd198, %rd1, %rd197;
ld.global.u8 %rs1103, [%rd198];
$L__BB26_76:
setp.ne.s32 %p119, %r2405, 0;
selp.u32 %r1264, 1, 0, %p119;
add.s32 %r2404, %r2404, %r1264;
add.s32 %r1265, %r2405, -1;
selp.b32 %r2405, 0, %r1265, %p117;
setp.eq.s32 %p120, %r2405, 0;
or.b16 %rs725, %rs1103, 15;
selp.b16 %rs1105, %rs725, %rs1103, %p120;
and.b16 %rs726, %rs1105, 255;
mov.u16 %rs727, 8;
sub.s16 %rs1139, %rs727, %rs1104;
setp.eq.s16 %p121, %rs726, 255;
selp.u16 %rs1104, 1, 0, %p121;
$L__BB26_77:
add.s16 %rs1110, %rs1139, -1;
and.b16 %rs728, %rs1110, 255;
cvt.u32.u16 %r1266, %rs1110;
and.b32 %r1267, %r1266, 255;
cvt.u32.u16 %r1268, %rs1105;
and.b32 %r2409, %r1268, 255;
shr.u32 %r1269, %r2409, %r1267;
and.b32 %r1270, %r1269, 1;
bfi.b32 %r159, %r2431, %r1270, 1, 31;
setp.ne.s16 %p122, %rs728, 0;
@%p122 bra $L__BB26_81;
setp.eq.s32 %p123, %r2405, 0;
mov.u16 %rs1107, 255;
@%p123 bra $L__BB26_80;
cvt.u64.u32 %rd199, %r2404;
add.s64 %rd200, %rd199, %rd3;
add.s64 %rd201, %rd1, %rd200;
ld.global.u8 %rs1107, [%rd201];
$L__BB26_80:
setp.ne.s32 %p125, %r2405, 0;
selp.u32 %r1271, 1, 0, %p125;
add.s32 %r2404, %r2404, %r1271;
add.s32 %r1272, %r2405, -1;
selp.b32 %r2405, 0, %r1272, %p123;
setp.eq.s32 %p126, %r2405, 0;
or.b16 %rs730, %rs1107, 15;
selp.b16 %rs1105, %rs730, %rs1107, %p126;
and.b16 %rs731, %rs1105, 255;
mov.u16 %rs732, 8;
sub.s16 %rs1110, %rs732, %rs1104;
setp.eq.s16 %p127, %rs731, 255;
selp.u16 %rs1104, 1, 0, %p127;
cvt.u32.u16 %r1273, %rs1105;
and.b32 %r2409, %r1273, 255;
$L__BB26_81:
add.s16 %rs1114, %rs1110, -1;
and.b16 %rs733, %rs1114, 255;
cvt.u32.u16 %r1274, %rs1114;
and.b32 %r1275, %r1274, 255;
shr.u32 %r1276, %r2409, %r1275;
and.b32 %r1277, %r1276, 1;
bfi.b32 %r166, %r159, %r1277, 1, 31;
setp.ne.s16 %p128, %rs733, 0;
@%p128 bra $L__BB26_85;
setp.eq.s32 %p129, %r2405, 0;
mov.u16 %rs1111, 255;
@%p129 bra $L__BB26_84;
cvt.u64.u32 %rd202, %r2404;
add.s64 %rd203, %rd202, %rd3;
add.s64 %rd204, %rd1, %rd203;
ld.global.u8 %rs1111, [%rd204];
$L__BB26_84:
setp.ne.s32 %p131, %r2405, 0;
selp.u32 %r1278, 1, 0, %p131;
add.s32 %r2404, %r2404, %r1278;
add.s32 %r1279, %r2405, -1;
selp.b32 %r2405, 0, %r1279, %p129;
setp.eq.s32 %p132, %r2405, 0;
or.b16 %rs735, %rs1111, 15;
selp.b16 %rs1105, %rs735, %rs1111, %p132;
and.b16 %rs736, %rs1105, 255;
mov.u16 %rs737, 8;
sub.s16 %rs1114, %rs737, %rs1104;
setp.eq.s16 %p133, %rs736, 255;
selp.u16 %rs1104, 1, 0, %p133;
cvt.u32.u16 %r1280, %rs1105;
and.b32 %r2409, %r1280, 255;
$L__BB26_85:
add.s16 %rs1118, %rs1114, -1;
and.b16 %rs738, %rs1118, 255;
cvt.u32.u16 %r1281, %rs1118;
and.b32 %r1282, %r1281, 255;
shr.u32 %r1283, %r2409, %r1282;
and.b32 %r1284, %r1283, 1;
bfi.b32 %r173, %r166, %r1284, 1, 31;
setp.ne.s16 %p134, %rs738, 0;
@%p134 bra $L__BB26_89;
setp.eq.s32 %p135, %r2405, 0;
mov.u16 %rs1115, 255;
@%p135 bra $L__BB26_88;
cvt.u64.u32 %rd205, %r2404;
add.s64 %rd206, %rd205, %rd3;
add.s64 %rd207, %rd1, %rd206;
ld.global.u8 %rs1115, [%rd207];
$L__BB26_88:
setp.ne.s32 %p137, %r2405, 0;
selp.u32 %r1285, 1, 0, %p137;
add.s32 %r2404, %r2404, %r1285;
add.s32 %r1286, %r2405, -1;
selp.b32 %r2405, 0, %r1286, %p135;
setp.eq.s32 %p138, %r2405, 0;
or.b16 %rs740, %rs1115, 15;
selp.b16 %rs1105, %rs740, %rs1115, %p138;
and.b16 %rs741, %rs1105, 255;
mov.u16 %rs742, 8;
sub.s16 %rs1118, %rs742, %rs1104;
setp.eq.s16 %p139, %rs741, 255;
selp.u16 %rs1104, 1, 0, %p139;
cvt.u32.u16 %r1287, %rs1105;
and.b32 %r2409, %r1287, 255;
$L__BB26_89:
add.s16 %rs1139, %rs1118, -1;
cvt.u32.u16 %r1288, %rs1139;
and.b32 %r1289, %r1288, 255;
shr.u32 %r1290, %r2409, %r1289;
and.b32 %r1291, %r1290, 1;
bfi.b32 %r2431, %r173, %r1291, 1, 31;
add.s32 %r2403, %r2403, -4;
setp.ne.s32 %p140, %r2403, 0;
@%p140 bra $L__BB26_73;
$L__BB26_90:
setp.eq.s32 %p141, %r148, 0;
@%p141 bra $L__BB26_106;
and.b16 %rs743, %rs1139, 255;
setp.ne.s16 %p142, %rs743, 0;
@%p142 bra $L__BB26_95;
setp.eq.s32 %p143, %r2405, 0;
mov.u16 %rs1125, 255;
@%p143 bra $L__BB26_94;
cvt.u64.u32 %rd208, %r2404;
add.s64 %rd209, %rd208, %rd3;
add.s64 %rd210, %rd1, %rd209;
ld.global.u8 %rs1125, [%rd210];
$L__BB26_94:
setp.ne.s32 %p145, %r2405, 0;
selp.u32 %r1292, 1, 0, %p145;
add.s32 %r2404, %r2404, %r1292;
add.s32 %r1293, %r2405, -1;
selp.b32 %r2405, 0, %r1293, %p143;
setp.eq.s32 %p146, %r2405, 0;
or.b16 %rs745, %rs1125, 15;
selp.b16 %rs1105, %rs745, %rs1125, %p146;
and.b16 %rs746, %rs1105, 255;
mov.u16 %rs747, 8;
sub.s16 %rs1139, %rs747, %rs1104;
setp.eq.s16 %p147, %rs746, 255;
selp.u16 %rs1104, 1, 0, %p147;
$L__BB26_95:
add.s16 %rs1139, %rs1139, -1;
cvt.u32.u16 %r1294, %rs1139;
and.b32 %r1295, %r1294, 255;
cvt.u32.u16 %r1296, %rs1105;
and.b32 %r2426, %r1296, 255;
shr.u32 %r1297, %r2426, %r1295;
and.b32 %r1298, %r1297, 1;
bfi.b32 %r2431, %r2431, %r1298, 1, 31;
setp.eq.s32 %p148, %r148, 1;
@%p148 bra $L__BB26_106;
and.b16 %rs748, %rs1139, 255;
setp.ne.s16 %p149, %rs748, 0;
@%p149 bra $L__BB26_100;
setp.eq.s32 %p150, %r2405, 0;
mov.u16 %rs1129, 255;
@%p150 bra $L__BB26_99;
cvt.u64.u32 %rd211, %r2404;
add.s64 %rd212, %rd211, %rd3;
add.s64 %rd213, %rd1, %rd212;
ld.global.u8 %rs1129, [%rd213];
$L__BB26_99:
setp.ne.s32 %p152, %r2405, 0;
selp.u32 %r1299, 1, 0, %p152;
add.s32 %r2404, %r2404, %r1299;
add.s32 %r1300, %r2405, -1;
selp.b32 %r2405, 0, %r1300, %p150;
setp.eq.s32 %p153, %r2405, 0;
or.b16 %rs750, %rs1129, 15;
selp.b16 %rs1105, %rs750, %rs1129, %p153;
and.b16 %rs751, %rs1105, 255;
mov.u16 %rs752, 8;
sub.s16 %rs1139, %rs752, %rs1104;
setp.eq.s16 %p154, %rs751, 255;
selp.u16 %rs1104, 1, 0, %p154;
cvt.u32.u16 %r1301, %rs1105;
and.b32 %r2426, %r1301, 255;
$L__BB26_100:
add.s16 %rs1139, %rs1139, -1;
cvt.u32.u16 %r1302, %rs1139;
and.b32 %r1303, %r1302, 255;
shr.u32 %r1304, %r2426, %r1303;
and.b32 %r1305, %r1304, 1;
bfi.b32 %r2431, %r2431, %r1305, 1, 31;
setp.eq.s32 %p155, %r148, 2;
@%p155 bra $L__BB26_106;
and.b16 %rs753, %rs1139, 255;
setp.ne.s16 %p156, %rs753, 0;
@%p156 bra $L__BB26_105;
setp.eq.s32 %p157, %r2405, 0;
mov.u16 %rs1133, 255;
@%p157 bra $L__BB26_104;
cvt.u64.u32 %rd214, %r2404;
add.s64 %rd215, %rd214, %rd3;
add.s64 %rd216, %rd1, %rd215;
ld.global.u8 %rs1133, [%rd216];
$L__BB26_104:
setp.ne.s32 %p159, %r2405, 0;
selp.u32 %r1306, 1, 0, %p159;
add.s32 %r2404, %r2404, %r1306;
add.s32 %r1307, %r2405, -1;
selp.b32 %r2405, 0, %r1307, %p157;
setp.eq.s32 %p160, %r2405, 0;
or.b16 %rs755, %rs1133, 15;
selp.b16 %rs1105, %rs755, %rs1133, %p160;
and.b16 %rs756, %rs1105, 255;
mov.u16 %rs757, 8;
sub.s16 %rs1139, %rs757, %rs1104;
setp.eq.s16 %p161, %rs756, 255;
selp.u16 %rs1104, 1, 0, %p161;
cvt.u32.u16 %r1308, %rs1105;
and.b32 %r2426, %r1308, 255;
$L__BB26_105:
add.s16 %rs1139, %rs1139, -1;
cvt.u32.u16 %r1309, %rs1139;
and.b32 %r1310, %r1309, 255;
shr.u32 %r1311, %r2426, %r1310;
and.b32 %r1312, %r1311, 1;
bfi.b32 %r2431, %r2431, %r1312, 1, 31;
$L__BB26_106:
shl.b32 %r1313, %r2431, 1;
or.b32 %r2435, %r1313, 1;
add.s32 %r1314, %r2495, -1;
setp.eq.s32 %p162, %r2495, 0;
selp.b32 %r2434, 0, %r1314, %p162;
$L__BB26_107:
mul.lo.s32 %r1315, %r2496, 7;
cvt.u64.u32 %rd217, %r2435;
shl.b64 %rd218, %rd217, %r1315;
or.b64 %rd603, %rd218, %rd603;
setp.ne.s32 %p163, %r2495, 12;
setp.ne.s32 %p164, %r144, 0;
or.pred %p165, %p163, %p164;
add.s32 %r2496, %r2496, 1;
setp.lt.u32 %p166, %r2496, 8;
or.pred %p167, %p166, %p165;
mov.u32 %r2495, %r2434;
@%p167 bra $L__BB26_63;
$L__BB26_108:
cvt.u32.u64 %r1316, %rd603;
and.b32 %r2492, %r1316, 127;
shr.u64 %rd603, %rd603, 7;
add.s32 %r2496, %r2496, -1;
$L__BB26_109:
mul.wide.u32 %rd219, %r2379, 2;
add.s64 %rd25, %rd13, %rd219;
st.local.u16 [%rd25], %r2445;
shl.b32 %r1317, %r2445, 3;
and.b32 %r1318, %r1317, 128;
shl.b32 %r1319, %r2445, 2;
and.b32 %r1320, %r1319, 896;
or.b32 %r1321, %r1318, %r1320;
and.b32 %r1322, %r2445, 7;
shr.u64 %rd26, %rd612, %r1322;
sub.s32 %r230, %r2575, %r1322;
cvt.u32.u64 %r1323, %rd26;
and.b32 %r1324, %r1323, 127;
or.b32 %r1325, %r1324, %r1321;
mul.wide.u32 %rd220, %r1325, 2;
add.s64 %rd221, %rd12, %rd220;
ld.global.u16 %r2497, [%rd221];
setp.ne.s32 %p168, %r1321, 0;
add.s32 %r232, %r2379, 2;
setp.ge.u32 %p169, %r232, %r1141;
or.pred %p170, %p169, %p168;
@%p170 bra $L__BB26_159;
add.s32 %r233, %r2492, -2;
setp.eq.s32 %p171, %r233, -1;
selp.b32 %r2497, %r2497, 0, %p171;
setp.gt.s32 %p172, %r2492, 1;
mov.u32 %r2492, %r233;
@%p172 bra $L__BB26_159;
setp.ne.s32 %p173, %r2496, 0;
@%p173 bra $L__BB26_158;
mov.u32 %r2496, 0;
$L__BB26_113:
setp.gt.u32 %p174, %r2496, 7;
@%p174 bra $L__BB26_158;
cvt.u64.u32 %rd28, %r2495;
mul.wide.u32 %rd222, %r2495, 4;
add.s64 %rd224, %rd150, %rd222;
ld.global.nc.u32 %r239, [%rd224];
and.b16 %rs758, %rs1139, 255;
setp.ne.s16 %p175, %rs758, 0;
@%p175 bra $L__BB26_118;
setp.eq.s32 %p176, %r2405, 0;
mov.u16 %rs1152, 255;
@%p176 bra $L__BB26_117;
cvt.u64.u32 %rd225, %r2404;
add.s64 %rd226, %rd225, %rd3;
add.s64 %rd227, %rd1, %rd226;
ld.global.u8 %rs1152, [%rd227];
$L__BB26_117:
setp.ne.s32 %p178, %r2405, 0;
selp.u32 %r1327, 1, 0, %p178;
add.s32 %r2404, %r2404, %r1327;
add.s32 %r1328, %r2405, -1;
selp.b32 %r2405, 0, %r1328, %p176;
setp.eq.s32 %p179, %r2405, 0;
or.b16 %rs760, %rs1152, 15;
selp.b16 %rs1105, %rs760, %rs1152, %p179;
and.b16 %rs761, %rs1105, 255;
mov.u16 %rs762, 8;
sub.s16 %rs1139, %rs762, %rs1104;
setp.eq.s16 %p180, %rs761, 255;
selp.u16 %rs1104, 1, 0, %p180;
$L__BB26_118:
add.s16 %rs1139, %rs1139, -1;
cvt.u32.u16 %r1329, %rs1139;
and.b32 %r1330, %r1329, 255;
mov.u32 %r1331, 1;
shl.b32 %r1332, %r1331, %r1330;
cvt.u32.u16 %r1333, %rs1105;
and.b32 %r1334, %r1332, %r1333;
and.b32 %r244, %r1334, 255;
setp.eq.s32 %p181, %r244, 0;
@%p181 bra $L__BB26_120;
add.s32 %r1335, %r2495, 1;
min.u32 %r2486, %r1335, 12;
mov.u32 %r1336, -1;
shl.b32 %r1337, %r1336, %r239;
shl.b32 %r1338, %r1337, 1;
xor.b32 %r2487, %r1338, -2;
bra.uni $L__BB26_157;
$L__BB26_120:
add.s64 %rd228, %rd28, -3;
setp.gt.u64 %p182, %rd228, 9;
mov.u32 %r2483, 0;
@%p182 bra $L__BB26_156;
max.u32 %r247, %r239, 1;
add.s32 %r1342, %r247, -1;
and.b32 %r248, %r247, 3;
setp.lt.u32 %p183, %r1342, 3;
mov.u32 %r2483, 0;
@%p183 bra $L__BB26_140;
sub.s32 %r2455, %r247, %r248;
mov.u32 %r2483, 0;
$L__BB26_123:
and.b16 %rs764, %rs1139, 255;
setp.ne.s16 %p184, %rs764, 0;
@%p184 bra $L__BB26_127;
setp.eq.s32 %p185, %r2405, 0;
mov.u16 %rs1159, 255;
@%p185 bra $L__BB26_126;
cvt.u64.u32 %rd229, %r2404;
add.s64 %rd230, %rd229, %rd3;
add.s64 %rd231, %rd1, %rd230;
ld.global.u8 %rs1159, [%rd231];
$L__BB26_126:
setp.ne.s32 %p187, %r2405, 0;
selp.u32 %r1344, 1, 0, %p187;
add.s32 %r2404, %r2404, %r1344;
add.s32 %r1345, %r2405, -1;
selp.b32 %r2405, 0, %r1345, %p185;
setp.eq.s32 %p188, %r2405, 0;
or.b16 %rs766, %rs1159, 15;
selp.b16 %rs1105, %rs766, %rs1159, %p188;
and.b16 %rs767, %rs1105, 255;
mov.u16 %rs768, 8;
sub.s16 %rs1139, %rs768, %rs1104;
setp.eq.s16 %p189, %rs767, 255;
selp.u16 %rs1104, 1, 0, %p189;
$L__BB26_127:
add.s16 %rs1166, %rs1139, -1;
and.b16 %rs769, %rs1166, 255;
cvt.u32.u16 %r1346, %rs1166;
and.b32 %r1347, %r1346, 255;
cvt.u32.u16 %r1348, %rs1105;
and.b32 %r2461, %r1348, 255;
shr.u32 %r1349, %r2461, %r1347;
and.b32 %r1350, %r1349, 1;
bfi.b32 %r259, %r2483, %r1350, 1, 31;
setp.ne.s16 %p190, %rs769, 0;
@%p190 bra $L__BB26_131;
setp.eq.s32 %p191, %r2405, 0;
mov.u16 %rs1163, 255;
@%p191 bra $L__BB26_130;
cvt.u64.u32 %rd232, %r2404;
add.s64 %rd233, %rd232, %rd3;
add.s64 %rd234, %rd1, %rd233;
ld.global.u8 %rs1163, [%rd234];
$L__BB26_130:
setp.ne.s32 %p193, %r2405, 0;
selp.u32 %r1351, 1, 0, %p193;
add.s32 %r2404, %r2404, %r1351;
add.s32 %r1352, %r2405, -1;
selp.b32 %r2405, 0, %r1352, %p191;
setp.eq.s32 %p194, %r2405, 0;
or.b16 %rs771, %rs1163, 15;
selp.b16 %rs1105, %rs771, %rs1163, %p194;
and.b16 %rs772, %rs1105, 255;
mov.u16 %rs773, 8;
sub.s16 %rs1166, %rs773, %rs1104;
setp.eq.s16 %p195, %rs772, 255;
selp.u16 %rs1104, 1, 0, %p195;
cvt.u32.u16 %r1353, %rs1105;
and.b32 %r2461, %r1353, 255;
$L__BB26_131:
add.s16 %rs1170, %rs1166, -1;
and.b16 %rs774, %rs1170, 255;
cvt.u32.u16 %r1354, %rs1170;
and.b32 %r1355, %r1354, 255;
shr.u32 %r1356, %r2461, %r1355;
and.b32 %r1357, %r1356, 1;
bfi.b32 %r266, %r259, %r1357, 1, 31;
setp.ne.s16 %p196, %rs774, 0;
@%p196 bra $L__BB26_135;
setp.eq.s32 %p197, %r2405, 0;
mov.u16 %rs1167, 255;
@%p197 bra $L__BB26_134;
cvt.u64.u32 %rd235, %r2404;
add.s64 %rd236, %rd235, %rd3;
add.s64 %rd237, %rd1, %rd236;
ld.global.u8 %rs1167, [%rd237];
$L__BB26_134:
setp.ne.s32 %p199, %r2405, 0;
selp.u32 %r1358, 1, 0, %p199;
add.s32 %r2404, %r2404, %r1358;
add.s32 %r1359, %r2405, -1;
selp.b32 %r2405, 0, %r1359, %p197;
setp.eq.s32 %p200, %r2405, 0;
or.b16 %rs776, %rs1167, 15;
selp.b16 %rs1105, %rs776, %rs1167, %p200;
and.b16 %rs777, %rs1105, 255;
mov.u16 %rs778, 8;
sub.s16 %rs1170, %rs778, %rs1104;
setp.eq.s16 %p201, %rs777, 255;
selp.u16 %rs1104, 1, 0, %p201;
cvt.u32.u16 %r1360, %rs1105;
and.b32 %r2461, %r1360, 255;
$L__BB26_135:
add.s16 %rs1174, %rs1170, -1;
and.b16 %rs779, %rs1174, 255;
cvt.u32.u16 %r1361, %rs1174;
and.b32 %r1362, %r1361, 255;
shr.u32 %r1363, %r2461, %r1362;
and.b32 %r1364, %r1363, 1;
bfi.b32 %r273, %r266, %r1364, 1, 31;
setp.ne.s16 %p202, %rs779, 0;
@%p202 bra $L__BB26_139;
setp.eq.s32 %p203, %r2405, 0;
mov.u16 %rs1171, 255;
@%p203 bra $L__BB26_138;
cvt.u64.u32 %rd238, %r2404;
add.s64 %rd239, %rd238, %rd3;
add.s64 %rd240, %rd1, %rd239;
ld.global.u8 %rs1171, [%rd240];
$L__BB26_138:
setp.ne.s32 %p205, %r2405, 0;
selp.u32 %r1365, 1, 0, %p205;
add.s32 %r2404, %r2404, %r1365;
add.s32 %r1366, %r2405, -1;
selp.b32 %r2405, 0, %r1366, %p203;
setp.eq.s32 %p206, %r2405, 0;
or.b16 %rs781, %rs1171, 15;
selp.b16 %rs1105, %rs781, %rs1171, %p206;
and.b16 %rs782, %rs1105, 255;
mov.u16 %rs783, 8;
sub.s16 %rs1174, %rs783, %rs1104;
setp.eq.s16 %p207, %rs782, 255;
selp.u16 %rs1104, 1, 0, %p207;
cvt.u32.u16 %r1367, %rs1105;
and.b32 %r2461, %r1367, 255;
$L__BB26_139:
add.s16 %rs1139, %rs1174, -1;
cvt.u32.u16 %r1368, %rs1139;
and.b32 %r1369, %r1368, 255;
shr.u32 %r1370, %r2461, %r1369;
and.b32 %r1371, %r1370, 1;
bfi.b32 %r2483, %r273, %r1371, 1, 31;
add.s32 %r2455, %r2455, -4;
setp.ne.s32 %p208, %r2455, 0;
@%p208 bra $L__BB26_123;
$L__BB26_140:
setp.eq.s32 %p209, %r248, 0;
@%p209 bra $L__BB26_156;
and.b16 %rs784, %rs1139, 255;
setp.ne.s16 %p210, %rs784, 0;
@%p210 bra $L__BB26_145;
setp.eq.s32 %p211, %r2405, 0;
mov.u16 %rs1181, 255;
@%p211 bra $L__BB26_144;
cvt.u64.u32 %rd241, %r2404;
add.s64 %rd242, %rd241, %rd3;
add.s64 %rd243, %rd1, %rd242;
ld.global.u8 %rs1181, [%rd243];
$L__BB26_144:
setp.ne.s32 %p213, %r2405, 0;
selp.u32 %r1372, 1, 0, %p213;
add.s32 %r2404, %r2404, %r1372;
add.s32 %r1373, %r2405, -1;
selp.b32 %r2405, 0, %r1373, %p211;
setp.eq.s32 %p214, %r2405, 0;
or.b16 %rs786, %rs1181, 15;
selp.b16 %rs1105, %rs786, %rs1181, %p214;
and.b16 %rs787, %rs1105, 255;
mov.u16 %rs788, 8;
sub.s16 %rs1139, %rs788, %rs1104;
setp.eq.s16 %p215, %rs787, 255;
selp.u16 %rs1104, 1, 0, %p215;
$L__BB26_145:
add.s16 %rs1139, %rs1139, -1;
cvt.u32.u16 %r1374, %rs1139;
and.b32 %r1375, %r1374, 255;
cvt.u32.u16 %r1376, %rs1105;
and.b32 %r2478, %r1376, 255;
shr.u32 %r1377, %r2478, %r1375;
and.b32 %r1378, %r1377, 1;
bfi.b32 %r2483, %r2483, %r1378, 1, 31;
setp.eq.s32 %p216, %r248, 1;
@%p216 bra $L__BB26_156;
and.b16 %rs789, %rs1139, 255;
setp.ne.s16 %p217, %rs789, 0;
@%p217 bra $L__BB26_150;
setp.eq.s32 %p218, %r2405, 0;
mov.u16 %rs1185, 255;
@%p218 bra $L__BB26_149;
cvt.u64.u32 %rd244, %r2404;
add.s64 %rd245, %rd244, %rd3;
add.s64 %rd246, %rd1, %rd245;
ld.global.u8 %rs1185, [%rd246];
$L__BB26_149:
setp.ne.s32 %p220, %r2405, 0;
selp.u32 %r1379, 1, 0, %p220;
add.s32 %r2404, %r2404, %r1379;
add.s32 %r1380, %r2405, -1;
selp.b32 %r2405, 0, %r1380, %p218;
setp.eq.s32 %p221, %r2405, 0;
or.b16 %rs791, %rs1185, 15;
selp.b16 %rs1105, %rs791, %rs1185, %p221;
and.b16 %rs792, %rs1105, 255;
mov.u16 %rs793, 8;
sub.s16 %rs1139, %rs793, %rs1104;
setp.eq.s16 %p222, %rs792, 255;
selp.u16 %rs1104, 1, 0, %p222;
cvt.u32.u16 %r1381, %rs1105;
and.b32 %r2478, %r1381, 255;
$L__BB26_150:
add.s16 %rs1139, %rs1139, -1;
cvt.u32.u16 %r1382, %rs1139;
and.b32 %r1383, %r1382, 255;
shr.u32 %r1384, %r2478, %r1383;
and.b32 %r1385, %r1384, 1;
bfi.b32 %r2483, %r2483, %r1385, 1, 31;
setp.eq.s32 %p223, %r248, 2;
@%p223 bra $L__BB26_156;
and.b16 %rs794, %rs1139, 255;
setp.ne.s16 %p224, %rs794, 0;
@%p224 bra $L__BB26_155;
setp.eq.s32 %p225, %r2405, 0;
mov.u16 %rs1189, 255;
@%p225 bra $L__BB26_154;
cvt.u64.u32 %rd247, %r2404;
add.s64 %rd248, %rd247, %rd3;
add.s64 %rd249, %rd1, %rd248;
ld.global.u8 %rs1189, [%rd249];
$L__BB26_154:
setp.ne.s32 %p227, %r2405, 0;
selp.u32 %r1386, 1, 0, %p227;
add.s32 %r2404, %r2404, %r1386;
add.s32 %r1387, %r2405, -1;
selp.b32 %r2405, 0, %r1387, %p225;
setp.eq.s32 %p228, %r2405, 0;
or.b16 %rs796, %rs1189, 15;
selp.b16 %rs1105, %rs796, %rs1189, %p228;
and.b16 %rs797, %rs1105, 255;
mov.u16 %rs798, 8;
sub.s16 %rs1139, %rs798, %rs1104;
setp.eq.s16 %p229, %rs797, 255;
selp.u16 %rs1104, 1, 0, %p229;
cvt.u32.u16 %r1388, %rs1105;
and.b32 %r2478, %r1388, 255;
$L__BB26_155:
add.s16 %rs1139, %rs1139, -1;
cvt.u32.u16 %r1389, %rs1139;
and.b32 %r1390, %r1389, 255;
shr.u32 %r1391, %r2478, %r1390;
and.b32 %r1392, %r1391, 1;
bfi.b32 %r2483, %r2483, %r1392, 1, 31;
$L__BB26_156:
shl.b32 %r1393, %r2483, 1;
or.b32 %r2487, %r1393, 1;
add.s32 %r1394, %r2495, -1;
setp.eq.s32 %p230, %r2495, 0;
selp.b32 %r2486, 0, %r1394, %p230;
$L__BB26_157:
mul.lo.s32 %r1395, %r2496, 7;
cvt.u64.u32 %rd250, %r2487;
shl.b64 %rd251, %rd250, %r1395;
or.b64 %rd603, %rd251, %rd603;
setp.ne.s32 %p231, %r2495, 12;
setp.ne.s32 %p232, %r244, 0;
or.pred %p233, %p231, %p232;
add.s32 %r2496, %r2496, 1;
setp.lt.u32 %p234, %r2496, 8;
or.pred %p235, %p234, %p233;
mov.u32 %r2495, %r2486;
@%p235 bra $L__BB26_113;
$L__BB26_158:
cvt.u32.u64 %r1396, %rd603;
and.b32 %r2492, %r1396, 127;
shr.u64 %rd603, %rd603, 7;
add.s32 %r2496, %r2496, -1;
$L__BB26_159:
setp.lt.u32 %p236, %r232, %r1141;
selp.b32 %r330, %r2497, 0, %p236;
st.local.u16 [%rd25+4], %r330;
and.b32 %r1398, %r1317, 64;
shl.b32 %r1399, %r330, 4;
and.b32 %r1400, %r1399, 128;
or.b32 %r2549, %r1400, %r1398;
setp.ne.s32 %p237, %r2549, 192;
@%p237 bra $L__BB26_209;
add.s32 %r332, %r2492, -2;
setp.eq.s32 %p238, %r332, -1;
selp.b32 %r2549, 256, 192, %p238;
setp.gt.s32 %p239, %r2492, 1;
mov.u32 %r2492, %r332;
@%p239 bra $L__BB26_209;
setp.ne.s32 %p240, %r2496, 0;
@%p240 bra $L__BB26_208;
mov.u32 %r2496, 0;
$L__BB26_163:
setp.gt.u32 %p241, %r2496, 7;
@%p241 bra $L__BB26_208;
cvt.u64.u32 %rd34, %r2495;
mul.wide.u32 %rd252, %r2495, 4;
add.s64 %rd254, %rd150, %rd252;
ld.global.nc.u32 %r338, [%rd254];
and.b16 %rs799, %rs1139, 255;
setp.ne.s16 %p242, %rs799, 0;
@%p242 bra $L__BB26_168;
setp.eq.s32 %p243, %r2405, 0;
mov.u16 %rs1208, 255;
@%p243 bra $L__BB26_167;
cvt.u64.u32 %rd255, %r2404;
add.s64 %rd256, %rd255, %rd3;
add.s64 %rd257, %rd1, %rd256;
ld.global.u8 %rs1208, [%rd257];
$L__BB26_167:
setp.ne.s32 %p245, %r2405, 0;
selp.u32 %r1402, 1, 0, %p245;
add.s32 %r2404, %r2404, %r1402;
add.s32 %r1403, %r2405, -1;
selp.b32 %r2405, 0, %r1403, %p243;
setp.eq.s32 %p246, %r2405, 0;
or.b16 %rs801, %rs1208, 15;
selp.b16 %rs1105, %rs801, %rs1208, %p246;
and.b16 %rs802, %rs1105, 255;
mov.u16 %rs803, 8;
sub.s16 %rs1139, %rs803, %rs1104;
setp.eq.s16 %p247, %rs802, 255;
selp.u16 %rs1104, 1, 0, %p247;
$L__BB26_168:
add.s16 %rs1139, %rs1139, -1;
cvt.u32.u16 %r1404, %rs1139;
and.b32 %r1405, %r1404, 255;
mov.u32 %r1406, 1;
shl.b32 %r1407, %r1406, %r1405;
cvt.u32.u16 %r1408, %rs1105;
and.b32 %r1409, %r1407, %r1408;
and.b32 %r343, %r1409, 255;
setp.eq.s32 %p248, %r343, 0;
@%p248 bra $L__BB26_170;
add.s32 %r1410, %r2495, 1;
min.u32 %r2538, %r1410, 12;
mov.u32 %r1411, -1;
shl.b32 %r1412, %r1411, %r338;
shl.b32 %r1413, %r1412, 1;
xor.b32 %r2539, %r1413, -2;
bra.uni $L__BB26_207;
$L__BB26_170:
add.s64 %rd258, %rd34, -3;
setp.gt.u64 %p249, %rd258, 9;
mov.u32 %r2535, 0;
@%p249 bra $L__BB26_206;
max.u32 %r346, %r338, 1;
add.s32 %r1417, %r346, -1;
and.b32 %r347, %r346, 3;
setp.lt.u32 %p250, %r1417, 3;
mov.u32 %r2535, 0;
@%p250 bra $L__BB26_190;
sub.s32 %r2507, %r346, %r347;
mov.u32 %r2535, 0;
$L__BB26_173:
and.b16 %rs805, %rs1139, 255;
setp.ne.s16 %p251, %rs805, 0;
@%p251 bra $L__BB26_177;
setp.eq.s32 %p252, %r2405, 0;
mov.u16 %rs1215, 255;
@%p252 bra $L__BB26_176;
cvt.u64.u32 %rd259, %r2404;
add.s64 %rd260, %rd259, %rd3;
add.s64 %rd261, %rd1, %rd260;
ld.global.u8 %rs1215, [%rd261];
$L__BB26_176:
setp.ne.s32 %p254, %r2405, 0;
selp.u32 %r1419, 1, 0, %p254;
add.s32 %r2404, %r2404, %r1419;
add.s32 %r1420, %r2405, -1;
selp.b32 %r2405, 0, %r1420, %p252;
setp.eq.s32 %p255, %r2405, 0;
or.b16 %rs807, %rs1215, 15;
selp.b16 %rs1105, %rs807, %rs1215, %p255;
and.b16 %rs808, %rs1105, 255;
mov.u16 %rs809, 8;
sub.s16 %rs1139, %rs809, %rs1104;
setp.eq.s16 %p256, %rs808, 255;
selp.u16 %rs1104, 1, 0, %p256;
$L__BB26_177:
add.s16 %rs1222, %rs1139, -1;
and.b16 %rs810, %rs1222, 255;
cvt.u32.u16 %r1421, %rs1222;
and.b32 %r1422, %r1421, 255;
cvt.u32.u16 %r1423, %rs1105;
and.b32 %r2513, %r1423, 255;
shr.u32 %r1424, %r2513, %r1422;
and.b32 %r1425, %r1424, 1;
bfi.b32 %r358, %r2535, %r1425, 1, 31;
setp.ne.s16 %p257, %rs810, 0;
@%p257 bra $L__BB26_181;
setp.eq.s32 %p258, %r2405, 0;
mov.u16 %rs1219, 255;
@%p258 bra $L__BB26_180;
cvt.u64.u32 %rd262, %r2404;
add.s64 %rd263, %rd262, %rd3;
add.s64 %rd264, %rd1, %rd263;
ld.global.u8 %rs1219, [%rd264];
$L__BB26_180:
setp.ne.s32 %p260, %r2405, 0;
selp.u32 %r1426, 1, 0, %p260;
add.s32 %r2404, %r2404, %r1426;
add.s32 %r1427, %r2405, -1;
selp.b32 %r2405, 0, %r1427, %p258;
setp.eq.s32 %p261, %r2405, 0;
or.b16 %rs812, %rs1219, 15;
selp.b16 %rs1105, %rs812, %rs1219, %p261;
and.b16 %rs813, %rs1105, 255;
mov.u16 %rs814, 8;
sub.s16 %rs1222, %rs814, %rs1104;
setp.eq.s16 %p262, %rs813, 255;
selp.u16 %rs1104, 1, 0, %p262;
cvt.u32.u16 %r1428, %rs1105;
and.b32 %r2513, %r1428, 255;
$L__BB26_181:
add.s16 %rs1226, %rs1222, -1;
and.b16 %rs815, %rs1226, 255;
cvt.u32.u16 %r1429, %rs1226;
and.b32 %r1430, %r1429, 255;
shr.u32 %r1431, %r2513, %r1430;
and.b32 %r1432, %r1431, 1;
bfi.b32 %r365, %r358, %r1432, 1, 31;
setp.ne.s16 %p263, %rs815, 0;
@%p263 bra $L__BB26_185;
setp.eq.s32 %p264, %r2405, 0;
mov.u16 %rs1223, 255;
@%p264 bra $L__BB26_184;
cvt.u64.u32 %rd265, %r2404;
add.s64 %rd266, %rd265, %rd3;
add.s64 %rd267, %rd1, %rd266;
ld.global.u8 %rs1223, [%rd267];
$L__BB26_184:
setp.ne.s32 %p266, %r2405, 0;
selp.u32 %r1433, 1, 0, %p266;
add.s32 %r2404, %r2404, %r1433;
add.s32 %r1434, %r2405, -1;
selp.b32 %r2405, 0, %r1434, %p264;
setp.eq.s32 %p267, %r2405, 0;
or.b16 %rs817, %rs1223, 15;
selp.b16 %rs1105, %rs817, %rs1223, %p267;
and.b16 %rs818, %rs1105, 255;
mov.u16 %rs819, 8;
sub.s16 %rs1226, %rs819, %rs1104;
setp.eq.s16 %p268, %rs818, 255;
selp.u16 %rs1104, 1, 0, %p268;
cvt.u32.u16 %r1435, %rs1105;
and.b32 %r2513, %r1435, 255;
$L__BB26_185:
add.s16 %rs1230, %rs1226, -1;
and.b16 %rs820, %rs1230, 255;
cvt.u32.u16 %r1436, %rs1230;
and.b32 %r1437, %r1436, 255;
shr.u32 %r1438, %r2513, %r1437;
and.b32 %r1439, %r1438, 1;
bfi.b32 %r372, %r365, %r1439, 1, 31;
setp.ne.s16 %p269, %rs820, 0;
@%p269 bra $L__BB26_189;
setp.eq.s32 %p270, %r2405, 0;
mov.u16 %rs1227, 255;
@%p270 bra $L__BB26_188;
cvt.u64.u32 %rd268, %r2404;
add.s64 %rd269, %rd268, %rd3;
add.s64 %rd270, %rd1, %rd269;
ld.global.u8 %rs1227, [%rd270];
$L__BB26_188:
setp.ne.s32 %p272, %r2405, 0;
selp.u32 %r1440, 1, 0, %p272;
add.s32 %r2404, %r2404, %r1440;
add.s32 %r1441, %r2405, -1;
selp.b32 %r2405, 0, %r1441, %p270;
setp.eq.s32 %p273, %r2405, 0;
or.b16 %rs822, %rs1227, 15;
selp.b16 %rs1105, %rs822, %rs1227, %p273;
and.b16 %rs823, %rs1105, 255;
mov.u16 %rs824, 8;
sub.s16 %rs1230, %rs824, %rs1104;
setp.eq.s16 %p274, %rs823, 255;
selp.u16 %rs1104, 1, 0, %p274;
cvt.u32.u16 %r1442, %rs1105;
and.b32 %r2513, %r1442, 255;
$L__BB26_189:
add.s16 %rs1139, %rs1230, -1;
cvt.u32.u16 %r1443, %rs1139;
and.b32 %r1444, %r1443, 255;
shr.u32 %r1445, %r2513, %r1444;
and.b32 %r1446, %r1445, 1;
bfi.b32 %r2535, %r372, %r1446, 1, 31;
add.s32 %r2507, %r2507, -4;
setp.ne.s32 %p275, %r2507, 0;
@%p275 bra $L__BB26_173;
$L__BB26_190:
setp.eq.s32 %p276, %r347, 0;
@%p276 bra $L__BB26_206;
and.b16 %rs825, %rs1139, 255;
setp.ne.s16 %p277, %rs825, 0;
@%p277 bra $L__BB26_195;
setp.eq.s32 %p278, %r2405, 0;
mov.u16 %rs1237, 255;
@%p278 bra $L__BB26_194;
cvt.u64.u32 %rd271, %r2404;
add.s64 %rd272, %rd271, %rd3;
add.s64 %rd273, %rd1, %rd272;
ld.global.u8 %rs1237, [%rd273];
$L__BB26_194:
setp.ne.s32 %p280, %r2405, 0;
selp.u32 %r1447, 1, 0, %p280;
add.s32 %r2404, %r2404, %r1447;
add.s32 %r1448, %r2405, -1;
selp.b32 %r2405, 0, %r1448, %p278;
setp.eq.s32 %p281, %r2405, 0;
or.b16 %rs827, %rs1237, 15;
selp.b16 %rs1105, %rs827, %rs1237, %p281;
and.b16 %rs828, %rs1105, 255;
mov.u16 %rs829, 8;
sub.s16 %rs1139, %rs829, %rs1104;
setp.eq.s16 %p282, %rs828, 255;
selp.u16 %rs1104, 1, 0, %p282;
$L__BB26_195:
add.s16 %rs1139, %rs1139, -1;
cvt.u32.u16 %r1449, %rs1139;
and.b32 %r1450, %r1449, 255;
cvt.u32.u16 %r1451, %rs1105;
and.b32 %r2530, %r1451, 255;
shr.u32 %r1452, %r2530, %r1450;
and.b32 %r1453, %r1452, 1;
bfi.b32 %r2535, %r2535, %r1453, 1, 31;
setp.eq.s32 %p283, %r347, 1;
@%p283 bra $L__BB26_206;
and.b16 %rs830, %rs1139, 255;
setp.ne.s16 %p284, %rs830, 0;
@%p284 bra $L__BB26_200;
setp.eq.s32 %p285, %r2405, 0;
mov.u16 %rs1241, 255;
@%p285 bra $L__BB26_199;
cvt.u64.u32 %rd274, %r2404;
add.s64 %rd275, %rd274, %rd3;
add.s64 %rd276, %rd1, %rd275;
ld.global.u8 %rs1241, [%rd276];
$L__BB26_199:
setp.ne.s32 %p287, %r2405, 0;
selp.u32 %r1454, 1, 0, %p287;
add.s32 %r2404, %r2404, %r1454;
add.s32 %r1455, %r2405, -1;
selp.b32 %r2405, 0, %r1455, %p285;
setp.eq.s32 %p288, %r2405, 0;
or.b16 %rs832, %rs1241, 15;
selp.b16 %rs1105, %rs832, %rs1241, %p288;
and.b16 %rs833, %rs1105, 255;
mov.u16 %rs834, 8;
sub.s16 %rs1139, %rs834, %rs1104;
setp.eq.s16 %p289, %rs833, 255;
selp.u16 %rs1104, 1, 0, %p289;
cvt.u32.u16 %r1456, %rs1105;
and.b32 %r2530, %r1456, 255;
$L__BB26_200:
add.s16 %rs1139, %rs1139, -1;
cvt.u32.u16 %r1457, %rs1139;
and.b32 %r1458, %r1457, 255;
shr.u32 %r1459, %r2530, %r1458;
and.b32 %r1460, %r1459, 1;
bfi.b32 %r2535, %r2535, %r1460, 1, 31;
setp.eq.s32 %p290, %r347, 2;
@%p290 bra $L__BB26_206;
and.b16 %rs835, %rs1139, 255;
setp.ne.s16 %p291, %rs835, 0;
@%p291 bra $L__BB26_205;
setp.eq.s32 %p292, %r2405, 0;
mov.u16 %rs1245, 255;
@%p292 bra $L__BB26_204;
cvt.u64.u32 %rd277, %r2404;
add.s64 %rd278, %rd277, %rd3;
add.s64 %rd279, %rd1, %rd278;
ld.global.u8 %rs1245, [%rd279];
$L__BB26_204:
setp.ne.s32 %p294, %r2405, 0;
selp.u32 %r1461, 1, 0, %p294;
add.s32 %r2404, %r2404, %r1461;
add.s32 %r1462, %r2405, -1;
selp.b32 %r2405, 0, %r1462, %p292;
setp.eq.s32 %p295, %r2405, 0;
or.b16 %rs837, %rs1245, 15;
selp.b16 %rs1105, %rs837, %rs1245, %p295;
and.b16 %rs838, %rs1105, 255;
mov.u16 %rs839, 8;
sub.s16 %rs1139, %rs839, %rs1104;
setp.eq.s16 %p296, %rs838, 255;
selp.u16 %rs1104, 1, 0, %p296;
cvt.u32.u16 %r1463, %rs1105;
and.b32 %r2530, %r1463, 255;
$L__BB26_205:
add.s16 %rs1139, %rs1139, -1;
cvt.u32.u16 %r1464, %rs1139;
and.b32 %r1465, %r1464, 255;
shr.u32 %r1466, %r2530, %r1465;
and.b32 %r1467, %r1466, 1;
bfi.b32 %r2535, %r2535, %r1467, 1, 31;
$L__BB26_206:
shl.b32 %r1468, %r2535, 1;
or.b32 %r2539, %r1468, 1;
add.s32 %r1469, %r2495, -1;
setp.eq.s32 %p297, %r2495, 0;
selp.b32 %r2538, 0, %r1469, %p297;
$L__BB26_207:
mul.lo.s32 %r1470, %r2496, 7;
cvt.u64.u32 %rd280, %r2539;
shl.b64 %rd281, %rd280, %r1470;
or.b64 %rd603, %rd281, %rd603;
setp.ne.s32 %p298, %r2495, 12;
setp.ne.s32 %p299, %r343, 0;
or.pred %p300, %p298, %p299;
add.s32 %r2496, %r2496, 1;
setp.lt.u32 %p301, %r2496, 8;
or.pred %p302, %p301, %p300;
mov.u32 %r2495, %r2538;
@%p302 bra $L__BB26_163;
$L__BB26_208:
cvt.u32.u64 %r1471, %rd603;
and.b32 %r2492, %r1471, 127;
shr.u64 %rd603, %rd603, 7;
add.s32 %r2496, %r2496, -1;
$L__BB26_209:
and.b32 %r1472, %r330, 7;
shr.u64 %rd282, %rd26, %r1472;
cvt.u32.u64 %r1473, %rd282;
and.b32 %r1474, %r1473, 63;
add.s32 %r1475, %r2549, %r1474;
mul.wide.u32 %rd283, %r1475, 2;
add.s64 %rd284, %rd11, %rd283;
ld.global.u16 %r1476, [%rd284];
and.b32 %r1477, %r1476, 7;
shr.u64 %rd285, %rd282, %r1477;
sub.s32 %r1478, %r230, %r1472;
sub.s32 %r1479, %r1478, %r1477;
cvt.u32.u64 %r1480, %rd285;
shr.u32 %r1481, %r1476, 3;
and.b32 %r1482, %r1481, 15;
mov.u32 %r1483, -1;
shl.b32 %r1484, %r1483, %r1482;
not.b32 %r1485, %r1484;
and.b32 %r1486, %r1480, %r1485;
shr.u64 %rd612, %rd285, %r1482;
sub.s32 %r2575, %r1479, %r1482;
shr.u32 %r1487, %r1476, 7;
and.b32 %r1488, %r1487, 7;
shr.u32 %r1489, %r1476, 10;
and.b32 %r1490, %r1489, 7;
mov.u32 %r1491, 255;
shl.b32 %r1492, %r1491, %r1488;
not.b32 %r1493, %r1492;
and.b32 %r1494, %r1486, %r1493;
add.s32 %r1495, %r1490, %r1494;
add.s32 %r1496, %r1495, 1;
st.local.u16 [%rd25+2], %r1496;
shr.u32 %r1497, %r1476, 13;
shr.u32 %r1498, %r1486, %r1488;
add.s32 %r1499, %r1497, %r1498;
add.s32 %r1500, %r1499, 1;
st.local.u16 [%rd25+6], %r1500;
add.s32 %r2379, %r2379, 4;
setp.lt.u32 %p303, %r2379, %r1141;
shl.b32 %r1501, %r330, 2;
and.b32 %r1502, %r1501, 896;
shl.b32 %r1503, %r330, 3;
and.b32 %r1504, %r1503, 128;
or.b32 %r2378, %r1504, %r1502;
@%p303 bra $L__BB26_55;
mul.wide.u32 %rd288, %r2379, 2;
add.s64 %rd289, %rd13, %rd288;
mov.u16 %rs840, 0;
st.local.v2.u16 [%rd289], {%rs840, %rs840};
setp.lt.u32 %p304, %r1144, 3;
@%p304 bra $L__BB26_319;
ld.param.u64 %rd589, [ j2k_htj2k_decode_codeblocks_multi_param_3];
ld.param.u64 %rd588, [ j2k_htj2k_decode_codeblocks_multi_param_5];
cvta.to.global.u64 %rd40, %rd588;
cvta.to.global.u64 %rd41, %rd589;
mov.u32 %r2550, 2;
$L__BB26_212:
shr.u32 %r1509, %r2550, 1;
mul.lo.s32 %r442, %r1509, %r1163;
sub.s32 %r443, %r442, %r1163;
mov.u32 %r2559, 0;
mov.u32 %r2560, %r2559;
mov.u32 %r2561, %r442;
$L__BB26_213:
sub.s32 %r1510, %r2561, %r442;
add.s32 %r455, %r1510, %r443;
mul.wide.u32 %rd290, %r455, 2;
add.s64 %rd46, %rd13, %rd290;
ld.local.u16 %r1511, [%rd46];
shl.b32 %r1512, %r1511, 2;
and.b32 %r1513, %r1512, 640;
or.b32 %r1514, %r2560, %r1513;
add.s32 %r1515, %r455, 2;
mul.wide.u32 %rd291, %r1515, 2;
add.s64 %rd47, %rd13, %rd291;
ld.local.u16 %r1516, [%rd47];
shl.b32 %r1517, %r1516, 4;
and.b32 %r1518, %r1517, 512;
or.b32 %r456, %r1514, %r1518;
setp.gt.u32 %p305, %r2575, 31;
@%p305 bra $L__BB26_217;
$L__BB26_214:
setp.eq.s32 %p306, %r2574, 0;
mov.u16 %rs1270, 0;
@%p306 bra $L__BB26_216;
cvt.s64.s32 %rd292, %r2573;
add.s64 %rd293, %rd292, %rd3;
add.s64 %rd294, %rd1, %rd293;
ld.global.u8 %rs1270, [%rd294];
$L__BB26_216:
setp.ne.s32 %p308, %r2574, 0;
selp.b32 %r1519, -1, 0, %p308;
add.s32 %r2573, %r2573, %r1519;
add.s32 %r1520, %r2574, -1;
selp.b32 %r2574, 0, %r1520, %p306;
and.b16 %rs842, %rs1270, 255;
and.b16 %rs843, %rs1270, 127;
setp.eq.s16 %p309, %rs843, 127;
and.b16 %rs844, %rs1271, 255;
setp.ne.s16 %p310, %rs844, 0;
and.pred %p311, %p310, %p309;
selp.b32 %r1521, 7, 8, %p311;
cvt.u64.u16 %rd295, %rs1270;
and.b64 %rd296, %rd295, 255;
shl.b64 %rd297, %rd296, %r2575;
or.b64 %rd612, %rd297, %rd612;
add.s32 %r2575, %r1521, %r2575;
setp.gt.u16 %p312, %rs842, 143;
selp.u16 %rs1271, 1, 0, %p312;
setp.lt.u32 %p313, %r2575, 33;
@%p313 bra $L__BB26_214;
$L__BB26_217:
cvt.u32.u64 %r1522, %rd612;
and.b32 %r1523, %r1522, 127;
add.s32 %r1524, %r1523, %r456;
mul.wide.u32 %rd298, %r1524, 2;
add.s64 %rd299, %rd41, %rd298;
ld.global.u16 %r2627, [%rd299];
setp.ne.s32 %p314, %r456, 0;
@%p314 bra $L__BB26_267;
add.s32 %r467, %r2492, -2;
setp.eq.s32 %p315, %r467, -1;
selp.b32 %r2627, %r2627, 0, %p315;
setp.gt.s32 %p316, %r2492, 1;
mov.u32 %r2492, %r467;
@%p316 bra $L__BB26_267;
setp.ne.s32 %p317, %r2496, 0;
@%p317 bra $L__BB26_266;
mov.u32 %r2496, 0;
$L__BB26_221:
setp.gt.u32 %p318, %r2496, 7;
@%p318 bra $L__BB26_266;
cvt.u64.u32 %rd52, %r2495;
mul.wide.u32 %rd300, %r2495, 4;
add.s64 %rd302, %rd150, %rd300;
ld.global.nc.u32 %r473, [%rd302];
and.b16 %rs845, %rs1139, 255;
setp.ne.s16 %p319, %rs845, 0;
@%p319 bra $L__BB26_226;
setp.eq.s32 %p320, %r2405, 0;
mov.u16 %rs1275, 255;
@%p320 bra $L__BB26_225;
cvt.u64.u32 %rd303, %r2404;
add.s64 %rd304, %rd303, %rd3;
add.s64 %rd305, %rd1, %rd304;
ld.global.u8 %rs1275, [%rd305];
$L__BB26_225:
setp.ne.s32 %p322, %r2405, 0;
selp.u32 %r1526, 1, 0, %p322;
add.s32 %r2404, %r2404, %r1526;
add.s32 %r1527, %r2405, -1;
selp.b32 %r2405, 0, %r1527, %p320;
setp.eq.s32 %p323, %r2405, 0;
or.b16 %rs847, %rs1275, 15;
selp.b16 %rs1105, %rs847, %rs1275, %p323;
and.b16 %rs848, %rs1105, 255;
mov.u16 %rs849, 8;
sub.s16 %rs1139, %rs849, %rs1104;
setp.eq.s16 %p324, %rs848, 255;
selp.u16 %rs1104, 1, 0, %p324;
$L__BB26_226:
add.s16 %rs1139, %rs1139, -1;
cvt.u32.u16 %r1528, %rs1139;
and.b32 %r1529, %r1528, 255;
mov.u32 %r1530, 1;
shl.b32 %r1531, %r1530, %r1529;
cvt.u32.u16 %r1532, %rs1105;
and.b32 %r1533, %r1531, %r1532;
and.b32 %r478, %r1533, 255;
setp.eq.s32 %p325, %r478, 0;
@%p325 bra $L__BB26_228;
add.s32 %r1534, %r2495, 1;
min.u32 %r2616, %r1534, 12;
mov.u32 %r1535, -1;
shl.b32 %r1536, %r1535, %r473;
shl.b32 %r1537, %r1536, 1;
xor.b32 %r2617, %r1537, -2;
bra.uni $L__BB26_265;
$L__BB26_228:
add.s64 %rd306, %rd52, -3;
setp.gt.u64 %p326, %rd306, 9;
mov.u32 %r2613, 0;
@%p326 bra $L__BB26_264;
max.u32 %r481, %r473, 1;
add.s32 %r1541, %r481, -1;
and.b32 %r482, %r481, 3;
setp.lt.u32 %p327, %r1541, 3;
mov.u32 %r2613, 0;
@%p327 bra $L__BB26_248;
sub.s32 %r2585, %r481, %r482;
mov.u32 %r2613, 0;
$L__BB26_231:
and.b16 %rs851, %rs1139, 255;
setp.ne.s16 %p328, %rs851, 0;
@%p328 bra $L__BB26_235;
setp.eq.s32 %p329, %r2405, 0;
mov.u16 %rs1282, 255;
@%p329 bra $L__BB26_234;
cvt.u64.u32 %rd307, %r2404;
add.s64 %rd308, %rd307, %rd3;
add.s64 %rd309, %rd1, %rd308;
ld.global.u8 %rs1282, [%rd309];
$L__BB26_234:
setp.ne.s32 %p331, %r2405, 0;
selp.u32 %r1543, 1, 0, %p331;
add.s32 %r2404, %r2404, %r1543;
add.s32 %r1544, %r2405, -1;
selp.b32 %r2405, 0, %r1544, %p329;
setp.eq.s32 %p332, %r2405, 0;
or.b16 %rs853, %rs1282, 15;
selp.b16 %rs1105, %rs853, %rs1282, %p332;
and.b16 %rs854, %rs1105, 255;
mov.u16 %rs855, 8;
sub.s16 %rs1139, %rs855, %rs1104;
setp.eq.s16 %p333, %rs854, 255;
selp.u16 %rs1104, 1, 0, %p333;
$L__BB26_235:
add.s16 %rs1289, %rs1139, -1;
and.b16 %rs856, %rs1289, 255;
cvt.u32.u16 %r1545, %rs1289;
and.b32 %r1546, %r1545, 255;
cvt.u32.u16 %r1547, %rs1105;
and.b32 %r2591, %r1547, 255;
shr.u32 %r1548, %r2591, %r1546;
and.b32 %r1549, %r1548, 1;
bfi.b32 %r493, %r2613, %r1549, 1, 31;
setp.ne.s16 %p334, %rs856, 0;
@%p334 bra $L__BB26_239;
setp.eq.s32 %p335, %r2405, 0;
mov.u16 %rs1286, 255;
@%p335 bra $L__BB26_238;
cvt.u64.u32 %rd310, %r2404;
add.s64 %rd311, %rd310, %rd3;
add.s64 %rd312, %rd1, %rd311;
ld.global.u8 %rs1286, [%rd312];
$L__BB26_238:
setp.ne.s32 %p337, %r2405, 0;
selp.u32 %r1550, 1, 0, %p337;
add.s32 %r2404, %r2404, %r1550;
add.s32 %r1551, %r2405, -1;
selp.b32 %r2405, 0, %r1551, %p335;
setp.eq.s32 %p338, %r2405, 0;
or.b16 %rs858, %rs1286, 15;
selp.b16 %rs1105, %rs858, %rs1286, %p338;
and.b16 %rs859, %rs1105, 255;
mov.u16 %rs860, 8;
sub.s16 %rs1289, %rs860, %rs1104;
setp.eq.s16 %p339, %rs859, 255;
selp.u16 %rs1104, 1, 0, %p339;
cvt.u32.u16 %r1552, %rs1105;
and.b32 %r2591, %r1552, 255;
$L__BB26_239:
add.s16 %rs1293, %rs1289, -1;
and.b16 %rs861, %rs1293, 255;
cvt.u32.u16 %r1553, %rs1293;
and.b32 %r1554, %r1553, 255;
shr.u32 %r1555, %r2591, %r1554;
and.b32 %r1556, %r1555, 1;
bfi.b32 %r500, %r493, %r1556, 1, 31;
setp.ne.s16 %p340, %rs861, 0;
@%p340 bra $L__BB26_243;
setp.eq.s32 %p341, %r2405, 0;
mov.u16 %rs1290, 255;
@%p341 bra $L__BB26_242;
cvt.u64.u32 %rd313, %r2404;
add.s64 %rd314, %rd313, %rd3;
add.s64 %rd315, %rd1, %rd314;
ld.global.u8 %rs1290, [%rd315];
$L__BB26_242:
setp.ne.s32 %p343, %r2405, 0;
selp.u32 %r1557, 1, 0, %p343;
add.s32 %r2404, %r2404, %r1557;
add.s32 %r1558, %r2405, -1;
selp.b32 %r2405, 0, %r1558, %p341;
setp.eq.s32 %p344, %r2405, 0;
or.b16 %rs863, %rs1290, 15;
selp.b16 %rs1105, %rs863, %rs1290, %p344;
and.b16 %rs864, %rs1105, 255;
mov.u16 %rs865, 8;
sub.s16 %rs1293, %rs865, %rs1104;
setp.eq.s16 %p345, %rs864, 255;
selp.u16 %rs1104, 1, 0, %p345;
cvt.u32.u16 %r1559, %rs1105;
and.b32 %r2591, %r1559, 255;
$L__BB26_243:
add.s16 %rs1297, %rs1293, -1;
and.b16 %rs866, %rs1297, 255;
cvt.u32.u16 %r1560, %rs1297;
and.b32 %r1561, %r1560, 255;
shr.u32 %r1562, %r2591, %r1561;
and.b32 %r1563, %r1562, 1;
bfi.b32 %r507, %r500, %r1563, 1, 31;
setp.ne.s16 %p346, %rs866, 0;
@%p346 bra $L__BB26_247;
setp.eq.s32 %p347, %r2405, 0;
mov.u16 %rs1294, 255;
@%p347 bra $L__BB26_246;
cvt.u64.u32 %rd316, %r2404;
add.s64 %rd317, %rd316, %rd3;
add.s64 %rd318, %rd1, %rd317;
ld.global.u8 %rs1294, [%rd318];
$L__BB26_246:
setp.ne.s32 %p349, %r2405, 0;
selp.u32 %r1564, 1, 0, %p349;
add.s32 %r2404, %r2404, %r1564;
add.s32 %r1565, %r2405, -1;
selp.b32 %r2405, 0, %r1565, %p347;
setp.eq.s32 %p350, %r2405, 0;
or.b16 %rs868, %rs1294, 15;
selp.b16 %rs1105, %rs868, %rs1294, %p350;
and.b16 %rs869, %rs1105, 255;
mov.u16 %rs870, 8;
sub.s16 %rs1297, %rs870, %rs1104;
setp.eq.s16 %p351, %rs869, 255;
selp.u16 %rs1104, 1, 0, %p351;
cvt.u32.u16 %r1566, %rs1105;
and.b32 %r2591, %r1566, 255;
$L__BB26_247:
add.s16 %rs1139, %rs1297, -1;
cvt.u32.u16 %r1567, %rs1139;
and.b32 %r1568, %r1567, 255;
shr.u32 %r1569, %r2591, %r1568;
and.b32 %r1570, %r1569, 1;
bfi.b32 %r2613, %r507, %r1570, 1, 31;
add.s32 %r2585, %r2585, -4;
setp.ne.s32 %p352, %r2585, 0;
@%p352 bra $L__BB26_231;
$L__BB26_248:
setp.eq.s32 %p353, %r482, 0;
@%p353 bra $L__BB26_264;
and.b16 %rs871, %rs1139, 255;
setp.ne.s16 %p354, %rs871, 0;
@%p354 bra $L__BB26_253;
setp.eq.s32 %p355, %r2405, 0;
mov.u16 %rs1304, 255;
@%p355 bra $L__BB26_252;
cvt.u64.u32 %rd319, %r2404;
add.s64 %rd320, %rd319, %rd3;
add.s64 %rd321, %rd1, %rd320;
ld.global.u8 %rs1304, [%rd321];
$L__BB26_252:
setp.ne.s32 %p357, %r2405, 0;
selp.u32 %r1571, 1, 0, %p357;
add.s32 %r2404, %r2404, %r1571;
add.s32 %r1572, %r2405, -1;
selp.b32 %r2405, 0, %r1572, %p355;
setp.eq.s32 %p358, %r2405, 0;
or.b16 %rs873, %rs1304, 15;
selp.b16 %rs1105, %rs873, %rs1304, %p358;
and.b16 %rs874, %rs1105, 255;
mov.u16 %rs875, 8;
sub.s16 %rs1139, %rs875, %rs1104;
setp.eq.s16 %p359, %rs874, 255;
selp.u16 %rs1104, 1, 0, %p359;
$L__BB26_253:
add.s16 %rs1139, %rs1139, -1;
cvt.u32.u16 %r1573, %rs1139;
and.b32 %r1574, %r1573, 255;
cvt.u32.u16 %r1575, %rs1105;
and.b32 %r2608, %r1575, 255;
shr.u32 %r1576, %r2608, %r1574;
and.b32 %r1577, %r1576, 1;
bfi.b32 %r2613, %r2613, %r1577, 1, 31;
setp.eq.s32 %p360, %r482, 1;
@%p360 bra $L__BB26_264;
and.b16 %rs876, %rs1139, 255;
setp.ne.s16 %p361, %rs876, 0;
@%p361 bra $L__BB26_258;
setp.eq.s32 %p362, %r2405, 0;
mov.u16 %rs1308, 255;
@%p362 bra $L__BB26_257;
cvt.u64.u32 %rd322, %r2404;
add.s64 %rd323, %rd322, %rd3;
add.s64 %rd324, %rd1, %rd323;
ld.global.u8 %rs1308, [%rd324];
$L__BB26_257:
setp.ne.s32 %p364, %r2405, 0;
selp.u32 %r1578, 1, 0, %p364;
add.s32 %r2404, %r2404, %r1578;
add.s32 %r1579, %r2405, -1;
selp.b32 %r2405, 0, %r1579, %p362;
setp.eq.s32 %p365, %r2405, 0;
or.b16 %rs878, %rs1308, 15;
selp.b16 %rs1105, %rs878, %rs1308, %p365;
and.b16 %rs879, %rs1105, 255;
mov.u16 %rs880, 8;
sub.s16 %rs1139, %rs880, %rs1104;
setp.eq.s16 %p366, %rs879, 255;
selp.u16 %rs1104, 1, 0, %p366;
cvt.u32.u16 %r1580, %rs1105;
and.b32 %r2608, %r1580, 255;
$L__BB26_258:
add.s16 %rs1139, %rs1139, -1;
cvt.u32.u16 %r1581, %rs1139;
and.b32 %r1582, %r1581, 255;
shr.u32 %r1583, %r2608, %r1582;
and.b32 %r1584, %r1583, 1;
bfi.b32 %r2613, %r2613, %r1584, 1, 31;
setp.eq.s32 %p367, %r482, 2;
@%p367 bra $L__BB26_264;
and.b16 %rs881, %rs1139, 255;
setp.ne.s16 %p368, %rs881, 0;
@%p368 bra $L__BB26_263;
setp.eq.s32 %p369, %r2405, 0;
mov.u16 %rs1312, 255;
@%p369 bra $L__BB26_262;
cvt.u64.u32 %rd325, %r2404;
add.s64 %rd326, %rd325, %rd3;
add.s64 %rd327, %rd1, %rd326;
ld.global.u8 %rs1312, [%rd327];
$L__BB26_262:
setp.ne.s32 %p371, %r2405, 0;
selp.u32 %r1585, 1, 0, %p371;
add.s32 %r2404, %r2404, %r1585;
add.s32 %r1586, %r2405, -1;
selp.b32 %r2405, 0, %r1586, %p369;
setp.eq.s32 %p372, %r2405, 0;
or.b16 %rs883, %rs1312, 15;
selp.b16 %rs1105, %rs883, %rs1312, %p372;
and.b16 %rs884, %rs1105, 255;
mov.u16 %rs885, 8;
sub.s16 %rs1139, %rs885, %rs1104;
setp.eq.s16 %p373, %rs884, 255;
selp.u16 %rs1104, 1, 0, %p373;
cvt.u32.u16 %r1587, %rs1105;
and.b32 %r2608, %r1587, 255;
$L__BB26_263:
add.s16 %rs1139, %rs1139, -1;
cvt.u32.u16 %r1588, %rs1139;
and.b32 %r1589, %r1588, 255;
shr.u32 %r1590, %r2608, %r1589;
and.b32 %r1591, %r1590, 1;
bfi.b32 %r2613, %r2613, %r1591, 1, 31;
$L__BB26_264:
shl.b32 %r1592, %r2613, 1;
or.b32 %r2617, %r1592, 1;
add.s32 %r1593, %r2495, -1;
setp.eq.s32 %p374, %r2495, 0;
selp.b32 %r2616, 0, %r1593, %p374;
$L__BB26_265:
mul.lo.s32 %r1594, %r2496, 7;
cvt.u64.u32 %rd328, %r2617;
shl.b64 %rd329, %rd328, %r1594;
or.b64 %rd603, %rd329, %rd603;
setp.ne.s32 %p375, %r2495, 12;
setp.ne.s32 %p376, %r478, 0;
or.pred %p377, %p375, %p376;
add.s32 %r2496, %r2496, 1;
setp.lt.u32 %p378, %r2496, 8;
or.pred %p379, %p378, %p377;
mov.u32 %r2495, %r2616;
@%p379 bra $L__BB26_221;
$L__BB26_266:
cvt.u32.u64 %r1595, %rd603;
and.b32 %r2492, %r1595, 127;
shr.u64 %rd603, %rd603, 7;
add.s32 %r2496, %r2496, -1;
$L__BB26_267:
mul.wide.u32 %rd330, %r2561, 2;
add.s64 %rd331, %rd13, %rd330;
st.local.u16 [%rd331], %r2627;
shl.b32 %r1596, %r2627, 2;
shl.b32 %r1597, %r2627, 1;
or.b32 %r1598, %r1596, %r1597;
and.b32 %r1599, %r1598, 256;
ld.local.u16 %r1600, [%rd46];
and.b32 %r1601, %r1600, 128;
or.b32 %r1602, %r1599, %r1601;
ld.local.u16 %r1603, [%rd47];
shl.b32 %r1604, %r1603, 2;
and.b32 %r1605, %r1604, 640;
or.b32 %r1606, %r1602, %r1605;
add.s32 %r1607, %r455, 4;
mul.wide.u32 %rd332, %r1607, 2;
add.s64 %rd333, %rd13, %rd332;
ld.local.u16 %r1608, [%rd333];
shl.b32 %r1609, %r1608, 4;
and.b32 %r1610, %r1609, 512;
or.b32 %r1611, %r1606, %r1610;
and.b32 %r1612, %r2627, 7;
shr.u64 %rd57, %rd612, %r1612;
sub.s32 %r564, %r2575, %r1612;
cvt.u32.u64 %r1613, %rd57;
and.b32 %r1614, %r1613, 127;
or.b32 %r1615, %r1611, %r1614;
mul.wide.u32 %rd334, %r1615, 2;
add.s64 %rd335, %rd41, %rd334;
ld.global.u16 %r2679, [%rd335];
setp.ne.s32 %p380, %r1611, 0;
add.s32 %r566, %r2559, 2;
setp.ge.u32 %p381, %r566, %r1141;
or.pred %p382, %p381, %p380;
@%p382 bra $L__BB26_317;
add.s32 %r567, %r2492, -2;
setp.eq.s32 %p383, %r567, -1;
selp.b32 %r2679, %r2679, 0, %p383;
setp.gt.s32 %p384, %r2492, 1;
mov.u32 %r2492, %r567;
@%p384 bra $L__BB26_317;
setp.ne.s32 %p385, %r2496, 0;
@%p385 bra $L__BB26_316;
mov.u32 %r2496, 0;
$L__BB26_271:
setp.gt.u32 %p386, %r2496, 7;
@%p386 bra $L__BB26_316;
cvt.u64.u32 %rd59, %r2495;
mul.wide.u32 %rd336, %r2495, 4;
add.s64 %rd338, %rd150, %rd336;
ld.global.nc.u32 %r573, [%rd338];
and.b16 %rs886, %rs1139, 255;
setp.ne.s16 %p387, %rs886, 0;
@%p387 bra $L__BB26_276;
setp.eq.s32 %p388, %r2405, 0;
mov.u16 %rs1331, 255;
@%p388 bra $L__BB26_275;
cvt.u64.u32 %rd339, %r2404;
add.s64 %rd340, %rd339, %rd3;
add.s64 %rd341, %rd1, %rd340;
ld.global.u8 %rs1331, [%rd341];
$L__BB26_275:
setp.ne.s32 %p390, %r2405, 0;
selp.u32 %r1617, 1, 0, %p390;
add.s32 %r2404, %r2404, %r1617;
add.s32 %r1618, %r2405, -1;
selp.b32 %r2405, 0, %r1618, %p388;
setp.eq.s32 %p391, %r2405, 0;
or.b16 %rs888, %rs1331, 15;
selp.b16 %rs1105, %rs888, %rs1331, %p391;
and.b16 %rs889, %rs1105, 255;
mov.u16 %rs890, 8;
sub.s16 %rs1139, %rs890, %rs1104;
setp.eq.s16 %p392, %rs889, 255;
selp.u16 %rs1104, 1, 0, %p392;
$L__BB26_276:
add.s16 %rs1139, %rs1139, -1;
cvt.u32.u16 %r1619, %rs1139;
and.b32 %r1620, %r1619, 255;
mov.u32 %r1621, 1;
shl.b32 %r1622, %r1621, %r1620;
cvt.u32.u16 %r1623, %rs1105;
and.b32 %r1624, %r1622, %r1623;
and.b32 %r578, %r1624, 255;
setp.eq.s32 %p393, %r578, 0;
@%p393 bra $L__BB26_278;
add.s32 %r1625, %r2495, 1;
min.u32 %r2668, %r1625, 12;
mov.u32 %r1626, -1;
shl.b32 %r1627, %r1626, %r573;
shl.b32 %r1628, %r1627, 1;
xor.b32 %r2669, %r1628, -2;
bra.uni $L__BB26_315;
$L__BB26_278:
add.s64 %rd342, %rd59, -3;
setp.gt.u64 %p394, %rd342, 9;
mov.u32 %r2665, 0;
@%p394 bra $L__BB26_314;
max.u32 %r581, %r573, 1;
add.s32 %r1632, %r581, -1;
and.b32 %r582, %r581, 3;
setp.lt.u32 %p395, %r1632, 3;
mov.u32 %r2665, 0;
@%p395 bra $L__BB26_298;
sub.s32 %r2637, %r581, %r582;
mov.u32 %r2665, 0;
$L__BB26_281:
and.b16 %rs892, %rs1139, 255;
setp.ne.s16 %p396, %rs892, 0;
@%p396 bra $L__BB26_285;
setp.eq.s32 %p397, %r2405, 0;
mov.u16 %rs1338, 255;
@%p397 bra $L__BB26_284;
cvt.u64.u32 %rd343, %r2404;
add.s64 %rd344, %rd343, %rd3;
add.s64 %rd345, %rd1, %rd344;
ld.global.u8 %rs1338, [%rd345];
$L__BB26_284:
setp.ne.s32 %p399, %r2405, 0;
selp.u32 %r1634, 1, 0, %p399;
add.s32 %r2404, %r2404, %r1634;
add.s32 %r1635, %r2405, -1;
selp.b32 %r2405, 0, %r1635, %p397;
setp.eq.s32 %p400, %r2405, 0;
or.b16 %rs894, %rs1338, 15;
selp.b16 %rs1105, %rs894, %rs1338, %p400;
and.b16 %rs895, %rs1105, 255;
mov.u16 %rs896, 8;
sub.s16 %rs1139, %rs896, %rs1104;
setp.eq.s16 %p401, %rs895, 255;
selp.u16 %rs1104, 1, 0, %p401;
$L__BB26_285:
add.s16 %rs1345, %rs1139, -1;
and.b16 %rs897, %rs1345, 255;
cvt.u32.u16 %r1636, %rs1345;
and.b32 %r1637, %r1636, 255;
cvt.u32.u16 %r1638, %rs1105;
and.b32 %r2643, %r1638, 255;
shr.u32 %r1639, %r2643, %r1637;
and.b32 %r1640, %r1639, 1;
bfi.b32 %r593, %r2665, %r1640, 1, 31;
setp.ne.s16 %p402, %rs897, 0;
@%p402 bra $L__BB26_289;
setp.eq.s32 %p403, %r2405, 0;
mov.u16 %rs1342, 255;
@%p403 bra $L__BB26_288;
cvt.u64.u32 %rd346, %r2404;
add.s64 %rd347, %rd346, %rd3;
add.s64 %rd348, %rd1, %rd347;
ld.global.u8 %rs1342, [%rd348];
$L__BB26_288:
setp.ne.s32 %p405, %r2405, 0;
selp.u32 %r1641, 1, 0, %p405;
add.s32 %r2404, %r2404, %r1641;
add.s32 %r1642, %r2405, -1;
selp.b32 %r2405, 0, %r1642, %p403;
setp.eq.s32 %p406, %r2405, 0;
or.b16 %rs899, %rs1342, 15;
selp.b16 %rs1105, %rs899, %rs1342, %p406;
and.b16 %rs900, %rs1105, 255;
mov.u16 %rs901, 8;
sub.s16 %rs1345, %rs901, %rs1104;
setp.eq.s16 %p407, %rs900, 255;
selp.u16 %rs1104, 1, 0, %p407;
cvt.u32.u16 %r1643, %rs1105;
and.b32 %r2643, %r1643, 255;
$L__BB26_289:
add.s16 %rs1349, %rs1345, -1;
and.b16 %rs902, %rs1349, 255;
cvt.u32.u16 %r1644, %rs1349;
and.b32 %r1645, %r1644, 255;
shr.u32 %r1646, %r2643, %r1645;
and.b32 %r1647, %r1646, 1;
bfi.b32 %r600, %r593, %r1647, 1, 31;
setp.ne.s16 %p408, %rs902, 0;
@%p408 bra $L__BB26_293;
setp.eq.s32 %p409, %r2405, 0;
mov.u16 %rs1346, 255;
@%p409 bra $L__BB26_292;
cvt.u64.u32 %rd349, %r2404;
add.s64 %rd350, %rd349, %rd3;
add.s64 %rd351, %rd1, %rd350;
ld.global.u8 %rs1346, [%rd351];
$L__BB26_292:
setp.ne.s32 %p411, %r2405, 0;
selp.u32 %r1648, 1, 0, %p411;
add.s32 %r2404, %r2404, %r1648;
add.s32 %r1649, %r2405, -1;
selp.b32 %r2405, 0, %r1649, %p409;
setp.eq.s32 %p412, %r2405, 0;
or.b16 %rs904, %rs1346, 15;
selp.b16 %rs1105, %rs904, %rs1346, %p412;
and.b16 %rs905, %rs1105, 255;
mov.u16 %rs906, 8;
sub.s16 %rs1349, %rs906, %rs1104;
setp.eq.s16 %p413, %rs905, 255;
selp.u16 %rs1104, 1, 0, %p413;
cvt.u32.u16 %r1650, %rs1105;
and.b32 %r2643, %r1650, 255;
$L__BB26_293:
add.s16 %rs1353, %rs1349, -1;
and.b16 %rs907, %rs1353, 255;
cvt.u32.u16 %r1651, %rs1353;
and.b32 %r1652, %r1651, 255;
shr.u32 %r1653, %r2643, %r1652;
and.b32 %r1654, %r1653, 1;
bfi.b32 %r607, %r600, %r1654, 1, 31;
setp.ne.s16 %p414, %rs907, 0;
@%p414 bra $L__BB26_297;
setp.eq.s32 %p415, %r2405, 0;
mov.u16 %rs1350, 255;
@%p415 bra $L__BB26_296;
cvt.u64.u32 %rd352, %r2404;
add.s64 %rd353, %rd352, %rd3;
add.s64 %rd354, %rd1, %rd353;
ld.global.u8 %rs1350, [%rd354];
$L__BB26_296:
setp.ne.s32 %p417, %r2405, 0;
selp.u32 %r1655, 1, 0, %p417;
add.s32 %r2404, %r2404, %r1655;
add.s32 %r1656, %r2405, -1;
selp.b32 %r2405, 0, %r1656, %p415;
setp.eq.s32 %p418, %r2405, 0;
or.b16 %rs909, %rs1350, 15;
selp.b16 %rs1105, %rs909, %rs1350, %p418;
and.b16 %rs910, %rs1105, 255;
mov.u16 %rs911, 8;
sub.s16 %rs1353, %rs911, %rs1104;
setp.eq.s16 %p419, %rs910, 255;
selp.u16 %rs1104, 1, 0, %p419;
cvt.u32.u16 %r1657, %rs1105;
and.b32 %r2643, %r1657, 255;
$L__BB26_297:
add.s16 %rs1139, %rs1353, -1;
cvt.u32.u16 %r1658, %rs1139;
and.b32 %r1659, %r1658, 255;
shr.u32 %r1660, %r2643, %r1659;
and.b32 %r1661, %r1660, 1;
bfi.b32 %r2665, %r607, %r1661, 1, 31;
add.s32 %r2637, %r2637, -4;
setp.ne.s32 %p420, %r2637, 0;
@%p420 bra $L__BB26_281;
$L__BB26_298:
setp.eq.s32 %p421, %r582, 0;
@%p421 bra $L__BB26_314;
and.b16 %rs912, %rs1139, 255;
setp.ne.s16 %p422, %rs912, 0;
@%p422 bra $L__BB26_303;
setp.eq.s32 %p423, %r2405, 0;
mov.u16 %rs1360, 255;
@%p423 bra $L__BB26_302;
cvt.u64.u32 %rd355, %r2404;
add.s64 %rd356, %rd355, %rd3;
add.s64 %rd357, %rd1, %rd356;
ld.global.u8 %rs1360, [%rd357];
$L__BB26_302:
setp.ne.s32 %p425, %r2405, 0;
selp.u32 %r1662, 1, 0, %p425;
add.s32 %r2404, %r2404, %r1662;
add.s32 %r1663, %r2405, -1;
selp.b32 %r2405, 0, %r1663, %p423;
setp.eq.s32 %p426, %r2405, 0;
or.b16 %rs914, %rs1360, 15;
selp.b16 %rs1105, %rs914, %rs1360, %p426;
and.b16 %rs915, %rs1105, 255;
mov.u16 %rs916, 8;
sub.s16 %rs1139, %rs916, %rs1104;
setp.eq.s16 %p427, %rs915, 255;
selp.u16 %rs1104, 1, 0, %p427;
$L__BB26_303:
add.s16 %rs1139, %rs1139, -1;
cvt.u32.u16 %r1664, %rs1139;
and.b32 %r1665, %r1664, 255;
cvt.u32.u16 %r1666, %rs1105;
and.b32 %r2660, %r1666, 255;
shr.u32 %r1667, %r2660, %r1665;
and.b32 %r1668, %r1667, 1;
bfi.b32 %r2665, %r2665, %r1668, 1, 31;
setp.eq.s32 %p428, %r582, 1;
@%p428 bra $L__BB26_314;
and.b16 %rs917, %rs1139, 255;
setp.ne.s16 %p429, %rs917, 0;
@%p429 bra $L__BB26_308;
setp.eq.s32 %p430, %r2405, 0;
mov.u16 %rs1364, 255;
@%p430 bra $L__BB26_307;
cvt.u64.u32 %rd358, %r2404;
add.s64 %rd359, %rd358, %rd3;
add.s64 %rd360, %rd1, %rd359;
ld.global.u8 %rs1364, [%rd360];
$L__BB26_307:
setp.ne.s32 %p432, %r2405, 0;
selp.u32 %r1669, 1, 0, %p432;
add.s32 %r2404, %r2404, %r1669;
add.s32 %r1670, %r2405, -1;
selp.b32 %r2405, 0, %r1670, %p430;
setp.eq.s32 %p433, %r2405, 0;
or.b16 %rs919, %rs1364, 15;
selp.b16 %rs1105, %rs919, %rs1364, %p433;
and.b16 %rs920, %rs1105, 255;
mov.u16 %rs921, 8;
sub.s16 %rs1139, %rs921, %rs1104;
setp.eq.s16 %p434, %rs920, 255;
selp.u16 %rs1104, 1, 0, %p434;
cvt.u32.u16 %r1671, %rs1105;
and.b32 %r2660, %r1671, 255;
$L__BB26_308:
add.s16 %rs1139, %rs1139, -1;
cvt.u32.u16 %r1672, %rs1139;
and.b32 %r1673, %r1672, 255;
shr.u32 %r1674, %r2660, %r1673;
and.b32 %r1675, %r1674, 1;
bfi.b32 %r2665, %r2665, %r1675, 1, 31;
setp.eq.s32 %p435, %r582, 2;
@%p435 bra $L__BB26_314;
and.b16 %rs922, %rs1139, 255;
setp.ne.s16 %p436, %rs922, 0;
@%p436 bra $L__BB26_313;
setp.eq.s32 %p437, %r2405, 0;
mov.u16 %rs1368, 255;
@%p437 bra $L__BB26_312;
cvt.u64.u32 %rd361, %r2404;
add.s64 %rd362, %rd361, %rd3;
add.s64 %rd363, %rd1, %rd362;
ld.global.u8 %rs1368, [%rd363];
$L__BB26_312:
setp.ne.s32 %p439, %r2405, 0;
selp.u32 %r1676, 1, 0, %p439;
add.s32 %r2404, %r2404, %r1676;
add.s32 %r1677, %r2405, -1;
selp.b32 %r2405, 0, %r1677, %p437;
setp.eq.s32 %p440, %r2405, 0;
or.b16 %rs924, %rs1368, 15;
selp.b16 %rs1105, %rs924, %rs1368, %p440;
and.b16 %rs925, %rs1105, 255;
mov.u16 %rs926, 8;
sub.s16 %rs1139, %rs926, %rs1104;
setp.eq.s16 %p441, %rs925, 255;
selp.u16 %rs1104, 1, 0, %p441;
cvt.u32.u16 %r1678, %rs1105;
and.b32 %r2660, %r1678, 255;
$L__BB26_313:
add.s16 %rs1139, %rs1139, -1;
cvt.u32.u16 %r1679, %rs1139;
and.b32 %r1680, %r1679, 255;
shr.u32 %r1681, %r2660, %r1680;
and.b32 %r1682, %r1681, 1;
bfi.b32 %r2665, %r2665, %r1682, 1, 31;
$L__BB26_314:
shl.b32 %r1683, %r2665, 1;
or.b32 %r2669, %r1683, 1;
add.s32 %r1684, %r2495, -1;
setp.eq.s32 %p442, %r2495, 0;
selp.b32 %r2668, 0, %r1684, %p442;
$L__BB26_315:
mul.lo.s32 %r1685, %r2496, 7;
cvt.u64.u32 %rd364, %r2669;
shl.b64 %rd365, %rd364, %r1685;
or.b64 %rd603, %rd365, %rd603;
setp.ne.s32 %p443, %r2495, 12;
setp.ne.s32 %p444, %r578, 0;
or.pred %p445, %p443, %p444;
add.s32 %r2496, %r2496, 1;
setp.lt.u32 %p446, %r2496, 8;
or.pred %p447, %p446, %p445;
mov.u32 %r2495, %r2668;
@%p447 bra $L__BB26_271;
$L__BB26_316:
cvt.u32.u64 %r1686, %rd603;
and.b32 %r2492, %r1686, 127;
shr.u64 %rd603, %rd603, 7;
add.s32 %r2496, %r2496, -1;
$L__BB26_317:
setp.lt.u32 %p448, %r566, %r1141;
selp.b32 %r1687, %r2679, 0, %p448;
add.s32 %r1688, %r2561, 2;
mul.wide.u32 %rd366, %r1688, 2;
add.s64 %rd367, %rd13, %rd366;
st.local.u16 [%rd367], %r1687;
shl.b32 %r1689, %r1687, 2;
shl.b32 %r1690, %r1687, 1;
or.b32 %r1691, %r1689, %r1690;
and.b32 %r1692, %r1691, 256;
ld.local.u16 %r1693, [%rd47];
and.b32 %r1694, %r1693, 128;
or.b32 %r2560, %r1692, %r1694;
and.b32 %r1695, %r1687, 7;
shr.u64 %rd368, %rd57, %r1695;
sub.s32 %r1696, %r564, %r1695;
cvt.u32.u64 %r1697, %rd368;
shl.b32 %r1698, %r2627, 3;
and.b32 %r1699, %r1698, 64;
shl.b32 %r1700, %r1687, 4;
and.b32 %r1701, %r1700, 128;
or.b32 %r1702, %r1701, %r1699;
and.b32 %r1703, %r1697, 63;
or.b32 %r1704, %r1703, %r1702;
mul.wide.u32 %rd369, %r1704, 2;
add.s64 %rd370, %rd40, %rd369;
ld.global.u16 %r1705, [%rd370];
and.b32 %r1706, %r1705, 7;
shr.u64 %rd371, %rd368, %r1706;
sub.s32 %r1707, %r1696, %r1706;
cvt.u32.u64 %r1708, %rd371;
shr.u32 %r1709, %r1705, 3;
and.b32 %r1710, %r1709, 15;
mov.u32 %r1711, -1;
shl.b32 %r1712, %r1711, %r1710;
not.b32 %r1713, %r1712;
and.b32 %r1714, %r1708, %r1713;
shr.u64 %rd612, %rd371, %r1710;
sub.s32 %r2575, %r1707, %r1710;
shr.u32 %r1715, %r1705, 7;
and.b32 %r1716, %r1715, 7;
shr.u32 %r1717, %r1705, 10;
and.b32 %r1718, %r1717, 7;
mov.u32 %r1719, 255;
shl.b32 %r1720, %r1719, %r1716;
not.b32 %r1721, %r1720;
and.b32 %r1722, %r1714, %r1721;
add.s32 %r1723, %r1722, %r1718;
add.s32 %r1724, %r2561, 1;
mul.wide.u32 %rd372, %r1724, 2;
add.s64 %rd373, %rd13, %rd372;
st.local.u16 [%rd373], %r1723;
shr.u32 %r1725, %r1705, 13;
shr.u32 %r1726, %r1714, %r1716;
add.s32 %r1727, %r1726, %r1725;
add.s32 %r1728, %r2561, 3;
mul.wide.u32 %rd374, %r1728, 2;
add.s64 %rd375, %rd13, %rd374;
st.local.u16 [%rd375], %r1727;
add.s32 %r2561, %r2561, 4;
add.s32 %r2559, %r2559, 4;
setp.lt.u32 %p449, %r2559, %r1141;
@%p449 bra $L__BB26_213;
mul.wide.u32 %rd376, %r2561, 2;
add.s64 %rd377, %rd13, %rd376;
mov.u16 %rs927, 0;
st.local.v2.u16 [%rd377], {%rs927, %rs927};
add.s32 %r2550, %r2550, 2;
setp.lt.u32 %p450, %r2550, %r1144;
@%p450 bra $L__BB26_212;
$L__BB26_319:
mov.u32 %r1729, 30;
sub.s32 %r669, %r1729, %r1148;
add.s32 %r1730, %r1141, 1;
shr.u32 %r1731, %r1730, 1;
add.s32 %r1732, %r1731, 2;
setp.gt.u32 %p451, %r1732, 130;
@%p451 bra $L__BB26_534;
bra.uni $L__BB26_320;
$L__BB26_534:
mov.u32 %r2296, 2;
st.global.u32 [%rd4], %r2296;
mov.u32 %r2297, 12;
st.global.u32 [%rd4+4], %r2297;
mov.u32 %r2298, 0;
st.global.u32 [%rd4+8], %r2298;
st.global.u32 [%rd4+12], %r2298;
bra.uni $L__BB26_541;
$L__BB26_320:
add.s32 %r670, %r1148, 2;
add.s32 %r671, %r669, -1;
mov.u32 %r2680, 0;
mov.u64 %rd640, 0;
mov.u16 %rs1411, 0;
mov.u32 %r2681, %r2680;
mov.u32 %r2682, %r2680;
mov.u32 %r2683, %r14;
mov.u32 %r2750, %r2680;
mov.u32 %r2749, %r2680;
$L__BB26_321:
mov.u32 %r674, %r2682;
mul.wide.u32 %rd379, %r2681, 2;
add.s64 %rd380, %rd13, %rd379;
ld.local.u16 %r678, [%rd380];
ld.local.u16 %r679, [%rd380+2];
setp.lt.u32 %p452, %r670, %r679;
@%p452 bra $L__BB26_533;
and.b32 %r1739, %r678, 16;
setp.eq.s32 %p453, %r1739, 0;
mov.u32 %r2699, 0;
mov.u32 %r2691, %r2699;
@%p453 bra $L__BB26_328;
setp.gt.u32 %p454, %r2750, 31;
@%p454 bra $L__BB26_327;
$L__BB26_324:
setp.ge.u32 %p455, %r2749, %r20;
mov.u16 %rs1386, 255;
@%p455 bra $L__BB26_326;
add.s32 %r682, %r2749, 1;
cvt.u64.u32 %rd381, %r2749;
add.s64 %rd382, %rd381, %rd3;
add.s64 %rd383, %rd1, %rd382;
ld.global.u8 %rs1386, [%rd383];
mov.u32 %r2749, %r682;
$L__BB26_326:
and.b16 %rs930, %rs1386, 255;
cvt.u64.u16 %rd384, %rs1386;
and.b64 %rd385, %rd384, 255;
shl.b64 %rd386, %rd385, %r2750;
or.b64 %rd640, %rd386, %rd640;
add.s32 %r1740, %r2750, 8;
cvt.u32.u16 %r1741, %rs1411;
cvt.s32.s8 %r1742, %r1741;
sub.s32 %r2750, %r1740, %r1742;
setp.eq.s16 %p456, %rs930, 255;
selp.u16 %rs1411, 1, 0, %p456;
setp.lt.u32 %p457, %r2750, 33;
@%p457 bra $L__BB26_324;
$L__BB26_327:
shr.u32 %r1743, %r678, 12;
and.b32 %r1744, %r1743, 1;
sub.s32 %r1745, %r679, %r1744;
shr.u64 %rd69, %rd640, %r1745;
sub.s32 %r2750, %r2750, %r1745;
cvt.u32.u64 %r1746, %rd640;
shl.b32 %r1747, %r1746, 31;
setp.eq.s32 %p458, %r1745, 0;
mov.u32 %r1748, -1;
shl.b32 %r1749, %r1748, %r1745;
not.b32 %r1750, %r1749;
selp.b32 %r1751, 0, %r1750, %p458;
and.b32 %r1752, %r1751, %r1746;
shr.u32 %r1753, %r678, 8;
and.b32 %r1754, %r1753, 1;
shl.b32 %r1755, %r1754, %r1745;
or.b32 %r1756, %r1755, %r1752;
or.b32 %r1757, %r1756, 1;
add.s32 %r1758, %r1757, 2;
shl.b32 %r1759, %r1758, %r671;
or.b32 %r2691, %r1759, %r1747;
mov.u64 %rd640, %rd69;
$L__BB26_328:
mul.wide.u32 %rd387, %r2683, 4;
add.s64 %rd388, %rd2, %rd387;
st.global.u32 [%rd388], %r2691;
and.b32 %r1762, %r678, 32;
setp.eq.s32 %p459, %r1762, 0;
mov.u32 %r2700, %r2699;
@%p459 bra $L__BB26_334;
setp.gt.u32 %p460, %r2750, 31;
@%p460 bra $L__BB26_333;
$L__BB26_330:
setp.ge.u32 %p461, %r2749, %r20;
mov.u16 %rs1390, 255;
@%p461 bra $L__BB26_332;
add.s32 %r694, %r2749, 1;
cvt.u64.u32 %rd389, %r2749;
add.s64 %rd390, %rd389, %rd3;
add.s64 %rd391, %rd1, %rd390;
ld.global.u8 %rs1390, [%rd391];
mov.u32 %r2749, %r694;
$L__BB26_332:
and.b16 %rs932, %rs1390, 255;
cvt.u64.u16 %rd392, %rs1390;
and.b64 %rd393, %rd392, 255;
shl.b64 %rd394, %rd393, %r2750;
or.b64 %rd640, %rd394, %rd640;
add.s32 %r1763, %r2750, 8;
cvt.u32.u16 %r1764, %rs1411;
cvt.s32.s8 %r1765, %r1764;
sub.s32 %r2750, %r1763, %r1765;
setp.eq.s16 %p462, %rs932, 255;
selp.u16 %rs1411, 1, 0, %p462;
setp.lt.u32 %p463, %r2750, 33;
@%p463 bra $L__BB26_330;
$L__BB26_333:
shr.u32 %r1766, %r678, 13;
and.b32 %r1767, %r1766, 1;
sub.s32 %r1768, %r679, %r1767;
shr.u64 %rd74, %rd640, %r1768;
sub.s32 %r2750, %r2750, %r1768;
cvt.u32.u64 %r1769, %rd640;
shl.b32 %r1770, %r1769, 31;
setp.eq.s32 %p464, %r1768, 0;
mov.u32 %r1771, -1;
shl.b32 %r1772, %r1771, %r1768;
not.b32 %r1773, %r1772;
selp.b32 %r1774, 0, %r1773, %p464;
and.b32 %r1775, %r1774, %r1769;
shr.u32 %r1776, %r678, 9;
and.b32 %r1777, %r1776, 1;
shl.b32 %r1778, %r1777, %r1768;
or.b32 %r1779, %r1778, %r1775;
or.b32 %r2699, %r1779, 1;
add.s32 %r1780, %r2699, 2;
shl.b32 %r1781, %r1780, %r671;
or.b32 %r2700, %r1781, %r1770;
mov.u64 %rd640, %rd74;
$L__BB26_334:
setp.lt.u32 %p465, %r1144, 2;
@%p465 bra $L__BB26_336;
add.s32 %r1782, %r2683, %r1151;
mul.wide.u32 %rd395, %r1782, 4;
add.s64 %rd396, %rd2, %rd395;
st.global.u32 [%rd396], %r2700;
$L__BB26_336:
or.b32 %r1783, %r2699, %r2680;
add.u64 %rd76, %SPL, 6192;
mul.wide.u32 %rd398, %r674, 4;
add.s64 %rd399, %rd76, %rd398;
st.local.u32 [%rd399], %r1783;
add.s32 %r706, %r2683, 1;
add.s32 %r1784, %r2681, 1;
setp.lt.u32 %p466, %r1784, %r1141;
@%p466 bra $L__BB26_338;
bra.uni $L__BB26_337;
$L__BB26_338:
and.b32 %r1787, %r678, 64;
setp.eq.s32 %p467, %r1787, 0;
mov.u32 %r2716, 0;
mov.u32 %r2708, %r2716;
@%p467 bra $L__BB26_344;
setp.gt.u32 %p468, %r2750, 31;
@%p468 bra $L__BB26_343;
$L__BB26_340:
setp.ge.u32 %p469, %r2749, %r20;
mov.u16 %rs1394, 255;
@%p469 bra $L__BB26_342;
add.s32 %r709, %r2749, 1;
cvt.u64.u32 %rd400, %r2749;
add.s64 %rd401, %rd400, %rd3;
add.s64 %rd402, %rd1, %rd401;
ld.global.u8 %rs1394, [%rd402];
mov.u32 %r2749, %r709;
$L__BB26_342:
and.b16 %rs934, %rs1394, 255;
cvt.u64.u16 %rd403, %rs1394;
and.b64 %rd404, %rd403, 255;
shl.b64 %rd405, %rd404, %r2750;
or.b64 %rd640, %rd405, %rd640;
add.s32 %r1788, %r2750, 8;
cvt.u32.u16 %r1789, %rs1411;
cvt.s32.s8 %r1790, %r1789;
sub.s32 %r2750, %r1788, %r1790;
setp.eq.s16 %p470, %rs934, 255;
selp.u16 %rs1411, 1, 0, %p470;
setp.lt.u32 %p471, %r2750, 33;
@%p471 bra $L__BB26_340;
$L__BB26_343:
shr.u32 %r1791, %r678, 14;
and.b32 %r1792, %r1791, 1;
sub.s32 %r1793, %r679, %r1792;
shr.u64 %rd80, %rd640, %r1793;
sub.s32 %r2750, %r2750, %r1793;
cvt.u32.u64 %r1794, %rd640;
shl.b32 %r1795, %r1794, 31;
setp.eq.s32 %p472, %r1793, 0;
mov.u32 %r1796, -1;
shl.b32 %r1797, %r1796, %r1793;
not.b32 %r1798, %r1797;
selp.b32 %r1799, 0, %r1798, %p472;
and.b32 %r1800, %r1799, %r1794;
shr.u32 %r1801, %r678, 10;
and.b32 %r1802, %r1801, 1;
shl.b32 %r1803, %r1802, %r1793;
or.b32 %r1804, %r1803, %r1800;
or.b32 %r1805, %r1804, 1;
add.s32 %r1806, %r1805, 2;
shl.b32 %r1807, %r1806, %r671;
or.b32 %r2708, %r1807, %r1795;
mov.u64 %rd640, %rd80;
$L__BB26_344:
mul.wide.u32 %rd406, %r706, 4;
add.s64 %rd407, %rd2, %rd406;
st.global.u32 [%rd407], %r2708;
and.b32 %r1810, %r678, 128;
setp.eq.s32 %p473, %r1810, 0;
mov.u32 %r2680, %r2716;
@%p473 bra $L__BB26_350;
setp.gt.u32 %p474, %r2750, 31;
@%p474 bra $L__BB26_349;
$L__BB26_346:
setp.ge.u32 %p475, %r2749, %r20;
mov.u16 %rs1398, 255;
@%p475 bra $L__BB26_348;
add.s32 %r721, %r2749, 1;
cvt.u64.u32 %rd408, %r2749;
add.s64 %rd409, %rd408, %rd3;
add.s64 %rd410, %rd1, %rd409;
ld.global.u8 %rs1398, [%rd410];
mov.u32 %r2749, %r721;
$L__BB26_348:
and.b16 %rs936, %rs1398, 255;
cvt.u64.u16 %rd411, %rs1398;
and.b64 %rd412, %rd411, 255;
shl.b64 %rd413, %rd412, %r2750;
or.b64 %rd640, %rd413, %rd640;
add.s32 %r1811, %r2750, 8;
cvt.u32.u16 %r1812, %rs1411;
cvt.s32.s8 %r1813, %r1812;
sub.s32 %r2750, %r1811, %r1813;
setp.eq.s16 %p476, %rs936, 255;
selp.u16 %rs1411, 1, 0, %p476;
setp.lt.u32 %p477, %r2750, 33;
@%p477 bra $L__BB26_346;
$L__BB26_349:
shr.u32 %r1814, %r678, 15;
sub.s32 %r1815, %r679, %r1814;
shr.u64 %rd85, %rd640, %r1815;
sub.s32 %r2750, %r2750, %r1815;
cvt.u32.u64 %r1816, %rd640;
shl.b32 %r1817, %r1816, 31;
setp.eq.s32 %p478, %r1815, 0;
mov.u32 %r1818, -1;
shl.b32 %r1819, %r1818, %r1815;
not.b32 %r1820, %r1819;
selp.b32 %r1821, 0, %r1820, %p478;
and.b32 %r1822, %r1821, %r1816;
shr.u32 %r1823, %r678, 11;
and.b32 %r1824, %r1823, 1;
shl.b32 %r1825, %r1824, %r1815;
or.b32 %r1826, %r1825, %r1822;
or.b32 %r2680, %r1826, 1;
add.s32 %r1827, %r2680, 2;
shl.b32 %r1828, %r1827, %r671;
or.b32 %r2716, %r1828, %r1817;
mov.u64 %rd640, %rd85;
$L__BB26_350:
@%p465 bra $L__BB26_352;
add.s32 %r1829, %r706, %r1151;
mul.wide.u32 %rd414, %r1829, 4;
add.s64 %rd415, %rd2, %rd414;
st.global.u32 [%rd415], %r2716;
$L__BB26_352:
add.s32 %r2683, %r2683, 2;
add.s32 %r2682, %r674, 1;
add.s32 %r2681, %r2681, 2;
setp.lt.u32 %p480, %r2681, %r1141;
@%p480 bra $L__BB26_321;
bra.uni $L__BB26_353;
$L__BB26_533:
mov.u32 %r2289, 1;
st.global.u32 [%rd4], %r2289;
mov.u32 %r2290, 13;
st.global.u32 [%rd4+4], %r2290;
mov.u32 %r2291, 0;
st.global.u32 [%rd4+8], %r2291;
st.global.u32 [%rd4+12], %r2291;
bra.uni $L__BB26_541;
$L__BB26_337:
mov.u32 %r2680, 0;
$L__BB26_353:
add.s32 %r1830, %r674, 1;
mul.wide.u32 %rd418, %r1830, 4;
add.s64 %rd419, %rd76, %rd418;
st.local.u32 [%rd419], %r2680;
@%p304 bra $L__BB26_388;
mov.u32 %r2723, 2;
$L__BB26_355:
shr.u32 %r1836, %r2723, 1;
mul.lo.s32 %r2727, %r1836, %r1163;
mad.lo.s32 %r2729, %r2723, %r1151, %r14;
add.s32 %r745, %r2723, 1;
ld.local.u32 %r2726, [%rd76];
mov.u32 %r2728, 0;
mov.u32 %r2730, %r2728;
mov.u32 %r2731, %r2728;
$L__BB26_356:
mul.wide.u32 %rd420, %r2727, 2;
add.s64 %rd421, %rd13, %rd420;
ld.local.v2.u16 {%rs937, %rs938}, [%rd421];
cvt.u32.u16 %r755, %rs937;
cvt.u32.u16 %r1837, %rs938;
and.b32 %r1838, %r755, 240;
add.s32 %r1839, %r1838, 240;
and.b32 %r1840, %r1839, %r1838;
add.s32 %r756, %r2728, 1;
mul.wide.u32 %rd422, %r756, 4;
add.s64 %rd90, %rd76, %rd422;
ld.local.u32 %r757, [%rd90];
or.b32 %r1841, %r2726, %r757;
or.b32 %r1842, %r1841, 2;
clz.b32 %r1843, %r1842;
xor.b32 %r1844, %r1843, 31;
setp.eq.s32 %p482, %r1840, 0;
selp.b32 %r1845, 1, %r1844, %p482;
add.s32 %r758, %r1845, %r1837;
setp.gt.u32 %p483, %r758, %r670;
@%p483 bra $L__BB26_532;
and.b32 %r1847, %r755, 16;
setp.eq.s32 %p484, %r1847, 0;
mov.u32 %r2747, 0;
mov.u32 %r2739, %r2747;
@%p484 bra $L__BB26_363;
setp.gt.u32 %p485, %r2750, 31;
@%p485 bra $L__BB26_362;
$L__BB26_359:
setp.ge.u32 %p486, %r2749, %r20;
mov.u16 %rs1405, 255;
@%p486 bra $L__BB26_361;
add.s32 %r761, %r2749, 1;
cvt.u64.u32 %rd423, %r2749;
add.s64 %rd424, %rd423, %rd3;
add.s64 %rd425, %rd1, %rd424;
ld.global.u8 %rs1405, [%rd425];
mov.u32 %r2749, %r761;
$L__BB26_361:
and.b16 %rs942, %rs1405, 255;
cvt.u64.u16 %rd426, %rs1405;
and.b64 %rd427, %rd426, 255;
shl.b64 %rd428, %rd427, %r2750;
or.b64 %rd640, %rd428, %rd640;
add.s32 %r1848, %r2750, 8;
cvt.u32.u16 %r1849, %rs1411;
cvt.s32.s8 %r1850, %r1849;
sub.s32 %r2750, %r1848, %r1850;
setp.eq.s16 %p487, %rs942, 255;
selp.u16 %rs1411, 1, 0, %p487;
setp.lt.u32 %p488, %r2750, 33;
@%p488 bra $L__BB26_359;
$L__BB26_362:
shr.u32 %r1851, %r755, 12;
and.b32 %r1852, %r1851, 1;
sub.s32 %r1853, %r758, %r1852;
shr.u64 %rd94, %rd640, %r1853;
sub.s32 %r2750, %r2750, %r1853;
cvt.u32.u64 %r1854, %rd640;
shl.b32 %r1855, %r1854, 31;
setp.eq.s32 %p489, %r1853, 0;
mov.u32 %r1856, -1;
shl.b32 %r1857, %r1856, %r1853;
not.b32 %r1858, %r1857;
selp.b32 %r1859, 0, %r1858, %p489;
and.b32 %r1860, %r1859, %r1854;
shr.u32 %r1861, %r755, 8;
and.b32 %r1862, %r1861, 1;
shl.b32 %r1863, %r1862, %r1853;
or.b32 %r1864, %r1863, %r1860;
or.b32 %r1865, %r1864, 1;
add.s32 %r1866, %r1865, 2;
shl.b32 %r1867, %r1866, %r671;
or.b32 %r2739, %r1867, %r1855;
mov.u64 %rd640, %rd94;
$L__BB26_363:
mul.wide.u32 %rd429, %r2729, 4;
add.s64 %rd430, %rd2, %rd429;
st.global.u32 [%rd430], %r2739;
and.b32 %r1870, %r755, 32;
setp.eq.s32 %p490, %r1870, 0;
mov.u32 %r2748, %r2747;
@%p490 bra $L__BB26_369;
setp.gt.u32 %p491, %r2750, 31;
@%p491 bra $L__BB26_368;
$L__BB26_365:
setp.ge.u32 %p492, %r2749, %r20;
mov.u16 %rs1409, 255;
@%p492 bra $L__BB26_367;
add.s32 %r773, %r2749, 1;
cvt.u64.u32 %rd431, %r2749;
add.s64 %rd432, %rd431, %rd3;
add.s64 %rd433, %rd1, %rd432;
ld.global.u8 %rs1409, [%rd433];
mov.u32 %r2749, %r773;
$L__BB26_367:
and.b16 %rs944, %rs1409, 255;
cvt.u64.u16 %rd434, %rs1409;
and.b64 %rd435, %rd434, 255;
shl.b64 %rd436, %rd435, %r2750;
or.b64 %rd640, %rd436, %rd640;
add.s32 %r1871, %r2750, 8;
cvt.u32.u16 %r1872, %rs1411;
cvt.s32.s8 %r1873, %r1872;
sub.s32 %r2750, %r1871, %r1873;
setp.eq.s16 %p493, %rs944, 255;
selp.u16 %rs1411, 1, 0, %p493;
setp.lt.u32 %p494, %r2750, 33;
@%p494 bra $L__BB26_365;
$L__BB26_368:
shr.u32 %r1874, %r755, 13;
and.b32 %r1875, %r1874, 1;
sub.s32 %r1876, %r758, %r1875;
shr.u64 %rd99, %rd640, %r1876;
sub.s32 %r2750, %r2750, %r1876;
cvt.u32.u64 %r1877, %rd640;
shl.b32 %r1878, %r1877, 31;
setp.eq.s32 %p495, %r1876, 0;
mov.u32 %r1879, -1;
shl.b32 %r1880, %r1879, %r1876;
not.b32 %r1881, %r1880;
selp.b32 %r1882, 0, %r1881, %p495;
and.b32 %r1883, %r1882, %r1877;
shr.u32 %r1884, %r755, 9;
and.b32 %r1885, %r1884, 1;
shl.b32 %r1886, %r1885, %r1876;
or.b32 %r1887, %r1886, %r1883;
or.b32 %r2748, %r1887, 1;
add.s32 %r1888, %r2748, 2;
shl.b32 %r1889, %r1888, %r671;
or.b32 %r2747, %r1889, %r1878;
mov.u64 %rd640, %rd99;
$L__BB26_369:
setp.ge.u32 %p496, %r745, %r1144;
@%p496 bra $L__BB26_371;
add.s32 %r1890, %r2729, %r1151;
mul.wide.u32 %rd437, %r1890, 4;
add.s64 %rd438, %rd2, %rd437;
st.global.u32 [%rd438], %r2747;
$L__BB26_371:
or.b32 %r1892, %r2748, %r2730;
mul.wide.u32 %rd439, %r2728, 4;
add.s64 %rd440, %rd76, %rd439;
st.local.u32 [%rd440], %r1892;
add.s32 %r785, %r2729, 1;
add.s32 %r1893, %r2731, 1;
setp.ge.u32 %p497, %r1893, %r1141;
mov.u32 %r2730, 0;
@%p497 bra $L__BB26_387;
and.b32 %r1895, %r755, 64;
setp.eq.s32 %p498, %r1895, 0;
mov.u32 %r2764, 0;
mov.u32 %r2756, %r2764;
@%p498 bra $L__BB26_378;
setp.gt.u32 %p499, %r2750, 31;
@%p499 bra $L__BB26_377;
$L__BB26_374:
setp.ge.u32 %p500, %r2749, %r20;
mov.u16 %rs1413, 255;
@%p500 bra $L__BB26_376;
add.s32 %r788, %r2749, 1;
cvt.u64.u32 %rd441, %r2749;
add.s64 %rd442, %rd441, %rd3;
add.s64 %rd443, %rd1, %rd442;
ld.global.u8 %rs1413, [%rd443];
mov.u32 %r2749, %r788;
$L__BB26_376:
and.b16 %rs946, %rs1413, 255;
cvt.u64.u16 %rd444, %rs1413;
and.b64 %rd445, %rd444, 255;
shl.b64 %rd446, %rd445, %r2750;
or.b64 %rd640, %rd446, %rd640;
add.s32 %r1896, %r2750, 8;
cvt.u32.u16 %r1897, %rs1411;
cvt.s32.s8 %r1898, %r1897;
sub.s32 %r2750, %r1896, %r1898;
setp.eq.s16 %p501, %rs946, 255;
selp.u16 %rs1411, 1, 0, %p501;
setp.lt.u32 %p502, %r2750, 33;
@%p502 bra $L__BB26_374;
$L__BB26_377:
shr.u32 %r1899, %r755, 14;
and.b32 %r1900, %r1899, 1;
sub.s32 %r1901, %r758, %r1900;
shr.u64 %rd104, %rd640, %r1901;
sub.s32 %r2750, %r2750, %r1901;
cvt.u32.u64 %r1902, %rd640;
shl.b32 %r1903, %r1902, 31;
setp.eq.s32 %p503, %r1901, 0;
mov.u32 %r1904, -1;
shl.b32 %r1905, %r1904, %r1901;
not.b32 %r1906, %r1905;
selp.b32 %r1907, 0, %r1906, %p503;
and.b32 %r1908, %r1907, %r1902;
shr.u32 %r1909, %r755, 10;
and.b32 %r1910, %r1909, 1;
shl.b32 %r1911, %r1910, %r1901;
or.b32 %r1912, %r1911, %r1908;
or.b32 %r1913, %r1912, 1;
add.s32 %r1914, %r1913, 2;
shl.b32 %r1915, %r1914, %r671;
or.b32 %r2756, %r1915, %r1903;
mov.u64 %rd640, %rd104;
$L__BB26_378:
mul.wide.u32 %rd447, %r785, 4;
add.s64 %rd448, %rd2, %rd447;
st.global.u32 [%rd448], %r2756;
and.b32 %r1918, %r755, 128;
setp.eq.s32 %p504, %r1918, 0;
mov.u32 %r2730, %r2764;
@%p504 bra $L__BB26_384;
setp.gt.u32 %p505, %r2750, 31;
@%p505 bra $L__BB26_383;
$L__BB26_380:
setp.ge.u32 %p506, %r2749, %r20;
mov.u16 %rs1417, 255;
@%p506 bra $L__BB26_382;
add.s32 %r800, %r2749, 1;
cvt.u64.u32 %rd449, %r2749;
add.s64 %rd450, %rd449, %rd3;
add.s64 %rd451, %rd1, %rd450;
ld.global.u8 %rs1417, [%rd451];
mov.u32 %r2749, %r800;
$L__BB26_382:
and.b16 %rs948, %rs1417, 255;
cvt.u64.u16 %rd452, %rs1417;
and.b64 %rd453, %rd452, 255;
shl.b64 %rd454, %rd453, %r2750;
or.b64 %rd640, %rd454, %rd640;
add.s32 %r1919, %r2750, 8;
cvt.u32.u16 %r1920, %rs1411;
cvt.s32.s8 %r1921, %r1920;
sub.s32 %r2750, %r1919, %r1921;
setp.eq.s16 %p507, %rs948, 255;
selp.u16 %rs1411, 1, 0, %p507;
setp.lt.u32 %p508, %r2750, 33;
@%p508 bra $L__BB26_380;
$L__BB26_383:
shr.u32 %r1922, %r755, 15;
sub.s32 %r1923, %r758, %r1922;
shr.u64 %rd109, %rd640, %r1923;
sub.s32 %r2750, %r2750, %r1923;
cvt.u32.u64 %r1924, %rd640;
shl.b32 %r1925, %r1924, 31;
setp.eq.s32 %p509, %r1923, 0;
mov.u32 %r1926, -1;
shl.b32 %r1927, %r1926, %r1923;
not.b32 %r1928, %r1927;
selp.b32 %r1929, 0, %r1928, %p509;
and.b32 %r1930, %r1929, %r1924;
shr.u32 %r1931, %r755, 11;
and.b32 %r1932, %r1931, 1;
shl.b32 %r1933, %r1932, %r1923;
or.b32 %r1934, %r1933, %r1930;
or.b32 %r2730, %r1934, 1;
add.s32 %r1935, %r2730, 2;
shl.b32 %r1936, %r1935, %r671;
or.b32 %r2764, %r1936, %r1925;
mov.u64 %rd640, %rd109;
$L__BB26_384:
@%p496 bra $L__BB26_386;
add.s32 %r1937, %r785, %r1151;
mul.wide.u32 %rd455, %r1937, 4;
add.s64 %rd456, %rd2, %rd455;
st.global.u32 [%rd456], %r2764;
$L__BB26_386:
add.s32 %r2729, %r2729, 2;
add.s32 %r2727, %r2727, 2;
add.s32 %r2731, %r2731, 2;
setp.lt.u32 %p511, %r2731, %r1141;
mov.u32 %r2726, %r757;
mov.u32 %r2728, %r756;
@%p511 bra $L__BB26_356;
$L__BB26_387:
st.local.u32 [%rd90], %r2730;
add.s32 %r2723, %r2723, 2;
setp.lt.u32 %p512, %r2723, %r1144;
@%p512 bra $L__BB26_355;
$L__BB26_388:
setp.lt.u32 %p513, %r17, 2;
@%p513 bra $L__BB26_541;
add.s32 %r1938, %r1144, 3;
shr.u32 %r819, %r1938, 2;
add.s32 %r1939, %r819, 1;
add.s32 %r820, %r1141, 3;
shr.u32 %r821, %r820, 2;
add.s32 %r1940, %r821, 9;
and.b32 %r822, %r1940, 2147483640;
add.s32 %r823, %r821, 8;
setp.gt.u32 %p514, %r822, 72;
mul.lo.s32 %r1941, %r1939, %r822;
setp.gt.u32 %p515, %r1941, 528;
or.pred %p516, %p514, %p515;
@%p516 bra $L__BB26_531;
bra.uni $L__BB26_390;
$L__BB26_531:
mov.u32 %r2275, 2;
st.global.u32 [%rd4], %r2275;
mov.u32 %r2276, 15;
st.global.u32 [%rd4+4], %r2276;
mov.u32 %r2277, 0;
st.global.u32 [%rd4+8], %r2277;
st.global.u32 [%rd4+12], %r2277;
bra.uni $L__BB26_541;
$L__BB26_532:
mov.u32 %r2282, 1;
st.global.u32 [%rd4], %r2282;
mov.u32 %r2283, 14;
st.global.u32 [%rd4+4], %r2283;
mov.u32 %r2284, 0;
st.global.u32 [%rd4+8], %r2284;
st.global.u32 [%rd4+12], %r2284;
$L__BB26_541:
ret;
$L__BB26_390:
setp.gt.u32 %p517, %r823, 72;
@%p517 bra $L__BB26_530;
bra.uni $L__BB26_391;
$L__BB26_530:
mov.u32 %r2268, 2;
st.global.u32 [%rd4], %r2268;
mov.u32 %r2269, 16;
st.global.u32 [%rd4+4], %r2269;
mov.u32 %r2270, 0;
st.global.u32 [%rd4+8], %r2270;
st.global.u32 [%rd4+12], %r2270;
bra.uni $L__BB26_541;
$L__BB26_391:
add.u64 %rd112, %SPL, 6720;
mov.u32 %r1942, 0;
mov.u32 %r2771, %r1942;
$L__BB26_392:
shr.u32 %r1945, %r2771, 1;
mul.lo.s32 %r2773, %r1945, %r1163;
shr.u32 %r1946, %r2771, 2;
mul.lo.s32 %r2772, %r1946, %r822;
mov.u32 %r2774, %r1942;
$L__BB26_393:
mul.wide.u32 %rd458, %r2773, 2;
add.s64 %rd459, %rd13, %rd458;
ld.local.u16 %rs949, [%rd459];
and.b16 %rs950, %rs949, 48;
shr.u16 %rs951, %rs950, 4;
and.b16 %rs952, %rs949, 192;
shr.u16 %rs953, %rs952, 2;
add.s32 %r1947, %r2773, 2;
mul.wide.u32 %rd460, %r1947, 2;
add.s64 %rd461, %rd13, %rd460;
ld.local.u16 %rs954, [%rd461];
shl.b16 %rs955, %rs954, 4;
and.b16 %rs956, %rs955, 768;
shl.b16 %rs957, %rs954, 6;
and.b16 %rs958, %rs957, 12288;
add.s32 %r1948, %r2773, %r1163;
mul.wide.u32 %rd462, %r1948, 2;
add.s64 %rd463, %rd13, %rd462;
ld.local.u16 %rs959, [%rd463];
and.b16 %rs960, %rs959, 48;
shr.u16 %rs961, %rs960, 2;
and.b16 %rs962, %rs959, 192;
add.s32 %r1949, %r1948, 2;
mul.wide.u32 %rd464, %r1949, 2;
add.s64 %rd465, %rd13, %rd464;
ld.local.u16 %rs963, [%rd465];
shl.b16 %rs964, %rs963, 6;
and.b16 %rs965, %rs964, 3072;
shl.b16 %rs966, %rs963, 8;
and.b16 %rs967, %rs966, -16384;
or.b16 %rs968, %rs951, %rs953;
or.b16 %rs969, %rs968, %rs958;
or.b16 %rs970, %rs969, %rs956;
or.b16 %rs971, %rs970, %rs962;
or.b16 %rs972, %rs971, %rs961;
or.b16 %rs973, %rs972, %rs967;
or.b16 %rs974, %rs973, %rs965;
mul.wide.u32 %rd466, %r2772, 2;
add.s64 %rd467, %rd112, %rd466;
st.local.u16 [%rd467], %rs974;
add.s32 %r2773, %r2773, 4;
add.s32 %r2772, %r2772, 1;
add.s32 %r2774, %r2774, 4;
setp.lt.u32 %p518, %r2774, %r1141;
@%p518 bra $L__BB26_393;
mul.wide.u32 %rd468, %r2772, 2;
add.s64 %rd469, %rd112, %rd468;
mov.u16 %rs975, 0;
st.local.u16 [%rd469], %rs975;
add.s32 %r2771, %r2771, 4;
setp.lt.u32 %p519, %r2771, %r1144;
@%p519 bra $L__BB26_392;
mul.lo.s32 %r835, %r822, %r819;
add.s32 %r836, %r821, 1;
and.b32 %r2886, %r836, 3;
setp.lt.u32 %p520, %r820, 12;
mov.u32 %r2777, 0;
@%p520 bra $L__BB26_398;
sub.s32 %r2776, %r836, %r2886;
mov.u32 %r2777, 0;
$L__BB26_397:
add.s32 %r1952, %r2777, %r835;
mul.wide.u32 %rd470, %r1952, 2;
add.s64 %rd471, %rd112, %rd470;
mov.u16 %rs976, 0;
st.local.v4.u16 [%rd471], {%rs976, %rs976, %rs976, %rs976};
add.s32 %r2777, %r2777, 4;
add.s32 %r2776, %r2776, -4;
setp.ne.s32 %p521, %r2776, 0;
@%p521 bra $L__BB26_397;
$L__BB26_398:
setp.eq.s32 %p522, %r2886, 0;
@%p522 bra $L__BB26_401;
mov.u32 %r2779, %r2886;
$L__BB26_400:
.pragma "nounroll";
add.s32 %r1953, %r2777, %r835;
mul.wide.u32 %rd472, %r1953, 2;
add.s64 %rd473, %rd112, %rd472;
mov.u16 %rs977, 0;
st.local.u16 [%rd473], %rs977;
add.s32 %r2777, %r2777, 1;
add.s32 %r2779, %r2779, -1;
setp.ne.s32 %p523, %r2779, 0;
@%p523 bra $L__BB26_400;
$L__BB26_401:
and.b32 %r2783, %r823, 3;
sub.s32 %r2781, %r823, %r2783;
add.u64 %rd113, %SPL, 7776;
mov.u32 %r2782, 0;
$L__BB26_402:
mul.wide.u32 %rd475, %r2782, 2;
add.s64 %rd476, %rd113, %rd475;
mov.u16 %rs978, 0;
st.local.u16 [%rd476], %rs978;
st.local.u16 [%rd476+2], %rs978;
st.local.u16 [%rd476+4], %rs978;
st.local.u16 [%rd476+6], %rs978;
add.s32 %r2782, %r2782, 4;
add.s32 %r2781, %r2781, -4;
setp.ne.s32 %p524, %r2781, 0;
@%p524 bra $L__BB26_402;
setp.eq.s32 %p525, %r2783, 0;
@%p525 bra $L__BB26_405;
$L__BB26_404:
.pragma "nounroll";
mul.wide.u32 %rd477, %r2782, 2;
add.s64 %rd478, %rd113, %rd477;
mov.u16 %rs979, 0;
st.local.u16 [%rd478], %rs979;
add.s32 %r2782, %r2782, 1;
add.s32 %r2783, %r2783, -1;
setp.ne.s32 %p526, %r2783, 0;
@%p526 bra $L__BB26_404;
$L__BB26_405:
mov.u32 %r1957, 0;
mov.u64 %rd651, 0;
cvt.u64.u32 %rd480, %r1146;
add.s64 %rd114, %rd480, %rd3;
add.s32 %r858, %r669, -2;
mov.u32 %r1958, 3;
shl.b32 %r859, %r1958, %r858;
shl.b32 %r860, %r1151, 1;
mul.lo.s32 %r861, %r1151, 3;
mov.u16 %rs1426, 0;
mov.u32 %r2784, %r1957;
mov.u32 %r2793, %r1957;
mov.u32 %r2794, %r1957;
$L__BB26_406:
sub.s32 %r1961, %r1144, %r2784;
setp.lt.u32 %p527, %r1961, 4;
setp.lt.u32 %p528, %r1961, 3;
setp.lt.u32 %p529, %r1961, 2;
selp.b32 %r1962, 4369, 13107, %p529;
selp.b32 %r1963, %r1962, 30583, %p528;
selp.b32 %r865, %r1963, 65535, %p527;
shr.u32 %r1964, %r2784, 2;
mul.lo.s32 %r866, %r1964, %r822;
add.s32 %r867, %r866, %r822;
mad.lo.s32 %r868, %r2784, %r1151, %r14;
mov.u32 %r2787, %r1957;
mov.u32 %r2788, %r1957;
$L__BB26_407:
add.s32 %r1966, %r2787, 4;
sub.s32 %r1967, %r1966, %r1141;
mov.u32 %r2817, 0;
max.s32 %r1968, %r1967, 0;
shl.b32 %r1969, %r1968, 2;
shr.u32 %r1970, %r865, %r1969;
shr.u32 %r873, %r2787, 2;
mul.wide.u32 %rd481, %r873, 2;
add.s64 %rd117, %rd113, %rd481;
ld.local.u16 %rs981, [%rd117];
ld.local.u16 %rs982, [%rd117+2];
mov.b32 %r1971, {%rs981, %rs982};
add.s32 %r1972, %r867, %r873;
mul.wide.u32 %rd482, %r1972, 2;
add.s64 %rd483, %rd112, %rd482;
ld.local.u16 %rs983, [%rd483];
add.s32 %r1973, %r1972, 1;
mul.wide.u32 %rd484, %r1973, 2;
add.s64 %rd485, %rd112, %rd484;
ld.local.u16 %rs984, [%rd485];
mov.b32 %r1974, {%rs983, %rs984};
and.b32 %r1975, %r1971, -2004318072;
shr.u32 %r1976, %r1975, 3;
shl.b32 %r1977, %r1974, 3;
and.b32 %r1978, %r1977, -2004318072;
setp.eq.s32 %p530, %r15, 0;
selp.b32 %r1979, %r1978, 0, %p530;
or.b32 %r874, %r1979, %r1976;
add.s32 %r1980, %r873, %r866;
mul.wide.u32 %rd486, %r1980, 2;
add.s64 %rd487, %rd112, %rd486;
ld.local.u16 %rs985, [%rd487];
add.s32 %r1981, %r1980, 1;
mul.wide.u32 %rd488, %r1981, 2;
add.s64 %rd489, %rd112, %rd488;
ld.local.u16 %rs986, [%rd489];
mov.b32 %r875, {%rs985, %rs986};
shl.b32 %r1982, %r875, 1;
and.b32 %r1983, %r1982, -286331154;
or.b32 %r1984, %r1983, %r875;
and.b32 %r1985, %r875, -286331154;
shr.u32 %r1986, %r1985, 1;
or.b32 %r1987, %r1984, %r1986;
or.b32 %r1988, %r1987, %r874;
shl.b32 %r1989, %r1988, 4;
shr.u32 %r1990, %r1988, 4;
shr.u32 %r1991, %r2788, 12;
or.b32 %r1992, %r1988, %r1991;
or.b32 %r1993, %r1992, %r1989;
or.b32 %r1994, %r1993, %r1990;
not.b32 %r1995, %r875;
and.b32 %r876, %r1970, %r1995;
and.b32 %r877, %r876, %r1994;
setp.eq.s32 %p531, %r877, 0;
@%p531 bra $L__BB26_486;
setp.gt.u32 %p532, %r2794, 31;
@%p532 bra $L__BB26_412;
$L__BB26_409:
setp.ge.u32 %p533, %r2793, %r2900;
mov.u16 %rs1424, 0;
@%p533 bra $L__BB26_411;
add.s32 %r881, %r2793, 1;
cvt.u64.u32 %rd490, %r2793;
add.s64 %rd491, %rd114, %rd490;
add.s64 %rd492, %rd1, %rd491;
ld.global.u8 %rs1424, [%rd492];
mov.u32 %r2793, %r881;
$L__BB26_411:
and.b16 %rs988, %rs1424, 255;
cvt.u64.u16 %rd493, %rs1424;
and.b64 %rd494, %rd493, 255;
shl.b64 %rd495, %rd494, %r2794;
or.b64 %rd651, %rd495, %rd651;
cvt.u32.u16 %r1996, %rs1426;
cvt.s32.s8 %r1997, %r1996;
mov.u32 %r1998, 8;
sub.s32 %r1999, %r1998, %r1997;
add.s32 %r2794, %r1999, %r2794;
setp.eq.s16 %p534, %rs988, 255;
selp.u16 %rs1426, 1, 0, %p534;
setp.lt.u32 %p535, %r2794, 33;
@%p535 bra $L__BB26_409;
$L__BB26_412:
cvt.u32.u64 %r2818, %rd651;
and.b32 %r2001, %r877, 15;
setp.eq.s32 %p536, %r2001, 0;
mov.u32 %r2819, 0;
mov.u32 %r2817, %r877;
@%p536 bra $L__BB26_421;
and.b32 %r2003, %r877, 1;
setp.eq.b32 %p537, %r2003, 1;
mov.pred %p538, 0;
xor.pred %p539, %p537, %p538;
not.pred %p540, %p539;
mov.u32 %r2819, 0;
mov.u32 %r2817, %r877;
@%p540 bra $L__BB26_415;
and.b32 %r2005, %r877, -2;
and.b32 %r2006, %r2818, 1;
neg.s32 %r2007, %r2006;
mov.u32 %r2819, 1;
and.b32 %r2008, %r2007, %r876;
and.b32 %r2009, %r2008, 51;
or.b32 %r2817, %r2009, %r2005;
shr.u32 %r2818, %r2818, 1;
$L__BB26_415:
and.b32 %r2010, %r2817, 2;
setp.eq.s32 %p541, %r2010, 0;
@%p541 bra $L__BB26_417;
and.b32 %r2011, %r2817, -3;
and.b32 %r2012, %r2818, 1;
neg.s32 %r2013, %r2012;
and.b32 %r2014, %r2013, %r876;
and.b32 %r2015, %r2014, 118;
or.b32 %r2817, %r2015, %r2011;
shr.u32 %r2818, %r2818, 1;
add.s32 %r2819, %r2819, 1;
$L__BB26_417:
and.b32 %r2016, %r2817, 4;
setp.eq.s32 %p542, %r2016, 0;
@%p542 bra $L__BB26_419;
and.b32 %r2017, %r2817, -5;
and.b32 %r2018, %r2818, 1;
neg.s32 %r2019, %r2018;
and.b32 %r2020, %r2019, %r876;
and.b32 %r2021, %r2020, 236;
or.b32 %r2817, %r2021, %r2017;
shr.u32 %r2818, %r2818, 1;
add.s32 %r2819, %r2819, 1;
$L__BB26_419:
and.b32 %r2022, %r2817, 8;
setp.eq.s32 %p543, %r2022, 0;
@%p543 bra $L__BB26_421;
and.b32 %r2023, %r2817, -9;
and.b32 %r2024, %r2818, 1;
neg.s32 %r2025, %r2024;
and.b32 %r2026, %r2025, %r876;
and.b32 %r2027, %r2026, 200;
or.b32 %r2817, %r2027, %r2023;
shr.u32 %r2818, %r2818, 1;
add.s32 %r2819, %r2819, 1;
$L__BB26_421:
and.b32 %r2028, %r2817, 240;
setp.eq.s32 %p544, %r2028, 0;
@%p544 bra $L__BB26_430;
and.b32 %r2029, %r2817, 16;
setp.eq.s32 %p545, %r2029, 0;
@%p545 bra $L__BB26_424;
and.b32 %r2030, %r2817, -17;
and.b32 %r2031, %r2818, 1;
neg.s32 %r2032, %r2031;
and.b32 %r2033, %r2032, %r876;
and.b32 %r2034, %r2033, 816;
or.b32 %r2817, %r2034, %r2030;
shr.u32 %r2818, %r2818, 1;
add.s32 %r2819, %r2819, 1;
$L__BB26_424:
and.b32 %r2035, %r2817, 32;
setp.eq.s32 %p546, %r2035, 0;
@%p546 bra $L__BB26_426;
and.b32 %r2036, %r2817, -33;
and.b32 %r2037, %r2818, 1;
neg.s32 %r2038, %r2037;
and.b32 %r2039, %r2038, %r876;
and.b32 %r2040, %r2039, 1888;
or.b32 %r2817, %r2040, %r2036;
shr.u32 %r2818, %r2818, 1;
add.s32 %r2819, %r2819, 1;
$L__BB26_426:
and.b32 %r2041, %r2817, 64;
setp.eq.s32 %p547, %r2041, 0;
@%p547 bra $L__BB26_428;
and.b32 %r2042, %r2817, -65;
and.b32 %r2043, %r2818, 1;
neg.s32 %r2044, %r2043;
and.b32 %r2045, %r2044, %r876;
and.b32 %r2046, %r2045, 3776;
or.b32 %r2817, %r2046, %r2042;
shr.u32 %r2818, %r2818, 1;
add.s32 %r2819, %r2819, 1;
$L__BB26_428:
and.b32 %r2047, %r2817, 128;
setp.eq.s32 %p548, %r2047, 0;
@%p548 bra $L__BB26_430;
and.b32 %r2048, %r2817, -129;
and.b32 %r2049, %r2818, 1;
neg.s32 %r2050, %r2049;
and.b32 %r2051, %r2050, %r876;
and.b32 %r2052, %r2051, 3200;
or.b32 %r2817, %r2052, %r2048;
shr.u32 %r2818, %r2818, 1;
add.s32 %r2819, %r2819, 1;
$L__BB26_430:
and.b32 %r2053, %r2817, 3840;
setp.eq.s32 %p549, %r2053, 0;
@%p549 bra $L__BB26_439;
and.b32 %r2054, %r2817, 256;
setp.eq.s32 %p550, %r2054, 0;
@%p550 bra $L__BB26_433;
and.b32 %r2055, %r2817, -257;
and.b32 %r2056, %r2818, 1;
neg.s32 %r2057, %r2056;
and.b32 %r2058, %r2057, %r876;
and.b32 %r2059, %r2058, 13056;
or.b32 %r2817, %r2059, %r2055;
shr.u32 %r2818, %r2818, 1;
add.s32 %r2819, %r2819, 1;
$L__BB26_433:
and.b32 %r2060, %r2817, 512;
setp.eq.s32 %p551, %r2060, 0;
@%p551 bra $L__BB26_435;
and.b32 %r2061, %r2817, -513;
and.b32 %r2062, %r2818, 1;
neg.s32 %r2063, %r2062;
and.b32 %r2064, %r2063, %r876;
and.b32 %r2065, %r2064, 30208;
or.b32 %r2817, %r2065, %r2061;
shr.u32 %r2818, %r2818, 1;
add.s32 %r2819, %r2819, 1;
$L__BB26_435:
and.b32 %r2066, %r2817, 1024;
setp.eq.s32 %p552, %r2066, 0;
@%p552 bra $L__BB26_437;
and.b32 %r2067, %r2817, -1025;
and.b32 %r2068, %r2818, 1;
neg.s32 %r2069, %r2068;
and.b32 %r2070, %r2069, %r876;
and.b32 %r2071, %r2070, 60416;
or.b32 %r2817, %r2071, %r2067;
shr.u32 %r2818, %r2818, 1;
add.s32 %r2819, %r2819, 1;
$L__BB26_437:
and.b32 %r2072, %r2817, 2048;
setp.eq.s32 %p553, %r2072, 0;
@%p553 bra $L__BB26_439;
and.b32 %r2073, %r2817, -2049;
and.b32 %r2074, %r2818, 1;
neg.s32 %r2075, %r2074;
and.b32 %r2076, %r2075, %r876;
and.b32 %r2077, %r2076, 51200;
or.b32 %r2817, %r2077, %r2073;
shr.u32 %r2818, %r2818, 1;
add.s32 %r2819, %r2819, 1;
$L__BB26_439:
and.b32 %r2078, %r2817, 61440;
setp.eq.s32 %p554, %r2078, 0;
@%p554 bra $L__BB26_448;
and.b32 %r2079, %r2817, 4096;
setp.eq.s32 %p555, %r2079, 0;
@%p555 bra $L__BB26_442;
and.b32 %r2080, %r2817, -4097;
and.b32 %r2081, %r2818, 1;
neg.s32 %r2082, %r2081;
and.b32 %r2083, %r2082, %r876;
and.b32 %r2084, %r2083, 208896;
or.b32 %r2817, %r2084, %r2080;
shr.u32 %r2818, %r2818, 1;
add.s32 %r2819, %r2819, 1;
$L__BB26_442:
and.b32 %r2085, %r2817, 8192;
setp.eq.s32 %p556, %r2085, 0;
@%p556 bra $L__BB26_444;
and.b32 %r2086, %r2817, -8193;
and.b32 %r2087, %r2818, 1;
neg.s32 %r2088, %r2087;
and.b32 %r2089, %r2088, %r876;
and.b32 %r2090, %r2089, 483328;
or.b32 %r2817, %r2090, %r2086;
shr.u32 %r2818, %r2818, 1;
add.s32 %r2819, %r2819, 1;
$L__BB26_444:
and.b32 %r2091, %r2817, 16384;
setp.eq.s32 %p557, %r2091, 0;
@%p557 bra $L__BB26_446;
and.b32 %r2092, %r2817, -16385;
and.b32 %r2093, %r2818, 1;
neg.s32 %r2094, %r2093;
and.b32 %r2095, %r2094, %r876;
and.b32 %r2096, %r2095, 966656;
or.b32 %r2817, %r2096, %r2092;
shr.u32 %r2818, %r2818, 1;
add.s32 %r2819, %r2819, 1;
$L__BB26_446:
cvt.u16.u32 %rs989, %r2817;
setp.gt.s16 %p558, %rs989, -1;
@%p558 bra $L__BB26_448;
and.b32 %r2097, %r2817, -32769;
and.b32 %r2098, %r2818, 1;
neg.s32 %r2099, %r2098;
and.b32 %r2100, %r2099, %r876;
and.b32 %r2101, %r2100, 819200;
or.b32 %r2817, %r2101, %r2097;
shr.u32 %r2818, %r2818, 1;
add.s32 %r2819, %r2819, 1;
$L__BB26_448:
setp.eq.s32 %p559, %r2817, 0;
@%p559 bra $L__BB26_485;
add.s32 %r982, %r868, %r2787;
and.b32 %r2102, %r2817, 15;
setp.eq.s32 %p560, %r2102, 0;
@%p560 bra $L__BB26_458;
and.b32 %r2103, %r2817, 1;
setp.eq.b32 %p561, %r2103, 1;
mov.pred %p562, 0;
xor.pred %p563, %p561, %p562;
not.pred %p564, %p563;
@%p564 bra $L__BB26_452;
shl.b32 %r2104, %r2818, 31;
or.b32 %r2105, %r2104, %r859;
mul.wide.u32 %rd496, %r982, 4;
add.s64 %rd497, %rd2, %rd496;
st.global.u32 [%rd497], %r2105;
shr.u32 %r2818, %r2818, 1;
add.s32 %r2819, %r2819, 1;
$L__BB26_452:
and.b32 %r2106, %r2817, 2;
setp.eq.s32 %p565, %r2106, 0;
@%p565 bra $L__BB26_454;
shl.b32 %r2107, %r2818, 31;
or.b32 %r2108, %r2107, %r859;
add.s32 %r2109, %r982, %r1151;
mul.wide.u32 %rd498, %r2109, 4;
add.s64 %rd499, %rd2, %rd498;
st.global.u32 [%rd499], %r2108;
shr.u32 %r2818, %r2818, 1;
add.s32 %r2819, %r2819, 1;
$L__BB26_454:
and.b32 %r2110, %r2817, 4;
setp.eq.s32 %p566, %r2110, 0;
@%p566 bra $L__BB26_456;
shl.b32 %r2111, %r2818, 31;
or.b32 %r2112, %r2111, %r859;
add.s32 %r2113, %r982, %r860;
mul.wide.u32 %rd500, %r2113, 4;
add.s64 %rd501, %rd2, %rd500;
st.global.u32 [%rd501], %r2112;
shr.u32 %r2818, %r2818, 1;
add.s32 %r2819, %r2819, 1;
$L__BB26_456:
and.b32 %r2114, %r2817, 8;
setp.eq.s32 %p567, %r2114, 0;
@%p567 bra $L__BB26_458;
shl.b32 %r2115, %r2818, 31;
or.b32 %r2116, %r2115, %r859;
add.s32 %r2117, %r982, %r861;
mul.wide.u32 %rd502, %r2117, 4;
add.s64 %rd503, %rd2, %rd502;
st.global.u32 [%rd503], %r2116;
shr.u32 %r2818, %r2818, 1;
add.s32 %r2819, %r2819, 1;
$L__BB26_458:
add.s32 %r999, %r982, 1;
and.b32 %r2118, %r2817, 240;
setp.eq.s32 %p568, %r2118, 0;
@%p568 bra $L__BB26_467;
and.b32 %r2119, %r2817, 16;
setp.eq.s32 %p569, %r2119, 0;
@%p569 bra $L__BB26_461;
shl.b32 %r2120, %r2818, 31;
or.b32 %r2121, %r2120, %r859;
mul.wide.u32 %rd504, %r999, 4;
add.s64 %rd505, %rd2, %rd504;
st.global.u32 [%rd505], %r2121;
shr.u32 %r2818, %r2818, 1;
add.s32 %r2819, %r2819, 1;
$L__BB26_461:
and.b32 %r2122, %r2817, 32;
setp.eq.s32 %p570, %r2122, 0;
@%p570 bra $L__BB26_463;
shl.b32 %r2123, %r2818, 31;
or.b32 %r2124, %r2123, %r859;
add.s32 %r2125, %r999, %r1151;
mul.wide.u32 %rd506, %r2125, 4;
add.s64 %rd507, %rd2, %rd506;
st.global.u32 [%rd507], %r2124;
shr.u32 %r2818, %r2818, 1;
add.s32 %r2819, %r2819, 1;
$L__BB26_463:
and.b32 %r2126, %r2817, 64;
setp.eq.s32 %p571, %r2126, 0;
@%p571 bra $L__BB26_465;
shl.b32 %r2127, %r2818, 31;
or.b32 %r2128, %r2127, %r859;
add.s32 %r2129, %r999, %r860;
mul.wide.u32 %rd508, %r2129, 4;
add.s64 %rd509, %rd2, %rd508;
st.global.u32 [%rd509], %r2128;
shr.u32 %r2818, %r2818, 1;
add.s32 %r2819, %r2819, 1;
$L__BB26_465:
and.b32 %r2130, %r2817, 128;
setp.eq.s32 %p572, %r2130, 0;
@%p572 bra $L__BB26_467;
shl.b32 %r2131, %r2818, 31;
or.b32 %r2132, %r2131, %r859;
add.s32 %r2133, %r999, %r861;
mul.wide.u32 %rd510, %r2133, 4;
add.s64 %rd511, %rd2, %rd510;
st.global.u32 [%rd511], %r2132;
shr.u32 %r2818, %r2818, 1;
add.s32 %r2819, %r2819, 1;
$L__BB26_467:
add.s32 %r1016, %r982, 2;
and.b32 %r2134, %r2817, 3840;
setp.eq.s32 %p573, %r2134, 0;
@%p573 bra $L__BB26_476;
and.b32 %r2135, %r2817, 256;
setp.eq.s32 %p574, %r2135, 0;
@%p574 bra $L__BB26_470;
shl.b32 %r2136, %r2818, 31;
or.b32 %r2137, %r2136, %r859;
mul.wide.u32 %rd512, %r1016, 4;
add.s64 %rd513, %rd2, %rd512;
st.global.u32 [%rd513], %r2137;
shr.u32 %r2818, %r2818, 1;
add.s32 %r2819, %r2819, 1;
$L__BB26_470:
and.b32 %r2138, %r2817, 512;
setp.eq.s32 %p575, %r2138, 0;
@%p575 bra $L__BB26_472;
shl.b32 %r2139, %r2818, 31;
or.b32 %r2140, %r2139, %r859;
add.s32 %r2141, %r1016, %r1151;
mul.wide.u32 %rd514, %r2141, 4;
add.s64 %rd515, %rd2, %rd514;
st.global.u32 [%rd515], %r2140;
shr.u32 %r2818, %r2818, 1;
add.s32 %r2819, %r2819, 1;
$L__BB26_472:
and.b32 %r2142, %r2817, 1024;
setp.eq.s32 %p576, %r2142, 0;
@%p576 bra $L__BB26_474;
shl.b32 %r2143, %r2818, 31;
or.b32 %r2144, %r2143, %r859;
add.s32 %r2145, %r1016, %r860;
mul.wide.u32 %rd516, %r2145, 4;
add.s64 %rd517, %rd2, %rd516;
st.global.u32 [%rd517], %r2144;
shr.u32 %r2818, %r2818, 1;
add.s32 %r2819, %r2819, 1;
$L__BB26_474:
and.b32 %r2146, %r2817, 2048;
setp.eq.s32 %p577, %r2146, 0;
@%p577 bra $L__BB26_476;
shl.b32 %r2147, %r2818, 31;
or.b32 %r2148, %r2147, %r859;
add.s32 %r2149, %r1016, %r861;
mul.wide.u32 %rd518, %r2149, 4;
add.s64 %rd519, %rd2, %rd518;
st.global.u32 [%rd519], %r2148;
shr.u32 %r2818, %r2818, 1;
add.s32 %r2819, %r2819, 1;
$L__BB26_476:
add.s32 %r1033, %r982, 3;
and.b32 %r2150, %r2817, 61440;
setp.eq.s32 %p578, %r2150, 0;
@%p578 bra $L__BB26_485;
and.b32 %r2151, %r2817, 4096;
setp.eq.s32 %p579, %r2151, 0;
@%p579 bra $L__BB26_479;
shl.b32 %r2152, %r2818, 31;
or.b32 %r2153, %r2152, %r859;
mul.wide.u32 %rd520, %r1033, 4;
add.s64 %rd521, %rd2, %rd520;
st.global.u32 [%rd521], %r2153;
shr.u32 %r2818, %r2818, 1;
add.s32 %r2819, %r2819, 1;
$L__BB26_479:
and.b32 %r2154, %r2817, 8192;
setp.eq.s32 %p580, %r2154, 0;
@%p580 bra $L__BB26_481;
shl.b32 %r2155, %r2818, 31;
or.b32 %r2156, %r2155, %r859;
add.s32 %r2157, %r1033, %r1151;
mul.wide.u32 %rd522, %r2157, 4;
add.s64 %rd523, %rd2, %rd522;
st.global.u32 [%rd523], %r2156;
shr.u32 %r2818, %r2818, 1;
add.s32 %r2819, %r2819, 1;
$L__BB26_481:
and.b32 %r2158, %r2817, 16384;
setp.eq.s32 %p581, %r2158, 0;
@%p581 bra $L__BB26_483;
shl.b32 %r2159, %r2818, 31;
or.b32 %r2160, %r2159, %r859;
add.s32 %r2161, %r1033, %r860;
mul.wide.u32 %rd524, %r2161, 4;
add.s64 %rd525, %rd2, %rd524;
st.global.u32 [%rd525], %r2160;
shr.u32 %r2818, %r2818, 1;
add.s32 %r2819, %r2819, 1;
$L__BB26_483:
cvt.u16.u32 %rs990, %r2817;
setp.gt.s16 %p582, %rs990, -1;
@%p582 bra $L__BB26_485;
shl.b32 %r2162, %r2818, 31;
or.b32 %r2163, %r2162, %r859;
add.s32 %r2164, %r1033, %r861;
mul.wide.u32 %rd526, %r2164, 4;
add.s64 %rd527, %rd2, %rd526;
st.global.u32 [%rd527], %r2163;
add.s32 %r2819, %r2819, 1;
$L__BB26_485:
shr.u64 %rd651, %rd651, %r2819;
sub.s32 %r2794, %r2794, %r2819;
$L__BB26_486:
or.b32 %r1052, %r2817, %r875;
st.local.u16 [%rd117], %r1052;
add.s32 %r2165, %r873, 1;
setp.ge.u32 %p583, %r2165, %r823;
@%p583 bra $L__BB26_488;
shr.u32 %r2166, %r1052, 16;
st.local.u16 [%rd117+2], %r2166;
$L__BB26_488:
shl.b32 %r2167, %r1052, 1;
and.b32 %r2168, %r2167, 57344;
and.b32 %r2169, %r1052, 57344;
shr.u32 %r2170, %r2169, 1;
or.b32 %r2171, %r1052, %r874;
and.b32 %r2172, %r2171, 61440;
or.b32 %r2173, %r2172, %r2168;
or.b32 %r2788, %r2173, %r2170;
setp.lt.u32 %p584, %r1966, %r1141;
mov.u32 %r2787, %r1966;
@%p584 bra $L__BB26_407;
add.s32 %r2784, %r2784, 4;
setp.gt.u32 %p585, %r1144, %r2784;
@%p585 bra $L__BB26_406;
setp.lt.u32 %p586, %r17, 3;
@%p586 bra $L__BB26_541;
mov.u32 %r2174, 0;
mov.u32 %r2878, %r2174;
$L__BB26_492:
shr.u32 %r2176, %r2878, 1;
mul.lo.s32 %r2880, %r2176, %r1163;
shr.u32 %r2177, %r2878, 2;
mul.lo.s32 %r2879, %r2177, %r822;
mov.u32 %r2881, %r2174;
$L__BB26_493:
mul.wide.u32 %rd528, %r2880, 2;
add.s64 %rd529, %rd13, %rd528;
ld.local.u16 %rs991, [%rd529];
and.b16 %rs992, %rs991, 48;
shr.u16 %rs993, %rs992, 4;
and.b16 %rs994, %rs991, 192;
shr.u16 %rs995, %rs994, 2;
add.s32 %r2178, %r2880, 2;
mul.wide.u32 %rd530, %r2178, 2;
add.s64 %rd531, %rd13, %rd530;
ld.local.u16 %rs996, [%rd531];
shl.b16 %rs997, %rs996, 4;
and.b16 %rs998, %rs997, 768;
shl.b16 %rs999, %rs996, 6;
and.b16 %rs1000, %rs999, 12288;
add.s32 %r2179, %r2880, %r1163;
mul.wide.u32 %rd532, %r2179, 2;
add.s64 %rd533, %rd13, %rd532;
ld.local.u16 %rs1001, [%rd533];
and.b16 %rs1002, %rs1001, 48;
shr.u16 %rs1003, %rs1002, 2;
and.b16 %rs1004, %rs1001, 192;
add.s32 %r2180, %r2179, 2;
mul.wide.u32 %rd534, %r2180, 2;
add.s64 %rd535, %rd13, %rd534;
ld.local.u16 %rs1005, [%rd535];
shl.b16 %rs1006, %rs1005, 6;
and.b16 %rs1007, %rs1006, 3072;
shl.b16 %rs1008, %rs1005, 8;
and.b16 %rs1009, %rs1008, -16384;
or.b16 %rs1010, %rs993, %rs995;
or.b16 %rs1011, %rs1010, %rs1000;
or.b16 %rs1012, %rs1011, %rs998;
or.b16 %rs1013, %rs1012, %rs1004;
or.b16 %rs1014, %rs1013, %rs1003;
or.b16 %rs1015, %rs1014, %rs1009;
or.b16 %rs1016, %rs1015, %rs1007;
mul.wide.u32 %rd536, %r2879, 2;
add.s64 %rd537, %rd112, %rd536;
st.local.u16 [%rd537], %rs1016;
add.s32 %r2880, %r2880, 4;
add.s32 %r2879, %r2879, 1;
add.s32 %r2881, %r2881, 4;
setp.lt.u32 %p587, %r2881, %r1141;
@%p587 bra $L__BB26_493;
mul.wide.u32 %rd538, %r2879, 2;
add.s64 %rd539, %rd112, %rd538;
mov.u16 %rs1017, 0;
st.local.u16 [%rd539], %rs1017;
add.s32 %r2878, %r2878, 4;
setp.lt.u32 %p588, %r2878, %r1144;
@%p588 bra $L__BB26_492;
mov.u32 %r2884, 0;
@%p520 bra $L__BB26_498;
sub.s32 %r2883, %r836, %r2886;
mov.u32 %r2884, 0;
$L__BB26_497:
add.s32 %r2183, %r2884, %r835;
mul.wide.u32 %rd540, %r2183, 2;
add.s64 %rd541, %rd112, %rd540;
mov.u16 %rs1018, 0;
st.local.v4.u16 [%rd541], {%rs1018, %rs1018, %rs1018, %rs1018};
add.s32 %r2884, %r2884, 4;
add.s32 %r2883, %r2883, -4;
setp.ne.s32 %p590, %r2883, 0;
@%p590 bra $L__BB26_497;
$L__BB26_498:
@%p522 bra $L__BB26_500;
$L__BB26_499:
.pragma "nounroll";
add.s32 %r2184, %r2884, %r835;
mul.wide.u32 %rd542, %r2184, 2;
add.s64 %rd543, %rd112, %rd542;
mov.u16 %rs1019, 0;
st.local.u16 [%rd543], %rs1019;
add.s32 %r2884, %r2884, 1;
add.s32 %r2886, %r2886, -1;
setp.ne.s32 %p592, %r2886, 0;
@%p592 bra $L__BB26_499;
$L__BB26_500:
mov.u32 %r2187, 1;
mov.u32 %r2186, 0;
mov.u64 %rd656, 0;
add.s32 %r2188, %r1146, %r2900;
add.s32 %r2901, %r2188, -1;
shl.b32 %r1078, %r2187, %r858;
mov.u16 %rs1431, 1;
mov.u32 %r2887, %r2186;
mov.u32 %r2899, %r2186;
$L__BB26_501:
shr.u32 %r2190, %r2887, 2;
mul.lo.s32 %r2892, %r2190, %r822;
mad.lo.s32 %r1084, %r2887, %r1151, %r14;
mov.u32 %r2891, %r2186;
$L__BB26_502:
setp.gt.u32 %p593, %r2899, 31;
@%p593 bra $L__BB26_507;
mov.u32 %r2897, %r2900;
$L__BB26_504:
setp.eq.s32 %p594, %r2897, 0;
mov.u16 %rs1430, 0;
@%p594 bra $L__BB26_506;
cvt.s64.s32 %rd545, %r2901;
add.s64 %rd546, %rd545, %rd3;
add.s64 %rd547, %rd1, %rd546;
ld.global.u8 %rs1430, [%rd547];
$L__BB26_506:
add.s32 %r2191, %r2897, -1;
selp.b32 %r2900, 0, %r2191, %p594;
setp.ne.s32 %p596, %r2897, 0;
selp.b32 %r2192, -1, 0, %p596;
add.s32 %r2901, %r2901, %r2192;
and.b16 %rs1022, %rs1430, 255;
and.b16 %rs1023, %rs1430, 127;
setp.eq.s16 %p597, %rs1023, 127;
and.b16 %rs1024, %rs1431, 255;
setp.ne.s16 %p598, %rs1024, 0;
and.pred %p599, %p598, %p597;
selp.b32 %r2193, 7, 8, %p599;
cvt.u64.u16 %rd548, %rs1430;
and.b64 %rd549, %rd548, 255;
shl.b64 %rd550, %rd549, %r2899;
or.b64 %rd656, %rd550, %rd656;
add.s32 %r2899, %r2193, %r2899;
setp.gt.u16 %p600, %rs1022, 143;
selp.u16 %rs1431, 1, 0, %p600;
setp.lt.u32 %p601, %r2899, 33;
mov.u32 %r2897, %r2900;
@%p601 bra $L__BB26_504;
$L__BB26_507:
mul.wide.u32 %rd551, %r2892, 2;
add.s64 %rd552, %rd112, %rd551;
ld.local.u32 %r1099, [%rd552];
setp.eq.s32 %p602, %r1099, 0;
@%p602 bra $L__BB26_528;
cvt.u32.u64 %r2912, %rd656;
add.s32 %r1101, %r1084, %r2891;
mov.u32 %r2904, 15;
mov.u32 %r2902, 0;
$L__BB26_509:
and.b32 %r2196, %r2904, %r1099;
setp.eq.s32 %p603, %r2196, 0;
@%p603 bra $L__BB26_518;
add.s32 %r1105, %r1101, %r2902;
and.b32 %r1106, %r2904, 286331137;
and.b32 %r2197, %r1106, %r1099;
setp.eq.s32 %p604, %r2197, 0;
@%p604 bra $L__BB26_512;
not.b32 %r2198, %r2912;
and.b32 %r2199, %r2198, 1;
shl.b32 %r2200, %r2199, %r671;
or.b32 %r2201, %r2200, %r1078;
mul.wide.u32 %rd553, %r1105, 4;
add.s64 %rd554, %rd2, %rd553;
ld.global.u32 %r2202, [%rd554];
xor.b32 %r2203, %r2202, %r2201;
st.global.u32 [%rd554], %r2203;
shr.u32 %r2912, %r2912, 1;
$L__BB26_512:
add.s32 %r1109, %r1105, %r1151;
shl.b32 %r2204, %r1106, 1;
and.b32 %r2205, %r2204, %r1099;
setp.eq.s32 %p605, %r2205, 0;
@%p605 bra $L__BB26_514;
not.b32 %r2206, %r2912;
and.b32 %r2207, %r2206, 1;
shl.b32 %r2208, %r2207, %r671;
or.b32 %r2209, %r2208, %r1078;
mul.wide.u32 %rd555, %r1109, 4;
add.s64 %rd556, %rd2, %rd555;
ld.global.u32 %r2210, [%rd556];
xor.b32 %r2211, %r2210, %r2209;
st.global.u32 [%rd556], %r2211;
shr.u32 %r2912, %r2912, 1;
$L__BB26_514:
add.s32 %r1112, %r1109, %r1151;
shl.b32 %r2212, %r1106, 2;
and.b32 %r2213, %r2212, %r1099;
setp.eq.s32 %p606, %r2213, 0;
@%p606 bra $L__BB26_516;
not.b32 %r2214, %r2912;
and.b32 %r2215, %r2214, 1;
shl.b32 %r2216, %r2215, %r671;
or.b32 %r2217, %r2216, %r1078;
mul.wide.u32 %rd557, %r1112, 4;
add.s64 %rd558, %rd2, %rd557;
ld.global.u32 %r2218, [%rd558];
xor.b32 %r2219, %r2218, %r2217;
st.global.u32 [%rd558], %r2219;
shr.u32 %r2912, %r2912, 1;
$L__BB26_516:
shl.b32 %r2220, %r1106, 3;
and.b32 %r2221, %r2220, %r1099;
setp.eq.s32 %p607, %r2221, 0;
@%p607 bra $L__BB26_518;
not.b32 %r2222, %r2912;
and.b32 %r2223, %r2222, 1;
shl.b32 %r2224, %r2223, %r671;
or.b32 %r2225, %r2224, %r1078;
add.s32 %r2226, %r1112, %r1151;
mul.wide.u32 %rd559, %r2226, 4;
add.s64 %rd560, %rd2, %rd559;
ld.global.u32 %r2227, [%rd560];
xor.b32 %r2228, %r2227, %r2225;
st.global.u32 [%rd560], %r2228;
shr.u32 %r2912, %r2912, 1;
$L__BB26_518:
shl.b32 %r1117, %r2904, 4;
and.b32 %r2229, %r1117, %r1099;
setp.eq.s32 %p608, %r2229, 0;
@%p608 bra $L__BB26_527;
add.s32 %r2230, %r1101, %r2902;
add.s32 %r1118, %r2230, 1;
and.b32 %r1119, %r1117, 286330896;
and.b32 %r2231, %r1119, %r1099;
setp.eq.s32 %p609, %r2231, 0;
@%p609 bra $L__BB26_521;
not.b32 %r2232, %r2912;
and.b32 %r2233, %r2232, 1;
shl.b32 %r2234, %r2233, %r671;
or.b32 %r2235, %r2234, %r1078;
mul.wide.u32 %rd561, %r1118, 4;
add.s64 %rd562, %rd2, %rd561;
ld.global.u32 %r2236, [%rd562];
xor.b32 %r2237, %r2236, %r2235;
st.global.u32 [%rd562], %r2237;
shr.u32 %r2912, %r2912, 1;
$L__BB26_521:
add.s32 %r1122, %r1118, %r1151;
shl.b32 %r2238, %r1119, 1;
and.b32 %r2239, %r2238, %r1099;
setp.eq.s32 %p610, %r2239, 0;
@%p610 bra $L__BB26_523;
not.b32 %r2240, %r2912;
and.b32 %r2241, %r2240, 1;
shl.b32 %r2242, %r2241, %r671;
or.b32 %r2243, %r2242, %r1078;
mul.wide.u32 %rd563, %r1122, 4;
add.s64 %rd564, %rd2, %rd563;
ld.global.u32 %r2244, [%rd564];
xor.b32 %r2245, %r2244, %r2243;
st.global.u32 [%rd564], %r2245;
shr.u32 %r2912, %r2912, 1;
$L__BB26_523:
add.s32 %r1125, %r1122, %r1151;
shl.b32 %r2246, %r1119, 2;
and.b32 %r2247, %r2246, %r1099;
setp.eq.s32 %p611, %r2247, 0;
@%p611 bra $L__BB26_525;
not.b32 %r2248, %r2912;
and.b32 %r2249, %r2248, 1;
shl.b32 %r2250, %r2249, %r671;
or.b32 %r2251, %r2250, %r1078;
mul.wide.u32 %rd565, %r1125, 4;
add.s64 %rd566, %rd2, %rd565;
ld.global.u32 %r2252, [%rd566];
xor.b32 %r2253, %r2252, %r2251;
st.global.u32 [%rd566], %r2253;
shr.u32 %r2912, %r2912, 1;
$L__BB26_525:
shl.b32 %r2254, %r1119, 3;
and.b32 %r2255, %r2254, %r1099;
setp.eq.s32 %p612, %r2255, 0;
@%p612 bra $L__BB26_527;
not.b32 %r2256, %r2912;
and.b32 %r2257, %r2256, 1;
shl.b32 %r2258, %r2257, %r671;
or.b32 %r2259, %r2258, %r1078;
add.s32 %r2260, %r1125, %r1151;
mul.wide.u32 %rd567, %r2260, 4;
add.s64 %rd568, %rd2, %rd567;
ld.global.u32 %r2261, [%rd568];
xor.b32 %r2262, %r2261, %r2259;
st.global.u32 [%rd568], %r2262;
shr.u32 %r2912, %r2912, 1;
$L__BB26_527:
shl.b32 %r2904, %r2904, 8;
add.s32 %r2902, %r2902, 2;
setp.ne.s32 %p613, %r2902, 8;
@%p613 bra $L__BB26_509;
$L__BB26_528:
popc.b32 %r2263, %r1099;
shr.u64 %rd656, %rd656, %r2263;
sub.s32 %r2899, %r2899, %r2263;
add.s32 %r2891, %r2891, 8;
setp.lt.u32 %p614, %r2891, %r1141;
add.s32 %r2892, %r2892, 2;
@%p614 bra $L__BB26_502;
add.s32 %r2887, %r2887, 4;
setp.lt.u32 %p615, %r2887, %r1144;
@%p615 bra $L__BB26_501;
bra.uni $L__BB26_541;
}
// .globl j2k_htj2k_decode_codeblocks_multi_cleanup_only
.visible .entry j2k_htj2k_decode_codeblocks_multi_cleanup_only(
.param .u64 j2k_htj2k_decode_codeblocks_multi_cleanup_only_param_0,
.param .u64 j2k_htj2k_decode_codeblocks_multi_cleanup_only_param_1,
.param .u64 j2k_htj2k_decode_codeblocks_multi_cleanup_only_param_2,
.param .u64 j2k_htj2k_decode_codeblocks_multi_cleanup_only_param_3,
.param .u64 j2k_htj2k_decode_codeblocks_multi_cleanup_only_param_4,
.param .u64 j2k_htj2k_decode_codeblocks_multi_cleanup_only_param_5,
.param .u64 j2k_htj2k_decode_codeblocks_multi_cleanup_only_param_6,
.param .u32 j2k_htj2k_decode_codeblocks_multi_cleanup_only_param_7
)
{
.local .align 16 .b8 __local_depot27[6720];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<507>;
.reg .b16 %rs<1330>;
.reg .b32 %r<2113>;
.reg .b64 %rd<510>;
mov.u64 %SPL, __local_depot27;
ld.param.u64 %rd118, [ j2k_htj2k_decode_codeblocks_multi_cleanup_only_param_0];
ld.param.u64 %rd112, [ j2k_htj2k_decode_codeblocks_multi_cleanup_only_param_1];
ld.param.u64 %rd117, [ j2k_htj2k_decode_codeblocks_multi_cleanup_only_param_6];
ld.param.u32 %r815, [ j2k_htj2k_decode_codeblocks_multi_cleanup_only_param_7];
cvta.to.global.u64 %rd1, %rd118;
mov.u32 %r816, %ntid.x;
mov.u32 %r817, %ctaid.x;
mov.u32 %r818, %tid.x;
mad.lo.s32 %r1, %r817, %r816, %r818;
setp.ge.u32 %p1, %r1, %r815;
@%p1 bra $L__BB27_399;
cvta.to.global.u64 %rd119, %rd117;
cvta.to.global.u64 %rd120, %rd112;
mul.wide.u32 %rd121, %r1, 64;
add.s64 %rd122, %rd120, %rd121;
ld.global.u64 %rd123, [%rd122];
cvta.to.global.u64 %rd2, %rd123;
ld.global.v2.u32 {%r819, %r820}, [%rd122+8];
mov.u32 %r821, 0;
ld.global.v2.u32 {%r822, %r823}, [%rd122+16];
ld.global.v2.u32 {%r824, %r825}, [%rd122+24];
ld.global.v2.u32 {%r827, %r828}, [%rd122+32];
ld.global.u32 %r12, [%rd122+44];
ld.global.u32 %r13, [%rd122+48];
cvt.u64.u32 %rd3, %r819;
mul.wide.u32 %rd124, %r1, 16;
add.s64 %rd4, %rd119, %rd124;
st.global.u32 [%rd4], %r821;
st.global.u32 [%rd4+4], %r821;
st.global.u32 [%rd4+8], %r821;
st.global.u32 [%rd4+12], %r821;
setp.eq.s32 %p2, %r825, 0;
@%p2 bra $L__BB27_3;
mov.u32 %r829, 2;
st.global.u32 [%rd4], %r829;
mov.u32 %r830, 17;
st.global.u32 [%rd4+4], %r830;
st.global.u32 [%rd4+8], %r821;
st.global.u32 [%rd4+12], %r821;
bra.uni $L__BB27_399;
$L__BB27_3:
setp.eq.s32 %p3, %r820, 0;
setp.eq.s32 %p4, %r822, 0;
or.pred %p5, %p3, %p4;
@%p5 bra $L__BB27_399;
setp.gt.u32 %p6, %r820, 256;
setp.gt.u32 %p7, %r822, 256;
or.pred %p8, %p6, %p7;
mul.lo.s32 %r832, %r822, %r820;
setp.gt.u32 %p9, %r832, 4096;
or.pred %p10, %p8, %p9;
@%p10 bra $L__BB27_398;
bra.uni $L__BB27_5;
$L__BB27_398:
mov.u32 %r1660, 2;
st.global.u32 [%rd4], %r1660;
mov.u32 %r1661, 1;
st.global.u32 [%rd4+4], %r1661;
mov.u32 %r1662, 0;
st.global.u32 [%rd4+8], %r1662;
st.global.u32 [%rd4+12], %r1662;
bra.uni $L__BB27_399;
$L__BB27_5:
add.s32 %r833, %r828, -1;
setp.gt.u32 %p11, %r833, 30;
@%p11 bra $L__BB27_397;
bra.uni $L__BB27_6;
$L__BB27_397:
mov.u32 %r1657, 1;
st.global.u32 [%rd4], %r1657;
mov.u32 %r1658, 2;
st.global.u32 [%rd4+4], %r1658;
mov.u32 %r1659, 0;
st.global.u32 [%rd4+8], %r1659;
st.global.u32 [%rd4+12], %r1659;
bra.uni $L__BB27_399;
$L__BB27_6:
setp.gt.u32 %p12, %r827, 29;
@%p12 bra $L__BB27_396;
bra.uni $L__BB27_7;
$L__BB27_396:
mov.u32 %r1654, 1;
st.global.u32 [%rd4], %r1654;
mov.u32 %r1655, 3;
st.global.u32 [%rd4+4], %r1655;
mov.u32 %r1656, 0;
st.global.u32 [%rd4+8], %r1656;
st.global.u32 [%rd4+12], %r1656;
bra.uni $L__BB27_399;
$L__BB27_7:
setp.lt.u32 %p13, %r824, 2;
setp.lt.u32 %p14, %r823, %r824;
or.pred %p15, %p13, %p14;
@%p15 bra $L__BB27_395;
bra.uni $L__BB27_8;
$L__BB27_395:
mov.u32 %r1651, 1;
st.global.u32 [%rd4], %r1651;
mov.u32 %r1652, 4;
st.global.u32 [%rd4+4], %r1652;
mov.u32 %r1653, 0;
st.global.u32 [%rd4+8], %r1653;
st.global.u32 [%rd4+12], %r1653;
bra.uni $L__BB27_399;
$L__BB27_8:
add.s32 %r834, %r824, -1;
cvt.u64.u32 %rd125, %r834;
add.s64 %rd126, %rd125, %rd3;
add.s64 %rd127, %rd1, %rd126;
ld.global.u8 %rs652, [%rd127];
mul.wide.u16 %r835, %rs652, 16;
add.s32 %r836, %r824, -2;
cvt.u64.u32 %rd128, %r836;
add.s64 %rd129, %rd128, %rd3;
add.s64 %rd130, %rd1, %rd129;
ld.global.u8 %rs1, [%rd130];
and.b16 %rs653, %rs1, 15;
cvt.u32.u16 %r837, %rs653;
or.b32 %r14, %r835, %r837;
setp.lt.u32 %p16, %r824, %r14;
add.s32 %r15, %r14, -2;
setp.gt.u32 %p17, %r15, 4077;
or.pred %p18, %p16, %p17;
@%p18 bra $L__BB27_394;
bra.uni $L__BB27_9;
$L__BB27_394:
mov.u32 %r1648, 1;
st.global.u32 [%rd4], %r1648;
mov.u32 %r1649, 5;
st.global.u32 [%rd4+4], %r1649;
mov.u32 %r1650, 0;
st.global.u32 [%rd4+8], %r1650;
st.global.u32 [%rd4+12], %r1650;
bra.uni $L__BB27_399;
$L__BB27_9:
add.s32 %r838, %r822, 1;
shr.u32 %r839, %r838, 1;
add.s32 %r840, %r820, 9;
and.b32 %r841, %r840, -8;
setp.gt.u32 %p19, %r841, 264;
add.s32 %r842, %r839, 1;
mul.lo.s32 %r843, %r842, %r841;
setp.gt.u32 %p20, %r843, 3096;
or.pred %p21, %p19, %p20;
@%p21 bra $L__BB27_393;
bra.uni $L__BB27_10;
$L__BB27_393:
mov.u32 %r1645, 2;
st.global.u32 [%rd4], %r1645;
mov.u32 %r1646, 6;
st.global.u32 [%rd4+4], %r1646;
mov.u32 %r1647, 0;
st.global.u32 [%rd4+8], %r1647;
st.global.u32 [%rd4+12], %r1647;
bra.uni $L__BB27_399;
$L__BB27_10:
and.b16 %rs940, %rs1, 15;
cvt.u32.u16 %r1669, %rs940;
mul.wide.u16 %r1668, %rs652, 16;
or.b32 %r1667, %r1668, %r1669;
sub.s32 %r16, %r824, %r1667;
add.s32 %r1747, %r1667, -1;
add.s32 %r1915, %r824, -3;
mov.u64 %rd455, 0;
mov.u32 %r1837, 0;
mov.u16 %rs1013, 0;
mov.u64 %rd133, _ZZ20mel_decode_more_runsR10MelDecoderE7MEL_EXP;
mov.u32 %r1746, %r16;
mov.u16 %rs1014, %rs1013;
mov.u16 %rs1048, %rs1013;
mov.u32 %r1719, %r1837;
$L__BB27_11:
setp.gt.u32 %p23, %r1719, 7;
@%p23 bra $L__BB27_56;
mul.wide.u32 %rd132, %r1837, 4;
add.s64 %rd134, %rd133, %rd132;
ld.global.nc.u32 %r23, [%rd134];
and.b16 %rs657, %rs1048, 255;
setp.ne.s16 %p24, %rs657, 0;
mov.u16 %rs948, %rs1048;
@%p24 bra $L__BB27_16;
setp.eq.s32 %p25, %r1747, 0;
mov.u16 %rs945, 255;
@%p25 bra $L__BB27_15;
cvt.u64.u32 %rd135, %r1746;
add.s64 %rd136, %rd135, %rd3;
add.s64 %rd137, %rd1, %rd136;
ld.global.u8 %rs945, [%rd137];
$L__BB27_15:
setp.ne.s32 %p27, %r1747, 0;
selp.u32 %r846, 1, 0, %p27;
add.s32 %r1746, %r1746, %r846;
add.s32 %r847, %r1747, -1;
selp.b32 %r1747, 0, %r847, %p25;
setp.eq.s32 %p28, %r1747, 0;
or.b16 %rs659, %rs945, 15;
selp.b16 %rs1014, %rs659, %rs945, %p28;
and.b16 %rs660, %rs1014, 255;
mov.u16 %rs661, 8;
sub.s16 %rs948, %rs661, %rs1013;
setp.eq.s16 %p29, %rs660, 255;
selp.u16 %rs1013, 1, 0, %p29;
$L__BB27_16:
add.s16 %rs15, %rs948, -1;
cvt.u32.u16 %r848, %rs15;
and.b32 %r849, %r848, 255;
mov.u32 %r850, 1;
shl.b32 %r851, %r850, %r849;
cvt.u32.u16 %r852, %rs1014;
and.b32 %r853, %r851, %r852;
and.b32 %r28, %r853, 255;
add.s16 %rs1048, %rs948, -1;
setp.eq.s32 %p30, %r28, 0;
@%p30 bra $L__BB27_18;
add.s32 %r854, %r1837, 1;
min.u32 %r1714, %r854, 12;
mov.u32 %r855, -1;
shl.b32 %r856, %r855, %r23;
shl.b32 %r857, %r856, 1;
xor.b32 %r1715, %r857, -2;
bra.uni $L__BB27_55;
$L__BB27_18:
cvt.u64.u32 %rd453, %r1837;
add.s64 %rd138, %rd453, -3;
setp.gt.u64 %p31, %rd138, 9;
mov.u32 %r1711, 0;
@%p31 bra $L__BB27_54;
add.s16 %rs1048, %rs948, -1;
max.u32 %r31, %r23, 1;
add.s32 %r861, %r31, -1;
setp.lt.u32 %p32, %r861, 3;
mov.u32 %r1711, 0;
@%p32 bra $L__BB27_38;
and.b32 %r1663, %r31, 3;
add.s16 %rs1048, %rs948, -1;
sub.s32 %r1683, %r31, %r1663;
mov.u32 %r1711, 0;
$L__BB27_21:
and.b16 %rs663, %rs1048, 255;
setp.ne.s16 %p33, %rs663, 0;
@%p33 bra $L__BB27_25;
setp.eq.s32 %p34, %r1747, 0;
mov.u16 %rs952, 255;
@%p34 bra $L__BB27_24;
cvt.u64.u32 %rd139, %r1746;
add.s64 %rd140, %rd139, %rd3;
add.s64 %rd141, %rd1, %rd140;
ld.global.u8 %rs952, [%rd141];
$L__BB27_24:
setp.ne.s32 %p36, %r1747, 0;
selp.u32 %r863, 1, 0, %p36;
add.s32 %r1746, %r1746, %r863;
add.s32 %r864, %r1747, -1;
selp.b32 %r1747, 0, %r864, %p34;
setp.eq.s32 %p37, %r1747, 0;
or.b16 %rs665, %rs952, 15;
selp.b16 %rs1014, %rs665, %rs952, %p37;
and.b16 %rs666, %rs1014, 255;
mov.u16 %rs667, 8;
sub.s16 %rs1048, %rs667, %rs1013;
setp.eq.s16 %p38, %rs666, 255;
selp.u16 %rs1013, 1, 0, %p38;
$L__BB27_25:
add.s16 %rs959, %rs1048, -1;
and.b16 %rs668, %rs959, 255;
cvt.u32.u16 %r865, %rs959;
and.b32 %r866, %r865, 255;
cvt.u32.u16 %r867, %rs1014;
and.b32 %r1689, %r867, 255;
shr.u32 %r868, %r1689, %r866;
and.b32 %r869, %r868, 1;
bfi.b32 %r43, %r1711, %r869, 1, 31;
setp.ne.s16 %p39, %rs668, 0;
@%p39 bra $L__BB27_29;
setp.eq.s32 %p40, %r1747, 0;
mov.u16 %rs956, 255;
@%p40 bra $L__BB27_28;
cvt.u64.u32 %rd142, %r1746;
add.s64 %rd143, %rd142, %rd3;
add.s64 %rd144, %rd1, %rd143;
ld.global.u8 %rs956, [%rd144];
$L__BB27_28:
setp.ne.s32 %p42, %r1747, 0;
selp.u32 %r870, 1, 0, %p42;
add.s32 %r1746, %r1746, %r870;
add.s32 %r871, %r1747, -1;
selp.b32 %r1747, 0, %r871, %p40;
setp.eq.s32 %p43, %r1747, 0;
or.b16 %rs670, %rs956, 15;
selp.b16 %rs1014, %rs670, %rs956, %p43;
and.b16 %rs671, %rs1014, 255;
mov.u16 %rs672, 8;
sub.s16 %rs959, %rs672, %rs1013;
setp.eq.s16 %p44, %rs671, 255;
selp.u16 %rs1013, 1, 0, %p44;
cvt.u32.u16 %r872, %rs1014;
and.b32 %r1689, %r872, 255;
$L__BB27_29:
add.s16 %rs963, %rs959, -1;
and.b16 %rs673, %rs963, 255;
cvt.u32.u16 %r873, %rs963;
and.b32 %r874, %r873, 255;
shr.u32 %r875, %r1689, %r874;
and.b32 %r876, %r875, 1;
bfi.b32 %r50, %r43, %r876, 1, 31;
setp.ne.s16 %p45, %rs673, 0;
@%p45 bra $L__BB27_33;
setp.eq.s32 %p46, %r1747, 0;
mov.u16 %rs960, 255;
@%p46 bra $L__BB27_32;
cvt.u64.u32 %rd145, %r1746;
add.s64 %rd146, %rd145, %rd3;
add.s64 %rd147, %rd1, %rd146;
ld.global.u8 %rs960, [%rd147];
$L__BB27_32:
setp.ne.s32 %p48, %r1747, 0;
selp.u32 %r877, 1, 0, %p48;
add.s32 %r1746, %r1746, %r877;
add.s32 %r878, %r1747, -1;
selp.b32 %r1747, 0, %r878, %p46;
setp.eq.s32 %p49, %r1747, 0;
or.b16 %rs675, %rs960, 15;
selp.b16 %rs1014, %rs675, %rs960, %p49;
and.b16 %rs676, %rs1014, 255;
mov.u16 %rs677, 8;
sub.s16 %rs963, %rs677, %rs1013;
setp.eq.s16 %p50, %rs676, 255;
selp.u16 %rs1013, 1, 0, %p50;
cvt.u32.u16 %r879, %rs1014;
and.b32 %r1689, %r879, 255;
$L__BB27_33:
add.s16 %rs967, %rs963, -1;
and.b16 %rs678, %rs967, 255;
cvt.u32.u16 %r880, %rs967;
and.b32 %r881, %r880, 255;
shr.u32 %r882, %r1689, %r881;
and.b32 %r883, %r882, 1;
bfi.b32 %r57, %r50, %r883, 1, 31;
setp.ne.s16 %p51, %rs678, 0;
@%p51 bra $L__BB27_37;
setp.eq.s32 %p52, %r1747, 0;
mov.u16 %rs964, 255;
@%p52 bra $L__BB27_36;
cvt.u64.u32 %rd148, %r1746;
add.s64 %rd149, %rd148, %rd3;
add.s64 %rd150, %rd1, %rd149;
ld.global.u8 %rs964, [%rd150];
$L__BB27_36:
setp.ne.s32 %p54, %r1747, 0;
selp.u32 %r884, 1, 0, %p54;
add.s32 %r1746, %r1746, %r884;
add.s32 %r885, %r1747, -1;
selp.b32 %r1747, 0, %r885, %p52;
setp.eq.s32 %p55, %r1747, 0;
or.b16 %rs680, %rs964, 15;
selp.b16 %rs1014, %rs680, %rs964, %p55;
and.b16 %rs681, %rs1014, 255;
mov.u16 %rs682, 8;
sub.s16 %rs967, %rs682, %rs1013;
setp.eq.s16 %p56, %rs681, 255;
selp.u16 %rs1013, 1, 0, %p56;
cvt.u32.u16 %r886, %rs1014;
and.b32 %r1689, %r886, 255;
$L__BB27_37:
add.s16 %rs1048, %rs967, -1;
cvt.u32.u16 %r887, %rs1048;
and.b32 %r888, %r887, 255;
shr.u32 %r889, %r1689, %r888;
and.b32 %r890, %r889, 1;
bfi.b32 %r1711, %r57, %r890, 1, 31;
add.s32 %r1683, %r1683, -4;
setp.ne.s32 %p57, %r1683, 0;
@%p57 bra $L__BB27_21;
$L__BB27_38:
and.b32 %r1664, %r31, 3;
setp.eq.s32 %p58, %r1664, 0;
@%p58 bra $L__BB27_54;
and.b16 %rs683, %rs1048, 255;
setp.ne.s16 %p59, %rs683, 0;
@%p59 bra $L__BB27_43;
setp.eq.s32 %p60, %r1747, 0;
mov.u16 %rs974, 255;
@%p60 bra $L__BB27_42;
cvt.u64.u32 %rd151, %r1746;
add.s64 %rd152, %rd151, %rd3;
add.s64 %rd153, %rd1, %rd152;
ld.global.u8 %rs974, [%rd153];
$L__BB27_42:
setp.ne.s32 %p62, %r1747, 0;
selp.u32 %r891, 1, 0, %p62;
add.s32 %r1746, %r1746, %r891;
add.s32 %r892, %r1747, -1;
selp.b32 %r1747, 0, %r892, %p60;
setp.eq.s32 %p63, %r1747, 0;
or.b16 %rs685, %rs974, 15;
selp.b16 %rs1014, %rs685, %rs974, %p63;
and.b16 %rs686, %rs1014, 255;
mov.u16 %rs687, 8;
sub.s16 %rs1048, %rs687, %rs1013;
setp.eq.s16 %p64, %rs686, 255;
selp.u16 %rs1013, 1, 0, %p64;
$L__BB27_43:
and.b32 %r1665, %r31, 3;
add.s16 %rs1048, %rs1048, -1;
cvt.u32.u16 %r893, %rs1048;
and.b32 %r894, %r893, 255;
cvt.u32.u16 %r895, %rs1014;
and.b32 %r1706, %r895, 255;
shr.u32 %r896, %r1706, %r894;
and.b32 %r897, %r896, 1;
bfi.b32 %r1711, %r1711, %r897, 1, 31;
setp.eq.s32 %p65, %r1665, 1;
@%p65 bra $L__BB27_54;
and.b16 %rs688, %rs1048, 255;
setp.ne.s16 %p66, %rs688, 0;
@%p66 bra $L__BB27_48;
setp.eq.s32 %p67, %r1747, 0;
mov.u16 %rs978, 255;
@%p67 bra $L__BB27_47;
cvt.u64.u32 %rd154, %r1746;
add.s64 %rd155, %rd154, %rd3;
add.s64 %rd156, %rd1, %rd155;
ld.global.u8 %rs978, [%rd156];
$L__BB27_47:
setp.ne.s32 %p69, %r1747, 0;
selp.u32 %r898, 1, 0, %p69;
add.s32 %r1746, %r1746, %r898;
add.s32 %r899, %r1747, -1;
selp.b32 %r1747, 0, %r899, %p67;
setp.eq.s32 %p70, %r1747, 0;
or.b16 %rs690, %rs978, 15;
selp.b16 %rs1014, %rs690, %rs978, %p70;
and.b16 %rs691, %rs1014, 255;
mov.u16 %rs692, 8;
sub.s16 %rs1048, %rs692, %rs1013;
setp.eq.s16 %p71, %rs691, 255;
selp.u16 %rs1013, 1, 0, %p71;
cvt.u32.u16 %r900, %rs1014;
and.b32 %r1706, %r900, 255;
$L__BB27_48:
and.b32 %r1666, %r31, 3;
add.s16 %rs1048, %rs1048, -1;
cvt.u32.u16 %r901, %rs1048;
and.b32 %r902, %r901, 255;
shr.u32 %r903, %r1706, %r902;
and.b32 %r904, %r903, 1;
bfi.b32 %r1711, %r1711, %r904, 1, 31;
setp.eq.s32 %p72, %r1666, 2;
@%p72 bra $L__BB27_54;
and.b16 %rs693, %rs1048, 255;
setp.ne.s16 %p73, %rs693, 0;
@%p73 bra $L__BB27_53;
setp.eq.s32 %p74, %r1747, 0;
mov.u16 %rs982, 255;
@%p74 bra $L__BB27_52;
cvt.u64.u32 %rd157, %r1746;
add.s64 %rd158, %rd157, %rd3;
add.s64 %rd159, %rd1, %rd158;
ld.global.u8 %rs982, [%rd159];
$L__BB27_52:
setp.ne.s32 %p76, %r1747, 0;
selp.u32 %r905, 1, 0, %p76;
add.s32 %r1746, %r1746, %r905;
add.s32 %r906, %r1747, -1;
selp.b32 %r1747, 0, %r906, %p74;
setp.eq.s32 %p77, %r1747, 0;
or.b16 %rs695, %rs982, 15;
selp.b16 %rs1014, %rs695, %rs982, %p77;
and.b16 %rs696, %rs1014, 255;
mov.u16 %rs697, 8;
sub.s16 %rs1048, %rs697, %rs1013;
setp.eq.s16 %p78, %rs696, 255;
selp.u16 %rs1013, 1, 0, %p78;
cvt.u32.u16 %r907, %rs1014;
and.b32 %r1706, %r907, 255;
$L__BB27_53:
add.s16 %rs1048, %rs1048, -1;
cvt.u32.u16 %r908, %rs1048;
and.b32 %r909, %r908, 255;
shr.u32 %r910, %r1706, %r909;
and.b32 %r911, %r910, 1;
bfi.b32 %r1711, %r1711, %r911, 1, 31;
$L__BB27_54:
shl.b32 %r912, %r1711, 1;
or.b32 %r1715, %r912, 1;
add.s32 %r913, %r1837, -1;
setp.eq.s32 %p79, %r1837, 0;
selp.b32 %r1714, 0, %r913, %p79;
$L__BB27_55:
mul.lo.s32 %r914, %r1719, 7;
cvt.u64.u32 %rd160, %r1715;
shl.b64 %rd161, %rd160, %r914;
or.b64 %rd455, %rd161, %rd455;
setp.ne.s32 %p80, %r1837, 12;
setp.ne.s32 %p81, %r28, 0;
or.pred %p82, %p80, %p81;
add.s32 %r1719, %r1719, 1;
setp.lt.u32 %p83, %r1719, 8;
or.pred %p84, %p83, %p82;
mov.u32 %r1837, %r1714;
@%p84 bra $L__BB27_11;
$L__BB27_56:
and.b16 %rs941, %rs1, 15;
cvt.u32.u16 %r1673, %rs941;
mul.wide.u16 %r1672, %rs652, 16;
or.b32 %r1671, %r1672, %r1673;
add.s32 %r1916, %r1671, -2;
setp.gt.u16 %p506, %rs1, 143;
selp.u16 %rs1180, 1, 0, %p506;
ld.param.u64 %rd452, [ j2k_htj2k_decode_codeblocks_multi_cleanup_only_param_4];
shr.u16 %rs934, %rs1, 4;
ld.param.u64 %rd449, [ j2k_htj2k_decode_codeblocks_multi_cleanup_only_param_2];
add.s32 %r1838, %r1719, -1;
shr.u64 %rd465, %rd455, 7;
cvt.u32.u64 %r917, %rd455;
and.b32 %r1834, %r917, 127;
cvt.u64.u16 %rd474, %rs934;
and.b64 %rd162, %rd474, 7;
setp.eq.s64 %p85, %rd162, 7;
selp.b32 %r1917, 3, 4, %p85;
cvta.to.global.u64 %rd11, %rd452;
cvta.to.global.u64 %rd12, %rd449;
add.u64 %rd13, %SPL, 0;
mov.u32 %r1720, 0;
mov.u32 %r1721, %r1720;
$L__BB27_57:
setp.gt.u32 %p86, %r1917, 31;
@%p86 bra $L__BB27_61;
$L__BB27_58:
setp.eq.s32 %p87, %r1916, 0;
mov.u16 %rs1000, 0;
@%p87 bra $L__BB27_60;
cvt.s64.s32 %rd164, %r1915;
add.s64 %rd165, %rd164, %rd3;
add.s64 %rd166, %rd1, %rd165;
ld.global.u8 %rs1000, [%rd166];
$L__BB27_60:
setp.ne.s32 %p89, %r1916, 0;
selp.b32 %r918, -1, 0, %p89;
add.s32 %r1915, %r1915, %r918;
add.s32 %r919, %r1916, -1;
selp.b32 %r1916, 0, %r919, %p87;
and.b16 %rs699, %rs1000, 255;
and.b16 %rs700, %rs1000, 127;
setp.eq.s16 %p90, %rs700, 127;
and.b16 %rs701, %rs1180, 255;
setp.ne.s16 %p91, %rs701, 0;
and.pred %p92, %p91, %p90;
selp.b32 %r920, 7, 8, %p92;
cvt.u64.u16 %rd167, %rs1000;
and.b64 %rd168, %rd167, 255;
shl.b64 %rd169, %rd168, %r1917;
or.b64 %rd474, %rd169, %rd474;
add.s32 %r1917, %r920, %r1917;
setp.gt.u16 %p93, %rs699, 143;
selp.u16 %rs1180, 1, 0, %p93;
setp.lt.u32 %p94, %r1917, 33;
@%p94 bra $L__BB27_58;
$L__BB27_61:
cvt.u32.u64 %r921, %rd474;
and.b32 %r922, %r921, 127;
add.s32 %r923, %r922, %r1720;
mul.wide.u32 %rd170, %r923, 2;
add.s64 %rd171, %rd12, %rd170;
ld.global.u16 %r1787, [%rd171];
setp.ne.s32 %p95, %r1720, 0;
@%p95 bra $L__BB27_111;
add.s32 %r130, %r1834, -2;
setp.eq.s32 %p96, %r130, -1;
selp.b32 %r1787, %r1787, 0, %p96;
setp.gt.s32 %p97, %r1834, 1;
mov.u32 %r1834, %r130;
@%p97 bra $L__BB27_111;
setp.ne.s32 %p98, %r1838, 0;
@%p98 bra $L__BB27_110;
mov.u32 %r1838, 0;
$L__BB27_65:
setp.gt.u32 %p99, %r1838, 7;
@%p99 bra $L__BB27_110;
cvt.u64.u32 %rd20, %r1837;
mul.wide.u32 %rd172, %r1837, 4;
add.s64 %rd174, %rd133, %rd172;
ld.global.nc.u32 %r136, [%rd174];
and.b16 %rs702, %rs1048, 255;
setp.ne.s16 %p100, %rs702, 0;
@%p100 bra $L__BB27_70;
setp.eq.s32 %p101, %r1747, 0;
mov.u16 %rs1005, 255;
@%p101 bra $L__BB27_69;
cvt.u64.u32 %rd175, %r1746;
add.s64 %rd176, %rd175, %rd3;
add.s64 %rd177, %rd1, %rd176;
ld.global.u8 %rs1005, [%rd177];
$L__BB27_69:
setp.ne.s32 %p103, %r1747, 0;
selp.u32 %r925, 1, 0, %p103;
add.s32 %r1746, %r1746, %r925;
add.s32 %r926, %r1747, -1;
selp.b32 %r1747, 0, %r926, %p101;
setp.eq.s32 %p104, %r1747, 0;
or.b16 %rs704, %rs1005, 15;
selp.b16 %rs1014, %rs704, %rs1005, %p104;
and.b16 %rs705, %rs1014, 255;
mov.u16 %rs706, 8;
sub.s16 %rs1048, %rs706, %rs1013;
setp.eq.s16 %p105, %rs705, 255;
selp.u16 %rs1013, 1, 0, %p105;
$L__BB27_70:
add.s16 %rs1048, %rs1048, -1;
cvt.u32.u16 %r927, %rs1048;
and.b32 %r928, %r927, 255;
mov.u32 %r929, 1;
shl.b32 %r930, %r929, %r928;
cvt.u32.u16 %r931, %rs1014;
and.b32 %r932, %r930, %r931;
and.b32 %r141, %r932, 255;
setp.eq.s32 %p106, %r141, 0;
@%p106 bra $L__BB27_72;
add.s32 %r933, %r1837, 1;
min.u32 %r1776, %r933, 12;
mov.u32 %r934, -1;
shl.b32 %r935, %r934, %r136;
shl.b32 %r936, %r935, 1;
xor.b32 %r1777, %r936, -2;
bra.uni $L__BB27_109;
$L__BB27_72:
add.s64 %rd178, %rd20, -3;
setp.gt.u64 %p107, %rd178, 9;
mov.u32 %r1773, 0;
@%p107 bra $L__BB27_108;
max.u32 %r144, %r136, 1;
add.s32 %r940, %r144, -1;
and.b32 %r145, %r144, 3;
setp.lt.u32 %p108, %r940, 3;
mov.u32 %r1773, 0;
@%p108 bra $L__BB27_92;
sub.s32 %r1745, %r144, %r145;
mov.u32 %r1773, 0;
$L__BB27_75:
and.b16 %rs708, %rs1048, 255;
setp.ne.s16 %p109, %rs708, 0;
@%p109 bra $L__BB27_79;
setp.eq.s32 %p110, %r1747, 0;
mov.u16 %rs1012, 255;
@%p110 bra $L__BB27_78;
cvt.u64.u32 %rd179, %r1746;
add.s64 %rd180, %rd179, %rd3;
add.s64 %rd181, %rd1, %rd180;
ld.global.u8 %rs1012, [%rd181];
$L__BB27_78:
setp.ne.s32 %p112, %r1747, 0;
selp.u32 %r942, 1, 0, %p112;
add.s32 %r1746, %r1746, %r942;
add.s32 %r943, %r1747, -1;
selp.b32 %r1747, 0, %r943, %p110;
setp.eq.s32 %p113, %r1747, 0;
or.b16 %rs710, %rs1012, 15;
selp.b16 %rs1014, %rs710, %rs1012, %p113;
and.b16 %rs711, %rs1014, 255;
mov.u16 %rs712, 8;
sub.s16 %rs1048, %rs712, %rs1013;
setp.eq.s16 %p114, %rs711, 255;
selp.u16 %rs1013, 1, 0, %p114;
$L__BB27_79:
add.s16 %rs1019, %rs1048, -1;
and.b16 %rs713, %rs1019, 255;
cvt.u32.u16 %r944, %rs1019;
and.b32 %r945, %r944, 255;
cvt.u32.u16 %r946, %rs1014;
and.b32 %r1751, %r946, 255;
shr.u32 %r947, %r1751, %r945;
and.b32 %r948, %r947, 1;
bfi.b32 %r156, %r1773, %r948, 1, 31;
setp.ne.s16 %p115, %rs713, 0;
@%p115 bra $L__BB27_83;
setp.eq.s32 %p116, %r1747, 0;
mov.u16 %rs1016, 255;
@%p116 bra $L__BB27_82;
cvt.u64.u32 %rd182, %r1746;
add.s64 %rd183, %rd182, %rd3;
add.s64 %rd184, %rd1, %rd183;
ld.global.u8 %rs1016, [%rd184];
$L__BB27_82:
setp.ne.s32 %p118, %r1747, 0;
selp.u32 %r949, 1, 0, %p118;
add.s32 %r1746, %r1746, %r949;
add.s32 %r950, %r1747, -1;
selp.b32 %r1747, 0, %r950, %p116;
setp.eq.s32 %p119, %r1747, 0;
or.b16 %rs715, %rs1016, 15;
selp.b16 %rs1014, %rs715, %rs1016, %p119;
and.b16 %rs716, %rs1014, 255;
mov.u16 %rs717, 8;
sub.s16 %rs1019, %rs717, %rs1013;
setp.eq.s16 %p120, %rs716, 255;
selp.u16 %rs1013, 1, 0, %p120;
cvt.u32.u16 %r951, %rs1014;
and.b32 %r1751, %r951, 255;
$L__BB27_83:
add.s16 %rs1023, %rs1019, -1;
and.b16 %rs718, %rs1023, 255;
cvt.u32.u16 %r952, %rs1023;
and.b32 %r953, %r952, 255;
shr.u32 %r954, %r1751, %r953;
and.b32 %r955, %r954, 1;
bfi.b32 %r163, %r156, %r955, 1, 31;
setp.ne.s16 %p121, %rs718, 0;
@%p121 bra $L__BB27_87;
setp.eq.s32 %p122, %r1747, 0;
mov.u16 %rs1020, 255;
@%p122 bra $L__BB27_86;
cvt.u64.u32 %rd185, %r1746;
add.s64 %rd186, %rd185, %rd3;
add.s64 %rd187, %rd1, %rd186;
ld.global.u8 %rs1020, [%rd187];
$L__BB27_86:
setp.ne.s32 %p124, %r1747, 0;
selp.u32 %r956, 1, 0, %p124;
add.s32 %r1746, %r1746, %r956;
add.s32 %r957, %r1747, -1;
selp.b32 %r1747, 0, %r957, %p122;
setp.eq.s32 %p125, %r1747, 0;
or.b16 %rs720, %rs1020, 15;
selp.b16 %rs1014, %rs720, %rs1020, %p125;
and.b16 %rs721, %rs1014, 255;
mov.u16 %rs722, 8;
sub.s16 %rs1023, %rs722, %rs1013;
setp.eq.s16 %p126, %rs721, 255;
selp.u16 %rs1013, 1, 0, %p126;
cvt.u32.u16 %r958, %rs1014;
and.b32 %r1751, %r958, 255;
$L__BB27_87:
add.s16 %rs1027, %rs1023, -1;
and.b16 %rs723, %rs1027, 255;
cvt.u32.u16 %r959, %rs1027;
and.b32 %r960, %r959, 255;
shr.u32 %r961, %r1751, %r960;
and.b32 %r962, %r961, 1;
bfi.b32 %r170, %r163, %r962, 1, 31;
setp.ne.s16 %p127, %rs723, 0;
@%p127 bra $L__BB27_91;
setp.eq.s32 %p128, %r1747, 0;
mov.u16 %rs1024, 255;
@%p128 bra $L__BB27_90;
cvt.u64.u32 %rd188, %r1746;
add.s64 %rd189, %rd188, %rd3;
add.s64 %rd190, %rd1, %rd189;
ld.global.u8 %rs1024, [%rd190];
$L__BB27_90:
setp.ne.s32 %p130, %r1747, 0;
selp.u32 %r963, 1, 0, %p130;
add.s32 %r1746, %r1746, %r963;
add.s32 %r964, %r1747, -1;
selp.b32 %r1747, 0, %r964, %p128;
setp.eq.s32 %p131, %r1747, 0;
or.b16 %rs725, %rs1024, 15;
selp.b16 %rs1014, %rs725, %rs1024, %p131;
and.b16 %rs726, %rs1014, 255;
mov.u16 %rs727, 8;
sub.s16 %rs1027, %rs727, %rs1013;
setp.eq.s16 %p132, %rs726, 255;
selp.u16 %rs1013, 1, 0, %p132;
cvt.u32.u16 %r965, %rs1014;
and.b32 %r1751, %r965, 255;
$L__BB27_91:
add.s16 %rs1048, %rs1027, -1;
cvt.u32.u16 %r966, %rs1048;
and.b32 %r967, %r966, 255;
shr.u32 %r968, %r1751, %r967;
and.b32 %r969, %r968, 1;
bfi.b32 %r1773, %r170, %r969, 1, 31;
add.s32 %r1745, %r1745, -4;
setp.ne.s32 %p133, %r1745, 0;
@%p133 bra $L__BB27_75;
$L__BB27_92:
setp.eq.s32 %p134, %r145, 0;
@%p134 bra $L__BB27_108;
and.b16 %rs728, %rs1048, 255;
setp.ne.s16 %p135, %rs728, 0;
@%p135 bra $L__BB27_97;
setp.eq.s32 %p136, %r1747, 0;
mov.u16 %rs1034, 255;
@%p136 bra $L__BB27_96;
cvt.u64.u32 %rd191, %r1746;
add.s64 %rd192, %rd191, %rd3;
add.s64 %rd193, %rd1, %rd192;
ld.global.u8 %rs1034, [%rd193];
$L__BB27_96:
setp.ne.s32 %p138, %r1747, 0;
selp.u32 %r970, 1, 0, %p138;
add.s32 %r1746, %r1746, %r970;
add.s32 %r971, %r1747, -1;
selp.b32 %r1747, 0, %r971, %p136;
setp.eq.s32 %p139, %r1747, 0;
or.b16 %rs730, %rs1034, 15;
selp.b16 %rs1014, %rs730, %rs1034, %p139;
and.b16 %rs731, %rs1014, 255;
mov.u16 %rs732, 8;
sub.s16 %rs1048, %rs732, %rs1013;
setp.eq.s16 %p140, %rs731, 255;
selp.u16 %rs1013, 1, 0, %p140;
$L__BB27_97:
add.s16 %rs1048, %rs1048, -1;
cvt.u32.u16 %r972, %rs1048;
and.b32 %r973, %r972, 255;
cvt.u32.u16 %r974, %rs1014;
and.b32 %r1768, %r974, 255;
shr.u32 %r975, %r1768, %r973;
and.b32 %r976, %r975, 1;
bfi.b32 %r1773, %r1773, %r976, 1, 31;
setp.eq.s32 %p141, %r145, 1;
@%p141 bra $L__BB27_108;
and.b16 %rs733, %rs1048, 255;
setp.ne.s16 %p142, %rs733, 0;
@%p142 bra $L__BB27_102;
setp.eq.s32 %p143, %r1747, 0;
mov.u16 %rs1038, 255;
@%p143 bra $L__BB27_101;
cvt.u64.u32 %rd194, %r1746;
add.s64 %rd195, %rd194, %rd3;
add.s64 %rd196, %rd1, %rd195;
ld.global.u8 %rs1038, [%rd196];
$L__BB27_101:
setp.ne.s32 %p145, %r1747, 0;
selp.u32 %r977, 1, 0, %p145;
add.s32 %r1746, %r1746, %r977;
add.s32 %r978, %r1747, -1;
selp.b32 %r1747, 0, %r978, %p143;
setp.eq.s32 %p146, %r1747, 0;
or.b16 %rs735, %rs1038, 15;
selp.b16 %rs1014, %rs735, %rs1038, %p146;
and.b16 %rs736, %rs1014, 255;
mov.u16 %rs737, 8;
sub.s16 %rs1048, %rs737, %rs1013;
setp.eq.s16 %p147, %rs736, 255;
selp.u16 %rs1013, 1, 0, %p147;
cvt.u32.u16 %r979, %rs1014;
and.b32 %r1768, %r979, 255;
$L__BB27_102:
add.s16 %rs1048, %rs1048, -1;
cvt.u32.u16 %r980, %rs1048;
and.b32 %r981, %r980, 255;
shr.u32 %r982, %r1768, %r981;
and.b32 %r983, %r982, 1;
bfi.b32 %r1773, %r1773, %r983, 1, 31;
setp.eq.s32 %p148, %r145, 2;
@%p148 bra $L__BB27_108;
and.b16 %rs738, %rs1048, 255;
setp.ne.s16 %p149, %rs738, 0;
@%p149 bra $L__BB27_107;
setp.eq.s32 %p150, %r1747, 0;
mov.u16 %rs1042, 255;
@%p150 bra $L__BB27_106;
cvt.u64.u32 %rd197, %r1746;
add.s64 %rd198, %rd197, %rd3;
add.s64 %rd199, %rd1, %rd198;
ld.global.u8 %rs1042, [%rd199];
$L__BB27_106:
setp.ne.s32 %p152, %r1747, 0;
selp.u32 %r984, 1, 0, %p152;
add.s32 %r1746, %r1746, %r984;
add.s32 %r985, %r1747, -1;
selp.b32 %r1747, 0, %r985, %p150;
setp.eq.s32 %p153, %r1747, 0;
or.b16 %rs740, %rs1042, 15;
selp.b16 %rs1014, %rs740, %rs1042, %p153;
and.b16 %rs741, %rs1014, 255;
mov.u16 %rs742, 8;
sub.s16 %rs1048, %rs742, %rs1013;
setp.eq.s16 %p154, %rs741, 255;
selp.u16 %rs1013, 1, 0, %p154;
cvt.u32.u16 %r986, %rs1014;
and.b32 %r1768, %r986, 255;
$L__BB27_107:
add.s16 %rs1048, %rs1048, -1;
cvt.u32.u16 %r987, %rs1048;
and.b32 %r988, %r987, 255;
shr.u32 %r989, %r1768, %r988;
and.b32 %r990, %r989, 1;
bfi.b32 %r1773, %r1773, %r990, 1, 31;
$L__BB27_108:
shl.b32 %r991, %r1773, 1;
or.b32 %r1777, %r991, 1;
add.s32 %r992, %r1837, -1;
setp.eq.s32 %p155, %r1837, 0;
selp.b32 %r1776, 0, %r992, %p155;
$L__BB27_109:
mul.lo.s32 %r993, %r1838, 7;
cvt.u64.u32 %rd200, %r1777;
shl.b64 %rd201, %rd200, %r993;
or.b64 %rd465, %rd201, %rd465;
setp.ne.s32 %p156, %r1837, 12;
setp.ne.s32 %p157, %r141, 0;
or.pred %p158, %p156, %p157;
add.s32 %r1838, %r1838, 1;
setp.lt.u32 %p159, %r1838, 8;
or.pred %p160, %p159, %p158;
mov.u32 %r1837, %r1776;
@%p160 bra $L__BB27_65;
$L__BB27_110:
cvt.u32.u64 %r994, %rd465;
and.b32 %r1834, %r994, 127;
shr.u64 %rd465, %rd465, 7;
add.s32 %r1838, %r1838, -1;
$L__BB27_111:
mul.wide.u32 %rd202, %r1721, 2;
add.s64 %rd25, %rd13, %rd202;
st.local.u16 [%rd25], %r1787;
shl.b32 %r995, %r1787, 3;
and.b32 %r996, %r995, 128;
shl.b32 %r997, %r1787, 2;
and.b32 %r998, %r997, 896;
or.b32 %r999, %r996, %r998;
and.b32 %r1000, %r1787, 7;
shr.u64 %rd26, %rd474, %r1000;
sub.s32 %r227, %r1917, %r1000;
cvt.u32.u64 %r1001, %rd26;
and.b32 %r1002, %r1001, 127;
or.b32 %r1003, %r1002, %r999;
mul.wide.u32 %rd203, %r1003, 2;
add.s64 %rd204, %rd12, %rd203;
ld.global.u16 %r1839, [%rd204];
setp.ne.s32 %p161, %r999, 0;
add.s32 %r229, %r1721, 2;
setp.ge.u32 %p162, %r229, %r820;
or.pred %p163, %p162, %p161;
@%p163 bra $L__BB27_161;
add.s32 %r230, %r1834, -2;
setp.eq.s32 %p164, %r230, -1;
selp.b32 %r1839, %r1839, 0, %p164;
setp.gt.s32 %p165, %r1834, 1;
mov.u32 %r1834, %r230;
@%p165 bra $L__BB27_161;
setp.ne.s32 %p166, %r1838, 0;
@%p166 bra $L__BB27_160;
mov.u32 %r1838, 0;
$L__BB27_115:
setp.gt.u32 %p167, %r1838, 7;
@%p167 bra $L__BB27_160;
cvt.u64.u32 %rd28, %r1837;
mul.wide.u32 %rd205, %r1837, 4;
add.s64 %rd207, %rd133, %rd205;
ld.global.nc.u32 %r236, [%rd207];
and.b16 %rs743, %rs1048, 255;
setp.ne.s16 %p168, %rs743, 0;
@%p168 bra $L__BB27_120;
setp.eq.s32 %p169, %r1747, 0;
mov.u16 %rs1061, 255;
@%p169 bra $L__BB27_119;
cvt.u64.u32 %rd208, %r1746;
add.s64 %rd209, %rd208, %rd3;
add.s64 %rd210, %rd1, %rd209;
ld.global.u8 %rs1061, [%rd210];
$L__BB27_119:
setp.ne.s32 %p171, %r1747, 0;
selp.u32 %r1005, 1, 0, %p171;
add.s32 %r1746, %r1746, %r1005;
add.s32 %r1006, %r1747, -1;
selp.b32 %r1747, 0, %r1006, %p169;
setp.eq.s32 %p172, %r1747, 0;
or.b16 %rs745, %rs1061, 15;
selp.b16 %rs1014, %rs745, %rs1061, %p172;
and.b16 %rs746, %rs1014, 255;
mov.u16 %rs747, 8;
sub.s16 %rs1048, %rs747, %rs1013;
setp.eq.s16 %p173, %rs746, 255;
selp.u16 %rs1013, 1, 0, %p173;
$L__BB27_120:
add.s16 %rs1048, %rs1048, -1;
cvt.u32.u16 %r1007, %rs1048;
and.b32 %r1008, %r1007, 255;
mov.u32 %r1009, 1;
shl.b32 %r1010, %r1009, %r1008;
cvt.u32.u16 %r1011, %rs1014;
and.b32 %r1012, %r1010, %r1011;
and.b32 %r241, %r1012, 255;
setp.eq.s32 %p174, %r241, 0;
@%p174 bra $L__BB27_122;
add.s32 %r1013, %r1837, 1;
min.u32 %r1828, %r1013, 12;
mov.u32 %r1014, -1;
shl.b32 %r1015, %r1014, %r236;
shl.b32 %r1016, %r1015, 1;
xor.b32 %r1829, %r1016, -2;
bra.uni $L__BB27_159;
$L__BB27_122:
add.s64 %rd211, %rd28, -3;
setp.gt.u64 %p175, %rd211, 9;
mov.u32 %r1825, 0;
@%p175 bra $L__BB27_158;
max.u32 %r244, %r236, 1;
add.s32 %r1020, %r244, -1;
and.b32 %r245, %r244, 3;
setp.lt.u32 %p176, %r1020, 3;
mov.u32 %r1825, 0;
@%p176 bra $L__BB27_142;
sub.s32 %r1797, %r244, %r245;
mov.u32 %r1825, 0;
$L__BB27_125:
and.b16 %rs749, %rs1048, 255;
setp.ne.s16 %p177, %rs749, 0;
@%p177 bra $L__BB27_129;
setp.eq.s32 %p178, %r1747, 0;
mov.u16 %rs1068, 255;
@%p178 bra $L__BB27_128;
cvt.u64.u32 %rd212, %r1746;
add.s64 %rd213, %rd212, %rd3;
add.s64 %rd214, %rd1, %rd213;
ld.global.u8 %rs1068, [%rd214];
$L__BB27_128:
setp.ne.s32 %p180, %r1747, 0;
selp.u32 %r1022, 1, 0, %p180;
add.s32 %r1746, %r1746, %r1022;
add.s32 %r1023, %r1747, -1;
selp.b32 %r1747, 0, %r1023, %p178;
setp.eq.s32 %p181, %r1747, 0;
or.b16 %rs751, %rs1068, 15;
selp.b16 %rs1014, %rs751, %rs1068, %p181;
and.b16 %rs752, %rs1014, 255;
mov.u16 %rs753, 8;
sub.s16 %rs1048, %rs753, %rs1013;
setp.eq.s16 %p182, %rs752, 255;
selp.u16 %rs1013, 1, 0, %p182;
$L__BB27_129:
add.s16 %rs1075, %rs1048, -1;
and.b16 %rs754, %rs1075, 255;
cvt.u32.u16 %r1024, %rs1075;
and.b32 %r1025, %r1024, 255;
cvt.u32.u16 %r1026, %rs1014;
and.b32 %r1803, %r1026, 255;
shr.u32 %r1027, %r1803, %r1025;
and.b32 %r1028, %r1027, 1;
bfi.b32 %r256, %r1825, %r1028, 1, 31;
setp.ne.s16 %p183, %rs754, 0;
@%p183 bra $L__BB27_133;
setp.eq.s32 %p184, %r1747, 0;
mov.u16 %rs1072, 255;
@%p184 bra $L__BB27_132;
cvt.u64.u32 %rd215, %r1746;
add.s64 %rd216, %rd215, %rd3;
add.s64 %rd217, %rd1, %rd216;
ld.global.u8 %rs1072, [%rd217];
$L__BB27_132:
setp.ne.s32 %p186, %r1747, 0;
selp.u32 %r1029, 1, 0, %p186;
add.s32 %r1746, %r1746, %r1029;
add.s32 %r1030, %r1747, -1;
selp.b32 %r1747, 0, %r1030, %p184;
setp.eq.s32 %p187, %r1747, 0;
or.b16 %rs756, %rs1072, 15;
selp.b16 %rs1014, %rs756, %rs1072, %p187;
and.b16 %rs757, %rs1014, 255;
mov.u16 %rs758, 8;
sub.s16 %rs1075, %rs758, %rs1013;
setp.eq.s16 %p188, %rs757, 255;
selp.u16 %rs1013, 1, 0, %p188;
cvt.u32.u16 %r1031, %rs1014;
and.b32 %r1803, %r1031, 255;
$L__BB27_133:
add.s16 %rs1079, %rs1075, -1;
and.b16 %rs759, %rs1079, 255;
cvt.u32.u16 %r1032, %rs1079;
and.b32 %r1033, %r1032, 255;
shr.u32 %r1034, %r1803, %r1033;
and.b32 %r1035, %r1034, 1;
bfi.b32 %r263, %r256, %r1035, 1, 31;
setp.ne.s16 %p189, %rs759, 0;
@%p189 bra $L__BB27_137;
setp.eq.s32 %p190, %r1747, 0;
mov.u16 %rs1076, 255;
@%p190 bra $L__BB27_136;
cvt.u64.u32 %rd218, %r1746;
add.s64 %rd219, %rd218, %rd3;
add.s64 %rd220, %rd1, %rd219;
ld.global.u8 %rs1076, [%rd220];
$L__BB27_136:
setp.ne.s32 %p192, %r1747, 0;
selp.u32 %r1036, 1, 0, %p192;
add.s32 %r1746, %r1746, %r1036;
add.s32 %r1037, %r1747, -1;
selp.b32 %r1747, 0, %r1037, %p190;
setp.eq.s32 %p193, %r1747, 0;
or.b16 %rs761, %rs1076, 15;
selp.b16 %rs1014, %rs761, %rs1076, %p193;
and.b16 %rs762, %rs1014, 255;
mov.u16 %rs763, 8;
sub.s16 %rs1079, %rs763, %rs1013;
setp.eq.s16 %p194, %rs762, 255;
selp.u16 %rs1013, 1, 0, %p194;
cvt.u32.u16 %r1038, %rs1014;
and.b32 %r1803, %r1038, 255;
$L__BB27_137:
add.s16 %rs1083, %rs1079, -1;
and.b16 %rs764, %rs1083, 255;
cvt.u32.u16 %r1039, %rs1083;
and.b32 %r1040, %r1039, 255;
shr.u32 %r1041, %r1803, %r1040;
and.b32 %r1042, %r1041, 1;
bfi.b32 %r270, %r263, %r1042, 1, 31;
setp.ne.s16 %p195, %rs764, 0;
@%p195 bra $L__BB27_141;
setp.eq.s32 %p196, %r1747, 0;
mov.u16 %rs1080, 255;
@%p196 bra $L__BB27_140;
cvt.u64.u32 %rd221, %r1746;
add.s64 %rd222, %rd221, %rd3;
add.s64 %rd223, %rd1, %rd222;
ld.global.u8 %rs1080, [%rd223];
$L__BB27_140:
setp.ne.s32 %p198, %r1747, 0;
selp.u32 %r1043, 1, 0, %p198;
add.s32 %r1746, %r1746, %r1043;
add.s32 %r1044, %r1747, -1;
selp.b32 %r1747, 0, %r1044, %p196;
setp.eq.s32 %p199, %r1747, 0;
or.b16 %rs766, %rs1080, 15;
selp.b16 %rs1014, %rs766, %rs1080, %p199;
and.b16 %rs767, %rs1014, 255;
mov.u16 %rs768, 8;
sub.s16 %rs1083, %rs768, %rs1013;
setp.eq.s16 %p200, %rs767, 255;
selp.u16 %rs1013, 1, 0, %p200;
cvt.u32.u16 %r1045, %rs1014;
and.b32 %r1803, %r1045, 255;
$L__BB27_141:
add.s16 %rs1048, %rs1083, -1;
cvt.u32.u16 %r1046, %rs1048;
and.b32 %r1047, %r1046, 255;
shr.u32 %r1048, %r1803, %r1047;
and.b32 %r1049, %r1048, 1;
bfi.b32 %r1825, %r270, %r1049, 1, 31;
add.s32 %r1797, %r1797, -4;
setp.ne.s32 %p201, %r1797, 0;
@%p201 bra $L__BB27_125;
$L__BB27_142:
setp.eq.s32 %p202, %r245, 0;
@%p202 bra $L__BB27_158;
and.b16 %rs769, %rs1048, 255;
setp.ne.s16 %p203, %rs769, 0;
@%p203 bra $L__BB27_147;
setp.eq.s32 %p204, %r1747, 0;
mov.u16 %rs1090, 255;
@%p204 bra $L__BB27_146;
cvt.u64.u32 %rd224, %r1746;
add.s64 %rd225, %rd224, %rd3;
add.s64 %rd226, %rd1, %rd225;
ld.global.u8 %rs1090, [%rd226];
$L__BB27_146:
setp.ne.s32 %p206, %r1747, 0;
selp.u32 %r1050, 1, 0, %p206;
add.s32 %r1746, %r1746, %r1050;
add.s32 %r1051, %r1747, -1;
selp.b32 %r1747, 0, %r1051, %p204;
setp.eq.s32 %p207, %r1747, 0;
or.b16 %rs771, %rs1090, 15;
selp.b16 %rs1014, %rs771, %rs1090, %p207;
and.b16 %rs772, %rs1014, 255;
mov.u16 %rs773, 8;
sub.s16 %rs1048, %rs773, %rs1013;
setp.eq.s16 %p208, %rs772, 255;
selp.u16 %rs1013, 1, 0, %p208;
$L__BB27_147:
add.s16 %rs1048, %rs1048, -1;
cvt.u32.u16 %r1052, %rs1048;
and.b32 %r1053, %r1052, 255;
cvt.u32.u16 %r1054, %rs1014;
and.b32 %r1820, %r1054, 255;
shr.u32 %r1055, %r1820, %r1053;
and.b32 %r1056, %r1055, 1;
bfi.b32 %r1825, %r1825, %r1056, 1, 31;
setp.eq.s32 %p209, %r245, 1;
@%p209 bra $L__BB27_158;
and.b16 %rs774, %rs1048, 255;
setp.ne.s16 %p210, %rs774, 0;
@%p210 bra $L__BB27_152;
setp.eq.s32 %p211, %r1747, 0;
mov.u16 %rs1094, 255;
@%p211 bra $L__BB27_151;
cvt.u64.u32 %rd227, %r1746;
add.s64 %rd228, %rd227, %rd3;
add.s64 %rd229, %rd1, %rd228;
ld.global.u8 %rs1094, [%rd229];
$L__BB27_151:
setp.ne.s32 %p213, %r1747, 0;
selp.u32 %r1057, 1, 0, %p213;
add.s32 %r1746, %r1746, %r1057;
add.s32 %r1058, %r1747, -1;
selp.b32 %r1747, 0, %r1058, %p211;
setp.eq.s32 %p214, %r1747, 0;
or.b16 %rs776, %rs1094, 15;
selp.b16 %rs1014, %rs776, %rs1094, %p214;
and.b16 %rs777, %rs1014, 255;
mov.u16 %rs778, 8;
sub.s16 %rs1048, %rs778, %rs1013;
setp.eq.s16 %p215, %rs777, 255;
selp.u16 %rs1013, 1, 0, %p215;
cvt.u32.u16 %r1059, %rs1014;
and.b32 %r1820, %r1059, 255;
$L__BB27_152:
add.s16 %rs1048, %rs1048, -1;
cvt.u32.u16 %r1060, %rs1048;
and.b32 %r1061, %r1060, 255;
shr.u32 %r1062, %r1820, %r1061;
and.b32 %r1063, %r1062, 1;
bfi.b32 %r1825, %r1825, %r1063, 1, 31;
setp.eq.s32 %p216, %r245, 2;
@%p216 bra $L__BB27_158;
and.b16 %rs779, %rs1048, 255;
setp.ne.s16 %p217, %rs779, 0;
@%p217 bra $L__BB27_157;
setp.eq.s32 %p218, %r1747, 0;
mov.u16 %rs1098, 255;
@%p218 bra $L__BB27_156;
cvt.u64.u32 %rd230, %r1746;
add.s64 %rd231, %rd230, %rd3;
add.s64 %rd232, %rd1, %rd231;
ld.global.u8 %rs1098, [%rd232];
$L__BB27_156:
setp.ne.s32 %p220, %r1747, 0;
selp.u32 %r1064, 1, 0, %p220;
add.s32 %r1746, %r1746, %r1064;
add.s32 %r1065, %r1747, -1;
selp.b32 %r1747, 0, %r1065, %p218;
setp.eq.s32 %p221, %r1747, 0;
or.b16 %rs781, %rs1098, 15;
selp.b16 %rs1014, %rs781, %rs1098, %p221;
and.b16 %rs782, %rs1014, 255;
mov.u16 %rs783, 8;
sub.s16 %rs1048, %rs783, %rs1013;
setp.eq.s16 %p222, %rs782, 255;
selp.u16 %rs1013, 1, 0, %p222;
cvt.u32.u16 %r1066, %rs1014;
and.b32 %r1820, %r1066, 255;
$L__BB27_157:
add.s16 %rs1048, %rs1048, -1;
cvt.u32.u16 %r1067, %rs1048;
and.b32 %r1068, %r1067, 255;
shr.u32 %r1069, %r1820, %r1068;
and.b32 %r1070, %r1069, 1;
bfi.b32 %r1825, %r1825, %r1070, 1, 31;
$L__BB27_158:
shl.b32 %r1071, %r1825, 1;
or.b32 %r1829, %r1071, 1;
add.s32 %r1072, %r1837, -1;
setp.eq.s32 %p223, %r1837, 0;
selp.b32 %r1828, 0, %r1072, %p223;
$L__BB27_159:
mul.lo.s32 %r1073, %r1838, 7;
cvt.u64.u32 %rd233, %r1829;
shl.b64 %rd234, %rd233, %r1073;
or.b64 %rd465, %rd234, %rd465;
setp.ne.s32 %p224, %r1837, 12;
setp.ne.s32 %p225, %r241, 0;
or.pred %p226, %p224, %p225;
add.s32 %r1838, %r1838, 1;
setp.lt.u32 %p227, %r1838, 8;
or.pred %p228, %p227, %p226;
mov.u32 %r1837, %r1828;
@%p228 bra $L__BB27_115;
$L__BB27_160:
cvt.u32.u64 %r1074, %rd465;
and.b32 %r1834, %r1074, 127;
shr.u64 %rd465, %rd465, 7;
add.s32 %r1838, %r1838, -1;
$L__BB27_161:
setp.lt.u32 %p229, %r229, %r820;
selp.b32 %r327, %r1839, 0, %p229;
st.local.u16 [%rd25+4], %r327;
and.b32 %r1076, %r995, 64;
shl.b32 %r1077, %r327, 4;
and.b32 %r1078, %r1077, 128;
or.b32 %r1891, %r1078, %r1076;
setp.ne.s32 %p230, %r1891, 192;
@%p230 bra $L__BB27_211;
add.s32 %r329, %r1834, -2;
setp.eq.s32 %p231, %r329, -1;
selp.b32 %r1891, 256, 192, %p231;
setp.gt.s32 %p232, %r1834, 1;
mov.u32 %r1834, %r329;
@%p232 bra $L__BB27_211;
setp.ne.s32 %p233, %r1838, 0;
@%p233 bra $L__BB27_210;
mov.u32 %r1838, 0;
$L__BB27_165:
setp.gt.u32 %p234, %r1838, 7;
@%p234 bra $L__BB27_210;
cvt.u64.u32 %rd34, %r1837;
mul.wide.u32 %rd235, %r1837, 4;
add.s64 %rd237, %rd133, %rd235;
ld.global.nc.u32 %r335, [%rd237];
and.b16 %rs784, %rs1048, 255;
setp.ne.s16 %p235, %rs784, 0;
@%p235 bra $L__BB27_170;
setp.eq.s32 %p236, %r1747, 0;
mov.u16 %rs1117, 255;
@%p236 bra $L__BB27_169;
cvt.u64.u32 %rd238, %r1746;
add.s64 %rd239, %rd238, %rd3;
add.s64 %rd240, %rd1, %rd239;
ld.global.u8 %rs1117, [%rd240];
$L__BB27_169:
setp.ne.s32 %p238, %r1747, 0;
selp.u32 %r1080, 1, 0, %p238;
add.s32 %r1746, %r1746, %r1080;
add.s32 %r1081, %r1747, -1;
selp.b32 %r1747, 0, %r1081, %p236;
setp.eq.s32 %p239, %r1747, 0;
or.b16 %rs786, %rs1117, 15;
selp.b16 %rs1014, %rs786, %rs1117, %p239;
and.b16 %rs787, %rs1014, 255;
mov.u16 %rs788, 8;
sub.s16 %rs1048, %rs788, %rs1013;
setp.eq.s16 %p240, %rs787, 255;
selp.u16 %rs1013, 1, 0, %p240;
$L__BB27_170:
add.s16 %rs1048, %rs1048, -1;
cvt.u32.u16 %r1082, %rs1048;
and.b32 %r1083, %r1082, 255;
mov.u32 %r1084, 1;
shl.b32 %r1085, %r1084, %r1083;
cvt.u32.u16 %r1086, %rs1014;
and.b32 %r1087, %r1085, %r1086;
and.b32 %r340, %r1087, 255;
setp.eq.s32 %p241, %r340, 0;
@%p241 bra $L__BB27_172;
add.s32 %r1088, %r1837, 1;
min.u32 %r1880, %r1088, 12;
mov.u32 %r1089, -1;
shl.b32 %r1090, %r1089, %r335;
shl.b32 %r1091, %r1090, 1;
xor.b32 %r1881, %r1091, -2;
bra.uni $L__BB27_209;
$L__BB27_172:
add.s64 %rd241, %rd34, -3;
setp.gt.u64 %p242, %rd241, 9;
mov.u32 %r1877, 0;
@%p242 bra $L__BB27_208;
max.u32 %r343, %r335, 1;
add.s32 %r1095, %r343, -1;
and.b32 %r344, %r343, 3;
setp.lt.u32 %p243, %r1095, 3;
mov.u32 %r1877, 0;
@%p243 bra $L__BB27_192;
sub.s32 %r1849, %r343, %r344;
mov.u32 %r1877, 0;
$L__BB27_175:
and.b16 %rs790, %rs1048, 255;
setp.ne.s16 %p244, %rs790, 0;
@%p244 bra $L__BB27_179;
setp.eq.s32 %p245, %r1747, 0;
mov.u16 %rs1124, 255;
@%p245 bra $L__BB27_178;
cvt.u64.u32 %rd242, %r1746;
add.s64 %rd243, %rd242, %rd3;
add.s64 %rd244, %rd1, %rd243;
ld.global.u8 %rs1124, [%rd244];
$L__BB27_178:
setp.ne.s32 %p247, %r1747, 0;
selp.u32 %r1097, 1, 0, %p247;
add.s32 %r1746, %r1746, %r1097;
add.s32 %r1098, %r1747, -1;
selp.b32 %r1747, 0, %r1098, %p245;
setp.eq.s32 %p248, %r1747, 0;
or.b16 %rs792, %rs1124, 15;
selp.b16 %rs1014, %rs792, %rs1124, %p248;
and.b16 %rs793, %rs1014, 255;
mov.u16 %rs794, 8;
sub.s16 %rs1048, %rs794, %rs1013;
setp.eq.s16 %p249, %rs793, 255;
selp.u16 %rs1013, 1, 0, %p249;
$L__BB27_179:
add.s16 %rs1131, %rs1048, -1;
and.b16 %rs795, %rs1131, 255;
cvt.u32.u16 %r1099, %rs1131;
and.b32 %r1100, %r1099, 255;
cvt.u32.u16 %r1101, %rs1014;
and.b32 %r1855, %r1101, 255;
shr.u32 %r1102, %r1855, %r1100;
and.b32 %r1103, %r1102, 1;
bfi.b32 %r355, %r1877, %r1103, 1, 31;
setp.ne.s16 %p250, %rs795, 0;
@%p250 bra $L__BB27_183;
setp.eq.s32 %p251, %r1747, 0;
mov.u16 %rs1128, 255;
@%p251 bra $L__BB27_182;
cvt.u64.u32 %rd245, %r1746;
add.s64 %rd246, %rd245, %rd3;
add.s64 %rd247, %rd1, %rd246;
ld.global.u8 %rs1128, [%rd247];
$L__BB27_182:
setp.ne.s32 %p253, %r1747, 0;
selp.u32 %r1104, 1, 0, %p253;
add.s32 %r1746, %r1746, %r1104;
add.s32 %r1105, %r1747, -1;
selp.b32 %r1747, 0, %r1105, %p251;
setp.eq.s32 %p254, %r1747, 0;
or.b16 %rs797, %rs1128, 15;
selp.b16 %rs1014, %rs797, %rs1128, %p254;
and.b16 %rs798, %rs1014, 255;
mov.u16 %rs799, 8;
sub.s16 %rs1131, %rs799, %rs1013;
setp.eq.s16 %p255, %rs798, 255;
selp.u16 %rs1013, 1, 0, %p255;
cvt.u32.u16 %r1106, %rs1014;
and.b32 %r1855, %r1106, 255;
$L__BB27_183:
add.s16 %rs1135, %rs1131, -1;
and.b16 %rs800, %rs1135, 255;
cvt.u32.u16 %r1107, %rs1135;
and.b32 %r1108, %r1107, 255;
shr.u32 %r1109, %r1855, %r1108;
and.b32 %r1110, %r1109, 1;
bfi.b32 %r362, %r355, %r1110, 1, 31;
setp.ne.s16 %p256, %rs800, 0;
@%p256 bra $L__BB27_187;
setp.eq.s32 %p257, %r1747, 0;
mov.u16 %rs1132, 255;
@%p257 bra $L__BB27_186;
cvt.u64.u32 %rd248, %r1746;
add.s64 %rd249, %rd248, %rd3;
add.s64 %rd250, %rd1, %rd249;
ld.global.u8 %rs1132, [%rd250];
$L__BB27_186:
setp.ne.s32 %p259, %r1747, 0;
selp.u32 %r1111, 1, 0, %p259;
add.s32 %r1746, %r1746, %r1111;
add.s32 %r1112, %r1747, -1;
selp.b32 %r1747, 0, %r1112, %p257;
setp.eq.s32 %p260, %r1747, 0;
or.b16 %rs802, %rs1132, 15;
selp.b16 %rs1014, %rs802, %rs1132, %p260;
and.b16 %rs803, %rs1014, 255;
mov.u16 %rs804, 8;
sub.s16 %rs1135, %rs804, %rs1013;
setp.eq.s16 %p261, %rs803, 255;
selp.u16 %rs1013, 1, 0, %p261;
cvt.u32.u16 %r1113, %rs1014;
and.b32 %r1855, %r1113, 255;
$L__BB27_187:
add.s16 %rs1139, %rs1135, -1;
and.b16 %rs805, %rs1139, 255;
cvt.u32.u16 %r1114, %rs1139;
and.b32 %r1115, %r1114, 255;
shr.u32 %r1116, %r1855, %r1115;
and.b32 %r1117, %r1116, 1;
bfi.b32 %r369, %r362, %r1117, 1, 31;
setp.ne.s16 %p262, %rs805, 0;
@%p262 bra $L__BB27_191;
setp.eq.s32 %p263, %r1747, 0;
mov.u16 %rs1136, 255;
@%p263 bra $L__BB27_190;
cvt.u64.u32 %rd251, %r1746;
add.s64 %rd252, %rd251, %rd3;
add.s64 %rd253, %rd1, %rd252;
ld.global.u8 %rs1136, [%rd253];
$L__BB27_190:
setp.ne.s32 %p265, %r1747, 0;
selp.u32 %r1118, 1, 0, %p265;
add.s32 %r1746, %r1746, %r1118;
add.s32 %r1119, %r1747, -1;
selp.b32 %r1747, 0, %r1119, %p263;
setp.eq.s32 %p266, %r1747, 0;
or.b16 %rs807, %rs1136, 15;
selp.b16 %rs1014, %rs807, %rs1136, %p266;
and.b16 %rs808, %rs1014, 255;
mov.u16 %rs809, 8;
sub.s16 %rs1139, %rs809, %rs1013;
setp.eq.s16 %p267, %rs808, 255;
selp.u16 %rs1013, 1, 0, %p267;
cvt.u32.u16 %r1120, %rs1014;
and.b32 %r1855, %r1120, 255;
$L__BB27_191:
add.s16 %rs1048, %rs1139, -1;
cvt.u32.u16 %r1121, %rs1048;
and.b32 %r1122, %r1121, 255;
shr.u32 %r1123, %r1855, %r1122;
and.b32 %r1124, %r1123, 1;
bfi.b32 %r1877, %r369, %r1124, 1, 31;
add.s32 %r1849, %r1849, -4;
setp.ne.s32 %p268, %r1849, 0;
@%p268 bra $L__BB27_175;
$L__BB27_192:
setp.eq.s32 %p269, %r344, 0;
@%p269 bra $L__BB27_208;
and.b16 %rs810, %rs1048, 255;
setp.ne.s16 %p270, %rs810, 0;
@%p270 bra $L__BB27_197;
setp.eq.s32 %p271, %r1747, 0;
mov.u16 %rs1146, 255;
@%p271 bra $L__BB27_196;
cvt.u64.u32 %rd254, %r1746;
add.s64 %rd255, %rd254, %rd3;
add.s64 %rd256, %rd1, %rd255;
ld.global.u8 %rs1146, [%rd256];
$L__BB27_196:
setp.ne.s32 %p273, %r1747, 0;
selp.u32 %r1125, 1, 0, %p273;
add.s32 %r1746, %r1746, %r1125;
add.s32 %r1126, %r1747, -1;
selp.b32 %r1747, 0, %r1126, %p271;
setp.eq.s32 %p274, %r1747, 0;
or.b16 %rs812, %rs1146, 15;
selp.b16 %rs1014, %rs812, %rs1146, %p274;
and.b16 %rs813, %rs1014, 255;
mov.u16 %rs814, 8;
sub.s16 %rs1048, %rs814, %rs1013;
setp.eq.s16 %p275, %rs813, 255;
selp.u16 %rs1013, 1, 0, %p275;
$L__BB27_197:
add.s16 %rs1048, %rs1048, -1;
cvt.u32.u16 %r1127, %rs1048;
and.b32 %r1128, %r1127, 255;
cvt.u32.u16 %r1129, %rs1014;
and.b32 %r1872, %r1129, 255;
shr.u32 %r1130, %r1872, %r1128;
and.b32 %r1131, %r1130, 1;
bfi.b32 %r1877, %r1877, %r1131, 1, 31;
setp.eq.s32 %p276, %r344, 1;
@%p276 bra $L__BB27_208;
and.b16 %rs815, %rs1048, 255;
setp.ne.s16 %p277, %rs815, 0;
@%p277 bra $L__BB27_202;
setp.eq.s32 %p278, %r1747, 0;
mov.u16 %rs1150, 255;
@%p278 bra $L__BB27_201;
cvt.u64.u32 %rd257, %r1746;
add.s64 %rd258, %rd257, %rd3;
add.s64 %rd259, %rd1, %rd258;
ld.global.u8 %rs1150, [%rd259];
$L__BB27_201:
setp.ne.s32 %p280, %r1747, 0;
selp.u32 %r1132, 1, 0, %p280;
add.s32 %r1746, %r1746, %r1132;
add.s32 %r1133, %r1747, -1;
selp.b32 %r1747, 0, %r1133, %p278;
setp.eq.s32 %p281, %r1747, 0;
or.b16 %rs817, %rs1150, 15;
selp.b16 %rs1014, %rs817, %rs1150, %p281;
and.b16 %rs818, %rs1014, 255;
mov.u16 %rs819, 8;
sub.s16 %rs1048, %rs819, %rs1013;
setp.eq.s16 %p282, %rs818, 255;
selp.u16 %rs1013, 1, 0, %p282;
cvt.u32.u16 %r1134, %rs1014;
and.b32 %r1872, %r1134, 255;
$L__BB27_202:
add.s16 %rs1048, %rs1048, -1;
cvt.u32.u16 %r1135, %rs1048;
and.b32 %r1136, %r1135, 255;
shr.u32 %r1137, %r1872, %r1136;
and.b32 %r1138, %r1137, 1;
bfi.b32 %r1877, %r1877, %r1138, 1, 31;
setp.eq.s32 %p283, %r344, 2;
@%p283 bra $L__BB27_208;
and.b16 %rs820, %rs1048, 255;
setp.ne.s16 %p284, %rs820, 0;
@%p284 bra $L__BB27_207;
setp.eq.s32 %p285, %r1747, 0;
mov.u16 %rs1154, 255;
@%p285 bra $L__BB27_206;
cvt.u64.u32 %rd260, %r1746;
add.s64 %rd261, %rd260, %rd3;
add.s64 %rd262, %rd1, %rd261;
ld.global.u8 %rs1154, [%rd262];
$L__BB27_206:
setp.ne.s32 %p287, %r1747, 0;
selp.u32 %r1139, 1, 0, %p287;
add.s32 %r1746, %r1746, %r1139;
add.s32 %r1140, %r1747, -1;
selp.b32 %r1747, 0, %r1140, %p285;
setp.eq.s32 %p288, %r1747, 0;
or.b16 %rs822, %rs1154, 15;
selp.b16 %rs1014, %rs822, %rs1154, %p288;
and.b16 %rs823, %rs1014, 255;
mov.u16 %rs824, 8;
sub.s16 %rs1048, %rs824, %rs1013;
setp.eq.s16 %p289, %rs823, 255;
selp.u16 %rs1013, 1, 0, %p289;
cvt.u32.u16 %r1141, %rs1014;
and.b32 %r1872, %r1141, 255;
$L__BB27_207:
add.s16 %rs1048, %rs1048, -1;
cvt.u32.u16 %r1142, %rs1048;
and.b32 %r1143, %r1142, 255;
shr.u32 %r1144, %r1872, %r1143;
and.b32 %r1145, %r1144, 1;
bfi.b32 %r1877, %r1877, %r1145, 1, 31;
$L__BB27_208:
shl.b32 %r1146, %r1877, 1;
or.b32 %r1881, %r1146, 1;
add.s32 %r1147, %r1837, -1;
setp.eq.s32 %p290, %r1837, 0;
selp.b32 %r1880, 0, %r1147, %p290;
$L__BB27_209:
mul.lo.s32 %r1148, %r1838, 7;
cvt.u64.u32 %rd263, %r1881;
shl.b64 %rd264, %rd263, %r1148;
or.b64 %rd465, %rd264, %rd465;
setp.ne.s32 %p291, %r1837, 12;
setp.ne.s32 %p292, %r340, 0;
or.pred %p293, %p291, %p292;
add.s32 %r1838, %r1838, 1;
setp.lt.u32 %p294, %r1838, 8;
or.pred %p295, %p294, %p293;
mov.u32 %r1837, %r1880;
@%p295 bra $L__BB27_165;
$L__BB27_210:
cvt.u32.u64 %r1149, %rd465;
and.b32 %r1834, %r1149, 127;
shr.u64 %rd465, %rd465, 7;
add.s32 %r1838, %r1838, -1;
$L__BB27_211:
and.b32 %r1150, %r327, 7;
shr.u64 %rd265, %rd26, %r1150;
cvt.u32.u64 %r1151, %rd265;
and.b32 %r1152, %r1151, 63;
add.s32 %r1153, %r1891, %r1152;
mul.wide.u32 %rd266, %r1153, 2;
add.s64 %rd267, %rd11, %rd266;
ld.global.u16 %r1154, [%rd267];
and.b32 %r1155, %r1154, 7;
shr.u64 %rd268, %rd265, %r1155;
sub.s32 %r1156, %r227, %r1150;
sub.s32 %r1157, %r1156, %r1155;
cvt.u32.u64 %r1158, %rd268;
shr.u32 %r1159, %r1154, 3;
and.b32 %r1160, %r1159, 15;
mov.u32 %r1161, -1;
shl.b32 %r1162, %r1161, %r1160;
not.b32 %r1163, %r1162;
and.b32 %r1164, %r1158, %r1163;
shr.u64 %rd474, %rd268, %r1160;
sub.s32 %r1917, %r1157, %r1160;
shr.u32 %r1165, %r1154, 7;
and.b32 %r1166, %r1165, 7;
shr.u32 %r1167, %r1154, 10;
and.b32 %r1168, %r1167, 7;
mov.u32 %r1169, 255;
shl.b32 %r1170, %r1169, %r1166;
not.b32 %r1171, %r1170;
and.b32 %r1172, %r1164, %r1171;
add.s32 %r1173, %r1168, %r1172;
add.s32 %r1174, %r1173, 1;
st.local.u16 [%rd25+2], %r1174;
shr.u32 %r1175, %r1154, 13;
shr.u32 %r1176, %r1164, %r1166;
add.s32 %r1177, %r1175, %r1176;
add.s32 %r1178, %r1177, 1;
st.local.u16 [%rd25+6], %r1178;
add.s32 %r1721, %r1721, 4;
setp.lt.u32 %p296, %r1721, %r820;
shl.b32 %r1179, %r327, 2;
and.b32 %r1180, %r1179, 896;
shl.b32 %r1181, %r327, 3;
and.b32 %r1182, %r1181, 128;
or.b32 %r1720, %r1182, %r1180;
@%p296 bra $L__BB27_57;
mul.wide.u32 %rd271, %r1721, 2;
add.s64 %rd272, %rd13, %rd271;
mov.u16 %rs825, 0;
st.local.v2.u16 [%rd272], {%rs825, %rs825};
setp.lt.u32 %p297, %r822, 3;
@%p297 bra $L__BB27_321;
ld.param.u64 %rd451, [ j2k_htj2k_decode_codeblocks_multi_cleanup_only_param_3];
ld.param.u64 %rd450, [ j2k_htj2k_decode_codeblocks_multi_cleanup_only_param_5];
cvta.to.global.u64 %rd40, %rd450;
cvta.to.global.u64 %rd41, %rd451;
mov.u32 %r1892, 2;
$L__BB27_214:
shr.u32 %r1187, %r1892, 1;
mul.lo.s32 %r439, %r1187, %r841;
sub.s32 %r440, %r439, %r841;
mov.u32 %r1901, 0;
mov.u32 %r1902, %r1901;
mov.u32 %r1903, %r439;
$L__BB27_215:
sub.s32 %r1188, %r1903, %r439;
add.s32 %r452, %r1188, %r440;
mul.wide.u32 %rd273, %r452, 2;
add.s64 %rd46, %rd13, %rd273;
ld.local.u16 %r1189, [%rd46];
shl.b32 %r1190, %r1189, 2;
and.b32 %r1191, %r1190, 640;
or.b32 %r1192, %r1902, %r1191;
add.s32 %r1193, %r452, 2;
mul.wide.u32 %rd274, %r1193, 2;
add.s64 %rd47, %rd13, %rd274;
ld.local.u16 %r1194, [%rd47];
shl.b32 %r1195, %r1194, 4;
and.b32 %r1196, %r1195, 512;
or.b32 %r453, %r1192, %r1196;
setp.gt.u32 %p298, %r1917, 31;
@%p298 bra $L__BB27_219;
$L__BB27_216:
setp.eq.s32 %p299, %r1916, 0;
mov.u16 %rs1179, 0;
@%p299 bra $L__BB27_218;
cvt.s64.s32 %rd275, %r1915;
add.s64 %rd276, %rd275, %rd3;
add.s64 %rd277, %rd1, %rd276;
ld.global.u8 %rs1179, [%rd277];
$L__BB27_218:
setp.ne.s32 %p301, %r1916, 0;
selp.b32 %r1197, -1, 0, %p301;
add.s32 %r1915, %r1915, %r1197;
add.s32 %r1198, %r1916, -1;
selp.b32 %r1916, 0, %r1198, %p299;
and.b16 %rs827, %rs1179, 255;
and.b16 %rs828, %rs1179, 127;
setp.eq.s16 %p302, %rs828, 127;
and.b16 %rs829, %rs1180, 255;
setp.ne.s16 %p303, %rs829, 0;
and.pred %p304, %p303, %p302;
selp.b32 %r1199, 7, 8, %p304;
cvt.u64.u16 %rd278, %rs1179;
and.b64 %rd279, %rd278, 255;
shl.b64 %rd280, %rd279, %r1917;
or.b64 %rd474, %rd280, %rd474;
add.s32 %r1917, %r1199, %r1917;
setp.gt.u16 %p305, %rs827, 143;
selp.u16 %rs1180, 1, 0, %p305;
setp.lt.u32 %p306, %r1917, 33;
@%p306 bra $L__BB27_216;
$L__BB27_219:
cvt.u32.u64 %r1200, %rd474;
and.b32 %r1201, %r1200, 127;
add.s32 %r1202, %r1201, %r453;
mul.wide.u32 %rd281, %r1202, 2;
add.s64 %rd282, %rd41, %rd281;
ld.global.u16 %r1969, [%rd282];
setp.ne.s32 %p307, %r453, 0;
@%p307 bra $L__BB27_269;
add.s32 %r464, %r1834, -2;
setp.eq.s32 %p308, %r464, -1;
selp.b32 %r1969, %r1969, 0, %p308;
setp.gt.s32 %p309, %r1834, 1;
mov.u32 %r1834, %r464;
@%p309 bra $L__BB27_269;
setp.ne.s32 %p310, %r1838, 0;
@%p310 bra $L__BB27_268;
mov.u32 %r1838, 0;
$L__BB27_223:
setp.gt.u32 %p311, %r1838, 7;
@%p311 bra $L__BB27_268;
cvt.u64.u32 %rd52, %r1837;
mul.wide.u32 %rd283, %r1837, 4;
add.s64 %rd285, %rd133, %rd283;
ld.global.nc.u32 %r470, [%rd285];
and.b16 %rs830, %rs1048, 255;
setp.ne.s16 %p312, %rs830, 0;
@%p312 bra $L__BB27_228;
setp.eq.s32 %p313, %r1747, 0;
mov.u16 %rs1184, 255;
@%p313 bra $L__BB27_227;
cvt.u64.u32 %rd286, %r1746;
add.s64 %rd287, %rd286, %rd3;
add.s64 %rd288, %rd1, %rd287;
ld.global.u8 %rs1184, [%rd288];
$L__BB27_227:
setp.ne.s32 %p315, %r1747, 0;
selp.u32 %r1204, 1, 0, %p315;
add.s32 %r1746, %r1746, %r1204;
add.s32 %r1205, %r1747, -1;
selp.b32 %r1747, 0, %r1205, %p313;
setp.eq.s32 %p316, %r1747, 0;
or.b16 %rs832, %rs1184, 15;
selp.b16 %rs1014, %rs832, %rs1184, %p316;
and.b16 %rs833, %rs1014, 255;
mov.u16 %rs834, 8;
sub.s16 %rs1048, %rs834, %rs1013;
setp.eq.s16 %p317, %rs833, 255;
selp.u16 %rs1013, 1, 0, %p317;
$L__BB27_228:
add.s16 %rs1048, %rs1048, -1;
cvt.u32.u16 %r1206, %rs1048;
and.b32 %r1207, %r1206, 255;
mov.u32 %r1208, 1;
shl.b32 %r1209, %r1208, %r1207;
cvt.u32.u16 %r1210, %rs1014;
and.b32 %r1211, %r1209, %r1210;
and.b32 %r475, %r1211, 255;
setp.eq.s32 %p318, %r475, 0;
@%p318 bra $L__BB27_230;
add.s32 %r1212, %r1837, 1;
min.u32 %r1958, %r1212, 12;
mov.u32 %r1213, -1;
shl.b32 %r1214, %r1213, %r470;
shl.b32 %r1215, %r1214, 1;
xor.b32 %r1959, %r1215, -2;
bra.uni $L__BB27_267;
$L__BB27_230:
add.s64 %rd289, %rd52, -3;
setp.gt.u64 %p319, %rd289, 9;
mov.u32 %r1955, 0;
@%p319 bra $L__BB27_266;
max.u32 %r478, %r470, 1;
add.s32 %r1219, %r478, -1;
and.b32 %r479, %r478, 3;
setp.lt.u32 %p320, %r1219, 3;
mov.u32 %r1955, 0;
@%p320 bra $L__BB27_250;
sub.s32 %r1927, %r478, %r479;
mov.u32 %r1955, 0;
$L__BB27_233:
and.b16 %rs836, %rs1048, 255;
setp.ne.s16 %p321, %rs836, 0;
@%p321 bra $L__BB27_237;
setp.eq.s32 %p322, %r1747, 0;
mov.u16 %rs1191, 255;
@%p322 bra $L__BB27_236;
cvt.u64.u32 %rd290, %r1746;
add.s64 %rd291, %rd290, %rd3;
add.s64 %rd292, %rd1, %rd291;
ld.global.u8 %rs1191, [%rd292];
$L__BB27_236:
setp.ne.s32 %p324, %r1747, 0;
selp.u32 %r1221, 1, 0, %p324;
add.s32 %r1746, %r1746, %r1221;
add.s32 %r1222, %r1747, -1;
selp.b32 %r1747, 0, %r1222, %p322;
setp.eq.s32 %p325, %r1747, 0;
or.b16 %rs838, %rs1191, 15;
selp.b16 %rs1014, %rs838, %rs1191, %p325;
and.b16 %rs839, %rs1014, 255;
mov.u16 %rs840, 8;
sub.s16 %rs1048, %rs840, %rs1013;
setp.eq.s16 %p326, %rs839, 255;
selp.u16 %rs1013, 1, 0, %p326;
$L__BB27_237:
add.s16 %rs1198, %rs1048, -1;
and.b16 %rs841, %rs1198, 255;
cvt.u32.u16 %r1223, %rs1198;
and.b32 %r1224, %r1223, 255;
cvt.u32.u16 %r1225, %rs1014;
and.b32 %r1933, %r1225, 255;
shr.u32 %r1226, %r1933, %r1224;
and.b32 %r1227, %r1226, 1;
bfi.b32 %r490, %r1955, %r1227, 1, 31;
setp.ne.s16 %p327, %rs841, 0;
@%p327 bra $L__BB27_241;
setp.eq.s32 %p328, %r1747, 0;
mov.u16 %rs1195, 255;
@%p328 bra $L__BB27_240;
cvt.u64.u32 %rd293, %r1746;
add.s64 %rd294, %rd293, %rd3;
add.s64 %rd295, %rd1, %rd294;
ld.global.u8 %rs1195, [%rd295];
$L__BB27_240:
setp.ne.s32 %p330, %r1747, 0;
selp.u32 %r1228, 1, 0, %p330;
add.s32 %r1746, %r1746, %r1228;
add.s32 %r1229, %r1747, -1;
selp.b32 %r1747, 0, %r1229, %p328;
setp.eq.s32 %p331, %r1747, 0;
or.b16 %rs843, %rs1195, 15;
selp.b16 %rs1014, %rs843, %rs1195, %p331;
and.b16 %rs844, %rs1014, 255;
mov.u16 %rs845, 8;
sub.s16 %rs1198, %rs845, %rs1013;
setp.eq.s16 %p332, %rs844, 255;
selp.u16 %rs1013, 1, 0, %p332;
cvt.u32.u16 %r1230, %rs1014;
and.b32 %r1933, %r1230, 255;
$L__BB27_241:
add.s16 %rs1202, %rs1198, -1;
and.b16 %rs846, %rs1202, 255;
cvt.u32.u16 %r1231, %rs1202;
and.b32 %r1232, %r1231, 255;
shr.u32 %r1233, %r1933, %r1232;
and.b32 %r1234, %r1233, 1;
bfi.b32 %r497, %r490, %r1234, 1, 31;
setp.ne.s16 %p333, %rs846, 0;
@%p333 bra $L__BB27_245;
setp.eq.s32 %p334, %r1747, 0;
mov.u16 %rs1199, 255;
@%p334 bra $L__BB27_244;
cvt.u64.u32 %rd296, %r1746;
add.s64 %rd297, %rd296, %rd3;
add.s64 %rd298, %rd1, %rd297;
ld.global.u8 %rs1199, [%rd298];
$L__BB27_244:
setp.ne.s32 %p336, %r1747, 0;
selp.u32 %r1235, 1, 0, %p336;
add.s32 %r1746, %r1746, %r1235;
add.s32 %r1236, %r1747, -1;
selp.b32 %r1747, 0, %r1236, %p334;
setp.eq.s32 %p337, %r1747, 0;
or.b16 %rs848, %rs1199, 15;
selp.b16 %rs1014, %rs848, %rs1199, %p337;
and.b16 %rs849, %rs1014, 255;
mov.u16 %rs850, 8;
sub.s16 %rs1202, %rs850, %rs1013;
setp.eq.s16 %p338, %rs849, 255;
selp.u16 %rs1013, 1, 0, %p338;
cvt.u32.u16 %r1237, %rs1014;
and.b32 %r1933, %r1237, 255;
$L__BB27_245:
add.s16 %rs1206, %rs1202, -1;
and.b16 %rs851, %rs1206, 255;
cvt.u32.u16 %r1238, %rs1206;
and.b32 %r1239, %r1238, 255;
shr.u32 %r1240, %r1933, %r1239;
and.b32 %r1241, %r1240, 1;
bfi.b32 %r504, %r497, %r1241, 1, 31;
setp.ne.s16 %p339, %rs851, 0;
@%p339 bra $L__BB27_249;
setp.eq.s32 %p340, %r1747, 0;
mov.u16 %rs1203, 255;
@%p340 bra $L__BB27_248;
cvt.u64.u32 %rd299, %r1746;
add.s64 %rd300, %rd299, %rd3;
add.s64 %rd301, %rd1, %rd300;
ld.global.u8 %rs1203, [%rd301];
$L__BB27_248:
setp.ne.s32 %p342, %r1747, 0;
selp.u32 %r1242, 1, 0, %p342;
add.s32 %r1746, %r1746, %r1242;
add.s32 %r1243, %r1747, -1;
selp.b32 %r1747, 0, %r1243, %p340;
setp.eq.s32 %p343, %r1747, 0;
or.b16 %rs853, %rs1203, 15;
selp.b16 %rs1014, %rs853, %rs1203, %p343;
and.b16 %rs854, %rs1014, 255;
mov.u16 %rs855, 8;
sub.s16 %rs1206, %rs855, %rs1013;
setp.eq.s16 %p344, %rs854, 255;
selp.u16 %rs1013, 1, 0, %p344;
cvt.u32.u16 %r1244, %rs1014;
and.b32 %r1933, %r1244, 255;
$L__BB27_249:
add.s16 %rs1048, %rs1206, -1;
cvt.u32.u16 %r1245, %rs1048;
and.b32 %r1246, %r1245, 255;
shr.u32 %r1247, %r1933, %r1246;
and.b32 %r1248, %r1247, 1;
bfi.b32 %r1955, %r504, %r1248, 1, 31;
add.s32 %r1927, %r1927, -4;
setp.ne.s32 %p345, %r1927, 0;
@%p345 bra $L__BB27_233;
$L__BB27_250:
setp.eq.s32 %p346, %r479, 0;
@%p346 bra $L__BB27_266;
and.b16 %rs856, %rs1048, 255;
setp.ne.s16 %p347, %rs856, 0;
@%p347 bra $L__BB27_255;
setp.eq.s32 %p348, %r1747, 0;
mov.u16 %rs1213, 255;
@%p348 bra $L__BB27_254;
cvt.u64.u32 %rd302, %r1746;
add.s64 %rd303, %rd302, %rd3;
add.s64 %rd304, %rd1, %rd303;
ld.global.u8 %rs1213, [%rd304];
$L__BB27_254:
setp.ne.s32 %p350, %r1747, 0;
selp.u32 %r1249, 1, 0, %p350;
add.s32 %r1746, %r1746, %r1249;
add.s32 %r1250, %r1747, -1;
selp.b32 %r1747, 0, %r1250, %p348;
setp.eq.s32 %p351, %r1747, 0;
or.b16 %rs858, %rs1213, 15;
selp.b16 %rs1014, %rs858, %rs1213, %p351;
and.b16 %rs859, %rs1014, 255;
mov.u16 %rs860, 8;
sub.s16 %rs1048, %rs860, %rs1013;
setp.eq.s16 %p352, %rs859, 255;
selp.u16 %rs1013, 1, 0, %p352;
$L__BB27_255:
add.s16 %rs1048, %rs1048, -1;
cvt.u32.u16 %r1251, %rs1048;
and.b32 %r1252, %r1251, 255;
cvt.u32.u16 %r1253, %rs1014;
and.b32 %r1950, %r1253, 255;
shr.u32 %r1254, %r1950, %r1252;
and.b32 %r1255, %r1254, 1;
bfi.b32 %r1955, %r1955, %r1255, 1, 31;
setp.eq.s32 %p353, %r479, 1;
@%p353 bra $L__BB27_266;
and.b16 %rs861, %rs1048, 255;
setp.ne.s16 %p354, %rs861, 0;
@%p354 bra $L__BB27_260;
setp.eq.s32 %p355, %r1747, 0;
mov.u16 %rs1217, 255;
@%p355 bra $L__BB27_259;
cvt.u64.u32 %rd305, %r1746;
add.s64 %rd306, %rd305, %rd3;
add.s64 %rd307, %rd1, %rd306;
ld.global.u8 %rs1217, [%rd307];
$L__BB27_259:
setp.ne.s32 %p357, %r1747, 0;
selp.u32 %r1256, 1, 0, %p357;
add.s32 %r1746, %r1746, %r1256;
add.s32 %r1257, %r1747, -1;
selp.b32 %r1747, 0, %r1257, %p355;
setp.eq.s32 %p358, %r1747, 0;
or.b16 %rs863, %rs1217, 15;
selp.b16 %rs1014, %rs863, %rs1217, %p358;
and.b16 %rs864, %rs1014, 255;
mov.u16 %rs865, 8;
sub.s16 %rs1048, %rs865, %rs1013;
setp.eq.s16 %p359, %rs864, 255;
selp.u16 %rs1013, 1, 0, %p359;
cvt.u32.u16 %r1258, %rs1014;
and.b32 %r1950, %r1258, 255;
$L__BB27_260:
add.s16 %rs1048, %rs1048, -1;
cvt.u32.u16 %r1259, %rs1048;
and.b32 %r1260, %r1259, 255;
shr.u32 %r1261, %r1950, %r1260;
and.b32 %r1262, %r1261, 1;
bfi.b32 %r1955, %r1955, %r1262, 1, 31;
setp.eq.s32 %p360, %r479, 2;
@%p360 bra $L__BB27_266;
and.b16 %rs866, %rs1048, 255;
setp.ne.s16 %p361, %rs866, 0;
@%p361 bra $L__BB27_265;
setp.eq.s32 %p362, %r1747, 0;
mov.u16 %rs1221, 255;
@%p362 bra $L__BB27_264;
cvt.u64.u32 %rd308, %r1746;
add.s64 %rd309, %rd308, %rd3;
add.s64 %rd310, %rd1, %rd309;
ld.global.u8 %rs1221, [%rd310];
$L__BB27_264:
setp.ne.s32 %p364, %r1747, 0;
selp.u32 %r1263, 1, 0, %p364;
add.s32 %r1746, %r1746, %r1263;
add.s32 %r1264, %r1747, -1;
selp.b32 %r1747, 0, %r1264, %p362;
setp.eq.s32 %p365, %r1747, 0;
or.b16 %rs868, %rs1221, 15;
selp.b16 %rs1014, %rs868, %rs1221, %p365;
and.b16 %rs869, %rs1014, 255;
mov.u16 %rs870, 8;
sub.s16 %rs1048, %rs870, %rs1013;
setp.eq.s16 %p366, %rs869, 255;
selp.u16 %rs1013, 1, 0, %p366;
cvt.u32.u16 %r1265, %rs1014;
and.b32 %r1950, %r1265, 255;
$L__BB27_265:
add.s16 %rs1048, %rs1048, -1;
cvt.u32.u16 %r1266, %rs1048;
and.b32 %r1267, %r1266, 255;
shr.u32 %r1268, %r1950, %r1267;
and.b32 %r1269, %r1268, 1;
bfi.b32 %r1955, %r1955, %r1269, 1, 31;
$L__BB27_266:
shl.b32 %r1270, %r1955, 1;
or.b32 %r1959, %r1270, 1;
add.s32 %r1271, %r1837, -1;
setp.eq.s32 %p367, %r1837, 0;
selp.b32 %r1958, 0, %r1271, %p367;
$L__BB27_267:
mul.lo.s32 %r1272, %r1838, 7;
cvt.u64.u32 %rd311, %r1959;
shl.b64 %rd312, %rd311, %r1272;
or.b64 %rd465, %rd312, %rd465;
setp.ne.s32 %p368, %r1837, 12;
setp.ne.s32 %p369, %r475, 0;
or.pred %p370, %p368, %p369;
add.s32 %r1838, %r1838, 1;
setp.lt.u32 %p371, %r1838, 8;
or.pred %p372, %p371, %p370;
mov.u32 %r1837, %r1958;
@%p372 bra $L__BB27_223;
$L__BB27_268:
cvt.u32.u64 %r1273, %rd465;
and.b32 %r1834, %r1273, 127;
shr.u64 %rd465, %rd465, 7;
add.s32 %r1838, %r1838, -1;
$L__BB27_269:
mul.wide.u32 %rd313, %r1903, 2;
add.s64 %rd314, %rd13, %rd313;
st.local.u16 [%rd314], %r1969;
shl.b32 %r1274, %r1969, 2;
shl.b32 %r1275, %r1969, 1;
or.b32 %r1276, %r1274, %r1275;
and.b32 %r1277, %r1276, 256;
ld.local.u16 %r1278, [%rd46];
and.b32 %r1279, %r1278, 128;
or.b32 %r1280, %r1277, %r1279;
ld.local.u16 %r1281, [%rd47];
shl.b32 %r1282, %r1281, 2;
and.b32 %r1283, %r1282, 640;
or.b32 %r1284, %r1280, %r1283;
add.s32 %r1285, %r452, 4;
mul.wide.u32 %rd315, %r1285, 2;
add.s64 %rd316, %rd13, %rd315;
ld.local.u16 %r1286, [%rd316];
shl.b32 %r1287, %r1286, 4;
and.b32 %r1288, %r1287, 512;
or.b32 %r1289, %r1284, %r1288;
and.b32 %r1290, %r1969, 7;
shr.u64 %rd57, %rd474, %r1290;
sub.s32 %r561, %r1917, %r1290;
cvt.u32.u64 %r1291, %rd57;
and.b32 %r1292, %r1291, 127;
or.b32 %r1293, %r1289, %r1292;
mul.wide.u32 %rd317, %r1293, 2;
add.s64 %rd318, %rd41, %rd317;
ld.global.u16 %r2021, [%rd318];
setp.ne.s32 %p373, %r1289, 0;
add.s32 %r563, %r1901, 2;
setp.ge.u32 %p374, %r563, %r820;
or.pred %p375, %p374, %p373;
@%p375 bra $L__BB27_319;
add.s32 %r564, %r1834, -2;
setp.eq.s32 %p376, %r564, -1;
selp.b32 %r2021, %r2021, 0, %p376;
setp.gt.s32 %p377, %r1834, 1;
mov.u32 %r1834, %r564;
@%p377 bra $L__BB27_319;
setp.ne.s32 %p378, %r1838, 0;
@%p378 bra $L__BB27_318;
mov.u32 %r1838, 0;
$L__BB27_273:
setp.gt.u32 %p379, %r1838, 7;
@%p379 bra $L__BB27_318;
cvt.u64.u32 %rd59, %r1837;
mul.wide.u32 %rd319, %r1837, 4;
add.s64 %rd321, %rd133, %rd319;
ld.global.nc.u32 %r570, [%rd321];
and.b16 %rs871, %rs1048, 255;
setp.ne.s16 %p380, %rs871, 0;
@%p380 bra $L__BB27_278;
setp.eq.s32 %p381, %r1747, 0;
mov.u16 %rs1240, 255;
@%p381 bra $L__BB27_277;
cvt.u64.u32 %rd322, %r1746;
add.s64 %rd323, %rd322, %rd3;
add.s64 %rd324, %rd1, %rd323;
ld.global.u8 %rs1240, [%rd324];
$L__BB27_277:
setp.ne.s32 %p383, %r1747, 0;
selp.u32 %r1295, 1, 0, %p383;
add.s32 %r1746, %r1746, %r1295;
add.s32 %r1296, %r1747, -1;
selp.b32 %r1747, 0, %r1296, %p381;
setp.eq.s32 %p384, %r1747, 0;
or.b16 %rs873, %rs1240, 15;
selp.b16 %rs1014, %rs873, %rs1240, %p384;
and.b16 %rs874, %rs1014, 255;
mov.u16 %rs875, 8;
sub.s16 %rs1048, %rs875, %rs1013;
setp.eq.s16 %p385, %rs874, 255;
selp.u16 %rs1013, 1, 0, %p385;
$L__BB27_278:
add.s16 %rs1048, %rs1048, -1;
cvt.u32.u16 %r1297, %rs1048;
and.b32 %r1298, %r1297, 255;
mov.u32 %r1299, 1;
shl.b32 %r1300, %r1299, %r1298;
cvt.u32.u16 %r1301, %rs1014;
and.b32 %r1302, %r1300, %r1301;
and.b32 %r575, %r1302, 255;
setp.eq.s32 %p386, %r575, 0;
@%p386 bra $L__BB27_280;
add.s32 %r1303, %r1837, 1;
min.u32 %r2010, %r1303, 12;
mov.u32 %r1304, -1;
shl.b32 %r1305, %r1304, %r570;
shl.b32 %r1306, %r1305, 1;
xor.b32 %r2011, %r1306, -2;
bra.uni $L__BB27_317;
$L__BB27_280:
add.s64 %rd325, %rd59, -3;
setp.gt.u64 %p387, %rd325, 9;
mov.u32 %r2007, 0;
@%p387 bra $L__BB27_316;
max.u32 %r578, %r570, 1;
add.s32 %r1310, %r578, -1;
and.b32 %r579, %r578, 3;
setp.lt.u32 %p388, %r1310, 3;
mov.u32 %r2007, 0;
@%p388 bra $L__BB27_300;
sub.s32 %r1979, %r578, %r579;
mov.u32 %r2007, 0;
$L__BB27_283:
and.b16 %rs877, %rs1048, 255;
setp.ne.s16 %p389, %rs877, 0;
@%p389 bra $L__BB27_287;
setp.eq.s32 %p390, %r1747, 0;
mov.u16 %rs1247, 255;
@%p390 bra $L__BB27_286;
cvt.u64.u32 %rd326, %r1746;
add.s64 %rd327, %rd326, %rd3;
add.s64 %rd328, %rd1, %rd327;
ld.global.u8 %rs1247, [%rd328];
$L__BB27_286:
setp.ne.s32 %p392, %r1747, 0;
selp.u32 %r1312, 1, 0, %p392;
add.s32 %r1746, %r1746, %r1312;
add.s32 %r1313, %r1747, -1;
selp.b32 %r1747, 0, %r1313, %p390;
setp.eq.s32 %p393, %r1747, 0;
or.b16 %rs879, %rs1247, 15;
selp.b16 %rs1014, %rs879, %rs1247, %p393;
and.b16 %rs880, %rs1014, 255;
mov.u16 %rs881, 8;
sub.s16 %rs1048, %rs881, %rs1013;
setp.eq.s16 %p394, %rs880, 255;
selp.u16 %rs1013, 1, 0, %p394;
$L__BB27_287:
add.s16 %rs1254, %rs1048, -1;
and.b16 %rs882, %rs1254, 255;
cvt.u32.u16 %r1314, %rs1254;
and.b32 %r1315, %r1314, 255;
cvt.u32.u16 %r1316, %rs1014;
and.b32 %r1985, %r1316, 255;
shr.u32 %r1317, %r1985, %r1315;
and.b32 %r1318, %r1317, 1;
bfi.b32 %r590, %r2007, %r1318, 1, 31;
setp.ne.s16 %p395, %rs882, 0;
@%p395 bra $L__BB27_291;
setp.eq.s32 %p396, %r1747, 0;
mov.u16 %rs1251, 255;
@%p396 bra $L__BB27_290;
cvt.u64.u32 %rd329, %r1746;
add.s64 %rd330, %rd329, %rd3;
add.s64 %rd331, %rd1, %rd330;
ld.global.u8 %rs1251, [%rd331];
$L__BB27_290:
setp.ne.s32 %p398, %r1747, 0;
selp.u32 %r1319, 1, 0, %p398;
add.s32 %r1746, %r1746, %r1319;
add.s32 %r1320, %r1747, -1;
selp.b32 %r1747, 0, %r1320, %p396;
setp.eq.s32 %p399, %r1747, 0;
or.b16 %rs884, %rs1251, 15;
selp.b16 %rs1014, %rs884, %rs1251, %p399;
and.b16 %rs885, %rs1014, 255;
mov.u16 %rs886, 8;
sub.s16 %rs1254, %rs886, %rs1013;
setp.eq.s16 %p400, %rs885, 255;
selp.u16 %rs1013, 1, 0, %p400;
cvt.u32.u16 %r1321, %rs1014;
and.b32 %r1985, %r1321, 255;
$L__BB27_291:
add.s16 %rs1258, %rs1254, -1;
and.b16 %rs887, %rs1258, 255;
cvt.u32.u16 %r1322, %rs1258;
and.b32 %r1323, %r1322, 255;
shr.u32 %r1324, %r1985, %r1323;
and.b32 %r1325, %r1324, 1;
bfi.b32 %r597, %r590, %r1325, 1, 31;
setp.ne.s16 %p401, %rs887, 0;
@%p401 bra $L__BB27_295;
setp.eq.s32 %p402, %r1747, 0;
mov.u16 %rs1255, 255;
@%p402 bra $L__BB27_294;
cvt.u64.u32 %rd332, %r1746;
add.s64 %rd333, %rd332, %rd3;
add.s64 %rd334, %rd1, %rd333;
ld.global.u8 %rs1255, [%rd334];
$L__BB27_294:
setp.ne.s32 %p404, %r1747, 0;
selp.u32 %r1326, 1, 0, %p404;
add.s32 %r1746, %r1746, %r1326;
add.s32 %r1327, %r1747, -1;
selp.b32 %r1747, 0, %r1327, %p402;
setp.eq.s32 %p405, %r1747, 0;
or.b16 %rs889, %rs1255, 15;
selp.b16 %rs1014, %rs889, %rs1255, %p405;
and.b16 %rs890, %rs1014, 255;
mov.u16 %rs891, 8;
sub.s16 %rs1258, %rs891, %rs1013;
setp.eq.s16 %p406, %rs890, 255;
selp.u16 %rs1013, 1, 0, %p406;
cvt.u32.u16 %r1328, %rs1014;
and.b32 %r1985, %r1328, 255;
$L__BB27_295:
add.s16 %rs1262, %rs1258, -1;
and.b16 %rs892, %rs1262, 255;
cvt.u32.u16 %r1329, %rs1262;
and.b32 %r1330, %r1329, 255;
shr.u32 %r1331, %r1985, %r1330;
and.b32 %r1332, %r1331, 1;
bfi.b32 %r604, %r597, %r1332, 1, 31;
setp.ne.s16 %p407, %rs892, 0;
@%p407 bra $L__BB27_299;
setp.eq.s32 %p408, %r1747, 0;
mov.u16 %rs1259, 255;
@%p408 bra $L__BB27_298;
cvt.u64.u32 %rd335, %r1746;
add.s64 %rd336, %rd335, %rd3;
add.s64 %rd337, %rd1, %rd336;
ld.global.u8 %rs1259, [%rd337];
$L__BB27_298:
setp.ne.s32 %p410, %r1747, 0;
selp.u32 %r1333, 1, 0, %p410;
add.s32 %r1746, %r1746, %r1333;
add.s32 %r1334, %r1747, -1;
selp.b32 %r1747, 0, %r1334, %p408;
setp.eq.s32 %p411, %r1747, 0;
or.b16 %rs894, %rs1259, 15;
selp.b16 %rs1014, %rs894, %rs1259, %p411;
and.b16 %rs895, %rs1014, 255;
mov.u16 %rs896, 8;
sub.s16 %rs1262, %rs896, %rs1013;
setp.eq.s16 %p412, %rs895, 255;
selp.u16 %rs1013, 1, 0, %p412;
cvt.u32.u16 %r1335, %rs1014;
and.b32 %r1985, %r1335, 255;
$L__BB27_299:
add.s16 %rs1048, %rs1262, -1;
cvt.u32.u16 %r1336, %rs1048;
and.b32 %r1337, %r1336, 255;
shr.u32 %r1338, %r1985, %r1337;
and.b32 %r1339, %r1338, 1;
bfi.b32 %r2007, %r604, %r1339, 1, 31;
add.s32 %r1979, %r1979, -4;
setp.ne.s32 %p413, %r1979, 0;
@%p413 bra $L__BB27_283;
$L__BB27_300:
setp.eq.s32 %p414, %r579, 0;
@%p414 bra $L__BB27_316;
and.b16 %rs897, %rs1048, 255;
setp.ne.s16 %p415, %rs897, 0;
@%p415 bra $L__BB27_305;
setp.eq.s32 %p416, %r1747, 0;
mov.u16 %rs1269, 255;
@%p416 bra $L__BB27_304;
cvt.u64.u32 %rd338, %r1746;
add.s64 %rd339, %rd338, %rd3;
add.s64 %rd340, %rd1, %rd339;
ld.global.u8 %rs1269, [%rd340];
$L__BB27_304:
setp.ne.s32 %p418, %r1747, 0;
selp.u32 %r1340, 1, 0, %p418;
add.s32 %r1746, %r1746, %r1340;
add.s32 %r1341, %r1747, -1;
selp.b32 %r1747, 0, %r1341, %p416;
setp.eq.s32 %p419, %r1747, 0;
or.b16 %rs899, %rs1269, 15;
selp.b16 %rs1014, %rs899, %rs1269, %p419;
and.b16 %rs900, %rs1014, 255;
mov.u16 %rs901, 8;
sub.s16 %rs1048, %rs901, %rs1013;
setp.eq.s16 %p420, %rs900, 255;
selp.u16 %rs1013, 1, 0, %p420;
$L__BB27_305:
add.s16 %rs1048, %rs1048, -1;
cvt.u32.u16 %r1342, %rs1048;
and.b32 %r1343, %r1342, 255;
cvt.u32.u16 %r1344, %rs1014;
and.b32 %r2002, %r1344, 255;
shr.u32 %r1345, %r2002, %r1343;
and.b32 %r1346, %r1345, 1;
bfi.b32 %r2007, %r2007, %r1346, 1, 31;
setp.eq.s32 %p421, %r579, 1;
@%p421 bra $L__BB27_316;
and.b16 %rs902, %rs1048, 255;
setp.ne.s16 %p422, %rs902, 0;
@%p422 bra $L__BB27_310;
setp.eq.s32 %p423, %r1747, 0;
mov.u16 %rs1273, 255;
@%p423 bra $L__BB27_309;
cvt.u64.u32 %rd341, %r1746;
add.s64 %rd342, %rd341, %rd3;
add.s64 %rd343, %rd1, %rd342;
ld.global.u8 %rs1273, [%rd343];
$L__BB27_309:
setp.ne.s32 %p425, %r1747, 0;
selp.u32 %r1347, 1, 0, %p425;
add.s32 %r1746, %r1746, %r1347;
add.s32 %r1348, %r1747, -1;
selp.b32 %r1747, 0, %r1348, %p423;
setp.eq.s32 %p426, %r1747, 0;
or.b16 %rs904, %rs1273, 15;
selp.b16 %rs1014, %rs904, %rs1273, %p426;
and.b16 %rs905, %rs1014, 255;
mov.u16 %rs906, 8;
sub.s16 %rs1048, %rs906, %rs1013;
setp.eq.s16 %p427, %rs905, 255;
selp.u16 %rs1013, 1, 0, %p427;
cvt.u32.u16 %r1349, %rs1014;
and.b32 %r2002, %r1349, 255;
$L__BB27_310:
add.s16 %rs1048, %rs1048, -1;
cvt.u32.u16 %r1350, %rs1048;
and.b32 %r1351, %r1350, 255;
shr.u32 %r1352, %r2002, %r1351;
and.b32 %r1353, %r1352, 1;
bfi.b32 %r2007, %r2007, %r1353, 1, 31;
setp.eq.s32 %p428, %r579, 2;
@%p428 bra $L__BB27_316;
and.b16 %rs907, %rs1048, 255;
setp.ne.s16 %p429, %rs907, 0;
@%p429 bra $L__BB27_315;
setp.eq.s32 %p430, %r1747, 0;
mov.u16 %rs1277, 255;
@%p430 bra $L__BB27_314;
cvt.u64.u32 %rd344, %r1746;
add.s64 %rd345, %rd344, %rd3;
add.s64 %rd346, %rd1, %rd345;
ld.global.u8 %rs1277, [%rd346];
$L__BB27_314:
setp.ne.s32 %p432, %r1747, 0;
selp.u32 %r1354, 1, 0, %p432;
add.s32 %r1746, %r1746, %r1354;
add.s32 %r1355, %r1747, -1;
selp.b32 %r1747, 0, %r1355, %p430;
setp.eq.s32 %p433, %r1747, 0;
or.b16 %rs909, %rs1277, 15;
selp.b16 %rs1014, %rs909, %rs1277, %p433;
and.b16 %rs910, %rs1014, 255;
mov.u16 %rs911, 8;
sub.s16 %rs1048, %rs911, %rs1013;
setp.eq.s16 %p434, %rs910, 255;
selp.u16 %rs1013, 1, 0, %p434;
cvt.u32.u16 %r1356, %rs1014;
and.b32 %r2002, %r1356, 255;
$L__BB27_315:
add.s16 %rs1048, %rs1048, -1;
cvt.u32.u16 %r1357, %rs1048;
and.b32 %r1358, %r1357, 255;
shr.u32 %r1359, %r2002, %r1358;
and.b32 %r1360, %r1359, 1;
bfi.b32 %r2007, %r2007, %r1360, 1, 31;
$L__BB27_316:
shl.b32 %r1361, %r2007, 1;
or.b32 %r2011, %r1361, 1;
add.s32 %r1362, %r1837, -1;
setp.eq.s32 %p435, %r1837, 0;
selp.b32 %r2010, 0, %r1362, %p435;
$L__BB27_317:
mul.lo.s32 %r1363, %r1838, 7;
cvt.u64.u32 %rd347, %r2011;
shl.b64 %rd348, %rd347, %r1363;
or.b64 %rd465, %rd348, %rd465;
setp.ne.s32 %p436, %r1837, 12;
setp.ne.s32 %p437, %r575, 0;
or.pred %p438, %p436, %p437;
add.s32 %r1838, %r1838, 1;
setp.lt.u32 %p439, %r1838, 8;
or.pred %p440, %p439, %p438;
mov.u32 %r1837, %r2010;
@%p440 bra $L__BB27_273;
$L__BB27_318:
cvt.u32.u64 %r1364, %rd465;
and.b32 %r1834, %r1364, 127;
shr.u64 %rd465, %rd465, 7;
add.s32 %r1838, %r1838, -1;
$L__BB27_319:
setp.lt.u32 %p441, %r563, %r820;
selp.b32 %r1365, %r2021, 0, %p441;
add.s32 %r1366, %r1903, 2;
mul.wide.u32 %rd349, %r1366, 2;
add.s64 %rd350, %rd13, %rd349;
st.local.u16 [%rd350], %r1365;
shl.b32 %r1367, %r1365, 2;
shl.b32 %r1368, %r1365, 1;
or.b32 %r1369, %r1367, %r1368;
and.b32 %r1370, %r1369, 256;
ld.local.u16 %r1371, [%rd47];
and.b32 %r1372, %r1371, 128;
or.b32 %r1902, %r1370, %r1372;
and.b32 %r1373, %r1365, 7;
shr.u64 %rd351, %rd57, %r1373;
sub.s32 %r1374, %r561, %r1373;
cvt.u32.u64 %r1375, %rd351;
shl.b32 %r1376, %r1969, 3;
and.b32 %r1377, %r1376, 64;
shl.b32 %r1378, %r1365, 4;
and.b32 %r1379, %r1378, 128;
or.b32 %r1380, %r1379, %r1377;
and.b32 %r1381, %r1375, 63;
or.b32 %r1382, %r1381, %r1380;
mul.wide.u32 %rd352, %r1382, 2;
add.s64 %rd353, %rd40, %rd352;
ld.global.u16 %r1383, [%rd353];
and.b32 %r1384, %r1383, 7;
shr.u64 %rd354, %rd351, %r1384;
sub.s32 %r1385, %r1374, %r1384;
cvt.u32.u64 %r1386, %rd354;
shr.u32 %r1387, %r1383, 3;
and.b32 %r1388, %r1387, 15;
mov.u32 %r1389, -1;
shl.b32 %r1390, %r1389, %r1388;
not.b32 %r1391, %r1390;
and.b32 %r1392, %r1386, %r1391;
shr.u64 %rd474, %rd354, %r1388;
sub.s32 %r1917, %r1385, %r1388;
shr.u32 %r1393, %r1383, 7;
and.b32 %r1394, %r1393, 7;
shr.u32 %r1395, %r1383, 10;
and.b32 %r1396, %r1395, 7;
mov.u32 %r1397, 255;
shl.b32 %r1398, %r1397, %r1394;
not.b32 %r1399, %r1398;
and.b32 %r1400, %r1392, %r1399;
add.s32 %r1401, %r1400, %r1396;
add.s32 %r1402, %r1903, 1;
mul.wide.u32 %rd355, %r1402, 2;
add.s64 %rd356, %rd13, %rd355;
st.local.u16 [%rd356], %r1401;
shr.u32 %r1403, %r1383, 13;
shr.u32 %r1404, %r1392, %r1394;
add.s32 %r1405, %r1404, %r1403;
add.s32 %r1406, %r1903, 3;
mul.wide.u32 %rd357, %r1406, 2;
add.s64 %rd358, %rd13, %rd357;
st.local.u16 [%rd358], %r1405;
add.s32 %r1903, %r1903, 4;
add.s32 %r1901, %r1901, 4;
setp.lt.u32 %p442, %r1901, %r820;
@%p442 bra $L__BB27_215;
mul.wide.u32 %rd359, %r1903, 2;
add.s64 %rd360, %rd13, %rd359;
mov.u16 %rs912, 0;
st.local.v2.u16 [%rd360], {%rs912, %rs912};
add.s32 %r1892, %r1892, 2;
setp.lt.u32 %p443, %r1892, %r822;
@%p443 bra $L__BB27_214;
$L__BB27_321:
add.s32 %r1407, %r820, 1;
shr.u32 %r1408, %r1407, 1;
add.s32 %r1409, %r1408, 2;
setp.gt.u32 %p444, %r1409, 130;
@%p444 bra $L__BB27_392;
bra.uni $L__BB27_322;
$L__BB27_392:
mov.u32 %r1642, 2;
st.global.u32 [%rd4], %r1642;
mov.u32 %r1643, 12;
st.global.u32 [%rd4+4], %r1643;
mov.u32 %r1644, 0;
st.global.u32 [%rd4+8], %r1644;
st.global.u32 [%rd4+12], %r1644;
bra.uni $L__BB27_399;
$L__BB27_322:
add.s32 %r666, %r827, 2;
mov.u32 %r1415, 29;
sub.s32 %r667, %r1415, %r827;
mov.u16 %rs1320, 0;
mov.u32 %r2022, 0;
mov.u64 %rd502, 0;
mov.u32 %r2023, %r2022;
mov.u32 %r2024, %r2022;
mov.u32 %r2025, %r13;
mov.u32 %r2090, %r2022;
mov.u32 %r2089, %r2022;
$L__BB27_323:
mov.u32 %r670, %r2024;
mul.wide.u32 %rd362, %r2023, 2;
add.s64 %rd363, %rd13, %rd362;
ld.local.u16 %r674, [%rd363];
ld.local.u16 %r675, [%rd363+2];
setp.lt.u32 %p445, %r666, %r675;
@%p445 bra $L__BB27_391;
and.b32 %r1417, %r674, 16;
setp.eq.s32 %p446, %r1417, 0;
mov.u32 %r2043, 0;
mov.u32 %r2035, %r2043;
@%p446 bra $L__BB27_330;
setp.gt.u32 %p447, %r2089, 31;
@%p447 bra $L__BB27_329;
$L__BB27_326:
setp.ge.u32 %p448, %r2090, %r16;
mov.u16 %rs1295, 255;
@%p448 bra $L__BB27_328;
add.s32 %r678, %r2090, 1;
cvt.u64.u32 %rd364, %r2090;
add.s64 %rd365, %rd364, %rd3;
add.s64 %rd366, %rd1, %rd365;
ld.global.u8 %rs1295, [%rd366];
mov.u32 %r2090, %r678;
$L__BB27_328:
and.b16 %rs915, %rs1295, 255;
cvt.u64.u16 %rd367, %rs1295;
and.b64 %rd368, %rd367, 255;
shl.b64 %rd369, %rd368, %r2089;
or.b64 %rd502, %rd369, %rd502;
cvt.u32.u16 %r1418, %rs1320;
cvt.s32.s8 %r1419, %r1418;
mov.u32 %r1420, 8;
sub.s32 %r1421, %r1420, %r1419;
add.s32 %r2089, %r1421, %r2089;
setp.eq.s16 %p449, %rs915, 255;
selp.u16 %rs1320, 1, 0, %p449;
setp.lt.u32 %p450, %r2089, 33;
@%p450 bra $L__BB27_326;
$L__BB27_329:
shr.u32 %r1422, %r674, 12;
and.b32 %r1423, %r1422, 1;
sub.s32 %r1424, %r675, %r1423;
shr.u64 %rd69, %rd502, %r1424;
sub.s32 %r2089, %r2089, %r1424;
cvt.u32.u64 %r1425, %rd502;
shl.b32 %r1426, %r1425, 31;
setp.eq.s32 %p451, %r1424, 0;
mov.u32 %r1427, -1;
shl.b32 %r1428, %r1427, %r1424;
not.b32 %r1429, %r1428;
selp.b32 %r1430, 0, %r1429, %p451;
and.b32 %r1431, %r1430, %r1425;
shr.u32 %r1432, %r674, 8;
and.b32 %r1433, %r1432, 1;
shl.b32 %r1434, %r1433, %r1424;
or.b32 %r1435, %r1434, %r1431;
or.b32 %r1436, %r1435, 1;
add.s32 %r1437, %r1436, 2;
shl.b32 %r1438, %r1437, %r667;
or.b32 %r2035, %r1438, %r1426;
mov.u64 %rd502, %rd69;
$L__BB27_330:
mul.wide.u32 %rd370, %r2025, 4;
add.s64 %rd371, %rd2, %rd370;
st.global.u32 [%rd371], %r2035;
and.b32 %r1441, %r674, 32;
setp.eq.s32 %p452, %r1441, 0;
mov.u32 %r2044, %r2043;
@%p452 bra $L__BB27_336;
setp.gt.u32 %p453, %r2089, 31;
@%p453 bra $L__BB27_335;
$L__BB27_332:
setp.ge.u32 %p454, %r2090, %r16;
mov.u16 %rs1299, 255;
@%p454 bra $L__BB27_334;
add.s32 %r690, %r2090, 1;
cvt.u64.u32 %rd372, %r2090;
add.s64 %rd373, %rd372, %rd3;
add.s64 %rd374, %rd1, %rd373;
ld.global.u8 %rs1299, [%rd374];
mov.u32 %r2090, %r690;
$L__BB27_334:
and.b16 %rs917, %rs1299, 255;
cvt.u64.u16 %rd375, %rs1299;
and.b64 %rd376, %rd375, 255;
shl.b64 %rd377, %rd376, %r2089;
or.b64 %rd502, %rd377, %rd502;
cvt.u32.u16 %r1442, %rs1320;
cvt.s32.s8 %r1443, %r1442;
mov.u32 %r1444, 8;
sub.s32 %r1445, %r1444, %r1443;
add.s32 %r2089, %r1445, %r2089;
setp.eq.s16 %p455, %rs917, 255;
selp.u16 %rs1320, 1, 0, %p455;
setp.lt.u32 %p456, %r2089, 33;
@%p456 bra $L__BB27_332;
$L__BB27_335:
shr.u32 %r1446, %r674, 13;
and.b32 %r1447, %r1446, 1;
sub.s32 %r1448, %r675, %r1447;
shr.u64 %rd74, %rd502, %r1448;
sub.s32 %r2089, %r2089, %r1448;
cvt.u32.u64 %r1449, %rd502;
shl.b32 %r1450, %r1449, 31;
setp.eq.s32 %p457, %r1448, 0;
mov.u32 %r1451, -1;
shl.b32 %r1452, %r1451, %r1448;
not.b32 %r1453, %r1452;
selp.b32 %r1454, 0, %r1453, %p457;
and.b32 %r1455, %r1454, %r1449;
shr.u32 %r1456, %r674, 9;
and.b32 %r1457, %r1456, 1;
shl.b32 %r1458, %r1457, %r1448;
or.b32 %r1459, %r1458, %r1455;
or.b32 %r2044, %r1459, 1;
add.s32 %r1460, %r2044, 2;
shl.b32 %r1461, %r1460, %r667;
or.b32 %r2043, %r1461, %r1450;
mov.u64 %rd502, %rd74;
$L__BB27_336:
setp.lt.u32 %p458, %r822, 2;
@%p458 bra $L__BB27_338;
add.s32 %r1462, %r2025, %r12;
mul.wide.u32 %rd378, %r1462, 4;
add.s64 %rd379, %rd2, %rd378;
st.global.u32 [%rd379], %r2043;
$L__BB27_338:
or.b32 %r1463, %r2044, %r2022;
add.u64 %rd76, %SPL, 6192;
mul.wide.u32 %rd381, %r670, 4;
add.s64 %rd382, %rd76, %rd381;
st.local.u32 [%rd382], %r1463;
add.s32 %r702, %r2025, 1;
add.s32 %r1464, %r2023, 1;
setp.lt.u32 %p459, %r1464, %r820;
@%p459 bra $L__BB27_340;
bra.uni $L__BB27_339;
$L__BB27_340:
and.b32 %r1467, %r674, 64;
setp.eq.s32 %p460, %r1467, 0;
mov.u32 %r2060, 0;
mov.u32 %r2052, %r2060;
@%p460 bra $L__BB27_346;
setp.gt.u32 %p461, %r2089, 31;
@%p461 bra $L__BB27_345;
$L__BB27_342:
setp.ge.u32 %p462, %r2090, %r16;
mov.u16 %rs1303, 255;
@%p462 bra $L__BB27_344;
add.s32 %r705, %r2090, 1;
cvt.u64.u32 %rd383, %r2090;
add.s64 %rd384, %rd383, %rd3;
add.s64 %rd385, %rd1, %rd384;
ld.global.u8 %rs1303, [%rd385];
mov.u32 %r2090, %r705;
$L__BB27_344:
and.b16 %rs919, %rs1303, 255;
cvt.u64.u16 %rd386, %rs1303;
and.b64 %rd387, %rd386, 255;
shl.b64 %rd388, %rd387, %r2089;
or.b64 %rd502, %rd388, %rd502;
cvt.u32.u16 %r1468, %rs1320;
cvt.s32.s8 %r1469, %r1468;
mov.u32 %r1470, 8;
sub.s32 %r1471, %r1470, %r1469;
add.s32 %r2089, %r1471, %r2089;
setp.eq.s16 %p463, %rs919, 255;
selp.u16 %rs1320, 1, 0, %p463;
setp.lt.u32 %p464, %r2089, 33;
@%p464 bra $L__BB27_342;
$L__BB27_345:
shr.u32 %r1472, %r674, 14;
and.b32 %r1473, %r1472, 1;
sub.s32 %r1474, %r675, %r1473;
shr.u64 %rd80, %rd502, %r1474;
sub.s32 %r2089, %r2089, %r1474;
cvt.u32.u64 %r1475, %rd502;
shl.b32 %r1476, %r1475, 31;
setp.eq.s32 %p465, %r1474, 0;
mov.u32 %r1477, -1;
shl.b32 %r1478, %r1477, %r1474;
not.b32 %r1479, %r1478;
selp.b32 %r1480, 0, %r1479, %p465;
and.b32 %r1481, %r1480, %r1475;
shr.u32 %r1482, %r674, 10;
and.b32 %r1483, %r1482, 1;
shl.b32 %r1484, %r1483, %r1474;
or.b32 %r1485, %r1484, %r1481;
or.b32 %r1486, %r1485, 1;
add.s32 %r1487, %r1486, 2;
shl.b32 %r1488, %r1487, %r667;
or.b32 %r2052, %r1488, %r1476;
mov.u64 %rd502, %rd80;
$L__BB27_346:
mul.wide.u32 %rd389, %r702, 4;
add.s64 %rd390, %rd2, %rd389;
st.global.u32 [%rd390], %r2052;
and.b32 %r1491, %r674, 128;
setp.eq.s32 %p466, %r1491, 0;
mov.u32 %r2022, %r2060;
@%p466 bra $L__BB27_352;
setp.gt.u32 %p467, %r2089, 31;
@%p467 bra $L__BB27_351;
$L__BB27_348:
setp.ge.u32 %p468, %r2090, %r16;
mov.u16 %rs1307, 255;
@%p468 bra $L__BB27_350;
add.s32 %r717, %r2090, 1;
cvt.u64.u32 %rd391, %r2090;
add.s64 %rd392, %rd391, %rd3;
add.s64 %rd393, %rd1, %rd392;
ld.global.u8 %rs1307, [%rd393];
mov.u32 %r2090, %r717;
$L__BB27_350:
and.b16 %rs921, %rs1307, 255;
cvt.u64.u16 %rd394, %rs1307;
and.b64 %rd395, %rd394, 255;
shl.b64 %rd396, %rd395, %r2089;
or.b64 %rd502, %rd396, %rd502;
cvt.u32.u16 %r1492, %rs1320;
cvt.s32.s8 %r1493, %r1492;
mov.u32 %r1494, 8;
sub.s32 %r1495, %r1494, %r1493;
add.s32 %r2089, %r1495, %r2089;
setp.eq.s16 %p469, %rs921, 255;
selp.u16 %rs1320, 1, 0, %p469;
setp.lt.u32 %p470, %r2089, 33;
@%p470 bra $L__BB27_348;
$L__BB27_351:
shr.u32 %r1496, %r674, 15;
sub.s32 %r1497, %r675, %r1496;
shr.u64 %rd85, %rd502, %r1497;
sub.s32 %r2089, %r2089, %r1497;
cvt.u32.u64 %r1498, %rd502;
shl.b32 %r1499, %r1498, 31;
setp.eq.s32 %p471, %r1497, 0;
mov.u32 %r1500, -1;
shl.b32 %r1501, %r1500, %r1497;
not.b32 %r1502, %r1501;
selp.b32 %r1503, 0, %r1502, %p471;
and.b32 %r1504, %r1503, %r1498;
shr.u32 %r1505, %r674, 11;
and.b32 %r1506, %r1505, 1;
shl.b32 %r1507, %r1506, %r1497;
or.b32 %r1508, %r1507, %r1504;
or.b32 %r2022, %r1508, 1;
add.s32 %r1509, %r2022, 2;
shl.b32 %r1510, %r1509, %r667;
or.b32 %r2060, %r1510, %r1499;
mov.u64 %rd502, %rd85;
$L__BB27_352:
@%p458 bra $L__BB27_354;
add.s32 %r1511, %r702, %r12;
mul.wide.u32 %rd397, %r1511, 4;
add.s64 %rd398, %rd2, %rd397;
st.global.u32 [%rd398], %r2060;
$L__BB27_354:
add.s32 %r2025, %r2025, 2;
add.s32 %r2024, %r670, 1;
add.s32 %r2023, %r2023, 2;
setp.lt.u32 %p473, %r2023, %r820;
@%p473 bra $L__BB27_323;
bra.uni $L__BB27_355;
$L__BB27_391:
mov.u32 %r1635, 1;
st.global.u32 [%rd4], %r1635;
mov.u32 %r1636, 13;
st.global.u32 [%rd4+4], %r1636;
mov.u32 %r1637, 0;
st.global.u32 [%rd4+8], %r1637;
st.global.u32 [%rd4+12], %r1637;
bra.uni $L__BB27_399;
$L__BB27_339:
mov.u32 %r2022, 0;
$L__BB27_355:
add.s32 %r1512, %r670, 1;
mul.wide.u32 %rd401, %r1512, 4;
add.s64 %rd402, %rd76, %rd401;
st.local.u32 [%rd402], %r2022;
@%p297 bra $L__BB27_399;
mov.u32 %r2065, 2;
$L__BB27_357:
shr.u32 %r1518, %r2065, 1;
mul.lo.s32 %r2069, %r1518, %r841;
mad.lo.s32 %r2071, %r2065, %r12, %r13;
add.s32 %r741, %r2065, 1;
ld.local.u32 %r2068, [%rd76];
mov.u32 %r2070, 0;
mov.u32 %r2072, %r2070;
mov.u32 %r2073, %r2070;
$L__BB27_358:
mul.wide.u32 %rd403, %r2069, 2;
add.s64 %rd404, %rd13, %rd403;
ld.local.v2.u16 {%rs922, %rs923}, [%rd404];
cvt.u32.u16 %r751, %rs922;
cvt.u32.u16 %r1519, %rs923;
and.b32 %r1520, %r751, 240;
add.s32 %r1521, %r1520, 240;
and.b32 %r1522, %r1521, %r1520;
add.s32 %r752, %r2070, 1;
mul.wide.u32 %rd405, %r752, 4;
add.s64 %rd90, %rd76, %rd405;
ld.local.u32 %r753, [%rd90];
or.b32 %r1523, %r2068, %r753;
or.b32 %r1524, %r1523, 2;
clz.b32 %r1525, %r1524;
xor.b32 %r1526, %r1525, 31;
setp.eq.s32 %p475, %r1522, 0;
selp.b32 %r1527, 1, %r1526, %p475;
add.s32 %r754, %r1527, %r1519;
setp.gt.u32 %p476, %r754, %r666;
@%p476 bra $L__BB27_390;
and.b32 %r1529, %r751, 16;
setp.eq.s32 %p477, %r1529, 0;
mov.u32 %r2091, 0;
mov.u32 %r2083, %r2091;
@%p477 bra $L__BB27_365;
setp.gt.u32 %p478, %r2089, 31;
@%p478 bra $L__BB27_364;
$L__BB27_361:
setp.ge.u32 %p479, %r2090, %r16;
mov.u16 %rs1314, 255;
@%p479 bra $L__BB27_363;
add.s32 %r757, %r2090, 1;
cvt.u64.u32 %rd406, %r2090;
add.s64 %rd407, %rd406, %rd3;
add.s64 %rd408, %rd1, %rd407;
ld.global.u8 %rs1314, [%rd408];
mov.u32 %r2090, %r757;
$L__BB27_363:
and.b16 %rs927, %rs1314, 255;
cvt.u64.u16 %rd409, %rs1314;
and.b64 %rd410, %rd409, 255;
shl.b64 %rd411, %rd410, %r2089;
or.b64 %rd502, %rd411, %rd502;
cvt.u32.u16 %r1530, %rs1320;
cvt.s32.s8 %r1531, %r1530;
mov.u32 %r1532, 8;
sub.s32 %r1533, %r1532, %r1531;
add.s32 %r2089, %r1533, %r2089;
setp.eq.s16 %p480, %rs927, 255;
selp.u16 %rs1320, 1, 0, %p480;
setp.lt.u32 %p481, %r2089, 33;
@%p481 bra $L__BB27_361;
$L__BB27_364:
shr.u32 %r1534, %r751, 12;
and.b32 %r1535, %r1534, 1;
sub.s32 %r1536, %r754, %r1535;
shr.u64 %rd94, %rd502, %r1536;
sub.s32 %r2089, %r2089, %r1536;
cvt.u32.u64 %r1537, %rd502;
shl.b32 %r1538, %r1537, 31;
setp.eq.s32 %p482, %r1536, 0;
mov.u32 %r1539, -1;
shl.b32 %r1540, %r1539, %r1536;
not.b32 %r1541, %r1540;
selp.b32 %r1542, 0, %r1541, %p482;
and.b32 %r1543, %r1542, %r1537;
shr.u32 %r1544, %r751, 8;
and.b32 %r1545, %r1544, 1;
shl.b32 %r1546, %r1545, %r1536;
or.b32 %r1547, %r1546, %r1543;
or.b32 %r1548, %r1547, 1;
add.s32 %r1549, %r1548, 2;
shl.b32 %r1550, %r1549, %r667;
or.b32 %r2083, %r1550, %r1538;
mov.u64 %rd502, %rd94;
$L__BB27_365:
mul.wide.u32 %rd412, %r2071, 4;
add.s64 %rd413, %rd2, %rd412;
st.global.u32 [%rd413], %r2083;
and.b32 %r1553, %r751, 32;
setp.eq.s32 %p483, %r1553, 0;
mov.u32 %r2092, %r2091;
@%p483 bra $L__BB27_371;
setp.gt.u32 %p484, %r2089, 31;
@%p484 bra $L__BB27_370;
$L__BB27_367:
setp.ge.u32 %p485, %r2090, %r16;
mov.u16 %rs1318, 255;
@%p485 bra $L__BB27_369;
add.s32 %r769, %r2090, 1;
cvt.u64.u32 %rd414, %r2090;
add.s64 %rd415, %rd414, %rd3;
add.s64 %rd416, %rd1, %rd415;
ld.global.u8 %rs1318, [%rd416];
mov.u32 %r2090, %r769;
$L__BB27_369:
and.b16 %rs929, %rs1318, 255;
cvt.u64.u16 %rd417, %rs1318;
and.b64 %rd418, %rd417, 255;
shl.b64 %rd419, %rd418, %r2089;
or.b64 %rd502, %rd419, %rd502;
cvt.u32.u16 %r1554, %rs1320;
cvt.s32.s8 %r1555, %r1554;
mov.u32 %r1556, 8;
sub.s32 %r1557, %r1556, %r1555;
add.s32 %r2089, %r1557, %r2089;
setp.eq.s16 %p486, %rs929, 255;
selp.u16 %rs1320, 1, 0, %p486;
setp.lt.u32 %p487, %r2089, 33;
@%p487 bra $L__BB27_367;
$L__BB27_370:
shr.u32 %r1558, %r751, 13;
and.b32 %r1559, %r1558, 1;
sub.s32 %r1560, %r754, %r1559;
shr.u64 %rd99, %rd502, %r1560;
sub.s32 %r2089, %r2089, %r1560;
cvt.u32.u64 %r1561, %rd502;
shl.b32 %r1562, %r1561, 31;
setp.eq.s32 %p488, %r1560, 0;
mov.u32 %r1563, -1;
shl.b32 %r1564, %r1563, %r1560;
not.b32 %r1565, %r1564;
selp.b32 %r1566, 0, %r1565, %p488;
and.b32 %r1567, %r1566, %r1561;
shr.u32 %r1568, %r751, 9;
and.b32 %r1569, %r1568, 1;
shl.b32 %r1570, %r1569, %r1560;
or.b32 %r1571, %r1570, %r1567;
or.b32 %r2092, %r1571, 1;
add.s32 %r1572, %r2092, 2;
shl.b32 %r1573, %r1572, %r667;
or.b32 %r2091, %r1573, %r1562;
mov.u64 %rd502, %rd99;
$L__BB27_371:
setp.ge.u32 %p489, %r741, %r822;
@%p489 bra $L__BB27_373;
add.s32 %r1574, %r2071, %r12;
mul.wide.u32 %rd420, %r1574, 4;
add.s64 %rd421, %rd2, %rd420;
st.global.u32 [%rd421], %r2091;
$L__BB27_373:
or.b32 %r1576, %r2092, %r2072;
mul.wide.u32 %rd422, %r2070, 4;
add.s64 %rd423, %rd76, %rd422;
st.local.u32 [%rd423], %r1576;
add.s32 %r781, %r2071, 1;
add.s32 %r1577, %r2073, 1;
setp.ge.u32 %p490, %r1577, %r820;
mov.u32 %r2072, 0;
@%p490 bra $L__BB27_389;
and.b32 %r1579, %r751, 64;
setp.eq.s32 %p491, %r1579, 0;
mov.u32 %r2108, 0;
mov.u32 %r2100, %r2108;
@%p491 bra $L__BB27_380;
setp.gt.u32 %p492, %r2089, 31;
@%p492 bra $L__BB27_379;
$L__BB27_376:
setp.ge.u32 %p493, %r2090, %r16;
mov.u16 %rs1322, 255;
@%p493 bra $L__BB27_378;
add.s32 %r784, %r2090, 1;
cvt.u64.u32 %rd424, %r2090;
add.s64 %rd425, %rd424, %rd3;
add.s64 %rd426, %rd1, %rd425;
ld.global.u8 %rs1322, [%rd426];
mov.u32 %r2090, %r784;
$L__BB27_378:
and.b16 %rs931, %rs1322, 255;
cvt.u64.u16 %rd427, %rs1322;
and.b64 %rd428, %rd427, 255;
shl.b64 %rd429, %rd428, %r2089;
or.b64 %rd502, %rd429, %rd502;
cvt.u32.u16 %r1580, %rs1320;
cvt.s32.s8 %r1581, %r1580;
mov.u32 %r1582, 8;
sub.s32 %r1583, %r1582, %r1581;
add.s32 %r2089, %r1583, %r2089;
setp.eq.s16 %p494, %rs931, 255;
selp.u16 %rs1320, 1, 0, %p494;
setp.lt.u32 %p495, %r2089, 33;
@%p495 bra $L__BB27_376;
$L__BB27_379:
shr.u32 %r1584, %r751, 14;
and.b32 %r1585, %r1584, 1;
sub.s32 %r1586, %r754, %r1585;
shr.u64 %rd104, %rd502, %r1586;
sub.s32 %r2089, %r2089, %r1586;
cvt.u32.u64 %r1587, %rd502;
shl.b32 %r1588, %r1587, 31;
setp.eq.s32 %p496, %r1586, 0;
mov.u32 %r1589, -1;
shl.b32 %r1590, %r1589, %r1586;
not.b32 %r1591, %r1590;
selp.b32 %r1592, 0, %r1591, %p496;
and.b32 %r1593, %r1592, %r1587;
shr.u32 %r1594, %r751, 10;
and.b32 %r1595, %r1594, 1;
shl.b32 %r1596, %r1595, %r1586;
or.b32 %r1597, %r1596, %r1593;
or.b32 %r1598, %r1597, 1;
add.s32 %r1599, %r1598, 2;
shl.b32 %r1600, %r1599, %r667;
or.b32 %r2100, %r1600, %r1588;
mov.u64 %rd502, %rd104;
$L__BB27_380:
mul.wide.u32 %rd430, %r781, 4;
add.s64 %rd431, %rd2, %rd430;
st.global.u32 [%rd431], %r2100;
and.b32 %r1603, %r751, 128;
setp.eq.s32 %p497, %r1603, 0;
mov.u32 %r2072, %r2108;
@%p497 bra $L__BB27_386;
setp.gt.u32 %p498, %r2089, 31;
@%p498 bra $L__BB27_385;
$L__BB27_382:
setp.ge.u32 %p499, %r2090, %r16;
mov.u16 %rs1326, 255;
@%p499 bra $L__BB27_384;
add.s32 %r796, %r2090, 1;
cvt.u64.u32 %rd432, %r2090;
add.s64 %rd433, %rd432, %rd3;
add.s64 %rd434, %rd1, %rd433;
ld.global.u8 %rs1326, [%rd434];
mov.u32 %r2090, %r796;
$L__BB27_384:
and.b16 %rs933, %rs1326, 255;
cvt.u64.u16 %rd435, %rs1326;
and.b64 %rd436, %rd435, 255;
shl.b64 %rd437, %rd436, %r2089;
or.b64 %rd502, %rd437, %rd502;
cvt.u32.u16 %r1604, %rs1320;
cvt.s32.s8 %r1605, %r1604;
mov.u32 %r1606, 8;
sub.s32 %r1607, %r1606, %r1605;
add.s32 %r2089, %r1607, %r2089;
setp.eq.s16 %p500, %rs933, 255;
selp.u16 %rs1320, 1, 0, %p500;
setp.lt.u32 %p501, %r2089, 33;
@%p501 bra $L__BB27_382;
$L__BB27_385:
shr.u32 %r1608, %r751, 15;
sub.s32 %r1609, %r754, %r1608;
shr.u64 %rd109, %rd502, %r1609;
sub.s32 %r2089, %r2089, %r1609;
cvt.u32.u64 %r1610, %rd502;
shl.b32 %r1611, %r1610, 31;
setp.eq.s32 %p502, %r1609, 0;
mov.u32 %r1612, -1;
shl.b32 %r1613, %r1612, %r1609;
not.b32 %r1614, %r1613;
selp.b32 %r1615, 0, %r1614, %p502;
and.b32 %r1616, %r1615, %r1610;
shr.u32 %r1617, %r751, 11;
and.b32 %r1618, %r1617, 1;
shl.b32 %r1619, %r1618, %r1609;
or.b32 %r1620, %r1619, %r1616;
or.b32 %r2072, %r1620, 1;
add.s32 %r1621, %r2072, 2;
shl.b32 %r1622, %r1621, %r667;
or.b32 %r2108, %r1622, %r1611;
mov.u64 %rd502, %rd109;
$L__BB27_386:
@%p489 bra $L__BB27_388;
add.s32 %r1623, %r781, %r12;
mul.wide.u32 %rd438, %r1623, 4;
add.s64 %rd439, %rd2, %rd438;
st.global.u32 [%rd439], %r2108;
$L__BB27_388:
add.s32 %r2071, %r2071, 2;
add.s32 %r2069, %r2069, 2;
add.s32 %r2073, %r2073, 2;
setp.lt.u32 %p504, %r2073, %r820;
mov.u32 %r2068, %r753;
mov.u32 %r2070, %r752;
@%p504 bra $L__BB27_358;
$L__BB27_389:
st.local.u32 [%rd90], %r2072;
add.s32 %r2065, %r2065, 2;
setp.lt.u32 %p505, %r2065, %r822;
@%p505 bra $L__BB27_357;
bra.uni $L__BB27_399;
$L__BB27_390:
mov.u32 %r1628, 1;
st.global.u32 [%rd4], %r1628;
mov.u32 %r1629, 14;
st.global.u32 [%rd4+4], %r1629;
mov.u32 %r1630, 0;
st.global.u32 [%rd4+8], %r1630;
st.global.u32 [%rd4+12], %r1630;
$L__BB27_399:
ret;
}
// .globl j2k_htj2k_decode_codeblocks_multi_cleanup_dequantize
.visible .entry j2k_htj2k_decode_codeblocks_multi_cleanup_dequantize(
.param .u64 j2k_htj2k_decode_codeblocks_multi_cleanup_dequantize_param_0,
.param .u64 j2k_htj2k_decode_codeblocks_multi_cleanup_dequantize_param_1,
.param .u64 j2k_htj2k_decode_codeblocks_multi_cleanup_dequantize_param_2,
.param .u64 j2k_htj2k_decode_codeblocks_multi_cleanup_dequantize_param_3,
.param .u64 j2k_htj2k_decode_codeblocks_multi_cleanup_dequantize_param_4,
.param .u64 j2k_htj2k_decode_codeblocks_multi_cleanup_dequantize_param_5,
.param .u64 j2k_htj2k_decode_codeblocks_multi_cleanup_dequantize_param_6,
.param .u32 j2k_htj2k_decode_codeblocks_multi_cleanup_dequantize_param_7
)
{
.local .align 16 .b8 __local_depot28[6720];
.reg .b64 %SP;
.reg .b64 %SPL;
.reg .pred %p<516>;
.reg .b16 %rs<1330>;
.reg .f32 %f<25>;
.reg .b32 %r<2157>;
.reg .b64 %rd<510>;
mov.u64 %SPL, __local_depot28;
ld.param.u64 %rd118, [ j2k_htj2k_decode_codeblocks_multi_cleanup_dequantize_param_0];
ld.param.u64 %rd112, [ j2k_htj2k_decode_codeblocks_multi_cleanup_dequantize_param_1];
ld.param.u64 %rd117, [ j2k_htj2k_decode_codeblocks_multi_cleanup_dequantize_param_6];
ld.param.u32 %r819, [ j2k_htj2k_decode_codeblocks_multi_cleanup_dequantize_param_7];
cvta.to.global.u64 %rd1, %rd118;
mov.u32 %r820, %ntid.x;
mov.u32 %r821, %ctaid.x;
mov.u32 %r822, %tid.x;
mad.lo.s32 %r1, %r821, %r820, %r822;
setp.ge.u32 %p1, %r1, %r819;
@%p1 bra $L__BB28_401;
cvta.to.global.u64 %rd119, %rd117;
cvta.to.global.u64 %rd120, %rd112;
mul.wide.u32 %rd121, %r1, 64;
add.s64 %rd122, %rd120, %rd121;
ld.global.u64 %rd123, [%rd122];
cvta.to.global.u64 %rd2, %rd123;
ld.global.v2.u32 {%r823, %r824}, [%rd122+8];
mov.u32 %r825, 0;
ld.global.v2.u32 {%r826, %r827}, [%rd122+16];
ld.global.v2.u32 {%r828, %r829}, [%rd122+24];
ld.global.v2.u32 {%r831, %r832}, [%rd122+32];
ld.global.v2.u32 {%r833, %r834}, [%rd122+40];
ld.global.v2.u32 {%r835, %r836}, [%rd122+48];
cvt.u64.u32 %rd3, %r823;
mul.wide.u32 %rd124, %r1, 16;
add.s64 %rd4, %rd119, %rd124;
st.global.u32 [%rd4], %r825;
st.global.u32 [%rd4+4], %r825;
st.global.u32 [%rd4+8], %r825;
st.global.u32 [%rd4+12], %r825;
setp.eq.s32 %p2, %r829, 0;
@%p2 bra $L__BB28_3;
mov.u32 %r837, 2;
st.global.u32 [%rd4], %r837;
mov.u32 %r838, 17;
st.global.u32 [%rd4+4], %r838;
st.global.u32 [%rd4+8], %r825;
st.global.u32 [%rd4+12], %r825;
bra.uni $L__BB28_401;
$L__BB28_3:
setp.gt.u32 %p3, %r833, 1;
@%p3 bra $L__BB28_400;
bra.uni $L__BB28_4;
$L__BB28_400:
mov.u32 %r1704, 2;
st.global.u32 [%rd4], %r1704;
mov.u32 %r1705, 18;
st.global.u32 [%rd4+4], %r1705;
mov.u32 %r1706, 0;
st.global.u32 [%rd4+8], %r1706;
st.global.u32 [%rd4+12], %r1706;
bra.uni $L__BB28_401;
$L__BB28_4:
setp.eq.s32 %p4, %r824, 0;
setp.eq.s32 %p5, %r826, 0;
or.pred %p6, %p4, %p5;
@%p6 bra $L__BB28_401;
setp.gt.u32 %p7, %r824, 256;
setp.gt.u32 %p8, %r826, 256;
or.pred %p9, %p7, %p8;
mul.lo.s32 %r840, %r826, %r824;
setp.gt.u32 %p10, %r840, 4096;
or.pred %p11, %p9, %p10;
@%p11 bra $L__BB28_399;
bra.uni $L__BB28_6;
$L__BB28_399:
mov.u32 %r1701, 2;
st.global.u32 [%rd4], %r1701;
mov.u32 %r1702, 1;
st.global.u32 [%rd4+4], %r1702;
mov.u32 %r1703, 0;
st.global.u32 [%rd4+8], %r1703;
st.global.u32 [%rd4+12], %r1703;
bra.uni $L__BB28_401;
$L__BB28_6:
add.s32 %r841, %r832, -1;
setp.gt.u32 %p12, %r841, 30;
@%p12 bra $L__BB28_398;
bra.uni $L__BB28_7;
$L__BB28_398:
mov.u32 %r1698, 1;
st.global.u32 [%rd4], %r1698;
mov.u32 %r1699, 2;
st.global.u32 [%rd4+4], %r1699;
mov.u32 %r1700, 0;
st.global.u32 [%rd4+8], %r1700;
st.global.u32 [%rd4+12], %r1700;
bra.uni $L__BB28_401;
$L__BB28_7:
setp.gt.u32 %p13, %r831, 29;
@%p13 bra $L__BB28_397;
bra.uni $L__BB28_8;
$L__BB28_397:
mov.u32 %r1695, 1;
st.global.u32 [%rd4], %r1695;
mov.u32 %r1696, 3;
st.global.u32 [%rd4+4], %r1696;
mov.u32 %r1697, 0;
st.global.u32 [%rd4+8], %r1697;
st.global.u32 [%rd4+12], %r1697;
bra.uni $L__BB28_401;
$L__BB28_8:
setp.lt.u32 %p14, %r828, 2;
setp.lt.u32 %p15, %r827, %r828;
or.pred %p16, %p14, %p15;
@%p16 bra $L__BB28_396;
bra.uni $L__BB28_9;
$L__BB28_396:
mov.u32 %r1692, 1;
st.global.u32 [%rd4], %r1692;
mov.u32 %r1693, 4;
st.global.u32 [%rd4+4], %r1693;
mov.u32 %r1694, 0;
st.global.u32 [%rd4+8], %r1694;
st.global.u32 [%rd4+12], %r1694;
bra.uni $L__BB28_401;
$L__BB28_9:
add.s32 %r842, %r828, -1;
cvt.u64.u32 %rd125, %r842;
add.s64 %rd126, %rd125, %rd3;
add.s64 %rd127, %rd1, %rd126;
ld.global.u8 %rs652, [%rd127];
mul.wide.u16 %r843, %rs652, 16;
add.s32 %r844, %r828, -2;
cvt.u64.u32 %rd128, %r844;
add.s64 %rd129, %rd128, %rd3;
add.s64 %rd130, %rd1, %rd129;
ld.global.u8 %rs1, [%rd130];
and.b16 %rs653, %rs1, 15;
cvt.u32.u16 %r845, %rs653;
or.b32 %r16, %r843, %r845;
setp.lt.u32 %p17, %r828, %r16;
add.s32 %r17, %r16, -2;
setp.gt.u32 %p18, %r17, 4077;
or.pred %p19, %p17, %p18;
@%p19 bra $L__BB28_395;
bra.uni $L__BB28_10;
$L__BB28_395:
mov.u32 %r1689, 1;
st.global.u32 [%rd4], %r1689;
mov.u32 %r1690, 5;
st.global.u32 [%rd4+4], %r1690;
mov.u32 %r1691, 0;
st.global.u32 [%rd4+8], %r1691;
st.global.u32 [%rd4+12], %r1691;
bra.uni $L__BB28_401;
$L__BB28_10:
add.s32 %r846, %r826, 1;
shr.u32 %r847, %r846, 1;
add.s32 %r848, %r824, 9;
and.b32 %r849, %r848, -8;
setp.gt.u32 %p20, %r849, 264;
add.s32 %r850, %r847, 1;
mul.lo.s32 %r851, %r850, %r849;
setp.gt.u32 %p21, %r851, 3096;
or.pred %p22, %p20, %p21;
@%p22 bra $L__BB28_394;
bra.uni $L__BB28_11;
$L__BB28_394:
mov.u32 %r1686, 2;
st.global.u32 [%rd4], %r1686;
mov.u32 %r1687, 6;
st.global.u32 [%rd4+4], %r1687;
mov.u32 %r1688, 0;
st.global.u32 [%rd4+8], %r1688;
st.global.u32 [%rd4+12], %r1688;
bra.uni $L__BB28_401;
$L__BB28_11:
and.b16 %rs940, %rs1, 15;
cvt.u32.u16 %r1713, %rs940;
mul.wide.u16 %r1712, %rs652, 16;
or.b32 %r1711, %r1712, %r1713;
sub.s32 %r18, %r828, %r1711;
add.s32 %r1791, %r1711, -1;
add.s32 %r1959, %r828, -3;
mov.u64 %rd455, 0;
mov.u32 %r1881, 0;
mov.u16 %rs1013, 0;
mov.u64 %rd133, _ZZ20mel_decode_more_runsR10MelDecoderE7MEL_EXP;
mov.u32 %r1790, %r18;
mov.u16 %rs1014, %rs1013;
mov.u16 %rs1048, %rs1013;
mov.u32 %r1763, %r1881;
$L__BB28_12:
setp.gt.u32 %p24, %r1763, 7;
@%p24 bra $L__BB28_57;
mul.wide.u32 %rd132, %r1881, 4;
add.s64 %rd134, %rd133, %rd132;
ld.global.nc.u32 %r25, [%rd134];
and.b16 %rs657, %rs1048, 255;
setp.ne.s16 %p25, %rs657, 0;
mov.u16 %rs948, %rs1048;
@%p25 bra $L__BB28_17;
setp.eq.s32 %p26, %r1791, 0;
mov.u16 %rs945, 255;
@%p26 bra $L__BB28_16;
cvt.u64.u32 %rd135, %r1790;
add.s64 %rd136, %rd135, %rd3;
add.s64 %rd137, %rd1, %rd136;
ld.global.u8 %rs945, [%rd137];
$L__BB28_16:
setp.ne.s32 %p28, %r1791, 0;
selp.u32 %r854, 1, 0, %p28;
add.s32 %r1790, %r1790, %r854;
add.s32 %r855, %r1791, -1;
selp.b32 %r1791, 0, %r855, %p26;
setp.eq.s32 %p29, %r1791, 0;
or.b16 %rs659, %rs945, 15;
selp.b16 %rs1014, %rs659, %rs945, %p29;
and.b16 %rs660, %rs1014, 255;
mov.u16 %rs661, 8;
sub.s16 %rs948, %rs661, %rs1013;
setp.eq.s16 %p30, %rs660, 255;
selp.u16 %rs1013, 1, 0, %p30;
$L__BB28_17:
add.s16 %rs15, %rs948, -1;
cvt.u32.u16 %r856, %rs15;
and.b32 %r857, %r856, 255;
mov.u32 %r858, 1;
shl.b32 %r859, %r858, %r857;
cvt.u32.u16 %r860, %rs1014;
and.b32 %r861, %r859, %r860;
and.b32 %r30, %r861, 255;
add.s16 %rs1048, %rs948, -1;
setp.eq.s32 %p31, %r30, 0;
@%p31 bra $L__BB28_19;
add.s32 %r862, %r1881, 1;
min.u32 %r1758, %r862, 12;
mov.u32 %r863, -1;
shl.b32 %r864, %r863, %r25;
shl.b32 %r865, %r864, 1;
xor.b32 %r1759, %r865, -2;
bra.uni $L__BB28_56;
$L__BB28_19:
cvt.u64.u32 %rd453, %r1881;
add.s64 %rd138, %rd453, -3;
setp.gt.u64 %p32, %rd138, 9;
mov.u32 %r1755, 0;
@%p32 bra $L__BB28_55;
add.s16 %rs1048, %rs948, -1;
max.u32 %r33, %r25, 1;
add.s32 %r869, %r33, -1;
setp.lt.u32 %p33, %r869, 3;
mov.u32 %r1755, 0;
@%p33 bra $L__BB28_39;
and.b32 %r1707, %r33, 3;
add.s16 %rs1048, %rs948, -1;
sub.s32 %r1727, %r33, %r1707;
mov.u32 %r1755, 0;
$L__BB28_22:
and.b16 %rs663, %rs1048, 255;
setp.ne.s16 %p34, %rs663, 0;
@%p34 bra $L__BB28_26;
setp.eq.s32 %p35, %r1791, 0;
mov.u16 %rs952, 255;
@%p35 bra $L__BB28_25;
cvt.u64.u32 %rd139, %r1790;
add.s64 %rd140, %rd139, %rd3;
add.s64 %rd141, %rd1, %rd140;
ld.global.u8 %rs952, [%rd141];
$L__BB28_25:
setp.ne.s32 %p37, %r1791, 0;
selp.u32 %r871, 1, 0, %p37;
add.s32 %r1790, %r1790, %r871;
add.s32 %r872, %r1791, -1;
selp.b32 %r1791, 0, %r872, %p35;
setp.eq.s32 %p38, %r1791, 0;
or.b16 %rs665, %rs952, 15;
selp.b16 %rs1014, %rs665, %rs952, %p38;
and.b16 %rs666, %rs1014, 255;
mov.u16 %rs667, 8;
sub.s16 %rs1048, %rs667, %rs1013;
setp.eq.s16 %p39, %rs666, 255;
selp.u16 %rs1013, 1, 0, %p39;
$L__BB28_26:
add.s16 %rs959, %rs1048, -1;
and.b16 %rs668, %rs959, 255;
cvt.u32.u16 %r873, %rs959;
and.b32 %r874, %r873, 255;
cvt.u32.u16 %r875, %rs1014;
and.b32 %r1733, %r875, 255;
shr.u32 %r876, %r1733, %r874;
and.b32 %r877, %r876, 1;
bfi.b32 %r45, %r1755, %r877, 1, 31;
setp.ne.s16 %p40, %rs668, 0;
@%p40 bra $L__BB28_30;
setp.eq.s32 %p41, %r1791, 0;
mov.u16 %rs956, 255;
@%p41 bra $L__BB28_29;
cvt.u64.u32 %rd142, %r1790;
add.s64 %rd143, %rd142, %rd3;
add.s64 %rd144, %rd1, %rd143;
ld.global.u8 %rs956, [%rd144];
$L__BB28_29:
setp.ne.s32 %p43, %r1791, 0;
selp.u32 %r878, 1, 0, %p43;
add.s32 %r1790, %r1790, %r878;
add.s32 %r879, %r1791, -1;
selp.b32 %r1791, 0, %r879, %p41;
setp.eq.s32 %p44, %r1791, 0;
or.b16 %rs670, %rs956, 15;
selp.b16 %rs1014, %rs670, %rs956, %p44;
and.b16 %rs671, %rs1014, 255;
mov.u16 %rs672, 8;
sub.s16 %rs959, %rs672, %rs1013;
setp.eq.s16 %p45, %rs671, 255;
selp.u16 %rs1013, 1, 0, %p45;
cvt.u32.u16 %r880, %rs1014;
and.b32 %r1733, %r880, 255;
$L__BB28_30:
add.s16 %rs963, %rs959, -1;
and.b16 %rs673, %rs963, 255;
cvt.u32.u16 %r881, %rs963;
and.b32 %r882, %r881, 255;
shr.u32 %r883, %r1733, %r882;
and.b32 %r884, %r883, 1;
bfi.b32 %r52, %r45, %r884, 1, 31;
setp.ne.s16 %p46, %rs673, 0;
@%p46 bra $L__BB28_34;
setp.eq.s32 %p47, %r1791, 0;
mov.u16 %rs960, 255;
@%p47 bra $L__BB28_33;
cvt.u64.u32 %rd145, %r1790;
add.s64 %rd146, %rd145, %rd3;
add.s64 %rd147, %rd1, %rd146;
ld.global.u8 %rs960, [%rd147];
$L__BB28_33:
setp.ne.s32 %p49, %r1791, 0;
selp.u32 %r885, 1, 0, %p49;
add.s32 %r1790, %r1790, %r885;
add.s32 %r886, %r1791, -1;
selp.b32 %r1791, 0, %r886, %p47;
setp.eq.s32 %p50, %r1791, 0;
or.b16 %rs675, %rs960, 15;
selp.b16 %rs1014, %rs675, %rs960, %p50;
and.b16 %rs676, %rs1014, 255;
mov.u16 %rs677, 8;
sub.s16 %rs963, %rs677, %rs1013;
setp.eq.s16 %p51, %rs676, 255;
selp.u16 %rs1013, 1, 0, %p51;
cvt.u32.u16 %r887, %rs1014;
and.b32 %r1733, %r887, 255;
$L__BB28_34:
add.s16 %rs967, %rs963, -1;
and.b16 %rs678, %rs967, 255;
cvt.u32.u16 %r888, %rs967;
and.b32 %r889, %r888, 255;
shr.u32 %r890, %r1733, %r889;
and.b32 %r891, %r890, 1;
bfi.b32 %r59, %r52, %r891, 1, 31;
setp.ne.s16 %p52, %rs678, 0;
@%p52 bra $L__BB28_38;
setp.eq.s32 %p53, %r1791, 0;
mov.u16 %rs964, 255;
@%p53 bra $L__BB28_37;
cvt.u64.u32 %rd148, %r1790;
add.s64 %rd149, %rd148, %rd3;
add.s64 %rd150, %rd1, %rd149;
ld.global.u8 %rs964, [%rd150];
$L__BB28_37:
setp.ne.s32 %p55, %r1791, 0;
selp.u32 %r892, 1, 0, %p55;
add.s32 %r1790, %r1790, %r892;
add.s32 %r893, %r1791, -1;
selp.b32 %r1791, 0, %r893, %p53;
setp.eq.s32 %p56, %r1791, 0;
or.b16 %rs680, %rs964, 15;
selp.b16 %rs1014, %rs680, %rs964, %p56;
and.b16 %rs681, %rs1014, 255;
mov.u16 %rs682, 8;
sub.s16 %rs967, %rs682, %rs1013;
setp.eq.s16 %p57, %rs681, 255;
selp.u16 %rs1013, 1, 0, %p57;
cvt.u32.u16 %r894, %rs1014;
and.b32 %r1733, %r894, 255;
$L__BB28_38:
add.s16 %rs1048, %rs967, -1;
cvt.u32.u16 %r895, %rs1048;
and.b32 %r896, %r895, 255;
shr.u32 %r897, %r1733, %r896;
and.b32 %r898, %r897, 1;
bfi.b32 %r1755, %r59, %r898, 1, 31;
add.s32 %r1727, %r1727, -4;
setp.ne.s32 %p58, %r1727, 0;
@%p58 bra $L__BB28_22;
$L__BB28_39:
and.b32 %r1708, %r33, 3;
setp.eq.s32 %p59, %r1708, 0;
@%p59 bra $L__BB28_55;
and.b16 %rs683, %rs1048, 255;
setp.ne.s16 %p60, %rs683, 0;
@%p60 bra $L__BB28_44;
setp.eq.s32 %p61, %r1791, 0;
mov.u16 %rs974, 255;
@%p61 bra $L__BB28_43;
cvt.u64.u32 %rd151, %r1790;
add.s64 %rd152, %rd151, %rd3;
add.s64 %rd153, %rd1, %rd152;
ld.global.u8 %rs974, [%rd153];
$L__BB28_43:
setp.ne.s32 %p63, %r1791, 0;
selp.u32 %r899, 1, 0, %p63;
add.s32 %r1790, %r1790, %r899;
add.s32 %r900, %r1791, -1;
selp.b32 %r1791, 0, %r900, %p61;
setp.eq.s32 %p64, %r1791, 0;
or.b16 %rs685, %rs974, 15;
selp.b16 %rs1014, %rs685, %rs974, %p64;
and.b16 %rs686, %rs1014, 255;
mov.u16 %rs687, 8;
sub.s16 %rs1048, %rs687, %rs1013;
setp.eq.s16 %p65, %rs686, 255;
selp.u16 %rs1013, 1, 0, %p65;
$L__BB28_44:
and.b32 %r1709, %r33, 3;
add.s16 %rs1048, %rs1048, -1;
cvt.u32.u16 %r901, %rs1048;
and.b32 %r902, %r901, 255;
cvt.u32.u16 %r903, %rs1014;
and.b32 %r1750, %r903, 255;
shr.u32 %r904, %r1750, %r902;
and.b32 %r905, %r904, 1;
bfi.b32 %r1755, %r1755, %r905, 1, 31;
setp.eq.s32 %p66, %r1709, 1;
@%p66 bra $L__BB28_55;
and.b16 %rs688, %rs1048, 255;
setp.ne.s16 %p67, %rs688, 0;
@%p67 bra $L__BB28_49;
setp.eq.s32 %p68, %r1791, 0;
mov.u16 %rs978, 255;
@%p68 bra $L__BB28_48;
cvt.u64.u32 %rd154, %r1790;
add.s64 %rd155, %rd154, %rd3;
add.s64 %rd156, %rd1, %rd155;
ld.global.u8 %rs978, [%rd156];
$L__BB28_48:
setp.ne.s32 %p70, %r1791, 0;
selp.u32 %r906, 1, 0, %p70;
add.s32 %r1790, %r1790, %r906;
add.s32 %r907, %r1791, -1;
selp.b32 %r1791, 0, %r907, %p68;
setp.eq.s32 %p71, %r1791, 0;
or.b16 %rs690, %rs978, 15;
selp.b16 %rs1014, %rs690, %rs978, %p71;
and.b16 %rs691, %rs1014, 255;
mov.u16 %rs692, 8;
sub.s16 %rs1048, %rs692, %rs1013;
setp.eq.s16 %p72, %rs691, 255;
selp.u16 %rs1013, 1, 0, %p72;
cvt.u32.u16 %r908, %rs1014;
and.b32 %r1750, %r908, 255;
$L__BB28_49:
and.b32 %r1710, %r33, 3;
add.s16 %rs1048, %rs1048, -1;
cvt.u32.u16 %r909, %rs1048;
and.b32 %r910, %r909, 255;
shr.u32 %r911, %r1750, %r910;
and.b32 %r912, %r911, 1;
bfi.b32 %r1755, %r1755, %r912, 1, 31;
setp.eq.s32 %p73, %r1710, 2;
@%p73 bra $L__BB28_55;
and.b16 %rs693, %rs1048, 255;
setp.ne.s16 %p74, %rs693, 0;
@%p74 bra $L__BB28_54;
setp.eq.s32 %p75, %r1791, 0;
mov.u16 %rs982, 255;
@%p75 bra $L__BB28_53;
cvt.u64.u32 %rd157, %r1790;
add.s64 %rd158, %rd157, %rd3;
add.s64 %rd159, %rd1, %rd158;
ld.global.u8 %rs982, [%rd159];
$L__BB28_53:
setp.ne.s32 %p77, %r1791, 0;
selp.u32 %r913, 1, 0, %p77;
add.s32 %r1790, %r1790, %r913;
add.s32 %r914, %r1791, -1;
selp.b32 %r1791, 0, %r914, %p75;
setp.eq.s32 %p78, %r1791, 0;
or.b16 %rs695, %rs982, 15;
selp.b16 %rs1014, %rs695, %rs982, %p78;
and.b16 %rs696, %rs1014, 255;
mov.u16 %rs697, 8;
sub.s16 %rs1048, %rs697, %rs1013;
setp.eq.s16 %p79, %rs696, 255;
selp.u16 %rs1013, 1, 0, %p79;
cvt.u32.u16 %r915, %rs1014;
and.b32 %r1750, %r915, 255;
$L__BB28_54:
add.s16 %rs1048, %rs1048, -1;
cvt.u32.u16 %r916, %rs1048;
and.b32 %r917, %r916, 255;
shr.u32 %r918, %r1750, %r917;
and.b32 %r919, %r918, 1;
bfi.b32 %r1755, %r1755, %r919, 1, 31;
$L__BB28_55:
shl.b32 %r920, %r1755, 1;
or.b32 %r1759, %r920, 1;
add.s32 %r921, %r1881, -1;
setp.eq.s32 %p80, %r1881, 0;
selp.b32 %r1758, 0, %r921, %p80;
$L__BB28_56:
mul.lo.s32 %r922, %r1763, 7;
cvt.u64.u32 %rd160, %r1759;
shl.b64 %rd161, %rd160, %r922;
or.b64 %rd455, %rd161, %rd455;
setp.ne.s32 %p81, %r1881, 12;
setp.ne.s32 %p82, %r30, 0;
or.pred %p83, %p81, %p82;
add.s32 %r1763, %r1763, 1;
setp.lt.u32 %p84, %r1763, 8;
or.pred %p85, %p84, %p83;
mov.u32 %r1881, %r1758;
@%p85 bra $L__BB28_12;
$L__BB28_57:
and.b16 %rs941, %rs1, 15;
cvt.u32.u16 %r1717, %rs941;
mul.wide.u16 %r1716, %rs652, 16;
or.b32 %r1715, %r1716, %r1717;
add.s32 %r1960, %r1715, -2;
setp.gt.u16 %p515, %rs1, 143;
selp.u16 %rs1180, 1, 0, %p515;
ld.param.u64 %rd452, [ j2k_htj2k_decode_codeblocks_multi_cleanup_dequantize_param_4];
shr.u16 %rs934, %rs1, 4;
ld.param.u64 %rd449, [ j2k_htj2k_decode_codeblocks_multi_cleanup_dequantize_param_2];
add.s32 %r1882, %r1763, -1;
shr.u64 %rd465, %rd455, 7;
cvt.u32.u64 %r925, %rd455;
and.b32 %r1878, %r925, 127;
cvt.u64.u16 %rd474, %rs934;
and.b64 %rd162, %rd474, 7;
setp.eq.s64 %p86, %rd162, 7;
selp.b32 %r1961, 3, 4, %p86;
cvta.to.global.u64 %rd11, %rd452;
cvta.to.global.u64 %rd12, %rd449;
add.u64 %rd13, %SPL, 0;
mov.u32 %r1764, 0;
mov.u32 %r1765, %r1764;
$L__BB28_58:
setp.gt.u32 %p87, %r1961, 31;
@%p87 bra $L__BB28_62;
$L__BB28_59:
setp.eq.s32 %p88, %r1960, 0;
mov.u16 %rs1000, 0;
@%p88 bra $L__BB28_61;
cvt.s64.s32 %rd164, %r1959;
add.s64 %rd165, %rd164, %rd3;
add.s64 %rd166, %rd1, %rd165;
ld.global.u8 %rs1000, [%rd166];
$L__BB28_61:
setp.ne.s32 %p90, %r1960, 0;
selp.b32 %r926, -1, 0, %p90;
add.s32 %r1959, %r1959, %r926;
add.s32 %r927, %r1960, -1;
selp.b32 %r1960, 0, %r927, %p88;
and.b16 %rs699, %rs1000, 255;
and.b16 %rs700, %rs1000, 127;
setp.eq.s16 %p91, %rs700, 127;
and.b16 %rs701, %rs1180, 255;
setp.ne.s16 %p92, %rs701, 0;
and.pred %p93, %p92, %p91;
selp.b32 %r928, 7, 8, %p93;
cvt.u64.u16 %rd167, %rs1000;
and.b64 %rd168, %rd167, 255;
shl.b64 %rd169, %rd168, %r1961;
or.b64 %rd474, %rd169, %rd474;
add.s32 %r1961, %r928, %r1961;
setp.gt.u16 %p94, %rs699, 143;
selp.u16 %rs1180, 1, 0, %p94;
setp.lt.u32 %p95, %r1961, 33;
@%p95 bra $L__BB28_59;
$L__BB28_62:
cvt.u32.u64 %r929, %rd474;
and.b32 %r930, %r929, 127;
add.s32 %r931, %r930, %r1764;
mul.wide.u32 %rd170, %r931, 2;
add.s64 %rd171, %rd12, %rd170;
ld.global.u16 %r1831, [%rd171];
setp.ne.s32 %p96, %r1764, 0;
@%p96 bra $L__BB28_112;
add.s32 %r132, %r1878, -2;
setp.eq.s32 %p97, %r132, -1;
selp.b32 %r1831, %r1831, 0, %p97;
setp.gt.s32 %p98, %r1878, 1;
mov.u32 %r1878, %r132;
@%p98 bra $L__BB28_112;
setp.ne.s32 %p99, %r1882, 0;
@%p99 bra $L__BB28_111;
mov.u32 %r1882, 0;
$L__BB28_66:
setp.gt.u32 %p100, %r1882, 7;
@%p100 bra $L__BB28_111;
cvt.u64.u32 %rd20, %r1881;
mul.wide.u32 %rd172, %r1881, 4;
add.s64 %rd174, %rd133, %rd172;
ld.global.nc.u32 %r138, [%rd174];
and.b16 %rs702, %rs1048, 255;
setp.ne.s16 %p101, %rs702, 0;
@%p101 bra $L__BB28_71;
setp.eq.s32 %p102, %r1791, 0;
mov.u16 %rs1005, 255;
@%p102 bra $L__BB28_70;
cvt.u64.u32 %rd175, %r1790;
add.s64 %rd176, %rd175, %rd3;
add.s64 %rd177, %rd1, %rd176;
ld.global.u8 %rs1005, [%rd177];
$L__BB28_70:
setp.ne.s32 %p104, %r1791, 0;
selp.u32 %r933, 1, 0, %p104;
add.s32 %r1790, %r1790, %r933;
add.s32 %r934, %r1791, -1;
selp.b32 %r1791, 0, %r934, %p102;
setp.eq.s32 %p105, %r1791, 0;
or.b16 %rs704, %rs1005, 15;
selp.b16 %rs1014, %rs704, %rs1005, %p105;
and.b16 %rs705, %rs1014, 255;
mov.u16 %rs706, 8;
sub.s16 %rs1048, %rs706, %rs1013;
setp.eq.s16 %p106, %rs705, 255;
selp.u16 %rs1013, 1, 0, %p106;
$L__BB28_71:
add.s16 %rs1048, %rs1048, -1;
cvt.u32.u16 %r935, %rs1048;
and.b32 %r936, %r935, 255;
mov.u32 %r937, 1;
shl.b32 %r938, %r937, %r936;
cvt.u32.u16 %r939, %rs1014;
and.b32 %r940, %r938, %r939;
and.b32 %r143, %r940, 255;
setp.eq.s32 %p107, %r143, 0;
@%p107 bra $L__BB28_73;
add.s32 %r941, %r1881, 1;
min.u32 %r1820, %r941, 12;
mov.u32 %r942, -1;
shl.b32 %r943, %r942, %r138;
shl.b32 %r944, %r943, 1;
xor.b32 %r1821, %r944, -2;
bra.uni $L__BB28_110;
$L__BB28_73:
add.s64 %rd178, %rd20, -3;
setp.gt.u64 %p108, %rd178, 9;
mov.u32 %r1817, 0;
@%p108 bra $L__BB28_109;
max.u32 %r146, %r138, 1;
add.s32 %r948, %r146, -1;
and.b32 %r147, %r146, 3;
setp.lt.u32 %p109, %r948, 3;
mov.u32 %r1817, 0;
@%p109 bra $L__BB28_93;
sub.s32 %r1789, %r146, %r147;
mov.u32 %r1817, 0;
$L__BB28_76:
and.b16 %rs708, %rs1048, 255;
setp.ne.s16 %p110, %rs708, 0;
@%p110 bra $L__BB28_80;
setp.eq.s32 %p111, %r1791, 0;
mov.u16 %rs1012, 255;
@%p111 bra $L__BB28_79;
cvt.u64.u32 %rd179, %r1790;
add.s64 %rd180, %rd179, %rd3;
add.s64 %rd181, %rd1, %rd180;
ld.global.u8 %rs1012, [%rd181];
$L__BB28_79:
setp.ne.s32 %p113, %r1791, 0;
selp.u32 %r950, 1, 0, %p113;
add.s32 %r1790, %r1790, %r950;
add.s32 %r951, %r1791, -1;
selp.b32 %r1791, 0, %r951, %p111;
setp.eq.s32 %p114, %r1791, 0;
or.b16 %rs710, %rs1012, 15;
selp.b16 %rs1014, %rs710, %rs1012, %p114;
and.b16 %rs711, %rs1014, 255;
mov.u16 %rs712, 8;
sub.s16 %rs1048, %rs712, %rs1013;
setp.eq.s16 %p115, %rs711, 255;
selp.u16 %rs1013, 1, 0, %p115;
$L__BB28_80:
add.s16 %rs1019, %rs1048, -1;
and.b16 %rs713, %rs1019, 255;
cvt.u32.u16 %r952, %rs1019;
and.b32 %r953, %r952, 255;
cvt.u32.u16 %r954, %rs1014;
and.b32 %r1795, %r954, 255;
shr.u32 %r955, %r1795, %r953;
and.b32 %r956, %r955, 1;
bfi.b32 %r158, %r1817, %r956, 1, 31;
setp.ne.s16 %p116, %rs713, 0;
@%p116 bra $L__BB28_84;
setp.eq.s32 %p117, %r1791, 0;
mov.u16 %rs1016, 255;
@%p117 bra $L__BB28_83;
cvt.u64.u32 %rd182, %r1790;
add.s64 %rd183, %rd182, %rd3;
add.s64 %rd184, %rd1, %rd183;
ld.global.u8 %rs1016, [%rd184];
$L__BB28_83:
setp.ne.s32 %p119, %r1791, 0;
selp.u32 %r957, 1, 0, %p119;
add.s32 %r1790, %r1790, %r957;
add.s32 %r958, %r1791, -1;
selp.b32 %r1791, 0, %r958, %p117;
setp.eq.s32 %p120, %r1791, 0;
or.b16 %rs715, %rs1016, 15;
selp.b16 %rs1014, %rs715, %rs1016, %p120;
and.b16 %rs716, %rs1014, 255;
mov.u16 %rs717, 8;
sub.s16 %rs1019, %rs717, %rs1013;
setp.eq.s16 %p121, %rs716, 255;
selp.u16 %rs1013, 1, 0, %p121;
cvt.u32.u16 %r959, %rs1014;
and.b32 %r1795, %r959, 255;
$L__BB28_84:
add.s16 %rs1023, %rs1019, -1;
and.b16 %rs718, %rs1023, 255;
cvt.u32.u16 %r960, %rs1023;
and.b32 %r961, %r960, 255;
shr.u32 %r962, %r1795, %r961;
and.b32 %r963, %r962, 1;
bfi.b32 %r165, %r158, %r963, 1, 31;
setp.ne.s16 %p122, %rs718, 0;
@%p122 bra $L__BB28_88;
setp.eq.s32 %p123, %r1791, 0;
mov.u16 %rs1020, 255;
@%p123 bra $L__BB28_87;
cvt.u64.u32 %rd185, %r1790;
add.s64 %rd186, %rd185, %rd3;
add.s64 %rd187, %rd1, %rd186;
ld.global.u8 %rs1020, [%rd187];
$L__BB28_87:
setp.ne.s32 %p125, %r1791, 0;
selp.u32 %r964, 1, 0, %p125;
add.s32 %r1790, %r1790, %r964;
add.s32 %r965, %r1791, -1;
selp.b32 %r1791, 0, %r965, %p123;
setp.eq.s32 %p126, %r1791, 0;
or.b16 %rs720, %rs1020, 15;
selp.b16 %rs1014, %rs720, %rs1020, %p126;
and.b16 %rs721, %rs1014, 255;
mov.u16 %rs722, 8;
sub.s16 %rs1023, %rs722, %rs1013;
setp.eq.s16 %p127, %rs721, 255;
selp.u16 %rs1013, 1, 0, %p127;
cvt.u32.u16 %r966, %rs1014;
and.b32 %r1795, %r966, 255;
$L__BB28_88:
add.s16 %rs1027, %rs1023, -1;
and.b16 %rs723, %rs1027, 255;
cvt.u32.u16 %r967, %rs1027;
and.b32 %r968, %r967, 255;
shr.u32 %r969, %r1795, %r968;
and.b32 %r970, %r969, 1;
bfi.b32 %r172, %r165, %r970, 1, 31;
setp.ne.s16 %p128, %rs723, 0;
@%p128 bra $L__BB28_92;
setp.eq.s32 %p129, %r1791, 0;
mov.u16 %rs1024, 255;
@%p129 bra $L__BB28_91;
cvt.u64.u32 %rd188, %r1790;
add.s64 %rd189, %rd188, %rd3;
add.s64 %rd190, %rd1, %rd189;
ld.global.u8 %rs1024, [%rd190];
$L__BB28_91:
setp.ne.s32 %p131, %r1791, 0;
selp.u32 %r971, 1, 0, %p131;
add.s32 %r1790, %r1790, %r971;
add.s32 %r972, %r1791, -1;
selp.b32 %r1791, 0, %r972, %p129;
setp.eq.s32 %p132, %r1791, 0;
or.b16 %rs725, %rs1024, 15;
selp.b16 %rs1014, %rs725, %rs1024, %p132;
and.b16 %rs726, %rs1014, 255;
mov.u16 %rs727, 8;
sub.s16 %rs1027, %rs727, %rs1013;
setp.eq.s16 %p133, %rs726, 255;
selp.u16 %rs1013, 1, 0, %p133;
cvt.u32.u16 %r973, %rs1014;
and.b32 %r1795, %r973, 255;
$L__BB28_92:
add.s16 %rs1048, %rs1027, -1;
cvt.u32.u16 %r974, %rs1048;
and.b32 %r975, %r974, 255;
shr.u32 %r976, %r1795, %r975;
and.b32 %r977, %r976, 1;
bfi.b32 %r1817, %r172, %r977, 1, 31;
add.s32 %r1789, %r1789, -4;
setp.ne.s32 %p134, %r1789, 0;
@%p134 bra $L__BB28_76;
$L__BB28_93:
setp.eq.s32 %p135, %r147, 0;
@%p135 bra $L__BB28_109;
and.b16 %rs728, %rs1048, 255;
setp.ne.s16 %p136, %rs728, 0;
@%p136 bra $L__BB28_98;
setp.eq.s32 %p137, %r1791, 0;
mov.u16 %rs1034, 255;
@%p137 bra $L__BB28_97;
cvt.u64.u32 %rd191, %r1790;
add.s64 %rd192, %rd191, %rd3;
add.s64 %rd193, %rd1, %rd192;
ld.global.u8 %rs1034, [%rd193];
$L__BB28_97:
setp.ne.s32 %p139, %r1791, 0;
selp.u32 %r978, 1, 0, %p139;
add.s32 %r1790, %r1790, %r978;
add.s32 %r979, %r1791, -1;
selp.b32 %r1791, 0, %r979, %p137;
setp.eq.s32 %p140, %r1791, 0;
or.b16 %rs730, %rs1034, 15;
selp.b16 %rs1014, %rs730, %rs1034, %p140;
and.b16 %rs731, %rs1014, 255;
mov.u16 %rs732, 8;
sub.s16 %rs1048, %rs732, %rs1013;
setp.eq.s16 %p141, %rs731, 255;
selp.u16 %rs1013, 1, 0, %p141;
$L__BB28_98:
add.s16 %rs1048, %rs1048, -1;
cvt.u32.u16 %r980, %rs1048;
and.b32 %r981, %r980, 255;
cvt.u32.u16 %r982, %rs1014;
and.b32 %r1812, %r982, 255;
shr.u32 %r983, %r1812, %r981;
and.b32 %r984, %r983, 1;
bfi.b32 %r1817, %r1817, %r984, 1, 31;
setp.eq.s32 %p142, %r147, 1;
@%p142 bra $L__BB28_109;
and.b16 %rs733, %rs1048, 255;
setp.ne.s16 %p143, %rs733, 0;
@%p143 bra $L__BB28_103;
setp.eq.s32 %p144, %r1791, 0;
mov.u16 %rs1038, 255;
@%p144 bra $L__BB28_102;
cvt.u64.u32 %rd194, %r1790;
add.s64 %rd195, %rd194, %rd3;
add.s64 %rd196, %rd1, %rd195;
ld.global.u8 %rs1038, [%rd196];
$L__BB28_102:
setp.ne.s32 %p146, %r1791, 0;
selp.u32 %r985, 1, 0, %p146;
add.s32 %r1790, %r1790, %r985;
add.s32 %r986, %r1791, -1;
selp.b32 %r1791, 0, %r986, %p144;
setp.eq.s32 %p147, %r1791, 0;
or.b16 %rs735, %rs1038, 15;
selp.b16 %rs1014, %rs735, %rs1038, %p147;
and.b16 %rs736, %rs1014, 255;
mov.u16 %rs737, 8;
sub.s16 %rs1048, %rs737, %rs1013;
setp.eq.s16 %p148, %rs736, 255;
selp.u16 %rs1013, 1, 0, %p148;
cvt.u32.u16 %r987, %rs1014;
and.b32 %r1812, %r987, 255;
$L__BB28_103:
add.s16 %rs1048, %rs1048, -1;
cvt.u32.u16 %r988, %rs1048;
and.b32 %r989, %r988, 255;
shr.u32 %r990, %r1812, %r989;
and.b32 %r991, %r990, 1;
bfi.b32 %r1817, %r1817, %r991, 1, 31;
setp.eq.s32 %p149, %r147, 2;
@%p149 bra $L__BB28_109;
and.b16 %rs738, %rs1048, 255;
setp.ne.s16 %p150, %rs738, 0;
@%p150 bra $L__BB28_108;
setp.eq.s32 %p151, %r1791, 0;
mov.u16 %rs1042, 255;
@%p151 bra $L__BB28_107;
cvt.u64.u32 %rd197, %r1790;
add.s64 %rd198, %rd197, %rd3;
add.s64 %rd199, %rd1, %rd198;
ld.global.u8 %rs1042, [%rd199];
$L__BB28_107:
setp.ne.s32 %p153, %r1791, 0;
selp.u32 %r992, 1, 0, %p153;
add.s32 %r1790, %r1790, %r992;
add.s32 %r993, %r1791, -1;
selp.b32 %r1791, 0, %r993, %p151;
setp.eq.s32 %p154, %r1791, 0;
or.b16 %rs740, %rs1042, 15;
selp.b16 %rs1014, %rs740, %rs1042, %p154;
and.b16 %rs741, %rs1014, 255;
mov.u16 %rs742, 8;
sub.s16 %rs1048, %rs742, %rs1013;
setp.eq.s16 %p155, %rs741, 255;
selp.u16 %rs1013, 1, 0, %p155;
cvt.u32.u16 %r994, %rs1014;
and.b32 %r1812, %r994, 255;
$L__BB28_108:
add.s16 %rs1048, %rs1048, -1;
cvt.u32.u16 %r995, %rs1048;
and.b32 %r996, %r995, 255;
shr.u32 %r997, %r1812, %r996;
and.b32 %r998, %r997, 1;
bfi.b32 %r1817, %r1817, %r998, 1, 31;
$L__BB28_109:
shl.b32 %r999, %r1817, 1;
or.b32 %r1821, %r999, 1;
add.s32 %r1000, %r1881, -1;
setp.eq.s32 %p156, %r1881, 0;
selp.b32 %r1820, 0, %r1000, %p156;
$L__BB28_110:
mul.lo.s32 %r1001, %r1882, 7;
cvt.u64.u32 %rd200, %r1821;
shl.b64 %rd201, %rd200, %r1001;
or.b64 %rd465, %rd201, %rd465;
setp.ne.s32 %p157, %r1881, 12;
setp.ne.s32 %p158, %r143, 0;
or.pred %p159, %p157, %p158;
add.s32 %r1882, %r1882, 1;
setp.lt.u32 %p160, %r1882, 8;
or.pred %p161, %p160, %p159;
mov.u32 %r1881, %r1820;
@%p161 bra $L__BB28_66;
$L__BB28_111:
cvt.u32.u64 %r1002, %rd465;
and.b32 %r1878, %r1002, 127;
shr.u64 %rd465, %rd465, 7;
add.s32 %r1882, %r1882, -1;
$L__BB28_112:
mul.wide.u32 %rd202, %r1765, 2;
add.s64 %rd25, %rd13, %rd202;
st.local.u16 [%rd25], %r1831;
shl.b32 %r1003, %r1831, 3;
and.b32 %r1004, %r1003, 128;
shl.b32 %r1005, %r1831, 2;
and.b32 %r1006, %r1005, 896;
or.b32 %r1007, %r1004, %r1006;
and.b32 %r1008, %r1831, 7;
shr.u64 %rd26, %rd474, %r1008;
sub.s32 %r229, %r1961, %r1008;
cvt.u32.u64 %r1009, %rd26;
and.b32 %r1010, %r1009, 127;
or.b32 %r1011, %r1010, %r1007;
mul.wide.u32 %rd203, %r1011, 2;
add.s64 %rd204, %rd12, %rd203;
ld.global.u16 %r1883, [%rd204];
setp.ne.s32 %p162, %r1007, 0;
add.s32 %r231, %r1765, 2;
setp.ge.u32 %p163, %r231, %r824;
or.pred %p164, %p163, %p162;
@%p164 bra $L__BB28_162;
add.s32 %r232, %r1878, -2;
setp.eq.s32 %p165, %r232, -1;
selp.b32 %r1883, %r1883, 0, %p165;
setp.gt.s32 %p166, %r1878, 1;
mov.u32 %r1878, %r232;
@%p166 bra $L__BB28_162;
setp.ne.s32 %p167, %r1882, 0;
@%p167 bra $L__BB28_161;
mov.u32 %r1882, 0;
$L__BB28_116:
setp.gt.u32 %p168, %r1882, 7;
@%p168 bra $L__BB28_161;
cvt.u64.u32 %rd28, %r1881;
mul.wide.u32 %rd205, %r1881, 4;
add.s64 %rd207, %rd133, %rd205;
ld.global.nc.u32 %r238, [%rd207];
and.b16 %rs743, %rs1048, 255;
setp.ne.s16 %p169, %rs743, 0;
@%p169 bra $L__BB28_121;
setp.eq.s32 %p170, %r1791, 0;
mov.u16 %rs1061, 255;
@%p170 bra $L__BB28_120;
cvt.u64.u32 %rd208, %r1790;
add.s64 %rd209, %rd208, %rd3;
add.s64 %rd210, %rd1, %rd209;
ld.global.u8 %rs1061, [%rd210];
$L__BB28_120:
setp.ne.s32 %p172, %r1791, 0;
selp.u32 %r1013, 1, 0, %p172;
add.s32 %r1790, %r1790, %r1013;
add.s32 %r1014, %r1791, -1;
selp.b32 %r1791, 0, %r1014, %p170;
setp.eq.s32 %p173, %r1791, 0;
or.b16 %rs745, %rs1061, 15;
selp.b16 %rs1014, %rs745, %rs1061, %p173;
and.b16 %rs746, %rs1014, 255;
mov.u16 %rs747, 8;
sub.s16 %rs1048, %rs747, %rs1013;
setp.eq.s16 %p174, %rs746, 255;
selp.u16 %rs1013, 1, 0, %p174;
$L__BB28_121:
add.s16 %rs1048, %rs1048, -1;
cvt.u32.u16 %r1015, %rs1048;
and.b32 %r1016, %r1015, 255;
mov.u32 %r1017, 1;
shl.b32 %r1018, %r1017, %r1016;
cvt.u32.u16 %r1019, %rs1014;
and.b32 %r1020, %r1018, %r1019;
and.b32 %r243, %r1020, 255;
setp.eq.s32 %p175, %r243, 0;
@%p175 bra $L__BB28_123;
add.s32 %r1021, %r1881, 1;
min.u32 %r1872, %r1021, 12;
mov.u32 %r1022, -1;
shl.b32 %r1023, %r1022, %r238;
shl.b32 %r1024, %r1023, 1;
xor.b32 %r1873, %r1024, -2;
bra.uni $L__BB28_160;
$L__BB28_123:
add.s64 %rd211, %rd28, -3;
setp.gt.u64 %p176, %rd211, 9;
mov.u32 %r1869, 0;
@%p176 bra $L__BB28_159;
max.u32 %r246, %r238, 1;
add.s32 %r1028, %r246, -1;
and.b32 %r247, %r246, 3;
setp.lt.u32 %p177, %r1028, 3;
mov.u32 %r1869, 0;
@%p177 bra $L__BB28_143;
sub.s32 %r1841, %r246, %r247;
mov.u32 %r1869, 0;
$L__BB28_126:
and.b16 %rs749, %rs1048, 255;
setp.ne.s16 %p178, %rs749, 0;
@%p178 bra $L__BB28_130;
setp.eq.s32 %p179, %r1791, 0;
mov.u16 %rs1068, 255;
@%p179 bra $L__BB28_129;
cvt.u64.u32 %rd212, %r1790;
add.s64 %rd213, %rd212, %rd3;
add.s64 %rd214, %rd1, %rd213;
ld.global.u8 %rs1068, [%rd214];
$L__BB28_129:
setp.ne.s32 %p181, %r1791, 0;
selp.u32 %r1030, 1, 0, %p181;
add.s32 %r1790, %r1790, %r1030;
add.s32 %r1031, %r1791, -1;
selp.b32 %r1791, 0, %r1031, %p179;
setp.eq.s32 %p182, %r1791, 0;
or.b16 %rs751, %rs1068, 15;
selp.b16 %rs1014, %rs751, %rs1068, %p182;
and.b16 %rs752, %rs1014, 255;
mov.u16 %rs753, 8;
sub.s16 %rs1048, %rs753, %rs1013;
setp.eq.s16 %p183, %rs752, 255;
selp.u16 %rs1013, 1, 0, %p183;
$L__BB28_130:
add.s16 %rs1075, %rs1048, -1;
and.b16 %rs754, %rs1075, 255;
cvt.u32.u16 %r1032, %rs1075;
and.b32 %r1033, %r1032, 255;
cvt.u32.u16 %r1034, %rs1014;
and.b32 %r1847, %r1034, 255;
shr.u32 %r1035, %r1847, %r1033;
and.b32 %r1036, %r1035, 1;
bfi.b32 %r258, %r1869, %r1036, 1, 31;
setp.ne.s16 %p184, %rs754, 0;
@%p184 bra $L__BB28_134;
setp.eq.s32 %p185, %r1791, 0;
mov.u16 %rs1072, 255;
@%p185 bra $L__BB28_133;
cvt.u64.u32 %rd215, %r1790;
add.s64 %rd216, %rd215, %rd3;
add.s64 %rd217, %rd1, %rd216;
ld.global.u8 %rs1072, [%rd217];
$L__BB28_133:
setp.ne.s32 %p187, %r1791, 0;
selp.u32 %r1037, 1, 0, %p187;
add.s32 %r1790, %r1790, %r1037;
add.s32 %r1038, %r1791, -1;
selp.b32 %r1791, 0, %r1038, %p185;
setp.eq.s32 %p188, %r1791, 0;
or.b16 %rs756, %rs1072, 15;
selp.b16 %rs1014, %rs756, %rs1072, %p188;
and.b16 %rs757, %rs1014, 255;
mov.u16 %rs758, 8;
sub.s16 %rs1075, %rs758, %rs1013;
setp.eq.s16 %p189, %rs757, 255;
selp.u16 %rs1013, 1, 0, %p189;
cvt.u32.u16 %r1039, %rs1014;
and.b32 %r1847, %r1039, 255;
$L__BB28_134:
add.s16 %rs1079, %rs1075, -1;
and.b16 %rs759, %rs1079, 255;
cvt.u32.u16 %r1040, %rs1079;
and.b32 %r1041, %r1040, 255;
shr.u32 %r1042, %r1847, %r1041;
and.b32 %r1043, %r1042, 1;
bfi.b32 %r265, %r258, %r1043, 1, 31;
setp.ne.s16 %p190, %rs759, 0;
@%p190 bra $L__BB28_138;
setp.eq.s32 %p191, %r1791, 0;
mov.u16 %rs1076, 255;
@%p191 bra $L__BB28_137;
cvt.u64.u32 %rd218, %r1790;
add.s64 %rd219, %rd218, %rd3;
add.s64 %rd220, %rd1, %rd219;
ld.global.u8 %rs1076, [%rd220];
$L__BB28_137:
setp.ne.s32 %p193, %r1791, 0;
selp.u32 %r1044, 1, 0, %p193;
add.s32 %r1790, %r1790, %r1044;
add.s32 %r1045, %r1791, -1;
selp.b32 %r1791, 0, %r1045, %p191;
setp.eq.s32 %p194, %r1791, 0;
or.b16 %rs761, %rs1076, 15;
selp.b16 %rs1014, %rs761, %rs1076, %p194;
and.b16 %rs762, %rs1014, 255;
mov.u16 %rs763, 8;
sub.s16 %rs1079, %rs763, %rs1013;
setp.eq.s16 %p195, %rs762, 255;
selp.u16 %rs1013, 1, 0, %p195;
cvt.u32.u16 %r1046, %rs1014;
and.b32 %r1847, %r1046, 255;
$L__BB28_138:
add.s16 %rs1083, %rs1079, -1;
and.b16 %rs764, %rs1083, 255;
cvt.u32.u16 %r1047, %rs1083;
and.b32 %r1048, %r1047, 255;
shr.u32 %r1049, %r1847, %r1048;
and.b32 %r1050, %r1049, 1;
bfi.b32 %r272, %r265, %r1050, 1, 31;
setp.ne.s16 %p196, %rs764, 0;
@%p196 bra $L__BB28_142;
setp.eq.s32 %p197, %r1791, 0;
mov.u16 %rs1080, 255;
@%p197 bra $L__BB28_141;
cvt.u64.u32 %rd221, %r1790;
add.s64 %rd222, %rd221, %rd3;
add.s64 %rd223, %rd1, %rd222;
ld.global.u8 %rs1080, [%rd223];
$L__BB28_141:
setp.ne.s32 %p199, %r1791, 0;
selp.u32 %r1051, 1, 0, %p199;
add.s32 %r1790, %r1790, %r1051;
add.s32 %r1052, %r1791, -1;
selp.b32 %r1791, 0, %r1052, %p197;
setp.eq.s32 %p200, %r1791, 0;
or.b16 %rs766, %rs1080, 15;
selp.b16 %rs1014, %rs766, %rs1080, %p200;
and.b16 %rs767, %rs1014, 255;
mov.u16 %rs768, 8;
sub.s16 %rs1083, %rs768, %rs1013;
setp.eq.s16 %p201, %rs767, 255;
selp.u16 %rs1013, 1, 0, %p201;
cvt.u32.u16 %r1053, %rs1014;
and.b32 %r1847, %r1053, 255;
$L__BB28_142:
add.s16 %rs1048, %rs1083, -1;
cvt.u32.u16 %r1054, %rs1048;
and.b32 %r1055, %r1054, 255;
shr.u32 %r1056, %r1847, %r1055;
and.b32 %r1057, %r1056, 1;
bfi.b32 %r1869, %r272, %r1057, 1, 31;
add.s32 %r1841, %r1841, -4;
setp.ne.s32 %p202, %r1841, 0;
@%p202 bra $L__BB28_126;
$L__BB28_143:
setp.eq.s32 %p203, %r247, 0;
@%p203 bra $L__BB28_159;
and.b16 %rs769, %rs1048, 255;
setp.ne.s16 %p204, %rs769, 0;
@%p204 bra $L__BB28_148;
setp.eq.s32 %p205, %r1791, 0;
mov.u16 %rs1090, 255;
@%p205 bra $L__BB28_147;
cvt.u64.u32 %rd224, %r1790;
add.s64 %rd225, %rd224, %rd3;
add.s64 %rd226, %rd1, %rd225;
ld.global.u8 %rs1090, [%rd226];
$L__BB28_147:
setp.ne.s32 %p207, %r1791, 0;
selp.u32 %r1058, 1, 0, %p207;
add.s32 %r1790, %r1790, %r1058;
add.s32 %r1059, %r1791, -1;
selp.b32 %r1791, 0, %r1059, %p205;
setp.eq.s32 %p208, %r1791, 0;
or.b16 %rs771, %rs1090, 15;
selp.b16 %rs1014, %rs771, %rs1090, %p208;
and.b16 %rs772, %rs1014, 255;
mov.u16 %rs773, 8;
sub.s16 %rs1048, %rs773, %rs1013;
setp.eq.s16 %p209, %rs772, 255;
selp.u16 %rs1013, 1, 0, %p209;
$L__BB28_148:
add.s16 %rs1048, %rs1048, -1;
cvt.u32.u16 %r1060, %rs1048;
and.b32 %r1061, %r1060, 255;
cvt.u32.u16 %r1062, %rs1014;
and.b32 %r1864, %r1062, 255;
shr.u32 %r1063, %r1864, %r1061;
and.b32 %r1064, %r1063, 1;
bfi.b32 %r1869, %r1869, %r1064, 1, 31;
setp.eq.s32 %p210, %r247, 1;
@%p210 bra $L__BB28_159;
and.b16 %rs774, %rs1048, 255;
setp.ne.s16 %p211, %rs774, 0;
@%p211 bra $L__BB28_153;
setp.eq.s32 %p212, %r1791, 0;
mov.u16 %rs1094, 255;
@%p212 bra $L__BB28_152;
cvt.u64.u32 %rd227, %r1790;
add.s64 %rd228, %rd227, %rd3;
add.s64 %rd229, %rd1, %rd228;
ld.global.u8 %rs1094, [%rd229];
$L__BB28_152:
setp.ne.s32 %p214, %r1791, 0;
selp.u32 %r1065, 1, 0, %p214;
add.s32 %r1790, %r1790, %r1065;
add.s32 %r1066, %r1791, -1;
selp.b32 %r1791, 0, %r1066, %p212;
setp.eq.s32 %p215, %r1791, 0;
or.b16 %rs776, %rs1094, 15;
selp.b16 %rs1014, %rs776, %rs1094, %p215;
and.b16 %rs777, %rs1014, 255;
mov.u16 %rs778, 8;
sub.s16 %rs1048, %rs778, %rs1013;
setp.eq.s16 %p216, %rs777, 255;
selp.u16 %rs1013, 1, 0, %p216;
cvt.u32.u16 %r1067, %rs1014;
and.b32 %r1864, %r1067, 255;
$L__BB28_153:
add.s16 %rs1048, %rs1048, -1;
cvt.u32.u16 %r1068, %rs1048;
and.b32 %r1069, %r1068, 255;
shr.u32 %r1070, %r1864, %r1069;
and.b32 %r1071, %r1070, 1;
bfi.b32 %r1869, %r1869, %r1071, 1, 31;
setp.eq.s32 %p217, %r247, 2;
@%p217 bra $L__BB28_159;
and.b16 %rs779, %rs1048, 255;
setp.ne.s16 %p218, %rs779, 0;
@%p218 bra $L__BB28_158;
setp.eq.s32 %p219, %r1791, 0;
mov.u16 %rs1098, 255;
@%p219 bra $L__BB28_157;
cvt.u64.u32 %rd230, %r1790;
add.s64 %rd231, %rd230, %rd3;
add.s64 %rd232, %rd1, %rd231;
ld.global.u8 %rs1098, [%rd232];
$L__BB28_157:
setp.ne.s32 %p221, %r1791, 0;
selp.u32 %r1072, 1, 0, %p221;
add.s32 %r1790, %r1790, %r1072;
add.s32 %r1073, %r1791, -1;
selp.b32 %r1791, 0, %r1073, %p219;
setp.eq.s32 %p222, %r1791, 0;
or.b16 %rs781, %rs1098, 15;
selp.b16 %rs1014, %rs781, %rs1098, %p222;
and.b16 %rs782, %rs1014, 255;
mov.u16 %rs783, 8;
sub.s16 %rs1048, %rs783, %rs1013;
setp.eq.s16 %p223, %rs782, 255;
selp.u16 %rs1013, 1, 0, %p223;
cvt.u32.u16 %r1074, %rs1014;
and.b32 %r1864, %r1074, 255;
$L__BB28_158:
add.s16 %rs1048, %rs1048, -1;
cvt.u32.u16 %r1075, %rs1048;
and.b32 %r1076, %r1075, 255;
shr.u32 %r1077, %r1864, %r1076;
and.b32 %r1078, %r1077, 1;
bfi.b32 %r1869, %r1869, %r1078, 1, 31;
$L__BB28_159:
shl.b32 %r1079, %r1869, 1;
or.b32 %r1873, %r1079, 1;
add.s32 %r1080, %r1881, -1;
setp.eq.s32 %p224, %r1881, 0;
selp.b32 %r1872, 0, %r1080, %p224;
$L__BB28_160:
mul.lo.s32 %r1081, %r1882, 7;
cvt.u64.u32 %rd233, %r1873;
shl.b64 %rd234, %rd233, %r1081;
or.b64 %rd465, %rd234, %rd465;
setp.ne.s32 %p225, %r1881, 12;
setp.ne.s32 %p226, %r243, 0;
or.pred %p227, %p225, %p226;
add.s32 %r1882, %r1882, 1;
setp.lt.u32 %p228, %r1882, 8;
or.pred %p229, %p228, %p227;
mov.u32 %r1881, %r1872;
@%p229 bra $L__BB28_116;
$L__BB28_161:
cvt.u32.u64 %r1082, %rd465;
and.b32 %r1878, %r1082, 127;
shr.u64 %rd465, %rd465, 7;
add.s32 %r1882, %r1882, -1;
$L__BB28_162:
setp.lt.u32 %p230, %r231, %r824;
selp.b32 %r329, %r1883, 0, %p230;
st.local.u16 [%rd25+4], %r329;
and.b32 %r1084, %r1003, 64;
shl.b32 %r1085, %r329, 4;
and.b32 %r1086, %r1085, 128;
or.b32 %r1935, %r1086, %r1084;
setp.ne.s32 %p231, %r1935, 192;
@%p231 bra $L__BB28_212;
add.s32 %r331, %r1878, -2;
setp.eq.s32 %p232, %r331, -1;
selp.b32 %r1935, 256, 192, %p232;
setp.gt.s32 %p233, %r1878, 1;
mov.u32 %r1878, %r331;
@%p233 bra $L__BB28_212;
setp.ne.s32 %p234, %r1882, 0;
@%p234 bra $L__BB28_211;
mov.u32 %r1882, 0;
$L__BB28_166:
setp.gt.u32 %p235, %r1882, 7;
@%p235 bra $L__BB28_211;
cvt.u64.u32 %rd34, %r1881;
mul.wide.u32 %rd235, %r1881, 4;
add.s64 %rd237, %rd133, %rd235;
ld.global.nc.u32 %r337, [%rd237];
and.b16 %rs784, %rs1048, 255;
setp.ne.s16 %p236, %rs784, 0;
@%p236 bra $L__BB28_171;
setp.eq.s32 %p237, %r1791, 0;
mov.u16 %rs1117, 255;
@%p237 bra $L__BB28_170;
cvt.u64.u32 %rd238, %r1790;
add.s64 %rd239, %rd238, %rd3;
add.s64 %rd240, %rd1, %rd239;
ld.global.u8 %rs1117, [%rd240];
$L__BB28_170:
setp.ne.s32 %p239, %r1791, 0;
selp.u32 %r1088, 1, 0, %p239;
add.s32 %r1790, %r1790, %r1088;
add.s32 %r1089, %r1791, -1;
selp.b32 %r1791, 0, %r1089, %p237;
setp.eq.s32 %p240, %r1791, 0;
or.b16 %rs786, %rs1117, 15;
selp.b16 %rs1014, %rs786, %rs1117, %p240;
and.b16 %rs787, %rs1014, 255;
mov.u16 %rs788, 8;
sub.s16 %rs1048, %rs788, %rs1013;
setp.eq.s16 %p241, %rs787, 255;
selp.u16 %rs1013, 1, 0, %p241;
$L__BB28_171:
add.s16 %rs1048, %rs1048, -1;
cvt.u32.u16 %r1090, %rs1048;
and.b32 %r1091, %r1090, 255;
mov.u32 %r1092, 1;
shl.b32 %r1093, %r1092, %r1091;
cvt.u32.u16 %r1094, %rs1014;
and.b32 %r1095, %r1093, %r1094;
and.b32 %r342, %r1095, 255;
setp.eq.s32 %p242, %r342, 0;
@%p242 bra $L__BB28_173;
add.s32 %r1096, %r1881, 1;
min.u32 %r1924, %r1096, 12;
mov.u32 %r1097, -1;
shl.b32 %r1098, %r1097, %r337;
shl.b32 %r1099, %r1098, 1;
xor.b32 %r1925, %r1099, -2;
bra.uni $L__BB28_210;
$L__BB28_173:
add.s64 %rd241, %rd34, -3;
setp.gt.u64 %p243, %rd241, 9;
mov.u32 %r1921, 0;
@%p243 bra $L__BB28_209;
max.u32 %r345, %r337, 1;
add.s32 %r1103, %r345, -1;
and.b32 %r346, %r345, 3;
setp.lt.u32 %p244, %r1103, 3;
mov.u32 %r1921, 0;
@%p244 bra $L__BB28_193;
sub.s32 %r1893, %r345, %r346;
mov.u32 %r1921, 0;
$L__BB28_176:
and.b16 %rs790, %rs1048, 255;
setp.ne.s16 %p245, %rs790, 0;
@%p245 bra $L__BB28_180;
setp.eq.s32 %p246, %r1791, 0;
mov.u16 %rs1124, 255;
@%p246 bra $L__BB28_179;
cvt.u64.u32 %rd242, %r1790;
add.s64 %rd243, %rd242, %rd3;
add.s64 %rd244, %rd1, %rd243;
ld.global.u8 %rs1124, [%rd244];
$L__BB28_179:
setp.ne.s32 %p248, %r1791, 0;
selp.u32 %r1105, 1, 0, %p248;
add.s32 %r1790, %r1790, %r1105;
add.s32 %r1106, %r1791, -1;
selp.b32 %r1791, 0, %r1106, %p246;
setp.eq.s32 %p249, %r1791, 0;
or.b16 %rs792, %rs1124, 15;
selp.b16 %rs1014, %rs792, %rs1124, %p249;
and.b16 %rs793, %rs1014, 255;
mov.u16 %rs794, 8;
sub.s16 %rs1048, %rs794, %rs1013;
setp.eq.s16 %p250, %rs793, 255;
selp.u16 %rs1013, 1, 0, %p250;
$L__BB28_180:
add.s16 %rs1131, %rs1048, -1;
and.b16 %rs795, %rs1131, 255;
cvt.u32.u16 %r1107, %rs1131;
and.b32 %r1108, %r1107, 255;
cvt.u32.u16 %r1109, %rs1014;
and.b32 %r1899, %r1109, 255;
shr.u32 %r1110, %r1899, %r1108;
and.b32 %r1111, %r1110, 1;
bfi.b32 %r357, %r1921, %r1111, 1, 31;
setp.ne.s16 %p251, %rs795, 0;
@%p251 bra $L__BB28_184;
setp.eq.s32 %p252, %r1791, 0;
mov.u16 %rs1128, 255;
@%p252 bra $L__BB28_183;
cvt.u64.u32 %rd245, %r1790;
add.s64 %rd246, %rd245, %rd3;
add.s64 %rd247, %rd1, %rd246;
ld.global.u8 %rs1128, [%rd247];
$L__BB28_183:
setp.ne.s32 %p254, %r1791, 0;
selp.u32 %r1112, 1, 0, %p254;
add.s32 %r1790, %r1790, %r1112;
add.s32 %r1113, %r1791, -1;
selp.b32 %r1791, 0, %r1113, %p252;
setp.eq.s32 %p255, %r1791, 0;
or.b16 %rs797, %rs1128, 15;
selp.b16 %rs1014, %rs797, %rs1128, %p255;
and.b16 %rs798, %rs1014, 255;
mov.u16 %rs799, 8;
sub.s16 %rs1131, %rs799, %rs1013;
setp.eq.s16 %p256, %rs798, 255;
selp.u16 %rs1013, 1, 0, %p256;
cvt.u32.u16 %r1114, %rs1014;
and.b32 %r1899, %r1114, 255;
$L__BB28_184:
add.s16 %rs1135, %rs1131, -1;
and.b16 %rs800, %rs1135, 255;
cvt.u32.u16 %r1115, %rs1135;
and.b32 %r1116, %r1115, 255;
shr.u32 %r1117, %r1899, %r1116;
and.b32 %r1118, %r1117, 1;
bfi.b32 %r364, %r357, %r1118, 1, 31;
setp.ne.s16 %p257, %rs800, 0;
@%p257 bra $L__BB28_188;
setp.eq.s32 %p258, %r1791, 0;
mov.u16 %rs1132, 255;
@%p258 bra $L__BB28_187;
cvt.u64.u32 %rd248, %r1790;
add.s64 %rd249, %rd248, %rd3;
add.s64 %rd250, %rd1, %rd249;
ld.global.u8 %rs1132, [%rd250];
$L__BB28_187:
setp.ne.s32 %p260, %r1791, 0;
selp.u32 %r1119, 1, 0, %p260;
add.s32 %r1790, %r1790, %r1119;
add.s32 %r1120, %r1791, -1;
selp.b32 %r1791, 0, %r1120, %p258;
setp.eq.s32 %p261, %r1791, 0;
or.b16 %rs802, %rs1132, 15;
selp.b16 %rs1014, %rs802, %rs1132, %p261;
and.b16 %rs803, %rs1014, 255;
mov.u16 %rs804, 8;
sub.s16 %rs1135, %rs804, %rs1013;
setp.eq.s16 %p262, %rs803, 255;
selp.u16 %rs1013, 1, 0, %p262;
cvt.u32.u16 %r1121, %rs1014;
and.b32 %r1899, %r1121, 255;
$L__BB28_188:
add.s16 %rs1139, %rs1135, -1;
and.b16 %rs805, %rs1139, 255;
cvt.u32.u16 %r1122, %rs1139;
and.b32 %r1123, %r1122, 255;
shr.u32 %r1124, %r1899, %r1123;
and.b32 %r1125, %r1124, 1;
bfi.b32 %r371, %r364, %r1125, 1, 31;
setp.ne.s16 %p263, %rs805, 0;
@%p263 bra $L__BB28_192;
setp.eq.s32 %p264, %r1791, 0;
mov.u16 %rs1136, 255;
@%p264 bra $L__BB28_191;
cvt.u64.u32 %rd251, %r1790;
add.s64 %rd252, %rd251, %rd3;
add.s64 %rd253, %rd1, %rd252;
ld.global.u8 %rs1136, [%rd253];
$L__BB28_191:
setp.ne.s32 %p266, %r1791, 0;
selp.u32 %r1126, 1, 0, %p266;
add.s32 %r1790, %r1790, %r1126;
add.s32 %r1127, %r1791, -1;
selp.b32 %r1791, 0, %r1127, %p264;
setp.eq.s32 %p267, %r1791, 0;
or.b16 %rs807, %rs1136, 15;
selp.b16 %rs1014, %rs807, %rs1136, %p267;
and.b16 %rs808, %rs1014, 255;
mov.u16 %rs809, 8;
sub.s16 %rs1139, %rs809, %rs1013;
setp.eq.s16 %p268, %rs808, 255;
selp.u16 %rs1013, 1, 0, %p268;
cvt.u32.u16 %r1128, %rs1014;
and.b32 %r1899, %r1128, 255;
$L__BB28_192:
add.s16 %rs1048, %rs1139, -1;
cvt.u32.u16 %r1129, %rs1048;
and.b32 %r1130, %r1129, 255;
shr.u32 %r1131, %r1899, %r1130;
and.b32 %r1132, %r1131, 1;
bfi.b32 %r1921, %r371, %r1132, 1, 31;
add.s32 %r1893, %r1893, -4;
setp.ne.s32 %p269, %r1893, 0;
@%p269 bra $L__BB28_176;
$L__BB28_193:
setp.eq.s32 %p270, %r346, 0;
@%p270 bra $L__BB28_209;
and.b16 %rs810, %rs1048, 255;
setp.ne.s16 %p271, %rs810, 0;
@%p271 bra $L__BB28_198;
setp.eq.s32 %p272, %r1791, 0;
mov.u16 %rs1146, 255;
@%p272 bra $L__BB28_197;
cvt.u64.u32 %rd254, %r1790;
add.s64 %rd255, %rd254, %rd3;
add.s64 %rd256, %rd1, %rd255;
ld.global.u8 %rs1146, [%rd256];
$L__BB28_197:
setp.ne.s32 %p274, %r1791, 0;
selp.u32 %r1133, 1, 0, %p274;
add.s32 %r1790, %r1790, %r1133;
add.s32 %r1134, %r1791, -1;
selp.b32 %r1791, 0, %r1134, %p272;
setp.eq.s32 %p275, %r1791, 0;
or.b16 %rs812, %rs1146, 15;
selp.b16 %rs1014, %rs812, %rs1146, %p275;
and.b16 %rs813, %rs1014, 255;
mov.u16 %rs814, 8;
sub.s16 %rs1048, %rs814, %rs1013;
setp.eq.s16 %p276, %rs813, 255;
selp.u16 %rs1013, 1, 0, %p276;
$L__BB28_198:
add.s16 %rs1048, %rs1048, -1;
cvt.u32.u16 %r1135, %rs1048;
and.b32 %r1136, %r1135, 255;
cvt.u32.u16 %r1137, %rs1014;
and.b32 %r1916, %r1137, 255;
shr.u32 %r1138, %r1916, %r1136;
and.b32 %r1139, %r1138, 1;
bfi.b32 %r1921, %r1921, %r1139, 1, 31;
setp.eq.s32 %p277, %r346, 1;
@%p277 bra $L__BB28_209;
and.b16 %rs815, %rs1048, 255;
setp.ne.s16 %p278, %rs815, 0;
@%p278 bra $L__BB28_203;
setp.eq.s32 %p279, %r1791, 0;
mov.u16 %rs1150, 255;
@%p279 bra $L__BB28_202;
cvt.u64.u32 %rd257, %r1790;
add.s64 %rd258, %rd257, %rd3;
add.s64 %rd259, %rd1, %rd258;
ld.global.u8 %rs1150, [%rd259];
$L__BB28_202:
setp.ne.s32 %p281, %r1791, 0;
selp.u32 %r1140, 1, 0, %p281;
add.s32 %r1790, %r1790, %r1140;
add.s32 %r1141, %r1791, -1;
selp.b32 %r1791, 0, %r1141, %p279;
setp.eq.s32 %p282, %r1791, 0;
or.b16 %rs817, %rs1150, 15;
selp.b16 %rs1014, %rs817, %rs1150, %p282;
and.b16 %rs818, %rs1014, 255;
mov.u16 %rs819, 8;
sub.s16 %rs1048, %rs819, %rs1013;
setp.eq.s16 %p283, %rs818, 255;
selp.u16 %rs1013, 1, 0, %p283;
cvt.u32.u16 %r1142, %rs1014;
and.b32 %r1916, %r1142, 255;
$L__BB28_203:
add.s16 %rs1048, %rs1048, -1;
cvt.u32.u16 %r1143, %rs1048;
and.b32 %r1144, %r1143, 255;
shr.u32 %r1145, %r1916, %r1144;
and.b32 %r1146, %r1145, 1;
bfi.b32 %r1921, %r1921, %r1146, 1, 31;
setp.eq.s32 %p284, %r346, 2;
@%p284 bra $L__BB28_209;
and.b16 %rs820, %rs1048, 255;
setp.ne.s16 %p285, %rs820, 0;
@%p285 bra $L__BB28_208;
setp.eq.s32 %p286, %r1791, 0;
mov.u16 %rs1154, 255;
@%p286 bra $L__BB28_207;
cvt.u64.u32 %rd260, %r1790;
add.s64 %rd261, %rd260, %rd3;
add.s64 %rd262, %rd1, %rd261;
ld.global.u8 %rs1154, [%rd262];
$L__BB28_207:
setp.ne.s32 %p288, %r1791, 0;
selp.u32 %r1147, 1, 0, %p288;
add.s32 %r1790, %r1790, %r1147;
add.s32 %r1148, %r1791, -1;
selp.b32 %r1791, 0, %r1148, %p286;
setp.eq.s32 %p289, %r1791, 0;
or.b16 %rs822, %rs1154, 15;
selp.b16 %rs1014, %rs822, %rs1154, %p289;
and.b16 %rs823, %rs1014, 255;
mov.u16 %rs824, 8;
sub.s16 %rs1048, %rs824, %rs1013;
setp.eq.s16 %p290, %rs823, 255;
selp.u16 %rs1013, 1, 0, %p290;
cvt.u32.u16 %r1149, %rs1014;
and.b32 %r1916, %r1149, 255;
$L__BB28_208:
add.s16 %rs1048, %rs1048, -1;
cvt.u32.u16 %r1150, %rs1048;
and.b32 %r1151, %r1150, 255;
shr.u32 %r1152, %r1916, %r1151;
and.b32 %r1153, %r1152, 1;
bfi.b32 %r1921, %r1921, %r1153, 1, 31;
$L__BB28_209:
shl.b32 %r1154, %r1921, 1;
or.b32 %r1925, %r1154, 1;
add.s32 %r1155, %r1881, -1;
setp.eq.s32 %p291, %r1881, 0;
selp.b32 %r1924, 0, %r1155, %p291;
$L__BB28_210:
mul.lo.s32 %r1156, %r1882, 7;
cvt.u64.u32 %rd263, %r1925;
shl.b64 %rd264, %rd263, %r1156;
or.b64 %rd465, %rd264, %rd465;
setp.ne.s32 %p292, %r1881, 12;
setp.ne.s32 %p293, %r342, 0;
or.pred %p294, %p292, %p293;
add.s32 %r1882, %r1882, 1;
setp.lt.u32 %p295, %r1882, 8;
or.pred %p296, %p295, %p294;
mov.u32 %r1881, %r1924;
@%p296 bra $L__BB28_166;
// Pop one 7-bit field from the packed queue in %rd465: the low 7 bits go to
// %r1878, the queue is shifted down by 7, and the entry count %r1882 drops by 1.
// (Entries are inserted elsewhere in this kernel at bit offset 7 * %r1882.)
$L__BB28_211:
cvt.u32.u64 %r1157, %rd465;
and.b32 %r1878, %r1157, 127;
shr.u64 %rd465, %rd465, 7;
add.s32 %r1882, %r1882, -1;
// Decode one u16 table entry against the bit accumulator and store two u16
// results to local memory at [%rd25+2] and [%rd25+6].
// NOTE(review): presumably the HTJ2K u-VLC step for a pair of quads (inferred
// from the kernel name only) - confirm against the CUDA source.
$L__BB28_212:
// Drop (%r329 & 7) bits from %rd26, then use the next 6 bits plus the base
// %r1935 as an index into the u16 table at %rd11.
and.b32 %r1158, %r329, 7;
shr.u64 %rd265, %rd26, %r1158;
cvt.u32.u64 %r1159, %rd265;
and.b32 %r1160, %r1159, 63;
add.s32 %r1161, %r1935, %r1160;
mul.wide.u32 %rd266, %r1161, 2;
add.s64 %rd267, %rd11, %rd266;
ld.global.u16 %r1162, [%rd267];
// Entry bits 0-2: number of accumulator bits consumed by this lookup.
and.b32 %r1163, %r1162, 7;
shr.u64 %rd268, %rd265, %r1163;
sub.s32 %r1164, %r229, %r1158;
sub.s32 %r1165, %r1164, %r1163;
cvt.u32.u64 %r1166, %rd268;
// Entry bits 3-6: width (0..15) of an extra field read from the accumulator;
// %r1172 = that field, masked with ~(-1 << width).
shr.u32 %r1167, %r1162, 3;
and.b32 %r1168, %r1167, 15;
mov.u32 %r1169, -1;
shl.b32 %r1170, %r1169, %r1168;
not.b32 %r1171, %r1170;
and.b32 %r1172, %r1166, %r1171;
// Remaining accumulator and remaining bit count after all three consumptions.
shr.u64 %rd474, %rd268, %r1168;
sub.s32 %r1961, %r1165, %r1168;
// Entry bits 7-9: split position inside the extra field.
// Entry bits 10-12: value added to the low part of the field.
shr.u32 %r1173, %r1162, 7;
and.b32 %r1174, %r1173, 7;
shr.u32 %r1175, %r1162, 10;
and.b32 %r1176, %r1175, 7;
mov.u32 %r1177, 255;
shl.b32 %r1178, %r1177, %r1174;
not.b32 %r1179, %r1178;
and.b32 %r1180, %r1172, %r1179;
// First result = low part + bits 10-12 + 1.
add.s32 %r1181, %r1176, %r1180;
add.s32 %r1182, %r1181, 1;
st.local.u16 [%rd25+2], %r1182;
// Second result = high part (field >> split) + entry bits 13-15 + 1.
shr.u32 %r1183, %r1162, 13;
shr.u32 %r1184, %r1172, %r1174;
add.s32 %r1185, %r1183, %r1184;
add.s32 %r1186, %r1185, 1;
st.local.u16 [%rd25+6], %r1186;
// Advance the element counter by 4 and derive the next value of %r1764 from
// %r329: ((%r329 << 3) & 128) | ((%r329 << 2) & 896).
add.s32 %r1765, %r1765, 4;
setp.lt.u32 %p297, %r1765, %r824;
shl.b32 %r1187, %r329, 2;
and.b32 %r1188, %r1187, 896;
shl.b32 %r1189, %r329, 3;
and.b32 %r1190, %r1189, 128;
or.b32 %r1764, %r1190, %r1188;
// Loop back while %r1765 < %r824 (unsigned).
@%p297 bra $L__BB28_58;
// Write two zero u16 values to the local array at %rd13, element index %r1765
// (directly after the elements produced by the preceding loop).
mul.wide.u32 %rd271, %r1765, 2;
add.s64 %rd272, %rd13, %rd271;
mov.u16 %rs825, 0;
st.local.v2.u16 [%rd272], {%rs825, %rs825};
// Skip the row loop below entirely when %r826 < 3 (unsigned).
setp.lt.u32 %p298, %r826, 3;
@%p298 bra $L__BB28_322;
// Convert kernel parameters 3 and 5 to global pointers: %rd41 (param 3) and
// %rd40 (param 5). Both are indexed as u16 tables inside the loop.
ld.param.u64 %rd451, [ j2k_htj2k_decode_codeblocks_multi_cleanup_dequantize_param_3];
ld.param.u64 %rd450, [ j2k_htj2k_decode_codeblocks_multi_cleanup_dequantize_param_5];
cvta.to.global.u64 %rd40, %rd450;
cvta.to.global.u64 %rd41, %rd451;
// Outer loop counter %r1936 starts at 2 and advances by 2 per iteration.
mov.u32 %r1936, 2;
// Outer loop head: %r441 = (%r1936 / 2) * %r849 is the first element index of
// the current row in the %rd13 array; %r442 is the first index of the row
// before it (one stride %r849 earlier).
$L__BB28_215:
shr.u32 %r1195, %r1936, 1;
mul.lo.s32 %r441, %r1195, %r849;
sub.s32 %r442, %r441, %r849;
// Per-row state: %r1945 = position within the row, %r1946 = bits carried over
// from the previous inner iteration, %r1947 = current write index.
mov.u32 %r1945, 0;
mov.u32 %r1946, %r1945;
mov.u32 %r1947, %r441;
// Inner loop head: build the lookup context %r455 from the previous row.
$L__BB28_216:
// %r454 = index in the previous row at the same offset as %r1947.
sub.s32 %r1196, %r1947, %r441;
add.s32 %r454, %r1196, %r442;
mul.wide.u32 %rd273, %r454, 2;
add.s64 %rd46, %rd13, %rd273;
// context = carried bits | ((prev[%r454] << 2) & 640) | ((prev[%r454+2] << 4) & 512)
ld.local.u16 %r1197, [%rd46];
shl.b32 %r1198, %r1197, 2;
and.b32 %r1199, %r1198, 640;
or.b32 %r1200, %r1946, %r1199;
add.s32 %r1201, %r454, 2;
mul.wide.u32 %rd274, %r1201, 2;
add.s64 %rd47, %rd13, %rd274;
ld.local.u16 %r1202, [%rd47];
shl.b32 %r1203, %r1202, 4;
and.b32 %r1204, %r1203, 512;
or.b32 %r455, %r1200, %r1204;
// Skip the refill when the accumulator already holds more than 31 bits.
setp.gt.u32 %p299, %r1961, 31;
@%p299 bra $L__BB28_220;
// Refill loop for the 64-bit accumulator %rd474. Each pass ORs one byte in at
// bit position %r1961 and repeats until %r1961 >= 33.
// State: %r1959 = byte offset (decremented, so the stream is read backwards),
//        %r1960 = bytes remaining, %rs1180 = "previous byte > 143" flag.
// NOTE(review): presumably the HTJ2K VLC reverse reader with bit-unstuffing
// (inferred from the kernel name) - confirm against the CUDA source.
$L__BB28_217:
// Use a zero byte when no bytes remain; otherwise load the byte at
// %rd1 + %rd3 + sign-extended %r1959.
setp.eq.s32 %p300, %r1960, 0;
mov.u16 %rs1179, 0;
@%p300 bra $L__BB28_219;
cvt.s64.s32 %rd275, %r1959;
add.s64 %rd276, %rd275, %rd3;
add.s64 %rd277, %rd1, %rd276;
ld.global.u8 %rs1179, [%rd277];
$L__BB28_219:
// Move the offset back by one only if a byte was consumed; the remaining count
// is decremented but never goes below 0.
setp.ne.s32 %p302, %r1960, 0;
selp.b32 %r1205, -1, 0, %p302;
add.s32 %r1959, %r1959, %r1205;
add.s32 %r1206, %r1960, -1;
selp.b32 %r1960, 0, %r1206, %p300;
// The bit count advances by 7 instead of 8 when the previous byte was > 143
// and the low 7 bits of this byte are all ones. All 8 bits of the byte are
// still ORed into the accumulator.
and.b16 %rs827, %rs1179, 255;
and.b16 %rs828, %rs1179, 127;
setp.eq.s16 %p303, %rs828, 127;
and.b16 %rs829, %rs1180, 255;
setp.ne.s16 %p304, %rs829, 0;
and.pred %p305, %p304, %p303;
selp.b32 %r1207, 7, 8, %p305;
cvt.u64.u16 %rd278, %rs1179;
and.b64 %rd279, %rd278, 255;
shl.b64 %rd280, %rd279, %r1961;
or.b64 %rd474, %rd280, %rd474;
add.s32 %r1961, %r1207, %r1961;
// Record whether this byte is > 143 for the next pass.
setp.gt.u16 %p306, %rs827, 143;
selp.u16 %rs1180, 1, 0, %p306;
setp.lt.u32 %p307, %r1961, 33;
@%p307 bra $L__BB28_217;
// First table lookup of the inner iteration: index = (low 7 bits of %rd474)
// + context %r455, read as u16 from the table at %rd41 into %r2013.
$L__BB28_220:
cvt.u32.u64 %r1208, %rd474;
and.b32 %r1209, %r1208, 127;
add.s32 %r1210, %r1209, %r455;
mul.wide.u32 %rd281, %r1210, 2;
add.s64 %rd282, %rd41, %rd281;
ld.global.u16 %r2013, [%rd282];
// Non-zero context: keep the entry as loaded.
setp.ne.s32 %p308, %r455, 0;
@%p308 bra $L__BB28_270;
// Zero context: %r1878 is reduced by 2. The entry is kept only if the new
// value is -1 (old value 1); otherwise it is replaced by 0.
// NOTE(review): %r1878 looks like a MEL run counter - confirm.
add.s32 %r466, %r1878, -2;
setp.eq.s32 %p309, %r466, -1;
selp.b32 %r2013, %r2013, 0, %p309;
// If the old counter was > 1, nothing more is needed for this lookup.
setp.gt.s32 %p310, %r1878, 1;
mov.u32 %r1878, %r466;
@%p310 bra $L__BB28_270;
// Counter used up: pop the next queued value if the queue count %r1882 is
// non-zero; otherwise fall through with %r1882 = 0 to the loop that refills
// the queue.
setp.ne.s32 %p311, %r1882, 0;
@%p311 bra $L__BB28_269;
mov.u32 %r1882, 0;
$L__BB28_224:
setp.gt.u32 %p312, %r1882, 7;
@%p312 bra $L__BB28_269;
cvt.u64.u32 %rd52, %r1881;
mul.wide.u32 %rd283, %r1881, 4;
add.s64 %rd285, %rd133, %rd283;
ld.global.nc.u32 %r472, [%rd285];
and.b16 %rs830, %rs1048, 255;
setp.ne.s16 %p313, %rs830, 0;
@%p313 bra $L__BB28_229;
setp.eq.s32 %p314, %r1791, 0;
mov.u16 %rs1184, 255;
@%p314 bra $L__BB28_228;
cvt.u64.u32 %rd286, %r1790;
add.s64 %rd287, %rd286, %rd3;
add.s64 %rd288, %rd1, %rd287;
ld.global.u8 %rs1184, [%rd288];
$L__BB28_228:
setp.ne.s32 %p316, %r1791, 0;
selp.u32 %r1212, 1, 0, %p316;
add.s32 %r1790, %r1790, %r1212;
add.s32 %r1213, %r1791, -1;
selp.b32 %r1791, 0, %r1213, %p314;
setp.eq.s32 %p317, %r1791, 0;
or.b16 %rs832, %rs1184, 15;
selp.b16 %rs1014, %rs832, %rs1184, %p317;
and.b16 %rs833, %rs1014, 255;
mov.u16 %rs834, 8;
sub.s16 %rs1048, %rs834, %rs1013;
setp.eq.s16 %p318, %rs833, 255;
selp.u16 %rs1013, 1, 0, %p318;
$L__BB28_229:
add.s16 %rs1048, %rs1048, -1;
cvt.u32.u16 %r1214, %rs1048;
and.b32 %r1215, %r1214, 255;
mov.u32 %r1216, 1;
shl.b32 %r1217, %r1216, %r1215;
cvt.u32.u16 %r1218, %rs1014;
and.b32 %r1219, %r1217, %r1218;
and.b32 %r477, %r1219, 255;
setp.eq.s32 %p319, %r477, 0;
@%p319 bra $L__BB28_231;
add.s32 %r1220, %r1881, 1;
min.u32 %r2002, %r1220, 12;
mov.u32 %r1221, -1;
shl.b32 %r1222, %r1221, %r472;
shl.b32 %r1223, %r1222, 1;
xor.b32 %r2003, %r1223, -2;
bra.uni $L__BB28_268;
$L__BB28_231:
add.s64 %rd289, %rd52, -3;
setp.gt.u64 %p320, %rd289, 9;
mov.u32 %r1999, 0;
@%p320 bra $L__BB28_267;
max.u32 %r480, %r472, 1;
add.s32 %r1227, %r480, -1;
and.b32 %r481, %r480, 3;
setp.lt.u32 %p321, %r1227, 3;
mov.u32 %r1999, 0;
@%p321 bra $L__BB28_251;
sub.s32 %r1971, %r480, %r481;
mov.u32 %r1999, 0;
$L__BB28_234:
and.b16 %rs836, %rs1048, 255;
setp.ne.s16 %p322, %rs836, 0;
@%p322 bra $L__BB28_238;
setp.eq.s32 %p323, %r1791, 0;
mov.u16 %rs1191, 255;
@%p323 bra $L__BB28_237;
cvt.u64.u32 %rd290, %r1790;
add.s64 %rd291, %rd290, %rd3;
add.s64 %rd292, %rd1, %rd291;
ld.global.u8 %rs1191, [%rd292];
$L__BB28_237:
setp.ne.s32 %p325, %r1791, 0;
selp.u32 %r1229, 1, 0, %p325;
add.s32 %r1790, %r1790, %r1229;
add.s32 %r1230, %r1791, -1;
selp.b32 %r1791, 0, %r1230, %p323;
setp.eq.s32 %p326, %r1791, 0;
or.b16 %rs838, %rs1191, 15;
selp.b16 %rs1014, %rs838, %rs1191, %p326;
and.b16 %rs839, %rs1014, 255;
mov.u16 %rs840, 8;
sub.s16 %rs1048, %rs840, %rs1013;
setp.eq.s16 %p327, %rs839, 255;
selp.u16 %rs1013, 1, 0, %p327;
$L__BB28_238:
add.s16 %rs1198, %rs1048, -1;
and.b16 %rs841, %rs1198, 255;
cvt.u32.u16 %r1231, %rs1198;
and.b32 %r1232, %r1231, 255;
cvt.u32.u16 %r1233, %rs1014;
and.b32 %r1977, %r1233, 255;
shr.u32 %r1234, %r1977, %r1232;
and.b32 %r1235, %r1234, 1;
bfi.b32 %r492, %r1999, %r1235, 1, 31;
setp.ne.s16 %p328, %rs841, 0;
@%p328 bra $L__BB28_242;
setp.eq.s32 %p329, %r1791, 0;
mov.u16 %rs1195, 255;
@%p329 bra $L__BB28_241;
cvt.u64.u32 %rd293, %r1790;
add.s64 %rd294, %rd293, %rd3;
add.s64 %rd295, %rd1, %rd294;
ld.global.u8 %rs1195, [%rd295];
$L__BB28_241:
setp.ne.s32 %p331, %r1791, 0;
selp.u32 %r1236, 1, 0, %p331;
add.s32 %r1790, %r1790, %r1236;
add.s32 %r1237, %r1791, -1;
selp.b32 %r1791, 0, %r1237, %p329;
setp.eq.s32 %p332, %r1791, 0;
or.b16 %rs843, %rs1195, 15;
selp.b16 %rs1014, %rs843, %rs1195, %p332;
and.b16 %rs844, %rs1014, 255;
mov.u16 %rs845, 8;
sub.s16 %rs1198, %rs845, %rs1013;
setp.eq.s16 %p333, %rs844, 255;
selp.u16 %rs1013, 1, 0, %p333;
cvt.u32.u16 %r1238, %rs1014;
and.b32 %r1977, %r1238, 255;
$L__BB28_242:
add.s16 %rs1202, %rs1198, -1;
and.b16 %rs846, %rs1202, 255;
cvt.u32.u16 %r1239, %rs1202;
and.b32 %r1240, %r1239, 255;
shr.u32 %r1241, %r1977, %r1240;
and.b32 %r1242, %r1241, 1;
bfi.b32 %r499, %r492, %r1242, 1, 31;
setp.ne.s16 %p334, %rs846, 0;
@%p334 bra $L__BB28_246;
setp.eq.s32 %p335, %r1791, 0;
mov.u16 %rs1199, 255;
@%p335 bra $L__BB28_245;
cvt.u64.u32 %rd296, %r1790;
add.s64 %rd297, %rd296, %rd3;
add.s64 %rd298, %rd1, %rd297;
ld.global.u8 %rs1199, [%rd298];
$L__BB28_245:
setp.ne.s32 %p337, %r1791, 0;
selp.u32 %r1243, 1, 0, %p337;
add.s32 %r1790, %r1790, %r1243;
add.s32 %r1244, %r1791, -1;
selp.b32 %r1791, 0, %r1244, %p335;
setp.eq.s32 %p338, %r1791, 0;
or.b16 %rs848, %rs1199, 15;
selp.b16 %rs1014, %rs848, %rs1199, %p338;
and.b16 %rs849, %rs1014, 255;
mov.u16 %rs850, 8;
sub.s16 %rs1202, %rs850, %rs1013;
setp.eq.s16 %p339, %rs849, 255;
selp.u16 %rs1013, 1, 0, %p339;
cvt.u32.u16 %r1245, %rs1014;
and.b32 %r1977, %r1245, 255;
$L__BB28_246:
add.s16 %rs1206, %rs1202, -1;
and.b16 %rs851, %rs1206, 255;
cvt.u32.u16 %r1246, %rs1206;
and.b32 %r1247, %r1246, 255;
shr.u32 %r1248, %r1977, %r1247;
and.b32 %r1249, %r1248, 1;
bfi.b32 %r506, %r499, %r1249, 1, 31;
setp.ne.s16 %p340, %rs851, 0;
@%p340 bra $L__BB28_250;
setp.eq.s32 %p341, %r1791, 0;
mov.u16 %rs1203, 255;
@%p341 bra $L__BB28_249;
cvt.u64.u32 %rd299, %r1790;
add.s64 %rd300, %rd299, %rd3;
add.s64 %rd301, %rd1, %rd300;
ld.global.u8 %rs1203, [%rd301];
$L__BB28_249:
setp.ne.s32 %p343, %r1791, 0;
selp.u32 %r1250, 1, 0, %p343;
add.s32 %r1790, %r1790, %r1250;
add.s32 %r1251, %r1791, -1;
selp.b32 %r1791, 0, %r1251, %p341;
setp.eq.s32 %p344, %r1791, 0;
or.b16 %rs853, %rs1203, 15;
selp.b16 %rs1014, %rs853, %rs1203, %p344;
and.b16 %rs854, %rs1014, 255;
mov.u16 %rs855, 8;
sub.s16 %rs1206, %rs855, %rs1013;
setp.eq.s16 %p345, %rs854, 255;
selp.u16 %rs1013, 1, 0, %p345;
cvt.u32.u16 %r1252, %rs1014;
and.b32 %r1977, %r1252, 255;
$L__BB28_250:
add.s16 %rs1048, %rs1206, -1;
cvt.u32.u16 %r1253, %rs1048;
and.b32 %r1254, %r1253, 255;
shr.u32 %r1255, %r1977, %r1254;
and.b32 %r1256, %r1255, 1;
bfi.b32 %r1999, %r506, %r1256, 1, 31;
add.s32 %r1971, %r1971, -4;
setp.ne.s32 %p346, %r1971, 0;
@%p346 bra $L__BB28_234;
$L__BB28_251:
setp.eq.s32 %p347, %r481, 0;
@%p347 bra $L__BB28_267;
and.b16 %rs856, %rs1048, 255;
setp.ne.s16 %p348, %rs856, 0;
@%p348 bra $L__BB28_256;
setp.eq.s32 %p349, %r1791, 0;
mov.u16 %rs1213, 255;
@%p349 bra $L__BB28_255;
cvt.u64.u32 %rd302, %r1790;
add.s64 %rd303, %rd302, %rd3;
add.s64 %rd304, %rd1, %rd303;
ld.global.u8 %rs1213, [%rd304];
$L__BB28_255:
setp.ne.s32 %p351, %r1791, 0;
selp.u32 %r1257, 1, 0, %p351;
add.s32 %r1790, %r1790, %r1257;
add.s32 %r1258, %r1791, -1;
selp.b32 %r1791, 0, %r1258, %p349;
setp.eq.s32 %p352, %r1791, 0;
or.b16 %rs858, %rs1213, 15;
selp.b16 %rs1014, %rs858, %rs1213, %p352;
and.b16 %rs859, %rs1014, 255;
mov.u16 %rs860, 8;
sub.s16 %rs1048, %rs860, %rs1013;
setp.eq.s16 %p353, %rs859, 255;
selp.u16 %rs1013, 1, 0, %p353;
$L__BB28_256:
add.s16 %rs1048, %rs1048, -1;
cvt.u32.u16 %r1259, %rs1048;
and.b32 %r1260, %r1259, 255;
cvt.u32.u16 %r1261, %rs1014;
and.b32 %r1994, %r1261, 255;
shr.u32 %r1262, %r1994, %r1260;
and.b32 %r1263, %r1262, 1;
bfi.b32 %r1999, %r1999, %r1263, 1, 31;
setp.eq.s32 %p354, %r481, 1;
@%p354 bra $L__BB28_267;
and.b16 %rs861, %rs1048, 255;
setp.ne.s16 %p355, %rs861, 0;
@%p355 bra $L__BB28_261;
setp.eq.s32 %p356, %r1791, 0;
mov.u16 %rs1217, 255;
@%p356 bra $L__BB28_260;
cvt.u64.u32 %rd305, %r1790;
add.s64 %rd306, %rd305, %rd3;
add.s64 %rd307, %rd1, %rd306;
ld.global.u8 %rs1217, [%rd307];
$L__BB28_260:
setp.ne.s32 %p358, %r1791, 0;
selp.u32 %r1264, 1, 0, %p358;
add.s32 %r1790, %r1790, %r1264;
add.s32 %r1265, %r1791, -1;
selp.b32 %r1791, 0, %r1265, %p356;
setp.eq.s32 %p359, %r1791, 0;
or.b16 %rs863, %rs1217, 15;
selp.b16 %rs1014, %rs863, %rs1217, %p359;
and.b16 %rs864, %rs1014, 255;
mov.u16 %rs865, 8;
sub.s16 %rs1048, %rs865, %rs1013;
setp.eq.s16 %p360, %rs864, 255;
selp.u16 %rs1013, 1, 0, %p360;
cvt.u32.u16 %r1266, %rs1014;
and.b32 %r1994, %r1266, 255;
$L__BB28_261:
add.s16 %rs1048, %rs1048, -1;
cvt.u32.u16 %r1267, %rs1048;
and.b32 %r1268, %r1267, 255;
shr.u32 %r1269, %r1994, %r1268;
and.b32 %r1270, %r1269, 1;
bfi.b32 %r1999, %r1999, %r1270, 1, 31;
setp.eq.s32 %p361, %r481, 2;
@%p361 bra $L__BB28_267;
and.b16 %rs866, %rs1048, 255;
setp.ne.s16 %p362, %rs866, 0;
@%p362 bra $L__BB28_266;
setp.eq.s32 %p363, %r1791, 0;
mov.u16 %rs1221, 255;
@%p363 bra $L__BB28_265;
cvt.u64.u32 %rd308, %r1790;
add.s64 %rd309, %rd308, %rd3;
add.s64 %rd310, %rd1, %rd309;
ld.global.u8 %rs1221, [%rd310];
$L__BB28_265:
setp.ne.s32 %p365, %r1791, 0;
selp.u32 %r1271, 1, 0, %p365;
add.s32 %r1790, %r1790, %r1271;
add.s32 %r1272, %r1791, -1;
selp.b32 %r1791, 0, %r1272, %p363;
setp.eq.s32 %p366, %r1791, 0;
or.b16 %rs868, %rs1221, 15;
selp.b16 %rs1014, %rs868, %rs1221, %p366;
and.b16 %rs869, %rs1014, 255;
mov.u16 %rs870, 8;
sub.s16 %rs1048, %rs870, %rs1013;
setp.eq.s16 %p367, %rs869, 255;
selp.u16 %rs1013, 1, 0, %p367;
cvt.u32.u16 %r1273, %rs1014;
and.b32 %r1994, %r1273, 255;
$L__BB28_266:
add.s16 %rs1048, %rs1048, -1;
cvt.u32.u16 %r1274, %rs1048;
and.b32 %r1275, %r1274, 255;
shr.u32 %r1276, %r1994, %r1275;
and.b32 %r1277, %r1276, 1;
bfi.b32 %r1999, %r1999, %r1277, 1, 31;
$L__BB28_267:
shl.b32 %r1278, %r1999, 1;
or.b32 %r2003, %r1278, 1;
add.s32 %r1279, %r1881, -1;
setp.eq.s32 %p368, %r1881, 0;
selp.b32 %r2002, 0, %r1279, %p368;
$L__BB28_268:
mul.lo.s32 %r1280, %r1882, 7;
cvt.u64.u32 %rd311, %r2003;
shl.b64 %rd312, %rd311, %r1280;
or.b64 %rd465, %rd312, %rd465;
setp.ne.s32 %p369, %r1881, 12;
setp.ne.s32 %p370, %r477, 0;
or.pred %p371, %p369, %p370;
add.s32 %r1882, %r1882, 1;
setp.lt.u32 %p372, %r1882, 8;
or.pred %p373, %p372, %p371;
mov.u32 %r1881, %r2002;
@%p373 bra $L__BB28_224;
// Pop one 7-bit field from the packed queue %rd465 into %r1878 and decrement
// the queue count %r1882.
$L__BB28_269:
cvt.u32.u64 %r1281, %rd465;
and.b32 %r1878, %r1281, 127;
shr.u64 %rd465, %rd465, 7;
add.s32 %r1882, %r1882, -1;
// Store the first table entry %r2013 at element %r1947 of the %rd13 array,
// then build the context for the second lookup of this iteration.
$L__BB28_270:
mul.wide.u32 %rd313, %r1947, 2;
add.s64 %rd314, %rd13, %rd313;
st.local.u16 [%rd314], %r2013;
// context = (((e << 2) | (e << 1)) & 256)   with e = %r2013
//         | (prev[%r454]     & 128)
//         | ((prev[%r454+2] << 2) & 640)
//         | ((prev[%r454+4] << 4) & 512)
// where prev[] are previous-row elements addressed via %rd46, %rd47, %rd316.
shl.b32 %r1282, %r2013, 2;
shl.b32 %r1283, %r2013, 1;
or.b32 %r1284, %r1282, %r1283;
and.b32 %r1285, %r1284, 256;
ld.local.u16 %r1286, [%rd46];
and.b32 %r1287, %r1286, 128;
or.b32 %r1288, %r1285, %r1287;
ld.local.u16 %r1289, [%rd47];
shl.b32 %r1290, %r1289, 2;
and.b32 %r1291, %r1290, 640;
or.b32 %r1292, %r1288, %r1291;
add.s32 %r1293, %r454, 4;
mul.wide.u32 %rd315, %r1293, 2;
add.s64 %rd316, %rd13, %rd315;
ld.local.u16 %r1294, [%rd316];
shl.b32 %r1295, %r1294, 4;
and.b32 %r1296, %r1295, 512;
or.b32 %r1297, %r1292, %r1296;
// Consume (entry & 7) bits from the accumulator; %rd57 / %r563 hold the
// accumulator and bit count after the first entry.
and.b32 %r1298, %r2013, 7;
shr.u64 %rd57, %rd474, %r1298;
sub.s32 %r563, %r1961, %r1298;
// Second lookup in the %rd41 table: index = context | (next 7 bits).
cvt.u32.u64 %r1299, %rd57;
and.b32 %r1300, %r1299, 127;
or.b32 %r1301, %r1297, %r1300;
mul.wide.u32 %rd317, %r1301, 2;
add.s64 %rd318, %rd41, %rd317;
ld.global.u16 %r2065, [%rd318];
// Skip the counter handling when the context is non-zero or when
// %r1945 + 2 >= %r824 (unsigned).
setp.ne.s32 %p374, %r1297, 0;
add.s32 %r565, %r1945, 2;
setp.ge.u32 %p375, %r565, %r824;
or.pred %p376, %p375, %p374;
@%p376 bra $L__BB28_320;
// Zero context: reduce %r1878 by 2 and keep the entry only if the new value
// is -1; otherwise replace it with 0.
add.s32 %r566, %r1878, -2;
setp.eq.s32 %p377, %r566, -1;
selp.b32 %r2065, %r2065, 0, %p377;
setp.gt.s32 %p378, %r1878, 1;
mov.u32 %r1878, %r566;
@%p378 bra $L__BB28_320;
// Counter used up: pop from the queue if it is non-empty, else fall through
// with %r1882 = 0 to the loop that refills it.
setp.ne.s32 %p379, %r1882, 0;
@%p379 bra $L__BB28_319;
mov.u32 %r1882, 0;
$L__BB28_274:
setp.gt.u32 %p380, %r1882, 7;
@%p380 bra $L__BB28_319;
cvt.u64.u32 %rd59, %r1881;
mul.wide.u32 %rd319, %r1881, 4;
add.s64 %rd321, %rd133, %rd319;
ld.global.nc.u32 %r572, [%rd321];
and.b16 %rs871, %rs1048, 255;
setp.ne.s16 %p381, %rs871, 0;
@%p381 bra $L__BB28_279;
setp.eq.s32 %p382, %r1791, 0;
mov.u16 %rs1240, 255;
@%p382 bra $L__BB28_278;
cvt.u64.u32 %rd322, %r1790;
add.s64 %rd323, %rd322, %rd3;
add.s64 %rd324, %rd1, %rd323;
ld.global.u8 %rs1240, [%rd324];
$L__BB28_278:
setp.ne.s32 %p384, %r1791, 0;
selp.u32 %r1303, 1, 0, %p384;
add.s32 %r1790, %r1790, %r1303;
add.s32 %r1304, %r1791, -1;
selp.b32 %r1791, 0, %r1304, %p382;
setp.eq.s32 %p385, %r1791, 0;
or.b16 %rs873, %rs1240, 15;
selp.b16 %rs1014, %rs873, %rs1240, %p385;
and.b16 %rs874, %rs1014, 255;
mov.u16 %rs875, 8;
sub.s16 %rs1048, %rs875, %rs1013;
setp.eq.s16 %p386, %rs874, 255;
selp.u16 %rs1013, 1, 0, %p386;
$L__BB28_279:
add.s16 %rs1048, %rs1048, -1;
cvt.u32.u16 %r1305, %rs1048;
and.b32 %r1306, %r1305, 255;
mov.u32 %r1307, 1;
shl.b32 %r1308, %r1307, %r1306;
cvt.u32.u16 %r1309, %rs1014;
and.b32 %r1310, %r1308, %r1309;
and.b32 %r577, %r1310, 255;
setp.eq.s32 %p387, %r577, 0;
@%p387 bra $L__BB28_281;
add.s32 %r1311, %r1881, 1;
min.u32 %r2054, %r1311, 12;
mov.u32 %r1312, -1;
shl.b32 %r1313, %r1312, %r572;
shl.b32 %r1314, %r1313, 1;
xor.b32 %r2055, %r1314, -2;
bra.uni $L__BB28_318;
$L__BB28_281:
add.s64 %rd325, %rd59, -3;
setp.gt.u64 %p388, %rd325, 9;
mov.u32 %r2051, 0;
@%p388 bra $L__BB28_317;
max.u32 %r580, %r572, 1;
add.s32 %r1318, %r580, -1;
and.b32 %r581, %r580, 3;
setp.lt.u32 %p389, %r1318, 3;
mov.u32 %r2051, 0;
@%p389 bra $L__BB28_301;
sub.s32 %r2023, %r580, %r581;
mov.u32 %r2051, 0;
$L__BB28_284:
and.b16 %rs877, %rs1048, 255;
setp.ne.s16 %p390, %rs877, 0;
@%p390 bra $L__BB28_288;
setp.eq.s32 %p391, %r1791, 0;
mov.u16 %rs1247, 255;
@%p391 bra $L__BB28_287;
cvt.u64.u32 %rd326, %r1790;
add.s64 %rd327, %rd326, %rd3;
add.s64 %rd328, %rd1, %rd327;
ld.global.u8 %rs1247, [%rd328];
$L__BB28_287:
setp.ne.s32 %p393, %r1791, 0;
selp.u32 %r1320, 1, 0, %p393;
add.s32 %r1790, %r1790, %r1320;
add.s32 %r1321, %r1791, -1;
selp.b32 %r1791, 0, %r1321, %p391;
setp.eq.s32 %p394, %r1791, 0;
or.b16 %rs879, %rs1247, 15;
selp.b16 %rs1014, %rs879, %rs1247, %p394;
and.b16 %rs880, %rs1014, 255;
mov.u16 %rs881, 8;
sub.s16 %rs1048, %rs881, %rs1013;
setp.eq.s16 %p395, %rs880, 255;
selp.u16 %rs1013, 1, 0, %p395;
$L__BB28_288:
add.s16 %rs1254, %rs1048, -1;
and.b16 %rs882, %rs1254, 255;
cvt.u32.u16 %r1322, %rs1254;
and.b32 %r1323, %r1322, 255;
cvt.u32.u16 %r1324, %rs1014;
and.b32 %r2029, %r1324, 255;
shr.u32 %r1325, %r2029, %r1323;
and.b32 %r1326, %r1325, 1;
bfi.b32 %r592, %r2051, %r1326, 1, 31;
setp.ne.s16 %p396, %rs882, 0;
@%p396 bra $L__BB28_292;
setp.eq.s32 %p397, %r1791, 0;
mov.u16 %rs1251, 255;
@%p397 bra $L__BB28_291;
cvt.u64.u32 %rd329, %r1790;
add.s64 %rd330, %rd329, %rd3;
add.s64 %rd331, %rd1, %rd330;
ld.global.u8 %rs1251, [%rd331];
$L__BB28_291:
setp.ne.s32 %p399, %r1791, 0;
selp.u32 %r1327, 1, 0, %p399;
add.s32 %r1790, %r1790, %r1327;
add.s32 %r1328, %r1791, -1;
selp.b32 %r1791, 0, %r1328, %p397;
setp.eq.s32 %p400, %r1791, 0;
or.b16 %rs884, %rs1251, 15;
selp.b16 %rs1014, %rs884, %rs1251, %p400;
and.b16 %rs885, %rs1014, 255;
mov.u16 %rs886, 8;
sub.s16 %rs1254, %rs886, %rs1013;
setp.eq.s16 %p401, %rs885, 255;
selp.u16 %rs1013, 1, 0, %p401;
cvt.u32.u16 %r1329, %rs1014;
and.b32 %r2029, %r1329, 255;
$L__BB28_292:
add.s16 %rs1258, %rs1254, -1;
and.b16 %rs887, %rs1258, 255;
cvt.u32.u16 %r1330, %rs1258;
and.b32 %r1331, %r1330, 255;
shr.u32 %r1332, %r2029, %r1331;
and.b32 %r1333, %r1332, 1;
bfi.b32 %r599, %r592, %r1333, 1, 31;
setp.ne.s16 %p402, %rs887, 0;
@%p402 bra $L__BB28_296;
setp.eq.s32 %p403, %r1791, 0;
mov.u16 %rs1255, 255;
@%p403 bra $L__BB28_295;
cvt.u64.u32 %rd332, %r1790;
add.s64 %rd333, %rd332, %rd3;
add.s64 %rd334, %rd1, %rd333;
ld.global.u8 %rs1255, [%rd334];
$L__BB28_295:
setp.ne.s32 %p405, %r1791, 0;
selp.u32 %r1334, 1, 0, %p405;
add.s32 %r1790, %r1790, %r1334;
add.s32 %r1335, %r1791, -1;
selp.b32 %r1791, 0, %r1335, %p403;
setp.eq.s32 %p406, %r1791, 0;
or.b16 %rs889, %rs1255, 15;
selp.b16 %rs1014, %rs889, %rs1255, %p406;
and.b16 %rs890, %rs1014, 255;
mov.u16 %rs891, 8;
sub.s16 %rs1258, %rs891, %rs1013;
setp.eq.s16 %p407, %rs890, 255;
selp.u16 %rs1013, 1, 0, %p407;
cvt.u32.u16 %r1336, %rs1014;
and.b32 %r2029, %r1336, 255;
$L__BB28_296:
add.s16 %rs1262, %rs1258, -1;
and.b16 %rs892, %rs1262, 255;
cvt.u32.u16 %r1337, %rs1262;
and.b32 %r1338, %r1337, 255;
shr.u32 %r1339, %r2029, %r1338;
and.b32 %r1340, %r1339, 1;
bfi.b32 %r606, %r599, %r1340, 1, 31;
setp.ne.s16 %p408, %rs892, 0;
@%p408 bra $L__BB28_300;
setp.eq.s32 %p409, %r1791, 0;
mov.u16 %rs1259, 255;
@%p409 bra $L__BB28_299;
cvt.u64.u32 %rd335, %r1790;
add.s64 %rd336, %rd335, %rd3;
add.s64 %rd337, %rd1, %rd336;
ld.global.u8 %rs1259, [%rd337];
$L__BB28_299:
setp.ne.s32 %p411, %r1791, 0;
selp.u32 %r1341, 1, 0, %p411;
add.s32 %r1790, %r1790, %r1341;
add.s32 %r1342, %r1791, -1;
selp.b32 %r1791, 0, %r1342, %p409;
setp.eq.s32 %p412, %r1791, 0;
or.b16 %rs894, %rs1259, 15;
selp.b16 %rs1014, %rs894, %rs1259, %p412;
and.b16 %rs895, %rs1014, 255;
mov.u16 %rs896, 8;
sub.s16 %rs1262, %rs896, %rs1013;
setp.eq.s16 %p413, %rs895, 255;
selp.u16 %rs1013, 1, 0, %p413;
cvt.u32.u16 %r1343, %rs1014;
and.b32 %r2029, %r1343, 255;
$L__BB28_300:
add.s16 %rs1048, %rs1262, -1;
cvt.u32.u16 %r1344, %rs1048;
and.b32 %r1345, %r1344, 255;
shr.u32 %r1346, %r2029, %r1345;
and.b32 %r1347, %r1346, 1;
bfi.b32 %r2051, %r606, %r1347, 1, 31;
add.s32 %r2023, %r2023, -4;
setp.ne.s32 %p414, %r2023, 0;
@%p414 bra $L__BB28_284;
$L__BB28_301:
setp.eq.s32 %p415, %r581, 0;
@%p415 bra $L__BB28_317;
and.b16 %rs897, %rs1048, 255;
setp.ne.s16 %p416, %rs897, 0;
@%p416 bra $L__BB28_306;
setp.eq.s32 %p417, %r1791, 0;
mov.u16 %rs1269, 255;
@%p417 bra $L__BB28_305;
cvt.u64.u32 %rd338, %r1790;
add.s64 %rd339, %rd338, %rd3;
add.s64 %rd340, %rd1, %rd339;
ld.global.u8 %rs1269, [%rd340];
$L__BB28_305:
setp.ne.s32 %p419, %r1791, 0;
selp.u32 %r1348, 1, 0, %p419;
add.s32 %r1790, %r1790, %r1348;
add.s32 %r1349, %r1791, -1;
selp.b32 %r1791, 0, %r1349, %p417;
setp.eq.s32 %p420, %r1791, 0;
or.b16 %rs899, %rs1269, 15;
selp.b16 %rs1014, %rs899, %rs1269, %p420;
and.b16 %rs900, %rs1014, 255;
mov.u16 %rs901, 8;
sub.s16 %rs1048, %rs901, %rs1013;
setp.eq.s16 %p421, %rs900, 255;
selp.u16 %rs1013, 1, 0, %p421;
$L__BB28_306:
add.s16 %rs1048, %rs1048, -1;
cvt.u32.u16 %r1350, %rs1048;
and.b32 %r1351, %r1350, 255;
cvt.u32.u16 %r1352, %rs1014;
and.b32 %r2046, %r1352, 255;
shr.u32 %r1353, %r2046, %r1351;
and.b32 %r1354, %r1353, 1;
bfi.b32 %r2051, %r2051, %r1354, 1, 31;
setp.eq.s32 %p422, %r581, 1;
@%p422 bra $L__BB28_317;
and.b16 %rs902, %rs1048, 255;
setp.ne.s16 %p423, %rs902, 0;
@%p423 bra $L__BB28_311;
setp.eq.s32 %p424, %r1791, 0;
mov.u16 %rs1273, 255;
@%p424 bra $L__BB28_310;
cvt.u64.u32 %rd341, %r1790;
add.s64 %rd342, %rd341, %rd3;
add.s64 %rd343, %rd1, %rd342;
ld.global.u8 %rs1273, [%rd343];
$L__BB28_310:
setp.ne.s32 %p426, %r1791, 0;
selp.u32 %r1355, 1, 0, %p426;
add.s32 %r1790, %r1790, %r1355;
add.s32 %r1356, %r1791, -1;
selp.b32 %r1791, 0, %r1356, %p424;
setp.eq.s32 %p427, %r1791, 0;
or.b16 %rs904, %rs1273, 15;
selp.b16 %rs1014, %rs904, %rs1273, %p427;
and.b16 %rs905, %rs1014, 255;
mov.u16 %rs906, 8;
sub.s16 %rs1048, %rs906, %rs1013;
setp.eq.s16 %p428, %rs905, 255;
selp.u16 %rs1013, 1, 0, %p428;
cvt.u32.u16 %r1357, %rs1014;
and.b32 %r2046, %r1357, 255;
$L__BB28_311:
add.s16 %rs1048, %rs1048, -1;
cvt.u32.u16 %r1358, %rs1048;
and.b32 %r1359, %r1358, 255;
shr.u32 %r1360, %r2046, %r1359;
and.b32 %r1361, %r1360, 1;
bfi.b32 %r2051, %r2051, %r1361, 1, 31;
setp.eq.s32 %p429, %r581, 2;
@%p429 bra $L__BB28_317;
and.b16 %rs907, %rs1048, 255;
setp.ne.s16 %p430, %rs907, 0;
@%p430 bra $L__BB28_316;
setp.eq.s32 %p431, %r1791, 0;
mov.u16 %rs1277, 255;
@%p431 bra $L__BB28_315;
cvt.u64.u32 %rd344, %r1790;
add.s64 %rd345, %rd344, %rd3;
add.s64 %rd346, %rd1, %rd345;
ld.global.u8 %rs1277, [%rd346];
$L__BB28_315:
setp.ne.s32 %p433, %r1791, 0;
selp.u32 %r1362, 1, 0, %p433;
add.s32 %r1790, %r1790, %r1362;
add.s32 %r1363, %r1791, -1;
selp.b32 %r1791, 0, %r1363, %p431;
setp.eq.s32 %p434, %r1791, 0;
or.b16 %rs909, %rs1277, 15;
selp.b16 %rs1014, %rs909, %rs1277, %p434;
and.b16 %rs910, %rs1014, 255;
mov.u16 %rs911, 8;
sub.s16 %rs1048, %rs911, %rs1013;
setp.eq.s16 %p435, %rs910, 255;
selp.u16 %rs1013, 1, 0, %p435;
cvt.u32.u16 %r1364, %rs1014;
and.b32 %r2046, %r1364, 255;
$L__BB28_316:
add.s16 %rs1048, %rs1048, -1;
cvt.u32.u16 %r1365, %rs1048;
and.b32 %r1366, %r1365, 255;
shr.u32 %r1367, %r2046, %r1366;
and.b32 %r1368, %r1367, 1;
bfi.b32 %r2051, %r2051, %r1368, 1, 31;
$L__BB28_317:
shl.b32 %r1369, %r2051, 1;
or.b32 %r2055, %r1369, 1;
add.s32 %r1370, %r1881, -1;
setp.eq.s32 %p436, %r1881, 0;
selp.b32 %r2054, 0, %r1370, %p436;
$L__BB28_318:
mul.lo.s32 %r1371, %r1882, 7;
cvt.u64.u32 %rd347, %r2055;
shl.b64 %rd348, %rd347, %r1371;
or.b64 %rd465, %rd348, %rd465;
setp.ne.s32 %p437, %r1881, 12;
setp.ne.s32 %p438, %r577, 0;
or.pred %p439, %p437, %p438;
add.s32 %r1882, %r1882, 1;
setp.lt.u32 %p440, %r1882, 8;
or.pred %p441, %p440, %p439;
mov.u32 %r1881, %r2054;
@%p441 bra $L__BB28_274;
// Pop one 7-bit field from the packed queue %rd465 into %r1878 and decrement
// the queue count %r1882.
$L__BB28_319:
cvt.u32.u64 %r1372, %rd465;
and.b32 %r1878, %r1372, 127;
shr.u64 %rd465, %rd465, 7;
add.s32 %r1882, %r1882, -1;
// Finish one inner iteration: store the second entry, decode one entry of the
// %rd40 table into two more u16 values, then advance both loops.
$L__BB28_320:
// The second entry is forced to 0 when %r565 (= %r1945 + 2) is not below
// %r824; it is stored at element %r1947 + 2.
setp.lt.u32 %p442, %r565, %r824;
selp.b32 %r1373, %r2065, 0, %p442;
add.s32 %r1374, %r1947, 2;
mul.wide.u32 %rd349, %r1374, 2;
add.s64 %rd350, %rd13, %rd349;
st.local.u16 [%rd350], %r1373;
// Bits carried into the next iteration's context (%r1946):
// (((e2 << 2) | (e2 << 1)) & 256) | (prev[%r454+2] & 128), e2 = %r1373.
shl.b32 %r1375, %r1373, 2;
shl.b32 %r1376, %r1373, 1;
or.b32 %r1377, %r1375, %r1376;
and.b32 %r1378, %r1377, 256;
ld.local.u16 %r1379, [%rd47];
and.b32 %r1380, %r1379, 128;
or.b32 %r1946, %r1378, %r1380;
// Consume (e2 & 7) bits from the accumulator.
and.b32 %r1381, %r1373, 7;
shr.u64 %rd351, %rd57, %r1381;
sub.s32 %r1382, %r563, %r1381;
cvt.u32.u64 %r1383, %rd351;
// Index into the %rd40 table:
// ((e1 << 3) & 64) | ((e2 << 4) & 128) | (next 6 accumulator bits), e1 = %r2013.
shl.b32 %r1384, %r2013, 3;
and.b32 %r1385, %r1384, 64;
shl.b32 %r1386, %r1373, 4;
and.b32 %r1387, %r1386, 128;
or.b32 %r1388, %r1387, %r1385;
and.b32 %r1389, %r1383, 63;
or.b32 %r1390, %r1389, %r1388;
mul.wide.u32 %rd352, %r1390, 2;
add.s64 %rd353, %rd40, %rd352;
ld.global.u16 %r1391, [%rd353];
// Entry bits 0-2: accumulator bits consumed by this lookup.
and.b32 %r1392, %r1391, 7;
shr.u64 %rd354, %rd351, %r1392;
sub.s32 %r1393, %r1382, %r1392;
cvt.u32.u64 %r1394, %rd354;
// Entry bits 3-6: width (0..15) of an extra field read from the accumulator
// into %r1400.
shr.u32 %r1395, %r1391, 3;
and.b32 %r1396, %r1395, 15;
mov.u32 %r1397, -1;
shl.b32 %r1398, %r1397, %r1396;
not.b32 %r1399, %r1398;
and.b32 %r1400, %r1394, %r1399;
// Accumulator and bit count carried to the next iteration.
shr.u64 %rd474, %rd354, %r1396;
sub.s32 %r1961, %r1393, %r1396;
// Entry bits 7-9: split position inside the extra field.
// Entry bits 10-12: value added to the low part.
shr.u32 %r1401, %r1391, 7;
and.b32 %r1402, %r1401, 7;
shr.u32 %r1403, %r1391, 10;
and.b32 %r1404, %r1403, 7;
mov.u32 %r1405, 255;
shl.b32 %r1406, %r1405, %r1402;
not.b32 %r1407, %r1406;
and.b32 %r1408, %r1400, %r1407;
// Element %r1947 + 1 = low part + bits 10-12.
add.s32 %r1409, %r1408, %r1404;
add.s32 %r1410, %r1947, 1;
mul.wide.u32 %rd355, %r1410, 2;
add.s64 %rd356, %rd13, %rd355;
st.local.u16 [%rd356], %r1409;
// Element %r1947 + 3 = high part (field >> split) + entry bits 13-15.
shr.u32 %r1411, %r1391, 13;
shr.u32 %r1412, %r1400, %r1402;
add.s32 %r1413, %r1412, %r1411;
add.s32 %r1414, %r1947, 3;
mul.wide.u32 %rd357, %r1414, 2;
add.s64 %rd358, %rd13, %rd357;
st.local.u16 [%rd358], %r1413;
// Advance write index and row position by 4; repeat while %r1945 < %r824.
add.s32 %r1947, %r1947, 4;
add.s32 %r1945, %r1945, 4;
setp.lt.u32 %p443, %r1945, %r824;
@%p443 bra $L__BB28_216;
// End of row: write two zero u16 values after the last element, then advance
// the outer counter by 2 and repeat while %r1936 < %r826.
mul.wide.u32 %rd359, %r1947, 2;
add.s64 %rd360, %rd13, %rd359;
mov.u16 %rs912, 0;
st.local.v2.u16 [%rd360], {%rs912, %rs912};
add.s32 %r1936, %r1936, 2;
setp.lt.u32 %p444, %r1936, %r826;
@%p444 bra $L__BB28_215;
$L__BB28_322:
add.s32 %r1415, %r824, 1;
shr.u32 %r1416, %r1415, 1;
add.s32 %r1417, %r1416, 2;
setp.gt.u32 %p445, %r1417, 130;
@%p445 bra $L__BB28_393;
bra.uni $L__BB28_323;
$L__BB28_393:
mov.u32 %r1683, 2;
st.global.u32 [%rd4], %r1683;
mov.u32 %r1684, 12;
st.global.u32 [%rd4+4], %r1684;
mov.u32 %r1685, 0;
st.global.u32 [%rd4+8], %r1685;
st.global.u32 [%rd4+12], %r1685;
bra.uni $L__BB28_401;
$L__BB28_323:
mov.u32 %r2066, 0;
mov.u64 %rd502, 0;
add.s32 %r668, %r831, 2;
mov.u32 %r1423, 31;
sub.s32 %r669, %r1423, %r832;
mov.u32 %r1424, 29;
sub.s32 %r670, %r1424, %r831;
mov.u16 %rs1320, 0;
mov.b32 %f2, %r836;
mov.u32 %r2067, %r2066;
mov.u32 %r2068, %r2066;
mov.u32 %r2069, %r835;
mov.u32 %r2134, %r2066;
mov.u32 %r2133, %r2066;
$L__BB28_324:
mov.u32 %r674, %r2068;
mul.wide.u32 %rd362, %r2067, 2;
add.s64 %rd363, %rd13, %rd362;
ld.local.u16 %r678, [%rd363];
ld.local.u16 %r679, [%rd363+2];
setp.lt.u32 %p446, %r668, %r679;
@%p446 bra $L__BB28_392;
and.b32 %r1426, %r678, 16;
setp.eq.s32 %p447, %r1426, 0;
mov.u32 %r2087, 0;
mov.u32 %r2079, %r2087;
@%p447 bra $L__BB28_331;
setp.gt.u32 %p448, %r2133, 31;
@%p448 bra $L__BB28_330;
$L__BB28_327:
setp.ge.u32 %p449, %r2134, %r18;
mov.u16 %rs1295, 255;
@%p449 bra $L__BB28_329;
add.s32 %r682, %r2134, 1;
cvt.u64.u32 %rd364, %r2134;
add.s64 %rd365, %rd364, %rd3;
add.s64 %rd366, %rd1, %rd365;
ld.global.u8 %rs1295, [%rd366];
mov.u32 %r2134, %r682;
$L__BB28_329:
and.b16 %rs915, %rs1295, 255;
cvt.u64.u16 %rd367, %rs1295;
and.b64 %rd368, %rd367, 255;
shl.b64 %rd369, %rd368, %r2133;
or.b64 %rd502, %rd369, %rd502;
cvt.u32.u16 %r1427, %rs1320;
cvt.s32.s8 %r1428, %r1427;
mov.u32 %r1429, 8;
sub.s32 %r1430, %r1429, %r1428;
add.s32 %r2133, %r1430, %r2133;
setp.eq.s16 %p450, %rs915, 255;
selp.u16 %rs1320, 1, 0, %p450;
setp.lt.u32 %p451, %r2133, 33;
@%p451 bra $L__BB28_327;
$L__BB28_330:
shr.u32 %r1431, %r678, 12;
and.b32 %r1432, %r1431, 1;
sub.s32 %r1433, %r679, %r1432;
shr.u64 %rd69, %rd502, %r1433;
sub.s32 %r2133, %r2133, %r1433;
cvt.u32.u64 %r1434, %rd502;
shl.b32 %r1435, %r1434, 31;
setp.eq.s32 %p452, %r1433, 0;
mov.u32 %r1436, -1;
shl.b32 %r1437, %r1436, %r1433;
not.b32 %r1438, %r1437;
selp.b32 %r1439, 0, %r1438, %p452;
and.b32 %r1440, %r1439, %r1434;
shr.u32 %r1441, %r678, 8;
and.b32 %r1442, %r1441, 1;
shl.b32 %r1443, %r1442, %r1433;
or.b32 %r1444, %r1443, %r1440;
or.b32 %r1445, %r1444, 1;
add.s32 %r1446, %r1445, 2;
shl.b32 %r1447, %r1446, %r670;
or.b32 %r2079, %r1447, %r1435;
mov.u64 %rd502, %rd69;
$L__BB28_331:
and.b32 %r1450, %r2079, 2147483647;
shr.u32 %r1451, %r1450, %r669;
neg.s32 %r1452, %r1451;
setp.lt.s32 %p453, %r2079, 0;
selp.b32 %r1453, %r1452, %r1451, %p453;
cvt.rn.f32.s32 %f1, %r1453;
mul.rn.f32 %f3, %f2, %f1;
mul.wide.u32 %rd370, %r2069, 4;
add.s64 %rd371, %rd2, %rd370;
st.global.f32 [%rd371], %f3;
and.b32 %r1454, %r678, 32;
setp.eq.s32 %p454, %r1454, 0;
mov.u32 %r2088, %r2087;
@%p454 bra $L__BB28_337;
setp.gt.u32 %p455, %r2133, 31;
@%p455 bra $L__BB28_336;
$L__BB28_333:
setp.ge.u32 %p456, %r2134, %r18;
mov.u16 %rs1299, 255;
@%p456 bra $L__BB28_335;
add.s32 %r694, %r2134, 1;
cvt.u64.u32 %rd372, %r2134;
add.s64 %rd373, %rd372, %rd3;
add.s64 %rd374, %rd1, %rd373;
ld.global.u8 %rs1299, [%rd374];
mov.u32 %r2134, %r694;
$L__BB28_335:
and.b16 %rs917, %rs1299, 255;
cvt.u64.u16 %rd375, %rs1299;
and.b64 %rd376, %rd375, 255;
shl.b64 %rd377, %rd376, %r2133;
or.b64 %rd502, %rd377, %rd502;
cvt.u32.u16 %r1455, %rs1320;
cvt.s32.s8 %r1456, %r1455;
mov.u32 %r1457, 8;
sub.s32 %r1458, %r1457, %r1456;
add.s32 %r2133, %r1458, %r2133;
setp.eq.s16 %p457, %rs917, 255;
selp.u16 %rs1320, 1, 0, %p457;
setp.lt.u32 %p458, %r2133, 33;
@%p458 bra $L__BB28_333;
$L__BB28_336:
shr.u32 %r1459, %r678, 13;
and.b32 %r1460, %r1459, 1;
sub.s32 %r1461, %r679, %r1460;
shr.u64 %rd74, %rd502, %r1461;
sub.s32 %r2133, %r2133, %r1461;
cvt.u32.u64 %r1462, %rd502;
shl.b32 %r1463, %r1462, 31;
setp.eq.s32 %p459, %r1461, 0;
mov.u32 %r1464, -1;
shl.b32 %r1465, %r1464, %r1461;
not.b32 %r1466, %r1465;
selp.b32 %r1467, 0, %r1466, %p459;
and.b32 %r1468, %r1467, %r1462;
shr.u32 %r1469, %r678, 9;
and.b32 %r1470, %r1469, 1;
shl.b32 %r1471, %r1470, %r1461;
or.b32 %r1472, %r1471, %r1468;
or.b32 %r2088, %r1472, 1;
add.s32 %r1473, %r2088, 2;
shl.b32 %r1474, %r1473, %r670;
or.b32 %r2087, %r1474, %r1463;
mov.u64 %rd502, %rd74;
$L__BB28_337:
setp.lt.u32 %p460, %r826, 2;
@%p460 bra $L__BB28_339;
add.s32 %r1475, %r2069, %r834;
and.b32 %r1476, %r2087, 2147483647;
shr.u32 %r1477, %r1476, %r669;
neg.s32 %r1478, %r1477;
setp.lt.s32 %p461, %r2087, 0;
selp.b32 %r1479, %r1478, %r1477, %p461;
cvt.rn.f32.s32 %f4, %r1479;
mul.rn.f32 %f6, %f2, %f4;
mul.wide.u32 %rd378, %r1475, 4;
add.s64 %rd379, %rd2, %rd378;
st.global.f32 [%rd379], %f6;
$L__BB28_339:
or.b32 %r1480, %r2088, %r2066;
add.u64 %rd76, %SPL, 6192;
mul.wide.u32 %rd381, %r674, 4;
add.s64 %rd382, %rd76, %rd381;
st.local.u32 [%rd382], %r1480;
add.s32 %r706, %r2069, 1;
add.s32 %r1481, %r2067, 1;
setp.lt.u32 %p462, %r1481, %r824;
@%p462 bra $L__BB28_341;
bra.uni $L__BB28_340;
$L__BB28_341:
and.b32 %r1484, %r678, 64;
setp.eq.s32 %p463, %r1484, 0;
mov.u32 %r2104, 0;
mov.u32 %r2096, %r2104;
@%p463 bra $L__BB28_347;
setp.gt.u32 %p464, %r2133, 31;
@%p464 bra $L__BB28_346;
$L__BB28_343:
setp.ge.u32 %p465, %r2134, %r18;
mov.u16 %rs1303, 255;
@%p465 bra $L__BB28_345;
add.s32 %r709, %r2134, 1;
cvt.u64.u32 %rd383, %r2134;
add.s64 %rd384, %rd383, %rd3;
add.s64 %rd385, %rd1, %rd384;
ld.global.u8 %rs1303, [%rd385];
mov.u32 %r2134, %r709;
$L__BB28_345:
and.b16 %rs919, %rs1303, 255;
cvt.u64.u16 %rd386, %rs1303;
and.b64 %rd387, %rd386, 255;
shl.b64 %rd388, %rd387, %r2133;
or.b64 %rd502, %rd388, %rd502;
cvt.u32.u16 %r1485, %rs1320;
cvt.s32.s8 %r1486, %r1485;
mov.u32 %r1487, 8;
sub.s32 %r1488, %r1487, %r1486;
add.s32 %r2133, %r1488, %r2133;
setp.eq.s16 %p466, %rs919, 255;
selp.u16 %rs1320, 1, 0, %p466;
setp.lt.u32 %p467, %r2133, 33;
@%p467 bra $L__BB28_343;
$L__BB28_346:
shr.u32 %r1489, %r678, 14;
and.b32 %r1490, %r1489, 1;
sub.s32 %r1491, %r679, %r1490;
shr.u64 %rd80, %rd502, %r1491;
sub.s32 %r2133, %r2133, %r1491;
cvt.u32.u64 %r1492, %rd502;
shl.b32 %r1493, %r1492, 31;
setp.eq.s32 %p468, %r1491, 0;
mov.u32 %r1494, -1;
shl.b32 %r1495, %r1494, %r1491;
not.b32 %r1496, %r1495;
selp.b32 %r1497, 0, %r1496, %p468;
and.b32 %r1498, %r1497, %r1492;
shr.u32 %r1499, %r678, 10;
and.b32 %r1500, %r1499, 1;
shl.b32 %r1501, %r1500, %r1491;
or.b32 %r1502, %r1501, %r1498;
or.b32 %r1503, %r1502, 1;
add.s32 %r1504, %r1503, 2;
shl.b32 %r1505, %r1504, %r670;
or.b32 %r2096, %r1505, %r1493;
mov.u64 %rd502, %rd80;
$L__BB28_347:
and.b32 %r1508, %r2096, 2147483647;
shr.u32 %r1509, %r1508, %r669;
neg.s32 %r1510, %r1509;
setp.lt.s32 %p469, %r2096, 0;
selp.b32 %r1511, %r1510, %r1509, %p469;
cvt.rn.f32.s32 %f7, %r1511;
mul.rn.f32 %f9, %f2, %f7;
mul.wide.u32 %rd389, %r706, 4;
add.s64 %rd390, %rd2, %rd389;
st.global.f32 [%rd390], %f9;
and.b32 %r1512, %r678, 128;
setp.eq.s32 %p470, %r1512, 0;
mov.u32 %r2066, %r2104;
@%p470 bra $L__BB28_353;
setp.gt.u32 %p471, %r2133, 31;
@%p471 bra $L__BB28_352;
$L__BB28_349:
setp.ge.u32 %p472, %r2134, %r18;
mov.u16 %rs1307, 255;
@%p472 bra $L__BB28_351;
add.s32 %r721, %r2134, 1;
cvt.u64.u32 %rd391, %r2134;
add.s64 %rd392, %rd391, %rd3;
add.s64 %rd393, %rd1, %rd392;
ld.global.u8 %rs1307, [%rd393];
mov.u32 %r2134, %r721;
$L__BB28_351:
and.b16 %rs921, %rs1307, 255;
cvt.u64.u16 %rd394, %rs1307;
and.b64 %rd395, %rd394, 255;
shl.b64 %rd396, %rd395, %r2133;
or.b64 %rd502, %rd396, %rd502;
cvt.u32.u16 %r1513, %rs1320;
cvt.s32.s8 %r1514, %r1513;
mov.u32 %r1515, 8;
sub.s32 %r1516, %r1515, %r1514;
add.s32 %r2133, %r1516, %r2133;
setp.eq.s16 %p473, %rs921, 255;
selp.u16 %rs1320, 1, 0, %p473;
setp.lt.u32 %p474, %r2133, 33;
@%p474 bra $L__BB28_349;
$L__BB28_352:
shr.u32 %r1517, %r678, 15;
sub.s32 %r1518, %r679, %r1517;
shr.u64 %rd85, %rd502, %r1518;
sub.s32 %r2133, %r2133, %r1518;
cvt.u32.u64 %r1519, %rd502;
shl.b32 %r1520, %r1519, 31;
setp.eq.s32 %p475, %r1518, 0;
mov.u32 %r1521, -1;
shl.b32 %r1522, %r1521, %r1518;
not.b32 %r1523, %r1522;
selp.b32 %r1524, 0, %r1523, %p475;
and.b32 %r1525, %r1524, %r1519;
shr.u32 %r1526, %r678, 11;
and.b32 %r1527, %r1526, 1;
shl.b32 %r1528, %r1527, %r1518;
or.b32 %r1529, %r1528, %r1525;
or.b32 %r2066, %r1529, 1;
add.s32 %r1530, %r2066, 2;
shl.b32 %r1531, %r1530, %r670;
or.b32 %r2104, %r1531, %r1520;
mov.u64 %rd502, %rd85;
$L__BB28_353:
@%p460 bra $L__BB28_355;
add.s32 %r1532, %r706, %r834;
and.b32 %r1533, %r2104, 2147483647;
shr.u32 %r1534, %r1533, %r669;
neg.s32 %r1535, %r1534;
setp.lt.s32 %p477, %r2104, 0;
selp.b32 %r1536, %r1535, %r1534, %p477;
cvt.rn.f32.s32 %f10, %r1536;
mul.rn.f32 %f12, %f2, %f10;
mul.wide.u32 %rd397, %r1532, 4;
add.s64 %rd398, %rd2, %rd397;
st.global.f32 [%rd398], %f12;
$L__BB28_355:
add.s32 %r2069, %r2069, 2;
add.s32 %r2068, %r674, 1;
add.s32 %r2067, %r2067, 2;
setp.lt.u32 %p478, %r2067, %r824;
@%p478 bra $L__BB28_324;
bra.uni $L__BB28_356;
$L__BB28_392:
mov.u32 %r1676, 1;
st.global.u32 [%rd4], %r1676;
mov.u32 %r1677, 13;
st.global.u32 [%rd4+4], %r1677;
mov.u32 %r1678, 0;
st.global.u32 [%rd4+8], %r1678;
st.global.u32 [%rd4+12], %r1678;
bra.uni $L__BB28_401;
$L__BB28_340:
mov.u32 %r2066, 0;
$L__BB28_356:
add.s32 %r1537, %r674, 1;
mul.wide.u32 %rd401, %r1537, 4;
add.s64 %rd402, %rd76, %rd401;
st.local.u32 [%rd402], %r2066;
@%p298 bra $L__BB28_401;
mov.u32 %r2109, 2;
$L__BB28_358:
shr.u32 %r1543, %r2109, 1;
mul.lo.s32 %r2113, %r1543, %r849;
mad.lo.s32 %r2115, %r2109, %r834, %r835;
add.s32 %r745, %r2109, 1;
ld.local.u32 %r2112, [%rd76];
mov.u32 %r2114, 0;
mov.u32 %r2116, %r2114;
mov.u32 %r2117, %r2114;
$L__BB28_359:
mul.wide.u32 %rd403, %r2113, 2;
add.s64 %rd404, %rd13, %rd403;
ld.local.v2.u16 {%rs922, %rs923}, [%rd404];
cvt.u32.u16 %r755, %rs922;
cvt.u32.u16 %r1544, %rs923;
and.b32 %r1545, %r755, 240;
add.s32 %r1546, %r1545, 240;
and.b32 %r1547, %r1546, %r1545;
add.s32 %r756, %r2114, 1;
mul.wide.u32 %rd405, %r756, 4;
add.s64 %rd90, %rd76, %rd405;
ld.local.u32 %r757, [%rd90];
or.b32 %r1548, %r2112, %r757;
or.b32 %r1549, %r1548, 2;
clz.b32 %r1550, %r1549;
xor.b32 %r1551, %r1550, 31;
setp.eq.s32 %p480, %r1547, 0;
selp.b32 %r1552, 1, %r1551, %p480;
add.s32 %r758, %r1552, %r1544;
setp.gt.u32 %p481, %r758, %r668;
@%p481 bra $L__BB28_391;
and.b32 %r1554, %r755, 16;
setp.eq.s32 %p482, %r1554, 0;
mov.u32 %r2135, 0;
mov.u32 %r2127, %r2135;
@%p482 bra $L__BB28_366;
setp.gt.u32 %p483, %r2133, 31;
@%p483 bra $L__BB28_365;
$L__BB28_362:
setp.ge.u32 %p484, %r2134, %r18;
mov.u16 %rs1314, 255;
@%p484 bra $L__BB28_364;
add.s32 %r761, %r2134, 1;
cvt.u64.u32 %rd406, %r2134;
add.s64 %rd407, %rd406, %rd3;
add.s64 %rd408, %rd1, %rd407;
ld.global.u8 %rs1314, [%rd408];
mov.u32 %r2134, %r761;
$L__BB28_364:
and.b16 %rs927, %rs1314, 255;
cvt.u64.u16 %rd409, %rs1314;
and.b64 %rd410, %rd409, 255;
shl.b64 %rd411, %rd410, %r2133;
or.b64 %rd502, %rd411, %rd502;
cvt.u32.u16 %r1555, %rs1320;
cvt.s32.s8 %r1556, %r1555;
mov.u32 %r1557, 8;
sub.s32 %r1558, %r1557, %r1556;
add.s32 %r2133, %r1558, %r2133;
setp.eq.s16 %p485, %rs927, 255;
selp.u16 %rs1320, 1, 0, %p485;
setp.lt.u32 %p486, %r2133, 33;
@%p486 bra $L__BB28_362;
$L__BB28_365:
shr.u32 %r1559, %r755, 12;
and.b32 %r1560, %r1559, 1;
sub.s32 %r1561, %r758, %r1560;
shr.u64 %rd94, %rd502, %r1561;
sub.s32 %r2133, %r2133, %r1561;
cvt.u32.u64 %r1562, %rd502;
shl.b32 %r1563, %r1562, 31;
setp.eq.s32 %p487, %r1561, 0;
mov.u32 %r1564, -1;
shl.b32 %r1565, %r1564, %r1561;
not.b32 %r1566, %r1565;
selp.b32 %r1567, 0, %r1566, %p487;
and.b32 %r1568, %r1567, %r1562;
shr.u32 %r1569, %r755, 8;
and.b32 %r1570, %r1569, 1;
shl.b32 %r1571, %r1570, %r1561;
or.b32 %r1572, %r1571, %r1568;
or.b32 %r1573, %r1572, 1;
add.s32 %r1574, %r1573, 2;
shl.b32 %r1575, %r1574, %r670;
or.b32 %r2127, %r1575, %r1563;
mov.u64 %rd502, %rd94;
$L__BB28_366:
and.b32 %r1578, %r2127, 2147483647;
shr.u32 %r1579, %r1578, %r669;
neg.s32 %r1580, %r1579;
setp.lt.s32 %p488, %r2127, 0;
selp.b32 %r1581, %r1580, %r1579, %p488;
cvt.rn.f32.s32 %f13, %r1581;
mul.rn.f32 %f15, %f2, %f13;
mul.wide.u32 %rd412, %r2115, 4;
add.s64 %rd413, %rd2, %rd412;
st.global.f32 [%rd413], %f15;
and.b32 %r1582, %r755, 32;
setp.eq.s32 %p489, %r1582, 0;
mov.u32 %r2136, %r2135;
@%p489 bra $L__BB28_372;
setp.gt.u32 %p490, %r2133, 31;
@%p490 bra $L__BB28_371;
$L__BB28_368:
setp.ge.u32 %p491, %r2134, %r18;
mov.u16 %rs1318, 255;
@%p491 bra $L__BB28_370;
add.s32 %r773, %r2134, 1;
cvt.u64.u32 %rd414, %r2134;
add.s64 %rd415, %rd414, %rd3;
add.s64 %rd416, %rd1, %rd415;
ld.global.u8 %rs1318, [%rd416];
mov.u32 %r2134, %r773;
$L__BB28_370:
and.b16 %rs929, %rs1318, 255;
cvt.u64.u16 %rd417, %rs1318;
and.b64 %rd418, %rd417, 255;
shl.b64 %rd419, %rd418, %r2133;
or.b64 %rd502, %rd419, %rd502;
cvt.u32.u16 %r1583, %rs1320;
cvt.s32.s8 %r1584, %r1583;
mov.u32 %r1585, 8;
sub.s32 %r1586, %r1585, %r1584;
add.s32 %r2133, %r1586, %r2133;
setp.eq.s16 %p492, %rs929, 255;
selp.u16 %rs1320, 1, 0, %p492;
setp.lt.u32 %p493, %r2133, 33;
@%p493 bra $L__BB28_368;
$L__BB28_371:
shr.u32 %r1587, %r755, 13;
and.b32 %r1588, %r1587, 1;
sub.s32 %r1589, %r758, %r1588;
shr.u64 %rd99, %rd502, %r1589;
sub.s32 %r2133, %r2133, %r1589;
cvt.u32.u64 %r1590, %rd502;
shl.b32 %r1591, %r1590, 31;
setp.eq.s32 %p494, %r1589, 0;
mov.u32 %r1592, -1;
shl.b32 %r1593, %r1592, %r1589;
not.b32 %r1594, %r1593;
selp.b32 %r1595, 0, %r1594, %p494;
and.b32 %r1596, %r1595, %r1590;
shr.u32 %r1597, %r755, 9;
and.b32 %r1598, %r1597, 1;
shl.b32 %r1599, %r1598, %r1589;
or.b32 %r1600, %r1599, %r1596;
or.b32 %r2136, %r1600, 1;
add.s32 %r1601, %r2136, 2;
shl.b32 %r1602, %r1601, %r670;
or.b32 %r2135, %r1602, %r1591;
mov.u64 %rd502, %rd99;
$L__BB28_372:
setp.ge.u32 %p495, %r745, %r826;
@%p495 bra $L__BB28_374;
add.s32 %r1603, %r2115, %r834;
and.b32 %r1604, %r2135, 2147483647;
shr.u32 %r1605, %r1604, %r669;
neg.s32 %r1606, %r1605;
setp.lt.s32 %p496, %r2135, 0;
selp.b32 %r1607, %r1606, %r1605, %p496;
cvt.rn.f32.s32 %f16, %r1607;
mul.rn.f32 %f18, %f2, %f16;
mul.wide.u32 %rd420, %r1603, 4;
add.s64 %rd421, %rd2, %rd420;
st.global.f32 [%rd421], %f18;
$L__BB28_374:
or.b32 %r1609, %r2136, %r2116;
mul.wide.u32 %rd422, %r2114, 4;
add.s64 %rd423, %rd76, %rd422;
st.local.u32 [%rd423], %r1609;
add.s32 %r785, %r2115, 1;
add.s32 %r1610, %r2117, 1;
setp.ge.u32 %p497, %r1610, %r824;
mov.u32 %r2116, 0;
@%p497 bra $L__BB28_390;
and.b32 %r1612, %r755, 64;
setp.eq.s32 %p498, %r1612, 0;
mov.u32 %r2152, 0;
mov.u32 %r2144, %r2152;
@%p498 bra $L__BB28_381;
setp.gt.u32 %p499, %r2133, 31;
@%p499 bra $L__BB28_380;
$L__BB28_377:
setp.ge.u32 %p500, %r2134, %r18;
mov.u16 %rs1322, 255;
@%p500 bra $L__BB28_379;
add.s32 %r788, %r2134, 1;
cvt.u64.u32 %rd424, %r2134;
add.s64 %rd425, %rd424, %rd3;
add.s64 %rd426, %rd1, %rd425;
ld.global.u8 %rs1322, [%rd426];
mov.u32 %r2134, %r788;
$L__BB28_379:
and.b16 %rs931, %rs1322, 255;
cvt.u64.u16 %rd427, %rs1322;
and.b64 %rd428, %rd427, 255;
shl.b64 %rd429, %rd428, %r2133;
or.b64 %rd502, %rd429, %rd502;
cvt.u32.u16 %r1613, %rs1320;
cvt.s32.s8 %r1614, %r1613;
mov.u32 %r1615, 8;
sub.s32 %r1616, %r1615, %r1614;
add.s32 %r2133, %r1616, %r2133;
setp.eq.s16 %p501, %rs931, 255;
selp.u16 %rs1320, 1, 0, %p501;
setp.lt.u32 %p502, %r2133, 33;
@%p502 bra $L__BB28_377;
$L__BB28_380:
shr.u32 %r1617, %r755, 14;
and.b32 %r1618, %r1617, 1;
sub.s32 %r1619, %r758, %r1618;
shr.u64 %rd104, %rd502, %r1619;
sub.s32 %r2133, %r2133, %r1619;
cvt.u32.u64 %r1620, %rd502;
shl.b32 %r1621, %r1620, 31;
setp.eq.s32 %p503, %r1619, 0;
mov.u32 %r1622, -1;
shl.b32 %r1623, %r1622, %r1619;
not.b32 %r1624, %r1623;
selp.b32 %r1625, 0, %r1624, %p503;
and.b32 %r1626, %r1625, %r1620;
shr.u32 %r1627, %r755, 10;
and.b32 %r1628, %r1627, 1;
shl.b32 %r1629, %r1628, %r1619;
or.b32 %r1630, %r1629, %r1626;
or.b32 %r1631, %r1630, 1;
add.s32 %r1632, %r1631, 2;
shl.b32 %r1633, %r1632, %r670;
or.b32 %r2144, %r1633, %r1621;
mov.u64 %rd502, %rd104;
$L__BB28_381:
and.b32 %r1636, %r2144, 2147483647;
shr.u32 %r1637, %r1636, %r669;
neg.s32 %r1638, %r1637;
setp.lt.s32 %p504, %r2144, 0;
selp.b32 %r1639, %r1638, %r1637, %p504;
cvt.rn.f32.s32 %f19, %r1639;
mul.rn.f32 %f21, %f2, %f19;
mul.wide.u32 %rd430, %r785, 4;
add.s64 %rd431, %rd2, %rd430;
st.global.f32 [%rd431], %f21;
and.b32 %r1640, %r755, 128;
setp.eq.s32 %p505, %r1640, 0;
mov.u32 %r2116, %r2152;
@%p505 bra $L__BB28_387;
setp.gt.u32 %p506, %r2133, 31;
@%p506 bra $L__BB28_386;
$L__BB28_383:
setp.ge.u32 %p507, %r2134, %r18;
mov.u16 %rs1326, 255;
@%p507 bra $L__BB28_385;
add.s32 %r800, %r2134, 1;
cvt.u64.u32 %rd432, %r2134;
add.s64 %rd433, %rd432, %rd3;
add.s64 %rd434, %rd1, %rd433;
ld.global.u8 %rs1326, [%rd434];
mov.u32 %r2134, %r800;
$L__BB28_385:
and.b16 %rs933, %rs1326, 255;
cvt.u64.u16 %rd435, %rs1326;
and.b64 %rd436, %rd435, 255;
shl.b64 %rd437, %rd436, %r2133;
or.b64 %rd502, %rd437, %rd502;
cvt.u32.u16 %r1641, %rs1320;
cvt.s32.s8 %r1642, %r1641;
mov.u32 %r1643, 8;
sub.s32 %r1644, %r1643, %r1642;
add.s32 %r2133, %r1644, %r2133;
setp.eq.s16 %p508, %rs933, 255;
selp.u16 %rs1320, 1, 0, %p508;
setp.lt.u32 %p509, %r2133, 33;
@%p509 bra $L__BB28_383;
$L__BB28_386:
shr.u32 %r1645, %r755, 15;
sub.s32 %r1646, %r758, %r1645;
shr.u64 %rd109, %rd502, %r1646;
sub.s32 %r2133, %r2133, %r1646;
cvt.u32.u64 %r1647, %rd502;
shl.b32 %r1648, %r1647, 31;
setp.eq.s32 %p510, %r1646, 0;
mov.u32 %r1649, -1;
shl.b32 %r1650, %r1649, %r1646;
not.b32 %r1651, %r1650;
selp.b32 %r1652, 0, %r1651, %p510;
and.b32 %r1653, %r1652, %r1647;
shr.u32 %r1654, %r755, 11;
and.b32 %r1655, %r1654, 1;
shl.b32 %r1656, %r1655, %r1646;
or.b32 %r1657, %r1656, %r1653;
or.b32 %r2116, %r1657, 1;
add.s32 %r1658, %r2116, 2;
shl.b32 %r1659, %r1658, %r670;
or.b32 %r2152, %r1659, %r1648;
mov.u64 %rd502, %rd109;
$L__BB28_387:
@%p495 bra $L__BB28_389;
add.s32 %r1660, %r785, %r834;
and.b32 %r1661, %r2152, 2147483647;
shr.u32 %r1662, %r1661, %r669;
neg.s32 %r1663, %r1662;
setp.lt.s32 %p512, %r2152, 0;
selp.b32 %r1664, %r1663, %r1662, %p512;
cvt.rn.f32.s32 %f22, %r1664;
mul.rn.f32 %f24, %f2, %f22;
mul.wide.u32 %rd438, %r1660, 4;
add.s64 %rd439, %rd2, %rd438;
st.global.f32 [%rd439], %f24;
$L__BB28_389:
add.s32 %r2115, %r2115, 2;
add.s32 %r2113, %r2113, 2;
add.s32 %r2117, %r2117, 2;
setp.lt.u32 %p513, %r2117, %r824;
mov.u32 %r2112, %r757;
mov.u32 %r2114, %r756;
@%p513 bra $L__BB28_359;
$L__BB28_390:
st.local.u32 [%rd90], %r2116;
add.s32 %r2109, %r2109, 2;
setp.lt.u32 %p514, %r2109, %r826;
@%p514 bra $L__BB28_358;
bra.uni $L__BB28_401;
$L__BB28_391:
mov.u32 %r1669, 1;
st.global.u32 [%rd4], %r1669;
mov.u32 %r1670, 14;
st.global.u32 [%rd4+4], %r1670;
mov.u32 %r1671, 0;
st.global.u32 [%rd4+8], %r1671;
st.global.u32 [%rd4+12], %r1671;
$L__BB28_401:
ret;
}
// .globl j2k_dequantize_htj2k_codeblocks
.visible .entry j2k_dequantize_htj2k_codeblocks(
    .param .u64 j2k_dequantize_htj2k_codeblocks_param_0,
    .param .u64 j2k_dequantize_htj2k_codeblocks_param_1
)
{
    // In-place conversion of 32-bit words to scaled f32 values.
    //
    //   param_0 : generic pointer to the buffer that is read and overwritten.
    //   param_1 : generic pointer to a table of 52-byte records; the record
    //             used by a CTA is selected with ctaid.x.
    //
    // Record words read here (byte offsets from the record start):
    //   +4  width   (divisor that turns a flat index into a row number)
    //   +8  rows    (rows * width = number of elements to process)
    //   +28 word subtracted from 31 to obtain the right-shift amount
    //   +36 stride  (buffer elements between consecutive rows)
    //   +40 origin  (buffer index of the first element)
    //   +44 f32 multiplier applied to every converted value
    //
    // Each thread starts at tid.x and advances by ntid.x until the element
    // count is reached.
    .reg .pred %p_none, %p_neg, %p_more;
    .reg .f32 %scale, %level, %result;
    .reg .b32 %cta, %rows, %width, %count, %idx, %origin, %stride, %row_gap;
    .reg .b32 %shift_src, %k31, %shift, %step, %row, %biased, %elem;
    .reg .b32 %raw, %mag, %mag_shr, %mag_neg, %signed_mag;
    .reg .b64 %buf_arg, %tab_arg, %tab, %rec_off, %rec, %buf, %elem_off, %elem_ptr;

    ld.param.u64 %buf_arg, [j2k_dequantize_htj2k_codeblocks_param_0];
    ld.param.u64 %tab_arg, [j2k_dequantize_htj2k_codeblocks_param_1];

    // rec = table + ctaid.x * 52
    cvta.to.global.u64 %tab, %tab_arg;
    mov.u32 %cta, %ctaid.x;
    mul.wide.u32 %rec_off, %cta, 52;
    add.s64 %rec, %tab, %rec_off;

    // count = rows * width; threads whose first index is past it leave now.
    ld.global.u32 %rows, [%rec+8];
    ld.global.u32 %width, [%rec+4];
    mul.lo.s32 %count, %rows, %width;
    mov.u32 %idx, %tid.x;
    setp.ge.u32 %p_none, %idx, %count;
    @%p_none bra $L__dequant_cb_done;

    // Loop-invariant values.
    ld.global.u32 %origin, [%rec+40];
    ld.global.f32 %scale, [%rec+44];
    ld.global.u32 %stride, [%rec+36];
    sub.s32 %row_gap, %stride, %width;
    ld.global.u32 %shift_src, [%rec+28];
    mov.u32 %k31, 31;
    sub.s32 %shift, %k31, %shift_src;
    mov.u32 %step, %ntid.x;
    cvta.to.global.u64 %buf, %buf_arg;

$L__dequant_cb_loop:
    // elem = origin + idx + (stride - width) * (idx / width)
    div.u32 %row, %idx, %width;
    add.s32 %biased, %idx, %origin;
    mad.lo.s32 %elem, %row_gap, %row, %biased;
    mul.wide.u32 %elem_off, %elem, 4;
    add.s64 %elem_ptr, %buf, %elem_off;

    // Bit 31 is used as a sign flag, the low 31 bits as a magnitude that is
    // shifted right before the sign is re-applied.
    ld.global.u32 %raw, [%elem_ptr];
    and.b32 %mag, %raw, 0x7FFFFFFF;
    shr.u32 %mag_shr, %mag, %shift;
    neg.s32 %mag_neg, %mag_shr;
    setp.lt.s32 %p_neg, %raw, 0;
    selp.b32 %signed_mag, %mag_neg, %mag_shr, %p_neg;

    // Convert to f32, scale, and overwrite the word that was just read.
    cvt.rn.f32.s32 %level, %signed_mag;
    mul.rn.f32 %result, %scale, %level;
    st.global.f32 [%elem_ptr], %result;

    add.s32 %idx, %idx, %step;
    setp.lt.u32 %p_more, %idx, %count;
    @%p_more bra $L__dequant_cb_loop;

$L__dequant_cb_done:
    ret;
}
// .globl j2k_dequantize_htj2k_codeblocks_multi
.visible .entry j2k_dequantize_htj2k_codeblocks_multi(
    .param .u64 j2k_dequantize_htj2k_codeblocks_multi_param_0
)
{
    // In-place conversion of 32-bit words to scaled f32 values, with the
    // target buffer taken from the per-CTA record instead of a parameter.
    //
    //   param_0 : generic pointer to a table of 40-byte records; the record
    //             used by a CTA is selected with ctaid.x.
    //
    // Record fields read here (byte offsets from the record start):
    //   +0  u64 pointer to the buffer that is read and overwritten
    //   +8  width   (divisor that turns a flat index into a row number)
    //   +12 rows    (rows * width = number of elements to process)
    //   +16 stride  (buffer elements between consecutive rows)
    //   +20 origin  (buffer index of the first element)
    //   +24 word subtracted from 31 to obtain the right-shift amount
    //   +32 f32 multiplier applied to every converted value
    //
    // Each thread starts at tid.x and advances by ntid.x until the element
    // count is reached.
    .reg .pred %p_none, %p_neg, %p_more;
    .reg .f32 %scale, %level, %result;
    .reg .b32 %cta, %rows, %width, %count, %idx, %origin, %stride, %unused_hi;
    .reg .b32 %row_gap, %shift_src, %k31, %shift, %step, %row, %biased, %elem;
    .reg .b32 %raw, %mag, %mag_shr, %mag_neg, %signed_mag;
    .reg .b64 %tab_arg, %tab, %rec_off, %rec, %buf_gen, %buf, %elem_off, %elem_ptr;

    // rec = table + ctaid.x * 40
    ld.param.u64 %tab_arg, [j2k_dequantize_htj2k_codeblocks_multi_param_0];
    cvta.to.global.u64 %tab, %tab_arg;
    mov.u32 %cta, %ctaid.x;
    mul.wide.u32 %rec_off, %cta, 40;
    add.s64 %rec, %tab, %rec_off;

    // count = rows * width; threads whose first index is past it leave now.
    ld.global.v2.u32 {%width, %rows}, [%rec+8];
    mul.lo.s32 %count, %rows, %width;
    mov.u32 %idx, %tid.x;
    setp.ge.u32 %p_none, %idx, %count;
    @%p_none bra $L__dequant_cb_multi_done;

    // Loop-invariant values.
    ld.global.u64 %buf_gen, [%rec];
    ld.global.u32 %origin, [%rec+20];
    ld.global.f32 %scale, [%rec+32];
    // Only the first lane (stride) of the paired load below is consumed;
    // the second lane is never read afterwards.
    .pragma "used_bytes_mask 15";
    ld.global.v2.u32 {%stride, %unused_hi}, [%rec+16];
    sub.s32 %row_gap, %stride, %width;
    ld.global.u32 %shift_src, [%rec+24];
    mov.u32 %k31, 31;
    sub.s32 %shift, %k31, %shift_src;
    mov.u32 %step, %ntid.x;
    cvta.to.global.u64 %buf, %buf_gen;

$L__dequant_cb_multi_loop:
    // elem = origin + idx + (stride - width) * (idx / width)
    div.u32 %row, %idx, %width;
    add.s32 %biased, %idx, %origin;
    mad.lo.s32 %elem, %row_gap, %row, %biased;
    mul.wide.u32 %elem_off, %elem, 4;
    add.s64 %elem_ptr, %buf, %elem_off;

    // Bit 31 is used as a sign flag, the low 31 bits as a magnitude that is
    // shifted right before the sign is re-applied.
    ld.global.u32 %raw, [%elem_ptr];
    and.b32 %mag, %raw, 0x7FFFFFFF;
    shr.u32 %mag_shr, %mag, %shift;
    neg.s32 %mag_neg, %mag_shr;
    setp.lt.s32 %p_neg, %raw, 0;
    selp.b32 %signed_mag, %mag_neg, %mag_shr, %p_neg;

    // Convert to f32, scale, and overwrite the word that was just read.
    cvt.rn.f32.s32 %level, %signed_mag;
    mul.rn.f32 %result, %scale, %level;
    st.global.f32 [%elem_ptr], %result;

    add.s32 %idx, %idx, %step;
    setp.lt.u32 %p_more, %idx, %count;
    @%p_more bra $L__dequant_cb_multi_loop;

$L__dequant_cb_multi_done:
    ret;
}
// .globl j2k_dequantize_htj2k_cleanup_jobs_multi
.visible .entry j2k_dequantize_htj2k_cleanup_jobs_multi(
    .param .u64 j2k_dequantize_htj2k_cleanup_jobs_multi_param_0
)
{
    // In-place conversion of 32-bit words to scaled f32 values, driven by a
    // table of 64-byte records (one record per CTA, selected with ctaid.x).
    //
    //   param_0 : generic pointer to the record table.
    //
    // Record fields read here (byte offsets from the record start):
    //   +0  u64 pointer to the buffer that is read and overwritten
    //   +12 width   (divisor that turns a flat index into a row number)
    //   +16 rows    (rows * width = number of elements to process)
    //   +36 word subtracted from 31 to obtain the right-shift amount
    //   +44 stride  (buffer elements between consecutive rows)
    //   +48 origin  (buffer index of the first element)
    //   +52 multiplier: loaded as a 32-bit word, then reinterpreted as f32
    //
    // Each thread starts at tid.x and advances by ntid.x until the element
    // count is reached.
    .reg .pred %p_none, %p_neg, %p_more;
    .reg .f32 %scale, %level, %result;
    .reg .b32 %cta, %rows, %width, %count, %idx, %origin, %scale_bits, %stride;
    .reg .b32 %row_gap, %shift_src, %k31, %shift, %step, %row, %biased, %elem;
    .reg .b32 %raw, %mag, %mag_shr, %mag_neg, %signed_mag;
    .reg .b64 %tab_arg, %tab, %rec_off, %rec, %buf_gen, %buf, %elem_off, %elem_ptr;

    // rec = table + ctaid.x * 64
    ld.param.u64 %tab_arg, [j2k_dequantize_htj2k_cleanup_jobs_multi_param_0];
    cvta.to.global.u64 %tab, %tab_arg;
    mov.u32 %cta, %ctaid.x;
    mul.wide.u32 %rec_off, %cta, 64;
    add.s64 %rec, %tab, %rec_off;

    // count = rows * width; threads whose first index is past it leave now.
    ld.global.u32 %rows, [%rec+16];
    ld.global.u32 %width, [%rec+12];
    mul.lo.s32 %count, %rows, %width;
    mov.u32 %idx, %tid.x;
    setp.ge.u32 %p_none, %idx, %count;
    @%p_none bra $L__dequant_jobs_done;

    // Loop-invariant values.
    ld.global.u64 %buf_gen, [%rec];
    ld.global.v2.u32 {%origin, %scale_bits}, [%rec+48];
    ld.global.u32 %stride, [%rec+44];
    sub.s32 %row_gap, %stride, %width;
    ld.global.u32 %shift_src, [%rec+36];
    mov.u32 %k31, 31;
    sub.s32 %shift, %k31, %shift_src;
    mov.u32 %step, %ntid.x;
    mov.b32 %scale, %scale_bits;
    cvta.to.global.u64 %buf, %buf_gen;

$L__dequant_jobs_loop:
    // elem = origin + idx + (stride - width) * (idx / width)
    div.u32 %row, %idx, %width;
    add.s32 %biased, %idx, %origin;
    mad.lo.s32 %elem, %row_gap, %row, %biased;
    mul.wide.u32 %elem_off, %elem, 4;
    add.s64 %elem_ptr, %buf, %elem_off;

    // Bit 31 is used as a sign flag, the low 31 bits as a magnitude that is
    // shifted right before the sign is re-applied.
    ld.global.u32 %raw, [%elem_ptr];
    and.b32 %mag, %raw, 0x7FFFFFFF;
    shr.u32 %mag_shr, %mag, %shift;
    neg.s32 %mag_neg, %mag_shr;
    setp.lt.s32 %p_neg, %raw, 0;
    selp.b32 %signed_mag, %mag_neg, %mag_shr, %p_neg;

    // Convert to f32, scale, and overwrite the word that was just read.
    cvt.rn.f32.s32 %level, %signed_mag;
    mul.rn.f32 %result, %scale, %level;
    st.global.f32 [%elem_ptr], %result;

    add.s32 %idx, %idx, %step;
    setp.lt.u32 %p_more, %idx, %count;
    @%p_more bra $L__dequant_jobs_loop;

$L__dequant_jobs_done:
    ret;
}