; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"
%if ARCH_X86_64
SECTION_RODATA 32
pd_1321_2482: dd 1321, 1321, 1321, 1321, 2482, 2482, 2482, 2482
itx4_shuf: dd 0x50401600, 0xd0c09284, 0x70603422, 0xf0e0b0a6
dd 0x50401701, 0xd0c09385, 0x70603523, 0xf0e0b1a7
pw_2048_m2048: dw 2048, 2048, 2048, 2048, -2048, -2048, -2048, -2048
iadst4_dconly2a: dw 10568, 10568, 10568, 10568, 19856, 19856, 19856, 19856
idct4_shuf: db 0, 1, 4, 5, 12, 13, 8, 9, 2, 3, 6, 7, 14, 15, 10, 11
idct32_shuf: db 0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 10, 11, 6, 7, 14, 15
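; COEF_PAIR stores both coefficients of a rotation as one 16-byte
; constant so a single vbroadcasti128 can feed a packed butterfly;
; e.g. "COEF_PAIR 201, 995" expands to
;   pd_201_995: dd 201, 201, 995, 995
; with pd_201/pd_995 as vpbroadcastd-able aliases into it.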
%macro COEF_PAIR 2
pd_%1_%2: dd %1, %1, %2, %2
%define pd_%1 (pd_%1_%2 + 4*0)
%define pd_%2 (pd_%1_%2 + 4*2)
%endmacro
COEF_PAIR 201, 995
COEF_PAIR 401, 1931
COEF_PAIR 799, 3406
COEF_PAIR 1380, 601
COEF_PAIR 1751, 2440
COEF_PAIR 2598, 1189
COEF_PAIR 2751, 2106
COEF_PAIR 2896, 1567
COEF_PAIR 2896, 3784
COEF_PAIR 3035, 3513
COEF_PAIR 3166, 3920
COEF_PAIR 3703, 3290
COEF_PAIR 3857, 4052
COEF_PAIR 4017, 2276
COEF_PAIR 4076, 3612
COEF_PAIR 4091, 3973
%define pd_1321 (pd_1321_2482 + 4*0)
%define pd_2482 (pd_1321_2482 + 4*4)
pd_m601: dd -601
pd_m1189: dd -1189
pd_m1380: dd -1380
pd_m2106: dd -2106
pd_m2598: dd -2598
pd_m2751: dd -2751
pd_m3344: dd -3344
pd_3803: dd 3803
pd_5793: dd 5793
pd_6144: dd 6144 ; 2048 + 4096
pd_10239: dd 10239 ; 2048 + 8192 - 1
pd_10240: dd 10240 ; 2048 + 8192
pd_11586: dd 11586 ; 5793 * 2
pd_38912: dd 38912 ; 2048 + 4096 + 32768
pixel_max: times 2 dw 0x03ff ; 10bpc
clip_min: dd -0x20000
clip_max: dd 0x1ffff
idct64_mul_16bpc:
dd 4095, 101, 2967, -2824, 3745, 1660, 3822, -1474, 401, 4076, 799, 4017
dd -700, 4036, 2359, 3349, -2191, 3461, 897, 3996, -2598, -3166, -4017, -799
dd 4065, 501, 3229, -2520, 3564, 2019, 3948, -1092, 1931, 3612, 3406, 2276
dd -301, 4085, 2675, 3102, -1842, 3659, 1285, 3889, -1189, -3920, -2276, -3406
cextern deint_shuf
cextern idct64_mul
cextern pw_1697x8
cextern pw_1697x16
cextern pw_1567_3784
cextern pw_m1567_m3784
cextern pw_m3784_1567
cextern pw_2896_2896
cextern pw_m2896_2896
cextern pw_5
cextern pw_2048
cextern pw_4096
cextern pw_8192
cextern pw_16384
cextern pw_2896x8
cextern pd_2048
cextern idct_4x8_internal_8bpc_avx2.main
cextern idct_4x16_internal_8bpc_avx2.main
cextern idct_8x8_internal_8bpc_avx2.main
cextern idct_8x16_internal_8bpc_avx2.main
cextern idct_16x4_internal_8bpc_avx2.main
cextern idct_16x8_internal_8bpc_avx2.main
cextern idct_16x16_internal_8bpc_avx2.main
cextern inv_txfm_add_dct_dct_8x32_8bpc_avx2.main
cextern inv_txfm_add_dct_dct_8x32_8bpc_avx2.main_fast
cextern inv_txfm_add_dct_dct_16x32_8bpc_avx2.main_oddhalf
cextern inv_txfm_add_dct_dct_16x32_8bpc_avx2.main_oddhalf_fast
cextern inv_txfm_add_dct_dct_16x64_8bpc_avx2.main_part1
cextern inv_txfm_add_dct_dct_16x64_8bpc_avx2.main_part2_internal
cextern iadst_4x4_internal_8bpc_avx2.main
cextern iadst_4x8_internal_8bpc_avx2.main_pass2
cextern iadst_4x16_internal_8bpc_avx2.main2
cextern iadst_8x4_internal_8bpc_avx2.main
cextern iadst_8x8_internal_8bpc_avx2.main_pass2
cextern iadst_8x16_internal_8bpc_avx2.main
cextern iadst_8x16_internal_8bpc_avx2.main_pass2_end
cextern iadst_16x4_internal_8bpc_avx2.main
cextern iadst_16x8_internal_8bpc_avx2.main
cextern iadst_16x8_internal_8bpc_avx2.main_pass2_end
cextern iadst_16x16_internal_8bpc_avx2.main
cextern iadst_16x16_internal_8bpc_avx2.main_pass2_end
SECTION .text
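; REPX applies an instruction template (with x as the placeholder) to
; each remaining argument, e.g. "REPX {psrad x, 12}, m0, m1" expands
; to "psrad m0, 12" followed by "psrad m1, 12".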
%macro REPX 2-*
%xdefine %%f(x) %1
%rep %0 - 1
%rotate 1
%%f(%1)
%endrep
%endmacro
%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
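; WRAP_XMM runs a single instruction or macro invocation with XMM
; registers, then switches back to YMM.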
%macro WRAP_XMM 1+
INIT_XMM cpuname
%1
INIT_YMM cpuname
%endmacro
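; Scalar sketch of the 4-point inverse WHT computed below (one row
; per 128-bit lane; ordering per the lane comments):
;   t0 = in0 + in1          t2 = in2 - in3
;   e  = (t0 - t2) >> 1
;   t1 = e - in1            t3 = e - in3
;   out0 = t0 - t3   out1 = t3   out2 = t1   out3 = t2 + t1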
%macro IWHT4_1D_PACKED 0
; m0 = in0 in2, m1 = in1 in3
psubd m2, m0, m1 ; t2
paddd xm0, xm1 ; t0
vpermq m2, m2, q3322
vpermq m0, m0, q1100
vpermq m1, m1, q3120
psubd m3, m0, m2
psrad m3, 1
psubd m3, m1 ; t1 t3
psubd m0, m3 ; ____ out0
paddd m2, m3 ; out3 ____
%endmacro
INIT_YMM avx2
cglobal inv_txfm_add_wht_wht_4x4_16bpc, 3, 7, 6, dst, stride, c, eob, bdmax
mova xm0, [cq+16*0]
vinserti128 m0, [cq+16*2], 1
mova xm1, [cq+16*1]
vinserti128 m1, [cq+16*3], 1
pxor m4, m4
mova [cq+32*0], m4
mova [cq+32*1], m4
lea r6, [dstq+strideq*2]
psrad m0, 2
psrad m1, 2
IWHT4_1D_PACKED
punpckhdq m0, m3
punpckldq m3, m2
punpckhqdq m1, m0, m3
punpcklqdq m0, m3
IWHT4_1D_PACKED
vpblendd m0, m2, 0x33
packssdw m0, m3
vextracti128 xm2, m0, 1
punpckhdq xm1, xm0, xm2 ; out2 out1
punpckldq xm0, xm2 ; out3 out0
movq xm2, [r6 +strideq*1]
movhps xm2, [dstq+strideq*0]
movq xm3, [r6 +strideq*0]
movhps xm3, [dstq+strideq*1]
%ifidn bdmaxd, bdmaxm
movd xm5, bdmaxd
vpbroadcastw xm5, xm5
%else ; win64: load from stack
vpbroadcastw xm5, bdmaxm
%endif
paddsw xm0, xm2
paddsw xm1, xm3
pmaxsw xm0, xm4
pmaxsw xm1, xm4
pminsw xm0, xm5
pminsw xm1, xm5
movhps [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm1
movq [r6 +strideq*0], xm1
movq [r6 +strideq*1], xm0
RET
; dst1 = (src1 * coef1 - src2 * coef2 + rnd) >> 12
; dst2 = (src1 * coef2 + src2 * coef1 + rnd) >> 12
; flags: 1 = packed, 2 = inv_dst1, 4 = inv_dst2
; skip round/shift if rnd is not a number
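; With the packed flag, the coef names refer to a COEF_PAIR (e.g.
; 2896_1567) and are loaded with vbroadcasti128, so the two 128-bit
; lanes are rotated by different coefficient pairs.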
%macro ITX_MULSUB_2D 8-9 0 ; dst/src[1-2], tmp[1-3], rnd, coef[1-2], flags
%if %8 < 32
pmulld m%4, m%1, m%8
pmulld m%3, m%2, m%8
%else
%if %9 & 1
vbroadcasti128 m%3, [pd_%8]
%else
vpbroadcastd m%3, [pd_%8]
%endif
pmulld m%4, m%1, m%3
pmulld m%3, m%2
%endif
%if %7 < 32
pmulld m%1, m%7
pmulld m%2, m%7
%else
%if %9 & 1
vbroadcasti128 m%5, [pd_%7]
%else
vpbroadcastd m%5, [pd_%7]
%endif
pmulld m%1, m%5
pmulld m%2, m%5
%endif
%if %9 & 4
psubd m%4, m%6, m%4
psubd m%2, m%4, m%2
%else
%ifnum %6
paddd m%4, m%6
%endif
paddd m%2, m%4
%endif
%if %9 & 2 ; invert the upper half of dst1 before rounding
vbroadcasti128 m%4, [pw_2048_m2048]
psubd m%1, m%3
psignd m%1, m%4
paddd m%1, m%6
%else
%ifnum %6
paddd m%1, m%6
%endif
psubd m%1, m%3
%endif
%ifnum %6
psrad m%2, 12
psrad m%1, 12
%endif
%endmacro
%macro INV_TXFM_FN 4 ; type1, type2, eob_offset, size
cglobal inv_txfm_add_%1_%2_%4_16bpc, 4, 5, 0, dst, stride, c, eob, tx2
%define %%p1 m(i%1_%4_internal_16bpc)
; Jump to the 1st txfm function if we're not taking the fast path, which
; in turn performs an indirect jump to the 2nd txfm function.
lea tx2q, [m(i%2_%4_internal_16bpc).pass2]
%ifidn %1_%2, dct_dct
test eobd, eobd
jnz %%p1
%else
%if %3
add eobd, %3
%endif
; jump to the 1st txfm function unless it's located directly after this
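; ((%%end - %%p1) >> 31) & 1 is the sign bit of the distance, which is
; only set when %%p1 lies beyond %%end; "times ... jmp" therefore emits
; nothing when the function immediately follows and a jmp otherwise.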
times ((%%end - %%p1) >> 31) & 1 jmp %%p1
ALIGN function_align
%%end:
%endif
%endmacro
%macro INV_TXFM_4X4_FN 2 ; type1, type2
INV_TXFM_FN %1, %2, 0, 4x4
%ifidn %1_%2, dct_dct
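; DC-only path: the first-pass DC is done in scalar as
; (dc * 2896 + 2048) >> 12; pmulhrsw with pw_2896x8 then gives
; (x * 2896 + 2048) >> 12 for the second pass, and the shared iadst
; .end code applies the final pw_2048 rounding and store.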
imul r6d, [cq], 2896
movd xm1, [pw_2896x8]
mov [cq], eobd ; 0
add r6d, 2048
sar r6d, 12
movd xm0, r6d
packssdw xm0, xm0
pmulhrsw xm0, xm1
vpbroadcastw xm0, xm0
mova xm1, xm0
jmp m(iadst_4x4_internal_16bpc).end
%endif
%endmacro
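; 4-point inverse DCT, two rows packed per register: one packed
; ITX_MULSUB_2D yields the even butterfly (in0 +/- in2) * 2896 >> 12
; (t0/t1) in one lane and the 1567/3784 rotation of in1/in3 (t3/t2)
; in the other; the final add/sub pairs then give
;   out0 = t0 + t3, out1 = t1 + t2, out2 = t1 - t2, out3 = t0 - t3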
%macro IDCT4_1D_PACKED 6 ; dst/src[1-2], tmp[1-3], rnd
ITX_MULSUB_2D %1, %2, %3, %4, %5, %6, 2896_1567, 2896_3784, 1
punpckhqdq m%3, m%2, m%1 ; t3 t2
punpcklqdq m%2, m%1 ; t0 t1
paddd m%1, m%2, m%3 ; out0 out1
psubd m%2, m%3 ; out3 out2
%endmacro
%macro IDCT4_1D_PACKED_WORD 6 ; dst/src[1-2], tmp[1-3], rnd
vpbroadcastd m%5, [pw_m3784_1567]
punpckhwd m%3, m%2, m%1
psubw m%4, m%1, m%2
paddw m%1, m%2
vpbroadcastd m%2, [pw_1567_3784]
punpcklqdq m%1, m%4
vpbroadcastd m%4, [pw_2896x8]
pmaddwd m%5, m%3
pmaddwd m%3, m%2
pmulhrsw m%1, m%4 ; t0 t1
paddd m%5, m%6
paddd m%3, m%6
psrad m%5, 12
psrad m%3, 12
packssdw m%3, m%5 ; t3 t2
psubsw m%2, m%1, m%3 ; out3 out2
paddsw m%1, m%3 ; out0 out1
%endmacro
INV_TXFM_4X4_FN dct, dct
INV_TXFM_4X4_FN dct, identity
INV_TXFM_4X4_FN dct, adst
INV_TXFM_4X4_FN dct, flipadst
cglobal idct_4x4_internal_16bpc, 0, 7, 6, dst, stride, c, eob, tx2
vpermq m0, [cq+32*0], q3120
vpermq m1, [cq+32*1], q3120
vpbroadcastd m5, [pd_2048]
IDCT4_1D_PACKED 0, 1, 2, 3, 4, 5
vbroadcasti128 m2, [idct4_shuf]
packssdw m0, m1
pshufb m0, m2
jmp tx2q
.pass2:
vextracti128 xm1, m0, 1
WRAP_XMM IDCT4_1D_PACKED_WORD 0, 1, 2, 3, 4, 5
packssdw xm5, xm5 ; pw_2048
pmulhrsw xm0, xm5
pmulhrsw xm1, xm5
movq xm2, [dstq+strideq*0]
movhps xm2, [dstq+strideq*1]
lea r6, [dstq+strideq*2]
movq xm3, [r6 +strideq*1]
movhps xm3, [r6 +strideq*0]
vpbroadcastd xm5, [pixel_max]
pxor m4, m4
mova [cq+32*0], m4
mova [cq+32*1], m4
paddw xm0, xm2
paddw xm1, xm3
pmaxsw xm0, xm4
pmaxsw xm1, xm4
pminsw xm0, xm5
pminsw xm1, xm5
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
movhps [r6 +strideq*0], xm1
movq [r6 +strideq*1], xm1
RET
INV_TXFM_4X4_FN adst, dct
INV_TXFM_4X4_FN adst, adst
INV_TXFM_4X4_FN adst, flipadst
INV_TXFM_4X4_FN adst, identity
cglobal iadst_4x4_internal_16bpc, 0, 7, 6, dst, stride, c, eob, tx2
call .main
vpermd m0, m4, m0
psrld m4, 4
pshufb m0, m4
jmp tx2q
.pass2:
lea rax, [deint_shuf+128]
vextracti128 xm1, m0, 1
call m(iadst_4x4_internal_8bpc).main
.end:
vpbroadcastd xm4, [pw_2048]
movq xm2, [dstq+strideq*0]
movhps xm2, [dstq+strideq*1]
lea r6, [dstq+strideq*2]
movq xm3, [r6 +strideq*0]
movhps xm3, [r6 +strideq*1]
vpbroadcastd xm5, [pixel_max]
pmulhrsw xm0, xm4
pmulhrsw xm1, xm4
pxor m4, m4
mova [cq+32*0], m4
mova [cq+32*1], m4
paddw xm0, xm2
paddw xm1, xm3
pmaxsw xm0, xm4
pmaxsw xm1, xm4
pminsw xm0, xm5
pminsw xm1, xm5
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
movq [r6 +strideq*0], xm1
movhps [r6 +strideq*1], xm1
RET
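; Scalar sketch of the AV1 4-point inverse ADST implemented by .main
; (sinpi coefficients 1321/2482/3344/3803; 3344 is applied negated
; via pd_m3344):
;   out0 = (1321*in0 + 3803*in2 + 2482*in3 + 3344*in1 + 2048) >> 12
;   out1 = (2482*in0 - 1321*in2 - 3803*in3 + 3344*in1 + 2048) >> 12
;   out2 = (3344*(in0 - in2 + in3)                    + 2048) >> 12
;   out3 = (3803*in0 + 2482*in2 - 1321*in3 - 3344*in1 + 2048) >> 12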
ALIGN function_align
.main:
mova m2, [cq+16*2]
mova m0, [pd_1321_2482]
vpbroadcastd m3, [pd_3803]
vbroadcasti128 m5, [cq+16*0]
vpbroadcastd m1, [pd_m3344]
pmulld m4, m0, m2
pmulld m3, m2
pmulld m0, m5
vpbroadcastd m5, [pd_2048]
psubd xm2, [cq+16*3]
psubd m2, [cq+16*0]
pmulld m2, m1 ; t2 t3
vpermq m4, m4, q1032
paddd m4, m3
psubd m0, m4
paddd xm4, xm4
paddd m4, m0 ; t0 t1
vinserti128 m3, m2, xm4, 1 ; t2 t0
paddd m0, m4, m5
psubd xm4, xm2
psubd m1, m0, m2
vpermq m2, m2, q3232 ; t3 t3
psubd m1, m4
mova m4, [itx4_shuf]
paddd m0, m2 ; out0 out1
paddd m1, m3 ; out2 out3
psrad m0, 12
psrad m1, 12
packssdw m0, m1
ret
INV_TXFM_4X4_FN flipadst, dct
INV_TXFM_4X4_FN flipadst, adst
INV_TXFM_4X4_FN flipadst, flipadst
INV_TXFM_4X4_FN flipadst, identity
cglobal iflipadst_4x4_internal_16bpc, 0, 7, 6, dst, stride, c, eob, tx2
call m(iadst_4x4_internal_16bpc).main
psrld m1, m4, 8
vpermd m0, m1, m0
psrld m4, 4
pshufb m0, m4
jmp tx2q
.pass2:
lea rax, [deint_shuf+128]
vextracti128 xm1, m0, 1
call m(iadst_4x4_internal_8bpc).main
vpbroadcastd xm4, [pw_2048]
movq xm3, [dstq+strideq*1]
movhps xm3, [dstq+strideq*0]
lea r6, [dstq+strideq*2]
movq xm2, [r6 +strideq*1]
movhps xm2, [r6 +strideq*0]
vpbroadcastd xm5, [pixel_max]
pmulhrsw xm0, xm4
pmulhrsw xm1, xm4
pxor m4, m4
mova [cq+32*0], m4
mova [cq+32*1], m4
paddw xm0, xm2
paddw xm1, xm3
pmaxsw xm0, xm4
pmaxsw xm1, xm4
pminsw xm0, xm5
pminsw xm1, xm5
movhps [dstq+strideq*0], xm1
movq [dstq+strideq*1], xm1
movhps [r6 +strideq*0], xm0
movq [r6 +strideq*1], xm0
RET
INV_TXFM_4X4_FN identity, dct
INV_TXFM_4X4_FN identity, adst
INV_TXFM_4X4_FN identity, flipadst
INV_TXFM_4X4_FN identity, identity
cglobal iidentity_4x4_internal_16bpc, 0, 7, 6, dst, stride, c, eob, tx2
vpbroadcastd m1, [pd_5793]
pmulld m0, m1, [cq+32*0]
pmulld m1, [cq+32*1]
vpbroadcastd m5, [pd_2048]
mova m3, [itx4_shuf]
paddd m0, m5
paddd m1, m5
psrad m0, 12
psrad m1, 12
packssdw m0, m1
vpermd m0, m3, m0
psrld m3, 4
pshufb m0, m3
jmp tx2q
.pass2:
vpbroadcastd m1, [pw_1697x8]
movq xm2, [dstq+strideq*0]
movhps xm2, [dstq+strideq*1]
lea r6, [dstq+strideq*2]
pmulhrsw m1, m0
paddsw m0, m1
movq xm3, [r6 +strideq*0]
movhps xm3, [r6 +strideq*1]
vpbroadcastd xm4, [pixel_max]
packssdw m5, m5 ; pw_2048
pmulhrsw m0, m5
pxor m5, m5
mova [cq+32*0], m5
mova [cq+32*1], m5
vextracti128 xm1, m0, 1
paddw xm0, xm2
paddw xm1, xm3
pmaxsw xm0, xm5
pmaxsw xm1, xm5
pminsw xm0, xm4
pminsw xm1, xm4
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
movq [r6 +strideq*0], xm1
movhps [r6 +strideq*1], xm1
RET
%macro INV_TXFM_4X8_FN 2 ; type1, type2
INV_TXFM_FN %1, %2, 0, 4x8
%ifidn %1_%2, dct_dct
imul r6d, [cq], 2896
mov [cq], eobd ; 0
mov r3d, 8
add r6d, 2048
sar r6d, 12
imul r6d, 2896
add r6d, 2048
sar r6d, 12
.end:
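; (x*2896 + 34816) >> 16 == (((x*2896 + 2048) >> 12) + 8) >> 4,
; i.e. the second-pass DC scale fused with the final rounding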
imul r6d, 2896
add r6d, 34816
sar r6d, 16
movd xm0, r6d
vpbroadcastw xm0, xm0
.end2:
vpbroadcastd xm3, [pixel_max]
pxor xm2, xm2
.end_loop:
movq xm1, [dstq+strideq*0]
movhps xm1, [dstq+strideq*1]
paddw xm1, xm0
pmaxsw xm1, xm2
pminsw xm1, xm3
movq [dstq+strideq*0], xm1
movhps [dstq+strideq*1], xm1
lea dstq, [dstq+strideq*2]
sub r3d, 2
jg .end_loop
WRAP_XMM RET
%endif
%endmacro
%macro IDCT4_1D 8 ; src[1-4], tmp[1-3], rnd
ITX_MULSUB_2D %2, %4, %5, %6, %7, %8, 1567, 3784 ; t2, t3
vpbroadcastd m%5, [pd_2896]
pmulld m%1, m%5
pmulld m%3, m%5
paddd m%1, m%8
paddd m%5, m%1, m%3
psubd m%1, m%3
psrad m%5, 12 ; t0
psrad m%1, 12 ; t1
psubd m%3, m%1, m%2
paddd m%2, m%1
paddd m%1, m%5, m%4
psubd m%4, m%5, m%4
%endmacro
INV_TXFM_4X8_FN dct, dct
INV_TXFM_4X8_FN dct, identity
INV_TXFM_4X8_FN dct, adst
INV_TXFM_4X8_FN dct, flipadst
cglobal idct_4x8_internal_16bpc, 0, 7, 8, dst, stride, c, eob, tx2
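; rectangular 2:1 transforms pre-scale the coefficients by 2896/4096
; (i.e. 1/sqrt(2)), hence the pd_2896 multiply before the first pass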
vpbroadcastd m3, [pd_2896]
pmulld m0, m3, [cq+32*0]
pmulld m1, m3, [cq+32*1]
pmulld m2, m3, [cq+32*2]
pmulld m3, m3, [cq+32*3]
vpbroadcastd m7, [pd_2048]
REPX {paddd x, m7}, m0, m1, m2, m3
REPX {psrad x, 12}, m0, m1, m2, m3
IDCT4_1D 0, 1, 2, 3, 4, 5, 6, 7
jmp tx2q
.pass2:
packssdw m0, m2
packssdw m1, m3
lea rax, [deint_shuf+128]
punpckhwd m2, m0, m1
punpcklwd m0, m1
punpckhdq m1, m0, m2 ; 2 3
punpckldq m0, m2 ; 0 1
vextracti128 xm2, m0, 1 ; 4 5
vextracti128 xm3, m1, 1 ; 6 7
call m(idct_4x8_internal_8bpc).main
vpbroadcastd xm4, [pw_2048]
REPX {pmulhrsw x, xm4}, xm0, xm1, xm2, xm3
lea r3, [strideq*3]
lea r6, [dstq+strideq*4]
movq xm4, [dstq+strideq*0]
movhps xm4, [dstq+strideq*1]
movq xm5, [dstq+r3 ]
movhps xm5, [dstq+strideq*2]
movq xm6, [r6 +strideq*0]
movhps xm6, [r6 +strideq*1]
movq xm7, [r6 +r3 ]
movhps xm7, [r6 +strideq*2]
paddw xm0, xm4 ; 0 1
paddw xm1, xm5 ; 3 2
paddw xm2, xm6 ; 4 5
paddw xm3, xm7 ; 7 6
vpbroadcastd xm5, [pixel_max]
pxor m4, m4
REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
REPX {pmaxsw x, xm4}, xm0, xm1, xm2, xm3
REPX {pminsw x, xm5}, xm0, xm1, xm2, xm3
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
movhps [dstq+strideq*2], xm1
movq [dstq+r3 ], xm1
movq [r6 +strideq*0], xm2
movhps [r6 +strideq*1], xm2
movhps [r6 +strideq*2], xm3
movq [r6 +r3 ], xm3
RET
INV_TXFM_4X8_FN adst, dct
INV_TXFM_4X8_FN adst, adst
INV_TXFM_4X8_FN adst, flipadst
INV_TXFM_4X8_FN adst, identity
cglobal iadst_4x8_internal_16bpc, 0, 7, 8, dst, stride, c, eob, tx2
call m(iadst_8x4_internal_16bpc).main
psrad m0, m4, 12
psrad m1, m5, 12
psrad m2, 12
psrad m3, 12
jmp tx2q
.pass2:
call .pass2_main
mova xm4, [pw_2048_m2048]
REPX {pmulhrsw x, xm4}, xm0, xm1, xm2, xm3
.end:
lea r3, [strideq*3]
lea r6, [dstq+strideq*4]
movq xm4, [dstq+strideq*0]
movhps xm4, [dstq+strideq*1]
movq xm5, [dstq+strideq*2]
movhps xm5, [dstq+r3 ]
movq xm6, [r6 +strideq*0]
movhps xm6, [r6 +strideq*1]
movq xm7, [r6 +strideq*2]
movhps xm7, [r6 +r3 ]
paddw xm0, xm4 ; 0 1
paddw xm1, xm5 ; 2 3
paddw xm2, xm6 ; 4 5
paddw xm3, xm7 ; 6 7
vpbroadcastd xm5, [pixel_max]
pxor m4, m4
REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
REPX {pmaxsw x, xm4}, xm0, xm1, xm2, xm3
REPX {pminsw x, xm5}, xm0, xm1, xm2, xm3
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
movq [dstq+strideq*2], xm1
movhps [dstq+r3 ], xm1
movq [r6 +strideq*0], xm2
movhps [r6 +strideq*1], xm2
movq [r6 +strideq*2], xm3
movhps [r6 +r3 ], xm3
RET
ALIGN function_align
.pass2_main:
packssdw m0, m2
packssdw m1, m3
lea rax, [deint_shuf+128]
punpcklwd m4, m0, m1
punpckhwd m0, m1
punpckhdq m5, m4, m0
punpckldq m4, m0
vextracti128 xm2, m4, 1 ; 4 5
vextracti128 xm3, m5, 1 ; 6 7
pshufd xm4, xm4, q1032 ; 1 0
pshufd xm5, xm5, q1032 ; 3 2
jmp m(iadst_4x8_internal_8bpc).main_pass2
ALIGN function_align
.main:
vbroadcasti128 m0, [cq+16*0]
vbroadcasti128 m2, [cq+16*2]
vbroadcasti128 m3, [cq+16*5]
vbroadcasti128 m1, [cq+16*7]
vpbroadcastd m6, [pd_2896]
shufpd m0, m2, 0x0c ; 0 2
shufpd m1, m3, 0x0c ; 7 5
vbroadcasti128 m2, [cq+16*4]
vbroadcasti128 m4, [cq+16*6]
vbroadcasti128 m5, [cq+16*1]
vbroadcasti128 m3, [cq+16*3]
vpbroadcastd m7, [pd_2048]
vpbroadcastd m8, [clip_min]
vpbroadcastd m9, [clip_max]
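; clip_min/clip_max clamp intermediates to the signed 18-bit range
; (+-2^17) used for 10bpc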
shufpd m2, m4, 0x0c ; 4 6
shufpd m3, m5, 0x0c ; 3 1
REPX {pmulld x, m6}, m0, m1, m2, m3
REPX {paddd x, m7}, m0, m1, m2, m3
REPX {psrad x, 12}, m0, m1, m2, m3
ITX_MULSUB_2D 1, 0, 4, 5, 6, 7, 401_1931, 4076_3612, 1
ITX_MULSUB_2D 3, 2, 4, 5, 6, 7, 3166_3920, 2598_1189, 1
psubd m4, m0, m2 ; t4 t6
paddd m0, m2 ; t0 t2
psubd m2, m1, m3 ; t5 t7
paddd m1, m3 ; t1 t3
REPX {pmaxsd x, m8}, m4, m2, m0, m1
REPX {pminsd x, m9}, m4, m2, m0, m1
pxor m5, m5
psubd m5, m4
vpblendd m4, m2, 0xcc ; t4 t7
vpblendd m2, m5, 0xcc ; t5 -t6
ITX_MULSUB_2D 4, 2, 3, 5, 6, 7, 1567, 3784
vpbroadcastd m5, [pd_2896]
vbroadcasti128 m6, [pw_2048_m2048] ; + + - -
punpckhqdq m3, m0, m1
punpcklqdq m0, m1
psubd m1, m0, m3 ; t2 t3
paddd m0, m3 ; out0 -out7
punpckhqdq m3, m4, m2 ; t7a t6a
punpcklqdq m4, m2 ; t5a t4a
psubd m2, m4, m3 ; t7 t6
paddd m4, m3 ; out6 -out1
REPX {pmaxsd x, m8}, m1, m2
REPX {pminsd x, m9}, m1, m2
vpblendd m3, m1, m2, 0xcc
shufpd m1, m2, 0x05
pmulld m3, m5
pmulld m5, m1
psignd m0, m6 ; out0 out7
psignd m4, m6 ; out6 out1
paddd m3, m7
psubd m2, m3, m5
paddd m5, m3
psrad m2, 12 ; out4 -out5
psrad m5, 12 ; -out3 out2
ret
INV_TXFM_4X8_FN flipadst, dct
INV_TXFM_4X8_FN flipadst, adst
INV_TXFM_4X8_FN flipadst, flipadst
INV_TXFM_4X8_FN flipadst, identity
cglobal iflipadst_4x8_internal_16bpc, 0, 7, 8, dst, stride, c, eob, tx2
call m(iadst_8x4_internal_16bpc).main
psrad m0, m3, 12
psrad m1, m2, 12
psrad m2, m5, 12
psrad m3, m4, 12
jmp tx2q
.pass2:
call m(iadst_4x8_internal_16bpc).pass2_main
mova xm4, [pw_2048_m2048]
REPX {pmulhrsw x, xm4}, xm3, xm2, xm1, xm0
lea r3, [strideq*3]
lea r6, [dstq+strideq*4]
movq xm4, [dstq+strideq*1]
movhps xm4, [dstq+strideq*0]
movq xm5, [dstq+r3 ]
movhps xm5, [dstq+strideq*2]
movq xm6, [r6 +strideq*1]
movhps xm6, [r6 +strideq*0]
movq xm7, [r6 +r3 ]
movhps xm7, [r6 +strideq*2]
paddw xm3, xm4 ; 1 0
paddw xm2, xm5 ; 3 2
paddw xm1, xm6 ; 5 4
paddw xm0, xm7 ; 7 6
vpbroadcastd xm5, [pixel_max]
pxor m4, m4
REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
REPX {pmaxsw x, xm4}, xm3, xm2, xm1, xm0
REPX {pminsw x, xm5}, xm3, xm2, xm1, xm0
movhps [dstq+strideq*0], xm3
movq [dstq+strideq*1], xm3
movhps [dstq+strideq*2], xm2
movq [dstq+r3 ], xm2
movhps [r6 +strideq*0], xm1
movq [r6 +strideq*1], xm1
movhps [r6 +strideq*2], xm0
movq [r6 +r3 ], xm0
RET
INV_TXFM_4X8_FN identity, dct
INV_TXFM_4X8_FN identity, adst
INV_TXFM_4X8_FN identity, flipadst
INV_TXFM_4X8_FN identity, identity
cglobal iidentity_4x8_internal_16bpc, 0, 7, 8, dst, stride, c, eob, tx2
vpbroadcastd m3, [pd_2896]
pmulld m0, m3, [cq+32*0]
pmulld m1, m3, [cq+32*1]
pmulld m2, m3, [cq+32*2]
pmulld m3, [cq+32*3]
vpbroadcastd m5, [pd_2048]
vpbroadcastd m4, [pd_5793]
REPX {paddd x, m5}, m0, m1, m2, m3
REPX {psrad x, 12}, m0, m1, m2, m3
REPX {pmulld x, m4}, m0, m1, m2, m3
REPX {paddd x, m5}, m0, m1, m2, m3
REPX {psrad x, 12}, m0, m1, m2, m3
jmp tx2q
.pass2:
vpbroadcastd m4, [pw_4096]
packssdw m0, m2
packssdw m1, m3
punpckhwd m2, m0, m1
punpcklwd m0, m1
pmulhrsw m2, m4
pmulhrsw m0, m4
punpckhdq m1, m0, m2 ; 2 3 6 7
punpckldq m0, m2 ; 0 1 4 5
lea r3, [strideq*3]
lea r6, [dstq+strideq*4]
movq xm2, [dstq+strideq*0]
movhps xm2, [dstq+strideq*1]
vpbroadcastq m4, [r6 +strideq*0]
vpbroadcastq m5, [r6 +strideq*1]
movq xm3, [dstq+strideq*2]
movhps xm3, [dstq+r3 ]
vpblendd m2, m4, 0x30
vpblendd m2, m5, 0xc0
vpbroadcastq m4, [r6 +strideq*2]
vpbroadcastq m5, [r6 +r3 ]
vpblendd m3, m4, 0x30
vpblendd m3, m5, 0xc0
vpbroadcastd m5, [pixel_max]
pxor m4, m4
REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
paddw m0, m2
paddw m1, m3
pmaxsw m0, m4
pmaxsw m1, m4
pminsw m0, m5
pminsw m1, m5
vextracti128 xm2, m0, 1
vextracti128 xm3, m1, 1
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
movq [dstq+strideq*2], xm1
movhps [dstq+r3 ], xm1
movq [r6 +strideq*0], xm2
movhps [r6 +strideq*1], xm2
movq [r6 +strideq*2], xm3
movhps [r6 +r3 ], xm3
RET
%macro INV_TXFM_4X16_FN 2 ; type1, type2
INV_TXFM_FN %1, %2, 0, 4x16
%ifidn %1_%2, dct_dct
imul r6d, [cq], 2896
mov [cq], eobd ; 0
mov r3d, 16
add r6d, 6144
sar r6d, 13
jmp m(inv_txfm_add_dct_dct_4x8_16bpc).end
%endif
%endmacro
INV_TXFM_4X16_FN dct, dct
INV_TXFM_4X16_FN dct, identity
INV_TXFM_4X16_FN dct, adst
INV_TXFM_4X16_FN dct, flipadst
cglobal idct_4x16_internal_16bpc, 0, 7, 11, dst, stride, c, eob, tx2
mova m1, [cq+32*2]
mova m3, [cq+32*6]
mova m5, [cq+32*3]
mova m7, [cq+32*7]
vpbroadcastd m4, [pd_3784]
vpbroadcastd m8, [pd_1567]
vpbroadcastd m9, [pd_2048]
vpbroadcastd m6, [pd_2896]
ITX_MULSUB_2D 1, 3, 0, 2, _, 9, 8, 4 ; t2l, t3l
ITX_MULSUB_2D 5, 7, 4, 2, _, 9, 8, 4 ; t2h, t3h
pmulld m0, m6, [cq+32*0]
pmulld m2, m6, [cq+32*4]
pmulld m4, m6, [cq+32*1]
pmulld m6, [cq+32*5]
vpbroadcastd m8, [pd_6144]
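; pd_6144 = 2048 + 4096: the extra 4096 becomes the +1 rounding bias
; for this pass's final >> 1 once the >> 12 below is applied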
paddd m0, m8
paddd m4, m8
paddd m8, m0, m2
psubd m0, m2
paddd m9, m4, m6
psubd m4, m6
REPX {psrad x, 12}, m8, m0, m9, m4 ; t0l, t1l, t0h, t1h
psubd m2, m0, m1
paddd m1, m0
psubd m6, m4, m5
paddd m5, m4
paddd m0, m8, m3
psubd m3, m8, m3
paddd m4, m9, m7
psubd m7, m9, m7
REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7
jmp tx2q
.pass2:
packssdw m0, m4
packssdw m1, m5
packssdw m2, m6
packssdw m3, m7
lea rax, [deint_shuf+128]
punpcklwd m4, m2, m3
punpckhwd m2, m3
punpckhwd m5, m0, m1
punpcklwd m0, m1
punpckhdq m1, m0, m4 ; 2 3
punpckldq m0, m4 ; 0 1
punpckldq m4, m5, m2 ; 8 9
punpckhdq m5, m2 ; a b
vextracti128 xm2, m0, 1 ; 4 5
vextracti128 xm3, m1, 1 ; 6 7
vextracti128 xm6, m4, 1 ; c d
vextracti128 xm7, m5, 1 ; e f
call m(idct_4x16_internal_8bpc).main
vpbroadcastd m9, [pw_2048]
vinserti128 m0, m0, xm1, 1 ; 0 1 3 2
vinserti128 m1, m2, xm3, 1 ; 4 5 7 6
vinserti128 m2, m4, xm5, 1 ; 8 9 b a
vinserti128 m3, m6, xm7, 1 ; c d f e
vpbroadcastd m8, [pixel_max]
lea r6, [strideq*3]
pxor m7, m7
pmulhrsw m0, m9
call .write_4x4
pmulhrsw m0, m1, m9
call .write_4x4
pmulhrsw m0, m2, m9
call .write_4x4
pmulhrsw m0, m3, m9
call .write_4x4
RET
ALIGN function_align
.write_4x4:
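; adds one 4x4 block (two rows per lane) to dst, clamps to
; [0, pixel_max] and zeroes the consumed coefficient rows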
movq xm4, [dstq+strideq*0]
movhps xm4, [dstq+strideq*1]
vpbroadcastq m5, [dstq+strideq*2]
vpbroadcastq m6, [dstq+r6 ]
mova [cq+32*0], m7
mova [cq+32*1], m7
add cq, 32*2
vpblendd m4, m5, 0xc0
vpblendd m4, m6, 0x30
paddw m4, m0
pmaxsw m4, m7
pminsw m4, m8
vextracti128 xm5, m4, 1
movq [dstq+strideq*0], xm4
movhps [dstq+strideq*1], xm4
movhps [dstq+strideq*2], xm5
movq [dstq+r6 ], xm5
lea dstq, [dstq+strideq*4]
ret
INV_TXFM_4X16_FN adst, dct
INV_TXFM_4X16_FN adst, adst
INV_TXFM_4X16_FN adst, flipadst
INV_TXFM_4X16_FN adst, identity
cglobal iadst_4x16_internal_16bpc, 0, 7, 11, dst, stride, c, eob, tx2
call m(iadst_16x4_internal_16bpc).main
psrad m0, m4, 13
psrad m1, m5, 13
psrad m2, 13
psrad m3, 13
psrad m4, m8, 13
psrad m5, m9, 13
psrad m6, 13
psrad m7, 13
jmp tx2q
.pass2:
call .pass2_main
vpbroadcastd m5, [pw_2048]
vpbroadcastd m8, [pixel_max]
lea r6, [strideq*3]
vpblendd m4, m3, m0, 0xcc ; -out3 out0 out2 -out1
pshufd m2, m2, q1032 ; -out11 out8 out10 -out9
vpblendd m3, m0, 0x33 ; -out15 out12 out14 -out13
pxor m7, m7
psubw m9, m7, m5
vpblendd m9, m5, 0x3c ; -2048 2048 2048 -2048
pmulhrsw m0, m4, m9
call .write_4x4
pmulhrsw m0, m1, m9
call .write_4x4
pmulhrsw m0, m2, m9
call .write_4x4
pmulhrsw m0, m3, m9
call .write_4x4
RET
ALIGN function_align
.write_4x4:
movq xm4, [dstq+r6 ]
movhps xm4, [dstq+strideq*0]
vpbroadcastq m5, [dstq+strideq*1]
vpbroadcastq m6, [dstq+strideq*2]
mova [cq+32*0], m7
mova [cq+32*1], m7
add cq, 32*2
vpblendd m4, m5, 0xc0
vpblendd m4, m6, 0x30
paddw m4, m0
pmaxsw m4, m7
pminsw m4, m8
vextracti128 xm5, m4, 1
movhps [dstq+strideq*0], xm4
movhps [dstq+strideq*1], xm5
movq [dstq+strideq*2], xm5
movq [dstq+r6 ], xm4
lea dstq, [dstq+strideq*4]
ret
ALIGN function_align
.pass2_main:
packssdw m0, m4
packssdw m1, m5
packssdw m2, m6
packssdw m3, m7
lea rax, [deint_shuf+128]
punpcklwd m4, m2, m3
punpckhwd m2, m3
punpckhwd m5, m0, m1
punpcklwd m0, m1
punpckhdq m1, m0, m4
punpckldq m0, m4
punpckldq m4, m5, m2
punpckhdq m5, m2
vpblendd m3, m0, m1, 0x33
vpblendd m0, m1, 0xcc
shufpd m2, m5, m4, 0x05
shufpd m4, m5, 0x05
vperm2i128 m1, m0, m3, 0x31 ; 4 7 6 5
vinserti128 m0, xm3, 1 ; 0 3 2 1
vperm2i128 m3, m2, m4, 0x31 ; c f e d
vinserti128 m2, xm4, 1 ; b 8 9 a
call m(iadst_4x16_internal_8bpc).main2
vpbroadcastd m5, [pw_2896x8]
paddsw m1, m2, m4
psubsw m2, m4
pmulhrsw m1, m5 ; -out7 out4 out6 -out5
pmulhrsw m2, m5 ; out8 -out11 -out9 out10
ret
ALIGN function_align
.main:
vbroadcasti128 m0, [cq+16* 0]
vbroadcasti128 m4, [cq+16* 2]
vbroadcasti128 m1, [cq+16*15]
vbroadcasti128 m5, [cq+16*13]
vbroadcasti128 m2, [cq+16* 4]
vbroadcasti128 m6, [cq+16* 6]
vbroadcasti128 m3, [cq+16*11]
vbroadcasti128 m7, [cq+16* 9]
shufpd m0, m4, 0x0c ; 0 2
shufpd m1, m5, 0x0c ; 15 13
shufpd m2, m6, 0x0c ; 4 6
shufpd m3, m7, 0x0c ; 11 9
vbroadcasti128 m4, [cq+16* 8]
vbroadcasti128 m6, [cq+16*10]
vbroadcasti128 m5, [cq+16* 7]
vbroadcasti128 m7, [cq+16* 5]
shufpd m4, m6, 0x0c ; 8 10
shufpd m5, m7, 0x0c ; 7 5
vbroadcasti128 m6, [cq+16*12]
vbroadcasti128 m7, [cq+16*14]
shufpd m6, m7, 0x0c ; 12 14
vbroadcasti128 m7, [cq+16* 3]
vbroadcasti128 m8, [cq+16* 1]
shufpd m7, m8, 0x0c ; 3 1
vpbroadcastd m11, [pd_2048]
ITX_MULSUB_2D 1, 0, 8, 9, 10, 11, 201_995, 4091_3973, 1
ITX_MULSUB_2D 3, 2, 8, 9, 10, 11, 1751_2440, 3703_3290, 1
ITX_MULSUB_2D 5, 4, 8, 9, 10, 11, 3035_3513, 2751_2106, 1
ITX_MULSUB_2D 7, 6, 8, 9, 10, 11, 3857_4052, 1380_601, 1
psubd m8, m0, m4 ; t8a t10a
paddd m0, m4 ; t0a t2a
psubd m4, m1, m5 ; t9a t11a
paddd m1, m5 ; t1a t3a
psubd m5, m2, m6 ; t12a t14a
paddd m2, m6 ; t4a t6a
psubd m6, m3, m7 ; t13a t15a
paddd m3, m7 ; t5a t7a
ITX_MULSUB_2D 8, 4, 7, 9, 10, 11, 799_3406, 4017_2276, 1
ITX_MULSUB_2D 6, 5, 7, 9, 10, 11, 4017_2276, 10, 1
psubd m7, m0, m2 ; t4 t6
paddd m0, m2 ; t0 t2
psubd m2, m1, m3 ; t5 t7
paddd m1, m3 ; t1 t3
psubd m3, m4, m6 ; t12a t14a
paddd m4, m6 ; t8a t10a
psubd m6, m8, m5 ; t13a t15a
paddd m8, m5 ; t9a t11a
punpcklqdq m5, m3, m7 ; t12a t4
punpckhqdq m3, m7 ; t14a t6
punpckhqdq m7, m6, m2 ; t15a t7
punpcklqdq m6, m2 ; t13a t5
ITX_MULSUB_2D 7, 3, 2, 9, 10, 11, 3784, 1567
ITX_MULSUB_2D 5, 6, 2, 9, 10, 11, 1567, 10
vpbroadcastd m10, [pd_2896]
vbroadcasti128 m9, [pw_2048_m2048] ; + + - -
punpckhqdq m2, m4, m0 ; t10a t2
punpcklqdq m4, m0 ; t8a t0
punpckhqdq m0, m8, m1 ; t11a t3
punpcklqdq m8, m1 ; t9a t1
paddd m1, m6, m7 ; out2 -out3
psubd m6, m7 ; t14a t6
paddd m7, m5, m3 ; -out13 out12
psubd m5, m3 ; t15a t7
psubd m3, m8, m0 ; t11 t3a
paddd m8, m0 ; out14 -out15
paddd m0, m4, m2 ; -out1 out0
psubd m4, m2 ; t10 t2a
REPX {pmulld x, m10}, m6, m5, m3, m4
paddd m6, m11
paddd m4, m11
paddd m2, m6, m5 ; -out5 out4
psubd m6, m5 ; out10 -out11
psubd m5, m4, m3 ; -out9 out8
paddd m3, m4 ; out6 -out7
REPX {psrad x, 12}, m2, m3, m5, m6
REPX {psignd x, m9}, m1, m8, m3, m6
pshufd m9, m9, q1032
REPX {psignd x, m9}, m0, m7, m2, m5
ret
INV_TXFM_4X16_FN flipadst, dct
INV_TXFM_4X16_FN flipadst, adst
INV_TXFM_4X16_FN flipadst, flipadst
INV_TXFM_4X16_FN flipadst, identity
cglobal iflipadst_4x16_internal_16bpc, 0, 7, 11, dst, stride, c, eob, tx2
call m(iadst_16x4_internal_16bpc).main
psrad m0, m3, 13
psrad m1, m2, 13
psrad m2, m5, 13
psrad m3, m4, 13
psrad m4, m7, 13
psrad m5, m6, 13
psrad m6, m9, 13
psrad m7, m8, 13
jmp tx2q
.pass2:
call m(iadst_4x16_internal_16bpc).pass2_main
vpbroadcastd m5, [pw_2048]
vpbroadcastd m8, [pixel_max]
lea r6, [strideq*3]
vpblendd m4, m3, m0, 0x33 ; -out0 out3 out1 -out2
pshufd m2, m2, q1032 ; -out11 out8 out10 -out9
vpblendd m3, m0, 0xcc ; -out12 out15 out13 -out14
pxor m7, m7
psubw m9, m7, m5
vpblendd m9, m5, 0x3c ; -2048 2048 2048 -2048
pmulhrsw m0, m4, m9
call .write_4x4
pmulhrsw m0, m2, m9
call .write_4x4
pmulhrsw m0, m1, m9
call .write_4x4
pmulhrsw m0, m3, m9
call .write_4x4
RET
ALIGN function_align
.write_4x4:
movq xm4, [dstq+strideq*0]
movhps xm4, [dstq+r6 ]
vpbroadcastq m5, [dstq+strideq*1]
vpbroadcastq m6, [dstq+strideq*2]
mova [cq+32*0], m7
mova [cq+32*1], m7
add cq, 32*2
vpblendd m4, m5, 0x30
vpblendd m4, m6, 0xc0
paddw m4, m0
pmaxsw m4, m7
pminsw m4, m8
vextracti128 xm5, m4, 1
movq [dstq+strideq*0], xm4
movq [dstq+strideq*1], xm5
movhps [dstq+strideq*2], xm5
movhps [dstq+r6 ], xm4
lea dstq, [dstq+strideq*4]
ret
INV_TXFM_4X16_FN identity, dct
INV_TXFM_4X16_FN identity, adst
INV_TXFM_4X16_FN identity, flipadst
INV_TXFM_4X16_FN identity, identity
cglobal iidentity_4x16_internal_16bpc, 0, 7, 11, dst, stride, c, eob, tx2
vpbroadcastd m7, [pd_5793]
pmulld m0, m7, [cq+32*0]
pmulld m4, m7, [cq+32*1]
pmulld m1, m7, [cq+32*2]
pmulld m5, m7, [cq+32*3]
pmulld m2, m7, [cq+32*4]
pmulld m6, m7, [cq+32*5]
pmulld m3, m7, [cq+32*6]
pmulld m7, [cq+32*7]
vpbroadcastd m8, [pd_6144]
REPX {paddd x, m8}, m0, m4, m1, m5, m2, m6, m3, m7
REPX {psrad x, 13}, m0, m4, m1, m5, m2, m6, m3, m7
jmp tx2q
.pass2:
packssdw m0, m4
packssdw m1, m5
packssdw m2, m6
packssdw m3, m7
vpbroadcastd m7, [pw_1697x16]
vpbroadcastd m8, [pw_2048]
pmulhrsw m4, m7, m0
pmulhrsw m5, m7, m1
pmulhrsw m6, m7, m2
pmulhrsw m7, m3
REPX {paddsw x, x}, m0, m1, m2, m3
paddsw m0, m4
paddsw m1, m5
paddsw m2, m6
paddsw m3, m7
vpbroadcastd m4, [pixel_max]
punpckhwd m7, m0, m1
punpcklwd m0, m1
punpckhwd m1, m2, m3
punpcklwd m2, m3
lea r6, [strideq*5]
pxor m3, m3
punpckhdq m5, m0, m2 ; 2 3 6 7
punpckldq m0, m2 ; 0 1 4 5
punpckldq m6, m7, m1 ; 8 9 c d
punpckhdq m7, m1 ; a b e f
pmulhrsw m0, m8
call .write_2x4x2
pmulhrsw m0, m5, m8
call .write_2x4x2
pmulhrsw m0, m6, m8
lea dstq, [dstq+strideq*4]
call .write_2x4x2
pmulhrsw m0, m7, m8
call .write_2x4x2
RET
ALIGN function_align
.write_2x4x2:
movq xm1, [dstq+strideq*0]
movhps xm1, [dstq+strideq*1]
vpbroadcastq m2, [dstq+strideq*4]
vpblendd m1, m2, 0x30
vpbroadcastq m2, [dstq+r6 ]
vpblendd m1, m2, 0xc0
mova [cq+32*0], m3
mova [cq+32*1], m3
add cq, 32*2
paddw m1, m0
pmaxsw m1, m3
pminsw m1, m4
vextracti128 xm2, m1, 1
movq [dstq+strideq*0], xm1
movhps [dstq+strideq*1], xm1
movq [dstq+strideq*4], xm2
movhps [dstq+r6 ], xm2
lea dstq, [dstq+strideq*2]
ret
%macro INV_TXFM_8X4_FN 2 ; type1, type2
INV_TXFM_FN %1, %2, 0, 8x4
%ifidn %1_%2, dct_dct
imul r6d, [cq], 2896
mov [cq], eobd ; 0
add r6d, 2048
sar r6d, 12
imul r6d, 2896
add r6d, 2048
sar r6d, 12
imul r6d, 2896
add r6d, 34816
sar r6d, 16
movd xm0, r6d
vpbroadcastw m0, xm0
.end:
vpbroadcastd m4, [pixel_max]
pxor m3, m3
mova xm1, [dstq+strideq*0]
vinserti128 m1, [dstq+strideq*1], 1
lea r6, [dstq+strideq*2]
mova xm2, [r6 +strideq*0]
vinserti128 m2, [r6 +strideq*1], 1
paddw m1, m0
paddw m2, m0
pmaxsw m1, m3
pmaxsw m2, m3
pminsw m1, m4
pminsw m2, m4
mova [dstq+strideq*0], xm1
vextracti128 [dstq+strideq*1], m1, 1
mova [r6 +strideq*0], xm2
vextracti128 [r6 +strideq*1], m2, 1
RET
%endif
%endmacro
INV_TXFM_8X4_FN dct, dct
INV_TXFM_8X4_FN dct, identity
INV_TXFM_8X4_FN dct, adst
INV_TXFM_8X4_FN dct, flipadst
cglobal idct_8x4_internal_16bpc, 0, 7, 10, dst, stride, c, eob, tx2
vbroadcasti128 m1, [cq+16*1]
vbroadcasti128 m0, [cq+16*5]
vbroadcasti128 m2, [cq+16*3]
vbroadcasti128 m3, [cq+16*7]
vpbroadcastd m6, [pd_2896]
shufpd m1, m0, 0x0c ; 1 5
shufpd m3, m2, 0x0c ; 7 3
vbroadcasti128 m0, [cq+16*0]
vbroadcasti128 m4, [cq+16*2]
vbroadcasti128 m2, [cq+16*4]
vbroadcasti128 m5, [cq+16*6]
vpbroadcastd m7, [pd_2048]
shufpd m0, m4, 0x0c ; 0 2
shufpd m2, m5, 0x0c ; 4 6
REPX {pmulld x, m6}, m1, m3, m0, m2
REPX {paddd x, m7}, m1, m3, m0, m2
REPX {psrad x, 12}, m1, m3, m0, m2
call .main
psubd m3, m0, m4 ; out7 out6
paddd m0, m4 ; out0 out1
paddd m1, m2, m5 ; out3 out2
psubd m2, m5 ; out4 out5
pshufd m1, m1, q1032
pshufd m3, m3, q1032
jmp tx2q
.pass2:
vbroadcasti128 m4, [deint_shuf]
packssdw m0, m1
packssdw m2, m3
vperm2i128 m1, m0, m2, 0x31
vinserti128 m0, xm2, 1
pshufb m0, m4
pshufb m1, m4
IDCT4_1D_PACKED_WORD 0, 1, 2, 3, 4, 7
vpermq m0, m0, q3120 ; out0 out1
vpermq m2, m1, q2031 ; out2 out3
jmp m(iadst_8x4_internal_16bpc).end
ALIGN function_align
.main:
ITX_MULSUB_2D 1, 3, 4, 5, 6, 7, 799_3406, 4017_2276, 1
IDCT4_1D_PACKED 0, 2, 4, 5, 6, 7
vpbroadcastd m8, [clip_min]
vpbroadcastd m9, [clip_max]
vpbroadcastd m6, [pd_2896]
punpcklqdq m4, m1, m3 ; t4a t7a
punpckhqdq m1, m3 ; t5a t6a
psubd m3, m4, m1 ; t5a t6a
paddd m4, m1 ; t4 t7
REPX {pmaxsd x, m8}, m3, m4, m0, m2
REPX {pminsd x, m9}, m3, m4, m0, m2
pmulld m3, m6
pshufd m1, m3, q1032
paddd m3, m7
psubd m5, m3, m1
paddd m1, m3
psrad m5, 12
psrad m1, 12
vpblendd m5, m4, 0x33 ; t4 t5
punpckhqdq m4, m1 ; t7 t6
ret
INV_TXFM_8X4_FN adst, dct
INV_TXFM_8X4_FN adst, adst
INV_TXFM_8X4_FN adst, flipadst
INV_TXFM_8X4_FN adst, identity
cglobal iadst_8x4_internal_16bpc, 0, 7, 10, dst, stride, c, eob, tx2
call m(iadst_4x8_internal_16bpc).main
vpblendd m3, m0, m4, 0x33 ; out6 out7
vpblendd m0, m4, 0xcc ; out0 out1
pshufd m1, m5, q1032
psignd m2, m6 ; out4 out5
psignd m1, m6 ; out2 out3
jmp tx2q
.pass2:
call .pass2_main
vpermq m0, m0, q3120 ; out0 out1
vpermq m2, m1, q3120 ; out2 out3
.end:
vpbroadcastd m1, [pw_2048]
pmulhrsw m0, m1
pmulhrsw m1, m2
.end2:
mova xm2, [dstq+strideq*0]
vinserti128 m2, [dstq+strideq*1], 1
lea r6, [dstq+strideq*2]
mova xm3, [r6 +strideq*0]
vinserti128 m3, [r6 +strideq*1], 1
vpbroadcastd m5, [pixel_max]
pxor m4, m4
REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
paddw m0, m2
paddw m1, m3
pmaxsw m0, m4
pmaxsw m1, m4
pminsw m0, m5
pminsw m1, m5
mova [dstq+strideq*0], xm0
vextracti128 [dstq+strideq*1], m0, 1
mova [r6 +strideq*0], xm1
vextracti128 [r6 +strideq*1], m1, 1
RET
ALIGN function_align
.pass2_main:
vbroadcasti128 m4, [deint_shuf]
packssdw m0, m1
packssdw m2, m3
lea rax, [deint_shuf+128]
vperm2i128 m1, m0, m2, 0x31
vinserti128 m0, xm2, 1
pshufb m0, m4
pshufb m1, m4
jmp m(iadst_8x4_internal_8bpc).main
ALIGN function_align
.main:
vpbroadcastd m1, [pd_2896]
pmulld m0, m1, [cq+32*0]
pmulld m3, m1, [cq+32*3]
pmulld m2, m1, [cq+32*2]
pmulld m1, [cq+32*1]
vpbroadcastd m4, [pd_2048]
REPX {paddd x, m4}, m0, m3, m2, m1
REPX {psrad x, 12}, m0, m3, m2, m1
vbroadcasti128 m6, [pd_1321]
vbroadcasti128 m7, [pd_2482]
pmulld m4, m0, m6 ; 1321*in0
pmulld m5, m3, m7 ; 2482*in3
paddd m4, m5 ; 1321*in0 + 2482*in3
pmulld m5, m0, m7 ; 2482*in0
paddd m0, m3 ; in0 + in3
paddd m7, m6 ; pd_3803
pmulld m6, m2 ; 1321*in2
pmulld m3, m7 ; 3803*in3
pmulld m7, m2 ; 3803*in2
psubd m2, m0 ; in2 - in0 - in3
vpbroadcastd m0, [pd_m3344]
psubd m5, m6 ; 2482*in0 - 1321*in2
vpbroadcastd m6, [pd_2048]
psubd m5, m3 ; t1
pmulld m2, m0 ; t2
pmulld m1, m0 ; -t3
paddd m4, m7 ; t0
paddd m5, m6
paddd m3, m4, m5
paddd m4, m6
psubd m4, m1 ; out0 (unshifted)
psubd m5, m1 ; out1 (unshifted)
paddd m2, m6 ; out2 (unshifted)
paddd m3, m1 ; out3 (unshifted)
ret
INV_TXFM_8X4_FN flipadst, dct
INV_TXFM_8X4_FN flipadst, adst
INV_TXFM_8X4_FN flipadst, flipadst
INV_TXFM_8X4_FN flipadst, identity
cglobal iflipadst_8x4_internal_16bpc, 0, 7, 10, dst, stride, c, eob, tx2
call m(iadst_4x8_internal_16bpc).main
shufpd m3, m4, m0, 0x05
shufpd m0, m4, 0x05
psignd m2, m6
pshufd m6, m6, q1032
pshufd m1, m2, q1032
psignd m2, m5, m6
jmp tx2q
.pass2:
call m(iadst_8x4_internal_16bpc).pass2_main
vpermq m2, m0, q2031
vpermq m0, m1, q2031
jmp m(iadst_8x4_internal_16bpc).end
INV_TXFM_8X4_FN identity, dct
INV_TXFM_8X4_FN identity, adst
INV_TXFM_8X4_FN identity, flipadst
INV_TXFM_8X4_FN identity, identity
cglobal iidentity_8x4_internal_16bpc, 0, 7, 10, dst, stride, c, eob, tx2
vpbroadcastd m4, [pd_2896]
vpermq m0, [cq+32*0], q3120
vpermq m1, [cq+32*1], q3120
vpermq m2, [cq+32*2], q3120
vpermq m3, [cq+32*3], q3120
vpbroadcastd m7, [pd_2048]
REPX {pmulld x, m4}, m0, m1, m2, m3
REPX {paddd x, m7}, m0, m1, m2, m3
REPX {psrad x, 12}, m0, m1, m2, m3
REPX {paddd x, x }, m0, m1, m2, m3
jmp tx2q
.pass2:
vpbroadcastd m4, [pw_1697x8]
packssdw m0, m1
packssdw m2, m3
pmulhrsw m1, m4, m0
pmulhrsw m4, m2
paddsw m0, m1
paddsw m2, m4
punpckhwd m1, m0, m2
punpcklwd m0, m2
packssdw m7, m7 ; pw_2048
lea r6, [dstq+strideq*2]
punpckhwd m2, m0, m1
punpcklwd m0, m1
pmulhrsw m2, m7
pmulhrsw m0, m7
punpckhwd m1, m0, m2
punpcklwd m0, m2
mova xm2, [dstq+strideq*0]
vinserti128 m2, [r6 +strideq*0], 1
mova xm3, [dstq+strideq*1]
vinserti128 m3, [r6 +strideq*1], 1
vpbroadcastd m5, [pixel_max]
pxor m4, m4
REPX {mova [cq+32*x], m4}, 0, 1, 2, 3
paddw m0, m2
paddw m1, m3
pmaxsw m0, m4
pmaxsw m1, m4
pminsw m0, m5
pminsw m1, m5
mova [dstq+strideq*0], xm0
mova [dstq+strideq*1], xm1
vextracti128 [r6 +strideq*0], m0, 1
vextracti128 [r6 +strideq*1], m1, 1
RET
%macro INV_TXFM_8X8_FN 2 ; type1, type2
INV_TXFM_FN %1, %2, 0, 8x8
%ifidn %1_%2, dct_dct
imul r6d, [cq], 2896
mov [cq], eobd ; 0
mov r3d, 8
.dconly:
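; (x + 6144) >> 13 == (((x + 2048) >> 12) + 1) >> 1: the first-pass
; DC transform fused with the intermediate rounded >> 1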
add r6d, 6144
sar r6d, 13
.dconly2:
imul r6d, 2896
add r6d, 34816
sar r6d, 16
movd xm0, r6d
vpbroadcastw m0, xm0
vpbroadcastd m3, [pixel_max]
pxor m2, m2
.dconly_loop:
mova xm1, [dstq+strideq*0]
vinserti128 m1, [dstq+strideq*1], 1
paddw m1, m0
pmaxsw m1, m2
pminsw m1, m3
mova [dstq+strideq*0], xm1
vextracti128 [dstq+strideq*1], m1, 1
lea dstq, [dstq+strideq*2]
sub r3d, 2
jg .dconly_loop
RET
%endif
%endmacro
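; 8-point inverse ADST. Outputs 0/1/6/7 leave the macro at final
; scale; outputs 2..5 are produced as (t +- t) * 2896 and are rounded
; and shifted by the caller (see .main_end in iadst_8x8).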
%macro IADST8_1D 14 ; src[1-8], tmp[1-3], pd_2048, clip[1-2]
ITX_MULSUB_2D %8, %1, %9, %10, %11, %12, 401, 4076 ; t1a, t0a
ITX_MULSUB_2D %2, %7, %9, %10, %11, %12, 3920, 1189 ; t7a, t6a
ITX_MULSUB_2D %6, %3, %9, %10, %11, %12, 1931, 3612 ; t3a, t2a
ITX_MULSUB_2D %4, %5, %9, %10, %11, %12, 3166, 2598 ; t5a, t4a
psubd m%9, m%3, m%7 ; t6
paddd m%3, m%7 ; t2
psubd m%7, m%1, m%5 ; t4
paddd m%1, m%5 ; t0
psubd m%5, m%6, m%2 ; t7
paddd m%6, m%2 ; t3
psubd m%2, m%8, m%4 ; t5
paddd m%8, m%4 ; t1
REPX {pmaxsd x, m%13}, m%7, m%2, m%9, m%5, m%3, m%1, m%6, m%8
REPX {pminsd x, m%14}, m%7, m%2, m%9, m%5, m%3, m%1, m%6, m%8
ITX_MULSUB_2D %7, %2, %4, %10, %11, %12, 1567, 3784 ; t5a, t4a
ITX_MULSUB_2D %5, %9, %4, %10, %11, %12, 3784, %11 ; t6a, t7a
psubd m%10, m%7, m%9 ; t7
paddd m%7, m%9 ; out6
vpbroadcastd m%9, [pd_2896]
psubd m%4, m%8, m%6 ; t3
paddd m%8, m%6 ; -out7
psubd m%6, m%1, m%3 ; t2
paddd m%1, m%3 ; out0
psubd m%3, m%2, m%5 ; t6
paddd m%2, m%5 ; -out1
REPX {pmaxsd x, m%13}, m%6, m%4, m%3, m%10
REPX {pminsd x, m%14}, m%6, m%4, m%3, m%10
REPX {pmulld x, m%9 }, m%6, m%4, m%3, m%10
psubd m%5, m%6, m%4 ; (t2 - t3) * 2896
paddd m%4, m%6 ; (t2 + t3) * 2896
psubd m%6, m%3, m%10 ; (t6 - t7) * 2896
paddd m%3, m%10 ; (t6 + t7) * 2896
%endmacro
INV_TXFM_8X8_FN dct, dct
INV_TXFM_8X8_FN dct, identity
INV_TXFM_8X8_FN dct, adst
INV_TXFM_8X8_FN dct, flipadst
cglobal idct_8x8_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2
mova m0, [cq+32*0]
mova m1, [cq+32*1]
mova m2, [cq+32*2]
mova m3, [cq+32*3]
mova m4, [cq+32*4]
mova m5, [cq+32*5]
mova m6, [cq+32*6]
mova m7, [cq+32*7]
vpbroadcastd m11, [pd_2048]
vpbroadcastd m12, [clip_min]
vpbroadcastd m13, [clip_max]
call .main
call .round_shift1
jmp tx2q
.pass2:
call .transpose_8x8_packed
call m(idct_8x8_internal_8bpc).main
vpbroadcastd m12, [pw_2048]
vpermq m0, m0, q3120
vpermq m1, m1, q2031
vpermq m2, m2, q3120
vpermq m3, m3, q2031
pmulhrsw m0, m12
pmulhrsw m1, m12
call .write_8x4_start
pmulhrsw m0, m2, m12
pmulhrsw m1, m3, m12
call .write_8x4
RET
ALIGN function_align
.write_8x4_start:
vpbroadcastd m11, [pixel_max]
lea r6, [strideq*3]
pxor m10, m10
.write_8x4:
mova xm8, [dstq+strideq*0]
vinserti128 m8, [dstq+strideq*1], 1
mova xm9, [dstq+strideq*2]
vinserti128 m9, [dstq+r6 ], 1
mova [cq+32*0], m10
mova [cq+32*1], m10
mova [cq+32*2], m10
mova [cq+32*3], m10
add cq, 32*4
paddw m0, m8
paddw m1, m9
pmaxsw m0, m10
pmaxsw m1, m10
pminsw m0, m11
pminsw m1, m11
mova [dstq+strideq*0], xm0
vextracti128 [dstq+strideq*1], m0, 1
mova [dstq+strideq*2], xm1
vextracti128 [dstq+r6 ], m1, 1
lea dstq, [dstq+strideq*4]
ret
ALIGN function_align
.transpose_8x8_packed:
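; packs the 32-bit rows down to 16-bit and transposes the resulting
; 8x8 word matrix via the punpck word/dword + lane-swap ladder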
packssdw m0, m4
packssdw m1, m5
packssdw m2, m6
packssdw m3, m7
lea rax, [deint_shuf+128]
punpckhwd m4, m0, m1
punpcklwd m0, m1
punpckhwd m1, m2, m3
punpcklwd m2, m3
punpckhdq m3, m0, m2
punpckldq m0, m2
punpckhdq m2, m4, m1
punpckldq m4, m1
vinserti128 m1, m3, xm2, 1
vperm2i128 m3, m2, 0x31
vperm2i128 m2, m0, m4, 0x31
vinserti128 m0, xm4, 1
ret
ALIGN function_align
.main_rect2:
REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
.main:
ITX_MULSUB_2D 5, 3, 8, 9, 10, 11, 3406, 2276 ; t5a t6a
ITX_MULSUB_2D 1, 7, 8, 9, 10, 11, 799, 4017 ; t4a t7a
ITX_MULSUB_2D 2, 6, 8, 9, 10, 11, 1567, 3784 ; t2 t3
paddd m8, m1, m5 ; t4
psubd m1, m5 ; t5a
paddd m9, m7, m3 ; t7
psubd m7, m3 ; t6a
vpbroadcastd m3, [pd_2896]
REPX {pmaxsd x, m12}, m1, m8, m7, m9
REPX {pminsd x, m13}, m1, m8, m7, m9
REPX {pmulld x, m3 }, m0, m4, m7, m1
paddd m0, m11
paddd m7, m11
psubd m5, m0, m4
paddd m0, m4
psubd m4, m7, m1
paddd m7, m1
REPX {psrad x, 12 }, m5, m0, m4, m7
psubd m3, m0, m6 ; dct4 out3
paddd m0, m6 ; dct4 out0
paddd m6, m5, m2 ; dct4 out1
psubd m5, m2 ; dct4 out2
REPX {pmaxsd x, m12}, m0, m6, m5, m3
REPX {pminsd x, m13}, m0, m6, m5, m3
ret
ALIGN function_align
.round_shift1:
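; pcmpeqd sets m1 to -1; subtracting it adds the +1 bias for the
; rounded >> 1 below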
pcmpeqd m1, m1
REPX {psubd x, m1}, m0, m6, m5, m3
paddd m1, m6, m7 ; out1
psubd m6, m7 ; out6
psubd m7, m0, m9 ; out7
paddd m0, m9 ; out0
paddd m2, m5, m4 ; out2
psubd m5, m4 ; out5
psubd m4, m3, m8 ; out4
paddd m3, m8 ; out3
REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7
ret
INV_TXFM_8X8_FN adst, dct
INV_TXFM_8X8_FN adst, adst
INV_TXFM_8X8_FN adst, flipadst
INV_TXFM_8X8_FN adst, identity
cglobal iadst_8x8_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2
call .main
call .main_end
jmp tx2q
.pass2:
call m(idct_8x8_internal_16bpc).transpose_8x8_packed
pshufd m4, m0, q1032
pshufd m5, m1, q1032
call m(iadst_8x8_internal_8bpc).main_pass2
vpbroadcastd m5, [pw_2048]
vpbroadcastd xm12, [pw_4096]
psubw m12, m5
REPX {vpermq x, x, q3120}, m0, m1, m2, m3
pmulhrsw m0, m12
pmulhrsw m1, m12
call m(idct_8x8_internal_16bpc).write_8x4_start
pmulhrsw m0, m2, m12
pmulhrsw m1, m3, m12
call m(idct_8x8_internal_16bpc).write_8x4
RET
ALIGN function_align
.main:
mova m0, [cq+32*0]
mova m7, [cq+32*7]
mova m1, [cq+32*1]
mova m6, [cq+32*6]
mova m2, [cq+32*2]
mova m5, [cq+32*5]
mova m3, [cq+32*3]
mova m4, [cq+32*4]
vpbroadcastd m11, [pd_2048]
vpbroadcastd m12, [clip_min]
vpbroadcastd m13, [clip_max]
.main2:
IADST8_1D 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13
psrld m8, 11 ; pd_1 (2896 >> 11)
vpbroadcastd m9, [pd_6144]
ret
ALIGN function_align
.main_end:
paddd m0, m8
psubd m1, m8, m1
paddd m6, m8
psubd m7, m8, m7
REPX {psrad x, 1 }, m0, m1, m6, m7
; (1 + ((x + 2048) >> 12)) >> 1 = (6144 + x) >> 13
; (1 - ((x + 2048) >> 12)) >> 1 = (6143 - x) >> 13
psubd m8, m9, m8 ; pd_6143
paddd m2, m9
psubd m3, m8, m3
paddd m4, m9
psubd m5, m8, m5
REPX {psrad x, 13}, m2, m3, m4, m5
ret
INV_TXFM_8X8_FN flipadst, dct
INV_TXFM_8X8_FN flipadst, adst
INV_TXFM_8X8_FN flipadst, flipadst
INV_TXFM_8X8_FN flipadst, identity
cglobal iflipadst_8x8_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2
call m(iadst_8x8_internal_16bpc).main
call .main_end
jmp tx2q
.pass2:
call m(idct_8x8_internal_16bpc).transpose_8x8_packed
pshufd m4, m0, q1032
pshufd m5, m1, q1032
call m(iadst_8x8_internal_8bpc).main_pass2
vpbroadcastd m12, [pw_2048]
vpbroadcastd xm5, [pw_4096]
psubw m12, m5
vpermq m8, m3, q2031
vpermq m9, m2, q2031
vpermq m2, m1, q2031
vpermq m3, m0, q2031
pmulhrsw m0, m8, m12
pmulhrsw m1, m9, m12
call m(idct_8x8_internal_16bpc).write_8x4_start
pmulhrsw m0, m2, m12
pmulhrsw m1, m3, m12
call m(idct_8x8_internal_16bpc).write_8x4
RET
ALIGN function_align
.main_end:
paddd m10, m8, m0
psubd m0, m8, m7
psubd m7, m8, m1
paddd m1, m8, m6
psrad m0, 1
psrad m1, 1
psrad m6, m7, 1
psrad m7, m10, 1
psubd m8, m9, m8 ; pd_6143
psubd m10, m8, m5
paddd m5, m9, m2
psubd m2, m8, m3
paddd m3, m9, m4
psrad m4, m2, 13
psrad m2, m10, 13
psrad m3, 13
psrad m5, 13
ret
INV_TXFM_8X8_FN identity, dct
INV_TXFM_8X8_FN identity, adst
INV_TXFM_8X8_FN identity, flipadst
INV_TXFM_8X8_FN identity, identity
cglobal iidentity_8x8_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2
mova m0, [cq+32*0]
mova m1, [cq+32*1]
mova m2, [cq+32*2]
mova m3, [cq+32*3]
mova m4, [cq+32*4]
mova m5, [cq+32*5]
mova m6, [cq+32*6]
mova m7, [cq+32*7]
jmp tx2q
.pass2:
packssdw m0, m4
packssdw m1, m5
packssdw m2, m6
packssdw m3, m7
vpbroadcastd m12, [pw_4096]
punpckhwd m4, m0, m1
punpcklwd m0, m1
punpckhwd m1, m2, m3
punpcklwd m2, m3
punpckhdq m3, m0, m2
punpckldq m0, m2
punpckldq m2, m4, m1
punpckhdq m4, m1
punpckhqdq m1, m0, m2 ; 1 5
punpcklqdq m0, m2 ; 0 4
punpcklqdq m2, m3, m4 ; 2 6
punpckhqdq m3, m4 ; 3 7
pmulhrsw m0, m12
pmulhrsw m1, m12
call .write_2x8x2_start
pmulhrsw m0, m2, m12
pmulhrsw m1, m3, m12
call .write_2x8x2_zero
RET
.write_2x8x2_start:
vpbroadcastd m7, [pixel_max]
lea r6, [strideq*5]
pxor m6, m6
.write_2x8x2_zero:
mova [cq+32*0], m6
mova [cq+32*1], m6
mova [cq+32*2], m6
mova [cq+32*3], m6
add cq, 32*4
.write_2x8x2:
mova xm4, [dstq+strideq*0]
vinserti128 m4, [dstq+strideq*4], 1
mova xm5, [dstq+strideq*1]
vinserti128 m5, [dstq+r6 ], 1
paddw m0, m4
paddw m1, m5
pmaxsw m0, m6
pmaxsw m1, m6
pminsw m0, m7
pminsw m1, m7
mova [dstq+strideq*0], xm0
mova [dstq+strideq*1], xm1
vextracti128 [dstq+strideq*4], m0, 1
vextracti128 [dstq+r6 ], m1, 1
lea dstq, [dstq+strideq*2]
ret
%macro INV_TXFM_8X16_FN 2-3 0 ; type1, type2, eob_offset
INV_TXFM_FN %1, %2, %3, 8x16
%ifidn %1_%2, dct_dct
imul r6d, [cq], 2896
mov [cq], eobd ; 0
mov r3d, 16
add r6d, 2048
sar r6d, 12
imul r6d, 2896
jmp m(inv_txfm_add_dct_dct_8x8_16bpc).dconly
%endif
%endmacro
INV_TXFM_8X16_FN dct, dct
INV_TXFM_8X16_FN dct, identity, 35
INV_TXFM_8X16_FN dct, adst
INV_TXFM_8X16_FN dct, flipadst
cglobal idct_8x16_internal_16bpc, 0, 7, 16, dst, stride, c, eob, tx2
%undef cmp
vpbroadcastd m14, [pd_2896]
vpbroadcastd m11, [pd_2048]
vpbroadcastd m12, [clip_min]
vpbroadcastd m13, [clip_max]
cmp eobd, 43
jl .fast
add cq, 32
call .pass1_main
sub cq, 32
mova [cq+32* 1], m0
mova [cq+32* 3], m1
mova [cq+32* 5], m2
mova [cq+32* 7], m3
mova [cq+32* 9], m4
mova [cq+32*11], m5
mova [cq+32*13], m6
mova m15, m7
call .pass1_main
mova m8, [cq+32* 1]
mova m9, [cq+32* 3]
mova m10, [cq+32* 5]
mova m11, [cq+32* 7]
mova m12, [cq+32* 9]
mova m13, [cq+32*11]
mova m14, [cq+32*13]
jmp tx2q
.fast:
call .pass1_main
pxor m8, m8
REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
jmp tx2q
.pass2:
call .transpose
call m(idct_8x16_internal_8bpc).main
vpbroadcastd m12, [pw_2048]
REPX {vpermq x, x, q3120}, m0, m2, m4, m6
REPX {vpermq x, x, q2031}, m1, m3, m5, m7
.end:
pmulhrsw m0, m12
pmulhrsw m1, m12
call m(idct_8x8_internal_16bpc).write_8x4_start
pmulhrsw m0, m2, m12
pmulhrsw m1, m3, m12
call m(idct_8x8_internal_16bpc).write_8x4
pmulhrsw m0, m4, m12
pmulhrsw m1, m5, m12
call m(idct_8x8_internal_16bpc).write_8x4
pmulhrsw m0, m6, m12
pmulhrsw m1, m7, m12
call m(idct_8x8_internal_16bpc).write_8x4
RET
ALIGN function_align
.transpose:
packssdw m0, m8
packssdw m1, m9
packssdw m2, m10
packssdw m3, m11
packssdw m4, m12
packssdw m5, m13
packssdw m6, m14
packssdw m7, m15
lea rax, [deint_shuf+128]
punpckhwd m8, m0, m1
punpcklwd m0, m1
punpckhwd m1, m2, m3
punpcklwd m2, m3
punpcklwd m3, m4, m5
punpckhwd m4, m5
punpckhwd m5, m6, m7
punpcklwd m6, m7
punpckhdq m7, m3, m6
punpckldq m3, m6
punpckhdq m6, m4, m5
punpckldq m4, m5
punpckhdq m5, m8, m1
punpckldq m8, m1
punpckhdq m1, m0, m2
punpckldq m0, m2
vperm2i128 m2, m0, m3, 0x31
vinserti128 m0, xm3, 1
vperm2i128 m3, m1, m7, 0x31
vinserti128 m1, xm7, 1
vperm2i128 m7, m5, m6, 0x31
vinserti128 m5, xm6, 1
vperm2i128 m6, m8, m4, 0x31
vinserti128 m4, m8, xm4, 1
ret
ALIGN function_align
.pass1_main:
pmulld m0, m14, [cq+32* 0]
pmulld m1, m14, [cq+32* 2]
pmulld m2, m14, [cq+32* 4]
pmulld m3, m14, [cq+32* 6]
pmulld m4, m14, [cq+32* 8]
pmulld m5, m14, [cq+32*10]
pmulld m6, m14, [cq+32*12]
pmulld m7, m14, [cq+32*14]
call m(idct_8x8_internal_16bpc).main_rect2
jmp m(idct_8x8_internal_16bpc).round_shift1
ALIGN function_align
.main_evenhalf:
paddd m1, m6, m7 ; idct8 out1
psubd m6, m7 ; idct8 out6
psubd m7, m0, m9 ; idct8 out7
paddd m0, m9 ; idct8 out0
paddd m2, m5, m4 ; idct8 out2
psubd m5, m4 ; idct8 out5
psubd m4, m3, m8 ; idct8 out4
paddd m3, m8 ; idct8 out3
REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {pminsd x, m13}, m0, m1, m2, m3, m4, m5, m6, m7
ret
.main_oddhalf_fast_rect2:
REPX {paddd x, m11}, m0, m1, m2, m3
REPX {psrad x, 12 }, m0, m1, m2, m3
.main_oddhalf_fast: ; lower half zero
vpbroadcastd m7, [pd_4076]
vpbroadcastd m8, [pd_401]
vpbroadcastd m6, [pd_m1189]
vpbroadcastd m9, [pd_3920]
vpbroadcastd m5, [pd_3612]
vpbroadcastd m10, [pd_1931]
vpbroadcastd m4, [pd_m2598]
vpbroadcastd m15, [pd_3166]
pmulld m7, m0
pmulld m0, m8
pmulld m6, m1
pmulld m1, m9
pmulld m5, m2
pmulld m2, m10
pmulld m4, m3
pmulld m3, m15
jmp .main_oddhalf_fast2
.main_oddhalf_rect2:
REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
.main_oddhalf:
ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 401, 4076 ; t8a, t15a
ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3920, 1189 ; t11a, t12a
ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1931, 3612 ; t10a, t13a
ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3166, 2598 ; t9a, t14a
.main_oddhalf_fast2:
REPX {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3
REPX {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3
psubd m8, m0, m4 ; t9
paddd m0, m4 ; t8
psubd m4, m6, m2 ; t10
paddd m2, m6 ; t11
psubd m6, m1, m5 ; t13
paddd m5, m1 ; t12
psubd m1, m7, m3 ; t14
paddd m7, m3 ; t15
REPX {pmaxsd x, m12}, m8, m1, m4, m6, m0, m2, m5, m7
REPX {pminsd x, m13}, m8, m1, m4, m6, m0, m2, m5, m7
vpbroadcastd m15, [pd_3784]
vpbroadcastd m10, [pd_1567]
ITX_MULSUB_2D 1, 8, 3, 9, _, 11, 10, 15
ITX_MULSUB_2D 6, 4, 3, 9, _, 11, 10, 15, 4
psubd m3, m1, m4 ; t10
paddd m1, m4 ; t9
psubd m4, m0, m2 ; t11a
paddd m0, m2 ; t8a
psubd m2, m8, m6 ; t13
paddd m6, m8 ; t14
psubd m8, m7, m5 ; t12a
paddd m7, m5 ; t15a
REPX {pmaxsd x, m12}, m2, m8, m3, m4, m0, m1, m6, m7
REPX {pminsd x, m13}, m2, m8, m3, m4, m0, m1, m6, m7
REPX {pmulld x, m14}, m2, m8, m3, m4
paddd m2, m11
paddd m8, m11
paddd m5, m2, m3 ; t13a
psubd m2, m3 ; t10a
psubd m3, m8, m4 ; t11
paddd m4, m8 ; t12
REPX {psrad x, 12}, m5, m2, m3, m4
mova [r6-32*4], m7
mova [r6-32*3], m6
mova [r6-32*2], m5
mova [r6-32*1], m4
mova [r6+32*0], m3
mova [r6+32*1], m2
mova [r6+32*2], m1
mova [r6+32*3], m0
ret
INV_TXFM_8X16_FN adst, dct
INV_TXFM_8X16_FN adst, adst
INV_TXFM_8X16_FN adst, flipadst
INV_TXFM_8X16_FN adst, identity, 35
cglobal iadst_8x16_internal_16bpc, 0, 7, 16, dst, stride, c, eob, tx2
%undef cmp
vpbroadcastd m14, [pd_2896]
vpbroadcastd m11, [pd_2048]
vpbroadcastd m12, [clip_min]
vpbroadcastd m13, [clip_max]
cmp eobd, 43
jl .fast
add cq, 32
call .pass1_main
call m(iadst_8x8_internal_16bpc).main_end
sub cq, 32
mova [cq+32* 1], m0
mova [cq+32* 3], m1
mova [cq+32* 5], m2
mova [cq+32* 7], m3
mova [cq+32* 9], m4
mova [cq+32*11], m5
mova [cq+32*13], m6
mova m15, m7
call .pass1_main
call m(iadst_8x8_internal_16bpc).main_end
mova m8, [cq+32* 1]
mova m9, [cq+32* 3]
mova m10, [cq+32* 5]
mova m11, [cq+32* 7]
mova m12, [cq+32* 9]
mova m13, [cq+32*11]
mova m14, [cq+32*13]
jmp tx2q
.fast:
call .pass1_main
call m(iadst_8x8_internal_16bpc).main_end
pxor m8, m8
REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
jmp tx2q
.pass2:
call m(idct_8x16_internal_16bpc).transpose
call m(iadst_8x16_internal_8bpc).main
call m(iadst_8x16_internal_8bpc).main_pass2_end
vpbroadcastd m8, [pw_2048]
vpbroadcastd xm12, [pw_4096]
REPX {vpermq x, x, q2031}, m0, m1, m2, m3
REPX {vpermq x, x, q3120}, m4, m5, m6, m7
psubw m12, m8
jmp m(idct_8x16_internal_16bpc).end
ALIGN function_align
.pass1_main:
pmulld m0, m14, [cq+32* 0]
pmulld m7, m14, [cq+32*14]
pmulld m1, m14, [cq+32* 2]
pmulld m6, m14, [cq+32*12]
pmulld m2, m14, [cq+32* 4]
pmulld m5, m14, [cq+32*10]
pmulld m3, m14, [cq+32* 6]
pmulld m4, m14, [cq+32* 8]
REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
jmp m(iadst_8x8_internal_16bpc).main2
INV_TXFM_8X16_FN flipadst, dct
INV_TXFM_8X16_FN flipadst, adst
INV_TXFM_8X16_FN flipadst, flipadst
INV_TXFM_8X16_FN flipadst, identity, 35
cglobal iflipadst_8x16_internal_16bpc, 0, 7, 16, dst, stride, c, eob, tx2
%undef cmp
vpbroadcastd m14, [pd_2896]
vpbroadcastd m11, [pd_2048]
vpbroadcastd m12, [clip_min]
vpbroadcastd m13, [clip_max]
cmp eobd, 43
jl .fast
add cq, 32
call m(iadst_8x16_internal_16bpc).pass1_main
call m(iflipadst_8x8_internal_16bpc).main_end
sub cq, 32
mova [cq+32* 1], m0
mova [cq+32* 3], m1
mova [cq+32* 5], m2
mova [cq+32* 7], m3
mova [cq+32* 9], m4
mova [cq+32*11], m5
mova [cq+32*13], m6
mova m15, m7
call m(iadst_8x16_internal_16bpc).pass1_main
call m(iflipadst_8x8_internal_16bpc).main_end
mova m8, [cq+32* 1]
mova m9, [cq+32* 3]
mova m10, [cq+32* 5]
mova m11, [cq+32* 7]
mova m12, [cq+32* 9]
mova m13, [cq+32*11]
mova m14, [cq+32*13]
jmp tx2q
.fast:
call m(iadst_8x16_internal_16bpc).pass1_main
call m(iflipadst_8x8_internal_16bpc).main_end
pxor m8, m8
REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
jmp tx2q
.pass2:
call m(idct_8x16_internal_16bpc).transpose
call m(iadst_8x16_internal_8bpc).main
call m(iadst_8x16_internal_8bpc).main_pass2_end
vpbroadcastd m12, [pw_2048]
vpbroadcastd xm13, [pw_4096]
mova m11, m0
vpermq m0, m7, q2031
mova m10, m1
vpermq m1, m6, q2031
mova m9, m2
vpermq m2, m5, q2031
mova m8, m3
vpermq m3, m4, q2031
vpermq m4, m8, q3120
vpermq m5, m9, q3120
vpermq m6, m10, q3120
vpermq m7, m11, q3120
psubw m12, m13
jmp m(idct_8x16_internal_16bpc).end
INV_TXFM_8X16_FN identity, dct
INV_TXFM_8X16_FN identity, adst
INV_TXFM_8X16_FN identity, flipadst
INV_TXFM_8X16_FN identity, identity
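; IDTX16 scales by 2*sqrt(2) ~= 2 + 1697/2048: pmulhrsw with
; pw_1697x16 gives (x*1697 + 1024) >> 11, which is added to 2*x; with
; the optional pw_16384 argument the sum is halved, i.e. x*sqrt(2).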
%macro IDTX16 3-4 ; src/dst, tmp, pw_1697x16, [pw_16384]
pmulhrsw m%2, m%3, m%1
%if %0 == 4 ; if downshifting by 1
pmulhrsw m%2, m%4
%else
paddsw m%1, m%1
%endif
paddsw m%1, m%2
%endmacro
cglobal iidentity_8x16_internal_16bpc, 0, 7, 16, dst, stride, c, eob, tx2
vpbroadcastd m15, [pd_2896]
pmulld m0, m15, [cq+32* 0]
pmulld m8, m15, [cq+32* 1]
pmulld m1, m15, [cq+32* 2]
pmulld m9, m15, [cq+32* 3]
pmulld m2, m15, [cq+32* 4]
pmulld m10, m15, [cq+32* 5]
pmulld m3, m15, [cq+32* 6]
pmulld m11, m15, [cq+32* 7]
pmulld m4, m15, [cq+32* 8]
pmulld m12, m15, [cq+32* 9]
pmulld m5, m15, [cq+32*10]
pmulld m13, m15, [cq+32*11]
pmulld m6, m15, [cq+32*12]
pmulld m14, m15, [cq+32*13]
pmulld m7, m15, [cq+32*14]
pmulld m15, [cq+32*15]
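; all 16 vector registers are live, so spill m7 to the coefficient
; buffer while pd_2048 is broadcast; the reload below folds the
; rounding add into the restore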
mova [cq], m7
vpbroadcastd m7, [pd_2048]
REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6, \
m8, m9, m10, m11, m12, m13, m14, m15
paddd m7, [cq]
REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14, m15
jmp tx2q
.pass2:
packssdw m0, m8
packssdw m1, m9
packssdw m2, m10
packssdw m3, m11
packssdw m4, m12
packssdw m5, m13
packssdw m6, m14
packssdw m7, m15
vpbroadcastd m8, [pw_1697x16]
REPX {IDTX16 x, 9, 8}, 0, 1, 2, 3, 4, 5, 6, 7
punpckhwd m9, m0, m1
punpcklwd m0, m1
punpckhwd m1, m6, m7
punpcklwd m6, m7
punpckhwd m7, m4, m5
punpcklwd m4, m5
punpcklwd m5, m2, m3
punpckhwd m2, m3
vpbroadcastd m12, [pw_2048]
punpckhdq m3, m0, m5
punpckldq m0, m5
punpckhdq m11, m9, m2
punpckldq m9, m2
punpckldq m2, m4, m6
punpckhdq m4, m6
punpckldq m6, m7, m1
punpckhdq m7, m1
punpckhqdq m1, m0, m2
punpcklqdq m0, m2
punpcklqdq m2, m3, m4
punpckhqdq m3, m4
punpcklqdq m8, m9, m6
punpckhqdq m9, m6
punpcklqdq m10, m11, m7
punpckhqdq m11, m7
pmulhrsw m0, m12
pmulhrsw m1, m12
call m(iidentity_8x8_internal_16bpc).write_2x8x2_start
pmulhrsw m0, m12, m2
pmulhrsw m1, m12, m3
call m(iidentity_8x8_internal_16bpc).write_2x8x2_zero
pmulhrsw m0, m12, m8
pmulhrsw m1, m12, m9
lea dstq, [dstq+strideq*4]
call m(iidentity_8x8_internal_16bpc).write_2x8x2_zero
pmulhrsw m0, m12, m10
pmulhrsw m1, m12, m11
call m(iidentity_8x8_internal_16bpc).write_2x8x2_zero
RET
%macro INV_TXFM_16X4_FN 2 ; type1, type2
INV_TXFM_FN %1, %2, 0, 16x4
%ifidn %1_%2, dct_dct
imul r6d, [cq], 2896
mov [cq], eobd ; 0
mov r3d, 4
.dconly:
add r6d, 6144
sar r6d, 13
.dconly2:
imul r6d, 2896
add r6d, 34816
sar r6d, 16
movd xm0, r6d
vpbroadcastw m0, xm0
vpbroadcastd m4, [pixel_max]
pxor m3, m3
.dconly_loop:
paddw m1, m0, [dstq+strideq*0]
paddw m2, m0, [dstq+strideq*1]
pmaxsw m1, m3
pmaxsw m2, m3
pminsw m1, m4
pminsw m2, m4
mova [dstq+strideq*0], m1
mova [dstq+strideq*1], m2
lea dstq, [dstq+strideq*2]
sub r3d, 2
jg .dconly_loop
RET
%endif
%endmacro
INV_TXFM_16X4_FN dct, dct
INV_TXFM_16X4_FN dct, identity
INV_TXFM_16X4_FN dct, adst
INV_TXFM_16X4_FN dct, flipadst
cglobal idct_16x4_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2
vbroadcasti128 m0, [cq+16* 0]
vbroadcasti128 m4, [cq+16* 4]
vbroadcasti128 m1, [cq+16* 2]
vbroadcasti128 m7, [cq+16* 6]
vbroadcasti128 m5, [cq+16*10]
vbroadcasti128 m2, [cq+16* 8]
vbroadcasti128 m6, [cq+16*12]
vbroadcasti128 m3, [cq+16*14]
shufpd m0, m4, 0x0c ; 0 4
shufpd m1, m5, 0x0c ; 2 10
shufpd m2, m6, 0x0c ; 8 12
shufpd m3, m7, 0x0c ; 14 6
vpbroadcastd m7, [pd_2048]
call m(idct_8x4_internal_16bpc).main
pcmpeqd m6, m6
psubd m0, m6
psubd m2, m6
psubd m3, m0, m4 ; idct8 out7 out6
paddd m0, m4 ; idct8 out0 out1
paddd m1, m2, m5 ; idct8 out3 out2
psubd m2, m5 ; idct8 out4 out5
vbroadcasti128 m10, [cq+16* 1]
vbroadcasti128 m4, [cq+16* 5]
vbroadcasti128 m11, [cq+16*15]
vbroadcasti128 m5, [cq+16*11]
shufpd m10, m4, 0x0c ; 1 5
shufpd m11, m5, 0x0c ; 15 11
vbroadcasti128 m5, [cq+16* 9]
vbroadcasti128 m4, [cq+16*13]
shufpd m5, m4, 0x0c ; 9 13
vbroadcasti128 m6, [cq+16* 7]
vbroadcasti128 m4, [cq+16* 3]
shufpd m6, m4, 0x0c ; 7 3
ITX_MULSUB_2D 10, 11, 4, 12, 13, 7, 401_1931, 4076_3612, 1
ITX_MULSUB_2D 5, 6, 4, 12, 13, 7, 3166_3920, 2598_1189, 1
psubd m4, m10, m5 ; t9 -t10
paddd m10, m5 ; t8 t11
psubd m5, m11, m6 ; t14 -t13
paddd m11, m6 ; t15 t12
REPX {pmaxsd x, m8}, m4, m5, m10, m11
REPX {pminsd x, m9}, m4, m5, m10, m11
ITX_MULSUB_2D 5, 4, 6, 12, 13, 7, 1567, 3784, 2
vpbroadcastd m12, [pd_2896]
punpckhqdq m6, m11, m5
punpcklqdq m11, m4
punpckhqdq m4, m10, m4
punpcklqdq m10, m5
psubd m5, m11, m6 ; t12a t13
paddd m11, m6 ; t15a t14
psubd m6, m10, m4 ; t11a t10
paddd m10, m4 ; t8a t9
REPX {pmaxsd x, m8}, m5, m6
REPX {pminsd x, m9}, m5, m6
pmulld m5, m12
pmulld m6, m12
REPX {pmaxsd x, m8}, m0, m1, m2, m3, m11, m10
REPX {pminsd x, m9}, m0, m1, m2, m3, m11, m10
paddd m5, m7
psubd m4, m5, m6
paddd m5, m6
psrad m4, 12 ; t11 t10a
psrad m5, 12 ; t12 t13a
psubd m7, m0, m11 ; out15 out14
paddd m0, m11 ; out0 out1
psubd m6, m1, m5 ; out12 out13
paddd m1, m5 ; out3 out2
psubd m5, m2, m4 ; out11 out10
paddd m2, m4 ; out4 out5
psubd m4, m3, m10 ; out8 out9
paddd m3, m10 ; out7 out6
REPX {pshufd x, x, q1032}, m1, m3, m5, m7
REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7
jmp tx2q
.pass2:
call .transpose_4x16_packed
lea rax, [deint_shuf+128]
call m(idct_16x4_internal_8bpc).main
.end:
vpbroadcastd m4, [pw_2048]
REPX {pmulhrsw x, m4}, m0, m1, m2, m3
.end2:
paddw m0, [dstq+strideq*0]
paddw m1, [dstq+strideq*1]
.end3:
lea r6, [dstq+strideq*2]
paddw m2, [r6 +strideq*0]
paddw m3, [r6 +strideq*1]
vpbroadcastd m5, [pixel_max]
pxor m4, m4
REPX {mova [cq+32*x], m4}, 0, 1, 2, 3, 4, 5, 6, 7
REPX {pmaxsw x, m4}, m0, m1, m2, m3
REPX {pminsw x, m5}, m0, m1, m2, m3
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
mova [r6 +strideq*0], m2
mova [r6 +strideq*1], m3
RET
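; pack the 8 dword vectors to words and transpose into 4 packed 16-bit
; rows so that the 8bpc avx2 pass-2 routines can be reused as-is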
ALIGN function_align
.transpose_4x16_packed:
vbroadcasti128 m8, [deint_shuf]
packssdw m0, m1
packssdw m2, m3
packssdw m4, m5
packssdw m6, m7
REPX {pshufb x, m8}, m0, m2, m4, m6
punpckhqdq m1, m0, m2
punpcklqdq m0, m2
punpckhqdq m2, m4, m6
punpcklqdq m4, m6
vperm2i128 m3, m1, m2, 0x31
vinserti128 m1, xm2, 1
vperm2i128 m2, m0, m4, 0x31
vinserti128 m0, xm4, 1
ret
INV_TXFM_16X4_FN adst, dct
INV_TXFM_16X4_FN adst, adst
INV_TXFM_16X4_FN adst, flipadst
INV_TXFM_16X4_FN adst, identity
cglobal iadst_16x4_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2
call m(iadst_4x16_internal_16bpc).main
psrad m11, 11 ; pd_1
REPX {paddd x, m11}, m0, m1, m2, m3
paddd m4, m5, m11
paddd m5, m6, m11
paddd m6, m7, m11
paddd m7, m8, m11
.pass1_end:
REPX {pshufd x, x, q1032}, m0, m2, m4, m6
REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7
jmp tx2q
.pass2:
call m(idct_16x4_internal_16bpc).transpose_4x16_packed
lea rax, [deint_shuf+128]
call m(iadst_16x4_internal_8bpc).main
jmp m(idct_16x4_internal_16bpc).end
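; iadst4 on two 8-coefficient lanes at once, using the av1 sinpi constants
; 1321, 2482, 3344 and 3803 (= 1321 + 2482); outputs are returned
; unshifted with a pd_6144 rounding bias already folded in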
ALIGN function_align
.main:
vbroadcasti128 m6, [pd_1321]
mova m0, [cq+32*0]
mova m1, [cq+32*1]
vbroadcasti128 m7, [pd_2482]
mova m2, [cq+32*6]
mova m3, [cq+32*7]
pmulld m4, m0, m6
pmulld m5, m1, m6 ; 1321*in0
pmulld m9, m2, m7
pmulld m8, m3, m7 ; 2482*in3
paddd m4, m9
paddd m8, m5 ; 1321*in0 + 2482*in3
pmulld m5, m0, m7
pmulld m9, m1, m7 ; 2482*in0
paddd m0, m2
paddd m1, m3 ; in0 + in3
paddd m7, m6 ; pd_3803
pmulld m2, m7
pmulld m3, m7 ; 3803*in3
psubd m5, m2
psubd m9, m3 ; 2482*in0 - 3803*in3
mova m2, [cq+32*4]
pmulld m10, m7, m2
pmulld m3, m6, m2
psubd m2, m0
mova m0, [cq+32*5]
pmulld m7, m0 ; 3803*in2
pmulld m6, m0 ; 1321*in2
psubd m0, m1 ; in2 - in0 - in3
vpbroadcastd m1, [pd_m3344]
paddd m4, m10
paddd m7, m8 ; t0
psubd m5, m3
psubd m9, m6 ; t1
vpbroadcastd m6, [pd_6144]
pmulld m2, m1
pmulld m0, m1 ; t2
pmulld m3, m1, [cq+32*2]
pmulld m1, [cq+32*3] ; -t3
paddd m5, m6
paddd m9, m6
paddd m10, m4, m5
paddd m4, m6
paddd m8, m7, m6
paddd m7, m9
psubd m4, m3 ; out0 (unshifted)
psubd m5, m3 ; out1 (unshifted)
paddd m2, m6 ; out2 (unshifted)
paddd m3, m10 ; out3 (unshifted)
psubd m8, m1 ; out4 (unshifted)
psubd m9, m1 ; out5 (unshifted)
paddd m6, m0 ; out6 (unshifted)
paddd m7, m1 ; out7 (unshifted)
ret
INV_TXFM_16X4_FN flipadst, dct
INV_TXFM_16X4_FN flipadst, adst
INV_TXFM_16X4_FN flipadst, flipadst
INV_TXFM_16X4_FN flipadst, identity
cglobal iflipadst_16x4_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2
call m(iadst_4x16_internal_16bpc).main
psrad m11, 11 ; pd_1
paddd m4, m3, m11
paddd m3, m5, m11
paddd m5, m2, m11
paddd m2, m6, m11
paddd m6, m1, m11
paddd m1, m7, m11
paddd m7, m0, m11
paddd m0, m8, m11
jmp m(iadst_16x4_internal_16bpc).pass1_end
.pass2:
call m(idct_16x4_internal_16bpc).transpose_4x16_packed
lea rax, [deint_shuf+128]
call m(iadst_16x4_internal_8bpc).main
vpbroadcastd m4, [pw_2048]
pmulhrsw m5, m3, m4
pmulhrsw m6, m2, m4
pmulhrsw m2, m1, m4
pmulhrsw m3, m0, m4
paddw m0, m5, [dstq+strideq*0]
paddw m1, m6, [dstq+strideq*1]
jmp m(idct_16x4_internal_16bpc).end3
INV_TXFM_16X4_FN identity, dct
INV_TXFM_16X4_FN identity, adst
INV_TXFM_16X4_FN identity, flipadst
INV_TXFM_16X4_FN identity, identity
cglobal iidentity_16x4_internal_16bpc, 0, 7, 14, dst, stride, c, eob, tx2
vpbroadcastd m8, [pd_11586]
vpermq m0, [cq+32*0], q3120 ; 0 1
vpermq m1, [cq+32*1], q3120 ; 2 3
vpermq m2, [cq+32*2], q3120 ; 4 5
vpermq m3, [cq+32*3], q3120 ; 6 7
vpermq m4, [cq+32*4], q3120 ; 8 9
vpermq m5, [cq+32*5], q3120 ; a b
vpermq m6, [cq+32*6], q3120 ; c d
vpermq m7, [cq+32*7], q3120 ; e f
vpbroadcastd m9, [pd_6144]
REPX {pmulld x, m8}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {paddd x, m9}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {psrad x, 13}, m0, m1, m2, m3, m4, m5, m6, m7
jmp tx2q
.pass2:
call m(idct_16x4_internal_16bpc).transpose_4x16_packed
vpbroadcastd m7, [pw_1697x8]
pmulhrsw m4, m7, m0
pmulhrsw m5, m7, m1
pmulhrsw m6, m7, m2
pmulhrsw m7, m3
paddsw m0, m4
paddsw m1, m5
paddsw m2, m6
paddsw m3, m7
jmp m(idct_16x4_internal_16bpc).end
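; 16x8 is a 2:1 rectangular transform, so the coefficients carry an extra
; 1/sqrt(2) scale (2896 in .12 fixed point); the dc-only path mirrors this
; with an additional imul by 2896 before the shared 16x4 dconly tail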
%macro INV_TXFM_16X8_FN 2 ; type1, type2
INV_TXFM_FN %1, %2, 0, 16x8
%ifidn %1_%2, dct_dct
imul r6d, [cq], 2896
mov [cq], eobd ; 0
mov r3d, 8
add r6d, 2048
sar r6d, 12
imul r6d, 2896
jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly
%endif
%endmacro
INV_TXFM_16X8_FN dct, dct
INV_TXFM_16X8_FN dct, identity
INV_TXFM_16X8_FN dct, adst
INV_TXFM_16X8_FN dct, flipadst
cglobal idct_16x8_internal_16bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
vpbroadcastd m14, [pd_2896]
pmulld m0, m14, [cq+32* 1]
pmulld m1, m14, [cq+32* 3]
pmulld m2, m14, [cq+32* 5]
pmulld m3, m14, [cq+32* 7]
pmulld m4, m14, [cq+32* 9]
pmulld m5, m14, [cq+32*11]
pmulld m6, m14, [cq+32*13]
pmulld m7, m14, [cq+32*15]
vpbroadcastd m11, [pd_2048]
vpbroadcastd m12, [clip_min]
vpbroadcastd m13, [clip_max]
lea r6, [rsp+32*4]
call m(idct_8x16_internal_16bpc).main_oddhalf_rect2
pmulld m0, m14, [cq+32* 0]
pmulld m1, m14, [cq+32* 2]
pmulld m2, m14, [cq+32* 4]
pmulld m3, m14, [cq+32* 6]
pmulld m4, m14, [cq+32* 8]
pmulld m5, m14, [cq+32*10]
pmulld m6, m14, [cq+32*12]
pmulld m7, m14, [cq+32*14]
call m(idct_8x8_internal_16bpc).main_rect2
call m(idct_8x16_internal_16bpc).main_evenhalf
psrld m11, 11 ; pd_1
REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
mova m14, [r6-32*4]
mova m13, [r6-32*3]
mova m12, [r6-32*2]
mova m11, [r6-32*1]
mova m10, [r6+32*0]
mova m9, [r6+32*1]
mova m8, [r6+32*2]
psubd m15, m0, m14 ; out15
paddd m0, m14 ; out0
psubd m14, m1, m13 ; out14
paddd m1, m13 ; out1
psubd m13, m2, m12 ; out13
paddd m2, m12 ; out2
psubd m12, m3, m11 ; out12
paddd m3, m11 ; out3
psubd m11, m4, m10 ; out11
paddd m4, m10 ; out4
psubd m10, m5, m9 ; out10
paddd m5, m9 ; out5
psubd m9, m6, m8 ; out9
paddd m6, m8 ; out6
psubd m8, m7, [r6+32*3] ; out8
paddd m7, [r6+32*3] ; out7
REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14, m15
jmp tx2q
.pass2:
call .transpose
call m(idct_16x8_internal_8bpc).main
vpbroadcastd m10, [pw_2048]
.end:
pmulhrsw m0, m10
pmulhrsw m1, m10
pmulhrsw m2, m10
pmulhrsw m3, m10
call .write_16x4_start
pmulhrsw m0, m4, m10
pmulhrsw m1, m5, m10
pmulhrsw m2, m6, m10
pmulhrsw m3, m7, m10
call .write_16x4_zero
RET
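; 16x8 transpose: pack dwords to words, interleave at word/dword/qword
; granularity, then swap 128-bit lanes to form 8 packed 16-word rows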
ALIGN function_align
.transpose:
lea rax, [deint_shuf+128]
.transpose2:
packssdw m0, m8
packssdw m1, m9
packssdw m2, m10
packssdw m3, m11
packssdw m4, m12
packssdw m5, m13
packssdw m6, m14
packssdw m7, m15
.transpose3:
punpckhwd m8, m0, m1
punpcklwd m0, m1
punpcklwd m1, m2, m3
punpckhwd m2, m3
punpckhwd m3, m4, m5
punpcklwd m4, m5
punpckhwd m5, m6, m7
punpcklwd m6, m7
punpckhdq m7, m4, m6
punpckldq m4, m6
punpckldq m6, m8, m2
punpckhdq m8, m2
punpckhdq m2, m0, m1
punpckldq m0, m1
punpckhdq m1, m3, m5
punpckldq m3, m5
punpcklqdq m5, m6, m3
punpckhqdq m6, m3
punpckhqdq m3, m2, m7
punpcklqdq m2, m7
punpcklqdq m7, m8, m1
punpckhqdq m8, m1
punpckhqdq m1, m0, m4
punpcklqdq m0, m4
vperm2i128 m4, m0, m5, 0x31
vinserti128 m0, xm5, 1
vperm2i128 m5, m1, m6, 0x31
vinserti128 m1, xm6, 1
vperm2i128 m6, m2, m7, 0x31
vinserti128 m2, xm7, 1
vperm2i128 m7, m3, m8, 0x31
vinserti128 m3, xm8, 1
ret
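; write helpers: add 4 rows of residual to the 10bpc destination, clamp to
; [0, pixel_max] and clear the consumed coefficients; _start also sets up
; the zero/max registers and the stride*3 offset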
ALIGN function_align
.write_16x4_start:
vpbroadcastd m9, [pixel_max]
lea r3, [strideq*3]
pxor m8, m8
.write_16x4_zero:
REPX {mova [cq+32*x], m8}, 0, 1, 2, 3, 4, 5, 6, 7
add cq, 32*8
.write_16x4:
paddw m0, [dstq+strideq*0]
paddw m1, [dstq+strideq*1]
paddw m2, [dstq+strideq*2]
paddw m3, [dstq+r3 ]
REPX {pmaxsw x, m8}, m0, m1, m2, m3
REPX {pminsw x, m9}, m0, m1, m2, m3
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
mova [dstq+strideq*2], m2
mova [dstq+r3 ], m3
lea dstq, [dstq+strideq*4]
ret
INV_TXFM_16X8_FN adst, dct
INV_TXFM_16X8_FN adst, adst
INV_TXFM_16X8_FN adst, flipadst
INV_TXFM_16X8_FN adst, identity
cglobal iadst_16x8_internal_16bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
lea r6, [rsp+32*4]
call .main
vpbroadcastd m14, [pd_6144]
psrld m15, 11 ; pd_1
psubd m13, m14, m15 ; pd_6143
paddd m0, m15
psubd m1, m15, m1
paddd m2, m15
psubd m3, m15, m3
paddd m4, m14
psubd m5, m13, m5
paddd m6, m14
psubd m7, m13, m7
paddd m8, m14, m9
psubd m9, m13, m10
paddd m10, m14, m11
psubd m11, m13, m12
paddd m12, m15, [r6-32*1]
psubd m13, m15, [r6-32*2]
paddd m14, m15, [r6-32*3]
psubd m15, [r6-32*4]
.pass1_end:
REPX {psrad x, 1 }, m0, m1, m2, m3, m12, m13, m14, m15
REPX {psrad x, 13}, m4, m5, m6, m7, m8, m9, m10, m11
jmp tx2q
.pass2:
call m(idct_16x8_internal_16bpc).transpose
call m(iadst_16x8_internal_8bpc).main
call m(iadst_16x8_internal_8bpc).main_pass2_end
vpbroadcastd m10, [pw_2048]
pxor m11, m11
psubw m11, m10
pmulhrsw m0, m10
pmulhrsw m1, m11
pmulhrsw m2, m10
pmulhrsw m3, m11
call m(idct_16x8_internal_16bpc).write_16x4_start
pmulhrsw m0, m4, m10
pmulhrsw m1, m5, m11
pmulhrsw m2, m6, m10
pmulhrsw m3, m7, m11
call m(idct_16x8_internal_16bpc).write_16x4_zero
RET
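; adst16 pass 1, split in two: main_part1 processes half of the inputs and
; spills its t values to [r6], main_part2 finishes the butterflies;
; intermediates are clamped to [clip_min, clip_max] throughout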
ALIGN function_align
.main:
vpbroadcastd m15, [pd_2896]
pmulld m0, m15, [cq+32* 2]
pmulld m1, m15, [cq+32*13]
pmulld m2, m15, [cq+32* 6]
pmulld m3, m15, [cq+32* 9]
pmulld m4, m15, [cq+32*10]
pmulld m5, m15, [cq+32* 5]
pmulld m6, m15, [cq+32*14]
pmulld m7, m15, [cq+32* 1]
vpbroadcastd m12, [pd_2048]
vpbroadcastd m13, [clip_min]
vpbroadcastd m14, [clip_max]
REPX {paddd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
call .main_part1
pmulld m0, m15, [cq+32* 0]
pmulld m1, m15, [cq+32*15]
pmulld m2, m15, [cq+32* 4]
pmulld m3, m15, [cq+32*11]
pmulld m4, m15, [cq+32* 8]
pmulld m5, m15, [cq+32* 7]
pmulld m6, m15, [cq+32*12]
pmulld m7, m15, [cq+32* 3]
REPX {paddd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
.main_part2:
ITX_MULSUB_2D 1, 0, 8, 9, 10, 12, 201, 4091
ITX_MULSUB_2D 3, 2, 8, 9, 10, 12, 1751, 3703
ITX_MULSUB_2D 5, 4, 8, 9, 10, 12, 3035, 2751
ITX_MULSUB_2D 7, 6, 8, 9, 10, 12, 3857, 1380
psubd m8, m0, m4 ; t8a
paddd m0, m4 ; t0a
psubd m4, m1, m5 ; t9a
paddd m1, m5 ; t1a
psubd m5, m2, m6 ; t12a
paddd m2, m6 ; t4a
psubd m6, m3, m7 ; t13a
paddd m7, m3 ; t5a
REPX {pmaxsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7
REPX {pminsd x, m14}, m8, m4, m5, m6, m0, m1, m2, m7
vpbroadcastd m11, [pd_4017]
vpbroadcastd m10, [pd_799]
ITX_MULSUB_2D 8, 4, 3, 9, _, 12, 10, 11
ITX_MULSUB_2D 6, 5, 3, 9, _, 12, 11, 10
psubd m3, m0, m2 ; t4
paddd m0, m2 ; t0
psubd m2, m1, m7 ; t5
paddd m1, m7 ; t1
psubd m7, m4, m6 ; t12a
paddd m4, m6 ; t8a
psubd m6, m8, m5 ; t13a
paddd m5, m8 ; t9a
REPX {pmaxsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5
REPX {pminsd x, m14}, m3, m2, m7, m6, m0, m1, m4, m5
vpbroadcastd m11, [pd_3784]
vpbroadcastd m10, [pd_1567]
ITX_MULSUB_2D 3, 2, 8, 9, _, 12, 10, 11
ITX_MULSUB_2D 7, 6, 8, 9, _, 12, 10, 11
pminsd m10, m14, [r6-32*4] ; t2
pminsd m8, m14, [r6-32*3] ; t3
psubd m9, m0, m10 ; t2a
paddd m0, m10 ; out0
psubd m10, m1, m8 ; t3a
paddd m1, m8 ; -out15
pmaxsd m9, m13
pmaxsd m10, m13
pminsd m9, m14
pminsd m10, m14
pmulld m9, m15
pmulld m10, m15
mova [r6-32*4], m1
mova m11, [r6-32*1] ; t7a
mova m1, [r6-32*2] ; t6a
psubd m8, m3, m11 ; t7
paddd m11, m3 ; out12
paddd m3, m2, m1 ; -out3
psubd m2, m1 ; t6
pmaxsd m8, m13
pmaxsd m2, m13
pminsd m8, m14
pminsd m2, m14
pmulld m8, m15
mova [r6-32*1], m11
mova [r6-32*3], m2
mova m1, [r6+32*3] ; t15
mova m2, [r6+32*2] ; t14
paddd m12, m7, m1 ; -out13
psubd m7, m1 ; t15a
psubd m11, m6, m2 ; t14a
paddd m2, m6 ; out2
pmaxsd m7, m13
pmaxsd m11, m13
pminsd m7, m14
pminsd m11, m14
pmulld m7, m15
pmulld m11, m15
mova [r6-32*2], m12
pminsd m1, m14, [r6+32*0] ; t10a
pminsd m12, m14, [r6+32*1] ; t11a
psubd m6, m4, m1 ; t10
paddd m1, m4 ; -out1
psubd m4, m5, m12 ; t11
paddd m5, m12 ; out14
pmulld m12, m15, [r6-32*3] ; t6
pmaxsd m6, m13
pmaxsd m4, m13
pminsd m6, m14
pminsd m4, m14
pmulld m6, m15
pmulld m4, m15
mova [r6-32*3], m5
paddd m5, m11, m7 ; -out5 (unshifted)
psubd m11, m7 ; out10 (unshifted)
paddd m7, m9, m10 ; -out7 (unshifted)
psubd m9, m10 ; out8 (unshifted)
psubd m10, m6, m4 ; -out9 (unshifted)
paddd m6, m4 ; out6 (unshifted)
paddd m4, m12, m8 ; out4 (unshifted)
psubd m12, m8 ; -out11 (unshifted)
ret
.main_part1:
ITX_MULSUB_2D 1, 0, 8, 9, 10, 12, 995, 3973
ITX_MULSUB_2D 3, 2, 8, 9, 10, 12, 2440, 3290
ITX_MULSUB_2D 5, 4, 8, 9, 10, 12, 3513, 2106
ITX_MULSUB_2D 7, 6, 8, 9, 10, 12, 4052, 601
psubd m8, m0, m4 ; t10a
paddd m0, m4 ; t2a
psubd m4, m1, m5 ; t11a
paddd m1, m5 ; t3a
psubd m5, m2, m6 ; t14a
paddd m2, m6 ; t6a
psubd m6, m3, m7 ; t15a
paddd m7, m3 ; t7a
REPX {pmaxsd x, m13}, m8, m4, m5, m6, m0, m1, m2, m7
REPX {pminsd x, m14}, m8, m4, m5, m6, m0, m1, m2, m7
vpbroadcastd m11, [pd_2276]
vpbroadcastd m10, [pd_3406]
ITX_MULSUB_2D 8, 4, 3, 9, _, 12, 10, 11
ITX_MULSUB_2D 6, 5, 3, 9, _, 12, 11, 10
psubd m3, m0, m2 ; t6
paddd m0, m2 ; t2
psubd m2, m1, m7 ; t7
paddd m1, m7 ; t3
psubd m7, m4, m6 ; t14a
paddd m4, m6 ; t10a
psubd m6, m8, m5 ; t15a
paddd m5, m8 ; t11a
REPX {pmaxsd x, m13}, m3, m2, m7, m6, m0, m1, m4, m5
REPX {pminsd x, m14}, m3, m2, m7, m6 ; clip the rest later
vpbroadcastd m11, [pd_1567]
vpbroadcastd m10, [pd_3784]
ITX_MULSUB_2D 2, 3, 8, 9, _, 12, 10, 11
ITX_MULSUB_2D 6, 7, 8, 9, _, 12, 10, 11
mova [r6-32*4], m0
mova [r6-32*3], m1
mova [r6+32*0], m4
mova [r6+32*1], m5
mova [r6-32*2], m2
mova [r6-32*1], m3
mova [r6+32*2], m6
mova [r6+32*3], m7
ret
INV_TXFM_16X8_FN flipadst, dct
INV_TXFM_16X8_FN flipadst, adst
INV_TXFM_16X8_FN flipadst, flipadst
INV_TXFM_16X8_FN flipadst, identity
cglobal iflipadst_16x8_internal_16bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
lea r6, [rsp+32*4]
call m(iadst_16x8_internal_16bpc).main
vpbroadcastd m14, [pd_6144]
psrld m15, 11
psubd m13, m14, m15
psubd m8, m13, m7
paddd m7, m14, m9
paddd m9, m14, m6
psubd m6, m13, m10
psubd m10, m13, m5
paddd m5, m14, m11
paddd m11, m14, m4
psubd m4, m13, m12
psubd m12, m15, m3
paddd m3, m15, [r6-32*1]
paddd m13, m15, m2
psubd m2, m15, [r6-32*2]
psubd m14, m15, m1
mova m1, m15
paddd m15, m0
psubd m0, m1, [r6-32*4]
paddd m1, [r6-32*3]
jmp m(iadst_16x8_internal_16bpc).pass1_end
.pass2:
call m(idct_16x8_internal_16bpc).transpose
call m(iadst_16x8_internal_8bpc).main
call m(iadst_16x8_internal_8bpc).main_pass2_end
vpbroadcastd m10, [pw_2048]
pxor m11, m11
psubw m11, m10
mova m12, m0
pmulhrsw m0, m7, m11
mova m7, m1
pmulhrsw m1, m6, m10
mova m6, m2
pmulhrsw m2, m5, m11
mova m5, m3
pmulhrsw m3, m4, m10
call m(idct_16x8_internal_16bpc).write_16x4_start
pmulhrsw m0, m5, m11
pmulhrsw m1, m6, m10
pmulhrsw m2, m7, m11
pmulhrsw m3, m12, m10
call m(idct_16x8_internal_16bpc).write_16x4_zero
RET
INV_TXFM_16X8_FN identity, dct
INV_TXFM_16X8_FN identity, adst
INV_TXFM_16X8_FN identity, flipadst
INV_TXFM_16X8_FN identity, identity
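; identity16 as a plain multiply: 11586 = 2*5793 ~= 2*sqrt(2) in .12 fixed
; point; after the rect2 prescale (2896, >>12) the >>13 with pd_6144 bias
; also folds in the pass-1 >>1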
cglobal iidentity_16x8_internal_16bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2
vpbroadcastd m15, [pd_2896]
pmulld m0, m15, [cq+32* 0]
pmulld m1, m15, [cq+32* 1]
pmulld m2, m15, [cq+32* 2]
pmulld m3, m15, [cq+32* 3]
pmulld m4, m15, [cq+32* 4]
pmulld m5, m15, [cq+32* 5]
pmulld m6, m15, [cq+32* 6]
pmulld m7, m15, [cq+32* 7]
pmulld m8, m15, [cq+32* 8]
pmulld m9, m15, [cq+32* 9]
pmulld m10, m15, [cq+32*10]
pmulld m11, m15, [cq+32*11]
pmulld m12, m15, [cq+32*12]
pmulld m13, m15, [cq+32*13]
pmulld m14, m15, [cq+32*14]
pmulld m15, [cq+32*15]
mova [rsp], m7
vpbroadcastd m7, [pd_2048]
REPX {paddd x, m7 }, m0, m1, m2, m3, m4, m5, m6, \
m8, m9, m10, m11, m12, m13, m14, m15
paddd m7, [rsp]
REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14, m15
mova [rsp], m15
vpbroadcastd m15, [pd_11586]
REPX {pmulld x, m15}, m0, m1, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14
pmulld m15, [rsp]
mova [rsp], m7
vpbroadcastd m7, [pd_6144]
REPX {paddd x, m7 }, m0, m1, m2, m3, m4, m5, m6, \
m8, m9, m10, m11, m12, m13, m14, m15
paddd m7, [rsp]
REPX {psrad x, 13 }, m0, m1, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14, m15
jmp tx2q
.pass2:
call m(idct_16x8_internal_16bpc).transpose
vpbroadcastd m10, [pw_4096]
jmp m(idct_16x8_internal_16bpc).end
%macro INV_TXFM_16X16_FN 2-3 0 ; type1, type2, eob_offset
INV_TXFM_FN %1, %2, %3, 16x16
%ifidn %1_%2, dct_dct
imul r6d, [cq], 2896
mov [cq], eobd ; 0
mov r3d, 16
add r6d, 10240
sar r6d, 14
jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2
%endif
%endmacro
INV_TXFM_16X16_FN dct, dct
INV_TXFM_16X16_FN dct, identity, 28
INV_TXFM_16X16_FN dct, adst
INV_TXFM_16X16_FN dct, flipadst
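; pass 1 runs the 16-point idct one 8-column half at a time; when eob < 36
; only the left half has coefficients, so the right half is skipped and
; the transpose zeroes the missing registers (.transpose_fast)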
cglobal idct_16x16_internal_16bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
vpbroadcastd m11, [pd_2048]
vpbroadcastd m12, [clip_min]
vpbroadcastd m13, [clip_max]
vpbroadcastd m14, [pd_2896]
lea r6, [rsp+32*4]
sub eobd, 36
jl .fast
add cq, 32
call .main
sub cq, 32
mova m10, [r6-32*4]
mova m9, [r6-32*3]
mova m8, [r6-32*2]
psubd m15, m0, m10 ; out15
paddd m0, m10 ; out0
psubd m10, m1, m9 ; out14
paddd m1, m9 ; out1
psubd m9, m2, m8 ; out13
paddd m2, m8 ; out2
REPX {psrad x, 2}, m0, m1, m2
mova [r6-32*4], m0
mova [r6-32*3], m1
mova [r6-32*2], m2
mova m2, [r6-32*1]
mova m1, [r6+32*0]
mova m0, [r6+32*1]
REPX {psrad x, 2}, m9, m10, m15
psubd m8, m3, m2 ; out12
paddd m3, m2 ; out3
psubd m2, m4, m1 ; out11
paddd m4, m1 ; out4
psubd m1, m5, m0 ; out10
paddd m5, m0 ; out5
REPX {psrad x, 2}, m3, m4, m5
mova [r6-32*1], m3
mova [r6+32*0], m4
mova [r6+32*1], m5
mova m4, [r6+32*2]
mova m3, [r6+32*3]
REPX {psrad x, 2}, m1, m2, m8
psubd m5, m6, m4 ; out9
paddd m6, m4 ; out6
psubd m4, m7, m3 ; out8
paddd m7, m3 ; out7
REPX {psrad x, 2}, m6, m7, m4, m5
mova [r6+32*2], m6
mova [r6+32*3], m7
add r6, 32*8
mova [r6-32*4], m4
mova [r6-32*3], m5
mova [r6-32*2], m1
mova [r6-32*1], m2
mova [r6+32*0], m8
mova [r6+32*1], m9
mova [r6+32*2], m10
mova [r6+32*3], m15
.fast:
add r6, 32*8
call .main
mova m14, [r6-32*4]
mova m13, [r6-32*3]
mova m12, [r6-32*2]
mova m11, [r6-32*1]
mova m10, [r6+32*0]
mova m9, [r6+32*1]
mova m8, [r6+32*2]
psubd m15, m0, m14 ; out15
paddd m0, m14 ; out0
psubd m14, m1, m13 ; out14
paddd m1, m13 ; out1
psubd m13, m2, m12 ; out13
paddd m2, m12 ; out2
psubd m12, m3, m11 ; out12
paddd m3, m11 ; out3
psubd m11, m4, m10 ; out11
paddd m4, m10 ; out4
psubd m10, m5, m9 ; out10
paddd m5, m9 ; out5
psubd m9, m6, m8 ; out9
paddd m6, m8 ; out6
psubd m8, m7, [r6+32*3] ; out8
paddd m7, [r6+32*3] ; out7
sub r6, 32*8
REPX {psrad x, 2}, m0, m1, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14, m15
jmp tx2q
.pass2:
call .transpose
lea rax, [pw_5+128]
mova [rsp], m15
call m(idct_16x16_internal_8bpc).main
mova m1, [rsp+32*1]
.end:
call .write_16x16
RET
ALIGN function_align
.write_16x16:
mova [rsp+gprsize+32*0], m8
mova [rsp+gprsize+32*1], m9
mova [rsp+gprsize+32*2], m12
vpbroadcastd m12, [pw_2048]
pmulhrsw m0, m12
pmulhrsw m1, m12
pmulhrsw m2, m12
pmulhrsw m3, m12
call m(idct_16x8_internal_16bpc).write_16x4_start
pmulhrsw m0, m12, m4
pmulhrsw m1, m12, m5
pmulhrsw m2, m12, m6
pmulhrsw m3, m12, m7
call m(idct_16x8_internal_16bpc).write_16x4_zero
pmulhrsw m0, m12, [rsp+gprsize+32*0]
pmulhrsw m1, m12, [rsp+gprsize+32*1]
pmulhrsw m2, m12, m10
pmulhrsw m3, m12, m11
call m(idct_16x8_internal_16bpc).write_16x4_zero
pmulhrsw m0, m12, [rsp+gprsize+32*2]
pmulhrsw m1, m12, m13
pmulhrsw m2, m12, m14
pmulhrsw m3, m12, m15
jmp m(idct_16x8_internal_16bpc).write_16x4_zero
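; full 16x16 word transpose built from two 8x16 halves, using [r6] as a
; single-register spill slot; _fast transposes only the left half and
; zeroes m8-m15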
ALIGN function_align
.transpose:
test eobd, eobd
jl .transpose_fast
packssdw m8, [r6-32*4]
packssdw m9, [r6-32*3]
packssdw m10, [r6-32*2]
packssdw m11, [r6-32*1]
packssdw m12, [r6+32*0]
packssdw m13, [r6+32*1]
packssdw m14, [r6+32*2]
packssdw m15, [r6+32*3]
sub r6, 32*8
packssdw m0, [r6-32*4]
packssdw m1, [r6-32*3]
packssdw m2, [r6-32*2]
packssdw m3, [r6-32*1]
packssdw m4, [r6+32*0]
packssdw m5, [r6+32*1]
packssdw m6, [r6+32*2]
packssdw m7, [r6+32*3]
mova [r6], m8
punpckhwd m8, m0, m1
punpcklwd m0, m1
punpcklwd m1, m2, m3
punpckhwd m2, m3
punpckhwd m3, m6, m7
punpcklwd m6, m7
punpcklwd m7, m4, m5
punpckhwd m4, m5
punpckldq m5, m8, m2
punpckhdq m8, m2
punpckhdq m2, m0, m1
punpckldq m0, m1
punpckhdq m1, m7, m6
punpckldq m7, m6
punpckhdq m6, m4, m3
punpckldq m4, m3
punpckhqdq m3, m2, m1
punpcklqdq m2, m1
punpckhqdq m1, m0, m7
punpcklqdq m0, m7
punpcklqdq m7, m8, m6
punpckhqdq m8, m6
punpckhqdq m6, m5, m4
punpcklqdq m5, m4
mova m4, [r6]
mova [r6], m8
punpcklwd m8, m4, m9
punpckhwd m4, m9
punpcklwd m9, m10, m11
punpckhwd m10, m11
punpckhwd m11, m14, m15
punpcklwd m14, m15
punpckhwd m15, m12, m13
punpcklwd m12, m13
punpckldq m13, m4, m10
punpckhdq m4, m10
punpckhdq m10, m8, m9
punpckldq m8, m9
punpckhdq m9, m12, m14
punpckldq m12, m14
punpckhdq m14, m15, m11
punpckldq m15, m11
punpckhqdq m11, m10, m9
punpcklqdq m10, m9
punpckhqdq m9, m8, m12
punpcklqdq m8, m12
punpcklqdq m12, m13, m15
punpckhqdq m13, m15
punpckhqdq m15, m4, m14
punpcklqdq m14, m4, m14
vperm2i128 m4, m0, m8, 0x31
vinserti128 m0, xm8, 1
vinserti128 m8, m5, xm12, 1
vperm2i128 m12, m5, 0x13
vperm2i128 m5, m1, m9, 0x31
vinserti128 m1, xm9, 1
vinserti128 m9, m6, xm13, 1
vperm2i128 m13, m6, 0x13
vperm2i128 m6, m2, m10, 0x31
vinserti128 m2, xm10, 1
vinserti128 m10, m7, xm14, 1
vperm2i128 m14, m7, 0x13
vperm2i128 m7, m3, m11, 0x31
vinserti128 m3, xm11, 1
mova xm11, [r6]
vinserti128 m11, xm15, 1
vinserti128 m15, [r6+16], 0
ret
.transpose_fast:
call m(idct_16x8_internal_16bpc).transpose2
pxor m8, m8
REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
ret
ALIGN function_align
.main:
mova m0, [cq+64* 1]
mova m1, [cq+64* 3]
mova m2, [cq+64* 5]
mova m3, [cq+64* 7]
mova m4, [cq+64* 9]
mova m5, [cq+64*11]
mova m6, [cq+64*13]
mova m7, [cq+64*15]
call m(idct_8x16_internal_16bpc).main_oddhalf
mova m0, [cq+64* 0]
mova m1, [cq+64* 2]
mova m2, [cq+64* 4]
mova m3, [cq+64* 6]
mova m4, [cq+64* 8]
mova m5, [cq+64*10]
mova m6, [cq+64*12]
mova m7, [cq+64*14]
call m(idct_8x8_internal_16bpc).main
call m(idct_8x16_internal_16bpc).main_evenhalf
psrld m10, m11, 10 ; pd_2
REPX {paddd x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
ret
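; adst 16x16 pass-1 rounding: pd_10240 (2048 + 8192) biases outputs for
; the >>14 shift; pd_10239 (= pd_10240 - 1) is used with psubd so that
; negating an odd output and rounding it happens in a single op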
INV_TXFM_16X16_FN adst, dct
INV_TXFM_16X16_FN adst, adst
INV_TXFM_16X16_FN adst, flipadst
cglobal iadst_16x16_internal_16bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
vpbroadcastd m13, [clip_min]
vpbroadcastd m14, [clip_max]
vpbroadcastd m15, [pd_2896]
lea r6, [rsp+32*4]
sub eobd, 36
jl .fast
add cq, 32
call .main
sub cq, 32
vpbroadcastd m8, [pd_10240]
paddd m4, m8
paddd m6, m8
paddd m9, m8
paddd m11, m8
vpbroadcastd m8, [pd_10239]
psubd m5, m8, m5
psubd m7, m8, m7
psubd m10, m8, m10
psubd m12, m8, m12
REPX {psrad x, 14}, m4, m5, m6, m7, m9, m10, m11, m12
mova [r6+32*0], m4
mova [r6+32*1], m5
mova [r6+32*2], m6
mova [r6+32*3], m7
psrld m4, m15, 10 ; pd_2
paddd m0, m4
psubd m1, m4, m1
paddd m2, m4
psubd m3, m4, m3
psubd m7, m4, [r6-32*4]
paddd m6, m4, [r6-32*3]
psubd m5, m4, [r6-32*2]
paddd m4, [r6-32*1]
REPX {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
mova [r6-32*4], m0
mova [r6-32*3], m1
mova [r6-32*2], m2
mova [r6-32*1], m3
add r6, 32*8
mova [r6-32*4], m9
mova [r6-32*3], m10
mova [r6-32*2], m11
mova [r6-32*1], m12
mova [r6+32*0], m4
mova [r6+32*1], m5
mova [r6+32*2], m6
mova [r6+32*3], m7
.fast:
add r6, 32*8
call .main
vpbroadcastd m14, [pd_10240]
vpbroadcastd m13, [pd_10239]
psrld m15, 10 ; pd_2
paddd m0, m15
psubd m1, m15, m1
paddd m2, m15
psubd m3, m15, m3
paddd m4, m14
psubd m5, m13, m5
paddd m6, m14
psubd m7, m13, m7
paddd m8, m14, m9
psubd m9, m13, m10
paddd m10, m14, m11
psubd m11, m13, m12
paddd m12, m15, [r6-32*1]
psubd m13, m15, [r6-32*2]
paddd m14, m15, [r6-32*3]
psubd m15, [r6-32*4]
.pass1_end:
REPX {psrad x, 2 }, m0, m1, m2, m3, m12, m13, m14, m15
REPX {psrad x, 14}, m4, m5, m6, m7, m8, m9, m10, m11
sub r6, 32*8
jmp tx2q
.pass2:
call m(idct_16x16_internal_16bpc).transpose
lea rax, [pw_5+128]
mova [rsp], m15
call m(iadst_16x16_internal_8bpc).main
call m(iadst_16x16_internal_8bpc).main_pass2_end
mova [rsp+32*0], m8
mova [rsp+32*2], m12
mova [rsp+32*3], m13
vpbroadcastd m12, [pw_2048]
pxor m13, m13
psubw m13, m12
pmulhrsw m0, m12
pmulhrsw m1, m13, [rsp+32*1]
mova [rsp+32*1], m9
pmulhrsw m2, m12
pmulhrsw m3, m13
call m(idct_16x8_internal_16bpc).write_16x4_start
pmulhrsw m0, m12, m4
pmulhrsw m1, m13, m5
pmulhrsw m2, m12, m6
pmulhrsw m3, m13, m7
call m(idct_16x8_internal_16bpc).write_16x4_zero
pmulhrsw m0, m12, [rsp+32*0]
pmulhrsw m1, m13, [rsp+32*1]
pmulhrsw m2, m12, m10
pmulhrsw m3, m13, m11
call m(idct_16x8_internal_16bpc).write_16x4_zero
pmulhrsw m0, m12, [rsp+32*2]
pmulhrsw m1, m13, [rsp+32*3]
pmulhrsw m2, m12, m14
pmulhrsw m3, m13, m15
call m(idct_16x8_internal_16bpc).write_16x4_zero
RET
ALIGN function_align
.main:
mova m0, [cq+64* 2]
mova m1, [cq+64*13]
mova m2, [cq+64* 6]
mova m3, [cq+64* 9]
mova m4, [cq+64*10]
mova m5, [cq+64* 5]
mova m6, [cq+64*14]
mova m7, [cq+64* 1]
vpbroadcastd m12, [pd_2048]
call m(iadst_16x8_internal_16bpc).main_part1
mova m0, [cq+64* 0]
mova m1, [cq+64*15]
mova m2, [cq+64* 4]
mova m3, [cq+64*11]
mova m4, [cq+64* 8]
mova m5, [cq+64* 7]
mova m6, [cq+64*12]
mova m7, [cq+64* 3]
jmp m(iadst_16x8_internal_16bpc).main_part2
INV_TXFM_16X16_FN flipadst, dct
INV_TXFM_16X16_FN flipadst, adst
INV_TXFM_16X16_FN flipadst, flipadst
cglobal iflipadst_16x16_internal_16bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
vpbroadcastd m13, [clip_min]
vpbroadcastd m14, [clip_max]
vpbroadcastd m15, [pd_2896]
lea r6, [rsp+32*4]
sub eobd, 36
jl .fast
add cq, 32
call m(iadst_16x16_internal_16bpc).main
sub cq, 32
vpbroadcastd m8, [pd_10240]
paddd m11, m8
paddd m9, m8
paddd m6, m8
paddd m4, m8
vpbroadcastd m8, [pd_10239]
psubd m12, m8, m12
psubd m10, m8, m10
psubd m7, m8, m7
psubd m5, m8, m5
REPX {psrad x, 14}, m12, m11, m10, m9, m7, m6, m5, m4
mova [r6+32*0], m12
mova [r6+32*1], m11
mova [r6+32*2], m10
mova [r6+32*3], m9
psrld m9, m15, 10 ; pd_2
psubd m3, m9, m3
paddd m2, m9
psubd m1, m9, m1
paddd m0, m9
psubd m12, m9, [r6-32*4]
paddd m11, m9, [r6-32*3]
psubd m10, m9, [r6-32*2]
paddd m9, [r6-32*1]
REPX {psrad x, 2 }, m12, m11, m10, m9, m3, m2, m1, m0
mova [r6-32*4], m12
mova [r6-32*3], m11
mova [r6-32*2], m10
mova [r6-32*1], m9
add r6, 32*8
mova [r6-32*4], m7
mova [r6-32*3], m6
mova [r6-32*2], m5
mova [r6-32*1], m4
mova [r6+32*0], m3
mova [r6+32*1], m2
mova [r6+32*2], m1
mova [r6+32*3], m0
.fast:
add r6, 32*8
call m(iadst_16x16_internal_16bpc).main
vpbroadcastd m14, [pd_10240]
vpbroadcastd m13, [pd_10239]
psrld m15, 10 ; pd_2
psubd m8, m13, m7
paddd m7, m14, m9
paddd m9, m14, m6
psubd m6, m13, m10
psubd m10, m13, m5
paddd m5, m14, m11
paddd m11, m14, m4
psubd m4, m13, m12
psubd m12, m15, m3
paddd m3, m15, [r6-32*1]
paddd m13, m15, m2
psubd m2, m15, [r6-32*2]
psubd m14, m15, m1
mova m1, m15
paddd m15, m0
psubd m0, m1, [r6-32*4]
paddd m1, [r6-32*3]
jmp m(iadst_16x16_internal_16bpc).pass1_end
.pass2:
call m(idct_16x16_internal_16bpc).transpose
lea rax, [pw_5+128]
mova [rsp], m15
call m(iadst_16x16_internal_8bpc).main
call m(iadst_16x16_internal_8bpc).main_pass2_end
mova [rsp+32*3], m3
mova [rsp+32*2], m2
mova [rsp+32*0], m0
mova m2, m13
mova m3, m12
vpbroadcastd m12, [pw_2048]
pxor m13, m13
psubw m13, m12
pmulhrsw m0, m13, m15
pmulhrsw m1, m12, m14
pmulhrsw m2, m13
pmulhrsw m3, m12
mova m14, m8
mova m15, m9
call m(idct_16x8_internal_16bpc).write_16x4_start
pmulhrsw m0, m13, m11
pmulhrsw m1, m12, m10
pmulhrsw m2, m13, m15
pmulhrsw m3, m12, m14
call m(idct_16x8_internal_16bpc).write_16x4_zero
pmulhrsw m0, m13, m7
pmulhrsw m1, m12, m6
pmulhrsw m2, m13, m5
pmulhrsw m3, m12, m4
call m(idct_16x8_internal_16bpc).write_16x4_zero
pmulhrsw m0, m13, [rsp+32*3]
pmulhrsw m1, m12, [rsp+32*2]
pmulhrsw m2, m13, [rsp+32*1]
pmulhrsw m3, m12, [rsp+32*0]
call m(idct_16x8_internal_16bpc).write_16x4_zero
RET
INV_TXFM_16X16_FN identity, dct, -92
INV_TXFM_16X16_FN identity, identity
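; both identity16 passes scale by 2*sqrt(2); pass 1 multiplies by 11586
; (2*sqrt(2) in .12) and shifts by 14 with a pd_10240 bias, folding the
; pass-1 >>2 into the fixed-point downshift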
cglobal iidentity_16x16_internal_16bpc, 0, 7, 16, 32*24, dst, stride, c, eob, tx2
vpbroadcastd m15, [pd_11586]
vpbroadcastd m7, [pd_10240]
lea r6, [rsp+32*4]
sub eobd, 36
jl .fast
mov r3, -32*8*4
.righthalf:
pmulld m0, m15, [cq+r3+32*33]
pmulld m1, m15, [cq+r3+32*35]
pmulld m2, m15, [cq+r3+32*37]
pmulld m3, m15, [cq+r3+32*39]
add r6, 32*4
REPX {paddd x, m7}, m0, m1, m2, m3
REPX {psrad x, 14}, m0, m1, m2, m3
mova [r6+32*0], m0
mova [r6+32*1], m1
mova [r6+32*2], m2
mova [r6+32*3], m3
add r3, 32*8
jl .righthalf
.fast:
pmulld m0, m15, [cq+64* 0]
pmulld m1, m15, [cq+64* 1]
pmulld m2, m15, [cq+64* 2]
pmulld m3, m15, [cq+64* 3]
pmulld m4, m15, [cq+64* 4]
pmulld m5, m15, [cq+64* 5]
pmulld m6, m15, [cq+64* 6]
pmulld m8, m15, [cq+64* 7]
mova [cq], m8
pmulld m8, m15, [cq+64* 8]
pmulld m9, m15, [cq+64* 9]
pmulld m10, m15, [cq+64*10]
pmulld m11, m15, [cq+64*11]
pmulld m12, m15, [cq+64*12]
pmulld m13, m15, [cq+64*13]
pmulld m14, m15, [cq+64*14]
pmulld m15, [cq+64*15]
REPX {paddd x, m7}, m0, m1, m2, m3, m4, m5, m6, \
m8, m9, m10, m11, m12, m13, m14, m15
paddd m7, [cq]
REPX {psrad x, 14}, m0, m1, m2, m3, m4, m5, m6, m7, \
m8, m9, m10, m11, m12, m13, m14, m15
jmp tx2q
.pass2:
call m(idct_16x16_internal_16bpc).transpose
mova [cq+32*0], m15
mova [cq+32*1], m0
vpbroadcastd m15, [pw_1697x16]
REPX {IDTX16 x, 0, 15}, 1, 2, 3, 4, 5, 6, 7, \
8, 9, 10, 11, 12, 13, 14
mova m0, [cq+32*1]
mova [cq+32*1], m1
IDTX16 0, 1, 15
mova m1, [cq+32*0]
pmulhrsw m15, m1
paddsw m1, m1
paddsw m15, m1
mova m1, [cq+32*1]
jmp m(idct_16x16_internal_16bpc).end
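; IDCT32_END: final idct32 stage; folds an idct16 output (in/out1) with
; the stored odd-half rows at r4/r5/r6, clips, rounds (m11) and shifts,
; then packs the results as word pairs (out0+n|out16+n, out15-n|out31-n)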
%macro IDCT32_END 6 ; in/out1, out2, tmp[1-3], shift
mova m%4, [r6+32*(%1-4)]
mova m%2, [r5+32*(3-%1)]
mova m%5, [r4+32*(%1-4)]
psubd m%3, m%1, m%4 ; idct16 out15 - n
paddd m%1, m%4 ; idct16 out0 + n
pmaxsd m%1, m12
pmaxsd m%3, m12
pminsd m%1, m13
pminsd m%3, m13
paddd m%1, m11
paddd m%3, m11
psubd m%4, m%1, m%2 ; out31 - n
paddd m%1, m%2 ; out0 + n
paddd m%2, m%3, m%5 ; out15 - n
psubd m%3, m%5 ; out16 + n
REPX {psrad x, %6}, m%1, m%3, m%2, m%4
packssdw m%1, m%3 ; out0 + n, out16 + n
packssdw m%2, m%4 ; out15 - n, out31 - n
%endmacro
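; 8x32 dct: pass 1 handles the coefficients in 8x8 groups, with the eob
; thresholds (43/107/171) deciding how many groups are populated; pass 2
; reuses the 8bpc avx2 idct32 (main_fast when the upper rows are all zero)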
cglobal inv_txfm_add_dct_dct_8x32_16bpc, 4, 7, 0, dst, stride, c, eob
test eobd, eobd
jz .dconly
PROLOGUE 0, 7, 16, 32*12, dst, stride, c, eob
%undef cmp
vpbroadcastd m11, [pd_2048]
vpbroadcastd m12, [clip_min]
vpbroadcastd m13, [clip_max]
vbroadcasti128 m14, [idct32_shuf]
mov r4, cq
call .pass1_main
mova [rsp+32*0], m2
mova [rsp+32*1], m3
cmp eobd, 43
jge .eob43
pxor m4, m4
REPX {mova x, m4}, [rsp+32*2], m2, m3, m11
jmp .pass1_end_fast
.eob43:
lea r6, [rsp+32*8]
mova [r6-32*4], m0
mova [r6-32*3], m1
call .pass1_main
mova [rsp+32*2], m2
cmp eobd, 107
jge .eob107
mova m11, m3
mova m2, m0
mova m3, m1
mova m0, [r6-32*4]
mova m1, [r6-32*3]
pxor m4, m4
.pass1_end_fast:
vpbroadcastd m10, [pw_2048]
lea rax, [deint_shuf+128]
REPX {mova x, m4}, m5, m6, m7
call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast
jmp .end
.eob107:
mova [rsp+32*3], m3
mova [r6-32*2], m0
mova [r6-32*1], m1
call .pass1_main
cmp eobd, 171
jge .eob171
pshufd m12, m2, q1032
pshufd m13, m3, q1032
mova m4, m0
mova m5, m1
pxor m6, m6
REPX {mova x, m6}, m7, m14, m15
jmp .pass1_end
.eob171:
mova [r6+32*0], m0
mova [r6+32*1], m1
mova [r6+32*2], m2
mova [r6+32*3], m3
call .pass1_main
pshufd m12, [r6+32*2], q1032 ; out19 out17
pshufd m13, [r6+32*3], q1032 ; out23 out21
mova m4, [r6+32*0] ; out16 out18
mova m5, [r6+32*1] ; out20 out22
pshufd m14, m2, q1032 ; out27 out25
pshufd m15, m3, q1032 ; out31 out29
mova m6, m0 ; out24 out26
mova m7, m1 ; out28 out30
.pass1_end:
mova m0, [r6-32*4] ; out0 out2
mova m1, [r6-32*3] ; out4 out6
mova m2, [r6-32*2] ; out8 out10
mova m3, [r6-32*1] ; out12 out14
lea rax, [deint_shuf+128]
mova m11, [rsp+32*3] ; out13 out15
vpbroadcastd m10, [pw_2048]
call m(inv_txfm_add_dct_dct_8x32_8bpc).main
.end: ; [rsp+0*32] = m12
vpbroadcastd m12, [pw_2048]
mov cq, r4
mova [rsp+32*1], m8
mova [rsp+32*2], m9
mova [rsp+32*3], m10
mova [rsp+32*4], m11
vpermq m0, m0, q3120
vpermq m1, m1, q2031
pmulhrsw m0, m12
pmulhrsw m1, m12
call m(idct_8x8_internal_16bpc).write_8x4_start
vpermq m0, m2, q3120
vpermq m1, m3, q2031
pmulhrsw m0, m12
pmulhrsw m1, m12
call m(idct_8x8_internal_16bpc).write_8x4
vpermq m0, m4, q3120
vpermq m1, m5, q2031
pmulhrsw m0, m12
pmulhrsw m1, m12
call m(idct_8x8_internal_16bpc).write_8x4
vpermq m0, m6, q3120
vpermq m1, m7, q2031
pmulhrsw m0, m12
pmulhrsw m1, m12
call m(idct_8x8_internal_16bpc).write_8x4
vpermq m0, [rsp+32*1], q3120
vpermq m1, [rsp+32*2], q2031
pmulhrsw m0, m12
pmulhrsw m1, m12
call m(idct_8x8_internal_16bpc).write_8x4
vpermq m0, [rsp+32*3], q3120
vpermq m1, [rsp+32*4], q2031
pmulhrsw m0, m12
pmulhrsw m1, m12
call m(idct_8x8_internal_16bpc).write_8x4
vpermq m0, [rsp+32*0], q3120
vpermq m1, m13, q2031
pmulhrsw m0, m12
pmulhrsw m1, m12
call m(idct_8x8_internal_16bpc).write_8x4
vpermq m0, m14, q3120
vpermq m1, m15, q2031
pmulhrsw m0, m12
pmulhrsw m1, m12
call m(idct_8x8_internal_16bpc).write_8x4
RET
.dconly:
imul r6d, [cq], 2896
mov [cq], eobd ; 0
mov r3d, 32
add r6d, 10240
sar r6d, 14
jmp m(inv_txfm_add_dct_dct_8x8_16bpc).dconly2
ALIGN function_align
.pass1_main:
mova m0, [cq+128*0]
mova m1, [cq+128*1]
mova m2, [cq+128*2]
mova m3, [cq+128*3]
mova m4, [cq+128*4]
mova m5, [cq+128*5]
mova m6, [cq+128*6]
mova m7, [cq+128*7]
add cq, 32
call m(idct_8x8_internal_16bpc).main
psrld m1, m11, 10 ; pd_2
REPX {paddd x, m1}, m0, m6, m5, m3
paddd m1, m6, m7 ; out1
psubd m6, m7 ; out6
psubd m7, m0, m9 ; out7
paddd m0, m9 ; out0
paddd m2, m5, m4 ; out2
psubd m5, m4 ; out5
psubd m4, m3, m8 ; out4
paddd m3, m8 ; out3
REPX {psrad x, 2 }, m0, m1, m2, m3, m4, m5, m6, m7
packssdw m0, m1
packssdw m2, m3
packssdw m4, m5
packssdw m6, m7
pshufb m0, m14
pshufb m2, m14
pshufb m4, m14
pshufb m6, m14
punpckhdq m3, m0, m2
punpckldq m0, m2
punpckldq m2, m4, m6
punpckhdq m4, m6
vperm2i128 m1, m0, m2, 0x31 ; 4 6
vinserti128 m0, xm2, 1 ; 0 2
vinserti128 m2, m3, xm4, 1 ; 1 3
vperm2i128 m3, m4, 0x31 ; 5 7
ret
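; idct32 odd half (t16..t31) in two parts; the _fast variants assume the
; lower half of the inputs is zero, so each initial rotation collapses to
; two plain multiplies, and the _rect2 variants first round the
; 2896-prescaled (rectangular) inputs back down by 12 bits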
.main_oddhalf_part1_fast_rect2:
REPX {paddd x, m11}, m0, m1, m2, m3
REPX {psrad x, 12 }, m0, m1, m2, m3
.main_oddhalf_part1_fast: ; lower half zero
vpbroadcastd m7, [pd_4091]
vpbroadcastd m8, [pd_201]
vpbroadcastd m6, [pd_m1380]
vpbroadcastd m9, [pd_3857]
vpbroadcastd m5, [pd_3703]
vpbroadcastd m10, [pd_1751]
vpbroadcastd m4, [pd_m2751]
vpbroadcastd m15, [pd_3035]
pmulld m7, m0
pmulld m0, m8
pmulld m6, m1
pmulld m1, m9
pmulld m5, m2
pmulld m2, m10
pmulld m4, m3
pmulld m3, m15
jmp .main_oddhalf_part1_fast2
.main_oddhalf_part1_rect2:
REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
.main_oddhalf_part1: ; in1, in7, in9, in15, in17, in23, in25, in31
ITX_MULSUB_2D 0, 7, 8, 9, 10, _, 201, 4091 ; t16a, t31a
ITX_MULSUB_2D 6, 1, 8, 9, 10, _, 3857, 1380 ; t19a, t28a
ITX_MULSUB_2D 2, 5, 8, 9, 10, _, 1751, 3703 ; t18a, t29a
ITX_MULSUB_2D 4, 3, 8, 9, 10, _, 3035, 2751 ; t17a, t30a
.main_oddhalf_part1_fast2:
REPX {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3
REPX {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3
psubd m8, m0, m4 ; t17
paddd m0, m4 ; t16
psubd m4, m6, m2 ; t18
paddd m6, m2 ; t19
psubd m2, m1, m5 ; t29
paddd m1, m5 ; t28
psubd m5, m7, m3 ; t30
paddd m7, m3 ; t31
REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7
REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7
vpbroadcastd m15, [pd_4017]
vpbroadcastd m10, [pd_799]
ITX_MULSUB_2D 5, 8, 3, 9, _, 11, 10, 15 ; t17a, t30a
ITX_MULSUB_2D 2, 4, 3, 9, _, 11, 10, 15, 4 ; t29a, t18a
psubd m3, m0, m6 ; t19a
paddd m0, m6 ; t16a
psubd m6, m7, m1 ; t28a
paddd m7, m1 ; t31a
psubd m1, m5, m4 ; t18
paddd m5, m4 ; t17
psubd m4, m8, m2 ; t29
paddd m8, m2 ; t30
REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8
REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8
vpbroadcastd m15, [pd_3784]
vpbroadcastd m10, [pd_1567]
ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15 ; t18a, t29a
ITX_MULSUB_2D 6, 3, 2, 9, _, 11, 10, 15 ; t19, t28
mova [r6-32*4], m0
mova [r6-32*3], m5
mova [r6-32*2], m4
mova [r6-32*1], m6
mova [r6+32*0], m3
mova [r6+32*1], m1
mova [r6+32*2], m8
mova [r6+32*3], m7
ret
.main_oddhalf_part2_fast_rect2:
REPX {paddd x, m11}, m0, m1, m2, m3
REPX {psrad x, 12 }, m0, m1, m2, m3
.main_oddhalf_part2_fast: ; lower half zero
vpbroadcastd m7, [pd_m601]
vpbroadcastd m8, [pd_4052]
vpbroadcastd m6, [pd_3973]
vpbroadcastd m9, [pd_995]
vpbroadcastd m5, [pd_m2106]
vpbroadcastd m10, [pd_3513]
vpbroadcastd m4, [pd_3290]
vpbroadcastd m15, [pd_2440]
pmulld m7, m0
pmulld m0, m8
pmulld m6, m1
pmulld m1, m9
pmulld m5, m2
pmulld m2, m10
pmulld m4, m3
pmulld m3, m15
jmp .main_oddhalf_part2_fast2
.main_oddhalf_part2_rect2:
REPX {paddd x, m11}, m0, m1, m2, m3, m4, m5, m6, m7
REPX {psrad x, 12 }, m0, m1, m2, m3, m4, m5, m6, m7
.main_oddhalf_part2: ; in3, in5, in11, in13, in19, in21, in27, in29
ITX_MULSUB_2D 7, 0, 8, 9, 10, _, 4052, 601 ; t23a, t24a
ITX_MULSUB_2D 1, 6, 8, 9, 10, _, 995, 3973 ; t20a, t27a
ITX_MULSUB_2D 5, 2, 8, 9, 10, _, 3513, 2106 ; t21a, t26a
ITX_MULSUB_2D 3, 4, 8, 9, 10, _, 2440, 3290 ; t22a, t25a
.main_oddhalf_part2_fast2:
REPX {paddd x, m11}, m0, m7, m6, m1, m2, m5, m4, m3
REPX {psrad x, 12 }, m0, m4, m6, m2, m1, m5, m7, m3
psubd m8, m0, m4 ; t25
paddd m0, m4 ; t24
psubd m4, m6, m2 ; t26
paddd m6, m2 ; t27
psubd m2, m1, m5 ; t21
paddd m1, m5 ; t20
psubd m5, m7, m3 ; t22
paddd m7, m3 ; t23
REPX {pmaxsd x, m12}, m8, m5, m4, m2, m0, m6, m1, m7
REPX {pminsd x, m13}, m8, m5, m4, m2, m0, m6, m1, m7
vpbroadcastd m15, [pd_2276]
vpbroadcastd m10, [pd_3406]
ITX_MULSUB_2D 4, 2, 3, 9, _, 11, 10, 15 ; t21a, t26a
ITX_MULSUB_2D 8, 5, 3, 9, _, 11, 10, 15, 4 ; t25a, t22a
psubd m3, m0, m6 ; t27a
paddd m0, m6 ; t24a
psubd m6, m7, m1 ; t20a
paddd m7, m1 ; t23a
psubd m1, m5, m4 ; t21
paddd m5, m4 ; t22
psubd m4, m8, m2 ; t26
paddd m8, m2 ; t25
REPX {pmaxsd x, m12}, m3, m6, m1, m4, m0, m7, m5, m8
REPX {pminsd x, m13}, m3, m6, m1, m4, m0, m7, m5, m8
vpbroadcastd m15, [pd_3784]
vpbroadcastd m10, [pd_1567]
ITX_MULSUB_2D 4, 1, 2, 9, _, 11, 10, 15, 4 ; t26a, t21a
ITX_MULSUB_2D 3, 6, 2, 9, _, 11, 10, 15, 4 ; t27, t20
mova m9, [r6-32*4] ; t16a
mova m10, [r6-32*3] ; t17
psubd m2, m9, m7 ; t23
paddd m9, m7 ; t16
psubd m7, m10, m5 ; t22a
paddd m10, m5 ; t17a
REPX {pmaxsd x, m12}, m9, m10, m2, m7
REPX {pminsd x, m13}, m9, m10, m2, m7
mova [r6-32*4], m9
mova [r6-32*3], m10
mova m9, [r6-32*2] ; t18a
mova m10, [r6-32*1] ; t19
psubd m5, m9, m1 ; t21
paddd m9, m1 ; t18
psubd m1, m10, m6 ; t20a
paddd m10, m6 ; t19a
REPX {pmaxsd x, m12}, m9, m10, m5, m1
REPX {pminsd x, m13}, m9, m10, m5, m1
mova [r6-32*2], m9
mova [r6-32*1], m10
mova m9, [r6+32*0] ; t28
mova m10, [r6+32*1] ; t29a
psubd m6, m9, m3 ; t27a
paddd m9, m3 ; t28a
psubd m3, m10, m4 ; t26
paddd m10, m4 ; t29
REPX {pmaxsd x, m12}, m9, m10, m6, m3
REPX {pminsd x, m13}, m9, m10, m6, m3
REPX {pmulld x, m14}, m6, m3, m1, m5
paddd m6, m11
paddd m3, m11
psubd m4, m6, m1 ; t20
paddd m6, m1 ; t27
psubd m1, m3, m5 ; t21a
paddd m3, m5 ; t26a
REPX {psrad x, 12 }, m4, m1, m3, m6
mova [r6+32*0], m4
mova [r6+32*1], m1
mova m4, [r6+32*2] ; t30
mova m1, [r6+32*3] ; t31a
psubd m5, m4, m8 ; t25a
paddd m4, m8 ; t30a
psubd m8, m1, m0 ; t24
paddd m1, m0 ; t31
REPX {pmaxsd x, m12}, m8, m5, m4, m1
REPX {pminsd x, m13}, m8, m5, m4, m1
REPX {pmulld x, m14}, m5, m8, m7, m2
paddd m5, m11
paddd m8, m11
psubd m0, m5, m7 ; t22
paddd m5, m7 ; t25
psubd m7, m8, m2 ; t23a
paddd m2, m8 ; t24a
REPX {psrad x, 12 }, m0, m7, m2, m5
mova [r6+32*2], m0
mova [r6+32*3], m7
mov r4, r6
add r6, 32*8
mova [r6-32*4], m2
mova [r6-32*3], m5
mova [r6-32*2], m3
mova [r6-32*1], m6
mova [r6+32*0], m9
mova [r6+32*1], m10
mova [r6+32*2], m4
mova [r6+32*3], m1
mov r5, r6
add r6, 32*8
ret
ALIGN function_align
.main_end:
psrld m11, 10 ; pd_2
IDCT32_END 0, 15, 8, 9, 10, 2
IDCT32_END 1, 14, 8, 9, 10, 2
punpckhwd m8, m0, m1 ; 16 17
punpcklwd m0, m1 ; 0 1
punpcklwd m1, m14, m15 ; 14 15
punpckhwd m14, m15 ; 30 31
mova [r5+32*3], m8
mova [r5+32*2], m14
IDCT32_END 2, 15, 8, 9, 10, 2
IDCT32_END 3, 14, 8, 9, 10, 2
punpckhwd m8, m2, m3 ; 18 19
punpcklwd m2, m3 ; 2 3
punpcklwd m3, m14, m15 ; 12 13
punpckhwd m14, m15 ; 28 29
mova [r5+32*1], m8
mova [r5+32*0], m14
IDCT32_END 4, 15, 8, 9, 10, 2
IDCT32_END 5, 14, 8, 9, 10, 2
punpckhwd m8, m4, m5 ; 20 21
punpcklwd m4, m5 ; 4 5
punpcklwd m5, m14, m15 ; 10 11
punpckhwd m14, m15 ; 26 27
mova [r5-32*1], m8
mova [r5-32*2], m14
IDCT32_END 6, 15, 8, 9, 10, 2
IDCT32_END 7, 14, 8, 9, 10, 2
punpckhwd m8, m6, m7 ; 22 23
punpcklwd m6, m7 ; 6 7
punpcklwd m7, m14, m15 ; 8 9
punpckhwd m14, m15 ; 24 25
mova [r5-32*3], m8
mova [r5-32*4], m14
.transpose:
punpckhdq m15, m3, m1
punpckldq m3, m1
punpckhdq m1, m4, m6
punpckldq m4, m6
punpckhdq m6, m0, m2
punpckldq m0, m2
punpckhdq m2, m7, m5
punpckldq m7, m5
punpcklqdq m5, m2, m15
punpckhqdq m2, m15
punpckhqdq m15, m7, m3
punpcklqdq m7, m3
punpckhqdq m3, m6, m1
punpcklqdq m6, m1
punpckhqdq m1, m0, m4
punpcklqdq m0, m4
vperm2i128 m4, m0, m7, 0x31
vinserti128 m0, xm7, 1
vperm2i128 m7, m3, m2, 0x31
vinserti128 m3, xm2, 1
vinserti128 m2, m6, xm5, 1
vperm2i128 m6, m5, 0x31
vperm2i128 m5, m1, m15, 0x31
vinserti128 m1, xm15, 1
ret
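; identity 8x32: the combined scale of both identity passes is a power of
; two here, so the whole transform reduces to a biased shift per pass-1
; group (paddsw pw_5, psraw 3) plus a transpose and clipped add to dst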
cglobal inv_txfm_add_identity_identity_8x32_16bpc, 4, 7, 8, dst, stride, c, eob
vpbroadcastd m5, [pw_5]
vpbroadcastd m7, [pixel_max]
pxor m6, m6
mov r6d, eobd
add eobb, 21
cmovc eobd, r6d ; 43, 107, 171 -> 64, 128, 192
lea r6, [strideq*3]
lea r5, [strideq*5]
lea r4, [strideq+r6*2] ; strideq*7
.loop:
mova m0, [cq+128*0]
packssdw m0, [cq+128*1]
mova m1, [cq+128*2]
packssdw m1, [cq+128*3]
mova m2, [cq+128*4]
packssdw m2, [cq+128*5]
mova m3, [cq+128*6]
packssdw m3, [cq+128*7]
REPX {paddsw x, m5}, m0, m1, m2, m3
REPX {psraw x, 3 }, m0, m1, m2, m3
call .main_zero
add cq, 32
lea dstq, [dstq+strideq*8]
sub eobd, 64
jge .loop
RET
ALIGN function_align
.main_zero:
REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
.main:
punpckhwd m4, m0, m1
punpcklwd m0, m1
punpckhwd m1, m2, m3
punpcklwd m2, m3
punpckhwd m3, m0, m4
punpcklwd m0, m4
punpckhwd m4, m2, m1
punpcklwd m2, m1
punpckhqdq m1, m0, m2
punpcklqdq m0, m2
punpcklqdq m2, m3, m4
punpckhqdq m3, m4
mova xm4, [dstq+strideq*0]
vinserti128 m4, [dstq+strideq*4], 1
paddw m0, m4
mova xm4, [dstq+strideq*1]
vinserti128 m4, [dstq+r5 ], 1
paddw m1, m4
mova xm4, [dstq+strideq*2]
vinserti128 m4, [dstq+r6*2 ], 1
paddw m2, m4
mova xm4, [dstq+r6 ]
vinserti128 m4, [dstq+r4 ], 1
paddw m3, m4
REPX {pmaxsw x, m6}, m0, m1, m2, m3
REPX {pminsw x, m7}, m0, m1, m2, m3
mova [dstq+strideq*0], xm0
vextracti128 [dstq+strideq*4], m0, 1
mova [dstq+strideq*1], xm1
vextracti128 [dstq+r5 ], m1, 1
mova [dstq+strideq*2], xm2
vextracti128 [dstq+r6*2 ], m2, 1
mova [dstq+r6 ], xm3
vextracti128 [dstq+r4 ], m3, 1
ret
cglobal inv_txfm_add_dct_dct_32x8_16bpc, 4, 7, 0, dst, stride, c, eob
test eobd, eobd
jnz .full
imul r6d, [cq], 2896
mov [cq], eobd ; 0
mov r3d, 8
.dconly:
add r6d, 10240
sar r6d, 14
.dconly2:
imul r6d, 2896
add r6d, 34816
sar r6d, 16
movd xm0, r6d
vpbroadcastw m0, xm0
vpbroadcastd m4, [pixel_max]
pxor m3, m3
.dconly_loop:
paddw m1, m0, [dstq+32*0]
paddw m2, m0, [dstq+32*1]
pmaxsw m1, m3
pmaxsw m2, m3
pminsw m1, m4
pminsw m2, m4
mova [dstq+32*0], m1
mova [dstq+32*1], m2
add dstq, strideq
dec r3d
jg .dconly_loop
RET
.full:
PROLOGUE 0, 7, 16, 32*24, dst, stride, c, eob
mova m0, [cq+32* 1]
mova m1, [cq+32* 7]
mova m2, [cq+32* 9]
mova m3, [cq+32*15]
mova m4, [cq+32*17]
mova m5, [cq+32*23]
mova m6, [cq+32*25]
mova m7, [cq+32*31]
vpbroadcastd m11, [pd_2048]
vpbroadcastd m12, [clip_min]
vpbroadcastd m13, [clip_max]
vpbroadcastd m14, [pd_2896]
lea r6, [rsp+32*4]
call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part1
mova m0, [cq+32* 3]
mova m1, [cq+32* 5]
mova m2, [cq+32*11]
mova m3, [cq+32*13]
mova m4, [cq+32*19]
mova m5, [cq+32*21]
mova m6, [cq+32*27]
mova m7, [cq+32*29]
call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part2
mova m0, [cq+32* 2]
mova m1, [cq+32* 6]
mova m2, [cq+32*10]
mova m3, [cq+32*14]
mova m4, [cq+32*18]
mova m5, [cq+32*22]
mova m6, [cq+32*26]
mova m7, [cq+32*30]
call m(idct_8x16_internal_16bpc).main_oddhalf
mova m0, [cq+32* 0]
mova m1, [cq+32* 4]
mova m2, [cq+32* 8]
mova m3, [cq+32*12]
mova m4, [cq+32*16]
mova m5, [cq+32*20]
mova m6, [cq+32*24]
mova m7, [cq+32*28]
call m(idct_8x8_internal_16bpc).main
call m(idct_8x16_internal_16bpc).main_evenhalf
call m(inv_txfm_add_dct_dct_8x32_16bpc).main_end
lea rax, [deint_shuf+128]
vpbroadcastd m11, [pw_2048]
mov r4, dstq
call .pass2
mova m0, [r5+32*3] ; 16 17
mova m1, [r5+32*2] ; 30 31
mova m2, [r5+32*1] ; 18 19
mova m3, [r5+32*0] ; 28 29
mova m4, [r5-32*1] ; 20 21
mova m5, [r5-32*2] ; 26 27
mova m6, [r5-32*3] ; 22 23
mova m7, [r5-32*4] ; 24 25
call m(inv_txfm_add_dct_dct_8x32_16bpc).transpose
lea dstq, [r4+32]
call .pass2
RET
ALIGN function_align
.pass2:
call m(idct_16x8_internal_8bpc).main
REPX {pmulhrsw x, m11}, m0, m1, m2, m3
call m(idct_16x8_internal_16bpc).write_16x4_start
pmulhrsw m0, m11, m4
pmulhrsw m1, m11, m5
pmulhrsw m2, m11, m6
pmulhrsw m3, m11, m7
jmp m(idct_16x8_internal_16bpc).write_16x4_zero
cglobal inv_txfm_add_identity_identity_32x8_16bpc, 4, 7, 8, dst, stride, c, eob
vpbroadcastd m5, [pw_4096]
vpbroadcastd m7, [pixel_max]
pxor m6, m6
mov r6d, eobd
add eobb, 21
cmovc eobd, r6d
lea r6, [strideq*3]
lea r5, [strideq*5]
lea r4, [strideq+r6*2] ; strideq*7
.loop:
mova m0, [cq+32*0]
packssdw m0, [cq+32*1]
mova m1, [cq+32*2]
packssdw m1, [cq+32*3]
REPX {mova [cq+32*x], m6}, 0, 1, 2, 3
add cq, 32*8
mova m2, [cq-32*4]
packssdw m2, [cq-32*3]
mova m3, [cq-32*2]
packssdw m3, [cq-32*1]
REPX {pmulhrsw x, m5}, m0, m1, m2, m3
REPX {mova [cq+32*x], m6}, -4, -3, -2, -1
call m(inv_txfm_add_identity_identity_8x32_16bpc).main
add dstq, 16
sub eobd, 64
jge .loop
RET
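; IDCT32_PASS2_END: adds/subtracts a stored odd-half row against an idct16
; column output to form rows n and 31-n, scales both by pw_2048 (m15,
; pmulhrsw), and writes to dstq and the mirrored pointer r2 with clipping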
%macro IDCT32_PASS2_END 6 ; coefs[1-2], tmp[1-2], offset[1-2]
mova m%4, [%2]
paddsw m%3, m%1, m%4
psubsw m%1, m%4
%if %1 == 0
pxor m6, m6
%endif
pmulhrsw m%3, m15
pmulhrsw m%1, m15
paddw m%3, [dstq+%5]
paddw m%1, [r2+%6]
pmaxsw m%3, m6
pmaxsw m%1, m6
pminsw m%3, m7
pminsw m%1, m7
mova [dstq+%5], m%3
mova [r2+%6], m%1
%endmacro
cglobal inv_txfm_add_dct_dct_16x32_16bpc, 4, 7, 0, dst, stride, c, eob
test eobd, eobd
jz .dconly
PROLOGUE 0, 8, 16, 32*36, dst, stride, c, eob
%undef cmp
vpbroadcastd m11, [pd_2048]
vpbroadcastd m12, [clip_min]
vpbroadcastd m13, [clip_max]
vpbroadcastd m14, [pd_2896]
lea r6, [rsp+32*16]
lea r4, [r6+32*8]
lea r5, [r6+32*16]
call .main
sub eobd, 44
jge .eob44
vperm2i128 m2, m0, m3, 0x31 ; 5
vinserti128 m0, xm3, 1 ; 1
vperm2i128 m3, m1, m4, 0x31 ; 7
vinserti128 m1, xm4, 1 ; 3
pxor m4, m4
REPX {mova x, m4}, m5, m6, m7
REPX {mova [r6+32*x], m4}, 0, 1, 2, 3
jmp .fast
.dconly:
imul r6d, [cq], 2896
mov [cq], eobd ; 0
mov r3d, 32
add r6d, 2048
sar r6d, 12
imul r6d, 2896
jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly
.eob44:
mova [r4+16*0], xm0
mova [r4+16*1], xm3
mova [r4+16*2], xm1
mova [r4+16*3], xm4
vextracti128 [r4+16*4], m0, 1
vextracti128 [r4+16*5], m3, 1
vextracti128 [r4+16*6], m1, 1
vextracti128 [r4+16*7], m4, 1
call .main
sub eobd, 107
jge .eob151
vperm2i128 m7, m1, m4, 0x31 ; 15
vinserti128 m5, m1, xm4, 1 ; 11
vperm2i128 m6, m0, m3, 0x31 ; 13
vinserti128 m4, m0, xm3, 1 ; 9
mova m0, [r4+32*0]
mova m1, [r4+32*1]
mova m2, [r4+32*2]
mova m3, [r4+32*3]
.fast:
lea rax, [pw_5+128]
call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
pxor m8, m8
REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15
jmp .idct16
.eob151:
mova [r4-16*8], xm0
mova [r4-16*7], xm3
mova [r4-16*6], xm1
mova [r4-16*5], xm4
vextracti128 [r4-16*4], m0, 1
vextracti128 [r4-16*3], m3, 1
vextracti128 [r4-16*2], m1, 1
vextracti128 [r4-16*1], m4, 1
call .main
sub eobd, 128
jge .eob279
vperm2i128 m10, m0, m3, 0x31 ; 21
vinserti128 m8, m0, xm3, 1 ; 17
vperm2i128 m11, m1, m4, 0x31 ; 23
vinserti128 m9, m1, xm4, 1 ; 19
pxor m12, m12
REPX {mova x, m12}, m13, m14, m15
REPX {mova [r6+32*x], m12}, 0, 1, 2, 3
jmp .full
.eob279:
mova [r5+16*0], xm0
mova [r5+16*1], xm3
mova [r5+16*2], xm1
mova [r5+16*3], xm4
vextracti128 [r5+16*4], m0, 1
vextracti128 [r5+16*5], m3, 1
vextracti128 [r5+16*6], m1, 1
vextracti128 [r5+16*7], m4, 1
call .main
vperm2i128 m14, m0, m3, 0x31 ; 29
vinserti128 m12, m0, xm3, 1 ; 25
vperm2i128 m15, m1, m4, 0x31 ; 31
vinserti128 m13, m1, xm4, 1 ; 27
mova m8, [r5+32*0]
mova m9, [r5+32*1]
mova m10, [r5+32*2]
mova m11, [r5+32*3]
.full:
mova m0, [r4+32*0]
mova m1, [r4+32*1]
mova m2, [r4+32*2]
mova m3, [r4+32*3]
mova m4, [r4-32*4]
mova m5, [r4-32*3]
mova m6, [r4-32*2]
mova m7, [r4-32*1]
lea rax, [pw_5+128]
call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
lea r3, [rsp+32*8]
mova m8, [r3+32*0]
mova m9, [r3+32*1]
mova m10, [r3+32*2]
mova m11, [r3+32*3]
mova m12, [r3-32*4]
mova m13, [r3-32*3]
mova m14, [r3-32*2]
mova m15, [r3-32*1]
.idct16:
lea r3, [rsp+32*16]
mova m0, [r3+32*0]
mova m1, [r3+32*1]
mova m2, [r3+32*2]
mova m3, [r3+32*3]
mova m4, [r3-32*4]
mova m5, [r3-32*3]
mova m6, [r3-32*2]
mova m7, [r3-32*1]
mova [rsp], m15
call m(idct_16x16_internal_8bpc).main
imul r2, strideq, 19
lea r3, [strideq*3]
add r2, dstq
call .pass2_end
RET
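; .main: one 16x8 idct16 slice with rect2 (2896) prescaling; outputs are
; rounded to words, partially transposed, spilled in 16-byte halves to
; [r6], and the consumed coefficient rows are cleared in .main_zero_loop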
ALIGN function_align
.main:
pmulld m0, m14, [cq+128* 1]
pmulld m1, m14, [cq+128* 3]
pmulld m2, m14, [cq+128* 5]
pmulld m3, m14, [cq+128* 7]
pmulld m4, m14, [cq+128* 9]
pmulld m5, m14, [cq+128*11]
pmulld m6, m14, [cq+128*13]
pmulld m7, m14, [cq+128*15]
call m(idct_8x16_internal_16bpc).main_oddhalf_rect2
pmulld m0, m14, [cq+128* 0]
pmulld m1, m14, [cq+128* 2]
pmulld m2, m14, [cq+128* 4]
pmulld m3, m14, [cq+128* 6]
pmulld m4, m14, [cq+128* 8]
pmulld m5, m14, [cq+128*10]
pmulld m6, m14, [cq+128*12]
pmulld m7, m14, [cq+128*14]
call m(idct_8x8_internal_16bpc).main_rect2
call m(idct_8x16_internal_16bpc).main_evenhalf
psrld m15, m11, 11 ; pd_1
mova m8, [r6-32*4]
mova m9, [r6-32*3]
REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7
psubd m10, m0, m8 ; out15
paddd m0, m8 ; out0
mova m8, [r6-32*2]
paddd m15, m1, m9 ; out1
psubd m1, m9 ; out14
mova m9, [r6-32*1]
REPX {psrad x, 1}, m0, m15, m10, m1
packssdw m0, m15
packssdw m1, m10
psubd m10, m2, m8 ; out13
paddd m2, m8 ; out2
mova m8, [r6+32*0]
paddd m15, m3, m9 ; out3
psubd m3, m9 ; out12
mova m9, [r6+32*1]
REPX {psrad x, 1}, m2, m15, m10, m3
packssdw m2, m15
packssdw m3, m10
psubd m10, m4, m8 ; out11
paddd m4, m8 ; out4
mova m8, [r6+32*2]
paddd m15, m5, m9 ; out5
psubd m5, m9 ; out10
mova m9, [r6+32*3]
REPX {psrad x, 1}, m4, m10, m15, m5
packssdw m4, m15
packssdw m5, m10
psubd m10, m6, m8 ; out9
paddd m6, m8 ; out6
paddd m15, m7, m9 ; out7
psubd m7, m9 ; out8
REPX {psrad x, 1}, m6, m10, m15, m7
packssdw m6, m15
packssdw m7, m10
punpckhwd m8, m0, m2
punpcklwd m0, m2
punpckhwd m2, m3, m1
punpcklwd m3, m1
punpckhwd m1, m4, m6
punpcklwd m4, m6
punpcklwd m6, m7, m5
punpckhwd m7, m5
pxor m5, m5
mov r7d, 128*13
.main_zero_loop:
mova [cq+r7-128*1], m5
mova [cq+r7+128*0], m5
mova [cq+r7+128*1], m5
mova [cq+r7+128*2], m5
sub r7d, 128*4
jg .main_zero_loop
add cq, 32
punpcklwd m5, m3, m2
punpckhwd m3, m2
punpcklwd m2, m4, m1
punpckhwd m4, m1
punpckhwd m1, m0, m8
punpcklwd m0, m8
punpckhwd m8, m6, m7
punpcklwd m6, m7
punpcklqdq m7, m1, m4
punpckhqdq m1, m4
punpckhqdq m4, m8, m3
punpcklqdq m8, m3
punpckhqdq m3, m6, m5
punpcklqdq m6, m5
punpcklqdq m5, m0, m2
punpckhqdq m0, m2
mova [r6+16*0], xm5
mova [r6+16*1], xm6
mova [r6+16*2], xm7
mova [r6+16*3], xm8
vextracti128 [r6+16*4], m5, 1
vextracti128 [r6+16*5], m6, 1
vextracti128 [r6+16*6], m7, 1
vextracti128 [r6+16*7], m8, 1
sub r6, 32*4
ret
ALIGN function_align
.pass2_end:
mova [rsp+gprsize+32*0], m6
mova [rsp+gprsize+32*2], m7
mova [rsp+gprsize+32*3], m15
vpbroadcastd m15, [pw_2048]
vpbroadcastd m7, [pixel_max]
IDCT32_PASS2_END 0, r5+32*3, 1, 6, strideq*0, r3*4
IDCT32_PASS2_END 4, r5-32*1, 0, 1, strideq*4, strideq*8
IDCT32_PASS2_END 8, r4+32*3, 0, 4, strideq*8, strideq*4
IDCT32_PASS2_END 12, r4-32*1, 0, 4, r3*4, strideq*0
add dstq, strideq
sub r2, strideq
mova m1, [rsp+gprsize+32*1]
IDCT32_PASS2_END 1, r5+32*2, 0, 4, strideq*0, r3*4
IDCT32_PASS2_END 5, r5-32*2, 0, 4, strideq*4, strideq*8
IDCT32_PASS2_END 9, r4+32*2, 0, 4, strideq*8, strideq*4
IDCT32_PASS2_END 13, r4-32*2, 0, 4, r3*4, strideq*0
add dstq, strideq
sub r2, strideq
mova m1, [rsp+gprsize+32*0]
IDCT32_PASS2_END 2, r5+32*1, 0, 4, strideq*0, r3*4
IDCT32_PASS2_END 1, r5-32*3, 0, 4, strideq*4, strideq*8
IDCT32_PASS2_END 10, r4+32*1, 0, 4, strideq*8, strideq*4
IDCT32_PASS2_END 14, r4-32*3, 0, 4, r3*4, strideq*0
add dstq, strideq
sub r2, strideq
mova m1, [rsp+gprsize+32*2]
mova m2, [rsp+gprsize+32*3]
IDCT32_PASS2_END 3, r5+32*0, 0, 4, strideq*0, r3*4
IDCT32_PASS2_END 1, r5-32*4, 0, 4, strideq*4, strideq*8
IDCT32_PASS2_END 11, r4+32*0, 0, 4, strideq*8, strideq*4
IDCT32_PASS2_END 2, r4-32*4, 0, 4, r3*4, strideq*0
ret
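; identity 16x32: pmulhrsw with pw_2896x8 applies the 1/sqrt(2)
; rectangular scale, IDTX16 with pw_16384 gives the ~sqrt(2) identity16
; scale, and a final pmulhrsw with pw_8192 applies the remaining >>2;
; the block is processed in 8x8 chunks gated by the eob thresholds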
cglobal inv_txfm_add_identity_identity_16x32_16bpc, 4, 7, 12, dst, stride, c, eob
vpbroadcastd m8, [pw_2896x8]
vpbroadcastd m9, [pw_1697x16]
vpbroadcastd m11, [pw_8192]
vpbroadcastd m7, [pixel_max]
lea r6, [strideq*5]
pxor m6, m6
paddw m10, m11, m11 ; pw_16384
mov r5, dstq
call .main
sub eobd, 36
jl .ret
add cq, 128*8
lea dstq, [r5+16]
call .main
sub cq, 128*8-32
lea dstq, [r5+strideq*8]
mov r5, dstq
call .main
sub eobd, 107 ; eob < 143
jl .ret
add cq, 128*8
lea dstq, [r5+16]
call .main
sub cq, 128*8-32
lea dstq, [r5+strideq*8]
mov r5, dstq
call .main
sub eobd, 128 ; eob < 271
jl .ret
add cq, 128*8
lea dstq, [r5+16]
call .main
sub cq, 128*8-32
lea dstq, [r5+strideq*8]
mov r5, dstq
call .main
sub eobd, 128 ; eob < 399
jl .ret
add cq, 128*8
lea dstq, [r5+16]
call .main
.ret:
RET
ALIGN function_align
.main:
mova m0, [cq+128*0]
packssdw m0, [cq+128*1]
mova m1, [cq+128*2]
packssdw m1, [cq+128*3]
mova m2, [cq+128*4]
packssdw m2, [cq+128*5]
mova m3, [cq+128*6]
packssdw m3, [cq+128*7]
REPX {pmulhrsw x, m8 }, m0, m1, m2, m3
REPX {IDTX16 x, 4, 9, 10}, 0, 1, 2, 3
REPX {pmulhrsw x, m11}, m0, m1, m2, m3
REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
.main2:
punpckhwd m4, m0, m1
punpcklwd m0, m1
punpckhwd m1, m2, m3
punpcklwd m2, m3
punpckhwd m3, m0, m4
punpcklwd m0, m4
punpcklwd m4, m2, m1
punpckhwd m2, m1
punpckhqdq m1, m0, m4
punpcklqdq m0, m4
call m(iidentity_8x8_internal_16bpc).write_2x8x2
punpcklqdq m0, m3, m2
punpckhqdq m1, m3, m2
jmp m(iidentity_8x8_internal_16bpc).write_2x8x2
cglobal inv_txfm_add_dct_dct_32x16_16bpc, 4, 7, 0, dst, stride, c, eob
test eobd, eobd
jz .dconly
PROLOGUE 0, 8, 16, 32*40, dst, stride, c, eob
%undef cmp
vpbroadcastd m12, [clip_min]
vpbroadcastd m13, [clip_max]
lea r6, [rsp+32*4]
call .main
cmp eobd, 36
jge .full
call m(inv_txfm_add_dct_dct_8x32_16bpc).transpose
pxor m8, m8
REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, [rsp]
lea rax, [pw_5+128]
mov r7, dstq
call m(idct_16x16_internal_8bpc).main
call .write_16x16
mova m0, [r5+32*3]
mova m1, [r5+32*2]
mova m2, [r5+32*1]
mova m3, [r5+32*0]
mova m4, [r5-32*1]
mova m5, [r5-32*2]
mova m6, [r5-32*3]
mova m7, [r5-32*4]
call m(inv_txfm_add_dct_dct_8x32_16bpc).transpose
pxor m8, m8
REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, [rsp]
jmp .end
.dconly:
imul r6d, [cq], 2896
mov [cq], eobd ; 0
mov r3d, 16
add r6d, 2048
sar r6d, 12
imul r6d, 2896
add r6d, 6144
sar r6d, 13
jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2
.full:
add cq, 32
mova [r4+32*3], m0
mova [r4+32*2], m1
mova [r4+32*1], m2
mova [r4+32*0], m3
mova [r4-32*1], m4
mova [r4-32*2], m5
mova [r4-32*3], m6
mova [r4-32*4], m7
call .main
sub r4, 32*16 ; topleft 16x8
call .transpose_16x16
lea rax, [pw_5+128]
mov r7, dstq
call m(idct_16x16_internal_8bpc).main
call .write_16x16
mova m0, [r5+32*3]
mova m1, [r5+32*2]
mova m2, [r5+32*1]
mova m3, [r5+32*0]
mova m4, [r5-32*1]
mova m5, [r5-32*2]
mova m6, [r5-32*3]
mova m7, [r5-32*4]
add r4, 32*8 ; bottomleft 16x8
call .transpose_16x16
.end:
lea dstq, [r7+32]
call m(idct_16x16_internal_8bpc).main
call .write_16x16
RET
ALIGN function_align
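; transpose a 16x16 word block: dword/qword unpacks plus
; vinserti128/vperm2i128 lane swaps move the first half into m8-m15, the
; second half is reloaded from the r4 scratch and finished in the shared
; 8x32 transpose.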
.transpose_16x16:
punpckhdq m8, m3, m1
punpckldq m3, m1
punpckhdq m1, m0, m2
punpckldq m0, m2
punpckhdq m2, m7, m5
punpckldq m7, m5
punpckhdq m5, m4, m6
punpckldq m4, m6
punpckhqdq m6, m0, m4
punpcklqdq m0, m4
punpckhqdq m4, m1, m5
punpcklqdq m1, m5
punpckhqdq m5, m7, m3
punpcklqdq m7, m3
punpckhqdq m3, m2, m8
punpcklqdq m2, m8
vinserti128 m8, m0, xm7, 1
vperm2i128 m12, m0, m7, 0x31
vinserti128 m9, m6, xm5, 1
vperm2i128 m13, m6, m5, 0x31
vinserti128 m10, m1, xm2, 1
vperm2i128 m14, m1, m2, 0x31
vinserti128 m11, m4, xm3, 1
vperm2i128 m15, m4, m3, 0x31
mova m0, [r4+32*3]
mova m1, [r4+32*2]
mova m2, [r4+32*1]
mova m3, [r4+32*0]
mova m4, [r4-32*1]
mova m5, [r4-32*2]
mova m6, [r4-32*3]
mova m7, [r4-32*4]
mova [rsp+gprsize], m15
jmp m(inv_txfm_add_dct_dct_8x32_16bpc).transpose
ALIGN function_align
.main:
vpbroadcastd m14, [pd_2896]
vpbroadcastd m11, [pd_2048]
pmulld m0, m14, [cq+64* 1]
pmulld m1, m14, [cq+64* 7]
pmulld m2, m14, [cq+64* 9]
pmulld m3, m14, [cq+64*15]
pmulld m4, m14, [cq+64*17]
pmulld m5, m14, [cq+64*23]
pmulld m6, m14, [cq+64*25]
pmulld m7, m14, [cq+64*31]
call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part1_rect2
pmulld m0, m14, [cq+64* 3]
pmulld m1, m14, [cq+64* 5]
pmulld m2, m14, [cq+64*11]
pmulld m3, m14, [cq+64*13]
pmulld m4, m14, [cq+64*19]
pmulld m5, m14, [cq+64*21]
pmulld m6, m14, [cq+64*27]
pmulld m7, m14, [cq+64*29]
call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part2_rect2
pmulld m0, m14, [cq+64* 2]
pmulld m1, m14, [cq+64* 6]
pmulld m2, m14, [cq+64*10]
pmulld m3, m14, [cq+64*14]
pmulld m4, m14, [cq+64*18]
pmulld m5, m14, [cq+64*22]
pmulld m6, m14, [cq+64*26]
pmulld m7, m14, [cq+64*30]
call m(idct_8x16_internal_16bpc).main_oddhalf_rect2
pmulld m0, m14, [cq+64* 0]
pmulld m1, m14, [cq+64* 4]
pmulld m2, m14, [cq+64* 8]
pmulld m3, m14, [cq+64*12]
pmulld m4, m14, [cq+64*16]
pmulld m5, m14, [cq+64*20]
pmulld m6, m14, [cq+64*24]
pmulld m7, m14, [cq+64*28]
call m(idct_8x8_internal_16bpc).main_rect2
call m(idct_8x16_internal_16bpc).main_evenhalf
pxor m8, m8
mov r7d, 64*30
.main_zero_loop:
mova [cq+r7-64*2], m8
mova [cq+r7-64*1], m8
mova [cq+r7+64*0], m8
mova [cq+r7+64*1], m8
sub r7d, 64*4
jg .main_zero_loop
.main_end:
psrld m11, 11 ; pd_1
IDCT32_END 0, 15, 8, 9, 10, 1
IDCT32_END 1, 14, 8, 9, 10, 1
punpckhwd m8, m0, m1 ; 16 17
punpcklwd m0, m1 ; 0 1
punpcklwd m1, m14, m15 ; 14 15
punpckhwd m14, m15 ; 30 31
mova [r5+32*3], m8
mova [r5+32*2], m14
IDCT32_END 2, 15, 8, 9, 10, 1
IDCT32_END 3, 14, 8, 9, 10, 1
punpckhwd m8, m2, m3 ; 18 19
punpcklwd m2, m3 ; 2 3
punpcklwd m3, m14, m15 ; 12 13
punpckhwd m14, m15 ; 28 29
mova [r5+32*1], m8
mova [r5+32*0], m14
IDCT32_END 4, 15, 8, 9, 10, 1
IDCT32_END 5, 14, 8, 9, 10, 1
punpckhwd m8, m4, m5 ; 20 21
punpcklwd m4, m5 ; 4 5
punpcklwd m5, m14, m15 ; 10 11
punpckhwd m14, m15 ; 26 27
mova [r5-32*1], m8
mova [r5-32*2], m14
IDCT32_END 6, 15, 8, 9, 10, 1
IDCT32_END 7, 14, 8, 9, 10, 1
punpckhwd m8, m6, m7 ; 22 23
punpcklwd m6, m7 ; 6 7
punpcklwd m7, m14, m15 ; 8 9
punpckhwd m14, m15 ; 24 25
mova [r5-32*3], m8
mova [r5-32*4], m14
ret
ALIGN function_align
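; round 16 rows by pw_2048 (a >>4 via pmulhrsw) and accumulate into dst
; with [0..pixel_max] clipping, 4 rows per write_16x4 call; m8/m9/m12 are
; spilled first since they are repurposed as the zero/max/rounding
; constants.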
.write_16x16:
mova m1, [rsp+gprsize+32*1]
mova [rsp+gprsize+32*0], m8
mova [rsp+gprsize+32*1], m9
mova [rsp+gprsize+32*2], m12
vpbroadcastd m12, [pw_2048]
vpbroadcastd m9, [pixel_max]
lea r3, [strideq*3]
pxor m8, m8
pmulhrsw m0, m12
pmulhrsw m1, m12
pmulhrsw m2, m12
pmulhrsw m3, m12
call m(idct_16x8_internal_16bpc).write_16x4
pmulhrsw m0, m12, m4
pmulhrsw m1, m12, m5
pmulhrsw m2, m12, m6
pmulhrsw m3, m12, m7
call m(idct_16x8_internal_16bpc).write_16x4
pmulhrsw m0, m12, [rsp+gprsize+32*0]
pmulhrsw m1, m12, [rsp+gprsize+32*1]
pmulhrsw m2, m12, m10
pmulhrsw m3, m12, m11
call m(idct_16x8_internal_16bpc).write_16x4
pmulhrsw m0, m12, [rsp+gprsize+32*2]
pmulhrsw m1, m12, m13
pmulhrsw m2, m12, m14
pmulhrsw m3, m12, m15
jmp m(idct_16x8_internal_16bpc).write_16x4
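; identity-identity 32x16: same 8x8 block walk as the 16x32 variant, but
; the scaling differs: an extra doubling (paddsw x, x) and pw_2048
; rounding instead of pw_8192/pw_16384.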
cglobal inv_txfm_add_identity_identity_32x16_16bpc, 4, 7, 11, dst, stride, c, eob
vpbroadcastd m8, [pw_2896x8]
vpbroadcastd m9, [pw_1697x16]
vpbroadcastd m10, [pw_2048]
vpbroadcastd m7, [pixel_max]
lea r6, [strideq*5]
pxor m6, m6
mov r5, dstq
call .main
sub eobd, 36
jl .ret
add cq, 32
lea dstq, [dstq+strideq*4]
call .main
add cq, 64*8-32
lea dstq, [r5+16*1]
call .main
sub eobd, 107 ; eob < 143
jl .ret
add cq, 32
lea dstq, [dstq+strideq*4]
call .main
add cq, 64*8-32
lea dstq, [r5+16*2]
call .main
sub eobd, 128 ; eob < 271
jl .ret
add cq, 32
lea dstq, [dstq+strideq*4]
call .main
add cq, 64*8-32
lea dstq, [r5+16*3]
call .main
sub eobd, 128 ; eob < 399
jl .ret
add cq, 32
lea dstq, [dstq+strideq*4]
call .main
.ret:
RET
ALIGN function_align
.main:
mova m0, [cq+64*0]
packssdw m0, [cq+64*1]
mova m1, [cq+64*2]
packssdw m1, [cq+64*3]
mova m2, [cq+64*4]
packssdw m2, [cq+64*5]
mova m3, [cq+64*6]
packssdw m3, [cq+64*7]
REPX {pmulhrsw x, m8 }, m0, m1, m2, m3
REPX {paddsw x, x }, m0, m1, m2, m3
REPX {IDTX16 x, 4, 9 }, 0, 1, 2, 3
REPX {pmulhrsw x, m10}, m0, m1, m2, m3
REPX {mova [cq+64*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
jmp m(inv_txfm_add_identity_identity_16x32_16bpc).main2
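; dct-dct 32x32: each .main call handles one 8x32 coefficient slice (eob
; thresholds 36/136/300 decide how many) and transposes both 16-row
; halves into the r4/r5 scratch; pass 2 then runs two 16-wide column
; passes through .pass2_oddhalf/.pass2_evenhalf and the shared 16x32
; pass2_end.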
cglobal inv_txfm_add_dct_dct_32x32_16bpc, 4, 7, 0, dst, stride, c, eob
test eobd, eobd
jz .dconly
PROLOGUE 0, 8, 16, 32*83, dst, stride, c, eob
%undef cmp
vpbroadcastd m12, [clip_min]
vpbroadcastd m13, [clip_max]
lea r6, [rsp+32*7]
call .main
cmp eobd, 36
jl .fast
call .main
cmp eobd, 136
jl .fast
call .main
cmp eobd, 300
jl .fast
call .main
jmp .pass2
.dconly:
imul r6d, [cq], 2896
mov [cq], eobd ; 0
mov r3d, 32
jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly
.fast:
lea r4, [rsp+32*71]
pxor m0, m0
.fast_loop:
REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
add r6, 32*8
cmp r6, r4
jl .fast_loop
.pass2:
lea r3, [rsp+32*3]
mov r4, r6
lea r5, [r6+32*8]
lea rax, [pw_5+128]
call .pass2_oddhalf
call .pass2_evenhalf
imul r2, strideq, 19
lea r3, [strideq*3]
add r2, dstq
call m(inv_txfm_add_dct_dct_16x32_16bpc).pass2_end
sub dstq, r3
lea r2, [r2+r3+32]
add dstq, 32
lea r3, [rsp+32*11]
call .pass2_oddhalf
call .pass2_evenhalf
lea r3, [strideq*3]
call m(inv_txfm_add_dct_dct_16x32_16bpc).pass2_end
RET
ALIGN function_align
.main:
mova m0, [cq+128* 1]
mova m1, [cq+128* 7]
mova m2, [cq+128* 9]
mova m3, [cq+128*15]
mova m4, [cq+128*17]
mova m5, [cq+128*23]
mova m6, [cq+128*25]
mova m7, [cq+128*31]
vpbroadcastd m11, [pd_2048]
vpbroadcastd m14, [pd_2896]
call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part1
mova m0, [cq+128* 3]
mova m1, [cq+128* 5]
mova m2, [cq+128*11]
mova m3, [cq+128*13]
mova m4, [cq+128*19]
mova m5, [cq+128*21]
mova m6, [cq+128*27]
mova m7, [cq+128*29]
call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part2
mova m0, [cq+128* 2]
mova m1, [cq+128* 6]
mova m2, [cq+128*10]
mova m3, [cq+128*14]
mova m4, [cq+128*18]
mova m5, [cq+128*22]
mova m6, [cq+128*26]
mova m7, [cq+128*30]
call m(idct_8x16_internal_16bpc).main_oddhalf
mova m0, [cq+128* 0]
mova m1, [cq+128* 4]
mova m2, [cq+128* 8]
mova m3, [cq+128*12]
mova m4, [cq+128*16]
mova m5, [cq+128*20]
mova m6, [cq+128*24]
mova m7, [cq+128*28]
call m(idct_8x8_internal_16bpc).main
call m(idct_8x16_internal_16bpc).main_evenhalf
call m(inv_txfm_add_dct_dct_8x32_16bpc).main_end
pxor m15, m15
mov r7d, 128*29
.main_zero_loop:
mova [cq+r7-128*1], m15
mova [cq+r7+128*0], m15
mova [cq+r7+128*1], m15
mova [cq+r7+128*2], m15
sub r7d, 128*4
jg .main_zero_loop
add cq, 32
mova [r4-32*4], m0
mova [r4-32*3], m1
mova [r4-32*2], m2
mova [r4-32*1], m3
mova [r4+32*0], m4
mova [r4+32*1], m5
mova [r4+32*2], m6
mova [r4+32*3], m7
mova m0, [r5+32*3]
mova m1, [r5+32*2]
mova m2, [r5+32*1]
mova m3, [r5+32*0]
mova m4, [r5-32*1]
mova m5, [r5-32*2]
mova m6, [r5-32*3]
mova m7, [r5-32*4]
call m(inv_txfm_add_dct_dct_8x32_16bpc).transpose
mova [r5-32*4], m0
mova [r5-32*3], m1
mova [r5-32*2], m2
mova [r5-32*1], m3
mova [r5+32*0], m4
mova [r5+32*1], m5
mova [r5+32*2], m6
mova [r5+32*3], m7
ret
ALIGN function_align
.pass2_oddhalf:
mova m0, [r3+32* 1] ; 1
mova m1, [r3+32* 3] ; 3
mova m2, [r3+32* 5] ; 5
mova m3, [r3+32* 7] ; 7
mova m4, [r3+32*17] ; 9
mova m5, [r3+32*19] ; 11
mova m6, [r3+32*21] ; 13
mova m7, [r3+32*23] ; 15
mova m8, [r3+32*33] ; 17
mova m9, [r3+32*35] ; 19
mova m10, [r3+32*37] ; 21
mova m11, [r3+32*39] ; 23
mova m12, [r3+32*49] ; 25
mova m13, [r3+32*51] ; 27
mova m14, [r3+32*53] ; 29
mova m15, [r3+32*55] ; 31
jmp m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
ALIGN function_align
.pass2_evenhalf:
mova m0, [r3+32* 0] ; 0
mova m1, [r3+32* 2] ; 2
mova m2, [r3+32* 4] ; 4
mova m3, [r3+32* 6] ; 6
mova m4, [r3+32*16] ; 8
mova m5, [r3+32*18] ; 10
mova m6, [r3+32*20] ; 12
mova m7, [r3+32*22] ; 14
mova m8, [r3+32*32] ; 16
mova m9, [r3+32*34] ; 18
mova m10, [r3+32*36] ; 20
mova m11, [r3+32*38] ; 22
mova m12, [r3+32*48] ; 24
mova m13, [r3+32*50] ; 26
mova m14, [r3+32*52] ; 28
mova m15, [r3+32*54] ; 30
mova [rsp+gprsize], m15
jmp m(idct_16x16_internal_8bpc).main
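; identity-identity 32x32: only a pw_8192 rounding multiply is needed
; before the shared 8x32 zero/transpose/write tail. blocks are visited in
; anti-diagonal order, with .main2 stepping one 8x8 block down-left
; (cq -= 128*8-32, dst += strideq*8-16); the trailing digit comments
; appear to sketch which diagonals are live at each eob threshold
; (36/136/300/535/755/911).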
cglobal inv_txfm_add_identity_identity_32x32_16bpc, 4, 8, 8, dst, stride, c, eob
%undef cmp
vpbroadcastd m5, [pw_8192]
vpbroadcastd m7, [pixel_max]
pxor m6, m6
lea r6, [strideq*3]
lea r5, [strideq*5]
lea r4, [strideq+r6*2] ; strideq*7
call .main ; 0
cmp eobd, 36
jl .ret
add cq, 128*8 ; 0 1
mov r7, dstq ; 1
add dstq, 16
call .main
call .main2
cmp eobd, 136
jl .ret
add cq, 128*16-32 ; 0 1 2
lea dstq, [r7+16*2] ; 1 2
call .main ; 2
call .main2
call .main2
cmp eobd, 300
jl .ret
add cq, 128*24-64 ; 0 1 2 3
add r7, 16*3 ; 1 2 3
mov dstq, r7 ; 2 3
call .main ; 3
call .main2
call .main2
call .main2
cmp eobd, 535
jl .ret
add cq, 128*24-64 ; 0 1 2 3
lea dstq, [r7+strideq*8] ; 1 2 3 4
mov r7, dstq ; 2 3 4
call .main ; 3 4
call .main2
call .main2
cmp eobd, 755
jl .ret
add cq, 128*16-32 ; 0 1 2 3
lea dstq, [r7+strideq*8] ; 1 2 3 4
call .main ; 2 3 4 5
call .main2 ; 3 4 5
cmp eobd, 911
jl .ret
add cq, 128*8 ; 0 1 2 3
add dstq, 16 ; 1 2 3 4
call .main ; 2 3 4 5
.ret: ; 3 4 5 6
RET
ALIGN function_align
.main2:
sub cq, 128*8-32
lea dstq, [dstq+strideq*8-16]
.main:
mova m0, [cq+128*0]
packssdw m0, [cq+128*1]
mova m1, [cq+128*2]
packssdw m1, [cq+128*3]
mova m2, [cq+128*4]
packssdw m2, [cq+128*5]
mova m3, [cq+128*6]
packssdw m3, [cq+128*7]
REPX {pmulhrsw x, m5}, m0, m1, m2, m3
jmp m(inv_txfm_add_identity_identity_8x32_16bpc).main_zero
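; IDCT64_PART2_END folds one idct16 output and one idct32 output (fetched
; from opposite ends of the r4/r5 scratch depending on %1's parity) with
; the idct64 half in m%2/m%3, yielding rows 0+n, 31-n, 32+n and 63-n;
; these are rounded by m14, added to dst/r2 and clipped to
; [0..pixel_max].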
%macro IDCT64_PART2_END 6-10 ; out, src[1-2], tmp[1-3], (offset[1-4])
%if %1 & 1
mova m%5, [r5-32*(51-%1)] ; idct16 out 0+n
mova m%4, [r4-32*(14+%1)] ; idct32 out31-n
%else
mova m%5, [r4-32*(45-%1)]
mova m%4, [r5-32*(20+%1)]
%endif
paddsw m%6, m%5, m%4 ; idct32 out 0+n
psubsw m%5, m%4 ; idct32 out31-n
paddsw m%4, m%5, m%3 ; out31-n
psubsw m%5, m%3 ; out32+n
paddsw m%3, m%6, m%2 ; out 0+n
psubsw m%6, m%2 ; out63-n
REPX {pmulhrsw x, m14}, m%5, m%6, m%4, m%3
%if %1 & 1
%define %%d0 r2
%define %%d1 dstq
%else
%define %%d0 dstq
%define %%d1 r2
%endif
paddw m%3, [%%d0+%7 ]
paddw m%4, [%%d1+%8 ]
paddw m%5, [%%d0+%9 ]
paddw m%6, [%%d1+%10]
pxor m%2, m%2
REPX {pmaxsw x, m%2}, m%3, m%4, m%5, m%6
vpbroadcastd m%2, [pixel_max]
REPX {pminsw x, m%2}, m%3, m%4, m%5, m%6
mova [%%d0+%7 ], m%3
mova [%%d1+%8 ], m%4
mova [%%d0+%9 ], m%5
mova [%%d1+%10], m%6
%endmacro
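; dct-dct 16x64: pass 1 is a 16-point column idct over up to four 8-wide
; slices (eob thresholds 44/151/279). pass 2 assembles the 64-point idct
; from the shared 8bpc helpers: idct16 for in0..in28, the 16x32
; main_oddhalf_fast for in2..in30, and two main_part1 calls plus
; main_part2 for the odd inputs. dc-only folds the dct gain with the
; pass-1 >>2 rounding (10240 = 2048 + 8192).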
cglobal inv_txfm_add_dct_dct_16x64_16bpc, 4, 7, 0, dst, stride, c, eob
test eobd, eobd
jz .dconly
PROLOGUE 0, 10, 16, 32*98, dst, stride, c, eob
%undef cmp
vpbroadcastd m11, [pd_2048]
vpbroadcastd m12, [clip_min]
vpbroadcastd m13, [clip_max]
vpbroadcastd m14, [pd_2896]
lea r6, [rsp+32*6]
call .main
sub eobd, 44
jl .fast
call .main
sub eobd, 107
jl .fast
call .main
sub eobd, 128
jl .fast
call .main
jmp .pass2
.dconly:
imul r6d, [cq], 2896
mov [cq], eobd ; 0
mov r3d, 64
add r6d, 10240
sar r6d, 14
jmp m(inv_txfm_add_dct_dct_16x4_16bpc).dconly2
.fast:
lea r4, [rsp+32*38]
pxor m0, m0
.fast_loop:
REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
add r6, 32*8
cmp r6, r4
jl .fast_loop
.pass2:
lea rax, [pw_5+128]
mova m0, [rsp+32* 2] ; in0
mova m1, [rsp+32* 6] ; in4
mova m2, [rsp+32*10] ; in8
mova m3, [rsp+32*14] ; in12
mova m4, [rsp+32*18] ; in16
mova m5, [rsp+32*22] ; in20
mova m6, [rsp+32*26] ; in24
mova m7, [rsp+32*30] ; in28
pxor m8, m8
REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
mova [rsp], m8
call m(idct_16x16_internal_8bpc).main
mova m1, [rsp+32*1]
lea r4, [rsp+32*38]
mova [r4-32*4], m0
mova [r4-32*3], m1
mova [r4-32*2], m2
mova [r4-32*1], m3
mova [r4+32*0], m4
mova [r4+32*1], m5
mova [r4+32*2], m6
mova [r4+32*3], m7
add r4, 32*8
mova [r4-32*4], m8
mova [r4-32*3], m9
mova [r4-32*2], m10
mova [r4-32*1], m11
mova [r4+32*0], m12
mova [r4+32*1], m13
mova [r4+32*2], m14
mova [r4+32*3], m15
mova m0, [rsp+32* 4] ; in2
mova m1, [rsp+32* 8] ; in6
mova m2, [rsp+32*12] ; in10
mova m3, [rsp+32*16] ; in14
mova m4, [rsp+32*20] ; in18
mova m5, [rsp+32*24] ; in22
mova m6, [rsp+32*28] ; in26
mova m7, [rsp+32*32] ; in30
lea r5, [r4+32*16]
add r4, 32*8
call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
mova m0, [rsp+32* 3] ; in1
mova m1, [rsp+32*33] ; in31
mova m2, [rsp+32*19] ; in17
mova m3, [rsp+32*17] ; in15
mova m4, [rsp+32*11] ; in9
mova m5, [rsp+32*25] ; in23
mova m6, [rsp+32*27] ; in25
mova m7, [rsp+32* 9] ; in7
lea rax, [idct64_mul - 8]
add r4, 32*16
add r5, 32*32
call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
mova m0, [rsp+32* 7] ; in5
mova m1, [rsp+32*29] ; in27
mova m2, [rsp+32*23] ; in21
mova m3, [rsp+32*13] ; in11
mova m4, [rsp+32*15] ; in13
mova m5, [rsp+32*21] ; in19
mova m6, [rsp+32*31] ; in29
mova m7, [rsp+32* 5] ; in3
add rax, 8
add r4, 32*8
sub r5, 32*8
call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
lea r8, [strideq*4]
lea r9, [strideq*5]
lea r3, [r9+strideq*1] ; stride*6
lea r7, [r9+strideq*2] ; stride*7
call .main_part2_pass2
RET
ALIGN function_align
.main:
mova m0, [cq+128* 1]
mova m1, [cq+128* 3]
mova m2, [cq+128* 5]
mova m3, [cq+128* 7]
mova m4, [cq+128* 9]
mova m5, [cq+128*11]
mova m6, [cq+128*13]
mova m7, [cq+128*15]
call m(idct_8x16_internal_16bpc).main_oddhalf
mova m0, [cq+128* 0]
mova m1, [cq+128* 2]
mova m2, [cq+128* 4]
mova m3, [cq+128* 6]
mova m4, [cq+128* 8]
mova m5, [cq+128*10]
mova m6, [cq+128*12]
mova m7, [cq+128*14]
call m(idct_8x8_internal_16bpc).main
call m(idct_8x16_internal_16bpc).main_evenhalf
pxor m15, m15
mov r7d, 128*13
.main_zero_loop:
mova [cq+r7-128*1], m15
mova [cq+r7+128*0], m15
mova [cq+r7+128*1], m15
mova [cq+r7+128*2], m15
sub r7d, 128*4
jg .main_zero_loop
add cq, 32
psrld m15, m11, 10 ; pd_2
mova m8, [r6-32*4]
mova m9, [r6+32*3]
REPX {paddd x, m15}, m0, m1, m2, m3, m4, m5, m6, m7
psubd m10, m0, m8 ; out15
paddd m0, m8 ; out0
mova m8, [r6-32*3]
psubd m15, m7, m9 ; out8
paddd m7, m9 ; out7
mova m9, [r6+32*2]
REPX {psrad x, 2}, m0, m15, m10, m7
packssdw m0, m15
packssdw m7, m10
psubd m10, m1, m8 ; out14
paddd m1, m8 ; out1
mova m8, [r6-32*2]
psubd m15, m6, m9 ; out9
paddd m6, m9 ; out6
mova m9, [r6+32*1]
REPX {psrad x, 2}, m1, m15, m10, m6
packssdw m1, m15
packssdw m6, m10
psubd m10, m2, m8 ; out13
paddd m2, m8 ; out2
mova m8, [r6-32*1]
psubd m15, m5, m9 ; out10
paddd m5, m9 ; out5
mova m9, [r6+32*0]
REPX {psrad x, 2}, m2, m15, m10, m5
packssdw m2, m15
packssdw m5, m10
psubd m10, m3, m8 ; out12
paddd m3, m8 ; out3
psubd m15, m4, m9 ; out11
paddd m4, m9 ; out4
REPX {psrad x, 2}, m3, m15, m10, m4
packssdw m3, m15
packssdw m4, m10
call m(idct_16x8_internal_16bpc).transpose3
mova [r6-32*4], m0
mova [r6-32*3], m1
mova [r6-32*2], m2
mova [r6-32*1], m3
mova [r6+32*0], m4
mova [r6+32*1], m5
mova [r6+32*2], m6
mova [r6+32*3], m7
add r6, 32*8
ret
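; pass-2 tail shared with 32x64/64x64: idct64 steps 6-9 run on words in
; the 8bpc main_part2_internal, then four IDCT64_PART2_END invocations
; per iteration add the out0+n/31-n/32+n/63-n row groups into dst with
; rounding and clipping, dstq stepping down while r2 steps up, until the
; r4/r5 scratch pointers meet.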
.main_part2_pass2:
vpbroadcastd m11, [pw_1567_3784]
vpbroadcastd m12, [pw_m3784_1567]
vpbroadcastd m13, [pw_2896_2896]
lea rax, [pw_5+128]
lea r2, [dstq+r7]
.main_part2_pass2_loop:
vpbroadcastd m14, [pw_m2896_2896]
call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part2_internal
vpbroadcastd m14, [pw_2048]
IDCT64_PART2_END 0, 7, 0, 6, 9, 10, strideq*0, r3*4, r8*8, r7*8
IDCT64_PART2_END 7, 8, 5, 0, 6, 7, strideq*0, r3*4, r8*8, r7*8
IDCT64_PART2_END 8, 2, 1, 0, 6, 7, strideq*8, r8*4, r9*8, r3*8
IDCT64_PART2_END 15, 3, 4, 0, 6, 7, strideq*8, r8*4, r9*8, r3*8
add dstq, strideq
sub r2, strideq
cmp r4, r5
jne .main_part2_pass2_loop
ret
ALIGN function_align
.main_part1_rect2:
REPX {paddd x, m11}, m0, m1, m2, m3
REPX {psrad x, 12 }, m0, m1, m2, m3
.main_part1: ; idct64 steps 1-5
; in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
; in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
; in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
; in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
vpbroadcastd m7, [r5+4*0]
vpbroadcastd m8, [r5+4*1]
vpbroadcastd m6, [r5+4*2]
vpbroadcastd m9, [r5+4*3]
vpbroadcastd m5, [r5+4*4]
vpbroadcastd m10, [r5+4*5]
vpbroadcastd m4, [r5+4*6]
vpbroadcastd m15, [r5+4*7]
pmulld m7, m0 ; t63a
pmulld m0, m8 ; t32a
pmulld m6, m1 ; t62a
pmulld m1, m9 ; t33a
pmulld m5, m2 ; t61a
pmulld m2, m10 ; t34a
pmulld m4, m3 ; t60a
pmulld m3, m15 ; t35a
vpbroadcastd m10, [r5+4*8]
vpbroadcastd m15, [r5+4*9]
REPX {paddd x, m11}, m7, m0, m6, m1, m5, m2, m4, m3
REPX {psrad x, 12 }, m0, m1, m7, m6, m2, m3, m5, m4
psubd m8, m0, m1 ; t33
paddd m0, m1 ; t32
psubd m1, m7, m6 ; t62
paddd m7, m6 ; t63
psubd m6, m3, m2 ; t34
paddd m3, m2 ; t35
psubd m2, m4, m5 ; t61
paddd m4, m5 ; t60
REPX {pmaxsd x, m12}, m8, m1, m6, m2
REPX {pminsd x, m13}, m8, m1, m6, m2
ITX_MULSUB_2D 1, 8, 5, 9, _, 11, 10, 15 ; t33a, t62a
ITX_MULSUB_2D 2, 6, 5, 9, _, 11, 10, 15, 4 ; t61a, t34a
REPX {pmaxsd x, m12}, m0, m3, m7, m4
REPX {pminsd x, m13}, m0, m3, m7, m4
vpbroadcastd m10, [r5+4*10]
vpbroadcastd m15, [r5+4*11]
psubd m5, m0, m3 ; t35a
paddd m0, m3 ; t32a
psubd m3, m7, m4 ; t60a
paddd m7, m4 ; t63a
psubd m4, m1, m6 ; t34
paddd m1, m6 ; t33
psubd m6, m8, m2 ; t61
paddd m8, m2 ; t62
REPX {pmaxsd x, m12}, m5, m3, m4, m6
REPX {pminsd x, m13}, m5, m3, m4, m6
ITX_MULSUB_2D 3, 5, 2, 9, _, 11, 10, 15 ; t35, t60
ITX_MULSUB_2D 6, 4, 2, 9, _, 11, 10, 15 ; t34a, t61a
REPX {pmaxsd x, m12}, m0, m7, m1, m8
REPX {pminsd x, m13}, m0, m7, m1, m8
add r5, 4*12
mova [r6-32*4], m0
mova [r6+32*3], m7
mova [r6-32*3], m1
mova [r6+32*2], m8
mova [r6-32*2], m6
mova [r6+32*1], m4
mova [r6-32*1], m3
mova [r6+32*0], m5
add r6, 32*8
ret
.main_part2: ; idct64 steps 6-9
lea r5, [r6+32*3]
sub r6, 32*4
vpbroadcastd m10, [pd_1567]
vpbroadcastd m15, [pd_3784]
.main_part2_loop:
mova m0, [r6-32*32] ; t32a
mova m1, [r5-32*24] ; t39a
mova m2, [r5-32*32] ; t63a
mova m3, [r6-32*24] ; t56a
mova m4, [r6-32*16] ; t40a
mova m5, [r5-32* 8] ; t47a
mova m6, [r5-32*16] ; t55a
mova m7, [r6-32* 8] ; t48a
psubd m8, m0, m1 ; t39
paddd m0, m1 ; t32
psubd m1, m2, m3 ; t56
paddd m2, m3 ; t63
psubd m3, m5, m4 ; t40
paddd m5, m4 ; t47
psubd m4, m7, m6 ; t55
paddd m7, m6 ; t48
REPX {pmaxsd x, m12}, m8, m1, m3, m4
REPX {pminsd x, m13}, m8, m1, m3, m4
ITX_MULSUB_2D 1, 8, 6, 9, _, 11, 10, 15 ; t39a, t56a
ITX_MULSUB_2D 4, 3, 6, 9, _, 11, 10, 15, 4 ; t55a, t40a
REPX {pmaxsd x, m12}, m0, m2, m5, m7
REPX {pminsd x, m13}, m0, m2, m5, m7
psubd m6, m2, m7 ; t48a
paddd m2, m7 ; t63a
psubd m7, m0, m5 ; t47a
paddd m0, m5 ; t32a
psubd m5, m8, m4 ; t55
paddd m8, m4 ; t56
psubd m4, m1, m3 ; t40
paddd m1, m3 ; t39
REPX {pmaxsd x, m12}, m6, m7, m5, m4
REPX {pminsd x, m13}, m6, m7, m5, m4
REPX {pmulld x, m14}, m6, m7, m5, m4
REPX {pmaxsd x, m12}, m2, m0, m8, m1
REPX {pminsd x, m13}, m2, m0, m8, m1
paddd m6, m11
paddd m5, m11
psubd m3, m6, m7 ; t47
paddd m6, m7 ; t48
psubd m7, m5, m4 ; t40a
paddd m5, m4 ; t55a
REPX {psrad x, 12}, m3, m6, m7, m5
mova [r5-32* 8], m2
mova [r6-32*32], m0
mova [r6-32* 8], m8
mova [r5-32*32], m1
mova [r5-32*24], m3
mova [r6-32*16], m6
mova [r6-32*24], m7
mova [r5-32*16], m5
add r6, 32
sub r5, 32
cmp r6, r5
jl .main_part2_loop
ret
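; dct-dct 32x64: pass 1 is the rect2-scaled 32-point column idct (pass-1
; shift of 1 via pd_1) over up to four 8-wide slices; pass 2 runs the
; same 64-point construction as 16x64 twice, once per 16-pixel half.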
cglobal inv_txfm_add_dct_dct_32x64_16bpc, 4, 7, 0, dst, stride, c, eob
test eobd, eobd
jz .dconly
PROLOGUE 0, 11, 16, 32*134, dst, stride, c, eob
%undef cmp
vpbroadcastd m12, [clip_min]
vpbroadcastd m13, [clip_max]
lea r6, [rsp+32*6]
call .main
cmp eobd, 36
jl .fast
call .main
cmp eobd, 136
jl .fast
call .main
cmp eobd, 300
jl .fast
call .main
jmp .pass2
.dconly:
imul r6d, [cq], 2896
mov [cq], eobd ; 0
mov r3d, 64
add r6d, 2048
sar r6d, 12
imul r6d, 2896
add r6d, 6144
sar r6d, 13
jmp m(inv_txfm_add_dct_dct_32x8_16bpc).dconly2
.fast:
lea r4, [rsp+32*70]
pxor m0, m0
.fast_loop:
REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
add r6, 32*8
cmp r6, r4
jl .fast_loop
.pass2:
lea rax, [pw_5+128]
mov r10, rsp
lea r8, [strideq*4]
lea r9, [strideq*5]
lea r3, [r9+strideq*1] ; stride*6
lea r7, [r9+strideq*2] ; stride*7
.pass2_loop:
mova m0, [r10+32* 2] ; in0
mova m1, [r10+32* 6] ; in4
mova m2, [r10+32*18] ; in8
mova m3, [r10+32*22] ; in12
mova m4, [r10+32*34] ; in16
mova m5, [r10+32*38] ; in20
mova m6, [r10+32*50] ; in24
mova m7, [r10+32*54] ; in28
pxor m8, m8
REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
mova [rsp], m8
call m(idct_16x16_internal_8bpc).main
mova m1, [rsp+32*1]
lea r4, [rsp+32*70]
mova [r4-32*4], m0
mova [r4-32*3], m1
mova [r4-32*2], m2
mova [r4-32*1], m3
mova [r4+32*0], m4
mova [r4+32*1], m5
mova [r4+32*2], m6
mova [r4+32*3], m7
add r4, 32*8
mova [r4-32*4], m8
mova [r4-32*3], m9
mova [r4-32*2], m10
mova [r4-32*1], m11
mova [r4+32*0], m12
mova [r4+32*1], m13
mova [r4+32*2], m14
mova [r4+32*3], m15
mova m0, [r10+32* 4] ; in2
mova m1, [r10+32* 8] ; in6
mova m2, [r10+32*20] ; in10
mova m3, [r10+32*24] ; in14
mova m4, [r10+32*36] ; in18
mova m5, [r10+32*40] ; in22
mova m6, [r10+32*52] ; in26
mova m7, [r10+32*56] ; in30
lea r5, [r4+32*16]
add r4, 32*8
call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
mova m0, [r10+32* 3] ; in1
mova m1, [r10+32*57] ; in31
mova m2, [r10+32*35] ; in17
mova m3, [r10+32*25] ; in15
mova m4, [r10+32*19] ; in9
mova m5, [r10+32*41] ; in23
mova m6, [r10+32*51] ; in25
mova m7, [r10+32* 9] ; in7
lea rax, [idct64_mul - 8]
add r4, 32*16
add r5, 32*32
call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
mova m0, [r10+32* 7] ; in5
mova m1, [r10+32*53] ; in27
mova m2, [r10+32*39] ; in21
mova m3, [r10+32*21] ; in11
mova m4, [r10+32*23] ; in13
mova m5, [r10+32*37] ; in19
mova m6, [r10+32*55] ; in29
mova m7, [r10+32* 5] ; in3
add rax, 8
add r4, 32*8
sub r5, 32*8
call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part2_pass2
add r10, 32*8
sub r4, 32*98 ; rsp+32*16
sub dstq, r8
add dstq, 32
cmp r10, r4
jl .pass2_loop
RET
ALIGN function_align
.main:
vpbroadcastd m14, [pd_2896]
vpbroadcastd m11, [pd_2048]
pmulld m0, m14, [cq+128* 1]
pmulld m1, m14, [cq+128* 7]
pmulld m2, m14, [cq+128* 9]
pmulld m3, m14, [cq+128*15]
pmulld m4, m14, [cq+128*17]
pmulld m5, m14, [cq+128*23]
pmulld m6, m14, [cq+128*25]
pmulld m7, m14, [cq+128*31]
call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part1_rect2
pmulld m0, m14, [cq+128* 3]
pmulld m1, m14, [cq+128* 5]
pmulld m2, m14, [cq+128*11]
pmulld m3, m14, [cq+128*13]
pmulld m4, m14, [cq+128*19]
pmulld m5, m14, [cq+128*21]
pmulld m6, m14, [cq+128*27]
pmulld m7, m14, [cq+128*29]
call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part2_rect2
pmulld m0, m14, [cq+128* 2]
pmulld m1, m14, [cq+128* 6]
pmulld m2, m14, [cq+128*10]
pmulld m3, m14, [cq+128*14]
pmulld m4, m14, [cq+128*18]
pmulld m5, m14, [cq+128*22]
pmulld m6, m14, [cq+128*26]
pmulld m7, m14, [cq+128*30]
call m(idct_8x16_internal_16bpc).main_oddhalf_rect2
pmulld m0, m14, [cq+128* 0]
pmulld m1, m14, [cq+128* 4]
pmulld m2, m14, [cq+128* 8]
pmulld m3, m14, [cq+128*12]
pmulld m4, m14, [cq+128*16]
pmulld m5, m14, [cq+128*20]
pmulld m6, m14, [cq+128*24]
pmulld m7, m14, [cq+128*28]
pxor m15, m15
mov r7d, 128*29
.main_zero_loop:
mova [cq+r7-128*1], m15
mova [cq+r7+128*0], m15
mova [cq+r7+128*1], m15
mova [cq+r7+128*2], m15
sub r7d, 128*4
jg .main_zero_loop
add cq, 32
call m(idct_8x8_internal_16bpc).main_rect2
call m(idct_8x16_internal_16bpc).main_evenhalf
call m(inv_txfm_add_dct_dct_32x16_16bpc).main_end
call m(inv_txfm_add_dct_dct_8x32_16bpc).transpose
mova [r4-32*4], m0
mova [r4-32*3], m1
mova [r4-32*2], m2
mova [r4-32*1], m3
mova [r4+32*0], m4
mova [r4+32*1], m5
mova [r4+32*2], m6
mova [r4+32*3], m7
mova m0, [r5+32*3]
mova m1, [r5+32*2]
mova m2, [r5+32*1]
mova m3, [r5+32*0]
mova m4, [r5-32*1]
mova m5, [r5-32*2]
mova m6, [r5-32*3]
mova m7, [r5-32*4]
call m(inv_txfm_add_dct_dct_8x32_16bpc).transpose
mova [r5-32*4], m0
mova [r5-32*3], m1
mova [r5-32*2], m2
mova [r5-32*1], m3
mova [r5+32*0], m4
mova [r5+32*1], m5
mova [r5+32*2], m6
mova [r5+32*3], m7
ret
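; dct-dct 64x16: pass 1 builds the 64-point column idct from the 16bpc
; main_part1/main_part2 helpers plus the idct8/16/32 fast paths, combined
; in .main_end_loop and emitted by .shift_transpose (>>2); pass 2 reuses
; the 8bpc 16x16 idct and write_16x16 per 16-pixel column group. the
; dc-only path saves xmm6 manually on WIN64 since no PROLOGUE has run,
; and dconly2 folds the second-pass scale with the final >>4 rounding
; (34816 = 2048 + 32768).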
cglobal inv_txfm_add_dct_dct_64x16_16bpc, 4, 7, 0, dst, stride, c, eob
test eobd, eobd
jnz .normal
imul r6d, [cq], 2896
mov [cq], eobd ; 0
mov r3d, 16
.dconly:
add r6d, 10240
sar r6d, 14
.dconly2:
imul r6d, 2896
add r6d, 34816
sar r6d, 16
movd xm0, r6d
%if WIN64
movaps [rsp+8], xmm6
%endif
vpbroadcastw m0, xm0
vpbroadcastd m6, [pixel_max]
pxor m5, m5
.dconly_loop:
paddw m1, m0, [dstq+32*0]
paddw m2, m0, [dstq+32*1]
paddw m3, m0, [dstq+32*2]
paddw m4, m0, [dstq+32*3]
REPX {pmaxsw x, m5}, m1, m2, m3, m4
REPX {pminsw x, m6}, m1, m2, m3, m4
mova [dstq+32*0], m1
mova [dstq+32*1], m2
mova [dstq+32*2], m3
mova [dstq+32*3], m4
add dstq, strideq
dec r3d
jg .dconly_loop
%if WIN64
movaps xmm6, [rsp+8]
%endif
RET
.normal:
PROLOGUE 0, 8, 16, 32*96, dst, stride, c, eob
%undef cmp
vpbroadcastd m11, [pd_2048]
vpbroadcastd m12, [clip_min]
vpbroadcastd m13, [clip_max]
vpbroadcastd m14, [pd_2896]
lea r6, [rsp+32*4]
call .main
call .shift_transpose
cmp eobd, 36
jl .fast
call .main
call .shift_transpose
jmp .pass2
.fast:
pxor m0, m0
mov r3d, 4
.fast_loop:
REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
add r6, 32*8
dec r3d
jg .fast_loop
.pass2:
lea r7, [r6-32*64]
lea r4, [r6-32*32]
lea rax, [pw_5+128]
mov r5, dstq
.pass2_loop:
mova m0, [r7-32*4]
mova m1, [r7-32*3]
mova m2, [r7-32*2]
mova m3, [r7-32*1]
mova m4, [r7+32*0]
mova m5, [r7+32*1]
mova m6, [r7+32*2]
mova m7, [r7+32*3]
add r7, 32*32
mova m8, [r7-32*4]
mova m9, [r7-32*3]
mova m10, [r7-32*2]
mova m11, [r7-32*1]
mova m12, [r7+32*0]
mova m13, [r7+32*1]
mova m14, [r7+32*2]
mova m15, [r7+32*3]
sub r7, 32*24
mova [rsp], m15
call m(idct_16x16_internal_8bpc).main
mova m1, [rsp+32*1]
call m(inv_txfm_add_dct_dct_32x16_16bpc).write_16x16
add r5, 32
mov dstq, r5
cmp r7, r4
jl .pass2_loop
RET
ALIGN function_align
.main:
lea r5, [idct64_mul_16bpc]
mova m0, [cq+64* 1]
mova m1, [cq+64*31]
mova m2, [cq+64*17]
mova m3, [cq+64*15]
call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1
mova m0, [cq+64* 7]
mova m1, [cq+64*25]
mova m2, [cq+64*23]
mova m3, [cq+64* 9]
call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1
mova m0, [cq+64* 5]
mova m1, [cq+64*27]
mova m2, [cq+64*21]
mova m3, [cq+64*11]
call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1
mova m0, [cq+64* 3]
mova m1, [cq+64*29]
mova m2, [cq+64*19]
mova m3, [cq+64*13]
call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1
call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part2
mova m0, [cq+64* 2]
mova m1, [cq+64*14]
mova m2, [cq+64*18]
mova m3, [cq+64*30]
call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part1_fast
mova m0, [cq+64* 6]
mova m1, [cq+64*10]
mova m2, [cq+64*22]
mova m3, [cq+64*26]
call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part2_fast
mova m0, [cq+64* 4]
mova m1, [cq+64*12]
mova m2, [cq+64*20]
mova m3, [cq+64*28]
call m(idct_8x16_internal_16bpc).main_oddhalf_fast
mova m0, [cq+64* 0]
mova m1, [cq+64* 8]
mova m2, [cq+64*16]
mova m3, [cq+64*24]
pxor m15, m15
mov r7d, 64*30
.main_zero_loop:
mova [cq+r7-64*2], m15
mova [cq+r7-64*1], m15
mova [cq+r7+64*0], m15
mova [cq+r7+64*1], m15
sub r7d, 64*4
jg .main_zero_loop
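; final butterfly stage: r5 walks forward and r6 backward over the
; scratch, combining idct8+idct16 into idct16 outputs, those with the
; idct32 halves, then with the idct64 halves, storing everything back
; unshifted (the shift happens in .shift_transpose). only in0/8/16/24
; feed the idct8, so m4-m7 are zeroed; m15 holds the rounding constant
; added before the last combine.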
.main_end:
psrld m15, m11, 10 ; pd_2
.main_end2:
add cq, 32
pxor m4, m4
REPX {mova x, m4}, m5, m6, m7
call m(idct_8x8_internal_16bpc).main
add r6, 32*8
call m(idct_8x16_internal_16bpc).main_evenhalf
mova [r6+32*2], m1
mova [r6+32*1], m2
mova [r6+32*0], m3
mova [r6-32*1], m4
mova [r6-32*2], m5
mova [r6-32*3], m6
mova [r6-32*4], m7
jmp .main_end_loop_start
.main_end_loop:
mova m0, [r6+32* 3] ; idct8 0 + n
.main_end_loop_start:
mova m1, [r5+32* 4] ; idct16 15 - n
mova m2, [r5-32*12] ; idct32 16 + n
mova m3, [r6-32*13] ; idct32 31 - n
mova m4, [r6-32*29] ; idct64 63 - n
mova m5, [r5-32*28] ; idct64 48 + n
mova m6, [r6-32*45] ; idct64 47 - n
mova m7, [r5-32*44] ; idct64 32 + n
paddd m8, m0, m1 ; idct16 out0 + n
psubd m0, m1 ; idct16 out15 - n
REPX {pmaxsd x, m12}, m8, m0
REPX {pminsd x, m13}, m8, m0
paddd m1, m8, m3 ; idct32 out0 + n
psubd m8, m3 ; idct32 out31 - n
paddd m3, m0, m2 ; idct32 out15 - n
psubd m0, m2 ; idct32 out16 + n
REPX {pmaxsd x, m12}, m1, m8, m3, m0
REPX {pminsd x, m13}, m1, m8, m3, m0
REPX {paddd x, m15}, m1, m3, m0, m8
paddd m2, m1, m4 ; idct64 out0 + n (unshifted)
psubd m1, m4 ; idct64 out63 - n (unshifted)
paddd m4, m3, m5 ; idct64 out15 - n (unshifted)
psubd m3, m5 ; idct64 out48 + n (unshifted)
paddd m5, m0, m6 ; idct64 out16 + n (unshifted)
psubd m0, m6 ; idct64 out47 - n (unshifted)
paddd m6, m8, m7 ; idct64 out31 - n (unshifted)
psubd m8, m7 ; idct64 out32 + n (unshifted)
mova [r5-32*44], m2
mova [r6+32* 3], m1
mova [r6-32*45], m4
mova [r5+32* 4], m3
mova [r5-32*28], m5
mova [r6-32*13], m0
mova [r6-32*29], m6
mova [r5-32*12], m8
add r5, 32
sub r6, 32
cmp r5, r6
jl .main_end_loop
ret
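; pack the 32-bit idct64 results to words with the given shift
; (2 here, 1 when instantiated by 64x32) and transpose 16x8 tiles back
; into scan order via transpose3; r4 marks the end of the destination
; region and r6 is left pointing at it for the caller.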
.shift_transpose:
%macro IDCT64_SHIFT_TRANSPOSE 1 ; shift
sub r6, 32*48
mov r5, r6
%%loop:
mova m0, [r6-32* 4]
mova m4, [r6+32* 4]
mova m1, [r6-32* 3]
mova m5, [r6+32* 5]
mova m2, [r6-32* 2]
mova m6, [r6+32* 6]
mova m3, [r6-32* 1]
mova m7, [r6+32* 7]
REPX {psrad x, %1}, m0, m4, m1, m5, m2, m6, m3, m7
packssdw m0, m4
packssdw m1, m5
packssdw m2, m6
packssdw m3, m7
mova m4, [r6+32* 0]
mova m6, [r6+32* 8]
mova m5, [r6+32* 1]
mova m7, [r6+32* 9]
REPX {psrad x, %1}, m4, m6, m5, m7
packssdw m4, m6
packssdw m5, m7
mova m6, [r6+32* 2]
mova m8, [r6+32*10]
mova m7, [r6+32* 3]
mova m9, [r6+32*11]
REPX {psrad x, %1}, m6, m8, m7, m9
packssdw m6, m8
packssdw m7, m9
call m(idct_16x8_internal_16bpc).transpose3
mova [r5-32*4], m0
mova [r5-32*3], m1
mova [r5-32*2], m2
mova [r5-32*1], m3
mova [r5+32*0], m4
mova [r5+32*1], m5
mova [r5+32*2], m6
mova [r5+32*3], m7
add r6, 32*16
add r5, 32*8
cmp r5, r4
jl %%loop
mov r6, r4
%endmacro
IDCT64_SHIFT_TRANSPOSE 2
ret
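; dct-dct 64x32: same 64-point pass 1 as 64x16 but rect2-scaled and with
; a pass-1 shift of 1 (pd_1 rounding); pass 2 uses the 16x32 helpers,
; stepping 16 pixels right per .pass2_loop iteration.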
cglobal inv_txfm_add_dct_dct_64x32_16bpc, 4, 7, 0, dst, stride, c, eob
test eobd, eobd
jz .dconly
PROLOGUE 0, 8, 16, 32*163, dst, stride, c, eob
%undef cmp
vpbroadcastd m11, [pd_2048]
vpbroadcastd m12, [clip_min]
vpbroadcastd m13, [clip_max]
vpbroadcastd m14, [pd_2896]
lea r6, [rsp+32*7]
call .main
cmp eobd, 36
jl .fast
call .main
cmp eobd, 136
jl .fast
call .main
cmp eobd, 300
jl .fast
call .main
jmp .pass2
.dconly:
imul r6d, [cq], 2896
mov [cq], eobd ; 0
mov r3d, 32
add r6d, 2048
sar r6d, 12
imul r6d, 2896
add r6d, 6144
sar r6d, 13
jmp m(inv_txfm_add_dct_dct_64x16_16bpc).dconly2
.fast:
pxor m0, m0
lea r4, [rsp+32*135]
.fast_loop:
REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
add r6, 32*8
cmp r6, r4
jl .fast_loop
.pass2:
lea r7, [r6-32*32]
lea r5, [r6+32*8]
lea rax, [pw_5+128]
imul r2, strideq, 19
lea r3, [strideq*3]
add r2, dstq
.pass2_loop:
mova m0, [r7-32*99]
mova m1, [r7-32*97]
mova m2, [r7-32*95]
mova m3, [r7-32*93]
mova m4, [r7-32*67]
mova m5, [r7-32*65]
mova m6, [r7-32*63]
mova m7, [r7-32*61]
mova m8, [r7-32*35]
mova m9, [r7-32*33]
mova m10, [r7-32*31]
mova m11, [r7-32*29]
mova m12, [r7-32* 3]
mova m13, [r7-32* 1]
mova m14, [r7+32* 1]
mova m15, [r7+32* 3]
call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf
mova m0, [r7-32*100]
mova m1, [r7-32*98]
mova m2, [r7-32*96]
mova m3, [r7-32*94]
mova m4, [r7-32*68]
mova m5, [r7-32*66]
mova m6, [r7-32*64]
mova m7, [r7-32*62]
mova m8, [r7-32*36]
mova m9, [r7-32*34]
mova m10, [r7-32*32]
mova m11, [r7-32*30]
mova m12, [r7-32* 4]
mova m13, [r7-32* 2]
mova m14, [r7+32* 0]
mova m15, [r7+32* 2]
add r7, 32*8
mova [rsp], m15
call m(idct_16x16_internal_8bpc).main
call m(inv_txfm_add_dct_dct_16x32_16bpc).pass2_end
sub dstq, r3
lea r2, [r2+r3+32]
add dstq, 32
cmp r7, r4
jl .pass2_loop
RET
ALIGN function_align
.main:
lea r5, [idct64_mul_16bpc]
pmulld m0, m14, [cq+128* 1]
pmulld m1, m14, [cq+128*31]
pmulld m2, m14, [cq+128*17]
pmulld m3, m14, [cq+128*15]
call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1_rect2
pmulld m0, m14, [cq+128* 7]
pmulld m1, m14, [cq+128*25]
pmulld m2, m14, [cq+128*23]
pmulld m3, m14, [cq+128* 9]
call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1_rect2
pmulld m0, m14, [cq+128* 5]
pmulld m1, m14, [cq+128*27]
pmulld m2, m14, [cq+128*21]
pmulld m3, m14, [cq+128*11]
call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1_rect2
pmulld m0, m14, [cq+128* 3]
pmulld m1, m14, [cq+128*29]
pmulld m2, m14, [cq+128*19]
pmulld m3, m14, [cq+128*13]
call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1_rect2
call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part2
pmulld m0, m14, [cq+128* 2]
pmulld m1, m14, [cq+128*14]
pmulld m2, m14, [cq+128*18]
pmulld m3, m14, [cq+128*30]
call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part1_fast_rect2
pmulld m0, m14, [cq+128* 6]
pmulld m1, m14, [cq+128*10]
pmulld m2, m14, [cq+128*22]
pmulld m3, m14, [cq+128*26]
call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part2_fast_rect2
pmulld m0, m14, [cq+128* 4]
pmulld m1, m14, [cq+128*12]
pmulld m2, m14, [cq+128*20]
pmulld m3, m14, [cq+128*28]
call m(idct_8x16_internal_16bpc).main_oddhalf_fast_rect2
pmulld m0, m14, [cq+128* 0]
pmulld m1, m14, [cq+128* 8]
pmulld m2, m14, [cq+128*16]
pmulld m3, m14, [cq+128*24]
pxor m15, m15
mov r7d, 128*29
.main_zero_loop:
mova [cq+r7-128*1], m15
mova [cq+r7+128*0], m15
mova [cq+r7+128*1], m15
mova [cq+r7+128*2], m15
sub r7d, 128*4
jg .main_zero_loop
psrld m15, m11, 11 ; pd_1
REPX {paddd x, m11}, m0, m1, m2, m3
REPX {psrad x, 12 }, m0, m1, m2, m3
call m(inv_txfm_add_dct_dct_64x16_16bpc).main_end2
IDCT64_SHIFT_TRANSPOSE 1
ret
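; dct-dct 64x64: a 64-point idct in both passes; pass 1 is the 64x16
; main followed by shift_transpose (>>2), pass 2 rebuilds the 64-point
; idct per 16-pixel column group exactly like the 16x64 pass 2.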
cglobal inv_txfm_add_dct_dct_64x64_16bpc, 4, 7, 0, dst, stride, c, eob
test eobd, eobd
jz .dconly
PROLOGUE 0, 11, 16, 32*195, dst, stride, c, eob
%undef cmp
vpbroadcastd m11, [pd_2048]
vpbroadcastd m12, [clip_min]
vpbroadcastd m13, [clip_max]
vpbroadcastd m14, [pd_2896]
lea r6, [rsp+32*7]
call .main
cmp eobd, 36
jl .fast
call .main
cmp eobd, 136
jl .fast
call .main
cmp eobd, 300
jl .fast
call .main
jmp .pass2
.dconly:
imul r6d, [cq], 2896
mov [cq], eobd ; 0
mov r3d, 64
jmp m(inv_txfm_add_dct_dct_64x16_16bpc).dconly
.fast:
pxor m0, m0
lea r4, [rsp+32*135]
.fast_loop:
REPX {mova [r6+32*x], m0}, -4, -3, -2, -1, 0, 1, 2, 3
add r6, 32*8
cmp r6, r4
jl .fast_loop
.pass2:
lea r10, [r6-32*32]
lea rax, [pw_5+128]
lea r8, [strideq*4]
lea r9, [strideq*5]
lea r3, [r9+strideq*1] ; stride*6
lea r7, [r9+strideq*2] ; stride*7
.pass2_loop:
mova m0, [r10-32*100] ; in0
mova m1, [r10-32*96] ; in4
mova m2, [r10-32*68] ; in8
mova m3, [r10-32*64] ; in12
mova m4, [r10-32*36] ; in16
mova m5, [r10-32*32] ; in20
mova m6, [r10-32* 4] ; in24
mova m7, [r10+32* 0] ; in28
pxor m8, m8
REPX {mova x, m8}, m9, m10, m11, m12, m13, m14
mova [rsp], m8
call m(idct_16x16_internal_8bpc).main
mova m1, [rsp+32*1]
mova [r4-32*4], m0
mova [r4-32*3], m1
mova [r4-32*2], m2
mova [r4-32*1], m3
mova [r4+32*0], m4
mova [r4+32*1], m5
mova [r4+32*2], m6
mova [r4+32*3], m7
add r4, 32*8
mova [r4-32*4], m8
mova [r4-32*3], m9
mova [r4-32*2], m10
mova [r4-32*1], m11
mova [r4+32*0], m12
mova [r4+32*1], m13
mova [r4+32*2], m14
mova [r4+32*3], m15
mova m0, [r10-32*98] ; in2
mova m1, [r10-32*94] ; in6
mova m2, [r10-32*66] ; in10
mova m3, [r10-32*62] ; in14
mova m4, [r10-32*34] ; in18
mova m5, [r10-32*30] ; in22
mova m6, [r10-32* 2] ; in26
mova m7, [r10+32* 2] ; in30
lea r5, [r4+32*16]
add r4, 32*8
call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
mova m0, [r10-32*99] ; in1
mova m1, [r10+32* 3] ; in31
mova m2, [r10-32*35] ; in17
mova m3, [r10-32*61] ; in15
mova m4, [r10-32*67] ; in9
mova m5, [r10-32*29] ; in23
mova m6, [r10-32* 3] ; in25
mova m7, [r10-32*93] ; in7
lea rax, [idct64_mul - 8]
add r4, 32*16
add r5, 32*32
call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
mova m0, [r10-32*95] ; in5
mova m1, [r10-32* 1] ; in27
mova m2, [r10-32*31] ; in21
mova m3, [r10-32*65] ; in11
mova m4, [r10-32*63] ; in13
mova m5, [r10-32*33] ; in19
mova m6, [r10+32* 1] ; in29
mova m7, [r10-32*97] ; in3
add rax, 8
add r4, 32*8
sub r5, 32*8
call m(inv_txfm_add_dct_dct_16x64_8bpc).main_part1
call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part2_pass2
add r10, 32*8
sub dstq, r8
sub r4, 32*44
add dstq, 32
cmp r10, r4
jl .pass2_loop
RET
ALIGN function_align
.main:
lea r5, [idct64_mul_16bpc]
mova m0, [cq+128* 1]
mova m1, [cq+128*31]
mova m2, [cq+128*17]
mova m3, [cq+128*15]
call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1
mova m0, [cq+128* 7]
mova m1, [cq+128*25]
mova m2, [cq+128*23]
mova m3, [cq+128* 9]
call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1
mova m0, [cq+128* 5]
mova m1, [cq+128*27]
mova m2, [cq+128*21]
mova m3, [cq+128*11]
call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1
mova m0, [cq+128* 3]
mova m1, [cq+128*29]
mova m2, [cq+128*19]
mova m3, [cq+128*13]
call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part1
call m(inv_txfm_add_dct_dct_16x64_16bpc).main_part2
mova m0, [cq+128* 2]
mova m1, [cq+128*14]
mova m2, [cq+128*18]
mova m3, [cq+128*30]
call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part1_fast
mova m0, [cq+128* 6]
mova m1, [cq+128*10]
mova m2, [cq+128*22]
mova m3, [cq+128*26]
call m(inv_txfm_add_dct_dct_8x32_16bpc).main_oddhalf_part2_fast
mova m0, [cq+128* 4]
mova m1, [cq+128*12]
mova m2, [cq+128*20]
mova m3, [cq+128*28]
call m(idct_8x16_internal_16bpc).main_oddhalf_fast
mova m0, [cq+128* 0]
mova m1, [cq+128* 8]
mova m2, [cq+128*16]
mova m3, [cq+128*24]
pxor m15, m15
mov r7d, 128*29
.main_zero_loop:
mova [cq+r7-128*1], m15
mova [cq+r7+128*0], m15
mova [cq+r7+128*1], m15
mova [cq+r7+128*2], m15
sub r7d, 128*4
jg .main_zero_loop
call m(inv_txfm_add_dct_dct_64x16_16bpc).main_end
jmp m(inv_txfm_add_dct_dct_64x16_16bpc).shift_transpose
%endif ; ARCH_X86_64