; Copyright (c) 2019, The rav1e contributors. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
%include "config.asm"
%include "ext/x86/x86inc.asm"
%if ARCH_X86_64
SECTION_RODATA 32
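; Alternating +1/-1 bytes: the signed pmaddubsw operand used by the
; LOAD_DIFF_Q(x2) macros below to turn interleaved src/dst bytes into
; src - dst words.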
maddubsw_hsub: times 16 db 1, -1
SECTION .text
%define m(x) mangle(private_prefix %+ _ %+ x %+ SUFFIX)
; Perform a 4x4 Hadamard transform on packed input, with 2 rows per register.
; Rows 0 and 2 are in m0 and rows 1 and 3 are in m1.
; A second set of packed input can also be taken in m2 and m3.
; Ends with each entry holding half the sum of a pair of absolute transform
; outputs, biased by -0x7FFF, i.e. the result is already partially reduced
; horizontally.
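; As a scalar reference (illustrative sketch only, not part of the build),
; the 4-point transform applied to each column x0..x3 is:
;   t0 = x0+x1, t1 = x0-x1, t2 = x2+x3, t3 = x2-x3   (stage 1)
;   y0 = t0+t2, y1 = t1+t3, y2 = t0-t2, y3 = t1-t3   (stage 2)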
%macro HADAMARD_4x4_PACKED 1
%if %1 == 1
%define tmp m2
; 2->0, 1->2, 0->1
%define ROTATE SWAP 2, 1, 0
%elif %1 == 2
%define tmp m4
; 4->0, 3->4, 2->3, 1->2, 0->1
%define ROTATE SWAP 4, 3, 2, 1, 0
%endif
; m0 d2 c2 b2 a2 d0 c0 b0 a0
; m1 d3 c3 b3 a3 d1 c1 b1 a1
; Stage 1
; m0 d2+d3 c2+c3 b2+b3 a2+a3 d0+d1 c0+c1 b0+b1 a0+a1
; m1 d2-d3 c2-c3 b2-b3 a2-a3 d0-d1 c0-c1 b0-b1 a0-a1
paddw tmp, m0, m1
psubw m0, m1
%if %1 == 2
paddw m1, m2, m3
psubw m2, m3
%endif
ROTATE
; Stage 2
; m0 d0-d1 d0+d1 c0-c1 c0+c1 b0-b1 b0+b1 a0-a1 a0+a1
; m1 d2-d3 d2+d3 c2-c3 c2+c3 b2-b3 b2+b3 a2-a3 a2+a3
punpcklwd tmp, m0, m1
punpckhwd m0, m1
%if %1 == 2
punpcklwd m1, m2, m3
punpckhwd m2, m3
%endif
ROTATE
; m0 d0-d1+d2-d3 d0+d1+d2+d3 c0-c1+c2-c3 c0+c1+c2+c3
; b0-b1+b2-b3 b0+b1+b2+b3 a0-a1+a2-a3 a0+a1+a2+a3
; m1 d0-d1-d2+d3 d0+d1-d2-d3 c0-c1-c2+c3 c0+c1-c2-c3
; b0-b1-b2+b3 b0+b1-b2-b3 a0-a1-a2+a3 a0+a1-a2-a3
paddw tmp, m0, m1
psubw m0, m1
%if %1 == 2
paddw m1, m2, m3
psubw m2, m3
%endif
ROTATE
; m0 s2 s0 r2 r0 q2 q0 p2 p0
; m1 s3 s1 r3 r1 q3 q1 p3 p1
; Stage 1
; m0 q3 q1 q2 q0 p3 p1 p2 p0
; m1 s3 s1 s2 s0 r3 r1 r2 r0
punpckldq tmp, m0, m1
punpckhdq m0, m1
%if %1 == 2
punpckldq m1, m2, m3
punpckhdq m2, m3
%endif
ROTATE
; m0 q3+s3 q1+s1 q2+s2 q0+s0 p3+r3 p1+r1 p2+r2 p0+r0
; m1 q3-s3 q1-s1 q2-s2 q0-s0 p3-r3 p1-r1 p2-r2 p0-r0
paddw tmp, m0, m1
psubw m0, m1
%if %1 == 2
paddw m1, m2, m3
psubw m2, m3
%endif
ROTATE
; Stage 2
; m0 p3-r3 p1-r1 p2-r2 p0-r0 p3+r3 p1+r1 p2+r2 p0+r0
; m1 q3-s3 q1-s1 q2-s2 q0-s0 q3+s3 q1+s1 q2+s2 q0+s0
punpcklqdq tmp, m0, m1
punpckhqdq m0, m1
%if %1 == 2
punpcklqdq m1, m2, m3
punpckhqdq m2, m3
%endif
ROTATE
; Use the fact that
; (abs(a+b)+abs(a-b))/2 = max(abs(a),abs(b))
; to merge the final butterfly with the abs and the first stage of
; accumulation.
; Avoid pabsw by computing max(a, b) - min(a + b + 0x7FFF, 0x7FFF) instead,
; where the min comes for free from the saturating add (paddsw).
; Actually calculates (abs(a+b)+abs(a-b))/2-0x7FFF.
; The final sum must be offset to compensate for subtracting 0x7FFF.
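; Worked example (illustrative): for a = -5, b = 3,
; (abs(a+b)+abs(a-b))/2 = (2+8)/2 = 5 = max(abs(a), abs(b)), and the code
; computes max(a, b) - (a+b+0x7FFF) = 3 - (-2+0x7FFF) = 5 - 0x7FFF.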
paddw tmp, m0, m1
pmaxsw m0, m1
; m1 is free
; 0x7FFF
pcmpeqb m1, m1
psrlw m1, 1
paddsw tmp, m1
psubw m0, tmp
%if %1 == 2
paddw tmp, m2, m3
pmaxsw m2, m3
paddsw tmp, m1
psubw m2, tmp
paddw m0, m2
%endif
%endmacro
; Load diffs of 4 entries for 2 rows
%macro LOAD_PACK_DIFF_Dx2 7
movd m%1, %2
movd m%6, %4
punpckldq m%1, m%6
pmovzxbw m%1, m%1
movd m%6, %3
movd m%7, %5
punpckldq m%6, m%7
pmovzxbw m%6, m%6
psubw m%1, m%6
%endmacro
; Can only use 128-bit vectors
%macro SATD_4x4_FN 0
cglobal satd_4x4, 4, 6, 4, src, src_stride, dst, dst_stride, \
src_stride3, dst_stride3
lea src_stride3q, [src_strideq*3]
lea dst_stride3q, [dst_strideq*3]
; Load rows 0 and 2 into m0, and rows 1 and 3 into m1
LOAD_PACK_DIFF_Dx2 0, [srcq], [dstq], \
[srcq+src_strideq*2], [dstq+dst_strideq*2], \
2, 3
LOAD_PACK_DIFF_Dx2 1, [srcq+src_strideq*1], [dstq+dst_strideq*1], \
[srcq+src_stride3q], [dstq+dst_stride3q], \
2, 3
HADAMARD_4x4_PACKED 1
; Reduce horizontally
pshufd m1, m0, q3232
paddw m0, m1
pshuflw m1, m0, q3232
paddw m0, m1
pshuflw m1, m0, q1111
; Perform normalization during the final stage of accumulation
pavgw m0, m1
movd eax, m0
movzx eax, ax
; Add an offset for how the final butterfly stage and the first stage of
; accumulation were done. Since this offset is an even number, it can
; safely be applied after normalization with pavgw.
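; Here, 8 lanes each carry a -0x7FFF bias, which wraps to +0x8001 per lane
; mod 2^16; the accumulated bias is thus 8*0x8001 mod 2^16 = 8, and pavgw
; halves it to the 4 subtracted below.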
sub ax, 4
RET
%endmacro
INIT_XMM sse4
SATD_4x4_FN
INIT_XMM avx2
SATD_4x4_FN
; Load diffs of 8 entries for 2 rows
; Each set of 4 columns shares a 128-bit lane
%macro LOAD_PACK_DIFF_Qx2 7
movq xm%1, %2
movq xm%6, %4
punpckldq xm%1, xm%6
pmovzxbw m%1, xm%1
movq xm%6, %3
movq xm%7, %5
punpckldq xm%6, xm%7
pmovzxbw m%6, xm%6
psubw m%1, m%6
%endmacro
INIT_YMM avx2
cglobal satd_8x4, 4, 6, 4, src, src_stride, dst, dst_stride, \
src_stride3, dst_stride3
lea src_stride3q, [src_strideq*3]
lea dst_stride3q, [dst_strideq*3]
; Load rows 0 and 2 into m0, and rows 1 and 3 into m1
; Each set of 4 columns shares a 128-bit lane
LOAD_PACK_DIFF_Qx2 0, [srcq], [dstq], \
[srcq+src_strideq*2], [dstq+dst_strideq*2], \
2, 3
LOAD_PACK_DIFF_Qx2 1, [srcq+src_strideq*1], [dstq+dst_strideq*1], \
[srcq+src_stride3q], [dstq+dst_stride3q], \
2, 3
HADAMARD_4x4_PACKED 1
; Reduce horizontally
vextracti128 xm1, m0, 1
paddw xm0, xm1
pshufd xm1, xm0, q3232
paddw xm0, xm1
pshuflw xm1, xm0, q3232
paddw xm0, xm1
pshuflw xm1, xm0, q1111
; Perform normalization during the final stage of accumulation
pavgw xm0, xm1
movd eax, xm0
movzx eax, ax
; Add an offset for how the final butterfly stage and the first stage of
; accumulation were done. Since this offset is an even number, it can
; safely be applied after normalization with pavgw.
sub ax, 8
RET
; Load diffs of 4 entries for 4 rows
; Each set of two rows shares a 128-bit lane
%macro LOAD_PACK_DIFF_Dx4 12
movd xm%1, %2
movd xm%10, %4
punpckldq xm%1, xm%10
movd xm%10, %6
movd xm%11, %8
punpckldq xm%10, xm%11
punpcklqdq xm%1, xm%10
pmovzxbw m%1, xm%1
movd xm%10, %3
movd xm%11, %5
punpckldq xm%10, xm%11
movd xm%11, %7
movd xm%12, %9
punpckldq xm%11, xm%12
punpcklqdq xm%10, xm%11
pmovzxbw m%10, xm%10
psubw m%1, m%10
%endmacro
INIT_YMM avx2
cglobal satd_4x8, 4, 8, 5, src, src_stride, dst, dst_stride, \
src4, dst4, src_stride3, dst_stride3
lea src_stride3q, [src_strideq*3]
lea dst_stride3q, [dst_strideq*3]
lea src4q, [srcq+src_strideq*4]
lea dst4q, [dstq+dst_strideq*4]
; Load rows 0, 2, 4 and 6 into m0, and rows 1, 3, 5 and 7 into m1.
; The low lanes hold the lower four rows and the high lanes the upper four.
LOAD_PACK_DIFF_Dx4 0, [srcq], [dstq], \
[srcq+src_strideq*2], [dstq+dst_strideq*2], \
[src4q], [dst4q], \
[src4q+src_strideq*2], [dst4q+dst_strideq*2], \
2, 3, 4
LOAD_PACK_DIFF_Dx4 1, [srcq+src_strideq*1], [dstq+dst_strideq*1], \
[srcq+src_stride3q], [dstq+dst_stride3q], \
[src4q+src_strideq*1], [dst4q+dst_strideq*1], \
[src4q+src_stride3q], [dst4q+dst_stride3q], \
2, 3, 4
HADAMARD_4x4_PACKED 1
; Reduce horizontally
vextracti128 xm1, m0, 1
paddw xm0, xm1
pshufd xm1, xm0, q3232
paddw xm0, xm1
pshuflw xm1, xm0, q3232
paddw xm0, xm1
pshuflw xm1, xm0, q1111
; Perform normalization during the final stage of accumulation.
pavgw xm0, xm1
movd eax, xm0
movzx eax, ax
sub ax, 8
RET
; Rudimentary fast Hadamard transform
; Two Hadamard transforms share a 128-bit lane.
%macro HADAMARD_4x4 0
; 4->0, 3->4, 2->3, 1->2, 0->1
%define ROTATE SWAP 4, 3, 2, 1, 0
; Stage 1
paddw m0, m1, m2
psubw m1, m2
paddw m2, m3, m4
psubw m3, m4
ROTATE
; Stage 2
paddw m0, m1, m3
psubw m1, m3
paddw m3, m2, m4
psubw m2, m4
SWAP 3, 2, 1
ROTATE
; Transpose
; Since two transforms share a 128-bit lane, unpacking results in a single
; transform's values in each register. This has to be resolved later.
; A and B indicate different 4x4 transforms.
; Start
; m1 B (a3 a2 a1 a0) A (a3 a2 a1 a0)
; m2 B (b3 b2 b1 b0) A (b3 b2 b1 b0)
; m3 B (c3 c2 c1 c0) A (c3 c2 c1 c0)
; m4 B (d3 d2 d1 d0) A (d3 d2 d1 d0)
; Stage 1
; m1 A (b3 a3 b2 a2 b1 a1 b0 a0)
; m2 B (b3 a3 b2 a2 b1 a1 b0 a0)
; m3 A (d3 c3 d2 c2 d1 c1 d0 c0)
; m4 B (d3 c3 d2 c2 d1 c1 d0 c0)
punpcklwd m0, m1, m2
punpckhwd m1, m2
punpcklwd m2, m3, m4
punpckhwd m3, m4
ROTATE
; m1 A (d3 c3 b3 a3 d2 c2 b2 a2)
; m2 A (d1 c1 b1 a1 d0 c0 b0 a0)
; m3 B (d3 c3 b3 a3 d2 c2 b2 a2)
; m4 B (d1 c1 b1 a1 d0 c0 b0 a0)
punpckldq m0, m1, m3
punpckhdq m1, m3
punpckldq m3, m2, m4
punpckhdq m2, m4
SWAP 3, 2, 1
ROTATE
; Make the transforms share 128-bit lanes again.
; m1 B (d0 c0 b0 a0) A (d0 c0 b0 a0)
; m2 B (d1 c1 b1 a1) A (d1 c1 b1 a1)
; m3 B (d2 c2 b2 a2) A (d2 c2 b2 a2)
; m4 B (d3 c3 b3 a3) A (d3 c3 b3 a3)
punpcklqdq m0, m1, m2
punpckhqdq m1, m2
punpcklqdq m2, m3, m4
punpckhqdq m3, m4
ROTATE
; Stage 1
paddw m0, m1, m2
psubw m1, m2
paddw m2, m3, m4
psubw m3, m4
ROTATE
; Use the fact that
; (abs(a+b)+abs(a-b))/2 = max(abs(a),abs(b))
; to merge the final butterfly with the abs and the first stage of
; accumulation.
; Avoid pabsw by computing max(a, b) - min(a + b + 0x7FFF, 0x7FFF) instead,
; where the min comes for free from the saturating add (paddsw).
; Actually calculates (abs(a+b)+abs(a-b))/2-0x7FFF.
; The final sum must be offset to compensate for subtracting 0x7FFF.
paddw m0, m1, m3
pmaxsw m1, m3
; m3 is free
; 0x7FFF
pcmpeqb m3, m3
psrlw m3, 1
paddsw m0, m3
psubw m1, m0
paddw m0, m2, m4
pmaxsw m2, m4
paddsw m0, m3
psubw m2, m0
paddw m1, m2
SWAP 1, 0
%endmacro
; Load diffs of 16 entries for 1 row
%macro LOAD_DIFF_DQ 4
movu xm%1, %2
movu xm%4, %3
vpmovzxbw m%1, xm%1
vpmovzxbw m%4, xm%4
psubw m%1, m%4
%endmacro
INIT_YMM avx2
cglobal satd_16x4, 4, 6, 5, src, src_stride, dst, dst_stride, \
src_stride3, dst_stride3
lea src_stride3q, [src_strideq*3]
lea dst_stride3q, [dst_strideq*3]
LOAD_DIFF_DQ 1, [srcq], [dstq], 0
LOAD_DIFF_DQ 2, [srcq+src_strideq*1], [dstq+dst_strideq*1], 0
LOAD_DIFF_DQ 3, [srcq+src_strideq*2], [dstq+dst_strideq*2], 0
LOAD_DIFF_DQ 4, [srcq+src_stride3q], [dstq+dst_stride3q], 0
HADAMARD_4x4
; Reduce horizontally
vextracti128 xm1, m0, 1
paddw xm0, xm1
pshufd xm1, xm0, q3232
paddw xm0, xm1
pshuflw xm1, xm0, q3232
paddw xm0, xm1
pshuflw xm1, xm0, q1111
; Perform normalization during the final stage of accumulation
; Doing the last addition with pavgw keeps it from overflowing 16 bits
; for this block size
pavgw xm0, xm1
movd eax, xm0
movzx eax, ax
; Add an offset for how the final butterfly stage and the first stage of
; accumulation were done. Since this offset is an even number, it can
; safely be applied after normalization with pavgw.
sub ax, 16
RET
INIT_YMM avx2
cglobal satd_4x16, 4, 8, 7, src, src_stride, dst, dst_stride, \
src4, dst4, src_stride3, dst_stride3
lea src_stride3q, [src_strideq*3]
lea dst_stride3q, [dst_strideq*3]
lea src4q, [srcq+src_strideq*4]
lea dst4q, [dstq+dst_strideq*4]
LOAD_PACK_DIFF_Dx4 0, [srcq], [dstq], \
[srcq+src_strideq*2], [dstq+dst_strideq*2], \
[src4q], [dst4q], \
[src4q+src_strideq*2], [dst4q+dst_strideq*2], \
4, 5, 6
LOAD_PACK_DIFF_Dx4 1, [srcq+src_strideq*1], [dstq+dst_strideq*1], \
[srcq+src_stride3q], [dstq+dst_stride3q], \
[src4q+src_strideq*1], [dst4q+dst_strideq*1], \
[src4q+src_stride3q], [dst4q+dst_stride3q], \
4, 5, 6
lea srcq, [srcq+src_strideq*8]
lea dstq, [dstq+dst_strideq*8]
lea src4q, [src4q+src_strideq*8]
lea dst4q, [dst4q+dst_strideq*8]
LOAD_PACK_DIFF_Dx4 2, [srcq], [dstq], \
[srcq+src_strideq*2], [dstq+dst_strideq*2], \
[src4q], [dst4q], \
[src4q+src_strideq*2], [dst4q+dst_strideq*2], \
4, 5, 6
LOAD_PACK_DIFF_Dx4 3, [srcq+src_strideq*1], [dstq+dst_strideq*1], \
[srcq+src_stride3q], [dstq+dst_stride3q], \
[src4q+src_strideq*1], [dst4q+dst_strideq*1], \
[src4q+src_stride3q], [dst4q+dst_stride3q], \
4, 5, 6
HADAMARD_4x4_PACKED 2
; Reduce horizontally
vextracti128 xm1, m0, 1
paddw xm0, xm1
pshufd xm1, xm0, q3232
paddw xm0, xm1
pshuflw xm1, xm0, q3232
paddw xm0, xm1
pshuflw xm1, xm0, q1111
; Perform normalization during the final stage of accumulation
pavgw xm0, xm1
movd eax, xm0
movzx eax, ax
; Add an offset for how the final butterfly stage and the first stage of
; accumulation were done. Since this offset is an even number, it can
; safely be applied after normalization with pavgw.
sub ax, 16
RET
; On x86-64 we can transpose in-place without spilling registers.
; By cleverly choosing the order in which the butterflies are applied and
; the order of their outputs, we can take the rows in order and output the
; columns in order without any extra operations, using just one temporary
; register.
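; The transpose itself is three rounds of interleaves at word, dword and
; qword granularity; each round doubles the length of the runs that are
; already in column order.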
%macro TRANSPOSE8x8 9
punpckhwd m%9, m%5, m%6
punpcklwd m%5, m%6
; m%6 is free
punpckhwd m%6, m%1, m%2
punpcklwd m%1, m%2
; m%2 is free
punpckhwd m%2, m%7, m%8
punpcklwd m%7, m%8
; m%8 is free
punpckhwd m%8, m%3, m%4
punpcklwd m%3, m%4
; m%4 is free
punpckhdq m%4, m%1, m%3
punpckldq m%1, m%3
; m%3 is free
punpckldq m%3, m%5, m%7
punpckhdq m%5, m%7
; m%7 is free
punpckhdq m%7, m%6, m%8
punpckldq m%6, m%8
; m%8 is free
punpckldq m%8, m%9, m%2
punpckhdq m%9, m%2
; m%2 is free
punpckhqdq m%2, m%1, m%3
punpcklqdq m%1, m%3
; m%3 is free
punpcklqdq m%3, m%4, m%5
punpckhqdq m%4, m%5
; m%5 is free
punpcklqdq m%5, m%6, m%8
punpckhqdq m%6, m%8
; m%8 is free
punpckhqdq m%8, m%7, m%9
punpcklqdq m%7, m%9
%endmacro
; Load diffs of 8 entries for 1 row
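; punpcklbw interleaves the src and dst bytes, and pmaddubsw against the
; alternating +1/-1 pattern (hsub) multiplies unsigned bytes by signed
; bytes and sums adjacent pairs, yielding src - dst as words.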
%macro LOAD_DIFF_Q 4
movq %1, %2
movq %4, %3
punpcklbw %1, %4
pmaddubsw %1, hsub
%endmacro
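; Each HADAMARD_8_STAGE_* macro below performs one butterfly stage of an
; 8-point Hadamard transform across m%1-m%8, with m%9 as scratch; the
; trailing "; N" comments in stages 2 and 3 note which transform output a
; register holds before the SWAPs restore natural order.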
%macro HADAMARD_8_STAGE_1 9
paddw m%9, m%1, m%2
psubw m%1, m%2
paddw m%2, m%3, m%4
psubw m%3, m%4
paddw m%4, m%5, m%6
psubw m%5, m%6
paddw m%6, m%7, m%8
psubw m%7, m%8
; 8->9, 7->8, 6->7, 5->6, 4->5, 3->4, 2->3, 1->2, 9->1
SWAP %8, %7, %6, %5, %4, %3, %2, %1, %9
%endmacro
%macro HADAMARD_8_STAGE_2 9
paddw m%9, m%1, m%3 ; 0
psubw m%1, m%3 ; 2
paddw m%3, m%2, m%4 ; 1
psubw m%2, m%4 ; 3
SWAP %3, %2, %1
paddw m%4, m%5, m%7 ; 4
psubw m%5, m%7 ; 6
paddw m%7, m%6, m%8 ; 5
psubw m%6, m%8 ; 7
SWAP %7, %6, %5
; 8->9, 7->8, 6->7, 5->6, 4->5, 3->4, 2->3, 1->2, 9->1
SWAP %8, %7, %6, %5, %4, %3, %2, %1, %9
%endmacro
%macro HADAMARD_8_STAGE_3 9
paddw m%9, m%1, m%5 ; 0
psubw m%1, m%5 ; 4
paddw m%5, m%2, m%6 ; 1
psubw m%2, m%6 ; 5
paddw m%6, m%3, m%7 ; 2
psubw m%3, m%7 ; 6
paddw m%7, m%4, m%8 ; 3
psubw m%4, m%8 ; 7
SWAP %5, %2, %6, %3, %7, %4, %1
; 8->9, 7->8, 6->7, 5->6, 4->5, 3->4, 2->3, 1->2, 9->1
SWAP %8, %7, %6, %5, %4, %3, %2, %1, %9
%endmacro
; Rudimentary fast Hadamard transform
%macro HADAMARD_8x8 0
HADAMARD_8_STAGE_1 1, 2, 3, 4, 5, 6, 7, 8, 0
HADAMARD_8_STAGE_2 1, 2, 3, 4, 5, 6, 7, 8, 0
HADAMARD_8_STAGE_3 1, 2, 3, 4, 5, 6, 7, 8, 0
TRANSPOSE8x8 1, 2, 3, 4, 5, 6, 7, 8, 0
HADAMARD_8_STAGE_1 1, 2, 3, 4, 5, 6, 7, 8, 0
HADAMARD_8_STAGE_2 1, 2, 3, 4, 5, 6, 7, 8, 0
; Stage 3
; Use the fact that
; (abs(a+b)+abs(a-b))/2 = max(abs(a),abs(b))
; to merge the final butterfly with the abs and the first stage of
; accumulation.
; Avoid pabsw by computing max(a, b) - min(a + b + 0x7FFF, 0x7FFF) instead,
; where the min comes for free from the saturating add (paddsw).
; Actually calculates (abs(a+b)+abs(a-b))/2-0x7FFF.
; The final sum must be offset to compensate for subtracting 0x7FFF.
paddw m0, m1, m5
pmaxsw m1, m5
; m5 is free
; 0x7FFF
pcmpeqb m5, m5
psrlw m5, 1
paddsw m0, m5
psubw m1, m0
paddw m0, m2, m6
pmaxsw m2, m6
paddsw m0, m5
psubw m2, m0
paddw m0, m3, m7
pmaxsw m3, m7
paddsw m0, m5
psubw m3, m0
paddw m0, m4, m8
pmaxsw m4, m8
paddsw m0, m5
psubw m4, m0
paddw m1, m2
paddw m3, m4
paddw m1, m3
SWAP 1, 0
%endmacro
; Only works with 128-bit vectors
%macro SATD_8x8_FN 0
cglobal satd_8x8, 4, 6, 10, src, src_stride, dst, dst_stride, \
src_stride3, dst_stride3
%define hsub m0
mova hsub, [maddubsw_hsub]
; Load rows into m1-m8
lea src_stride3q, [src_strideq*3]
lea dst_stride3q, [dst_strideq*3]
LOAD_DIFF_Q m1, [srcq], [dstq], m2
LOAD_DIFF_Q m2, [srcq+src_strideq*1], [dstq+dst_strideq*1], m3
LOAD_DIFF_Q m3, [srcq+src_strideq*2], [dstq+dst_strideq*2], m4
LOAD_DIFF_Q m4, [srcq+src_stride3q], [dstq+dst_stride3q], m5
lea srcq, [srcq+src_strideq*4]
lea dstq, [dstq+dst_strideq*4]
LOAD_DIFF_Q m5, [srcq], [dstq], m6
LOAD_DIFF_Q m6, [srcq+src_strideq*1], [dstq+dst_strideq*1], m7
LOAD_DIFF_Q m7, [srcq+src_strideq*2], [dstq+dst_strideq*2], m8
LOAD_DIFF_Q m8, [srcq+src_stride3q], [dstq+dst_stride3q], m9
HADAMARD_8x8
; Reduce horizontally and convert to 32 bits
pxor m2, m2
punpcklwd m1, m0, m2
punpckhwd m0, m2
paddd m0, m1
pshufd m1, m0, q3232
paddd m0, m1
pshuflw m1, m0, q3232
paddd m0, m1
movd eax, m0
; Normalize
; Add the rounding offset and an offset for how the final butterfly stage
; and the first stage of accumulation were done.
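; The 32 merged outputs each carry a -0x7FFF bias; the four biases summed
; per 16-bit lane wrap to +4 (4*0x8001 mod 2^16), so the widened 32-bit
; total is the true sum plus 32. Subtracting 32-2 removes that bias and
; folds in the +2 rounding offset for the shift right by 2.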
sub eax, 32-2
shr eax, 2
RET
%endmacro
INIT_XMM ssse3
SATD_8x8_FN
INIT_XMM avx2
SATD_8x8_FN
INIT_YMM avx2
cglobal satd_16x8, 4, 6, 9, src, src_stride, dst, dst_stride, \
src_stride3, dst_stride3
; Load rows into m1-m8
lea src_stride3q, [src_strideq*3]
lea dst_stride3q, [dst_strideq*3]
LOAD_DIFF_DQ 1, [srcq], [dstq], 0
LOAD_DIFF_DQ 2, [srcq+src_strideq*1], [dstq+dst_strideq*1], 0
LOAD_DIFF_DQ 3, [srcq+src_strideq*2], [dstq+dst_strideq*2], 0
LOAD_DIFF_DQ 4, [srcq+src_stride3q], [dstq+dst_stride3q], 0
lea srcq, [srcq+src_strideq*4]
lea dstq, [dstq+dst_strideq*4]
LOAD_DIFF_DQ 5, [srcq], [dstq], 0
LOAD_DIFF_DQ 6, [srcq+src_strideq*1], [dstq+dst_strideq*1], 0
LOAD_DIFF_DQ 7, [srcq+src_strideq*2], [dstq+dst_strideq*2], 0
LOAD_DIFF_DQ 8, [srcq+src_stride3q], [dstq+dst_stride3q], 0
HADAMARD_8x8
; Reduce horizontally and convert to 32 bits
pxor m2, m2
punpcklwd m1, m0, m2
punpckhwd m0, m2
paddd m0, m1
vextracti128 xm1, m0, 1
paddd xm0, xm1
pshufd xm1, xm0, q3232
paddd xm0, xm1
pshuflw xm1, xm0, q3232
paddd xm0, xm1
movd eax, xm0
; Normalize
; Add the rounding offset and an offset for how the final butterfly stage
; and the first stage of accumulation were done.
sub eax, 64-2
shr eax, 2
RET
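; Load diffs of 8 entries for each of 2 rows, one row per 128-bit lane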
%macro LOAD_DIFF_Qx2 7
movq xm%1, %2
movq xm%6, %3
punpcklbw xm%1, xm%6
movq xm%6, %4
movq xm%7, %5
punpcklbw xm%6, xm%7
vinserti128 m%1, xm%6, 1
pmaddubsw m%1, hsub
%endmacro
INIT_YMM avx2
cglobal satd_8x16, 4, 8, 11, src, src_stride, dst, dst_stride, \
src8, dst8, src_stride3, dst_stride3
%define hsub m0
mova hsub, [maddubsw_hsub]
; Load rows into m1-m8
lea src8q, [srcq+src_strideq*8]
lea dst8q, [dstq+dst_strideq*8]
lea src_stride3q, [src_strideq*3]
lea dst_stride3q, [dst_strideq*3]
LOAD_DIFF_Qx2 1, [srcq], [dstq], \
[src8q], [dst8q], \
9, 10
LOAD_DIFF_Qx2 2, [srcq+src_strideq*1], [dstq+dst_strideq*1], \
[src8q+src_strideq*1], [dst8q+dst_strideq*1], \
9, 10
LOAD_DIFF_Qx2 3, [srcq+src_strideq*2], [dstq+dst_strideq*2], \
[src8q+src_strideq*2], [dst8q+dst_strideq*2], \
9, 10
LOAD_DIFF_Qx2 4, [srcq+src_stride3q], [dstq+dst_stride3q], \
[src8q+src_stride3q], [dst8q+dst_stride3q], \
9, 10
lea srcq, [srcq+src_strideq*4]
lea dstq, [dstq+dst_strideq*4]
lea src8q, [src8q+src_strideq*4]
lea dst8q, [dst8q+dst_strideq*4]
LOAD_DIFF_Qx2 5, [srcq], [dstq], \
[src8q], [dst8q], \
9, 10
LOAD_DIFF_Qx2 6, [srcq+src_strideq*1], [dstq+dst_strideq*1], \
[src8q+src_strideq*1], [dst8q+dst_strideq*1], \
9, 10
LOAD_DIFF_Qx2 7, [srcq+src_strideq*2], [dstq+dst_strideq*2], \
[src8q+src_strideq*2], [dst8q+dst_strideq*2], \
9, 10
LOAD_DIFF_Qx2 8, [srcq+src_stride3q], [dstq+dst_stride3q], \
[src8q+src_stride3q], [dst8q+dst_stride3q], \
9, 10
HADAMARD_8x8
; Reduce horizontally and convert to 32 bits
pxor m2, m2
punpcklwd m1, m0, m2
punpckhwd m0, m2
paddd m0, m1
vextracti128 xm1, m0, 1
paddd xm0, xm1
pshufd xm1, xm0, q3232
paddd xm0, xm1
pshuflw xm1, xm0, q3232
paddd xm0, xm1
movd eax, xm0
; Normalize
; Add the rounding offset and an offset for how the final butterfly stage
; and the first stage of accumulation were done.
sub eax, 64-2
shr eax, 2
RET
; Less optimized, boilerplate implementations
INIT_YMM avx2
cglobal satd_8x32, 4, 9, 13, src, src_stride, dst, dst_stride, \
src8, dst8, src_stride3, dst_stride3, cnt
; Ones for converting to 32 bits with pmaddwd (pcmpeqw sets every word to
; -1 and pabsw flips them to +1)
pcmpeqw m11, m11
pabsw m11, m11
; sum
pxor m12, m12
mov cntd, 1
lea src_stride3q, [src_strideq*3]
lea dst_stride3q, [dst_strideq*3]
lea src8q, [srcq+src_strideq*8]
lea dst8q, [dstq+dst_strideq*8]
.loop:
%define hsub m0
mova hsub, [maddubsw_hsub]
; Load rows into m1-m8
LOAD_DIFF_Qx2 1, [srcq], [dstq], \
[src8q], [dst8q], \
9, 10
LOAD_DIFF_Qx2 2, [srcq+src_strideq*1], [dstq+dst_strideq*1], \
[src8q+src_strideq*1], [dst8q+dst_strideq*1], \
9, 10
LOAD_DIFF_Qx2 3, [srcq+src_strideq*2], [dstq+dst_strideq*2], \
[src8q+src_strideq*2], [dst8q+dst_strideq*2], \
9, 10
LOAD_DIFF_Qx2 4, [srcq+src_stride3q], [dstq+dst_stride3q], \
[src8q+src_stride3q], [dst8q+dst_stride3q], \
9, 10
lea srcq, [srcq+src_strideq*4]
lea dstq, [dstq+dst_strideq*4]
lea src8q, [src8q+src_strideq*4]
lea dst8q, [dst8q+dst_strideq*4]
LOAD_DIFF_Qx2 5, [srcq], [dstq], \
[src8q], [dst8q], \
9, 10
LOAD_DIFF_Qx2 6, [srcq+src_strideq*1], [dstq+dst_strideq*1], \
[src8q+src_strideq*1], [dst8q+dst_strideq*1], \
9, 10
LOAD_DIFF_Qx2 7, [srcq+src_strideq*2], [dstq+dst_strideq*2], \
[src8q+src_strideq*2], [dst8q+dst_strideq*2], \
9, 10
LOAD_DIFF_Qx2 8, [srcq+src_stride3q], [dstq+dst_stride3q], \
[src8q+src_stride3q], [dst8q+dst_stride3q], \
9, 10
HADAMARD_8x8
; Reduce horizontally and convert to 32 bits
pmaddwd m0, m11
paddd m12, m0
lea srcq, [srcq+src_stride3q*4]
lea dstq, [dstq+dst_stride3q*4]
lea src8q, [src8q+src_stride3q*4]
lea dst8q, [dst8q+dst_stride3q*4]
dec cntd
jge .loop
vextracti128 xm0, m12, 1
paddd xm0, xm12
pshufd xm1, xm0, q3232
paddd xm0, xm1
pshuflw xm1, xm0, q3232
paddd xm0, xm1
movd eax, xm0
; Normalize
; Add the rounding offset and an offset for how the final butterfly stage
; and the first stage of accumulation were done.
sub eax, 128-2
shr eax, 2
RET
INIT_YMM avx2
cglobal satd_16x8_internal, 0, 0, 0, \
dummy1, src_stride, dummy2, dst_stride, \
src_stride3, dst_stride3, src, dst
%define hadd m9
%define sum m10
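; Expects the caller (see SATD_NXM) to have set up hadd (m9) with all-ones
; words, sum (m10) with zero, the stride3 registers, and the current tile's
; pointers in the registers this prototype names src/dst (the caller's
; call_src/call_dst). Accumulates the un-normalized partial SATD into sum.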
; Load rows into m1-m8
LOAD_DIFF_DQ 1, [srcq], [dstq], 0
LOAD_DIFF_DQ 2, [srcq+src_strideq*1], [dstq+dst_strideq*1], 0
LOAD_DIFF_DQ 3, [srcq+src_strideq*2], [dstq+dst_strideq*2], 0
LOAD_DIFF_DQ 4, [srcq+src_stride3q], [dstq+dst_stride3q], 0
lea srcq, [srcq+src_strideq*4]
lea dstq, [dstq+dst_strideq*4]
LOAD_DIFF_DQ 5, [srcq], [dstq], 0
LOAD_DIFF_DQ 6, [srcq+src_strideq*1], [dstq+dst_strideq*1], 0
LOAD_DIFF_DQ 7, [srcq+src_strideq*2], [dstq+dst_strideq*2], 0
LOAD_DIFF_DQ 8, [srcq+src_stride3q], [dstq+dst_stride3q], 0
HADAMARD_8x8
pmaddwd m0, hadd
paddd sum, m0
ret
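; SATD_NXM covers the remaining block sizes by tiling an %1x%2 block into
; 16x8 sub-blocks, calling satd_16x8_internal on each and accumulating the
; un-normalized partial sums in m10 before a single final reduction.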
%macro SATD_NXM 2
%if %1 > 16
%if %2 > 8
cglobal satd_%1x%2, 4, 10, 11, src, src_stride, dst, dst_stride, \
src_stride3, dst_stride3, call_src, call_dst, \
w, h
%else
cglobal satd_%1x%2, 4, 9, 11, src, src_stride, dst, dst_stride, \
src_stride3, dst_stride3, call_src, call_dst, \
w
%endif
%else ; %2 > 8
cglobal satd_%1x%2, 4, 9, 11, src, src_stride, dst, dst_stride, \
src_stride3, dst_stride3, call_src, call_dst, \
h
%endif
; Ones for converting to 32 bits with pmaddwd
pcmpeqw m9, m9
pabsw m9, m9
; sum
pxor m10, m10
lea src_stride3q, [src_strideq*3]
lea dst_stride3q, [dst_strideq*3]
%if %2 > 8
mov hd, %2/8 - 1
.looph:
%endif
%if %1 > 16
mov wd, %1/16 - 1
.loopv:
%endif
mov call_srcq, srcq
mov call_dstq, dstq
call m(satd_16x8_internal)
%if %1 > 16
add srcq, 16
add dstq, 16
dec wd
jge .loopv
sub srcq, %1
sub dstq, %1
%endif
%if %2 > 8
lea srcq, [srcq+src_strideq*8]
lea dstq, [dstq+dst_strideq*8]
dec hd
jge .looph
%endif
; Reduce horizontally
vextracti128 xm0, m10, 1
paddd xm0, xm10
pshufd xm1, xm0, q3232
paddd xm0, xm1
pshuflw xm1, xm0, q3232
paddd xm0, xm1
movd eax, xm0
; Normalize
; Add the rounding offset and an offset for how the final butterfly stage
; and the first stage of accumulation were done.
sub eax, %1*%2/2 - 2
shr eax, 2
RET
%endmacro
INIT_YMM avx2
SATD_NXM 16, 16
SATD_NXM 32, 32
SATD_NXM 64, 64
SATD_NXM 128, 128
SATD_NXM 16, 32
SATD_NXM 32, 16
SATD_NXM 32, 64
SATD_NXM 64, 32
SATD_NXM 64, 128
SATD_NXM 128, 64
SATD_NXM 32, 8
SATD_NXM 16, 64
SATD_NXM 64, 16
%endif ; ARCH_X86_64