; Copyright (c) 2018, The rav1e contributors. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.

%include "ext/x86/x86inc.asm"
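; x86inc.asm supplies the cross-platform cglobal prologue, the INIT_XMM /
; mN register aliases, and the qNNNN shuffle-immediate constants used below.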

SECTION .text

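; Packed 16-bit absolute difference of four register pairs, computed in
; place: %1..%4 = |%1..%4 - %5..%8| per word lane. psubw wraps, but for
; pixel values of at most 15 bits the signed difference is exact, so
; pabsw (the SSSE3 instruction these functions are gated on) recovers
; the true absolute difference.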
%macro W_ABS_DIFF 8
    psubw               %1, %5
    psubw               %2, %6
    psubw               %3, %7
    psubw               %4, %8
    pabsw               %1, %1
    pabsw               %2, %2
    pabsw               %3, %3
    pabsw               %4, %4
%endmacro

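; SAD of a single 4x4 block of 16-bit (high bit depth) pixels. Strides
; are in bytes, as the unscaled [srcq+src_strideq] addressing implies.
; Roughly equivalent to the following C signature (a sketch; the exported
; symbol name and the Rust-side declaration are assumptions, not taken
; from this file):
;   uint32_t sad_4x4_hbd(const uint16_t *src, ptrdiff_t src_stride,
;                        const uint16_t *dst, ptrdiff_t dst_stride);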
INIT_XMM ssse3
cglobal sad_4x4_hbd, 4, 6, 8, src, src_stride, dst, dst_stride, \
                              src_stride3, dst_stride3
    lea       src_stride3q, [src_strideq*3]
    lea       dst_stride3q, [dst_strideq*3]
    movq                m0, [srcq]
    movq                m1, [srcq+src_strideq*1]
    movq                m2, [srcq+src_strideq*2]
    movq                m3, [srcq+src_stride3q]
    movq                m4, [dstq]
    movq                m5, [dstq+dst_strideq*1]
    movq                m6, [dstq+dst_strideq*2]
    movq                m7, [dstq+dst_stride3q]
    W_ABS_DIFF m0, m1, m2, m3, m4, m5, m6, m7
; Don't convert to 32-bit integers: the sum of 4*4 abs diffs of 12-bit
; values fits in 16 bits (16 * 4095 = 65520 <= 65535).
; Accumulate onto m0
    %define            sum  m0
    paddw              sum, m1
    paddw               m2, m3
    paddw              sum, m2
; Horizontal reduction of the four 16-bit partial sums in the low qword
    pshuflw             m1, sum, q2323 ; fold words 2,3 onto words 0,1
    paddw              sum, m1
    pshuflw             m1, sum, q1111 ; broadcast word 1
    paddw              sum, m1         ; word 0 = sum of all four words
    movd               eax, sum
; Zero-extend the 16-bit result: the upper half of eax is dirty with
; bits from the other word lanes
    movzx              eax, ax
    RET

%if ARCH_X86_64

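; SAD of a single 16x16 block of 16-bit pixels. Each loop iteration
; handles two rows; a row is 16 pixels * 2 bytes = 32 bytes, hence the
; paired loads at +0 and +16. x86-64 only: the loop needs nine XMM
; registers, and m8 (xmm8) does not exist in 32-bit mode.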
INIT_XMM ssse3
cglobal sad_16x16_hbd, 4, 5, 9, src, src_stride, dst, dst_stride, \
                                cnt
    mov               cntd, 8
    %define            sum  m0
    pxor               sum, sum
.loop:
    movu                m1, [srcq]
    movu                m2, [srcq+16]
    movu                m3, [srcq+src_strideq]
    movu                m4, [srcq+src_strideq+16]
    lea               srcq, [srcq+src_strideq*2]
    movu                m5, [dstq]
    movu                m6, [dstq+16]
    movu                m7, [dstq+dst_strideq]
    movu                m8, [dstq+dst_strideq+16]
    lea               dstq, [dstq+dst_strideq*2]
    W_ABS_DIFF m1, m2, m3, m4, m5, m6, m7, m8
    paddw               m1, m2
    paddw               m3, m4
    paddw               m1, m3
; Convert to 32 bits before accumulating: a 16-bit lane would have to
; hold up to 32 abs diffs of 12-bit values (32 * 4095 = 131040), which
; overflows. Each lane of m1 is at most 4 * 4095 here, so widening now
; is safe.
    pxor                m2, m2
    punpcklwd           m3, m1, m2
    punpckhwd           m1, m2
    paddd              sum, m3
    paddd              sum, m1
    dec               cntd
    jg .loop
; Horizontal reduction of the four 32-bit partial sums
    movhlps             m1, sum        ; fold the high qword onto the low
    paddd              sum, m1
    pshufd              m1, sum, q1111 ; broadcast dword 1
    paddd              sum, m1         ; dword 0 = sum of all four dwords
    movd               eax, sum
    RET

%endif