; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"
%if ARCH_X86_64
%macro JMP_TABLE 2-*
%xdefine %1_jmptable %%table
%xdefine %%base mangle(private_prefix %+ _%1_avx2)
%%table:
%rep %0 - 1
dd %%base %+ .%2 - %%table
%rotate 1
%endrep
%endmacro
%macro CDEF_FILTER_JMP_TABLE 1
JMP_TABLE cdef_filter_%1_8bpc, \
d6k0, d6k1, d7k0, d7k1, \
d0k0, d0k1, d1k0, d1k1, d2k0, d2k1, d3k0, d3k1, \
d4k0, d4k1, d5k0, d5k1, d6k0, d6k1, d7k0, d7k1, \
d0k0, d0k1, d1k0, d1k1
%endmacro
SECTION_RODATA 32
pd_47130256: dd 4, 7, 1, 3, 0, 2, 5, 6
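; dword indices used by cdef_dir's vpermd to put the phaddd-packed costs back in direction order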
blend_4x4: dd 0x00, 0x80, 0x00, 0x00, 0x80, 0x80, 0x00, 0x00
dd 0x80, 0x00, 0x00
blend_4x8_0: dd 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
blend_4x8_1: dd 0x00, 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80
dd 0x00, 0x00
blend_4x8_2: dd 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080
dd 0x0000
blend_4x8_3: dd 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080, 0x8080
dd 0x0000, 0x0000
blend_8x8_0: dq 0x00, 0x00, 0x80, 0x80, 0x80, 0x80
blend_8x8_1: dq 0x0000, 0x0000, 0x8080, 0x8080, 0x8080, 0x8080, 0x0000, 0x0000
div_table: dd 840, 420, 280, 210, 168, 140, 120, 105, 420, 210, 140, 105
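; 840 = lcm(1..8); each entry is 840/n, normalizing a squared partial sum by the number of pixels in its line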
shufw_6543210x:db 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15
shufb_lohi: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
pw_128: times 2 dw 128
pw_2048: times 2 dw 2048
tap_table: ; masks for 8 bit shifts
db 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01
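; mask i = 0xff >> i; the filter shifts bytes with psrlw (a 16-bit shift), so ANDing with these
; masks clears the bits that leak in from the neighbouring byte, emulating a per-byte shift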
; weights
db 4, 2, 3, 3, 2, 1
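; pri_taps = {4, 2} or {3, 3} (picked by pri_strength & 1), sec_taps = {2, 1}
; each byte pair below is the k=0/k=1 tap offset for one direction, encoded as dy*16 + dx
; for the 16-word-stride px buffer used by the border path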
db -1 * 16 + 1, -2 * 16 + 2
db 0 * 16 + 1, -1 * 16 + 2
db 0 * 16 + 1, 0 * 16 + 2
db 0 * 16 + 1, 1 * 16 + 2
db 1 * 16 + 1, 2 * 16 + 2
db 1 * 16 + 0, 2 * 16 + 1
db 1 * 16 + 0, 2 * 16 + 0
db 1 * 16 + 0, 2 * 16 - 1
; the last 6 are repeats of the first 6 so we don't need to & 7
db -1 * 16 + 1, -2 * 16 + 2
db 0 * 16 + 1, -1 * 16 + 2
db 0 * 16 + 1, 0 * 16 + 2
db 0 * 16 + 1, 1 * 16 + 2
db 1 * 16 + 1, 2 * 16 + 2
db 1 * 16 + 0, 2 * 16 + 1
CDEF_FILTER_JMP_TABLE 4x4
CDEF_FILTER_JMP_TABLE 4x8
CDEF_FILTER_JMP_TABLE 8x8
SECTION .text
%macro PREP_REGS 2 ; w, h
; off1/2/3[k] [6 total] from [tapq+14+(dir+0/2/6)*2+k]
mov dird, r6m
lea tableq, [cdef_filter_%1x%2_8bpc_jmptable]
lea dirq, [tableq+dirq*2*4]
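; the jump table is padded with directions 6-7 before and 0-1 after the main 8 entries,
; so the dir+2 and dir-2 lookups in ACCUMULATE_TAP_BYTE need no & 7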
%if %1 == 4
%if %2 == 4
DEFINE_ARGS dst, stride, left, top, pri, sec, \
table, dir, dirjmp, dst4, stride3, k
%else
DEFINE_ARGS dst, stride, left, top, pri, sec, \
table, dir, dirjmp, dst4, dst8, stride3, k
lea dst8q, [dstq+strideq*8]
%endif
%else
DEFINE_ARGS dst, stride, h, top1, pri, sec, \
table, dir, dirjmp, top2, dst4, stride3, k
mov hq, -8
lea top1q, [top1q+strideq*0]
lea top2q, [top1q+strideq*1]
%endif
lea dst4q, [dstq+strideq*4]
%if %1 == 4
lea stride3q, [strideq*3]
%endif
%endmacro
%macro LOAD_BLOCK 2-3 0 ; w, h, init_min_max
mov kd, 1
pxor m15, m15 ; sum
%if %2 == 8
pxor m12, m12
%if %1 == 4
movd xm4, [dstq +strideq*0]
movd xm6, [dstq +strideq*1]
movd xm5, [dstq +strideq*2]
movd xm7, [dstq +stride3q ]
vinserti128 m4, [dst4q+strideq*0], 1
vinserti128 m6, [dst4q+strideq*1], 1
vinserti128 m5, [dst4q+strideq*2], 1
vinserti128 m7, [dst4q+stride3q ], 1
punpckldq m4, m6
punpckldq m5, m7
%else
movq xm4, [dstq+strideq*0]
movq xm5, [dstq+strideq*1]
vinserti128 m4, [dstq+strideq*2], 1
vinserti128 m5, [dstq+stride3q ], 1
%endif
punpcklqdq m4, m5
%else
movd xm4, [dstq+strideq*0]
movd xm5, [dstq+strideq*1]
vinserti128 m4, [dstq+strideq*2], 1
vinserti128 m5, [dstq+stride3q ], 1
punpckldq m4, m5
%endif
%if %3 == 1
mova m7, m4 ; max
mova m8, m4 ; min
%endif
%endmacro
%macro ACCUMULATE_TAP_BYTE 7-8 0 ; tap_offset, shift, mask, strength
; mul_tap, w, h, clip
; load p0/p1
movsxd dirjmpq, [dirq+kq*4+%1*2*4]
add dirjmpq, tableq
call dirjmpq
%if %8 == 1
pmaxub m7, m5
pminub m8, m5
pmaxub m7, m6
pminub m8, m6
%endif
; accumulate sum[m15] over p0/p1
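; per pixel: sum += tap * constrain(p - px, strength, shift), where
; constrain(diff, s, h) = apply_sign(min(|diff|, max(0, s - (|diff| >> h))), diff)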
%if %7 == 4
punpcklbw m5, m6
punpcklbw m6, m4, m4
psubusb m9, m5, m6
psubusb m5, m6, m5
por m9, m5 ; abs_diff_p01(p01 - px)
pcmpeqb m5, m9
por m5, %5
psignb m6, %5, m5
psrlw m5, m9, %2 ; emulate 8-bit shift
pand m5, %3
psubusb m5, %4, m5
pminub m5, m9
pmaddubsw m5, m6
paddw m15, m5
%else
psubusb m9, m5, m4
psubusb m5, m4, m5
psubusb m11, m6, m4
psubusb m6, m4, m6
por m9, m5 ; abs_diff_p0(p0 - px)
por m11, m6 ; abs_diff_p1(p1 - px)
pcmpeqb m5, m9
pcmpeqb m6, m11
punpckhbw m10, m9, m11
punpcklbw m9, m11
por m5, %5
por m11, m6, %5
punpckhbw m6, m5, m11
punpcklbw m5, m11
psignb m11, %5, m6
psrlw m6, m10, %2 ; emulate 8-bit shift
pand m6, %3
psubusb m6, %4, m6
pminub m6, m10
pmaddubsw m6, m11
paddw m12, m6
psignb m11, %5, m5
psrlw m5, m9, %2 ; emulate 8-bit shift
pand m5, %3
psubusb m5, %4, m5
pminub m5, m9
pmaddubsw m5, m11
paddw m15, m5
%endif
%endmacro
%macro ADJUST_PIXEL 4-5 0 ; w, h, zero, pw_2048, clip
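; px += (sum - (sum < 0) + 8) >> 4: the pcmpgtw/paddw pair subtracts 1 from negative sums and
; pmulhrsw with 2048 does the rounding shift ((x*2048 + 0x4000) >> 15 == (x + 8) >> 4);
; with clip (%5 == 1) the result is also clamped to the [min, max] of the tap pixels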
%if %2 == 4
%if %5 == 1
punpcklbw m4, %3
%endif
pcmpgtw %3, m15
paddw m15, %3
pmulhrsw m15, %4
%if %5 == 0
packsswb m15, m15
paddb m4, m15
%else
paddw m4, m15
packuswb m4, m4 ; clip px in [0x0,0xff]
pminub m4, m7
pmaxub m4, m8
%endif
vextracti128 xm5, m4, 1
movd [dstq+strideq*0], xm4
movd [dstq+strideq*2], xm5
pextrd [dstq+strideq*1], xm4, 1
pextrd [dstq+stride3q ], xm5, 1
%else
pcmpgtw m6, %3, m12
pcmpgtw m5, %3, m15
paddw m12, m6
paddw m15, m5
%if %5 == 1
punpckhbw m5, m4, %3
punpcklbw m4, %3
%endif
pmulhrsw m12, %4
pmulhrsw m15, %4
%if %5 == 0
packsswb m15, m12
paddb m4, m15
%else
paddw m5, m12
paddw m4, m15
packuswb m4, m5 ; clip px in [0x0,0xff]
pminub m4, m7
pmaxub m4, m8
%endif
vextracti128 xm5, m4, 1
%if %1 == 4
movd [dstq +strideq*0], xm4
movd [dst4q+strideq*0], xm5
pextrd [dstq +strideq*1], xm4, 1
pextrd [dst4q+strideq*1], xm5, 1
pextrd [dstq +strideq*2], xm4, 2
pextrd [dst4q+strideq*2], xm5, 2
pextrd [dstq +stride3q ], xm4, 3
pextrd [dst4q+stride3q ], xm5, 3
%else
movq [dstq+strideq*0], xm4
movq [dstq+strideq*2], xm5
movhps [dstq+strideq*1], xm4
movhps [dstq+stride3q ], xm5
%endif
%endif
%endmacro
%macro BORDER_PREP_REGS 2 ; w, h
; off1/2/3[k] [6 total] from [tapq+14+(dir+0/2/6)*2+k]
mov dird, r6m
lea dirq, [tableq+dirq*2+14]
%if %1*%2*2/mmsize > 1
%if %1 == 4
DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, h, off, k
%else
DEFINE_ARGS dst, stride, dir, stk, pri, sec, h, off, k
%endif
mov hd, %1*%2*2/mmsize
%else
DEFINE_ARGS dst, stride, dir, stk, pri, sec, stride3, off, k
%endif
lea stkq, [px]
pxor m11, m11
%endmacro
%macro BORDER_LOAD_BLOCK 2-3 0 ; w, h, init_min_max
mov kd, 1
%if %1 == 4
movq xm4, [stkq+32*0]
movhps xm4, [stkq+32*1]
movq xm5, [stkq+32*2]
movhps xm5, [stkq+32*3]
vinserti128 m4, xm5, 1
%else
mova xm4, [stkq+32*0] ; px
vinserti128 m4, [stkq+32*1], 1
%endif
pxor m15, m15 ; sum
%if %3 == 1
mova m7, m4 ; max
mova m8, m4 ; min
%endif
%endmacro
%macro ACCUMULATE_TAP_WORD 6-7 0 ; tap_offset, shift, mask, strength
; mul_tap, w, clip
; load p0/p1
movsx offq, byte [dirq+kq+%1] ; off1
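; the offset byte encodes dy*16 + dx; px rows are 32 bytes (16 words) apart, so offq*2
; turns it into a byte offset within the padded pixel buffer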
%if %6 == 4
movq xm5, [stkq+offq*2+32*0] ; p0
movq xm6, [stkq+offq*2+32*2]
movhps xm5, [stkq+offq*2+32*1]
movhps xm6, [stkq+offq*2+32*3]
vinserti128 m5, xm6, 1
%else
movu xm5, [stkq+offq*2+32*0] ; p0
vinserti128 m5, [stkq+offq*2+32*1], 1
%endif
neg offq ; -off1
%if %6 == 4
movq xm6, [stkq+offq*2+32*0] ; p1
movq xm9, [stkq+offq*2+32*2]
movhps xm6, [stkq+offq*2+32*1]
movhps xm9, [stkq+offq*2+32*3]
vinserti128 m6, xm9, 1
%else
movu xm6, [stkq+offq*2+32*0] ; p1
vinserti128 m6, [stkq+offq*2+32*1], 1
%endif
%if %7 == 1
; out-of-bounds values are set to 0x8000, which is both a large unsigned value
; and a negative signed value.
; use signed max and unsigned min to remove them
pmaxsw m7, m5 ; max after p0
pminuw m8, m5 ; min after p0
pmaxsw m7, m6 ; max after p1
pminuw m8, m6 ; min after p1
%endif
; accumulate sum[m15] over p0/p1
; calculate difference before converting
psubw m5, m4 ; diff_p0(p0 - px)
psubw m6, m4 ; diff_p1(p1 - px)
; convert to 8-bits with signed saturation
; saturation of large diffs has no impact on the result
packsswb m5, m6
; group into pairs so we can accumulate using maddubsw
pshufb m5, m12
pabsb m9, m5
psignb m10, %5, m5
psrlw m5, m9, %2 ; emulate 8-bit shift
pand m5, %3
psubusb m5, %4, m5
; use unsigned min since abs diff can equal 0x80
pminub m5, m9
pmaddubsw m5, m10
paddw m15, m5
%endmacro
%macro BORDER_ADJUST_PIXEL 2-3 0 ; w, pw_2048, clip
pcmpgtw m9, m11, m15
paddw m15, m9
pmulhrsw m15, %2
paddw m4, m15
%if %3 == 1
pminsw m4, m7
pmaxsw m4, m8
%endif
packuswb m4, m4
vextracti128 xm5, m4, 1
%if %1 == 4
movd [dstq+strideq*0], xm4
pextrd [dstq+strideq*1], xm4, 1
movd [dstq+strideq*2], xm5
pextrd [dstq+stride3q], xm5, 1
%else
movq [dstq+strideq*0], xm4
movq [dstq+strideq*1], xm5
%endif
%endmacro
%macro CDEF_FILTER 2 ; w, h
INIT_YMM avx2
cglobal cdef_filter_%1x%2_8bpc, 4, 9, 0, dst, stride, left, top, \
pri, sec, dir, damping, edge
%assign stack_offset_entry stack_offset
mov edged, edgem
cmp edged, 0xf
jne .border_block
PUSH r9
PUSH r10
PUSH r11
%if %2 == 4
%assign regs_used 12
%if STACK_ALIGNMENT < 32
PUSH r%+regs_used
%assign regs_used regs_used+1
%endif
ALLOC_STACK 0x60, 16
pmovzxbw xm0, [leftq+1]
vpermq m0, m0, q0110
psrldq m1, m0, 4
vpalignr m2, m0, m0, 12
movu [rsp+0x10], m0
movu [rsp+0x28], m1
movu [rsp+0x40], m2
%elif %1 == 4
PUSH r12
%assign regs_used 13
%if STACK_ALIGNMENT < 32
PUSH r%+regs_used
%assign regs_used regs_used+1
%endif
ALLOC_STACK 8*2+%1*%2*1, 16
pmovzxwd m0, [leftq]
mova [rsp+0x10], m0
%else
PUSH r12
PUSH r13
%assign regs_used 14
%if STACK_ALIGNMENT < 32
PUSH r%+regs_used
%assign regs_used regs_used+1
%endif
ALLOC_STACK 8*2+%1*%2*2+32, 16
lea r11, [strideq*3]
movu xm4, [dstq+strideq*2]
pmovzxwq m0, [leftq+0]
pmovzxwq m1, [leftq+8]
vinserti128 m4, [dstq+r11], 1
pmovzxbd m2, [leftq+1]
pmovzxbd m3, [leftq+9]
mova [rsp+0x10], m0
mova [rsp+0x30], m1
mova [rsp+0x50], m2
mova [rsp+0x70], m3
mova [rsp+0x90], m4
%endif
DEFINE_ARGS dst, stride, left, top, pri, secdmp, zero, pridmp, damping
mov dampingd, r7m
xor zerod, zerod
movifnidn prid, prim
sub dampingd, 31
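; damping is biased by -31 so that adding lzcnt(strength) below yields damping - ulog2(strength);
; cmovs then clamps negative results to 0, i.e. shift = max(0, damping - ulog2(strength))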
movifnidn secdmpd, secdmpm
test prid, prid
jz .sec_only
movd xm0, prid
lzcnt pridmpd, prid
add pridmpd, dampingd
cmovs pridmpd, zerod
mov [rsp+0], pridmpq ; pri_shift
test secdmpd, secdmpd
jz .pri_only
movd xm1, secdmpd
lzcnt secdmpd, secdmpd
add secdmpd, dampingd
cmovs secdmpd, zerod
mov [rsp+8], secdmpq ; sec_shift
DEFINE_ARGS dst, stride, left, top, pri, secdmp, table, pridmp
lea tableq, [tap_table]
vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask
vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask
; pri/sec_taps[k] [4 total]
DEFINE_ARGS dst, stride, left, top, pri, sec, table, dir
vpbroadcastb m0, xm0 ; pri_strength
vpbroadcastb m1, xm1 ; sec_strength
and prid, 1
lea priq, [tableq+priq*2+8] ; pri_taps
lea secq, [tableq+12] ; sec_taps
PREP_REGS %1, %2
%if %1*%2 > mmsize
.v_loop:
%endif
LOAD_BLOCK %1, %2, 1
.k_loop:
vpbroadcastb m2, [priq+kq] ; pri_taps
vpbroadcastb m3, [secq+kq] ; sec_taps
ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2, 1 ; dir + 0
ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir + 2
ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2, 1 ; dir - 2
dec kq
jge .k_loop
vpbroadcastd m10, [pw_2048]
pxor m9, m9
ADJUST_PIXEL %1, %2, m9, m10, 1
%if %1*%2 > mmsize
mov dstq, dst4q
lea top1q, [rsp+0x90]
lea top2q, [rsp+0xA0]
lea dst4q, [dst4q+strideq*4]
add hq, 4
jl .v_loop
%endif
RET
.pri_only:
DEFINE_ARGS dst, stride, left, top, pri, _, table, pridmp
lea tableq, [tap_table]
vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask
; pri/sec_taps[k] [4 total]
DEFINE_ARGS dst, stride, left, top, pri, _, table, dir
vpbroadcastb m0, xm0 ; pri_strength
and prid, 1
lea priq, [tableq+priq*2+8] ; pri_taps
PREP_REGS %1, %2
vpbroadcastd m3, [pw_2048]
pxor m1, m1
%if %1*%2 > mmsize
.pri_v_loop:
%endif
LOAD_BLOCK %1, %2
.pri_k_loop:
vpbroadcastb m2, [priq+kq] ; pri_taps
ACCUMULATE_TAP_BYTE 2, [rsp+0], m13, m0, m2, %1, %2 ; dir + 0
dec kq
jge .pri_k_loop
ADJUST_PIXEL %1, %2, m1, m3
%if %1*%2 > mmsize
mov dstq, dst4q
lea top1q, [rsp+0x90]
lea top2q, [rsp+0xA0]
lea dst4q, [dst4q+strideq*4]
add hq, 4
jl .pri_v_loop
%endif
RET
.sec_only:
DEFINE_ARGS dst, stride, left, top, _, secdmp, zero, _, damping
movd xm1, secdmpd
lzcnt secdmpd, secdmpd
add secdmpd, dampingd
cmovs secdmpd, zerod
mov [rsp+8], secdmpq ; sec_shift
DEFINE_ARGS dst, stride, left, top, _, secdmp, table
lea tableq, [tap_table]
vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask
; pri/sec_taps[k] [4 total]
DEFINE_ARGS dst, stride, left, top, _, sec, table, dir
vpbroadcastb m1, xm1 ; sec_strength
lea secq, [tableq+12] ; sec_taps
PREP_REGS %1, %2
vpbroadcastd m2, [pw_2048]
pxor m0, m0
%if %1*%2 > mmsize
.sec_v_loop:
%endif
LOAD_BLOCK %1, %2
.sec_k_loop:
vpbroadcastb m3, [secq+kq] ; sec_taps
ACCUMULATE_TAP_BYTE 4, [rsp+8], m14, m1, m3, %1, %2 ; dir + 2
ACCUMULATE_TAP_BYTE 0, [rsp+8], m14, m1, m3, %1, %2 ; dir - 2
dec kq
jge .sec_k_loop
ADJUST_PIXEL %1, %2, m0, m2
%if %1*%2 > mmsize
mov dstq, dst4q
lea top1q, [rsp+0x90]
lea top2q, [rsp+0xA0]
lea dst4q, [dst4q+strideq*4]
add hq, 4
jl .sec_v_loop
%endif
RET
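; the .d<dir>k<k> helpers below load the two tap pixels for the given direction and k into
; m5/m6, pulling out-of-block samples from the top rows and the left-edge data spilled to the stack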
.d0k0:
%if %1 == 4
%if %2 == 4
vpbroadcastq m6, [dstq+strideq*1-1]
vpbroadcastq m10, [dstq+strideq*2-1]
movd xm5, [topq+strideq*1+1]
movd xm9, [dstq+strideq*0+1]
psrldq m11, m6, 2
psrldq m12, m10, 2
vinserti128 m6, [dstq+stride3q -1], 1
vinserti128 m10, [dstq+strideq*4-1], 1
vpblendd m5, m11, 0x10
vpblendd m9, m12, 0x10
movu m11, [blend_4x4+16]
punpckldq m6, m10
punpckldq m5, m9
vpblendvb m6, [rsp+gprsize+0x28], m11
%else
movd xm5, [topq +strideq*1+1]
movq xm6, [dstq +strideq*1-1]
movq xm10, [dstq +stride3q -1]
movq xm11, [dst4q+strideq*1-1]
pinsrd xm5, [dstq +strideq*0+1], 1
movhps xm6, [dstq +strideq*2-1]
movhps xm10, [dst4q+strideq*0-1]
movhps xm11, [dst4q+strideq*2-1]
psrldq xm9, xm6, 2
shufps xm5, xm9, q2010 ; -1 +0 +1 +2
shufps xm6, xm10, q2020 ; +1 +2 +3 +4
psrldq xm9, xm11, 2
psrldq xm10, 2
shufps xm10, xm9, q2020 ; +3 +4 +5 +6
movd xm9, [dst4q+stride3q -1]
pinsrd xm9, [dst4q+strideq*4-1], 1
shufps xm11, xm9, q1020 ; +5 +6 +7 +8
pmovzxbw m9, [leftq+3]
vinserti128 m6, xm11, 1
movu m11, [blend_4x8_0+4]
vinserti128 m5, xm10, 1
vpblendvb m6, m9, m11
%endif
%else
lea r13, [blend_8x8_0+16]
movq xm5, [top2q +1]
vbroadcasti128 m10, [dstq+strideq*1-1]
vbroadcasti128 m11, [dstq+strideq*2-1]
movhps xm5, [dstq+strideq*0+1]
vinserti128 m6, m10, [dstq+stride3q -1], 1
vinserti128 m9, m11, [dstq+strideq*4-1], 1
psrldq m10, 2
psrldq m11, 2
punpcklqdq m6, m9
movu m9, [r13+hq*2*1+16*1]
punpcklqdq m10, m11
vpblendd m5, m10, 0xF0
vpblendvb m6, [rsp+gprsize+80+hq*8+64+8*1], m9
%endif
ret
.d1k0:
.d2k0:
.d3k0:
%if %1 == 4
%if %2 == 4
movq xm6, [dstq+strideq*0-1]
movq xm9, [dstq+strideq*1-1]
vinserti128 m6, [dstq+strideq*2-1], 1
vinserti128 m9, [dstq+stride3q -1], 1
movu m11, [rsp+gprsize+0x10]
pcmpeqd m12, m12
psrldq m5, m6, 2
psrldq m10, m9, 2
psrld m12, 24
punpckldq m6, m9
punpckldq m5, m10
vpblendvb m6, m11, m12
%else
movq xm6, [dstq +strideq*0-1]
movq xm9, [dstq +strideq*2-1]
movhps xm6, [dstq +strideq*1-1]
movhps xm9, [dstq +stride3q -1]
movq xm10, [dst4q+strideq*0-1]
movhps xm10, [dst4q+strideq*1-1]
psrldq xm5, xm6, 2
psrldq xm11, xm9, 2
shufps xm5, xm11, q2020
movq xm11, [dst4q+strideq*2-1]
movhps xm11, [dst4q+stride3q -1]
shufps xm6, xm9, q2020
shufps xm9, xm10, xm11, q2020
vinserti128 m6, xm9, 1
pmovzxbw m9, [leftq+1]
psrldq xm10, 2
psrldq xm11, 2
shufps xm10, xm11, q2020
vpbroadcastd m11, [blend_4x8_0+4]
vinserti128 m5, xm10, 1
vpblendvb m6, m9, m11
%endif
%else
movu xm5, [dstq+strideq*0-1]
movu xm9, [dstq+strideq*1-1]
vinserti128 m5, [dstq+strideq*2-1], 1
vinserti128 m9, [dstq+stride3q -1], 1
movu m10, [blend_8x8_0+16]
punpcklqdq m6, m5, m9
vpblendvb m6, [rsp+gprsize+80+hq*8+64], m10
psrldq m5, 2
psrldq m9, 2
punpcklqdq m5, m9
%endif
ret
.d4k0:
%if %1 == 4
%if %2 == 4
vpbroadcastq m10, [dstq+strideq*1-1]
vpbroadcastq m11, [dstq+strideq*2-1]
movd xm6, [topq+strideq*1-1]
movd xm9, [dstq+strideq*0-1]
psrldq m5, m10, 2
psrldq m12, m11, 2
vpblendd m6, m10, 0x10
vpblendd m9, m11, 0x10
movu m10, [blend_4x4]
vinserti128 m5, [dstq+stride3q +1], 1
vinserti128 m12, [dstq+strideq*4+1], 1
punpckldq m6, m9
punpckldq m5, m12
vpblendvb m6, [rsp+gprsize+0x40], m10
%else
movd xm6, [topq +strideq*1-1]
movq xm9, [dstq +strideq*1-1]
movq xm10, [dstq +stride3q -1]
movq xm11, [dst4q+strideq*1-1]
pinsrd xm6, [dstq +strideq*0-1], 1
movhps xm9, [dstq +strideq*2-1]
movhps xm10, [dst4q+strideq*0-1]
movhps xm11, [dst4q+strideq*2-1]
psrldq xm5, xm9, 2
shufps xm6, xm9, q2010
psrldq xm9, xm10, 2
shufps xm5, xm9, q2020
shufps xm10, xm11, q2020
movd xm9, [dst4q+stride3q +1]
vinserti128 m6, xm10, 1
pinsrd xm9, [dst4q+strideq*4+1], 1
psrldq xm11, 2
pmovzxbw m10, [leftq-1]
shufps xm11, xm9, q1020
movu m9, [blend_4x8_0]
vinserti128 m5, xm11, 1
vpblendvb m6, m10, m9
%endif
%else
lea r13, [blend_8x8_0+8]
movq xm6, [top2q -1]
vbroadcasti128 m5, [dstq+strideq*1-1]
vbroadcasti128 m9, [dstq+strideq*2-1]
movhps xm6, [dstq+strideq*0-1]
movu m11, [r13+hq*2*1+16*1]
punpcklqdq m10, m5, m9
vinserti128 m5, [dstq+stride3q -1], 1
vinserti128 m9, [dstq+strideq*4-1], 1
vpblendd m6, m10, 0xF0
vpblendvb m6, [rsp+gprsize+80+hq*8+64-8*1], m11
psrldq m5, 2
psrldq m9, 2
punpcklqdq m5, m9
%endif
ret
.d5k0:
.d6k0:
.d7k0:
%if %1 == 4
%if %2 == 4
movd xm6, [topq+strideq*1 ]
vpbroadcastd m5, [dstq+strideq*1 ]
vpbroadcastd m9, [dstq+strideq*2 ]
vpblendd xm6, [dstq+strideq*0-4], 0x2
vpblendd m5, m9, 0x22
vpblendd m6, m5, 0x30
vinserti128 m5, [dstq+stride3q ], 1
vpblendd m5, [dstq+strideq*4-20], 0x20
%else
movd xm6, [topq +strideq*1]
movd xm5, [dstq +strideq*1]
movd xm9, [dstq +stride3q ]
movd xm10, [dst4q+strideq*1]
movd xm11, [dst4q+stride3q ]
pinsrd xm6, [dstq +strideq*0], 1
pinsrd xm5, [dstq +strideq*2], 1
pinsrd xm9, [dst4q+strideq*0], 1
pinsrd xm10, [dst4q+strideq*2], 1
pinsrd xm11, [dst4q+strideq*4], 1
punpcklqdq xm6, xm5
punpcklqdq xm5, xm9
punpcklqdq xm9, xm10
punpcklqdq xm10, xm11
vinserti128 m6, xm9, 1
vinserti128 m5, xm10, 1
%endif
%else
movq xm6, [top2q ]
movq xm5, [dstq+strideq*1]
movq xm9, [dstq+stride3q ]
movhps xm6, [dstq+strideq*0]
movhps xm5, [dstq+strideq*2]
movhps xm9, [dstq+strideq*4]
vinserti128 m6, xm5, 1
vinserti128 m5, xm9, 1
%endif
ret
.d0k1:
%if %1 == 4
%if %2 == 4
movd xm6, [dstq +strideq*2-2]
movd xm9, [dstq +stride3q -2]
movd xm5, [topq +strideq*0+2]
movd xm10, [topq +strideq*1+2]
pinsrw xm6, [leftq+4], 0
pinsrw xm9, [leftq+6], 0
vinserti128 m5, [dstq +strideq*0+2], 1
vinserti128 m10, [dstq +strideq*1+2], 1
vinserti128 m6, [dst4q+strideq*0-2], 1
vinserti128 m9, [dst4q+strideq*1-2], 1
punpckldq m5, m10
punpckldq m6, m9
%else
movq xm6, [dstq +strideq*2-2]
movd xm10, [dst4q+strideq*2-2]
movd xm5, [topq +strideq*0+2]
movq xm9, [dst4q+strideq*0-2]
movhps xm6, [dstq +stride3q -2]
pinsrw xm10, [dst4q+stride3q ], 3
pinsrd xm5, [topq +strideq*1+2], 1
movhps xm9, [dst4q+strideq*1-2]
pinsrd xm10, [dst8q+strideq*0-2], 2
pinsrd xm5, [dstq +strideq*0+2], 2
pinsrd xm10, [dst8q+strideq*1-2], 3
pinsrd xm5, [dstq +strideq*1+2], 3
shufps xm11, xm6, xm9, q3131
shufps xm6, xm9, q2020
movu m9, [blend_4x8_3+8]
vinserti128 m6, xm10, 1
vinserti128 m5, xm11, 1
vpblendvb m6, [rsp+gprsize+16+8], m9
%endif
%else
lea r13, [blend_8x8_1+16]
movq xm6, [dstq +strideq*2-2]
movq xm9, [dstq +stride3q -2]
movq xm5, [top1q +2]
movq xm10, [top2q +2]
movu m11, [r13+hq*2*2+16*2]
vinserti128 m6, [dst4q+strideq*0-2], 1
vinserti128 m9, [dst4q+strideq*1-2], 1
vinserti128 m5, [dstq +strideq*0+2], 1
vinserti128 m10, [dstq +strideq*1+2], 1
punpcklqdq m6, m9
punpcklqdq m5, m10
vpblendvb m6, [rsp+gprsize+16+hq*8+64+8*2], m11
%endif
ret
.d1k1:
%if %1 == 4
%if %2 == 4
vpbroadcastq m6, [dstq+strideq*1-2]
vpbroadcastq m9, [dstq+strideq*2-2]
movd xm5, [topq+strideq*1+2]
movd xm10, [dstq+strideq*0+2]
psrldq m11, m6, 4
psrldq m12, m9, 4
vpblendd m5, m11, 0x10
movq xm11, [leftq+2]
vinserti128 m6, [dstq+stride3q -2], 1
punpckldq xm11, xm11
vpblendd m10, m12, 0x10
pcmpeqd m12, m12
pmovzxwd m11, xm11
psrld m12, 16
punpckldq m6, m9
vpbroadcastd m9, [dstq+strideq*4-2]
vpblendvb m6, m11, m12
punpckldq m5, m10
vpblendd m6, m9, 0x20
%else
movd xm5, [topq +strideq*1+2]
movq xm6, [dstq +strideq*1-2]
movq xm9, [dstq +stride3q -2]
movq xm10, [dst4q+strideq*1-2]
movd xm11, [dst4q+stride3q -2]
pinsrd xm5, [dstq +strideq*0+2], 1
movhps xm6, [dstq +strideq*2-2]
movhps xm9, [dst4q+strideq*0-2]
movhps xm10, [dst4q+strideq*2-2]
pinsrd xm11, [dst4q+strideq*4-2], 1
shufps xm5, xm6, q3110
shufps xm6, xm9, q2020
shufps xm9, xm10, q3131
shufps xm10, xm11, q1020
movu m11, [blend_4x8_2+4]
vinserti128 m6, xm10, 1
vinserti128 m5, xm9, 1
vpblendvb m6, [rsp+gprsize+16+4], m11
%endif
%else
lea r13, [blend_8x8_1+16]
movq xm5, [top2q +2]
vbroadcasti128 m6, [dstq+strideq*1-2]
vbroadcasti128 m9, [dstq+strideq*2-2]
movhps xm5, [dstq+strideq*0+2]
shufps m10, m6, m9, q2121
vinserti128 m6, [dstq+stride3q -2], 1
vinserti128 m9, [dstq+strideq*4-2], 1
movu m11, [r13+hq*2*1+16*1]
vpblendd m5, m10, 0xF0
punpcklqdq m6, m9
vpblendvb m6, [rsp+gprsize+16+hq*8+64+8*1], m11
%endif
ret
.d2k1:
%if %1 == 4
%if %2 == 4
movq xm11, [leftq]
movq xm6, [dstq+strideq*0-2]
movq xm9, [dstq+strideq*1-2]
vinserti128 m6, [dstq+strideq*2-2], 1
vinserti128 m9, [dstq+stride3q -2], 1
punpckldq xm11, xm11
psrldq m5, m6, 4
psrldq m10, m9, 4
pmovzxwd m11, xm11
punpckldq m6, m9
punpckldq m5, m10
pblendw m6, m11, 0x05
%else
movq xm5, [dstq +strideq*0-2]
movq xm9, [dstq +strideq*2-2]
movq xm10, [dst4q+strideq*0-2]
movq xm11, [dst4q+strideq*2-2]
movhps xm5, [dstq +strideq*1-2]
movhps xm9, [dstq +stride3q -2]
movhps xm10, [dst4q+strideq*1-2]
movhps xm11, [dst4q+stride3q -2]
shufps xm6, xm5, xm9, q2020
shufps xm5, xm9, q3131
shufps xm9, xm10, xm11, q2020
shufps xm10, xm11, q3131
pmovzxwd m11, [leftq]
vinserti128 m6, xm9, 1
vinserti128 m5, xm10, 1
pblendw m6, m11, 0x55
%endif
%else
mova m11, [rsp+gprsize+16+hq*8+64]
movu xm5, [dstq+strideq*0-2]
movu xm9, [dstq+strideq*1-2]
vinserti128 m5, [dstq+strideq*2-2], 1
vinserti128 m9, [dstq+stride3q -2], 1
shufps m6, m5, m9, q1010
shufps m5, m9, q2121
pblendw m6, m11, 0x11
%endif
ret
.d3k1:
%if %1 == 4
%if %2 == 4
vpbroadcastq m11, [dstq+strideq*1-2]
vpbroadcastq m12, [dstq+strideq*2-2]
movd xm6, [topq+strideq*1-2]
movd xm9, [dstq+strideq*0-2]
pblendw m11, [leftq-16+2], 0x01
pblendw m12, [leftq-16+4], 0x01
pinsrw xm9, [leftq- 0+0], 0
psrldq m5, m11, 4
psrldq m10, m12, 4
vinserti128 m5, [dstq+stride3q +2], 1
vinserti128 m10, [dstq+strideq*4+2], 1
vpblendd m6, m11, 0x10
vpblendd m9, m12, 0x10
punpckldq m6, m9
punpckldq m5, m10
%else
movd xm6, [topq +strideq*1-2]
movq xm5, [dstq +strideq*1-2]
movq xm9, [dstq +stride3q -2]
movq xm10, [dst4q+strideq*1-2]
movd xm11, [dst4q+stride3q +2]
pinsrw xm6, [dstq +strideq*0 ], 3
movhps xm5, [dstq +strideq*2-2]
movhps xm9, [dst4q+strideq*0-2]
movhps xm10, [dst4q+strideq*2-2]
pinsrd xm11, [dst4q+strideq*4+2], 1
shufps xm6, xm5, q2010
shufps xm5, xm9, q3131
shufps xm9, xm10, q2020
shufps xm10, xm11, q1031
movu m11, [blend_4x8_2]
vinserti128 m6, xm9, 1
vinserti128 m5, xm10, 1
vpblendvb m6, [rsp+gprsize+16-4], m11
%endif
%else
lea r13, [blend_8x8_1+8]
movq xm6, [top2q -2]
vbroadcasti128 m5, [dstq+strideq*1-2]
vbroadcasti128 m10, [dstq+strideq*2-2]
movhps xm6, [dstq+strideq*0-2]
punpcklqdq m9, m5, m10
vinserti128 m5, [dstq+stride3q -2], 1
vinserti128 m10, [dstq+strideq*4-2], 1
movu m11, [r13+hq*2*1+16*1]
vpblendd m6, m9, 0xF0
shufps m5, m10, q2121
vpblendvb m6, [rsp+gprsize+16+hq*8+64-8*1], m11
%endif
ret
.d4k1:
%if %1 == 4
%if %2 == 4
vinserti128 m6, [dstq +strideq*0-2], 1
vinserti128 m9, [dstq +strideq*1-2], 1
movd xm5, [dstq +strideq*2+2]
movd xm10, [dstq +stride3q +2]
pblendw m6, [leftq-16+0], 0x01
pblendw m9, [leftq-16+2], 0x01
vinserti128 m5, [dst4q+strideq*0+2], 1
vinserti128 m10, [dst4q+strideq*1+2], 1
vpblendd m6, [topq +strideq*0-2], 0x01
vpblendd m9, [topq +strideq*1-2], 0x01
punpckldq m5, m10
punpckldq m6, m9
%else
movd xm6, [topq +strideq*0-2]
movq xm5, [dstq +strideq*2-2]
movq xm9, [dst4q+strideq*0-2]
movd xm10, [dst4q+strideq*2+2]
pinsrd xm6, [topq +strideq*1-2], 1
movhps xm5, [dstq +stride3q -2]
movhps xm9, [dst4q+strideq*1-2]
pinsrd xm10, [dst4q+stride3q +2], 1
pinsrd xm6, [dstq +strideq*0-2], 2
pinsrd xm10, [dst8q+strideq*0+2], 2
pinsrd xm6, [dstq +strideq*1-2], 3
pinsrd xm10, [dst8q+strideq*1+2], 3
shufps xm11, xm5, xm9, q2020
shufps xm5, xm9, q3131
movu m9, [blend_4x8_3]
vinserti128 m6, xm11, 1
vinserti128 m5, xm10, 1
vpblendvb m6, [rsp+gprsize+16-8], m9
%endif
%else
lea r13, [blend_8x8_1]
movu m11, [r13+hq*2*2+16*2]
movq xm6, [top1q -2]
movq xm9, [top2q -2]
movq xm5, [dstq +strideq*2+2]
movq xm10, [dstq +stride3q +2]
vinserti128 m6, [dstq +strideq*0-2], 1
vinserti128 m9, [dstq +strideq*1-2], 1
vinserti128 m5, [dst4q+strideq*0+2], 1
vinserti128 m10, [dst4q+strideq*1+2], 1
punpcklqdq m6, m9
vpblendvb m6, [rsp+gprsize+16+hq*8+64-8*2], m11
punpcklqdq m5, m10
%endif
ret
.d5k1:
%if %1 == 4
%if %2 == 4
movd xm6, [topq +strideq*0-1]
movd xm9, [topq +strideq*1-1]
movd xm5, [dstq +strideq*2+1]
movd xm10, [dstq +stride3q +1]
pcmpeqd m12, m12
pmovzxbw m11, [leftq-8+1]
psrld m12, 24
vinserti128 m6, [dstq +strideq*0-1], 1
vinserti128 m9, [dstq +strideq*1-1], 1
vinserti128 m5, [dst4q+strideq*0+1], 1
vinserti128 m10, [dst4q+strideq*1+1], 1
punpckldq m6, m9
pxor m9, m9
vpblendd m12, m9, 0x0F
punpckldq m5, m10
vpblendvb m6, m11, m12
%else
movd xm6, [topq +strideq*0-1]
movq xm5, [dstq +strideq*2-1]
movq xm9, [dst4q+strideq*0-1]
movd xm10, [dst4q+strideq*2+1]
pinsrd xm6, [topq +strideq*1-1], 1
movhps xm5, [dstq +stride3q -1]
movhps xm9, [dst4q+strideq*1-1]
pinsrd xm10, [dst4q+stride3q +1], 1
pinsrd xm6, [dstq +strideq*0-1], 2
pinsrd xm10, [dst8q+strideq*0+1], 2
pinsrd xm6, [dstq +strideq*1-1], 3
pinsrd xm10, [dst8q+strideq*1+1], 3
shufps xm11, xm5, xm9, q2020
vinserti128 m6, xm11, 1
pmovzxbw m11, [leftq-3]
psrldq xm5, 2
psrldq xm9, 2
shufps xm5, xm9, q2020
movu m9, [blend_4x8_1]
vinserti128 m5, xm10, 1
vpblendvb m6, m11, m9
%endif
%else
lea r13, [blend_8x8_0]
movu m11, [r13+hq*2*2+16*2]
movq xm6, [top1q -1]
movq xm9, [top2q -1]
movq xm5, [dstq +strideq*2+1]
movq xm10, [dstq +stride3q +1]
vinserti128 m6, [dstq +strideq*0-1], 1
vinserti128 m9, [dstq +strideq*1-1], 1
vinserti128 m5, [dst4q+strideq*0+1], 1
vinserti128 m10, [dst4q+strideq*1+1], 1
punpcklqdq m6, m9
punpcklqdq m5, m10
vpblendvb m6, [rsp+gprsize+80+hq*8+64-8*2], m11
%endif
ret
.d6k1:
%if %1 == 4
%if %2 == 4
movd xm6, [topq +strideq*0]
movd xm9, [topq +strideq*1]
movd xm5, [dstq +strideq*2]
movd xm10, [dstq +stride3q ]
vinserti128 m6, [dstq +strideq*0], 1
vinserti128 m9, [dstq +strideq*1], 1
vinserti128 m5, [dst4q+strideq*0], 1
vinserti128 m10, [dst4q+strideq*1], 1
punpckldq m6, m9
punpckldq m5, m10
%else
movd xm5, [dstq +strideq*2]
movd xm6, [topq +strideq*0]
movd xm9, [dst4q+strideq*2]
pinsrd xm5, [dstq +stride3q ], 1
pinsrd xm6, [topq +strideq*1], 1
pinsrd xm9, [dst4q+stride3q ], 1
pinsrd xm5, [dst4q+strideq*0], 2
pinsrd xm6, [dstq +strideq*0], 2
pinsrd xm9, [dst8q+strideq*0], 2
pinsrd xm5, [dst4q+strideq*1], 3
pinsrd xm6, [dstq +strideq*1], 3
pinsrd xm9, [dst8q+strideq*1], 3
vinserti128 m6, xm5, 1
vinserti128 m5, xm9, 1
%endif
%else
movq xm5, [dstq +strideq*2]
movq xm9, [dst4q+strideq*0]
movq xm6, [top1q ]
movq xm10, [dstq +strideq*0]
movhps xm5, [dstq +stride3q ]
movhps xm9, [dst4q+strideq*1]
movhps xm6, [top2q ]
movhps xm10, [dstq +strideq*1]
vinserti128 m5, xm9, 1
vinserti128 m6, xm10, 1
%endif
ret
.d7k1:
%if %1 == 4
%if %2 == 4
movd xm5, [dstq +strideq*2-1]
movd xm9, [dstq +stride3q -1]
movd xm6, [topq +strideq*0+1]
movd xm10, [topq +strideq*1+1]
pinsrb xm5, [leftq+ 5], 0
pinsrb xm9, [leftq+ 7], 0
vinserti128 m6, [dstq +strideq*0+1], 1
vinserti128 m10, [dstq +strideq*1+1], 1
vinserti128 m5, [dst4q+strideq*0-1], 1
vinserti128 m9, [dst4q+strideq*1-1], 1
punpckldq m6, m10
punpckldq m5, m9
%else
movd xm6, [topq +strideq*0+1]
movq xm9, [dstq +strideq*2-1]
movq xm10, [dst4q+strideq*0-1]
movd xm11, [dst4q+strideq*2-1]
pinsrd xm6, [topq +strideq*1+1], 1
movhps xm9, [dstq +stride3q -1]
movhps xm10, [dst4q+strideq*1-1]
pinsrd xm11, [dst4q+stride3q -1], 1
pinsrd xm6, [dstq +strideq*0+1], 2
pinsrd xm11, [dst8q+strideq*0-1], 2
pinsrd xm6, [dstq +strideq*1+1], 3
pinsrd xm11, [dst8q+strideq*1-1], 3
shufps xm5, xm9, xm10, q2020
vinserti128 m5, xm11, 1
pmovzxbw m11, [leftq+5]
psrldq xm9, 2
psrldq xm10, 2
shufps xm9, xm10, q2020
movu m10, [blend_4x8_1+8]
vinserti128 m6, xm9, 1
vpblendvb m5, m11, m10
%endif
%else
lea r13, [blend_8x8_0+16]
movq xm5, [dstq +strideq*2-1]
movq xm9, [dst4q+strideq*0-1]
movq xm6, [top1q +1]
movq xm10, [dstq +strideq*0+1]
movhps xm5, [dstq +stride3q -1]
movhps xm9, [dst4q+strideq*1-1]
movhps xm6, [top2q +1]
movhps xm10, [dstq +strideq*1+1]
movu m11, [r13+hq*2*2+16*2]
vinserti128 m5, xm9, 1
vinserti128 m6, xm10, 1
vpblendvb m5, [rsp+gprsize+80+hq*8+64+8*2], m11
%endif
ret
.border_block:
DEFINE_ARGS dst, stride, left, top, pri, sec, stride3, dst4, edge
%define rstk rsp
%assign stack_offset stack_offset_entry
%if %1 == 4 && %2 == 8
PUSH r9
%assign regs_used 10
%else
%assign regs_used 9
%endif
%if STACK_ALIGNMENT < 32
PUSH r%+regs_used
%assign regs_used regs_used+1
%endif
ALLOC_STACK 2*16+(%2+4)*32, 16
%define px rsp+2*16+2*32
pcmpeqw m14, m14
psllw m14, 15 ; 0x8000
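; px is a (h+4)-row buffer of 16-bit pixels with a 32-byte row stride and 2 rows of padding
; above and below; unavailable edge samples are filled with 0x8000, which is simultaneously a
; large unsigned and a negative signed value, so ACCUMULATE_TAP_WORD's pmaxsw/pminuw discard it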
; prepare pixel buffers - body/right
%if %1 == 4
INIT_XMM avx2
%endif
%if %2 == 8
lea dst4q, [dstq+strideq*4]
%endif
lea stride3q, [strideq*3]
test edgeb, 2 ; have_right
jz .no_right
pmovzxbw m1, [dstq+strideq*0]
pmovzxbw m2, [dstq+strideq*1]
pmovzxbw m3, [dstq+strideq*2]
pmovzxbw m4, [dstq+stride3q]
mova [px+0*32], m1
mova [px+1*32], m2
mova [px+2*32], m3
mova [px+3*32], m4
%if %2 == 8
pmovzxbw m1, [dst4q+strideq*0]
pmovzxbw m2, [dst4q+strideq*1]
pmovzxbw m3, [dst4q+strideq*2]
pmovzxbw m4, [dst4q+stride3q]
mova [px+4*32], m1
mova [px+5*32], m2
mova [px+6*32], m3
mova [px+7*32], m4
%endif
jmp .body_done
.no_right:
%if %1 == 4
movd xm1, [dstq+strideq*0]
movd xm2, [dstq+strideq*1]
movd xm3, [dstq+strideq*2]
movd xm4, [dstq+stride3q]
pmovzxbw xm1, xm1
pmovzxbw xm2, xm2
pmovzxbw xm3, xm3
pmovzxbw xm4, xm4
movq [px+0*32], xm1
movq [px+1*32], xm2
movq [px+2*32], xm3
movq [px+3*32], xm4
%else
pmovzxbw xm1, [dstq+strideq*0]
pmovzxbw xm2, [dstq+strideq*1]
pmovzxbw xm3, [dstq+strideq*2]
pmovzxbw xm4, [dstq+stride3q]
mova [px+0*32], xm1
mova [px+1*32], xm2
mova [px+2*32], xm3
mova [px+3*32], xm4
%endif
movd [px+0*32+%1*2], xm14
movd [px+1*32+%1*2], xm14
movd [px+2*32+%1*2], xm14
movd [px+3*32+%1*2], xm14
%if %2 == 8
%if %1 == 4
movd xm1, [dst4q+strideq*0]
movd xm2, [dst4q+strideq*1]
movd xm3, [dst4q+strideq*2]
movd xm4, [dst4q+stride3q]
pmovzxbw xm1, xm1
pmovzxbw xm2, xm2
pmovzxbw xm3, xm3
pmovzxbw xm4, xm4
movq [px+4*32], xm1
movq [px+5*32], xm2
movq [px+6*32], xm3
movq [px+7*32], xm4
%else
pmovzxbw xm1, [dst4q+strideq*0]
pmovzxbw xm2, [dst4q+strideq*1]
pmovzxbw xm3, [dst4q+strideq*2]
pmovzxbw xm4, [dst4q+stride3q]
mova [px+4*32], xm1
mova [px+5*32], xm2
mova [px+6*32], xm3
mova [px+7*32], xm4
%endif
movd [px+4*32+%1*2], xm14
movd [px+5*32+%1*2], xm14
movd [px+6*32+%1*2], xm14
movd [px+7*32+%1*2], xm14
%endif
.body_done:
; top
test edgeb, 4 ; have_top
jz .no_top
test edgeb, 1 ; have_left
jz .top_no_left
test edgeb, 2 ; have_right
jz .top_no_right
pmovzxbw m1, [topq+strideq*0-(%1/2)]
pmovzxbw m2, [topq+strideq*1-(%1/2)]
movu [px-2*32-%1], m1
movu [px-1*32-%1], m2
jmp .top_done
.top_no_right:
pmovzxbw m1, [topq+strideq*0-%1]
pmovzxbw m2, [topq+strideq*1-%1]
movu [px-2*32-%1*2], m1
movu [px-1*32-%1*2], m2
movd [px-2*32+%1*2], xm14
movd [px-1*32+%1*2], xm14
jmp .top_done
.top_no_left:
test edgeb, 2 ; have_right
jz .top_no_left_right
pmovzxbw m1, [topq+strideq*0]
pmovzxbw m2, [topq+strideq*1]
mova [px-2*32+0], m1
mova [px-1*32+0], m2
movd [px-2*32-4], xm14
movd [px-1*32-4], xm14
jmp .top_done
.top_no_left_right:
%if %1 == 4
movd xm1, [topq+strideq*0]
pinsrd xm1, [topq+strideq*1], 1
pmovzxbw xm1, xm1
movq [px-2*32+0], xm1
movhps [px-1*32+0], xm1
%else
pmovzxbw xm1, [topq+strideq*0]
pmovzxbw xm2, [topq+strideq*1]
mova [px-2*32+0], xm1
mova [px-1*32+0], xm2
%endif
movd [px-2*32-4], xm14
movd [px-1*32-4], xm14
movd [px-2*32+%1*2], xm14
movd [px-1*32+%1*2], xm14
jmp .top_done
.no_top:
movu [px-2*32-%1], m14
movu [px-1*32-%1], m14
.top_done:
; left
test edgeb, 1 ; have_left
jz .no_left
pmovzxbw xm1, [leftq+ 0]
%if %2 == 8
pmovzxbw xm2, [leftq+ 8]
%endif
movd [px+0*32-4], xm1
pextrd [px+1*32-4], xm1, 1
pextrd [px+2*32-4], xm1, 2
pextrd [px+3*32-4], xm1, 3
%if %2 == 8
movd [px+4*32-4], xm2
pextrd [px+5*32-4], xm2, 1
pextrd [px+6*32-4], xm2, 2
pextrd [px+7*32-4], xm2, 3
%endif
jmp .left_done
.no_left:
movd [px+0*32-4], xm14
movd [px+1*32-4], xm14
movd [px+2*32-4], xm14
movd [px+3*32-4], xm14
%if %2 == 8
movd [px+4*32-4], xm14
movd [px+5*32-4], xm14
movd [px+6*32-4], xm14
movd [px+7*32-4], xm14
%endif
.left_done:
; bottom
DEFINE_ARGS dst, stride, dst8, dummy1, pri, sec, stride3, dummy3, edge
test edgeb, 8 ; have_bottom
jz .no_bottom
lea dst8q, [dstq+%2*strideq]
test edgeb, 1 ; have_left
jz .bottom_no_left
test edgeb, 2 ; have_right
jz .bottom_no_right
pmovzxbw m1, [dst8q-(%1/2)]
pmovzxbw m2, [dst8q+strideq-(%1/2)]
movu [px+(%2+0)*32-%1], m1
movu [px+(%2+1)*32-%1], m2
jmp .bottom_done
.bottom_no_right:
pmovzxbw m1, [dst8q-%1]
pmovzxbw m2, [dst8q+strideq-%1]
movu [px+(%2+0)*32-%1*2], m1
movu [px+(%2+1)*32-%1*2], m2
%if %1 == 8
movd [px+(%2-1)*32+%1*2], xm14 ; overwritten by previous movu
%endif
movd [px+(%2+0)*32+%1*2], xm14
movd [px+(%2+1)*32+%1*2], xm14
jmp .bottom_done
.bottom_no_left:
test edgeb, 2 ; have_right
jz .bottom_no_left_right
pmovzxbw m1, [dst8q]
pmovzxbw m2, [dst8q+strideq]
mova [px+(%2+0)*32+0], m1
mova [px+(%2+1)*32+0], m2
movd [px+(%2+0)*32-4], xm14
movd [px+(%2+1)*32-4], xm14
jmp .bottom_done
.bottom_no_left_right:
%if %1 == 4
movd xm1, [dst8q]
pinsrd xm1, [dst8q+strideq], 1
pmovzxbw xm1, xm1
movq [px+(%2+0)*32+0], xm1
movhps [px+(%2+1)*32+0], xm1
%else
pmovzxbw xm1, [dst8q]
pmovzxbw xm2, [dst8q+strideq]
mova [px+(%2+0)*32+0], xm1
mova [px+(%2+1)*32+0], xm2
%endif
movd [px+(%2+0)*32-4], xm14
movd [px+(%2+1)*32-4], xm14
movd [px+(%2+0)*32+%1*2], xm14
movd [px+(%2+1)*32+%1*2], xm14
jmp .bottom_done
.no_bottom:
movu [px+(%2+0)*32-%1], m14
movu [px+(%2+1)*32-%1], m14
.bottom_done:
; actual filter
INIT_YMM avx2
DEFINE_ARGS dst, stride, pridmp, damping, pri, secdmp, stride3, zero
%undef edged
; register to shuffle values into after packing
vbroadcasti128 m12, [shufb_lohi]
mov dampingd, r7m
xor zerod, zerod
movifnidn prid, prim
sub dampingd, 31
movifnidn secdmpd, secdmpm
test prid, prid
jz .border_sec_only
movd xm0, prid
lzcnt pridmpd, prid
add pridmpd, dampingd
cmovs pridmpd, zerod
mov [rsp+0], pridmpq ; pri_shift
test secdmpd, secdmpd
jz .border_pri_only
movd xm1, secdmpd
lzcnt secdmpd, secdmpd
add secdmpd, dampingd
cmovs secdmpd, zerod
mov [rsp+8], secdmpq ; sec_shift
DEFINE_ARGS dst, stride, pridmp, table, pri, secdmp, stride3
lea tableq, [tap_table]
vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask
vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask
; pri/sec_taps[k] [4 total]
DEFINE_ARGS dst, stride, dir, table, pri, sec, stride3
vpbroadcastb m0, xm0 ; pri_strength
vpbroadcastb m1, xm1 ; sec_strength
and prid, 1
lea priq, [tableq+priq*2+8] ; pri_taps
lea secq, [tableq+12] ; sec_taps
BORDER_PREP_REGS %1, %2
%if %1*%2*2/mmsize > 1
.border_v_loop:
%endif
BORDER_LOAD_BLOCK %1, %2, 1
.border_k_loop:
vpbroadcastb m2, [priq+kq] ; pri_taps
vpbroadcastb m3, [secq+kq] ; sec_taps
ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1, 1
ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1, 1
ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1, 1
dec kq
jge .border_k_loop
vpbroadcastd m10, [pw_2048]
BORDER_ADJUST_PIXEL %1, m10, 1
%if %1*%2*2/mmsize > 1
%define vloop_lines (mmsize/(%1*2))
lea dstq, [dstq+strideq*vloop_lines]
add stkq, 32*vloop_lines
dec hd
jg .border_v_loop
%endif
RET
.border_pri_only:
DEFINE_ARGS dst, stride, pridmp, table, pri, _, stride3
lea tableq, [tap_table]
vpbroadcastb m13, [tableq+pridmpq] ; pri_shift_mask
DEFINE_ARGS dst, stride, dir, table, pri, _, stride3
vpbroadcastb m0, xm0 ; pri_strength
and prid, 1
lea priq, [tableq+priq*2+8] ; pri_taps
BORDER_PREP_REGS %1, %2
vpbroadcastd m1, [pw_2048]
%if %1*%2*2/mmsize > 1
.border_pri_v_loop:
%endif
BORDER_LOAD_BLOCK %1, %2
.border_pri_k_loop:
vpbroadcastb m2, [priq+kq] ; pri_taps
ACCUMULATE_TAP_WORD 0*2, [rsp+0], m13, m0, m2, %1
dec kq
jge .border_pri_k_loop
BORDER_ADJUST_PIXEL %1, m1
%if %1*%2*2/mmsize > 1
%define vloop_lines (mmsize/(%1*2))
lea dstq, [dstq+strideq*vloop_lines]
add stkq, 32*vloop_lines
dec hd
jg .border_pri_v_loop
%endif
RET
.border_sec_only:
DEFINE_ARGS dst, stride, _, damping, _, secdmp, stride3, zero
movd xm1, secdmpd
lzcnt secdmpd, secdmpd
add secdmpd, dampingd
cmovs secdmpd, zerod
mov [rsp+8], secdmpq ; sec_shift
DEFINE_ARGS dst, stride, _, table, _, secdmp, stride3
lea tableq, [tap_table]
vpbroadcastb m14, [tableq+secdmpq] ; sec_shift_mask
DEFINE_ARGS dst, stride, dir, table, _, sec, stride3
vpbroadcastb m1, xm1 ; sec_strength
lea secq, [tableq+12] ; sec_taps
BORDER_PREP_REGS %1, %2
vpbroadcastd m0, [pw_2048]
%if %1*%2*2/mmsize > 1
.border_sec_v_loop:
%endif
BORDER_LOAD_BLOCK %1, %2
.border_sec_k_loop:
vpbroadcastb m3, [secq+kq] ; sec_taps
ACCUMULATE_TAP_WORD 2*2, [rsp+8], m14, m1, m3, %1
ACCUMULATE_TAP_WORD 6*2, [rsp+8], m14, m1, m3, %1
dec kq
jge .border_sec_k_loop
BORDER_ADJUST_PIXEL %1, m0
%if %1*%2*2/mmsize > 1
%define vloop_lines (mmsize/(%1*2))
lea dstq, [dstq+strideq*vloop_lines]
add stkq, 32*vloop_lines
dec hd
jg .border_sec_v_loop
%endif
RET
%endmacro
CDEF_FILTER 8, 8
CDEF_FILTER 4, 8
CDEF_FILTER 4, 4
INIT_YMM avx2
cglobal cdef_dir_8bpc, 3, 4, 6, src, stride, var, stride3
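; computes the 8 directional partial sums of the 8x8 block, squares and normalizes them into
; costs, returns the direction with the largest cost in eax and writes
; (best_cost - cost[best_dir ^ 4]) >> 10 to *var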
lea stride3q, [strideq*3]
movq xm0, [srcq+strideq*0]
movq xm1, [srcq+strideq*1]
movq xm2, [srcq+strideq*2]
movq xm3, [srcq+stride3q ]
lea srcq, [srcq+strideq*4]
vpbroadcastq m4, [srcq+stride3q ]
vpbroadcastq m5, [srcq+strideq*2]
vpblendd m0, m4, 0xf0
vpblendd m1, m5, 0xf0
vpbroadcastq m4, [srcq+strideq*1]
vpbroadcastq m5, [srcq+strideq*0]
vpblendd m2, m4, 0xf0
vpblendd m3, m5, 0xf0
pxor m4, m4
punpcklbw m0, m4
punpcklbw m1, m4
punpcklbw m2, m4
punpcklbw m3, m4
cglobal_label .main
vpbroadcastd m4, [pw_128]
PROLOGUE 3, 4, 15
psubw m0, m4
psubw m1, m4
psubw m2, m4
psubw m3, m4
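; center the pixels around zero (the reference C code uses px - 128) before computing
; the directional partial sums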
; shuffle registers to generate partial_sum_diag[0-1] together
vperm2i128 m7, m0, m0, 0x01
vperm2i128 m6, m1, m1, 0x01
vperm2i128 m5, m2, m2, 0x01
vperm2i128 m4, m3, m3, 0x01
; start with partial_sum_hv[0-1]
paddw m8, m0, m1
paddw m9, m2, m3
phaddw m10, m0, m1
phaddw m11, m2, m3
paddw m8, m9
phaddw m10, m11
vextracti128 xm9, m8, 1
vextracti128 xm11, m10, 1
paddw xm8, xm9 ; partial_sum_hv[1]
phaddw xm10, xm11 ; partial_sum_hv[0]
vinserti128 m8, xm10, 1
vpbroadcastd m9, [div_table+44]
pmaddwd m8, m8
pmulld m8, m9 ; cost6[2a-d] | cost2[a-d]
; create aggregates [lower half]:
; m9 = m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234+
; m4:xxxx0123+m5:xxxxx012+m6:xxxxxx01+m7:xxxxxxx0
; m10= m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx+
; m4:4567xxxx+m5:34567xxx+m6:234567xx+m7:1234567x
; and [upper half]:
; m9 = m0:xxxxxxx0+m1:xxxxxx01+m2:xxxxx012+m3:xxxx0123+
; m4:xxx01234+m5:xx012345+m6:x0123456+m7:01234567
; m10= m0:1234567x+m1:234567xx+m2:34567xxx+m3:4567xxxx+
; m4:567xxxxx+m5:67xxxxxx+m6:7xxxxxxx
; and then shuffle m11 [shufw_6543210x], unpcklwd, pmaddwd, pmulld, paddd
pslldq m9, m1, 2
psrldq m10, m1, 14
pslldq m11, m2, 4
psrldq m12, m2, 12
pslldq m13, m3, 6
psrldq m14, m3, 10
paddw m9, m11
paddw m10, m12
paddw m9, m13
paddw m10, m14
pslldq m11, m4, 8
psrldq m12, m4, 8
pslldq m13, m5, 10
psrldq m14, m5, 6
paddw m9, m11
paddw m10, m12
paddw m9, m13
paddw m10, m14
pslldq m11, m6, 12
psrldq m12, m6, 4
pslldq m13, m7, 14
psrldq m14, m7, 2
paddw m9, m11
paddw m10, m12
paddw m9, m13
paddw m10, m14 ; partial_sum_diag[0/1][8-14,zero]
vbroadcasti128 m14, [shufw_6543210x]
vbroadcasti128 m13, [div_table+16]
vbroadcasti128 m12, [div_table+0]
paddw m9, m0 ; partial_sum_diag[0/1][0-7]
pshufb m10, m14
punpckhwd m11, m9, m10
punpcklwd m9, m10
pmaddwd m11, m11
pmaddwd m9, m9
pmulld m11, m13
pmulld m9, m12
paddd m9, m11 ; cost0[a-d] | cost4[a-d]
; merge horizontally and vertically for partial_sum_alt[0-3]
paddw m10, m0, m1
paddw m11, m2, m3
paddw m12, m4, m5
paddw m13, m6, m7
phaddw m0, m4
phaddw m1, m5
phaddw m2, m6
phaddw m3, m7
; create aggregates [lower half]:
; m4 = m10:01234567+m11:x0123456+m12:xx012345+m13:xxx01234
; m11= m11:7xxxxxxx+m12:67xxxxxx+m13:567xxxxx
; and [upper half]:
; m4 = m10:xxx01234+m11:xx012345+m12:x0123456+m13:01234567
; m11= m10:567xxxxx+m11:67xxxxxx+m12:7xxxxxxx
; and then pshuflw m11 3012, unpcklwd, pmaddwd, pmulld, paddd
pslldq m4, m11, 2
psrldq m11, 14
pslldq m5, m12, 4
psrldq m12, 12
pslldq m6, m13, 6
psrldq m13, 10
paddw m4, m10
paddw m11, m12
vpbroadcastd m12, [div_table+44]
paddw m5, m6
paddw m11, m13 ; partial_sum_alt[3/2] right
vbroadcasti128 m13, [div_table+32]
paddw m4, m5 ; partial_sum_alt[3/2] left
pshuflw m5, m11, q3012
punpckhwd m6, m11, m4
punpcklwd m4, m5
pmaddwd m6, m6
pmaddwd m4, m4
pmulld m6, m12
pmulld m4, m13
paddd m4, m6 ; cost7[a-d] | cost5[a-d]
; create aggregates [lower half]:
; m5 = m0:01234567+m1:x0123456+m2:xx012345+m3:xxx01234
; m1 = m1:7xxxxxxx+m2:67xxxxxx+m3:567xxxxx
; and [upper half]:
; m5 = m0:xxx01234+m1:xx012345+m2:x0123456+m3:01234567
; m1 = m0:567xxxxx+m1:67xxxxxx+m2:7xxxxxxx
; and then pshuflw m1 3012, unpcklwd, pmaddwd, pmulld, paddd
pslldq m5, m1, 2
psrldq m1, 14
pslldq m6, m2, 4
psrldq m2, 12
pslldq m7, m3, 6
psrldq m3, 10
paddw m5, m0
paddw m1, m2
paddw m6, m7
paddw m1, m3 ; partial_sum_alt[0/1] right
paddw m5, m6 ; partial_sum_alt[0/1] left
pshuflw m0, m1, q3012
punpckhwd m1, m5
punpcklwd m5, m0
pmaddwd m1, m1
pmaddwd m5, m5
pmulld m1, m12
pmulld m5, m13
paddd m5, m1 ; cost1[a-d] | cost3[a-d]
mova xm0, [pd_47130256+ 16]
mova m1, [pd_47130256]
phaddd m9, m8
phaddd m5, m4
phaddd m9, m5
vpermd m0, m9 ; cost[0-3]
vpermd m1, m9 ; cost[4-7] | cost[0-3]
; now find the best cost
pmaxsd xm2, xm0, xm1
pshufd xm3, xm2, q1032
pmaxsd xm2, xm3
pshufd xm3, xm2, q2301
pmaxsd xm2, xm3 ; best cost
; find the idx using minpos
; make everything other than the best cost negative via subtraction
; find the min of unsigned 16-bit ints to sort out the negative values
psubd xm4, xm1, xm2
psubd xm3, xm0, xm2
packssdw xm3, xm4
phminposuw xm3, xm3
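; phminposuw leaves the minimum word in bits 0-15 and its lane index in bits 16-18;
; the best cost is the only zero left, so its lane index is the chosen direction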
; convert idx to 32-bits
psrld xm3, 16
movd eax, xm3
; get idx^4 complement
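; m1 holds the costs rotated by 4, so indexing it with idx yields cost[best_dir ^ 4];
; var = (best_cost - cost[best_dir ^ 4]) >> 10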
vpermd m3, m1
psubd xm2, xm3
psrld xm2, 10
movd [varq], xm2
RET
%endif ; ARCH_X86_64