wavpack-sys 0.4.0

############################################################################
##                           **** WAVPACK ****                            ##
##                  Hybrid Lossless Wavefile Compressor                   ##
##              Copyright (c) 1998 - 2015 Conifer Software.               ##
##                          All Rights Reserved.                          ##
##      Distributed under the BSD Software License (see license.txt)      ##
############################################################################

        .intel_syntax noprefix
        .text

        .globl  _unpack_decorr_stereo_pass_cont_x86
        .globl  _unpack_decorr_mono_pass_cont_x86
        .globl  _unpack_cpu_has_feature_x86

        .globl  unpack_decorr_stereo_pass_cont_x86
        .globl  unpack_decorr_mono_pass_cont_x86
        .globl  unpack_cpu_has_feature_x86


# This module contains X86 assembly optimized versions of functions required
# to decode WavPack files. Note that the stereo versions of these functions
# use the MMX registers and instructions of the X86 processor, and so a
# helper function is provided to make a runtime check for that feature.

# This is an assembly optimized version of the following WavPack function:
#
# void unpack_decorr_stereo_pass_cont (struct decorr_pass *dpp,
#                                      int32_t *buffer,
#                                      int32_t sample_count,
#                                      int32_t long_math;
#
# It performs a single pass of stereo decorrelation on the provided buffer.
# Note that this version of the function requires that up to 8 previous
# stereo samples are visible and correct. In other words, it ignores the
# "samples_*" fields in the decorr_pass structure and gets the history data
# directly from the buffer. It does, however, return the appropriate history
# samples to the decorr_pass structure before returning.
#
# The "long_math" argument is used to specify that a 32-bit multiply is
# not enough for the "apply_weight" operation (although in this case it
# would only apply to the -1 and -2 terms because the MMX code does not have
# this limitation) but we ignore the parameter and use the overflow detection
# of the "imul" instruction to switch automatically to the "long_math" loop.
#
# This is written to work on an IA-32 processor and uses the MMX extensions
# to improve the performance by processing both stereo channels together.
# For terms -1 and -2 the MMX extensions are not usable, and so these are
# performed independently without them.
#
# arguments on entry:
#
#   struct decorr_pass *dpp     [ebp+8]
#   int32_t *buffer             [ebp+12]
#   int32_t sample_count        [ebp+16]
#   int32_t long_math           [ebp+20]
#
# registers after entry:
#
#   rdi         bptr
#   rsi         eptr
#
# on stack (used for terms -1 and -2 only):
#
#   int32_t delta             DWORD [esp]
#

_unpack_decorr_stereo_pass_cont_x86:
unpack_decorr_stereo_pass_cont_x86:
        push    ebp
        mov     ebp, esp
        push    ebx
        push    esi
        push    edi

        mov     edx, [ebp+8]                # copy delta from dpp to top of stack
        mov     eax, [edx+4]
        push    eax

        mov     edi, [ebp+12]               # edi = buffer
        mov     eax, [ebp+16]               # get sample_count and divide by 8
        shl     eax, 3
        jz      done                        # exit now if there's nothing to do

        add     eax, edi                    # else add to buffer point to make eptr
        mov     esi, eax

        mov     eax, [ebp+8]                # get term from dpp and vector appropriately
        mov     eax, [eax]
        cmp     eax, 17
        je      term_17_entry
        cmp     eax, 18
        je      term_18_entry
        cmp     eax, -1
        je      term_minus_1_entry
        cmp     eax, -2
        je      term_minus_2_entry
        cmp     eax, -3
        je      term_minus_3_entry

#
# registers during default term processing loop:
#   edi         active buffer pointer
#   esi         end of buffer pointer
#
# MMX:
#   mm0, mm1    scratch
#   mm2         original sample values
#   mm3         correlation samples
#   mm4         zero (for pcmpeqd)
#   mm5         weights
#   mm6         delta
#   mm7         512 (for rounding)
#

default_term_entry:
        imul    ebx, eax, -8                # set ebx to term * -8 for decorrelation index
        mov     eax, 512
        movd    mm7, eax
        punpckldq mm7, mm7                  # mm7 = round (512)
        mov     edx, [ebp+8]                # edx = *dpp
        mov     eax, [edx+4]
        movd    mm6, eax
        punpckldq mm6, mm6                  # mm6 = delta (0-7)
        mov     eax, 0xFFFF                 # mask high weights to zero for PMADDWD
        movd    mm5, eax
        punpckldq mm5, mm5                  # mm5 = weight mask 0x0000FFFF0000FFFF
        pand    mm5, [edx+8]                # mm5 = weight_AB masked to 16 bits
        pxor    mm4, mm4                    # mm4 = zero (for pcmpeqd)
        jmp     default_term_loop

        .balign  64
default_term_loop:
        movq    mm3, [edi+ebx]              # mm3 = sam_AB
        movq    mm1, mm3
        movq    mm0, mm3
        paddd   mm1, mm1
        psrld   mm0, 15
        psrlw   mm1, 1
        pmaddwd mm0, mm5
        pmaddwd mm1, mm5
        movq    mm2, [edi]                  # mm2 = left_right
        pslld   mm0, 5
        paddd   mm1, mm7                    # add 512 for rounding
        psrad   mm1, 10
        paddd   mm0, mm2
        paddd   mm0, mm1                    # add shifted sums
        movq    [edi], mm0                  # store result
        movq    mm0, mm3
        pxor    mm0, mm2
        psrad   mm0, 31                     # mm0 = sign (sam_AB ^ left_right)
        add     edi, 8
        pcmpeqd mm2, mm4                    # mm2 = 1s if left_right was zero
        pcmpeqd mm3, mm4                    # mm3 = 1s if sam_AB was zero
        por     mm2, mm3                    # mm2 = 1s if either was zero
        pandn   mm2, mm6                    # mask delta with zeros check
        pxor    mm5, mm0
        paddw   mm5, mm2                    # and add to weight_AB
        pxor    mm5, mm0
        cmp     edi, esi                    # compare bptr and eptr to see if we're done
        jb      default_term_loop

        pslld   mm5, 16                     # sign-extend 16-bit weights back to dwords
        psrad   mm5, 16
        mov     eax, [ebp+8]                # point to dpp
        movq    [eax+8], mm5                # put weight_AB back
        emms
        mov     edx, [ebp+8]                # access dpp with edx
        mov     ecx, [edx]                  # ecx = dpp->term

default_store_samples:
        dec     ecx
        sub     edi, 8                      # back up one full sample
        mov     eax, [edi+4]
        mov     [edx+ecx*4+48], eax         # store samples_B [ecx]
        mov     eax, [edi]
        mov     [edx+ecx*4+16], eax         # store samples_A [ecx]
        test    ecx, ecx
        jnz     default_store_samples

        jmp     done

#
# registers during processing loop for terms 17 & 18:
#   edi         active buffer pointer
#   esi         end of buffer pointer
#
# MMX:
#   mm0, mm1    scratch
#   mm2         original sample values
#   mm3         calculated correlation samples
#   mm4         last calculated values (so we don't need to reload)
#   mm5         weights
#   mm6         delta
#   mm7         512 (for rounding)
#

term_17_entry:
        mov     eax, 512
        movd    mm7, eax
        punpckldq mm7, mm7                  # mm7 = round (512)
        mov     edx, [ebp+8]                # point to dpp & get delta
        mov     eax, [edx+4]
        movd    mm6, eax
        punpckldq mm6, mm6                  # mm6 = delta (0-7)
        mov     eax, 0xFFFF                 # mask high weights to zero for PMADDWD
        movd    mm5, eax
        punpckldq mm5, mm5                  # mm5 = weight mask 0x0000FFFF0000FFFF
        pand    mm5, [edx+8]                # mm5 = weight_AB masked to 16 bits
        movq    mm4, [edi-8]                # preload previous calculated values
        jmp     term_17_loop

        .balign  64
term_17_loop:
        paddd   mm4, mm4
        psubd   mm4, [edi-16]               # mm3 = sam_AB
        movq    mm3, mm4
        movq    mm1, mm3
        paddd   mm1, mm1
        psrld   mm4, 15
        psrlw   mm1, 1
        pmaddwd mm4, mm5
        pmaddwd mm1, mm5
        movq    mm2, [edi]                  # mm2 = left_right
        pslld   mm4, 5
        paddd   mm1, mm7                    # add 512 for rounding
        psrad   mm1, 10
        paddd   mm4, mm2
        paddd   mm4, mm1                    # add shifted sums
        movq    mm0, mm3
        movq    [edi], mm4                  # store result
        pxor    mm0, mm2
        psrad   mm0, 31                     # mm0 = sign (sam_AB ^ left_right)
        add     edi, 8
        pxor    mm1, mm1                    # mm1 = zero
        pcmpeqd mm2, mm1                    # mm2 = 1s if left_right was zero
        pcmpeqd mm3, mm1                    # mm3 = 1s if sam_AB was zero
        por     mm2, mm3                    # mm2 = 1s if either was zero
        pandn   mm2, mm6                    # mask delta with zeros check
        pxor    mm5, mm0
        paddw   mm5, mm2                    # and add to weight_AB
        pxor    mm5, mm0
        cmp     edi, esi                    # compare bptr and eptr to see if we're done
        jb      term_17_loop

        pslld   mm5, 16                     # sign-extend 16-bit weights back to dwords
        psrad   mm5, 16
        mov     eax, [ebp+8]                # point to dpp
        movq    [eax+8], mm5                # put weight_AB back
        emms
        jmp     term_1718_exit

term_18_entry:
        mov     eax, 512
        movd    mm7, eax
        punpckldq mm7, mm7                  # mm7 = round (512)
        mov     edx, [ebp+8]                # point to dpp & get delta
        mov     eax, [edx+4]
        movd    mm6, eax
        punpckldq mm6, mm6                  # mm6 = delta (0-7)
        mov     eax, 0xFFFF                 # mask high weights to zero for PMADDWD
        movd    mm5, eax
        punpckldq mm5, mm5                  # mm5 = weight mask 0x0000FFFF0000FFFF
        pand    mm5, [edx+8]                # mm5 = weight_AB masked to 16 bits
        movq    mm4, [edi-8]                # preload previous calculated value
        jmp     term_18_loop

        .balign  64
term_18_loop:
        movq    mm3, mm4
        psubd   mm3, [edi-16]
        psrad   mm3, 1
        paddd   mm3, mm4                    # mm3 = sam_AB
        movq    mm1, mm3
        movq    mm4, mm3
        paddd   mm1, mm1
        psrld   mm4, 15
        psrlw   mm1, 1
        pmaddwd mm4, mm5
        pmaddwd mm1, mm5
        movq    mm2, [edi]                  # mm2 = left_right
        pslld   mm4, 5
        paddd   mm1, mm7                    # add 512 for rounding
        psrad   mm1, 10
        paddd   mm4, mm2
        paddd   mm4, mm1                    # add shifted sums
        movq    mm0, mm3
        movq    [edi], mm4                  # store result
        pxor    mm0, mm2
        psrad   mm0, 31                     # mm0 = sign (sam_AB ^ left_right)
        add     edi, 8
        pxor    mm1, mm1                    # mm1 = zero
        pcmpeqd mm2, mm1                    # mm2 = 1s if left_right was zero
        pcmpeqd mm3, mm1                    # mm3 = 1s if sam_AB was zero
        por     mm2, mm3                    # mm2 = 1s if either was zero
        pandn   mm2, mm6                    # mask delta with zeros check
        pxor    mm5, mm0
        paddw   mm5, mm2                    # and add to weight_AB
        pxor    mm5, mm0
        cmp     edi, esi                    # compare bptr and eptr to see if we're done
        jb      term_18_loop

        pslld   mm5, 16                     # sign-extend 16-bit weights back to dwords
        psrad   mm5, 16
        mov     eax, [ebp+8]                # point to dpp
        movq    [eax+8], mm5                # put weight_AB back
        emms

term_1718_exit:
        mov     edx, [edi-4]                # dpp->samples_B [0] = bptr [-1];
        mov     eax, [ebp+8]
        mov     [eax+48], edx
        mov     edx, [edi-8]                # dpp->samples_A [0] = bptr [-2];
        mov     [eax+16], edx
        mov     edx, [edi-12]               # dpp->samples_B [1] = bptr [-3];
        mov     [eax+52], edx
        mov     edx, [edi-16]               # dpp->samples_A [1] = bptr [-4];
        mov     [eax+20], edx
        jmp     done

#
# registers in term -1 & -2 loops:
#
#   eax,ebx,edx scratch
#   ecx         weight_A
#   ebp         weight_B
#   edi         bptr
#   esi         eptr
#

term_minus_1_entry:
        cld                                 # we use stosd here...
        mov     eax, [ebp+8]                # point to dpp
        mov     ecx, [eax+8]                # ecx = weight_A and ebp = weight_B
        mov     ebp, [eax+12]
        mov     eax, [edi-4]
        jmp     term_minus_1_loop

        .balign  64
term_minus_1_loop:
        mov     ebx, eax
        imul    eax, ecx
        mov     edx, [edi]
        jo      OV11
        sar     eax, 10
        adc     eax, edx
        stosd
        test    ebx, ebx
        je      L182
        test    edx, edx
        je      L182
        xor     ebx, edx
        sar     ebx, 31
        xor     ecx, ebx
        add     ecx, [esp]
        mov     edx, 1024
        add     edx, ebx
        cmp     ecx, edx
        jle     L183
        mov     ecx, edx
L183:   xor     ecx, ebx
L182:   mov     ebx, eax
        imul    eax, ebp
        mov     edx, [edi]
        jo      OV12
        sar     eax, 10
        adc     eax, edx
        stosd
        test    ebx, ebx
        je      L189
        test    edx, edx
        je      L189
        xor     ebx, edx
        sar     ebx, 31
        xor     ebp, ebx
        add     ebp, [esp]
        mov     edx, 1024
        add     edx, ebx
        cmp     ebp, edx
        jle     L188
        mov     ebp, edx
L188:   xor     ebp, ebx
L189:   cmp     edi, esi                    # compare bptr and eptr to see if we're done
        jb      term_minus_1_loop
        jmp     term_minus_1_done

OV11:   mov     eax, ebx                    # restore previous sample into eax
        jmp     long_term_minus_1_loop

OV12:   mov     eax, ebx                    # restore previous sample into eax
        jmp     L282

        .balign  64
long_term_minus_1_loop:
        mov     ebx, eax
        imul    ecx
        shl     edx, 22
        shr     eax, 10
        adc     eax, edx
        mov     edx, [edi]
        add     eax, edx
        stosd
        test    ebx, ebx
        je      L282
        test    edx, edx
        je      L282
        xor     ebx, edx
        sar     ebx, 31
        xor     ecx, ebx
        add     ecx, [esp]
        mov     edx, 1024
        add     edx, ebx
        cmp     ecx, edx
        jle     L283
        mov     ecx, edx
L283:   xor     ecx, ebx
L282:   mov     ebx, eax
        imul    ebp
        shl     edx, 22
        shr     eax, 10
        adc     eax, edx
        mov     edx, [edi]
        add     eax, edx
        stosd
        test    ebx, ebx
        je      L289
        test    edx, edx
        je      L289
        xor     ebx, edx
        sar     ebx, 31
        xor     ebp, ebx
        add     ebp, [esp]
        mov     edx, 1024
        add     edx, ebx
        cmp     ebp, edx
        jle     L288
        mov     ebp, edx
L288:   xor     ebp, ebx
L289:   cmp     edi, esi                    # compare bptr and eptr to see if we're done
        jb      long_term_minus_1_loop

term_minus_1_done:
        mov     edx, ebp
        mov     ebp, esp                    # restore ebp (we've pushed 4 DWORDS)
        add     ebp, 16
        mov     eax, [ebp+8]                # point to dpp
        mov     [eax+8], ecx
        mov     [eax+12], edx
        mov     edx, [edi-4]                # dpp->samples_A [0] = bptr [-1]
        mov     [eax+16], edx
        jmp     done


term_minus_2_entry:
        mov     eax, [ebp+8]                # point to dpp
        mov     ecx, [eax+8]                # ecx = weight_A and ebp = weight_B
        mov     ebp, [eax+12]
        mov     eax, [edi-8]
        jmp     term_minus_2_loop

        .balign  64
term_minus_2_loop:
        mov     ebx, eax
        imul    eax, ebp
        mov     edx, [edi+4]
        jo      OV21
        sar     eax, 10
        adc     eax, edx
        mov     [edi+4], eax
        test    ebx, ebx
        je      L194
        test    edx, edx
        je      L194
        xor     ebx, edx
        sar     ebx, 31
        xor     ebp, ebx
        add     ebp, [esp]
        mov     edx, 1024
        add     edx, ebx
        cmp     ebp, edx
        jle     L195
        mov     ebp, edx
L195:   xor     ebp, ebx
L194:   mov     ebx, eax
        imul    eax, ecx
        mov     edx, [edi]
        jo      OV22
        sar     eax, 10
        adc     eax, edx
        mov     [edi], eax
        add     edi, 8
        test    ebx, ebx
        je      L201
        test    edx, edx
        je      L201
        xor     ebx, edx
        sar     ebx, 31
        xor     ecx, ebx
        add     ecx, [esp]
        mov     edx, 1024
        add     edx, ebx
        cmp     ecx, edx
        jle     L200
        mov     ecx, edx
L200:   xor     ecx, ebx
L201:   cmp     edi, esi                    # compare bptr and eptr to see if we're done
        jb      term_minus_2_loop
        jmp     term_minus_2_done

OV21:   mov     eax, ebx                    # restore previous sample into eax
        jmp     long_term_minus_2_loop

OV22:   mov     eax, ebx                    # restore previous sample into eax
        jmp     L294

        .balign  64
long_term_minus_2_loop:
        mov     ebx, eax
        imul    ebp
        shl     edx, 22
        shr     eax, 10
        adc     eax, edx
        mov     edx, [edi+4]
        add     eax, edx
        mov     [edi+4], eax
        test    ebx, ebx
        je      L294
        test    edx, edx
        je      L294
        xor     ebx, edx
        sar     ebx, 31
        xor     ebp, ebx
        add     ebp, [esp]
        mov     edx, 1024
        add     edx, ebx
        cmp     ebp, edx
        jle     L295
        mov     ebp, edx
L295:   xor     ebp, ebx
L294:   mov     ebx, eax
        imul    ecx
        shl     edx, 22
        shr     eax, 10
        adc     eax, edx
        mov     edx, [edi]
        add     eax, edx
        mov     [edi], eax
        add     edi, 8
        test    ebx, ebx
        je      L301
        test    edx, edx
        je      L301
        xor     ebx, edx
        sar     ebx, 31
        xor     ecx, ebx
        add     ecx, [esp]
        mov     edx, 1024
        add     edx, ebx
        cmp     ecx, edx
        jle     L300
        mov     ecx, edx
L300:   xor     ecx, ebx
L301:   cmp     edi, esi                    # compare bptr and eptr to see if we're done
        jb      long_term_minus_2_loop

term_minus_2_done:
        mov     edx, ebp
        lea     ebp, [esp+16]               # restore ebp (we've pushed 4 DWORDS)
        mov     eax, [ebp+8]                # point to dpp
        mov     [eax+8], ecx
        mov     [eax+12], edx
        mov     edx, [edi-8]                # dpp->samples_B [0] = bptr [-2];
        mov     [eax+48], edx
        jmp     done

#
# registers during processing loop for term -3:
#   edi         active buffer pointer
#   esi         end of buffer pointer
#
# MMX:
#   mm0, mm1    scratch
#   mm2         original sample values
#   mm3         calculated correlation samples
#   mm4         last calculated values (so we don't need to reload)
#   mm5         weights
#   mm6         delta
#   mm7         512 (for rounding)
#

term_minus_3_entry:
        mov     eax, 512
        movd    mm7, eax
        punpckldq mm7, mm7                  # mm7 = round (512)
        mov     edx, [ebp+8]                # point to dpp & get delta
        mov     eax, [edx+4]
        movd    mm6, eax
        punpckldq mm6, mm6                  # mm6 = delta (0-7)
        mov     eax, 0xFFFF                 # mask high weights to zero for PMADDWD
        movd    mm5, eax
        punpckldq mm5, mm5                  # mm5 = weight mask 0x0000FFFF0000FFFF
        pand    mm5, [edx+8]                # mm5 = weight_AB masked to 16 bits
        movq    mm4, [edi-8]                # preload previous calculated values
        jmp     term_minus_3_loop

        .balign  64
term_minus_3_loop:
        movq    mm3, mm4                    # mm3 = swap dwords (mm4)
        psrlq   mm3, 32
        punpckldq mm3, mm4                  # mm3 = sam_AB
        movq    mm1, mm3
        movq    mm4, mm3
        pslld   mm1, 1
        psrld   mm4, 15
        psrlw   mm1, 1
        pmaddwd mm4, mm5
        pmaddwd mm1, mm5
        movq    mm2, [edi]                  # mm2 = left_right
        pslld   mm4, 5
        paddd   mm1, mm7                    # add 512 for rounding
        psrad   mm1, 10
        paddd   mm4, mm2
        paddd   mm4, mm1                    # add shifted sums
        movq    [edi], mm4                  # store result
        movq    mm0, mm3
        pxor    mm0, mm2
        psrad   mm0, 31                     # mm0 = sign (sam_AB ^ left_right)
        add     edi, 8
        pxor    mm1, mm1                    # mm1 = zero
        pcmpeqd mm2, mm1                    # mm2 = 1s if left_right was zero
        pcmpeqd mm3, mm1                    # mm3 = 1s if sam_AB was zero
        por     mm2, mm3                    # mm2 = 1s if either was zero
        pandn   mm2, mm6                    # mask delta with zeros check
        pcmpeqd mm1, mm1
        psubd   mm1, mm7
        psubd   mm1, mm7
        psubd   mm1, mm0
        pxor    mm5, mm0
        paddw   mm5, mm1
        paddusw mm5, mm2                    # and add to weight_AB
        psubw   mm5, mm1
        pxor    mm5, mm0
        cmp     edi, esi                    # compare bptr and eptr to see if we're done
        jb      term_minus_3_loop

        pslld   mm5, 16                     # sign-extend 16-bit weights back to dwords
        psrad   mm5, 16
        mov     eax, [ebp+8]                # point to dpp
        movq    [eax+8], mm5                # put weight_AB back
        emms
        mov     edx, [edi-4]                # dpp->samples_A [0] = bptr [-1];
        mov     eax, [ebp+8]
        mov     [eax+16], edx
        mov     edx, [edi-8]                # dpp->samples_B [0] = bptr [-2];
        mov     [eax+48], edx

done:   pop     eax                         # pop delta & saved regs
        pop     edi
        pop     esi
        pop     ebx
        pop     ebp
        ret

#######################################################################################################################
#
# This is the mono version of the above function. It does not use MMX and does not handle negative terms.
#
# void unpack_decorr_mono_pass_cont (struct decorr_pass *dpp,
#                                    int32_t *buffer,
#                                    int32_t sample_count,
#                                    int32_t long_math;
# arguments on entry:
#
#   struct decorr_pass *dpp     [ebp+8]
#   int32_t *buffer             [ebp+12]
#   int32_t sample_count        [ebp+16]
#   int32_t long_math           [ebp+20]
#
# registers after entry:
#
#   rdi         bptr
#   rsi         eptr
#
# on stack:
#
#   int16_t delta             DWORD [esp]
#

_unpack_decorr_mono_pass_cont_x86:
unpack_decorr_mono_pass_cont_x86:
        push    ebp
        mov     ebp, esp
        push    ebx
        push    esi
        push    edi
        cld

        mov     edx, [ebp+8]                # copy delta from dpp to local stack
        mov     eax, [edx+4]
        push    eax

        mov     edi, [ebp+12]               # edi = buffer
        mov     eax, [ebp+16]               # get sample_count and multiply by 4
        shl     eax, 2
        jz      mono_done                   # exit now if there's nothing to do
        lea     esi, [edi+eax]              # else add to buffer point to make eptr

        mov     eax, [ebp+8]                # get term from dpp and vector appropriately
        mov     eax, [eax]
        cmp     eax, 17
        je      mono_17_entry
        cmp     eax, 18
        je      mono_18_entry

#
# registers during default term processing loop:
#   edi         active buffer pointer
#   esi         end of buffer pointer
#   ecx         weight_A
#   ebp         free
#   ebx         term * -4
#   eax,edx     scratch
#

default_mono_entry:
        imul    ebx, eax, -4                # set ebx to term * -4 for decorrelation index
        mov     edx, [ebp+8]                # edx = dpp*
        mov     ecx, [edx+8]                # ecx = weight
        jmp     default_mono_loop

#
# registers during processing loop for terms 17 & 18:
#   edi         active buffer pointer
#   esi         end of buffer pointer
#   ecx         weight_A
#   ebp         previously calculated value
#   ebx         calculated correlation sample
#   eax,edx     scratch
#

mono_17_entry:
        mov     edx, [ebp+8]                # edx = dpp*
        mov     ecx, [edx+8]                # ecx = weight_A
        mov     ebp, [edi-4]
        jmp     mono_17_loop

mono_18_entry:
        mov     edx, [ebp+8]                # edx = dpp*
        mov     ecx, [edx+8]                # ecx = weight_A
        mov     ebp, [edi-4]
        jmp     mono_18_loop

        .balign  64
default_mono_loop:
        mov     eax, [edi+ebx]
        imul    eax, ecx
        mov     edx, [edi]
        jo      long_default_mono_loop
        sar     eax, 10
        adc     eax, edx
        mov     [edi], eax
        mov     eax, [edi+ebx]
        add     edi, 4
        test    edx, edx
        je      L100
        test    eax, eax
        je      L100
        xor     eax, edx
        cdq
        xor     ecx, edx
        add     ecx, [esp]
        xor     ecx, edx
L100:   cmp     edi, esi                    # compare bptr and eptr to see if we're done
        jb      default_mono_loop
        jmp     default_mono_done

        .balign  64
long_default_mono_loop:
        mov     eax, [edi+ebx]
        imul    ecx
        shl     edx, 22
        shr     eax, 10
        adc     eax, edx
        mov     edx, [edi]
        add     eax, edx
        mov     [edi], eax
        mov     eax, [edi+ebx]
        add     edi, 4
        test    edx, edx
        je      L101
        test    eax, eax
        je      L101
        xor     eax, edx
        cdq
        xor     ecx, edx
        add     ecx, [esp]
        xor     ecx, edx
L101:   cmp     edi, esi                    # compare bptr and eptr to see if we're done
        jb      long_default_mono_loop

default_mono_done:
        mov     edx, [ebp+8]                # edx = dpp*
        mov     [edx+8], ecx                # store weight_A back
        mov     ecx, [edx]                  # ecx = dpp->term

default_mono_store_samples:
        dec     ecx
        sub     edi, 4                      # back up one full sample
        mov     eax, [edi]
        mov     [edx+ecx*4+16], eax         # store samples_A [ecx]
        test    ecx, ecx
        jnz     default_mono_store_samples
        jmp     mono_done

        .balign  64
mono_17_loop:
        lea     ebx, [ebp+ebp]
        sub     ebx, [edi-8]
        mov     eax, ecx
        imul    eax, ebx
        mov     edx, [edi]
        jo      long_mono_17_loop
        sar     eax, 10
        adc     eax, edx
        stosd
        test    ebx, ebx
        mov     ebp, eax
        je      L117
        test    edx, edx
        je      L117
        mov     eax, [esp]
        xor     ebx, edx
        sar     ebx, 31
        xor     eax, ebx
        sub     eax, ebx
        add     ecx, eax
L117:   cmp     edi, esi                    # compare bptr and eptr to see if we're done
        jb      mono_17_loop
        jmp     mono_1718_exit

        .balign  64
long_mono_17_loop:
        lea     ebx, [ebp+ebp]
        sub     ebx, [edi-8]
        mov     eax, ecx
        imul    ebx
        shl     edx, 22
        shr     eax, 10
        adc     eax, edx
        mov     edx, [edi]
        add     eax, edx
        stosd
        test    ebx, ebx
        mov     ebp, eax
        je      L217
        test    edx, edx
        je      L217
        mov     eax, [esp]
        xor     ebx, edx
        sar     ebx, 31
        xor     eax, ebx
        sub     eax, ebx
        add     ecx, eax
L217:   cmp     edi, esi                    # compare bptr and eptr to see if we're done
        jb      long_mono_17_loop
        jmp     mono_1718_exit

        .balign  64
mono_18_loop:
        lea     ebx, [ebp+ebp*2]
        sub     ebx, [edi-8]
        sar     ebx, 1
        mov     eax, ecx
        imul    eax, ebx
        mov     edx, [edi]
        jo      long_mono_18_loop
        sar     eax, 10
        adc     eax, edx
        stosd
        test    ebx, ebx
        mov     ebp, eax
        je      L118
        test    edx, edx
        je      L118
        mov     eax, [esp]
        xor     ebx, edx
        sar     ebx, 31
        xor     eax, ebx
        sub     eax, ebx
        add     ecx, eax
L118:   cmp     edi, esi                    # compare bptr and eptr to see if we're done
        jb      mono_18_loop
        jmp     mono_1718_exit

        .balign  64
long_mono_18_loop:
        lea     ebx, [ebp+ebp*2]
        sub     ebx, [edi-8]
        sar     ebx, 1
        mov     eax, ecx
        imul    ebx
        shl     edx, 22
        shr     eax, 10
        adc     eax, edx
        mov     edx, [edi]
        add     eax, edx
        stosd
        test    ebx, ebx
        mov     ebp, eax
        je      L218
        test    edx, edx
        je      L218
        mov     eax, [esp]
        xor     ebx, edx
        sar     ebx, 31
        xor     eax, ebx
        sub     eax, ebx
        add     ecx, eax
L218:   cmp     edi, esi                    # compare bptr and eptr to see if we're done
        jb      long_mono_18_loop

mono_1718_exit:
        lea     ebp, [esp+16]               # restore ebp (we've pushed 4 DWORDS)
        mov     edx, [ebp+8]                # edx = dpp*
        mov     [edx+8], ecx                # store weight_A back
        mov     eax, [edi-4]                # dpp->samples_A [0] = bptr [-1];
        mov     [edx+16], eax
        mov     eax, [edi-8]                # dpp->samples_A [1] = bptr [-2];
        mov     [edx+20], eax

mono_done:
        pop     eax                         # pop delta & saved regs
        pop     edi
        pop     esi
        pop     ebx
        pop     ebp
        ret

# Helper function to determine if specified CPU feature is available (used here for MMX).
# Input parameter is index of feature to be checked (EDX from CPUID(1) only, MMX = 23).
# Return value is the specified bit (0 or 1) or 0 if CPUID is not supported.

_unpack_cpu_has_feature_x86:
unpack_cpu_has_feature_x86:
        pushfd                              # save eflags
        pushfd                              # push another copy
        xor     dword ptr [esp], 0x200000   # toggle ID bit on stack & pop it back into eflags
        popfd
        pushfd                              # store possibly modified eflags
        pop     eax                         # and pop back into eax
        xor     eax, [esp]                  # compare to original pushed eflags
        popfd                               # restore original eflags
        and     eax, 0x200000               # eax = 1 if eflags ID bit was changeable
        jz      oldcpu                      # return zero if CPUID is not available (wow!)

        push    ebx                         # we must save ebx
        mov     eax, 1                      # do cpuid (1) to get features into edx
        cpuid
        mov     eax, edx                    # copy into eax for shift
        mov     cl, [esp+8]                 # get parameter and shift that bit index into LSB
        sar     eax, cl
        and     eax, 1
        pop     ebx                         # restore ebx and return 0 or 1

oldcpu: ret                                 # return value in eax

#ifdef __ELF__
        .section .note.GNU-stack,"",@progbits
#endif