; wavpack-sys 0.4.0

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;                           **** WAVPACK ****                            ;;
;;                  Hybrid Lossless Wavefile Compressor                   ;;
;;              Copyright (c) 1998 - 2015 Conifer Software.               ;;
;;                          All Rights Reserved.                          ;;
;;      Distributed under the BSD Software License (see license.txt)      ;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

        ; Assembler setup (MASM syntax): enable the P6 and MMX instruction sets
        ; and select the 32-bit flat memory model used by all code below.
        .686
        .mmx
        .model  flat
        ; Page-aligned code segment; the loops below request "align 64".
asmcode segment page 'CODE'
        ; Entry points exported from this file (names carry a leading underscore).
        public  _unpack_decorr_stereo_pass_cont_x86
        public  _unpack_decorr_mono_pass_cont_x86
        public  _unpack_cpu_has_feature_x86

; This is an assembly optimized version of the following WavPack function:
;
; void unpack_decorr_stereo_pass_cont (struct decorr_pass *dpp,
;                                      int32_t *buffer,
;                                      int32_t sample_count,
;                                      int32_t long_math);
;
; It performs a single pass of stereo decorrelation on the provided buffer.
; Note that this version of the function requires that up to 8 previous
; stereo samples are visible and correct. In other words, it ignores the
; "samples_*" fields in the decorr_pass structure and gets the history data
; directly from the buffer. It does, however, return the appropriate history
; samples to the decorr_pass structure before returning.
;
; The "long_math" argument is used to specify that a 32-bit multiply is
; not enough for the "apply_weight" operation (although in this case it
; would only apply to the -1 and -2 terms because the MMX code does not have
; this limitation) but we ignore the parameter and use the overflow detection
; of the "imul" instruction to switch automatically to the "long_math" loop.
;
; This is written to work on an IA-32 processor and uses the MMX extensions
; to improve the performance by processing both stereo channels together.
; For terms -1 and -2 the MMX extensions are not usable, and so these are
; performed independently without them.
;
; arguments on entry:
;
;   struct decorr_pass *dpp     [ebp+8]
;   int32_t *buffer             [ebp+12]
;   int32_t sample_count        [ebp+16]
;   int32_t long_math           [ebp+20]
;
; registers after entry:
;
;   edi         bptr
;   esi         eptr
;
; on stack (used for terms -1 and -2 only):
;
;   int32_t delta             DWORD [esp]
;

_unpack_decorr_stereo_pass_cont_x86:
        push    ebp                         ; frame pointer: arguments start at [ebp+8]
        mov     ebp, esp
        push    ebx                         ; ebx, esi and edi are restored at "done"
        push    esi
        push    edi

        mov     edx, [ebp+8]                ; edx = dpp
        push    dword ptr [edx+4]           ; local copy of dpp->delta, addressed as [esp]

        mov     edi, [ebp+12]               ; edi = bptr (start of buffer)
        mov     eax, [ebp+16]               ; eax = sample_count
        shl     eax, 3                      ; multiply by 8 (bytes per stereo sample)
        jz      done                        ; zero samples: just unwind and return
        lea     esi, [edi+eax]              ; esi = eptr (first byte past the data)

        mov     eax, [edx]                  ; eax = dpp->term, selects the loop to run
        cmp     eax, 17
        je      term_17_entry
        cmp     eax, 18
        je      term_18_entry
        cmp     eax, -1
        je      term_minus_1_entry
        cmp     eax, -2
        je      term_minus_2_entry
        cmp     eax, -3
        je      term_minus_3_entry
                                            ; any other term falls through to the default code

;
; registers during default term processing loop:
;   edi         active buffer pointer
;   esi         end of buffer pointer
;
; MMX:
;   mm0, mm1    scratch
;   mm2         original sample values
;   mm3         correlation samples
;   mm4         zero (for pcmpeqd)
;   mm5         weights
;   mm6         delta
;   mm7         512 (for rounding)
;

default_term_entry:
        ; Generic-term setup. eax still holds dpp->term from the dispatch code
        ; above; this path is taken for every term other than 17, 18, -1, -2
        ; and -3 (presumably terms 1-8, given the 8-sample history mentioned in
        ; the function header -- confirm against the caller).
        imul    ebx, eax, -8                ; set ebx to term * -8 for decorrelation index
        mov     eax, 512
        movd    mm7, eax
        punpckldq mm7, mm7                  ; mm7 = round (512)
        mov     edx, [ebp+8]                ; edx = dpp
        mov     eax, [edx+4]
        movd    mm6, eax
        punpckldq mm6, mm6                  ; mm6 = delta (0-7)
        mov     eax, 0FFFFh                 ; mask high weights to zero for PMADDWD
        movd    mm5, eax
        punpckldq mm5, mm5                  ; mm5 = weight mask 0x0000FFFF0000FFFF
        pand    mm5, [edx+8]                ; mm5 = weight_AB masked to 16 bits
        pxor    mm4, mm4                    ; mm4 = zero (for pcmpeqd)
        jmp     default_term_loop

        align  64
default_term_loop:
        ; One iteration handles one stereo sample (both channels in parallel):
        ;   [edi]   = left_right + weighted sam_AB
        ;   weight += delta when sam_AB and left_right have the same sign,
        ;   weight -= delta when the signs differ, unchanged if either is zero.
        ; The weighted value is assembled from two PMADDWD word multiplies, one
        ; on sam_AB >> 15 (mm0) and one on the low 15 bits of sam_AB (mm1).
        movq    mm3, [edi+ebx]              ; mm3 = sam_AB
        movq    mm1, mm3
        movq    mm0, mm3
        paddd   mm1, mm1
        psrld   mm0, 15                     ; mm0 = sam_AB >> 15 (logical)
        psrlw   mm1, 1                      ; low word of each dword = sam_AB & 7FFFh
        pmaddwd mm0, mm5
        pmaddwd mm1, mm5
        movq    mm2, [edi]                  ; mm2 = left_right
        pslld   mm0, 5
        paddd   mm1, mm7                    ; add 512 for rounding
        psrad   mm1, 10
        paddd   mm0, mm2
        paddd   mm0, mm1                    ; add shifted sums
        movq    [edi], mm0                  ; store result
        movq    mm0, mm3
        pxor    mm0, mm2
        psrad   mm0, 31                     ; mm0 = sign (sam_AB ^ left_right)
        add     edi, 8
        pcmpeqd mm2, mm4                    ; mm2 = 1s if left_right was zero
        pcmpeqd mm3, mm4                    ; mm3 = 1s if sam_AB was zero
        por     mm2, mm3                    ; mm2 = 1s if either was zero
        pandn   mm2, mm6                    ; mask delta with zeros check
        pxor    mm5, mm0                    ; weight = ((weight ^ sign) + delta) ^ sign
        paddw   mm5, mm2                    ; and add to weight_AB
        pxor    mm5, mm0
        cmp     edi, esi                    ; compare bptr and eptr to see if we're done
        jb      default_term_loop

        pslld   mm5, 16                     ; sign-extend 16-bit weights back to dwords
        psrad   mm5, 16
        mov     eax, [ebp+8]                ; point to dpp
        movq    [eax+8], mm5                ; put weight_AB back
        emms                                ; leave MMX state before returning to the caller
        mov     edx, [ebp+8]                ; access dpp with edx
        mov     ecx, [edx]                  ; ecx = dpp->term

default_store_samples:
        ; Walk backwards from the end of the buffer, copying the last "term"
        ; stereo samples into dpp->samples_A / dpp->samples_B (index term-1
        ; receives the newest sample, index 0 the oldest of the group).
        dec     ecx
        sub     edi, 8                      ; back up one full sample
        mov     eax, [edi+4]
        mov     [edx+ecx*4+48], eax         ; store samples_B [ecx]
        mov     eax, [edi]
        mov     [edx+ecx*4+16], eax         ; store samples_A [ecx]
        test    ecx, ecx
        jnz     default_store_samples

        jmp     done

;
; registers during processing loop for terms 17 & 18:
;   edi         active buffer pointer
;   esi         end of buffer pointer
;
; MMX:
;   mm0, mm1    scratch
;   mm2         original sample values
;   mm3         calculated correlation samples
;   mm4         last calculated values (so we don't need to reload)
;   mm5         weights
;   mm6         delta
;   mm7         512 (for rounding)
;

term_17_entry:
        ; Term 17 setup: same MMX constants and packed weights as the default
        ; path, plus the previous stereo sample preloaded into mm4. The
        ; correlation sample is 2 * (previous sample) - (sample before that).
        mov     eax, 512
        movd    mm7, eax
        punpckldq mm7, mm7                  ; mm7 = round (512)
        mov     edx, [ebp+8]                ; point to dpp & get delta
        mov     eax, [edx+4]
        movd    mm6, eax
        punpckldq mm6, mm6                  ; mm6 = delta (0-7)
        mov     eax, 0FFFFh                 ; mask high weights to zero for PMADDWD
        movd    mm5, eax
        punpckldq mm5, mm5                  ; mm5 = weight mask 0x0000FFFF0000FFFF
        pand    mm5, [edx+8]                ; mm5 = weight_AB masked to 16 bits
        movq    mm4, [edi-8]                ; preload previous calculated values
        jmp     term_17_loop

        align  64
term_17_loop:
        ; On entry mm4 holds the stereo sample stored by the previous pass
        ; (or preloaded from [edi-8]); on exit it holds the one just stored.
        paddd   mm4, mm4
        psubd   mm4, [edi-16]               ; mm4 = sam_AB (copied to mm3 below)
        movq    mm3, mm4
        movq    mm1, mm3
        paddd   mm1, mm1
        psrld   mm4, 15                     ; mm4 = sam_AB >> 15 (logical)
        psrlw   mm1, 1                      ; low word of each dword = sam_AB & 7FFFh
        pmaddwd mm4, mm5
        pmaddwd mm1, mm5
        movq    mm2, [edi]                  ; mm2 = left_right
        pslld   mm4, 5
        paddd   mm1, mm7                    ; add 512 for rounding
        psrad   mm1, 10
        paddd   mm4, mm2
        paddd   mm4, mm1                    ; add shifted sums
        movq    mm0, mm3
        movq    [edi], mm4                  ; store result
        pxor    mm0, mm2
        psrad   mm0, 31                     ; mm0 = sign (sam_AB ^ left_right)
        add     edi, 8
        pxor    mm1, mm1                    ; mm1 = zero
        pcmpeqd mm2, mm1                    ; mm2 = 1s if left_right was zero
        pcmpeqd mm3, mm1                    ; mm3 = 1s if sam_AB was zero
        por     mm2, mm3                    ; mm2 = 1s if either was zero
        pandn   mm2, mm6                    ; mask delta with zeros check
        pxor    mm5, mm0                    ; weight = ((weight ^ sign) + delta) ^ sign
        paddw   mm5, mm2                    ; and add to weight_AB
        pxor    mm5, mm0
        cmp     edi, esi                    ; compare bptr and eptr to see if we're done
        jb      term_17_loop

        pslld   mm5, 16                     ; sign-extend 16-bit weights back to dwords
        psrad   mm5, 16
        mov     eax, [ebp+8]                ; point to dpp
        movq    [eax+8], mm5                ; put weight_AB back
        emms                                ; leave MMX state before returning to the caller
        jmp     term_1718_exit

term_18_entry:
        ; Term 18 setup: identical to term 17. The correlation sample here is
        ; previous + ((previous - sample before that) >> 1).
        mov     eax, 512
        movd    mm7, eax
        punpckldq mm7, mm7                  ; mm7 = round (512)
        mov     edx, [ebp+8]                ; point to dpp & get delta
        mov     eax, [edx+4]
        movd    mm6, eax
        punpckldq mm6, mm6                  ; mm6 = delta (0-7)
        mov     eax, 0FFFFh                 ; mask high weights to zero for PMADDWD
        movd    mm5, eax
        punpckldq mm5, mm5                  ; mm5 = weight mask 0x0000FFFF0000FFFF
        pand    mm5, [edx+8]                ; mm5 = weight_AB masked to 16 bits
        movq    mm4, [edi-8]                ; preload previous calculated value
        jmp     term_18_loop

        align  64
term_18_loop:
        ; On entry mm4 holds the stereo sample stored by the previous pass
        ; (or preloaded from [edi-8]); on exit it holds the one just stored.
        movq    mm3, mm4
        psubd   mm3, [edi-16]
        psrad   mm3, 1
        paddd   mm3, mm4                    ; mm3 = sam_AB
        movq    mm1, mm3
        movq    mm4, mm3
        paddd   mm1, mm1
        psrld   mm4, 15                     ; mm4 = sam_AB >> 15 (logical)
        psrlw   mm1, 1                      ; low word of each dword = sam_AB & 7FFFh
        pmaddwd mm4, mm5
        pmaddwd mm1, mm5
        movq    mm2, [edi]                  ; mm2 = left_right
        pslld   mm4, 5
        paddd   mm1, mm7                    ; add 512 for rounding
        psrad   mm1, 10
        paddd   mm4, mm2
        paddd   mm4, mm1                    ; add shifted sums
        movq    mm0, mm3
        movq    [edi], mm4                  ; store result
        pxor    mm0, mm2
        psrad   mm0, 31                     ; mm0 = sign (sam_AB ^ left_right)
        add     edi, 8
        pxor    mm1, mm1                    ; mm1 = zero
        pcmpeqd mm2, mm1                    ; mm2 = 1s if left_right was zero
        pcmpeqd mm3, mm1                    ; mm3 = 1s if sam_AB was zero
        por     mm2, mm3                    ; mm2 = 1s if either was zero
        pandn   mm2, mm6                    ; mask delta with zeros check
        pxor    mm5, mm0                    ; weight = ((weight ^ sign) + delta) ^ sign
        paddw   mm5, mm2                    ; and add to weight_AB
        pxor    mm5, mm0
        cmp     edi, esi                    ; compare bptr and eptr to see if we're done
        jb      term_18_loop

        pslld   mm5, 16                     ; sign-extend 16-bit weights back to dwords
        psrad   mm5, 16
        mov     eax, [ebp+8]                ; point to dpp
        movq    [eax+8], mm5                ; put weight_AB back
        emms                                ; leave MMX state before returning to the caller

term_1718_exit:
        ; Shared tail for terms 17 and 18: save the last two stereo samples
        ; written to the buffer as the history in dpp (edi = eptr here).
        mov     edx, [edi-4]                ; dpp->samples_B [0] = bptr [-1];
        mov     eax, [ebp+8]
        mov     [eax+48], edx
        mov     edx, [edi-8]                ; dpp->samples_A [0] = bptr [-2];
        mov     [eax+16], edx
        mov     edx, [edi-12]               ; dpp->samples_B [1] = bptr [-3];
        mov     [eax+52], edx
        mov     edx, [edi-16]               ; dpp->samples_A [1] = bptr [-4];
        mov     [eax+20], edx
        jmp     done

;
; registers in term -1 & -2 loops:
;
;   eax,ebx,edx scratch
;   ecx         weight_A
;   ebp         weight_B
;   edi         bptr
;   esi         eptr
;

term_minus_1_entry:
        ; Term -1 (scalar, no MMX): every value in the interleaved buffer is
        ; predicted from the value immediately before it. The first half of
        ; the loop uses weight_A (ecx), the second half weight_B (ebp).
        ; ebp is taken over as a data register here, so the arguments are not
        ; reachable through ebp again until term_minus_1_done rebuilds it.
        cld                                 ; we use stosd here...
        mov     eax, [ebp+8]                ; point to dpp
        mov     ecx, [eax+8]                ; ecx = weight_A and ebp = weight_B
        mov     ebp, [eax+12]
        mov     eax, [edi-4]                ; eax = bptr [-1], the first prediction source
        jmp     term_minus_1_loop

        align  64
term_minus_1_loop:
        ; 32-bit multiply version. eax = previous value on entry to each half.
        mov     ebx, eax                    ; ebx = source sample (kept for the weight update)
        imul    eax, ecx
        mov     edx, [edi]                  ; edx = input value (mov leaves imul's OF intact)
        jo      OV11                        ; product overflowed 32 bits: use the long loop
        sar     eax, 10
        adc     eax, edx                    ; add input plus the last bit shifted out (rounding)
        stosd                               ; store result and advance edi by 4
        test    ebx, ebx
        je      L182                        ; no weight update if source sample is zero...
        test    edx, edx
        je      L182                        ; ...or if the input value is zero
        xor     ebx, edx
        sar     ebx, 31                     ; ebx = 0 if signs match, -1 if they differ
        xor     ecx, ebx                    ; weight_A = ((weight_A ^ sign) + delta) ^ sign,
        add     ecx, [esp]                  ; with the middle value limited to 1024 + sign,
        mov     edx, 1024                   ; i.e. weight_A stays within -1024..1024
        add     edx, ebx
        cmp     ecx, edx
        jle     L183
        mov     ecx, edx
L183:   xor     ecx, ebx
L182:   mov     ebx, eax                    ; second half: source is the value just stored
        imul    eax, ebp
        mov     edx, [edi]
        jo      OV12                        ; overflow: continue in the long loop's second half
        sar     eax, 10
        adc     eax, edx
        stosd
        test    ebx, ebx
        je      L189
        test    edx, edx
        je      L189
        xor     ebx, edx
        sar     ebx, 31
        xor     ebp, ebx                    ; same clipped update, applied to weight_B
        add     ebp, [esp]
        mov     edx, 1024
        add     edx, ebx
        cmp     ebp, edx
        jle     L188
        mov     ebp, edx
L188:   xor     ebp, ebx
L189:   cmp     edi, esi                    ; compare bptr and eptr to see if we're done
        jb      term_minus_1_loop
        jmp     term_minus_1_done

OV11:   mov     eax, ebx                    ; restore previous sample into eax
        jmp     long_term_minus_1_loop

OV12:   mov     eax, ebx                    ; restore previous sample into eax
        jmp     L282

        align  64
long_term_minus_1_loop:
        ; Same algorithm using the full 64-bit product in edx:eax. Once entered
        ; this loop is used for the remainder of the buffer.
        mov     ebx, eax
        imul    ecx                         ; edx:eax = eax * weight_A
        shl     edx, 22                     ; combine into bits 10..41 of the product,
        shr     eax, 10                     ; CF = last bit shifted out
        adc     eax, edx                    ; rounded (product >> 10), low 32 bits
        mov     edx, [edi]
        add     eax, edx
        stosd
        test    ebx, ebx
        je      L282
        test    edx, edx
        je      L282
        xor     ebx, edx
        sar     ebx, 31
        xor     ecx, ebx
        add     ecx, [esp]
        mov     edx, 1024
        add     edx, ebx
        cmp     ecx, edx
        jle     L283
        mov     ecx, edx
L283:   xor     ecx, ebx
L282:   mov     ebx, eax
        imul    ebp                         ; edx:eax = eax * weight_B
        shl     edx, 22
        shr     eax, 10
        adc     eax, edx
        mov     edx, [edi]
        add     eax, edx
        stosd
        test    ebx, ebx
        je      L289
        test    edx, edx
        je      L289
        xor     ebx, edx
        sar     ebx, 31
        xor     ebp, ebx
        add     ebp, [esp]
        mov     edx, 1024
        add     edx, ebx
        cmp     ebp, edx
        jle     L288
        mov     ebp, edx
L288:   xor     ebp, ebx
L289:   cmp     edi, esi                    ; compare bptr and eptr to see if we're done
        jb      long_term_minus_1_loop

term_minus_1_done:
        ; Rebuild the frame pointer, then write both weights and the one
        ; history sample back into dpp.
        mov     edx, ebp                    ; edx = weight_B (ebp is about to be restored)
        mov     ebp, esp                    ; restore ebp (we've pushed 4 DWORDS)
        add     ebp, 16
        mov     eax, [ebp+8]                ; point to dpp
        mov     [eax+8], ecx                ; dpp->weight_A
        mov     [eax+12], edx               ; dpp->weight_B
        mov     edx, [edi-4]                ; dpp->samples_A [0] = bptr [-1]
        mov     [eax+16], edx
        jmp     done


term_minus_2_entry:
        ; Term -2 (scalar, no MMX): within each stereo sample the second value
        ; ([edi+4]) is predicted from the first value of the previous stereo
        ; sample using weight_B (ebp), then the first value ([edi]) is
        ; predicted from the second value just computed using weight_A (ecx).
        ; ebp is a data register here until term_minus_2_done rebuilds it.
        mov     eax, [ebp+8]                ; point to dpp
        mov     ecx, [eax+8]                ; ecx = weight_A and ebp = weight_B
        mov     ebp, [eax+12]
        mov     eax, [edi-8]                ; eax = bptr [-2], the first prediction source
        jmp     term_minus_2_loop

        align  64
term_minus_2_loop:
        ; 32-bit multiply version. eax = source sample on entry to each half.
        mov     ebx, eax                    ; ebx = source sample (kept for the weight update)
        imul    eax, ebp
        mov     edx, [edi+4]                ; edx = input value (mov leaves imul's OF intact)
        jo      OV21                        ; product overflowed 32 bits: use the long loop
        sar     eax, 10
        adc     eax, edx                    ; add input plus the last bit shifted out (rounding)
        mov     [edi+4], eax
        test    ebx, ebx
        je      L194                        ; no weight update if source sample is zero...
        test    edx, edx
        je      L194                        ; ...or if the input value is zero
        xor     ebx, edx
        sar     ebx, 31                     ; ebx = 0 if signs match, -1 if they differ
        xor     ebp, ebx                    ; weight_B = ((weight_B ^ sign) + delta) ^ sign,
        add     ebp, [esp]                  ; with the middle value limited to 1024 + sign,
        mov     edx, 1024                   ; i.e. weight_B stays within -1024..1024
        add     edx, ebx
        cmp     ebp, edx
        jle     L195
        mov     ebp, edx
L195:   xor     ebp, ebx
L194:   mov     ebx, eax                    ; second half: source is the value just stored
        imul    eax, ecx
        mov     edx, [edi]
        jo      OV22                        ; overflow: continue in the long loop's second half
        sar     eax, 10
        adc     eax, edx
        mov     [edi], eax
        add     edi, 8                      ; advance to the next stereo sample
        test    ebx, ebx
        je      L201
        test    edx, edx
        je      L201
        xor     ebx, edx
        sar     ebx, 31
        xor     ecx, ebx                    ; same clipped update, applied to weight_A
        add     ecx, [esp]
        mov     edx, 1024
        add     edx, ebx
        cmp     ecx, edx
        jle     L200
        mov     ecx, edx
L200:   xor     ecx, ebx
L201:   cmp     edi, esi                    ; compare bptr and eptr to see if we're done
        jb      term_minus_2_loop
        jmp     term_minus_2_done

OV21:   mov     eax, ebx                    ; restore previous sample into eax
        jmp     long_term_minus_2_loop

OV22:   mov     eax, ebx                    ; restore previous sample into eax
        jmp     L294

        align  64
long_term_minus_2_loop:
        ; Same algorithm using the full 64-bit product in edx:eax. Once entered
        ; this loop is used for the remainder of the buffer.
        mov     ebx, eax
        imul    ebp                         ; edx:eax = eax * weight_B
        shl     edx, 22                     ; combine into bits 10..41 of the product,
        shr     eax, 10                     ; CF = last bit shifted out
        adc     eax, edx                    ; rounded (product >> 10), low 32 bits
        mov     edx, [edi+4]
        add     eax, edx
        mov     [edi+4], eax
        test    ebx, ebx
        je      L294
        test    edx, edx
        je      L294
        xor     ebx, edx
        sar     ebx, 31
        xor     ebp, ebx
        add     ebp, [esp]
        mov     edx, 1024
        add     edx, ebx
        cmp     ebp, edx
        jle     L295
        mov     ebp, edx
L295:   xor     ebp, ebx
L294:   mov     ebx, eax
        imul    ecx                         ; edx:eax = eax * weight_A
        shl     edx, 22
        shr     eax, 10
        adc     eax, edx
        mov     edx, [edi]
        add     eax, edx
        mov     [edi], eax
        add     edi, 8
        test    ebx, ebx
        je      L301
        test    edx, edx
        je      L301
        xor     ebx, edx
        sar     ebx, 31
        xor     ecx, ebx
        add     ecx, [esp]
        mov     edx, 1024
        add     edx, ebx
        cmp     ecx, edx
        jle     L300
        mov     ecx, edx
L300:   xor     ecx, ebx
L301:   cmp     edi, esi                    ; compare bptr and eptr to see if we're done
        jb      long_term_minus_2_loop

term_minus_2_done:
        ; Rebuild the frame pointer, then write both weights and the one
        ; history sample back into dpp.
        mov     edx, ebp                    ; edx = weight_B (ebp is about to be restored)
        lea     ebp, [esp+16]               ; restore ebp (we've pushed 4 DWORDS)
        mov     eax, [ebp+8]                ; point to dpp
        mov     [eax+8], ecx                ; dpp->weight_A
        mov     [eax+12], edx               ; dpp->weight_B
        mov     edx, [edi-8]                ; dpp->samples_B [0] = bptr [-2];
        mov     [eax+48], edx
        jmp     done

;
; registers during processing loop for term -3:
;   edi         active buffer pointer
;   esi         end of buffer pointer
;
; MMX:
;   mm0, mm1    scratch
;   mm2         original sample values
;   mm3         calculated correlation samples
;   mm4         last calculated values (so we don't need to reload)
;   mm5         weights
;   mm6         delta
;   mm7         512 (for rounding)
;

term_minus_3_entry:
        ; Term -3 setup: same MMX constants and packed weights as the other MMX
        ; paths. The correlation samples are the previous stereo sample with
        ; its two channels swapped.
        mov     eax, 512
        movd    mm7, eax
        punpckldq mm7, mm7                  ; mm7 = round (512)
        mov     edx, [ebp+8]                ; point to dpp & get delta
        mov     eax, [edx+4]
        movd    mm6, eax
        punpckldq mm6, mm6                  ; mm6 = delta (0-7)
        mov     eax, 0FFFFh                 ; mask high weights to zero for PMADDWD
        movd    mm5, eax
        punpckldq mm5, mm5                  ; mm5 = weight mask 0x0000FFFF0000FFFF
        pand    mm5, [edx+8]                ; mm5 = weight_AB masked to 16 bits
        movq    mm4, [edi-8]                ; preload previous calculated values
        jmp     term_minus_3_loop

        align  64
term_minus_3_loop:
        ; On entry mm4 holds the stereo sample stored by the previous pass
        ; (or preloaded from [edi-8]); on exit it holds the one just stored.
        movq    mm3, mm4                    ; mm3 = swap dwords (mm4)
        psrlq   mm3, 32
        punpckldq mm3, mm4                  ; mm3 = sam_AB
        movq    mm1, mm3
        movq    mm4, mm3
        pslld   mm1, 1
        psrld   mm4, 15                     ; mm4 = sam_AB >> 15 (logical)
        psrlw   mm1, 1                      ; low word of each dword = sam_AB & 7FFFh
        pmaddwd mm4, mm5
        pmaddwd mm1, mm5
        movq    mm2, [edi]                  ; mm2 = left_right
        pslld   mm4, 5
        paddd   mm1, mm7                    ; add 512 for rounding
        psrad   mm1, 10
        paddd   mm4, mm2
        paddd   mm4, mm1                    ; add shifted sums
        movq    [edi], mm4                  ; store result
        movq    mm0, mm3
        pxor    mm0, mm2
        psrad   mm0, 31                     ; mm0 = sign (sam_AB ^ left_right)
        add     edi, 8
        pxor    mm1, mm1                    ; mm1 = zero
        pcmpeqd mm2, mm1                    ; mm2 = 1s if left_right was zero
        pcmpeqd mm3, mm1                    ; mm3 = 1s if sam_AB was zero
        por     mm2, mm3                    ; mm2 = 1s if either was zero
        pandn   mm2, mm6                    ; mask delta with zeros check
        ; Clipped weight update: bias the sign-adjusted weight by
        ; mm1 = -1 - 512 - 512 - sign so that the unsigned saturating add of
        ; delta tops out at 1024 + sign, then remove the bias again.
        pcmpeqd mm1, mm1                    ; mm1 = all 1s (-1)
        psubd   mm1, mm7
        psubd   mm1, mm7
        psubd   mm1, mm0
        pxor    mm5, mm0
        paddw   mm5, mm1
        paddusw mm5, mm2                    ; and add to weight_AB
        psubw   mm5, mm1
        pxor    mm5, mm0
        cmp     edi, esi                    ; compare bptr and eptr to see if we're done
        jb      term_minus_3_loop

        pslld   mm5, 16                     ; sign-extend 16-bit weights back to dwords
        psrad   mm5, 16
        mov     eax, [ebp+8]                ; point to dpp
        movq    [eax+8], mm5                ; put weight_AB back
        emms                                ; leave MMX state before returning to the caller
        mov     edx, [edi-4]                ; dpp->samples_A [0] = bptr [-1];
        mov     eax, [ebp+8]
        mov     [eax+16], edx
        mov     edx, [edi-8]                ; dpp->samples_B [0] = bptr [-2];
        mov     [eax+48], edx

        ; Common exit for every stereo path: drop the delta copy and restore
        ; the registers in the reverse order of the pushes at entry.
done:   pop     eax                         ; pop delta & saved regs
        pop     edi
        pop     esi
        pop     ebx
        pop     ebp
        ret

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; This is the mono version of the above function. It does not use MMX and does not handle negative terms.
;
; void unpack_decorr_mono_pass_cont (struct decorr_pass *dpp,
;                                    int32_t *buffer,
;                                    int32_t sample_count,
;                                    int32_t long_math);
; arguments on entry:
;
;   struct decorr_pass *dpp     [ebp+8]
;   int32_t *buffer             [ebp+12]
;   int32_t sample_count        [ebp+16]
;   int32_t long_math           [ebp+20]
;
; registers after entry:
;
;   edi         bptr
;   esi         eptr
;
; on stack:
;
;   int32_t delta             DWORD [esp]
;

_unpack_decorr_mono_pass_cont_x86:
        push    ebp                         ; frame pointer: arguments start at [ebp+8]
        mov     ebp, esp
        push    ebx                         ; ebx, esi and edi are restored at "mono_done"
        push    esi
        push    edi
        cld                                 ; the term 17/18 loops store with stosd

        mov     edx, [ebp+8]                ; edx = dpp
        push    dword ptr [edx+4]           ; local copy of dpp->delta, addressed as [esp]

        mov     edi, [ebp+12]               ; edi = bptr (start of buffer)
        mov     eax, [ebp+16]               ; eax = sample_count
        shl     eax, 2                      ; multiply by 4 (bytes per mono sample)
        jz      mono_done                   ; zero samples: just unwind and return
        add     eax, edi
        mov     esi, eax                    ; esi = eptr (first byte past the data)

        mov     eax, [edx]                  ; eax = dpp->term, selects the loop to run
        cmp     eax, 17
        je      mono_17_entry
        cmp     eax, 18
        je      mono_18_entry
                                            ; any other term falls through to the default code

;
; registers during default term processing loop:
;   edi         active buffer pointer
;   esi         end of buffer pointer
;   ecx         weight_A
;   ebp         free
;   ebx         term * -4
;   eax,edx     scratch
;

default_mono_entry:
        mov     edx, [ebp+8]                ; edx = dpp
        mov     ecx, [edx+8]                ; ecx = dpp->weight_A
        imul    ebx, eax, -4                ; ebx = term * -4: byte offset back to the source sample
        jmp     default_mono_loop

;
; Register use in the term 17 & 18 loops:
;   edi         current position in the buffer
;   esi         end of the buffer
;   ecx         weight_A
;   ebp         most recently computed value (not the frame pointer!)
;   ebx         correlation sample for the current position
;   eax,edx     scratch
;

mono_17_entry:
        mov     edx, [ebp+8]                ; edx = dpp
        mov     ebp, [edi-4]                ; ebp = bptr [-1]; frame pointer is gone from here on
        mov     ecx, [edx+8]                ; ecx = dpp->weight_A
        jmp     mono_17_loop

mono_18_entry:
        mov     edx, [ebp+8]                ; edx = dpp
        mov     ebp, [edi-4]                ; ebp = bptr [-1]; frame pointer is gone from here on
        mov     ecx, [edx+8]                ; ecx = dpp->weight_A
        jmp     mono_18_loop

        align  64
default_mono_loop:
        ; Generic-term mono loop, 32-bit multiply version. The source sample
        ; sits "term" samples back in the buffer ([edi+ebx], ebx = term * -4).
        mov     eax, [edi+ebx]              ; eax = source sample
        imul    eax, ecx
        mov     edx, [edi]                  ; edx = input value (mov leaves imul's OF intact)
        jo      long_default_mono_loop      ; product overflowed 32 bits: redo in the long loop
        sar     eax, 10
        adc     eax, edx                    ; add input plus the last bit shifted out (rounding)
        mov     [edi], eax
        mov     eax, [edi+ebx]              ; reload source sample for the weight update
        add     edi, 4
        test    edx, edx
        je      L100                        ; no weight update if input value is zero...
        test    eax, eax
        je      L100                        ; ...or if the source sample is zero
        xor     eax, edx
        cdq                                 ; edx = 0 if signs match, -1 if they differ
        xor     ecx, edx                    ; weight = ((weight ^ sign) + delta) ^ sign
        add     ecx, [esp]
        xor     ecx, edx
L100:   cmp     edi, esi                    ; compare bptr and eptr to see if we're done
        jb      default_mono_loop
        jmp     default_mono_done

        align  64
long_default_mono_loop:
        ; Same algorithm using the full 64-bit product in edx:eax. Once entered
        ; this loop is used for the remainder of the buffer.
        mov     eax, [edi+ebx]
        imul    ecx                         ; edx:eax = source sample * weight
        shl     edx, 22                     ; combine into bits 10..41 of the product,
        shr     eax, 10                     ; CF = last bit shifted out
        adc     eax, edx                    ; rounded (product >> 10), low 32 bits
        mov     edx, [edi]
        add     eax, edx
        mov     [edi], eax
        mov     eax, [edi+ebx]
        add     edi, 4
        test    edx, edx
        je      L101
        test    eax, eax
        je      L101
        xor     eax, edx
        cdq
        xor     ecx, edx
        add     ecx, [esp]
        xor     ecx, edx
L101:   cmp     edi, esi                    ; compare bptr and eptr to see if we're done
        jb      long_default_mono_loop

default_mono_done:
        ; ebp is still the frame pointer on this path (the default loops never
        ; write it), so dpp can be fetched directly.
        mov     edx, [ebp+8]                ; edx = dpp*
        mov     [edx+8], ecx                ; store weight_A back
        mov     ecx, [edx]                  ; ecx = dpp->term

default_mono_store_samples:
        ; Walk backwards from the end of the buffer, copying the last "term"
        ; samples into dpp->samples_A (index term-1 receives the newest).
        dec     ecx
        sub     edi, 4                      ; back up one full sample
        mov     eax, [edi]
        mov     [edx+ecx*4+16], eax         ; store samples_A [ecx]
        test    ecx, ecx
        jnz     default_mono_store_samples
        jmp     mono_done

        align  64
mono_17_loop:
        ; Term 17 mono loop, 32-bit multiply version. ebp holds the previous
        ; output value; the correlation sample is 2 * previous - bptr [-2].
        lea     ebx, [ebp+ebp]
        sub     ebx, [edi-8]                ; ebx = correlation sample
        mov     eax, ecx
        imul    eax, ebx
        mov     edx, [edi]                  ; edx = input value (mov leaves imul's OF intact)
        jo      long_mono_17_loop           ; product overflowed 32 bits: redo in the long loop
        sar     eax, 10
        adc     eax, edx                    ; add input plus the last bit shifted out (rounding)
        stosd                               ; store result and advance edi by 4
        test    ebx, ebx
        mov     ebp, eax                    ; remember result (mov leaves the test flags intact)
        je      L117                        ; no weight update if correlation sample is zero...
        test    edx, edx
        je      L117                        ; ...or if the input value is zero
        mov     eax, [esp]                  ; eax = delta
        xor     ebx, edx
        sar     ebx, 31                     ; ebx = 0 if signs match, -1 if they differ
        xor     eax, ebx
        sub     eax, ebx                    ; eax = +delta or -delta
        add     ecx, eax
L117:   cmp     edi, esi                    ; compare bptr and eptr to see if we're done
        jb      mono_17_loop
        jmp     mono_1718_exit

        align  64
long_mono_17_loop:
        ; Same algorithm using the full 64-bit product in edx:eax. Once entered
        ; this loop is used for the remainder of the buffer.
        lea     ebx, [ebp+ebp]
        sub     ebx, [edi-8]
        mov     eax, ecx
        imul    ebx                         ; edx:eax = weight * correlation sample
        shl     edx, 22                     ; combine into bits 10..41 of the product,
        shr     eax, 10                     ; CF = last bit shifted out
        adc     eax, edx                    ; rounded (product >> 10), low 32 bits
        mov     edx, [edi]
        add     eax, edx
        stosd
        test    ebx, ebx
        mov     ebp, eax
        je      L217
        test    edx, edx
        je      L217
        mov     eax, [esp]
        xor     ebx, edx
        sar     ebx, 31
        xor     eax, ebx
        sub     eax, ebx
        add     ecx, eax
L217:   cmp     edi, esi                    ; compare bptr and eptr to see if we're done
        jb      long_mono_17_loop
        jmp     mono_1718_exit

        align  64
mono_18_loop:
        ; Term 18 mono loop, 32-bit multiply version. ebp holds the previous
        ; output value; the correlation sample is (3 * previous - bptr [-2]) >> 1.
        lea     ebx, [ebp+ebp*2]
        sub     ebx, [edi-8]
        sar     ebx, 1                      ; ebx = correlation sample
        mov     eax, ecx
        imul    eax, ebx
        mov     edx, [edi]                  ; edx = input value (mov leaves imul's OF intact)
        jo      long_mono_18_loop           ; product overflowed 32 bits: redo in the long loop
        sar     eax, 10
        adc     eax, edx                    ; add input plus the last bit shifted out (rounding)
        stosd                               ; store result and advance edi by 4
        test    ebx, ebx
        mov     ebp, eax                    ; remember result (mov leaves the test flags intact)
        je      L118                        ; no weight update if correlation sample is zero...
        test    edx, edx
        je      L118                        ; ...or if the input value is zero
        mov     eax, [esp]                  ; eax = delta
        xor     ebx, edx
        sar     ebx, 31                     ; ebx = 0 if signs match, -1 if they differ
        xor     eax, ebx
        sub     eax, ebx                    ; eax = +delta or -delta
        add     ecx, eax
L118:   cmp     edi, esi                    ; compare bptr and eptr to see if we're done
        jb      mono_18_loop
        jmp     mono_1718_exit

        align  64
long_mono_18_loop:
        ; Same algorithm using the full 64-bit product in edx:eax. Once entered
        ; this loop is used for the remainder of the buffer.
        lea     ebx, [ebp+ebp*2]
        sub     ebx, [edi-8]
        sar     ebx, 1
        mov     eax, ecx
        imul    ebx                         ; edx:eax = weight * correlation sample
        shl     edx, 22                     ; combine into bits 10..41 of the product,
        shr     eax, 10                     ; CF = last bit shifted out
        adc     eax, edx                    ; rounded (product >> 10), low 32 bits
        mov     edx, [edi]
        add     eax, edx
        stosd
        test    ebx, ebx
        mov     ebp, eax
        je      L218
        test    edx, edx
        je      L218
        mov     eax, [esp]
        xor     ebx, edx
        sar     ebx, 31
        xor     eax, ebx
        sub     eax, ebx
        add     ecx, eax
L218:   cmp     edi, esi                    ; compare bptr and eptr to see if we're done
        jb      long_mono_18_loop

mono_1718_exit:
        ; Shared tail for terms 17 and 18: ebp was used for data, so rebuild
        ; the frame pointer before writing the weight and history into dpp.
        lea     ebp, [esp+16]               ; restore ebp (we've pushed 4 DWORDS)
        mov     edx, [ebp+8]                ; edx = dpp*
        mov     [edx+8], ecx                ; store weight_A back
        mov     eax, [edi-4]                ; dpp->samples_A [0] = bptr [-1];
        mov     [edx+16], eax
        mov     eax, [edi-8]                ; dpp->samples_A [1] = bptr [-2];
        mov     [edx+20], eax

        ; Common exit for every mono path: drop the delta copy and restore the
        ; registers in the reverse order of the pushes at entry.
mono_done:
        pop     eax                         ; pop delta & saved regs
        pop     edi
        pop     esi
        pop     ebx
        pop     ebp
        ret

; Helper function to determine if specified CPU feature is available (used here for MMX).
; Input parameter is index of feature to be checked (EDX from CPUID(1) only, MMX = 23).
; Return value is the specified bit (0 or 1) or 0 if CPUID is not supported.

_unpack_cpu_has_feature_x86:
        pushfd                              ; caller's EFLAGS, kept on the stack for the restore below
        pushfd                              ; working copy
        xor     dword ptr [esp], 200000h    ; flip the ID flag (bit 21) in the working copy
        popfd                               ; ...and try to load it into EFLAGS
        pushfd
        pop     eax                         ; eax = EFLAGS as the CPU actually accepted them
        xor     eax, [esp]                  ; set bits = flags that differ from the caller's
        popfd                               ; put the caller's EFLAGS back
        and     eax, 200000h                ; nonzero only if the ID flag could be toggled
        jz      oldcpu                      ; no CPUID available: return 0 (eax is already zero)

        push    ebx                         ; cpuid overwrites ebx, which must be preserved
        mov     eax, 1
        cpuid                               ; edx = feature flags of CPUID leaf 1
        movzx   ecx, byte ptr [esp+8]       ; ecx = feature index argument (low byte)
        xor     eax, eax
        bt      edx, ecx                    ; CF = requested feature bit (index taken modulo 32)
        setc    al                          ; eax = 0 or 1
        pop     ebx

oldcpu: ret                                 ; return value in eax

asmcode ends

        end