/*
 * ramp 0.7.0
 *
 * A high-performance multiple-precision arithmetic library
 * Documentation
 */
    .text
    .file "addsub_n.S"

/*
 * Syntax: GAS AT&T (source, destination), preprocessed with cpp.
 *
 * Register aliases shared by every routine in this file.  The four
 * registers are, in order, the first four integer argument registers of
 * the System V AMD64 calling convention, so the routines below take
 * (wp, xp, yp, n) as their arguments:
 *
 *   wp  - limbs are stored through this pointer (result)
 *   xp  - limbs are loaded into registers from this pointer (first operand)
 *   yp  - limbs are used as adc/sbb memory operands (second operand)
 *   n   - limb count; the code shifts the full 64-bit register
 *
 * NOTE: `n` is a bare preprocessor token, so it is substituted anywhere
 * it appears as a standalone identifier outside a comment.
 */
#define wp %rdi
#define xp %rsi
#define yp %rdx
#define n %rcx

    .section .text.ramp_add_n,"ax",@progbits
    .globl ramp_add_n
    .align 16, 0x90
    .type ramp_add_n,@function
/*
 * ramp_add_n: w[0..n) = x[0..n) + y[0..n), returning the carry out.
 *
 * Limbs are 64-bit words.  The carry chain starts at offset 0 and runs
 * towards higher addresses, i.e. the least significant limb comes first.
 *
 * ABI:   System V AMD64, leaf function
 * In:    wp (%rdi) = destination limbs
 *        xp (%rsi) = first source operand
 *        yp (%rdx) = second source operand
 *        n  (%rcx) = number of limbs
 * Out:   %rax = carry out of the most significant limb (0 or 1);
 *        0 with nothing read or written when n == 0
 * Clobb: %rcx, %rdx, %rsi, %rdi, %r8-%r11, flags
 * Stack: none
 *
 * NOTE(review): the group count is taken from the full 64-bit %rcx, so a
 * caller whose prototype declares a 32-bit n must pass it with the upper
 * half cleared -- confirm against the declaring side.
 *
 * Structure: n is split into n / 4 groups of four limbs, handled by a
 * 4-way unrolled loop, and n % 4 leftover limbs handled by straight-line
 * tails.  The carry lives in CF for the whole routine, so every
 * instruction between two adc's is one that leaves CF alone (mov, lea,
 * inc, dec, jcc, jrcxz).  Do not reorder or substitute instructions
 * without re-checking that invariant.
 */
ramp_add_n:
    .cfi_startproc

#define L(lbl) .LADD_ ## lbl

    mov %ecx, %eax              /* only the low two bits are needed below */
    shr $2, n                   /* n = number of 4-limb groups */
    and $3, %eax                /* %eax = leftover limbs; clears CF, sets ZF */
    jrcxz L(small)              /* no full group (jrcxz leaves flags alone) */

    /* At least one group: start loading it and enter the loop mid-way. */
    mov (xp), %r8
    mov 8(xp), %r9
    dec n                       /* ZF = this is the only group; CF kept */
    jmp L(mid)

L(small):
    /* Fewer than four limbs.  ZF still reflects the `and` above, so it is
       set exactly when the leftover count is zero as well, i.e. n == 0.
       %eax is already 0, which is the value to return. */
    jz L(done)

    /* 1..3 leftover limbs, count in %eax, incoming carry in CF.  Also
       reached from L(end) with the pointers advanced past the groups. */
L(lt4):
    dec %eax                    /* 1 leftover -> 0 */
    mov (xp), %r8
    jnz L(2)
    adc (yp), %r8
    mov %r8, (wp)
    adc %eax, %eax              /* %eax is 0 here, so %eax = CF */
    ret

L(2):
    dec %eax                    /* 2 leftovers -> 0 */
    mov 8(xp), %r9
    jnz L(3)
    adc (yp), %r8
    adc 8(yp), %r9
    mov %r8, (wp)
    mov %r9, 8(wp)
    adc %eax, %eax              /* %eax is 0 here, so %eax = CF */
    ret

L(3):
    mov 16(xp), %r10
    adc (yp), %r8
    adc 8(yp), %r9
    adc 16(yp), %r10
    mov %r8, (wp)
    mov %r9, 8(wp)
    mov %r10, 16(wp)
    setc %al                    /* %eax is 1 here; rewriting %al gives CF */
    ret

    /* Main loop: %r8-%r11 hold four limbs of x.  Each pass adds the
       matching limbs of y, stores the sums, advances all three pointers
       by 32 bytes and starts loading the next group. */
    .align 16
L(top):
    adc (yp), %r8
    adc 8(yp), %r9
    adc 16(yp), %r10
    adc 24(yp), %r11
    mov %r8, (wp)
    lea 32(xp), xp              /* lea: pointer bump without touching CF */
    mov %r9, 8(wp)
    mov %r10, 16(wp)
    dec n                       /* sets ZF for the jnz below; CF kept */
    mov %r11, 24(wp)
    lea 32(yp), yp
    mov (xp), %r8
    mov 8(xp), %r9
    lea 32(wp), wp
L(mid):
    mov 16(xp), %r10
    mov 24(xp), %r11
    jnz L(top)                  /* ZF comes from the most recent `dec n` */

    /* Last full group: its four x limbs are already in %r8-%r11. */
L(end):
    lea 32(xp), xp
    adc (yp), %r8
    adc 8(yp), %r9
    adc 16(yp), %r10
    adc 24(yp), %r11
    lea 32(yp), yp
    mov %r8, (wp)
    mov %r9, 8(wp)
    mov %r10, 16(wp)
    mov %r11, 24(wp)
    lea 32(wp), wp

    /* inc/dec pair: sets ZF from the leftover count while preserving the
       carry in CF (a test or cmp would clear it). */
    inc %eax
    dec %eax
    jnz L(lt4)
    adc %eax, %eax              /* %eax is 0 here, so %eax = CF */
L(done):
    ret
L(tmp):
    .size ramp_add_n, L(tmp) - ramp_add_n
    .cfi_endproc

    .section .text.ramp_sub_n,"ax",@progbits
    .globl ramp_sub_n
    .align 16, 0x90
    .type ramp_sub_n,@function
/*
 * ramp_sub_n: w[0..n) = x[0..n) - y[0..n), returning the borrow out.
 *
 * Limbs are 64-bit words.  The borrow chain starts at offset 0 and runs
 * towards higher addresses, i.e. the least significant limb comes first.
 *
 * ABI:   System V AMD64, leaf function
 * In:    wp (%rdi) = destination limbs
 *        xp (%rsi) = minuend limbs
 *        yp (%rdx) = subtrahend limbs
 *        n  (%rcx) = number of limbs
 * Out:   %rax = borrow out of the most significant limb (0 or 1);
 *        0 with nothing read or written when n == 0
 * Clobb: %rcx, %rdx, %rsi, %rdi, %r8-%r11, flags
 * Stack: none
 *
 * NOTE(review): the group count is taken from the full 64-bit %rcx, so a
 * caller whose prototype declares a 32-bit n must pass it with the upper
 * half cleared -- confirm against the declaring side.
 *
 * Structure: n is split into n / 4 groups of four limbs, handled by a
 * 4-way unrolled loop, and n % 4 leftover limbs handled by straight-line
 * tails.  The borrow lives in CF for the whole routine, so every
 * instruction between two sbb's is one that leaves CF alone (mov, lea,
 * inc, dec, jcc, jrcxz).  Do not reorder or substitute instructions
 * without re-checking that invariant.
 */
ramp_sub_n:
    .cfi_startproc

#undef L
#define L(lbl) .LSUB_ ## lbl

    mov %ecx, %eax              /* only the low two bits are needed below */
    shr $2, n                   /* n = number of 4-limb groups */
    and $3, %eax                /* %eax = leftover limbs; clears CF, sets ZF */
    jrcxz L(small)              /* no full group (jrcxz leaves flags alone) */

    /* At least one group: start loading it and enter the loop mid-way. */
    mov (xp), %r8
    mov 8(xp), %r9
    dec n                       /* ZF = this is the only group; CF kept */
    jmp L(mid)

L(small):
    /* Fewer than four limbs.  ZF still reflects the `and` above, so it is
       set exactly when the leftover count is zero as well, i.e. n == 0.
       %eax is already 0, which is the value to return. */
    jz L(done)

    /* 1..3 leftover limbs, count in %eax, incoming borrow in CF.  Also
       reached from L(end) with the pointers advanced past the groups. */
L(lt4):
    dec %eax                    /* 1 leftover -> 0 */
    mov (xp), %r8
    jnz L(2)
    sbb (yp), %r8
    mov %r8, (wp)
    adc %eax, %eax              /* %eax is 0 here, so %eax = CF (borrow) */
    ret

L(2):
    dec %eax                    /* 2 leftovers -> 0 */
    mov 8(xp), %r9
    jnz L(3)
    sbb (yp), %r8
    sbb 8(yp), %r9
    mov %r8, (wp)
    mov %r9, 8(wp)
    adc %eax, %eax              /* %eax is 0 here, so %eax = CF (borrow) */
    ret

L(3):
    mov 16(xp), %r10
    sbb (yp), %r8
    sbb 8(yp), %r9
    sbb 16(yp), %r10
    mov %r8, (wp)
    mov %r9, 8(wp)
    mov %r10, 16(wp)
    setc %al                    /* %eax is 1 here; rewriting %al gives CF */
    ret

    /* Main loop: %r8-%r11 hold four limbs of x.  Each pass subtracts the
       matching limbs of y, stores the differences, advances all three
       pointers by 32 bytes and starts loading the next group. */
    .align 16
L(top):
    sbb (yp), %r8
    sbb 8(yp), %r9
    sbb 16(yp), %r10
    sbb 24(yp), %r11
    mov %r8, (wp)
    lea 32(xp), xp              /* lea: pointer bump without touching CF */
    mov %r9, 8(wp)
    mov %r10, 16(wp)
    dec n                       /* sets ZF for the jnz below; CF kept */
    mov %r11, 24(wp)
    lea 32(yp), yp
    mov (xp), %r8
    mov 8(xp), %r9
    lea 32(wp), wp
L(mid):
    mov 16(xp), %r10
    mov 24(xp), %r11
    jnz L(top)                  /* ZF comes from the most recent `dec n` */

    /* Last full group: its four x limbs are already in %r8-%r11. */
L(end):
    lea 32(xp), xp
    sbb (yp), %r8
    sbb 8(yp), %r9
    sbb 16(yp), %r10
    sbb 24(yp), %r11
    lea 32(yp), yp
    mov %r8, (wp)
    mov %r9, 8(wp)
    mov %r10, 16(wp)
    mov %r11, 24(wp)
    lea 32(wp), wp

    /* inc/dec pair: sets ZF from the leftover count while preserving the
       borrow in CF (a test or cmp would clear it). */
    inc %eax
    dec %eax
    jnz L(lt4)
    adc %eax, %eax              /* %eax is 0 here, so %eax = CF (borrow) */
L(done):
    ret
L(tmp):
    .size ramp_sub_n, L(tmp) - ramp_sub_n
    .cfi_endproc