dnl
dnl Copyright 2017 Free Software Foundation, Inc.
dnl Contributed to the GNU project by Torbjorn Granlund.
dnl Copyright (C) 2024 Albin Ahlbäck
dnl
dnl This file is part of FLINT.
dnl
dnl FLINT is free software: you can redistribute it and/or modify it under
dnl the terms of the GNU Lesser General Public License (LGPL) as published
dnl by the Free Software Foundation; either version 3 of the License, or
dnl (at your option) any later version. See <https://www.gnu.org/licenses/>.
dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of either:
dnl
dnl * the GNU Lesser General Public License as published by the Free
dnl Software Foundation; either version 3 of the License, or (at your
dnl option) any later version.
dnl
dnl or
dnl
dnl * the GNU General Public License as published by the Free Software
dnl Foundation; either version 2 of the License, or (at your option) any
dnl later version.
dnl
dnl or both in parallel, as here.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
dnl for more details.
dnl
dnl You should have received copies of the GNU General Public License and the
dnl GNU Lesser General Public License along with the GNU MP Library. If not,
dnl see https://www.gnu.org/licenses/.
include(`config.m4')
dnl Incoming arguments. These are the first four integer argument registers
dnl of the System V AMD64 calling convention.
define(`rp', `%rdi')
define(`ap', `%rsi')
define(`bp_param', `%rdx')
define(`n', `%rcx')
dnl mulx takes its implicit multiplier from %rdx, so the pointer that arrives
dnl in %rdx is moved to bp and %rdx is reloaded with the current limb of b.
define(`bp', `%r11')
dnl jmpreg holds the address of the entry point for the next addmul pass.
dnl nn holds the value from which the loop counter n is reloaded.
dnl mm holds the (negative) byte offset used to rewind rp and ap.
define(`jmpreg',`%r15')
define(`nn', `%rbp')
define(`mm', `%rbx')
dnl Scratch registers for product limbs. sx is %rax: it accumulates the top
dnl limb and is what the function leaves in %rax when it returns.
define(`s0', `%r8')
define(`s1', `%r9')
define(`s2', `%r10')
define(`s3', `%r12')
define(`s4', `%r13')
define(`s5', `%r14')
define(`sx', `%rax')
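dnl Scheme (shown for n = 9; row i uses limb i of bp, column j uses limb j of ap):
dnl     0 1 2 3 4 5 6 7 8
dnl   0 x x x x x x x x x
dnl   1 x x x x x x x x l
dnl   2 x x x x x x x l
dnl   3 x x x x x x l
dnl   4 x x x x x l
dnl   5 x x x x l
dnl   6 x x x l
dnl   7 x x l
dnl   8 x l
dnl   x = both limbs of the product are accumulated,
dnl   l = only the low limb of the product is accumulated.
dnl NOTE: Requires n > 8.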
C -----------------------------------------------------------------------
C flint_mpn_mullow_basecase
C
C Low-half product of two n-limb operands, built from one mul_1 pass,
C a chain of progressively shorter addmul_1 passes and a straight-line
C corner for the last eight rows.
C
C In:    rp (%rdi) destination, ap (%rsi) and bp (%rdx) sources,
C        n (%rcx) limb count.  Requires n > 8.
C Out:   limbs are stored through rp; %rax (sx/tx) holds the accumulated
C        limb that follows the stored ones.
C Saved: %rbx, %rbp, %r12-%r15 are pushed on entry and popped before ret.
C Clobb: %rcx, %rdx, %rsi, %rdi, %r8-%r11, flags.
C Uses mulx, adcx and adox, so the CPU must provide BMI2 and ADX.
C -----------------------------------------------------------------------
TEXT
ALIGN(32)
PROLOGUE(flint_mpn_mullow_basecase)
C Save the callee-saved registers that back mm, nn, s3, s4, s5 and jmpreg.
push mm
push nn
push s3
push s4
push s5
push jmpreg
C nn = n - 2; it is turned into the loop-count reload value after the
C mul_1 pass.
lea -2(n), R32(nn)
lea 1*8(bp_param), bp C Prepare bp for addmul_1
mov 0*8(bp_param), %rdx C Load rdx for mul_1
C Split n: n / 8 becomes the loop count, n mod 8 (in sx) selects the
C mul_1 entry point through the jump table.
mov R32(n), R32(sx)
shr $3, R32(n)
and $7, R32(sx) C clear OF, CF as side-effect
lea L(mtab)(%rip), s0
ifdef(`PIC',
` movslq (s0,sx,4), sx
lea (sx,s0), s0
jmp *s0
',`
jmp *(s0,sx,8)
')
C mul_1 entry points, one per value of n mod 8.  Each computes the first
C product, biases ap and rp so that the offsets at the matching L(mbK)
C label address the right limbs, and records in jmpreg the entry point of
C the first addmul pass.  Entries mf3..mf7 join late in the loop body, so
C their partial first trip is counted by incrementing n.
L(mf0): mulx 0*8(ap), s0, s2
lea 7*8(ap), ap
lea -1*8(rp), rp
lea L(f0)(%rip), jmpreg
jmp L(mb0)
L(mf3): mulx 0*8(ap), s1, sx
lea 2*8(ap), ap
lea 2*8(rp), rp
inc R32(n)
lea L(f3)(%rip), jmpreg
jmp L(mb3)
L(mf4): mulx 0*8(ap), s0, s2
lea 3*8(ap), ap
lea 3*8(rp), rp
inc R32(n)
lea L(f4)(%rip), jmpreg
jmp L(mb4)
L(mf5): mulx 0*8(ap), s1, sx
lea 4*8(ap), ap
lea 4*8(rp), rp
inc R32(n)
lea L(f5)(%rip), jmpreg
jmp L(mb5)
L(mf6): mulx 0*8(ap), s0, s2
lea 5*8(ap), ap
lea 5*8(rp), rp
inc R32(n)
lea L(f6)(%rip), jmpreg
jmp L(mb6)
L(mf7): mulx 0*8(ap), s1, sx
lea 6*8(ap), ap
lea 6*8(rp), rp
inc R32(n)
lea L(f7)(%rip), jmpreg
jmp L(mb7)
L(mf1): mulx 0*8(ap), s1, sx
lea L(f1)(%rip), jmpreg
jmp L(mb1)
C mf2 falls through into the top of the loop instead of jumping.
L(mf2): mulx 0*8(ap), s0, s2
lea 1*8(ap), ap
lea 1*8(rp), rp
lea L(f2)(%rip), jmpreg
mulx 0*8(ap), s1, sx
C mul_1 loop, unrolled eight times.  Products alternate between the pairs
C (s0, s2) and (s1, sx); a single adc chain carries each high limb into
C the next low limb.  dec leaves CF untouched, so the chain survives the
C loop branch.
ALIGN(32)
L(mtop):mov s0, -1*8(rp)
adc s2, s1
L(mb1): mulx 1*8(ap), s0, s2
adc sx, s0
lea 8*8(ap), ap
mov s1, 0*8(rp)
L(mb0): mov s0, 1*8(rp)
mulx -6*8(ap), s1, sx
lea 8*8(rp), rp
adc s2, s1
L(mb7): mulx -5*8(ap), s0, s2
mov s1, -6*8(rp)
adc sx, s0
L(mb6): mov s0, -5*8(rp)
mulx -4*8(ap), s1, sx
adc s2, s1
L(mb5): mulx -3*8(ap), s0, s2
mov s1, -4*8(rp)
adc sx, s0
L(mb4): mulx -2*8(ap), s1, sx
mov s0, -3*8(rp)
adc s2, s1
L(mb3): mulx -1*8(ap), s0, s2
adc sx, s0
mov s1, -2*8(rp)
dec R32(n)
mulx 0*8(ap), s1, sx
jnz L(mtop)
C End of the mul_1 pass.  Store its last two limbs and keep the top limb
C in sx.  mm becomes 8*(nn + 1) bytes and is then negated so it can rewind
C the pointers; nn is divided by 8 to become the loop-count reload value;
C %rdx is loaded with the multiplier for the first addmul pass.
lea 1*8(,nn,8), R32(mm)
mov s0, -1*8(rp)
adc s2, s1
mov 0*8(bp), %rdx
mov s1, 0*8(rp)
adc n, sx C n = 0
shr $3, R32(nn)
neg mm
or R32(nn), R32(n) C Reset n, clear OF and CF
lea 1*8(rp,mm), rp C Reset rp
lea (ap,mm), ap C Reset ap
jmp *jmpreg
C Padding nops.  NOTE(review): presumably these place the following entry
C points at a chosen alignment - confirm before changing the count.
nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;nop;
C addmul entry points.  Every pass is entered through jmpreg and sets
C jmpreg to the entry of the next pass, in the fixed order
C f7 -> f6 -> f5 -> f4 -> f3 -> f2 -> f1 -> f0 -> f7.
C Each entry computes the first product and biases ap and rp for the
C matching L(bK) label inside the loop.
L(f7): mulx 0*8(ap), s0, s2
lea 5*8(ap), ap
lea -3*8(rp), rp
lea L(f6)(%rip), jmpreg
jmp L(b7)
L(f6): mulx 0*8(ap), s1, s3
lea 4*8(ap), ap
lea -4*8(rp), rp
lea L(f5)(%rip), jmpreg
jmp L(b6)
C Tail of an addmul pass, reached from L(top) when n is zero.  Stores the
C last limb, folds the final high limb (s3) and the low limb of one more
C product (s0) into sx, steps bp to the next limb and loads it into %rdx,
C adds one limb to mm, rewinds ap and rp, reloads n and enters the next
C pass.
L(end): adox 0*8(rp), s1
mulx 1*8(ap), s0, s2 C Only s0 is used
lea 1*8(mm), mm
adox s3, sx
mov s1, 0*8(rp)
lea 1*8(bp), bp
adc s0, sx
lea (ap,mm), ap C Reset ap
mov 0*8(bp), %rdx
or R32(nn), R32(n) C Reset count, clear CF and OF
lea 1*8(rp,mm), rp C Reset rp
jmp *jmpreg
L(f0): mulx 0*8(ap), s1, s3
lea -2*8(ap), ap
lea -2*8(rp), rp
lea L(f7)(%rip), jmpreg
jmp L(b0)
C f3 falls through into the top of the loop instead of jumping.
L(f3): mulx 0*8(ap), s0, s2
lea 1*8(ap), ap
lea 1*8(rp), rp
mulx 0*8(ap), s1, s3
lea L(f2)(%rip), jmpreg
C addmul_1 loop, unrolled eight times, running two independent carry
C chains: adcx (CF) links the high limb of one product to the low limb of
C the next, adox (OF) adds the limb already stored at rp.  The count is
C updated with lea and tested with jrcxz because neither instruction
C modifies the flags.
ALIGN(32)
L(top): adox -1*8(rp), s0
adcx s2, s1
mov s0, -1*8(rp)
jrcxz L(end)
L(b2): mulx 1*8(ap), s0, s2
adox 0*8(rp), s1
lea -1(n), R32(n)
mov s1, 0*8(rp)
adcx s3, s0
L(b1): mulx 2*8(ap), s1, s3
adcx s2, s1
adox 1*8(rp), s0
mov s0, 1*8(rp)
L(b0): mulx 3*8(ap), s0, s2
lea 8*8(ap), ap
adcx s3, s0
adox 2*8(rp), s1
mov s1, 2*8(rp)
L(b7): mulx -4*8(ap), s1, s3
adox 3*8(rp), s0
adcx s2, s1
mov s0, 3*8(rp)
L(b6): mulx -3*8(ap), s0, s2
adcx s3, s0
adox 4*8(rp), s1
mov s1, 4*8(rp)
L(b5): mulx -2*8(ap), s1, s3
adox 5*8(rp), s0
adcx s2, s1
mov s0, 5*8(rp)
L(b4): adox 6*8(rp), s1
mulx -1*8(ap), s0, s2
mov s1, 6*8(rp)
lea 8*8(rp), rp
adcx s3, s0
mulx 0*8(ap), s1, s3
jmp L(top)
L(f5): mulx 0*8(ap), s0, s2
lea 3*8(ap), ap
lea -5*8(rp), rp
lea L(f4)(%rip), jmpreg
jmp L(b5)
L(f4): mulx 0*8(ap), s1, s3
lea 2*8(ap), ap
lea -6*8(rp), rp
lea L(f3)(%rip), jmpreg
jmp L(b4)
C f2 lowers the reload value nn by one, so later passes run one fewer
C loop trip.
L(f2): mulx 0*8(ap), s1, s3
lea -1(nn), R32(nn)
lea L(f1)(%rip), jmpreg
jmp L(b2)
C f1 leaves for the straight-line corner once the reloaded count n is zero.
L(f1): mulx 0*8(ap), s0, s2
jrcxz L(cor)
lea -1*8(ap), ap
lea -1*8(rp), rp
lea L(f0)(%rip), jmpreg
jmp L(b1)
C Register names for the corner.  The loop state registers jmpreg, mm, nn
C and n are reused here as plain scratch (t6..t9).  t0 and t1 match the
C pair (s0, s2) that L(f1) has just filled with the first product.
define(`t0', `s0')
define(`t1', `s2')
define(`t2', `s1')
define(`t3', `s3')
define(`t4', `s4')
define(`t5', `s5')
define(`t6', `jmpreg')
define(`t7', `mm')
define(`t8', `nn')
define(`t9', `n')
define(`tx', `sx')
C Corner: the last eight rows in straight-line code.  The first row here
C adds eight full products plus the low limb of a ninth to rp[0..7] and tx;
C from then on the running limbs stay in registers and each row stores
C exactly one finished limb.  CF and OF are clear on entry (cleared by the
C or that reloaded n).
L(cor): mulx 1*8(ap), t2, t3
mulx 2*8(ap), t4, t5
adcx 0*8(rp), t0
adox t1, t2
mulx 3*8(ap), t6, t7
adcx 1*8(rp), t2
adox t3, t4
mulx 4*8(ap), t8, t9
adcx 2*8(rp), t4
adox t5, t6
mulx 5*8(ap), t1, t3
adcx 3*8(rp), t6
adox t7, t8
mulx 6*8(ap), t5, t7
mov t0, 0*8(rp)
adcx 4*8(rp), t8
adox t9, t1
mulx 7*8(ap), t0, t9
adcx 5*8(rp), t1
adox t3, t5
mulx 8*8(ap), t3, %rdx C %rdx unused
adcx 6*8(rp), t5
adox t7, t0
adcx 7*8(rp), t0
adox t9, tx
C The number lists below name the t registers that hold the limbs not yet
C stored, from lowest to highest (x is tx).
C 2, 4, 6, 8, 1, 5, 0, x
C Next row: seven full products and one low limb.  The pending adc folds
C the previous row's low-only product into tx before test clears CF and OF
C for the new pair of carry chains.
mov 1*8(bp), %rdx
mulx 0*8(ap), t7, t9
adc t3, tx
test %al, %al C Reset OF and CF
adcx t7, t2
adox t9, t4
mov t2, 1*8(rp)
mulx 1*8(ap), t7, t9
mulx 2*8(ap), t2, t3
adcx t7, t4
adox t9, t6
mulx 3*8(ap), t7, t9
adcx t2, t6
adox t3, t8
mulx 4*8(ap), t2, t3
adcx t7, t8
adox t9, t1
mulx 5*8(ap), t7, t9
adcx t2, t1
adox t3, t5
mulx 6*8(ap), t2, t3
adcx t7, t5
adox t9, t0
mulx 7*8(ap), t7, t9 C t9 unused
adcx t2, t0
adox t3, tx
C 4, 6, 8, 1, 5, 0, x
C Next row: six full products and one low limb.
mov 2*8(bp), %rdx
mulx 0*8(ap), t2, t3
adc t7, tx
test %al, %al
mulx 1*8(ap), t7, t9
adcx t2, t4
adox t3, t6
mov t4, 2*8(rp)
mulx 2*8(ap), t2, t3
adcx t7, t6
adox t9, t8
mulx 3*8(ap), t7, t9
adcx t2, t8
adox t3, t1
mulx 4*8(ap), t2, t3
adcx t7, t1
adox t9, t5
mulx 5*8(ap), t7, t9
adcx t2, t5
adox t3, t0
mulx 6*8(ap), t2, t3 C t3 unused
adcx t7, t0
adox t9, tx
C 6, 8, 1, 5, 0, x
C Next row: five full products and one low limb.
mov 3*8(bp), %rdx
mulx 0*8(ap), t4, t7
adc t2, tx
test %al, %al
mulx 1*8(ap), t2, t3
adcx t4, t6
adox t7, t8
mov t6, 3*8(rp)
mulx 2*8(ap), t4, t7
mulx 3*8(ap), t6, t9
adcx t2, t8
adox t3, t1
mulx 4*8(ap), t2, t3
adcx t4, t1
adox t7, t5
mulx 5*8(ap), t4, t7 C t7 unused
adcx t6, t5
adox t9, t0
adcx t2, t0
adox t3, tx
C 8, 1, 5, 0, x
C Next row: four full products and one low limb.  The OF chain ends with
C the adox into tx, after which a plain adc finishes the CF chain.
mov 4*8(bp), %rdx
mulx 0*8(ap), t2, t3
adc t4, tx
test %al, %al
mulx 1*8(ap), t4, t6
mulx 2*8(ap), t7, t9
adcx t2, t8
adox t3, t1
mulx 3*8(ap), t2, t3
adcx t4, t1
adox t6, t5
mulx 4*8(ap), t4, t6 C t6 unused
adcx t7, t5
adox t9, t0
mov t8, 4*8(rp)
adox t3, tx
adc t2, t0
C 1, 5, 0, x
C Next row: three full products; the low-only product is formed with imul
C directly in %rdx.  From here on two sequential add/adc chains replace the
C adcx/adox pair.
mov 5*8(bp), %rdx
mulx 0*8(ap), t2, t3
adc t4, tx
mulx 1*8(ap), t4, t6
mulx 2*8(ap), t7, t8
imul 3*8(ap), %rdx
add t2, t1
adc t3, t5
mov t1, 5*8(rp)
adc t6, t0
adc t8, tx
add t4, t5
adc t7, t0
adc %rdx, tx
C 5, 0, x
C Next row: two full products and one low limb.
mov 6*8(bp), %rdx
mulx 0*8(ap), t1, t2
mulx 1*8(ap), t3, t4
imul 2*8(ap), %rdx
add t1, t5
adc t2, t0
mov t5, 6*8(rp)
adc t4, tx
add t3, t0
adc %rdx, tx
C 0, x
C Last row: one full product and one low limb.  The callee-saved registers
C are restored first; t0, t1, t2, tx and %rdx are not among them, so the
C final additions can follow the pops.
mov 7*8(bp), %rdx
mulx 0*8(ap), t1, t2
imul 1*8(ap), %rdx
pop jmpreg
pop s5
pop s4
pop s3
pop nn
pop mm
add t1, t0
adc t2, tx
mov t0, 7*8(rp)
add %rdx, tx
ret
EPILOGUE()
C Jump table for the mul_1 dispatch, indexed by n mod 8: entry k is the
C address of L(mfK).  With PIC the dispatch code reads 4-byte entries and
C adds the table base; otherwise it jumps through 8-byte entries.  JMPENT
C and JUMPTABSECT are defined outside this file.
JUMPTABSECT
ALIGN(8)
L(mtab):JMPENT( L(mf0), L(mtab))
JMPENT( L(mf1), L(mtab))
JMPENT( L(mf2), L(mtab))
JMPENT( L(mf3), L(mtab))
JMPENT( L(mf4), L(mtab))
JMPENT( L(mf5), L(mtab))
JMPENT( L(mf6), L(mtab))
JMPENT( L(mf7), L(mtab))