// fff 0.3.1
//
// Library for building and interfacing with finite fields.
// Documentation
// mul_256 a b -- full product A*B, 256 x 256 -> 512 bit
// Schoolbook multiplication of four 64b limbs, one row per limb of a.
//   a, b     memory operands written without displacement; the macro
//            prefixes them with 0x00 .. 0x18 to reach the four limbs,
//            least significant limb first
//   result   in r8 - r15, r8 being the least significant limb
//   clobbers rax (left at zero), rbx, rbp, rdx and the flags
// Uses mulx (BMI2) and adcx/adox (ADX); mulx takes rdx as its implicit
// multiplicand, which is why each row starts by loading a limb of a into rdx.
.macro mul_256 a  b
   // row 0: r8..r12 = a[0] * b. The xor clears CF and OF and leaves
   // rax = 0; only the CF chain (adcx) is needed for this row.
   xor    %rax,       %rax
   mov    0x00\a,     %rdx
   mulx   0x00\b,     %r8,   %r9
   mulx   0x08\b,     %rbx,  %r10
   adcx   %rbx,       %r9
   mulx   0x10\b,     %rbx,  %r11
   adcx   %rbx,       %r10
   mulx   0x18\b,     %rbx,  %r12
   adcx   %rbx,       %r11
   adcx   %rax,       %r12
   // row 1: r9..r13 += a[1] * b. Low halves (rbp) travel on the CF chain
   // via adcx, high halves (rbx) on the OF chain via adox, so the two
   // additions per partial product do not serialize on one carry flag.
   xor    %rax,       %rax
   mov    0x08\a,     %rdx
   mulx   0x00\b,     %rbp,  %rbx
   adcx   %rbp,       %r9
   adox   %rbx,       %r10
   mulx   0x08\b,     %rbp,  %rbx
   adcx   %rbp,       %r10
   adox   %rbx,       %r11
   mulx   0x10\b,     %rbp,  %rbx
   adcx   %rbp,       %r11
   adox   %rbx,       %r12
   // the last high half lands directly in the fresh top limb r13; the two
   // adds of rax (zero) then fold the pending OF and CF carries into it
   mulx   0x18\b,     %rbp,  %r13
   adcx   %rbp,       %r12
   adox   %rax,       %r13
   adcx   %rax,       %r13
   // row 2: r10..r14 += a[2] * b, same pattern with r14 as the new top limb
   xor    %rax,       %rax
   mov    0x10\a,     %rdx
   mulx   0x00\b,     %rbp,  %rbx
   adcx   %rbp,       %r10
   adox   %rbx,       %r11
   mulx   0x08\b,     %rbp,  %rbx
   adcx   %rbp,       %r11
   adox   %rbx,       %r12
   mulx   0x10\b,     %rbp,  %rbx
   adcx   %rbp,       %r12
   adox   %rbx,       %r13
   mulx   0x18\b,     %rbp,  %r14
   adcx   %rbp,       %r13
   adox   %rax,       %r14
   adcx   %rax,       %r14
   // row 3: r11..r15 += a[3] * b, same pattern with r15 as the new top limb
   xor    %rax,       %rax
   mov    0x18\a,     %rdx
   mulx   0x00\b,     %rbp,  %rbx
   adcx   %rbp,       %r11
   adox   %rbx,       %r12
   mulx   0x08\b,     %rbp,  %rbx
   adcx   %rbp,       %r12
   adox   %rbx,       %r13
   mulx   0x10\b,     %rbp,  %rbx
   adcx   %rbp,       %r13
   adox   %rbx,       %r14
   mulx   0x18\b,     %rbp,  %r15
   adcx   %rbp,       %r14
   adox   %rax,       %r15
   adcx   %rax,       %r15
.endm

// Montgomery reduction
// expects multiplication result in r8 - r15 (r8 = least significant limb)
// See algo 14.32 from Handbook of Applied Cryptography
//   res      memory operand written without displacement; the four result
//            limbs are stored at 0x00 .. 0x18 from it. It must not be based
//            on rax, rbx, rbp, rdx or rsi (all are overwritten before the
//            store), and it is evaluated while rsi is still pushed.
//   name     unused; still accepted so existing invocations keep assembling
//   reads    the modulus (quads 0 - 3) and the Montgomery constant (quad 4)
//            from the table at .LM
//   clobbers rax, rbx, rbp, rdx, r8 - r15 and the flags; rsi is preserved
// Each of the four rounds picks a multiplier u = T[i] * m' mod 2^64 and adds
// u * m at limb position i, which zeroes limb i. Low halves of the partial
// products use the OF chain (adox), high halves the CF chain (adcx).
// The closing conditional subtraction is done with cmov instead of a branch,
// so neither control flow nor the number of stores depends on the value
// being reduced.
.macro red_256 res name
   push   %rsi
   lea    .LM(%rip),   %rsi
   // clear CF and OF once and keep rax = 0 for carry propagation. No later
   // round clears the flags again: each round flushes both carries up
   // through r15 and relies on nothing being carried out of r15.
   xor    %rax,        %rax
   // round 0: u = r8 * m' (only the low half in rdx is kept), T += u * m
   mov    0x20(%rsi),  %rdx
   mulx   %r8,         %rdx,  %rbp
   mulx   0x00(%rsi),  %rbp,  %rbx
   adox   %rbp,        %r8
   adcx   %rbx,        %r9
   mulx   0x08(%rsi),  %rbp,  %rbx
   adox   %rbp,        %r9
   adcx   %rbx,        %r10
   mulx   0x10(%rsi),  %rbp,  %rbx
   adox   %rbp,        %r10
   adcx   %rbx,        %r11
   mulx   0x18(%rsi),  %rbp,  %rbx
   adox   %rbp,        %r11
   adcx   %rbx,        %r12
   // flush both carry chains through the remaining upper limbs
   adox   %rax,        %r12
   adcx   %rax,        %r13
   adox   %rax,        %r13
   adcx   %rax,        %r14
   adox   %rax,        %r14
   adcx   %rax,        %r15
   adox   %rax,        %r15
   // round 1: u = r9 * m', T += u * m shifted by one limb
   mov    0x20(%rsi),  %rdx
   mulx   %r9,         %rdx,  %rbp
   mulx   0x00(%rsi),  %rbp,  %rbx
   adox   %rbp,        %r9
   adcx   %rbx,        %r10
   mulx   0x08(%rsi),  %rbp,  %rbx
   adox   %rbp,        %r10
   adcx   %rbx,        %r11
   mulx   0x10(%rsi),  %rbp,  %rbx
   adox   %rbp,        %r11
   adcx   %rbx,        %r12
   mulx   0x18(%rsi),  %rbp,  %rbx
   adox   %rbp,        %r12
   adcx   %rbx,        %r13
   adox   %rax,        %r13
   adcx   %rax,        %r14
   adox   %rax,        %r14
   adcx   %rax,        %r15
   adox   %rax,        %r15
   // round 2: u = r10 * m', T += u * m shifted by two limbs
   mov    0x20(%rsi),  %rdx
   mulx   %r10,        %rdx,  %rbp
   mulx   0x00(%rsi),  %rbp,  %rbx
   adox   %rbp,        %r10
   adcx   %rbx,        %r11
   mulx   0x08(%rsi),  %rbp,  %rbx
   adox   %rbp,        %r11
   adcx   %rbx,        %r12
   mulx   0x10(%rsi),  %rbp,  %rbx
   adox   %rbp,        %r12
   adcx   %rbx,        %r13
   mulx   0x18(%rsi),  %rbp,  %rbx
   adox   %rbp,        %r13
   adcx   %rbx,        %r14
   adox   %rax,        %r14
   adcx   %rax,        %r15
   adox   %rax,        %r15
   // round 3: u = r11 * m', T += u * m shifted by three limbs.
   // r8 - r10 are no longer needed as accumulator limbs and r11 is dead
   // after its adox, so the modulus limbs are loaded into r8 - r11 here
   // and reused by the final subtraction.
   mov    0x20(%rsi),  %rdx
   mulx   %r11,        %rdx,  %rbp
   mov    0x00(%rsi),  %r8
   mulx   %r8,         %rbp,  %rbx
   adox   %rbp,        %r11
   adcx   %rbx,        %r12
   mov    0x08(%rsi),  %r9
   mulx   %r9,         %rbp,  %rbx
   adox   %rbp,        %r12
   adcx   %rbx,        %r13
   mov    0x10(%rsi),  %r10
   mulx   %r10,        %rbp,  %rbx
   adox   %rbp,        %r13
   adcx   %rbx,        %r14
   mov    0x18(%rsi),  %r11
   mulx   %r11,        %rbp,  %rbx
   adox   %rbp,        %r14
   adcx   %rbx,        %r15
   adox   %rax,        %r15
   // r12 - r15 now hold the reduced value t, which may still be >= m.
   // Compute t - m in scratch registers; mov leaves the flags alone and
   // the sub/sbb chain ends with CF = 1 exactly when t < m.
   mov    %r12,        %rax
   mov    %r13,        %rbx
   mov    %r14,        %rbp
   mov    %r15,        %rdx
   sub    %r8,         %rax
   sbb    %r9,         %rbx
   sbb    %r10,        %rbp
   sbb    %r11,        %rdx
   // no borrow (CF = 0): t >= m, take t - m; otherwise keep t
   cmovae %rax,        %r12
   cmovae %rbx,        %r13
   cmovae %rbp,        %r14
   cmovae %rdx,        %r15
   mov    %r12,        0x00\res
   mov    %r13,        0x08\res
   mov    %r14,        0x10\res
   mov    %r15,        0x18\res
   pop    %rsi
.endm

// mod_mul_256 a b res name -- multiply, then Montgomery-reduce
//   a, b     source operands, handed to mul_256 (product left in r8 - r15)
//   res      destination operand, handed to red_256
//   name     passed through to red_256 unchanged
// Register clobbers are those of mul_256 and red_256 combined.
.macro mod_mul_256 a b res name
   mul_256 \a, \b
   red_256 \res, \name
.endm

// Constant table read by red_256 (addressed through rsi).
//   0x00 - 0x18  modulus m as four 64b limbs, least significant limb first;
//                BLS12-381 G1 order r used as modulus
//   0x20         Montgomery constant -m^-1 mod b, where b = 2^64 is the
//                limb base
.LM:
   .quad 0xffffffff00000001
   .quad 0x53bda402fffe5bfe
   .quad 0x3339d80809a1d805
   .quad 0x73eda753299d7d48
   .quad 0xfffffffeffffffff
        
// mod_mul_4w -- exported entry point: result = Montgomery product of x and y
//   rdi = x, rsi = y, rdx = result; each is dereferenced as four 64b limbs
//   (offsets 0x00 .. 0x18). All of x and y is read before result is written.
//   Mach-O builds export the symbol with a leading underscore.
#ifdef __APPLE__
.global _mod_mul_4w
_mod_mul_4w:
#else
.global mod_mul_4w
mod_mul_4w:
#endif
   // the macros load rdx as the implicit mulx operand, so the result
   // pointer is parked in rcx before anything else happens
   mov  %rdx,  %rcx

   // save every callee-saved register the macros use as scratch
   push %rbx
   push %rbp
   push %r12
   push %r13
   push %r14
   push %r15

   // 512-bit product of x and y into r8 - r15 ...
   mul_256 (%rdi), (%rsi)
   // ... then reduce it and store the four limbs through rcx
   red_256 (%rcx), mm

   // restore in reverse order of the saves
   pop  %r15
   pop  %r14
   pop  %r13
   pop  %r12
   pop  %rbp
   pop  %rbx
   ret