#ifdef __GNUC__
#pragma once
#include <inttypes.h>
/*
 * add1_inline: add a 64-bit scalar to a 4-limb (256-bit) integer.
 *
 *   arg0  destination, 4 little-endian 64-bit limbs (written)
 *   arg1  source operand, 4 little-endian 64-bit limbs (read)
 *   arg2  64-bit value added to the lowest limb
 *
 * Returns the carry out of the top limb (0 or 1).
 *
 * Uses ADCX, so the CPU must support the ADX extension.
 */
static inline uint64_t add1_inline (uint64_t* arg0, uint64_t* arg1, uint64_t arg2) {
  /* The asm text names these registers directly, so each operand is pinned.
     `__asm__` is used instead of `asm` because `asm` is not a keyword in
     strict ISO modes (e.g. -std=c11) and would not compile there. */
  register uint64_t* arg0_r __asm__("rdi") = arg0;
  register uint64_t* arg1_r __asm__("rsi") = arg1;
  register uint64_t arg2_r __asm__("rdx") = arg2;
  register uint64_t carry_r __asm__("rax");
  __asm__ __volatile__(
    /* Zero the temporaries and the carry register. */
    " xor %%r8, %%r8;"
    " xor %%r9, %%r9;"
    " xor %%r10, %%r10;"
    " xor %%r11, %%r11;"
    " xor %%rax, %%rax;"
    /* Limb 0: scalar + arg1[0]; this ADD starts the carry chain. */
    " addq 0(%%rsi), %%rdx;"
    " movq %%rdx, 0(%%rdi);"
    /* Limbs 1..3: 0 + arg1[i] + carry. */
    " adcxq 8(%%rsi), %%r8;"
    " movq %%r8, 8(%%rdi);"
    " adcxq 16(%%rsi), %%r9;"
    " movq %%r9, 16(%%rdi);"
    " adcxq 24(%%rsi), %%r10;"
    " movq %%r10, 24(%%rdi);"
    /* rax = 0 + 0 + CF: the final carry. */
    " adcx %%r11, %%rax;"
  : "+r" (arg2_r), "=r" (carry_r)
  : "r" (arg0_r), "r" (arg1_r)
  : "%r8", "%r9", "%r10", "%r11", "memory", "cc"
  );
  return carry_r;
}
/*
 * fadd_inline: 4-limb addition with the carry folded back in as 38.
 *
 *   arg0  destination, 4 little-endian 64-bit limbs (written)
 *   arg1  first addend, 4 limbs (read)
 *   arg2  second addend, 4 limbs (read)
 *
 * Computes arg1 + arg2 over 256 bits. Whenever a carry leaves bit 256 the
 * constant 38 is added to the low limb instead (2^256 = 38 mod 2^255 - 19),
 * so the stored value is congruent to the sum modulo 2^255 - 19. The result
 * fits in 4 limbs but is not brought to a canonical (fully reduced) form.
 *
 * Uses ADCX, so the CPU must support the ADX extension.
 */
static inline void fadd_inline (uint64_t* arg0, uint64_t* arg1, uint64_t* arg2) {
  /* Operands are pinned to the registers named in the asm text. `__asm__`
     is used instead of `asm` so the declaration also compiles in strict ISO
     modes (e.g. -std=c11), where `asm` is not a keyword. */
  register uint64_t* arg0_r __asm__("rdi") = arg0;
  register uint64_t* arg1_r __asm__("rsi") = arg1;
  register uint64_t* arg2_r __asm__("rdx") = arg2;
  __asm__ __volatile__(
    /* r8..r11 = arg2 + arg1, limb by limb with carry. */
    " movq 0(%%rdx), %%r8;"
    " addq 0(%%rsi), %%r8;"
    " movq 8(%%rdx), %%r9;"
    " adcxq 8(%%rsi), %%r9;"
    " movq 16(%%rdx), %%r10;"
    " adcxq 16(%%rsi), %%r10;"
    " movq 24(%%rdx), %%r11;"
    " adcxq 24(%%rsi), %%r11;"
    /* rax = carry ? 38 : 0 (MOV leaves the flags intact; rdx is reused). */
    " mov $0, %%rax;"
    " mov $38, %%rdx;"
    " cmovc %%rdx, %%rax;"
    /* Fold the carry in and propagate it through the upper limbs. */
    " xor %%rcx, %%rcx;"
    " add %%rax, %%r8;"
    " adcx %%rcx, %%r9;"
    " movq %%r9, 8(%%rdi);"
    " adcx %%rcx, %%r10;"
    " movq %%r10, 16(%%rdi);"
    " adcx %%rcx, %%r11;"
    " movq %%r11, 24(%%rdi);"
    /* If the fold itself carried out, add 38 once more to limb 0. */
    " mov $0, %%rax;"
    " cmovc %%rdx, %%rax;"
    " add %%rax, %%r8;"
    " movq %%r8, 0(%%rdi);"
  : "+r" (arg2_r)
  : "r" (arg0_r), "r" (arg1_r)
  : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc"
  );
}
/*
 * fsub_inline: 4-limb subtraction with the borrow folded back in as 38.
 *
 *   arg0  destination, 4 little-endian 64-bit limbs (written)
 *   arg1  minuend, 4 limbs (read)
 *   arg2  subtrahend, 4 limbs (read)
 *
 * Computes arg1 - arg2 over 256 bits. Whenever a borrow leaves bit 256 the
 * constant 38 is subtracted from the low limb (2^256 = 38 mod 2^255 - 19),
 * so the stored value is congruent to the difference modulo 2^255 - 19. The
 * result fits in 4 limbs but is not brought to a canonical form.
 */
static inline void fsub_inline (uint64_t* arg0, uint64_t* arg1, uint64_t* arg2) {
  /* Operands are pinned to the registers named in the asm text. `__asm__`
     is used instead of `asm` so the declaration also compiles in strict ISO
     modes (e.g. -std=c11), where `asm` is not a keyword. */
  register uint64_t* arg0_r __asm__("rdi") = arg0;
  register uint64_t* arg1_r __asm__("rsi") = arg1;
  register uint64_t* arg2_r __asm__("rdx") = arg2;
  __asm__ __volatile__(
    /* r8..r11 = arg1 - arg2, limb by limb with borrow. */
    " movq 0(%%rsi), %%r8;"
    " subq 0(%%rdx), %%r8;"
    " movq 8(%%rsi), %%r9;"
    " sbbq 8(%%rdx), %%r9;"
    " movq 16(%%rsi), %%r10;"
    " sbbq 16(%%rdx), %%r10;"
    " movq 24(%%rsi), %%r11;"
    " sbbq 24(%%rdx), %%r11;"
    /* rax = borrow ? 38 : 0 (MOV leaves the flags intact). */
    " mov $0, %%rax;"
    " mov $38, %%rcx;"
    " cmovc %%rcx, %%rax;"
    /* Fold the borrow in and propagate it through the upper limbs. */
    " sub %%rax, %%r8;"
    " sbb $0, %%r9;"
    " sbb $0, %%r10;"
    " sbb $0, %%r11;"
    /* If the fold itself borrowed, subtract 38 once more from limb 0. */
    " mov $0, %%rax;"
    " cmovc %%rcx, %%rax;"
    " sub %%rax, %%r8;"
    /* Store the result. */
    " movq %%r8, 0(%%rdi);"
    " movq %%r9, 8(%%rdi);"
    " movq %%r10, 16(%%rdi);"
    " movq %%r11, 24(%%rdi);"
  :
  : "r" (arg0_r), "r" (arg1_r), "r" (arg2_r)
  : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc"
  );
}
/*
 * fmul_inline: 4-limb x 4-limb multiplication followed by a fold of the
 * upper half with the factor 38 (2^256 = 38 mod 2^255 - 19).
 *
 *   arg0  scratch buffer; 8 limbs (64 bytes) are written with the full
 *         512-bit product
 *   arg1  first factor, 4 little-endian 64-bit limbs (read)
 *   arg2  destination, 4 limbs (written)
 *   arg3  second factor, 4 limbs (read)
 *
 * The scratch buffer is written while arg1/arg3 are still being read, so it
 * should not overlap either factor.
 * The stored result is congruent to arg1 * arg3 modulo 2^255 - 19 and fits
 * in 4 limbs; it is not brought to a canonical form.
 *
 * Uses MULX (BMI2) and ADCX/ADOX (ADX).
 */
static inline void fmul_inline (uint64_t* arg0, uint64_t* arg1, uint64_t* arg2, uint64_t* arg3) {
  /* Operands are pinned to the registers named in the asm text. `__asm__`
     is used instead of `asm` so the declaration also compiles in strict ISO
     modes (e.g. -std=c11), where `asm` is not a keyword. */
  register uint64_t* arg0_r __asm__("rdi") = arg0;
  register uint64_t* arg1_r __asm__("rsi") = arg1;
  register uint64_t* arg2_r __asm__("rdx") = arg2;
  register uint64_t* arg3_r __asm__("rcx") = arg3;
  __asm__ __volatile__(
    /* rdx is MULX's implicit source, so park the destination in r15. */
    " mov %%rdx, %%r15;"
    /* Row 0: arg1[0] * arg3[0..3]. */
    " movq 0(%%rsi), %%rdx;"
    " mulxq 0(%%rcx), %%r8, %%r9;"
    " xor %%r10, %%r10;"
    " movq %%r8, 0(%%rdi);"
    " mulxq 8(%%rcx), %%r10, %%r11;"
    " adox %%r9, %%r10;"
    " movq %%r10, 8(%%rdi);"
    " mulxq 16(%%rcx), %%r12, %%r13;"
    " adox %%r11, %%r12;"
    " mulxq 24(%%rcx), %%r14, %%rdx;"
    " adox %%r13, %%r14;"
    " mov $0, %%rax;"
    " adox %%rdx, %%rax;"
    /* Row 1: arg1[1] * arg3[0..3], accumulated onto row 0. */
    " movq 8(%%rsi), %%rdx;"
    " mulxq 0(%%rcx), %%r8, %%r9;"
    " xor %%r10, %%r10;"
    " adcxq 8(%%rdi), %%r8;"
    " movq %%r8, 8(%%rdi);"
    " mulxq 8(%%rcx), %%r10, %%r11;"
    " adox %%r9, %%r10;"
    " adcx %%r12, %%r10;"
    " movq %%r10, 16(%%rdi);"
    " mulxq 16(%%rcx), %%r12, %%r13;"
    " adox %%r11, %%r12;"
    " adcx %%r14, %%r12;"
    " mov $0, %%r8;"
    " mulxq 24(%%rcx), %%r14, %%rdx;"
    " adox %%r13, %%r14;"
    " adcx %%rax, %%r14;"
    " mov $0, %%rax;"
    " adox %%rdx, %%rax;"
    " adcx %%r8, %%rax;"
    /* Row 2: arg1[2] * arg3[0..3]. */
    " movq 16(%%rsi), %%rdx;"
    " mulxq 0(%%rcx), %%r8, %%r9;"
    " xor %%r10, %%r10;"
    " adcxq 16(%%rdi), %%r8;"
    " movq %%r8, 16(%%rdi);"
    " mulxq 8(%%rcx), %%r10, %%r11;"
    " adox %%r9, %%r10;"
    " adcx %%r12, %%r10;"
    " movq %%r10, 24(%%rdi);"
    " mulxq 16(%%rcx), %%r12, %%r13;"
    " adox %%r11, %%r12;"
    " adcx %%r14, %%r12;"
    " mov $0, %%r8;"
    " mulxq 24(%%rcx), %%r14, %%rdx;"
    " adox %%r13, %%r14;"
    " adcx %%rax, %%r14;"
    " mov $0, %%rax;"
    " adox %%rdx, %%rax;"
    " adcx %%r8, %%rax;"
    /* Row 3: arg1[3] * arg3[0..3]; the upper four limbs are stored too. */
    " movq 24(%%rsi), %%rdx;"
    " mulxq 0(%%rcx), %%r8, %%r9;"
    " xor %%r10, %%r10;"
    " adcxq 24(%%rdi), %%r8;"
    " movq %%r8, 24(%%rdi);"
    " mulxq 8(%%rcx), %%r10, %%r11;"
    " adox %%r9, %%r10;"
    " adcx %%r12, %%r10;"
    " movq %%r10, 32(%%rdi);"
    " mulxq 16(%%rcx), %%r12, %%r13;"
    " adox %%r11, %%r12;"
    " adcx %%r14, %%r12;"
    " movq %%r12, 40(%%rdi);"
    " mov $0, %%r8;"
    " mulxq 24(%%rcx), %%r14, %%rdx;"
    " adox %%r13, %%r14;"
    " adcx %%rax, %%r14;"
    " movq %%r14, 48(%%rdi);"
    " mov $0, %%rax;"
    " adox %%rdx, %%rax;"
    " adcx %%r8, %%rax;"
    " movq %%rax, 56(%%rdi);"
    /* From here on rsi = scratch (the product), rdi = destination. */
    " mov %%rdi, %%rsi;"
    " mov %%r15, %%rdi;"
    /* r8..r11, rax = low half + 38 * high half. */
    " mov $38, %%rdx;"
    " mulxq 32(%%rsi), %%r8, %%r13;"
    " xor %%rcx, %%rcx;"
    " adoxq 0(%%rsi), %%r8;"
    " mulxq 40(%%rsi), %%r9, %%r12;"
    " adcx %%r13, %%r9;"
    " adoxq 8(%%rsi), %%r9;"
    " mulxq 48(%%rsi), %%r10, %%r13;"
    " adcx %%r12, %%r10;"
    " adoxq 16(%%rsi), %%r10;"
    " mulxq 56(%%rsi), %%r11, %%rax;"
    " adcx %%r13, %%r11;"
    " adoxq 24(%%rsi), %%r11;"
    " adcx %%rcx, %%rax;"
    " adox %%rcx, %%rax;"
    /* Fold the fifth limb (rax) back in as rax * 38. */
    " imul %%rdx, %%rax;"
    " add %%rax, %%r8;"
    " adcx %%rcx, %%r9;"
    " movq %%r9, 8(%%rdi);"
    " adcx %%rcx, %%r10;"
    " movq %%r10, 16(%%rdi);"
    " adcx %%rcx, %%r11;"
    " movq %%r11, 24(%%rdi);"
    /* If that fold carried out, add 38 once more to limb 0. */
    " mov $0, %%rax;"
    " cmovc %%rdx, %%rax;"
    " add %%rax, %%r8;"
    " movq %%r8, 0(%%rdi);"
  : "+r" (arg0_r), "+r" (arg1_r), "+r" (arg2_r), "+r" (arg3_r)
  :
  : "%rax", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "memory", "cc"
  );
}
/*
 * fmul2_inline: two independent 4-limb x 4-limb multiplications, each
 * followed by a fold of the upper half with the factor 38
 * (2^256 = 38 mod 2^255 - 19).
 *
 *   arg0  scratch buffer; 16 limbs (128 bytes) are written with the two
 *         512-bit products
 *   arg1  two first factors back to back: limbs 0..3 and limbs 4..7 (read)
 *   arg2  destination: limbs 0..3 receive the first result, limbs 4..7 the
 *         second (written)
 *   arg3  two second factors back to back: limbs 0..3 and limbs 4..7 (read)
 *
 * Computes arg2[0..3] = arg1[0..3] * arg3[0..3] and
 *          arg2[4..7] = arg1[4..7] * arg3[4..7], each congruent to the true
 * product modulo 2^255 - 19 and not brought to a canonical form.
 * The scratch buffer is written while arg1/arg3 are still being read, so it
 * should not overlap either of them.
 *
 * Uses MULX (BMI2) and ADCX/ADOX (ADX).
 */
static inline void fmul2_inline (uint64_t* arg0, uint64_t* arg1, uint64_t* arg2, uint64_t* arg3) {
  /* Operands are pinned to the registers named in the asm text. `__asm__`
     is used instead of `asm` so the declaration also compiles in strict ISO
     modes (e.g. -std=c11), where `asm` is not a keyword. */
  register uint64_t* arg0_r __asm__("rdi") = arg0;
  register uint64_t* arg1_r __asm__("rsi") = arg1;
  register uint64_t* arg2_r __asm__("rdx") = arg2;
  register uint64_t* arg3_r __asm__("rcx") = arg3;
  __asm__ __volatile__(
    /* rdx is MULX's implicit source, so park the destination in r15. */
    " mov %%rdx, %%r15;"
    /* ---- First product: arg1[0..3] * arg3[0..3] -> scratch[0..7] ---- */
    /* Row 0: arg1[0] * arg3[0..3]. */
    " movq 0(%%rsi), %%rdx;"
    " mulxq 0(%%rcx), %%r8, %%r9;"
    " xor %%r10, %%r10;"
    " movq %%r8, 0(%%rdi);"
    " mulxq 8(%%rcx), %%r10, %%r11;"
    " adox %%r9, %%r10;"
    " movq %%r10, 8(%%rdi);"
    " mulxq 16(%%rcx), %%r12, %%r13;"
    " adox %%r11, %%r12;"
    " mulxq 24(%%rcx), %%r14, %%rdx;"
    " adox %%r13, %%r14;"
    " mov $0, %%rax;"
    " adox %%rdx, %%rax;"
    /* Row 1: arg1[1] * arg3[0..3]. */
    " movq 8(%%rsi), %%rdx;"
    " mulxq 0(%%rcx), %%r8, %%r9;"
    " xor %%r10, %%r10;"
    " adcxq 8(%%rdi), %%r8;"
    " movq %%r8, 8(%%rdi);"
    " mulxq 8(%%rcx), %%r10, %%r11;"
    " adox %%r9, %%r10;"
    " adcx %%r12, %%r10;"
    " movq %%r10, 16(%%rdi);"
    " mulxq 16(%%rcx), %%r12, %%r13;"
    " adox %%r11, %%r12;"
    " adcx %%r14, %%r12;"
    " mov $0, %%r8;"
    " mulxq 24(%%rcx), %%r14, %%rdx;"
    " adox %%r13, %%r14;"
    " adcx %%rax, %%r14;"
    " mov $0, %%rax;"
    " adox %%rdx, %%rax;"
    " adcx %%r8, %%rax;"
    /* Row 2: arg1[2] * arg3[0..3]. */
    " movq 16(%%rsi), %%rdx;"
    " mulxq 0(%%rcx), %%r8, %%r9;"
    " xor %%r10, %%r10;"
    " adcxq 16(%%rdi), %%r8;"
    " movq %%r8, 16(%%rdi);"
    " mulxq 8(%%rcx), %%r10, %%r11;"
    " adox %%r9, %%r10;"
    " adcx %%r12, %%r10;"
    " movq %%r10, 24(%%rdi);"
    " mulxq 16(%%rcx), %%r12, %%r13;"
    " adox %%r11, %%r12;"
    " adcx %%r14, %%r12;"
    " mov $0, %%r8;"
    " mulxq 24(%%rcx), %%r14, %%rdx;"
    " adox %%r13, %%r14;"
    " adcx %%rax, %%r14;"
    " mov $0, %%rax;"
    " adox %%rdx, %%rax;"
    " adcx %%r8, %%rax;"
    /* Row 3: arg1[3] * arg3[0..3]; the upper four limbs are stored too. */
    " movq 24(%%rsi), %%rdx;"
    " mulxq 0(%%rcx), %%r8, %%r9;"
    " xor %%r10, %%r10;"
    " adcxq 24(%%rdi), %%r8;"
    " movq %%r8, 24(%%rdi);"
    " mulxq 8(%%rcx), %%r10, %%r11;"
    " adox %%r9, %%r10;"
    " adcx %%r12, %%r10;"
    " movq %%r10, 32(%%rdi);"
    " mulxq 16(%%rcx), %%r12, %%r13;"
    " adox %%r11, %%r12;"
    " adcx %%r14, %%r12;"
    " movq %%r12, 40(%%rdi);"
    " mov $0, %%r8;"
    " mulxq 24(%%rcx), %%r14, %%rdx;"
    " adox %%r13, %%r14;"
    " adcx %%rax, %%r14;"
    " movq %%r14, 48(%%rdi);"
    " mov $0, %%rax;"
    " adox %%rdx, %%rax;"
    " adcx %%r8, %%rax;"
    " movq %%rax, 56(%%rdi);"
    /* ---- Second product: arg1[4..7] * arg3[4..7] -> scratch[8..15] ---- */
    /* Row 0: arg1[4] * arg3[4..7]. */
    " movq 32(%%rsi), %%rdx;"
    " mulxq 32(%%rcx), %%r8, %%r9;"
    " xor %%r10, %%r10;"
    " movq %%r8, 64(%%rdi);"
    " mulxq 40(%%rcx), %%r10, %%r11;"
    " adox %%r9, %%r10;"
    " movq %%r10, 72(%%rdi);"
    " mulxq 48(%%rcx), %%r12, %%r13;"
    " adox %%r11, %%r12;"
    " mulxq 56(%%rcx), %%r14, %%rdx;"
    " adox %%r13, %%r14;"
    " mov $0, %%rax;"
    " adox %%rdx, %%rax;"
    /* Row 1: arg1[5] * arg3[4..7]. */
    " movq 40(%%rsi), %%rdx;"
    " mulxq 32(%%rcx), %%r8, %%r9;"
    " xor %%r10, %%r10;"
    " adcxq 72(%%rdi), %%r8;"
    " movq %%r8, 72(%%rdi);"
    " mulxq 40(%%rcx), %%r10, %%r11;"
    " adox %%r9, %%r10;"
    " adcx %%r12, %%r10;"
    " movq %%r10, 80(%%rdi);"
    " mulxq 48(%%rcx), %%r12, %%r13;"
    " adox %%r11, %%r12;"
    " adcx %%r14, %%r12;"
    " mov $0, %%r8;"
    " mulxq 56(%%rcx), %%r14, %%rdx;"
    " adox %%r13, %%r14;"
    " adcx %%rax, %%r14;"
    " mov $0, %%rax;"
    " adox %%rdx, %%rax;"
    " adcx %%r8, %%rax;"
    /* Row 2: arg1[6] * arg3[4..7]. */
    " movq 48(%%rsi), %%rdx;"
    " mulxq 32(%%rcx), %%r8, %%r9;"
    " xor %%r10, %%r10;"
    " adcxq 80(%%rdi), %%r8;"
    " movq %%r8, 80(%%rdi);"
    " mulxq 40(%%rcx), %%r10, %%r11;"
    " adox %%r9, %%r10;"
    " adcx %%r12, %%r10;"
    " movq %%r10, 88(%%rdi);"
    " mulxq 48(%%rcx), %%r12, %%r13;"
    " adox %%r11, %%r12;"
    " adcx %%r14, %%r12;"
    " mov $0, %%r8;"
    " mulxq 56(%%rcx), %%r14, %%rdx;"
    " adox %%r13, %%r14;"
    " adcx %%rax, %%r14;"
    " mov $0, %%rax;"
    " adox %%rdx, %%rax;"
    " adcx %%r8, %%rax;"
    /* Row 3: arg1[7] * arg3[4..7]; the upper four limbs are stored too. */
    " movq 56(%%rsi), %%rdx;"
    " mulxq 32(%%rcx), %%r8, %%r9;"
    " xor %%r10, %%r10;"
    " adcxq 88(%%rdi), %%r8;"
    " movq %%r8, 88(%%rdi);"
    " mulxq 40(%%rcx), %%r10, %%r11;"
    " adox %%r9, %%r10;"
    " adcx %%r12, %%r10;"
    " movq %%r10, 96(%%rdi);"
    " mulxq 48(%%rcx), %%r12, %%r13;"
    " adox %%r11, %%r12;"
    " adcx %%r14, %%r12;"
    " movq %%r12, 104(%%rdi);"
    " mov $0, %%r8;"
    " mulxq 56(%%rcx), %%r14, %%rdx;"
    " adox %%r13, %%r14;"
    " adcx %%rax, %%r14;"
    " movq %%r14, 112(%%rdi);"
    " mov $0, %%rax;"
    " adox %%rdx, %%rax;"
    " adcx %%r8, %%rax;"
    " movq %%rax, 120(%%rdi);"
    /* From here on rsi = scratch (the products), rdi = destination. */
    " mov %%rdi, %%rsi;"
    " mov %%r15, %%rdi;"
    /* ---- Fold the first product: scratch[0..7] -> arg2[0..3] ---- */
    " mov $38, %%rdx;"
    " mulxq 32(%%rsi), %%r8, %%r13;"
    " xor %%rcx, %%rcx;"
    " adoxq 0(%%rsi), %%r8;"
    " mulxq 40(%%rsi), %%r9, %%r12;"
    " adcx %%r13, %%r9;"
    " adoxq 8(%%rsi), %%r9;"
    " mulxq 48(%%rsi), %%r10, %%r13;"
    " adcx %%r12, %%r10;"
    " adoxq 16(%%rsi), %%r10;"
    " mulxq 56(%%rsi), %%r11, %%rax;"
    " adcx %%r13, %%r11;"
    " adoxq 24(%%rsi), %%r11;"
    " adcx %%rcx, %%rax;"
    " adox %%rcx, %%rax;"
    /* Fold the fifth limb (rax) back in as rax * 38. */
    " imul %%rdx, %%rax;"
    " add %%rax, %%r8;"
    " adcx %%rcx, %%r9;"
    " movq %%r9, 8(%%rdi);"
    " adcx %%rcx, %%r10;"
    " movq %%r10, 16(%%rdi);"
    " adcx %%rcx, %%r11;"
    " movq %%r11, 24(%%rdi);"
    /* If that fold carried out, add 38 once more to limb 0. */
    " mov $0, %%rax;"
    " cmovc %%rdx, %%rax;"
    " add %%rax, %%r8;"
    " movq %%r8, 0(%%rdi);"
    /* ---- Fold the second product: scratch[8..15] -> arg2[4..7] ---- */
    " mov $38, %%rdx;"
    " mulxq 96(%%rsi), %%r8, %%r13;"
    " xor %%rcx, %%rcx;"
    " adoxq 64(%%rsi), %%r8;"
    " mulxq 104(%%rsi), %%r9, %%r12;"
    " adcx %%r13, %%r9;"
    " adoxq 72(%%rsi), %%r9;"
    " mulxq 112(%%rsi), %%r10, %%r13;"
    " adcx %%r12, %%r10;"
    " adoxq 80(%%rsi), %%r10;"
    " mulxq 120(%%rsi), %%r11, %%rax;"
    " adcx %%r13, %%r11;"
    " adoxq 88(%%rsi), %%r11;"
    " adcx %%rcx, %%rax;"
    " adox %%rcx, %%rax;"
    /* Fold the fifth limb (rax) back in as rax * 38. */
    " imul %%rdx, %%rax;"
    " add %%rax, %%r8;"
    " adcx %%rcx, %%r9;"
    " movq %%r9, 40(%%rdi);"
    " adcx %%rcx, %%r10;"
    " movq %%r10, 48(%%rdi);"
    " adcx %%rcx, %%r11;"
    " movq %%r11, 56(%%rdi);"
    /* If that fold carried out, add 38 once more to limb 0. */
    " mov $0, %%rax;"
    " cmovc %%rdx, %%rax;"
    " add %%rax, %%r8;"
    " movq %%r8, 32(%%rdi);"
  : "+r" (arg0_r), "+r" (arg1_r), "+r" (arg2_r), "+r" (arg3_r)
  :
  : "%rax", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "memory", "cc"
  );
}
/*
 * fmul1_inline: multiply a 4-limb integer by a 64-bit scalar and fold the
 * overflow limb back in with the factor 38 (2^256 = 38 mod 2^255 - 19).
 *
 *   arg0  destination, 4 little-endian 64-bit limbs (written)
 *   arg1  4-limb factor (read)
 *   arg2  64-bit scalar factor
 *
 * The stored result is congruent to arg1 * arg2 modulo 2^255 - 19 and fits
 * in 4 limbs; it is not brought to a canonical form.
 *
 * NOTE(review): the overflow limb is multiplied by 38 with a 64-bit IMUL
 * whose high half is discarded, so the scalar has to be small enough for
 * that product not to wrap - confirm the bound callers rely on.
 *
 * Uses MULX (BMI2) and ADCX (ADX).
 */
static inline void fmul1_inline (uint64_t* arg0, uint64_t* arg1, uint64_t arg2) {
  /* Operands are pinned to the registers named in the asm text. `__asm__`
     is used instead of `asm` so the declaration also compiles in strict ISO
     modes (e.g. -std=c11), where `asm` is not a keyword. */
  register uint64_t* arg0_r __asm__("rdi") = arg0;
  register uint64_t* arg1_r __asm__("rsi") = arg1;
  register uint64_t arg2_r __asm__("rdx") = arg2;
  __asm__ __volatile__(
    /* r8..r11, rax = arg1 * scalar (rdx is MULX's implicit source). */
    " mulxq 0(%%rsi), %%r8, %%rcx;"
    " mulxq 8(%%rsi), %%r9, %%r12;"
    " add %%rcx, %%r9;"
    " mov $0, %%rcx;"
    " mulxq 16(%%rsi), %%r10, %%r13;"
    " adcx %%r12, %%r10;"
    " mulxq 24(%%rsi), %%r11, %%rax;"
    " adcx %%r13, %%r11;"
    " adcx %%rcx, %%rax;"
    /* Fold the fifth limb (rax) back in as rax * 38. */
    " mov $38, %%rdx;"
    " imul %%rdx, %%rax;"
    " add %%rax, %%r8;"
    " adcx %%rcx, %%r9;"
    " movq %%r9, 8(%%rdi);"
    " adcx %%rcx, %%r10;"
    " movq %%r10, 16(%%rdi);"
    " adcx %%rcx, %%r11;"
    " movq %%r11, 24(%%rdi);"
    /* If that fold carried out, add 38 once more to limb 0. */
    " mov $0, %%rax;"
    " cmovc %%rdx, %%rax;"
    " add %%rax, %%r8;"
    " movq %%r8, 0(%%rdi);"
  : "+r" (arg2_r)
  : "r" (arg0_r), "r" (arg1_r)
  : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "memory", "cc"
  );
}
/*
 * cswap2_inline: conditionally swap two 8-limb (64-byte) buffers.
 *
 *   arg0  selector: the buffers are swapped when it is non-zero and left
 *         unchanged when it is zero
 *   arg1  first buffer, 8 x 64-bit limbs (read and written)
 *   arg2  second buffer, 8 x 64-bit limbs (read and written)
 *
 * The selector is turned into the carry flag once, and every limb is then
 * moved with CMOVC; the instruction sequence contains no branch, and both
 * buffers are loaded and stored in full regardless of the selector.
 */
static inline void cswap2_inline (uint64_t arg0, uint64_t* arg1, uint64_t* arg2) {
  /* Operands are pinned to the registers named in the asm text. `__asm__`
     is used instead of `asm` so the declaration also compiles in strict ISO
     modes (e.g. -std=c11), where `asm` is not a keyword. */
  register uint64_t arg0_r __asm__("rdi") = arg0;
  register uint64_t* arg1_r __asm__("rsi") = arg1;
  register uint64_t* arg2_r __asm__("rdx") = arg2;
  __asm__ __volatile__(
    /* Adding 2^64 - 1 carries exactly when rdi != 0: CF = (arg0 != 0).
       The MOV/CMOV instructions below leave CF untouched. */
    " add $18446744073709551615, %%rdi;"
    /* Limb 0. */
    " movq 0(%%rsi), %%r8;"
    " movq 0(%%rdx), %%r9;"
    " mov %%r8, %%r10;"
    " cmovc %%r9, %%r8;"
    " cmovc %%r10, %%r9;"
    " movq %%r8, 0(%%rsi);"
    " movq %%r9, 0(%%rdx);"
    /* Limb 1. */
    " movq 8(%%rsi), %%r8;"
    " movq 8(%%rdx), %%r9;"
    " mov %%r8, %%r10;"
    " cmovc %%r9, %%r8;"
    " cmovc %%r10, %%r9;"
    " movq %%r8, 8(%%rsi);"
    " movq %%r9, 8(%%rdx);"
    /* Limb 2. */
    " movq 16(%%rsi), %%r8;"
    " movq 16(%%rdx), %%r9;"
    " mov %%r8, %%r10;"
    " cmovc %%r9, %%r8;"
    " cmovc %%r10, %%r9;"
    " movq %%r8, 16(%%rsi);"
    " movq %%r9, 16(%%rdx);"
    /* Limb 3. */
    " movq 24(%%rsi), %%r8;"
    " movq 24(%%rdx), %%r9;"
    " mov %%r8, %%r10;"
    " cmovc %%r9, %%r8;"
    " cmovc %%r10, %%r9;"
    " movq %%r8, 24(%%rsi);"
    " movq %%r9, 24(%%rdx);"
    /* Limb 4. */
    " movq 32(%%rsi), %%r8;"
    " movq 32(%%rdx), %%r9;"
    " mov %%r8, %%r10;"
    " cmovc %%r9, %%r8;"
    " cmovc %%r10, %%r9;"
    " movq %%r8, 32(%%rsi);"
    " movq %%r9, 32(%%rdx);"
    /* Limb 5. */
    " movq 40(%%rsi), %%r8;"
    " movq 40(%%rdx), %%r9;"
    " mov %%r8, %%r10;"
    " cmovc %%r9, %%r8;"
    " cmovc %%r10, %%r9;"
    " movq %%r8, 40(%%rsi);"
    " movq %%r9, 40(%%rdx);"
    /* Limb 6. */
    " movq 48(%%rsi), %%r8;"
    " movq 48(%%rdx), %%r9;"
    " mov %%r8, %%r10;"
    " cmovc %%r9, %%r8;"
    " cmovc %%r10, %%r9;"
    " movq %%r8, 48(%%rsi);"
    " movq %%r9, 48(%%rdx);"
    /* Limb 7. */
    " movq 56(%%rsi), %%r8;"
    " movq 56(%%rdx), %%r9;"
    " mov %%r8, %%r10;"
    " cmovc %%r9, %%r8;"
    " cmovc %%r10, %%r9;"
    " movq %%r8, 56(%%rsi);"
    " movq %%r9, 56(%%rdx);"
  : "+r" (arg0_r)
  : "r" (arg1_r), "r" (arg2_r)
  : "%r8", "%r9", "%r10", "memory", "cc"
  );
}
/*
 * fsqr_inline: square a 4-limb integer and fold the upper half of the
 * 512-bit result back in with the factor 38 (2^256 = 38 mod 2^255 - 19).
 *
 *   arg0  scratch buffer; 8 limbs (64 bytes) are written with the full
 *         512-bit square
 *   arg1  input, 4 little-endian 64-bit limbs (read)
 *   arg2  destination, 4 limbs (written)
 *
 * The scratch buffer is written while arg1 is still being read, so it
 * should not overlap the input.
 * The stored result is congruent to arg1^2 modulo 2^255 - 19 and fits in
 * 4 limbs; it is not brought to a canonical form.
 *
 * Uses MULX (BMI2) and ADCX/ADOX (ADX).
 */
static inline void fsqr_inline (uint64_t* arg0, uint64_t* arg1, uint64_t* arg2) {
  /* Operands are pinned to the registers named in the asm text. `__asm__`
     is used instead of `asm` so the declaration also compiles in strict ISO
     modes (e.g. -std=c11), where `asm` is not a keyword. */
  register uint64_t* arg0_r __asm__("rdi") = arg0;
  register uint64_t* arg1_r __asm__("rsi") = arg1;
  register uint64_t* arg2_r __asm__("rdx") = arg2;
  __asm__ __volatile__(
    /* rdx is MULX's implicit source, so park the destination in rbx. */
    " mov %%rdx, %%rbx;"
    /* Step 1: off-diagonal products a0*a1, a0*a2, a0*a3, a3*a1, a3*a2
       (a = arg1), plus a1*a2 left in rax:rcx for step 2. */
    " movq 0(%%rsi), %%rdx;"
    " mulxq 8(%%rsi), %%r8, %%r14;"
    " xor %%r15, %%r15;"
    " mulxq 16(%%rsi), %%r9, %%r10;"
    " adcx %%r14, %%r9;"
    " mulxq 24(%%rsi), %%rax, %%rcx;"
    " adcx %%rax, %%r10;"
    " movq 24(%%rsi), %%rdx;"
    " mulxq 8(%%rsi), %%r11, %%r12;"
    " adcx %%rcx, %%r11;"
    " mulxq 16(%%rsi), %%rax, %%r13;"
    " adcx %%rax, %%r12;"
    " movq 8(%%rsi), %%rdx;"
    " adcx %%r15, %%r13;"
    " mulxq 16(%%rsi), %%rax, %%rcx;"
    " mov $0, %%r14;"
    /* Step 2: add a1*a2 on the OF chain while doubling the off-diagonal
       sum on the CF chain. */
    " xor %%r15, %%r15;"
    " adox %%rax, %%r10;"
    " adcx %%r8, %%r8;"
    " adox %%rcx, %%r11;"
    " adcx %%r9, %%r9;"
    " adox %%r15, %%r12;"
    " adcx %%r10, %%r10;"
    " adox %%r15, %%r13;"
    " adcx %%r11, %%r11;"
    " adox %%r15, %%r14;"
    " adcx %%r12, %%r12;"
    " adcx %%r13, %%r13;"
    " adcx %%r14, %%r14;"
    /* Step 3: add the squares a[i]^2 and store the 8-limb result. */
    " movq 0(%%rsi), %%rdx;"
    " mulx %%rdx, %%rax, %%rcx;"
    " movq %%rax, 0(%%rdi);"
    " add %%rcx, %%r8;"
    " movq %%r8, 8(%%rdi);"
    " movq 8(%%rsi), %%rdx;"
    " mulx %%rdx, %%rax, %%rcx;"
    " adcx %%rax, %%r9;"
    " movq %%r9, 16(%%rdi);"
    " adcx %%rcx, %%r10;"
    " movq %%r10, 24(%%rdi);"
    " movq 16(%%rsi), %%rdx;"
    " mulx %%rdx, %%rax, %%rcx;"
    " adcx %%rax, %%r11;"
    " movq %%r11, 32(%%rdi);"
    " adcx %%rcx, %%r12;"
    " movq %%r12, 40(%%rdi);"
    " movq 24(%%rsi), %%rdx;"
    " mulx %%rdx, %%rax, %%rcx;"
    " adcx %%rax, %%r13;"
    " movq %%r13, 48(%%rdi);"
    " adcx %%rcx, %%r14;"
    " movq %%r14, 56(%%rdi);"
    /* From here on rsi = scratch (the square), rdi = destination. */
    " mov %%rdi, %%rsi;"
    " mov %%rbx, %%rdi;"
    /* r8..r11, rax = low half + 38 * high half. */
    " mov $38, %%rdx;"
    " mulxq 32(%%rsi), %%r8, %%r13;"
    " xor %%rcx, %%rcx;"
    " adoxq 0(%%rsi), %%r8;"
    " mulxq 40(%%rsi), %%r9, %%r12;"
    " adcx %%r13, %%r9;"
    " adoxq 8(%%rsi), %%r9;"
    " mulxq 48(%%rsi), %%r10, %%r13;"
    " adcx %%r12, %%r10;"
    " adoxq 16(%%rsi), %%r10;"
    " mulxq 56(%%rsi), %%r11, %%rax;"
    " adcx %%r13, %%r11;"
    " adoxq 24(%%rsi), %%r11;"
    " adcx %%rcx, %%rax;"
    " adox %%rcx, %%rax;"
    /* Fold the fifth limb (rax) back in as rax * 38. */
    " imul %%rdx, %%rax;"
    " add %%rax, %%r8;"
    " adcx %%rcx, %%r9;"
    " movq %%r9, 8(%%rdi);"
    " adcx %%rcx, %%r10;"
    " movq %%r10, 16(%%rdi);"
    " adcx %%rcx, %%r11;"
    " movq %%r11, 24(%%rdi);"
    /* If that fold carried out, add 38 once more to limb 0. */
    " mov $0, %%rax;"
    " cmovc %%rdx, %%rax;"
    " add %%rax, %%r8;"
    " movq %%r8, 0(%%rdi);"
  : "+r" (arg0_r), "+r" (arg1_r), "+r" (arg2_r)
  :
  : "%rax", "%rbx", "%rcx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "memory", "cc"
  );
}
/*
 * fsqr2_inline: two independent 4-limb squarings, each followed by a fold
 * of the upper half with the factor 38 (2^256 = 38 mod 2^255 - 19).
 *
 *   arg0  scratch buffer; 16 limbs (128 bytes) are written with the two
 *         512-bit squares
 *   arg1  two inputs back to back: limbs 0..3 and limbs 4..7 (read)
 *   arg2  destination: limbs 0..3 receive the first result, limbs 4..7 the
 *         second (written)
 *
 * Computes arg2[0..3] = arg1[0..3]^2 and arg2[4..7] = arg1[4..7]^2, each
 * congruent to the true square modulo 2^255 - 19 and not brought to a
 * canonical form.
 * The scratch buffer is written while arg1 is still being read, so it
 * should not overlap the input.
 *
 * Uses MULX (BMI2) and ADCX/ADOX (ADX).
 */
static inline void fsqr2_inline (uint64_t* arg0, uint64_t* arg1, uint64_t* arg2) {
  /* Operands are pinned to the registers named in the asm text. `__asm__`
     is used instead of `asm` so the declaration also compiles in strict ISO
     modes (e.g. -std=c11), where `asm` is not a keyword. */
  register uint64_t* arg0_r __asm__("rdi") = arg0;
  register uint64_t* arg1_r __asm__("rsi") = arg1;
  register uint64_t* arg2_r __asm__("rdx") = arg2;
  __asm__ __volatile__(
    /* rdx is MULX's implicit source, so park the destination in rbx. */
    " mov %%rdx, %%rbx;"
    /* ---- First square: arg1[0..3] -> scratch[0..7] ---- */
    /* Step 1: off-diagonal products, plus a1*a2 left in rax:rcx. */
    " movq 0(%%rsi), %%rdx;"
    " mulxq 8(%%rsi), %%r8, %%r14;"
    " xor %%r15, %%r15;"
    " mulxq 16(%%rsi), %%r9, %%r10;"
    " adcx %%r14, %%r9;"
    " mulxq 24(%%rsi), %%rax, %%rcx;"
    " adcx %%rax, %%r10;"
    " movq 24(%%rsi), %%rdx;"
    " mulxq 8(%%rsi), %%r11, %%r12;"
    " adcx %%rcx, %%r11;"
    " mulxq 16(%%rsi), %%rax, %%r13;"
    " adcx %%rax, %%r12;"
    " movq 8(%%rsi), %%rdx;"
    " adcx %%r15, %%r13;"
    " mulxq 16(%%rsi), %%rax, %%rcx;"
    " mov $0, %%r14;"
    /* Step 2: add a1*a2 (OF chain) while doubling the sum (CF chain). */
    " xor %%r15, %%r15;"
    " adox %%rax, %%r10;"
    " adcx %%r8, %%r8;"
    " adox %%rcx, %%r11;"
    " adcx %%r9, %%r9;"
    " adox %%r15, %%r12;"
    " adcx %%r10, %%r10;"
    " adox %%r15, %%r13;"
    " adcx %%r11, %%r11;"
    " adox %%r15, %%r14;"
    " adcx %%r12, %%r12;"
    " adcx %%r13, %%r13;"
    " adcx %%r14, %%r14;"
    /* Step 3: add the squares a[i]^2 and store the 8-limb result. */
    " movq 0(%%rsi), %%rdx;"
    " mulx %%rdx, %%rax, %%rcx;"
    " movq %%rax, 0(%%rdi);"
    " add %%rcx, %%r8;"
    " movq %%r8, 8(%%rdi);"
    " movq 8(%%rsi), %%rdx;"
    " mulx %%rdx, %%rax, %%rcx;"
    " adcx %%rax, %%r9;"
    " movq %%r9, 16(%%rdi);"
    " adcx %%rcx, %%r10;"
    " movq %%r10, 24(%%rdi);"
    " movq 16(%%rsi), %%rdx;"
    " mulx %%rdx, %%rax, %%rcx;"
    " adcx %%rax, %%r11;"
    " movq %%r11, 32(%%rdi);"
    " adcx %%rcx, %%r12;"
    " movq %%r12, 40(%%rdi);"
    " movq 24(%%rsi), %%rdx;"
    " mulx %%rdx, %%rax, %%rcx;"
    " adcx %%rax, %%r13;"
    " movq %%r13, 48(%%rdi);"
    " adcx %%rcx, %%r14;"
    " movq %%r14, 56(%%rdi);"
    /* ---- Second square: arg1[4..7] -> scratch[8..15] ---- */
    /* Step 1: off-diagonal products, plus a5*a6 left in rax:rcx. */
    " movq 32(%%rsi), %%rdx;"
    " mulxq 40(%%rsi), %%r8, %%r14;"
    " xor %%r15, %%r15;"
    " mulxq 48(%%rsi), %%r9, %%r10;"
    " adcx %%r14, %%r9;"
    " mulxq 56(%%rsi), %%rax, %%rcx;"
    " adcx %%rax, %%r10;"
    " movq 56(%%rsi), %%rdx;"
    " mulxq 40(%%rsi), %%r11, %%r12;"
    " adcx %%rcx, %%r11;"
    " mulxq 48(%%rsi), %%rax, %%r13;"
    " adcx %%rax, %%r12;"
    " movq 40(%%rsi), %%rdx;"
    " adcx %%r15, %%r13;"
    " mulxq 48(%%rsi), %%rax, %%rcx;"
    " mov $0, %%r14;"
    /* Step 2: add a5*a6 (OF chain) while doubling the sum (CF chain). */
    " xor %%r15, %%r15;"
    " adox %%rax, %%r10;"
    " adcx %%r8, %%r8;"
    " adox %%rcx, %%r11;"
    " adcx %%r9, %%r9;"
    " adox %%r15, %%r12;"
    " adcx %%r10, %%r10;"
    " adox %%r15, %%r13;"
    " adcx %%r11, %%r11;"
    " adox %%r15, %%r14;"
    " adcx %%r12, %%r12;"
    " adcx %%r13, %%r13;"
    " adcx %%r14, %%r14;"
    /* Step 3: add the squares a[i]^2 and store the 8-limb result. */
    " movq 32(%%rsi), %%rdx;"
    " mulx %%rdx, %%rax, %%rcx;"
    " movq %%rax, 64(%%rdi);"
    " add %%rcx, %%r8;"
    " movq %%r8, 72(%%rdi);"
    " movq 40(%%rsi), %%rdx;"
    " mulx %%rdx, %%rax, %%rcx;"
    " adcx %%rax, %%r9;"
    " movq %%r9, 80(%%rdi);"
    " adcx %%rcx, %%r10;"
    " movq %%r10, 88(%%rdi);"
    " movq 48(%%rsi), %%rdx;"
    " mulx %%rdx, %%rax, %%rcx;"
    " adcx %%rax, %%r11;"
    " movq %%r11, 96(%%rdi);"
    " adcx %%rcx, %%r12;"
    " movq %%r12, 104(%%rdi);"
    " movq 56(%%rsi), %%rdx;"
    " mulx %%rdx, %%rax, %%rcx;"
    " adcx %%rax, %%r13;"
    " movq %%r13, 112(%%rdi);"
    " adcx %%rcx, %%r14;"
    " movq %%r14, 120(%%rdi);"
    /* From here on rsi = scratch (the squares), rdi = destination. */
    " mov %%rdi, %%rsi;"
    " mov %%rbx, %%rdi;"
    /* ---- Fold the first square: scratch[0..7] -> arg2[0..3] ---- */
    " mov $38, %%rdx;"
    " mulxq 32(%%rsi), %%r8, %%r13;"
    " xor %%rcx, %%rcx;"
    " adoxq 0(%%rsi), %%r8;"
    " mulxq 40(%%rsi), %%r9, %%r12;"
    " adcx %%r13, %%r9;"
    " adoxq 8(%%rsi), %%r9;"
    " mulxq 48(%%rsi), %%r10, %%r13;"
    " adcx %%r12, %%r10;"
    " adoxq 16(%%rsi), %%r10;"
    " mulxq 56(%%rsi), %%r11, %%rax;"
    " adcx %%r13, %%r11;"
    " adoxq 24(%%rsi), %%r11;"
    " adcx %%rcx, %%rax;"
    " adox %%rcx, %%rax;"
    /* Fold the fifth limb (rax) back in as rax * 38. */
    " imul %%rdx, %%rax;"
    " add %%rax, %%r8;"
    " adcx %%rcx, %%r9;"
    " movq %%r9, 8(%%rdi);"
    " adcx %%rcx, %%r10;"
    " movq %%r10, 16(%%rdi);"
    " adcx %%rcx, %%r11;"
    " movq %%r11, 24(%%rdi);"
    /* If that fold carried out, add 38 once more to limb 0. */
    " mov $0, %%rax;"
    " cmovc %%rdx, %%rax;"
    " add %%rax, %%r8;"
    " movq %%r8, 0(%%rdi);"
    /* ---- Fold the second square: scratch[8..15] -> arg2[4..7] ---- */
    " mov $38, %%rdx;"
    " mulxq 96(%%rsi), %%r8, %%r13;"
    " xor %%rcx, %%rcx;"
    " adoxq 64(%%rsi), %%r8;"
    " mulxq 104(%%rsi), %%r9, %%r12;"
    " adcx %%r13, %%r9;"
    " adoxq 72(%%rsi), %%r9;"
    " mulxq 112(%%rsi), %%r10, %%r13;"
    " adcx %%r12, %%r10;"
    " adoxq 80(%%rsi), %%r10;"
    " mulxq 120(%%rsi), %%r11, %%rax;"
    " adcx %%r13, %%r11;"
    " adoxq 88(%%rsi), %%r11;"
    " adcx %%rcx, %%rax;"
    " adox %%rcx, %%rax;"
    /* Fold the fifth limb (rax) back in as rax * 38. */
    " imul %%rdx, %%rax;"
    " add %%rax, %%r8;"
    " adcx %%rcx, %%r9;"
    " movq %%r9, 40(%%rdi);"
    " adcx %%rcx, %%r10;"
    " movq %%r10, 48(%%rdi);"
    " adcx %%rcx, %%r11;"
    " movq %%r11, 56(%%rdi);"
    /* If that fold carried out, add 38 once more to limb 0. */
    " mov $0, %%rax;"
    " cmovc %%rdx, %%rax;"
    " add %%rax, %%r8;"
    " movq %%r8, 32(%%rdi);"
  : "+r" (arg0_r), "+r" (arg1_r), "+r" (arg2_r)
  :
  : "%rax", "%rbx", "%rcx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "memory", "cc"
  );
}
#endif