flint3-sys 3.5.1

#
#   Copyright (C) 2023 Albin Ahlbäck
#
#   This file is part of FLINT.
#
#   FLINT is free software: you can redistribute it and/or modify it under
#   the terms of the GNU Lesser General Public License (LGPL) as published
#   by the Free Software Foundation; either version 3 of the License, or
#   (at your option) any later version.  See <https://www.gnu.org/licenses/>.
#

# Registers that the generators in this file use without pushing them first.
# The sqr generators below take res from _regs[1], ap from _regs[2] and keep
# the most significant limb in _regs[end] (%rax).
_regs = ["%rdi", "%rsi", "%rdx", "%rcx", "%r8", "%r9", "%r10", "%r11", "%rax"]

# Registers that the generators push before use and pop again before `ret`.
# %rsp is omitted since it cannot be used unless its value is first saved in
# an xmm register.
__regs = ["%rbx", "%rbp", "%r12", "%r13", "%r14", "%r15"]

# Return the name of the low 8-bit part of the 64-bit register `reg`, e.g.
# "%rax" -> "%al" and "%r12" -> "%r12b".  Any other string yields the
# placeholder "hejhoppgummi".
function reg_8_bit(reg::String)
    # The eight legacy registers have irregular byte names.
    legacy = Dict(
        "%rax" => "%al",
        "%rbx" => "%bl",
        "%rcx" => "%cl",
        "%rdx" => "%dl",
        "%rsp" => "%spl",
        "%rbp" => "%bpl",
        "%rsi" => "%sil",
        "%rdi" => "%dil",
    )
    haskey(legacy, reg) && return legacy[reg]

    # The numbered registers simply take a "b" suffix.
    numbered = ("%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15")
    if reg in numbered
        return reg * "b"
    end

    return "hejhoppgummi"
end

# Return the name of the low 32-bit part of the 64-bit register `reg`, e.g.
# "%rax" -> "%eax" and "%r12" -> "%r12d".  Any other string yields the
# placeholder "hejhoppgummi".
function reg_32_bit(reg::String)
    table = Dict(
        "%rax" => "%eax",
        "%rbx" => "%ebx",
        "%rcx" => "%ecx",
        "%rdx" => "%edx",
        "%rsp" => "%esp",
        "%rbp" => "%ebp",
        "%rsi" => "%esi",
        "%rdi" => "%edi",
        "%r8" => "%r8d",
        "%r9" => "%r9d",
        "%r10" => "%r10d",
        "%r11" => "%r11d",
        "%r12" => "%r12d",
        "%r13" => "%r13d",
        "%r14" => "%r14d",
        "%r15" => "%r15d",
    )
    return get(table, reg, "hejhoppgummi")
end

###############################################################################
# Preamble
###############################################################################

# Licence header for the generated file.  Every line starts with `dnl`, so the
# text is laid out as a comment block for the generated m4 source.
copyright = "dnl
dnl Copyright (C) 2023 Albin Ahlbäck
dnl
dnl This file is part of FLINT.
dnl
dnl FLINT is free software: you can redistribute it and/or modify it under
dnl the terms of the GNU Lesser General Public License (LGPL) as published
dnl by the Free Software Foundation; either version 3 of the License, or
dnl (at your option) any later version.  See <https://www.gnu.org/licenses/>.
dnl\n"

# Text placed before the generated functions: include config.m4, then a TEXT
# line.
preamble = "include(`config.m4')\n\n\tTEXT\n"

# Return the pair of strings that open and close the generated function named
# `funname`: first an ALIGN(16) line followed by PROLOGUE(funname), second the
# EPILOGUE() line.
function function_pre_post(funname::String)
    opening = string("\tALIGN(16)\n", "PROLOGUE(", funname, ")\n")
    closing = "EPILOGUE()\n"
    return (opening, closing)
end

###############################################################################
# m = 1, n = 1
###############################################################################

# Generate the function body for the 1 x 1 limb product.
#
# The emitted code loads the limb at (%rdx) into %rdx, multiplies it with the
# limb at 0*8(%rsi) using mulx, and stores the low and high halves at 0*8(%rdi)
# and 1*8(%rdi).  The high half is left in %rax.
#
# Throws an ErrorException unless m == 1 and n == 1.
function function_body_1(m::Int, n::Int = m)
    if m != 1 || n != 1
        error("function_body_1 only supports m = n = 1, got m = $m, n = $n")
    end

    lo = "%rcx"
    hi = "%rax" # Important that the high limb is in rax

    res_reg = "%rdi"
    ap_reg = "%rsi"
    bp_reg = "%rdx"

    body = ""

    # mulx takes one factor implicitly from %rdx, so the limb of b goes there.
    body *= "\tmov\t($bp_reg), %rdx\n"
    body *= "\tmulx\t0*8($ap_reg), $lo, $hi\n"
    body *= "\tmov\t$lo, 0*8($res_reg)\n"
    body *= "\tmov\t$hi, 1*8($res_reg)\n"

    return body * "\n\tret\n"
end

###############################################################################
# m = 2, n = 1
###############################################################################

# Generate the function body for the 2 x 1 limb product.
#
# The single limb of b is loaded into %rdx, both limbs of a are multiplied by
# it with mulx, the partial products are combined with an adcx chain, and the
# three result limbs are stored to res.  The top limb is left in %rax.
#
# Throws an ErrorException unless m == 2 and n == 1.
function function_body_2_1(m::Int, n::Int = 1)
    (m == 2 && n == 1) || error()

    res, ap, bp = "%rdi", "%rsi", "%rdx"

    lo0, hi0 = "%rcx", "%r8"   # a0 * b
    lo1, hi1 = "%r9", "%rax"   # a1 * b; important that hi1 is rax
    zr = "%r10"
    zr32 = reg_32_bit(zr)

    insns = [
        "\tmov\t0*8($bp), %rdx\n",
        # Clears the register and the carry flag before the adcx chain
        "\txor\t$zr32, $zr32\n",
        "\tmulx\t0*8($ap), $lo0, $hi0\n",
        "\tmulx\t1*8($ap), $lo1, $hi1\n",
        "\tadcx\t$lo1, $hi0\n",
        "\tadcx\t$zr, $hi1\n",
        "\tmov\t$lo0, 0*8($res)\n",
        "\tmov\t$hi0, 1*8($res)\n",
        "\tmov\t$hi1, 2*8($res)\n",
    ]

    return join(insns) * "\n\tret\n"
end

###############################################################################
# m = 2, n = 2
###############################################################################

# Generate the function body for the 2 x 2 limb product.
#
# b1 is parked in a register while a is first multiplied by b0; the products
# with b1 are then folded in using interleaved adcx/adox chains.  The four
# result limbs are stored to res and the top limb is left in %rax.
#
# Throws an ErrorException unless m == 2 and n == 2.
function function_body_2(m::Int, n::Int = m)
    (m == 2 && n == 2) || error()

    res, ap, bp = "%rdi", "%rsi", "%rdx"

    t0, t1, t2 = "%rcx", "%r8", "%r9"
    t3 = "%rax" # Important that t3 is rax
    b1 = "%r10"
    zr = "%r11"
    zr32 = reg_32_bit(zr)

    io = IOBuffer()
    emit(insn) = print(io, "\t", insn, "\n")

    emit("mov\t1*8($bp), $b1")
    emit("mov\t0*8($bp), %rdx")
    emit("xor\t$zr32, $zr32")

    # a * b0
    emit("mulx\t0*8($ap), $t0, $t1")
    emit("mulx\t1*8($ap), $t3, $t2")
    emit("adcx\t$t3, $t1")
    emit("mov\t$t0, 0*8($res)")

    # a * b1, added on top of the previous row
    emit("mov\t$b1, %rdx")
    emit("mulx\t0*8($ap), $t0, $t3")
    emit("adox\t$t0, $t1")
    emit("adcx\t$t3, $t2")
    emit("mov\t$t1, 1*8($res)")
    emit("mulx\t1*8($ap), $t0, $t3")
    emit("adox\t$t0, $t2")
    emit("adcx\t$zr, $t3")
    emit("adox\t$zr, $t3")
    emit("mov\t$t2, 2*8($res)")
    emit("mov\t$t3, 3*8($res)")

    print(io, "\n\tret\n")
    return String(take!(io))
end

###############################################################################
# mul_1 and addmul_1 macros
###############################################################################

# Generate the assembler macro `m<m>_str`, which multiplies m limbs at ap by
# the limb held in %rdx.
#
# Assumes bp[0] is already in %rdx.
# The result is written directly into res, apart from the last limb, which is
# left in the macro argument r5.
#
# Throws an ErrorException if m < 3.
function mul_1_macro(m::Int)
    m < 3 && error()

    header = ".macro\tm$(m)_str res=%rdi, res_offset=0, ap=%rsi, ap_offset=0, " *
             "r0, r1, r2, r3, r4, r5\n"
    footer = ".endm\n"

    # References to the six macro arguments; they are permuted below.
    a, b, c, d, e, f = ["\\r$k" for k in 0:5]

    # Make sure that last limb is pushed into r5
    if m % 2 == 0
        e, f = f, e
    elseif m % 4 == 1
        b, f = f, b
    end

    src(k) = "($k+\\ap_offset)*8(\\ap)"
    dst(k) = "($k+\\res_offset)*8(\\res)"

    insns = String[
        "\tmulx\t$(src(0)), $a, $d\n",
        "\tmulx\t$(src(1)), $b, $e\n",
        "\tmulx\t$(src(2)), $c, $f\n",
        "\tadd\t$d, $b\n",
        "\tadc\t$e, $c\n",
        "\tmov\t$a, $(dst(0))\n",
        "\tmov\t$b, $(dst(1))\n",
        "\tmov\t$c, $(dst(2))\n",
    ]

    # Remaining limbs are handled two at a time, starting at index 3.
    for i1 in 3:2:(2 * ((m - 1) ÷ 2) - 1)
        i2 = i1 + 1
        push!(insns, "\tmulx\t$(src(i1)), $d, $e\n")
        push!(insns, "\tmulx\t$(src(i2)), $a, $b\n")
        push!(insns, "\tadc\t$d, $f\n")
        push!(insns, "\tadc\t$e, $a\n")
        push!(insns, "\tmov\t$f, $(dst(i1))\n")
        push!(insns, "\tmov\t$a, $(dst(i2))\n")
        # Rotate the roles of the registers for the next pair
        a, c = c, a
        b, f = f, b
    end

    if isodd(m)
        push!(insns, "\tadc\t\$0, $f\n")
    else
        # One limb is left over when m is even
        push!(insns, "\tmulx\t$(src(m - 1)), $d, $e\n")
        push!(insns, "\tadc\t$d, $f\n")
        push!(insns, "\tadc\t\$0, $e\n")
        push!(insns, "\tmov\t$f, $(dst(m - 1))\n")
    end

    return header * join(insns) * footer
end

# Generate the assembler macro `m<n>` (or `m<n>_chain` when `chain` is true),
# which multiplies n limbs at ap by the limb held in %rdx.
#
# Assumes b is already in %rdx.
# Pushes least significant limb into res[res_offset].
# With `chain`, the macro takes two extra arguments ip1 and ip2 that are added
# into the two lowest limbs with adcx and adox respectively.
# ip1 can be aliased with rN for N > 0 and scr2
# ip2 can be aliased with rN for N > 1
#
# Throws an ErrorException unless 2 <= n <= 8.
function mulM_macro(n::Int; chain::Bool = false)
    (2 <= n <= 8) || error()

    macro_name = chain ? "m$(n)_chain" : "m$(n)"

    params = String[]
    chain && append!(params, ["ip1", "ip2"])
    append!(params, ["r$k" for k in 0:n - 1])
    append!(params, ["scr1", "scr2", "zero"])

    header = ".macro\t$macro_name res=%rdi, res_offset=0, ap=%rsi, ap_offset=0, " *
             join(params, ", ") * "\n"
    footer = ".endm\n"

    store_low = "\tmov\t\\scr1, \\res_offset*8(\\res)\n"

    out = IOBuffer()
    print(out, "\tmulx\t(0+\\ap_offset)*8(\\ap), \\scr1, \\r0\n")
    if chain
        print(out, "\tadcx\t\\ip1, \\scr1\n")
        print(out, store_low)
    end
    print(out, "\tmulx\t(1+\\ap_offset)*8(\\ap), \\scr2, \\r1\n")
    if !chain
        print(out, store_low)
    end
    print(out, "\tadcx\t\\scr2, \\r0\n")
    if chain
        print(out, "\tadox\t\\ip2, \\r0\n")
    end

    # Remaining limbs two at a time; k is the index of the first of each pair.
    for k in 2:2:(2 * (n ÷ 2) - 2)
        print(out, "\tmulx\t($(k)+\\ap_offset)*8(\\ap), \\scr1, \\r$k\n")
        print(out, "\tmulx\t($(k + 1)+\\ap_offset)*8(\\ap), \\scr2, \\r$(k + 1)\n")
        print(out, "\tadcx\t\\scr1, \\r$(k - 1)\n")
        print(out, "\tadcx\t\\scr2, \\r$k\n")
    end

    # Odd n leaves one limb over
    if isodd(n)
        print(out, "\tmulx\t($(n - 1)+\\ap_offset)*8(\\ap), \\scr1, \\r$(n - 1)\n")
        print(out, "\tadcx\t\\scr1, \\r$(n - 2)\n")
    end

    print(out, "\tadcx\t\\zero, \\r$(n - 1)\n")

    return header * String(take!(out)) * footer
end

# Generate the assembler macro `am<n>`, which multiplies n limbs at ap by the
# limb held in %rdx and adds the products into the macro arguments r0, ..., rn.
#
# Assumes b is already in %rdx.
# Pushes least significant limb into res[res_offset].
#
# Throws an ErrorException unless 2 <= n <= 8.
function addmulM_macro(n::Int)
    (n < 2 || n > 8) && error()

    limbs = join(["r$k" for k in 0:n], ", ")
    header = ".macro\tam$(n) res=%rdi, res_offset=0, ap=%rsi, ap_offset=0, " *
             "$limbs, scr, zero\n"
    footer = ".endm\n"

    # If n is odd, the two scrap registers are taken in the opposite order so
    # that \scr occurs at the right place and a second scrap register is not
    # needed in the last multiplication.
    first_scr, second_scr = isodd(n) ? ("\\scr", "\\r$n") : ("\\r$n", "\\scr")

    insns = String[
        "\tmulx\t(0+\\ap_offset)*8(\\ap), $first_scr, $second_scr\n",
        "\tadcx\t$first_scr, \\r0\n",
        "\tmov\t\\r0, \\res_offset*8(\\res)\n",
    ]

    for k in 1:n - 1
        # The high half goes into whichever scrap register is free; the other
        # one still holds the previous high half and is added in.
        high, carried = isodd(k) ? (first_scr, second_scr) : (second_scr, first_scr)
        push!(insns, "\tmulx\t($(k)+\\ap_offset)*8(\\ap), \\r0, $high\n")
        push!(insns, "\tadcx\t$carried, \\r$k\n")
        push!(insns, "\tadox\t\\r0, \\r$k\n")
    end

    push!(insns, "\tadcx\t\\zero, \\r$n\n")
    push!(insns, "\tadox\t\\zero, \\r$n\n")

    return header * join(insns) * footer
end

###############################################################################
# m ≥ 3, n = 1
###############################################################################

# Generate the function body for the m x 1 limb product with m >= 3.
#
# The limb of b is loaded into %rdx, the m<m>_str macro is invoked, and the
# limb that the macro leaves in its last register argument is stored at
# m*8(%rdi).
#
# Throws an ErrorException if m < 3.
function function_body_M_1(m::Int)
    m ≥ 3 || error()

    insns = (
        "\tmov\t0*8(%rdx), %rdx\n",
        # Important that the last entry is rax to set return value
        "\tm$(m)_str\t%rdi, 0, %rsi, 0, %rcx, %r8, %r9, %r10, %r11, %rax\n",
        "\tmov\t%rax, $(m)*8(%rdi)\n",
        "\n\tret\n",
    )

    return join(insns)
end

###############################################################################
# m > 8, n = 2
###############################################################################

# Return the chunk size N used by function_body_M_2 for a given m:
# 3 for m = 9, 4 for 10 <= m <= 12, 5 for 13 <= m <= 15 and 6 for m = 16.
#
# Throws an ErrorException for m <= 8 or m > 16.
function M_2_getN(m::Int)
    (9 <= m <= 16) || error()

    m == 9 && return 3
    m <= 12 && return 4
    m <= 15 && return 5
    return 6
end

# Generate the function body for the m x 2 limb product with m > 8.
#
# The m limbs at ap are processed in chunks of N = M_2_getN(m) limbs.  Each
# chunk is first multiplied by b0 (macro mN for the first chunk, mN_chain for
# the following ones) and then by b1 (macro amN).  A remainder of m % N limbs
# is handled at the end, either inline (m % N == 1) or with the macros for that
# size.
#
# Needed macros: mN, amN, mN_chain (if m ÷ N ≥ 2),
# m(m % N)_chain (if m % N > 1) and am(m % N) (if m % N > 1).
#
# Throws an ErrorException if m <= 8, or (via M_2_getN) if m > 16.
function function_body_M_2(m::Int)
    if m <= 8
        error()
    end

    N = M_2_getN(m)

    # number of available registers is 10
    regs = ["%rax", "%r9", "%r10", "%r11", "%rbx", "%rbp", "%r12", "%r13", "%r14", "%r15"]
    preserve_index = 5 # Starting with %rbx, we have to push it to the stack

    # N + 1 limb registers, one scrap register and one zero register
    used_registers = N + 3

    res = "%rdi"
    ap = "%rsi"
    bp = "%rdx"
    b0 = "%rcx"
    b1 = "%r8"

    if used_registers > length(regs)
        error()
    end

    # Both limbs of b are loaded up front since %rdx is overwritten below.
    pre = "\tmov\t0*8($bp), $b0\n"
    pre *= "\tmov\t1*8($bp), $b1\n"
    post = ""
    # Push every used register from %rbx onwards; pops are prepended so that
    # they come out in reverse order.
    for jx in preserve_index:used_registers
        pre = pre * "\tpush\t$(regs[jx])\n"
        post = "\tpop\t$(regs[jx])\n" * post
    end

    # Choose the initial position of regs[1] (%rax) among the limb registers.
    # NOTE(review): presumably this is so that, after the swaps done per chunk
    # below, the most significant limb ends up in %rax -- confirm.
    rg = deepcopy(regs[1:used_registers])
    if m % N == 0 && (m ÷ N) % 2 == 1
        rg[1], rg[N + 1] = rg[N + 1], rg[1]
    elseif m % N == 1
        rg[1], rg[4] = rg[4], rg[1]
    elseif m % N > 1 && (m ÷ N) % 2 == 0
        rg[1], rg[N + 1] = rg[N + 1], rg[1]
    end

    scr = rg[N + 2]
    zero = rg[N + 3]

    body = ""

    # First chunk: a[0 .. N-1] * b0, then add a[0 .. N-1] * b1 one limb higher.
    body *= "\n"
    body *= "\tmov\t$b0, %rdx\n"
    body *= "\txor\t$(reg_32_bit(zero)), $(reg_32_bit(zero))\n"
    body *= "\tm$N\t$res, 0, $ap, 0, "
    for jx in 2:N
        body *= "$(rg[jx]), "
    end
    body *= "$(rg[1]), $(rg[N + 1]), $scr, $zero\n"

    body *= "\tmov\t$b1, %rdx\n"
    body *= "\tam$(N)\t$res, 1, $ap, 0, "
    for jx in 2:N
        body *= "$(rg[jx]), "
    end
    body *= "$(rg[1]), $(rg[N + 1]), $scr, $zero\n"
    # Store the limbs that are final after this chunk.
    for jx in 3:N
        body *= "\tmov\t$(rg[jx]), $(jx - 1)*8($res)\n"
    end

    # Remaining full chunks.  The chain macro receives rg[1] and rg[N + 1] as
    # its ip1 and ip2 arguments.
    for ix in 1:m ÷ N - 1
        body *= "\n"
        body *= "\tmov\t$b0, %rdx\n"
        body *= "\tm$(N)_chain\t$res, $(ix * N), $ap, $(ix * N), $(rg[1]), $(rg[N + 1]), "
        for jx in 2:N
            body *= "$(rg[jx]), "
        end
        body *= "$(rg[N + 1]), $scr, $(rg[1]), $zero\n"

        body *= "\tmov\t$b1, %rdx\n"
        body *= "\tam$(N)\t$res, $(ix * N + 1), $ap, $(ix * N), "
        for jx in 2:N
            body *= "$(rg[jx]), "
        end
        body *= "$(rg[N + 1]), $(rg[1]), $scr, $zero\n"
        for jx in 3:N
            body *= "\tmov\t$(rg[jx]), $(jx + ix * N - 1)*8($res)\n"
        end
        # The two registers exchange roles for the next chunk.
        rg[1], rg[N + 1] = rg[N + 1], rg[1]
    end

    body *= "\n"
    if m % N == 0
        # No remainder: only the two top limbs are left to store.
        body *= "\tmov\t$(rg[1]), $(m)*8($res)\n"
        body *= "\tmov\t$(rg[N + 1]), $(m + 1)*8($res)\n"
    elseif m % N == 1
        # One limb left: multiply it by b0 and b1 inline.
        body *= "\tmov\t$b0, %rdx\n"
        body *= "\tmulx\t$((m ÷ N) * N)*8($ap), $(rg[2]), $scr\n"
        body *= "\tmov\t$b1, %rdx\n"
        body *= "\tmulx\t$((m ÷ N) * N)*8($ap), $(rg[3]), $(rg[4])\n"
        body *= "\tadcx\t$(rg[1]), $(rg[2])\n"
        body *= "\tadcx\t$(rg[N + 1]), $(rg[3])\n"
        body *= "\tadcx\t$zero, $(rg[4])\n"
        body *= "\tadox\t$scr, $(rg[3])\n"
        body *= "\tadox\t$zero, $(rg[4])\n"
        body *= "\tmov\t$(rg[2]), $((m ÷ N) * N + 0)*8($res)\n"
        body *= "\tmov\t$(rg[3]), $((m ÷ N) * N + 1)*8($res)\n"
        body *= "\tmov\t$(rg[4]), $((m ÷ N) * N + 2)*8($res)\n"
    else
        # More than one limb left: use the macros for size m % N.
        body *= "\tmov\t$b0, %rdx\n"
        body *= "\tm$(m % N)_chain\t$res, $((m ÷ N) * N + 0), $ap, $((m ÷ N) * N), $(rg[1]), $(rg[N + 1]), "
        for jx in 2:m % N + 1
            body *= "$(rg[jx]), "
        end
        body *= "$scr, $(rg[1]), $zero\n"

        body *= "\tmov\t$b1, %rdx\n"
        body *= "\tam$(m % N)\t$res, $((m ÷ N) * N + 1), $ap, $((m ÷ N) * N), "
        for jx in 2:m % N + 1
            body *= "$(rg[jx]), "
        end
        body *= "$(rg[1]), $scr, $zero\n"
        for jx in 3:m % N + 1
            body *= "\tmov\t$(rg[jx]), $((m ÷ N) * N + jx - 1)*8($res)\n"
        end
        body *= "\tmov\t$(rg[1]), $(m + 1)*8($res)\n"
    end

    return pre * body * post * "\n\tret\n"
end

###############################################################################
# 3 ≤ m ≤ 8, 2 ≤ n ≤ m, or m > 8, 2 ≤ n ≤ 8
###############################################################################

# Generate the function body for the m x n limb product in the cases
# 3 ≤ m ≤ 8, 2 ≤ n ≤ m, or m > 8, 2 ≤ n ≤ 8.
#
# The m<m> macro handles the first limb of the second operand and the am<m>
# macro each following limb.  When m > 8 the two operands exchange roles (and
# m and n are swapped), so the macros are always instantiated for the operand
# with at most 8 limbs.
#
# Throws an ErrorException for any other (m, n).
function function_body_M(m::Int, n::Int = m)
    small_case = 3 ≤ m ≤ 8 && 2 ≤ n ≤ m
    wide_case = m > 8 && 2 ≤ n ≤ 8
    (small_case || wide_case) || error()

    res_reg = "%rdi"
    # In both cases the pointer arriving in %rdx is first moved to %rcx.
    mov = "\tmov\t%rdx, %rcx\n"
    if m ≤ 8
        ap_reg, bp_reg = "%rsi", "%rcx"
    else
        # What is called ap from here on is actually bp, and vice versa
        ap_reg, bp_reg = "%rcx", "%rsi"
        m, n = n, m
    end

    regs = ["%rax", "%r8", "%r9", "%r10", "%r11", "%rbx", "%rbp", "%r12", "%r13", "%r14", "%r15"]
    first_saved = 6 # Starting with %rbx, we have to push it to the stack

    numregs = m + 3 # Number of registers used
    scr_reg = regs[numregs - 1]
    zero_reg = regs[numregs]
    zero_reg_32 = reg_32_bit(zero_reg)

    # We want the most significant limb to fall on rax
    perm = regs[1:m + 1]
    pivot = n == 2 ? m + 1 : ((n - 3) % (m + 1)) + 1
    perm[1], perm[pivot] = perm[pivot], perm[1]

    # Registers to save, pushed in order and popped in reverse
    saved = regs[first_saved:numregs]
    pushes = join("\tpush\t$r\n" for r in saved)
    pops = join("\tpop\t$r\n" for r in reverse(saved))

    io = IOBuffer()
    print(io, mov)
    print(io, "\tmov\t0*8($bp_reg), %rdx\n")
    print(io, pushes)

    print(io, "\n")
    print(io, "\txor\t$zero_reg_32, $zero_reg_32\n")
    print(io, "\n")

    print(io, "\tm$m\t$res_reg, 0, $ap_reg, 0, ")
    print(io, join(perm, ", "))
    print(io, ", $scr_reg, $zero_reg\n\n")

    for jx in 1:n - 1
        print(io, "\tmov\t$jx*8($bp_reg), %rdx\n")
        print(io, "\tam$m\t$res_reg, $jx, $ap_reg, 0, ")
        print(io, join(perm, ", "))
        print(io, ", $scr_reg, $zero_reg\n")

        # Reorder registers: rotate left by one position
        push!(perm, popfirst!(perm))
    end

    print(io, "\n")
    for jx in 1:m
        print(io, "\tmov\t$(perm[jx]), $(n + jx - 1)*8($res_reg)\n")
    end
    print(io, "\n")

    print(io, pops)
    print(io, "\n\tret\n")

    return String(take!(io))
end

###############################################################################
# sqr
###############################################################################

# NOTE: Aliasing must work for squaring.

# NOTE: Although this could be generally programmed, just like the mul case,
# we can skip some instructions and additional storage when hardcoding each
# case.

# Generate the function body for squaring a single limb.
#
# The limb at 0*8(%rsi) is loaded into %rdx and multiplied by itself with
# mulx; the low and high halves are stored at 0*8(%rdi) and 1*8(%rdi), and the
# high half is left in %rax.
function function_body_sqr_1()
    res, ap = "%rdi", "%rsi"
    lo = "%rcx"
    hi = "%rax" # Important that hi is rax

    insns = (
        "\tmov\t0*8($ap), %rdx\n",
        "\tmulx\t%rdx, $lo, $hi\n",
        "\tmov\t$lo, 0*8($res)\n",
        "\tmov\t$hi, 1*8($res)\n",
    )

    return join(insns) * "\n\tret\n"
end

# Generate the function body for squaring two limbs.
#
# The off-diagonal product a0 a1 is computed and doubled first; the squares
# a0^2 and a1^2 are then added in while the result limbs are stored to res.
# The top limb is left in _regs[end].
function function_body_sqr_2()
    res, ap = _regs[1], _regs[2]

    pool = vcat(_regs[4:end - 1], __regs)
    t1, t2, t3 = pool[1:3]
    lo = pool[4]
    hi = _regs[end]
    t3_32 = reg_32_bit(t3)

    insns = [
        "\tmov\t0*8($ap), %rdx\n",

        # Calculate upper triangle and double it; t3 collects the carry
        "\txor\t$t3_32, $t3_32\n",
        "\tmulx\t1*8($ap), $t1, $t2\n",
        "\tadd\t$t1, $t1\n",
        "\tadc\t$t2, $t2\n",
        "\tadc\t$t3, $t3\n",

        # Calculate diagonal and put into res
        "\tmulx\t%rdx, $lo, $hi\n",
        "\tmov\t$lo, 0*8($res)\n",
        "\tadd\t$hi, $t1\n",
        "\tmov\t1*8($ap), %rdx\n",
        "\tmov\t$t1, 1*8($res)\n",
        "\tmulx\t%rdx, $lo, $hi\n",
        "\tadc\t$lo, $t2\n",
        "\tadc\t$t3, $hi\n",
        "\tmov\t$t2, 2*8($res)\n",
        "\tmov\t$hi, 3*8($res)\n",
    ]

    return join(insns) * "\n\tret\n"
end

# Generate the function body for squaring three limbs.
#
# The off-diagonal products are accumulated in five work registers, doubled,
# and the squares a0^2, a1^2, a2^2 are then added in while the result limbs
# are stored to res.  The top limb is left in _regs[end].
function function_body_sqr_3()
    res, ap = _regs[1], _regs[2]

    pool = vcat(_regs[4:end - 1], __regs)
    w = pool[1:5]
    scr = _regs[end]
    top32 = reg_32_bit(w[5])

    io = IOBuffer()
    emit(insn) = print(io, "\t", insn, "\n")

    emit("mov\t0*8($ap), %rdx")

    # Calculate upper triangle
    emit("xor\t$top32, $top32")
    emit("mulx\t1*8($ap), $(w[1]), $scr")     # a0 a1
    emit("mulx\t2*8($ap), $(w[2]), $(w[3])")  # a0 a2
    emit("mov\t1*8($ap), %rdx")
    emit("add\t$scr, $(w[2])")
    emit("mulx\t2*8($ap), $scr, $(w[4])")     # a1 a2
    emit("adc\t$scr, $(w[3])")
    emit("adc\t$(w[5]), $(w[4])")

    # Double upper triangle
    emit("mov\t0*8($ap), %rdx")
    emit("add\t$(w[1]), $(w[1])")
    for k in 2:5
        emit("adc\t$(w[k]), $(w[k])")
    end

    # Calculate diagonal and put into res
    emit("mulx\t%rdx, %rdx, $scr")            # a0^2
    emit("mov\t%rdx, 0*8($res)")
    emit("add\t$scr, $(w[1])")
    emit("mov\t1*8($ap), %rdx")
    emit("mov\t$(w[1]), 1*8($res)")
    emit("mulx\t%rdx, %rdx, $scr")            # a1^2
    emit("adc\t%rdx, $(w[2])")
    emit("adc\t$scr, $(w[3])")
    emit("mov\t2*8($ap), %rdx")
    emit("mov\t$(w[2]), 2*8($res)")
    emit("mov\t$(w[3]), 3*8($res)")
    emit("mulx\t%rdx, %rdx, $scr")            # a2^2
    emit("adc\t%rdx, $(w[4])")
    emit("adc\t$(w[5]), $scr")
    emit("mov\t$(w[4]), 4*8($res)")
    emit("mov\t$scr, 5*8($res)")

    print(io, "\n\tret\n")
    return String(take!(io))
end

# Generate the function body for squaring four limbs.
#
# The emitted code works in three phases: accumulate the off-diagonal products
# ai aj (i < j) in the work registers w, double them, and finally add the
# squares ai^2 while storing the eight result limbs to res.  The top limb is
# left in _regs[end].
function function_body_sqr_4()
    n = 4

    res = _regs[1]
    ap = _regs[2]

    this_regs = [_regs[4:end - 1]; __regs]

    # 2n - 1 work registers; the last one, w[7], starts out as zero.
    w = this_regs[1:2 * n - 1]

    s1 = "error s1"
    s2 = _regs[end] # Important that this is rax

    pre = "\tmov\t0*8($ap), %rdx\n"
    post = ""
    # Only the work registers taken from __regs have to be pushed; pops are
    # prepended so that they come out in reverse order.
    for jx in 1:2 * n - 1 - length(_regs[4:end - 1])
        pre = pre * "\tpush\t$(__regs[jx])\n"
        post = "\tpop\t$(__regs[jx])\n" * post
    end

    body = ""

    # Calculate upper triangle
    body *= "\txor\t$(reg_32_bit(w[7])), $(reg_32_bit(w[7]))\n"
    body *= "\tmulx\t1*8($ap), $(w[1]), $(w[5])\n" # a0 a1
    body *= "\tmulx\t2*8($ap), $(w[2]), $(w[6])\n" # a0 a2
    body *= "\tmulx\t3*8($ap), $(w[3]), $(w[4])\n" # a0 a3
    body *= "\tmov\t1*8($ap), %rdx\n"
    body *= "\tadox\t$(w[5]), $(w[2])\n" # ADOX
    body *= "\tadox\t$(w[6]), $(w[3])\n"
    body *= "\tmulx\t2*8($ap), $(w[5]), $(w[6])\n" # a1 a2
    body *= "\tadcx\t$(w[5]), $(w[3])\n" # ADCX
    body *= "\tadcx\t$(w[6]), $(w[4])\n"
    body *= "\tmulx\t3*8($ap), $(w[6]), $(w[5])\n" # a1 a3
    body *= "\tmov\t2*8($ap), %rdx\n"
    body *= "\tadox\t$(w[6]), $(w[4])\n" # ADOX
    body *= "\tadox\t$(w[7]), $(w[5])\n" # Add zero
    body *= "\tmulx\t3*8($ap), %rdx, $(w[6])\n" # a2 a3
    body *= "\tadcx\t%rdx, $(w[5])\n" # ADCX
    body *= "\tadc\t$(w[7]), $(w[6])\n" # Add zero

    # Double upper triangle
    body *= "\tmov\t0*8($ap), %rdx\n"
    body *= "\tadd\t$(w[1]), $(w[1])\n"
    body *= "\tadc\t$(w[2]), $(w[2])\n"
    body *= "\tadc\t$(w[3]), $(w[3])\n"
    body *= "\tadc\t$(w[4]), $(w[4])\n"
    body *= "\tadc\t$(w[5]), $(w[5])\n"
    body *= "\tadc\t$(w[6]), $(w[6])\n"
    # The carry out of the doubling is kept in the low byte of w[7]
    body *= "\tsetc\t$(reg_8_bit(w[7]))\n"

    # Calculate diagonal and put into res
    body *= "\tmulx\t%rdx, %rdx, $s2\n" # a0^2
    body *= "\tmov\t%rdx, 0*8($res)\n"
    body *= "\tadd\t$s2, $(w[1])\n"
    body *= "\tmov\t1*8($ap), %rdx\n"
    body *= "\tmov\t$(w[1]), 1*8($res)\n"
    # w[1] has been stored, so its register is reused as the scrap register s1.
    # The entry is overwritten with a marker so that any later use of w[1]
    # would show up in the generated text.
    s1 = w[1]
    w[1] = "error w[1]"
    body *= "\tmulx\t%rdx, %rdx, $s2\n" # a1^2
    body *= "\tadc\t%rdx, $(w[2])\n"
    body *= "\tadc\t$s2, $(w[3])\n"
    body *= "\tmov\t2*8($ap), %rdx\n"
    body *= "\tmov\t$(w[2]), 2*8($res)\n"
    body *= "\tmov\t$(w[3]), 3*8($res)\n"
    body *= "\tmulx\t%rdx, $s1, $s2\n" # a2^2
    body *= "\tadc\t$s1, $(w[4])\n"
    body *= "\tadc\t$s2, $(w[5])\n"
    body *= "\tmov\t3*8($ap), %rdx\n"
    body *= "\tmov\t$(w[4]), 4*8($res)\n"
    body *= "\tmov\t$(w[5]), 5*8($res)\n"
    body *= "\tmulx\t%rdx, $s1, $s2\n" # a3^2
    body *= "\tadc\t$s1, $(w[6])\n"
    body *= "\tadc\t$(w[7]), $s2\n" # Add the carry saved from the doubling
    body *= "\tmov\t$(w[6]), 6*8($res)\n"
    body *= "\tmov\t$s2, 7*8($res)\n"

    return pre * body * post * "\n\tret\n"
end

# Generate the function body for squaring five limbs.
#
# The emitted code works in three phases: accumulate the off-diagonal products
# ai aj (i < j) in the work registers w, double them, and finally add the
# squares ai^2 while storing the ten result limbs to res.  The top limb is
# left in _regs[end].
function function_body_sqr_5()
    n = 5

    res = _regs[1]
    ap = _regs[2]

    this_regs = [_regs[4:end - 1]; __regs]

    # 2n - 1 work registers; the last one, w[9], starts out as zero.
    w = this_regs[1:2 * n - 1]

    s1 = ""
    s2 = _regs[end] # Important that this is rax

    pre = "\tmov\t0*8($ap), %rdx\n"
    post = ""
    # Only the work registers taken from __regs have to be pushed; pops are
    # prepended so that they come out in reverse order.
    for jx in 1:2 * n - 1 - length(_regs[4:end - 1])
        pre = pre * "\tpush\t$(__regs[jx])\n"
        post = "\tpop\t$(__regs[jx])\n" * post
    end

    body = ""

    # Calculate upper triangle
    body *= "\txor\t$(reg_32_bit(w[9])), $(reg_32_bit(w[9]))\n"
    body *= "\tmulx\t1*8($ap), $(w[1]), $(w[6])\n" # a0 a1
    body *= "\tmulx\t2*8($ap), $(w[2]), $(w[7])\n" # a0 a2
    body *= "\tmulx\t3*8($ap), $(w[3]), $(w[8])\n" # a0 a3
    body *= "\tmulx\t4*8($ap), $(w[4]), $(w[5])\n" # a0 a4
    body *= "\tadcx\t$(w[6]), $(w[2])\n"
    body *= "\tadcx\t$(w[7]), $(w[3])\n"
    body *= "\tadcx\t$(w[8]), $(w[4])\n"
    body *= "\tadcx\t$(w[9]), $(w[5])\n" # Add zero

    # Row of products with a1
    body *= "\tmov\t1*8($ap), %rdx\n"
    body *= "\tmulx\t2*8($ap), $(w[8]), $(w[6])\n" # a1 a2
    body *= "\tadcx\t$(w[8]), $(w[3])\n" # ADCX
    body *= "\tadcx\t$(w[6]), $(w[4])\n"
    body *= "\tmulx\t3*8($ap), $(w[8]), $(w[6])\n" # a1 a3
    body *= "\tadox\t$(w[8]), $(w[4])\n" # ADOX
    body *= "\tadox\t$(w[6]), $(w[5])\n"
    body *= "\tmulx\t4*8($ap), $(w[8]), $(w[6])\n" # a1 a4
    body *= "\tadcx\t$(w[8]), $(w[5])\n" # ADCX
    body *= "\tadcx\t$(w[9]), $(w[6])\n" # Add zero

    # Row of products with a2
    body *= "\tmov\t2*8($ap), %rdx\n"
    body *= "\tmulx\t3*8($ap), $(w[8]), $(w[7])\n" # a2 a3
    body *= "\tadcx\t$(w[8]), $(w[5])\n" # ADCX
    body *= "\tadcx\t$(w[7]), $(w[6])\n"
    body *= "\tmulx\t4*8($ap), $(w[8]), $(w[7])\n" # a2 a4
    body *= "\tadox\t$(w[8]), $(w[6])\n" # ADOX
    body *= "\tadox\t$(w[9]), $(w[7])\n" # Add zero

    # Row of products with a3
    body *= "\tmov\t3*8($ap), %rdx\n"
    body *= "\tmulx\t4*8($ap), %rdx, $(w[8])\n" # a3 a4
    body *= "\tadcx\t%rdx, $(w[7])\n" # ADCX
    body *= "\tadc\t$(w[9]), $(w[8])\n" # Add zero

    # Double upper triangle
    body *= "\tmov\t0*8($ap), %rdx\n"
    body *= "\tadd\t$(w[1]), $(w[1])\n"
    body *= "\tadc\t$(w[2]), $(w[2])\n"
    body *= "\tadc\t$(w[3]), $(w[3])\n"
    body *= "\tadc\t$(w[4]), $(w[4])\n"
    body *= "\tadc\t$(w[5]), $(w[5])\n"
    body *= "\tadc\t$(w[6]), $(w[6])\n"
    body *= "\tadc\t$(w[7]), $(w[7])\n"
    body *= "\tadc\t$(w[8]), $(w[8])\n"
    # w[9] is zero, so this leaves the carry out of the doubling in w[9]
    body *= "\tadc\t$(w[9]), $(w[9])\n"

    # Calculate diagonal and put into res
    body *= "\tmulx\t%rdx, %rdx, $s2\n" # a0^2
    body *= "\tmov\t%rdx, 0*8($res)\n"
    body *= "\tadd\t$s2, $(w[1])\n"
    body *= "\tmov\t1*8($ap), %rdx\n"
    body *= "\tmov\t$(w[1]), 1*8($res)\n"
    # w[1] has been stored, so its register is reused as the scrap register s1.
    # The entry is overwritten with a marker so that any later use of w[1]
    # would show up in the generated text.
    s1 = w[1]
    w[1] = "error w[1]"
    body *= "\tmulx\t%rdx, %rdx, $s2\n" # a1^2
    body *= "\tadc\t%rdx, $(w[2])\n"
    body *= "\tadc\t$s2, $(w[3])\n"
    body *= "\tmov\t2*8($ap), %rdx\n"
    body *= "\tmov\t$(w[2]), 2*8($res)\n"
    body *= "\tmov\t$(w[3]), 3*8($res)\n"
    body *= "\tmulx\t%rdx, $s1, $s2\n" # a2^2
    body *= "\tadc\t$s1, $(w[4])\n"
    body *= "\tadc\t$s2, $(w[5])\n"
    body *= "\tmov\t3*8($ap), %rdx\n"
    body *= "\tmov\t$(w[4]), 4*8($res)\n"
    body *= "\tmov\t$(w[5]), 5*8($res)\n"
    body *= "\tmulx\t%rdx, $s1, $s2\n" # a3^2
    body *= "\tadc\t$s1, $(w[6])\n"
    body *= "\tadc\t$s2, $(w[7])\n"
    body *= "\tmov\t4*8($ap), %rdx\n"
    body *= "\tmov\t$(w[6]), 6*8($res)\n"
    body *= "\tmov\t$(w[7]), 7*8($res)\n"
    body *= "\tmulx\t%rdx, $s1, $s2\n" # a4^2
    body *= "\tadc\t$s1, $(w[8])\n"
    body *= "\tadc\t$(w[9]), $s2\n" # Add the carry saved from the doubling
    body *= "\tmov\t$(w[8]), 8*8($res)\n"
    body *= "\tmov\t$s2, 9*8($res)\n"

    return pre * body * post * "\n\tret\n"
end

# Generate the function body for squaring six limbs.
#
# The emitted code works in three phases: accumulate the off-diagonal products
# ai aj (i < j) in the work registers w, double them, and finally add the
# squares ai^2 while storing the twelve result limbs to res.  The top limb is
# left in _regs[end].
function function_body_sqr_6()
    n = 6

    res = _regs[1]
    ap = _regs[2]

    this_regs = [_regs[4:end - 1]; __regs]

    # 2n - 1 work registers; the last one, w[11], starts out as zero.
    w = this_regs[1:2 * n - 1]

    s1 = "error s1"
    s2 = _regs[end] # Important that this is rax

    pre = "\tmov\t0*8($ap), %rdx\n"
    post = ""
    # All six registers of __regs are used here, so all of them are pushed;
    # pops are prepended so that they come out in reverse order.
    for jx in 1:6
        pre = pre * "\tpush\t$(__regs[jx])\n"
        post = "\tpop\t$(__regs[jx])\n" * post
    end

    body = ""

    # Calculate upper triangle
    body *= "\txor\t$(reg_32_bit(w[11])), $(reg_32_bit(w[11]))\n"
    body *= "\tmulx\t1*8($ap), $(w[1]), $(w[7])\n" # a0 a1
    body *= "\tmulx\t2*8($ap), $(w[2]), $(w[8])\n" # a0 a2
    body *= "\tmulx\t3*8($ap), $(w[3]), $(w[9])\n" # a0 a3
    body *= "\tmulx\t4*8($ap), $(w[4]), $(w[10])\n" # a0 a4
    body *= "\tmulx\t5*8($ap), $(w[5]), $(w[6])\n" # a0 a5
    body *= "\tadcx\t$(w[7]), $(w[2])\n"
    body *= "\tadcx\t$(w[8]), $(w[3])\n"
    body *= "\tadcx\t$(w[9]), $(w[4])\n"
    body *= "\tadcx\t$(w[10]), $(w[5])\n"
    body *= "\tadcx\t$(w[11]), $(w[6])\n" # Add zero

    # Row of products with a1
    body *= "\tmov\t1*8($ap), %rdx\n"
    body *= "\tmulx\t2*8($ap), $(w[10]), $(w[7])\n" # a1 a2
    body *= "\tadcx\t$(w[10]), $(w[3])\n" # ADCX
    body *= "\tadcx\t$(w[7]), $(w[4])\n"
    body *= "\tmulx\t3*8($ap), $(w[10]), $(w[7])\n" # a1 a3
    body *= "\tadox\t$(w[10]), $(w[4])\n" # ADOX
    body *= "\tadox\t$(w[7]), $(w[5])\n"
    body *= "\tmulx\t4*8($ap), $(w[10]), $(w[7])\n" # a1 a4
    body *= "\tadcx\t$(w[10]), $(w[5])\n" # ADCX
    body *= "\tadcx\t$(w[7]), $(w[6])\n"
    body *= "\tmulx\t5*8($ap), $(w[10]), $(w[7])\n" # a1 a5
    body *= "\tadox\t$(w[10]), $(w[6])\n" # ADOX
    body *= "\tadox\t$(w[11]), $(w[7])\n" # Add zero
    body *= "\tadcx\t$(w[11]), $(w[7])\n" # Add zero

    # Row of products with a2
    body *= "\tmov\t2*8($ap), %rdx\n"
    body *= "\tmulx\t3*8($ap), $(w[10]), $(w[8])\n" # a2 a3
    body *= "\tadcx\t$(w[10]), $(w[5])\n" # ADCX
    body *= "\tadcx\t$(w[8]), $(w[6])\n"
    body *= "\tmulx\t4*8($ap), $(w[10]), $(w[8])\n" # a2 a4
    body *= "\tadox\t$(w[10]), $(w[6])\n" # ADOX
    body *= "\tadox\t$(w[8]), $(w[7])\n"
    body *= "\tmulx\t5*8($ap), $(w[10]), $(w[8])\n" # a2 a5
    body *= "\tadcx\t$(w[10]), $(w[7])\n" # ADCX
    body *= "\tadcx\t$(w[11]), $(w[8])\n" # Add zero

    # Row of products with a3
    body *= "\tmov\t3*8($ap), %rdx\n"
    body *= "\tmulx\t4*8($ap), $(w[10]), $(w[9])\n" # a3 a4
    body *= "\tadcx\t$(w[10]), $(w[7])\n" # ADCX
    body *= "\tadcx\t$(w[9]), $(w[8])\n"
    body *= "\tmulx\t5*8($ap), $(w[10]), $(w[9])\n" # a3 a5
    body *= "\tadox\t$(w[10]), $(w[8])\n" # ADOX
    body *= "\tadox\t$(w[11]), $(w[9])\n" # Add zero

    # Row of products with a4
    body *= "\tmov\t4*8($ap), %rdx\n"
    body *= "\tmulx\t5*8($ap), %rdx, $(w[10])\n" # a4 a5
    body *= "\tadcx\t%rdx, $(w[9])\n" # ADCX
    body *= "\tadc\t$(w[11]), $(w[10])\n" # Add zero

    # Double upper triangle
    body *= "\tmov\t0*8($ap), %rdx\n"
    body *= "\tadd\t$(w[1]), $(w[1])\n"
    body *= "\tadc\t$(w[2]), $(w[2])\n"
    body *= "\tadc\t$(w[3]), $(w[3])\n"
    body *= "\tadc\t$(w[4]), $(w[4])\n"
    body *= "\tadc\t$(w[5]), $(w[5])\n"
    body *= "\tadc\t$(w[6]), $(w[6])\n"
    body *= "\tadc\t$(w[7]), $(w[7])\n"
    body *= "\tadc\t$(w[8]), $(w[8])\n"
    body *= "\tadc\t$(w[9]), $(w[9])\n"
    body *= "\tadc\t$(w[10]), $(w[10])\n"
    # The carry out of the doubling is kept in the low byte of w[11]
    body *= "\tsetc\t$(reg_8_bit(w[11]))\n"

    # Calculate diagonal and put into res
    body *= "\tmulx\t%rdx, %rdx, $s2\n" # a0^2
    body *= "\tmov\t%rdx, 0*8($res)\n"
    body *= "\tmov\t1*8($ap), %rdx\n"
    body *= "\tadd\t$s2, $(w[1])\n"
    body *= "\tmov\t$(w[1]), 1*8($res)\n"
    # w[1] has been stored, so its register is reused as the scrap register s1.
    # The entry is overwritten with a marker so that any later use of w[1]
    # would show up in the generated text.
    s1 = w[1]
    w[1] = "error w[1]"
    body *= "\tmulx\t%rdx, %rdx, $s2\n" # a1^2
    body *= "\tadc\t%rdx, $(w[2])\n"
    body *= "\tadc\t$s2, $(w[3])\n"
    body *= "\tmov\t2*8($ap), %rdx\n"
    body *= "\tmov\t$(w[2]), 2*8($res)\n"
    body *= "\tmov\t$(w[3]), 3*8($res)\n"
    body *= "\tmulx\t%rdx, $s1, $s2\n" # a2^2
    body *= "\tadc\t$s1, $(w[4])\n"
    body *= "\tadc\t$s2, $(w[5])\n"
    body *= "\tmov\t3*8($ap), %rdx\n"
    body *= "\tmov\t$(w[4]), 4*8($res)\n"
    body *= "\tmov\t$(w[5]), 5*8($res)\n"
    body *= "\tmulx\t%rdx, $s1, $s2\n" # a3^2
    body *= "\tadc\t$s1, $(w[6])\n"
    body *= "\tadc\t$s2, $(w[7])\n"
    body *= "\tmov\t4*8($ap), %rdx\n"
    body *= "\tmov\t$(w[6]), 6*8($res)\n"
    body *= "\tmov\t$(w[7]), 7*8($res)\n"
    body *= "\tmulx\t%rdx, $s1, $s2\n" # a4^2
    body *= "\tadc\t$s1, $(w[8])\n"
    body *= "\tadc\t$s2, $(w[9])\n"
    body *= "\tmov\t5*8($ap), %rdx\n"
    body *= "\tmov\t$(w[8]), 8*8($res)\n"
    body *= "\tmov\t$(w[9]), 9*8($res)\n"
    body *= "\tmulx\t%rdx, $s1, $s2\n" # a5^2
    body *= "\tadc\t$s1, $(w[10])\n"
    body *= "\tadc\t$(w[11]), $s2\n" # Add the carry saved from the doubling
    body *= "\tmov\t$(w[10]), 10*8($res)\n"
    body *= "\tmov\t$s2, 11*8($res)\n"

    return pre * body * post * "\n\tret\n"
end

# Build the instruction text for squaring a 7-word operand.
#
# The emitted code reads the seven 8-byte words at offsets 0*8 .. 6*8 from the
# `ap` register and stores fourteen 8-byte words at offsets 0*8 .. 13*8 through
# the `res` register.  It proceeds in the three stages marked below:
#   1. accumulate the off-diagonal products a_i a_j (i < j) into w[1..12],
#   2. double that partial sum in place,
#   3. add the squares a_i^2 and store each result word as soon as it is final.
# w[k] is the register that ends up being stored at offset k*8 of `res`.
#
# Returns the concatenation of the prologue, the body, the epilogue and a final
# `ret`, as a String.  Takes no arguments.
function function_body_sqr_7()
    n = 7

    # First two entries of _regs carry the result pointer and the operand pointer.
    res = _regs[1]
    ap = _regs[2]

    # Working pool: _regs[4:end - 1] followed by every register in __regs.
    # _regs[3] (%rdx) is left out; it is the implicit multiplicand of mulx below.
    this_regs = [_regs[4:end - 1]; __regs]

    # w[1..11] are real registers (w[4] is _regs[end]); w[12] starts out as a
    # marker string and only becomes a register once `zero` is released below.
    w = [this_regs[1:3]; _regs[end]; this_regs[5:11]; ["error w[$ix]" for ix in 12:2 * n - 2]]

    # Scratch names start as marker strings, so interpolating one of them before
    # it has been bound to a register puts this text into the generated assembly.
    s1 = "error s1"
    s2 = "error s2"
    zero = this_regs[4]

    # Prologue: load a0 into %rdx and push all six registers listed in __regs.
    # The epilogue is built by prepending, so the pops come out in reverse order.
    pre = "\tmov\t0*8($ap), %rdx\n"
    post = ""
    for jx in 1:6
        pre = pre * "\tpush\t$(__regs[jx])\n"
        post = "\tpop\t$(__regs[jx])\n" * post
    end

    body = ""

    # Calculate upper triangle
    # Clear the `zero` register; it is used as the addend that terminates the
    # adcx/adox chains further down.
    body *= "\txor\t$(reg_32_bit(zero)), $(reg_32_bit(zero))\n"
    # Row a0: products a0 * a1 .. a0 * a6.  High halves land in registers that
    # are not yet in use as accumulators and are folded in with adcx.
    body *= "\tmulx\t1*8($ap), $(w[1]), $(w[5])\n" # a0 a1
    body *= "\tmulx\t2*8($ap), $(w[2]), $(w[6])\n" # a0 a2
    body *= "\tmulx\t3*8($ap), $(w[3]), $(w[10])\n" # a0 a3
    body *= "\tadcx\t$(w[5]), $(w[2])\n"
    body *= "\tadcx\t$(w[6]), $(w[3])\n"
    body *= "\tmulx\t4*8($ap), $(w[4]), $(w[8])\n" # a0 a4
    body *= "\tmulx\t5*8($ap), $(w[5]), $(w[9])\n" # a0 a5
    body *= "\tmulx\t6*8($ap), $(w[6]), $(w[7])\n" # a0 a6
    body *= "\tadcx\t$(w[10]), $(w[4])\n"
    body *= "\tadcx\t$(w[8]), $(w[5])\n"
    # Load a1 for the next row while the a0 chain is still being finished.
    body *= "\tmov\t1*8($ap), %rdx\n"
    body *= "\tadcx\t$(w[9]), $(w[6])\n"
    body *= "\tadcx\t$zero, $(w[7])\n"

    # Row a1: products a1 * a2 .. a1 * a6, added into w[3..8] using the two
    # chains labelled ADCX and ADOX in the trailing comments.
    body *= "\tmulx\t2*8($ap), $(w[10]), $(w[8])\n" # a1 a2
    body *= "\tmulx\t3*8($ap), $(w[11]), $(w[9])\n" # a1 a3
    body *= "\tadcx\t$(w[10]), $(w[3])\n" # ADCX
    body *= "\tadcx\t$(w[8]), $(w[4])\n"
    body *= "\tadox\t$(w[11]), $(w[4])\n" # ADOX
    body *= "\tadox\t$(w[9]), $(w[5])\n"
    body *= "\tmulx\t4*8($ap), $(w[10]), $(w[8])\n" # a1 a4
    body *= "\tmulx\t5*8($ap), $(w[11]), $(w[9])\n" # a1 a5
    body *= "\tadcx\t$(w[10]), $(w[5])\n" # ADCX
    body *= "\tadcx\t$(w[8]), $(w[6])\n"
    body *= "\tmulx\t6*8($ap), $(w[10]), $(w[8])\n" # a1 a6
    body *= "\tadox\t$(w[11]), $(w[6])\n" # ADOX
    body *= "\tadox\t$(w[9]), $(w[7])\n"
    body *= "\tmov\t2*8($ap), %rdx\n"
    body *= "\tadcx\t$(w[10]), $(w[7])\n" # ADCX
    # w[8] holds the high half of a1 * a6; both chains are ended by adding zero
    # to it.
    body *= "\tadox\t$zero, $(w[8])\n"
    body *= "\tadcx\t$zero, $(w[8])\n"

    # Row a2: products a2 * a3 .. a2 * a6, added into w[5..9].
    body *= "\tmulx\t3*8($ap), $(w[11]), $(w[9])\n" # a2 a3
    body *= "\tadox\t$(w[11]), $(w[5])\n" # ADOX
    body *= "\tadox\t$(w[9]), $(w[6])\n"
    body *= "\tmulx\t4*8($ap), $(w[11]), $(w[9])\n" # a2 a4
    body *= "\tadcx\t$(w[11]), $(w[6])\n" # ADCX
    body *= "\tadcx\t$(w[9]), $(w[7])\n"
    body *= "\tmulx\t5*8($ap), $(w[11]), $(w[9])\n" # a2 a5
    body *= "\tadox\t$(w[11]), $(w[7])\n" # ADOX
    body *= "\tadox\t$(w[9]), $(w[8])\n"
    body *= "\tmulx\t6*8($ap), $(w[11]), $(w[9])\n" # a2 a6
    body *= "\tadcx\t$(w[11]), $(w[8])\n" # ADCX
    body *= "\tadox\t$zero, $(w[9])\n"
    body *= "\tmov\t3*8($ap), %rdx\n"
    body *= "\tadcx\t$zero, $(w[9])\n"

    # Row a3: products a3 * a4 .. a3 * a6, added into w[7..10].
    body *= "\tmulx\t4*8($ap), $(w[11]), $(w[10])\n" # a3 a4
    body *= "\tadcx\t$(w[11]), $(w[7])\n" # ADCX
    body *= "\tadcx\t$(w[10]), $(w[8])\n"
    body *= "\tmulx\t5*8($ap), $(w[11]), $(w[10])\n" # a3 a5
    body *= "\tadox\t$(w[11]), $(w[8])\n" # ADOX
    body *= "\tadox\t$(w[10]), $(w[9])\n"
    body *= "\tmulx\t6*8($ap), $(w[11]), $(w[10])\n" # a3 a6
    body *= "\tmov\t4*8($ap), %rdx\n"
    body *= "\tadcx\t$(w[11]), $(w[9])\n" # ADCX
    body *= "\tadcx\t$zero, $(w[10])\n"

    # From here on the register that held zero is reused as accumulator w[12];
    # the name `zero` is set to a marker string until it is rebound later.
    w[12] = zero
    zero = "error zero"
    # Row a4: products a4 * a5 and a4 * a6.
    body *= "\tmulx\t5*8($ap), $(w[12]), $(w[11])\n" # a4 a5
    body *= "\tadcx\t$(w[12]), $(w[9])\n" # ADCX
    body *= "\tadcx\t$(w[11]), $(w[10])\n"
    body *= "\tmulx\t6*8($ap), $(w[12]), $(w[11])\n" # a4 a6
    # No zero register is available at this point, so %rdx is cleared and used
    # as the zero addend of the following adox before a5 is loaded into it.
    body *= "\tmov\t\$0, %edx\n"
    body *= "\tadox\t$(w[12]), $(w[10])\n" # ADOX
    body *= "\tadox\t%rdx, $(w[11])\n"
    body *= "\tmov\t5*8($ap), %rdx\n"

    # Row a5: the single product a5 * a6; the low half overwrites %rdx.
    body *= "\tmulx\t6*8($ap), %rdx, $(w[12])\n" # a5 a6
    body *= "\tadcx\t%rdx, $(w[11])\n" # ADCX
    body *= "\tadc\t\$0, $(w[12])\n"
    # NOTE(review): presumably emitted only for its effect on the flags (a clean
    # CF/OF before the chains below); the register contents are not modified.
    # Confirm against the instruction set reference.
    body *= "\ttest\t%al, %al\n"

    # Double upper triangle
    # Reload a0 for the diagonal stage, then add every accumulator to itself
    # with carry propagation from w[1] up to w[12].
    body *= "\tmov\t0*8($ap), %rdx\n"
    body *= "\tadcx\t$(w[1]), $(w[1])\n"
    body *= "\tadcx\t$(w[2]), $(w[2])\n"
    body *= "\tadcx\t$(w[3]), $(w[3])\n"
    body *= "\tadcx\t$(w[4]), $(w[4])\n"
    body *= "\tadcx\t$(w[5]), $(w[5])\n"
    body *= "\tadcx\t$(w[6]), $(w[6])\n"
    # Spill the already doubled w[4]: its register is taken over as scratch s2
    # below, and the value is popped into another register later.
    body *= "\tpush\t$(w[4])\n"
    body *= "\tadcx\t$(w[7]), $(w[7])\n"
    body *= "\tadcx\t$(w[8]), $(w[8])\n"
    body *= "\tadcx\t$(w[9]), $(w[9])\n"
    body *= "\tadcx\t$(w[10]), $(w[10])\n"
    body *= "\tadcx\t$(w[11]), $(w[11])\n"
    body *= "\tadcx\t$(w[12]), $(w[12])\n"

    # Calculate diagonal and put into res
    # The squares are added with adox only.  s2 takes over the register that
    # w[4] occupied; w[4] is a marker string until it is rebound below.
    s2 = w[4]
    w[4] = "error w[4]"
    body *= "\tmulx\t%rdx, %rdx, $s2\n" # a0^2
    body *= "\tmov\t%rdx, 0*8($res)\n"
    body *= "\tadox\t$s2, $(w[1])\n"
    body *= "\tmov\t1*8($ap), %rdx\n"
    body *= "\tmov\t$(w[1]), 1*8($res)\n"
    # w[1] has been stored, so its register becomes the new home of w[4]; the
    # spilled value is popped into it a few instructions further down.
    w[4] = w[1]
    w[1] = "error w[1]"
    body *= "\tmulx\t%rdx, %rdx, $s2\n" # a1^2
    body *= "\tadox\t%rdx, $(w[2])\n"
    body *= "\tadox\t$s2, $(w[3])\n"
    body *= "\tpop\t$(w[4])\n"
    body *= "\tmov\t2*8($ap), %rdx\n"
    body *= "\tmov\t$(w[2]), 2*8($res)\n"
    body *= "\tmov\t$(w[3]), 3*8($res)\n"
    # w[2] has been stored; its register becomes the second scratch s1.
    s1 = w[2]
    w[2] = "error w[2]"
    body *= "\tmulx\t%rdx, %rdx, $s2\n" # a2^2
    body *= "\tadox\t%rdx, $(w[4])\n"
    body *= "\tadox\t$s2, $(w[5])\n"
    body *= "\tmov\t3*8($ap), %rdx\n"
    body *= "\tmov\t$(w[4]), 4*8($res)\n"
    body *= "\tmov\t$(w[5]), 5*8($res)\n"
    # w[4] has been stored; its register is kept aside to serve as the zero
    # register for the final carry additions.
    zero = w[4]
    w[4] = "error w[4]"
    body *= "\tmulx\t%rdx, $s1, $s2\n" # a3^2
    body *= "\tadox\t$s1, $(w[6])\n"
    body *= "\tadox\t$s2, $(w[7])\n"
    body *= "\tmov\t4*8($ap), %rdx\n"
    body *= "\tmov\t$(w[6]), 6*8($res)\n"
    body *= "\tmov\t$(w[7]), 7*8($res)\n"
    body *= "\tmulx\t%rdx, $s1, $s2\n" # a4^2
    body *= "\tadox\t$s1, $(w[8])\n"
    body *= "\tadox\t$s2, $(w[9])\n"
    body *= "\tmov\t5*8($ap), %rdx\n"
    body *= "\tmov\t$(w[8]), 8*8($res)\n"
    body *= "\tmov\t$(w[9]), 9*8($res)\n"
    body *= "\tmulx\t%rdx, $s1, $s2\n" # a5^2
    body *= "\tadox\t$s1, $(w[10])\n"
    body *= "\tadox\t$s2, $(w[11])\n"
    # The zero register is cleared with mov rather than xor here.
    # NOTE(review): presumably because the carries still pending in the flags
    # must survive this instruction - confirm.
    body *= "\tmov\t\$0, $(reg_32_bit(zero))\n"
    body *= "\tmov\t6*8($ap), %rdx\n"
    body *= "\tmov\t$(w[10]), 10*8($res)\n"
    body *= "\tmov\t$(w[11]), 11*8($res)\n"
    body *= "\tmulx\t%rdx, $s1, $s2\n" # a6^2
    body *= "\tadox\t$s1, $(w[12])\n"
    # Top word: the high half of a6^2 plus whatever is still pending on the adox
    # chain and on the adcx chain (both folded in by adding the zero register).
    body *= "\tadox\t$zero, $s2\n"
    body *= "\tadcx\t$zero, $s2\n"
    body *= "\tmov\t$(w[12]), 12*8($res)\n"
    body *= "\tmov\t$s2, 13*8($res)\n"

    return pre * body * post * "\n\tret\n"
end

###############################################################################
# Generate file
###############################################################################

# Write the complete routine `flint_mpn_mul_<m>_<n>` to `file`.
#
# Arguments:
#   m, n  operand sizes that select the generator and appear in the routine name
#   file  writable stream that receives the generated text
#
# The text placed before and after the instruction body comes from
# `function_pre_post`.  The body itself is produced by
#   * `function_body_1`, `function_body_2_1`, `function_body_2` for the sizes
#     (1, 1), (2, 1) and (2, 2),
#   * `function_body_M_1` for every other size with n == 1,
#   * `function_body_M_2` for m > 8 with n == 2,
#   * `function_body_M` for everything else.
#
# Returns the value returned by `write`.
function gen_mul(m::Int, n::Int, file)
    (pre, post) = function_pre_post("flint_mpn_mul_$(m)_$n")

    functionbody = if m == 1 && n == 1
        function_body_1(m, n)
    elseif m == 2 && n == 1
        function_body_2_1(m, n)
    elseif m == 2 && n == 2
        function_body_2(m, n)
    elseif n == 1
        # The macros used by this body are written once by the file generator,
        # so nothing besides the body is produced here.
        function_body_M_1(m)
    elseif m > 8 && n == 2
        function_body_M_2(m)
    else
        function_body_M(m, n)
    end

    return write(file, "$pre$functionbody$post\n")
end

# Write the complete routine `flint_mpn_sqr_<m>` to `file`.
#
# Arguments:
#   m     operand size; a dedicated generator exists for 1 through 7
#   file  writable stream that receives the generated text
#
# Throws an error for any other `m`.  Returns the value returned by `write`.
function gen_sqr(m::Int, file)
    (pre, post) = function_pre_post("flint_mpn_sqr_$m")

    # Generator for size k sits at position k of this table.
    generators = (
        function_body_sqr_1,
        function_body_sqr_2,
        function_body_sqr_3,
        function_body_sqr_4,
        function_body_sqr_5,
        function_body_sqr_6,
        function_body_sqr_7,
    )
    1 <= m <= length(generators) || error("This won't work for m = $m")
    functionbody = generators[m]()

    return write(file, "$pre$functionbody$post\n")
end

# Regenerate both assembly files under ../src/mpn_extras/broadwell, relative to
# the directory containing this script.
#
#   mul_hard.asm  header, all macro definitions, then flint_mpn_mul_<m>_<n> for
#                 m in 1:16 and n in 1:min(m, 8)
#   sqr_hard.asm  header, then flint_mpn_sqr_<m> for m in 1:7
#
# Each file is opened with the do-block form of `open`, so it is closed even
# when one of the generators throws.  Returns `nothing`.
function gen_all()
    asm_dir = joinpath(@__DIR__, "..", "src", "mpn_extras", "broadwell")

    open(joinpath(asm_dir, "mul_hard.asm"), "w") do file
        write(file, "$copyright\n$preamble\n")

        # Macro definitions come first, in a fixed order.
        for mx in 3:16
            write(file, mul_1_macro(mx) * "\n")
        end
        for mx in 3:8
            write(file, mulM_macro(mx) * "\n")
        end

        # Sizes for which a chained variant of the mulM macro is emitted: for
        # every m in 9:16, the value N = M_2_getN(m) when it fits into m at
        # least twice, and the remainder m % N when that exceeds 1.
        chain_sizes = Int[]
        for mx in 9:16
            N = M_2_getN(mx)
            if mx ÷ N >= 2
                push!(chain_sizes, N)
            end
            if mx % N > 1
                push!(chain_sizes, mx % N)
            end
        end
        unique!(chain_sizes)
        for mx in chain_sizes
            write(file, mulM_macro(mx, chain = true) * "\n")
        end

        for mx in 2:8
            write(file, addmulM_macro(mx) * "\n")
        end

        # The routines themselves.
        for mx in 1:16
            for nx in 1:min(mx, 8)
                gen_mul(mx, nx, file)
            end
        end
    end

    open(joinpath(asm_dir, "sqr_hard.asm"), "w") do file
        write(file, "$copyright\n$preamble\n")
        for mx in 1:7
            gen_sqr(mx, file)
        end
    end

    return nothing
end