Q_LIMBS = [
0x0aab, 0x1ffd, 0x1fff, 0x1dff, 0x1b9f, 0x1fff,
0x054f, 0x1fd6, 0x0bff, 0x00f5, 0x1d89, 0x0d61,
0x0a0f, 0x1869, 0x1d9c, 0x0257, 0x1385, 0x1c27,
0x1dd2, 0x0ec8, 0x1acd, 0x01a5, 0x1ed9, 0x0374,
0x1a4b, 0x1f34, 0x0e5f, 0x03d4, 0x0011, 0x000d,
]
INV_Q = 0x1ffd
N = 30
def t(i):
return f"t{i}"
def gen_carry_propagation(indent=" "):
lines = []
lines.append(f"{indent}// Phase 4: Carry propagation + shift (fully unrolled)")
lines.append(f"{indent}var carry: u32 = {t(0)} >> 13u;")
lines.append(f"{indent}var v: u32;")
for j in range(1, N):
lines.append(f"{indent}v = {t(j)} + carry; {t(j-1)} = v & 0x1FFFu; carry = v >> 13u;")
lines.append(f"{indent}v = {t(N)} + carry; {t(N-1)} = v & 0x1FFFu; {t(N)} = v >> 13u; {t(N+1)} = 0u;")
return "\n".join(lines)
def gen_mul_montgomery():
lines = []
lines.append("fn mul_montgomery_u384(a: U384, b: U384) -> U384 {")
for i in range(N + 2):
lines.append(f" var {t(i)}: u32 = 0u;")
lines.append("")
lines.append(f" for (var i: u32 = 0u; i < {N}u; i = i + 1u) {{")
lines.append(" let ai = a.limbs[i];")
lines.append("")
lines.append(" // Phase 1: Multiply — a[i] * b[j] (unrolled)")
for j in range(N):
lines.append(f" {t(j)} = {t(j)} + ai * b.limbs[{j}u];")
lines.append("")
lines.append(f" // Phase 2: m = t0 * (-q^{{-1}}) mod 2^13")
lines.append(f" let m = ({t(0)} * {hex(INV_Q)}u) & 0x1FFFu;")
lines.append("")
lines.append(" // Phase 3: Reduce — m * q[j] (unrolled, literal constants)")
for j in range(N):
lines.append(f" {t(j)} = {t(j)} + m * {hex(Q_LIMBS[j])}u;")
lines.append("")
lines.append(gen_carry_propagation())
lines.append(" }")
lines.append("")
lines.append(" var result: U384;")
for i in range(N):
lines.append(f" result.limbs[{i}u] = {t(i)};")
lines.append("")
lines.append(" // Conditional subtraction if result >= q")
lines.append(" var is_gte = true;")
lines.append(f" if {t(N)} > 0u {{")
lines.append(" is_gte = true;")
lines.append(" } else {")
lines.append(f" for (var i: u32 = {N-1}u; i < {N}u; i = i - 1u) {{")
lines.append(" if result.limbs[i] > Q_MODULUS[i] { break; }")
lines.append(" if result.limbs[i] < Q_MODULUS[i] { is_gte = false; break; }")
lines.append(" if i == 0u { break; }")
lines.append(" }")
lines.append(" }")
lines.append(" if is_gte {")
lines.append(" result = sub_u384(result, U384(Q_MODULUS));")
lines.append(" }")
lines.append(" return result;")
lines.append("}")
return "\n".join(lines)
def gen_sqr_montgomery():
lines = []
lines.append("fn sqr_montgomery_u384(a: U384) -> U384 {")
for i in range(N + 2):
lines.append(f" var {t(i)}: u32 = 0u;")
lines.append("")
lines.append(f" for (var i: u32 = 0u; i < {N}u; i = i + 1u) {{")
lines.append(" let ai = a.limbs[i];")
lines.append("")
lines.append(" // Phase 1: Multiply — a[i] * a[j] (squaring)")
for j in range(N):
lines.append(f" {t(j)} = {t(j)} + ai * a.limbs[{j}u];")
lines.append("")
lines.append(f" let m = ({t(0)} * {hex(INV_Q)}u) & 0x1FFFu;")
lines.append("")
lines.append(" // Phase 3: Reduce (literal Q constants)")
for j in range(N):
lines.append(f" {t(j)} = {t(j)} + m * {hex(Q_LIMBS[j])}u;")
lines.append("")
lines.append(gen_carry_propagation())
lines.append(" }")
lines.append("")
lines.append(" var result: U384;")
for i in range(N):
lines.append(f" result.limbs[{i}u] = {t(i)};")
lines.append("")
lines.append(" var is_gte = true;")
lines.append(f" if {t(N)} > 0u {{")
lines.append(" is_gte = true;")
lines.append(" } else {")
lines.append(f" for (var i: u32 = {N-1}u; i < {N}u; i = i - 1u) {{")
lines.append(" if result.limbs[i] > Q_MODULUS[i] { break; }")
lines.append(" if result.limbs[i] < Q_MODULUS[i] { is_gte = false; break; }")
lines.append(" if i == 0u { break; }")
lines.append(" }")
lines.append(" }")
lines.append(" if is_gte {")
lines.append(" result = sub_u384(result, U384(Q_MODULUS));")
lines.append(" }")
lines.append(" return result;")
lines.append("}")
return "\n".join(lines)
if __name__ == "__main__":
print("// ============================================================================")
print("// BASE FIELD MONTGOMERY MULTIPLICATION (F_q) — Lazy CIOS with 13-bit limbs")
print("//")
print("// Scalar accumulator variables (OPT-17), literal Q constants (OPT-26).")
print("// Eliminates array spilling and indexing overhead.")
print("// Generated by scripts/gen_fp_unrolled.py")
print("// ============================================================================")
print()
print(gen_mul_montgomery())
print()
print("// ============================================================================")
print("// BASE FIELD MONTGOMERY SQUARING (F_q) — Dedicated squaring (OPT-4)")
print("//")
print("// Scalar accumulators + single input = minimal register pressure.")
print("// Generated by scripts/gen_fp_unrolled.py")
print("// ============================================================================")
print()
print(gen_sqr_montgomery())