trueno 0.17.1

High-performance SIMD compute library with GPU support for matrix operations
//! AVX2 SIMD Microkernels
//!
//! Contains four kernels of increasing specialization:
//! - `microkernel_8x6_avx2`: Basic AVX2 intrinsics
//! - `microkernel_8x6_avx2_asm`: Intrinsics with 4-way K unrolling
//! - `microkernel_8x6_true_asm`: True inline ASM with software pipelining
//! - `microkernel_8x8_avx2_fma`: 8x8 tile variant with 4-way K unrolling

use super::super::{MR, NR};

/// AVX2 microkernel (8x6 output tile)
///
/// Intended register allocation (Smith et al., 2014); with intrinsics the compiler
/// makes the final assignment:
/// - ymm0-ymm5: 6 columns of C (8 f32 each) = 48 outputs kept in registers
/// - ymm6-ymm7: A column loads
/// - ymm8-ymm13: B values (one broadcast per column)
///
/// Performance target: 70%+ FMA utilization
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2", enable = "fma")]
// SAFETY: Caller ensures AVX2+FMA are available, pointers are valid, and dimensions are correct
pub unsafe fn microkernel_8x6_avx2(
    k: usize,
    a: *const f32, // MR x K packed, column-major
    b: *const f32, // K x NR packed, row-major
    c: *mut f32,   // MR x NR output, column-major
    ldc: usize,    // Leading dimension of C
) {
    unsafe {
        use std::arch::x86_64::*;

        // Load C into registers (6 columns of 8 elements each)
        let mut c0 = _mm256_loadu_ps(c);
        let mut c1 = _mm256_loadu_ps(c.add(ldc));
        let mut c2 = _mm256_loadu_ps(c.add(2 * ldc));
        let mut c3 = _mm256_loadu_ps(c.add(3 * ldc));
        let mut c4 = _mm256_loadu_ps(c.add(4 * ldc));
        let mut c5 = _mm256_loadu_ps(c.add(5 * ldc));

        // Main loop: accumulate A * B into C
        for p in 0..k {
            // Load A column (8 elements)
            let a_col = _mm256_loadu_ps(a.add(p * MR));

            // Load B row elements and broadcast
            let b0 = _mm256_set1_ps(*b.add(p * NR));
            let b1 = _mm256_set1_ps(*b.add(p * NR + 1));
            let b2 = _mm256_set1_ps(*b.add(p * NR + 2));
            let b3 = _mm256_set1_ps(*b.add(p * NR + 3));
            let b4 = _mm256_set1_ps(*b.add(p * NR + 4));
            let b5 = _mm256_set1_ps(*b.add(p * NR + 5));

            // FMA: c[j] += a * b[j]
            c0 = _mm256_fmadd_ps(a_col, b0, c0);
            c1 = _mm256_fmadd_ps(a_col, b1, c1);
            c2 = _mm256_fmadd_ps(a_col, b2, c2);
            c3 = _mm256_fmadd_ps(a_col, b3, c3);
            c4 = _mm256_fmadd_ps(a_col, b4, c4);
            c5 = _mm256_fmadd_ps(a_col, b5, c5);
        }

        // Store C back to memory
        _mm256_storeu_ps(c, c0);
        _mm256_storeu_ps(c.add(ldc), c1);
        _mm256_storeu_ps(c.add(2 * ldc), c2);
        _mm256_storeu_ps(c.add(3 * ldc), c3);
        _mm256_storeu_ps(c.add(4 * ldc), c4);
        _mm256_storeu_ps(c.add(5 * ldc), c5);
    }
}
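
// Illustrative sketch (not in the original file): a minimal test of the packed-layout
// contract these kernels assume, checked against a scalar reference. It assumes MR = 8
// and NR = 6 from the parent module; the module name, test name, and tolerance are
// hypothetical.
#[cfg(all(test, target_arch = "x86_64"))]
mod microkernel_contract_sketch {
    use super::*;

    // Scalar reference for C[i, j] += sum_p A[i, p] * B[p, j] with the packed layouts:
    // A is MR x K column-major (a[p * MR + i]), B is K x NR row-major (b[p * NR + j]),
    // C is MR x NR column-major with leading dimension ldc.
    fn scalar_ref(k: usize, a: &[f32], b: &[f32], c: &mut [f32], ldc: usize) {
        for p in 0..k {
            for j in 0..NR {
                for i in 0..MR {
                    c[j * ldc + i] += a[p * MR + i] * b[p * NR + j];
                }
            }
        }
    }

    #[test]
    fn basic_kernel_matches_scalar_reference() {
        if !(std::is_x86_feature_detected!("avx2") && std::is_x86_feature_detected!("fma")) {
            return; // Skip on hardware without AVX2+FMA.
        }
        let k = 7;
        let a: Vec<f32> = (0..MR * k).map(|x| x as f32 * 0.5).collect();
        let b: Vec<f32> = (0..k * NR).map(|x| x as f32 * 0.25).collect();
        let ldc = MR;
        let mut c_simd = vec![0.0f32; NR * ldc];
        let mut c_ref = vec![0.0f32; NR * ldc];

        // SAFETY: features checked above; buffers are sized for an 8x6 tile with ldc = MR.
        unsafe { microkernel_8x6_avx2(k, a.as_ptr(), b.as_ptr(), c_simd.as_mut_ptr(), ldc) };
        scalar_ref(k, &a, &b, &mut c_ref, ldc);

        for (x, y) in c_simd.iter().zip(&c_ref) {
            assert!((x - y).abs() <= 1e-3 * y.abs().max(1.0));
        }
    }
}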

/// Intrinsics microkernel with 4-way K unrolling and software pipelining (8x6 output tile)
///
/// Despite the `_asm` suffix, this variant is written with intrinsics; the true inline ASM
/// version is `microkernel_8x6_true_asm` below. Targets 70%+ FMA utilization through
/// source-level instruction scheduling. Key optimizations:
/// - 4-way K unrolling for software pipelining
/// - Loads interleaved with FMAs (~10-12 instruction distance) to hide the ~5 cycle FMA latency
/// - Working set sized to fit the 16 YMM registers, avoiding spills
///
/// # References
///
/// - Agner Fog (2024). Optimizing subroutines in assembly language, Section 12.7
/// - Intel® 64 and IA-32 Architectures Optimization Reference Manual
///
/// # Performance Model
///
/// On Haswell+ (2 FMA units, ports 0 and 1):
/// - Per K iteration: 6 FMAs (48 f32 ops)
/// - 4-way unroll: 24 FMAs per macro-iteration
/// - Target: 2 FMAs/cycle sustained = 70%+ utilization
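///
/// Worked example (assumed 3.0 GHz clock, not a measured figure): 2 FMA ports x 8 f32
/// lanes x 2 flops = 32 flops/cycle, i.e. ~96 GFLOP/s peak per core, so the 70% target
/// corresponds to roughly 67 GFLOP/s from this kernel alone.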
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2", enable = "fma")]
// SAFETY: Caller ensures AVX2+FMA are available, pointers are valid, k >= 4 for asm path
pub unsafe fn microkernel_8x6_avx2_asm(
    k: usize,
    a: *const f32, // MR x K packed, column-major
    b: *const f32, // K x NR packed, row-major
    c: *mut f32,   // MR x NR output, column-major
    ldc: usize,    // Leading dimension of C
) {
    unsafe {
        use std::arch::x86_64::*;

        // Handle k < 4 with intrinsics fallback
        if k < 4 {
            microkernel_8x6_avx2(k, a, b, c, ldc);
            return;
        }

        // Load C into registers
        let mut c0 = _mm256_loadu_ps(c);
        let mut c1 = _mm256_loadu_ps(c.add(ldc));
        let mut c2 = _mm256_loadu_ps(c.add(2 * ldc));
        let mut c3 = _mm256_loadu_ps(c.add(3 * ldc));
        let mut c4 = _mm256_loadu_ps(c.add(4 * ldc));
        let mut c5 = _mm256_loadu_ps(c.add(5 * ldc));

        let k_unrolled = k / 4;
        let k_remainder = k % 4;

        // Main loop: 4-way unrolled for software pipelining
        // Each iteration processes 4 K values
        for p in 0..k_unrolled {
            let base_p = p * 4;

            // Iteration 0: Load A[p*4+0], compute with B[p*4+0]
            let a0 = _mm256_loadu_ps(a.add((base_p) * MR));
            let b00 = _mm256_broadcast_ss(&*b.add((base_p) * NR));
            let b01 = _mm256_broadcast_ss(&*b.add((base_p) * NR + 1));
            let b02 = _mm256_broadcast_ss(&*b.add((base_p) * NR + 2));
            let b03 = _mm256_broadcast_ss(&*b.add((base_p) * NR + 3));
            let b04 = _mm256_broadcast_ss(&*b.add((base_p) * NR + 4));
            let b05 = _mm256_broadcast_ss(&*b.add((base_p) * NR + 5));

            // Iteration 1: Load A[p*4+1], start FMAs for iteration 0
            let a1 = _mm256_loadu_ps(a.add((base_p + 1) * MR));
            c0 = _mm256_fmadd_ps(a0, b00, c0);
            c1 = _mm256_fmadd_ps(a0, b01, c1);
            c2 = _mm256_fmadd_ps(a0, b02, c2);

            let b10 = _mm256_broadcast_ss(&*b.add((base_p + 1) * NR));
            let b11 = _mm256_broadcast_ss(&*b.add((base_p + 1) * NR + 1));
            let b12 = _mm256_broadcast_ss(&*b.add((base_p + 1) * NR + 2));

            c3 = _mm256_fmadd_ps(a0, b03, c3);
            c4 = _mm256_fmadd_ps(a0, b04, c4);
            c5 = _mm256_fmadd_ps(a0, b05, c5);

            let b13 = _mm256_broadcast_ss(&*b.add((base_p + 1) * NR + 3));
            let b14 = _mm256_broadcast_ss(&*b.add((base_p + 1) * NR + 4));
            let b15 = _mm256_broadcast_ss(&*b.add((base_p + 1) * NR + 5));

            // Iteration 2: Load A[p*4+2], FMAs for iteration 1
            let a2 = _mm256_loadu_ps(a.add((base_p + 2) * MR));
            c0 = _mm256_fmadd_ps(a1, b10, c0);
            c1 = _mm256_fmadd_ps(a1, b11, c1);
            c2 = _mm256_fmadd_ps(a1, b12, c2);

            let b20 = _mm256_broadcast_ss(&*b.add((base_p + 2) * NR));
            let b21 = _mm256_broadcast_ss(&*b.add((base_p + 2) * NR + 1));
            let b22 = _mm256_broadcast_ss(&*b.add((base_p + 2) * NR + 2));

            c3 = _mm256_fmadd_ps(a1, b13, c3);
            c4 = _mm256_fmadd_ps(a1, b14, c4);
            c5 = _mm256_fmadd_ps(a1, b15, c5);

            let b23 = _mm256_broadcast_ss(&*b.add((base_p + 2) * NR + 3));
            let b24 = _mm256_broadcast_ss(&*b.add((base_p + 2) * NR + 4));
            let b25 = _mm256_broadcast_ss(&*b.add((base_p + 2) * NR + 5));

            // Iteration 3: Load A[p*4+3], FMAs for iteration 2
            let a3 = _mm256_loadu_ps(a.add((base_p + 3) * MR));
            c0 = _mm256_fmadd_ps(a2, b20, c0);
            c1 = _mm256_fmadd_ps(a2, b21, c1);
            c2 = _mm256_fmadd_ps(a2, b22, c2);

            let b30 = _mm256_broadcast_ss(&*b.add((base_p + 3) * NR));
            let b31 = _mm256_broadcast_ss(&*b.add((base_p + 3) * NR + 1));
            let b32 = _mm256_broadcast_ss(&*b.add((base_p + 3) * NR + 2));

            c3 = _mm256_fmadd_ps(a2, b23, c3);
            c4 = _mm256_fmadd_ps(a2, b24, c4);
            c5 = _mm256_fmadd_ps(a2, b25, c5);

            let b33 = _mm256_broadcast_ss(&*b.add((base_p + 3) * NR + 3));
            let b34 = _mm256_broadcast_ss(&*b.add((base_p + 3) * NR + 4));
            let b35 = _mm256_broadcast_ss(&*b.add((base_p + 3) * NR + 5));

            // FMAs for iteration 3
            c0 = _mm256_fmadd_ps(a3, b30, c0);
            c1 = _mm256_fmadd_ps(a3, b31, c1);
            c2 = _mm256_fmadd_ps(a3, b32, c2);
            c3 = _mm256_fmadd_ps(a3, b33, c3);
            c4 = _mm256_fmadd_ps(a3, b34, c4);
            c5 = _mm256_fmadd_ps(a3, b35, c5);
        }

        // Handle remainder (k % 4)
        let base_p = k_unrolled * 4;
        for p in 0..k_remainder {
            let pp = base_p + p;
            let a_col = _mm256_loadu_ps(a.add(pp * MR));
            let b0 = _mm256_broadcast_ss(&*b.add(pp * NR));
            let b1 = _mm256_broadcast_ss(&*b.add(pp * NR + 1));
            let b2 = _mm256_broadcast_ss(&*b.add(pp * NR + 2));
            let b3 = _mm256_broadcast_ss(&*b.add(pp * NR + 3));
            let b4 = _mm256_broadcast_ss(&*b.add(pp * NR + 4));
            let b5 = _mm256_broadcast_ss(&*b.add(pp * NR + 5));

            c0 = _mm256_fmadd_ps(a_col, b0, c0);
            c1 = _mm256_fmadd_ps(a_col, b1, c1);
            c2 = _mm256_fmadd_ps(a_col, b2, c2);
            c3 = _mm256_fmadd_ps(a_col, b3, c3);
            c4 = _mm256_fmadd_ps(a_col, b4, c4);
            c5 = _mm256_fmadd_ps(a_col, b5, c5);
        }

        // Store C back to memory
        _mm256_storeu_ps(c, c0);
        _mm256_storeu_ps(c.add(ldc), c1);
        _mm256_storeu_ps(c.add(2 * ldc), c2);
        _mm256_storeu_ps(c.add(3 * ldc), c3);
        _mm256_storeu_ps(c.add(4 * ldc), c4);
        _mm256_storeu_ps(c.add(5 * ldc), c5);
    }
}

/// Phase 2c: True hand-written inline ASM microkernel (8x6 output tile)
///
/// Targets 70%+ FMA utilization through explicit instruction scheduling.
/// Key differences from intrinsics-based version:
/// - All register allocation is explicit and fixed
/// - 4-deep pipeline buffer fills before main loop
/// - 12+ instruction distance between load and FMA use
/// - No compiler reordering possible
///
/// # Register Allocation (Fixed)
///
/// - ymm0-ymm5: C accumulators (6 columns x 8 rows = 48 outputs)
/// - ymm6-ymm9: A pipeline buffer (4-deep for software pipelining)
/// - ymm10-ymm15: B broadcasts (6 columns)
///
/// # Performance Model (Haswell+)
///
/// - 2 FMA units (ports 0, 1), each with 5-cycle latency
/// - Need 10-12 independent instructions between load and use
/// - 4-way K unroll provides 24 FMAs per macro-iteration
/// - Target: 2 FMAs/cycle sustained = 70%+ utilization
///
/// # References
///
/// - Agner Fog (2024). Optimizing subroutines in assembly language, Section 12.7
/// - Intel(R) 64 and IA-32 Architectures Optimization Reference Manual
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2", enable = "fma")]
// SAFETY: Caller ensures AVX2+FMA are available and pointers are valid for the tile
// dimensions. The pipelined main loop also reloads the next four A columns on its final
// iteration, so the packed A panel must stay readable for up to 4 * MR f32 (128 bytes)
// past the last column this kernel consumes.
pub unsafe fn microkernel_8x6_true_asm(
    k: usize,
    a: *const f32,
    b: *const f32,
    c: *mut f32,
    ldc: usize,
) {
    unsafe {
        use std::arch::asm;

        // Handle k < 4 with intrinsics fallback for correctness
        if k < 4 {
            microkernel_8x6_avx2(k, a, b, c, ldc);
            return;
        }

        // ldc in bytes for pointer arithmetic
        let ldc_bytes = ldc * 4;

        asm!(
            // ================================================================
            // Load C into ymm0-ymm5 (6 columns of 8 elements each)
            // ================================================================
            "vmovups ymm0, [{c_ptr}]",
            "vmovups ymm1, [{c_ptr} + {ldc}]",
            "vmovups ymm2, [{c_ptr} + {ldc}*2]",
            "lea {tmp}, [{c_ptr} + {ldc}*2]",
            "vmovups ymm3, [{tmp} + {ldc}]",
            "vmovups ymm4, [{tmp} + {ldc}*2]",
            "lea {tmp}, [{tmp} + {ldc}*2]",
            "vmovups ymm5, [{tmp} + {ldc}]",

            // ================================================================
            // Pipeline Prologue: Fill A buffer with A[0], A[1], A[2], A[3]
            // This creates the 4-deep software pipeline
            // ================================================================
            "vmovups ymm6, [{a_ptr}]",         // A[0]
            "vmovups ymm7, [{a_ptr} + 32]",    // A[1]
            "vmovups ymm8, [{a_ptr} + 64]",    // A[2]
            "vmovups ymm9, [{a_ptr} + 96]",    // A[3]
            "add {a_ptr}, 128",                // a_ptr now points to A[4]

            // ================================================================
            // Main Loop Setup
            // Process 4 K iterations per loop iteration (4-way unroll)
            // ================================================================
            "mov {k_cnt}, {k}",
            "shr {k_cnt}, 2",                  // k_cnt = k / 4
            "test {k_cnt}, {k_cnt}",
            "jz 2f",                           // Skip if k < 4 (handled above, but be safe)

            // ================================================================
            // Main Loop: 4-way unrolled with software pipelining
            // Each iteration: use A[k], A[k+1], A[k+2], A[k+3]
            //                 load A[k+4], A[k+5], A[k+6], A[k+7] for next iter
            // 12+ instructions between load and use
            // ================================================================
            ".p2align 4",                      // Align loop for better I-cache
            "3:",

            // --- K iteration 0: Use ymm6 (A[0]), load next A[4] into ymm6 ---
            "vbroadcastss ymm10, dword ptr [{b_ptr}]",
            "vbroadcastss ymm11, dword ptr [{b_ptr} + 4]",
            "vbroadcastss ymm12, dword ptr [{b_ptr} + 8]",
            "vfmadd231ps ymm0, ymm6, ymm10",   // c0 += a0 * b0
            "vfmadd231ps ymm1, ymm6, ymm11",   // c1 += a0 * b1
            "vfmadd231ps ymm2, ymm6, ymm12",   // c2 += a0 * b2
            "vbroadcastss ymm13, dword ptr [{b_ptr} + 12]",
            "vbroadcastss ymm14, dword ptr [{b_ptr} + 16]",
            "vbroadcastss ymm15, dword ptr [{b_ptr} + 20]",
            "vfmadd231ps ymm3, ymm6, ymm13",   // c3 += a0 * b3
            "vfmadd231ps ymm4, ymm6, ymm14",   // c4 += a0 * b4
            "vfmadd231ps ymm5, ymm6, ymm15",   // c5 += a0 * b5
            "vmovups ymm6, [{a_ptr}]",         // Reload A[4] -> ymm6 (reuse register)

            // --- K iteration 1: Use ymm7 (A[1]), load next A[5] into ymm7 ---
            "vbroadcastss ymm10, dword ptr [{b_ptr} + 24]",
            "vbroadcastss ymm11, dword ptr [{b_ptr} + 28]",
            "vbroadcastss ymm12, dword ptr [{b_ptr} + 32]",
            "vfmadd231ps ymm0, ymm7, ymm10",
            "vfmadd231ps ymm1, ymm7, ymm11",
            "vfmadd231ps ymm2, ymm7, ymm12",
            "vbroadcastss ymm13, dword ptr [{b_ptr} + 36]",
            "vbroadcastss ymm14, dword ptr [{b_ptr} + 40]",
            "vbroadcastss ymm15, dword ptr [{b_ptr} + 44]",
            "vfmadd231ps ymm3, ymm7, ymm13",
            "vfmadd231ps ymm4, ymm7, ymm14",
            "vfmadd231ps ymm5, ymm7, ymm15",
            "vmovups ymm7, [{a_ptr} + 32]",    // Reload A[5] -> ymm7

            // --- K iteration 2: Use ymm8 (A[2]), load next A[6] into ymm8 ---
            "vbroadcastss ymm10, dword ptr [{b_ptr} + 48]",
            "vbroadcastss ymm11, dword ptr [{b_ptr} + 52]",
            "vbroadcastss ymm12, dword ptr [{b_ptr} + 56]",
            "vfmadd231ps ymm0, ymm8, ymm10",
            "vfmadd231ps ymm1, ymm8, ymm11",
            "vfmadd231ps ymm2, ymm8, ymm12",
            "vbroadcastss ymm13, dword ptr [{b_ptr} + 60]",
            "vbroadcastss ymm14, dword ptr [{b_ptr} + 64]",
            "vbroadcastss ymm15, dword ptr [{b_ptr} + 68]",
            "vfmadd231ps ymm3, ymm8, ymm13",
            "vfmadd231ps ymm4, ymm8, ymm14",
            "vfmadd231ps ymm5, ymm8, ymm15",
            "vmovups ymm8, [{a_ptr} + 64]",    // Reload A[6] -> ymm8

            // --- K iteration 3: Use ymm9 (A[3]), load next A[7] into ymm9 ---
            "vbroadcastss ymm10, dword ptr [{b_ptr} + 72]",
            "vbroadcastss ymm11, dword ptr [{b_ptr} + 76]",
            "vbroadcastss ymm12, dword ptr [{b_ptr} + 80]",
            "vfmadd231ps ymm0, ymm9, ymm10",
            "vfmadd231ps ymm1, ymm9, ymm11",
            "vfmadd231ps ymm2, ymm9, ymm12",
            "vbroadcastss ymm13, dword ptr [{b_ptr} + 84]",
            "vbroadcastss ymm14, dword ptr [{b_ptr} + 88]",
            "vbroadcastss ymm15, dword ptr [{b_ptr} + 92]",
            "vfmadd231ps ymm3, ymm9, ymm13",
            "vfmadd231ps ymm4, ymm9, ymm14",
            "vfmadd231ps ymm5, ymm9, ymm15",
            "vmovups ymm9, [{a_ptr} + 96]",    // Reload A[7] -> ymm9

            // Advance pointers for next 4 K iterations
            "add {a_ptr}, 128",                // 4 * MR * sizeof(f32) = 4 * 8 * 4 = 128
            "add {b_ptr}, 96",                 // 4 * NR * sizeof(f32) = 4 * 6 * 4 = 96

            // Loop control
            "dec {k_cnt}",
            "jnz 3b",

            "2:",
            // ================================================================
            // Epilogue: any k % 4 remainder is accumulated with intrinsics after this
            // asm block, on top of the C values stored below. ymm6-ymm9 hold stale
            // prefetched A columns here and are simply discarded.
            // For k divisible by 4, we're done.
            // ================================================================

            // ================================================================
            // Store C back from ymm0-ymm5
            // ================================================================
            "vmovups [{c_ptr}], ymm0",
            "vmovups [{c_ptr} + {ldc}], ymm1",
            "vmovups [{c_ptr} + {ldc}*2], ymm2",
            "lea {tmp}, [{c_ptr} + {ldc}*2]",
            "vmovups [{tmp} + {ldc}], ymm3",
            "vmovups [{tmp} + {ldc}*2], ymm4",
            "lea {tmp}, [{tmp} + {ldc}*2]",
            "vmovups [{tmp} + {ldc}], ymm5",

            // Input/output operands
            a_ptr = inout(reg) a => _,
            b_ptr = inout(reg) b => _,
            c_ptr = in(reg) c,
            k = in(reg) k,
            ldc = in(reg) ldc_bytes,
            k_cnt = out(reg) _,
            tmp = out(reg) _,

            // Clobbers: all ymm registers used
            out("ymm0") _,
            out("ymm1") _,
            out("ymm2") _,
            out("ymm3") _,
            out("ymm4") _,
            out("ymm5") _,
            out("ymm6") _,
            out("ymm7") _,
            out("ymm8") _,
            out("ymm9") _,
            out("ymm10") _,
            out("ymm11") _,
            out("ymm12") _,
            out("ymm13") _,
            out("ymm14") _,
            out("ymm15") _,

            options(nostack),
        );

        // Handle k % 4 remainder if any
        let k_rem = k % 4;
        if k_rem > 0 {
            // Pointer arithmetic: we've advanced past k/4*4 iterations
            let k_done = (k / 4) * 4;
            let a_rem = a.add(k_done * MR);
            let b_rem = b.add(k_done * NR);

            // Use intrinsics for remainder (1-3 iterations)
            use std::arch::x86_64::*;

            let mut c0 = _mm256_loadu_ps(c);
            let mut c1 = _mm256_loadu_ps(c.add(ldc));
            let mut c2 = _mm256_loadu_ps(c.add(2 * ldc));
            let mut c3 = _mm256_loadu_ps(c.add(3 * ldc));
            let mut c4 = _mm256_loadu_ps(c.add(4 * ldc));
            let mut c5 = _mm256_loadu_ps(c.add(5 * ldc));

            for p in 0..k_rem {
                let a_col = _mm256_loadu_ps(a_rem.add(p * MR));
                let b0 = _mm256_broadcast_ss(&*b_rem.add(p * NR));
                let b1 = _mm256_broadcast_ss(&*b_rem.add(p * NR + 1));
                let b2 = _mm256_broadcast_ss(&*b_rem.add(p * NR + 2));
                let b3 = _mm256_broadcast_ss(&*b_rem.add(p * NR + 3));
                let b4 = _mm256_broadcast_ss(&*b_rem.add(p * NR + 4));
                let b5 = _mm256_broadcast_ss(&*b_rem.add(p * NR + 5));

                c0 = _mm256_fmadd_ps(a_col, b0, c0);
                c1 = _mm256_fmadd_ps(a_col, b1, c1);
                c2 = _mm256_fmadd_ps(a_col, b2, c2);
                c3 = _mm256_fmadd_ps(a_col, b3, c3);
                c4 = _mm256_fmadd_ps(a_col, b4, c4);
                c5 = _mm256_fmadd_ps(a_col, b5, c5);
            }

            _mm256_storeu_ps(c, c0);
            _mm256_storeu_ps(c.add(ldc), c1);
            _mm256_storeu_ps(c.add(2 * ldc), c2);
            _mm256_storeu_ps(c.add(3 * ldc), c3);
            _mm256_storeu_ps(c.add(4 * ldc), c4);
            _mm256_storeu_ps(c.add(5 * ldc), c5);
        }
    }
}
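
// Illustrative dispatch sketch (not in the original file): one way a caller might pick
// among the 8x6 variants at runtime. The function name and the `prefer_asm` flag are
// hypothetical; the pointer/shape contract is the same as for the kernels themselves.
#[cfg(target_arch = "x86_64")]
// SAFETY: Caller ensures pointers are valid for an 8x6 tile with leading dimension ldc.
pub unsafe fn dispatch_8x6_sketch(
    k: usize,
    a: *const f32,
    b: *const f32,
    c: *mut f32,
    ldc: usize,
    prefer_asm: bool,
) {
    unsafe {
        if std::is_x86_feature_detected!("avx2") && std::is_x86_feature_detected!("fma") {
            if prefer_asm {
                // Fixed register allocation and explicit scheduling; falls back to
                // intrinsics internally for k < 4.
                microkernel_8x6_true_asm(k, a, b, c, ldc);
            } else {
                // Intrinsics with 4-way K unrolling; also handles k < 4 internally.
                microkernel_8x6_avx2_asm(k, a, b, c, ldc);
            }
        } else {
            // A real integration would provide a scalar or portable-SIMD fallback here.
            unimplemented!("non-AVX2 path not shown in this sketch");
        }
    }
}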

/// 8x8 AVX2+FMA microkernel — 4-way K-unrolled broadcast accumulation.
/// Keeps 8 columns of C in 8 YMM registers and interleaves loads with FMAs for
/// software pipelining (targeting 10-12 instructions between a load and its use).
/// A: 8×K packed column-major. B: K×8 packed row-major.
/// C: 8×8 column-major with stride ldc.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2", enable = "fma")]
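// SAFETY: Caller ensures AVX2+FMA are available, pointers are valid for an 8x8 tile
// with leading dimension ldc, and A/B are packed as documented above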
pub unsafe fn microkernel_8x8_avx2_fma(
    k: usize,
    a: *const f32,
    b: *const f32,
    c: *mut f32,
    ldc: usize,
) {
    unsafe {
        use std::arch::x86_64::*;

        // Load C (8 columns of 8 elements)
        let mut c0 = _mm256_loadu_ps(c);
        let mut c1 = _mm256_loadu_ps(c.add(ldc));
        let mut c2 = _mm256_loadu_ps(c.add(2 * ldc));
        let mut c3 = _mm256_loadu_ps(c.add(3 * ldc));
        let mut c4 = _mm256_loadu_ps(c.add(4 * ldc));
        let mut c5 = _mm256_loadu_ps(c.add(5 * ldc));
        let mut c6 = _mm256_loadu_ps(c.add(6 * ldc));
        let mut c7 = _mm256_loadu_ps(c.add(7 * ldc));

        // 4-way K-unrolled main loop for software pipelining.
        // Interleaves A loads with B broadcasts and FMAs to hide
        // 5-cycle FMA latency across 2 FMA ports (Haswell+).
        let k4 = k / 4;
        let k_rem = k % 4;

        for p4 in 0..k4 {
            let base = p4 * 4;

            // K+0: load A, broadcast B, accumulate
            let a0 = _mm256_loadu_ps(a.add(base * 8));
            let bp0 = b.add(base * 8);
            c0 = _mm256_fmadd_ps(a0, _mm256_broadcast_ss(&*bp0), c0);
            c1 = _mm256_fmadd_ps(a0, _mm256_broadcast_ss(&*bp0.add(1)), c1);
            c2 = _mm256_fmadd_ps(a0, _mm256_broadcast_ss(&*bp0.add(2)), c2);
            c3 = _mm256_fmadd_ps(a0, _mm256_broadcast_ss(&*bp0.add(3)), c3);
            c4 = _mm256_fmadd_ps(a0, _mm256_broadcast_ss(&*bp0.add(4)), c4);
            c5 = _mm256_fmadd_ps(a0, _mm256_broadcast_ss(&*bp0.add(5)), c5);
            c6 = _mm256_fmadd_ps(a0, _mm256_broadcast_ss(&*bp0.add(6)), c6);
            c7 = _mm256_fmadd_ps(a0, _mm256_broadcast_ss(&*bp0.add(7)), c7);

            // K+1
            let a1 = _mm256_loadu_ps(a.add((base + 1) * 8));
            let bp1 = b.add((base + 1) * 8);
            c0 = _mm256_fmadd_ps(a1, _mm256_broadcast_ss(&*bp1), c0);
            c1 = _mm256_fmadd_ps(a1, _mm256_broadcast_ss(&*bp1.add(1)), c1);
            c2 = _mm256_fmadd_ps(a1, _mm256_broadcast_ss(&*bp1.add(2)), c2);
            c3 = _mm256_fmadd_ps(a1, _mm256_broadcast_ss(&*bp1.add(3)), c3);
            c4 = _mm256_fmadd_ps(a1, _mm256_broadcast_ss(&*bp1.add(4)), c4);
            c5 = _mm256_fmadd_ps(a1, _mm256_broadcast_ss(&*bp1.add(5)), c5);
            c6 = _mm256_fmadd_ps(a1, _mm256_broadcast_ss(&*bp1.add(6)), c6);
            c7 = _mm256_fmadd_ps(a1, _mm256_broadcast_ss(&*bp1.add(7)), c7);

            // K+2
            let a2 = _mm256_loadu_ps(a.add((base + 2) * 8));
            let bp2 = b.add((base + 2) * 8);
            c0 = _mm256_fmadd_ps(a2, _mm256_broadcast_ss(&*bp2), c0);
            c1 = _mm256_fmadd_ps(a2, _mm256_broadcast_ss(&*bp2.add(1)), c1);
            c2 = _mm256_fmadd_ps(a2, _mm256_broadcast_ss(&*bp2.add(2)), c2);
            c3 = _mm256_fmadd_ps(a2, _mm256_broadcast_ss(&*bp2.add(3)), c3);
            c4 = _mm256_fmadd_ps(a2, _mm256_broadcast_ss(&*bp2.add(4)), c4);
            c5 = _mm256_fmadd_ps(a2, _mm256_broadcast_ss(&*bp2.add(5)), c5);
            c6 = _mm256_fmadd_ps(a2, _mm256_broadcast_ss(&*bp2.add(6)), c6);
            c7 = _mm256_fmadd_ps(a2, _mm256_broadcast_ss(&*bp2.add(7)), c7);

            // K+3
            let a3 = _mm256_loadu_ps(a.add((base + 3) * 8));
            let bp3 = b.add((base + 3) * 8);
            c0 = _mm256_fmadd_ps(a3, _mm256_broadcast_ss(&*bp3), c0);
            c1 = _mm256_fmadd_ps(a3, _mm256_broadcast_ss(&*bp3.add(1)), c1);
            c2 = _mm256_fmadd_ps(a3, _mm256_broadcast_ss(&*bp3.add(2)), c2);
            c3 = _mm256_fmadd_ps(a3, _mm256_broadcast_ss(&*bp3.add(3)), c3);
            c4 = _mm256_fmadd_ps(a3, _mm256_broadcast_ss(&*bp3.add(4)), c4);
            c5 = _mm256_fmadd_ps(a3, _mm256_broadcast_ss(&*bp3.add(5)), c5);
            c6 = _mm256_fmadd_ps(a3, _mm256_broadcast_ss(&*bp3.add(6)), c6);
            c7 = _mm256_fmadd_ps(a3, _mm256_broadcast_ss(&*bp3.add(7)), c7);
        }

        // Remainder (k % 4)
        let base_rem = k4 * 4;
        for p in 0..k_rem {
            let pp = base_rem + p;
            let a_col = _mm256_loadu_ps(a.add(pp * 8));
            let bp = b.add(pp * 8);
            c0 = _mm256_fmadd_ps(a_col, _mm256_broadcast_ss(&*bp), c0);
            c1 = _mm256_fmadd_ps(a_col, _mm256_broadcast_ss(&*bp.add(1)), c1);
            c2 = _mm256_fmadd_ps(a_col, _mm256_broadcast_ss(&*bp.add(2)), c2);
            c3 = _mm256_fmadd_ps(a_col, _mm256_broadcast_ss(&*bp.add(3)), c3);
            c4 = _mm256_fmadd_ps(a_col, _mm256_broadcast_ss(&*bp.add(4)), c4);
            c5 = _mm256_fmadd_ps(a_col, _mm256_broadcast_ss(&*bp.add(5)), c5);
            c6 = _mm256_fmadd_ps(a_col, _mm256_broadcast_ss(&*bp.add(6)), c6);
            c7 = _mm256_fmadd_ps(a_col, _mm256_broadcast_ss(&*bp.add(7)), c7);
        }

        // Store C
        _mm256_storeu_ps(c, c0);
        _mm256_storeu_ps(c.add(ldc), c1);
        _mm256_storeu_ps(c.add(2 * ldc), c2);
        _mm256_storeu_ps(c.add(3 * ldc), c3);
        _mm256_storeu_ps(c.add(4 * ldc), c4);
        _mm256_storeu_ps(c.add(5 * ldc), c5);
        _mm256_storeu_ps(c.add(6 * ldc), c6);
        _mm256_storeu_ps(c.add(7 * ldc), c7);
    }
}
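
// Minimal packing sketch (not in the original file): builds the panel layouts the 8x6
// kernels above assume, from row-major source matrices. The function names and the
// row-major source assumption are hypothetical; the crate's real packing code may differ.
// A panel: MR x K column-major (a[p * MR + i]); B panel: K x NR row-major (b[p * NR + j]).
// Note: the inline-ASM kernel reads slightly past the last A column it consumes, so a
// real packer would leave slack after the A panel (see its SAFETY note).
#[cfg(target_arch = "x86_64")]
pub fn pack_a_panel_sketch(src: &[f32], lda: usize, row0: usize, k: usize) -> Vec<f32> {
    let mut out = Vec::with_capacity(MR * k);
    for p in 0..k {
        for i in 0..MR {
            out.push(src[(row0 + i) * lda + p]); // column p of the 8-row strip
        }
    }
    out
}

#[cfg(target_arch = "x86_64")]
pub fn pack_b_panel_sketch(src: &[f32], ldb: usize, col0: usize, k: usize) -> Vec<f32> {
    let mut out = Vec::with_capacity(k * NR);
    for p in 0..k {
        for j in 0..NR {
            out.push(src[p * ldb + col0 + j]); // row p of the 6-column strip
        }
    }
    out
}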