#[allow(clippy::identity_op)] // keep the symmetric `N * ldc` row-offset pattern for rows 0..4
#[allow(clippy::erasing_op)] // `0 * ldc` is deliberate, same reason as above
#[allow(unsafe_op_in_unsafe_fn)] // the entire body is raw-pointer SIMD; the contract lives in `# Safety`
#[target_feature(enable = "avx2", enable = "fma")]
/// 4×4 `f64` GEMM micro-kernel: accumulates `k` rank-1 updates into a
/// 4×4 tile of `C`, i.e. `C[i][j] += Σ_p a_pack[p*4+i] * b_pack[p*4+j]`.
///
/// Data layout (as established by the loads/stores below):
/// * `a_pack[p*4 + i]` — scalar broadcast into row `i`'s accumulator at step `p`
///   (presumably a packed column panel of A — confirm against the packing routine)
/// * `b_pack[p*4 .. p*4+4]` — one contiguous 4-wide vector per step `p`
/// * `c[i*ldc + j]` — element `(i, j)` of the C tile; `ldc` is in **elements**,
///   not bytes
///
/// Only unaligned loads/stores (`loadu`/`storeu`) are used, so no alignment is
/// required of any pointer.
///
/// # Safety
/// * The executing CPU must support AVX2 and FMA — check with
///   `is_x86_feature_detected!("avx2")` / `("fma")` before calling. The
///   `#[target_feature]` attribute lets the intrinsics inline into this
///   function; it does **not** verify hardware support.
/// * `a_pack` and `b_pack` must be valid for `4 * k` `f64` reads each.
/// * `c` must be valid for reads and writes of 4 rows of 4 `f64`, with rows
///   separated by `ldc` elements (rows may overlap only if `ldc < 4`, which
///   the caller must not allow).
pub unsafe fn kernel_4x4_avx2(
    a_pack: *const f64,
    b_pack: *const f64,
    c: *mut f64,
    k: usize,
    ldc: usize,
) {
    use std::arch::x86_64::*;
    // Load the 4×4 C tile into four row accumulators (read-modify-write: the
    // kernel accumulates on top of whatever C already holds).
    let mut c0 = _mm256_loadu_pd(c.add(0 * ldc));
    let mut c1 = _mm256_loadu_pd(c.add(1 * ldc));
    let mut c2 = _mm256_loadu_pd(c.add(2 * ldc));
    let mut c3 = _mm256_loadu_pd(c.add(3 * ldc));
    for p in 0..k {
        // One row of packed B, and four scalars of packed A broadcast so that
        // each FMA updates one full row of the C tile.
        let b_vec = _mm256_loadu_pd(b_pack.add(p * 4));
        let a0 = _mm256_broadcast_sd(&*a_pack.add(p * 4 + 0));
        let a1 = _mm256_broadcast_sd(&*a_pack.add(p * 4 + 1));
        let a2 = _mm256_broadcast_sd(&*a_pack.add(p * 4 + 2));
        let a3 = _mm256_broadcast_sd(&*a_pack.add(p * 4 + 3));
        c0 = _mm256_fmadd_pd(a0, b_vec, c0);
        c1 = _mm256_fmadd_pd(a1, b_vec, c1);
        c2 = _mm256_fmadd_pd(a2, b_vec, c2);
        c3 = _mm256_fmadd_pd(a3, b_vec, c3);
    }
    // Write the accumulated tile back.
    _mm256_storeu_pd(c.add(0 * ldc), c0);
    _mm256_storeu_pd(c.add(1 * ldc), c1);
    _mm256_storeu_pd(c.add(2 * ldc), c2);
    _mm256_storeu_pd(c.add(3 * ldc), c3);
}