mod colmajor;
mod dequant;
mod gemv;
#[allow(deprecated)]
pub use colmajor::{matmul_q4k_f32_colmajor, matmul_q4k_f32_colmajor_dispatch};
pub use dequant::dequantize_q4k_to_f32;
pub use gemv::{matmul_q4k_f32, matmul_q4k_f32_dispatch, matmul_q4k_f32_scalar};
/// Size of one Q4_K super-block; going by the name this is the number of
/// quantized values it holds — TODO confirm against the kernels that use it.
pub(crate) const SUPER_BLOCK_SIZE: usize = 256;
/// Serialized size of one super-block in bytes.
///
/// NOTE(review): 144 is consistent with the 16-byte header decoded by
/// `parse_q4k_header` (2 + 2 + 12) followed by 256 four-bit values packed
/// into 128 bytes — confirm against the format reference.
pub(crate) const SUPER_BLOCK_BYTES: usize = 144;
/// Size of one sub-block; 256 / 32 = 8 lines up with the eight scale/min pairs
/// that `parse_q4k_header` returns. The leading underscore is presumably there
/// so the constant can stay unused without a dead-code warning — verify.
pub(crate) const _SUB_BLOCK_SIZE: usize = 32;
/// Widens an IEEE 754 half-precision (binary16) bit pattern to `f32`.
///
/// The conversion is exact: every half-precision value, including signed
/// zeros, subnormals, infinities and NaN payloads, maps to the `f32` that
/// represents the same value (NaN payload bits are shifted into the top of
/// the `f32` fraction).
#[inline(always)]
fn f16_to_f32(bits: u16) -> f32 {
    let sign_bit = u32::from(bits & 0x8000) << 16;
    let biased_exp = u32::from((bits >> 10) & 0x1F);
    let frac = u32::from(bits & 0x03FF);

    let magnitude: u32 = match (biased_exp, frac) {
        // Signed zero: only the sign bit survives.
        (0, 0) => 0,
        // Subnormal: renormalise so the implicit leading one sits at bit 10.
        (0, _) => {
            // `frac` is a non-zero 10-bit value held in a u32, so it has between
            // 22 and 31 leading zeros; moving its top bit up to bit 10 therefore
            // takes `leading_zeros - 21` shifts (1..=10).
            let shift = frac.leading_zeros() - 21;
            // Each shift lowers the exponent by one from the subnormal
            // exponent (-14), re-biased for f32.
            let exp32 = 127 - 15 + 1 - shift;
            (exp32 << 23) | (((frac << shift) & 0x03FF) << 13)
        }
        // Infinity or NaN: all-ones exponent, fraction carried over.
        (0x1F, _) => (0xFF << 23) | (frac << 13),
        // Normal number: swap the f16 bias (15) for the f32 bias (127).
        _ => ((biased_exp + 127 - 15) << 23) | (frac << 13),
    };

    f32::from_bits(sign_bit | magnitude)
}
/// Decodes the 16-byte header at the start of a Q4_K block.
///
/// Layout read from `block`:
/// * bytes 0..2 — little-endian f16, returned as the first `f32` (`d`)
/// * bytes 2..4 — little-endian f16, returned as the second `f32` (`dmin`)
/// * bytes 4..16 — twelve bytes packing eight 6-bit scales and eight 6-bit mins
///
/// Returns `(d, dmin, scales, mins)`; every entry of `scales` and `mins` is in
/// `0..=63`.
///
/// # Panics
///
/// Panics if `block` is shorter than 16 bytes (through the debug assertion in
/// debug builds, otherwise through indexing or the `expect` below).
#[inline(always)]
pub(crate) fn parse_q4k_header(block: &[u8]) -> (f32, f32, [u8; 8], [u8; 8]) {
    debug_assert!(block.len() >= 16);

    let d = f16_to_f32(u16::from_le_bytes([block[0], block[1]]));
    let dmin = f16_to_f32(u16::from_le_bytes([block[2], block[3]]));

    let packed = block.get(4..16).expect("Q4_K: need ≥16 bytes for header");
    // Three groups of four bytes:
    //   low  — 6-bit scales 0..4 in the low bits, top 2 bits of scales 4..8 above
    //   mid  — 6-bit mins 0..4 in the low bits, top 2 bits of mins 4..8 above
    //   high — low nibble: bottom 4 bits of scales 4..8; high nibble: same for mins
    let (low, rest) = packed.split_at(4);
    let (mid, high) = rest.split_at(4);

    let mut scales = [0u8; 8];
    let mut mins = [0u8; 8];
    for (idx, ((&lo, &mi), &hi)) in low.iter().zip(mid).zip(high).enumerate() {
        scales[idx] = lo & 0x3F;
        mins[idx] = mi & 0x3F;
        scales[idx + 4] = (hi & 0x0F) | ((lo >> 6) << 4);
        mins[idx + 4] = (hi >> 4) | ((mi >> 6) << 4);
    }

    (d, dmin, scales, mins)
}
#[cfg(test)]
mod tests_core;
#[cfg(test)]
mod tests_coverage;
#[cfg(test)]
mod tests_golden;