use super::{f16_to_f32, SUPER_BLOCK_BYTES, SUPER_BLOCK_SIZE};
/// Decodes the signed 6-bit quantized value stored at position `idx`.
///
/// The value is assembled from two packed arrays:
/// - `ql` holds the low 4 bits, two values per byte (even `idx` in the low
///   nibble, odd `idx` in the high nibble);
/// - `qh` holds the high 2 bits, four values per byte, lowest bit pair first.
///
/// The combined unsigned value (`0..=63`) is shifted by 32, so the result is
/// in `-32..=31`.
///
/// # Panics
///
/// Panics if `idx / 2` is out of bounds for `ql` or `idx / 4` is out of
/// bounds for `qh`.
#[inline(always)]
fn extract_q6k_value(ql: &[u8], qh: &[u8], idx: usize) -> i8 {
    // Select the nibble: shift by 0 for even indices, by 4 for odd ones.
    let nibble_shift = (idx & 1) * 4;
    let low_bits = (ql[idx >> 1] >> nibble_shift) & 0x0F;

    // Select the 2-bit pair belonging to this index within its `qh` byte.
    let pair_shift = (idx & 3) * 2;
    let high_bits = (qh[idx >> 2] >> pair_shift) & 0x03;

    // At most 63, so the cast to i8 is lossless and the subtraction cannot overflow.
    let unsigned6 = low_bits | (high_bits << 4);
    (unsigned6 as i8) - 32
}
/// Dequantizes one super-block and accumulates its contribution, scaled by
/// `x_j`, into `output`.
///
/// Byte layout read from `sb_data`:
/// - `0..128`: packed low 4 bits (`ql`);
/// - `128..192`: packed high 2 bits (`qh`);
/// - `192..208`: sixteen per-group scales, each reinterpreted as `i8`;
/// - `208..210`: little-endian half-precision block scale `d`.
///
/// The block is processed as 16 groups of 16 values. Value `idx` of the block
/// is added to `output[output_offset + idx]` as `x_j * d * scale * q`.
/// Positions at or beyond `ne0` are skipped.
///
/// # Panics
///
/// Panics if `sb_data` is shorter than 210 bytes, or if a destination index
/// below `ne0` is out of bounds for `output`.
#[inline]
fn accumulate_q6k_superblock_colmajor(
    sb_data: &[u8],
    x_j: f32,
    output: &mut [f32],
    output_offset: usize,
    ne0: usize,
) {
    let ql = sb_data.get(0..128).expect("Q6_K: need ≥128 bytes for ql");
    let qh = sb_data.get(128..192).expect("Q6_K: need ≥192 bytes for qh");
    let scales = sb_data.get(192..208).expect("Q6_K: need ≥208 bytes for scales");
    let d = f16_to_f32(u16::from_le_bytes([sb_data[208], sb_data[209]]));

    for (group, &raw_scale) in scales.iter().enumerate() {
        // Scale bytes are stored as two's-complement signed integers.
        let scale = f32::from(raw_scale as i8);
        // Same left-to-right product as `x_j * d * scale * q`, with the
        // per-group part computed once.
        let group_factor = x_j * d * scale;

        let first = group * 16;
        for idx in first..first + 16 {
            let dst = output_offset + idx;
            // The final super-block of a column may extend past the output length.
            if dst < ne0 {
                let q6 = extract_q6k_value(ql, qh, idx);
                output[dst] += group_factor * f32::from(q6);
            }
        }
    }
}
/// Multiplies a column-major Q6_K quantized matrix by an `f32` vector.
///
/// `q6k_data` is read as `ne1` consecutive columns, each occupying
/// `ceil(ne0 / SUPER_BLOCK_SIZE)` super-blocks of `SUPER_BLOCK_BYTES` bytes.
/// Every column is dequantized, scaled by its matching `input` element and
/// summed into the result.
///
/// Columns whose input element compares equal to `0.0` are skipped. If
/// `q6k_data` ends before a column's super-blocks are complete, the rest of
/// that column is ignored rather than treated as an error.
///
/// Returns a vector of length `ne0`.
///
/// # Panics
///
/// Panics if `input.len() != ne1`.
#[deprecated(
    since = "0.15.0",
    note = "LAYOUT-001: Use row-major kernels. APR/GGUF data is transposed at import boundary."
)]
pub fn matmul_q6k_f32_colmajor(
    q6k_data: &[u8],
    input: &[f32],
    ne0: usize,
    ne1: usize,
) -> Vec<f32> {
    assert_eq!(input.len(), ne1, "Input length must match ne1 (input dimension)");

    let superblocks_per_column = (ne0 + SUPER_BLOCK_SIZE - 1) / SUPER_BLOCK_SIZE;
    let column_stride = superblocks_per_column * SUPER_BLOCK_BYTES;
    let mut result = vec![0.0f32; ne0];

    // `input.len() == ne1` was asserted above, so this visits columns 0..ne1.
    for (column, &activation) in input.iter().enumerate() {
        let column_base = column * column_stride;
        // A zero multiplier is treated as contributing nothing, so the column is not decoded.
        if activation == 0.0 {
            continue;
        }

        for block in 0..superblocks_per_column {
            let begin = column_base + block * SUPER_BLOCK_BYTES;
            let end = begin + SUPER_BLOCK_BYTES;
            // Truncated data: stop processing this column.
            if end > q6k_data.len() {
                break;
            }
            accumulate_q6k_superblock_colmajor(
                &q6k_data[begin..end],
                activation,
                &mut result,
                block * SUPER_BLOCK_SIZE,
                ne0,
            );
        }
    }

    result
}
/// Entry point that forwards directly to [`matmul_q6k_f32_colmajor`].
///
/// Arguments, return value and panics are exactly those of the wrapped
/// function; no alternative implementation is selected here.
#[deprecated(
    since = "0.15.0",
    note = "LAYOUT-001: Use row-major kernels. APR/GGUF data is transposed at import boundary."
)]
// The wrapped kernel is deprecated as well; calling it from this wrapper is intentional.
#[allow(deprecated)]
#[inline]
pub fn matmul_q6k_f32_colmajor_dispatch(
    q6k_data: &[u8],
    input: &[f32],
    ne0: usize,
    ne1: usize,
) -> Vec<f32> {
    matmul_q6k_f32_colmajor(q6k_data, input, ne0, ne1)
}