use crate::error::{ModelError, ModelResult};
/// Widens a raw 16-bit half-precision bit pattern to `f32`.
///
/// The bits are interpreted by [`half::f16::from_bits`]; the widening is
/// whatever `half::f16::to_f32` produces for that value.
#[inline]
fn f16_bits_to_f32(bits: u16) -> f32 {
    let half_value = half::f16::from_bits(bits);
    half_value.to_f32()
}
/// Reads a little-endian half-precision float from `data` at `offset` and
/// widens it to `f32`.
///
/// # Errors
///
/// Returns a load error when the two bytes starting at `offset` do not lie
/// entirely inside `data`, including the case where `offset + 2` would not fit
/// in a `usize`.
#[inline]
fn read_f16_le(data: &[u8], offset: usize) -> ModelResult<f32> {
    // `checked_add` keeps an `offset` close to `usize::MAX` from wrapping around
    // and slipping past the bounds check; `get` then rejects anything that runs
    // off the end of the buffer.
    let bytes = offset
        .checked_add(2)
        .and_then(|end| data.get(offset..end))
        .ok_or_else(|| {
            ModelError::simple_load_error(format!(
                "read_f16_le: offset {} + 2 exceeds buffer length {}",
                offset,
                data.len()
            ))
        })?;
    let bits = u16::from_le_bytes([bytes[0], bytes[1]]);
    Ok(f16_bits_to_f32(bits))
}
/// Reads a little-endian `f32` from `data` at `offset`.
///
/// # Errors
///
/// Returns a load error when the four bytes starting at `offset` do not lie
/// entirely inside `data`, including the case where `offset + 4` would not fit
/// in a `usize`.
#[inline]
fn read_f32_le(data: &[u8], offset: usize) -> ModelResult<f32> {
    // `checked_add` keeps an `offset` close to `usize::MAX` from wrapping around
    // and slipping past the bounds check; `get` then rejects anything that runs
    // off the end of the buffer.
    let bytes = offset
        .checked_add(4)
        .and_then(|end| data.get(offset..end))
        .ok_or_else(|| {
            ModelError::simple_load_error(format!(
                "read_f32_le: offset {} + 4 exceeds buffer length {}",
                offset,
                data.len()
            ))
        })?;
    Ok(f32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]))
}
/// Dequantizes `n` values stored as consecutive 84-byte Q2_K blocks of 256
/// elements each.
///
/// Bytes read from every block:
/// * `0..2`   – `d`, half-precision, little-endian
/// * `2..4`   – `dmin`, half-precision, little-endian
/// * `4..12`  – one byte per 32-element group: low nibble is the group scale,
///   high nibble is the group minimum
/// * `16..80` – 2-bit codes, four per byte, lowest bit pair first
///
/// Bytes `12..16` and `80..84` of a block are not read. Each output value is
/// `d * scale * code - dmin * min`.
///
/// # Errors
///
/// Returns a load error when `n` is zero or not a multiple of 256, or when
/// `data` holds fewer than `n / 256 * 84` bytes.
pub(crate) fn dequant_q2_k(data: &[u8], n: usize) -> ModelResult<Vec<f32>> {
    const BLOCK_ELEMS: usize = 256;
    const BLOCK_BYTES: usize = 84;
    const NUM_GROUPS: usize = 8;
    const GROUP_SIZE: usize = 32;
    // Four 2-bit codes are packed into every byte.
    const GROUP_BYTES: usize = GROUP_SIZE / 4;
    const SCALES_OFFSET: usize = 4;
    const CODES_OFFSET: usize = 16;

    if n == 0 || !n.is_multiple_of(BLOCK_ELEMS) {
        return Err(ModelError::simple_load_error(format!(
            "Q2_K: n_elements {} must be a non-zero multiple of {}",
            n, BLOCK_ELEMS
        )));
    }
    let block_count = n / BLOCK_ELEMS;
    let needed = block_count * BLOCK_BYTES;
    if data.len() < needed {
        return Err(ModelError::simple_load_error(format!(
            "Q2_K: buffer too small: need {} bytes, got {}",
            needed,
            data.len()
        )));
    }

    let mut values = Vec::with_capacity(n);
    for block in data[..needed].chunks_exact(BLOCK_BYTES) {
        let d = read_f16_le(block, 0)?;
        let dmin = read_f16_le(block, 2)?;
        let packed_scales = &block[SCALES_OFFSET..SCALES_OFFSET + NUM_GROUPS];
        let codes = &block[CODES_OFFSET..CODES_OFFSET + NUM_GROUPS * GROUP_BYTES];

        // Pair every group's scale/min byte with that group's eight code bytes.
        for (&packed, group) in packed_scales.iter().zip(codes.chunks_exact(GROUP_BYTES)) {
            let scale = d * f32::from(packed & 0x0F);
            let min = dmin * f32::from(packed >> 4);
            for &byte in group {
                values.extend((0..4u32).map(|lane| {
                    let code = (byte >> (2 * lane)) & 0x03;
                    scale * f32::from(code) - min
                }));
            }
        }
    }
    Ok(values)
}
/// Dequantizes `n` values stored as consecutive 110-byte Q3_K blocks of 256
/// elements each.
///
/// Bytes read from every block:
/// * `0..32`    – high-bit mask, one bit per element, lowest bit first
/// * `32..96`   – low two bits of every code, four per byte, lowest pair first
/// * `96..108`  – sixteen packed 6-bit sub-block scales
/// * `108..110` – `d`, half-precision, little-endian
///
/// Each 3-bit code is `low2 | (high << 2)`, and every output value is
/// `d * (scale - 32) * (code - 4)` using the scale of the element's
/// 16-element sub-block.
///
/// # Errors
///
/// Returns a load error when `n` is zero or not a multiple of 256, when
/// `data` holds fewer than `n / 256 * 110` bytes, or when scale decoding fails.
pub(crate) fn dequant_q3_k(data: &[u8], n: usize) -> ModelResult<Vec<f32>> {
    const BLOCK_ELEMS: usize = 256;
    const BLOCK_BYTES: usize = 110;
    const SUB_BLOCK_SIZE: usize = 16;
    const NUM_SUB_BLOCKS: usize = BLOCK_ELEMS / SUB_BLOCK_SIZE;
    const HMASK_BYTES: usize = 32;
    const QS_BYTES: usize = 64;
    const SCALE_BYTES: usize = 12;

    if n == 0 || !n.is_multiple_of(BLOCK_ELEMS) {
        return Err(ModelError::simple_load_error(format!(
            "Q3_K: n_elements {} must be a non-zero multiple of {}",
            n, BLOCK_ELEMS
        )));
    }
    let block_count = n / BLOCK_ELEMS;
    let needed = block_count * BLOCK_BYTES;
    if data.len() < needed {
        return Err(ModelError::simple_load_error(format!(
            "Q3_K: buffer too small: need {} bytes, got {}",
            needed,
            data.len()
        )));
    }

    let mut values = Vec::with_capacity(n);
    for block in data[..needed].chunks_exact(BLOCK_BYTES) {
        // Peel the block apart front to back: mask, low bits, scales, then `d`.
        let (high_bits, rest) = block.split_at(HMASK_BYTES);
        let (low_bits, rest) = rest.split_at(QS_BYTES);
        let (packed_scales, _) = rest.split_at(SCALE_BYTES);
        let d = read_f16_le(block, HMASK_BYTES + QS_BYTES + SCALE_BYTES)?;
        let raw_scales = decode_6bit_scales_16(packed_scales)?;

        for (sub, &raw) in raw_scales.iter().take(NUM_SUB_BLOCKS).enumerate() {
            // Stored scales carry a +32 bias.
            let scale = d * (i32::from(raw) - 32) as f32;
            let first = sub * SUB_BLOCK_SIZE;
            values.extend((first..first + SUB_BLOCK_SIZE).map(|elem| {
                let low2 = (low_bits[elem / 4] >> ((elem % 4) * 2)) & 0x03;
                let high1 = (high_bits[elem / 8] >> (elem % 8)) & 0x01;
                let code = i32::from(low2 | (high1 << 2));
                scale * (code - 4) as f32
            }));
        }
    }
    Ok(values)
}
/// Dequantizes `n` values stored as consecutive 144-byte Q4_K blocks of 256
/// elements each.
///
/// Bytes read from every block:
/// * `0..2`    – `d`, half-precision, little-endian
/// * `2..4`    – `dmin`, half-precision, little-endian
/// * `4..16`   – eight 6-bit scales followed by eight 6-bit minimums, packed
/// * `16..144` – 4-bit codes, two per byte, low nibble first
///
/// Every output value is `d * scale * code - dmin * min`, with `scale` and
/// `min` taken from the element's 32-element sub-block.
///
/// # Errors
///
/// Returns a load error when `n` is zero or not a multiple of 256, when
/// `data` holds fewer than `n / 256 * 144` bytes, or when scale decoding fails.
pub(crate) fn dequant_q4_k(data: &[u8], n: usize) -> ModelResult<Vec<f32>> {
    const BLOCK_ELEMS: usize = 256;
    const BLOCK_BYTES: usize = 144;
    const NUM_SUB_BLOCKS: usize = 8;
    const SUB_BLOCK_SIZE: usize = BLOCK_ELEMS / NUM_SUB_BLOCKS;
    // Two 4-bit codes are packed into every byte.
    const SUB_BLOCK_BYTES: usize = SUB_BLOCK_SIZE / 2;

    if n == 0 || !n.is_multiple_of(BLOCK_ELEMS) {
        return Err(ModelError::simple_load_error(format!(
            "Q4_K: n_elements {} must be a non-zero multiple of {}",
            n, BLOCK_ELEMS
        )));
    }
    let block_count = n / BLOCK_ELEMS;
    let needed = block_count * BLOCK_BYTES;
    if data.len() < needed {
        return Err(ModelError::simple_load_error(format!(
            "Q4_K: buffer too small: need {} bytes, got {}",
            needed,
            data.len()
        )));
    }

    let mut values = Vec::with_capacity(n);
    for block in data[..needed].chunks_exact(BLOCK_BYTES) {
        let d = read_f16_le(block, 0)?;
        let dmin = read_f16_le(block, 2)?;
        let (raw_scales, raw_mins) = decode_6bit_scales_and_mins_8(&block[4..16])?;
        let nibble_groups = block[16..].chunks_exact(SUB_BLOCK_BYTES);

        let params = raw_scales.iter().zip(raw_mins.iter());
        for ((&raw_scale, &raw_min), group) in params.zip(nibble_groups) {
            let scale = d * f32::from(raw_scale);
            let min = dmin * f32::from(raw_min);
            for &byte in group {
                values.push(scale * f32::from(byte & 0x0F) - min);
                values.push(scale * f32::from(byte >> 4) - min);
            }
        }
    }
    Ok(values)
}
/// Dequantizes `n` values stored as consecutive 176-byte Q5_K blocks of 256
/// elements each.
///
/// Bytes read from every block:
/// * `0..2`    – `d`, half-precision, little-endian
/// * `2..4`    – `dmin`, half-precision, little-endian
/// * `4..16`   – eight 6-bit scales followed by eight 6-bit minimums, packed
/// * `16..48`  – fifth bit of every code, one bit per element, lowest bit first
/// * `48..176` – low four bits of every code, two per byte, low nibble first
///
/// Each 5-bit code is `low4 | (high << 4)`, and every output value is
/// `d * scale * code - dmin * min`, with `scale` and `min` taken from the
/// element's 32-element sub-block.
///
/// # Errors
///
/// Returns a load error when `n` is zero or not a multiple of 256, when
/// `data` holds fewer than `n / 256 * 176` bytes, or when scale decoding fails.
pub(crate) fn dequant_q5_k(data: &[u8], n: usize) -> ModelResult<Vec<f32>> {
    const BLOCK_ELEMS: usize = 256;
    const BLOCK_BYTES: usize = 176;
    const NUM_SUB_BLOCKS: usize = 8;
    const SUB_BLOCK_SIZE: usize = BLOCK_ELEMS / NUM_SUB_BLOCKS;
    // Two low nibbles are packed into every byte.
    const SUB_BLOCK_BYTES: usize = SUB_BLOCK_SIZE / 2;

    if n == 0 || !n.is_multiple_of(BLOCK_ELEMS) {
        return Err(ModelError::simple_load_error(format!(
            "Q5_K: n_elements {} must be a non-zero multiple of {}",
            n, BLOCK_ELEMS
        )));
    }
    let block_count = n / BLOCK_ELEMS;
    let needed = block_count * BLOCK_BYTES;
    if data.len() < needed {
        return Err(ModelError::simple_load_error(format!(
            "Q5_K: buffer too small: need {} bytes, got {}",
            needed,
            data.len()
        )));
    }

    let mut values = Vec::with_capacity(n);
    for block in data[..needed].chunks_exact(BLOCK_BYTES) {
        let d = read_f16_le(block, 0)?;
        let dmin = read_f16_le(block, 2)?;
        let (raw_scales, raw_mins) = decode_6bit_scales_and_mins_8(&block[4..16])?;
        let high_bits = &block[16..48];
        let low_nibbles = &block[48..];

        // Fifth bit of the element with in-block index `elem`.
        let fifth_bit = |elem: usize| u32::from((high_bits[elem / 8] >> (elem % 8)) & 0x01);

        for (sub, group) in low_nibbles.chunks_exact(SUB_BLOCK_BYTES).enumerate() {
            let scale = d * f32::from(raw_scales[sub]);
            let min = dmin * f32::from(raw_mins[sub]);
            for (pair, &byte) in group.iter().enumerate() {
                // Each byte carries two neighbouring elements.
                let elem = sub * SUB_BLOCK_SIZE + 2 * pair;
                let code_lo = u32::from(byte & 0x0F) | (fifth_bit(elem) << 4);
                let code_hi = u32::from(byte >> 4) | (fifth_bit(elem + 1) << 4);
                values.push(scale * code_lo as f32 - min);
                values.push(scale * code_hi as f32 - min);
            }
        }
    }
    Ok(values)
}
/// Dequantizes `n` values stored as consecutive 292-byte Q8_K blocks of 256
/// elements each.
///
/// Bytes read from every block:
/// * `0..4`   – `d`, `f32`, little-endian
/// * `4..260` – 256 signed 8-bit codes
///
/// The remaining 32 bytes of each block are skipped. Every output value is
/// `d * code`.
///
/// # Errors
///
/// Returns a load error when `n` is zero or not a multiple of 256, when the
/// byte size implied by `n` does not fit in a `usize`, or when `data` holds
/// fewer than `n / 256 * 292` bytes.
pub(crate) fn dequant_q8_k(data: &[u8], n: usize) -> ModelResult<Vec<f32>> {
    const BLOCK_ELEMS: usize = 256;
    const BLOCK_BYTES: usize = 292;
    const QUANTS_OFFSET: usize = 4;

    if n == 0 || !n.is_multiple_of(BLOCK_ELEMS) {
        return Err(ModelError::simple_load_error(format!(
            "Q8_K: n_elements {} must be a non-zero multiple of {}",
            n, BLOCK_ELEMS
        )));
    }
    let n_blocks = n / BLOCK_ELEMS;
    // A block is larger in bytes (292) than in elements (256), so an oversized
    // `n` can overflow this product. Reject it here rather than letting a
    // wrapped value defeat the length check below.
    let required = n_blocks.checked_mul(BLOCK_BYTES).ok_or_else(|| {
        ModelError::simple_load_error(format!(
            "Q8_K: required byte count for {} elements overflows usize",
            n
        ))
    })?;
    if data.len() < required {
        return Err(ModelError::simple_load_error(format!(
            "Q8_K: buffer too small: need {} bytes, got {}",
            required,
            data.len()
        )));
    }

    let mut out = Vec::with_capacity(n);
    for block in data[..required].chunks_exact(BLOCK_BYTES) {
        let d = read_f32_le(block, 0)?;
        let quants = &block[QUANTS_OFFSET..QUANTS_OFFSET + BLOCK_ELEMS];
        // `as i8` deliberately reinterprets the stored byte as a signed code.
        out.extend(quants.iter().map(|&q| d * f32::from(q as i8)));
    }
    Ok(out)
}
/// Unpacks sixteen 6-bit values from the first twelve bytes of `raw`.
///
/// The bytes are treated as one little-endian bit stream: value `i` occupies
/// stream bits `6 * i .. 6 * i + 6`. Bytes past the twelfth are ignored.
///
/// # Errors
///
/// Returns a load error when `raw` holds fewer than twelve bytes.
fn decode_6bit_scales_16(raw: &[u8]) -> ModelResult<[u8; 16]> {
    const PACKED_BYTES: usize = 12;

    let packed = raw.get(..PACKED_BYTES).ok_or_else(|| {
        ModelError::simple_load_error("decode_6bit_scales_16: need 12 bytes")
    })?;

    // 16 fields x 6 bits is exactly 96 bits, so the whole stream fits in a
    // zero-extended u128 and each field becomes a shift and a mask.
    let mut widened = [0u8; 16];
    widened[..PACKED_BYTES].copy_from_slice(packed);
    let stream = u128::from_le_bytes(widened);

    Ok(std::array::from_fn(|index| {
        ((stream >> (6 * index)) & 0x3F) as u8
    }))
}
/// Unpacks eight 6-bit scales followed by eight 6-bit minimums from the first
/// twelve bytes of `raw`.
///
/// The bytes are treated as one little-endian bit stream of sixteen 6-bit
/// fields: fields `0..8` are returned as the scales, fields `8..16` as the
/// minimums. Bytes past the twelfth are ignored.
///
/// # Errors
///
/// Returns a load error when `raw` holds fewer than twelve bytes.
fn decode_6bit_scales_and_mins_8(raw: &[u8]) -> ModelResult<([u8; 8], [u8; 8])> {
    const PACKED_BYTES: usize = 12;
    const FIELDS_PER_HALF: usize = 8;

    let packed = raw.get(..PACKED_BYTES).ok_or_else(|| {
        ModelError::simple_load_error("decode_6bit_scales_and_mins_8: need 12 bytes")
    })?;

    // 16 fields x 6 bits is exactly 96 bits, so the whole stream fits in a
    // zero-extended u128 and each field becomes a shift and a mask.
    let mut widened = [0u8; 16];
    widened[..PACKED_BYTES].copy_from_slice(packed);
    let stream = u128::from_le_bytes(widened);
    let field = |index: usize| ((stream >> (6 * index)) & 0x3F) as u8;

    let scales: [u8; 8] = std::array::from_fn(field);
    let mins: [u8; 8] = std::array::from_fn(|index| field(index + FIELDS_PER_HALF));
    Ok((scales, mins))
}
/// Unit tests for the block dequantizers and the 6-bit unpacking helpers.
///
/// Most fixtures fill a block with one repeated code so that every output
/// element has the same expected value; a few tests use varying data to pin
/// the element order.
#[cfg(test)]
mod tests {
    use super::*;

    /// Little-endian bytes of `value` rounded to half precision.
    fn f16_le(value: f32) -> [u8; 2] {
        half::f16::from_f32(value).to_bits().to_le_bytes()
    }

    /// Copies a 2-bit code into all four 2-bit lanes of a byte.
    fn splat_2bit(code: u8) -> u8 {
        (code & 0x03) * 0x55
    }

    /// Copies a 4-bit code into both nibbles of a byte.
    fn splat_nibble(code: u8) -> u8 {
        (code & 0x0F) * 0x11
    }

    /// Packs sixteen 6-bit values LSB-first into twelve bytes.
    fn pack_6bit_x16(values: [u8; 16]) -> [u8; 12] {
        let mut stream: u128 = 0;
        for (i, &value) in values.iter().enumerate() {
            stream |= u128::from(value & 0x3F) << (6 * i);
        }
        let mut packed = [0u8; 12];
        packed.copy_from_slice(&stream.to_le_bytes()[..12]);
        packed
    }

    /// Packs eight copies of `scale_raw` followed by eight copies of `min_raw`.
    fn pack_scales_and_mins(scale_raw: u8, min_raw: u8) -> [u8; 12] {
        let mut values = [scale_raw; 16];
        values[8..].fill(min_raw);
        pack_6bit_x16(values)
    }

    /// Asserts that every element of `actual` is within `tolerance` of `expected`.
    fn assert_all_close(actual: &[f32], expected: f32, tolerance: f32) {
        for (i, &val) in actual.iter().enumerate() {
            assert!(
                (val - expected).abs() < tolerance,
                "Element {}: expected {}, got {}",
                i,
                expected,
                val
            );
        }
    }

    /// Builds one Q2_K block in which every group and every code is identical.
    fn build_q2k_block(
        d_val: f32,
        dmin_val: f32,
        scale_nibble: u8,
        min_nibble: u8,
        q: u8,
    ) -> Vec<u8> {
        let mut block = vec![0u8; 84];
        block[0..2].copy_from_slice(&f16_le(d_val));
        block[2..4].copy_from_slice(&f16_le(dmin_val));
        block[4..12].fill(((min_nibble & 0x0F) << 4) | (scale_nibble & 0x0F));
        block[16..80].fill(splat_2bit(q));
        block
    }

    #[test]
    fn test_dequant_q2_k_round_trip() {
        let d = 2.0f32;
        let dmin = 1.0f32;
        let scale_nibble = 2u8;
        let min_nibble = 1u8;
        let q = 3u8;
        let block = build_q2k_block(d, dmin, scale_nibble, min_nibble, q);
        let result = dequant_q2_k(&block, 256).expect("dequant_q2_k failed");
        assert_eq!(result.len(), 256, "Output must have 256 elements");
        let expected = d * scale_nibble as f32 * q as f32 - dmin * min_nibble as f32;
        assert_all_close(&result, expected, 0.05);
    }

    #[test]
    fn test_dequant_q2_k_zero_quants() {
        let block = build_q2k_block(1.0, 0.5, 4, 0, 0);
        let result = dequant_q2_k(&block, 256).expect("dequant_q2_k failed");
        assert_eq!(result.len(), 256);
        for &v in &result {
            assert!(v.abs() < 0.05, "Expected ~0, got {}", v);
        }
    }

    #[test]
    fn test_dequant_q2_k_error_alignment() {
        let block = build_q2k_block(1.0, 0.0, 1, 0, 1);
        assert!(dequant_q2_k(&block, 128).is_err());
    }

    /// Builds one Q3_K block in which every scale and every code is identical.
    fn build_q3k_block(d_val: f32, high_bit: u8, low2: u8, scale_raw: u8) -> Vec<u8> {
        let mut block = vec![0u8; 110];
        block[..32].fill(if high_bit != 0 { 0xFF } else { 0x00 });
        block[32..96].fill(splat_2bit(low2));
        block[96..108].copy_from_slice(&pack_6bit_x16([scale_raw; 16]));
        block[108..110].copy_from_slice(&f16_le(d_val));
        block
    }

    #[test]
    fn test_dequant_q3_k_block_layout() {
        let d = 1.0f32;
        let scale_raw = 36u8;
        let low2 = 2u8;
        let high_bit = 0u8;
        let block = build_q3k_block(d, high_bit, low2, scale_raw);
        let result = dequant_q3_k(&block, 256).expect("dequant_q3_k failed");
        assert_eq!(result.len(), 256);
        let expected = d * (scale_raw as f32 - 32.0) * (low2 as f32 - 4.0);
        assert_all_close(&result, expected, 0.1);
    }

    #[test]
    fn test_dequant_q3_k_high_bit_set() {
        let d = 1.0f32;
        let scale_raw = 35u8;
        let low2 = 3u8;
        let high_bit = 1u8;
        let block = build_q3k_block(d, high_bit, low2, scale_raw);
        let result = dequant_q3_k(&block, 256).expect("dequant_q3_k failed");
        // low2 = 3 with the high bit set gives the 3-bit code 7.
        let expected = d * (scale_raw as f32 - 32.0) * (7.0 - 4.0);
        assert_all_close(&result, expected, 0.1);
    }

    /// Builds one Q4_K block in which every sub-block and every code is identical.
    fn build_q4k_block(
        d_val: f32,
        dmin_val: f32,
        scale_raw: u8,
        min_raw: u8,
        nibble: u8,
    ) -> Vec<u8> {
        let mut block = vec![0u8; 144];
        block[0..2].copy_from_slice(&f16_le(d_val));
        block[2..4].copy_from_slice(&f16_le(dmin_val));
        block[4..16].copy_from_slice(&pack_scales_and_mins(scale_raw, min_raw));
        block[16..144].fill(splat_nibble(nibble));
        block
    }

    #[test]
    fn test_dequant_q4_k_nibble_packing() {
        let d = 2.0f32;
        let dmin = 1.0f32;
        let scale_raw = 10u8;
        let min_raw = 5u8;
        let nibble = 7u8;
        let block = build_q4k_block(d, dmin, scale_raw, min_raw, nibble);
        let result = dequant_q4_k(&block, 256).expect("dequant_q4_k failed");
        assert_eq!(result.len(), 256);
        let expected = d * scale_raw as f32 * nibble as f32 - dmin * min_raw as f32;
        assert_all_close(&result, expected, 0.5);
    }

    #[test]
    fn test_dequant_q4_k_zero_min() {
        let d = 1.0f32;
        let dmin = 1.0f32;
        let scale_raw = 8u8;
        let min_raw = 0u8;
        let nibble = 3u8;
        let block = build_q4k_block(d, dmin, scale_raw, min_raw, nibble);
        let result = dequant_q4_k(&block, 256).expect("dequant_q4_k failed");
        let expected = d * scale_raw as f32 * nibble as f32;
        for &v in &result {
            assert!(
                (v - expected).abs() < 0.3,
                "Expected {}, got {}",
                expected,
                v
            );
        }
    }

    #[test]
    fn test_dequant_q4_k_low_nibble_first() {
        // Distinct nibbles per byte pin the order in which they are emitted.
        let mut block = build_q4k_block(1.0, 0.0, 1, 0, 0);
        block[16..144].fill(0x21);
        let result = dequant_q4_k(&block, 256).expect("dequant_q4_k failed");
        assert_eq!(result.len(), 256);
        for (i, &v) in result.iter().enumerate() {
            let expected = if i % 2 == 0 { 1.0 } else { 2.0 };
            assert!(
                (v - expected).abs() < 1e-6,
                "Element {}: expected {}, got {}",
                i,
                expected,
                v
            );
        }
    }

    /// Builds one Q5_K block in which every sub-block and every code is identical.
    fn build_q5k_block(
        d_val: f32,
        dmin_val: f32,
        scale_raw: u8,
        min_raw: u8,
        low4: u8,
        high_bit: u8,
    ) -> Vec<u8> {
        let mut block = vec![0u8; 176];
        block[0..2].copy_from_slice(&f16_le(d_val));
        block[2..4].copy_from_slice(&f16_le(dmin_val));
        block[4..16].copy_from_slice(&pack_scales_and_mins(scale_raw, min_raw));
        block[16..48].fill(if high_bit != 0 { 0xFF } else { 0x00 });
        block[48..176].fill(splat_nibble(low4));
        block
    }

    #[test]
    fn test_dequant_q5_k_high_bit_merge() {
        let d = 1.0f32;
        let dmin = 0.0f32;
        let scale_raw = 4u8;
        let min_raw = 0u8;
        let low4 = 5u8;
        let high_bit = 1u8;
        let block = build_q5k_block(d, dmin, scale_raw, min_raw, low4, high_bit);
        let result = dequant_q5_k(&block, 256).expect("dequant_q5_k failed");
        assert_eq!(result.len(), 256);
        let q5 = (low4 as f32) + (high_bit as f32) * 16.0;
        let expected = d * scale_raw as f32 * q5 - dmin * min_raw as f32;
        assert_all_close(&result, expected, 0.5);
    }

    #[test]
    fn test_dequant_q5_k_no_high_bit() {
        let d = 1.0f32;
        let dmin = 1.0f32;
        let scale_raw = 3u8;
        let min_raw = 2u8;
        let low4 = 9u8;
        let high_bit = 0u8;
        let block = build_q5k_block(d, dmin, scale_raw, min_raw, low4, high_bit);
        let result = dequant_q5_k(&block, 256).expect("dequant_q5_k failed");
        let expected = d * scale_raw as f32 * low4 as f32 - dmin * min_raw as f32;
        for &v in &result {
            assert!(
                (v - expected).abs() < 0.5,
                "Expected {}, got {}",
                expected,
                v
            );
        }
    }

    #[test]
    fn test_dequant_q8_k_identity() {
        let mut block = vec![0u8; 292];
        block[0..4].copy_from_slice(&1.0f32.to_le_bytes());
        // Every byte value 0..=255 appears once, covering the full i8 range.
        for (i, slot) in block[4..260].iter_mut().enumerate() {
            *slot = i as u8;
        }
        let result = dequant_q8_k(&block, 256).expect("dequant_q8_k failed");
        assert_eq!(result.len(), 256);
        for (i, &val) in result.iter().enumerate() {
            let expected = (block[4 + i] as i8) as f32;
            assert!(
                (val - expected).abs() < 1e-5,
                "Element {}: expected {}, got {}",
                i,
                expected,
                val
            );
        }
    }

    #[test]
    fn test_dequant_q8_k_scale_multiply() {
        let mut block = vec![0u8; 292];
        block[0..4].copy_from_slice(&0.5f32.to_le_bytes());
        block[4..260].fill(100u8);
        let result = dequant_q8_k(&block, 256).expect("dequant_q8_k failed");
        for (i, &v) in result.iter().enumerate() {
            assert!(
                (v - 50.0).abs() < 1e-4,
                "Element {}: expected 50.0, got {}",
                i,
                v
            );
        }
    }

    #[test]
    fn test_dequant_q8_k_negative_quants() {
        let mut block = vec![0u8; 292];
        block[0..4].copy_from_slice(&2.0f32.to_le_bytes());
        block[4..260].fill((-10i8) as u8);
        let result = dequant_q8_k(&block, 256).expect("dequant_q8_k failed");
        for &v in &result {
            assert!((v - (-20.0)).abs() < 1e-4, "Expected -20.0, got {}", v);
        }
    }

    #[test]
    fn test_dequant_error_on_zero_elements() {
        let data = vec![0u8; 84];
        assert!(dequant_q2_k(&data, 0).is_err(), "n=0 should error");
        assert!(dequant_q3_k(&data, 0).is_err(), "n=0 should error");
        assert!(dequant_q4_k(&data, 0).is_err(), "n=0 should error");
        assert!(dequant_q5_k(&data, 0).is_err(), "n=0 should error");
        assert!(dequant_q8_k(&data, 0).is_err(), "n=0 should error");
    }

    #[test]
    fn test_dequant_error_on_short_buffer() {
        let tiny = vec![0u8; 10];
        assert!(dequant_q2_k(&tiny, 256).is_err());
        assert!(dequant_q3_k(&tiny, 256).is_err());
        assert!(dequant_q4_k(&tiny, 256).is_err());
        assert!(dequant_q5_k(&tiny, 256).is_err());
        assert!(dequant_q8_k(&tiny, 256).is_err());
    }

    #[test]
    fn test_dequant_q8_k_multi_block() {
        let mut data = vec![0u8; 292 * 2];
        let d_bytes = 3.0f32.to_le_bytes();
        for block in data.chunks_exact_mut(292) {
            block[..4].copy_from_slice(&d_bytes);
            block[4..260].fill(1u8);
        }
        let result = dequant_q8_k(&data, 512).expect("dequant_q8_k failed");
        assert_eq!(result.len(), 512);
        for &v in &result {
            assert!((v - 3.0).abs() < 1e-4, "Expected 3.0, got {}", v);
        }
    }

    #[test]
    fn test_decode_6bit_helpers_unpack_in_order() {
        // Sixteen distinct values so that any reordering is detected.
        let values: [u8; 16] = std::array::from_fn(|i| i as u8 * 4 + 1);
        let packed = pack_6bit_x16(values);

        let all = decode_6bit_scales_16(&packed).expect("decode_6bit_scales_16 failed");
        assert_eq!(all, values);

        let (scales, mins) =
            decode_6bit_scales_and_mins_8(&packed).expect("decode_6bit_scales_and_mins_8 failed");
        assert_eq!(&scales[..], &values[..8]);
        assert_eq!(&mins[..], &values[8..]);

        assert!(decode_6bit_scales_16(&packed[..11]).is_err());
        assert!(decode_6bit_scales_and_mins_8(&packed[..11]).is_err());
    }
}