#![allow(
clippy::panic,
clippy::unwrap_used,
clippy::expect_used,
clippy::as_conversions,
clippy::cast_precision_loss,
clippy::cast_possible_truncation,
clippy::indexing_slicing
)]
use std::time::Instant;
use anamnesis::dequantize_per_tensor_fp8_to_bf16;
#[cfg(feature = "gguf")]
use anamnesis::{dequantize_gguf_blocks_to_bf16, dequantize_gguf_to_bf16, GgufType};
/// Formats a one-line timing summary (median, min, max) in milliseconds.
///
/// `samples` is read positionally: the first element is reported as the
/// minimum, the last as the maximum, and the element at `len / 2` as the
/// median. The caller is therefore expected to pass an ascending-sorted slice.
///
/// # Panics
///
/// Panics if `samples` is empty.
fn fmt_stats(samples: &[f64]) -> String {
    let count = samples.len();
    let (median, min, max) = (samples[count / 2], samples[0], samples[count - 1]);
    format!("median {median:.2} ms (min {min:.2}, max {max:.2})")
}
/// Times the closure `f` and returns five wall-clock samples in milliseconds,
/// sorted ascending.
///
/// `f` is first invoked twice untimed as a warm-up, then five more times with
/// each call timed individually. Because the result is sorted, index 0 is the
/// fastest run, index 2 the median and index 4 the slowest.
///
/// The `u8` returned by each timed call is folded into an accumulator that is
/// printed to stderr, so the work inside `f` has an observable effect and is
/// less likely to be optimised away.
///
/// # Panics
///
/// Panics if two samples cannot be ordered (i.e. a sample is NaN).
fn time_best_of_5<F>(mut f: F) -> Vec<f64>
where
    F: FnMut() -> u8,
{
    const WARMUP_RUNS: usize = 2;
    const TIMED_RUNS: usize = 5;

    // Untimed warm-up calls; their results are deliberately discarded.
    for _ in 0..WARMUP_RUNS {
        let _ = f();
    }

    let mut anti_dce: u64 = 0;
    let mut samples: Vec<f64> = (0..TIMED_RUNS)
        .map(|_| {
            let start = Instant::now();
            anti_dce = anti_dce.wrapping_add(u64::from(f()));
            start.elapsed().as_secs_f64() * 1000.0
        })
        .collect();

    samples.sort_by(|lhs, rhs| lhs.partial_cmp(rhs).unwrap());
    eprintln!("(anti-DCE accumulator: {anti_dce})");
    samples
}
/// Ad-hoc benchmark of `dequantize_per_tensor_fp8_to_bf16` on a
/// 4096 × 11008 byte tensor with a single scale of 0.5.
///
/// Prints the sorted samples, a median/min/max summary and the BF16 output
/// throughput (based on the median sample) to stderr. Ignored by default.
#[test]
#[ignore = "ad-hoc benchmark; run with --release --ignored --nocapture"]
fn bench_fp8_per_tensor() {
    const ROWS: usize = 4096;
    const COLS: usize = 11008;
    const N: usize = ROWS * COLS;
    // The output is reported as two bytes per input element.
    const OUT_BYTES: usize = N * 2;

    // Deterministic filler: multiply the index by a fixed constant and keep
    // the low byte of the result shifted right by 24 (truncating cast).
    let mut weight: Vec<u8> = Vec::with_capacity(N);
    weight.extend((0..N).map(|i| ((i as u64 * 0x9E37_79B9) >> 24) as u8));
    let scale: f32 = 0.5;

    eprintln!(
        "\n=== bench_fp8_per_tensor ({ROWS} × {COLS} = {} elements, {} MB → {} MB BF16) ===",
        N,
        N / 1_000_000,
        OUT_BYTES / 1_000_000,
    );

    let samples = time_best_of_5(|| {
        let bytes = dequantize_per_tensor_fp8_to_bf16(&weight, scale).unwrap();
        // Return the final output byte so the timed work has a visible result.
        bytes[bytes.len() - 1]
    });

    eprintln!("samples (ms): {samples:?}");
    eprintln!("{}", fmt_stats(&samples));

    // Samples are sorted, so index 2 of 5 is the median.
    let out_megabytes = OUT_BYTES as f64 / 1_000_000.0;
    let median_seconds = samples[2] / 1000.0;
    eprintln!(
        "throughput: {:.0} MB/s (BF16 output)",
        out_megabytes / median_seconds
    );
}
/// Baseline ("OLD") dequantisation path used for comparison in the GGUF
/// benchmarks: allocates a zero-filled output buffer of `n_elements * 2`
/// bytes up front, then copies every block handed to the sink closure by
/// `dequantize_gguf_blocks_to_bf16` into it at a running offset.
///
/// # Errors
///
/// Propagates any error returned by `dequantize_gguf_blocks_to_bf16`.
///
/// # Panics
///
/// Panics if `n_elements * 2` overflows `usize`, or if the blocks delivered
/// to the sink exceed the pre-allocated buffer.
#[cfg(feature = "gguf")]
fn dequantize_via_indexed_sink(
    data: &[u8],
    dtype: GgufType,
    n_elements: usize,
) -> anamnesis::Result<Vec<u8>> {
    let out_byte_len = n_elements
        .checked_mul(2)
        .expect("output size overflow in bench fixture");
    let mut out = vec![0u8; out_byte_len];

    // Write position of the next block inside `out`.
    let mut cursor = 0usize;
    dequantize_gguf_blocks_to_bf16(data, dtype, n_elements, |block_out| {
        let end = cursor + block_out.len();
        out[cursor..end].copy_from_slice(block_out);
        cursor = end;
        Ok(())
    })?;

    Ok(out)
}
#[cfg(feature = "gguf")]
/// Builds a synthetic Q8_0 input of `n_blocks` blocks, 34 bytes each.
///
/// Every block starts with the bytes `0x00, 0x3C` and is zero elsewhere.
/// NOTE(review): the prefix is presumably the block's little-endian f16
/// scale set to 1.0 (0x3C00); confirm against the Q8_0 block layout.
fn build_q8_0_buffer(n_blocks: usize) -> Vec<u8> {
    const BLOCK_BYTES: usize = 34;
    const BLOCK_PREFIX: [u8; 2] = [0x00, 0x3C];

    let mut buf = vec![0u8; n_blocks * BLOCK_BYTES];
    buf.chunks_exact_mut(BLOCK_BYTES)
        .for_each(|block| block[..2].copy_from_slice(&BLOCK_PREFIX));
    buf
}
#[cfg(feature = "gguf")]
/// Builds a synthetic Q4_0 input of `n_blocks` blocks, 18 bytes each.
///
/// Every block starts with the bytes `0x00, 0x3C` and is zero elsewhere.
/// NOTE(review): the prefix is presumably the block's little-endian f16
/// scale set to 1.0 (0x3C00); confirm against the Q4_0 block layout.
fn build_q4_0_buffer(n_blocks: usize) -> Vec<u8> {
    const BLOCK_BYTES: usize = 18;
    const BLOCK_PREFIX: [u8; 2] = [0x00, 0x3C];

    let mut buf = vec![0u8; n_blocks * BLOCK_BYTES];
    buf.chunks_exact_mut(BLOCK_BYTES)
        .for_each(|block| block[..2].copy_from_slice(&BLOCK_PREFIX));
    buf
}
/// Benchmarks one GGUF input with both dequantisation paths and prints a
/// single comparison line to stderr.
///
/// The "NEW" path is `dequantize_gguf_to_bf16`; the "OLD" path is
/// `dequantize_via_indexed_sink`. NEW is timed first, then OLD.
///
/// Returns the relative difference of the NEW median against the OLD median,
/// in percent (negative means NEW was faster).
#[cfg(feature = "gguf")]
fn run_gguf_one(label: &str, data: &[u8], dtype: GgufType, n_elements: usize) -> f64 {
    let new_samples = time_best_of_5(|| {
        let bytes = dequantize_gguf_to_bf16(data, dtype, n_elements).unwrap();
        bytes[bytes.len() - 1]
    });
    let old_samples = time_best_of_5(|| {
        let bytes = dequantize_via_indexed_sink(data, dtype, n_elements).unwrap();
        bytes[bytes.len() - 1]
    });

    // Each sample set holds five sorted values: [0] fastest, [2] median, [4] slowest.
    let (new_fastest, median_new, new_slowest) = (new_samples[0], new_samples[2], new_samples[4]);
    let (old_fastest, median_old, old_slowest) = (old_samples[0], old_samples[2], old_samples[4]);

    let delta_pct = (median_new - median_old) / median_old * 100.0;
    eprintln!(
        "{label:<20} NEW {median_new:>7.2} ms (range {:.2}-{:.2}) \
         OLD {median_old:>7.2} ms (range {:.2}-{:.2}) Δ {delta_pct:+.1}%",
        new_fastest, new_slowest, old_fastest, old_slowest,
    );
    delta_pct
}
/// Ad-hoc benchmark comparing the NEW and OLD GGUF dequantisation paths
/// across four tensor sizes, first for Q8_0 and then for Q4_0.
///
/// For every size a synthetic buffer of `n / 32` blocks is built and handed
/// to `run_gguf_one`; the per-size median deltas are collected and printed
/// as a summary at the end. Ignored by default.
#[cfg(feature = "gguf")]
#[test]
#[ignore = "ad-hoc benchmark; run with --release --features gguf --ignored --nocapture"]
fn bench_gguf_size_sweep() {
    // (display label, element count) pairs swept by both quantisation types.
    const SIZES: &[(&str, usize)] = &[
        ("1M (2 MB BF16)", 1_048_576),
        ("8M (16 MB BF16)", 8 * 1_048_576),
        ("45M (90 MB BF16)", 4096 * 11008),
        ("100M (200 MB BF16)", 100 * 1_048_576),
    ];

    eprintln!(
        "\n=== bench_gguf_size_sweep — NEW (current Vec::with_capacity + extend_from_slice) \
         vs OLD (vec![0u8; n] + indexed sink) ===\n"
    );

    eprintln!("--- Q8_0 ---");
    let q8_deltas: Vec<f64> = SIZES
        .iter()
        .map(|&(label, n)| {
            // The element count is divided by 32 to get the block count.
            let input = build_q8_0_buffer(n / 32);
            run_gguf_one(label, &input, GgufType::Q8_0, n)
        })
        .collect();

    eprintln!("\n--- Q4_0 ---");
    let q4_deltas: Vec<f64> = SIZES
        .iter()
        .map(|&(label, n)| {
            let input = build_q4_0_buffer(n / 32);
            run_gguf_one(label, &input, GgufType::Q4_0, n)
        })
        .collect();

    eprintln!(
        "\n--- Summary: NEW vs OLD median deltas across sizes ---\n\
         Q8_0 deltas: {q8_deltas:+.1?}\n\
         Q4_0 deltas: {q4_deltas:+.1?}"
    );
    eprintln!(
        "\nDirectional finding holds if all Q8_0 deltas have the same \
         sign and all Q4_0 deltas have the same (opposite) sign."
    );
}