entrenar/quant/gguf_quant/
mod.rs

//! GGUF-compatible quantization formats (Q4_0, Q8_0)
//!
//! Implements quantization formats compatible with llama.cpp and GGUF:
//! - Q4_0: 4-bit quantization with per-block f16 scale (32 elements/block)
//! - Q8_0: 8-bit quantization with per-block f16 scale (32 elements/block)
//!
//! Block structure:
//! - Q4_0: 2 bytes scale (f16) + 16 bytes data (32 × 4-bit) = 18 bytes/block
//! - Q8_0: 2 bytes scale (f16) + 32 bytes data (32 × 8-bit) = 34 bytes/block

11mod q4_0;
12mod q8_0;
13mod quant_type;
14
15#[cfg(test)]
16mod tests;
17
18pub use q4_0::Q4_0;
19pub use q8_0::Q8_0;
20pub use quant_type::GGUFQuantType;
21
/// Number of elements in one GGUF quantization block (the standard block
/// size used by llama.cpp).
///
/// Both the Q4_0 and Q8_0 formats quantize values in blocks of this many
/// elements, each block carrying its own f16 scale.
pub const GGUF_BLOCK_SIZE: usize = 32;

entrenar/quant/gguf_quant/mod.rs

entrenar/quant/gguf_quant/
mod.rs