oxillama_quant/lib.rs
1//! # oxillama-quant
2//!
3//! Quantization kernel library for OxiLLaMa.
4//!
5//! Provides dequantization and fused matmul operations for all GGUF
6//! quantization formats. Each format has three implementation tiers:
7//!
8//! 1. **Reference (naive)** — Pure scalar Rust for correctness.
9//! 2. **Portable SIMD** — Cross-platform vectorization.
10//! 3. **Platform SIMD** — AVX2, AVX-512, NEON intrinsics.
11//!
12//! ## Supported Formats (planned)
13//!
14//! | Category | Types |
15//! |----------|-------|
16//! | Legacy | Q4_0, Q4_1, Q5_0, Q5_1, Q8_0, Q8_1 |
17//! | K-Quants | Q2_K, Q3_K, Q4_K, Q5_K, Q6_K |
18//! | I-Quants | IQ1_S, IQ1_M, IQ2_XXS, IQ2_XS, IQ2_S, IQ3_XXS, IQ3_S, IQ4_XS, IQ4_NL |
19//! | 1-Bit | Q1_0_G128 (from OxiBonsai) |
20//! | Float | F16, BF16, F32 |
21
// Public module tree of the crate. Each `pub mod name;` declaration pulls the
// module body in from a separate source file that is not visible from here.
//
// Modules whose items are re-exported at the crate root further down:
// `dispatch`, `error`, `lora`, `quantize`, `traits`, `types`.
// Modules with no crate-root re-export in this file (reachable only through
// their module path): `parallel`, `reference`, `simd`.
//
// NOTE(review): `reference` and `simd` presumably correspond to the
// "Reference (naive)" and "Portable/Platform SIMD" tiers described in the
// crate-level docs; this is inferred from the names only — confirm against
// the modules themselves.
22pub mod dispatch;
23pub mod error;
24pub mod lora;
25pub mod parallel;
26pub mod quantize;
27pub mod reference;
28pub mod simd;
29pub mod traits;
30pub mod types;
31
// Crate-root re-exports. Each `pub use` makes the listed items nameable
// directly from the crate root in addition to their defining-module path
// (the defining modules are themselves declared `pub` in this file).
//
// NOTE(review): judging by their names only, the `quantize` functions convert
// f16/f32 data to Q4_0/Q8_0 and back to f32 — their signatures are not visible
// here, so confirm against the `quantize` module before relying on that.
32pub use dispatch::{global_dispatcher, CachedDispatcher, KernelDispatcher};
33pub use error::{QuantError, QuantResult};
34pub use lora::LoraAdapter;
35pub use quantize::{
36 dequantize_to_f32, quantize_f16_to_q4_0, quantize_f16_to_q8_0, quantize_f32_to_q4_0,
37 quantize_f32_to_q8_0,
38};
39pub use traits::QuantKernel;
40pub use types::{BlockInfo, QuantTensor};