// rage_quant/lib.rs
//! # rage-quant
//!
//! High-performance quantized GEMV kernels for CPU-only LLM inference.
//!
//! This crate provides direct dot product operations on GGML quantized
//! tensor blocks (Q8_0, Q6_K, Q4_K) without requiring dequantization
//! to dense f32 tensors first. This approach:
//!
//! - Reduces DRAM bandwidth by 3.76x (1.06 bytes/elem vs 4 bytes/elem)
//! - Eliminates the dense f32 cache entirely (78.8% RAM savings)
//! - Achieves a 3.0x decode speedup on CPU inference
//!
//! ## Key functions
//!
//! - [`dot_q8_0_f32`] — Direct dot product on Q8_0 blocks (auto-detects AVX2)
//! - [`dot_q6_k_f32`] — Direct dot product on Q6_K blocks
//! - [`dot_q4_k_f32`] — Direct dot product on Q4_K blocks
//! - [`dequantize_q8_0_block`] — Dequantize a Q8_0 block to f32
//! - [`dequantize_q4_k_block`] — Dequantize a Q4_K block to f32
//! - [`dequantize_q6_k_block`] — Dequantize a Q6_K block to f32
//!
//! ## GEMM/GEMV utilities
//!
//! - [`gemm_kernel::dot_f32`] — AVX2+FMA vectorized f32 dot product
//! - [`gemm_kernel::gemv_rows_f32`] — Rayon-parallelized GEMV
//! - [`gemm_kernel::gemm_f32_row_major`] — Rayon-parallelized GEMM
//!
//! ## Example
//!
//! ```ignore
//! use rage_quant::{dot_q8_0_f32, dequantize_q8_0_block};
//!
//! // Direct quantized dot product (no dequantization needed)
//! let result = dot_q8_0_f32(&quantized_data, &input_vector, num_elements);
//!
//! // Or dequantize a single block if needed
//! let f32_values = dequantize_q8_0_block(&block_bytes).unwrap();
//! ```

40pub mod ggml_quant;
41pub mod gemm_kernel;
42
43// Re-export primary quantized dot product functions
44pub use ggml_quant::{
45 dot_q8_0_f32,
46 dot_q6_k_f32,
47 dot_q4_k_f32,
48 dequantize_q8_0_block,
49 dequantize_q4_k_block,
50 dequantize_q6_k_block,
51 decode_f16,
52};
53
54// Re-export GEMM/GEMV utilities
55pub use gemm_kernel::{dot_f32, gemv_rows_f32, gemm_f32_row_major};