// llama_cpp_4/quantize.rs
//! Quantization types and parameters for converting models to lower-bit precisions.
//!
//! # Quick start
//!
//! ```no_run
//! use llama_cpp_4::quantize::{LlamaFtype, QuantizeParams};
//!
//! let params = QuantizeParams::new(LlamaFtype::MostlyQ4KM)
//!     .with_nthread(8)
//!     .with_quantize_output_tensor(true);
//!
//! llama_cpp_4::model_quantize("model-f16.gguf", "model-q4km.gguf", &params).unwrap();
//! ```
//!
//! # TurboQuant – attention rotation (PR #21038)
//!
//! llama.cpp applies a Hadamard rotation to Q/K/V tensors before writing them into the KV cache.
//! This significantly improves KV-cache quantization quality at near-zero cost, and is enabled by
//! default for every model whose head dimension is a power of two.  You can opt out per-context
//! with [`LlamaContextParams::with_attn_rot_disabled`] or globally with
//! [`set_attn_rot_disabled`].
//!
//! [`LlamaContextParams::with_attn_rot_disabled`]: crate::context::params::LlamaContextParams::with_attn_rot_disabled

25use std::ffi::{CString, NulError};
26use std::ptr::null;
27
28// ─────────────────────────────────────────────────────────────────────────────
29// LlamaFtype
30// ─────────────────────────────────────────────────────────────────────────────
31
/// The quantization type used for the bulk of a model file (maps to `llama_ftype`).
///
/// Pass one of these variants to [`QuantizeParams::new`] to choose the target precision.
///
/// NOTE: the explicit discriminants must stay in sync with the C `llama_ftype`
/// enum; the numeric gaps (4–6, 33–35) are values not exposed by this wrapper.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[non_exhaustive]
#[allow(missing_docs)]
pub enum LlamaFtype {
    /// All tensors stored as full F32 (very large, for reference only)
    AllF32 = 0,
    /// F16 – 14 GB @ 7B, +0.0020 ppl vs Mistral-7B
    MostlyF16 = 1,
    /// Q4_0 – 4.34 GB @ 8B, +0.4685 ppl
    MostlyQ4_0 = 2,
    /// Q4_1 – 4.78 GB @ 8B, +0.4511 ppl
    MostlyQ4_1 = 3,
    /// Q8_0 – 7.96 GB @ 8B, +0.0026 ppl
    MostlyQ8_0 = 7,
    /// Q5_0 – 5.21 GB @ 8B, +0.1316 ppl
    MostlyQ5_0 = 8,
    /// Q5_1 – 5.65 GB @ 8B, +0.1062 ppl
    MostlyQ5_1 = 9,
    /// Q2_K – 2.96 GB @ 8B, +3.5199 ppl
    MostlyQ2K = 10,
    /// Q3_K small – 3.41 GB @ 8B, +1.6321 ppl
    MostlyQ3KS = 11,
    /// Q3_K medium – 3.74 GB @ 8B, +0.6569 ppl
    MostlyQ3KM = 12,
    /// Q3_K large – 4.03 GB @ 8B, +0.5562 ppl
    MostlyQ3KL = 13,
    /// Q4_K small – 4.37 GB @ 8B, +0.2689 ppl
    MostlyQ4KS = 14,
    /// Q4_K medium – 4.58 GB @ 8B, +0.1754 ppl  *(recommended default)*
    MostlyQ4KM = 15,
    /// Q5_K small – 5.21 GB @ 8B, +0.1049 ppl
    MostlyQ5KS = 16,
    /// Q5_K medium – 5.33 GB @ 8B, +0.0569 ppl
    MostlyQ5KM = 17,
    /// Q6_K – 6.14 GB @ 8B, +0.0217 ppl
    MostlyQ6K = 18,
    /// IQ2_XXS – 2.06 bpw
    MostlyIQ2XXS = 19,
    /// IQ2_XS – 2.31 bpw
    MostlyIQ2XS = 20,
    /// Q2_K small
    MostlyQ2KS = 21,
    /// IQ3_XS – 3.3 bpw
    MostlyIQ3XS = 22,
    /// IQ3_XXS – 3.06 bpw
    MostlyIQ3XXS = 23,
    /// IQ1_S – 1.56 bpw (extremely small, high loss)
    MostlyIQ1S = 24,
    /// IQ4_NL – 4.50 bpw non-linear
    MostlyIQ4NL = 25,
    /// IQ3_S – 3.44 bpw
    MostlyIQ3S = 26,
    /// IQ3_M – 3.66 bpw
    MostlyIQ3M = 27,
    /// IQ2_S – 2.5 bpw
    MostlyIQ2S = 28,
    /// IQ2_M – 2.7 bpw
    MostlyIQ2M = 29,
    /// IQ4_XS – 4.25 bpw non-linear
    MostlyIQ4XS = 30,
    /// IQ1_M – 1.75 bpw
    MostlyIQ1M = 31,
    /// BF16 – 14 GB @ 7B, −0.0050 ppl vs Mistral-7B
    MostlyBF16 = 32,
    /// TQ1_0 – 1.69 bpw ternary
    MostlyTQ1_0 = 36,
    /// TQ2_0 – 2.06 bpw ternary
    MostlyTQ2_0 = 37,
    /// MXFP4 (MoE layers)
    MostlyMXFP4Moe = 38,
    /// NVFP4
    MostlyNVFP4 = 39,
    /// Q1_0 – 1.5 bpw binary (block size 32)
    #[cfg(feature = "q1")]
    MostlyQ1_0 = 40,
    /// Q1_0_g128 – 1.125 bpw binary (block size 128)
    #[cfg(feature = "q1")]
    MostlyQ1_0_G128 = 41,
}
114
115impl LlamaFtype {
116    /// Short name suitable for filenames (e.g. `"Q4_K_M"`).
117    #[must_use]
118    pub fn name(self) -> &'static str {
119        match self {
120            Self::AllF32 => "F32",
121            Self::MostlyF16 => "F16",
122            Self::MostlyQ4_0 => "Q4_0",
123            Self::MostlyQ4_1 => "Q4_1",
124            Self::MostlyQ8_0 => "Q8_0",
125            Self::MostlyQ5_0 => "Q5_0",
126            Self::MostlyQ5_1 => "Q5_1",
127            Self::MostlyQ2K => "Q2_K",
128            Self::MostlyQ3KS => "Q3_K_S",
129            Self::MostlyQ3KM => "Q3_K_M",
130            Self::MostlyQ3KL => "Q3_K_L",
131            Self::MostlyQ4KS => "Q4_K_S",
132            Self::MostlyQ4KM => "Q4_K_M",
133            Self::MostlyQ5KS => "Q5_K_S",
134            Self::MostlyQ5KM => "Q5_K_M",
135            Self::MostlyQ6K => "Q6_K",
136            Self::MostlyIQ2XXS => "IQ2_XXS",
137            Self::MostlyIQ2XS => "IQ2_XS",
138            Self::MostlyQ2KS => "Q2_K_S",
139            Self::MostlyIQ3XS => "IQ3_XS",
140            Self::MostlyIQ3XXS => "IQ3_XXS",
141            Self::MostlyIQ1S => "IQ1_S",
142            Self::MostlyIQ4NL => "IQ4_NL",
143            Self::MostlyIQ3S => "IQ3_S",
144            Self::MostlyIQ3M => "IQ3_M",
145            Self::MostlyIQ2S => "IQ2_S",
146            Self::MostlyIQ2M => "IQ2_M",
147            Self::MostlyIQ4XS => "IQ4_XS",
148            Self::MostlyIQ1M => "IQ1_M",
149            Self::MostlyBF16 => "BF16",
150            Self::MostlyTQ1_0 => "TQ1_0",
151            Self::MostlyTQ2_0 => "TQ2_0",
152            Self::MostlyMXFP4Moe => "MXFP4_MOE",
153            Self::MostlyNVFP4 => "NVFP4",
154            #[cfg(feature = "q1")]
155            Self::MostlyQ1_0 => "Q1_0",
156            #[cfg(feature = "q1")]
157            Self::MostlyQ1_0_G128 => "Q1_0_g128",
158        }
159    }
160
161    /// Human-readable description with approximate size and PPL delta.
162    #[must_use]
163    pub fn description(self) -> &'static str {
164        match self {
165            Self::AllF32 => "26.00 GB @ 7B — full precision reference",
166            Self::MostlyF16 => "14.00 GB @ 7B — +0.0020 ppl vs Mistral-7B",
167            Self::MostlyBF16 => "14.00 GB @ 7B — -0.0050 ppl vs Mistral-7B",
168            Self::MostlyQ8_0 => " 7.96 GB @ 8B — +0.0026 ppl",
169            Self::MostlyQ6K => " 6.14 GB @ 8B — +0.0217 ppl",
170            Self::MostlyQ5KM => " 5.33 GB @ 8B — +0.0569 ppl",
171            Self::MostlyQ5KS => " 5.21 GB @ 8B — +0.1049 ppl",
172            Self::MostlyQ5_1 => " 5.65 GB @ 8B — +0.1062 ppl",
173            Self::MostlyQ5_0 => " 5.21 GB @ 8B — +0.1316 ppl",
174            Self::MostlyQ4KM => " 4.58 GB @ 8B — +0.1754 ppl  [recommended]",
175            Self::MostlyQ4KS => " 4.37 GB @ 8B — +0.2689 ppl",
176            Self::MostlyQ4_1 => " 4.78 GB @ 8B — +0.4511 ppl",
177            Self::MostlyQ4_0 => " 4.34 GB @ 8B — +0.4685 ppl",
178            Self::MostlyQ3KL => " 4.03 GB @ 8B — +0.5562 ppl",
179            Self::MostlyQ3KM => " 3.74 GB @ 8B — +0.6569 ppl",
180            Self::MostlyQ3KS => " 3.41 GB @ 8B — +1.6321 ppl",
181            Self::MostlyQ2KS => " 2.96 GB @ 8B — +3.1836 ppl",
182            Self::MostlyQ2K => " 2.96 GB @ 8B — +3.5199 ppl",
183            Self::MostlyIQ4XS => " 4.25 bpw non-linear",
184            Self::MostlyIQ4NL => " 4.50 bpw non-linear",
185            Self::MostlyIQ3S => " 3.44 bpw",
186            Self::MostlyIQ3M => " 3.66 bpw",
187            Self::MostlyIQ3XS => " 3.3 bpw",
188            Self::MostlyIQ3XXS => " 3.06 bpw",
189            Self::MostlyIQ2M => " 2.7 bpw",
190            Self::MostlyIQ2S => " 2.5 bpw",
191            Self::MostlyIQ2XS => " 2.31 bpw",
192            Self::MostlyIQ2XXS => " 2.06 bpw",
193            Self::MostlyIQ1M => " 1.75 bpw — extreme compression",
194            Self::MostlyIQ1S => " 1.56 bpw — extreme compression",
195            Self::MostlyTQ1_0 => " 1.69 bpw ternary",
196            Self::MostlyTQ2_0 => " 2.06 bpw ternary",
197            Self::MostlyMXFP4Moe => "MXFP4 MoE layers",
198            Self::MostlyNVFP4 => "NVFP4",
199            #[cfg(feature = "q1")]
200            Self::MostlyQ1_0 => " 1.50 bpw — binary Q1_0 (block 32)",
201            #[cfg(feature = "q1")]
202            Self::MostlyQ1_0_G128 => " 1.125 bpw — binary Q1_0_g128 (block 128)",
203        }
204    }
205
206    /// Look up a variant by its short name (case-insensitive).
207    ///
208    /// ```
209    /// use llama_cpp_4::quantize::LlamaFtype;
210    /// assert_eq!(LlamaFtype::from_name("Q4_K_M"), Some(LlamaFtype::MostlyQ4KM));
211    /// assert_eq!(LlamaFtype::from_name("q4_k_m"), Some(LlamaFtype::MostlyQ4KM));
212    /// assert_eq!(LlamaFtype::from_name("bogus"), None);
213    /// ```
214    #[must_use]
215    pub fn from_name(name: &str) -> Option<Self> {
216        let upper = name.to_uppercase();
217        match upper.as_str() {
218            "F32" => Some(Self::AllF32),
219            "F16" => Some(Self::MostlyF16),
220            "BF16" => Some(Self::MostlyBF16),
221            "Q4_0" => Some(Self::MostlyQ4_0),
222            "Q4_1" => Some(Self::MostlyQ4_1),
223            "Q8_0" => Some(Self::MostlyQ8_0),
224            "Q5_0" => Some(Self::MostlyQ5_0),
225            "Q5_1" => Some(Self::MostlyQ5_1),
226            "Q2_K" => Some(Self::MostlyQ2K),
227            "Q2_K_S" => Some(Self::MostlyQ2KS),
228            "Q3_K_S" => Some(Self::MostlyQ3KS),
229            "Q3_K_M" => Some(Self::MostlyQ3KM),
230            "Q3_K_L" => Some(Self::MostlyQ3KL),
231            "Q4_K_S" => Some(Self::MostlyQ4KS),
232            "Q4_K_M" => Some(Self::MostlyQ4KM),
233            "Q5_K_S" => Some(Self::MostlyQ5KS),
234            "Q5_K_M" => Some(Self::MostlyQ5KM),
235            "Q6_K" => Some(Self::MostlyQ6K),
236            "IQ1_S" => Some(Self::MostlyIQ1S),
237            "IQ1_M" => Some(Self::MostlyIQ1M),
238            "IQ2_XXS" => Some(Self::MostlyIQ2XXS),
239            "IQ2_XS" => Some(Self::MostlyIQ2XS),
240            "IQ2_S" => Some(Self::MostlyIQ2S),
241            "IQ2_M" => Some(Self::MostlyIQ2M),
242            "IQ3_XXS" => Some(Self::MostlyIQ3XXS),
243            "IQ3_XS" => Some(Self::MostlyIQ3XS),
244            "IQ3_S" => Some(Self::MostlyIQ3S),
245            "IQ3_M" => Some(Self::MostlyIQ3M),
246            "IQ4_NL" => Some(Self::MostlyIQ4NL),
247            "IQ4_XS" => Some(Self::MostlyIQ4XS),
248            "TQ1_0" => Some(Self::MostlyTQ1_0),
249            "TQ2_0" => Some(Self::MostlyTQ2_0),
250            "MXFP4_MOE" => Some(Self::MostlyMXFP4Moe),
251            "NVFP4" => Some(Self::MostlyNVFP4),
252            #[cfg(feature = "q1")]
253            "Q1_0" => Some(Self::MostlyQ1_0),
254            #[cfg(feature = "q1")]
255            "Q1_0_G128" | "Q1_0_g128" => Some(Self::MostlyQ1_0_G128),
256            _ => None,
257        }
258    }
259
260    /// All available types, ordered roughly from largest to smallest.
261    #[must_use]
262    pub fn all() -> &'static [Self] {
263        &[
264            Self::AllF32,
265            Self::MostlyF16,
266            Self::MostlyBF16,
267            Self::MostlyQ8_0,
268            Self::MostlyQ6K,
269            Self::MostlyQ5KM,
270            Self::MostlyQ5KS,
271            Self::MostlyQ5_1,
272            Self::MostlyQ5_0,
273            Self::MostlyQ4KM,
274            Self::MostlyQ4KS,
275            Self::MostlyQ4_1,
276            Self::MostlyQ4_0,
277            Self::MostlyQ3KL,
278            Self::MostlyQ3KM,
279            Self::MostlyQ3KS,
280            Self::MostlyQ2KS,
281            Self::MostlyQ2K,
282            Self::MostlyIQ4XS,
283            Self::MostlyIQ4NL,
284            Self::MostlyIQ3S,
285            Self::MostlyIQ3M,
286            Self::MostlyIQ3XS,
287            Self::MostlyIQ3XXS,
288            Self::MostlyIQ2M,
289            Self::MostlyIQ2S,
290            Self::MostlyIQ2XS,
291            Self::MostlyIQ2XXS,
292            Self::MostlyIQ1M,
293            Self::MostlyIQ1S,
294            Self::MostlyTQ1_0,
295            Self::MostlyTQ2_0,
296            Self::MostlyMXFP4Moe,
297            Self::MostlyNVFP4,
298            #[cfg(feature = "q1")]
299            Self::MostlyQ1_0,
300            #[cfg(feature = "q1")]
301            Self::MostlyQ1_0_G128,
302        ]
303    }
304}
305
306impl From<LlamaFtype> for llama_cpp_sys_4::llama_ftype {
307    fn from(t: LlamaFtype) -> Self {
308        t as llama_cpp_sys_4::llama_ftype
309    }
310}
311
312impl std::fmt::Display for LlamaFtype {
313    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
314        write!(f, "{}", self.name())
315    }
316}
317
318// ─────────────────────────────────────────────────────────────────────────────
319// GgmlType
320// ─────────────────────────────────────────────────────────────────────────────
321
/// GGML tensor storage type (maps to `ggml_type`).
///
/// Used to set [`QuantizeParams::output_tensor_type`] and
/// [`QuantizeParams::token_embedding_type`], and for per-tensor type overrides
/// in [`TensorTypeOverride`].
///
/// NOTE: the explicit discriminants must match the C `ggml_type` enum; the
/// numeric gaps (4–5, 31–33, 36–38) are values not exposed by this wrapper.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[non_exhaustive]
#[allow(missing_docs)]
pub enum GgmlType {
    F32 = 0,
    F16 = 1,
    Q4_0 = 2,
    Q4_1 = 3,
    Q5_0 = 6,
    Q5_1 = 7,
    Q8_0 = 8,
    Q8_1 = 9,
    Q2K = 10,
    Q3K = 11,
    Q4K = 12,
    Q5K = 13,
    Q6K = 14,
    Q8K = 15,
    IQ2XXS = 16,
    IQ2XS = 17,
    IQ3XXS = 18,
    IQ1S = 19,
    IQ4NL = 20,
    IQ3S = 21,
    IQ2S = 22,
    IQ4XS = 23,
    I8 = 24,
    I16 = 25,
    I32 = 26,
    I64 = 27,
    F64 = 28,
    IQ1M = 29,
    BF16 = 30,
    TQ1_0 = 34,
    TQ2_0 = 35,
    MXFP4 = 39,
    /// NVFP4 — renumbered to 42 when the `q1` feature is active (40 and 41
    /// are taken by Q1_0 / Q1_0_g128 for PrismML GGUF compatibility).
    #[cfg(not(feature = "q1"))]
    NVFP4 = 40,
    /// Q1_0 — binary quantization, block size 32 (`q1` feature only).
    #[cfg(feature = "q1")]
    Q1_0 = 40,
    /// Q1_0_g128 — binary quantization, block size 128 (`q1` feature only).
    #[cfg(feature = "q1")]
    Q1_0_G128 = 41,
    /// NVFP4 — renumbered from 40 because Q1_0 / Q1_0_g128 occupy 40 and 41.
    #[cfg(feature = "q1")]
    NVFP4 = 42,
}
374
375impl From<GgmlType> for llama_cpp_sys_4::ggml_type {
376    fn from(t: GgmlType) -> Self {
377        t as llama_cpp_sys_4::ggml_type
378    }
379}
380
impl TryFrom<llama_cpp_sys_4::ggml_type> for GgmlType {
    /// Unknown raw values are returned unchanged as the error.
    type Error = llama_cpp_sys_4::ggml_type;

    /// Map a raw C `ggml_type` value back to a [`GgmlType`] variant.
    ///
    /// Returns `Err(v)` for any value this wrapper does not expose
    /// (including the gaps 4–5, 31–33, 36–38 and anything out of range);
    /// callers such as `QuantizeParams::new` use `.ok()` to turn that
    /// into `None` (e.g. for the `GGML_TYPE_COUNT` "unset" sentinel).
    fn try_from(v: llama_cpp_sys_4::ggml_type) -> Result<Self, Self::Error> {
        match v {
            0 => Ok(Self::F32),
            1 => Ok(Self::F16),
            2 => Ok(Self::Q4_0),
            3 => Ok(Self::Q4_1),
            6 => Ok(Self::Q5_0),
            7 => Ok(Self::Q5_1),
            8 => Ok(Self::Q8_0),
            9 => Ok(Self::Q8_1),
            10 => Ok(Self::Q2K),
            11 => Ok(Self::Q3K),
            12 => Ok(Self::Q4K),
            13 => Ok(Self::Q5K),
            14 => Ok(Self::Q6K),
            15 => Ok(Self::Q8K),
            16 => Ok(Self::IQ2XXS),
            17 => Ok(Self::IQ2XS),
            18 => Ok(Self::IQ3XXS),
            19 => Ok(Self::IQ1S),
            20 => Ok(Self::IQ4NL),
            21 => Ok(Self::IQ3S),
            22 => Ok(Self::IQ2S),
            23 => Ok(Self::IQ4XS),
            24 => Ok(Self::I8),
            25 => Ok(Self::I16),
            26 => Ok(Self::I32),
            27 => Ok(Self::I64),
            28 => Ok(Self::F64),
            29 => Ok(Self::IQ1M),
            30 => Ok(Self::BF16),
            34 => Ok(Self::TQ1_0),
            35 => Ok(Self::TQ2_0),
            39 => Ok(Self::MXFP4),
            // 40–42 shift meaning depending on the `q1` feature; see the enum docs.
            #[cfg(not(feature = "q1"))]
            40 => Ok(Self::NVFP4),
            #[cfg(feature = "q1")]
            40 => Ok(Self::Q1_0),
            #[cfg(feature = "q1")]
            41 => Ok(Self::Q1_0_G128),
            #[cfg(feature = "q1")]
            42 => Ok(Self::NVFP4),
            _ => Err(v),
        }
    }
}
429
430// ─────────────────────────────────────────────────────────────────────────────
431// ImatrixEntry / Imatrix
432// ─────────────────────────────────────────────────────────────────────────────
433
/// A single per-tensor importance matrix entry, as loaded from a `.imatrix` file.
///
/// Each entry contains activation statistics for one model tensor collected from
/// a calibration dataset. When supplied to [`QuantizeParams::with_imatrix`] these
/// statistics guide the quantizer to allocate more precision to weights that
/// matter most.
#[derive(Debug, Clone)]
pub struct ImatrixEntry {
    // NUL-terminated tensor name; its pointer is handed directly to the C API,
    // so the CString must stay alive as long as the raw params do.
    name: CString,
    // Importance scores; the Vec's pointer/length are likewise passed to C.
    data: Vec<f32>,
}
445
446impl ImatrixEntry {
447    /// Create a new entry from a tensor name and its importance scores.
448    ///
449    /// # Errors
450    ///
451    /// Returns [`NulError`] if `name` contains an interior null byte.
452    pub fn new(name: impl Into<Vec<u8>>, data: Vec<f32>) -> Result<Self, NulError> {
453        Ok(Self {
454            name: CString::new(name)?,
455            data,
456        })
457    }
458
459    /// Tensor name.
460    #[must_use]
461    pub fn name_str(&self) -> &str {
462        self.name.to_str().unwrap_or("")
463    }
464
465    /// Number of importance values.
466    #[must_use]
467    pub fn len(&self) -> usize {
468        self.data.len()
469    }
470
471    /// Returns `true` if the data slice is empty.
472    #[must_use]
473    pub fn is_empty(&self) -> bool {
474        self.data.is_empty()
475    }
476}
477
/// A collection of importance matrix entries (one per quantized tensor).
///
/// Build one by pushing [`ImatrixEntry`] values, then pass it to
/// [`QuantizeParams::with_imatrix`].
#[derive(Debug, Clone, Default)]
pub struct Imatrix {
    // Entries in insertion order; consumed by QuantizeParams::with_imatrix.
    entries: Vec<ImatrixEntry>,
}
486
487impl Imatrix {
488    /// Create an empty imatrix.
489    #[must_use]
490    pub fn new() -> Self {
491        Self::default()
492    }
493
494    /// Add an entry.
495    pub fn push(&mut self, entry: ImatrixEntry) {
496        self.entries.push(entry);
497    }
498
499    /// Number of entries.
500    #[must_use]
501    pub fn len(&self) -> usize {
502        self.entries.len()
503    }
504
505    /// Returns `true` if no entries have been added.
506    #[must_use]
507    pub fn is_empty(&self) -> bool {
508        self.entries.is_empty()
509    }
510}
511
512// ─────────────────────────────────────────────────────────────────────────────
513// TensorTypeOverride
514// ─────────────────────────────────────────────────────────────────────────────
515
/// Override the quantization type of every tensor whose name matches a glob `pattern`.
///
/// The pattern syntax is the same as used by the `--tensor-type` flag in
/// `llama-quantize`, e.g. `"attn.*"` or `"blk.0.*"`.
///
/// # Example
///
/// ```
/// use llama_cpp_4::quantize::{GgmlType, TensorTypeOverride};
///
/// // Keep the output projection in F16:
/// let ov = TensorTypeOverride::new("output", GgmlType::F16).unwrap();
/// ```
#[derive(Debug, Clone)]
pub struct TensorTypeOverride {
    // NUL-terminated glob pattern; its pointer is handed to the C API, so this
    // CString must outlive the raw params built from it.
    pattern: CString,
    // Storage type assigned to every matching tensor.
    ty: GgmlType,
}
534
535impl TensorTypeOverride {
536    /// Create a new override.
537    ///
538    /// # Errors
539    ///
540    /// Returns [`NulError`] if `pattern` contains an interior null byte.
541    pub fn new(pattern: impl Into<Vec<u8>>, ty: GgmlType) -> Result<Self, NulError> {
542        Ok(Self {
543            pattern: CString::new(pattern)?,
544            ty,
545        })
546    }
547
548    /// The glob pattern that selects tensors.
549    #[must_use]
550    pub fn pattern_str(&self) -> &str {
551        self.pattern.to_str().unwrap_or("")
552    }
553
554    /// The type to assign to matching tensors.
555    #[must_use]
556    pub fn ty(&self) -> GgmlType {
557        self.ty
558    }
559}
560
561// ─────────────────────────────────────────────────────────────────────────────
562// KvOverrideValue / KvOverride
563// ─────────────────────────────────────────────────────────────────────────────
564
/// A value in a GGUF key-value metadata override.
///
/// The variants mirror the tag/union layout of the C `llama_model_kv_override`
/// struct that these values are eventually copied into.
#[derive(Debug, Clone, PartialEq)]
pub enum KvOverrideValue {
    /// 64-bit integer
    Int(i64),
    /// 64-bit float
    Float(f64),
    /// Boolean
    Bool(bool),
    /// Fixed-length string (up to 127 bytes + NUL); stored in the same
    /// 128-byte `c_char` array layout as the C union's `val_str` field.
    Str([std::os::raw::c_char; 128]),
}
577
/// A single GGUF metadata key-value override.
///
/// These are written into the output file's metadata when quantizing.
#[derive(Debug, Clone)]
pub struct KvOverride {
    // NUL-terminated metadata key; copied (truncated to 128 bytes) into the
    // C struct's fixed-size key array when the raw params are built.
    key: CString,
    /// The value for this override.
    pub value: KvOverrideValue,
}
587
588impl KvOverride {
589    /// Create a new override.
590    ///
591    /// # Errors
592    ///
593    /// Returns [`NulError`] if `key` contains an interior null byte.
594    pub fn new(key: impl Into<Vec<u8>>, value: KvOverrideValue) -> Result<Self, NulError> {
595        Ok(Self {
596            key: CString::new(key)?,
597            value,
598        })
599    }
600}
601
602// ─────────────────────────────────────────────────────────────────────────────
603// QuantizeParams
604// ─────────────────────────────────────────────────────────────────────────────
605
/// Parameters for quantizing a model.
///
/// Create with [`QuantizeParams::new`] and chain `with_*` builder methods to
/// configure, then pass a reference to [`crate::model_quantize`].
///
/// # Example
///
/// ```no_run
/// use llama_cpp_4::quantize::{GgmlType, LlamaFtype, QuantizeParams, TensorTypeOverride};
///
/// let ov = TensorTypeOverride::new("output", GgmlType::F16).unwrap();
///
/// let params = QuantizeParams::new(LlamaFtype::MostlyQ4KM)
///     .with_nthread(8)
///     .with_allow_requantize(false)
///     .with_quantize_output_tensor(true)
///     .with_pure(false)
///     .with_tensor_type_override(ov);
///
/// llama_cpp_4::model_quantize("in.gguf", "out.gguf", &params).unwrap();
/// ```
#[derive(Debug, Clone)]
#[allow(clippy::struct_excessive_bools)]
pub struct QuantizeParams {
    /// Number of threads (0 = auto-detect).
    pub nthread: i32,
    /// Target quantization type.
    pub ftype: LlamaFtype,
    /// Force this storage type for the output/lm-head tensor (`None` = use ftype default).
    pub output_tensor_type: Option<GgmlType>,
    /// Force this storage type for the token-embedding tensor (`None` = use ftype default).
    pub token_embedding_type: Option<GgmlType>,
    /// Allow re-quantizing tensors that are already quantized.
    pub allow_requantize: bool,
    /// Quantize the output/lm-head weight tensor.
    pub quantize_output_tensor: bool,
    /// Copy all tensors without quantizing (ignores `ftype`).
    pub only_copy: bool,
    /// Quantize every tensor to the same type (no mixed k-quant strategy).
    pub pure: bool,
    /// Keep the same number of shards as the input (for split models).
    pub keep_split: bool,
    /// Estimate output size without writing anything to disk.
    pub dry_run: bool,

    // Private collections are kept owned here so that the CStrings/Vecs backing
    // the raw C pointers built in `to_raw` stay alive for the FFI call.
    // Imatrix entries; see `with_imatrix` / `with_imatrix_entry`.
    imatrix: Vec<ImatrixEntry>,
    // GGUF metadata overrides; see `with_kv_override`.
    kv_overrides: Vec<KvOverride>,
    // Per-tensor type overrides, applied in insertion order.
    tt_overrides: Vec<TensorTypeOverride>,
    // Layer indices to prune; see `with_pruned_layer(s)`.
    prune_layers: Vec<i32>,
}
656
657impl QuantizeParams {
658    /// Create a new params set targeting `ftype`.
659    ///
660    /// All other options are set to the same defaults as
661    /// `llama_model_quantize_default_params()`.
662    #[must_use]
663    pub fn new(ftype: LlamaFtype) -> Self {
664        // Read the C defaults so we match them exactly.
665        let d = unsafe { llama_cpp_sys_4::llama_model_quantize_default_params() };
666        Self {
667            nthread: d.nthread,
668            ftype,
669            output_tensor_type: GgmlType::try_from(d.output_tensor_type).ok(),
670            token_embedding_type: GgmlType::try_from(d.token_embedding_type).ok(),
671            allow_requantize: d.allow_requantize,
672            quantize_output_tensor: d.quantize_output_tensor,
673            only_copy: d.only_copy,
674            pure: d.pure_,
675            keep_split: d.keep_split,
676            dry_run: d.dry_run,
677            imatrix: Vec::new(),
678            kv_overrides: Vec::new(),
679            tt_overrides: Vec::new(),
680            prune_layers: Vec::new(),
681        }
682    }
683
684    /// Set the number of quantization threads (`0` = auto).
685    #[must_use]
686    pub fn with_nthread(mut self, n: i32) -> Self {
687        self.nthread = n;
688        self
689    }
690
691    /// Override the output-tensor storage type.
692    #[must_use]
693    pub fn with_output_tensor_type(mut self, ty: GgmlType) -> Self {
694        self.output_tensor_type = Some(ty);
695        self
696    }
697
698    /// Override the token-embedding storage type.
699    #[must_use]
700    pub fn with_token_embedding_type(mut self, ty: GgmlType) -> Self {
701        self.token_embedding_type = Some(ty);
702        self
703    }
704
705    /// Allow (or disallow) re-quantizing already-quantized tensors.
706    #[must_use]
707    pub fn with_allow_requantize(mut self, v: bool) -> Self {
708        self.allow_requantize = v;
709        self
710    }
711
712    /// Quantize the output/lm-head weight (`true` by default).
713    #[must_use]
714    pub fn with_quantize_output_tensor(mut self, v: bool) -> Self {
715        self.quantize_output_tensor = v;
716        self
717    }
718
719    /// When `true`, only copy tensors verbatim (no quantization at all).
720    #[must_use]
721    pub fn with_only_copy(mut self, v: bool) -> Self {
722        self.only_copy = v;
723        self
724    }
725
726    /// When `true`, quantize all tensors to the same type (no mixed k-quant strategy).
727    #[must_use]
728    pub fn with_pure(mut self, v: bool) -> Self {
729        self.pure = v;
730        self
731    }
732
733    /// Preserve the number of shards when quantizing a split model.
734    #[must_use]
735    pub fn with_keep_split(mut self, v: bool) -> Self {
736        self.keep_split = v;
737        self
738    }
739
740    /// Only estimate the output size; do not write anything to disk.
741    #[must_use]
742    pub fn with_dry_run(mut self, v: bool) -> Self {
743        self.dry_run = v;
744        self
745    }
746
747    /// Supply importance matrix data to improve quantization quality.
748    ///
749    /// The imatrix is generated by the `imatrix` tool (or the `imatrix` example
750    /// in this crate) and contains per-tensor activation statistics collected
751    /// from a calibration dataset.
752    #[must_use]
753    pub fn with_imatrix(mut self, imatrix: Imatrix) -> Self {
754        self.imatrix = imatrix.entries;
755        self
756    }
757
758    /// Append a single imatrix entry.
759    #[must_use]
760    pub fn with_imatrix_entry(mut self, entry: ImatrixEntry) -> Self {
761        self.imatrix.push(entry);
762        self
763    }
764
765    /// Add (or replace) a GGUF metadata key-value pair in the output file.
766    #[must_use]
767    pub fn with_kv_override(mut self, kv: KvOverride) -> Self {
768        self.kv_overrides.push(kv);
769        self
770    }
771
772    /// Override the quantization type for tensors whose name matches `pattern`.
773    ///
774    /// Can be called multiple times; overrides are applied in order.
775    #[must_use]
776    pub fn with_tensor_type_override(mut self, ov: TensorTypeOverride) -> Self {
777        self.tt_overrides.push(ov);
778        self
779    }
780
781    /// Mark a layer index for pruning (removal) from the output model.
782    #[must_use]
783    pub fn with_pruned_layer(mut self, layer: i32) -> Self {
784        self.prune_layers.push(layer);
785        self
786    }
787
788    /// Mark multiple layer indices for pruning.
789    #[must_use]
790    pub fn with_pruned_layers(mut self, layers: impl IntoIterator<Item = i32>) -> Self {
791        self.prune_layers.extend(layers);
792        self
793    }
794
795    /// Build the raw C struct, together with the temporary backing storage
796    /// that must outlive the struct.  Returns `(raw_params, _guards)`.
797    ///
798    /// This is `pub(crate)` so that `model_quantize` can call it safely while
799    /// holding all the guards alive.
800    pub(crate) fn to_raw(&self) -> RawQuantizeParamsGuard<'_> {
801        // ── imatrix ─────────────────────────────────────────────────────────
802        // Build a null-terminated array of llama_model_imatrix_data.
803        // The `name` and `data` pointers point directly into our owned Vecs.
804        let imatrix_c: Vec<llama_cpp_sys_4::llama_model_imatrix_data> = self
805            .imatrix
806            .iter()
807            .map(|e| llama_cpp_sys_4::llama_model_imatrix_data {
808                name: e.name.as_ptr(),
809                data: e.data.as_ptr(),
810                size: e.data.len(),
811            })
812            .chain(std::iter::once(llama_cpp_sys_4::llama_model_imatrix_data {
813                name: null(),
814                data: null(),
815                size: 0,
816            }))
817            .collect();
818
819        // ── kv_overrides ────────────────────────────────────────────────────
820        // null-terminated by a sentinel with key[0] == 0
821        let kv_c: Vec<llama_cpp_sys_4::llama_model_kv_override> = self
822            .kv_overrides
823            .iter()
824            .map(|kv| {
825                let mut raw = llama_cpp_sys_4::llama_model_kv_override {
826                    key: [0; 128],
827                    tag: 0,
828                    __bindgen_anon_1: llama_cpp_sys_4::llama_model_kv_override__bindgen_ty_1 {
829                        val_i64: 0,
830                    },
831                };
832                // Copy key bytes (up to 127 chars + NUL).
833                let bytes = kv.key.to_bytes_with_nul();
834                let copy_len = bytes.len().min(128);
835                for (dst, &src) in raw.key.iter_mut().zip(bytes[..copy_len].iter()) {
836                    *dst = src as std::os::raw::c_char;
837                }
838                match &kv.value {
839                    KvOverrideValue::Int(v) => {
840                        raw.tag = llama_cpp_sys_4::LLAMA_KV_OVERRIDE_TYPE_INT;
841                        raw.__bindgen_anon_1 =
842                            llama_cpp_sys_4::llama_model_kv_override__bindgen_ty_1 {
843                                val_i64: *v,
844                            };
845                    }
846                    KvOverrideValue::Float(v) => {
847                        raw.tag = llama_cpp_sys_4::LLAMA_KV_OVERRIDE_TYPE_FLOAT;
848                        raw.__bindgen_anon_1 =
849                            llama_cpp_sys_4::llama_model_kv_override__bindgen_ty_1 {
850                                val_f64: *v,
851                            };
852                    }
853                    KvOverrideValue::Bool(v) => {
854                        raw.tag = llama_cpp_sys_4::LLAMA_KV_OVERRIDE_TYPE_BOOL;
855                        raw.__bindgen_anon_1 =
856                            llama_cpp_sys_4::llama_model_kv_override__bindgen_ty_1 {
857                                val_bool: *v,
858                            };
859                    }
860                    KvOverrideValue::Str(s) => {
861                        raw.tag = llama_cpp_sys_4::LLAMA_KV_OVERRIDE_TYPE_STR;
862                        raw.__bindgen_anon_1 =
863                            llama_cpp_sys_4::llama_model_kv_override__bindgen_ty_1 {
864                                val_str: *s,
865                            };
866                    }
867                }
868                raw
869            })
870            .chain(std::iter::once(llama_cpp_sys_4::llama_model_kv_override {
871                key: [0; 128],
872                tag: 0,
873                __bindgen_anon_1: llama_cpp_sys_4::llama_model_kv_override__bindgen_ty_1 {
874                    val_i64: 0,
875                },
876            }))
877            .collect();
878
879        // ── tt_overrides ────────────────────────────────────────────────────
880        // null-terminated by { null, GGML_TYPE_COUNT }
881        let tt_c: Vec<llama_cpp_sys_4::llama_model_tensor_override> = self
882            .tt_overrides
883            .iter()
884            .map(|ov| llama_cpp_sys_4::llama_model_tensor_override {
885                pattern: ov.pattern.as_ptr(),
886                type_: ov.ty as llama_cpp_sys_4::ggml_type,
887            })
888            .chain(std::iter::once(
889                llama_cpp_sys_4::llama_model_tensor_override {
890                    pattern: null(),
891                    type_: llama_cpp_sys_4::GGML_TYPE_COUNT,
892                },
893            ))
894            .collect();
895
896        // ── prune_layers ─────────────────────────────────────────────────────
897        // -1-terminated
898        let mut prune_c = self.prune_layers.clone();
899        prune_c.push(-1);
900
901        // ── assemble ────────────────────────────────────────────────────────
902        let raw = llama_cpp_sys_4::llama_model_quantize_params {
903            nthread: self.nthread,
904            ftype: self.ftype as llama_cpp_sys_4::llama_ftype,
905            output_tensor_type: self
906                .output_tensor_type
907                .map(|t| t as llama_cpp_sys_4::ggml_type)
908                .unwrap_or(llama_cpp_sys_4::GGML_TYPE_COUNT),
909            token_embedding_type: self
910                .token_embedding_type
911                .map(|t| t as llama_cpp_sys_4::ggml_type)
912                .unwrap_or(llama_cpp_sys_4::GGML_TYPE_COUNT),
913            allow_requantize: self.allow_requantize,
914            quantize_output_tensor: self.quantize_output_tensor,
915            only_copy: self.only_copy,
916            pure_: self.pure,
917            keep_split: self.keep_split,
918            dry_run: self.dry_run,
919            imatrix: if self.imatrix.is_empty() {
920                null()
921            } else {
922                imatrix_c.as_ptr()
923            },
924            kv_overrides: if self.kv_overrides.is_empty() {
925                null()
926            } else {
927                kv_c.as_ptr()
928            },
929            tt_overrides: if self.tt_overrides.is_empty() {
930                null()
931            } else {
932                tt_c.as_ptr()
933            },
934            prune_layers: if self.prune_layers.is_empty() {
935                null()
936            } else {
937                prune_c.as_ptr()
938            },
939        };
940
941        RawQuantizeParamsGuard {
942            raw,
943            _imatrix_c: imatrix_c,
944            _kv_c: kv_c,
945            _tt_c: tt_c,
946            _prune_c: prune_c,
947            _marker: std::marker::PhantomData,
948        }
949    }
950}
951
/// Temporary storage that keeps the C pointers inside a raw
/// `llama_model_quantize_params` valid.  Dropped after the quantize call.
pub(crate) struct RawQuantizeParamsGuard<'a> {
    /// The assembled FFI struct; its pointer fields point into the owned
    /// vectors below (or are null when the corresponding input was empty).
    pub(crate) raw: llama_cpp_sys_4::llama_model_quantize_params,
    /// Backing storage for `raw.imatrix`.
    _imatrix_c: Vec<llama_cpp_sys_4::llama_model_imatrix_data>,
    /// Backing storage for `raw.kv_overrides`; terminated by a zeroed sentinel entry.
    _kv_c: Vec<llama_cpp_sys_4::llama_model_kv_override>,
    /// Backing storage for `raw.tt_overrides`; terminated by `{ null, GGML_TYPE_COUNT }`.
    _tt_c: Vec<llama_cpp_sys_4::llama_model_tensor_override>,
    /// Backing storage for `raw.prune_layers`; terminated by `-1`.
    _prune_c: Vec<i32>,
    // tie lifetime to the source QuantizeParams so the string/data
    // pointers inside imatrix_c and tt_c stay valid
    _marker: std::marker::PhantomData<&'a QuantizeParams>,
}
964
965// ─────────────────────────────────────────────────────────────────────────────
966// TurboQuant – attention rotation
967// ─────────────────────────────────────────────────────────────────────────────
968
/// Globally enable or disable the TurboQuant attention-rotation feature.
///
/// By default llama.cpp rotates Q/K/V tensors with a Hadamard transform
/// before they are written into the KV cache (llama.cpp PR #21038), which
/// markedly improves KV-cache quantization quality at essentially no cost.
///
/// Internally this sets or clears the `LLAMA_ATTN_ROT_DISABLE` environment
/// variable.  llama.cpp samples it exactly once, when a context (and its
/// KV cache) is first created, so invoke this **before** constructing any
/// [`LlamaContext`] in the current process.
///
/// # Thread safety
///
/// Mutating environment variables while other threads may be reading them is
/// undefined behaviour.  Call this before spawning any threads that use
/// llama contexts, or ensure no contexts are being created concurrently.
///
/// # Example
///
/// ```no_run
/// // Disable the rotation for benchmarking purposes:
/// llama_cpp_4::quantize::set_attn_rot_disabled(true);
///
/// // Re-enable (default behaviour):
/// llama_cpp_4::quantize::set_attn_rot_disabled(false);
/// ```
///
/// [`LlamaContext`]: crate::context::LlamaContext
pub fn set_attn_rot_disabled(disabled: bool) {
    // SAFETY: the caller guarantees no other thread is reading the
    // environment concurrently (see "Thread safety" above).  The
    // `unused_unsafe` allowance keeps this compiling on editions where
    // `set_var`/`remove_var` are still safe fns.
    #[allow(unused_unsafe)]
    unsafe {
        if disabled {
            std::env::set_var("LLAMA_ATTN_ROT_DISABLE", "1");
        } else {
            std::env::remove_var("LLAMA_ATTN_ROT_DISABLE");
        }
    }
}
1012
/// Reports whether TurboQuant attention rotation is currently disabled.
///
/// Reads the `LLAMA_ATTN_ROT_DISABLE` environment variable: any value that
/// parses as a non-zero integer counts as "disabled".  An unset variable,
/// non-UTF-8 value, or non-integer value yields `false`.
#[must_use]
pub fn attn_rot_disabled() -> bool {
    match std::env::var("LLAMA_ATTN_ROT_DISABLE") {
        Ok(raw) => matches!(raw.parse::<i32>(), Ok(n) if n != 0),
        Err(_) => false,
    }
}