// llama_cpp_4/quantize.rs
//! Quantization types and parameters for converting models to lower-bit precisions.
//!
//! # Quick start
//!
//! ```no_run
//! use llama_cpp_4::quantize::{LlamaFtype, QuantizeParams};
//!
//! let params = QuantizeParams::new(LlamaFtype::MostlyQ4KM)
//!     .with_nthread(8)
//!     .with_quantize_output_tensor(true);
//!
//! llama_cpp_4::model_quantize("model-f16.gguf", "model-q4km.gguf", &params).unwrap();
//! ```
//!
//! # TurboQuant – attention rotation (PR #21038)
//!
//! llama.cpp applies a Hadamard rotation to Q/K/V tensors before writing them into the KV cache.
//! This significantly improves KV-cache quantization quality at near-zero cost, and is enabled by
//! default for every model whose head dimension is a power of two.  You can opt out per-context
//! with [`LlamaContextParams::with_attn_rot_disabled`] or globally with `set_attn_rot_disabled`.
//!
//! [`LlamaContextParams::with_attn_rot_disabled`]: crate::context::params::LlamaContextParams::with_attn_rot_disabled
25use std::ffi::{CString, NulError};
26use std::ptr::null;
27
// ─────────────────────────────────────────────────────────────────────────────
// LlamaFtype
// ─────────────────────────────────────────────────────────────────────────────

/// The quantization type used for the bulk of a model file (maps to `llama_ftype`).
///
/// Pass one of these variants to [`QuantizeParams::new`] to choose the target precision.
///
/// The explicit discriminants must stay in sync with the C `llama_ftype` enum;
/// the `From` impl below relies on a plain integer cast.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[non_exhaustive]
#[allow(missing_docs)]
pub enum LlamaFtype {
    /// All tensors stored as full F32 (very large, for reference only)
    AllF32 = 0,
    /// F16 – 14 GB @ 7B, +0.0020 ppl vs Mistral-7B
    MostlyF16 = 1,
    /// Q4_0 – 4.34 GB @ 8B, +0.4685 ppl
    MostlyQ4_0 = 2,
    /// Q4_1 – 4.78 GB @ 8B, +0.4511 ppl
    MostlyQ4_1 = 3,
    /// Q8_0 – 7.96 GB @ 8B, +0.0026 ppl
    MostlyQ8_0 = 7,
    /// Q5_0 – 5.21 GB @ 8B, +0.1316 ppl
    MostlyQ5_0 = 8,
    /// Q5_1 – 5.65 GB @ 8B, +0.1062 ppl
    MostlyQ5_1 = 9,
    /// Q2_K – 2.96 GB @ 8B, +3.5199 ppl
    MostlyQ2K = 10,
    /// Q3_K small – 3.41 GB @ 8B, +1.6321 ppl
    MostlyQ3KS = 11,
    /// Q3_K medium – 3.74 GB @ 8B, +0.6569 ppl
    MostlyQ3KM = 12,
    /// Q3_K large – 4.03 GB @ 8B, +0.5562 ppl
    MostlyQ3KL = 13,
    /// Q4_K small – 4.37 GB @ 8B, +0.2689 ppl
    MostlyQ4KS = 14,
    /// Q4_K medium – 4.58 GB @ 8B, +0.1754 ppl  *(recommended default)*
    MostlyQ4KM = 15,
    /// Q5_K small – 5.21 GB @ 8B, +0.1049 ppl
    MostlyQ5KS = 16,
    /// Q5_K medium – 5.33 GB @ 8B, +0.0569 ppl
    MostlyQ5KM = 17,
    /// Q6_K – 6.14 GB @ 8B, +0.0217 ppl
    MostlyQ6K = 18,
    /// IQ2_XXS – 2.06 bpw
    MostlyIQ2XXS = 19,
    /// IQ2_XS – 2.31 bpw
    MostlyIQ2XS = 20,
    /// Q2_K small
    MostlyQ2KS = 21,
    /// IQ3_XS – 3.3 bpw
    MostlyIQ3XS = 22,
    /// IQ3_XXS – 3.06 bpw
    MostlyIQ3XXS = 23,
    /// IQ1_S – 1.56 bpw (extremely small, high loss)
    MostlyIQ1S = 24,
    /// IQ4_NL – 4.50 bpw non-linear
    MostlyIQ4NL = 25,
    /// IQ3_S – 3.44 bpw
    MostlyIQ3S = 26,
    /// IQ3_M – 3.66 bpw
    MostlyIQ3M = 27,
    /// IQ2_S – 2.5 bpw
    MostlyIQ2S = 28,
    /// IQ2_M – 2.7 bpw
    MostlyIQ2M = 29,
    /// IQ4_XS – 4.25 bpw non-linear
    MostlyIQ4XS = 30,
    /// IQ1_M – 1.75 bpw
    MostlyIQ1M = 31,
    /// BF16 – 14 GB @ 7B, −0.0050 ppl vs Mistral-7B
    MostlyBF16 = 32,
    /// TQ1_0 – 1.69 bpw ternary
    MostlyTQ1_0 = 36,
    /// TQ2_0 – 2.06 bpw ternary
    MostlyTQ2_0 = 37,
    /// MXFP4 (MoE layers)
    MostlyMXFP4Moe = 38,
    /// NVFP4
    MostlyNVFP4 = 39,
}

impl LlamaFtype {
    /// Short name suitable for filenames (e.g. `"Q4_K_M"`).
    #[must_use]
    pub fn name(self) -> &'static str {
        match self {
            Self::AllF32 => "F32",
            Self::MostlyF16 => "F16",
            Self::MostlyQ4_0 => "Q4_0",
            Self::MostlyQ4_1 => "Q4_1",
            Self::MostlyQ8_0 => "Q8_0",
            Self::MostlyQ5_0 => "Q5_0",
            Self::MostlyQ5_1 => "Q5_1",
            Self::MostlyQ2K => "Q2_K",
            Self::MostlyQ3KS => "Q3_K_S",
            Self::MostlyQ3KM => "Q3_K_M",
            Self::MostlyQ3KL => "Q3_K_L",
            Self::MostlyQ4KS => "Q4_K_S",
            Self::MostlyQ4KM => "Q4_K_M",
            Self::MostlyQ5KS => "Q5_K_S",
            Self::MostlyQ5KM => "Q5_K_M",
            Self::MostlyQ6K => "Q6_K",
            Self::MostlyIQ2XXS => "IQ2_XXS",
            Self::MostlyIQ2XS => "IQ2_XS",
            Self::MostlyQ2KS => "Q2_K_S",
            Self::MostlyIQ3XS => "IQ3_XS",
            Self::MostlyIQ3XXS => "IQ3_XXS",
            Self::MostlyIQ1S => "IQ1_S",
            Self::MostlyIQ4NL => "IQ4_NL",
            Self::MostlyIQ3S => "IQ3_S",
            Self::MostlyIQ3M => "IQ3_M",
            Self::MostlyIQ2S => "IQ2_S",
            Self::MostlyIQ2M => "IQ2_M",
            Self::MostlyIQ4XS => "IQ4_XS",
            Self::MostlyIQ1M => "IQ1_M",
            Self::MostlyBF16 => "BF16",
            Self::MostlyTQ1_0 => "TQ1_0",
            Self::MostlyTQ2_0 => "TQ2_0",
            Self::MostlyMXFP4Moe => "MXFP4_MOE",
            Self::MostlyNVFP4 => "NVFP4",
        }
    }

    /// Human-readable description with approximate size and PPL delta.
    #[must_use]
    pub fn description(self) -> &'static str {
        match self {
            Self::AllF32 => "26.00 GB @ 7B — full precision reference",
            Self::MostlyF16 => "14.00 GB @ 7B — +0.0020 ppl vs Mistral-7B",
            Self::MostlyBF16 => "14.00 GB @ 7B — -0.0050 ppl vs Mistral-7B",
            Self::MostlyQ8_0 => " 7.96 GB @ 8B — +0.0026 ppl",
            Self::MostlyQ6K => " 6.14 GB @ 8B — +0.0217 ppl",
            Self::MostlyQ5KM => " 5.33 GB @ 8B — +0.0569 ppl",
            Self::MostlyQ5KS => " 5.21 GB @ 8B — +0.1049 ppl",
            Self::MostlyQ5_1 => " 5.65 GB @ 8B — +0.1062 ppl",
            Self::MostlyQ5_0 => " 5.21 GB @ 8B — +0.1316 ppl",
            Self::MostlyQ4KM => " 4.58 GB @ 8B — +0.1754 ppl  [recommended]",
            Self::MostlyQ4KS => " 4.37 GB @ 8B — +0.2689 ppl",
            Self::MostlyQ4_1 => " 4.78 GB @ 8B — +0.4511 ppl",
            Self::MostlyQ4_0 => " 4.34 GB @ 8B — +0.4685 ppl",
            Self::MostlyQ3KL => " 4.03 GB @ 8B — +0.5562 ppl",
            Self::MostlyQ3KM => " 3.74 GB @ 8B — +0.6569 ppl",
            Self::MostlyQ3KS => " 3.41 GB @ 8B — +1.6321 ppl",
            Self::MostlyQ2KS => " 2.96 GB @ 8B — +3.1836 ppl",
            Self::MostlyQ2K => " 2.96 GB @ 8B — +3.5199 ppl",
            Self::MostlyIQ4XS => " 4.25 bpw non-linear",
            Self::MostlyIQ4NL => " 4.50 bpw non-linear",
            Self::MostlyIQ3S => " 3.44 bpw",
            Self::MostlyIQ3M => " 3.66 bpw",
            Self::MostlyIQ3XS => " 3.3 bpw",
            Self::MostlyIQ3XXS => " 3.06 bpw",
            Self::MostlyIQ2M => " 2.7 bpw",
            Self::MostlyIQ2S => " 2.5 bpw",
            Self::MostlyIQ2XS => " 2.31 bpw",
            Self::MostlyIQ2XXS => " 2.06 bpw",
            Self::MostlyIQ1M => " 1.75 bpw — extreme compression",
            Self::MostlyIQ1S => " 1.56 bpw — extreme compression",
            Self::MostlyTQ1_0 => " 1.69 bpw ternary",
            Self::MostlyTQ2_0 => " 2.06 bpw ternary",
            Self::MostlyMXFP4Moe => "MXFP4 MoE layers",
            Self::MostlyNVFP4 => "NVFP4",
        }
    }

    /// Look up a variant by its short name (case-insensitive).
    ///
    /// Implemented on top of [`Self::all`] and [`Self::name`], so the reverse
    /// mapping can never drift out of sync with the forward one.  All short
    /// names are pure ASCII, so an ASCII case-insensitive compare is
    /// equivalent to the old uppercase-then-match approach (and allocates
    /// nothing).
    ///
    /// ```
    /// use llama_cpp_4::quantize::LlamaFtype;
    /// assert_eq!(LlamaFtype::from_name("Q4_K_M"), Some(LlamaFtype::MostlyQ4KM));
    /// assert_eq!(LlamaFtype::from_name("q4_k_m"), Some(LlamaFtype::MostlyQ4KM));
    /// assert_eq!(LlamaFtype::from_name("bogus"), None);
    /// ```
    #[must_use]
    pub fn from_name(name: &str) -> Option<Self> {
        Self::all()
            .iter()
            .copied()
            .find(|t| t.name().eq_ignore_ascii_case(name))
    }

    /// All available types, ordered roughly from largest to smallest.
    #[must_use]
    pub fn all() -> &'static [Self] {
        &[
            Self::AllF32,
            Self::MostlyF16,
            Self::MostlyBF16,
            Self::MostlyQ8_0,
            Self::MostlyQ6K,
            Self::MostlyQ5KM,
            Self::MostlyQ5KS,
            Self::MostlyQ5_1,
            Self::MostlyQ5_0,
            Self::MostlyQ4KM,
            Self::MostlyQ4KS,
            Self::MostlyQ4_1,
            Self::MostlyQ4_0,
            Self::MostlyQ3KL,
            Self::MostlyQ3KM,
            Self::MostlyQ3KS,
            Self::MostlyQ2KS,
            Self::MostlyQ2K,
            Self::MostlyIQ4XS,
            Self::MostlyIQ4NL,
            Self::MostlyIQ3S,
            Self::MostlyIQ3M,
            Self::MostlyIQ3XS,
            Self::MostlyIQ3XXS,
            Self::MostlyIQ2M,
            Self::MostlyIQ2S,
            Self::MostlyIQ2XS,
            Self::MostlyIQ2XXS,
            Self::MostlyIQ1M,
            Self::MostlyIQ1S,
            Self::MostlyTQ1_0,
            Self::MostlyTQ2_0,
            Self::MostlyMXFP4Moe,
            Self::MostlyNVFP4,
        ]
    }
}
283
284impl From<LlamaFtype> for llama_cpp_sys_4::llama_ftype {
285    fn from(t: LlamaFtype) -> Self {
286        t as llama_cpp_sys_4::llama_ftype
287    }
288}
289
290impl std::fmt::Display for LlamaFtype {
291    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
292        write!(f, "{}", self.name())
293    }
294}
295
// ─────────────────────────────────────────────────────────────────────────────
// GgmlType
// ─────────────────────────────────────────────────────────────────────────────

/// GGML tensor storage type (maps to `ggml_type`).
///
/// Used to set [`QuantizeParams::output_tensor_type`] and
/// [`QuantizeParams::token_embedding_type`], and for per-tensor type overrides
/// in [`TensorTypeOverride`].
///
/// The explicit discriminants must stay in sync with the C `GGML_TYPE_*`
/// values: the `From` impl below is a plain integer cast, and `TryFrom`
/// matches on the raw numbers.  Gaps in the numbering correspond to C values
/// that are not exposed here.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[non_exhaustive]
#[allow(missing_docs)]
pub enum GgmlType {
    /// 32-bit IEEE float
    F32 = 0,
    /// 16-bit IEEE float
    F16 = 1,
    /// 4-bit block quantization, type 0
    Q4_0 = 2,
    /// 4-bit block quantization, type 1
    Q4_1 = 3,
    /// 5-bit block quantization, type 0
    Q5_0 = 6,
    /// 5-bit block quantization, type 1
    Q5_1 = 7,
    /// 8-bit block quantization, type 0
    Q8_0 = 8,
    /// 8-bit block quantization, type 1
    Q8_1 = 9,
    /// 2-bit k-quant
    Q2K = 10,
    /// 3-bit k-quant
    Q3K = 11,
    /// 4-bit k-quant
    Q4K = 12,
    /// 5-bit k-quant
    Q5K = 13,
    /// 6-bit k-quant
    Q6K = 14,
    /// 8-bit k-quant
    Q8K = 15,
    /// 2.06 bpw i-quant
    IQ2XXS = 16,
    /// 2.31 bpw i-quant
    IQ2XS = 17,
    /// 3.06 bpw i-quant
    IQ3XXS = 18,
    /// 1.56 bpw i-quant
    IQ1S = 19,
    /// 4.50 bpw non-linear i-quant
    IQ4NL = 20,
    /// 3.44 bpw i-quant
    IQ3S = 21,
    /// 2.5 bpw i-quant
    IQ2S = 22,
    /// 4.25 bpw non-linear i-quant
    IQ4XS = 23,
    /// 8-bit signed integer
    I8 = 24,
    /// 16-bit signed integer
    I16 = 25,
    /// 32-bit signed integer
    I32 = 26,
    /// 64-bit signed integer
    I64 = 27,
    /// 64-bit IEEE float
    F64 = 28,
    /// 1.75 bpw i-quant
    IQ1M = 29,
    /// bfloat16
    BF16 = 30,
    /// 1.69 bpw ternary
    TQ1_0 = 34,
    /// 2.06 bpw ternary
    TQ2_0 = 35,
    /// MXFP4 microscaling float
    MXFP4 = 39,
    /// NVFP4 microscaling float
    NVFP4 = 40,
}
343
344impl From<GgmlType> for llama_cpp_sys_4::ggml_type {
345    fn from(t: GgmlType) -> Self {
346        t as llama_cpp_sys_4::ggml_type
347    }
348}
349
350impl TryFrom<llama_cpp_sys_4::ggml_type> for GgmlType {
351    type Error = llama_cpp_sys_4::ggml_type;
352    fn try_from(v: llama_cpp_sys_4::ggml_type) -> Result<Self, Self::Error> {
353        match v {
354            0 => Ok(Self::F32),
355            1 => Ok(Self::F16),
356            2 => Ok(Self::Q4_0),
357            3 => Ok(Self::Q4_1),
358            6 => Ok(Self::Q5_0),
359            7 => Ok(Self::Q5_1),
360            8 => Ok(Self::Q8_0),
361            9 => Ok(Self::Q8_1),
362            10 => Ok(Self::Q2K),
363            11 => Ok(Self::Q3K),
364            12 => Ok(Self::Q4K),
365            13 => Ok(Self::Q5K),
366            14 => Ok(Self::Q6K),
367            15 => Ok(Self::Q8K),
368            16 => Ok(Self::IQ2XXS),
369            17 => Ok(Self::IQ2XS),
370            18 => Ok(Self::IQ3XXS),
371            19 => Ok(Self::IQ1S),
372            20 => Ok(Self::IQ4NL),
373            21 => Ok(Self::IQ3S),
374            22 => Ok(Self::IQ2S),
375            23 => Ok(Self::IQ4XS),
376            24 => Ok(Self::I8),
377            25 => Ok(Self::I16),
378            26 => Ok(Self::I32),
379            27 => Ok(Self::I64),
380            28 => Ok(Self::F64),
381            29 => Ok(Self::IQ1M),
382            30 => Ok(Self::BF16),
383            34 => Ok(Self::TQ1_0),
384            35 => Ok(Self::TQ2_0),
385            39 => Ok(Self::MXFP4),
386            40 => Ok(Self::NVFP4),
387            _ => Err(v),
388        }
389    }
390}
391
// ─────────────────────────────────────────────────────────────────────────────
// ImatrixEntry / Imatrix
// ─────────────────────────────────────────────────────────────────────────────

/// A single per-tensor importance matrix entry, as loaded from a `.imatrix` file.
///
/// Each entry contains activation statistics for one model tensor collected from
/// a calibration dataset. When supplied to [`QuantizeParams::with_imatrix`] these
/// statistics guide the quantizer to allocate more precision to weights that
/// matter most.
#[derive(Debug, Clone)]
pub struct ImatrixEntry {
    name: CString,
    data: Vec<f32>,
}

impl ImatrixEntry {
    /// Create a new entry from a tensor name and its importance scores.
    ///
    /// # Errors
    ///
    /// Returns [`NulError`] if `name` contains an interior null byte.
    pub fn new(name: impl Into<Vec<u8>>, data: Vec<f32>) -> Result<Self, NulError> {
        let name = CString::new(name)?;
        Ok(Self { name, data })
    }

    /// Tensor name.
    #[must_use]
    pub fn name_str(&self) -> &str {
        // Non-UTF-8 names degrade to the empty string rather than panicking.
        self.name.to_str().unwrap_or_default()
    }

    /// Number of importance values.
    #[must_use]
    pub fn len(&self) -> usize {
        self.data.len()
    }

    /// Returns `true` if the data slice is empty.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.data.is_empty()
    }
}

/// A collection of importance matrix entries (one per quantized tensor).
///
/// Build one by pushing [`ImatrixEntry`] values, then pass it to
/// [`QuantizeParams::with_imatrix`].
#[derive(Debug, Clone, Default)]
pub struct Imatrix {
    entries: Vec<ImatrixEntry>,
}

impl Imatrix {
    /// Create an empty imatrix.
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Add an entry.
    pub fn push(&mut self, entry: ImatrixEntry) {
        self.entries.push(entry);
    }

    /// Number of entries.
    #[must_use]
    pub fn len(&self) -> usize {
        self.entries.len()
    }

    /// Returns `true` if no entries have been added.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.entries.is_empty()
    }
}
473
474// ─────────────────────────────────────────────────────────────────────────────
475// TensorTypeOverride
476// ─────────────────────────────────────────────────────────────────────────────
477
478/// Override the quantization type of every tensor whose name matches a glob `pattern`.
479///
480/// The pattern syntax is the same as used by the `--tensor-type` flag in
481/// `llama-quantize`, e.g. `"attn.*"` or `"blk.0.*"`.
482///
483/// # Example
484///
485/// ```
486/// use llama_cpp_4::quantize::{GgmlType, TensorTypeOverride};
487///
488/// // Keep the output projection in F16:
489/// let ov = TensorTypeOverride::new("output", GgmlType::F16).unwrap();
490/// ```
491#[derive(Debug, Clone)]
492pub struct TensorTypeOverride {
493    pattern: CString,
494    ty: GgmlType,
495}
496
497impl TensorTypeOverride {
498    /// Create a new override.
499    ///
500    /// # Errors
501    ///
502    /// Returns [`NulError`] if `pattern` contains an interior null byte.
503    pub fn new(pattern: impl Into<Vec<u8>>, ty: GgmlType) -> Result<Self, NulError> {
504        Ok(Self {
505            pattern: CString::new(pattern)?,
506            ty,
507        })
508    }
509
510    /// The glob pattern that selects tensors.
511    #[must_use]
512    pub fn pattern_str(&self) -> &str {
513        self.pattern.to_str().unwrap_or("")
514    }
515
516    /// The type to assign to matching tensors.
517    #[must_use]
518    pub fn ty(&self) -> GgmlType {
519        self.ty
520    }
521}
522
// ─────────────────────────────────────────────────────────────────────────────
// KvOverrideValue / KvOverride
// ─────────────────────────────────────────────────────────────────────────────

/// A value in a GGUF key-value metadata override.
#[derive(Debug, Clone, PartialEq)]
pub enum KvOverrideValue {
    /// 64-bit integer
    Int(i64),
    /// 64-bit float
    Float(f64),
    /// Boolean
    Bool(bool),
    /// Fixed-length string (up to 127 bytes + NUL)
    Str([std::os::raw::c_char; 128]),
}

/// A single GGUF metadata key-value override.
///
/// These are written into the output file's metadata when quantizing.
#[derive(Debug, Clone)]
pub struct KvOverride {
    key: CString,
    /// The value for this override.
    pub value: KvOverrideValue,
}

impl KvOverride {
    /// Create a new override.
    ///
    /// # Errors
    ///
    /// Returns [`NulError`] if `key` contains an interior null byte.
    pub fn new(key: impl Into<Vec<u8>>, value: KvOverrideValue) -> Result<Self, NulError> {
        let key = CString::new(key)?;
        Ok(Self { key, value })
    }
}
563
// ─────────────────────────────────────────────────────────────────────────────
// QuantizeParams
// ─────────────────────────────────────────────────────────────────────────────

/// Parameters for quantizing a model.
///
/// Create with [`QuantizeParams::new`] and chain `with_*` builder methods to
/// configure, then pass a reference to [`crate::model_quantize`].
///
/// # Example
///
/// ```no_run
/// use llama_cpp_4::quantize::{GgmlType, LlamaFtype, QuantizeParams, TensorTypeOverride};
///
/// let ov = TensorTypeOverride::new("output", GgmlType::F16).unwrap();
///
/// let params = QuantizeParams::new(LlamaFtype::MostlyQ4KM)
///     .with_nthread(8)
///     .with_allow_requantize(false)
///     .with_quantize_output_tensor(true)
///     .with_pure(false)
///     .with_tensor_type_override(ov);
///
/// llama_cpp_4::model_quantize("in.gguf", "out.gguf", &params).unwrap();
/// ```
#[derive(Debug, Clone)]
#[allow(clippy::struct_excessive_bools)]
pub struct QuantizeParams {
    /// Number of threads (0 = auto-detect).
    pub nthread: i32,
    /// Target quantization type.
    pub ftype: LlamaFtype,
    /// Force this storage type for the output/lm-head tensor (`None` = use ftype default).
    pub output_tensor_type: Option<GgmlType>,
    /// Force this storage type for the token-embedding tensor (`None` = use ftype default).
    pub token_embedding_type: Option<GgmlType>,
    /// Allow re-quantizing tensors that are already quantized.
    pub allow_requantize: bool,
    /// Quantize the output/lm-head weight tensor.
    pub quantize_output_tensor: bool,
    /// Copy all tensors without quantizing (ignores `ftype`).
    pub only_copy: bool,
    /// Quantize every tensor to the same type (no mixed k-quant strategy).
    pub pure: bool,
    /// Keep the same number of shards as the input (for split models).
    pub keep_split: bool,
    /// Estimate output size without writing anything to disk.
    pub dry_run: bool,

    // Importance-matrix entries; exposed to C as a null-terminated array by `to_raw`.
    imatrix: Vec<ImatrixEntry>,
    // GGUF metadata overrides; exposed to C as a sentinel-terminated array by `to_raw`.
    kv_overrides: Vec<KvOverride>,
    // Per-pattern tensor type overrides; exposed to C as a sentinel-terminated array by `to_raw`.
    tt_overrides: Vec<TensorTypeOverride>,
    // Layer indices to prune; exposed to C as a -1-terminated array by `to_raw`.
    prune_layers: Vec<i32>,
}
618
619impl QuantizeParams {
    /// Create a new params set targeting `ftype`.
    ///
    /// All other options are set to the same defaults as
    /// `llama_model_quantize_default_params()`.
    #[must_use]
    pub fn new(ftype: LlamaFtype) -> Self {
        // Read the C defaults so we match them exactly.
        // SAFETY: zero-argument FFI call that returns a plain struct by value;
        // no pointers are passed in or retained.
        let d = unsafe { llama_cpp_sys_4::llama_model_quantize_default_params() };
        Self {
            nthread: d.nthread,
            ftype,
            // Raw values with no GgmlType variant (e.g. the "unset" sentinel)
            // fail the TryFrom conversion and map to None here.
            output_tensor_type: GgmlType::try_from(d.output_tensor_type).ok(),
            token_embedding_type: GgmlType::try_from(d.token_embedding_type).ok(),
            allow_requantize: d.allow_requantize,
            quantize_output_tensor: d.quantize_output_tensor,
            only_copy: d.only_copy,
            // The C field is named `pure`; the binding exposes it as `pure_`.
            pure: d.pure_,
            keep_split: d.keep_split,
            dry_run: d.dry_run,
            imatrix: Vec::new(),
            kv_overrides: Vec::new(),
            tt_overrides: Vec::new(),
            prune_layers: Vec::new(),
        }
    }
645
646    /// Set the number of quantization threads (`0` = auto).
647    #[must_use]
648    pub fn with_nthread(mut self, n: i32) -> Self {
649        self.nthread = n;
650        self
651    }
652
653    /// Override the output-tensor storage type.
654    #[must_use]
655    pub fn with_output_tensor_type(mut self, ty: GgmlType) -> Self {
656        self.output_tensor_type = Some(ty);
657        self
658    }
659
660    /// Override the token-embedding storage type.
661    #[must_use]
662    pub fn with_token_embedding_type(mut self, ty: GgmlType) -> Self {
663        self.token_embedding_type = Some(ty);
664        self
665    }
666
667    /// Allow (or disallow) re-quantizing already-quantized tensors.
668    #[must_use]
669    pub fn with_allow_requantize(mut self, v: bool) -> Self {
670        self.allow_requantize = v;
671        self
672    }
673
674    /// Quantize the output/lm-head weight (`true` by default).
675    #[must_use]
676    pub fn with_quantize_output_tensor(mut self, v: bool) -> Self {
677        self.quantize_output_tensor = v;
678        self
679    }
680
681    /// When `true`, only copy tensors verbatim (no quantization at all).
682    #[must_use]
683    pub fn with_only_copy(mut self, v: bool) -> Self {
684        self.only_copy = v;
685        self
686    }
687
688    /// When `true`, quantize all tensors to the same type (no mixed k-quant strategy).
689    #[must_use]
690    pub fn with_pure(mut self, v: bool) -> Self {
691        self.pure = v;
692        self
693    }
694
695    /// Preserve the number of shards when quantizing a split model.
696    #[must_use]
697    pub fn with_keep_split(mut self, v: bool) -> Self {
698        self.keep_split = v;
699        self
700    }
701
702    /// Only estimate the output size; do not write anything to disk.
703    #[must_use]
704    pub fn with_dry_run(mut self, v: bool) -> Self {
705        self.dry_run = v;
706        self
707    }
708
709    /// Supply importance matrix data to improve quantization quality.
710    ///
711    /// The imatrix is generated by the `imatrix` tool (or the `imatrix` example
712    /// in this crate) and contains per-tensor activation statistics collected
713    /// from a calibration dataset.
714    #[must_use]
715    pub fn with_imatrix(mut self, imatrix: Imatrix) -> Self {
716        self.imatrix = imatrix.entries;
717        self
718    }
719
720    /// Append a single imatrix entry.
721    #[must_use]
722    pub fn with_imatrix_entry(mut self, entry: ImatrixEntry) -> Self {
723        self.imatrix.push(entry);
724        self
725    }
726
727    /// Add (or replace) a GGUF metadata key-value pair in the output file.
728    #[must_use]
729    pub fn with_kv_override(mut self, kv: KvOverride) -> Self {
730        self.kv_overrides.push(kv);
731        self
732    }
733
734    /// Override the quantization type for tensors whose name matches `pattern`.
735    ///
736    /// Can be called multiple times; overrides are applied in order.
737    #[must_use]
738    pub fn with_tensor_type_override(mut self, ov: TensorTypeOverride) -> Self {
739        self.tt_overrides.push(ov);
740        self
741    }
742
743    /// Mark a layer index for pruning (removal) from the output model.
744    #[must_use]
745    pub fn with_pruned_layer(mut self, layer: i32) -> Self {
746        self.prune_layers.push(layer);
747        self
748    }
749
750    /// Mark multiple layer indices for pruning.
751    #[must_use]
752    pub fn with_pruned_layers(mut self, layers: impl IntoIterator<Item = i32>) -> Self {
753        self.prune_layers.extend(layers);
754        self
755    }
756
757    /// Build the raw C struct, together with the temporary backing storage
758    /// that must outlive the struct.  Returns `(raw_params, _guards)`.
759    ///
760    /// This is `pub(crate)` so that `model_quantize` can call it safely while
761    /// holding all the guards alive.
762    pub(crate) fn to_raw(&self) -> RawQuantizeParamsGuard<'_> {
763        // ── imatrix ─────────────────────────────────────────────────────────
764        // Build a null-terminated array of llama_model_imatrix_data.
765        // The `name` and `data` pointers point directly into our owned Vecs.
766        let imatrix_c: Vec<llama_cpp_sys_4::llama_model_imatrix_data> = self
767            .imatrix
768            .iter()
769            .map(|e| llama_cpp_sys_4::llama_model_imatrix_data {
770                name: e.name.as_ptr(),
771                data: e.data.as_ptr(),
772                size: e.data.len(),
773            })
774            .chain(std::iter::once(llama_cpp_sys_4::llama_model_imatrix_data {
775                name: null(),
776                data: null(),
777                size: 0,
778            }))
779            .collect();
780
781        // ── kv_overrides ────────────────────────────────────────────────────
782        // null-terminated by a sentinel with key[0] == 0
783        let kv_c: Vec<llama_cpp_sys_4::llama_model_kv_override> = self
784            .kv_overrides
785            .iter()
786            .map(|kv| {
787                let mut raw = llama_cpp_sys_4::llama_model_kv_override {
788                    key: [0; 128],
789                    tag: 0,
790                    __bindgen_anon_1: llama_cpp_sys_4::llama_model_kv_override__bindgen_ty_1 {
791                        val_i64: 0,
792                    },
793                };
794                // Copy key bytes (up to 127 chars + NUL).
795                let bytes = kv.key.to_bytes_with_nul();
796                let copy_len = bytes.len().min(128);
797                for (dst, &src) in raw.key.iter_mut().zip(bytes[..copy_len].iter()) {
798                    *dst = src as std::os::raw::c_char;
799                }
800                match &kv.value {
801                    KvOverrideValue::Int(v) => {
802                        raw.tag = llama_cpp_sys_4::LLAMA_KV_OVERRIDE_TYPE_INT;
803                        raw.__bindgen_anon_1 =
804                            llama_cpp_sys_4::llama_model_kv_override__bindgen_ty_1 {
805                                val_i64: *v,
806                            };
807                    }
808                    KvOverrideValue::Float(v) => {
809                        raw.tag = llama_cpp_sys_4::LLAMA_KV_OVERRIDE_TYPE_FLOAT;
810                        raw.__bindgen_anon_1 =
811                            llama_cpp_sys_4::llama_model_kv_override__bindgen_ty_1 {
812                                val_f64: *v,
813                            };
814                    }
815                    KvOverrideValue::Bool(v) => {
816                        raw.tag = llama_cpp_sys_4::LLAMA_KV_OVERRIDE_TYPE_BOOL;
817                        raw.__bindgen_anon_1 =
818                            llama_cpp_sys_4::llama_model_kv_override__bindgen_ty_1 {
819                                val_bool: *v,
820                            };
821                    }
822                    KvOverrideValue::Str(s) => {
823                        raw.tag = llama_cpp_sys_4::LLAMA_KV_OVERRIDE_TYPE_STR;
824                        raw.__bindgen_anon_1 =
825                            llama_cpp_sys_4::llama_model_kv_override__bindgen_ty_1 {
826                                val_str: *s,
827                            };
828                    }
829                }
830                raw
831            })
832            .chain(std::iter::once(llama_cpp_sys_4::llama_model_kv_override {
833                key: [0; 128],
834                tag: 0,
835                __bindgen_anon_1: llama_cpp_sys_4::llama_model_kv_override__bindgen_ty_1 {
836                    val_i64: 0,
837                },
838            }))
839            .collect();
840
841        // ── tt_overrides ────────────────────────────────────────────────────
842        // null-terminated by { null, GGML_TYPE_COUNT }
843        let tt_c: Vec<llama_cpp_sys_4::llama_model_tensor_override> = self
844            .tt_overrides
845            .iter()
846            .map(|ov| llama_cpp_sys_4::llama_model_tensor_override {
847                pattern: ov.pattern.as_ptr(),
848                type_: ov.ty as llama_cpp_sys_4::ggml_type,
849            })
850            .chain(std::iter::once(
851                llama_cpp_sys_4::llama_model_tensor_override {
852                    pattern: null(),
853                    type_: llama_cpp_sys_4::GGML_TYPE_COUNT,
854                },
855            ))
856            .collect();
857
858        // ── prune_layers ─────────────────────────────────────────────────────
859        // -1-terminated
860        let mut prune_c = self.prune_layers.clone();
861        prune_c.push(-1);
862
863        // ── assemble ────────────────────────────────────────────────────────
864        let raw = llama_cpp_sys_4::llama_model_quantize_params {
865            nthread: self.nthread,
866            ftype: self.ftype as llama_cpp_sys_4::llama_ftype,
867            output_tensor_type: self
868                .output_tensor_type
869                .map(|t| t as llama_cpp_sys_4::ggml_type)
870                .unwrap_or(llama_cpp_sys_4::GGML_TYPE_COUNT),
871            token_embedding_type: self
872                .token_embedding_type
873                .map(|t| t as llama_cpp_sys_4::ggml_type)
874                .unwrap_or(llama_cpp_sys_4::GGML_TYPE_COUNT),
875            allow_requantize: self.allow_requantize,
876            quantize_output_tensor: self.quantize_output_tensor,
877            only_copy: self.only_copy,
878            pure_: self.pure,
879            keep_split: self.keep_split,
880            dry_run: self.dry_run,
881            imatrix: if self.imatrix.is_empty() {
882                null()
883            } else {
884                imatrix_c.as_ptr()
885            },
886            kv_overrides: if self.kv_overrides.is_empty() {
887                null()
888            } else {
889                kv_c.as_ptr()
890            },
891            tt_overrides: if self.tt_overrides.is_empty() {
892                null()
893            } else {
894                tt_c.as_ptr()
895            },
896            prune_layers: if self.prune_layers.is_empty() {
897                null()
898            } else {
899                prune_c.as_ptr()
900            },
901        };
902
903        RawQuantizeParamsGuard {
904            raw,
905            _imatrix_c: imatrix_c,
906            _kv_c: kv_c,
907            _tt_c: tt_c,
908            _prune_c: prune_c,
909            _marker: std::marker::PhantomData,
910        }
911    }
912}
913
/// Temporary storage that keeps the C pointers inside a raw
/// `llama_model_quantize_params` valid.  Dropped after the quantize call.
///
/// The `raw` struct contains raw pointers (`imatrix`, `kv_overrides`,
/// `tt_overrides`, `prune_layers`) that point into the heap buffers of the
/// vectors held below, so `raw` must not outlive this guard.  Moving the guard
/// itself is fine: a `Vec`'s heap allocation does not move with it.
pub(crate) struct RawQuantizeParamsGuard<'a> {
    /// The FFI parameter struct to pass to `llama_model_quantize`.
    pub(crate) raw: llama_cpp_sys_4::llama_model_quantize_params,
    // Backing storage for `raw.imatrix` (pointer is null when the source list was empty).
    _imatrix_c: Vec<llama_cpp_sys_4::llama_model_imatrix_data>,
    // Backing storage for `raw.kv_overrides`; terminated by a zeroed sentinel entry.
    _kv_c: Vec<llama_cpp_sys_4::llama_model_kv_override>,
    // Backing storage for `raw.tt_overrides`; terminated by { null, GGML_TYPE_COUNT }.
    _tt_c: Vec<llama_cpp_sys_4::llama_model_tensor_override>,
    // Backing storage for `raw.prune_layers`; terminated by -1.
    _prune_c: Vec<i32>,
    // tie lifetime to the source QuantizeParams so the string/data
    // pointers inside imatrix_c and tt_c stay valid
    _marker: std::marker::PhantomData<&'a QuantizeParams>,
}
926
927// ─────────────────────────────────────────────────────────────────────────────
928// TurboQuant – attention rotation
929// ─────────────────────────────────────────────────────────────────────────────
930
/// Enable or disable the TurboQuant attention rotation for the whole process.
///
/// llama.cpp (PR #21038) applies a Hadamard rotation to Q/K/V tensors before
/// they are written into the KV cache, which significantly improves KV-cache
/// quantization quality at near-zero cost.  The rotation is on by default;
/// this function toggles it by setting (when `disabled` is `true`) or removing
/// (when `disabled` is `false`) the `LLAMA_ATTN_ROT_DISABLE` environment
/// variable.
///
/// llama.cpp reads the variable once, when a context (and its KV cache) is
/// first created, so call this **before** constructing any [`LlamaContext`]
/// in the current process.
///
/// # Thread safety
///
/// Mutating environment variables while other threads may be reading them is
/// undefined behaviour.  Only call this while no other thread can be touching
/// the environment — ideally before spawning any threads that use llama
/// contexts.
///
/// # Example
///
/// ```no_run
/// // Turn the rotation off, e.g. for an A/B benchmark:
/// llama_cpp_4::quantize::set_attn_rot_disabled(true);
///
/// // Restore the default behaviour:
/// llama_cpp_4::quantize::set_attn_rot_disabled(false);
/// ```
///
/// [`LlamaContext`]: crate::context::LlamaContext
pub fn set_attn_rot_disabled(disabled: bool) {
    const VAR: &str = "LLAMA_ATTN_ROT_DISABLE";
    // SAFETY: the caller guarantees no concurrent readers or writers of the
    // process environment (see "Thread safety" above).  The `unused_unsafe`
    // allow keeps this compiling on editions where `set_var`/`remove_var`
    // are still safe functions.
    #[allow(unused_unsafe)]
    unsafe {
        match disabled {
            true => std::env::set_var(VAR, "1"),
            false => std::env::remove_var(VAR),
        }
    }
}
974
/// Returns `true` if TurboQuant attention rotation is currently disabled.
///
/// Inspects the `LLAMA_ATTN_ROT_DISABLE` environment variable: the rotation
/// counts as disabled only when the variable is set to a value that parses as
/// a non-zero integer (e.g. `"1"`).  An unset variable, `"0"`, or a value that
/// is not an integer (e.g. `"true"`) all report the rotation as enabled.
#[must_use]
pub fn attn_rot_disabled() -> bool {
    std::env::var("LLAMA_ATTN_ROT_DISABLE")
        .ok()
        .and_then(|v| v.parse::<i32>().ok())
        // Clippy `unnecessary_map_or`: `is_some_and` states the intent
        // directly instead of `map_or(false, ..)`.
        .is_some_and(|v| v != 0)
}