1use std::ffi::{CString, NulError};
26use std::ptr::null;
27
/// Model-level quantization target ("file type") passed to llama.cpp.
///
/// Every discriminant is the raw integer that is cast to `llama_ftype` when
/// the value crosses the FFI boundary, so the numbers must not be changed.
/// The values mirror the `LLAMA_FTYPE_*` constants of `llama.h`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[non_exhaustive]
#[allow(missing_docs)]
pub enum LlamaFtype {
    /// `LLAMA_FTYPE_ALL_F32`
    AllF32 = 0,
    /// `LLAMA_FTYPE_MOSTLY_F16`
    MostlyF16 = 1,
    /// `LLAMA_FTYPE_MOSTLY_Q4_0`
    MostlyQ4_0 = 2,
    /// `LLAMA_FTYPE_MOSTLY_Q4_1`
    MostlyQ4_1 = 3,
    /// `LLAMA_FTYPE_MOSTLY_Q8_0`
    MostlyQ8_0 = 7,
    /// `LLAMA_FTYPE_MOSTLY_Q5_0`
    MostlyQ5_0 = 8,
    /// `LLAMA_FTYPE_MOSTLY_Q5_1`
    MostlyQ5_1 = 9,
    /// `LLAMA_FTYPE_MOSTLY_Q2_K`
    MostlyQ2K = 10,
    /// `LLAMA_FTYPE_MOSTLY_Q3_K_S`
    MostlyQ3KS = 11,
    /// `LLAMA_FTYPE_MOSTLY_Q3_K_M`
    MostlyQ3KM = 12,
    /// `LLAMA_FTYPE_MOSTLY_Q3_K_L`
    MostlyQ3KL = 13,
    /// `LLAMA_FTYPE_MOSTLY_Q4_K_S`
    MostlyQ4KS = 14,
    /// `LLAMA_FTYPE_MOSTLY_Q4_K_M`
    MostlyQ4KM = 15,
    /// `LLAMA_FTYPE_MOSTLY_Q5_K_S`
    MostlyQ5KS = 16,
    /// `LLAMA_FTYPE_MOSTLY_Q5_K_M`
    MostlyQ5KM = 17,
    /// `LLAMA_FTYPE_MOSTLY_Q6_K`
    MostlyQ6K = 18,
    /// `LLAMA_FTYPE_MOSTLY_IQ2_XXS`
    MostlyIQ2XXS = 19,
    /// `LLAMA_FTYPE_MOSTLY_IQ2_XS`
    MostlyIQ2XS = 20,
    /// `LLAMA_FTYPE_MOSTLY_Q2_K_S`
    MostlyQ2KS = 21,
    /// `LLAMA_FTYPE_MOSTLY_IQ3_XS`
    MostlyIQ3XS = 22,
    /// `LLAMA_FTYPE_MOSTLY_IQ3_XXS`
    MostlyIQ3XXS = 23,
    /// `LLAMA_FTYPE_MOSTLY_IQ1_S`
    MostlyIQ1S = 24,
    /// `LLAMA_FTYPE_MOSTLY_IQ4_NL`
    MostlyIQ4NL = 25,
    /// `LLAMA_FTYPE_MOSTLY_IQ3_S`
    MostlyIQ3S = 26,
    /// `LLAMA_FTYPE_MOSTLY_IQ3_M`
    MostlyIQ3M = 27,
    /// `LLAMA_FTYPE_MOSTLY_IQ2_S`
    MostlyIQ2S = 28,
    /// `LLAMA_FTYPE_MOSTLY_IQ2_M`
    MostlyIQ2M = 29,
    /// `LLAMA_FTYPE_MOSTLY_IQ4_XS`
    MostlyIQ4XS = 30,
    /// `LLAMA_FTYPE_MOSTLY_IQ1_M`
    MostlyIQ1M = 31,
    /// `LLAMA_FTYPE_MOSTLY_BF16`
    MostlyBF16 = 32,
    /// `LLAMA_FTYPE_MOSTLY_TQ1_0`
    MostlyTQ1_0 = 36,
    /// `LLAMA_FTYPE_MOSTLY_TQ2_0`
    MostlyTQ2_0 = 37,
    /// `LLAMA_FTYPE_MOSTLY_MXFP4_MOE`
    MostlyMXFP4Moe = 38,
    /// NVFP4 file type (id 39).
    MostlyNVFP4 = 39,
    /// Binary `Q1_0` file type; only available with the `q1` feature.
    #[cfg(feature = "q1")]
    MostlyQ1_0 = 40,
    /// Binary `Q1_0_g128` file type; only available with the `q1` feature.
    #[cfg(feature = "q1")]
    MostlyQ1_0_G128 = 41,
}

impl LlamaFtype {
    /// Returns the short canonical name of this file type (e.g. `"Q4_K_M"`).
    ///
    /// This is the spelling accepted by [`LlamaFtype::from_name`] and printed
    /// by the `Display` implementation.
    #[must_use]
    pub fn name(self) -> &'static str {
        match self {
            Self::AllF32 => "F32",
            Self::MostlyF16 => "F16",
            Self::MostlyQ4_0 => "Q4_0",
            Self::MostlyQ4_1 => "Q4_1",
            Self::MostlyQ8_0 => "Q8_0",
            Self::MostlyQ5_0 => "Q5_0",
            Self::MostlyQ5_1 => "Q5_1",
            Self::MostlyQ2K => "Q2_K",
            Self::MostlyQ3KS => "Q3_K_S",
            Self::MostlyQ3KM => "Q3_K_M",
            Self::MostlyQ3KL => "Q3_K_L",
            Self::MostlyQ4KS => "Q4_K_S",
            Self::MostlyQ4KM => "Q4_K_M",
            Self::MostlyQ5KS => "Q5_K_S",
            Self::MostlyQ5KM => "Q5_K_M",
            Self::MostlyQ6K => "Q6_K",
            Self::MostlyIQ2XXS => "IQ2_XXS",
            Self::MostlyIQ2XS => "IQ2_XS",
            Self::MostlyQ2KS => "Q2_K_S",
            Self::MostlyIQ3XS => "IQ3_XS",
            Self::MostlyIQ3XXS => "IQ3_XXS",
            Self::MostlyIQ1S => "IQ1_S",
            Self::MostlyIQ4NL => "IQ4_NL",
            Self::MostlyIQ3S => "IQ3_S",
            Self::MostlyIQ3M => "IQ3_M",
            Self::MostlyIQ2S => "IQ2_S",
            Self::MostlyIQ2M => "IQ2_M",
            Self::MostlyIQ4XS => "IQ4_XS",
            Self::MostlyIQ1M => "IQ1_M",
            Self::MostlyBF16 => "BF16",
            Self::MostlyTQ1_0 => "TQ1_0",
            Self::MostlyTQ2_0 => "TQ2_0",
            Self::MostlyMXFP4Moe => "MXFP4_MOE",
            Self::MostlyNVFP4 => "NVFP4",
            #[cfg(feature = "q1")]
            Self::MostlyQ1_0 => "Q1_0",
            #[cfg(feature = "q1")]
            Self::MostlyQ1_0_G128 => "Q1_0_g128",
        }
    }

    /// Returns a short human-readable summary (size / bits-per-weight and,
    /// where listed, perplexity delta) for this file type.
    ///
    /// Some strings start with a space so that the numeric columns line up
    /// when the descriptions are printed underneath each other.
    #[must_use]
    pub fn description(self) -> &'static str {
        match self {
            Self::AllF32 => "26.00 GB @ 7B — full precision reference",
            Self::MostlyF16 => "14.00 GB @ 7B — +0.0020 ppl vs Mistral-7B",
            Self::MostlyBF16 => "14.00 GB @ 7B — -0.0050 ppl vs Mistral-7B",
            Self::MostlyQ8_0 => " 7.96 GB @ 8B — +0.0026 ppl",
            Self::MostlyQ6K => " 6.14 GB @ 8B — +0.0217 ppl",
            Self::MostlyQ5KM => " 5.33 GB @ 8B — +0.0569 ppl",
            Self::MostlyQ5KS => " 5.21 GB @ 8B — +0.1049 ppl",
            Self::MostlyQ5_1 => " 5.65 GB @ 8B — +0.1062 ppl",
            Self::MostlyQ5_0 => " 5.21 GB @ 8B — +0.1316 ppl",
            Self::MostlyQ4KM => " 4.58 GB @ 8B — +0.1754 ppl [recommended]",
            Self::MostlyQ4KS => " 4.37 GB @ 8B — +0.2689 ppl",
            Self::MostlyQ4_1 => " 4.78 GB @ 8B — +0.4511 ppl",
            Self::MostlyQ4_0 => " 4.34 GB @ 8B — +0.4685 ppl",
            Self::MostlyQ3KL => " 4.03 GB @ 8B — +0.5562 ppl",
            Self::MostlyQ3KM => " 3.74 GB @ 8B — +0.6569 ppl",
            Self::MostlyQ3KS => " 3.41 GB @ 8B — +1.6321 ppl",
            Self::MostlyQ2KS => " 2.96 GB @ 8B — +3.1836 ppl",
            Self::MostlyQ2K => " 2.96 GB @ 8B — +3.5199 ppl",
            Self::MostlyIQ4XS => " 4.25 bpw non-linear",
            Self::MostlyIQ4NL => " 4.50 bpw non-linear",
            Self::MostlyIQ3S => " 3.44 bpw",
            Self::MostlyIQ3M => " 3.66 bpw",
            Self::MostlyIQ3XS => " 3.3 bpw",
            Self::MostlyIQ3XXS => " 3.06 bpw",
            Self::MostlyIQ2M => " 2.7 bpw",
            Self::MostlyIQ2S => " 2.5 bpw",
            Self::MostlyIQ2XS => " 2.31 bpw",
            Self::MostlyIQ2XXS => " 2.06 bpw",
            Self::MostlyIQ1M => " 1.75 bpw — extreme compression",
            Self::MostlyIQ1S => " 1.56 bpw — extreme compression",
            Self::MostlyTQ1_0 => " 1.69 bpw ternary",
            Self::MostlyTQ2_0 => " 2.06 bpw ternary",
            Self::MostlyMXFP4Moe => "MXFP4 MoE layers",
            Self::MostlyNVFP4 => "NVFP4",
            #[cfg(feature = "q1")]
            Self::MostlyQ1_0 => " 1.50 bpw — binary Q1_0 (block 32)",
            #[cfg(feature = "q1")]
            Self::MostlyQ1_0_G128 => " 1.125 bpw — binary Q1_0_g128 (block 128)",
        }
    }

    /// Parses a file type from its short name, ignoring ASCII case.
    ///
    /// The accepted spellings are exactly the strings returned by
    /// [`LlamaFtype::name`] (so `"q4_k_m"` and `"Q4_K_M"` both work).
    /// Returns `None` for anything else, including names that only match
    /// after Unicode case folding.
    #[must_use]
    pub fn from_name(name: &str) -> Option<Self> {
        // Reusing `all()` + `name()` keeps a single source of truth for the
        // spellings and avoids allocating an upper-cased copy of the input.
        Self::all()
            .iter()
            .copied()
            .find(|ftype| ftype.name().eq_ignore_ascii_case(name))
    }

    /// Returns every file type compiled into this build, in display order
    /// (unquantized first, then the remaining families).
    #[must_use]
    pub fn all() -> &'static [Self] {
        &[
            Self::AllF32,
            Self::MostlyF16,
            Self::MostlyBF16,
            Self::MostlyQ8_0,
            Self::MostlyQ6K,
            Self::MostlyQ5KM,
            Self::MostlyQ5KS,
            Self::MostlyQ5_1,
            Self::MostlyQ5_0,
            Self::MostlyQ4KM,
            Self::MostlyQ4KS,
            Self::MostlyQ4_1,
            Self::MostlyQ4_0,
            Self::MostlyQ3KL,
            Self::MostlyQ3KM,
            Self::MostlyQ3KS,
            Self::MostlyQ2KS,
            Self::MostlyQ2K,
            Self::MostlyIQ4XS,
            Self::MostlyIQ4NL,
            Self::MostlyIQ3S,
            Self::MostlyIQ3M,
            Self::MostlyIQ3XS,
            Self::MostlyIQ3XXS,
            Self::MostlyIQ2M,
            Self::MostlyIQ2S,
            Self::MostlyIQ2XS,
            Self::MostlyIQ2XXS,
            Self::MostlyIQ1M,
            Self::MostlyIQ1S,
            Self::MostlyTQ1_0,
            Self::MostlyTQ2_0,
            Self::MostlyMXFP4Moe,
            Self::MostlyNVFP4,
            #[cfg(feature = "q1")]
            Self::MostlyQ1_0,
            #[cfg(feature = "q1")]
            Self::MostlyQ1_0_G128,
        ]
    }
}
305
306impl From<LlamaFtype> for llama_cpp_sys_4::llama_ftype {
307 fn from(t: LlamaFtype) -> Self {
308 t as llama_cpp_sys_4::llama_ftype
309 }
310}
311
312impl std::fmt::Display for LlamaFtype {
313 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
314 write!(f, "{}", self.name())
315 }
316}
317
/// Per-tensor storage type, mirroring the `GGML_TYPE_*` constants.
///
/// Each discriminant is the raw integer that is cast to `ggml_type` at the
/// FFI boundary. Note that `NVFP4` has a different id depending on whether
/// the `q1` feature is enabled.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[non_exhaustive]
#[allow(missing_docs)]
pub enum GgmlType {
    /// `GGML_TYPE_F32`
    F32 = 0,
    /// `GGML_TYPE_F16`
    F16 = 1,
    /// `GGML_TYPE_Q4_0`
    Q4_0 = 2,
    /// `GGML_TYPE_Q4_1`
    Q4_1 = 3,
    /// `GGML_TYPE_Q5_0`
    Q5_0 = 6,
    /// `GGML_TYPE_Q5_1`
    Q5_1 = 7,
    /// `GGML_TYPE_Q8_0`
    Q8_0 = 8,
    /// `GGML_TYPE_Q8_1`
    Q8_1 = 9,
    /// `GGML_TYPE_Q2_K`
    Q2K = 10,
    /// `GGML_TYPE_Q3_K`
    Q3K = 11,
    /// `GGML_TYPE_Q4_K`
    Q4K = 12,
    /// `GGML_TYPE_Q5_K`
    Q5K = 13,
    /// `GGML_TYPE_Q6_K`
    Q6K = 14,
    /// `GGML_TYPE_Q8_K`
    Q8K = 15,
    /// `GGML_TYPE_IQ2_XXS`
    IQ2XXS = 16,
    /// `GGML_TYPE_IQ2_XS`
    IQ2XS = 17,
    /// `GGML_TYPE_IQ3_XXS`
    IQ3XXS = 18,
    /// `GGML_TYPE_IQ1_S`
    IQ1S = 19,
    /// `GGML_TYPE_IQ4_NL`
    IQ4NL = 20,
    /// `GGML_TYPE_IQ3_S`
    IQ3S = 21,
    /// `GGML_TYPE_IQ2_S`
    IQ2S = 22,
    /// `GGML_TYPE_IQ4_XS`
    IQ4XS = 23,
    /// `GGML_TYPE_I8`
    I8 = 24,
    /// `GGML_TYPE_I16`
    I16 = 25,
    /// `GGML_TYPE_I32`
    I32 = 26,
    /// `GGML_TYPE_I64`
    I64 = 27,
    /// `GGML_TYPE_F64`
    F64 = 28,
    /// `GGML_TYPE_IQ1_M`
    IQ1M = 29,
    /// `GGML_TYPE_BF16`
    BF16 = 30,
    /// `GGML_TYPE_TQ1_0`
    TQ1_0 = 34,
    /// `GGML_TYPE_TQ2_0`
    TQ2_0 = 35,
    /// `GGML_TYPE_MXFP4`
    MXFP4 = 39,
    /// NVFP4; id 40 when the `q1` feature is disabled.
    #[cfg(not(feature = "q1"))]
    NVFP4 = 40,
    /// Binary `Q1_0`; only available with the `q1` feature.
    #[cfg(feature = "q1")]
    Q1_0 = 40,
    /// Binary `Q1_0_g128`; only available with the `q1` feature.
    #[cfg(feature = "q1")]
    Q1_0_G128 = 41,
    /// NVFP4; id 42 when the `q1` feature is enabled.
    #[cfg(feature = "q1")]
    NVFP4 = 42,
}
374
375impl From<GgmlType> for llama_cpp_sys_4::ggml_type {
376 fn from(t: GgmlType) -> Self {
377 t as llama_cpp_sys_4::ggml_type
378 }
379}
380
381impl TryFrom<llama_cpp_sys_4::ggml_type> for GgmlType {
382 type Error = llama_cpp_sys_4::ggml_type;
383 fn try_from(v: llama_cpp_sys_4::ggml_type) -> Result<Self, Self::Error> {
384 match v {
385 0 => Ok(Self::F32),
386 1 => Ok(Self::F16),
387 2 => Ok(Self::Q4_0),
388 3 => Ok(Self::Q4_1),
389 6 => Ok(Self::Q5_0),
390 7 => Ok(Self::Q5_1),
391 8 => Ok(Self::Q8_0),
392 9 => Ok(Self::Q8_1),
393 10 => Ok(Self::Q2K),
394 11 => Ok(Self::Q3K),
395 12 => Ok(Self::Q4K),
396 13 => Ok(Self::Q5K),
397 14 => Ok(Self::Q6K),
398 15 => Ok(Self::Q8K),
399 16 => Ok(Self::IQ2XXS),
400 17 => Ok(Self::IQ2XS),
401 18 => Ok(Self::IQ3XXS),
402 19 => Ok(Self::IQ1S),
403 20 => Ok(Self::IQ4NL),
404 21 => Ok(Self::IQ3S),
405 22 => Ok(Self::IQ2S),
406 23 => Ok(Self::IQ4XS),
407 24 => Ok(Self::I8),
408 25 => Ok(Self::I16),
409 26 => Ok(Self::I32),
410 27 => Ok(Self::I64),
411 28 => Ok(Self::F64),
412 29 => Ok(Self::IQ1M),
413 30 => Ok(Self::BF16),
414 34 => Ok(Self::TQ1_0),
415 35 => Ok(Self::TQ2_0),
416 39 => Ok(Self::MXFP4),
417 #[cfg(not(feature = "q1"))]
418 40 => Ok(Self::NVFP4),
419 #[cfg(feature = "q1")]
420 40 => Ok(Self::Q1_0),
421 #[cfg(feature = "q1")]
422 41 => Ok(Self::Q1_0_G128),
423 #[cfg(feature = "q1")]
424 42 => Ok(Self::NVFP4),
425 _ => Err(v),
426 }
427 }
428}
429
/// One importance-matrix record: a tensor name plus its `f32` values.
///
/// The name is stored as a `CString` so a pointer to it can be handed to the
/// C API without further conversion.
#[derive(Debug, Clone)]
pub struct ImatrixEntry {
    /// NUL-terminated name of the entry.
    name: CString,
    /// Values associated with `name`.
    data: Vec<f32>,
}

impl ImatrixEntry {
    /// Builds an entry from a name and its data.
    ///
    /// # Errors
    ///
    /// Returns [`NulError`] when `name` contains an interior NUL byte.
    pub fn new(name: impl Into<Vec<u8>>, data: Vec<f32>) -> Result<Self, NulError> {
        CString::new(name).map(|name| Self { name, data })
    }

    /// Returns the name as `&str`, or `""` when it is not valid UTF-8.
    #[must_use]
    pub fn name_str(&self) -> &str {
        match self.name.to_str() {
            Ok(text) => text,
            Err(_) => "",
        }
    }

    /// Number of `f32` values held by this entry.
    #[must_use]
    pub fn len(&self) -> usize {
        let Self { data, .. } = self;
        data.len()
    }

    /// `true` when the entry holds no values.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        let Self { data, .. } = self;
        data.is_empty()
    }
}
477
478#[derive(Debug, Clone, Default)]
483pub struct Imatrix {
484 entries: Vec<ImatrixEntry>,
485}
486
487impl Imatrix {
488 #[must_use]
490 pub fn new() -> Self {
491 Self::default()
492 }
493
494 pub fn push(&mut self, entry: ImatrixEntry) {
496 self.entries.push(entry);
497 }
498
499 #[must_use]
501 pub fn len(&self) -> usize {
502 self.entries.len()
503 }
504
505 #[must_use]
507 pub fn is_empty(&self) -> bool {
508 self.entries.is_empty()
509 }
510}
511
512#[derive(Debug, Clone)]
530pub struct TensorTypeOverride {
531 pattern: CString,
532 ty: GgmlType,
533}
534
535impl TensorTypeOverride {
536 pub fn new(pattern: impl Into<Vec<u8>>, ty: GgmlType) -> Result<Self, NulError> {
542 Ok(Self {
543 pattern: CString::new(pattern)?,
544 ty,
545 })
546 }
547
548 #[must_use]
550 pub fn pattern_str(&self) -> &str {
551 self.pattern.to_str().unwrap_or("")
552 }
553
554 #[must_use]
556 pub fn ty(&self) -> GgmlType {
557 self.ty
558 }
559}
560
/// Typed payload of a metadata key/value override.
#[derive(Debug, Clone, PartialEq)]
pub enum KvOverrideValue {
    /// Signed 64-bit integer value.
    Int(i64),
    /// 64-bit floating-point value.
    Float(f64),
    /// Boolean value.
    Bool(bool),
    /// String value stored in a fixed 128-element `c_char` buffer; the buffer
    /// is copied verbatim into the raw override.
    Str([std::os::raw::c_char; 128]),
}
577
578#[derive(Debug, Clone)]
582pub struct KvOverride {
583 key: CString,
584 pub value: KvOverrideValue,
586}
587
588impl KvOverride {
589 pub fn new(key: impl Into<Vec<u8>>, value: KvOverrideValue) -> Result<Self, NulError> {
595 Ok(Self {
596 key: CString::new(key)?,
597 value,
598 })
599 }
600}
601
602#[derive(Debug, Clone)]
628#[allow(clippy::struct_excessive_bools)]
629pub struct QuantizeParams {
630 pub nthread: i32,
632 pub ftype: LlamaFtype,
634 pub output_tensor_type: Option<GgmlType>,
636 pub token_embedding_type: Option<GgmlType>,
638 pub allow_requantize: bool,
640 pub quantize_output_tensor: bool,
642 pub only_copy: bool,
644 pub pure: bool,
646 pub keep_split: bool,
648 pub dry_run: bool,
650
651 imatrix: Vec<ImatrixEntry>,
652 kv_overrides: Vec<KvOverride>,
653 tt_overrides: Vec<TensorTypeOverride>,
654 prune_layers: Vec<i32>,
655}
656
657impl QuantizeParams {
658 #[must_use]
663 pub fn new(ftype: LlamaFtype) -> Self {
664 let d = unsafe { llama_cpp_sys_4::llama_model_quantize_default_params() };
666 Self {
667 nthread: d.nthread,
668 ftype,
669 output_tensor_type: GgmlType::try_from(d.output_tensor_type).ok(),
670 token_embedding_type: GgmlType::try_from(d.token_embedding_type).ok(),
671 allow_requantize: d.allow_requantize,
672 quantize_output_tensor: d.quantize_output_tensor,
673 only_copy: d.only_copy,
674 pure: d.pure_,
675 keep_split: d.keep_split,
676 dry_run: d.dry_run,
677 imatrix: Vec::new(),
678 kv_overrides: Vec::new(),
679 tt_overrides: Vec::new(),
680 prune_layers: Vec::new(),
681 }
682 }
683
684 #[must_use]
686 pub fn with_nthread(mut self, n: i32) -> Self {
687 self.nthread = n;
688 self
689 }
690
691 #[must_use]
693 pub fn with_output_tensor_type(mut self, ty: GgmlType) -> Self {
694 self.output_tensor_type = Some(ty);
695 self
696 }
697
698 #[must_use]
700 pub fn with_token_embedding_type(mut self, ty: GgmlType) -> Self {
701 self.token_embedding_type = Some(ty);
702 self
703 }
704
705 #[must_use]
707 pub fn with_allow_requantize(mut self, v: bool) -> Self {
708 self.allow_requantize = v;
709 self
710 }
711
712 #[must_use]
714 pub fn with_quantize_output_tensor(mut self, v: bool) -> Self {
715 self.quantize_output_tensor = v;
716 self
717 }
718
719 #[must_use]
721 pub fn with_only_copy(mut self, v: bool) -> Self {
722 self.only_copy = v;
723 self
724 }
725
726 #[must_use]
728 pub fn with_pure(mut self, v: bool) -> Self {
729 self.pure = v;
730 self
731 }
732
733 #[must_use]
735 pub fn with_keep_split(mut self, v: bool) -> Self {
736 self.keep_split = v;
737 self
738 }
739
740 #[must_use]
742 pub fn with_dry_run(mut self, v: bool) -> Self {
743 self.dry_run = v;
744 self
745 }
746
747 #[must_use]
753 pub fn with_imatrix(mut self, imatrix: Imatrix) -> Self {
754 self.imatrix = imatrix.entries;
755 self
756 }
757
758 #[must_use]
760 pub fn with_imatrix_entry(mut self, entry: ImatrixEntry) -> Self {
761 self.imatrix.push(entry);
762 self
763 }
764
765 #[must_use]
767 pub fn with_kv_override(mut self, kv: KvOverride) -> Self {
768 self.kv_overrides.push(kv);
769 self
770 }
771
772 #[must_use]
776 pub fn with_tensor_type_override(mut self, ov: TensorTypeOverride) -> Self {
777 self.tt_overrides.push(ov);
778 self
779 }
780
781 #[must_use]
783 pub fn with_pruned_layer(mut self, layer: i32) -> Self {
784 self.prune_layers.push(layer);
785 self
786 }
787
788 #[must_use]
790 pub fn with_pruned_layers(mut self, layers: impl IntoIterator<Item = i32>) -> Self {
791 self.prune_layers.extend(layers);
792 self
793 }
794
795 pub(crate) fn to_raw(&self) -> RawQuantizeParamsGuard<'_> {
801 let imatrix_c: Vec<llama_cpp_sys_4::llama_model_imatrix_data> = self
805 .imatrix
806 .iter()
807 .map(|e| llama_cpp_sys_4::llama_model_imatrix_data {
808 name: e.name.as_ptr(),
809 data: e.data.as_ptr(),
810 size: e.data.len(),
811 })
812 .chain(std::iter::once(llama_cpp_sys_4::llama_model_imatrix_data {
813 name: null(),
814 data: null(),
815 size: 0,
816 }))
817 .collect();
818
819 let kv_c: Vec<llama_cpp_sys_4::llama_model_kv_override> = self
822 .kv_overrides
823 .iter()
824 .map(|kv| {
825 let mut raw = llama_cpp_sys_4::llama_model_kv_override {
826 key: [0; 128],
827 tag: 0,
828 __bindgen_anon_1: llama_cpp_sys_4::llama_model_kv_override__bindgen_ty_1 {
829 val_i64: 0,
830 },
831 };
832 let bytes = kv.key.to_bytes_with_nul();
834 let copy_len = bytes.len().min(128);
835 for (dst, &src) in raw.key.iter_mut().zip(bytes[..copy_len].iter()) {
836 *dst = src as std::os::raw::c_char;
837 }
838 match &kv.value {
839 KvOverrideValue::Int(v) => {
840 raw.tag = llama_cpp_sys_4::LLAMA_KV_OVERRIDE_TYPE_INT;
841 raw.__bindgen_anon_1 =
842 llama_cpp_sys_4::llama_model_kv_override__bindgen_ty_1 {
843 val_i64: *v,
844 };
845 }
846 KvOverrideValue::Float(v) => {
847 raw.tag = llama_cpp_sys_4::LLAMA_KV_OVERRIDE_TYPE_FLOAT;
848 raw.__bindgen_anon_1 =
849 llama_cpp_sys_4::llama_model_kv_override__bindgen_ty_1 {
850 val_f64: *v,
851 };
852 }
853 KvOverrideValue::Bool(v) => {
854 raw.tag = llama_cpp_sys_4::LLAMA_KV_OVERRIDE_TYPE_BOOL;
855 raw.__bindgen_anon_1 =
856 llama_cpp_sys_4::llama_model_kv_override__bindgen_ty_1 {
857 val_bool: *v,
858 };
859 }
860 KvOverrideValue::Str(s) => {
861 raw.tag = llama_cpp_sys_4::LLAMA_KV_OVERRIDE_TYPE_STR;
862 raw.__bindgen_anon_1 =
863 llama_cpp_sys_4::llama_model_kv_override__bindgen_ty_1 {
864 val_str: *s,
865 };
866 }
867 }
868 raw
869 })
870 .chain(std::iter::once(llama_cpp_sys_4::llama_model_kv_override {
871 key: [0; 128],
872 tag: 0,
873 __bindgen_anon_1: llama_cpp_sys_4::llama_model_kv_override__bindgen_ty_1 {
874 val_i64: 0,
875 },
876 }))
877 .collect();
878
879 let tt_c: Vec<llama_cpp_sys_4::llama_model_tensor_override> = self
882 .tt_overrides
883 .iter()
884 .map(|ov| llama_cpp_sys_4::llama_model_tensor_override {
885 pattern: ov.pattern.as_ptr(),
886 type_: ov.ty as llama_cpp_sys_4::ggml_type,
887 })
888 .chain(std::iter::once(
889 llama_cpp_sys_4::llama_model_tensor_override {
890 pattern: null(),
891 type_: llama_cpp_sys_4::GGML_TYPE_COUNT,
892 },
893 ))
894 .collect();
895
896 let mut prune_c = self.prune_layers.clone();
899 prune_c.push(-1);
900
901 let raw = llama_cpp_sys_4::llama_model_quantize_params {
903 nthread: self.nthread,
904 ftype: self.ftype as llama_cpp_sys_4::llama_ftype,
905 output_tensor_type: self
906 .output_tensor_type
907 .map(|t| t as llama_cpp_sys_4::ggml_type)
908 .unwrap_or(llama_cpp_sys_4::GGML_TYPE_COUNT),
909 token_embedding_type: self
910 .token_embedding_type
911 .map(|t| t as llama_cpp_sys_4::ggml_type)
912 .unwrap_or(llama_cpp_sys_4::GGML_TYPE_COUNT),
913 allow_requantize: self.allow_requantize,
914 quantize_output_tensor: self.quantize_output_tensor,
915 only_copy: self.only_copy,
916 pure_: self.pure,
917 keep_split: self.keep_split,
918 dry_run: self.dry_run,
919 imatrix: if self.imatrix.is_empty() {
920 null()
921 } else {
922 imatrix_c.as_ptr()
923 },
924 kv_overrides: if self.kv_overrides.is_empty() {
925 null()
926 } else {
927 kv_c.as_ptr()
928 },
929 tt_overrides: if self.tt_overrides.is_empty() {
930 null()
931 } else {
932 tt_c.as_ptr()
933 },
934 prune_layers: if self.prune_layers.is_empty() {
935 null()
936 } else {
937 prune_c.as_ptr()
938 },
939 };
940
941 RawQuantizeParamsGuard {
942 raw,
943 _imatrix_c: imatrix_c,
944 _kv_c: kv_c,
945 _tt_c: tt_c,
946 _prune_c: prune_c,
947 _marker: std::marker::PhantomData,
948 }
949 }
950}
951
952pub(crate) struct RawQuantizeParamsGuard<'a> {
955 pub(crate) raw: llama_cpp_sys_4::llama_model_quantize_params,
956 _imatrix_c: Vec<llama_cpp_sys_4::llama_model_imatrix_data>,
957 _kv_c: Vec<llama_cpp_sys_4::llama_model_kv_override>,
958 _tt_c: Vec<llama_cpp_sys_4::llama_model_tensor_override>,
959 _prune_c: Vec<i32>,
960 _marker: std::marker::PhantomData<&'a QuantizeParams>,
963}
964
/// Sets or clears the `LLAMA_ATTN_ROT_DISABLE` environment variable.
///
/// `true` sets the variable to `"1"`; `false` removes it.
///
/// This mutates the process environment. NOTE(review): the standard library
/// documents environment mutation as unsound when other threads access the
/// environment concurrently — call this before spawning threads.
#[allow(unused_unsafe)]
pub fn set_attn_rot_disabled(disabled: bool) {
    // The `unsafe` block is required on editions where `set_var`/`remove_var`
    // are unsafe and is a harmless no-op (hence the `allow`) elsewhere.
    unsafe {
        match disabled {
            true => std::env::set_var("LLAMA_ATTN_ROT_DISABLE", "1"),
            false => std::env::remove_var("LLAMA_ATTN_ROT_DISABLE"),
        }
    }
}
1012
/// Reports whether `LLAMA_ATTN_ROT_DISABLE` is currently set to a non-zero
/// integer.
///
/// Returns `false` when the variable is unset, is not valid Unicode, does
/// not parse as an `i32`, or parses to `0`.
#[must_use]
pub fn attn_rot_disabled() -> bool {
    match std::env::var("LLAMA_ATTN_ROT_DISABLE") {
        Ok(raw) => match raw.parse::<i32>() {
            Ok(flag) => flag != 0,
            Err(_) => false,
        },
        Err(_) => false,
    }
}