1use std::ffi::{CString, NulError};
26use std::ptr::null;
27
/// Model file types (`ftype`) that a quantization run can target.
///
/// The discriminant of each variant is the raw integer obtained when the
/// value is cast to `llama_cpp_sys_4::llama_ftype`. The values 4–6 and 33–35
/// are not assigned to any variant.
///
/// The quoted string on each variant is what [`LlamaFtype::name`] returns.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
#[non_exhaustive]
#[allow(missing_docs)]
pub enum LlamaFtype {
    /// `"F32"`
    AllF32 = 0,
    /// `"F16"`
    MostlyF16 = 1,
    /// `"Q4_0"`
    MostlyQ4_0 = 2,
    /// `"Q4_1"`
    MostlyQ4_1 = 3,
    /// `"Q8_0"`
    MostlyQ8_0 = 7,
    /// `"Q5_0"`
    MostlyQ5_0 = 8,
    /// `"Q5_1"`
    MostlyQ5_1 = 9,
    /// `"Q2_K"`
    MostlyQ2K = 10,
    /// `"Q3_K_S"`
    MostlyQ3KS = 11,
    /// `"Q3_K_M"`
    MostlyQ3KM = 12,
    /// `"Q3_K_L"`
    MostlyQ3KL = 13,
    /// `"Q4_K_S"`
    MostlyQ4KS = 14,
    /// `"Q4_K_M"`
    MostlyQ4KM = 15,
    /// `"Q5_K_S"`
    MostlyQ5KS = 16,
    /// `"Q5_K_M"`
    MostlyQ5KM = 17,
    /// `"Q6_K"`
    MostlyQ6K = 18,
    /// `"IQ2_XXS"`
    MostlyIQ2XXS = 19,
    /// `"IQ2_XS"`
    MostlyIQ2XS = 20,
    /// `"Q2_K_S"`
    MostlyQ2KS = 21,
    /// `"IQ3_XS"`
    MostlyIQ3XS = 22,
    /// `"IQ3_XXS"`
    MostlyIQ3XXS = 23,
    /// `"IQ1_S"`
    MostlyIQ1S = 24,
    /// `"IQ4_NL"`
    MostlyIQ4NL = 25,
    /// `"IQ3_S"`
    MostlyIQ3S = 26,
    /// `"IQ3_M"`
    MostlyIQ3M = 27,
    /// `"IQ2_S"`
    MostlyIQ2S = 28,
    /// `"IQ2_M"`
    MostlyIQ2M = 29,
    /// `"IQ4_XS"`
    MostlyIQ4XS = 30,
    /// `"IQ1_M"`
    MostlyIQ1M = 31,
    /// `"BF16"`
    MostlyBF16 = 32,
    /// `"TQ1_0"`
    MostlyTQ1_0 = 36,
    /// `"TQ2_0"`
    MostlyTQ2_0 = 37,
    /// `"MXFP4_MOE"`
    MostlyMXFP4Moe = 38,
    /// `"NVFP4"`
    MostlyNVFP4 = 39,
}
108
109impl LlamaFtype {
110 #[must_use]
112 pub fn name(self) -> &'static str {
113 match self {
114 Self::AllF32 => "F32",
115 Self::MostlyF16 => "F16",
116 Self::MostlyQ4_0 => "Q4_0",
117 Self::MostlyQ4_1 => "Q4_1",
118 Self::MostlyQ8_0 => "Q8_0",
119 Self::MostlyQ5_0 => "Q5_0",
120 Self::MostlyQ5_1 => "Q5_1",
121 Self::MostlyQ2K => "Q2_K",
122 Self::MostlyQ3KS => "Q3_K_S",
123 Self::MostlyQ3KM => "Q3_K_M",
124 Self::MostlyQ3KL => "Q3_K_L",
125 Self::MostlyQ4KS => "Q4_K_S",
126 Self::MostlyQ4KM => "Q4_K_M",
127 Self::MostlyQ5KS => "Q5_K_S",
128 Self::MostlyQ5KM => "Q5_K_M",
129 Self::MostlyQ6K => "Q6_K",
130 Self::MostlyIQ2XXS => "IQ2_XXS",
131 Self::MostlyIQ2XS => "IQ2_XS",
132 Self::MostlyQ2KS => "Q2_K_S",
133 Self::MostlyIQ3XS => "IQ3_XS",
134 Self::MostlyIQ3XXS => "IQ3_XXS",
135 Self::MostlyIQ1S => "IQ1_S",
136 Self::MostlyIQ4NL => "IQ4_NL",
137 Self::MostlyIQ3S => "IQ3_S",
138 Self::MostlyIQ3M => "IQ3_M",
139 Self::MostlyIQ2S => "IQ2_S",
140 Self::MostlyIQ2M => "IQ2_M",
141 Self::MostlyIQ4XS => "IQ4_XS",
142 Self::MostlyIQ1M => "IQ1_M",
143 Self::MostlyBF16 => "BF16",
144 Self::MostlyTQ1_0 => "TQ1_0",
145 Self::MostlyTQ2_0 => "TQ2_0",
146 Self::MostlyMXFP4Moe => "MXFP4_MOE",
147 Self::MostlyNVFP4 => "NVFP4",
148 }
149 }
150
151 #[must_use]
153 pub fn description(self) -> &'static str {
154 match self {
155 Self::AllF32 => "26.00 GB @ 7B — full precision reference",
156 Self::MostlyF16 => "14.00 GB @ 7B — +0.0020 ppl vs Mistral-7B",
157 Self::MostlyBF16 => "14.00 GB @ 7B — -0.0050 ppl vs Mistral-7B",
158 Self::MostlyQ8_0 => " 7.96 GB @ 8B — +0.0026 ppl",
159 Self::MostlyQ6K => " 6.14 GB @ 8B — +0.0217 ppl",
160 Self::MostlyQ5KM => " 5.33 GB @ 8B — +0.0569 ppl",
161 Self::MostlyQ5KS => " 5.21 GB @ 8B — +0.1049 ppl",
162 Self::MostlyQ5_1 => " 5.65 GB @ 8B — +0.1062 ppl",
163 Self::MostlyQ5_0 => " 5.21 GB @ 8B — +0.1316 ppl",
164 Self::MostlyQ4KM => " 4.58 GB @ 8B — +0.1754 ppl [recommended]",
165 Self::MostlyQ4KS => " 4.37 GB @ 8B — +0.2689 ppl",
166 Self::MostlyQ4_1 => " 4.78 GB @ 8B — +0.4511 ppl",
167 Self::MostlyQ4_0 => " 4.34 GB @ 8B — +0.4685 ppl",
168 Self::MostlyQ3KL => " 4.03 GB @ 8B — +0.5562 ppl",
169 Self::MostlyQ3KM => " 3.74 GB @ 8B — +0.6569 ppl",
170 Self::MostlyQ3KS => " 3.41 GB @ 8B — +1.6321 ppl",
171 Self::MostlyQ2KS => " 2.96 GB @ 8B — +3.1836 ppl",
172 Self::MostlyQ2K => " 2.96 GB @ 8B — +3.5199 ppl",
173 Self::MostlyIQ4XS => " 4.25 bpw non-linear",
174 Self::MostlyIQ4NL => " 4.50 bpw non-linear",
175 Self::MostlyIQ3S => " 3.44 bpw",
176 Self::MostlyIQ3M => " 3.66 bpw",
177 Self::MostlyIQ3XS => " 3.3 bpw",
178 Self::MostlyIQ3XXS => " 3.06 bpw",
179 Self::MostlyIQ2M => " 2.7 bpw",
180 Self::MostlyIQ2S => " 2.5 bpw",
181 Self::MostlyIQ2XS => " 2.31 bpw",
182 Self::MostlyIQ2XXS => " 2.06 bpw",
183 Self::MostlyIQ1M => " 1.75 bpw — extreme compression",
184 Self::MostlyIQ1S => " 1.56 bpw — extreme compression",
185 Self::MostlyTQ1_0 => " 1.69 bpw ternary",
186 Self::MostlyTQ2_0 => " 2.06 bpw ternary",
187 Self::MostlyMXFP4Moe => "MXFP4 MoE layers",
188 Self::MostlyNVFP4 => "NVFP4",
189 }
190 }
191
192 #[must_use]
201 pub fn from_name(name: &str) -> Option<Self> {
202 let upper = name.to_uppercase();
203 match upper.as_str() {
204 "F32" => Some(Self::AllF32),
205 "F16" => Some(Self::MostlyF16),
206 "BF16" => Some(Self::MostlyBF16),
207 "Q4_0" => Some(Self::MostlyQ4_0),
208 "Q4_1" => Some(Self::MostlyQ4_1),
209 "Q8_0" => Some(Self::MostlyQ8_0),
210 "Q5_0" => Some(Self::MostlyQ5_0),
211 "Q5_1" => Some(Self::MostlyQ5_1),
212 "Q2_K" => Some(Self::MostlyQ2K),
213 "Q2_K_S" => Some(Self::MostlyQ2KS),
214 "Q3_K_S" => Some(Self::MostlyQ3KS),
215 "Q3_K_M" => Some(Self::MostlyQ3KM),
216 "Q3_K_L" => Some(Self::MostlyQ3KL),
217 "Q4_K_S" => Some(Self::MostlyQ4KS),
218 "Q4_K_M" => Some(Self::MostlyQ4KM),
219 "Q5_K_S" => Some(Self::MostlyQ5KS),
220 "Q5_K_M" => Some(Self::MostlyQ5KM),
221 "Q6_K" => Some(Self::MostlyQ6K),
222 "IQ1_S" => Some(Self::MostlyIQ1S),
223 "IQ1_M" => Some(Self::MostlyIQ1M),
224 "IQ2_XXS" => Some(Self::MostlyIQ2XXS),
225 "IQ2_XS" => Some(Self::MostlyIQ2XS),
226 "IQ2_S" => Some(Self::MostlyIQ2S),
227 "IQ2_M" => Some(Self::MostlyIQ2M),
228 "IQ3_XXS" => Some(Self::MostlyIQ3XXS),
229 "IQ3_XS" => Some(Self::MostlyIQ3XS),
230 "IQ3_S" => Some(Self::MostlyIQ3S),
231 "IQ3_M" => Some(Self::MostlyIQ3M),
232 "IQ4_NL" => Some(Self::MostlyIQ4NL),
233 "IQ4_XS" => Some(Self::MostlyIQ4XS),
234 "TQ1_0" => Some(Self::MostlyTQ1_0),
235 "TQ2_0" => Some(Self::MostlyTQ2_0),
236 "MXFP4_MOE" => Some(Self::MostlyMXFP4Moe),
237 "NVFP4" => Some(Self::MostlyNVFP4),
238 _ => None,
239 }
240 }
241
242 #[must_use]
244 pub fn all() -> &'static [Self] {
245 &[
246 Self::AllF32,
247 Self::MostlyF16,
248 Self::MostlyBF16,
249 Self::MostlyQ8_0,
250 Self::MostlyQ6K,
251 Self::MostlyQ5KM,
252 Self::MostlyQ5KS,
253 Self::MostlyQ5_1,
254 Self::MostlyQ5_0,
255 Self::MostlyQ4KM,
256 Self::MostlyQ4KS,
257 Self::MostlyQ4_1,
258 Self::MostlyQ4_0,
259 Self::MostlyQ3KL,
260 Self::MostlyQ3KM,
261 Self::MostlyQ3KS,
262 Self::MostlyQ2KS,
263 Self::MostlyQ2K,
264 Self::MostlyIQ4XS,
265 Self::MostlyIQ4NL,
266 Self::MostlyIQ3S,
267 Self::MostlyIQ3M,
268 Self::MostlyIQ3XS,
269 Self::MostlyIQ3XXS,
270 Self::MostlyIQ2M,
271 Self::MostlyIQ2S,
272 Self::MostlyIQ2XS,
273 Self::MostlyIQ2XXS,
274 Self::MostlyIQ1M,
275 Self::MostlyIQ1S,
276 Self::MostlyTQ1_0,
277 Self::MostlyTQ2_0,
278 Self::MostlyMXFP4Moe,
279 Self::MostlyNVFP4,
280 ]
281 }
282}
283
284impl From<LlamaFtype> for llama_cpp_sys_4::llama_ftype {
285 fn from(t: LlamaFtype) -> Self {
286 t as llama_cpp_sys_4::llama_ftype
287 }
288}
289
290impl std::fmt::Display for LlamaFtype {
291 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
292 write!(f, "{}", self.name())
293 }
294}
295
/// Tensor element types that can be requested for individual tensors.
///
/// The discriminant of each variant is the raw integer exchanged with
/// `llama_cpp_sys_4::ggml_type` by the `From` / `TryFrom` impls in this file.
/// The values 4, 5, 31–33 and 36–38 have no variant here.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
#[non_exhaustive]
#[allow(missing_docs)]
pub enum GgmlType {
    // Values 0–3.
    F32 = 0,
    F16 = 1,
    Q4_0 = 2,
    Q4_1 = 3,

    // Values 6–9 (4 and 5 are skipped).
    Q5_0 = 6,
    Q5_1 = 7,
    Q8_0 = 8,
    Q8_1 = 9,

    // `*K` family, values 10–15.
    Q2K = 10,
    Q3K = 11,
    Q4K = 12,
    Q5K = 13,
    Q6K = 14,
    Q8K = 15,

    // `IQ*` family, values 16–23 (IQ1M follows further down at 29).
    IQ2XXS = 16,
    IQ2XS = 17,
    IQ3XXS = 18,
    IQ1S = 19,
    IQ4NL = 20,
    IQ3S = 21,
    IQ2S = 22,
    IQ4XS = 23,

    // Values 24–30.
    I8 = 24,
    I16 = 25,
    I32 = 26,
    I64 = 27,
    F64 = 28,
    IQ1M = 29,
    BF16 = 30,

    // Values 34–35 (31–33 are skipped).
    TQ1_0 = 34,
    TQ2_0 = 35,

    // Values 39–40 (36–38 are skipped).
    MXFP4 = 39,
    NVFP4 = 40,
}
343
344impl From<GgmlType> for llama_cpp_sys_4::ggml_type {
345 fn from(t: GgmlType) -> Self {
346 t as llama_cpp_sys_4::ggml_type
347 }
348}
349
350impl TryFrom<llama_cpp_sys_4::ggml_type> for GgmlType {
351 type Error = llama_cpp_sys_4::ggml_type;
352 fn try_from(v: llama_cpp_sys_4::ggml_type) -> Result<Self, Self::Error> {
353 match v {
354 0 => Ok(Self::F32),
355 1 => Ok(Self::F16),
356 2 => Ok(Self::Q4_0),
357 3 => Ok(Self::Q4_1),
358 6 => Ok(Self::Q5_0),
359 7 => Ok(Self::Q5_1),
360 8 => Ok(Self::Q8_0),
361 9 => Ok(Self::Q8_1),
362 10 => Ok(Self::Q2K),
363 11 => Ok(Self::Q3K),
364 12 => Ok(Self::Q4K),
365 13 => Ok(Self::Q5K),
366 14 => Ok(Self::Q6K),
367 15 => Ok(Self::Q8K),
368 16 => Ok(Self::IQ2XXS),
369 17 => Ok(Self::IQ2XS),
370 18 => Ok(Self::IQ3XXS),
371 19 => Ok(Self::IQ1S),
372 20 => Ok(Self::IQ4NL),
373 21 => Ok(Self::IQ3S),
374 22 => Ok(Self::IQ2S),
375 23 => Ok(Self::IQ4XS),
376 24 => Ok(Self::I8),
377 25 => Ok(Self::I16),
378 26 => Ok(Self::I32),
379 27 => Ok(Self::I64),
380 28 => Ok(Self::F64),
381 29 => Ok(Self::IQ1M),
382 30 => Ok(Self::BF16),
383 34 => Ok(Self::TQ1_0),
384 35 => Ok(Self::TQ2_0),
385 39 => Ok(Self::MXFP4),
386 40 => Ok(Self::NVFP4),
387 _ => Err(v),
388 }
389 }
390}
391
/// One named vector of `f32` values belonging to an importance matrix.
///
/// The name is kept as a [`CString`] because `QuantizeParams::to_raw` hands
/// `name.as_ptr()` and `data.as_ptr()` straight to the C API.
#[derive(Clone, Debug)]
pub struct ImatrixEntry {
    /// NUL-terminated entry name.
    name: CString,
    /// The values for this entry; its length is reported as `size`.
    data: Vec<f32>,
}
407
408impl ImatrixEntry {
409 pub fn new(name: impl Into<Vec<u8>>, data: Vec<f32>) -> Result<Self, NulError> {
415 Ok(Self {
416 name: CString::new(name)?,
417 data,
418 })
419 }
420
421 #[must_use]
423 pub fn name_str(&self) -> &str {
424 self.name.to_str().unwrap_or("")
425 }
426
427 #[must_use]
429 pub fn len(&self) -> usize {
430 self.data.len()
431 }
432
433 #[must_use]
435 pub fn is_empty(&self) -> bool {
436 self.data.is_empty()
437 }
438}
439
440#[derive(Debug, Clone, Default)]
445pub struct Imatrix {
446 entries: Vec<ImatrixEntry>,
447}
448
449impl Imatrix {
450 #[must_use]
452 pub fn new() -> Self {
453 Self::default()
454 }
455
456 pub fn push(&mut self, entry: ImatrixEntry) {
458 self.entries.push(entry);
459 }
460
461 #[must_use]
463 pub fn len(&self) -> usize {
464 self.entries.len()
465 }
466
467 #[must_use]
469 pub fn is_empty(&self) -> bool {
470 self.entries.is_empty()
471 }
472}
473
474#[derive(Debug, Clone)]
492pub struct TensorTypeOverride {
493 pattern: CString,
494 ty: GgmlType,
495}
496
497impl TensorTypeOverride {
498 pub fn new(pattern: impl Into<Vec<u8>>, ty: GgmlType) -> Result<Self, NulError> {
504 Ok(Self {
505 pattern: CString::new(pattern)?,
506 ty,
507 })
508 }
509
510 #[must_use]
512 pub fn pattern_str(&self) -> &str {
513 self.pattern.to_str().unwrap_or("")
514 }
515
516 #[must_use]
518 pub fn ty(&self) -> GgmlType {
519 self.ty
520 }
521}
522
/// The value carried by a metadata override, one variant per value kind.
///
/// `QuantizeParams::to_raw` maps each variant to the matching
/// `LLAMA_KV_OVERRIDE_TYPE_*` tag and union field.
#[derive(Clone, Debug, PartialEq)]
pub enum KvOverrideValue {
    /// A signed 64-bit integer (`val_i64`).
    Int(i64),
    /// A 64-bit float (`val_f64`).
    Float(f64),
    /// A boolean (`val_bool`).
    Bool(bool),
    /// A fixed 128-element C character buffer, copied verbatim into
    /// `val_str`. NOTE(review): presumably must contain a NUL terminator;
    /// nothing in this file checks it.
    Str([std::os::raw::c_char; 128]),
}
539
540#[derive(Debug, Clone)]
544pub struct KvOverride {
545 key: CString,
546 pub value: KvOverrideValue,
548}
549
550impl KvOverride {
551 pub fn new(key: impl Into<Vec<u8>>, value: KvOverrideValue) -> Result<Self, NulError> {
557 Ok(Self {
558 key: CString::new(key)?,
559 value,
560 })
561 }
562}
563
564#[derive(Debug, Clone)]
590#[allow(clippy::struct_excessive_bools)]
591pub struct QuantizeParams {
592 pub nthread: i32,
594 pub ftype: LlamaFtype,
596 pub output_tensor_type: Option<GgmlType>,
598 pub token_embedding_type: Option<GgmlType>,
600 pub allow_requantize: bool,
602 pub quantize_output_tensor: bool,
604 pub only_copy: bool,
606 pub pure: bool,
608 pub keep_split: bool,
610 pub dry_run: bool,
612
613 imatrix: Vec<ImatrixEntry>,
614 kv_overrides: Vec<KvOverride>,
615 tt_overrides: Vec<TensorTypeOverride>,
616 prune_layers: Vec<i32>,
617}
618
619impl QuantizeParams {
620 #[must_use]
625 pub fn new(ftype: LlamaFtype) -> Self {
626 let d = unsafe { llama_cpp_sys_4::llama_model_quantize_default_params() };
628 Self {
629 nthread: d.nthread,
630 ftype,
631 output_tensor_type: GgmlType::try_from(d.output_tensor_type).ok(),
632 token_embedding_type: GgmlType::try_from(d.token_embedding_type).ok(),
633 allow_requantize: d.allow_requantize,
634 quantize_output_tensor: d.quantize_output_tensor,
635 only_copy: d.only_copy,
636 pure: d.pure_,
637 keep_split: d.keep_split,
638 dry_run: d.dry_run,
639 imatrix: Vec::new(),
640 kv_overrides: Vec::new(),
641 tt_overrides: Vec::new(),
642 prune_layers: Vec::new(),
643 }
644 }
645
646 #[must_use]
648 pub fn with_nthread(mut self, n: i32) -> Self {
649 self.nthread = n;
650 self
651 }
652
653 #[must_use]
655 pub fn with_output_tensor_type(mut self, ty: GgmlType) -> Self {
656 self.output_tensor_type = Some(ty);
657 self
658 }
659
660 #[must_use]
662 pub fn with_token_embedding_type(mut self, ty: GgmlType) -> Self {
663 self.token_embedding_type = Some(ty);
664 self
665 }
666
667 #[must_use]
669 pub fn with_allow_requantize(mut self, v: bool) -> Self {
670 self.allow_requantize = v;
671 self
672 }
673
674 #[must_use]
676 pub fn with_quantize_output_tensor(mut self, v: bool) -> Self {
677 self.quantize_output_tensor = v;
678 self
679 }
680
681 #[must_use]
683 pub fn with_only_copy(mut self, v: bool) -> Self {
684 self.only_copy = v;
685 self
686 }
687
688 #[must_use]
690 pub fn with_pure(mut self, v: bool) -> Self {
691 self.pure = v;
692 self
693 }
694
695 #[must_use]
697 pub fn with_keep_split(mut self, v: bool) -> Self {
698 self.keep_split = v;
699 self
700 }
701
702 #[must_use]
704 pub fn with_dry_run(mut self, v: bool) -> Self {
705 self.dry_run = v;
706 self
707 }
708
709 #[must_use]
715 pub fn with_imatrix(mut self, imatrix: Imatrix) -> Self {
716 self.imatrix = imatrix.entries;
717 self
718 }
719
720 #[must_use]
722 pub fn with_imatrix_entry(mut self, entry: ImatrixEntry) -> Self {
723 self.imatrix.push(entry);
724 self
725 }
726
727 #[must_use]
729 pub fn with_kv_override(mut self, kv: KvOverride) -> Self {
730 self.kv_overrides.push(kv);
731 self
732 }
733
734 #[must_use]
738 pub fn with_tensor_type_override(mut self, ov: TensorTypeOverride) -> Self {
739 self.tt_overrides.push(ov);
740 self
741 }
742
743 #[must_use]
745 pub fn with_pruned_layer(mut self, layer: i32) -> Self {
746 self.prune_layers.push(layer);
747 self
748 }
749
750 #[must_use]
752 pub fn with_pruned_layers(mut self, layers: impl IntoIterator<Item = i32>) -> Self {
753 self.prune_layers.extend(layers);
754 self
755 }
756
757 pub(crate) fn to_raw(&self) -> RawQuantizeParamsGuard<'_> {
763 let imatrix_c: Vec<llama_cpp_sys_4::llama_model_imatrix_data> = self
767 .imatrix
768 .iter()
769 .map(|e| llama_cpp_sys_4::llama_model_imatrix_data {
770 name: e.name.as_ptr(),
771 data: e.data.as_ptr(),
772 size: e.data.len(),
773 })
774 .chain(std::iter::once(llama_cpp_sys_4::llama_model_imatrix_data {
775 name: null(),
776 data: null(),
777 size: 0,
778 }))
779 .collect();
780
781 let kv_c: Vec<llama_cpp_sys_4::llama_model_kv_override> = self
784 .kv_overrides
785 .iter()
786 .map(|kv| {
787 let mut raw = llama_cpp_sys_4::llama_model_kv_override {
788 key: [0; 128],
789 tag: 0,
790 __bindgen_anon_1: llama_cpp_sys_4::llama_model_kv_override__bindgen_ty_1 {
791 val_i64: 0,
792 },
793 };
794 let bytes = kv.key.to_bytes_with_nul();
796 let copy_len = bytes.len().min(128);
797 for (dst, &src) in raw.key.iter_mut().zip(bytes[..copy_len].iter()) {
798 *dst = src as std::os::raw::c_char;
799 }
800 match &kv.value {
801 KvOverrideValue::Int(v) => {
802 raw.tag = llama_cpp_sys_4::LLAMA_KV_OVERRIDE_TYPE_INT;
803 raw.__bindgen_anon_1 =
804 llama_cpp_sys_4::llama_model_kv_override__bindgen_ty_1 {
805 val_i64: *v,
806 };
807 }
808 KvOverrideValue::Float(v) => {
809 raw.tag = llama_cpp_sys_4::LLAMA_KV_OVERRIDE_TYPE_FLOAT;
810 raw.__bindgen_anon_1 =
811 llama_cpp_sys_4::llama_model_kv_override__bindgen_ty_1 {
812 val_f64: *v,
813 };
814 }
815 KvOverrideValue::Bool(v) => {
816 raw.tag = llama_cpp_sys_4::LLAMA_KV_OVERRIDE_TYPE_BOOL;
817 raw.__bindgen_anon_1 =
818 llama_cpp_sys_4::llama_model_kv_override__bindgen_ty_1 {
819 val_bool: *v,
820 };
821 }
822 KvOverrideValue::Str(s) => {
823 raw.tag = llama_cpp_sys_4::LLAMA_KV_OVERRIDE_TYPE_STR;
824 raw.__bindgen_anon_1 =
825 llama_cpp_sys_4::llama_model_kv_override__bindgen_ty_1 {
826 val_str: *s,
827 };
828 }
829 }
830 raw
831 })
832 .chain(std::iter::once(llama_cpp_sys_4::llama_model_kv_override {
833 key: [0; 128],
834 tag: 0,
835 __bindgen_anon_1: llama_cpp_sys_4::llama_model_kv_override__bindgen_ty_1 {
836 val_i64: 0,
837 },
838 }))
839 .collect();
840
841 let tt_c: Vec<llama_cpp_sys_4::llama_model_tensor_override> = self
844 .tt_overrides
845 .iter()
846 .map(|ov| llama_cpp_sys_4::llama_model_tensor_override {
847 pattern: ov.pattern.as_ptr(),
848 type_: ov.ty as llama_cpp_sys_4::ggml_type,
849 })
850 .chain(std::iter::once(
851 llama_cpp_sys_4::llama_model_tensor_override {
852 pattern: null(),
853 type_: llama_cpp_sys_4::GGML_TYPE_COUNT,
854 },
855 ))
856 .collect();
857
858 let mut prune_c = self.prune_layers.clone();
861 prune_c.push(-1);
862
863 let raw = llama_cpp_sys_4::llama_model_quantize_params {
865 nthread: self.nthread,
866 ftype: self.ftype as llama_cpp_sys_4::llama_ftype,
867 output_tensor_type: self
868 .output_tensor_type
869 .map(|t| t as llama_cpp_sys_4::ggml_type)
870 .unwrap_or(llama_cpp_sys_4::GGML_TYPE_COUNT),
871 token_embedding_type: self
872 .token_embedding_type
873 .map(|t| t as llama_cpp_sys_4::ggml_type)
874 .unwrap_or(llama_cpp_sys_4::GGML_TYPE_COUNT),
875 allow_requantize: self.allow_requantize,
876 quantize_output_tensor: self.quantize_output_tensor,
877 only_copy: self.only_copy,
878 pure_: self.pure,
879 keep_split: self.keep_split,
880 dry_run: self.dry_run,
881 imatrix: if self.imatrix.is_empty() {
882 null()
883 } else {
884 imatrix_c.as_ptr()
885 },
886 kv_overrides: if self.kv_overrides.is_empty() {
887 null()
888 } else {
889 kv_c.as_ptr()
890 },
891 tt_overrides: if self.tt_overrides.is_empty() {
892 null()
893 } else {
894 tt_c.as_ptr()
895 },
896 prune_layers: if self.prune_layers.is_empty() {
897 null()
898 } else {
899 prune_c.as_ptr()
900 },
901 };
902
903 RawQuantizeParamsGuard {
904 raw,
905 _imatrix_c: imatrix_c,
906 _kv_c: kv_c,
907 _tt_c: tt_c,
908 _prune_c: prune_c,
909 _marker: std::marker::PhantomData,
910 }
911 }
912}
913
914pub(crate) struct RawQuantizeParamsGuard<'a> {
917 pub(crate) raw: llama_cpp_sys_4::llama_model_quantize_params,
918 _imatrix_c: Vec<llama_cpp_sys_4::llama_model_imatrix_data>,
919 _kv_c: Vec<llama_cpp_sys_4::llama_model_kv_override>,
920 _tt_c: Vec<llama_cpp_sys_4::llama_model_tensor_override>,
921 _prune_c: Vec<i32>,
922 _marker: std::marker::PhantomData<&'a QuantizeParams>,
925}
926
/// Sets or clears the `LLAMA_ATTN_ROT_DISABLE` environment variable.
///
/// `true` sets the variable to `"1"`; `false` removes it.
///
/// NOTE(review): this mutates the process environment and nothing here
/// synchronises with other threads that read or write the environment.
/// Callers should presumably invoke it before spawning threads.
// `set_var` / `remove_var` are only `unsafe` in newer editions; where they
// are still safe the `unsafe` block would otherwise trigger the lint.
#[allow(unused_unsafe)]
pub fn set_attn_rot_disabled(disabled: bool) {
    const VAR: &str = "LLAMA_ATTN_ROT_DISABLE";

    // SAFETY: see the NOTE above; exclusive access to the environment is the
    // caller's responsibility.
    unsafe {
        if disabled {
            std::env::set_var(VAR, "1");
        } else {
            std::env::remove_var(VAR);
        }
    }
}
974
/// Reports whether `LLAMA_ATTN_ROT_DISABLE` is set to a non-zero integer.
///
/// Returns `false` when the variable is unset, is not valid Unicode, does not
/// parse as an `i32` (no trimming is done), or parses to `0`.
#[must_use]
pub fn attn_rot_disabled() -> bool {
    match std::env::var("LLAMA_ATTN_ROT_DISABLE") {
        Ok(raw) => matches!(raw.parse::<i32>(), Ok(flag) if flag != 0),
        Err(_) => false,
    }
}