Skip to main content

llama_cpp_4/context/
params.rs

1//! A safe wrapper around `llama_context_params`.
2use std::fmt::Debug;
3use std::num::NonZeroU32;
4
5/// A rusty wrapper around `llama_context_type`.
6#[repr(u32)]
7#[derive(Copy, Clone, Debug, PartialEq, Eq)]
8pub enum LlamaContextType {
9    /// Default context (standard inference).
10    Default = llama_cpp_sys_4::LLAMA_CONTEXT_TYPE_DEFAULT,
11    /// Multi-token-prediction draft context, used as the draft side of speculative decoding.
12    Mtp = llama_cpp_sys_4::LLAMA_CONTEXT_TYPE_MTP,
13}
14
15impl From<llama_cpp_sys_4::llama_context_type> for LlamaContextType {
16    fn from(value: llama_cpp_sys_4::llama_context_type) -> Self {
17        match value {
18            llama_cpp_sys_4::LLAMA_CONTEXT_TYPE_MTP => Self::Mtp,
19            _ => Self::Default,
20        }
21    }
22}
23
24impl From<LlamaContextType> for llama_cpp_sys_4::llama_context_type {
25    fn from(value: LlamaContextType) -> Self {
26        value as Self
27    }
28}
29
/// A rusty wrapper around llama.cpp's `rope_scaling_type`.
#[repr(i8)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RopeScalingType {
    /// The scaling type is unspecified.
    Unspecified = -1,
    /// No scaling.
    None = 0,
    /// Linear scaling.
    Linear = 1,
    /// Yarn scaling.
    Yarn = 2,
}

/// Create a `RopeScalingType` from a `c_int`.
///
/// Returns [`RopeScalingType::Unspecified`] if the value is not recognized.
impl From<i32> for RopeScalingType {
    fn from(value: i32) -> Self {
        // Compare against the declared discriminants so the numbers live in one place only.
        match value {
            raw if raw == Self::None as i32 => Self::None,
            raw if raw == Self::Linear as i32 => Self::Linear,
            raw if raw == Self::Yarn as i32 => Self::Yarn,
            _ => Self::Unspecified,
        }
    }
}

/// Create a `c_int` from a `RopeScalingType`.
impl From<RopeScalingType> for i32 {
    fn from(value: RopeScalingType) -> Self {
        // Widening the `i8` discriminant is sign-preserving, so `Unspecified` stays `-1`.
        value as i32
    }
}
68
/// A rusty wrapper around `LLAMA_POOLING_TYPE`.
#[repr(i8)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LlamaPoolingType {
    /// The pooling type is unspecified.
    Unspecified = -1,
    /// No pooling.
    None = 0,
    /// Mean pooling.
    Mean = 1,
    /// CLS pooling.
    Cls = 2,
    /// Last pooling.
    Last = 3,
}

/// Create a `LlamaPoolingType` from a `c_int`.
///
/// Returns [`LlamaPoolingType::Unspecified`] if the value is not recognized.
impl From<i32> for LlamaPoolingType {
    fn from(value: i32) -> Self {
        // Compare against the declared discriminants so the numbers live in one place only.
        match value {
            raw if raw == Self::None as i32 => Self::None,
            raw if raw == Self::Mean as i32 => Self::Mean,
            raw if raw == Self::Cls as i32 => Self::Cls,
            raw if raw == Self::Last as i32 => Self::Last,
            _ => Self::Unspecified,
        }
    }
}

/// Create a `c_int` from a `LlamaPoolingType`.
impl From<LlamaPoolingType> for i32 {
    fn from(value: LlamaPoolingType) -> Self {
        // Widening the `i8` discriminant is sign-preserving, so `Unspecified` stays `-1`.
        value as i32
    }
}
111
112/// A safe wrapper around `llama_context_params`.
113///
114/// Generally this should be created with [`Default::default()`] and then modified with `with_*` methods.
115///
116/// # Examples
117///
118/// ```rust
119/// # use std::num::NonZeroU32;
120/// use llama_cpp_4::context::params::LlamaContextParams;
121///
122/// let ctx_params = LlamaContextParams::default()
123///     .with_n_ctx(NonZeroU32::new(2048));
124///
125/// assert_eq!(ctx_params.n_ctx(), NonZeroU32::new(2048));
126/// ```
127#[derive(Debug, Clone)]
128#[allow(
129    missing_docs,
130    clippy::struct_excessive_bools,
131    clippy::module_name_repetitions
132)]
133pub struct LlamaContextParams {
134    pub(crate) context_params: llama_cpp_sys_4::llama_context_params,
135    /// When `true`, the `TurboQuant` attention rotation (PR #21038) will be
136    /// disabled for any context created from these params.
137    pub(crate) attn_rot_disabled: bool,
138}
139
140/// SAFETY: we do not currently allow setting or reading the pointers that cause this to not be automatically send or sync.
141unsafe impl Send for LlamaContextParams {}
142unsafe impl Sync for LlamaContextParams {}
143
144impl LlamaContextParams {
145    /// Set the side of the context
146    ///
147    /// # Examples
148    ///
149    /// ```rust
150    /// # use std::num::NonZeroU32;
151    /// use llama_cpp_4::context::params::LlamaContextParams;
152    /// let params = LlamaContextParams::default();
153    /// let params = params.with_n_ctx(NonZeroU32::new(2048));
154    /// assert_eq!(params.n_ctx(), NonZeroU32::new(2048));
155    /// ```
156    #[must_use]
157    pub fn with_n_ctx(mut self, n_ctx: Option<NonZeroU32>) -> Self {
158        self.context_params.n_ctx = n_ctx.map_or(0, std::num::NonZeroU32::get);
159        self
160    }
161
162    /// Get the size of the context.
163    ///
164    /// [`None`] if the context size is specified by the model and not the context.
165    ///
166    /// # Examples
167    ///
168    /// ```rust
169    /// let params = llama_cpp_4::context::params::LlamaContextParams::default();
170    /// assert_eq!(params.n_ctx(), std::num::NonZeroU32::new(512));
171    #[must_use]
172    pub fn n_ctx(&self) -> Option<NonZeroU32> {
173        NonZeroU32::new(self.context_params.n_ctx)
174    }
175
176    /// Set the `n_batch`
177    ///
178    /// # Examples
179    ///
180    /// ```rust
181    /// # use std::num::NonZeroU32;
182    /// use llama_cpp_4::context::params::LlamaContextParams;
183    /// let params = LlamaContextParams::default()
184    ///     .with_n_batch(2048);
185    /// assert_eq!(params.n_batch(), 2048);
186    /// ```
187    #[must_use]
188    pub fn with_n_batch(mut self, n_batch: u32) -> Self {
189        self.context_params.n_batch = n_batch;
190        self
191    }
192
193    /// Get the `n_batch`
194    ///
195    /// # Examples
196    ///
197    /// ```rust
198    /// use llama_cpp_4::context::params::LlamaContextParams;
199    /// let params = LlamaContextParams::default();
200    /// assert_eq!(params.n_batch(), 2048);
201    /// ```
202    #[must_use]
203    pub fn n_batch(&self) -> u32 {
204        self.context_params.n_batch
205    }
206
207    /// Set the `n_ubatch`
208    ///
209    /// # Examples
210    ///
211    /// ```rust
212    /// # use std::num::NonZeroU32;
213    /// use llama_cpp_4::context::params::LlamaContextParams;
214    /// let params = LlamaContextParams::default()
215    ///     .with_n_ubatch(512);
216    /// assert_eq!(params.n_ubatch(), 512);
217    /// ```
218    #[must_use]
219    pub fn with_n_ubatch(mut self, n_ubatch: u32) -> Self {
220        self.context_params.n_ubatch = n_ubatch;
221        self
222    }
223
224    /// Get the `n_ubatch`
225    ///
226    /// # Examples
227    ///
228    /// ```rust
229    /// use llama_cpp_4::context::params::LlamaContextParams;
230    /// let params = LlamaContextParams::default();
231    /// assert_eq!(params.n_ubatch(), 512);
232    /// ```
233    #[must_use]
234    pub fn n_ubatch(&self) -> u32 {
235        self.context_params.n_ubatch
236    }
237
238    /// Set the context type (e.g. [`LlamaContextType::Mtp`] to load this context as a
239    /// multi-token-prediction draft head used by upstream's `draft-mtp` speculative decoder).
240    #[must_use]
241    pub fn with_ctx_type(mut self, ctx_type: LlamaContextType) -> Self {
242        self.context_params.ctx_type = ctx_type.into();
243        self
244    }
245
246    /// Get the configured context type.
247    #[must_use]
248    pub fn ctx_type(&self) -> LlamaContextType {
249        self.context_params.ctx_type.into()
250    }
251
252    /// Set the number of recurrent-state snapshots per sequence used for MTP rollback.
253    #[must_use]
254    pub fn with_n_rs_seq(mut self, n_rs_seq: u32) -> Self {
255        self.context_params.n_rs_seq = n_rs_seq;
256        self
257    }
258
259    /// Get the number of recurrent-state snapshots per sequence used for MTP rollback.
260    #[must_use]
261    pub fn n_rs_seq(&self) -> u32 {
262        self.context_params.n_rs_seq
263    }
264
265    /// Set the `flash_attention` parameter
266    ///
267    /// # Examples
268    ///
269    /// ```rust
270    /// use llama_cpp_4::context::params::LlamaContextParams;
271    /// let params = LlamaContextParams::default()
272    ///     .with_flash_attention(true);
273    /// assert_eq!(params.flash_attention(), true);
274    /// ```
275    #[must_use]
276    pub fn with_flash_attention(mut self, enabled: bool) -> Self {
277        self.context_params.flash_attn_type = if enabled {
278            llama_cpp_sys_4::LLAMA_FLASH_ATTN_TYPE_ENABLED
279        } else {
280            llama_cpp_sys_4::LLAMA_FLASH_ATTN_TYPE_DISABLED
281        };
282        self
283    }
284
285    /// Get the `flash_attention` parameter
286    ///
287    /// # Examples
288    ///
289    /// ```rust
290    /// use llama_cpp_4::context::params::LlamaContextParams;
291    /// let params = LlamaContextParams::default();
292    /// assert_eq!(params.flash_attention(), false);
293    /// ```
294    #[must_use]
295    pub fn flash_attention(&self) -> bool {
296        self.context_params.flash_attn_type == llama_cpp_sys_4::LLAMA_FLASH_ATTN_TYPE_ENABLED
297    }
298
299    /// Set the `offload_kqv` parameter to control offloading KV cache & KQV ops to GPU
300    ///
301    /// # Examples
302    ///
303    /// ```rust
304    /// use llama_cpp_4::context::params::LlamaContextParams;
305    /// let params = LlamaContextParams::default()
306    ///     .with_offload_kqv(false);
307    /// assert_eq!(params.offload_kqv(), false);
308    /// ```
309    #[must_use]
310    pub fn with_offload_kqv(mut self, enabled: bool) -> Self {
311        self.context_params.offload_kqv = enabled;
312        self
313    }
314
315    /// Get the `offload_kqv` parameter
316    ///
317    /// # Examples
318    ///
319    /// ```rust
320    /// use llama_cpp_4::context::params::LlamaContextParams;
321    /// let params = LlamaContextParams::default();
322    /// assert_eq!(params.offload_kqv(), true);
323    /// ```
324    #[must_use]
325    pub fn offload_kqv(&self) -> bool {
326        self.context_params.offload_kqv
327    }
328
329    /// Set the type of rope scaling.
330    ///
331    /// # Examples
332    ///
333    /// ```rust
334    /// use llama_cpp_4::context::params::{LlamaContextParams, RopeScalingType};
335    /// let params = LlamaContextParams::default()
336    ///     .with_rope_scaling_type(RopeScalingType::Linear);
337    /// assert_eq!(params.rope_scaling_type(), RopeScalingType::Linear);
338    /// ```
339    #[must_use]
340    pub fn with_rope_scaling_type(mut self, rope_scaling_type: RopeScalingType) -> Self {
341        self.context_params.rope_scaling_type = i32::from(rope_scaling_type);
342        self
343    }
344
345    /// Get the type of rope scaling.
346    ///
347    /// # Examples
348    ///
349    /// ```rust
350    /// let params = llama_cpp_4::context::params::LlamaContextParams::default();
351    /// assert_eq!(params.rope_scaling_type(), llama_cpp_4::context::params::RopeScalingType::Unspecified);
352    /// ```
353    #[must_use]
354    pub fn rope_scaling_type(&self) -> RopeScalingType {
355        RopeScalingType::from(self.context_params.rope_scaling_type)
356    }
357
358    /// Set the rope frequency base.
359    ///
360    /// # Examples
361    ///
362    /// ```rust
363    /// use llama_cpp_4::context::params::LlamaContextParams;
364    /// let params = LlamaContextParams::default()
365    ///    .with_rope_freq_base(0.5);
366    /// assert_eq!(params.rope_freq_base(), 0.5);
367    /// ```
368    #[must_use]
369    pub fn with_rope_freq_base(mut self, rope_freq_base: f32) -> Self {
370        self.context_params.rope_freq_base = rope_freq_base;
371        self
372    }
373
374    /// Get the rope frequency base.
375    ///
376    /// # Examples
377    ///
378    /// ```rust
379    /// let params = llama_cpp_4::context::params::LlamaContextParams::default();
380    /// assert_eq!(params.rope_freq_base(), 0.0);
381    /// ```
382    #[must_use]
383    pub fn rope_freq_base(&self) -> f32 {
384        self.context_params.rope_freq_base
385    }
386
387    /// Set the rope frequency scale.
388    ///
389    /// # Examples
390    ///
391    /// ```rust
392    /// use llama_cpp_4::context::params::LlamaContextParams;
393    /// let params = LlamaContextParams::default()
394    ///   .with_rope_freq_scale(0.5);
395    /// assert_eq!(params.rope_freq_scale(), 0.5);
396    /// ```
397    #[must_use]
398    pub fn with_rope_freq_scale(mut self, rope_freq_scale: f32) -> Self {
399        self.context_params.rope_freq_scale = rope_freq_scale;
400        self
401    }
402
403    /// Get the rope frequency scale.
404    ///
405    /// # Examples
406    ///
407    /// ```rust
408    /// let params = llama_cpp_4::context::params::LlamaContextParams::default();
409    /// assert_eq!(params.rope_freq_scale(), 0.0);
410    /// ```
411    #[must_use]
412    pub fn rope_freq_scale(&self) -> f32 {
413        self.context_params.rope_freq_scale
414    }
415
416    /// Get the number of threads.
417    ///
418    /// # Examples
419    ///
420    /// ```rust
421    /// let params = llama_cpp_4::context::params::LlamaContextParams::default();
422    /// assert_eq!(params.n_threads(), 4);
423    /// ```
424    #[must_use]
425    pub fn n_threads(&self) -> i32 {
426        self.context_params.n_threads
427    }
428
429    /// Get the number of threads allocated for batches.
430    ///
431    /// # Examples
432    ///
433    /// ```rust
434    /// let params = llama_cpp_4::context::params::LlamaContextParams::default();
435    /// assert_eq!(params.n_threads_batch(), 4);
436    /// ```
437    #[must_use]
438    pub fn n_threads_batch(&self) -> i32 {
439        self.context_params.n_threads_batch
440    }
441
442    /// Set the number of threads.
443    ///
444    /// # Examples
445    ///
446    /// ```rust
447    /// use llama_cpp_4::context::params::LlamaContextParams;
448    /// let params = LlamaContextParams::default()
449    ///    .with_n_threads(8);
450    /// assert_eq!(params.n_threads(), 8);
451    /// ```
452    #[must_use]
453    pub fn with_n_threads(mut self, n_threads: i32) -> Self {
454        self.context_params.n_threads = n_threads;
455        self
456    }
457
458    /// Set the number of threads allocated for batches.
459    ///
460    /// # Examples
461    ///
462    /// ```rust
463    /// use llama_cpp_4::context::params::LlamaContextParams;
464    /// let params = LlamaContextParams::default()
465    ///    .with_n_threads_batch(8);
466    /// assert_eq!(params.n_threads_batch(), 8);
467    /// ```
468    #[must_use]
469    pub fn with_n_threads_batch(mut self, n_threads: i32) -> Self {
470        self.context_params.n_threads_batch = n_threads;
471        self
472    }
473
474    /// Check whether embeddings are enabled
475    ///
476    /// # Examples
477    ///
478    /// ```rust
479    /// let params = llama_cpp_4::context::params::LlamaContextParams::default();
480    /// assert!(!params.embeddings());
481    /// ```
482    #[must_use]
483    pub fn embeddings(&self) -> bool {
484        self.context_params.embeddings
485    }
486
487    /// Enable the use of embeddings
488    ///
489    /// # Examples
490    ///
491    /// ```rust
492    /// use llama_cpp_4::context::params::LlamaContextParams;
493    /// let params = LlamaContextParams::default()
494    ///    .with_embeddings(true);
495    /// assert!(params.embeddings());
496    /// ```
497    #[must_use]
498    pub fn with_embeddings(mut self, embedding: bool) -> Self {
499        self.context_params.embeddings = embedding;
500        self
501    }
502
503    /// Set the evaluation callback.
504    ///
505    /// # Examples
506    ///
507    /// ```no_run
508    /// extern "C" fn cb_eval_fn(
509    ///     t: *mut llama_cpp_sys_4::ggml_tensor,
510    ///     ask: bool,
511    ///     user_data: *mut std::ffi::c_void,
512    /// ) -> bool {
513    ///     false
514    /// }
515    ///
516    /// use llama_cpp_4::context::params::LlamaContextParams;
517    /// let params = LlamaContextParams::default().with_cb_eval(Some(cb_eval_fn));
518    /// ```
519    #[must_use]
520    pub fn with_cb_eval(
521        mut self,
522        cb_eval: llama_cpp_sys_4::ggml_backend_sched_eval_callback,
523    ) -> Self {
524        self.context_params.cb_eval = cb_eval;
525        self
526    }
527
528    /// Set the evaluation callback user data.
529    ///
530    /// # Examples
531    ///
532    /// ```no_run
533    /// use llama_cpp_4::context::params::LlamaContextParams;
534    /// let params = LlamaContextParams::default();
535    /// let user_data = std::ptr::null_mut();
536    /// let params = params.with_cb_eval_user_data(user_data);
537    /// ```
538    #[must_use]
539    pub fn with_cb_eval_user_data(mut self, cb_eval_user_data: *mut std::ffi::c_void) -> Self {
540        self.context_params.cb_eval_user_data = cb_eval_user_data;
541        self
542    }
543
544    /// Attach a [`TensorCapture`](super::tensor_capture::TensorCapture) to
545    /// intercept intermediate tensor outputs during `decode()`.
546    ///
547    /// This sets up the `cb_eval` callback to capture tensors matching the
548    /// capture's filter (e.g. specific layer outputs). After `decode()` the
549    /// captured data can be read from the `TensorCapture`.
550    ///
551    /// # Example
552    ///
553    /// ```rust,ignore
554    /// use llama_cpp_4::context::params::LlamaContextParams;
555    /// use llama_cpp_4::context::tensor_capture::TensorCapture;
556    ///
557    /// let mut capture = TensorCapture::for_layers(&[13, 20, 27]);
558    /// let ctx_params = LlamaContextParams::default()
559    ///     .with_embeddings(true)
560    ///     .with_tensor_capture(&mut capture);
561    /// ```
562    #[must_use]
563    pub fn with_tensor_capture(self, capture: &mut super::tensor_capture::TensorCapture) -> Self {
564        self.with_cb_eval(Some(super::tensor_capture::tensor_capture_callback))
565            .with_cb_eval_user_data(
566                std::ptr::from_mut::<super::tensor_capture::TensorCapture>(capture)
567                    .cast::<std::ffi::c_void>(),
568            )
569    }
570
571    /// Set the storage type for the **K** (key) KV cache tensors.
572    ///
573    /// The default is `GgmlType::F16`.  Quantized types like `GgmlType::Q5_0`
574    /// or `GgmlType::Q4_0` reduce VRAM usage significantly; combining them with
575    /// `TurboQuant` attention rotation (the default) keeps quality high.
576    ///
577    /// # Examples
578    ///
579    /// ```rust
580    /// use llama_cpp_4::context::params::LlamaContextParams;
581    /// use llama_cpp_4::quantize::GgmlType;
582    /// let params = LlamaContextParams::default()
583    ///     .with_cache_type_k(GgmlType::Q5_0);
584    /// ```
585    #[must_use]
586    pub fn with_cache_type_k(mut self, ty: crate::quantize::GgmlType) -> Self {
587        self.context_params.type_k = ty as llama_cpp_sys_4::ggml_type;
588        self
589    }
590
591    /// Get the K-cache storage type.
592    #[must_use]
593    pub fn cache_type_k(&self) -> llama_cpp_sys_4::ggml_type {
594        self.context_params.type_k
595    }
596
597    /// Set the storage type for the **V** (value) KV cache tensors.
598    ///
599    /// See [`with_cache_type_k`](Self::with_cache_type_k) for details.
600    ///
601    /// # Examples
602    ///
603    /// ```rust
604    /// use llama_cpp_4::context::params::LlamaContextParams;
605    /// use llama_cpp_4::quantize::GgmlType;
606    /// let params = LlamaContextParams::default()
607    ///     .with_cache_type_v(GgmlType::Q5_0);
608    /// ```
609    #[must_use]
610    pub fn with_cache_type_v(mut self, ty: crate::quantize::GgmlType) -> Self {
611        self.context_params.type_v = ty as llama_cpp_sys_4::ggml_type;
612        self
613    }
614
615    /// Get the V-cache storage type.
616    #[must_use]
617    pub fn cache_type_v(&self) -> llama_cpp_sys_4::ggml_type {
618        self.context_params.type_v
619    }
620
621    /// Control the `TurboQuant` attention-rotation feature (llama.cpp PR #21038).
622    ///
623    /// By default, llama.cpp applies a Hadamard rotation to Q/K/V tensors
624    /// before writing them into the KV cache.  This significantly improves
625    /// quantized KV-cache quality at near-zero overhead, and is enabled
626    /// automatically for models whose head dimension is a power of two.
627    ///
628    /// Set `disabled = true` to opt out (equivalent to `LLAMA_ATTN_ROT_DISABLE=1`).
629    /// The env-var is applied just before the context is created and restored
630    /// afterwards, so this is safe to call from a single thread.
631    ///
632    /// # Examples
633    ///
634    /// ```rust
635    /// use llama_cpp_4::context::params::LlamaContextParams;
636    /// // Disable rotation for this context only:
637    /// let params = LlamaContextParams::default().with_attn_rot_disabled(true);
638    /// assert!(params.attn_rot_disabled());
639    /// ```
640    #[must_use]
641    pub fn with_attn_rot_disabled(mut self, disabled: bool) -> Self {
642        self.attn_rot_disabled = disabled;
643        self
644    }
645
646    /// Returns `true` if `TurboQuant` attention rotation is disabled for this context.
647    ///
648    /// ```rust
649    /// let params = llama_cpp_4::context::params::LlamaContextParams::default();
650    /// assert!(!params.attn_rot_disabled());
651    /// ```
652    #[must_use]
653    pub fn attn_rot_disabled(&self) -> bool {
654        self.attn_rot_disabled
655    }
656
657    /// Set the type of pooling.
658    ///
659    /// # Examples
660    ///
661    /// ```rust
662    /// use llama_cpp_4::context::params::{LlamaContextParams, LlamaPoolingType};
663    /// let params = LlamaContextParams::default()
664    ///     .with_pooling_type(LlamaPoolingType::Last);
665    /// assert_eq!(params.pooling_type(), LlamaPoolingType::Last);
666    /// ```
667    #[must_use]
668    pub fn with_pooling_type(mut self, pooling_type: LlamaPoolingType) -> Self {
669        self.context_params.pooling_type = i32::from(pooling_type);
670        self
671    }
672
673    /// Get the type of pooling.
674    ///
675    /// # Examples
676    ///
677    /// ```rust
678    /// let params = llama_cpp_4::context::params::LlamaContextParams::default();
679    /// assert_eq!(params.pooling_type(), llama_cpp_4::context::params::LlamaPoolingType::Unspecified);
680    /// ```
681    #[must_use]
682    pub fn pooling_type(&self) -> LlamaPoolingType {
683        LlamaPoolingType::from(self.context_params.pooling_type)
684    }
685}
686
687/// Default parameters for `LlamaContext`. (as defined in llama.cpp by `llama_context_default_params`)
688/// ```
689/// # use std::num::NonZeroU32;
690/// use llama_cpp_4::context::params::{LlamaContextParams, RopeScalingType};
691/// let params = LlamaContextParams::default();
692/// assert_eq!(params.n_ctx(), NonZeroU32::new(512), "n_ctx should be 512");
693/// assert_eq!(params.rope_scaling_type(), RopeScalingType::Unspecified);
694/// ```
695impl Default for LlamaContextParams {
696    fn default() -> Self {
697        let context_params = unsafe { llama_cpp_sys_4::llama_context_default_params() };
698        Self {
699            context_params,
700            attn_rot_disabled: false,
701        }
702    }
703}