oxillama_runtime/offload/policy.rs
1//! Offload policy configuration.
2//!
3//! [`OffloadPolicy`] is the declarative entry-point that callers use to
4//! express how aggressively the runtime should evict model weights to disk.
5//! It is stored in [`EngineConfig`][crate::engine::EngineConfig] and
6//! interpreted by [`InferenceEngine`][crate::engine::InferenceEngine] during
7//! model loading to build (or skip) the [`LayerPager`][super::LayerPager].
8
/// Configures which weights stay in RAM vs. get evicted to disk.
///
/// Every field is a plain integer or boolean, so the type derives
/// `PartialEq`/`Eq` and two policies can be compared directly with `==`.
///
/// # Variants
///
/// - `None` — all layer weights remain in RAM.  This is the default and matches
///   the classic llama.cpp in-memory-only behaviour.
/// - `Budget` — evict weights until the resident set fits within `ram_bytes`.
///   The LRU pager is activated; any tensor not pinned can be evicted.
/// - `PinnedHotSet` — like `Budget`, but the hot-set selected by the `pin_*`
///   fields (embeddings, output head, last N attention layers) is pinned and
///   never evicted.  Cold layers cycle in and out of RAM as they are needed.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub enum OffloadPolicy {
    /// All layer weights remain in RAM (default, current behaviour).
    #[default]
    None,

    /// Evict weights until resident set fits within `ram_bytes`.
    Budget {
        /// Maximum number of bytes allowed to be resident simultaneously.
        ram_bytes: u64,
    },

    /// Pinned hot-set: evict cold layers but keep the selected embeddings,
    /// output head, and the last N attention layers always resident.
    PinnedHotSet {
        /// Maximum number of bytes allowed to be resident simultaneously.
        ram_bytes: u64,
        /// Keep token-embedding table pinned (never evicted).
        pin_embeddings: bool,
        /// Keep output LM head pinned (never evicted).
        pin_output_head: bool,
        /// Number of trailing (deepest) attention layers to always keep resident.
        pin_last_n_layers: usize,
        /// How many layers ahead to prefetch in the background.
        ///
        /// Currently advisory — the pager does not spawn a prefetch thread
        /// automatically; callers can use this value to decide whether to
        /// pre-acquire tensors ahead of the current layer index.
        prefetch_n_ahead: usize,
    },
}
51
52impl OffloadPolicy {
53 /// Return the RAM budget in bytes, if any eviction limit is set.
54 ///
55 /// Returns `None` for [`OffloadPolicy::None`] (unlimited RAM usage).
56 pub fn ram_budget_bytes(&self) -> Option<u64> {
57 match self {
58 Self::None => None,
59 Self::Budget { ram_bytes } => Some(*ram_bytes),
60 Self::PinnedHotSet { ram_bytes, .. } => Some(*ram_bytes),
61 }
62 }
63
64 /// Returns `true` if offloading is disabled (i.e. the default in-RAM path).
65 pub fn is_disabled(&self) -> bool {
66 matches!(self, Self::None)
67 }
68}
69
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a `PinnedHotSet` policy with the given budget and fixed pin settings.
    fn hot_set(ram_bytes: u64) -> OffloadPolicy {
        OffloadPolicy::PinnedHotSet {
            ram_bytes,
            pin_embeddings: false,
            pin_output_head: true,
            pin_last_n_layers: 2,
            prefetch_n_ahead: 1,
        }
    }

    #[test]
    fn default_is_none() {
        assert!(matches!(OffloadPolicy::default(), OffloadPolicy::None));
    }

    #[test]
    fn none_has_no_budget() {
        assert_eq!(OffloadPolicy::None.ram_budget_bytes(), None);
    }

    #[test]
    fn budget_returns_bytes() {
        let policy = OffloadPolicy::Budget {
            ram_bytes: 1024 * 1024 * 1024,
        };
        assert_eq!(policy.ram_budget_bytes(), Some(1024 * 1024 * 1024));
    }

    #[test]
    fn pinned_hot_set_returns_bytes() {
        let policy = OffloadPolicy::PinnedHotSet {
            ram_bytes: 512 * 1024 * 1024,
            pin_embeddings: true,
            pin_output_head: true,
            pin_last_n_layers: 4,
            prefetch_n_ahead: 2,
        };
        assert_eq!(policy.ram_budget_bytes(), Some(512 * 1024 * 1024));
    }

    #[test]
    fn none_is_disabled() {
        assert!(OffloadPolicy::None.is_disabled());
        // Every budgeted variant counts as "offloading enabled".
        assert!(!OffloadPolicy::Budget { ram_bytes: 1 }.is_disabled());
        assert!(!hot_set(1).is_disabled());
    }

    #[test]
    fn policy_clone_is_independent() {
        let original = hot_set(100);
        let cloned = original.clone();

        // The clone must carry the same variant and field values; the Debug
        // rendering covers every field of the policy.
        assert_eq!(format!("{original:?}"), format!("{cloned:?}"));
        assert_eq!(cloned.ram_budget_bytes(), Some(100));

        // Dropping the original must leave the clone fully usable.
        drop(original);
        assert_eq!(cloned.ram_budget_bytes(), Some(100));
        assert!(!cloned.is_disabled());
    }
}