oxillama_runtime/offload/
policy.rs

1//! Offload policy configuration.
2//!
3//! [`OffloadPolicy`] is the declarative entry-point that callers use to
4//! express how aggressively the runtime should evict model weights to disk.
5//! It is stored in [`EngineConfig`][crate::engine::EngineConfig] and
6//! interpreted by [`InferenceEngine`][crate::engine::InferenceEngine] during
7//! model loading to build (or skip) the [`LayerPager`][super::LayerPager].
8
/// Configures which weights stay in RAM vs. get evicted to disk.
///
/// Policies are plain data: they can be cloned, printed with `{:?}` and
/// compared with `==`, which makes it cheap to assert on a configuration or
/// to detect that a reloaded configuration differs from the active one.
///
/// # Variants
///
/// - `None` — all layer weights remain in RAM. This is the default and matches
///   the classic llama.cpp in-memory-only behaviour.
/// - `Budget` — evict weights until the resident set fits within `ram_bytes`.
///   The LRU pager is activated; any tensor not pinned can be evicted.
/// - `PinnedHotSet` — like `Budget` but a named hot-set (embeddings, output
///   head, last N attention layers) is pinned and never evicted. Cold layers
///   cycle in and out of RAM as they are needed.
// `Copy` is intentionally not derived even though every field is `Copy`:
// existing callers use `.clone()`, and adding `Copy` would turn those calls
// into `clone_on_copy` lint hits.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub enum OffloadPolicy {
    /// All layer weights remain in RAM; no eviction takes place (default).
    #[default]
    None,

    /// Evict weights until the resident set fits within `ram_bytes`.
    Budget {
        /// Maximum number of bytes allowed to be resident simultaneously.
        ram_bytes: u64,
    },

    /// Pinned hot-set: evict cold layers but keep embeddings, output head,
    /// and the last N attention layers always resident.
    PinnedHotSet {
        /// Maximum number of bytes allowed to be resident simultaneously.
        ram_bytes: u64,
        /// Keep the token-embedding table pinned (never evicted).
        pin_embeddings: bool,
        /// Keep the output LM head pinned (never evicted).
        pin_output_head: bool,
        /// Number of trailing (deepest) attention layers to always keep resident.
        pin_last_n_layers: usize,
        /// How many layers ahead to prefetch in the background.
        ///
        /// Currently advisory — the pager does not spawn a prefetch thread
        /// automatically; callers can use this value to decide whether to
        /// pre-acquire tensors ahead of the current layer index.
        prefetch_n_ahead: usize,
    },
}
51
52impl OffloadPolicy {
53    /// Return the RAM budget in bytes, if any eviction limit is set.
54    ///
55    /// Returns `None` for [`OffloadPolicy::None`] (unlimited RAM usage).
56    pub fn ram_budget_bytes(&self) -> Option<u64> {
57        match self {
58            Self::None => None,
59            Self::Budget { ram_bytes } => Some(*ram_bytes),
60            Self::PinnedHotSet { ram_bytes, .. } => Some(*ram_bytes),
61        }
62    }
63
64    /// Returns `true` if offloading is disabled (i.e. the default in-RAM path).
65    pub fn is_disabled(&self) -> bool {
66        matches!(self, Self::None)
67    }
68}
69
#[cfg(test)]
mod tests {
    use super::*;

    /// The derived `Default` must pick the no-offload variant.
    #[test]
    fn default_is_none() {
        assert!(matches!(OffloadPolicy::default(), OffloadPolicy::None));
    }

    /// The no-offload variant carries no RAM limit.
    #[test]
    fn none_has_no_budget() {
        assert_eq!(OffloadPolicy::None.ram_budget_bytes(), None);
    }

    /// `Budget` reports exactly the configured byte count.
    #[test]
    fn budget_returns_bytes() {
        let policy = OffloadPolicy::Budget {
            ram_bytes: 1024 * 1024 * 1024,
        };
        assert_eq!(policy.ram_budget_bytes(), Some(1024 * 1024 * 1024));
    }

    /// `PinnedHotSet` reports its byte cap regardless of the pinning fields.
    #[test]
    fn pinned_hot_set_returns_bytes() {
        let policy = OffloadPolicy::PinnedHotSet {
            ram_bytes: 512 * 1024 * 1024,
            pin_embeddings: true,
            pin_output_head: true,
            pin_last_n_layers: 4,
            prefetch_n_ahead: 2,
        };
        assert_eq!(policy.ram_budget_bytes(), Some(512 * 1024 * 1024));
    }

    /// Only the `None` variant counts as disabled; both limiting variants do not.
    #[test]
    fn none_is_disabled() {
        assert!(OffloadPolicy::None.is_disabled());
        assert!(!OffloadPolicy::Budget { ram_bytes: 1 }.is_disabled());
        assert!(!OffloadPolicy::PinnedHotSet {
            ram_bytes: 1,
            pin_embeddings: false,
            pin_output_head: false,
            pin_last_n_layers: 0,
            prefetch_n_ahead: 0,
        }
        .is_disabled());
    }

    /// A clone must carry every field of the original, and the original must
    /// remain usable afterwards.
    #[test]
    fn policy_clone_is_independent() {
        let original = OffloadPolicy::PinnedHotSet {
            ram_bytes: 100,
            pin_embeddings: false,
            pin_output_head: true,
            pin_last_n_layers: 2,
            prefetch_n_ahead: 1,
        };
        let cloned = original.clone();

        // Field-by-field check of the copy (pattern match, so no `PartialEq` needed).
        assert!(matches!(
            cloned,
            OffloadPolicy::PinnedHotSet {
                ram_bytes: 100,
                pin_embeddings: false,
                pin_output_head: true,
                pin_last_n_layers: 2,
                prefetch_n_ahead: 1,
            }
        ));

        // Both values stay usable and agree on their observable state.
        assert_eq!(original.ram_budget_bytes(), Some(100));
        assert_eq!(cloned.ram_budget_bytes(), Some(100));
        assert_eq!(format!("{original:?}"), format!("{cloned:?}"));
    }
}
oxillama_runtime/offload/policy.rs

oxillama_runtime/offload/
policy.rs