// mnemo_core/budget/planner.rs

1//! Recall budget planner (v0.4.1 P1-4).
2
3use serde::{Deserialize, Serialize};
4
5use super::models::{ModelId, lookup};
6
7/// Operator-tunable budget. Built from a [`ModelId`] via
8/// `ContextBudget::for_model` and overridden in
9/// [`ContextBudget::with_*`] methods.
10#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
11pub struct ContextBudget {
12    pub model: ModelId,
13    pub total_tokens: u32,
14    pub system_reserve: u32,
15    pub response_reserve: u32,
16    /// Fraction of the post-reserves remainder reserved for memory
17    /// injection. The rest is left to the conversation history.
18    /// Default 0.45 — gives ~45% to memory and 55% to history on a
19    /// budget that's already had system + response carved out.
20    pub mem_share: f32,
21}
22
23impl ContextBudget {
24    pub fn for_model(model: ModelId) -> Self {
25        let w = lookup(model);
26        Self {
27            model,
28            total_tokens: w.total_tokens,
29            system_reserve: w.system_reserve,
30            response_reserve: w.response_reserve,
31            mem_share: 0.45,
32        }
33    }
34
35    pub fn with_mem_share(mut self, share: f32) -> Self {
36        self.mem_share = share.clamp(0.0, 1.0);
37        self
38    }
39
40    /// Tokens available to the conversation + memory after reserves.
41    pub fn available(&self) -> u32 {
42        self.total_tokens
43            .saturating_sub(self.system_reserve)
44            .saturating_sub(self.response_reserve)
45    }
46
47    pub fn memory_budget(&self) -> u32 {
48        (self.available() as f32 * self.mem_share) as u32
49    }
50}
51
52/// Strategy when the history+memory plan would overflow.
53#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
54pub enum FallbackStrategy {
55    /// Drop the oldest history turns until the budget fits.
56    TruncateOldest,
57    /// Compress the oldest k turns into a single summary block.
58    SummarizeOldestK(u32),
59    /// Drop near-duplicate memories first (uses dedup_radius).
60    DropDuplicates,
61    /// No fallback; caller handles the overflow.
62    None,
63}
64
65#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
66pub struct RecallPlan {
67    pub k: u32,
68    /// Per-memory token budget. The recall path truncates each
69    /// returned memory to fit.
70    pub chunk_tokens: u32,
71    /// Cosine-similarity threshold above which two recalled
72    /// memories are considered near-duplicates and one is dropped.
73    pub dedup_radius: f32,
74    pub fallback: FallbackStrategy,
75}
76
77#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
78pub struct Query {
79    pub text: String,
80    pub estimated_tokens: u32,
81}
82
83pub fn plan_recall(b: &ContextBudget, history_tokens: u32, query: &Query) -> RecallPlan {
84    let avail = b.available();
85    let mem_budget = b.memory_budget();
86
87    // Sanity: history shouldn't be planned past the available
88    // budget. If it is, kick in TruncateOldest as the fallback.
89    let history_share = avail.saturating_sub(mem_budget);
90    let fallback = if history_tokens > history_share {
91        FallbackStrategy::TruncateOldest
92    } else if mem_budget > 100_000 {
93        // 1M-class window: aggressive dedup.
94        FallbackStrategy::DropDuplicates
95    } else {
96        FallbackStrategy::None
97    };
98
99    // Per-memory chunk budget. We aim for ~1000 tokens per chunk on
100    // 1M-class contexts, dropping to ~256 on 128k-class. Operators
101    // can override by post-processing the plan.
102    let chunk_tokens = if b.total_tokens >= 800_000 {
103        1024
104    } else if b.total_tokens >= 200_000 {
105        512
106    } else {
107        256
108    };
109
110    // k: how many memories the planner asks for. Heuristic: spend
111    // ~70% of mem_budget on memory bodies, the remaining 30% buffers
112    // dedup + chunk overhead.
113    let usable = (mem_budget as f32 * 0.7) as u32;
114    let k = if chunk_tokens == 0 {
115        0
116    } else {
117        (usable / chunk_tokens).clamp(1, 256)
118    };
119
120    // Lighter dedup on small windows (less risk of redundancy);
121    // tighter on large.
122    let dedup_radius = if b.total_tokens >= 800_000 {
123        0.92
124    } else {
125        0.88
126    };
127
128    let _ = query; // reserved for future query-aware planning
129
130    RecallPlan {
131        k,
132        chunk_tokens,
133        dedup_radius,
134        fallback,
135    }
136}
137
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a throwaway query carrying the given token estimate.
    fn query_of(tokens: u32) -> Query {
        Query {
            estimated_tokens: tokens,
            text: String::from("test"),
        }
    }

    #[test]
    fn deepseek_v4_yields_high_k_and_fits_under_mem_share() {
        let budget = ContextBudget::for_model(ModelId::DeepSeekV4_1m);
        let query = query_of(64);
        let plan = plan_recall(&budget, 200_000, &query);
        assert!(
            plan.k >= 64,
            "expected k>=64 for 1M context, got {}",
            plan.k
        );

        // Injected memory has to stay within 80% of the memory budget.
        let injected = plan.k * plan.chunk_tokens;
        let ceiling = budget.memory_budget() as f32 * 0.8;
        assert!(
            injected as f32 <= ceiling,
            "plan injects {injected} but mem_budget is {}",
            budget.memory_budget()
        );
    }

    #[test]
    fn small_window_drops_to_smaller_chunks() {
        let budget = ContextBudget::for_model(ModelId::DeepSeekV3_128k);
        let query = query_of(64);
        let plan = plan_recall(&budget, 8_000, &query);
        assert!(plan.chunk_tokens <= 512);
    }

    #[test]
    fn budget_does_not_overflow_total() {
        // Deterministic sweep (the planner is pure): for every model in
        // the table, with an empty history, the two reserves plus the
        // injected memory must stay within the total.
        let empty_query = query_of(0);
        for (m, _) in super::super::models::MODEL_TABLE {
            let budget = ContextBudget::for_model(*m);
            let plan = plan_recall(&budget, 0, &empty_query);
            let injected = plan.k * plan.chunk_tokens;
            let total = budget.system_reserve + budget.response_reserve + injected;
            assert!(
                total <= budget.total_tokens,
                "model {} overflows: total {} > {}",
                m.as_str(),
                total,
                budget.total_tokens
            );
        }
    }

    #[test]
    fn truncate_oldest_kicks_in_when_history_overflows() {
        let budget = ContextBudget::for_model(ModelId::Gpt5_1_128k);
        // A history larger than everything available must trigger the
        // truncation fallback.
        let oversized_history = budget.available() + 10_000;
        let plan = plan_recall(&budget, oversized_history, &query_of(1));
        assert_eq!(plan.fallback, FallbackStrategy::TruncateOldest);
    }

    #[test]
    fn dedup_radius_is_tighter_on_large_windows() {
        let small_budget = ContextBudget::for_model(ModelId::Gpt5_1_128k);
        let huge_budget = ContextBudget::for_model(ModelId::Gemini2_5Pro2m);
        let small = plan_recall(&small_budget, 1000, &query_of(1));
        let huge = plan_recall(&huge_budget, 1000, &query_of(1));
        assert!(huge.dedup_radius >= small.dedup_radius);
    }

    #[test]
    fn mem_share_is_clampable() {
        let budget = ContextBudget::for_model(ModelId::Claude3_7Sonnet1m).with_mem_share(2.0);
        assert!(budget.mem_share <= 1.0);
    }
}