trusty-mpm 0.9.0

trusty-mpm: unified multi-agent orchestration platform (core, daemon, CLI, TUI, Telegram)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
//! The SM rolling-context engine (DOC-14 §7.2 / §7.5).
//!
//! Why: this is the orchestration layer that ties the §7 pieces together for one
//! conversation: it appends rounds, maintains the running token estimate, decides
//! WHEN to compact (round-count primary trigger + token-budget safety valve,
//! §7.2), folds evicted rounds into `compressed_context` via the injected
//! provider (§7.3), re-summarises an oversized compressed block (§7.6), persists
//! the state file after every mutation (§7.4), and assembles the working prompt
//! in the exact §7.5 order. Keeping the *policy* here — separate from the data
//! model, the compaction call, and the persistence — means each concern stays
//! small, testable, and under the SLOC cap.
//! What: [`SmContextEngine`] owns one [`SmConversation`], a `conv_id`, the
//! relevant slices of [`SmInferenceConfig`]/[`SmRoundsConfig`], and a
//! [`ConversationStore`]. [`SmContextEngine::record`] is the async entry point
//! that adds a round and runs compaction-to-convergence through a `&dyn
//! LlmProvider`. [`SmContextEngine::assemble_working_prompt`] produces the §7.5
//! ordered messages.
//! Test: `engine_tests.rs` — window eviction, evicted-content survival, the
//! goal/session-id golden, the token-budget trigger, default-vs-override model
//! selection, per-round persistence, and §7.5 assembly order.

use crate::core::sm::config::{SmInferenceConfig, SmRoundsConfig};
use crate::core::sm::providers::{ChatMessage, LlmProvider, SmLlmError};

use super::compaction::{estimate_tokens, fold_rounds, resummarise};
use super::model::{Round, SmConversation, ToolTrace};
use super::persist::{ConversationStore, ConversationStoreError};

/// Role string for a context/system-style message in the assembled prompt.
///
/// Used for the single consolidated `system` message that
/// `SmContextEngine::assemble_working_prompt` emits at the head of the prompt.
const ROLE_SYSTEM: &str = "system";
/// Role string for an operator (user) turn.
///
/// Used both for the operator half of every replayed verbatim round and for the
/// current operator message that ends the assembled prompt.
const ROLE_USER: &str = "user";
/// Role string for an SM (assistant) turn.
///
/// Used for the SM half of every replayed verbatim round.
const ROLE_ASSISTANT: &str = "assistant";

/// Errors the context engine can surface (library → `thiserror`).
///
/// Why: `record` performs two fallible kinds of work — the compaction LLM call
/// (provider) and the state-file persistence (store) — plus model resolution. A
/// typed enum lets callers (SM-7) distinguish "compaction degraded, keep serving"
/// from "disk failed" without string matching, and keeps SM-5 panic-free.
/// What: wraps [`SmLlmError`] (resolution + compaction) and
/// [`ConversationStoreError`] (persistence). Each variant carries `#[from]`, so
/// `thiserror` derives the matching `From` impl and the engine's `?` operators
/// convert the underlying error into this type without explicit mapping. The
/// wrapped error's own message is appended to the variant's prefix via `{0}`.
/// Test: `record_surfaces_provider_error`, `record_surfaces_store_error` paths in
/// `engine_tests.rs`.
#[derive(Debug, thiserror::Error)]
pub enum SmContextError {
    /// A compaction / re-summarisation LLM call (or model resolution) failed.
    #[error("context compaction failed: {0}")]
    Compaction(#[from] SmLlmError),

    /// Persisting the conversation state file failed.
    #[error("context persistence failed: {0}")]
    Persist(#[from] ConversationStoreError),
}

/// The rolling auto-compaction context engine for ONE conversation (§7).
///
/// Why: one engine instance == one `conv_id`'s live context (SM-7 keeps a map of
/// these). Holding the config slices and the store on the engine means `record`
/// and `assemble_working_prompt` take only their per-call inputs (the provider,
/// the model, the current message), which keeps the call sites — and the tests —
/// simple.
/// What: owns the mutable [`SmConversation`], the `conv_id`, the rolling-window
/// size, the token budget + compressed-context cap (from [`SmInferenceConfig`]),
/// and a [`ConversationStore`]. The compaction provider and resolved model are
/// passed to [`record`](Self::record) per call (dependency injection) so the
/// engine never constructs a concrete provider. All fields are private; the
/// numeric limits are fixed when the engine is opened and are not changed by any
/// method in this file.
/// Test: every test in `engine_tests.rs`.
pub struct SmContextEngine {
    /// Stable conversation id (drives the state-file name).
    conv_id: String,
    /// The live conversation state (§7.1).
    conversation: SmConversation,
    /// Verbatim rolling-window size, `> N` triggers compaction (§7.2a).
    ///
    /// The comparison is strict: exactly `window` verbatim rounds do not trigger.
    window: usize,
    /// Token safety-valve budget; estimate `>` this triggers compaction (§7.2b).
    ///
    /// The comparison is strict: an estimate equal to the budget does not trigger.
    token_budget: usize,
    /// Max tokens the compressed block may hold before re-summarisation (§7.6).
    ///
    /// A value of `0` disables the cap check performed after each fold.
    compressed_max_tokens: usize,
    /// Atomic state-file store (§7.4).
    store: ConversationStore,
}

impl SmContextEngine {
    /// Open the engine for `conv_id`, resuming any state persisted under
    /// `data_root`.
    ///
    /// Why: context has to survive a daemon restart (§7.4), so opening an engine
    /// always goes through the store first. `data_root` is a parameter so tests
    /// can point the store at a tempdir.
    /// What: builds a [`ConversationStore`] rooted at `data_root`, asks it for the
    /// conversation belonging to `conv_id` (the store is expected to hand back an
    /// empty conversation for an unknown id), copies the window size, token
    /// budget, and compressed-context cap out of the two config slices, and then
    /// recomputes `token_estimate` from the loaded content so a stale cached
    /// number on disk cannot trip a spurious compaction on the first `record`.
    /// A failed load is returned as [`SmContextError::Persist`].
    /// Test: `engine_resumes_persisted_conversation`,
    /// `new_conversation_starts_empty`,
    /// `open_recomputes_stale_token_estimate`,
    /// `loaded_stale_estimate_does_not_spuriously_compact`.
    pub fn open(
        conv_id: impl Into<String>,
        data_root: impl Into<std::path::PathBuf>,
        inference: &SmInferenceConfig,
        rounds: &SmRoundsConfig,
    ) -> Result<Self, SmContextError> {
        let id: String = conv_id.into();
        let state_store = ConversationStore::new(data_root);
        let resumed = state_store.load(&id)?;

        // Snapshot the limits; the engine keeps its own copies of these numbers.
        let window = rounds.window as usize;
        let token_budget = inference.context_token_budget as usize;
        let compressed_max_tokens = inference.compressed_context_max_tokens as usize;

        let mut this = Self {
            conv_id: id,
            conversation: resumed,
            window,
            token_budget,
            compressed_max_tokens,
            store: state_store,
        };

        // §7.2: never trust the persisted estimate — it was computed when the
        // file was written. Derive it again from the content just loaded.
        this.recompute_estimate();
        Ok(this)
    }

    /// Read-only access to the live conversation (for §7.5 / `sm.context.get`).
    ///
    /// Why: SM-7's `sm.context.get` endpoint and the tests need to inspect the
    /// compressed block, window, counters, and estimate without mutating them.
    /// What: returns a shared reference to the inner [`SmConversation`].
    /// Test: used throughout `engine_tests.rs`.
    pub fn conversation(&self) -> &SmConversation {
        &self.conversation
    }

    /// Id of the conversation this engine serves.
    ///
    /// Why: callers keep engines in a map keyed by id, so the engine exposes the
    /// id it was opened with.
    /// What: borrows the stored `conv_id` as a string slice.
    /// Test: trivial; used by callers.
    pub fn conv_id(&self) -> &str {
        self.conv_id.as_str()
    }

    /// Append a finished round and compact until the §7.2 triggers clear.
    ///
    /// Why: this is the engine's main entry point. Every completed turn lands
    /// here, and it is the only place that decides when older rounds leave the
    /// verbatim window. The provider and the resolved `compaction_model` are
    /// injected per call, so production can pass the summary-tier provider and
    /// tests a mock; nothing here constructs a concrete provider.
    /// What: builds a [`Round`] from `(user, assistant, ts, tool_calls)`, pushes
    /// it onto the back of the window, bumps `total_rounds`, and recomputes the
    /// token estimate. It then folds the oldest round via `compact_once` for as
    /// long as a trigger holds AND more than one verbatim round remains, runs the
    /// bounded `converge_within_budget` pass for the case where the last retained
    /// round is itself over budget, and finally saves the state file. The return
    /// value counts folds performed by the eviction loop only; a round folded by
    /// the convergence pass is not included.
    /// Errors: a provider failure during folding or convergence is returned as
    /// [`SmContextError::Compaction`], a failed save as
    /// [`SmContextError::Persist`]. On a compaction error the function returns
    /// before saving, so the new round is already appended in memory but has not
    /// been written to disk.
    /// Test: `window_evicts_oldest_round`, `evicted_content_lands_in_summary`,
    /// `golden_ids_survive_compaction`, `token_budget_triggers_compaction`,
    /// `default_compaction_uses_summary_model`,
    /// `compaction_model_override_is_honored`, `state_file_written_each_record`.
    pub async fn record(
        &mut self,
        provider: &dyn LlmProvider,
        compaction_model: &str,
        user: impl Into<String>,
        assistant: impl Into<String>,
        ts: chrono::DateTime<chrono::Utc>,
        tool_calls: Vec<ToolTrace>,
    ) -> Result<usize, SmContextError> {
        self.conversation
            .recent_rounds
            .push_back(Round::new(user, assistant, ts, tool_calls));
        self.conversation.total_rounds += 1;
        self.recompute_estimate();

        // Eviction loop. The newest round is never folded here: stopping at one
        // verbatim round keeps the window from emptying just because a single
        // round is huge.
        let mut folded_count: usize = 0;
        loop {
            let has_spare_round = self.conversation.recent_rounds.len() > 1;
            if !(self.should_compact() && has_spare_round) {
                break;
            }
            self.compact_once(provider, compaction_model).await?;
            folded_count += 1;
        }

        // §7.2/§7.6: the loop above can stop with a trigger still firing (one
        // retained round plus the compressed block over budget). Let the bounded
        // convergence pass shrink the block, or fold that last round, before
        // anything is written.
        self.converge_within_budget(provider, compaction_model)
            .await?;

        // §7.4: persist once, after all in-memory mutation for this call.
        self.store.save(&self.conv_id, &self.conversation)?;
        Ok(folded_count)
    }

    /// Append a finished round and persist it, with no LLM involvement (§7.4).
    ///
    /// Why: a round that was already returned to the operator must not be lost
    /// just because no compaction provider could be resolved.
    /// [`record`](Self::record) needs a provider for its fold call, so this is the
    /// fallback for that situation. The price is that the verbatim window may sit
    /// above its soft limits until a later compaction-capable turn folds it down;
    /// an over-budget context is preferable to a missing round.
    /// What: builds a [`Round`] from `(user, assistant, ts, tool_calls)`, appends
    /// it, increments `total_rounds`, recomputes `token_estimate`, and saves the
    /// state file. It does not evict, fold, or re-summarise, and therefore never
    /// calls a provider. Returns `()`; a save failure is returned as
    /// [`SmContextError::Persist`].
    /// Test: `record_without_compaction_persists_round_verbatim` in `engine_tests.rs`
    /// (round lands verbatim, reloads from disk) and
    /// `chat_records_round_when_no_provider_for_compaction` in `chat_tests.rs`
    /// (the SM chat turn still persists when both tiers fail to resolve).
    pub fn record_without_compaction(
        &mut self,
        user: impl Into<String>,
        assistant: impl Into<String>,
        ts: chrono::DateTime<chrono::Utc>,
        tool_calls: Vec<ToolTrace>,
    ) -> Result<(), SmContextError> {
        let verbatim = Round::new(user, assistant, ts, tool_calls);

        let state = &mut self.conversation;
        state.recent_rounds.push_back(verbatim);
        state.total_rounds += 1;

        // Keep the cached estimate honest even though nothing gets compacted.
        self.recompute_estimate();

        self.store.save(&self.conv_id, &self.conversation)?;
        Ok(())
    }

    /// Bounded post-eviction pass that tries to get back under the §7.2 triggers
    /// (§7.2/§7.6).
    ///
    /// Why: `record`'s eviction loop always leaves one verbatim round in place.
    /// When that round, together with the compressed block, is still over
    /// `token_budget`, the context would otherwise be saved over budget without
    /// any attempt to shrink it. The attempt must also be guaranteed to end,
    /// because nothing forces the summariser to return shorter text.
    /// What: repeats while `should_compact()` holds. Each pass (1) re-summarises
    /// the compressed block when it is not blank, then (2) if a trigger still
    /// holds and exactly one verbatim round is left, folds that round into the
    /// compressed block through `compact_once`, so its content moves into the
    /// summary rather than being discarded. A pass that ends with an empty window
    /// and a token estimate no lower than it started with logs at debug level and
    /// stops; the over-budget remainder is kept as the best achievable result.
    /// The only caller in this file is `record`, which invokes it after its
    /// eviction loop, so at most one verbatim round is present whenever a trigger
    /// is still firing on entry.
    /// Errors: a provider failure in either step is returned as
    /// [`SmContextError::Compaction`].
    /// Test: `single_oversized_round_converges_within_budget`,
    /// `convergence_terminates_when_summariser_cannot_shrink`.
    async fn converge_within_budget(
        &mut self,
        provider: &dyn LlmProvider,
        compaction_model: &str,
    ) -> Result<(), SmContextError> {
        while self.should_compact() {
            let estimate_at_start = self.conversation.token_estimate;

            // Step 1: shrink the compressed block, provided it has content.
            let summary_is_blank = self.conversation.compressed_context.trim().is_empty();
            if !summary_is_blank {
                let shrunk = resummarise(
                    provider,
                    compaction_model,
                    &self.conversation.compressed_context,
                )
                .await?;
                self.conversation.compressed_context = shrunk.text;
                self.recompute_estimate();
            }

            // Step 2: if shrinking was not enough and one verbatim round is all
            // that is left, fold it. The main eviction loop never does this; here
            // it is the last lever available for meeting the budget.
            let lone_round_left = self.conversation.recent_rounds.len() == 1;
            if lone_round_left && self.should_compact() {
                self.compact_once(provider, compaction_model).await?;
            }

            // Stop condition: the window is empty and this pass did not lower the
            // estimate, so another pass has nothing new to work with.
            let made_progress = self.conversation.token_estimate < estimate_at_start;
            let window_is_empty = self.conversation.recent_rounds.is_empty();
            if window_is_empty && !made_progress {
                tracing::debug!(
                    conv_id = %self.conv_id,
                    token_estimate = self.conversation.token_estimate,
                    token_budget = self.token_budget,
                    "context convergence stalled; persisting best-effort over-budget context"
                );
                break;
            }
        }
        Ok(())
    }

    /// Whether either §7.2 compaction trigger is currently firing.
    ///
    /// Why: §7.2 compacts on whichever limit is crossed first — the round count
    /// is the primary trigger and the token budget is the safety valve. A single
    /// predicate lets the loops in `record` and `converge_within_budget` ask one
    /// question.
    /// What: true when the verbatim window holds more than `window` rounds, or
    /// when `token_estimate` is above `token_budget`. Both comparisons are
    /// strict.
    /// Test: `window_evicts_oldest_round` (count path),
    /// `token_budget_triggers_compaction` (budget path).
    fn should_compact(&self) -> bool {
        let state = &self.conversation;
        let window_overflow = state.recent_rounds.len() > self.window;
        let budget_overflow = state.token_estimate > self.token_budget;
        window_overflow || budget_overflow
    }

    /// Fold the single oldest round into `compressed_context` (§7.3), then
    /// re-summarise the block if it now exceeds its cap (§7.6).
    ///
    /// Why: one overflow event evicts the oldest round; doing exactly one eviction
    /// per call keeps the compaction calls bounded and leaves the calling loop in
    /// control of convergence. The fold is an LLM call that can fail, and a round
    /// must never vanish because of that: a failed fold puts the round back at the
    /// front of the window, so the in-memory conversation is exactly as it was
    /// before the call and the caller can keep serving from it.
    /// What: pops the front round (returns `Ok(())` when the window is empty),
    /// calls [`fold_rounds`] with the current compressed block + that round
    /// through the injected provider, and on success replaces `compressed_context`
    /// with the response text and recomputes the estimate. If
    /// `compressed_max_tokens` is non-zero and the compressed block's own estimate
    /// exceeds it, runs [`resummarise`] and recomputes again.
    /// Errors: a failed fold restores the popped round and returns
    /// [`SmContextError::Compaction`]. A failed re-summarisation also returns
    /// [`SmContextError::Compaction`]; by then the fold has been applied, so the
    /// round's content already lives in the compressed block.
    /// Test: `evicted_content_lands_in_summary`, `golden_ids_survive_compaction`,
    /// `oversized_summary_is_resummarised`.
    async fn compact_once(
        &mut self,
        provider: &dyn LlmProvider,
        compaction_model: &str,
    ) -> Result<(), SmContextError> {
        let Some(oldest) = self.conversation.recent_rounds.pop_front() else {
            return Ok(());
        };
        let evicted = [oldest];
        let folded = fold_rounds(
            provider,
            compaction_model,
            &self.conversation.compressed_context,
            &evicted,
        )
        .await;
        let resp = match folded {
            Ok(resp) => resp,
            Err(err) => {
                // The round is in neither the window nor the summary at this
                // point. Put it back where it came from so a provider failure
                // cannot drop conversation content. `token_estimate` has not been
                // recomputed since the pop, so it still matches the restored state.
                let [oldest] = evicted;
                self.conversation.recent_rounds.push_front(oldest);
                return Err(err.into());
            }
        };
        self.conversation.compressed_context = resp.text;
        self.recompute_estimate();

        // §7.6: keep the compressed block within its token cap. A cap of 0 means
        // "no cap", so the check is skipped entirely.
        if self.compressed_max_tokens > 0
            && estimate_tokens(self.conversation.compressed_context.len())
                > self.compressed_max_tokens
        {
            let resp = resummarise(
                provider,
                compaction_model,
                &self.conversation.compressed_context,
            )
            .await?;
            self.conversation.compressed_context = resp.text;
            self.recompute_estimate();
        }
        Ok(())
    }

    /// Recompute `token_estimate` from the compressed block + every verbatim round.
    ///
    /// Why: keeping a running counter incrementally is error-prone across evictions
    /// and re-summarisations; recomputing from the authoritative state after each
    /// mutation is cheap (chars/4 over a bounded window) and always correct.
    /// What: sums the char length of `compressed_context` and every round, then
    /// applies the chars/4 heuristic, storing the result in `token_estimate`.
    /// Test: `token_estimate_tracks_content` and indirectly every compaction test.
    fn recompute_estimate(&mut self) {
        let chars = self.conversation.compressed_context.len()
            + self
                .conversation
                .recent_rounds
                .iter()
                .map(Round::char_len)
                .sum::<usize>();
        self.conversation.token_estimate = estimate_tokens(chars);
    }

    /// Build the working-prompt message list in §7.5 order.
    ///
    /// Why: §7.5 fixes the order as system prompt → compressed context → memory
    /// recall → recent rounds → current message. Assembling it in one place keeps
    /// that order identical for every caller. The three leading sections are
    /// merged into ONE `system` message because several providers (OpenAI Chat
    /// Completions among them) reject an array with more than one `system` entry.
    /// What: returns a `Vec<ChatMessage>` made of
    /// (1) at most one `system` message holding, in order and each skipped when
    ///     blank, the base `system_prompt`, an "Earlier in this conversation:"
    ///     section with the compressed block, and a "Relevant memory:" section
    ///     with `memory_recall`; the sections are joined by a blank line, and no
    ///     system message is emitted when all three are blank;
    /// (2) every verbatim round as a `user` message followed by an `assistant`
    ///     message, oldest first;
    /// (3) `message` as the closing `user` message, always present.
    /// The recall text is a parameter rather than fetched here, so SM-7 passes in
    /// SM-4's recall results; SM-5 does not wire memory.
    /// Test: `assembly_order_is_exact`, `assembly_skips_empty_blocks`,
    /// `assembly_emits_single_system_message`.
    pub fn assemble_working_prompt(
        &self,
        system_prompt: &str,
        memory_recall: Option<&str>,
        message: &str,
    ) -> Vec<ChatMessage> {
        let chat = |role: &str, content: &str| ChatMessage {
            role: role.to_string(),
            content: content.to_string(),
        };
        let is_blank = |text: &str| text.trim().is_empty();

        // 1. Collect the non-blank system-role sections in §7.5 order. Each
        // `extend` adds zero or one entry.
        let summary = self.conversation.compressed_context.as_str();
        let mut sections: Vec<String> = Vec::with_capacity(3);
        sections.extend((!is_blank(system_prompt)).then(|| system_prompt.to_string()));
        sections.extend(
            (!is_blank(summary)).then(|| format!("Earlier in this conversation: {}", summary)),
        );
        sections.extend(
            memory_recall
                .filter(|recall| !is_blank(recall))
                .map(|recall| format!("Relevant memory: {recall}")),
        );

        let mut prompt: Vec<ChatMessage> = Vec::new();
        if !sections.is_empty() {
            // One consolidated system message keeps the prompt valid on
            // providers that accept only a single system-role entry.
            prompt.push(chat(ROLE_SYSTEM, &sections.join("\n\n")));
        }

        // 2. Replay the verbatim window: user turn, then assistant turn.
        prompt.extend(self.conversation.recent_rounds.iter().flat_map(|round| {
            [
                chat(ROLE_USER, &round.user),
                chat(ROLE_ASSISTANT, &round.assistant),
            ]
        }));

        // 3. The operator's current message always comes last.
        prompt.push(chat(ROLE_USER, message));
        prompt
    }
}

// Unit tests for this module are kept in the sibling file `engine_tests.rs`
// and are compiled only for test builds.
#[cfg(test)]
#[path = "engine_tests.rs"]
mod tests;