Skip to main content

zeph_agent_context/
service.rs

1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4//! [`ContextService`] — stateless façade for agent context-assembly operations.
5
6use zeph_context::budget::ContextBudget;
7use zeph_context::fidelity::FidelityScorer;
8use zeph_llm::LlmProvider;
9use zeph_llm::provider::{Message, MessagePart, Role};
10
11use crate::error::ContextError;
12use crate::helpers::{
13    CODE_CONTEXT_PREFIX, CORRECTIONS_PREFIX, CROSS_SESSION_PREFIX, DOCUMENT_RAG_PREFIX,
14    GRAPH_FACTS_PREFIX, LSP_NOTE_PREFIX, PERSONA_PREFIX, REASONING_PREFIX, RECALL_PREFIX,
15    SESSION_DIGEST_PREFIX, SUMMARY_PREFIX, TRAJECTORY_PREFIX, TREE_MEMORY_PREFIX,
16};
17use crate::state::{
18    ContextAssemblyView, ContextDelta, ContextSummarizationView, MessageWindowView,
19    ProviderHandles, StatusSink,
20};
21
/// Configuration parameters for semantic recall injection.
///
/// Collects the 8 config-like arguments shared between the tiered and flat recall paths so
/// callers do not need to pass them positionally to [`ContextService::inject_semantic_recall_bare`].
///
/// `window` and `memory` are kept as direct parameters on the method because they are
/// mutable/output args rather than configuration.
///
/// Which retrieval path runs is decided by `tiered_config.enabled`; fields marked
/// "tiered path" or "flat path" below are only read on that path.
pub struct SemanticRecallParams<'a> {
    /// Query string used for retrieval (read by both paths).
    pub query: &'a str,
    /// Maximum number of tokens the injected recall may consume (read by both paths).
    pub token_budget: usize,
    /// Maximum number of memories to retrieve (flat path only).
    pub recall_limit: usize,
    /// Format applied when serialising recalled memories (flat path only).
    pub context_format: zeph_config::ContextFormat,
    /// Conversation scope used for tiered retrieval (tiered path only).
    pub conversation_id: Option<zeph_memory::ConversationId>,
    /// Optional LLM provider for intent classification (tiered path).
    pub tiered_classifier: Option<&'a std::sync::Arc<zeph_llm::any::AnyProvider>>,
    /// Optional LLM provider for result validation (tiered path).
    pub tiered_validator: Option<&'a std::sync::Arc<zeph_llm::any::AnyProvider>>,
    /// Tiered retrieval configuration controlling whether the tiered path is active.
    pub tiered_config: &'a zeph_config::memory::TieredRetrievalConfig,
}
47
/// Stateless façade for agent context-assembly operations.
///
/// This struct has no fields. All state flows through method parameters, which allows the
/// borrow checker to see disjoint `&mut` borrows at the call site without hiding them
/// inside an opaque bundle.
///
/// Methods are `&self` — the type exists only to namespace the operations and give callers
/// a single import. Because it is a unit struct, `ContextService::new()`,
/// `ContextService::default()`, and the bare `ContextService` expression are interchangeable.
///
/// # Examples
///
/// ```no_run
/// use zeph_agent_context::service::ContextService;
///
/// let svc = ContextService::new();
/// // call svc.prepare_context(...) or svc.clear_history(...)
/// ```
#[derive(Debug, Default)]
pub struct ContextService;
67
68impl ContextService {
69    /// Create a new stateless `ContextService`.
70    ///
71    /// This is a zero-cost constructor — the struct has no fields.
72    #[must_use]
73    pub fn new() -> Self {
74        Self
75    }
76
77    // ── Trivial message-window mutators (PR1) ─────────────────────────────────
78
79    /// Clear the message history, preserving the system prompt.
80    ///
81    /// Keeps the first message (system prompt), clears the rest, and clears
82    /// `completed_tool_ids` — session-scoped dependency state resets with the history.
83    /// Recomputes `cached_prompt_tokens` inline after clearing.
84    pub fn clear_history(&self, window: &mut MessageWindowView<'_>) {
85        let system_prompt = window.messages.first().cloned();
86        window.messages.clear();
87        if let Some(sp) = system_prompt {
88            window.messages.push(sp);
89        }
90        window.completed_tool_ids.clear();
91        recompute_prompt_tokens(window);
92    }
93
    /// Remove semantic recall messages from the window.
    ///
    /// Delegates to `remove_by_part_or_prefix`, passing `RECALL_PREFIX` and a predicate that
    /// accepts `MessagePart::Recall` parts.
    // NOTE(review): presumably a message is removed when either a part matches the predicate
    // or its text starts with the prefix — confirm against the helper.
    pub fn remove_recall_messages(&self, window: &mut MessageWindowView<'_>) {
        remove_by_part_or_prefix(window.messages, RECALL_PREFIX, |p| {
            matches!(p, MessagePart::Recall { .. })
        });
    }
100
    /// Remove past-correction messages from the window.
    ///
    /// Delegates to `remove_by_prefix` with `Role::System` and `CORRECTIONS_PREFIX`.
    pub fn remove_correction_messages(&self, window: &mut MessageWindowView<'_>) {
        remove_by_prefix(window.messages, Role::System, CORRECTIONS_PREFIX);
    }
105
    /// Remove knowledge-graph fact messages from the window.
    ///
    /// Delegates to `remove_by_prefix` with `Role::System` and `GRAPH_FACTS_PREFIX`.
    pub fn remove_graph_facts_messages(&self, window: &mut MessageWindowView<'_>) {
        remove_by_prefix(window.messages, Role::System, GRAPH_FACTS_PREFIX);
    }
110
    /// Remove persona-facts messages from the window.
    ///
    /// Delegates to `remove_by_prefix` with `Role::System` and `PERSONA_PREFIX`.
    pub fn remove_persona_facts_messages(&self, window: &mut MessageWindowView<'_>) {
        remove_by_prefix(window.messages, Role::System, PERSONA_PREFIX);
    }
115
    /// Remove trajectory-hint messages from the window.
    ///
    /// Delegates to `remove_by_prefix` with `Role::System` and `TRAJECTORY_PREFIX`.
    pub fn remove_trajectory_hints_messages(&self, window: &mut MessageWindowView<'_>) {
        remove_by_prefix(window.messages, Role::System, TRAJECTORY_PREFIX);
    }
120
    /// Remove tree-memory summary messages from the window.
    ///
    /// Delegates to `remove_by_prefix` with `Role::System` and `TREE_MEMORY_PREFIX`.
    pub fn remove_tree_memory_messages(&self, window: &mut MessageWindowView<'_>) {
        remove_by_prefix(window.messages, Role::System, TREE_MEMORY_PREFIX);
    }
125
    /// Remove reasoning-strategy messages from the window.
    ///
    /// Delegates to `remove_by_prefix` with `Role::System` and `REASONING_PREFIX`.
    pub fn remove_reasoning_strategies_messages(&self, window: &mut MessageWindowView<'_>) {
        remove_by_prefix(window.messages, Role::System, REASONING_PREFIX);
    }
130
    /// Remove previously injected LSP context notes from the window.
    ///
    /// Called before injecting fresh notes each turn so stale diagnostics/hover
    /// data from the previous tool call do not accumulate across iterations.
    ///
    /// Delegates to `remove_by_prefix` with `Role::System` and `LSP_NOTE_PREFIX`.
    pub fn remove_lsp_messages(&self, window: &mut MessageWindowView<'_>) {
        remove_by_prefix(window.messages, Role::System, LSP_NOTE_PREFIX);
    }
138
    /// Remove code-context (repo-map / file context) messages from the window.
    ///
    /// Delegates to `remove_by_part_or_prefix`, passing `CODE_CONTEXT_PREFIX` and a predicate
    /// that accepts `MessagePart::CodeContext` parts.
    pub fn remove_code_context_messages(&self, window: &mut MessageWindowView<'_>) {
        remove_by_part_or_prefix(window.messages, CODE_CONTEXT_PREFIX, |p| {
            matches!(p, MessagePart::CodeContext { .. })
        });
    }
145
    /// Remove session-summary messages from the window.
    ///
    /// Delegates to `remove_by_part_or_prefix`, passing `SUMMARY_PREFIX` and a predicate that
    /// accepts `MessagePart::Summary` parts.
    pub fn remove_summary_messages(&self, window: &mut MessageWindowView<'_>) {
        remove_by_part_or_prefix(window.messages, SUMMARY_PREFIX, |p| {
            matches!(p, MessagePart::Summary { .. })
        });
    }
152
    /// Remove cross-session context messages from the window.
    ///
    /// Delegates to `remove_by_part_or_prefix`, passing `CROSS_SESSION_PREFIX` and a predicate
    /// that accepts `MessagePart::CrossSession` parts.
    pub fn remove_cross_session_messages(&self, window: &mut MessageWindowView<'_>) {
        remove_by_part_or_prefix(window.messages, CROSS_SESSION_PREFIX, |p| {
            matches!(p, MessagePart::CrossSession { .. })
        });
    }
159
    /// Remove the session-digest user message from the window.
    ///
    /// Delegates to `remove_by_prefix` with `Role::User` (not `Role::System`, unlike most of
    /// the other prefix-based removers on this type) and `SESSION_DIGEST_PREFIX`.
    pub fn remove_session_digest_message(&self, window: &mut MessageWindowView<'_>) {
        remove_by_prefix(window.messages, Role::User, SESSION_DIGEST_PREFIX);
    }
164
    /// Remove document-RAG messages from the window.
    ///
    /// Delegates to `remove_by_prefix` with `Role::System` and `DOCUMENT_RAG_PREFIX`.
    pub fn remove_document_rag_messages(&self, window: &mut MessageWindowView<'_>) {
        remove_by_prefix(window.messages, Role::System, DOCUMENT_RAG_PREFIX);
    }
169
170    /// Trim the non-system message tail to fit within `token_budget` tokens.
171    ///
172    /// Keeps the system prefix intact and the most recent messages, removing
173    /// older messages from the start of the conversation history until the
174    /// token count fits the budget. Recomputes `cached_prompt_tokens` after trimming.
175    ///
176    /// No-op when `token_budget` is zero.
177    pub fn trim_messages_to_budget(&self, window: &mut MessageWindowView<'_>, token_budget: usize) {
178        if token_budget == 0 {
179            return;
180        }
181
182        // Find the first non-system message index (skip system prefix).
183        let history_start = window
184            .messages
185            .iter()
186            .position(|m| m.role != Role::System)
187            .unwrap_or(window.messages.len());
188
189        if history_start >= window.messages.len() {
190            return;
191        }
192
193        let mut total = 0usize;
194        let mut keep_from = window.messages.len();
195
196        for i in (history_start..window.messages.len()).rev() {
197            let msg_tokens = window
198                .token_counter
199                .count_message_tokens(&window.messages[i]);
200            if total + msg_tokens > token_budget {
201                break;
202            }
203            total += msg_tokens;
204            keep_from = i;
205        }
206
207        if keep_from > history_start {
208            let removed = keep_from - history_start;
209            window.messages.drain(history_start..keep_from);
210            recompute_prompt_tokens(window);
211            tracing::info!(
212                removed,
213                token_budget,
214                "trimmed messages to fit context budget"
215            );
216        }
217    }
218
219    // ── prepare_context family (PR2) ─────────────────────────────────────────
220
221    /// Inject semantic recall messages into the window for the given query.
222    ///
223    /// Removes any existing recall messages first, fetches fresh recall up to
224    /// `token_budget` tokens, and inserts the result at position 1 (immediately
225    /// after the system prompt).
226    ///
227    /// # Errors
228    ///
229    /// Returns [`ContextError::Memory`] if the recall backend returns an error.
230    pub async fn inject_semantic_recall(
231        &self,
232        query: &str,
233        token_budget: usize,
234        window: &mut MessageWindowView<'_>,
235        view: &ContextAssemblyView<'_>,
236    ) -> Result<(), ContextError> {
237        self.remove_recall_messages(window);
238
239        let params = SemanticRecallParams {
240            query,
241            token_budget,
242            recall_limit: view.recall_limit,
243            context_format: view.context_format,
244            conversation_id: view.conversation_id,
245            tiered_classifier: view.tiered_retrieval_classifier.as_ref(),
246            tiered_validator: view.tiered_retrieval_validator.as_ref(),
247            tiered_config: &view.tiered_retrieval_config,
248        };
249        let msg = self
250            .run_tiered_recall(&params, window, view.memory.as_deref())
251            .await?;
252
253        if let Some(msg) = msg
254            && window.messages.len() > 1
255        {
256            window.messages.insert(1, msg);
257        }
258
259        Ok(())
260    }
261
262    /// Inject semantic recall without a full [`ContextAssemblyView`].
263    ///
264    /// This variant is called from `Agent::inject_semantic_recall` in `zeph-core`, where
265    /// constructing a full `ContextAssemblyView` would require duplicating all of
266    /// `prepare_context`'s setup. It carries only the fields that
267    /// `inject_semantic_recall` actually reads, enabling tiered retrieval on the
268    /// hot-path turn loop without the overhead of the full view.
269    ///
270    /// # Errors
271    ///
272    /// Returns [`ContextError::Memory`] if the recall backend returns an error.
273    pub async fn inject_semantic_recall_bare(
274        &self,
275        params: SemanticRecallParams<'_>,
276        window: &mut MessageWindowView<'_>,
277        memory: Option<&zeph_memory::semantic::SemanticMemory>,
278    ) -> Result<(), ContextError> {
279        self.remove_recall_messages(window);
280
281        let msg = self.run_tiered_recall(&params, window, memory).await?;
282
283        if let Some(msg) = msg
284            && window.messages.len() > 1
285        {
286            window.messages.insert(1, msg);
287        }
288
289        Ok(())
290    }
291
292    /// Execute tiered or flat semantic recall and return the message to inject, if any.
293    ///
294    /// Both `inject_semantic_recall` and `inject_semantic_recall_bare` share identical
295    /// retrieval logic; this method holds the single implementation.
296    async fn run_tiered_recall(
297        &self,
298        params: &SemanticRecallParams<'_>,
299        window: &MessageWindowView<'_>,
300        memory: Option<&zeph_memory::semantic::SemanticMemory>,
301    ) -> Result<Option<Message>, ContextError> {
302        if params.tiered_config.enabled {
303            use tracing::Instrument as _;
304            let Some(mem) = memory else {
305                return Ok(None);
306            };
307            let result = tokio::time::timeout(
308                std::time::Duration::from_secs(30),
309                zeph_memory::recall_tiered(
310                    mem,
311                    params.query,
312                    params.conversation_id,
313                    params.tiered_classifier,
314                    params.tiered_validator,
315                    params.tiered_config,
316                    Some(params.token_budget),
317                )
318                .instrument(tracing::info_span!("agent_context.tiered_retrieval.recall")),
319            )
320            .await
321            .map_err(|_| {
322                tracing::warn!("tiered_retrieval: recall_tiered timed out after 30s");
323                ContextError::Memory(zeph_memory::MemoryError::Timeout(
324                    "recall_tiered timed out".to_owned(),
325                ))
326            })?
327            .map_err(ContextError::Memory)?;
328
329            tracing::debug!(
330                intent = %result.intent,
331                tokens_used = result.tokens_used,
332                tier_escalated = result.tier_escalated,
333                count = result.messages.len(),
334                "tiered_retrieval: recall complete"
335            );
336
337            if result.messages.is_empty() {
338                return Ok(None);
339            }
340
341            let recalled_text = result
342                .messages
343                .iter()
344                .map(|m| m.message.content.as_str())
345                .collect::<Vec<_>>()
346                .join("\n---\n");
347            Ok(Some(Message::from_legacy(
348                Role::User,
349                format!("{RECALL_PREFIX}{recalled_text}"),
350            )))
351        } else {
352            let (msg, _score) = crate::helpers::fetch_semantic_recall_raw(
353                memory,
354                params.recall_limit,
355                params.context_format,
356                params.query,
357                params.token_budget,
358                &window.token_counter,
359                None,
360                None,
361            )
362            .await?;
363            Ok(msg)
364        }
365    }
366
367    /// Inject cross-session context messages into the window for the given query.
368    ///
369    /// Removes any existing cross-session messages first, fetches fresh cross-session
370    /// context for the current conversation, and inserts the result at position 1.
371    ///
372    /// # Errors
373    ///
374    /// Returns [`ContextError::Memory`] if the memory backend returns an error.
375    pub async fn inject_cross_session_context(
376        &self,
377        query: &str,
378        token_budget: usize,
379        window: &mut MessageWindowView<'_>,
380        view: &ContextAssemblyView<'_>,
381    ) -> Result<(), ContextError> {
382        self.remove_cross_session_messages(window);
383
384        if let Some(msg) = crate::helpers::fetch_cross_session_raw(
385            view.memory.as_deref(),
386            view.conversation_id,
387            view.cross_session_score_threshold,
388            query,
389            token_budget,
390            &view.token_counter,
391        )
392        .await?
393            && window.messages.len() > 1
394        {
395            window.messages.insert(1, msg);
396            tracing::debug!("injected cross-session context");
397        }
398
399        Ok(())
400    }
401
402    /// Inject conversation-summary messages into the window.
403    ///
404    /// Removes any existing summary messages first, fetches stored summaries for the
405    /// current conversation, and inserts the result at position 1.
406    ///
407    /// # Errors
408    ///
409    /// Returns [`ContextError::Memory`] if the memory backend returns an error.
410    pub async fn inject_summaries(
411        &self,
412        token_budget: usize,
413        window: &mut MessageWindowView<'_>,
414        view: &ContextAssemblyView<'_>,
415    ) -> Result<(), ContextError> {
416        self.remove_summary_messages(window);
417
418        if let Some(msg) = crate::helpers::fetch_summaries_raw(
419            view.memory.as_deref(),
420            view.conversation_id,
421            token_budget,
422            &view.token_counter,
423        )
424        .await?
425            && window.messages.len() > 1
426        {
427            window.messages.insert(1, msg);
428            tracing::debug!("injected summaries into context");
429        }
430
431        Ok(())
432    }
433
434    /// Select the best-matching skill among ambiguous candidates via an LLM classification call.
435    ///
436    /// Returns the reordered index list with the most likely skill first, or `None` if the
437    /// LLM call fails (caller falls back to original score order).
438    pub async fn disambiguate_skills(
439        &self,
440        query: &str,
441        all_meta: &[&zeph_skills::loader::SkillMeta],
442        scored: &[zeph_skills::ScoredMatch],
443        providers: &ProviderHandles,
444    ) -> Option<Vec<usize>> {
445        use std::fmt::Write as _;
446
447        let mut candidates = String::new();
448        for sm in scored {
449            if let Some(meta) = all_meta.get(sm.index) {
450                let _ = writeln!(
451                    candidates,
452                    "- {} (score: {:.3}): {}",
453                    meta.name, sm.score, meta.description
454                );
455            }
456        }
457
458        let prompt = format!(
459            "The user said: \"{query}\"\n\n\
460             These skills matched with similar scores:\n{candidates}\n\
461             Which skill best matches the user's intent? \
462             Return the skill_name, your confidence (0-1), and any extracted parameters."
463        );
464
465        let messages = vec![zeph_llm::provider::Message::from_legacy(
466            zeph_llm::provider::Role::User,
467            prompt,
468        )];
469        match providers
470            .disambiguate
471            .chat_typed::<zeph_skills::IntentClassification>(&messages)
472            .await
473        {
474            Ok(classification) => {
475                tracing::info!(
476                    skill = %classification.skill_name,
477                    confidence = classification.confidence,
478                    "disambiguation selected skill"
479                );
480                let mut indices: Vec<usize> = scored.iter().map(|s| s.index).collect();
481                if let Some(pos) = indices.iter().position(|&i| {
482                    all_meta
483                        .get(i)
484                        .is_some_and(|m| m.name == classification.skill_name)
485                }) {
486                    indices.swap(0, pos);
487                }
488                Some(indices)
489            }
490            Err(e) => {
491                tracing::warn!("disambiguation failed, using original order: {e:#}");
492                None
493            }
494        }
495    }
496
497    /// Prepare the context window for the current turn.
498    ///
499    /// Removes stale injection messages, runs proactive skill exploration, gathers
500    /// semantic recall and graph facts via the concurrent assembler, applies the
501    /// retrieval policy, and injects fresh context. Returns a [`ContextDelta`] whose
502    /// `code_context` field must be applied by the caller (via `inject_code_context`).
503    ///
504    /// # Errors
505    ///
506    /// Returns [`ContextError::Memory`] if recall fails or [`ContextError::Assembler`]
507    /// if the context assembler encounters an internal error.
508    #[allow(clippy::too_many_lines)] // sequential context-assembly pipeline; splitting would reduce readability
509    pub async fn prepare_context(
510        &self,
511        query: &str,
512        window: &mut MessageWindowView<'_>,
513        view: &mut ContextAssemblyView<'_>,
514    ) -> Result<ContextDelta, ContextError> {
515        if view.context_manager.budget.is_none() {
516            return Ok(ContextDelta::default());
517        }
518
519        // Remove stale injected messages before concurrent fetch.
520        self.remove_session_digest_message(window);
521        self.remove_summary_messages(window);
522        self.remove_cross_session_messages(window);
523        self.remove_recall_messages(window);
524        self.remove_document_rag_messages(window);
525        self.remove_correction_messages(window);
526        self.remove_code_context_messages(window);
527        self.remove_graph_facts_messages(window);
528        self.remove_persona_facts_messages(window);
529        self.remove_trajectory_hints_messages(window);
530        self.remove_tree_memory_messages(window);
531        if view.reasoning_config.enabled {
532            self.remove_reasoning_strategies_messages(window);
533        }
534
535        // Proactive world-knowledge exploration (feature-gated, #3320).
536        if let Some(explorer) = view.proactive_explorer.clone()
537            && let Some(domain) = explorer.classify(query)
538        {
539            let already_known = {
540                let registry_guard = view.skill_registry.read();
541                explorer.has_knowledge(&registry_guard, &domain)
542            };
543            let excluded = explorer.is_excluded(&domain);
544
545            if !already_known && !excluded {
546                tracing::debug!(domain = %domain.0, query_len = query.len(), "proactive.explore triggered");
547                let timeout_ms = explorer.timeout_ms();
548                let result = tokio::time::timeout(
549                    std::time::Duration::from_millis(timeout_ms),
550                    explorer.explore(&domain),
551                )
552                .await;
553                match result {
554                    Ok(Ok(())) => {
555                        view.skill_registry.write().reload(view.skill_paths);
556                        tracing::debug!(domain = %domain.0, "proactive.explore complete, registry reloaded");
557                    }
558                    Ok(Err(e)) => {
559                        tracing::warn!(domain = %domain.0, error = %e, "proactive exploration failed");
560                    }
561                    Err(_) => {
562                        tracing::warn!(domain = %domain.0, timeout_ms, "proactive exploration timed out");
563                    }
564                }
565            }
566        }
567
568        // Compression-spectrum retrieval policy (#3305, #3455).
569        let active_levels: &'static [zeph_memory::compression::CompressionLevel] =
570            if let Some(ref budget) = view.context_manager.budget {
571                let used = view.cached_prompt_tokens;
572                let max = budget.max_tokens();
573                #[allow(clippy::cast_precision_loss)]
574                let remaining_ratio = if max == 0 {
575                    1.0_f32
576                } else {
577                    1.0 - (used as f32 / max as f32).clamp(0.0, 1.0)
578                };
579                let levels =
580                    zeph_memory::compression::RetrievalPolicy::default().select(remaining_ratio);
581                tracing::debug!(
582                    remaining_ratio,
583                    active_levels = ?levels,
584                    "compression_spectrum: retrieval policy selected"
585                );
586                levels
587            } else {
588                &[]
589            };
590
591        let memory_backend: Option<std::sync::Arc<dyn zeph_common::memory::ContextMemoryBackend>> =
592            view.memory.clone().map(
593                |m| -> std::sync::Arc<dyn zeph_common::memory::ContextMemoryBackend> {
594                    std::sync::Arc::new(crate::memory_backend::SemanticMemoryBackend::new(m))
595                },
596            );
597
598        let memory_view = zeph_context::input::ContextMemoryView {
599            memory: memory_backend,
600            conversation_id: view.conversation_id.map(|c| c.0),
601            recall_limit: view.recall_limit,
602            cross_session_score_threshold: view.cross_session_score_threshold,
603            context_strategy: view.context_strategy,
604            crossover_turn_threshold: view.crossover_turn_threshold,
605            cached_session_digest: view.cached_session_digest.clone(),
606            graph_config: view.graph_config.clone(),
607            document_config: view.document_config.clone(),
608            persona_config: view.persona_config.clone(),
609            trajectory_config: view.trajectory_config.clone(),
610            reasoning_config: view.reasoning_config.clone(),
611            memcot_config: view.memcot_config.clone(),
612            memcot_state: view.memcot_state.clone(),
613            tree_config: view.tree_config.clone(),
614        };
615
616        #[cfg(feature = "index")]
617        let index_access = view.index;
618        #[cfg(not(feature = "index"))]
619        let index_access: Option<&dyn zeph_context::input::IndexAccess> = None;
620
621        let router = crate::memory_backend::build_memory_router(view.context_manager);
622
623        let input = zeph_context::input::ContextAssemblyInput {
624            memory: &memory_view,
625            context_manager: view.context_manager,
626            token_counter: &*view.token_counter,
627            skills_prompt: view.last_skills_prompt,
628            index: index_access,
629            correction_config: view.correction_config,
630            sidequest_turn_counter: view.sidequest_turn_counter,
631            messages: window.messages,
632            query,
633            scrub: view.scrub,
634            active_levels,
635            router,
636            planned_next_tools: view.planned_next_tools,
637        };
638
639        let mut prepared = zeph_context::assembler::ContextAssembler::gather(&input).await?;
640
641        // When tiered retrieval is enabled, suppress the flat recall assembled above and
642        // replace it with the tiered result injected directly into the window.  The span
643        // `agent_context.tiered_retrieval.recall` will appear in traces for every enabled
644        // turn, satisfying the observability requirement in issue #3996.
645        if view.tiered_retrieval_config.enabled {
646            prepared.recall = None;
647        }
648
649        // Drain background handles produced during assembly (e.g. mark_reasoning_used) and
650        // register them with the supervisor so they are tracked and abortable.  Must happen
651        // before `apply_prepared_context` consumes `prepared` to avoid silent drops.
652        for handle in prepared.background_tasks.drain(..) {
653            let task_supervisor = std::sync::Arc::clone(&view.task_supervisor);
654            task_supervisor.spawn(zeph_common::task_supervisor::TaskDescriptor {
655                name: "context.assembly.background",
656                restart: zeph_common::task_supervisor::RestartPolicy::RunOnce,
657                factory: {
658                    let cell = std::sync::Arc::new(std::sync::Mutex::new(Some(async move {
659                        let _ = handle.await;
660                    })));
661                    move || {
662                        let f = cell.lock().ok().and_then(|mut g| g.take());
663                        async move {
664                            if let Some(f) = f {
665                                f.await;
666                            }
667                        }
668                    }
669                },
670            });
671        }
672
673        let (delta, inserted_count) = self.apply_prepared_context(window, view, prepared).await;
674
675        if view.tiered_retrieval_config.enabled {
676            self.inject_semantic_recall(query, usize::MAX, window, view)
677                .await?;
678        }
679
680        // T-06: Fidelity scoring (INV-01: AFTER apply_prepared_context returns).
681        // Guard: skip when MemoryFirst is active (INV-11 / AC-09) or config absent/disabled.
682        // Spec AC-09: when memory_first=true the scorer MUST NOT run — the caller (here) is
683        // responsible for this bypass; FidelityScorer itself is stateless and has no memory of it.
684        let memory_first_active =
685            view.context_strategy == zeph_config::ContextStrategy::MemoryFirst;
686        if let Some(fidelity_cfg) = view.fidelity_config
687            && fidelity_cfg.enabled
688            && !memory_first_active
689        {
690            let _span = tracing::info_span!(
691                "context.fidelity.score",
692                message_count = window.messages.len(),
693                query_len = query.len(),
694            )
695            .entered();
696            if let Some(ref tx) = view.status_tx {
697                let _ = tx.send("Scoring context fidelity\u{2026}".into());
698            }
699            let embed_provider = view
700                .fidelity_semantic_provider
701                .as_deref()
702                .map(|p| p as &dyn zeph_llm::LlmProviderDyn);
703            let compress_provider = view
704                .fidelity_compress_provider
705                .as_deref()
706                .map(|p| p as &dyn zeph_llm::LlmProviderDyn);
707            FidelityScorer
708                .score_and_apply(
709                    window.messages,
710                    query,
711                    view.planned_next_tools,
712                    fidelity_cfg,
713                    &*view.token_counter,
714                    inserted_count,
715                    false, // floor invariant enforced on normal scoring path
716                    embed_provider,
717                    compress_provider,
718                )
719                .await;
720            // Persist fidelity tags so subsequent turns see the floor invariant.
721            persist_fidelity_tags(window.messages, view.memory.as_deref()).await;
722            recompute_prompt_tokens(window);
723            if let Some(ref tx) = view.status_tx {
724                let _ = tx.send(String::new());
725            }
726        }
727
728        Ok(delta)
729    }
730
731    /// Apply a [`PreparedContext`] to the message window.
732    ///
733    /// Injects all fetched messages in insertion order (`doc_rag` → corrections → recall →
734    /// cross-session → summaries → persona → trajectory → tree → reasoning), handles
735    /// `MemoryFirst` history drain, sanitizes memory content, trims to budget, and injects
736    /// the session digest. Returns a [`ContextDelta`] whose `code_context` field the caller
737    /// must apply via `inject_code_context`, plus the count of messages freshly inserted at
738    /// indices `1..1+inserted_count` (used by the fidelity scorer as the exempt range — INV-10).
739    #[allow(clippy::too_many_lines)] // sequential message injection: order matters, cannot split
740    async fn apply_prepared_context(
741        &self,
742        window: &mut MessageWindowView<'_>,
743        view: &mut ContextAssemblyView<'_>,
744        prepared: zeph_context::assembler::PreparedContext,
745    ) -> (ContextDelta, usize) {
746        use std::borrow::Cow;
747        use zeph_llm::provider::{Message, MessageMetadata, Role};
748        use zeph_sanitizer::{ContentSource, ContentSourceKind, MemorySourceHint};
749
750        // Store top-1 recall score for MAR routing signal.
751        *view.last_recall_confidence = prepared.recall_confidence;
752
753        // MemoryFirst: drain conversation history BEFORE inserting memory messages.
754        if prepared.memory_first {
755            let history_start = 1usize;
756            let len = window.messages.len();
757            let keep_tail =
758                zeph_context::assembler::memory_first_keep_tail(window.messages, history_start);
759            if len > history_start + keep_tail {
760                window.messages.drain(history_start..len - keep_tail);
761                recompute_prompt_tokens(window);
762                tracing::debug!(
763                    strategy = "memory_first",
764                    keep_tail,
765                    "dropped conversation history, kept last {keep_tail} messages"
766                );
767            }
768        }
769
770        // Tracks how many memory messages were freshly inserted at positions 1..1+inserted_count
771        // so the fidelity scorer can exempt them (INV-10). Incremented at every insertion path.
772        let mut inserted_count: usize = 0;
773
774        // Insert memory messages at position 1 (all sanitized before insertion — CRIT-02).
775        if let Some(msg) = prepared.graph_facts.filter(|_| window.messages.len() > 1) {
776            let sanitized = self
777                .sanitize_memory_message(msg, MemorySourceHint::ExternalContent, view)
778                .await;
779            window.messages.insert(1, sanitized);
780            inserted_count += 1;
781            tracing::debug!("injected knowledge graph facts into context");
782        }
783        if let Some(msg) = prepared.doc_rag.filter(|_| window.messages.len() > 1) {
784            let sanitized = self
785                .sanitize_memory_message(msg, MemorySourceHint::ExternalContent, view)
786                .await;
787            window.messages.insert(1, sanitized);
788            inserted_count += 1;
789            tracing::debug!("injected document RAG context");
790        }
791        if let Some(msg) = prepared.corrections.filter(|_| window.messages.len() > 1) {
792            let sanitized = self
793                .sanitize_memory_message(msg, MemorySourceHint::ConversationHistory, view)
794                .await;
795            window.messages.insert(1, sanitized);
796            inserted_count += 1;
797            tracing::debug!("injected past corrections into context");
798        }
799        if let Some(msg) = prepared.recall.filter(|_| window.messages.len() > 1) {
800            let sanitized = self
801                .sanitize_memory_message(msg, MemorySourceHint::ConversationHistory, view)
802                .await;
803            window.messages.insert(1, sanitized);
804            inserted_count += 1;
805        }
806        if let Some(msg) = prepared.cross_session.filter(|_| window.messages.len() > 1) {
807            let sanitized = self
808                .sanitize_memory_message(msg, MemorySourceHint::LlmSummary, view)
809                .await;
810            window.messages.insert(1, sanitized);
811            inserted_count += 1;
812        }
813        if let Some(msg) = prepared.summaries.filter(|_| window.messages.len() > 1) {
814            let sanitized = self
815                .sanitize_memory_message(msg, MemorySourceHint::LlmSummary, view)
816                .await;
817            window.messages.insert(1, sanitized);
818            inserted_count += 1;
819            tracing::debug!("injected summaries into context");
820        }
821        if let Some(msg) = prepared.persona_facts.filter(|_| window.messages.len() > 1) {
822            let sanitized = self
823                .sanitize_memory_message(msg, MemorySourceHint::ExternalContent, view)
824                .await;
825            window.messages.insert(1, sanitized);
826            inserted_count += 1;
827            tracing::debug!("injected persona facts into context");
828        }
829        if let Some(msg) = prepared
830            .trajectory_hints
831            .filter(|_| window.messages.len() > 1)
832        {
833            let sanitized = self
834                .sanitize_memory_message(msg, MemorySourceHint::ExternalContent, view)
835                .await;
836            window.messages.insert(1, sanitized);
837            inserted_count += 1;
838            tracing::debug!("injected trajectory hints into context");
839        }
840        if let Some(msg) = prepared.tree_memory.filter(|_| window.messages.len() > 1) {
841            let sanitized = self
842                .sanitize_memory_message(msg, MemorySourceHint::ExternalContent, view)
843                .await;
844            window.messages.insert(1, sanitized);
845            inserted_count += 1;
846            tracing::debug!("injected tree memory summary into context");
847        }
848        if let Some(msg) = prepared
849            .reasoning_hints
850            .filter(|_| window.messages.len() > 1)
851        {
852            let sanitized = self
853                .sanitize_memory_message(msg, MemorySourceHint::ExternalContent, view)
854                .await;
855            window.messages.insert(1, sanitized);
856            inserted_count += 1;
857            tracing::debug!("injected reasoning strategies into context");
858        }
859
860        // Code context: sanitize inline, return body to caller via ContextDelta.
861        let code_context = if let Some(text) = prepared.code_context {
862            let sanitized = view
863                .sanitizer
864                .sanitize(&text, ContentSource::new(ContentSourceKind::ToolResult));
865            view.metrics.sanitizer_runs += 1;
866            if !sanitized.injection_flags.is_empty() {
867                tracing::warn!(
868                    flags = sanitized.injection_flags.len(),
869                    "injection patterns detected in code RAG context"
870                );
871                view.metrics.sanitizer_injection_flags += sanitized.injection_flags.len() as u64;
872                let detail = sanitized
873                    .injection_flags
874                    .first()
875                    .map_or_else(String::new, |f| {
876                        format!("Detected pattern: {}", f.pattern_name)
877                    });
878                view.security_events.push(
879                    zeph_common::SecurityEventCategory::InjectionFlag,
880                    "code_rag",
881                    detail,
882                );
883            }
884            if sanitized.was_truncated {
885                view.metrics.sanitizer_truncations += 1;
886                view.security_events.push(
887                    zeph_common::SecurityEventCategory::Truncation,
888                    "code_rag",
889                    "Content truncated to max_content_size".to_string(),
890                );
891            }
892            Some(sanitized.body)
893        } else {
894            None
895        };
896
897        if !prepared.memory_first {
898            self.trim_messages_to_budget(window, prepared.recent_history_budget);
899        }
900
901        // Session digest injected AFTER all other memory inserts (closest to system prompt).
902        if view.digest_enabled
903            && let Some((digest_text, _)) = view
904                .cached_session_digest
905                .clone()
906                .filter(|_| window.messages.len() > 1)
907        {
908            let digest_msg = Message {
909                role: Role::User,
910                content: format!("{}{digest_text}", crate::helpers::SESSION_DIGEST_PREFIX),
911                parts: vec![],
912                metadata: MessageMetadata::default(),
913            };
914            let sanitized = self
915                .sanitize_memory_message(digest_msg, MemorySourceHint::LlmSummary, view)
916                .await;
917            window.messages.insert(1, sanitized);
918            inserted_count += 1;
919            tracing::debug!("injected session digest into context");
920        }
921
922        // Credential scrubbing pass.
923        if view.redact_credentials {
924            for msg in &mut *window.messages {
925                if msg.role == Role::System {
926                    continue;
927                }
928                if let Cow::Owned(s) = (view.scrub)(&msg.content) {
929                    msg.content = s;
930                }
931            }
932        }
933
934        recompute_prompt_tokens(window);
935
936        (ContextDelta { code_context }, inserted_count)
937    }
938
939    /// Sanitize a memory retrieval message before inserting it into the context window.
940    ///
941    /// This is the sole sanitization point for the six memory retrieval paths (`doc_rag`,
942    /// corrections, recall, `cross_session`, summaries, `graph_facts`). The `hint` parameter
943    /// modulates injection-detection sensitivity — `ConversationHistory` and `LlmSummary`
944    /// skip detection to suppress false positives; `ExternalContent` enables full detection.
945    ///
946    /// Truncation, control-char stripping, delimiter escaping, and spotlighting are active
947    /// for all hints (defense-in-depth invariant).
948    async fn sanitize_memory_message(
949        &self,
950        mut msg: zeph_llm::provider::Message,
951        hint: zeph_sanitizer::MemorySourceHint,
952        view: &mut ContextAssemblyView<'_>,
953    ) -> zeph_llm::provider::Message {
954        use zeph_sanitizer::{ContentSource, ContentSourceKind};
955
956        let source = ContentSource::new(ContentSourceKind::MemoryRetrieval).with_memory_hint(hint);
957        let sanitized = view.sanitizer.sanitize(&msg.content, source);
958        view.metrics.sanitizer_runs += 1;
959        if !sanitized.injection_flags.is_empty() {
960            tracing::warn!(
961                flags = sanitized.injection_flags.len(),
962                "injection patterns detected in memory retrieval"
963            );
964            view.metrics.sanitizer_injection_flags += sanitized.injection_flags.len() as u64;
965            let detail = sanitized
966                .injection_flags
967                .first()
968                .map_or_else(String::new, |f| {
969                    format!("Detected pattern: {}", f.pattern_name)
970                });
971            view.security_events.push(
972                zeph_common::SecurityEventCategory::InjectionFlag,
973                "memory_retrieval",
974                detail,
975            );
976        }
977        if sanitized.was_truncated {
978            view.metrics.sanitizer_truncations += 1;
979            view.security_events.push(
980                zeph_common::SecurityEventCategory::Truncation,
981                "memory_retrieval",
982                "Content truncated to max_content_size".to_string(),
983            );
984        }
985
986        // Quarantine step: route high-risk sources through an isolated LLM (defense-in-depth).
987        if view.sanitizer.is_enabled()
988            && let Some(qs) = view.quarantine_summarizer
989            && qs.should_quarantine(ContentSourceKind::MemoryRetrieval)
990        {
991            match qs.extract_facts(&sanitized, view.sanitizer).await {
992                Ok((facts, flags)) => {
993                    view.metrics.quarantine_invocations += 1;
994                    view.security_events.push(
995                        zeph_common::SecurityEventCategory::Quarantine,
996                        "memory_retrieval",
997                        "Content quarantined, facts extracted".to_string(),
998                    );
999                    let escaped = zeph_sanitizer::ContentSanitizer::escape_delimiter_tags(&facts);
1000                    msg.content = zeph_sanitizer::ContentSanitizer::apply_spotlight(
1001                        &escaped,
1002                        &sanitized.source,
1003                        &flags,
1004                    );
1005                    return msg;
1006                }
1007                Err(e) => {
1008                    tracing::warn!(
1009                        error = %e,
1010                        "quarantine failed for memory retrieval, using original sanitized content"
1011                    );
1012                    view.metrics.quarantine_failures += 1;
1013                    view.security_events.push(
1014                        zeph_common::SecurityEventCategory::Quarantine,
1015                        "memory_retrieval",
1016                        format!("Quarantine failed: {e}"),
1017                    );
1018                }
1019            }
1020        }
1021
1022        msg.content = sanitized.body;
1023        msg
1024    }
1025
1026    /// Reset the conversation history.
1027    ///
1028    /// Clears all messages except the system prompt and resets the cached token count.
1029    /// The caller (`Agent<C>`) is responsible for resetting compaction state, orchestration,
1030    /// focus, and sidequest state — those fields are outside the context-service scope.
1031    ///
1032    /// # Errors
1033    ///
1034    /// Returns [`ContextError::Memory`] if creating a new conversation in `SQLite` fails.
1035    pub fn reset_conversation(
1036        &self,
1037        window: &mut MessageWindowView<'_>,
1038        _view: &mut ContextAssemblyView<'_>,
1039    ) -> Result<(), ContextError> {
1040        self.clear_history(window);
1041        Ok(())
1042    }
1043
1044    /// Run tiered compaction if the token budget is exhausted.
1045    ///
1046    /// Dispatches to the appropriate compaction tier based on the current
1047    /// context manager state:
1048    ///
1049    /// - **None** — context is within budget; no-op.
1050    /// - **Soft** — apply deferred summaries + prune tool outputs (no LLM).
1051    /// - **Hard** — Soft steps first, then LLM full summarization if pruning is insufficient.
1052    ///
1053    /// Increments the `turns_since_last_hard_compaction` counter unconditionally so pressure
1054    /// is tracked regardless of whether compaction fires. Respects the cooldown guard: when
1055    /// cooling, Hard-tier LLM summarization is skipped.
1056    ///
1057    /// # Errors
1058    ///
1059    /// Returns [`ContextError::Memory`] if `SQLite` persistence fails during Hard compaction.
1060    #[allow(
1061        clippy::cast_precision_loss,
1062        clippy::cast_possible_truncation,
1063        clippy::cast_sign_loss,
1064        clippy::too_many_lines
1065    )]
1066    pub async fn maybe_compact(
1067        &self,
1068        summ: &mut ContextSummarizationView<'_>,
1069        status: &(impl StatusSink + ?Sized),
1070    ) -> Result<(), ContextError> {
1071        use zeph_context::manager::{CompactionState, CompactionTier};
1072
1073        // Increment turn counter unconditionally (tracks pressure regardless of guards).
1074        if let Some(count) = summ.context_manager.turns_since_last_hard_compaction_mut() {
1075            *count += 1;
1076        }
1077
1078        // Guard: exhaustion — warn once, then no-op permanently.
1079        if let CompactionState::Exhausted { warned } = summ.context_manager.compaction_state()
1080            && !warned
1081        {
1082            summ.context_manager
1083                .set_compaction_state(CompactionState::Exhausted { warned: true });
1084            tracing::warn!("compaction exhausted: context budget too tight for this session");
1085        }
1086        if summ.context_manager.compaction_state().is_exhausted() {
1087            return Ok(());
1088        }
1089
1090        // Guard: server compaction active — skip unless above 95% budget (safety fallback).
1091        if summ.server_compaction_active {
1092            let budget = summ
1093                .context_manager
1094                .budget
1095                .as_ref()
1096                .map_or(0, ContextBudget::max_tokens);
1097            if budget > 0 {
1098                let fallback = (budget * 95 / 100) as u64;
1099                if *summ.cached_prompt_tokens < fallback {
1100                    return Ok(());
1101                }
1102                tracing::warn!(
1103                    "server compaction active but context at 95%+ — falling back to client-side"
1104                );
1105            } else {
1106                return Ok(());
1107            }
1108        }
1109
1110        // Guard: already compacted this turn.
1111        if summ
1112            .context_manager
1113            .compaction_state()
1114            .is_compacted_this_turn()
1115        {
1116            return Ok(());
1117        }
1118
1119        // Decrement cooldown counter; record whether we are in cooldown.
1120        let in_cooldown = summ.context_manager.compaction_state().cooldown_remaining() > 0;
1121        if in_cooldown
1122            && let CompactionState::Cooling { turns_remaining } =
1123                summ.context_manager.compaction_state()
1124        {
1125            let next = turns_remaining - 1;
1126            summ.context_manager.set_compaction_state(if next == 0 {
1127                CompactionState::Ready
1128            } else {
1129                CompactionState::Cooling {
1130                    turns_remaining: next,
1131                }
1132            });
1133        }
1134
1135        // T-07: AgeMem proactive regrade — fires before tier dispatch (INV-06, INV-11).
1136        // Skip when MemoryFirst is active; ContextSummarizationView does not carry
1137        // context_strategy, so we check the budget ratio directly via should_proactively_regrade.
1138        if let Some(ref fidelity_cfg) = summ.fidelity_config.clone()
1139            && fidelity_cfg.enabled
1140            && summ.context_manager.should_proactively_regrade(
1141                *summ.cached_prompt_tokens,
1142                fidelity_cfg.regrade_threshold,
1143                summ.server_compaction_active,
1144            )
1145        {
1146            let _regrade_span = tracing::info_span!(
1147                "context.fidelity.regrade",
1148                budget_ratio = tracing::field::Empty,
1149            )
1150            .entered();
1151            let regrade_embed_provider = summ
1152                .fidelity_semantic_provider
1153                .as_deref()
1154                .map(|p| p as &dyn zeph_llm::LlmProviderDyn);
1155            let regrade_compress_provider = summ
1156                .fidelity_compress_provider
1157                .as_deref()
1158                .map(|p| p as &dyn zeph_llm::LlmProviderDyn);
1159            FidelityScorer
1160                .score_and_apply(
1161                    summ.messages,
1162                    &summ.current_query,
1163                    &[],
1164                    fidelity_cfg,
1165                    &*summ.token_counter,
1166                    0,
1167                    true, // proactive regrade: allow upgrading past the persisted floor
1168                    regrade_embed_provider,
1169                    regrade_compress_provider,
1170                )
1171                .await;
1172            // Persist upgraded fidelity tags so the new levels survive the next turn (F-3).
1173            persist_fidelity_tags(summ.messages, summ.memory.as_deref()).await;
1174            recompute_prompt_tokens_summ(summ);
1175            summ.context_manager.set_regraded_this_turn(true);
1176            tracing::debug!(
1177                cached_tokens = *summ.cached_prompt_tokens,
1178                "AgeMem proactive regrade complete"
1179            );
1180        }
1181
1182        match summ
1183            .context_manager
1184            .compaction_tier(*summ.cached_prompt_tokens)
1185        {
1186            CompactionTier::None => Ok(()),
1187            CompactionTier::Soft => {
1188                self.do_soft_compaction(summ, status).await;
1189                Ok(())
1190            }
1191            CompactionTier::Hard => self.do_hard_compaction(summ, status, in_cooldown).await,
1192        }
1193    }
1194
1195    /// Execute the Soft compaction tier: apply deferred summaries and prune tool outputs.
1196    ///
1197    /// Does not trigger an LLM call. Does not set `compacted_this_turn` so Hard tier
1198    /// may still fire in the same turn if context remains above the hard threshold.
1199    #[allow(
1200        clippy::cast_precision_loss,
1201        clippy::cast_possible_truncation,
1202        clippy::cast_sign_loss
1203    )]
1204    async fn do_soft_compaction(
1205        &self,
1206        summ: &mut ContextSummarizationView<'_>,
1207        status: &(impl StatusSink + ?Sized),
1208    ) {
1209        status.send_status("soft compacting context...").await;
1210
1211        // Step 0: refresh task goal / subgoal for scored pruning.
1212        match &summ.context_manager.compression.pruning_strategy {
1213            zeph_config::PruningStrategy::Subgoal | zeph_config::PruningStrategy::SubgoalMig => {
1214                crate::summarization::scheduling::maybe_refresh_subgoal(summ);
1215            }
1216            _ => crate::summarization::scheduling::maybe_refresh_task_goal(summ),
1217        }
1218
1219        // Step 1: apply deferred summaries (free tokens without LLM).
1220        let applied = crate::summarization::deferred::apply_deferred_summaries(summ);
1221
1222        // Step 1b: rebuild subgoal index if deferred summaries were applied (S5 fix).
1223        if applied > 0
1224            && summ
1225                .context_manager
1226                .compression
1227                .pruning_strategy
1228                .is_subgoal()
1229        {
1230            summ.subgoal_registry
1231                .rebuild_after_compaction(summ.messages, 0);
1232        }
1233
1234        // Step 2: prune tool outputs down to soft threshold.
1235        let budget = summ
1236            .context_manager
1237            .budget
1238            .as_ref()
1239            .map_or(0, ContextBudget::max_tokens);
1240        let soft_threshold =
1241            (budget as f32 * summ.context_manager.soft_compaction_threshold) as usize;
1242        let cached = usize::try_from(*summ.cached_prompt_tokens).unwrap_or(usize::MAX);
1243        let min_to_free = cached.saturating_sub(soft_threshold);
1244        if min_to_free > 0 {
1245            crate::summarization::pruning::prune_tool_outputs(summ, min_to_free);
1246        }
1247
1248        status.send_status("").await;
1249        tracing::info!(
1250            cached_tokens = *summ.cached_prompt_tokens,
1251            soft_threshold,
1252            "soft compaction complete"
1253        );
1254    }
1255
1256    /// Execute the Hard compaction tier: soft pass first, then LLM summarization if needed.
1257    #[allow(
1258        clippy::cast_precision_loss,
1259        clippy::cast_possible_truncation,
1260        clippy::cast_sign_loss
1261    )]
1262    async fn do_hard_compaction(
1263        &self,
1264        summ: &mut ContextSummarizationView<'_>,
1265        status: &(impl StatusSink + ?Sized),
1266        in_cooldown: bool,
1267    ) -> Result<(), ContextError> {
1268        use zeph_context::manager::CompactionState;
1269
1270        // Track hard compaction event for pressure metrics.
1271        let turns_since_last = summ
1272            .context_manager
1273            .turns_since_last_hard_compaction()
1274            .map(|t| u32::try_from(t).unwrap_or(u32::MAX));
1275        summ.context_manager
1276            .set_turns_since_last_hard_compaction(Some(0));
1277        if let Some(metrics) = summ.metrics {
1278            metrics.record_hard_compaction(turns_since_last);
1279        }
1280
1281        if in_cooldown {
1282            tracing::debug!(
1283                turns_remaining = summ.context_manager.compaction_state().cooldown_remaining(),
1284                "hard compaction skipped: cooldown active"
1285            );
1286            return Ok(());
1287        }
1288
1289        let budget = summ
1290            .context_manager
1291            .budget
1292            .as_ref()
1293            .map_or(0, ContextBudget::max_tokens);
1294        let hard_threshold =
1295            (budget as f32 * summ.context_manager.hard_compaction_threshold) as usize;
1296        let cached = usize::try_from(*summ.cached_prompt_tokens).unwrap_or(usize::MAX);
1297        let min_to_free = cached.saturating_sub(hard_threshold);
1298
1299        status.send_status("compacting context...").await;
1300
1301        // Step 1: apply deferred summaries.
1302        crate::summarization::deferred::apply_deferred_summaries(summ);
1303
1304        // Step 2: attempt pruning-only.
1305        let freed = crate::summarization::pruning::prune_tool_outputs(summ, min_to_free);
1306        if freed >= min_to_free {
1307            tracing::info!(freed, "hard compaction: pruning sufficient");
1308            summ.context_manager
1309                .set_compaction_state(CompactionState::CompactedThisTurn {
1310                    cooldown: summ.context_manager.compaction_cooldown_turns(),
1311                });
1312            if let Err(e) = crate::summarization::deferred::flush_deferred_summaries(summ).await {
1313                tracing::warn!(%e, "flush_deferred_summaries failed after hard compaction");
1314            }
1315            status.send_status("").await;
1316            return Ok(());
1317        }
1318
1319        // Step 3: Guard — too few messages to compact.
1320        let preserve_tail = summ.context_manager.compaction_preserve_tail;
1321        let compactable = summ.messages.len().saturating_sub(preserve_tail + 1);
1322        if compactable <= 1 {
1323            tracing::warn!(
1324                compactable,
1325                "hard compaction: too few messages, marking exhausted"
1326            );
1327            summ.context_manager
1328                .set_compaction_state(CompactionState::Exhausted { warned: false });
1329            status.send_status("").await;
1330            return Ok(());
1331        }
1332
1333        // Step 4: LLM summarization.
1334        tracing::info!(
1335            min_to_free,
1336            "hard compaction: falling back to LLM summarization"
1337        );
1338        let tokens_before = *summ.cached_prompt_tokens;
1339        let outcome = crate::summarization::compaction::compact_context(summ, None).await?;
1340
1341        let freed_tokens = tokens_before.saturating_sub(*summ.cached_prompt_tokens);
1342
1343        if !outcome.is_compacted() || freed_tokens == 0 {
1344            tracing::warn!("hard compaction: no net reduction, marking exhausted");
1345            summ.context_manager
1346                .set_compaction_state(CompactionState::Exhausted { warned: false });
1347            status.send_status("").await;
1348            return Ok(());
1349        }
1350
1351        if matches!(
1352            summ.context_manager
1353                .compaction_tier(*summ.cached_prompt_tokens),
1354            zeph_context::manager::CompactionTier::Hard
1355        ) {
1356            tracing::warn!(
1357                freed_tokens,
1358                "hard compaction: still above hard threshold after compaction, marking exhausted"
1359            );
1360            summ.context_manager
1361                .set_compaction_state(CompactionState::Exhausted { warned: false });
1362            status.send_status("").await;
1363            return Ok(());
1364        }
1365
1366        summ.context_manager
1367            .set_compaction_state(CompactionState::CompactedThisTurn {
1368                cooldown: summ.context_manager.compaction_cooldown_turns(),
1369            });
1370
1371        if tokens_before > *summ.cached_prompt_tokens {
1372            tracing::info!(
1373                tokens_before,
1374                tokens_after = *summ.cached_prompt_tokens,
1375                saved = freed_tokens,
1376                "context compaction complete"
1377            );
1378        }
1379
1380        status.send_status("").await;
1381        Ok(())
1382    }
1383
1384    /// Summarize the most recent tool-use/result pair if it exceeds the cutoff.
1385    ///
1386    /// Drains the backlog of unsummarized tool-use/result pairs in a single pass,
1387    /// storing results as `deferred_summary` on message metadata. Applied lazily
1388    /// by [`Self::maybe_apply_deferred_summaries`] when context pressure rises.
1389    pub async fn maybe_summarize_tool_pair(
1390        &self,
1391        summ: &mut ContextSummarizationView<'_>,
1392        providers: &ProviderHandles,
1393    ) {
1394        crate::summarization::deferred::maybe_summarize_tool_pair(
1395            summ,
1396            providers,
1397            &TxStatusSink(summ.status_tx.clone()),
1398        )
1399        .await;
1400    }
1401
1402    /// Apply any deferred tool-pair summaries to the message window.
1403    ///
1404    /// Processes all pending deferred summaries in reverse order so insertions do not
1405    /// invalidate lower indices. Returns the number of summaries applied.
1406    #[must_use]
1407    pub fn apply_deferred_summaries(&self, summ: &mut ContextSummarizationView<'_>) -> usize {
1408        crate::summarization::deferred::apply_deferred_summaries(summ)
1409    }
1410
1411    /// Flush all deferred summary IDs to the database.
1412    ///
1413    /// Calls `apply_tool_pair_summaries` to soft-delete the original tool pairs and
1414    /// persist the summaries. Always clears both deferred queues regardless of outcome.
1415    pub async fn flush_deferred_summaries(&self, summ: &mut ContextSummarizationView<'_>) {
1416        if let Err(e) = crate::summarization::deferred::flush_deferred_summaries(summ).await {
1417            tracing::warn!(%e, "flush_deferred_summaries failed");
1418        }
1419    }
1420
1421    /// Apply deferred summaries if context usage exceeds the soft compaction threshold.
1422    ///
1423    /// Two triggers: token pressure (above the soft threshold) and count pressure (pending
1424    /// summaries >= `tool_call_cutoff`). This is Tier 0 — no LLM call. Does NOT set
1425    /// `compacted_this_turn` so proactive/reactive compaction may still fire.
1426    pub fn maybe_apply_deferred_summaries(&self, summ: &mut ContextSummarizationView<'_>) {
1427        crate::summarization::deferred::maybe_apply_deferred_summaries(summ);
1428    }
1429
1430    /// Run unconditional LLM-based context compaction with an optional token budget.
1431    ///
1432    /// Bypasses tier and cooldown checks — always drains the oldest messages and inserts
1433    /// a compact summary. Use this in tests or when the caller has already determined that
1434    /// compaction is warranted. Production code should prefer [`Self::maybe_compact`].
1435    ///
1436    /// Invokes the optional callbacks wired into `summ` in this order:
1437    /// archive → LLM summarization → probe → finalize → persistence.
1438    ///
1439    /// Returns [`crate::state::CompactionOutcome::NoChange`] when there is nothing to compact.
1440    ///
1441    /// # Errors
1442    ///
1443    /// Returns [`ContextError`] if summarization fails (LLM error or timeout).
1444    pub async fn compact_context(
1445        &self,
1446        summ: &mut ContextSummarizationView<'_>,
1447        max_summary_tokens: Option<usize>,
1448    ) -> Result<crate::state::CompactionOutcome, crate::error::ContextError> {
1449        crate::summarization::compaction::compact_context(summ, max_summary_tokens).await
1450    }
1451
1452    /// Apply a soft compaction pass mid-iteration if required.
1453    ///
1454    /// Applies deferred summaries and prunes tool outputs down to the soft threshold.
1455    /// Never triggers a Hard tier LLM call. Returns immediately if `compacted_this_turn`
1456    /// is set or context is below the soft threshold.
1457    pub fn maybe_soft_compact_mid_iteration(&self, summ: &mut ContextSummarizationView<'_>) {
1458        crate::summarization::scheduling::maybe_soft_compact_mid_iteration(summ);
1459    }
1460
1461    /// Run proactive compression if token usage crosses the configured threshold.
1462    ///
1463    /// Uses the `compact_context_with_budget` path (LLM summarization with an optional
1464    /// token cap). Skips when server compaction is active unless context exceeds 95% of
1465    /// the budget. Does not impose a post-compaction cooldown.
1466    pub async fn maybe_proactive_compress(
1467        &self,
1468        summ: &mut ContextSummarizationView<'_>,
1469        status: &(impl StatusSink + ?Sized),
1470    ) {
1471        let Some((_threshold, max_summary_tokens)) = summ
1472            .context_manager
1473            .should_proactively_compress(*summ.cached_prompt_tokens)
1474        else {
1475            return;
1476        };
1477
1478        if summ.server_compaction_active {
1479            let budget = summ
1480                .context_manager
1481                .budget
1482                .as_ref()
1483                .map_or(0, ContextBudget::max_tokens);
1484            if budget > 0 {
1485                let fallback = (budget * 95 / 100) as u64;
1486                if *summ.cached_prompt_tokens <= fallback {
1487                    return;
1488                }
1489                tracing::warn!(
1490                    cached_prompt_tokens = *summ.cached_prompt_tokens,
1491                    fallback_threshold = fallback,
1492                    "server compaction active but context at 95%+ — falling back to proactive"
1493                );
1494            } else {
1495                return;
1496            }
1497        }
1498
1499        status.send_status("compressing context...").await;
1500        tracing::info!(
1501            max_summary_tokens,
1502            cached_tokens = *summ.cached_prompt_tokens,
1503            "proactive compression triggered"
1504        );
1505
1506        match crate::summarization::compaction::compact_context(summ, Some(max_summary_tokens))
1507            .await
1508        {
1509            Ok(outcome) if outcome.is_compacted() => {
1510                summ.context_manager.set_compaction_state(
1511                    zeph_context::manager::CompactionState::CompactedThisTurn { cooldown: 0 },
1512                );
1513                tracing::info!("proactive compression complete");
1514            }
1515            Ok(_) => {}
1516            Err(e) => tracing::warn!(%e, "proactive compression failed"),
1517        }
1518
1519        status.send_status("").await;
1520    }
1521
1522    /// Refresh the task goal when the last user message has changed.
1523    ///
1524    /// Two-phase non-blocking: applies any completed background result from the previous
1525    /// turn, then schedules a new extraction if the user message hash has changed.
1526    /// Only active for `TaskAware` and `Mig` pruning strategies.
1527    pub fn maybe_refresh_task_goal(&self, summ: &mut ContextSummarizationView<'_>) {
1528        crate::summarization::scheduling::maybe_refresh_task_goal(summ);
1529    }
1530
1531    /// Refresh the subgoal registry when the last user message has changed.
1532    ///
1533    /// Mirrors the two-phase `maybe_refresh_task_goal` pattern.
1534    /// Only active for `Subgoal` and `SubgoalMig` pruning strategies.
1535    pub fn maybe_refresh_subgoal(&self, summ: &mut ContextSummarizationView<'_>) {
1536        crate::summarization::scheduling::maybe_refresh_subgoal(summ);
1537    }
1538}
1539
1540// ── StatusSink adapters ───────────────────────────────────────────────────────
1541
1542/// `StatusSink` adapter over an optional `UnboundedSender<String>`.
1543///
1544/// Sends status strings when the sender is present; silently drops them otherwise.
1545struct TxStatusSink(Option<tokio::sync::mpsc::UnboundedSender<String>>);
1546
1547impl StatusSink for TxStatusSink {
1548    fn send_status(&self, msg: &str) -> impl std::future::Future<Output = ()> + Send + '_ {
1549        if let Some(ref tx) = self.0 {
1550            let _ = tx.send(msg.to_owned());
1551        }
1552        std::future::ready(())
1553    }
1554}
1555
1556// ── Free functions (helpers shared across service methods) ────────────────────
1557
1558/// Recompute `cached_prompt_tokens` from the current message list.
1559///
1560/// Called after every mutation that changes the message count or content, so the
1561/// provider call path always sees an accurate token count.
1562pub(crate) fn recompute_prompt_tokens(window: &mut MessageWindowView<'_>) {
1563    *window.cached_prompt_tokens = window
1564        .messages
1565        .iter()
1566        .map(|m| window.token_counter.count_message_tokens(m) as u64)
1567        .sum();
1568}
1569
1570/// Persist fidelity tags for all scored messages to `SQLite`.
1571///
1572/// Collects `(db_id, tag as u8)` pairs for messages that have both a `db_id` and a
1573/// non-None `fidelity_tag`, then calls [`SqliteStore::update_fidelity_tags`] inline.
1574/// The await is cheap — `SQLite` UPDATE is a sub-millisecond local I/O operation.
1575///
1576/// A warn-level log is emitted on failure; the next turn will recompute from scratch,
1577/// which is safe (the floor invariant simply won't apply until persistence succeeds).
1578async fn persist_fidelity_tags(
1579    messages: &[zeph_llm::provider::Message],
1580    memory: Option<&zeph_memory::semantic::SemanticMemory>,
1581) {
1582    let Some(mem) = memory else { return };
1583    let updates: Vec<(zeph_memory::MessageId, u8)> = messages
1584        .iter()
1585        .filter_map(|m| {
1586            let db_id = m.metadata.db_id?;
1587            let tag = m.metadata.fidelity_tag?;
1588            Some((zeph_memory::MessageId(db_id), tag as u8))
1589        })
1590        .collect();
1591    if updates.is_empty() {
1592        return;
1593    }
1594    if let Err(e) = mem.sqlite().update_fidelity_tags(&updates).await {
1595        tracing::warn!(
1596            count = updates.len(),
1597            error = %e,
1598            "failed to persist fidelity tags; floor invariant will not apply next turn"
1599        );
1600    }
1601}
1602
1603/// Recompute `cached_prompt_tokens` for a [`ContextSummarizationView`].
1604///
1605/// Used after the `AgeMem` proactive regrade modifies the message window in `maybe_compact`.
1606fn recompute_prompt_tokens_summ(summ: &mut crate::state::ContextSummarizationView<'_>) {
1607    *summ.cached_prompt_tokens = summ
1608        .messages
1609        .iter()
1610        .map(|m| summ.token_counter.count_message_tokens(m) as u64)
1611        .sum();
1612}
1613
1614/// Remove all system/user messages whose `content` starts with `prefix` and whose
1615/// role matches `role`.
1616///
1617/// Operates on the raw `messages` slice to allow callers that don't hold a full
1618/// `MessageWindowView` to use this helper (e.g., from `zeph-core` shims).
1619pub(crate) fn remove_by_prefix(
1620    messages: &mut Vec<zeph_llm::provider::Message>,
1621    role: Role,
1622    prefix: &str,
1623) {
1624    messages.retain(|m| m.role != role || !m.content.starts_with(prefix));
1625}
1626
1627/// Remove messages that match either a typed `MessagePart` or a content prefix.
1628///
1629/// For `Role::System` messages: typed-part matching takes priority — a message is removed
1630/// if its **first** part satisfies `part_matches`. As a fallback, messages that start with
1631/// `prefix` are also removed.
1632/// For `Role::User` messages: removed if their content starts with `prefix` (tiered-recall
1633/// cleanup).
1634/// All other roles are always retained.
1635pub(crate) fn remove_by_part_or_prefix(
1636    messages: &mut Vec<zeph_llm::provider::Message>,
1637    prefix: &str,
1638    part_matches: impl Fn(&MessagePart) -> bool,
1639) {
1640    messages.retain(|m| {
1641        // Role::User recall messages are produced by the tiered-retrieval path in
1642        // inject_semantic_recall. They must be cleaned up the same way as Role::System ones.
1643        if m.role == Role::User {
1644            return !m.content.starts_with(prefix);
1645        }
1646        if m.role != Role::System {
1647            return true;
1648        }
1649        if m.parts.first().is_some_and(&part_matches) {
1650            return false;
1651        }
1652        !m.content.starts_with(prefix)
1653    });
1654}
1655
1656#[cfg(test)]
1657mod tests {
1658    use std::collections::HashSet;
1659    use std::sync::Arc;
1660
1661    use zeph_llm::provider::{Message, MessagePart, Role};
1662    use zeph_memory::TokenCounter;
1663
1664    use super::*;
1665    use crate::helpers::{GRAPH_FACTS_PREFIX, RECALL_PREFIX, SUMMARY_PREFIX};
1666    use crate::state::MessageWindowView;
1667
1668    fn make_counter() -> Arc<TokenCounter> {
1669        Arc::new(TokenCounter::default())
1670    }
1671
1672    fn make_window<'a>(
1673        messages: &'a mut Vec<Message>,
1674        cached: &'a mut u64,
1675        completed: &'a mut HashSet<String>,
1676    ) -> MessageWindowView<'a> {
1677        let last = Box::leak(Box::new(None::<i64>));
1678        let deferred_hide = Box::leak(Box::new(Vec::<i64>::new()));
1679        let deferred_summ = Box::leak(Box::new(Vec::<String>::new()));
1680        MessageWindowView {
1681            messages,
1682            last_persisted_message_id: last,
1683            deferred_db_hide_ids: deferred_hide,
1684            deferred_db_summaries: deferred_summ,
1685            cached_prompt_tokens: cached,
1686            token_counter: make_counter(),
1687            completed_tool_ids: completed,
1688        }
1689    }
1690
1691    fn sys(text: &str) -> Message {
1692        Message::from_legacy(Role::System, text)
1693    }
1694
1695    fn user(text: &str) -> Message {
1696        Message::from_legacy(Role::User, text)
1697    }
1698
1699    fn assistant(text: &str) -> Message {
1700        Message::from_legacy(Role::Assistant, text)
1701    }
1702
// Clearing history must leave only the leading system prompt and empty the
// completed-tool set.
#[test]
fn clear_history_keeps_system_prompt() {
    let mut history = vec![sys("system"), user("hello"), assistant("hi")];
    let mut token_cache = 0u64;
    let mut tool_ids = HashSet::from(["tool_1".to_owned()]);
    let mut window = make_window(&mut history, &mut token_cache, &mut tool_ids);

    let service = ContextService::new();
    service.clear_history(&mut window);

    assert_eq!(window.messages.len(), 1);
    assert_eq!(window.messages[0].content, "system");
    assert!(
        window.completed_tool_ids.is_empty(),
        "completed_tool_ids must be cleared"
    );
}
1720
// Clearing an already empty window must leave it empty.
#[test]
fn clear_history_empty_messages_is_noop() {
    let mut history = Vec::<Message>::new();
    let mut token_cache = 0u64;
    let mut tool_ids = HashSet::new();
    let mut window = make_window(&mut history, &mut token_cache, &mut tool_ids);

    let service = ContextService::new();
    service.clear_history(&mut window);

    assert!(window.messages.is_empty());
}
1732
// A system message carrying RECALL_PREFIX must be dropped; the others stay.
#[test]
fn remove_recall_messages_removes_by_prefix() {
    let recalled = format!("{RECALL_PREFIX}some recalled text");
    let mut history = vec![sys("system"), sys(&recalled), user("hello")];
    let mut token_cache = 0u64;
    let mut tool_ids = HashSet::new();
    let mut window = make_window(&mut history, &mut token_cache, &mut tool_ids);

    let service = ContextService::new();
    service.remove_recall_messages(&mut window);

    assert_eq!(window.messages.len(), 2);
    let recall_left = window
        .messages
        .iter()
        .any(|m| m.content.starts_with(RECALL_PREFIX));
    assert!(!recall_left);
}
1754
// Regression test for #4019: remove_recall_messages must drop recall messages with
// Role::User as well, not only the Role::System ones.
#[test]
fn remove_recall_messages_removes_user_role_recall() {
    let recalled = format!("{RECALL_PREFIX}recalled via tiered path");
    let mut history = vec![sys("system"), user(&recalled), user("real user message")];
    let mut token_cache = 0u64;
    let mut tool_ids = HashSet::new();
    let mut window = make_window(&mut history, &mut token_cache, &mut tool_ids);

    let service = ContextService::new();
    service.remove_recall_messages(&mut window);

    assert_eq!(
        window.messages.len(),
        2,
        "Role::User recall message must be removed"
    );

    let recall_left = window
        .messages
        .iter()
        .any(|m| m.content.starts_with(RECALL_PREFIX));
    assert!(!recall_left, "no message with RECALL_PREFIX must remain");

    let real_message_kept = window
        .messages
        .iter()
        .any(|m| m.content == "real user message");
    assert!(real_message_kept, "non-recall user message must survive");
}
1790
// One of the three messages carries GRAPH_FACTS_PREFIX; two must remain afterwards.
#[test]
fn remove_graph_facts_messages_removes_matching() {
    let graph_facts = format!("{GRAPH_FACTS_PREFIX}fact1");
    let mut history = vec![sys("system"), sys(&graph_facts), user("hello")];
    let mut token_cache = 0u64;
    let mut tool_ids = HashSet::new();
    let mut window = make_window(&mut history, &mut token_cache, &mut tool_ids);

    let service = ContextService::new();
    service.remove_graph_facts_messages(&mut window);

    assert_eq!(window.messages.len(), 2);
}
1806
// A system message whose first part is a typed `MessagePart::Summary` must be
// dropped; two messages must remain afterwards.
#[test]
fn remove_summary_messages_removes_by_part() {
    let summary_part = MessagePart::Summary {
        text: format!("{SUMMARY_PREFIX}old summary"),
    };
    let summary_message = Message::from_parts(Role::System, vec![summary_part]);
    let mut history = vec![sys("system"), summary_message, user("hello")];
    let mut token_cache = 0u64;
    let mut tool_ids = HashSet::new();
    let mut window = make_window(&mut history, &mut token_cache, &mut tool_ids);

    let service = ContextService::new();
    service.remove_summary_messages(&mut window);

    assert_eq!(window.messages.len(), 2);
}
1827
// A budget of 0 must leave the message count unchanged.
#[test]
fn trim_messages_to_budget_zero_is_noop() {
    let mut history = vec![sys("system"), user("a"), assistant("b"), user("c")];
    let len_before = history.len();
    let mut token_cache = 0u64;
    let mut tool_ids = HashSet::new();
    let mut window = make_window(&mut history, &mut token_cache, &mut tool_ids);

    let service = ContextService::new();
    service.trim_messages_to_budget(&mut window, 0);

    assert_eq!(window.messages.len(), len_before);
}
1840
// With a very small budget, older messages are expected to be trimmed while the
// system prompt stays in place.
#[test]
fn trim_messages_to_budget_keeps_recent() {
    let mut history = vec![
        sys("system"),
        user("message 1"),
        assistant("reply 1"),
        user("message 2"),
    ];
    let mut token_cache = 0u64;
    let mut tool_ids = HashSet::new();
    let mut window = make_window(&mut history, &mut token_cache, &mut tool_ids);

    // A 1-token budget leaves room for the most recent message at best.
    let service = ContextService::new();
    service.trim_messages_to_budget(&mut window, 1);

    // Checked here: something was trimmed, and the system prompt is still first.
    let remaining = window.messages.len();
    assert!(remaining < 4, "trim should remove some messages");
    assert_eq!(
        window.messages[0].role,
        Role::System,
        "system prompt must survive trim"
    );
}
1868
1869    // AC-12: inserted_count must equal the number of non-None memory fields injected.
1870    // Tests that every Some(msg) field in PreparedContext increments inserted_count by 1.
1871    mod inserted_count_tests {
1872        use parking_lot::RwLock;
1873        use std::borrow::Cow;
1874        use std::collections::HashSet;
1875        use std::sync::Arc;
1876
1877        use zeph_common::SecurityEventCategory;
1878        use zeph_config::memory::TieredRetrievalConfig;
1879        use zeph_config::{
1880            ContextFormat, ContextStrategy, DocumentConfig, GraphConfig, PersonaConfig,
1881            ReasoningConfig, TrajectoryConfig, TreeConfig,
1882        };
1883        use zeph_context::assembler::PreparedContext;
1884        use zeph_context::manager::ContextManager;
1885        use zeph_llm::provider::{Message, MessageMetadata, Role};
1886        use zeph_memory::TokenCounter;
1887        use zeph_sanitizer::ContentIsolationConfig;
1888        use zeph_sanitizer::ContentSanitizer;
1889        use zeph_skills::registry::SkillRegistry;
1890
1891        use super::super::*;
1892        use crate::state::{
1893            ContextAssemblyView, MessageWindowView, MetricsCounters, SecurityEventSink,
1894        };
1895
1896        fn make_task_supervisor() -> Arc<zeph_common::TaskSupervisor> {
1897            Arc::new(zeph_common::TaskSupervisor::new(
1898                tokio_util::sync::CancellationToken::new(),
1899            ))
1900        }
1901
1902        struct NoopSink;
1903        impl SecurityEventSink for NoopSink {
1904            fn push(&mut self, _: SecurityEventCategory, _: &'static str, _: String) {}
1905        }
1906
1907        fn make_counter() -> Arc<TokenCounter> {
1908            Arc::new(TokenCounter::default())
1909        }
1910
1911        fn make_window<'a>(
1912            messages: &'a mut Vec<Message>,
1913            cached: &'a mut u64,
1914            completed: &'a mut HashSet<String>,
1915        ) -> MessageWindowView<'a> {
1916            let last = Box::leak(Box::new(None::<i64>));
1917            let deferred_hide = Box::leak(Box::new(Vec::<i64>::new()));
1918            let deferred_summ = Box::leak(Box::new(Vec::<String>::new()));
1919            MessageWindowView {
1920                messages,
1921                last_persisted_message_id: last,
1922                deferred_db_hide_ids: deferred_hide,
1923                deferred_db_summaries: deferred_summ,
1924                cached_prompt_tokens: cached,
1925                token_counter: make_counter(),
1926                completed_tool_ids: completed,
1927            }
1928        }
1929
1930        fn mem_msg(content: &str) -> Message {
1931            Message {
1932                role: Role::User,
1933                content: content.to_string(),
1934                parts: vec![],
1935                metadata: MessageMetadata::default(),
1936            }
1937        }
1938
1939        fn scrub_noop(s: &str) -> Cow<'_, str> {
1940            Cow::Borrowed(s)
1941        }
1942
1943        #[tokio::test]
1944        async fn inserted_count_incremented_for_all_paths() {
1945            // AC-12: each non-None field in PreparedContext increments inserted_count by 1.
1946            // 10 memory fields are tested here (session_digest is controlled by digest_enabled).
1947            let mut msgs = vec![
1948                Message::from_legacy(Role::System, "system"),
1949                Message::from_legacy(Role::User, "user turn"),
1950            ];
1951            let mut cached = 0u64;
1952            let mut completed = HashSet::new();
1953            let mut window = make_window(&mut msgs, &mut cached, &mut completed);
1954
1955            let sanitizer = ContentSanitizer::new(&ContentIsolationConfig::default());
1956            let mut ctx_mgr = ContextManager::new();
1957            let mut sink = NoopSink;
1958            let mut last_confidence = None::<f32>;
1959            let mut last_skills_prompt = String::new();
1960            let mut active_skill_names = Vec::new();
1961            let registry = Arc::new(RwLock::new(SkillRegistry::default()));
1962
1963            let mut view = ContextAssemblyView {
1964                memory: None,
1965                conversation_id: None,
1966                recall_limit: 10,
1967                cross_session_score_threshold: 0.5,
1968                context_format: ContextFormat::default(),
1969                last_recall_confidence: &mut last_confidence,
1970                context_strategy: ContextStrategy::default(),
1971                crossover_turn_threshold: 0,
1972                cached_session_digest: None,
1973                digest_enabled: false, // no session digest injection in this test
1974                graph_config: GraphConfig::default(),
1975                document_config: DocumentConfig::default(),
1976                persona_config: PersonaConfig::default(),
1977                trajectory_config: TrajectoryConfig::default(),
1978                reasoning_config: ReasoningConfig::default(),
1979                memcot_config: zeph_config::MemCotConfig::default(),
1980                memcot_state: None,
1981                tree_config: TreeConfig::default(),
1982                last_skills_prompt: &mut last_skills_prompt,
1983                active_skill_names: &mut active_skill_names,
1984                skill_registry: registry,
1985                skill_paths: &[],
1986                correction_config: None,
1987                sidequest_turn_counter: 0,
1988                proactive_explorer: None,
1989                sanitizer: &sanitizer,
1990                quarantine_summarizer: None,
1991                context_manager: &mut ctx_mgr,
1992                token_counter: make_counter(),
1993                metrics: MetricsCounters::default(),
1994                security_events: &mut sink,
1995                cached_prompt_tokens: 0,
1996                redact_credentials: false,
1997                channel_skills: &[],
1998                scrub: scrub_noop,
1999                tiered_retrieval_config: TieredRetrievalConfig {
2000                    enabled: false,
2001                    ..TieredRetrievalConfig::default()
2002                },
2003                tiered_retrieval_classifier: None,
2004                tiered_retrieval_validator: None,
2005                fidelity_config: None,
2006                fidelity_semantic_provider: None,
2007                fidelity_compress_provider: None,
2008                planned_next_tools: &[],
2009                status_tx: None,
2010                task_supervisor: make_task_supervisor(),
2011            };
2012
2013            // Populate all 10 message-carrying fields.
2014            let prepared = PreparedContext {
2015                graph_facts: Some(mem_msg("graph_facts")),
2016                doc_rag: Some(mem_msg("doc_rag")),
2017                corrections: Some(mem_msg("corrections")),
2018                recall: Some(mem_msg("recall")),
2019                recall_confidence: Some(0.9),
2020                cross_session: Some(mem_msg("cross_session")),
2021                summaries: Some(mem_msg("summaries")),
2022                code_context: None, // code_context returns via ContextDelta, not inserted_count
2023                persona_facts: Some(mem_msg("persona_facts")),
2024                trajectory_hints: Some(mem_msg("trajectory_hints")),
2025                tree_memory: Some(mem_msg("tree_memory")),
2026                reasoning_hints: Some(mem_msg("reasoning_hints")),
2027                memory_first: false,
2028                recent_history_budget: 100_000,
2029                background_tasks: vec![],
2030            };
2031
2032            let (_delta, inserted_count) = ContextService::new()
2033                .apply_prepared_context(&mut window, &mut view, prepared)
2034                .await;
2035
2036            // 10 message fields were Some(msg): graph_facts, doc_rag, corrections, recall,
2037            // cross_session, summaries, persona_facts, trajectory_hints, tree_memory, reasoning_hints.
2038            assert_eq!(
2039                inserted_count, 10,
2040                "all 10 message-carrying PreparedContext fields must increment inserted_count"
2041            );
2042        }
2043    }
2044
2045    mod inject_semantic_recall_tests {
2046        use parking_lot::RwLock;
2047        use std::borrow::Cow;
2048        use std::collections::HashSet;
2049        use std::sync::Arc;
2050
2051        use zeph_config::memory::TieredRetrievalConfig;
2052        use zeph_config::{
2053            ContextFormat, ContextStrategy, DocumentConfig, GraphConfig, PersonaConfig,
2054            ReasoningConfig, TrajectoryConfig, TreeConfig,
2055        };
2056        use zeph_context::manager::ContextManager;
2057        use zeph_llm::provider::Message;
2058        use zeph_memory::TokenCounter;
2059        use zeph_sanitizer::ContentIsolationConfig;
2060        use zeph_sanitizer::ContentSanitizer;
2061        use zeph_skills::registry::SkillRegistry;
2062
2063        use zeph_common::SecurityEventCategory;
2064
2065        use super::super::*;
2066        use crate::helpers::RECALL_PREFIX;
2067        use crate::state::{
2068            ContextAssemblyView, MessageWindowView, MetricsCounters, SecurityEventSink,
2069        };
2070
2071        fn make_task_supervisor() -> Arc<zeph_common::TaskSupervisor> {
2072            Arc::new(zeph_common::TaskSupervisor::new(
2073                tokio_util::sync::CancellationToken::new(),
2074            ))
2075        }
2076
2077        struct NoopSink;
2078        impl SecurityEventSink for NoopSink {
2079            fn push(&mut self, _: SecurityEventCategory, _: &'static str, _: String) {}
2080        }
2081
2082        fn make_counter() -> Arc<TokenCounter> {
2083            Arc::new(TokenCounter::default())
2084        }
2085
2086        fn make_window<'a>(
2087            messages: &'a mut Vec<Message>,
2088            cached: &'a mut u64,
2089            completed: &'a mut HashSet<String>,
2090        ) -> MessageWindowView<'a> {
2091            let last = Box::leak(Box::new(None::<i64>));
2092            let deferred_hide = Box::leak(Box::new(Vec::<i64>::new()));
2093            let deferred_summ = Box::leak(Box::new(Vec::<String>::new()));
2094            MessageWindowView {
2095                messages,
2096                last_persisted_message_id: last,
2097                deferred_db_hide_ids: deferred_hide,
2098                deferred_db_summaries: deferred_summ,
2099                cached_prompt_tokens: cached,
2100                token_counter: make_counter(),
2101                completed_tool_ids: completed,
2102            }
2103        }
2104
/// Scrubber that hands the input back untouched, as a borrowed `Cow`.
fn scrub_noop(s: &str) -> Cow<'_, str> {
    Cow::from(s)
}
2108
2109        #[tokio::test]
2110        async fn tiered_recall_disabled_uses_flat_path() {
2111            // With tiered_retrieval disabled and no memory, inject_semantic_recall must
2112            // return Ok(()) without inserting any recall message (flat path returns empty).
2113            let mut msgs: Vec<Message> = vec![];
2114            let mut cached = 0u64;
2115            let mut completed = HashSet::new();
2116            let mut window = make_window(&mut msgs, &mut cached, &mut completed);
2117
2118            let sanitizer = ContentSanitizer::new(&ContentIsolationConfig::default());
2119            let mut ctx_mgr = ContextManager::new();
2120            let mut sink = NoopSink;
2121            let mut last_confidence = None::<f32>;
2122            let mut last_skills_prompt = String::new();
2123            let mut active_skill_names = Vec::new();
2124            let registry = Arc::new(RwLock::new(SkillRegistry::default()));
2125
2126            let view = ContextAssemblyView {
2127                memory: None,
2128                conversation_id: None,
2129                recall_limit: 10,
2130                cross_session_score_threshold: 0.5,
2131                context_format: ContextFormat::default(),
2132                last_recall_confidence: &mut last_confidence,
2133                context_strategy: ContextStrategy::default(),
2134                crossover_turn_threshold: 0,
2135                cached_session_digest: None,
2136                digest_enabled: false,
2137                graph_config: GraphConfig::default(),
2138                document_config: DocumentConfig::default(),
2139                persona_config: PersonaConfig::default(),
2140                trajectory_config: TrajectoryConfig::default(),
2141                reasoning_config: ReasoningConfig::default(),
2142                memcot_config: zeph_config::MemCotConfig::default(),
2143                memcot_state: None,
2144                tree_config: TreeConfig::default(),
2145                last_skills_prompt: &mut last_skills_prompt,
2146                active_skill_names: &mut active_skill_names,
2147                skill_registry: registry,
2148                skill_paths: &[],
2149                correction_config: None,
2150                sidequest_turn_counter: 0,
2151                proactive_explorer: None,
2152                sanitizer: &sanitizer,
2153                quarantine_summarizer: None,
2154                context_manager: &mut ctx_mgr,
2155                token_counter: make_counter(),
2156                metrics: MetricsCounters::default(),
2157                security_events: &mut sink,
2158                cached_prompt_tokens: 0,
2159                redact_credentials: false,
2160                channel_skills: &[],
2161                scrub: scrub_noop,
2162                tiered_retrieval_config: TieredRetrievalConfig {
2163                    enabled: false,
2164                    ..TieredRetrievalConfig::default()
2165                },
2166                tiered_retrieval_classifier: None,
2167                tiered_retrieval_validator: None,
2168                fidelity_config: None,
2169                fidelity_semantic_provider: None,
2170                fidelity_compress_provider: None,
2171                planned_next_tools: &[],
2172                status_tx: None,
2173                task_supervisor: make_task_supervisor(),
2174            };
2175
2176            let result = ContextService::new()
2177                .inject_semantic_recall("test query", 1000, &mut window, &view)
2178                .await;
2179
2180            assert!(result.is_ok(), "disabled tiered recall must return Ok(())");
2181            assert!(
2182                window
2183                    .messages
2184                    .iter()
2185                    .all(|m| !m.content.starts_with(RECALL_PREFIX)),
2186                "no recall message must be injected when memory is None"
2187            );
2188        }
2189
2190        #[tokio::test]
2191        async fn tiered_recall_enabled_no_memory_returns_ok() {
2192            // With tiered_retrieval enabled but memory = None, inject_semantic_recall must
2193            // return Ok(()) via the early-return guard without inserting any recall message.
                //
                // The window starts from an empty message vector, so the final `is_empty()`
                // assertion fails if the call under test adds anything at all.
                // NOTE(review): presumably `window.messages` is backed by `msgs` — confirm
                // against `make_window`.
2194            let mut msgs: Vec<Message> = vec![];
2195            let mut cached = 0u64;
2196            let mut completed = HashSet::new();
2197            let mut window = make_window(&mut msgs, &mut cached, &mut completed);
2198
                // Owned fixtures that the view below borrows from; they must be declared
                // before the view so the borrows stay valid for the whole call.
2199            let sanitizer = ContentSanitizer::new(&ContentIsolationConfig::default());
2200            let mut ctx_mgr = ContextManager::new();
2201            let mut sink = NoopSink;
2202            let mut last_confidence = None::<f32>;
2203            let mut last_skills_prompt = String::new();
2204            let mut active_skill_names = Vec::new();
2205            let registry = Arc::new(RwLock::new(SkillRegistry::default()));
2206
                // Assembly view with no memory backend. Apart from the fixed sample numbers for
                // `recall_limit` and `cross_session_score_threshold`, every field is a default,
                // `None`, or an empty value; the one deliberate override is the tiered
                // retrieval flag further down.
2207            let view = ContextAssemblyView {
                    // The condition under test: no memory backend is available.
2208                memory: None,
2209                conversation_id: None,
2210                recall_limit: 10,
2211                cross_session_score_threshold: 0.5,
2212                context_format: ContextFormat::default(),
2213                last_recall_confidence: &mut last_confidence,
2214                context_strategy: ContextStrategy::default(),
2215                crossover_turn_threshold: 0,
2216                cached_session_digest: None,
2217                digest_enabled: false,
2218                graph_config: GraphConfig::default(),
2219                document_config: DocumentConfig::default(),
2220                persona_config: PersonaConfig::default(),
2221                trajectory_config: TrajectoryConfig::default(),
2222                reasoning_config: ReasoningConfig::default(),
2223                memcot_config: zeph_config::MemCotConfig::default(),
2224                memcot_state: None,
2225                tree_config: TreeConfig::default(),
2226                last_skills_prompt: &mut last_skills_prompt,
2227                active_skill_names: &mut active_skill_names,
2228                skill_registry: registry,
2229                skill_paths: &[],
2230                correction_config: None,
2231                sidequest_turn_counter: 0,
2232                proactive_explorer: None,
2233                sanitizer: &sanitizer,
2234                quarantine_summarizer: None,
2235                context_manager: &mut ctx_mgr,
2236                token_counter: make_counter(),
2237                metrics: MetricsCounters::default(),
2238                security_events: &mut sink,
2239                cached_prompt_tokens: 0,
2240                redact_credentials: false,
2241                channel_skills: &[],
2242                scrub: scrub_noop,
                    // Tiered retrieval switched on; all other tiered settings stay at their
                    // defaults, and no classifier or validator provider is supplied.
2243                tiered_retrieval_config: TieredRetrievalConfig {
2244                    enabled: true,
2245                    ..TieredRetrievalConfig::default()
2246                },
2247                tiered_retrieval_classifier: None,
2248                tiered_retrieval_validator: None,
2249                fidelity_config: None,
2250                fidelity_semantic_provider: None,
2251                fidelity_compress_provider: None,
2252                planned_next_tools: &[],
2253                status_tx: None,
2254                task_supervisor: make_task_supervisor(),
2255            };
2256
                // "test query" and 1000 are arbitrary sample inputs.
                // NOTE(review): 1000 is presumably the token budget — confirm against the
                // `inject_semantic_recall` signature.
2257            let result = ContextService::new()
2258                .inject_semantic_recall("test query", 1000, &mut window, &view)
2259                .await;
2260
2261            assert!(
2262                result.is_ok(),
2263                "enabled tiered recall with no memory must return Ok(())"
2264            );
                // Stricter than a prefix check: the window must contain no messages at all.
2265            assert!(
2266                window.messages.is_empty(),
2267                "no recall message must be injected when memory is None"
2268            );
2269        }
2270
2271        // Regression test for #3996: prepare_context must call inject_semantic_recall when
2272        // tiered_retrieval.enabled = true. When context_manager.budget is None the function
2273        // returns early with Ok(ContextDelta::default()); this test verifies that early-return
2274        // path compiles and does not panic with the new conditional blocks in place.
            //
            // NOTE(review): despite the `_returns_default` suffix, the assertion at the end only
            // checks `result.is_ok()`; the returned value is not compared against
            // `ContextDelta::default()` and `window` is not inspected afterwards.
2275        #[tokio::test]
2276        async fn prepare_context_tiered_enabled_no_budget_returns_default() {
                // Empty message window passed to `prepare_context` as the mutable target.
2277            let mut msgs: Vec<zeph_llm::provider::Message> = vec![];
2278            let mut cached = 0u64;
2279            let mut completed = HashSet::new();
2280            let mut window = make_window(&mut msgs, &mut cached, &mut completed);
2281
2282            let sanitizer = zeph_sanitizer::ContentSanitizer::new(
2283                &zeph_sanitizer::ContentIsolationConfig::default(),
2284            );
2285            let mut ctx_mgr = zeph_context::manager::ContextManager::new();
2286            // budget = None → prepare_context returns Ok(ContextDelta::default()) immediately.
                // Guard the precondition explicitly so the test fails loudly if a freshly
                // constructed manager ever starts out with a budget.
2287            assert!(ctx_mgr.budget.is_none());
2288
                // Owned fixtures borrowed by the view below.
2289            let mut sink = NoopSink;
2290            let mut last_confidence = None::<f32>;
2291            let mut last_skills_prompt = String::new();
2292            let mut active_skill_names = Vec::new();
2293            let registry = Arc::new(RwLock::new(zeph_skills::registry::SkillRegistry::default()));
2294
                // Declared `mut` because `prepare_context` takes the view by `&mut`.
                // No memory backend is configured and tiered retrieval is enabled; the other
                // fields are defaults, `None`, empty values, or fixed sample numbers.
2295            let mut view = ContextAssemblyView {
2296                memory: None,
2297                conversation_id: None,
2298                recall_limit: 10,
2299                cross_session_score_threshold: 0.5,
2300                context_format: ContextFormat::default(),
2301                last_recall_confidence: &mut last_confidence,
2302                context_strategy: ContextStrategy::default(),
2303                crossover_turn_threshold: 0,
2304                cached_session_digest: None,
2305                digest_enabled: false,
2306                graph_config: GraphConfig::default(),
2307                document_config: DocumentConfig::default(),
2308                persona_config: PersonaConfig::default(),
2309                trajectory_config: TrajectoryConfig::default(),
2310                reasoning_config: ReasoningConfig::default(),
2311                memcot_config: zeph_config::MemCotConfig::default(),
2312                memcot_state: None,
2313                tree_config: TreeConfig::default(),
2314                last_skills_prompt: &mut last_skills_prompt,
2315                active_skill_names: &mut active_skill_names,
2316                skill_registry: registry,
2317                skill_paths: &[],
2318                correction_config: None,
2319                sidequest_turn_counter: 0,
2320                proactive_explorer: None,
2321                sanitizer: &sanitizer,
2322                quarantine_summarizer: None,
                    // Carries the budget-less manager asserted above.
2323                context_manager: &mut ctx_mgr,
2324                token_counter: make_counter(),
2325                metrics: MetricsCounters::default(),
2326                security_events: &mut sink,
2327                cached_prompt_tokens: 0,
2328                redact_credentials: false,
2329                channel_skills: &[],
2330                scrub: scrub_noop,
                    // Tiered retrieval switched on; remaining tiered settings are defaults.
2331                tiered_retrieval_config: TieredRetrievalConfig {
2332                    enabled: true,
2333                    ..TieredRetrievalConfig::default()
2334                },
2335                tiered_retrieval_classifier: None,
2336                tiered_retrieval_validator: None,
2337                fidelity_config: None,
2338                fidelity_semantic_provider: None,
2339                fidelity_compress_provider: None,
2340                planned_next_tools: &[],
2341                status_tx: None,
2342                task_supervisor: make_task_supervisor(),
2343            };
2344
2345            let result = ContextService::new()
2346                .prepare_context("test query", &mut window, &mut view)
2347                .await;
2348
2349            assert!(
2350                result.is_ok(),
2351                "prepare_context with tiered enabled and no budget must return Ok"
2352            );
2353        }
2354
2355        // Regression test for #4022: inject_semantic_recall_bare must be callable without a
2356        // full ContextAssemblyView and must return Ok(()) when memory is None.
2357        #[tokio::test]
2358        async fn inject_semantic_recall_bare_no_memory_returns_ok() {
                // Function-local import of the config type used to build `tiered_config`.
2359            use zeph_config::memory::TieredRetrievalConfig;
2360
                // Empty message window, so the final `is_empty()` assertion detects any
                // message added by the call under test.
2361            let mut msgs: Vec<Message> = vec![];
2362            let mut cached = 0u64;
2363            let mut completed = HashSet::new();
2364            let mut window = make_window(&mut msgs, &mut cached, &mut completed);
2365
                // Tiered retrieval enabled, all other tiered settings left at their defaults.
                // Bound to a local because `SemanticRecallParams` holds it by reference.
2366            let tiered_config = TieredRetrievalConfig {
2367                enabled: true,
2368                ..TieredRetrievalConfig::default()
2369            };
                // Only the configuration-style arguments go into the params struct; no
                // conversation scope, classifier, or validator is supplied.
2370            let params = SemanticRecallParams {
2371                query: "test query",
2372                token_budget: 1000,
2373                recall_limit: 10,
2374                context_format: zeph_config::ContextFormat::default(),
2375                conversation_id: None,
2376                tiered_classifier: None,
2377                tiered_validator: None,
2378                tiered_config: &tiered_config,
2379            };
                // The window and the memory handle are passed directly rather than through
                // `params`; the trailing `None` is the absent memory backend under test.
2380            let result = ContextService::new()
2381                .inject_semantic_recall_bare(params, &mut window, None)
2382                .await;
2383
2384            assert!(
2385                result.is_ok(),
2386                "inject_semantic_recall_bare with memory=None must return Ok(())"
2387            );
2388            assert!(
2389                window.messages.is_empty(),
2390                "no recall message must be injected when memory is None"
2391            );
2392        }
2393    }
2394}