Skip to main content

zeph_context/
fidelity.rs

1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4//! Heuristic fidelity scorer for Context-Adaptive Memory (CAM).
5//!
6//! [`FidelityScorer`] is a stateless scoring engine that assigns a three-level
7//! representation ([`ContextFidelity::Full`] / [`ContextFidelity::Compressed`] /
8//! [`ContextFidelity::Placeholder`]) to each message in the context window. Scoring is
9//! driven by weighted signals: temporal recency, role importance, keyword-based semantic
10//! relevance, and optional plan hints.
11//!
12//! [`FidelityConfig`] holds all tuning knobs; it is read from `[memory.fidelity]` in
13//! `config.toml`. When `enabled = false` (the default), the scorer returns immediately
14//! without modifying the message window.
15//!
16//! ## Embed pre-pass
17//!
18//! The private `embed_prepass_dyn` helper runs concurrently via
19//! `futures::stream::buffer_unordered` and populates per-message embedding vectors before
20//! scoring. The public [`embed_prepass`] is a thin wrapper over the same helper for
21//! closure-based callers (e.g. unit tests). The concurrency bound is controlled by
22//! [`FidelityConfig::embed_concurrency`] (default 32).
23
24use std::time::Duration;
25
26use futures::StreamExt as _;
27use futures::future::BoxFuture;
28use tracing::Instrument as _;
29use tracing::info_span;
30use zeph_common::memory::TokenCounting;
31use zeph_common::{ContextFidelity, PlannedToolHint};
32use zeph_llm::LlmError;
33use zeph_llm::LlmProviderDyn;
34use zeph_llm::provider::{EmbedFuture, Message, MessageMetadata, MessagePart, Role};
35
36use crate::assembler::CORRECTIONS_PREFIX;
37
38// Re-export FidelityConfig from zeph-config so both crates share one definition.
39pub use zeph_config::FidelityConfig;
40
/// Object-safe embed callback used by [`embed_prepass_dyn`].
///
/// Exists only to unify the `'static`-future closure path (used by [`embed_prepass`])
/// with the `&dyn LlmProviderDyn` path (used by [`FidelityScorer::score_and_apply`])
/// without an HRTB trait-object bound.
///
/// The `Send + Sync` supertraits let a `&dyn EmbedCall` be shared by the futures that
/// the pre-pass drives concurrently.
trait EmbedCall: Send + Sync {
    /// Start one embedding request for `text`.
    ///
    /// The returned future may borrow both `self` and `text` for `'a`. It resolves to
    /// the embedding vector on success or to the provider's [`LlmError`] on failure.
    fn call<'a>(&'a self, text: &'a str) -> BoxFuture<'a, Result<Vec<f32>, LlmError>>;
}
49
50/// Adapts a `Fn(&str) -> EmbedFuture` (where `EmbedFuture` is `'static`) to [`EmbedCall`].
51struct ClosureEmbed<F>(F);
52
53impl<F> EmbedCall for ClosureEmbed<F>
54where
55    F: Fn(&str) -> EmbedFuture + Send + Sync,
56{
57    fn call<'a>(&'a self, text: &'a str) -> BoxFuture<'a, Result<Vec<f32>, LlmError>> {
58        // EmbedFuture is Pin<Box<dyn Future + Send>> — implicitly 'static, so
59        // coercing to BoxFuture<'a, _> is always sound.
60        Box::pin((self.0)(text))
61    }
62}
63
64/// Adapts `&dyn LlmProviderDyn` to [`EmbedCall`].
65struct ProviderEmbed<'p>(&'p dyn LlmProviderDyn);
66
67impl EmbedCall for ProviderEmbed<'_> {
68    fn call<'a>(&'a self, text: &'a str) -> BoxFuture<'a, Result<Vec<f32>, LlmError>> {
69        self.0.embed(text)
70    }
71}
72
73/// Shared concurrent embed pipeline used by both [`embed_prepass`] and
74/// [`FidelityScorer::score_and_apply`].
75///
76/// `messages` is the slice to embed (callers pre-slice to `[..score_end]` when needed).
77/// Indices in the returned map are relative to the start of the passed slice.
78async fn embed_prepass_dyn(
79    messages: &[Message],
80    embed: &dyn EmbedCall,
81    config: &FidelityConfig,
82    inserted_count: usize,
83) -> std::collections::HashMap<usize, Vec<f32>> {
84    let concurrency = if config.embed_concurrency == 0 {
85        tracing::warn!(
86            "embed_concurrency is 0, clamping to 1; set a positive value in [memory.fidelity]"
87        );
88        1
89    } else {
90        config.embed_concurrency
91    };
92
93    let tasks = messages.iter().enumerate().filter_map(|(i, msg)| {
94        if is_exempt(msg, i, inserted_count)
95            || msg.content.is_empty()
96            || msg.metadata.embedding.is_some()
97        {
98            return None;
99        }
100        let content = match config.max_embed_input_tokens {
101            Some(n) => truncate_to_byte_limit(&msg.content, n.saturating_mul(4)),
102            None => msg.content.clone(),
103        };
104        Some((i, content))
105    });
106
107    let timeout = Duration::from_secs(config.embed_timeout_secs);
108    futures::stream::iter(tasks)
109        .map(|(i, content)| async move {
110            let result = tokio::time::timeout(timeout, embed.call(&content)).await;
111            match result {
112                Ok(Ok(vec)) => Some((i, vec)),
113                Ok(Err(e)) => {
114                    tracing::warn!(idx = i, err = %e, "embed_prepass: embed failed, skipping");
115                    None
116                }
117                Err(_) => {
118                    tracing::warn!(idx = i, "embed_prepass: embed timed out, skipping");
119                    None
120                }
121            }
122        })
123        .buffer_unordered(concurrency)
124        .filter_map(|opt| async move { opt })
125        .collect()
126        .await
127}
128
129/// Run embed calls for `messages` concurrently, returning per-index embedding vectors.
130///
131/// Indexes not included in the result either had no content, already had an embedding
132/// cached in `msg.metadata.embedding`, or produced an error (errors are logged at `warn`
133/// level and silently skipped — scoring falls back to keyword overlap for those messages).
134/// Each embed call is bounded by [`FidelityConfig::embed_timeout_secs`]; timed-out messages
135/// are skipped with a `warn`-level log.
136///
137/// Only exempt messages (focus-pinned, inserted memory, system) are skipped; for non-exempt messages the content
138/// is optionally truncated to `config.max_embed_input_tokens * 4` bytes (at a char boundary)
139/// before the call.
140///
141/// Concurrency is bounded by [`FidelityConfig::embed_concurrency`] (default 32), which
142/// maps to the `buffer_unordered(N)` parameter. A zero value is clamped to 1 with a warning.
143///
144/// # Examples
145///
146/// ```no_run
147/// use zeph_context::fidelity::{embed_prepass, FidelityConfig};
148///
149/// async fn example() {
150///     let messages = vec![];
151///     let cfg = FidelityConfig::default();
152///     let embed = |_text: &str| -> zeph_llm::provider::EmbedFuture {
153///         Box::pin(async { Ok(vec![0.0f32; 384]) })
154///     };
155///     let _embeddings = embed_prepass(&messages, &embed, &cfg, 0).await;
156/// }
157/// ```
158#[tracing::instrument(name = "context.fidelity.embed_prepass", skip_all)]
159pub async fn embed_prepass<F>(
160    messages: &[Message],
161    embed: &F,
162    config: &FidelityConfig,
163    inserted_count: usize,
164) -> std::collections::HashMap<usize, Vec<f32>>
165where
166    F: Fn(&str) -> EmbedFuture + Send + Sync,
167{
168    embed_prepass_dyn(messages, &ClosureEmbed(embed), config, inserted_count).await
169}
170
/// Return a copy of `s` cut down to at most `max_bytes` bytes.
///
/// The limit counts bytes, not characters. When the cut would land inside a multibyte
/// character the boundary is moved back to the start of that character, so the result
/// is always valid UTF-8 and never longer than `max_bytes`.
///
/// Callers working from a token estimate should pass `n * 4` (the 4-byte-per-token
/// heuristic), computed with [`saturating_mul`](usize::saturating_mul).
///
/// The input is left untouched; a new `String` is always allocated.
fn truncate_to_byte_limit(s: &str, max_bytes: usize) -> String {
    let end = if s.len() <= max_bytes {
        s.len()
    } else {
        // Walk back from the byte limit to the nearest char boundary. Index 0 is always
        // a boundary, so the search cannot come up empty.
        (0..=max_bytes)
            .rev()
            .find(|&idx| s.is_char_boundary(idx))
            .unwrap_or(0)
    };
    s[..end].to_owned()
}
186
/// Scoring outcome for one message, produced by `compute_scores` and consumed by
/// `apply_scores`.
struct FidelityScore {
    /// Normalized weighted score; `compute_scores` clamps it to `[0.0, 1.0]`.
    score: f32,
    /// Fidelity level to render the message at, after the persisted-tag floor and
    /// tool-pair adjustment have been applied.
    level: ContextFidelity,
    /// Token count of the message content, measured before any rendering.
    original_tokens: u32,
}
192
/// Stateless heuristic scorer that assigns and applies fidelity levels to a message window.
///
/// Call [`FidelityScorer::score_and_apply`] after `apply_prepared_context()` returns to
/// enforce the three-level representation (Full / Compressed / Placeholder) on historical
/// messages. The scorer never touches exempt messages (INV-07 through INV-10).
///
/// The type is a zero-sized unit struct, so it is `Copy` and can be created either as
/// `FidelityScorer` or via `FidelityScorer::default()`.
///
/// # Examples
///
/// ```no_run
/// use zeph_context::fidelity::{FidelityConfig, FidelityScorer};
///
/// # async fn run() {
/// let scorer = FidelityScorer;
/// let cfg = FidelityConfig { enabled: false, ..FidelityConfig::default() };
/// // With `enabled = false` the scorer is a no-op.
/// let mut messages = vec![];
/// scorer.score_and_apply(&mut messages, "query", &[], &cfg, &MockTc, 0, false, None, None).await;
///
/// struct MockTc;
/// impl zeph_common::memory::TokenCounting for MockTc {
///     fn count_tokens(&self, text: &str) -> usize { text.len() / 4 }
///     fn count_tool_schema_tokens(&self, _: &serde_json::Value) -> usize { 0 }
/// }
/// # }
/// ```
#[derive(Debug, Clone, Copy, Default)]
pub struct FidelityScorer;
219
220impl FidelityScorer {
221    /// Score all non-exempt messages and apply fidelity rendering in-place.
222    ///
223    /// Steps (per spec §5 data flow):
224    /// 1. Guard: return early when `enabled == false`.
225    /// 2. Build exempt set (INV-07 through INV-10).
226    /// 3. Score each non-exempt message with normalized weight sum (INV-05).
227    /// 4. Apply floor invariant: a message with a persisted fidelity tag cannot
228    ///    be upgraded to a less restrictive level unless `allow_upgrade = true`.
229    /// 5. Apply tool-pair atomicity — both get `min(score_a, score_b)` (INV-03).
230    /// 6. Render `Compressed` / `Placeholder` messages (INV-12).
231    /// 7. Merge consecutive same-role `Placeholder` messages (INV-04).
232    ///
233    /// # Parameters
234    ///
235    /// - `messages` — mutable message window (includes system prompt at index 0).
236    /// - `query` — current user query; drives semantic signal.
237    /// - `planned_tools` — DAG lookahead hints; empty slice disables plan signal.
238    /// - `config` — scoring thresholds and weights.
239    /// - `tc` — token counter used for `Placeholder`/`Compressed` rendering.
240    /// - `inserted_count` — number of memory messages freshly injected at indices
241    ///   `1..1+inserted_count`; these are always exempt (INV-10).
242    /// - `allow_upgrade` — when `true` the persisted floor invariant is bypassed.
243    ///   Pass `true` only from the proactive regrade path; pass `false` everywhere else.
244    /// - `embed_provider` — optional LLM provider used for query and per-message embeddings
245    ///   when `config.semantic_scoring_provider` is set. Pass `None` to fall back to keyword
246    ///   overlap.
247    /// - `compress_provider` — optional LLM provider used for `Compressed` rendering when
248    ///   `config.compress_provider` is set. Pass `None` to fall back to truncation.
249    ///
250    /// The two providers may be the same instance or different ones (e.g. a fast embedding
251    /// model for `embed_provider` and a generative model for `compress_provider`).
252    #[tracing::instrument(name = "context.fidelity.score_and_apply", skip_all)]
253    #[allow(clippy::too_many_arguments, clippy::too_many_lines)]
254    pub async fn score_and_apply(
255        &self,
256        messages: &mut Vec<Message>,
257        query: &str,
258        planned_tools: &[PlannedToolHint],
259        config: &FidelityConfig,
260        tc: &dyn TokenCounting,
261        inserted_count: usize,
262        allow_upgrade: bool,
263        embed_provider: Option<&dyn LlmProviderDyn>,
264        compress_provider: Option<&dyn LlmProviderDyn>,
265    ) {
266        if !config.enabled || messages.is_empty() {
267            return;
268        }
269
270        // Embed query once when semantic provider is configured.
271        let query_embedding: Option<Vec<f32>> = if let (true, Some(p)) =
272            (config.semantic_scoring_provider.is_some(), embed_provider)
273            && p.supports_embeddings()
274        {
275            match tokio::time::timeout(
276                Duration::from_secs(config.embed_timeout_secs),
277                p.embed(query),
278            )
279            .instrument(info_span!("context.fidelity.embed_query"))
280            .await
281            {
282                Ok(Ok(v)) => Some(v),
283                Ok(Err(e)) => {
284                    tracing::warn!(error = %e, "semantic scoring provider unavailable, falling back to keyword");
285                    None
286                }
287                Err(_) => {
288                    tracing::warn!("fidelity query embed timed out, falling back to keyword");
289                    None
290                }
291            }
292        } else {
293            None
294        };
295
296        // Concurrent embed pre-pass: populate missing embeddings for the scored window.
297        // Uses buffer_unordered so all N embed calls run in parallel (bounded by
298        // embed_concurrency), replacing the former sequential O(N) loop.
299        if let (Some(q_emb), Some(p)) = (&query_embedding, embed_provider) {
300            let n = messages.len();
301            let score_end = if n > config.max_scored_messages {
302                n.saturating_sub(config.exempt_tail_messages)
303            } else {
304                n
305            };
306            let embeddings = embed_prepass_dyn(
307                &messages[..score_end],
308                &ProviderEmbed(p),
309                config,
310                inserted_count,
311            )
312            .instrument(info_span!("context.fidelity.embed_prepass"))
313            .await;
314            for (i, emb) in embeddings {
315                messages[i].metadata.embedding = Some(emb);
316            }
317            let _ = q_emb; // used below in compute_scores
318        }
319
320        let scores = compute_scores(
321            messages,
322            query,
323            planned_tools,
324            config,
325            tc,
326            inserted_count,
327            allow_upgrade,
328            query_embedding.as_deref(),
329        );
330        apply_scores(messages, &scores, config, tc, compress_provider).await;
331
332        let _merge_span = info_span!("context.fidelity.merge").entered();
333        let merged_count = merge_consecutive_placeholders(messages);
334        tracing::debug!(merged_count, "fidelity merge complete");
335    }
336}
337
338#[allow(clippy::too_many_arguments)]
339fn compute_scores(
340    messages: &[Message],
341    query: &str,
342    planned_tools: &[PlannedToolHint],
343    config: &FidelityConfig,
344    tc: &dyn TokenCounting,
345    inserted_count: usize,
346    allow_upgrade: bool,
347    query_embedding: Option<&[f32]>,
348) -> Vec<Option<FidelityScore>> {
349    let n = messages.len();
350
351    // Performance cap: only score oldest messages; newest `exempt_tail_messages` default to Full.
352    let score_end = if n > config.max_scored_messages {
353        n.saturating_sub(config.exempt_tail_messages)
354    } else {
355        n
356    };
357
358    let semantic_active = query.len() >= config.min_query_length;
359    let plan_active = !planned_tools.is_empty();
360    // Build once outside the per-message loop (SF-1: avoids 500 redundant allocations).
361    let query_words: std::collections::HashSet<&str> = if semantic_active {
362        query.split_whitespace().collect()
363    } else {
364        std::collections::HashSet::default()
365    };
366
367    // Compute the active weight sum (INV-05).
368    let mut weight_sum = config.w_temporal + config.w_importance;
369    if semantic_active {
370        weight_sum += config.w_semantic;
371    }
372    if plan_active {
373        weight_sum += config.w_plan;
374    }
375    if weight_sum <= 0.0 {
376        weight_sum = 1.0;
377    }
378
379    #[allow(clippy::cast_precision_loss)]
380    let max_dist = score_end.saturating_sub(1) as f32;
381
382    let mut scores: Vec<Option<FidelityScore>> = (0..n).map(|_| None).collect();
383
384    for (i, msg) in messages.iter().enumerate().take(score_end) {
385        if is_exempt(msg, i, inserted_count) {
386            continue;
387        }
388
389        #[allow(clippy::cast_possible_truncation)]
390        let original_tokens = tc.count_tokens(&msg.content) as u32;
391
392        // distance_from_end = 0 for newest (i = score_end-1), N-1 for oldest (i = 0).
393        // Spec §6.1: temporal = 1.0 - distance_from_end / max_dist → newest = 1.0, oldest ≈ 0.0.
394        #[allow(clippy::cast_precision_loss)]
395        let temporal = if max_dist > 0.0 {
396            let distance_from_end = (score_end - 1 - i) as f32;
397            1.0 - distance_from_end / max_dist
398        } else {
399            1.0
400        };
401        // Spec §6.2: ToolResult messages use weight 0.4 regardless of Role::User mapping.
402        let importance = if msg
403            .parts
404            .iter()
405            .any(|p| matches!(p, MessagePart::ToolResult { .. }))
406        {
407            0.4
408        } else {
409            role_weight(msg.role)
410        };
411        let semantic = if semantic_active {
412            match (query_embedding, msg.metadata.embedding.as_deref()) {
413                (Some(q_emb), Some(m_emb)) => semantic_overlap(m_emb, q_emb),
414                _ => keyword_overlap(&msg.content, &query_words),
415            }
416        } else {
417            0.0
418        };
419        let plan = if plan_active {
420            plan_relevance(&msg.content, planned_tools)
421        } else {
422            0.0
423        };
424
425        let raw = config.w_temporal * temporal
426            + config.w_importance * importance
427            + if semantic_active {
428                config.w_semantic * semantic
429            } else {
430                0.0
431            }
432            + if plan_active {
433                config.w_plan * plan
434            } else {
435                0.0
436            };
437
438        let score = (raw / weight_sum).clamp(0.0, 1.0);
439        let candidate_level = score_to_level(score, config);
440
441        // Floor invariant (CAM Phase 2-B): a persisted fidelity tag constrains upgrades.
442        // A message previously scored as Compressed cannot be upgraded back to Full;
443        // Placeholder cannot be upgraded to Full or Compressed.
444        // The regrade path passes allow_upgrade=true to bypass this constraint.
445        let level = if allow_upgrade {
446            candidate_level
447        } else {
448            match msg.metadata.fidelity_tag {
449                Some(ContextFidelity::Placeholder) => ContextFidelity::Placeholder,
450                Some(ContextFidelity::Compressed) => {
451                    if candidate_level == ContextFidelity::Full {
452                        ContextFidelity::Compressed
453                    } else {
454                        candidate_level
455                    }
456                }
457                _ => candidate_level,
458            }
459        };
460
461        scores[i] = Some(FidelityScore {
462            score,
463            level,
464            original_tokens,
465        });
466    }
467
468    apply_tool_pair_atomicity(messages, &mut scores, config);
469    scores
470}
471
472#[tracing::instrument(name = "context.fidelity.apply", skip_all)]
473async fn apply_scores(
474    messages: &mut [Message],
475    scores: &[Option<FidelityScore>],
476    config: &FidelityConfig,
477    tc: &dyn TokenCounting,
478    provider: Option<&dyn LlmProviderDyn>,
479) {
480    let (mut full_count, mut compressed_count, mut placeholder_count, mut tokens_saved) =
481        (0u32, 0u32, 0u32, 0u32);
482
483    for (i, msg) in messages.iter_mut().enumerate() {
484        let Some(ref fs) = scores[i] else { continue };
485        match fs.level {
486            ContextFidelity::Compressed => {
487                #[allow(clippy::cast_possible_truncation)]
488                let original_tokens = fs.original_tokens;
489                render_compressed(msg, config, tc, provider).await;
490                #[allow(clippy::cast_possible_truncation)]
491                let new_tokens = tc.count_tokens(&msg.content) as u32;
492                tokens_saved += original_tokens.saturating_sub(new_tokens);
493                compressed_count += 1;
494            }
495            ContextFidelity::Placeholder => {
496                render_placeholder(msg, fs.score, fs.original_tokens);
497                placeholder_count += 1;
498            }
499            // Full and any future variants keep original content.
500            _ => {
501                msg.metadata.fidelity_tag = Some(ContextFidelity::Full);
502                full_count += 1;
503            }
504        }
505    }
506
507    tracing::debug!(
508        full_count,
509        compressed_count,
510        placeholder_count,
511        tokens_saved,
512        "fidelity apply complete"
513    );
514}
515
516fn is_exempt(msg: &Message, idx: usize, inserted_count: usize) -> bool {
517    // INV-07: system prompt at index 0.
518    // INV-08: focus_pinned messages.
519    // INV-09: correction messages.
520    // INV-10: freshly injected memory context at indices 1..1+inserted_count.
521    (idx == 0 && msg.role == Role::System)
522        || msg.metadata.focus_pinned
523        || msg.content.starts_with(CORRECTIONS_PREFIX)
524        || (idx >= 1 && idx < 1 + inserted_count)
525}
526
527fn role_weight(role: Role) -> f32 {
528    match role {
529        Role::System => 1.0,
530        Role::User => 0.8,
531        Role::Assistant | _ => 0.6,
532    }
533}
534
/// Cosine similarity of `a` and `b`, clamped to `[0.0, 1.0]`.
///
/// Returns `0.0` — "no signal" — instead of a meaningless or poisonous value when:
/// - the slices differ in length or are empty;
/// - either vector has zero magnitude;
/// - the computation is not finite (a NaN/infinite component, or overflow while
///   squaring very large components).
///
/// Negative similarities are clamped to `0.0`.
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() || a.is_empty() {
        return 0.0;
    }

    // Single pass accumulating the dot product and both squared norms.
    let (dot, sq_a, sq_b) = a.iter().zip(b).fold(
        (0.0f32, 0.0f32, 0.0f32),
        |(dot, sq_a, sq_b), (&x, &y)| (dot + x * y, sq_a + x * x, sq_b + y * y),
    );

    let denom = sq_a.sqrt() * sq_b.sqrt();
    if denom == 0.0 {
        return 0.0;
    }

    // `f32::clamp` propagates NaN, which would leak into the final score and make every
    // threshold comparison false. Treat any non-finite result as "no similarity".
    let cosine = dot / denom;
    if cosine.is_finite() {
        cosine.clamp(0.0, 1.0)
    } else {
        0.0
    }
}
547
/// Semantic relevance of a message to the query, given both embeddings.
///
/// Thin wrapper that forwards to `cosine_similarity`, passing the message embedding
/// first and the query embedding second.
fn semantic_overlap(msg_embedding: &[f32], query_embedding: &[f32]) -> f32 {
    cosine_similarity(msg_embedding, query_embedding)
}
551
/// Word-level overlap between `content` and the query, in `[0, 1]`.
///
/// Both sides are treated as sets of whitespace-separated words (exact, case-sensitive
/// matches). The result is the number of shared words divided by the size of the smaller
/// set, or `0.0` when either set is empty.
///
/// `query_words` is built once by the caller, outside the per-message loop (SF-1).
fn keyword_overlap(content: &str, query_words: &std::collections::HashSet<&str>) -> f32 {
    let content_words: std::collections::HashSet<&str> = content.split_whitespace().collect();

    let denom = content_words.len().min(query_words.len());
    if denom == 0 {
        return 0.0;
    }

    let shared = content_words
        .iter()
        .filter(|word| query_words.contains(**word))
        .count();

    #[allow(clippy::cast_precision_loss)]
    let ratio = shared as f32 / denom as f32;
    ratio.clamp(0.0, 1.0)
}
565
566/// Keyword overlap between message content and planned tool keywords.
567///
568/// Weighted by `1.0 / distance_from_current` and averaged across all hints.
569fn plan_relevance(content: &str, planned_tools: &[PlannedToolHint]) -> f32 {
570    if planned_tools.is_empty() {
571        return 0.0;
572    }
573    let content_words: std::collections::HashSet<&str> = content.split_whitespace().collect();
574    let mut weighted_sum = 0.0f32;
575    let mut weight_total = 0.0f32;
576    for hint in planned_tools {
577        let dist = f32::from(hint.distance_from_current.max(1));
578        let weight = 1.0 / dist;
579        weight_total += weight;
580        let hint_words: std::collections::HashSet<&str> =
581            hint.keywords.iter().map(String::as_str).collect();
582        let min_len = content_words.len().min(hint_words.len());
583        if min_len == 0 {
584            continue;
585        }
586        #[allow(clippy::cast_precision_loss)]
587        let overlap = content_words.intersection(&hint_words).count() as f32 / min_len as f32;
588        weighted_sum += weight * overlap.clamp(0.0, 1.0);
589    }
590    if weight_total <= 0.0 {
591        return 0.0;
592    }
593    (weighted_sum / weight_total).clamp(0.0, 1.0)
594}
595
596/// O(N) backward scan: find `ToolUse`/`ToolResult` pairs and assign `min(score_a, score_b)`.
597///
598/// The "min" is computed over both the float score and the already-floored level, so
599/// the floor invariant set in `compute_scores` is respected (INV-03 + floor invariant).
600fn apply_tool_pair_atomicity(
601    messages: &[Message],
602    scores: &mut [Option<FidelityScore>],
603    config: &FidelityConfig,
604) {
605    // Collect (tool_use_id, message_index) for ToolResult messages.
606    let mut tool_result_map: std::collections::HashMap<&str, usize> =
607        std::collections::HashMap::new();
608    for (i, msg) in messages.iter().enumerate() {
609        for part in &msg.parts {
610            if let MessagePart::ToolResult { tool_use_id, .. } = part {
611                tool_result_map.insert(tool_use_id.as_str(), i);
612            }
613        }
614    }
615
616    // Walk backward to find ToolUse messages and pair with their result.
617    for (i, msg) in messages.iter().enumerate().rev() {
618        for part in &msg.parts {
619            if let MessagePart::ToolUse { id, .. } = part
620                && let Some(&result_idx) = tool_result_map.get(id.as_str())
621            {
622                let score_a = scores[i].as_ref().map_or(1.0, |s| s.score);
623                let score_b = scores[result_idx].as_ref().map_or(1.0, |s| s.score);
624                let min_score = score_a.min(score_b);
625
626                // The level is the more restrictive of the two already-floored levels.
627                // Taking min(float) alone would bypass the floor invariant for the pair
628                // because score_to_level(min_score) ignores persisted fidelity tags.
629                let level_a = scores[i]
630                    .as_ref()
631                    .map_or(ContextFidelity::Full, |s| s.level);
632                let level_b = scores[result_idx]
633                    .as_ref()
634                    .map_or(ContextFidelity::Full, |s| s.level);
635                let float_level = score_to_level(min_score, config);
636                let min_level = more_restrictive(more_restrictive(level_a, level_b), float_level);
637
638                let tokens_a = scores[i].as_ref().map_or(0, |s| s.original_tokens);
639                let tokens_b = scores[result_idx].as_ref().map_or(0, |s| s.original_tokens);
640                scores[i] = Some(FidelityScore {
641                    score: min_score,
642                    level: min_level,
643                    original_tokens: tokens_a,
644                });
645                scores[result_idx] = Some(FidelityScore {
646                    score: min_score,
647                    level: min_level,
648                    original_tokens: tokens_b,
649                });
650            }
651        }
652    }
653}
654
655/// Return the more restrictive of two fidelity levels.
656///
657/// Restrictiveness order: Placeholder > Compressed > Full.
658fn more_restrictive(a: ContextFidelity, b: ContextFidelity) -> ContextFidelity {
659    use ContextFidelity::{Compressed, Full, Placeholder};
660    match (a, b) {
661        (Placeholder, _) | (_, Placeholder) => Placeholder,
662        (Compressed, _) | (_, Compressed) => Compressed,
663        _ => Full,
664    }
665}
666
667fn score_to_level(score: f32, config: &FidelityConfig) -> ContextFidelity {
668    if score >= config.full_threshold {
669        ContextFidelity::Full
670    } else if score >= config.compressed_threshold {
671        ContextFidelity::Compressed
672    } else {
673        ContextFidelity::Placeholder
674    }
675}
676
677async fn render_compressed(
678    msg: &mut Message,
679    config: &FidelityConfig,
680    tc: &dyn TokenCounting,
681    provider: Option<&dyn LlmProviderDyn>,
682) {
683    // Priority 1: use pre-computed deferred summary when available.
684    if let Some(summary) = msg.metadata.deferred_summary.take() {
685        msg.content = summary;
686    } else if config.compress_provider.is_some()
687        && let Some(p) = provider
688    {
689        // Priority 2: LLM-assisted compression when provider and config name are set.
690        let input_tokens = tc.count_tokens(&msg.content);
691        // Guard: skip LLM if input is already small or within 2x of the max budget.
692        if input_tokens > config.compressed_max_tokens * 2 && input_tokens > 0 {
693            // Apply input cap before sending to LLM.
694            if let Some(max_in) = config.max_compress_input_tokens {
695                apply_input_cap(&mut msg.content, max_in);
696            }
697
698            let prompt = format!(
699                "Summarize in {} tokens or fewer: {}",
700                config.compressed_max_tokens, msg.content
701            );
702            let req = vec![Message {
703                role: Role::User,
704                content: prompt,
705                parts: vec![],
706                metadata: MessageMetadata::default(),
707            }];
708
709            let span = info_span!(
710                "context.fidelity.compress_llm",
711                input_tokens,
712                cached = false,
713            );
714            let result = tokio::time::timeout(
715                Duration::from_secs(config.compress_timeout_secs),
716                p.chat(&req),
717            )
718            .instrument(span)
719            .await;
720
721            match result {
722                Ok(Ok(summary)) => {
723                    msg.metadata.deferred_summary = Some(summary.clone());
724                    msg.content = summary;
725                }
726                Ok(Err(e)) => {
727                    tracing::debug!(error = %e, "compress_llm failed, falling back to truncation");
728                }
729                Err(_) => {
730                    tracing::warn!("compress_llm timed out, falling back to truncation");
731                }
732            }
733        }
734    } else if let Some(max_in) = config.max_compress_input_tokens {
735        // Cap input before truncation so pathologically large messages don't blow the budget.
736        apply_input_cap(&mut msg.content, max_in);
737    }
738
739    // Always cap output to compressed_max_tokens — covers both the deferred-summary path
740    // (LLM output may exceed the limit) and the truncation-only path.
741    truncate_to_tokens(&mut msg.content, config.compressed_max_tokens, tc);
742    msg.parts.clear();
743    msg.metadata.fidelity_tag = Some(ContextFidelity::Compressed);
744}
745
/// Shorten `content` in place so it fits a budget of roughly `max_tokens` tokens.
///
/// The budget is approximated as `max_tokens * 4` bytes, computed with
/// [`saturating_mul`](usize::saturating_mul) so extreme token counts cannot overflow.
/// Content already within the budget is left unchanged. Otherwise the cut is moved back
/// to the nearest UTF-8 char boundary so a multibyte character is never split.
///
/// Pass `config.max_compress_input_tokens` as `max_tokens` before an LLM compress call,
/// or `config.max_embed_input_tokens` before an embed call.
pub fn apply_input_cap(content: &mut String, max_tokens: usize) {
    let byte_budget = max_tokens.saturating_mul(4);
    if content.len() <= byte_budget {
        return;
    }

    // `byte_budget` is inside the string here; step back until it sits on a boundary.
    // Index 0 is always a boundary, so the loop terminates.
    let mut cut = byte_budget;
    while !content.is_char_boundary(cut) {
        cut -= 1;
    }
    content.truncate(cut);
}
760
761fn truncate_to_tokens(content: &mut String, max_tokens: usize, tc: &dyn TokenCounting) {
762    if tc.count_tokens(content) <= max_tokens {
763        return;
764    }
765    // Binary search for the largest valid prefix that fits within max_tokens.
766    // lo: largest known-safe boundary (count <= max_tokens).
767    // hi: upper bound; count > max_tokens on the normal branch; on the stall branch
768    //     hi collapses to lo and the loop exits immediately.
769    let mut lo: usize = 0;
770    let mut hi: usize = content.len();
771    while hi - lo > 1 {
772        let mid = content.floor_char_boundary(usize::midpoint(lo, hi));
773        if mid == lo {
774            // floor_char_boundary landed back on lo (multibyte char spans the midpoint).
775            // Collapsing hi to lo satisfies hi-lo <= 1 and exits the loop.
776            hi = mid;
777        } else if tc.count_tokens(&content[..mid]) <= max_tokens {
778            lo = mid;
779        } else {
780            hi = mid;
781        }
782    }
783    content.truncate(lo);
784}
785
786fn render_placeholder(msg: &mut Message, score: f32, original_tokens: u32) {
787    let role_str = match msg.role {
788        Role::System => "system",
789        Role::Assistant => "assistant",
790        Role::User | _ => "user",
791    };
792    msg.content = format!(
793        "[placeholder: role={role_str}, original_tokens={original_tokens}, importance={score:.2}]"
794    );
795    msg.parts.clear();
796    msg.metadata.fidelity_tag = Some(ContextFidelity::Placeholder);
797}
798
799/// Merge consecutive same-role `Placeholder` messages into a single merged placeholder.
800///
801/// Returns the number of individual messages consumed by merges.
802fn merge_consecutive_placeholders(messages: &mut Vec<Message>) -> usize {
803    let mut merged_count = 0usize;
804    let mut i = 0;
805    while i < messages.len() {
806        if messages[i].metadata.fidelity_tag != Some(ContextFidelity::Placeholder)
807            || messages[i].role == Role::System
808        {
809            i += 1;
810            continue;
811        }
812        let role = messages[i].role;
813        let mut j = i + 1;
814        while j < messages.len()
815            && messages[j].metadata.fidelity_tag == Some(ContextFidelity::Placeholder)
816            && messages[j].role == role
817        {
818            j += 1;
819        }
820        if j - i <= 1 {
821            i += 1;
822            continue;
823        }
824        let count = j - i;
825        let mut total_tokens = 0u32;
826        let mut importance_sum = 0.0f32;
827        for msg in &messages[i..j] {
828            total_tokens += parse_placeholder_tokens(&msg.content);
829            importance_sum += parse_placeholder_importance(&msg.content);
830        }
831        debug_assert!(count >= 2, "placeholder merge triggered with count={count}");
832        #[allow(clippy::cast_precision_loss)]
833        let avg_importance = if count > 0 {
834            importance_sum / count as f32
835        } else {
836            0.0
837        };
838        let role_str = match role {
839            Role::System => "system",
840            Role::Assistant => "assistant",
841            Role::User | _ => "user",
842        };
843        let merged_content = format!(
844            "[placeholder: {count} messages, role={role_str}, total_tokens={total_tokens}, avg_importance={avg_importance:.2}]"
845        );
846        let first = messages[i].clone();
847        messages.drain(i..j);
848        messages.insert(
849            i,
850            Message {
851                role: first.role,
852                content: merged_content,
853                parts: vec![],
854                metadata: {
855                    let mut m = first.metadata;
856                    m.fidelity_tag = Some(ContextFidelity::Placeholder);
857                    m
858                },
859            },
860        );
861        merged_count += count - 1;
862        i += 1;
863    }
864    merged_count
865}
866
/// Extract the token count recorded in a placeholder string.
///
/// Splits `content` on commas and returns the value of the first trimmed field that
/// starts with `original_tokens=` or `total_tokens=` and whose remainder (after dropping
/// trailing `]` characters and surrounding whitespace) parses as a `u32`. Returns `0`
/// when no such field exists.
fn parse_placeholder_tokens(content: &str) -> u32 {
    const KEYS: [&str; 2] = ["original_tokens=", "total_tokens="];
    content
        .split(',')
        .map(str::trim)
        .find_map(|field| {
            // A field that matches a key but fails to parse falls through to the next
            // key, and then to the next field.
            KEYS.iter().find_map(|key| {
                field
                    .strip_prefix(*key)?
                    .trim_end_matches(']')
                    .trim()
                    .parse::<u32>()
                    .ok()
            })
        })
        .unwrap_or(0)
}
880
/// Extract the importance score recorded in a placeholder string.
///
/// Splits `content` on commas and returns the value of the first trimmed field that
/// starts with `importance=` or `avg_importance=` and whose remainder (after dropping
/// trailing `]` characters and surrounding whitespace) parses as an `f32`. Returns `0.0`
/// when no such field exists.
fn parse_placeholder_importance(content: &str) -> f32 {
    const KEYS: [&str; 2] = ["importance=", "avg_importance="];
    content
        .split(',')
        .map(str::trim)
        .find_map(|field| {
            // A field that matches a key but fails to parse falls through to the next
            // key, and then to the next field.
            KEYS.iter().find_map(|key| {
                field
                    .strip_prefix(*key)?
                    .trim_end_matches(']')
                    .trim()
                    .parse::<f32>()
                    .ok()
            })
        })
        .unwrap_or(0.0)
}
894
895#[cfg(test)]
896mod tests {
897    use super::*;
898    use zeph_common::ProviderName;
899    use zeph_llm::provider::{Message, MessageMetadata, MessagePart, Role};
900
901    struct FixedTc(usize);
902    impl TokenCounting for FixedTc {
903        fn count_tokens(&self, text: &str) -> usize {
904            text.len() / self.0.max(1)
905        }
906
907        fn count_tool_schema_tokens(&self, _schema: &serde_json::Value) -> usize {
908            0
909        }
910    }
911
912    fn make_msg(role: Role, content: &str) -> Message {
913        Message {
914            role,
915            content: content.to_string(),
916            parts: vec![],
917            metadata: MessageMetadata::default(),
918        }
919    }
920
921    fn make_cfg() -> FidelityConfig {
922        FidelityConfig {
923            enabled: true,
924            w_semantic: 0.3,
925            w_temporal: 0.3,
926            w_importance: 0.2,
927            w_plan: 0.2,
928            full_threshold: 0.7,
929            compressed_threshold: 0.3,
930            compressed_max_tokens: 50,
931            regrade_threshold: 0.6,
932            min_query_length: 8,
933            max_scored_messages: 500,
934            exempt_tail_messages: 0,
935            compress_provider: None,
936            semantic_scoring_provider: None,
937            lookahead_depth: 3,
938            embed_concurrency: 32,
939            max_embed_input_tokens: None,
940            max_compress_input_tokens: None,
941            embed_timeout_secs: 30,
942            compress_timeout_secs: 30,
943        }
944    }
945
946    // 1. Empty window → no change.
947    #[tokio::test]
948    async fn empty_window_no_change() {
949        let scorer = FidelityScorer;
950        let cfg = make_cfg();
951        let tc = FixedTc(4);
952        let mut messages: Vec<Message> = vec![];
953        scorer
954            .score_and_apply(
955                &mut messages,
956                "query text",
957                &[],
958                &cfg,
959                &tc,
960                0,
961                false,
962                None,
963                None,
964            )
965            .await;
966        assert!(messages.is_empty());
967    }
968
969    // 2. All-exempt window → no downgrade.
970    #[tokio::test]
971    async fn all_exempt_no_downgrade() {
972        let scorer = FidelityScorer;
973        let cfg = make_cfg();
974        let tc = FixedTc(4);
975        let mut messages = vec![
976            make_msg(Role::System, "system prompt"),
977            // Injected memory at index 1 with inserted_count=1.
978            make_msg(Role::User, "memory context"),
979        ];
980        scorer
981            .score_and_apply(&mut messages, "short", &[], &cfg, &tc, 1, false, None, None)
982            .await;
983        for msg in &messages {
984            assert!(
985                msg.metadata.fidelity_tag.is_none()
986                    || msg.metadata.fidelity_tag == Some(ContextFidelity::Full)
987            );
988        }
989    }
990
991    // 3. Tool pair atomicity: divergent scores → min applied.
992    #[tokio::test]
993    async fn tool_pair_atomicity() {
994        let scorer = FidelityScorer;
995        // Very high thresholds to force Placeholder for older messages.
996        let cfg = FidelityConfig {
997            full_threshold: 0.9,
998            compressed_threshold: 0.5,
999            ..make_cfg()
1000        };
1001        let tc = FixedTc(4);
1002        let tool_use_id = "abc123".to_string();
1003        let mut tool_use_msg = make_msg(Role::Assistant, "calling tool");
1004        tool_use_msg.parts = vec![MessagePart::ToolUse {
1005            id: tool_use_id.clone(),
1006            name: "shell".to_string(),
1007            input: serde_json::json!({}),
1008        }];
1009        let mut tool_result_msg = make_msg(Role::User, "tool result body");
1010        tool_result_msg.parts = vec![MessagePart::ToolResult {
1011            tool_use_id: tool_use_id.clone(),
1012            content: "result".to_string(),
1013            is_error: false,
1014        }];
1015        let mut messages = vec![
1016            make_msg(Role::System, "system"),
1017            tool_use_msg,
1018            tool_result_msg,
1019        ];
1020        scorer
1021            .score_and_apply(
1022                &mut messages,
1023                "completely unrelated query blah",
1024                &[],
1025                &cfg,
1026                &tc,
1027                0,
1028                false,
1029                None,
1030                None,
1031            )
1032            .await;
1033        let tag_a = messages[1].metadata.fidelity_tag;
1034        let tag_b = messages[2].metadata.fidelity_tag;
1035        assert_eq!(tag_a, tag_b, "tool pair must share fidelity level");
1036    }
1037
1038    // 4. Same-role Placeholder merge: 5 consecutive assistant → merged to 1.
1039    #[tokio::test]
1040    async fn same_role_placeholder_merge() {
1041        let scorer = FidelityScorer;
1042        // Force all non-system messages to become Placeholder.
1043        let cfg = FidelityConfig {
1044            full_threshold: 2.0,       // impossible to reach
1045            compressed_threshold: 1.5, // impossible to reach
1046            ..make_cfg()
1047        };
1048        let tc = FixedTc(4);
1049        let mut messages: Vec<Message> = std::iter::once(make_msg(Role::System, "system"))
1050            .chain((0..5).map(|i| make_msg(Role::Assistant, &format!("msg {i}"))))
1051            .collect();
1052        scorer
1053            .score_and_apply(
1054                &mut messages,
1055                "some query here",
1056                &[],
1057                &cfg,
1058                &tc,
1059                0,
1060                false,
1061                None,
1062                None,
1063            )
1064            .await;
1065        // System + 1 merged placeholder.
1066        assert_eq!(
1067            messages.len(),
1068            2,
1069            "5 assistant placeholders must merge to 1"
1070        );
1071        assert!(messages[1].content.contains("5 messages"));
1072    }
1073
1074    // 5. Score normalization: active signal subset still produces [0,1].
1075    #[tokio::test]
1076    async fn score_normalization_no_panic() {
1077        let scorer = FidelityScorer;
1078        let cfg = make_cfg();
1079        let tc = FixedTc(4);
1080        let mut messages = vec![
1081            make_msg(Role::System, "system"),
1082            make_msg(Role::User, "hello"),
1083            make_msg(Role::Assistant, "world response"),
1084        ];
1085        scorer
1086            .score_and_apply(
1087                &mut messages,
1088                "hello world signal",
1089                &[],
1090                &cfg,
1091                &tc,
1092                0,
1093                false,
1094                None,
1095                None,
1096            )
1097            .await;
1098        for msg in &messages {
1099            let _ = msg.metadata.fidelity_tag;
1100        }
1101    }
1102
1103    // 6. Short query fallback: query.len() < 8 → semantic signal excluded.
1104    #[tokio::test]
1105    async fn short_query_fallback() {
1106        let scorer = FidelityScorer;
1107        let cfg = FidelityConfig {
1108            min_query_length: 8,
1109            ..make_cfg()
1110        };
1111        let tc = FixedTc(4);
1112        let mut messages = vec![
1113            make_msg(Role::System, "system"),
1114            make_msg(Role::User, "test"),
1115        ];
1116        // Must not panic or produce out-of-range scores.
1117        scorer
1118            .score_and_apply(&mut messages, "short", &[], &cfg, &tc, 0, false, None, None)
1119            .await;
1120    }
1121
1122    // 7. AC-09: memory_first bypass is the caller's responsibility.
1123    //    When enabled=false, score_and_apply is always a no-op — callers that activate
1124    //    memory_first simply skip the call (see service.rs guard at INV-11).
1125    //    This test documents the contract: the scorer itself is stateless and harmless
1126    //    when called with disabled config or an all-exempt window.
1127    #[tokio::test]
1128    async fn memory_first_bypass_is_callers_responsibility() {
1129        let scorer = FidelityScorer;
1130        // Simulate: caller would skip this call when memory_first=true.
1131        // The scorer itself must be a complete no-op when enabled=false.
1132        let cfg = FidelityConfig {
1133            enabled: false,
1134            ..make_cfg()
1135        };
1136        let tc = FixedTc(4);
1137        let mut messages = vec![
1138            make_msg(Role::System, "system prompt"),
1139            make_msg(Role::User, "memory-injected context"),
1140            make_msg(Role::Assistant, "response"),
1141        ];
1142        let before: Vec<_> = messages.iter().map(|m| m.content.clone()).collect();
1143        // Even with a real query, disabled scorer must not touch any message.
1144        scorer
1145            .score_and_apply(
1146                &mut messages,
1147                "some user query text here",
1148                &[],
1149                &cfg,
1150                &tc,
1151                2,
1152                false,
1153                None,
1154                None,
1155            )
1156            .await;
1157        for (msg, orig) in messages.iter().zip(&before) {
1158            assert_eq!(msg.content, *orig, "content must be unchanged");
1159            assert!(
1160                msg.metadata.fidelity_tag.is_none(),
1161                "no fidelity tag must be set"
1162            );
1163        }
1164    }
1165
1166    // 9. enabled=false guard: no changes applied.
1167    #[tokio::test]
1168    async fn enabled_false_guard() {
1169        let scorer = FidelityScorer;
1170        let cfg = FidelityConfig {
1171            enabled: false,
1172            ..make_cfg()
1173        };
1174        let tc = FixedTc(4);
1175        let mut messages = vec![
1176            make_msg(Role::System, "system"),
1177            make_msg(Role::User, "user message that would normally be scored"),
1178        ];
1179        let original_contents: Vec<String> = messages.iter().map(|m| m.content.clone()).collect();
1180        scorer
1181            .score_and_apply(
1182                &mut messages,
1183                "query text here",
1184                &[],
1185                &cfg,
1186                &tc,
1187                0,
1188                false,
1189                None,
1190                None,
1191            )
1192            .await;
1193        for (msg, orig) in messages.iter().zip(&original_contents) {
1194            assert_eq!(msg.content, *orig);
1195            assert!(msg.metadata.fidelity_tag.is_none());
1196        }
1197    }
1198
1199    // 10. Score always in [0.0, 1.0] for extreme inputs (zero weights).
1200    #[tokio::test]
1201    async fn score_always_in_range() {
1202        let scorer = FidelityScorer;
1203        let cfg = FidelityConfig {
1204            enabled: true,
1205            w_semantic: 0.0,
1206            w_temporal: 0.0,
1207            w_importance: 0.0,
1208            w_plan: 0.0,
1209            full_threshold: 0.7,
1210            compressed_threshold: 0.3,
1211            compressed_max_tokens: 50,
1212            regrade_threshold: 0.6,
1213            min_query_length: 0,
1214            max_scored_messages: 500,
1215            exempt_tail_messages: 0,
1216            compress_provider: None,
1217            semantic_scoring_provider: None,
1218            lookahead_depth: 3,
1219            embed_concurrency: 32,
1220            max_embed_input_tokens: None,
1221            max_compress_input_tokens: None,
1222            embed_timeout_secs: 30,
1223            compress_timeout_secs: 30,
1224        };
1225        let tc = FixedTc(4);
1226        let mut messages = vec![make_msg(Role::System, ""), make_msg(Role::User, "")];
1227        // Must not panic with zero weights.
1228        scorer
1229            .score_and_apply(&mut messages, "", &[], &cfg, &tc, 0, false, None, None)
1230            .await;
1231    }
1232
1233    // 11. Token count uses tc.count_tokens for Placeholder rendering.
1234    #[tokio::test]
1235    async fn placeholder_uses_tc_count_tokens() {
1236        let scorer = FidelityScorer;
1237        let cfg = FidelityConfig {
1238            full_threshold: 2.0,
1239            compressed_threshold: 1.5,
1240            ..make_cfg()
1241        };
1242        let tc = FixedTc(1); // every character = 1 token
1243        let mut messages = vec![
1244            make_msg(Role::System, "system"),
1245            make_msg(Role::User, "user message content for placeholder rendering"),
1246        ];
1247        scorer
1248            .score_and_apply(
1249                &mut messages,
1250                "some query text here",
1251                &[],
1252                &cfg,
1253                &tc,
1254                0,
1255                false,
1256                None,
1257                None,
1258            )
1259            .await;
1260        assert_eq!(
1261            messages[1].metadata.fidelity_tag,
1262            Some(ContextFidelity::Placeholder)
1263        );
1264        assert!(messages[1].content.starts_with("[placeholder:"));
1265    }
1266
1267    // 13. #4593: exempt_tail_messages respected when n > max_scored_messages.
1268    //     Verify that tail messages (beyond score_end) keep no fidelity_tag.
1269    //     Use focus_pinned=true on tail messages so they stay exempt-from-scoring
1270    //     via is_exempt() and retain no tag regardless of the merge pass.
1271    //     n=20, max_scored_messages=10, exempt_tail_messages=5 → score_end=15.
1272    //     Indices [15..19] are in the exempt tail.
1273    #[tokio::test]
1274    async fn exempt_tail_messages_large_window() {
1275        let scorer = FidelityScorer;
1276        let cfg = FidelityConfig {
1277            // Force all scored messages to Placeholder so we can see which ones got tagged.
1278            full_threshold: 2.0,
1279            compressed_threshold: 1.5,
1280            max_scored_messages: 10,
1281            exempt_tail_messages: 5,
1282            ..make_cfg()
1283        };
1284        let tc = FixedTc(4);
1285
1286        // Index 0: system (always exempt).
1287        // Indices 1..14: regular user messages that fall in the scored region [0..15).
1288        // Indices 15..19: tail messages — mark them focus_pinned so they don't get scored.
1289        //   focus_pinned is the is_exempt() gate (INV-08), which makes them opaque to
1290        //   the scorer. This lets us assert their fidelity_tag stays None while the
1291        //   merge pass leaves them intact (it only merges Placeholder-tagged messages).
1292        let mut messages: Vec<Message> = std::iter::once(make_msg(Role::System, "system prompt"))
1293            .chain((1..15).map(|i| make_msg(Role::Assistant, &format!("assistant message {i}"))))
1294            .chain((15..20).map(|i| {
1295                let mut m = make_msg(Role::User, &format!("tail message {i}"));
1296                m.metadata.focus_pinned = true;
1297                m
1298            }))
1299            .collect();
1300
1301        scorer
1302            .score_and_apply(
1303                &mut messages,
1304                "query text here long",
1305                &[],
1306                &cfg,
1307                &tc,
1308                0,
1309                false,
1310                None,
1311                None,
1312            )
1313            .await;
1314
1315        // Tail messages (focus_pinned) must have no fidelity_tag.
1316        let tail: Vec<_> = messages
1317            .iter()
1318            .filter(|m| m.metadata.focus_pinned)
1319            .collect();
1320        assert_eq!(
1321            tail.len(),
1322            5,
1323            "all 5 tail messages must survive the merge pass"
1324        );
1325        for msg in &tail {
1326            assert!(
1327                msg.metadata.fidelity_tag.is_none(),
1328                "tail message must have no fidelity_tag, got {:?}",
1329                msg.metadata.fidelity_tag
1330            );
1331        }
1332    }
1333
1334    // 14. #4593: when n <= max_scored_messages, exempt_tail_messages has no effect.
1335    //     n=8, max_scored_messages=10, exempt_tail_messages=5 → score_end=8 (all scored).
1336    //     Use alternating roles to avoid placeholder merging.
1337    #[tokio::test]
1338    async fn exempt_tail_messages_small_window_no_effect() {
1339        let scorer = FidelityScorer;
1340        let cfg = FidelityConfig {
1341            full_threshold: 2.0,
1342            compressed_threshold: 1.5,
1343            max_scored_messages: 10,
1344            exempt_tail_messages: 5,
1345            ..make_cfg()
1346        };
1347        let tc = FixedTc(4);
1348        // 8 messages: index 0 = system, then alternating user/assistant.
1349        // Alternating roles prevent placeholder merge, keeping the length stable.
1350        let roles = [Role::User, Role::Assistant];
1351        let mut messages: Vec<Message> = std::iter::once(make_msg(Role::System, "system prompt"))
1352            .chain((1..8usize).map(|i| make_msg(roles[i % 2], &format!("message {i}"))))
1353            .collect();
1354        scorer
1355            .score_and_apply(
1356                &mut messages,
1357                "query text here long",
1358                &[],
1359                &cfg,
1360                &tc,
1361                0,
1362                false,
1363                None,
1364                None,
1365            )
1366            .await;
1367        // score_end = 8 (n=8 <= max_scored_messages=10, so exempt_tail not applied).
1368        // All non-system messages must be scored (tagged).
1369        let untagged_count = messages[1..]
1370            .iter()
1371            .filter(|m| m.metadata.fidelity_tag.is_none())
1372            .count();
1373        assert_eq!(
1374            untagged_count, 0,
1375            "all non-system messages must be scored when n <= max_scored_messages"
1376        );
1377    }
1378
1379    // 12. Compressed rendering uses deferred_summary when available.
1380    #[tokio::test]
1381    async fn compressed_uses_deferred_summary() {
1382        let scorer = FidelityScorer;
1383        let cfg = FidelityConfig {
1384            full_threshold: 2.0,       // nothing reaches Full
1385            compressed_threshold: 0.0, // everything at or above 0 → Compressed
1386            compressed_max_tokens: 5,
1387            ..make_cfg()
1388        };
1389        let tc = FixedTc(4);
1390        let mut msg_with_summary =
1391            make_msg(Role::User, "original long content that would be truncated");
1392        msg_with_summary.metadata.deferred_summary = Some("short summary".to_string());
1393        let mut messages = vec![make_msg(Role::System, "system"), msg_with_summary];
1394        scorer
1395            .score_and_apply(
1396                &mut messages,
1397                "query text here long",
1398                &[],
1399                &cfg,
1400                &tc,
1401                0,
1402                false,
1403                None,
1404                None,
1405            )
1406            .await;
1407        assert_eq!(
1408            messages[1].metadata.fidelity_tag,
1409            Some(ContextFidelity::Compressed)
1410        );
1411        assert_eq!(messages[1].content, "short summary");
1412    }
1413
1414    // ── CAM Phase 2-B: floor invariant tests ────────────────────────────────
1415
1416    fn make_msg_with_fidelity(role: Role, content: &str, tag: Option<ContextFidelity>) -> Message {
1417        let mut m = make_msg(role, content);
1418        m.metadata.fidelity_tag = tag;
1419        m
1420    }
1421
1422    // Floor: Compressed cannot upgrade to Full when allow_upgrade=false.
1423    #[tokio::test]
1424    async fn floor_prevents_compressed_upgrade_to_full() {
1425        let scorer = FidelityScorer;
1426        let cfg = FidelityConfig {
1427            // Very low thresholds so every message naturally scores Full.
1428            full_threshold: 0.0,
1429            compressed_threshold: -1.0,
1430            ..make_cfg()
1431        };
1432        let tc = FixedTc(4);
1433        let mut messages = vec![
1434            make_msg(Role::System, "system"),
1435            make_msg_with_fidelity(
1436                Role::User,
1437                "query text here long keyword",
1438                Some(ContextFidelity::Compressed),
1439            ),
1440        ];
1441        scorer
1442            .score_and_apply(
1443                &mut messages,
1444                "query text here long keyword",
1445                &[],
1446                &cfg,
1447                &tc,
1448                0,
1449                false,
1450                None,
1451                None,
1452            )
1453            .await;
1454        assert_eq!(
1455            messages[1].metadata.fidelity_tag,
1456            Some(ContextFidelity::Compressed),
1457            "Compressed floor must block upgrade to Full"
1458        );
1459    }
1460
1461    // Floor: Placeholder cannot upgrade to Full.
1462    #[tokio::test]
1463    async fn floor_prevents_placeholder_upgrade_to_full() {
1464        let scorer = FidelityScorer;
1465        let cfg = FidelityConfig {
1466            full_threshold: 0.0,
1467            compressed_threshold: -1.0,
1468            ..make_cfg()
1469        };
1470        let tc = FixedTc(4);
1471        let mut messages = vec![
1472            make_msg(Role::System, "system"),
1473            make_msg_with_fidelity(
1474                Role::User,
1475                "query text here long keyword",
1476                Some(ContextFidelity::Placeholder),
1477            ),
1478        ];
1479        scorer
1480            .score_and_apply(
1481                &mut messages,
1482                "query text here long keyword",
1483                &[],
1484                &cfg,
1485                &tc,
1486                0,
1487                false,
1488                None,
1489                None,
1490            )
1491            .await;
1492        assert_eq!(
1493            messages[1].metadata.fidelity_tag,
1494            Some(ContextFidelity::Placeholder),
1495            "Placeholder floor must block upgrade to Full"
1496        );
1497    }
1498
1499    // Floor: Placeholder cannot upgrade to Compressed.
1500    #[tokio::test]
1501    async fn floor_prevents_placeholder_upgrade_to_compressed() {
1502        let scorer = FidelityScorer;
1503        // Thresholds: full=2.0 (unreachable), compressed=0.0 (everything Compressed).
1504        let cfg = FidelityConfig {
1505            full_threshold: 2.0,
1506            compressed_threshold: 0.0,
1507            ..make_cfg()
1508        };
1509        let tc = FixedTc(4);
1510        let mut messages = vec![
1511            make_msg(Role::System, "system"),
1512            make_msg_with_fidelity(
1513                Role::User,
1514                "message content",
1515                Some(ContextFidelity::Placeholder),
1516            ),
1517        ];
1518        scorer
1519            .score_and_apply(
1520                &mut messages,
1521                "query text here long",
1522                &[],
1523                &cfg,
1524                &tc,
1525                0,
1526                false,
1527                None,
1528                None,
1529            )
1530            .await;
1531        assert_eq!(
1532            messages[1].metadata.fidelity_tag,
1533            Some(ContextFidelity::Placeholder),
1534            "Placeholder floor must block upgrade to Compressed"
1535        );
1536    }
1537
1538    // Floor: further downgrade (Compressed → Placeholder) is allowed.
1539    #[tokio::test]
1540    async fn floor_allows_further_downgrade() {
1541        let scorer = FidelityScorer;
1542        let cfg = FidelityConfig {
1543            full_threshold: 2.0,
1544            compressed_threshold: 2.0, // everything Placeholder
1545            ..make_cfg()
1546        };
1547        let tc = FixedTc(4);
1548        let mut messages = vec![
1549            make_msg(Role::System, "system"),
1550            make_msg_with_fidelity(
1551                Role::User,
1552                "some content",
1553                Some(ContextFidelity::Compressed),
1554            ),
1555        ];
1556        scorer
1557            .score_and_apply(
1558                &mut messages,
1559                "query text here long",
1560                &[],
1561                &cfg,
1562                &tc,
1563                0,
1564                false,
1565                None,
1566                None,
1567            )
1568            .await;
1569        assert_eq!(
1570            messages[1].metadata.fidelity_tag,
1571            Some(ContextFidelity::Placeholder),
1572            "downgrade from Compressed to Placeholder must be allowed"
1573        );
1574    }
1575
1576    // Floor: None tag → no constraint, normal scoring.
1577    #[tokio::test]
1578    async fn floor_no_constraint_when_none() {
1579        let scorer = FidelityScorer;
1580        // Force every message to Full.
1581        let cfg = FidelityConfig {
1582            full_threshold: 0.0,
1583            compressed_threshold: -1.0,
1584            ..make_cfg()
1585        };
1586        let tc = FixedTc(4);
1587        let mut messages = vec![
1588            make_msg(Role::System, "system"),
1589            make_msg_with_fidelity(Role::User, "query text here long keyword", None),
1590        ];
1591        scorer
1592            .score_and_apply(
1593                &mut messages,
1594                "query text here long keyword",
1595                &[],
1596                &cfg,
1597                &tc,
1598                0,
1599                false,
1600                None,
1601                None,
1602            )
1603            .await;
1604        assert_eq!(
1605            messages[1].metadata.fidelity_tag,
1606            Some(ContextFidelity::Full),
1607            "None tag must not constrain scoring"
1608        );
1609    }
1610
1611    // allow_upgrade=true bypasses the Placeholder floor.
1612    #[tokio::test]
1613    async fn allow_upgrade_bypasses_floor() {
1614        let scorer = FidelityScorer;
1615        let cfg = FidelityConfig {
1616            full_threshold: 0.0,
1617            compressed_threshold: -1.0,
1618            ..make_cfg()
1619        };
1620        let tc = FixedTc(4);
1621        let mut messages = vec![
1622            make_msg(Role::System, "system"),
1623            make_msg_with_fidelity(
1624                Role::User,
1625                "query text here long keyword",
1626                Some(ContextFidelity::Placeholder),
1627            ),
1628        ];
1629        scorer
1630            .score_and_apply(
1631                &mut messages,
1632                "query text here long keyword",
1633                &[],
1634                &cfg,
1635                &tc,
1636                0,
1637                true,
1638                None,
1639                None,
1640            )
1641            .await;
1642        assert_eq!(
1643            messages[1].metadata.fidelity_tag,
1644            Some(ContextFidelity::Full),
1645            "allow_upgrade=true must bypass the Placeholder floor"
1646        );
1647    }
1648
1649    // ── truncate_to_tokens unit tests ─────────────────────────────────────────
1650
1651    // no-op when content is well below limit
1652    #[test]
1653    fn truncate_no_op_below_limit() {
1654        let tc = FixedTc(1); // 1 char = 1 token
1655        let mut s = "hello".to_string(); // 5 tokens
1656        truncate_to_tokens(&mut s, 10, &tc);
1657        assert_eq!(s, "hello");
1658    }
1659
1660    // no-op when content is exactly at limit
1661    #[test]
1662    fn truncate_no_op_at_limit() {
1663        let tc = FixedTc(1);
1664        let mut s = "hello".to_string(); // 5 tokens
1665        truncate_to_tokens(&mut s, 5, &tc);
1666        assert_eq!(s, "hello");
1667    }
1668
1669    // truncates when content is one token over the limit
1670    #[test]
1671    fn truncate_minimal_one_over_limit() {
1672        let tc = FixedTc(1); // 1 char = 1 token
1673        let mut s = "abcdef".to_string(); // 6 tokens, limit=5
1674        truncate_to_tokens(&mut s, 5, &tc);
1675        assert!(
1676            tc.count_tokens(&s) <= 5,
1677            "result must fit in 5 tokens, got {}",
1678            tc.count_tokens(&s)
1679        );
1680        assert!(!s.is_empty(), "must keep prefix, not empty");
1681    }
1682
1683    // binary search preserves more than halving would: at 90% of limit no truncation occurs
1684    #[test]
1685    fn truncate_preserves_90pct_of_limit() {
1686        // FixedTc(1): each byte = 1 token; 90-byte string with limit=100 must not be truncated.
1687        let tc = FixedTc(1);
1688        let s_orig = "a".repeat(90);
1689        let mut s = s_orig.clone();
1690        truncate_to_tokens(&mut s, 100, &tc);
1691        assert_eq!(s, s_orig, "90% of limit must not be truncated");
1692    }
1693
1694    // empty string is a no-op
1695    #[test]
1696    fn truncate_empty_string_no_op() {
1697        let tc = FixedTc(1);
1698        let mut s = String::new();
1699        truncate_to_tokens(&mut s, 5, &tc);
1700        assert!(s.is_empty());
1701    }
1702
1703    // max_tokens=0 truncates everything
1704    #[test]
1705    fn truncate_max_tokens_zero_clears_content() {
1706        let tc = FixedTc(1);
1707        let mut s = "hello world".to_string();
1708        truncate_to_tokens(&mut s, 0, &tc);
1709        assert!(s.is_empty(), "max_tokens=0 must clear content");
1710    }
1711
1712    // multibyte characters: truncation must land on a valid char boundary
1713    #[test]
1714    fn truncate_multibyte_stays_on_char_boundary() {
1715        // "日本語" = 3 chars, each 3 bytes (9 bytes total).
1716        // FixedTc(3): count = byte_len / 3, so "日本語" = 3 tokens.
1717        // limit=2 → must truncate to "日本" (2 chars, 6 bytes).
1718        let tc = FixedTc(3);
1719        let mut s = "日本語".to_string();
1720        truncate_to_tokens(&mut s, 2, &tc);
1721        assert!(
1722            s.is_char_boundary(s.len()),
1723            "result must be on a valid char boundary"
1724        );
1725        assert!(tc.count_tokens(&s) <= 2);
1726        assert_eq!(s, "日本");
1727    }
1728
1729    // Mixed-fidelity tool pair: None + Some(Compressed) → both end up Compressed via atomicity.
1730    #[tokio::test]
1731    async fn mixed_fidelity_tool_pair_floor_plus_atomicity() {
1732        let scorer = FidelityScorer;
1733        // Force everything to Full naturally, so the floor is the only downward pressure.
1734        let cfg = FidelityConfig {
1735            full_threshold: 0.0,
1736            compressed_threshold: -1.0,
1737            ..make_cfg()
1738        };
1739        let tc = FixedTc(4);
1740        let tool_id = "tool-42".to_string();
1741
1742        let mut tool_use_msg = make_msg_with_fidelity(Role::Assistant, "call tool", None);
1743        tool_use_msg.parts = vec![MessagePart::ToolUse {
1744            id: tool_id.clone(),
1745            name: "shell".to_string(),
1746            input: serde_json::json!({}),
1747        }];
1748
1749        let mut tool_result_msg =
1750            make_msg_with_fidelity(Role::User, "result body", Some(ContextFidelity::Compressed));
1751        tool_result_msg.parts = vec![MessagePart::ToolResult {
1752            tool_use_id: tool_id.clone(),
1753            content: "output".to_string(),
1754            is_error: false,
1755        }];
1756
1757        let mut messages = vec![
1758            make_msg(Role::System, "system"),
1759            tool_use_msg,
1760            tool_result_msg,
1761        ];
1762
1763        scorer
1764            .score_and_apply(
1765                &mut messages,
1766                "query text here long",
1767                &[],
1768                &cfg,
1769                &tc,
1770                0,
1771                false,
1772                None,
1773                None,
1774            )
1775            .await;
1776
1777        // After floor clamping: tool_use scores Full (no floor), tool_result scores Full but
1778        // is clamped to Compressed by its floor. Atomicity then takes min(Full, Compressed) =
1779        // Compressed for both.
1780        let tag_use = messages[1].metadata.fidelity_tag;
1781        let tag_result = messages[2].metadata.fidelity_tag;
1782        assert_eq!(
1783            tag_use, tag_result,
1784            "tool pair must share the same fidelity level"
1785        );
1786        assert_eq!(
1787            tag_use,
1788            Some(ContextFidelity::Compressed),
1789            "atomicity must bring the tool-use down to the tool-result floor"
1790        );
1791    }
1792
1793    // 15. LLM path stores deferred_summary and updates content.
1794    #[tokio::test]
1795    async fn compress_llm_path_stores_deferred_summary() {
1796        use zeph_llm::LlmError;
1797        use zeph_llm::provider::ChatStream;
1798
1799        #[derive(Debug)]
1800        struct MockProvider;
1801
1802        impl zeph_llm::provider::LlmProvider for MockProvider {
1803            async fn chat(&self, _messages: &[Message]) -> Result<String, LlmError> {
1804                Ok("summary text".to_string())
1805            }
1806
1807            async fn chat_stream(&self, _messages: &[Message]) -> Result<ChatStream, LlmError> {
1808                Err(LlmError::Unavailable)
1809            }
1810
1811            fn supports_streaming(&self) -> bool {
1812                false
1813            }
1814
1815            async fn embed(&self, _text: &str) -> Result<Vec<f32>, LlmError> {
1816                Err(LlmError::EmbedUnsupported {
1817                    provider: "mock".into(),
1818                })
1819            }
1820
1821            fn supports_embeddings(&self) -> bool {
1822                false
1823            }
1824
1825            fn name(&self) -> &'static str {
1826                "mock"
1827            }
1828        }
1829
1830        let scorer = FidelityScorer;
1831        let cfg = FidelityConfig {
1832            enabled: true,
1833            // Force everything to Compressed.
1834            full_threshold: 2.0,
1835            compressed_threshold: 0.0,
1836            compressed_max_tokens: 5,
1837            compress_provider: Some(ProviderName::new("mock")),
1838            ..make_cfg()
1839        };
1840        // Use tc with chars-per-token=1 so a long string has more than 5*2=10 tokens.
1841        let tc = FixedTc(1);
1842        let content = "a".repeat(50); // 50 chars → 50 tokens → well above 5*2=10
1843        let mut messages = vec![
1844            make_msg(Role::System, "system"),
1845            make_msg(Role::User, &content),
1846        ];
1847
1848        let provider = MockProvider;
1849        scorer
1850            .score_and_apply(
1851                &mut messages,
1852                "some query text here",
1853                &[],
1854                &cfg,
1855                &tc,
1856                0,
1857                false,
1858                None,
1859                Some(&provider),
1860            )
1861            .await;
1862
1863        assert_eq!(
1864            messages[1].metadata.fidelity_tag,
1865            Some(ContextFidelity::Compressed),
1866        );
1867        // Content is capped to compressed_max_tokens (5 chars with FixedTc(1)) after LLM call.
1868        assert!(
1869            tc.count_tokens(&messages[1].content) <= 5,
1870            "content must be capped to compressed_max_tokens after LLM summary"
1871        );
1872        // deferred_summary stores the full LLM output for future reuse.
1873        assert_eq!(
1874            messages[1].metadata.deferred_summary,
1875            Some("summary text".to_string()),
1876        );
1877    }
1878
1879    // 16. LLM path skipped when provider is None → truncation used instead.
1880    #[tokio::test]
1881    async fn compress_llm_skipped_when_provider_none() {
1882        let scorer = FidelityScorer;
1883        let cfg = FidelityConfig {
1884            enabled: true,
1885            full_threshold: 2.0,
1886            compressed_threshold: 0.0,
1887            compressed_max_tokens: 5,
1888            compress_provider: Some(ProviderName::new("mock")),
1889            ..make_cfg()
1890        };
1891        let tc = FixedTc(1);
1892        let content = "a".repeat(50);
1893        let mut messages = vec![
1894            make_msg(Role::System, "system"),
1895            make_msg(Role::User, &content),
1896        ];
1897
1898        scorer
1899            .score_and_apply(
1900                &mut messages,
1901                "some query text here",
1902                &[],
1903                &cfg,
1904                &tc,
1905                0,
1906                false,
1907                None,
1908                None,
1909            )
1910            .await;
1911
1912        assert_eq!(
1913            messages[1].metadata.fidelity_tag,
1914            Some(ContextFidelity::Compressed),
1915        );
1916        // Truncation path: deferred_summary must NOT be set.
1917        assert!(
1918            messages[1].metadata.deferred_summary.is_none(),
1919            "deferred_summary must not be populated via truncation path"
1920        );
1921        // Content should be truncated to <= 5 chars.
1922        assert!(
1923            messages[1].content.len() <= 5,
1924            "content must be truncated, got len={}",
1925            messages[1].content.len()
1926        );
1927    }
1928
1929    // 17. cosine_similarity unit tests.
1930    #[test]
1931    fn cosine_similarity_identical() {
1932        let v = vec![1.0f32, 0.0, 0.0];
1933        assert!((cosine_similarity(&v, &v) - 1.0).abs() < 1e-6);
1934    }
1935
1936    #[test]
1937    fn cosine_similarity_orthogonal() {
1938        let a = vec![1.0f32, 0.0, 0.0];
1939        let b = vec![0.0f32, 1.0, 0.0];
1940        assert!(cosine_similarity(&a, &b).abs() < 1e-6);
1941    }
1942
1943    #[test]
1944    fn cosine_similarity_zero_vector() {
1945        let a = vec![0.0f32, 0.0, 0.0];
1946        let b = vec![1.0f32, 0.0, 0.0];
1947        assert!(cosine_similarity(&a, &b).abs() < f32::EPSILON);
1948    }
1949
1950    #[test]
1951    fn cosine_similarity_empty() {
1952        assert!(cosine_similarity(&[], &[]).abs() < f32::EPSILON);
1953    }
1954
1955    #[test]
1956    fn cosine_similarity_dimension_mismatch() {
1957        let a = vec![1.0f32, 0.0];
1958        let b = vec![1.0f32, 0.0, 0.0];
1959        assert!(cosine_similarity(&a, &b).abs() < f32::EPSILON);
1960    }
1961
1962    // 18. semantic_scoring_higher_for_similar_messages.
1963    //     Mock provider returns deterministic embeddings:
1964    //       "cat"  → [1.0, 0.0, 0.0]  (similar to query)
1965    //       "feline" → [0.9, 0.1, 0.0]  (similar to query)
1966    //       "stock" → [0.0, 0.0, 1.0]  (orthogonal to query)
1967    //       query "cat mat" → [1.0, 0.0, 0.0]
1968    #[tokio::test]
1969    async fn semantic_scoring_higher_for_similar_messages() {
1970        use zeph_llm::LlmError;
1971        use zeph_llm::provider::ChatStream;
1972
1973        #[derive(Debug)]
1974        struct EmbedMockProvider;
1975
1976        impl zeph_llm::provider::LlmProvider for EmbedMockProvider {
1977            async fn chat(&self, _: &[Message]) -> Result<String, LlmError> {
1978                Err(LlmError::Unavailable)
1979            }
1980            async fn chat_stream(&self, _: &[Message]) -> Result<ChatStream, LlmError> {
1981                Err(LlmError::Unavailable)
1982            }
1983            fn supports_streaming(&self) -> bool {
1984                false
1985            }
1986            async fn embed(&self, text: &str) -> Result<Vec<f32>, LlmError> {
1987                let v = if text.contains("cat")
1988                    || text.contains("mat")
1989                    || text.contains("feline")
1990                    || text.contains("rug")
1991                {
1992                    if text.contains("feline") || text.contains("rug") {
1993                        vec![0.9f32, 0.1, 0.0]
1994                    } else {
1995                        vec![1.0f32, 0.0, 0.0]
1996                    }
1997                } else {
1998                    vec![0.0f32, 0.0, 1.0]
1999                };
2000                Ok(v)
2001            }
2002            fn supports_embeddings(&self) -> bool {
2003                true
2004            }
2005            fn name(&self) -> &'static str {
2006                "embed-mock"
2007            }
2008        }
2009
2010        let provider = EmbedMockProvider;
2011        let scorer = FidelityScorer;
2012        let cfg = FidelityConfig {
2013            enabled: true,
2014            semantic_scoring_provider: Some(ProviderName::new("embed-mock")),
2015            // Only semantic + temporal active; force Full for all so we can inspect scores
2016            // by checking which messages survive with Full vs not.
2017            // Use extreme thresholds so all messages pass through as Full.
2018            full_threshold: 0.0,
2019            compressed_threshold: 0.0,
2020            w_semantic: 1.0,
2021            w_temporal: 0.0,
2022            w_importance: 0.0,
2023            w_plan: 0.0,
2024            ..make_cfg()
2025        };
2026        let tc = FixedTc(4);
2027        let cat_msg = make_msg(Role::User, "The cat is on the mat");
2028        let feline_msg = make_msg(Role::User, "A feline rests on the rug");
2029        let stock_msg = make_msg(Role::User, "Stock prices fell today");
2030        let mut messages = vec![
2031            make_msg(Role::System, "system"),
2032            cat_msg,
2033            feline_msg,
2034            stock_msg,
2035        ];
2036
2037        scorer
2038            .score_and_apply(
2039                &mut messages,
2040                "cat mat",
2041                &[],
2042                &cfg,
2043                &tc,
2044                0,
2045                false,
2046                Some(&provider),
2047                None,
2048            )
2049            .await;
2050
2051        // All should be scored (Full threshold = 0.0 means everything is Full).
2052        // Embeddings must have been populated.
2053        assert!(
2054            messages[1].metadata.embedding.is_some(),
2055            "cat message must have embedding"
2056        );
2057        assert!(
2058            messages[2].metadata.embedding.is_some(),
2059            "feline message must have embedding"
2060        );
2061        assert!(
2062            messages[3].metadata.embedding.is_some(),
2063            "stock message must have embedding"
2064        );
2065
2066        // Verify cosine directly: cat/feline should be > stock vs query [1,0,0].
2067        let query_emb = [1.0f32, 0.0, 0.0];
2068        let cat_emb = messages[1].metadata.embedding.as_ref().unwrap();
2069        let feline_emb = messages[2].metadata.embedding.as_ref().unwrap();
2070        let stock_emb = messages[3].metadata.embedding.as_ref().unwrap();
2071        assert!(
2072            cosine_similarity(cat_emb, &query_emb) > cosine_similarity(stock_emb, &query_emb),
2073            "cat message must be more similar to query than stock message"
2074        );
2075        assert!(
2076            cosine_similarity(feline_emb, &query_emb) > cosine_similarity(stock_emb, &query_emb),
2077            "feline message must be more similar to query than stock message"
2078        );
2079    }
2080
2081    // 19. semantic_scoring_falls_back_to_keyword_when_provider_none.
2082    #[tokio::test]
2083    async fn semantic_scoring_falls_back_to_keyword_when_provider_none() {
2084        let scorer = FidelityScorer;
2085        let cfg = FidelityConfig {
2086            enabled: true,
2087            semantic_scoring_provider: None,
2088            ..make_cfg()
2089        };
2090        let tc = FixedTc(4);
2091        let mut messages = vec![
2092            make_msg(Role::System, "system"),
2093            make_msg(Role::User, "cat mat keyword test"),
2094            make_msg(Role::User, "something unrelated here"),
2095        ];
2096
2097        scorer
2098            .score_and_apply(
2099                &mut messages,
2100                "query text here long",
2101                &[],
2102                &cfg,
2103                &tc,
2104                0,
2105                false,
2106                None,
2107                None,
2108            )
2109            .await;
2110
2111        // No embeddings should be computed since provider is None.
2112        for msg in &messages {
2113            assert!(
2114                msg.metadata.embedding.is_none(),
2115                "no embedding must be computed when provider is None"
2116            );
2117        }
2118        // Scoring must still work (keyword path).
2119        for msg in &messages[1..] {
2120            assert!(
2121                msg.metadata.fidelity_tag.is_some(),
2122                "all non-system messages must be scored via keyword path"
2123            );
2124        }
2125    }
2126
2127    // w_plan path: messages whose content overlaps planned tool keywords score higher.
2128    #[tokio::test]
2129    async fn w_plan_produces_nonzero_score_for_matching_message() {
2130        use zeph_common::PlannedToolHint;
2131
2132        let scorer = FidelityScorer;
2133        // Only w_plan is active: all other weights zero so plan signal is isolated.
2134        // full_threshold = 0.5 so that a non-zero plan score reaches Full.
2135        let cfg = FidelityConfig {
2136            w_semantic: 0.0,
2137            w_temporal: 0.0,
2138            w_importance: 0.0,
2139            w_plan: 1.0,
2140            full_threshold: 0.5,
2141            compressed_threshold: 0.1,
2142            min_query_length: 100, // disable semantic signal
2143            ..make_cfg()
2144        };
2145        let tc = FixedTc(4);
2146
2147        // Hint: tool_name="shell", keywords contain "cargo" and "build".
2148        let hint = PlannedToolHint::new("shell", vec!["cargo".to_string(), "build".to_string()], 1);
2149
2150        // matching_msg contains words from the hint keywords.
2151        // non_matching_msg has no overlap with hint keywords.
2152        let mut messages = vec![
2153            make_msg(Role::System, "system prompt"),
2154            make_msg(Role::User, "run cargo build to compile"),
2155            make_msg(Role::User, "what is the weather today"),
2156        ];
2157
2158        scorer
2159            .score_and_apply(
2160                &mut messages,
2161                "q", // short query to keep semantic inactive
2162                &[hint],
2163                &cfg,
2164                &tc,
2165                0,
2166                false,
2167                None,
2168                None,
2169            )
2170            .await;
2171
2172        // The matching message should be scored Full (plan overlap is high).
2173        assert_eq!(
2174            messages[1].metadata.fidelity_tag,
2175            Some(ContextFidelity::Full),
2176            "message matching planned tool keywords must reach Full fidelity"
2177        );
2178
2179        // The non-matching message should not be Full (plan overlap is zero).
2180        assert_ne!(
2181            messages[2].metadata.fidelity_tag,
2182            Some(ContextFidelity::Full),
2183            "message with no keyword overlap must not reach Full fidelity via w_plan"
2184        );
2185    }
2186
2187    // ── truncate_to_byte_limit ────────────────────────────────────────────────
2188
2189    #[test]
2190    fn truncate_to_byte_limit_no_op_when_short() {
2191        assert_eq!(truncate_to_byte_limit("hello", 10), "hello");
2192    }
2193
2194    #[test]
2195    fn truncate_to_byte_limit_exact_limit_no_op() {
2196        assert_eq!(truncate_to_byte_limit("hello", 5), "hello");
2197    }
2198
2199    #[test]
2200    fn truncate_to_byte_limit_over_limit() {
2201        let s = truncate_to_byte_limit("abcdefgh", 5);
2202        assert_eq!(s.len(), 5);
2203        assert_eq!(s, "abcde");
2204    }
2205
2206    #[test]
2207    fn truncate_to_byte_limit_multibyte_boundary() {
2208        // "日本語" = 3 chars, each 3 bytes (9 bytes total). max_bytes=6 → "日本" (6 bytes).
2209        let s = truncate_to_byte_limit("日本語", 6);
2210        assert!(s.is_char_boundary(s.len()));
2211        assert_eq!(s, "日本");
2212    }
2213
2214    // ── apply_input_cap ───────────────────────────────────────────────────────
2215
2216    #[test]
2217    fn apply_input_cap_no_op_below_limit() {
2218        let mut s = "hello".to_string();
2219        apply_input_cap(&mut s, 10); // 10 * 4 = 40 bytes, well above 5
2220        assert_eq!(s, "hello");
2221    }
2222
2223    #[test]
2224    fn apply_input_cap_truncates_over_limit() {
2225        // max_tokens=1 → max_bytes=4
2226        let mut s = "abcdefgh".to_string();
2227        apply_input_cap(&mut s, 1);
2228        assert_eq!(s, "abcd");
2229    }
2230
2231    #[test]
2232    fn apply_input_cap_multibyte() {
2233        // "日本語" = 9 bytes. max_tokens=1 → max_bytes=4. floor_char_boundary(4) on "日本語"
2234        // lands at 3 (end of "日"). Result is "日".
2235        let mut s = "日本語".to_string();
2236        apply_input_cap(&mut s, 1);
2237        assert!(s.is_char_boundary(s.len()));
2238        assert_eq!(s, "日");
2239    }
2240
2241    // ── embed_prepass ─────────────────────────────────────────────────────────
2242
2243    #[tokio::test]
2244    async fn embed_prepass_returns_embeddings_for_non_exempt() {
2245        let messages = vec![
2246            make_msg(Role::System, "system prompt"), // exempt (idx=0, system)
2247            make_msg(Role::User, "user message"),
2248            make_msg(Role::Assistant, "assistant reply"),
2249        ];
2250        let cfg = FidelityConfig::default();
2251        let embed = |_text: &str| -> EmbedFuture { Box::pin(async { Ok(vec![1.0f32, 2.0, 3.0]) }) };
2252        let result = embed_prepass(&messages, &embed, &cfg, 0).await;
2253        // Index 0 (system) is exempt, indices 1 and 2 get embeddings.
2254        assert!(!result.contains_key(&0));
2255        assert_eq!(result[&1], vec![1.0, 2.0, 3.0]);
2256        assert_eq!(result[&2], vec![1.0, 2.0, 3.0]);
2257    }
2258
2259    #[tokio::test]
2260    async fn embed_prepass_skips_empty_content() {
2261        let messages = vec![make_msg(Role::System, "system"), make_msg(Role::User, "")];
2262        let cfg = FidelityConfig::default();
2263        let embed = |_text: &str| -> EmbedFuture { Box::pin(async { Ok(vec![1.0f32]) }) };
2264        let result = embed_prepass(&messages, &embed, &cfg, 0).await;
2265        assert!(!result.contains_key(&1), "empty content must be skipped");
2266    }
2267
2268    #[tokio::test]
2269    async fn embed_prepass_skips_inserted_memory() {
2270        let messages = vec![
2271            make_msg(Role::System, "system"),
2272            make_msg(Role::User, "injected memory"),
2273            make_msg(Role::User, "real user message"),
2274        ];
2275        let cfg = FidelityConfig::default();
2276        let embed = |_text: &str| -> EmbedFuture { Box::pin(async { Ok(vec![1.0f32]) }) };
2277        // inserted_count=1 → idx 1 is exempt
2278        let result = embed_prepass(&messages, &embed, &cfg, 1).await;
2279        assert!(!result.contains_key(&0), "system is exempt");
2280        assert!(!result.contains_key(&1), "inserted memory is exempt");
2281        assert!(
2282            result.contains_key(&2),
2283            "real user message must be embedded"
2284        );
2285    }
2286
2287    #[tokio::test]
2288    async fn embed_prepass_silently_skips_errors() {
2289        let messages = vec![
2290            make_msg(Role::System, "system"),
2291            make_msg(Role::User, "user"),
2292        ];
2293        let cfg = FidelityConfig::default();
2294        let embed = |_text: &str| -> EmbedFuture {
2295            Box::pin(async {
2296                Err(zeph_llm::LlmError::EmbedUnsupported {
2297                    provider: "mock".to_string(),
2298                })
2299            })
2300        };
2301        let result = embed_prepass(&messages, &embed, &cfg, 0).await;
2302        assert!(result.is_empty(), "errors must be silently skipped");
2303    }
2304
2305    #[tokio::test]
2306    async fn embed_prepass_truncates_content_when_cap_set() {
2307        // With max_embed_input_tokens=1 (max_bytes=4), content longer than 4 bytes is truncated.
2308        let long_content = "a".repeat(100);
2309        let messages = vec![
2310            make_msg(Role::System, "system"),
2311            make_msg(Role::User, &long_content),
2312        ];
2313        let cfg = FidelityConfig {
2314            max_embed_input_tokens: Some(1), // 1 * 4 = 4 bytes max
2315            ..FidelityConfig::default()
2316        };
2317        let seen_len = std::sync::Arc::new(std::sync::Mutex::new(0usize));
2318        let seen_len_clone = seen_len.clone();
2319        let embed = move |text: &str| -> EmbedFuture {
2320            let len = text.len();
2321            let seen = seen_len_clone.clone();
2322            Box::pin(async move {
2323                *seen.lock().unwrap() = len;
2324                Ok(vec![1.0f32])
2325            })
2326        };
2327        embed_prepass(&messages, &embed, &cfg, 0).await;
2328        assert_eq!(
2329            *seen_len.lock().unwrap(),
2330            4,
2331            "content must be truncated to max_embed_input_tokens * 4 bytes"
2332        );
2333    }
2334
2335    #[tokio::test]
2336    async fn embed_prepass_concurrency_zero_clamped_to_one() {
2337        // embed_concurrency=0 must be clamped to 1 (with a warning) and still produce results.
2338        let messages = vec![
2339            make_msg(Role::System, "system"),
2340            make_msg(Role::User, "user message"),
2341        ];
2342        let cfg = FidelityConfig {
2343            embed_concurrency: 0,
2344            ..FidelityConfig::default()
2345        };
2346        let embed = |_text: &str| -> EmbedFuture { Box::pin(async { Ok(vec![1.0f32]) }) };
2347        let result = embed_prepass(&messages, &embed, &cfg, 0).await;
2348        assert!(
2349            result.contains_key(&1),
2350            "result must be produced even with concurrency=0"
2351        );
2352    }
2353
2354    #[tokio::test]
2355    async fn embed_prepass_skips_cached_embeddings() {
2356        let mut msg_with_cache = make_msg(Role::User, "already embedded");
2357        msg_with_cache.metadata.embedding = Some(vec![9.0f32]);
2358        let messages = vec![
2359            make_msg(Role::System, "system"),
2360            msg_with_cache,
2361            make_msg(Role::User, "needs embedding"),
2362        ];
2363        let cfg = FidelityConfig::default();
2364        let embed = |_text: &str| -> EmbedFuture { Box::pin(async { Ok(vec![1.0f32]) }) };
2365        let result = embed_prepass(&messages, &embed, &cfg, 0).await;
2366        assert!(
2367            !result.contains_key(&1),
2368            "message with cached embedding must be skipped"
2369        );
2370        assert!(
2371            result.contains_key(&2),
2372            "message without embedding must be processed"
2373        );
2374    }
2375
2376    #[tokio::test(start_paused = true)]
2377    async fn embed_prepass_timeout_skips_message() {
2378        let messages = vec![
2379            make_msg(Role::System, "system"),
2380            make_msg(Role::User, "user"),
2381        ];
2382        let cfg = FidelityConfig::default();
2383        let embed = |_text: &str| -> EmbedFuture {
2384            Box::pin(async {
2385                // Simulate a stalled embed provider.
2386                tokio::time::sleep(Duration::from_secs(45)).await;
2387                Ok(vec![1.0f32])
2388            })
2389        };
2390        // Run the pre-pass; the 30-second timeout fires before the 45-second sleep.
2391        let result = embed_prepass(&messages, &embed, &cfg, 0).await;
2392        assert!(result.is_empty(), "timed-out embed must be skipped");
2393    }
2394
2395    #[tokio::test(start_paused = true)]
2396    async fn embed_prepass_custom_timeout_respected() {
2397        let messages = vec![
2398            make_msg(Role::System, "system"),
2399            make_msg(Role::User, "user"),
2400        ];
2401        // embed_timeout_secs=5: provider takes 10 s → must be skipped.
2402        let cfg = FidelityConfig {
2403            embed_timeout_secs: 5,
2404            ..FidelityConfig::default()
2405        };
2406        let embed = |_text: &str| -> EmbedFuture {
2407            Box::pin(async {
2408                tokio::time::sleep(Duration::from_secs(10)).await;
2409                Ok(vec![1.0f32])
2410            })
2411        };
2412        let result = embed_prepass(&messages, &embed, &cfg, 0).await;
2413        assert!(
2414            result.is_empty(),
2415            "custom embed_timeout_secs must be honored"
2416        );
2417    }
2418
2419    // ── FidelityConfig new fields ─────────────────────────────────────────────
2420
2421    #[test]
2422    fn fidelity_config_new_fields_defaults() {
2423        let cfg = FidelityConfig::default();
2424        assert_eq!(cfg.embed_concurrency, 32);
2425        assert!(cfg.max_embed_input_tokens.is_none());
2426        assert!(cfg.max_compress_input_tokens.is_none());
2427    }
2428
2429    #[test]
2430    fn fidelity_config_timeout_defaults() {
2431        let cfg = FidelityConfig::default();
2432        assert_eq!(cfg.embed_timeout_secs, 30);
2433        assert_eq!(cfg.compress_timeout_secs, 30);
2434    }
2435
2436    #[test]
2437    fn fidelity_config_new_fields_custom() {
2438        let cfg = FidelityConfig {
2439            embed_concurrency: 8,
2440            max_embed_input_tokens: Some(512),
2441            max_compress_input_tokens: Some(1024),
2442            ..FidelityConfig::default()
2443        };
2444        assert_eq!(cfg.embed_concurrency, 8);
2445        assert_eq!(cfg.max_embed_input_tokens, Some(512));
2446        assert_eq!(cfg.max_compress_input_tokens, Some(1024));
2447    }
2448
2449    // Post-compress truncation: deferred_summary exceeding compressed_max_tokens is trimmed.
2450    #[tokio::test]
2451    async fn render_compressed_truncates_oversized_deferred_summary() {
2452        let scorer = FidelityScorer;
2453        // full_threshold=2.0 unreachable, compressed_threshold=0.0 → everything Compressed.
2454        // compressed_max_tokens=3 (with FixedTc(1): 1 char = 1 token, so 3 tokens = 3 chars).
2455        let cfg = FidelityConfig {
2456            full_threshold: 2.0,
2457            compressed_threshold: 0.0,
2458            compressed_max_tokens: 3,
2459            ..make_cfg()
2460        };
2461        let tc = FixedTc(1);
2462        let mut msg = make_msg(Role::User, "original long content");
2463        // Simulate LLM returning a summary that is 10 chars (> 3 tokens).
2464        msg.metadata.deferred_summary = Some("ten chars!".to_string());
2465        let mut messages = vec![make_msg(Role::System, "sys"), msg];
2466        scorer
2467            .score_and_apply(
2468                &mut messages,
2469                "query text here long",
2470                &[],
2471                &cfg,
2472                &tc,
2473                0,
2474                false,
2475                None,
2476                None,
2477            )
2478            .await;
2479        let compressed = &messages[1];
2480        assert_eq!(
2481            compressed.metadata.fidelity_tag,
2482            Some(ContextFidelity::Compressed)
2483        );
2484        assert!(
2485            tc.count_tokens(&compressed.content) <= 3,
2486            "deferred_summary result must be truncated to compressed_max_tokens"
2487        );
2488    }
2489
2490    // max_compress_input_tokens: content is capped before truncation when no deferred_summary.
2491    #[tokio::test]
2492    async fn render_compressed_applies_max_compress_input_tokens() {
2493        let scorer = FidelityScorer;
2494        // Force everything to Compressed. max_compress_input_tokens=2 → max_bytes=8.
2495        // FixedTc(1): 1 byte = 1 token. compressed_max_tokens=100 (no output cap needed here).
2496        let cfg = FidelityConfig {
2497            full_threshold: 2.0,
2498            compressed_threshold: 0.0,
2499            compressed_max_tokens: 100,
2500            max_compress_input_tokens: Some(2), // 2 * 4 = 8 bytes
2501            ..make_cfg()
2502        };
2503        let tc = FixedTc(1);
2504        let content_20 = "a".repeat(20); // 20 bytes, must be capped to 8
2505        let mut msg = make_msg(Role::User, &content_20);
2506        // No deferred_summary → input cap path applies.
2507        let mut messages = vec![make_msg(Role::System, "sys"), msg.clone()];
2508        scorer
2509            .score_and_apply(
2510                &mut messages,
2511                "query text here long",
2512                &[],
2513                &cfg,
2514                &tc,
2515                0,
2516                false,
2517                None,
2518                None,
2519            )
2520            .await;
2521        let compressed = &messages[1];
2522        assert_eq!(
2523            compressed.metadata.fidelity_tag,
2524            Some(ContextFidelity::Compressed)
2525        );
2526        // Input was capped to 8 bytes, then output cap of 100 is no-op, so content is 8 bytes.
2527        assert_eq!(
2528            compressed.content.len(),
2529            8,
2530            "content must be capped to max_compress_input_tokens * 4 bytes"
2531        );
2532        // Verify deferred_summary path is unaffected by input cap.
2533        msg.metadata.deferred_summary = Some("short".to_string());
2534        let mut messages2 = vec![make_msg(Role::System, "sys"), msg];
2535        scorer
2536            .score_and_apply(
2537                &mut messages2,
2538                "query text here long",
2539                &[],
2540                &cfg,
2541                &tc,
2542                0,
2543                false,
2544                None,
2545                None,
2546            )
2547            .await;
2548        assert_eq!(
2549            messages2[1].content, "short",
2550            "deferred_summary must bypass input cap"
2551        );
2552    }
2553}