Skip to main content

zeph_context/
fidelity.rs

1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4//! Heuristic fidelity scorer for Context-Adaptive Memory (CAM).
5//!
6//! [`FidelityScorer`] is a stateless scoring engine that assigns a three-level
7//! representation ([`ContextFidelity::Full`] / [`ContextFidelity::Compressed`] /
8//! [`ContextFidelity::Placeholder`]) to each message in the context window. Scoring is
9//! driven by weighted signals: temporal recency, role importance, keyword-based semantic
10//! relevance, and optional plan hints.
11//!
12//! [`FidelityConfig`] holds all tuning knobs; it is read from `[memory.fidelity]` in
13//! `config.toml`. When `enabled = false` (the default), the scorer returns immediately
14//! without modifying the message window.
15//!
16//! ## Embed pre-pass
17//!
18//! [`embed_prepass`] runs concurrently via `futures::stream::buffer_unordered` and
19//! populates per-message embedding vectors before scoring. The concurrency bound is
20//! controlled by [`FidelityConfig::embed_concurrency`] (default 32).
21
22use std::time::Duration;
23
24use futures::StreamExt as _;
25use tracing::info_span;
26use zeph_common::memory::TokenCounting;
27use zeph_common::{ContextFidelity, PlannedToolHint};
28use zeph_llm::LlmProviderDyn;
29use zeph_llm::provider::{EmbedFuture, Message, MessageMetadata, MessagePart, Role};
30
31use crate::assembler::CORRECTIONS_PREFIX;
32
33// Re-export FidelityConfig from zeph-config so both crates share one definition.
34pub use zeph_config::FidelityConfig;
35
36/// Run embed calls for `messages` concurrently, returning per-index embedding vectors.
37///
38/// Indexes not included in the result either had no content, already had an embedding
39/// cached in `msg.metadata.embedding`, or produced an error (errors are logged at `debug`
40/// level and silently skipped — scoring falls back to keyword overlap for those messages).
41/// Each embed call is bounded by a 30-second timeout; timed-out messages are skipped with
42/// a `warn`-level log.
43///
44/// Only exempt messages (focus-pinned, inserted memory, system) are skipped; for non-exempt messages the content
45/// is optionally truncated to `config.max_embed_input_tokens * 4` bytes (at a char boundary)
46/// before the call.
47///
48/// # Note
49///
50/// This function accepts a generic `F: Fn(&str) -> EmbedFuture` closure, which requires
51/// the returned future to be `'static`. This is convenient for unit tests using mock
52/// closures. Production code in [`FidelityScorer::score_and_apply`] runs the equivalent
53/// concurrent pre-pass inline to avoid the `'static` bound on `&dyn LlmProviderDyn`.
54///
55/// Concurrency is bounded by [`FidelityConfig::embed_concurrency`] (default 32), which
56/// maps to the `buffer_unordered(N)` parameter. A zero value is clamped to 1 with a warning.
57///
58/// # Examples
59///
60/// ```no_run
61/// use zeph_context::fidelity::{embed_prepass, FidelityConfig};
62///
63/// async fn example() {
64///     let messages = vec![];
65///     let cfg = FidelityConfig::default();
66///     let embed = |_text: &str| -> zeph_llm::provider::EmbedFuture {
67///         Box::pin(async { Ok(vec![0.0f32; 384]) })
68///     };
69///     let _embeddings = embed_prepass(&messages, &embed, &cfg, 0).await;
70/// }
71/// ```
72pub async fn embed_prepass<F>(
73    messages: &[Message],
74    embed: &F,
75    config: &FidelityConfig,
76    inserted_count: usize,
77) -> std::collections::HashMap<usize, Vec<f32>>
78where
79    F: Fn(&str) -> EmbedFuture + Send + Sync,
80{
81    let concurrency = if config.embed_concurrency == 0 {
82        tracing::warn!(
83            "embed_concurrency is 0, clamping to 1; set a positive value in [context.fidelity]"
84        );
85        1
86    } else {
87        config.embed_concurrency
88    };
89
90    let tasks = messages.iter().enumerate().filter_map(|(i, msg)| {
91        if is_exempt(msg, i, inserted_count)
92            || msg.content.is_empty()
93            || msg.metadata.embedding.is_some()
94        {
95            return None;
96        }
97        let content = match config.max_embed_input_tokens {
98            Some(n) => truncate_to_byte_limit(&msg.content, n.saturating_mul(4)),
99            None => msg.content.clone(),
100        };
101        Some((i, content))
102    });
103
104    futures::stream::iter(tasks)
105        .map(|(i, content)| async move {
106            let result = tokio::time::timeout(Duration::from_secs(30), embed(&content)).await;
107            match result {
108                Ok(Ok(vec)) => Some((i, vec)),
109                Ok(Err(e)) => {
110                    tracing::debug!(idx = i, err = %e, "embed_prepass: embed failed, skipping");
111                    None
112                }
113                Err(_) => {
114                    tracing::warn!(idx = i, "embed_prepass: embed timed out, skipping");
115                    None
116                }
117            }
118        })
119        .buffer_unordered(concurrency)
120        .filter_map(|opt| async move { opt })
121        .collect()
122        .await
123}
124
125/// Truncate `s` to at most `max_bytes` bytes, landing on a valid UTF-8 char boundary.
126///
127/// The limit is a byte count, not a character count. Callers using a token approximation
128/// should pass `n * 4` (the 4-byte-per-token heuristic) via [`saturating_mul`](usize::saturating_mul).
129/// Uses [`str::floor_char_boundary`] (stable since Rust 1.91) so multibyte characters
130/// are never split.
131///
132/// Returns a new `String`; does not modify the original.
133fn truncate_to_byte_limit(s: &str, max_bytes: usize) -> String {
134    if s.len() <= max_bytes {
135        return s.to_string();
136    }
137    let boundary = s.floor_char_boundary(max_bytes);
138    s[..boundary].to_string()
139}
140
/// Per-message scoring result produced by `compute_scores` and consumed by `apply_scores`.
struct FidelityScore {
    /// Weighted signal sum divided by the active weight sum, clamped to `[0, 1]`.
    score: f32,
    /// Fidelity level selected for the message after the persisted-tag floor and
    /// tool-pair atomicity have been applied.
    level: ContextFidelity,
    /// Token count of the message content measured before any rendering.
    original_tokens: u32,
}
146
/// Stateless heuristic scorer that assigns and applies fidelity levels to a message window.
///
/// Call [`FidelityScorer::score_and_apply`] after `apply_prepared_context()` returns to
/// enforce the three-level representation (Full / Compressed / Placeholder) on historical
/// messages. The scorer never touches exempt messages (INV-07 through INV-10).
///
/// The type is a unit struct with no fields: every input (messages, query, config,
/// providers) is passed to [`FidelityScorer::score_and_apply`] as an argument.
///
/// # Examples
///
/// ```no_run
/// use zeph_context::fidelity::{FidelityConfig, FidelityScorer};
///
/// # async fn run() {
/// let scorer = FidelityScorer;
/// let cfg = FidelityConfig { enabled: false, ..FidelityConfig::default() };
/// // With `enabled = false` the scorer is a no-op.
/// let mut messages = vec![];
/// scorer.score_and_apply(&mut messages, "query", &[], &cfg, &MockTc, 0, false, None, None).await;
///
/// struct MockTc;
/// impl zeph_common::memory::TokenCounting for MockTc {
///     fn count_tokens(&self, text: &str) -> usize { text.len() / 4 }
///     fn count_tool_schema_tokens(&self, _: &serde_json::Value) -> usize { 0 }
/// }
/// # }
/// ```
pub struct FidelityScorer;
173
174impl FidelityScorer {
175    /// Score all non-exempt messages and apply fidelity rendering in-place.
176    ///
177    /// Steps (per spec §5 data flow):
178    /// 1. Guard: return early when `enabled == false`.
179    /// 2. Build exempt set (INV-07 through INV-10).
180    /// 3. Score each non-exempt message with normalized weight sum (INV-05).
181    /// 4. Apply floor invariant: a message with a persisted fidelity tag cannot
182    ///    be upgraded to a less restrictive level unless `allow_upgrade = true`.
183    /// 5. Apply tool-pair atomicity — both get `min(score_a, score_b)` (INV-03).
184    /// 6. Render `Compressed` / `Placeholder` messages (INV-12).
185    /// 7. Merge consecutive same-role `Placeholder` messages (INV-04).
186    ///
187    /// # Parameters
188    ///
189    /// - `messages` — mutable message window (includes system prompt at index 0).
190    /// - `query` — current user query; drives semantic signal.
191    /// - `planned_tools` — DAG lookahead hints; empty slice disables plan signal.
192    /// - `config` — scoring thresholds and weights.
193    /// - `tc` — token counter used for `Placeholder`/`Compressed` rendering.
194    /// - `inserted_count` — number of memory messages freshly injected at indices
195    ///   `1..1+inserted_count`; these are always exempt (INV-10).
196    /// - `allow_upgrade` — when `true` the persisted floor invariant is bypassed.
197    ///   Pass `true` only from the proactive regrade path; pass `false` everywhere else.
198    /// - `embed_provider` — optional LLM provider used for query and per-message embeddings
199    ///   when `config.semantic_scoring_provider` is set. Pass `None` to fall back to keyword
200    ///   overlap.
201    /// - `compress_provider` — optional LLM provider used for `Compressed` rendering when
202    ///   `config.compress_provider` is set. Pass `None` to fall back to truncation.
203    ///
204    /// The two providers may be the same instance or different ones (e.g. a fast embedding
205    /// model for `embed_provider` and a generative model for `compress_provider`).
206    #[allow(clippy::too_many_arguments)]
207    pub async fn score_and_apply(
208        &self,
209        messages: &mut Vec<Message>,
210        query: &str,
211        planned_tools: &[PlannedToolHint],
212        config: &FidelityConfig,
213        tc: &dyn TokenCounting,
214        inserted_count: usize,
215        allow_upgrade: bool,
216        embed_provider: Option<&dyn LlmProviderDyn>,
217        compress_provider: Option<&dyn LlmProviderDyn>,
218    ) {
219        if !config.enabled || messages.is_empty() {
220            return;
221        }
222
223        // Embed query once when semantic provider is configured.
224        let query_embedding: Option<Vec<f32>> = if let (true, Some(p)) =
225            (config.semantic_scoring_provider.is_some(), embed_provider)
226            && p.supports_embeddings()
227        {
228            let _span = info_span!("context.fidelity.embed_query").entered();
229            match tokio::time::timeout(Duration::from_secs(30), p.embed(query)).await {
230                Ok(Ok(v)) => Some(v),
231                Ok(Err(e)) => {
232                    tracing::warn!(error = %e, "semantic scoring provider unavailable, falling back to keyword");
233                    None
234                }
235                Err(_) => {
236                    tracing::warn!("fidelity query embed timed out, falling back to keyword");
237                    None
238                }
239            }
240        } else {
241            None
242        };
243
244        // Concurrent embed pre-pass: populate missing embeddings for the scored window.
245        // Uses buffer_unordered so all N embed calls run in parallel (bounded by
246        // embed_concurrency), replacing the former sequential O(N) loop.
247        if let (Some(q_emb), Some(p)) = (&query_embedding, embed_provider) {
248            let n = messages.len();
249            let score_end = if n > config.max_scored_messages {
250                n.saturating_sub(config.exempt_tail_messages)
251            } else {
252                n
253            };
254            let concurrency = if config.embed_concurrency == 0 {
255                1
256            } else {
257                config.embed_concurrency
258            };
259            let _span = info_span!("context.fidelity.embed_prepass").entered();
260            let embeddings: std::collections::HashMap<usize, Vec<f32>> =
261                futures::stream::iter(messages[..score_end].iter().enumerate().filter_map(
262                    |(i, msg)| {
263                        if msg.metadata.embedding.is_none()
264                            && !is_exempt(msg, i, inserted_count)
265                            && !msg.content.is_empty()
266                        {
267                            let content = match config.max_embed_input_tokens {
268                                Some(n) => {
269                                    truncate_to_byte_limit(&msg.content, n.saturating_mul(4))
270                                }
271                                None => msg.content.clone(),
272                            };
273                            Some((i, content))
274                        } else {
275                            None
276                        }
277                    },
278                ))
279                .map(|(i, content)| async move {
280                    let result =
281                        tokio::time::timeout(Duration::from_secs(30), p.embed(&content)).await;
282                    match result {
283                        Ok(Ok(v)) => Some((i, v)),
284                        Ok(Err(e)) => {
285                            tracing::warn!(error = %e, "message embed failed, skipping");
286                            None
287                        }
288                        Err(_) => {
289                            tracing::warn!(idx = i, "fidelity message embed timed out, skipping");
290                            None
291                        }
292                    }
293                })
294                .buffer_unordered(concurrency)
295                .filter_map(|opt| async move { opt })
296                .collect()
297                .await;
298            for (i, emb) in embeddings {
299                messages[i].metadata.embedding = Some(emb);
300            }
301            let _ = q_emb; // used below in compute_scores
302        }
303
304        let scores = compute_scores(
305            messages,
306            query,
307            planned_tools,
308            config,
309            tc,
310            inserted_count,
311            allow_upgrade,
312            query_embedding.as_deref(),
313        );
314        apply_scores(messages, &scores, config, tc, compress_provider).await;
315
316        let _merge_span = info_span!("context.fidelity.merge").entered();
317        let merged_count = merge_consecutive_placeholders(messages);
318        tracing::debug!(merged_count, "fidelity merge complete");
319    }
320}
321
322#[allow(clippy::too_many_arguments)]
323fn compute_scores(
324    messages: &[Message],
325    query: &str,
326    planned_tools: &[PlannedToolHint],
327    config: &FidelityConfig,
328    tc: &dyn TokenCounting,
329    inserted_count: usize,
330    allow_upgrade: bool,
331    query_embedding: Option<&[f32]>,
332) -> Vec<Option<FidelityScore>> {
333    let n = messages.len();
334
335    // Performance cap: only score oldest messages; newest `exempt_tail_messages` default to Full.
336    let score_end = if n > config.max_scored_messages {
337        n.saturating_sub(config.exempt_tail_messages)
338    } else {
339        n
340    };
341
342    let semantic_active = query.len() >= config.min_query_length;
343    let plan_active = !planned_tools.is_empty();
344    // Build once outside the per-message loop (SF-1: avoids 500 redundant allocations).
345    let query_words: std::collections::HashSet<&str> = if semantic_active {
346        query.split_whitespace().collect()
347    } else {
348        std::collections::HashSet::default()
349    };
350
351    // Compute the active weight sum (INV-05).
352    let mut weight_sum = config.w_temporal + config.w_importance;
353    if semantic_active {
354        weight_sum += config.w_semantic;
355    }
356    if plan_active {
357        weight_sum += config.w_plan;
358    }
359    if weight_sum <= 0.0 {
360        weight_sum = 1.0;
361    }
362
363    #[allow(clippy::cast_precision_loss)]
364    let max_dist = score_end.saturating_sub(1) as f32;
365
366    let mut scores: Vec<Option<FidelityScore>> = (0..n).map(|_| None).collect();
367
368    for (i, msg) in messages.iter().enumerate().take(score_end) {
369        if is_exempt(msg, i, inserted_count) {
370            continue;
371        }
372
373        #[allow(clippy::cast_possible_truncation)]
374        let original_tokens = tc.count_tokens(&msg.content) as u32;
375
376        // distance_from_end = 0 for newest (i = score_end-1), N-1 for oldest (i = 0).
377        // Spec §6.1: temporal = 1.0 - distance_from_end / max_dist → newest = 1.0, oldest ≈ 0.0.
378        #[allow(clippy::cast_precision_loss)]
379        let temporal = if max_dist > 0.0 {
380            let distance_from_end = (score_end - 1 - i) as f32;
381            1.0 - distance_from_end / max_dist
382        } else {
383            1.0
384        };
385        // Spec §6.2: ToolResult messages use weight 0.4 regardless of Role::User mapping.
386        let importance = if msg
387            .parts
388            .iter()
389            .any(|p| matches!(p, MessagePart::ToolResult { .. }))
390        {
391            0.4
392        } else {
393            role_weight(msg.role)
394        };
395        let semantic = if semantic_active {
396            match (query_embedding, msg.metadata.embedding.as_deref()) {
397                (Some(q_emb), Some(m_emb)) => semantic_overlap(m_emb, q_emb),
398                _ => keyword_overlap(&msg.content, &query_words),
399            }
400        } else {
401            0.0
402        };
403        let plan = if plan_active {
404            plan_relevance(&msg.content, planned_tools)
405        } else {
406            0.0
407        };
408
409        let raw = config.w_temporal * temporal
410            + config.w_importance * importance
411            + if semantic_active {
412                config.w_semantic * semantic
413            } else {
414                0.0
415            }
416            + if plan_active {
417                config.w_plan * plan
418            } else {
419                0.0
420            };
421
422        let score = (raw / weight_sum).clamp(0.0, 1.0);
423        let candidate_level = score_to_level(score, config);
424
425        // Floor invariant (CAM Phase 2-B): a persisted fidelity tag constrains upgrades.
426        // A message previously scored as Compressed cannot be upgraded back to Full;
427        // Placeholder cannot be upgraded to Full or Compressed.
428        // The regrade path passes allow_upgrade=true to bypass this constraint.
429        let level = if allow_upgrade {
430            candidate_level
431        } else {
432            match msg.metadata.fidelity_tag {
433                Some(ContextFidelity::Placeholder) => ContextFidelity::Placeholder,
434                Some(ContextFidelity::Compressed) => {
435                    if candidate_level == ContextFidelity::Full {
436                        ContextFidelity::Compressed
437                    } else {
438                        candidate_level
439                    }
440                }
441                _ => candidate_level,
442            }
443        };
444
445        scores[i] = Some(FidelityScore {
446            score,
447            level,
448            original_tokens,
449        });
450    }
451
452    apply_tool_pair_atomicity(messages, &mut scores, config);
453    scores
454}
455
456async fn apply_scores(
457    messages: &mut [Message],
458    scores: &[Option<FidelityScore>],
459    config: &FidelityConfig,
460    tc: &dyn TokenCounting,
461    provider: Option<&dyn LlmProviderDyn>,
462) {
463    let _apply_span = info_span!("context.fidelity.apply").entered();
464    let (mut full_count, mut compressed_count, mut placeholder_count, mut tokens_saved) =
465        (0u32, 0u32, 0u32, 0u32);
466
467    for (i, msg) in messages.iter_mut().enumerate() {
468        let Some(ref fs) = scores[i] else { continue };
469        match fs.level {
470            ContextFidelity::Compressed => {
471                #[allow(clippy::cast_possible_truncation)]
472                let original_tokens = fs.original_tokens;
473                render_compressed(msg, config, tc, provider).await;
474                #[allow(clippy::cast_possible_truncation)]
475                let new_tokens = tc.count_tokens(&msg.content) as u32;
476                tokens_saved += original_tokens.saturating_sub(new_tokens);
477                compressed_count += 1;
478            }
479            ContextFidelity::Placeholder => {
480                render_placeholder(msg, fs.score, fs.original_tokens);
481                placeholder_count += 1;
482            }
483            // Full and any future variants keep original content.
484            _ => {
485                msg.metadata.fidelity_tag = Some(ContextFidelity::Full);
486                full_count += 1;
487            }
488        }
489    }
490
491    tracing::debug!(
492        full_count,
493        compressed_count,
494        placeholder_count,
495        tokens_saved,
496        "fidelity apply complete"
497    );
498}
499
500fn is_exempt(msg: &Message, idx: usize, inserted_count: usize) -> bool {
501    // INV-07: system prompt at index 0.
502    // INV-08: focus_pinned messages.
503    // INV-09: correction messages.
504    // INV-10: freshly injected memory context at indices 1..1+inserted_count.
505    (idx == 0 && msg.role == Role::System)
506        || msg.metadata.focus_pinned
507        || msg.content.starts_with(CORRECTIONS_PREFIX)
508        || (idx >= 1 && idx < 1 + inserted_count)
509}
510
511fn role_weight(role: Role) -> f32 {
512    match role {
513        Role::System => 1.0,
514        Role::User => 0.8,
515        Role::Assistant => 0.6,
516    }
517}
518
/// Cosine similarity of `a` and `b`, clamped to `[0, 1]`.
///
/// Returns `0.0` when the vectors differ in length, are empty, or either has zero norm.
/// Negative similarities are clamped to `0.0`.
///
/// A non-finite quotient also yields `0.0`. This happens when a component is NaN or
/// infinite, or when the norm product underflows or overflows. `f32::clamp` passes NaN
/// through unchanged, so without this guard the NaN would reach the fidelity score, where
/// every threshold comparison is false and the message would be demoted to `Placeholder`.
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() || a.is_empty() {
        return 0.0;
    }

    // Single pass; each accumulator sees its terms in index order.
    let (dot, sq_a, sq_b) = a.iter().zip(b).fold(
        (0.0f32, 0.0f32, 0.0f32),
        |(dot, sq_a, sq_b), (&x, &y)| (dot + x * y, sq_a + x * x, sq_b + y * y),
    );

    let norm_a = sq_a.sqrt();
    let norm_b = sq_b.sqrt();
    if norm_a == 0.0 || norm_b == 0.0 {
        return 0.0;
    }

    let cosine = dot / (norm_a * norm_b);
    if !cosine.is_finite() {
        return 0.0;
    }
    cosine.clamp(0.0, 1.0)
}
531
532fn semantic_overlap(msg_embedding: &[f32], query_embedding: &[f32]) -> f32 {
533    cosine_similarity(msg_embedding, query_embedding)
534}
535
/// Word-level overlap between `content` and the query, in `[0, 1]`.
///
/// Both sides are compared as sets of whitespace-separated words (exact, case-sensitive
/// matches). The number of shared words is divided by the size of the smaller set; the
/// result is `0.0` when either set is empty.
///
/// `query_words` is pre-built by the caller, outside the per-message loop (SF-1).
fn keyword_overlap(content: &str, query_words: &std::collections::HashSet<&str>) -> f32 {
    let content_words: std::collections::HashSet<&str> = content.split_whitespace().collect();

    let denominator = content_words.len().min(query_words.len());
    if denominator == 0 {
        return 0.0;
    }

    let shared = content_words
        .iter()
        .filter(|word| query_words.contains(*word))
        .count();

    #[allow(clippy::cast_precision_loss)]
    let ratio = shared as f32 / denominator as f32;
    ratio.clamp(0.0, 1.0)
}
549
550/// Keyword overlap between message content and planned tool keywords.
551///
552/// Weighted by `1.0 / distance_from_current` and averaged across all hints.
553fn plan_relevance(content: &str, planned_tools: &[PlannedToolHint]) -> f32 {
554    if planned_tools.is_empty() {
555        return 0.0;
556    }
557    let content_words: std::collections::HashSet<&str> = content.split_whitespace().collect();
558    let mut weighted_sum = 0.0f32;
559    let mut weight_total = 0.0f32;
560    for hint in planned_tools {
561        let dist = f32::from(hint.distance_from_current.max(1));
562        let weight = 1.0 / dist;
563        weight_total += weight;
564        let hint_words: std::collections::HashSet<&str> =
565            hint.keywords.iter().map(String::as_str).collect();
566        let min_len = content_words.len().min(hint_words.len());
567        if min_len == 0 {
568            continue;
569        }
570        #[allow(clippy::cast_precision_loss)]
571        let overlap = content_words.intersection(&hint_words).count() as f32 / min_len as f32;
572        weighted_sum += weight * overlap.clamp(0.0, 1.0);
573    }
574    if weight_total <= 0.0 {
575        return 0.0;
576    }
577    (weighted_sum / weight_total).clamp(0.0, 1.0)
578}
579
580/// O(N) backward scan: find `ToolUse`/`ToolResult` pairs and assign `min(score_a, score_b)`.
581///
582/// The "min" is computed over both the float score and the already-floored level, so
583/// the floor invariant set in `compute_scores` is respected (INV-03 + floor invariant).
584fn apply_tool_pair_atomicity(
585    messages: &[Message],
586    scores: &mut [Option<FidelityScore>],
587    config: &FidelityConfig,
588) {
589    // Collect (tool_use_id, message_index) for ToolResult messages.
590    let mut tool_result_map: std::collections::HashMap<&str, usize> =
591        std::collections::HashMap::new();
592    for (i, msg) in messages.iter().enumerate() {
593        for part in &msg.parts {
594            if let MessagePart::ToolResult { tool_use_id, .. } = part {
595                tool_result_map.insert(tool_use_id.as_str(), i);
596            }
597        }
598    }
599
600    // Walk backward to find ToolUse messages and pair with their result.
601    for (i, msg) in messages.iter().enumerate().rev() {
602        for part in &msg.parts {
603            if let MessagePart::ToolUse { id, .. } = part
604                && let Some(&result_idx) = tool_result_map.get(id.as_str())
605            {
606                let score_a = scores[i].as_ref().map_or(1.0, |s| s.score);
607                let score_b = scores[result_idx].as_ref().map_or(1.0, |s| s.score);
608                let min_score = score_a.min(score_b);
609
610                // The level is the more restrictive of the two already-floored levels.
611                // Taking min(float) alone would bypass the floor invariant for the pair
612                // because score_to_level(min_score) ignores persisted fidelity tags.
613                let level_a = scores[i]
614                    .as_ref()
615                    .map_or(ContextFidelity::Full, |s| s.level);
616                let level_b = scores[result_idx]
617                    .as_ref()
618                    .map_or(ContextFidelity::Full, |s| s.level);
619                let float_level = score_to_level(min_score, config);
620                let min_level = more_restrictive(more_restrictive(level_a, level_b), float_level);
621
622                let tokens_a = scores[i].as_ref().map_or(0, |s| s.original_tokens);
623                let tokens_b = scores[result_idx].as_ref().map_or(0, |s| s.original_tokens);
624                scores[i] = Some(FidelityScore {
625                    score: min_score,
626                    level: min_level,
627                    original_tokens: tokens_a,
628                });
629                scores[result_idx] = Some(FidelityScore {
630                    score: min_score,
631                    level: min_level,
632                    original_tokens: tokens_b,
633                });
634            }
635        }
636    }
637}
638
639/// Return the more restrictive of two fidelity levels.
640///
641/// Restrictiveness order: Placeholder > Compressed > Full.
642fn more_restrictive(a: ContextFidelity, b: ContextFidelity) -> ContextFidelity {
643    use ContextFidelity::{Compressed, Full, Placeholder};
644    match (a, b) {
645        (Placeholder, _) | (_, Placeholder) => Placeholder,
646        (Compressed, _) | (_, Compressed) => Compressed,
647        _ => Full,
648    }
649}
650
651fn score_to_level(score: f32, config: &FidelityConfig) -> ContextFidelity {
652    if score >= config.full_threshold {
653        ContextFidelity::Full
654    } else if score >= config.compressed_threshold {
655        ContextFidelity::Compressed
656    } else {
657        ContextFidelity::Placeholder
658    }
659}
660
661async fn render_compressed(
662    msg: &mut Message,
663    config: &FidelityConfig,
664    tc: &dyn TokenCounting,
665    provider: Option<&dyn LlmProviderDyn>,
666) {
667    // Priority 1: use pre-computed deferred summary when available.
668    if let Some(summary) = msg.metadata.deferred_summary.take() {
669        msg.content = summary;
670    } else if config.compress_provider.is_some()
671        && let Some(p) = provider
672    {
673        // Priority 2: LLM-assisted compression when provider and config name are set.
674        let input_tokens = tc.count_tokens(&msg.content);
675        // Guard: skip LLM if input is already small or within 2x of the max budget.
676        if input_tokens > config.compressed_max_tokens * 2 && input_tokens > 0 {
677            // Apply input cap before sending to LLM.
678            if let Some(max_in) = config.max_compress_input_tokens {
679                apply_input_cap(&mut msg.content, max_in);
680            }
681
682            let prompt = format!(
683                "Summarize in {} tokens or fewer: {}",
684                config.compressed_max_tokens, msg.content
685            );
686            let req = vec![Message {
687                role: Role::User,
688                content: prompt,
689                parts: vec![],
690                metadata: MessageMetadata::default(),
691            }];
692
693            let span = info_span!(
694                "context.fidelity.compress_llm",
695                input_tokens,
696                cached = false,
697            );
698            let result = {
699                let _enter = span.enter();
700                tokio::time::timeout(Duration::from_secs(30), p.chat(&req)).await
701            };
702
703            match result {
704                Ok(Ok(summary)) => {
705                    msg.metadata.deferred_summary = Some(summary.clone());
706                    msg.content = summary;
707                }
708                Ok(Err(e)) => {
709                    tracing::debug!(error = %e, "compress_llm failed, falling back to truncation");
710                }
711                Err(_) => {
712                    tracing::warn!("compress_llm timed out, falling back to truncation");
713                }
714            }
715        }
716    } else if let Some(max_in) = config.max_compress_input_tokens {
717        // Cap input before truncation so pathologically large messages don't blow the budget.
718        apply_input_cap(&mut msg.content, max_in);
719    }
720
721    // Always cap output to compressed_max_tokens — covers both the deferred-summary path
722    // (LLM output may exceed the limit) and the truncation-only path.
723    truncate_to_tokens(&mut msg.content, config.compressed_max_tokens, tc);
724    msg.parts.clear();
725    msg.metadata.fidelity_tag = Some(ContextFidelity::Compressed);
726}
727
728/// Truncate `content` in place using the `max_tokens * 4` byte approximation.
729///
730/// Truncates at a valid UTF-8 char boundary via [`str::floor_char_boundary`]. Uses
731/// [`saturating_mul`](usize::saturating_mul) to avoid overflow on extreme token counts.
732///
733/// Pass `config.max_compress_input_tokens` as `max_tokens` before an LLM compress call,
734/// or `config.max_embed_input_tokens` before an embed call.
735pub fn apply_input_cap(content: &mut String, max_tokens: usize) {
736    let max_bytes = max_tokens.saturating_mul(4);
737    if content.len() > max_bytes {
738        let boundary = content.floor_char_boundary(max_bytes);
739        content.truncate(boundary);
740    }
741}
742
743fn truncate_to_tokens(content: &mut String, max_tokens: usize, tc: &dyn TokenCounting) {
744    if tc.count_tokens(content) <= max_tokens {
745        return;
746    }
747    // Binary search for the largest valid prefix that fits within max_tokens.
748    // lo: largest known-safe boundary (count <= max_tokens).
749    // hi: upper bound; count > max_tokens on the normal branch; on the stall branch
750    //     hi collapses to lo and the loop exits immediately.
751    let mut lo: usize = 0;
752    let mut hi: usize = content.len();
753    while hi - lo > 1 {
754        let mid = content.floor_char_boundary(usize::midpoint(lo, hi));
755        if mid == lo {
756            // floor_char_boundary landed back on lo (multibyte char spans the midpoint).
757            // Collapsing hi to lo satisfies hi-lo <= 1 and exits the loop.
758            hi = mid;
759        } else if tc.count_tokens(&content[..mid]) <= max_tokens {
760            lo = mid;
761        } else {
762            hi = mid;
763        }
764    }
765    content.truncate(lo);
766}
767
768fn render_placeholder(msg: &mut Message, score: f32, original_tokens: u32) {
769    let role_str = match msg.role {
770        Role::System => "system",
771        Role::User => "user",
772        Role::Assistant => "assistant",
773    };
774    msg.content = format!(
775        "[placeholder: role={role_str}, original_tokens={original_tokens}, importance={score:.2}]"
776    );
777    msg.parts.clear();
778    msg.metadata.fidelity_tag = Some(ContextFidelity::Placeholder);
779}
780
781/// Merge consecutive same-role `Placeholder` messages into a single merged placeholder.
782///
783/// Returns the number of individual messages consumed by merges.
784fn merge_consecutive_placeholders(messages: &mut Vec<Message>) -> usize {
785    let mut merged_count = 0usize;
786    let mut i = 0;
787    while i < messages.len() {
788        if messages[i].metadata.fidelity_tag != Some(ContextFidelity::Placeholder)
789            || messages[i].role == Role::System
790        {
791            i += 1;
792            continue;
793        }
794        let role = messages[i].role;
795        let mut j = i + 1;
796        while j < messages.len()
797            && messages[j].metadata.fidelity_tag == Some(ContextFidelity::Placeholder)
798            && messages[j].role == role
799        {
800            j += 1;
801        }
802        if j - i <= 1 {
803            i += 1;
804            continue;
805        }
806        let count = j - i;
807        let mut total_tokens = 0u32;
808        let mut importance_sum = 0.0f32;
809        for msg in &messages[i..j] {
810            total_tokens += parse_placeholder_tokens(&msg.content);
811            importance_sum += parse_placeholder_importance(&msg.content);
812        }
813        debug_assert!(count >= 2, "placeholder merge triggered with count={count}");
814        #[allow(clippy::cast_precision_loss)]
815        let avg_importance = if count > 0 {
816            importance_sum / count as f32
817        } else {
818            0.0
819        };
820        let role_str = match role {
821            Role::System => "system",
822            Role::User => "user",
823            Role::Assistant => "assistant",
824        };
825        let merged_content = format!(
826            "[placeholder: {count} messages, role={role_str}, total_tokens={total_tokens}, avg_importance={avg_importance:.2}]"
827        );
828        let first = messages[i].clone();
829        messages.drain(i..j);
830        messages.insert(
831            i,
832            Message {
833                role: first.role,
834                content: merged_content,
835                parts: vec![],
836                metadata: {
837                    let mut m = first.metadata;
838                    m.fidelity_tag = Some(ContextFidelity::Placeholder);
839                    m
840                },
841            },
842        );
843        merged_count += count - 1;
844        i += 1;
845    }
846    merged_count
847}
848
/// Extract the token count from a placeholder string.
///
/// The text is split on commas; each trimmed field is checked for an
/// `original_tokens=` or `total_tokens=` prefix, and the remainder (minus any trailing
/// `]` and surrounding whitespace) is parsed as a `u32`. The first field that parses
/// successfully wins. Returns `0` when no such field is found.
fn parse_placeholder_tokens(content: &str) -> u32 {
    const KEYS: [&str; 2] = ["original_tokens=", "total_tokens="];
    content
        .split(',')
        .map(str::trim)
        .find_map(|field| {
            KEYS.iter().find_map(|key| {
                let raw = field.strip_prefix(*key)?;
                raw.trim_end_matches(']').trim().parse::<u32>().ok()
            })
        })
        .unwrap_or(0)
}
862
/// Extract the importance value from a placeholder string.
///
/// The text is split on commas; each trimmed field is checked for an `importance=` or
/// `avg_importance=` prefix, and the remainder (minus any trailing `]` and surrounding
/// whitespace) is parsed as an `f32`. The first field that parses successfully wins.
/// Returns `0.0` when no such field is found.
fn parse_placeholder_importance(content: &str) -> f32 {
    const KEYS: [&str; 2] = ["importance=", "avg_importance="];
    content
        .split(',')
        .map(str::trim)
        .find_map(|field| {
            KEYS.iter().find_map(|key| {
                let raw = field.strip_prefix(*key)?;
                raw.trim_end_matches(']').trim().parse::<f32>().ok()
            })
        })
        .unwrap_or(0.0)
}
876
877#[cfg(test)]
878mod tests {
879    use super::*;
880    use zeph_llm::provider::{Message, MessageMetadata, MessagePart, Role};
881
882    struct FixedTc(usize);
883    impl TokenCounting for FixedTc {
884        fn count_tokens(&self, text: &str) -> usize {
885            text.len() / self.0.max(1)
886        }
887
888        fn count_tool_schema_tokens(&self, _schema: &serde_json::Value) -> usize {
889            0
890        }
891    }
892
893    fn make_msg(role: Role, content: &str) -> Message {
894        Message {
895            role,
896            content: content.to_string(),
897            parts: vec![],
898            metadata: MessageMetadata::default(),
899        }
900    }
901
902    fn make_cfg() -> FidelityConfig {
903        FidelityConfig {
904            enabled: true,
905            w_semantic: 0.3,
906            w_temporal: 0.3,
907            w_importance: 0.2,
908            w_plan: 0.2,
909            full_threshold: 0.7,
910            compressed_threshold: 0.3,
911            compressed_max_tokens: 50,
912            regrade_threshold: 0.6,
913            min_query_length: 8,
914            max_scored_messages: 500,
915            exempt_tail_messages: 0,
916            compress_provider: None,
917            semantic_scoring_provider: None,
918            lookahead_depth: 3,
919            embed_concurrency: 32,
920            max_embed_input_tokens: None,
921            max_compress_input_tokens: None,
922        }
923    }
924
925    // 1. Empty window → no change.
926    #[tokio::test]
927    async fn empty_window_no_change() {
928        let scorer = FidelityScorer;
929        let cfg = make_cfg();
930        let tc = FixedTc(4);
931        let mut messages: Vec<Message> = vec![];
932        scorer
933            .score_and_apply(
934                &mut messages,
935                "query text",
936                &[],
937                &cfg,
938                &tc,
939                0,
940                false,
941                None,
942                None,
943            )
944            .await;
945        assert!(messages.is_empty());
946    }
947
948    // 2. All-exempt window → no downgrade.
949    #[tokio::test]
950    async fn all_exempt_no_downgrade() {
951        let scorer = FidelityScorer;
952        let cfg = make_cfg();
953        let tc = FixedTc(4);
954        let mut messages = vec![
955            make_msg(Role::System, "system prompt"),
956            // Injected memory at index 1 with inserted_count=1.
957            make_msg(Role::User, "memory context"),
958        ];
959        scorer
960            .score_and_apply(&mut messages, "short", &[], &cfg, &tc, 1, false, None, None)
961            .await;
962        for msg in &messages {
963            assert!(
964                msg.metadata.fidelity_tag.is_none()
965                    || msg.metadata.fidelity_tag == Some(ContextFidelity::Full)
966            );
967        }
968    }
969
970    // 3. Tool pair atomicity: divergent scores → min applied.
971    #[tokio::test]
972    async fn tool_pair_atomicity() {
973        let scorer = FidelityScorer;
974        // Very high thresholds to force Placeholder for older messages.
975        let cfg = FidelityConfig {
976            full_threshold: 0.9,
977            compressed_threshold: 0.5,
978            ..make_cfg()
979        };
980        let tc = FixedTc(4);
981        let tool_use_id = "abc123".to_string();
982        let mut tool_use_msg = make_msg(Role::Assistant, "calling tool");
983        tool_use_msg.parts = vec![MessagePart::ToolUse {
984            id: tool_use_id.clone(),
985            name: "shell".to_string(),
986            input: serde_json::json!({}),
987        }];
988        let mut tool_result_msg = make_msg(Role::User, "tool result body");
989        tool_result_msg.parts = vec![MessagePart::ToolResult {
990            tool_use_id: tool_use_id.clone(),
991            content: "result".to_string(),
992            is_error: false,
993        }];
994        let mut messages = vec![
995            make_msg(Role::System, "system"),
996            tool_use_msg,
997            tool_result_msg,
998        ];
999        scorer
1000            .score_and_apply(
1001                &mut messages,
1002                "completely unrelated query blah",
1003                &[],
1004                &cfg,
1005                &tc,
1006                0,
1007                false,
1008                None,
1009                None,
1010            )
1011            .await;
1012        let tag_a = messages[1].metadata.fidelity_tag;
1013        let tag_b = messages[2].metadata.fidelity_tag;
1014        assert_eq!(tag_a, tag_b, "tool pair must share fidelity level");
1015    }
1016
1017    // 4. Same-role Placeholder merge: 5 consecutive assistant → merged to 1.
1018    #[tokio::test]
1019    async fn same_role_placeholder_merge() {
1020        let scorer = FidelityScorer;
1021        // Force all non-system messages to become Placeholder.
1022        let cfg = FidelityConfig {
1023            full_threshold: 2.0,       // impossible to reach
1024            compressed_threshold: 1.5, // impossible to reach
1025            ..make_cfg()
1026        };
1027        let tc = FixedTc(4);
1028        let mut messages: Vec<Message> = std::iter::once(make_msg(Role::System, "system"))
1029            .chain((0..5).map(|i| make_msg(Role::Assistant, &format!("msg {i}"))))
1030            .collect();
1031        scorer
1032            .score_and_apply(
1033                &mut messages,
1034                "some query here",
1035                &[],
1036                &cfg,
1037                &tc,
1038                0,
1039                false,
1040                None,
1041                None,
1042            )
1043            .await;
1044        // System + 1 merged placeholder.
1045        assert_eq!(
1046            messages.len(),
1047            2,
1048            "5 assistant placeholders must merge to 1"
1049        );
1050        assert!(messages[1].content.contains("5 messages"));
1051    }
1052
1053    // 5. Score normalization: active signal subset still produces [0,1].
1054    #[tokio::test]
1055    async fn score_normalization_no_panic() {
1056        let scorer = FidelityScorer;
1057        let cfg = make_cfg();
1058        let tc = FixedTc(4);
1059        let mut messages = vec![
1060            make_msg(Role::System, "system"),
1061            make_msg(Role::User, "hello"),
1062            make_msg(Role::Assistant, "world response"),
1063        ];
1064        scorer
1065            .score_and_apply(
1066                &mut messages,
1067                "hello world signal",
1068                &[],
1069                &cfg,
1070                &tc,
1071                0,
1072                false,
1073                None,
1074                None,
1075            )
1076            .await;
1077        for msg in &messages {
1078            let _ = msg.metadata.fidelity_tag;
1079        }
1080    }
1081
1082    // 6. Short query fallback: query.len() < 8 → semantic signal excluded.
1083    #[tokio::test]
1084    async fn short_query_fallback() {
1085        let scorer = FidelityScorer;
1086        let cfg = FidelityConfig {
1087            min_query_length: 8,
1088            ..make_cfg()
1089        };
1090        let tc = FixedTc(4);
1091        let mut messages = vec![
1092            make_msg(Role::System, "system"),
1093            make_msg(Role::User, "test"),
1094        ];
1095        // Must not panic or produce out-of-range scores.
1096        scorer
1097            .score_and_apply(&mut messages, "short", &[], &cfg, &tc, 0, false, None, None)
1098            .await;
1099    }
1100
1101    // 7. AC-09: memory_first bypass is the caller's responsibility.
1102    //    When enabled=false, score_and_apply is always a no-op — callers that activate
1103    //    memory_first simply skip the call (see service.rs guard at INV-11).
1104    //    This test documents the contract: the scorer itself is stateless and harmless
1105    //    when called with disabled config or an all-exempt window.
1106    #[tokio::test]
1107    async fn memory_first_bypass_is_callers_responsibility() {
1108        let scorer = FidelityScorer;
1109        // Simulate: caller would skip this call when memory_first=true.
1110        // The scorer itself must be a complete no-op when enabled=false.
1111        let cfg = FidelityConfig {
1112            enabled: false,
1113            ..make_cfg()
1114        };
1115        let tc = FixedTc(4);
1116        let mut messages = vec![
1117            make_msg(Role::System, "system prompt"),
1118            make_msg(Role::User, "memory-injected context"),
1119            make_msg(Role::Assistant, "response"),
1120        ];
1121        let before: Vec<_> = messages.iter().map(|m| m.content.clone()).collect();
1122        // Even with a real query, disabled scorer must not touch any message.
1123        scorer
1124            .score_and_apply(
1125                &mut messages,
1126                "some user query text here",
1127                &[],
1128                &cfg,
1129                &tc,
1130                2,
1131                false,
1132                None,
1133                None,
1134            )
1135            .await;
1136        for (msg, orig) in messages.iter().zip(&before) {
1137            assert_eq!(msg.content, *orig, "content must be unchanged");
1138            assert!(
1139                msg.metadata.fidelity_tag.is_none(),
1140                "no fidelity tag must be set"
1141            );
1142        }
1143    }
1144
1145    // 9. enabled=false guard: no changes applied.
1146    #[tokio::test]
1147    async fn enabled_false_guard() {
1148        let scorer = FidelityScorer;
1149        let cfg = FidelityConfig {
1150            enabled: false,
1151            ..make_cfg()
1152        };
1153        let tc = FixedTc(4);
1154        let mut messages = vec![
1155            make_msg(Role::System, "system"),
1156            make_msg(Role::User, "user message that would normally be scored"),
1157        ];
1158        let original_contents: Vec<String> = messages.iter().map(|m| m.content.clone()).collect();
1159        scorer
1160            .score_and_apply(
1161                &mut messages,
1162                "query text here",
1163                &[],
1164                &cfg,
1165                &tc,
1166                0,
1167                false,
1168                None,
1169                None,
1170            )
1171            .await;
1172        for (msg, orig) in messages.iter().zip(&original_contents) {
1173            assert_eq!(msg.content, *orig);
1174            assert!(msg.metadata.fidelity_tag.is_none());
1175        }
1176    }
1177
1178    // 10. Score always in [0.0, 1.0] for extreme inputs (zero weights).
1179    #[tokio::test]
1180    async fn score_always_in_range() {
1181        let scorer = FidelityScorer;
1182        let cfg = FidelityConfig {
1183            enabled: true,
1184            w_semantic: 0.0,
1185            w_temporal: 0.0,
1186            w_importance: 0.0,
1187            w_plan: 0.0,
1188            full_threshold: 0.7,
1189            compressed_threshold: 0.3,
1190            compressed_max_tokens: 50,
1191            regrade_threshold: 0.6,
1192            min_query_length: 0,
1193            max_scored_messages: 500,
1194            exempt_tail_messages: 0,
1195            compress_provider: None,
1196            semantic_scoring_provider: None,
1197            lookahead_depth: 3,
1198            embed_concurrency: 32,
1199            max_embed_input_tokens: None,
1200            max_compress_input_tokens: None,
1201        };
1202        let tc = FixedTc(4);
1203        let mut messages = vec![make_msg(Role::System, ""), make_msg(Role::User, "")];
1204        // Must not panic with zero weights.
1205        scorer
1206            .score_and_apply(&mut messages, "", &[], &cfg, &tc, 0, false, None, None)
1207            .await;
1208    }
1209
1210    // 11. Token count uses tc.count_tokens for Placeholder rendering.
1211    #[tokio::test]
1212    async fn placeholder_uses_tc_count_tokens() {
1213        let scorer = FidelityScorer;
1214        let cfg = FidelityConfig {
1215            full_threshold: 2.0,
1216            compressed_threshold: 1.5,
1217            ..make_cfg()
1218        };
1219        let tc = FixedTc(1); // every character = 1 token
1220        let mut messages = vec![
1221            make_msg(Role::System, "system"),
1222            make_msg(Role::User, "user message content for placeholder rendering"),
1223        ];
1224        scorer
1225            .score_and_apply(
1226                &mut messages,
1227                "some query text here",
1228                &[],
1229                &cfg,
1230                &tc,
1231                0,
1232                false,
1233                None,
1234                None,
1235            )
1236            .await;
1237        assert_eq!(
1238            messages[1].metadata.fidelity_tag,
1239            Some(ContextFidelity::Placeholder)
1240        );
1241        assert!(messages[1].content.starts_with("[placeholder:"));
1242    }
1243
1244    // 13. #4593: exempt_tail_messages respected when n > max_scored_messages.
1245    //     Verify that tail messages (beyond score_end) keep no fidelity_tag.
1246    //     Use focus_pinned=true on tail messages so they stay exempt-from-scoring
1247    //     via is_exempt() and retain no tag regardless of the merge pass.
1248    //     n=20, max_scored_messages=10, exempt_tail_messages=5 → score_end=15.
1249    //     Indices [15..19] are in the exempt tail.
1250    #[tokio::test]
1251    async fn exempt_tail_messages_large_window() {
1252        let scorer = FidelityScorer;
1253        let cfg = FidelityConfig {
1254            // Force all scored messages to Placeholder so we can see which ones got tagged.
1255            full_threshold: 2.0,
1256            compressed_threshold: 1.5,
1257            max_scored_messages: 10,
1258            exempt_tail_messages: 5,
1259            ..make_cfg()
1260        };
1261        let tc = FixedTc(4);
1262
1263        // Index 0: system (always exempt).
1264        // Indices 1..14: regular user messages that fall in the scored region [0..15).
1265        // Indices 15..19: tail messages — mark them focus_pinned so they don't get scored.
1266        //   focus_pinned is the is_exempt() gate (INV-08), which makes them opaque to
1267        //   the scorer. This lets us assert their fidelity_tag stays None while the
1268        //   merge pass leaves them intact (it only merges Placeholder-tagged messages).
1269        let mut messages: Vec<Message> = std::iter::once(make_msg(Role::System, "system prompt"))
1270            .chain((1..15).map(|i| make_msg(Role::Assistant, &format!("assistant message {i}"))))
1271            .chain((15..20).map(|i| {
1272                let mut m = make_msg(Role::User, &format!("tail message {i}"));
1273                m.metadata.focus_pinned = true;
1274                m
1275            }))
1276            .collect();
1277
1278        scorer
1279            .score_and_apply(
1280                &mut messages,
1281                "query text here long",
1282                &[],
1283                &cfg,
1284                &tc,
1285                0,
1286                false,
1287                None,
1288                None,
1289            )
1290            .await;
1291
1292        // Tail messages (focus_pinned) must have no fidelity_tag.
1293        let tail: Vec<_> = messages
1294            .iter()
1295            .filter(|m| m.metadata.focus_pinned)
1296            .collect();
1297        assert_eq!(
1298            tail.len(),
1299            5,
1300            "all 5 tail messages must survive the merge pass"
1301        );
1302        for msg in &tail {
1303            assert!(
1304                msg.metadata.fidelity_tag.is_none(),
1305                "tail message must have no fidelity_tag, got {:?}",
1306                msg.metadata.fidelity_tag
1307            );
1308        }
1309    }
1310
1311    // 14. #4593: when n <= max_scored_messages, exempt_tail_messages has no effect.
1312    //     n=8, max_scored_messages=10, exempt_tail_messages=5 → score_end=8 (all scored).
1313    //     Use alternating roles to avoid placeholder merging.
1314    #[tokio::test]
1315    async fn exempt_tail_messages_small_window_no_effect() {
1316        let scorer = FidelityScorer;
1317        let cfg = FidelityConfig {
1318            full_threshold: 2.0,
1319            compressed_threshold: 1.5,
1320            max_scored_messages: 10,
1321            exempt_tail_messages: 5,
1322            ..make_cfg()
1323        };
1324        let tc = FixedTc(4);
1325        // 8 messages: index 0 = system, then alternating user/assistant.
1326        // Alternating roles prevent placeholder merge, keeping the length stable.
1327        let roles = [Role::User, Role::Assistant];
1328        let mut messages: Vec<Message> = std::iter::once(make_msg(Role::System, "system prompt"))
1329            .chain((1..8usize).map(|i| make_msg(roles[i % 2], &format!("message {i}"))))
1330            .collect();
1331        scorer
1332            .score_and_apply(
1333                &mut messages,
1334                "query text here long",
1335                &[],
1336                &cfg,
1337                &tc,
1338                0,
1339                false,
1340                None,
1341                None,
1342            )
1343            .await;
1344        // score_end = 8 (n=8 <= max_scored_messages=10, so exempt_tail not applied).
1345        // All non-system messages must be scored (tagged).
1346        let untagged_count = messages[1..]
1347            .iter()
1348            .filter(|m| m.metadata.fidelity_tag.is_none())
1349            .count();
1350        assert_eq!(
1351            untagged_count, 0,
1352            "all non-system messages must be scored when n <= max_scored_messages"
1353        );
1354    }
1355
1356    // 12. Compressed rendering uses deferred_summary when available.
1357    #[tokio::test]
1358    async fn compressed_uses_deferred_summary() {
1359        let scorer = FidelityScorer;
1360        let cfg = FidelityConfig {
1361            full_threshold: 2.0,       // nothing reaches Full
1362            compressed_threshold: 0.0, // everything at or above 0 → Compressed
1363            compressed_max_tokens: 5,
1364            ..make_cfg()
1365        };
1366        let tc = FixedTc(4);
1367        let mut msg_with_summary =
1368            make_msg(Role::User, "original long content that would be truncated");
1369        msg_with_summary.metadata.deferred_summary = Some("short summary".to_string());
1370        let mut messages = vec![make_msg(Role::System, "system"), msg_with_summary];
1371        scorer
1372            .score_and_apply(
1373                &mut messages,
1374                "query text here long",
1375                &[],
1376                &cfg,
1377                &tc,
1378                0,
1379                false,
1380                None,
1381                None,
1382            )
1383            .await;
1384        assert_eq!(
1385            messages[1].metadata.fidelity_tag,
1386            Some(ContextFidelity::Compressed)
1387        );
1388        assert_eq!(messages[1].content, "short summary");
1389    }
1390
1391    // ── CAM Phase 2-B: floor invariant tests ────────────────────────────────
1392
1393    fn make_msg_with_fidelity(role: Role, content: &str, tag: Option<ContextFidelity>) -> Message {
1394        let mut m = make_msg(role, content);
1395        m.metadata.fidelity_tag = tag;
1396        m
1397    }
1398
1399    // Floor: Compressed cannot upgrade to Full when allow_upgrade=false.
1400    #[tokio::test]
1401    async fn floor_prevents_compressed_upgrade_to_full() {
1402        let scorer = FidelityScorer;
1403        let cfg = FidelityConfig {
1404            // Very low thresholds so every message naturally scores Full.
1405            full_threshold: 0.0,
1406            compressed_threshold: -1.0,
1407            ..make_cfg()
1408        };
1409        let tc = FixedTc(4);
1410        let mut messages = vec![
1411            make_msg(Role::System, "system"),
1412            make_msg_with_fidelity(
1413                Role::User,
1414                "query text here long keyword",
1415                Some(ContextFidelity::Compressed),
1416            ),
1417        ];
1418        scorer
1419            .score_and_apply(
1420                &mut messages,
1421                "query text here long keyword",
1422                &[],
1423                &cfg,
1424                &tc,
1425                0,
1426                false,
1427                None,
1428                None,
1429            )
1430            .await;
1431        assert_eq!(
1432            messages[1].metadata.fidelity_tag,
1433            Some(ContextFidelity::Compressed),
1434            "Compressed floor must block upgrade to Full"
1435        );
1436    }
1437
1438    // Floor: Placeholder cannot upgrade to Full.
1439    #[tokio::test]
1440    async fn floor_prevents_placeholder_upgrade_to_full() {
1441        let scorer = FidelityScorer;
1442        let cfg = FidelityConfig {
1443            full_threshold: 0.0,
1444            compressed_threshold: -1.0,
1445            ..make_cfg()
1446        };
1447        let tc = FixedTc(4);
1448        let mut messages = vec![
1449            make_msg(Role::System, "system"),
1450            make_msg_with_fidelity(
1451                Role::User,
1452                "query text here long keyword",
1453                Some(ContextFidelity::Placeholder),
1454            ),
1455        ];
1456        scorer
1457            .score_and_apply(
1458                &mut messages,
1459                "query text here long keyword",
1460                &[],
1461                &cfg,
1462                &tc,
1463                0,
1464                false,
1465                None,
1466                None,
1467            )
1468            .await;
1469        assert_eq!(
1470            messages[1].metadata.fidelity_tag,
1471            Some(ContextFidelity::Placeholder),
1472            "Placeholder floor must block upgrade to Full"
1473        );
1474    }
1475
1476    // Floor: Placeholder cannot upgrade to Compressed.
1477    #[tokio::test]
1478    async fn floor_prevents_placeholder_upgrade_to_compressed() {
1479        let scorer = FidelityScorer;
1480        // Thresholds: full=2.0 (unreachable), compressed=0.0 (everything Compressed).
1481        let cfg = FidelityConfig {
1482            full_threshold: 2.0,
1483            compressed_threshold: 0.0,
1484            ..make_cfg()
1485        };
1486        let tc = FixedTc(4);
1487        let mut messages = vec![
1488            make_msg(Role::System, "system"),
1489            make_msg_with_fidelity(
1490                Role::User,
1491                "message content",
1492                Some(ContextFidelity::Placeholder),
1493            ),
1494        ];
1495        scorer
1496            .score_and_apply(
1497                &mut messages,
1498                "query text here long",
1499                &[],
1500                &cfg,
1501                &tc,
1502                0,
1503                false,
1504                None,
1505                None,
1506            )
1507            .await;
1508        assert_eq!(
1509            messages[1].metadata.fidelity_tag,
1510            Some(ContextFidelity::Placeholder),
1511            "Placeholder floor must block upgrade to Compressed"
1512        );
1513    }
1514
1515    // Floor: further downgrade (Compressed → Placeholder) is allowed.
1516    #[tokio::test]
1517    async fn floor_allows_further_downgrade() {
1518        let scorer = FidelityScorer;
1519        let cfg = FidelityConfig {
1520            full_threshold: 2.0,
1521            compressed_threshold: 2.0, // everything Placeholder
1522            ..make_cfg()
1523        };
1524        let tc = FixedTc(4);
1525        let mut messages = vec![
1526            make_msg(Role::System, "system"),
1527            make_msg_with_fidelity(
1528                Role::User,
1529                "some content",
1530                Some(ContextFidelity::Compressed),
1531            ),
1532        ];
1533        scorer
1534            .score_and_apply(
1535                &mut messages,
1536                "query text here long",
1537                &[],
1538                &cfg,
1539                &tc,
1540                0,
1541                false,
1542                None,
1543                None,
1544            )
1545            .await;
1546        assert_eq!(
1547            messages[1].metadata.fidelity_tag,
1548            Some(ContextFidelity::Placeholder),
1549            "downgrade from Compressed to Placeholder must be allowed"
1550        );
1551    }
1552
1553    // Floor: None tag → no constraint, normal scoring.
1554    #[tokio::test]
1555    async fn floor_no_constraint_when_none() {
1556        let scorer = FidelityScorer;
1557        // Force every message to Full.
1558        let cfg = FidelityConfig {
1559            full_threshold: 0.0,
1560            compressed_threshold: -1.0,
1561            ..make_cfg()
1562        };
1563        let tc = FixedTc(4);
1564        let mut messages = vec![
1565            make_msg(Role::System, "system"),
1566            make_msg_with_fidelity(Role::User, "query text here long keyword", None),
1567        ];
1568        scorer
1569            .score_and_apply(
1570                &mut messages,
1571                "query text here long keyword",
1572                &[],
1573                &cfg,
1574                &tc,
1575                0,
1576                false,
1577                None,
1578                None,
1579            )
1580            .await;
1581        assert_eq!(
1582            messages[1].metadata.fidelity_tag,
1583            Some(ContextFidelity::Full),
1584            "None tag must not constrain scoring"
1585        );
1586    }
1587
1588    // allow_upgrade=true bypasses the Placeholder floor.
1589    #[tokio::test]
1590    async fn allow_upgrade_bypasses_floor() {
1591        let scorer = FidelityScorer;
1592        let cfg = FidelityConfig {
1593            full_threshold: 0.0,
1594            compressed_threshold: -1.0,
1595            ..make_cfg()
1596        };
1597        let tc = FixedTc(4);
1598        let mut messages = vec![
1599            make_msg(Role::System, "system"),
1600            make_msg_with_fidelity(
1601                Role::User,
1602                "query text here long keyword",
1603                Some(ContextFidelity::Placeholder),
1604            ),
1605        ];
1606        scorer
1607            .score_and_apply(
1608                &mut messages,
1609                "query text here long keyword",
1610                &[],
1611                &cfg,
1612                &tc,
1613                0,
1614                true,
1615                None,
1616                None,
1617            )
1618            .await;
1619        assert_eq!(
1620            messages[1].metadata.fidelity_tag,
1621            Some(ContextFidelity::Full),
1622            "allow_upgrade=true must bypass the Placeholder floor"
1623        );
1624    }
1625
1626    // ── truncate_to_tokens unit tests ─────────────────────────────────────────
1627
1628    // no-op when content is well below limit
1629    #[test]
1630    fn truncate_no_op_below_limit() {
1631        let tc = FixedTc(1); // 1 char = 1 token
1632        let mut s = "hello".to_string(); // 5 tokens
1633        truncate_to_tokens(&mut s, 10, &tc);
1634        assert_eq!(s, "hello");
1635    }
1636
1637    // no-op when content is exactly at limit
1638    #[test]
1639    fn truncate_no_op_at_limit() {
1640        let tc = FixedTc(1);
1641        let mut s = "hello".to_string(); // 5 tokens
1642        truncate_to_tokens(&mut s, 5, &tc);
1643        assert_eq!(s, "hello");
1644    }
1645
1646    // truncates when content is one token over the limit
1647    #[test]
1648    fn truncate_minimal_one_over_limit() {
1649        let tc = FixedTc(1); // 1 char = 1 token
1650        let mut s = "abcdef".to_string(); // 6 tokens, limit=5
1651        truncate_to_tokens(&mut s, 5, &tc);
1652        assert!(
1653            tc.count_tokens(&s) <= 5,
1654            "result must fit in 5 tokens, got {}",
1655            tc.count_tokens(&s)
1656        );
1657        assert!(!s.is_empty(), "must keep prefix, not empty");
1658    }
1659
1660    // binary search preserves more than halving would: at 90% of limit no truncation occurs
1661    #[test]
1662    fn truncate_preserves_90pct_of_limit() {
1663        // FixedTc(1): each byte = 1 token; 90-byte string with limit=100 must not be truncated.
1664        let tc = FixedTc(1);
1665        let s_orig = "a".repeat(90);
1666        let mut s = s_orig.clone();
1667        truncate_to_tokens(&mut s, 100, &tc);
1668        assert_eq!(s, s_orig, "90% of limit must not be truncated");
1669    }
1670
1671    // empty string is a no-op
1672    #[test]
1673    fn truncate_empty_string_no_op() {
1674        let tc = FixedTc(1);
1675        let mut s = String::new();
1676        truncate_to_tokens(&mut s, 5, &tc);
1677        assert!(s.is_empty());
1678    }
1679
1680    // max_tokens=0 truncates everything
1681    #[test]
1682    fn truncate_max_tokens_zero_clears_content() {
1683        let tc = FixedTc(1);
1684        let mut s = "hello world".to_string();
1685        truncate_to_tokens(&mut s, 0, &tc);
1686        assert!(s.is_empty(), "max_tokens=0 must clear content");
1687    }
1688
1689    // multibyte characters: truncation must land on a valid char boundary
1690    #[test]
1691    fn truncate_multibyte_stays_on_char_boundary() {
1692        // "日本語" = 3 chars, each 3 bytes (9 bytes total).
1693        // FixedTc(3): count = byte_len / 3, so "日本語" = 3 tokens.
1694        // limit=2 → must truncate to "日本" (2 chars, 6 bytes).
1695        let tc = FixedTc(3);
1696        let mut s = "日本語".to_string();
1697        truncate_to_tokens(&mut s, 2, &tc);
1698        assert!(
1699            s.is_char_boundary(s.len()),
1700            "result must be on a valid char boundary"
1701        );
1702        assert!(tc.count_tokens(&s) <= 2);
1703        assert_eq!(s, "日本");
1704    }
1705
1706    // Mixed-fidelity tool pair: None + Some(Compressed) → both end up Compressed via atomicity.
1707    #[tokio::test]
1708    async fn mixed_fidelity_tool_pair_floor_plus_atomicity() {
1709        let scorer = FidelityScorer;
1710        // Force everything to Full naturally, so the floor is the only downward pressure.
1711        let cfg = FidelityConfig {
1712            full_threshold: 0.0,
1713            compressed_threshold: -1.0,
1714            ..make_cfg()
1715        };
1716        let tc = FixedTc(4);
1717        let tool_id = "tool-42".to_string();
1718
1719        let mut tool_use_msg = make_msg_with_fidelity(Role::Assistant, "call tool", None);
1720        tool_use_msg.parts = vec![MessagePart::ToolUse {
1721            id: tool_id.clone(),
1722            name: "shell".to_string(),
1723            input: serde_json::json!({}),
1724        }];
1725
1726        let mut tool_result_msg =
1727            make_msg_with_fidelity(Role::User, "result body", Some(ContextFidelity::Compressed));
1728        tool_result_msg.parts = vec![MessagePart::ToolResult {
1729            tool_use_id: tool_id.clone(),
1730            content: "output".to_string(),
1731            is_error: false,
1732        }];
1733
1734        let mut messages = vec![
1735            make_msg(Role::System, "system"),
1736            tool_use_msg,
1737            tool_result_msg,
1738        ];
1739
1740        scorer
1741            .score_and_apply(
1742                &mut messages,
1743                "query text here long",
1744                &[],
1745                &cfg,
1746                &tc,
1747                0,
1748                false,
1749                None,
1750                None,
1751            )
1752            .await;
1753
1754        // After floor clamping: tool_use scores Full (no floor), tool_result scores Full but
1755        // is clamped to Compressed by its floor. Atomicity then takes min(Full, Compressed) =
1756        // Compressed for both.
1757        let tag_use = messages[1].metadata.fidelity_tag;
1758        let tag_result = messages[2].metadata.fidelity_tag;
1759        assert_eq!(
1760            tag_use, tag_result,
1761            "tool pair must share the same fidelity level"
1762        );
1763        assert_eq!(
1764            tag_use,
1765            Some(ContextFidelity::Compressed),
1766            "atomicity must bring the tool-use down to the tool-result floor"
1767        );
1768    }
1769
1770    // 15. LLM path stores deferred_summary and updates content.
1771    #[tokio::test]
1772    async fn compress_llm_path_stores_deferred_summary() {
1773        use zeph_llm::LlmError;
1774        use zeph_llm::provider::ChatStream;
1775
1776        #[derive(Debug)]
1777        struct MockProvider;
1778
1779        impl zeph_llm::provider::LlmProvider for MockProvider {
1780            async fn chat(&self, _messages: &[Message]) -> Result<String, LlmError> {
1781                Ok("summary text".to_string())
1782            }
1783
1784            async fn chat_stream(&self, _messages: &[Message]) -> Result<ChatStream, LlmError> {
1785                Err(LlmError::Unavailable)
1786            }
1787
1788            fn supports_streaming(&self) -> bool {
1789                false
1790            }
1791
1792            async fn embed(&self, _text: &str) -> Result<Vec<f32>, LlmError> {
1793                Err(LlmError::EmbedUnsupported {
1794                    provider: "mock".into(),
1795                })
1796            }
1797
1798            fn supports_embeddings(&self) -> bool {
1799                false
1800            }
1801
1802            fn name(&self) -> &'static str {
1803                "mock"
1804            }
1805        }
1806
1807        let scorer = FidelityScorer;
1808        let cfg = FidelityConfig {
1809            enabled: true,
1810            // Force everything to Compressed.
1811            full_threshold: 2.0,
1812            compressed_threshold: 0.0,
1813            compressed_max_tokens: 5,
1814            compress_provider: Some("mock".to_string()),
1815            ..make_cfg()
1816        };
1817        // Use tc with chars-per-token=1 so a long string has more than 5*2=10 tokens.
1818        let tc = FixedTc(1);
1819        let content = "a".repeat(50); // 50 chars → 50 tokens → well above 5*2=10
1820        let mut messages = vec![
1821            make_msg(Role::System, "system"),
1822            make_msg(Role::User, &content),
1823        ];
1824
1825        let provider = MockProvider;
1826        scorer
1827            .score_and_apply(
1828                &mut messages,
1829                "some query text here",
1830                &[],
1831                &cfg,
1832                &tc,
1833                0,
1834                false,
1835                None,
1836                Some(&provider),
1837            )
1838            .await;
1839
1840        assert_eq!(
1841            messages[1].metadata.fidelity_tag,
1842            Some(ContextFidelity::Compressed),
1843        );
1844        // Content is capped to compressed_max_tokens (5 chars with FixedTc(1)) after LLM call.
1845        assert!(
1846            tc.count_tokens(&messages[1].content) <= 5,
1847            "content must be capped to compressed_max_tokens after LLM summary"
1848        );
1849        // deferred_summary stores the full LLM output for future reuse.
1850        assert_eq!(
1851            messages[1].metadata.deferred_summary,
1852            Some("summary text".to_string()),
1853        );
1854    }
1855
1856    // 16. LLM path skipped when provider is None → truncation used instead.
1857    #[tokio::test]
1858    async fn compress_llm_skipped_when_provider_none() {
1859        let scorer = FidelityScorer;
1860        let cfg = FidelityConfig {
1861            enabled: true,
1862            full_threshold: 2.0,
1863            compressed_threshold: 0.0,
1864            compressed_max_tokens: 5,
1865            compress_provider: Some("mock".to_string()),
1866            ..make_cfg()
1867        };
1868        let tc = FixedTc(1);
1869        let content = "a".repeat(50);
1870        let mut messages = vec![
1871            make_msg(Role::System, "system"),
1872            make_msg(Role::User, &content),
1873        ];
1874
1875        scorer
1876            .score_and_apply(
1877                &mut messages,
1878                "some query text here",
1879                &[],
1880                &cfg,
1881                &tc,
1882                0,
1883                false,
1884                None,
1885                None,
1886            )
1887            .await;
1888
1889        assert_eq!(
1890            messages[1].metadata.fidelity_tag,
1891            Some(ContextFidelity::Compressed),
1892        );
1893        // Truncation path: deferred_summary must NOT be set.
1894        assert!(
1895            messages[1].metadata.deferred_summary.is_none(),
1896            "deferred_summary must not be populated via truncation path"
1897        );
1898        // Content should be truncated to <= 5 chars.
1899        assert!(
1900            messages[1].content.len() <= 5,
1901            "content must be truncated, got len={}",
1902            messages[1].content.len()
1903        );
1904    }
1905
1906    // 17. cosine_similarity unit tests.
1907    #[test]
1908    fn cosine_similarity_identical() {
1909        let v = vec![1.0f32, 0.0, 0.0];
1910        assert!((cosine_similarity(&v, &v) - 1.0).abs() < 1e-6);
1911    }
1912
1913    #[test]
1914    fn cosine_similarity_orthogonal() {
1915        let a = vec![1.0f32, 0.0, 0.0];
1916        let b = vec![0.0f32, 1.0, 0.0];
1917        assert!(cosine_similarity(&a, &b).abs() < 1e-6);
1918    }
1919
1920    #[test]
1921    fn cosine_similarity_zero_vector() {
1922        let a = vec![0.0f32, 0.0, 0.0];
1923        let b = vec![1.0f32, 0.0, 0.0];
1924        assert!(cosine_similarity(&a, &b).abs() < f32::EPSILON);
1925    }
1926
1927    #[test]
1928    fn cosine_similarity_empty() {
1929        assert!(cosine_similarity(&[], &[]).abs() < f32::EPSILON);
1930    }
1931
1932    #[test]
1933    fn cosine_similarity_dimension_mismatch() {
1934        let a = vec![1.0f32, 0.0];
1935        let b = vec![1.0f32, 0.0, 0.0];
1936        assert!(cosine_similarity(&a, &b).abs() < f32::EPSILON);
1937    }
1938
1939    // 18. semantic_scoring_higher_for_similar_messages.
1940    //     Mock provider returns deterministic embeddings:
1941    //       "cat"  → [1.0, 0.0, 0.0]  (similar to query)
1942    //       "feline" → [0.9, 0.1, 0.0]  (similar to query)
1943    //       "stock" → [0.0, 0.0, 1.0]  (orthogonal to query)
1944    //       query "cat mat" → [1.0, 0.0, 0.0]
1945    #[tokio::test]
1946    async fn semantic_scoring_higher_for_similar_messages() {
1947        use zeph_llm::LlmError;
1948        use zeph_llm::provider::ChatStream;
1949
1950        #[derive(Debug)]
1951        struct EmbedMockProvider;
1952
1953        impl zeph_llm::provider::LlmProvider for EmbedMockProvider {
1954            async fn chat(&self, _: &[Message]) -> Result<String, LlmError> {
1955                Err(LlmError::Unavailable)
1956            }
1957            async fn chat_stream(&self, _: &[Message]) -> Result<ChatStream, LlmError> {
1958                Err(LlmError::Unavailable)
1959            }
1960            fn supports_streaming(&self) -> bool {
1961                false
1962            }
1963            async fn embed(&self, text: &str) -> Result<Vec<f32>, LlmError> {
1964                let v = if text.contains("cat")
1965                    || text.contains("mat")
1966                    || text.contains("feline")
1967                    || text.contains("rug")
1968                {
1969                    if text.contains("feline") || text.contains("rug") {
1970                        vec![0.9f32, 0.1, 0.0]
1971                    } else {
1972                        vec![1.0f32, 0.0, 0.0]
1973                    }
1974                } else {
1975                    vec![0.0f32, 0.0, 1.0]
1976                };
1977                Ok(v)
1978            }
1979            fn supports_embeddings(&self) -> bool {
1980                true
1981            }
1982            fn name(&self) -> &'static str {
1983                "embed-mock"
1984            }
1985        }
1986
1987        let provider = EmbedMockProvider;
1988        let scorer = FidelityScorer;
1989        let cfg = FidelityConfig {
1990            enabled: true,
1991            semantic_scoring_provider: Some("embed-mock".to_string()),
1992            // Only semantic + temporal active; force Full for all so we can inspect scores
1993            // by checking which messages survive with Full vs not.
1994            // Use extreme thresholds so all messages pass through as Full.
1995            full_threshold: 0.0,
1996            compressed_threshold: 0.0,
1997            w_semantic: 1.0,
1998            w_temporal: 0.0,
1999            w_importance: 0.0,
2000            w_plan: 0.0,
2001            ..make_cfg()
2002        };
2003        let tc = FixedTc(4);
2004        let cat_msg = make_msg(Role::User, "The cat is on the mat");
2005        let feline_msg = make_msg(Role::User, "A feline rests on the rug");
2006        let stock_msg = make_msg(Role::User, "Stock prices fell today");
2007        let mut messages = vec![
2008            make_msg(Role::System, "system"),
2009            cat_msg,
2010            feline_msg,
2011            stock_msg,
2012        ];
2013
2014        scorer
2015            .score_and_apply(
2016                &mut messages,
2017                "cat mat",
2018                &[],
2019                &cfg,
2020                &tc,
2021                0,
2022                false,
2023                Some(&provider),
2024                None,
2025            )
2026            .await;
2027
2028        // All should be scored (Full threshold = 0.0 means everything is Full).
2029        // Embeddings must have been populated.
2030        assert!(
2031            messages[1].metadata.embedding.is_some(),
2032            "cat message must have embedding"
2033        );
2034        assert!(
2035            messages[2].metadata.embedding.is_some(),
2036            "feline message must have embedding"
2037        );
2038        assert!(
2039            messages[3].metadata.embedding.is_some(),
2040            "stock message must have embedding"
2041        );
2042
2043        // Verify cosine directly: cat/feline should be > stock vs query [1,0,0].
2044        let query_emb = [1.0f32, 0.0, 0.0];
2045        let cat_emb = messages[1].metadata.embedding.as_ref().unwrap();
2046        let feline_emb = messages[2].metadata.embedding.as_ref().unwrap();
2047        let stock_emb = messages[3].metadata.embedding.as_ref().unwrap();
2048        assert!(
2049            cosine_similarity(cat_emb, &query_emb) > cosine_similarity(stock_emb, &query_emb),
2050            "cat message must be more similar to query than stock message"
2051        );
2052        assert!(
2053            cosine_similarity(feline_emb, &query_emb) > cosine_similarity(stock_emb, &query_emb),
2054            "feline message must be more similar to query than stock message"
2055        );
2056    }
2057
2058    // 19. semantic_scoring_falls_back_to_keyword_when_provider_none.
2059    #[tokio::test]
2060    async fn semantic_scoring_falls_back_to_keyword_when_provider_none() {
2061        let scorer = FidelityScorer;
2062        let cfg = FidelityConfig {
2063            enabled: true,
2064            semantic_scoring_provider: None,
2065            ..make_cfg()
2066        };
2067        let tc = FixedTc(4);
2068        let mut messages = vec![
2069            make_msg(Role::System, "system"),
2070            make_msg(Role::User, "cat mat keyword test"),
2071            make_msg(Role::User, "something unrelated here"),
2072        ];
2073
2074        scorer
2075            .score_and_apply(
2076                &mut messages,
2077                "query text here long",
2078                &[],
2079                &cfg,
2080                &tc,
2081                0,
2082                false,
2083                None,
2084                None,
2085            )
2086            .await;
2087
2088        // No embeddings should be computed since provider is None.
2089        for msg in &messages {
2090            assert!(
2091                msg.metadata.embedding.is_none(),
2092                "no embedding must be computed when provider is None"
2093            );
2094        }
2095        // Scoring must still work (keyword path).
2096        for msg in &messages[1..] {
2097            assert!(
2098                msg.metadata.fidelity_tag.is_some(),
2099                "all non-system messages must be scored via keyword path"
2100            );
2101        }
2102    }
2103
2104    // w_plan path: messages whose content overlaps planned tool keywords score higher.
2105    #[tokio::test]
2106    async fn w_plan_produces_nonzero_score_for_matching_message() {
2107        use zeph_common::PlannedToolHint;
2108
2109        let scorer = FidelityScorer;
2110        // Only w_plan is active: all other weights zero so plan signal is isolated.
2111        // full_threshold = 0.5 so that a non-zero plan score reaches Full.
2112        let cfg = FidelityConfig {
2113            w_semantic: 0.0,
2114            w_temporal: 0.0,
2115            w_importance: 0.0,
2116            w_plan: 1.0,
2117            full_threshold: 0.5,
2118            compressed_threshold: 0.1,
2119            min_query_length: 100, // disable semantic signal
2120            ..make_cfg()
2121        };
2122        let tc = FixedTc(4);
2123
2124        // Hint: tool_name="shell", keywords contain "cargo" and "build".
2125        let hint = PlannedToolHint::new("shell", vec!["cargo".to_string(), "build".to_string()], 1);
2126
2127        // matching_msg contains words from the hint keywords.
2128        // non_matching_msg has no overlap with hint keywords.
2129        let mut messages = vec![
2130            make_msg(Role::System, "system prompt"),
2131            make_msg(Role::User, "run cargo build to compile"),
2132            make_msg(Role::User, "what is the weather today"),
2133        ];
2134
2135        scorer
2136            .score_and_apply(
2137                &mut messages,
2138                "q", // short query to keep semantic inactive
2139                &[hint],
2140                &cfg,
2141                &tc,
2142                0,
2143                false,
2144                None,
2145                None,
2146            )
2147            .await;
2148
2149        // The matching message should be scored Full (plan overlap is high).
2150        assert_eq!(
2151            messages[1].metadata.fidelity_tag,
2152            Some(ContextFidelity::Full),
2153            "message matching planned tool keywords must reach Full fidelity"
2154        );
2155
2156        // The non-matching message should not be Full (plan overlap is zero).
2157        assert_ne!(
2158            messages[2].metadata.fidelity_tag,
2159            Some(ContextFidelity::Full),
2160            "message with no keyword overlap must not reach Full fidelity via w_plan"
2161        );
2162    }
2163
2164    // ── truncate_to_byte_limit ────────────────────────────────────────────────
2165
2166    #[test]
2167    fn truncate_to_byte_limit_no_op_when_short() {
2168        assert_eq!(truncate_to_byte_limit("hello", 10), "hello");
2169    }
2170
2171    #[test]
2172    fn truncate_to_byte_limit_exact_limit_no_op() {
2173        assert_eq!(truncate_to_byte_limit("hello", 5), "hello");
2174    }
2175
2176    #[test]
2177    fn truncate_to_byte_limit_over_limit() {
2178        let s = truncate_to_byte_limit("abcdefgh", 5);
2179        assert_eq!(s.len(), 5);
2180        assert_eq!(s, "abcde");
2181    }
2182
2183    #[test]
2184    fn truncate_to_byte_limit_multibyte_boundary() {
2185        // "日本語" = 3 chars, each 3 bytes (9 bytes total). max_bytes=6 → "日本" (6 bytes).
2186        let s = truncate_to_byte_limit("日本語", 6);
2187        assert!(s.is_char_boundary(s.len()));
2188        assert_eq!(s, "日本");
2189    }
2190
2191    // ── apply_input_cap ───────────────────────────────────────────────────────
2192
2193    #[test]
2194    fn apply_input_cap_no_op_below_limit() {
2195        let mut s = "hello".to_string();
2196        apply_input_cap(&mut s, 10); // 10 * 4 = 40 bytes, well above 5
2197        assert_eq!(s, "hello");
2198    }
2199
2200    #[test]
2201    fn apply_input_cap_truncates_over_limit() {
2202        // max_tokens=1 → max_bytes=4
2203        let mut s = "abcdefgh".to_string();
2204        apply_input_cap(&mut s, 1);
2205        assert_eq!(s, "abcd");
2206    }
2207
2208    #[test]
2209    fn apply_input_cap_multibyte() {
2210        // "日本語" = 9 bytes. max_tokens=1 → max_bytes=4. floor_char_boundary(4) on "日本語"
2211        // lands at 3 (end of "日"). Result is "日".
2212        let mut s = "日本語".to_string();
2213        apply_input_cap(&mut s, 1);
2214        assert!(s.is_char_boundary(s.len()));
2215        assert_eq!(s, "日");
2216    }
2217
2218    // ── embed_prepass ─────────────────────────────────────────────────────────
2219
2220    #[tokio::test]
2221    async fn embed_prepass_returns_embeddings_for_non_exempt() {
2222        let messages = vec![
2223            make_msg(Role::System, "system prompt"), // exempt (idx=0, system)
2224            make_msg(Role::User, "user message"),
2225            make_msg(Role::Assistant, "assistant reply"),
2226        ];
2227        let cfg = FidelityConfig::default();
2228        let embed = |_text: &str| -> EmbedFuture { Box::pin(async { Ok(vec![1.0f32, 2.0, 3.0]) }) };
2229        let result = embed_prepass(&messages, &embed, &cfg, 0).await;
2230        // Index 0 (system) is exempt, indices 1 and 2 get embeddings.
2231        assert!(!result.contains_key(&0));
2232        assert_eq!(result[&1], vec![1.0, 2.0, 3.0]);
2233        assert_eq!(result[&2], vec![1.0, 2.0, 3.0]);
2234    }
2235
2236    #[tokio::test]
2237    async fn embed_prepass_skips_empty_content() {
2238        let messages = vec![make_msg(Role::System, "system"), make_msg(Role::User, "")];
2239        let cfg = FidelityConfig::default();
2240        let embed = |_text: &str| -> EmbedFuture { Box::pin(async { Ok(vec![1.0f32]) }) };
2241        let result = embed_prepass(&messages, &embed, &cfg, 0).await;
2242        assert!(!result.contains_key(&1), "empty content must be skipped");
2243    }
2244
2245    #[tokio::test]
2246    async fn embed_prepass_skips_inserted_memory() {
2247        let messages = vec![
2248            make_msg(Role::System, "system"),
2249            make_msg(Role::User, "injected memory"),
2250            make_msg(Role::User, "real user message"),
2251        ];
2252        let cfg = FidelityConfig::default();
2253        let embed = |_text: &str| -> EmbedFuture { Box::pin(async { Ok(vec![1.0f32]) }) };
2254        // inserted_count=1 → idx 1 is exempt
2255        let result = embed_prepass(&messages, &embed, &cfg, 1).await;
2256        assert!(!result.contains_key(&0), "system is exempt");
2257        assert!(!result.contains_key(&1), "inserted memory is exempt");
2258        assert!(
2259            result.contains_key(&2),
2260            "real user message must be embedded"
2261        );
2262    }
2263
2264    #[tokio::test]
2265    async fn embed_prepass_silently_skips_errors() {
2266        let messages = vec![
2267            make_msg(Role::System, "system"),
2268            make_msg(Role::User, "user"),
2269        ];
2270        let cfg = FidelityConfig::default();
2271        let embed = |_text: &str| -> EmbedFuture {
2272            Box::pin(async {
2273                Err(zeph_llm::LlmError::EmbedUnsupported {
2274                    provider: "mock".to_string(),
2275                })
2276            })
2277        };
2278        let result = embed_prepass(&messages, &embed, &cfg, 0).await;
2279        assert!(result.is_empty(), "errors must be silently skipped");
2280    }
2281
2282    #[tokio::test]
2283    async fn embed_prepass_truncates_content_when_cap_set() {
2284        // With max_embed_input_tokens=1 (max_bytes=4), content longer than 4 bytes is truncated.
2285        let long_content = "a".repeat(100);
2286        let messages = vec![
2287            make_msg(Role::System, "system"),
2288            make_msg(Role::User, &long_content),
2289        ];
2290        let cfg = FidelityConfig {
2291            max_embed_input_tokens: Some(1), // 1 * 4 = 4 bytes max
2292            ..FidelityConfig::default()
2293        };
2294        let seen_len = std::sync::Arc::new(std::sync::Mutex::new(0usize));
2295        let seen_len_clone = seen_len.clone();
2296        let embed = move |text: &str| -> EmbedFuture {
2297            let len = text.len();
2298            let seen = seen_len_clone.clone();
2299            Box::pin(async move {
2300                *seen.lock().unwrap() = len;
2301                Ok(vec![1.0f32])
2302            })
2303        };
2304        embed_prepass(&messages, &embed, &cfg, 0).await;
2305        assert_eq!(
2306            *seen_len.lock().unwrap(),
2307            4,
2308            "content must be truncated to max_embed_input_tokens * 4 bytes"
2309        );
2310    }
2311
2312    #[tokio::test]
2313    async fn embed_prepass_concurrency_zero_clamped_to_one() {
2314        // embed_concurrency=0 must be clamped to 1 (with a warning) and still produce results.
2315        let messages = vec![
2316            make_msg(Role::System, "system"),
2317            make_msg(Role::User, "user message"),
2318        ];
2319        let cfg = FidelityConfig {
2320            embed_concurrency: 0,
2321            ..FidelityConfig::default()
2322        };
2323        let embed = |_text: &str| -> EmbedFuture { Box::pin(async { Ok(vec![1.0f32]) }) };
2324        let result = embed_prepass(&messages, &embed, &cfg, 0).await;
2325        assert!(
2326            result.contains_key(&1),
2327            "result must be produced even with concurrency=0"
2328        );
2329    }
2330
2331    #[tokio::test]
2332    async fn embed_prepass_skips_cached_embeddings() {
2333        let mut msg_with_cache = make_msg(Role::User, "already embedded");
2334        msg_with_cache.metadata.embedding = Some(vec![9.0f32]);
2335        let messages = vec![
2336            make_msg(Role::System, "system"),
2337            msg_with_cache,
2338            make_msg(Role::User, "needs embedding"),
2339        ];
2340        let cfg = FidelityConfig::default();
2341        let embed = |_text: &str| -> EmbedFuture { Box::pin(async { Ok(vec![1.0f32]) }) };
2342        let result = embed_prepass(&messages, &embed, &cfg, 0).await;
2343        assert!(
2344            !result.contains_key(&1),
2345            "message with cached embedding must be skipped"
2346        );
2347        assert!(
2348            result.contains_key(&2),
2349            "message without embedding must be processed"
2350        );
2351    }
2352
2353    #[tokio::test(start_paused = true)]
2354    async fn embed_prepass_timeout_skips_message() {
2355        let messages = vec![
2356            make_msg(Role::System, "system"),
2357            make_msg(Role::User, "user"),
2358        ];
2359        let cfg = FidelityConfig::default();
2360        let embed = |_text: &str| -> EmbedFuture {
2361            Box::pin(async {
2362                // Simulate a stalled embed provider.
2363                tokio::time::sleep(Duration::from_secs(45)).await;
2364                Ok(vec![1.0f32])
2365            })
2366        };
2367        // Run the pre-pass; the 30-second timeout fires before the 60-second sleep.
2368        let result = embed_prepass(&messages, &embed, &cfg, 0).await;
2369        assert!(result.is_empty(), "timed-out embed must be skipped");
2370    }
2371
2372    // ── FidelityConfig new fields ─────────────────────────────────────────────
2373
2374    #[test]
2375    fn fidelity_config_new_fields_defaults() {
2376        let cfg = FidelityConfig::default();
2377        assert_eq!(cfg.embed_concurrency, 32);
2378        assert!(cfg.max_embed_input_tokens.is_none());
2379        assert!(cfg.max_compress_input_tokens.is_none());
2380    }
2381
2382    #[test]
2383    fn fidelity_config_new_fields_custom() {
2384        let cfg = FidelityConfig {
2385            embed_concurrency: 8,
2386            max_embed_input_tokens: Some(512),
2387            max_compress_input_tokens: Some(1024),
2388            ..FidelityConfig::default()
2389        };
2390        assert_eq!(cfg.embed_concurrency, 8);
2391        assert_eq!(cfg.max_embed_input_tokens, Some(512));
2392        assert_eq!(cfg.max_compress_input_tokens, Some(1024));
2393    }
2394
2395    // Post-compress truncation: deferred_summary exceeding compressed_max_tokens is trimmed.
2396    #[tokio::test]
2397    async fn render_compressed_truncates_oversized_deferred_summary() {
2398        let scorer = FidelityScorer;
2399        // full_threshold=2.0 unreachable, compressed_threshold=0.0 → everything Compressed.
2400        // compressed_max_tokens=3 (with FixedTc(1): 1 char = 1 token, so 3 tokens = 3 chars).
2401        let cfg = FidelityConfig {
2402            full_threshold: 2.0,
2403            compressed_threshold: 0.0,
2404            compressed_max_tokens: 3,
2405            ..make_cfg()
2406        };
2407        let tc = FixedTc(1);
2408        let mut msg = make_msg(Role::User, "original long content");
2409        // Simulate LLM returning a summary that is 10 chars (> 3 tokens).
2410        msg.metadata.deferred_summary = Some("ten chars!".to_string());
2411        let mut messages = vec![make_msg(Role::System, "sys"), msg];
2412        scorer
2413            .score_and_apply(
2414                &mut messages,
2415                "query text here long",
2416                &[],
2417                &cfg,
2418                &tc,
2419                0,
2420                false,
2421                None,
2422                None,
2423            )
2424            .await;
2425        let compressed = &messages[1];
2426        assert_eq!(
2427            compressed.metadata.fidelity_tag,
2428            Some(ContextFidelity::Compressed)
2429        );
2430        assert!(
2431            tc.count_tokens(&compressed.content) <= 3,
2432            "deferred_summary result must be truncated to compressed_max_tokens"
2433        );
2434    }
2435
2436    // max_compress_input_tokens: content is capped before truncation when no deferred_summary.
2437    #[tokio::test]
2438    async fn render_compressed_applies_max_compress_input_tokens() {
2439        let scorer = FidelityScorer;
2440        // Force everything to Compressed. max_compress_input_tokens=2 → max_bytes=8.
2441        // FixedTc(1): 1 byte = 1 token. compressed_max_tokens=100 (no output cap needed here).
2442        let cfg = FidelityConfig {
2443            full_threshold: 2.0,
2444            compressed_threshold: 0.0,
2445            compressed_max_tokens: 100,
2446            max_compress_input_tokens: Some(2), // 2 * 4 = 8 bytes
2447            ..make_cfg()
2448        };
2449        let tc = FixedTc(1);
2450        let content_20 = "a".repeat(20); // 20 bytes, must be capped to 8
2451        let mut msg = make_msg(Role::User, &content_20);
2452        // No deferred_summary → input cap path applies.
2453        let mut messages = vec![make_msg(Role::System, "sys"), msg.clone()];
2454        scorer
2455            .score_and_apply(
2456                &mut messages,
2457                "query text here long",
2458                &[],
2459                &cfg,
2460                &tc,
2461                0,
2462                false,
2463                None,
2464                None,
2465            )
2466            .await;
2467        let compressed = &messages[1];
2468        assert_eq!(
2469            compressed.metadata.fidelity_tag,
2470            Some(ContextFidelity::Compressed)
2471        );
2472        // Input was capped to 8 bytes, then output cap of 100 is no-op, so content is 8 bytes.
2473        assert_eq!(
2474            compressed.content.len(),
2475            8,
2476            "content must be capped to max_compress_input_tokens * 4 bytes"
2477        );
2478        // Verify deferred_summary path is unaffected by input cap.
2479        msg.metadata.deferred_summary = Some("short".to_string());
2480        let mut messages2 = vec![make_msg(Role::System, "sys"), msg];
2481        scorer
2482            .score_and_apply(
2483                &mut messages2,
2484                "query text here long",
2485                &[],
2486                &cfg,
2487                &tc,
2488                0,
2489                false,
2490                None,
2491                None,
2492            )
2493            .await;
2494        assert_eq!(
2495            messages2[1].content, "short",
2496            "deferred_summary must bypass input cap"
2497        );
2498    }
2499}