Skip to main content

zeph_memory/
quality_gate.rs

1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4//! `MemReader` write quality gate (#3222).
5//!
6//! [`QualityGate`] runs **after** A-MAC admission and before any persistence write.
7//! It scores three signals — information value, reference completeness, and contradiction
8//! risk — and rejects writes below a configurable threshold.
9//!
10//! Rule-based scoring ships as MVP; an optional LLM-assisted path is enabled by setting
11//! `quality_gate_provider` in `[memory.quality_gate]`.
12//!
13//! # Composition in `SemanticMemory`
14//!
15//! ```text
16//! remember(content)
17//!   → A-MAC::evaluate()  →  Ok(None) if rejected
18//!   → QualityGate::evaluate()  →  Ok(None) if rejected
19//!   → SQLite / Qdrant persist
20//! ```
21//!
22//! # Fail-open contract
23//!
24//! Any scoring failure (embed error, LLM timeout, graph query error) is treated as a
25//! pass — the write is admitted. Quality scoring is best-effort, never a hard dependency.
26
27use std::sync::Arc;
28use std::time::Duration;
29
30use zeph_llm::any::AnyProvider;
31use zeph_llm::provider::LlmProvider as _;
32
33use crate::graph::GraphStore;
34
35// ── Config ────────────────────────────────────────────────────────────────────
36
/// Tunable parameters of the write quality gate (`[memory.quality_gate]` TOML section).
///
/// Every field is public and read as-is by the gate; see [`Default`] for the
/// shipped values.
#[derive(Debug, Clone)]
pub struct QualityGateConfig {
    /// Master switch. While `false`, every write passes through. Default: `false`.
    pub enabled: bool,
    /// Minimum final score a write needs to be admitted. Range `[0, 1]`. Default: `0.55`.
    pub threshold: f32,
    /// How many recent writes information-value scoring compares against. Default: `32`.
    pub recent_window: usize,
    /// Age in seconds after which a graph edge counts as stable for contradiction
    /// detection. Default: `300`.
    pub contradiction_grace_seconds: u64,
    /// Weight applied to the `information_value` sub-score. Default: `0.4`.
    pub information_value_weight: f32,
    /// Weight applied to the `reference_completeness` sub-score. Default: `0.3`.
    pub reference_completeness_weight: f32,
    /// Weight applied to `1 - contradiction_risk`. Default: `0.3`.
    pub contradiction_weight: f32,
    /// Rejection ratio over the rolling 100-write window above which a `WARN` is logged.
    /// Default: `0.35`.
    pub rejection_rate_alarm_ratio: f32,
    /// Timeout, in milliseconds, for the optional LLM scoring call. Default: `500`.
    pub llm_timeout_ms: u64,
    /// Share of the final score taken from the LLM when a provider is attached.
    /// Default: `0.5`.
    pub llm_weight: f32,
    /// Enables the English pronoun / deictic-time heuristics. Turn off for
    /// non-English sessions. Default: `true`.
    pub reference_check_lang_en: bool,
}

impl Default for QualityGateConfig {
    /// Returns the shipped defaults: gate disabled, rule weights 0.4 / 0.3 / 0.3.
    fn default() -> Self {
        // Rule-based sub-score weights, grouped so their sum (1.0) is easy to audit.
        let (information_value_weight, reference_completeness_weight, contradiction_weight) =
            (0.4, 0.3, 0.3);

        QualityGateConfig {
            enabled: false,
            threshold: 0.55,
            recent_window: 32,
            contradiction_grace_seconds: 300,
            information_value_weight,
            reference_completeness_weight,
            contradiction_weight,
            rejection_rate_alarm_ratio: 0.35,
            llm_timeout_ms: 500,
            llm_weight: 0.5,
            reference_check_lang_en: true,
        }
    }
}
84
85// ── Types ─────────────────────────────────────────────────────────────────────
86
/// Per-signal scores from the quality gate evaluation.
///
/// NOTE(review): nothing in the visible part of this file constructs this type;
/// the field descriptions below follow the sub-scorer functions in this module.
#[derive(Debug, Clone)]
pub struct QualityScore {
    /// `1.0 - max_cosine(candidate, recent_writes)`. `1.0` when the store is empty.
    pub information_value: f32,
    /// `1.0 - unresolved_reference_ratio`. Lower = more unresolved pronouns/deictic time.
    pub reference_completeness: f32,
    /// `1.0` when a matching graph edge older than the grace period exists, `0.5`
    /// when matching edges exist but none is older than the grace period, `0.0`
    /// otherwise. Also `0.0` when no graph store is attached — improves
    /// automatically when APEX-MEM (#3223) lands and a `GraphStore` is wired in.
    pub contradiction_risk: f32,
    /// Weighted combination of the three sub-scores.
    pub combined: f32,
    /// LLM-blended final score. Equals `combined` when no LLM provider is configured.
    pub final_score: f32,
}
103
/// Reason for a quality gate rejection.
///
/// Serialized in `snake_case`; the enum is `#[non_exhaustive]`, so downstream
/// matches need a wildcard arm.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize)]
#[serde(rename_all = "snake_case")]
#[non_exhaustive]
pub enum QualityRejectionReason {
    /// Cosine similarity to recent writes is too high — the content is redundant.
    Redundant,
    /// Unresolved pronoun or deictic time expression without an absolute referent.
    IncompleteReference,
    /// A conflicting graph edge exists for the same `(subject, predicate)` pair.
    Contradiction,
    /// Optional LLM scorer returned a score below the threshold.
    ///
    /// NOTE(review): `QualityGate::evaluate` also reports this variant as its
    /// fallback when none of the other reasons applies, even if no LLM provider
    /// is attached.
    LlmLowConfidence,
}
118
119impl QualityRejectionReason {
120    /// Stable lowercase-snake label suitable for metric tags.
121    #[must_use]
122    pub fn label(self) -> &'static str {
123        match self {
124            Self::Redundant => "redundant",
125            Self::IncompleteReference => "incomplete_reference",
126            Self::Contradiction => "contradiction",
127            Self::LlmLowConfidence => "llm_low_confidence",
128        }
129    }
130}
131
/// Sliding window over the most recent write outcomes, used to derive the
/// rejection rate of the last `capacity` writes.
struct RollingRateTracker {
    /// Outcomes in arrival order; `true` marks a rejected write.
    window: std::collections::VecDeque<bool>,
    /// Maximum number of outcomes retained.
    capacity: usize,
    /// Number of `true` entries currently held in `window`.
    reject_count: usize,
}

impl RollingRateTracker {
    /// Creates an empty tracker that keeps up to `capacity` outcomes.
    fn new(capacity: usize) -> Self {
        let window = std::collections::VecDeque::with_capacity(capacity + 1);
        Self {
            window,
            capacity,
            reject_count: 0,
        }
    }

    /// Records one outcome, dropping the oldest entry first when the window is full.
    fn push(&mut self, rejected: bool) {
        let at_capacity = self.window.len() >= self.capacity;
        // `&&` short-circuits, so the front is only popped when the window is full.
        if at_capacity && self.window.pop_front() == Some(true) {
            self.reject_count = self.reject_count.saturating_sub(1);
        }
        self.window.push_back(rejected);
        self.reject_count += usize::from(rejected);
    }

    /// Fraction of rejected writes in the window; `0.0` while the window is empty.
    #[allow(clippy::cast_precision_loss)]
    fn rate(&self) -> f32 {
        match self.window.len() {
            0 => 0.0,
            total => self.reject_count as f32 / total as f32,
        }
    }
}
169
170// ── QualityGate ───────────────────────────────────────────────────────────────
171
/// Write quality gate that runs after A-MAC admission.
///
/// Constructed once and attached to [`crate::semantic::SemanticMemory`] via
/// [`crate::semantic::SemanticMemory::with_quality_gate`]. Shared via `Arc`.
///
/// The counters and the rolling tracker sit behind `std::sync::Mutex`, so
/// `evaluate` can update them through `&self`.
///
/// # Fail-open
///
/// Any internal error (embed failure, LLM timeout, graph query error) is caught
/// and treated as a pass. The gate never causes `remember()` to return an `Err`.
pub struct QualityGate {
    /// Gate settings, fixed at construction time.
    config: Arc<QualityGateConfig>,
    /// Optional LLM provider for the blended scoring path.
    llm_provider: Option<Arc<AnyProvider>>,
    /// Optional graph store used for contradiction detection; `None` disables it.
    graph_store: Option<Arc<GraphStore>>,
    /// Rejection counters keyed by reason.
    rejection_counts: std::sync::Mutex<std::collections::HashMap<QualityRejectionReason, u64>>,
    /// Rolling rejection-rate tracker (last 100 writes).
    rate_tracker: std::sync::Mutex<RollingRateTracker>,
    /// Per-call timeout for every `embed()` invocation. Default: 5 s.
    embed_timeout: std::time::Duration,
}
193
194impl QualityGate {
195    /// Create a new quality gate with the given config.
196    #[must_use]
197    pub fn new(config: QualityGateConfig) -> Self {
198        Self {
199            config: Arc::new(config),
200            llm_provider: None,
201            graph_store: None,
202            rejection_counts: std::sync::Mutex::new(std::collections::HashMap::new()),
203            rate_tracker: std::sync::Mutex::new(RollingRateTracker::new(100)),
204            embed_timeout: std::time::Duration::from_secs(5),
205        }
206    }
207
208    /// Set the per-call timeout for every `embed()` invocation.
209    ///
210    /// Default: 5 s. Must be non-zero; the minimum effective value is 1 s.
211    #[must_use]
212    pub fn with_embed_timeout(mut self, timeout_secs: u64) -> Self {
213        self.embed_timeout = std::time::Duration::from_secs(timeout_secs.max(1));
214        self
215    }
216
217    /// Attach an LLM provider for optional blended scoring.
218    #[must_use]
219    pub fn with_llm_provider(mut self, provider: AnyProvider) -> Self {
220        self.llm_provider = Some(Arc::new(provider));
221        self
222    }
223
224    /// Attach a graph store for contradiction detection.
225    #[must_use]
226    pub fn with_graph_store(mut self, store: Arc<GraphStore>) -> Self {
227        self.graph_store = Some(store);
228        self
229    }
230
231    /// Return a reference to the configuration.
232    #[must_use]
233    pub fn config(&self) -> &QualityGateConfig {
234        &self.config
235    }
236
237    /// Return cumulative rejection counts per reason.
238    #[must_use]
239    pub fn rejection_counts(&self) -> std::collections::HashMap<QualityRejectionReason, u64> {
240        self.rejection_counts
241            .lock()
242            .map(|g| g.clone())
243            .unwrap_or_default()
244    }
245
246    /// Evaluate the quality gate for a candidate write.
247    ///
248    /// Returns `None` when the write passes (should be persisted).
249    /// Returns `Some(reason)` when the write should be rejected.
250    ///
251    /// Failures inside scoring are caught and treated as pass (fail-open).
252    #[tracing::instrument(name = "memory.quality_gate.evaluate", skip_all)]
253    pub async fn evaluate(
254        &self,
255        content: &str,
256        embed_provider: &AnyProvider,
257        recent_embeddings: &[Vec<f32>],
258    ) -> Option<QualityRejectionReason> {
259        if !self.config.enabled {
260            return None;
261        }
262
263        let info_val = compute_information_value(
264            content,
265            embed_provider,
266            recent_embeddings,
267            self.embed_timeout,
268        )
269        .await;
270        let ref_comp = if self.config.reference_check_lang_en {
271            compute_reference_completeness(content)
272        } else {
273            1.0
274        };
275        let contradiction_risk =
276            compute_contradiction_risk(content, self.graph_store.as_deref(), &self.config).await;
277
278        let w_v = self.config.information_value_weight;
279        let w_c = self.config.reference_completeness_weight;
280        let w_k = self.config.contradiction_weight;
281
282        let rule_score = w_v * info_val + w_c * ref_comp + w_k * (1.0 - contradiction_risk);
283
284        let final_score = if let Some(ref llm) = self.llm_provider {
285            let llm_score = call_llm_scorer(content, llm, self.config.llm_timeout_ms).await;
286            let lw = self.config.llm_weight;
287            (1.0 - lw) * rule_score + lw * llm_score
288        } else {
289            rule_score
290        };
291
292        let rejected = final_score < self.config.threshold;
293
294        // Track rolling rejection rate.
295        if let Ok(mut tracker) = self.rate_tracker.lock() {
296            tracker.push(rejected);
297            let rate = tracker.rate();
298            if rate > self.config.rejection_rate_alarm_ratio {
299                tracing::warn!(
300                    rate = %format!("{:.2}", rate),
301                    window_size = self.config.recent_window,
302                    threshold = self.config.rejection_rate_alarm_ratio,
303                    "quality_gate: high rejection rate alarm"
304                );
305            }
306        }
307
308        if !rejected {
309            return None;
310        }
311
312        // Determine the most specific rejection reason.
313        let reason = if info_val < 0.1 {
314            QualityRejectionReason::Redundant
315        } else if ref_comp < 0.5 && self.config.reference_check_lang_en {
316            QualityRejectionReason::IncompleteReference
317        } else if contradiction_risk >= 1.0 {
318            QualityRejectionReason::Contradiction
319        } else {
320            QualityRejectionReason::LlmLowConfidence
321        };
322
323        if let Ok(mut counts) = self.rejection_counts.lock() {
324            *counts.entry(reason).or_insert(0) += 1;
325        }
326
327        tracing::debug!(
328            reason = reason.label(),
329            final_score,
330            info_val,
331            ref_comp,
332            contradiction_risk,
333            "quality_gate: rejected write"
334        );
335
336        Some(reason)
337    }
338}
339
340// ── Sub-scorers ───────────────────────────────────────────────────────────────
341
342/// Compute `information_value` as `1.0 - max_cosine(candidate, recent_embeddings)`.
343///
344/// Returns `1.0` when the store is empty or on any embedding error (fail-open: treat as novel).
345async fn compute_information_value(
346    content: &str,
347    provider: &AnyProvider,
348    recent_embeddings: &[Vec<f32>],
349    embed_timeout: std::time::Duration,
350) -> f32 {
351    if recent_embeddings.is_empty() {
352        return 1.0;
353    }
354    if !provider.supports_embeddings() {
355        return 1.0;
356    }
357    let candidate = match tokio::time::timeout(embed_timeout, provider.embed(content)).await {
358        Ok(Ok(v)) => v,
359        Ok(Err(e)) => {
360            tracing::debug!(error = %e, "quality_gate: embed failed, treating info_val = 1.0 (fail-open)");
361            return 1.0;
362        }
363        Err(_) => {
364            tracing::warn!("quality_gate: embed timed out, treating info_val = 1.0 (fail-open)");
365            return 1.0;
366        }
367    };
368    let max_sim = recent_embeddings
369        .iter()
370        .map(|r| zeph_common::math::cosine_similarity(&candidate, r))
371        .fold(0.0f32, f32::max);
372    (1.0 - max_sim).max(0.0)
373}
374
/// Compute `reference_completeness` as `1.0 - unresolved_reference_ratio`.
///
/// Heuristic: counts distinct unresolved English pronouns and deictic time
/// expressions. Matching is done on whole words after splitting on every
/// non-alphanumeric character, so pronouns next to punctuation ("told him.")
/// are detected and month names are not matched inside longer words
/// ("marching", "maybe").
///
/// A deictic time expression is not penalized when the text also carries an
/// absolute date anchor: a 4-digit year, a month name, or a month abbreviation.
/// "may" only counts as a month when it sits next to a number ("May 5",
/// "5th May"), because it is far more often a modal verb.
///
/// The penalty is twice the issue-to-word ratio, clamped to `[0.0, 1.0]`.
///
/// English-only; callers must skip this when `reference_check_lang_en = false`.
#[must_use]
pub fn compute_reference_completeness(content: &str) -> f32 {
    // Third-person pronouns that likely refer to an unresolved entity.
    const PRONOUNS: &[&str] = &["he", "she", "they", "it", "him", "her", "them"];
    // Deictic time expressions without an accompanying absolute date.
    const DEICTIC_TIME: &[&str] = &[
        "yesterday",
        "tomorrow",
        "last week",
        "next week",
        "last month",
        "next month",
        "last year",
        "next year",
    ];
    // Month words that anchor a date on their own ("may" is handled separately).
    const MONTH_ANCHORS: &[&str] = &[
        "january",
        "february",
        "march",
        "april",
        "june",
        "july",
        "august",
        "september",
        "october",
        "november",
        "december",
        "jan",
        "feb",
        "mar",
        "apr",
        "jun",
        "jul",
        "aug",
        "sep",
        "oct",
        "nov",
        "dec",
    ];

    let lower = content.to_lowercase();
    let words: Vec<&str> = lower
        .split(|c: char| !c.is_alphanumeric())
        .filter(|w| !w.is_empty())
        .collect();
    // Space-padded, single-space-joined form for multi-word phrase lookups.
    let normalized = format!(" {} ", words.join(" "));
    let contains_phrase = |phrase: &str| normalized.contains(&format!(" {phrase} "));

    // Each pronoun is counted once, no matter how often it occurs.
    let pronoun_count = PRONOUNS.iter().filter(|&&p| words.contains(&p)).count();

    let starts_with_digit = |w: &str| w.starts_with(|c: char| c.is_ascii_digit());
    let has_month_anchor = words.iter().enumerate().any(|(idx, &word)| {
        if MONTH_ANCHORS.contains(&word) {
            return true;
        }
        if word != "may" {
            return false;
        }
        let prev_is_number = idx > 0 && starts_with_digit(words[idx - 1]);
        let next_is_number = words
            .get(idx + 1)
            .is_some_and(|&next| starts_with_digit(next));
        prev_is_number || next_is_number
    });

    // Require a 4-digit year (19xx or 20xx), not just "20", which produces
    // false positives on counts like "20 items" or "id=200".
    let has_date_anchor = has_4digit_year_anchor(&lower) || has_month_anchor;
    let deictic_count = if has_date_anchor {
        0
    } else {
        DEICTIC_TIME.iter().filter(|&&t| contains_phrase(t)).count()
    };

    let total_issues = pronoun_count + deictic_count;
    if total_issues == 0 {
        return 1.0;
    }

    // Normalize by the whitespace-delimited word count of the original text.
    let word_count = content.split_ascii_whitespace().count().max(1);
    #[allow(clippy::cast_precision_loss)]
    let ratio = total_issues as f32 / word_count as f32;
    (1.0 - ratio * 2.0).clamp(0.0, 1.0)
}

/// Returns `true` when `text` contains a 4-digit year (`19xx` or `20xx`) that is
/// not directly preceded or followed by another ASCII digit.
///
/// Avoids false positives from shorter or longer digit runs such as "20 items",
/// "id=200" or "120245". Only neighbouring digits are checked, so a year glued
/// to letters ("v2024") still counts.
fn has_4digit_year_anchor(text: &str) -> bool {
    let bytes = text.as_bytes();
    // `windows(4)` yields nothing for inputs shorter than four bytes.
    bytes.windows(4).enumerate().any(|(start, quad)| {
        let century_ok = matches!((quad[0], quad[1]), (b'1', b'9') | (b'2', b'0'));
        if !(century_ok && quad[2].is_ascii_digit() && quad[3].is_ascii_digit()) {
            return false;
        }
        let digit_before = start
            .checked_sub(1)
            .and_then(|prev| bytes.get(prev))
            .is_some_and(u8::is_ascii_digit);
        let digit_after = bytes.get(start + 4).is_some_and(u8::is_ascii_digit);
        !digit_before && !digit_after
    })
}
477
478/// Compute `contradiction_risk` via graph edge lookup (FR-006).
479///
480/// Extracts the subject entity from the candidate message, then queries for existing
481/// active edges with the same `(source_entity_id, canonical_relation)`. A conflicting
482/// value on the same predicate that is older than `grace_seconds` is treated as a
483/// hard contradiction (returns `1.0`).
484///
485/// Returns `0.0` when no graph store is attached, on any error, or when no conflict found.
486async fn compute_contradiction_risk(
487    content: &str,
488    graph: Option<&GraphStore>,
489    config: &QualityGateConfig,
490) -> f32 {
491    let Some(store) = graph else {
492        return 0.0;
493    };
494
495    let content_lower = content.to_lowercase();
496
497    // Extract subject: longest noun-phrase before a verb-like token ("is", "has", "was", "are").
498    // Fallback: first two tokens.
499    let subject_query = extract_subject_tokens(&content_lower);
500    if subject_query.is_empty() {
501        return 0.0;
502    }
503
504    // Resolve the subject entity.
505    let Ok(entities) = store.find_entities_fuzzy(&subject_query, 1).await else {
506        return 0.0;
507    };
508    let Some(subject_entity) = entities.into_iter().next() else {
509        return 0.0;
510    };
511
512    // Extract candidate predicate from "X <predicate> Y" pattern.
513    let canonical_predicate = extract_predicate_token(&content_lower);
514
515    // Load all active edges where this entity is the source.
516    let Ok(edges) = store.edges_for_entity(subject_entity.id.0).await else {
517        return 0.0;
518    };
519
520    // Filter to edges where source matches subject and canonical_relation matches predicate.
521    let relevant_edges: Vec<_> = edges
522        .iter()
523        .filter(|e| {
524            e.source_entity_id == subject_entity.id.0
525                && canonical_predicate
526                    .as_ref()
527                    .is_none_or(|p| e.relation == *p)
528        })
529        .collect();
530
531    if relevant_edges.is_empty() {
532        return 0.0;
533    }
534
535    let now_secs = std::time::SystemTime::now()
536        .duration_since(std::time::UNIX_EPOCH)
537        .map_or(0, |d| d.as_secs());
538
539    let has_old_conflict = relevant_edges.iter().any(|edge| {
540        let edge_ts = chrono::DateTime::parse_from_rfc3339(&edge.created_at)
541            .map_or(0u64, |dt| u64::try_from(dt.timestamp()).unwrap_or(0));
542        now_secs.saturating_sub(edge_ts) > config.contradiction_grace_seconds
543    });
544
545    if has_old_conflict { 1.0 } else { 0.5 }
546}
547
/// Returns the leading words of `content_lower` that act as the subject phrase.
///
/// The phrase ends right before the first verb-like marker. Without a marker the
/// first two words are used. At most three words are returned, joined by single
/// spaces. The result is empty when the text is empty or starts with a marker.
fn extract_subject_tokens(content_lower: &str) -> String {
    const VERB_MARKERS: &[&str] = &["is", "was", "are", "were", "has", "have", "had", "will"];

    let words: Vec<&str> = content_lower.split_ascii_whitespace().collect();
    let cutoff = match words.iter().position(|w| VERB_MARKERS.contains(w)) {
        Some(verb_index) => verb_index,
        None => words.len().min(2),
    };

    words
        .iter()
        .take(cutoff.min(3))
        .copied()
        .collect::<Vec<&str>>()
        .join(" ")
}
559
/// Returns the first verb-like marker found in `content_lower`, if any.
///
/// Words are split on ASCII whitespace and compared verbatim, so a marker with
/// attached punctuation (e.g. `"is,"`) is not recognized.
fn extract_predicate_token(content_lower: &str) -> Option<String> {
    const VERB_MARKERS: &[&str] = &["is", "was", "are", "were", "has", "have", "had", "will"];

    for word in content_lower.split_ascii_whitespace() {
        if VERB_MARKERS.contains(&word) {
            return Some(word.to_owned());
        }
    }
    None
}
568
569/// Call the optional LLM scorer and return a blended quality score.
570///
571/// Returns `0.5` (neutral) on timeout or any error — ensures fail-open behavior.
572async fn call_llm_scorer(content: &str, provider: &AnyProvider, timeout_ms: u64) -> f32 {
573    use zeph_llm::provider::{Message, MessageMetadata, Role};
574
575    let system = "You are a memory quality judge. Rate the quality of the following message \
576        for long-term storage on a scale of 0.0 to 1.0. Consider: information density, \
577        completeness of references, factual clarity. \
578        Respond with ONLY a JSON object: \
579        {\"information_value\": 0.0-1.0, \"reference_completeness\": 0.0-1.0, \
580        \"contradiction_risk\": 0.0-1.0}";
581
582    let user = format!(
583        "Message: {}\n\nQuality JSON:",
584        content.chars().take(500).collect::<String>()
585    );
586
587    let messages = vec![
588        Message {
589            role: Role::System,
590            content: system.to_owned(),
591            parts: vec![],
592            metadata: MessageMetadata::default(),
593        },
594        Message {
595            role: Role::User,
596            content: user,
597            parts: vec![],
598            metadata: MessageMetadata::default(),
599        },
600    ];
601
602    let timeout = Duration::from_millis(timeout_ms);
603    let result = match tokio::time::timeout(timeout, provider.chat(&messages)).await {
604        Ok(Ok(r)) => r,
605        Ok(Err(e)) => {
606            tracing::debug!(error = %e, "quality_gate: LLM scorer failed, using 0.5");
607            return 0.5;
608        }
609        Err(_) => {
610            tracing::debug!("quality_gate: LLM scorer timed out, using 0.5");
611            return 0.5;
612        }
613    };
614
615    parse_llm_score(&result)
616}
617
618/// Parse LLM JSON response into a combined quality score.
619///
620/// Returns `0.5` on any parse failure.
621fn parse_llm_score(response: &str) -> f32 {
622    // Find JSON object in the response.
623    let start = response.find('{');
624    let end = response.rfind('}');
625    let (Some(s), Some(e)) = (start, end) else {
626        return 0.5;
627    };
628    let json_str = &response[s..=e];
629    let Ok(val) = serde_json::from_str::<serde_json::Value>(json_str) else {
630        return 0.5;
631    };
632
633    #[allow(clippy::cast_possible_truncation)]
634    let iv = val["information_value"].as_f64().unwrap_or(0.5) as f32;
635    #[allow(clippy::cast_possible_truncation)]
636    let rc = val["reference_completeness"].as_f64().unwrap_or(0.5) as f32;
637    #[allow(clippy::cast_possible_truncation)]
638    let cr = val["contradiction_risk"].as_f64().unwrap_or(0.0) as f32;
639
640    // Mirror the rule-based formula with default weights.
641    let score =
642        0.4 * iv.clamp(0.0, 1.0) + 0.3 * rc.clamp(0.0, 1.0) + 0.3 * (1.0 - cr.clamp(0.0, 1.0));
643    score.clamp(0.0, 1.0)
644}
645
646// ── Tests ─────────────────────────────────────────────────────────────────────
647
648#[cfg(test)]
649mod tests {
650    use super::*;
651
652    #[test]
653    fn reference_completeness_clean_text() {
654        let score = compute_reference_completeness("The Rust compiler enforces memory safety.");
655        assert!((score - 1.0).abs() < 0.01, "clean text should score 1.0");
656    }
657
658    #[test]
659    fn reference_completeness_pronoun_heavy() {
660        // "he", "they", "it" — three unresolved pronouns in a short message.
661        let score = compute_reference_completeness("yeah he said they confirmed it");
662        assert!(
663            score < 0.5,
664            "pronoun-heavy message should score below 0.5, got {score}"
665        );
666    }
667
668    #[test]
669    fn reference_completeness_deictic_without_anchor() {
670        let score = compute_reference_completeness("We agreed yesterday to postpone");
671        assert!(
672            score < 1.0,
673            "deictic time without anchor should penalize, got {score}"
674        );
675    }
676
677    #[test]
678    fn reference_completeness_deictic_with_anchor() {
679        let score = compute_reference_completeness("We agreed yesterday (2026-04-18) to postpone");
680        assert!(
681            score >= 0.9,
682            "deictic with anchor '20' should not penalize, got {score}"
683        );
684    }
685
686    #[test]
687    fn rejection_reason_labels() {
688        assert_eq!(QualityRejectionReason::Redundant.label(), "redundant");
689        assert_eq!(
690            QualityRejectionReason::IncompleteReference.label(),
691            "incomplete_reference"
692        );
693        assert_eq!(
694            QualityRejectionReason::Contradiction.label(),
695            "contradiction"
696        );
697        assert_eq!(
698            QualityRejectionReason::LlmLowConfidence.label(),
699            "llm_low_confidence"
700        );
701    }
702
703    #[test]
704    fn rolling_rate_tracker_basic() {
705        let mut tracker = RollingRateTracker::new(4);
706        tracker.push(true);
707        tracker.push(true);
708        tracker.push(false);
709        tracker.push(false);
710        let rate = tracker.rate();
711        assert!((rate - 0.5).abs() < 0.01, "rate should be 0.5, got {rate}");
712    }
713
714    #[test]
715    fn rolling_rate_tracker_evicts_oldest() {
716        let mut tracker = RollingRateTracker::new(3);
717        tracker.push(true); // will be evicted
718        tracker.push(false);
719        tracker.push(false);
720        tracker.push(false); // evicts first `true`
721        let rate = tracker.rate();
722        assert!(
723            rate < 0.01,
724            "evicted rejection should not count, rate={rate}"
725        );
726    }
727
728    #[test]
729    fn parse_llm_score_valid_json() {
730        let json = r#"{"information_value": 0.8, "reference_completeness": 0.9, "contradiction_risk": 0.1}"#;
731        let score = parse_llm_score(json);
732        assert!(
733            score > 0.7,
734            "high-quality JSON should yield high score, got {score}"
735        );
736    }
737
738    #[test]
739    fn parse_llm_score_malformed_returns_neutral() {
740        let score = parse_llm_score("not json");
741        assert!(
742            (score - 0.5).abs() < 0.01,
743            "malformed JSON should return 0.5"
744        );
745    }
746
747    fn mock_provider() -> zeph_llm::any::AnyProvider {
748        zeph_llm::any::AnyProvider::Mock(zeph_llm::mock::MockProvider::default())
749    }
750
751    #[tokio::test]
752    async fn gate_disabled_always_passes() {
753        let config = QualityGateConfig {
754            enabled: false,
755            ..QualityGateConfig::default()
756        };
757        let gate = QualityGate::new(config);
758        let provider = mock_provider();
759
760        let result = gate.evaluate("yeah he confirmed it", &provider, &[]).await;
761        assert!(result.is_none(), "disabled gate must always pass");
762    }
763
764    #[tokio::test]
765    async fn gate_admits_novel_clean_content() {
766        let config = QualityGateConfig {
767            enabled: true,
768            threshold: 0.3, // lenient threshold for rule-only test
769            ..QualityGateConfig::default()
770        };
771        let gate = QualityGate::new(config);
772        let provider = mock_provider();
773
774        // Novel content with no recent embeddings and clean references → should pass.
775        let result = gate
776            .evaluate(
777                "The Rust compiler enforces memory safety through the borrow checker.",
778                &provider,
779                &[],
780            )
781            .await;
782        assert!(result.is_none(), "clean novel content should be admitted");
783    }
784
785    #[tokio::test]
786    async fn gate_rejects_pronoun_only_at_low_threshold() {
787        let config = QualityGateConfig {
788            enabled: true,
789            threshold: 0.75, // strict threshold
790            reference_completeness_weight: 0.9,
791            information_value_weight: 0.05,
792            contradiction_weight: 0.05,
793            ..QualityGateConfig::default()
794        };
795        let gate = QualityGate::new(config);
796        let provider = mock_provider();
797
798        let result = gate
799            .evaluate("yeah he confirmed it they said so", &provider, &[])
800            .await;
801        assert!(
802            result == Some(QualityRejectionReason::IncompleteReference),
803            "pronoun-heavy message should be rejected as IncompleteReference, got {result:?}"
804        );
805    }
806
807    #[test]
808    fn quality_gate_counts_rejections() {
809        let config = QualityGateConfig {
810            enabled: true,
811            threshold: 0.99, // reject almost everything
812            ..QualityGateConfig::default()
813        };
814        let gate = QualityGate::new(config);
815
816        // Manually record a rejection.
817        if let Ok(mut counts) = gate.rejection_counts.lock() {
818            *counts.entry(QualityRejectionReason::Redundant).or_insert(0) += 1;
819        }
820
821        let counts = gate.rejection_counts();
822        assert_eq!(counts.get(&QualityRejectionReason::Redundant), Some(&1));
823    }
824
825    /// Embed error → fail-open: gate must admit the write (return `None`).
826    #[tokio::test]
827    async fn gate_fail_open_on_embed_error() {
828        let config = QualityGateConfig {
829            enabled: true,
830            threshold: 0.5,
831            ..QualityGateConfig::default()
832        };
833        let gate = QualityGate::new(config);
834
835        // Provider that returns an embed error.
836        let provider = zeph_llm::any::AnyProvider::Mock(
837            zeph_llm::mock::MockProvider::default().with_embed_invalid_input(),
838        );
839
840        let result = gate
841            .evaluate(
842                "Alice confirmed the meeting at 3pm.",
843                &provider,
844                &[], // no recent embeddings; error occurs during info_value embed
845            )
846            .await;
847        assert!(
848            result.is_none(),
849            "embed error must be treated as fail-open (admitted), got {result:?}"
850        );
851    }
852
853    /// Pre-populated `recent_embeddings` with an identical vector triggers `Redundant` rejection.
854    #[tokio::test]
855    async fn gate_rejects_redundant_with_populated_embeddings() {
856        let config = QualityGateConfig {
857            enabled: true,
858            threshold: 0.5,
859            // Heavy weight on information_value so redundancy dominates the score.
860            information_value_weight: 0.9,
861            reference_completeness_weight: 0.05,
862            contradiction_weight: 0.05,
863            ..QualityGateConfig::default()
864        };
865        let gate = QualityGate::new(config);
866
867        // MockProvider returns the same fixed embedding for every call.
868        let fixed_embedding = vec![0.1_f32; 384];
869        let provider = zeph_llm::any::AnyProvider::Mock(
870            zeph_llm::mock::MockProvider::default().with_embedding(fixed_embedding.clone()),
871        );
872
873        // Pass the identical vector as the recent-embeddings window so cosine similarity = 1.0.
874        let result = gate
875            .evaluate(
876                "The Rust compiler enforces memory safety through the borrow checker.",
877                &provider,
878                &[fixed_embedding],
879            )
880            .await;
881        assert_eq!(
882            result,
883            Some(QualityRejectionReason::Redundant),
884            "identical recent embedding must trigger Redundant rejection"
885        );
886    }
887
888    /// `embed()` timeout → fail-open: `compute_information_value` returns 1.0,
889    /// gate admits the write (returns `None`).
890    #[tokio::test]
891    async fn gate_fail_open_on_embed_timeout() {
892        tokio::time::pause();
893
894        let config = QualityGateConfig {
895            enabled: true,
896            threshold: 0.5,
897            information_value_weight: 0.9,
898            reference_completeness_weight: 0.05,
899            contradiction_weight: 0.05,
900            ..QualityGateConfig::default()
901        };
902        let gate = QualityGate::new(config);
903
904        // embed_delay_ms >> 5000ms timeout; time is paused so the test is instant.
905        let provider = zeph_llm::any::AnyProvider::Mock(
906            zeph_llm::mock::MockProvider::default().with_embed_delay(10_000),
907        );
908
909        // Provide a non-empty recent-embeddings window so compute_information_value
910        // actually calls embed() (it returns early on empty).
911        let recent = vec![vec![0.1_f32; 384]];
912
913        let fut = gate.evaluate("Alice confirmed the meeting at 3pm.", &provider, &recent);
914        // Advance time past the 5s embed timeout.
915        let (result, ()) = tokio::join!(fut, async {
916            tokio::time::advance(std::time::Duration::from_secs(6)).await;
917        });
918
919        assert!(
920            result.is_none(),
921            "embed timeout must be treated as fail-open (info_val=1.0, admitted), got {result:?}"
922        );
923    }
924
925    /// LLM provider with 600ms latency exceeds `llm_timeout_ms`; gate falls back to rule score
926    /// and still returns a result (pass or reject based on rule score alone).
927    #[tokio::test]
928    async fn gate_llm_timeout_falls_back_to_rule_score() {
929        let config = QualityGateConfig {
930            enabled: true,
931            threshold: 0.3,     // lenient so rule score alone is likely to pass
932            llm_timeout_ms: 50, // tight timeout
933            llm_weight: 0.5,
934            ..QualityGateConfig::default()
935        };
936        let gate = QualityGate::new(config);
937
938        // Chat provider with 600ms delay — will exceed the 50ms timeout.
939        let slow_provider = zeph_llm::any::AnyProvider::Mock(
940            zeph_llm::mock::MockProvider::default().with_delay(600),
941        );
942        let gate = gate.with_llm_provider(slow_provider);
943
944        let embed_provider = mock_provider(); // no embeddings needed for this path
945
946        let result = gate
947            .evaluate(
948                "The release is scheduled for next Friday.",
949                &embed_provider,
950                &[],
951            )
952            .await;
953        // Gate must complete (no panic/hang) and fall back to rule-only score.
954        // With a lenient threshold and clean content the rule score should admit it.
955        assert!(
956            result.is_none(),
957            "LLM timeout must fall back to rule score and admit clean content, got {result:?}"
958        );
959    }
960}