Skip to main content

zeph_memory/
quality_gate.rs

1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4//! `MemReader` write quality gate (#3222).
5//!
6//! [`QualityGate`] runs **after** A-MAC admission and before any persistence write.
7//! It scores three signals — information value, reference completeness, and contradiction
8//! risk — and rejects writes below a configurable threshold.
9//!
10//! Rule-based scoring ships as MVP; an optional LLM-assisted path is enabled by setting
11//! `quality_gate_provider` in `[memory.quality_gate]`.
12//!
13//! # Composition in `SemanticMemory`
14//!
15//! ```text
16//! remember(content)
17//!   → A-MAC::evaluate()  →  Ok(None) if rejected
18//!   → QualityGate::evaluate()  →  Ok(None) if rejected
19//!   → SQLite / Qdrant persist
20//! ```
21//!
22//! # Fail-open contract
23//!
24//! Any scoring failure (embed error, LLM timeout, graph query error) is treated as a
25//! pass — the write is admitted. Quality scoring is best-effort, never a hard dependency.
26
27use std::sync::Arc;
28use std::time::Duration;
29
30use zeph_llm::any::AnyProvider;
31use zeph_llm::provider::LlmProvider as _;
32
33use crate::graph::GraphStore;
34
35// ── Config ────────────────────────────────────────────────────────────────────
36
/// Tunable parameters of the write quality gate, mirroring the
/// `[memory.quality_gate]` TOML section.
#[derive(Debug, Clone)]
pub struct QualityGateConfig {
    /// Master switch. While `false`, every write is admitted without scoring.
    /// Default: `false`.
    pub enabled: bool,
    /// Minimum final score a write needs to be admitted; lower scores are rejected.
    /// Expected range `[0, 1]`. Default: `0.55`.
    pub threshold: f32,
    /// How many recent writes the information-value score is compared against.
    /// Default: `32`.
    pub recent_window: usize,
    /// Age in seconds beyond which an existing graph edge counts as stable for
    /// contradiction detection. Default: `300`.
    pub contradiction_grace_seconds: u64,
    /// Weight applied to the `information_value` sub-score. Default: `0.4`.
    pub information_value_weight: f32,
    /// Weight applied to the `reference_completeness` sub-score. Default: `0.3`.
    pub reference_completeness_weight: f32,
    /// Weight applied to the contradiction sub-score, which enters the sum as
    /// `1 - contradiction_risk`. Default: `0.3`.
    pub contradiction_weight: f32,
    /// Rejection ratio over the rolling 100-write window above which a `WARN`
    /// is logged. Default: `0.35`.
    pub rejection_rate_alarm_ratio: f32,
    /// Timeout, in milliseconds, for the optional LLM scoring call. Default: `500`.
    pub llm_timeout_ms: u64,
    /// Share of the final score contributed by the LLM when a provider is attached;
    /// the rule-based score receives the remainder. Default: `0.5`.
    pub llm_weight: f32,
    /// Toggles the English pronoun / deictic-time reference check. Turn it off for
    /// non-English sessions. Default: `true`.
    pub reference_check_lang_en: bool,
}

impl Default for QualityGateConfig {
    /// Returns the shipped defaults: gate disabled, rule-based weights `0.4 / 0.3 / 0.3`.
    fn default() -> Self {
        // The three rule-based weights add up to 1.0.
        let (information_value_weight, reference_completeness_weight, contradiction_weight) =
            (0.4, 0.3, 0.3);

        Self {
            enabled: false,
            threshold: 0.55,
            recent_window: 32,
            contradiction_grace_seconds: 300,
            information_value_weight,
            reference_completeness_weight,
            contradiction_weight,
            rejection_rate_alarm_ratio: 0.35,
            llm_timeout_ms: 500,
            llm_weight: 0.5,
            reference_check_lang_en: true,
        }
    }
}
84
85// ── Types ─────────────────────────────────────────────────────────────────────
86
/// Breakdown of the individual signals produced by a quality gate evaluation.
#[derive(Debug, Clone)]
pub struct QualityScore {
    /// Novelty of the candidate: `1.0 - max_cosine(candidate, recent_writes)`.
    /// `1.0` when there is nothing to compare against.
    pub information_value: f32,
    /// `1.0 - unresolved_reference_ratio`; drops as unresolved pronouns or
    /// deictic time expressions accumulate.
    pub reference_completeness: f32,
    /// Contradiction signal derived from the graph store. The scorer in this module
    /// yields `1.0` when a matching edge older than the grace period exists, `0.5`
    /// when only edges inside the grace period match, and `0.0` otherwise —
    /// including when no graph store is attached. The signal improves automatically
    /// once APEX-MEM (#3223) lands and a `GraphStore` is wired in.
    pub contradiction_risk: f32,
    /// Weighted combination of the three sub-scores above.
    pub combined: f32,
    /// Score after blending in the LLM judgement. Equals `combined` when no LLM
    /// provider is configured.
    pub final_score: f32,
}
103
104/// Reason for a quality gate rejection.
105#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, serde::Serialize)]
106#[serde(rename_all = "snake_case")]
107pub enum QualityRejectionReason {
108    /// Cosine similarity to recent writes is too high — the content is redundant.
109    Redundant,
110    /// Unresolved pronoun or deictic time expression without an absolute referent.
111    IncompleteReference,
112    /// A conflicting graph edge exists for the same `(subject, predicate)` pair.
113    Contradiction,
114    /// Optional LLM scorer returned a score below the threshold.
115    LlmLowConfidence,
116}
117
118impl QualityRejectionReason {
119    /// Stable lowercase-snake label suitable for metric tags.
120    #[must_use]
121    pub fn label(self) -> &'static str {
122        match self {
123            Self::Redundant => "redundant",
124            Self::IncompleteReference => "incomplete_reference",
125            Self::Contradiction => "contradiction",
126            Self::LlmLowConfidence => "llm_low_confidence",
127        }
128    }
129}
130
/// Fixed-size sliding window that reports the share of rejected writes among the
/// most recent outcomes.
struct RollingRateTracker {
    /// Outcomes in arrival order; `true` marks a rejection.
    window: std::collections::VecDeque<bool>,
    /// Number of outcomes retained once the window has filled up.
    capacity: usize,
    /// Running count of `true` entries currently held in `window`.
    reject_count: usize,
}

impl RollingRateTracker {
    /// Creates an empty tracker that keeps the latest `capacity` outcomes.
    fn new(capacity: usize) -> Self {
        let window = std::collections::VecDeque::with_capacity(capacity + 1);
        Self {
            window,
            capacity,
            reject_count: 0,
        }
    }

    /// Records one outcome, first dropping the oldest entry when the window is full.
    fn push(&mut self, rejected: bool) {
        let is_full = self.window.len() >= self.capacity;
        // `&&` short-circuits, so the oldest entry is popped only when the window is full.
        if is_full && self.window.pop_front() == Some(true) {
            self.reject_count = self.reject_count.saturating_sub(1);
        }

        self.window.push_back(rejected);
        self.reject_count += usize::from(rejected);
    }

    /// Fraction of rejections in the current window; `0.0` while the window is empty.
    #[allow(clippy::cast_precision_loss)]
    fn rate(&self) -> f32 {
        match self.window.len() {
            0 => 0.0,
            observed => self.reject_count as f32 / observed as f32,
        }
    }
}
168
169// ── QualityGate ───────────────────────────────────────────────────────────────
170
171/// Write quality gate that runs after A-MAC admission.
172///
173/// Constructed once and attached to [`crate::semantic::SemanticMemory`] via
174/// [`crate::semantic::SemanticMemory::with_quality_gate`]. Shared via `Arc`.
175///
176/// # Fail-open
177///
178/// Any internal error (embed failure, LLM timeout, graph query error) is caught
179/// and treated as a pass. The gate never causes `remember()` to return an `Err`.
180pub struct QualityGate {
181    config: Arc<QualityGateConfig>,
182    /// Optional LLM provider for the blended scoring path.
183    llm_provider: Option<Arc<AnyProvider>>,
184    graph_store: Option<Arc<GraphStore>>,
185    /// Rejection counters keyed by reason.
186    rejection_counts: std::sync::Mutex<std::collections::HashMap<QualityRejectionReason, u64>>,
187    /// Rolling rejection-rate tracker (last 100 writes).
188    rate_tracker: std::sync::Mutex<RollingRateTracker>,
189}
190
191impl QualityGate {
192    /// Create a new quality gate with the given config.
193    #[must_use]
194    pub fn new(config: QualityGateConfig) -> Self {
195        Self {
196            config: Arc::new(config),
197            llm_provider: None,
198            graph_store: None,
199            rejection_counts: std::sync::Mutex::new(std::collections::HashMap::new()),
200            rate_tracker: std::sync::Mutex::new(RollingRateTracker::new(100)),
201        }
202    }
203
204    /// Attach an LLM provider for optional blended scoring.
205    #[must_use]
206    pub fn with_llm_provider(mut self, provider: AnyProvider) -> Self {
207        self.llm_provider = Some(Arc::new(provider));
208        self
209    }
210
211    /// Attach a graph store for contradiction detection.
212    #[must_use]
213    pub fn with_graph_store(mut self, store: Arc<GraphStore>) -> Self {
214        self.graph_store = Some(store);
215        self
216    }
217
218    /// Return a reference to the configuration.
219    #[must_use]
220    pub fn config(&self) -> &QualityGateConfig {
221        &self.config
222    }
223
224    /// Return cumulative rejection counts per reason.
225    #[must_use]
226    pub fn rejection_counts(&self) -> std::collections::HashMap<QualityRejectionReason, u64> {
227        self.rejection_counts
228            .lock()
229            .map(|g| g.clone())
230            .unwrap_or_default()
231    }
232
233    /// Evaluate the quality gate for a candidate write.
234    ///
235    /// Returns `None` when the write passes (should be persisted).
236    /// Returns `Some(reason)` when the write should be rejected.
237    ///
238    /// Failures inside scoring are caught and treated as pass (fail-open).
239    #[tracing::instrument(name = "memory.quality_gate.evaluate", skip_all)]
240    pub async fn evaluate(
241        &self,
242        content: &str,
243        embed_provider: &AnyProvider,
244        recent_embeddings: &[Vec<f32>],
245    ) -> Option<QualityRejectionReason> {
246        if !self.config.enabled {
247            return None;
248        }
249
250        let info_val = compute_information_value(content, embed_provider, recent_embeddings).await;
251        let ref_comp = if self.config.reference_check_lang_en {
252            compute_reference_completeness(content)
253        } else {
254            1.0
255        };
256        let contradiction_risk =
257            compute_contradiction_risk(content, self.graph_store.as_deref(), &self.config).await;
258
259        let w_v = self.config.information_value_weight;
260        let w_c = self.config.reference_completeness_weight;
261        let w_k = self.config.contradiction_weight;
262
263        let rule_score = w_v * info_val + w_c * ref_comp + w_k * (1.0 - contradiction_risk);
264
265        let final_score = if let Some(ref llm) = self.llm_provider {
266            let llm_score = call_llm_scorer(content, llm, self.config.llm_timeout_ms).await;
267            let lw = self.config.llm_weight;
268            (1.0 - lw) * rule_score + lw * llm_score
269        } else {
270            rule_score
271        };
272
273        let rejected = final_score < self.config.threshold;
274
275        // Track rolling rejection rate.
276        if let Ok(mut tracker) = self.rate_tracker.lock() {
277            tracker.push(rejected);
278            let rate = tracker.rate();
279            if rate > self.config.rejection_rate_alarm_ratio {
280                tracing::warn!(
281                    rate = %format!("{:.2}", rate),
282                    window_size = self.config.recent_window,
283                    threshold = self.config.rejection_rate_alarm_ratio,
284                    "quality_gate: high rejection rate alarm"
285                );
286            }
287        }
288
289        if !rejected {
290            return None;
291        }
292
293        // Determine the most specific rejection reason.
294        let reason = if info_val < 0.1 {
295            QualityRejectionReason::Redundant
296        } else if ref_comp < 0.5 && self.config.reference_check_lang_en {
297            QualityRejectionReason::IncompleteReference
298        } else if contradiction_risk >= 1.0 {
299            QualityRejectionReason::Contradiction
300        } else {
301            QualityRejectionReason::LlmLowConfidence
302        };
303
304        if let Ok(mut counts) = self.rejection_counts.lock() {
305            *counts.entry(reason).or_insert(0) += 1;
306        }
307
308        tracing::debug!(
309            reason = reason.label(),
310            final_score,
311            info_val,
312            ref_comp,
313            contradiction_risk,
314            "quality_gate: rejected write"
315        );
316
317        Some(reason)
318    }
319}
320
321// ── Sub-scorers ───────────────────────────────────────────────────────────────
322
323/// Compute `information_value` as `1.0 - max_cosine(candidate, recent_embeddings)`.
324///
325/// Returns `1.0` when the store is empty or on any embedding error (fail-open: treat as novel).
326async fn compute_information_value(
327    content: &str,
328    provider: &AnyProvider,
329    recent_embeddings: &[Vec<f32>],
330) -> f32 {
331    if recent_embeddings.is_empty() {
332        return 1.0;
333    }
334    if !provider.supports_embeddings() {
335        return 1.0;
336    }
337    let candidate = match tokio::time::timeout(Duration::from_secs(5), provider.embed(content))
338        .await
339    {
340        Ok(Ok(v)) => v,
341        Ok(Err(e)) => {
342            tracing::debug!(error = %e, "quality_gate: embed failed, treating info_val = 1.0 (fail-open)");
343            return 1.0;
344        }
345        Err(_) => {
346            tracing::warn!("quality_gate: embed timed out, treating info_val = 1.0 (fail-open)");
347            return 1.0;
348        }
349    };
350    let max_sim = recent_embeddings
351        .iter()
352        .map(|r| zeph_common::math::cosine_similarity(&candidate, r))
353        .fold(0.0f32, f32::max);
354    (1.0 - max_sim).max(0.0)
355}
356
/// Compute `reference_completeness` as `1.0 - 2 * issues / word_count`, clamped to `[0, 1]`.
///
/// Heuristic: counts unresolved English pronouns and deictic time expressions.
/// English-only; callers must skip this when `reference_check_lang_en = false`.
///
/// Matching is done on whole words (the text is lowercased and split on every
/// non-alphanumeric character), so:
///
/// * pronouns next to punctuation (`"it."`, `"he,"`, `"it's"`) are detected;
/// * month names only count as date anchors when they appear as separate words —
///   `"grammar "` or `"marching"` no longer anchor a deictic expression;
/// * the word `"may"` is treated as a month only when a neighbouring word starts
///   with a digit (`"5 may"`, `"may 12th"`), because it is far more often the
///   modal verb.
///
/// Each distinct pronoun and each distinct deictic phrase counts as one issue.
/// Deictic phrases are not penalized at all when the text contains a date anchor
/// (a month name or a 4-digit year).
#[must_use]
pub fn compute_reference_completeness(content: &str) -> f32 {
    // Third-person pronouns that likely refer to an unresolved entity.
    const PRONOUNS: &[&str] = &["he", "she", "they", "it", "him", "her", "them"];
    // Deictic time expressions, space-padded so they match whole words in the
    // normalized text built below.
    const DEICTIC_TIME: &[&str] = &[
        " yesterday ",
        " tomorrow ",
        " last week ",
        " next week ",
        " last month ",
        " next month ",
        " last year ",
        " next year ",
    ];
    // Month names and abbreviations that resolve deictic expressions.
    // "may" is handled separately because of its modal-verb meaning.
    const MONTH_ANCHORS: &[&str] = &[
        "january",
        "february",
        "march",
        "april",
        "june",
        "july",
        "august",
        "september",
        "october",
        "november",
        "december",
        "jan",
        "feb",
        "mar",
        "apr",
        "jun",
        "jul",
        "aug",
        "sep",
        "oct",
        "nov",
        "dec",
    ];

    fn starts_with_digit(token: &str) -> bool {
        token.starts_with(|c: char| c.is_ascii_digit())
    }

    let lower = content.to_lowercase();
    let tokens: Vec<&str> = lower
        .split(|c: char| !c.is_alphanumeric())
        .filter(|t| !t.is_empty())
        .collect();
    // Words re-joined with single spaces and padded, so multi-word phrases can be
    // matched at word boundaries with a plain substring search.
    let normalized = format!(" {} ", tokens.join(" "));

    let pronoun_count = PRONOUNS.iter().filter(|&&p| tokens.contains(&p)).count();

    let has_month_anchor = tokens.iter().enumerate().any(|(idx, tok)| {
        if *tok == "may" {
            let prev = idx.checked_sub(1).and_then(|i| tokens.get(i));
            let next = tokens.get(idx + 1);
            prev.is_some_and(|t| starts_with_digit(t)) || next.is_some_and(|t| starts_with_digit(t))
        } else {
            MONTH_ANCHORS.contains(tok)
        }
    });

    // Require a 4-digit year (19xx or 20xx), not just "20", which produces
    // false positives on counts like "20 items" or "id=200".
    let has_date_anchor = has_month_anchor || has_4digit_year_anchor(&lower);
    let deictic_count = if has_date_anchor {
        0
    } else {
        DEICTIC_TIME
            .iter()
            .filter(|&&phrase| normalized.contains(phrase))
            .count()
    };

    let total_issues = pronoun_count + deictic_count;
    if total_issues == 0 {
        return 1.0;
    }

    // Normalize by the whitespace-separated word count; the factor 2 means a text
    // in which half of the words are issues already scores 0.0.
    let word_count = content.split_ascii_whitespace().count().max(1);
    #[allow(clippy::cast_precision_loss)]
    let ratio = total_issues as f32 / word_count as f32;
    (1.0 - ratio * 2.0).clamp(0.0, 1.0)
}

/// Returns `true` when `text` contains a 4-digit year (`19xx` or `20xx`) that is
/// not part of a longer digit run.
///
/// Only neighbouring digits disqualify a match, so `"id=2000"` and `"(2026-04-18)"`
/// match while `"20 items"`, `"id=200"` and `"120001"` do not.
fn has_4digit_year_anchor(text: &str) -> bool {
    let bytes = text.as_bytes();
    // `windows(4)` yields nothing for inputs shorter than four bytes.
    bytes.windows(4).enumerate().any(|(start, quad)| {
        let century_ok = matches!((quad[0], quad[1]), (b'1', b'9') | (b'2', b'0'));
        if !(century_ok && quad[2].is_ascii_digit() && quad[3].is_ascii_digit()) {
            return false;
        }
        let preceded_by_digit = start > 0 && bytes[start - 1].is_ascii_digit();
        let followed_by_digit = bytes.get(start + 4).is_some_and(u8::is_ascii_digit);
        !preceded_by_digit && !followed_by_digit
    })
}
459
460/// Compute `contradiction_risk` via graph edge lookup (FR-006).
461///
462/// Extracts the subject entity from the candidate message, then queries for existing
463/// active edges with the same `(source_entity_id, canonical_relation)`. A conflicting
464/// value on the same predicate that is older than `grace_seconds` is treated as a
465/// hard contradiction (returns `1.0`).
466///
467/// Returns `0.0` when no graph store is attached, on any error, or when no conflict found.
468async fn compute_contradiction_risk(
469    content: &str,
470    graph: Option<&GraphStore>,
471    config: &QualityGateConfig,
472) -> f32 {
473    let Some(store) = graph else {
474        return 0.0;
475    };
476
477    let content_lower = content.to_lowercase();
478
479    // Extract subject: longest noun-phrase before a verb-like token ("is", "has", "was", "are").
480    // Fallback: first two tokens.
481    let subject_query = extract_subject_tokens(&content_lower);
482    if subject_query.is_empty() {
483        return 0.0;
484    }
485
486    // Resolve the subject entity.
487    let Ok(entities) = store.find_entities_fuzzy(&subject_query, 1).await else {
488        return 0.0;
489    };
490    let Some(subject_entity) = entities.into_iter().next() else {
491        return 0.0;
492    };
493
494    // Extract candidate predicate from "X <predicate> Y" pattern.
495    let canonical_predicate = extract_predicate_token(&content_lower);
496
497    // Load all active edges where this entity is the source.
498    let Ok(edges) = store.edges_for_entity(subject_entity.id.0).await else {
499        return 0.0;
500    };
501
502    // Filter to edges where source matches subject and canonical_relation matches predicate.
503    let relevant_edges: Vec<_> = edges
504        .iter()
505        .filter(|e| {
506            e.source_entity_id == subject_entity.id.0
507                && canonical_predicate
508                    .as_ref()
509                    .is_none_or(|p| e.relation == *p)
510        })
511        .collect();
512
513    if relevant_edges.is_empty() {
514        return 0.0;
515    }
516
517    let now_secs = std::time::SystemTime::now()
518        .duration_since(std::time::UNIX_EPOCH)
519        .map_or(0, |d| d.as_secs());
520
521    let has_old_conflict = relevant_edges.iter().any(|edge| {
522        let edge_ts = chrono::DateTime::parse_from_rfc3339(&edge.created_at)
523            .map_or(0u64, |dt| u64::try_from(dt.timestamp()).unwrap_or(0));
524        now_secs.saturating_sub(edge_ts) > config.contradiction_grace_seconds
525    });
526
527    if has_old_conflict { 1.0 } else { 0.5 }
528}
529
/// Builds the subject phrase of `content_lower`: the whitespace-separated words
/// that precede the first verb-like marker, limited to the first three words.
///
/// When no marker is present, the first two words (or fewer, if the text is
/// shorter) are used. A text that starts with a marker yields an empty string.
fn extract_subject_tokens(content_lower: &str) -> String {
    const VERB_MARKERS: &[&str] = &["is", "was", "are", "were", "has", "have", "had", "will"];

    let words: Vec<&str> = content_lower.split_ascii_whitespace().collect();
    let subject_len = match words.iter().position(|w| VERB_MARKERS.contains(w)) {
        Some(verb_idx) => verb_idx.min(3),
        None => words.len().min(2),
    };

    words
        .iter()
        .take(subject_len)
        .copied()
        .collect::<Vec<_>>()
        .join(" ")
}
541
/// Returns the first verb-like marker word found in `content_lower`, or `None`
/// when the text contains none of them.
fn extract_predicate_token(content_lower: &str) -> Option<String> {
    const VERB_MARKERS: &[&str] = &["is", "was", "are", "were", "has", "have", "had", "will"];

    for word in content_lower.split_ascii_whitespace() {
        if VERB_MARKERS.contains(&word) {
            return Some(word.to_owned());
        }
    }
    None
}
550
551/// Call the optional LLM scorer and return a blended quality score.
552///
553/// Returns `0.5` (neutral) on timeout or any error — ensures fail-open behavior.
554async fn call_llm_scorer(content: &str, provider: &AnyProvider, timeout_ms: u64) -> f32 {
555    use zeph_llm::provider::{Message, MessageMetadata, Role};
556
557    let system = "You are a memory quality judge. Rate the quality of the following message \
558        for long-term storage on a scale of 0.0 to 1.0. Consider: information density, \
559        completeness of references, factual clarity. \
560        Respond with ONLY a JSON object: \
561        {\"information_value\": 0.0-1.0, \"reference_completeness\": 0.0-1.0, \
562        \"contradiction_risk\": 0.0-1.0}";
563
564    let user = format!(
565        "Message: {}\n\nQuality JSON:",
566        content.chars().take(500).collect::<String>()
567    );
568
569    let messages = vec![
570        Message {
571            role: Role::System,
572            content: system.to_owned(),
573            parts: vec![],
574            metadata: MessageMetadata::default(),
575        },
576        Message {
577            role: Role::User,
578            content: user,
579            parts: vec![],
580            metadata: MessageMetadata::default(),
581        },
582    ];
583
584    let timeout = Duration::from_millis(timeout_ms);
585    let result = match tokio::time::timeout(timeout, provider.chat(&messages)).await {
586        Ok(Ok(r)) => r,
587        Ok(Err(e)) => {
588            tracing::debug!(error = %e, "quality_gate: LLM scorer failed, using 0.5");
589            return 0.5;
590        }
591        Err(_) => {
592            tracing::debug!("quality_gate: LLM scorer timed out, using 0.5");
593            return 0.5;
594        }
595    };
596
597    parse_llm_score(&result)
598}
599
600/// Parse LLM JSON response into a combined quality score.
601///
602/// Returns `0.5` on any parse failure.
603fn parse_llm_score(response: &str) -> f32 {
604    // Find JSON object in the response.
605    let start = response.find('{');
606    let end = response.rfind('}');
607    let (Some(s), Some(e)) = (start, end) else {
608        return 0.5;
609    };
610    let json_str = &response[s..=e];
611    let Ok(val) = serde_json::from_str::<serde_json::Value>(json_str) else {
612        return 0.5;
613    };
614
615    #[allow(clippy::cast_possible_truncation)]
616    let iv = val["information_value"].as_f64().unwrap_or(0.5) as f32;
617    #[allow(clippy::cast_possible_truncation)]
618    let rc = val["reference_completeness"].as_f64().unwrap_or(0.5) as f32;
619    #[allow(clippy::cast_possible_truncation)]
620    let cr = val["contradiction_risk"].as_f64().unwrap_or(0.0) as f32;
621
622    // Mirror the rule-based formula with default weights.
623    let score =
624        0.4 * iv.clamp(0.0, 1.0) + 0.3 * rc.clamp(0.0, 1.0) + 0.3 * (1.0 - cr.clamp(0.0, 1.0));
625    score.clamp(0.0, 1.0)
626}
627
628// ── Tests ─────────────────────────────────────────────────────────────────────
629
#[cfg(test)]
mod tests {
    use super::*;

    /// Default mock provider with no special embed or chat behavior configured.
    fn mock_provider() -> zeph_llm::any::AnyProvider {
        zeph_llm::any::AnyProvider::Mock(zeph_llm::mock::MockProvider::default())
    }

    // ── compute_reference_completeness ──────────────────────────────────────

    #[test]
    fn reference_completeness_clean_text() {
        let score = compute_reference_completeness("The Rust compiler enforces memory safety.");
        assert!((score - 1.0).abs() < 0.01, "clean text should score 1.0");
    }

    #[test]
    fn reference_completeness_pronoun_heavy() {
        // "he", "they", "it" — three unresolved pronouns in a short message.
        let score = compute_reference_completeness("yeah he said they confirmed it");
        assert!(
            score < 0.5,
            "pronoun-heavy message should score below 0.5, got {score}"
        );
    }

    #[test]
    fn reference_completeness_deictic_without_anchor() {
        let score = compute_reference_completeness("We agreed yesterday to postpone");
        assert!(
            score < 1.0,
            "deictic time without anchor should penalize, got {score}"
        );
    }

    #[test]
    fn reference_completeness_deictic_with_anchor() {
        let score = compute_reference_completeness("We agreed yesterday (2026-04-18) to postpone");
        assert!(
            score >= 0.9,
            "deictic with a 4-digit year anchor should not penalize, got {score}"
        );
    }

    // ── QualityRejectionReason ──────────────────────────────────────────────

    #[test]
    fn rejection_reason_labels() {
        assert_eq!(QualityRejectionReason::Redundant.label(), "redundant");
        assert_eq!(
            QualityRejectionReason::IncompleteReference.label(),
            "incomplete_reference"
        );
        assert_eq!(
            QualityRejectionReason::Contradiction.label(),
            "contradiction"
        );
        assert_eq!(
            QualityRejectionReason::LlmLowConfidence.label(),
            "llm_low_confidence"
        );
    }

    // ── RollingRateTracker ──────────────────────────────────────────────────

    #[test]
    fn rolling_rate_tracker_basic() {
        let mut tracker = RollingRateTracker::new(4);
        tracker.push(true);
        tracker.push(true);
        tracker.push(false);
        tracker.push(false);
        let rate = tracker.rate();
        assert!((rate - 0.5).abs() < 0.01, "rate should be 0.5, got {rate}");
    }

    #[test]
    fn rolling_rate_tracker_evicts_oldest() {
        let mut tracker = RollingRateTracker::new(3);
        tracker.push(true); // will be evicted
        tracker.push(false);
        tracker.push(false);
        tracker.push(false); // evicts first `true`
        let rate = tracker.rate();
        assert!(
            rate < 0.01,
            "evicted rejection should not count, rate={rate}"
        );
    }

    // ── parse_llm_score ─────────────────────────────────────────────────────

    #[test]
    fn parse_llm_score_valid_json() {
        let json = r#"{"information_value": 0.8, "reference_completeness": 0.9, "contradiction_risk": 0.1}"#;
        let score = parse_llm_score(json);
        assert!(
            score > 0.7,
            "high-quality JSON should yield high score, got {score}"
        );
    }

    #[test]
    fn parse_llm_score_malformed_returns_neutral() {
        let score = parse_llm_score("not json");
        assert!(
            (score - 0.5).abs() < 0.01,
            "malformed JSON should return 0.5"
        );
    }

    // ── QualityGate ─────────────────────────────────────────────────────────

    #[tokio::test]
    async fn gate_disabled_always_passes() {
        let config = QualityGateConfig {
            enabled: false,
            ..QualityGateConfig::default()
        };
        let gate = QualityGate::new(config);
        let provider = mock_provider();

        let result = gate.evaluate("yeah he confirmed it", &provider, &[]).await;
        assert!(result.is_none(), "disabled gate must always pass");
    }

    #[tokio::test]
    async fn gate_admits_novel_clean_content() {
        let config = QualityGateConfig {
            enabled: true,
            threshold: 0.3, // lenient threshold for rule-only test
            ..QualityGateConfig::default()
        };
        let gate = QualityGate::new(config);
        let provider = mock_provider();

        // Novel content with no recent embeddings and clean references → should pass.
        let result = gate
            .evaluate(
                "The Rust compiler enforces memory safety through the borrow checker.",
                &provider,
                &[],
            )
            .await;
        assert!(result.is_none(), "clean novel content should be admitted");
    }

    /// With the reference-completeness signal dominating the score and a strict
    /// threshold, a pronoun-heavy message is rejected as `IncompleteReference`.
    #[tokio::test]
    async fn gate_rejects_pronoun_heavy_at_strict_threshold() {
        let config = QualityGateConfig {
            enabled: true,
            threshold: 0.75, // strict threshold
            reference_completeness_weight: 0.9,
            information_value_weight: 0.05,
            contradiction_weight: 0.05,
            ..QualityGateConfig::default()
        };
        let gate = QualityGate::new(config);
        let provider = mock_provider();

        let result = gate
            .evaluate("yeah he confirmed it they said so", &provider, &[])
            .await;
        assert!(
            result == Some(QualityRejectionReason::IncompleteReference),
            "pronoun-heavy message should be rejected as IncompleteReference, got {result:?}"
        );
    }

    #[test]
    fn quality_gate_counts_rejections() {
        let config = QualityGateConfig {
            enabled: true,
            threshold: 0.99, // reject almost everything
            ..QualityGateConfig::default()
        };
        let gate = QualityGate::new(config);

        // Manually record a rejection.
        if let Ok(mut counts) = gate.rejection_counts.lock() {
            *counts.entry(QualityRejectionReason::Redundant).or_insert(0) += 1;
        }

        let counts = gate.rejection_counts();
        assert_eq!(counts.get(&QualityRejectionReason::Redundant), Some(&1));
    }

    /// Embed error → fail-open: gate must admit the write (return `None`).
    #[tokio::test]
    async fn gate_fail_open_on_embed_error() {
        let config = QualityGateConfig {
            enabled: true,
            threshold: 0.5,
            ..QualityGateConfig::default()
        };
        let gate = QualityGate::new(config);

        // Provider that returns an embed error.
        let provider = zeph_llm::any::AnyProvider::Mock(
            zeph_llm::mock::MockProvider::default().with_embed_invalid_input(),
        );

        // The recent-embeddings window must be non-empty: with an empty window
        // `compute_information_value` returns early and `embed()` is never called,
        // so the error path would not be exercised at all.
        let recent = vec![vec![0.1_f32; 384]];

        let result = gate
            .evaluate("Alice confirmed the meeting at 3pm.", &provider, &recent)
            .await;
        assert!(
            result.is_none(),
            "embed error must be treated as fail-open (admitted), got {result:?}"
        );
    }

    /// Pre-populated `recent_embeddings` with an identical vector triggers `Redundant` rejection.
    #[tokio::test]
    async fn gate_rejects_redundant_with_populated_embeddings() {
        let config = QualityGateConfig {
            enabled: true,
            threshold: 0.5,
            // Heavy weight on information_value so redundancy dominates the score.
            information_value_weight: 0.9,
            reference_completeness_weight: 0.05,
            contradiction_weight: 0.05,
            ..QualityGateConfig::default()
        };
        let gate = QualityGate::new(config);

        // MockProvider returns the same fixed embedding for every call.
        let fixed_embedding = vec![0.1_f32; 384];
        let provider = zeph_llm::any::AnyProvider::Mock(
            zeph_llm::mock::MockProvider::default().with_embedding(fixed_embedding.clone()),
        );

        // Pass the identical vector as the recent-embeddings window so cosine similarity = 1.0.
        let result = gate
            .evaluate(
                "The Rust compiler enforces memory safety through the borrow checker.",
                &provider,
                &[fixed_embedding],
            )
            .await;
        assert_eq!(
            result,
            Some(QualityRejectionReason::Redundant),
            "identical recent embedding must trigger Redundant rejection"
        );
    }

    /// `embed()` timeout → fail-open: `compute_information_value` returns 1.0,
    /// gate admits the write (returns `None`).
    #[tokio::test]
    async fn gate_fail_open_on_embed_timeout() {
        tokio::time::pause();

        let config = QualityGateConfig {
            enabled: true,
            threshold: 0.5,
            information_value_weight: 0.9,
            reference_completeness_weight: 0.05,
            contradiction_weight: 0.05,
            ..QualityGateConfig::default()
        };
        let gate = QualityGate::new(config);

        // embed_delay_ms >> 5000ms timeout; time is paused so the test is instant.
        let provider = zeph_llm::any::AnyProvider::Mock(
            zeph_llm::mock::MockProvider::default().with_embed_delay(10_000),
        );

        // Provide a non-empty recent-embeddings window so compute_information_value
        // actually calls embed() (it returns early on empty).
        let recent = vec![vec![0.1_f32; 384]];

        let fut = gate.evaluate("Alice confirmed the meeting at 3pm.", &provider, &recent);
        // Advance time past the 5s embed timeout.
        let (result, ()) = tokio::join!(fut, async {
            tokio::time::advance(std::time::Duration::from_secs(6)).await;
        });

        assert!(
            result.is_none(),
            "embed timeout must be treated as fail-open (info_val=1.0, admitted), got {result:?}"
        );
    }

    /// LLM provider with 600ms latency exceeds `llm_timeout_ms`; the scorer then
    /// contributes the neutral score `0.5`, which is blended with the rule score.
    /// The gate must still return a result instead of hanging or failing.
    #[tokio::test]
    async fn gate_llm_timeout_falls_back_to_rule_score() {
        let config = QualityGateConfig {
            enabled: true,
            threshold: 0.3,     // lenient so the blended score still passes
            llm_timeout_ms: 50, // tight timeout
            llm_weight: 0.5,
            ..QualityGateConfig::default()
        };

        // Chat provider with 600ms delay — will exceed the 50ms timeout.
        let slow_provider = zeph_llm::any::AnyProvider::Mock(
            zeph_llm::mock::MockProvider::default().with_delay(600),
        );
        let gate = QualityGate::new(config).with_llm_provider(slow_provider);

        let embed_provider = mock_provider(); // no embeddings needed for this path

        let result = gate
            .evaluate(
                "The release is scheduled for next Friday.",
                &embed_provider,
                &[],
            )
            .await;
        // Gate must complete (no panic/hang). With a lenient threshold and clean
        // content, the rule score blended with the neutral LLM score admits the write.
        assert!(
            result.is_none(),
            "LLM timeout must fall back to rule score and admit clean content, got {result:?}"
        );
    }
}