Skip to main content

zeph_memory/store/
retrieval_failures.rs

1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4//! Persistent store for memory retrieval failure records.
5//!
6//! Failures are written via [`crate::RetrievalFailureLogger`], which batches records
7//! asynchronously and inserts them in the background to avoid blocking the
8//! recall hot path.
9
10use super::SqliteStore;
11use crate::error::MemoryError;
12use zeph_db::sql;
13
/// Category assigned to a memory retrieval failure event.
///
/// Each variant maps to a stable lowercase label (see [`Self::as_str`]) which is
/// the value written to the `failure_type` column.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RetrievalFailureType {
    /// The query produced no results at all.
    NoHit,
    /// Results came back, but the best score fell below the confidence threshold.
    LowConfidence,
    /// The recall operation did not finish within the configured timeout.
    Timeout,
    /// The recall backend reported an error.
    Error,
}

impl RetrievalFailureType {
    /// Returns the canonical string representation stored in the database.
    ///
    /// This is a `const fn`, so the labels can also be used in constant
    /// contexts (e.g. `const LABEL: &str = RetrievalFailureType::NoHit.as_str();`).
    /// The returned strings are part of the persisted format; changing them
    /// would make existing rows inconsistent with new ones.
    #[must_use]
    pub const fn as_str(self) -> &'static str {
        // Exhaustive on purpose: adding a variant must force a decision about
        // its persisted label.
        match self {
            Self::NoHit => "no_hit",
            Self::LowConfidence => "low_confidence",
            Self::Timeout => "timeout",
            Self::Error => "error",
        }
    }
}
39
40/// A single retrieval failure event to be persisted.
41#[derive(Debug, Clone)]
42pub struct RetrievalFailureRecord {
43    /// Conversation this failure occurred in. `None` when persistence is not yet
44    /// initialized (first-turn edge case).
45    pub conversation_id: Option<crate::types::ConversationId>,
46    /// Turn counter within the conversation. Use `0` when unavailable.
47    pub turn_index: i64,
48    /// How the recall failed.
49    pub failure_type: RetrievalFailureType,
50    /// Name of the retrieval strategy that was attempted.
51    pub retrieval_strategy: String,
52    /// The query text (truncated to 512 chars by [`crate::RetrievalFailureLogger::log`]).
53    pub query_text: String,
54    /// Byte length of the original query before any truncation.
55    ///
56    /// Note: `query_text` is truncated to 512 *chars* by [`crate::RetrievalFailureLogger::log`],
57    /// so `query_len` may exceed `query_text.len()` for multibyte inputs.
58    pub query_len: usize,
59    /// Top score returned, if any results were produced.
60    pub top_score: Option<f32>,
61    /// Configured confidence threshold at failure time.
62    pub confidence_threshold: Option<f32>,
63    /// Number of results returned (0 for `NoHit`).
64    pub result_count: usize,
65    /// Wall-clock duration of the recall operation in milliseconds.
66    pub latency_ms: u64,
67    /// JSON-serialized list of graph edge types used (graph recall only).
68    pub edge_types: Option<String>,
69    /// Error message or timeout context for `Error`/`Timeout` variants.
70    ///
71    /// Truncated to 256 chars by [`crate::RetrievalFailureLogger::log`] to bound channel memory.
72    pub error_context: Option<String>,
73}
74
75impl SqliteStore {
76    /// Insert a single retrieval failure record.
77    ///
78    /// Prefer [`crate::RetrievalFailureLogger`] for hot-path inserts — this method is
79    /// intended for tests and one-off writes.
80    ///
81    /// # Errors
82    ///
83    /// Returns [`MemoryError`] if the INSERT fails.
84    pub async fn record_retrieval_failure(
85        &self,
86        r: &RetrievalFailureRecord,
87    ) -> Result<(), MemoryError> {
88        self.record_retrieval_failures_batch(std::slice::from_ref(r))
89            .await
90    }
91
92    /// Batch-insert retrieval failure records in a single transaction.
93    ///
94    /// # Errors
95    ///
96    /// Returns [`MemoryError`] if any INSERT fails.
97    pub async fn record_retrieval_failures_batch(
98        &self,
99        records: &[RetrievalFailureRecord],
100    ) -> Result<(), MemoryError> {
101        if records.is_empty() {
102            return Ok(());
103        }
104        let mut tx = zeph_db::begin_write(self.pool()).await?;
105        for r in records {
106            let conversation_id = r.conversation_id.map(|c| c.0);
107            let failure_type = r.failure_type.as_str();
108            #[allow(clippy::cast_possible_wrap)]
109            let query_len = r.query_len as i64;
110            #[allow(clippy::cast_possible_wrap)]
111            let result_count = r.result_count as i64;
112            let latency_ms = r.latency_ms.cast_signed();
113            zeph_db::query(sql!(
114                "INSERT INTO memory_retrieval_failures
115                    (conversation_id, turn_index, failure_type, retrieval_strategy,
116                     query_text, query_len, top_score, confidence_threshold,
117                     result_count, latency_ms, edge_types, error_context)
118                 VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
119            ))
120            .bind(conversation_id)
121            .bind(r.turn_index)
122            .bind(failure_type)
123            .bind(&r.retrieval_strategy)
124            .bind(&r.query_text)
125            .bind(query_len)
126            .bind(r.top_score)
127            .bind(r.confidence_threshold)
128            .bind(result_count)
129            .bind(latency_ms)
130            .bind(&r.edge_types)
131            .bind(&r.error_context)
132            .execute(&mut *tx)
133            .await?;
134        }
135        tx.commit().await?;
136        Ok(())
137    }
138
139    /// Delete records older than `retention_days` days.
140    ///
141    /// Called periodically by [`crate::RetrievalFailureLogger`]'s background task.
142    ///
143    /// # Errors
144    ///
145    /// Returns [`MemoryError`] if the DELETE fails.
146    pub async fn purge_old_retrieval_failures(
147        &self,
148        retention_days: u32,
149    ) -> Result<u64, MemoryError> {
150        let cutoff = format!(
151            "{}",
152            (chrono::Utc::now() - chrono::Duration::days(i64::from(retention_days)))
153                .format("%Y-%m-%d %H:%M:%S")
154        );
155        let rows = zeph_db::query(sql!(
156            "DELETE FROM memory_retrieval_failures WHERE created_at < ?"
157        ))
158        .bind(cutoff)
159        .execute(self.pool())
160        .await?;
161        Ok(rows.rows_affected())
162    }
163}