Skip to main content

zeph_memory/store/
retrieval_failures.rs

// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
// SPDX-License-Identifier: MIT OR Apache-2.0

//! Persistent store for memory retrieval failure records.
//!
//! Failures are written via [`crate::RetrievalFailureLogger`], which batches records
//! asynchronously and inserts them in the background to avoid blocking the
//! recall hot path.

10use super::SqliteStore;
11use crate::error::MemoryError;
12use zeph_db::sql;
13
/// Classification of a memory retrieval failure event.
///
/// Each variant has a stable lowercase tag, returned by
/// [`RetrievalFailureType::as_str`], which is the value bound to the
/// `failure_type` column when a record is inserted.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum RetrievalFailureType {
    /// No results were returned for the query.
    NoHit,
    /// Results were returned but the top score was below the confidence threshold.
    LowConfidence,
    /// The recall operation did not complete within the configured timeout.
    Timeout,
    /// The recall backend returned an error.
    Error,
}

impl RetrievalFailureType {
    /// Returns the canonical string representation stored in the database.
    ///
    /// The match is deliberately exhaustive (no `_` arm) so that adding a
    /// variant forces a tag to be chosen for it. The function is `const`, so
    /// the tags can also be used in constant contexts.
    #[must_use]
    pub const fn as_str(self) -> &'static str {
        match self {
            Self::NoHit => "no_hit",
            Self::LowConfidence => "low_confidence",
            Self::Timeout => "timeout",
            Self::Error => "error",
        }
    }
}
40
41/// A single retrieval failure event to be persisted.
42#[derive(Debug, Clone)]
43pub struct RetrievalFailureRecord {
44    /// Conversation this failure occurred in. `None` when persistence is not yet
45    /// initialized (first-turn edge case).
46    pub conversation_id: Option<crate::types::ConversationId>,
47    /// Turn counter within the conversation. Use `0` when unavailable.
48    pub turn_index: i64,
49    /// How the recall failed.
50    pub failure_type: RetrievalFailureType,
51    /// Name of the retrieval strategy that was attempted.
52    pub retrieval_strategy: String,
53    /// The query text (truncated to 512 chars by [`crate::RetrievalFailureLogger::log`]).
54    pub query_text: String,
55    /// Byte length of the original query before any truncation.
56    ///
57    /// Note: `query_text` is truncated to 512 *chars* by [`crate::RetrievalFailureLogger::log`],
58    /// so `query_len` may exceed `query_text.len()` for multibyte inputs.
59    pub query_len: usize,
60    /// Top score returned, if any results were produced.
61    pub top_score: Option<f32>,
62    /// Configured confidence threshold at failure time.
63    pub confidence_threshold: Option<f32>,
64    /// Number of results returned (0 for `NoHit`).
65    pub result_count: usize,
66    /// Wall-clock duration of the recall operation in milliseconds.
67    pub latency_ms: u64,
68    /// JSON-serialized list of graph edge types used (graph recall only).
69    pub edge_types: Option<String>,
70    /// Error message or timeout context for `Error`/`Timeout` variants.
71    ///
72    /// Truncated to 256 chars by [`crate::RetrievalFailureLogger::log`] to bound channel memory.
73    pub error_context: Option<String>,
74}
75
76impl SqliteStore {
77    /// Insert a single retrieval failure record.
78    ///
79    /// Prefer [`crate::RetrievalFailureLogger`] for hot-path inserts — this method is
80    /// intended for tests and one-off writes.
81    ///
82    /// # Errors
83    ///
84    /// Returns [`MemoryError`] if the INSERT fails.
85    pub async fn record_retrieval_failure(
86        &self,
87        r: &RetrievalFailureRecord,
88    ) -> Result<(), MemoryError> {
89        self.record_retrieval_failures_batch(std::slice::from_ref(r))
90            .await
91    }
92
93    /// Batch-insert retrieval failure records in a single transaction.
94    ///
95    /// # Errors
96    ///
97    /// Returns [`MemoryError`] if any INSERT fails.
98    pub async fn record_retrieval_failures_batch(
99        &self,
100        records: &[RetrievalFailureRecord],
101    ) -> Result<(), MemoryError> {
102        if records.is_empty() {
103            return Ok(());
104        }
105        let mut tx = zeph_db::begin_write(self.pool()).await?;
106        for r in records {
107            let conversation_id = r.conversation_id.map(|c| c.0);
108            let failure_type = r.failure_type.as_str();
109            #[allow(clippy::cast_possible_wrap)]
110            let query_len = r.query_len as i64;
111            #[allow(clippy::cast_possible_wrap)]
112            let result_count = r.result_count as i64;
113            let latency_ms = r.latency_ms.cast_signed();
114            zeph_db::query(sql!(
115                "INSERT INTO memory_retrieval_failures
116                    (conversation_id, turn_index, failure_type, retrieval_strategy,
117                     query_text, query_len, top_score, confidence_threshold,
118                     result_count, latency_ms, edge_types, error_context)
119                 VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
120            ))
121            .bind(conversation_id)
122            .bind(r.turn_index)
123            .bind(failure_type)
124            .bind(&r.retrieval_strategy)
125            .bind(&r.query_text)
126            .bind(query_len)
127            .bind(r.top_score)
128            .bind(r.confidence_threshold)
129            .bind(result_count)
130            .bind(latency_ms)
131            .bind(&r.edge_types)
132            .bind(&r.error_context)
133            .execute(&mut *tx)
134            .await?;
135        }
136        tx.commit().await?;
137        Ok(())
138    }
139
140    /// Delete records older than `retention_days` days.
141    ///
142    /// Called periodically by [`crate::RetrievalFailureLogger`]'s background task.
143    ///
144    /// # Errors
145    ///
146    /// Returns [`MemoryError`] if the DELETE fails.
147    pub async fn purge_old_retrieval_failures(
148        &self,
149        retention_days: u32,
150    ) -> Result<u64, MemoryError> {
151        let cutoff = format!(
152            "{}",
153            (chrono::Utc::now() - chrono::Duration::days(i64::from(retention_days)))
154                .format("%Y-%m-%d %H:%M:%S")
155        );
156        let rows = zeph_db::query(sql!(
157            "DELETE FROM memory_retrieval_failures WHERE created_at < ?"
158        ))
159        .bind(cutoff)
160        .execute(self.pool())
161        .await?;
162        Ok(rows.rows_affected())
163    }
164}