zeph_memory/store/retrieval_failures.rs
1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4//! Persistent store for memory retrieval failure records.
5//!
6//! Failures are written via [`crate::RetrievalFailureLogger`], which batches records
7//! asynchronously and inserts them in the background to avoid blocking the
8//! recall hot path.
9
10use super::SqliteStore;
11use crate::error::MemoryError;
12use zeph_db::sql;
13
/// Classification of a memory retrieval failure event.
///
/// Every variant has a stable lowercase label, available through
/// [`RetrievalFailureType::as_str`], which is the form used when the value is persisted.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RetrievalFailureType {
    /// No results were returned for the query.
    NoHit,
    /// Results were returned but the top score was below the confidence threshold.
    LowConfidence,
    /// The recall operation did not complete within the configured timeout.
    Timeout,
    /// The recall backend returned an error.
    Error,
}

impl RetrievalFailureType {
    /// Returns the canonical string representation stored in the database.
    ///
    /// This is a `const fn`, so the labels can also be used to initialize `const` and
    /// `static` items. The `match` deliberately has no catch-all arm: adding a variant
    /// without assigning it a label is a compile error.
    #[must_use]
    pub const fn as_str(self) -> &'static str {
        match self {
            Self::NoHit => "no_hit",
            Self::LowConfidence => "low_confidence",
            Self::Timeout => "timeout",
            Self::Error => "error",
        }
    }
}
39
40/// A single retrieval failure event to be persisted.
41#[derive(Debug, Clone)]
42pub struct RetrievalFailureRecord {
43 /// Conversation this failure occurred in. `None` when persistence is not yet
44 /// initialized (first-turn edge case).
45 pub conversation_id: Option<crate::types::ConversationId>,
46 /// Turn counter within the conversation. Use `0` when unavailable.
47 pub turn_index: i64,
48 /// How the recall failed.
49 pub failure_type: RetrievalFailureType,
50 /// Name of the retrieval strategy that was attempted.
51 pub retrieval_strategy: String,
52 /// The query text (truncated to 512 chars by [`crate::RetrievalFailureLogger::log`]).
53 pub query_text: String,
54 /// Byte length of the original query before any truncation.
55 ///
56 /// Note: `query_text` is truncated to 512 *chars* by [`crate::RetrievalFailureLogger::log`],
57 /// so `query_len` may exceed `query_text.len()` for multibyte inputs.
58 pub query_len: usize,
59 /// Top score returned, if any results were produced.
60 pub top_score: Option<f32>,
61 /// Configured confidence threshold at failure time.
62 pub confidence_threshold: Option<f32>,
63 /// Number of results returned (0 for `NoHit`).
64 pub result_count: usize,
65 /// Wall-clock duration of the recall operation in milliseconds.
66 pub latency_ms: u64,
67 /// JSON-serialized list of graph edge types used (graph recall only).
68 pub edge_types: Option<String>,
69 /// Error message or timeout context for `Error`/`Timeout` variants.
70 ///
71 /// Truncated to 256 chars by [`crate::RetrievalFailureLogger::log`] to bound channel memory.
72 pub error_context: Option<String>,
73}
74
75impl SqliteStore {
76 /// Insert a single retrieval failure record.
77 ///
78 /// Prefer [`crate::RetrievalFailureLogger`] for hot-path inserts — this method is
79 /// intended for tests and one-off writes.
80 ///
81 /// # Errors
82 ///
83 /// Returns [`MemoryError`] if the INSERT fails.
84 pub async fn record_retrieval_failure(
85 &self,
86 r: &RetrievalFailureRecord,
87 ) -> Result<(), MemoryError> {
88 self.record_retrieval_failures_batch(std::slice::from_ref(r))
89 .await
90 }
91
92 /// Batch-insert retrieval failure records in a single transaction.
93 ///
94 /// # Errors
95 ///
96 /// Returns [`MemoryError`] if any INSERT fails.
97 pub async fn record_retrieval_failures_batch(
98 &self,
99 records: &[RetrievalFailureRecord],
100 ) -> Result<(), MemoryError> {
101 if records.is_empty() {
102 return Ok(());
103 }
104 let mut tx = zeph_db::begin_write(self.pool()).await?;
105 for r in records {
106 let conversation_id = r.conversation_id.map(|c| c.0);
107 let failure_type = r.failure_type.as_str();
108 #[allow(clippy::cast_possible_wrap)]
109 let query_len = r.query_len as i64;
110 #[allow(clippy::cast_possible_wrap)]
111 let result_count = r.result_count as i64;
112 let latency_ms = r.latency_ms.cast_signed();
113 zeph_db::query(sql!(
114 "INSERT INTO memory_retrieval_failures
115 (conversation_id, turn_index, failure_type, retrieval_strategy,
116 query_text, query_len, top_score, confidence_threshold,
117 result_count, latency_ms, edge_types, error_context)
118 VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
119 ))
120 .bind(conversation_id)
121 .bind(r.turn_index)
122 .bind(failure_type)
123 .bind(&r.retrieval_strategy)
124 .bind(&r.query_text)
125 .bind(query_len)
126 .bind(r.top_score)
127 .bind(r.confidence_threshold)
128 .bind(result_count)
129 .bind(latency_ms)
130 .bind(&r.edge_types)
131 .bind(&r.error_context)
132 .execute(&mut *tx)
133 .await?;
134 }
135 tx.commit().await?;
136 Ok(())
137 }
138
139 /// Delete records older than `retention_days` days.
140 ///
141 /// Called periodically by [`crate::RetrievalFailureLogger`]'s background task.
142 ///
143 /// # Errors
144 ///
145 /// Returns [`MemoryError`] if the DELETE fails.
146 pub async fn purge_old_retrieval_failures(
147 &self,
148 retention_days: u32,
149 ) -> Result<u64, MemoryError> {
150 let cutoff = format!(
151 "{}",
152 (chrono::Utc::now() - chrono::Duration::days(i64::from(retention_days)))
153 .format("%Y-%m-%d %H:%M:%S")
154 );
155 let rows = zeph_db::query(sql!(
156 "DELETE FROM memory_retrieval_failures WHERE created_at < ?"
157 ))
158 .bind(cutoff)
159 .execute(self.pool())
160 .await?;
161 Ok(rows.rows_affected())
162 }
163}