zeph_memory/store/retrieval_failures.rs
1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4//! Persistent store for memory retrieval failure records.
5//!
6//! Failures are written via [`crate::RetrievalFailureLogger`], which batches records
7//! asynchronously and inserts them in the background to avoid blocking the
8//! recall hot path.
9
10use super::SqliteStore;
11use crate::error::MemoryError;
12use zeph_db::sql;
13
/// The ways in which a memory recall attempt can be recorded as having failed.
///
/// Every variant has a fixed textual label, available through
/// [`RetrievalFailureType::as_str`], which is the form persisted to the database.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum RetrievalFailureType {
    /// The query produced no results at all.
    NoHit,
    /// Results came back, but the best score was below the confidence threshold.
    LowConfidence,
    /// The recall operation did not finish within the configured timeout.
    Timeout,
    /// The recall backend reported an error.
    Error,
}

impl RetrievalFailureType {
    /// Maps the variant to the canonical label stored for it in the database.
    ///
    /// The returned string is a `'static` literal, so no allocation takes place.
    #[must_use]
    pub fn as_str(self) -> &'static str {
        // Persisted labels, one per variant. These end up in stored rows, so they
        // must stay stable.
        const NO_HIT: &str = "no_hit";
        const LOW_CONFIDENCE: &str = "low_confidence";
        const TIMEOUT: &str = "timeout";
        const ERROR: &str = "error";

        // Deliberately exhaustive (no `_` arm): adding a variant must fail to
        // compile here until it is given a label.
        match self {
            Self::Error => ERROR,
            Self::Timeout => TIMEOUT,
            Self::LowConfidence => LOW_CONFIDENCE,
            Self::NoHit => NO_HIT,
        }
    }
}
40
41/// A single retrieval failure event to be persisted.
42#[derive(Debug, Clone)]
43pub struct RetrievalFailureRecord {
44 /// Conversation this failure occurred in. `None` when persistence is not yet
45 /// initialized (first-turn edge case).
46 pub conversation_id: Option<crate::types::ConversationId>,
47 /// Turn counter within the conversation. Use `0` when unavailable.
48 pub turn_index: i64,
49 /// How the recall failed.
50 pub failure_type: RetrievalFailureType,
51 /// Name of the retrieval strategy that was attempted.
52 pub retrieval_strategy: String,
53 /// The query text (truncated to 512 chars by [`crate::RetrievalFailureLogger::log`]).
54 pub query_text: String,
55 /// Byte length of the original query before any truncation.
56 ///
57 /// Note: `query_text` is truncated to 512 *chars* by [`crate::RetrievalFailureLogger::log`],
58 /// so `query_len` may exceed `query_text.len()` for multibyte inputs.
59 pub query_len: usize,
60 /// Top score returned, if any results were produced.
61 pub top_score: Option<f32>,
62 /// Configured confidence threshold at failure time.
63 pub confidence_threshold: Option<f32>,
64 /// Number of results returned (0 for `NoHit`).
65 pub result_count: usize,
66 /// Wall-clock duration of the recall operation in milliseconds.
67 pub latency_ms: u64,
68 /// JSON-serialized list of graph edge types used (graph recall only).
69 pub edge_types: Option<String>,
70 /// Error message or timeout context for `Error`/`Timeout` variants.
71 ///
72 /// Truncated to 256 chars by [`crate::RetrievalFailureLogger::log`] to bound channel memory.
73 pub error_context: Option<String>,
74}
75
76impl SqliteStore {
77 /// Insert a single retrieval failure record.
78 ///
79 /// Prefer [`crate::RetrievalFailureLogger`] for hot-path inserts — this method is
80 /// intended for tests and one-off writes.
81 ///
82 /// # Errors
83 ///
84 /// Returns [`MemoryError`] if the INSERT fails.
85 pub async fn record_retrieval_failure(
86 &self,
87 r: &RetrievalFailureRecord,
88 ) -> Result<(), MemoryError> {
89 self.record_retrieval_failures_batch(std::slice::from_ref(r))
90 .await
91 }
92
93 /// Batch-insert retrieval failure records in a single transaction.
94 ///
95 /// # Errors
96 ///
97 /// Returns [`MemoryError`] if any INSERT fails.
98 pub async fn record_retrieval_failures_batch(
99 &self,
100 records: &[RetrievalFailureRecord],
101 ) -> Result<(), MemoryError> {
102 if records.is_empty() {
103 return Ok(());
104 }
105 let mut tx = zeph_db::begin_write(self.pool()).await?;
106 for r in records {
107 let conversation_id = r.conversation_id.map(|c| c.0);
108 let failure_type = r.failure_type.as_str();
109 #[allow(clippy::cast_possible_wrap)]
110 let query_len = r.query_len as i64;
111 #[allow(clippy::cast_possible_wrap)]
112 let result_count = r.result_count as i64;
113 let latency_ms = r.latency_ms.cast_signed();
114 zeph_db::query(sql!(
115 "INSERT INTO memory_retrieval_failures
116 (conversation_id, turn_index, failure_type, retrieval_strategy,
117 query_text, query_len, top_score, confidence_threshold,
118 result_count, latency_ms, edge_types, error_context)
119 VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
120 ))
121 .bind(conversation_id)
122 .bind(r.turn_index)
123 .bind(failure_type)
124 .bind(&r.retrieval_strategy)
125 .bind(&r.query_text)
126 .bind(query_len)
127 .bind(r.top_score)
128 .bind(r.confidence_threshold)
129 .bind(result_count)
130 .bind(latency_ms)
131 .bind(&r.edge_types)
132 .bind(&r.error_context)
133 .execute(&mut *tx)
134 .await?;
135 }
136 tx.commit().await?;
137 Ok(())
138 }
139
140 /// Delete records older than `retention_days` days.
141 ///
142 /// Called periodically by [`crate::RetrievalFailureLogger`]'s background task.
143 ///
144 /// # Errors
145 ///
146 /// Returns [`MemoryError`] if the DELETE fails.
147 pub async fn purge_old_retrieval_failures(
148 &self,
149 retention_days: u32,
150 ) -> Result<u64, MemoryError> {
151 let cutoff = format!(
152 "{}",
153 (chrono::Utc::now() - chrono::Duration::days(i64::from(retention_days)))
154 .format("%Y-%m-%d %H:%M:%S")
155 );
156 let rows = zeph_db::query(sql!(
157 "DELETE FROM memory_retrieval_failures WHERE created_at < ?"
158 ))
159 .bind(cutoff)
160 .execute(self.pool())
161 .await?;
162 Ok(rows.rows_affected())
163 }
164}