Skip to main content

sapphire_retrieve/
retrieve_store.rs

//! Unified retrieve store trait.
//!
//! [`RetrieveStore`] is a **synchronous** trait that abstracts over all
//! storage backends (SQLite-vec, LanceDB, and future backends such as
//! SurrealDB).
//!
//! All methods are **synchronous**.  Async backends must wrap their async
//! operations inside a dedicated Tokio runtime.
9
10use std::collections::HashMap;
11use std::path::Path;
12
13use crate::{embed::Embedder, error::Result, vector_store::VecInfo};
14
15// ── query structs ────────────────────────────────────────────────────────────
16
/// Full-text search query.
///
/// Construct one with [`FtsQuery::new`] and refine it with the chained
/// setters ([`limit`](FtsQuery::limit), [`path_prefix`](FtsQuery::path_prefix)).
/// Each setter consumes the query and returns the updated value, so the
/// result must be kept.
#[derive(Debug, Clone)]
pub struct FtsQuery<'a> {
    /// Query text.
    pub query: &'a str,
    /// Maximum number of file-level results.
    pub limit: usize,
    /// When set, restrict results to documents whose `path` starts with this
    /// absolute prefix.
    pub path_prefix: Option<&'a Path>,
}

impl<'a> FtsQuery<'a> {
    /// Creates a query for `query` with the defaults: at most 10 results and
    /// no path restriction.
    #[must_use]
    pub fn new(query: &'a str) -> Self {
        Self {
            query,
            limit: 10,
            path_prefix: None,
        }
    }

    /// Sets the maximum number of file-level results to `n`.
    #[must_use = "setters return the updated query instead of modifying it in place"]
    pub fn limit(mut self, n: usize) -> Self {
        self.limit = n;
        self
    }

    /// Restricts results to documents whose path starts with `p`.
    #[must_use = "setters return the updated query instead of modifying it in place"]
    pub fn path_prefix(mut self, p: &'a Path) -> Self {
        self.path_prefix = Some(p);
        self
    }
}
48
49/// Vector (semantic) similarity query.
50///
51/// The backend embeds `query` using `embedder` internally, so callers don't
52/// need to pre-compute the vector.
53pub struct VectorQuery<'a> {
54    pub query: &'a str,
55    pub embedder: &'a dyn Embedder,
56    pub limit: usize,
57    pub path_prefix: Option<&'a Path>,
58}
59
60impl<'a> VectorQuery<'a> {
61    pub fn new(query: &'a str, embedder: &'a dyn Embedder) -> Self {
62        Self {
63            query,
64            embedder,
65            limit: 10,
66            path_prefix: None,
67        }
68    }
69
70    pub fn limit(mut self, n: usize) -> Self {
71        self.limit = n;
72        self
73    }
74
75    pub fn path_prefix(mut self, p: &'a Path) -> Self {
76        self.path_prefix = Some(p);
77        self
78    }
79}
80
81impl std::fmt::Debug for VectorQuery<'_> {
82    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
83        f.debug_struct("VectorQuery")
84            .field("query", &self.query)
85            .field("limit", &self.limit)
86            .field("path_prefix", &self.path_prefix)
87            .finish_non_exhaustive()
88    }
89}
90
91/// Hybrid (FTS + vector) search query, merged via Reciprocal Rank Fusion.
92///
93/// When `embedder` is `None`, falls back to FTS-only.
94pub struct HybridQuery<'a> {
95    pub query: &'a str,
96    pub embedder: Option<&'a dyn Embedder>,
97    pub limit: usize,
98    pub path_prefix: Option<&'a Path>,
99    pub rrf_k: f64,
100    pub weight_fts: f64,
101    pub weight_sem: f64,
102}
103
104impl<'a> HybridQuery<'a> {
105    pub fn new(query: &'a str) -> Self {
106        Self {
107            query,
108            embedder: None,
109            limit: 10,
110            path_prefix: None,
111            rrf_k: 60.0,
112            weight_fts: 1.0,
113            weight_sem: 1.0,
114        }
115    }
116
117    pub fn embedder(mut self, e: &'a dyn Embedder) -> Self {
118        self.embedder = Some(e);
119        self
120    }
121
122    pub fn limit(mut self, n: usize) -> Self {
123        self.limit = n;
124        self
125    }
126
127    pub fn path_prefix(mut self, p: &'a Path) -> Self {
128        self.path_prefix = Some(p);
129        self
130    }
131
132    pub fn rrf_k(mut self, k: f64) -> Self {
133        self.rrf_k = k;
134        self
135    }
136
137    pub fn weight_fts(mut self, w: f64) -> Self {
138        self.weight_fts = w;
139        self
140    }
141
142    pub fn weight_sem(mut self, w: f64) -> Self {
143        self.weight_sem = w;
144        self
145    }
146}
147
148impl std::fmt::Debug for HybridQuery<'_> {
149    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
150        f.debug_struct("HybridQuery")
151            .field("query", &self.query)
152            .field("limit", &self.limit)
153            .field("path_prefix", &self.path_prefix)
154            .field("rrf_k", &self.rrf_k)
155            .field("weight_fts", &self.weight_fts)
156            .field("weight_sem", &self.weight_sem)
157            .finish_non_exhaustive()
158    }
159}
160
161// ── shared domain types ───────────────────────────────────────────────────────
162
163/// A document to be indexed for FTS and/or vector search.
164#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
165pub struct Document {
166    /// Stable identifier assigned by the caller.
167    pub id: i64,
168    /// Full body text; used only as input to the chunker when `chunks` is `None`.
169    /// Not persisted to the database.
170    pub body: String,
171    /// Absolute file path (shown in search results).
172    pub path: String,
173    /// Pre-computed text chunks with source-location ranges.
174    ///
175    /// Each element is `(line_start, line_end, embed_text)` where the line
176    /// values are 0-based and inclusive.  When `None`, the storage backend
177    /// falls back to auto-chunking `body` via [`crate::chunker::chunk_document`].
178    #[serde(default, skip_serializing_if = "Option::is_none")]
179    pub chunks: Option<Vec<(usize, usize, String)>>,
180}
181
182/// A single chunk match inside a [`FileSearchResult`].
183#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
184pub struct ChunkHit {
185    /// First source line of the matched chunk (inclusive, 0-based).
186    pub line_start: usize,
187    /// Last source line of the matched chunk (inclusive, 0-based).
188    pub line_end: usize,
189    /// The chunk's extracted text.
190    pub text: String,
191    /// Per-chunk score: FTS rank (lower = better), vector L2 distance
192    /// (lower = better), or RRF score (higher = better), depending on the
193    /// search mode.
194    pub score: f64,
195}
196
197/// File-level search result with one or more matched chunks.
198#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
199pub struct FileSearchResult {
200    pub id: i64,
201    pub path: String,
202    /// Representative score for the file (best chunk for FTS/vector,
203    /// aggregated RRF score for hybrid).
204    pub score: f64,
205    /// Matched chunks within this file, ordered by per-chunk score.
206    pub chunks: Vec<ChunkHit>,
207}
208
209// ── trait ─────────────────────────────────────────────────────────────────────
210
211/// Unified synchronous interface for retrieve storage backends.
212///
213/// Built-in implementations:
214/// - [`SqliteStore`](crate::sqlite_store::SqliteStore) — SQLite-vec backend.
215/// - [`LanceDbBackend`](crate::lancedb_store::LanceDbBackend) — LanceDB backend
216///   (requires the `lancedb-store` feature).
217pub trait RetrieveStore: Send + Sync {
218    // ── file tracking ──────────────────────────────────────────────────────────
219
220    fn file_mtimes(&self) -> Result<HashMap<String, i64>>;
221    fn upsert_file(&self, path: &str, mtime: i64) -> Result<()>;
222    fn remove_file(&self, path: &str) -> Result<()>;
223    fn file_count(&self) -> Result<u64>;
224
225    // ── document management ────────────────────────────────────────────────────
226
227    fn upsert_document(&self, doc: &Document) -> Result<()>;
228    fn remove_document(&self, id: i64) -> Result<()>;
229
230    /// Rebuild the FTS index.  Call after a batch of upserts.
231    fn rebuild_fts(&self) -> Result<()>;
232
233    fn document_ids(&self) -> Result<Vec<i64>>;
234    fn document_count(&self) -> Result<u64>;
235
236    // ── embedding ──────────────────────────────────────────────────────────────
237
238    /// Generate and store embeddings for all pending chunks.
239    fn embed_pending(
240        &self,
241        embedder: &dyn Embedder,
242        on_progress: &dyn Fn(usize, usize),
243    ) -> Result<usize>;
244
245    fn vec_info(&self) -> Result<VecInfo>;
246
247    // ── search ─────────────────────────────────────────────────────────────────
248
249    /// Full-text search at chunk granularity, grouped per file.
250    fn search_fts(&self, q: &FtsQuery<'_>) -> Result<Vec<FileSearchResult>>;
251
252    /// Semantic (vector) search at chunk granularity, grouped per file.
253    ///
254    /// The backend embeds `q.query` using `q.embedder` internally.
255    fn search_similar(&self, q: &VectorQuery<'_>) -> Result<Vec<FileSearchResult>>;
256
257    /// Hybrid search: runs FTS + vector and merges via RRF (default impl).
258    ///
259    /// If `q.embedder` is `None`, falls back to FTS-only.
260    fn search_hybrid(&self, q: &HybridQuery<'_>) -> Result<Vec<FileSearchResult>> {
261        crate::db::default_hybrid(self, q)
262    }
263}