archelon_core/vector_store.rs
1//! Vector store abstraction and sqlite-vec backend.
2//!
3//! # Architecture
4//!
5//! [`VectorStore`] is a **synchronous** trait that abstracts over the two
6//! supported vector backends:
7//!
8//! - [`SqliteVecStore`] — stores chunk vectors inside the existing SQLite
9//! cache using the sqlite-vec extension.
10//! - `LanceDbVectorStore` (in `lancedb_store`) — stores chunk vectors in a
11//! separate LanceDB directory; async LanceDB calls are wrapped in an
12//! internal Tokio runtime so the trait remains sync.
13//!
14//! # Chunk identity
15//!
16//! Each chunk is identified by the pair `(entry_id, chunk_index)`. This key
17//! is stable across SQLite cache rebuilds because `entry_id` is the CarettaId
18//! (derived from the file itself) and `chunk_index` is reproducibly derived
19//! from the paragraph order of the entry body.
20//!
21//! # Shared helpers
22//!
23//! [`embed_pending_chunks`] is the one function called by both CLI commands
24//! (`cache embed`, `entry search --semantic`). It:
25//!
26//! 1. Queries `embedded_chunk_keys()` from the store.
27//! 2. Calls [`pending_chunks`] on the SQLite connection to collect chunks not
28//! yet embedded.
29//! 3. Calls the embedding API in batches of 100.
30//! 4. Stores the results via `insert_embeddings()`.
31
32use std::collections::HashSet;
33
34use rusqlite::params;
35
36use crate::{
37 cache,
38 embed::Embedder,
39 error::Result,
40 journal::Journal,
41};
42
43// ── public types ──────────────────────────────────────────────────────────────
44
/// One paragraph of an entry, packaged with everything needed to embed it
/// and to show it in search results.
#[derive(Clone, Debug)]
pub struct Chunk {
    /// ID of the owning entry (CarettaId as `i64`); first half of the chunk key.
    pub entry_id: i64,
    /// Zero-based paragraph position within the entry body; second half of
    /// the chunk key.
    pub chunk_index: usize,
    /// Text sent to the embedder: the entry title prepended to the paragraph body.
    pub text: String,
    /// Copy of the entry title, kept here so search results can display it.
    pub entry_title: String,
    /// Copy of the entry's absolute file path, kept here so search results
    /// can display it.
    pub entry_path: String,
}
59
/// A single hit returned by [`VectorStore::search_similar`].
///
/// Derives `Debug` and `Clone`, matching [`Chunk`], so hits can be logged,
/// inspected in tests, and copied by callers.
#[derive(Debug, Clone)]
pub struct ChunkSearchResult {
    /// ID of the entry that contains the matching chunk.
    pub entry_id: i64,
    /// Title of the entry that contains the matching chunk.
    pub entry_title: String,
    /// File path of the entry that contains the matching chunk.
    pub entry_path: String,
    /// Position of the matching chunk within the entry (0-based).
    pub chunk_index: usize,
    /// The text of the matching chunk.
    pub chunk_text: String,
    /// L2 distance (lower = more similar).
    pub score: f64,
}
72
/// Statistics about the vector index shown by `cache info`.
///
/// Derives `Debug` and `Clone` so the snapshot can be logged and compared
/// field-by-field in tests.
#[derive(Debug, Clone)]
pub struct VecInfo {
    /// Embedding dimension (number of f32 values per vector).
    pub embedding_dim: u32,
    /// Number of chunks that have an embedding stored.
    pub vector_count: u64,
    /// Number of chunks that do not yet have an embedding.
    pub pending_count: u64,
}
82
83// ── trait ─────────────────────────────────────────────────────────────────────
84
85/// Abstraction over a vector storage backend.
86///
87/// All methods are **synchronous**. Backends that are inherently async
88/// (e.g. LanceDB) wrap their async operations in an internal Tokio runtime.
89pub trait VectorStore {
90 /// Return the `(entry_id, chunk_index)` pairs that already have embeddings
91 /// stored, so callers can compute the pending set.
92 fn embedded_chunk_keys(&self) -> Result<HashSet<(i64, usize)>>;
93
94 /// Store embeddings for a batch of chunks.
95 ///
96 /// `chunks` and `embeddings` are parallel slices of equal length.
97 fn insert_embeddings(&self, chunks: &[Chunk], embeddings: &[Vec<f32>]) -> Result<()>;
98
99 /// Find the `limit` most similar chunks to `query_vec`, ordered by
100 /// ascending distance.
101 fn search_similar(&self, query_vec: &[f32], limit: usize) -> Result<Vec<ChunkSearchResult>>;
102}
103
104// ── SqliteVecStore ────────────────────────────────────────────────────────────
105
106/// Vector store backed by the sqlite-vec extension, living inside the
107/// existing SQLite cache database.
108pub struct SqliteVecStore {
109 conn: rusqlite::Connection,
110}
111
112impl SqliteVecStore {
113 /// Open (or create) the sqlite-vec store for `journal` with
114 /// `embedding_dim` dimensions.
115 ///
116 /// The returned store owns the underlying connection. Call
117 /// [`Self::conn`] to pass it to [`cache::sync_cache`].
118 pub fn open(journal: &Journal, embedding_dim: u32) -> Result<Self> {
119 let conn = cache::open_cache_vec(journal, embedding_dim)?;
120 Ok(Self { conn })
121 }
122
123 /// Borrow the underlying SQLite connection.
124 ///
125 /// Needed to call [`cache::sync_cache`] with the same connection that
126 /// already has the sqlite-vec extension loaded.
127 pub fn conn(&self) -> &rusqlite::Connection {
128 &self.conn
129 }
130
131 /// Read vector index statistics for display in `cache info`.
132 pub fn vec_info(&self) -> Result<VecInfo> {
133 let embedding_dim: u32 = self
134 .conn
135 .query_row(
136 "SELECT value FROM vec_meta WHERE key = 'embedding_dim'",
137 [],
138 |row| row.get::<_, String>(0),
139 )
140 .ok()
141 .and_then(|s| s.parse().ok())
142 .unwrap_or(0);
143
144 let vector_count: u64 = self
145 .conn
146 .query_row("SELECT COUNT(*) FROM chunk_vectors", [], |row| row.get(0))
147 .unwrap_or(0);
148
149 let chunk_count: u64 = self
150 .conn
151 .query_row("SELECT COUNT(*) FROM chunks", [], |row| row.get(0))
152 .unwrap_or(0);
153
154 Ok(VecInfo {
155 embedding_dim,
156 vector_count,
157 pending_count: chunk_count.saturating_sub(vector_count),
158 })
159 }
160}
161
162impl VectorStore for SqliteVecStore {
163 fn embedded_chunk_keys(&self) -> Result<HashSet<(i64, usize)>> {
164 let mut stmt = self.conn.prepare(
165 "SELECT c.entry_id, c.chunk_index
166 FROM chunks c
167 JOIN chunk_vectors cv ON cv.chunk_id = c.id",
168 )?;
169 let keys = stmt
170 .query_map([], |row| {
171 Ok((row.get::<_, i64>(0)?, row.get::<_, i64>(1)? as usize))
172 })?
173 .filter_map(|r| r.ok())
174 .collect();
175 Ok(keys)
176 }
177
178 fn insert_embeddings(&self, chunks: &[Chunk], embeddings: &[Vec<f32>]) -> Result<()> {
179 for (chunk, emb) in chunks.iter().zip(embeddings) {
180 let chunk_id: Option<i64> = self
181 .conn
182 .query_row(
183 "SELECT id FROM chunks WHERE entry_id = ?1 AND chunk_index = ?2",
184 params![chunk.entry_id, chunk.chunk_index as i64],
185 |row| row.get(0),
186 )
187 .ok();
188
189 if let Some(id) = chunk_id {
190 let blob = vec_serialize(emb);
191 self.conn.execute(
192 "INSERT OR REPLACE INTO chunk_vectors (chunk_id, embedding) \
193 VALUES (?1, ?2)",
194 params![id, blob],
195 )?;
196 }
197 }
198 Ok(())
199 }
200
201 fn search_similar(&self, query_vec: &[f32], limit: usize) -> Result<Vec<ChunkSearchResult>> {
202 let blob = vec_serialize(query_vec);
203 let mut stmt = self.conn.prepare(
204 "SELECT e.id, e.title, e.path, c.chunk_index, c.text, cv.distance
205 FROM chunk_vectors cv
206 JOIN chunks c ON c.id = cv.chunk_id
207 JOIN entries e ON e.id = c.entry_id
208 WHERE cv.embedding MATCH ?1 AND k = ?2
209 ORDER BY cv.distance",
210 )?;
211 let results = stmt
212 .query_map(params![blob, limit as i64], |row| {
213 Ok(ChunkSearchResult {
214 entry_id: row.get::<_, i64>(0)?,
215 entry_title: row.get::<_, String>(1)?,
216 entry_path: row.get::<_, String>(2)?,
217 chunk_index: row.get::<_, i64>(3)? as usize,
218 chunk_text: row.get::<_, String>(4)?,
219 score: row.get::<_, f64>(5).unwrap_or(0.0),
220 })
221 })?
222 .collect::<rusqlite::Result<_>>()?;
223 Ok(results)
224 }
225}
226
227// ── shared helpers ────────────────────────────────────────────────────────────
228
229/// Collect all chunks from the SQLite cache that are not yet embedded.
230///
231/// Must be called **synchronously** (before any async context) because it
232/// borrows a `rusqlite::Connection` which is `!Send`.
233pub fn pending_chunks(
234 conn: &rusqlite::Connection,
235 embedded_keys: &HashSet<(i64, usize)>,
236) -> Result<Vec<Chunk>> {
237 let mut stmt = conn.prepare(
238 "SELECT c.entry_id, c.chunk_index, c.text, e.title, e.path
239 FROM chunks c
240 JOIN entries e ON e.id = c.entry_id",
241 )?;
242 let chunks = stmt
243 .query_map([], |row| {
244 Ok(Chunk {
245 entry_id: row.get::<_, i64>(0)?,
246 chunk_index: row.get::<_, i64>(1)? as usize,
247 text: row.get::<_, String>(2)?,
248 entry_title: row.get::<_, String>(3)?,
249 entry_path: row.get::<_, String>(4)?,
250 })
251 })?
252 .filter_map(|r| r.ok())
253 .filter(|c| !embedded_keys.contains(&(c.entry_id, c.chunk_index)))
254 .collect();
255 Ok(chunks)
256}
257
258/// Generate and store embeddings for all pending chunks.
259///
260/// This is the single entry point used by both `cache embed` and
261/// `entry search --semantic`.
262///
263/// 1. Queries `store.embedded_chunk_keys()` to find what is already stored.
264/// 2. Calls [`pending_chunks`] on `conn` to collect unembedded chunks.
265/// 3. Calls the embedder in batches of 100.
266/// 4. Calls `store.insert_embeddings()` for each batch.
267///
268/// Calls `on_progress(done, total)` after each batch.
269/// Returns the total number of newly embedded chunks.
270pub fn embed_pending_chunks(
271 conn: &rusqlite::Connection,
272 store: &dyn VectorStore,
273 embedder: &dyn Embedder,
274 on_progress: impl Fn(usize, usize),
275) -> Result<usize> {
276 let embedded_keys = store.embedded_chunk_keys()?;
277 let pending = pending_chunks(conn, &embedded_keys)?;
278 let total = pending.len();
279 let mut done = 0;
280
281 for batch in pending.chunks(100) {
282 let texts: Vec<&str> = batch.iter().map(|c| c.text.as_str()).collect();
283 let embeddings = embedder.embed_texts(&texts)?;
284 store.insert_embeddings(batch, &embeddings)?;
285 done += batch.len();
286 on_progress(done, total);
287 }
288
289 Ok(total)
290}
291
292// ── internal ──────────────────────────────────────────────────────────────────
293
/// Encode `v` as the byte blob sqlite-vec expects: every `f32` written as
/// its four little-endian bytes, in slice order.
pub(crate) fn vec_serialize(v: &[f32]) -> Vec<u8> {
    // Output size is known up front: one f32-sized group of bytes per element.
    let mut bytes = Vec::with_capacity(v.len() * std::mem::size_of::<f32>());
    for value in v {
        bytes.extend_from_slice(&value.to_le_bytes());
    }
    bytes
}
297}