Skip to main content

archelon_core/
vector_store.rs

1//! Vector store abstraction and sqlite-vec backend.
2//!
3//! # Architecture
4//!
5//! [`VectorStore`] is a **synchronous** trait that abstracts over the two
6//! supported vector backends:
7//!
8//! - [`SqliteVecStore`] — stores chunk vectors inside the existing SQLite
9//!   cache using the sqlite-vec extension.
10//! - `LanceDbVectorStore` (in `lancedb_store`) — stores chunk vectors in a
11//!   separate LanceDB directory; async LanceDB calls are wrapped in an
12//!   internal Tokio runtime so the trait remains sync.
13//!
14//! # Chunk identity
15//!
16//! Each chunk is identified by the pair `(entry_id, chunk_index)`.  This key
17//! is stable across SQLite cache rebuilds because `entry_id` is the CarettaId
18//! (derived from the file itself) and `chunk_index` is reproducibly derived
19//! from the paragraph order of the entry body.
20//!
21//! # Shared helpers
22//!
23//! [`embed_pending_chunks`] is the one function called by both CLI commands
24//! (`cache embed`, `entry search --semantic`).  It:
25//!
26//! 1. Queries `embedded_chunk_keys()` from the store.
27//! 2. Calls [`pending_chunks`] on the SQLite connection to collect chunks not
28//!    yet embedded.
29//! 3. Calls the embedding API in batches of 100.
30//! 4. Stores the results via `insert_embeddings()`.
31
32use std::collections::HashSet;
33
34use rusqlite::params;
35
36use crate::{
37    cache,
38    embed::Embedder,
39    error::Result,
40    journal::Journal,
41};
42
43// ── public types ──────────────────────────────────────────────────────────────
44
/// A single paragraph-level chunk derived from an entry, ready to be embedded.
///
/// A chunk is identified by the pair `(entry_id, chunk_index)`; the remaining
/// fields carry the text to embed plus denormalised entry metadata.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Chunk {
    /// Entry ID (CarettaId as i64) — stable, used as part of the chunk key.
    pub entry_id: i64,
    /// Zero-based position of this paragraph in the entry body.
    pub chunk_index: usize,
    /// Embeddable text: title prepended to the paragraph body.
    pub text: String,
    /// Denormalised entry title (for display in search results).
    pub entry_title: String,
    /// Denormalised absolute file path (for display in search results).
    pub entry_path: String,
}
59
/// A result returned by [`VectorStore::search_similar`].
///
/// Carries the matching chunk together with the denormalised metadata of the
/// entry it belongs to, so callers can display a hit without a second lookup.
#[derive(Debug, Clone, PartialEq)]
pub struct ChunkSearchResult {
    /// ID of the entry the matching chunk belongs to.
    pub entry_id: i64,
    /// Title of the entry the matching chunk belongs to.
    pub entry_title: String,
    /// File path of the entry the matching chunk belongs to.
    pub entry_path: String,
    /// Position of the matching chunk within the entry (0-based).
    pub chunk_index: usize,
    /// The text of the matching chunk.
    pub chunk_text: String,
    /// L2 distance (lower = more similar).
    pub score: f64,
}
72
/// Statistics about the vector index shown by `cache info`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct VecInfo {
    /// Embedding dimension (number of f32 values per vector).
    pub embedding_dim: u32,
    /// Number of chunks that have an embedding stored.
    pub vector_count: u64,
    /// Number of chunks that do not yet have an embedding.
    pub pending_count: u64,
}
82
83// ── trait ─────────────────────────────────────────────────────────────────────
84
/// Abstraction over a vector storage backend.
///
/// All methods are **synchronous**.  Backends that are inherently async
/// (e.g. LanceDB) wrap their async operations in an internal Tokio runtime.
///
/// Chunks are addressed by the pair `(entry_id, chunk_index)` throughout.
pub trait VectorStore {
    /// Return the `(entry_id, chunk_index)` pairs that already have embeddings
    /// stored, so callers can compute the pending set.
    fn embedded_chunk_keys(&self) -> Result<HashSet<(i64, usize)>>;

    /// Store embeddings for a batch of chunks.
    ///
    /// `chunks` and `embeddings` are parallel slices of equal length:
    /// `embeddings[i]` is the vector for `chunks[i]`.
    fn insert_embeddings(&self, chunks: &[Chunk], embeddings: &[Vec<f32>]) -> Result<()>;

    /// Find the `limit` most similar chunks to `query_vec`, ordered by
    /// ascending distance.
    fn search_similar(&self, query_vec: &[f32], limit: usize) -> Result<Vec<ChunkSearchResult>>;
}
103
104// ── SqliteVecStore ────────────────────────────────────────────────────────────
105
/// Vector store backed by the sqlite-vec extension, living inside the
/// existing SQLite cache database.
pub struct SqliteVecStore {
    // Connection owned by the store; obtained from `cache::open_cache_vec`.
    conn: rusqlite::Connection,
}
111
112impl SqliteVecStore {
113    /// Open (or create) the sqlite-vec store for `journal` with
114    /// `embedding_dim` dimensions.
115    ///
116    /// The returned store owns the underlying connection.  Call
117    /// [`Self::conn`] to pass it to [`cache::sync_cache`].
118    pub fn open(journal: &Journal, embedding_dim: u32) -> Result<Self> {
119        let conn = cache::open_cache_vec(journal, embedding_dim)?;
120        Ok(Self { conn })
121    }
122
123    /// Borrow the underlying SQLite connection.
124    ///
125    /// Needed to call [`cache::sync_cache`] with the same connection that
126    /// already has the sqlite-vec extension loaded.
127    pub fn conn(&self) -> &rusqlite::Connection {
128        &self.conn
129    }
130
131    /// Read vector index statistics for display in `cache info`.
132    pub fn vec_info(&self) -> Result<VecInfo> {
133        let embedding_dim: u32 = self
134            .conn
135            .query_row(
136                "SELECT value FROM vec_meta WHERE key = 'embedding_dim'",
137                [],
138                |row| row.get::<_, String>(0),
139            )
140            .ok()
141            .and_then(|s| s.parse().ok())
142            .unwrap_or(0);
143
144        let vector_count: u64 = self
145            .conn
146            .query_row("SELECT COUNT(*) FROM chunk_vectors", [], |row| row.get(0))
147            .unwrap_or(0);
148
149        let chunk_count: u64 = self
150            .conn
151            .query_row("SELECT COUNT(*) FROM chunks", [], |row| row.get(0))
152            .unwrap_or(0);
153
154        Ok(VecInfo {
155            embedding_dim,
156            vector_count,
157            pending_count: chunk_count.saturating_sub(vector_count),
158        })
159    }
160}
161
162impl VectorStore for SqliteVecStore {
163    fn embedded_chunk_keys(&self) -> Result<HashSet<(i64, usize)>> {
164        let mut stmt = self.conn.prepare(
165            "SELECT c.entry_id, c.chunk_index
166             FROM chunks c
167             JOIN chunk_vectors cv ON cv.chunk_id = c.id",
168        )?;
169        let keys = stmt
170            .query_map([], |row| {
171                Ok((row.get::<_, i64>(0)?, row.get::<_, i64>(1)? as usize))
172            })?
173            .filter_map(|r| r.ok())
174            .collect();
175        Ok(keys)
176    }
177
178    fn insert_embeddings(&self, chunks: &[Chunk], embeddings: &[Vec<f32>]) -> Result<()> {
179        for (chunk, emb) in chunks.iter().zip(embeddings) {
180            let chunk_id: Option<i64> = self
181                .conn
182                .query_row(
183                    "SELECT id FROM chunks WHERE entry_id = ?1 AND chunk_index = ?2",
184                    params![chunk.entry_id, chunk.chunk_index as i64],
185                    |row| row.get(0),
186                )
187                .ok();
188
189            if let Some(id) = chunk_id {
190                let blob = vec_serialize(emb);
191                self.conn.execute(
192                    "INSERT OR REPLACE INTO chunk_vectors (chunk_id, embedding) \
193                     VALUES (?1, ?2)",
194                    params![id, blob],
195                )?;
196            }
197        }
198        Ok(())
199    }
200
201    fn search_similar(&self, query_vec: &[f32], limit: usize) -> Result<Vec<ChunkSearchResult>> {
202        let blob = vec_serialize(query_vec);
203        let mut stmt = self.conn.prepare(
204            "SELECT e.id, e.title, e.path, c.chunk_index, c.text, cv.distance
205             FROM chunk_vectors cv
206             JOIN chunks c ON c.id = cv.chunk_id
207             JOIN entries e ON e.id = c.entry_id
208             WHERE cv.embedding MATCH ?1 AND k = ?2
209             ORDER BY cv.distance",
210        )?;
211        let results = stmt
212            .query_map(params![blob, limit as i64], |row| {
213                Ok(ChunkSearchResult {
214                    entry_id: row.get::<_, i64>(0)?,
215                    entry_title: row.get::<_, String>(1)?,
216                    entry_path: row.get::<_, String>(2)?,
217                    chunk_index: row.get::<_, i64>(3)? as usize,
218                    chunk_text: row.get::<_, String>(4)?,
219                    score: row.get::<_, f64>(5).unwrap_or(0.0),
220                })
221            })?
222            .collect::<rusqlite::Result<_>>()?;
223        Ok(results)
224    }
225}
226
227// ── shared helpers ────────────────────────────────────────────────────────────
228
229/// Collect all chunks from the SQLite cache that are not yet embedded.
230///
231/// Must be called **synchronously** (before any async context) because it
232/// borrows a `rusqlite::Connection` which is `!Send`.
233pub fn pending_chunks(
234    conn: &rusqlite::Connection,
235    embedded_keys: &HashSet<(i64, usize)>,
236) -> Result<Vec<Chunk>> {
237    let mut stmt = conn.prepare(
238        "SELECT c.entry_id, c.chunk_index, c.text, e.title, e.path
239         FROM chunks c
240         JOIN entries e ON e.id = c.entry_id",
241    )?;
242    let chunks = stmt
243        .query_map([], |row| {
244            Ok(Chunk {
245                entry_id: row.get::<_, i64>(0)?,
246                chunk_index: row.get::<_, i64>(1)? as usize,
247                text: row.get::<_, String>(2)?,
248                entry_title: row.get::<_, String>(3)?,
249                entry_path: row.get::<_, String>(4)?,
250            })
251        })?
252        .filter_map(|r| r.ok())
253        .filter(|c| !embedded_keys.contains(&(c.entry_id, c.chunk_index)))
254        .collect();
255    Ok(chunks)
256}
257
258/// Generate and store embeddings for all pending chunks.
259///
260/// This is the single entry point used by both `cache embed` and
261/// `entry search --semantic`.
262///
263/// 1. Queries `store.embedded_chunk_keys()` to find what is already stored.
264/// 2. Calls [`pending_chunks`] on `conn` to collect unembedded chunks.
265/// 3. Calls the embedder in batches of 100.
266/// 4. Calls `store.insert_embeddings()` for each batch.
267///
268/// Calls `on_progress(done, total)` after each batch.
269/// Returns the total number of newly embedded chunks.
270pub fn embed_pending_chunks(
271    conn: &rusqlite::Connection,
272    store: &dyn VectorStore,
273    embedder: &dyn Embedder,
274    on_progress: impl Fn(usize, usize),
275) -> Result<usize> {
276    let embedded_keys = store.embedded_chunk_keys()?;
277    let pending = pending_chunks(conn, &embedded_keys)?;
278    let total = pending.len();
279    let mut done = 0;
280
281    for batch in pending.chunks(100) {
282        let texts: Vec<&str> = batch.iter().map(|c| c.text.as_str()).collect();
283        let embeddings = embedder.embed_texts(&texts)?;
284        store.insert_embeddings(batch, &embeddings)?;
285        done += batch.len();
286        on_progress(done, total);
287    }
288
289    Ok(total)
290}
291
292// ── internal ──────────────────────────────────────────────────────────────────
293
/// Serialize a float slice to the little-endian bytes expected by sqlite-vec.
///
/// Each `f32` contributes its 4 little-endian bytes, in slice order, so the
/// output is exactly `4 * v.len()` bytes long.
pub(crate) fn vec_serialize(v: &[f32]) -> Vec<u8> {
    let mut bytes = Vec::with_capacity(v.len() * std::mem::size_of::<f32>());
    for value in v {
        bytes.extend_from_slice(&value.to_le_bytes());
    }
    bytes
}