Skip to main content

repograph_core/search/
index.rs

1//! The SQLite-backed search store.
2//!
3//! One central database (a `repo` column on every row) so a single query spans
4//! all registered repos. FTS5 provides BM25 lexical search; a `vectors` table
5//! holds Float32 embedding BLOBs for brute-force cosine. Lexical and vector
6//! rankings are merged by reciprocal-rank fusion in [`fuse`]. Embeddings are
7//! supplied through the [`Embedder`] trait so this module never depends on the
8//! `fastembed` crate directly.
9
10use std::collections::{HashMap, HashSet};
11use std::path::Path;
12
13use rusqlite::{Connection, OpenFlags, params, params_from_iter};
14
15use crate::error::RepographError;
16use crate::search::chunk::{Chunk, TrackedFile, chunk_file};
17
/// Bumped whenever the on-disk schema changes shape. A mismatch triggers a
/// clean rebuild rather than a fragile migration — the index is a derived
/// artifact, cheap to recreate. The value is stored in the `meta` table under
/// the `schema_version` key and compared as a string.
pub const SCHEMA_VERSION: &str = "1";

/// Reciprocal-rank-fusion constant. 60 is the value from the original RRF paper
/// and the de-facto default; it damps the contribution of low-ranked hits.
/// [`fuse`] uses it as the `k` in `1 / (k + rank + 1)`, with `rank` 0-based.
const RRF_K: f64 = 60.0;
26
/// An embedding backend. Implemented by the (feature-gated) `embed` module; the
/// store takes it as a trait object so the always-on lexical path pulls in no
/// embedding dependency.
///
/// The store only keeps a batch whose vector count equals the number of input
/// texts; any other result is logged and the file is indexed without vectors.
pub trait Embedder {
    /// Stable identifier of the model, stored alongside vectors so a model
    /// change invalidates the vector segment.
    fn model_id(&self) -> &str;

    /// Embed a batch of texts into vectors. Returns a human-readable message on
    /// failure; the caller degrades to lexical rather than aborting.
    ///
    /// The result is expected to hold one vector per entry of `texts`, in the
    /// same order (vectors are paired with chunks by position).
    ///
    /// # Errors
    ///
    /// Returns `Err(message)` when the backend cannot produce embeddings.
    fn embed(&mut self, texts: &[String]) -> Result<Vec<Vec<f32>>, String>;
}
43
/// Per-repo outcome of [`Store::reconcile_repo`].
///
/// All counters start at zero (`Default`) and are incremented once per file
/// during a single reconcile pass.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct RepoStats {
    /// Files (re)chunked because they were new or changed — or, when an
    /// embedder is supplied, because they had no stored vectors yet.
    pub files_indexed: usize,
    /// Files left untouched because their content hash matched.
    pub files_unchanged: usize,
    /// Files dropped from the index because they are no longer tracked.
    pub files_purged: usize,
}
54
/// A chunk row materialized for output.
///
/// Produced by [`Store::fetch_chunks`] from the `chunks` table.
#[derive(Debug, Clone)]
pub struct ChunkRow {
    /// Name of the repo the chunk belongs to (the `repo` column).
    pub repo: String,
    /// Path of the source file as stored in the `path` column.
    pub path: String,
    /// First line of the chunk (the `start_line` column); a stored value that
    /// does not fit in `u32` is reported as `u32::MAX`.
    pub start_line: u32,
    /// The chunk's text (the `content` column).
    pub content: String,
}
63
/// Handle to the search database.
///
/// Wraps a single `rusqlite` connection. Obtain one with
/// [`Store::open_for_build`] (read-write, creates the schema) or
/// [`Store::open_existing`] (read-only, requires a matching schema version).
pub struct Store {
    /// The underlying `SQLite` connection; every query goes through it.
    conn: Connection,
}
68
69impl std::fmt::Debug for Store {
70    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
71        f.debug_struct("Store").finish_non_exhaustive()
72    }
73}
74
75impl Store {
76    /// Open the index for *building*, creating the file and schema if absent.
77    /// A schema-version mismatch wipes and recreates all tables.
78    ///
79    /// # Errors
80    ///
81    /// Returns [`RepographError::Index`] on any `SQLite` failure.
82    pub fn open_for_build(db_path: &Path) -> Result<Self, RepographError> {
83        if let Some(parent) = db_path.parent() {
84            fs_err::create_dir_all(parent)?;
85        }
86        let conn = Connection::open(db_path)?;
87        let store = Self { conn };
88        store.ensure_schema()?;
89        Ok(store)
90    }
91
92    /// Open an *existing* index read-only. Both callers (`search`,
93    /// `index_health`) only query, so a read-only handle avoids write-lock
94    /// contention with a concurrent `repograph index` and works on read-only
95    /// mounts. Returns [`RepographError::IndexMissing`] (exit 3) when the file
96    /// does not exist, and [`RepographError::Index`] (exit 1) when it exists but
97    /// cannot be opened or is the wrong schema.
98    ///
99    /// # Errors
100    ///
101    /// See above.
102    pub fn open_existing(db_path: &Path) -> Result<Self, RepographError> {
103        if !db_path.is_file() {
104            return Err(RepographError::IndexMissing);
105        }
106        let conn = Connection::open_with_flags(db_path, OpenFlags::SQLITE_OPEN_READ_ONLY)?;
107        let store = Self { conn };
108        let version: Option<String> = store.meta_get("schema_version")?;
109        match version.as_deref() {
110            Some(v) if v == SCHEMA_VERSION => Ok(store),
111            Some(other) => Err(RepographError::Index(format!(
112                "index schema version {other} is not readable by this build (expected {SCHEMA_VERSION}); run `repograph index` to rebuild"
113            ))),
114            None => Err(RepographError::Index(
115                "index is missing its schema marker (corrupt); run `repograph index` to rebuild"
116                    .to_string(),
117            )),
118        }
119    }
120
121    fn ensure_schema(&self) -> Result<(), RepographError> {
122        self.conn.execute_batch(
123            "CREATE TABLE IF NOT EXISTS meta (key TEXT PRIMARY KEY, value TEXT NOT NULL)",
124        )?;
125        let version: Option<String> = self.meta_get("schema_version")?;
126        if version.as_deref() == Some(SCHEMA_VERSION) {
127            return Ok(());
128        }
129        if version.is_some() {
130            self.drop_all()?;
131        }
132        self.create_all()?;
133        self.meta_set("schema_version", SCHEMA_VERSION)?;
134        Ok(())
135    }
136
137    fn drop_all(&self) -> Result<(), RepographError> {
138        self.conn.execute_batch(
139            "DROP TABLE IF EXISTS chunks_fts;
140             DROP TABLE IF EXISTS vectors;
141             DROP TABLE IF EXISTS chunks;
142             DROP TABLE IF EXISTS files;
143             DROP TABLE IF EXISTS repos;",
144        )?;
145        Ok(())
146    }
147
148    fn create_all(&self) -> Result<(), RepographError> {
149        self.conn.execute_batch(
150            "CREATE TABLE IF NOT EXISTS repos (
151                 repo TEXT PRIMARY KEY,
152                 indexed_commit TEXT
153             );
154             CREATE TABLE IF NOT EXISTS files (
155                 repo TEXT NOT NULL,
156                 path TEXT NOT NULL,
157                 content_hash TEXT NOT NULL,
158                 PRIMARY KEY (repo, path)
159             );
160             CREATE TABLE IF NOT EXISTS chunks (
161                 id INTEGER PRIMARY KEY AUTOINCREMENT,
162                 repo TEXT NOT NULL,
163                 path TEXT NOT NULL,
164                 start_line INTEGER NOT NULL,
165                 end_line INTEGER NOT NULL,
166                 content TEXT NOT NULL,
167                 prefix TEXT NOT NULL
168             );
169             CREATE INDEX IF NOT EXISTS idx_chunks_repo_path ON chunks(repo, path);
170             CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(text, chunk_id UNINDEXED);
171             CREATE TABLE IF NOT EXISTS vectors (
172                 chunk_id INTEGER PRIMARY KEY,
173                 embedding BLOB NOT NULL,
174                 model TEXT NOT NULL
175             );",
176        )?;
177        Ok(())
178    }
179
180    fn meta_get(&self, key: &str) -> Result<Option<String>, RepographError> {
181        match self
182            .conn
183            .query_row("SELECT value FROM meta WHERE key = ?1", [key], |r| {
184                r.get::<_, String>(0)
185            }) {
186            Ok(v) => Ok(Some(v)),
187            Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None),
188            Err(e) => Err(e.into()),
189        }
190    }
191
192    fn meta_set(&self, key: &str, value: &str) -> Result<(), RepographError> {
193        self.conn.execute(
194            "INSERT INTO meta(key, value) VALUES(?1, ?2)
195             ON CONFLICT(key) DO UPDATE SET value = excluded.value",
196            params![key, value],
197        )?;
198        Ok(())
199    }
200
201    /// If `model_id` differs from the model recorded in the index, drop every
202    /// vector so the segment never mixes embedding spaces, then record the new
203    /// model. Call once before reconciling with an embedder.
204    ///
205    /// # Errors
206    ///
207    /// Returns [`RepographError::Index`] on `SQLite` failure.
208    pub fn ensure_model(&self, model_id: &str) -> Result<(), RepographError> {
209        let current: Option<String> = self.meta_get("model_id")?;
210        if current.as_deref() != Some(model_id) {
211            self.conn.execute("DELETE FROM vectors", [])?;
212            self.meta_set("model_id", model_id)?;
213        }
214        Ok(())
215    }
216
217    /// Whether any embeddings are stored — drives whether semantic retrieval can
218    /// run at query time.
219    ///
220    /// # Errors
221    ///
222    /// Returns [`RepographError::Index`] on `SQLite` failure.
223    pub fn has_vectors(&self) -> Result<bool, RepographError> {
224        let n: i64 = self
225            .conn
226            .query_row("SELECT COUNT(*) FROM vectors", [], |r| r.get(0))?;
227        Ok(n > 0)
228    }
229
230    /// The per-repo indexed commit recorded at the last build.
231    ///
232    /// # Errors
233    ///
234    /// Returns [`RepographError::Index`] on `SQLite` failure.
235    pub fn indexed_commits(&self) -> Result<HashMap<String, Option<String>>, RepographError> {
236        let mut stmt = self
237            .conn
238            .prepare("SELECT repo, indexed_commit FROM repos")?;
239        let rows = stmt.query_map([], |r| {
240            Ok((r.get::<_, String>(0)?, r.get::<_, Option<String>>(1)?))
241        })?;
242        let mut out = HashMap::new();
243        for row in rows {
244            let (repo, commit) = row?;
245            out.insert(repo, commit);
246        }
247        Ok(out)
248    }
249
    /// Reconcile one repo's tracked files against the index in a single
    /// transaction: re-chunk new/changed files, purge files no longer tracked,
    /// and record the indexed commit. When `embedder` is supplied, changed
    /// chunks are embedded and their vectors written; an embed failure for a
    /// file degrades that file to lexical-only (`embed_chunks` logs a warning).
    ///
    /// * `repo` — name under which every row of this repo is stored.
    /// * `files` — the complete set of currently tracked files; any indexed path
    ///   not listed here is purged.
    /// * `head_commit` — commit recorded in the `repos` table for this repo.
    /// * `embedder` — optional embedding backend; `None` means lexical-only.
    ///
    /// Returns per-file counters for this pass.
    ///
    /// # Errors
    ///
    /// Returns [`RepographError::Index`] on `SQLite` failure.
    pub fn reconcile_repo(
        &mut self,
        repo: &str,
        files: &[TrackedFile],
        head_commit: Option<&str>,
        mut embedder: Option<&mut dyn Embedder>,
    ) -> Result<RepoStats, RepographError> {
        let mut stats = RepoStats::default();
        // Snapshot of what the index currently holds for this repo, read before
        // the write transaction is opened.
        let existing = self.existing_hashes(repo)?;
        let embedding = embedder.is_some();
        // When embedding, a file whose content is unchanged but which carries no
        // stored vectors must still be reprocessed. Otherwise upgrading a lexical
        // index with `--semantic` (or switching models, which drops every vector
        // via `ensure_model`) would skip all unchanged files and leave the index
        // permanently half-embedded — the `--semantic` flag would silently no-op.
        let vectored: HashSet<String> = if embedding {
            self.paths_with_vectors(repo)?
        } else {
            HashSet::new()
        };
        let current: HashSet<&str> = files.iter().map(|f| f.path.as_str()).collect();

        let tx = self.conn.transaction()?;

        // Pass 1: purge every indexed path that is no longer tracked.
        for path in existing.keys() {
            if !current.contains(path.as_str()) {
                delete_file_chunks(&tx, repo, path)?;
                tx.execute(
                    "DELETE FROM files WHERE repo = ?1 AND path = ?2",
                    params![repo, path],
                )?;
                stats.files_purged += 1;
            }
        }

        // Pass 2: (re)index files that are new, changed, or missing vectors.
        for f in files {
            let unchanged = existing.get(&f.path) == Some(&f.content_hash);
            // A file that ended up without vectors (e.g. its embedding failed)
            // is absent from `vectored`, so it is retried on every embedding run.
            let needs_vectors = embedding && !vectored.contains(&f.path);
            if unchanged && !needs_vectors {
                stats.files_unchanged += 1;
                continue;
            }
            // Replace, never append: drop the file's old chunks first.
            delete_file_chunks(&tx, repo, &f.path)?;
            let chunks = chunk_file(repo, &f.path, &f.text);
            // Reborrow per iteration so the mutable borrow of `embedder` ends
            // each loop pass. `match` (not `.map()`) is required: the closure
            // form ties the reborrow to the whole fn and trips the borrow check.
            #[allow(clippy::option_if_let_else)]
            let emb: Option<&mut dyn Embedder> = match &mut embedder {
                Some(e) => Some(&mut **e),
                None => None,
            };
            let embeddings = embed_chunks(emb, &chunks);
            insert_chunks(&tx, repo, &chunks, embeddings.as_ref())?;
            tx.execute(
                "INSERT INTO files(repo, path, content_hash) VALUES(?1, ?2, ?3)
                 ON CONFLICT(repo, path) DO UPDATE SET content_hash = excluded.content_hash",
                params![repo, f.path, f.content_hash],
            )?;
            stats.files_indexed += 1;
        }

        // Record (or overwrite) the commit this repo was indexed at.
        tx.execute(
            "INSERT INTO repos(repo, indexed_commit) VALUES(?1, ?2)
             ON CONFLICT(repo) DO UPDATE SET indexed_commit = excluded.indexed_commit",
            params![repo, head_commit],
        )?;
        tx.commit()?;
        Ok(stats)
    }
329
330    /// Repo-relative paths that currently have at least one stored embedding —
331    /// used to detect files that are lexically indexed but not yet vectored.
332    fn paths_with_vectors(&self, repo: &str) -> Result<HashSet<String>, RepographError> {
333        let mut stmt = self.conn.prepare(
334            "SELECT DISTINCT c.path FROM chunks c JOIN vectors v ON v.chunk_id = c.id
335             WHERE c.repo = ?1",
336        )?;
337        let rows = stmt.query_map([repo], |r| r.get::<_, String>(0))?;
338        let mut out = HashSet::new();
339        for row in rows {
340            out.insert(row?);
341        }
342        Ok(out)
343    }
344
345    fn existing_hashes(&self, repo: &str) -> Result<HashMap<String, String>, RepographError> {
346        let mut stmt = self
347            .conn
348            .prepare("SELECT path, content_hash FROM files WHERE repo = ?1")?;
349        let rows = stmt.query_map([repo], |r| {
350            Ok((r.get::<_, String>(0)?, r.get::<_, String>(1)?))
351        })?;
352        let mut out = HashMap::new();
353        for row in rows {
354            let (path, hash) = row?;
355            out.insert(path, hash);
356        }
357        Ok(out)
358    }
359
360    /// Lexical (BM25) candidate chunk ids, best-first. `repos` (when non-empty)
361    /// restricts results to those repos. Returns an empty vec when the query
362    /// yields no usable search tokens.
363    ///
364    /// # Errors
365    ///
366    /// Returns [`RepographError::Index`] on `SQLite` failure.
367    pub fn search_lexical(
368        &self,
369        query: &str,
370        repos: &[String],
371        pool: usize,
372    ) -> Result<Vec<i64>, RepographError> {
373        let Some(match_expr) = fts_query(query) else {
374            return Ok(Vec::new());
375        };
376        let pool_i = i64::try_from(pool).unwrap_or(i64::MAX);
377        // FTS5's MATCH and bm25() must reference the virtual table by its real
378        // name, not a join alias, so `chunks_fts` is spelled out here.
379        let mut sql = String::from(
380            "SELECT chunks.id FROM chunks_fts JOIN chunks ON chunks.id = chunks_fts.chunk_id
381             WHERE chunks_fts MATCH ?1",
382        );
383        let mut binds: Vec<rusqlite::types::Value> = vec![match_expr.into()];
384        if !repos.is_empty() {
385            let placeholders = repo_placeholders(repos.len(), binds.len() + 1);
386            sql.push_str(" AND chunks.repo IN (");
387            sql.push_str(&placeholders);
388            sql.push(')');
389            for r in repos {
390                binds.push(r.clone().into());
391            }
392        }
393        sql.push_str(" ORDER BY bm25(chunks_fts) LIMIT ");
394        sql.push_str(&pool_i.to_string());
395        let mut stmt = self.conn.prepare(&sql)?;
396        let rows = stmt.query_map(params_from_iter(binds), |r| r.get::<_, i64>(0))?;
397        let mut ids = Vec::new();
398        for row in rows {
399            ids.push(row?);
400        }
401        Ok(ids)
402    }
403
404    /// Vector (cosine) candidate chunk ids, best-first, computed by brute force
405    /// over the stored embeddings (optionally restricted to `repos`).
406    ///
407    /// # Errors
408    ///
409    /// Returns [`RepographError::Index`] on `SQLite` failure.
410    pub fn search_vectors(
411        &self,
412        query_embedding: &[f32],
413        repos: &[String],
414        pool: usize,
415    ) -> Result<Vec<i64>, RepographError> {
416        let mut sql = String::from(
417            "SELECT v.chunk_id, v.embedding FROM vectors v JOIN chunks c ON c.id = v.chunk_id",
418        );
419        let mut binds: Vec<rusqlite::types::Value> = Vec::new();
420        if !repos.is_empty() {
421            let placeholders = repo_placeholders(repos.len(), 1);
422            sql.push_str(" WHERE c.repo IN (");
423            sql.push_str(&placeholders);
424            sql.push(')');
425            for r in repos {
426                binds.push(r.clone().into());
427            }
428        }
429        let mut stmt = self.conn.prepare(&sql)?;
430        let rows = stmt.query_map(params_from_iter(binds), |r| {
431            Ok((r.get::<_, i64>(0)?, r.get::<_, Vec<u8>>(1)?))
432        })?;
433        let mut scored: Vec<(i64, f32)> = Vec::new();
434        for row in rows {
435            let (id, blob) = row?;
436            let v = blob_to_vec(&blob);
437            scored.push((id, cosine(query_embedding, &v)));
438        }
439        scored.sort_by(|a, b| b.1.total_cmp(&a.1));
440        scored.truncate(pool);
441        Ok(scored.into_iter().map(|(id, _)| id).collect())
442    }
443
444    /// Fetch chunk rows for the given ids, keyed by id.
445    ///
446    /// # Errors
447    ///
448    /// Returns [`RepographError::Index`] on `SQLite` failure.
449    pub fn fetch_chunks(&self, ids: &[i64]) -> Result<HashMap<i64, ChunkRow>, RepographError> {
450        if ids.is_empty() {
451            return Ok(HashMap::new());
452        }
453        let placeholders = repo_placeholders(ids.len(), 1);
454        let sql = format!(
455            "SELECT id, repo, path, start_line, content FROM chunks WHERE id IN ({placeholders})"
456        );
457        let mut stmt = self.conn.prepare(&sql)?;
458        let binds: Vec<rusqlite::types::Value> = ids.iter().map(|i| (*i).into()).collect();
459        let rows = stmt.query_map(params_from_iter(binds), |r| {
460            Ok((
461                r.get::<_, i64>(0)?,
462                ChunkRow {
463                    repo: r.get::<_, String>(1)?,
464                    path: r.get::<_, String>(2)?,
465                    start_line: u32::try_from(r.get::<_, i64>(3)?).unwrap_or(u32::MAX),
466                    content: r.get::<_, String>(4)?,
467                },
468            ))
469        })?;
470        let mut out = HashMap::new();
471        for row in rows {
472            let (id, chunk) = row?;
473            out.insert(id, chunk);
474        }
475        Ok(out)
476    }
477}
478
479/// Merge ranked candidate lists by reciprocal-rank fusion, returning chunk ids
480/// with their fused scores, best-first. An id appearing in multiple lists
481/// accumulates contributions from each.
482#[must_use]
483pub fn fuse(lists: &[&[i64]]) -> Vec<(i64, f64)> {
484    let mut scores: HashMap<i64, f64> = HashMap::new();
485    for list in lists {
486        for (rank, id) in list.iter().enumerate() {
487            #[allow(clippy::cast_precision_loss)]
488            let contribution = 1.0 / (RRF_K + (rank as f64) + 1.0);
489            *scores.entry(*id).or_insert(0.0) += contribution;
490        }
491    }
492    let mut fused: Vec<(i64, f64)> = scores.into_iter().collect();
493    fused.sort_by(|a, b| b.1.total_cmp(&a.1).then(a.0.cmp(&b.0)));
494    fused
495}
496
497fn embed_chunks(
498    embedder: Option<&mut dyn Embedder>,
499    chunks: &[Chunk],
500) -> Option<(Vec<Vec<f32>>, String)> {
501    let embedder = embedder?;
502    if chunks.is_empty() {
503        return None;
504    }
505    let texts: Vec<String> = chunks.iter().map(Chunk::index_text).collect();
506    let model = embedder.model_id().to_string();
507    match embedder.embed(&texts) {
508        Ok(vectors) if vectors.len() == chunks.len() => Some((vectors, model)),
509        Ok(_) => {
510            tracing::warn!("embedder returned a vector count != chunk count; skipping vectors");
511            None
512        }
513        Err(msg) => {
514            tracing::warn!(error = %msg, "embedding failed; this file is lexical-only");
515            None
516        }
517    }
518}
519
520fn delete_file_chunks(
521    tx: &rusqlite::Transaction<'_>,
522    repo: &str,
523    path: &str,
524) -> Result<(), RepographError> {
525    tx.execute(
526        "DELETE FROM chunks_fts WHERE chunk_id IN
527             (SELECT id FROM chunks WHERE repo = ?1 AND path = ?2)",
528        params![repo, path],
529    )?;
530    tx.execute(
531        "DELETE FROM vectors WHERE chunk_id IN
532             (SELECT id FROM chunks WHERE repo = ?1 AND path = ?2)",
533        params![repo, path],
534    )?;
535    tx.execute(
536        "DELETE FROM chunks WHERE repo = ?1 AND path = ?2",
537        params![repo, path],
538    )?;
539    Ok(())
540}
541
542fn insert_chunks(
543    tx: &rusqlite::Transaction<'_>,
544    repo: &str,
545    chunks: &[Chunk],
546    embeddings: Option<&(Vec<Vec<f32>>, String)>,
547) -> Result<(), RepographError> {
548    for (i, chunk) in chunks.iter().enumerate() {
549        tx.execute(
550            "INSERT INTO chunks(repo, path, start_line, end_line, content, prefix)
551             VALUES(?1, ?2, ?3, ?4, ?5, ?6)",
552            params![
553                repo,
554                chunk.path,
555                chunk.start_line,
556                chunk.end_line,
557                chunk.content,
558                chunk.prefix
559            ],
560        )?;
561        let chunk_id = tx.last_insert_rowid();
562        tx.execute(
563            "INSERT INTO chunks_fts(text, chunk_id) VALUES(?1, ?2)",
564            params![chunk.index_text(), chunk_id],
565        )?;
566        if let Some((vectors, model)) = embeddings {
567            if let Some(v) = vectors.get(i) {
568                tx.execute(
569                    "INSERT INTO vectors(chunk_id, embedding, model) VALUES(?1, ?2, ?3)",
570                    params![chunk_id, vec_to_blob(v), model],
571                )?;
572            }
573        }
574    }
575    Ok(())
576}
577
/// Turn a free-form query into an FTS5 MATCH expression.
///
/// The query is split on every non-alphanumeric character; each non-empty piece
/// is lowercased, duplicates are dropped (first occurrence wins), and the
/// survivors are double-quoted and joined with ` OR `. Yields `None` when no
/// token remains.
fn fts_query(query: &str) -> Option<String> {
    let mut seen: HashSet<String> = HashSet::new();
    let quoted: Vec<String> = query
        .split(|c: char| !c.is_alphanumeric())
        .filter(|piece| !piece.is_empty())
        .map(str::to_lowercase)
        .filter(|term| seen.insert(term.clone()))
        .map(|term| format!("\"{term}\""))
        .collect();

    if quoted.is_empty() {
        return None;
    }
    Some(quoted.join(" OR "))
}
599
/// Numbered placeholders for an `IN (...)` clause: `n` entries of the form
/// `?start, ?start+1, …`, separated by `", "`. Empty when `n` is zero.
fn repo_placeholders(n: usize, start: usize) -> String {
    let mut out = String::new();
    for offset in 0..n {
        if offset > 0 {
            out.push_str(", ");
        }
        out.push('?');
        out.push_str(&(start + offset).to_string());
    }
    out
}
607
/// Serialize an embedding as consecutive little-endian `f32` values (4 bytes
/// per element).
fn vec_to_blob(v: &[f32]) -> Vec<u8> {
    v.iter().flat_map(|x| x.to_le_bytes()).collect()
}
615
/// Decode a blob of consecutive little-endian `f32` values. Trailing bytes that
/// do not fill a whole 4-byte group are ignored.
fn blob_to_vec(bytes: &[u8]) -> Vec<f32> {
    let mut floats = Vec::with_capacity(bytes.len() / 4);
    for quad in bytes.chunks_exact(4) {
        let mut word = [0u8; 4];
        word.copy_from_slice(quad);
        floats.push(f32::from_le_bytes(word));
    }
    floats
}
622
/// Cosine similarity of `a` and `b`.
///
/// Returns `0.0` when the slices differ in length, are empty, or either has
/// zero norm. Sums are accumulated in `f64` so that squaring very large or very
/// small `f32` components neither overflows nor underflows. A non-finite result
/// (possible when an input contains NaN or infinity) is also reported as `0.0`,
/// so a corrupt vector can never sort above genuine matches.
fn cosine(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() || a.is_empty() {
        return 0.0;
    }
    let (dot, norm_a, norm_b) = a.iter().zip(b).fold(
        (0.0f64, 0.0f64, 0.0f64),
        |(dot, na, nb), (&x, &y)| {
            let (x, y) = (f64::from(x), f64::from(y));
            (dot + x * y, na + x * x, nb + y * y)
        },
    );
    if norm_a == 0.0 || norm_b == 0.0 {
        return 0.0;
    }
    let score = dot / (norm_a.sqrt() * norm_b.sqrt());
    if !score.is_finite() {
        return 0.0;
    }
    // A finite cosine lies in [-1, 1] (up to rounding), so narrowing is safe.
    #[allow(clippy::cast_possible_truncation)]
    let narrowed = score as f32;
    narrowed
}
640
/// Unit tests for the store: schema handling, incremental reconcile, lexical
/// search scoping, the semantic-upgrade path (via a stub embedder) and the
/// pure helpers (`fuse`, `fts_query`, `cosine`, blob encoding).
#[cfg(test)]
mod tests {
    #![allow(
        clippy::unwrap_used,
        clippy::float_cmp,
        clippy::cast_precision_loss,
        clippy::cast_possible_truncation,
        clippy::cast_sign_loss,
        clippy::unnecessary_literal_bound
    )]
    use super::*;
    use crate::search::chunk::TrackedFile;
    use tempfile::TempDir;

    /// Deterministic in-memory embedder for exercising the vector path without
    /// the `semantic` feature / a real model download.
    struct StubEmbedder;
    impl Embedder for StubEmbedder {
        fn model_id(&self) -> &str {
            "stub-v1"
        }
        // One 3-dimensional vector per text, derived only from the text length.
        fn embed(&mut self, texts: &[String]) -> Result<Vec<Vec<f32>>, String> {
            Ok(texts
                .iter()
                .map(|t| vec![(t.len() % 7) as f32 + 1.0, 1.0, 0.5])
                .collect())
        }
    }

    /// Build a tracked file whose "hash" is derived from the path and the text
    /// length — enough to distinguish the edits made in these tests.
    fn tf(path: &str, text: &str) -> TrackedFile {
        TrackedFile {
            path: path.to_string(),
            content_hash: format!("h:{}:{}", path, text.len()),
            text: text.to_string(),
        }
    }

    /// Fresh build-mode store in a temp dir. The `TempDir` is returned so the
    /// directory outlives the store for the duration of the test.
    fn build_store() -> (TempDir, Store) {
        let tmp = TempDir::new().unwrap();
        let db = tmp.path().join("repograph").join("index.db");
        let store = Store::open_for_build(&db).unwrap();
        (tmp, store)
    }

    #[test]
    fn open_existing_missing_is_index_missing() {
        let tmp = TempDir::new().unwrap();
        let db = tmp.path().join("nope.db");
        let err = Store::open_existing(&db).unwrap_err();
        assert!(matches!(err, RepographError::IndexMissing));
    }

    #[test]
    fn reconcile_then_lexical_finds_exact_token() {
        let (_tmp, mut store) = build_store();
        let files = vec![
            tf("auth.rs", "fn rotate_refresh_token() { /* logic */ }\n"),
            tf("util.rs", "fn unrelated_helper() {}\n"),
        ];
        let stats = store
            .reconcile_repo("api", &files, Some("deadbeef"), None)
            .unwrap();
        assert_eq!(stats.files_indexed, 2);
        let ids = store
            .search_lexical("rotate_refresh_token", &[], 10)
            .unwrap();
        assert!(!ids.is_empty());
        let rows = store.fetch_chunks(&ids).unwrap();
        let hit = rows.values().find(|r| r.path == "auth.rs");
        assert!(hit.is_some(), "exact-symbol query surfaces the right file");
    }

    #[test]
    fn incremental_skips_unchanged_and_reprocesses_changed() {
        let (_tmp, mut store) = build_store();
        let files = vec![
            tf("a.rs", "fn first() {}\n"),
            tf("b.rs", "fn second() {}\n"),
        ];
        store.reconcile_repo("r", &files, None, None).unwrap();

        // Second run: a.rs unchanged, b.rs changed.
        let files2 = vec![
            tf("a.rs", "fn first() {}\n"),
            tf("b.rs", "fn second_renamed() {}\n"),
        ];
        let stats = store.reconcile_repo("r", &files2, None, None).unwrap();
        assert_eq!(stats.files_unchanged, 1, "a.rs hash matched");
        assert_eq!(stats.files_indexed, 1, "b.rs re-chunked");

        // The old symbol is gone, the new one is present.
        assert!(
            !store
                .search_lexical("second_renamed", &[], 10)
                .unwrap()
                .is_empty(),
            "new content searchable"
        );
        let old = store.search_lexical("second", &[], 10).unwrap();
        // `second` may still match the rewritten chunk (FTS5's default tokenizer
        // presumably splits `second_renamed` on `_` — TODO confirm), so assert on
        // the returned content rather than on an empty result.
        let rows = store.fetch_chunks(&old).unwrap();
        assert!(
            !rows.values().any(|r| r.content.contains("fn second()")),
            "stale chunk purged"
        );
    }

    #[test]
    fn semantic_upgrade_embeds_previously_lexical_files() {
        let (_tmp, mut store) = build_store();
        let files = vec![tf("a.rs", "fn a() {}\n"), tf("b.rs", "fn b() {}\n")];

        // First pass is lexical-only: no embedder, no vectors written.
        store.reconcile_repo("r", &files, None, None).unwrap();
        assert!(
            !store.has_vectors().unwrap(),
            "lexical build wrote no vectors"
        );

        // Re-run with an embedder over the *same, unchanged* files. Without the
        // missing-vector check this would skip every file and write no vectors.
        let mut emb = StubEmbedder;
        store.ensure_model(emb.model_id()).unwrap();
        let stats = store
            .reconcile_repo("r", &files, None, Some(&mut emb))
            .unwrap();
        assert_eq!(
            stats.files_indexed, 2,
            "unchanged-but-unvectored files are reprocessed to embed them"
        );
        assert_eq!(stats.files_unchanged, 0);
        assert!(
            store.has_vectors().unwrap(),
            "vectors present after the semantic upgrade"
        );

        // A third pass (still embedding) now finds vectors for every file and
        // skips them — no needless re-embedding once the index is whole.
        let mut emb2 = StubEmbedder;
        let stats2 = store
            .reconcile_repo("r", &files, None, Some(&mut emb2))
            .unwrap();
        assert_eq!(
            stats2.files_unchanged, 2,
            "fully-vectored files are skipped"
        );
        assert_eq!(stats2.files_indexed, 0);
    }

    #[test]
    fn purges_deleted_files() {
        let (_tmp, mut store) = build_store();
        store
            .reconcile_repo("r", &[tf("gone.rs", "fn doomed() {}\n")], None, None)
            .unwrap();
        assert!(!store.search_lexical("doomed", &[], 10).unwrap().is_empty());
        // gone.rs no longer tracked.
        let stats = store.reconcile_repo("r", &[], None, None).unwrap();
        assert_eq!(stats.files_purged, 1);
        assert!(store.search_lexical("doomed", &[], 10).unwrap().is_empty());
    }

    #[test]
    fn repo_filter_scopes_results() {
        let (_tmp, mut store) = build_store();
        store
            .reconcile_repo("api", &[tf("a.rs", "fn shared_thing() {}\n")], None, None)
            .unwrap();
        store
            .reconcile_repo("ui", &[tf("b.rs", "fn shared_thing() {}\n")], None, None)
            .unwrap();
        let all = store.search_lexical("shared_thing", &[], 10).unwrap();
        assert_eq!(all.len(), 2);
        let scoped = store
            .search_lexical("shared_thing", &["api".to_string()], 10)
            .unwrap();
        let rows = store.fetch_chunks(&scoped).unwrap();
        assert!(rows.values().all(|r| r.repo == "api"));
    }

    #[test]
    fn indexed_commits_recorded() {
        let (_tmp, mut store) = build_store();
        store
            .reconcile_repo("r", &[tf("a.rs", "fn a() {}\n")], Some("c0ffee"), None)
            .unwrap();
        let commits = store.indexed_commits().unwrap();
        assert_eq!(commits.get("r"), Some(&Some("c0ffee".to_string())));
    }

    #[test]
    fn fuse_rewards_agreement() {
        // id 2 appears high in both lists; id 1 only in lexical, id 3 only in vector.
        let lexical = [1i64, 2, 4];
        let vector = [2i64, 3, 4];
        let fused = fuse(&[&lexical, &vector]);
        assert_eq!(fused[0].0, 2, "id present in both lists ranks first");
    }

    #[test]
    fn fts_query_extracts_tokens() {
        assert_eq!(fts_query("  !!  "), None);
        assert_eq!(
            fts_query("Rotate Refresh"),
            Some("\"rotate\" OR \"refresh\"".to_string())
        );
    }

    #[test]
    fn cosine_identical_is_one() {
        let v = [1.0f32, 2.0, 3.0];
        assert!((cosine(&v, &v) - 1.0).abs() < 1e-6);
    }

    #[test]
    fn blob_round_trips() {
        let v = vec![0.5f32, -1.0, 3.25];
        assert_eq!(blob_to_vec(&vec_to_blob(&v)), v);
    }

    #[test]
    fn schema_version_mismatch_triggers_rebuild() {
        let tmp = TempDir::new().unwrap();
        let db = tmp.path().join("index.db");
        {
            let mut store = Store::open_for_build(&db).unwrap();
            store
                .reconcile_repo("r", &[tf("a.rs", "fn keep() {}\n")], None, None)
                .unwrap();
            // Simulate an index written by an older build.
            store.meta_set("schema_version", "0").unwrap();
        }
        // Reopening for build sees the stale version and wipes.
        let store = Store::open_for_build(&db).unwrap();
        assert!(
            store.search_lexical("keep", &[], 10).unwrap().is_empty(),
            "stale-schema index was rebuilt empty"
        );
    }
}