Skip to main content

normalize_semantic/
schema.rs

//! SQLite schema for the embeddings table and sqlite-vec ANN index.
//!
//! Vectors are stored in the same SQLite database as the structural index
//! (`.normalize/index.sqlite`). The `embeddings` table holds one row per
//! embedded chunk; the raw f32 vector is stored as a BLOB alongside staleness
//! metadata used for query-time re-ranking.
//!
//! When the sqlite-vec extension is loaded, a companion `vec_embeddings`
//! virtual table (backed by `vec0`) mirrors the vector data and enables
//! approximate nearest-neighbor search.  The `rowid` of `vec_embeddings` is
//! kept in sync with `embeddings.id` so they can be JOIN-ed freely.
12
/// DDL for the `embeddings` table.
///
/// The statement uses `IF NOT EXISTS`, so it is safe to run repeatedly. The
/// table is created lazily when embedding is first enabled so it does not
/// affect repos that never turn on `embeddings.enabled`.
///
/// The UNIQUE constraint on `(source_type, source_path, source_id)` is meant
/// to allow `INSERT OR REPLACE` for incremental updates without a
/// delete-then-insert round-trip.
///
/// NOTE(review): `source_id` is nullable, and SQLite treats NULL values as
/// distinct from each other in UNIQUE constraints. Two rows with the same
/// `source_type` and `source_path` but a NULL `source_id` therefore do not
/// conflict, and `INSERT OR REPLACE` would add a second row instead of
/// replacing the first. Confirm that writers either always supply a non-NULL
/// `source_id` or remove such rows explicitly before re-inserting.
pub const CREATE_EMBEDDINGS_TABLE: &str = "
CREATE TABLE IF NOT EXISTS embeddings (
    id          INTEGER PRIMARY KEY,
    source_type TEXT NOT NULL,   -- 'symbol' | 'doc' | 'commit' | 'cluster'
    source_path TEXT NOT NULL,   -- relative file path
    source_id   INTEGER,         -- FK into symbols table where applicable
    model       TEXT NOT NULL,   -- embedding model name (triggers invalidation on change)
    last_commit TEXT,            -- git HEAD SHA when last embedded
    staleness   REAL NOT NULL DEFAULT 0.0,
    chunk_text  TEXT NOT NULL,   -- the text that was embedded (for debugging / re-use)
    embedding   BLOB NOT NULL,   -- packed f32 array, length = model dimensions
    UNIQUE(source_type, source_path, source_id)
)";
34
// DDL statements to drop the embedding tables and indexes for a full rebuild.
// Every statement uses `IF EXISTS`, so each one is a no-op when its target is
// already gone and they can be executed in any order.

/// Drops the `embeddings` table if it exists.
pub const DROP_EMBEDDINGS_TABLE: &str = "DROP TABLE IF EXISTS embeddings";

/// Drops the `vec_embeddings` table if it exists.
pub const DROP_VEC_EMBEDDINGS_TABLE: &str = "DROP TABLE IF EXISTS vec_embeddings";

/// Drops the `idx_embeddings_source` index if it exists.
pub const DROP_EMBEDDINGS_IDX_SOURCE: &str = "DROP INDEX IF EXISTS idx_embeddings_source";

/// Drops the `idx_embeddings_model` index if it exists.
pub const DROP_EMBEDDINGS_IDX_MODEL: &str = "DROP INDEX IF EXISTS idx_embeddings_model";
40
/// DDL for the `idx_embeddings_source` index on
/// `embeddings(source_type, source_path)`.
///
/// Uses `IF NOT EXISTS`, so it is safe to run repeatedly.
pub const CREATE_EMBEDDINGS_IDX_SOURCE: &str = "
CREATE INDEX IF NOT EXISTS idx_embeddings_source
    ON embeddings(source_type, source_path)";
44
/// DDL for the `idx_embeddings_model` index on `embeddings(model)`.
///
/// Uses `IF NOT EXISTS`, so it is safe to run repeatedly.
pub const CREATE_EMBEDDINGS_IDX_MODEL: &str = "
CREATE INDEX IF NOT EXISTS idx_embeddings_model
    ON embeddings(model)";
48
/// Builds the DDL for the sqlite-vec ANN virtual table `vec_embeddings`.
///
/// The table is created only when the sqlite-vec extension is available (i.e.
/// after [`crate::vec_ext::register_vec_extension`] has been called and a
/// connection opened).
///
/// `dims` is the vector dimension count and is written into the column type
/// (`float[dims]`) at table-creation time; 768 matches
/// `nomic-embed-text-v1.5`. The value is interpolated verbatim and is not
/// validated here, so passing `0` yields `float[0]` in the returned statement.
///
/// `vec0` tables store only `(rowid, vector)`.  Metadata is fetched by
/// JOIN-ing back to the `embeddings` table using the `rowid`.
///
/// Returns the complete `CREATE VIRTUAL TABLE IF NOT EXISTS` statement as an
/// owned `String`; nothing is executed by this function.
#[must_use]
pub fn create_vec_embeddings_ddl(dims: usize) -> String {
    format!("CREATE VIRTUAL TABLE IF NOT EXISTS vec_embeddings USING vec0(embedding float[{dims}])")
}