Skip to main content

roder_code_index/
sqlite.rs

1use std::collections::BTreeSet;
2use std::path::{Path, PathBuf};
3use std::sync::Mutex;
4
5use anyhow::Context;
6use roder_api::code_index::{
7    ChunkEmbedding, CodeChunk, CodeIndexSearchRequest, CodeIndexSearchResponse,
8    CodeIndexSearchResult, CodeIndexStats, CodeIndexStatus, IndexGeneration, ProofFilteredDrop,
9};
10use rusqlite::{Connection, params};
11use time::OffsetDateTime;
12
13use crate::chunk::chunk_workspace;
14use crate::merkle::{FileManifestEntry, build_workspace_merkle, diff_file_manifests};
15use crate::proofs::{proof_for_chunk, verify_chunk_proof};
16use crate::sqlite_embeddings::ensure_embedding;
17use crate::sqlite_schema::{load_generation, migrate, save_generation};
18
/// Identifier returned by [`SqliteCodeIndexStore::id`].
const STORE_ID: &str = "sqlite-code-index";
/// Value written to `config_hash` on every `IndexGeneration` this module
/// constructs.
const CONFIG_HASH: &str = "local-code-index-v1";
21
22#[derive(Debug, Clone, PartialEq, Eq)]
23pub struct RebuildStats {
24    pub generation: IndexGeneration,
25    pub changed_file_count: u64,
26    pub deleted_file_count: u64,
27    pub reused_file_count: u64,
28}
29
30#[derive(Debug, Clone)]
31pub struct StoredChunk {
32    pub chunk: CodeChunk,
33    pub embedding: ChunkEmbedding,
34}
35
36pub struct SqliteCodeIndexStore {
37    path: PathBuf,
38    conn: Mutex<Connection>,
39}
40
41pub fn default_store_path(base_dir: impl AsRef<Path>, workspace_root: impl AsRef<Path>) -> PathBuf {
42    base_dir
43        .as_ref()
44        .join(workspace_key(workspace_root.as_ref()))
45        .join("code-index.sqlite3")
46}
47
48impl SqliteCodeIndexStore {
49    pub fn open(path: impl AsRef<Path>) -> anyhow::Result<Self> {
50        let path = path.as_ref().to_path_buf();
51        if let Some(parent) = path.parent() {
52            std::fs::create_dir_all(parent)?;
53        }
54        let conn = Connection::open(&path)
55            .with_context(|| format!("open code index sqlite store {}", path.display()))?;
56        migrate(&conn)?;
57        Ok(Self {
58            path,
59            conn: Mutex::new(conn),
60        })
61    }
62
63    pub fn path(&self) -> &Path {
64        &self.path
65    }
66
67    pub fn id(&self) -> &'static str {
68        STORE_ID
69    }
70
71    pub fn rebuild_workspace(
72        &self,
73        workspace_root: impl AsRef<Path>,
74    ) -> anyhow::Result<RebuildStats> {
75        let build = build_workspace_merkle(workspace_root.as_ref())?;
76        let chunks = chunk_workspace(&build.tree.workspace_root, &build.files)?;
77        self.with_conn(|conn| {
78            let previous = load_file_manifest(conn)?;
79            let diff = diff_file_manifests(&previous, &build.files);
80            let generation_id = generation_id(&build.tree.root_hash);
81            let mut embedded_chunk_count = 0u64;
82            let mut cached_embedding_count = 0u64;
83
84            let tx = conn.unchecked_transaction()?;
85            tx.execute("DELETE FROM chunks", [])?;
86            tx.execute("DELETE FROM file_manifest", [])?;
87
88            for file in &build.files {
89                tx.execute(
90                    "INSERT INTO file_manifest(path, path_hash, content_hash, size)
91                     VALUES (?1, ?2, ?3, ?4)",
92                    params![
93                        path_to_string(&file.path),
94                        file.path_hash,
95                        file.content_hash,
96                        file.size as i64
97                    ],
98                )?;
99            }
100
101            for chunk in &chunks {
102                let (embedding, cached) = ensure_embedding(&tx, chunk)?;
103                if cached {
104                    cached_embedding_count += 1;
105                } else {
106                    embedded_chunk_count += 1;
107                }
108                tx.execute(
109                    "INSERT INTO chunks(
110                        chunk_hash, path, path_hash, content_hash, start_byte, end_byte,
111                        start_line, end_line, language, symbol_hint, embedding_provider,
112                        embedding_model, embedding_dimensions
113                     ) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11, ?12, ?13)",
114                    params![
115                        chunk.chunk_hash,
116                        path_to_string(&chunk.path),
117                        chunk.path_hash,
118                        chunk.content_hash,
119                        chunk.byte_range.start as i64,
120                        chunk.byte_range.end as i64,
121                        chunk.line_range.start as i64,
122                        chunk.line_range.end as i64,
123                        chunk.language,
124                        chunk.symbol_hint,
125                        embedding.provider,
126                        embedding.model,
127                        embedding.dimensions as i64,
128                    ],
129                )?;
130            }
131
132            let generation = IndexGeneration {
133                id: generation_id,
134                status: CodeIndexStatus::Ready,
135                workspace_root: build.tree.workspace_root.clone(),
136                root_hash: Some(build.tree.root_hash.clone()),
137                config_hash: CONFIG_HASH.to_string(),
138                stats: CodeIndexStats {
139                    file_count: build.files.len() as u64,
140                    chunk_count: chunks.len() as u64,
141                    embedded_chunk_count,
142                    cached_embedding_count,
143                    index_bytes: index_bytes(&tx)?,
144                },
145                created_at: OffsetDateTime::now_utc(),
146                updated_at: Some(OffsetDateTime::now_utc()),
147                stale_reason: None,
148            };
149            save_generation(&tx, &generation)?;
150            tx.commit()?;
151
152            Ok(RebuildStats {
153                generation,
154                changed_file_count: diff.changed_files.len() as u64,
155                deleted_file_count: diff.deleted_files.len() as u64,
156                reused_file_count: diff.unchanged_files.len() as u64,
157            })
158        })
159    }
160
161    pub fn status(&self, workspace_root: impl AsRef<Path>) -> anyhow::Result<IndexGeneration> {
162        let workspace_root = workspace_root.as_ref().to_path_buf();
163        self.with_conn(|conn| {
164            load_generation(conn)?.map_or_else(
165                || {
166                    Ok(IndexGeneration {
167                        id: "missing".to_string(),
168                        status: CodeIndexStatus::Missing,
169                        workspace_root,
170                        root_hash: None,
171                        config_hash: CONFIG_HASH.to_string(),
172                        stats: CodeIndexStats {
173                            file_count: 0,
174                            chunk_count: 0,
175                            embedded_chunk_count: 0,
176                            cached_embedding_count: 0,
177                            index_bytes: 0,
178                        },
179                        created_at: OffsetDateTime::now_utc(),
180                        updated_at: None,
181                        stale_reason: Some("code index has not been built".to_string()),
182                    })
183                },
184                Ok,
185            )
186        })
187    }
188
189    pub fn list_chunks(&self) -> anyhow::Result<Vec<StoredChunk>> {
190        self.with_conn(load_chunks)
191    }
192
193    pub fn search(
194        &self,
195        request: CodeIndexSearchRequest,
196    ) -> anyhow::Result<CodeIndexSearchResponse> {
197        let query_terms = tokenize(&request.query);
198        self.with_conn(|conn| {
199            let generation = load_generation(conn)?
200                .with_context(|| "code index search requested before generation exists")?;
201            let root_hash = generation.root_hash.clone().unwrap_or_default();
202            let mut scored = Vec::new();
203            let mut dropped_results = Vec::new();
204            for stored in load_chunks(conn)? {
205                let score = score_chunk(&stored.chunk, &query_terms);
206                if score <= 0.0 {
207                    continue;
208                }
209                let proof = proof_for_chunk(&root_hash, generation.id.clone(), &stored.chunk);
210                if !verify_chunk_proof(&proof, &root_hash, &stored.chunk) {
211                    dropped_results.push(ProofFilteredDrop {
212                        query_id: request.query_id.clone(),
213                        path_hash: stored.chunk.path_hash,
214                        content_hash: stored.chunk.content_hash,
215                        reason: "content proof failed".to_string(),
216                    });
217                    continue;
218                }
219                scored.push(CodeIndexSearchResult {
220                    query_id: request.query_id.clone(),
221                    chunk: stored.chunk,
222                    score,
223                    proof,
224                    proof_verified: true,
225                    snippet: None,
226                });
227            }
228            scored.sort_by(|a, b| {
229                b.score
230                    .partial_cmp(&a.score)
231                    .unwrap_or(std::cmp::Ordering::Equal)
232                    .then_with(|| a.chunk.path.cmp(&b.chunk.path))
233            });
234            scored.truncate(request.limit);
235
236            Ok(CodeIndexSearchResponse {
237                generation,
238                results: scored,
239                dropped_results,
240            })
241        })
242    }
243
244    fn with_conn<T>(
245        &self,
246        f: impl FnOnce(&mut Connection) -> anyhow::Result<T>,
247    ) -> anyhow::Result<T> {
248        let mut conn = self
249            .conn
250            .lock()
251            .map_err(|_| anyhow::anyhow!("code index sqlite connection lock poisoned"))?;
252        f(&mut conn)
253    }
254}
255
256fn load_file_manifest(conn: &Connection) -> anyhow::Result<Vec<FileManifestEntry>> {
257    let mut stmt = conn
258        .prepare("SELECT path, path_hash, content_hash, size FROM file_manifest ORDER BY path")?;
259    let rows = stmt.query_map([], |row| {
260        Ok(FileManifestEntry {
261            path: PathBuf::from(row.get::<_, String>(0)?),
262            path_hash: row.get(1)?,
263            content_hash: row.get(2)?,
264            size: row.get::<_, i64>(3)? as u64,
265        })
266    })?;
267    rows.collect::<Result<Vec<_>, _>>().map_err(Into::into)
268}
269
270fn load_chunks(conn: &mut Connection) -> anyhow::Result<Vec<StoredChunk>> {
271    let mut stmt = conn.prepare(
272        "SELECT c.chunk_hash, c.path, c.path_hash, c.content_hash, c.start_byte, c.end_byte,
273                c.start_line, c.end_line, c.language, c.symbol_hint,
274                e.vector_json, e.provider, e.model, e.dimensions
275         FROM chunks c
276         JOIN embedding_cache e ON e.content_hash = c.content_hash
277         ORDER BY c.path, c.start_byte",
278    )?;
279    let rows = stmt.query_map([], |row| {
280        let vector_json: String = row.get(10)?;
281        let vector: Vec<f32> = serde_json::from_str(&vector_json).map_err(|err| {
282            rusqlite::Error::FromSqlConversionFailure(
283                10,
284                rusqlite::types::Type::Text,
285                Box::new(err),
286            )
287        })?;
288        let chunk = CodeChunk {
289            chunk_hash: row.get(0)?,
290            path: PathBuf::from(row.get::<_, String>(1)?),
291            path_hash: row.get(2)?,
292            content_hash: row.get(3)?,
293            byte_range: roder_api::code_index::CodeByteRange {
294                start: row.get::<_, i64>(4)? as u64,
295                end: row.get::<_, i64>(5)? as u64,
296            },
297            line_range: roder_api::code_index::CodeLineRange {
298                start: row.get::<_, i64>(6)? as u32,
299                end: row.get::<_, i64>(7)? as u32,
300            },
301            language: row.get(8)?,
302            symbol_hint: row.get(9)?,
303        };
304        let embedding = ChunkEmbedding {
305            chunk_hash: chunk.chunk_hash.clone(),
306            provider: row.get(11)?,
307            model: row.get(12)?,
308            dimensions: row.get::<_, i64>(13)? as usize,
309            vector,
310        };
311        Ok(StoredChunk { chunk, embedding })
312    })?;
313    rows.collect::<Result<Vec<_>, _>>().map_err(Into::into)
314}
315
316fn index_bytes(conn: &Connection) -> anyhow::Result<u64> {
317    let page_count: i64 = conn.query_row("PRAGMA page_count", [], |row| row.get(0))?;
318    let page_size: i64 = conn.query_row("PRAGMA page_size", [], |row| row.get(0))?;
319    Ok((page_count * page_size).max(0) as u64)
320}
321
/// Derives a generation id of the form `gen-<prefix>` from a root hash.
///
/// The prefix is the first 16 characters of `root_hash`, or the whole string
/// when it is shorter. Truncation happens on a character boundary, so a
/// non-ASCII input cannot cause a slicing panic. For ASCII input (such as a
/// hex digest) this equals taking the first 16 bytes.
fn generation_id(root_hash: &str) -> String {
    const PREFIX_CHARS: usize = 16;

    // Byte offset where the 17th character starts, or the full length when
    // there are at most 16 characters.
    let prefix_end = root_hash
        .char_indices()
        .nth(PREFIX_CHARS)
        .map_or(root_hash.len(), |(offset, _)| offset);
    format!("gen-{}", &root_hash[..prefix_end])
}
325
326fn workspace_key(workspace_root: &Path) -> String {
327    crate::hex_sha256(workspace_root.to_string_lossy().as_bytes())
328}
329
/// Renders `path` as its components joined by `/`.
///
/// Each component is converted lossily, so invalid Unicode is replaced rather
/// than rejected.
fn path_to_string(path: &Path) -> String {
    let mut joined = String::new();
    for (position, component) in path.components().enumerate() {
        if position > 0 {
            joined.push('/');
        }
        joined.push_str(&component.as_os_str().to_string_lossy());
    }
    joined
}
336
/// Splits `query` into a set of lowercase search terms.
///
/// A term is a maximal run of ASCII alphanumerics and underscores; every other
/// character acts as a separator. Duplicates collapse because the result is a
/// set.
fn tokenize(query: &str) -> BTreeSet<String> {
    let is_separator = |ch: char| !(ch.is_ascii_alphanumeric() || ch == '_');

    let mut terms = BTreeSet::new();
    for piece in query.split(is_separator) {
        if piece.is_empty() {
            continue;
        }
        terms.insert(piece.to_ascii_lowercase());
    }
    terms
}
344
345fn score_chunk(chunk: &CodeChunk, terms: &BTreeSet<String>) -> f32 {
346    if terms.is_empty() {
347        return 0.0;
348    }
349    let mut haystack = path_to_string(&chunk.path).to_ascii_lowercase();
350    if let Some(symbol) = &chunk.symbol_hint {
351        haystack.push(' ');
352        haystack.push_str(&symbol.to_ascii_lowercase());
353    }
354    let matches = terms
355        .iter()
356        .filter(|term| haystack.contains(term.as_str()))
357        .count();
358    matches as f32 / terms.len() as f32
359}
360
#[cfg(test)]
mod tests {
    use std::fs;

    use super::*;

    /// A second rebuild after editing one of two files should report one
    /// changed and one reused file, and embed only the edited file's chunk.
    #[test]
    fn sqlite_rebuild_caches_unchanged_chunks_and_tracks_file_changes() {
        let workspace = tempdir("sqlite_rebuild_caches_unchanged_chunks_and_tracks_file_changes");
        write(workspace.join("src/a.rs"), "pub fn a() {}\n");
        write(workspace.join("src/b.rs"), "pub fn b() {}\n");
        let store = SqliteCodeIndexStore::open(workspace.with_extension("sqlite3")).unwrap();

        let initial = store.rebuild_workspace(&workspace).unwrap();
        let initial_stats = &initial.generation.stats;
        assert_eq!(initial_stats.embedded_chunk_count, 2);
        assert_eq!(initial_stats.cached_embedding_count, 0);

        write(workspace.join("src/a.rs"), "pub fn a_changed() {}\n");
        let followup = store.rebuild_workspace(&workspace).unwrap();
        let followup_stats = &followup.generation.stats;

        assert_eq!(followup.changed_file_count, 1);
        assert_eq!(followup.reused_file_count, 1);
        assert_eq!(followup_stats.embedded_chunk_count, 1);
        assert_eq!(followup_stats.cached_embedding_count, 1);
    }

    /// After a file is removed and the workspace rebuilt, neither the chunk
    /// listing nor a search for its symbol should surface it.
    #[test]
    fn sqlite_rebuild_removes_deleted_files_from_chunks_and_results() {
        let workspace = tempdir("sqlite_rebuild_removes_deleted_files_from_chunks_and_results");
        write(workspace.join("src/keep.rs"), "pub fn keep_token() {}\n");
        write(workspace.join("src/delete.rs"), "pub fn delete_token() {}\n");
        let store = SqliteCodeIndexStore::open(workspace.with_extension("sqlite3")).unwrap();
        store.rebuild_workspace(&workspace).unwrap();

        fs::remove_file(workspace.join("src/delete.rs")).unwrap();
        let after_delete = store.rebuild_workspace(&workspace).unwrap();
        assert_eq!(after_delete.deleted_file_count, 1);

        let removed_path = PathBuf::from("src/delete.rs");
        let remaining = store.list_chunks().unwrap();
        assert!(
            !remaining
                .iter()
                .any(|stored| stored.chunk.path == removed_path)
        );

        let request = CodeIndexSearchRequest {
            query_id: "q1".to_string(),
            query: "delete_token".to_string(),
            workspace_root: workspace.clone(),
            limit: 10,
        };
        let response = store.search(request).unwrap();
        assert!(response.results.is_empty());
    }

    /// Writes `contents` to `path`, creating parent directories first.
    fn write(path: PathBuf, contents: &str) {
        let parent = path.parent().unwrap();
        fs::create_dir_all(parent).unwrap();
        fs::write(&path, contents).unwrap();
    }

    /// Creates a fresh directory under the system temp dir whose name embeds
    /// the test name, the process id, and the current time in nanoseconds.
    fn tempdir(name: &str) -> PathBuf {
        let pid = std::process::id();
        let nanos = OffsetDateTime::now_utc().unix_timestamp_nanos();
        let dir = std::env::temp_dir().join(format!(
            "roder-code-index-{name}-{}-{}",
            pid, nanos
        ));
        fs::create_dir_all(&dir).unwrap();
        dir
    }
}