1use std::collections::BTreeSet;
2use std::path::{Path, PathBuf};
3use std::sync::Mutex;
4
5use anyhow::Context;
6use roder_api::code_index::{
7 ChunkEmbedding, CodeChunk, CodeIndexSearchRequest, CodeIndexSearchResponse,
8 CodeIndexSearchResult, CodeIndexStats, CodeIndexStatus, IndexGeneration, ProofFilteredDrop,
9};
10use rusqlite::{Connection, params};
11use time::OffsetDateTime;
12
13use crate::chunk::chunk_workspace;
14use crate::merkle::{FileManifestEntry, build_workspace_merkle, diff_file_manifests};
15use crate::proofs::{proof_for_chunk, verify_chunk_proof};
16use crate::sqlite_embeddings::ensure_embedding;
17use crate::sqlite_schema::{load_generation, migrate, save_generation};
18
/// Identifier returned by [`SqliteCodeIndexStore::id`].
const STORE_ID: &str = "sqlite-code-index";
/// Value written to `IndexGeneration::config_hash` for every generation this
/// store produces, including the synthetic "missing" generation.
const CONFIG_HASH: &str = "local-code-index-v1";
21
/// Outcome of one [`SqliteCodeIndexStore::rebuild_workspace`] call.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RebuildStats {
    /// The generation record that was saved by the rebuild.
    pub generation: IndexGeneration,
    /// Number of entries in the manifest diff's `changed_files` list.
    // NOTE(review): whether newly added files are counted here depends on
    // `diff_file_manifests` — confirm there.
    pub changed_file_count: u64,
    /// Number of entries in the manifest diff's `deleted_files` list.
    pub deleted_file_count: u64,
    /// Number of entries in the manifest diff's `unchanged_files` list.
    pub reused_file_count: u64,
}
29
/// A chunk row paired with the embedding row that shares its content hash,
/// as loaded from the `chunks` and `embedding_cache` tables.
#[derive(Debug, Clone)]
pub struct StoredChunk {
    /// Chunk metadata read from the `chunks` table.
    pub chunk: CodeChunk,
    /// Embedding read from the `embedding_cache` table.
    pub embedding: ChunkEmbedding,
}
35
/// Code index store backed by a single SQLite database file.
pub struct SqliteCodeIndexStore {
    /// Location of the database file, as passed to [`SqliteCodeIndexStore::open`].
    path: PathBuf,
    /// The open connection; every access goes through this mutex.
    conn: Mutex<Connection>,
}
40
41pub fn default_store_path(base_dir: impl AsRef<Path>, workspace_root: impl AsRef<Path>) -> PathBuf {
42 base_dir
43 .as_ref()
44 .join(workspace_key(workspace_root.as_ref()))
45 .join("code-index.sqlite3")
46}
47
48impl SqliteCodeIndexStore {
49 pub fn open(path: impl AsRef<Path>) -> anyhow::Result<Self> {
50 let path = path.as_ref().to_path_buf();
51 if let Some(parent) = path.parent() {
52 std::fs::create_dir_all(parent)?;
53 }
54 let conn = Connection::open(&path)
55 .with_context(|| format!("open code index sqlite store {}", path.display()))?;
56 migrate(&conn)?;
57 Ok(Self {
58 path,
59 conn: Mutex::new(conn),
60 })
61 }
62
63 pub fn path(&self) -> &Path {
64 &self.path
65 }
66
67 pub fn id(&self) -> &'static str {
68 STORE_ID
69 }
70
71 pub fn rebuild_workspace(
72 &self,
73 workspace_root: impl AsRef<Path>,
74 ) -> anyhow::Result<RebuildStats> {
75 let build = build_workspace_merkle(workspace_root.as_ref())?;
76 let chunks = chunk_workspace(&build.tree.workspace_root, &build.files)?;
77 self.with_conn(|conn| {
78 let previous = load_file_manifest(conn)?;
79 let diff = diff_file_manifests(&previous, &build.files);
80 let generation_id = generation_id(&build.tree.root_hash);
81 let mut embedded_chunk_count = 0u64;
82 let mut cached_embedding_count = 0u64;
83
84 let tx = conn.unchecked_transaction()?;
85 tx.execute("DELETE FROM chunks", [])?;
86 tx.execute("DELETE FROM file_manifest", [])?;
87
88 for file in &build.files {
89 tx.execute(
90 "INSERT INTO file_manifest(path, path_hash, content_hash, size)
91 VALUES (?1, ?2, ?3, ?4)",
92 params![
93 path_to_string(&file.path),
94 file.path_hash,
95 file.content_hash,
96 file.size as i64
97 ],
98 )?;
99 }
100
101 for chunk in &chunks {
102 let (embedding, cached) = ensure_embedding(&tx, chunk)?;
103 if cached {
104 cached_embedding_count += 1;
105 } else {
106 embedded_chunk_count += 1;
107 }
108 tx.execute(
109 "INSERT INTO chunks(
110 chunk_hash, path, path_hash, content_hash, start_byte, end_byte,
111 start_line, end_line, language, symbol_hint, embedding_provider,
112 embedding_model, embedding_dimensions
113 ) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11, ?12, ?13)",
114 params![
115 chunk.chunk_hash,
116 path_to_string(&chunk.path),
117 chunk.path_hash,
118 chunk.content_hash,
119 chunk.byte_range.start as i64,
120 chunk.byte_range.end as i64,
121 chunk.line_range.start as i64,
122 chunk.line_range.end as i64,
123 chunk.language,
124 chunk.symbol_hint,
125 embedding.provider,
126 embedding.model,
127 embedding.dimensions as i64,
128 ],
129 )?;
130 }
131
132 let generation = IndexGeneration {
133 id: generation_id,
134 status: CodeIndexStatus::Ready,
135 workspace_root: build.tree.workspace_root.clone(),
136 root_hash: Some(build.tree.root_hash.clone()),
137 config_hash: CONFIG_HASH.to_string(),
138 stats: CodeIndexStats {
139 file_count: build.files.len() as u64,
140 chunk_count: chunks.len() as u64,
141 embedded_chunk_count,
142 cached_embedding_count,
143 index_bytes: index_bytes(&tx)?,
144 },
145 created_at: OffsetDateTime::now_utc(),
146 updated_at: Some(OffsetDateTime::now_utc()),
147 stale_reason: None,
148 };
149 save_generation(&tx, &generation)?;
150 tx.commit()?;
151
152 Ok(RebuildStats {
153 generation,
154 changed_file_count: diff.changed_files.len() as u64,
155 deleted_file_count: diff.deleted_files.len() as u64,
156 reused_file_count: diff.unchanged_files.len() as u64,
157 })
158 })
159 }
160
161 pub fn status(&self, workspace_root: impl AsRef<Path>) -> anyhow::Result<IndexGeneration> {
162 let workspace_root = workspace_root.as_ref().to_path_buf();
163 self.with_conn(|conn| {
164 load_generation(conn)?.map_or_else(
165 || {
166 Ok(IndexGeneration {
167 id: "missing".to_string(),
168 status: CodeIndexStatus::Missing,
169 workspace_root,
170 root_hash: None,
171 config_hash: CONFIG_HASH.to_string(),
172 stats: CodeIndexStats {
173 file_count: 0,
174 chunk_count: 0,
175 embedded_chunk_count: 0,
176 cached_embedding_count: 0,
177 index_bytes: 0,
178 },
179 created_at: OffsetDateTime::now_utc(),
180 updated_at: None,
181 stale_reason: Some("code index has not been built".to_string()),
182 })
183 },
184 Ok,
185 )
186 })
187 }
188
189 pub fn list_chunks(&self) -> anyhow::Result<Vec<StoredChunk>> {
190 self.with_conn(load_chunks)
191 }
192
193 pub fn search(
194 &self,
195 request: CodeIndexSearchRequest,
196 ) -> anyhow::Result<CodeIndexSearchResponse> {
197 let query_terms = tokenize(&request.query);
198 self.with_conn(|conn| {
199 let generation = load_generation(conn)?
200 .with_context(|| "code index search requested before generation exists")?;
201 let root_hash = generation.root_hash.clone().unwrap_or_default();
202 let mut scored = Vec::new();
203 let mut dropped_results = Vec::new();
204 for stored in load_chunks(conn)? {
205 let score = score_chunk(&stored.chunk, &query_terms);
206 if score <= 0.0 {
207 continue;
208 }
209 let proof = proof_for_chunk(&root_hash, generation.id.clone(), &stored.chunk);
210 if !verify_chunk_proof(&proof, &root_hash, &stored.chunk) {
211 dropped_results.push(ProofFilteredDrop {
212 query_id: request.query_id.clone(),
213 path_hash: stored.chunk.path_hash,
214 content_hash: stored.chunk.content_hash,
215 reason: "content proof failed".to_string(),
216 });
217 continue;
218 }
219 scored.push(CodeIndexSearchResult {
220 query_id: request.query_id.clone(),
221 chunk: stored.chunk,
222 score,
223 proof,
224 proof_verified: true,
225 snippet: None,
226 });
227 }
228 scored.sort_by(|a, b| {
229 b.score
230 .partial_cmp(&a.score)
231 .unwrap_or(std::cmp::Ordering::Equal)
232 .then_with(|| a.chunk.path.cmp(&b.chunk.path))
233 });
234 scored.truncate(request.limit);
235
236 Ok(CodeIndexSearchResponse {
237 generation,
238 results: scored,
239 dropped_results,
240 })
241 })
242 }
243
244 fn with_conn<T>(
245 &self,
246 f: impl FnOnce(&mut Connection) -> anyhow::Result<T>,
247 ) -> anyhow::Result<T> {
248 let mut conn = self
249 .conn
250 .lock()
251 .map_err(|_| anyhow::anyhow!("code index sqlite connection lock poisoned"))?;
252 f(&mut conn)
253 }
254}
255
256fn load_file_manifest(conn: &Connection) -> anyhow::Result<Vec<FileManifestEntry>> {
257 let mut stmt = conn
258 .prepare("SELECT path, path_hash, content_hash, size FROM file_manifest ORDER BY path")?;
259 let rows = stmt.query_map([], |row| {
260 Ok(FileManifestEntry {
261 path: PathBuf::from(row.get::<_, String>(0)?),
262 path_hash: row.get(1)?,
263 content_hash: row.get(2)?,
264 size: row.get::<_, i64>(3)? as u64,
265 })
266 })?;
267 rows.collect::<Result<Vec<_>, _>>().map_err(Into::into)
268}
269
270fn load_chunks(conn: &mut Connection) -> anyhow::Result<Vec<StoredChunk>> {
271 let mut stmt = conn.prepare(
272 "SELECT c.chunk_hash, c.path, c.path_hash, c.content_hash, c.start_byte, c.end_byte,
273 c.start_line, c.end_line, c.language, c.symbol_hint,
274 e.vector_json, e.provider, e.model, e.dimensions
275 FROM chunks c
276 JOIN embedding_cache e ON e.content_hash = c.content_hash
277 ORDER BY c.path, c.start_byte",
278 )?;
279 let rows = stmt.query_map([], |row| {
280 let vector_json: String = row.get(10)?;
281 let vector: Vec<f32> = serde_json::from_str(&vector_json).map_err(|err| {
282 rusqlite::Error::FromSqlConversionFailure(
283 10,
284 rusqlite::types::Type::Text,
285 Box::new(err),
286 )
287 })?;
288 let chunk = CodeChunk {
289 chunk_hash: row.get(0)?,
290 path: PathBuf::from(row.get::<_, String>(1)?),
291 path_hash: row.get(2)?,
292 content_hash: row.get(3)?,
293 byte_range: roder_api::code_index::CodeByteRange {
294 start: row.get::<_, i64>(4)? as u64,
295 end: row.get::<_, i64>(5)? as u64,
296 },
297 line_range: roder_api::code_index::CodeLineRange {
298 start: row.get::<_, i64>(6)? as u32,
299 end: row.get::<_, i64>(7)? as u32,
300 },
301 language: row.get(8)?,
302 symbol_hint: row.get(9)?,
303 };
304 let embedding = ChunkEmbedding {
305 chunk_hash: chunk.chunk_hash.clone(),
306 provider: row.get(11)?,
307 model: row.get(12)?,
308 dimensions: row.get::<_, i64>(13)? as usize,
309 vector,
310 };
311 Ok(StoredChunk { chunk, embedding })
312 })?;
313 rows.collect::<Result<Vec<_>, _>>().map_err(Into::into)
314}
315
316fn index_bytes(conn: &Connection) -> anyhow::Result<u64> {
317 let page_count: i64 = conn.query_row("PRAGMA page_count", [], |row| row.get(0))?;
318 let page_size: i64 = conn.query_row("PRAGMA page_size", [], |row| row.get(0))?;
319 Ok((page_count * page_size).max(0) as u64)
320}
321
/// Derives a generation id of the form `gen-<prefix>`, where `<prefix>` is at
/// most the first 16 bytes of `root_hash`.
///
/// If byte offset 16 falls inside a multi-byte character the prefix is
/// shortened to the preceding character boundary, so the function never
/// panics on non-ASCII input.
fn generation_id(root_hash: &str) -> String {
    let mut end = root_hash.len().min(16);
    // Offset 0 is always a boundary, so this loop terminates.
    while !root_hash.is_char_boundary(end) {
        end -= 1;
    }
    format!("gen-{}", &root_hash[..end])
}
325
326fn workspace_key(workspace_root: &Path) -> String {
327 crate::hex_sha256(workspace_root.to_string_lossy().as_bytes())
328}
329
/// Renders `path` as its components joined with `/`, converting each
/// component lossily to UTF-8.
fn path_to_string(path: &Path) -> String {
    let mut joined = String::new();
    for (position, component) in path.components().enumerate() {
        // Separator goes between components only, never before the first.
        if position > 0 {
            joined.push('/');
        }
        joined.push_str(&component.as_os_str().to_string_lossy());
    }
    joined
}
336
/// Splits `query` into the set of its lowercase terms. A term is a maximal
/// run of ASCII alphanumerics and underscores; every other character is a
/// separator.
fn tokenize(query: &str) -> BTreeSet<String> {
    let is_separator = |ch: char| !(ch == '_' || ch.is_ascii_alphanumeric());
    let mut terms = BTreeSet::new();
    for word in query.split(is_separator) {
        if word.is_empty() {
            continue;
        }
        terms.insert(word.to_ascii_lowercase());
    }
    terms
}
344
345fn score_chunk(chunk: &CodeChunk, terms: &BTreeSet<String>) -> f32 {
346 if terms.is_empty() {
347 return 0.0;
348 }
349 let mut haystack = path_to_string(&chunk.path).to_ascii_lowercase();
350 if let Some(symbol) = &chunk.symbol_hint {
351 haystack.push(' ');
352 haystack.push_str(&symbol.to_ascii_lowercase());
353 }
354 let matches = terms
355 .iter()
356 .filter(|term| haystack.contains(term.as_str()))
357 .count();
358 matches as f32 / terms.len() as f32
359}
360
#[cfg(test)]
mod tests {
    use std::fs;

    use super::*;

    /// A second rebuild re-embeds only the chunk of the file whose content
    /// changed and reports one changed and one reused file.
    #[test]
    fn sqlite_rebuild_caches_unchanged_chunks_and_tracks_file_changes() {
        let workspace = tempdir("sqlite_rebuild_caches_unchanged_chunks_and_tracks_file_changes");
        write(workspace.join("src/a.rs"), "pub fn a() {}\n");
        write(workspace.join("src/b.rs"), "pub fn b() {}\n");
        let store = SqliteCodeIndexStore::open(workspace.with_extension("sqlite3")).unwrap();

        let initial = store.rebuild_workspace(&workspace).unwrap();
        let initial_stats = &initial.generation.stats;
        assert_eq!(initial_stats.embedded_chunk_count, 2);
        assert_eq!(initial_stats.cached_embedding_count, 0);

        write(workspace.join("src/a.rs"), "pub fn a_changed() {}\n");
        let rebuilt = store.rebuild_workspace(&workspace).unwrap();
        let rebuilt_stats = &rebuilt.generation.stats;

        assert_eq!(rebuilt.changed_file_count, 1);
        assert_eq!(rebuilt.reused_file_count, 1);
        assert_eq!(rebuilt_stats.embedded_chunk_count, 1);
        assert_eq!(rebuilt_stats.cached_embedding_count, 1);
    }

    /// After a file is deleted and the index rebuilt, neither the chunk list
    /// nor search results mention it.
    #[test]
    fn sqlite_rebuild_removes_deleted_files_from_chunks_and_results() {
        let workspace = tempdir("sqlite_rebuild_removes_deleted_files_from_chunks_and_results");
        write(workspace.join("src/keep.rs"), "pub fn keep_token() {}\n");
        write(workspace.join("src/delete.rs"), "pub fn delete_token() {}\n");
        let store = SqliteCodeIndexStore::open(workspace.with_extension("sqlite3")).unwrap();
        store.rebuild_workspace(&workspace).unwrap();

        fs::remove_file(workspace.join("src/delete.rs")).unwrap();
        let rebuilt = store.rebuild_workspace(&workspace).unwrap();
        assert_eq!(rebuilt.deleted_file_count, 1);

        let deleted_path = PathBuf::from("src/delete.rs");
        let remaining = store.list_chunks().unwrap();
        assert!(
            !remaining
                .iter()
                .any(|stored| stored.chunk.path == deleted_path)
        );

        let request = CodeIndexSearchRequest {
            query_id: "q1".to_string(),
            query: "delete_token".to_string(),
            workspace_root: workspace.clone(),
            limit: 10,
        };
        let response = store.search(request).unwrap();
        assert!(response.results.is_empty());
    }

    /// Writes `contents` to `path`, creating parent directories first.
    fn write(path: PathBuf, contents: &str) {
        let parent = path.parent().unwrap();
        fs::create_dir_all(parent).unwrap();
        fs::write(&path, contents).unwrap();
    }

    /// Creates a fresh directory under the system temp dir whose name
    /// combines `name`, the process id, and the current time in nanoseconds.
    fn tempdir(name: &str) -> PathBuf {
        let pid = std::process::id();
        let nanos = OffsetDateTime::now_utc().unix_timestamp_nanos();
        let dir = std::env::temp_dir().join(format!(
            "roder-code-index-{name}-{}-{}",
            pid, nanos
        ));
        fs::create_dir_all(&dir).unwrap();
        dir
    }
}