1use crate::embed::EmbeddedChunk;
2use crate::info_print;
3use anyhow::{anyhow, Result};
4use arroy::distances::Cosine;
5use arroy::{Database as ArroyDatabase, ItemId, Reader, Writer};
6use heed::byteorder::BigEndian;
7use heed::types::*;
8use heed::{Database, EnvFlags, EnvOpenOptions};
9use rand::rngs::StdRng;
10use rand::SeedableRng;
11use serde::{Deserialize, Serialize};
12use std::fs;
13use std::num::NonZeroUsize;
14use std::path::Path;
15
16#[derive(Debug, Clone, Serialize, Deserialize)]
18pub struct ChunkMetadata {
19 pub content: String,
20 pub path: String,
21 pub start_line: usize,
22 pub end_line: usize,
23 pub kind: String,
24 pub signature: Option<String>,
25 pub docstring: Option<String>,
26 pub context: Option<String>,
27 pub hash: String,
28 #[serde(default)]
30 pub context_prev: Option<String>,
31 #[serde(default)]
33 pub context_next: Option<String>,
34 #[serde(default)]
36 pub searchable_text: String,
37}
38
39impl ChunkMetadata {
40 fn from_embedded_chunk(chunk: &EmbeddedChunk) -> Self {
41 let searchable_text = {
43 let mut parts = Vec::new();
44
45 if let Some(sig) = &chunk.chunk.signature {
47 parts.push(sig.clone());
48 }
49
50 if let Some(doc) = &chunk.chunk.docstring {
52 parts.push(doc.clone());
53 }
54
55 parts.push(format!("{:?}", chunk.chunk.kind));
57
58 parts.push(chunk.chunk.content.clone());
60
61 parts.join("\n")
62 };
63
64 Self {
65 content: chunk.chunk.content.clone(),
66 path: chunk.chunk.path.clone(),
67 start_line: chunk.chunk.start_line,
68 end_line: chunk.chunk.end_line,
69 kind: format!("{:?}", chunk.chunk.kind),
70 signature: chunk.chunk.signature.clone(),
71 docstring: chunk.chunk.docstring.clone(),
72 context: if chunk.chunk.context.is_empty() {
73 None
74 } else {
75 Some(chunk.chunk.context.join(" > "))
76 },
77 hash: chunk.chunk.hash.clone(),
78 context_prev: chunk.chunk.context_prev.clone(),
79 context_next: chunk.chunk.context_next.clone(),
80 searchable_text,
81 }
82 }
83}
84
85pub struct VectorStore {
93 env: heed::Env,
94 vectors: ArroyDatabase<Cosine>,
95 chunks: Database<U32<BigEndian>, SerdeBincode<ChunkMetadata>>,
96 next_id: u32,
97 dimensions: usize,
98 indexed: bool,
99}
100
101impl VectorStore {
102 pub fn clear_stale_readers(&self) -> anyhow::Result<usize> {
110 let cleared = self.env.clear_stale_readers()?;
111 if cleared > 0 {
112 tracing::info!("Cleared {} stale LMDB reader slots", cleared);
113 }
114 Ok(cleared)
115 }
116
117 pub fn new(db_path: &Path, dimensions: usize) -> Result<Self> {
123 info_print!("📦 Opening vector database at: {}", db_path.display());
124
125 std::fs::create_dir_all(db_path)?;
127
128 cleanup_stale_del_files(db_path)?;
130
131 let map_size_mb = std::env::var("CODESEARCH_LMDB_MAP_SIZE_MB")
133 .ok()
134 .and_then(|s| s.parse::<usize>().ok())
135 .unwrap_or(crate::constants::DEFAULT_LMDB_MAP_SIZE_MB);
136 let env = unsafe {
137 EnvOpenOptions::new()
138 .map_size(map_size_mb * 1024 * 1024)
139 .max_dbs(10)
140 .open(db_path)?
141 };
142
143 let mut wtxn = env.write_txn()?;
145
146 let vectors: ArroyDatabase<Cosine> = env.create_database(&mut wtxn, Some("vectors"))?;
147 let chunks: Database<U32<BigEndian>, SerdeBincode<ChunkMetadata>> =
148 env.create_database(&mut wtxn, Some("chunks"))?;
149
150 let next_id = match chunks.last(&wtxn)? {
154 Some((max_key, _)) => max_key + 1,
155 None => 0,
156 };
157
158 wtxn.commit()?;
159
160 let indexed = if next_id > 0 {
162 let rtxn = env.read_txn()?;
163 match Reader::open(&rtxn, 0, vectors) {
164 Ok(_) => {
165 tracing::debug!("Index detected: Reader::open succeeded");
166 true
167 }
168 Err(e) => {
169 tracing::debug!("Index not detected: Reader::open failed: {:?}", e);
170 false
171 }
172 }
173 } else {
174 false
175 };
176
177 info_print!("✅ Database opened (next_id: {})", next_id);
178
179 Ok(Self {
180 env,
181 vectors,
182 chunks,
183 next_id,
184 dimensions,
185 indexed,
186 })
187 }
188
189 pub fn open_readonly(db_path: &Path, dimensions: usize) -> Result<Self> {
195 tracing::debug!(
196 "📦 Opening vector database (read-only) at: {}",
197 db_path.display()
198 );
199
200 if !db_path.exists() {
201 return Err(anyhow::anyhow!(
202 "Database does not exist at: {}",
203 db_path.display()
204 ));
205 }
206
207 let map_size_mb = std::env::var("CODESEARCH_LMDB_MAP_SIZE_MB")
209 .ok()
210 .and_then(|s| s.parse::<usize>().ok())
211 .unwrap_or(crate::constants::DEFAULT_LMDB_MAP_SIZE_MB);
212 let env = unsafe {
213 EnvOpenOptions::new()
214 .map_size(map_size_mb * 1024 * 1024)
215 .max_dbs(10)
216 .flags(EnvFlags::READ_ONLY)
217 .open(db_path)?
218 };
219
220 let rtxn = env.read_txn()?;
222
223 let vectors: ArroyDatabase<Cosine> = env
224 .open_database(&rtxn, Some("vectors"))?
225 .ok_or_else(|| anyhow::anyhow!("vectors database not found"))?;
226 let chunks: Database<U32<BigEndian>, SerdeBincode<ChunkMetadata>> = env
227 .open_database(&rtxn, Some("chunks"))?
228 .ok_or_else(|| anyhow::anyhow!("chunks database not found"))?;
229
230 let next_id = match chunks.last(&rtxn)? {
233 Some((max_key, _)) => max_key + 1,
234 None => 0,
235 };
236
237 let indexed = if next_id > 0 {
239 Reader::open(&rtxn, 0, vectors).is_ok()
240 } else {
241 false
242 };
243
244 drop(rtxn);
245
246 tracing::debug!(
247 "✅ Database opened read-only (next_id: {}, indexed: {})",
248 next_id,
249 indexed
250 );
251
252 Ok(Self {
253 env,
254 vectors,
255 chunks,
256 next_id,
257 dimensions,
258 indexed,
259 })
260 }
261
262 #[allow(dead_code)] pub fn insert_chunks(&mut self, chunks: Vec<EmbeddedChunk>) -> Result<usize> {
267 if chunks.is_empty() {
268 return Ok(0);
269 }
270
271 eprintln!("📊 Inserting {} chunks...", chunks.len());
272
273 let mut wtxn = self.env.write_txn()?;
274 let writer = Writer::new(self.vectors, 0, self.dimensions);
275
276 for chunk in &chunks {
277 let id = self.next_id;
278
279 if chunk.embedding.len() != self.dimensions {
281 return Err(anyhow!(
282 "Embedding dimension mismatch: expected {}, got {}",
283 self.dimensions,
284 chunk.embedding.len()
285 ));
286 }
287
288 writer.add_item(&mut wtxn, id, &chunk.embedding)?;
290
291 let metadata = ChunkMetadata::from_embedded_chunk(chunk);
293 self.chunks.put(&mut wtxn, &id, &metadata)?;
294
295 self.next_id += 1;
296 }
297
298 wtxn.commit()?;
299
300 self.indexed = false;
302
303 eprintln!(
304 "✅ Inserted {} chunks (IDs: {}-{})",
305 chunks.len(),
306 self.next_id - chunks.len() as u32,
307 self.next_id - 1
308 );
309
310 Ok(chunks.len())
311 }
312
313 pub fn build_index(&mut self) -> Result<()> {
317 let mut wtxn = self.env.write_txn()?;
318 let writer = Writer::new(self.vectors, 0, self.dimensions);
319
320 let mut rng = StdRng::seed_from_u64(rand::random());
321 writer.builder(&mut rng).build(&mut wtxn)?;
322
323 wtxn.commit()?;
324
325 self.indexed = true;
326
327 Ok(())
328 }
329
330 pub fn search(&self, query_embedding: &[f32], limit: usize) -> Result<Vec<SearchResult>> {
339 if query_embedding.len() != self.dimensions {
340 return Err(anyhow!(
341 "Query embedding dimension mismatch: expected {}, got {}",
342 self.dimensions,
343 query_embedding.len()
344 ));
345 }
346
347 if !self.indexed {
348 return Err(anyhow!(
349 "Index not built. Call build_index() after inserting chunks."
350 ));
351 }
352
353 let rtxn = self.env.read_txn()?;
354 let reader = Reader::open(&rtxn, 0, self.vectors)?;
355
356 let mut query = reader.nns(limit);
358
359 if let Some(n_trees) = NonZeroUsize::new(reader.n_trees()) {
361 if let Some(search_k) = NonZeroUsize::new(limit * n_trees.get() * 15) {
362 query.search_k(search_k);
363 }
364 }
365
366 let results = query.by_vector(&rtxn, query_embedding)?;
367
368 let mut search_results = Vec::new();
370
371 for (id, distance) in results {
372 if let Some(metadata) = self.chunks.get(&rtxn, &id)? {
373 search_results.push(SearchResult {
374 id,
375 content: metadata.content,
376 path: metadata.path,
377 start_line: metadata.start_line,
378 end_line: metadata.end_line,
379 kind: metadata.kind,
380 signature: metadata.signature,
381 docstring: metadata.docstring,
382 context: metadata.context,
383 hash: metadata.hash,
384 distance,
385 score: 1.0 - distance, context_prev: metadata.context_prev,
387 context_next: metadata.context_next,
388 });
389 }
390 }
391
392 Ok(search_results)
393 }
394
395 pub fn stats(&self) -> Result<StoreStats> {
397 let rtxn = self.env.read_txn()?;
398
399 let total_chunks = self.chunks.len(&rtxn)?;
400
401 let mut unique_files = std::collections::HashSet::new();
403 for result in self.chunks.iter(&rtxn)? {
404 let (_, metadata) = result?;
405 unique_files.insert(metadata.path.clone());
406 }
407
408 let max_chunk_id = self.chunks.last(&rtxn)?.map(|(k, _)| k).unwrap_or(0);
410
411 Ok(StoreStats {
412 total_chunks: total_chunks as usize,
413 total_files: unique_files.len(),
414 indexed: self.indexed,
415 dimensions: self.dimensions,
416 max_chunk_id,
417 })
418 }
419
420 pub fn delete_chunks(&mut self, chunk_ids: &[u32]) -> Result<usize> {
424 if chunk_ids.is_empty() {
425 return Ok(0);
426 }
427
428 let mut wtxn = self.env.write_txn()?;
429 let writer = Writer::new(self.vectors, 0, self.dimensions);
430
431 let mut deleted = 0;
432 for &id in chunk_ids {
433 if writer.del_item(&mut wtxn, id).is_ok() {
435 deleted += 1;
436 }
437 self.chunks.delete(&mut wtxn, &id)?;
439 }
440
441 wtxn.commit()?;
442
443 if deleted > 0 {
445 self.indexed = false;
446 }
447
448 Ok(deleted)
449 }
450
451 pub fn insert_chunks_with_ids(&mut self, chunks: Vec<EmbeddedChunk>) -> Result<Vec<u32>> {
458 if chunks.is_empty() {
459 return Ok(vec![]);
460 }
461
462 let start_id = self.next_id;
463 let mut wtxn = self.env.write_txn()?;
464 let writer = Writer::new(self.vectors, 0, self.dimensions);
465
466 for chunk in &chunks {
467 let id = self.next_id;
468
469 if chunk.embedding.len() != self.dimensions {
470 return Err(anyhow!(
471 "Embedding dimension mismatch: expected {}, got {}",
472 self.dimensions,
473 chunk.embedding.len()
474 ));
475 }
476
477 writer.add_item(&mut wtxn, id, &chunk.embedding)?;
478 let metadata = ChunkMetadata::from_embedded_chunk(chunk);
479 self.chunks.put(&mut wtxn, &id, &metadata)?;
480
481 self.next_id += 1;
482 }
483
484 wtxn.commit()?;
485 self.indexed = false;
486
487 let ids: Vec<u32> = (start_id..self.next_id).collect();
488 Ok(ids)
489 }
490
491 #[allow(dead_code)] pub fn clear(&mut self) -> Result<()> {
494 eprintln!("🗑️ Clearing database...");
495
496 let mut wtxn = self.env.write_txn()?;
497
498 self.chunks.clear(&mut wtxn)?;
500 self.vectors.clear(&mut wtxn)?;
501
502 wtxn.commit()?;
503
504 self.next_id = 0;
505 self.indexed = false;
506
507 eprintln!("✅ Database cleared");
508 Ok(())
509 }
510
511 pub fn get_chunk(&self, id: u32) -> Result<Option<ChunkMetadata>> {
513 let rtxn = self.env.read_txn()?;
514 Ok(self.chunks.get(&rtxn, &id)?)
515 }
516
517 pub fn get_chunk_as_result(&self, id: u32) -> Result<Option<SearchResult>> {
519 let rtxn = self.env.read_txn()?;
520 if let Some(meta) = self.chunks.get(&rtxn, &id)? {
521 Ok(Some(SearchResult {
522 id,
523 content: meta.content,
524 path: meta.path,
525 start_line: meta.start_line,
526 end_line: meta.end_line,
527 kind: meta.kind,
528 signature: meta.signature,
529 docstring: meta.docstring,
530 context: meta.context,
531 hash: meta.hash,
532 distance: 0.0,
533 score: 0.0, context_prev: meta.context_prev,
535 context_next: meta.context_next,
536 }))
537 } else {
538 Ok(None)
539 }
540 }
541
542 pub fn all_chunks(&self) -> Result<Vec<(u32, ChunkMetadata)>> {
546 let rtxn = self.env.read_txn()?;
547 let mut result = Vec::new();
548 for entry in self.chunks.iter(&rtxn)? {
549 let (id, metadata) = entry?;
550 result.push((id, metadata));
551 }
552 Ok(result)
553 }
554
555 #[allow(dead_code)] pub fn db_size(&self) -> Result<u64> {
558 let info = self.env.info();
559 Ok(info.map_size as u64)
560 }
561
562 pub fn is_indexed(&self) -> bool {
564 self.indexed
565 }
566}
567
568#[derive(Debug, Clone)]
570#[allow(dead_code)] pub struct SearchResult {
572 pub id: ItemId,
573 pub content: String,
574 pub path: String,
575 pub start_line: usize,
576 pub end_line: usize,
577 pub kind: String,
578 pub signature: Option<String>,
579 pub docstring: Option<String>,
580 pub context: Option<String>,
581 pub hash: String,
582 pub distance: f32,
583 pub score: f32, pub context_prev: Option<String>,
586 pub context_next: Option<String>,
588}
589
/// Summary figures produced by `VectorStore::stats`.
#[derive(Clone, Debug)]
pub struct StoreStats {
    /// Number of metadata records in the store.
    pub total_chunks: usize,
    /// Number of distinct file paths among those records.
    pub total_files: usize,
    /// Value of the store's `indexed` flag when the stats were taken.
    pub indexed: bool,
    /// Embedding length the store was opened with.
    pub dimensions: usize,
    /// Largest stored chunk id, or `0` when the store is empty.
    pub max_chunk_id: u32,
}
601
602fn cleanup_stale_del_files(db_path: &Path) -> Result<()> {
608 if !db_path.exists() {
609 return Ok(());
610 }
611
612 let entries = fs::read_dir(db_path)?;
613 let mut cleaned = 0;
614
615 for entry in entries {
616 let entry = entry?;
617 let path = entry.path();
618
619 if path.extension().and_then(|s| s.to_str()) == Some("del") {
621 fs::remove_file(&path)?;
623 cleaned += 1;
624 }
625 }
626
627 if cleaned > 0 {
628 tracing::debug!("Cleaned up {} stale .del files", cleaned);
629 }
630
631 Ok(())
632}
633
#[cfg(test)]
mod tests {
    use super::*;
    use crate::chunker::{Chunk, ChunkKind};
    use crate::embed::EmbeddedChunk;
    use tempfile::tempdir;

    /// Builds a function-kind chunk paired with a 4-dimensional embedding.
    fn function_chunk(
        content: &str,
        start_line: usize,
        end_line: usize,
        path: &str,
        embedding: [f32; 4],
    ) -> EmbeddedChunk {
        EmbeddedChunk::new(
            Chunk::new(
                content.to_string(),
                start_line,
                end_line,
                ChunkKind::Function,
                path.to_string(),
            ),
            embedding.to_vec(),
        )
    }

    /// A freshly created store keeps its dimensions and is not indexed.
    #[test]
    fn test_vector_store_creation() {
        let dir = tempdir().unwrap();
        let location = dir.path().join("test.db");

        let opened = VectorStore::new(&location, 384);
        assert!(opened.is_ok());

        let store = opened.unwrap();
        assert_eq!(store.dimensions, 384);
        assert!(!store.is_indexed());
    }

    /// The chunk whose embedding is closest to the query ranks first.
    #[test]
    fn test_insert_and_search() {
        let dir = tempdir().unwrap();
        let location = dir.path().join("test.db");
        let mut store = VectorStore::new(&location, 4).unwrap();

        let batch = vec![
            function_chunk("fn authenticate() {}", 0, 1, "auth.rs", [1.0, 0.0, 0.0, 0.0]),
            function_chunk("fn calculate() {}", 2, 3, "math.rs", [0.0, 1.0, 0.0, 0.0]),
        ];

        let inserted = store.insert_chunks(batch).unwrap();
        assert_eq!(inserted, 2);

        store.build_index().unwrap();
        assert!(store.is_indexed());

        let query = vec![0.9, 0.1, 0.0, 0.0];
        let hits = store.search(&query, 2).unwrap();

        assert_eq!(hits.len(), 2);
        assert!(hits[0].content.contains("authenticate"));
        assert!(hits[0].score > hits[1].score);
    }

    /// Stats report chunk count, distinct files, index flag and dimensions.
    #[test]
    fn test_stats() {
        let dir = tempdir().unwrap();
        let location = dir.path().join("test.db");
        let mut store = VectorStore::new(&location, 4).unwrap();

        let batch = vec![
            function_chunk("fn test1() {}", 0, 1, "file1.rs", [1.0, 0.0, 0.0, 0.0]),
            function_chunk("fn test2() {}", 0, 1, "file2.rs", [0.0, 1.0, 0.0, 0.0]),
        ];

        store.insert_chunks(batch).unwrap();
        store.build_index().unwrap();

        let summary = store.stats().unwrap();
        assert_eq!(summary.total_chunks, 2);
        assert_eq!(summary.total_files, 2);
        assert!(summary.indexed);
        assert_eq!(summary.dimensions, 4);
    }

    /// Clearing removes every chunk and drops the indexed flag.
    #[test]
    fn test_clear() {
        let dir = tempdir().unwrap();
        let location = dir.path().join("test.db");
        let mut store = VectorStore::new(&location, 4).unwrap();

        let batch = vec![function_chunk(
            "fn test() {}",
            0,
            1,
            "test.rs",
            [1.0, 0.0, 0.0, 0.0],
        )];

        store.insert_chunks(batch).unwrap();
        store.build_index().unwrap();

        let before = store.stats().unwrap();
        assert_eq!(before.total_chunks, 1);

        store.clear().unwrap();

        let after = store.stats().unwrap();
        assert_eq!(after.total_chunks, 0);
        assert!(!after.indexed);
    }

    /// The first inserted chunk is retrievable under id 0.
    #[test]
    fn test_get_chunk() {
        let dir = tempdir().unwrap();
        let location = dir.path().join("test.db");
        let mut store = VectorStore::new(&location, 4).unwrap();

        let batch = vec![function_chunk(
            "fn test() {}",
            0,
            1,
            "test.rs",
            [1.0, 0.0, 0.0, 0.0],
        )];

        store.insert_chunks(batch).unwrap();

        let found = store.get_chunk(0).unwrap();
        assert!(found.is_some());

        let record = found.unwrap();
        assert_eq!(record.content, "fn test() {}");
        assert_eq!(record.path, "test.rs");
    }

    /// Data written by one store instance is visible after reopening.
    #[test]
    fn test_persistence() {
        let dir = tempdir().unwrap();
        let location = dir.path().join("test.db");

        {
            let mut writer_store = VectorStore::new(&location, 4).unwrap();

            let batch = vec![function_chunk(
                "fn test() {}",
                0,
                1,
                "test.rs",
                [1.0, 0.0, 0.0, 0.0],
            )];

            writer_store.insert_chunks(batch).unwrap();
            writer_store.build_index().unwrap();
        }

        {
            let reopened = VectorStore::new(&location, 4).unwrap();

            let summary = reopened.stats().unwrap();
            assert_eq!(summary.total_chunks, 1);

            let found = reopened.get_chunk(0).unwrap();
            assert!(found.is_some());
        }
    }
}