1pub mod paths;
2pub mod state;
3
4use std::collections::HashSet;
5use std::path::{Path, PathBuf};
6
7use anyhow::{Context, Result};
8use globset::{Glob, GlobSet, GlobSetBuilder};
9use ignore::gitignore::GitignoreBuilder;
10use ignore::WalkBuilder;
11use indicatif::{ProgressBar, ProgressStyle};
12use next_plaid::{
13 delete_from_index, filtering, IndexConfig, MmapIndex, SearchParameters, UpdateConfig,
14};
15use next_plaid_onnx::Colbert;
16use serde::{Deserialize, Serialize};
17
18use crate::embed::build_embedding_text;
19use crate::parser::{build_call_graph, detect_language, extract_units, CodeUnit, Language};
20
21use paths::{get_index_dir_for_project, get_vector_index_path, ProjectMetadata};
22use state::{get_mtime, hash_file, FileInfo, IndexState};
23
/// Files larger than this (512 KiB) are skipped during scanning.
const MAX_FILE_SIZE: u64 = 512 * 1024;
30
/// Summary of what an indexing pass did.
///
/// Derives `Default` (all counters zero) for consistency with
/// [`UpdatePlan`] and to spare callers the all-zero literal.
#[derive(Debug, Default)]
pub struct UpdateStats {
    /// Files newly added to the index.
    pub added: usize,
    /// Files re-indexed because their content hash changed.
    pub changed: usize,
    /// Files removed from the index.
    pub deleted: usize,
    /// Files whose content hash was unchanged.
    pub unchanged: usize,
    /// Files skipped (e.g. larger than `MAX_FILE_SIZE`).
    pub skipped: usize,
}
39
/// Pending work computed by diffing on-disk files against the saved state.
#[derive(Debug, Default)]
pub struct UpdatePlan {
    /// Files present on disk but absent from the index state.
    pub added: Vec<PathBuf>,
    /// Files whose content hash differs from the indexed one.
    pub changed: Vec<PathBuf>,
    /// Files in the index state that no longer exist on disk.
    pub deleted: Vec<PathBuf>,
    /// Count of files whose content hash matches the indexed one.
    pub unchanged: usize,
}
47
/// Builds and updates the on-disk vector index for one project.
pub struct IndexBuilder {
    /// ColBERT encoder used to embed code units.
    model: Colbert,
    /// Root of the project being indexed.
    project_root: PathBuf,
    /// Directory where index data and state are stored.
    index_dir: PathBuf,
}
53
54impl IndexBuilder {
55 pub fn new(project_root: &Path, model_path: &Path) -> Result<Self> {
56 let model = Colbert::builder(model_path)
57 .with_quantized(true)
58 .build()
59 .context("Failed to load ColBERT model")?;
60
61 let index_dir = get_index_dir_for_project(project_root)?;
62
63 Ok(Self {
64 model,
65 project_root: project_root.to_path_buf(),
66 index_dir,
67 })
68 }
69
70 pub fn index_dir(&self) -> &Path {
72 &self.index_dir
73 }
74
75 pub fn index(&self, languages: Option<&[Language]>, force: bool) -> Result<UpdateStats> {
80 let state = IndexState::load(&self.index_dir)?;
81 let index_path = get_vector_index_path(&self.index_dir);
82 let index_exists = index_path.join("metadata.json").exists();
83 let filtering_exists = filtering::exists(index_path.to_str().unwrap());
84
85 if force || !index_exists || !filtering_exists {
87 return self.full_rebuild(languages);
88 }
89
90 if state.files.is_empty() {
93 return self.full_rebuild(languages);
94 }
95
96 self.incremental_update(&state, languages)
97 }
98
    /// Indexes exactly the given project-relative `files`, skipping any that
    /// are outside the project root, missing, ignored, gitignored, or
    /// unchanged since the last run. Intended for targeted updates instead
    /// of a full scan.
    ///
    /// # Errors
    /// Fails when state loading, file hashing/reading, encoding, or index
    /// writes fail.
    pub fn index_specific_files(&self, files: &[PathBuf]) -> Result<UpdateStats> {
        if files.is_empty() {
            return Ok(UpdateStats {
                added: 0,
                changed: 0,
                deleted: 0,
                unchanged: 0,
                skipped: 0,
            });
        }

        let state = IndexState::load(&self.index_dir)?;
        let index_path = get_vector_index_path(&self.index_dir);
        let index_path_str = index_path.to_str().unwrap();

        // Honor the project's .gitignore when deciding which files to take.
        // A failed build leaves `gitignore` as None, i.e. no filtering.
        let gitignore = {
            let mut builder = GitignoreBuilder::new(&self.project_root);
            let gitignore_path = self.project_root.join(".gitignore");
            if gitignore_path.exists() {
                let _ = builder.add(&gitignore_path);
            }
            builder.build().ok()
        };

        // Classify each requested file as added / changed / unchanged.
        let mut files_added = Vec::new();
        let mut files_changed = Vec::new();
        let mut unchanged = 0;

        for path in files {
            // Reject paths that would escape the project root.
            if !is_within_project_root(&self.project_root, path) {
                continue;
            }

            let full_path = self.project_root.join(path);
            if !full_path.exists() {
                continue;
            }

            if should_ignore(&full_path) {
                continue;
            }

            if let Some(ref gi) = gitignore {
                if gi
                    .matched_path_or_any_parents(path, full_path.is_dir())
                    .is_ignore()
                {
                    continue;
                }
            }

            // Content-hash comparison against the saved state decides the
            // bucket; mtime alone is not trusted.
            let hash = hash_file(&full_path)?;
            match state.files.get(path) {
                Some(info) if info.content_hash == hash => {
                    unchanged += 1;
                }
                Some(_) => {
                    files_changed.push(path.clone());
                }
                None => {
                    files_added.push(path.clone());
                }
            }
        }

        let files_to_index: Vec<PathBuf> = files_added
            .iter()
            .chain(files_changed.iter())
            .cloned()
            .collect();

        if files_to_index.is_empty() {
            return Ok(UpdateStats {
                added: 0,
                changed: 0,
                deleted: 0,
                unchanged,
                skipped: 0,
            });
        }

        // Remove stale entries for changed files before re-adding them so
        // the index never holds two generations of the same file.
        if filtering::exists(index_path_str) {
            for file_path in &files_changed {
                self.delete_file_from_index(index_path_str, file_path)?;
            }
        }

        let mut new_state = state.clone();
        let mut new_units: Vec<CodeUnit> = Vec::new();

        let pb = ProgressBar::new(files_to_index.len() as u64);
        pb.set_style(
            ProgressStyle::default_bar()
                .template("{spinner:.green} [{bar:40.cyan/blue}] {pos}/{len} {msg}")
                .unwrap()
                .progress_chars("█▓░"),
        );
        pb.set_message("Parsing files...");

        // Parse each file into code units and record its new hash/mtime.
        for path in &files_to_index {
            let full_path = self.project_root.join(path);
            let lang = match detect_language(&full_path) {
                Some(l) => l,
                None => {
                    pb.inc(1);
                    continue;
                }
            };
            let source = std::fs::read_to_string(&full_path)
                .with_context(|| format!("Failed to read {}", full_path.display()))?;
            let units = extract_units(path, &source, lang);
            new_units.extend(units);

            new_state.files.insert(
                path.clone(),
                FileInfo {
                    content_hash: hash_file(&full_path)?,
                    mtime: get_mtime(&full_path)?,
                },
            );
            pb.inc(1);
        }
        pb.finish_and_clear();

        if new_units.is_empty() {
            return Ok(UpdateStats {
                added: 0,
                changed: 0,
                deleted: 0,
                unchanged,
                skipped: 0,
            });
        }

        // Resolve caller/callee links across the freshly parsed units.
        build_call_graph(&mut new_units);

        let pb = ProgressBar::new(new_units.len() as u64);
        pb.set_style(
            ProgressStyle::default_bar()
                .template("{spinner:.green} [{bar:40.cyan/blue}] {pos}/{len} {msg}")
                .unwrap()
                .progress_chars("█▓░"),
        );
        pb.set_message("Encoding...");

        std::fs::create_dir_all(&index_path)?;
        let config = IndexConfig::default();
        let update_config = UpdateConfig::default();

        // Index in chunks so memory stays bounded; encode in smaller
        // batches within each chunk.
        const CHUNK_SIZE: usize = 500;
        let encode_batch_size = 64;

        for (chunk_idx, unit_chunk) in new_units.chunks(CHUNK_SIZE).enumerate() {
            let texts: Vec<String> = unit_chunk.iter().map(build_embedding_text).collect();
            let text_refs: Vec<&str> = texts.iter().map(|s| s.as_str()).collect();

            let mut chunk_embeddings = Vec::new();
            for batch in text_refs.chunks(encode_batch_size) {
                let batch_embeddings = self
                    .model
                    .encode_documents(batch, None)
                    .context("Failed to encode documents")?;
                chunk_embeddings.extend(batch_embeddings);

                let progress = chunk_idx * CHUNK_SIZE + chunk_embeddings.len();
                pb.set_position(progress.min(new_units.len()) as u64);
            }

            let (_, doc_ids) = MmapIndex::update_or_create(
                &chunk_embeddings,
                index_path_str,
                &config,
                &update_config,
            )?;

            let metadata: Vec<serde_json::Value> = unit_chunk
                .iter()
                .map(|u| serde_json::to_value(u).unwrap())
                .collect();

            // Create the filtering store on first write, update afterwards.
            if filtering::exists(index_path_str) {
                filtering::update(index_path_str, &metadata, &doc_ids)?;
            } else {
                filtering::create(index_path_str, &metadata, &doc_ids)?;
            }
        }

        pb.finish_and_clear();

        new_state.save(&self.index_dir)?;

        Ok(UpdateStats {
            added: files_added.len(),
            changed: files_changed.len(),
            deleted: 0,
            unchanged,
            skipped: 0,
        })
    }
320
321 pub fn scan_files_matching_patterns(&self, patterns: &[String]) -> Result<Vec<PathBuf>> {
324 let (all_files, _skipped) = self.scan_files(None)?;
325
326 if patterns.is_empty() {
327 return Ok(all_files);
328 }
329
330 let filtered: Vec<PathBuf> = all_files
331 .into_iter()
332 .filter(|path| matches_glob_pattern(path, patterns))
333 .collect();
334
335 Ok(filtered)
336 }
337
    /// Rebuilds the index from scratch: removes any existing vector index,
    /// rescans the project, re-parses and re-encodes every file, and writes
    /// fresh state plus project metadata.
    fn full_rebuild(&self, languages: Option<&[Language]>) -> Result<UpdateStats> {
        let index_path = get_vector_index_path(&self.index_dir);
        if index_path.exists() {
            std::fs::remove_dir_all(&index_path)?;
        }

        let (files, skipped) = self.scan_files(languages)?;
        let mut state = IndexState::default();
        let mut all_units: Vec<CodeUnit> = Vec::new();

        let pb = ProgressBar::new(files.len() as u64);
        pb.set_style(
            ProgressStyle::default_bar()
                .template("{spinner:.green} [{bar:40.cyan/blue}] {pos}/{len} {msg}")
                .unwrap()
                .progress_chars("█▓░"),
        );
        pb.set_message("Parsing files...");

        // Parse every scanned file into code units and record its
        // hash/mtime in the fresh state.
        for path in &files {
            let full_path = self.project_root.join(path);
            let lang = match detect_language(&full_path) {
                Some(l) => l,
                None => {
                    pb.inc(1);
                    continue;
                }
            };
            let source = std::fs::read_to_string(&full_path)
                .with_context(|| format!("Failed to read {}", full_path.display()))?;
            let units = extract_units(path, &source, lang);
            all_units.extend(units);

            state.files.insert(
                path.clone(),
                FileInfo {
                    content_hash: hash_file(&full_path)?,
                    mtime: get_mtime(&full_path)?,
                },
            );
            pb.inc(1);
        }
        pb.finish_and_clear();

        // Resolve caller/callee links across all parsed units.
        build_call_graph(&mut all_units);

        if !all_units.is_empty() {
            self.write_index_with_progress(&all_units)?;
        }

        state.save(&self.index_dir)?;
        ProjectMetadata::new(&self.project_root).save(&self.index_dir)?;

        Ok(UpdateStats {
            added: files.len(),
            changed: 0,
            deleted: 0,
            unchanged: 0,
            skipped,
        })
    }
405
    /// Applies an incremental update: deletes entries for changed/removed
    /// files, re-parses and re-encodes added/changed files, and saves the
    /// updated state.
    ///
    /// Orphaned index entries (files indexed but no longer on disk, e.g.
    /// after a crash between index write and state save) are cleaned up
    /// first.
    fn incremental_update(
        &self,
        old_state: &IndexState,
        languages: Option<&[Language]>,
    ) -> Result<UpdateStats> {
        let plan = self.compute_update_plan(old_state, languages)?;
        let index_path = get_vector_index_path(&self.index_dir);
        let index_path_str = index_path.to_str().unwrap();

        let orphaned_deleted = self.cleanup_orphaned_entries(index_path_str)?;

        // Nothing to do — short-circuit without touching the index.
        if plan.added.is_empty()
            && plan.changed.is_empty()
            && plan.deleted.is_empty()
            && orphaned_deleted == 0
        {
            return Ok(UpdateStats {
                added: 0,
                changed: 0,
                deleted: 0,
                unchanged: plan.unchanged,
                skipped: 0,
            });
        }

        let mut state = old_state.clone();

        // Changed files are deleted first and re-added below; deleted files
        // are just removed.
        let files_to_delete: Vec<&PathBuf> =
            plan.changed.iter().chain(plan.deleted.iter()).collect();

        for file_path in &files_to_delete {
            self.delete_file_from_index(index_path_str, file_path)?;
        }

        for path in &plan.deleted {
            state.files.remove(path);
        }

        // Also drop state entries for files that vanished from disk but
        // were not in the plan (defensive against stale state).
        let stale_paths: Vec<PathBuf> = state
            .files
            .keys()
            .filter(|p| !self.project_root.join(p).exists())
            .cloned()
            .collect();
        for path in stale_paths {
            state.files.remove(&path);
        }

        let files_to_index: Vec<PathBuf> = plan
            .added
            .iter()
            .chain(plan.changed.iter())
            .cloned()
            .collect();

        let mut new_units: Vec<CodeUnit> = Vec::new();

        // Only show a progress bar when there is work to display.
        let pb = if !files_to_index.is_empty() {
            let pb = ProgressBar::new(files_to_index.len() as u64);
            pb.set_style(
                ProgressStyle::default_bar()
                    .template("{spinner:.green} [{bar:40.cyan/blue}] {pos}/{len} {msg}")
                    .unwrap()
                    .progress_chars("█▓░"),
            );
            pb.set_message("Parsing files...");
            Some(pb)
        } else {
            None
        };

        // Parse each added/changed file and record its new hash/mtime.
        for path in &files_to_index {
            let full_path = self.project_root.join(path);
            let lang = match detect_language(&full_path) {
                Some(l) => l,
                None => {
                    if let Some(ref pb) = pb {
                        pb.inc(1);
                    }
                    continue;
                }
            };
            let source = std::fs::read_to_string(&full_path)
                .with_context(|| format!("Failed to read {}", full_path.display()))?;
            let units = extract_units(path, &source, lang);
            new_units.extend(units);

            state.files.insert(
                path.clone(),
                FileInfo {
                    content_hash: hash_file(&full_path)?,
                    mtime: get_mtime(&full_path)?,
                },
            );
            if let Some(ref pb) = pb {
                pb.inc(1);
            }
        }
        if let Some(pb) = pb {
            pb.finish_and_clear();
        }

        if !new_units.is_empty() {
            // Resolve caller/callee links across the new units.
            build_call_graph(&mut new_units);

            let pb = ProgressBar::new(new_units.len() as u64);
            pb.set_style(
                ProgressStyle::default_bar()
                    .template("{spinner:.green} [{bar:40.cyan/blue}] {pos}/{len} {msg}")
                    .unwrap()
                    .progress_chars("█▓░"),
            );
            pb.set_message("Encoding...");

            let config = IndexConfig::default();
            let update_config = UpdateConfig::default();

            // Index in chunks so memory stays bounded; encode in smaller
            // batches within each chunk.
            const CHUNK_SIZE: usize = 500;
            let encode_batch_size = 64;

            for (chunk_idx, unit_chunk) in new_units.chunks(CHUNK_SIZE).enumerate() {
                let texts: Vec<String> = unit_chunk.iter().map(build_embedding_text).collect();
                let text_refs: Vec<&str> = texts.iter().map(|s| s.as_str()).collect();

                let mut chunk_embeddings = Vec::new();
                for batch in text_refs.chunks(encode_batch_size) {
                    let batch_embeddings = self
                        .model
                        .encode_documents(batch, None)
                        .context("Failed to encode documents")?;
                    chunk_embeddings.extend(batch_embeddings);

                    let progress = chunk_idx * CHUNK_SIZE + chunk_embeddings.len();
                    pb.set_position(progress.min(new_units.len()) as u64);
                }

                let (_, doc_ids) = MmapIndex::update_or_create(
                    &chunk_embeddings,
                    index_path_str,
                    &config,
                    &update_config,
                )?;

                let metadata: Vec<serde_json::Value> = unit_chunk
                    .iter()
                    .map(|u| serde_json::to_value(u).unwrap())
                    .collect();
                // The filtering store exists here: `index` only takes the
                // incremental path after checking `filtering::exists`.
                filtering::update(index_path_str, &metadata, &doc_ids)?;
            }

            pb.finish_and_clear();
        }

        state.save(&self.index_dir)?;

        Ok(UpdateStats {
            added: plan.added.len(),
            changed: plan.changed.len(),
            deleted: plan.deleted.len(),
            unchanged: plan.unchanged,
            skipped: 0,
        })
    }
585
586 fn scan_files(&self, languages: Option<&[Language]>) -> Result<(Vec<PathBuf>, usize)> {
587 let walker = WalkBuilder::new(&self.project_root)
588 .hidden(false) .git_ignore(true)
590 .filter_entry(|entry| !should_ignore(entry.path()))
591 .build();
592
593 let mut files = Vec::new();
594 let mut skipped = 0;
595
596 for entry in walker.filter_map(|e| e.ok()) {
597 if !entry.file_type().map(|t| t.is_file()).unwrap_or(false) {
598 continue;
599 }
600
601 let path = entry.path();
602
603 if is_file_too_large(path) {
605 skipped += 1;
606 continue;
607 }
608
609 let lang = match detect_language(path) {
610 Some(l) => l,
611 None => continue,
612 };
613
614 if languages.map(|ls| ls.contains(&lang)).unwrap_or(true) {
615 if let Ok(rel_path) = path.strip_prefix(&self.project_root) {
616 files.push(rel_path.to_path_buf());
617 }
618 }
619 }
620
621 Ok((files, skipped))
622 }
623}
624
625fn is_file_too_large(path: &Path) -> bool {
627 match std::fs::metadata(path) {
628 Ok(meta) => meta.len() > MAX_FILE_SIZE,
629 Err(_) => false, }
631}
632
/// Checks that `relative_path`, resolved against `project_root`, cannot
/// escape the project root.
///
/// Paths containing a `..` component must exist and canonicalize to a
/// location inside the canonical root (a nonexistent path cannot be
/// verified and is rejected). Plain paths that do not exist yet are
/// accepted; existing ones are canonicalized and checked the same way.
///
/// Traversal is detected via `Component::ParentDir` rather than the old
/// substring search for "..", so filenames that merely contain consecutive
/// dots (e.g. `a..b.rs`) are no longer misclassified as traversal attempts
/// (the substring check rejected such files whenever they did not exist).
fn is_within_project_root(project_root: &Path, relative_path: &Path) -> bool {
    let has_parent_component = relative_path
        .components()
        .any(|c| matches!(c, std::path::Component::ParentDir));

    let full_path = project_root.join(relative_path);

    if has_parent_component {
        // Traversal candidate: must resolve inside the canonical root.
        match (full_path.canonicalize(), project_root.canonicalize()) {
            (Ok(canonical), Ok(canonical_root)) => canonical.starts_with(&canonical_root),
            _ => false,
        }
    } else {
        if !full_path.exists() {
            // Not on disk yet: nothing to verify, treat as in-root so new
            // files can be indexed as they appear.
            return true;
        }
        match (full_path.canonicalize(), project_root.canonicalize()) {
            (Ok(canonical), Ok(canonical_root)) => canonical.starts_with(&canonical_root),
            _ => false,
        }
    }
}
665
/// Path components that disqualify a file from indexing: VCS internals,
/// dependency/vendor trees, build outputs, caches, IDE metadata, and
/// generated artifacts. Entries starting with `*` are suffix patterns
/// (e.g. `*.egg-info`); all others are exact component matches.
///
/// The previous list contained `"target"` twice; the duplicate is removed.
const IGNORED_DIRS: &[&str] = &[
    ".git",
    ".svn",
    ".hg",
    "node_modules",
    "vendor",
    "third_party",
    "third-party",
    "external",
    "target",
    "build",
    "dist",
    "out",
    "output",
    "bin",
    "obj",
    "__pycache__",
    ".venv",
    "venv",
    ".env",
    "env",
    ".tox",
    ".nox",
    ".pytest_cache",
    ".mypy_cache",
    ".ruff_cache",
    "*.egg-info",
    ".eggs",
    ".next",
    ".nuxt",
    ".output",
    ".cache",
    ".parcel-cache",
    ".turbo",
    "go.sum",
    ".gradle",
    ".m2",
    ".idea",
    ".vscode",
    ".vs",
    "*.xcworkspace",
    "*.xcodeproj",
    "coverage",
    ".coverage",
    "htmlcov",
    ".nyc_output",
    "tmp",
    "temp",
    "logs",
    ".DS_Store",
];
730
/// Hidden directories that are still indexed (CI/workflow configuration).
const ALLOWED_HIDDEN_DIRS: &[&str] = &[".github", ".gitlab", ".circleci", ".buildkite"];

/// Hidden files that are still indexed (CI configuration).
const ALLOWED_HIDDEN_FILES: &[&str] = &[".gitlab-ci.yml", ".gitlab-ci.yaml", ".travis.yml"];
736
737fn should_ignore(path: &Path) -> bool {
739 for component in path.components() {
741 if let std::path::Component::Normal(name) = component {
742 let name_str = name.to_string_lossy();
743
744 if name_str.starts_with('.')
746 && !ALLOWED_HIDDEN_DIRS.contains(&name_str.as_ref())
747 && !ALLOWED_HIDDEN_FILES.contains(&name_str.as_ref())
748 {
749 return true;
750 }
751
752 for pattern in IGNORED_DIRS {
753 if let Some(suffix) = pattern.strip_prefix('*') {
754 if name_str.ends_with(suffix) {
756 return true;
757 }
758 } else if name_str == *pattern {
759 return true;
760 }
761 }
762 }
763 }
764 false
765}
766
767impl IndexBuilder {
768 fn compute_update_plan(
769 &self,
770 state: &IndexState,
771 languages: Option<&[Language]>,
772 ) -> Result<UpdatePlan> {
773 let (current_files, _skipped) = self.scan_files(languages)?;
774 let current_set: HashSet<_> = current_files.iter().cloned().collect();
775
776 let mut plan = UpdatePlan::default();
777
778 for path in ¤t_files {
779 let full_path = self.project_root.join(path);
780 let hash = hash_file(&full_path)?;
781
782 match state.files.get(path) {
783 Some(info) if info.content_hash == hash => plan.unchanged += 1,
784 Some(_) => plan.changed.push(path.clone()),
785 None => plan.added.push(path.clone()),
786 }
787 }
788
789 for path in state.files.keys() {
790 if !current_set.contains(path) {
791 plan.deleted.push(path.clone());
792 }
793 }
794
795 Ok(plan)
796 }
797
798 fn delete_file_from_index(&self, index_path: &str, file_path: &Path) -> Result<()> {
800 let file_str = file_path.to_string_lossy().to_string();
801 let ids =
802 filtering::where_condition(index_path, "file = ?", &[serde_json::json!(file_str)])
803 .unwrap_or_default();
804
805 if !ids.is_empty() {
806 delete_from_index(&ids, index_path)?;
807 filtering::delete(index_path, &ids)?;
808 }
809 Ok(())
810 }
811
812 fn cleanup_orphaned_entries(&self, index_path: &str) -> Result<usize> {
815 let all_metadata = filtering::get(index_path, None, &[], None).unwrap_or_default();
817
818 let mut indexed_files: HashSet<String> = HashSet::new();
819 for meta in &all_metadata {
820 if let Some(file) = meta.get("file").and_then(|v| v.as_str()) {
821 indexed_files.insert(file.to_string());
822 }
823 }
824
825 let mut deleted_count = 0;
826 for file_str in indexed_files {
827 let full_path = self.project_root.join(&file_str);
828 if !full_path.exists() {
829 let ids = filtering::where_condition(
831 index_path,
832 "file = ?",
833 &[serde_json::json!(file_str)],
834 )
835 .unwrap_or_default();
836
837 if !ids.is_empty() {
838 delete_from_index(&ids, index_path)?;
839 filtering::delete(index_path, &ids)?;
840 deleted_count += ids.len();
841 }
842 }
843 }
844
845 Ok(deleted_count)
846 }
847
    /// Writes `units` to the index without showing a progress bar.
    #[allow(dead_code)]
    fn write_index(&self, units: &[CodeUnit]) -> Result<()> {
        self.write_index_impl(units, false)
    }
852
    /// Writes `units` to the index, showing an encoding progress bar.
    fn write_index_with_progress(&self, units: &[CodeUnit]) -> Result<()> {
        self.write_index_impl(units, true)
    }
856
    /// Encodes `units` chunk by chunk and writes embeddings plus metadata
    /// to the vector index and filtering store, optionally reporting
    /// progress.
    fn write_index_impl(&self, units: &[CodeUnit], show_progress: bool) -> Result<()> {
        let index_path = get_vector_index_path(&self.index_dir);
        let index_path_str = index_path.to_str().unwrap();
        std::fs::create_dir_all(&index_path)?;

        let pb = if show_progress {
            let pb = ProgressBar::new(units.len() as u64);
            pb.set_style(
                ProgressStyle::default_bar()
                    .template("{spinner:.green} [{bar:40.cyan/blue}] {pos}/{len} {msg}")
                    .unwrap()
                    .progress_chars("█▓░"),
            );
            pb.set_message("Encoding...");
            Some(pb)
        } else {
            None
        };

        let config = IndexConfig::default();
        let update_config = UpdateConfig::default();

        // Index in chunks so memory stays bounded; encode in smaller
        // batches within each chunk.
        const CHUNK_SIZE: usize = 500;
        let encode_batch_size = 64;

        for (chunk_idx, unit_chunk) in units.chunks(CHUNK_SIZE).enumerate() {
            let texts: Vec<String> = unit_chunk.iter().map(build_embedding_text).collect();
            let text_refs: Vec<&str> = texts.iter().map(|s| s.as_str()).collect();

            let mut chunk_embeddings = Vec::new();
            for batch in text_refs.chunks(encode_batch_size) {
                let batch_embeddings = self
                    .model
                    .encode_documents(batch, None)
                    .context("Failed to encode documents")?;
                chunk_embeddings.extend(batch_embeddings);

                if let Some(ref pb) = pb {
                    let progress = chunk_idx * CHUNK_SIZE + chunk_embeddings.len();
                    pb.set_position(progress.min(units.len()) as u64);
                }
            }

            let (_, doc_ids) = MmapIndex::update_or_create(
                &chunk_embeddings,
                index_path_str,
                &config,
                &update_config,
            )?;

            let metadata: Vec<serde_json::Value> = unit_chunk
                .iter()
                .map(|u| serde_json::to_value(u).unwrap())
                .collect();

            // Create the filtering store on first write, update afterwards.
            if filtering::exists(index_path_str) {
                filtering::update(index_path_str, &metadata, &doc_ids)?;
            } else {
                filtering::create(index_path_str, &metadata, &doc_ids)?;
            }
        }

        if let Some(pb) = pb {
            pb.finish_and_clear();
        }

        Ok(())
    }
932
933 pub fn status(&self, languages: Option<&[Language]>) -> Result<UpdatePlan> {
935 let state = IndexState::load(&self.index_dir)?;
936 self.compute_update_plan(&state, languages)
937 }
938}
939
/// A single search hit: the matched code unit and its relevance score.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchResult {
    /// Code unit reconstructed from stored metadata.
    pub unit: CodeUnit,
    /// Relevance score reported by the vector index for this hit.
    pub score: f32,
}
946
947fn build_glob_set(patterns: &[String]) -> Option<GlobSet> {
949 if patterns.is_empty() {
950 return None;
951 }
952
953 let mut builder = GlobSetBuilder::new();
954 for pattern in patterns {
955 let normalized = if !pattern.starts_with("**/") && !pattern.starts_with('/') {
958 format!("**/{}", pattern)
959 } else {
960 pattern.clone()
961 };
962
963 if let Ok(glob) = Glob::new(&normalized) {
964 builder.add(glob);
965 }
966 }
967
968 builder.build().ok()
969}
970
971fn matches_glob_pattern(path: &Path, patterns: &[String]) -> bool {
973 if patterns.is_empty() {
974 return true;
975 }
976
977 let Some(glob_set) = build_glob_set(patterns) else {
978 return false;
979 };
980
981 glob_set.is_match(path)
982}
983
/// Read-side handle over a built index: query encoder plus mmap'd index.
pub struct Searcher {
    /// ColBERT encoder for queries.
    model: Colbert,
    /// Memory-mapped vector index.
    index: MmapIndex,
    /// Filesystem path of the index, used for metadata/filtering lookups.
    index_path: String,
}
989
990impl Searcher {
991 pub fn load(project_root: &Path, model_path: &Path) -> Result<Self> {
992 let index_dir = get_index_dir_for_project(project_root)?;
993 let index_path = get_vector_index_path(&index_dir);
994 let index_path_str = index_path.to_str().unwrap().to_string();
995
996 let model = Colbert::builder(model_path)
998 .with_quantized(true)
999 .build()
1000 .context("Failed to load ColBERT model")?;
1001
1002 let index = MmapIndex::load(&index_path_str).context("Failed to load index")?;
1004
1005 Ok(Self {
1006 model,
1007 index,
1008 index_path: index_path_str,
1009 })
1010 }
1011
1012 pub fn load_from_index_dir(index_dir: &Path, model_path: &Path) -> Result<Self> {
1014 let index_path = get_vector_index_path(index_dir);
1015 let index_path_str = index_path.to_str().unwrap().to_string();
1016
1017 let model = Colbert::builder(model_path)
1018 .with_quantized(true)
1019 .build()
1020 .context("Failed to load ColBERT model")?;
1021
1022 let index = MmapIndex::load(&index_path_str).context("Failed to load index")?;
1023
1024 Ok(Self {
1025 model,
1026 index,
1027 index_path: index_path_str,
1028 })
1029 }
1030
1031 pub fn filter_by_path_prefix(&self, prefix: &Path) -> Result<Vec<i64>> {
1034 let prefix_str = prefix.to_string_lossy();
1035 let like_pattern = format!("{}%", prefix_str);
1037 let subset = filtering::where_condition(
1038 &self.index_path,
1039 "file LIKE ?",
1040 &[serde_json::json!(like_pattern)],
1041 )
1042 .unwrap_or_default();
1043
1044 Ok(subset)
1045 }
1046
1047 pub fn filter_by_file_patterns(&self, patterns: &[String]) -> Result<Vec<i64>> {
1049 if patterns.is_empty() {
1050 return Ok(vec![]);
1051 }
1052
1053 let Some(glob_set) = build_glob_set(patterns) else {
1055 return Ok(vec![]);
1056 };
1057
1058 let all_metadata = filtering::get(&self.index_path, None, &[], None).unwrap_or_default();
1060
1061 let matching_ids: Vec<i64> = all_metadata
1063 .into_iter()
1064 .filter_map(|row| {
1065 let doc_id = row.get("_id")?.as_i64()?;
1066 let file = row.get("file")?.as_str()?;
1067 let path = Path::new(file);
1068 if glob_set.is_match(path) {
1069 Some(doc_id)
1070 } else {
1071 None
1072 }
1073 })
1074 .collect();
1075
1076 Ok(matching_ids)
1077 }
1078
1079 pub fn filter_by_files(&self, files: &[String]) -> Result<Vec<i64>> {
1081 if files.is_empty() {
1082 return Ok(vec![]);
1083 }
1084
1085 let mut conditions = Vec::new();
1087 let mut params = Vec::new();
1088
1089 for file in files {
1090 conditions.push("file = ?");
1091 params.push(serde_json::json!(file));
1092 }
1093
1094 let condition = conditions.join(" OR ");
1095 let subset =
1096 filtering::where_condition(&self.index_path, &condition, ¶ms).unwrap_or_default();
1097
1098 Ok(subset)
1099 }
1100
1101 pub fn search(
1102 &self,
1103 query: &str,
1104 top_k: usize,
1105 subset: Option<&[i64]>,
1106 ) -> Result<Vec<SearchResult>> {
1107 let query_embeddings = self
1109 .model
1110 .encode_queries(&[query])
1111 .context("Failed to encode query")?;
1112 let query_emb = &query_embeddings[0];
1113
1114 let params = SearchParameters {
1116 top_k,
1117 ..Default::default()
1118 };
1119 let results = self
1120 .index
1121 .search(query_emb, ¶ms, subset)
1122 .context("Search failed")?;
1123
1124 let doc_ids: Vec<i64> = results.passage_ids.to_vec();
1126 let metadata = filtering::get(&self.index_path, None, &[], Some(&doc_ids))
1127 .context("Failed to retrieve metadata")?;
1128
1129 let search_results: Vec<SearchResult> = metadata
1131 .into_iter()
1132 .zip(results.scores.iter())
1133 .filter_map(|(mut meta, &score)| {
1134 if let serde_json::Value::Object(ref mut obj) = meta {
1135 for key in ["has_loops", "has_branches", "has_error_handling"] {
1137 if let Some(v) = obj.get(key) {
1138 if let Some(n) = v.as_i64() {
1139 obj.insert(key.to_string(), serde_json::Value::Bool(n != 0));
1140 }
1141 }
1142 }
1143 for key in ["calls", "called_by", "parameters", "variables", "imports"] {
1145 if let Some(serde_json::Value::String(s)) = obj.get(key) {
1146 if let Ok(arr) = serde_json::from_str::<serde_json::Value>(s) {
1147 obj.insert(key.to_string(), arr);
1148 }
1149 }
1150 }
1151 }
1152 serde_json::from_value::<CodeUnit>(meta)
1153 .ok()
1154 .map(|unit| SearchResult { unit, score })
1155 })
1156 .collect();
1157
1158 Ok(search_results)
1159 }
1160
    /// Number of documents (code units) currently in the index.
    pub fn num_documents(&self) -> usize {
        self.index.num_documents()
    }
1164}
1165
/// Returns true when an index already exists for `project_root`.
pub fn index_exists(project_root: &Path) -> bool {
    paths::index_exists(project_root)
}
1170
#[cfg(test)]
mod tests {
    use super::*;

    // Bare extension patterns are auto-prefixed with `**/` by
    // `build_glob_set`, so they match at any directory depth.
    #[test]
    fn test_glob_simple_extension() {
        let patterns = vec!["*.rs".to_string()];
        assert!(matches_glob_pattern(Path::new("src/main.rs"), &patterns));
        assert!(matches_glob_pattern(
            Path::new("nested/deep/file.rs"),
            &patterns
        ));
        assert!(!matches_glob_pattern(Path::new("src/main.py"), &patterns));
    }

    #[test]
    fn test_glob_recursive_double_star() {
        let patterns = vec!["**/*.rs".to_string()];
        assert!(matches_glob_pattern(Path::new("src/main.rs"), &patterns));
        assert!(matches_glob_pattern(Path::new("a/b/c/d.rs"), &patterns));
        assert!(!matches_glob_pattern(Path::new("main.py"), &patterns));
    }

    // `src/**/*.rs` is normalized to `**/src/**/*.rs`, so a nested `src`
    // directory (e.g. `project/src`) also matches.
    #[test]
    fn test_glob_directory_pattern() {
        let patterns = vec!["src/**/*.rs".to_string()];
        assert!(matches_glob_pattern(Path::new("src/main.rs"), &patterns));
        assert!(matches_glob_pattern(
            Path::new("src/index/mod.rs"),
            &patterns
        ));
        assert!(matches_glob_pattern(
            Path::new("project/src/main.rs"),
            &patterns
        ));
        assert!(!matches_glob_pattern(Path::new("lib/main.rs"), &patterns));
    }

    #[test]
    fn test_glob_github_workflows() {
        let patterns = vec!["**/.github/**/*".to_string()];
        assert!(matches_glob_pattern(
            Path::new(".github/workflows/ci.yml"),
            &patterns
        ));
        assert!(matches_glob_pattern(
            Path::new("project/.github/actions/setup.yml"),
            &patterns
        ));
        assert!(!matches_glob_pattern(Path::new("src/main.rs"), &patterns));
    }

    #[test]
    fn test_glob_multiple_patterns() {
        let patterns = vec!["*.rs".to_string(), "*.py".to_string()];
        assert!(matches_glob_pattern(Path::new("main.rs"), &patterns));
        assert!(matches_glob_pattern(Path::new("main.py"), &patterns));
        assert!(!matches_glob_pattern(Path::new("main.js"), &patterns));
    }

    #[test]
    fn test_glob_test_files() {
        let patterns = vec!["*_test.go".to_string()];
        assert!(matches_glob_pattern(
            Path::new("pkg/main_test.go"),
            &patterns
        ));
        assert!(!matches_glob_pattern(Path::new("pkg/main.go"), &patterns));
    }

    // An empty pattern list matches every path.
    #[test]
    fn test_glob_empty_patterns() {
        let patterns: Vec<String> = vec![];
        assert!(matches_glob_pattern(Path::new("any/file.rs"), &patterns));
    }

    // Simple paths that do not exist yet are accepted (cannot be verified).
    #[test]
    fn test_is_within_project_root_simple_path() {
        let temp_dir = std::env::temp_dir().join("plaid_test_project");
        let _ = std::fs::create_dir_all(&temp_dir);

        assert!(is_within_project_root(&temp_dir, Path::new("src/main.rs")));
        assert!(is_within_project_root(&temp_dir, Path::new("file.txt")));
    }

    // Leading `..` components must never escape the project root.
    #[test]
    fn test_is_within_project_root_path_traversal() {
        let temp_dir = std::env::temp_dir().join("plaid_test_project");
        let _ = std::fs::create_dir_all(&temp_dir);

        assert!(!is_within_project_root(
            &temp_dir,
            Path::new("../../../etc/passwd")
        ));
        assert!(!is_within_project_root(&temp_dir, Path::new("../sibling")));
        assert!(!is_within_project_root(
            &temp_dir,
            Path::new("foo/../../..")
        ));
    }

    // Traversal hidden behind normal-looking leading components.
    #[test]
    fn test_is_within_project_root_hidden_traversal() {
        let temp_dir = std::env::temp_dir().join("plaid_test_project");
        let _ = std::fs::create_dir_all(&temp_dir);

        assert!(!is_within_project_root(
            &temp_dir,
            Path::new("src/../../../etc/passwd")
        ));
        assert!(!is_within_project_root(
            &temp_dir,
            Path::new("./foo/../../../bar")
        ));
    }

    // A `..` that resolves to a location still inside the root is allowed
    // once the target actually exists on disk.
    #[test]
    fn test_is_within_project_root_valid_dotdot_in_middle() {
        let temp_dir = std::env::temp_dir().join("plaid_test_project_dotdot");
        let sub_dir = temp_dir.join("src").join("subdir");
        let _ = std::fs::create_dir_all(&sub_dir);

        let test_file = temp_dir.join("src").join("main.rs");
        let _ = std::fs::write(&test_file, "fn main() {}");

        assert!(is_within_project_root(
            &temp_dir,
            Path::new("src/subdir/../main.rs")
        ));

        let _ = std::fs::remove_dir_all(&temp_dir);
    }
}