next_plaid_cli/index/
mod.rs

1pub mod paths;
2pub mod state;
3
4use std::collections::HashSet;
5use std::path::{Path, PathBuf};
6
7use anyhow::{Context, Result};
8use globset::{Glob, GlobSet, GlobSetBuilder};
9use ignore::gitignore::GitignoreBuilder;
10use ignore::WalkBuilder;
11use indicatif::{ProgressBar, ProgressStyle};
12use next_plaid::{
13    delete_from_index, filtering, IndexConfig, MmapIndex, SearchParameters, UpdateConfig,
14};
15use next_plaid_onnx::Colbert;
16use serde::{Deserialize, Serialize};
17
18use crate::embed::build_embedding_text;
19use crate::parser::{build_call_graph, detect_language, extract_units, CodeUnit, Language};
20
21use paths::{get_index_dir_for_project, get_vector_index_path, ProjectMetadata};
22use state::{get_mtime, hash_file, FileInfo, IndexState};
23
/// Maximum file size to index (512 KB)
/// Files larger than this are skipped to avoid:
/// - Slow parsing of generated/minified code
/// - Memory issues with very large files
/// - Indexing non-source files (binaries, data files)
///
/// Files rejected by this limit are counted in [`UpdateStats::skipped`]
/// (see `scan_files` / `full_rebuild`).
const MAX_FILE_SIZE: u64 = 512 * 1024;
30
/// Summary of what a single indexing run did.
#[derive(Debug)]
pub struct UpdateStats {
    /// Files newly added to the index.
    pub added: usize,
    /// Files re-indexed because their content hash changed.
    pub changed: usize,
    /// Files removed from the index (no longer on disk).
    pub deleted: usize,
    /// Files left untouched (content hash matched the saved state).
    pub unchanged: usize,
    /// Files skipped during scanning (e.g., over the size limit).
    pub skipped: usize,
}
39
/// Work plan for an incremental update, computed by diffing the files
/// currently on disk against the saved [`IndexState`].
#[derive(Debug, Default)]
pub struct UpdatePlan {
    /// Files on disk that are not in the index yet.
    pub added: Vec<PathBuf>,
    /// Files whose content hash differs from the recorded one.
    pub changed: Vec<PathBuf>,
    /// Files recorded in the state but no longer present on disk.
    pub deleted: Vec<PathBuf>,
    /// Count of files whose content is unchanged.
    pub unchanged: usize,
}
47
/// Builds and maintains the vector index for one project.
pub struct IndexBuilder {
    // ColBERT encoder used to embed code units.
    model: Colbert,
    // Root of the project being indexed; stored file paths are relative to it.
    project_root: PathBuf,
    // Directory holding the index state, metadata, and vector data.
    index_dir: PathBuf,
}
53
54impl IndexBuilder {
55    pub fn new(project_root: &Path, model_path: &Path) -> Result<Self> {
56        let model = Colbert::builder(model_path)
57            .with_quantized(true)
58            .build()
59            .context("Failed to load ColBERT model")?;
60
61        let index_dir = get_index_dir_for_project(project_root)?;
62
63        Ok(Self {
64            model,
65            project_root: project_root.to_path_buf(),
66            index_dir,
67        })
68    }
69
70    /// Get the path to the index directory
71    pub fn index_dir(&self) -> &Path {
72        &self.index_dir
73    }
74
75    /// Single entry point for indexing.
76    /// - Creates index if none exists
77    /// - Updates incrementally if files changed
78    /// - Full rebuild if `force = true`
79    pub fn index(&self, languages: Option<&[Language]>, force: bool) -> Result<UpdateStats> {
80        let state = IndexState::load(&self.index_dir)?;
81        let index_path = get_vector_index_path(&self.index_dir);
82        let index_exists = index_path.join("metadata.json").exists();
83        let filtering_exists = filtering::exists(index_path.to_str().unwrap());
84
85        // Need full rebuild if forced, index doesn't exist, or filtering DB is missing
86        if force || !index_exists || !filtering_exists {
87            return self.full_rebuild(languages);
88        }
89
90        // State is out of sync with index (e.g., state.json was deleted but index exists)
91        // Do a full rebuild to avoid UNIQUE constraint errors when re-adding existing docs
92        if state.files.is_empty() {
93            return self.full_rebuild(languages);
94        }
95
96        self.incremental_update(&state, languages)
97    }
98
    /// Index only specific files (for filtered search).
    /// Only indexes files that are not already in the index or have changed.
    /// Returns the number of files that were indexed.
    ///
    /// Paths in `files` are interpreted relative to the project root. Files
    /// outside the root, missing on disk, hit by `should_ignore`, or matched
    /// by the project's `.gitignore` are silently skipped.
    pub fn index_specific_files(&self, files: &[PathBuf]) -> Result<UpdateStats> {
        if files.is_empty() {
            return Ok(UpdateStats {
                added: 0,
                changed: 0,
                deleted: 0,
                unchanged: 0,
                skipped: 0,
            });
        }

        let state = IndexState::load(&self.index_dir)?;
        let index_path = get_vector_index_path(&self.index_dir);
        let index_path_str = index_path.to_str().unwrap();

        // Build gitignore matcher to filter out gitignored files
        // This ensures index_specific_files respects .gitignore like scan_files does
        let gitignore = {
            let mut builder = GitignoreBuilder::new(&self.project_root);
            let gitignore_path = self.project_root.join(".gitignore");
            if gitignore_path.exists() {
                // Best-effort: a malformed .gitignore should not abort indexing.
                let _ = builder.add(&gitignore_path);
            }
            builder.build().ok()
        };

        // Determine which files need indexing (new or changed)
        let mut files_added = Vec::new();
        let mut files_changed = Vec::new();
        let mut unchanged = 0;

        for path in files {
            // Security: skip files outside the project root (path traversal protection)
            if !is_within_project_root(&self.project_root, path) {
                continue;
            }

            let full_path = self.project_root.join(path);
            if !full_path.exists() {
                continue;
            }

            // Skip files in ignored directories (same filtering as scan_files)
            if should_ignore(&full_path) {
                continue;
            }

            // Skip gitignored files (same filtering as scan_files)
            // Use matched_path_or_any_parents to check if the file or any parent
            // directory is ignored (handles patterns like "/site" matching "site/...")
            if let Some(ref gi) = gitignore {
                if gi
                    .matched_path_or_any_parents(path, full_path.is_dir())
                    .is_ignore()
                {
                    continue;
                }
            }

            // Classify by content hash against the saved state:
            // same hash -> unchanged, different -> changed, absent -> new.
            let hash = hash_file(&full_path)?;
            match state.files.get(path) {
                Some(info) if info.content_hash == hash => {
                    unchanged += 1;
                }
                Some(_) => {
                    // File exists in index but content changed
                    files_changed.push(path.clone());
                }
                None => {
                    // New file not in index
                    files_added.push(path.clone());
                }
            }
        }

        let files_to_index: Vec<PathBuf> = files_added
            .iter()
            .chain(files_changed.iter())
            .cloned()
            .collect();

        if files_to_index.is_empty() {
            return Ok(UpdateStats {
                added: 0,
                changed: 0,
                deleted: 0,
                unchanged,
                skipped: 0,
            });
        }

        // Delete old entries for changed files before re-indexing
        // This prevents duplicates when a file's content has changed
        if filtering::exists(index_path_str) {
            for file_path in &files_changed {
                self.delete_file_from_index(index_path_str, file_path)?;
            }
        }

        // Load or create state
        let mut new_state = state.clone();
        let mut new_units: Vec<CodeUnit> = Vec::new();

        // Progress bar for parsing
        let pb = ProgressBar::new(files_to_index.len() as u64);
        pb.set_style(
            ProgressStyle::default_bar()
                .template("{spinner:.green} [{bar:40.cyan/blue}] {pos}/{len} {msg}")
                .unwrap()
                .progress_chars("█▓░"),
        );
        pb.set_message("Parsing files...");

        for path in &files_to_index {
            let full_path = self.project_root.join(path);
            // Files whose language can't be detected are silently skipped.
            let lang = match detect_language(&full_path) {
                Some(l) => l,
                None => {
                    pb.inc(1);
                    continue;
                }
            };
            let source = std::fs::read_to_string(&full_path)
                .with_context(|| format!("Failed to read {}", full_path.display()))?;
            let units = extract_units(path, &source, lang);
            new_units.extend(units);

            // Record the file's current hash/mtime; note the hash is
            // recomputed here rather than reused from the planning loop.
            new_state.files.insert(
                path.clone(),
                FileInfo {
                    content_hash: hash_file(&full_path)?,
                    mtime: get_mtime(&full_path)?,
                },
            );
            pb.inc(1);
        }
        pb.finish_and_clear();

        if new_units.is_empty() {
            return Ok(UpdateStats {
                added: 0,
                changed: 0,
                deleted: 0,
                unchanged,
                skipped: 0,
            });
        }

        // Build call graph
        build_call_graph(&mut new_units);

        let pb = ProgressBar::new(new_units.len() as u64);
        pb.set_style(
            ProgressStyle::default_bar()
                .template("{spinner:.green} [{bar:40.cyan/blue}] {pos}/{len} {msg}")
                .unwrap()
                .progress_chars("█▓░"),
        );
        pb.set_message("Encoding...");

        // Create or update index
        std::fs::create_dir_all(&index_path)?;
        let config = IndexConfig::default();
        let update_config = UpdateConfig::default();

        // Process in chunks of 500 documents to avoid RAM issues
        const CHUNK_SIZE: usize = 500;
        let encode_batch_size = 64;

        for (chunk_idx, unit_chunk) in new_units.chunks(CHUNK_SIZE).enumerate() {
            let texts: Vec<String> = unit_chunk.iter().map(build_embedding_text).collect();
            let text_refs: Vec<&str> = texts.iter().map(|s| s.as_str()).collect();

            // Encode the chunk in smaller batches to bound peak memory.
            let mut chunk_embeddings = Vec::new();
            for batch in text_refs.chunks(encode_batch_size) {
                let batch_embeddings = self
                    .model
                    .encode_documents(batch, None)
                    .context("Failed to encode documents")?;
                chunk_embeddings.extend(batch_embeddings);

                let progress = chunk_idx * CHUNK_SIZE + chunk_embeddings.len();
                pb.set_position(progress.min(new_units.len()) as u64);
            }

            // Write this chunk to the index
            let (_, doc_ids) = MmapIndex::update_or_create(
                &chunk_embeddings,
                index_path_str,
                &config,
                &update_config,
            )?;

            // Store metadata for this chunk
            let metadata: Vec<serde_json::Value> = unit_chunk
                .iter()
                .map(|u| serde_json::to_value(u).unwrap())
                .collect();

            if filtering::exists(index_path_str) {
                filtering::update(index_path_str, &metadata, &doc_ids)?;
            } else {
                filtering::create(index_path_str, &metadata, &doc_ids)?;
            }
        }

        pb.finish_and_clear();

        // Persist state only after all chunks were written successfully.
        new_state.save(&self.index_dir)?;

        Ok(UpdateStats {
            added: files_added.len(),
            changed: files_changed.len(),
            deleted: 0,
            unchanged,
            skipped: 0,
        })
    }
320
321    /// Scan files matching glob patterns (e.g., "*.py", "*.rs")
322    /// Returns relative paths from project root
323    pub fn scan_files_matching_patterns(&self, patterns: &[String]) -> Result<Vec<PathBuf>> {
324        let (all_files, _skipped) = self.scan_files(None)?;
325
326        if patterns.is_empty() {
327            return Ok(all_files);
328        }
329
330        let filtered: Vec<PathBuf> = all_files
331            .into_iter()
332            .filter(|path| matches_glob_pattern(path, patterns))
333            .collect();
334
335        Ok(filtered)
336    }
337
    /// Full rebuild (used when force=true or no index exists)
    ///
    /// Deletes any existing vector index, re-scans the project, parses every
    /// file, rebuilds the call graph, and writes a fresh index plus state and
    /// project metadata.
    fn full_rebuild(&self, languages: Option<&[Language]>) -> Result<UpdateStats> {
        // Clear existing index data to avoid duplicates
        let index_path = get_vector_index_path(&self.index_dir);
        if index_path.exists() {
            std::fs::remove_dir_all(&index_path)?;
        }

        let (files, skipped) = self.scan_files(languages)?;
        let mut state = IndexState::default();
        let mut all_units: Vec<CodeUnit> = Vec::new();

        // Progress bar for parsing files
        let pb = ProgressBar::new(files.len() as u64);
        pb.set_style(
            ProgressStyle::default_bar()
                .template("{spinner:.green} [{bar:40.cyan/blue}] {pos}/{len} {msg}")
                .unwrap()
                .progress_chars("█▓░"),
        );
        pb.set_message("Parsing files...");

        // Extract units from all files
        for path in &files {
            let full_path = self.project_root.join(path);
            // Files with an undetectable language are skipped silently.
            let lang = match detect_language(&full_path) {
                Some(l) => l,
                None => {
                    pb.inc(1);
                    continue;
                }
            };
            let source = std::fs::read_to_string(&full_path)
                .with_context(|| format!("Failed to read {}", full_path.display()))?;
            let units = extract_units(path, &source, lang);
            all_units.extend(units);

            state.files.insert(
                path.clone(),
                FileInfo {
                    content_hash: hash_file(&full_path)?,
                    mtime: get_mtime(&full_path)?,
                },
            );
            pb.inc(1);
        }
        pb.finish_and_clear();

        // Build call graph to populate called_by
        build_call_graph(&mut all_units);

        if !all_units.is_empty() {
            self.write_index_with_progress(&all_units)?;
        }

        // Save state and project metadata
        state.save(&self.index_dir)?;
        ProjectMetadata::new(&self.project_root).save(&self.index_dir)?;

        Ok(UpdateStats {
            added: files.len(),
            changed: 0,
            deleted: 0,
            unchanged: 0,
            skipped,
        })
    }
405
    /// Incremental update (only re-index changed files)
    ///
    /// Pipeline: compute an [`UpdatePlan`] by diffing disk against
    /// `old_state`, purge orphaned index entries, delete chunks for
    /// changed/deleted files, then parse, encode, and append the new units.
    fn incremental_update(
        &self,
        old_state: &IndexState,
        languages: Option<&[Language]>,
    ) -> Result<UpdateStats> {
        let plan = self.compute_update_plan(old_state, languages)?;
        let index_path = get_vector_index_path(&self.index_dir);
        let index_path_str = index_path.to_str().unwrap();

        // 0. Clean up orphaned entries (files in index but not on disk)
        // This handles directory deletion/rename and any inconsistencies
        let orphaned_deleted = self.cleanup_orphaned_entries(index_path_str)?;

        // Nothing to do
        if plan.added.is_empty()
            && plan.changed.is_empty()
            && plan.deleted.is_empty()
            && orphaned_deleted == 0
        {
            return Ok(UpdateStats {
                added: 0,
                changed: 0,
                deleted: 0,
                unchanged: plan.unchanged,
                skipped: 0,
            });
        }

        let mut state = old_state.clone();

        // 1. Delete chunks for changed/deleted files by querying file path
        let files_to_delete: Vec<&PathBuf> =
            plan.changed.iter().chain(plan.deleted.iter()).collect();

        for file_path in &files_to_delete {
            self.delete_file_from_index(index_path_str, file_path)?;
        }

        // Remove deleted files from state
        for path in &plan.deleted {
            state.files.remove(path);
        }

        // Also clean state of any files that no longer exist on disk
        // (handles directory deletion/rename and any state inconsistencies)
        let stale_paths: Vec<PathBuf> = state
            .files
            .keys()
            .filter(|p| !self.project_root.join(p).exists())
            .cloned()
            .collect();
        for path in stale_paths {
            state.files.remove(&path);
        }

        // 2. Index new/changed files
        let files_to_index: Vec<PathBuf> = plan
            .added
            .iter()
            .chain(plan.changed.iter())
            .cloned()
            .collect();

        let mut new_units: Vec<CodeUnit> = Vec::new();

        // Progress bar for parsing (only if there are files to index)
        let pb = if !files_to_index.is_empty() {
            let pb = ProgressBar::new(files_to_index.len() as u64);
            pb.set_style(
                ProgressStyle::default_bar()
                    .template("{spinner:.green} [{bar:40.cyan/blue}] {pos}/{len} {msg}")
                    .unwrap()
                    .progress_chars("█▓░"),
            );
            pb.set_message("Parsing files...");
            Some(pb)
        } else {
            None
        };

        for path in &files_to_index {
            let full_path = self.project_root.join(path);
            // Files with an undetectable language are skipped silently.
            let lang = match detect_language(&full_path) {
                Some(l) => l,
                None => {
                    if let Some(ref pb) = pb {
                        pb.inc(1);
                    }
                    continue;
                }
            };
            let source = std::fs::read_to_string(&full_path)
                .with_context(|| format!("Failed to read {}", full_path.display()))?;
            let units = extract_units(path, &source, lang);
            new_units.extend(units);

            state.files.insert(
                path.clone(),
                FileInfo {
                    content_hash: hash_file(&full_path)?,
                    mtime: get_mtime(&full_path)?,
                },
            );
            if let Some(ref pb) = pb {
                pb.inc(1);
            }
        }
        if let Some(pb) = pb {
            pb.finish_and_clear();
        }

        // 3. Add new units to index
        if !new_units.is_empty() {
            // Build call graph for new units
            build_call_graph(&mut new_units);

            // Progress bar for encoding
            let pb = ProgressBar::new(new_units.len() as u64);
            pb.set_style(
                ProgressStyle::default_bar()
                    .template("{spinner:.green} [{bar:40.cyan/blue}] {pos}/{len} {msg}")
                    .unwrap()
                    .progress_chars("█▓░"),
            );
            pb.set_message("Encoding...");

            let config = IndexConfig::default();
            let update_config = UpdateConfig::default();

            // Process in chunks of 500 documents to avoid RAM issues
            const CHUNK_SIZE: usize = 500;
            let encode_batch_size = 64;

            for (chunk_idx, unit_chunk) in new_units.chunks(CHUNK_SIZE).enumerate() {
                let texts: Vec<String> = unit_chunk.iter().map(build_embedding_text).collect();
                let text_refs: Vec<&str> = texts.iter().map(|s| s.as_str()).collect();

                // Encode the chunk in smaller batches to bound peak memory.
                let mut chunk_embeddings = Vec::new();
                for batch in text_refs.chunks(encode_batch_size) {
                    let batch_embeddings = self
                        .model
                        .encode_documents(batch, None)
                        .context("Failed to encode documents")?;
                    chunk_embeddings.extend(batch_embeddings);

                    let progress = chunk_idx * CHUNK_SIZE + chunk_embeddings.len();
                    pb.set_position(progress.min(new_units.len()) as u64);
                }

                // Write this chunk to the index
                let (_, doc_ids) = MmapIndex::update_or_create(
                    &chunk_embeddings,
                    index_path_str,
                    &config,
                    &update_config,
                )?;

                // Store metadata for this chunk
                let metadata: Vec<serde_json::Value> = unit_chunk
                    .iter()
                    .map(|u| serde_json::to_value(u).unwrap())
                    .collect();
                filtering::update(index_path_str, &metadata, &doc_ids)?;
            }

            pb.finish_and_clear();
        }

        // Persist state only after all index writes succeeded.
        state.save(&self.index_dir)?;

        Ok(UpdateStats {
            added: plan.added.len(),
            changed: plan.changed.len(),
            deleted: plan.deleted.len(),
            unchanged: plan.unchanged,
            skipped: 0,
        })
    }
585
    /// Walk the project tree and collect indexable files.
    ///
    /// Returns project-relative paths of files with a detected language
    /// (optionally restricted to `languages`), plus the count of files
    /// skipped for exceeding `MAX_FILE_SIZE`. Respects .gitignore and
    /// the `should_ignore` deny-list.
    fn scan_files(&self, languages: Option<&[Language]>) -> Result<(Vec<PathBuf>, usize)> {
        let walker = WalkBuilder::new(&self.project_root)
            .hidden(false) // Handle hidden files manually in should_ignore (with .github exception)
            .git_ignore(true)
            .filter_entry(|entry| !should_ignore(entry.path()))
            .build();

        let mut files = Vec::new();
        let mut skipped = 0;

        // Walk errors (permissions, broken symlinks) are silently dropped.
        for entry in walker.filter_map(|e| e.ok()) {
            if !entry.file_type().map(|t| t.is_file()).unwrap_or(false) {
                continue;
            }

            let path = entry.path();

            // Skip files that are too large
            if is_file_too_large(path) {
                skipped += 1;
                continue;
            }

            let lang = match detect_language(path) {
                Some(l) => l,
                None => continue,
            };

            // No language filter means "accept every detected language".
            if languages.map(|ls| ls.contains(&lang)).unwrap_or(true) {
                if let Ok(rel_path) = path.strip_prefix(&self.project_root) {
                    files.push(rel_path.to_path_buf());
                }
            }
        }

        Ok((files, skipped))
    }
623}
624
625/// Check if a file exceeds the maximum size limit
626fn is_file_too_large(path: &Path) -> bool {
627    match std::fs::metadata(path) {
628        Ok(meta) => meta.len() > MAX_FILE_SIZE,
629        Err(_) => false, // If we can't read metadata, let it fail later
630    }
631}
632
/// Check if a path is within the project root directory.
/// This prevents path traversal attacks (e.g., ../../../etc/passwd).
/// The check is done by canonicalizing both paths and verifying
/// the resolved path starts with the project root.
fn is_within_project_root(project_root: &Path, relative_path: &Path) -> bool {
    let full_path = project_root.join(relative_path);
    let looks_like_traversal = relative_path.to_string_lossy().contains("..");

    // Fast path: no ".." and the file does not exist — accept it here,
    // since non-existent paths are filtered out by the callers anyway.
    if !looks_like_traversal && !full_path.exists() {
        return true;
    }

    // Otherwise resolve symlinks and ".." on both sides and require
    // containment. Any canonicalization failure rejects the path.
    match (full_path.canonicalize(), project_root.canonicalize()) {
        (Ok(resolved), Ok(resolved_root)) => resolved.starts_with(&resolved_root),
        _ => false,
    }
}
665
/// Directories and patterns to always ignore (even without .gitignore)
///
/// Entries are matched against every path component by `should_ignore`:
/// plain names must match a component exactly; entries starting with `*`
/// match as suffixes (e.g. "*.egg-info").
const IGNORED_DIRS: &[&str] = &[
    // Version control
    ".git",
    ".svn",
    ".hg",
    // Dependencies
    "node_modules",
    "vendor",
    "third_party",
    "third-party",
    "external",
    // Build outputs (also covers Rust's `target` directory)
    "target",
    "build",
    "dist",
    "out",
    "output",
    "bin",
    "obj",
    // Python
    "__pycache__",
    ".venv",
    "venv",
    ".env",
    "env",
    ".tox",
    ".nox",
    ".pytest_cache",
    ".mypy_cache",
    ".ruff_cache",
    "*.egg-info",
    ".eggs",
    // JavaScript/TypeScript
    ".next",
    ".nuxt",
    ".output",
    ".cache",
    ".parcel-cache",
    ".turbo",
    // Go
    "go.sum",
    // Java
    ".gradle",
    ".m2",
    // IDE/Editor
    ".idea",
    ".vscode",
    ".vs",
    "*.xcworkspace",
    "*.xcodeproj",
    // Test/Coverage
    "coverage",
    ".coverage",
    "htmlcov",
    ".nyc_output",
    // Misc
    "tmp",
    "temp",
    "logs",
    ".DS_Store",
];
730
/// Hidden directories that should be indexed (exceptions to hidden file filtering)
/// Consulted by `should_ignore` when a path component starts with '.'.
const ALLOWED_HIDDEN_DIRS: &[&str] = &[".github", ".gitlab", ".circleci", ".buildkite"];

/// Hidden files that should be indexed (exceptions to hidden file filtering)
/// Mostly CI configuration files living at the repository root.
const ALLOWED_HIDDEN_FILES: &[&str] = &[".gitlab-ci.yml", ".gitlab-ci.yaml", ".travis.yml"];
736
/// Check if a path should be ignored
///
/// Every component of `path` is tested against the hidden-file rule
/// (leading '.', minus the ALLOWED_HIDDEN_* exceptions) and IGNORED_DIRS
/// (a leading '*' entry matches as a suffix).
///
/// NOTE(review): callers pass absolute paths (see scan_files and
/// index_specific_files), so components *above* the project root are also
/// checked — a project rooted under e.g. a directory named "build" or a
/// hidden directory would be ignored entirely. Confirm whether paths should
/// be stripped to project-relative before matching.
fn should_ignore(path: &Path) -> bool {
    // Check each component of the path
    for component in path.components() {
        if let std::path::Component::Normal(name) = component {
            let name_str = name.to_string_lossy();

            // Skip hidden files/directories (starting with .) except allowed ones
            if name_str.starts_with('.')
                && !ALLOWED_HIDDEN_DIRS.contains(&name_str.as_ref())
                && !ALLOWED_HIDDEN_FILES.contains(&name_str.as_ref())
            {
                return true;
            }

            for pattern in IGNORED_DIRS {
                if let Some(suffix) = pattern.strip_prefix('*') {
                    // Suffix match (e.g., "*.egg-info")
                    if name_str.ends_with(suffix) {
                        return true;
                    }
                } else if name_str == *pattern {
                    return true;
                }
            }
        }
    }
    false
}
766
767impl IndexBuilder {
768    fn compute_update_plan(
769        &self,
770        state: &IndexState,
771        languages: Option<&[Language]>,
772    ) -> Result<UpdatePlan> {
773        let (current_files, _skipped) = self.scan_files(languages)?;
774        let current_set: HashSet<_> = current_files.iter().cloned().collect();
775
776        let mut plan = UpdatePlan::default();
777
778        for path in &current_files {
779            let full_path = self.project_root.join(path);
780            let hash = hash_file(&full_path)?;
781
782            match state.files.get(path) {
783                Some(info) if info.content_hash == hash => plan.unchanged += 1,
784                Some(_) => plan.changed.push(path.clone()),
785                None => plan.added.push(path.clone()),
786            }
787        }
788
789        for path in state.files.keys() {
790            if !current_set.contains(path) {
791                plan.deleted.push(path.clone());
792            }
793        }
794
795        Ok(plan)
796    }
797
798    /// Delete all chunks for a file from both vector index and metadata DB
799    fn delete_file_from_index(&self, index_path: &str, file_path: &Path) -> Result<()> {
800        let file_str = file_path.to_string_lossy().to_string();
801        let ids =
802            filtering::where_condition(index_path, "file = ?", &[serde_json::json!(file_str)])
803                .unwrap_or_default();
804
805        if !ids.is_empty() {
806            delete_from_index(&ids, index_path)?;
807            filtering::delete(index_path, &ids)?;
808        }
809        Ok(())
810    }
811
812    /// Clean up orphaned entries: files in index but not on disk
813    /// This handles directory deletion/rename and any state inconsistencies
814    fn cleanup_orphaned_entries(&self, index_path: &str) -> Result<usize> {
815        // Get all unique file paths from the index
816        let all_metadata = filtering::get(index_path, None, &[], None).unwrap_or_default();
817
818        let mut indexed_files: HashSet<String> = HashSet::new();
819        for meta in &all_metadata {
820            if let Some(file) = meta.get("file").and_then(|v| v.as_str()) {
821                indexed_files.insert(file.to_string());
822            }
823        }
824
825        let mut deleted_count = 0;
826        for file_str in indexed_files {
827            let full_path = self.project_root.join(&file_str);
828            if !full_path.exists() {
829                // File no longer exists on disk - delete from index
830                let ids = filtering::where_condition(
831                    index_path,
832                    "file = ?",
833                    &[serde_json::json!(file_str)],
834                )
835                .unwrap_or_default();
836
837                if !ids.is_empty() {
838                    delete_from_index(&ids, index_path)?;
839                    filtering::delete(index_path, &ids)?;
840                    deleted_count += ids.len();
841                }
842            }
843        }
844
845        Ok(deleted_count)
846    }
847
    /// Encode and write `units` to the index without progress output.
    #[allow(dead_code)]
    fn write_index(&self, units: &[CodeUnit]) -> Result<()> {
        self.write_index_impl(units, false)
    }

    /// Encode and write `units` to the index, showing a progress bar.
    fn write_index_with_progress(&self, units: &[CodeUnit]) -> Result<()> {
        self.write_index_impl(units, true)
    }
856
    /// Shared implementation for encoding `units` and writing them to the
    /// vector index plus the filtering metadata DB.
    ///
    /// Work is done in CHUNK_SIZE-document chunks (each encoded in batches
    /// of 64) so peak memory stays bounded; every chunk is flushed to disk
    /// before the next one is encoded.
    fn write_index_impl(&self, units: &[CodeUnit], show_progress: bool) -> Result<()> {
        let index_path = get_vector_index_path(&self.index_dir);
        let index_path_str = index_path.to_str().unwrap();
        std::fs::create_dir_all(&index_path)?;

        // Progress bar for encoding
        let pb = if show_progress {
            let pb = ProgressBar::new(units.len() as u64);
            pb.set_style(
                ProgressStyle::default_bar()
                    .template("{spinner:.green} [{bar:40.cyan/blue}] {pos}/{len} {msg}")
                    .unwrap()
                    .progress_chars("█▓░"),
            );
            pb.set_message("Encoding...");
            Some(pb)
        } else {
            None
        };

        let config = IndexConfig::default();
        let update_config = UpdateConfig::default();

        // Process in chunks of 500 documents to avoid RAM issues
        // Each chunk is encoded and written to the index before processing the next
        const CHUNK_SIZE: usize = 500;
        let encode_batch_size = 64;

        for (chunk_idx, unit_chunk) in units.chunks(CHUNK_SIZE).enumerate() {
            // Build embedding text for this chunk
            let texts: Vec<String> = unit_chunk.iter().map(build_embedding_text).collect();
            let text_refs: Vec<&str> = texts.iter().map(|s| s.as_str()).collect();

            // Encode in smaller batches within the chunk
            let mut chunk_embeddings = Vec::new();
            for batch in text_refs.chunks(encode_batch_size) {
                let batch_embeddings = self
                    .model
                    .encode_documents(batch, None)
                    .context("Failed to encode documents")?;
                chunk_embeddings.extend(batch_embeddings);

                if let Some(ref pb) = pb {
                    let progress = chunk_idx * CHUNK_SIZE + chunk_embeddings.len();
                    pb.set_position(progress.min(units.len()) as u64);
                }
            }

            // Write this chunk to the index
            let (_, doc_ids) = MmapIndex::update_or_create(
                &chunk_embeddings,
                index_path_str,
                &config,
                &update_config,
            )?;

            // Store metadata for this chunk
            let metadata: Vec<serde_json::Value> = unit_chunk
                .iter()
                .map(|u| serde_json::to_value(u).unwrap())
                .collect();

            // First chunk may need to create the filtering DB; later chunks append.
            if filtering::exists(index_path_str) {
                filtering::update(index_path_str, &metadata, &doc_ids)?;
            } else {
                filtering::create(index_path_str, &metadata, &doc_ids)?;
            }
        }

        if let Some(pb) = pb {
            pb.finish_and_clear();
        }

        Ok(())
    }
932
    /// Get index status (what would be updated).
    ///
    /// Computes the update plan (added/changed/deleted/unchanged files)
    /// against the persisted index state without modifying anything.
    /// `languages` optionally restricts the scan to the given languages.
    pub fn status(&self, languages: Option<&[Language]>) -> Result<UpdatePlan> {
        let state = IndexState::load(&self.index_dir)?;
        self.compute_update_plan(&state, languages)
    }
938}
939
/// A single search hit: the matched code unit plus its score.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchResult {
    /// The code unit whose metadata matched the query.
    pub unit: CodeUnit,
    /// Score returned by the vector index search for this document.
    pub score: f32,
}
946
947/// Build a GlobSet from patterns for efficient matching
948fn build_glob_set(patterns: &[String]) -> Option<GlobSet> {
949    if patterns.is_empty() {
950        return None;
951    }
952
953    let mut builder = GlobSetBuilder::new();
954    for pattern in patterns {
955        // Prepend **/ if pattern doesn't start with ** or /
956        // This makes "*.rs" match files in any directory
957        let normalized = if !pattern.starts_with("**/") && !pattern.starts_with('/') {
958            format!("**/{}", pattern)
959        } else {
960            pattern.clone()
961        };
962
963        if let Ok(glob) = Glob::new(&normalized) {
964            builder.add(glob);
965        }
966    }
967
968    builder.build().ok()
969}
970
971/// Check if a file path matches any of the glob patterns
972fn matches_glob_pattern(path: &Path, patterns: &[String]) -> bool {
973    if patterns.is_empty() {
974        return true;
975    }
976
977    let Some(glob_set) = build_glob_set(patterns) else {
978        return false;
979    };
980
981    glob_set.is_match(path)
982}
983
/// Semantic search over a built index: pairs the ColBERT query encoder with
/// a memory-mapped vector index and its metadata store.
pub struct Searcher {
    // ColBERT model used to encode queries.
    model: Colbert,
    // Memory-mapped vector index used for retrieval.
    index: MmapIndex,
    // UTF-8 path to the index, passed to `filtering::*` metadata calls.
    index_path: String,
}
989
990impl Searcher {
991    pub fn load(project_root: &Path, model_path: &Path) -> Result<Self> {
992        let index_dir = get_index_dir_for_project(project_root)?;
993        let index_path = get_vector_index_path(&index_dir);
994        let index_path_str = index_path.to_str().unwrap().to_string();
995
996        // Load model
997        let model = Colbert::builder(model_path)
998            .with_quantized(true)
999            .build()
1000            .context("Failed to load ColBERT model")?;
1001
1002        // Load index
1003        let index = MmapIndex::load(&index_path_str).context("Failed to load index")?;
1004
1005        Ok(Self {
1006            model,
1007            index,
1008            index_path: index_path_str,
1009        })
1010    }
1011
1012    /// Load a searcher from a specific index directory (for parent index use)
1013    pub fn load_from_index_dir(index_dir: &Path, model_path: &Path) -> Result<Self> {
1014        let index_path = get_vector_index_path(index_dir);
1015        let index_path_str = index_path.to_str().unwrap().to_string();
1016
1017        let model = Colbert::builder(model_path)
1018            .with_quantized(true)
1019            .build()
1020            .context("Failed to load ColBERT model")?;
1021
1022        let index = MmapIndex::load(&index_path_str).context("Failed to load index")?;
1023
1024        Ok(Self {
1025            model,
1026            index,
1027            index_path: index_path_str,
1028        })
1029    }
1030
1031    /// Filter results to files within a subdirectory prefix.
1032    /// Returns document IDs where file path starts with the given prefix.
1033    pub fn filter_by_path_prefix(&self, prefix: &Path) -> Result<Vec<i64>> {
1034        let prefix_str = prefix.to_string_lossy();
1035        // Use SQL LIKE with the prefix followed by %
1036        let like_pattern = format!("{}%", prefix_str);
1037        let subset = filtering::where_condition(
1038            &self.index_path,
1039            "file LIKE ?",
1040            &[serde_json::json!(like_pattern)],
1041        )
1042        .unwrap_or_default();
1043
1044        Ok(subset)
1045    }
1046
1047    /// Get document IDs matching the given file patterns using globset
1048    pub fn filter_by_file_patterns(&self, patterns: &[String]) -> Result<Vec<i64>> {
1049        if patterns.is_empty() {
1050            return Ok(vec![]);
1051        }
1052
1053        // Build globset from patterns
1054        let Some(glob_set) = build_glob_set(patterns) else {
1055            return Ok(vec![]);
1056        };
1057
1058        // Get all metadata from the index
1059        let all_metadata = filtering::get(&self.index_path, None, &[], None).unwrap_or_default();
1060
1061        // Filter metadata by matching file paths against glob patterns
1062        let matching_ids: Vec<i64> = all_metadata
1063            .into_iter()
1064            .filter_map(|row| {
1065                let doc_id = row.get("_id")?.as_i64()?;
1066                let file = row.get("file")?.as_str()?;
1067                let path = Path::new(file);
1068                if glob_set.is_match(path) {
1069                    Some(doc_id)
1070                } else {
1071                    None
1072                }
1073            })
1074            .collect();
1075
1076        Ok(matching_ids)
1077    }
1078
1079    /// Get document IDs for code units in the given files (exact match)
1080    pub fn filter_by_files(&self, files: &[String]) -> Result<Vec<i64>> {
1081        if files.is_empty() {
1082            return Ok(vec![]);
1083        }
1084
1085        // Build SQL condition with OR for multiple exact file matches
1086        let mut conditions = Vec::new();
1087        let mut params = Vec::new();
1088
1089        for file in files {
1090            conditions.push("file = ?");
1091            params.push(serde_json::json!(file));
1092        }
1093
1094        let condition = conditions.join(" OR ");
1095        let subset =
1096            filtering::where_condition(&self.index_path, &condition, &params).unwrap_or_default();
1097
1098        Ok(subset)
1099    }
1100
1101    pub fn search(
1102        &self,
1103        query: &str,
1104        top_k: usize,
1105        subset: Option<&[i64]>,
1106    ) -> Result<Vec<SearchResult>> {
1107        // Encode query
1108        let query_embeddings = self
1109            .model
1110            .encode_queries(&[query])
1111            .context("Failed to encode query")?;
1112        let query_emb = &query_embeddings[0];
1113
1114        // Search
1115        let params = SearchParameters {
1116            top_k,
1117            ..Default::default()
1118        };
1119        let results = self
1120            .index
1121            .search(query_emb, &params, subset)
1122            .context("Search failed")?;
1123
1124        // Retrieve metadata for the result document IDs
1125        let doc_ids: Vec<i64> = results.passage_ids.to_vec();
1126        let metadata = filtering::get(&self.index_path, None, &[], Some(&doc_ids))
1127            .context("Failed to retrieve metadata")?;
1128
1129        // Map to SearchResult (fixing SQLite type conversions)
1130        let search_results: Vec<SearchResult> = metadata
1131            .into_iter()
1132            .zip(results.scores.iter())
1133            .filter_map(|(mut meta, &score)| {
1134                if let serde_json::Value::Object(ref mut obj) = meta {
1135                    // SQLite stores booleans as integers - convert them back
1136                    for key in ["has_loops", "has_branches", "has_error_handling"] {
1137                        if let Some(v) = obj.get(key) {
1138                            if let Some(n) = v.as_i64() {
1139                                obj.insert(key.to_string(), serde_json::Value::Bool(n != 0));
1140                            }
1141                        }
1142                    }
1143                    // SQLite stores arrays as JSON strings - parse them back
1144                    for key in ["calls", "called_by", "parameters", "variables", "imports"] {
1145                        if let Some(serde_json::Value::String(s)) = obj.get(key) {
1146                            if let Ok(arr) = serde_json::from_str::<serde_json::Value>(s) {
1147                                obj.insert(key.to_string(), arr);
1148                            }
1149                        }
1150                    }
1151                }
1152                serde_json::from_value::<CodeUnit>(meta)
1153                    .ok()
1154                    .map(|unit| SearchResult { unit, score })
1155            })
1156            .collect();
1157
1158        Ok(search_results)
1159    }
1160
1161    pub fn num_documents(&self) -> usize {
1162        self.index.num_documents()
1163    }
1164}
1165
/// Check if an index exists for the given project.
///
/// Thin wrapper over `paths::index_exists` so callers don't need to depend
/// on the `paths` submodule directly.
pub fn index_exists(project_root: &Path) -> bool {
    paths::index_exists(project_root)
}
1170
/// Unit tests for glob pattern matching (`matches_glob_pattern`) and
/// project-root path containment (`is_within_project_root`, defined
/// elsewhere in this file).
#[cfg(test)]
mod tests {
    use super::*;

    // Bare "*.rs" patterns get a "**/" prefix, so they match at any depth.
    #[test]
    fn test_glob_simple_extension() {
        let patterns = vec!["*.rs".to_string()];
        assert!(matches_glob_pattern(Path::new("src/main.rs"), &patterns));
        assert!(matches_glob_pattern(
            Path::new("nested/deep/file.rs"),
            &patterns
        ));
        assert!(!matches_glob_pattern(Path::new("src/main.py"), &patterns));
    }

    // Explicit "**/" prefixes are preserved, including matching at the root.
    #[test]
    fn test_glob_recursive_double_star() {
        let patterns = vec!["**/*.rs".to_string()];
        assert!(matches_glob_pattern(Path::new("src/main.rs"), &patterns));
        assert!(matches_glob_pattern(Path::new("a/b/c/d.rs"), &patterns));
        assert!(!matches_glob_pattern(Path::new("main.py"), &patterns));
    }

    // Directory-qualified patterns still get the "**/" prefix, so the
    // directory may appear at any depth.
    #[test]
    fn test_glob_directory_pattern() {
        let patterns = vec!["src/**/*.rs".to_string()];
        assert!(matches_glob_pattern(Path::new("src/main.rs"), &patterns));
        assert!(matches_glob_pattern(
            Path::new("src/index/mod.rs"),
            &patterns
        ));
        // Matches anywhere src/ appears due to **/ prefix
        assert!(matches_glob_pattern(
            Path::new("project/src/main.rs"),
            &patterns
        ));
        assert!(!matches_glob_pattern(Path::new("lib/main.rs"), &patterns));
    }

    // Hidden directories (dotfiles) are matched like any other component.
    #[test]
    fn test_glob_github_workflows() {
        let patterns = vec!["**/.github/**/*".to_string()];
        assert!(matches_glob_pattern(
            Path::new(".github/workflows/ci.yml"),
            &patterns
        ));
        assert!(matches_glob_pattern(
            Path::new("project/.github/actions/setup.yml"),
            &patterns
        ));
        assert!(!matches_glob_pattern(Path::new("src/main.rs"), &patterns));
    }

    // A path matches if ANY pattern in the list matches.
    #[test]
    fn test_glob_multiple_patterns() {
        let patterns = vec!["*.rs".to_string(), "*.py".to_string()];
        assert!(matches_glob_pattern(Path::new("main.rs"), &patterns));
        assert!(matches_glob_pattern(Path::new("main.py"), &patterns));
        assert!(!matches_glob_pattern(Path::new("main.js"), &patterns));
    }

    // Suffix-style patterns (e.g. Go test files) must not match near-misses.
    #[test]
    fn test_glob_test_files() {
        let patterns = vec!["*_test.go".to_string()];
        assert!(matches_glob_pattern(
            Path::new("pkg/main_test.go"),
            &patterns
        ));
        assert!(!matches_glob_pattern(Path::new("pkg/main.go"), &patterns));
    }

    #[test]
    fn test_glob_empty_patterns() {
        let patterns: Vec<String> = vec![];
        // Empty patterns should match everything
        assert!(matches_glob_pattern(Path::new("any/file.rs"), &patterns));
    }

    // NOTE: the three tests below share the same temp dir name; they only
    // create it (never delete), so concurrent test runs do not conflict.
    #[test]
    fn test_is_within_project_root_simple_path() {
        let temp_dir = std::env::temp_dir().join("plaid_test_project");
        let _ = std::fs::create_dir_all(&temp_dir);

        // Simple relative path should be allowed
        assert!(is_within_project_root(&temp_dir, Path::new("src/main.rs")));
        assert!(is_within_project_root(&temp_dir, Path::new("file.txt")));
    }

    #[test]
    fn test_is_within_project_root_path_traversal() {
        let temp_dir = std::env::temp_dir().join("plaid_test_project");
        let _ = std::fs::create_dir_all(&temp_dir);

        // Path traversal attempts should be rejected
        assert!(!is_within_project_root(
            &temp_dir,
            Path::new("../../../etc/passwd")
        ));
        assert!(!is_within_project_root(&temp_dir, Path::new("../sibling")));
        assert!(!is_within_project_root(
            &temp_dir,
            Path::new("foo/../../..")
        ));
    }

    #[test]
    fn test_is_within_project_root_hidden_traversal() {
        let temp_dir = std::env::temp_dir().join("plaid_test_project");
        let _ = std::fs::create_dir_all(&temp_dir);

        // Hidden path traversal patterns
        assert!(!is_within_project_root(
            &temp_dir,
            Path::new("src/../../../etc/passwd")
        ));
        assert!(!is_within_project_root(
            &temp_dir,
            Path::new("./foo/../../../bar")
        ));
    }

    // Uses its own dedicated temp dir (and cleans up) because it creates
    // real files to exercise path resolution on disk.
    #[test]
    fn test_is_within_project_root_valid_dotdot_in_middle() {
        let temp_dir = std::env::temp_dir().join("plaid_test_project_dotdot");
        let sub_dir = temp_dir.join("src").join("subdir");
        let _ = std::fs::create_dir_all(&sub_dir);

        // Create a test file
        let test_file = temp_dir.join("src").join("main.rs");
        let _ = std::fs::write(&test_file, "fn main() {}");

        // Path that goes down then up but stays within project should be allowed
        // src/subdir/../main.rs resolves to src/main.rs
        assert!(is_within_project_root(
            &temp_dir,
            Path::new("src/subdir/../main.rs")
        ));

        // Cleanup
        let _ = std::fs::remove_dir_all(&temp_dir);
    }
}