Skip to main content

vtcode_indexer/
lib.rs

1//! Workspace-friendly file indexer extracted from VT Code.
2//!
3//! `vtcode-indexer` offers a lightweight alternative to heavyweight
4//! search/indexing stacks. It recursively walks a workspace, computes
5//! hashes, and stores per-file metadata in Markdown-friendly summaries
6//! so changes remain easy to audit in git.
7
8use anyhow::Result;
9use hashbrown::HashMap;
10use ignore::{DirEntry, Walk, WalkBuilder};
11use regex::Regex;
12use serde::{Deserialize, Serialize};
13use std::fmt::Write as FmtWrite;
14use std::fs;
15use std::io::{BufWriter, ErrorKind, Write};
16use std::path::{Path, PathBuf};
17use std::sync::Arc;
18use std::time::SystemTime;
19
20/// Persistence backend for [`SimpleIndexer`].
21pub trait IndexStorage: Send + Sync {
22    /// Prepare any directories or resources required for persistence.
23    fn init(&self, index_dir: &Path) -> Result<()>;
24
25    /// Persist an indexed file entry.
26    fn persist(&self, index_dir: &Path, entry: &FileIndex) -> Result<()>;
27
28    /// Whether this backend expects full-snapshot persistence.
29    ///
30    /// Snapshot-aware backends receive the complete in-memory index on each
31    /// update so on-disk state stays consistent across single-file and
32    /// directory indexing flows.
33    fn prefers_snapshot_persistence(&self) -> bool {
34        false
35    }
36
37    /// Remove a previously persisted file entry.
38    ///
39    /// Defaults to a no-op to keep existing custom storage backends compatible.
40    fn remove(&self, _index_dir: &Path, _file_path: &Path) -> Result<()> {
41        Ok(())
42    }
43
44    /// Persist a batch of indexed file entries.
45    ///
46    /// Defaults to calling [`IndexStorage::persist`] for each entry, keeping
47    /// existing custom storage backends compatible.
48    fn persist_batch(&self, index_dir: &Path, entries: &[FileIndex]) -> Result<()> {
49        for entry in entries {
50            self.persist(index_dir, entry)?;
51        }
52        Ok(())
53    }
54
55    /// Persist a batch of indexed file entries borrowed from the in-memory cache.
56    ///
57    /// Defaults to cloning the borrowed entries and delegating to
58    /// [`IndexStorage::persist_batch`] so existing custom storage backends remain
59    /// compatible.
60    fn persist_batch_refs(&self, index_dir: &Path, entries: &[&FileIndex]) -> Result<()> {
61        let owned = entries
62            .iter()
63            .map(|entry| (*entry).clone())
64            .collect::<Vec<_>>();
65        self.persist_batch(index_dir, &owned)
66    }
67}
68
/// Directory traversal filter hook for [`SimpleIndexer`].
///
/// Implementations decide which directories are walked and which files are
/// indexed. Both methods receive the indexer's [`SimpleIndexerConfig`].
pub trait TraversalFilter: Send + Sync {
    /// Determine if the indexer should descend into the provided directory.
    ///
    /// Returning `false` prunes the directory during the walk.
    fn should_descend(&self, path: &Path, config: &SimpleIndexerConfig) -> bool;

    /// Determine if the indexer should process the provided file.
    ///
    /// Returning `false` keeps the file out of the index.
    fn should_index_file(&self, path: &Path, config: &SimpleIndexerConfig) -> bool;
}
77
78/// Markdown-backed [`IndexStorage`] implementation.
79#[derive(Debug, Default, Clone)]
80pub struct MarkdownIndexStorage;
81
82impl IndexStorage for MarkdownIndexStorage {
83    fn init(&self, index_dir: &Path) -> Result<()> {
84        fs::create_dir_all(index_dir)?;
85        Ok(())
86    }
87
88    fn persist(&self, index_dir: &Path, entry: &FileIndex) -> Result<()> {
89        fs::create_dir_all(index_dir)?;
90        let file_name = format!("{}.md", calculate_hash(&entry.path));
91        let index_path = index_dir.join(file_name);
92        let file = fs::File::create(index_path)?;
93        let mut writer = BufWriter::new(file);
94        writeln!(writer, "# File Index: {}", entry.path)?;
95        writeln!(writer)?;
96        write_markdown_fields(&mut writer, entry)?;
97        writer.flush()?;
98        Ok(())
99    }
100
101    fn prefers_snapshot_persistence(&self) -> bool {
102        true
103    }
104
105    fn remove(&self, index_dir: &Path, file_path: &Path) -> Result<()> {
106        let file_name = format!(
107            "{}.md",
108            calculate_hash(file_path.to_string_lossy().as_ref())
109        );
110        let index_path = index_dir.join(file_name);
111        match fs::remove_file(index_path) {
112            Ok(()) => Ok(()),
113            Err(err) if err.kind() == ErrorKind::NotFound => Ok(()),
114            Err(err) => Err(err.into()),
115        }
116    }
117
118    fn persist_batch(&self, index_dir: &Path, entries: &[FileIndex]) -> Result<()> {
119        persist_markdown_snapshot(index_dir, entries.iter())
120    }
121
122    fn persist_batch_refs(&self, index_dir: &Path, entries: &[&FileIndex]) -> Result<()> {
123        persist_markdown_snapshot(index_dir, entries.iter().copied())
124    }
125}
126
127fn persist_markdown_snapshot<'a>(
128    index_dir: &Path,
129    entries: impl IntoIterator<Item = &'a FileIndex>,
130) -> Result<()> {
131    let entries = entries.into_iter().collect::<Vec<_>>();
132
133    fs::create_dir_all(index_dir)?;
134    let temp_path = index_dir.join(".index.md.tmp");
135    let final_path = index_dir.join("index.md");
136    let file = fs::File::create(&temp_path)?;
137    let mut writer = BufWriter::new(file);
138
139    writeln!(writer, "# Workspace File Index")?;
140    writeln!(writer)?;
141    writeln!(writer, "- **Entries**: {}", entries.len())?;
142    writeln!(writer)?;
143
144    for entry in entries {
145        write_markdown_entry(&mut writer, entry)?;
146    }
147
148    writer.flush()?;
149    fs::rename(temp_path, final_path)?;
150    cleanup_legacy_markdown_entries(index_dir)?;
151    Ok(())
152}
153
154/// Default traversal filter powered by [`SimpleIndexerConfig`].
155#[derive(Debug, Default, Clone)]
156pub struct ConfigTraversalFilter;
157
158impl TraversalFilter for ConfigTraversalFilter {
159    fn should_descend(&self, path: &Path, config: &SimpleIndexerConfig) -> bool {
160        !should_skip_dir(path, config)
161    }
162
163    fn should_index_file(&self, path: &Path, config: &SimpleIndexerConfig) -> bool {
164        if !path.is_file() {
165            return false;
166        }
167
168        // Skip hidden files when configured.
169        if config.ignore_hidden
170            && path
171                .file_name()
172                .and_then(|n| n.to_str())
173                .is_some_and(|s| s.starts_with('.'))
174        {
175            return false;
176        }
177
178        // Always skip known sensitive files regardless of config.
179        if let Some(file_name) = path.file_name().and_then(|n| n.to_str())
180            && (vtcode_commons::exclusions::is_sensitive_file(file_name)
181                || file_name == ".gitignore"
182                || file_name == ".git")
183        {
184            return false;
185        }
186
187        true
188    }
189}
190
191/// Configuration for [`SimpleIndexer`].
192#[derive(Clone, Debug)]
193pub struct SimpleIndexerConfig {
194    workspace_root: PathBuf,
195    index_dir: PathBuf,
196    ignore_hidden: bool,
197    excluded_dirs: Vec<PathBuf>,
198    allowed_dirs: Vec<PathBuf>,
199}
200
201impl SimpleIndexerConfig {
202    /// Builds a configuration using VT Code's legacy layout as defaults.
203    pub fn new(workspace_root: PathBuf) -> Self {
204        let index_dir = workspace_root.join(".vtcode").join("index");
205        let vtcode_dir = workspace_root.join(".vtcode");
206        let external_dir = vtcode_dir.join("external");
207
208        let mut excluded_dirs: Vec<PathBuf> = vtcode_commons::exclusions::DEFAULT_EXCLUDED_DIRS
209            .iter()
210            .map(|name| workspace_root.join(name))
211            .collect();
212        excluded_dirs.push(index_dir.clone());
213        excluded_dirs.push(vtcode_dir);
214
215        excluded_dirs.dedup();
216
217        Self {
218            workspace_root,
219            index_dir,
220            ignore_hidden: true,
221            excluded_dirs,
222            allowed_dirs: vec![external_dir],
223        }
224    }
225
226    /// Updates the index directory used for persisted metadata.
227    pub fn with_index_dir(mut self, index_dir: impl Into<PathBuf>) -> Self {
228        let index_dir = index_dir.into();
229        self.index_dir = index_dir.clone();
230        self.push_unique_excluded(index_dir);
231        self
232    }
233
234    /// Adds an allowed directory that should be indexed even if hidden or inside an excluded parent.
235    pub fn add_allowed_dir(mut self, path: impl Into<PathBuf>) -> Self {
236        let path = path.into();
237        if !self.allowed_dirs.iter().any(|existing| existing == &path) {
238            self.allowed_dirs.push(path);
239        }
240        self
241    }
242
243    /// Adds an additional excluded directory to skip during traversal.
244    pub fn add_excluded_dir(mut self, path: impl Into<PathBuf>) -> Self {
245        let path = path.into();
246        self.push_unique_excluded(path);
247        self
248    }
249
250    /// Toggles whether hidden directories (prefix `.`) are ignored.
251    pub fn ignore_hidden(mut self, ignore_hidden: bool) -> Self {
252        self.ignore_hidden = ignore_hidden;
253        self
254    }
255
256    /// Workspace root accessor.
257    pub fn workspace_root(&self) -> &Path {
258        &self.workspace_root
259    }
260
261    /// Index directory accessor.
262    pub fn index_dir(&self) -> &Path {
263        &self.index_dir
264    }
265
266    fn push_unique_excluded(&mut self, path: PathBuf) {
267        if !self.excluded_dirs.iter().any(|existing| existing == &path) {
268            self.excluded_dirs.push(path);
269        }
270    }
271}
272
/// Simple file index entry.
///
/// One entry describes a single indexed file; it is what storage backends
/// receive and what the in-memory cache stores.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileIndex {
    /// File path, stored as a (lossily converted) UTF-8 string.
    pub path: String,
    /// File content hash for change detection.
    pub hash: String,
    /// Last modified timestamp, in whole seconds since the Unix epoch.
    pub modified: u64,
    /// File size: the byte length of the file's text content.
    pub size: u64,
    /// Language/extension: the file extension, or `"unknown"` when there is none.
    pub language: String,
    /// Simple tags. The indexer itself creates entries with no tags.
    pub tags: Vec<String>,
}
289
/// Simple search result.
///
/// Each value describes one matching line in one indexed file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchResult {
    /// Path of the file containing the match, as stored in the index.
    pub file_path: String,
    /// 1-based line number of the matching line.
    pub line_number: usize,
    /// Full text of the matching line.
    pub line_content: String,
    /// Matched text: every regex match on the line for `search`, or the
    /// whole line as a single element for `grep`.
    pub matches: Vec<String>,
}
298
/// Simple file indexer.
///
/// Keeps an in-memory cache of indexed files and mirrors it to a pluggable
/// storage backend; a pluggable filter decides what gets traversed and indexed.
pub struct SimpleIndexer {
    /// Workspace layout and traversal settings.
    config: SimpleIndexerConfig,
    /// Indexed entries keyed by file path string.
    index_cache: HashMap<String, FileIndex>,
    /// Backend used to persist and remove index entries.
    storage: Arc<dyn IndexStorage>,
    /// Filter consulted for every directory and file.
    filter: Arc<dyn TraversalFilter>,
}
306
307impl SimpleIndexer {
308    /// Create a new simple indexer with default VT Code paths.
309    pub fn new(workspace_root: PathBuf) -> Self {
310        Self::with_components(
311            SimpleIndexerConfig::new(workspace_root),
312            Arc::new(MarkdownIndexStorage),
313            Arc::new(ConfigTraversalFilter),
314        )
315    }
316
317    /// Create a simple indexer with the provided configuration.
318    pub fn with_config(config: SimpleIndexerConfig) -> Self {
319        Self::with_components(
320            config,
321            Arc::new(MarkdownIndexStorage),
322            Arc::new(ConfigTraversalFilter),
323        )
324    }
325
326    /// Create a new simple indexer using a custom index directory.
327    pub fn with_index_dir(workspace_root: PathBuf, index_dir: PathBuf) -> Self {
328        let config = SimpleIndexerConfig::new(workspace_root).with_index_dir(index_dir);
329        Self::with_config(config)
330    }
331
332    /// Create an indexer with explicit storage and traversal filter implementations.
333    pub fn with_components(
334        config: SimpleIndexerConfig,
335        storage: Arc<dyn IndexStorage>,
336        filter: Arc<dyn TraversalFilter>,
337    ) -> Self {
338        Self {
339            config,
340            index_cache: HashMap::new(),
341            storage,
342            filter,
343        }
344    }
345
346    /// Replace the storage backend used to persist index entries.
347    pub fn with_storage(self, storage: Arc<dyn IndexStorage>) -> Self {
348        Self { storage, ..self }
349    }
350
351    /// Replace the traversal filter used to decide which files and directories are indexed.
352    pub fn with_filter(self, filter: Arc<dyn TraversalFilter>) -> Self {
353        Self { filter, ..self }
354    }
355
356    /// Initialize the index directory.
357    pub fn init(&self) -> Result<()> {
358        self.storage.init(self.config.index_dir())
359    }
360
361    /// Get the workspace root path.
362    pub fn workspace_root(&self) -> &Path {
363        self.config.workspace_root()
364    }
365
366    /// Get the index directory used for persisted metadata.
367    pub fn index_dir(&self) -> &Path {
368        self.config.index_dir()
369    }
370
371    /// Index a single file.
372    pub fn index_file(&mut self, file_path: &Path) -> Result<()> {
373        let cache_key = file_path.to_string_lossy().into_owned();
374
375        if self.storage.prefers_snapshot_persistence() {
376            let next_entry = if file_path.exists() && self.should_process_file_path(file_path) {
377                self.build_file_index(file_path)?
378            } else {
379                None
380            };
381
382            self.apply_snapshot_file_update(cache_key, next_entry)?;
383            return Ok(());
384        }
385
386        if !file_path.exists() || !self.should_process_file_path(file_path) {
387            self.index_cache.remove(cache_key.as_str());
388            self.storage.remove(self.config.index_dir(), file_path)?;
389            return Ok(());
390        }
391
392        if let Some(index) = self.build_file_index(file_path)? {
393            self.storage.persist(self.config.index_dir(), &index)?;
394            self.index_cache.insert(index.path.clone(), index);
395        } else {
396            self.index_cache.remove(cache_key.as_str());
397            self.storage.remove(self.config.index_dir(), file_path)?;
398        }
399
400        Ok(())
401    }
402
403    /// Index all files in directory recursively.
404    /// Respects .gitignore, .ignore, and other ignore files.
405    /// SECURITY: Always skips hidden files and sensitive data (.env, .git, etc.)
406    pub fn index_directory(&mut self, dir_path: &Path) -> Result<()> {
407        let walker = self.build_walker(dir_path);
408
409        let mut entries = Vec::new();
410
411        for entry in walker.filter_map(|e| e.ok()) {
412            let path = entry.path();
413
414            // Only index files, not directories
415            if entry.file_type().is_some_and(|ft| ft.is_file())
416                && let Some(index) = self.build_file_index(path)?
417            {
418                entries.push(index);
419            }
420        }
421
422        if self.storage.prefers_snapshot_persistence() {
423            self.apply_snapshot_directory_update(dir_path, &entries)?;
424        } else {
425            entries.sort_unstable_by(|left, right| left.path.cmp(&right.path));
426            self.storage
427                .persist_batch(self.config.index_dir(), &entries)?;
428        }
429
430        self.replace_cached_entries(dir_path, &entries);
431
432        Ok(())
433    }
434
435    /// Discover all files in directory recursively without indexing them.
436    /// This is much faster than `index_directory` as it avoids hashing and persistence.
437    pub fn discover_files(&self, dir_path: &Path) -> Vec<String> {
438        let walker = self.build_walker(dir_path);
439
440        let mut files = walker
441            .filter_map(|e| e.ok())
442            .filter(|e| {
443                if !e.file_type().is_some_and(|ft| ft.is_file()) {
444                    return false;
445                }
446
447                self.should_process_file_path(e.path())
448            })
449            .map(|e| e.path().to_string_lossy().into_owned())
450            .collect::<Vec<_>>();
451        files.sort_unstable();
452        files
453    }
454
455    /// Internal helper for regex-based file content search.
456    /// Used by both `search()` and `grep()` to avoid code duplication.
457    fn search_files_internal(
458        &self,
459        regex: &Regex,
460        path_filter: Option<&str>,
461        extract_matches: bool,
462    ) -> Vec<SearchResult> {
463        let mut results = Vec::with_capacity(self.index_cache.len());
464
465        for file_path in self.index_cache.keys() {
466            if path_filter.is_some_and(|filter| !file_path.contains(filter)) {
467                continue;
468            }
469
470            if let Ok(content) = fs::read_to_string(file_path) {
471                for (line_num, line) in content.lines().enumerate() {
472                    if regex.is_match(line) {
473                        let matches = if extract_matches {
474                            regex
475                                .find_iter(line)
476                                .map(|m| m.as_str().to_string())
477                                .collect()
478                        } else {
479                            vec![line.to_string()]
480                        };
481
482                        results.push(SearchResult {
483                            file_path: file_path.clone(),
484                            line_number: line_num + 1,
485                            line_content: line.to_string(),
486                            matches,
487                        });
488                    }
489                }
490            }
491        }
492
493        results.sort_unstable_by(|left, right| {
494            left.file_path
495                .cmp(&right.file_path)
496                .then_with(|| left.line_number.cmp(&right.line_number))
497        });
498        results
499    }
500
501    /// Search files using regex pattern.
502    pub fn search(&self, pattern: &str, path_filter: Option<&str>) -> Result<Vec<SearchResult>> {
503        let regex = Regex::new(pattern)?;
504        Ok(self.search_files_internal(&regex, path_filter, true))
505    }
506
507    /// Find files by name pattern.
508    pub fn find_files(&self, pattern: &str) -> Result<Vec<String>> {
509        let regex = Regex::new(pattern)?;
510        let mut results = Vec::with_capacity(self.index_cache.len());
511
512        for file_path in self.index_cache.keys() {
513            if regex.is_match(file_path) {
514                results.push(file_path.clone());
515            }
516        }
517
518        results.sort_unstable();
519        Ok(results)
520    }
521
522    /// Get all indexed files without pattern matching.
523    /// This is more efficient than using find_files(".*").
524    pub fn all_files(&self) -> Vec<String> {
525        let mut files = self.index_cache.keys().cloned().collect::<Vec<_>>();
526        files.sort_unstable();
527        files
528    }
529
530    /// Get file content with line numbers.
531    pub fn get_file_content(
532        &self,
533        file_path: &str,
534        start_line: Option<usize>,
535        end_line: Option<usize>,
536    ) -> Result<String> {
537        let content = fs::read_to_string(file_path)?;
538        let start = start_line.unwrap_or(1).max(1);
539        let end = end_line.unwrap_or(usize::MAX);
540
541        if start > end {
542            return Ok(String::new());
543        }
544
545        let mut result = String::new();
546        for (line_number, line) in content.lines().enumerate() {
547            let line_number = line_number + 1;
548            if line_number < start {
549                continue;
550            }
551            if line_number > end {
552                break;
553            }
554            writeln!(&mut result, "{line_number}: {line}")?;
555        }
556
557        Ok(result)
558    }
559
560    /// List files in directory (like ls).
561    pub fn list_files(&self, dir_path: &str, show_hidden: bool) -> Result<Vec<String>> {
562        let path = Path::new(dir_path);
563        if !path.exists() {
564            return Ok(vec![]);
565        }
566
567        let mut files = Vec::new();
568
569        for entry in fs::read_dir(path)? {
570            let entry = entry?;
571            let file_name = entry.file_name().to_string_lossy().into_owned();
572
573            if !show_hidden && file_name.starts_with('.') {
574                continue;
575            }
576
577            files.push(file_name);
578        }
579
580        files.sort_unstable();
581        Ok(files)
582    }
583
584    /// Grep-like search (like grep command).
585    pub fn grep(&self, pattern: &str, file_pattern: Option<&str>) -> Result<Vec<SearchResult>> {
586        let regex = Regex::new(pattern)?;
587        Ok(self.search_files_internal(&regex, file_pattern, false))
588    }
589
590    fn is_allowed_path(&self, path: &Path) -> bool {
591        self.config
592            .allowed_dirs
593            .iter()
594            .any(|allowed| path.starts_with(allowed))
595    }
596
597    #[inline]
598    fn get_modified_time(&self, file_path: &Path) -> Result<u64> {
599        let metadata = fs::metadata(file_path)?;
600        let modified = metadata.modified()?;
601        Ok(modified.duration_since(SystemTime::UNIX_EPOCH)?.as_secs())
602    }
603
604    #[inline]
605    fn detect_language(&self, file_path: &Path) -> String {
606        file_path
607            .extension()
608            .and_then(|ext| ext.to_str())
609            .unwrap_or("unknown")
610            .to_string()
611    }
612
613    fn build_file_index(&self, file_path: &Path) -> Result<Option<FileIndex>> {
614        if !self.should_process_file_path(file_path) {
615            return Ok(None);
616        }
617
618        let content = match fs::read_to_string(file_path) {
619            Ok(text) => text,
620            Err(err) => {
621                if err.kind() == ErrorKind::InvalidData {
622                    return Ok(None);
623                }
624                return Err(err.into());
625            }
626        };
627
628        let index = FileIndex {
629            path: file_path.to_string_lossy().into_owned(),
630            hash: calculate_hash(&content),
631            modified: self.get_modified_time(file_path)?,
632            size: content.len() as u64,
633            language: self.detect_language(file_path),
634            tags: vec![],
635        };
636
637        Ok(Some(index))
638    }
639
640    #[inline]
641    fn is_excluded_path(&self, path: &Path) -> bool {
642        self.config
643            .excluded_dirs
644            .iter()
645            .any(|excluded| path.starts_with(excluded))
646    }
647
648    #[inline]
649    fn should_index_file_path(&self, path: &Path) -> bool {
650        self.filter.should_index_file(path, &self.config)
651    }
652
653    #[inline]
654    fn should_process_file_path(&self, path: &Path) -> bool {
655        if self.is_allowed_path(path) {
656            return self.should_index_file_path(path);
657        }
658
659        !self.is_excluded_path(path) && self.should_index_file_path(path)
660    }
661
662    fn build_walker(&self, dir_path: &Path) -> Walk {
663        let walk_root = dir_path.to_path_buf();
664        let config = self.config.clone();
665        let filter = Arc::clone(&self.filter);
666
667        let mut builder = WalkBuilder::new(dir_path);
668        builder
669            .hidden(false)
670            .git_ignore(true)
671            .git_global(true)
672            .git_exclude(true)
673            .ignore(true)
674            .parents(true);
675        builder.filter_entry(move |entry| {
676            should_visit_entry(entry, walk_root.as_path(), &config, filter.as_ref())
677        });
678        builder.build()
679    }
680
681    fn replace_cached_entries(&mut self, dir_path: &Path, entries: &[FileIndex]) {
682        self.index_cache
683            .retain(|path, _| !Path::new(path).starts_with(dir_path));
684
685        self.index_cache.extend(
686            entries
687                .iter()
688                .cloned()
689                .map(|entry| (entry.path.clone(), entry)),
690        );
691    }
692
693    fn apply_snapshot_file_update(
694        &mut self,
695        cache_key: String,
696        next_entry: Option<FileIndex>,
697    ) -> Result<()> {
698        let previous_entry = match next_entry {
699            Some(entry) => self.index_cache.insert(cache_key.clone(), entry),
700            None => self.index_cache.remove(cache_key.as_str()),
701        };
702
703        if let Err(err) = self.persist_current_snapshot() {
704            match previous_entry {
705                Some(entry) => {
706                    self.index_cache.insert(cache_key, entry);
707                }
708                None => {
709                    self.index_cache.remove(cache_key.as_str());
710                }
711            }
712            return Err(err);
713        }
714
715        Ok(())
716    }
717
718    fn apply_snapshot_directory_update(
719        &mut self,
720        dir_path: &Path,
721        entries: &[FileIndex],
722    ) -> Result<()> {
723        let previous_entries = self.take_cached_entries(dir_path);
724        self.index_cache.extend(
725            entries
726                .iter()
727                .cloned()
728                .map(|entry| (entry.path.clone(), entry)),
729        );
730
731        if let Err(err) = self.persist_current_snapshot() {
732            self.index_cache
733                .retain(|path, _| !Path::new(path).starts_with(dir_path));
734            self.index_cache.extend(
735                previous_entries
736                    .into_iter()
737                    .map(|entry| (entry.path.clone(), entry)),
738            );
739            return Err(err);
740        }
741
742        Ok(())
743    }
744
745    fn take_cached_entries(&mut self, dir_path: &Path) -> Vec<FileIndex> {
746        let keys = self
747            .index_cache
748            .keys()
749            .filter(|path| Path::new(path).starts_with(dir_path))
750            .cloned()
751            .collect::<Vec<_>>();
752
753        keys.into_iter()
754            .filter_map(|path| self.index_cache.remove(path.as_str()))
755            .collect()
756    }
757
758    fn persist_current_snapshot(&self) -> Result<()> {
759        let mut snapshot = self.index_cache.values().collect::<Vec<_>>();
760        snapshot.sort_unstable_by(|left, right| left.path.cmp(&right.path));
761        self.storage
762            .persist_batch_refs(self.config.index_dir(), &snapshot)
763    }
764}
765
766impl Clone for SimpleIndexer {
767    fn clone(&self) -> Self {
768        Self {
769            config: self.config.clone(),
770            index_cache: self.index_cache.clone(),
771            storage: self.storage.clone(),
772            filter: self.filter.clone(),
773        }
774    }
775}
776
777fn should_skip_dir(path: &Path, config: &SimpleIndexerConfig) -> bool {
778    if is_allowed_path_or_ancestor(path, config) {
779        return false;
780    }
781
782    if config
783        .excluded_dirs
784        .iter()
785        .any(|excluded| path.starts_with(excluded))
786    {
787        return true;
788    }
789
790    if config.ignore_hidden
791        && path
792            .file_name()
793            .and_then(|name| name.to_str())
794            .is_some_and(|name_str| name_str.starts_with('.'))
795    {
796        return true;
797    }
798
799    false
800}
801
802fn is_allowed_path_or_ancestor(path: &Path, config: &SimpleIndexerConfig) -> bool {
803    config
804        .allowed_dirs
805        .iter()
806        .any(|allowed| path.starts_with(allowed) || allowed.starts_with(path))
807}
808
809fn should_visit_entry(
810    entry: &DirEntry,
811    walk_root: &Path,
812    config: &SimpleIndexerConfig,
813    filter: &dyn TraversalFilter,
814) -> bool {
815    if entry.path() == walk_root {
816        return true;
817    }
818
819    if !entry
820        .file_type()
821        .is_some_and(|file_type| file_type.is_dir())
822    {
823        return true;
824    }
825
826    filter.should_descend(entry.path(), config)
827}
828
829#[inline]
830fn calculate_hash(content: &str) -> String {
831    vtcode_commons::utils::calculate_sha256(content.as_bytes())
832}
833
834fn write_markdown_entry(writer: &mut impl Write, entry: &FileIndex) -> std::io::Result<()> {
835    writeln!(writer, "## {}", entry.path)?;
836    writeln!(writer)?;
837    write_markdown_fields(writer, entry)?;
838    writeln!(writer)?;
839    Ok(())
840}
841
842fn write_markdown_fields(writer: &mut impl Write, entry: &FileIndex) -> std::io::Result<()> {
843    writeln!(writer, "- **Path**: {}", entry.path)?;
844    writeln!(writer, "- **Hash**: {}", entry.hash)?;
845    writeln!(writer, "- **Modified**: {}", entry.modified)?;
846    writeln!(writer, "- **Size**: {} bytes", entry.size)?;
847    writeln!(writer, "- **Language**: {}", entry.language)?;
848    writeln!(writer, "- **Tags**: {}", entry.tags.join(", "))?;
849    Ok(())
850}
851
852fn cleanup_legacy_markdown_entries(index_dir: &Path) -> Result<()> {
853    for entry in fs::read_dir(index_dir)? {
854        let entry = entry?;
855        let file_name = entry.file_name();
856        let file_name = file_name.to_string_lossy();
857        if is_legacy_markdown_entry_name(file_name.as_ref()) {
858            fs::remove_file(entry.path())?;
859        }
860    }
861    Ok(())
862}
863
/// Returns `true` when `file_name` is exactly 64 ASCII hex digits followed by `.md`.
#[inline]
fn is_legacy_markdown_entry_name(file_name: &str) -> bool {
    match file_name.strip_suffix(".md") {
        Some(stem) => stem.len() == 64 && stem.chars().all(|ch| ch.is_ascii_hexdigit()),
        None => false,
    }
}
871
872#[cfg(test)]
873mod tests {
874    use super::*;
875    use std::fs;
876    use std::sync::{Arc, Mutex};
877    use tempfile::tempdir;
878
879    #[test]
880    fn skips_hidden_directories_by_default() -> Result<()> {
881        let temp = tempdir()?;
882        let workspace = temp.path();
883        let hidden_dir = workspace.join(".private");
884        fs::create_dir_all(&hidden_dir)?;
885        fs::write(hidden_dir.join("secret.txt"), "classified")?;
886
887        let visible_dir = workspace.join("src");
888        fs::create_dir_all(&visible_dir)?;
889        fs::write(visible_dir.join("lib.rs"), "fn main() {}")?;
890
891        let mut indexer = SimpleIndexer::new(workspace.to_path_buf());
892        indexer.init()?;
893        indexer.index_directory(workspace)?;
894
895        assert!(indexer.find_files("secret\\.txt$")?.is_empty());
896        assert!(!indexer.find_files("lib\\.rs$")?.is_empty());
897
898        Ok(())
899    }
900
901    #[test]
902    fn can_include_hidden_directories_when_configured() -> Result<()> {
903        let temp = tempdir()?;
904        let workspace = temp.path();
905        let hidden_dir = workspace.join(".cache");
906        fs::create_dir_all(&hidden_dir)?;
907        fs::write(hidden_dir.join("data.log"), "details")?;
908
909        let config = SimpleIndexerConfig::new(workspace.to_path_buf()).ignore_hidden(false);
910        let mut indexer = SimpleIndexer::with_config(config);
911        indexer.init()?;
912        indexer.index_directory(workspace)?;
913
914        let results = indexer.find_files("data\\.log$")?;
915        assert_eq!(results.len(), 1);
916
917        Ok(())
918    }
919
920    #[test]
921    fn indexes_allowed_directories_inside_hidden_excluded_parents() -> Result<()> {
922        let temp = tempdir()?;
923        let workspace = temp.path();
924        let allowed_dir = workspace.join(".vtcode").join("external");
925        fs::create_dir_all(&allowed_dir)?;
926        fs::write(allowed_dir.join("plugin.toml"), "name = 'demo'")?;
927
928        let mut indexer = SimpleIndexer::new(workspace.to_path_buf());
929        indexer.init()?;
930        indexer.index_directory(workspace)?;
931
932        let results = indexer.find_files("plugin\\.toml$")?;
933        assert_eq!(results.len(), 1);
934
935        Ok(())
936    }
937
938    #[test]
939    fn reindexing_prunes_deleted_files_from_cache() -> Result<()> {
940        let temp = tempdir()?;
941        let workspace = temp.path();
942        let file_path = workspace.join("notes.txt");
943        fs::write(&file_path, "remember this")?;
944
945        let mut indexer = SimpleIndexer::new(workspace.to_path_buf());
946        indexer.init()?;
947        indexer.index_directory(workspace)?;
948        assert_eq!(indexer.find_files("notes\\.txt$")?.len(), 1);
949
950        fs::remove_file(&file_path)?;
951        indexer.index_directory(workspace)?;
952
953        assert!(indexer.find_files("notes\\.txt$")?.is_empty());
954        assert!(indexer.all_files().is_empty());
955
956        Ok(())
957    }
958
959    #[test]
960    fn index_file_skips_excluded_paths() -> Result<()> {
961        let temp = tempdir()?;
962        let workspace = temp.path();
963        let index_dir = workspace.join(".vtcode").join("index");
964        fs::create_dir_all(&index_dir)?;
965        let generated_index = index_dir.join("index.md");
966        fs::write(&generated_index, "# generated")?;
967
968        let mut indexer = SimpleIndexer::new(workspace.to_path_buf());
969        indexer.init()?;
970        indexer.index_file(&generated_index)?;
971
972        assert!(indexer.all_files().is_empty());
973
974        Ok(())
975    }
976
977    #[test]
978    fn index_file_removes_stale_entry_when_file_becomes_unreadable() -> Result<()> {
979        let temp = tempdir()?;
980        let workspace = temp.path();
981        let file_path = workspace.join("notes.txt");
982        fs::write(&file_path, "remember this")?;
983
984        let mut indexer = SimpleIndexer::new(workspace.to_path_buf());
985        indexer.init()?;
986        indexer.index_file(&file_path)?;
987        assert!(
988            indexer
989                .find_files("notes\\.txt$")?
990                .iter()
991                .any(|file| file.ends_with("notes.txt"))
992        );
993
994        fs::write(&file_path, [0xFF, 0xFE, 0xFD])?;
995        indexer.index_file(&file_path)?;
996
997        assert!(indexer.find_files("notes\\.txt$")?.is_empty());
998
999        let index_content =
1000            fs::read_to_string(workspace.join(".vtcode").join("index").join("index.md"))?;
1001        assert!(!index_content.contains(file_path.to_string_lossy().as_ref()));
1002
1003        Ok(())
1004    }
1005
1006    #[test]
1007    fn index_file_maintains_markdown_snapshot_across_updates() -> Result<()> {
1008        let temp = tempdir()?;
1009        let workspace = temp.path();
1010        let first = workspace.join("first.txt");
1011        let second = workspace.join("second.txt");
1012        fs::write(&first, "one")?;
1013        fs::write(&second, "two")?;
1014
1015        let mut indexer = SimpleIndexer::new(workspace.to_path_buf());
1016        indexer.init()?;
1017        indexer.index_file(&first)?;
1018        indexer.index_file(&second)?;
1019
1020        let index_dir = workspace.join(".vtcode").join("index");
1021        let files = fs::read_dir(&index_dir)?
1022            .filter_map(|entry| entry.ok())
1023            .map(|entry| entry.file_name().to_string_lossy().into_owned())
1024            .collect::<Vec<_>>();
1025        assert_eq!(files, vec!["index.md".to_string()]);
1026
1027        let index_content = fs::read_to_string(index_dir.join("index.md"))?;
1028        assert!(index_content.contains(first.to_string_lossy().as_ref()));
1029        assert!(index_content.contains(second.to_string_lossy().as_ref()));
1030
1031        Ok(())
1032    }
1033
1034    #[test]
1035    fn index_directory_writes_markdown_snapshot_without_manual_init() -> Result<()> {
1036        let temp = tempdir()?;
1037        let workspace = temp.path();
1038        fs::write(workspace.join("notes.txt"), "remember this")?;
1039
1040        let mut indexer = SimpleIndexer::new(workspace.to_path_buf());
1041        indexer.index_directory(workspace)?;
1042
1043        let index_content =
1044            fs::read_to_string(workspace.join(".vtcode").join("index").join("index.md"))?;
1045        assert!(index_content.contains(workspace.join("notes.txt").to_string_lossy().as_ref()));
1046
1047        Ok(())
1048    }
1049
1050    #[test]
1051    fn get_file_content_clamps_ranges_without_panicking() -> Result<()> {
1052        let temp = tempdir()?;
1053        let workspace = temp.path();
1054        let file_path = workspace.join("notes.txt");
1055        fs::write(&file_path, "first\nsecond")?;
1056
1057        let indexer = SimpleIndexer::new(workspace.to_path_buf());
1058        let file_path = file_path.to_string_lossy().into_owned();
1059
1060        assert_eq!(indexer.get_file_content(&file_path, Some(5), None)?, "");
1061        assert_eq!(
1062            indexer.get_file_content(&file_path, Some(0), Some(1))?,
1063            "1: first\n"
1064        );
1065        assert_eq!(indexer.get_file_content(&file_path, Some(2), Some(1))?, "");
1066
1067        Ok(())
1068    }
1069
1070    #[test]
1071    fn supports_custom_storage_backends() -> Result<()> {
1072        #[derive(Clone, Default)]
1073        struct MemoryStorage {
1074            records: Arc<Mutex<Vec<FileIndex>>>,
1075        }
1076
1077        impl MemoryStorage {
1078            fn new(records: Arc<Mutex<Vec<FileIndex>>>) -> Self {
1079                Self { records }
1080            }
1081        }
1082
1083        impl IndexStorage for MemoryStorage {
1084            fn init(&self, _index_dir: &Path) -> Result<()> {
1085                Ok(())
1086            }
1087
1088            fn persist(&self, _index_dir: &Path, entry: &FileIndex) -> Result<()> {
1089                let mut guard = self.records.lock().expect("lock poisoned");
1090                guard.push(entry.clone());
1091                Ok(())
1092            }
1093        }
1094
1095        let temp = tempdir()?;
1096        let workspace = temp.path();
1097        fs::write(workspace.join("notes.txt"), "remember this")?;
1098
1099        let records: Arc<Mutex<Vec<FileIndex>>> = Arc::new(Mutex::new(Vec::new()));
1100        let storage = MemoryStorage::new(records.clone());
1101
1102        let config = SimpleIndexerConfig::new(workspace.to_path_buf());
1103        let mut indexer = SimpleIndexer::with_config(config).with_storage(Arc::new(storage));
1104        indexer.init()?;
1105        indexer.index_directory(workspace)?;
1106
1107        let entries = records.lock().expect("lock poisoned");
1108        assert_eq!(entries.len(), 1);
1109        assert_eq!(
1110            entries[0].path,
1111            workspace.join("notes.txt").to_string_lossy().into_owned()
1112        );
1113
1114        Ok(())
1115    }
1116
1117    #[test]
1118    fn custom_filters_can_skip_files() -> Result<()> {
1119        #[derive(Default)]
1120        struct SkipRustFilter {
1121            inner: ConfigTraversalFilter,
1122        }
1123
1124        impl TraversalFilter for SkipRustFilter {
1125            fn should_descend(&self, path: &Path, config: &SimpleIndexerConfig) -> bool {
1126                self.inner.should_descend(path, config)
1127            }
1128
1129            fn should_index_file(&self, path: &Path, config: &SimpleIndexerConfig) -> bool {
1130                if path
1131                    .extension()
1132                    .and_then(|ext| ext.to_str())
1133                    .is_some_and(|ext| ext.eq_ignore_ascii_case("rs"))
1134                {
1135                    return false;
1136                }
1137
1138                self.inner.should_index_file(path, config)
1139            }
1140        }
1141
1142        let temp = tempdir()?;
1143        let workspace = temp.path();
1144        fs::write(workspace.join("lib.rs"), "fn main() {}")?;
1145        fs::write(workspace.join("README.md"), "# Notes")?;
1146
1147        let config = SimpleIndexerConfig::new(workspace.to_path_buf());
1148        let mut indexer =
1149            SimpleIndexer::with_config(config).with_filter(Arc::new(SkipRustFilter::default()));
1150        indexer.init()?;
1151        indexer.index_directory(workspace)?;
1152
1153        assert!(indexer.find_files("lib\\.rs$")?.is_empty());
1154        assert!(!indexer.find_files("README\\.md$")?.is_empty());
1155
1156        Ok(())
1157    }
1158
1159    #[test]
1160    fn custom_filters_can_skip_directories() -> Result<()> {
1161        #[derive(Default)]
1162        struct SkipGeneratedFilter {
1163            inner: ConfigTraversalFilter,
1164        }
1165
1166        impl TraversalFilter for SkipGeneratedFilter {
1167            fn should_descend(&self, path: &Path, config: &SimpleIndexerConfig) -> bool {
1168                if path.ends_with("generated") {
1169                    return false;
1170                }
1171
1172                self.inner.should_descend(path, config)
1173            }
1174
1175            fn should_index_file(&self, path: &Path, config: &SimpleIndexerConfig) -> bool {
1176                self.inner.should_index_file(path, config)
1177            }
1178        }
1179
1180        let temp = tempdir()?;
1181        let workspace = temp.path();
1182        let generated_dir = workspace.join("generated");
1183        fs::create_dir_all(&generated_dir)?;
1184        fs::write(generated_dir.join("skip.txt"), "ignore me")?;
1185        fs::write(workspace.join("README.md"), "# Notes")?;
1186
1187        let config = SimpleIndexerConfig::new(workspace.to_path_buf());
1188        let indexer = SimpleIndexer::with_config(config)
1189            .with_filter(Arc::new(SkipGeneratedFilter::default()));
1190        let files = indexer.discover_files(workspace);
1191
1192        assert!(!files.iter().any(|file| file.ends_with("skip.txt")));
1193        assert!(files.iter().any(|file| file.ends_with("README.md")));
1194
1195        Ok(())
1196    }
1197
1198    #[test]
1199    fn indexing_multiple_directories_preserves_existing_cache_entries() -> Result<()> {
1200        let temp = tempdir()?;
1201        let workspace = temp.path();
1202        let src_dir = workspace.join("src");
1203        let docs_dir = workspace.join("docs");
1204        fs::create_dir_all(&src_dir)?;
1205        fs::create_dir_all(&docs_dir)?;
1206        fs::write(src_dir.join("lib.rs"), "fn main() {}")?;
1207        fs::write(docs_dir.join("guide.md"), "# Guide")?;
1208
1209        let mut indexer = SimpleIndexer::new(workspace.to_path_buf());
1210        indexer.init()?;
1211        indexer.index_directory(&src_dir)?;
1212        indexer.index_directory(&docs_dir)?;
1213
1214        assert!(
1215            indexer
1216                .find_files("lib\\.rs$")?
1217                .iter()
1218                .any(|file| file.ends_with("lib.rs"))
1219        );
1220        assert!(
1221            indexer
1222                .find_files("guide\\.md$")?
1223                .iter()
1224                .any(|file| file.ends_with("guide.md"))
1225        );
1226
1227        let index_content =
1228            fs::read_to_string(workspace.join(".vtcode").join("index").join("index.md"))?;
1229        assert!(index_content.contains(src_dir.join("lib.rs").to_string_lossy().as_ref()));
1230        assert!(index_content.contains(docs_dir.join("guide.md").to_string_lossy().as_ref()));
1231
1232        Ok(())
1233    }
1234
1235    #[test]
1236    fn batch_indexing_writes_single_markdown_file() -> Result<()> {
1237        let temp = tempdir()?;
1238        let workspace = temp.path();
1239        fs::write(workspace.join("lib.rs"), "fn main() {}")?;
1240        fs::write(workspace.join("README.md"), "# Notes")?;
1241
1242        let mut indexer = SimpleIndexer::new(workspace.to_path_buf());
1243        indexer.init()?;
1244        indexer.index_directory(workspace)?;
1245
1246        let index_dir = workspace.join(".vtcode").join("index");
1247        let files = fs::read_dir(&index_dir)?
1248            .filter_map(|entry| entry.ok())
1249            .map(|entry| entry.file_name().to_string_lossy().into_owned())
1250            .collect::<Vec<_>>();
1251        assert_eq!(files, vec!["index.md".to_string()]);
1252
1253        let index_content = fs::read_to_string(index_dir.join("index.md"))?;
1254        assert!(index_content.contains(workspace.join("lib.rs").to_string_lossy().as_ref()));
1255        assert!(index_content.contains(workspace.join("README.md").to_string_lossy().as_ref()));
1256
1257        Ok(())
1258    }
1259
1260    #[test]
1261    fn batch_indexing_removes_legacy_hashed_entries() -> Result<()> {
1262        let temp = tempdir()?;
1263        let workspace = temp.path();
1264        fs::write(workspace.join("lib.rs"), "fn main() {}")?;
1265
1266        let mut indexer = SimpleIndexer::new(workspace.to_path_buf());
1267        indexer.init()?;
1268
1269        let legacy_file_name = format!("{}.md", calculate_hash("legacy-path"));
1270        let legacy_file_path = workspace
1271            .join(".vtcode")
1272            .join("index")
1273            .join(&legacy_file_name);
1274        fs::write(&legacy_file_path, "# legacy")?;
1275        assert!(legacy_file_path.exists());
1276
1277        indexer.index_directory(workspace)?;
1278
1279        assert!(!legacy_file_path.exists());
1280        let files = fs::read_dir(workspace.join(".vtcode").join("index"))?
1281            .filter_map(|entry| entry.ok())
1282            .map(|entry| entry.file_name().to_string_lossy().into_owned())
1283            .collect::<Vec<_>>();
1284        assert_eq!(files, vec!["index.md".to_string()]);
1285
1286        Ok(())
1287    }
1288
1289    #[test]
1290    fn snapshot_storage_uses_default_ref_batch_persistence() -> Result<()> {
1291        #[derive(Clone, Default)]
1292        struct SnapshotMemoryStorage {
1293            snapshots: Arc<Mutex<Vec<Vec<FileIndex>>>>,
1294        }
1295
1296        impl SnapshotMemoryStorage {
1297            fn new(snapshots: Arc<Mutex<Vec<Vec<FileIndex>>>>) -> Self {
1298                Self { snapshots }
1299            }
1300        }
1301
1302        impl IndexStorage for SnapshotMemoryStorage {
1303            fn init(&self, _index_dir: &Path) -> Result<()> {
1304                Ok(())
1305            }
1306
1307            fn persist(&self, _index_dir: &Path, _entry: &FileIndex) -> Result<()> {
1308                Ok(())
1309            }
1310
1311            fn prefers_snapshot_persistence(&self) -> bool {
1312                true
1313            }
1314
1315            fn persist_batch(&self, _index_dir: &Path, entries: &[FileIndex]) -> Result<()> {
1316                self.snapshots
1317                    .lock()
1318                    .expect("lock poisoned")
1319                    .push(entries.to_vec());
1320                Ok(())
1321            }
1322        }
1323
1324        let temp = tempdir()?;
1325        let workspace = temp.path();
1326        let file_path = workspace.join("notes.txt");
1327        fs::write(&file_path, "remember this")?;
1328
1329        let snapshots = Arc::new(Mutex::new(Vec::new()));
1330        let storage = SnapshotMemoryStorage::new(snapshots.clone());
1331
1332        let config = SimpleIndexerConfig::new(workspace.to_path_buf());
1333        let mut indexer = SimpleIndexer::with_config(config).with_storage(Arc::new(storage));
1334        indexer.index_file(&file_path)?;
1335
1336        let snapshots = snapshots.lock().expect("lock poisoned");
1337        assert_eq!(snapshots.len(), 1);
1338        assert_eq!(snapshots[0].len(), 1);
1339        assert_eq!(
1340            snapshots[0][0].path,
1341            workspace.join("notes.txt").to_string_lossy().into_owned()
1342        );
1343
1344        Ok(())
1345    }
1346
1347    #[test]
1348    fn snapshot_index_file_rolls_back_cache_when_persist_fails() -> Result<()> {
1349        #[derive(Clone, Default)]
1350        struct FlakySnapshotStorage {
1351            persist_count: Arc<Mutex<usize>>,
1352        }
1353
1354        impl IndexStorage for FlakySnapshotStorage {
1355            fn init(&self, _index_dir: &Path) -> Result<()> {
1356                Ok(())
1357            }
1358
1359            fn persist(&self, _index_dir: &Path, _entry: &FileIndex) -> Result<()> {
1360                Ok(())
1361            }
1362
1363            fn prefers_snapshot_persistence(&self) -> bool {
1364                true
1365            }
1366
1367            fn persist_batch(&self, _index_dir: &Path, _entries: &[FileIndex]) -> Result<()> {
1368                let mut count = self.persist_count.lock().expect("lock poisoned");
1369                *count += 1;
1370                if *count == 2 {
1371                    anyhow::bail!("simulated snapshot persistence failure");
1372                }
1373                Ok(())
1374            }
1375        }
1376
1377        let temp = tempdir()?;
1378        let workspace = temp.path();
1379        let first = workspace.join("first.txt");
1380        let second = workspace.join("second.txt");
1381        fs::write(&first, "one")?;
1382        fs::write(&second, "two")?;
1383
1384        let config = SimpleIndexerConfig::new(workspace.to_path_buf());
1385        let storage = Arc::new(FlakySnapshotStorage::default());
1386        let mut indexer = SimpleIndexer::with_config(config).with_storage(storage);
1387
1388        indexer.index_file(&first)?;
1389        assert!(
1390            indexer
1391                .find_files("first\\.txt$")?
1392                .iter()
1393                .any(|path| path.ends_with("first.txt"))
1394        );
1395
1396        let err = indexer
1397            .index_file(&second)
1398            .expect_err("second persist should fail");
1399        assert!(
1400            err.to_string()
1401                .contains("simulated snapshot persistence failure")
1402        );
1403        assert!(
1404            indexer
1405                .find_files("first\\.txt$")?
1406                .iter()
1407                .any(|path| path.ends_with("first.txt"))
1408        );
1409        assert!(indexer.find_files("second\\.txt$")?.is_empty());
1410
1411        Ok(())
1412    }
1413}