// vtcode_indexer — lib.rs
1//! Workspace-friendly file indexer extracted from VT Code.
2//!
3//! `vtcode-indexer` offers a lightweight alternative to heavyweight
4//! search/indexing stacks. It recursively walks a workspace, computes
5//! hashes, and stores per-file metadata in Markdown-friendly summaries
6//! so changes remain easy to audit in git.
7
8use anyhow::Result;
9use hashbrown::HashMap;
10use ignore::{DirEntry, Walk, WalkBuilder};
11use regex::Regex;
12use serde::{Deserialize, Serialize};
13use std::fmt::Write as FmtWrite;
14use std::fs;
15use std::io::{BufWriter, ErrorKind, Write};
16use std::path::{Path, PathBuf};
17use std::sync::Arc;
18use std::time::SystemTime;
19
20/// Persistence backend for [`SimpleIndexer`].
21pub trait IndexStorage: Send + Sync {
22    /// Prepare any directories or resources required for persistence.
23    fn init(&self, index_dir: &Path) -> Result<()>;
24
25    /// Persist an indexed file entry.
26    fn persist(&self, index_dir: &Path, entry: &FileIndex) -> Result<()>;
27
28    /// Whether this backend expects full-snapshot persistence.
29    ///
30    /// Snapshot-aware backends receive the complete in-memory index on each
31    /// update so on-disk state stays consistent across single-file and
32    /// directory indexing flows.
33    fn prefers_snapshot_persistence(&self) -> bool {
34        false
35    }
36
37    /// Remove a previously persisted file entry.
38    ///
39    /// Defaults to a no-op to keep existing custom storage backends compatible.
40    fn remove(&self, _index_dir: &Path, _file_path: &Path) -> Result<()> {
41        Ok(())
42    }
43
44    /// Persist a batch of indexed file entries.
45    ///
46    /// Defaults to calling [`IndexStorage::persist`] for each entry, keeping
47    /// existing custom storage backends compatible.
48    fn persist_batch(&self, index_dir: &Path, entries: &[FileIndex]) -> Result<()> {
49        for entry in entries {
50            self.persist(index_dir, entry)?;
51        }
52        Ok(())
53    }
54}
55
/// Directory traversal filter hook for [`SimpleIndexer`].
///
/// Implementations decide which directories the indexer descends into and
/// which files it processes, given the active [`SimpleIndexerConfig`].
pub trait TraversalFilter: Send + Sync {
    /// Determine if the indexer should descend into the provided directory.
    fn should_descend(&self, path: &Path, config: &SimpleIndexerConfig) -> bool;

    /// Determine if the indexer should process the provided file.
    fn should_index_file(&self, path: &Path, config: &SimpleIndexerConfig) -> bool;
}
64
/// Markdown-backed [`IndexStorage`] implementation.
///
/// Prefers snapshot persistence: batches are written atomically to a single
/// `index.md` (via a temp file + rename), while single-entry `persist` writes
/// a per-file `<sha256>.md` document.
#[derive(Debug, Default, Clone)]
pub struct MarkdownIndexStorage;
68
69impl IndexStorage for MarkdownIndexStorage {
70    fn init(&self, index_dir: &Path) -> Result<()> {
71        fs::create_dir_all(index_dir)?;
72        Ok(())
73    }
74
75    fn persist(&self, index_dir: &Path, entry: &FileIndex) -> Result<()> {
76        fs::create_dir_all(index_dir)?;
77        let file_name = format!("{}.md", calculate_hash(&entry.path));
78        let index_path = index_dir.join(file_name);
79        let file = fs::File::create(index_path)?;
80        let mut writer = BufWriter::new(file);
81        writeln!(writer, "# File Index: {}", entry.path)?;
82        writeln!(writer)?;
83        write_markdown_fields(&mut writer, entry)?;
84        writer.flush()?;
85        Ok(())
86    }
87
88    fn prefers_snapshot_persistence(&self) -> bool {
89        true
90    }
91
92    fn remove(&self, index_dir: &Path, file_path: &Path) -> Result<()> {
93        let file_name = format!(
94            "{}.md",
95            calculate_hash(file_path.to_string_lossy().as_ref())
96        );
97        let index_path = index_dir.join(file_name);
98        match fs::remove_file(index_path) {
99            Ok(()) => Ok(()),
100            Err(err) if err.kind() == ErrorKind::NotFound => Ok(()),
101            Err(err) => Err(err.into()),
102        }
103    }
104
105    fn persist_batch(&self, index_dir: &Path, entries: &[FileIndex]) -> Result<()> {
106        fs::create_dir_all(index_dir)?;
107        let temp_path = index_dir.join(".index.md.tmp");
108        let final_path = index_dir.join("index.md");
109        let file = fs::File::create(&temp_path)?;
110        let mut writer = BufWriter::new(file);
111
112        writeln!(writer, "# Workspace File Index")?;
113        writeln!(writer)?;
114        writeln!(writer, "- **Entries**: {}", entries.len())?;
115        writeln!(writer)?;
116
117        for entry in entries {
118            write_markdown_entry(&mut writer, entry)?;
119        }
120
121        writer.flush()?;
122        fs::rename(temp_path, final_path)?;
123        cleanup_legacy_markdown_entries(index_dir)?;
124        Ok(())
125    }
126}
127
/// Default traversal filter powered by [`SimpleIndexerConfig`].
///
/// Skips excluded/hidden directories (honoring `allowed_dirs` overrides) and
/// unconditionally refuses to index known sensitive files such as `.env*`.
#[derive(Debug, Default, Clone)]
pub struct ConfigTraversalFilter;
131
132impl TraversalFilter for ConfigTraversalFilter {
133    fn should_descend(&self, path: &Path, config: &SimpleIndexerConfig) -> bool {
134        !should_skip_dir(path, config)
135    }
136
137    fn should_index_file(&self, path: &Path, config: &SimpleIndexerConfig) -> bool {
138        if !path.is_file() {
139            return false;
140        }
141
142        // Skip hidden files when configured.
143        if config.ignore_hidden
144            && path
145                .file_name()
146                .and_then(|n| n.to_str())
147                .is_some_and(|s| s.starts_with('.'))
148        {
149            return false;
150        }
151
152        // Always skip known sensitive files regardless of config.
153        if let Some(file_name) = path.file_name().and_then(|n| n.to_str()) {
154            let is_sensitive = matches!(
155                file_name,
156                ".env"
157                    | ".env.local"
158                    | ".env.production"
159                    | ".env.development"
160                    | ".env.test"
161                    | ".git"
162                    | ".gitignore"
163                    | ".DS_Store"
164            ) || file_name.starts_with(".env.");
165            if is_sensitive {
166                return false;
167            }
168        }
169
170        true
171    }
172}
173
/// Configuration for [`SimpleIndexer`].
#[derive(Clone, Debug)]
pub struct SimpleIndexerConfig {
    // Root of the workspace being indexed.
    workspace_root: PathBuf,
    // Directory where persisted index metadata lives.
    index_dir: PathBuf,
    // Whether dot-prefixed files/directories are skipped.
    ignore_hidden: bool,
    // Directories never descended into (unless overridden by `allowed_dirs`).
    excluded_dirs: Vec<PathBuf>,
    // Directories indexed even if hidden or inside an excluded parent.
    allowed_dirs: Vec<PathBuf>,
}

impl SimpleIndexerConfig {
    /// Builds a configuration using VT Code's legacy layout as defaults.
    ///
    /// Excludes `.vtcode/`, the index dir, `target/` and `node_modules/`;
    /// allows `.vtcode/external/` so plugin content is still indexed.
    pub fn new(workspace_root: PathBuf) -> Self {
        let vtcode_dir = workspace_root.join(".vtcode");
        let index_dir = vtcode_dir.join("index");
        let external_dir = vtcode_dir.join("external");

        // These four paths are distinct by construction, so no deduplication
        // is needed here. (A previous `Vec::dedup()` call was a no-op anyway:
        // it only removes *adjacent* duplicates.) Later additions go through
        // `push_unique_excluded`, which enforces uniqueness properly.
        let excluded_dirs = vec![
            index_dir.clone(),
            vtcode_dir,
            workspace_root.join("target"),
            workspace_root.join("node_modules"),
        ];

        Self {
            workspace_root,
            index_dir,
            ignore_hidden: true,
            excluded_dirs,
            allowed_dirs: vec![external_dir],
        }
    }

    /// Updates the index directory used for persisted metadata.
    ///
    /// The new directory is also excluded from traversal so the indexer never
    /// indexes its own output.
    pub fn with_index_dir(mut self, index_dir: impl Into<PathBuf>) -> Self {
        let index_dir = index_dir.into();
        self.index_dir = index_dir.clone();
        self.push_unique_excluded(index_dir);
        self
    }

    /// Adds an allowed directory that should be indexed even if hidden or inside an excluded parent.
    pub fn add_allowed_dir(mut self, path: impl Into<PathBuf>) -> Self {
        let path = path.into();
        if !self.allowed_dirs.iter().any(|existing| existing == &path) {
            self.allowed_dirs.push(path);
        }
        self
    }

    /// Adds an additional excluded directory to skip during traversal.
    pub fn add_excluded_dir(mut self, path: impl Into<PathBuf>) -> Self {
        let path = path.into();
        self.push_unique_excluded(path);
        self
    }

    /// Toggles whether hidden directories (prefix `.`) are ignored.
    pub fn ignore_hidden(mut self, ignore_hidden: bool) -> Self {
        self.ignore_hidden = ignore_hidden;
        self
    }

    /// Workspace root accessor.
    pub fn workspace_root(&self) -> &Path {
        &self.workspace_root
    }

    /// Index directory accessor.
    pub fn index_dir(&self) -> &Path {
        &self.index_dir
    }

    // Appends `path` to `excluded_dirs` only if not already present.
    fn push_unique_excluded(&mut self, path: PathBuf) {
        if !self.excluded_dirs.iter().any(|existing| existing == &path) {
            self.excluded_dirs.push(path);
        }
    }
}
255
/// Simple file index entry.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileIndex {
    /// File path (lossy UTF-8 rendering of the on-disk path).
    pub path: String,
    /// File content hash for change detection (SHA-256 of the UTF-8 text).
    pub hash: String,
    /// Last modified timestamp, in seconds since the Unix epoch.
    pub modified: u64,
    /// File size in bytes (length of the file's UTF-8 content).
    pub size: u64,
    /// Language/extension (file extension, or `"unknown"` when absent).
    pub language: String,
    /// Simple tags. Currently always empty when built by the indexer.
    pub tags: Vec<String>,
}
272
/// Simple search result.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchResult {
    /// Path of the file containing the match.
    pub file_path: String,
    /// 1-based line number of the matching line.
    pub line_number: usize,
    /// Full text of the matching line.
    pub line_content: String,
    /// Individual regex matches (for `search`) or the whole line (for `grep`).
    pub matches: Vec<String>,
}
281
/// Simple file indexer.
pub struct SimpleIndexer {
    // Traversal/layout configuration.
    config: SimpleIndexerConfig,
    // In-memory index, keyed by file path string.
    index_cache: HashMap<String, FileIndex>,
    // Pluggable persistence backend.
    storage: Arc<dyn IndexStorage>,
    // Pluggable directory/file traversal filter.
    filter: Arc<dyn TraversalFilter>,
}
289
290impl SimpleIndexer {
291    /// Create a new simple indexer with default VT Code paths.
292    pub fn new(workspace_root: PathBuf) -> Self {
293        Self::with_components(
294            SimpleIndexerConfig::new(workspace_root),
295            Arc::new(MarkdownIndexStorage),
296            Arc::new(ConfigTraversalFilter),
297        )
298    }
299
300    /// Create a simple indexer with the provided configuration.
301    pub fn with_config(config: SimpleIndexerConfig) -> Self {
302        Self::with_components(
303            config,
304            Arc::new(MarkdownIndexStorage),
305            Arc::new(ConfigTraversalFilter),
306        )
307    }
308
309    /// Create a new simple indexer using a custom index directory.
310    pub fn with_index_dir(workspace_root: PathBuf, index_dir: PathBuf) -> Self {
311        let config = SimpleIndexerConfig::new(workspace_root).with_index_dir(index_dir);
312        Self::with_config(config)
313    }
314
315    /// Create an indexer with explicit storage and traversal filter implementations.
316    pub fn with_components(
317        config: SimpleIndexerConfig,
318        storage: Arc<dyn IndexStorage>,
319        filter: Arc<dyn TraversalFilter>,
320    ) -> Self {
321        Self {
322            config,
323            index_cache: HashMap::new(),
324            storage,
325            filter,
326        }
327    }
328
329    /// Replace the storage backend used to persist index entries.
330    pub fn with_storage(self, storage: Arc<dyn IndexStorage>) -> Self {
331        Self { storage, ..self }
332    }
333
334    /// Replace the traversal filter used to decide which files and directories are indexed.
335    pub fn with_filter(self, filter: Arc<dyn TraversalFilter>) -> Self {
336        Self { filter, ..self }
337    }
338
    /// Initialize the index directory.
    ///
    /// Delegates to the configured storage backend so it can prepare whatever
    /// directories or resources it needs.
    pub fn init(&self) -> Result<()> {
        self.storage.init(self.config.index_dir())
    }

    /// Get the workspace root path.
    pub fn workspace_root(&self) -> &Path {
        self.config.workspace_root()
    }

    /// Get the index directory used for persisted metadata.
    pub fn index_dir(&self) -> &Path {
        self.config.index_dir()
    }
353
    /// Index a single file.
    ///
    /// Two persistence flows:
    /// - Snapshot backends (`prefers_snapshot_persistence`) get the complete,
    ///   path-sorted index re-persisted on every call.
    /// - Per-entry backends get a targeted `persist`/`remove`.
    ///
    /// A file that is missing, filtered out, or unreadable (e.g. binary) has
    /// its stale cache entry and persisted entry removed.
    pub fn index_file(&mut self, file_path: &Path) -> Result<()> {
        let cache_key = file_path.to_string_lossy().into_owned();

        if self.storage.prefers_snapshot_persistence() {
            // Work on a copy so the in-memory cache is only swapped in after
            // the snapshot persists successfully.
            let mut next_cache = self.index_cache.clone();

            if file_path.exists() && self.should_process_file_path(file_path) {
                if let Some(index) = self.build_file_index(file_path)? {
                    next_cache.insert(index.path.clone(), index);
                } else {
                    // build_file_index returned None (e.g. non-UTF-8 content):
                    // drop any stale entry for this path.
                    next_cache.remove(cache_key.as_str());
                }
            } else {
                next_cache.remove(cache_key.as_str());
            }

            // Sort by path so the persisted snapshot is deterministic.
            let mut snapshot = next_cache.values().cloned().collect::<Vec<_>>();
            snapshot.sort_unstable_by(|left, right| left.path.cmp(&right.path));
            self.storage
                .persist_batch(self.config.index_dir(), &snapshot)?;
            self.index_cache = next_cache;
            return Ok(());
        }

        if !file_path.exists() || !self.should_process_file_path(file_path) {
            self.index_cache.remove(cache_key.as_str());
            self.storage.remove(self.config.index_dir(), file_path)?;
            return Ok(());
        }

        if let Some(index) = self.build_file_index(file_path)? {
            self.storage.persist(self.config.index_dir(), &index)?;
            self.index_cache.insert(index.path.clone(), index);
        } else {
            self.index_cache.remove(cache_key.as_str());
            self.storage.remove(self.config.index_dir(), file_path)?;
        }

        Ok(())
    }
395
    /// Index all files in directory recursively.
    /// Respects .gitignore, .ignore, and other ignore files.
    /// SECURITY: Always skips hidden files and sensitive data (.env, .git, etc.)
    pub fn index_directory(&mut self, dir_path: &Path) -> Result<()> {
        let walker = self.build_walker(dir_path);

        let mut entries = Vec::new();

        // Walk errors are silently skipped; build_file_index applies the
        // per-file filter and returns None for unreadable content.
        for entry in walker.filter_map(|e| e.ok()) {
            let path = entry.path();

            // Only index files, not directories
            if entry.file_type().is_some_and(|ft| ft.is_file())
                && let Some(index) = self.build_file_index(path)?
            {
                entries.push(index);
            }
        }

        if self.storage.prefers_snapshot_persistence() {
            // Snapshot backends need the full index: keep cached entries from
            // outside `dir_path` and merge in the freshly indexed ones.
            let mut persisted_entries = self
                .index_cache
                .iter()
                .filter(|(path, _)| !Path::new(path).starts_with(dir_path))
                .map(|(_, entry)| entry.clone())
                .collect::<Vec<_>>();
            persisted_entries.extend(entries.iter().cloned());
            // Sort by path so the persisted snapshot is deterministic.
            persisted_entries.sort_unstable_by(|left, right| left.path.cmp(&right.path));
            self.storage
                .persist_batch(self.config.index_dir(), &persisted_entries)?;
        } else {
            entries.sort_unstable_by(|left, right| left.path.cmp(&right.path));
            self.storage
                .persist_batch(self.config.index_dir(), &entries)?;
        }

        // Drop cached entries under `dir_path` so deleted files are pruned,
        // then insert the fresh ones.
        self.replace_cached_entries(dir_path, &entries);

        Ok(())
    }
436
437    /// Discover all files in directory recursively without indexing them.
438    /// This is much faster than `index_directory` as it avoids hashing and persistence.
439    pub fn discover_files(&self, dir_path: &Path) -> Vec<String> {
440        let walker = self.build_walker(dir_path);
441
442        let mut files = walker
443            .filter_map(|e| e.ok())
444            .filter(|e| {
445                if !e.file_type().is_some_and(|ft| ft.is_file()) {
446                    return false;
447                }
448
449                self.should_process_file_path(e.path())
450            })
451            .map(|e| e.path().to_string_lossy().into_owned())
452            .collect::<Vec<_>>();
453        files.sort_unstable();
454        files
455    }
456
457    /// Internal helper for regex-based file content search.
458    /// Used by both `search()` and `grep()` to avoid code duplication.
459    fn search_files_internal(
460        &self,
461        regex: &Regex,
462        path_filter: Option<&str>,
463        extract_matches: bool,
464    ) -> Vec<SearchResult> {
465        let mut results = Vec::new();
466
467        for file_path in self.index_cache.keys() {
468            if path_filter.is_some_and(|filter| !file_path.contains(filter)) {
469                continue;
470            }
471
472            if let Ok(content) = fs::read_to_string(file_path) {
473                for (line_num, line) in content.lines().enumerate() {
474                    if regex.is_match(line) {
475                        let matches = if extract_matches {
476                            regex
477                                .find_iter(line)
478                                .map(|m| m.as_str().to_string())
479                                .collect()
480                        } else {
481                            vec![line.to_string()]
482                        };
483
484                        results.push(SearchResult {
485                            file_path: file_path.clone(),
486                            line_number: line_num + 1,
487                            line_content: line.to_string(),
488                            matches,
489                        });
490                    }
491                }
492            }
493        }
494
495        results.sort_unstable_by(|left, right| {
496            left.file_path
497                .cmp(&right.file_path)
498                .then_with(|| left.line_number.cmp(&right.line_number))
499        });
500        results
501    }
502
503    /// Search files using regex pattern.
504    pub fn search(&self, pattern: &str, path_filter: Option<&str>) -> Result<Vec<SearchResult>> {
505        let regex = Regex::new(pattern)?;
506        Ok(self.search_files_internal(&regex, path_filter, true))
507    }
508
509    /// Find files by name pattern.
510    pub fn find_files(&self, pattern: &str) -> Result<Vec<String>> {
511        let regex = Regex::new(pattern)?;
512        let mut results = Vec::new();
513
514        for file_path in self.index_cache.keys() {
515            if regex.is_match(file_path) {
516                results.push(file_path.clone());
517            }
518        }
519
520        results.sort_unstable();
521        Ok(results)
522    }
523
524    /// Get all indexed files without pattern matching.
525    /// This is more efficient than using find_files(".*").
526    pub fn all_files(&self) -> Vec<String> {
527        let mut files = self.index_cache.keys().cloned().collect::<Vec<_>>();
528        files.sort_unstable();
529        files
530    }
531
532    /// Get file content with line numbers.
533    pub fn get_file_content(
534        &self,
535        file_path: &str,
536        start_line: Option<usize>,
537        end_line: Option<usize>,
538    ) -> Result<String> {
539        let content = fs::read_to_string(file_path)?;
540        let start = start_line.unwrap_or(1).max(1);
541        let end = end_line.unwrap_or(usize::MAX);
542
543        if start > end {
544            return Ok(String::new());
545        }
546
547        let mut result = String::new();
548        for (line_number, line) in content.lines().enumerate() {
549            let line_number = line_number + 1;
550            if line_number < start {
551                continue;
552            }
553            if line_number > end {
554                break;
555            }
556            writeln!(&mut result, "{line_number}: {line}")?;
557        }
558
559        Ok(result)
560    }
561
562    /// List files in directory (like ls).
563    pub fn list_files(&self, dir_path: &str, show_hidden: bool) -> Result<Vec<String>> {
564        let path = Path::new(dir_path);
565        if !path.exists() {
566            return Ok(vec![]);
567        }
568
569        let mut files = Vec::new();
570
571        for entry in fs::read_dir(path)? {
572            let entry = entry?;
573            let file_name = entry.file_name().to_string_lossy().into_owned();
574
575            if !show_hidden && file_name.starts_with('.') {
576                continue;
577            }
578
579            files.push(file_name);
580        }
581
582        files.sort_unstable();
583        Ok(files)
584    }
585
586    /// Grep-like search (like grep command).
587    pub fn grep(&self, pattern: &str, file_pattern: Option<&str>) -> Result<Vec<SearchResult>> {
588        let regex = Regex::new(pattern)?;
589        Ok(self.search_files_internal(&regex, file_pattern, false))
590    }
591
592    #[allow(dead_code)]
593    fn walk_directory<F>(&mut self, dir_path: &Path, callback: &mut F) -> Result<()>
594    where
595        F: FnMut(&Path) -> Result<()>,
596    {
597        if !dir_path.exists() {
598            return Ok(());
599        }
600
601        self.walk_directory_internal(dir_path, callback)
602    }
603
    /// Recursive body of `walk_directory`.
    ///
    /// Directory handling, in priority order:
    /// 1. allowed directories are always descended into;
    /// 2. directories the filter rejects are not descended into directly,
    ///    but allowed directories nested beneath them are still walked;
    /// 3. everything else recurses normally.
    /// Each regular file is handed to `callback`.
    #[allow(dead_code)]
    fn walk_directory_internal<F>(&mut self, dir_path: &Path, callback: &mut F) -> Result<()>
    where
        F: FnMut(&Path) -> Result<()>,
    {
        for entry in fs::read_dir(dir_path)? {
            let entry = entry?;
            let path = entry.path();

            if path.is_dir() {
                if self.is_allowed_path(&path) {
                    self.walk_directory_internal(&path, callback)?;
                    continue;
                }

                if !self.filter.should_descend(&path, &self.config) {
                    // Rejected directory: still walk any allowed dirs inside it.
                    self.walk_allowed_descendants(&path, callback)?;
                    continue;
                }

                self.walk_directory_internal(&path, callback)?;
            } else if path.is_file() {
                callback(&path)?;
            }
        }

        Ok(())
    }
632
633    #[allow(dead_code)]
634    fn is_allowed_path(&self, path: &Path) -> bool {
635        self.config
636            .allowed_dirs
637            .iter()
638            .any(|allowed| path.starts_with(allowed))
639    }
640
641    #[allow(dead_code)]
642    fn walk_allowed_descendants<F>(&mut self, dir_path: &Path, callback: &mut F) -> Result<()>
643    where
644        F: FnMut(&Path) -> Result<()>,
645    {
646        let allowed_dirs = self.config.allowed_dirs.clone();
647        for allowed in allowed_dirs {
648            if allowed.starts_with(dir_path) && allowed.exists() {
649                self.walk_directory_internal(&allowed, callback)?;
650            }
651        }
652        Ok(())
653    }
654
655    #[inline]
656    fn get_modified_time(&self, file_path: &Path) -> Result<u64> {
657        let metadata = fs::metadata(file_path)?;
658        let modified = metadata.modified()?;
659        Ok(modified.duration_since(SystemTime::UNIX_EPOCH)?.as_secs())
660    }
661
662    #[inline]
663    fn detect_language(&self, file_path: &Path) -> String {
664        file_path
665            .extension()
666            .and_then(|ext| ext.to_str())
667            .unwrap_or("unknown")
668            .to_string()
669    }
670
671    fn build_file_index(&self, file_path: &Path) -> Result<Option<FileIndex>> {
672        if !self.should_process_file_path(file_path) {
673            return Ok(None);
674        }
675
676        let content = match fs::read_to_string(file_path) {
677            Ok(text) => text,
678            Err(err) => {
679                if err.kind() == ErrorKind::InvalidData {
680                    return Ok(None);
681                }
682                return Err(err.into());
683            }
684        };
685
686        let index = FileIndex {
687            path: file_path.to_string_lossy().into_owned(),
688            hash: calculate_hash(&content),
689            modified: self.get_modified_time(file_path)?,
690            size: content.len() as u64,
691            language: self.detect_language(file_path),
692            tags: vec![],
693        };
694
695        Ok(Some(index))
696    }
697
698    #[inline]
699    fn is_excluded_path(&self, path: &Path) -> bool {
700        self.config
701            .excluded_dirs
702            .iter()
703            .any(|excluded| path.starts_with(excluded))
704    }
705
706    #[inline]
707    fn should_index_file_path(&self, path: &Path) -> bool {
708        self.filter.should_index_file(path, &self.config)
709    }
710
711    #[inline]
712    fn should_process_file_path(&self, path: &Path) -> bool {
713        if self.is_allowed_path(path) {
714            return self.should_index_file_path(path);
715        }
716
717        !self.is_excluded_path(path) && self.should_index_file_path(path)
718    }
719
    /// Constructs an `ignore::Walk` rooted at `dir_path`.
    ///
    /// Honors `.gitignore`/`.ignore` files (including in parent directories).
    /// Hidden entries are NOT skipped by the walker itself (`hidden(false)`);
    /// hidden/excluded handling is delegated to the traversal filter via
    /// `filter_entry`, so `allowed_dirs` can override it.
    fn build_walker(&self, dir_path: &Path) -> Walk {
        // `filter_entry` requires a 'static closure, so capture owned clones
        // of the root, config, and filter instead of borrowing `self`.
        let walk_root = dir_path.to_path_buf();
        let config = self.config.clone();
        let filter = Arc::clone(&self.filter);

        let mut builder = WalkBuilder::new(dir_path);
        builder
            .hidden(false)
            .git_ignore(true)
            .git_global(true)
            .git_exclude(true)
            .ignore(true)
            .parents(true);
        builder.filter_entry(move |entry| {
            should_visit_entry(entry, walk_root.as_path(), &config, filter.as_ref())
        });
        builder.build()
    }
738
739    fn replace_cached_entries(&mut self, dir_path: &Path, entries: &[FileIndex]) {
740        self.index_cache
741            .retain(|path, _| !Path::new(path).starts_with(dir_path));
742
743        self.index_cache.extend(
744            entries
745                .iter()
746                .cloned()
747                .map(|entry| (entry.path.clone(), entry)),
748        );
749    }
750}
751
752impl Clone for SimpleIndexer {
753    fn clone(&self) -> Self {
754        Self {
755            config: self.config.clone(),
756            index_cache: self.index_cache.clone(),
757            storage: self.storage.clone(),
758            filter: self.filter.clone(),
759        }
760    }
761}
762
763fn should_skip_dir(path: &Path, config: &SimpleIndexerConfig) -> bool {
764    if is_allowed_path_or_ancestor(path, config) {
765        return false;
766    }
767
768    if config
769        .excluded_dirs
770        .iter()
771        .any(|excluded| path.starts_with(excluded))
772    {
773        return true;
774    }
775
776    if config.ignore_hidden
777        && path
778            .file_name()
779            .and_then(|name| name.to_str())
780            .is_some_and(|name_str| name_str.starts_with('.'))
781    {
782        return true;
783    }
784
785    false
786}
787
788fn is_allowed_path_or_ancestor(path: &Path, config: &SimpleIndexerConfig) -> bool {
789    config
790        .allowed_dirs
791        .iter()
792        .any(|allowed| path.starts_with(allowed) || allowed.starts_with(path))
793}
794
795fn should_visit_entry(
796    entry: &DirEntry,
797    walk_root: &Path,
798    config: &SimpleIndexerConfig,
799    filter: &dyn TraversalFilter,
800) -> bool {
801    if entry.path() == walk_root {
802        return true;
803    }
804
805    if !entry
806        .file_type()
807        .is_some_and(|file_type| file_type.is_dir())
808    {
809        return true;
810    }
811
812    filter.should_descend(entry.path(), config)
813}
814
/// SHA-256 hex digest of `content`, delegated to the shared commons helper.
#[inline]
fn calculate_hash(content: &str) -> String {
    vtcode_commons::utils::calculate_sha256(content.as_bytes())
}
819
820fn write_markdown_entry(writer: &mut impl Write, entry: &FileIndex) -> std::io::Result<()> {
821    writeln!(writer, "## {}", entry.path)?;
822    writeln!(writer)?;
823    write_markdown_fields(writer, entry)?;
824    writeln!(writer)?;
825    Ok(())
826}
827
828fn write_markdown_fields(writer: &mut impl Write, entry: &FileIndex) -> std::io::Result<()> {
829    writeln!(writer, "- **Path**: {}", entry.path)?;
830    writeln!(writer, "- **Hash**: {}", entry.hash)?;
831    writeln!(writer, "- **Modified**: {}", entry.modified)?;
832    writeln!(writer, "- **Size**: {} bytes", entry.size)?;
833    writeln!(writer, "- **Language**: {}", entry.language)?;
834    writeln!(writer, "- **Tags**: {}", entry.tags.join(", "))?;
835    Ok(())
836}
837
838fn cleanup_legacy_markdown_entries(index_dir: &Path) -> Result<()> {
839    for entry in fs::read_dir(index_dir)? {
840        let entry = entry?;
841        let file_name = entry.file_name();
842        let file_name = file_name.to_string_lossy();
843        if is_legacy_markdown_entry_name(file_name.as_ref()) {
844            fs::remove_file(entry.path())?;
845        }
846    }
847    Ok(())
848}
849
/// Recognizes legacy per-file index documents: a 64-character hex digest
/// (SHA-256) followed by the `.md` suffix.
#[inline]
fn is_legacy_markdown_entry_name(file_name: &str) -> bool {
    match file_name.strip_suffix(".md") {
        Some(stem) => stem.len() == 64 && stem.chars().all(|c| c.is_ascii_hexdigit()),
        None => false,
    }
}
857
858#[cfg(test)]
859mod tests {
860    use super::*;
861    use std::fs;
862    use std::sync::{Arc, Mutex};
863    use tempfile::tempdir;
864
865    #[test]
866    fn skips_hidden_directories_by_default() -> Result<()> {
867        let temp = tempdir()?;
868        let workspace = temp.path();
869        let hidden_dir = workspace.join(".private");
870        fs::create_dir_all(&hidden_dir)?;
871        fs::write(hidden_dir.join("secret.txt"), "classified")?;
872
873        let visible_dir = workspace.join("src");
874        fs::create_dir_all(&visible_dir)?;
875        fs::write(visible_dir.join("lib.rs"), "fn main() {}")?;
876
877        let mut indexer = SimpleIndexer::new(workspace.to_path_buf());
878        indexer.init()?;
879        indexer.index_directory(workspace)?;
880
881        assert!(indexer.find_files("secret\\.txt$")?.is_empty());
882        assert!(!indexer.find_files("lib\\.rs$")?.is_empty());
883
884        Ok(())
885    }
886
887    #[test]
888    fn can_include_hidden_directories_when_configured() -> Result<()> {
889        let temp = tempdir()?;
890        let workspace = temp.path();
891        let hidden_dir = workspace.join(".cache");
892        fs::create_dir_all(&hidden_dir)?;
893        fs::write(hidden_dir.join("data.log"), "details")?;
894
895        let config = SimpleIndexerConfig::new(workspace.to_path_buf()).ignore_hidden(false);
896        let mut indexer = SimpleIndexer::with_config(config);
897        indexer.init()?;
898        indexer.index_directory(workspace)?;
899
900        let results = indexer.find_files("data\\.log$")?;
901        assert_eq!(results.len(), 1);
902
903        Ok(())
904    }
905
906    #[test]
907    fn indexes_allowed_directories_inside_hidden_excluded_parents() -> Result<()> {
908        let temp = tempdir()?;
909        let workspace = temp.path();
910        let allowed_dir = workspace.join(".vtcode").join("external");
911        fs::create_dir_all(&allowed_dir)?;
912        fs::write(allowed_dir.join("plugin.toml"), "name = 'demo'")?;
913
914        let mut indexer = SimpleIndexer::new(workspace.to_path_buf());
915        indexer.init()?;
916        indexer.index_directory(workspace)?;
917
918        let results = indexer.find_files("plugin\\.toml$")?;
919        assert_eq!(results.len(), 1);
920
921        Ok(())
922    }
923
924    #[test]
925    fn reindexing_prunes_deleted_files_from_cache() -> Result<()> {
926        let temp = tempdir()?;
927        let workspace = temp.path();
928        let file_path = workspace.join("notes.txt");
929        fs::write(&file_path, "remember this")?;
930
931        let mut indexer = SimpleIndexer::new(workspace.to_path_buf());
932        indexer.init()?;
933        indexer.index_directory(workspace)?;
934        assert_eq!(indexer.find_files("notes\\.txt$")?.len(), 1);
935
936        fs::remove_file(&file_path)?;
937        indexer.index_directory(workspace)?;
938
939        assert!(indexer.find_files("notes\\.txt$")?.is_empty());
940        assert!(indexer.all_files().is_empty());
941
942        Ok(())
943    }
944
945    #[test]
946    fn index_file_skips_excluded_paths() -> Result<()> {
947        let temp = tempdir()?;
948        let workspace = temp.path();
949        let index_dir = workspace.join(".vtcode").join("index");
950        fs::create_dir_all(&index_dir)?;
951        let generated_index = index_dir.join("index.md");
952        fs::write(&generated_index, "# generated")?;
953
954        let mut indexer = SimpleIndexer::new(workspace.to_path_buf());
955        indexer.init()?;
956        indexer.index_file(&generated_index)?;
957
958        assert!(indexer.all_files().is_empty());
959
960        Ok(())
961    }
962
963    #[test]
964    fn index_file_removes_stale_entry_when_file_becomes_unreadable() -> Result<()> {
965        let temp = tempdir()?;
966        let workspace = temp.path();
967        let file_path = workspace.join("notes.txt");
968        fs::write(&file_path, "remember this")?;
969
970        let mut indexer = SimpleIndexer::new(workspace.to_path_buf());
971        indexer.init()?;
972        indexer.index_file(&file_path)?;
973        assert!(
974            indexer
975                .find_files("notes\\.txt$")?
976                .iter()
977                .any(|file| file.ends_with("notes.txt"))
978        );
979
980        fs::write(&file_path, [0xFF, 0xFE, 0xFD])?;
981        indexer.index_file(&file_path)?;
982
983        assert!(indexer.find_files("notes\\.txt$")?.is_empty());
984
985        let index_content =
986            fs::read_to_string(workspace.join(".vtcode").join("index").join("index.md"))?;
987        assert!(!index_content.contains(file_path.to_string_lossy().as_ref()));
988
989        Ok(())
990    }
991
992    #[test]
993    fn index_file_maintains_markdown_snapshot_across_updates() -> Result<()> {
994        let temp = tempdir()?;
995        let workspace = temp.path();
996        let first = workspace.join("first.txt");
997        let second = workspace.join("second.txt");
998        fs::write(&first, "one")?;
999        fs::write(&second, "two")?;
1000
1001        let mut indexer = SimpleIndexer::new(workspace.to_path_buf());
1002        indexer.init()?;
1003        indexer.index_file(&first)?;
1004        indexer.index_file(&second)?;
1005
1006        let index_dir = workspace.join(".vtcode").join("index");
1007        let files = fs::read_dir(&index_dir)?
1008            .filter_map(|entry| entry.ok())
1009            .map(|entry| entry.file_name().to_string_lossy().into_owned())
1010            .collect::<Vec<_>>();
1011        assert_eq!(files, vec!["index.md".to_string()]);
1012
1013        let index_content = fs::read_to_string(index_dir.join("index.md"))?;
1014        assert!(index_content.contains(first.to_string_lossy().as_ref()));
1015        assert!(index_content.contains(second.to_string_lossy().as_ref()));
1016
1017        Ok(())
1018    }
1019
1020    #[test]
1021    fn index_directory_writes_markdown_snapshot_without_manual_init() -> Result<()> {
1022        let temp = tempdir()?;
1023        let workspace = temp.path();
1024        fs::write(workspace.join("notes.txt"), "remember this")?;
1025
1026        let mut indexer = SimpleIndexer::new(workspace.to_path_buf());
1027        indexer.index_directory(workspace)?;
1028
1029        let index_content =
1030            fs::read_to_string(workspace.join(".vtcode").join("index").join("index.md"))?;
1031        assert!(index_content.contains(workspace.join("notes.txt").to_string_lossy().as_ref()));
1032
1033        Ok(())
1034    }
1035
1036    #[test]
1037    fn get_file_content_clamps_ranges_without_panicking() -> Result<()> {
1038        let temp = tempdir()?;
1039        let workspace = temp.path();
1040        let file_path = workspace.join("notes.txt");
1041        fs::write(&file_path, "first\nsecond")?;
1042
1043        let indexer = SimpleIndexer::new(workspace.to_path_buf());
1044        let file_path = file_path.to_string_lossy().into_owned();
1045
1046        assert_eq!(indexer.get_file_content(&file_path, Some(5), None)?, "");
1047        assert_eq!(
1048            indexer.get_file_content(&file_path, Some(0), Some(1))?,
1049            "1: first\n"
1050        );
1051        assert_eq!(indexer.get_file_content(&file_path, Some(2), Some(1))?, "");
1052
1053        Ok(())
1054    }
1055
1056    #[test]
1057    fn supports_custom_storage_backends() -> Result<()> {
1058        #[derive(Clone, Default)]
1059        struct MemoryStorage {
1060            records: Arc<Mutex<Vec<FileIndex>>>,
1061        }
1062
1063        impl MemoryStorage {
1064            fn new(records: Arc<Mutex<Vec<FileIndex>>>) -> Self {
1065                Self { records }
1066            }
1067        }
1068
1069        impl IndexStorage for MemoryStorage {
1070            fn init(&self, _index_dir: &Path) -> Result<()> {
1071                Ok(())
1072            }
1073
1074            fn persist(&self, _index_dir: &Path, entry: &FileIndex) -> Result<()> {
1075                let mut guard = self.records.lock().expect("lock poisoned");
1076                guard.push(entry.clone());
1077                Ok(())
1078            }
1079        }
1080
1081        let temp = tempdir()?;
1082        let workspace = temp.path();
1083        fs::write(workspace.join("notes.txt"), "remember this")?;
1084
1085        let records: Arc<Mutex<Vec<FileIndex>>> = Arc::new(Mutex::new(Vec::new()));
1086        let storage = MemoryStorage::new(records.clone());
1087
1088        let config = SimpleIndexerConfig::new(workspace.to_path_buf());
1089        let mut indexer = SimpleIndexer::with_config(config).with_storage(Arc::new(storage));
1090        indexer.init()?;
1091        indexer.index_directory(workspace)?;
1092
1093        let entries = records.lock().expect("lock poisoned");
1094        assert_eq!(entries.len(), 1);
1095        assert_eq!(
1096            entries[0].path,
1097            workspace.join("notes.txt").to_string_lossy().into_owned()
1098        );
1099
1100        Ok(())
1101    }
1102
1103    #[test]
1104    fn custom_filters_can_skip_files() -> Result<()> {
1105        #[derive(Default)]
1106        struct SkipRustFilter {
1107            inner: ConfigTraversalFilter,
1108        }
1109
1110        impl TraversalFilter for SkipRustFilter {
1111            fn should_descend(&self, path: &Path, config: &SimpleIndexerConfig) -> bool {
1112                self.inner.should_descend(path, config)
1113            }
1114
1115            fn should_index_file(&self, path: &Path, config: &SimpleIndexerConfig) -> bool {
1116                if path
1117                    .extension()
1118                    .and_then(|ext| ext.to_str())
1119                    .is_some_and(|ext| ext.eq_ignore_ascii_case("rs"))
1120                {
1121                    return false;
1122                }
1123
1124                self.inner.should_index_file(path, config)
1125            }
1126        }
1127
1128        let temp = tempdir()?;
1129        let workspace = temp.path();
1130        fs::write(workspace.join("lib.rs"), "fn main() {}")?;
1131        fs::write(workspace.join("README.md"), "# Notes")?;
1132
1133        let config = SimpleIndexerConfig::new(workspace.to_path_buf());
1134        let mut indexer =
1135            SimpleIndexer::with_config(config).with_filter(Arc::new(SkipRustFilter::default()));
1136        indexer.init()?;
1137        indexer.index_directory(workspace)?;
1138
1139        assert!(indexer.find_files("lib\\.rs$")?.is_empty());
1140        assert!(!indexer.find_files("README\\.md$")?.is_empty());
1141
1142        Ok(())
1143    }
1144
1145    #[test]
1146    fn custom_filters_can_skip_directories() -> Result<()> {
1147        #[derive(Default)]
1148        struct SkipGeneratedFilter {
1149            inner: ConfigTraversalFilter,
1150        }
1151
1152        impl TraversalFilter for SkipGeneratedFilter {
1153            fn should_descend(&self, path: &Path, config: &SimpleIndexerConfig) -> bool {
1154                if path.ends_with("generated") {
1155                    return false;
1156                }
1157
1158                self.inner.should_descend(path, config)
1159            }
1160
1161            fn should_index_file(&self, path: &Path, config: &SimpleIndexerConfig) -> bool {
1162                self.inner.should_index_file(path, config)
1163            }
1164        }
1165
1166        let temp = tempdir()?;
1167        let workspace = temp.path();
1168        let generated_dir = workspace.join("generated");
1169        fs::create_dir_all(&generated_dir)?;
1170        fs::write(generated_dir.join("skip.txt"), "ignore me")?;
1171        fs::write(workspace.join("README.md"), "# Notes")?;
1172
1173        let config = SimpleIndexerConfig::new(workspace.to_path_buf());
1174        let indexer = SimpleIndexer::with_config(config)
1175            .with_filter(Arc::new(SkipGeneratedFilter::default()));
1176        let files = indexer.discover_files(workspace);
1177
1178        assert!(!files.iter().any(|file| file.ends_with("skip.txt")));
1179        assert!(files.iter().any(|file| file.ends_with("README.md")));
1180
1181        Ok(())
1182    }
1183
1184    #[test]
1185    fn indexing_multiple_directories_preserves_existing_cache_entries() -> Result<()> {
1186        let temp = tempdir()?;
1187        let workspace = temp.path();
1188        let src_dir = workspace.join("src");
1189        let docs_dir = workspace.join("docs");
1190        fs::create_dir_all(&src_dir)?;
1191        fs::create_dir_all(&docs_dir)?;
1192        fs::write(src_dir.join("lib.rs"), "fn main() {}")?;
1193        fs::write(docs_dir.join("guide.md"), "# Guide")?;
1194
1195        let mut indexer = SimpleIndexer::new(workspace.to_path_buf());
1196        indexer.init()?;
1197        indexer.index_directory(&src_dir)?;
1198        indexer.index_directory(&docs_dir)?;
1199
1200        assert!(
1201            indexer
1202                .find_files("lib\\.rs$")?
1203                .iter()
1204                .any(|file| file.ends_with("lib.rs"))
1205        );
1206        assert!(
1207            indexer
1208                .find_files("guide\\.md$")?
1209                .iter()
1210                .any(|file| file.ends_with("guide.md"))
1211        );
1212
1213        let index_content =
1214            fs::read_to_string(workspace.join(".vtcode").join("index").join("index.md"))?;
1215        assert!(index_content.contains(src_dir.join("lib.rs").to_string_lossy().as_ref()));
1216        assert!(index_content.contains(docs_dir.join("guide.md").to_string_lossy().as_ref()));
1217
1218        Ok(())
1219    }
1220
1221    #[test]
1222    fn batch_indexing_writes_single_markdown_file() -> Result<()> {
1223        let temp = tempdir()?;
1224        let workspace = temp.path();
1225        fs::write(workspace.join("lib.rs"), "fn main() {}")?;
1226        fs::write(workspace.join("README.md"), "# Notes")?;
1227
1228        let mut indexer = SimpleIndexer::new(workspace.to_path_buf());
1229        indexer.init()?;
1230        indexer.index_directory(workspace)?;
1231
1232        let index_dir = workspace.join(".vtcode").join("index");
1233        let files = fs::read_dir(&index_dir)?
1234            .filter_map(|entry| entry.ok())
1235            .map(|entry| entry.file_name().to_string_lossy().into_owned())
1236            .collect::<Vec<_>>();
1237        assert_eq!(files, vec!["index.md".to_string()]);
1238
1239        let index_content = fs::read_to_string(index_dir.join("index.md"))?;
1240        assert!(index_content.contains(workspace.join("lib.rs").to_string_lossy().as_ref()));
1241        assert!(index_content.contains(workspace.join("README.md").to_string_lossy().as_ref()));
1242
1243        Ok(())
1244    }
1245
1246    #[test]
1247    fn batch_indexing_removes_legacy_hashed_entries() -> Result<()> {
1248        let temp = tempdir()?;
1249        let workspace = temp.path();
1250        fs::write(workspace.join("lib.rs"), "fn main() {}")?;
1251
1252        let mut indexer = SimpleIndexer::new(workspace.to_path_buf());
1253        indexer.init()?;
1254
1255        let legacy_file_name = format!("{}.md", calculate_hash("legacy-path"));
1256        let legacy_file_path = workspace
1257            .join(".vtcode")
1258            .join("index")
1259            .join(&legacy_file_name);
1260        fs::write(&legacy_file_path, "# legacy")?;
1261        assert!(legacy_file_path.exists());
1262
1263        indexer.index_directory(workspace)?;
1264
1265        assert!(!legacy_file_path.exists());
1266        let files = fs::read_dir(workspace.join(".vtcode").join("index"))?
1267            .filter_map(|entry| entry.ok())
1268            .map(|entry| entry.file_name().to_string_lossy().into_owned())
1269            .collect::<Vec<_>>();
1270        assert_eq!(files, vec!["index.md".to_string()]);
1271
1272        Ok(())
1273    }
1274}