Skip to main content

vtcode_indexer/
lib.rs

1//! Workspace-friendly file indexer and file utilities for VT Code.
2//!
3//! `vtcode-indexer` provides:
4//! - A lightweight workspace file indexer with markdown-backed persistence
5//! - Fast parallel fuzzy file search (via `file_search` module)
6//! - Markdown-backed storage utilities (via `markdown_store` module)
7
8pub mod file_search;
9pub mod markdown_store;
10
11use anyhow::Result;
12use hashbrown::HashMap;
13use ignore::{DirEntry, Walk};
14use regex::Regex;
15use serde::{Deserialize, Serialize};
16use std::fmt::Write as FmtWrite;
17use std::fs;
18use std::io::{BufWriter, ErrorKind, Write};
19use std::path::{Path, PathBuf};
20use std::sync::Arc;
21use std::time::SystemTime;
22
23/// Persistence backend for [`SimpleIndexer`].
24pub trait IndexStorage: Send + Sync {
25    /// Prepare any directories or resources required for persistence.
26    fn init(&self, index_dir: &Path) -> Result<()>;
27
28    /// Persist an indexed file entry.
29    fn persist(&self, index_dir: &Path, entry: &FileIndex) -> Result<()>;
30
31    /// Whether this backend expects full-snapshot persistence.
32    ///
33    /// Snapshot-aware backends receive the complete in-memory index on each
34    /// update so on-disk state stays consistent across single-file and
35    /// directory indexing flows.
36    fn prefers_snapshot_persistence(&self) -> bool {
37        false
38    }
39
40    /// Remove a previously persisted file entry.
41    ///
42    /// Defaults to a no-op to keep existing custom storage backends compatible.
43    fn remove(&self, _index_dir: &Path, _file_path: &Path) -> Result<()> {
44        Ok(())
45    }
46
47    /// Persist a batch of indexed file entries.
48    ///
49    /// Defaults to calling [`IndexStorage::persist`] for each entry, keeping
50    /// existing custom storage backends compatible.
51    fn persist_batch(&self, index_dir: &Path, entries: &[FileIndex]) -> Result<()> {
52        for entry in entries {
53            self.persist(index_dir, entry)?;
54        }
55        Ok(())
56    }
57
58    /// Persist a batch of indexed file entries borrowed from the in-memory cache.
59    ///
60    /// Defaults to cloning the borrowed entries and delegating to
61    /// [`IndexStorage::persist_batch`] so existing custom storage backends remain
62    /// compatible.
63    fn persist_batch_refs(&self, index_dir: &Path, entries: &[&FileIndex]) -> Result<()> {
64        let owned = entries
65            .iter()
66            .map(|entry| (*entry).clone())
67            .collect::<Vec<_>>();
68        self.persist_batch(index_dir, &owned)
69    }
70}
71
/// Directory traversal filter hook for [`SimpleIndexer`].
///
/// Implementations decide which directories the walker enters and which files
/// end up in the index. Both callbacks receive the active
/// [`SimpleIndexerConfig`] so decisions can depend on it.
pub trait TraversalFilter: Send + Sync {
    /// Determine if the indexer should descend into the provided directory.
    ///
    /// Returning `false` prunes the directory from the walk.
    fn should_descend(&self, path: &Path, config: &SimpleIndexerConfig) -> bool;

    /// Determine if the indexer should process the provided file.
    ///
    /// Returning `false` keeps the file out of the index.
    fn should_index_file(&self, path: &Path, config: &SimpleIndexerConfig) -> bool;
}
80
/// Markdown-backed [`IndexStorage`] implementation.
///
/// Stateless unit struct: every operation works on the `index_dir` it is
/// given. Batch persistence writes a single `index.md` snapshot, while
/// single-entry persistence writes one `<hash>.md` file per entry.
#[derive(Debug, Default, Clone)]
pub struct MarkdownIndexStorage;
84
85impl IndexStorage for MarkdownIndexStorage {
86    fn init(&self, index_dir: &Path) -> Result<()> {
87        fs::create_dir_all(index_dir)?;
88        Ok(())
89    }
90
91    fn persist(&self, index_dir: &Path, entry: &FileIndex) -> Result<()> {
92        fs::create_dir_all(index_dir)?;
93        let file_name = format!("{}.md", calculate_hash(&entry.path));
94        let index_path = index_dir.join(file_name);
95        let file = fs::File::create(index_path)?;
96        let mut writer = BufWriter::new(file);
97        writeln!(writer, "# File Index: {}", entry.path)?;
98        writeln!(writer)?;
99        write_markdown_fields(&mut writer, entry)?;
100        writer.flush()?;
101        Ok(())
102    }
103
104    fn prefers_snapshot_persistence(&self) -> bool {
105        true
106    }
107
108    fn remove(&self, index_dir: &Path, file_path: &Path) -> Result<()> {
109        let file_name = format!(
110            "{}.md",
111            calculate_hash(file_path.to_string_lossy().as_ref())
112        );
113        let index_path = index_dir.join(file_name);
114        match fs::remove_file(index_path) {
115            Ok(()) => Ok(()),
116            Err(err) if err.kind() == ErrorKind::NotFound => Ok(()),
117            Err(err) => Err(err.into()),
118        }
119    }
120
121    fn persist_batch(&self, index_dir: &Path, entries: &[FileIndex]) -> Result<()> {
122        persist_markdown_snapshot(index_dir, entries.iter())
123    }
124
125    fn persist_batch_refs(&self, index_dir: &Path, entries: &[&FileIndex]) -> Result<()> {
126        persist_markdown_snapshot(index_dir, entries.iter().copied())
127    }
128}
129
130fn persist_markdown_snapshot<'a>(
131    index_dir: &Path,
132    entries: impl IntoIterator<Item = &'a FileIndex>,
133) -> Result<()> {
134    let entries = entries.into_iter().collect::<Vec<_>>();
135
136    fs::create_dir_all(index_dir)?;
137    let temp_path = index_dir.join(".index.md.tmp");
138    let final_path = index_dir.join("index.md");
139    let file = fs::File::create(&temp_path)?;
140    let mut writer = BufWriter::new(file);
141
142    writeln!(writer, "# Workspace File Index")?;
143    writeln!(writer)?;
144    writeln!(writer, "- **Entries**: {}", entries.len())?;
145    writeln!(writer)?;
146
147    for entry in entries {
148        write_markdown_entry(&mut writer, entry)?;
149    }
150
151    writer.flush()?;
152    fs::rename(temp_path, final_path)?;
153    cleanup_legacy_markdown_entries(index_dir)?;
154    Ok(())
155}
156
/// Default traversal filter powered by [`SimpleIndexerConfig`].
///
/// Stateless unit struct; all decisions are derived from the configuration
/// passed to each [`TraversalFilter`] call.
#[derive(Debug, Default, Clone)]
pub struct ConfigTraversalFilter;
160
161impl TraversalFilter for ConfigTraversalFilter {
162    fn should_descend(&self, path: &Path, config: &SimpleIndexerConfig) -> bool {
163        !should_skip_dir(path, config)
164    }
165
166    fn should_index_file(&self, path: &Path, config: &SimpleIndexerConfig) -> bool {
167        if !path.is_file() {
168            return false;
169        }
170
171        // Skip hidden files when configured.
172        if config.ignore_hidden
173            && path
174                .file_name()
175                .and_then(|n| n.to_str())
176                .is_some_and(|s| s.starts_with('.'))
177        {
178            return false;
179        }
180
181        // Always skip known sensitive files regardless of config.
182        if let Some(file_name) = path.file_name().and_then(|n| n.to_str())
183            && (vtcode_commons::exclusions::is_sensitive_file(file_name)
184                || file_name == ".gitignore"
185                || file_name == ".git")
186        {
187            return false;
188        }
189
190        true
191    }
192}
193
/// Configuration for [`SimpleIndexer`].
///
/// Built with [`SimpleIndexerConfig::new`] and refined through the consuming
/// builder methods on this type.
#[derive(Clone, Debug)]
pub struct SimpleIndexerConfig {
    // Root directory of the workspace being indexed.
    workspace_root: PathBuf,
    // Directory handed to the storage backend for persisted metadata.
    index_dir: PathBuf,
    // When true, entries whose name starts with `.` are skipped.
    ignore_hidden: bool,
    // Paths at or below any of these directories are skipped.
    excluded_dirs: Vec<PathBuf>,
    // Paths at or below any of these directories bypass the exclusion list.
    allowed_dirs: Vec<PathBuf>,
}
203
204impl SimpleIndexerConfig {
205    /// Builds a configuration using VT Code's legacy layout as defaults.
206    pub fn new(workspace_root: PathBuf) -> Self {
207        let index_dir = workspace_root.join(".vtcode").join("index");
208        let vtcode_dir = workspace_root.join(".vtcode");
209        let external_dir = vtcode_dir.join("external");
210
211        let mut excluded_dirs: Vec<PathBuf> = vtcode_commons::exclusions::DEFAULT_EXCLUDED_DIRS
212            .iter()
213            .map(|name| workspace_root.join(name))
214            .collect();
215        excluded_dirs.push(index_dir.clone());
216        excluded_dirs.push(vtcode_dir);
217
218        excluded_dirs.dedup();
219
220        Self {
221            workspace_root,
222            index_dir,
223            ignore_hidden: true,
224            excluded_dirs,
225            allowed_dirs: vec![external_dir],
226        }
227    }
228
229    /// Updates the index directory used for persisted metadata.
230    pub fn with_index_dir(mut self, index_dir: impl Into<PathBuf>) -> Self {
231        let index_dir = index_dir.into();
232        self.index_dir = index_dir.clone();
233        self.push_unique_excluded(index_dir);
234        self
235    }
236
237    /// Adds an allowed directory that should be indexed even if hidden or inside an excluded parent.
238    pub fn add_allowed_dir(mut self, path: impl Into<PathBuf>) -> Self {
239        let path = path.into();
240        if !self.allowed_dirs.iter().any(|existing| existing == &path) {
241            self.allowed_dirs.push(path);
242        }
243        self
244    }
245
246    /// Adds an additional excluded directory to skip during traversal.
247    pub fn add_excluded_dir(mut self, path: impl Into<PathBuf>) -> Self {
248        let path = path.into();
249        self.push_unique_excluded(path);
250        self
251    }
252
253    /// Toggles whether hidden directories (prefix `.`) are ignored.
254    pub fn ignore_hidden(mut self, ignore_hidden: bool) -> Self {
255        self.ignore_hidden = ignore_hidden;
256        self
257    }
258
259    /// Workspace root accessor.
260    pub fn workspace_root(&self) -> &Path {
261        &self.workspace_root
262    }
263
264    /// Index directory accessor.
265    pub fn index_dir(&self) -> &Path {
266        &self.index_dir
267    }
268
269    fn push_unique_excluded(&mut self, path: PathBuf) {
270        if !self.excluded_dirs.iter().any(|existing| existing == &path) {
271            self.excluded_dirs.push(path);
272        }
273    }
274}
275
/// Simple file index entry.
///
/// One entry describes one indexed file; the indexer keys its cache by
/// [`FileIndex::path`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileIndex {
    /// File path, stored as a lossily converted string of the path that was indexed.
    pub path: String,
    /// File content hash for change detection.
    pub hash: String,
    /// Last modified timestamp, in whole seconds since the Unix epoch.
    pub modified: u64,
    /// File size: the byte length of the file's text content.
    pub size: u64,
    /// Language/extension: the file extension, or `"unknown"` when there is none.
    pub language: String,
    /// Simple tags. The indexer in this file always creates entries with no tags.
    pub tags: Vec<String>,
}
292
/// Simple search result: one matching line in one indexed file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchResult {
    /// Path of the file that contains the match, as stored in the index.
    pub file_path: String,
    /// 1-based number of the matching line.
    pub line_number: usize,
    /// Full text of the matching line, without its line terminator.
    pub line_content: String,
    /// Matched text: every regex match on the line for `search`, or the whole
    /// line as a single element for `grep`.
    pub matches: Vec<String>,
}
301
/// Simple file indexer.
///
/// Keeps an in-memory cache of [`FileIndex`] entries and mirrors it to a
/// pluggable [`IndexStorage`] backend. Which files and directories are
/// considered is decided by a pluggable [`TraversalFilter`].
pub struct SimpleIndexer {
    // Workspace layout plus include/exclude rules.
    config: SimpleIndexerConfig,
    // Indexed entries keyed by file path string.
    index_cache: HashMap<String, FileIndex>,
    // Backend used to persist entries.
    storage: Arc<dyn IndexStorage>,
    // Decides which directories are walked and which files are indexed.
    filter: Arc<dyn TraversalFilter>,
}
309
310impl SimpleIndexer {
311    /// Create a new simple indexer with default VT Code paths.
312    pub fn new(workspace_root: PathBuf) -> Self {
313        Self::with_components(
314            SimpleIndexerConfig::new(workspace_root),
315            Arc::new(MarkdownIndexStorage),
316            Arc::new(ConfigTraversalFilter),
317        )
318    }
319
320    /// Create a simple indexer with the provided configuration.
321    pub fn with_config(config: SimpleIndexerConfig) -> Self {
322        Self::with_components(
323            config,
324            Arc::new(MarkdownIndexStorage),
325            Arc::new(ConfigTraversalFilter),
326        )
327    }
328
329    /// Create a new simple indexer using a custom index directory.
330    pub fn with_index_dir(workspace_root: PathBuf, index_dir: PathBuf) -> Self {
331        let config = SimpleIndexerConfig::new(workspace_root).with_index_dir(index_dir);
332        Self::with_config(config)
333    }
334
    /// Create an indexer with explicit storage and traversal filter implementations.
    ///
    /// The indexer starts with an empty in-memory cache; nothing is read from
    /// `storage` here.
    pub fn with_components(
        config: SimpleIndexerConfig,
        storage: Arc<dyn IndexStorage>,
        filter: Arc<dyn TraversalFilter>,
    ) -> Self {
        Self {
            config,
            index_cache: HashMap::new(),
            storage,
            filter,
        }
    }
348
349    /// Replace the storage backend used to persist index entries.
350    pub fn with_storage(self, storage: Arc<dyn IndexStorage>) -> Self {
351        Self { storage, ..self }
352    }
353
354    /// Replace the traversal filter used to decide which files and directories are indexed.
355    pub fn with_filter(self, filter: Arc<dyn TraversalFilter>) -> Self {
356        Self { filter, ..self }
357    }
358
    /// Initialize the index directory.
    ///
    /// Delegates to the storage backend's [`IndexStorage::init`] with the
    /// configured index directory and returns its result.
    pub fn init(&self) -> Result<()> {
        self.storage.init(self.config.index_dir())
    }
363
    /// Get the workspace root path, as stored in the configuration.
    pub fn workspace_root(&self) -> &Path {
        self.config.workspace_root()
    }
368
    /// Get the index directory used for persisted metadata, as stored in the
    /// configuration.
    pub fn index_dir(&self) -> &Path {
        self.config.index_dir()
    }
373
374    /// Index a single file.
375    pub fn index_file(&mut self, file_path: &Path) -> Result<()> {
376        let cache_key = file_path.to_string_lossy().into_owned();
377
378        if self.storage.prefers_snapshot_persistence() {
379            let next_entry = if file_path.exists() && self.should_process_file_path(file_path) {
380                self.build_file_index(file_path)?
381            } else {
382                None
383            };
384
385            self.apply_snapshot_file_update(cache_key, next_entry)?;
386            return Ok(());
387        }
388
389        if !file_path.exists() || !self.should_process_file_path(file_path) {
390            self.index_cache.remove(cache_key.as_str());
391            self.storage.remove(self.config.index_dir(), file_path)?;
392            return Ok(());
393        }
394
395        if let Some(index) = self.build_file_index(file_path)? {
396            self.storage.persist(self.config.index_dir(), &index)?;
397            self.index_cache.insert(index.path.clone(), index);
398        } else {
399            self.index_cache.remove(cache_key.as_str());
400            self.storage.remove(self.config.index_dir(), file_path)?;
401        }
402
403        Ok(())
404    }
405
406    /// Index all files in directory recursively.
407    /// Respects .gitignore, .ignore, and other ignore files.
408    /// SECURITY: Always skips hidden files and sensitive data (.env, .git, etc.)
409    pub fn index_directory(&mut self, dir_path: &Path) -> Result<()> {
410        let walker = self.build_walker(dir_path);
411
412        let mut entries = Vec::new();
413
414        for entry in walker.filter_map(|e| e.ok()) {
415            let path = entry.path();
416
417            // Only index files, not directories
418            if entry.file_type().is_some_and(|ft| ft.is_file())
419                && let Some(index) = self.build_file_index(path)?
420            {
421                entries.push(index);
422            }
423        }
424
425        if self.storage.prefers_snapshot_persistence() {
426            self.apply_snapshot_directory_update(dir_path, &entries)?;
427        } else {
428            entries.sort_unstable_by(|left, right| left.path.cmp(&right.path));
429            self.storage
430                .persist_batch(self.config.index_dir(), &entries)?;
431        }
432
433        self.replace_cached_entries(dir_path, &entries);
434
435        Ok(())
436    }
437
438    /// Discover all files in directory recursively without indexing them.
439    /// This is much faster than `index_directory` as it avoids hashing and persistence.
440    pub fn discover_files(&self, dir_path: &Path) -> Vec<String> {
441        let walker = self.build_walker(dir_path);
442
443        let mut files = walker
444            .filter_map(|e| e.ok())
445            .filter(|e| {
446                if !e.file_type().is_some_and(|ft| ft.is_file()) {
447                    return false;
448                }
449
450                self.should_process_file_path(e.path())
451            })
452            .map(|e| e.path().to_string_lossy().into_owned())
453            .collect::<Vec<_>>();
454        files.sort_unstable();
455        files
456    }
457
458    /// Internal helper for regex-based file content search.
459    /// Used by both `search()` and `grep()` to avoid code duplication.
460    fn search_files_internal(
461        &self,
462        regex: &Regex,
463        path_filter: Option<&str>,
464        extract_matches: bool,
465    ) -> Vec<SearchResult> {
466        let mut results = Vec::with_capacity(self.index_cache.len());
467
468        for file_path in self.index_cache.keys() {
469            if path_filter.is_some_and(|filter| !file_path.contains(filter)) {
470                continue;
471            }
472
473            if let Ok(content) = fs::read_to_string(file_path) {
474                for (line_num, line) in content.lines().enumerate() {
475                    if regex.is_match(line) {
476                        let matches = if extract_matches {
477                            regex
478                                .find_iter(line)
479                                .map(|m| m.as_str().to_string())
480                                .collect()
481                        } else {
482                            vec![line.to_string()]
483                        };
484
485                        results.push(SearchResult {
486                            file_path: file_path.clone(),
487                            line_number: line_num + 1,
488                            line_content: line.to_string(),
489                            matches,
490                        });
491                    }
492                }
493            }
494        }
495
496        results.sort_unstable_by(|left, right| {
497            left.file_path
498                .cmp(&right.file_path)
499                .then_with(|| left.line_number.cmp(&right.line_number))
500        });
501        results
502    }
503
504    /// Search files using regex pattern.
505    pub fn search(&self, pattern: &str, path_filter: Option<&str>) -> Result<Vec<SearchResult>> {
506        let regex = Regex::new(pattern)?;
507        Ok(self.search_files_internal(&regex, path_filter, true))
508    }
509
510    /// Find files by name pattern.
511    pub fn find_files(&self, pattern: &str) -> Result<Vec<String>> {
512        let regex = Regex::new(pattern)?;
513        let mut results = Vec::with_capacity(self.index_cache.len());
514
515        for file_path in self.index_cache.keys() {
516            if regex.is_match(file_path) {
517                results.push(file_path.clone());
518            }
519        }
520
521        results.sort_unstable();
522        Ok(results)
523    }
524
525    /// Get all indexed files without pattern matching.
526    /// This is more efficient than using find_files(".*").
527    pub fn all_files(&self) -> Vec<String> {
528        let mut files = self.index_cache.keys().cloned().collect::<Vec<_>>();
529        files.sort_unstable();
530        files
531    }
532
533    /// Get file content with line numbers.
534    pub fn get_file_content(
535        &self,
536        file_path: &str,
537        start_line: Option<usize>,
538        end_line: Option<usize>,
539    ) -> Result<String> {
540        let content = fs::read_to_string(file_path)?;
541        let start = start_line.unwrap_or(1).max(1);
542        let end = end_line.unwrap_or(usize::MAX);
543
544        if start > end {
545            return Ok(String::new());
546        }
547
548        let mut result = String::new();
549        for (line_number, line) in content.lines().enumerate() {
550            let line_number = line_number + 1;
551            if line_number < start {
552                continue;
553            }
554            if line_number > end {
555                break;
556            }
557            writeln!(&mut result, "{line_number}: {line}")?;
558        }
559
560        Ok(result)
561    }
562
563    /// List files in directory (like ls).
564    pub fn list_files(&self, dir_path: &str, show_hidden: bool) -> Result<Vec<String>> {
565        let path = Path::new(dir_path);
566        if !path.exists() {
567            return Ok(vec![]);
568        }
569
570        let mut files = Vec::new();
571
572        for entry in fs::read_dir(path)? {
573            let entry = entry?;
574            let file_name = entry.file_name().to_string_lossy().into_owned();
575
576            if !show_hidden && file_name.starts_with('.') {
577                continue;
578            }
579
580            files.push(file_name);
581        }
582
583        files.sort_unstable();
584        Ok(files)
585    }
586
587    /// Grep-like search (like grep command).
588    pub fn grep(&self, pattern: &str, file_pattern: Option<&str>) -> Result<Vec<SearchResult>> {
589        let regex = Regex::new(pattern)?;
590        Ok(self.search_files_internal(&regex, file_pattern, false))
591    }
592
593    fn is_allowed_path(&self, path: &Path) -> bool {
594        self.config
595            .allowed_dirs
596            .iter()
597            .any(|allowed| path.starts_with(allowed))
598    }
599
600    #[inline]
601    fn get_modified_time(&self, file_path: &Path) -> Result<u64> {
602        let metadata = fs::metadata(file_path)?;
603        let modified = metadata.modified()?;
604        Ok(modified.duration_since(SystemTime::UNIX_EPOCH)?.as_secs())
605    }
606
607    #[inline]
608    fn detect_language(&self, file_path: &Path) -> String {
609        file_path
610            .extension()
611            .and_then(|ext| ext.to_str())
612            .unwrap_or("unknown")
613            .to_string()
614    }
615
616    fn build_file_index(&self, file_path: &Path) -> Result<Option<FileIndex>> {
617        if !self.should_process_file_path(file_path) {
618            return Ok(None);
619        }
620
621        let content = match fs::read_to_string(file_path) {
622            Ok(text) => text,
623            Err(err) => {
624                if err.kind() == ErrorKind::InvalidData {
625                    return Ok(None);
626                }
627                return Err(err.into());
628            }
629        };
630
631        let index = FileIndex {
632            path: file_path.to_string_lossy().into_owned(),
633            hash: calculate_hash(&content),
634            modified: self.get_modified_time(file_path)?,
635            size: content.len() as u64,
636            language: self.detect_language(file_path),
637            tags: vec![],
638        };
639
640        Ok(Some(index))
641    }
642
643    #[inline]
644    fn is_excluded_path(&self, path: &Path) -> bool {
645        self.config
646            .excluded_dirs
647            .iter()
648            .any(|excluded| path.starts_with(excluded))
649    }
650
    /// Asks the traversal filter whether `path` should be indexed, passing the
    /// current configuration along.
    #[inline]
    fn should_index_file_path(&self, path: &Path) -> bool {
        self.filter.should_index_file(path, &self.config)
    }
655
656    #[inline]
657    fn should_process_file_path(&self, path: &Path) -> bool {
658        if self.is_allowed_path(path) {
659            return self.should_index_file_path(path);
660        }
661
662        !self.is_excluded_path(path) && self.should_index_file_path(path)
663    }
664
665    fn build_walker(&self, dir_path: &Path) -> Walk {
666        let walk_root = dir_path.to_path_buf();
667        let config = self.config.clone();
668        let filter = Arc::clone(&self.filter);
669
670        let mut builder = vtcode_commons::walk::build_default_walker(dir_path);
671        builder.filter_entry(move |entry| {
672            should_visit_entry(entry, walk_root.as_path(), &config, filter.as_ref())
673        });
674        builder.build()
675    }
676
677    fn replace_cached_entries(&mut self, dir_path: &Path, entries: &[FileIndex]) {
678        self.index_cache
679            .retain(|path, _| !Path::new(path).starts_with(dir_path));
680
681        self.index_cache.extend(
682            entries
683                .iter()
684                .cloned()
685                .map(|entry| (entry.path.clone(), entry)),
686        );
687    }
688
689    fn apply_snapshot_file_update(
690        &mut self,
691        cache_key: String,
692        next_entry: Option<FileIndex>,
693    ) -> Result<()> {
694        let previous_entry = match next_entry {
695            Some(entry) => self.index_cache.insert(cache_key.clone(), entry),
696            None => self.index_cache.remove(cache_key.as_str()),
697        };
698
699        if let Err(err) = self.persist_current_snapshot() {
700            match previous_entry {
701                Some(entry) => {
702                    self.index_cache.insert(cache_key, entry);
703                }
704                None => {
705                    self.index_cache.remove(cache_key.as_str());
706                }
707            }
708            return Err(err);
709        }
710
711        Ok(())
712    }
713
714    fn apply_snapshot_directory_update(
715        &mut self,
716        dir_path: &Path,
717        entries: &[FileIndex],
718    ) -> Result<()> {
719        let previous_entries = self.take_cached_entries(dir_path);
720        self.index_cache.extend(
721            entries
722                .iter()
723                .cloned()
724                .map(|entry| (entry.path.clone(), entry)),
725        );
726
727        if let Err(err) = self.persist_current_snapshot() {
728            self.index_cache
729                .retain(|path, _| !Path::new(path).starts_with(dir_path));
730            self.index_cache.extend(
731                previous_entries
732                    .into_iter()
733                    .map(|entry| (entry.path.clone(), entry)),
734            );
735            return Err(err);
736        }
737
738        Ok(())
739    }
740
741    fn take_cached_entries(&mut self, dir_path: &Path) -> Vec<FileIndex> {
742        let keys = self
743            .index_cache
744            .keys()
745            .filter(|path| Path::new(path).starts_with(dir_path))
746            .cloned()
747            .collect::<Vec<_>>();
748
749        keys.into_iter()
750            .filter_map(|path| self.index_cache.remove(path.as_str()))
751            .collect()
752    }
753
754    fn persist_current_snapshot(&self) -> Result<()> {
755        let mut snapshot = self.index_cache.values().collect::<Vec<_>>();
756        snapshot.sort_unstable_by(|left, right| left.path.cmp(&right.path));
757        self.storage
758            .persist_batch_refs(self.config.index_dir(), &snapshot)
759    }
760}
761
762impl Clone for SimpleIndexer {
763    fn clone(&self) -> Self {
764        Self {
765            config: self.config.clone(),
766            index_cache: self.index_cache.clone(),
767            storage: self.storage.clone(),
768            filter: self.filter.clone(),
769        }
770    }
771}
772
773fn should_skip_dir(path: &Path, config: &SimpleIndexerConfig) -> bool {
774    if is_allowed_path_or_ancestor(path, config) {
775        return false;
776    }
777
778    if config
779        .excluded_dirs
780        .iter()
781        .any(|excluded| path.starts_with(excluded))
782    {
783        return true;
784    }
785
786    if config.ignore_hidden
787        && path
788            .file_name()
789            .and_then(|name| name.to_str())
790            .is_some_and(|name_str| name_str.starts_with('.'))
791    {
792        return true;
793    }
794
795    false
796}
797
798fn is_allowed_path_or_ancestor(path: &Path, config: &SimpleIndexerConfig) -> bool {
799    config
800        .allowed_dirs
801        .iter()
802        .any(|allowed| path.starts_with(allowed) || allowed.starts_with(path))
803}
804
805fn should_visit_entry(
806    entry: &DirEntry,
807    walk_root: &Path,
808    config: &SimpleIndexerConfig,
809    filter: &dyn TraversalFilter,
810) -> bool {
811    if entry.path() == walk_root {
812        return true;
813    }
814
815    if !entry
816        .file_type()
817        .is_some_and(|file_type| file_type.is_dir())
818    {
819        return true;
820    }
821
822    filter.should_descend(entry.path(), config)
823}
824
/// Hashes the UTF-8 bytes of `content` with
/// `vtcode_commons::utils::calculate_sha256` and returns the resulting string.
///
/// Used both for file contents and for deriving per-file index file names.
// NOTE(review): presumably a 64-character hex digest, which is what
// `is_legacy_markdown_entry_name` looks for; confirm against vtcode_commons.
#[inline]
fn calculate_hash(content: &str) -> String {
    vtcode_commons::utils::calculate_sha256(content.as_bytes())
}
829
830fn write_markdown_entry(writer: &mut impl Write, entry: &FileIndex) -> std::io::Result<()> {
831    writeln!(writer, "## {}", entry.path)?;
832    writeln!(writer)?;
833    write_markdown_fields(writer, entry)?;
834    writeln!(writer)?;
835    Ok(())
836}
837
838fn write_markdown_fields(writer: &mut impl Write, entry: &FileIndex) -> std::io::Result<()> {
839    writeln!(writer, "- **Path**: {}", entry.path)?;
840    writeln!(writer, "- **Hash**: {}", entry.hash)?;
841    writeln!(writer, "- **Modified**: {}", entry.modified)?;
842    writeln!(writer, "- **Size**: {} bytes", entry.size)?;
843    writeln!(writer, "- **Language**: {}", entry.language)?;
844    writeln!(writer, "- **Tags**: {}", entry.tags.join(", "))?;
845    Ok(())
846}
847
848fn cleanup_legacy_markdown_entries(index_dir: &Path) -> Result<()> {
849    for entry in fs::read_dir(index_dir)? {
850        let entry = entry?;
851        let file_name = entry.file_name();
852        let file_name = file_name.to_string_lossy();
853        if is_legacy_markdown_entry_name(file_name.as_ref()) {
854            fs::remove_file(entry.path())?;
855        }
856    }
857    Ok(())
858}
859
/// True when `file_name` looks like a legacy per-file index entry: exactly 64
/// ASCII hex digits (either case) followed by `.md`.
#[inline]
fn is_legacy_markdown_entry_name(file_name: &str) -> bool {
    match file_name.strip_suffix(".md") {
        Some(stem) => stem.len() == 64 && stem.chars().all(|ch| ch.is_ascii_hexdigit()),
        None => false,
    }
}
867
868#[cfg(test)]
869mod tests {
870    use super::*;
871    use std::fs;
872    use std::sync::{Arc, Mutex};
873    use tempfile::tempdir;
874
    /// With the default configuration, files inside a dot-directory are not
    /// indexed while files in regular directories are.
    #[test]
    fn skips_hidden_directories_by_default() -> Result<()> {
        let temp = tempdir()?;
        let workspace = temp.path();
        // Hidden directory whose content must stay out of the index.
        let hidden_dir = workspace.join(".private");
        fs::create_dir_all(&hidden_dir)?;
        fs::write(hidden_dir.join("secret.txt"), "classified")?;

        // Regular directory whose content must be indexed.
        let visible_dir = workspace.join("src");
        fs::create_dir_all(&visible_dir)?;
        fs::write(visible_dir.join("lib.rs"), "fn main() {}")?;

        let mut indexer = SimpleIndexer::new(workspace.to_path_buf());
        indexer.init()?;
        indexer.index_directory(workspace)?;

        assert!(indexer.find_files("secret\\.txt$")?.is_empty());
        assert!(!indexer.find_files("lib\\.rs$")?.is_empty());

        Ok(())
    }
896
    /// With `ignore_hidden(false)`, files inside a dot-directory are indexed.
    #[test]
    fn can_include_hidden_directories_when_configured() -> Result<()> {
        let temp = tempdir()?;
        let workspace = temp.path();
        let hidden_dir = workspace.join(".cache");
        fs::create_dir_all(&hidden_dir)?;
        fs::write(hidden_dir.join("data.log"), "details")?;

        // Opt out of the default hidden-entry filtering.
        let config = SimpleIndexerConfig::new(workspace.to_path_buf()).ignore_hidden(false);
        let mut indexer = SimpleIndexer::with_config(config);
        indexer.init()?;
        indexer.index_directory(workspace)?;

        let results = indexer.find_files("data\\.log$")?;
        assert_eq!(results.len(), 1);

        Ok(())
    }
915
    /// The default allowed directory `.vtcode/external` is indexed even though
    /// its parent `.vtcode` is both hidden and excluded.
    #[test]
    fn indexes_allowed_directories_inside_hidden_excluded_parents() -> Result<()> {
        let temp = tempdir()?;
        let workspace = temp.path();
        let allowed_dir = workspace.join(".vtcode").join("external");
        fs::create_dir_all(&allowed_dir)?;
        fs::write(allowed_dir.join("plugin.toml"), "name = 'demo'")?;

        let mut indexer = SimpleIndexer::new(workspace.to_path_buf());
        indexer.init()?;
        indexer.index_directory(workspace)?;

        let results = indexer.find_files("plugin\\.toml$")?;
        assert_eq!(results.len(), 1);

        Ok(())
    }
933
934    #[test]
935    fn reindexing_prunes_deleted_files_from_cache() -> Result<()> {
936        let temp = tempdir()?;
937        let workspace = temp.path();
938        let file_path = workspace.join("notes.txt");
939        fs::write(&file_path, "remember this")?;
940
941        let mut indexer = SimpleIndexer::new(workspace.to_path_buf());
942        indexer.init()?;
943        indexer.index_directory(workspace)?;
944        assert_eq!(indexer.find_files("notes\\.txt$")?.len(), 1);
945
946        fs::remove_file(&file_path)?;
947        indexer.index_directory(workspace)?;
948
949        assert!(indexer.find_files("notes\\.txt$")?.is_empty());
950        assert!(indexer.all_files().is_empty());
951
952        Ok(())
953    }
954
955    #[test]
956    fn index_file_skips_excluded_paths() -> Result<()> {
957        let temp = tempdir()?;
958        let workspace = temp.path();
959        let index_dir = workspace.join(".vtcode").join("index");
960        fs::create_dir_all(&index_dir)?;
961        let generated_index = index_dir.join("index.md");
962        fs::write(&generated_index, "# generated")?;
963
964        let mut indexer = SimpleIndexer::new(workspace.to_path_buf());
965        indexer.init()?;
966        indexer.index_file(&generated_index)?;
967
968        assert!(indexer.all_files().is_empty());
969
970        Ok(())
971    }
972
973    #[test]
974    fn index_file_removes_stale_entry_when_file_becomes_unreadable() -> Result<()> {
975        let temp = tempdir()?;
976        let workspace = temp.path();
977        let file_path = workspace.join("notes.txt");
978        fs::write(&file_path, "remember this")?;
979
980        let mut indexer = SimpleIndexer::new(workspace.to_path_buf());
981        indexer.init()?;
982        indexer.index_file(&file_path)?;
983        assert!(
984            indexer
985                .find_files("notes\\.txt$")?
986                .iter()
987                .any(|file| file.ends_with("notes.txt"))
988        );
989
990        fs::write(&file_path, [0xFF, 0xFE, 0xFD])?;
991        indexer.index_file(&file_path)?;
992
993        assert!(indexer.find_files("notes\\.txt$")?.is_empty());
994
995        let index_content =
996            fs::read_to_string(workspace.join(".vtcode").join("index").join("index.md"))?;
997        assert!(!index_content.contains(file_path.to_string_lossy().as_ref()));
998
999        Ok(())
1000    }
1001
1002    #[test]
1003    fn index_file_maintains_markdown_snapshot_across_updates() -> Result<()> {
1004        let temp = tempdir()?;
1005        let workspace = temp.path();
1006        let first = workspace.join("first.txt");
1007        let second = workspace.join("second.txt");
1008        fs::write(&first, "one")?;
1009        fs::write(&second, "two")?;
1010
1011        let mut indexer = SimpleIndexer::new(workspace.to_path_buf());
1012        indexer.init()?;
1013        indexer.index_file(&first)?;
1014        indexer.index_file(&second)?;
1015
1016        let index_dir = workspace.join(".vtcode").join("index");
1017        let files = fs::read_dir(&index_dir)?
1018            .filter_map(|entry| entry.ok())
1019            .map(|entry| entry.file_name().to_string_lossy().into_owned())
1020            .collect::<Vec<_>>();
1021        assert_eq!(files, vec!["index.md".to_string()]);
1022
1023        let index_content = fs::read_to_string(index_dir.join("index.md"))?;
1024        assert!(index_content.contains(first.to_string_lossy().as_ref()));
1025        assert!(index_content.contains(second.to_string_lossy().as_ref()));
1026
1027        Ok(())
1028    }
1029
1030    #[test]
1031    fn index_directory_writes_markdown_snapshot_without_manual_init() -> Result<()> {
1032        let temp = tempdir()?;
1033        let workspace = temp.path();
1034        fs::write(workspace.join("notes.txt"), "remember this")?;
1035
1036        let mut indexer = SimpleIndexer::new(workspace.to_path_buf());
1037        indexer.index_directory(workspace)?;
1038
1039        let index_content =
1040            fs::read_to_string(workspace.join(".vtcode").join("index").join("index.md"))?;
1041        assert!(index_content.contains(workspace.join("notes.txt").to_string_lossy().as_ref()));
1042
1043        Ok(())
1044    }
1045
1046    #[test]
1047    fn get_file_content_clamps_ranges_without_panicking() -> Result<()> {
1048        let temp = tempdir()?;
1049        let workspace = temp.path();
1050        let file_path = workspace.join("notes.txt");
1051        fs::write(&file_path, "first\nsecond")?;
1052
1053        let indexer = SimpleIndexer::new(workspace.to_path_buf());
1054        let file_path = file_path.to_string_lossy().into_owned();
1055
1056        assert_eq!(indexer.get_file_content(&file_path, Some(5), None)?, "");
1057        assert_eq!(
1058            indexer.get_file_content(&file_path, Some(0), Some(1))?,
1059            "1: first\n"
1060        );
1061        assert_eq!(indexer.get_file_content(&file_path, Some(2), Some(1))?, "");
1062
1063        Ok(())
1064    }
1065
1066    #[test]
1067    fn supports_custom_storage_backends() -> Result<()> {
1068        #[derive(Clone, Default)]
1069        struct MemoryStorage {
1070            records: Arc<Mutex<Vec<FileIndex>>>,
1071        }
1072
1073        impl MemoryStorage {
1074            fn new(records: Arc<Mutex<Vec<FileIndex>>>) -> Self {
1075                Self { records }
1076            }
1077        }
1078
1079        impl IndexStorage for MemoryStorage {
1080            fn init(&self, _index_dir: &Path) -> Result<()> {
1081                Ok(())
1082            }
1083
1084            fn persist(&self, _index_dir: &Path, entry: &FileIndex) -> Result<()> {
1085                let mut guard = self.records.lock().expect("lock poisoned");
1086                guard.push(entry.clone());
1087                Ok(())
1088            }
1089        }
1090
1091        let temp = tempdir()?;
1092        let workspace = temp.path();
1093        fs::write(workspace.join("notes.txt"), "remember this")?;
1094
1095        let records: Arc<Mutex<Vec<FileIndex>>> = Arc::new(Mutex::new(Vec::new()));
1096        let storage = MemoryStorage::new(records.clone());
1097
1098        let config = SimpleIndexerConfig::new(workspace.to_path_buf());
1099        let mut indexer = SimpleIndexer::with_config(config).with_storage(Arc::new(storage));
1100        indexer.init()?;
1101        indexer.index_directory(workspace)?;
1102
1103        let entries = records.lock().expect("lock poisoned");
1104        assert_eq!(entries.len(), 1);
1105        assert_eq!(
1106            entries[0].path,
1107            workspace.join("notes.txt").to_string_lossy().into_owned()
1108        );
1109
1110        Ok(())
1111    }
1112
1113    #[test]
1114    fn custom_filters_can_skip_files() -> Result<()> {
1115        #[derive(Default)]
1116        struct SkipRustFilter {
1117            inner: ConfigTraversalFilter,
1118        }
1119
1120        impl TraversalFilter for SkipRustFilter {
1121            fn should_descend(&self, path: &Path, config: &SimpleIndexerConfig) -> bool {
1122                self.inner.should_descend(path, config)
1123            }
1124
1125            fn should_index_file(&self, path: &Path, config: &SimpleIndexerConfig) -> bool {
1126                if path
1127                    .extension()
1128                    .and_then(|ext| ext.to_str())
1129                    .is_some_and(|ext| ext.eq_ignore_ascii_case("rs"))
1130                {
1131                    return false;
1132                }
1133
1134                self.inner.should_index_file(path, config)
1135            }
1136        }
1137
1138        let temp = tempdir()?;
1139        let workspace = temp.path();
1140        fs::write(workspace.join("lib.rs"), "fn main() {}")?;
1141        fs::write(workspace.join("README.md"), "# Notes")?;
1142
1143        let config = SimpleIndexerConfig::new(workspace.to_path_buf());
1144        let mut indexer =
1145            SimpleIndexer::with_config(config).with_filter(Arc::new(SkipRustFilter::default()));
1146        indexer.init()?;
1147        indexer.index_directory(workspace)?;
1148
1149        assert!(indexer.find_files("lib\\.rs$")?.is_empty());
1150        assert!(!indexer.find_files("README\\.md$")?.is_empty());
1151
1152        Ok(())
1153    }
1154
1155    #[test]
1156    fn custom_filters_can_skip_directories() -> Result<()> {
1157        #[derive(Default)]
1158        struct SkipGeneratedFilter {
1159            inner: ConfigTraversalFilter,
1160        }
1161
1162        impl TraversalFilter for SkipGeneratedFilter {
1163            fn should_descend(&self, path: &Path, config: &SimpleIndexerConfig) -> bool {
1164                if path.ends_with("generated") {
1165                    return false;
1166                }
1167
1168                self.inner.should_descend(path, config)
1169            }
1170
1171            fn should_index_file(&self, path: &Path, config: &SimpleIndexerConfig) -> bool {
1172                self.inner.should_index_file(path, config)
1173            }
1174        }
1175
1176        let temp = tempdir()?;
1177        let workspace = temp.path();
1178        let generated_dir = workspace.join("generated");
1179        fs::create_dir_all(&generated_dir)?;
1180        fs::write(generated_dir.join("skip.txt"), "ignore me")?;
1181        fs::write(workspace.join("README.md"), "# Notes")?;
1182
1183        let config = SimpleIndexerConfig::new(workspace.to_path_buf());
1184        let indexer = SimpleIndexer::with_config(config)
1185            .with_filter(Arc::new(SkipGeneratedFilter::default()));
1186        let files = indexer.discover_files(workspace);
1187
1188        assert!(!files.iter().any(|file| file.ends_with("skip.txt")));
1189        assert!(files.iter().any(|file| file.ends_with("README.md")));
1190
1191        Ok(())
1192    }
1193
1194    #[test]
1195    fn indexing_multiple_directories_preserves_existing_cache_entries() -> Result<()> {
1196        let temp = tempdir()?;
1197        let workspace = temp.path();
1198        let src_dir = workspace.join("src");
1199        let docs_dir = workspace.join("docs");
1200        fs::create_dir_all(&src_dir)?;
1201        fs::create_dir_all(&docs_dir)?;
1202        fs::write(src_dir.join("lib.rs"), "fn main() {}")?;
1203        fs::write(docs_dir.join("guide.md"), "# Guide")?;
1204
1205        let mut indexer = SimpleIndexer::new(workspace.to_path_buf());
1206        indexer.init()?;
1207        indexer.index_directory(&src_dir)?;
1208        indexer.index_directory(&docs_dir)?;
1209
1210        assert!(
1211            indexer
1212                .find_files("lib\\.rs$")?
1213                .iter()
1214                .any(|file| file.ends_with("lib.rs"))
1215        );
1216        assert!(
1217            indexer
1218                .find_files("guide\\.md$")?
1219                .iter()
1220                .any(|file| file.ends_with("guide.md"))
1221        );
1222
1223        let index_content =
1224            fs::read_to_string(workspace.join(".vtcode").join("index").join("index.md"))?;
1225        assert!(index_content.contains(src_dir.join("lib.rs").to_string_lossy().as_ref()));
1226        assert!(index_content.contains(docs_dir.join("guide.md").to_string_lossy().as_ref()));
1227
1228        Ok(())
1229    }
1230
1231    #[test]
1232    fn batch_indexing_writes_single_markdown_file() -> Result<()> {
1233        let temp = tempdir()?;
1234        let workspace = temp.path();
1235        fs::write(workspace.join("lib.rs"), "fn main() {}")?;
1236        fs::write(workspace.join("README.md"), "# Notes")?;
1237
1238        let mut indexer = SimpleIndexer::new(workspace.to_path_buf());
1239        indexer.init()?;
1240        indexer.index_directory(workspace)?;
1241
1242        let index_dir = workspace.join(".vtcode").join("index");
1243        let files = fs::read_dir(&index_dir)?
1244            .filter_map(|entry| entry.ok())
1245            .map(|entry| entry.file_name().to_string_lossy().into_owned())
1246            .collect::<Vec<_>>();
1247        assert_eq!(files, vec!["index.md".to_string()]);
1248
1249        let index_content = fs::read_to_string(index_dir.join("index.md"))?;
1250        assert!(index_content.contains(workspace.join("lib.rs").to_string_lossy().as_ref()));
1251        assert!(index_content.contains(workspace.join("README.md").to_string_lossy().as_ref()));
1252
1253        Ok(())
1254    }
1255
1256    #[test]
1257    fn batch_indexing_removes_legacy_hashed_entries() -> Result<()> {
1258        let temp = tempdir()?;
1259        let workspace = temp.path();
1260        fs::write(workspace.join("lib.rs"), "fn main() {}")?;
1261
1262        let mut indexer = SimpleIndexer::new(workspace.to_path_buf());
1263        indexer.init()?;
1264
1265        let legacy_file_name = format!("{}.md", calculate_hash("legacy-path"));
1266        let legacy_file_path = workspace
1267            .join(".vtcode")
1268            .join("index")
1269            .join(&legacy_file_name);
1270        fs::write(&legacy_file_path, "# legacy")?;
1271        assert!(legacy_file_path.exists());
1272
1273        indexer.index_directory(workspace)?;
1274
1275        assert!(!legacy_file_path.exists());
1276        let files = fs::read_dir(workspace.join(".vtcode").join("index"))?
1277            .filter_map(|entry| entry.ok())
1278            .map(|entry| entry.file_name().to_string_lossy().into_owned())
1279            .collect::<Vec<_>>();
1280        assert_eq!(files, vec!["index.md".to_string()]);
1281
1282        Ok(())
1283    }
1284
1285    #[test]
1286    fn snapshot_storage_uses_default_ref_batch_persistence() -> Result<()> {
1287        #[derive(Clone, Default)]
1288        struct SnapshotMemoryStorage {
1289            snapshots: Arc<Mutex<Vec<Vec<FileIndex>>>>,
1290        }
1291
1292        impl SnapshotMemoryStorage {
1293            fn new(snapshots: Arc<Mutex<Vec<Vec<FileIndex>>>>) -> Self {
1294                Self { snapshots }
1295            }
1296        }
1297
1298        impl IndexStorage for SnapshotMemoryStorage {
1299            fn init(&self, _index_dir: &Path) -> Result<()> {
1300                Ok(())
1301            }
1302
1303            fn persist(&self, _index_dir: &Path, _entry: &FileIndex) -> Result<()> {
1304                Ok(())
1305            }
1306
1307            fn prefers_snapshot_persistence(&self) -> bool {
1308                true
1309            }
1310
1311            fn persist_batch(&self, _index_dir: &Path, entries: &[FileIndex]) -> Result<()> {
1312                self.snapshots
1313                    .lock()
1314                    .expect("lock poisoned")
1315                    .push(entries.to_vec());
1316                Ok(())
1317            }
1318        }
1319
1320        let temp = tempdir()?;
1321        let workspace = temp.path();
1322        let file_path = workspace.join("notes.txt");
1323        fs::write(&file_path, "remember this")?;
1324
1325        let snapshots = Arc::new(Mutex::new(Vec::new()));
1326        let storage = SnapshotMemoryStorage::new(snapshots.clone());
1327
1328        let config = SimpleIndexerConfig::new(workspace.to_path_buf());
1329        let mut indexer = SimpleIndexer::with_config(config).with_storage(Arc::new(storage));
1330        indexer.index_file(&file_path)?;
1331
1332        let snapshots = snapshots.lock().expect("lock poisoned");
1333        assert_eq!(snapshots.len(), 1);
1334        assert_eq!(snapshots[0].len(), 1);
1335        assert_eq!(
1336            snapshots[0][0].path,
1337            workspace.join("notes.txt").to_string_lossy().into_owned()
1338        );
1339
1340        Ok(())
1341    }
1342
1343    #[test]
1344    fn snapshot_index_file_rolls_back_cache_when_persist_fails() -> Result<()> {
1345        #[derive(Clone, Default)]
1346        struct FlakySnapshotStorage {
1347            persist_count: Arc<Mutex<usize>>,
1348        }
1349
1350        impl IndexStorage for FlakySnapshotStorage {
1351            fn init(&self, _index_dir: &Path) -> Result<()> {
1352                Ok(())
1353            }
1354
1355            fn persist(&self, _index_dir: &Path, _entry: &FileIndex) -> Result<()> {
1356                Ok(())
1357            }
1358
1359            fn prefers_snapshot_persistence(&self) -> bool {
1360                true
1361            }
1362
1363            fn persist_batch(&self, _index_dir: &Path, _entries: &[FileIndex]) -> Result<()> {
1364                let mut count = self.persist_count.lock().expect("lock poisoned");
1365                *count += 1;
1366                if *count == 2 {
1367                    anyhow::bail!("simulated snapshot persistence failure");
1368                }
1369                Ok(())
1370            }
1371        }
1372
1373        let temp = tempdir()?;
1374        let workspace = temp.path();
1375        let first = workspace.join("first.txt");
1376        let second = workspace.join("second.txt");
1377        fs::write(&first, "one")?;
1378        fs::write(&second, "two")?;
1379
1380        let config = SimpleIndexerConfig::new(workspace.to_path_buf());
1381        let storage = Arc::new(FlakySnapshotStorage::default());
1382        let mut indexer = SimpleIndexer::with_config(config).with_storage(storage);
1383
1384        indexer.index_file(&first)?;
1385        assert!(
1386            indexer
1387                .find_files("first\\.txt$")?
1388                .iter()
1389                .any(|path| path.ends_with("first.txt"))
1390        );
1391
1392        let err = indexer
1393            .index_file(&second)
1394            .expect_err("second persist should fail");
1395        assert!(
1396            err.to_string()
1397                .contains("simulated snapshot persistence failure")
1398        );
1399        assert!(
1400            indexer
1401                .find_files("first\\.txt$")?
1402                .iter()
1403                .any(|path| path.ends_with("first.txt"))
1404        );
1405        assert!(indexer.find_files("second\\.txt$")?.is_empty());
1406
1407        Ok(())
1408    }
1409}