Skip to main content

vtcode_indexer/
lib.rs

1//! Workspace-friendly file indexer extracted from VT Code.
2//!
3//! `vtcode-indexer` offers a lightweight alternative to heavyweight
4//! search/indexing stacks. It recursively walks a workspace, computes
5//! hashes, and stores per-file metadata in Markdown-friendly summaries
6//! so changes remain easy to audit in git.
7
8use anyhow::Result;
9use hashbrown::HashMap;
10use ignore::{DirEntry, Walk, WalkBuilder};
11use regex::Regex;
12use serde::{Deserialize, Serialize};
13use std::fmt::Write as FmtWrite;
14use std::fs;
15use std::io::{BufWriter, ErrorKind, Write};
16use std::path::{Path, PathBuf};
17use std::sync::Arc;
18use std::time::SystemTime;
19
20/// Persistence backend for [`SimpleIndexer`].
21pub trait IndexStorage: Send + Sync {
22    /// Prepare any directories or resources required for persistence.
23    fn init(&self, index_dir: &Path) -> Result<()>;
24
25    /// Persist an indexed file entry.
26    fn persist(&self, index_dir: &Path, entry: &FileIndex) -> Result<()>;
27
28    /// Whether this backend expects full-snapshot persistence.
29    ///
30    /// Snapshot-aware backends receive the complete in-memory index on each
31    /// update so on-disk state stays consistent across single-file and
32    /// directory indexing flows.
33    fn prefers_snapshot_persistence(&self) -> bool {
34        false
35    }
36
37    /// Remove a previously persisted file entry.
38    ///
39    /// Defaults to a no-op to keep existing custom storage backends compatible.
40    fn remove(&self, _index_dir: &Path, _file_path: &Path) -> Result<()> {
41        Ok(())
42    }
43
44    /// Persist a batch of indexed file entries.
45    ///
46    /// Defaults to calling [`IndexStorage::persist`] for each entry, keeping
47    /// existing custom storage backends compatible.
48    fn persist_batch(&self, index_dir: &Path, entries: &[FileIndex]) -> Result<()> {
49        for entry in entries {
50            self.persist(index_dir, entry)?;
51        }
52        Ok(())
53    }
54
55    /// Persist a batch of indexed file entries borrowed from the in-memory cache.
56    ///
57    /// Defaults to cloning the borrowed entries and delegating to
58    /// [`IndexStorage::persist_batch`] so existing custom storage backends remain
59    /// compatible.
60    fn persist_batch_refs(&self, index_dir: &Path, entries: &[&FileIndex]) -> Result<()> {
61        let owned = entries
62            .iter()
63            .map(|entry| (*entry).clone())
64            .collect::<Vec<_>>();
65        self.persist_batch(index_dir, &owned)
66    }
67}
68
69/// Directory traversal filter hook for [`SimpleIndexer`].
70pub trait TraversalFilter: Send + Sync {
71    /// Determine if the indexer should descend into the provided directory.
72    fn should_descend(&self, path: &Path, config: &SimpleIndexerConfig) -> bool;
73
74    /// Determine if the indexer should process the provided file.
75    fn should_index_file(&self, path: &Path, config: &SimpleIndexerConfig) -> bool;
76}
77
78/// Markdown-backed [`IndexStorage`] implementation.
79#[derive(Debug, Default, Clone)]
80pub struct MarkdownIndexStorage;
81
82impl IndexStorage for MarkdownIndexStorage {
83    fn init(&self, index_dir: &Path) -> Result<()> {
84        fs::create_dir_all(index_dir)?;
85        Ok(())
86    }
87
88    fn persist(&self, index_dir: &Path, entry: &FileIndex) -> Result<()> {
89        fs::create_dir_all(index_dir)?;
90        let file_name = format!("{}.md", calculate_hash(&entry.path));
91        let index_path = index_dir.join(file_name);
92        let file = fs::File::create(index_path)?;
93        let mut writer = BufWriter::new(file);
94        writeln!(writer, "# File Index: {}", entry.path)?;
95        writeln!(writer)?;
96        write_markdown_fields(&mut writer, entry)?;
97        writer.flush()?;
98        Ok(())
99    }
100
101    fn prefers_snapshot_persistence(&self) -> bool {
102        true
103    }
104
105    fn remove(&self, index_dir: &Path, file_path: &Path) -> Result<()> {
106        let file_name = format!(
107            "{}.md",
108            calculate_hash(file_path.to_string_lossy().as_ref())
109        );
110        let index_path = index_dir.join(file_name);
111        match fs::remove_file(index_path) {
112            Ok(()) => Ok(()),
113            Err(err) if err.kind() == ErrorKind::NotFound => Ok(()),
114            Err(err) => Err(err.into()),
115        }
116    }
117
118    fn persist_batch(&self, index_dir: &Path, entries: &[FileIndex]) -> Result<()> {
119        persist_markdown_snapshot(index_dir, entries.iter())
120    }
121
122    fn persist_batch_refs(&self, index_dir: &Path, entries: &[&FileIndex]) -> Result<()> {
123        persist_markdown_snapshot(index_dir, entries.iter().copied())
124    }
125}
126
127fn persist_markdown_snapshot<'a>(
128    index_dir: &Path,
129    entries: impl IntoIterator<Item = &'a FileIndex>,
130) -> Result<()> {
131    let entries = entries.into_iter().collect::<Vec<_>>();
132
133    fs::create_dir_all(index_dir)?;
134    let temp_path = index_dir.join(".index.md.tmp");
135    let final_path = index_dir.join("index.md");
136    let file = fs::File::create(&temp_path)?;
137    let mut writer = BufWriter::new(file);
138
139    writeln!(writer, "# Workspace File Index")?;
140    writeln!(writer)?;
141    writeln!(writer, "- **Entries**: {}", entries.len())?;
142    writeln!(writer)?;
143
144    for entry in entries {
145        write_markdown_entry(&mut writer, entry)?;
146    }
147
148    writer.flush()?;
149    fs::rename(temp_path, final_path)?;
150    cleanup_legacy_markdown_entries(index_dir)?;
151    Ok(())
152}
153
154/// Default traversal filter powered by [`SimpleIndexerConfig`].
155#[derive(Debug, Default, Clone)]
156pub struct ConfigTraversalFilter;
157
158impl TraversalFilter for ConfigTraversalFilter {
159    fn should_descend(&self, path: &Path, config: &SimpleIndexerConfig) -> bool {
160        !should_skip_dir(path, config)
161    }
162
163    fn should_index_file(&self, path: &Path, config: &SimpleIndexerConfig) -> bool {
164        if !path.is_file() {
165            return false;
166        }
167
168        // Skip hidden files when configured.
169        if config.ignore_hidden
170            && path
171                .file_name()
172                .and_then(|n| n.to_str())
173                .is_some_and(|s| s.starts_with('.'))
174        {
175            return false;
176        }
177
178        // Always skip known sensitive files regardless of config.
179        if let Some(file_name) = path.file_name().and_then(|n| n.to_str()) {
180            if vtcode_commons::exclusions::is_sensitive_file(file_name)
181                || file_name == ".gitignore"
182                || file_name == ".git"
183            {
184                return false;
185            }
186        }
187
188        true
189    }
190}
191
192/// Configuration for [`SimpleIndexer`].
193#[derive(Clone, Debug)]
194pub struct SimpleIndexerConfig {
195    workspace_root: PathBuf,
196    index_dir: PathBuf,
197    ignore_hidden: bool,
198    excluded_dirs: Vec<PathBuf>,
199    allowed_dirs: Vec<PathBuf>,
200}
201
202impl SimpleIndexerConfig {
203    /// Builds a configuration using VT Code's legacy layout as defaults.
204    pub fn new(workspace_root: PathBuf) -> Self {
205        let index_dir = workspace_root.join(".vtcode").join("index");
206        let vtcode_dir = workspace_root.join(".vtcode");
207        let external_dir = vtcode_dir.join("external");
208
209        let mut excluded_dirs: Vec<PathBuf> = vtcode_commons::exclusions::DEFAULT_EXCLUDED_DIRS
210            .iter()
211            .map(|name| workspace_root.join(name))
212            .collect();
213        excluded_dirs.push(index_dir.clone());
214        excluded_dirs.push(vtcode_dir);
215
216        excluded_dirs.dedup();
217
218        Self {
219            workspace_root,
220            index_dir,
221            ignore_hidden: true,
222            excluded_dirs,
223            allowed_dirs: vec![external_dir],
224        }
225    }
226
227    /// Updates the index directory used for persisted metadata.
228    pub fn with_index_dir(mut self, index_dir: impl Into<PathBuf>) -> Self {
229        let index_dir = index_dir.into();
230        self.index_dir = index_dir.clone();
231        self.push_unique_excluded(index_dir);
232        self
233    }
234
235    /// Adds an allowed directory that should be indexed even if hidden or inside an excluded parent.
236    pub fn add_allowed_dir(mut self, path: impl Into<PathBuf>) -> Self {
237        let path = path.into();
238        if !self.allowed_dirs.iter().any(|existing| existing == &path) {
239            self.allowed_dirs.push(path);
240        }
241        self
242    }
243
244    /// Adds an additional excluded directory to skip during traversal.
245    pub fn add_excluded_dir(mut self, path: impl Into<PathBuf>) -> Self {
246        let path = path.into();
247        self.push_unique_excluded(path);
248        self
249    }
250
251    /// Toggles whether hidden directories (prefix `.`) are ignored.
252    pub fn ignore_hidden(mut self, ignore_hidden: bool) -> Self {
253        self.ignore_hidden = ignore_hidden;
254        self
255    }
256
257    /// Workspace root accessor.
258    pub fn workspace_root(&self) -> &Path {
259        &self.workspace_root
260    }
261
262    /// Index directory accessor.
263    pub fn index_dir(&self) -> &Path {
264        &self.index_dir
265    }
266
267    fn push_unique_excluded(&mut self, path: PathBuf) {
268        if !self.excluded_dirs.iter().any(|existing| existing == &path) {
269            self.excluded_dirs.push(path);
270        }
271    }
272}
273
274/// Simple file index entry.
275#[derive(Debug, Clone, Serialize, Deserialize)]
276pub struct FileIndex {
277    /// File path.
278    pub path: String,
279    /// File content hash for change detection.
280    pub hash: String,
281    /// Last modified timestamp.
282    pub modified: u64,
283    /// File size.
284    pub size: u64,
285    /// Language/extension.
286    pub language: String,
287    /// Simple tags.
288    pub tags: Vec<String>,
289}
290
291/// Simple search result.
292#[derive(Debug, Clone, Serialize, Deserialize)]
293pub struct SearchResult {
294    pub file_path: String,
295    pub line_number: usize,
296    pub line_content: String,
297    pub matches: Vec<String>,
298}
299
300/// Simple file indexer.
301pub struct SimpleIndexer {
302    config: SimpleIndexerConfig,
303    index_cache: HashMap<String, FileIndex>,
304    storage: Arc<dyn IndexStorage>,
305    filter: Arc<dyn TraversalFilter>,
306}
307
308impl SimpleIndexer {
309    /// Create a new simple indexer with default VT Code paths.
310    pub fn new(workspace_root: PathBuf) -> Self {
311        Self::with_components(
312            SimpleIndexerConfig::new(workspace_root),
313            Arc::new(MarkdownIndexStorage),
314            Arc::new(ConfigTraversalFilter),
315        )
316    }
317
318    /// Create a simple indexer with the provided configuration.
319    pub fn with_config(config: SimpleIndexerConfig) -> Self {
320        Self::with_components(
321            config,
322            Arc::new(MarkdownIndexStorage),
323            Arc::new(ConfigTraversalFilter),
324        )
325    }
326
327    /// Create a new simple indexer using a custom index directory.
328    pub fn with_index_dir(workspace_root: PathBuf, index_dir: PathBuf) -> Self {
329        let config = SimpleIndexerConfig::new(workspace_root).with_index_dir(index_dir);
330        Self::with_config(config)
331    }
332
333    /// Create an indexer with explicit storage and traversal filter implementations.
334    pub fn with_components(
335        config: SimpleIndexerConfig,
336        storage: Arc<dyn IndexStorage>,
337        filter: Arc<dyn TraversalFilter>,
338    ) -> Self {
339        Self {
340            config,
341            index_cache: HashMap::new(),
342            storage,
343            filter,
344        }
345    }
346
347    /// Replace the storage backend used to persist index entries.
348    pub fn with_storage(self, storage: Arc<dyn IndexStorage>) -> Self {
349        Self { storage, ..self }
350    }
351
352    /// Replace the traversal filter used to decide which files and directories are indexed.
353    pub fn with_filter(self, filter: Arc<dyn TraversalFilter>) -> Self {
354        Self { filter, ..self }
355    }
356
357    /// Initialize the index directory.
358    pub fn init(&self) -> Result<()> {
359        self.storage.init(self.config.index_dir())
360    }
361
362    /// Get the workspace root path.
363    pub fn workspace_root(&self) -> &Path {
364        self.config.workspace_root()
365    }
366
367    /// Get the index directory used for persisted metadata.
368    pub fn index_dir(&self) -> &Path {
369        self.config.index_dir()
370    }
371
372    /// Index a single file.
373    pub fn index_file(&mut self, file_path: &Path) -> Result<()> {
374        let cache_key = file_path.to_string_lossy().into_owned();
375
376        if self.storage.prefers_snapshot_persistence() {
377            let next_entry = if file_path.exists() && self.should_process_file_path(file_path) {
378                self.build_file_index(file_path)?
379            } else {
380                None
381            };
382
383            self.apply_snapshot_file_update(cache_key, next_entry)?;
384            return Ok(());
385        }
386
387        if !file_path.exists() || !self.should_process_file_path(file_path) {
388            self.index_cache.remove(cache_key.as_str());
389            self.storage.remove(self.config.index_dir(), file_path)?;
390            return Ok(());
391        }
392
393        if let Some(index) = self.build_file_index(file_path)? {
394            self.storage.persist(self.config.index_dir(), &index)?;
395            self.index_cache.insert(index.path.clone(), index);
396        } else {
397            self.index_cache.remove(cache_key.as_str());
398            self.storage.remove(self.config.index_dir(), file_path)?;
399        }
400
401        Ok(())
402    }
403
404    /// Index all files in directory recursively.
405    /// Respects .gitignore, .ignore, and other ignore files.
406    /// SECURITY: Always skips hidden files and sensitive data (.env, .git, etc.)
407    pub fn index_directory(&mut self, dir_path: &Path) -> Result<()> {
408        let walker = self.build_walker(dir_path);
409
410        let mut entries = Vec::new();
411
412        for entry in walker.filter_map(|e| e.ok()) {
413            let path = entry.path();
414
415            // Only index files, not directories
416            if entry.file_type().is_some_and(|ft| ft.is_file())
417                && let Some(index) = self.build_file_index(path)?
418            {
419                entries.push(index);
420            }
421        }
422
423        if self.storage.prefers_snapshot_persistence() {
424            self.apply_snapshot_directory_update(dir_path, &entries)?;
425        } else {
426            entries.sort_unstable_by(|left, right| left.path.cmp(&right.path));
427            self.storage
428                .persist_batch(self.config.index_dir(), &entries)?;
429        }
430
431        self.replace_cached_entries(dir_path, &entries);
432
433        Ok(())
434    }
435
436    /// Discover all files in directory recursively without indexing them.
437    /// This is much faster than `index_directory` as it avoids hashing and persistence.
438    pub fn discover_files(&self, dir_path: &Path) -> Vec<String> {
439        let walker = self.build_walker(dir_path);
440
441        let mut files = walker
442            .filter_map(|e| e.ok())
443            .filter(|e| {
444                if !e.file_type().is_some_and(|ft| ft.is_file()) {
445                    return false;
446                }
447
448                self.should_process_file_path(e.path())
449            })
450            .map(|e| e.path().to_string_lossy().into_owned())
451            .collect::<Vec<_>>();
452        files.sort_unstable();
453        files
454    }
455
456    /// Internal helper for regex-based file content search.
457    /// Used by both `search()` and `grep()` to avoid code duplication.
458    fn search_files_internal(
459        &self,
460        regex: &Regex,
461        path_filter: Option<&str>,
462        extract_matches: bool,
463    ) -> Vec<SearchResult> {
464        let mut results = Vec::with_capacity(self.index_cache.len());
465
466        for file_path in self.index_cache.keys() {
467            if path_filter.is_some_and(|filter| !file_path.contains(filter)) {
468                continue;
469            }
470
471            if let Ok(content) = fs::read_to_string(file_path) {
472                for (line_num, line) in content.lines().enumerate() {
473                    if regex.is_match(line) {
474                        let matches = if extract_matches {
475                            regex
476                                .find_iter(line)
477                                .map(|m| m.as_str().to_string())
478                                .collect()
479                        } else {
480                            vec![line.to_string()]
481                        };
482
483                        results.push(SearchResult {
484                            file_path: file_path.clone(),
485                            line_number: line_num + 1,
486                            line_content: line.to_string(),
487                            matches,
488                        });
489                    }
490                }
491            }
492        }
493
494        results.sort_unstable_by(|left, right| {
495            left.file_path
496                .cmp(&right.file_path)
497                .then_with(|| left.line_number.cmp(&right.line_number))
498        });
499        results
500    }
501
502    /// Search files using regex pattern.
503    pub fn search(&self, pattern: &str, path_filter: Option<&str>) -> Result<Vec<SearchResult>> {
504        let regex = Regex::new(pattern)?;
505        Ok(self.search_files_internal(&regex, path_filter, true))
506    }
507
508    /// Find files by name pattern.
509    pub fn find_files(&self, pattern: &str) -> Result<Vec<String>> {
510        let regex = Regex::new(pattern)?;
511        let mut results = Vec::with_capacity(self.index_cache.len());
512
513        for file_path in self.index_cache.keys() {
514            if regex.is_match(file_path) {
515                results.push(file_path.clone());
516            }
517        }
518
519        results.sort_unstable();
520        Ok(results)
521    }
522
523    /// Get all indexed files without pattern matching.
524    /// This is more efficient than using find_files(".*").
525    pub fn all_files(&self) -> Vec<String> {
526        let mut files = self.index_cache.keys().cloned().collect::<Vec<_>>();
527        files.sort_unstable();
528        files
529    }
530
531    /// Get file content with line numbers.
532    pub fn get_file_content(
533        &self,
534        file_path: &str,
535        start_line: Option<usize>,
536        end_line: Option<usize>,
537    ) -> Result<String> {
538        let content = fs::read_to_string(file_path)?;
539        let start = start_line.unwrap_or(1).max(1);
540        let end = end_line.unwrap_or(usize::MAX);
541
542        if start > end {
543            return Ok(String::new());
544        }
545
546        let mut result = String::new();
547        for (line_number, line) in content.lines().enumerate() {
548            let line_number = line_number + 1;
549            if line_number < start {
550                continue;
551            }
552            if line_number > end {
553                break;
554            }
555            writeln!(&mut result, "{line_number}: {line}")?;
556        }
557
558        Ok(result)
559    }
560
561    /// List files in directory (like ls).
562    pub fn list_files(&self, dir_path: &str, show_hidden: bool) -> Result<Vec<String>> {
563        let path = Path::new(dir_path);
564        if !path.exists() {
565            return Ok(vec![]);
566        }
567
568        let mut files = Vec::new();
569
570        for entry in fs::read_dir(path)? {
571            let entry = entry?;
572            let file_name = entry.file_name().to_string_lossy().into_owned();
573
574            if !show_hidden && file_name.starts_with('.') {
575                continue;
576            }
577
578            files.push(file_name);
579        }
580
581        files.sort_unstable();
582        Ok(files)
583    }
584
585    /// Grep-like search (like grep command).
586    pub fn grep(&self, pattern: &str, file_pattern: Option<&str>) -> Result<Vec<SearchResult>> {
587        let regex = Regex::new(pattern)?;
588        Ok(self.search_files_internal(&regex, file_pattern, false))
589    }
590
591    fn is_allowed_path(&self, path: &Path) -> bool {
592        self.config
593            .allowed_dirs
594            .iter()
595            .any(|allowed| path.starts_with(allowed))
596    }
597
598    #[inline]
599    fn get_modified_time(&self, file_path: &Path) -> Result<u64> {
600        let metadata = fs::metadata(file_path)?;
601        let modified = metadata.modified()?;
602        Ok(modified.duration_since(SystemTime::UNIX_EPOCH)?.as_secs())
603    }
604
605    #[inline]
606    fn detect_language(&self, file_path: &Path) -> String {
607        file_path
608            .extension()
609            .and_then(|ext| ext.to_str())
610            .unwrap_or("unknown")
611            .to_string()
612    }
613
614    fn build_file_index(&self, file_path: &Path) -> Result<Option<FileIndex>> {
615        if !self.should_process_file_path(file_path) {
616            return Ok(None);
617        }
618
619        let content = match fs::read_to_string(file_path) {
620            Ok(text) => text,
621            Err(err) => {
622                if err.kind() == ErrorKind::InvalidData {
623                    return Ok(None);
624                }
625                return Err(err.into());
626            }
627        };
628
629        let index = FileIndex {
630            path: file_path.to_string_lossy().into_owned(),
631            hash: calculate_hash(&content),
632            modified: self.get_modified_time(file_path)?,
633            size: content.len() as u64,
634            language: self.detect_language(file_path),
635            tags: vec![],
636        };
637
638        Ok(Some(index))
639    }
640
641    #[inline]
642    fn is_excluded_path(&self, path: &Path) -> bool {
643        self.config
644            .excluded_dirs
645            .iter()
646            .any(|excluded| path.starts_with(excluded))
647    }
648
649    #[inline]
650    fn should_index_file_path(&self, path: &Path) -> bool {
651        self.filter.should_index_file(path, &self.config)
652    }
653
654    #[inline]
655    fn should_process_file_path(&self, path: &Path) -> bool {
656        if self.is_allowed_path(path) {
657            return self.should_index_file_path(path);
658        }
659
660        !self.is_excluded_path(path) && self.should_index_file_path(path)
661    }
662
663    fn build_walker(&self, dir_path: &Path) -> Walk {
664        let walk_root = dir_path.to_path_buf();
665        let config = self.config.clone();
666        let filter = Arc::clone(&self.filter);
667
668        let mut builder = WalkBuilder::new(dir_path);
669        builder
670            .hidden(false)
671            .git_ignore(true)
672            .git_global(true)
673            .git_exclude(true)
674            .ignore(true)
675            .parents(true);
676        builder.filter_entry(move |entry| {
677            should_visit_entry(entry, walk_root.as_path(), &config, filter.as_ref())
678        });
679        builder.build()
680    }
681
682    fn replace_cached_entries(&mut self, dir_path: &Path, entries: &[FileIndex]) {
683        self.index_cache
684            .retain(|path, _| !Path::new(path).starts_with(dir_path));
685
686        self.index_cache.extend(
687            entries
688                .iter()
689                .cloned()
690                .map(|entry| (entry.path.clone(), entry)),
691        );
692    }
693
694    fn apply_snapshot_file_update(
695        &mut self,
696        cache_key: String,
697        next_entry: Option<FileIndex>,
698    ) -> Result<()> {
699        let previous_entry = match next_entry {
700            Some(entry) => self.index_cache.insert(cache_key.clone(), entry),
701            None => self.index_cache.remove(cache_key.as_str()),
702        };
703
704        if let Err(err) = self.persist_current_snapshot() {
705            match previous_entry {
706                Some(entry) => {
707                    self.index_cache.insert(cache_key, entry);
708                }
709                None => {
710                    self.index_cache.remove(cache_key.as_str());
711                }
712            }
713            return Err(err);
714        }
715
716        Ok(())
717    }
718
719    fn apply_snapshot_directory_update(
720        &mut self,
721        dir_path: &Path,
722        entries: &[FileIndex],
723    ) -> Result<()> {
724        let previous_entries = self.take_cached_entries(dir_path);
725        self.index_cache.extend(
726            entries
727                .iter()
728                .cloned()
729                .map(|entry| (entry.path.clone(), entry)),
730        );
731
732        if let Err(err) = self.persist_current_snapshot() {
733            self.index_cache
734                .retain(|path, _| !Path::new(path).starts_with(dir_path));
735            self.index_cache.extend(
736                previous_entries
737                    .into_iter()
738                    .map(|entry| (entry.path.clone(), entry)),
739            );
740            return Err(err);
741        }
742
743        Ok(())
744    }
745
746    fn take_cached_entries(&mut self, dir_path: &Path) -> Vec<FileIndex> {
747        let keys = self
748            .index_cache
749            .keys()
750            .filter(|path| Path::new(path).starts_with(dir_path))
751            .cloned()
752            .collect::<Vec<_>>();
753
754        keys.into_iter()
755            .filter_map(|path| self.index_cache.remove(path.as_str()))
756            .collect()
757    }
758
759    fn persist_current_snapshot(&self) -> Result<()> {
760        let mut snapshot = self.index_cache.values().collect::<Vec<_>>();
761        snapshot.sort_unstable_by(|left, right| left.path.cmp(&right.path));
762        self.storage
763            .persist_batch_refs(self.config.index_dir(), &snapshot)
764    }
765}
766
767impl Clone for SimpleIndexer {
768    fn clone(&self) -> Self {
769        Self {
770            config: self.config.clone(),
771            index_cache: self.index_cache.clone(),
772            storage: self.storage.clone(),
773            filter: self.filter.clone(),
774        }
775    }
776}
777
778fn should_skip_dir(path: &Path, config: &SimpleIndexerConfig) -> bool {
779    if is_allowed_path_or_ancestor(path, config) {
780        return false;
781    }
782
783    if config
784        .excluded_dirs
785        .iter()
786        .any(|excluded| path.starts_with(excluded))
787    {
788        return true;
789    }
790
791    if config.ignore_hidden
792        && path
793            .file_name()
794            .and_then(|name| name.to_str())
795            .is_some_and(|name_str| name_str.starts_with('.'))
796    {
797        return true;
798    }
799
800    false
801}
802
803fn is_allowed_path_or_ancestor(path: &Path, config: &SimpleIndexerConfig) -> bool {
804    config
805        .allowed_dirs
806        .iter()
807        .any(|allowed| path.starts_with(allowed) || allowed.starts_with(path))
808}
809
810fn should_visit_entry(
811    entry: &DirEntry,
812    walk_root: &Path,
813    config: &SimpleIndexerConfig,
814    filter: &dyn TraversalFilter,
815) -> bool {
816    if entry.path() == walk_root {
817        return true;
818    }
819
820    if !entry
821        .file_type()
822        .is_some_and(|file_type| file_type.is_dir())
823    {
824        return true;
825    }
826
827    filter.should_descend(entry.path(), config)
828}
829
830#[inline]
831fn calculate_hash(content: &str) -> String {
832    vtcode_commons::utils::calculate_sha256(content.as_bytes())
833}
834
835fn write_markdown_entry(writer: &mut impl Write, entry: &FileIndex) -> std::io::Result<()> {
836    writeln!(writer, "## {}", entry.path)?;
837    writeln!(writer)?;
838    write_markdown_fields(writer, entry)?;
839    writeln!(writer)?;
840    Ok(())
841}
842
843fn write_markdown_fields(writer: &mut impl Write, entry: &FileIndex) -> std::io::Result<()> {
844    writeln!(writer, "- **Path**: {}", entry.path)?;
845    writeln!(writer, "- **Hash**: {}", entry.hash)?;
846    writeln!(writer, "- **Modified**: {}", entry.modified)?;
847    writeln!(writer, "- **Size**: {} bytes", entry.size)?;
848    writeln!(writer, "- **Language**: {}", entry.language)?;
849    writeln!(writer, "- **Tags**: {}", entry.tags.join(", "))?;
850    Ok(())
851}
852
853fn cleanup_legacy_markdown_entries(index_dir: &Path) -> Result<()> {
854    for entry in fs::read_dir(index_dir)? {
855        let entry = entry?;
856        let file_name = entry.file_name();
857        let file_name = file_name.to_string_lossy();
858        if is_legacy_markdown_entry_name(file_name.as_ref()) {
859            fs::remove_file(entry.path())?;
860        }
861    }
862    Ok(())
863}
864
/// Returns `true` for file names of the form `<64 hexadecimal digits>.md`.
#[inline]
fn is_legacy_markdown_entry_name(file_name: &str) -> bool {
    file_name
        .strip_suffix(".md")
        .is_some_and(|stem| stem.len() == 64 && stem.bytes().all(|b| b.is_ascii_hexdigit()))
}
872
873#[cfg(test)]
874mod tests {
875    use super::*;
876    use std::fs;
877    use std::sync::{Arc, Mutex};
878    use tempfile::tempdir;
879
880    #[test]
881    fn skips_hidden_directories_by_default() -> Result<()> {
882        let temp = tempdir()?;
883        let workspace = temp.path();
884        let hidden_dir = workspace.join(".private");
885        fs::create_dir_all(&hidden_dir)?;
886        fs::write(hidden_dir.join("secret.txt"), "classified")?;
887
888        let visible_dir = workspace.join("src");
889        fs::create_dir_all(&visible_dir)?;
890        fs::write(visible_dir.join("lib.rs"), "fn main() {}")?;
891
892        let mut indexer = SimpleIndexer::new(workspace.to_path_buf());
893        indexer.init()?;
894        indexer.index_directory(workspace)?;
895
896        assert!(indexer.find_files("secret\\.txt$")?.is_empty());
897        assert!(!indexer.find_files("lib\\.rs$")?.is_empty());
898
899        Ok(())
900    }
901
902    #[test]
903    fn can_include_hidden_directories_when_configured() -> Result<()> {
904        let temp = tempdir()?;
905        let workspace = temp.path();
906        let hidden_dir = workspace.join(".cache");
907        fs::create_dir_all(&hidden_dir)?;
908        fs::write(hidden_dir.join("data.log"), "details")?;
909
910        let config = SimpleIndexerConfig::new(workspace.to_path_buf()).ignore_hidden(false);
911        let mut indexer = SimpleIndexer::with_config(config);
912        indexer.init()?;
913        indexer.index_directory(workspace)?;
914
915        let results = indexer.find_files("data\\.log$")?;
916        assert_eq!(results.len(), 1);
917
918        Ok(())
919    }
920
921    #[test]
922    fn indexes_allowed_directories_inside_hidden_excluded_parents() -> Result<()> {
923        let temp = tempdir()?;
924        let workspace = temp.path();
925        let allowed_dir = workspace.join(".vtcode").join("external");
926        fs::create_dir_all(&allowed_dir)?;
927        fs::write(allowed_dir.join("plugin.toml"), "name = 'demo'")?;
928
929        let mut indexer = SimpleIndexer::new(workspace.to_path_buf());
930        indexer.init()?;
931        indexer.index_directory(workspace)?;
932
933        let results = indexer.find_files("plugin\\.toml$")?;
934        assert_eq!(results.len(), 1);
935
936        Ok(())
937    }
938
939    #[test]
940    fn reindexing_prunes_deleted_files_from_cache() -> Result<()> {
941        let temp = tempdir()?;
942        let workspace = temp.path();
943        let file_path = workspace.join("notes.txt");
944        fs::write(&file_path, "remember this")?;
945
946        let mut indexer = SimpleIndexer::new(workspace.to_path_buf());
947        indexer.init()?;
948        indexer.index_directory(workspace)?;
949        assert_eq!(indexer.find_files("notes\\.txt$")?.len(), 1);
950
951        fs::remove_file(&file_path)?;
952        indexer.index_directory(workspace)?;
953
954        assert!(indexer.find_files("notes\\.txt$")?.is_empty());
955        assert!(indexer.all_files().is_empty());
956
957        Ok(())
958    }
959
960    #[test]
961    fn index_file_skips_excluded_paths() -> Result<()> {
962        let temp = tempdir()?;
963        let workspace = temp.path();
964        let index_dir = workspace.join(".vtcode").join("index");
965        fs::create_dir_all(&index_dir)?;
966        let generated_index = index_dir.join("index.md");
967        fs::write(&generated_index, "# generated")?;
968
969        let mut indexer = SimpleIndexer::new(workspace.to_path_buf());
970        indexer.init()?;
971        indexer.index_file(&generated_index)?;
972
973        assert!(indexer.all_files().is_empty());
974
975        Ok(())
976    }
977
978    #[test]
979    fn index_file_removes_stale_entry_when_file_becomes_unreadable() -> Result<()> {
980        let temp = tempdir()?;
981        let workspace = temp.path();
982        let file_path = workspace.join("notes.txt");
983        fs::write(&file_path, "remember this")?;
984
985        let mut indexer = SimpleIndexer::new(workspace.to_path_buf());
986        indexer.init()?;
987        indexer.index_file(&file_path)?;
988        assert!(
989            indexer
990                .find_files("notes\\.txt$")?
991                .iter()
992                .any(|file| file.ends_with("notes.txt"))
993        );
994
995        fs::write(&file_path, [0xFF, 0xFE, 0xFD])?;
996        indexer.index_file(&file_path)?;
997
998        assert!(indexer.find_files("notes\\.txt$")?.is_empty());
999
1000        let index_content =
1001            fs::read_to_string(workspace.join(".vtcode").join("index").join("index.md"))?;
1002        assert!(!index_content.contains(file_path.to_string_lossy().as_ref()));
1003
1004        Ok(())
1005    }
1006
1007    #[test]
1008    fn index_file_maintains_markdown_snapshot_across_updates() -> Result<()> {
1009        let temp = tempdir()?;
1010        let workspace = temp.path();
1011        let first = workspace.join("first.txt");
1012        let second = workspace.join("second.txt");
1013        fs::write(&first, "one")?;
1014        fs::write(&second, "two")?;
1015
1016        let mut indexer = SimpleIndexer::new(workspace.to_path_buf());
1017        indexer.init()?;
1018        indexer.index_file(&first)?;
1019        indexer.index_file(&second)?;
1020
1021        let index_dir = workspace.join(".vtcode").join("index");
1022        let files = fs::read_dir(&index_dir)?
1023            .filter_map(|entry| entry.ok())
1024            .map(|entry| entry.file_name().to_string_lossy().into_owned())
1025            .collect::<Vec<_>>();
1026        assert_eq!(files, vec!["index.md".to_string()]);
1027
1028        let index_content = fs::read_to_string(index_dir.join("index.md"))?;
1029        assert!(index_content.contains(first.to_string_lossy().as_ref()));
1030        assert!(index_content.contains(second.to_string_lossy().as_ref()));
1031
1032        Ok(())
1033    }
1034
1035    #[test]
1036    fn index_directory_writes_markdown_snapshot_without_manual_init() -> Result<()> {
1037        let temp = tempdir()?;
1038        let workspace = temp.path();
1039        fs::write(workspace.join("notes.txt"), "remember this")?;
1040
1041        let mut indexer = SimpleIndexer::new(workspace.to_path_buf());
1042        indexer.index_directory(workspace)?;
1043
1044        let index_content =
1045            fs::read_to_string(workspace.join(".vtcode").join("index").join("index.md"))?;
1046        assert!(index_content.contains(workspace.join("notes.txt").to_string_lossy().as_ref()));
1047
1048        Ok(())
1049    }
1050
1051    #[test]
1052    fn get_file_content_clamps_ranges_without_panicking() -> Result<()> {
1053        let temp = tempdir()?;
1054        let workspace = temp.path();
1055        let file_path = workspace.join("notes.txt");
1056        fs::write(&file_path, "first\nsecond")?;
1057
1058        let indexer = SimpleIndexer::new(workspace.to_path_buf());
1059        let file_path = file_path.to_string_lossy().into_owned();
1060
1061        assert_eq!(indexer.get_file_content(&file_path, Some(5), None)?, "");
1062        assert_eq!(
1063            indexer.get_file_content(&file_path, Some(0), Some(1))?,
1064            "1: first\n"
1065        );
1066        assert_eq!(indexer.get_file_content(&file_path, Some(2), Some(1))?, "");
1067
1068        Ok(())
1069    }
1070
1071    #[test]
1072    fn supports_custom_storage_backends() -> Result<()> {
1073        #[derive(Clone, Default)]
1074        struct MemoryStorage {
1075            records: Arc<Mutex<Vec<FileIndex>>>,
1076        }
1077
1078        impl MemoryStorage {
1079            fn new(records: Arc<Mutex<Vec<FileIndex>>>) -> Self {
1080                Self { records }
1081            }
1082        }
1083
1084        impl IndexStorage for MemoryStorage {
1085            fn init(&self, _index_dir: &Path) -> Result<()> {
1086                Ok(())
1087            }
1088
1089            fn persist(&self, _index_dir: &Path, entry: &FileIndex) -> Result<()> {
1090                let mut guard = self.records.lock().expect("lock poisoned");
1091                guard.push(entry.clone());
1092                Ok(())
1093            }
1094        }
1095
1096        let temp = tempdir()?;
1097        let workspace = temp.path();
1098        fs::write(workspace.join("notes.txt"), "remember this")?;
1099
1100        let records: Arc<Mutex<Vec<FileIndex>>> = Arc::new(Mutex::new(Vec::new()));
1101        let storage = MemoryStorage::new(records.clone());
1102
1103        let config = SimpleIndexerConfig::new(workspace.to_path_buf());
1104        let mut indexer = SimpleIndexer::with_config(config).with_storage(Arc::new(storage));
1105        indexer.init()?;
1106        indexer.index_directory(workspace)?;
1107
1108        let entries = records.lock().expect("lock poisoned");
1109        assert_eq!(entries.len(), 1);
1110        assert_eq!(
1111            entries[0].path,
1112            workspace.join("notes.txt").to_string_lossy().into_owned()
1113        );
1114
1115        Ok(())
1116    }
1117
1118    #[test]
1119    fn custom_filters_can_skip_files() -> Result<()> {
1120        #[derive(Default)]
1121        struct SkipRustFilter {
1122            inner: ConfigTraversalFilter,
1123        }
1124
1125        impl TraversalFilter for SkipRustFilter {
1126            fn should_descend(&self, path: &Path, config: &SimpleIndexerConfig) -> bool {
1127                self.inner.should_descend(path, config)
1128            }
1129
1130            fn should_index_file(&self, path: &Path, config: &SimpleIndexerConfig) -> bool {
1131                if path
1132                    .extension()
1133                    .and_then(|ext| ext.to_str())
1134                    .is_some_and(|ext| ext.eq_ignore_ascii_case("rs"))
1135                {
1136                    return false;
1137                }
1138
1139                self.inner.should_index_file(path, config)
1140            }
1141        }
1142
1143        let temp = tempdir()?;
1144        let workspace = temp.path();
1145        fs::write(workspace.join("lib.rs"), "fn main() {}")?;
1146        fs::write(workspace.join("README.md"), "# Notes")?;
1147
1148        let config = SimpleIndexerConfig::new(workspace.to_path_buf());
1149        let mut indexer =
1150            SimpleIndexer::with_config(config).with_filter(Arc::new(SkipRustFilter::default()));
1151        indexer.init()?;
1152        indexer.index_directory(workspace)?;
1153
1154        assert!(indexer.find_files("lib\\.rs$")?.is_empty());
1155        assert!(!indexer.find_files("README\\.md$")?.is_empty());
1156
1157        Ok(())
1158    }
1159
1160    #[test]
1161    fn custom_filters_can_skip_directories() -> Result<()> {
1162        #[derive(Default)]
1163        struct SkipGeneratedFilter {
1164            inner: ConfigTraversalFilter,
1165        }
1166
1167        impl TraversalFilter for SkipGeneratedFilter {
1168            fn should_descend(&self, path: &Path, config: &SimpleIndexerConfig) -> bool {
1169                if path.ends_with("generated") {
1170                    return false;
1171                }
1172
1173                self.inner.should_descend(path, config)
1174            }
1175
1176            fn should_index_file(&self, path: &Path, config: &SimpleIndexerConfig) -> bool {
1177                self.inner.should_index_file(path, config)
1178            }
1179        }
1180
1181        let temp = tempdir()?;
1182        let workspace = temp.path();
1183        let generated_dir = workspace.join("generated");
1184        fs::create_dir_all(&generated_dir)?;
1185        fs::write(generated_dir.join("skip.txt"), "ignore me")?;
1186        fs::write(workspace.join("README.md"), "# Notes")?;
1187
1188        let config = SimpleIndexerConfig::new(workspace.to_path_buf());
1189        let indexer = SimpleIndexer::with_config(config)
1190            .with_filter(Arc::new(SkipGeneratedFilter::default()));
1191        let files = indexer.discover_files(workspace);
1192
1193        assert!(!files.iter().any(|file| file.ends_with("skip.txt")));
1194        assert!(files.iter().any(|file| file.ends_with("README.md")));
1195
1196        Ok(())
1197    }
1198
1199    #[test]
1200    fn indexing_multiple_directories_preserves_existing_cache_entries() -> Result<()> {
1201        let temp = tempdir()?;
1202        let workspace = temp.path();
1203        let src_dir = workspace.join("src");
1204        let docs_dir = workspace.join("docs");
1205        fs::create_dir_all(&src_dir)?;
1206        fs::create_dir_all(&docs_dir)?;
1207        fs::write(src_dir.join("lib.rs"), "fn main() {}")?;
1208        fs::write(docs_dir.join("guide.md"), "# Guide")?;
1209
1210        let mut indexer = SimpleIndexer::new(workspace.to_path_buf());
1211        indexer.init()?;
1212        indexer.index_directory(&src_dir)?;
1213        indexer.index_directory(&docs_dir)?;
1214
1215        assert!(
1216            indexer
1217                .find_files("lib\\.rs$")?
1218                .iter()
1219                .any(|file| file.ends_with("lib.rs"))
1220        );
1221        assert!(
1222            indexer
1223                .find_files("guide\\.md$")?
1224                .iter()
1225                .any(|file| file.ends_with("guide.md"))
1226        );
1227
1228        let index_content =
1229            fs::read_to_string(workspace.join(".vtcode").join("index").join("index.md"))?;
1230        assert!(index_content.contains(src_dir.join("lib.rs").to_string_lossy().as_ref()));
1231        assert!(index_content.contains(docs_dir.join("guide.md").to_string_lossy().as_ref()));
1232
1233        Ok(())
1234    }
1235
1236    #[test]
1237    fn batch_indexing_writes_single_markdown_file() -> Result<()> {
1238        let temp = tempdir()?;
1239        let workspace = temp.path();
1240        fs::write(workspace.join("lib.rs"), "fn main() {}")?;
1241        fs::write(workspace.join("README.md"), "# Notes")?;
1242
1243        let mut indexer = SimpleIndexer::new(workspace.to_path_buf());
1244        indexer.init()?;
1245        indexer.index_directory(workspace)?;
1246
1247        let index_dir = workspace.join(".vtcode").join("index");
1248        let files = fs::read_dir(&index_dir)?
1249            .filter_map(|entry| entry.ok())
1250            .map(|entry| entry.file_name().to_string_lossy().into_owned())
1251            .collect::<Vec<_>>();
1252        assert_eq!(files, vec!["index.md".to_string()]);
1253
1254        let index_content = fs::read_to_string(index_dir.join("index.md"))?;
1255        assert!(index_content.contains(workspace.join("lib.rs").to_string_lossy().as_ref()));
1256        assert!(index_content.contains(workspace.join("README.md").to_string_lossy().as_ref()));
1257
1258        Ok(())
1259    }
1260
1261    #[test]
1262    fn batch_indexing_removes_legacy_hashed_entries() -> Result<()> {
1263        let temp = tempdir()?;
1264        let workspace = temp.path();
1265        fs::write(workspace.join("lib.rs"), "fn main() {}")?;
1266
1267        let mut indexer = SimpleIndexer::new(workspace.to_path_buf());
1268        indexer.init()?;
1269
1270        let legacy_file_name = format!("{}.md", calculate_hash("legacy-path"));
1271        let legacy_file_path = workspace
1272            .join(".vtcode")
1273            .join("index")
1274            .join(&legacy_file_name);
1275        fs::write(&legacy_file_path, "# legacy")?;
1276        assert!(legacy_file_path.exists());
1277
1278        indexer.index_directory(workspace)?;
1279
1280        assert!(!legacy_file_path.exists());
1281        let files = fs::read_dir(workspace.join(".vtcode").join("index"))?
1282            .filter_map(|entry| entry.ok())
1283            .map(|entry| entry.file_name().to_string_lossy().into_owned())
1284            .collect::<Vec<_>>();
1285        assert_eq!(files, vec!["index.md".to_string()]);
1286
1287        Ok(())
1288    }
1289
1290    #[test]
1291    fn snapshot_storage_uses_default_ref_batch_persistence() -> Result<()> {
1292        #[derive(Clone, Default)]
1293        struct SnapshotMemoryStorage {
1294            snapshots: Arc<Mutex<Vec<Vec<FileIndex>>>>,
1295        }
1296
1297        impl SnapshotMemoryStorage {
1298            fn new(snapshots: Arc<Mutex<Vec<Vec<FileIndex>>>>) -> Self {
1299                Self { snapshots }
1300            }
1301        }
1302
1303        impl IndexStorage for SnapshotMemoryStorage {
1304            fn init(&self, _index_dir: &Path) -> Result<()> {
1305                Ok(())
1306            }
1307
1308            fn persist(&self, _index_dir: &Path, _entry: &FileIndex) -> Result<()> {
1309                Ok(())
1310            }
1311
1312            fn prefers_snapshot_persistence(&self) -> bool {
1313                true
1314            }
1315
1316            fn persist_batch(&self, _index_dir: &Path, entries: &[FileIndex]) -> Result<()> {
1317                self.snapshots
1318                    .lock()
1319                    .expect("lock poisoned")
1320                    .push(entries.to_vec());
1321                Ok(())
1322            }
1323        }
1324
1325        let temp = tempdir()?;
1326        let workspace = temp.path();
1327        let file_path = workspace.join("notes.txt");
1328        fs::write(&file_path, "remember this")?;
1329
1330        let snapshots = Arc::new(Mutex::new(Vec::new()));
1331        let storage = SnapshotMemoryStorage::new(snapshots.clone());
1332
1333        let config = SimpleIndexerConfig::new(workspace.to_path_buf());
1334        let mut indexer = SimpleIndexer::with_config(config).with_storage(Arc::new(storage));
1335        indexer.index_file(&file_path)?;
1336
1337        let snapshots = snapshots.lock().expect("lock poisoned");
1338        assert_eq!(snapshots.len(), 1);
1339        assert_eq!(snapshots[0].len(), 1);
1340        assert_eq!(
1341            snapshots[0][0].path,
1342            workspace.join("notes.txt").to_string_lossy().into_owned()
1343        );
1344
1345        Ok(())
1346    }
1347
1348    #[test]
1349    fn snapshot_index_file_rolls_back_cache_when_persist_fails() -> Result<()> {
1350        #[derive(Clone, Default)]
1351        struct FlakySnapshotStorage {
1352            persist_count: Arc<Mutex<usize>>,
1353        }
1354
1355        impl IndexStorage for FlakySnapshotStorage {
1356            fn init(&self, _index_dir: &Path) -> Result<()> {
1357                Ok(())
1358            }
1359
1360            fn persist(&self, _index_dir: &Path, _entry: &FileIndex) -> Result<()> {
1361                Ok(())
1362            }
1363
1364            fn prefers_snapshot_persistence(&self) -> bool {
1365                true
1366            }
1367
1368            fn persist_batch(&self, _index_dir: &Path, _entries: &[FileIndex]) -> Result<()> {
1369                let mut count = self.persist_count.lock().expect("lock poisoned");
1370                *count += 1;
1371                if *count == 2 {
1372                    anyhow::bail!("simulated snapshot persistence failure");
1373                }
1374                Ok(())
1375            }
1376        }
1377
1378        let temp = tempdir()?;
1379        let workspace = temp.path();
1380        let first = workspace.join("first.txt");
1381        let second = workspace.join("second.txt");
1382        fs::write(&first, "one")?;
1383        fs::write(&second, "two")?;
1384
1385        let config = SimpleIndexerConfig::new(workspace.to_path_buf());
1386        let storage = Arc::new(FlakySnapshotStorage::default());
1387        let mut indexer = SimpleIndexer::with_config(config).with_storage(storage);
1388
1389        indexer.index_file(&first)?;
1390        assert!(
1391            indexer
1392                .find_files("first\\.txt$")?
1393                .iter()
1394                .any(|path| path.ends_with("first.txt"))
1395        );
1396
1397        let err = indexer
1398            .index_file(&second)
1399            .expect_err("second persist should fail");
1400        assert!(
1401            err.to_string()
1402                .contains("simulated snapshot persistence failure")
1403        );
1404        assert!(
1405            indexer
1406                .find_files("first\\.txt$")?
1407                .iter()
1408                .any(|path| path.ends_with("first.txt"))
1409        );
1410        assert!(indexer.find_files("second\\.txt$")?.is_empty());
1411
1412        Ok(())
1413    }
1414}