vtcode_indexer/
lib.rs

1//! Workspace-friendly file indexer extracted from VTCode.
2//!
3//! `vtcode-indexer` offers a lightweight alternative to heavyweight
4//! search/indexing stacks. It recursively walks a workspace, computes
5//! hashes, and stores per-file metadata in Markdown-friendly summaries
6//! so changes remain easy to audit in git.
7
8use anyhow::Result;
9use ignore::WalkBuilder;
10use regex::Regex;
11use serde::{Deserialize, Serialize};
12use std::collections::HashMap;
13use std::fs;
14use std::io::ErrorKind;
15use std::path::{Path, PathBuf};
16use std::sync::Arc;
17use std::time::SystemTime;
18
/// Persistence backend for [`SimpleIndexer`].
///
/// Implementations must be `Send + Sync` because the indexer stores the
/// backend behind an `Arc<dyn IndexStorage>`.
pub trait IndexStorage: Send + Sync {
    /// Prepare any directories or resources required for persistence.
    ///
    /// [`SimpleIndexer::init`] forwards the configured index directory as
    /// `index_dir`.
    fn init(&self, index_dir: &Path) -> Result<()>;

    /// Persist an indexed file entry.
    ///
    /// [`SimpleIndexer::index_file`] calls this once per indexed file, passing
    /// the configured index directory and the entry it just built.
    fn persist(&self, index_dir: &Path, entry: &FileIndex) -> Result<()>;
}
27
/// Directory traversal filter hook for [`SimpleIndexer`].
///
/// Implementations must be `Send + Sync` because the indexer stores the
/// filter behind an `Arc<dyn TraversalFilter>`.
pub trait TraversalFilter: Send + Sync {
    /// Determine if the indexer should descend into the provided directory.
    ///
    /// Returns `true` to descend into `path`, `false` to skip it. Within this
    /// file it is consulted only by the `walk_directory*` helpers.
    fn should_descend(&self, path: &Path, config: &SimpleIndexerConfig) -> bool;

    /// Determine if the indexer should process the provided file.
    ///
    /// Returns `true` to index `path`, `false` to skip it. Consulted by both
    /// [`SimpleIndexer::index_file`] and [`SimpleIndexer::index_directory`].
    fn should_index_file(&self, path: &Path, config: &SimpleIndexerConfig) -> bool;
}
36
37/// Markdown-backed [`IndexStorage`] implementation.
38#[derive(Debug, Default, Clone)]
39pub struct MarkdownIndexStorage;
40
41impl IndexStorage for MarkdownIndexStorage {
42    fn init(&self, index_dir: &Path) -> Result<()> {
43        fs::create_dir_all(index_dir)?;
44        Ok(())
45    }
46
47    fn persist(&self, index_dir: &Path, entry: &FileIndex) -> Result<()> {
48        let file_name = format!("{}.md", calculate_hash(&entry.path));
49        let index_path = index_dir.join(file_name);
50
51        let markdown = format!(
52            "# File Index: {}\n\n\
53            - **Path**: {}\n\
54            - **Hash**: {}\n\
55            - **Modified**: {}\n\
56            - **Size**: {} bytes\n\
57            - **Language**: {}\n\
58            - **Tags**: {}\n\n",
59            entry.path,
60            entry.path,
61            entry.hash,
62            entry.modified,
63            entry.size,
64            entry.language,
65            entry.tags.join(", ")
66        );
67
68        fs::write(index_path, markdown)?;
69        Ok(())
70    }
71}
72
73/// Default traversal filter powered by [`SimpleIndexerConfig`].
74#[derive(Debug, Default, Clone)]
75pub struct ConfigTraversalFilter;
76
77impl TraversalFilter for ConfigTraversalFilter {
78    fn should_descend(&self, path: &Path, config: &SimpleIndexerConfig) -> bool {
79        !should_skip_dir(path, config)
80    }
81
82    fn should_index_file(&self, path: &Path, config: &SimpleIndexerConfig) -> bool {
83        if !path.is_file() {
84            return false;
85        }
86
87        // Skip hidden files when configured.
88        if config.ignore_hidden
89            && path
90                .file_name()
91                .and_then(|n| n.to_str())
92                .is_some_and(|s| s.starts_with('.'))
93        {
94            return false;
95        }
96
97        // Always skip known sensitive files regardless of config.
98        if let Some(file_name) = path.file_name().and_then(|n| n.to_str()) {
99            let is_sensitive = matches!(
100                file_name,
101                ".env"
102                    | ".env.local"
103                    | ".env.production"
104                    | ".env.development"
105                    | ".env.test"
106                    | ".git"
107                    | ".gitignore"
108                    | ".DS_Store"
109            ) || file_name.starts_with(".env.");
110            if is_sensitive {
111                return false;
112            }
113        }
114
115        true
116    }
117}
118
/// Configuration for [`SimpleIndexer`].
#[derive(Clone, Debug)]
pub struct SimpleIndexerConfig {
    // Root of the workspace being indexed.
    workspace_root: PathBuf,
    // Directory handed to the storage backend for persisted metadata.
    index_dir: PathBuf,
    // When true, entries whose name starts with `.` are skipped.
    ignore_hidden: bool,
    // Directory prefixes that are skipped during traversal.
    excluded_dirs: Vec<PathBuf>,
    // Directory prefixes that override the exclusion and hidden rules.
    allowed_dirs: Vec<PathBuf>,
}

impl SimpleIndexerConfig {
    /// Builds a configuration using VTCode's legacy layout as defaults.
    ///
    /// The index lives in `<root>/.vtcode/index`; `.vtcode`, `target` and
    /// `node_modules` under the root are excluded, `<root>/.vtcode/external`
    /// is allowed, and hidden entries are ignored.
    pub fn new(workspace_root: PathBuf) -> Self {
        let vtcode_dir = workspace_root.join(".vtcode");
        let index_dir = vtcode_dir.join("index");
        let allowed_dirs = vec![vtcode_dir.join("external")];
        let excluded_dirs = vec![
            index_dir.clone(),
            vtcode_dir,
            workspace_root.join("target"),
            workspace_root.join("node_modules"),
        ];

        Self {
            workspace_root,
            index_dir,
            ignore_hidden: true,
            excluded_dirs,
            allowed_dirs,
        }
    }

    /// Updates the index directory used for persisted metadata.
    ///
    /// The new directory is also added to the excluded set.
    pub fn with_index_dir(mut self, index_dir: impl Into<PathBuf>) -> Self {
        let index_dir: PathBuf = index_dir.into();
        self.push_unique_excluded(index_dir.clone());
        self.index_dir = index_dir;
        self
    }

    /// Adds an allowed directory that should be indexed even if hidden or inside an excluded parent.
    ///
    /// Adding a path that is already present is a no-op.
    pub fn add_allowed_dir(mut self, path: impl Into<PathBuf>) -> Self {
        let path: PathBuf = path.into();
        if !self.allowed_dirs.contains(&path) {
            self.allowed_dirs.push(path);
        }
        self
    }

    /// Adds an additional excluded directory to skip during traversal.
    ///
    /// Adding a path that is already present is a no-op.
    pub fn add_excluded_dir(mut self, path: impl Into<PathBuf>) -> Self {
        self.push_unique_excluded(path.into());
        self
    }

    /// Toggles whether hidden directories (prefix `.`) are ignored.
    pub fn ignore_hidden(mut self, ignore_hidden: bool) -> Self {
        self.ignore_hidden = ignore_hidden;
        self
    }

    /// Workspace root accessor.
    pub fn workspace_root(&self) -> &Path {
        self.workspace_root.as_path()
    }

    /// Index directory accessor.
    pub fn index_dir(&self) -> &Path {
        self.index_dir.as_path()
    }

    // Appends `path` to the excluded list unless an equal path is already there.
    fn push_unique_excluded(&mut self, path: PathBuf) {
        if self.excluded_dirs.contains(&path) {
            return;
        }
        self.excluded_dirs.push(path);
    }
}
200
/// Simple file index entry.
///
/// The field notes below describe how [`SimpleIndexer::index_file`] fills the
/// entry; values constructed elsewhere may follow other conventions.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileIndex {
    /// File path.
    ///
    /// Stored as a lossily converted string of the path passed to the indexer.
    pub path: String,
    /// File content hash for change detection.
    ///
    /// Hex string produced by `calculate_hash` over the file's text.
    pub hash: String,
    /// Last modified timestamp.
    ///
    /// Whole seconds since the Unix epoch.
    pub modified: u64,
    /// File size.
    ///
    /// Byte length of the file's UTF-8 text as read during indexing.
    pub size: u64,
    /// Language/extension.
    ///
    /// The file extension, or `"unknown"` when there is none.
    pub language: String,
    /// Simple tags.
    ///
    /// The indexer currently always stores an empty list here.
    pub tags: Vec<String>,
}
217
/// Simple search result.
///
/// One value is produced per matching line by [`SimpleIndexer::search`] and
/// [`SimpleIndexer::grep`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchResult {
    /// Path of the file containing the match, as keyed in the index cache.
    pub file_path: String,
    /// 1-based line number of the matching line.
    pub line_number: usize,
    /// Full text of the matching line (without its line terminator).
    pub line_content: String,
    /// For `search`: every regex match found on the line.
    /// For `grep`: a single element holding the whole line.
    pub matches: Vec<String>,
}
226
/// Simple file indexer.
///
/// Keeps an in-memory cache of indexed files and forwards each entry to a
/// pluggable [`IndexStorage`]; a pluggable [`TraversalFilter`] decides what is
/// indexed.
pub struct SimpleIndexer {
    // Workspace paths and traversal rules.
    config: SimpleIndexerConfig,
    // Indexed entries keyed by the file path string used at indexing time.
    index_cache: HashMap<String, FileIndex>,
    // Backend that persists each indexed entry.
    storage: Arc<dyn IndexStorage>,
    // Hook deciding which files and directories are processed.
    filter: Arc<dyn TraversalFilter>,
}
234
235impl SimpleIndexer {
236    /// Create a new simple indexer with default VTCode paths.
237    pub fn new(workspace_root: PathBuf) -> Self {
238        Self::with_components(
239            SimpleIndexerConfig::new(workspace_root),
240            Arc::new(MarkdownIndexStorage),
241            Arc::new(ConfigTraversalFilter),
242        )
243    }
244
245    /// Create a simple indexer with the provided configuration.
246    pub fn with_config(config: SimpleIndexerConfig) -> Self {
247        Self::with_components(
248            config,
249            Arc::new(MarkdownIndexStorage),
250            Arc::new(ConfigTraversalFilter),
251        )
252    }
253
254    /// Create a new simple indexer using a custom index directory.
255    pub fn with_index_dir(workspace_root: PathBuf, index_dir: PathBuf) -> Self {
256        let config = SimpleIndexerConfig::new(workspace_root).with_index_dir(index_dir);
257        Self::with_config(config)
258    }
259
260    /// Create an indexer with explicit storage and traversal filter implementations.
261    pub fn with_components(
262        config: SimpleIndexerConfig,
263        storage: Arc<dyn IndexStorage>,
264        filter: Arc<dyn TraversalFilter>,
265    ) -> Self {
266        Self {
267            config,
268            index_cache: HashMap::new(),
269            storage,
270            filter,
271        }
272    }
273
274    /// Replace the storage backend used to persist index entries.
275    pub fn with_storage(self, storage: Arc<dyn IndexStorage>) -> Self {
276        Self { storage, ..self }
277    }
278
279    /// Replace the traversal filter used to decide which files and directories are indexed.
280    pub fn with_filter(self, filter: Arc<dyn TraversalFilter>) -> Self {
281        Self { filter, ..self }
282    }
283
284    /// Initialize the index directory.
285    pub fn init(&self) -> Result<()> {
286        self.storage.init(self.config.index_dir())
287    }
288
289    /// Get the workspace root path.
290    pub fn workspace_root(&self) -> &Path {
291        self.config.workspace_root()
292    }
293
294    /// Get the index directory used for persisted metadata.
295    pub fn index_dir(&self) -> &Path {
296        self.config.index_dir()
297    }
298
299    /// Index a single file.
300    pub fn index_file(&mut self, file_path: &Path) -> Result<()> {
301        if !file_path.exists() || !self.filter.should_index_file(file_path, &self.config) {
302            return Ok(());
303        }
304
305        let content = match fs::read_to_string(file_path) {
306            Ok(text) => text,
307            Err(err) => {
308                if err.kind() == ErrorKind::InvalidData {
309                    return Ok(());
310                }
311                return Err(err.into());
312            }
313        };
314        let hash = calculate_hash(&content);
315        let modified = self.get_modified_time(file_path)?;
316        let size = content.len() as u64;
317        let language = self.detect_language(file_path);
318
319        let index = FileIndex {
320            path: file_path.to_string_lossy().into_owned(),
321            hash,
322            modified,
323            size,
324            language,
325            tags: vec![],
326        };
327
328        self.index_cache
329            .insert(file_path.to_string_lossy().into_owned(), index.clone());
330
331        self.storage.persist(self.config.index_dir(), &index)?;
332
333        Ok(())
334    }
335
336    /// Index all files in directory recursively.
337    /// Respects .gitignore, .ignore, and other ignore files.
338    /// SECURITY: Always skips hidden files and sensitive data (.env, .git, etc.)
339    pub fn index_directory(&mut self, dir_path: &Path) -> Result<()> {
340        let walker = WalkBuilder::new(dir_path)
341            .hidden(true) // CRITICAL: Skip hidden files (.env, .git, etc.)
342            .git_ignore(true) // Respect .gitignore
343            .git_global(true) // Respect global gitignore
344            .git_exclude(true) // Respect .git/info/exclude
345            .ignore(true) // Respect .ignore files
346            .parents(true) // Check parent directories for ignore files
347            .build();
348
349        for entry in walker.filter_map(|e| e.ok()) {
350            let path = entry.path();
351
352            // Only index files, not directories
353            if entry.file_type().is_some_and(|ft| ft.is_file()) {
354                // Additional check: skip if in excluded dirs
355                let should_skip = self
356                    .config
357                    .excluded_dirs
358                    .iter()
359                    .any(|excluded| path.starts_with(excluded));
360
361                if !should_skip && self.filter.should_index_file(path, &self.config) {
362                    self.index_file(path)?;
363                }
364            }
365        }
366
367        Ok(())
368    }
369
370    /// Internal helper for regex-based file content search.
371    /// Used by both `search()` and `grep()` to avoid code duplication.
372    fn search_files_internal(
373        &self,
374        regex: &Regex,
375        path_filter: Option<&str>,
376        extract_matches: bool,
377    ) -> Vec<SearchResult> {
378        let mut results = Vec::new();
379
380        for file_path in self.index_cache.keys() {
381            if path_filter.is_some_and(|filter| !file_path.contains(filter)) {
382                continue;
383            }
384
385            if let Ok(content) = fs::read_to_string(file_path) {
386                for (line_num, line) in content.lines().enumerate() {
387                    if regex.is_match(line) {
388                        let matches = if extract_matches {
389                            regex
390                                .find_iter(line)
391                                .map(|m| m.as_str().to_string())
392                                .collect()
393                        } else {
394                            vec![line.to_string()]
395                        };
396
397                        results.push(SearchResult {
398                            file_path: file_path.clone(),
399                            line_number: line_num + 1,
400                            line_content: line.to_string(),
401                            matches,
402                        });
403                    }
404                }
405            }
406        }
407
408        results
409    }
410
411    /// Search files using regex pattern.
412    pub fn search(&self, pattern: &str, path_filter: Option<&str>) -> Result<Vec<SearchResult>> {
413        let regex = Regex::new(pattern)?;
414        Ok(self.search_files_internal(&regex, path_filter, true))
415    }
416
417    /// Find files by name pattern.
418    pub fn find_files(&self, pattern: &str) -> Result<Vec<String>> {
419        let regex = Regex::new(pattern)?;
420        let mut results = Vec::new();
421
422        for file_path in self.index_cache.keys() {
423            if regex.is_match(file_path) {
424                results.push(file_path.clone());
425            }
426        }
427
428        Ok(results)
429    }
430
431    /// Get all indexed files without pattern matching.
432    /// This is more efficient than using find_files(".*").
433    pub fn all_files(&self) -> Vec<String> {
434        self.index_cache.keys().cloned().collect()
435    }
436
437    /// Get file content with line numbers.
438    pub fn get_file_content(
439        &self,
440        file_path: &str,
441        start_line: Option<usize>,
442        end_line: Option<usize>,
443    ) -> Result<String> {
444        let content = fs::read_to_string(file_path)?;
445        let lines: Vec<&str> = content.lines().collect();
446
447        let start = start_line.unwrap_or(1).saturating_sub(1);
448        let end = end_line.unwrap_or(lines.len());
449
450        let selected_lines = &lines[start..end.min(lines.len())];
451
452        let mut result = String::new();
453        for (i, line) in selected_lines.iter().enumerate() {
454            result.push_str(&format!("{}: {}\n", start + i + 1, line));
455        }
456
457        Ok(result)
458    }
459
460    /// List files in directory (like ls).
461    pub fn list_files(&self, dir_path: &str, show_hidden: bool) -> Result<Vec<String>> {
462        let path = Path::new(dir_path);
463        if !path.exists() {
464            return Ok(vec![]);
465        }
466
467        let mut files = Vec::new();
468
469        for entry in fs::read_dir(path)? {
470            let entry = entry?;
471            let file_name = entry.file_name().to_string_lossy().into_owned();
472
473            if !show_hidden && file_name.starts_with('.') {
474                continue;
475            }
476
477            files.push(file_name);
478        }
479
480        Ok(files)
481    }
482
483    /// Grep-like search (like grep command).
484    pub fn grep(&self, pattern: &str, file_pattern: Option<&str>) -> Result<Vec<SearchResult>> {
485        let regex = Regex::new(pattern)?;
486        Ok(self.search_files_internal(&regex, file_pattern, false))
487    }
488
489    #[allow(dead_code)]
490    fn walk_directory<F>(&mut self, dir_path: &Path, callback: &mut F) -> Result<()>
491    where
492        F: FnMut(&Path) -> Result<()>,
493    {
494        if !dir_path.exists() {
495            return Ok(());
496        }
497
498        self.walk_directory_internal(dir_path, callback)
499    }
500
501    #[allow(dead_code)]
502    fn walk_directory_internal<F>(&mut self, dir_path: &Path, callback: &mut F) -> Result<()>
503    where
504        F: FnMut(&Path) -> Result<()>,
505    {
506        for entry in fs::read_dir(dir_path)? {
507            let entry = entry?;
508            let path = entry.path();
509
510            if path.is_dir() {
511                if self.is_allowed_dir(&path) {
512                    self.walk_directory_internal(&path, callback)?;
513                    continue;
514                }
515
516                if !self.filter.should_descend(&path, &self.config) {
517                    self.walk_allowed_descendants(&path, callback)?;
518                    continue;
519                }
520
521                self.walk_directory_internal(&path, callback)?;
522            } else if path.is_file() {
523                callback(&path)?;
524            }
525        }
526
527        Ok(())
528    }
529
530    #[allow(dead_code)]
531    fn is_allowed_dir(&self, path: &Path) -> bool {
532        self.config
533            .allowed_dirs
534            .iter()
535            .any(|allowed| path.starts_with(allowed))
536    }
537
538    #[allow(dead_code)]
539    fn walk_allowed_descendants<F>(&mut self, dir_path: &Path, callback: &mut F) -> Result<()>
540    where
541        F: FnMut(&Path) -> Result<()>,
542    {
543        let allowed_dirs = self.config.allowed_dirs.clone();
544        for allowed in allowed_dirs {
545            if allowed.starts_with(dir_path) && allowed.exists() {
546                self.walk_directory_internal(&allowed, callback)?;
547            }
548        }
549        Ok(())
550    }
551
552    #[inline]
553    fn get_modified_time(&self, file_path: &Path) -> Result<u64> {
554        let metadata = fs::metadata(file_path)?;
555        let modified = metadata.modified()?;
556        Ok(modified.duration_since(SystemTime::UNIX_EPOCH)?.as_secs())
557    }
558
559    #[inline]
560    fn detect_language(&self, file_path: &Path) -> String {
561        file_path
562            .extension()
563            .and_then(|ext| ext.to_str())
564            .unwrap_or("unknown")
565            .to_string()
566    }
567}
568
569impl Clone for SimpleIndexer {
570    fn clone(&self) -> Self {
571        Self {
572            config: self.config.clone(),
573            index_cache: self.index_cache.clone(),
574            storage: self.storage.clone(),
575            filter: self.filter.clone(),
576        }
577    }
578}
579
580fn should_skip_dir(path: &Path, config: &SimpleIndexerConfig) -> bool {
581    if config
582        .allowed_dirs
583        .iter()
584        .any(|allowed| path.starts_with(allowed))
585    {
586        return false;
587    }
588
589    if config
590        .excluded_dirs
591        .iter()
592        .any(|excluded| path.starts_with(excluded))
593    {
594        return true;
595    }
596
597    if config.ignore_hidden
598        && path
599            .file_name()
600            .and_then(|name| name.to_str())
601            .is_some_and(|name_str| name_str.starts_with('.'))
602    {
603        return true;
604    }
605
606    false
607}
608
/// Hashes `content` with the standard library's default hasher and returns
/// the 64-bit result as lowercase hexadecimal without zero padding.
///
/// NOTE(review): the standard library does not guarantee that this hasher's
/// output stays the same across Rust releases, so hashes persisted by one
/// toolchain may not match those of another. Confirm that is acceptable.
#[inline]
fn calculate_hash(content: &str) -> String {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    let mut state = DefaultHasher::new();
    content.hash(&mut state);
    let digest = state.finish();
    format!("{digest:x}")
}
618
// Integration-style tests: each one builds a throwaway workspace on disk,
// indexes it, and checks what ended up in the index.
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use std::sync::{Arc, Mutex};
    use tempfile::tempdir;

    // With the default configuration, files inside a dot-directory must not
    // be indexed while files in ordinary directories are.
    #[test]
    fn skips_hidden_directories_by_default() -> Result<()> {
        let temp = tempdir()?;
        let workspace = temp.path();
        let hidden_dir = workspace.join(".private");
        fs::create_dir_all(&hidden_dir)?;
        fs::write(hidden_dir.join("secret.txt"), "classified")?;

        let visible_dir = workspace.join("src");
        fs::create_dir_all(&visible_dir)?;
        fs::write(visible_dir.join("lib.rs"), "fn main() {}")?;

        let mut indexer = SimpleIndexer::new(workspace.to_path_buf());
        indexer.init()?;
        indexer.index_directory(workspace)?;

        assert!(indexer.find_files("secret\\.txt$")?.is_empty());
        assert!(!indexer.find_files("lib\\.rs$")?.is_empty());

        Ok(())
    }

    // With `ignore_hidden(false)`, files inside a dot-directory are indexed.
    // This relies on `index_directory` honoring the configuration flag.
    #[test]
    fn can_include_hidden_directories_when_configured() -> Result<()> {
        let temp = tempdir()?;
        let workspace = temp.path();
        let hidden_dir = workspace.join(".cache");
        fs::create_dir_all(&hidden_dir)?;
        fs::write(hidden_dir.join("data.log"), "details")?;

        let config = SimpleIndexerConfig::new(workspace.to_path_buf()).ignore_hidden(false);
        let mut indexer = SimpleIndexer::with_config(config);
        indexer.init()?;
        indexer.index_directory(workspace)?;

        let results = indexer.find_files("data\\.log$")?;
        assert_eq!(results.len(), 1);

        Ok(())
    }

    // A custom `IndexStorage` receives exactly the entries the indexer
    // produces.
    #[test]
    fn supports_custom_storage_backends() -> Result<()> {
        // In-memory backend that records every persisted entry.
        #[derive(Clone, Default)]
        struct MemoryStorage {
            records: Arc<Mutex<Vec<FileIndex>>>,
        }

        impl MemoryStorage {
            fn new(records: Arc<Mutex<Vec<FileIndex>>>) -> Self {
                Self { records }
            }
        }

        impl IndexStorage for MemoryStorage {
            // Nothing to prepare for an in-memory store.
            fn init(&self, _index_dir: &Path) -> Result<()> {
                Ok(())
            }

            fn persist(&self, _index_dir: &Path, entry: &FileIndex) -> Result<()> {
                let mut guard = self.records.lock().expect("lock poisoned");
                guard.push(entry.clone());
                Ok(())
            }
        }

        let temp = tempdir()?;
        let workspace = temp.path();
        fs::write(workspace.join("notes.txt"), "remember this")?;

        // The test keeps its own handle to the shared record list so it can
        // inspect what the storage received.
        let records: Arc<Mutex<Vec<FileIndex>>> = Arc::new(Mutex::new(Vec::new()));
        let storage = MemoryStorage::new(records.clone());

        let config = SimpleIndexerConfig::new(workspace.to_path_buf());
        let mut indexer = SimpleIndexer::with_config(config).with_storage(Arc::new(storage));
        indexer.init()?;
        indexer.index_directory(workspace)?;

        let entries = records.lock().expect("lock poisoned");
        assert_eq!(entries.len(), 1);
        assert_eq!(
            entries[0].path,
            workspace.join("notes.txt").to_string_lossy().into_owned()
        );

        Ok(())
    }

    // A custom `TraversalFilter` can veto files the default filter accepts.
    #[test]
    fn custom_filters_can_skip_files() -> Result<()> {
        // Rejects `.rs` files and defers every other decision to the default
        // filter.
        #[derive(Default)]
        struct SkipRustFilter {
            inner: ConfigTraversalFilter,
        }

        impl TraversalFilter for SkipRustFilter {
            fn should_descend(&self, path: &Path, config: &SimpleIndexerConfig) -> bool {
                self.inner.should_descend(path, config)
            }

            fn should_index_file(&self, path: &Path, config: &SimpleIndexerConfig) -> bool {
                if path
                    .extension()
                    .and_then(|ext| ext.to_str())
                    .is_some_and(|ext| ext.eq_ignore_ascii_case("rs"))
                {
                    return false;
                }

                self.inner.should_index_file(path, config)
            }
        }

        let temp = tempdir()?;
        let workspace = temp.path();
        fs::write(workspace.join("lib.rs"), "fn main() {}")?;
        fs::write(workspace.join("README.md"), "# Notes")?;

        let config = SimpleIndexerConfig::new(workspace.to_path_buf());
        let mut indexer =
            SimpleIndexer::with_config(config).with_filter(Arc::new(SkipRustFilter::default()));
        indexer.init()?;
        indexer.index_directory(workspace)?;

        assert!(indexer.find_files("lib\\.rs$")?.is_empty());
        assert!(!indexer.find_files("README\\.md$")?.is_empty());

        Ok(())
    }
}