vtcode_indexer/
lib.rs

1//! Workspace-friendly file indexer extracted from VTCode.
2//!
3//! `vtcode-indexer` offers a lightweight alternative to heavyweight
4//! search/indexing stacks. It recursively walks a workspace, computes
5//! hashes, and stores per-file metadata in Markdown-friendly summaries
6//! so changes remain easy to audit in git.
7
8use anyhow::Result;
9use regex::Regex;
10use serde::{Deserialize, Serialize};
11use std::collections::HashMap;
12use std::fs;
13use std::io::ErrorKind;
14use std::path::{Path, PathBuf};
15use std::sync::Arc;
16use std::time::SystemTime;
17
/// Persistence backend for [`SimpleIndexer`].
///
/// The indexer stores its backend as an `Arc<dyn IndexStorage>`, hence the
/// `Send + Sync` bound. Both methods receive the index directory taken from
/// the indexer's [`SimpleIndexerConfig`].
pub trait IndexStorage: Send + Sync {
    /// Prepare any directories or resources required for persistence.
    ///
    /// `index_dir` is the directory configured for persisted metadata.
    /// Returns an error if the backend cannot be prepared.
    fn init(&self, index_dir: &Path) -> Result<()>;

    /// Persist an indexed file entry.
    ///
    /// `index_dir` is the configured index directory and `entry` is the
    /// metadata record to store. Returns an error if the entry cannot be
    /// stored.
    fn persist(&self, index_dir: &Path, entry: &FileIndex) -> Result<()>;
}
26
/// Directory traversal filter hook for [`SimpleIndexer`].
///
/// The indexer stores its filter as an `Arc<dyn TraversalFilter>`, hence the
/// `Send + Sync` bound.
pub trait TraversalFilter: Send + Sync {
    /// Determine if the indexer should descend into the provided directory.
    ///
    /// [`SimpleIndexer`] only consults this for directories that are not
    /// located under one of the configuration's allowed directories; those
    /// are always walked. When this returns `false`, the indexer still walks
    /// any allowed directories nested below `path`.
    fn should_descend(&self, path: &Path, config: &SimpleIndexerConfig) -> bool;

    /// Determine if the indexer should process the provided file.
    ///
    /// Consulted by [`SimpleIndexer::index_file`]; returning `false` makes the
    /// indexer skip the file without reporting an error.
    fn should_index_file(&self, path: &Path, config: &SimpleIndexerConfig) -> bool;
}
35
36/// Markdown-backed [`IndexStorage`] implementation.
37#[derive(Debug, Default, Clone)]
38pub struct MarkdownIndexStorage;
39
40impl IndexStorage for MarkdownIndexStorage {
41    fn init(&self, index_dir: &Path) -> Result<()> {
42        fs::create_dir_all(index_dir)?;
43        Ok(())
44    }
45
46    fn persist(&self, index_dir: &Path, entry: &FileIndex) -> Result<()> {
47        let file_name = format!("{}.md", calculate_hash(&entry.path));
48        let index_path = index_dir.join(file_name);
49
50        let markdown = format!(
51            "# File Index: {}\n\n\
52            - **Path**: {}\n\
53            - **Hash**: {}\n\
54            - **Modified**: {}\n\
55            - **Size**: {} bytes\n\
56            - **Language**: {}\n\
57            - **Tags**: {}\n\n",
58            entry.path,
59            entry.path,
60            entry.hash,
61            entry.modified,
62            entry.size,
63            entry.language,
64            entry.tags.join(", ")
65        );
66
67        fs::write(index_path, markdown)?;
68        Ok(())
69    }
70}
71
72/// Default traversal filter powered by [`SimpleIndexerConfig`].
73#[derive(Debug, Default, Clone)]
74pub struct ConfigTraversalFilter;
75
76impl TraversalFilter for ConfigTraversalFilter {
77    fn should_descend(&self, path: &Path, config: &SimpleIndexerConfig) -> bool {
78        !should_skip_dir(path, config)
79    }
80
81    fn should_index_file(&self, path: &Path, _config: &SimpleIndexerConfig) -> bool {
82        path.is_file()
83    }
84}
85
/// Configuration for [`SimpleIndexer`].
///
/// Built with [`SimpleIndexerConfig::new`] and refined through the chained
/// `with_*` / `add_*` builder methods, each of which consumes and returns
/// the configuration.
#[derive(Clone, Debug)]
pub struct SimpleIndexerConfig {
    /// Root of the workspace; the default paths in `new` are derived from it.
    workspace_root: PathBuf,
    /// Directory handed to the storage backend for persisted metadata.
    index_dir: PathBuf,
    /// When `true`, directories whose name starts with `.` are skipped.
    ignore_hidden: bool,
    /// Directories skipped (together with everything below them) during traversal.
    excluded_dirs: Vec<PathBuf>,
    /// Directories that are traversed even if hidden or inside an excluded parent.
    allowed_dirs: Vec<PathBuf>,
}

impl SimpleIndexerConfig {
    /// Builds a configuration using VTCode's legacy layout as defaults.
    ///
    /// The index lives in `<root>/.vtcode/index`; `.vtcode`, `target` and
    /// `node_modules` under the root are excluded, while `.vtcode/external`
    /// is explicitly allowed. Hidden directories are ignored.
    pub fn new(workspace_root: PathBuf) -> Self {
        let vtcode_dir = workspace_root.join(".vtcode");
        let index_dir = vtcode_dir.join("index");
        let allowed_dirs = vec![vtcode_dir.join("external")];

        // The four default entries are always distinct paths.
        let excluded_dirs = vec![
            index_dir.clone(),
            vtcode_dir,
            workspace_root.join("target"),
            workspace_root.join("node_modules"),
        ];

        Self {
            workspace_root,
            index_dir,
            ignore_hidden: true,
            excluded_dirs,
            allowed_dirs,
        }
    }

    /// Updates the index directory used for persisted metadata.
    ///
    /// The new directory is also added to the excluded directories so the
    /// index itself is not traversed.
    pub fn with_index_dir(mut self, index_dir: impl Into<PathBuf>) -> Self {
        let new_dir: PathBuf = index_dir.into();
        self.push_unique_excluded(new_dir.clone());
        self.index_dir = new_dir;
        self
    }

    /// Adds an allowed directory that should be indexed even if hidden or inside an excluded parent.
    ///
    /// Adding a path that is already present is a no-op.
    pub fn add_allowed_dir(mut self, path: impl Into<PathBuf>) -> Self {
        let candidate: PathBuf = path.into();
        if !self.allowed_dirs.contains(&candidate) {
            self.allowed_dirs.push(candidate);
        }
        self
    }

    /// Adds an additional excluded directory to skip during traversal.
    ///
    /// Adding a path that is already present is a no-op.
    pub fn add_excluded_dir(mut self, path: impl Into<PathBuf>) -> Self {
        self.push_unique_excluded(path.into());
        self
    }

    /// Toggles whether hidden directories (prefix `.`) are ignored.
    pub fn ignore_hidden(mut self, ignore_hidden: bool) -> Self {
        self.ignore_hidden = ignore_hidden;
        self
    }

    /// Workspace root accessor.
    pub fn workspace_root(&self) -> &Path {
        self.workspace_root.as_path()
    }

    /// Index directory accessor.
    pub fn index_dir(&self) -> &Path {
        self.index_dir.as_path()
    }

    /// Appends `path` to the excluded list unless an equal path is already there.
    fn push_unique_excluded(&mut self, path: PathBuf) {
        if !self.excluded_dirs.contains(&path) {
            self.excluded_dirs.push(path);
        }
    }
}
167
/// Simple file index entry.
///
/// One entry describes one indexed file. The field notes below describe how
/// [`SimpleIndexer::index_file`] fills them in.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileIndex {
    /// File path (lossy string form of the path that was indexed).
    pub path: String,
    /// File content hash for change detection (hex string).
    pub hash: String,
    /// Last modified timestamp, in whole seconds since the Unix epoch.
    pub modified: u64,
    /// File size: byte length of the file's text content.
    pub size: u64,
    /// Language/extension: the file extension, or `"unknown"` when there is none.
    pub language: String,
    /// Simple tags; the indexer itself leaves this empty.
    pub tags: Vec<String>,
}
184
/// Simple search result.
///
/// Describes one matching line, as produced by [`SimpleIndexer::search`] and
/// [`SimpleIndexer::grep`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchResult {
    /// Path of the file containing the match, as stored in the index cache.
    pub file_path: String,
    /// 1-based line number of the matching line.
    pub line_number: usize,
    /// Full text of the matching line (without the line terminator).
    pub line_content: String,
    /// Matched text: each regex match for `search`, the whole line for `grep`.
    pub matches: Vec<String>,
}
193
194/// Simple file indexer.
195pub struct SimpleIndexer {
196    config: SimpleIndexerConfig,
197    index_cache: HashMap<String, FileIndex>,
198    storage: Arc<dyn IndexStorage>,
199    filter: Arc<dyn TraversalFilter>,
200}
201
202impl SimpleIndexer {
203    /// Create a new simple indexer with default VTCode paths.
204    pub fn new(workspace_root: PathBuf) -> Self {
205        Self::with_components(
206            SimpleIndexerConfig::new(workspace_root),
207            Arc::new(MarkdownIndexStorage),
208            Arc::new(ConfigTraversalFilter),
209        )
210    }
211
212    /// Create a simple indexer with the provided configuration.
213    pub fn with_config(config: SimpleIndexerConfig) -> Self {
214        Self::with_components(
215            config,
216            Arc::new(MarkdownIndexStorage),
217            Arc::new(ConfigTraversalFilter),
218        )
219    }
220
221    /// Create a new simple indexer using a custom index directory.
222    pub fn with_index_dir(workspace_root: PathBuf, index_dir: PathBuf) -> Self {
223        let config = SimpleIndexerConfig::new(workspace_root).with_index_dir(index_dir);
224        Self::with_config(config)
225    }
226
227    /// Create an indexer with explicit storage and traversal filter implementations.
228    pub fn with_components(
229        config: SimpleIndexerConfig,
230        storage: Arc<dyn IndexStorage>,
231        filter: Arc<dyn TraversalFilter>,
232    ) -> Self {
233        Self {
234            config,
235            index_cache: HashMap::new(),
236            storage,
237            filter,
238        }
239    }
240
241    /// Replace the storage backend used to persist index entries.
242    pub fn with_storage(self, storage: Arc<dyn IndexStorage>) -> Self {
243        Self { storage, ..self }
244    }
245
246    /// Replace the traversal filter used to decide which files and directories are indexed.
247    pub fn with_filter(self, filter: Arc<dyn TraversalFilter>) -> Self {
248        Self { filter, ..self }
249    }
250
251    /// Initialize the index directory.
252    pub fn init(&self) -> Result<()> {
253        self.storage.init(self.config.index_dir())
254    }
255
256    /// Get the workspace root path.
257    pub fn workspace_root(&self) -> &Path {
258        self.config.workspace_root()
259    }
260
261    /// Get the index directory used for persisted metadata.
262    pub fn index_dir(&self) -> &Path {
263        self.config.index_dir()
264    }
265
266    /// Index a single file.
267    pub fn index_file(&mut self, file_path: &Path) -> Result<()> {
268        if !file_path.exists() || !self.filter.should_index_file(file_path, &self.config) {
269            return Ok(());
270        }
271
272        let content = match fs::read_to_string(file_path) {
273            Ok(text) => text,
274            Err(err) => {
275                if err.kind() == ErrorKind::InvalidData {
276                    return Ok(());
277                }
278                return Err(err.into());
279            }
280        };
281        let hash = calculate_hash(&content);
282        let modified = self.get_modified_time(file_path)?;
283        let size = content.len() as u64;
284        let language = self.detect_language(file_path);
285
286        let index = FileIndex {
287            path: file_path.to_string_lossy().to_string(),
288            hash,
289            modified,
290            size,
291            language,
292            tags: vec![],
293        };
294
295        self.index_cache
296            .insert(file_path.to_string_lossy().to_string(), index.clone());
297
298        self.storage.persist(self.config.index_dir(), &index)?;
299
300        Ok(())
301    }
302
303    /// Index all files in directory recursively.
304    pub fn index_directory(&mut self, dir_path: &Path) -> Result<()> {
305        let mut file_paths = Vec::new();
306
307        // First pass: collect all file paths.
308        self.walk_directory(dir_path, &mut |file_path| {
309            file_paths.push(file_path.to_path_buf());
310            Ok(())
311        })?;
312
313        // Second pass: index each file.
314        for file_path in file_paths {
315            self.index_file(&file_path)?;
316        }
317
318        Ok(())
319    }
320
321    /// Search files using regex pattern.
322    pub fn search(&self, pattern: &str, path_filter: Option<&str>) -> Result<Vec<SearchResult>> {
323        let regex = Regex::new(pattern)?;
324
325        let mut results = Vec::new();
326
327        // Search through indexed files.
328        for file_path in self.index_cache.keys() {
329            if path_filter.is_some_and(|filter| !file_path.contains(filter)) {
330                continue;
331            }
332
333            if let Ok(content) = fs::read_to_string(file_path) {
334                for (line_num, line) in content.lines().enumerate() {
335                    if regex.is_match(line) {
336                        let matches: Vec<String> = regex
337                            .find_iter(line)
338                            .map(|m| m.as_str().to_string())
339                            .collect();
340
341                        results.push(SearchResult {
342                            file_path: file_path.clone(),
343                            line_number: line_num + 1,
344                            line_content: line.to_string(),
345                            matches,
346                        });
347                    }
348                }
349            }
350        }
351
352        Ok(results)
353    }
354
355    /// Find files by name pattern.
356    pub fn find_files(&self, pattern: &str) -> Result<Vec<String>> {
357        let regex = Regex::new(pattern)?;
358        let mut results = Vec::new();
359
360        for file_path in self.index_cache.keys() {
361            if regex.is_match(file_path) {
362                results.push(file_path.clone());
363            }
364        }
365
366        Ok(results)
367    }
368
369    /// Get file content with line numbers.
370    pub fn get_file_content(
371        &self,
372        file_path: &str,
373        start_line: Option<usize>,
374        end_line: Option<usize>,
375    ) -> Result<String> {
376        let content = fs::read_to_string(file_path)?;
377        let lines: Vec<&str> = content.lines().collect();
378
379        let start = start_line.unwrap_or(1).saturating_sub(1);
380        let end = end_line.unwrap_or(lines.len());
381
382        let selected_lines = &lines[start..end.min(lines.len())];
383
384        let mut result = String::new();
385        for (i, line) in selected_lines.iter().enumerate() {
386            result.push_str(&format!("{}: {}\n", start + i + 1, line));
387        }
388
389        Ok(result)
390    }
391
392    /// List files in directory (like ls).
393    pub fn list_files(&self, dir_path: &str, show_hidden: bool) -> Result<Vec<String>> {
394        let path = Path::new(dir_path);
395        if !path.exists() {
396            return Ok(vec![]);
397        }
398
399        let mut files = Vec::new();
400
401        for entry in fs::read_dir(path)? {
402            let entry = entry?;
403            let file_name = entry.file_name().to_string_lossy().to_string();
404
405            if !show_hidden && file_name.starts_with('.') {
406                continue;
407            }
408
409            files.push(file_name);
410        }
411
412        Ok(files)
413    }
414
415    /// Grep-like search (like grep command).
416    pub fn grep(&self, pattern: &str, file_pattern: Option<&str>) -> Result<Vec<SearchResult>> {
417        let regex = Regex::new(pattern)?;
418        let mut results = Vec::new();
419
420        for file_path in self.index_cache.keys() {
421            if file_pattern.is_some_and(|fp| !file_path.contains(fp)) {
422                continue;
423            }
424
425            if let Ok(content) = fs::read_to_string(file_path) {
426                for (line_num, line) in content.lines().enumerate() {
427                    if regex.is_match(line) {
428                        results.push(SearchResult {
429                            file_path: file_path.clone(),
430                            line_number: line_num + 1,
431                            line_content: line.to_string(),
432                            matches: vec![line.to_string()],
433                        });
434                    }
435                }
436            }
437        }
438
439        Ok(results)
440    }
441
442    fn walk_directory<F>(&mut self, dir_path: &Path, callback: &mut F) -> Result<()>
443    where
444        F: FnMut(&Path) -> Result<()>,
445    {
446        if !dir_path.exists() {
447            return Ok(());
448        }
449
450        self.walk_directory_internal(dir_path, callback)
451    }
452
453    fn walk_directory_internal<F>(&mut self, dir_path: &Path, callback: &mut F) -> Result<()>
454    where
455        F: FnMut(&Path) -> Result<()>,
456    {
457        for entry in fs::read_dir(dir_path)? {
458            let entry = entry?;
459            let path = entry.path();
460
461            if path.is_dir() {
462                if self.is_allowed_dir(&path) {
463                    self.walk_directory_internal(&path, callback)?;
464                    continue;
465                }
466
467                if !self.filter.should_descend(&path, &self.config) {
468                    self.walk_allowed_descendants(&path, callback)?;
469                    continue;
470                }
471
472                self.walk_directory_internal(&path, callback)?;
473            } else if path.is_file() {
474                callback(&path)?;
475            }
476        }
477
478        Ok(())
479    }
480
481    fn is_allowed_dir(&self, path: &Path) -> bool {
482        self.config
483            .allowed_dirs
484            .iter()
485            .any(|allowed| path.starts_with(allowed))
486    }
487
488    fn walk_allowed_descendants<F>(&mut self, dir_path: &Path, callback: &mut F) -> Result<()>
489    where
490        F: FnMut(&Path) -> Result<()>,
491    {
492        let allowed_dirs = self.config.allowed_dirs.clone();
493        for allowed in allowed_dirs {
494            if allowed.starts_with(dir_path) && allowed.exists() {
495                self.walk_directory_internal(&allowed, callback)?;
496            }
497        }
498        Ok(())
499    }
500
501    fn get_modified_time(&self, file_path: &Path) -> Result<u64> {
502        let metadata = fs::metadata(file_path)?;
503        let modified = metadata.modified()?;
504        Ok(modified.duration_since(SystemTime::UNIX_EPOCH)?.as_secs())
505    }
506
507    fn detect_language(&self, file_path: &Path) -> String {
508        file_path
509            .extension()
510            .and_then(|ext| ext.to_str())
511            .unwrap_or("unknown")
512            .to_string()
513    }
514}
515
516impl Clone for SimpleIndexer {
517    fn clone(&self) -> Self {
518        Self {
519            config: self.config.clone(),
520            index_cache: self.index_cache.clone(),
521            storage: self.storage.clone(),
522            filter: self.filter.clone(),
523        }
524    }
525}
526
527fn should_skip_dir(path: &Path, config: &SimpleIndexerConfig) -> bool {
528    if config
529        .allowed_dirs
530        .iter()
531        .any(|allowed| path.starts_with(allowed))
532    {
533        return false;
534    }
535
536    if config
537        .excluded_dirs
538        .iter()
539        .any(|excluded| path.starts_with(excluded))
540    {
541        return true;
542    }
543
544    if config.ignore_hidden
545        && path
546            .file_name()
547            .and_then(|name| name.to_str())
548            .is_some_and(|name_str| name_str.starts_with('.'))
549    {
550        return true;
551    }
552
553    false
554}
555
/// Hashes `content` with the standard library's default hasher and returns
/// the 64-bit result as lowercase hexadecimal (no zero padding).
///
/// NOTE(review): the standard library does not specify `DefaultHasher`'s
/// algorithm across releases — confirm that persisted hashes and index file
/// names are not expected to survive toolchain upgrades.
fn calculate_hash(content: &str) -> String {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    let mut state = DefaultHasher::new();
    content.hash(&mut state);
    let digest = state.finish();

    format!("{digest:x}")
}
564
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use std::sync::{Arc, Mutex};
    use tempfile::tempdir;

    /// With the default configuration, files inside hidden directories are
    /// not indexed while files in visible directories are.
    #[test]
    fn skips_hidden_directories_by_default() -> Result<()> {
        let sandbox = tempdir()?;
        let root = sandbox.path();

        let private_dir = root.join(".private");
        fs::create_dir_all(&private_dir)?;
        fs::write(private_dir.join("secret.txt"), "classified")?;

        let source_dir = root.join("src");
        fs::create_dir_all(&source_dir)?;
        fs::write(source_dir.join("lib.rs"), "fn main() {}")?;

        let mut indexer = SimpleIndexer::new(root.to_path_buf());
        indexer.init()?;
        indexer.index_directory(root)?;

        let hidden_hits = indexer.find_files("secret\\.txt$")?;
        let visible_hits = indexer.find_files("lib\\.rs$")?;
        assert!(hidden_hits.is_empty());
        assert!(!visible_hits.is_empty());

        Ok(())
    }

    /// Turning `ignore_hidden` off makes hidden directories indexable.
    #[test]
    fn can_include_hidden_directories_when_configured() -> Result<()> {
        let sandbox = tempdir()?;
        let root = sandbox.path();

        let cache_dir = root.join(".cache");
        fs::create_dir_all(&cache_dir)?;
        fs::write(cache_dir.join("data.log"), "details")?;

        let mut indexer = SimpleIndexer::with_config(
            SimpleIndexerConfig::new(root.to_path_buf()).ignore_hidden(false),
        );
        indexer.init()?;
        indexer.index_directory(root)?;

        let hits = indexer.find_files("data\\.log$")?;
        assert_eq!(hits.len(), 1);

        Ok(())
    }

    /// A custom storage backend receives every indexed entry.
    #[test]
    fn supports_custom_storage_backends() -> Result<()> {
        /// Storage that collects entries in a shared in-memory vector.
        #[derive(Clone, Default)]
        struct MemoryStorage {
            records: Arc<Mutex<Vec<FileIndex>>>,
        }

        impl MemoryStorage {
            fn new(records: Arc<Mutex<Vec<FileIndex>>>) -> Self {
                Self { records }
            }
        }

        impl IndexStorage for MemoryStorage {
            fn init(&self, _index_dir: &Path) -> Result<()> {
                Ok(())
            }

            fn persist(&self, _index_dir: &Path, entry: &FileIndex) -> Result<()> {
                self.records
                    .lock()
                    .expect("lock poisoned")
                    .push(entry.clone());
                Ok(())
            }
        }

        let sandbox = tempdir()?;
        let root = sandbox.path();
        let note_path = root.join("notes.txt");
        fs::write(&note_path, "remember this")?;

        let shared: Arc<Mutex<Vec<FileIndex>>> = Arc::new(Mutex::new(Vec::new()));
        let backend = MemoryStorage::new(Arc::clone(&shared));

        let mut indexer = SimpleIndexer::with_config(SimpleIndexerConfig::new(root.to_path_buf()))
            .with_storage(Arc::new(backend));
        indexer.init()?;
        indexer.index_directory(root)?;

        let stored = shared.lock().expect("lock poisoned");
        assert_eq!(stored.len(), 1);
        assert_eq!(stored[0].path, note_path.to_string_lossy().into_owned());

        Ok(())
    }

    /// A custom traversal filter can veto individual files.
    #[test]
    fn custom_filters_can_skip_files() -> Result<()> {
        /// Rejects `.rs` files and defers everything else to the default filter.
        #[derive(Default)]
        struct SkipRustFilter {
            inner: ConfigTraversalFilter,
        }

        impl TraversalFilter for SkipRustFilter {
            fn should_descend(&self, path: &Path, config: &SimpleIndexerConfig) -> bool {
                self.inner.should_descend(path, config)
            }

            fn should_index_file(&self, path: &Path, config: &SimpleIndexerConfig) -> bool {
                let is_rust = path
                    .extension()
                    .and_then(|ext| ext.to_str())
                    .is_some_and(|ext| ext.eq_ignore_ascii_case("rs"));

                !is_rust && self.inner.should_index_file(path, config)
            }
        }

        let sandbox = tempdir()?;
        let root = sandbox.path();
        fs::write(root.join("lib.rs"), "fn main() {}")?;
        fs::write(root.join("README.md"), "# Notes")?;

        let mut indexer = SimpleIndexer::with_config(SimpleIndexerConfig::new(root.to_path_buf()))
            .with_filter(Arc::new(SkipRustFilter::default()));
        indexer.init()?;
        indexer.index_directory(root)?;

        let rust_hits = indexer.find_files("lib\\.rs$")?;
        let readme_hits = indexer.find_files("README\\.md$")?;
        assert!(rust_hits.is_empty());
        assert!(!readme_hits.is_empty());

        Ok(())
    }
}
701}