fast_yaml_cli/batch/
discovery.rs

1//! File discovery for batch processing.
2
3use std::collections::HashSet;
4use std::io::BufRead;
5use std::path::{Path, PathBuf};
6
7use globset::{Glob, GlobSet, GlobSetBuilder};
8
9use super::error::DiscoveryError;
10
/// Maximum number of lines accepted from stdin before discovery aborts.
///
/// `discover_from_reader` counts every line it reads against this limit,
/// including blank lines and `#` comments, and fails with
/// `DiscoveryError::TooManyPaths` once the count exceeds it.
const MAX_STDIN_PATHS: usize = 100_000;

/// Maximum length of a single stdin line after trimming.
///
/// The comparison uses `str::len`, i.e. bytes. Longer lines are skipped with
/// a warning on stderr rather than treated as an error.
const MAX_LINE_LENGTH: usize = 4096;

/// Maximum number of entries consumed from one glob expansion.
///
/// Every item the glob iterator yields (matches and per-entry errors alike)
/// counts; once the limit is exceeded the expansion stops with a warning.
const MAX_GLOB_MATCHES: usize = 100_000;
19
/// Settings that control which files the discovery engine picks up.
///
/// Start from [`DiscoveryConfig::new`] (or `Default`) and adjust individual
/// settings with the chainable `with_*` methods.
#[derive(Debug, Clone)]
pub struct DiscoveryConfig {
    /// Globs a file must match to be included (e.g., "*.yaml", "*.yml")
    pub include_patterns: Vec<String>,
    /// Globs for files/directories that must be left out (e.g., "**/vendor/**")
    pub exclude_patterns: Vec<String>,
    /// Depth limit for directory walks (None = unlimited)
    pub max_depth: Option<usize>,
    /// Whether directory walks visit hidden files/directories
    pub include_hidden: bool,
    /// Whether directory walks honour .gitignore rules
    pub respect_gitignore: bool,
    /// Whether directory walks follow symbolic links
    pub follow_symlinks: bool,
}

impl Default for DiscoveryConfig {
    /// YAML files only, depth capped at 100, hidden files skipped,
    /// .gitignore honoured, symlinks not followed.
    fn default() -> Self {
        Self {
            include_patterns: vec![String::from("*.yaml"), String::from("*.yml")],
            exclude_patterns: Vec::new(),
            max_depth: Some(100),
            include_hidden: false,
            respect_gitignore: true,
            follow_symlinks: false,
        }
    }
}

impl DiscoveryConfig {
    /// Shorthand for [`DiscoveryConfig::default`].
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Replace the include patterns and return the updated configuration.
    #[must_use]
    pub fn with_include_patterns(self, patterns: Vec<String>) -> Self {
        Self {
            include_patterns: patterns,
            ..self
        }
    }

    /// Replace the exclude patterns and return the updated configuration.
    #[must_use]
    pub fn with_exclude_patterns(self, patterns: Vec<String>) -> Self {
        Self {
            exclude_patterns: patterns,
            ..self
        }
    }

    /// Set the recursion depth limit (`None` removes the limit).
    #[must_use]
    pub const fn with_max_depth(mut self, depth: Option<usize>) -> Self {
        self.max_depth = depth;
        self
    }

    /// Remove the recursion depth limit (use with caution).
    #[must_use]
    pub const fn with_unlimited_depth(self) -> Self {
        self.with_max_depth(None)
    }

    /// Choose whether hidden files are included.
    #[must_use]
    pub const fn with_hidden(mut self, include: bool) -> Self {
        self.include_hidden = include;
        self
    }

    /// Choose whether .gitignore rules are respected.
    #[must_use]
    pub const fn with_gitignore(mut self, respect: bool) -> Self {
        self.respect_gitignore = respect;
        self
    }

    /// Choose whether symbolic links are followed.
    #[must_use]
    pub const fn with_follow_symlinks(mut self, follow: bool) -> Self {
        self.follow_symlinks = follow;
        self
    }
}
106
/// Records which discovery route produced a [`DiscoveredFile`].
///
/// A file reachable through several routes is reported once, tagged with the
/// route that found it first (deduplication keeps the first entry).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DiscoveryOrigin {
    /// The caller passed the file itself as a path argument
    DirectPath,
    /// The file was encountered while walking a directory argument
    DirectoryWalk,
    /// The file was produced by expanding an argument treated as a glob pattern
    GlobExpansion,
    /// The file path was read as a line from stdin (or another reader)
    StdinList,
}
119
/// A single discovery result: where the file lives and how it was found.
#[derive(Debug, Clone)]
pub struct DiscoveredFile {
    /// Path as returned by `Path::canonicalize`; also the deduplication key
    pub path: PathBuf,
    /// The route through which this file was first discovered
    pub origin: DiscoveryOrigin,
}
128
/// File discovery engine.
///
/// Holds the configuration together with the glob matchers compiled from it,
/// so patterns are parsed once in [`FileDiscovery::new`] rather than per file.
#[derive(Debug)]
pub struct FileDiscovery {
    // Settings consulted when walking directories (depth, hidden, gitignore, symlinks).
    config: DiscoveryConfig,
    // Compiled from `config.include_patterns`; matched against the file name only.
    include_matcher: GlobSet,
    // Compiled from `config.exclude_patterns`; matched against the path as given.
    exclude_matcher: GlobSet,
}
136
137impl FileDiscovery {
138    /// Create a new file discovery instance.
139    pub fn new(config: DiscoveryConfig) -> Result<Self, DiscoveryError> {
140        let include_matcher = build_globset(&config.include_patterns)?;
141        let exclude_matcher = build_globset(&config.exclude_patterns)?;
142
143        Ok(Self {
144            config,
145            include_matcher,
146            exclude_matcher,
147        })
148    }
149
150    /// Discover files from the given paths.
151    ///
152    /// Paths can be:
153    /// - Regular files (included directly if matching patterns)
154    /// - Directories (walked recursively)
155    /// - Glob patterns (expanded)
156    pub fn discover(&self, paths: &[PathBuf]) -> Result<Vec<DiscoveredFile>, DiscoveryError> {
157        // Heuristic: estimate 10 files per input path
158        let estimated_capacity = paths.len().saturating_mul(10);
159        let mut discovered = Vec::with_capacity(estimated_capacity);
160        let mut seen = HashSet::new();
161
162        for path in paths {
163            if path.exists() {
164                if path.is_file() {
165                    self.discover_file(
166                        path,
167                        DiscoveryOrigin::DirectPath,
168                        &mut discovered,
169                        &mut seen,
170                    )?;
171                } else if path.is_dir() {
172                    self.discover_directory(path, &mut discovered, &mut seen);
173                }
174            } else {
175                // Treat as glob pattern
176                self.discover_glob(&path.to_string_lossy(), &mut discovered, &mut seen);
177            }
178        }
179
180        Ok(discovered)
181    }
182
183    /// Discover files from stdin (one path per line).
184    pub fn discover_from_stdin(&self) -> Result<Vec<DiscoveredFile>, DiscoveryError> {
185        self.discover_from_reader(std::io::stdin().lock())
186    }
187
188    /// Discover files from any `BufRead` source (for testing).
189    pub fn discover_from_reader<R: BufRead>(
190        &self,
191        reader: R,
192    ) -> Result<Vec<DiscoveredFile>, DiscoveryError> {
193        let mut discovered = Vec::new();
194        let mut seen = HashSet::new();
195        let mut count = 0;
196
197        for line in reader.lines() {
198            let line = line.map_err(|e| DiscoveryError::StdinError { source: e })?;
199
200            count += 1;
201            if count > MAX_STDIN_PATHS {
202                return Err(DiscoveryError::TooManyPaths {
203                    max: MAX_STDIN_PATHS,
204                });
205            }
206
207            let trimmed = line.trim();
208
209            if trimmed.len() > MAX_LINE_LENGTH {
210                eprintln!("Warning: skipping line {count} (exceeds {MAX_LINE_LENGTH} chars)");
211                continue;
212            }
213
214            // Skip empty lines and comments
215            if trimmed.is_empty() || trimmed.starts_with('#') {
216                continue;
217            }
218
219            let path = PathBuf::from(trimmed);
220            if path.is_file() {
221                self.discover_file(
222                    &path,
223                    DiscoveryOrigin::StdinList,
224                    &mut discovered,
225                    &mut seen,
226                )?;
227            }
228        }
229
230        Ok(discovered)
231    }
232
233    /// Check if a single path should be included.
234    #[must_use]
235    pub fn should_include(&self, path: &Path) -> bool {
236        // Check exclude patterns first (match against full path)
237        if self.exclude_matcher.is_match(path) {
238            return false;
239        }
240
241        // Check include patterns (match against file name for extension patterns)
242        path.file_name()
243            .is_some_and(|file_name| self.include_matcher.is_match(file_name))
244    }
245
246    fn discover_file(
247        &self,
248        path: &Path,
249        origin: DiscoveryOrigin,
250        discovered: &mut Vec<DiscoveredFile>,
251        seen: &mut HashSet<PathBuf>,
252    ) -> Result<(), DiscoveryError> {
253        if !self.should_include(path) {
254            return Ok(());
255        }
256
257        // Canonicalize for deduplication
258        let canonical = path.canonicalize().map_err(|e| {
259            if e.kind() == std::io::ErrorKind::NotFound {
260                // Check if it's a broken symlink
261                if path.symlink_metadata().is_ok() {
262                    DiscoveryError::BrokenSymlink {
263                        path: path.to_path_buf(),
264                    }
265                } else {
266                    DiscoveryError::PathNotFound {
267                        path: path.to_path_buf(),
268                    }
269                }
270            } else if e.kind() == std::io::ErrorKind::PermissionDenied {
271                DiscoveryError::PermissionDenied {
272                    path: path.to_path_buf(),
273                }
274            } else {
275                DiscoveryError::IoError {
276                    path: path.to_path_buf(),
277                    source: e,
278                }
279            }
280        })?;
281
282        // Dedup by canonical path
283        if seen.insert(canonical.clone()) {
284            discovered.push(DiscoveredFile {
285                path: canonical,
286                origin,
287            });
288        }
289
290        Ok(())
291    }
292
293    fn discover_directory(
294        &self,
295        dir: &Path,
296        discovered: &mut Vec<DiscoveredFile>,
297        seen: &mut HashSet<PathBuf>,
298    ) {
299        let mut builder = ignore::WalkBuilder::new(dir);
300        builder
301            .hidden(!self.config.include_hidden)
302            .git_ignore(self.config.respect_gitignore)
303            .git_global(self.config.respect_gitignore)
304            .git_exclude(self.config.respect_gitignore)
305            .follow_links(self.config.follow_symlinks);
306
307        if let Some(depth) = self.config.max_depth {
308            builder.max_depth(Some(depth));
309        }
310
311        for entry in builder.build() {
312            let entry = match entry {
313                Ok(e) => e,
314                Err(e) => {
315                    // Log warning but continue processing
316                    eprintln!("Warning: failed to read entry: {e}");
317                    continue;
318                }
319            };
320
321            if entry.file_type().is_some_and(|ft| ft.is_file()) {
322                let path = entry.path();
323                // Ignore errors for individual files during directory walk
324                let _ = self.discover_file(path, DiscoveryOrigin::DirectoryWalk, discovered, seen);
325            }
326        }
327    }
328
329    fn discover_glob(
330        &self,
331        pattern: &str,
332        discovered: &mut Vec<DiscoveredFile>,
333        seen: &mut HashSet<PathBuf>,
334    ) {
335        let Ok(glob) = glob::glob(pattern) else {
336            eprintln!("Warning: invalid glob pattern: {pattern}");
337            return;
338        };
339
340        let mut match_count = 0;
341        for entry in glob {
342            match_count += 1;
343            if match_count > MAX_GLOB_MATCHES {
344                eprintln!(
345                    "Warning: glob pattern '{pattern}' exceeded {MAX_GLOB_MATCHES} matches, stopping"
346                );
347                break;
348            }
349
350            match entry {
351                Ok(path) => {
352                    if path.is_file() {
353                        // Ignore errors for individual files during glob expansion
354                        let _ = self.discover_file(
355                            &path,
356                            DiscoveryOrigin::GlobExpansion,
357                            discovered,
358                            seen,
359                        );
360                    }
361                }
362                Err(e) => {
363                    eprintln!("Warning: glob error: {e}");
364                }
365            }
366        }
367    }
368}
369
370fn build_globset(patterns: &[String]) -> Result<GlobSet, DiscoveryError> {
371    let mut builder = GlobSetBuilder::new();
372
373    for pattern in patterns {
374        let glob = Glob::new(pattern).map_err(|e| DiscoveryError::InvalidPattern {
375            pattern: pattern.clone(),
376            source: e,
377        })?;
378        builder.add(glob);
379    }
380
381    builder.build().map_err(|e| DiscoveryError::InvalidPattern {
382        pattern: "<combined>".to_string(),
383        source: e,
384    })
385}
386
// Unit tests for configuration, pattern matching and the discovery routes.
// Filesystem-backed tests build their fixtures inside a `TempDir`.
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    // Shared starting point so each test states only what it overrides.
    fn default_config() -> DiscoveryConfig {
        DiscoveryConfig::new()
    }

    // Pins the default value of every configuration field.
    #[test]
    fn test_config_default() {
        let config = DiscoveryConfig::default();
        assert_eq!(config.include_patterns, vec!["*.yaml", "*.yml"]);
        assert!(config.exclude_patterns.is_empty());
        assert_eq!(config.max_depth, Some(100));
        assert!(!config.include_hidden);
        assert!(config.respect_gitignore);
        assert!(!config.follow_symlinks);
    }

    // Every builder method overrides its own field.
    #[test]
    fn test_config_builder() {
        let config = DiscoveryConfig::new()
            .with_include_patterns(vec!["*.yml".to_string()])
            .with_exclude_patterns(vec!["**/vendor/**".to_string()])
            .with_max_depth(Some(5))
            .with_hidden(true)
            .with_gitignore(false)
            .with_follow_symlinks(true);

        assert_eq!(config.include_patterns, vec!["*.yml"]);
        assert_eq!(config.exclude_patterns, vec!["**/vendor/**"]);
        assert_eq!(config.max_depth, Some(5));
        assert!(config.include_hidden);
        assert!(!config.respect_gitignore);
        assert!(config.follow_symlinks);
    }

    // `should_include` is pure pattern matching, so these paths need not exist.
    #[test]
    fn test_include_pattern_yaml() {
        let config = default_config();
        let discovery = FileDiscovery::new(config).unwrap();

        assert!(discovery.should_include(Path::new("test.yaml")));
        assert!(discovery.should_include(Path::new("/path/to/test.yaml")));
    }

    #[test]
    fn test_include_pattern_yml() {
        let config = default_config();
        let discovery = FileDiscovery::new(config).unwrap();

        assert!(discovery.should_include(Path::new("test.yml")));
        assert!(discovery.should_include(Path::new("/path/to/test.yml")));
    }

    // An exclude match wins even though the file name matches an include pattern.
    #[test]
    fn test_exclude_pattern() {
        let config = default_config().with_exclude_patterns(vec!["**/vendor/**".to_string()]);
        let discovery = FileDiscovery::new(config).unwrap();

        assert!(!discovery.should_include(Path::new("vendor/test.yaml")));
        assert!(!discovery.should_include(Path::new("path/vendor/test.yaml")));
        assert!(discovery.should_include(Path::new("test.yaml")));
    }

    // Same exclude pattern, with the file nested deeper below `vendor/`.
    #[test]
    fn test_exclude_vendor() {
        let config = default_config().with_exclude_patterns(vec!["**/vendor/**".to_string()]);
        let discovery = FileDiscovery::new(config).unwrap();

        assert!(!discovery.should_include(Path::new("vendor/lib/config.yaml")));
        assert!(discovery.should_include(Path::new("src/config.yaml")));
    }

    // A file passed directly is reported with the `DirectPath` origin.
    #[test]
    fn test_discover_single_file() {
        let temp = TempDir::new().unwrap();
        let file = temp.path().join("test.yaml");
        fs::write(&file, "key: value").unwrap();

        let config = default_config();
        let discovery = FileDiscovery::new(config).unwrap();
        let files = discovery.discover(std::slice::from_ref(&file)).unwrap();

        assert_eq!(files.len(), 1);
        assert_eq!(files[0].origin, DiscoveryOrigin::DirectPath);
    }

    // A directory is walked recursively; the non-YAML `skip.txt` is filtered out.
    #[test]
    fn test_discover_directory() {
        let temp = TempDir::new().unwrap();
        fs::write(temp.path().join("root.yaml"), "a: 1").unwrap();
        fs::create_dir(temp.path().join("subdir")).unwrap();
        fs::write(temp.path().join("subdir/nested.yaml"), "b: 2").unwrap();
        fs::write(temp.path().join("subdir/skip.txt"), "c: 3").unwrap();

        let config = default_config();
        let discovery = FileDiscovery::new(config).unwrap();
        let files = discovery.discover(&[temp.path().to_path_buf()]).unwrap();

        assert_eq!(files.len(), 2);
        assert!(
            files
                .iter()
                .all(|f| f.origin == DiscoveryOrigin::DirectoryWalk)
        );
    }

    // The pattern string is not an existing path, so `discover` expands it as a glob.
    // Only `config.yaml` matches `*.yaml`; `data.yml` is outside the glob itself.
    #[test]
    fn test_discover_glob() {
        let temp = TempDir::new().unwrap();
        fs::write(temp.path().join("config.yaml"), "a: 1").unwrap();
        fs::write(temp.path().join("data.yml"), "b: 2").unwrap();
        fs::write(temp.path().join("readme.md"), "# README").unwrap();

        let pattern = format!("{}/*.yaml", temp.path().display());
        let config = default_config();
        let discovery = FileDiscovery::new(config).unwrap();
        let files = discovery.discover(&[PathBuf::from(pattern)]).unwrap();

        assert_eq!(files.len(), 1);
        assert_eq!(files[0].origin, DiscoveryOrigin::GlobExpansion);
    }

    // File and directory arguments can be mixed; each result keeps its own origin.
    #[test]
    fn test_discover_mixed_paths() {
        let temp = TempDir::new().unwrap();

        // Direct file
        let file = temp.path().join("direct.yaml");
        fs::write(&file, "a: 1").unwrap();

        // Directory
        let dir = temp.path().join("dir");
        fs::create_dir(&dir).unwrap();
        fs::write(dir.join("in_dir.yaml"), "b: 2").unwrap();

        let config = default_config();
        let discovery = FileDiscovery::new(config).unwrap();
        let files = discovery.discover(&[file, dir]).unwrap();

        assert_eq!(files.len(), 2);
        assert!(
            files
                .iter()
                .any(|f| f.origin == DiscoveryOrigin::DirectPath)
        );
        assert!(
            files
                .iter()
                .any(|f| f.origin == DiscoveryOrigin::DirectoryWalk)
        );
    }

    // With the default configuration a directory walk skips dot-files.
    #[test]
    fn test_hidden_files_excluded() {
        let temp = TempDir::new().unwrap();
        fs::write(temp.path().join(".hidden.yaml"), "a: 1").unwrap();
        fs::write(temp.path().join("visible.yaml"), "b: 2").unwrap();

        let config = default_config(); // include_hidden = false
        let discovery = FileDiscovery::new(config).unwrap();
        let files = discovery.discover(&[temp.path().to_path_buf()]).unwrap();

        assert_eq!(files.len(), 1);
        assert!(files[0].path.ends_with("visible.yaml"));
    }

    // Opting in via `with_hidden(true)` makes the walk report dot-files too.
    #[test]
    fn test_hidden_files_included() {
        let temp = TempDir::new().unwrap();
        fs::write(temp.path().join(".hidden.yaml"), "a: 1").unwrap();
        fs::write(temp.path().join("visible.yaml"), "b: 2").unwrap();

        let config = default_config().with_hidden(true);
        let discovery = FileDiscovery::new(config).unwrap();
        let files = discovery.discover(&[temp.path().to_path_buf()]).unwrap();

        assert_eq!(files.len(), 2);
    }

    // Depends on an external `git` binary; returns early (passing) when it is missing.
    #[test]
    fn test_gitignore_respected() {
        // Skip if git is not available
        if std::process::Command::new("git")
            .args(["--version"])
            .output()
            .is_err()
        {
            eprintln!("Skipping test_gitignore_respected: git not available");
            return;
        }

        let temp = TempDir::new().unwrap();

        // Initialize a git repo - required for ignore crate to respect .gitignore
        // NOTE(review): only spawning is checked; the exit status of `git init` is not.
        std::process::Command::new("git")
            .args(["init"])
            .current_dir(temp.path())
            .output()
            .unwrap();

        fs::write(temp.path().join(".gitignore"), "ignored.yaml\n").unwrap();
        fs::write(temp.path().join("ignored.yaml"), "a: 1").unwrap();
        fs::write(temp.path().join("included.yaml"), "b: 2").unwrap();

        let config = default_config(); // respect_gitignore = true
        let discovery = FileDiscovery::new(config).unwrap();
        let files = discovery.discover(&[temp.path().to_path_buf()]).unwrap();

        // Only included.yaml should be found (ignored.yaml is gitignored)
        assert_eq!(files.len(), 1);
        assert!(files[0].path.ends_with("included.yaml"));
    }

    // Builds root/level1/level2 with one YAML file per level, then caps the depth.
    #[test]
    fn test_max_depth() {
        let temp = TempDir::new().unwrap();
        fs::write(temp.path().join("root.yaml"), "a: 1").unwrap();

        let level1 = temp.path().join("level1");
        fs::create_dir(&level1).unwrap();
        fs::write(level1.join("l1.yaml"), "b: 2").unwrap();

        let level2 = level1.join("level2");
        fs::create_dir(&level2).unwrap();
        fs::write(level2.join("l2.yaml"), "c: 3").unwrap();

        // max_depth = 1 should only find root.yaml
        let config = default_config().with_max_depth(Some(1));
        let discovery = FileDiscovery::new(config).unwrap();
        let files = discovery.discover(&[temp.path().to_path_buf()]).unwrap();

        assert_eq!(files.len(), 1);
        assert!(files[0].path.ends_with("root.yaml"));
    }

    // The same file supplied twice is reported once.
    #[test]
    fn test_deduplication() {
        let temp = TempDir::new().unwrap();
        let file = temp.path().join("test.yaml");
        fs::write(&file, "key: value").unwrap();

        let config = default_config();
        let discovery = FileDiscovery::new(config).unwrap();

        // Provide the same file twice
        let files = discovery.discover(&[file.clone(), file]).unwrap();

        // Should only be discovered once
        assert_eq!(files.len(), 1);
    }

    // A malformed glob is rejected at construction time, not during discovery.
    #[test]
    fn test_invalid_pattern_error() {
        let config = default_config().with_include_patterns(vec!["[invalid".to_string()]);

        let result = FileDiscovery::new(config);
        assert!(result.is_err());
        assert!(
            result
                .unwrap_err()
                .to_string()
                .contains("invalid glob pattern")
        );
    }

    // A `Cursor` stands in for stdin; results carry the `StdinList` origin.
    #[test]
    fn test_discover_from_reader_valid_paths() {
        let temp = TempDir::new().unwrap();
        let file = temp.path().join("test.yaml");
        fs::write(&file, "key: value").unwrap();

        let input = format!("{}\n", file.display());
        let reader = std::io::Cursor::new(input);

        let config = default_config();
        let discovery = FileDiscovery::new(config).unwrap();
        let files = discovery.discover_from_reader(reader).unwrap();

        assert_eq!(files.len(), 1);
        assert_eq!(files[0].origin, DiscoveryOrigin::StdinList);
    }

    // `#` comment lines and blank lines in the list are ignored.
    #[test]
    fn test_discover_from_reader_comments_and_empty_lines() {
        let temp = TempDir::new().unwrap();
        let file = temp.path().join("test.yaml");
        fs::write(&file, "key: value").unwrap();

        let input = format!("# comment\n\n{}\n# another comment\n", file.display());
        let reader = std::io::Cursor::new(input);

        let config = default_config();
        let discovery = FileDiscovery::new(config).unwrap();
        let files = discovery.discover_from_reader(reader).unwrap();

        assert_eq!(files.len(), 1);
    }

    // Feeds MAX_STDIN_PATHS + 1 lines (inclusive range) to trip the limit.
    #[test]
    fn test_discover_from_reader_too_many_paths() {
        let temp = TempDir::new().unwrap();
        let file = temp.path().join("test.yaml");
        fs::write(&file, "key: value").unwrap();

        let mut input = String::new();
        for _ in 0..=MAX_STDIN_PATHS {
            use std::fmt::Write;
            writeln!(&mut input, "{}", file.display()).unwrap();
        }
        let reader = std::io::Cursor::new(input);

        let config = default_config();
        let discovery = FileDiscovery::new(config).unwrap();
        let result = discovery.discover_from_reader(reader);

        assert!(result.is_err());
        let err = result.unwrap_err();
        assert!(err.to_string().contains("exceeded maximum"));
    }

    // An over-long line is skipped without failing the rest of the list.
    #[test]
    fn test_discover_from_reader_long_line_skipped() {
        let temp = TempDir::new().unwrap();
        let file = temp.path().join("test.yaml");
        fs::write(&file, "key: value").unwrap();

        let long_line = "x".repeat(MAX_LINE_LENGTH + 1);
        let input = format!("{}\n{}\n", long_line, file.display());
        let reader = std::io::Cursor::new(input);

        let config = default_config();
        let discovery = FileDiscovery::new(config).unwrap();
        let files = discovery.discover_from_reader(reader).unwrap();

        // Long line should be skipped, only valid file should be found
        assert_eq!(files.len(), 1);
    }

    // NOTE(review): this test has an empty body and asserts nothing, so it
    // always passes; the permission-denied path is not covered here.
    #[test]
    fn test_permission_denied_continues() {
        // Testing permission errors requires platform-specific setup
        // This is better suited for integration tests
    }
}