hedl_cli/
file_discovery.rs

1// Dweve HEDL - Hierarchical Entity Data Language
2//
3// Copyright (c) 2025 Dweve IP B.V. and individual contributors.
4//
5// SPDX-License-Identifier: Apache-2.0
6//
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the LICENSE file at the
10// root of this repository or at: http://www.apache.org/licenses/LICENSE-2.0
11//
12// Unless required by applicable law or agreed to in writing, software
13// distributed under the License is distributed on an "AS IS" BASIS,
14// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15// See the License for the specific language governing permissions and
16// limitations under the License.
17
18//! File discovery with glob patterns and recursive traversal.
19//!
20//! This module provides efficient file discovery capabilities for batch processing,
21//! supporting glob patterns, recursive directory traversal, and various filtering options.
22//!
23//! # Features
24//!
25//! - **Glob Patterns**: Support for standard glob patterns (*, ?, [abc], **)
26//! - **Recursive Traversal**: Optional recursive directory traversal with depth limiting
27//! - **Filtering**: Extension, size, and hidden file filtering
28//! - **Symlinks**: Configurable symlink following behavior
29//! - **Error Handling**: Detailed error reporting for invalid patterns and I/O failures
30//!
31//! # Examples
32//!
33//! ```rust,no_run
34//! use hedl_cli::file_discovery::{FileDiscovery, DiscoveryConfig};
35//!
36//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
37//! // Discover all HEDL files in current directory
38//! let discovery = FileDiscovery::new(
39//!     vec!["*.hedl".to_string()],
40//!     DiscoveryConfig::default()
41//! );
42//! let files = discovery.discover()?;
43//!
//! // Recursive discovery with depth limit
//! // (`max_depth` is only consulted when `recursive` is enabled)
//! let discovery = FileDiscovery::new(
//!     vec!["**/*.hedl".to_string()],
//!     DiscoveryConfig {
//!         recursive: true,
//!         max_depth: Some(3),
//!         extension: Some("hedl".to_string()),
//!         ..Default::default()
//!     }
//! );
53//! let files = discovery.discover()?;
54//! # Ok(())
55//! # }
56//! ```
57
58use crate::error::CliError;
59use std::path::{Path, PathBuf};
60use walkdir::{DirEntry, WalkDir};
61
/// Configuration for file discovery.
///
/// Controls how files are discovered, including recursion depth, filtering,
/// and symlink handling. Every field is public, so a configuration is usually
/// built with struct-update syntax on top of [`DiscoveryConfig::default`].
#[derive(Debug, Clone)]
pub struct DiscoveryConfig {
    /// Maximum recursion depth for directory traversal.
    ///
    /// - `None`: Unlimited depth (use with caution)
    /// - `Some(0)`: Current directory only
    /// - `Some(n)`: Traverse up to n levels deep
    ///
    /// Only consulted by the recursive directory walk (i.e. when `recursive`
    /// is `true`); non-recursive glob expansion does not read it.
    pub max_depth: Option<usize>,

    /// Filter by file extension (without leading dot).
    ///
    /// Only files with this extension will be included.
    /// Example: `Some("hedl")` matches "file.hedl" but not "file.txt"
    ///
    /// The comparison is an exact string comparison, so it is case-sensitive.
    pub extension: Option<String>,

    /// Maximum file size in bytes.
    ///
    /// Files larger than this will be excluded; a file whose size is exactly
    /// this value is kept.
    pub max_file_size: Option<u64>,

    /// Follow symbolic links during traversal.
    ///
    /// Forwarded to the directory walker used for recursive traversal.
    ///
    /// NOTE(review): non-recursive glob expansion does not consult this flag,
    /// and candidate files are checked with `Path::is_file`, which resolves
    /// symlinks — confirm whether symlinked files are meant to be excluded
    /// when this is `false`.
    pub follow_links: bool,

    /// Include hidden files (starting with '.').
    ///
    /// When false, hidden files and directories are skipped.
    pub include_hidden: bool,

    /// Enable recursive directory traversal.
    ///
    /// When false, only process files matching patterns directly,
    /// don't traverse directories.
    ///
    /// When true, a pattern that does not contain `**` is rewritten so that
    /// its file-name part is also searched for in subdirectories of the
    /// pattern's base directory.
    pub recursive: bool,
}
102
103impl Default for DiscoveryConfig {
104    fn default() -> Self {
105        Self {
106            max_depth: Some(10),
107            extension: None,
108            max_file_size: None,
109            follow_links: false,
110            include_hidden: false,
111            recursive: false,
112        }
113    }
114}
115
/// File discovery engine with glob pattern support.
///
/// Discovers files matching specified patterns with configurable filtering
/// and traversal options. Construct it with [`FileDiscovery::new`] and run it
/// with [`FileDiscovery::discover`].
#[derive(Debug)]
pub struct FileDiscovery {
    /// Glob patterns or explicit paths, processed in the order given.
    patterns: Vec<String>,
    /// Traversal and filtering options applied to every pattern.
    config: DiscoveryConfig,
}
125
126impl FileDiscovery {
127    /// Create a new file discovery instance.
128    ///
129    /// # Arguments
130    ///
131    /// * `patterns` - List of file patterns (glob patterns or explicit paths)
132    /// * `config` - Discovery configuration
133    ///
134    /// # Examples
135    ///
136    /// ```rust
137    /// use hedl_cli::file_discovery::{FileDiscovery, DiscoveryConfig};
138    ///
139    /// let discovery = FileDiscovery::new(
140    ///     vec!["*.hedl".to_string(), "data/*.hedl".to_string()],
141    ///     DiscoveryConfig::default()
142    /// );
143    /// ```
144    #[must_use]
145    pub fn new(patterns: Vec<String>, config: DiscoveryConfig) -> Self {
146        Self { patterns, config }
147    }
148
149    /// Validate all patterns before discovery.
150    ///
151    /// Checks that patterns are valid glob expressions.
152    ///
153    /// # Returns
154    ///
155    /// * `Ok(())` - All patterns are valid
156    /// * `Err(CliError::GlobPattern)` - Invalid pattern found
157    ///
158    /// # Examples
159    ///
160    /// ```rust
161    /// use hedl_cli::file_discovery::{FileDiscovery, DiscoveryConfig};
162    ///
163    /// let discovery = FileDiscovery::new(
164    ///     vec!["*.hedl".to_string()],
165    ///     DiscoveryConfig::default()
166    /// );
167    /// assert!(discovery.validate_patterns().is_ok());
168    /// ```
169    pub fn validate_patterns(&self) -> Result<(), CliError> {
170        for pattern in &self.patterns {
171            if let Err(e) = glob::Pattern::new(pattern) {
172                return Err(CliError::GlobPattern {
173                    pattern: pattern.clone(),
174                    message: e.to_string(),
175                });
176            }
177        }
178        Ok(())
179    }
180
181    /// Discover all files matching the patterns.
182    ///
183    /// Expands glob patterns and applies configured filters.
184    ///
185    /// # Returns
186    ///
187    /// * `Ok(Vec<PathBuf>)` - List of discovered file paths
188    /// * `Err(CliError)` - On pattern errors, I/O failures, or no matches
189    ///
190    /// # Errors
191    ///
192    /// Returns error if:
193    /// - Pattern is invalid
194    /// - No files match any pattern
195    /// - Directory traversal fails
196    ///
197    /// # Examples
198    ///
199    /// ```rust,no_run
200    /// use hedl_cli::file_discovery::{FileDiscovery, DiscoveryConfig};
201    ///
202    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
203    /// let discovery = FileDiscovery::new(
204    ///     vec!["tests/*.hedl".to_string()],
205    ///     DiscoveryConfig::default()
206    /// );
207    /// let files = discovery.discover()?;
208    /// println!("Found {} files", files.len());
209    /// # Ok(())
210    /// # }
211    /// ```
212    pub fn discover(&self) -> Result<Vec<PathBuf>, CliError> {
213        // If no patterns provided, return empty (nothing to discover = success)
214        if self.patterns.is_empty() {
215            return Ok(Vec::new());
216        }
217
218        // Validate patterns first
219        self.validate_patterns()?;
220
221        let mut all_files = Vec::new();
222
223        for pattern in &self.patterns {
224            let pattern_files = if self.config.recursive && pattern.contains("**") {
225                // Recursive glob pattern
226                self.discover_recursive_glob(pattern)?
227            } else if self.config.recursive {
228                // Recursive but not using ** syntax
229                self.discover_recursive_simple(pattern)?
230            } else {
231                // Simple glob pattern (no recursion)
232                self.discover_simple_glob(pattern)?
233            };
234
235            all_files.extend(pattern_files);
236        }
237
238        // Remove duplicates while preserving order
239        let mut seen = std::collections::HashSet::new();
240        all_files.retain(|path| seen.insert(path.clone()));
241
242        if all_files.is_empty() {
243            return Err(CliError::NoFilesMatched {
244                patterns: self.patterns.clone(),
245            });
246        }
247
248        Ok(all_files)
249    }
250
251    /// Discover files using simple glob pattern (no recursion).
252    fn discover_simple_glob(&self, pattern: &str) -> Result<Vec<PathBuf>, CliError> {
253        let mut files = Vec::new();
254
255        for entry in glob::glob(pattern).map_err(|e| CliError::GlobPattern {
256            pattern: pattern.to_string(),
257            message: e.to_string(),
258        })? {
259            let path = entry.map_err(|e| CliError::DirectoryTraversal {
260                path: PathBuf::from(pattern),
261                message: e.to_string(),
262            })?;
263
264            if self.should_include_file(&path)? {
265                files.push(path);
266            }
267        }
268
269        Ok(files)
270    }
271
272    /// Discover files using recursive glob with ** syntax.
273    fn discover_recursive_glob(&self, pattern: &str) -> Result<Vec<PathBuf>, CliError> {
274        // For ** patterns, we need to manually walk directories
275        // Extract base directory from pattern
276        let base_dir = self.extract_base_dir(pattern);
277
278        let mut files = Vec::new();
279
280        let walker = WalkDir::new(&base_dir)
281            .follow_links(self.config.follow_links)
282            .max_depth(self.config.max_depth.unwrap_or(usize::MAX));
283
284        let glob_pattern = glob::Pattern::new(pattern).map_err(|e| CliError::GlobPattern {
285            pattern: pattern.to_string(),
286            message: e.to_string(),
287        })?;
288
289        for entry in walker {
290            let entry = entry.map_err(|e| CliError::DirectoryTraversal {
291                path: base_dir.clone(),
292                message: e.to_string(),
293            })?;
294
295            if !self.should_include_entry(&entry) {
296                continue;
297            }
298
299            let path = entry.path();
300            if path.is_file()
301                && glob_pattern.matches_path(path)
302                && self.should_include_file(path)?
303            {
304                files.push(path.to_path_buf());
305            }
306        }
307
308        Ok(files)
309    }
310
311    /// Discover files recursively with simple pattern.
312    fn discover_recursive_simple(&self, pattern: &str) -> Result<Vec<PathBuf>, CliError> {
313        // Convert simple pattern to recursive glob
314        let base_dir = self.extract_base_dir(pattern);
315        let filename_pattern = PathBuf::from(pattern)
316            .file_name()
317            .map_or_else(|| pattern.to_string(), |s| s.to_string_lossy().to_string());
318
319        let recursive_pattern = if base_dir == std::path::Path::new(".") {
320            format!("**/{filename_pattern}")
321        } else {
322            format!("{}/**/{}", base_dir.display(), filename_pattern)
323        };
324
325        self.discover_recursive_glob(&recursive_pattern)
326    }
327
328    /// Extract base directory from a glob pattern.
329    fn extract_base_dir(&self, pattern: &str) -> PathBuf {
330        let path = PathBuf::from(pattern);
331
332        // Find the deepest ancestor that doesn't contain glob characters
333        for ancestor in path.ancestors() {
334            let s = ancestor.to_string_lossy();
335            // Skip if it contains glob characters
336            if s.contains('*') || s.contains('?') || s.contains('[') {
337                continue;
338            }
339            // Found a non-glob path - return it (or "." if empty)
340            if s.is_empty() {
341                return PathBuf::from(".");
342            }
343            return ancestor.to_path_buf();
344        }
345
346        // All parts contain globs, default to current directory
347        PathBuf::from(".")
348    }
349
350    /// Check if a directory entry should be included in traversal.
351    fn should_include_entry(&self, entry: &DirEntry) -> bool {
352        // Skip hidden files/directories if not configured to include them
353        if !self.config.include_hidden {
354            if let Some(name) = entry.file_name().to_str() {
355                if name.starts_with('.') && name != "." && name != ".." {
356                    return false;
357                }
358            }
359        }
360
361        true
362    }
363
364    /// Check if a file should be included based on filters.
365    fn should_include_file(&self, path: &Path) -> Result<bool, CliError> {
366        // Must be a regular file
367        if !path.is_file() {
368            return Ok(false);
369        }
370
371        // Check extension filter
372        if let Some(ref ext) = self.config.extension {
373            if path.extension().and_then(|s| s.to_str()) != Some(ext.as_str()) {
374                return Ok(false);
375            }
376        }
377
378        // Check file size filter
379        if let Some(max_size) = self.config.max_file_size {
380            let metadata = std::fs::metadata(path).map_err(|e| CliError::io_error(path, e))?;
381            if metadata.len() > max_size {
382                return Ok(false);
383            }
384        }
385
386        Ok(true)
387    }
388}
389
390#[cfg(test)]
391mod tests {
392    use super::*;
393    use std::fs;
394    use tempfile::TempDir;
395
396    fn create_test_files(dir: &Path, files: &[&str]) -> Result<(), std::io::Error> {
397        for file in files {
398            let path = dir.join(file);
399            if let Some(parent) = path.parent() {
400                fs::create_dir_all(parent)?;
401            }
402            fs::write(path, "test content")?;
403        }
404        Ok(())
405    }
406
407    #[test]
408    fn test_discovery_config_default() {
409        let config = DiscoveryConfig::default();
410        assert_eq!(config.max_depth, Some(10));
411        assert!(config.extension.is_none());
412        assert!(config.max_file_size.is_none());
413        assert!(!config.follow_links);
414        assert!(!config.include_hidden);
415        assert!(!config.recursive);
416    }
417
418    #[test]
419    fn test_validate_patterns_valid() {
420        let discovery = FileDiscovery::new(
421            vec!["*.hedl".to_string(), "test/*.hedl".to_string()],
422            DiscoveryConfig::default(),
423        );
424        assert!(discovery.validate_patterns().is_ok());
425    }
426
427    #[test]
428    fn test_validate_patterns_invalid() {
429        let discovery =
430            FileDiscovery::new(vec!["[invalid".to_string()], DiscoveryConfig::default());
431        let result = discovery.validate_patterns();
432        assert!(result.is_err());
433        if let Err(CliError::GlobPattern { pattern, .. }) = result {
434            assert_eq!(pattern, "[invalid");
435        }
436    }
437
438    #[test]
439    fn test_discover_simple_glob() -> Result<(), Box<dyn std::error::Error>> {
440        let temp_dir = TempDir::new()?;
441        create_test_files(temp_dir.path(), &["file1.hedl", "file2.hedl", "file3.txt"])?;
442
443        let pattern = format!("{}/*.hedl", temp_dir.path().display());
444        let discovery = FileDiscovery::new(vec![pattern], DiscoveryConfig::default());
445
446        let files = discovery.discover()?;
447        assert_eq!(files.len(), 2);
448
449        Ok(())
450    }
451
452    #[test]
453    fn test_discover_no_matches() {
454        let temp_dir = TempDir::new().unwrap();
455        let pattern = format!("{}/*.hedl", temp_dir.path().display());
456        let discovery = FileDiscovery::new(vec![pattern.clone()], DiscoveryConfig::default());
457
458        let result = discovery.discover();
459        assert!(result.is_err());
460        if let Err(CliError::NoFilesMatched { patterns }) = result {
461            assert_eq!(patterns, vec![pattern]);
462        }
463    }
464
465    #[test]
466    fn test_discover_recursive() -> Result<(), Box<dyn std::error::Error>> {
467        let temp_dir = TempDir::new()?;
468        create_test_files(
469            temp_dir.path(),
470            &[
471                "file1.hedl",
472                "dir1/file2.hedl",
473                "dir1/dir2/file3.hedl",
474                "dir1/file4.txt",
475            ],
476        )?;
477
478        let pattern = format!("{}/**/*.hedl", temp_dir.path().display());
479        let discovery = FileDiscovery::new(
480            vec![pattern],
481            DiscoveryConfig {
482                recursive: true,
483                ..Default::default()
484            },
485        );
486
487        let files = discovery.discover()?;
488        assert_eq!(files.len(), 3);
489
490        Ok(())
491    }
492
493    #[test]
494    fn test_discover_with_depth_limit() -> Result<(), Box<dyn std::error::Error>> {
495        let temp_dir = TempDir::new()?;
496        create_test_files(
497            temp_dir.path(),
498            &[
499                "file1.hedl",
500                "dir1/file2.hedl",
501                "dir1/dir2/file3.hedl",
502                "dir1/dir2/dir3/file4.hedl",
503            ],
504        )?;
505
506        let pattern = format!("{}/**/*.hedl", temp_dir.path().display());
507        let discovery = FileDiscovery::new(
508            vec![pattern],
509            DiscoveryConfig {
510                recursive: true,
511                max_depth: Some(2),
512                ..Default::default()
513            },
514        );
515
516        let files = discovery.discover()?;
517        // Should find file1.hedl and dir1/file2.hedl, but not deeper files
518        assert!(files.len() <= 3); // May include dir1/dir2/file3.hedl depending on depth counting
519
520        Ok(())
521    }
522
523    #[test]
524    fn test_discover_with_extension_filter() -> Result<(), Box<dyn std::error::Error>> {
525        let temp_dir = TempDir::new()?;
526        create_test_files(temp_dir.path(), &["file1.hedl", "file2.txt", "file3.hedl"])?;
527
528        let pattern = format!("{}/*", temp_dir.path().display());
529        let discovery = FileDiscovery::new(
530            vec![pattern],
531            DiscoveryConfig {
532                extension: Some("hedl".to_string()),
533                ..Default::default()
534            },
535        );
536
537        let files = discovery.discover()?;
538        assert_eq!(files.len(), 2);
539        assert!(files.iter().all(|p| p.extension().unwrap() == "hedl"));
540
541        Ok(())
542    }
543
544    #[test]
545    fn test_discover_hidden_files() -> Result<(), Box<dyn std::error::Error>> {
546        let temp_dir = TempDir::new()?;
547        create_test_files(
548            temp_dir.path(),
549            &["file1.hedl", ".hidden.hedl", "dir/.hidden2.hedl"],
550        )?;
551
552        let pattern = format!("{}/**/*.hedl", temp_dir.path().display());
553
554        // Without include_hidden
555        let discovery = FileDiscovery::new(
556            vec![pattern.clone()],
557            DiscoveryConfig {
558                recursive: true,
559                include_hidden: false,
560                ..Default::default()
561            },
562        );
563        let files = discovery.discover()?;
564        assert_eq!(files.len(), 1); // Only file1.hedl
565
566        // With include_hidden
567        let discovery = FileDiscovery::new(
568            vec![pattern],
569            DiscoveryConfig {
570                recursive: true,
571                include_hidden: true,
572                ..Default::default()
573            },
574        );
575        let files = discovery.discover()?;
576        assert!(files.len() >= 2); // file1.hedl and hidden files
577
578        Ok(())
579    }
580
581    #[test]
582    fn test_extract_base_dir() {
583        let discovery = FileDiscovery::new(vec![], DiscoveryConfig::default());
584
585        assert_eq!(discovery.extract_base_dir("*.hedl"), PathBuf::from("."));
586        assert_eq!(
587            discovery.extract_base_dir("dir/*.hedl"),
588            PathBuf::from("dir")
589        );
590        assert_eq!(
591            discovery.extract_base_dir("dir/subdir/*.hedl"),
592            PathBuf::from("dir/subdir")
593        );
594        assert_eq!(
595            discovery.extract_base_dir("**/file.hedl"),
596            PathBuf::from(".")
597        );
598    }
599
600    #[test]
601    fn test_multiple_patterns() -> Result<(), Box<dyn std::error::Error>> {
602        let temp_dir = TempDir::new()?;
603        create_test_files(
604            temp_dir.path(),
605            &["dir1/file1.hedl", "dir2/file2.hedl", "file3.hedl"],
606        )?;
607
608        let patterns = vec![
609            format!("{}/dir1/*.hedl", temp_dir.path().display()),
610            format!("{}/dir2/*.hedl", temp_dir.path().display()),
611        ];
612        let discovery = FileDiscovery::new(patterns, DiscoveryConfig::default());
613
614        let files = discovery.discover()?;
615        assert_eq!(files.len(), 2);
616
617        Ok(())
618    }
619
620    #[test]
621    fn test_deduplicate_files() -> Result<(), Box<dyn std::error::Error>> {
622        let temp_dir = TempDir::new()?;
623        create_test_files(temp_dir.path(), &["file1.hedl"])?;
624
625        // Same file matched by multiple patterns
626        let file_path = format!("{}/file1.hedl", temp_dir.path().display());
627        let patterns = vec![
628            file_path.clone(),
629            format!("{}/*.hedl", temp_dir.path().display()),
630        ];
631        let discovery = FileDiscovery::new(patterns, DiscoveryConfig::default());
632
633        let files = discovery.discover()?;
634        // Should be deduplicated to 1 file
635        assert_eq!(files.len(), 1);
636
637        Ok(())
638    }
639}