Skip to main content

hedl_cli/
file_discovery.rs

1// Dweve HEDL - Hierarchical Entity Data Language
2//
3// Copyright (c) 2025 Dweve IP B.V. and individual contributors.
4//
5// SPDX-License-Identifier: Apache-2.0
6//
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the LICENSE file at the
10// root of this repository or at: http://www.apache.org/licenses/LICENSE-2.0
11//
12// Unless required by applicable law or agreed to in writing, software
13// distributed under the License is distributed on an "AS IS" BASIS,
14// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15// See the License for the specific language governing permissions and
16// limitations under the License.
17
18//! File discovery with glob patterns and recursive traversal.
19//!
20//! This module provides efficient file discovery capabilities for batch processing,
21//! supporting glob patterns, recursive directory traversal, and various filtering options.
22//!
23//! # Features
24//!
25//! - **Glob Patterns**: Support for standard glob patterns (`*`, `?`, `[abc]`, `**`)
26//! - **Recursive Traversal**: Optional recursive directory traversal with depth limiting
27//! - **Filtering**: Extension, size, and hidden file filtering
28//! - **Symlinks**: Configurable symlink following behavior
29//! - **Error Handling**: Detailed error reporting for invalid patterns and I/O failures
30//!
31//! # Examples
32//!
33//! ```rust,no_run
34//! use hedl_cli::file_discovery::{FileDiscovery, DiscoveryConfig};
35//!
36//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
37//! // Discover all HEDL files in current directory
38//! let discovery = FileDiscovery::new(
39//!     vec!["*.hedl".to_string()],
40//!     DiscoveryConfig::default()
41//! );
42//! let files = discovery.discover()?;
43//!
44//! // Recursive discovery with depth limit
45//! let discovery = FileDiscovery::new(
46//!     vec!["**/*.hedl".to_string()],
47//!     DiscoveryConfig {
48//!         max_depth: Some(3),
49//!         extension: Some("hedl".to_string()),
50//!         ..Default::default()
51//!     }
52//! );
53//! let files = discovery.discover()?;
54//! # Ok(())
55//! # }
56//! ```
57
58use crate::error::CliError;
59use std::path::{Path, PathBuf};
60use walkdir::{DirEntry, WalkDir};
61
/// Configuration for file discovery.
///
/// Controls how files are discovered, including recursion depth, filtering,
/// and symlink handling.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DiscoveryConfig {
    /// Maximum recursion depth for directory traversal.
    ///
    /// Only consulted when `recursive` is enabled. The value is handed
    /// unchanged to `walkdir::WalkDir::max_depth`, which counts the base
    /// directory of a pattern as depth 0 and its direct children as depth 1.
    ///
    /// - `None`: Unlimited depth (use with caution)
    /// - `Some(0)`: Only the base directory entry itself is visited, so no
    ///   files inside it are found
    /// - `Some(1)`: Files located directly in the base directory
    /// - `Some(n)`: Files located up to `n` levels below the base directory
    pub max_depth: Option<usize>,

    /// Filter by file extension (without leading dot).
    ///
    /// Only files with this extension will be included. The comparison is
    /// exact (case-sensitive).
    /// Example: `Some("hedl")` matches "file.hedl" but not "file.txt"
    pub extension: Option<String>,

    /// Maximum file size in bytes.
    ///
    /// Files larger than this will be excluded; a file of exactly this size
    /// is kept.
    pub max_file_size: Option<u64>,

    /// Follow symbolic links during recursive traversal.
    ///
    /// When false, symbolic links are not followed while walking directories.
    pub follow_links: bool,

    /// Include hidden files (starting with '.').
    ///
    /// When false, entries whose name starts with '.' are skipped during
    /// recursive traversal.
    pub include_hidden: bool,

    /// Enable recursive directory traversal.
    ///
    /// When false, only process files matching patterns directly,
    /// don't traverse directories.
    pub recursive: bool,
}

impl Default for DiscoveryConfig {
    /// Non-recursive discovery with a depth cap of 10, no extension or size
    /// filter, symlinks not followed, and hidden entries excluded.
    fn default() -> Self {
        Self {
            max_depth: Some(10),
            extension: None,
            max_file_size: None,
            follow_links: false,
            include_hidden: false,
            recursive: false,
        }
    }
}
115
116/// File discovery engine with glob pattern support.
117///
118/// Discovers files matching specified patterns with configurable filtering
119/// and traversal options.
120#[derive(Debug)]
121pub struct FileDiscovery {
122    patterns: Vec<String>,
123    config: DiscoveryConfig,
124}
125
126impl FileDiscovery {
127    /// Create a new file discovery instance.
128    ///
129    /// # Arguments
130    ///
131    /// * `patterns` - List of file patterns (glob patterns or explicit paths)
132    /// * `config` - Discovery configuration
133    ///
134    /// # Examples
135    ///
136    /// ```rust
137    /// use hedl_cli::file_discovery::{FileDiscovery, DiscoveryConfig};
138    ///
139    /// let discovery = FileDiscovery::new(
140    ///     vec!["*.hedl".to_string(), "data/*.hedl".to_string()],
141    ///     DiscoveryConfig::default()
142    /// );
143    /// ```
144    #[must_use]
145    pub fn new(patterns: Vec<String>, config: DiscoveryConfig) -> Self {
146        Self { patterns, config }
147    }
148
149    /// Validate all patterns before discovery.
150    ///
151    /// Checks that patterns are valid glob expressions.
152    ///
153    /// # Returns
154    ///
155    /// * `Ok(())` - All patterns are valid
156    /// * `Err(CliError::GlobPattern)` - Invalid pattern found
157    ///
158    /// # Examples
159    ///
160    /// ```rust
161    /// use hedl_cli::file_discovery::{FileDiscovery, DiscoveryConfig};
162    ///
163    /// let discovery = FileDiscovery::new(
164    ///     vec!["*.hedl".to_string()],
165    ///     DiscoveryConfig::default()
166    /// );
167    /// assert!(discovery.validate_patterns().is_ok());
168    /// ```
169    pub fn validate_patterns(&self) -> Result<(), CliError> {
170        for pattern in &self.patterns {
171            if let Err(e) = glob::Pattern::new(pattern) {
172                return Err(CliError::GlobPattern {
173                    pattern: pattern.clone(),
174                    message: e.to_string(),
175                });
176            }
177        }
178        Ok(())
179    }
180
181    /// Discover all files matching the patterns.
182    ///
183    /// Expands glob patterns and applies configured filters.
184    ///
185    /// # Returns
186    ///
187    /// * `Ok(Vec<PathBuf>)` - List of discovered file paths
188    /// * `Err(CliError)` - On pattern errors, I/O failures, or no matches
189    ///
190    /// # Errors
191    ///
192    /// Returns error if:
193    /// - Pattern is invalid
194    /// - No files match any pattern
195    /// - Directory traversal fails
196    ///
197    /// # Examples
198    ///
199    /// ```rust,no_run
200    /// use hedl_cli::file_discovery::{FileDiscovery, DiscoveryConfig};
201    ///
202    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
203    /// let discovery = FileDiscovery::new(
204    ///     vec!["tests/*.hedl".to_string()],
205    ///     DiscoveryConfig::default()
206    /// );
207    /// let files = discovery.discover()?;
208    /// println!("Found {} files", files.len());
209    /// # Ok(())
210    /// # }
211    /// ```
212    pub fn discover(&self) -> Result<Vec<PathBuf>, CliError> {
213        // If no patterns provided, return empty (nothing to discover = success)
214        if self.patterns.is_empty() {
215            return Ok(Vec::new());
216        }
217
218        // Validate patterns first
219        self.validate_patterns()?;
220
221        let mut all_files = Vec::new();
222
223        for pattern in &self.patterns {
224            let pattern_files = if self.config.recursive && pattern.contains("**") {
225                // Recursive glob pattern
226                self.discover_recursive_glob(pattern)?
227            } else if self.config.recursive {
228                // Recursive but not using ** syntax
229                self.discover_recursive_simple(pattern)?
230            } else {
231                // Simple glob pattern (no recursion)
232                self.discover_simple_glob(pattern)?
233            };
234
235            all_files.extend(pattern_files);
236        }
237
238        // Remove duplicates while preserving order
239        let mut seen = std::collections::HashSet::new();
240        all_files.retain(|path| seen.insert(path.clone()));
241
242        if all_files.is_empty() {
243            return Err(CliError::NoFilesMatched {
244                patterns: self.patterns.clone(),
245            });
246        }
247
248        Ok(all_files)
249    }
250
251    /// Discover files using simple glob pattern (no recursion).
252    fn discover_simple_glob(&self, pattern: &str) -> Result<Vec<PathBuf>, CliError> {
253        let mut files = Vec::new();
254
255        for entry in glob::glob(pattern).map_err(|e| CliError::GlobPattern {
256            pattern: pattern.to_string(),
257            message: e.to_string(),
258        })? {
259            let path = entry.map_err(|e| CliError::DirectoryTraversal {
260                path: PathBuf::from(pattern),
261                message: e.to_string(),
262            })?;
263
264            if self.should_include_file(&path)? {
265                files.push(path);
266            }
267        }
268
269        Ok(files)
270    }
271
272    /// Discover files using recursive glob with ** syntax.
273    fn discover_recursive_glob(&self, pattern: &str) -> Result<Vec<PathBuf>, CliError> {
274        // For ** patterns, we need to manually walk directories
275        // Extract base directory from pattern
276        let base_dir = self.extract_base_dir(pattern);
277
278        let mut files = Vec::new();
279
280        let walker = WalkDir::new(&base_dir)
281            .follow_links(self.config.follow_links)
282            .max_depth(self.config.max_depth.unwrap_or(usize::MAX));
283
284        let glob_pattern = glob::Pattern::new(pattern).map_err(|e| CliError::GlobPattern {
285            pattern: pattern.to_string(),
286            message: e.to_string(),
287        })?;
288
289        for entry in walker {
290            let entry = entry.map_err(|e| CliError::DirectoryTraversal {
291                path: base_dir.clone(),
292                message: e.to_string(),
293            })?;
294
295            if !self.should_include_entry(&entry) {
296                continue;
297            }
298
299            let path = entry.path();
300            if path.is_file()
301                && glob_pattern.matches_path(path)
302                && self.should_include_file(path)?
303            {
304                files.push(path.to_path_buf());
305            }
306        }
307
308        Ok(files)
309    }
310
311    /// Discover files recursively with simple pattern.
312    fn discover_recursive_simple(&self, pattern: &str) -> Result<Vec<PathBuf>, CliError> {
313        // Convert simple pattern to recursive glob
314        let base_dir = self.extract_base_dir(pattern);
315        let filename_pattern = PathBuf::from(pattern)
316            .file_name()
317            .map_or_else(|| pattern.to_string(), |s| s.to_string_lossy().to_string());
318
319        let recursive_pattern = if base_dir == std::path::Path::new(".") {
320            format!("**/{filename_pattern}")
321        } else {
322            format!("{}/**/{}", base_dir.display(), filename_pattern)
323        };
324
325        self.discover_recursive_glob(&recursive_pattern)
326    }
327
328    /// Extract base directory from a glob pattern.
329    fn extract_base_dir(&self, pattern: &str) -> PathBuf {
330        let path = PathBuf::from(pattern);
331
332        // Find the deepest ancestor that doesn't contain glob characters
333        for ancestor in path.ancestors() {
334            let ancestor_str = ancestor.to_string_lossy();
335            // Skip if it contains glob characters
336            if ancestor_str.contains('*')
337                || ancestor_str.contains('?')
338                || ancestor_str.contains('[')
339            {
340                continue;
341            }
342            // Found a non-glob path - return it (or "." if empty)
343            if ancestor_str.is_empty() {
344                return PathBuf::from(".");
345            }
346            return ancestor.to_path_buf();
347        }
348
349        // All parts contain globs, default to current directory
350        PathBuf::from(".")
351    }
352
353    /// Check if a directory entry should be included in traversal.
354    fn should_include_entry(&self, entry: &DirEntry) -> bool {
355        // Skip hidden files/directories if not configured to include them
356        if !self.config.include_hidden {
357            if let Some(name) = entry.file_name().to_str() {
358                if name.starts_with('.') && name != "." && name != ".." {
359                    return false;
360                }
361            }
362        }
363
364        true
365    }
366
367    /// Check if a file should be included based on filters.
368    fn should_include_file(&self, path: &Path) -> Result<bool, CliError> {
369        // Must be a regular file
370        if !path.is_file() {
371            return Ok(false);
372        }
373
374        // Check extension filter
375        if let Some(ref ext) = self.config.extension {
376            if path.extension().and_then(|s| s.to_str()) != Some(ext.as_str()) {
377                return Ok(false);
378            }
379        }
380
381        // Check file size filter
382        if let Some(max_size) = self.config.max_file_size {
383            let metadata = std::fs::metadata(path).map_err(|e| CliError::io_error(path, e))?;
384            if metadata.len() > max_size {
385                return Ok(false);
386            }
387        }
388
389        Ok(true)
390    }
391}
392
393#[cfg(test)]
394mod tests {
395    use super::*;
396    use std::fs;
397    use tempfile::TempDir;
398
399    fn create_test_files(dir: &Path, files: &[&str]) -> Result<(), std::io::Error> {
400        for file in files {
401            let path = dir.join(file);
402            if let Some(parent) = path.parent() {
403                fs::create_dir_all(parent)?;
404            }
405            fs::write(path, "test content")?;
406        }
407        Ok(())
408    }
409
410    #[test]
411    fn test_discovery_config_default() {
412        let config = DiscoveryConfig::default();
413        assert_eq!(config.max_depth, Some(10));
414        assert!(config.extension.is_none());
415        assert!(config.max_file_size.is_none());
416        assert!(!config.follow_links);
417        assert!(!config.include_hidden);
418        assert!(!config.recursive);
419    }
420
421    #[test]
422    fn test_validate_patterns_valid() {
423        let discovery = FileDiscovery::new(
424            vec!["*.hedl".to_string(), "test/*.hedl".to_string()],
425            DiscoveryConfig::default(),
426        );
427        assert!(discovery.validate_patterns().is_ok());
428    }
429
430    #[test]
431    fn test_validate_patterns_invalid() {
432        let discovery =
433            FileDiscovery::new(vec!["[invalid".to_string()], DiscoveryConfig::default());
434        let result = discovery.validate_patterns();
435        assert!(result.is_err());
436        if let Err(CliError::GlobPattern { pattern, .. }) = result {
437            assert_eq!(pattern, "[invalid");
438        }
439    }
440
441    #[test]
442    fn test_discover_simple_glob() -> Result<(), Box<dyn std::error::Error>> {
443        let temp_dir = TempDir::new()?;
444        create_test_files(temp_dir.path(), &["file1.hedl", "file2.hedl", "file3.txt"])?;
445
446        let pattern = format!("{}/*.hedl", temp_dir.path().display());
447        let discovery = FileDiscovery::new(vec![pattern], DiscoveryConfig::default());
448
449        let files = discovery.discover()?;
450        assert_eq!(files.len(), 2);
451
452        Ok(())
453    }
454
455    #[test]
456    fn test_discover_no_matches() {
457        let temp_dir = TempDir::new().unwrap();
458        let pattern = format!("{}/*.hedl", temp_dir.path().display());
459        let discovery = FileDiscovery::new(vec![pattern.clone()], DiscoveryConfig::default());
460
461        let result = discovery.discover();
462        assert!(result.is_err());
463        if let Err(CliError::NoFilesMatched { patterns }) = result {
464            assert_eq!(patterns, vec![pattern]);
465        }
466    }
467
468    #[test]
469    fn test_discover_recursive() -> Result<(), Box<dyn std::error::Error>> {
470        let temp_dir = TempDir::new()?;
471        create_test_files(
472            temp_dir.path(),
473            &[
474                "file1.hedl",
475                "dir1/file2.hedl",
476                "dir1/dir2/file3.hedl",
477                "dir1/file4.txt",
478            ],
479        )?;
480
481        let pattern = format!("{}/**/*.hedl", temp_dir.path().display());
482        let discovery = FileDiscovery::new(
483            vec![pattern],
484            DiscoveryConfig {
485                recursive: true,
486                ..Default::default()
487            },
488        );
489
490        let files = discovery.discover()?;
491        assert_eq!(files.len(), 3);
492
493        Ok(())
494    }
495
496    #[test]
497    fn test_discover_with_depth_limit() -> Result<(), Box<dyn std::error::Error>> {
498        let temp_dir = TempDir::new()?;
499        create_test_files(
500            temp_dir.path(),
501            &[
502                "file1.hedl",
503                "dir1/file2.hedl",
504                "dir1/dir2/file3.hedl",
505                "dir1/dir2/dir3/file4.hedl",
506            ],
507        )?;
508
509        let pattern = format!("{}/**/*.hedl", temp_dir.path().display());
510        let discovery = FileDiscovery::new(
511            vec![pattern],
512            DiscoveryConfig {
513                recursive: true,
514                max_depth: Some(2),
515                ..Default::default()
516            },
517        );
518
519        let files = discovery.discover()?;
520        // Should find file1.hedl and dir1/file2.hedl, but not deeper files
521        assert!(files.len() <= 3); // May include dir1/dir2/file3.hedl depending on depth counting
522
523        Ok(())
524    }
525
526    #[test]
527    fn test_discover_with_extension_filter() -> Result<(), Box<dyn std::error::Error>> {
528        let temp_dir = TempDir::new()?;
529        create_test_files(temp_dir.path(), &["file1.hedl", "file2.txt", "file3.hedl"])?;
530
531        let pattern = format!("{}/*", temp_dir.path().display());
532        let discovery = FileDiscovery::new(
533            vec![pattern],
534            DiscoveryConfig {
535                extension: Some("hedl".to_string()),
536                ..Default::default()
537            },
538        );
539
540        let files = discovery.discover()?;
541        assert_eq!(files.len(), 2);
542        assert!(files.iter().all(|p| p.extension().unwrap() == "hedl"));
543
544        Ok(())
545    }
546
547    #[test]
548    fn test_discover_hidden_files() -> Result<(), Box<dyn std::error::Error>> {
549        let temp_dir = TempDir::new()?;
550        create_test_files(
551            temp_dir.path(),
552            &["file1.hedl", ".hidden.hedl", "dir/.hidden2.hedl"],
553        )?;
554
555        let pattern = format!("{}/**/*.hedl", temp_dir.path().display());
556
557        // Without include_hidden
558        let discovery = FileDiscovery::new(
559            vec![pattern.clone()],
560            DiscoveryConfig {
561                recursive: true,
562                include_hidden: false,
563                ..Default::default()
564            },
565        );
566        let files = discovery.discover()?;
567        assert_eq!(files.len(), 1); // Only file1.hedl
568
569        // With include_hidden
570        let discovery = FileDiscovery::new(
571            vec![pattern],
572            DiscoveryConfig {
573                recursive: true,
574                include_hidden: true,
575                ..Default::default()
576            },
577        );
578        let files = discovery.discover()?;
579        assert!(files.len() >= 2); // file1.hedl and hidden files
580
581        Ok(())
582    }
583
584    #[test]
585    fn test_extract_base_dir() {
586        let discovery = FileDiscovery::new(vec![], DiscoveryConfig::default());
587
588        assert_eq!(discovery.extract_base_dir("*.hedl"), PathBuf::from("."));
589        assert_eq!(
590            discovery.extract_base_dir("dir/*.hedl"),
591            PathBuf::from("dir")
592        );
593        assert_eq!(
594            discovery.extract_base_dir("dir/subdir/*.hedl"),
595            PathBuf::from("dir/subdir")
596        );
597        assert_eq!(
598            discovery.extract_base_dir("**/file.hedl"),
599            PathBuf::from(".")
600        );
601    }
602
603    #[test]
604    fn test_multiple_patterns() -> Result<(), Box<dyn std::error::Error>> {
605        let temp_dir = TempDir::new()?;
606        create_test_files(
607            temp_dir.path(),
608            &["dir1/file1.hedl", "dir2/file2.hedl", "file3.hedl"],
609        )?;
610
611        let patterns = vec![
612            format!("{}/dir1/*.hedl", temp_dir.path().display()),
613            format!("{}/dir2/*.hedl", temp_dir.path().display()),
614        ];
615        let discovery = FileDiscovery::new(patterns, DiscoveryConfig::default());
616
617        let files = discovery.discover()?;
618        assert_eq!(files.len(), 2);
619
620        Ok(())
621    }
622
623    #[test]
624    fn test_deduplicate_files() -> Result<(), Box<dyn std::error::Error>> {
625        let temp_dir = TempDir::new()?;
626        create_test_files(temp_dir.path(), &["file1.hedl"])?;
627
628        // Same file matched by multiple patterns
629        let file_path = format!("{}/file1.hedl", temp_dir.path().display());
630        let patterns = vec![
631            file_path.clone(),
632            format!("{}/*.hedl", temp_dir.path().display()),
633        ];
634        let discovery = FileDiscovery::new(patterns, DiscoveryConfig::default());
635
636        let files = discovery.discover()?;
637        // Should be deduplicated to 1 file
638        assert_eq!(files.len(), 1);
639
640        Ok(())
641    }
642}