exarch_core/creation/
walker.rs

1//! Directory tree walking with filtering.
2//!
3//! This module provides efficient directory traversal with built-in filtering
4//! based on configuration options like hidden files, exclude patterns, and size
5//! limits.
6
7use crate::ExtractionError;
8use crate::Result;
9use crate::creation::config::CreationConfig;
10use crate::creation::filters;
11use std::fs::Metadata;
12use std::path::Path;
13use std::path::PathBuf;
14use walkdir::WalkDir;
15
16/// Walks a directory tree with filtering based on `CreationConfig`.
17///
18/// This walker handles:
19/// - Hidden file filtering
20/// - Pattern-based exclusion
21/// - Symlink handling (follow or store as-is)
22/// - Size limit enforcement
23/// - Archive path computation
24///
25/// # Examples
26///
27/// ```no_run
28/// use exarch_core::creation::CreationConfig;
29/// use exarch_core::creation::walker::FilteredWalker;
30/// use std::path::Path;
31///
32/// let config = CreationConfig::default();
33/// let root = Path::new("./project");
34/// let walker = FilteredWalker::new(root, &config);
35///
36/// for entry in walker.walk() {
37///     let entry = entry.unwrap();
38///     println!("Would add: {}", entry.archive_path.display());
39/// }
40/// ```
41pub struct FilteredWalker<'a> {
42    root: &'a Path,
43    config: &'a CreationConfig,
44}
45
46impl<'a> FilteredWalker<'a> {
47    /// Creates a new filtered walker for the given root directory.
48    ///
49    /// # Examples
50    ///
51    /// ```
52    /// use exarch_core::creation::CreationConfig;
53    /// use exarch_core::creation::walker::FilteredWalker;
54    /// use std::path::Path;
55    ///
56    /// let config = CreationConfig::default();
57    /// let walker = FilteredWalker::new(Path::new("."), &config);
58    /// ```
59    #[must_use]
60    pub fn new(root: &'a Path, config: &'a CreationConfig) -> Self {
61        Self { root, config }
62    }
63
64    /// Returns an iterator over filtered directory entries.
65    ///
66    /// The iterator:
67    /// - Skips entries based on configuration (hidden files, patterns, size)
68    /// - Handles symlinks according to `follow_symlinks` setting
69    /// - Computes archive paths using `strip_prefix` if configured
70    /// - Returns errors for inaccessible files/directories
71    ///
72    /// # Errors
73    ///
74    /// Entries may error if:
75    /// - File metadata cannot be read
76    /// - Path is not valid UTF-8 (platform-specific)
77    /// - Symlink target cannot be read
78    pub fn walk(&self) -> impl Iterator<Item = Result<FilteredEntry>> + '_ {
79        let walker = WalkDir::new(self.root)
80            .follow_links(self.config.follow_symlinks)
81            .into_iter();
82
83        walker.filter_map(move |entry| {
84            match entry {
85                Ok(entry) => {
86                    let path = entry.path();
87
88                    // Skip if matches filter rules
89                    if filters::should_skip(path, self.config) {
90                        return None;
91                    }
92
93                    // Build FilteredEntry
94                    match self.build_filtered_entry(&entry) {
95                        Ok(Some(filtered)) => Some(Ok(filtered)),
96                        Ok(None) => None, // Filtered out (e.g., size limit)
97                        Err(e) => Some(Err(e)),
98                    }
99                }
100                Err(e) => {
101                    // Convert walkdir error to ExtractionError
102                    Some(Err(ExtractionError::Io(std::io::Error::other(format!(
103                        "walkdir error: {e}"
104                    )))))
105                }
106            }
107        })
108    }
109
110    /// Builds a `FilteredEntry` from a `walkdir::DirEntry`.
111    ///
112    /// Returns `Ok(None)` if the entry should be filtered out (e.g., exceeds
113    /// size limit).
114    fn build_filtered_entry(&self, entry: &walkdir::DirEntry) -> Result<Option<FilteredEntry>> {
115        let path = entry.path().to_path_buf();
116        let metadata = entry.metadata().map_err(|e| {
117            ExtractionError::Io(std::io::Error::other(format!(
118                "cannot read metadata for {}: {e}",
119                path.display()
120            )))
121        })?;
122
123        // Determine entry type
124        let entry_type = if metadata.is_symlink() {
125            let target = std::fs::read_link(&path).map_err(|e| {
126                ExtractionError::Io(std::io::Error::other(format!(
127                    "cannot read symlink target for {}: {e}",
128                    path.display()
129                )))
130            })?;
131            EntryType::Symlink { target }
132        } else if metadata.is_dir() {
133            EntryType::Directory
134        } else {
135            EntryType::File
136        };
137
138        // Check file size limit (only for regular files)
139        let size = get_file_size(&metadata);
140        if entry_type == EntryType::File
141            && let Some(max_size) = self.config.max_file_size
142            && size > max_size
143        {
144            return Ok(None); // Filter out
145        }
146
147        // Compute archive path
148        let archive_path = filters::compute_archive_path(&path, self.root, self.config)?;
149
150        Ok(Some(FilteredEntry {
151            path,
152            archive_path,
153            entry_type,
154            size,
155        }))
156    }
157}
158
159/// A filtered directory entry with computed archive path.
160///
161/// Represents a file, directory, or symlink that passed all filtering rules
162/// and is ready to be added to an archive.
163#[derive(Debug, Clone, PartialEq, Eq)]
164pub struct FilteredEntry {
165    /// Full filesystem path to the entry.
166    pub path: PathBuf,
167
168    /// Path to use in the archive (relative, with `strip_prefix` applied).
169    pub archive_path: PathBuf,
170
171    /// Type of entry (file, directory, or symlink).
172    pub entry_type: EntryType,
173
174    /// Size in bytes (0 for directories).
175    pub size: u64,
176}
177
/// Kind of filesystem object a [`FilteredEntry`] refers to.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum EntryType {
    /// An ordinary file.
    File,

    /// A directory.
    Directory,

    /// A symbolic link, stored as a link rather than followed.
    Symlink {
        /// Path the link points at, exactly as read from the filesystem.
        target: PathBuf,
    },
}
193
194/// Collects all entries from sources into a vector for single-pass processing.
195///
196/// This function performs a single directory traversal and collects all
197/// filtered entries into memory, avoiding the need to traverse the directory
198/// tree twice (once for counting, once for processing).
199///
200/// # Examples
201///
202/// ```no_run
203/// use exarch_core::creation::CreationConfig;
204/// use exarch_core::creation::walker::collect_entries;
205/// use std::path::Path;
206///
207/// let config = CreationConfig::default();
208/// let sources = [Path::new("./src")];
209/// let entries = collect_entries(&sources, &config)?;
210/// println!("Total entries: {}", entries.len());
211/// # Ok::<(), exarch_core::ExtractionError>(())
212/// ```
213///
214/// # Errors
215///
216/// Returns an error if:
217/// - Source path does not exist
218/// - Directory traversal fails
219/// - File metadata cannot be read
220pub fn collect_entries<P: AsRef<Path>>(
221    sources: &[P],
222    config: &CreationConfig,
223) -> Result<Vec<FilteredEntry>> {
224    let mut entries = Vec::new();
225
226    for source in sources {
227        let path = source.as_ref();
228
229        if !path.exists() {
230            return Err(ExtractionError::SourceNotFound {
231                path: path.to_path_buf(),
232            });
233        }
234
235        if path.is_dir() {
236            let walker = FilteredWalker::new(path, config);
237            for entry in walker.walk() {
238                entries.push(entry?);
239            }
240        } else {
241            // For single files, we need to create a FilteredEntry manually
242            // This matches the behavior in tar.rs and zip.rs
243            let metadata = std::fs::metadata(path)?;
244            let size = if metadata.is_file() {
245                metadata.len()
246            } else {
247                0
248            };
249
250            let entry_type = if metadata.is_symlink() {
251                let target = std::fs::read_link(path)?;
252                EntryType::Symlink { target }
253            } else if metadata.is_dir() {
254                EntryType::Directory
255            } else {
256                EntryType::File
257            };
258
259            // Compute archive path using the same logic as in the format modules
260            let archive_path = if let Some(parent) = path.parent() {
261                filters::compute_archive_path(path, parent, config)?
262            } else {
263                path.file_name()
264                    .ok_or_else(|| {
265                        ExtractionError::Io(std::io::Error::other(format!(
266                            "cannot determine filename for {}",
267                            path.display()
268                        )))
269                    })?
270                    .into()
271            };
272
273            entries.push(FilteredEntry {
274                path: path.to_path_buf(),
275                archive_path,
276                entry_type,
277                size,
278            });
279        }
280    }
281
282    Ok(entries)
283}
284
/// Returns the size in bytes recorded in `metadata`.
///
/// `Metadata::len` is available on every platform and reports the same value
/// as the Unix-only `MetadataExt::size`, so no platform-specific variant is
/// needed.
fn get_file_size(metadata: &Metadata) -> u64 {
    metadata.len()
}
296
#[cfg(test)]
#[allow(clippy::unwrap_used)] // Allow unwrap in tests for brevity
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Walks `root` with `config`, panicking on the first traversal error.
    fn walk_all(root: &Path, config: &CreationConfig) -> Vec<FilteredEntry> {
        let walker = FilteredWalker::new(root, config);
        walker.walk().collect::<Result<Vec<_>>>().unwrap()
    }

    /// Archive paths of `entries` as string slices.
    fn archive_paths(entries: &[FilteredEntry]) -> Vec<&str> {
        entries
            .iter()
            .map(|e| e.archive_path.to_str().unwrap())
            .collect()
    }

    /// Whether any of `paths` contains `needle` as a substring.
    fn has_path(paths: &[&str], needle: &str) -> bool {
        paths.iter().any(|p| p.contains(needle))
    }

    /// First entry whose archive path contains `needle`.
    fn find_entry<'e>(entries: &'e [FilteredEntry], needle: &str) -> Option<&'e FilteredEntry> {
        entries
            .iter()
            .find(|e| e.archive_path.to_str().unwrap().contains(needle))
    }

    #[test]
    fn test_walker_basic_directory() {
        let temp = TempDir::new().unwrap();
        let root = temp.path();

        // Layout: two files at the top level plus one nested file.
        fs::write(root.join("file1.txt"), "content1").unwrap();
        fs::write(root.join("file2.rs"), "content2").unwrap();
        fs::create_dir(root.join("subdir")).unwrap();
        fs::write(root.join("subdir/file3.txt"), "content3").unwrap();

        let config = CreationConfig::default()
            .with_include_hidden(true)
            .with_exclude_patterns(vec![]);

        let entries = walk_all(root, &config);

        // root dir + file1 + file2 + subdir + file3
        assert_eq!(entries.len(), 5, "expected exactly 5 entries");

        let paths = archive_paths(&entries);
        assert!(has_path(&paths, "file1.txt"));
        assert!(has_path(&paths, "file2.rs"));
        assert!(has_path(&paths, "subdir"));
        assert!(has_path(&paths, "file3.txt"));
    }

    #[test]
    fn test_walker_skips_hidden_files() {
        let temp = TempDir::new().unwrap();
        let root = temp.path();

        fs::write(root.join("visible.txt"), "content").unwrap();
        fs::write(root.join(".hidden"), "secret").unwrap();

        // Default configuration leaves hidden files out.
        let config = CreationConfig::default();
        let entries = walk_all(root, &config);
        let paths = archive_paths(&entries);

        assert!(has_path(&paths, "visible.txt"));
        assert!(!has_path(&paths, ".hidden"));
    }

    #[test]
    fn test_walker_includes_hidden_when_configured() {
        let temp = TempDir::new().unwrap();
        let root = temp.path();

        fs::write(root.join("visible.txt"), "content").unwrap();
        fs::write(root.join(".hidden"), "secret").unwrap();

        let config = CreationConfig::default().with_include_hidden(true);
        let entries = walk_all(root, &config);
        let paths = archive_paths(&entries);

        assert!(has_path(&paths, "visible.txt"));
        assert!(has_path(&paths, ".hidden"));
    }

    #[test]
    fn test_walker_skips_excluded_patterns() {
        let temp = TempDir::new().unwrap();
        let root = temp.path();

        fs::write(root.join("keep.txt"), "keep").unwrap();
        fs::write(root.join("skip.tmp"), "skip").unwrap();
        fs::write(root.join("also.log"), "skip").unwrap();

        let config = CreationConfig::default()
            .with_exclude_patterns(vec!["*.tmp".to_string(), "*.log".to_string()]);

        let entries = walk_all(root, &config);
        let paths = archive_paths(&entries);

        assert!(has_path(&paths, "keep.txt"));
        assert!(!has_path(&paths, "skip.tmp"));
        assert!(!has_path(&paths, "also.log"));
    }

    #[cfg(unix)]
    #[test]
    fn test_walker_handles_symlinks() {
        let temp = TempDir::new().unwrap();
        let root = temp.path();

        fs::write(root.join("target.txt"), "content").unwrap();
        std::os::unix::fs::symlink(root.join("target.txt"), root.join("link.txt")).unwrap();

        // Default configuration does not follow symlinks.
        let config = CreationConfig::default();
        let entries = walk_all(root, &config);

        let link_entry = find_entry(&entries, "link.txt");
        assert!(link_entry.is_some());

        let entry = link_entry.unwrap();
        assert!(matches!(entry.entry_type, EntryType::Symlink { .. }));
    }

    #[cfg(unix)]
    #[test]
    fn test_walker_detects_symlink_cycles() {
        let temp = TempDir::new().unwrap();
        let root = temp.path();

        fs::create_dir(root.join("dir1")).unwrap();
        fs::create_dir(root.join("dir1/dir2")).unwrap();

        // Cycle: dir1/dir2/link points back at dir1.
        std::os::unix::fs::symlink(root.join("dir1"), root.join("dir1/dir2/link")).unwrap();

        // With link following enabled, walkdir is relied on to report the loop.
        let config = CreationConfig::default().with_follow_symlinks(true);
        let walker = FilteredWalker::new(root, &config);

        // Keep both successes and failures so each can be inspected.
        let results: Vec<_> = walker.walk().collect();

        let successes = results.iter().filter(|r| r.is_ok()).count();
        assert!(successes > 0, "should have some entries before cycle");

        let has_cycle_error = results
            .iter()
            .filter_map(|r| r.as_ref().err())
            .any(|e| {
                let message = e.to_string();
                message.contains("File system loop") || message.contains("walkdir error")
            });
        assert!(has_cycle_error, "should detect symlink cycle");
    }

    #[test]
    fn test_walker_respects_max_file_size() {
        let temp = TempDir::new().unwrap();
        let root = temp.path();

        fs::write(root.join("small.txt"), "tiny").unwrap(); // 4 bytes
        fs::write(root.join("large.txt"), "a".repeat(1000)).unwrap(); // 1000 bytes

        let config = CreationConfig::default().with_max_file_size(Some(100));

        let entries = walk_all(root, &config);
        let paths = archive_paths(&entries);

        assert!(has_path(&paths, "small.txt"));
        assert!(!has_path(&paths, "large.txt"));
    }

    #[test]
    fn test_walker_computes_archive_paths() {
        let temp = TempDir::new().unwrap();
        let root = temp.path();

        fs::create_dir(root.join("src")).unwrap();
        fs::write(root.join("src/main.rs"), "code").unwrap();

        let config = CreationConfig::default();
        let entries = walk_all(root, &config);

        let main_entry = find_entry(&entries, "main.rs");
        assert!(main_entry.is_some());

        let entry = main_entry.unwrap();
        assert_eq!(entry.archive_path, Path::new("src/main.rs"));
    }

    #[test]
    fn test_walker_strip_prefix() {
        let temp = TempDir::new().unwrap();
        let root = temp.path();

        fs::create_dir(root.join("project")).unwrap();
        fs::create_dir(root.join("project/src")).unwrap();
        fs::write(root.join("project/src/main.rs"), "code").unwrap();

        let config = CreationConfig::default().with_strip_prefix(Some(PathBuf::from("project")));

        let entries = walk_all(root, &config);

        let main_entry = find_entry(&entries, "main.rs");
        assert!(main_entry.is_some());

        let entry = main_entry.unwrap();
        assert_eq!(entry.archive_path, Path::new("src/main.rs"));
    }

    #[test]
    fn test_filtered_entry_file() {
        let entry = FilteredEntry {
            path: PathBuf::from("/tmp/file.txt"),
            archive_path: PathBuf::from("file.txt"),
            entry_type: EntryType::File,
            size: 1024,
        };

        assert_eq!(entry.path, Path::new("/tmp/file.txt"));
        assert_eq!(entry.archive_path, Path::new("file.txt"));
        assert!(matches!(entry.entry_type, EntryType::File));
        assert_eq!(entry.size, 1024);
    }

    #[test]
    fn test_filtered_entry_directory() {
        let entry = FilteredEntry {
            path: PathBuf::from("/tmp/dir"),
            archive_path: PathBuf::from("dir"),
            entry_type: EntryType::Directory,
            size: 0,
        };

        assert!(matches!(entry.entry_type, EntryType::Directory));
        assert_eq!(entry.size, 0);
    }

    #[test]
    fn test_filtered_entry_symlink() {
        let entry = FilteredEntry {
            path: PathBuf::from("/tmp/link"),
            archive_path: PathBuf::from("link"),
            entry_type: EntryType::Symlink {
                target: PathBuf::from("target.txt"),
            },
            size: 0,
        };

        let EntryType::Symlink { target } = &entry.entry_type else {
            panic!("expected symlink");
        };
        assert_eq!(target, Path::new("target.txt"));
    }

    #[test]
    fn test_entry_type_equality() {
        // Shorthand for building a symlink variant with the given target.
        let link = |target: &str| EntryType::Symlink {
            target: PathBuf::from(target),
        };

        assert_eq!(EntryType::File, EntryType::File);
        assert_eq!(EntryType::Directory, EntryType::Directory);
        assert_eq!(link("a"), link("a"));
        assert_ne!(EntryType::File, EntryType::Directory);
        assert_ne!(link("a"), link("b"));
    }

    #[test]
    fn test_collect_entries_empty_sources() {
        let config = CreationConfig::default();
        let sources: Vec<&Path> = Vec::new();

        let entries = collect_entries(&sources, &config).unwrap();

        assert!(entries.is_empty());
    }

    #[test]
    fn test_collect_entries_nonexistent_source() {
        let config = CreationConfig::default();
        let sources = [Path::new("/nonexistent/path/that/does/not/exist")];

        let result = collect_entries(&sources, &config);

        assert!(result.is_err());
        assert!(matches!(
            result,
            Err(ExtractionError::SourceNotFound { .. })
        ));
    }

    #[test]
    fn test_collect_entries_mixed_files_and_directories() {
        let temp = TempDir::new().unwrap();
        let root = temp.path();

        // One standalone file plus two populated directories.
        fs::write(root.join("single_file.txt"), "standalone").unwrap();
        fs::create_dir(root.join("dir1")).unwrap();
        fs::write(root.join("dir1/file1.txt"), "content1").unwrap();
        fs::write(root.join("dir1/file2.txt"), "content2").unwrap();
        fs::create_dir(root.join("dir2")).unwrap();
        fs::write(root.join("dir2/file3.txt"), "content3").unwrap();

        let config = CreationConfig::default().with_include_hidden(true);
        let sources = [
            root.join("single_file.txt"),
            root.join("dir1"),
            root.join("dir2"),
        ];

        let entries = collect_entries(&sources, &config).unwrap();

        // Nominally 6 entries: single_file.txt (1) + dir1 and its 2 files (3)
        // + dir2 and its 1 file (2). Only a lower bound is asserted.
        assert!(
            entries.len() >= 5,
            "Expected at least 5 entries (files and dirs), got {}",
            entries.len()
        );

        let paths = archive_paths(&entries);
        assert!(has_path(&paths, "single_file.txt"));
        assert!(has_path(&paths, "file1.txt"));
        assert!(has_path(&paths, "file2.txt"));
        assert!(has_path(&paths, "file3.txt"));
    }

    #[test]
    fn test_collect_entries_large_directory_count() {
        let temp = TempDir::new().unwrap();
        let root = temp.path();

        // 50 files at the top level and 30 more inside a subdirectory.
        for i in 0..50 {
            fs::write(root.join(format!("file_{i}.txt")), format!("content {i}")).unwrap();
        }
        fs::create_dir(root.join("subdir")).unwrap();
        for i in 0..30 {
            fs::write(
                root.join(format!("subdir/file_{i}.txt")),
                format!("sub content {i}"),
            )
            .unwrap();
        }

        let config = CreationConfig::default().with_include_hidden(true);
        let sources = [root];

        let entries = collect_entries(&sources, &config).unwrap();

        // Nominally 82 entries: 50 root files + subdir + root dir + 30 nested
        // files. Only a lower bound is asserted.
        assert!(
            entries.len() >= 80,
            "Expected at least 80 entries, got {}",
            entries.len()
        );
    }

    #[test]
    fn test_collect_entries_single_file() {
        let temp = TempDir::new().unwrap();
        let file_path = temp.path().join("test.txt");
        fs::write(&file_path, "content").unwrap();

        let config = CreationConfig::default();
        let sources = [&file_path];

        let entries = collect_entries(&sources, &config).unwrap();

        assert_eq!(entries.len(), 1);
        assert_eq!(entries[0].entry_type, EntryType::File);

        let archive_path = entries[0].archive_path.to_str().unwrap();
        assert!(archive_path.contains("test.txt"));
    }

    #[test]
    fn test_collect_entries_respects_filters() {
        let temp = TempDir::new().unwrap();
        let root = temp.path();

        fs::write(root.join("keep.txt"), "keep").unwrap();
        fs::write(root.join("skip.tmp"), "skip").unwrap();
        fs::write(root.join(".hidden"), "hidden").unwrap();

        let config = CreationConfig::default()
            .with_exclude_patterns(vec!["*.tmp".to_string()])
            .with_include_hidden(false);

        let sources = [root];
        let entries = collect_entries(&sources, &config).unwrap();
        let paths = archive_paths(&entries);

        assert!(has_path(&paths, "keep.txt"));
        assert!(!has_path(&paths, "skip.tmp"));
        assert!(!has_path(&paths, ".hidden"));
    }
}