Skip to main content

diskforge_core/
file_finder.rs

1use std::cmp::Reverse;
2use std::collections::BinaryHeap;
3use std::ffi::OsStr;
4use std::path::{Path, PathBuf};
5use std::sync::{Arc, Mutex};
6use std::time::{Duration, SystemTime};
7
8use ignore::WalkBuilder;
9
/// Coarse file-type classification used to filter scan results.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum FileCategory {
    /// Disk images such as `.dmg` or `.iso`.
    DiskImage,
    /// Compressed or bundled archives such as `.zip` or `.tar.gz`.
    Archive,
    /// Video files such as `.mp4` or `.mkv`.
    Video,
    /// Installer packages such as `.pkg`.
    Installer,
    /// Virtual machine disk images such as `.vmdk` or `.qcow2`.
    VmImage,
    /// Documents (currently `.pdf`).
    Document,
    /// Anything that does not fall into one of the categories above.
    Other,
}
21
22impl std::fmt::Display for FileCategory {
23    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
24        match self {
25            Self::DiskImage => write!(f, "Disk Image"),
26            Self::Archive => write!(f, "Archive"),
27            Self::Video => write!(f, "Video"),
28            Self::Installer => write!(f, "Installer"),
29            Self::VmImage => write!(f, "VM Image"),
30            Self::Document => write!(f, "Document"),
31            Self::Other => write!(f, "Other"),
32        }
33    }
34}
35
36/// A file discovered by the finder.
37#[derive(Debug, Clone)]
38pub struct FoundFile {
39    pub path: PathBuf,
40    pub size: u64,
41    pub mtime: Option<SystemTime>,
42    pub category: FileCategory,
43    pub is_downloaded: bool,
44    pub download_source: Option<String>,
45}
46
47impl PartialEq for FoundFile {
48    fn eq(&self, other: &Self) -> bool {
49        self.size == other.size && self.path == other.path
50    }
51}
52
53impl Eq for FoundFile {}
54
55impl PartialOrd for FoundFile {
56    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
57        Some(self.cmp(other))
58    }
59}
60
61impl Ord for FoundFile {
62    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
63        self.size
64            .cmp(&other.size)
65            .then_with(|| self.path.cmp(&other.path))
66    }
67}
68
69/// Options for the file finder.
70#[derive(Debug, Clone)]
71pub struct FindOptions {
72    /// Root directories to scan.
73    pub root_paths: Vec<PathBuf>,
74    /// Minimum file size in bytes (files smaller than this are skipped).
75    pub min_size: Option<u64>,
76    /// Only include files older than this duration (based on mtime).
77    pub older_than: Option<Duration>,
78    /// Filter to specific file type categories.
79    pub file_types: Option<Vec<FileCategory>>,
80    /// Limit results to the top N largest files (uses BinaryHeap for memory efficiency).
81    pub max_results: Option<usize>,
82    /// Whether to validate file types with magic bytes (default: true).
83    pub validate_types: bool,
84    /// Additional paths to exclude (beyond the defaults).
85    pub extra_exclusions: Vec<String>,
86}
87
88impl Default for FindOptions {
89    fn default() -> Self {
90        Self {
91            root_paths: Vec::new(),
92            min_size: None,
93            older_than: None,
94            file_types: None,
95            max_results: None,
96            validate_types: true,
97            extra_exclusions: Vec::new(),
98        }
99    }
100}
101
/// Absolute system locations that are never scanned, regardless of options.
const DEFAULT_EXCLUSIONS: &[&str] = &[
    // Operating-system tree
    "/System",
    // Binaries, libraries and shared data under /usr
    "/usr/bin",
    "/usr/lib",
    "/usr/libexec",
    "/usr/sbin",
    "/usr/share",
    // Root-level binaries
    "/bin",
    "/sbin",
    // System databases
    "/private/var/db",
];

/// Directory names that exclude a path when they appear as any of its components.
const EXCLUDED_COMPONENTS: &[&str] = &[".git"];
117
/// Reports whether `path` lies somewhere beneath a directory whose extension is `app`.
///
/// The path itself is not considered (hence `skip(1)`), so the bundle directory
/// is not "inside" itself; only its contents are.
fn is_inside_app_bundle(path: &Path) -> bool {
    path.ancestors()
        .skip(1)
        .any(|parent| parent.extension().is_some_and(|ext| ext == "app"))
}
127
128/// Check if a path should be excluded from scanning.
129fn should_exclude(path: &Path, extra_exclusions: &[String]) -> bool {
130    let path_str = path.to_string_lossy();
131
132    // Check default absolute exclusions
133    for excl in DEFAULT_EXCLUSIONS {
134        if path_str.starts_with(excl) {
135            return true;
136        }
137    }
138
139    // Check extra exclusions
140    for excl in extra_exclusions {
141        if path_str.contains(excl.as_str()) {
142            return true;
143        }
144    }
145
146    // Check component-based exclusions
147    for component in path.components() {
148        let s = component.as_os_str().to_string_lossy();
149        for excl in EXCLUDED_COMPONENTS {
150            if s == *excl {
151                return true;
152            }
153        }
154    }
155
156    // Check if inside .app bundle
157    if is_inside_app_bundle(path) {
158        return true;
159    }
160
161    false
162}
163
164/// Detect file category by extension.
165pub fn detect_by_extension(path: &Path) -> FileCategory {
166    let name = path
167        .file_name()
168        .unwrap_or(OsStr::new(""))
169        .to_string_lossy()
170        .to_lowercase();
171
172    // Check compound extensions first
173    if name.ends_with(".tar.gz") || name.ends_with(".tar.bz2") || name.ends_with(".tar.xz") {
174        return FileCategory::Archive;
175    }
176
177    let ext = path
178        .extension()
179        .unwrap_or(OsStr::new(""))
180        .to_string_lossy()
181        .to_lowercase();
182
183    match ext.as_str() {
184        // Disk images
185        "dmg" | "iso" | "img" | "sparseimage" | "sparsebundle" => FileCategory::DiskImage,
186        // Archives
187        "zip" | "tar" | "tgz" | "rar" | "7z" | "xz" | "gz" | "bz2" => FileCategory::Archive,
188        // Videos
189        "mp4" | "mov" | "avi" | "mkv" | "wmv" | "flv" | "m4v" | "webm" => FileCategory::Video,
190        // Installers
191        "pkg" | "mpkg" => FileCategory::Installer,
192        // VM images
193        "vmdk" | "vdi" | "qcow2" | "vhd" => FileCategory::VmImage,
194        // Documents (large ones)
195        "pdf" => FileCategory::Document,
196        _ => FileCategory::Other,
197    }
198}
199
200/// Validate file type using magic bytes via the `infer` crate.
201/// If magic bytes disagree with extension-based detection, trust magic bytes.
202/// If infer returns None (unknown), keep the extension-based category.
203pub fn validate_by_magic_bytes(path: &Path, extension_category: FileCategory) -> FileCategory {
204    let Ok(kind) = infer::get_from_path(path) else {
205        return extension_category;
206    };
207    let Some(kind) = kind else {
208        return extension_category;
209    };
210
211    // Map infer MIME types to our categories
212    let mime = kind.mime_type();
213    if mime.starts_with("video/") {
214        return FileCategory::Video;
215    }
216    match mime {
217        "application/x-apple-diskimage" => FileCategory::DiskImage,
218        "application/zip"
219        | "application/gzip"
220        | "application/x-tar"
221        | "application/x-rar-compressed"
222        | "application/x-7z-compressed"
223        | "application/x-bzip2"
224        | "application/x-xz"
225        | "application/zstd" => FileCategory::Archive,
226        "application/pdf" => FileCategory::Document,
227        "application/x-xar" => FileCategory::Installer, // .pkg files use xar format
228        _ => extension_category,
229    }
230}
231
232/// Read the quarantine xattr to detect downloaded files.
233/// Returns (is_downloaded, download_source).
234pub fn check_quarantine(path: &Path) -> (bool, Option<String>) {
235    match xattr::get(path, "com.apple.quarantine") {
236        Ok(Some(value)) => {
237            let value_str = String::from_utf8_lossy(&value);
238            // Format: flag;timestamp;agent_name;UUID
239            let parts: Vec<&str> = value_str.split(';').collect();
240            let source = parts
241                .get(2)
242                .map(|s| s.to_string())
243                .filter(|s| !s.is_empty());
244            (true, source)
245        }
246        _ => (false, None),
247    }
248}
249
/// Shared counter for progress reporting during the parallel walk.
///
/// The counter is atomic, so worker threads can bump it through a shared
/// reference while another thread reads it.
#[derive(Debug)]
pub struct ScanProgress {
    /// Number of regular files visited so far (counted before size/age/type filtering).
    pub files_scanned: std::sync::atomic::AtomicU64,
}
254
255impl Default for ScanProgress {
256    fn default() -> Self {
257        Self::new()
258    }
259}
260
261impl ScanProgress {
262    pub fn new() -> Self {
263        Self {
264            files_scanned: std::sync::atomic::AtomicU64::new(0),
265        }
266    }
267
268    pub fn count(&self) -> u64 {
269        self.files_scanned
270            .load(std::sync::atomic::Ordering::Relaxed)
271    }
272}
273
274/// Find files matching the given options using parallel traversal.
275///
276/// Uses `ignore::WalkParallel` for 3-6x speedup over single-threaded traversal.
277/// Files are filtered by size during the walk to avoid collecting small files.
278/// Age is determined by `mtime` (NOT atime -- APFS disables atime by default).
279///
280/// Returns results sorted by size descending.
281pub fn find_files(options: &FindOptions, progress: Option<&ScanProgress>) -> Vec<FoundFile> {
282    let now = SystemTime::now();
283    let min_size = options.min_size.unwrap_or(0);
284    let extra_exclusions = options.extra_exclusions.clone();
285
286    // Build results collector based on whether we have a max_results limit
287    let collector: Arc<Mutex<TopNCollector>> =
288        Arc::new(Mutex::new(TopNCollector::new(options.max_results)));
289
290    for root in &options.root_paths {
291        if !root.exists() {
292            continue;
293        }
294
295        let mut builder = WalkBuilder::new(root);
296        builder
297            .hidden(false) // Don't skip hidden files -- users may want to find them
298            .git_ignore(true) // Respect .gitignore by default
299            .git_global(false)
300            .git_exclude(false)
301            .follow_links(false) // Don't follow symlinks (avoid infinite loops)
302            .threads(num_cpus());
303
304        let walker = builder.build_parallel();
305        let collector_ref = Arc::clone(&collector);
306        let extra_excl = extra_exclusions.clone();
307        let older_than = options.older_than;
308        let file_types = options.file_types.clone();
309
310        walker.run(|| {
311            let collector = Arc::clone(&collector_ref);
312            let extra_excl = extra_excl.clone();
313            let file_types = file_types.clone();
314
315            Box::new(move |entry| {
316                let Ok(entry) = entry else {
317                    return ignore::WalkState::Continue;
318                };
319
320                let path = entry.path();
321
322                // Skip excluded paths early (prevents descent into excluded trees)
323                if should_exclude(path, &extra_excl) {
324                    return ignore::WalkState::Skip;
325                }
326
327                // Only process files (not directories)
328                let Some(file_type) = entry.file_type() else {
329                    return ignore::WalkState::Continue;
330                };
331                if !file_type.is_file() {
332                    return ignore::WalkState::Continue;
333                }
334
335                // Increment progress counter
336                if let Some(prog) = progress {
337                    prog.files_scanned
338                        .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
339                }
340
341                // Get metadata for size and mtime
342                let Ok(meta) = entry.metadata() else {
343                    return ignore::WalkState::Continue;
344                };
345
346                let size = meta.len();
347
348                // Filter by minimum size
349                if size < min_size {
350                    return ignore::WalkState::Continue;
351                }
352
353                // Filter by age (mtime)
354                if let Some(max_age) = older_than
355                    && let Ok(mtime) = meta.modified()
356                        && let Ok(age) = now.duration_since(mtime)
357                            && age < max_age {
358                                return ignore::WalkState::Continue;
359                            }
360
361                // Detect category by extension (fast, no I/O)
362                let category = detect_by_extension(path);
363
364                // Filter by file type if specified
365                if let Some(ref types) = file_types {
366                    if !types.contains(&category) && category != FileCategory::Other {
367                        return ignore::WalkState::Continue;
368                    }
369                    // If category is Other and we have type filters, still skip
370                    if category == FileCategory::Other && !types.contains(&FileCategory::Other) {
371                        return ignore::WalkState::Continue;
372                    }
373                }
374
375                let mtime = meta.modified().ok();
376
377                let file = FoundFile {
378                    path: path.to_path_buf(),
379                    size,
380                    mtime,
381                    category,
382                    is_downloaded: false,
383                    download_source: None,
384                };
385
386                if let Ok(mut coll) = collector.lock() {
387                    coll.push(file);
388                }
389
390                ignore::WalkState::Continue
391            })
392        });
393    }
394
395    let inner = Arc::try_unwrap(collector)
396        .map(|mutex| mutex.into_inner().unwrap_or_else(|e| e.into_inner()))
397        .unwrap_or_else(|arc| {
398            let lock = arc.lock().unwrap();
399            lock.clone()
400        });
401    let mut results = inner.into_vec();
402
403    // Second pass: validate types with magic bytes and check quarantine xattr
404    if options.validate_types {
405        for file in &mut results {
406            file.category = validate_by_magic_bytes(&file.path, file.category);
407        }
408    }
409
410    // Check quarantine xattr for download detection
411    for file in &mut results {
412        let (is_downloaded, source) = check_quarantine(&file.path);
413        file.is_downloaded = is_downloaded;
414        file.download_source = source;
415    }
416
417    // Re-filter by file type after magic byte validation (category may have changed)
418    if let Some(ref types) = options.file_types {
419        results.retain(|f| types.contains(&f.category));
420    }
421
422    // Sort by size descending
423    results.sort_by(|a, b| b.size.cmp(&a.size));
424    results
425}
426
427/// Memory-efficient top-N collector using a BinaryHeap (min-heap).
428/// When max is set, keeps only the N largest items. O(n log k) instead of O(n log n).
429/// When max is None, collects all items into a Vec.
430#[derive(Clone)]
431struct TopNCollector {
432    max: Option<usize>,
433    heap: BinaryHeap<Reverse<FoundFile>>,
434    vec: Vec<FoundFile>,
435}
436
437impl TopNCollector {
438    fn new(max: Option<usize>) -> Self {
439        Self {
440            max,
441            heap: BinaryHeap::new(),
442            vec: Vec::new(),
443        }
444    }
445
446    fn push(&mut self, file: FoundFile) {
447        match self.max {
448            Some(n) if n > 0 => {
449                self.heap.push(Reverse(file));
450                if self.heap.len() > n {
451                    self.heap.pop(); // Remove smallest
452                }
453            }
454            _ => {
455                self.vec.push(file);
456            }
457        }
458    }
459
460    fn into_vec(self) -> Vec<FoundFile> {
461        match self.max {
462            Some(_) => self
463                .heap
464                .into_sorted_vec()
465                .into_iter()
466                .map(|Reverse(f)| f)
467                .collect(),
468            None => self.vec,
469        }
470    }
471}
472
/// Thread count for the parallel walk: the available parallelism reported by
/// the standard library, or 4 when it cannot be determined.
fn num_cpus() -> usize {
    match std::thread::available_parallelism() {
        Ok(parallelism) => parallelism.get(),
        Err(_) => 4,
    }
}
479
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;

    /// Creates (or recreates) an empty scratch directory for the named test
    /// under the system temp dir.
    fn setup_test_dir(name: &str) -> PathBuf {
        let tmp = std::env::temp_dir().join(format!("diskforge_test_ff_{name}"));
        let _ = fs::remove_dir_all(&tmp);
        fs::create_dir_all(&tmp).unwrap();
        tmp
    }

    /// Writes a zero-filled file of `size` bytes at `dir/name`, creating any
    /// missing parent directories.
    fn create_file(dir: &Path, name: &str, size: usize) -> PathBuf {
        let path = dir.join(name);
        if let Some(parent) = path.parent() {
            fs::create_dir_all(parent).unwrap();
        }
        fs::write(&path, vec![0u8; size]).unwrap();
        path
    }

    /// Baseline options shared by the scan tests: a single root, a 1-byte
    /// minimum size and magic-byte validation switched off.
    fn scan_options(root: &Path) -> FindOptions {
        FindOptions {
            root_paths: vec![root.to_path_buf()],
            min_size: Some(1),
            validate_types: false,
            ..Default::default()
        }
    }

    /// Builds the `index`-th synthetic file for the collector tests
    /// (size `(index + 1) * 100`).
    fn sample_file(index: u64) -> FoundFile {
        FoundFile {
            path: PathBuf::from(format!("file{index}")),
            size: (index + 1) * 100,
            mtime: None,
            category: FileCategory::Other,
            is_downloaded: false,
            download_source: None,
        }
    }

    /// Asserts that every name in `names` is classified as `expected`.
    fn assert_category(expected: FileCategory, names: &[&str]) {
        for name in names {
            assert_eq!(
                detect_by_extension(Path::new(name)),
                expected,
                "unexpected category for {name}"
            );
        }
    }

    /// Sizes of the results, in the order they were returned.
    fn sizes_of(results: &[FoundFile]) -> Vec<u64> {
        results.iter().map(|file| file.size).collect()
    }

    #[test]
    fn detect_extension_disk_image() {
        assert_category(FileCategory::DiskImage, &["file.dmg", "file.iso", "file.IMG"]);
    }

    #[test]
    fn detect_extension_archive() {
        assert_category(
            FileCategory::Archive,
            &["file.zip", "file.tar.gz", "file.7z", "file.rar"],
        );
    }

    #[test]
    fn detect_extension_video() {
        assert_category(FileCategory::Video, &["file.mp4", "file.mkv", "movie.MOV"]);
    }

    #[test]
    fn detect_extension_installer() {
        assert_category(FileCategory::Installer, &["setup.pkg", "setup.mpkg"]);
    }

    #[test]
    fn detect_extension_vm_image() {
        assert_category(FileCategory::VmImage, &["disk.vmdk", "disk.qcow2"]);
    }

    #[test]
    fn detect_extension_document() {
        assert_category(FileCategory::Document, &["doc.pdf"]);
    }

    #[test]
    fn detect_extension_other() {
        assert_category(FileCategory::Other, &["file.txt", "file.rs", "noext"]);
    }

    #[test]
    fn find_files_size_filter() {
        let tmp = setup_test_dir("size_filter");
        create_file(&tmp, "small.zip", 100);
        create_file(&tmp, "big.zip", 10_000);
        create_file(&tmp, "huge.dmg", 100_000);

        let options = FindOptions {
            min_size: Some(5_000),
            ..scan_options(&tmp)
        };

        let results = find_files(&options, None);
        assert_eq!(results.len(), 2, "Should find 2 files above 5KB");
        assert_eq!(
            sizes_of(&results),
            [100_000, 10_000],
            "Should be sorted by size desc"
        );

        fs::remove_dir_all(&tmp).ok();
    }

    #[test]
    fn find_files_max_results() {
        let tmp = setup_test_dir("max_results");
        for i in 0..10 {
            create_file(&tmp, &format!("file{i}.zip"), (i + 1) * 1000);
        }

        let options = FindOptions {
            max_results: Some(3),
            ..scan_options(&tmp)
        };

        let results = find_files(&options, None);
        assert_eq!(results.len(), 3, "Should return exactly 3 results");
        // Pin the actual sizes, not just the ordering: these are the 3 largest.
        assert_eq!(
            sizes_of(&results),
            [10_000, 9_000, 8_000],
            "Should be the 3 largest, biggest first"
        );

        fs::remove_dir_all(&tmp).ok();
    }

    #[test]
    fn find_files_type_filter() {
        let tmp = setup_test_dir("type_filter");
        create_file(&tmp, "movie.mp4", 5000);
        create_file(&tmp, "archive.zip", 5000);
        create_file(&tmp, "image.dmg", 5000);

        let options = FindOptions {
            file_types: Some(vec![FileCategory::Video]),
            ..scan_options(&tmp)
        };

        let results = find_files(&options, None);
        assert_eq!(results.len(), 1, "Should find only the video");
        assert_eq!(results[0].category, FileCategory::Video);

        fs::remove_dir_all(&tmp).ok();
    }

    #[test]
    fn find_files_excludes_git_objects() {
        let tmp = setup_test_dir("git_objects");
        create_file(&tmp, ".git/objects/pack/bigpack.zip", 10_000);
        create_file(&tmp, "normal.zip", 10_000);

        let results = find_files(&scan_options(&tmp), None);
        assert_eq!(results.len(), 1, "Should exclude .git/objects file");
        assert!(!results[0].path.to_string_lossy().contains(".git"));

        fs::remove_dir_all(&tmp).ok();
    }

    #[test]
    fn find_files_excludes_app_bundle_contents() {
        let tmp = setup_test_dir("app_bundle");
        create_file(&tmp, "SomeApp.app/Contents/MacOS/binary", 10_000);
        create_file(&tmp, "outside.dmg", 10_000);

        let results = find_files(&scan_options(&tmp), None);
        assert_eq!(results.len(), 1, "Should exclude file inside .app bundle");
        assert!(results[0].path.to_string_lossy().contains("outside.dmg"));

        fs::remove_dir_all(&tmp).ok();
    }

    #[test]
    fn find_files_progress_counter() {
        let tmp = setup_test_dir("progress");
        create_file(&tmp, "a.zip", 1000);
        create_file(&tmp, "b.zip", 2000);
        create_file(&tmp, "c.zip", 3000);

        let progress = ScanProgress::new();
        find_files(&scan_options(&tmp), Some(&progress));
        assert!(
            progress.count() >= 3,
            "Should have scanned at least 3 files"
        );

        fs::remove_dir_all(&tmp).ok();
    }

    #[test]
    fn top_n_collector_unlimited() {
        let mut coll = TopNCollector::new(None);
        for i in 0..5 {
            coll.push(sample_file(i));
        }
        let results = coll.into_vec();
        assert_eq!(results.len(), 5);
    }

    #[test]
    fn top_n_collector_limited() {
        let mut coll = TopNCollector::new(Some(2));
        for i in 0..5 {
            coll.push(sample_file(i));
        }
        let results = coll.into_vec();
        assert_eq!(results.len(), 2);
        // Should be exactly the 2 largest, biggest first.
        assert_eq!(sizes_of(&results), [500, 400]);
    }
}