1use std::cmp::Reverse;
2use std::collections::BinaryHeap;
3use std::ffi::OsStr;
4use std::path::{Path, PathBuf};
5use std::sync::{Arc, Mutex};
6use std::time::{Duration, SystemTime};
7
8use ignore::WalkBuilder;
9
/// Coarse classification of a file, used to group and filter scan results.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum FileCategory {
    /// Disk images such as `.dmg`, `.iso` or `.img`.
    DiskImage,
    /// Compressed or bundled archives such as `.zip`, `.tar` or `.7z`.
    Archive,
    /// Video containers such as `.mp4`, `.mov` or `.mkv`.
    Video,
    /// Installer packages (`.pkg`, `.mpkg`).
    Installer,
    /// Virtual-machine disk images such as `.vmdk` or `.qcow2`.
    VmImage,
    /// Documents; PDF is the only format mapped here.
    Document,
    /// Anything that matches none of the categories above.
    Other,
}

impl std::fmt::Display for FileCategory {
    /// Writes the human-readable label of the category.
    ///
    /// The label goes through [`std::fmt::Formatter::pad`], so width,
    /// alignment and precision flags (for example `{:<12}` in a table) are
    /// honoured. Plain `{}` output is unchanged.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            Self::DiskImage => "Disk Image",
            Self::Archive => "Archive",
            Self::Video => "Video",
            Self::Installer => "Installer",
            Self::VmImage => "VM Image",
            Self::Document => "Document",
            Self::Other => "Other",
        };
        f.pad(label)
    }
}
35
36#[derive(Debug, Clone)]
38pub struct FoundFile {
39 pub path: PathBuf,
40 pub size: u64,
41 pub mtime: Option<SystemTime>,
42 pub category: FileCategory,
43 pub is_downloaded: bool,
44 pub download_source: Option<String>,
45}
46
47impl PartialEq for FoundFile {
48 fn eq(&self, other: &Self) -> bool {
49 self.size == other.size && self.path == other.path
50 }
51}
52
53impl Eq for FoundFile {}
54
55impl PartialOrd for FoundFile {
56 fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
57 Some(self.cmp(other))
58 }
59}
60
61impl Ord for FoundFile {
62 fn cmp(&self, other: &Self) -> std::cmp::Ordering {
63 self.size
64 .cmp(&other.size)
65 .then_with(|| self.path.cmp(&other.path))
66 }
67}
68
69#[derive(Debug, Clone)]
71pub struct FindOptions {
72 pub root_paths: Vec<PathBuf>,
74 pub min_size: Option<u64>,
76 pub older_than: Option<Duration>,
78 pub file_types: Option<Vec<FileCategory>>,
80 pub max_results: Option<usize>,
82 pub validate_types: bool,
84 pub extra_exclusions: Vec<String>,
86}
87
88impl Default for FindOptions {
89 fn default() -> Self {
90 Self {
91 root_paths: Vec::new(),
92 min_size: None,
93 older_than: None,
94 file_types: None,
95 max_results: None,
96 validate_types: true,
97 extra_exclusions: Vec::new(),
98 }
99 }
100}
101
/// Absolute path prefixes that are always excluded from a scan.
///
/// These are operating-system directories.
const DEFAULT_EXCLUSIONS: &[&str] = &[
    "/System",
    "/usr/bin",
    "/usr/lib",
    "/usr/libexec",
    "/usr/sbin",
    "/usr/share",
    "/bin",
    "/sbin",
    "/private/var/db",
];
114
/// Directory names that exclude a path wherever they appear as one of its
/// components.
const EXCLUDED_COMPONENTS: &[&str] = &[".git"];
117
/// Returns `true` when any strict ancestor of `path` has the extension `app`.
///
/// The path itself is not inspected (hence the `skip(1)`), so a bundle
/// directory such as `Foo.app` is not "inside" a bundle, while everything
/// below it is.
fn is_inside_app_bundle(path: &Path) -> bool {
    path.ancestors()
        .skip(1)
        .filter_map(Path::extension)
        .any(|ext| ext == "app")
}
127
128fn should_exclude(path: &Path, extra_exclusions: &[String]) -> bool {
130 let path_str = path.to_string_lossy();
131
132 for excl in DEFAULT_EXCLUSIONS {
134 if path_str.starts_with(excl) {
135 return true;
136 }
137 }
138
139 for excl in extra_exclusions {
141 if path_str.contains(excl.as_str()) {
142 return true;
143 }
144 }
145
146 for component in path.components() {
148 let s = component.as_os_str().to_string_lossy();
149 for excl in EXCLUDED_COMPONENTS {
150 if s == *excl {
151 return true;
152 }
153 }
154 }
155
156 if is_inside_app_bundle(path) {
158 return true;
159 }
160
161 false
162}
163
164pub fn detect_by_extension(path: &Path) -> FileCategory {
166 let name = path
167 .file_name()
168 .unwrap_or(OsStr::new(""))
169 .to_string_lossy()
170 .to_lowercase();
171
172 if name.ends_with(".tar.gz") || name.ends_with(".tar.bz2") || name.ends_with(".tar.xz") {
174 return FileCategory::Archive;
175 }
176
177 let ext = path
178 .extension()
179 .unwrap_or(OsStr::new(""))
180 .to_string_lossy()
181 .to_lowercase();
182
183 match ext.as_str() {
184 "dmg" | "iso" | "img" | "sparseimage" | "sparsebundle" => FileCategory::DiskImage,
186 "zip" | "tar" | "tgz" | "rar" | "7z" | "xz" | "gz" | "bz2" => FileCategory::Archive,
188 "mp4" | "mov" | "avi" | "mkv" | "wmv" | "flv" | "m4v" | "webm" => FileCategory::Video,
190 "pkg" | "mpkg" => FileCategory::Installer,
192 "vmdk" | "vdi" | "qcow2" | "vhd" => FileCategory::VmImage,
194 "pdf" => FileCategory::Document,
196 _ => FileCategory::Other,
197 }
198}
199
200pub fn validate_by_magic_bytes(path: &Path, extension_category: FileCategory) -> FileCategory {
207 if extension_category == FileCategory::DiskImage {
210 let ext = path
211 .extension()
212 .unwrap_or_default()
213 .to_string_lossy()
214 .to_lowercase();
215 if ext == "dmg" {
216 return FileCategory::DiskImage;
217 }
218 }
219
220 let Ok(kind) = infer::get_from_path(path) else {
221 return extension_category;
222 };
223 let Some(kind) = kind else {
224 return extension_category;
225 };
226
227 let mime = kind.mime_type();
229 if mime.starts_with("video/") {
230 return FileCategory::Video;
231 }
232 match mime {
233 "application/x-apple-diskimage" => FileCategory::DiskImage,
234 "application/zip"
235 | "application/gzip"
236 | "application/x-tar"
237 | "application/x-rar-compressed"
238 | "application/x-7z-compressed"
239 | "application/x-bzip2"
240 | "application/x-xz"
241 | "application/zstd" => FileCategory::Archive,
242 "application/pdf" => FileCategory::Document,
243 "application/x-xar" => FileCategory::Installer, _ => extension_category,
245 }
246}
247
248pub fn check_quarantine(path: &Path) -> (bool, Option<String>) {
251 match xattr::get(path, "com.apple.quarantine") {
252 Ok(Some(value)) => {
253 let value_str = String::from_utf8_lossy(&value);
254 let parts: Vec<&str> = value_str.split(';').collect();
256 let source = parts
257 .get(2)
258 .map(|s| s.to_string())
259 .filter(|s| !s.is_empty());
260 (true, source)
261 }
262 _ => (false, None),
263 }
264}
265
/// Counter that a running scan updates so that other code can observe how
/// many files have been visited so far.
#[derive(Default)]
pub struct ScanProgress {
    /// Number of files visited; updated with relaxed atomic operations.
    pub files_scanned: std::sync::atomic::AtomicU64,
}

impl ScanProgress {
    /// Creates a counter that starts at zero.
    pub fn new() -> Self {
        Self::default()
    }

    /// Returns the current number of visited files.
    pub fn count(&self) -> u64 {
        let counter = &self.files_scanned;
        counter.load(std::sync::atomic::Ordering::Relaxed)
    }
}
289
290pub fn find_files(options: &FindOptions, progress: Option<&ScanProgress>) -> Vec<FoundFile> {
298 let now = SystemTime::now();
299 let min_size = options.min_size.unwrap_or(0);
300 let extra_exclusions = options.extra_exclusions.clone();
301
302 let collector: Arc<Mutex<TopNCollector>> =
304 Arc::new(Mutex::new(TopNCollector::new(options.max_results)));
305
306 for root in &options.root_paths {
307 if !root.exists() {
308 continue;
309 }
310
311 let mut builder = WalkBuilder::new(root);
312 builder
313 .hidden(false) .git_ignore(true) .git_global(false)
316 .git_exclude(false)
317 .follow_links(false) .threads(num_cpus());
319
320 let walker = builder.build_parallel();
321 let collector_ref = Arc::clone(&collector);
322 let extra_excl = extra_exclusions.clone();
323 let older_than = options.older_than;
324 let file_types = options.file_types.clone();
325
326 walker.run(|| {
327 let collector = Arc::clone(&collector_ref);
328 let extra_excl = extra_excl.clone();
329 let file_types = file_types.clone();
330
331 Box::new(move |entry| {
332 let Ok(entry) = entry else {
333 return ignore::WalkState::Continue;
334 };
335
336 let path = entry.path();
337
338 if should_exclude(path, &extra_excl) {
340 return ignore::WalkState::Skip;
341 }
342
343 let Some(file_type) = entry.file_type() else {
345 return ignore::WalkState::Continue;
346 };
347 if !file_type.is_file() {
348 return ignore::WalkState::Continue;
349 }
350
351 if let Some(prog) = progress {
353 prog.files_scanned
354 .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
355 }
356
357 let Ok(meta) = entry.metadata() else {
359 return ignore::WalkState::Continue;
360 };
361
362 let size = meta.len();
363
364 if size < min_size {
366 return ignore::WalkState::Continue;
367 }
368
369 if let Some(max_age) = older_than
371 && let Ok(mtime) = meta.modified()
372 && let Ok(age) = now.duration_since(mtime)
373 && age < max_age
374 {
375 return ignore::WalkState::Continue;
376 }
377
378 let category = detect_by_extension(path);
380
381 if let Some(ref types) = file_types {
383 if !types.contains(&category) && category != FileCategory::Other {
384 return ignore::WalkState::Continue;
385 }
386 if category == FileCategory::Other && !types.contains(&FileCategory::Other) {
388 return ignore::WalkState::Continue;
389 }
390 }
391
392 let mtime = meta.modified().ok();
393
394 let file = FoundFile {
395 path: path.to_path_buf(),
396 size,
397 mtime,
398 category,
399 is_downloaded: false,
400 download_source: None,
401 };
402
403 if let Ok(mut coll) = collector.lock() {
404 coll.push(file);
405 }
406
407 ignore::WalkState::Continue
408 })
409 });
410 }
411
412 let inner = Arc::try_unwrap(collector)
413 .map(|mutex| mutex.into_inner().unwrap_or_else(|e| e.into_inner()))
414 .unwrap_or_else(|arc| {
415 let lock = arc.lock().unwrap();
416 lock.clone()
417 });
418 let mut results = inner.into_vec();
419
420 if options.validate_types {
422 for file in &mut results {
423 file.category = validate_by_magic_bytes(&file.path, file.category);
424 }
425 }
426
427 for file in &mut results {
429 let (is_downloaded, source) = check_quarantine(&file.path);
430 file.is_downloaded = is_downloaded;
431 file.download_source = source;
432 }
433
434 if let Some(ref types) = options.file_types {
436 results.retain(|f| types.contains(&f.category));
437 }
438
439 results.sort_by(|a, b| b.size.cmp(&a.size));
441 results
442}
443
444#[derive(Clone)]
448struct TopNCollector {
449 max: Option<usize>,
450 heap: BinaryHeap<Reverse<FoundFile>>,
451 vec: Vec<FoundFile>,
452}
453
454impl TopNCollector {
455 fn new(max: Option<usize>) -> Self {
456 Self {
457 max,
458 heap: BinaryHeap::new(),
459 vec: Vec::new(),
460 }
461 }
462
463 fn push(&mut self, file: FoundFile) {
464 match self.max {
465 Some(n) if n > 0 => {
466 self.heap.push(Reverse(file));
467 if self.heap.len() > n {
468 self.heap.pop(); }
470 }
471 _ => {
472 self.vec.push(file);
473 }
474 }
475 }
476
477 fn into_vec(self) -> Vec<FoundFile> {
478 match self.max {
479 Some(_) => self
480 .heap
481 .into_sorted_vec()
482 .into_iter()
483 .map(|Reverse(f)| f)
484 .collect(),
485 None => self.vec,
486 }
487 }
488}
489
/// Number of worker threads to use for a scan: the parallelism reported by
/// the standard library, or 4 when it cannot be determined.
fn num_cpus() -> usize {
    match std::thread::available_parallelism() {
        Ok(parallelism) => parallelism.get(),
        Err(_) => 4,
    }
}
496
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;

    /// Creates (or recreates) an empty scratch directory named after `name`.
    fn setup_test_dir(name: &str) -> PathBuf {
        let dir = std::env::temp_dir().join(format!("diskforge_test_ff_{name}"));
        let _ = fs::remove_dir_all(&dir);
        fs::create_dir_all(&dir).unwrap();
        dir
    }

    /// Writes a zero-filled file of `size` bytes at `dir/name`, creating any
    /// missing parent directories, and returns its path.
    fn create_file(dir: &Path, name: &str, size: usize) -> PathBuf {
        let target = dir.join(name);
        if let Some(parent) = target.parent() {
            fs::create_dir_all(parent).unwrap();
        }
        fs::write(&target, vec![0u8; size]).unwrap();
        target
    }

    /// Options shared by most scan tests: a single root, every non-empty file
    /// accepted, content validation switched off.
    fn scan_options(root: &Path) -> FindOptions {
        FindOptions {
            root_paths: vec![root.to_path_buf()],
            min_size: Some(1),
            validate_types: false,
            ..Default::default()
        }
    }

    /// Asserts that every name in `names` is detected as `expected`.
    fn assert_detected(names: &[&str], expected: FileCategory) {
        for name in names {
            assert_eq!(detect_by_extension(Path::new(name)), expected);
        }
    }

    /// Builds the `index`-th synthetic result used by the collector tests.
    fn collector_sample(index: u64) -> FoundFile {
        FoundFile {
            path: PathBuf::from(format!("file{index}")),
            size: (index + 1) * 100,
            mtime: None,
            category: FileCategory::Other,
            is_downloaded: false,
            download_source: None,
        }
    }

    #[test]
    fn detect_extension_disk_image() {
        assert_detected(&["file.dmg", "file.iso", "file.IMG"], FileCategory::DiskImage);
    }

    #[test]
    fn detect_extension_archive() {
        assert_detected(
            &["file.zip", "file.tar.gz", "file.7z", "file.rar"],
            FileCategory::Archive,
        );
    }

    #[test]
    fn detect_extension_video() {
        assert_detected(&["file.mp4", "file.mkv", "movie.MOV"], FileCategory::Video);
    }

    #[test]
    fn detect_extension_installer() {
        assert_detected(&["setup.pkg", "setup.mpkg"], FileCategory::Installer);
    }

    #[test]
    fn detect_extension_vm_image() {
        assert_detected(&["disk.vmdk", "disk.qcow2"], FileCategory::VmImage);
    }

    #[test]
    fn detect_extension_document() {
        assert_detected(&["doc.pdf"], FileCategory::Document);
    }

    #[test]
    fn detect_extension_other() {
        assert_detected(&["file.txt", "file.rs", "noext"], FileCategory::Other);
    }

    #[test]
    fn find_files_size_filter() {
        let tmp = setup_test_dir("size_filter");
        create_file(&tmp, "small.zip", 100);
        create_file(&tmp, "big.zip", 10_000);
        create_file(&tmp, "huge.dmg", 100_000);

        let options = FindOptions {
            min_size: Some(5_000),
            ..scan_options(&tmp)
        };

        let results = find_files(&options, None);
        assert_eq!(results.len(), 2, "Should find 2 files above 5KB");
        assert!(
            results[0].size >= results[1].size,
            "Should be sorted by size desc"
        );

        fs::remove_dir_all(&tmp).ok();
    }

    #[test]
    fn find_files_max_results() {
        let tmp = setup_test_dir("max_results");
        for i in 0..10 {
            create_file(&tmp, &format!("file{i}.zip"), (i + 1) * 1000);
        }

        let options = FindOptions {
            max_results: Some(3),
            ..scan_options(&tmp)
        };

        let results = find_files(&options, None);
        assert_eq!(results.len(), 3, "Should return exactly 3 results");
        assert!(results[0].size >= results[1].size);
        assert!(results[1].size >= results[2].size);

        fs::remove_dir_all(&tmp).ok();
    }

    #[test]
    fn find_files_type_filter() {
        let tmp = setup_test_dir("type_filter");
        create_file(&tmp, "movie.mp4", 5000);
        create_file(&tmp, "archive.zip", 5000);
        create_file(&tmp, "image.dmg", 5000);

        let options = FindOptions {
            file_types: Some(vec![FileCategory::Video]),
            ..scan_options(&tmp)
        };

        let results = find_files(&options, None);
        assert_eq!(results.len(), 1, "Should find only the video");
        assert_eq!(results[0].category, FileCategory::Video);

        fs::remove_dir_all(&tmp).ok();
    }

    #[test]
    fn find_files_excludes_git_objects() {
        let tmp = setup_test_dir("git_objects");
        create_file(&tmp, ".git/objects/pack/bigpack.zip", 10_000);
        create_file(&tmp, "normal.zip", 10_000);

        let results = find_files(&scan_options(&tmp), None);
        assert_eq!(results.len(), 1, "Should exclude .git/objects file");
        assert!(!results[0].path.to_string_lossy().contains(".git"));

        fs::remove_dir_all(&tmp).ok();
    }

    #[test]
    fn find_files_excludes_app_bundle_contents() {
        let tmp = setup_test_dir("app_bundle");
        create_file(&tmp, "SomeApp.app/Contents/MacOS/binary", 10_000);
        create_file(&tmp, "outside.dmg", 10_000);

        let results = find_files(&scan_options(&tmp), None);
        assert_eq!(results.len(), 1, "Should exclude file inside .app bundle");
        assert!(results[0].path.to_string_lossy().contains("outside.dmg"));

        fs::remove_dir_all(&tmp).ok();
    }

    #[test]
    fn find_files_progress_counter() {
        let tmp = setup_test_dir("progress");
        create_file(&tmp, "a.zip", 1000);
        create_file(&tmp, "b.zip", 2000);
        create_file(&tmp, "c.zip", 3000);

        let progress = ScanProgress::new();
        find_files(&scan_options(&tmp), Some(&progress));
        assert!(
            progress.count() >= 3,
            "Should have scanned at least 3 files"
        );

        fs::remove_dir_all(&tmp).ok();
    }

    #[test]
    fn dmg_not_misclassified_as_archive() {
        let tmp = setup_test_dir("dmg_fix");
        let dmg_path = create_file(&tmp, "test.dmg", 5000);

        assert_eq!(
            detect_by_extension(&dmg_path),
            FileCategory::DiskImage,
            "Extension detection should return DiskImage for .dmg"
        );

        let validated = validate_by_magic_bytes(&dmg_path, FileCategory::DiskImage);
        assert_eq!(
            validated,
            FileCategory::DiskImage,
            "DMG files must stay as DiskImage even after magic-byte validation"
        );

        fs::remove_dir_all(&tmp).ok();
    }

    #[test]
    fn top_n_collector_unlimited() {
        let mut coll = TopNCollector::new(None);
        (0..5).map(collector_sample).for_each(|file| coll.push(file));

        let results = coll.into_vec();
        assert_eq!(results.len(), 5);
    }

    #[test]
    fn top_n_collector_limited() {
        let mut coll = TopNCollector::new(Some(2));
        (0..5).map(collector_sample).for_each(|file| coll.push(file));

        let results = coll.into_vec();
        assert_eq!(results.len(), 2);
        assert!(results.iter().all(|f| f.size >= 400));
    }
}