1use std::cmp::Reverse;
2use std::collections::BinaryHeap;
3use std::ffi::OsStr;
4use std::path::{Path, PathBuf};
5use std::sync::{Arc, Mutex};
6use std::time::{Duration, SystemTime};
7
8use ignore::WalkBuilder;
9
/// Coarse classification assigned to every file the scanner reports.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum FileCategory {
    /// Disk images such as `.dmg`, `.iso` or `.img`.
    DiskImage,
    /// Compressed or bundled archives such as `.zip`, `.tar.gz` or `.7z`.
    Archive,
    /// Video containers such as `.mp4`, `.mov` or `.mkv`.
    Video,
    /// Installer packages (`.pkg`, `.mpkg`).
    Installer,
    /// Virtual machine disks such as `.vmdk`, `.vdi` or `.qcow2`.
    VmImage,
    /// PDF documents.
    Document,
    /// Anything that matches none of the other categories.
    Other,
}

/// Renders the human-readable label of a category (for example `"Disk Image"`).
impl std::fmt::Display for FileCategory {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match *self {
            Self::DiskImage => "Disk Image",
            Self::Archive => "Archive",
            Self::Video => "Video",
            Self::Installer => "Installer",
            Self::VmImage => "VM Image",
            Self::Document => "Document",
            Self::Other => "Other",
        };
        // Written verbatim: width/precision flags of the formatter are not applied.
        f.write_str(label)
    }
}
35
36#[derive(Debug, Clone)]
38pub struct FoundFile {
39 pub path: PathBuf,
40 pub size: u64,
41 pub mtime: Option<SystemTime>,
42 pub category: FileCategory,
43 pub is_downloaded: bool,
44 pub download_source: Option<String>,
45}
46
47impl PartialEq for FoundFile {
48 fn eq(&self, other: &Self) -> bool {
49 self.size == other.size && self.path == other.path
50 }
51}
52
53impl Eq for FoundFile {}
54
55impl PartialOrd for FoundFile {
56 fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
57 Some(self.cmp(other))
58 }
59}
60
61impl Ord for FoundFile {
62 fn cmp(&self, other: &Self) -> std::cmp::Ordering {
63 self.size
64 .cmp(&other.size)
65 .then_with(|| self.path.cmp(&other.path))
66 }
67}
68
69#[derive(Debug, Clone)]
71pub struct FindOptions {
72 pub root_paths: Vec<PathBuf>,
74 pub min_size: Option<u64>,
76 pub older_than: Option<Duration>,
78 pub file_types: Option<Vec<FileCategory>>,
80 pub max_results: Option<usize>,
82 pub validate_types: bool,
84 pub extra_exclusions: Vec<String>,
86}
87
88impl Default for FindOptions {
89 fn default() -> Self {
90 Self {
91 root_paths: Vec::new(),
92 min_size: None,
93 older_than: None,
94 file_types: None,
95 max_results: None,
96 validate_types: true,
97 extra_exclusions: Vec::new(),
98 }
99 }
100}
101
/// Absolute system locations that are never scanned; any path beginning with one of
/// these entries is skipped by `should_exclude`.
const DEFAULT_EXCLUSIONS: &[&str] = &[
    "/System",
    "/usr/bin",
    "/usr/lib",
    "/usr/libexec",
    "/usr/sbin",
    "/usr/share",
    "/bin",
    "/sbin",
    "/private/var/db",
];
114
/// Directory names that exclude a path wherever they appear as one of its components.
const EXCLUDED_COMPONENTS: &[&str] = &[".git"];
117
/// Returns `true` when any proper ancestor of `path` has the extension `app`.
///
/// The path itself is not considered, so the bundle directory (`Foo.app`) is reported
/// as outside while everything beneath it is reported as inside. The comparison is
/// case-sensitive.
fn is_inside_app_bundle(path: &Path) -> bool {
    path.ancestors()
        // `ancestors()` yields the path itself first; only its parents matter here.
        .skip(1)
        .any(|dir| dir.extension().is_some_and(|ext| ext == "app"))
}
127
128fn should_exclude(path: &Path, extra_exclusions: &[String]) -> bool {
130 let path_str = path.to_string_lossy();
131
132 for excl in DEFAULT_EXCLUSIONS {
134 if path_str.starts_with(excl) {
135 return true;
136 }
137 }
138
139 for excl in extra_exclusions {
141 if path_str.contains(excl.as_str()) {
142 return true;
143 }
144 }
145
146 for component in path.components() {
148 let s = component.as_os_str().to_string_lossy();
149 for excl in EXCLUDED_COMPONENTS {
150 if s == *excl {
151 return true;
152 }
153 }
154 }
155
156 if is_inside_app_bundle(path) {
158 return true;
159 }
160
161 false
162}
163
164pub fn detect_by_extension(path: &Path) -> FileCategory {
166 let name = path
167 .file_name()
168 .unwrap_or(OsStr::new(""))
169 .to_string_lossy()
170 .to_lowercase();
171
172 if name.ends_with(".tar.gz") || name.ends_with(".tar.bz2") || name.ends_with(".tar.xz") {
174 return FileCategory::Archive;
175 }
176
177 let ext = path
178 .extension()
179 .unwrap_or(OsStr::new(""))
180 .to_string_lossy()
181 .to_lowercase();
182
183 match ext.as_str() {
184 "dmg" | "iso" | "img" | "sparseimage" | "sparsebundle" => FileCategory::DiskImage,
186 "zip" | "tar" | "tgz" | "rar" | "7z" | "xz" | "gz" | "bz2" => FileCategory::Archive,
188 "mp4" | "mov" | "avi" | "mkv" | "wmv" | "flv" | "m4v" | "webm" => FileCategory::Video,
190 "pkg" | "mpkg" => FileCategory::Installer,
192 "vmdk" | "vdi" | "qcow2" | "vhd" => FileCategory::VmImage,
194 "pdf" => FileCategory::Document,
196 _ => FileCategory::Other,
197 }
198}
199
200pub fn validate_by_magic_bytes(path: &Path, extension_category: FileCategory) -> FileCategory {
204 let Ok(kind) = infer::get_from_path(path) else {
205 return extension_category;
206 };
207 let Some(kind) = kind else {
208 return extension_category;
209 };
210
211 let mime = kind.mime_type();
213 if mime.starts_with("video/") {
214 return FileCategory::Video;
215 }
216 match mime {
217 "application/x-apple-diskimage" => FileCategory::DiskImage,
218 "application/zip"
219 | "application/gzip"
220 | "application/x-tar"
221 | "application/x-rar-compressed"
222 | "application/x-7z-compressed"
223 | "application/x-bzip2"
224 | "application/x-xz"
225 | "application/zstd" => FileCategory::Archive,
226 "application/pdf" => FileCategory::Document,
227 "application/x-xar" => FileCategory::Installer, _ => extension_category,
229 }
230}
231
232pub fn check_quarantine(path: &Path) -> (bool, Option<String>) {
235 match xattr::get(path, "com.apple.quarantine") {
236 Ok(Some(value)) => {
237 let value_str = String::from_utf8_lossy(&value);
238 let parts: Vec<&str> = value_str.split(';').collect();
240 let source = parts
241 .get(2)
242 .map(|s| s.to_string())
243 .filter(|s| !s.is_empty());
244 (true, source)
245 }
246 _ => (false, None),
247 }
248}
249
/// Shared counter that lets a caller observe how far a running scan has progressed.
#[derive(Default)]
pub struct ScanProgress {
    /// Number of regular files visited so far (incremented before any filtering).
    pub files_scanned: std::sync::atomic::AtomicU64,
}

impl ScanProgress {
    /// Creates a counter starting at zero.
    pub fn new() -> Self {
        Self::default()
    }

    /// Returns the current number of scanned files (relaxed atomic load).
    pub fn count(&self) -> u64 {
        use std::sync::atomic::Ordering::Relaxed;

        self.files_scanned.load(Relaxed)
    }
}
273
274pub fn find_files(options: &FindOptions, progress: Option<&ScanProgress>) -> Vec<FoundFile> {
282 let now = SystemTime::now();
283 let min_size = options.min_size.unwrap_or(0);
284 let extra_exclusions = options.extra_exclusions.clone();
285
286 let collector: Arc<Mutex<TopNCollector>> =
288 Arc::new(Mutex::new(TopNCollector::new(options.max_results)));
289
290 for root in &options.root_paths {
291 if !root.exists() {
292 continue;
293 }
294
295 let mut builder = WalkBuilder::new(root);
296 builder
297 .hidden(false) .git_ignore(true) .git_global(false)
300 .git_exclude(false)
301 .follow_links(false) .threads(num_cpus());
303
304 let walker = builder.build_parallel();
305 let collector_ref = Arc::clone(&collector);
306 let extra_excl = extra_exclusions.clone();
307 let older_than = options.older_than;
308 let file_types = options.file_types.clone();
309
310 walker.run(|| {
311 let collector = Arc::clone(&collector_ref);
312 let extra_excl = extra_excl.clone();
313 let file_types = file_types.clone();
314
315 Box::new(move |entry| {
316 let Ok(entry) = entry else {
317 return ignore::WalkState::Continue;
318 };
319
320 let path = entry.path();
321
322 if should_exclude(path, &extra_excl) {
324 return ignore::WalkState::Skip;
325 }
326
327 let Some(file_type) = entry.file_type() else {
329 return ignore::WalkState::Continue;
330 };
331 if !file_type.is_file() {
332 return ignore::WalkState::Continue;
333 }
334
335 if let Some(prog) = progress {
337 prog.files_scanned
338 .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
339 }
340
341 let Ok(meta) = entry.metadata() else {
343 return ignore::WalkState::Continue;
344 };
345
346 let size = meta.len();
347
348 if size < min_size {
350 return ignore::WalkState::Continue;
351 }
352
353 if let Some(max_age) = older_than
355 && let Ok(mtime) = meta.modified()
356 && let Ok(age) = now.duration_since(mtime)
357 && age < max_age {
358 return ignore::WalkState::Continue;
359 }
360
361 let category = detect_by_extension(path);
363
364 if let Some(ref types) = file_types {
366 if !types.contains(&category) && category != FileCategory::Other {
367 return ignore::WalkState::Continue;
368 }
369 if category == FileCategory::Other && !types.contains(&FileCategory::Other) {
371 return ignore::WalkState::Continue;
372 }
373 }
374
375 let mtime = meta.modified().ok();
376
377 let file = FoundFile {
378 path: path.to_path_buf(),
379 size,
380 mtime,
381 category,
382 is_downloaded: false,
383 download_source: None,
384 };
385
386 if let Ok(mut coll) = collector.lock() {
387 coll.push(file);
388 }
389
390 ignore::WalkState::Continue
391 })
392 });
393 }
394
395 let inner = Arc::try_unwrap(collector)
396 .map(|mutex| mutex.into_inner().unwrap_or_else(|e| e.into_inner()))
397 .unwrap_or_else(|arc| {
398 let lock = arc.lock().unwrap();
399 lock.clone()
400 });
401 let mut results = inner.into_vec();
402
403 if options.validate_types {
405 for file in &mut results {
406 file.category = validate_by_magic_bytes(&file.path, file.category);
407 }
408 }
409
410 for file in &mut results {
412 let (is_downloaded, source) = check_quarantine(&file.path);
413 file.is_downloaded = is_downloaded;
414 file.download_source = source;
415 }
416
417 if let Some(ref types) = options.file_types {
419 results.retain(|f| types.contains(&f.category));
420 }
421
422 results.sort_by(|a, b| b.size.cmp(&a.size));
424 results
425}
426
427#[derive(Clone)]
431struct TopNCollector {
432 max: Option<usize>,
433 heap: BinaryHeap<Reverse<FoundFile>>,
434 vec: Vec<FoundFile>,
435}
436
437impl TopNCollector {
438 fn new(max: Option<usize>) -> Self {
439 Self {
440 max,
441 heap: BinaryHeap::new(),
442 vec: Vec::new(),
443 }
444 }
445
446 fn push(&mut self, file: FoundFile) {
447 match self.max {
448 Some(n) if n > 0 => {
449 self.heap.push(Reverse(file));
450 if self.heap.len() > n {
451 self.heap.pop(); }
453 }
454 _ => {
455 self.vec.push(file);
456 }
457 }
458 }
459
460 fn into_vec(self) -> Vec<FoundFile> {
461 match self.max {
462 Some(_) => self
463 .heap
464 .into_sorted_vec()
465 .into_iter()
466 .map(|Reverse(f)| f)
467 .collect(),
468 None => self.vec,
469 }
470 }
471}
472
/// Number of worker threads to use for the directory walk.
///
/// Uses the parallelism reported by the standard library and falls back to 4 when
/// that cannot be determined.
fn num_cpus() -> usize {
    match std::thread::available_parallelism() {
        Ok(count) => count.get(),
        Err(_) => 4,
    }
}
479
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;

    /// Returns a freshly emptied scratch directory under the system temp dir.
    fn setup_test_dir(name: &str) -> PathBuf {
        let dir = std::env::temp_dir().join(format!("diskforge_test_ff_{name}"));
        // Leftovers from an earlier run are wiped; a missing directory is not an error.
        fs::remove_dir_all(&dir).ok();
        fs::create_dir_all(&dir).unwrap();
        dir
    }

    /// Writes `size` zero bytes to `dir/name`, creating parent directories as needed.
    fn create_file(dir: &Path, name: &str, size: usize) -> PathBuf {
        let target = dir.join(name);
        if let Some(parent) = target.parent() {
            fs::create_dir_all(parent).unwrap();
        }
        fs::write(&target, vec![0u8; size]).unwrap();
        target
    }

    /// Options shared by the scan tests: one root, a 1-byte minimum and
    /// extension-only categorisation.
    fn scan_options(root: &Path) -> FindOptions {
        FindOptions {
            root_paths: vec![root.to_path_buf()],
            min_size: Some(1),
            validate_types: false,
            ..Default::default()
        }
    }

    /// Asserts that every name in `names` is classified as `expected`.
    fn assert_detected(expected: FileCategory, names: &[&str]) {
        for name in names {
            assert_eq!(detect_by_extension(Path::new(name)), expected);
        }
    }

    /// Builds the `index`-th synthetic entry used by the collector tests.
    fn synthetic_file(index: u64) -> FoundFile {
        FoundFile {
            path: PathBuf::from(format!("file{index}")),
            size: (index + 1) * 100,
            mtime: None,
            category: FileCategory::Other,
            is_downloaded: false,
            download_source: None,
        }
    }

    #[test]
    fn detect_extension_disk_image() {
        assert_detected(FileCategory::DiskImage, &["file.dmg", "file.iso", "file.IMG"]);
    }

    #[test]
    fn detect_extension_archive() {
        assert_detected(
            FileCategory::Archive,
            &["file.zip", "file.tar.gz", "file.7z", "file.rar"],
        );
    }

    #[test]
    fn detect_extension_video() {
        assert_detected(FileCategory::Video, &["file.mp4", "file.mkv", "movie.MOV"]);
    }

    #[test]
    fn detect_extension_installer() {
        assert_detected(FileCategory::Installer, &["setup.pkg", "setup.mpkg"]);
    }

    #[test]
    fn detect_extension_vm_image() {
        assert_detected(FileCategory::VmImage, &["disk.vmdk", "disk.qcow2"]);
    }

    #[test]
    fn detect_extension_document() {
        assert_detected(FileCategory::Document, &["doc.pdf"]);
    }

    #[test]
    fn detect_extension_other() {
        assert_detected(FileCategory::Other, &["file.txt", "file.rs", "noext"]);
    }

    #[test]
    fn find_files_size_filter() {
        let tmp = setup_test_dir("size_filter");
        for (name, size) in [("small.zip", 100), ("big.zip", 10_000), ("huge.dmg", 100_000)] {
            create_file(&tmp, name, size);
        }

        let options = FindOptions {
            min_size: Some(5_000),
            ..scan_options(&tmp)
        };

        let results = find_files(&options, None);
        assert_eq!(results.len(), 2, "Should find 2 files above 5KB");
        assert!(
            results[0].size >= results[1].size,
            "Should be sorted by size desc"
        );

        fs::remove_dir_all(&tmp).ok();
    }

    #[test]
    fn find_files_max_results() {
        let tmp = setup_test_dir("max_results");
        for i in 0..10usize {
            create_file(&tmp, &format!("file{i}.zip"), (i + 1) * 1000);
        }

        let options = FindOptions {
            max_results: Some(3),
            ..scan_options(&tmp)
        };

        let results = find_files(&options, None);
        assert_eq!(results.len(), 3, "Should return exactly 3 results");
        assert!(results[0].size >= results[1].size);
        assert!(results[1].size >= results[2].size);

        fs::remove_dir_all(&tmp).ok();
    }

    #[test]
    fn find_files_type_filter() {
        let tmp = setup_test_dir("type_filter");
        for name in ["movie.mp4", "archive.zip", "image.dmg"] {
            create_file(&tmp, name, 5000);
        }

        let options = FindOptions {
            file_types: Some(vec![FileCategory::Video]),
            ..scan_options(&tmp)
        };

        let results = find_files(&options, None);
        assert_eq!(results.len(), 1, "Should find only the video");
        assert_eq!(results[0].category, FileCategory::Video);

        fs::remove_dir_all(&tmp).ok();
    }

    #[test]
    fn find_files_excludes_git_objects() {
        let tmp = setup_test_dir("git_objects");
        create_file(&tmp, ".git/objects/pack/bigpack.zip", 10_000);
        create_file(&tmp, "normal.zip", 10_000);

        let results = find_files(&scan_options(&tmp), None);
        assert_eq!(results.len(), 1, "Should exclude .git/objects file");
        assert!(!results[0].path.to_string_lossy().contains(".git"));

        fs::remove_dir_all(&tmp).ok();
    }

    #[test]
    fn find_files_excludes_app_bundle_contents() {
        let tmp = setup_test_dir("app_bundle");
        create_file(&tmp, "SomeApp.app/Contents/MacOS/binary", 10_000);
        create_file(&tmp, "outside.dmg", 10_000);

        let results = find_files(&scan_options(&tmp), None);
        assert_eq!(results.len(), 1, "Should exclude file inside .app bundle");
        assert!(results[0].path.to_string_lossy().contains("outside.dmg"));

        fs::remove_dir_all(&tmp).ok();
    }

    #[test]
    fn find_files_progress_counter() {
        let tmp = setup_test_dir("progress");
        for (name, size) in [("a.zip", 1000), ("b.zip", 2000), ("c.zip", 3000)] {
            create_file(&tmp, name, size);
        }

        let progress = ScanProgress::new();
        find_files(&scan_options(&tmp), Some(&progress));
        assert!(
            progress.count() >= 3,
            "Should have scanned at least 3 files"
        );

        fs::remove_dir_all(&tmp).ok();
    }

    #[test]
    fn top_n_collector_unlimited() {
        let mut coll = TopNCollector::new(None);
        (0..5).map(synthetic_file).for_each(|file| coll.push(file));

        let results = coll.into_vec();
        assert_eq!(results.len(), 5);
    }

    #[test]
    fn top_n_collector_limited() {
        let mut coll = TopNCollector::new(Some(2));
        (0..5).map(synthetic_file).for_each(|file| coll.push(file));

        let results = coll.into_vec();
        assert_eq!(results.len(), 2);
        assert!(results.iter().all(|f| f.size >= 400));
    }
}