use anyhow::Result;
use rayon::prelude::*;
use std::collections::HashSet;
use std::fs;
use std::path::Path;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Mutex;
use std::time::SystemTime;
use crate::category::{self, Category};
use crate::entry::Entry;
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, clap::ValueEnum)]
pub enum HardlinkPolicy {
#[default]
CountOnce,
CountEach,
}
const PARALLEL_DEPTH: usize = 16;
#[derive(Default, Debug)]
pub struct ScanCounts {
pub items_scanned: AtomicU64,
pub items_skipped: AtomicU64,
}
impl ScanCounts {
pub fn scanned(&self) -> u64 {
self.items_scanned.load(Ordering::Relaxed)
}
pub fn skipped(&self) -> u64 {
self.items_skipped.load(Ordering::Relaxed)
}
}
struct ScanCtx<'a> {
counts: &'a ScanCounts,
#[cfg_attr(not(unix), allow(dead_code))]
hardlinks: HardlinkPolicy,
#[cfg_attr(not(unix), allow(dead_code))]
seen_inodes: Mutex<HashSet<(u64, u64)>>,
}
impl<'a> ScanCtx<'a> {
fn new(counts: &'a ScanCounts, hardlinks: HardlinkPolicy) -> Self {
Self {
counts,
hardlinks,
seen_inodes: Mutex::new(HashSet::new()),
}
}
}
pub fn scan(path: &Path, hardlinks: HardlinkPolicy) -> Result<(Entry, ScanCounts)> {
let counts = ScanCounts::default();
let ctx = ScanCtx::new(&counts, hardlinks);
let entry = scan_recursive(path, 0, Category::Other, &ctx)?;
Ok((entry, counts))
}
pub fn scan_with_progress(
path: &Path,
counts: &ScanCounts,
hardlinks: HardlinkPolicy,
) -> Result<Entry> {
let ctx = ScanCtx::new(counts, hardlinks);
scan_recursive(path, 0, Category::Other, &ctx)
}
fn scan_recursive(
path: &Path,
depth: usize,
inherited: Category,
ctx: &ScanCtx<'_>,
) -> Result<Entry> {
ctx.counts.items_scanned.fetch_add(1, Ordering::Relaxed);
let name = path
.file_name()
.map(|n| n.to_string_lossy().to_string())
.unwrap_or_else(|| path.to_string_lossy().to_string());
let metadata = fs::symlink_metadata(path)?;
let modified_days_ago = days_since_modified(&metadata);
if metadata.is_dir() {
let own = category::classify_dir(&name);
let effective = if inherited != Category::Other {
inherited
} else {
own
};
let children = scan_children(path, depth, effective, ctx);
Ok(Entry::dir(name, effective, modified_days_ago, children))
} else {
let effective = if inherited != Category::Other {
inherited
} else {
category::classify_file(&name)
};
Ok(Entry::file(
name,
file_disk_usage(&metadata, ctx),
effective,
modified_days_ago,
))
}
}
#[cfg(unix)]
fn file_disk_usage(metadata: &fs::Metadata, ctx: &ScanCtx<'_>) -> u64 {
use std::os::unix::fs::MetadataExt;
let bytes = metadata.blocks() * 512;
if ctx.hardlinks == HardlinkPolicy::CountEach {
return bytes;
}
if !metadata.file_type().is_file() || metadata.nlink() <= 1 {
return bytes;
}
let key = (metadata.dev(), metadata.ino());
let mut seen = ctx.seen_inodes.lock().unwrap();
if seen.insert(key) {
bytes
} else {
0
}
}
#[cfg(not(unix))]
fn file_disk_usage(metadata: &fs::Metadata, _ctx: &ScanCtx<'_>) -> u64 {
metadata.len()
}
fn scan_children(path: &Path, depth: usize, inherited: Category, ctx: &ScanCtx<'_>) -> Vec<Entry> {
let entries: Vec<_> = match fs::read_dir(path) {
Ok(rd) => rd
.filter_map(|e| match e {
Ok(entry) => Some(entry),
Err(_) => {
ctx.counts.items_skipped.fetch_add(1, Ordering::Relaxed);
None
}
})
.collect(),
Err(_) => {
ctx.counts.items_skipped.fetch_add(1, Ordering::Relaxed);
return Vec::new();
}
};
if depth < PARALLEL_DEPTH {
entries
.par_iter()
.filter_map(
|entry| match scan_recursive(&entry.path(), depth + 1, inherited, ctx) {
Ok(e) => Some(e),
Err(_) => {
ctx.counts.items_skipped.fetch_add(1, Ordering::Relaxed);
None
}
},
)
.collect()
} else {
entries
.iter()
.filter_map(
|entry| match scan_recursive(&entry.path(), depth + 1, inherited, ctx) {
Ok(e) => Some(e),
Err(_) => {
ctx.counts.items_skipped.fetch_add(1, Ordering::Relaxed);
None
}
},
)
.collect()
}
}
fn days_since_modified(metadata: &fs::Metadata) -> Option<u64> {
let modified = metadata.modified().ok()?;
let duration = SystemTime::now().duration_since(modified).ok()?;
Some(duration.as_secs() / 86400)
}
#[cfg(test)]
mod tests {
use super::*;
use std::fs;
fn unique_tmp(label: &str) -> std::path::PathBuf {
std::env::temp_dir().join(format!(
"duvis_test_{}_{}_{}",
label,
std::process::id(),
std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.map(|d| d.as_nanos())
.unwrap_or(0)
))
}
fn find<'a>(entry: &'a Entry, name: &str) -> Option<&'a Entry> {
entry.children()?.iter().find(|c| c.name == name)
}
#[test]
fn scan_walks_a_real_directory() {
let tmp = unique_tmp("walk");
let _ = fs::remove_dir_all(&tmp);
fs::create_dir_all(tmp.join("sub")).unwrap();
fs::write(tmp.join("a.txt"), b"hello").unwrap();
fs::write(tmp.join("sub/b.txt"), b"world!").unwrap();
let (entry, _counts) = scan(&tmp, HardlinkPolicy::CountOnce).unwrap();
assert!(entry.is_dir());
assert!(entry.size >= 5 + 6);
let children = entry.children().unwrap();
assert_eq!(children.len(), 2);
fs::remove_dir_all(&tmp).unwrap();
}
#[test]
fn category_propagates_into_node_modules() {
let tmp = unique_tmp("nm");
let _ = fs::remove_dir_all(&tmp);
fs::create_dir_all(tmp.join("node_modules/some-pkg")).unwrap();
fs::write(tmp.join("node_modules/some-pkg/preview.png"), b"x").unwrap();
fs::write(tmp.join("node_modules/some-pkg/debug.log"), b"x").unwrap();
let (root, _counts) = scan(&tmp, HardlinkPolicy::CountOnce).unwrap();
let nm = find(&root, "node_modules").unwrap();
assert_eq!(nm.category, Category::Cache);
let pkg = find(nm, "some-pkg").unwrap();
assert_eq!(pkg.category, Category::Cache);
let png = find(pkg, "preview.png").unwrap();
assert_eq!(png.category, Category::Cache);
let log = find(pkg, "debug.log").unwrap();
assert_eq!(log.category, Category::Cache);
fs::remove_dir_all(&tmp).unwrap();
}
#[test]
fn outermost_ancestor_wins() {
let tmp = unique_tmp("nest");
let _ = fs::remove_dir_all(&tmp);
fs::create_dir_all(tmp.join("node_modules/inner/target")).unwrap();
fs::write(tmp.join("node_modules/inner/target/main.o"), b"x").unwrap();
let (root, _counts) = scan(&tmp, HardlinkPolicy::CountOnce).unwrap();
let nm = find(&root, "node_modules").unwrap();
let inner = find(nm, "inner").unwrap();
let target = find(inner, "target").unwrap();
assert_eq!(target.category, Category::Cache);
assert_eq!(find(target, "main.o").unwrap().category, Category::Cache);
fs::remove_dir_all(&tmp).unwrap();
}
#[test]
fn leaves_keep_own_category_outside_named_dirs() {
let tmp = unique_tmp("free");
let _ = fs::remove_dir_all(&tmp);
fs::create_dir_all(tmp.join("Movies")).unwrap();
fs::write(tmp.join("Movies/clip.mp4"), b"x").unwrap();
fs::write(tmp.join("Movies/notes.txt"), b"x").unwrap();
let (root, _counts) = scan(&tmp, HardlinkPolicy::CountOnce).unwrap();
let movies = find(&root, "Movies").unwrap();
assert_eq!(movies.category, Category::Other);
assert_eq!(find(movies, "clip.mp4").unwrap().category, Category::Media);
assert_eq!(find(movies, "notes.txt").unwrap().category, Category::Other);
fs::remove_dir_all(&tmp).unwrap();
}
#[test]
fn counts_are_populated_on_clean_scan() {
let tmp = unique_tmp("counts");
let _ = fs::remove_dir_all(&tmp);
fs::create_dir_all(tmp.join("a")).unwrap();
fs::write(tmp.join("a/x.txt"), b"x").unwrap();
fs::write(tmp.join("y.txt"), b"y").unwrap();
let (_, counts) = scan(&tmp, HardlinkPolicy::CountOnce).unwrap();
assert_eq!(counts.scanned(), 4);
assert_eq!(counts.skipped(), 0);
fs::remove_dir_all(&tmp).unwrap();
}
#[cfg(unix)]
#[test]
fn unreadable_directory_is_counted_as_skipped() {
use std::os::unix::fs::PermissionsExt;
let tmp = unique_tmp("noperm");
let _ = fs::remove_dir_all(&tmp);
fs::create_dir_all(tmp.join("locked")).unwrap();
fs::write(tmp.join("locked/secret"), b"x").unwrap();
fs::write(tmp.join("readable"), b"y").unwrap();
let mut perms = fs::metadata(tmp.join("locked")).unwrap().permissions();
perms.set_mode(0o000);
fs::set_permissions(tmp.join("locked"), perms).unwrap();
let (_, counts) = scan(&tmp, HardlinkPolicy::CountOnce).unwrap();
assert!(
counts.skipped() >= 1,
"expected ≥1 skip, got {}",
counts.skipped(),
);
let mut perms = fs::metadata(tmp.join("locked")).unwrap().permissions();
perms.set_mode(0o755);
fs::set_permissions(tmp.join("locked"), perms).unwrap();
fs::remove_dir_all(&tmp).unwrap();
}
#[cfg(unix)]
#[test]
fn hardlink_dedup_obeys_policy() {
let tmp = unique_tmp("hardlink");
let _ = fs::remove_dir_all(&tmp);
fs::create_dir_all(&tmp).unwrap();
fs::write(tmp.join("original.bin"), vec![b'x'; 4096]).unwrap();
fs::hard_link(tmp.join("original.bin"), tmp.join("alias.bin")).unwrap();
let (once, _) = scan(&tmp, HardlinkPolicy::CountOnce).unwrap();
let (each, _) = scan(&tmp, HardlinkPolicy::CountEach).unwrap();
assert!(once.size > 0);
assert_eq!(each.size, once.size * 2);
let once_children = once.children().unwrap();
let once_sizes: Vec<u64> = once_children.iter().map(|c| c.size).collect();
assert_eq!(once_sizes.iter().filter(|s| **s == 0).count(), 1);
assert_eq!(once_sizes.iter().filter(|s| **s > 0).count(), 1);
for c in each.children().unwrap() {
assert!(c.size > 0, "{} reported 0 under CountEach", c.name);
}
fs::remove_dir_all(&tmp).unwrap();
}
}