Skip to main content

coreutils_rs/du/
core.rs

1use std::collections::HashSet;
2use std::io::{self, BufRead, Write};
3use std::os::unix::fs::MetadataExt;
4use std::path::{Path, PathBuf};
5use std::time::SystemTime;
6
7/// Configuration for the `du` command.
8pub struct DuConfig {
9    /// Show counts for all files, not just directories.
10    pub all: bool,
11    /// Print apparent sizes rather than disk usage.
12    pub apparent_size: bool,
13    /// Block size for output scaling.
14    pub block_size: u64,
15    /// Human-readable output (powers of 1024).
16    pub human_readable: bool,
17    /// Human-readable output (powers of 1000).
18    pub si: bool,
19    /// Produce a grand total.
20    pub total: bool,
21    /// Maximum depth of directory traversal to display.
22    pub max_depth: Option<usize>,
23    /// Only display a total for each argument.
24    pub summarize: bool,
25    /// Stay on the same filesystem.
26    pub one_file_system: bool,
27    /// Dereference all symbolic links.
28    pub dereference: bool,
29    /// Dereference only command-line symlink arguments (-D / -H / --dereference-args).
30    pub dereference_args: bool,
31    /// For directories, do not include size of subdirectories.
32    pub separate_dirs: bool,
33    /// Count sizes of hard-linked files multiple times.
34    pub count_links: bool,
35    /// End output lines with NUL instead of newline.
36    pub null_terminator: bool,
37    /// Exclude entries smaller (or larger if negative) than this threshold.
38    pub threshold: Option<i64>,
39    /// Show modification time of entries.
40    pub show_time: bool,
41    /// Time format style (full-iso, long-iso, iso).
42    pub time_style: String,
43    /// Glob patterns to exclude.
44    pub exclude_patterns: Vec<String>,
45    /// Count inodes instead of sizes.
46    pub inodes: bool,
47}
48
49impl Default for DuConfig {
50    fn default() -> Self {
51        DuConfig {
52            all: false,
53            apparent_size: false,
54            block_size: 1024,
55            human_readable: false,
56            si: false,
57            total: false,
58            max_depth: None,
59            summarize: false,
60            one_file_system: false,
61            dereference: false,
62            dereference_args: false,
63            separate_dirs: false,
64            count_links: false,
65            null_terminator: false,
66            threshold: None,
67            show_time: false,
68            time_style: "long-iso".to_string(),
69            exclude_patterns: Vec::new(),
70            inodes: false,
71        }
72    }
73}
74
75/// A single entry produced by `du` traversal.
76pub struct DuEntry {
77    /// Size in bytes (or inode count if inodes mode).
78    pub size: u64,
79    /// Path of the entry.
80    pub path: PathBuf,
81    /// Modification time (seconds since epoch), if available.
82    pub mtime: Option<i64>,
83}
84
85/// Traverse `path` and collect `DuEntry` results according to `config`.
86pub fn du_path(path: &Path, config: &DuConfig) -> io::Result<Vec<DuEntry>> {
87    let mut seen_inodes: HashSet<(u64, u64)> = HashSet::new();
88    let mut had_error = false;
89    du_path_with_seen(path, config, &mut seen_inodes, &mut had_error)
90}
91
92/// Traverse `path` with a shared inode set (for deduplication across multiple arguments).
93/// Sets `had_error` to true if any permission or access errors are encountered.
94pub fn du_path_with_seen(
95    path: &Path,
96    config: &DuConfig,
97    seen_inodes: &mut HashSet<(u64, u64)>,
98    had_error: &mut bool,
99) -> io::Result<Vec<DuEntry>> {
100    let mut entries = Vec::new();
101    du_recursive(path, config, seen_inodes, &mut entries, 0, None, had_error)?;
102    Ok(entries)
103}
104
105/// Check whether a path should be excluded by any of the exclude patterns.
106/// GNU du matches the pattern against the basename and the full path.
107fn is_excluded(path: &Path, config: &DuConfig) -> bool {
108    if config.exclude_patterns.is_empty() {
109        return false;
110    }
111    let path_str = path.to_string_lossy();
112    let basename = path
113        .file_name()
114        .map(|n| n.to_string_lossy())
115        .unwrap_or_default();
116    config
117        .exclude_patterns
118        .iter()
119        .any(|pat| glob_match(pat, &basename) || glob_match(pat, &path_str))
120}
121
122/// Recursive traversal core. Returns the cumulative size of the subtree at `path`.
123fn du_recursive(
124    path: &Path,
125    config: &DuConfig,
126    seen: &mut HashSet<(u64, u64)>,
127    entries: &mut Vec<DuEntry>,
128    depth: usize,
129    root_dev: Option<u64>,
130    had_error: &mut bool,
131) -> io::Result<u64> {
132    // Check exclude patterns against this path (GNU du applies exclude to all entries
133    // including the root argument itself).
134    if is_excluded(path, config) {
135        return Ok(0);
136    }
137
138    // For depth 0 (command-line arguments), dereference_args means follow symlinks.
139    let meta = if config.dereference || (depth == 0 && config.dereference_args) {
140        std::fs::metadata(path)?
141    } else {
142        std::fs::symlink_metadata(path)?
143    };
144
145    // Check one-file-system: skip entries on different devices.
146    if let Some(dev) = root_dev {
147        if meta.dev() != dev && config.one_file_system {
148            return Ok(0);
149        }
150    }
151
152    // Track hard links: skip files we have already counted (unless --count-links).
153    let ino_key = (meta.dev(), meta.ino());
154    if meta.nlink() > 1 && !config.count_links {
155        if !seen.insert(ino_key) {
156            return Ok(0);
157        }
158    }
159
160    let size = if config.inodes {
161        1
162    } else if config.apparent_size {
163        // GNU du uses 0 for directory entries themselves in apparent-size mode;
164        // only file content sizes are counted.
165        if meta.is_dir() { 0 } else { meta.len() }
166    } else {
167        meta.blocks() * 512
168    };
169
170    let mtime = meta.mtime();
171
172    if meta.is_dir() {
173        // Track the full subtree size (for returning to parent)
174        // and the display size (what we show for this directory).
175        let mut subtree_size: u64 = size;
176        // For separate_dirs, display size only includes this dir + direct files, not subdirs.
177        let mut display_size: u64 = size;
178
179        let read_dir = match std::fs::read_dir(path) {
180            Ok(rd) => rd,
181            Err(e) => {
182                eprintln!(
183                    "du: cannot read directory '{}': {}",
184                    path.display(),
185                    format_io_error(&e)
186                );
187                *had_error = true;
188                // Still report what we can for this directory.
189                if should_report_dir(config, depth) {
190                    entries.push(DuEntry {
191                        size,
192                        path: path.to_path_buf(),
193                        mtime: if config.show_time { Some(mtime) } else { None },
194                    });
195                }
196                return Ok(size);
197            }
198        };
199
200        for entry in read_dir {
201            let entry = match entry {
202                Ok(e) => e,
203                Err(e) => {
204                    eprintln!(
205                        "du: cannot access entry in '{}': {}",
206                        path.display(),
207                        format_io_error(&e)
208                    );
209                    *had_error = true;
210                    continue;
211                }
212            };
213            let child_path = entry.path();
214
215            // Check exclude patterns against both basename and full path (GNU compat).
216            if is_excluded(&child_path, config) {
217                continue;
218            }
219
220            // Use DirEntry::file_type() which is free on Linux (uses d_type from readdir)
221            // instead of an extra symlink_metadata() syscall. Fall back to symlink_metadata()
222            // on filesystems that return DT_UNKNOWN (old XFS, NFS, some FUSE).
223            let child_is_dir = entry
224                .file_type()
225                .or_else(|_| std::fs::symlink_metadata(&child_path).map(|m| m.file_type()))
226                .map_or(false, |ft| ft.is_dir());
227
228            let child_size = du_recursive(
229                &child_path,
230                config,
231                seen,
232                entries,
233                depth + 1,
234                Some(root_dev.unwrap_or(meta.dev())),
235                had_error,
236            )?;
237            subtree_size += child_size;
238            if config.separate_dirs && child_is_dir {
239                // Don't add subdirectory sizes to display size.
240            } else {
241                display_size += child_size;
242            }
243        }
244
245        // Emit an entry for this directory if within display depth.
246        if should_report_dir(config, depth) {
247            entries.push(DuEntry {
248                size: display_size,
249                path: path.to_path_buf(),
250                mtime: if config.show_time { Some(mtime) } else { None },
251            });
252        }
253
254        Ok(subtree_size)
255    } else {
256        // Regular file / symlink / special file.
257        // Always report top-level arguments (depth 0), or all files if --all.
258        if (depth == 0 || config.all) && within_depth(config, depth) {
259            entries.push(DuEntry {
260                size,
261                path: path.to_path_buf(),
262                mtime: if config.show_time { Some(mtime) } else { None },
263            });
264        }
265        Ok(size)
266    }
267}
268
269/// Whether a directory entry at `depth` should be reported.
270fn should_report_dir(config: &DuConfig, depth: usize) -> bool {
271    if config.summarize {
272        return depth == 0;
273    }
274    within_depth(config, depth)
275}
276
277/// Whether `depth` is within the configured max_depth.
278fn within_depth(config: &DuConfig, depth: usize) -> bool {
279    match config.max_depth {
280        Some(max) => depth <= max,
281        None => true,
282    }
283}
284
285/// Glob matching supporting `*`, `?`, and `[...]`/`[^...]` character classes.
286/// Compatible with fnmatch(3) FNM_PATHNAME-less matching used by GNU du.
287pub fn glob_match(pattern: &str, text: &str) -> bool {
288    let pat: Vec<char> = pattern.chars().collect();
289    let txt: Vec<char> = text.chars().collect();
290    glob_match_inner(&pat, &txt)
291}
292
293/// Try to match a `[...]` or `[^...]` bracket expression starting at `pat[start]` (which is `[`).
294/// Returns `Some((matched_char, end_index))` where `end_index` is the index after `]`,
295/// or `None` if the bracket expression is malformed.
296fn match_bracket_class(pat: &[char], start: usize, ch: char) -> Option<(bool, usize)> {
297    let mut i = start + 1; // skip the opening `[`
298    if i >= pat.len() {
299        return None;
300    }
301
302    // Check for negation: `[^...]` or `[!...]`
303    let negate = if pat[i] == '^' || pat[i] == '!' {
304        i += 1;
305        true
306    } else {
307        false
308    };
309
310    // A `]` immediately after `[` (or `[^`) is treated as a literal character in the class.
311    let mut found = false;
312    let mut first = true;
313    while i < pat.len() {
314        if pat[i] == ']' && !first {
315            // End of bracket expression.
316            let matched = if negate { !found } else { found };
317            return Some((matched, i + 1));
318        }
319        // Check for range: a-z
320        if i + 2 < pat.len() && pat[i + 1] == '-' && pat[i + 2] != ']' {
321            let lo = pat[i];
322            let hi = pat[i + 2];
323            if ch >= lo && ch <= hi {
324                found = true;
325            }
326            i += 3;
327        } else {
328            if pat[i] == ch {
329                found = true;
330            }
331            i += 1;
332        }
333        first = false;
334    }
335
336    // No closing `]` found — malformed bracket expression.
337    None
338}
339
340fn glob_match_inner(pat: &[char], txt: &[char]) -> bool {
341    let mut pi = 0;
342    let mut ti = 0;
343    let mut star_pi = usize::MAX;
344    let mut star_ti = 0;
345
346    while ti < txt.len() {
347        if pi < pat.len() && pat[pi] == '[' {
348            // Try to match a bracket expression.
349            if let Some((matched, end)) = match_bracket_class(pat, pi, txt[ti]) {
350                if matched {
351                    pi = end;
352                    ti += 1;
353                    continue;
354                }
355                // Not matched — fall through to star backtrack.
356            }
357            // Malformed bracket or no match — try star backtrack.
358            if star_pi != usize::MAX {
359                pi = star_pi + 1;
360                star_ti += 1;
361                ti = star_ti;
362            } else {
363                return false;
364            }
365        } else if pi < pat.len() && (pat[pi] == '?' || pat[pi] == txt[ti]) {
366            pi += 1;
367            ti += 1;
368        } else if pi < pat.len() && pat[pi] == '*' {
369            star_pi = pi;
370            star_ti = ti;
371            pi += 1;
372        } else if star_pi != usize::MAX {
373            pi = star_pi + 1;
374            star_ti += 1;
375            ti = star_ti;
376        } else {
377            return false;
378        }
379    }
380
381    while pi < pat.len() && pat[pi] == '*' {
382        pi += 1;
383    }
384    pi == pat.len()
385}
386
387/// Format a size value for display according to the config.
388pub fn format_size(raw_bytes: u64, config: &DuConfig) -> String {
389    if config.human_readable {
390        human_readable(raw_bytes, 1024)
391    } else if config.si {
392        human_readable(raw_bytes, 1000)
393    } else if config.inodes {
394        raw_bytes.to_string()
395    } else {
396        // Scale by block_size, rounding up.
397        let scaled = (raw_bytes + config.block_size - 1) / config.block_size;
398        scaled.to_string()
399    }
400}
401
402/// Format a byte count in human-readable form (e.g., 1.5K, 23M).
403/// GNU du uses ceiling rounding and always shows one decimal for values < 10.
404fn human_readable(bytes: u64, base: u64) -> String {
405    let suffixes = if base == 1024 {
406        &["", "K", "M", "G", "T", "P", "E"]
407    } else {
408        &["", "k", "M", "G", "T", "P", "E"]
409    };
410
411    if bytes < base {
412        return format!("{}", bytes);
413    }
414
415    let mut value = bytes as f64;
416    let mut idx = 0;
417    while value >= base as f64 && idx + 1 < suffixes.len() {
418        value /= base as f64;
419        idx += 1;
420    }
421
422    if value >= 10.0 {
423        format!("{:.0}{}", value.ceil(), suffixes[idx])
424    } else {
425        // Show one decimal place for values < 10 (GNU keeps the .0).
426        let rounded = (value * 10.0).ceil() / 10.0;
427        if rounded >= 10.0 {
428            format!("{:.0}{}", rounded.ceil(), suffixes[idx])
429        } else {
430            format!("{:.1}{}", rounded, suffixes[idx])
431        }
432    }
433}
434
435/// Format a modification time for display.
436pub fn format_time(epoch_secs: i64, style: &str) -> String {
437    // Convert epoch seconds to a broken-down time.
438    let secs = epoch_secs;
439    let st = match SystemTime::UNIX_EPOCH.checked_add(std::time::Duration::from_secs(secs as u64)) {
440        Some(t) => t,
441        None => return String::from("?"),
442    };
443
444    // Use libc localtime_r for correct timezone handling.
445    let mut tm: libc::tm = unsafe { std::mem::zeroed() };
446    let time_t = secs as libc::time_t;
447    unsafe {
448        libc::localtime_r(&time_t, &mut tm);
449    }
450    // Ignore the SystemTime; we use the libc tm directly.
451    let _ = st;
452
453    let year = tm.tm_year + 1900;
454    let mon = tm.tm_mon + 1;
455    let day = tm.tm_mday;
456    let hour = tm.tm_hour;
457    let min = tm.tm_min;
458    let sec = tm.tm_sec;
459
460    match style {
461        "full-iso" => format!(
462            "{:04}-{:02}-{:02} {:02}:{:02}:{:02}.000000000 +0000",
463            year, mon, day, hour, min, sec
464        ),
465        "iso" => format!("{:04}-{:02}-{:02}", year, mon, day),
466        _ => {
467            // long-iso (default)
468            format!("{:04}-{:02}-{:02} {:02}:{:02}", year, mon, day, hour, min)
469        }
470    }
471}
472
473/// Print a single DuEntry.
474pub fn print_entry<W: Write>(out: &mut W, entry: &DuEntry, config: &DuConfig) -> io::Result<()> {
475    // Apply threshold filtering.
476    if let Some(thresh) = config.threshold {
477        let size_signed = entry.size as i64;
478        if thresh >= 0 && size_signed < thresh {
479            return Ok(());
480        }
481        if thresh < 0 && size_signed > thresh.unsigned_abs() as i64 {
482            return Ok(());
483        }
484    }
485
486    let size_str = format_size(entry.size, config);
487
488    if config.show_time {
489        if let Some(mtime) = entry.mtime {
490            let time_str = format_time(mtime, &config.time_style);
491            write!(out, "{}\t{}\t{}", size_str, time_str, entry.path.display())?;
492        } else {
493            write!(out, "{}\t{}", size_str, entry.path.display())?;
494        }
495    } else {
496        write!(out, "{}\t{}", size_str, entry.path.display())?;
497    }
498
499    if config.null_terminator {
500        out.write_all(b"\0")?;
501    } else {
502        out.write_all(b"\n")?;
503    }
504
505    Ok(())
506}
507
508/// Parse a block size string like "1K", "1M", "1G", etc.
509/// Returns the number of bytes per block.
510pub fn parse_block_size(s: &str) -> Result<u64, String> {
511    let s = s.trim();
512    if s.is_empty() {
513        return Err("empty block size".to_string());
514    }
515
516    let mut num_end = 0;
517    for (i, c) in s.char_indices() {
518        if c.is_ascii_digit() {
519            num_end = i + 1;
520        } else {
521            break;
522        }
523    }
524
525    let (num_str, suffix) = s.split_at(num_end);
526    let base_val: u64 = if num_str.is_empty() {
527        1
528    } else {
529        num_str
530            .parse()
531            .map_err(|_| format!("invalid block size: '{}'", s))?
532    };
533
534    let multiplier = match suffix.to_uppercase().as_str() {
535        "" => 1u64,
536        "B" => 1,
537        "K" | "KB" => 1024,
538        "M" | "MB" => 1024 * 1024,
539        "G" | "GB" => 1024 * 1024 * 1024,
540        "T" | "TB" => 1024u64 * 1024 * 1024 * 1024,
541        "P" | "PB" => 1024u64 * 1024 * 1024 * 1024 * 1024,
542        "KB_SI" => 1000,
543        _ => return Err(format!("invalid suffix in block size: '{}'", s)),
544    };
545
546    Ok(base_val * multiplier)
547}
548
549/// Parse a threshold value. Positive means "exclude entries smaller than SIZE".
550/// Negative means "exclude entries larger than -SIZE".
551/// GNU du rejects `--threshold=-0` and `--threshold=0` is allowed (positive zero is fine,
552/// but negative zero is invalid).
553pub fn parse_threshold(s: &str) -> Result<i64, String> {
554    let s = s.trim();
555    let (negative, rest) = if let Some(stripped) = s.strip_prefix('-') {
556        (true, stripped)
557    } else {
558        (false, s)
559    };
560
561    let val = parse_block_size(rest)? as i64;
562    if negative {
563        if val == 0 {
564            return Err(format!("invalid --threshold argument '-{}'", rest));
565        }
566        Ok(-val)
567    } else {
568        Ok(val)
569    }
570}
571
572/// Read exclude patterns from a file (one per line).
573pub fn read_exclude_file(path: &str) -> io::Result<Vec<String>> {
574    let file = std::fs::File::open(path)?;
575    let reader = io::BufReader::new(file);
576    let mut patterns = Vec::new();
577    for line in reader.lines() {
578        let line = line?;
579        let trimmed = line.trim();
580        if !trimmed.is_empty() {
581            patterns.push(trimmed.to_string());
582        }
583    }
584    Ok(patterns)
585}
586
587/// Format an IO error without the "(os error N)" suffix.
588fn format_io_error(e: &io::Error) -> String {
589    if let Some(raw) = e.raw_os_error() {
590        let os_err = io::Error::from_raw_os_error(raw);
591        let msg = format!("{}", os_err);
592        msg.replace(&format!(" (os error {})", raw), "")
593    } else {
594        format!("{}", e)
595    }
596}