coreutils_rs/hash/core.rs

use std::cell::RefCell;
use std::fs::{self, File};
use std::io::{self, BufRead, Read, Write};
use std::path::Path;

#[cfg(target_os = "linux")]
use std::sync::atomic::{AtomicBool, Ordering};

use md5::Md5;
use sha2::{Digest, Sha256};

/// Supported hash algorithms.
#[derive(Debug, Clone, Copy)]
pub enum HashAlgorithm {
    Sha256,
    Md5,
    Blake2b,
}

impl HashAlgorithm {
    pub fn name(self) -> &'static str {
        match self {
            HashAlgorithm::Sha256 => "SHA256",
            HashAlgorithm::Md5 => "MD5",
            HashAlgorithm::Blake2b => "BLAKE2b",
        }
    }
}

// ── Generic hash helpers ────────────────────────────────────────────

fn hash_digest<D: Digest>(data: &[u8]) -> String {
    hex_encode(&D::digest(data))
}

/// Streaming hash using the thread-local 4MB buffer for good L3 cache behavior.
/// Reusing one buffer avoids a fresh multi-megabyte allocation per call and keeps
/// the working set cache-resident.
fn hash_reader_impl<D: Digest>(mut reader: impl Read) -> io::Result<String> {
    STREAM_BUF.with(|cell| {
        let mut buf = cell.borrow_mut();
        let mut hasher = D::new();
        loop {
            let n = reader.read(&mut buf)?;
            if n == 0 {
                break;
            }
            hasher.update(&buf[..n]);
        }
        Ok(hex_encode(&hasher.finalize()))
    })
}

// ── Public hashing API ──────────────────────────────────────────────

/// Buffer size for streaming hash I/O.
/// 4MB fits in L3 cache for optimal throughput while minimizing syscall count.
/// With fadvise(SEQUENTIAL), the kernel prefetches ahead of our reads.
const HASH_READ_BUF: usize = 4 * 1024 * 1024;

/// Threshold below which read() into the thread-local buffer + single-shot hash
/// is faster than streaming. Avoids per-chunk hasher.update() overhead.
/// (The MMAP_THRESHOLD name is historical; mmap has been replaced by read()+fadvise.)
const MMAP_THRESHOLD: u64 = 1024 * 1024; // 1MB

// Thread-local reusable buffers avoid per-call allocation overhead.
// READ_BUF: for small files (<1MB) — read entire file, single-shot hash.
// STREAM_BUF: for large files & pipes — streaming read + incremental hash.
thread_local! {
    static READ_BUF: RefCell<Vec<u8>> = RefCell::new(Vec::with_capacity(MMAP_THRESHOLD as usize));
    static STREAM_BUF: RefCell<Vec<u8>> = RefCell::new(vec![0u8; HASH_READ_BUF]);
}

/// Compute hash of a byte slice directly (zero-copy fast path).
pub fn hash_bytes(algo: HashAlgorithm, data: &[u8]) -> String {
    match algo {
        HashAlgorithm::Sha256 => hash_digest::<Sha256>(data),
        HashAlgorithm::Md5 => hash_digest::<Md5>(data),
        HashAlgorithm::Blake2b => {
            let hash = blake2b_simd::blake2b(data);
            hex_encode(hash.as_bytes())
        }
    }
}

/// Compute hash of data from a reader, returning hex string.
pub fn hash_reader<R: Read>(algo: HashAlgorithm, reader: R) -> io::Result<String> {
    match algo {
        HashAlgorithm::Sha256 => hash_reader_impl::<Sha256>(reader),
        HashAlgorithm::Md5 => hash_reader_impl::<Md5>(reader),
        HashAlgorithm::Blake2b => blake2b_hash_reader(reader, 64),
    }
}
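
// Usage sketch (illustrative only, not part of the module's API surface): the
// byte-slice and reader entry points should agree on the same input for every
// algorithm. The module name and input data below are arbitrary examples.
#[cfg(test)]
mod hash_api_usage_sketch {
    use super::*;

    #[test]
    fn bytes_and_reader_paths_agree() {
        let data = b"hello world";
        for &algo in &[HashAlgorithm::Sha256, HashAlgorithm::Md5, HashAlgorithm::Blake2b] {
            // Hash the slice directly, then stream the same bytes through a reader.
            let direct = hash_bytes(algo, data);
            let streamed = hash_reader(algo, &data[..]).unwrap();
            assert_eq!(direct, streamed);
        }
        // A SHA-256 digest is 32 bytes, i.e. 64 hex characters.
        assert_eq!(hash_bytes(HashAlgorithm::Sha256, data).len(), 64);
    }
}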

/// Track whether O_NOATIME is supported to avoid repeated failed open() attempts.
/// After the first EPERM, we never try O_NOATIME again (saves one syscall per file).
#[cfg(target_os = "linux")]
static NOATIME_SUPPORTED: AtomicBool = AtomicBool::new(true);

/// Open a file with O_NOATIME on Linux to avoid atime update overhead.
/// Caches whether O_NOATIME works to avoid double-open on every file.
#[cfg(target_os = "linux")]
fn open_noatime(path: &Path) -> io::Result<File> {
    use std::os::unix::fs::OpenOptionsExt;
    if NOATIME_SUPPORTED.load(Ordering::Relaxed) {
        match fs::OpenOptions::new()
            .read(true)
            .custom_flags(libc::O_NOATIME)
            .open(path)
        {
            Ok(f) => return Ok(f),
            Err(ref e) if e.raw_os_error() == Some(libc::EPERM) => {
                // O_NOATIME requires file ownership or CAP_FOWNER — disable globally
                NOATIME_SUPPORTED.store(false, Ordering::Relaxed);
            }
            Err(e) => return Err(e), // Real error, propagate
        }
    }
    File::open(path)
}

#[cfg(not(target_os = "linux"))]
fn open_noatime(path: &Path) -> io::Result<File> {
    File::open(path)
}

/// Hint the kernel for sequential read access. Non-blocking.
#[cfg(target_os = "linux")]
#[inline]
fn fadvise_sequential(file: &File, len: u64) {
    use std::os::unix::io::AsRawFd;
    unsafe {
        libc::posix_fadvise(file.as_raw_fd(), 0, len as i64, libc::POSIX_FADV_SEQUENTIAL);
    }
}

#[cfg(not(target_os = "linux"))]
#[inline]
fn fadvise_sequential(_file: &File, _len: u64) {}

/// Hash a file by path. Single open + fstat to minimize syscalls.
/// Uses read() for small files, streaming read+hash for large files.
/// Replaced mmap with read()+fadvise for better cache behavior:
/// read() keeps data hot in L2/L3 cache, while mmap suffers page table
/// and TLB overhead for sequential single-pass workloads.
pub fn hash_file(algo: HashAlgorithm, path: &Path) -> io::Result<String> {
    // Single open — reuse fd for fstat + read (saves separate stat + open)
    let file = open_noatime(path)?;
    let metadata = file.metadata()?; // fstat on existing fd, cheaper than stat(path)
    let len = metadata.len();
    let is_regular = metadata.file_type().is_file();

    if is_regular && len == 0 {
        return Ok(hash_bytes(algo, &[]));
    }

    if is_regular && len > 0 {
        // Small files: read into thread-local buffer (zero allocation after first call)
        if len < MMAP_THRESHOLD {
            return READ_BUF.with(|cell| {
                let mut buf = cell.borrow_mut();
                buf.clear();
                // Reserve is effectively a no-op: the buffer is created with
                // MMAP_THRESHOLD capacity and len < MMAP_THRESHOLD on this path.
                buf.reserve(len as usize);
                Read::read_to_end(&mut &file, &mut buf)?;
                Ok(hash_bytes(algo, &buf))
            });
        }

        // Large files: streaming read with kernel readahead hint.
        // fadvise(SEQUENTIAL) enables aggressive readahead (2x default).
        fadvise_sequential(&file, len);
        return hash_reader(algo, file);
    }

    // Fallback: streaming read (special files, pipes, etc.) — fd already open
    hash_reader(algo, file)
}
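
// Usage sketch (illustrative only): the small-file path of hash_file should agree
// with hashing the same bytes in memory. The scratch-file name is hypothetical and
// chosen to avoid collisions; any content under 1MB exercises the read_to_end path.
#[cfg(test)]
mod hash_file_usage_sketch {
    use super::*;

    #[test]
    fn file_hash_matches_in_memory_hash() {
        let path = std::env::temp_dir()
            .join(format!("core_hash_file_demo_{}.bin", std::process::id()));
        let data = vec![0xABu8; 4096];
        std::fs::write(&path, &data).unwrap();

        let from_file = hash_file(HashAlgorithm::Sha256, &path).unwrap();
        let from_bytes = hash_bytes(HashAlgorithm::Sha256, &data);
        assert_eq!(from_file, from_bytes);

        let _ = std::fs::remove_file(&path);
    }
}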

/// Hash stdin. Applies a sequential-read fadvise hint when stdin is a regular
/// file (shell redirect); streams the input in either case.
pub fn hash_stdin(algo: HashAlgorithm) -> io::Result<String> {
    let stdin = io::stdin();
    // Hint kernel for sequential access if stdin is a regular file (redirect)
    #[cfg(target_os = "linux")]
    {
        use std::os::unix::io::AsRawFd;
        let fd = stdin.as_raw_fd();
        let mut stat: libc::stat = unsafe { std::mem::zeroed() };
        if unsafe { libc::fstat(fd, &mut stat) } == 0
            && (stat.st_mode & libc::S_IFMT) == libc::S_IFREG
            && stat.st_size > 0
        {
            unsafe {
                libc::posix_fadvise(fd, 0, stat.st_size, libc::POSIX_FADV_SEQUENTIAL);
            }
        }
    }
    // Streaming hash — works for both pipe and file-redirect stdin
    hash_reader(algo, stdin.lock())
}

/// Estimate the total file size for the parallel/sequential decision.
/// Uses a quick heuristic: samples the first file's size and extrapolates.
/// Returns 0 if estimation fails.
pub fn estimate_total_size(paths: &[&Path]) -> u64 {
    if paths.is_empty() {
        return 0;
    }
    // Sample first file to estimate
    if let Ok(meta) = fs::metadata(paths[0]) {
        meta.len().saturating_mul(paths.len() as u64)
    } else {
        0
    }
}

/// Check if parallel hashing is worthwhile for the given file paths.
/// Only uses rayon when files are individually large enough for the hash
/// computation to dominate over rayon overhead (thread pool init + work stealing).
/// For many small files (e.g., 100 × 100KB), sequential is much faster.
pub fn should_use_parallel(paths: &[&Path]) -> bool {
    if paths.len() < 2 {
        return false;
    }
    let total = estimate_total_size(paths);
    let avg = total / paths.len() as u64;
    // Only parallelize when average file size >= 1MB.
    // Below this, rayon overhead exceeds the benefit of parallel hashing.
    avg >= 1024 * 1024
}
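
// Decision sketch (illustrative only): with two tiny scratch files the average
// size falls far below the 1MB threshold, so the heuristic picks the sequential
// path, and a single path never parallelizes. File names here are hypothetical.
#[cfg(test)]
mod parallel_heuristic_sketch {
    use super::*;

    #[test]
    fn small_files_stay_sequential() {
        let dir = std::env::temp_dir();
        let a = dir.join(format!("core_par_demo_a_{}", std::process::id()));
        let b = dir.join(format!("core_par_demo_b_{}", std::process::id()));
        std::fs::write(&a, b"tiny").unwrap();
        std::fs::write(&b, b"tiny").unwrap();

        let paths = [a.as_path(), b.as_path()];
        assert!(!should_use_parallel(&paths));
        assert!(!should_use_parallel(&paths[..1]));

        let _ = std::fs::remove_file(&a);
        let _ = std::fs::remove_file(&b);
    }
}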

/// Issue readahead hints for a list of file paths to warm the page cache.
/// Uses POSIX_FADV_WILLNEED which is non-blocking and batches efficiently.
#[cfg(target_os = "linux")]
pub fn readahead_files(paths: &[&Path]) {
    use std::os::unix::io::AsRawFd;
    for path in paths {
        if let Ok(file) = open_noatime(path) {
            if let Ok(meta) = file.metadata() {
                let len = meta.len();
                if meta.file_type().is_file() && len > 0 {
                    unsafe {
                        libc::posix_fadvise(
                            file.as_raw_fd(),
                            0,
                            len as i64,
                            libc::POSIX_FADV_WILLNEED,
                        );
                    }
                }
            }
        }
    }
}

#[cfg(not(target_os = "linux"))]
pub fn readahead_files(_paths: &[&Path]) {
    // No-op on non-Linux
}

// ── BLAKE2b variable-length functions (using blake2b_simd) ──────────

/// Hash raw data with BLAKE2b variable output length.
/// `output_bytes` is the output size in bytes (e.g., 32 for 256-bit).
pub fn blake2b_hash_data(data: &[u8], output_bytes: usize) -> String {
    let hash = blake2b_simd::Params::new()
        .hash_length(output_bytes)
        .hash(data);
    hex_encode(hash.as_bytes())
}

/// Hash a reader with BLAKE2b variable output length.
/// Uses thread-local 4MB buffer for cache-friendly streaming.
pub fn blake2b_hash_reader<R: Read>(mut reader: R, output_bytes: usize) -> io::Result<String> {
    STREAM_BUF.with(|cell| {
        let mut buf = cell.borrow_mut();
        let mut state = blake2b_simd::Params::new()
            .hash_length(output_bytes)
            .to_state();
        loop {
            let n = reader.read(&mut buf)?;
            if n == 0 {
                break;
            }
            state.update(&buf[..n]);
        }
        Ok(hex_encode(state.finalize().as_bytes()))
    })
}
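
// Usage sketch (illustrative only): output_bytes controls the digest size, and the
// one-shot and streaming BLAKE2b paths should agree on the same input.
#[cfg(test)]
mod blake2b_usage_sketch {
    use super::*;

    #[test]
    fn variable_length_output_and_streaming_agree() {
        // 32 bytes of output -> 64 hex characters; 64 bytes -> 128.
        assert_eq!(blake2b_hash_data(b"abc", 32).len(), 64);
        assert_eq!(blake2b_hash_data(b"abc", 64).len(), 128);
        assert_eq!(
            blake2b_hash_reader(&b"abc"[..], 32).unwrap(),
            blake2b_hash_data(b"abc", 32)
        );
    }
}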

/// Hash a file with BLAKE2b variable output length. Single open + fstat.
/// Uses read() for small files, streaming read+hash for large.
pub fn blake2b_hash_file(path: &Path, output_bytes: usize) -> io::Result<String> {
    // Single open — reuse fd for fstat + read
    let file = open_noatime(path)?;
    let metadata = file.metadata()?;
    let len = metadata.len();
    let is_regular = metadata.file_type().is_file();

    if is_regular && len == 0 {
        return Ok(blake2b_hash_data(&[], output_bytes));
    }

    if is_regular && len > 0 {
        // Small files: read into thread-local buffer (zero allocation after first call)
        if len < MMAP_THRESHOLD {
            return READ_BUF.with(|cell| {
                let mut buf = cell.borrow_mut();
                buf.clear();
                buf.reserve(len as usize);
                Read::read_to_end(&mut &file, &mut buf)?;
                Ok(blake2b_hash_data(&buf, output_bytes))
            });
        }

        // Large files: streaming read with kernel readahead hint
        fadvise_sequential(&file, len);
        return blake2b_hash_reader(file, output_bytes);
    }

    // Fallback: streaming read — fd already open
    blake2b_hash_reader(file, output_bytes)
}

/// Hash stdin with BLAKE2b variable output length.
/// Tries fadvise if stdin is a regular file (shell redirect), then streams.
pub fn blake2b_hash_stdin(output_bytes: usize) -> io::Result<String> {
    let stdin = io::stdin();
    #[cfg(target_os = "linux")]
    {
        use std::os::unix::io::AsRawFd;
        let fd = stdin.as_raw_fd();
        let mut stat: libc::stat = unsafe { std::mem::zeroed() };
        if unsafe { libc::fstat(fd, &mut stat) } == 0
            && (stat.st_mode & libc::S_IFMT) == libc::S_IFREG
            && stat.st_size > 0
        {
            unsafe {
                libc::posix_fadvise(fd, 0, stat.st_size, libc::POSIX_FADV_SEQUENTIAL);
            }
        }
    }
    blake2b_hash_reader(stdin.lock(), output_bytes)
}

/// Print hash result in GNU format: "hash  filename\n"
pub fn print_hash(
    out: &mut impl Write,
    hash: &str,
    filename: &str,
    binary: bool,
) -> io::Result<()> {
    let mode_char = if binary { '*' } else { ' ' };
    writeln!(out, "{} {}{}", hash, mode_char, filename)
}

/// Print hash in GNU format with NUL terminator instead of newline.
pub fn print_hash_zero(
    out: &mut impl Write,
    hash: &str,
    filename: &str,
    binary: bool,
) -> io::Result<()> {
    let mode_char = if binary { '*' } else { ' ' };
    write!(out, "{} {}{}\0", hash, mode_char, filename)
}

/// Print hash result in BSD tag format: "ALGO (filename) = hash\n"
pub fn print_hash_tag(
    out: &mut impl Write,
    algo: HashAlgorithm,
    hash: &str,
    filename: &str,
) -> io::Result<()> {
    writeln!(out, "{} ({}) = {}", algo.name(), filename, hash)
}

/// Print hash in BSD tag format with NUL terminator.
pub fn print_hash_tag_zero(
    out: &mut impl Write,
    algo: HashAlgorithm,
    hash: &str,
    filename: &str,
) -> io::Result<()> {
    write!(out, "{} ({}) = {}\0", algo.name(), filename, hash)
}

/// Print hash in BSD tag format with BLAKE2b length info:
/// "BLAKE2b (filename) = hash" for 512-bit, or
/// "BLAKE2b-256 (filename) = hash" for other lengths.
pub fn print_hash_tag_b2sum(
    out: &mut impl Write,
    hash: &str,
    filename: &str,
    bits: usize,
) -> io::Result<()> {
    if bits == 512 {
        writeln!(out, "BLAKE2b ({}) = {}", filename, hash)
    } else {
        writeln!(out, "BLAKE2b-{} ({}) = {}", bits, filename, hash)
    }
}

/// Print hash in BSD tag format with BLAKE2b length info and NUL terminator.
pub fn print_hash_tag_b2sum_zero(
    out: &mut impl Write,
    hash: &str,
    filename: &str,
    bits: usize,
) -> io::Result<()> {
    if bits == 512 {
        write!(out, "BLAKE2b ({}) = {}\0", filename, hash)
    } else {
        write!(out, "BLAKE2b-{} ({}) = {}\0", bits, filename, hash)
    }
}
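
// Output-shape sketch (illustrative only): any Vec<u8> works as the writer, and the
// hash/filename values are arbitrary. Text mode uses two spaces, binary mode " *",
// and the BSD tag form parenthesizes the filename.
#[cfg(test)]
mod print_format_sketch {
    use super::*;

    #[test]
    fn gnu_and_bsd_output_shapes() {
        let mut out = Vec::new();
        print_hash(&mut out, "abc123", "file.txt", false).unwrap();
        assert_eq!(&out[..], &b"abc123  file.txt\n"[..]);

        out.clear();
        print_hash(&mut out, "abc123", "file.bin", true).unwrap();
        assert_eq!(&out[..], &b"abc123 *file.bin\n"[..]);

        out.clear();
        print_hash_tag(&mut out, HashAlgorithm::Sha256, "abc123", "file.txt").unwrap();
        assert_eq!(&out[..], &b"SHA256 (file.txt) = abc123\n"[..]);
    }
}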

/// Options for check mode.
pub struct CheckOptions {
    pub quiet: bool,
    pub status_only: bool,
    pub strict: bool,
    pub warn: bool,
    pub ignore_missing: bool,
    /// Prefix for per-line format warnings, e.g., "fmd5sum: checksums.txt".
    /// When non-empty, warnings use GNU format: "{prefix}: {line}: message".
    /// When empty, uses generic format: "line {line}: message".
    pub warn_prefix: String,
}

/// Result of check mode verification.
pub struct CheckResult {
    pub ok: usize,
    pub mismatches: usize,
    pub format_errors: usize,
    pub read_errors: usize,
    /// Number of files skipped because they were missing and --ignore-missing was set.
    pub ignored_missing: usize,
}

/// Verify checksums from a check file.
/// Each line should be "hash  filename" or "hash *filename" or "ALGO (filename) = hash".
pub fn check_file<R: BufRead>(
    algo: HashAlgorithm,
    reader: R,
    opts: &CheckOptions,
    out: &mut impl Write,
    err_out: &mut impl Write,
) -> io::Result<CheckResult> {
    let quiet = opts.quiet;
    let status_only = opts.status_only;
    let warn = opts.warn;
    let ignore_missing = opts.ignore_missing;
    let mut ok_count = 0;
    let mut mismatch_count = 0;
    let mut format_errors = 0;
    let mut read_errors = 0;
    let mut ignored_missing_count = 0;
    let mut line_num = 0;

    for line_result in reader.lines() {
        line_num += 1;
        let line = line_result?;
        let line = line.trim_end();

        if line.is_empty() {
            continue;
        }

        // Parse "hash  filename" or "hash *filename" or "ALGO (file) = hash"
        let (expected_hash, filename) = match parse_check_line(line) {
            Some(v) => v,
            None => {
                format_errors += 1;
                if warn {
                    out.flush()?;
                    if opts.warn_prefix.is_empty() {
                        writeln!(
                            err_out,
                            "line {}: improperly formatted {} checksum line",
                            line_num,
                            algo.name()
                        )?;
                    } else {
                        writeln!(
                            err_out,
                            "{}: {}: improperly formatted {} checksum line",
                            opts.warn_prefix,
                            line_num,
                            algo.name()
                        )?;
                    }
                }
                continue;
            }
        };

        // Compute actual hash
        let actual = match hash_file(algo, Path::new(filename)) {
            Ok(h) => h,
            Err(e) => {
                if ignore_missing && e.kind() == io::ErrorKind::NotFound {
                    ignored_missing_count += 1;
                    continue;
                }
                read_errors += 1;
                if !status_only {
                    out.flush()?;
                    writeln!(err_out, "{}: {}", filename, e)?;
                    writeln!(out, "{}: FAILED open or read", filename)?;
                }
                continue;
            }
        };

        if actual.eq_ignore_ascii_case(expected_hash) {
            ok_count += 1;
            if !quiet && !status_only {
                writeln!(out, "{}: OK", filename)?;
            }
        } else {
            mismatch_count += 1;
            if !status_only {
                writeln!(out, "{}: FAILED", filename)?;
            }
        }
    }

    Ok(CheckResult {
        ok: ok_count,
        mismatches: mismatch_count,
        format_errors,
        read_errors,
        ignored_missing: ignored_missing_count,
    })
}
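
// End-to-end sketch (illustrative only): generate a checksum line for a scratch
// file in the same "hash  filename" shape print_hash emits, then verify it with
// check_file. The file name and option values shown are arbitrary examples.
#[cfg(test)]
mod check_file_usage_sketch {
    use super::*;
    use std::io::Cursor;

    #[test]
    fn verifies_a_generated_checksum_list() {
        let path = std::env::temp_dir()
            .join(format!("core_check_demo_{}.txt", std::process::id()));
        std::fs::write(&path, b"hello").unwrap();

        let expected = hash_bytes(HashAlgorithm::Sha256, b"hello");
        let list = format!("{}  {}\n", expected, path.display());

        let opts = CheckOptions {
            quiet: false,
            status_only: true,
            strict: false,
            warn: false,
            ignore_missing: false,
            warn_prefix: String::new(),
        };
        let mut out = Vec::new();
        let mut err = Vec::new();
        let result = check_file(
            HashAlgorithm::Sha256,
            Cursor::new(list),
            &opts,
            &mut out,
            &mut err,
        )
        .unwrap();
        assert_eq!(result.ok, 1);
        assert_eq!(result.mismatches, 0);

        let _ = std::fs::remove_file(&path);
    }
}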

/// Parse a checksum line in any supported format.
pub fn parse_check_line(line: &str) -> Option<(&str, &str)> {
    // Try BSD tag format: "ALGO (filename) = hash"
    let rest = line
        .strip_prefix("MD5 (")
        .or_else(|| line.strip_prefix("SHA256 ("))
        .or_else(|| line.strip_prefix("BLAKE2b ("))
        .or_else(|| {
            // Handle BLAKE2b-NNN (filename) = hash
            if line.starts_with("BLAKE2b-") {
                let after = &line["BLAKE2b-".len()..];
                if let Some(sp) = after.find(" (") {
                    if after[..sp].bytes().all(|b| b.is_ascii_digit()) {
                        return Some(&after[sp + 2..]);
                    }
                }
            }
            None
        });
    if let Some(rest) = rest {
        if let Some(paren_idx) = rest.find(") = ") {
            let filename = &rest[..paren_idx];
            let hash = &rest[paren_idx + 4..];
            return Some((hash, filename));
        }
    }

    // Handle backslash-escaped lines (leading '\')
    let line = line.strip_prefix('\\').unwrap_or(line);

    // Standard format: "hash  filename"
    if let Some(idx) = line.find("  ") {
        let hash = &line[..idx];
        let rest = &line[idx + 2..];
        return Some((hash, rest));
    }
    // Binary mode: "hash *filename"
    if let Some(idx) = line.find(" *") {
        let hash = &line[..idx];
        let rest = &line[idx + 2..];
        return Some((hash, rest));
    }
    None
}

/// Parse a BSD-style tag line: "ALGO (filename) = hash"
/// Returns (expected_hash, filename, optional_bits).
/// `bits` is the hash length parsed from the algo name (e.g., BLAKE2b-256 -> Some(256)).
pub fn parse_check_line_tag(line: &str) -> Option<(&str, &str, Option<usize>)> {
    let paren_start = line.find(" (")?;
    let algo_part = &line[..paren_start];
    let rest = &line[paren_start + 2..];
    let paren_end = rest.find(") = ")?;
    let filename = &rest[..paren_end];
    let hash = &rest[paren_end + 4..];

    // Parse optional bit length from algo name (e.g., "BLAKE2b-256" -> Some(256))
    let bits = if let Some(dash_pos) = algo_part.rfind('-') {
        algo_part[dash_pos + 1..].parse::<usize>().ok()
    } else {
        None
    };

    Some((hash, filename, bits))
}
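
// Parsing sketch (illustrative only): the hash and filename values below are
// arbitrary, showing the three accepted line shapes and the bit-length suffix
// reported by the tag parser.
#[cfg(test)]
mod parse_check_line_sketch {
    use super::*;

    #[test]
    fn recognizes_gnu_and_bsd_formats() {
        // GNU text mode: two spaces between hash and filename.
        assert_eq!(
            parse_check_line("abc123  file.txt"),
            Some(("abc123", "file.txt"))
        );
        // GNU binary mode: " *" separator.
        assert_eq!(
            parse_check_line("abc123 *file.bin"),
            Some(("abc123", "file.bin"))
        );
        // BSD tag format.
        assert_eq!(
            parse_check_line("MD5 (file.txt) = abc123"),
            Some(("abc123", "file.txt"))
        );
        assert_eq!(parse_check_line("not a checksum line"), None);

        // The tag parser also reports an optional bit-length suffix.
        assert_eq!(
            parse_check_line_tag("BLAKE2b-256 (file.txt) = abc123"),
            Some(("abc123", "file.txt", Some(256)))
        );
        assert_eq!(
            parse_check_line_tag("SHA256 (file.txt) = abc123"),
            Some(("abc123", "file.txt", None))
        );
    }
}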

/// Compile-time generated 2-byte hex pair lookup table.
/// Each byte maps directly to its 2-char hex representation — single lookup per byte.
const fn generate_hex_table() -> [[u8; 2]; 256] {
    let hex = b"0123456789abcdef";
    let mut table = [[0u8; 2]; 256];
    let mut i = 0;
    while i < 256 {
        table[i] = [hex[i >> 4], hex[i & 0xf]];
        i += 1;
    }
    table
}

const HEX_TABLE: [[u8; 2]; 256] = generate_hex_table();

/// Fast hex encoding using 2-byte pair lookup table — one lookup per input byte.
/// Uses String directly instead of Vec<u8> to avoid the from_utf8 conversion overhead.
pub(crate) fn hex_encode(bytes: &[u8]) -> String {
    let len = bytes.len() * 2;
    let mut hex = String::with_capacity(len);
    // SAFETY: We write exactly `len` valid ASCII hex bytes into the String's buffer.
    unsafe {
        let buf = hex.as_mut_vec();
        buf.set_len(len);
        let ptr = buf.as_mut_ptr();
        for (i, &b) in bytes.iter().enumerate() {
            let pair = *HEX_TABLE.get_unchecked(b as usize);
            *ptr.add(i * 2) = pair[0];
            *ptr.add(i * 2 + 1) = pair[1];
        }
    }
    hex
}
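
// Encoding sketch (illustrative only): a couple of fixed byte patterns and their
// expected lowercase hex pairs, exercising both nibble halves of the table.
#[cfg(test)]
mod hex_encode_sketch {
    use super::*;

    #[test]
    fn maps_each_byte_to_two_hex_chars() {
        assert_eq!(hex_encode(&[]), "");
        assert_eq!(hex_encode(&[0x00, 0x0f, 0x10, 0xff]), "000f10ff");
    }
}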