Skip to main content

coreutils_rs/hash/
core.rs

1use std::cell::RefCell;
2use std::fs::File;
3use std::io::{self, BufRead, Read, Write};
4use std::path::Path;
5
6#[cfg(target_os = "linux")]
7use std::sync::atomic::{AtomicBool, Ordering};
8
9use digest::Digest;
10use md5::Md5;
11use memmap2::MmapOptions;
12
/// Hash algorithms this module can compute.
#[derive(Debug, Clone, Copy)]
pub enum HashAlgorithm {
    /// SHA-256.
    Sha256,
    /// MD5.
    Md5,
    /// BLAKE2b.
    Blake2b,
}

impl HashAlgorithm {
    /// Returns the algorithm's display name (`"SHA256"`, `"MD5"` or
    /// `"BLAKE2b"`), as used in tag-format output and warning messages.
    pub fn name(self) -> &'static str {
        match self {
            Self::Md5 => "MD5",
            Self::Blake2b => "BLAKE2b",
            Self::Sha256 => "SHA256",
        }
    }
}
30
31// ── Generic hash helpers ────────────────────────────────────────────
32
33fn hash_digest<D: Digest>(data: &[u8]) -> String {
34    hex_encode(&D::digest(data))
35}
36
37/// Streaming hash using thread-local 1MB buffer for optimal L2 cache behavior.
38/// 1MB fits in L2 cache on most CPUs, keeping data hot during hash update.
39/// Uses read_full to ensure each update() gets a full buffer, minimizing
40/// per-chunk hasher overhead and maximizing SIMD-friendly aligned updates.
41fn hash_reader_impl<D: Digest>(mut reader: impl Read) -> io::Result<String> {
42    STREAM_BUF.with(|cell| {
43        let mut buf = cell.borrow_mut();
44        let mut hasher = D::new();
45        loop {
46            let n = read_full(&mut reader, &mut buf)?;
47            if n == 0 {
48                break;
49            }
50            hasher.update(&buf[..n]);
51        }
52        Ok(hex_encode(&hasher.finalize()))
53    })
54}
55
56// ── Public hashing API ──────────────────────────────────────────────
57
/// Size in bytes (4 MiB) of the scratch buffer used for streaming hash I/O.
///
/// Only the streaming paths (stdin, pipes, special files) read through a
/// buffer of this size; `hash_file` memory-maps non-empty regular files instead.
/// 4MB gives fewer syscalls while still fitting in L3 cache.
/// With fadvise(SEQUENTIAL) the kernel prefetches ahead, so the next
/// chunk is already in page cache by the time we finish hashing the current one.
const HASH_READ_BUF: usize = 4 * 1024 * 1024;
63
// Thread-local reusable buffer for the streaming (non-mmap) hash paths.
// Created once per thread with HASH_READ_BUF zeroed bytes and reused by every
// streaming hash call on that thread.
// Users take it with `borrow_mut()`, so a reader that itself hashes through
// this buffer on the same thread would trigger a `RefCell` double-borrow panic.
thread_local! {
    static STREAM_BUF: RefCell<Vec<u8>> = RefCell::new(vec![0u8; HASH_READ_BUF]);
}
69
70// ── SHA-256: ring on non-Apple, sha2 fallback on Apple ───────────────
71
72/// Single-shot SHA-256 using ring's BoringSSL assembly (Linux/Windows).
73#[cfg(not(target_vendor = "apple"))]
74fn sha256_bytes(data: &[u8]) -> String {
75    hex_encode(ring::digest::digest(&ring::digest::SHA256, data).as_ref())
76}
77
/// Single-shot SHA-256 using sha2 crate (macOS fallback).
/// Compiled only for Apple targets; delegates to the generic `hash_digest`
/// helper and returns the hex-encoded digest of `data`.
#[cfg(target_vendor = "apple")]
fn sha256_bytes(data: &[u8]) -> String {
    hash_digest::<sha2::Sha256>(data)
}
83
84/// Streaming SHA-256 using ring's BoringSSL assembly (Linux/Windows).
85#[cfg(not(target_vendor = "apple"))]
86fn sha256_reader(mut reader: impl Read) -> io::Result<String> {
87    STREAM_BUF.with(|cell| {
88        let mut buf = cell.borrow_mut();
89        let mut ctx = ring::digest::Context::new(&ring::digest::SHA256);
90        loop {
91            let n = read_full(&mut reader, &mut buf)?;
92            if n == 0 {
93                break;
94            }
95            ctx.update(&buf[..n]);
96        }
97        Ok(hex_encode(ctx.finish().as_ref()))
98    })
99}
100
/// Streaming SHA-256 using sha2 crate (macOS fallback).
/// Compiled only for Apple targets; delegates to the generic
/// `hash_reader_impl` and returns the hex-encoded digest or the reader's I/O error.
#[cfg(target_vendor = "apple")]
fn sha256_reader(reader: impl Read) -> io::Result<String> {
    hash_reader_impl::<sha2::Sha256>(reader)
}
106
107/// Compute hash of a byte slice directly (zero-copy fast path).
108pub fn hash_bytes(algo: HashAlgorithm, data: &[u8]) -> String {
109    match algo {
110        HashAlgorithm::Sha256 => sha256_bytes(data),
111        HashAlgorithm::Md5 => hash_digest::<Md5>(data),
112        HashAlgorithm::Blake2b => {
113            let hash = blake2b_simd::blake2b(data);
114            hex_encode(hash.as_bytes())
115        }
116    }
117}
118
119/// Compute hash of data from a reader, returning hex string.
120pub fn hash_reader<R: Read>(algo: HashAlgorithm, reader: R) -> io::Result<String> {
121    match algo {
122        HashAlgorithm::Sha256 => sha256_reader(reader),
123        HashAlgorithm::Md5 => hash_reader_impl::<Md5>(reader),
124        HashAlgorithm::Blake2b => blake2b_hash_reader(reader, 64),
125    }
126}
127
/// Track whether O_NOATIME is supported to avoid repeated failed open() attempts.
/// After the first EPERM, we never try O_NOATIME again (saves one syscall per file).
/// Starts `true`; `open_noatime` flips it to `false` (process-wide) the first
/// time an `O_NOATIME` open is rejected with `EPERM`.
#[cfg(target_os = "linux")]
static NOATIME_SUPPORTED: AtomicBool = AtomicBool::new(true);
132
133/// Open a file with O_NOATIME on Linux to avoid atime update overhead.
134/// Caches whether O_NOATIME works to avoid double-open on every file.
135#[cfg(target_os = "linux")]
136fn open_noatime(path: &Path) -> io::Result<File> {
137    use std::os::unix::fs::OpenOptionsExt;
138    if NOATIME_SUPPORTED.load(Ordering::Relaxed) {
139        match std::fs::OpenOptions::new()
140            .read(true)
141            .custom_flags(libc::O_NOATIME)
142            .open(path)
143        {
144            Ok(f) => return Ok(f),
145            Err(ref e) if e.raw_os_error() == Some(libc::EPERM) => {
146                // O_NOATIME requires file ownership or CAP_FOWNER — disable globally
147                NOATIME_SUPPORTED.store(false, Ordering::Relaxed);
148            }
149            Err(e) => return Err(e), // Real error, propagate
150        }
151    }
152    File::open(path)
153}
154
/// Non-Linux fallback: `O_NOATIME` is not used on these targets, so this is a
/// plain read-only `File::open`.
#[cfg(not(target_os = "linux"))]
fn open_noatime(path: &Path) -> io::Result<File> {
    File::open(path)
}
159
160/// Advise kernel for optimal mmap access: sequential readahead.
161/// MADV_SEQUENTIAL enables aggressive readahead (2x default window).
162#[cfg(target_os = "linux")]
163#[inline]
164fn mmap_advise(mmap: &memmap2::Mmap) {
165    unsafe {
166        let ptr = mmap.as_ptr() as *mut libc::c_void;
167        let len = mmap.len();
168        libc::madvise(ptr, len, libc::MADV_SEQUENTIAL);
169    }
170}
171
/// Non-Linux fallback: no access-pattern hint is issued for the mapping.
#[cfg(not(target_os = "linux"))]
#[inline]
fn mmap_advise(_mmap: &memmap2::Mmap) {}
175
176/// Hash a file by path. Single open + fstat to minimize syscalls.
177/// Uses zero-copy mmap for regular files: the hash function reads directly
178/// from the page cache without any kernel→user memcpy or read() syscalls.
179/// MAP_POPULATE prefaults all pages before hashing starts.
180/// MADV_HUGEPAGE uses 2MB pages to reduce TLB misses by ~500x.
181pub fn hash_file(algo: HashAlgorithm, path: &Path) -> io::Result<String> {
182    // Single open — reuse fd for fstat + mmap (saves separate stat + open)
183    let file = open_noatime(path)?;
184    let metadata = file.metadata()?; // fstat on existing fd, cheaper than stat(path)
185    let len = metadata.len();
186    let is_regular = metadata.file_type().is_file();
187
188    if is_regular && len == 0 {
189        return Ok(hash_bytes(algo, &[]));
190    }
191
192    if is_regular && len > 0 {
193        // Zero-copy mmap: hash function reads directly from page cache.
194        // No read() syscalls, no kernel→user memcpy.
195        let mmap = unsafe { MmapOptions::new().map(&file)? };
196        mmap_advise(&mmap);
197        return Ok(hash_bytes(algo, &mmap));
198    }
199
200    // Fallback: streaming read (special files, pipes, etc.) — fd already open
201    hash_reader(algo, file)
202}
203
204/// Hash stdin. Uses fadvise for file redirects, streaming for pipes.
205pub fn hash_stdin(algo: HashAlgorithm) -> io::Result<String> {
206    let stdin = io::stdin();
207    // Hint kernel for sequential access if stdin is a regular file (redirect)
208    #[cfg(target_os = "linux")]
209    {
210        use std::os::unix::io::AsRawFd;
211        let fd = stdin.as_raw_fd();
212        let mut stat: libc::stat = unsafe { std::mem::zeroed() };
213        if unsafe { libc::fstat(fd, &mut stat) } == 0
214            && (stat.st_mode & libc::S_IFMT) == libc::S_IFREG
215            && stat.st_size > 0
216        {
217            unsafe {
218                libc::posix_fadvise(fd, 0, stat.st_size, libc::POSIX_FADV_SEQUENTIAL);
219            }
220        }
221    }
222    // Streaming hash — works for both pipe and file-redirect stdin
223    hash_reader(algo, stdin.lock())
224}
225
/// Decides whether hashing `paths` in parallel is worthwhile.
///
/// Returns `true` as soon as there are at least two paths: each file is
/// hashed independently, so the work splits cleanly across threads.
pub fn should_use_parallel(paths: &[&Path]) -> bool {
    matches!(paths, [_, _, ..])
}
234
235/// Issue readahead hints for a list of file paths to warm the page cache.
236/// Uses POSIX_FADV_WILLNEED which is non-blocking and batches efficiently.
237#[cfg(target_os = "linux")]
238pub fn readahead_files(paths: &[&Path]) {
239    use std::os::unix::io::AsRawFd;
240    for path in paths {
241        if let Ok(file) = open_noatime(path) {
242            if let Ok(meta) = file.metadata() {
243                let len = meta.len();
244                if meta.file_type().is_file() && len > 0 {
245                    unsafe {
246                        libc::posix_fadvise(
247                            file.as_raw_fd(),
248                            0,
249                            len as i64,
250                            libc::POSIX_FADV_WILLNEED,
251                        );
252                    }
253                }
254            }
255        }
256    }
257}
258
/// Non-Linux fallback for the readahead hint: the paths are ignored and
/// nothing is done.
#[cfg(not(target_os = "linux"))]
pub fn readahead_files(_paths: &[&Path]) {
    // No-op on non-Linux
}
263
264// --- BLAKE2b variable-length functions (using blake2b_simd) ---
265
266/// Hash raw data with BLAKE2b variable output length.
267/// `output_bytes` is the output size in bytes (e.g., 32 for 256-bit).
268pub fn blake2b_hash_data(data: &[u8], output_bytes: usize) -> String {
269    let hash = blake2b_simd::Params::new()
270        .hash_length(output_bytes)
271        .hash(data);
272    hex_encode(hash.as_bytes())
273}
274
275/// Hash a reader with BLAKE2b variable output length.
276/// Uses thread-local 1MB buffer for cache-friendly streaming.
277pub fn blake2b_hash_reader<R: Read>(mut reader: R, output_bytes: usize) -> io::Result<String> {
278    STREAM_BUF.with(|cell| {
279        let mut buf = cell.borrow_mut();
280        let mut state = blake2b_simd::Params::new()
281            .hash_length(output_bytes)
282            .to_state();
283        loop {
284            let n = read_full(&mut reader, &mut buf)?;
285            if n == 0 {
286                break;
287            }
288            state.update(&buf[..n]);
289        }
290        Ok(hex_encode(state.finalize().as_bytes()))
291    })
292}
293
294/// Hash a file with BLAKE2b variable output length. Single open + fstat.
295/// Uses zero-copy mmap for regular files, streaming for pipes/special files.
296pub fn blake2b_hash_file(path: &Path, output_bytes: usize) -> io::Result<String> {
297    // Single open — reuse fd for fstat + mmap
298    let file = open_noatime(path)?;
299    let metadata = file.metadata()?;
300    let len = metadata.len();
301    let is_regular = metadata.file_type().is_file();
302
303    if is_regular && len == 0 {
304        return Ok(blake2b_hash_data(&[], output_bytes));
305    }
306
307    if is_regular && len > 0 {
308        // Zero-copy mmap: hash function reads directly from page cache.
309        let mmap = unsafe { MmapOptions::new().map(&file)? };
310        mmap_advise(&mmap);
311        return Ok(blake2b_hash_data(&mmap, output_bytes));
312    }
313
314    // Fallback: streaming read — fd already open
315    blake2b_hash_reader(file, output_bytes)
316}
317
318/// Hash stdin with BLAKE2b variable output length.
319/// Tries fadvise if stdin is a regular file (shell redirect), then streams.
320pub fn blake2b_hash_stdin(output_bytes: usize) -> io::Result<String> {
321    let stdin = io::stdin();
322    #[cfg(target_os = "linux")]
323    {
324        use std::os::unix::io::AsRawFd;
325        let fd = stdin.as_raw_fd();
326        let mut stat: libc::stat = unsafe { std::mem::zeroed() };
327        if unsafe { libc::fstat(fd, &mut stat) } == 0
328            && (stat.st_mode & libc::S_IFMT) == libc::S_IFREG
329            && stat.st_size > 0
330        {
331            unsafe {
332                libc::posix_fadvise(fd, 0, stat.st_size, libc::POSIX_FADV_SEQUENTIAL);
333            }
334        }
335    }
336    blake2b_hash_reader(stdin.lock(), output_bytes)
337}
338
/// Writes one GNU-format result line: the hash, a space, a mode marker
/// (`*` for binary, a space for text), the filename, then a newline.
///
/// # Errors
/// Returns any error from writing to `out`.
pub fn print_hash(
    out: &mut impl Write,
    hash: &str,
    filename: &str,
    binary: bool,
) -> io::Result<()> {
    let marker = match binary {
        true => '*',
        false => ' ',
    };
    writeln!(out, "{} {}{}", hash, marker, filename)
}
349
/// Writes one GNU-format result record terminated by NUL instead of a
/// newline: the hash, a space, a mode marker (`*` for binary, a space for
/// text), the filename, then `\0`.
///
/// # Errors
/// Returns any error from writing to `out`.
pub fn print_hash_zero(
    out: &mut impl Write,
    hash: &str,
    filename: &str,
    binary: bool,
) -> io::Result<()> {
    let marker = match binary {
        true => '*',
        false => ' ',
    };
    write!(out, "{} {}{}\0", hash, marker, filename)
}
360
361/// Print hash result in BSD tag format: "ALGO (filename) = hash\n"
362pub fn print_hash_tag(
363    out: &mut impl Write,
364    algo: HashAlgorithm,
365    hash: &str,
366    filename: &str,
367) -> io::Result<()> {
368    writeln!(out, "{} ({}) = {}", algo.name(), filename, hash)
369}
370
371/// Print hash in BSD tag format with NUL terminator.
372pub fn print_hash_tag_zero(
373    out: &mut impl Write,
374    algo: HashAlgorithm,
375    hash: &str,
376    filename: &str,
377) -> io::Result<()> {
378    write!(out, "{} ({}) = {}\0", algo.name(), filename, hash)
379}
380
/// Writes one BSD tag-format line for a BLAKE2b digest of `bits` bits.
///
/// The full 512-bit digest is tagged plain `BLAKE2b`; any other length is
/// tagged `BLAKE2b-<bits>`, e.g. `BLAKE2b-256 (filename) = hash`.
///
/// # Errors
/// Returns any error from writing to `out`.
pub fn print_hash_tag_b2sum(
    out: &mut impl Write,
    hash: &str,
    filename: &str,
    bits: usize,
) -> io::Result<()> {
    match bits {
        512 => writeln!(out, "BLAKE2b ({}) = {}", filename, hash),
        other => writeln!(out, "BLAKE2b-{} ({}) = {}", other, filename, hash),
    }
}
396
/// Writes one NUL-terminated BSD tag-format record for a BLAKE2b digest of
/// `bits` bits.
///
/// The full 512-bit digest is tagged plain `BLAKE2b`; any other length is
/// tagged `BLAKE2b-<bits>`.
///
/// # Errors
/// Returns any error from writing to `out`.
pub fn print_hash_tag_b2sum_zero(
    out: &mut impl Write,
    hash: &str,
    filename: &str,
    bits: usize,
) -> io::Result<()> {
    match bits {
        512 => write!(out, "BLAKE2b ({}) = {}\0", filename, hash),
        other => write!(out, "BLAKE2b-{} ({}) = {}\0", other, filename, hash),
    }
}
410
/// Options for check mode.
pub struct CheckOptions {
    /// Suppress the per-file "OK" lines; failures are still reported.
    pub quiet: bool,
    /// Suppress all per-file result lines and read-error messages.
    pub status_only: bool,
    /// Not read by `check_file` itself.
    /// NOTE(review): presumably used by the caller to treat format errors as
    /// failures when choosing the exit status — confirm.
    pub strict: bool,
    /// Emit a warning on `err_out` for each improperly formatted line.
    pub warn: bool,
    /// Skip (and count separately) files that fail to open with `NotFound`.
    pub ignore_missing: bool,
    /// Prefix for per-line format warnings, e.g., "fmd5sum: checksums.txt".
    /// When non-empty, warnings use GNU format: "{prefix}: {line}: message".
    /// When empty, uses generic format: "line {line}: message".
    pub warn_prefix: String,
}
423
/// Result of check mode verification.
pub struct CheckResult {
    /// Number of files whose computed hash matched the expected one.
    pub ok: usize,
    /// Number of files whose computed hash differed from the expected one.
    pub mismatches: usize,
    /// Number of lines that could not be parsed as a checksum line.
    pub format_errors: usize,
    /// Number of files that could not be opened or read.
    pub read_errors: usize,
    /// Number of files skipped because they were missing and --ignore-missing was set.
    pub ignored_missing: usize,
}
433
434/// Verify checksums from a check file.
435/// Each line should be "hash  filename" or "hash *filename" or "ALGO (filename) = hash".
436pub fn check_file<R: BufRead>(
437    algo: HashAlgorithm,
438    reader: R,
439    opts: &CheckOptions,
440    out: &mut impl Write,
441    err_out: &mut impl Write,
442) -> io::Result<CheckResult> {
443    let quiet = opts.quiet;
444    let status_only = opts.status_only;
445    let warn = opts.warn;
446    let ignore_missing = opts.ignore_missing;
447    let mut ok_count = 0;
448    let mut mismatch_count = 0;
449    let mut format_errors = 0;
450    let mut read_errors = 0;
451    let mut ignored_missing_count = 0;
452    let mut line_num = 0;
453
454    for line_result in reader.lines() {
455        line_num += 1;
456        let line = line_result?;
457        let line = line.trim_end();
458
459        if line.is_empty() {
460            continue;
461        }
462
463        // Parse "hash  filename" or "hash *filename" or "ALGO (file) = hash"
464        let (expected_hash, filename) = match parse_check_line(line) {
465            Some(v) => v,
466            None => {
467                format_errors += 1;
468                if warn {
469                    out.flush()?;
470                    if opts.warn_prefix.is_empty() {
471                        writeln!(
472                            err_out,
473                            "line {}: improperly formatted {} checksum line",
474                            line_num,
475                            algo.name()
476                        )?;
477                    } else {
478                        writeln!(
479                            err_out,
480                            "{}: {}: improperly formatted {} checksum line",
481                            opts.warn_prefix,
482                            line_num,
483                            algo.name()
484                        )?;
485                    }
486                }
487                continue;
488            }
489        };
490
491        // Compute actual hash
492        let actual = match hash_file(algo, Path::new(filename)) {
493            Ok(h) => h,
494            Err(e) => {
495                if ignore_missing && e.kind() == io::ErrorKind::NotFound {
496                    ignored_missing_count += 1;
497                    continue;
498                }
499                read_errors += 1;
500                if !status_only {
501                    out.flush()?;
502                    writeln!(err_out, "{}: {}", filename, e)?;
503                    writeln!(out, "{}: FAILED open or read", filename)?;
504                }
505                continue;
506            }
507        };
508
509        if actual.eq_ignore_ascii_case(expected_hash) {
510            ok_count += 1;
511            if !quiet && !status_only {
512                writeln!(out, "{}: OK", filename)?;
513            }
514        } else {
515            mismatch_count += 1;
516            if !status_only {
517                writeln!(out, "{}: FAILED", filename)?;
518            }
519        }
520    }
521
522    Ok(CheckResult {
523        ok: ok_count,
524        mismatches: mismatch_count,
525        format_errors,
526        read_errors,
527        ignored_missing: ignored_missing_count,
528    })
529}
530
/// Parse a checksum line in any supported format.
///
/// Recognised forms, returning `(expected_hash, filename)`:
///
/// * BSD tag: `MD5 (file) = hash`, `SHA256 (file) = hash`,
///   `BLAKE2b (file) = hash`, `BLAKE2b-NNN (file) = hash`;
/// * GNU text: `hash  file` (hash, space, space, filename);
/// * GNU binary: `hash *file` (hash, space, `*`, filename).
///
/// A single leading backslash (GNU's marker for an escaped filename) is
/// dropped before parsing either form; the filename itself is returned as
/// written, without unescaping.
///
/// The hash ends at the FIRST space and the following character selects the
/// mode, so filenames may themselves contain `"  "` or `" *"`. In the tag
/// form the LAST `") = "` separates filename from hash, so filenames may
/// contain that sequence too.
///
/// Returns `None` when the line matches none of these forms.
pub fn parse_check_line(line: &str) -> Option<(&str, &str)> {
    // Handle backslash-escaped lines (leading '\') for both formats.
    let line = line.strip_prefix('\\').unwrap_or(line);

    // Try BSD tag format: "ALGO (filename) = hash"
    if let Some(after_paren) = strip_tag_prefix(line) {
        // The hash never contains ") = ", so the last occurrence is the separator.
        if let Some(sep) = after_paren.rfind(") = ") {
            let filename = &after_paren[..sep];
            let hash = &after_paren[sep + 4..];
            return Some((hash, filename));
        }
    }

    // GNU format: the hash runs up to the first space; the next character is
    // the mode marker (' ' = text, '*' = binary).
    let space = line.find(' ')?;
    let hash = &line[..space];
    let after_space = &line[space + 1..];
    let filename = after_space
        .strip_prefix(' ')
        .or_else(|| after_space.strip_prefix('*'))?;
    Some((hash, filename))
}

/// Strips a recognised `ALGO (` / `BLAKE2b-NNN (` tag prefix and returns what follows it.
fn strip_tag_prefix(line: &str) -> Option<&str> {
    const PLAIN_TAGS: [&str; 3] = ["MD5 (", "SHA256 (", "BLAKE2b ("];
    if let Some(rest) = PLAIN_TAGS.iter().find_map(|tag| line.strip_prefix(*tag)) {
        return Some(rest);
    }

    // Handle BLAKE2b-NNN (filename) = hash; NNN must be one or more digits.
    let sized = line.strip_prefix("BLAKE2b-")?;
    let open = sized.find(" (")?;
    let digits = &sized[..open];
    if digits.is_empty() || !digits.bytes().all(|b| b.is_ascii_digit()) {
        return None;
    }
    Some(&sized[open + 2..])
}
575
/// Parse a BSD-style tag line: "ALGO (filename) = hash"
///
/// Returns `(expected_hash, filename, optional_bits)`. `bits` is the number
/// parsed from the text after the last `-` in the algorithm name (e.g.
/// `BLAKE2b-256` -> `Some(256)`); it is `None` when there is no dash or the
/// suffix is not a number.
///
/// The algorithm name ends at the first `" ("`, and the LAST `") = "`
/// separates filename from hash, so filenames containing `") = "` parse
/// correctly. Returns `None` if either delimiter is missing.
pub fn parse_check_line_tag(line: &str) -> Option<(&str, &str, Option<usize>)> {
    let paren_start = line.find(" (")?;
    let algo_part = &line[..paren_start];
    let rest = &line[paren_start + 2..];

    // The hash never contains ") = ", so search from the end.
    let paren_end = rest.rfind(") = ")?;
    let filename = &rest[..paren_end];
    let hash = &rest[paren_end + 4..];

    // Parse optional bit length from algo name (e.g., "BLAKE2b-256" -> Some(256))
    let bits = algo_part
        .rfind('-')
        .and_then(|dash_pos| algo_part[dash_pos + 1..].parse::<usize>().ok());

    Some((hash, filename, bits))
}
596
/// Fills `buf` from `reader` as far as possible and returns the byte count.
///
/// Keeps reading after short reads until the buffer is full or the reader
/// reports EOF, so a result smaller than `buf.len()` means EOF was reached.
/// `Interrupted` errors are retried; any other error is returned.
#[inline]
fn read_full(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
    let mut filled = 0;
    loop {
        let free = &mut buf[filled..];
        if free.is_empty() {
            return Ok(filled);
        }
        match reader.read(free) {
            Ok(0) => return Ok(filled),
            Ok(got) => filled += got,
            // Interrupted by a signal: simply try the read again.
            Err(ref err) if err.kind() == io::ErrorKind::Interrupted => {}
            Err(err) => return Err(err),
        }
    }
}
612
/// Compile-time generated 2-byte hex pair lookup table.
/// Each byte maps directly to its 2-char lowercase hex representation —
/// a single lookup per byte.
const fn generate_hex_table() -> [[u8; 2]; 256] {
    let hex = b"0123456789abcdef";
    let mut table = [[0u8; 2]; 256];
    let mut i = 0;
    // `while` rather than `for`: iterators are not usable in `const fn`.
    while i < 256 {
        table[i] = [hex[i >> 4], hex[i & 0xf]];
        i += 1;
    }
    table
}

/// Lookup table mapping every byte value to its two ASCII hex digits.
const HEX_TABLE: [[u8; 2]; 256] = generate_hex_table();

/// Encodes `bytes` as a lowercase hex string using the pair lookup table —
/// one lookup per input byte.
///
/// Written without `unsafe`: the output is pre-sized and filled by pushing
/// the two ASCII digits for each byte. (The previous version called
/// `set_len` on the string's buffer before initialising it, which breaks
/// that method's safety contract.) A `u8` index into a 256-entry table is
/// always in bounds.
pub(crate) fn hex_encode(bytes: &[u8]) -> String {
    let mut hex = String::with_capacity(bytes.len() * 2);
    for &byte in bytes {
        let [hi, lo] = HEX_TABLE[usize::from(byte)];
        hex.push(char::from(hi));
        hex.push(char::from(lo));
    }
    hex
}