coreutils_rs/hash/
core.rs

1use std::cell::RefCell;
2use std::fs::File;
3use std::io::{self, BufRead, Read, Write};
4use std::path::Path;
5
6#[cfg(target_os = "linux")]
7use std::sync::atomic::{AtomicBool, Ordering};
8
9use digest::Digest;
10use md5::Md5;
11use memmap2::MmapOptions;
12
/// Digest algorithms this module can compute.
#[derive(Debug, Clone, Copy)]
pub enum HashAlgorithm {
    /// SHA-256.
    Sha256,
    /// MD5.
    Md5,
    /// BLAKE2b.
    Blake2b,
}

impl HashAlgorithm {
    /// Returns the label used for this algorithm in BSD-style tag lines
    /// (`"<NAME> (file) = hash"`) and in format warnings.
    pub fn name(self) -> &'static str {
        match self {
            Self::Md5 => "MD5",
            Self::Sha256 => "SHA256",
            Self::Blake2b => "BLAKE2b",
        }
    }
}
30
31// ── Generic hash helpers ────────────────────────────────────────────
32
33fn hash_digest<D: Digest>(data: &[u8]) -> String {
34    hex_encode(&D::digest(data))
35}
36
37/// Streaming hash using thread-local 1MB buffer for optimal L2 cache behavior.
38/// 1MB fits in L2 cache on most CPUs, keeping data hot during hash update.
39/// Uses read_full to ensure each update() gets a full buffer, minimizing
40/// per-chunk hasher overhead and maximizing SIMD-friendly aligned updates.
41fn hash_reader_impl<D: Digest>(mut reader: impl Read) -> io::Result<String> {
42    STREAM_BUF.with(|cell| {
43        let mut buf = cell.borrow_mut();
44        let mut hasher = D::new();
45        loop {
46            let n = read_full(&mut reader, &mut buf)?;
47            if n == 0 {
48                break;
49            }
50            hasher.update(&buf[..n]);
51        }
52        Ok(hex_encode(&hasher.finalize()))
53    })
54}
55
56// ── Public hashing API ──────────────────────────────────────────────
57
/// Size in bytes (4 MiB) of the scratch buffer used by the streaming hash paths,
/// i.e. whenever input is consumed through `read()` rather than mmap.
/// A large buffer keeps the number of read syscalls and hasher updates low.
const HASH_READ_BUF: usize = 4 << 20;

// Per-thread scratch buffer for the streaming hash paths.
// It is created lazily on a thread's first use and then reused by every
// subsequent streaming hash on that thread, so the allocation happens once.
thread_local! {
    static STREAM_BUF: RefCell<Vec<u8>> = RefCell::new(vec![0; HASH_READ_BUF]);
}
69
70// ── SHA-256: ring on non-Apple, sha2 fallback on Apple ───────────────
71
72/// Single-shot SHA-256 using ring's BoringSSL assembly (Linux/Windows).
73#[cfg(not(target_vendor = "apple"))]
74fn sha256_bytes(data: &[u8]) -> String {
75    hex_encode(ring::digest::digest(&ring::digest::SHA256, data).as_ref())
76}
77
/// One-shot SHA-256 of `data` via the `sha2` crate (Apple targets), as lowercase hex.
#[cfg(target_vendor = "apple")]
fn sha256_bytes(data: &[u8]) -> String {
    let digest = <sha2::Sha256 as Digest>::digest(data);
    hex_encode(&digest)
}
83
84/// Streaming SHA-256 using ring's BoringSSL assembly (Linux/Windows).
85#[cfg(not(target_vendor = "apple"))]
86fn sha256_reader(mut reader: impl Read) -> io::Result<String> {
87    STREAM_BUF.with(|cell| {
88        let mut buf = cell.borrow_mut();
89        let mut ctx = ring::digest::Context::new(&ring::digest::SHA256);
90        loop {
91            let n = read_full(&mut reader, &mut buf)?;
92            if n == 0 {
93                break;
94            }
95            ctx.update(&buf[..n]);
96        }
97        Ok(hex_encode(ctx.finish().as_ref()))
98    })
99}
100
/// Streaming SHA-256 via the `sha2` crate (Apple targets), as lowercase hex.
///
/// # Errors
/// Propagates I/O errors from the generic streaming helper.
#[cfg(target_vendor = "apple")]
fn sha256_reader(reader: impl Read) -> io::Result<String> {
    type Sha256Impl = sha2::Sha256;
    hash_reader_impl::<Sha256Impl>(reader)
}
106
107/// Compute hash of a byte slice directly (zero-copy fast path).
108pub fn hash_bytes(algo: HashAlgorithm, data: &[u8]) -> String {
109    match algo {
110        HashAlgorithm::Sha256 => sha256_bytes(data),
111        HashAlgorithm::Md5 => hash_digest::<Md5>(data),
112        HashAlgorithm::Blake2b => {
113            let hash = blake2b_simd::blake2b(data);
114            hex_encode(hash.as_bytes())
115        }
116    }
117}
118
119/// Compute hash of data from a reader, returning hex string.
120pub fn hash_reader<R: Read>(algo: HashAlgorithm, reader: R) -> io::Result<String> {
121    match algo {
122        HashAlgorithm::Sha256 => sha256_reader(reader),
123        HashAlgorithm::Md5 => hash_reader_impl::<Md5>(reader),
124        HashAlgorithm::Blake2b => blake2b_hash_reader(reader, 64),
125    }
126}
127
128/// Track whether O_NOATIME is supported to avoid repeated failed open() attempts.
129/// After the first EPERM, we never try O_NOATIME again (saves one syscall per file).
130#[cfg(target_os = "linux")]
131static NOATIME_SUPPORTED: AtomicBool = AtomicBool::new(true);
132
133/// Open a file with O_NOATIME on Linux to avoid atime update overhead.
134/// Caches whether O_NOATIME works to avoid double-open on every file.
135#[cfg(target_os = "linux")]
136fn open_noatime(path: &Path) -> io::Result<File> {
137    use std::os::unix::fs::OpenOptionsExt;
138    if NOATIME_SUPPORTED.load(Ordering::Relaxed) {
139        match std::fs::OpenOptions::new()
140            .read(true)
141            .custom_flags(libc::O_NOATIME)
142            .open(path)
143        {
144            Ok(f) => return Ok(f),
145            Err(ref e) if e.raw_os_error() == Some(libc::EPERM) => {
146                // O_NOATIME requires file ownership or CAP_FOWNER — disable globally
147                NOATIME_SUPPORTED.store(false, Ordering::Relaxed);
148            }
149            Err(e) => return Err(e), // Real error, propagate
150        }
151    }
152    File::open(path)
153}
154
/// Non-Linux counterpart of `open_noatime`: a plain read-only open, since
/// `O_NOATIME` is only used on Linux here.
#[cfg(not(target_os = "linux"))]
fn open_noatime(path: &Path) -> io::Result<File> {
    std::fs::OpenOptions::new().read(true).open(path)
}
159
160/// Advise kernel for optimal mmap access: sequential readahead + transparent huge pages.
161/// MADV_SEQUENTIAL enables aggressive readahead (2x default window).
162/// MADV_HUGEPAGE requests 2MB pages, reducing TLB misses by ~500x for large files
163/// (e.g., 100MB file: 50 TLB entries with 2MB pages vs 25,600 with 4KB pages).
164#[cfg(target_os = "linux")]
165#[inline]
166fn mmap_advise(mmap: &memmap2::Mmap) {
167    unsafe {
168        let ptr = mmap.as_ptr() as *mut libc::c_void;
169        let len = mmap.len();
170        libc::madvise(ptr, len, libc::MADV_SEQUENTIAL);
171        libc::madvise(ptr, len, libc::MADV_HUGEPAGE);
172    }
173}
174
/// Non-Linux counterpart of `mmap_advise`: no hints are issued.
#[cfg(not(target_os = "linux"))]
#[inline]
fn mmap_advise(_mmap: &memmap2::Mmap) {
    // Intentionally empty.
}
178
179/// Hash a file by path. Single open + fstat to minimize syscalls.
180/// Uses zero-copy mmap for regular files: the hash function reads directly
181/// from the page cache without any kernel→user memcpy or read() syscalls.
182/// MAP_POPULATE prefaults all pages before hashing starts.
183/// MADV_HUGEPAGE uses 2MB pages to reduce TLB misses by ~500x.
184pub fn hash_file(algo: HashAlgorithm, path: &Path) -> io::Result<String> {
185    // Single open — reuse fd for fstat + mmap (saves separate stat + open)
186    let file = open_noatime(path)?;
187    let metadata = file.metadata()?; // fstat on existing fd, cheaper than stat(path)
188    let len = metadata.len();
189    let is_regular = metadata.file_type().is_file();
190
191    if is_regular && len == 0 {
192        return Ok(hash_bytes(algo, &[]));
193    }
194
195    if is_regular && len > 0 {
196        // Zero-copy mmap: hash function reads directly from page cache.
197        // No read() syscalls, no kernel→user memcpy.
198        // MAP_POPULATE prefaults all pages before hashing starts.
199        let mmap = unsafe { MmapOptions::new().populate().map(&file)? };
200        mmap_advise(&mmap);
201        return Ok(hash_bytes(algo, &mmap));
202    }
203
204    // Fallback: streaming read (special files, pipes, etc.) — fd already open
205    hash_reader(algo, file)
206}
207
208/// Hash stdin. Uses fadvise for file redirects, streaming for pipes.
209pub fn hash_stdin(algo: HashAlgorithm) -> io::Result<String> {
210    let stdin = io::stdin();
211    // Hint kernel for sequential access if stdin is a regular file (redirect)
212    #[cfg(target_os = "linux")]
213    {
214        use std::os::unix::io::AsRawFd;
215        let fd = stdin.as_raw_fd();
216        let mut stat: libc::stat = unsafe { std::mem::zeroed() };
217        if unsafe { libc::fstat(fd, &mut stat) } == 0
218            && (stat.st_mode & libc::S_IFMT) == libc::S_IFREG
219            && stat.st_size > 0
220        {
221            unsafe {
222                libc::posix_fadvise(fd, 0, stat.st_size, libc::POSIX_FADV_SEQUENTIAL);
223            }
224        }
225    }
226    // Streaming hash — works for both pipe and file-redirect stdin
227    hash_reader(algo, stdin.lock())
228}
229
/// Decides whether the given set of paths should be hashed in parallel.
///
/// The rule is simply "more than one file": a single path (or none) is hashed
/// inline, anything larger is considered worth dispatching to worker threads.
pub fn should_use_parallel(paths: &[&Path]) -> bool {
    let file_count = paths.len();
    file_count > 1
}
238
239/// Issue readahead hints for a list of file paths to warm the page cache.
240/// Uses POSIX_FADV_WILLNEED which is non-blocking and batches efficiently.
241#[cfg(target_os = "linux")]
242pub fn readahead_files(paths: &[&Path]) {
243    use std::os::unix::io::AsRawFd;
244    for path in paths {
245        if let Ok(file) = open_noatime(path) {
246            if let Ok(meta) = file.metadata() {
247                let len = meta.len();
248                if meta.file_type().is_file() && len > 0 {
249                    unsafe {
250                        libc::posix_fadvise(
251                            file.as_raw_fd(),
252                            0,
253                            len as i64,
254                            libc::POSIX_FADV_WILLNEED,
255                        );
256                    }
257                }
258            }
259        }
260    }
261}
262
/// Non-Linux counterpart of `readahead_files`: accepts the paths and does nothing.
#[cfg(not(target_os = "linux"))]
pub fn readahead_files(paths: &[&Path]) {
    // No readahead hint is issued on this platform.
    let _ = paths;
}
267
268// --- BLAKE2b variable-length functions (using blake2b_simd) ---
269
270/// Hash raw data with BLAKE2b variable output length.
271/// `output_bytes` is the output size in bytes (e.g., 32 for 256-bit).
272pub fn blake2b_hash_data(data: &[u8], output_bytes: usize) -> String {
273    let hash = blake2b_simd::Params::new()
274        .hash_length(output_bytes)
275        .hash(data);
276    hex_encode(hash.as_bytes())
277}
278
279/// Hash a reader with BLAKE2b variable output length.
280/// Uses thread-local 1MB buffer for cache-friendly streaming.
281pub fn blake2b_hash_reader<R: Read>(mut reader: R, output_bytes: usize) -> io::Result<String> {
282    STREAM_BUF.with(|cell| {
283        let mut buf = cell.borrow_mut();
284        let mut state = blake2b_simd::Params::new()
285            .hash_length(output_bytes)
286            .to_state();
287        loop {
288            let n = read_full(&mut reader, &mut buf)?;
289            if n == 0 {
290                break;
291            }
292            state.update(&buf[..n]);
293        }
294        Ok(hex_encode(state.finalize().as_bytes()))
295    })
296}
297
298/// Hash a file with BLAKE2b variable output length. Single open + fstat.
299/// Uses zero-copy mmap for regular files, streaming for pipes/special files.
300pub fn blake2b_hash_file(path: &Path, output_bytes: usize) -> io::Result<String> {
301    // Single open — reuse fd for fstat + mmap
302    let file = open_noatime(path)?;
303    let metadata = file.metadata()?;
304    let len = metadata.len();
305    let is_regular = metadata.file_type().is_file();
306
307    if is_regular && len == 0 {
308        return Ok(blake2b_hash_data(&[], output_bytes));
309    }
310
311    if is_regular && len > 0 {
312        // Zero-copy mmap: hash function reads directly from page cache.
313        let mmap = unsafe { MmapOptions::new().populate().map(&file)? };
314        mmap_advise(&mmap);
315        return Ok(blake2b_hash_data(&mmap, output_bytes));
316    }
317
318    // Fallback: streaming read — fd already open
319    blake2b_hash_reader(file, output_bytes)
320}
321
322/// Hash stdin with BLAKE2b variable output length.
323/// Tries fadvise if stdin is a regular file (shell redirect), then streams.
324pub fn blake2b_hash_stdin(output_bytes: usize) -> io::Result<String> {
325    let stdin = io::stdin();
326    #[cfg(target_os = "linux")]
327    {
328        use std::os::unix::io::AsRawFd;
329        let fd = stdin.as_raw_fd();
330        let mut stat: libc::stat = unsafe { std::mem::zeroed() };
331        if unsafe { libc::fstat(fd, &mut stat) } == 0
332            && (stat.st_mode & libc::S_IFMT) == libc::S_IFREG
333            && stat.st_size > 0
334        {
335            unsafe {
336                libc::posix_fadvise(fd, 0, stat.st_size, libc::POSIX_FADV_SEQUENTIAL);
337            }
338        }
339    }
340    blake2b_hash_reader(stdin.lock(), output_bytes)
341}
342
/// Writes one checksum line in GNU format, terminated by a newline.
///
/// The layout is `<hash> <marker><filename>`, where the marker is `*` in binary
/// mode and a space in text mode (so text mode shows two spaces).
///
/// # Errors
/// Propagates any error from writing to `out`.
pub fn print_hash(
    out: &mut impl Write,
    hash: &str,
    filename: &str,
    binary: bool,
) -> io::Result<()> {
    let marker = match binary {
        true => '*',
        false => ' ',
    };
    out.write_fmt(format_args!("{} {}{}\n", hash, marker, filename))
}
353
/// Writes one checksum line in GNU format, terminated by a NUL byte instead of
/// a newline.
///
/// The layout is `<hash> <marker><filename>`, where the marker is `*` in binary
/// mode and a space in text mode.
///
/// # Errors
/// Propagates any error from writing to `out`.
pub fn print_hash_zero(
    out: &mut impl Write,
    hash: &str,
    filename: &str,
    binary: bool,
) -> io::Result<()> {
    let marker = match binary {
        true => '*',
        false => ' ',
    };
    out.write_fmt(format_args!("{} {}{}\0", hash, marker, filename))
}
364
365/// Print hash result in BSD tag format: "ALGO (filename) = hash\n"
366pub fn print_hash_tag(
367    out: &mut impl Write,
368    algo: HashAlgorithm,
369    hash: &str,
370    filename: &str,
371) -> io::Result<()> {
372    writeln!(out, "{} ({}) = {}", algo.name(), filename, hash)
373}
374
375/// Print hash in BSD tag format with NUL terminator.
376pub fn print_hash_tag_zero(
377    out: &mut impl Write,
378    algo: HashAlgorithm,
379    hash: &str,
380    filename: &str,
381) -> io::Result<()> {
382    write!(out, "{} ({}) = {}\0", algo.name(), filename, hash)
383}
384
/// Writes one BLAKE2b checksum line in BSD tag format, terminated by a newline.
///
/// A 512-bit digest is tagged plainly as `BLAKE2b (<filename>) = <hash>`; any
/// other length carries the bit count, e.g. `BLAKE2b-256 (<filename>) = <hash>`.
///
/// # Errors
/// Propagates any error from writing to `out`.
pub fn print_hash_tag_b2sum(
    out: &mut impl Write,
    hash: &str,
    filename: &str,
    bits: usize,
) -> io::Result<()> {
    match bits {
        512 => out.write_fmt(format_args!("BLAKE2b ({}) = {}\n", filename, hash)),
        other => out.write_fmt(format_args!(
            "BLAKE2b-{} ({}) = {}\n",
            other, filename, hash
        )),
    }
}
400
/// Writes one BLAKE2b checksum line in BSD tag format, terminated by a NUL byte
/// instead of a newline.
///
/// A 512-bit digest is tagged plainly as `BLAKE2b (<filename>) = <hash>`; any
/// other length carries the bit count, e.g. `BLAKE2b-256 (<filename>) = <hash>`.
///
/// # Errors
/// Propagates any error from writing to `out`.
pub fn print_hash_tag_b2sum_zero(
    out: &mut impl Write,
    hash: &str,
    filename: &str,
    bits: usize,
) -> io::Result<()> {
    match bits {
        512 => out.write_fmt(format_args!("BLAKE2b ({}) = {}\0", filename, hash)),
        other => out.write_fmt(format_args!(
            "BLAKE2b-{} ({}) = {}\0",
            other, filename, hash
        )),
    }
}
414
/// Options controlling check mode (`check_file`).
///
/// `Default` yields every flag off and an empty `warn_prefix`.
#[derive(Debug, Clone, Default)]
pub struct CheckOptions {
    /// Suppress the per-file `OK` lines (failures are still reported).
    pub quiet: bool,
    /// Suppress all per-file output; only the returned counts convey the result.
    pub status_only: bool,
    /// Strict mode flag. It is not consulted by `check_file` itself.
    /// NOTE(review): presumably the caller uses it to turn format errors into a
    /// failing exit status — confirm.
    pub strict: bool,
    /// Emit a warning for every improperly formatted checksum line.
    pub warn: bool,
    /// Silently skip files that do not exist instead of counting a read error.
    pub ignore_missing: bool,
    /// Prefix for per-line format warnings, e.g., "fmd5sum: checksums.txt".
    /// When non-empty, warnings use GNU format: "{prefix}: {line}: message".
    /// When empty, uses generic format: "line {line}: message".
    pub warn_prefix: String,
}
427
/// Tally produced by check mode (`check_file`).
///
/// `Default` yields all counters at zero.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct CheckResult {
    /// Files whose computed digest matched the expected one.
    pub ok: usize,
    /// Files whose computed digest differed from the expected one.
    pub mismatches: usize,
    /// Non-empty lines that could not be parsed as a checksum line.
    pub format_errors: usize,
    /// Files that could not be opened or read.
    pub read_errors: usize,
    /// Number of files skipped because they were missing and --ignore-missing was set.
    pub ignored_missing: usize,
}
437
438/// Verify checksums from a check file.
439/// Each line should be "hash  filename" or "hash *filename" or "ALGO (filename) = hash".
440pub fn check_file<R: BufRead>(
441    algo: HashAlgorithm,
442    reader: R,
443    opts: &CheckOptions,
444    out: &mut impl Write,
445    err_out: &mut impl Write,
446) -> io::Result<CheckResult> {
447    let quiet = opts.quiet;
448    let status_only = opts.status_only;
449    let warn = opts.warn;
450    let ignore_missing = opts.ignore_missing;
451    let mut ok_count = 0;
452    let mut mismatch_count = 0;
453    let mut format_errors = 0;
454    let mut read_errors = 0;
455    let mut ignored_missing_count = 0;
456    let mut line_num = 0;
457
458    for line_result in reader.lines() {
459        line_num += 1;
460        let line = line_result?;
461        let line = line.trim_end();
462
463        if line.is_empty() {
464            continue;
465        }
466
467        // Parse "hash  filename" or "hash *filename" or "ALGO (file) = hash"
468        let (expected_hash, filename) = match parse_check_line(line) {
469            Some(v) => v,
470            None => {
471                format_errors += 1;
472                if warn {
473                    out.flush()?;
474                    if opts.warn_prefix.is_empty() {
475                        writeln!(
476                            err_out,
477                            "line {}: improperly formatted {} checksum line",
478                            line_num,
479                            algo.name()
480                        )?;
481                    } else {
482                        writeln!(
483                            err_out,
484                            "{}: {}: improperly formatted {} checksum line",
485                            opts.warn_prefix,
486                            line_num,
487                            algo.name()
488                        )?;
489                    }
490                }
491                continue;
492            }
493        };
494
495        // Compute actual hash
496        let actual = match hash_file(algo, Path::new(filename)) {
497            Ok(h) => h,
498            Err(e) => {
499                if ignore_missing && e.kind() == io::ErrorKind::NotFound {
500                    ignored_missing_count += 1;
501                    continue;
502                }
503                read_errors += 1;
504                if !status_only {
505                    out.flush()?;
506                    writeln!(err_out, "{}: {}", filename, e)?;
507                    writeln!(out, "{}: FAILED open or read", filename)?;
508                }
509                continue;
510            }
511        };
512
513        if actual.eq_ignore_ascii_case(expected_hash) {
514            ok_count += 1;
515            if !quiet && !status_only {
516                writeln!(out, "{}: OK", filename)?;
517            }
518        } else {
519            mismatch_count += 1;
520            if !status_only {
521                writeln!(out, "{}: FAILED", filename)?;
522            }
523        }
524    }
525
526    Ok(CheckResult {
527        ok: ok_count,
528        mismatches: mismatch_count,
529        format_errors,
530        read_errors,
531        ignored_missing: ignored_missing_count,
532    })
533}
534
/// Parses one checksum line in any supported format and returns
/// `(expected_hash, filename)`, or `None` if the line matches none of them.
///
/// Supported formats:
/// * BSD tag: `MD5 (file) = hash`, `SHA256 (file) = hash`, `BLAKE2b (file) = hash`,
///   `BLAKE2b-<bits> (file) = hash`
/// * GNU text mode: `hash  file` (two spaces)
/// * GNU binary mode: `hash *file`
///
/// A single leading backslash (the marker GNU tools put in front of lines whose
/// filename was escaped) is dropped before either format is tried. The filename
/// is returned exactly as written: escape sequences inside it are NOT decoded,
/// because the result borrows from `line`.
///
/// Neither the hash nor the algorithm tag is validated against a particular
/// algorithm here.
pub fn parse_check_line(line: &str) -> Option<(&str, &str)> {
    // The escape marker precedes the whole line in both formats, so remove it
    // up front; otherwise an escaped tag line would never match a tag prefix.
    let line = line.strip_prefix('\\').unwrap_or(line);

    // BSD tag format: "ALGO (filename) = hash".
    let tagged = line
        .strip_prefix("MD5 (")
        .or_else(|| line.strip_prefix("SHA256 ("))
        .or_else(|| line.strip_prefix("BLAKE2b ("))
        .or_else(|| {
            // "BLAKE2b-NNN (filename) = hash": NNN must be one or more digits.
            let after = line.strip_prefix("BLAKE2b-")?;
            let sp = after.find(" (")?;
            let digits = &after[..sp];
            if !digits.is_empty() && digits.bytes().all(|b| b.is_ascii_digit()) {
                Some(&after[sp + 2..])
            } else {
                None
            }
        });
    if let Some(rest) = tagged {
        // A hex digest can never contain ") = " but a filename can, so the
        // separator is the LAST occurrence.
        if let Some(sep) = rest.rfind(") = ") {
            return Some((&rest[sep + 4..], &rest[..sep]));
        }
    }

    // GNU format: "hash  filename" or "hash *filename". The separator is
    // whichever marker appears first; anything after it — including further
    // double spaces or " *" sequences — belongs to the filename.
    let sep = match (line.find("  "), line.find(" *")) {
        (Some(text), Some(binary)) => text.min(binary),
        (Some(only), None) | (None, Some(only)) => only,
        (None, None) => return None,
    };
    Some((&line[..sep], &line[sep + 2..]))
}
579
/// Parses a BSD-style tag line, `ALGO (filename) = hash`.
///
/// Returns `(expected_hash, filename, bits)`, or `None` if the line has no
/// `" ("` opener or no `") = "` separator. `bits` is the number following the
/// last `-` in the algorithm name (e.g. `BLAKE2b-256` -> `Some(256)`), or `None`
/// when the name has no such numeric suffix.
///
/// The algorithm name itself is not validated, and the filename is returned
/// exactly as written.
pub fn parse_check_line_tag(line: &str) -> Option<(&str, &str, Option<usize>)> {
    let paren_start = line.find(" (")?;
    let algo_part = &line[..paren_start];
    let rest = &line[paren_start + 2..];

    // A hex digest can never contain ") = " but a filename can, so the
    // separator is the LAST occurrence.
    let paren_end = rest.rfind(") = ")?;
    let filename = &rest[..paren_end];
    let hash = &rest[paren_end + 4..];

    let bits = algo_part
        .rfind('-')
        .and_then(|dash| algo_part[dash + 1..].parse::<usize>().ok());

    Some((hash, filename, bits))
}
600
/// Fills `buf` from `reader` as far as possible and returns the number of bytes
/// stored.
///
/// Short reads are retried until the buffer is full or the reader signals end
/// of input (a zero-length read), so a return value smaller than `buf.len()`
/// means the input is exhausted. `Interrupted` errors are retried; every other
/// error is returned immediately.
#[inline]
fn read_full(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
    let mut filled = 0;
    loop {
        let spare = &mut buf[filled..];
        if spare.is_empty() {
            return Ok(filled);
        }
        match reader.read(spare) {
            Ok(0) => return Ok(filled),
            Ok(got) => filled += got,
            Err(ref err) if err.kind() == io::ErrorKind::Interrupted => {}
            Err(err) => return Err(err),
        }
    }
}
616
/// Builds, at compile time, the table mapping every byte value to its two
/// lowercase ASCII hex digits (`table[0xa5] == [b'a', b'5']`).
const fn generate_hex_table() -> [[u8; 2]; 256] {
    let digits = b"0123456789abcdef";
    let mut table = [[0u8; 2]; 256];
    let mut value = 0;
    while value < 256 {
        // High nibble first, then low nibble.
        table[value] = [digits[value >> 4], digits[value & 0xf]];
        value += 1;
    }
    table
}

/// Byte -> hex-digit-pair lookup table, evaluated once at compile time.
const HEX_TABLE: [[u8; 2]; 256] = generate_hex_table();

/// Encodes `bytes` as a lowercase hex string, two characters per input byte.
///
/// Uses one `HEX_TABLE` lookup per byte and a single up-front allocation. The
/// implementation is entirely safe: the output is only ever extended with
/// initialised ASCII characters, rather than having its length set ahead of
/// the bytes being written.
pub(crate) fn hex_encode(bytes: &[u8]) -> String {
    let mut hex = String::with_capacity(bytes.len() * 2);
    for &byte in bytes {
        let pair = HEX_TABLE[usize::from(byte)];
        // Both table entries are ASCII hex digits, so each push appends one byte.
        hex.push(char::from(pair[0]));
        hex.push(char::from(pair[1]));
    }
    hex
}
coreutils_rs/hash/core.rs

coreutils_rs/hash/
core.rs