coreutils_rs/hash/
core.rs

1use std::cell::RefCell;
2use std::fs::File;
3use std::io::{self, BufRead, Read, Write};
4use std::path::Path;
5
6#[cfg(target_os = "linux")]
7use std::sync::atomic::{AtomicBool, Ordering};
8
9#[cfg(not(target_os = "linux"))]
10use digest::Digest;
11#[cfg(not(target_os = "linux"))]
12use md5::Md5;
13
/// The digest algorithms this module can compute.
#[derive(Debug, Clone, Copy)]
pub enum HashAlgorithm {
    Sha256,
    Md5,
    Blake2b,
}

impl HashAlgorithm {
    /// Returns the display name of the algorithm, as used in BSD-style tag
    /// output and in check-mode warnings.
    pub fn name(self) -> &'static str {
        match self {
            Self::Sha256 => "SHA256",
            Self::Md5 => "MD5",
            Self::Blake2b => "BLAKE2b",
        }
    }
}
31
32// ── Generic hash helpers ────────────────────────────────────────────
33
/// One-shot hash of an in-memory slice through any `Digest` implementation
/// (non-Linux fallback). Returns the digest as lowercase hex.
#[cfg(not(target_os = "linux"))]
fn hash_digest<D: Digest>(data: &[u8]) -> String {
    let mut hasher = D::new();
    hasher.update(data);
    hex_encode(&hasher.finalize())
}
39
/// Streaming hash through any `Digest` implementation (non-Linux fallback).
///
/// Data is pulled from `reader` into the thread-local `STREAM_BUF` and fed to
/// the hasher one buffer at a time until `read_full` reports end of input.
/// Returns the digest as lowercase hex, or the first I/O error encountered.
#[cfg(not(target_os = "linux"))]
fn hash_reader_impl<D: Digest>(mut reader: impl Read) -> io::Result<String> {
    STREAM_BUF.with(|scratch| {
        let mut chunk = scratch.borrow_mut();
        let mut state = D::new();
        loop {
            match read_full(&mut reader, &mut chunk)? {
                0 => break,
                filled => {
                    state.update(&chunk[..filled]);
                }
            }
        }
        Ok(hex_encode(&state.finalize()))
    })
}
56
57// ── Public hashing API ──────────────────────────────────────────────
58
/// Buffer size for streaming hash I/O.
/// 8MB: amortizes syscall overhead while still fitting in L3 cache on modern CPUs.
/// A larger buffer means fewer read() calls per file (e.g., 13 reads for 100MB,
/// versus 25 with a 4MB buffer).
const HASH_READ_BUF: usize = 8 * 1024 * 1024;
63
// Thread-local reusable buffer for streaming hash I/O.
// Allocated (HASH_READ_BUF zeroed bytes) on first use in each thread and then
// reused by every streaming hash call on that thread. It sits behind a RefCell,
// so a nested borrow on the same thread would panic; the streaming functions in
// this file borrow it once and do not re-enter.
thread_local! {
    static STREAM_BUF: RefCell<Vec<u8>> = RefCell::new(vec![0u8; HASH_READ_BUF]);
}
69
70// ── SHA-256 ───────────────────────────────────────────────────────────
71
72/// Single-shot SHA-256 using OpenSSL's optimized assembly (SHA-NI on x86).
73/// Linux only — OpenSSL is not available on Windows/macOS in CI.
74#[cfg(target_os = "linux")]
75fn sha256_bytes(data: &[u8]) -> String {
76    let digest = openssl::hash::hash(openssl::hash::MessageDigest::sha256(), data)
77        .expect("SHA256 hash failed");
78    hex_encode(&digest)
79}
80
/// One-shot SHA-256 of an in-memory slice via ring's BoringSSL assembly
/// (Windows and other non-Apple, non-Linux targets).
#[cfg(all(not(target_vendor = "apple"), not(target_os = "linux")))]
fn sha256_bytes(data: &[u8]) -> String {
    let digest = ring::digest::digest(&ring::digest::SHA256, data);
    hex_encode(digest.as_ref())
}
86
/// Single-shot SHA-256 using sha2 crate (macOS fallback — ring doesn't compile on Apple Silicon).
/// Delegates to the generic `hash_digest` helper and returns lowercase hex.
#[cfg(target_vendor = "apple")]
fn sha256_bytes(data: &[u8]) -> String {
    hash_digest::<sha2::Sha256>(data)
}
92
93/// Streaming SHA-256 using OpenSSL's optimized assembly.
94/// Linux only — OpenSSL is not available on Windows/macOS in CI.
95#[cfg(target_os = "linux")]
96fn sha256_reader(mut reader: impl Read) -> io::Result<String> {
97    STREAM_BUF.with(|cell| {
98        let mut buf = cell.borrow_mut();
99        let mut hasher = openssl::hash::Hasher::new(openssl::hash::MessageDigest::sha256())
100            .map_err(|e| io::Error::other(e))?;
101        loop {
102            let n = read_full(&mut reader, &mut buf)?;
103            if n == 0 {
104                break;
105            }
106            hasher.update(&buf[..n]).map_err(|e| io::Error::other(e))?;
107        }
108        let digest = hasher.finish().map_err(|e| io::Error::other(e))?;
109        Ok(hex_encode(&digest))
110    })
111}
112
/// Streaming SHA-256 via ring's BoringSSL assembly (Windows and other
/// non-Apple, non-Linux targets).
///
/// Reads `reader` to the end through the thread-local `STREAM_BUF`.
#[cfg(all(not(target_vendor = "apple"), not(target_os = "linux")))]
fn sha256_reader(mut reader: impl Read) -> io::Result<String> {
    STREAM_BUF.with(|scratch| {
        let mut chunk = scratch.borrow_mut();
        let mut context = ring::digest::Context::new(&ring::digest::SHA256);
        loop {
            match read_full(&mut reader, &mut chunk)? {
                0 => break,
                filled => {
                    context.update(&chunk[..filled]);
                }
            }
        }
        let digest = context.finish();
        Ok(hex_encode(digest.as_ref()))
    })
}
129
/// Streaming SHA-256 using sha2 crate (macOS fallback).
/// Delegates to the generic `hash_reader_impl`, which reads `reader` to the end
/// and returns the digest as lowercase hex.
#[cfg(target_vendor = "apple")]
fn sha256_reader(reader: impl Read) -> io::Result<String> {
    hash_reader_impl::<sha2::Sha256>(reader)
}
135
136/// Compute hash of a byte slice directly (zero-copy fast path).
137pub fn hash_bytes(algo: HashAlgorithm, data: &[u8]) -> String {
138    match algo {
139        HashAlgorithm::Sha256 => sha256_bytes(data),
140        HashAlgorithm::Md5 => md5_bytes(data),
141        HashAlgorithm::Blake2b => {
142            let hash = blake2b_simd::blake2b(data);
143            hex_encode(hash.as_bytes())
144        }
145    }
146}
147
148// ── MD5 ─────────────────────────────────────────────────────────────
149
150/// Single-shot MD5 using OpenSSL's optimized assembly (Linux).
151#[cfg(target_os = "linux")]
152fn md5_bytes(data: &[u8]) -> String {
153    let digest =
154        openssl::hash::hash(openssl::hash::MessageDigest::md5(), data).expect("MD5 hash failed");
155    hex_encode(&digest)
156}
157
/// Single-shot MD5 using md-5 crate (non-Linux fallback).
/// Delegates to the generic `hash_digest` helper and returns lowercase hex.
#[cfg(not(target_os = "linux"))]
fn md5_bytes(data: &[u8]) -> String {
    hash_digest::<Md5>(data)
}
163
164/// Compute hash of data from a reader, returning hex string.
165pub fn hash_reader<R: Read>(algo: HashAlgorithm, reader: R) -> io::Result<String> {
166    match algo {
167        HashAlgorithm::Sha256 => sha256_reader(reader),
168        HashAlgorithm::Md5 => md5_reader(reader),
169        HashAlgorithm::Blake2b => blake2b_hash_reader(reader, 64),
170    }
171}
172
173/// Streaming MD5 using OpenSSL's optimized assembly (Linux).
174#[cfg(target_os = "linux")]
175fn md5_reader(mut reader: impl Read) -> io::Result<String> {
176    STREAM_BUF.with(|cell| {
177        let mut buf = cell.borrow_mut();
178        let mut hasher = openssl::hash::Hasher::new(openssl::hash::MessageDigest::md5())
179            .map_err(|e| io::Error::other(e))?;
180        loop {
181            let n = read_full(&mut reader, &mut buf)?;
182            if n == 0 {
183                break;
184            }
185            hasher.update(&buf[..n]).map_err(|e| io::Error::other(e))?;
186        }
187        let digest = hasher.finish().map_err(|e| io::Error::other(e))?;
188        Ok(hex_encode(&digest))
189    })
190}
191
/// Streaming MD5 using md-5 crate (non-Linux fallback).
/// Delegates to the generic `hash_reader_impl`, which reads `reader` to the end
/// and returns the digest as lowercase hex.
#[cfg(not(target_os = "linux"))]
fn md5_reader(reader: impl Read) -> io::Result<String> {
    hash_reader_impl::<Md5>(reader)
}
197
/// Track whether O_NOATIME is supported to avoid repeated failed open() attempts.
/// After the first EPERM, we never try O_NOATIME again (saves one syscall per file).
/// Starts as `true`; `open_noatime` flips it to `false` process-wide and it is
/// never set back. Accessed with `Ordering::Relaxed` — it is only a hint, so a
/// stale read costs at most one extra failed open.
#[cfg(target_os = "linux")]
static NOATIME_SUPPORTED: AtomicBool = AtomicBool::new(true);
202
203/// Open a file with O_NOATIME on Linux to avoid atime update overhead.
204/// Caches whether O_NOATIME works to avoid double-open on every file.
205#[cfg(target_os = "linux")]
206fn open_noatime(path: &Path) -> io::Result<File> {
207    use std::os::unix::fs::OpenOptionsExt;
208    if NOATIME_SUPPORTED.load(Ordering::Relaxed) {
209        match std::fs::OpenOptions::new()
210            .read(true)
211            .custom_flags(libc::O_NOATIME)
212            .open(path)
213        {
214            Ok(f) => return Ok(f),
215            Err(ref e) if e.raw_os_error() == Some(libc::EPERM) => {
216                // O_NOATIME requires file ownership or CAP_FOWNER — disable globally
217                NOATIME_SUPPORTED.store(false, Ordering::Relaxed);
218            }
219            Err(e) => return Err(e), // Real error, propagate
220        }
221    }
222    File::open(path)
223}
224
/// Non-Linux variant of `open_noatime`: a plain read-only `File::open`.
/// No O_NOATIME attempt is made on these targets.
#[cfg(not(target_os = "linux"))]
fn open_noatime(path: &Path) -> io::Result<File> {
    File::open(path)
}
229
/// Minimum file size to issue fadvise hint (1MB).
/// For small files, the syscall overhead exceeds the readahead benefit.
/// Compared with `>=`, so a file of exactly 1MB does get the hint.
#[cfg(target_os = "linux")]
const FADVISE_MIN_SIZE: u64 = 1024 * 1024;
234
/// Size threshold (1MB) for the single-read hash optimization.
/// Regular files strictly smaller than this are read entirely into a thread-local
/// buffer and hashed with single-shot hash (avoids Hasher allocation + streaming
/// overhead). Files of this size or larger take the mmap path instead.
const SMALL_FILE_LIMIT: u64 = 1024 * 1024;
239
// Thread-local reusable buffer for small-file single-read hash.
// Avoids repeated allocation for many small files (e.g., 100 files of 1KB each).
// Starts with 64KB of capacity; the small-file helpers clear it before each use
// and let it grow as needed, so its capacity is kept for the thread's lifetime.
thread_local! {
    static SMALL_FILE_BUF: RefCell<Vec<u8>> = RefCell::new(Vec::with_capacity(64 * 1024));
}
245
246/// Hash a file by path. Uses mmap for large files (zero-copy, no read() syscalls),
247/// single-read + single-shot hash for small files, and streaming read as fallback.
248pub fn hash_file(algo: HashAlgorithm, path: &Path) -> io::Result<String> {
249    let file = open_noatime(path)?;
250    let metadata = file.metadata()?;
251    let file_size = metadata.len();
252
253    if metadata.file_type().is_file() && file_size == 0 {
254        return Ok(hash_bytes(algo, &[]));
255    }
256
257    if file_size > 0 && metadata.file_type().is_file() {
258        // mmap for large files — zero-copy, eliminates multiple read() syscalls
259        if file_size >= SMALL_FILE_LIMIT {
260            #[cfg(target_os = "linux")]
261            if file_size >= FADVISE_MIN_SIZE {
262                use std::os::unix::io::AsRawFd;
263                unsafe {
264                    libc::posix_fadvise(
265                        file.as_raw_fd(),
266                        0,
267                        file_size as i64,
268                        libc::POSIX_FADV_SEQUENTIAL,
269                    );
270                }
271            }
272            if let Ok(mmap) = unsafe { memmap2::MmapOptions::new().populate().map(&file) } {
273                #[cfg(target_os = "linux")]
274                {
275                    let _ = mmap.advise(memmap2::Advice::Sequential);
276                    if file_size >= 2 * 1024 * 1024 {
277                        let _ = mmap.advise(memmap2::Advice::HugePage);
278                    }
279                }
280                return Ok(hash_bytes(algo, &mmap));
281            }
282        }
283        // Small files: single read into thread-local buffer, then single-shot hash.
284        // This avoids Hasher context allocation + streaming overhead for each file.
285        // Especially beneficial for many small files (100+ files of a few KB each).
286        if file_size < SMALL_FILE_LIMIT {
287            return hash_file_small(algo, file, file_size as usize);
288        }
289    }
290
291    // Non-regular files or fallback: stream
292    #[cfg(target_os = "linux")]
293    if file_size >= FADVISE_MIN_SIZE {
294        use std::os::unix::io::AsRawFd;
295        unsafe {
296            libc::posix_fadvise(file.as_raw_fd(), 0, 0, libc::POSIX_FADV_SEQUENTIAL);
297        }
298    }
299    hash_reader(algo, file)
300}
301
302/// Hash a small file by reading it entirely into a thread-local buffer,
303/// then using the single-shot hash function. Avoids per-file Hasher allocation.
304#[inline]
305fn hash_file_small(algo: HashAlgorithm, mut file: File, size: usize) -> io::Result<String> {
306    SMALL_FILE_BUF.with(|cell| {
307        let mut buf = cell.borrow_mut();
308        buf.clear();
309        // Pre-allocate to known file size to avoid reallocation
310        let cap = buf.capacity();
311        if cap < size {
312            buf.reserve(size - cap);
313        }
314        file.read_to_end(&mut buf)?;
315        Ok(hash_bytes(algo, &buf))
316    })
317}
318
319/// Hash stdin. Uses fadvise for file redirects, streaming for pipes.
320pub fn hash_stdin(algo: HashAlgorithm) -> io::Result<String> {
321    let stdin = io::stdin();
322    // Hint kernel for sequential access if stdin is a regular file (redirect)
323    #[cfg(target_os = "linux")]
324    {
325        use std::os::unix::io::AsRawFd;
326        let fd = stdin.as_raw_fd();
327        let mut stat: libc::stat = unsafe { std::mem::zeroed() };
328        if unsafe { libc::fstat(fd, &mut stat) } == 0
329            && (stat.st_mode & libc::S_IFMT) == libc::S_IFREG
330            && stat.st_size > 0
331        {
332            unsafe {
333                libc::posix_fadvise(fd, 0, stat.st_size, libc::POSIX_FADV_SEQUENTIAL);
334            }
335        }
336    }
337    // Streaming hash — works for both pipe and file-redirect stdin
338    hash_reader(algo, stdin.lock())
339}
340
/// Decide whether hashing the given paths in parallel is worthwhile.
///
/// Returns `true` whenever there is more than one path. Rayon's thread pool is
/// lazily initialized once and reused, so per-file work-stealing overhead is
/// negligible (~1µs); not stat()-ing the files to check their sizes saves N
/// syscalls for N files.
pub fn should_use_parallel(paths: &[&Path]) -> bool {
    paths.len() > 1
}
348
349/// Issue readahead hints for a list of file paths to warm the page cache.
350/// Uses POSIX_FADV_WILLNEED which is non-blocking and batches efficiently.
351/// Only issues hints for files >= 1MB; small files are read fast enough
352/// that the fadvise syscall overhead isn't worth it.
353#[cfg(target_os = "linux")]
354pub fn readahead_files(paths: &[&Path]) {
355    use std::os::unix::io::AsRawFd;
356    for path in paths {
357        if let Ok(file) = open_noatime(path) {
358            if let Ok(meta) = file.metadata() {
359                let len = meta.len();
360                if meta.file_type().is_file() && len >= FADVISE_MIN_SIZE {
361                    unsafe {
362                        libc::posix_fadvise(
363                            file.as_raw_fd(),
364                            0,
365                            len as i64,
366                            libc::POSIX_FADV_WILLNEED,
367                        );
368                    }
369                }
370            }
371        }
372    }
373}
374
/// Non-Linux variant of `readahead_files`: does nothing, so callers can invoke
/// it unconditionally on every platform.
#[cfg(not(target_os = "linux"))]
pub fn readahead_files(_paths: &[&Path]) {
    // No-op on non-Linux
}
379
380// --- BLAKE2b variable-length functions (using blake2b_simd) ---
381
382/// Hash raw data with BLAKE2b variable output length.
383/// `output_bytes` is the output size in bytes (e.g., 32 for 256-bit).
384pub fn blake2b_hash_data(data: &[u8], output_bytes: usize) -> String {
385    let hash = blake2b_simd::Params::new()
386        .hash_length(output_bytes)
387        .hash(data);
388    hex_encode(hash.as_bytes())
389}
390
391/// Hash a reader with BLAKE2b variable output length.
392/// Uses thread-local buffer for cache-friendly streaming.
393pub fn blake2b_hash_reader<R: Read>(mut reader: R, output_bytes: usize) -> io::Result<String> {
394    STREAM_BUF.with(|cell| {
395        let mut buf = cell.borrow_mut();
396        let mut state = blake2b_simd::Params::new()
397            .hash_length(output_bytes)
398            .to_state();
399        loop {
400            let n = read_full(&mut reader, &mut buf)?;
401            if n == 0 {
402                break;
403            }
404            state.update(&buf[..n]);
405        }
406        Ok(hex_encode(state.finalize().as_bytes()))
407    })
408}
409
410/// Hash a file with BLAKE2b variable output length.
411/// Uses mmap for large files (zero-copy), single-read for small files,
412/// and streaming read as fallback.
413pub fn blake2b_hash_file(path: &Path, output_bytes: usize) -> io::Result<String> {
414    let file = open_noatime(path)?;
415    let metadata = file.metadata()?;
416    let file_size = metadata.len();
417
418    if metadata.file_type().is_file() && file_size == 0 {
419        return Ok(blake2b_hash_data(&[], output_bytes));
420    }
421
422    if file_size > 0 && metadata.file_type().is_file() {
423        // mmap for large files — zero-copy, eliminates multiple read() syscalls
424        if file_size >= SMALL_FILE_LIMIT {
425            #[cfg(target_os = "linux")]
426            if file_size >= FADVISE_MIN_SIZE {
427                use std::os::unix::io::AsRawFd;
428                unsafe {
429                    libc::posix_fadvise(
430                        file.as_raw_fd(),
431                        0,
432                        file_size as i64,
433                        libc::POSIX_FADV_SEQUENTIAL,
434                    );
435                }
436            }
437            if let Ok(mmap) = unsafe { memmap2::MmapOptions::new().populate().map(&file) } {
438                #[cfg(target_os = "linux")]
439                {
440                    let _ = mmap.advise(memmap2::Advice::Sequential);
441                    if file_size >= 2 * 1024 * 1024 {
442                        let _ = mmap.advise(memmap2::Advice::HugePage);
443                    }
444                }
445                return Ok(blake2b_hash_data(&mmap, output_bytes));
446            }
447        }
448        // Small files: single read into thread-local buffer, then single-shot hash
449        if file_size < SMALL_FILE_LIMIT {
450            return blake2b_hash_file_small(file, file_size as usize, output_bytes);
451        }
452    }
453
454    // Non-regular files or fallback: stream
455    #[cfg(target_os = "linux")]
456    if file_size >= FADVISE_MIN_SIZE {
457        use std::os::unix::io::AsRawFd;
458        unsafe {
459            libc::posix_fadvise(file.as_raw_fd(), 0, 0, libc::POSIX_FADV_SEQUENTIAL);
460        }
461    }
462    blake2b_hash_reader(file, output_bytes)
463}
464
465/// Hash a small file with BLAKE2b by reading it entirely into a thread-local buffer.
466#[inline]
467fn blake2b_hash_file_small(mut file: File, size: usize, output_bytes: usize) -> io::Result<String> {
468    SMALL_FILE_BUF.with(|cell| {
469        let mut buf = cell.borrow_mut();
470        buf.clear();
471        let cap = buf.capacity();
472        if cap < size {
473            buf.reserve(size - cap);
474        }
475        file.read_to_end(&mut buf)?;
476        Ok(blake2b_hash_data(&buf, output_bytes))
477    })
478}
479
480/// Hash stdin with BLAKE2b variable output length.
481/// Tries fadvise if stdin is a regular file (shell redirect), then streams.
482pub fn blake2b_hash_stdin(output_bytes: usize) -> io::Result<String> {
483    let stdin = io::stdin();
484    #[cfg(target_os = "linux")]
485    {
486        use std::os::unix::io::AsRawFd;
487        let fd = stdin.as_raw_fd();
488        let mut stat: libc::stat = unsafe { std::mem::zeroed() };
489        if unsafe { libc::fstat(fd, &mut stat) } == 0
490            && (stat.st_mode & libc::S_IFMT) == libc::S_IFREG
491            && stat.st_size > 0
492        {
493            unsafe {
494                libc::posix_fadvise(fd, 0, stat.st_size, libc::POSIX_FADV_SEQUENTIAL);
495            }
496        }
497    }
498    blake2b_hash_reader(stdin.lock(), output_bytes)
499}
500
/// Write one result line in GNU format: `hash`, a space, a mode character
/// (`*` when `binary`, otherwise a second space), `filename`, newline.
///
/// Uses raw byte writes to avoid std::fmt overhead. The filename is written
/// verbatim (no escaping is applied here).
pub fn print_hash(
    out: &mut impl Write,
    hash: &str,
    filename: &str,
    binary: bool,
) -> io::Result<()> {
    let separator: &[u8; 2] = if binary { b" *" } else { b"  " };
    out.write_all(hash.as_bytes())?;
    out.write_all(separator)?;
    out.write_all(filename.as_bytes())?;
    out.write_all(b"\n")
}
515
/// Write one result line in GNU format terminated by NUL instead of newline:
/// `hash`, a space, a mode character (`*` when `binary`, otherwise a second
/// space), `filename`, `\0`.
pub fn print_hash_zero(
    out: &mut impl Write,
    hash: &str,
    filename: &str,
    binary: bool,
) -> io::Result<()> {
    let separator: &[u8; 2] = if binary { b" *" } else { b"  " };
    out.write_all(hash.as_bytes())?;
    out.write_all(separator)?;
    out.write_all(filename.as_bytes())?;
    out.write_all(b"\0")
}
529
530/// Print hash result in BSD tag format: "ALGO (filename) = hash\n"
531pub fn print_hash_tag(
532    out: &mut impl Write,
533    algo: HashAlgorithm,
534    hash: &str,
535    filename: &str,
536) -> io::Result<()> {
537    out.write_all(algo.name().as_bytes())?;
538    out.write_all(b" (")?;
539    out.write_all(filename.as_bytes())?;
540    out.write_all(b") = ")?;
541    out.write_all(hash.as_bytes())?;
542    out.write_all(b"\n")
543}
544
545/// Print hash in BSD tag format with NUL terminator.
546pub fn print_hash_tag_zero(
547    out: &mut impl Write,
548    algo: HashAlgorithm,
549    hash: &str,
550    filename: &str,
551) -> io::Result<()> {
552    out.write_all(algo.name().as_bytes())?;
553    out.write_all(b" (")?;
554    out.write_all(filename.as_bytes())?;
555    out.write_all(b") = ")?;
556    out.write_all(hash.as_bytes())?;
557    out.write_all(b"\0")
558}
559
/// Write one BLAKE2b result line in BSD tag format, including the digest
/// length when it is not the full 512 bits:
/// "BLAKE2b (filename) = hash" for 512-bit, or
/// "BLAKE2b-256 (filename) = hash" (with the actual `bits`) otherwise.
pub fn print_hash_tag_b2sum(
    out: &mut impl Write,
    hash: &str,
    filename: &str,
    bits: usize,
) -> io::Result<()> {
    match bits {
        512 => out.write_all(b"BLAKE2b (")?,
        // Formatting machinery only on the rare non-512 path.
        other => write!(out, "BLAKE2b-{} (", other)?,
    }
    out.write_all(filename.as_bytes())?;
    out.write_all(b") = ")?;
    out.write_all(hash.as_bytes())?;
    out.write_all(b"\n")
}
580
/// Write one BLAKE2b result line in BSD tag format, terminated by NUL instead
/// of newline. The tag is "BLAKE2b" for 512-bit digests and "BLAKE2b-<bits>"
/// for any other length.
pub fn print_hash_tag_b2sum_zero(
    out: &mut impl Write,
    hash: &str,
    filename: &str,
    bits: usize,
) -> io::Result<()> {
    match bits {
        512 => out.write_all(b"BLAKE2b (")?,
        other => write!(out, "BLAKE2b-{} (", other)?,
    }
    out.write_all(filename.as_bytes())?;
    out.write_all(b") = ")?;
    out.write_all(hash.as_bytes())?;
    out.write_all(b"\0")
}
598
/// Options for check mode.
pub struct CheckOptions {
    /// Suppress the "<file>: OK" line for files that verify successfully.
    pub quiet: bool,
    /// Suppress all per-file output (OK, FAILED and read-error lines).
    pub status_only: bool,
    /// Not read by `check_file` itself.
    // NOTE(review): presumably consulted by the caller to turn format errors
    // into a failing exit status — confirm.
    pub strict: bool,
    /// Emit a warning on the error stream for each improperly formatted line.
    pub warn: bool,
    /// Skip (and count separately) files that cannot be opened because they
    /// do not exist, instead of reporting a read error.
    pub ignore_missing: bool,
    /// Prefix for per-line format warnings, e.g., "fmd5sum: checksums.txt".
    /// When non-empty, warnings use GNU format: "{prefix}: {line}: message".
    /// When empty, uses generic format: "line {line}: message".
    pub warn_prefix: String,
}
611
/// Result of check mode verification: per-category counts over all non-empty
/// lines of the checksum file.
pub struct CheckResult {
    /// Files whose computed hash matched the expected one.
    pub ok: usize,
    /// Files whose computed hash differed from the expected one.
    pub mismatches: usize,
    /// Lines that could not be parsed as a checksum line.
    pub format_errors: usize,
    /// Files that could not be opened or read (excluding ignored missing files).
    pub read_errors: usize,
    /// Number of files skipped because they were missing and --ignore-missing was set.
    pub ignored_missing: usize,
}
621
622/// Verify checksums from a check file.
623/// Each line should be "hash  filename" or "hash *filename" or "ALGO (filename) = hash".
624pub fn check_file<R: BufRead>(
625    algo: HashAlgorithm,
626    reader: R,
627    opts: &CheckOptions,
628    out: &mut impl Write,
629    err_out: &mut impl Write,
630) -> io::Result<CheckResult> {
631    let quiet = opts.quiet;
632    let status_only = opts.status_only;
633    let warn = opts.warn;
634    let ignore_missing = opts.ignore_missing;
635    let mut ok_count = 0;
636    let mut mismatch_count = 0;
637    let mut format_errors = 0;
638    let mut read_errors = 0;
639    let mut ignored_missing_count = 0;
640    let mut line_num = 0;
641
642    for line_result in reader.lines() {
643        line_num += 1;
644        let line = line_result?;
645        let line = line.trim_end();
646
647        if line.is_empty() {
648            continue;
649        }
650
651        // Parse "hash  filename" or "hash *filename" or "ALGO (file) = hash"
652        let (expected_hash, filename) = match parse_check_line(line) {
653            Some(v) => v,
654            None => {
655                format_errors += 1;
656                if warn {
657                    out.flush()?;
658                    if opts.warn_prefix.is_empty() {
659                        writeln!(
660                            err_out,
661                            "line {}: improperly formatted {} checksum line",
662                            line_num,
663                            algo.name()
664                        )?;
665                    } else {
666                        writeln!(
667                            err_out,
668                            "{}: {}: improperly formatted {} checksum line",
669                            opts.warn_prefix,
670                            line_num,
671                            algo.name()
672                        )?;
673                    }
674                }
675                continue;
676            }
677        };
678
679        // Compute actual hash
680        let actual = match hash_file(algo, Path::new(filename)) {
681            Ok(h) => h,
682            Err(e) => {
683                if ignore_missing && e.kind() == io::ErrorKind::NotFound {
684                    ignored_missing_count += 1;
685                    continue;
686                }
687                read_errors += 1;
688                if !status_only {
689                    out.flush()?;
690                    writeln!(err_out, "{}: {}", filename, e)?;
691                    writeln!(out, "{}: FAILED open or read", filename)?;
692                }
693                continue;
694            }
695        };
696
697        if actual.eq_ignore_ascii_case(expected_hash) {
698            ok_count += 1;
699            if !quiet && !status_only {
700                writeln!(out, "{}: OK", filename)?;
701            }
702        } else {
703            mismatch_count += 1;
704            if !status_only {
705                writeln!(out, "{}: FAILED", filename)?;
706            }
707        }
708    }
709
710    Ok(CheckResult {
711        ok: ok_count,
712        mismatches: mismatch_count,
713        format_errors,
714        read_errors,
715        ignored_missing: ignored_missing_count,
716    })
717}
718
/// Parse a checksum line in any supported format, returning
/// `(expected_hash, filename)` as slices of `line`, or `None` if the line
/// matches no format.
///
/// Supported formats:
/// - BSD tag: "ALGO (filename) = hash", where ALGO is `MD5`, `SHA256`,
///   `BLAKE2b` or `BLAKE2b-<digits>`;
/// - GNU text mode: "hash  filename";
/// - GNU binary mode: "hash *filename".
///
/// A single leading backslash (the marker for an escaped filename) is
/// accepted in front of either format. The filename is returned as written;
/// escape sequences inside it are not decoded. The hash is not validated
/// (neither its length nor that it is hexadecimal).
pub fn parse_check_line(line: &str) -> Option<(&str, &str)> {
    // The escape marker precedes the whole line, whatever the format.
    let line = line.strip_prefix('\\').unwrap_or(line);

    // Try BSD tag format: "ALGO (filename) = hash"
    let tagged = ["MD5 (", "SHA256 (", "BLAKE2b ("]
        .iter()
        .find_map(|tag| line.strip_prefix(*tag))
        .or_else(|| {
            // Handle BLAKE2b-NNN (filename) = hash
            let after = line.strip_prefix("BLAKE2b-")?;
            let (digits, rest) = after.split_once(" (")?;
            digits.bytes().all(|b| b.is_ascii_digit()).then_some(rest)
        });
    // Split on the LAST ") = ": the hash cannot contain that sequence, but
    // the filename can.
    if let Some((filename, hash)) = tagged.and_then(|rest| rest.rsplit_once(") = ")) {
        return Some((hash, filename));
    }

    // GNU format: the hash ends at the first space, and the byte after that
    // space selects the mode (' ' = text, '*' = binary). Anchoring on the
    // first space keeps filenames containing "  " or " *" intact.
    let (hash, rest) = line.split_once(' ')?;
    match rest.as_bytes().first() {
        // The mode byte is ASCII, so slicing at 1 is on a char boundary.
        Some(b' ') | Some(b'*') => Some((hash, &rest[1..])),
        _ => None,
    }
}
763
/// Parse a BSD-style tag line: "ALGO (filename) = hash".
///
/// Returns `(expected_hash, filename, optional_bits)` as slices of `line`, or
/// `None` if the line has no " (" or no ") = " after it. The algorithm name is
/// not validated. `bits` is the number after the last '-' in the algorithm
/// name (e.g., "BLAKE2b-256" -> `Some(256)`), or `None` when there is no '-'
/// or the suffix is not a number.
pub fn parse_check_line_tag(line: &str) -> Option<(&str, &str, Option<usize>)> {
    // The algorithm name contains no " (", so the first occurrence ends it.
    let (algo_part, rest) = line.split_once(" (")?;
    // Split on the LAST ") = ": the hash cannot contain that sequence, but
    // the filename can.
    let (filename, hash) = rest.rsplit_once(") = ")?;

    // Parse optional bit length from algo name (e.g., "BLAKE2b-256" -> Some(256))
    let bits = algo_part
        .rsplit_once('-')
        .and_then(|(_, suffix)| suffix.parse::<usize>().ok());

    Some((hash, filename, bits))
}
784
/// Read as many bytes as possible into `buf`, retrying on partial reads.
///
/// Keeps calling `read` until the buffer is full or the reader reports end of
/// input, so each hash update gets a full buffer (fewer update calls = less
/// overhead). For regular files the first `read` usually fills the buffer and
/// the loop runs exactly once.
///
/// Returns the number of bytes placed in `buf`; `0` means end of input (or an
/// empty `buf`). `ErrorKind::Interrupted` is retried on every read, including
/// the first; any other error is returned immediately.
#[inline]
fn read_full(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
    let mut total = 0;
    while total < buf.len() {
        match reader.read(&mut buf[total..]) {
            Ok(0) => break,
            Ok(n) => total += n,
            // A signal arrived before any data was transferred: just retry.
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(total)
}
807
/// Compile-time generated 2-byte hex pair lookup table.
/// Entry `i` holds the two lowercase ASCII hex digits of byte value `i`
/// (high nibble first), so encoding needs a single lookup per byte.
const fn generate_hex_table() -> [[u8; 2]; 256] {
    let hex = b"0123456789abcdef";
    let mut table = [[0u8; 2]; 256];
    // `while` rather than `for`: iterators are not usable in `const fn`.
    let mut i = 0;
    while i < 256 {
        table[i] = [hex[i >> 4], hex[i & 0xf]];
        i += 1;
    }
    table
}

/// Byte value -> its two lowercase hex digits; see `generate_hex_table`.
const HEX_TABLE: [[u8; 2]; 256] = generate_hex_table();

/// Fast lowercase hex encoding using the 2-byte pair lookup table — one lookup
/// per input byte. Returns a `String` of exactly `2 * bytes.len()` characters.
///
/// The output bytes are written straight into the spare capacity of the
/// buffer, and its length is set only afterwards, so no uninitialized byte is
/// ever exposed as part of the vector.
pub(crate) fn hex_encode(bytes: &[u8]) -> String {
    let len = bytes.len() * 2;
    let mut out: Vec<u8> = Vec::with_capacity(len);
    let ptr = out.as_mut_ptr();
    for (i, &b) in bytes.iter().enumerate() {
        // A `u8` index into a 256-entry table is always in bounds.
        let [hi, lo] = HEX_TABLE[usize::from(b)];
        // SAFETY: `i < bytes.len()`, so `2 * i + 1 < len <= out.capacity()`;
        // both writes land inside the allocation made above.
        unsafe {
            ptr.add(i * 2).write(hi);
            ptr.add(i * 2 + 1).write(lo);
        }
    }
    // SAFETY: the loop initialized exactly `len` bytes, and `len` does not
    // exceed the capacity requested above.
    unsafe {
        out.set_len(len);
    }
    // SAFETY: every byte comes from `HEX_TABLE`, which contains only ASCII
    // hex digits, so the buffer is valid UTF-8.
    unsafe { String::from_utf8_unchecked(out) }
}
coreutils_rs/hash/core.rs

coreutils_rs/hash/
core.rs