// coreutils_rs/hash/core.rs

use std::cell::RefCell;
use std::fs::File;
use std::io::{self, BufRead, Read, Write};
use std::path::Path;

#[cfg(target_os = "linux")]
use std::sync::atomic::{AtomicBool, Ordering};

#[cfg(not(target_os = "linux"))]
use digest::Digest;
#[cfg(not(target_os = "linux"))]
use md5::Md5;

/// Supported hash algorithms.
#[derive(Debug, Clone, Copy)]
pub enum HashAlgorithm {
    Sha256,
    Md5,
    Blake2b,
}

impl HashAlgorithm {
    pub fn name(self) -> &'static str {
        match self {
            HashAlgorithm::Sha256 => "SHA256",
            HashAlgorithm::Md5 => "MD5",
            HashAlgorithm::Blake2b => "BLAKE2b",
        }
    }
}

// ── Generic hash helpers ────────────────────────────────────────────

/// Single-shot hash using the Digest trait (non-Linux fallback).
#[cfg(not(target_os = "linux"))]
fn hash_digest<D: Digest>(data: &[u8]) -> String {
    hex_encode(&D::digest(data))
}

/// Streaming hash using thread-local buffer (non-Linux fallback).
#[cfg(not(target_os = "linux"))]
fn hash_reader_impl<D: Digest>(mut reader: impl Read) -> io::Result<String> {
    STREAM_BUF.with(|cell| {
        let mut buf = cell.borrow_mut();
        ensure_stream_buf(&mut buf);
        let mut hasher = D::new();
        loop {
            let n = read_full(&mut reader, &mut buf)?;
            if n == 0 {
                break;
            }
            hasher.update(&buf[..n]);
        }
        Ok(hex_encode(&hasher.finalize()))
    })
}

// ── Public hashing API ──────────────────────────────────────────────

/// Buffer size for streaming hash I/O.
/// 8MB: amortizes syscall overhead while still fitting in L3 cache on modern CPUs.
/// A larger buffer means fewer read() calls per file (e.g., a 100MB file takes
/// 13 reads with an 8MB buffer vs. 25 with a 4MB one).
const HASH_READ_BUF: usize = 8 * 1024 * 1024;

// Thread-local reusable buffer for streaming hash I/O.
// Allocated LAZILY (only on first streaming-hash call) to avoid 8MB cost for
// small-file-only workloads (e.g., "sha256sum *.txt" where every file is <1MB).
thread_local! {
    static STREAM_BUF: RefCell<Vec<u8>> = const { RefCell::new(Vec::new()) };
}

/// Ensure the streaming buffer is at least HASH_READ_BUF bytes.
/// Called only on the streaming path, so small-file workloads never allocate 8MB.
#[inline]
fn ensure_stream_buf(buf: &mut Vec<u8>) {
    if buf.len() < HASH_READ_BUF {
        buf.resize(HASH_READ_BUF, 0);
    }
}

// ── SHA-256 ───────────────────────────────────────────────────────────

/// Single-shot SHA-256 using OpenSSL's optimized assembly (SHA-NI on x86).
/// Linux only — OpenSSL is not available on Windows/macOS in CI.
#[cfg(target_os = "linux")]
fn sha256_bytes(data: &[u8]) -> String {
    let digest = openssl::hash::hash(openssl::hash::MessageDigest::sha256(), data)
        .expect("SHA256 hash failed");
    hex_encode(&digest)
}

/// Single-shot SHA-256 using ring's BoringSSL assembly (Windows and other non-Apple).
#[cfg(all(not(target_vendor = "apple"), not(target_os = "linux")))]
fn sha256_bytes(data: &[u8]) -> String {
    hex_encode(ring::digest::digest(&ring::digest::SHA256, data).as_ref())
}

/// Single-shot SHA-256 using sha2 crate (macOS fallback — ring doesn't compile on Apple Silicon).
#[cfg(target_vendor = "apple")]
fn sha256_bytes(data: &[u8]) -> String {
    hash_digest::<sha2::Sha256>(data)
}

/// Streaming SHA-256 using OpenSSL's optimized assembly.
/// Linux only — OpenSSL is not available on Windows/macOS in CI.
#[cfg(target_os = "linux")]
fn sha256_reader(mut reader: impl Read) -> io::Result<String> {
    STREAM_BUF.with(|cell| {
        let mut buf = cell.borrow_mut();
        ensure_stream_buf(&mut buf);
        let mut hasher = openssl::hash::Hasher::new(openssl::hash::MessageDigest::sha256())
            .map_err(io::Error::other)?;
        loop {
            let n = read_full(&mut reader, &mut buf)?;
            if n == 0 {
                break;
            }
            hasher.update(&buf[..n]).map_err(io::Error::other)?;
        }
        let digest = hasher.finish().map_err(io::Error::other)?;
        Ok(hex_encode(&digest))
    })
}

/// Streaming SHA-256 using ring's BoringSSL assembly (Windows and other non-Apple).
#[cfg(all(not(target_vendor = "apple"), not(target_os = "linux")))]
fn sha256_reader(mut reader: impl Read) -> io::Result<String> {
    STREAM_BUF.with(|cell| {
        let mut buf = cell.borrow_mut();
        ensure_stream_buf(&mut buf);
        let mut ctx = ring::digest::Context::new(&ring::digest::SHA256);
        loop {
            let n = read_full(&mut reader, &mut buf)?;
            if n == 0 {
                break;
            }
            ctx.update(&buf[..n]);
        }
        Ok(hex_encode(ctx.finish().as_ref()))
    })
}

/// Streaming SHA-256 using sha2 crate (macOS fallback).
#[cfg(target_vendor = "apple")]
fn sha256_reader(reader: impl Read) -> io::Result<String> {
    hash_reader_impl::<sha2::Sha256>(reader)
}

/// Compute hash of a byte slice directly (zero-copy fast path).
pub fn hash_bytes(algo: HashAlgorithm, data: &[u8]) -> String {
    match algo {
        HashAlgorithm::Sha256 => sha256_bytes(data),
        HashAlgorithm::Md5 => md5_bytes(data),
        HashAlgorithm::Blake2b => {
            let hash = blake2b_simd::blake2b(data);
            hex_encode(hash.as_bytes())
        }
    }
}
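
// Illustrative usage sketch (not part of the original module; compiled only for
// tests): single-shot hashing of an in-memory buffer. The hex output length is
// twice the digest size in bytes (SHA-256 -> 64 chars, MD5 -> 32, BLAKE2b-512 -> 128).
#[cfg(test)]
#[allow(dead_code)]
fn example_hash_bytes() {
    let data = b"hello world";
    assert_eq!(hash_bytes(HashAlgorithm::Sha256, data).len(), 64);
    assert_eq!(hash_bytes(HashAlgorithm::Md5, data).len(), 32);
    assert_eq!(hash_bytes(HashAlgorithm::Blake2b, data).len(), 128);
}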

// ── MD5 ─────────────────────────────────────────────────────────────

/// Single-shot MD5 using OpenSSL's optimized assembly (Linux).
#[cfg(target_os = "linux")]
fn md5_bytes(data: &[u8]) -> String {
    let digest =
        openssl::hash::hash(openssl::hash::MessageDigest::md5(), data).expect("MD5 hash failed");
    hex_encode(&digest)
}

/// Single-shot MD5 using md-5 crate (non-Linux fallback).
#[cfg(not(target_os = "linux"))]
fn md5_bytes(data: &[u8]) -> String {
    hash_digest::<Md5>(data)
}

/// Compute hash of data from a reader, returning hex string.
pub fn hash_reader<R: Read>(algo: HashAlgorithm, reader: R) -> io::Result<String> {
    match algo {
        HashAlgorithm::Sha256 => sha256_reader(reader),
        HashAlgorithm::Md5 => md5_reader(reader),
        HashAlgorithm::Blake2b => blake2b_hash_reader(reader, 64),
    }
}
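
// Illustrative sketch (assumption, test-only): streaming hash over any `Read`
// source; an in-memory cursor stands in for a pipe or socket here.
#[cfg(test)]
#[allow(dead_code)]
fn example_hash_reader() -> io::Result<()> {
    let reader = io::Cursor::new(b"streamed data".to_vec());
    let digest = hash_reader(HashAlgorithm::Sha256, reader)?;
    assert_eq!(digest.len(), 64);
    Ok(())
}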

/// Streaming MD5 using OpenSSL's optimized assembly (Linux).
#[cfg(target_os = "linux")]
fn md5_reader(mut reader: impl Read) -> io::Result<String> {
    STREAM_BUF.with(|cell| {
        let mut buf = cell.borrow_mut();
        ensure_stream_buf(&mut buf);
        let mut hasher = openssl::hash::Hasher::new(openssl::hash::MessageDigest::md5())
            .map_err(io::Error::other)?;
        loop {
            let n = read_full(&mut reader, &mut buf)?;
            if n == 0 {
                break;
            }
            hasher.update(&buf[..n]).map_err(io::Error::other)?;
        }
        let digest = hasher.finish().map_err(io::Error::other)?;
        Ok(hex_encode(&digest))
    })
}

/// Streaming MD5 using md-5 crate (non-Linux fallback).
#[cfg(not(target_os = "linux"))]
fn md5_reader(reader: impl Read) -> io::Result<String> {
    hash_reader_impl::<Md5>(reader)
}

/// Track whether O_NOATIME is supported to avoid repeated failed open() attempts.
/// After the first EPERM, we never try O_NOATIME again (saves one syscall per file).
#[cfg(target_os = "linux")]
static NOATIME_SUPPORTED: AtomicBool = AtomicBool::new(true);

/// Open a file with O_NOATIME on Linux to avoid atime update overhead.
/// Caches whether O_NOATIME works to avoid double-open on every file.
#[cfg(target_os = "linux")]
fn open_noatime(path: &Path) -> io::Result<File> {
    use std::os::unix::fs::OpenOptionsExt;
    if NOATIME_SUPPORTED.load(Ordering::Relaxed) {
        match std::fs::OpenOptions::new()
            .read(true)
            .custom_flags(libc::O_NOATIME)
            .open(path)
        {
            Ok(f) => return Ok(f),
            Err(ref e) if e.raw_os_error() == Some(libc::EPERM) => {
                // O_NOATIME requires file ownership or CAP_FOWNER — disable globally
                NOATIME_SUPPORTED.store(false, Ordering::Relaxed);
            }
            Err(e) => return Err(e), // Real error, propagate
        }
    }
    File::open(path)
}

#[cfg(not(target_os = "linux"))]
fn open_noatime(path: &Path) -> io::Result<File> {
    File::open(path)
}

/// Open a file and get its metadata in one step.
/// On Linux uses fstat directly on the fd to avoid an extra syscall layer.
#[cfg(target_os = "linux")]
#[inline]
fn open_and_stat(path: &Path) -> io::Result<(File, u64, bool)> {
    let file = open_noatime(path)?;
    let fd = {
        use std::os::unix::io::AsRawFd;
        file.as_raw_fd()
    };
    let mut stat: libc::stat = unsafe { std::mem::zeroed() };
    if unsafe { libc::fstat(fd, &mut stat) } != 0 {
        return Err(io::Error::last_os_error());
    }
    let is_regular = (stat.st_mode & libc::S_IFMT) == libc::S_IFREG;
    let size = stat.st_size as u64;
    Ok((file, size, is_regular))
}

#[cfg(not(target_os = "linux"))]
#[inline]
fn open_and_stat(path: &Path) -> io::Result<(File, u64, bool)> {
    let file = open_noatime(path)?;
    let metadata = file.metadata()?;
    Ok((file, metadata.len(), metadata.file_type().is_file()))
}

/// Minimum file size to issue fadvise hint (1MB).
/// For small files, the syscall overhead exceeds the readahead benefit.
#[cfg(target_os = "linux")]
const FADVISE_MIN_SIZE: u64 = 1024 * 1024;

/// Maximum file size for single-read hash optimization.
/// Files up to this size are read entirely into a thread-local buffer and hashed
/// with single-shot hash (avoids Hasher allocation + streaming overhead).
const SMALL_FILE_LIMIT: u64 = 1024 * 1024;

/// Threshold for tiny files that can be read into a stack buffer.
/// Below this size, we use a stack-allocated buffer + single read() syscall,
/// completely avoiding any heap allocation for the data path.
const TINY_FILE_LIMIT: u64 = 8 * 1024;

// Thread-local reusable buffer for small-file single-read hash.
// Avoids repeated allocation for many small files (e.g., 100 files of 1KB each).
thread_local! {
    static SMALL_FILE_BUF: RefCell<Vec<u8>> = RefCell::new(Vec::with_capacity(64 * 1024));
}

/// Hash a file by path. Uses mmap for large files (zero-copy, no read() syscalls),
/// single-read + single-shot hash for small files, and streaming read as fallback.
pub fn hash_file(algo: HashAlgorithm, path: &Path) -> io::Result<String> {
    let (file, file_size, is_regular) = open_and_stat(path)?;

    if is_regular && file_size == 0 {
        return Ok(hash_bytes(algo, &[]));
    }

    if file_size > 0 && is_regular {
        // Tiny files (<8KB): stack buffer + single read() — zero heap allocation
        if file_size < TINY_FILE_LIMIT {
            return hash_file_tiny(algo, file, file_size as usize);
        }
        // mmap for large files — zero-copy, eliminates multiple read() syscalls
        if file_size >= SMALL_FILE_LIMIT {
            #[cfg(target_os = "linux")]
            if file_size >= FADVISE_MIN_SIZE {
                use std::os::unix::io::AsRawFd;
                unsafe {
                    libc::posix_fadvise(
                        file.as_raw_fd(),
                        0,
                        file_size as i64,
                        libc::POSIX_FADV_SEQUENTIAL,
                    );
                }
            }
            if let Ok(mmap) = unsafe { memmap2::MmapOptions::new().populate().map(&file) } {
                #[cfg(target_os = "linux")]
                {
                    let _ = mmap.advise(memmap2::Advice::Sequential);
                    if file_size >= 2 * 1024 * 1024 {
                        let _ = mmap.advise(memmap2::Advice::HugePage);
                    }
                }
                return Ok(hash_bytes(algo, &mmap));
            }
        }
        // Small files (8KB..1MB): single read into thread-local buffer, then single-shot hash.
        // This avoids Hasher context allocation + streaming overhead for each file.
        if file_size < SMALL_FILE_LIMIT {
            return hash_file_small(algo, file, file_size as usize);
        }
    }

    // Non-regular files or fallback: stream
    #[cfg(target_os = "linux")]
    if file_size >= FADVISE_MIN_SIZE {
        use std::os::unix::io::AsRawFd;
        unsafe {
            libc::posix_fadvise(file.as_raw_fd(), 0, 0, libc::POSIX_FADV_SEQUENTIAL);
        }
    }
    hash_reader(algo, file)
}
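
// Illustrative sketch (hypothetical caller, test-only): the typical per-file flow
// of a sha256sum-style front end, hashing the path and emitting one GNU-format
// output line via write_hash_line (defined later in this module).
#[cfg(test)]
#[allow(dead_code)]
fn example_hash_one_file(path: &Path) -> io::Result<()> {
    let digest = hash_file(HashAlgorithm::Sha256, path)?;
    let mut out = io::stdout().lock();
    write_hash_line(&mut out, &digest, &path.to_string_lossy(), false, false, false)
}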

/// Hash a tiny file (<8KB) using a stack-allocated buffer.
/// Single read() syscall, zero heap allocation on the data path.
/// Optimal for the "100 small files" benchmark where per-file overhead dominates.
#[inline]
fn hash_file_tiny(algo: HashAlgorithm, mut file: File, size: usize) -> io::Result<String> {
    let mut buf = [0u8; 8192];
    let mut total = 0;
    // Read with known size — usually completes in a single read() for regular files
    while total < size {
        match file.read(&mut buf[total..size]) {
            Ok(0) => break,
            Ok(n) => total += n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(hash_bytes(algo, &buf[..total]))
}

/// Hash a small file by reading it entirely into a thread-local buffer,
/// then using the single-shot hash function. Avoids per-file Hasher allocation.
#[inline]
fn hash_file_small(algo: HashAlgorithm, mut file: File, size: usize) -> io::Result<String> {
    SMALL_FILE_BUF.with(|cell| {
        let mut buf = cell.borrow_mut();
        // Reset length but keep allocation, then grow if needed
        buf.clear();
        buf.reserve(size);
        // SAFETY: capacity >= size after clear+reserve. We read into the buffer
        // directly and only access buf[..total] where total <= size <= capacity.
        unsafe {
            buf.set_len(size);
        }
        let mut total = 0;
        while total < size {
            match file.read(&mut buf[total..size]) {
                Ok(0) => break,
                Ok(n) => total += n,
                Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                Err(e) => return Err(e),
            }
        }
        Ok(hash_bytes(algo, &buf[..total]))
    })
}

/// Hash stdin. Uses fadvise for file redirects, streaming for pipes.
pub fn hash_stdin(algo: HashAlgorithm) -> io::Result<String> {
    let stdin = io::stdin();
    // Hint kernel for sequential access if stdin is a regular file (redirect)
    #[cfg(target_os = "linux")]
    {
        use std::os::unix::io::AsRawFd;
        let fd = stdin.as_raw_fd();
        let mut stat: libc::stat = unsafe { std::mem::zeroed() };
        if unsafe { libc::fstat(fd, &mut stat) } == 0
            && (stat.st_mode & libc::S_IFMT) == libc::S_IFREG
            && stat.st_size > 0
        {
            unsafe {
                libc::posix_fadvise(fd, 0, stat.st_size, libc::POSIX_FADV_SEQUENTIAL);
            }
        }
    }
    // Streaming hash — works for both pipe and file-redirect stdin
    hash_reader(algo, stdin.lock())
}

/// Check if parallel hashing is worthwhile for the given file paths.
/// Always parallelize with 2+ files — rayon's thread pool is lazily initialized
/// once and reused, so per-file work-stealing overhead is negligible (~1µs).
/// Removing the stat()-based size check eliminates N extra syscalls for N files.
pub fn should_use_parallel(paths: &[&Path]) -> bool {
    paths.len() >= 2
}

/// Issue readahead hints for a list of file paths to warm the page cache.
/// Uses POSIX_FADV_WILLNEED which is non-blocking and batches efficiently.
/// Only issues hints for files >= 1MB; small files are read fast enough
/// that the fadvise syscall overhead isn't worth it.
#[cfg(target_os = "linux")]
pub fn readahead_files(paths: &[&Path]) {
    use std::os::unix::io::AsRawFd;
    for path in paths {
        if let Ok(file) = open_noatime(path) {
            if let Ok(meta) = file.metadata() {
                let len = meta.len();
                if meta.file_type().is_file() && len >= FADVISE_MIN_SIZE {
                    unsafe {
                        libc::posix_fadvise(
                            file.as_raw_fd(),
                            0,
                            len as i64,
                            libc::POSIX_FADV_WILLNEED,
                        );
                    }
                }
            }
        }
    }
}

#[cfg(not(target_os = "linux"))]
pub fn readahead_files(_paths: &[&Path]) {
    // No-op on non-Linux
}
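
// Illustrative sketch (hypothetical driver, test-only): one plausible pairing of
// the readahead hint with the hashing API above — warm the page cache for a batch
// of paths, then hash them sequentially.
#[cfg(test)]
#[allow(dead_code)]
fn example_readahead_then_hash(paths: &[&Path]) -> io::Result<Vec<String>> {
    readahead_files(paths);
    paths
        .iter()
        .map(|p| hash_file(HashAlgorithm::Sha256, p))
        .collect()
}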

// ── BLAKE2b variable-length functions (using blake2b_simd) ──────────

/// Hash raw data with BLAKE2b variable output length.
/// `output_bytes` is the output size in bytes (e.g., 32 for 256-bit).
pub fn blake2b_hash_data(data: &[u8], output_bytes: usize) -> String {
    let hash = blake2b_simd::Params::new()
        .hash_length(output_bytes)
        .hash(data);
    hex_encode(hash.as_bytes())
}
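
// Illustrative sketch (assumption, test-only): b2sum-style truncated output.
// 32 output bytes correspond to "BLAKE2b-256" and a 64-character hex string;
// 64 output bytes are the full 512-bit digest.
#[cfg(test)]
#[allow(dead_code)]
fn example_blake2b_lengths() {
    assert_eq!(blake2b_hash_data(b"abc", 32).len(), 64);
    assert_eq!(blake2b_hash_data(b"abc", 64).len(), 128);
}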

/// Hash a reader with BLAKE2b variable output length.
/// Uses thread-local buffer for cache-friendly streaming.
pub fn blake2b_hash_reader<R: Read>(mut reader: R, output_bytes: usize) -> io::Result<String> {
    STREAM_BUF.with(|cell| {
        let mut buf = cell.borrow_mut();
        ensure_stream_buf(&mut buf);
        let mut state = blake2b_simd::Params::new()
            .hash_length(output_bytes)
            .to_state();
        loop {
            let n = read_full(&mut reader, &mut buf)?;
            if n == 0 {
                break;
            }
            state.update(&buf[..n]);
        }
        Ok(hex_encode(state.finalize().as_bytes()))
    })
}

/// Hash a file with BLAKE2b variable output length.
/// Uses mmap for large files (zero-copy), single-read for small files,
/// and streaming read as fallback.
pub fn blake2b_hash_file(path: &Path, output_bytes: usize) -> io::Result<String> {
    let (file, file_size, is_regular) = open_and_stat(path)?;

    if is_regular && file_size == 0 {
        return Ok(blake2b_hash_data(&[], output_bytes));
    }

    if file_size > 0 && is_regular {
        // Tiny files (<8KB): stack buffer + single read() — zero heap allocation
        if file_size < TINY_FILE_LIMIT {
            return blake2b_hash_file_tiny(file, file_size as usize, output_bytes);
        }
        // mmap for large files — zero-copy, eliminates multiple read() syscalls
        if file_size >= SMALL_FILE_LIMIT {
            #[cfg(target_os = "linux")]
            if file_size >= FADVISE_MIN_SIZE {
                use std::os::unix::io::AsRawFd;
                unsafe {
                    libc::posix_fadvise(
                        file.as_raw_fd(),
                        0,
                        file_size as i64,
                        libc::POSIX_FADV_SEQUENTIAL,
                    );
                }
            }
            if let Ok(mmap) = unsafe { memmap2::MmapOptions::new().populate().map(&file) } {
                #[cfg(target_os = "linux")]
                {
                    let _ = mmap.advise(memmap2::Advice::Sequential);
                    if file_size >= 2 * 1024 * 1024 {
                        let _ = mmap.advise(memmap2::Advice::HugePage);
                    }
                }
                return Ok(blake2b_hash_data(&mmap, output_bytes));
            }
        }
        // Small files (8KB..1MB): single read into thread-local buffer, then single-shot hash
        if file_size < SMALL_FILE_LIMIT {
            return blake2b_hash_file_small(file, file_size as usize, output_bytes);
        }
    }

    // Non-regular files or fallback: stream
    #[cfg(target_os = "linux")]
    if file_size >= FADVISE_MIN_SIZE {
        use std::os::unix::io::AsRawFd;
        unsafe {
            libc::posix_fadvise(file.as_raw_fd(), 0, 0, libc::POSIX_FADV_SEQUENTIAL);
        }
    }
    blake2b_hash_reader(file, output_bytes)
}

/// Hash a tiny file (<8KB) with BLAKE2b using a stack-allocated buffer.
#[inline]
fn blake2b_hash_file_tiny(mut file: File, size: usize, output_bytes: usize) -> io::Result<String> {
    let mut buf = [0u8; 8192];
    let mut total = 0;
    while total < size {
        match file.read(&mut buf[total..size]) {
            Ok(0) => break,
            Ok(n) => total += n,
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(blake2b_hash_data(&buf[..total], output_bytes))
}

/// Hash a small file with BLAKE2b by reading it entirely into a thread-local buffer.
#[inline]
fn blake2b_hash_file_small(mut file: File, size: usize, output_bytes: usize) -> io::Result<String> {
    SMALL_FILE_BUF.with(|cell| {
        let mut buf = cell.borrow_mut();
        buf.clear();
        buf.reserve(size);
        // SAFETY: capacity >= size after clear+reserve
        unsafe {
            buf.set_len(size);
        }
        let mut total = 0;
        while total < size {
            match file.read(&mut buf[total..size]) {
                Ok(0) => break,
                Ok(n) => total += n,
                Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                Err(e) => return Err(e),
            }
        }
        Ok(blake2b_hash_data(&buf[..total], output_bytes))
    })
}

/// Hash stdin with BLAKE2b variable output length.
/// Tries fadvise if stdin is a regular file (shell redirect), then streams.
pub fn blake2b_hash_stdin(output_bytes: usize) -> io::Result<String> {
    let stdin = io::stdin();
    #[cfg(target_os = "linux")]
    {
        use std::os::unix::io::AsRawFd;
        let fd = stdin.as_raw_fd();
        let mut stat: libc::stat = unsafe { std::mem::zeroed() };
        if unsafe { libc::fstat(fd, &mut stat) } == 0
            && (stat.st_mode & libc::S_IFMT) == libc::S_IFREG
            && stat.st_size > 0
        {
            unsafe {
                libc::posix_fadvise(fd, 0, stat.st_size, libc::POSIX_FADV_SEQUENTIAL);
            }
        }
    }
    blake2b_hash_reader(stdin.lock(), output_bytes)
}

/// Print hash result in GNU format: "hash  filename\n"
/// Uses raw byte writes to avoid std::fmt overhead.
pub fn print_hash(
    out: &mut impl Write,
    hash: &str,
    filename: &str,
    binary: bool,
) -> io::Result<()> {
    let mode = if binary { b'*' } else { b' ' };
    out.write_all(hash.as_bytes())?;
    out.write_all(&[b' ', mode])?;
    out.write_all(filename.as_bytes())?;
    out.write_all(b"\n")
}

/// Print hash in GNU format with NUL terminator instead of newline.
pub fn print_hash_zero(
    out: &mut impl Write,
    hash: &str,
    filename: &str,
    binary: bool,
) -> io::Result<()> {
    let mode = if binary { b'*' } else { b' ' };
    out.write_all(hash.as_bytes())?;
    out.write_all(&[b' ', mode])?;
    out.write_all(filename.as_bytes())?;
    out.write_all(b"\0")
}

// ── Single-write output buffer ─────────────────────────────────────
// For multi-file workloads, batch the entire "hash  filename\n" line into
// a single write() call. This halves the number of BufWriter flushes.

// Thread-local output line buffer for batched writes.
// Reused across files to avoid per-file allocation.
thread_local! {
    static LINE_BUF: RefCell<Vec<u8>> = RefCell::new(Vec::with_capacity(256));
}

/// Build and write the standard GNU hash output line in a single write() call.
/// Format: "hash  filename\n" or "hash *filename\n" (binary mode).
/// For escaped filenames: "\hash  escaped_filename\n".
#[inline]
pub fn write_hash_line(
    out: &mut impl Write,
    hash: &str,
    filename: &str,
    binary: bool,
    zero: bool,
    escaped: bool,
) -> io::Result<()> {
    LINE_BUF.with(|cell| {
        let mut buf = cell.borrow_mut();
        buf.clear();
        let mode = if binary { b'*' } else { b' ' };
        let term = if zero { b'\0' } else { b'\n' };
        if escaped {
            buf.push(b'\\');
        }
        buf.extend_from_slice(hash.as_bytes());
        buf.push(b' ');
        buf.push(mode);
        buf.extend_from_slice(filename.as_bytes());
        buf.push(term);
        out.write_all(&buf)
    })
}
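
// Illustrative sketch (assumption, test-only): the exact bytes produced for the
// text ("  "), binary (" *"), and NUL-terminated output variants.
#[cfg(test)]
#[allow(dead_code)]
fn example_write_hash_line() -> io::Result<()> {
    let mut out = Vec::new();
    write_hash_line(&mut out, "abc123", "file.txt", false, false, false)?;
    assert_eq!(out, b"abc123  file.txt\n");
    out.clear();
    write_hash_line(&mut out, "abc123", "file.txt", true, true, false)?;
    assert_eq!(out, b"abc123 *file.txt\0");
    Ok(())
}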

/// Build and write BSD tag format output in a single write() call.
/// Format: "ALGO (filename) = hash\n"
#[inline]
pub fn write_hash_tag_line(
    out: &mut impl Write,
    algo_name: &str,
    hash: &str,
    filename: &str,
    zero: bool,
) -> io::Result<()> {
    LINE_BUF.with(|cell| {
        let mut buf = cell.borrow_mut();
        buf.clear();
        let term = if zero { b'\0' } else { b'\n' };
        buf.extend_from_slice(algo_name.as_bytes());
        buf.extend_from_slice(b" (");
        buf.extend_from_slice(filename.as_bytes());
        buf.extend_from_slice(b") = ");
        buf.extend_from_slice(hash.as_bytes());
        buf.push(term);
        out.write_all(&buf)
    })
}

/// Print hash result in BSD tag format: "ALGO (filename) = hash\n"
pub fn print_hash_tag(
    out: &mut impl Write,
    algo: HashAlgorithm,
    hash: &str,
    filename: &str,
) -> io::Result<()> {
    out.write_all(algo.name().as_bytes())?;
    out.write_all(b" (")?;
    out.write_all(filename.as_bytes())?;
    out.write_all(b") = ")?;
    out.write_all(hash.as_bytes())?;
    out.write_all(b"\n")
}

/// Print hash in BSD tag format with NUL terminator.
pub fn print_hash_tag_zero(
    out: &mut impl Write,
    algo: HashAlgorithm,
    hash: &str,
    filename: &str,
) -> io::Result<()> {
    out.write_all(algo.name().as_bytes())?;
    out.write_all(b" (")?;
    out.write_all(filename.as_bytes())?;
    out.write_all(b") = ")?;
    out.write_all(hash.as_bytes())?;
    out.write_all(b"\0")
}

/// Print hash in BSD tag format with BLAKE2b length info:
/// "BLAKE2b (filename) = hash" for 512-bit, or
/// "BLAKE2b-256 (filename) = hash" for other lengths.
pub fn print_hash_tag_b2sum(
    out: &mut impl Write,
    hash: &str,
    filename: &str,
    bits: usize,
) -> io::Result<()> {
    if bits == 512 {
        out.write_all(b"BLAKE2b (")?;
    } else {
        // Use write! for the rare non-512 path (negligible overhead per file)
        write!(out, "BLAKE2b-{} (", bits)?;
    }
    out.write_all(filename.as_bytes())?;
    out.write_all(b") = ")?;
    out.write_all(hash.as_bytes())?;
    out.write_all(b"\n")
}

/// Print hash in BSD tag format with BLAKE2b length info and NUL terminator.
pub fn print_hash_tag_b2sum_zero(
    out: &mut impl Write,
    hash: &str,
    filename: &str,
    bits: usize,
) -> io::Result<()> {
    if bits == 512 {
        out.write_all(b"BLAKE2b (")?;
    } else {
        write!(out, "BLAKE2b-{} (", bits)?;
    }
    out.write_all(filename.as_bytes())?;
    out.write_all(b") = ")?;
    out.write_all(hash.as_bytes())?;
    out.write_all(b"\0")
}

/// Options for check mode.
pub struct CheckOptions {
    pub quiet: bool,
    pub status_only: bool,
    pub strict: bool,
    pub warn: bool,
    pub ignore_missing: bool,
    /// Prefix for per-line format warnings, e.g., "fmd5sum: checksums.txt".
    /// When non-empty, warnings use GNU format: "{prefix}: {line}: message".
    /// When empty, uses generic format: "line {line}: message".
    pub warn_prefix: String,
}
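
// Illustrative sketch (assumption, test-only): a typical CheckOptions value for a
// "-c" style run that prints per-file results and warns about malformed lines.
#[cfg(test)]
#[allow(dead_code)]
fn example_check_options() -> CheckOptions {
    CheckOptions {
        quiet: false,
        status_only: false,
        strict: false,
        warn: true,
        ignore_missing: false,
        warn_prefix: String::new(),
    }
}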

/// Result of check mode verification.
pub struct CheckResult {
    pub ok: usize,
    pub mismatches: usize,
    pub format_errors: usize,
    pub read_errors: usize,
    /// Number of files skipped because they were missing and --ignore-missing was set.
    pub ignored_missing: usize,
}

/// Verify checksums from a check file.
/// Each line should be "hash  filename" or "hash *filename" or "ALGO (filename) = hash".
pub fn check_file<R: BufRead>(
    algo: HashAlgorithm,
    reader: R,
    opts: &CheckOptions,
    out: &mut impl Write,
    err_out: &mut impl Write,
) -> io::Result<CheckResult> {
    let quiet = opts.quiet;
    let status_only = opts.status_only;
    let warn = opts.warn;
    let ignore_missing = opts.ignore_missing;
    let mut ok_count = 0;
    let mut mismatch_count = 0;
    let mut format_errors = 0;
    let mut read_errors = 0;
    let mut ignored_missing_count = 0;
    let mut line_num = 0;

    for line_result in reader.lines() {
        line_num += 1;
        let line = line_result?;
        let line = line.trim_end();

        if line.is_empty() {
            continue;
        }

        // Parse "hash  filename" or "hash *filename" or "ALGO (file) = hash"
        let (expected_hash, filename) = match parse_check_line(line) {
            Some(v) => v,
            None => {
                format_errors += 1;
                if warn {
                    out.flush()?;
                    if opts.warn_prefix.is_empty() {
                        writeln!(
                            err_out,
                            "line {}: improperly formatted {} checksum line",
                            line_num,
                            algo.name()
                        )?;
                    } else {
                        writeln!(
                            err_out,
                            "{}: {}: improperly formatted {} checksum line",
                            opts.warn_prefix,
                            line_num,
                            algo.name()
                        )?;
                    }
                }
                continue;
            }
        };

        // Compute actual hash
        let actual = match hash_file(algo, Path::new(filename)) {
            Ok(h) => h,
            Err(e) => {
                if ignore_missing && e.kind() == io::ErrorKind::NotFound {
                    ignored_missing_count += 1;
                    continue;
                }
                read_errors += 1;
                if !status_only {
                    out.flush()?;
                    writeln!(err_out, "{}: {}", filename, e)?;
                    writeln!(out, "{}: FAILED open or read", filename)?;
                }
                continue;
            }
        };

        if actual.eq_ignore_ascii_case(expected_hash) {
            ok_count += 1;
            if !quiet && !status_only {
                writeln!(out, "{}: OK", filename)?;
            }
        } else {
            mismatch_count += 1;
            if !status_only {
                writeln!(out, "{}: FAILED", filename)?;
            }
        }
    }

    Ok(CheckResult {
        ok: ok_count,
        mismatches: mismatch_count,
        format_errors,
        read_errors,
        ignored_missing: ignored_missing_count,
    })
}
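
// Illustrative sketch (assumption, test-only): driving check_file from an
// in-memory checksum list; stdout/stderr are replaced by byte buffers, and a
// file assumed to be missing is tolerated via ignore_missing.
#[cfg(test)]
#[allow(dead_code)]
fn example_check_file() -> io::Result<CheckResult> {
    let list: &[u8] = b"d41d8cd98f00b204e9800998ecf8427e  some-missing-file";
    let opts = CheckOptions {
        quiet: false,
        status_only: false,
        strict: false,
        warn: false,
        ignore_missing: true,
        warn_prefix: String::new(),
    };
    let (mut out, mut err) = (Vec::<u8>::new(), Vec::<u8>::new());
    check_file(HashAlgorithm::Md5, list, &opts, &mut out, &mut err)
}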

/// Parse a checksum line in any supported format.
pub fn parse_check_line(line: &str) -> Option<(&str, &str)> {
    // Try BSD tag format: "ALGO (filename) = hash"
    let rest = line
        .strip_prefix("MD5 (")
        .or_else(|| line.strip_prefix("SHA256 ("))
        .or_else(|| line.strip_prefix("BLAKE2b ("))
        .or_else(|| {
            // Handle BLAKE2b-NNN (filename) = hash
            if line.starts_with("BLAKE2b-") {
                let after = &line["BLAKE2b-".len()..];
                if let Some(sp) = after.find(" (") {
                    if after[..sp].bytes().all(|b| b.is_ascii_digit()) {
                        return Some(&after[sp + 2..]);
                    }
                }
            }
            None
        });
    if let Some(rest) = rest {
        if let Some(paren_idx) = rest.find(") = ") {
            let filename = &rest[..paren_idx];
            let hash = &rest[paren_idx + 4..];
            return Some((hash, filename));
        }
    }

    // Handle backslash-escaped lines (leading '\')
    let line = line.strip_prefix('\\').unwrap_or(line);

    // Standard format: "hash  filename"
    if let Some(idx) = line.find("  ") {
        let hash = &line[..idx];
        let rest = &line[idx + 2..];
        return Some((hash, rest));
    }
    // Binary mode: "hash *filename"
    if let Some(idx) = line.find(" *") {
        let hash = &line[..idx];
        let rest = &line[idx + 2..];
        return Some((hash, rest));
    }
    None
}
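
// Illustrative sketch (assumption, test-only): the GNU and BSD line shapes parse
// to the same (hash, filename) pair.
#[cfg(test)]
#[allow(dead_code)]
fn example_parse_check_line() {
    let gnu = parse_check_line("abc123  data.bin");
    let bsd = parse_check_line("SHA256 (data.bin) = abc123");
    assert_eq!(gnu, Some(("abc123", "data.bin")));
    assert_eq!(gnu, bsd);
}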

/// Parse a BSD-style tag line: "ALGO (filename) = hash"
/// Returns (expected_hash, filename, optional_bits).
/// `bits` is the hash length parsed from the algo name (e.g., BLAKE2b-256 -> Some(256)).
pub fn parse_check_line_tag(line: &str) -> Option<(&str, &str, Option<usize>)> {
    let paren_start = line.find(" (")?;
    let algo_part = &line[..paren_start];
    let rest = &line[paren_start + 2..];
    let paren_end = rest.find(") = ")?;
    let filename = &rest[..paren_end];
    let hash = &rest[paren_end + 4..];

    // Parse optional bit length from algo name (e.g., "BLAKE2b-256" -> Some(256))
    let bits = if let Some(dash_pos) = algo_part.rfind('-') {
        algo_part[dash_pos + 1..].parse::<usize>().ok()
    } else {
        None
    };

    Some((hash, filename, bits))
}

/// Read as many bytes as possible into buf, retrying on partial reads.
/// Ensures each hash update gets a full buffer (fewer update calls = less overhead).
/// Fast path: regular file reads usually return the full buffer on the first call.
#[inline]
fn read_full(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
    // Fast path: first read() usually fills the entire buffer for regular files
    let n = reader.read(buf)?;
    if n == buf.len() || n == 0 {
        return Ok(n);
    }
    // Slow path: partial read — retry to fill buffer (pipes, slow devices)
    let mut total = n;
    while total < buf.len() {
        match reader.read(&mut buf[total..]) {
            Ok(0) => break,
            Ok(n) => total += n,
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(total)
}

/// Compile-time generated 2-byte hex pair lookup table.
/// Each byte maps directly to its 2-char hex representation — single lookup per byte.
const fn generate_hex_table() -> [[u8; 2]; 256] {
    let hex = b"0123456789abcdef";
    let mut table = [[0u8; 2]; 256];
    let mut i = 0;
    while i < 256 {
        table[i] = [hex[i >> 4], hex[i & 0xf]];
        i += 1;
    }
    table
}

const HEX_TABLE: [[u8; 2]; 256] = generate_hex_table();

/// Fast hex encoding using 2-byte pair lookup table — one lookup per input byte.
/// Uses String directly instead of Vec<u8> to avoid the from_utf8 conversion overhead.
pub(crate) fn hex_encode(bytes: &[u8]) -> String {
    let len = bytes.len() * 2;
    let mut hex = String::with_capacity(len);
    // SAFETY: We write exactly `len` valid ASCII hex bytes into the String's buffer.
    unsafe {
        let buf = hex.as_mut_vec();
        buf.set_len(len);
        hex_encode_to_slice(bytes, buf);
    }
    hex
}
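
// Illustrative sketch (assumption, test-only): each input byte becomes one
// two-character lowercase pair from HEX_TABLE, e.g. 0xab -> "ab".
#[cfg(test)]
#[allow(dead_code)]
fn example_hex_encode() {
    assert_eq!(hex_encode(&[0x00, 0xab, 0xff]), "00abff");
}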

/// Encode bytes as hex directly into a pre-allocated output slice.
/// Output slice must be at least `bytes.len() * 2` bytes long.
#[inline]
fn hex_encode_to_slice(bytes: &[u8], out: &mut [u8]) {
    // SAFETY: We write exactly bytes.len()*2 bytes into `out`, which must be large enough.
    unsafe {
        let ptr = out.as_mut_ptr();
        for (i, &b) in bytes.iter().enumerate() {
            let pair = *HEX_TABLE.get_unchecked(b as usize);
            *ptr.add(i * 2) = pair[0];
            *ptr.add(i * 2 + 1) = pair[1];
        }
    }
}