// coreutils_rs/hash/core.rs — hashing core (tiered-I/O digest implementations)

1use std::cell::RefCell;
2use std::fs::File;
3use std::io::{self, BufRead, Read, Write};
4use std::path::Path;
5
6use std::sync::atomic::AtomicUsize;
7#[cfg(target_os = "linux")]
8use std::sync::atomic::{AtomicBool, Ordering};
9
10use digest::Digest;
11use md5::Md5;
12
/// Supported hash algorithms.
#[derive(Debug, Clone, Copy)]
pub enum HashAlgorithm {
    Sha1,
    Sha224,
    Sha256,
    Sha384,
    Sha512,
    Md5,
    Blake2b,
}

impl HashAlgorithm {
    /// Canonical uppercase display name for this algorithm,
    /// as printed in GNU-style `ALGO (file) = digest` check lines.
    pub fn name(self) -> &'static str {
        match self {
            Self::Md5 => "MD5",
            Self::Sha1 => "SHA1",
            Self::Sha224 => "SHA224",
            Self::Sha256 => "SHA256",
            Self::Sha384 => "SHA384",
            Self::Sha512 => "SHA512",
            Self::Blake2b => "BLAKE2b",
        }
    }
}
38
39// ── Generic hash helpers ────────────────────────────────────────────
40
41/// Single-shot hash using the Digest trait.
42fn hash_digest<D: Digest>(data: &[u8]) -> String {
43    hex_encode(&D::digest(data))
44}
45
46/// Streaming hash using thread-local buffer via the Digest trait.
47fn hash_reader_impl<D: Digest>(mut reader: impl Read) -> io::Result<String> {
48    STREAM_BUF.with(|cell| {
49        let mut buf = cell.borrow_mut();
50        ensure_stream_buf(&mut buf);
51        let mut hasher = D::new();
52        loop {
53            let n = read_full(&mut reader, &mut buf)?;
54            if n == 0 {
55                break;
56            }
57            hasher.update(&buf[..n]);
58        }
59        Ok(hex_encode(&hasher.finalize()))
60    })
61}
62
// ── Public hashing API ──────────────────────────────────────────────

/// Buffer size for streaming hash I/O.
/// 128KB matches GNU coreutils' buffer size (BUFSIZE=131072), which works well
/// with kernel readahead: many moderate reads let the kernel pipeline I/O,
/// reducing latency vs fewer huge reads that stall until the buffer fills.
const HASH_READ_BUF: usize = 131072;

// Per-thread reusable buffer for streaming hash I/O. Deliberately starts
// empty: it is grown on first streaming call (ensure_stream_buf), so
// workloads that only hash small files never pay for the allocation.
thread_local! {
    static STREAM_BUF: RefCell<Vec<u8>> = const { RefCell::new(Vec::new()) };
}

/// Grow the streaming buffer to at least `HASH_READ_BUF` bytes.
/// Zero-fills the newly added region; never shrinks an already-grown buffer.
#[inline]
fn ensure_stream_buf(buf: &mut Vec<u8>) {
    let target = HASH_READ_BUF;
    if buf.len() < target {
        buf.resize(target, 0);
    }
}
86
// ── Ring-accelerated hash functions (non-Apple, non-Linux targets) ────
// ring provides BoringSSL assembly with SHA-NI/AVX2/NEON for Windows/FreeBSD.

/// Single-shot hash using ring::digest (non-Apple, non-Linux).
/// Infallible for the algorithms this module passes; the Result return
/// keeps the signature uniform with the other backends.
#[cfg(all(not(target_vendor = "apple"), not(target_os = "linux")))]
#[inline]
fn ring_hash_bytes(algo: &'static ring::digest::Algorithm, data: &[u8]) -> io::Result<String> {
    Ok(hex_encode(ring::digest::digest(algo, data).as_ref()))
}

/// Streaming hash using ring::digest::Context (non-Apple, non-Linux).
/// Reads through the thread-local STREAM_BUF; propagates reader I/O errors.
#[cfg(all(not(target_vendor = "apple"), not(target_os = "linux")))]
fn ring_hash_reader(
    algo: &'static ring::digest::Algorithm,
    mut reader: impl Read,
) -> io::Result<String> {
    STREAM_BUF.with(|cell| {
        let mut buf = cell.borrow_mut();
        ensure_stream_buf(&mut buf);
        let mut ctx = ring::digest::Context::new(algo);
        loop {
            // NOTE(review): read_full presumably fills as much of the buffer
            // as possible and returns 0 only at EOF — confirm at its definition.
            let n = read_full(&mut reader, &mut buf)?;
            if n == 0 {
                break;
            }
            ctx.update(&buf[..n]);
        }
        Ok(hex_encode(ctx.finish().as_ref()))
    })
}
117
// ── SHA-256 ───────────────────────────────────────────────────────────
// Linux/Apple: sha2 crate (cpufeatures runtime SHA-NI dispatch, no startup overhead)
// Windows/FreeBSD: ring (BoringSSL assembly)

/// One-shot SHA-256 of an in-memory slice (sha2 backend).
#[cfg(any(target_os = "linux", target_vendor = "apple"))]
fn sha256_bytes(data: &[u8]) -> io::Result<String> {
    Ok(hash_digest::<sha2::Sha256>(data))
}

/// One-shot SHA-256 of an in-memory slice (ring backend).
#[cfg(all(not(target_os = "linux"), not(target_vendor = "apple")))]
fn sha256_bytes(data: &[u8]) -> io::Result<String> {
    ring_hash_bytes(&ring::digest::SHA256, data)
}

/// Streaming SHA-256 over a reader (sha2 backend).
#[cfg(any(target_os = "linux", target_vendor = "apple"))]
fn sha256_reader(reader: impl Read) -> io::Result<String> {
    hash_reader_impl::<sha2::Sha256>(reader)
}

/// Streaming SHA-256 over a reader (ring backend).
#[cfg(all(not(target_os = "linux"), not(target_vendor = "apple")))]
fn sha256_reader(reader: impl Read) -> io::Result<String> {
    ring_hash_reader(&ring::digest::SHA256, reader)
}

// ── SHA-1 ─────────────────────────────────────────────────────────────

/// One-shot SHA-1 of an in-memory slice (sha1 crate backend).
#[cfg(any(target_os = "linux", target_vendor = "apple"))]
fn sha1_bytes(data: &[u8]) -> io::Result<String> {
    Ok(hash_digest::<sha1::Sha1>(data))
}

/// One-shot SHA-1 (ring backend; ring names SHA-1 "legacy use only").
#[cfg(all(not(target_os = "linux"), not(target_vendor = "apple")))]
fn sha1_bytes(data: &[u8]) -> io::Result<String> {
    ring_hash_bytes(&ring::digest::SHA1_FOR_LEGACY_USE_ONLY, data)
}

/// Streaming SHA-1 over a reader (sha1 crate backend).
#[cfg(any(target_os = "linux", target_vendor = "apple"))]
fn sha1_reader(reader: impl Read) -> io::Result<String> {
    hash_reader_impl::<sha1::Sha1>(reader)
}

/// Streaming SHA-1 over a reader (ring backend).
#[cfg(all(not(target_os = "linux"), not(target_vendor = "apple")))]
fn sha1_reader(reader: impl Read) -> io::Result<String> {
    ring_hash_reader(&ring::digest::SHA1_FOR_LEGACY_USE_ONLY, reader)
}

// ── SHA-224 ───────────────────────────────────────────────────────────
// ring does not support SHA-224. Use sha2 crate on all platforms.

/// One-shot SHA-224 of an in-memory slice (sha2 on every platform).
fn sha224_bytes(data: &[u8]) -> io::Result<String> {
    Ok(hash_digest::<sha2::Sha224>(data))
}

/// Streaming SHA-224 over a reader (sha2 on every platform).
fn sha224_reader(reader: impl Read) -> io::Result<String> {
    hash_reader_impl::<sha2::Sha224>(reader)
}

// ── SHA-384 ───────────────────────────────────────────────────────────

/// One-shot SHA-384 of an in-memory slice (sha2 backend).
#[cfg(any(target_os = "linux", target_vendor = "apple"))]
fn sha384_bytes(data: &[u8]) -> io::Result<String> {
    Ok(hash_digest::<sha2::Sha384>(data))
}

/// One-shot SHA-384 of an in-memory slice (ring backend).
#[cfg(all(not(target_os = "linux"), not(target_vendor = "apple")))]
fn sha384_bytes(data: &[u8]) -> io::Result<String> {
    ring_hash_bytes(&ring::digest::SHA384, data)
}

/// Streaming SHA-384 over a reader (sha2 backend).
#[cfg(any(target_os = "linux", target_vendor = "apple"))]
fn sha384_reader(reader: impl Read) -> io::Result<String> {
    hash_reader_impl::<sha2::Sha384>(reader)
}

/// Streaming SHA-384 over a reader (ring backend).
#[cfg(all(not(target_os = "linux"), not(target_vendor = "apple")))]
fn sha384_reader(reader: impl Read) -> io::Result<String> {
    ring_hash_reader(&ring::digest::SHA384, reader)
}

// ── SHA-512 ───────────────────────────────────────────────────────────

/// One-shot SHA-512 of an in-memory slice (sha2 backend).
#[cfg(any(target_os = "linux", target_vendor = "apple"))]
fn sha512_bytes(data: &[u8]) -> io::Result<String> {
    Ok(hash_digest::<sha2::Sha512>(data))
}

/// One-shot SHA-512 of an in-memory slice (ring backend).
#[cfg(all(not(target_os = "linux"), not(target_vendor = "apple")))]
fn sha512_bytes(data: &[u8]) -> io::Result<String> {
    ring_hash_bytes(&ring::digest::SHA512, data)
}

/// Streaming SHA-512 over a reader (sha2 backend).
#[cfg(any(target_os = "linux", target_vendor = "apple"))]
fn sha512_reader(reader: impl Read) -> io::Result<String> {
    hash_reader_impl::<sha2::Sha512>(reader)
}

/// Streaming SHA-512 over a reader (ring backend).
#[cfg(all(not(target_os = "linux"), not(target_vendor = "apple")))]
fn sha512_reader(reader: impl Read) -> io::Result<String> {
    ring_hash_reader(&ring::digest::SHA512, reader)
}
218
219/// Compute hash of a byte slice directly (zero-copy fast path).
220/// Returns an error if the underlying crypto library rejects the algorithm.
221pub fn hash_bytes(algo: HashAlgorithm, data: &[u8]) -> io::Result<String> {
222    match algo {
223        HashAlgorithm::Sha1 => sha1_bytes(data),
224        HashAlgorithm::Sha224 => sha224_bytes(data),
225        HashAlgorithm::Sha256 => sha256_bytes(data),
226        HashAlgorithm::Sha384 => sha384_bytes(data),
227        HashAlgorithm::Sha512 => sha512_bytes(data),
228        HashAlgorithm::Md5 => md5_bytes(data),
229        HashAlgorithm::Blake2b => {
230            let hash = blake2b_simd::blake2b(data);
231            Ok(hex_encode(hash.as_bytes()))
232        }
233    }
234}
235
/// Hash data and write hex result directly into an output buffer.
/// Returns the number of hex bytes written. Avoids String allocation
/// on the critical single-file fast path.
/// `out` must be at least 128 bytes for BLAKE2b/SHA512 (64 * 2), 96 for
/// SHA384, 64 for SHA256, 56 for SHA224, 40 for SHA1, 32 for MD5 —
/// presumably hex_encode_to_slice panics on a short slice; confirm there.
#[cfg(target_os = "linux")]
pub fn hash_bytes_to_buf(algo: HashAlgorithm, data: &[u8], out: &mut [u8]) -> io::Result<usize> {
    match algo {
        HashAlgorithm::Md5 => {
            // 16-byte digest -> 32 hex chars
            let digest = Md5::digest(data);
            hex_encode_to_slice(&digest, out);
            Ok(32)
        }
        HashAlgorithm::Sha1 => {
            // 20-byte digest -> 40 hex chars
            let digest = sha1::Sha1::digest(data);
            hex_encode_to_slice(&digest, out);
            Ok(40)
        }
        HashAlgorithm::Sha224 => {
            // 28-byte digest -> 56 hex chars
            let digest = sha2::Sha224::digest(data);
            hex_encode_to_slice(&digest, out);
            Ok(56)
        }
        HashAlgorithm::Sha256 => {
            // 32-byte digest -> 64 hex chars
            let digest = sha2::Sha256::digest(data);
            hex_encode_to_slice(&digest, out);
            Ok(64)
        }
        HashAlgorithm::Sha384 => {
            // 48-byte digest -> 96 hex chars
            let digest = sha2::Sha384::digest(data);
            hex_encode_to_slice(&digest, out);
            Ok(96)
        }
        HashAlgorithm::Sha512 => {
            // 64-byte digest -> 128 hex chars
            let digest = sha2::Sha512::digest(data);
            hex_encode_to_slice(&digest, out);
            Ok(128)
        }
        HashAlgorithm::Blake2b => {
            // Default BLAKE2b output; length taken from the hash itself.
            let hash = blake2b_simd::blake2b(data);
            let bytes = hash.as_bytes();
            hex_encode_to_slice(bytes, out);
            Ok(bytes.len() * 2)
        }
    }
}
281
/// Hash a single file using raw syscalls and write hex directly to output buffer.
/// Returns number of hex bytes written.
/// This is the absolute minimum-overhead path for single-file hashing:
/// raw open + fstat + read + hash + hex encode, with zero String allocation.
/// `out` sizing follows the hash_bytes_to_buf contract.
#[cfg(target_os = "linux")]
pub fn hash_file_raw_to_buf(algo: HashAlgorithm, path: &Path, out: &mut [u8]) -> io::Result<usize> {
    use std::os::unix::ffi::OsStrExt;

    // CString::new fails only when the path contains an interior NUL byte.
    let path_bytes = path.as_os_str().as_bytes();
    let c_path = std::ffi::CString::new(path_bytes)
        .map_err(|_| io::Error::new(io::ErrorKind::InvalidInput, "path contains null byte"))?;

    // Try O_NOATIME first (skips atime updates); the cached flag avoids
    // retrying on every file after the first EPERM (see NOATIME_SUPPORTED).
    let mut flags = libc::O_RDONLY | libc::O_CLOEXEC;
    if NOATIME_SUPPORTED.load(Ordering::Relaxed) {
        flags |= libc::O_NOATIME;
    }

    let fd = unsafe { libc::open(c_path.as_ptr(), flags) };
    if fd < 0 {
        let err = io::Error::last_os_error();
        // O_NOATIME needs file ownership or CAP_FOWNER; on EPERM disable it
        // globally and retry this one open without the flag.
        if err.raw_os_error() == Some(libc::EPERM) && flags & libc::O_NOATIME != 0 {
            NOATIME_SUPPORTED.store(false, Ordering::Relaxed);
            let fd2 = unsafe { libc::open(c_path.as_ptr(), libc::O_RDONLY | libc::O_CLOEXEC) };
            if fd2 < 0 {
                return Err(io::Error::last_os_error());
            }
            // hash_from_raw_fd_to_buf takes ownership of fd2 and closes it.
            return hash_from_raw_fd_to_buf(algo, fd2, out);
        }
        return Err(err);
    }
    hash_from_raw_fd_to_buf(algo, fd, out)
}
314
/// Hash from raw fd and write hex directly to output buffer.
/// Takes ownership of `fd`: it is closed on every path, including errors.
/// For tiny files (<8KB), the entire path is raw syscalls + stack buffer — zero heap.
/// For larger files, falls back to hash_regular_file / hash_reader (which
/// return a String) and copies the hex bytes into `out`.
#[cfg(target_os = "linux")]
fn hash_from_raw_fd_to_buf(algo: HashAlgorithm, fd: i32, out: &mut [u8]) -> io::Result<usize> {
    // SAFETY: libc::stat is plain-old-data; all-zeroes is a valid value
    // and fstat overwrites it on success.
    let mut stat: libc::stat = unsafe { std::mem::zeroed() };
    if unsafe { libc::fstat(fd, &mut stat) } != 0 {
        let err = io::Error::last_os_error();
        unsafe {
            libc::close(fd);
        }
        return Err(err);
    }
    let size = stat.st_size as u64;
    let is_regular = (stat.st_mode & libc::S_IFMT) == libc::S_IFREG;

    // Empty regular file: hash of the empty input, no reads needed.
    if is_regular && size == 0 {
        unsafe {
            libc::close(fd);
        }
        return hash_bytes_to_buf(algo, &[], out);
    }

    // Tiny files (<8KB): fully raw path — zero heap allocation.
    // TINY_FILE_LIMIT (8192) bounds `size`, so the stack buffer suffices.
    if is_regular && size < TINY_FILE_LIMIT {
        let mut buf = [0u8; 8192];
        let mut total = 0usize;
        // Loop until `size` bytes arrive; retry on EINTR, stop early on EOF
        // (the file may have shrunk between fstat and read).
        while total < size as usize {
            let n = unsafe {
                libc::read(
                    fd,
                    buf[total..].as_mut_ptr() as *mut libc::c_void,
                    (size as usize) - total,
                )
            };
            if n < 0 {
                let err = io::Error::last_os_error();
                if err.kind() == io::ErrorKind::Interrupted {
                    continue;
                }
                unsafe {
                    libc::close(fd);
                }
                return Err(err);
            }
            if n == 0 {
                break;
            }
            total += n as usize;
        }
        unsafe {
            libc::close(fd);
        }
        return hash_bytes_to_buf(algo, &buf[..total], out);
    }

    // Larger files: wrap the fd in a File (which now owns and will close it)
    // and reuse the tiered String-returning paths, then copy the hex out.
    use std::os::unix::io::FromRawFd;
    // SAFETY: `fd` is open and owned by this function; File takes over closing.
    let file = unsafe { File::from_raw_fd(fd) };
    let hash_str = if is_regular && size > 0 {
        hash_regular_file(algo, file, size)?
    } else {
        hash_reader(algo, file)?
    };
    // Panics if `out` is shorter than the hex digest — caller contract.
    let hex_bytes = hash_str.as_bytes();
    out[..hex_bytes.len()].copy_from_slice(hex_bytes);
    Ok(hex_bytes.len())
}
385
// ── MD5 ─────────────────────────────────────────────────────────────
// All platforms: md-5 crate (cpufeatures runtime dispatch on supported CPUs)

/// One-shot MD5 of an in-memory slice.
fn md5_bytes(data: &[u8]) -> io::Result<String> {
    Ok(hash_digest::<Md5>(data))
}

/// Streaming MD5 over a reader.
fn md5_reader(reader: impl Read) -> io::Result<String> {
    hash_reader_impl::<Md5>(reader)
}
396
397/// Compute hash of data from a reader, returning hex string.
398pub fn hash_reader<R: Read>(algo: HashAlgorithm, reader: R) -> io::Result<String> {
399    match algo {
400        HashAlgorithm::Sha1 => sha1_reader(reader),
401        HashAlgorithm::Sha224 => sha224_reader(reader),
402        HashAlgorithm::Sha256 => sha256_reader(reader),
403        HashAlgorithm::Sha384 => sha384_reader(reader),
404        HashAlgorithm::Sha512 => sha512_reader(reader),
405        HashAlgorithm::Md5 => md5_reader(reader),
406        HashAlgorithm::Blake2b => blake2b_hash_reader(reader, 64),
407    }
408}
409
/// Track whether O_NOATIME is supported to avoid repeated failed open() attempts.
/// After the first EPERM, we never try O_NOATIME again (saves one syscall per file).
/// Relaxed ordering suffices: this is a monotonic best-effort hint, not a lock.
#[cfg(target_os = "linux")]
static NOATIME_SUPPORTED: AtomicBool = AtomicBool::new(true);

/// Open a file with O_NOATIME on Linux to avoid atime update overhead.
/// Caches whether O_NOATIME works to avoid double-open on every file.
#[cfg(target_os = "linux")]
fn open_noatime(path: &Path) -> io::Result<File> {
    use std::os::unix::fs::OpenOptionsExt;
    if NOATIME_SUPPORTED.load(Ordering::Relaxed) {
        match std::fs::OpenOptions::new()
            .read(true)
            .custom_flags(libc::O_NOATIME)
            .open(path)
        {
            Ok(f) => return Ok(f),
            Err(ref e) if e.raw_os_error() == Some(libc::EPERM) => {
                // O_NOATIME requires file ownership or CAP_FOWNER — disable globally
                // and fall through to the plain open below.
                NOATIME_SUPPORTED.store(false, Ordering::Relaxed);
            }
            Err(e) => return Err(e), // Real error, propagate
        }
    }
    File::open(path)
}

/// Non-Linux fallback: plain open (O_NOATIME is Linux-specific).
#[cfg(not(target_os = "linux"))]
fn open_noatime(path: &Path) -> io::Result<File> {
    File::open(path)
}
441
/// Open a file and get its metadata in one step.
/// Returns `(file, size_in_bytes, is_regular_file)`.
/// On Linux uses fstat directly on the fd to avoid an extra syscall layer.
#[cfg(target_os = "linux")]
#[inline]
fn open_and_stat(path: &Path) -> io::Result<(File, u64, bool)> {
    let file = open_noatime(path)?;
    let fd = {
        use std::os::unix::io::AsRawFd;
        file.as_raw_fd()
    };
    // SAFETY: libc::stat is plain-old-data; zeroes are a valid initial value.
    let mut stat: libc::stat = unsafe { std::mem::zeroed() };
    if unsafe { libc::fstat(fd, &mut stat) } != 0 {
        return Err(io::Error::last_os_error());
    }
    let is_regular = (stat.st_mode & libc::S_IFMT) == libc::S_IFREG;
    let size = stat.st_size as u64;
    Ok((file, size, is_regular))
}

/// Portable fallback: one metadata() call on the already-open handle.
#[cfg(not(target_os = "linux"))]
#[inline]
fn open_and_stat(path: &Path) -> io::Result<(File, u64, bool)> {
    let file = open_noatime(path)?;
    let metadata = file.metadata()?;
    Ok((file, metadata.len(), metadata.file_type().is_file()))
}
468
/// Minimum file size to issue fadvise hint (1MB).
/// For small files, the syscall overhead exceeds the readahead benefit.
#[cfg(target_os = "linux")]
const FADVISE_MIN_SIZE: u64 = 1024 * 1024;

/// Maximum file size for single-read hash optimization.
/// Files up to this size are read entirely into a thread-local buffer and hashed
/// with single-shot hash. This avoids mmap/munmap overhead (~100µs each) and
/// MAP_POPULATE page faults (~300ns/page). The thread-local buffer is reused
/// across files in sequential mode, saving re-allocation.
/// 16MB covers typical benchmark files (10MB) while keeping memory usage bounded.
const SMALL_FILE_LIMIT: u64 = 16 * 1024 * 1024;

/// Threshold for tiny files that can be read into a stack buffer.
/// Below this size, we use a stack-allocated buffer + single read() syscall,
/// completely avoiding any heap allocation for the data path.
/// NOTE: must stay <= the 8192-byte stack buffers in hash_file_tiny and
/// hash_from_raw_fd_to_buf, which rely on this bound for indexing.
const TINY_FILE_LIMIT: u64 = 8 * 1024;

// Thread-local reusable buffer for single-read hash (see hash_file_small).
// Grows lazily up to SMALL_FILE_LIMIT (16MB). Initial 64KB allocation
// handles tiny files; larger files trigger one grow that persists for reuse.
thread_local! {
    static SMALL_FILE_BUF: RefCell<Vec<u8>> = RefCell::new(Vec::with_capacity(64 * 1024));
}
493
494/// Optimized hash for large files (>=16MB) on Linux.
495/// Hash large files (>=16MB) using streaming I/O with fadvise + ring Context.
496/// Uses sequential fadvise hint for kernel readahead, then streams through
497/// hash context in large chunks. For large files (>64MB), uses double-buffered
498/// reader thread to overlap I/O and hashing.
499#[cfg(target_os = "linux")]
500fn hash_file_pipelined(algo: HashAlgorithm, file: File, file_size: u64) -> io::Result<String> {
501    // For very large files, double-buffered reader thread overlaps I/O and CPU.
502    // For medium files, single-thread streaming is faster (avoids thread overhead).
503    if file_size >= 64 * 1024 * 1024 {
504        hash_file_pipelined_read(algo, file, file_size)
505    } else {
506        hash_file_streaming(algo, file, file_size)
507    }
508}
509
/// Simple single-thread streaming hash with fadvise.
/// Optimal for files 16-64MB where thread overhead exceeds I/O overlap benefit.
#[cfg(target_os = "linux")]
fn hash_file_streaming(algo: HashAlgorithm, file: File, file_size: u64) -> io::Result<String> {
    use std::os::unix::io::AsRawFd;

    // Tell the kernel we will read sequentially so it enlarges readahead;
    // failure is ignored because fadvise is purely a hint.
    let _ = unsafe {
        libc::posix_fadvise(
            file.as_raw_fd(),
            0,
            file_size as i64,
            libc::POSIX_FADV_SEQUENTIAL,
        )
    };

    hash_reader(algo, file)
}
527
528/// Streaming fallback for large files when mmap is unavailable.
529/// Uses double-buffered reader thread with fadvise hints.
530/// Fixed: uses blocking recv() to eliminate triple-buffer allocation bug.
531#[cfg(target_os = "linux")]
532fn hash_file_pipelined_read(
533    algo: HashAlgorithm,
534    mut file: File,
535    file_size: u64,
536) -> io::Result<String> {
537    use std::os::unix::io::AsRawFd;
538
539    const PIPE_BUF_SIZE: usize = 4 * 1024 * 1024; // 4MB per buffer
540
541    let _ = unsafe {
542        libc::posix_fadvise(
543            file.as_raw_fd(),
544            0,
545            file_size as i64,
546            libc::POSIX_FADV_SEQUENTIAL,
547        )
548    };
549
550    let (tx, rx) = std::sync::mpsc::sync_channel::<(Vec<u8>, usize)>(1);
551    let (buf_tx, buf_rx) = std::sync::mpsc::sync_channel::<Vec<u8>>(1);
552    let _ = buf_tx.send(vec![0u8; PIPE_BUF_SIZE]);
553
554    let reader_handle = std::thread::spawn(move || -> io::Result<()> {
555        while let Ok(mut buf) = buf_rx.recv() {
556            let mut total = 0;
557            while total < buf.len() {
558                match file.read(&mut buf[total..]) {
559                    Ok(0) => break,
560                    Ok(n) => total += n,
561                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
562                    Err(e) => return Err(e),
563                }
564            }
565            if total == 0 {
566                break;
567            }
568            if tx.send((buf, total)).is_err() {
569                break;
570            }
571        }
572        Ok(())
573    });
574
575    // Use Digest trait for all hash algorithms.
576    macro_rules! hash_pipelined_digest {
577        ($hasher_init:expr) => {{
578            let mut hasher = $hasher_init;
579            while let Ok((buf, n)) = rx.recv() {
580                hasher.update(&buf[..n]);
581                let _ = buf_tx.send(buf);
582            }
583            Ok(hex_encode(&hasher.finalize()))
584        }};
585    }
586
587    let hash_result: io::Result<String> = match algo {
588        HashAlgorithm::Blake2b => {
589            let mut state = blake2b_simd::Params::new().to_state();
590            while let Ok((buf, n)) = rx.recv() {
591                state.update(&buf[..n]);
592                let _ = buf_tx.send(buf);
593            }
594            Ok(hex_encode(state.finalize().as_bytes()))
595        }
596        HashAlgorithm::Md5 => hash_pipelined_digest!(Md5::new()),
597        HashAlgorithm::Sha1 => hash_pipelined_digest!(sha1::Sha1::new()),
598        HashAlgorithm::Sha224 => hash_pipelined_digest!(sha2::Sha224::new()),
599        HashAlgorithm::Sha256 => hash_pipelined_digest!(sha2::Sha256::new()),
600        HashAlgorithm::Sha384 => hash_pipelined_digest!(sha2::Sha384::new()),
601        HashAlgorithm::Sha512 => hash_pipelined_digest!(sha2::Sha512::new()),
602    };
603
604    match reader_handle.join() {
605        Ok(Ok(())) => {}
606        Ok(Err(e)) => {
607            if hash_result.is_ok() {
608                return Err(e);
609            }
610        }
611        Err(payload) => {
612            let msg = if let Some(s) = payload.downcast_ref::<&str>() {
613                format!("reader thread panicked: {}", s)
614            } else if let Some(s) = payload.downcast_ref::<String>() {
615                format!("reader thread panicked: {}", s)
616            } else {
617                "reader thread panicked".to_string()
618            };
619            return Err(io::Error::other(msg));
620        }
621    }
622
623    hash_result
624}
625
/// Hash a known-regular file using tiered I/O strategy based on size.
/// - Large (>=16MB): mmap with Sequential/PopulateRead hints, pipelined fallback
/// - Small/Medium (8KB-16MB): single read into thread-local buffer + single-shot hash
///
/// SAFETY: mmap is safe for regular local files opened just above. The fallback
/// to streaming I/O (hash_reader/hash_file_pipelined) handles mmap failures at
/// map time, but cannot protect against post-map truncation. If the file is
/// truncated or backing storage disappears after mapping (e.g. NFS), the kernel
/// delivers SIGBUS — acceptable, matching other mmap tools.
fn hash_regular_file(algo: HashAlgorithm, file: File, file_size: u64) -> io::Result<String> {
    // Large files (>=SMALL_FILE_LIMIT): mmap for zero-copy single-shot hash.
    if file_size >= SMALL_FILE_LIMIT {
        let mmap_result = unsafe { memmap2::MmapOptions::new().map(&file) };
        if let Ok(mmap) = mmap_result {
            #[cfg(target_os = "linux")]
            {
                // All advise calls are hints; errors are intentionally ignored.
                let _ = mmap.advise(memmap2::Advice::Sequential);
                // PopulateRead (Linux 5.14+) synchronously faults all pages into
                // TLB before returning. This costs ~200µs/GB but eliminates TLB
                // miss stalls during the hash computation, which is net positive
                // for files that fit comfortably in page cache. Older kernels
                // reject it; WillNeed is the asynchronous fallback hint.
                if mmap.advise(memmap2::Advice::PopulateRead).is_err() {
                    let _ = mmap.advise(memmap2::Advice::WillNeed);
                }
            }
            return hash_bytes(algo, &mmap);
        }
        // mmap failed — fall back to streaming I/O
        #[cfg(target_os = "linux")]
        {
            return hash_file_pipelined(algo, file, file_size);
        }
        #[cfg(not(target_os = "linux"))]
        {
            return hash_reader(algo, file);
        }
    }
    // Small/medium files (8KB-16MB): single read into thread-local buffer.
    // One read() + one single-shot hash call. The thread-local buffer grows
    // lazily and persists across files, so allocation cost is amortized.
    // This outperforms streaming (128KB chunks × N syscalls × N trait dispatches)
    // for files that fit comfortably in the page cache.
    #[cfg(target_os = "linux")]
    {
        use std::os::unix::io::AsRawFd;
        let _ = unsafe {
            libc::posix_fadvise(
                file.as_raw_fd(),
                0,
                file_size as i64,
                libc::POSIX_FADV_SEQUENTIAL,
            )
        };
    }
    hash_file_small(algo, file, file_size as usize)
}
682
/// Hash a file by path. Uses tiered I/O strategy for regular files,
/// streaming read for non-regular files.
/// Tiers: empty → constant hash, <8KB → stack buffer, 8KB-16MB →
/// thread-local buffer, >=16MB → mmap/pipelined (see hash_regular_file).
pub fn hash_file(algo: HashAlgorithm, path: &Path) -> io::Result<String> {
    let (file, file_size, is_regular) = open_and_stat(path)?;

    // Empty regular file: hash of the empty input, skip all reads.
    if is_regular && file_size == 0 {
        return hash_bytes(algo, &[]);
    }

    if file_size > 0 && is_regular {
        if file_size < TINY_FILE_LIMIT {
            return hash_file_tiny(algo, file, file_size as usize);
        }
        return hash_regular_file(algo, file, file_size);
    }

    // Non-regular files or fallback: stream
    #[cfg(target_os = "linux")]
    if file_size >= FADVISE_MIN_SIZE {
        use std::os::unix::io::AsRawFd;
        // Best-effort readahead hint; errors deliberately ignored.
        let _ = unsafe {
            libc::posix_fadvise(
                file.as_raw_fd(),
                0,
                file_size as i64,
                libc::POSIX_FADV_SEQUENTIAL,
            )
        };
    }
    hash_reader(algo, file)
}
714
715/// Hash a tiny file (<8KB) using a stack-allocated buffer.
716/// Single read() syscall, zero heap allocation on the data path.
717/// Optimal for the "100 small files" benchmark where per-file overhead dominates.
718#[inline]
719fn hash_file_tiny(algo: HashAlgorithm, mut file: File, size: usize) -> io::Result<String> {
720    let mut buf = [0u8; 8192];
721    let mut total = 0;
722    // Read with known size — usually completes in a single read() for regular files
723    while total < size {
724        match file.read(&mut buf[total..size]) {
725            Ok(0) => break,
726            Ok(n) => total += n,
727            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
728            Err(e) => return Err(e),
729        }
730    }
731    hash_bytes(algo, &buf[..total])
732}
733
734/// Hash a small file by reading it entirely into a thread-local buffer,
735/// then using the single-shot hash function. Avoids per-file Hasher allocation.
736#[inline]
737fn hash_file_small(algo: HashAlgorithm, mut file: File, size: usize) -> io::Result<String> {
738    SMALL_FILE_BUF.with(|cell| {
739        let mut buf = cell.borrow_mut();
740        // Reset length but keep allocation, then grow if needed
741        buf.clear();
742        buf.reserve(size);
743        // SAFETY: capacity >= size after clear+reserve. We read into the buffer
744        // directly and only access buf[..total] where total <= size <= capacity.
745        unsafe {
746            buf.set_len(size);
747        }
748        let mut total = 0;
749        while total < size {
750            match file.read(&mut buf[total..size]) {
751                Ok(0) => break,
752                Ok(n) => total += n,
753                Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
754                Err(e) => return Err(e),
755            }
756        }
757        hash_bytes(algo, &buf[..total])
758    })
759}
760
/// Hash stdin. Uses fadvise for file redirects, streaming for pipes.
/// Never buffers stdin wholesale, so unbounded pipe input is fine.
pub fn hash_stdin(algo: HashAlgorithm) -> io::Result<String> {
    let stdin = io::stdin();
    // Hint kernel for sequential access if stdin is a regular file (redirect)
    #[cfg(target_os = "linux")]
    {
        use std::os::unix::io::AsRawFd;
        let fd = stdin.as_raw_fd();
        // SAFETY: zeroed libc::stat is a valid POD value; fstat fills it.
        let mut stat: libc::stat = unsafe { std::mem::zeroed() };
        if unsafe { libc::fstat(fd, &mut stat) } == 0
            && (stat.st_mode & libc::S_IFMT) == libc::S_IFREG
            && stat.st_size > 0
        {
            // Hint only; the return value is intentionally ignored.
            unsafe {
                libc::posix_fadvise(fd, 0, stat.st_size, libc::POSIX_FADV_SEQUENTIAL);
            }
        }
    }
    // Streaming hash — works for both pipe and file-redirect stdin
    hash_reader(algo, stdin.lock())
}
782
/// Decide whether the given batch of paths should be hashed in parallel.
/// Any batch of two or more files parallelizes: rayon's thread pool is
/// lazily initialized once and reused, so per-file work-stealing overhead
/// is negligible (~1µs), and skipping a stat()-based size heuristic saves
/// N extra syscalls for N files.
pub fn should_use_parallel(paths: &[&Path]) -> bool {
    paths.len() > 1
}
790
/// Issue readahead hints for a list of file paths to warm the page cache.
/// Uses POSIX_FADV_WILLNEED which is non-blocking and batches efficiently.
/// Only issues hints for files >= 1MB; small files are read fast enough
/// that the fadvise syscall overhead isn't worth it.
/// Unreadable paths and non-files are silently skipped — this whole pass
/// is best-effort.
#[cfg(target_os = "linux")]
pub fn readahead_files(paths: &[&Path]) {
    use std::os::unix::io::AsRawFd;
    for path in paths {
        if let Ok(file) = open_noatime(path) {
            if let Ok(meta) = file.metadata() {
                let len = meta.len();
                if meta.file_type().is_file() && len >= FADVISE_MIN_SIZE {
                    // Hint only; return value intentionally ignored. The
                    // kernel continues readahead after the fd closes below.
                    unsafe {
                        libc::posix_fadvise(
                            file.as_raw_fd(),
                            0,
                            len as i64,
                            libc::POSIX_FADV_WILLNEED,
                        );
                    }
                }
            }
        }
    }
}
816
#[cfg(not(target_os = "linux"))]
pub fn readahead_files(_paths: &[&Path]) {
    // Readahead hints are Linux-specific (posix_fadvise tuning); other
    // platforms simply do nothing.
}
821
822// --- BLAKE2b variable-length functions (using blake2b_simd) ---
823
824/// Hash raw data with BLAKE2b variable output length.
825/// `output_bytes` is the output size in bytes (e.g., 32 for 256-bit).
826pub fn blake2b_hash_data(data: &[u8], output_bytes: usize) -> String {
827    let hash = blake2b_simd::Params::new()
828        .hash_length(output_bytes)
829        .hash(data);
830    hex_encode(hash.as_bytes())
831}
832
833/// Hash a reader with BLAKE2b variable output length.
834/// Uses thread-local buffer for cache-friendly streaming.
835pub fn blake2b_hash_reader<R: Read>(mut reader: R, output_bytes: usize) -> io::Result<String> {
836    STREAM_BUF.with(|cell| {
837        let mut buf = cell.borrow_mut();
838        ensure_stream_buf(&mut buf);
839        let mut state = blake2b_simd::Params::new()
840            .hash_length(output_bytes)
841            .to_state();
842        loop {
843            let n = read_full(&mut reader, &mut buf)?;
844            if n == 0 {
845                break;
846            }
847            state.update(&buf[..n]);
848        }
849        Ok(hex_encode(state.finalize().as_bytes()))
850    })
851}
852
/// Hash a file with BLAKE2b variable output length.
/// Uses mmap for large files (zero-copy), single-read for small files,
/// and streaming read as fallback.
///
/// Size-based dispatch (thresholds are module-level constants):
/// - empty regular file: hash of empty input, no further I/O;
/// - < TINY_FILE_LIMIT: stack-buffer single read (`blake2b_hash_file_tiny`);
/// - >= SMALL_FILE_LIMIT: pipelined path on Linux, mmap elsewhere;
/// - in between: thread-local buffer read (`blake2b_hash_file_small`);
/// - non-regular files (pipes, devices) and mmap failures: streaming.
///
/// # Errors
/// Returns any I/O error from opening, statting, or reading the file.
pub fn blake2b_hash_file(path: &Path, output_bytes: usize) -> io::Result<String> {
    let (file, file_size, is_regular) = open_and_stat(path)?;

    if is_regular && file_size == 0 {
        return Ok(blake2b_hash_data(&[], output_bytes));
    }

    if file_size > 0 && is_regular {
        // Tiny files (<8KB): stack buffer + single read() — zero heap allocation
        if file_size < TINY_FILE_LIMIT {
            return blake2b_hash_file_tiny(file, file_size as usize, output_bytes);
        }
        // Large files (>=16MB): I/O pipelining on Linux, mmap on other platforms
        if file_size >= SMALL_FILE_LIMIT {
            #[cfg(target_os = "linux")]
            {
                return blake2b_hash_file_pipelined(file, file_size, output_bytes);
            }
            #[cfg(not(target_os = "linux"))]
            {
                let mmap_result = unsafe { memmap2::MmapOptions::new().map(&file) };
                if let Ok(mmap) = mmap_result {
                    return Ok(blake2b_hash_data(&mmap, output_bytes));
                }
                // mmap failed: fall through to the streaming path below.
            }
        }
        // Small files (8KB..16MB): single read into thread-local buffer, then single-shot hash
        if file_size < SMALL_FILE_LIMIT {
            return blake2b_hash_file_small(file, file_size as usize, output_bytes);
        }
    }

    // Non-regular files or fallback: stream
    #[cfg(target_os = "linux")]
    if file_size >= FADVISE_MIN_SIZE {
        use std::os::unix::io::AsRawFd;
        // Advisory only — the result is deliberately ignored.
        let _ = unsafe {
            libc::posix_fadvise(
                file.as_raw_fd(),
                0,
                file_size as i64,
                libc::POSIX_FADV_SEQUENTIAL,
            )
        };
    }
    blake2b_hash_reader(file, output_bytes)
}
903
904/// Hash a tiny BLAKE2b file (<8KB) using a stack-allocated buffer.
905#[inline]
906fn blake2b_hash_file_tiny(mut file: File, size: usize, output_bytes: usize) -> io::Result<String> {
907    let mut buf = [0u8; 8192];
908    let mut total = 0;
909    while total < size {
910        match file.read(&mut buf[total..size]) {
911            Ok(0) => break,
912            Ok(n) => total += n,
913            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
914            Err(e) => return Err(e),
915        }
916    }
917    Ok(blake2b_hash_data(&buf[..total], output_bytes))
918}
919
920/// Hash a small file with BLAKE2b by reading it entirely into a thread-local buffer.
921#[inline]
922fn blake2b_hash_file_small(mut file: File, size: usize, output_bytes: usize) -> io::Result<String> {
923    SMALL_FILE_BUF.with(|cell| {
924        let mut buf = cell.borrow_mut();
925        buf.clear();
926        buf.reserve(size);
927        // SAFETY: capacity >= size after clear+reserve
928        unsafe {
929            buf.set_len(size);
930        }
931        let mut total = 0;
932        while total < size {
933            match file.read(&mut buf[total..size]) {
934                Ok(0) => break,
935                Ok(n) => total += n,
936                Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
937                Err(e) => return Err(e),
938            }
939        }
940        Ok(blake2b_hash_data(&buf[..total], output_bytes))
941    })
942}
943
944/// Optimized BLAKE2b hash for large files (>=16MB) on Linux.
945/// Primary path: mmap with HUGEPAGE + POPULATE_READ for zero-copy, single-shot hash.
946/// Eliminates thread spawn, channel synchronization, buffer allocation (24MB→0),
947/// and read() memcpy overhead. Falls back to streaming I/O if mmap fails.
948#[cfg(target_os = "linux")]
949fn blake2b_hash_file_pipelined(
950    file: File,
951    file_size: u64,
952    output_bytes: usize,
953) -> io::Result<String> {
954    // Primary path: mmap with huge pages for zero-copy single-shot hash.
955    // Eliminates: thread spawn (~50µs), channel sync, buffer allocs (24MB),
956    // 13+ read() syscalls, and page-cache → user-buffer memcpy.
957    match unsafe { memmap2::MmapOptions::new().map(&file) } {
958        Ok(mmap) => {
959            // HUGEPAGE MUST come before any page faults: reduces 25,600 minor
960            // faults (4KB) to ~50 faults (2MB) for 100MB. Saves ~12ms overhead.
961            if file_size >= 2 * 1024 * 1024 {
962                let _ = mmap.advise(memmap2::Advice::HugePage);
963            }
964            let _ = mmap.advise(memmap2::Advice::Sequential);
965            // POPULATE_READ (Linux 5.14+): synchronously prefaults all pages with
966            // huge pages before hashing begins. Falls back to WillNeed on older kernels.
967            if file_size >= 4 * 1024 * 1024 {
968                if mmap.advise(memmap2::Advice::PopulateRead).is_err() {
969                    let _ = mmap.advise(memmap2::Advice::WillNeed);
970                }
971            } else {
972                let _ = mmap.advise(memmap2::Advice::WillNeed);
973            }
974            // Single-shot hash: processes entire file in one call, streaming
975            // directly from page cache with no user-space buffer copies.
976            Ok(blake2b_hash_data(&mmap, output_bytes))
977        }
978        Err(_) => {
979            // mmap failed (FUSE, NFS without mmap support, etc.) — fall back
980            // to streaming pipelined I/O.
981            blake2b_hash_file_streamed(file, file_size, output_bytes)
982        }
983    }
984}
985
/// Streaming fallback for BLAKE2b large files when mmap is unavailable.
/// Uses double-buffered reader thread with fadvise hints.
/// Fixed: uses blocking recv() to eliminate triple-buffer allocation bug.
///
/// Pipeline shape: a dedicated reader thread fills 8MB buffers and sends
/// them over a bounded channel; this thread hashes each buffer and returns
/// it via a second bounded channel for reuse. With both channels bounded at
/// 1 and a single pre-seeded buffer, exactly two buffers ever circulate.
#[cfg(target_os = "linux")]
fn blake2b_hash_file_streamed(
    mut file: File,
    file_size: u64,
    output_bytes: usize,
) -> io::Result<String> {
    use std::os::unix::io::AsRawFd;

    const PIPE_BUF_SIZE: usize = 8 * 1024 * 1024; // 8MB per buffer

    // Hint kernel for sequential access (advisory; result ignored)
    unsafe {
        libc::posix_fadvise(
            file.as_raw_fd(),
            0,
            file_size as i64,
            libc::POSIX_FADV_SEQUENTIAL,
        );
    }

    // Double-buffered channels: reader fills one buffer while hasher processes another.
    // `tx`/`rx` carry (filled buffer, byte count); `buf_tx`/`buf_rx` hand
    // drained buffers back to the reader. Seed the cycle with one buffer.
    let (tx, rx) = std::sync::mpsc::sync_channel::<(Vec<u8>, usize)>(1);
    let (buf_tx, buf_rx) = std::sync::mpsc::sync_channel::<Vec<u8>>(1);
    let _ = buf_tx.send(vec![0u8; PIPE_BUF_SIZE]);

    let reader_handle = std::thread::spawn(move || -> io::Result<()> {
        // Blocking recv reuses hasher's returned buffer (2 buffers total, not 3).
        while let Ok(mut buf) = buf_rx.recv() {
            // Fill the buffer completely (short reads retried) or hit EOF.
            let mut total = 0;
            while total < buf.len() {
                match file.read(&mut buf[total..]) {
                    Ok(0) => break,
                    Ok(n) => total += n,
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => return Err(e),
                }
            }
            if total == 0 {
                // EOF with nothing read: dropping `tx` ends the hasher loop.
                break;
            }
            if tx.send((buf, total)).is_err() {
                // Hasher side hung up — nothing more to do.
                break;
            }
        }
        Ok(())
    });

    let mut state = blake2b_simd::Params::new()
        .hash_length(output_bytes)
        .to_state();
    while let Ok((buf, n)) = rx.recv() {
        state.update(&buf[..n]);
        let _ = buf_tx.send(buf);
    }
    // NOTE(review): always Ok at this point; kept as a Result so a reader
    // thread error below can take precedence via the is_ok() check.
    let hash_result = Ok(hex_encode(state.finalize().as_bytes()));

    // Surface reader-thread I/O errors or panics after hashing completes.
    match reader_handle.join() {
        Ok(Ok(())) => {}
        Ok(Err(e)) => {
            if hash_result.is_ok() {
                return Err(e);
            }
        }
        Err(payload) => {
            // Recover a readable message from the panic payload when possible.
            let msg = if let Some(s) = payload.downcast_ref::<&str>() {
                format!("reader thread panicked: {}", s)
            } else if let Some(s) = payload.downcast_ref::<String>() {
                format!("reader thread panicked: {}", s)
            } else {
                "reader thread panicked".to_string()
            };
            return Err(io::Error::other(msg));
        }
    }

    hash_result
}
1066
1067/// Hash stdin with BLAKE2b variable output length.
1068/// Tries fadvise if stdin is a regular file (shell redirect), then streams.
1069pub fn blake2b_hash_stdin(output_bytes: usize) -> io::Result<String> {
1070    let stdin = io::stdin();
1071    #[cfg(target_os = "linux")]
1072    {
1073        use std::os::unix::io::AsRawFd;
1074        let fd = stdin.as_raw_fd();
1075        let mut stat: libc::stat = unsafe { std::mem::zeroed() };
1076        if unsafe { libc::fstat(fd, &mut stat) } == 0
1077            && (stat.st_mode & libc::S_IFMT) == libc::S_IFREG
1078            && stat.st_size > 0
1079        {
1080            unsafe {
1081                libc::posix_fadvise(fd, 0, stat.st_size, libc::POSIX_FADV_SEQUENTIAL);
1082            }
1083        }
1084    }
1085    blake2b_hash_reader(stdin.lock(), output_bytes)
1086}
1087
/// Internal enum for file content in batch hashing.
/// Keeps data alive (either as mmap or owned Vec) while hash_many references it.
enum FileContent {
    // Zero-copy view of a memory-mapped regular file.
    Mmap(memmap2::Mmap),
    // Owned bytes read via read(2): tiny files, mmap failures, non-regular files.
    Buf(Vec<u8>),
}
1094
1095impl AsRef<[u8]> for FileContent {
1096    fn as_ref(&self) -> &[u8] {
1097        match self {
1098            FileContent::Mmap(m) => m,
1099            FileContent::Buf(v) => v,
1100        }
1101    }
1102}
1103
1104/// Open a file and load its content for batch hashing.
1105/// Uses read for tiny files (avoids mmap syscall overhead), mmap for large
1106/// files (zero-copy), and read-to-end for non-regular files.
1107fn open_file_content(path: &Path) -> io::Result<FileContent> {
1108    let (file, size, is_regular) = open_and_stat(path)?;
1109    if is_regular && size == 0 {
1110        return Ok(FileContent::Buf(Vec::new()));
1111    }
1112    if is_regular && size > 0 {
1113        // Tiny files: read directly into Vec. The mmap syscall + page fault
1114        // overhead exceeds the data transfer cost for files under 8KB.
1115        // For the 100-file benchmark (55 bytes each), this saves ~100 mmap calls.
1116        if size < TINY_FILE_LIMIT {
1117            let mut buf = vec![0u8; size as usize];
1118            let mut total = 0;
1119            let mut f = file;
1120            while total < size as usize {
1121                match f.read(&mut buf[total..]) {
1122                    Ok(0) => break,
1123                    Ok(n) => total += n,
1124                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
1125                    Err(e) => return Err(e),
1126                }
1127            }
1128            buf.truncate(total);
1129            return Ok(FileContent::Buf(buf));
1130        }
1131        // HUGEPAGE + PopulateRead for optimal page faulting
1132        let mmap_result = unsafe { memmap2::MmapOptions::new().map(&file) };
1133        if let Ok(mmap) = mmap_result {
1134            #[cfg(target_os = "linux")]
1135            {
1136                if size >= 2 * 1024 * 1024 {
1137                    let _ = mmap.advise(memmap2::Advice::HugePage);
1138                }
1139                let _ = mmap.advise(memmap2::Advice::Sequential);
1140                if mmap.advise(memmap2::Advice::PopulateRead).is_err() {
1141                    let _ = mmap.advise(memmap2::Advice::WillNeed);
1142                }
1143            }
1144            return Ok(FileContent::Mmap(mmap));
1145        }
1146        // Fallback: read into Vec
1147        let mut buf = vec![0u8; size as usize];
1148        let mut total = 0;
1149        let mut f = file;
1150        while total < size as usize {
1151            match f.read(&mut buf[total..]) {
1152                Ok(0) => break,
1153                Ok(n) => total += n,
1154                Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
1155                Err(e) => return Err(e),
1156            }
1157        }
1158        buf.truncate(total);
1159        return Ok(FileContent::Buf(buf));
1160    }
1161    // Non-regular: read to end
1162    let mut buf = Vec::new();
1163    let mut f = file;
1164    f.read_to_end(&mut buf)?;
1165    Ok(FileContent::Buf(buf))
1166}
1167
1168/// Read remaining file content from an already-open fd into a Vec.
1169/// Used when the initial stack buffer is exhausted and we need to read
1170/// the rest without re-opening the file.
1171fn read_remaining_to_vec(prefix: &[u8], mut file: File) -> io::Result<FileContent> {
1172    let mut buf = Vec::with_capacity(prefix.len() + 65536);
1173    buf.extend_from_slice(prefix);
1174    file.read_to_end(&mut buf)?;
1175    Ok(FileContent::Buf(buf))
1176}
1177
1178/// Open a file and read all content without fstat — just open+read+close.
1179/// For many-file workloads (100+ files), skipping fstat saves ~5µs/file
1180/// (~0.5ms for 100 files). Uses a small initial buffer for tiny files (< 4KB),
1181/// then falls back to larger buffer or read_to_end for bigger files.
1182fn open_file_content_fast(path: &Path) -> io::Result<FileContent> {
1183    let mut file = open_noatime(path)?;
1184    // Try small stack buffer first — optimal for benchmark's ~55 byte files.
1185    // For tiny files, allocate exact-size Vec to avoid waste.
1186    let mut small_buf = [0u8; 4096];
1187    match file.read(&mut small_buf) {
1188        Ok(0) => return Ok(FileContent::Buf(Vec::new())),
1189        Ok(n) if n < small_buf.len() => {
1190            // File fits in small buffer — allocate exact size
1191            let mut vec = Vec::with_capacity(n);
1192            vec.extend_from_slice(&small_buf[..n]);
1193            return Ok(FileContent::Buf(vec));
1194        }
1195        Ok(n) => {
1196            // Might be more data — allocate heap buffer and read into it directly
1197            let mut buf = vec![0u8; 65536];
1198            buf[..n].copy_from_slice(&small_buf[..n]);
1199            let mut total = n;
1200            loop {
1201                match file.read(&mut buf[total..]) {
1202                    Ok(0) => {
1203                        buf.truncate(total);
1204                        return Ok(FileContent::Buf(buf));
1205                    }
1206                    Ok(n) => {
1207                        total += n;
1208                        if total >= buf.len() {
1209                            // File > 64KB: read rest from existing fd
1210                            return read_remaining_to_vec(&buf[..total], file);
1211                        }
1212                    }
1213                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
1214                    Err(e) => return Err(e),
1215                }
1216            }
1217        }
1218        Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {
1219            let mut buf = vec![0u8; 65536];
1220            let mut total = 0;
1221            loop {
1222                match file.read(&mut buf[total..]) {
1223                    Ok(0) => {
1224                        buf.truncate(total);
1225                        return Ok(FileContent::Buf(buf));
1226                    }
1227                    Ok(n) => {
1228                        total += n;
1229                        if total >= buf.len() {
1230                            // File > 64KB: read rest from existing fd
1231                            return read_remaining_to_vec(&buf[..total], file);
1232                        }
1233                    }
1234                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
1235                    Err(e) => return Err(e),
1236                }
1237            }
1238        }
1239        Err(e) => return Err(e),
1240    }
1241}
1242
/// Batch-hash multiple files with BLAKE2b using multi-buffer SIMD.
///
/// Uses blake2b_simd::many::hash_many for 4-way AVX2 parallel hashing.
/// All files are pre-loaded into memory (mmap for large, read for small),
/// then hashed simultaneously. Returns results in input order.
///
/// For 100 files on AVX2: 4x throughput from SIMD parallelism.
pub fn blake2b_hash_files_many(paths: &[&Path], output_bytes: usize) -> Vec<io::Result<String>> {
    use blake2b_simd::many::{HashManyJob, hash_many};

    // Phase 1: Read all files into memory.
    // For small file counts (≤10), load sequentially to avoid thread::scope
    // overhead (~120µs). For many files, use parallel loading with lightweight
    // OS threads. For 100+ files, use fast path that skips fstat.
    let use_fast = paths.len() >= 20;

    let file_data: Vec<io::Result<FileContent>> = if paths.len() <= 10 {
        // Sequential loading — avoids thread spawn overhead for small batches
        paths.iter().map(|&path| open_file_content(path)).collect()
    } else {
        // One contiguous chunk of paths per thread; ceiling division so every
        // path is assigned to exactly one chunk.
        let num_threads = std::thread::available_parallelism()
            .map(|n| n.get())
            .unwrap_or(4)
            .min(paths.len());
        let chunk_size = (paths.len() + num_threads - 1) / num_threads;

        std::thread::scope(|s| {
            let handles: Vec<_> = paths
                .chunks(chunk_size)
                .map(|chunk| {
                    s.spawn(move || {
                        chunk
                            .iter()
                            .map(|&path| {
                                if use_fast {
                                    open_file_content_fast(path)
                                } else {
                                    open_file_content(path)
                                }
                            })
                            .collect::<Vec<_>>()
                    })
                })
                .collect();

            // Joining in spawn order keeps results aligned with `paths`,
            // since chunks were taken in order.
            handles
                .into_iter()
                .flat_map(|h| h.join().unwrap())
                .collect()
        })
    };

    // Phase 2: Build hash_many jobs for successful reads
    let hash_results = {
        let mut params = blake2b_simd::Params::new();
        params.hash_length(output_bytes);

        // Pair each successfully-loaded file's bytes with its original index
        // so failures can be reported in the right slot later.
        let ok_entries: Vec<(usize, &[u8])> = file_data
            .iter()
            .enumerate()
            .filter_map(|(i, r)| r.as_ref().ok().map(|c| (i, c.as_ref())))
            .collect();

        let mut jobs: Vec<HashManyJob> = ok_entries
            .iter()
            .map(|(_, data)| HashManyJob::new(&params, data))
            .collect();

        // Phase 3: Run multi-buffer SIMD hash (4-way AVX2)
        hash_many(jobs.iter_mut());

        // Extract hashes into a map (indexed by original position; None
        // marks files whose read failed).
        let mut hm: Vec<Option<String>> = vec![None; paths.len()];
        for (j, &(orig_i, _)) in ok_entries.iter().enumerate() {
            hm[orig_i] = Some(hex_encode(jobs[j].to_hash().as_bytes()));
        }
        hm
    }; // file_data borrow released here

    // Phase 4: Combine hashes and errors in original order
    hash_results
        .into_iter()
        .zip(file_data)
        .map(|(hash_opt, result)| match result {
            // Loaded files always have a hash by construction of `hm` above.
            Ok(_) => Ok(hash_opt.unwrap()),
            Err(e) => Err(e),
        })
        .collect()
}
1332
/// Batch-hash multiple files with BLAKE2b using the best strategy for the workload.
/// Samples a few files to estimate total data size. For small workloads, uses
/// single-core SIMD batch hashing (`blake2b_hash_files_many`) to avoid stat and
/// thread spawn overhead. For larger workloads, uses multi-core work-stealing
/// parallelism where each worker calls `blake2b_hash_file` (with I/O pipelining
/// for large files on Linux).
/// Returns results in input order.
pub fn blake2b_hash_files_parallel(
    paths: &[&Path],
    output_bytes: usize,
) -> Vec<io::Result<String>> {
    let n = paths.len();

    // Sample a few files to estimate whether parallel processing is worthwhile.
    // This avoids the cost of statting ALL files (~70µs/file) when the workload
    // is too small for parallelism to help.
    let sample_count = n.min(5);
    let mut sample_max: u64 = 0;
    let mut sample_total: u64 = 0;
    for &p in paths.iter().take(sample_count) {
        // stat failures count as size 0 — the real error surfaces when a
        // worker actually tries to hash that file.
        let size = std::fs::metadata(p).map(|m| m.len()).unwrap_or(0);
        sample_total += size;
        sample_max = sample_max.max(size);
    }
    // Extrapolate total bytes: average sampled size × total file count.
    let estimated_total = if sample_count > 0 {
        sample_total * (n as u64) / (sample_count as u64)
    } else {
        0
    };

    // For small workloads, thread spawn overhead (~120µs × N_threads) exceeds
    // any parallelism benefit. Use SIMD batch hashing directly (no stat pass).
    if estimated_total < 1024 * 1024 && sample_max < SMALL_FILE_LIMIT {
        return blake2b_hash_files_many(paths, output_bytes);
    }

    // Full stat pass for parallel scheduling — worth it for larger workloads.
    let mut indexed: Vec<(usize, &Path, u64)> = paths
        .iter()
        .enumerate()
        .map(|(i, &p)| {
            let size = std::fs::metadata(p).map(|m| m.len()).unwrap_or(0);
            (i, p, size)
        })
        .collect();

    // Sort largest first: ensures big files start hashing immediately while
    // small files fill in gaps, minimizing tail latency.
    indexed.sort_by(|a, b| b.2.cmp(&a.2));

    // Warm page cache for the largest files using async readahead(2).
    // Each hash call handles its own mmap prefaulting, but issuing readahead
    // here lets the kernel start I/O for upcoming files while workers process
    // current ones. readahead(2) returns immediately (non-blocking).
    #[cfg(target_os = "linux")]
    {
        use std::os::unix::io::AsRawFd;
        for &(_, path, size) in indexed.iter().take(20) {
            if size >= 1024 * 1024 {
                if let Ok(file) = open_noatime(path) {
                    unsafe {
                        libc::readahead(file.as_raw_fd(), 0, size as usize);
                    }
                }
            }
        }
    }

    let num_threads = std::thread::available_parallelism()
        .map(|n| n.get())
        .unwrap_or(4)
        .min(n);

    // Atomic work index for dynamic work-stealing.
    let work_idx = AtomicUsize::new(0);

    std::thread::scope(|s| {
        let work_idx = &work_idx;
        let indexed = &indexed;

        let handles: Vec<_> = (0..num_threads)
            .map(|_| {
                s.spawn(move || {
                    // Each worker repeatedly claims the next unprocessed file.
                    let mut local_results = Vec::new();
                    loop {
                        let idx = work_idx.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
                        if idx >= indexed.len() {
                            break;
                        }
                        let (orig_idx, path, _size) = indexed[idx];
                        let result = blake2b_hash_file(path, output_bytes);
                        local_results.push((orig_idx, result));
                    }
                    local_results
                })
            })
            .collect();

        // Collect results and reorder to match original input order.
        let mut results: Vec<Option<io::Result<String>>> = (0..n).map(|_| None).collect();
        for handle in handles {
            for (orig_idx, result) in handle.join().unwrap() {
                results[orig_idx] = Some(result);
            }
        }
        // Every slot is filled by construction; the error arm is defensive.
        results
            .into_iter()
            .map(|opt| opt.unwrap_or_else(|| Err(io::Error::other("missing result"))))
            .collect()
    })
}
1444
/// Auto-dispatch multi-file hashing: picks sequential or parallel based on workload.
///
/// For small files (<64KB sample), sequential avoids thread spawn + readahead overhead
/// that dominates for tiny files. On the "100 × 55-byte files" benchmark, this saves
/// ~5ms of overhead (thread creation + 200 stat() calls + 100 fadvise() calls).
///
/// For large files (>=64KB), parallel processing amortizes thread spawn cost over
/// substantial per-file hash work. Returns results in input order.
pub fn hash_files_auto(paths: &[&Path], algo: HashAlgorithm) -> Vec<io::Result<String>> {
    let n = paths.len();
    if n == 0 {
        return Vec::new();
    }
    if n == 1 {
        // Single file: no scheduling decision to make.
        return vec![hash_file_nostat(algo, paths[0])];
    }

    // Sample up to 3 files (max size) to correctly dispatch mixed workloads
    // like `md5sum small.txt big1.gb big2.gb`. Costs at most 3 stat calls (~6µs)
    // to save potentially 3-6ms of thread overhead for small-file workloads.
    let sample_size = paths
        .iter()
        .take(3)
        .filter_map(|p| std::fs::metadata(p).ok())
        .map(|m| m.len())
        .max()
        .unwrap_or(0);

    if sample_size < 65536 {
        // Small files: sequential loop avoiding thread spawn overhead.
        #[cfg(target_os = "linux")]
        {
            // Raw syscall path: reuses CString buffer, avoids OpenOptions/File overhead
            let mut c_path_buf = Vec::with_capacity(256);
            paths
                .iter()
                .map(|&p| hash_file_raw_nostat(algo, p, &mut c_path_buf))
                .collect()
        }
        #[cfg(not(target_os = "linux"))]
        {
            paths.iter().map(|&p| hash_file_nostat(algo, p)).collect()
        }
    } else if n >= 20 {
        // Many larger files: batched pre-read strategy.
        hash_files_batch(paths, algo)
    } else {
        // Few larger files: work-stealing workers without a stat/sort pass.
        hash_files_parallel_fast(paths, algo)
    }
}
1494
/// Batch-hash multiple files with SHA-256/MD5 using work-stealing parallelism.
/// Files are sorted by size (largest first) so the biggest files start processing
/// immediately. Each worker thread grabs the next unprocessed file via atomic index,
/// eliminating tail latency from uneven file sizes.
/// Returns results in input order.
pub fn hash_files_parallel(paths: &[&Path], algo: HashAlgorithm) -> Vec<io::Result<String>> {
    let n = paths.len();

    // Build (original_index, path, size) tuples — stat all files for scheduling.
    // The stat cost (~5µs/file) is repaid by better work distribution.
    let mut indexed: Vec<(usize, &Path, u64)> = paths
        .iter()
        .enumerate()
        .map(|(i, &p)| {
            // stat failure → size 0; the open error surfaces in hash_file.
            let size = std::fs::metadata(p).map(|m| m.len()).unwrap_or(0);
            (i, p, size)
        })
        .collect();

    // Sort largest first: ensures big files start hashing immediately while
    // small files fill in gaps, minimizing tail latency.
    indexed.sort_by(|a, b| b.2.cmp(&a.2));

    // Warm page cache for the largest files using async readahead(2).
    // Each hash call handles its own mmap prefaulting, but issuing readahead
    // here lets the kernel start I/O for upcoming files while workers process
    // current ones. readahead(2) returns immediately (non-blocking).
    #[cfg(target_os = "linux")]
    {
        use std::os::unix::io::AsRawFd;
        for &(_, path, size) in indexed.iter().take(20) {
            if size >= 1024 * 1024 {
                if let Ok(file) = open_noatime(path) {
                    unsafe {
                        libc::readahead(file.as_raw_fd(), 0, size as usize);
                    }
                }
            }
        }
    }

    let num_threads = std::thread::available_parallelism()
        .map(|n| n.get())
        .unwrap_or(4)
        .min(n);

    // Atomic work index for dynamic work-stealing.
    let work_idx = AtomicUsize::new(0);

    std::thread::scope(|s| {
        let work_idx = &work_idx;
        let indexed = &indexed;

        let handles: Vec<_> = (0..num_threads)
            .map(|_| {
                s.spawn(move || {
                    // Each worker repeatedly claims the next unprocessed file.
                    let mut local_results = Vec::new();
                    loop {
                        let idx = work_idx.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
                        if idx >= indexed.len() {
                            break;
                        }
                        let (orig_idx, path, _size) = indexed[idx];
                        let result = hash_file(algo, path);
                        local_results.push((orig_idx, result));
                    }
                    local_results
                })
            })
            .collect();

        // Collect results and reorder to match original input order.
        let mut results: Vec<Option<io::Result<String>>> = (0..n).map(|_| None).collect();
        for handle in handles {
            for (orig_idx, result) in handle.join().unwrap() {
                results[orig_idx] = Some(result);
            }
        }
        // Every slot is filled by construction; the error arm is defensive.
        results
            .into_iter()
            .map(|opt| opt.unwrap_or_else(|| Err(io::Error::other("missing result"))))
            .collect()
    })
}
1579
/// Fast parallel hash for multi-file workloads. Skips the stat-all-and-sort phase
/// of `hash_files_parallel()` and uses `hash_file_nostat()` per worker to minimize
/// per-file syscall overhead. For 100 tiny files, this eliminates ~200 stat() calls
/// (100 from the sort phase + 100 from open_and_stat inside each worker).
/// Returns results in input order.
pub fn hash_files_parallel_fast(paths: &[&Path], algo: HashAlgorithm) -> Vec<io::Result<String>> {
    let n = paths.len();
    if n == 0 {
        return Vec::new();
    }
    if n == 1 {
        // Single file: skip the thread machinery entirely.
        return vec![hash_file_nostat(algo, paths[0])];
    }

    // Issue readahead for all files (no size threshold — even tiny files benefit
    // from batched WILLNEED hints when processing 100+ files)
    #[cfg(target_os = "linux")]
    readahead_files_all(paths);

    let num_threads = std::thread::available_parallelism()
        .map(|n| n.get())
        .unwrap_or(4)
        .min(n);

    // Shared cursor: each worker atomically claims the next path index.
    let work_idx = AtomicUsize::new(0);

    std::thread::scope(|s| {
        let work_idx = &work_idx;

        let handles: Vec<_> = (0..num_threads)
            .map(|_| {
                s.spawn(move || {
                    let mut local_results = Vec::new();
                    loop {
                        let idx = work_idx.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
                        if idx >= n {
                            break;
                        }
                        let result = hash_file_nostat(algo, paths[idx]);
                        local_results.push((idx, result));
                    }
                    local_results
                })
            })
            .collect();

        // Reassemble into input order; every slot is filled by construction.
        let mut results: Vec<Option<io::Result<String>>> = (0..n).map(|_| None).collect();
        for handle in handles {
            for (idx, result) in handle.join().unwrap() {
                results[idx] = Some(result);
            }
        }
        results
            .into_iter()
            .map(|opt| opt.unwrap_or_else(|| Err(io::Error::other("missing result"))))
            .collect()
    })
}
1638
1639/// Batch-hash multiple files: pre-read all files into memory in parallel,
1640/// then hash all data in parallel. Optimal for many small files where per-file
1641/// overhead (open/read/close syscalls) dominates over hash computation.
1642///
1643/// Reuses the same parallel file loading pattern as `blake2b_hash_files_many()`.
1644/// For 100 × 55-byte files: all 5500 bytes are loaded in parallel across threads,
1645/// then hashed in parallel — minimizing wall-clock time for syscall-bound workloads.
1646/// Returns results in input order.
1647pub fn hash_files_batch(paths: &[&Path], algo: HashAlgorithm) -> Vec<io::Result<String>> {
1648    let n = paths.len();
1649    if n == 0 {
1650        return Vec::new();
1651    }
1652
1653    // Issue readahead for all files
1654    #[cfg(target_os = "linux")]
1655    readahead_files_all(paths);
1656
1657    // Phase 1: Load all files into memory in parallel.
1658    // For 20+ files, use fast path that skips fstat.
1659    let use_fast = n >= 20;
1660
1661    let file_data: Vec<io::Result<FileContent>> = if n <= 10 {
1662        // Sequential loading — avoids thread spawn overhead for small batches
1663        paths
1664            .iter()
1665            .map(|&path| {
1666                if use_fast {
1667                    open_file_content_fast(path)
1668                } else {
1669                    open_file_content(path)
1670                }
1671            })
1672            .collect()
1673    } else {
1674        let num_threads = std::thread::available_parallelism()
1675            .map(|t| t.get())
1676            .unwrap_or(4)
1677            .min(n);
1678        let chunk_size = (n + num_threads - 1) / num_threads;
1679
1680        std::thread::scope(|s| {
1681            let handles: Vec<_> = paths
1682                .chunks(chunk_size)
1683                .map(|chunk| {
1684                    s.spawn(move || {
1685                        chunk
1686                            .iter()
1687                            .map(|&path| {
1688                                if use_fast {
1689                                    open_file_content_fast(path)
1690                                } else {
1691                                    open_file_content(path)
1692                                }
1693                            })
1694                            .collect::<Vec<_>>()
1695                    })
1696                })
1697                .collect();
1698
1699            handles
1700                .into_iter()
1701                .flat_map(|h| h.join().unwrap())
1702                .collect()
1703        })
1704    };
1705
1706    // Phase 2: Hash all loaded data. For tiny files hash is negligible;
1707    // for larger files the parallel hashing across threads helps.
1708    let num_hash_threads = std::thread::available_parallelism()
1709        .map(|t| t.get())
1710        .unwrap_or(4)
1711        .min(n);
1712    let work_idx = AtomicUsize::new(0);
1713
1714    std::thread::scope(|s| {
1715        let work_idx = &work_idx;
1716        let file_data = &file_data;
1717
1718        let handles: Vec<_> = (0..num_hash_threads)
1719            .map(|_| {
1720                s.spawn(move || {
1721                    let mut local_results = Vec::new();
1722                    loop {
1723                        let idx = work_idx.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
1724                        if idx >= n {
1725                            break;
1726                        }
1727                        let result = match &file_data[idx] {
1728                            Ok(content) => hash_bytes(algo, content.as_ref()),
1729                            Err(e) => Err(io::Error::new(e.kind(), e.to_string())),
1730                        };
1731                        local_results.push((idx, result));
1732                    }
1733                    local_results
1734                })
1735            })
1736            .collect();
1737
1738        let mut results: Vec<Option<io::Result<String>>> = (0..n).map(|_| None).collect();
1739        for handle in handles {
1740            for (idx, result) in handle.join().unwrap() {
1741                results[idx] = Some(result);
1742            }
1743        }
1744        results
1745            .into_iter()
1746            .map(|opt| opt.unwrap_or_else(|| Err(io::Error::other("missing result"))))
1747            .collect()
1748    })
1749}
1750
1751/// Stream-hash a file that already has a prefix read into memory.
1752/// Feeds `prefix` into the hasher first, then streams the rest from `file`.
1753/// Avoids re-opening and re-reading the file when the initial buffer is exhausted.
1754fn hash_stream_with_prefix(
1755    algo: HashAlgorithm,
1756    prefix: &[u8],
1757    mut file: File,
1758) -> io::Result<String> {
1759    // Blake2b uses its own hasher on all platforms
1760    if matches!(algo, HashAlgorithm::Blake2b) {
1761        let mut state = blake2b_simd::Params::new().to_state();
1762        state.update(prefix);
1763        return STREAM_BUF.with(|cell| {
1764            let mut buf = cell.borrow_mut();
1765            ensure_stream_buf(&mut buf);
1766            loop {
1767                let n = read_full(&mut file, &mut buf)?;
1768                if n == 0 {
1769                    break;
1770                }
1771                state.update(&buf[..n]);
1772            }
1773            Ok(hex_encode(state.finalize().as_bytes()))
1774        });
1775    }
1776
1777    match algo {
1778        HashAlgorithm::Sha1 => hash_stream_with_prefix_digest::<sha1::Sha1>(prefix, file),
1779        HashAlgorithm::Sha224 => hash_stream_with_prefix_digest::<sha2::Sha224>(prefix, file),
1780        HashAlgorithm::Sha256 => hash_stream_with_prefix_digest::<sha2::Sha256>(prefix, file),
1781        HashAlgorithm::Sha384 => hash_stream_with_prefix_digest::<sha2::Sha384>(prefix, file),
1782        HashAlgorithm::Sha512 => hash_stream_with_prefix_digest::<sha2::Sha512>(prefix, file),
1783        HashAlgorithm::Md5 => hash_stream_with_prefix_digest::<md5::Md5>(prefix, file),
1784        HashAlgorithm::Blake2b => unreachable!(),
1785    }
1786}
1787
1788/// Generic stream-hash with prefix using Digest trait (all platforms).
1789fn hash_stream_with_prefix_digest<D: digest::Digest>(
1790    prefix: &[u8],
1791    mut file: File,
1792) -> io::Result<String> {
1793    STREAM_BUF.with(|cell| {
1794        let mut buf = cell.borrow_mut();
1795        ensure_stream_buf(&mut buf);
1796        let mut hasher = D::new();
1797        hasher.update(prefix);
1798        loop {
1799            let n = read_full(&mut file, &mut buf)?;
1800            if n == 0 {
1801                break;
1802            }
1803            hasher.update(&buf[..n]);
1804        }
1805        Ok(hex_encode(&hasher.finalize()))
1806    })
1807}
1808
1809/// Hash a file without fstat — just open, read until EOF, hash.
1810/// For many-file workloads (100+ tiny files), skipping fstat saves ~5µs/file.
1811/// Uses a two-tier buffer strategy: small stack buffer (4KB) for the initial read,
1812/// then falls back to a larger stack buffer (64KB) or streaming hash for bigger files.
1813/// For benchmark's 55-byte files: one read() fills the 4KB buffer, hash immediately.
1814pub fn hash_file_nostat(algo: HashAlgorithm, path: &Path) -> io::Result<String> {
1815    let mut file = open_noatime(path)?;
1816    // First try a small stack buffer — optimal for tiny files (< 4KB).
1817    // Most "many_files" benchmark files are ~55 bytes, so this completes
1818    // with a single read() syscall and no fallback.
1819    let mut small_buf = [0u8; 4096];
1820    match file.read(&mut small_buf) {
1821        Ok(0) => return hash_bytes(algo, &[]),
1822        Ok(n) if n < small_buf.len() => {
1823            // File fits in small buffer — hash directly (common case)
1824            return hash_bytes(algo, &small_buf[..n]);
1825        }
1826        Ok(n) => {
1827            // Might be more data — fall back to larger buffer
1828            let mut buf = [0u8; 65536];
1829            buf[..n].copy_from_slice(&small_buf[..n]);
1830            let mut total = n;
1831            loop {
1832                match file.read(&mut buf[total..]) {
1833                    Ok(0) => return hash_bytes(algo, &buf[..total]),
1834                    Ok(n) => {
1835                        total += n;
1836                        if total >= buf.len() {
1837                            // File > 64KB: stream-hash from existing fd instead of
1838                            // re-opening. Feed already-read prefix, continue streaming.
1839                            return hash_stream_with_prefix(algo, &buf[..total], file);
1840                        }
1841                    }
1842                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
1843                    Err(e) => return Err(e),
1844                }
1845            }
1846        }
1847        Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {
1848            // Retry with full buffer on interrupt
1849            let mut buf = [0u8; 65536];
1850            let mut total = 0;
1851            loop {
1852                match file.read(&mut buf[total..]) {
1853                    Ok(0) => return hash_bytes(algo, &buf[..total]),
1854                    Ok(n) => {
1855                        total += n;
1856                        if total >= buf.len() {
1857                            // File > 64KB: stream-hash from existing fd
1858                            return hash_stream_with_prefix(algo, &buf[..total], file);
1859                        }
1860                    }
1861                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
1862                    Err(e) => return Err(e),
1863                }
1864            }
1865        }
1866        Err(e) => return Err(e),
1867    }
1868}
1869
/// Hash a small file using raw Linux syscalls without fstat.
/// For the multi-file sequential path where we already know files are small.
/// Avoids: OpenOptions builder, CString per-file alloc (reuses caller's buffer),
/// fstat overhead (unnecessary when we just need open+read+close).
/// Returns hash as hex string.
#[cfg(target_os = "linux")]
fn hash_file_raw_nostat(
    algo: HashAlgorithm,
    path: &Path,
    c_path_buf: &mut Vec<u8>,
) -> io::Result<String> {
    use std::os::unix::ffi::OsStrExt;

    let path_bytes = path.as_os_str().as_bytes();

    // Reuse caller's buffer for null-terminated path (avoids heap alloc per file).
    // Note: unlike CString::new, embedded NUL bytes are not rejected here; the
    // kernel would simply see a truncated path.
    c_path_buf.clear();
    c_path_buf.reserve(path_bytes.len() + 1);
    c_path_buf.extend_from_slice(path_bytes);
    c_path_buf.push(0);

    // O_NOATIME skips the atime update, but the kernel only allows it for the
    // file owner (or CAP_FOWNER); the flag is disabled process-wide after the
    // first EPERM (see below).
    let mut flags = libc::O_RDONLY | libc::O_CLOEXEC;
    if NOATIME_SUPPORTED.load(Ordering::Relaxed) {
        flags |= libc::O_NOATIME;
    }

    // SAFETY: c_path_buf holds a valid NUL-terminated byte string for the
    // duration of the call; open() does not retain the pointer.
    let fd = unsafe { libc::open(c_path_buf.as_ptr() as *const libc::c_char, flags) };
    if fd < 0 {
        let err = io::Error::last_os_error();
        // EPERM while O_NOATIME was requested: remember that this process may
        // not use the flag, then retry once without it (same fallback pattern
        // as hash_file_raw).
        if err.raw_os_error() == Some(libc::EPERM) && flags & libc::O_NOATIME != 0 {
            NOATIME_SUPPORTED.store(false, Ordering::Relaxed);
            // SAFETY: same pointer-validity argument as the first open() above.
            let fd2 = unsafe {
                libc::open(
                    c_path_buf.as_ptr() as *const libc::c_char,
                    libc::O_RDONLY | libc::O_CLOEXEC,
                )
            };
            if fd2 < 0 {
                return Err(io::Error::last_os_error());
            }
            // hash_fd_small takes ownership of the fd and closes it on all paths.
            return hash_fd_small(algo, fd2);
        }
        return Err(err);
    }
    hash_fd_small(algo, fd)
}
1916
/// Read a small file from fd, hash it, close fd. No fstat needed.
/// Takes ownership of `fd`: it is closed on every return path — explicitly for
/// the tiny-file and early-error cases, and via `File`'s Drop once wrapped.
#[cfg(target_os = "linux")]
#[inline]
fn hash_fd_small(algo: HashAlgorithm, fd: i32) -> io::Result<String> {
    let mut buf = [0u8; 4096];
    // First read, retried on EINTR.
    let n = loop {
        // SAFETY: buf is a valid writable region of buf.len() bytes and fd is
        // an open descriptor owned by this function.
        let ret = unsafe { libc::read(fd, buf.as_mut_ptr() as *mut libc::c_void, buf.len()) };
        if ret >= 0 {
            break ret;
        }
        let err = io::Error::last_os_error();
        if err.kind() == io::ErrorKind::Interrupted {
            continue;
        }
        // SAFETY: fd is open and not yet wrapped; close it before erroring out.
        unsafe {
            libc::close(fd);
        }
        return Err(err);
    };
    let n = n as usize;
    if n < buf.len() {
        // File fits in 4KB — common case for small files.
        // NOTE(review): a short read is treated as EOF here; that holds for
        // regular files, which this helper is documented to receive.
        unsafe {
            libc::close(fd);
        }
        return hash_bytes(algo, &buf[..n]);
    }
    // File > 4KB: fall back to hash_file_nostat-style reading.
    // Wrap fd in File for RAII close.
    use std::os::unix::io::FromRawFd;
    // SAFETY: fd is open and no longer used directly after this point; File
    // takes sole ownership and will close it on drop.
    let mut file = unsafe { File::from_raw_fd(fd) };
    let mut big_buf = [0u8; 65536];
    big_buf[..n].copy_from_slice(&buf[..n]);
    let mut total = n;
    loop {
        match std::io::Read::read(&mut file, &mut big_buf[total..]) {
            Ok(0) => return hash_bytes(algo, &big_buf[..total]),
            Ok(n) => {
                total += n;
                if total >= big_buf.len() {
                    // File > 64KB: stream the rest from the same fd, feeding
                    // the bytes read so far as the prefix.
                    return hash_stream_with_prefix(algo, &big_buf[..total], file);
                }
            }
            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
}
1965
/// Hash a single file using raw Linux syscalls for minimum overhead.
/// Bypasses Rust's File abstraction for open: raw open, then size-dispatched I/O.
/// For the single-file fast path, this eliminates the OpenOptions builder and
/// Read trait dispatch for the tiny-file case (one CString allocation remains
/// for the NUL-terminated path).
///
/// Size-based dispatch (see `hash_from_raw_fd`):
/// - Tiny (<8KB): stack buffer + raw read + hash_bytes (3 syscalls total)
/// - Small (8KB-16MB): wraps fd in File, reads into thread-local buffer
/// - Large (>=16MB): wraps fd in File, mmaps with HugePage + PopulateRead
/// - Non-regular: wraps fd in File, streaming hash_reader
#[cfg(target_os = "linux")]
pub fn hash_file_raw(algo: HashAlgorithm, path: &Path) -> io::Result<String> {
    use std::os::unix::ffi::OsStrExt;

    let path_bytes = path.as_os_str().as_bytes();
    let c_path = std::ffi::CString::new(path_bytes)
        .map_err(|_| io::Error::new(io::ErrorKind::InvalidInput, "path contains null byte"))?;

    // Raw open with O_RDONLY | O_CLOEXEC, optionally O_NOATIME (only permitted
    // for the file owner; disabled process-wide after the first EPERM).
    let mut flags = libc::O_RDONLY | libc::O_CLOEXEC;
    if NOATIME_SUPPORTED.load(Ordering::Relaxed) {
        flags |= libc::O_NOATIME;
    }

    // SAFETY: c_path is a valid NUL-terminated string; open() does not retain it.
    let fd = unsafe { libc::open(c_path.as_ptr(), flags) };
    if fd < 0 {
        let err = io::Error::last_os_error();
        // EPERM with O_NOATIME: remember the flag is unusable, retry without it.
        if err.raw_os_error() == Some(libc::EPERM) && flags & libc::O_NOATIME != 0 {
            NOATIME_SUPPORTED.store(false, Ordering::Relaxed);
            // SAFETY: same pointer-validity argument as above.
            let fd2 = unsafe { libc::open(c_path.as_ptr(), libc::O_RDONLY | libc::O_CLOEXEC) };
            if fd2 < 0 {
                return Err(io::Error::last_os_error());
            }
            // hash_from_raw_fd takes ownership of the fd and closes it on all paths.
            return hash_from_raw_fd(algo, fd2);
        }
        return Err(err);
    }
    hash_from_raw_fd(algo, fd)
}
2005
/// Hash from a raw fd — dispatches by file size for optimal I/O strategy.
/// Handles tiny (stack buffer), small (thread-local buffer), large (mmap), and
/// non-regular (streaming) files. Takes ownership of `fd`: it is closed on
/// every path — explicitly until wrapped in `File`, then via Drop.
#[cfg(target_os = "linux")]
fn hash_from_raw_fd(algo: HashAlgorithm, fd: i32) -> io::Result<String> {
    // Raw fstat to determine size and type.
    // SAFETY: libc::stat is plain-old-data, so an all-zero value is valid as
    // an out-parameter; fstat fully initializes it on success.
    let mut stat: libc::stat = unsafe { std::mem::zeroed() };
    if unsafe { libc::fstat(fd, &mut stat) } != 0 {
        let err = io::Error::last_os_error();
        // SAFETY: fd is open and owned by this function.
        unsafe {
            libc::close(fd);
        }
        return Err(err);
    }
    let size = stat.st_size as u64;
    let is_regular = (stat.st_mode & libc::S_IFMT) == libc::S_IFREG;

    // Empty regular file: nothing to read, hash the empty input.
    if is_regular && size == 0 {
        unsafe {
            libc::close(fd);
        }
        return hash_bytes(algo, &[]);
    }

    // Tiny files (<8KB): raw read into stack buffer, no File wrapper needed.
    // Entire I/O in 3 raw syscalls: open + read + close.
    // NOTE(review): correctness of the buffer indexing relies on
    // TINY_FILE_LIMIT <= 8192 (the stack buffer size) — the read count is
    // derived from `size`, not from the remaining buffer space; confirm the
    // constant where it is declared.
    if is_regular && size < TINY_FILE_LIMIT {
        let mut buf = [0u8; 8192];
        let mut total = 0usize;
        while total < size as usize {
            // SAFETY: buf[total..] is valid writable memory; the count requested
            // never exceeds the file size established by fstat above.
            let n = unsafe {
                libc::read(
                    fd,
                    buf[total..].as_mut_ptr() as *mut libc::c_void,
                    (size as usize) - total,
                )
            };
            if n < 0 {
                let err = io::Error::last_os_error();
                if err.kind() == io::ErrorKind::Interrupted {
                    continue;
                }
                unsafe {
                    libc::close(fd);
                }
                return Err(err);
            }
            if n == 0 {
                // Unexpected EOF (file shrank since fstat): hash what we got.
                break;
            }
            total += n as usize;
        }
        unsafe {
            libc::close(fd);
        }
        return hash_bytes(algo, &buf[..total]);
    }

    // For larger files, wrap fd in File for RAII close and existing optimized paths.
    use std::os::unix::io::FromRawFd;
    // SAFETY: fd is open and not used directly after this point; File takes
    // sole ownership and closes it on drop.
    let file = unsafe { File::from_raw_fd(fd) };

    if is_regular && size > 0 {
        return hash_regular_file(algo, file, size);
    }

    // Non-regular files (pipes, devices, …): streaming hash.
    hash_reader(algo, file)
}
2076
2077/// Issue readahead hints for ALL file paths (no size threshold).
2078/// For multi-file benchmarks, even small files benefit from batched readahead.
2079#[cfg(target_os = "linux")]
2080pub fn readahead_files_all(paths: &[&Path]) {
2081    use std::os::unix::io::AsRawFd;
2082    for path in paths {
2083        if let Ok(file) = open_noatime(path) {
2084            if let Ok(meta) = file.metadata() {
2085                if meta.file_type().is_file() {
2086                    let len = meta.len();
2087                    unsafe {
2088                        libc::posix_fadvise(
2089                            file.as_raw_fd(),
2090                            0,
2091                            len as i64,
2092                            libc::POSIX_FADV_WILLNEED,
2093                        );
2094                    }
2095                }
2096            }
2097        }
2098    }
2099}
2100
/// No-op on non-Linux targets: posix_fadvise-based readahead is a Linux-only
/// optimization, so other platforms simply skip the hint.
#[cfg(not(target_os = "linux"))]
pub fn readahead_files_all(_paths: &[&Path]) {}
2103
/// Print a hash result in GNU format: "hash  filename\n".
/// Emits raw byte writes so no std::fmt machinery is involved.
pub fn print_hash(
    out: &mut impl Write,
    hash: &str,
    filename: &str,
    binary: bool,
) -> io::Result<()> {
    // GNU separator: space + '*' in binary mode, two spaces in text mode.
    let sep = [b' ', if binary { b'*' } else { b' ' }];
    out.write_all(hash.as_bytes())?;
    out.write_all(&sep)?;
    out.write_all(filename.as_bytes())?;
    out.write_all(b"\n")
}
2118
/// Print a hash in GNU format, terminated by NUL instead of newline.
pub fn print_hash_zero(
    out: &mut impl Write,
    hash: &str,
    filename: &str,
    binary: bool,
) -> io::Result<()> {
    // " *" marks binary mode; two spaces mark text mode.
    let sep: &[u8; 2] = if binary { b" *" } else { b"  " };
    out.write_all(hash.as_bytes())?;
    out.write_all(sep)?;
    out.write_all(filename.as_bytes())?;
    out.write_all(b"\0")
}
2132
2133// ── Single-write output buffer ─────────────────────────────────────
2134// For multi-file workloads, batch the entire "hash  filename\n" line into
2135// a single write() call. This halves the number of BufWriter flushes.
2136
2137// Thread-local output line buffer for batched writes.
2138// Reused across files to avoid per-file allocation.
thread_local! {
    // Per-thread output line buffer; write_hash_line/write_hash_tag_line clear
    // and refill it per call, so it is reused across files without reallocating.
    static LINE_BUF: RefCell<Vec<u8>> = RefCell::new(Vec::with_capacity(256));
}
2142
2143/// Build and write the standard GNU hash output line in a single write() call.
2144/// Format: "hash  filename\n" or "hash *filename\n" (binary mode).
2145/// For escaped filenames: "\hash  escaped_filename\n".
2146#[inline]
2147pub fn write_hash_line(
2148    out: &mut impl Write,
2149    hash: &str,
2150    filename: &str,
2151    binary: bool,
2152    zero: bool,
2153    escaped: bool,
2154) -> io::Result<()> {
2155    LINE_BUF.with(|cell| {
2156        let mut buf = cell.borrow_mut();
2157        buf.clear();
2158        let mode = if binary { b'*' } else { b' ' };
2159        let term = if zero { b'\0' } else { b'\n' };
2160        if escaped {
2161            buf.push(b'\\');
2162        }
2163        buf.extend_from_slice(hash.as_bytes());
2164        buf.push(b' ');
2165        buf.push(mode);
2166        buf.extend_from_slice(filename.as_bytes());
2167        buf.push(term);
2168        out.write_all(&buf)
2169    })
2170}
2171
2172/// Build and write BSD tag format output in a single write() call.
2173/// Format: "ALGO (filename) = hash\n"
2174#[inline]
2175pub fn write_hash_tag_line(
2176    out: &mut impl Write,
2177    algo_name: &str,
2178    hash: &str,
2179    filename: &str,
2180    zero: bool,
2181) -> io::Result<()> {
2182    LINE_BUF.with(|cell| {
2183        let mut buf = cell.borrow_mut();
2184        buf.clear();
2185        let term = if zero { b'\0' } else { b'\n' };
2186        buf.extend_from_slice(algo_name.as_bytes());
2187        buf.extend_from_slice(b" (");
2188        buf.extend_from_slice(filename.as_bytes());
2189        buf.extend_from_slice(b") = ");
2190        buf.extend_from_slice(hash.as_bytes());
2191        buf.push(term);
2192        out.write_all(&buf)
2193    })
2194}
2195
2196/// Print hash result in BSD tag format: "ALGO (filename) = hash\n"
2197pub fn print_hash_tag(
2198    out: &mut impl Write,
2199    algo: HashAlgorithm,
2200    hash: &str,
2201    filename: &str,
2202) -> io::Result<()> {
2203    out.write_all(algo.name().as_bytes())?;
2204    out.write_all(b" (")?;
2205    out.write_all(filename.as_bytes())?;
2206    out.write_all(b") = ")?;
2207    out.write_all(hash.as_bytes())?;
2208    out.write_all(b"\n")
2209}
2210
2211/// Print hash in BSD tag format with NUL terminator.
2212pub fn print_hash_tag_zero(
2213    out: &mut impl Write,
2214    algo: HashAlgorithm,
2215    hash: &str,
2216    filename: &str,
2217) -> io::Result<()> {
2218    out.write_all(algo.name().as_bytes())?;
2219    out.write_all(b" (")?;
2220    out.write_all(filename.as_bytes())?;
2221    out.write_all(b") = ")?;
2222    out.write_all(hash.as_bytes())?;
2223    out.write_all(b"\0")
2224}
2225
/// Print hash in BSD tag format with BLAKE2b length info:
/// "BLAKE2b (filename) = hash" for the default 512-bit digest, or
/// "BLAKE2b-NNN (filename) = hash" for any other length.
pub fn print_hash_tag_b2sum(
    out: &mut impl Write,
    hash: &str,
    filename: &str,
    bits: usize,
) -> io::Result<()> {
    match bits {
        512 => out.write_all(b"BLAKE2b (")?,
        // Rare non-512 path: write!'s formatting overhead is negligible per file.
        other => write!(out, "BLAKE2b-{} (", other)?,
    }
    out.write_all(filename.as_bytes())?;
    out.write_all(b") = ")?;
    out.write_all(hash.as_bytes())?;
    out.write_all(b"\n")
}
2246
/// Print hash in BSD tag format with BLAKE2b length info, NUL-terminated.
pub fn print_hash_tag_b2sum_zero(
    out: &mut impl Write,
    hash: &str,
    filename: &str,
    bits: usize,
) -> io::Result<()> {
    match bits {
        512 => out.write_all(b"BLAKE2b (")?,
        other => write!(out, "BLAKE2b-{} (", other)?,
    }
    out.write_all(filename.as_bytes())?;
    out.write_all(b") = ")?;
    out.write_all(hash.as_bytes())?;
    out.write_all(b"\0")
}
2264
/// Options for check mode (verification against a checksum file).
pub struct CheckOptions {
    /// Suppress the "filename: OK" line for files that verify successfully.
    pub quiet: bool,
    /// Suppress all per-file output; only the returned counts report results.
    pub status_only: bool,
    /// Treat improperly formatted lines as a failure.
    /// NOTE(review): not consulted by `check_file` itself — presumably the
    /// caller uses it together with `CheckResult::format_errors` for the exit
    /// status; confirm at the call site.
    pub strict: bool,
    /// Print a warning to the error stream for each improperly formatted line.
    pub warn: bool,
    /// Skip (and count separately) files that are missing instead of
    /// reporting a read error.
    pub ignore_missing: bool,
    /// Prefix for per-line format warnings, e.g., "fmd5sum: checksums.txt".
    /// When non-empty, warnings use GNU format: "{prefix}: {line}: message".
    /// When empty, uses generic format: "line {line}: message".
    pub warn_prefix: String,
}
2277
/// Result of check mode verification: aggregate counters from `check_file`.
pub struct CheckResult {
    /// Files whose computed hash matched the expected hash.
    pub ok: usize,
    /// Files whose computed hash did not match the expected hash.
    pub mismatches: usize,
    /// Lines that could not be parsed as a checksum line.
    pub format_errors: usize,
    /// Files that could not be opened or read.
    pub read_errors: usize,
    /// Number of files skipped because they were missing and --ignore-missing was set.
    pub ignored_missing: usize,
}
2287
2288/// Verify checksums from a check file.
2289/// Each line should be "hash  filename" or "hash *filename" or "ALGO (filename) = hash".
2290pub fn check_file<R: BufRead>(
2291    algo: HashAlgorithm,
2292    reader: R,
2293    opts: &CheckOptions,
2294    out: &mut impl Write,
2295    err_out: &mut impl Write,
2296) -> io::Result<CheckResult> {
2297    let quiet = opts.quiet;
2298    let status_only = opts.status_only;
2299    let warn = opts.warn;
2300    let ignore_missing = opts.ignore_missing;
2301    let mut ok_count = 0;
2302    let mut mismatch_count = 0;
2303    let mut format_errors = 0;
2304    let mut read_errors = 0;
2305    let mut ignored_missing_count = 0;
2306    let mut line_num = 0;
2307
2308    for line_result in reader.lines() {
2309        line_num += 1;
2310        let line = line_result?;
2311        let line = line.trim_end();
2312
2313        if line.is_empty() {
2314            continue;
2315        }
2316
2317        // Parse "hash  filename" or "hash *filename" or "ALGO (file) = hash"
2318        let (expected_hash, filename) = match parse_check_line(line) {
2319            Some(v) => v,
2320            None => {
2321                format_errors += 1;
2322                if warn {
2323                    out.flush()?;
2324                    if opts.warn_prefix.is_empty() {
2325                        writeln!(
2326                            err_out,
2327                            "line {}: improperly formatted {} checksum line",
2328                            line_num,
2329                            algo.name()
2330                        )?;
2331                    } else {
2332                        writeln!(
2333                            err_out,
2334                            "{}: {}: improperly formatted {} checksum line",
2335                            opts.warn_prefix,
2336                            line_num,
2337                            algo.name()
2338                        )?;
2339                    }
2340                }
2341                continue;
2342            }
2343        };
2344
2345        // Compute actual hash
2346        let actual = match hash_file(algo, Path::new(filename)) {
2347            Ok(h) => h,
2348            Err(e) => {
2349                if ignore_missing && e.kind() == io::ErrorKind::NotFound {
2350                    ignored_missing_count += 1;
2351                    continue;
2352                }
2353                read_errors += 1;
2354                if !status_only {
2355                    out.flush()?;
2356                    writeln!(err_out, "{}: {}", filename, e)?;
2357                    writeln!(out, "{}: FAILED open or read", filename)?;
2358                }
2359                continue;
2360            }
2361        };
2362
2363        if actual.eq_ignore_ascii_case(expected_hash) {
2364            ok_count += 1;
2365            if !quiet && !status_only {
2366                writeln!(out, "{}: OK", filename)?;
2367            }
2368        } else {
2369            mismatch_count += 1;
2370            if !status_only {
2371                writeln!(out, "{}: FAILED", filename)?;
2372            }
2373        }
2374    }
2375
2376    Ok(CheckResult {
2377        ok: ok_count,
2378        mismatches: mismatch_count,
2379        format_errors,
2380        read_errors,
2381        ignored_missing: ignored_missing_count,
2382    })
2383}
2384
/// Parse a checksum line in any supported format.
/// Returns `(expected_hash, filename)` on success.
pub fn parse_check_line(line: &str) -> Option<(&str, &str)> {
    // BSD tag format: "ALGO (filename) = hash"
    const TAG_PREFIXES: [&str; 7] = [
        "MD5 (", "SHA1 (", "SHA224 (", "SHA256 (", "SHA384 (", "SHA512 (", "BLAKE2b (",
    ];
    let tag_rest = TAG_PREFIXES
        .iter()
        .find_map(|p| line.strip_prefix(p))
        .or_else(|| {
            // "BLAKE2b-NNN (filename) = hash" — the suffix must be all digits.
            let after = line.strip_prefix("BLAKE2b-")?;
            let sp = after.find(" (")?;
            after[..sp]
                .bytes()
                .all(|b| b.is_ascii_digit())
                .then(|| &after[sp + 2..])
        });
    if let Some(rest) = tag_rest {
        if let Some(idx) = rest.find(") = ") {
            return Some((&rest[idx + 4..], &rest[..idx]));
        }
    }

    // Handle backslash-escaped lines (leading '\')
    let body = line.strip_prefix('\\').unwrap_or(line);

    // Standard "hash  filename" first, then binary-mode "hash *filename";
    // both separators are exactly two bytes wide.
    for sep in ["  ", " *"] {
        if let Some(idx) = body.find(sep) {
            return Some((&body[..idx], &body[idx + 2..]));
        }
    }
    None
}
2433
/// Parse a BSD-style tag line: "ALGO (filename) = hash"
/// Returns (expected_hash, filename, optional_bits).
/// `bits` is the hash length parsed from the algo name (e.g., BLAKE2b-256 -> Some(256)).
pub fn parse_check_line_tag(line: &str) -> Option<(&str, &str, Option<usize>)> {
    // The algorithm name never contains a space, so the FIRST " (" ends it...
    let paren_start = line.find(" (")?;
    let algo_part = &line[..paren_start];
    let rest = &line[paren_start + 2..];
    // ...while the hash is pure hex and never contains ") = ", so the LAST
    // occurrence is the real separator (rfind handles filenames with ") = ").
    let paren_end = rest.rfind(") = ")?;
    let filename = &rest[..paren_end];
    let hash = &rest[paren_end + 4..];

    // Parse optional bit length from algo name (e.g., "BLAKE2b-256" -> Some(256));
    // a non-numeric suffix simply yields None.
    let bits = if let Some(dash_pos) = algo_part.rfind('-') {
        algo_part[dash_pos + 1..].parse::<usize>().ok()
    } else {
        None
    };

    Some((hash, filename, bits))
}
2454
/// Read as many bytes as possible into buf, retrying on partial reads.
/// Ensures each hash update gets a full buffer (fewer update calls = less overhead).
/// Regular-file reads usually fill the buffer on the first iteration; pipes and
/// slow devices take extra trips through the loop.
///
/// Returns the total number of bytes read; fewer than `buf.len()` only at EOF.
///
/// Note: the previous version propagated `ErrorKind::Interrupted` from the first
/// read while retrying it on subsequent reads; EINTR is now retried uniformly.
#[inline]
fn read_full(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
    let mut total = 0;
    while total < buf.len() {
        match reader.read(&mut buf[total..]) {
            // EOF: return what we have (0 on the first iteration signals stream end).
            Ok(0) => break,
            Ok(n) => total += n,
            // Transient signal interruption — retry the read.
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
    }
    Ok(total)
}
2477
/// Compile-time generated 2-byte hex pair lookup table.
/// Each byte maps directly to its 2-char hex representation — single lookup per byte.
const fn generate_hex_table() -> [[u8; 2]; 256] {
    let hex = b"0123456789abcdef";
    let mut table = [[0u8; 2]; 256];
    let mut i = 0;
    while i < 256 {
        table[i] = [hex[i >> 4], hex[i & 0xf]];
        i += 1;
    }
    table
}

const HEX_TABLE: [[u8; 2]; 256] = generate_hex_table();

/// Fast hex encoding using the 2-byte pair lookup table — one lookup per input byte.
///
/// The previous version called `set_len` on an uninitialized buffer and then
/// formed a `&mut [u8]` over it, which is undefined behavior under Rust's
/// uninitialized-memory rules. A zero-initialized buffer (`alloc_zeroed`) is
/// cheap and sound; `from_utf8_unchecked` still skips the UTF-8 validation pass.
pub(crate) fn hex_encode(bytes: &[u8]) -> String {
    let mut buf = vec![0u8; bytes.len() * 2];
    hex_encode_to_slice(bytes, &mut buf);
    // SAFETY: every byte written comes from HEX_TABLE, which contains only
    // ASCII hex digits, so the buffer is valid UTF-8.
    unsafe { String::from_utf8_unchecked(buf) }
}

/// Encode bytes as hex directly into a pre-allocated output slice.
/// Output slice must be at least `bytes.len() * 2` bytes long; extra bytes
/// beyond that are left untouched.
///
/// Written with safe code: indexing HEX_TABLE with a `u8` (table length 256)
/// needs no bounds check, and `chunks_exact_mut(2)` guarantees each chunk is
/// exactly 2 bytes, so the compiler elides the per-byte checks. The previous
/// raw-pointer version let SAFE callers trigger UB with a too-short `out`.
#[inline]
fn hex_encode_to_slice(bytes: &[u8], out: &mut [u8]) {
    debug_assert!(out.len() >= bytes.len() * 2, "output slice too small");
    for (pair, &b) in out.chunks_exact_mut(2).zip(bytes) {
        pair.copy_from_slice(&HEX_TABLE[b as usize]);
    }
}