// coreutils_rs/hash/core.rs

1use std::cell::RefCell;
2use std::fs::{self, File};
3use std::io::{self, BufRead, Read, Write};
4use std::path::Path;
5
6#[cfg(target_os = "linux")]
7use std::sync::atomic::{AtomicBool, Ordering};
8
9use md5::Md5;
10use sha2::{Digest, Sha256};
11
/// Supported hash algorithms.
#[derive(Debug, Clone, Copy)]
pub enum HashAlgorithm {
    Sha256,
    Md5,
    Blake2b,
}

impl HashAlgorithm {
    /// Canonical display name of the algorithm, as used in BSD tag output.
    pub fn name(self) -> &'static str {
        match self {
            Self::Sha256 => "SHA256",
            Self::Md5 => "MD5",
            Self::Blake2b => "BLAKE2b",
        }
    }
}
29
30// ── Generic hash helpers ────────────────────────────────────────────
31
32fn hash_digest<D: Digest>(data: &[u8]) -> String {
33    hex_encode(&D::digest(data))
34}
35
36/// Streaming hash using thread-local 1MB buffer for optimal L2 cache behavior.
37/// 1MB fits in L2 cache on most CPUs, keeping data hot during hash update.
38/// Uses read_full to ensure each update() gets a full buffer, minimizing
39/// per-chunk hasher overhead and maximizing SIMD-friendly aligned updates.
40fn hash_reader_impl<D: Digest>(mut reader: impl Read) -> io::Result<String> {
41    STREAM_BUF.with(|cell| {
42        let mut buf = cell.borrow_mut();
43        let mut hasher = D::new();
44        loop {
45            let n = read_full(&mut reader, &mut buf)?;
46            if n == 0 {
47                break;
48            }
49            hasher.update(&buf[..n]);
50        }
51        Ok(hex_encode(&hasher.finalize()))
52    })
53}
54
55// ── Public hashing API ──────────────────────────────────────────────
56
/// Chunk size for streaming hash I/O.
///
/// 1MB trades syscall count against cache locality: the chunk stays hot in
/// L2/L3 while the hasher consumes it, and fadvise(SEQUENTIAL) lets the
/// kernel prefetch the next chunk asynchronously in the meantime.
const HASH_READ_BUF: usize = 1024 * 1024;

/// Upper bound for the read-whole-file + single-shot-hash fast path.
///
/// Files up to 8MB fit comfortably in memory; one contiguous buffer plays
/// best with the hardware prefetcher and avoids per-chunk `update()` call
/// overhead.
const SINGLE_SHOT_THRESHOLD: u64 = 8 * 1024 * 1024;

// One reusable streaming buffer per thread, allocated on first use and then
// shared by every streaming-hash call made on that thread.
thread_local! {
    static STREAM_BUF: RefCell<Vec<u8>> = RefCell::new(vec![0u8; HASH_READ_BUF]);
}
74
75/// Compute hash of a byte slice directly (zero-copy fast path).
76pub fn hash_bytes(algo: HashAlgorithm, data: &[u8]) -> String {
77    match algo {
78        HashAlgorithm::Sha256 => hash_digest::<Sha256>(data),
79        HashAlgorithm::Md5 => hash_digest::<Md5>(data),
80        HashAlgorithm::Blake2b => {
81            let hash = blake2b_simd::blake2b(data);
82            hex_encode(hash.as_bytes())
83        }
84    }
85}
86
87/// Compute hash of data from a reader, returning hex string.
88pub fn hash_reader<R: Read>(algo: HashAlgorithm, reader: R) -> io::Result<String> {
89    match algo {
90        HashAlgorithm::Sha256 => hash_reader_impl::<Sha256>(reader),
91        HashAlgorithm::Md5 => hash_reader_impl::<Md5>(reader),
92        HashAlgorithm::Blake2b => blake2b_hash_reader(reader, 64),
93    }
94}
95
96/// Track whether O_NOATIME is supported to avoid repeated failed open() attempts.
97/// After the first EPERM, we never try O_NOATIME again (saves one syscall per file).
98#[cfg(target_os = "linux")]
99static NOATIME_SUPPORTED: AtomicBool = AtomicBool::new(true);
100
101/// Open a file with O_NOATIME on Linux to avoid atime update overhead.
102/// Caches whether O_NOATIME works to avoid double-open on every file.
103#[cfg(target_os = "linux")]
104fn open_noatime(path: &Path) -> io::Result<File> {
105    use std::os::unix::fs::OpenOptionsExt;
106    if NOATIME_SUPPORTED.load(Ordering::Relaxed) {
107        match fs::OpenOptions::new()
108            .read(true)
109            .custom_flags(libc::O_NOATIME)
110            .open(path)
111        {
112            Ok(f) => return Ok(f),
113            Err(ref e) if e.raw_os_error() == Some(libc::EPERM) => {
114                // O_NOATIME requires file ownership or CAP_FOWNER — disable globally
115                NOATIME_SUPPORTED.store(false, Ordering::Relaxed);
116            }
117            Err(e) => return Err(e), // Real error, propagate
118        }
119    }
120    File::open(path)
121}
122
/// Non-Linux fallback: a plain open, since O_NOATIME is Linux-specific.
#[cfg(not(target_os = "linux"))]
fn open_noatime(path: &Path) -> io::Result<File> {
    File::open(path)
}
127
128/// Hint the kernel for sequential read access. Non-blocking.
129#[cfg(target_os = "linux")]
130#[inline]
131fn fadvise_sequential(file: &File, len: u64) {
132    use std::os::unix::io::AsRawFd;
133    unsafe {
134        libc::posix_fadvise(file.as_raw_fd(), 0, len as i64, libc::POSIX_FADV_SEQUENTIAL);
135    }
136}
137
/// No-op on platforms without posix_fadvise support in this build.
#[cfg(not(target_os = "linux"))]
#[inline]
fn fadvise_sequential(_file: &File, _len: u64) {}
141
142/// Hash a file by path. Single open + fstat to minimize syscalls.
143/// Uses read() for small files, streaming read+hash for large files.
144/// Replaced mmap with read()+fadvise for better cache behavior:
145/// read() keeps data hot in L2/L3 cache, while mmap suffers page table
146/// and TLB overhead for sequential single-pass workloads.
147pub fn hash_file(algo: HashAlgorithm, path: &Path) -> io::Result<String> {
148    // Single open — reuse fd for fstat + read (saves separate stat + open)
149    let file = open_noatime(path)?;
150    let metadata = file.metadata()?; // fstat on existing fd, cheaper than stat(path)
151    let len = metadata.len();
152    let is_regular = metadata.file_type().is_file();
153
154    if is_regular && len == 0 {
155        return Ok(hash_bytes(algo, &[]));
156    }
157
158    if is_regular && len > 0 {
159        // Files up to 8MB: read entirely + single-shot hash.
160        // Contiguous data enables optimal CPU prefetching and avoids
161        // per-chunk hasher.update() overhead. Vec::with_capacity uses
162        // mmap(MAP_ANONYMOUS) for large allocs — kernel zero-page COW
163        // means pages are only physically allocated on first write (by read).
164        if len <= SINGLE_SHOT_THRESHOLD {
165            fadvise_sequential(&file, len);
166            let mut buf = Vec::with_capacity(len as usize);
167            Read::read_to_end(&mut &file, &mut buf)?;
168            return Ok(hash_bytes(algo, &buf));
169        }
170
171        // Large files (>8MB): streaming read with kernel readahead hint.
172        // fadvise(SEQUENTIAL) enables aggressive readahead (2x default).
173        fadvise_sequential(&file, len);
174        return hash_reader(algo, file);
175    }
176
177    // Fallback: streaming read (special files, pipes, etc.) — fd already open
178    hash_reader(algo, file)
179}
180
181/// Hash stdin. Uses fadvise for file redirects, streaming for pipes.
182pub fn hash_stdin(algo: HashAlgorithm) -> io::Result<String> {
183    let stdin = io::stdin();
184    // Hint kernel for sequential access if stdin is a regular file (redirect)
185    #[cfg(target_os = "linux")]
186    {
187        use std::os::unix::io::AsRawFd;
188        let fd = stdin.as_raw_fd();
189        let mut stat: libc::stat = unsafe { std::mem::zeroed() };
190        if unsafe { libc::fstat(fd, &mut stat) } == 0
191            && (stat.st_mode & libc::S_IFMT) == libc::S_IFREG
192            && stat.st_size > 0
193        {
194            unsafe {
195                libc::posix_fadvise(fd, 0, stat.st_size, libc::POSIX_FADV_SEQUENTIAL);
196            }
197        }
198    }
199    // Streaming hash — works for both pipe and file-redirect stdin
200    hash_reader(algo, stdin.lock())
201}
202
/// Rough total-size estimate used for the parallel/sequential decision.
/// Heuristic: stat only the first path and assume the rest are similar.
/// Returns 0 when the list is empty or the first stat fails.
pub fn estimate_total_size(paths: &[&Path]) -> u64 {
    match paths.first() {
        None => 0,
        Some(first) => fs::metadata(first)
            .map(|meta| meta.len().saturating_mul(paths.len() as u64))
            .unwrap_or(0),
    }
}
217
218/// Check if parallel hashing is worthwhile for the given file paths.
219/// Only uses rayon when files are individually large enough for the hash
220/// computation to dominate over rayon overhead (thread pool init + work stealing).
221/// For many small files (e.g., 100 × 100KB), sequential is much faster.
222pub fn should_use_parallel(paths: &[&Path]) -> bool {
223    if paths.len() < 2 {
224        return false;
225    }
226    let total = estimate_total_size(paths);
227    let avg = total / paths.len() as u64;
228    // Only parallelize when average file size >= 1MB.
229    // Below this, rayon overhead exceeds the benefit of parallel hashing.
230    avg >= 1024 * 1024
231}
232
233/// Issue readahead hints for a list of file paths to warm the page cache.
234/// Uses POSIX_FADV_WILLNEED which is non-blocking and batches efficiently.
235#[cfg(target_os = "linux")]
236pub fn readahead_files(paths: &[&Path]) {
237    use std::os::unix::io::AsRawFd;
238    for path in paths {
239        if let Ok(file) = open_noatime(path) {
240            if let Ok(meta) = file.metadata() {
241                let len = meta.len();
242                if meta.file_type().is_file() && len > 0 {
243                    unsafe {
244                        libc::posix_fadvise(
245                            file.as_raw_fd(),
246                            0,
247                            len as i64,
248                            libc::POSIX_FADV_WILLNEED,
249                        );
250                    }
251                }
252            }
253        }
254    }
255}
256
/// Readahead is a Linux-only optimization; elsewhere this does nothing.
#[cfg(not(target_os = "linux"))]
pub fn readahead_files(_paths: &[&Path]) {}
261
262// --- BLAKE2b variable-length functions (using blake2b_simd) ---
263
264/// Hash raw data with BLAKE2b variable output length.
265/// `output_bytes` is the output size in bytes (e.g., 32 for 256-bit).
266pub fn blake2b_hash_data(data: &[u8], output_bytes: usize) -> String {
267    let hash = blake2b_simd::Params::new()
268        .hash_length(output_bytes)
269        .hash(data);
270    hex_encode(hash.as_bytes())
271}
272
273/// Hash a reader with BLAKE2b variable output length.
274/// Uses thread-local 1MB buffer for cache-friendly streaming.
275pub fn blake2b_hash_reader<R: Read>(mut reader: R, output_bytes: usize) -> io::Result<String> {
276    STREAM_BUF.with(|cell| {
277        let mut buf = cell.borrow_mut();
278        let mut state = blake2b_simd::Params::new()
279            .hash_length(output_bytes)
280            .to_state();
281        loop {
282            let n = read_full(&mut reader, &mut buf)?;
283            if n == 0 {
284                break;
285            }
286            state.update(&buf[..n]);
287        }
288        Ok(hex_encode(state.finalize().as_bytes()))
289    })
290}
291
292/// Hash a file with BLAKE2b variable output length. Single open + fstat.
293/// Uses read() for small files, streaming read+hash for large.
294pub fn blake2b_hash_file(path: &Path, output_bytes: usize) -> io::Result<String> {
295    // Single open — reuse fd for fstat + read
296    let file = open_noatime(path)?;
297    let metadata = file.metadata()?;
298    let len = metadata.len();
299    let is_regular = metadata.file_type().is_file();
300
301    if is_regular && len == 0 {
302        return Ok(blake2b_hash_data(&[], output_bytes));
303    }
304
305    if is_regular && len > 0 {
306        // Files up to 8MB: read entirely + single-shot hash.
307        if len <= SINGLE_SHOT_THRESHOLD {
308            fadvise_sequential(&file, len);
309            let mut buf = Vec::with_capacity(len as usize);
310            Read::read_to_end(&mut &file, &mut buf)?;
311            return Ok(blake2b_hash_data(&buf, output_bytes));
312        }
313
314        // Large files (>8MB): streaming read with kernel readahead hint
315        fadvise_sequential(&file, len);
316        return blake2b_hash_reader(file, output_bytes);
317    }
318
319    // Fallback: streaming read — fd already open
320    blake2b_hash_reader(file, output_bytes)
321}
322
323/// Hash stdin with BLAKE2b variable output length.
324/// Tries fadvise if stdin is a regular file (shell redirect), then streams.
325pub fn blake2b_hash_stdin(output_bytes: usize) -> io::Result<String> {
326    let stdin = io::stdin();
327    #[cfg(target_os = "linux")]
328    {
329        use std::os::unix::io::AsRawFd;
330        let fd = stdin.as_raw_fd();
331        let mut stat: libc::stat = unsafe { std::mem::zeroed() };
332        if unsafe { libc::fstat(fd, &mut stat) } == 0
333            && (stat.st_mode & libc::S_IFMT) == libc::S_IFREG
334            && stat.st_size > 0
335        {
336            unsafe {
337                libc::posix_fadvise(fd, 0, stat.st_size, libc::POSIX_FADV_SEQUENTIAL);
338            }
339        }
340    }
341    blake2b_hash_reader(stdin.lock(), output_bytes)
342}
343
/// Write one result line in GNU coreutils format: "hash  name" in text mode
/// (two spaces) or "hash *name" in binary mode, newline-terminated.
pub fn print_hash(
    out: &mut impl Write,
    hash: &str,
    filename: &str,
    binary: bool,
) -> io::Result<()> {
    writeln!(
        out,
        "{} {}{}",
        hash,
        if binary { '*' } else { ' ' },
        filename
    )
}
354
/// Write one result line in GNU format, terminated with NUL instead of a
/// newline (for `--zero` output consumed by other tools).
pub fn print_hash_zero(
    out: &mut impl Write,
    hash: &str,
    filename: &str,
    binary: bool,
) -> io::Result<()> {
    let mode = if binary { '*' } else { ' ' };
    write!(out, "{} {}{}\0", hash, mode, filename)
}
365
366/// Print hash result in BSD tag format: "ALGO (filename) = hash\n"
367pub fn print_hash_tag(
368    out: &mut impl Write,
369    algo: HashAlgorithm,
370    hash: &str,
371    filename: &str,
372) -> io::Result<()> {
373    writeln!(out, "{} ({}) = {}", algo.name(), filename, hash)
374}
375
376/// Print hash in BSD tag format with NUL terminator.
377pub fn print_hash_tag_zero(
378    out: &mut impl Write,
379    algo: HashAlgorithm,
380    hash: &str,
381    filename: &str,
382) -> io::Result<()> {
383    write!(out, "{} ({}) = {}\0", algo.name(), filename, hash)
384}
385
/// BSD tag line for b2sum: plain "BLAKE2b" for the default 512-bit digest,
/// "BLAKE2b-NNN" when a non-default bit length was requested.
pub fn print_hash_tag_b2sum(
    out: &mut impl Write,
    hash: &str,
    filename: &str,
    bits: usize,
) -> io::Result<()> {
    match bits {
        512 => writeln!(out, "BLAKE2b ({}) = {}", filename, hash),
        n => writeln!(out, "BLAKE2b-{} ({}) = {}", n, filename, hash),
    }
}
401
/// BSD tag line for b2sum with a NUL terminator instead of a newline.
pub fn print_hash_tag_b2sum_zero(
    out: &mut impl Write,
    hash: &str,
    filename: &str,
    bits: usize,
) -> io::Result<()> {
    match bits {
        512 => write!(out, "BLAKE2b ({}) = {}\0", filename, hash),
        n => write!(out, "BLAKE2b-{} ({}) = {}\0", n, filename, hash),
    }
}
415
/// Behavior flags for check (`--check`) mode.
pub struct CheckOptions {
    /// Suppress per-file "OK" lines (failures are still printed).
    pub quiet: bool,
    /// Print no per-file lines at all; caller reports via exit status.
    pub status_only: bool,
    /// Treat improperly formatted lines as an error — consumed by the
    /// caller when computing the exit status, not inside `check_file`.
    pub strict: bool,
    /// Emit a warning for each improperly formatted checksum line.
    pub warn: bool,
    /// Skip (and count) missing files instead of reporting a read error.
    pub ignore_missing: bool,
    /// Prefix for per-line format warnings, e.g. "fmd5sum: checksums.txt".
    /// Non-empty: GNU-style "{prefix}: {line}: message".
    /// Empty: generic "line {line}: message".
    pub warn_prefix: String,
}
428
/// Tallies produced by check-mode verification.
pub struct CheckResult {
    /// Files whose computed hash matched the expected one.
    pub ok: usize,
    /// Files whose computed hash did not match.
    pub mismatches: usize,
    /// Lines that could not be parsed as a checksum line.
    pub format_errors: usize,
    /// Files that could not be opened or read.
    pub read_errors: usize,
    /// Files skipped because they were missing and --ignore-missing was set.
    pub ignored_missing: usize,
}
438
439/// Verify checksums from a check file.
440/// Each line should be "hash  filename" or "hash *filename" or "ALGO (filename) = hash".
441pub fn check_file<R: BufRead>(
442    algo: HashAlgorithm,
443    reader: R,
444    opts: &CheckOptions,
445    out: &mut impl Write,
446    err_out: &mut impl Write,
447) -> io::Result<CheckResult> {
448    let quiet = opts.quiet;
449    let status_only = opts.status_only;
450    let warn = opts.warn;
451    let ignore_missing = opts.ignore_missing;
452    let mut ok_count = 0;
453    let mut mismatch_count = 0;
454    let mut format_errors = 0;
455    let mut read_errors = 0;
456    let mut ignored_missing_count = 0;
457    let mut line_num = 0;
458
459    for line_result in reader.lines() {
460        line_num += 1;
461        let line = line_result?;
462        let line = line.trim_end();
463
464        if line.is_empty() {
465            continue;
466        }
467
468        // Parse "hash  filename" or "hash *filename" or "ALGO (file) = hash"
469        let (expected_hash, filename) = match parse_check_line(line) {
470            Some(v) => v,
471            None => {
472                format_errors += 1;
473                if warn {
474                    out.flush()?;
475                    if opts.warn_prefix.is_empty() {
476                        writeln!(
477                            err_out,
478                            "line {}: improperly formatted {} checksum line",
479                            line_num,
480                            algo.name()
481                        )?;
482                    } else {
483                        writeln!(
484                            err_out,
485                            "{}: {}: improperly formatted {} checksum line",
486                            opts.warn_prefix,
487                            line_num,
488                            algo.name()
489                        )?;
490                    }
491                }
492                continue;
493            }
494        };
495
496        // Compute actual hash
497        let actual = match hash_file(algo, Path::new(filename)) {
498            Ok(h) => h,
499            Err(e) => {
500                if ignore_missing && e.kind() == io::ErrorKind::NotFound {
501                    ignored_missing_count += 1;
502                    continue;
503                }
504                read_errors += 1;
505                if !status_only {
506                    out.flush()?;
507                    writeln!(err_out, "{}: {}", filename, e)?;
508                    writeln!(out, "{}: FAILED open or read", filename)?;
509                }
510                continue;
511            }
512        };
513
514        if actual.eq_ignore_ascii_case(expected_hash) {
515            ok_count += 1;
516            if !quiet && !status_only {
517                writeln!(out, "{}: OK", filename)?;
518            }
519        } else {
520            mismatch_count += 1;
521            if !status_only {
522                writeln!(out, "{}: FAILED", filename)?;
523            }
524        }
525    }
526
527    Ok(CheckResult {
528        ok: ok_count,
529        mismatches: mismatch_count,
530        format_errors,
531        read_errors,
532        ignored_missing: ignored_missing_count,
533    })
534}
535
/// Parse a checksum line in any supported format.
///
/// Accepted forms:
/// * BSD tag:   `ALGO (filename) = hash` (MD5 / SHA256 / BLAKE2b / BLAKE2b-NNN)
/// * text mode: `hash  filename` (two spaces)
/// * binary:    `hash *filename`
///
/// Returns `(hash, filename)` or `None` if no format matches. A leading
/// backslash (GNU's escaped-filename marker) is stripped, though escape
/// sequences inside the name are left as-is.
///
/// Two robustness fixes over a naive scan:
/// * tag lines split on the LAST `") = "` so filenames containing `") = "`
///   still parse (the hash is always the final field);
/// * standard lines split at the FIRST space (a hex hash never contains
///   one), so binary-mode filenames containing `"  "` parse correctly.
pub fn parse_check_line(line: &str) -> Option<(&str, &str)> {
    // Try BSD tag format first: "ALGO (filename) = hash".
    let rest = line
        .strip_prefix("MD5 (")
        .or_else(|| line.strip_prefix("SHA256 ("))
        .or_else(|| line.strip_prefix("BLAKE2b ("))
        .or_else(|| {
            // "BLAKE2b-NNN (filename) = hash" with a numeric bit length.
            let after = line.strip_prefix("BLAKE2b-")?;
            let sp = after.find(" (")?;
            if after[..sp].bytes().all(|b| b.is_ascii_digit()) {
                Some(&after[sp + 2..])
            } else {
                None
            }
        });
    if let Some(rest) = rest {
        if let Some(paren_idx) = rest.rfind(") = ") {
            let filename = &rest[..paren_idx];
            let hash = &rest[paren_idx + 4..];
            return Some((hash, filename));
        }
    }

    // GNU escaped-filename lines are prefixed with '\'.
    let line = line.strip_prefix('\\').unwrap_or(line);

    // Standard formats: "hash<SP><SP>filename" or "hash<SP>*filename".
    let idx = line.find(' ')?;
    let hash = &line[..idx];
    let mode_and_name = &line[idx + 1..];
    if let Some(name) = mode_and_name.strip_prefix(' ') {
        return Some((hash, name)); // text mode
    }
    if let Some(name) = mode_and_name.strip_prefix('*') {
        return Some((hash, name)); // binary mode
    }
    None
}
580
/// Parse a BSD-style tag line: "ALGO (filename) = hash".
///
/// Returns `(expected_hash, filename, bits)` where `bits` is an optional
/// digest length parsed from an "ALGO-NNN" name (e.g. "BLAKE2b-256" ->
/// `Some(256)`); plain algorithm names yield `None`.
///
/// The line is split on the LAST `") = "` so filenames containing
/// `") = "` still parse — the hash is always the final field.
pub fn parse_check_line_tag(line: &str) -> Option<(&str, &str, Option<usize>)> {
    let paren_start = line.find(" (")?;
    let algo_part = &line[..paren_start];
    let rest = &line[paren_start + 2..];
    let paren_end = rest.rfind(") = ")?;
    let filename = &rest[..paren_end];
    let hash = &rest[paren_end + 4..];

    // Optional digest length suffix in the algorithm name, e.g. "-256".
    let bits = algo_part
        .rfind('-')
        .and_then(|dash| algo_part[dash + 1..].parse::<usize>().ok());

    Some((hash, filename, bits))
}
601
/// Fill `buf` as completely as the reader allows, looping over short reads.
/// Returns the number of bytes placed in `buf`; 0 means EOF. A full buffer
/// per call means fewer, larger hasher updates downstream. EINTR is retried
/// transparently; any other error aborts.
#[inline]
fn read_full(reader: &mut impl Read, buf: &mut [u8]) -> io::Result<usize> {
    let mut filled = 0;
    while filled < buf.len() {
        match reader.read(&mut buf[filled..]) {
            Ok(0) => break, // EOF
            Ok(n) => filled += n,
            Err(e) if e.kind() == io::ErrorKind::Interrupted => {} // retry
            Err(e) => return Err(e),
        }
    }
    Ok(filled)
}
617
/// Compile-time 256-entry table mapping each byte value to its two
/// lowercase hex characters — a single table lookup per input byte.
const fn generate_hex_table() -> [[u8; 2]; 256] {
    let digits = b"0123456789abcdef";
    let mut table = [[0u8; 2]; 256];
    let mut i = 0;
    while i < 256 {
        table[i] = [digits[i >> 4], digits[i & 0xf]];
        i += 1;
    }
    table
}

const HEX_TABLE: [[u8; 2]; 256] = generate_hex_table();

/// Fast lowercase hex encoding via the pair lookup table.
///
/// The output buffer is pre-sized and fully written *before* its length is
/// ever observed — no `set_len` over uninitialized memory — and the final
/// `String` conversion skips the UTF-8 scan, which is sound because the
/// table emits only ASCII.
pub(crate) fn hex_encode(bytes: &[u8]) -> String {
    let mut out = Vec::with_capacity(bytes.len() * 2);
    for &b in bytes {
        out.extend_from_slice(&HEX_TABLE[b as usize]);
    }
    // SAFETY: HEX_TABLE contains only ASCII hex digits, so `out` is valid UTF-8.
    unsafe { String::from_utf8_unchecked(out) }
}