Skip to main content

coreutils_rs/hash/
core.rs

1use std::fs::{self, File};
2use std::io::{self, BufRead, BufReader, Read, Write};
3use std::path::Path;
4
5use md5::Md5;
6use memmap2::MmapOptions;
7use sha2::{Digest, Sha256};
8
/// Hash algorithms this module can compute.
#[derive(Debug, Clone, Copy)]
pub enum HashAlgorithm {
    Sha256,
    Md5,
    Blake2b,
}

impl HashAlgorithm {
    /// Returns the label used for this algorithm in BSD-style tag output
    /// and in check-mode warnings (e.g. `"SHA256"`).
    pub fn name(self) -> &'static str {
        match self {
            Self::Blake2b => "BLAKE2b",
            Self::Md5 => "MD5",
            Self::Sha256 => "SHA256",
        }
    }
}
26
27// ── Generic hash helpers ────────────────────────────────────────────
28
29fn hash_digest<D: Digest>(data: &[u8]) -> String {
30    hex_encode(&D::digest(data))
31}
32
33fn hash_reader_impl<D: Digest>(mut reader: impl Read) -> io::Result<String> {
34    let mut hasher = D::new();
35    let mut buf = vec![0u8; 16 * 1024 * 1024]; // 16MB buffer — fewer syscalls
36    loop {
37        let n = reader.read(&mut buf)?;
38        if n == 0 {
39            break;
40        }
41        hasher.update(&buf[..n]);
42    }
43    Ok(hex_encode(&hasher.finalize()))
44}
45
46// ── Public hashing API ──────────────────────────────────────────────
47
48/// Compute hash of a byte slice directly (zero-copy fast path).
49pub fn hash_bytes(algo: HashAlgorithm, data: &[u8]) -> String {
50    match algo {
51        HashAlgorithm::Sha256 => hash_digest::<Sha256>(data),
52        HashAlgorithm::Md5 => hash_digest::<Md5>(data),
53        HashAlgorithm::Blake2b => {
54            let hash = blake2b_simd::blake2b(data);
55            hex_encode(hash.as_bytes())
56        }
57    }
58}
59
60/// Compute hash of data from a reader, returning hex string.
61pub fn hash_reader<R: Read>(algo: HashAlgorithm, reader: R) -> io::Result<String> {
62    match algo {
63        HashAlgorithm::Sha256 => hash_reader_impl::<Sha256>(reader),
64        HashAlgorithm::Md5 => hash_reader_impl::<Md5>(reader),
65        HashAlgorithm::Blake2b => blake2b_hash_reader(reader, 64),
66    }
67}
68
69/// Hash a file by path using mmap for regular files. Returns the hex digest.
70pub fn hash_file(algo: HashAlgorithm, path: &Path) -> io::Result<String> {
71    let metadata = fs::metadata(path)?;
72    let len = metadata.len();
73    let is_regular = metadata.file_type().is_file();
74
75    // mmap fast path for all regular files with data
76    if is_regular && len > 0 {
77        let file = File::open(path)?;
78        match unsafe {
79            MmapOptions::new()
80                .populate() // Eagerly populate page tables — avoids page faults
81                .map(&file)
82        } {
83            Ok(mmap) => {
84                #[cfg(target_os = "linux")]
85                {
86                    let _ = mmap.advise(memmap2::Advice::Sequential);
87                    // Request transparent huge pages for TLB efficiency on large files
88                    if len >= 2 * 1024 * 1024 {
89                        unsafe {
90                            libc::madvise(
91                                mmap.as_ptr() as *mut libc::c_void,
92                                mmap.len(),
93                                libc::MADV_HUGEPAGE,
94                            );
95                        }
96                    }
97                }
98                return Ok(hash_bytes(algo, &mmap));
99            }
100            Err(_) => {
101                let reader = BufReader::with_capacity(16 * 1024 * 1024, file);
102                return hash_reader(algo, reader);
103            }
104        }
105    }
106
107    // Empty regular files
108    if is_regular {
109        return Ok(hash_bytes(algo, &[]));
110    }
111
112    // Fallback: buffered read (special files, pipes, etc.)
113    let file = File::open(path)?;
114    let reader = BufReader::with_capacity(16 * 1024 * 1024, file);
115    hash_reader(algo, reader)
116}
117
118/// Hash stdin. Reads all data first, then hashes in one pass for optimal throughput.
119pub fn hash_stdin(algo: HashAlgorithm) -> io::Result<String> {
120    // Try to mmap stdin if it's a regular file (shell redirect)
121    #[cfg(unix)]
122    {
123        use std::os::unix::io::AsRawFd;
124        let stdin = io::stdin();
125        let fd = stdin.as_raw_fd();
126        let mut stat: libc::stat = unsafe { std::mem::zeroed() };
127        if unsafe { libc::fstat(fd, &mut stat) } == 0
128            && (stat.st_mode & libc::S_IFMT) == libc::S_IFREG
129            && stat.st_size > 0
130        {
131            use std::os::unix::io::FromRawFd;
132            let file = unsafe { File::from_raw_fd(fd) };
133            let result = unsafe { MmapOptions::new().populate().map(&file) };
134            std::mem::forget(file); // Don't close stdin
135            if let Ok(mmap) = result {
136                #[cfg(target_os = "linux")]
137                {
138                    let _ = mmap.advise(memmap2::Advice::Sequential);
139                }
140                return Ok(hash_bytes(algo, &mmap));
141            }
142        }
143    }
144    // Fallback: read all then hash in one pass (avoids per-read update overhead)
145    let mut data = Vec::new();
146    io::stdin().lock().read_to_end(&mut data)?;
147    Ok(hash_bytes(algo, &data))
148}
149
150/// Issue readahead hints for a list of file paths to warm the page cache.
151/// This should be called before parallel hashing to reduce I/O stalls.
152#[cfg(target_os = "linux")]
153pub fn readahead_files(paths: &[&Path]) {
154    use std::os::unix::io::AsRawFd;
155    for path in paths {
156        if let Ok(file) = File::open(path) {
157            if let Ok(meta) = file.metadata() {
158                let len = meta.len();
159                if meta.file_type().is_file() && len > 0 {
160                    unsafe {
161                        libc::readahead(file.as_raw_fd(), 0, len as usize);
162                    }
163                }
164            }
165        }
166    }
167}
168
169#[cfg(not(target_os = "linux"))]
170pub fn readahead_files(_paths: &[&Path]) {
171    // No-op on non-Linux
172}
173
174// --- BLAKE2b variable-length functions (using blake2b_simd) ---
175
176/// Hash raw data with BLAKE2b variable output length.
177/// `output_bytes` is the output size in bytes (e.g., 32 for 256-bit).
178pub fn blake2b_hash_data(data: &[u8], output_bytes: usize) -> String {
179    let hash = blake2b_simd::Params::new()
180        .hash_length(output_bytes)
181        .hash(data);
182    hex_encode(hash.as_bytes())
183}
184
185/// Hash a reader with BLAKE2b variable output length.
186pub fn blake2b_hash_reader<R: Read>(mut reader: R, output_bytes: usize) -> io::Result<String> {
187    let mut state = blake2b_simd::Params::new()
188        .hash_length(output_bytes)
189        .to_state();
190    let mut buf = vec![0u8; 16 * 1024 * 1024]; // 16MB buffer
191    loop {
192        let n = reader.read(&mut buf)?;
193        if n == 0 {
194            break;
195        }
196        state.update(&buf[..n]);
197    }
198    Ok(hex_encode(state.finalize().as_bytes()))
199}
200
201/// Hash a file with BLAKE2b variable output length using mmap.
202pub fn blake2b_hash_file(path: &Path, output_bytes: usize) -> io::Result<String> {
203    let metadata = fs::metadata(path)?;
204    let len = metadata.len();
205    let is_regular = metadata.file_type().is_file();
206
207    // mmap fast path for all regular files with data
208    if is_regular && len > 0 {
209        let file = File::open(path)?;
210        match unsafe { MmapOptions::new().populate().map(&file) } {
211            Ok(mmap) => {
212                #[cfg(target_os = "linux")]
213                {
214                    let _ = mmap.advise(memmap2::Advice::Sequential);
215                    if len >= 2 * 1024 * 1024 {
216                        unsafe {
217                            libc::madvise(
218                                mmap.as_ptr() as *mut libc::c_void,
219                                mmap.len(),
220                                libc::MADV_HUGEPAGE,
221                            );
222                        }
223                    }
224                }
225                return Ok(blake2b_hash_data(&mmap, output_bytes));
226            }
227            Err(_) => {
228                let reader = BufReader::with_capacity(16 * 1024 * 1024, file);
229                return blake2b_hash_reader(reader, output_bytes);
230            }
231        }
232    }
233
234    // Empty regular files
235    if is_regular {
236        return Ok(blake2b_hash_data(&[], output_bytes));
237    }
238
239    let file = File::open(path)?;
240    let reader = BufReader::with_capacity(16 * 1024 * 1024, file);
241    blake2b_hash_reader(reader, output_bytes)
242}
243
244/// Hash stdin with BLAKE2b variable output length.
245pub fn blake2b_hash_stdin(output_bytes: usize) -> io::Result<String> {
246    blake2b_hash_reader(io::stdin().lock(), output_bytes)
247}
248
/// Write one GNU-format result line: the hash, a space, a mode marker
/// (`*` when `binary`, otherwise a second space), the filename, and a newline.
pub fn print_hash(
    out: &mut impl Write,
    hash: &str,
    filename: &str,
    binary: bool,
) -> io::Result<()> {
    if binary {
        writeln!(out, "{} *{}", hash, filename)
    } else {
        writeln!(out, "{}  {}", hash, filename)
    }
}
259
/// Same layout as the GNU-format line (hash, space, mode marker, filename)
/// but terminated with a NUL byte instead of a newline.
pub fn print_hash_zero(
    out: &mut impl Write,
    hash: &str,
    filename: &str,
    binary: bool,
) -> io::Result<()> {
    if binary {
        write!(out, "{} *{}\0", hash, filename)
    } else {
        write!(out, "{}  {}\0", hash, filename)
    }
}
270
271/// Print hash result in BSD tag format: "ALGO (filename) = hash\n"
272pub fn print_hash_tag(
273    out: &mut impl Write,
274    algo: HashAlgorithm,
275    hash: &str,
276    filename: &str,
277) -> io::Result<()> {
278    writeln!(out, "{} ({}) = {}", algo.name(), filename, hash)
279}
280
281/// Print hash in BSD tag format with NUL terminator.
282pub fn print_hash_tag_zero(
283    out: &mut impl Write,
284    algo: HashAlgorithm,
285    hash: &str,
286    filename: &str,
287) -> io::Result<()> {
288    write!(out, "{} ({}) = {}\0", algo.name(), filename, hash)
289}
290
/// Write a BSD tag-format line for a BLAKE2b digest of `bits` bits.
///
/// A 512-bit digest is tagged plain `BLAKE2b`; any other length is tagged
/// `BLAKE2b-<bits>`. The line ends with a newline.
pub fn print_hash_tag_b2sum(
    out: &mut impl Write,
    hash: &str,
    filename: &str,
    bits: usize,
) -> io::Result<()> {
    match bits {
        512 => writeln!(out, "BLAKE2b ({}) = {}", filename, hash),
        other => writeln!(out, "BLAKE2b-{} ({}) = {}", other, filename, hash),
    }
}
306
/// NUL-terminated variant of the BLAKE2b tag-format line.
///
/// A 512-bit digest is tagged plain `BLAKE2b`; any other length is tagged
/// `BLAKE2b-<bits>`.
pub fn print_hash_tag_b2sum_zero(
    out: &mut impl Write,
    hash: &str,
    filename: &str,
    bits: usize,
) -> io::Result<()> {
    match bits {
        512 => write!(out, "BLAKE2b ({}) = {}\0", filename, hash),
        other => write!(out, "BLAKE2b-{} ({}) = {}\0", other, filename, hash),
    }
}
320
/// Options for check mode.
///
/// `Default` yields every flag off and an empty `warn_prefix`.
#[derive(Debug, Clone, Default)]
pub struct CheckOptions {
    /// Suppress the per-file `OK` lines; failures are still reported.
    pub quiet: bool,
    /// Suppress all per-file result output.
    pub status_only: bool,
    /// Not consulted by `check_file` itself.
    /// NOTE(review): presumably the caller uses this to decide the exit
    /// status when format errors were counted — confirm.
    pub strict: bool,
    /// Emit a warning on `err_out` for each improperly formatted line.
    pub warn: bool,
    /// Silently skip listed files that do not exist.
    pub ignore_missing: bool,
    /// Prefix for per-line format warnings, e.g., "fmd5sum: checksums.txt".
    /// When non-empty, warnings use GNU format: "{prefix}: {line}: message".
    /// When empty, uses generic format: "line {line}: message".
    pub warn_prefix: String,
}
333
/// Result of check mode verification: one counter per outcome.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct CheckResult {
    /// Number of files whose computed hash matched the expected one.
    pub ok: usize,
    /// Number of files whose computed hash differed from the expected one.
    pub mismatches: usize,
    /// Number of lines that could not be parsed as a checksum entry.
    pub format_errors: usize,
    /// Number of files that could not be opened or read.
    pub read_errors: usize,
    /// Number of files skipped because they were missing and --ignore-missing was set.
    pub ignored_missing: usize,
}
343
344/// Verify checksums from a check file.
345/// Each line should be "hash  filename" or "hash *filename" or "ALGO (filename) = hash".
346pub fn check_file<R: BufRead>(
347    algo: HashAlgorithm,
348    reader: R,
349    opts: &CheckOptions,
350    out: &mut impl Write,
351    err_out: &mut impl Write,
352) -> io::Result<CheckResult> {
353    let quiet = opts.quiet;
354    let status_only = opts.status_only;
355    let warn = opts.warn;
356    let ignore_missing = opts.ignore_missing;
357    let mut ok_count = 0;
358    let mut mismatch_count = 0;
359    let mut format_errors = 0;
360    let mut read_errors = 0;
361    let mut ignored_missing_count = 0;
362    let mut line_num = 0;
363
364    for line_result in reader.lines() {
365        line_num += 1;
366        let line = line_result?;
367        let line = line.trim_end();
368
369        if line.is_empty() {
370            continue;
371        }
372
373        // Parse "hash  filename" or "hash *filename" or "ALGO (file) = hash"
374        let (expected_hash, filename) = match parse_check_line(line) {
375            Some(v) => v,
376            None => {
377                format_errors += 1;
378                if warn {
379                    out.flush()?;
380                    if opts.warn_prefix.is_empty() {
381                        writeln!(
382                            err_out,
383                            "line {}: improperly formatted {} checksum line",
384                            line_num,
385                            algo.name()
386                        )?;
387                    } else {
388                        writeln!(
389                            err_out,
390                            "{}: {}: improperly formatted {} checksum line",
391                            opts.warn_prefix,
392                            line_num,
393                            algo.name()
394                        )?;
395                    }
396                }
397                continue;
398            }
399        };
400
401        // Compute actual hash
402        let actual = match hash_file(algo, Path::new(filename)) {
403            Ok(h) => h,
404            Err(e) => {
405                if ignore_missing && e.kind() == io::ErrorKind::NotFound {
406                    ignored_missing_count += 1;
407                    continue;
408                }
409                read_errors += 1;
410                if !status_only {
411                    out.flush()?;
412                    writeln!(err_out, "{}: {}", filename, e)?;
413                    writeln!(out, "{}: FAILED open or read", filename)?;
414                }
415                continue;
416            }
417        };
418
419        if actual.eq_ignore_ascii_case(expected_hash) {
420            ok_count += 1;
421            if !quiet && !status_only {
422                writeln!(out, "{}: OK", filename)?;
423            }
424        } else {
425            mismatch_count += 1;
426            if !status_only {
427                writeln!(out, "{}: FAILED", filename)?;
428            }
429        }
430    }
431
432    Ok(CheckResult {
433        ok: ok_count,
434        mismatches: mismatch_count,
435        format_errors,
436        read_errors,
437        ignored_missing: ignored_missing_count,
438    })
439}
440
/// Parse a checksum line in any supported format.
///
/// Recognised layouts, each optionally preceded by a single `\` (the marker
/// for lines whose filename was escaped; the filename itself is returned
/// as-is, without unescaping):
///
/// * BSD tag: `MD5 (file) = hash`, `SHA256 (file) = hash`,
///   `BLAKE2b (file) = hash`, `BLAKE2b-NNN (file) = hash`
/// * text mode: `hash  file` (two spaces)
/// * binary mode: `hash *file`
///
/// Returns `(expected_hash, filename)` as slices of `line`, or `None` when
/// the line matches none of the layouts. The hash is not validated.
pub fn parse_check_line(line: &str) -> Option<(&str, &str)> {
    // The escape marker precedes the whole line, tag prefix included.
    let line = line.strip_prefix('\\').unwrap_or(line);

    // Try BSD tag format: "ALGO (filename) = hash"
    if let Some(rest) = strip_tag_prefix(line) {
        // Split at the LAST ") = ": a hex digest cannot contain it, but a
        // filename can.
        if let Some(sep) = rest.rfind(") = ") {
            return Some((&rest[sep + 4..], &rest[..sep]));
        }
    }

    // Standard formats: the hash ends at the first space, and the separator
    // is that space plus either a second space (text) or '*' (binary).
    // Anchoring on the first space keeps filenames containing "  " or " *"
    // intact.
    let hash_end = line.find(' ')?;
    let (hash, tail) = line.split_at(hash_end);
    let filename = tail
        .strip_prefix("  ")
        .or_else(|| tail.strip_prefix(" *"))?;
    Some((hash, filename))
}

/// Strip a recognised BSD tag prefix (`ALGO (`), returning what follows the
/// opening parenthesis.
fn strip_tag_prefix(line: &str) -> Option<&str> {
    ["MD5 (", "SHA256 (", "BLAKE2b ("]
        .iter()
        .find_map(|prefix| line.strip_prefix(*prefix))
        .or_else(|| {
            // Handle BLAKE2b-NNN (filename) = hash
            let after = line.strip_prefix("BLAKE2b-")?;
            let sp = after.find(" (")?;
            let digits = &after[..sp];
            if !digits.is_empty() && digits.bytes().all(|b| b.is_ascii_digit()) {
                Some(&after[sp + 2..])
            } else {
                None
            }
        })
}
485
/// Parse a BSD-style tag line: "ALGO (filename) = hash"
///
/// Returns `(expected_hash, filename, optional_bits)` as slices of `line`,
/// or `None` if the line lacks the ` (` … `) = ` structure.
/// `bits` is the hash length parsed from the algo name
/// (e.g., BLAKE2b-256 -> Some(256)); it is `None` when the name has no
/// `-<number>` suffix.
pub fn parse_check_line_tag(line: &str) -> Option<(&str, &str, Option<usize>)> {
    // The algorithm name ends at the first " (".
    let paren_start = line.find(" (")?;
    let algo_part = &line[..paren_start];
    let rest = &line[paren_start + 2..];

    // Split at the LAST ") = ": a hex digest cannot contain it, but a
    // filename can.
    let paren_end = rest.rfind(") = ")?;
    let filename = &rest[..paren_end];
    let hash = &rest[paren_end + 4..];

    // Parse optional bit length from algo name (e.g., "BLAKE2b-256" -> Some(256))
    let bits = algo_part
        .rsplit_once('-')
        .and_then(|(_, suffix)| suffix.parse::<usize>().ok());

    Some((hash, filename, bits))
}
506
/// Compile-time generated 2-byte hex pair lookup table.
/// Each byte maps directly to its 2-char hex representation — single lookup per byte.
const fn generate_hex_table() -> [[u8; 2]; 256] {
    let hex = b"0123456789abcdef";
    let mut table = [[0u8; 2]; 256];
    // `for` loops are not allowed in const fn, hence the manual counter.
    let mut i = 0;
    while i < 256 {
        // High nibble first, then low nibble.
        table[i] = [hex[i >> 4], hex[i & 0xf]];
        i += 1;
    }
    table
}

/// Entry `b` holds the two lowercase ASCII hex digits of byte `b`.
const HEX_TABLE: [[u8; 2]; 256] = generate_hex_table();

/// Hex-encode `bytes` as lowercase ASCII using the 2-byte pair lookup table —
/// one lookup per input byte.
///
/// Written without `unsafe`: a `u8` index into a 256-entry array is always in
/// bounds, and every table entry is an ASCII hex digit, so each can be pushed
/// as a `char`.
pub(crate) fn hex_encode(bytes: &[u8]) -> String {
    let mut hex = String::with_capacity(bytes.len() * 2);
    for &b in bytes {
        let [hi, lo] = HEX_TABLE[usize::from(b)];
        hex.push(char::from(hi));
        hex.push(char::from(lo));
    }
    hex
}