Skip to main content

coreutils_rs/hash/
core.rs

1use std::fs::{self, File};
2use std::io::{self, BufRead, BufReader, Read, Write};
3use std::path::Path;
4
5use blake2::Blake2b512;
6use md5::Md5;
7use memmap2::MmapOptions;
8use sha2::{Digest, Sha256};
9
/// Hash algorithms supported by the hashing and checking helpers in this module.
///
/// The type is a plain `Copy` tag; it is comparable and hashable so callers can
/// match on it, compare it, or use it as a map key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum HashAlgorithm {
    /// SHA-256.
    Sha256,
    /// MD5.
    Md5,
    /// BLAKE2b.
    Blake2b,
}

impl HashAlgorithm {
    /// Returns the algorithm's display label: `"SHA256"`, `"MD5"` or `"BLAKE2b"`.
    pub fn name(self) -> &'static str {
        match self {
            Self::Sha256 => "SHA256",
            Self::Md5 => "MD5",
            Self::Blake2b => "BLAKE2b",
        }
    }
}
27
/// Minimum size in bytes (64 KiB) at which a regular file is hashed via mmap instead of being read into memory.
29const MMAP_THRESHOLD: u64 = 64 * 1024;
30
31// ── Generic hash helpers ────────────────────────────────────────────
32
33fn hash_digest<D: Digest>(data: &[u8]) -> String {
34    hex_encode(&D::digest(data))
35}
36
37fn hash_reader_impl<D: Digest>(mut reader: impl Read) -> io::Result<String> {
38    let mut hasher = D::new();
39    let mut buf = vec![0u8; 8 * 1024 * 1024]; // 8MB buffer for better throughput
40    loop {
41        let n = reader.read(&mut buf)?;
42        if n == 0 {
43            break;
44        }
45        hasher.update(&buf[..n]);
46    }
47    Ok(hex_encode(&hasher.finalize()))
48}
49
50// ── Public hashing API ──────────────────────────────────────────────
51
52/// Compute hash of a byte slice directly (zero-copy fast path).
53pub fn hash_bytes(algo: HashAlgorithm, data: &[u8]) -> String {
54    match algo {
55        HashAlgorithm::Sha256 => hash_digest::<Sha256>(data),
56        HashAlgorithm::Md5 => hash_digest::<Md5>(data),
57        HashAlgorithm::Blake2b => hash_digest::<Blake2b512>(data),
58    }
59}
60
61/// Compute hash of data from a reader, returning hex string.
62pub fn hash_reader<R: Read>(algo: HashAlgorithm, reader: R) -> io::Result<String> {
63    match algo {
64        HashAlgorithm::Sha256 => hash_reader_impl::<Sha256>(reader),
65        HashAlgorithm::Md5 => hash_reader_impl::<Md5>(reader),
66        HashAlgorithm::Blake2b => hash_reader_impl::<Blake2b512>(reader),
67    }
68}
69
70/// Hash a file by path using mmap for large files. Returns the hex digest.
71pub fn hash_file(algo: HashAlgorithm, path: &Path) -> io::Result<String> {
72    let metadata = fs::metadata(path)?;
73    let len = metadata.len();
74    let is_regular = metadata.file_type().is_file();
75
76    // mmap fast path for regular files >= 64KB
77    if is_regular && len >= MMAP_THRESHOLD {
78        let file = File::open(path)?;
79        match unsafe { MmapOptions::new().map(&file) } {
80            Ok(mmap) => {
81                #[cfg(target_os = "linux")]
82                {
83                    let _ = mmap.advise(memmap2::Advice::Sequential);
84                }
85                return Ok(hash_bytes(algo, &mmap));
86            }
87            Err(_) => {
88                let reader = BufReader::with_capacity(8 * 1024 * 1024, file);
89                return hash_reader(algo, reader);
90            }
91        }
92    }
93
94    // Small regular files: read into memory directly
95    if is_regular && len > 0 {
96        let data = fs::read(path)?;
97        return Ok(hash_bytes(algo, &data));
98    }
99
100    // Fallback: buffered read (special files, empty files, etc.)
101    let file = File::open(path)?;
102    let reader = BufReader::with_capacity(8 * 1024 * 1024, file);
103    hash_reader(algo, reader)
104}
105
106/// Hash stdin. Returns the hex digest.
107pub fn hash_stdin(algo: HashAlgorithm) -> io::Result<String> {
108    hash_reader(algo, io::stdin().lock())
109}
110
111/// Issue readahead hints for a list of file paths to warm the page cache.
112/// This should be called before parallel hashing to reduce I/O stalls.
113#[cfg(target_os = "linux")]
114pub fn readahead_files(paths: &[&Path]) {
115    use std::os::unix::io::AsRawFd;
116    for path in paths {
117        if let Ok(file) = File::open(path) {
118            if let Ok(meta) = file.metadata() {
119                let len = meta.len();
120                if meta.file_type().is_file() && len > 0 {
121                    unsafe {
122                        libc::readahead(file.as_raw_fd(), 0, len as usize);
123                    }
124                }
125            }
126        }
127    }
128}
129
130#[cfg(not(target_os = "linux"))]
131pub fn readahead_files(_paths: &[&Path]) {
132    // No-op on non-Linux
133}
134
135// --- BLAKE2b variable-length functions ---
136
137/// Hash raw data with BLAKE2b variable output length.
138/// `output_bytes` is the output size in bytes (e.g., 32 for 256-bit).
139pub fn blake2b_hash_data(data: &[u8], output_bytes: usize) -> String {
140    use blake2::Blake2bVar;
141    use blake2::digest::{Update, VariableOutput};
142
143    let mut hasher = Blake2bVar::new(output_bytes).expect("Invalid BLAKE2b output size");
144    Update::update(&mut hasher, data);
145    let result = hasher.finalize_boxed();
146    hex_encode(&result)
147}
148
149/// Hash a reader with BLAKE2b variable output length.
150pub fn blake2b_hash_reader<R: Read>(mut reader: R, output_bytes: usize) -> io::Result<String> {
151    use blake2::Blake2bVar;
152    use blake2::digest::{Update, VariableOutput};
153
154    let mut hasher = Blake2bVar::new(output_bytes).expect("Invalid BLAKE2b output size");
155    let mut buf = vec![0u8; 8 * 1024 * 1024]; // 8MB buffer
156    loop {
157        let n = reader.read(&mut buf)?;
158        if n == 0 {
159            break;
160        }
161        Update::update(&mut hasher, &buf[..n]);
162    }
163    Ok(hex_encode(&hasher.finalize_boxed()))
164}
165
166/// Hash a file with BLAKE2b variable output length using mmap.
167pub fn blake2b_hash_file(path: &Path, output_bytes: usize) -> io::Result<String> {
168    let metadata = fs::metadata(path)?;
169    let len = metadata.len();
170    let is_regular = metadata.file_type().is_file();
171
172    if is_regular && len >= MMAP_THRESHOLD {
173        let file = File::open(path)?;
174        match unsafe { MmapOptions::new().map(&file) } {
175            Ok(mmap) => {
176                #[cfg(target_os = "linux")]
177                {
178                    let _ = mmap.advise(memmap2::Advice::Sequential);
179                }
180                return Ok(blake2b_hash_data(&mmap, output_bytes));
181            }
182            Err(_) => {
183                let reader = BufReader::with_capacity(8 * 1024 * 1024, file);
184                return blake2b_hash_reader(reader, output_bytes);
185            }
186        }
187    }
188
189    if is_regular && len > 0 {
190        let data = fs::read(path)?;
191        return Ok(blake2b_hash_data(&data, output_bytes));
192    }
193
194    let file = File::open(path)?;
195    let reader = BufReader::with_capacity(8 * 1024 * 1024, file);
196    blake2b_hash_reader(reader, output_bytes)
197}
198
199/// Hash stdin with BLAKE2b variable output length.
200pub fn blake2b_hash_stdin(output_bytes: usize) -> io::Result<String> {
201    blake2b_hash_reader(io::stdin().lock(), output_bytes)
202}
203
/// Write one GNU-style checksum line terminated by a newline.
///
/// The line is `hash`, a space, a mode marker, then `filename`. The marker is
/// `*` when `binary` is set and a space otherwise, giving `"hash *file"` or
/// `"hash  file"`.
pub fn print_hash(
    out: &mut impl Write,
    hash: &str,
    filename: &str,
    binary: bool,
) -> io::Result<()> {
    let marker: char = if !binary { ' ' } else { '*' };
    writeln!(out, "{} {}{}", hash, marker, filename)
}
214
/// Write one GNU-style checksum record terminated by a NUL byte instead of a newline.
///
/// The record is `hash`, a space, a mode marker, then `filename`. The marker is
/// `*` when `binary` is set and a space otherwise.
pub fn print_hash_zero(
    out: &mut impl Write,
    hash: &str,
    filename: &str,
    binary: bool,
) -> io::Result<()> {
    let marker: char = if !binary { ' ' } else { '*' };
    write!(out, "{} {}{}\0", hash, marker, filename)
}
225
226/// Print hash result in BSD tag format: "ALGO (filename) = hash\n"
227pub fn print_hash_tag(
228    out: &mut impl Write,
229    algo: HashAlgorithm,
230    hash: &str,
231    filename: &str,
232) -> io::Result<()> {
233    writeln!(out, "{} ({}) = {}", algo.name(), filename, hash)
234}
235
236/// Print hash in BSD tag format with NUL terminator.
237pub fn print_hash_tag_zero(
238    out: &mut impl Write,
239    algo: HashAlgorithm,
240    hash: &str,
241    filename: &str,
242) -> io::Result<()> {
243    write!(out, "{} ({}) = {}\0", algo.name(), filename, hash)
244}
245
/// Write a newline-terminated BSD-style tagged line for a BLAKE2b digest.
///
/// A 512-bit digest uses the bare tag, `BLAKE2b (filename) = hash`; any other
/// `bits` value is appended to the tag, e.g. `BLAKE2b-256 (filename) = hash`.
pub fn print_hash_tag_b2sum(
    out: &mut impl Write,
    hash: &str,
    filename: &str,
    bits: usize,
) -> io::Result<()> {
    match bits {
        512 => writeln!(out, "BLAKE2b ({}) = {}", filename, hash),
        _ => writeln!(out, "BLAKE2b-{} ({}) = {}", bits, filename, hash),
    }
}
261
/// Write a NUL-terminated BSD-style tagged record for a BLAKE2b digest.
///
/// A 512-bit digest uses the bare tag, `BLAKE2b (filename) = hash`; any other
/// `bits` value is appended to the tag, e.g. `BLAKE2b-256 (filename) = hash`.
pub fn print_hash_tag_b2sum_zero(
    out: &mut impl Write,
    hash: &str,
    filename: &str,
    bits: usize,
) -> io::Result<()> {
    match bits {
        512 => write!(out, "BLAKE2b ({}) = {}\0", filename, hash),
        _ => write!(out, "BLAKE2b-{} ({}) = {}\0", bits, filename, hash),
    }
}
275
/// Options for check mode.
///
/// `Default` yields every flag off and an empty `warn_prefix`.
#[derive(Debug, Clone, Default)]
pub struct CheckOptions {
    /// Suppress the per-file `OK` lines; failures are still reported.
    pub quiet: bool,
    /// Suppress all per-file output (`OK`, `FAILED`, and read-error messages).
    pub status_only: bool,
    /// Not consulted by `check_file` itself.
    /// NOTE(review): presumably callers use it to turn format errors into a
    /// failing exit status — confirm against the call sites.
    pub strict: bool,
    /// Emit a warning on the error stream for each improperly formatted line.
    pub warn: bool,
    /// Silently skip listed files that do not exist instead of counting a read error.
    pub ignore_missing: bool,
    /// Prefix for per-line format warnings, e.g., "fmd5sum: checksums.txt".
    /// When non-empty, warnings use GNU format: "{prefix}: {line}: message".
    /// When empty, uses generic format: "line {line}: message".
    pub warn_prefix: String,
}
288
/// Result of check mode verification: per-category line counts.
///
/// `Default` yields all counters at zero.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct CheckResult {
    /// Files whose computed digest matched the expected one.
    pub ok: usize,
    /// Files whose computed digest differed from the expected one.
    pub mismatches: usize,
    /// Lines that could not be parsed as a checksum line.
    pub format_errors: usize,
    /// Files that could not be opened or read.
    pub read_errors: usize,
}
296
297/// Verify checksums from a check file.
298/// Each line should be "hash  filename" or "hash *filename" or "ALGO (filename) = hash".
299pub fn check_file<R: BufRead>(
300    algo: HashAlgorithm,
301    reader: R,
302    opts: &CheckOptions,
303    out: &mut impl Write,
304    err_out: &mut impl Write,
305) -> io::Result<CheckResult> {
306    let quiet = opts.quiet;
307    let status_only = opts.status_only;
308    let warn = opts.warn;
309    let ignore_missing = opts.ignore_missing;
310    let mut ok_count = 0;
311    let mut mismatch_count = 0;
312    let mut format_errors = 0;
313    let mut read_errors = 0;
314    let mut line_num = 0;
315
316    for line_result in reader.lines() {
317        line_num += 1;
318        let line = line_result?;
319        let line = line.trim_end();
320
321        if line.is_empty() {
322            continue;
323        }
324
325        // Parse "hash  filename" or "hash *filename" or "ALGO (file) = hash"
326        let (expected_hash, filename) = match parse_check_line(line) {
327            Some(v) => v,
328            None => {
329                format_errors += 1;
330                if warn {
331                    out.flush()?;
332                    if opts.warn_prefix.is_empty() {
333                        writeln!(
334                            err_out,
335                            "line {}: improperly formatted {} checksum line",
336                            line_num,
337                            algo.name()
338                        )?;
339                    } else {
340                        writeln!(
341                            err_out,
342                            "{}: {}: improperly formatted {} checksum line",
343                            opts.warn_prefix,
344                            line_num,
345                            algo.name()
346                        )?;
347                    }
348                }
349                continue;
350            }
351        };
352
353        // Compute actual hash
354        let actual = match hash_file(algo, Path::new(filename)) {
355            Ok(h) => h,
356            Err(e) => {
357                if ignore_missing && e.kind() == io::ErrorKind::NotFound {
358                    continue;
359                }
360                read_errors += 1;
361                if !status_only {
362                    out.flush()?;
363                    writeln!(err_out, "{}: {}", filename, e)?;
364                    writeln!(out, "{}: FAILED open or read", filename)?;
365                }
366                continue;
367            }
368        };
369
370        if actual.eq_ignore_ascii_case(expected_hash) {
371            ok_count += 1;
372            if !quiet && !status_only {
373                writeln!(out, "{}: OK", filename)?;
374            }
375        } else {
376            mismatch_count += 1;
377            if !status_only {
378                writeln!(out, "{}: FAILED", filename)?;
379            }
380        }
381    }
382
383    Ok(CheckResult {
384        ok: ok_count,
385        mismatches: mismatch_count,
386        format_errors,
387        read_errors,
388    })
389}
390
/// Parse a checksum line in any supported format.
///
/// Recognised forms, tried in order:
///
/// 1. BSD tag: `MD5 (file) = hash`, `SHA256 (file) = hash`,
///    `BLAKE2b (file) = hash`, or `BLAKE2b-NNN (file) = hash` with a numeric `NNN`.
///    The filename ends at the *last* `") = "`, so filenames that themselves
///    contain `") = "` are handled.
/// 2. GNU: `hash  file` (text) or `hash *file` (binary). A single leading `\`
///    is dropped first; the filename is returned as written, without unescaping.
///    The line is split at whichever separator occurs first, so a binary-mode
///    filename containing two spaces is kept intact.
///
/// Returns `(expected_hash, filename)`, or `None` if the line matches no format.
/// The hash text is not validated.
pub fn parse_check_line(line: &str) -> Option<(&str, &str)> {
    // Try BSD tag format: "ALGO (filename) = hash"
    let tagged = line
        .strip_prefix("MD5 (")
        .or_else(|| line.strip_prefix("SHA256 ("))
        .or_else(|| line.strip_prefix("BLAKE2b ("))
        .or_else(|| {
            // Handle BLAKE2b-NNN (filename) = hash
            let after = line.strip_prefix("BLAKE2b-")?;
            let sp = after.find(" (")?;
            if after[..sp].bytes().all(|b| b.is_ascii_digit()) {
                Some(&after[sp + 2..])
            } else {
                None
            }
        });
    if let Some(rest) = tagged {
        // Search from the end: a filename may contain ") = ", the hash cannot.
        if let Some(paren_idx) = rest.rfind(") = ") {
            let filename = &rest[..paren_idx];
            let hash = &rest[paren_idx + 4..];
            return Some((hash, filename));
        }
    }

    // Handle backslash-escaped lines (leading '\')
    let line = line.strip_prefix('\\').unwrap_or(line);

    // GNU format: split at the earliest "  " (text) or " *" (binary) separator.
    // Taking the earliest one keeps any later separator-looking text in the filename.
    let text_sep = line.find("  ");
    let binary_sep = line.find(" *");
    let idx = text_sep.into_iter().chain(binary_sep).min()?;
    Some((&line[..idx], &line[idx + 2..]))
}
435
/// Parse a BSD-style tag line: "ALGO (filename) = hash"
///
/// Returns `(expected_hash, filename, optional_bits)`, or `None` if the line
/// has no `" ("` or no `") = "` after it. The algorithm name ends at the first
/// `" ("`; the filename ends at the *last* `") = "`, so filenames that
/// themselves contain `") = "` are handled.
///
/// `bits` is the number after the last `-` in the algorithm name
/// (e.g., `BLAKE2b-256` -> `Some(256)`), or `None` when there is no such number.
pub fn parse_check_line_tag(line: &str) -> Option<(&str, &str, Option<usize>)> {
    let paren_start = line.find(" (")?;
    let algo_part = &line[..paren_start];
    let rest = &line[paren_start + 2..];
    // Search from the end: a filename may contain ") = ", the hash cannot.
    let paren_end = rest.rfind(") = ")?;
    let filename = &rest[..paren_end];
    let hash = &rest[paren_end + 4..];

    // Parse optional bit length from algo name (e.g., "BLAKE2b-256" -> Some(256))
    let bits = algo_part
        .rfind('-')
        .and_then(|dash_pos| algo_part[dash_pos + 1..].parse::<usize>().ok());

    Some((hash, filename, bits))
}
456
/// Lookup table mapping a nibble (0..=15) to its lowercase ASCII hex digit.
const HEX_CHARS: &[u8; 16] = b"0123456789abcdef";

/// Encode `bytes` as a lowercase hex string, two digits per input byte.
pub(crate) fn hex_encode(bytes: &[u8]) -> String {
    let mut encoded = Vec::with_capacity(bytes.len() * 2);
    for &byte in bytes {
        encoded.push(HEX_CHARS[usize::from(byte >> 4)]);
        encoded.push(HEX_CHARS[usize::from(byte & 0x0f)]);
    }
    // SAFETY: every pushed byte comes from HEX_CHARS, which holds only ASCII [0-9a-f].
    unsafe { String::from_utf8_unchecked(encoded) }
}