Skip to main content

coreutils_rs/hash/
core.rs

1use std::fs::{self, File};
2use std::io::{self, BufRead, BufReader, Read, Write};
3use std::path::Path;
4
5use md5::Md5;
6use memmap2::MmapOptions;
7use sha2::{Digest, Sha256};
8
/// Supported hash algorithms.
///
/// The type is a plain field-less enum, so it is cheap to copy and can be
/// compared, hashed and used as a map/set key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum HashAlgorithm {
    Sha256,
    Md5,
    Blake2b,
}

impl HashAlgorithm {
    /// Returns the algorithm label used in BSD-style tag output and in
    /// diagnostics (`"SHA256"`, `"MD5"` or `"BLAKE2b"`).
    pub fn name(self) -> &'static str {
        match self {
            HashAlgorithm::Sha256 => "SHA256",
            HashAlgorithm::Md5 => "MD5",
            HashAlgorithm::Blake2b => "BLAKE2b",
        }
    }
}
26
/// Minimum size in bytes (64 KiB) at which a regular file is memory-mapped
/// rather than read through a buffer; files of exactly this size are mapped.
const MMAP_THRESHOLD: u64 = 1 << 16;
29
30// ── Generic hash helpers ────────────────────────────────────────────
31
32fn hash_digest<D: Digest>(data: &[u8]) -> String {
33    hex_encode(&D::digest(data))
34}
35
36fn hash_reader_impl<D: Digest>(mut reader: impl Read) -> io::Result<String> {
37    let mut hasher = D::new();
38    let mut buf = vec![0u8; 8 * 1024 * 1024]; // 8MB buffer for better throughput
39    loop {
40        let n = reader.read(&mut buf)?;
41        if n == 0 {
42            break;
43        }
44        hasher.update(&buf[..n]);
45    }
46    Ok(hex_encode(&hasher.finalize()))
47}
48
49// ── Public hashing API ──────────────────────────────────────────────
50
51/// Compute hash of a byte slice directly (zero-copy fast path).
52pub fn hash_bytes(algo: HashAlgorithm, data: &[u8]) -> String {
53    match algo {
54        HashAlgorithm::Sha256 => hash_digest::<Sha256>(data),
55        HashAlgorithm::Md5 => hash_digest::<Md5>(data),
56        HashAlgorithm::Blake2b => {
57            let hash = blake2b_simd::blake2b(data);
58            hex_encode(hash.as_bytes())
59        }
60    }
61}
62
63/// Compute hash of data from a reader, returning hex string.
64pub fn hash_reader<R: Read>(algo: HashAlgorithm, reader: R) -> io::Result<String> {
65    match algo {
66        HashAlgorithm::Sha256 => hash_reader_impl::<Sha256>(reader),
67        HashAlgorithm::Md5 => hash_reader_impl::<Md5>(reader),
68        HashAlgorithm::Blake2b => blake2b_hash_reader(reader, 64),
69    }
70}
71
72/// Hash a file by path using mmap for large files. Returns the hex digest.
73pub fn hash_file(algo: HashAlgorithm, path: &Path) -> io::Result<String> {
74    let metadata = fs::metadata(path)?;
75    let len = metadata.len();
76    let is_regular = metadata.file_type().is_file();
77
78    // mmap fast path for regular files >= 64KB
79    if is_regular && len >= MMAP_THRESHOLD {
80        let file = File::open(path)?;
81        match unsafe { MmapOptions::new().map(&file) } {
82            Ok(mmap) => {
83                #[cfg(target_os = "linux")]
84                {
85                    let _ = mmap.advise(memmap2::Advice::Sequential);
86                }
87                return Ok(hash_bytes(algo, &mmap));
88            }
89            Err(_) => {
90                let reader = BufReader::with_capacity(8 * 1024 * 1024, file);
91                return hash_reader(algo, reader);
92            }
93        }
94    }
95
96    // Small regular files: read into memory directly
97    if is_regular && len > 0 {
98        let data = fs::read(path)?;
99        return Ok(hash_bytes(algo, &data));
100    }
101
102    // Fallback: buffered read (special files, empty files, etc.)
103    let file = File::open(path)?;
104    let reader = BufReader::with_capacity(8 * 1024 * 1024, file);
105    hash_reader(algo, reader)
106}
107
108/// Hash stdin. Returns the hex digest.
109pub fn hash_stdin(algo: HashAlgorithm) -> io::Result<String> {
110    hash_reader(algo, io::stdin().lock())
111}
112
113/// Issue readahead hints for a list of file paths to warm the page cache.
114/// This should be called before parallel hashing to reduce I/O stalls.
115#[cfg(target_os = "linux")]
116pub fn readahead_files(paths: &[&Path]) {
117    use std::os::unix::io::AsRawFd;
118    for path in paths {
119        if let Ok(file) = File::open(path) {
120            if let Ok(meta) = file.metadata() {
121                let len = meta.len();
122                if meta.file_type().is_file() && len > 0 {
123                    unsafe {
124                        libc::readahead(file.as_raw_fd(), 0, len as usize);
125                    }
126                }
127            }
128        }
129    }
130}
131
/// Stand-in used on every target other than Linux: it accepts the same
/// arguments and does nothing, so callers can invoke it unconditionally.
#[cfg(not(target_os = "linux"))]
pub fn readahead_files(_paths: &[&Path]) {}
136
137// --- BLAKE2b variable-length functions (using blake2b_simd) ---
138
139/// Hash raw data with BLAKE2b variable output length.
140/// `output_bytes` is the output size in bytes (e.g., 32 for 256-bit).
141pub fn blake2b_hash_data(data: &[u8], output_bytes: usize) -> String {
142    let hash = blake2b_simd::Params::new()
143        .hash_length(output_bytes)
144        .hash(data);
145    hex_encode(hash.as_bytes())
146}
147
148/// Hash a reader with BLAKE2b variable output length.
149pub fn blake2b_hash_reader<R: Read>(mut reader: R, output_bytes: usize) -> io::Result<String> {
150    let mut state = blake2b_simd::Params::new()
151        .hash_length(output_bytes)
152        .to_state();
153    let mut buf = vec![0u8; 8 * 1024 * 1024]; // 8MB buffer
154    loop {
155        let n = reader.read(&mut buf)?;
156        if n == 0 {
157            break;
158        }
159        state.update(&buf[..n]);
160    }
161    Ok(hex_encode(state.finalize().as_bytes()))
162}
163
164/// Hash a file with BLAKE2b variable output length using mmap.
165pub fn blake2b_hash_file(path: &Path, output_bytes: usize) -> io::Result<String> {
166    let metadata = fs::metadata(path)?;
167    let len = metadata.len();
168    let is_regular = metadata.file_type().is_file();
169
170    if is_regular && len >= MMAP_THRESHOLD {
171        let file = File::open(path)?;
172        match unsafe { MmapOptions::new().map(&file) } {
173            Ok(mmap) => {
174                #[cfg(target_os = "linux")]
175                {
176                    let _ = mmap.advise(memmap2::Advice::Sequential);
177                }
178                return Ok(blake2b_hash_data(&mmap, output_bytes));
179            }
180            Err(_) => {
181                let reader = BufReader::with_capacity(8 * 1024 * 1024, file);
182                return blake2b_hash_reader(reader, output_bytes);
183            }
184        }
185    }
186
187    if is_regular && len > 0 {
188        let data = fs::read(path)?;
189        return Ok(blake2b_hash_data(&data, output_bytes));
190    }
191
192    let file = File::open(path)?;
193    let reader = BufReader::with_capacity(8 * 1024 * 1024, file);
194    blake2b_hash_reader(reader, output_bytes)
195}
196
197/// Hash stdin with BLAKE2b variable output length.
198pub fn blake2b_hash_stdin(output_bytes: usize) -> io::Result<String> {
199    blake2b_hash_reader(io::stdin().lock(), output_bytes)
200}
201
/// Writes one checksum line in GNU layout, terminated by a newline:
/// `hash`, a space, a mode marker (`*` when `binary`, otherwise a second
/// space), then `filename`.
///
/// # Errors
/// Returns any error produced by writing to `out`.
pub fn print_hash(
    out: &mut impl Write,
    hash: &str,
    filename: &str,
    binary: bool,
) -> io::Result<()> {
    let marker = match binary {
        true => '*',
        false => ' ',
    };
    writeln!(out, "{} {}{}", hash, marker, filename)
}
212
/// Writes one checksum line in GNU layout, terminated by a NUL byte instead
/// of a newline: `hash`, a space, a mode marker (`*` when `binary`, otherwise
/// a second space), then `filename`.
///
/// # Errors
/// Returns any error produced by writing to `out`.
pub fn print_hash_zero(
    out: &mut impl Write,
    hash: &str,
    filename: &str,
    binary: bool,
) -> io::Result<()> {
    let marker = match binary {
        true => '*',
        false => ' ',
    };
    write!(out, "{} {}{}\0", hash, marker, filename)
}
223
224/// Print hash result in BSD tag format: "ALGO (filename) = hash\n"
225pub fn print_hash_tag(
226    out: &mut impl Write,
227    algo: HashAlgorithm,
228    hash: &str,
229    filename: &str,
230) -> io::Result<()> {
231    writeln!(out, "{} ({}) = {}", algo.name(), filename, hash)
232}
233
234/// Print hash in BSD tag format with NUL terminator.
235pub fn print_hash_tag_zero(
236    out: &mut impl Write,
237    algo: HashAlgorithm,
238    hash: &str,
239    filename: &str,
240) -> io::Result<()> {
241    write!(out, "{} ({}) = {}\0", algo.name(), filename, hash)
242}
243
/// Writes a newline-terminated BSD tag line for a BLAKE2b digest.
///
/// A 512-bit digest uses the bare tag, `BLAKE2b (filename) = hash`; any other
/// `bits` value is appended to the tag, e.g. `BLAKE2b-256 (filename) = hash`.
///
/// # Errors
/// Returns any error produced by writing to `out`.
pub fn print_hash_tag_b2sum(
    out: &mut impl Write,
    hash: &str,
    filename: &str,
    bits: usize,
) -> io::Result<()> {
    match bits {
        512 => writeln!(out, "BLAKE2b ({}) = {}", filename, hash),
        other => writeln!(out, "BLAKE2b-{} ({}) = {}", other, filename, hash),
    }
}
259
/// Writes a NUL-terminated BSD tag line for a BLAKE2b digest.
///
/// A 512-bit digest uses the bare tag, `BLAKE2b (filename) = hash`; any other
/// `bits` value is appended to the tag, e.g. `BLAKE2b-256 (filename) = hash`.
///
/// # Errors
/// Returns any error produced by writing to `out`.
pub fn print_hash_tag_b2sum_zero(
    out: &mut impl Write,
    hash: &str,
    filename: &str,
    bits: usize,
) -> io::Result<()> {
    match bits {
        512 => write!(out, "BLAKE2b ({}) = {}\0", filename, hash),
        other => write!(out, "BLAKE2b-{} ({}) = {}\0", other, filename, hash),
    }
}
273
/// Options for check mode.
///
/// `Default` yields every flag off and an empty `warn_prefix`, so callers can
/// set only what they need: `CheckOptions { warn: true, ..Default::default() }`.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct CheckOptions {
    pub quiet: bool,
    pub status_only: bool,
    pub strict: bool,
    pub warn: bool,
    pub ignore_missing: bool,
    /// Prefix for per-line format warnings, e.g., "fmd5sum: checksums.txt".
    /// When non-empty, warnings use GNU format: "{prefix}: {line}: message".
    /// When empty, uses generic format: "line {line}: message".
    pub warn_prefix: String,
}
286
/// Result of check mode verification.
///
/// A plain bundle of counters; `Default` is the all-zero tally.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct CheckResult {
    pub ok: usize,
    pub mismatches: usize,
    pub format_errors: usize,
    pub read_errors: usize,
    /// Number of files skipped because they were missing and --ignore-missing was set.
    pub ignored_missing: usize,
}
296
297/// Verify checksums from a check file.
298/// Each line should be "hash  filename" or "hash *filename" or "ALGO (filename) = hash".
299pub fn check_file<R: BufRead>(
300    algo: HashAlgorithm,
301    reader: R,
302    opts: &CheckOptions,
303    out: &mut impl Write,
304    err_out: &mut impl Write,
305) -> io::Result<CheckResult> {
306    let quiet = opts.quiet;
307    let status_only = opts.status_only;
308    let warn = opts.warn;
309    let ignore_missing = opts.ignore_missing;
310    let mut ok_count = 0;
311    let mut mismatch_count = 0;
312    let mut format_errors = 0;
313    let mut read_errors = 0;
314    let mut ignored_missing_count = 0;
315    let mut line_num = 0;
316
317    for line_result in reader.lines() {
318        line_num += 1;
319        let line = line_result?;
320        let line = line.trim_end();
321
322        if line.is_empty() {
323            continue;
324        }
325
326        // Parse "hash  filename" or "hash *filename" or "ALGO (file) = hash"
327        let (expected_hash, filename) = match parse_check_line(line) {
328            Some(v) => v,
329            None => {
330                format_errors += 1;
331                if warn {
332                    out.flush()?;
333                    if opts.warn_prefix.is_empty() {
334                        writeln!(
335                            err_out,
336                            "line {}: improperly formatted {} checksum line",
337                            line_num,
338                            algo.name()
339                        )?;
340                    } else {
341                        writeln!(
342                            err_out,
343                            "{}: {}: improperly formatted {} checksum line",
344                            opts.warn_prefix,
345                            line_num,
346                            algo.name()
347                        )?;
348                    }
349                }
350                continue;
351            }
352        };
353
354        // Compute actual hash
355        let actual = match hash_file(algo, Path::new(filename)) {
356            Ok(h) => h,
357            Err(e) => {
358                if ignore_missing && e.kind() == io::ErrorKind::NotFound {
359                    ignored_missing_count += 1;
360                    continue;
361                }
362                read_errors += 1;
363                if !status_only {
364                    out.flush()?;
365                    writeln!(err_out, "{}: {}", filename, e)?;
366                    writeln!(out, "{}: FAILED open or read", filename)?;
367                }
368                continue;
369            }
370        };
371
372        if actual.eq_ignore_ascii_case(expected_hash) {
373            ok_count += 1;
374            if !quiet && !status_only {
375                writeln!(out, "{}: OK", filename)?;
376            }
377        } else {
378            mismatch_count += 1;
379            if !status_only {
380                writeln!(out, "{}: FAILED", filename)?;
381            }
382        }
383    }
384
385    Ok(CheckResult {
386        ok: ok_count,
387        mismatches: mismatch_count,
388        format_errors,
389        read_errors,
390        ignored_missing: ignored_missing_count,
391    })
392}
393
/// Parse a checksum line in any supported format.
///
/// Recognised layouts:
/// * BSD tag: `MD5 (file) = hash`, `SHA256 (file) = hash`,
///   `BLAKE2b (file) = hash`, `BLAKE2b-NNN (file) = hash`;
/// * GNU text mode: `hash  file` (two spaces);
/// * GNU binary mode: `hash *file`.
///
/// Returns `Some((expected_hash, filename))`, both borrowed from `line`, or
/// `None` when no layout matches. A single leading `\` on a GNU-style line is
/// stripped, but escape sequences inside the filename are NOT decoded.
pub fn parse_check_line(line: &str) -> Option<(&str, &str)> {
    // Try BSD tag format: "ALGO (filename) = hash"
    let tagged = line
        .strip_prefix("MD5 (")
        .or_else(|| line.strip_prefix("SHA256 ("))
        .or_else(|| line.strip_prefix("BLAKE2b ("))
        .or_else(|| {
            // Handle BLAKE2b-NNN (filename) = hash
            let after = line.strip_prefix("BLAKE2b-")?;
            let sp = after.find(" (")?;
            if after[..sp].bytes().all(|b| b.is_ascii_digit()) {
                Some(&after[sp + 2..])
            } else {
                None
            }
        });
    if let Some(rest) = tagged {
        // The digest is the final field, so split at the LAST ") = ": a
        // filename may itself contain that sequence, a digest cannot.
        if let Some(sep) = rest.rfind(") = ") {
            return Some((&rest[sep + 4..], &rest[..sep]));
        }
    }

    // Handle backslash-escaped lines (leading '\')
    let line = line.strip_prefix('\\').unwrap_or(line);

    // GNU formats: the separator is the first space that is followed by a
    // second space (text mode) or by '*' (binary mode). Taking the earliest
    // such position keeps separators-looking text inside the filename (for
    // example a double space) from being mistaken for the real separator.
    let sep = line
        .as_bytes()
        .windows(2)
        .position(|pair| pair[0] == b' ' && (pair[1] == b' ' || pair[1] == b'*'))?;
    // Both separator bytes are ASCII, so `sep` and `sep + 2` are char boundaries.
    Some((&line[..sep], &line[sep + 2..]))
}
438
/// Parse a BSD-style tag line: "ALGO (filename) = hash"
/// Returns (expected_hash, filename, optional_bits).
/// `bits` is the hash length parsed from the algo name (e.g., BLAKE2b-256 -> Some(256)).
///
/// The algorithm name is everything before the first `" ("`; the filename
/// runs up to the LAST `") = "`, because the digest is the final field and a
/// filename may itself contain that sequence. Returns `None` when either
/// delimiter is missing.
pub fn parse_check_line_tag(line: &str) -> Option<(&str, &str, Option<usize>)> {
    let paren_start = line.find(" (")?;
    let algo_part = &line[..paren_start];
    let rest = &line[paren_start + 2..];
    let paren_end = rest.rfind(") = ")?;
    let filename = &rest[..paren_end];
    let hash = &rest[paren_end + 4..];

    // Parse optional bit length from algo name (e.g., "BLAKE2b-256" -> Some(256));
    // a suffix that is not a number simply yields `None`.
    let bits = algo_part
        .rfind('-')
        .and_then(|dash_pos| algo_part[dash_pos + 1..].parse::<usize>().ok());

    Some((hash, filename, bits))
}
459
/// Lookup table mapping a 4-bit value to its lowercase hex digit.
const HEX_CHARS: &[u8; 16] = b"0123456789abcdef";

/// Renders `bytes` as a lowercase hexadecimal string, two digits per input
/// byte (high nibble first). An empty slice yields an empty string.
pub(crate) fn hex_encode(bytes: &[u8]) -> String {
    let mut encoded = String::with_capacity(bytes.len() * 2);
    for &byte in bytes {
        let high = HEX_CHARS[usize::from(byte >> 4)];
        let low = HEX_CHARS[usize::from(byte & 0x0f)];
        encoded.push(char::from(high));
        encoded.push(char::from(low));
    }
    encoded
}
471}