Skip to main content

hash_hunter/
lib.rs

1#![deny(clippy::pedantic)]
2
3use std::collections::BTreeMap;
4use std::fs::File;
5use std::io::{self, BufRead, BufReader, IsTerminal, Read};
6use std::path::{Path, PathBuf};
7
8use blake2::{Blake2b512, Blake2s256};
9use digest::Digest;
10use indicatif::{ParallelProgressIterator, ProgressBar, ProgressStyle};
11use rayon::prelude::*;
12use walkdir::WalkDir;
13
/// Supported hashing algorithms.
///
/// The variant stored in [`SearchConfig::algorithm`] decides which digest
/// implementation is used for every file hashed during a [`search`]. The type
/// is a plain tag, so it is cheap to copy, compare, and use as a map key.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
pub enum Algorithm {
    /// MD5, computed with `md5::Md5`.
    Md5,
    /// SHA-1, computed with `sha1::Sha1`.
    Sha1,
    /// SHA-256, computed with `sha2::Sha256`.
    Sha256,
    /// SHA-512, computed with `sha2::Sha512`.
    Sha512,
    /// SHA3-256, computed with `sha3::Sha3_256`.
    Sha3_256,
    /// SHA3-512, computed with `sha3::Sha3_512`.
    Sha3_512,
    /// `BLAKE2s`, computed with `blake2::Blake2s256`.
    Blake2s,
    /// `BLAKE2b`, computed with `blake2::Blake2b512`.
    Blake2b,
    /// BLAKE3, computed with the `blake3` crate's own hasher.
    Blake3,
}
27
/// A hash target with optional name metadata.
///
/// Two targets are equal when both the digest bytes and the optional name are
/// equal.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub struct Target {
    /// Raw digest bytes a file must produce to count as a match.
    pub hash: Vec<u8>,
    /// Optional filename; when set, [`search`] only compares this target
    /// against files whose filename is exactly this string.
    pub name: Option<String>,
}
34
35/// Configuration for a search across files.
36pub struct SearchConfig {
37    pub dir: PathBuf,
38    pub algorithm: Algorithm,
39    pub targets: Vec<Target>,
40    pub threads: Option<usize>,
41}
42
/// A matched target found on disk.
///
/// [`search`] emits one value for every (file, target) pair whose digests are
/// equal, so a single file can appear in several results.
#[derive(Clone, Debug)]
pub struct MatchResult {
    /// Location of the matching file, below the canonicalized search root.
    pub path: PathBuf,
    /// Copy of the target whose hash equals the file's digest.
    pub target: Target,
}
49
/// Summary information from a [`search`] operation.
#[derive(Debug)]
pub struct SearchReport {
    /// Every (file, target) match that was found.
    pub matches: Vec<MatchResult>,
    /// Number of regular files encountered, including files that were skipped
    /// because no target applied to their name and files that failed to hash.
    pub total_files_checked: usize,
    /// Entries that could not be checked, with the error text for each.
    pub failed_files: Vec<FileCheckFailure>,
}
57
/// Details for a file that could not be checked during search.
///
/// The error is stored as rendered text so the record can be cloned, compared,
/// and kept after the underlying I/O error is gone.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct FileCheckFailure {
    /// Location of the entry that could not be checked.
    pub path: PathBuf,
    /// Human-readable description of what went wrong.
    pub error: String,
}
64
65/// Search a directory tree for files whose hashes match configured targets.
66///
67/// This walks the directory specified in [`SearchConfig::dir`] (without following
68/// symlinks) and hashes only files that are relevant to the configured targets.
69/// If a target includes a [`Target::name`], hashing is limited to files with the
70/// same filename; otherwise all files are considered. Hashing uses the
71/// [`Algorithm`] configured in [`SearchConfig::algorithm`].
72///
73/// # Examples
74///
75/// ```no_run
76/// use std::path::PathBuf;
77///
78/// let config = hash_hunter::SearchConfig {
79///     dir: PathBuf::from("."),
80///     algorithm: hash_hunter::Algorithm::Sha256,
81///     targets: vec![hash_hunter::Target {
82///         hash: hash_hunter::parse_hex("d2d2d2d2")?,
83///         name: Some("example.txt".to_string()),
84///     }],
85///     threads: Some(4),
86/// };
87///
88/// let report = hash_hunter::search(&config)?;
89/// println!("matched {} file(s)", report.matches.len());
90/// # Ok::<(), std::io::Error>(())
91/// ```
92///
93/// # Errors
94///
95/// Returns an error if:
96/// - no targets are provided;
97/// - the global Rayon thread pool cannot be created;
98/// - or filesystem traversal fails.
99///
100/// Individual file hashing failures (for example, permission errors) are
101/// reported on stderr and do not abort the search.
102pub fn search(config: &SearchConfig) -> io::Result<SearchReport> {
103    if config.targets.is_empty() {
104        return Err(io::Error::new(
105            io::ErrorKind::InvalidInput,
106            "at least one target is required",
107        ));
108    }
109
110    if let Some(threads) = config.threads {
111        status_message(&format!("configuring {threads} hashing threads"));
112        rayon::ThreadPoolBuilder::new()
113            .num_threads(threads)
114            .build_global()
115            .map_err(io::Error::other)?;
116    }
117
118    status_message("preparing targets");
119    let (name_map, hash_only) = split_targets(&config.targets);
120    status_message("scanning directory tree");
121    let search_root = config.dir.canonicalize()?;
122
123    let file_paths: Vec<PathBuf> = WalkDir::new(&search_root)
124        .follow_links(false)
125        .into_iter()
126        .par_bridge()
127        .filter_map(Result::ok)
128        .filter(|entry| entry.file_type().is_file())
129        .map(|entry| entry.path().to_path_buf())
130        .collect();
131
132    status_message(&format!(
133        "found {} file(s); hashing in progress",
134        file_paths.len()
135    ));
136    let progress = progress_bar(file_paths.len());
137    let results = file_paths
138        .par_iter()
139        .progress_with(progress.clone())
140        .filter_map(|path| {
141            let file_name = path
142                .file_name()
143                .map(|value| value.to_string_lossy().to_string())
144                .unwrap_or_default();
145            let name_targets = name_map.get(&file_name).cloned().unwrap_or_default();
146            let needs_hash = !name_targets.is_empty() || !hash_only.is_empty();
147            if !needs_hash {
148                return Some(ResultEntry::SkippedNameMismatch);
149            }
150            let hash = match compute_hash(path, config.algorithm) {
151                Ok(value) => value,
152                Err(err) => return Some(ResultEntry::Error {
153                    path: path.clone(),
154                    err,
155                }),
156            };
157            let mut matches = Vec::new();
158            for idx in name_targets.iter().chain(hash_only.iter()) {
159                if config.targets[*idx].hash == hash {
160                    matches.push(*idx);
161                }
162            }
163            Some(ResultEntry::Hashed {
164                path: path.clone(),
165                matches,
166            })
167        })
168        .collect::<Vec<_>>();
169    progress.finish_with_message("scan complete");
170
171    status_message("summarizing results");
172    let mut output = Vec::new();
173    let mut failures = Vec::new();
174    let mut total_files_checked = 0usize;
175    for result in results {
176        total_files_checked += 1;
177        match result {
178            ResultEntry::Hashed { path, matches } => {
179                for idx in matches {
180                    output.push(MatchResult {
181                        path: normalize_path(&path),
182                        target: config.targets[idx].clone(),
183                    });
184                }
185            }
186            ResultEntry::Error { path, err } => {
187                failures.push(FileCheckFailure {
188                    path: normalize_path(&path),
189                    error: err.to_string(),
190                });
191                eprintln!("failed to hash {}: {err}", path.display());
192            }
193            ResultEntry::SkippedNameMismatch => {}
194        }
195    }
196
197    Ok(SearchReport {
198        matches: output,
199        total_files_checked,
200        failed_files: failures,
201    })
202}
203
/// Print a progress note to stderr, but only when stderr is a terminal.
///
/// Redirected or piped stderr receives nothing from this helper.
fn status_message(message: &str) {
    let interactive = io::stderr().is_terminal();
    if !interactive {
        return;
    }
    eprintln!("{message}");
}
209
210fn progress_bar(total_files: usize) -> ProgressBar {
211    if !io::stderr().is_terminal() {
212        return ProgressBar::hidden();
213    }
214
215    let progress = ProgressBar::new(total_files as u64);
216    let style = ProgressStyle::with_template(
217        "{spinner:.green} [{elapsed_precise}] {msg} {bar:40.cyan/blue} {pos}/{len} files ({eta})",
218    )
219    .unwrap_or_else(|_| ProgressStyle::default_bar())
220    .progress_chars("=>-");
221    progress.set_style(style);
222    progress.set_message("hashing");
223    progress
224}
225
/// Strip the Windows verbatim prefix from `path` for display and comparison.
///
/// `\\?\C:\dir` becomes `C:\dir` and `\\?\UNC\server\share` becomes
/// `\\server\share`. Paths without the prefix are returned unchanged, and so
/// are paths that are not valid Unicode: rewriting those through a lossy
/// string conversion would silently change the path.
#[cfg(windows)]
fn normalize_path(path: &Path) -> PathBuf {
    let Some(text) = path.to_str() else {
        return path.to_path_buf();
    };
    let Some(stripped) = text.strip_prefix(r"\\?\") else {
        return path.to_path_buf();
    };
    match stripped.strip_prefix(r"UNC\") {
        Some(share) => PathBuf::from(format!(r"\\{share}")),
        None => PathBuf::from(stripped),
    }
}
239
/// Return `path` as an owned buffer; outside Windows there is no prefix to
/// strip, so the path is copied unchanged.
#[cfg(not(windows))]
fn normalize_path(path: &Path) -> PathBuf {
    PathBuf::from(path)
}
244
245/// Load hash targets from a batch file.
246///
247/// Each non-empty, non-comment line in the file must contain a hexadecimal hash
248/// followed by an optional filename:
249///
250/// ```text
251/// <hex-hash> [filename]
252/// ```
253///
254/// If the filename is present it is stored in [`Target::name`] and is later used
255/// by [`search`] to limit hashing to files with the same basename.
256///
257/// # Examples
258///
259/// ```no_run
260/// # use std::path::PathBuf;
261/// # use std::fs;
262/// let path = PathBuf::from("targets.txt");
263/// fs::write(&path, "d2d2d2d2 example.txt\n# comment line\n")?;
264/// let targets = hash_hunter::load_batch(&path)?;
265/// assert_eq!(targets.len(), 1);
266/// # Ok::<(), std::io::Error>(())
267/// ```
268///
269/// # Errors
270///
271/// Returns an error if the file cannot be read, if a line is malformed, or if a
272/// hash cannot be parsed by [`parse_hex`]. Line numbers are included in
273/// formatting errors to aid debugging.
274pub fn load_batch(path: &Path) -> io::Result<Vec<Target>> {
275    let file = File::open(path)?;
276    let reader = BufReader::new(file);
277    let mut targets = Vec::new();
278    for (line_number, line) in reader.lines().enumerate() {
279        let line = line?;
280        let line = line.trim();
281        if line.is_empty() || line.starts_with('#') {
282            continue;
283        }
284        let mut parts = line.split_whitespace();
285        let hash = parts
286            .next()
287            .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "missing hash"))?;
288        let name = parts.next().map(std::string::ToString::to_string);
289        if parts.next().is_some() {
290            return Err(io::Error::new(
291                io::ErrorKind::InvalidData,
292                format!("line {}: too many fields", line_number + 1),
293            ));
294        }
295        let hash = parse_hex(hash).map_err(|err| {
296            io::Error::new(
297                io::ErrorKind::InvalidData,
298                format!("line {}: {err}", line_number + 1),
299            )
300        })?;
301        targets.push(Target { hash, name });
302    }
303    Ok(targets)
304}
305
306/// Parse a hexadecimal string into raw bytes.
307///
308/// The input is trimmed before decoding. This helper is used by [`load_batch`]
309/// and the CLI to convert user-provided hex strings into byte arrays suitable
310/// for hashing comparisons.
311///
312/// # Examples
313///
314/// ```
315/// let bytes = hash_hunter::parse_hex("0a0b0c")?;
316/// assert_eq!(bytes, vec![0x0a, 0x0b, 0x0c]);
317/// # Ok::<(), std::io::Error>(())
318/// ```
319///
320/// # Errors
321///
322/// Returns an error if the input is not valid hex.
323pub fn parse_hex(input: &str) -> io::Result<Vec<u8>> {
324    let cleaned = input.trim();
325    hex::decode(cleaned).map_err(|err| io::Error::new(io::ErrorKind::InvalidInput, err))
326}
327
/// Outcome of examining a single file during [`search`].
enum ResultEntry {
    /// The file was hashed; `matches` holds the indices of the targets whose
    /// hash equals the file's digest (possibly none).
    Hashed { path: PathBuf, matches: Vec<usize> },
    /// Hashing the file at `path` failed with `err`.
    Error { path: PathBuf, err: io::Error },
    /// The file was not hashed because no target applies to its filename.
    SkippedNameMismatch,
}
333
334/// Partition targets into a filename map and a hash-only list.
335///
336/// This returns:
337/// - a map from filename to indices of [`Target`] entries with
338///   [`Target::name`] set, and
339/// - a list of indices for targets without filenames.
340///
341/// [`search`] uses this split to avoid hashing files that cannot possibly
342/// satisfy any target.
343fn split_targets(targets: &[Target]) -> (BTreeMap<String, Vec<usize>>, Vec<usize>) {
344    let mut name_map: BTreeMap<String, Vec<usize>> = BTreeMap::new();
345    let mut hash_only = Vec::new();
346    for (idx, target) in targets.iter().enumerate() {
347        if let Some(name) = &target.name {
348            name_map.entry(name.clone()).or_default().push(idx);
349        } else {
350            hash_only.push(idx);
351        }
352    }
353    (name_map, hash_only)
354}
355
356/// Compute the hash of a file using the selected [`Algorithm`].
357///
358/// This function opens the file at `path`, streams its contents into the
359/// appropriate hash implementation, and returns the resulting digest bytes. For
360/// BLAKE3, the specialized [`hash_blake3`] path is used; all other algorithms
361/// use [`hash_with_digest`].
362///
363/// # Errors
364///
365/// Returns an error if the file cannot be opened or read.
366fn compute_hash(path: &Path, algo: Algorithm) -> io::Result<Vec<u8>> {
367    let file = File::open(path)?;
368    let mut reader = BufReader::new(file);
369    match algo {
370        Algorithm::Md5 => hash_with_digest::<md5::Md5>(&mut reader),
371        Algorithm::Sha1 => hash_with_digest::<sha1::Sha1>(&mut reader),
372        Algorithm::Sha256 => hash_with_digest::<sha2::Sha256>(&mut reader),
373        Algorithm::Sha512 => hash_with_digest::<sha2::Sha512>(&mut reader),
374        Algorithm::Sha3_256 => hash_with_digest::<sha3::Sha3_256>(&mut reader),
375        Algorithm::Sha3_512 => hash_with_digest::<sha3::Sha3_512>(&mut reader),
376        Algorithm::Blake2s => hash_with_digest::<Blake2s256>(&mut reader),
377        Algorithm::Blake2b => hash_with_digest::<Blake2b512>(&mut reader),
378        Algorithm::Blake3 => hash_blake3(&mut reader),
379    }
380}
381
382/// Hash a reader using a `Digest` implementation.
383///
384/// This helper is used by [`compute_hash`] for algorithms that implement the
385/// [`Digest`] trait. It reads the input in 128 KiB chunks to limit memory usage.
386///
387/// # Errors
388///
389/// Returns an error if the underlying reader cannot be read.
390fn hash_with_digest<D: Digest>(reader: &mut BufReader<File>) -> io::Result<Vec<u8>> {
391    let mut hasher = D::new();
392    let mut buffer = vec![0u8; 128 * 1024];
393    loop {
394        let read = reader.read(&mut buffer)?;
395        if read == 0 {
396            break;
397        }
398        hasher.update(&buffer[..read]);
399    }
400    Ok(hasher.finalize().to_vec())
401}
402
403/// Hash a reader using the BLAKE3 implementation.
404///
405/// BLAKE3 does not implement the [`Digest`] trait, so it uses its own hashing
406/// API. The read loop mirrors [`hash_with_digest`] to keep behavior consistent.
407///
408/// # Errors
409///
410/// Returns an error if the underlying reader cannot be read.
411fn hash_blake3(reader: &mut BufReader<File>) -> io::Result<Vec<u8>> {
412    let mut hasher = blake3::Hasher::new();
413    let mut buffer = vec![0u8; 128 * 1024];
414    loop {
415        let read = reader.read(&mut buffer)?;
416        if read == 0 {
417            break;
418        }
419        hasher.update(&buffer[..read]);
420    }
421    Ok(hasher.finalize().as_bytes().to_vec())
422}
423
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use std::io::Write;

    /// Create `name` inside `dir` with the given contents and return its path.
    fn write_file(dir: &tempfile::TempDir, name: &str, contents: &[u8]) -> PathBuf {
        let path = dir.path().join(name);
        let mut file = File::create(&path).expect("create file");
        file.write_all(contents).expect("write file");
        path
    }

    /// The form in which `search` reports `path`: canonicalized, then
    /// normalized. Temp directories can sit behind a symlink, so the raw
    /// `dir.path().join(..)` value is not guaranteed to equal what `search`
    /// returns.
    fn reported_path(path: &Path) -> PathBuf {
        normalize_path(&path.canonicalize().expect("canonicalize path"))
    }

    #[test]
    fn parse_hex_trims_and_parses() {
        let bytes = parse_hex(" 0a0b0c ").expect("parse hex");
        assert_eq!(bytes, vec![0x0a, 0x0b, 0x0c]);
    }

    #[test]
    fn parse_hex_rejects_invalid() {
        let err = parse_hex("not-hex").expect_err("invalid hex should fail");
        assert_eq!(err.kind(), io::ErrorKind::InvalidInput);
    }

    #[test]
    fn load_batch_parses_names_and_hash_only() {
        let dir = tempfile::tempdir().expect("tempdir");
        let path = dir.path().join("batch.txt");
        fs::write(&path, "0a0b0c report.txt\n\n# comment line\n0d0e0f\n").expect("write batch");
        let targets = load_batch(&path).expect("load batch");
        assert_eq!(targets.len(), 2);
        assert_eq!(targets[0].hash, vec![0x0a, 0x0b, 0x0c]);
        assert_eq!(targets[0].name.as_deref(), Some("report.txt"));
        assert_eq!(targets[1].hash, vec![0x0d, 0x0e, 0x0f]);
        assert!(targets[1].name.is_none());
    }

    #[test]
    fn load_batch_reports_too_many_fields() {
        let dir = tempfile::tempdir().expect("tempdir");
        let path = dir.path().join("batch.txt");
        fs::write(&path, "0a0b0c one two\n").expect("write batch");
        let err = load_batch(&path).expect_err("expected too many fields error");
        assert_eq!(err.kind(), io::ErrorKind::InvalidData);
        assert!(err.to_string().contains("line 1"));
    }

    #[test]
    fn load_batch_reports_invalid_hex_with_line_number() {
        let dir = tempfile::tempdir().expect("tempdir");
        let path = dir.path().join("batch.txt");
        fs::write(&path, "0a0b0c\nzzzz\n").expect("write batch");
        let err = load_batch(&path).expect_err("expected invalid hex");
        assert_eq!(err.kind(), io::ErrorKind::InvalidData);
        assert!(err.to_string().contains("line 2"));
    }

    #[test]
    fn split_targets_separates_named_and_hash_only() {
        let targets = vec![
            Target {
                hash: vec![1],
                name: Some("a.txt".to_string()),
            },
            Target {
                hash: vec![2],
                name: None,
            },
            Target {
                hash: vec![3],
                name: Some("a.txt".to_string()),
            },
        ];
        let (name_map, hash_only) = split_targets(&targets);
        assert_eq!(hash_only, vec![1]);
        let entries = name_map.get("a.txt").expect("name entry");
        assert_eq!(entries, &vec![0, 2]);
    }

    #[test]
    fn compute_hash_errors_for_missing_file() {
        let path = PathBuf::from("missing-file");
        let err = compute_hash(&path, Algorithm::Sha256).expect_err("missing file");
        assert_eq!(err.kind(), io::ErrorKind::NotFound);
    }

    #[test]
    fn hash_with_digest_matches_md5() {
        let dir = tempfile::tempdir().expect("tempdir");
        let path = write_file(&dir, "file.txt", b"hash-hunter");
        let file = File::open(&path).expect("open file");
        let mut reader = BufReader::new(file);
        let hash = hash_with_digest::<md5::Md5>(&mut reader).expect("hash");
        let expected = md5::Md5::digest(b"hash-hunter").to_vec();
        assert_eq!(hash, expected);
    }

    #[test]
    fn hash_blake3_matches_expected() {
        let dir = tempfile::tempdir().expect("tempdir");
        let path = write_file(&dir, "file.txt", b"hash-hunter");
        let file = File::open(&path).expect("open file");
        let mut reader = BufReader::new(file);
        let hash = hash_blake3(&mut reader).expect("hash");
        let expected = blake3::hash(b"hash-hunter").as_bytes().to_vec();
        assert_eq!(hash, expected);
    }

    #[test]
    fn compute_hash_supports_all_algorithms() {
        let dir = tempfile::tempdir().expect("tempdir");
        let path = write_file(&dir, "file.txt", b"hash-hunter");
        let cases = [
            (Algorithm::Md5, md5::Md5::digest(b"hash-hunter").to_vec()),
            (Algorithm::Sha1, sha1::Sha1::digest(b"hash-hunter").to_vec()),
            (
                Algorithm::Sha256,
                sha2::Sha256::digest(b"hash-hunter").to_vec(),
            ),
            (
                Algorithm::Sha512,
                sha2::Sha512::digest(b"hash-hunter").to_vec(),
            ),
            (
                Algorithm::Sha3_256,
                sha3::Sha3_256::digest(b"hash-hunter").to_vec(),
            ),
            (
                Algorithm::Sha3_512,
                sha3::Sha3_512::digest(b"hash-hunter").to_vec(),
            ),
            (
                Algorithm::Blake2s,
                Blake2s256::digest(b"hash-hunter").to_vec(),
            ),
            (
                Algorithm::Blake2b,
                Blake2b512::digest(b"hash-hunter").to_vec(),
            ),
            (
                Algorithm::Blake3,
                blake3::hash(b"hash-hunter").as_bytes().to_vec(),
            ),
        ];
        for (algo, expected) in cases {
            let digest = compute_hash(&path, algo).expect("compute hash");
            assert_eq!(digest, expected, "mismatch for {algo:?}");
        }
    }

    #[test]
    fn search_requires_targets() {
        let dir = tempfile::tempdir().expect("tempdir");
        let config = SearchConfig {
            dir: dir.path().to_path_buf(),
            algorithm: Algorithm::Sha256,
            targets: Vec::new(),
            threads: None,
        };
        let err = search(&config).expect_err("should require targets");
        assert_eq!(err.kind(), io::ErrorKind::InvalidInput);
    }

    #[test]
    fn search_matches_named_and_hash_only_targets() {
        let dir = tempfile::tempdir().expect("tempdir");
        let alpha_path = write_file(&dir, "alpha.txt", b"alpha");
        let beta_path = write_file(&dir, "beta.txt", b"beta");
        let alpha_hash = compute_hash(&alpha_path, Algorithm::Sha256).expect("hash");
        let beta_hash = compute_hash(&beta_path, Algorithm::Sha256).expect("hash");
        let targets = vec![
            Target {
                hash: alpha_hash.clone(),
                name: Some("alpha.txt".to_string()),
            },
            Target {
                hash: beta_hash.clone(),
                name: None,
            },
            Target {
                hash: vec![0xff],
                name: Some("missing.txt".to_string()),
            },
        ];
        let config = SearchConfig {
            dir: dir.path().to_path_buf(),
            algorithm: Algorithm::Sha256,
            targets: targets.clone(),
            threads: None,
        };
        let report = search(&config).expect("search");
        assert_eq!(report.matches.len(), 2);
        let mut matched_paths: Vec<_> = report
            .matches
            .iter()
            .map(|result| result.path.clone())
            .collect();
        matched_paths.sort();
        // `search` canonicalizes its root, so compare against canonical paths.
        let mut expected = vec![reported_path(&alpha_path), reported_path(&beta_path)];
        expected.sort();
        assert_eq!(matched_paths, expected);
        let matched_targets: Vec<_> = report
            .matches
            .into_iter()
            .map(|result| result.target)
            .collect();
        assert!(matched_targets
            .iter()
            .any(|target| { target.hash == targets[0].hash && target.name == targets[0].name }));
        assert!(matched_targets
            .iter()
            .any(|target| { target.hash == targets[1].hash && target.name == targets[1].name }));
    }

    #[test]
    fn search_counts_name_mismatches_as_checked() {
        let dir = tempfile::tempdir().expect("tempdir");
        let alpha_path = write_file(&dir, "alpha.txt", b"alpha");
        write_file(&dir, "beta.txt", b"beta");
        let alpha_hash = compute_hash(&alpha_path, Algorithm::Sha256).expect("hash");
        let targets = vec![Target {
            hash: alpha_hash,
            name: Some("alpha.txt".to_string()),
        }];
        let config = SearchConfig {
            dir: dir.path().to_path_buf(),
            algorithm: Algorithm::Sha256,
            targets,
            threads: None,
        };
        let report = search(&config).expect("search");
        assert_eq!(report.total_files_checked, 2);
    }
}