hashjunkie 0.6.0

Fast multi-algorithm hashing library with file-sharing and cloud hash support
Documentation
use digest::Digest;
use rayon::prelude::*;

use crate::hashes::Hasher;

pub const BLOCK_SIZE: usize = 9_728_000;
const PARALLEL_BLOCK_BATCH_SIZE: usize = 4;

pub struct Ed2kHasher {
    block_hashes: Vec<[u8; 16]>,
    pending_blocks: Vec<Vec<u8>>,
    current_block: Vec<u8>,
}

impl Ed2kHasher {
    pub fn new() -> Self {
        Self {
            block_hashes: Vec::new(),
            pending_blocks: Vec::new(),
            current_block: Vec::with_capacity(BLOCK_SIZE),
        }
    }

    fn push_current_block(&mut self) {
        let block = std::mem::replace(&mut self.current_block, Vec::with_capacity(BLOCK_SIZE));
        self.pending_blocks.push(block);
        if self.pending_blocks.len() >= PARALLEL_BLOCK_BATCH_SIZE {
            self.flush_pending_blocks();
        }
    }

    fn flush_pending_blocks(&mut self) {
        let pending = std::mem::take(&mut self.pending_blocks);
        self.block_hashes.extend(
            pending
                .into_par_iter()
                .map(|block| md4_block(&block))
                .collect::<Vec<_>>(),
        );
    }
}

impl Default for Ed2kHasher {
    fn default() -> Self {
        Self::new()
    }
}

impl Hasher for Ed2kHasher {
    fn update(&mut self, mut data: &[u8]) {
        if self.current_block.len() == BLOCK_SIZE && !data.is_empty() {
            self.push_current_block();
        }

        while !data.is_empty() {
            let remaining = BLOCK_SIZE - self.current_block.len();
            let take = data.len().min(remaining);
            self.current_block.extend_from_slice(&data[..take]);
            data = &data[take..];

            if self.current_block.len() == BLOCK_SIZE && !data.is_empty() {
                self.push_current_block();
            }
        }
    }

    fn finalize_hex(mut self: Box<Self>) -> String {
        if self.block_hashes.is_empty() && self.pending_blocks.is_empty() {
            return hex::encode(md4_block(&self.current_block));
        }

        if !self.current_block.is_empty() {
            self.push_current_block();
        }
        self.flush_pending_blocks();

        let mut root = md4::Md4::new();
        for block_hash in &self.block_hashes {
            root.update(block_hash);
        }
        hex::encode(root.finalize())
    }
}

fn md4_block(block: &[u8]) -> [u8; 16] {
    md4::Md4::digest(block).into()
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::hashes::Hasher;

    fn hash(data: &[u8]) -> String {
        let mut h = Ed2kHasher::new();
        h.update(data);
        Box::new(h).finalize_hex()
    }

    #[test]
    fn empty_matches_md4_empty() {
        assert_eq!(hash(b""), "31d6cfe0d16ae931b73c59d7e0c089c0");
    }

    #[test]
    fn default_equals_new() {
        let mut default_hasher = Ed2kHasher::default();
        default_hasher.update(b"abc");

        let mut new_hasher = Ed2kHasher::new();
        new_hasher.update(b"abc");

        assert_eq!(
            Box::new(default_hasher).finalize_hex(),
            Box::new(new_hasher).finalize_hex()
        );
    }

    #[test]
    fn small_file_is_plain_md4() {
        assert_eq!(hash(b"abc"), "a448017aaf21d8525fc10ae87aa6729d");
    }

    #[test]
    fn chunked_update_matches_single_update() {
        let data = (0..(BLOCK_SIZE + 1234))
            .map(|i| (i % 251) as u8)
            .collect::<Vec<_>>();
        let single = hash(&data);

        let mut h = Ed2kHasher::new();
        for chunk in data.chunks(123_457) {
            h.update(chunk);
        }

        assert_eq!(Box::new(h).finalize_hex(), single);
    }

    #[test]
    fn exact_block_boundary_is_single_block_hash() {
        let data = vec![0xA5; BLOCK_SIZE];
        assert_eq!(hash(&data), hex::encode(md4_block(&data)));
    }

    #[test]
    fn one_byte_over_block_boundary_uses_root_hash() {
        let mut data = vec![0xA5; BLOCK_SIZE];
        data.push(0x5A);

        let h1 = md4_block(&data[..BLOCK_SIZE]);
        let h2 = md4_block(&data[BLOCK_SIZE..]);
        let mut root = md4::Md4::new();
        root.update(h1);
        root.update(h2);

        assert_eq!(hash(&data), hex::encode(root.finalize()));
    }

    #[test]
    fn new_update_after_exact_block_flushes_deferred_block() {
        let mut h = Ed2kHasher::new();
        h.update(&vec![0xA5; BLOCK_SIZE]);
        h.update(&[0x5A]);

        let mut data = vec![0xA5; BLOCK_SIZE];
        data.push(0x5A);

        assert_eq!(Box::new(h).finalize_hex(), hash(&data));
    }

    #[test]
    fn pending_blocks_flush_at_parallel_batch_threshold() {
        let data = (0..(BLOCK_SIZE * PARALLEL_BLOCK_BATCH_SIZE + 1))
            .map(|i| (i % 251) as u8)
            .collect::<Vec<_>>();

        assert_eq!(hash(&data), "dab73f86a7763a9268d72761b7a4ae2a");
    }
}