qem 0.6.3

High-performance cross-platform text engine for massive files.
Documentation
use std::fs::File;
use std::io::{self, Read, Seek, SeekFrom};
use std::path::Path;

const FNV64_OFFSET_BASIS: u64 = 0xcbf29ce484222325;
const FNV64_PRIME: u64 = 0x100000001b3;
const FULL_FINGERPRINT_MAX_BYTES: usize = 32 * 1024;
const FINGERPRINT_WINDOW_BYTES: usize = 8 * 1024;

pub(crate) fn sampled_content_fingerprint_budget(len: usize) -> usize {
    if len <= FULL_FINGERPRINT_MAX_BYTES {
        return len;
    }

    sample_starts(len)
        .into_iter()
        .map(|start| FINGERPRINT_WINDOW_BYTES.min(len.saturating_sub(start)))
        .sum()
}

pub(crate) fn sampled_content_fingerprint(bytes: &[u8]) -> u64 {
    let mut state = FNV64_OFFSET_BASIS;
    hash_u64(&mut state, bytes.len() as u64);

    if bytes.len() <= FULL_FINGERPRINT_MAX_BYTES {
        hash_sample(&mut state, 0, bytes);
        return state;
    }

    for start in sample_starts(bytes.len()) {
        let end = start
            .saturating_add(FINGERPRINT_WINDOW_BYTES)
            .min(bytes.len());
        hash_sample(&mut state, start, &bytes[start..end]);
    }

    state
}

pub(crate) fn sampled_file_fingerprint(path: &Path) -> io::Result<u64> {
    let mut file = File::open(path)?;
    let len = file.metadata()?.len() as usize;

    let mut state = FNV64_OFFSET_BASIS;
    hash_u64(&mut state, len as u64);

    if len <= FULL_FINGERPRINT_MAX_BYTES {
        let mut buf = Vec::with_capacity(len);
        file.read_to_end(&mut buf)?;
        hash_sample(&mut state, 0, &buf);
        return Ok(state);
    }

    let mut buf = vec![0u8; FINGERPRINT_WINDOW_BYTES];
    for start in sample_starts(len) {
        let sample_len = FINGERPRINT_WINDOW_BYTES.min(len.saturating_sub(start));
        file.seek(SeekFrom::Start(start as u64))?;
        file.read_exact(&mut buf[..sample_len])?;
        hash_sample(&mut state, start, &buf[..sample_len]);
    }

    Ok(state)
}

fn sample_starts(len: usize) -> Vec<usize> {
    debug_assert!(len > FULL_FINGERPRINT_MAX_BYTES);

    let window = FINGERPRINT_WINDOW_BYTES.min(len);
    let last_start = len.saturating_sub(window);
    let anchors = [
        0usize,
        len / 4,
        len / 2,
        (len / 4).saturating_mul(3),
        last_start,
    ];

    let mut starts = Vec::with_capacity(anchors.len());
    for (index, anchor) in anchors.into_iter().enumerate() {
        let start = if index == 0 || index + 1 == anchors.len() {
            anchor.min(last_start)
        } else {
            anchor.saturating_sub(window / 2).min(last_start)
        };
        if starts.last().copied() != Some(start) {
            starts.push(start);
        }
    }
    starts
}

fn hash_sample(state: &mut u64, start: usize, bytes: &[u8]) {
    hash_u64(state, start as u64);
    hash_u64(state, bytes.len() as u64);
    hash_bytes(state, bytes);
}

fn hash_u64(state: &mut u64, value: u64) {
    hash_bytes(state, &value.to_le_bytes());
}

fn hash_bytes(state: &mut u64, bytes: &[u8]) {
    for &byte in bytes {
        *state ^= u64::from(byte);
        *state = state.wrapping_mul(FNV64_PRIME);
    }
}

#[cfg(test)]
mod tests {
    use super::{
        sampled_content_fingerprint, sampled_content_fingerprint_budget, sampled_file_fingerprint,
        FINGERPRINT_WINDOW_BYTES, FULL_FINGERPRINT_MAX_BYTES,
    };
    use std::fs;

    #[test]
    fn sampled_content_fingerprint_changes_with_small_edits() {
        let left = b"alpha\nbeta\ngamma\n".repeat(4096);
        let mut right = left.clone();
        let middle = right.len() / 2;
        right[middle] ^= 1;

        assert_ne!(
            sampled_content_fingerprint(&left),
            sampled_content_fingerprint(&right)
        );
    }

    #[test]
    fn sampled_file_fingerprint_matches_in_memory_version() {
        let dir =
            std::env::temp_dir().join(format!("qem-source-fingerprint-{}", std::process::id()));
        let _ = fs::create_dir_all(&dir);
        let path = dir.join("sample.txt");
        let bytes = b"abcd\n".repeat(10_000);
        fs::write(&path, &bytes).unwrap();

        let file_fingerprint = sampled_file_fingerprint(&path).unwrap();
        let memory_fingerprint = sampled_content_fingerprint(&bytes);
        assert_eq!(file_fingerprint, memory_fingerprint);

        let _ = fs::remove_file(&path);
    }

    #[test]
    fn sampled_content_fingerprint_budget_matches_sampled_windows() {
        assert_eq!(sampled_content_fingerprint_budget(0), 0);
        assert_eq!(sampled_content_fingerprint_budget(17), 17);
        assert_eq!(
            sampled_content_fingerprint_budget(FULL_FINGERPRINT_MAX_BYTES),
            FULL_FINGERPRINT_MAX_BYTES
        );
        assert_eq!(
            sampled_content_fingerprint_budget(FULL_FINGERPRINT_MAX_BYTES + 1),
            5 * FINGERPRINT_WINDOW_BYTES
        );
    }
}