use crate::{DedupError, DedupResult};
use blake3::Hasher;
use std::fs::File;
use std::io::{BufReader, Read};
use std::path::Path;
const BUFFER_SIZE: usize = 65536;
const DEFAULT_CHUNK_SIZE: usize = 4096;
const MIN_CHUNK_SIZE: usize = 1024;
const MAX_CHUNK_SIZE: usize = 1048576;
const RABIN_POLYNOMIAL: u64 = 0x3DA3_358B_4DC1_73E9;
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FileHash {
hash: [u8; 32],
}
impl FileHash {
#[must_use]
pub fn from_bytes(bytes: [u8; 32]) -> Self {
Self { hash: bytes }
}
#[must_use]
pub fn as_bytes(&self) -> &[u8; 32] {
&self.hash
}
#[must_use]
pub fn to_hex(&self) -> String {
hex::encode(self.hash)
}
pub fn from_hex(s: &str) -> DedupResult<Self> {
let bytes =
hex::decode(s).map_err(|e| DedupError::Hash(format!("Invalid hex string: {e}")))?;
if bytes.len() != 32 {
return Err(DedupError::Hash(format!(
"Invalid hash length: expected 32, got {}",
bytes.len()
)));
}
let mut hash = [0u8; 32];
hash.copy_from_slice(&bytes);
Ok(Self::from_bytes(hash))
}
#[must_use]
pub fn hamming_distance(&self, other: &Self) -> u32 {
self.hash
.iter()
.zip(other.hash.iter())
.map(|(a, b)| (a ^ b).count_ones())
.sum()
}
#[must_use]
pub fn similarity(&self, other: &Self) -> f64 {
let distance = self.hamming_distance(other);
let max_distance = 256; 1.0 - (f64::from(distance) / f64::from(max_distance))
}
}
pub fn compute_file_hash(path: impl AsRef<Path>) -> DedupResult<FileHash> {
let file = File::open(path)?;
let mut reader = BufReader::with_capacity(BUFFER_SIZE, file);
let mut hasher = Hasher::new();
let mut buffer = vec![0u8; BUFFER_SIZE];
loop {
let n = reader.read(&mut buffer)?;
if n == 0 {
break;
}
hasher.update(&buffer[..n]);
}
let hash = hasher.finalize();
Ok(FileHash::from_bytes(*hash.as_bytes()))
}
#[must_use]
pub fn compute_data_hash(data: &[u8]) -> FileHash {
let hash = blake3::hash(data);
FileHash::from_bytes(*hash.as_bytes())
}
#[derive(Debug, Clone)]
pub struct Chunk {
pub offset: u64,
pub size: usize,
pub hash: FileHash,
}
pub struct RollingHash {
window_size: usize,
polynomial: u64,
mod_mask: u64,
hash: u64,
window: Vec<u8>,
window_pos: usize,
pow_table: Vec<u64>,
}
impl RollingHash {
#[must_use]
pub fn new(window_size: usize) -> Self {
let polynomial = RABIN_POLYNOMIAL;
let mod_mask = (1u64 << 63) - 1;
let mut pow_table = vec![1u64; window_size];
for i in 1..window_size {
pow_table[i] = pow_table[i - 1].wrapping_mul(polynomial) & mod_mask;
}
Self {
window_size,
polynomial,
mod_mask,
hash: 0,
window: vec![0u8; window_size],
window_pos: 0,
pow_table,
}
}
pub fn roll(&mut self, byte: u8) -> u64 {
let old_byte = self.window[self.window_pos];
let old_contribution =
u64::from(old_byte).wrapping_mul(self.pow_table[self.window_size - 1]);
self.hash = self.hash.wrapping_sub(old_contribution) & self.mod_mask;
self.hash = self.hash.wrapping_mul(self.polynomial) & self.mod_mask;
self.hash = self.hash.wrapping_add(u64::from(byte)) & self.mod_mask;
self.window[self.window_pos] = byte;
self.window_pos = (self.window_pos + 1) % self.window_size;
self.hash
}
#[must_use]
pub fn hash(&self) -> u64 {
self.hash
}
pub fn reset(&mut self) {
self.hash = 0;
self.window.fill(0);
self.window_pos = 0;
}
}
pub struct Chunker {
chunk_size: usize,
min_size: usize,
max_size: usize,
rolling_hash: RollingHash,
}
impl Chunker {
#[must_use]
pub fn new(chunk_size: usize) -> Self {
let chunk_size = chunk_size.clamp(MIN_CHUNK_SIZE, MAX_CHUNK_SIZE);
let min_size = chunk_size / 2;
let max_size = chunk_size * 2;
let rolling_hash = RollingHash::new(64);
Self {
chunk_size,
min_size,
max_size,
rolling_hash,
}
}
fn is_boundary(&self, hash: u64) -> bool {
let mask = (1u64 << 12) - 1; (hash & mask) == 0
}
#[must_use]
pub fn chunk(&mut self, data: &[u8]) -> Vec<Chunk> {
let mut chunks = Vec::new();
let mut chunk_start = 0;
let mut offset = 0u64;
self.rolling_hash.reset();
for (pos, &byte) in data.iter().enumerate() {
let hash = self.rolling_hash.roll(byte);
let chunk_len = pos - chunk_start;
if chunk_len >= self.min_size && (self.is_boundary(hash) || chunk_len >= self.max_size)
{
let chunk_data = &data[chunk_start..pos + 1];
let chunk_hash = compute_data_hash(chunk_data);
chunks.push(Chunk {
offset,
size: chunk_data.len(),
hash: chunk_hash,
});
chunk_start = pos + 1;
offset += chunk_data.len() as u64;
self.rolling_hash.reset();
}
}
if chunk_start < data.len() {
let chunk_data = &data[chunk_start..];
let chunk_hash = compute_data_hash(chunk_data);
chunks.push(Chunk {
offset,
size: chunk_data.len(),
hash: chunk_hash,
});
}
chunks
}
pub fn chunk_file(&mut self, path: impl AsRef<Path>) -> DedupResult<Vec<Chunk>> {
let file = File::open(path)?;
let mut reader = BufReader::with_capacity(BUFFER_SIZE, file);
let mut chunks = Vec::new();
let mut chunk_buffer = Vec::new();
let mut offset = 0u64;
let mut buffer = vec![0u8; BUFFER_SIZE];
self.rolling_hash.reset();
loop {
let n = reader.read(&mut buffer)?;
if n == 0 {
break;
}
for &byte in &buffer[..n] {
chunk_buffer.push(byte);
let hash = self.rolling_hash.roll(byte);
let chunk_len = chunk_buffer.len();
if chunk_len >= self.min_size
&& (self.is_boundary(hash) || chunk_len >= self.max_size)
{
let chunk_hash = compute_data_hash(&chunk_buffer);
chunks.push(Chunk {
offset,
size: chunk_buffer.len(),
hash: chunk_hash,
});
offset += chunk_buffer.len() as u64;
chunk_buffer.clear();
self.rolling_hash.reset();
}
}
}
if !chunk_buffer.is_empty() {
let chunk_hash = compute_data_hash(&chunk_buffer);
chunks.push(Chunk {
offset,
size: chunk_buffer.len(),
hash: chunk_hash,
});
}
Ok(chunks)
}
}
pub struct ChunkIndex {
chunks: Vec<(FileHash, Vec<String>)>,
}
impl ChunkIndex {
#[must_use]
pub fn new() -> Self {
Self { chunks: Vec::new() }
}
pub fn add_file(&mut self, file_path: &str, chunks: &[Chunk]) {
for chunk in chunks {
if let Some((_, files)) = self.chunks.iter_mut().find(|(h, _)| h == &chunk.hash) {
if !files.contains(&file_path.to_string()) {
files.push(file_path.to_string());
}
} else {
self.chunks
.push((chunk.hash.clone(), vec![file_path.to_string()]));
}
}
}
#[must_use]
pub fn find_duplicates(&self) -> Vec<(FileHash, Vec<String>)> {
self.chunks
.iter()
.filter(|(_, files)| files.len() > 1)
.cloned()
.collect()
}
#[must_use]
pub fn chunk_count(&self) -> usize {
self.chunks.len()
}
#[must_use]
pub fn duplicate_count(&self) -> usize {
self.chunks
.iter()
.filter(|(_, files)| files.len() > 1)
.count()
}
#[must_use]
pub fn dedup_ratio(&self) -> f64 {
if self.chunks.is_empty() {
return 0.0;
}
let total: usize = self.chunks.iter().map(|(_, files)| files.len()).sum();
let unique = self.chunks.len();
if total == 0 {
return 0.0;
}
1.0 - (unique as f64 / total as f64)
}
}
impl Default for ChunkIndex {
fn default() -> Self {
Self::new()
}
}
pub fn compute_chunk_similarity(
path1: impl AsRef<Path>,
path2: impl AsRef<Path>,
chunk_size: usize,
) -> DedupResult<f64> {
let mut chunker = Chunker::new(chunk_size);
let chunks1 = chunker.chunk_file(path1)?;
let chunks2 = chunker.chunk_file(path2)?;
if chunks1.is_empty() || chunks2.is_empty() {
return Ok(0.0);
}
let hashes1: Vec<_> = chunks1.iter().map(|c| &c.hash).collect();
let hashes2: Vec<_> = chunks2.iter().map(|c| &c.hash).collect();
let shared = hashes1.iter().filter(|h| hashes2.contains(h)).count();
let total = hashes1.len().max(hashes2.len());
Ok(shared as f64 / total as f64)
}
mod hex {
use super::DedupError;
pub fn encode(bytes: [u8; 32]) -> String {
bytes.iter().map(|b| format!("{b:02x}")).collect::<String>()
}
pub fn decode(s: &str) -> Result<Vec<u8>, DedupError> {
if s.len() % 2 != 0 {
return Err(DedupError::Hash("Odd hex string length".to_string()));
}
(0..s.len())
.step_by(2)
.map(|i| {
u8::from_str_radix(&s[i..i + 2], 16)
.map_err(|e| DedupError::Hash(format!("Invalid hex digit: {e}")))
})
.collect()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_compute_data_hash() {
let data = b"Hello, World!";
let hash = compute_data_hash(data);
assert_eq!(hash.as_bytes().len(), 32);
}
#[test]
fn test_hash_hex() {
let data = b"Test";
let hash = compute_data_hash(data);
let hex = hash.to_hex();
assert_eq!(hex.len(), 64);
let decoded = FileHash::from_hex(&hex).expect("operation should succeed");
assert_eq!(hash, decoded);
}
#[test]
fn test_hash_similarity() {
let hash1 = compute_data_hash(b"Hello");
let hash2 = compute_data_hash(b"Hello");
let hash3 = compute_data_hash(b"World");
assert_eq!(hash1.similarity(&hash2), 1.0);
assert!(hash1.similarity(&hash3) < 1.0);
}
#[test]
fn test_rolling_hash() {
let mut rh = RollingHash::new(8);
let data = b"Hello, World!";
for &byte in data {
rh.roll(byte);
}
assert!(rh.hash() != 0);
rh.reset();
assert_eq!(rh.hash(), 0);
}
#[test]
fn test_chunker() {
let mut chunker = Chunker::new(DEFAULT_CHUNK_SIZE);
let data = vec![0u8; 100_000];
let chunks = chunker.chunk(&data);
assert!(!chunks.is_empty());
let total_size: usize = chunks.iter().map(|c| c.size).sum();
assert_eq!(total_size, data.len());
let mut expected_offset = 0u64;
for chunk in &chunks {
assert_eq!(chunk.offset, expected_offset);
expected_offset += chunk.size as u64;
}
}
#[test]
fn test_chunk_index() {
let mut index = ChunkIndex::new();
let chunks1 = vec![
Chunk {
offset: 0,
size: 100,
hash: compute_data_hash(b"chunk1"),
},
Chunk {
offset: 100,
size: 100,
hash: compute_data_hash(b"chunk2"),
},
];
let chunks2 = vec![
Chunk {
offset: 0,
size: 100,
hash: compute_data_hash(b"chunk1"), },
Chunk {
offset: 100,
size: 100,
hash: compute_data_hash(b"chunk3"),
},
];
index.add_file("file1.txt", &chunks1);
index.add_file("file2.txt", &chunks2);
assert_eq!(index.chunk_count(), 3); assert_eq!(index.duplicate_count(), 1);
let duplicates = index.find_duplicates();
assert_eq!(duplicates.len(), 1);
assert_eq!(duplicates[0].1.len(), 2); }
#[test]
fn test_dedup_ratio() {
let mut index = ChunkIndex::new();
assert_eq!(index.dedup_ratio(), 0.0);
let chunks = vec![
Chunk {
offset: 0,
size: 100,
hash: compute_data_hash(b"a"),
},
Chunk {
offset: 100,
size: 100,
hash: compute_data_hash(b"b"),
},
];
index.add_file("file1", &chunks);
assert_eq!(index.dedup_ratio(), 0.0);
index.add_file("file2", &chunks);
assert!((index.dedup_ratio() - 0.5).abs() < 0.01);
}
#[test]
fn test_hamming_distance() {
let hash1 = FileHash::from_bytes([0xFF; 32]);
let hash2 = FileHash::from_bytes([0x00; 32]);
assert_eq!(hash1.hamming_distance(&hash2), 256);
let hash3 = FileHash::from_bytes([0xFF; 32]);
assert_eq!(hash1.hamming_distance(&hash3), 0);
}
}