1use crate::{DedupError, DedupResult};
10use blake3::Hasher;
11use std::fs::File;
12use std::io::{BufReader, Read};
13use std::path::Path;
14
15const BUFFER_SIZE: usize = 65536; const DEFAULT_CHUNK_SIZE: usize = 4096; const MIN_CHUNK_SIZE: usize = 1024; const MAX_CHUNK_SIZE: usize = 1048576; const RABIN_POLYNOMIAL: u64 = 0x3DA3_358B_4DC1_73E9;
29
30#[derive(Debug, Clone, PartialEq, Eq)]
32pub struct FileHash {
33 hash: [u8; 32],
34}
35
36impl FileHash {
37 #[must_use]
39 pub fn from_bytes(bytes: [u8; 32]) -> Self {
40 Self { hash: bytes }
41 }
42
43 #[must_use]
45 pub fn as_bytes(&self) -> &[u8; 32] {
46 &self.hash
47 }
48
49 #[must_use]
51 pub fn to_hex(&self) -> String {
52 hex::encode(self.hash)
53 }
54
55 pub fn from_hex(s: &str) -> DedupResult<Self> {
61 let bytes =
62 hex::decode(s).map_err(|e| DedupError::Hash(format!("Invalid hex string: {e}")))?;
63 if bytes.len() != 32 {
64 return Err(DedupError::Hash(format!(
65 "Invalid hash length: expected 32, got {}",
66 bytes.len()
67 )));
68 }
69 let mut hash = [0u8; 32];
70 hash.copy_from_slice(&bytes);
71 Ok(Self::from_bytes(hash))
72 }
73
74 #[must_use]
76 pub fn hamming_distance(&self, other: &Self) -> u32 {
77 self.hash
78 .iter()
79 .zip(other.hash.iter())
80 .map(|(a, b)| (a ^ b).count_ones())
81 .sum()
82 }
83
84 #[must_use]
86 pub fn similarity(&self, other: &Self) -> f64 {
87 let distance = self.hamming_distance(other);
88 let max_distance = 256; 1.0 - (f64::from(distance) / f64::from(max_distance))
90 }
91}
92
93pub fn compute_file_hash(path: impl AsRef<Path>) -> DedupResult<FileHash> {
99 let file = File::open(path)?;
100 let mut reader = BufReader::with_capacity(BUFFER_SIZE, file);
101 let mut hasher = Hasher::new();
102 let mut buffer = vec![0u8; BUFFER_SIZE];
103
104 loop {
105 let n = reader.read(&mut buffer)?;
106 if n == 0 {
107 break;
108 }
109 hasher.update(&buffer[..n]);
110 }
111
112 let hash = hasher.finalize();
113 Ok(FileHash::from_bytes(*hash.as_bytes()))
114}
115
116#[must_use]
118pub fn compute_data_hash(data: &[u8]) -> FileHash {
119 let hash = blake3::hash(data);
120 FileHash::from_bytes(*hash.as_bytes())
121}
122
123#[derive(Debug, Clone)]
125pub struct Chunk {
126 pub offset: u64,
128
129 pub size: usize,
131
132 pub hash: FileHash,
134}
135
136pub struct RollingHash {
138 window_size: usize,
139 polynomial: u64,
140 mod_mask: u64,
141 hash: u64,
142 window: Vec<u8>,
143 window_pos: usize,
144 pow_table: Vec<u64>,
145}
146
147impl RollingHash {
148 #[must_use]
150 pub fn new(window_size: usize) -> Self {
151 let polynomial = RABIN_POLYNOMIAL;
152 let mod_mask = (1u64 << 63) - 1; let mut pow_table = vec![1u64; window_size];
156 for i in 1..window_size {
157 pow_table[i] = pow_table[i - 1].wrapping_mul(polynomial) & mod_mask;
158 }
159
160 Self {
161 window_size,
162 polynomial,
163 mod_mask,
164 hash: 0,
165 window: vec![0u8; window_size],
166 window_pos: 0,
167 pow_table,
168 }
169 }
170
171 pub fn roll(&mut self, byte: u8) -> u64 {
173 let old_byte = self.window[self.window_pos];
175 let old_contribution =
176 u64::from(old_byte).wrapping_mul(self.pow_table[self.window_size - 1]);
177 self.hash = self.hash.wrapping_sub(old_contribution) & self.mod_mask;
178
179 self.hash = self.hash.wrapping_mul(self.polynomial) & self.mod_mask;
181 self.hash = self.hash.wrapping_add(u64::from(byte)) & self.mod_mask;
182
183 self.window[self.window_pos] = byte;
185 self.window_pos = (self.window_pos + 1) % self.window_size;
186
187 self.hash
188 }
189
190 #[must_use]
192 pub fn hash(&self) -> u64 {
193 self.hash
194 }
195
196 pub fn reset(&mut self) {
198 self.hash = 0;
199 self.window.fill(0);
200 self.window_pos = 0;
201 }
202}
203
204pub struct Chunker {
206 chunk_size: usize,
207 min_size: usize,
208 max_size: usize,
209 rolling_hash: RollingHash,
210}
211
212impl Chunker {
213 #[must_use]
215 pub fn new(chunk_size: usize) -> Self {
216 let chunk_size = chunk_size.clamp(MIN_CHUNK_SIZE, MAX_CHUNK_SIZE);
217 let min_size = chunk_size / 2;
218 let max_size = chunk_size * 2;
219 let rolling_hash = RollingHash::new(64); Self {
222 chunk_size,
223 min_size,
224 max_size,
225 rolling_hash,
226 }
227 }
228
229 fn is_boundary(&self, hash: u64) -> bool {
231 let mask = (1u64 << 12) - 1; (hash & mask) == 0
234 }
235
236 #[must_use]
238 pub fn chunk(&mut self, data: &[u8]) -> Vec<Chunk> {
239 let mut chunks = Vec::new();
240 let mut chunk_start = 0;
241 let mut offset = 0u64;
242
243 self.rolling_hash.reset();
244
245 for (pos, &byte) in data.iter().enumerate() {
246 let hash = self.rolling_hash.roll(byte);
247 let chunk_len = pos - chunk_start;
248
249 if chunk_len >= self.min_size && (self.is_boundary(hash) || chunk_len >= self.max_size)
251 {
252 let chunk_data = &data[chunk_start..pos + 1];
253 let chunk_hash = compute_data_hash(chunk_data);
254
255 chunks.push(Chunk {
256 offset,
257 size: chunk_data.len(),
258 hash: chunk_hash,
259 });
260
261 chunk_start = pos + 1;
262 offset += chunk_data.len() as u64;
263 self.rolling_hash.reset();
264 }
265 }
266
267 if chunk_start < data.len() {
269 let chunk_data = &data[chunk_start..];
270 let chunk_hash = compute_data_hash(chunk_data);
271
272 chunks.push(Chunk {
273 offset,
274 size: chunk_data.len(),
275 hash: chunk_hash,
276 });
277 }
278
279 chunks
280 }
281
282 pub fn chunk_file(&mut self, path: impl AsRef<Path>) -> DedupResult<Vec<Chunk>> {
288 let file = File::open(path)?;
289 let mut reader = BufReader::with_capacity(BUFFER_SIZE, file);
290 let mut chunks = Vec::new();
291 let mut chunk_buffer = Vec::new();
292 let mut offset = 0u64;
293 let mut buffer = vec![0u8; BUFFER_SIZE];
294
295 self.rolling_hash.reset();
296
297 loop {
298 let n = reader.read(&mut buffer)?;
299 if n == 0 {
300 break;
301 }
302
303 for &byte in &buffer[..n] {
304 chunk_buffer.push(byte);
305 let hash = self.rolling_hash.roll(byte);
306 let chunk_len = chunk_buffer.len();
307
308 if chunk_len >= self.min_size
310 && (self.is_boundary(hash) || chunk_len >= self.max_size)
311 {
312 let chunk_hash = compute_data_hash(&chunk_buffer);
313
314 chunks.push(Chunk {
315 offset,
316 size: chunk_buffer.len(),
317 hash: chunk_hash,
318 });
319
320 offset += chunk_buffer.len() as u64;
321 chunk_buffer.clear();
322 self.rolling_hash.reset();
323 }
324 }
325 }
326
327 if !chunk_buffer.is_empty() {
329 let chunk_hash = compute_data_hash(&chunk_buffer);
330
331 chunks.push(Chunk {
332 offset,
333 size: chunk_buffer.len(),
334 hash: chunk_hash,
335 });
336 }
337
338 Ok(chunks)
339 }
340}
341
342pub struct ChunkIndex {
344 chunks: Vec<(FileHash, Vec<String>)>,
345}
346
347impl ChunkIndex {
348 #[must_use]
350 pub fn new() -> Self {
351 Self { chunks: Vec::new() }
352 }
353
354 pub fn add_file(&mut self, file_path: &str, chunks: &[Chunk]) {
356 for chunk in chunks {
357 if let Some((_, files)) = self.chunks.iter_mut().find(|(h, _)| h == &chunk.hash) {
358 if !files.contains(&file_path.to_string()) {
359 files.push(file_path.to_string());
360 }
361 } else {
362 self.chunks
363 .push((chunk.hash.clone(), vec![file_path.to_string()]));
364 }
365 }
366 }
367
368 #[must_use]
370 pub fn find_duplicates(&self) -> Vec<(FileHash, Vec<String>)> {
371 self.chunks
372 .iter()
373 .filter(|(_, files)| files.len() > 1)
374 .cloned()
375 .collect()
376 }
377
378 #[must_use]
380 pub fn chunk_count(&self) -> usize {
381 self.chunks.len()
382 }
383
384 #[must_use]
386 pub fn duplicate_count(&self) -> usize {
387 self.chunks
388 .iter()
389 .filter(|(_, files)| files.len() > 1)
390 .count()
391 }
392
393 #[must_use]
395 pub fn dedup_ratio(&self) -> f64 {
396 if self.chunks.is_empty() {
397 return 0.0;
398 }
399 let total: usize = self.chunks.iter().map(|(_, files)| files.len()).sum();
400 let unique = self.chunks.len();
401 if total == 0 {
402 return 0.0;
403 }
404 1.0 - (unique as f64 / total as f64)
405 }
406}
407
408impl Default for ChunkIndex {
409 fn default() -> Self {
410 Self::new()
411 }
412}
413
414pub fn compute_chunk_similarity(
420 path1: impl AsRef<Path>,
421 path2: impl AsRef<Path>,
422 chunk_size: usize,
423) -> DedupResult<f64> {
424 let mut chunker = Chunker::new(chunk_size);
425
426 let chunks1 = chunker.chunk_file(path1)?;
427 let chunks2 = chunker.chunk_file(path2)?;
428
429 if chunks1.is_empty() || chunks2.is_empty() {
430 return Ok(0.0);
431 }
432
433 let hashes1: Vec<_> = chunks1.iter().map(|c| &c.hash).collect();
435 let hashes2: Vec<_> = chunks2.iter().map(|c| &c.hash).collect();
436
437 let shared = hashes1.iter().filter(|h| hashes2.contains(h)).count();
438
439 let total = hashes1.len().max(hashes2.len());
440 Ok(shared as f64 / total as f64)
441}
442
443mod hex {
445 use super::DedupError;
446
447 pub fn encode(bytes: [u8; 32]) -> String {
448 bytes.iter().map(|b| format!("{b:02x}")).collect::<String>()
449 }
450
451 pub fn decode(s: &str) -> Result<Vec<u8>, DedupError> {
452 if s.len() % 2 != 0 {
453 return Err(DedupError::Hash("Odd hex string length".to_string()));
454 }
455
456 (0..s.len())
457 .step_by(2)
458 .map(|i| {
459 u8::from_str_radix(&s[i..i + 2], 16)
460 .map_err(|e| DedupError::Hash(format!("Invalid hex digit: {e}")))
461 })
462 .collect()
463 }
464}
465
466#[cfg(test)]
467mod tests {
468 use super::*;
469
470 #[test]
471 fn test_compute_data_hash() {
472 let data = b"Hello, World!";
473 let hash = compute_data_hash(data);
474 assert_eq!(hash.as_bytes().len(), 32);
475 }
476
477 #[test]
478 fn test_hash_hex() {
479 let data = b"Test";
480 let hash = compute_data_hash(data);
481 let hex = hash.to_hex();
482 assert_eq!(hex.len(), 64); let decoded = FileHash::from_hex(&hex).expect("operation should succeed");
485 assert_eq!(hash, decoded);
486 }
487
488 #[test]
489 fn test_hash_similarity() {
490 let hash1 = compute_data_hash(b"Hello");
491 let hash2 = compute_data_hash(b"Hello");
492 let hash3 = compute_data_hash(b"World");
493
494 assert_eq!(hash1.similarity(&hash2), 1.0);
495 assert!(hash1.similarity(&hash3) < 1.0);
496 }
497
498 #[test]
499 fn test_rolling_hash() {
500 let mut rh = RollingHash::new(8);
501 let data = b"Hello, World!";
502
503 for &byte in data {
504 rh.roll(byte);
505 }
506
507 assert!(rh.hash() != 0);
508
509 rh.reset();
510 assert_eq!(rh.hash(), 0);
511 }
512
513 #[test]
514 fn test_chunker() {
515 let mut chunker = Chunker::new(DEFAULT_CHUNK_SIZE);
516 let data = vec![0u8; 100_000]; let chunks = chunker.chunk(&data);
519 assert!(!chunks.is_empty());
520
521 let total_size: usize = chunks.iter().map(|c| c.size).sum();
523 assert_eq!(total_size, data.len());
524
525 let mut expected_offset = 0u64;
527 for chunk in &chunks {
528 assert_eq!(chunk.offset, expected_offset);
529 expected_offset += chunk.size as u64;
530 }
531 }
532
533 #[test]
534 fn test_chunk_index() {
535 let mut index = ChunkIndex::new();
536
537 let chunks1 = vec![
538 Chunk {
539 offset: 0,
540 size: 100,
541 hash: compute_data_hash(b"chunk1"),
542 },
543 Chunk {
544 offset: 100,
545 size: 100,
546 hash: compute_data_hash(b"chunk2"),
547 },
548 ];
549
550 let chunks2 = vec![
551 Chunk {
552 offset: 0,
553 size: 100,
554 hash: compute_data_hash(b"chunk1"), },
556 Chunk {
557 offset: 100,
558 size: 100,
559 hash: compute_data_hash(b"chunk3"),
560 },
561 ];
562
563 index.add_file("file1.txt", &chunks1);
564 index.add_file("file2.txt", &chunks2);
565
566 assert_eq!(index.chunk_count(), 3); assert_eq!(index.duplicate_count(), 1); let duplicates = index.find_duplicates();
570 assert_eq!(duplicates.len(), 1);
571 assert_eq!(duplicates[0].1.len(), 2); }
573
574 #[test]
575 fn test_dedup_ratio() {
576 let mut index = ChunkIndex::new();
577 assert_eq!(index.dedup_ratio(), 0.0);
578
579 let chunks = vec![
581 Chunk {
582 offset: 0,
583 size: 100,
584 hash: compute_data_hash(b"a"),
585 },
586 Chunk {
587 offset: 100,
588 size: 100,
589 hash: compute_data_hash(b"b"),
590 },
591 ];
592 index.add_file("file1", &chunks);
593
594 assert_eq!(index.dedup_ratio(), 0.0);
596
597 index.add_file("file2", &chunks);
599
600 assert!((index.dedup_ratio() - 0.5).abs() < 0.01);
602 }
603
604 #[test]
605 fn test_hamming_distance() {
606 let hash1 = FileHash::from_bytes([0xFF; 32]);
607 let hash2 = FileHash::from_bytes([0x00; 32]);
608 assert_eq!(hash1.hamming_distance(&hash2), 256);
609
610 let hash3 = FileHash::from_bytes([0xFF; 32]);
611 assert_eq!(hash1.hamming_distance(&hash3), 0);
612 }
613}