Skip to main content

oximedia_dedup/
hash.rs

1//! Cryptographic and content-based hashing for deduplication.
2//!
3//! This module provides:
4//! - BLAKE3 cryptographic hashing for exact duplicate detection
5//! - Content-based chunking with rolling hash (Rabin fingerprinting)
6//! - Chunk-level deduplication for efficient storage
7//! - Hash indexing and comparison
8
9use crate::{DedupError, DedupResult};
10use blake3::Hasher;
11use std::fs::File;
12use std::io::{BufReader, Read};
13use std::path::Path;
14
15/// Size of read buffer for hashing
16const BUFFER_SIZE: usize = 65536; // 64 KB
17
18/// Default chunk size for content-based chunking
19const DEFAULT_CHUNK_SIZE: usize = 4096; // 4 KB
20
21/// Minimum chunk size
22const MIN_CHUNK_SIZE: usize = 1024; // 1 KB
23
24/// Maximum chunk size
25const MAX_CHUNK_SIZE: usize = 1048576; // 1 MB
26
27/// Rabin polynomial for rolling hash
28const RABIN_POLYNOMIAL: u64 = 0x3DA3_358B_4DC1_73E9;
29
30/// File hash result containing BLAKE3 hash.
31#[derive(Debug, Clone, PartialEq, Eq)]
32pub struct FileHash {
33    hash: [u8; 32],
34}
35
36impl FileHash {
37    /// Create from byte array.
38    #[must_use]
39    pub fn from_bytes(bytes: [u8; 32]) -> Self {
40        Self { hash: bytes }
41    }
42
43    /// Get hash as bytes.
44    #[must_use]
45    pub fn as_bytes(&self) -> &[u8; 32] {
46        &self.hash
47    }
48
49    /// Convert to hex string.
50    #[must_use]
51    pub fn to_hex(&self) -> String {
52        hex::encode(self.hash)
53    }
54
55    /// Create from hex string.
56    ///
57    /// # Errors
58    ///
59    /// Returns an error if the hex string is invalid.
60    pub fn from_hex(s: &str) -> DedupResult<Self> {
61        let bytes =
62            hex::decode(s).map_err(|e| DedupError::Hash(format!("Invalid hex string: {e}")))?;
63        if bytes.len() != 32 {
64            return Err(DedupError::Hash(format!(
65                "Invalid hash length: expected 32, got {}",
66                bytes.len()
67            )));
68        }
69        let mut hash = [0u8; 32];
70        hash.copy_from_slice(&bytes);
71        Ok(Self::from_bytes(hash))
72    }
73
74    /// Compute Hamming distance between two hashes.
75    #[must_use]
76    pub fn hamming_distance(&self, other: &Self) -> u32 {
77        self.hash
78            .iter()
79            .zip(other.hash.iter())
80            .map(|(a, b)| (a ^ b).count_ones())
81            .sum()
82    }
83
84    /// Compute similarity (0.0-1.0) based on Hamming distance.
85    #[must_use]
86    pub fn similarity(&self, other: &Self) -> f64 {
87        let distance = self.hamming_distance(other);
88        let max_distance = 256; // 32 bytes * 8 bits
89        1.0 - (f64::from(distance) / f64::from(max_distance))
90    }
91}
92
93/// Compute BLAKE3 hash of a file.
94///
95/// # Errors
96///
97/// Returns an error if the file cannot be read.
98pub fn compute_file_hash(path: impl AsRef<Path>) -> DedupResult<FileHash> {
99    let file = File::open(path)?;
100    let mut reader = BufReader::with_capacity(BUFFER_SIZE, file);
101    let mut hasher = Hasher::new();
102    let mut buffer = vec![0u8; BUFFER_SIZE];
103
104    loop {
105        let n = reader.read(&mut buffer)?;
106        if n == 0 {
107            break;
108        }
109        hasher.update(&buffer[..n]);
110    }
111
112    let hash = hasher.finalize();
113    Ok(FileHash::from_bytes(*hash.as_bytes()))
114}
115
116/// Compute BLAKE3 hash of data.
117#[must_use]
118pub fn compute_data_hash(data: &[u8]) -> FileHash {
119    let hash = blake3::hash(data);
120    FileHash::from_bytes(*hash.as_bytes())
121}
122
123/// Content chunk with hash.
124#[derive(Debug, Clone)]
125pub struct Chunk {
126    /// Offset in file
127    pub offset: u64,
128
129    /// Size of chunk
130    pub size: usize,
131
132    /// BLAKE3 hash of chunk
133    pub hash: FileHash,
134}
135
136/// Rolling hash for content-based chunking.
137pub struct RollingHash {
138    window_size: usize,
139    polynomial: u64,
140    mod_mask: u64,
141    hash: u64,
142    window: Vec<u8>,
143    window_pos: usize,
144    pow_table: Vec<u64>,
145}
146
147impl RollingHash {
148    /// Create a new rolling hash with specified window size.
149    #[must_use]
150    pub fn new(window_size: usize) -> Self {
151        let polynomial = RABIN_POLYNOMIAL;
152        let mod_mask = (1u64 << 63) - 1; // Use lower 63 bits
153
154        // Precompute powers of polynomial
155        let mut pow_table = vec![1u64; window_size];
156        for i in 1..window_size {
157            pow_table[i] = pow_table[i - 1].wrapping_mul(polynomial) & mod_mask;
158        }
159
160        Self {
161            window_size,
162            polynomial,
163            mod_mask,
164            hash: 0,
165            window: vec![0u8; window_size],
166            window_pos: 0,
167            pow_table,
168        }
169    }
170
171    /// Roll the hash with a new byte.
172    pub fn roll(&mut self, byte: u8) -> u64 {
173        // Remove oldest byte's contribution
174        let old_byte = self.window[self.window_pos];
175        let old_contribution =
176            u64::from(old_byte).wrapping_mul(self.pow_table[self.window_size - 1]);
177        self.hash = self.hash.wrapping_sub(old_contribution) & self.mod_mask;
178
179        // Shift hash and add new byte
180        self.hash = self.hash.wrapping_mul(self.polynomial) & self.mod_mask;
181        self.hash = self.hash.wrapping_add(u64::from(byte)) & self.mod_mask;
182
183        // Update window
184        self.window[self.window_pos] = byte;
185        self.window_pos = (self.window_pos + 1) % self.window_size;
186
187        self.hash
188    }
189
190    /// Get current hash value.
191    #[must_use]
192    pub fn hash(&self) -> u64 {
193        self.hash
194    }
195
196    /// Reset the rolling hash.
197    pub fn reset(&mut self) {
198        self.hash = 0;
199        self.window.fill(0);
200        self.window_pos = 0;
201    }
202}
203
204/// Chunker for content-based chunking.
205pub struct Chunker {
206    chunk_size: usize,
207    min_size: usize,
208    max_size: usize,
209    rolling_hash: RollingHash,
210}
211
212impl Chunker {
213    /// Create a new chunker with specified parameters.
214    #[must_use]
215    pub fn new(chunk_size: usize) -> Self {
216        let chunk_size = chunk_size.clamp(MIN_CHUNK_SIZE, MAX_CHUNK_SIZE);
217        let min_size = chunk_size / 2;
218        let max_size = chunk_size * 2;
219        let rolling_hash = RollingHash::new(64); // 64-byte window
220
221        Self {
222            chunk_size,
223            min_size,
224            max_size,
225            rolling_hash,
226        }
227    }
228
229    /// Check if this is a chunk boundary.
230    fn is_boundary(&self, hash: u64) -> bool {
231        // Use lower bits as boundary marker
232        let mask = (1u64 << 12) - 1; // 12-bit mask for 4KB average chunks
233        (hash & mask) == 0
234    }
235
236    /// Chunk data into content-defined chunks.
237    #[must_use]
238    pub fn chunk(&mut self, data: &[u8]) -> Vec<Chunk> {
239        let mut chunks = Vec::new();
240        let mut chunk_start = 0;
241        let mut offset = 0u64;
242
243        self.rolling_hash.reset();
244
245        for (pos, &byte) in data.iter().enumerate() {
246            let hash = self.rolling_hash.roll(byte);
247            let chunk_len = pos - chunk_start;
248
249            // Check for chunk boundary
250            if chunk_len >= self.min_size && (self.is_boundary(hash) || chunk_len >= self.max_size)
251            {
252                let chunk_data = &data[chunk_start..pos + 1];
253                let chunk_hash = compute_data_hash(chunk_data);
254
255                chunks.push(Chunk {
256                    offset,
257                    size: chunk_data.len(),
258                    hash: chunk_hash,
259                });
260
261                chunk_start = pos + 1;
262                offset += chunk_data.len() as u64;
263                self.rolling_hash.reset();
264            }
265        }
266
267        // Add remaining data as final chunk
268        if chunk_start < data.len() {
269            let chunk_data = &data[chunk_start..];
270            let chunk_hash = compute_data_hash(chunk_data);
271
272            chunks.push(Chunk {
273                offset,
274                size: chunk_data.len(),
275                hash: chunk_hash,
276            });
277        }
278
279        chunks
280    }
281
282    /// Chunk a file into content-defined chunks.
283    ///
284    /// # Errors
285    ///
286    /// Returns an error if the file cannot be read.
287    pub fn chunk_file(&mut self, path: impl AsRef<Path>) -> DedupResult<Vec<Chunk>> {
288        let file = File::open(path)?;
289        let mut reader = BufReader::with_capacity(BUFFER_SIZE, file);
290        let mut chunks = Vec::new();
291        let mut chunk_buffer = Vec::new();
292        let mut offset = 0u64;
293        let mut buffer = vec![0u8; BUFFER_SIZE];
294
295        self.rolling_hash.reset();
296
297        loop {
298            let n = reader.read(&mut buffer)?;
299            if n == 0 {
300                break;
301            }
302
303            for &byte in &buffer[..n] {
304                chunk_buffer.push(byte);
305                let hash = self.rolling_hash.roll(byte);
306                let chunk_len = chunk_buffer.len();
307
308                // Check for chunk boundary
309                if chunk_len >= self.min_size
310                    && (self.is_boundary(hash) || chunk_len >= self.max_size)
311                {
312                    let chunk_hash = compute_data_hash(&chunk_buffer);
313
314                    chunks.push(Chunk {
315                        offset,
316                        size: chunk_buffer.len(),
317                        hash: chunk_hash,
318                    });
319
320                    offset += chunk_buffer.len() as u64;
321                    chunk_buffer.clear();
322                    self.rolling_hash.reset();
323                }
324            }
325        }
326
327        // Add remaining data as final chunk
328        if !chunk_buffer.is_empty() {
329            let chunk_hash = compute_data_hash(&chunk_buffer);
330
331            chunks.push(Chunk {
332                offset,
333                size: chunk_buffer.len(),
334                hash: chunk_hash,
335            });
336        }
337
338        Ok(chunks)
339    }
340}
341
342/// Chunk index for deduplication.
343pub struct ChunkIndex {
344    chunks: Vec<(FileHash, Vec<String>)>,
345}
346
347impl ChunkIndex {
348    /// Create a new chunk index.
349    #[must_use]
350    pub fn new() -> Self {
351        Self { chunks: Vec::new() }
352    }
353
354    /// Add chunks from a file.
355    pub fn add_file(&mut self, file_path: &str, chunks: &[Chunk]) {
356        for chunk in chunks {
357            if let Some((_, files)) = self.chunks.iter_mut().find(|(h, _)| h == &chunk.hash) {
358                if !files.contains(&file_path.to_string()) {
359                    files.push(file_path.to_string());
360                }
361            } else {
362                self.chunks
363                    .push((chunk.hash.clone(), vec![file_path.to_string()]));
364            }
365        }
366    }
367
368    /// Find duplicate chunks.
369    #[must_use]
370    pub fn find_duplicates(&self) -> Vec<(FileHash, Vec<String>)> {
371        self.chunks
372            .iter()
373            .filter(|(_, files)| files.len() > 1)
374            .cloned()
375            .collect()
376    }
377
378    /// Get total number of chunks.
379    #[must_use]
380    pub fn chunk_count(&self) -> usize {
381        self.chunks.len()
382    }
383
384    /// Get number of duplicate chunks.
385    #[must_use]
386    pub fn duplicate_count(&self) -> usize {
387        self.chunks
388            .iter()
389            .filter(|(_, files)| files.len() > 1)
390            .count()
391    }
392
393    /// Calculate deduplication ratio.
394    #[must_use]
395    pub fn dedup_ratio(&self) -> f64 {
396        if self.chunks.is_empty() {
397            return 0.0;
398        }
399        let total: usize = self.chunks.iter().map(|(_, files)| files.len()).sum();
400        let unique = self.chunks.len();
401        if total == 0 {
402            return 0.0;
403        }
404        1.0 - (unique as f64 / total as f64)
405    }
406}
407
408impl Default for ChunkIndex {
409    fn default() -> Self {
410        Self::new()
411    }
412}
413
414/// Compute similarity between two files based on shared chunks.
415///
416/// # Errors
417///
418/// Returns an error if files cannot be read or chunked.
419pub fn compute_chunk_similarity(
420    path1: impl AsRef<Path>,
421    path2: impl AsRef<Path>,
422    chunk_size: usize,
423) -> DedupResult<f64> {
424    let mut chunker = Chunker::new(chunk_size);
425
426    let chunks1 = chunker.chunk_file(path1)?;
427    let chunks2 = chunker.chunk_file(path2)?;
428
429    if chunks1.is_empty() || chunks2.is_empty() {
430        return Ok(0.0);
431    }
432
433    // Count shared chunks
434    let hashes1: Vec<_> = chunks1.iter().map(|c| &c.hash).collect();
435    let hashes2: Vec<_> = chunks2.iter().map(|c| &c.hash).collect();
436
437    let shared = hashes1.iter().filter(|h| hashes2.contains(h)).count();
438
439    let total = hashes1.len().max(hashes2.len());
440    Ok(shared as f64 / total as f64)
441}
442
443/// Helper function to decode hex strings
444mod hex {
445    use super::DedupError;
446
447    pub fn encode(bytes: [u8; 32]) -> String {
448        bytes.iter().map(|b| format!("{b:02x}")).collect::<String>()
449    }
450
451    pub fn decode(s: &str) -> Result<Vec<u8>, DedupError> {
452        if s.len() % 2 != 0 {
453            return Err(DedupError::Hash("Odd hex string length".to_string()));
454        }
455
456        (0..s.len())
457            .step_by(2)
458            .map(|i| {
459                u8::from_str_radix(&s[i..i + 2], 16)
460                    .map_err(|e| DedupError::Hash(format!("Invalid hex digit: {e}")))
461            })
462            .collect()
463    }
464}
465
466#[cfg(test)]
467mod tests {
468    use super::*;
469
470    #[test]
471    fn test_compute_data_hash() {
472        let data = b"Hello, World!";
473        let hash = compute_data_hash(data);
474        assert_eq!(hash.as_bytes().len(), 32);
475    }
476
477    #[test]
478    fn test_hash_hex() {
479        let data = b"Test";
480        let hash = compute_data_hash(data);
481        let hex = hash.to_hex();
482        assert_eq!(hex.len(), 64); // 32 bytes * 2 hex chars
483
484        let decoded = FileHash::from_hex(&hex).expect("operation should succeed");
485        assert_eq!(hash, decoded);
486    }
487
488    #[test]
489    fn test_hash_similarity() {
490        let hash1 = compute_data_hash(b"Hello");
491        let hash2 = compute_data_hash(b"Hello");
492        let hash3 = compute_data_hash(b"World");
493
494        assert_eq!(hash1.similarity(&hash2), 1.0);
495        assert!(hash1.similarity(&hash3) < 1.0);
496    }
497
498    #[test]
499    fn test_rolling_hash() {
500        let mut rh = RollingHash::new(8);
501        let data = b"Hello, World!";
502
503        for &byte in data {
504            rh.roll(byte);
505        }
506
507        assert!(rh.hash() != 0);
508
509        rh.reset();
510        assert_eq!(rh.hash(), 0);
511    }
512
513    #[test]
514    fn test_chunker() {
515        let mut chunker = Chunker::new(DEFAULT_CHUNK_SIZE);
516        let data = vec![0u8; 100_000]; // 100 KB of zeros
517
518        let chunks = chunker.chunk(&data);
519        assert!(!chunks.is_empty());
520
521        // Verify chunk properties
522        let total_size: usize = chunks.iter().map(|c| c.size).sum();
523        assert_eq!(total_size, data.len());
524
525        // Verify offsets
526        let mut expected_offset = 0u64;
527        for chunk in &chunks {
528            assert_eq!(chunk.offset, expected_offset);
529            expected_offset += chunk.size as u64;
530        }
531    }
532
533    #[test]
534    fn test_chunk_index() {
535        let mut index = ChunkIndex::new();
536
537        let chunks1 = vec![
538            Chunk {
539                offset: 0,
540                size: 100,
541                hash: compute_data_hash(b"chunk1"),
542            },
543            Chunk {
544                offset: 100,
545                size: 100,
546                hash: compute_data_hash(b"chunk2"),
547            },
548        ];
549
550        let chunks2 = vec![
551            Chunk {
552                offset: 0,
553                size: 100,
554                hash: compute_data_hash(b"chunk1"), // Duplicate
555            },
556            Chunk {
557                offset: 100,
558                size: 100,
559                hash: compute_data_hash(b"chunk3"),
560            },
561        ];
562
563        index.add_file("file1.txt", &chunks1);
564        index.add_file("file2.txt", &chunks2);
565
566        assert_eq!(index.chunk_count(), 3); // chunk1, chunk2, chunk3
567        assert_eq!(index.duplicate_count(), 1); // chunk1 is duplicated
568
569        let duplicates = index.find_duplicates();
570        assert_eq!(duplicates.len(), 1);
571        assert_eq!(duplicates[0].1.len(), 2); // chunk1 in both files
572    }
573
574    #[test]
575    fn test_dedup_ratio() {
576        let mut index = ChunkIndex::new();
577        assert_eq!(index.dedup_ratio(), 0.0);
578
579        // Add unique chunks
580        let chunks = vec![
581            Chunk {
582                offset: 0,
583                size: 100,
584                hash: compute_data_hash(b"a"),
585            },
586            Chunk {
587                offset: 100,
588                size: 100,
589                hash: compute_data_hash(b"b"),
590            },
591        ];
592        index.add_file("file1", &chunks);
593
594        // No deduplication yet
595        assert_eq!(index.dedup_ratio(), 0.0);
596
597        // Add duplicate chunks
598        index.add_file("file2", &chunks);
599
600        // 50% deduplication (2 unique, 4 total references)
601        assert!((index.dedup_ratio() - 0.5).abs() < 0.01);
602    }
603
604    #[test]
605    fn test_hamming_distance() {
606        let hash1 = FileHash::from_bytes([0xFF; 32]);
607        let hash2 = FileHash::from_bytes([0x00; 32]);
608        assert_eq!(hash1.hamming_distance(&hash2), 256);
609
610        let hash3 = FileHash::from_bytes([0xFF; 32]);
611        assert_eq!(hash1.hamming_distance(&hash3), 0);
612    }
613}