Skip to main content

oximedia_dedup/
dedup_index.rs

1//! Persistent-style deduplication index.
2//!
3//! Tracks content hashes together with occurrence counts and space-saving
4//! metrics.  The index operates entirely in memory; persistence to disk can
5//! be added by serialising `DedupIndex` with serde.
6
7#![allow(dead_code)]
8#![allow(clippy::cast_precision_loss)]
9
10use std::path::{Path, PathBuf};
11
12use rayon::prelude::*;
13
14// ---------------------------------------------------------------------------
15// DedupEntry
16// ---------------------------------------------------------------------------
17
18/// A single entry in the deduplication index.
19#[derive(Debug, Clone, PartialEq, Eq)]
20pub struct DedupEntry {
21    /// Unique identifier assigned by the index.
22    pub id: u64,
23    /// Content hash (arbitrary bytes, e.g. BLAKE3 digest).
24    pub content_hash: Vec<u8>,
25    /// Size of the deduplicated content in bytes.
26    pub size_bytes: u64,
27    /// Unix epoch timestamp when this hash was first seen.
28    pub first_seen_epoch: u64,
29    /// Number of times this hash has been seen (≥ 1 after creation).
30    pub occurrence_count: u32,
31}
32
33impl DedupEntry {
34    /// Returns `true` if this hash has been seen more than once.
35    #[must_use]
36    pub fn is_duplicate(&self) -> bool {
37        self.occurrence_count > 1
38    }
39
40    /// Space saved by deduplication.
41    ///
42    /// If the content appeared N times, only one copy is stored.
43    /// Savings = `(N - 1) * size_bytes`.
44    #[must_use]
45    pub fn space_savings(&self) -> u64 {
46        if self.occurrence_count <= 1 {
47            return 0;
48        }
49        (self.occurrence_count as u64 - 1).saturating_mul(self.size_bytes)
50    }
51
52    /// Return the last-seen epoch, which equals `first_seen_epoch` until we
53    /// track updates (here we simply alias for interface completeness).
54    #[must_use]
55    pub fn first_seen(&self) -> u64 {
56        self.first_seen_epoch
57    }
58
59    /// Hex representation of the content hash.
60    #[must_use]
61    pub fn hash_hex(&self) -> String {
62        self.content_hash
63            .iter()
64            .map(|b| format!("{b:02x}"))
65            .collect()
66    }
67}
68
69// ---------------------------------------------------------------------------
70// DedupIndex
71// ---------------------------------------------------------------------------
72
73/// In-memory deduplication index.
74///
75/// Each unique content hash is stored once; subsequent insertions increment
76/// the occurrence counter.
77pub struct DedupIndex {
78    /// All known entries, ordered by insertion.
79    pub entries: Vec<DedupEntry>,
80    /// Next ID to assign.
81    pub next_id: u64,
82}
83
84impl DedupIndex {
85    /// Create a new, empty index.
86    #[must_use]
87    pub fn new() -> Self {
88        Self {
89            entries: Vec::new(),
90            next_id: 1,
91        }
92    }
93
94    /// Add a content hash to the index, or increment its counter if already present.
95    ///
96    /// Returns the `id` of the (existing or newly created) entry.
97    pub fn add_or_increment(&mut self, hash: Vec<u8>, size_bytes: u64, epoch: u64) -> u64 {
98        if let Some(entry) = self.entries.iter_mut().find(|e| e.content_hash == hash) {
99            entry.occurrence_count += 1;
100            return entry.id;
101        }
102
103        let id = self.next_id;
104        self.next_id += 1;
105        self.entries.push(DedupEntry {
106            id,
107            content_hash: hash,
108            size_bytes,
109            first_seen_epoch: epoch,
110            occurrence_count: 1,
111        });
112        id
113    }
114
115    /// Find an entry by its content hash.
116    #[must_use]
117    pub fn find_by_hash(&self, hash: &[u8]) -> Option<&DedupEntry> {
118        self.entries.iter().find(|e| e.content_hash == hash)
119    }
120
121    /// Find an entry by its assigned ID.
122    #[must_use]
123    pub fn find_by_id(&self, id: u64) -> Option<&DedupEntry> {
124        self.entries.iter().find(|e| e.id == id)
125    }
126
127    /// Return all entries that are duplicates (occurrence_count > 1).
128    #[must_use]
129    pub fn find_duplicates(&self) -> Vec<&DedupEntry> {
130        self.entries.iter().filter(|e| e.is_duplicate()).collect()
131    }
132
133    /// Total space saved across all duplicate entries.
134    #[must_use]
135    pub fn total_space_savings(&self) -> u64 {
136        self.entries.iter().map(|e| e.space_savings()).sum()
137    }
138
139    /// Number of unique content hashes in the index.
140    #[must_use]
141    pub fn unique_count(&self) -> usize {
142        self.entries.len()
143    }
144
145    /// Total number of content insertions (sum of all occurrence counts).
146    #[must_use]
147    pub fn total_insertions(&self) -> u64 {
148        self.entries.iter().map(|e| e.occurrence_count as u64).sum()
149    }
150
151    /// Remove an entry by hash.  Returns `true` if an entry was removed.
152    pub fn remove_by_hash(&mut self, hash: &[u8]) -> bool {
153        if let Some(pos) = self.entries.iter().position(|e| e.content_hash == hash) {
154            self.entries.swap_remove(pos);
155            true
156        } else {
157            false
158        }
159    }
160
161    /// Clear the entire index.
162    pub fn clear(&mut self) {
163        self.entries.clear();
164        self.next_id = 1;
165    }
166
167    /// Return entries sorted by occurrence count (descending).
168    #[must_use]
169    pub fn most_common(&self) -> Vec<&DedupEntry> {
170        let mut sorted: Vec<&DedupEntry> = self.entries.iter().collect();
171        sorted.sort_by(|a, b| b.occurrence_count.cmp(&a.occurrence_count));
172        sorted
173    }
174
175    /// Report: percentage of insertions that were duplicates.
176    ///
177    /// Returns `0.0` if no insertions have been made.
178    #[must_use]
179    pub fn duplicate_rate(&self) -> f64 {
180        let total = self.total_insertions();
181        if total == 0 {
182            return 0.0;
183        }
184        let unique = self.unique_count() as u64;
185        let dupes = total.saturating_sub(unique);
186        dupes as f64 / total as f64
187    }
188}
189
190impl Default for DedupIndex {
191    fn default() -> Self {
192        Self::new()
193    }
194}
195
196// ---------------------------------------------------------------------------
197// Parallel add_files support
198// ---------------------------------------------------------------------------
199
200/// Features extracted from a file for deduplication purposes.
201#[derive(Debug, Clone)]
202pub struct FileFeatures {
203    /// Path of the source file.
204    pub path: PathBuf,
205    /// BLAKE3-style content hash of the file (or a synthetic hash for tests).
206    pub content_hash: Vec<u8>,
207    /// File size in bytes.
208    pub size_bytes: u64,
209    /// Modification time as seconds since UNIX epoch.
210    pub mtime_epoch: u64,
211}
212
213/// Compute features for a single file.
214///
215/// Uses a streaming FNV-1a hash over the file bytes as a lightweight
216/// approximation of a cryptographic digest.  For production use, callers
217/// should substitute a BLAKE3 hasher here.
218pub fn compute_file_features(path: &Path) -> std::io::Result<FileFeatures> {
219    use std::io::Read;
220    use std::time::UNIX_EPOCH;
221
222    let meta = std::fs::metadata(path)?;
223    let size_bytes = meta.len();
224    let mtime_epoch = meta
225        .modified()
226        .unwrap_or(UNIX_EPOCH)
227        .duration_since(UNIX_EPOCH)
228        .unwrap_or_default()
229        .as_secs();
230
231    // FNV-1a streaming hash over file contents.
232    const FNV_OFFSET: u64 = 0xcbf2_9ce4_8422_2325;
233    const FNV_PRIME: u64 = 0x0000_0100_0000_01b3;
234    let mut hash: u64 = FNV_OFFSET;
235
236    let file = std::fs::File::open(path)?;
237    let mut reader = std::io::BufReader::with_capacity(65_536, file);
238    let mut buf = vec![0u8; 65_536];
239    loop {
240        let n = reader.read(&mut buf)?;
241        if n == 0 {
242            break;
243        }
244        for &b in &buf[..n] {
245            hash = (hash ^ u64::from(b)).wrapping_mul(FNV_PRIME);
246        }
247    }
248
249    Ok(FileFeatures {
250        path: path.to_path_buf(),
251        content_hash: hash.to_le_bytes().to_vec(),
252        size_bytes,
253        mtime_epoch,
254    })
255}
256
257impl DedupIndex {
258    /// Add multiple files to the index in parallel (feature extraction) and
259    /// insert results sequentially.
260    ///
261    /// Files that cannot be read are silently skipped; their paths are returned
262    /// in the error list.
263    ///
264    /// # Returns
265    ///
266    /// A tuple `(added_ids, skipped_paths)`.
267    pub fn add_files(&mut self, paths: &[impl AsRef<Path> + Sync]) -> (Vec<u64>, Vec<PathBuf>) {
268        // --- parallel feature extraction ---
269        let results: Vec<(PathBuf, std::io::Result<FileFeatures>)> = paths
270            .par_iter()
271            .map(|p| {
272                let path = p.as_ref().to_path_buf();
273                let feat = compute_file_features(&path);
274                (path, feat)
275            })
276            .collect();
277
278        // --- sequential DB insertion ---
279        let mut added_ids = Vec::with_capacity(results.len());
280        let mut skipped = Vec::new();
281
282        for (path, result) in results {
283            match result {
284                Ok(feat) => {
285                    let id =
286                        self.add_or_increment(feat.content_hash, feat.size_bytes, feat.mtime_epoch);
287                    added_ids.push(id);
288                }
289                Err(_) => {
290                    skipped.push(path);
291                }
292            }
293        }
294
295        (added_ids, skipped)
296    }
297}
298
299// ---------------------------------------------------------------------------
300// Unit tests
301// ---------------------------------------------------------------------------
302
303#[cfg(test)]
304mod tests {
305    use super::*;
306    use std::env::temp_dir;
307
308    fn hash(s: &str) -> Vec<u8> {
309        s.as_bytes().to_vec()
310    }
311
312    /// Write `n` temp files with distinct content and return their paths.
313    fn make_temp_files(n: usize) -> Vec<std::path::PathBuf> {
314        use std::sync::atomic::{AtomicU64, Ordering};
315        static COUNTER: AtomicU64 = AtomicU64::new(0);
316
317        let base = temp_dir();
318        let pid = std::process::id();
319        (0..n)
320            .map(|i| {
321                let uid = COUNTER.fetch_add(1, Ordering::Relaxed);
322                let path = base.join(format!("dedup_idx_{pid}_{uid}_{i}.bin"));
323                std::fs::write(&path, format!("synthetic content for file {pid}_{uid}_{i}"))
324                    .expect("write temp file");
325                path
326            })
327            .collect()
328    }
329
330    // ---- DedupEntry tests ----
331
332    #[test]
333    fn test_entry_is_duplicate_false_when_once() {
334        let e = DedupEntry {
335            id: 1,
336            content_hash: hash("abc"),
337            size_bytes: 1024,
338            first_seen_epoch: 0,
339            occurrence_count: 1,
340        };
341        assert!(!e.is_duplicate());
342    }
343
344    #[test]
345    fn test_entry_is_duplicate_true_when_multiple() {
346        let e = DedupEntry {
347            id: 1,
348            content_hash: hash("abc"),
349            size_bytes: 1024,
350            first_seen_epoch: 0,
351            occurrence_count: 3,
352        };
353        assert!(e.is_duplicate());
354    }
355
356    #[test]
357    fn test_entry_space_savings_zero_when_once() {
358        let e = DedupEntry {
359            id: 1,
360            content_hash: hash("abc"),
361            size_bytes: 500,
362            first_seen_epoch: 0,
363            occurrence_count: 1,
364        };
365        assert_eq!(e.space_savings(), 0);
366    }
367
368    #[test]
369    fn test_entry_space_savings_correct() {
370        let e = DedupEntry {
371            id: 1,
372            content_hash: hash("abc"),
373            size_bytes: 1000,
374            first_seen_epoch: 0,
375            occurrence_count: 4,
376        };
377        // (4-1) * 1000 = 3000
378        assert_eq!(e.space_savings(), 3000);
379    }
380
381    #[test]
382    fn test_entry_hash_hex() {
383        let e = DedupEntry {
384            id: 1,
385            content_hash: vec![0xDE, 0xAD, 0xBE, 0xEF],
386            size_bytes: 0,
387            first_seen_epoch: 0,
388            occurrence_count: 1,
389        };
390        assert_eq!(e.hash_hex(), "deadbeef");
391    }
392
393    // ---- DedupIndex tests ----
394
395    #[test]
396    fn test_index_new_empty() {
397        let idx = DedupIndex::new();
398        assert_eq!(idx.unique_count(), 0);
399        assert_eq!(idx.total_insertions(), 0);
400        assert_eq!(idx.total_space_savings(), 0);
401    }
402
403    #[test]
404    fn test_add_or_increment_new_entry() {
405        let mut idx = DedupIndex::new();
406        let id = idx.add_or_increment(hash("file_a"), 100, 1000);
407        assert_eq!(idx.unique_count(), 1);
408        assert_eq!(id, 1);
409    }
410
411    #[test]
412    fn test_add_or_increment_existing_entry() {
413        let mut idx = DedupIndex::new();
414        let id1 = idx.add_or_increment(hash("file_a"), 100, 1000);
415        let id2 = idx.add_or_increment(hash("file_a"), 100, 2000);
416        assert_eq!(id1, id2, "Same hash should return same ID");
417        assert_eq!(idx.unique_count(), 1, "Should still be one unique entry");
418        let entry = idx
419            .find_by_hash(&hash("file_a"))
420            .expect("operation should succeed");
421        assert_eq!(entry.occurrence_count, 2);
422    }
423
424    #[test]
425    fn test_find_by_hash_found() {
426        let mut idx = DedupIndex::new();
427        idx.add_or_increment(hash("alpha"), 200, 0);
428        let entry = idx.find_by_hash(&hash("alpha"));
429        assert!(entry.is_some());
430        assert_eq!(entry.expect("operation should succeed").size_bytes, 200);
431    }
432
433    #[test]
434    fn test_find_by_hash_not_found() {
435        let idx = DedupIndex::new();
436        assert!(idx.find_by_hash(&hash("missing")).is_none());
437    }
438
439    #[test]
440    fn test_find_by_id() {
441        let mut idx = DedupIndex::new();
442        let id = idx.add_or_increment(hash("entry_1"), 512, 100);
443        let entry = idx.find_by_id(id);
444        assert!(entry.is_some());
445        assert_eq!(
446            entry.expect("operation should succeed").content_hash,
447            hash("entry_1")
448        );
449    }
450
451    #[test]
452    fn test_find_duplicates() {
453        let mut idx = DedupIndex::new();
454        idx.add_or_increment(hash("unique"), 10, 0);
455        idx.add_or_increment(hash("dup"), 20, 0);
456        idx.add_or_increment(hash("dup"), 20, 1);
457        idx.add_or_increment(hash("dup"), 20, 2);
458
459        let dups = idx.find_duplicates();
460        assert_eq!(dups.len(), 1);
461        assert_eq!(dups[0].content_hash, hash("dup"));
462        assert_eq!(dups[0].occurrence_count, 3);
463    }
464
465    #[test]
466    fn test_total_space_savings() {
467        let mut idx = DedupIndex::new();
468        // 1 occurrence → 0 savings
469        idx.add_or_increment(hash("a"), 100, 0);
470        // 3 occurrences → (3-1)*200 = 400 savings
471        idx.add_or_increment(hash("b"), 200, 0);
472        idx.add_or_increment(hash("b"), 200, 1);
473        idx.add_or_increment(hash("b"), 200, 2);
474
475        assert_eq!(idx.total_space_savings(), 400);
476    }
477
478    #[test]
479    fn test_remove_by_hash() {
480        let mut idx = DedupIndex::new();
481        idx.add_or_increment(hash("to_remove"), 50, 0);
482        assert_eq!(idx.unique_count(), 1);
483        let removed = idx.remove_by_hash(&hash("to_remove"));
484        assert!(removed);
485        assert_eq!(idx.unique_count(), 0);
486    }
487
488    #[test]
489    fn test_remove_nonexistent_returns_false() {
490        let mut idx = DedupIndex::new();
491        assert!(!idx.remove_by_hash(&hash("ghost")));
492    }
493
494    #[test]
495    fn test_clear() {
496        let mut idx = DedupIndex::new();
497        idx.add_or_increment(hash("x"), 10, 0);
498        idx.add_or_increment(hash("y"), 20, 0);
499        idx.clear();
500        assert_eq!(idx.unique_count(), 0);
501        assert_eq!(idx.next_id, 1);
502    }
503
504    #[test]
505    fn test_duplicate_rate() {
506        let mut idx = DedupIndex::new();
507        // 1 unique, seen 3 times → 2 duplicate insertions out of 3 total ≈ 0.667
508        idx.add_or_increment(hash("h"), 100, 0);
509        idx.add_or_increment(hash("h"), 100, 1);
510        idx.add_or_increment(hash("h"), 100, 2);
511
512        let rate = idx.duplicate_rate();
513        let expected = 2.0 / 3.0;
514        assert!((rate - expected).abs() < 1e-9);
515    }
516
517    #[test]
518    fn test_most_common_ordering() {
519        let mut idx = DedupIndex::new();
520        idx.add_or_increment(hash("rare"), 10, 0);
521        idx.add_or_increment(hash("common"), 100, 0);
522        idx.add_or_increment(hash("common"), 100, 1);
523        idx.add_or_increment(hash("common"), 100, 2);
524        idx.add_or_increment(hash("mid"), 50, 0);
525        idx.add_or_increment(hash("mid"), 50, 1);
526
527        let mc = idx.most_common();
528        assert_eq!(mc[0].content_hash, hash("common"));
529        assert_eq!(mc[1].content_hash, hash("mid"));
530        assert_eq!(mc[2].content_hash, hash("rare"));
531    }
532
533    #[test]
534    fn test_total_insertions() {
535        let mut idx = DedupIndex::new();
536        idx.add_or_increment(hash("a"), 1, 0);
537        idx.add_or_increment(hash("a"), 1, 1);
538        idx.add_or_increment(hash("b"), 1, 0);
539        // a: 2, b: 1 → total = 3
540        assert_eq!(idx.total_insertions(), 3);
541    }
542
543    // ── add_files parallel tests ──────────────────────────────────────────────
544
545    #[test]
546    fn test_add_files_parallel_same_result_as_sequential() {
547        let paths = make_temp_files(10);
548
549        // Parallel path.
550        let mut par_idx = DedupIndex::new();
551        let (par_ids, par_skipped) = par_idx.add_files(&paths);
552        assert!(par_skipped.is_empty(), "no files should be skipped");
553        assert_eq!(par_ids.len(), 10, "all 10 files should produce an ID");
554
555        // Sequential reference path: compute features one by one.
556        let mut seq_idx = DedupIndex::new();
557        let mut seq_ids = Vec::with_capacity(paths.len());
558        for p in &paths {
559            let feat = compute_file_features(p.as_ref()).expect("compute features");
560            let id = seq_idx.add_or_increment(feat.content_hash, feat.size_bytes, feat.mtime_epoch);
561            seq_ids.push(id);
562        }
563
564        // Both indexes should contain the same number of unique hashes.
565        assert_eq!(
566            par_idx.unique_count(),
567            seq_idx.unique_count(),
568            "unique count must match between parallel and sequential"
569        );
570
571        // Each file had distinct content, so all IDs should be unique (no dups).
572        assert_eq!(par_idx.find_duplicates().len(), 0);
573        assert_eq!(seq_idx.find_duplicates().len(), 0);
574
575        // Clean up.
576        for p in &paths {
577            let _ = std::fs::remove_file(p);
578        }
579    }
580
581    #[test]
582    fn test_add_files_skips_missing_files() {
583        let real = make_temp_files(3);
584        let mut all_paths = real.clone();
585        all_paths.push(temp_dir().join("nonexistent_dedup_test_file.bin"));
586
587        let mut idx = DedupIndex::new();
588        let (_ids, skipped) = idx.add_files(&all_paths);
589        assert_eq!(
590            skipped.len(),
591            1,
592            "exactly one missing file should be skipped"
593        );
594        assert_eq!(idx.unique_count(), 3, "three real files should be indexed");
595
596        for p in &real {
597            let _ = std::fs::remove_file(p);
598        }
599    }
600}