Skip to main content

oximedia_dedup/
exact_match.rs

1//! Exact file and content matching for deduplication.
2//!
3//! Uses FNV-based hashing to build a 128-bit content identity for each piece
4//! of data, then groups identical identities together to surface duplicates.
5
6use std::collections::HashMap;
7
8/// A 128-bit content identifier derived from raw bytes.
9#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
10pub struct ContentId {
11    /// Raw 16-byte identifier
12    pub bytes: [u8; 16],
13}
14
15impl ContentId {
16    /// Return a lowercase hex string representation of this identifier.
17    #[must_use]
18    pub fn to_hex(&self) -> String {
19        self.bytes
20            .iter()
21            .fold(String::with_capacity(32), |mut s, b| {
22                s.push_str(&format!("{b:02x}"));
23                s
24            })
25    }
26
27    /// Derive a `ContentId` from raw data using FNV-1a over two 64-bit lanes.
28    ///
29    /// The two lanes use different starting offsets so that the pair forms a
30    /// 128-bit value with better collision resistance than a single 64-bit hash.
31    #[must_use]
32    pub fn from_data(data: &[u8]) -> ContentId {
33        const FNV_PRIME: u64 = 0x0000_0100_0000_01b3;
34        const OFFSET_A: u64 = 0xcbf2_9ce4_8422_2325;
35        const OFFSET_B: u64 = 0xaf63_dc4c_8601_ec8c;
36
37        let mut h0 = OFFSET_A;
38        let mut h1 = OFFSET_B;
39
40        for (i, &byte) in data.iter().enumerate() {
41            if i % 2 == 0 {
42                h0 ^= u64::from(byte);
43                h0 = h0.wrapping_mul(FNV_PRIME);
44            } else {
45                h1 ^= u64::from(byte);
46                h1 = h1.wrapping_mul(FNV_PRIME);
47            }
48        }
49
50        let mut bytes = [0u8; 16];
51        bytes[..8].copy_from_slice(&h0.to_le_bytes());
52        bytes[8..].copy_from_slice(&h1.to_le_bytes());
53        ContentId { bytes }
54    }
55}
56
57/// An index mapping content identifiers to lists of file paths.
58#[derive(Debug, Default)]
59pub struct ExactMatchIndex {
60    /// Internal storage: hash → list of paths
61    pub entries: HashMap<[u8; 16], Vec<String>>,
62}
63
64impl ExactMatchIndex {
65    /// Create a new, empty index.
66    #[must_use]
67    pub fn new() -> Self {
68        Self::default()
69    }
70
71    /// Insert a file path under the given content identifier.
72    pub fn insert(&mut self, id: ContentId, path: &str) {
73        self.entries
74            .entry(id.bytes)
75            .or_default()
76            .push(path.to_owned());
77    }
78
79    /// Return all file paths that share the same content identifier as `id`.
80    ///
81    /// Returns an empty slice when there is no entry for `id`.
82    #[must_use]
83    pub fn find_duplicates(&self, id: ContentId) -> &[String] {
84        self.entries
85            .get(&id.bytes)
86            .map(Vec::as_slice)
87            .unwrap_or(&[])
88    }
89
90    /// Total number of individual path entries across all identifiers.
91    #[must_use]
92    pub fn total_entries(&self) -> usize {
93        self.entries.values().map(Vec::len).sum()
94    }
95
96    /// Return all groups of paths that have more than one entry (actual duplicates).
97    #[must_use]
98    pub fn duplicate_sets(&self) -> Vec<Vec<String>> {
99        self.entries
100            .values()
101            .filter(|v| v.len() > 1)
102            .cloned()
103            .collect()
104    }
105}
106
107/// A size-based pre-filter to skip files outside a plausible range.
108#[derive(Debug, Clone, Copy)]
109pub struct FileSizeFilter {
110    /// Minimum acceptable file size in bytes (inclusive)
111    pub min_bytes: u64,
112    /// Maximum acceptable file size in bytes (inclusive)
113    pub max_bytes: u64,
114}
115
116impl FileSizeFilter {
117    /// Create a new filter.
118    #[must_use]
119    pub fn new(min_bytes: u64, max_bytes: u64) -> Self {
120        Self {
121            min_bytes,
122            max_bytes,
123        }
124    }
125
126    /// Return `true` if `size` falls within the accepted range.
127    #[must_use]
128    pub fn accepts(&self, size: u64) -> bool {
129        size >= self.min_bytes && size <= self.max_bytes
130    }
131}
132
133/// Summary report produced after an exact-dedup scan.
134#[derive(Debug, Clone, Copy, Default)]
135pub struct ExactDedupReport {
136    /// Number of files examined
137    pub scanned: u64,
138    /// Number of duplicate files found
139    pub duplicates: u64,
140    /// Total bytes consumed by duplicate copies
141    pub wasted_bytes: u64,
142}
143
144impl ExactDedupReport {
145    /// Create a new report.
146    #[must_use]
147    pub fn new(scanned: u64, duplicates: u64, wasted_bytes: u64) -> Self {
148        Self {
149            scanned,
150            duplicates,
151            wasted_bytes,
152        }
153    }
154
155    /// Return the percentage of files that are duplicates (0.0–100.0).
156    #[must_use]
157    pub fn savings_pct(&self) -> f64 {
158        if self.scanned == 0 {
159            return 0.0;
160        }
161        (self.duplicates as f64 / self.scanned as f64) * 100.0
162    }
163}
164
165#[cfg(test)]
166mod tests {
167    use super::*;
168
169    #[test]
170    fn test_content_id_from_data_deterministic() {
171        let id1 = ContentId::from_data(b"hello world");
172        let id2 = ContentId::from_data(b"hello world");
173        assert_eq!(id1, id2);
174    }
175
176    #[test]
177    fn test_content_id_different_data() {
178        let id1 = ContentId::from_data(b"hello");
179        let id2 = ContentId::from_data(b"world");
180        assert_ne!(id1, id2);
181    }
182
183    #[test]
184    fn test_content_id_empty_data() {
185        let id = ContentId::from_data(b"");
186        // Should not panic and produce a 16-byte value
187        assert_eq!(id.bytes.len(), 16);
188    }
189
190    #[test]
191    fn test_content_id_to_hex_length() {
192        let id = ContentId::from_data(b"test");
193        assert_eq!(id.to_hex().len(), 32);
194    }
195
196    #[test]
197    fn test_content_id_to_hex_valid_chars() {
198        let id = ContentId::from_data(b"abc");
199        assert!(id.to_hex().chars().all(|c| c.is_ascii_hexdigit()));
200    }
201
202    #[test]
203    fn test_exact_match_index_insert_and_find() {
204        let mut index = ExactMatchIndex::new();
205        let id = ContentId::from_data(b"duplicate content");
206        index.insert(id, "/media/file_a.mp4");
207        index.insert(id, "/media/file_b.mp4");
208        let dups = index.find_duplicates(id);
209        assert_eq!(dups.len(), 2);
210        assert!(dups.contains(&"/media/file_a.mp4".to_string()));
211        assert!(dups.contains(&"/media/file_b.mp4".to_string()));
212    }
213
214    #[test]
215    fn test_exact_match_index_find_missing() {
216        let index = ExactMatchIndex::new();
217        let id = ContentId::from_data(b"not present");
218        assert!(index.find_duplicates(id).is_empty());
219    }
220
221    #[test]
222    fn test_exact_match_index_total_entries() {
223        let mut index = ExactMatchIndex::new();
224        let id_a = ContentId::from_data(b"aaa");
225        let id_b = ContentId::from_data(b"bbb");
226        index.insert(id_a, "/a/1.mp4");
227        index.insert(id_a, "/a/2.mp4");
228        index.insert(id_b, "/b/1.mp4");
229        assert_eq!(index.total_entries(), 3);
230    }
231
232    #[test]
233    fn test_exact_match_duplicate_sets() {
234        let mut index = ExactMatchIndex::new();
235        let id_a = ContentId::from_data(b"same_data");
236        let id_b = ContentId::from_data(b"unique_data");
237        index.insert(id_a, "/dup1.mp4");
238        index.insert(id_a, "/dup2.mp4");
239        index.insert(id_b, "/unique.mp4");
240        let sets = index.duplicate_sets();
241        assert_eq!(sets.len(), 1);
242        assert_eq!(sets[0].len(), 2);
243    }
244
245    #[test]
246    fn test_file_size_filter_accepts_within_range() {
247        let filter = FileSizeFilter::new(1024, 1024 * 1024);
248        assert!(filter.accepts(512 * 1024));
249    }
250
251    #[test]
252    fn test_file_size_filter_rejects_below_min() {
253        let filter = FileSizeFilter::new(1024, 1024 * 1024);
254        assert!(!filter.accepts(100));
255    }
256
257    #[test]
258    fn test_file_size_filter_rejects_above_max() {
259        let filter = FileSizeFilter::new(1024, 1024 * 1024);
260        assert!(!filter.accepts(2 * 1024 * 1024));
261    }
262
263    #[test]
264    fn test_file_size_filter_boundary_values() {
265        let filter = FileSizeFilter::new(100, 200);
266        assert!(filter.accepts(100));
267        assert!(filter.accepts(200));
268        assert!(!filter.accepts(99));
269        assert!(!filter.accepts(201));
270    }
271
272    #[test]
273    fn test_exact_dedup_report_savings_pct_zero_scanned() {
274        let report = ExactDedupReport::new(0, 0, 0);
275        assert_eq!(report.savings_pct(), 0.0);
276    }
277
278    #[test]
279    fn test_exact_dedup_report_savings_pct() {
280        let report = ExactDedupReport::new(100, 25, 1_000_000);
281        assert!((report.savings_pct() - 25.0).abs() < 1e-9);
282    }
283
284    #[test]
285    fn test_exact_dedup_report_full_duplicates() {
286        let report = ExactDedupReport::new(50, 50, 5_000_000);
287        assert!((report.savings_pct() - 100.0).abs() < 1e-9);
288    }
289}