oximedia_dedup/
exact_match.rs1use std::collections::HashMap;
7
8#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
10pub struct ContentId {
11 pub bytes: [u8; 16],
13}
14
15impl ContentId {
16 #[must_use]
18 pub fn to_hex(&self) -> String {
19 self.bytes
20 .iter()
21 .fold(String::with_capacity(32), |mut s, b| {
22 s.push_str(&format!("{b:02x}"));
23 s
24 })
25 }
26
27 #[must_use]
32 pub fn from_data(data: &[u8]) -> ContentId {
33 const FNV_PRIME: u64 = 0x0000_0100_0000_01b3;
34 const OFFSET_A: u64 = 0xcbf2_9ce4_8422_2325;
35 const OFFSET_B: u64 = 0xaf63_dc4c_8601_ec8c;
36
37 let mut h0 = OFFSET_A;
38 let mut h1 = OFFSET_B;
39
40 for (i, &byte) in data.iter().enumerate() {
41 if i % 2 == 0 {
42 h0 ^= u64::from(byte);
43 h0 = h0.wrapping_mul(FNV_PRIME);
44 } else {
45 h1 ^= u64::from(byte);
46 h1 = h1.wrapping_mul(FNV_PRIME);
47 }
48 }
49
50 let mut bytes = [0u8; 16];
51 bytes[..8].copy_from_slice(&h0.to_le_bytes());
52 bytes[8..].copy_from_slice(&h1.to_le_bytes());
53 ContentId { bytes }
54 }
55}
56
57#[derive(Debug, Default)]
59pub struct ExactMatchIndex {
60 pub entries: HashMap<[u8; 16], Vec<String>>,
62}
63
64impl ExactMatchIndex {
65 #[must_use]
67 pub fn new() -> Self {
68 Self::default()
69 }
70
71 pub fn insert(&mut self, id: ContentId, path: &str) {
73 self.entries
74 .entry(id.bytes)
75 .or_default()
76 .push(path.to_owned());
77 }
78
79 #[must_use]
83 pub fn find_duplicates(&self, id: ContentId) -> &[String] {
84 self.entries
85 .get(&id.bytes)
86 .map(Vec::as_slice)
87 .unwrap_or(&[])
88 }
89
90 #[must_use]
92 pub fn total_entries(&self) -> usize {
93 self.entries.values().map(Vec::len).sum()
94 }
95
96 #[must_use]
98 pub fn duplicate_sets(&self) -> Vec<Vec<String>> {
99 self.entries
100 .values()
101 .filter(|v| v.len() > 1)
102 .cloned()
103 .collect()
104 }
105}
106
107#[derive(Debug, Clone, Copy)]
109pub struct FileSizeFilter {
110 pub min_bytes: u64,
112 pub max_bytes: u64,
114}
115
116impl FileSizeFilter {
117 #[must_use]
119 pub fn new(min_bytes: u64, max_bytes: u64) -> Self {
120 Self {
121 min_bytes,
122 max_bytes,
123 }
124 }
125
126 #[must_use]
128 pub fn accepts(&self, size: u64) -> bool {
129 size >= self.min_bytes && size <= self.max_bytes
130 }
131}
132
133#[derive(Debug, Clone, Copy, Default)]
135pub struct ExactDedupReport {
136 pub scanned: u64,
138 pub duplicates: u64,
140 pub wasted_bytes: u64,
142}
143
144impl ExactDedupReport {
145 #[must_use]
147 pub fn new(scanned: u64, duplicates: u64, wasted_bytes: u64) -> Self {
148 Self {
149 scanned,
150 duplicates,
151 wasted_bytes,
152 }
153 }
154
155 #[must_use]
157 pub fn savings_pct(&self) -> f64 {
158 if self.scanned == 0 {
159 return 0.0;
160 }
161 (self.duplicates as f64 / self.scanned as f64) * 100.0
162 }
163}
164
165#[cfg(test)]
166mod tests {
167 use super::*;
168
169 #[test]
170 fn test_content_id_from_data_deterministic() {
171 let id1 = ContentId::from_data(b"hello world");
172 let id2 = ContentId::from_data(b"hello world");
173 assert_eq!(id1, id2);
174 }
175
176 #[test]
177 fn test_content_id_different_data() {
178 let id1 = ContentId::from_data(b"hello");
179 let id2 = ContentId::from_data(b"world");
180 assert_ne!(id1, id2);
181 }
182
183 #[test]
184 fn test_content_id_empty_data() {
185 let id = ContentId::from_data(b"");
186 assert_eq!(id.bytes.len(), 16);
188 }
189
190 #[test]
191 fn test_content_id_to_hex_length() {
192 let id = ContentId::from_data(b"test");
193 assert_eq!(id.to_hex().len(), 32);
194 }
195
196 #[test]
197 fn test_content_id_to_hex_valid_chars() {
198 let id = ContentId::from_data(b"abc");
199 assert!(id.to_hex().chars().all(|c| c.is_ascii_hexdigit()));
200 }
201
202 #[test]
203 fn test_exact_match_index_insert_and_find() {
204 let mut index = ExactMatchIndex::new();
205 let id = ContentId::from_data(b"duplicate content");
206 index.insert(id, "/media/file_a.mp4");
207 index.insert(id, "/media/file_b.mp4");
208 let dups = index.find_duplicates(id);
209 assert_eq!(dups.len(), 2);
210 assert!(dups.contains(&"/media/file_a.mp4".to_string()));
211 assert!(dups.contains(&"/media/file_b.mp4".to_string()));
212 }
213
214 #[test]
215 fn test_exact_match_index_find_missing() {
216 let index = ExactMatchIndex::new();
217 let id = ContentId::from_data(b"not present");
218 assert!(index.find_duplicates(id).is_empty());
219 }
220
221 #[test]
222 fn test_exact_match_index_total_entries() {
223 let mut index = ExactMatchIndex::new();
224 let id_a = ContentId::from_data(b"aaa");
225 let id_b = ContentId::from_data(b"bbb");
226 index.insert(id_a, "/a/1.mp4");
227 index.insert(id_a, "/a/2.mp4");
228 index.insert(id_b, "/b/1.mp4");
229 assert_eq!(index.total_entries(), 3);
230 }
231
232 #[test]
233 fn test_exact_match_duplicate_sets() {
234 let mut index = ExactMatchIndex::new();
235 let id_a = ContentId::from_data(b"same_data");
236 let id_b = ContentId::from_data(b"unique_data");
237 index.insert(id_a, "/dup1.mp4");
238 index.insert(id_a, "/dup2.mp4");
239 index.insert(id_b, "/unique.mp4");
240 let sets = index.duplicate_sets();
241 assert_eq!(sets.len(), 1);
242 assert_eq!(sets[0].len(), 2);
243 }
244
245 #[test]
246 fn test_file_size_filter_accepts_within_range() {
247 let filter = FileSizeFilter::new(1024, 1024 * 1024);
248 assert!(filter.accepts(512 * 1024));
249 }
250
251 #[test]
252 fn test_file_size_filter_rejects_below_min() {
253 let filter = FileSizeFilter::new(1024, 1024 * 1024);
254 assert!(!filter.accepts(100));
255 }
256
257 #[test]
258 fn test_file_size_filter_rejects_above_max() {
259 let filter = FileSizeFilter::new(1024, 1024 * 1024);
260 assert!(!filter.accepts(2 * 1024 * 1024));
261 }
262
263 #[test]
264 fn test_file_size_filter_boundary_values() {
265 let filter = FileSizeFilter::new(100, 200);
266 assert!(filter.accepts(100));
267 assert!(filter.accepts(200));
268 assert!(!filter.accepts(99));
269 assert!(!filter.accepts(201));
270 }
271
272 #[test]
273 fn test_exact_dedup_report_savings_pct_zero_scanned() {
274 let report = ExactDedupReport::new(0, 0, 0);
275 assert_eq!(report.savings_pct(), 0.0);
276 }
277
278 #[test]
279 fn test_exact_dedup_report_savings_pct() {
280 let report = ExactDedupReport::new(100, 25, 1_000_000);
281 assert!((report.savings_pct() - 25.0).abs() < 1e-9);
282 }
283
284 #[test]
285 fn test_exact_dedup_report_full_duplicates() {
286 let report = ExactDedupReport::new(50, 50, 5_000_000);
287 assert!((report.savings_pct() - 100.0).abs() < 1e-9);
288 }
289}