1#![allow(dead_code)]
8#![allow(clippy::cast_precision_loss)]
9
10use std::path::{Path, PathBuf};
11
12use rayon::prelude::*;
13
14#[derive(Debug, Clone, PartialEq, Eq)]
20pub struct DedupEntry {
21 pub id: u64,
23 pub content_hash: Vec<u8>,
25 pub size_bytes: u64,
27 pub first_seen_epoch: u64,
29 pub occurrence_count: u32,
31}
32
33impl DedupEntry {
34 #[must_use]
36 pub fn is_duplicate(&self) -> bool {
37 self.occurrence_count > 1
38 }
39
40 #[must_use]
45 pub fn space_savings(&self) -> u64 {
46 if self.occurrence_count <= 1 {
47 return 0;
48 }
49 (self.occurrence_count as u64 - 1).saturating_mul(self.size_bytes)
50 }
51
52 #[must_use]
55 pub fn first_seen(&self) -> u64 {
56 self.first_seen_epoch
57 }
58
59 #[must_use]
61 pub fn hash_hex(&self) -> String {
62 self.content_hash
63 .iter()
64 .map(|b| format!("{b:02x}"))
65 .collect()
66 }
67}
68
69pub struct DedupIndex {
78 pub entries: Vec<DedupEntry>,
80 pub next_id: u64,
82}
83
84impl DedupIndex {
85 #[must_use]
87 pub fn new() -> Self {
88 Self {
89 entries: Vec::new(),
90 next_id: 1,
91 }
92 }
93
94 pub fn add_or_increment(&mut self, hash: Vec<u8>, size_bytes: u64, epoch: u64) -> u64 {
98 if let Some(entry) = self.entries.iter_mut().find(|e| e.content_hash == hash) {
99 entry.occurrence_count += 1;
100 return entry.id;
101 }
102
103 let id = self.next_id;
104 self.next_id += 1;
105 self.entries.push(DedupEntry {
106 id,
107 content_hash: hash,
108 size_bytes,
109 first_seen_epoch: epoch,
110 occurrence_count: 1,
111 });
112 id
113 }
114
115 #[must_use]
117 pub fn find_by_hash(&self, hash: &[u8]) -> Option<&DedupEntry> {
118 self.entries.iter().find(|e| e.content_hash == hash)
119 }
120
121 #[must_use]
123 pub fn find_by_id(&self, id: u64) -> Option<&DedupEntry> {
124 self.entries.iter().find(|e| e.id == id)
125 }
126
127 #[must_use]
129 pub fn find_duplicates(&self) -> Vec<&DedupEntry> {
130 self.entries.iter().filter(|e| e.is_duplicate()).collect()
131 }
132
133 #[must_use]
135 pub fn total_space_savings(&self) -> u64 {
136 self.entries.iter().map(|e| e.space_savings()).sum()
137 }
138
139 #[must_use]
141 pub fn unique_count(&self) -> usize {
142 self.entries.len()
143 }
144
145 #[must_use]
147 pub fn total_insertions(&self) -> u64 {
148 self.entries.iter().map(|e| e.occurrence_count as u64).sum()
149 }
150
151 pub fn remove_by_hash(&mut self, hash: &[u8]) -> bool {
153 if let Some(pos) = self.entries.iter().position(|e| e.content_hash == hash) {
154 self.entries.swap_remove(pos);
155 true
156 } else {
157 false
158 }
159 }
160
161 pub fn clear(&mut self) {
163 self.entries.clear();
164 self.next_id = 1;
165 }
166
167 #[must_use]
169 pub fn most_common(&self) -> Vec<&DedupEntry> {
170 let mut sorted: Vec<&DedupEntry> = self.entries.iter().collect();
171 sorted.sort_by(|a, b| b.occurrence_count.cmp(&a.occurrence_count));
172 sorted
173 }
174
175 #[must_use]
179 pub fn duplicate_rate(&self) -> f64 {
180 let total = self.total_insertions();
181 if total == 0 {
182 return 0.0;
183 }
184 let unique = self.unique_count() as u64;
185 let dupes = total.saturating_sub(unique);
186 dupes as f64 / total as f64
187 }
188}
189
190impl Default for DedupIndex {
191 fn default() -> Self {
192 Self::new()
193 }
194}
195
196#[derive(Debug, Clone)]
202pub struct FileFeatures {
203 pub path: PathBuf,
205 pub content_hash: Vec<u8>,
207 pub size_bytes: u64,
209 pub mtime_epoch: u64,
211}
212
213pub fn compute_file_features(path: &Path) -> std::io::Result<FileFeatures> {
219 use std::io::Read;
220 use std::time::UNIX_EPOCH;
221
222 let meta = std::fs::metadata(path)?;
223 let size_bytes = meta.len();
224 let mtime_epoch = meta
225 .modified()
226 .unwrap_or(UNIX_EPOCH)
227 .duration_since(UNIX_EPOCH)
228 .unwrap_or_default()
229 .as_secs();
230
231 const FNV_OFFSET: u64 = 0xcbf2_9ce4_8422_2325;
233 const FNV_PRIME: u64 = 0x0000_0100_0000_01b3;
234 let mut hash: u64 = FNV_OFFSET;
235
236 let file = std::fs::File::open(path)?;
237 let mut reader = std::io::BufReader::with_capacity(65_536, file);
238 let mut buf = vec![0u8; 65_536];
239 loop {
240 let n = reader.read(&mut buf)?;
241 if n == 0 {
242 break;
243 }
244 for &b in &buf[..n] {
245 hash = (hash ^ u64::from(b)).wrapping_mul(FNV_PRIME);
246 }
247 }
248
249 Ok(FileFeatures {
250 path: path.to_path_buf(),
251 content_hash: hash.to_le_bytes().to_vec(),
252 size_bytes,
253 mtime_epoch,
254 })
255}
256
257impl DedupIndex {
258 pub fn add_files(&mut self, paths: &[impl AsRef<Path> + Sync]) -> (Vec<u64>, Vec<PathBuf>) {
268 let results: Vec<(PathBuf, std::io::Result<FileFeatures>)> = paths
270 .par_iter()
271 .map(|p| {
272 let path = p.as_ref().to_path_buf();
273 let feat = compute_file_features(&path);
274 (path, feat)
275 })
276 .collect();
277
278 let mut added_ids = Vec::with_capacity(results.len());
280 let mut skipped = Vec::new();
281
282 for (path, result) in results {
283 match result {
284 Ok(feat) => {
285 let id =
286 self.add_or_increment(feat.content_hash, feat.size_bytes, feat.mtime_epoch);
287 added_ids.push(id);
288 }
289 Err(_) => {
290 skipped.push(path);
291 }
292 }
293 }
294
295 (added_ids, skipped)
296 }
297}
298
299#[cfg(test)]
304mod tests {
305 use super::*;
306 use std::env::temp_dir;
307
308 fn hash(s: &str) -> Vec<u8> {
309 s.as_bytes().to_vec()
310 }
311
312 fn make_temp_files(n: usize) -> Vec<std::path::PathBuf> {
314 use std::sync::atomic::{AtomicU64, Ordering};
315 static COUNTER: AtomicU64 = AtomicU64::new(0);
316
317 let base = temp_dir();
318 let pid = std::process::id();
319 (0..n)
320 .map(|i| {
321 let uid = COUNTER.fetch_add(1, Ordering::Relaxed);
322 let path = base.join(format!("dedup_idx_{pid}_{uid}_{i}.bin"));
323 std::fs::write(&path, format!("synthetic content for file {pid}_{uid}_{i}"))
324 .expect("write temp file");
325 path
326 })
327 .collect()
328 }
329
330 #[test]
333 fn test_entry_is_duplicate_false_when_once() {
334 let e = DedupEntry {
335 id: 1,
336 content_hash: hash("abc"),
337 size_bytes: 1024,
338 first_seen_epoch: 0,
339 occurrence_count: 1,
340 };
341 assert!(!e.is_duplicate());
342 }
343
344 #[test]
345 fn test_entry_is_duplicate_true_when_multiple() {
346 let e = DedupEntry {
347 id: 1,
348 content_hash: hash("abc"),
349 size_bytes: 1024,
350 first_seen_epoch: 0,
351 occurrence_count: 3,
352 };
353 assert!(e.is_duplicate());
354 }
355
356 #[test]
357 fn test_entry_space_savings_zero_when_once() {
358 let e = DedupEntry {
359 id: 1,
360 content_hash: hash("abc"),
361 size_bytes: 500,
362 first_seen_epoch: 0,
363 occurrence_count: 1,
364 };
365 assert_eq!(e.space_savings(), 0);
366 }
367
368 #[test]
369 fn test_entry_space_savings_correct() {
370 let e = DedupEntry {
371 id: 1,
372 content_hash: hash("abc"),
373 size_bytes: 1000,
374 first_seen_epoch: 0,
375 occurrence_count: 4,
376 };
377 assert_eq!(e.space_savings(), 3000);
379 }
380
381 #[test]
382 fn test_entry_hash_hex() {
383 let e = DedupEntry {
384 id: 1,
385 content_hash: vec![0xDE, 0xAD, 0xBE, 0xEF],
386 size_bytes: 0,
387 first_seen_epoch: 0,
388 occurrence_count: 1,
389 };
390 assert_eq!(e.hash_hex(), "deadbeef");
391 }
392
393 #[test]
396 fn test_index_new_empty() {
397 let idx = DedupIndex::new();
398 assert_eq!(idx.unique_count(), 0);
399 assert_eq!(idx.total_insertions(), 0);
400 assert_eq!(idx.total_space_savings(), 0);
401 }
402
403 #[test]
404 fn test_add_or_increment_new_entry() {
405 let mut idx = DedupIndex::new();
406 let id = idx.add_or_increment(hash("file_a"), 100, 1000);
407 assert_eq!(idx.unique_count(), 1);
408 assert_eq!(id, 1);
409 }
410
411 #[test]
412 fn test_add_or_increment_existing_entry() {
413 let mut idx = DedupIndex::new();
414 let id1 = idx.add_or_increment(hash("file_a"), 100, 1000);
415 let id2 = idx.add_or_increment(hash("file_a"), 100, 2000);
416 assert_eq!(id1, id2, "Same hash should return same ID");
417 assert_eq!(idx.unique_count(), 1, "Should still be one unique entry");
418 let entry = idx
419 .find_by_hash(&hash("file_a"))
420 .expect("operation should succeed");
421 assert_eq!(entry.occurrence_count, 2);
422 }
423
424 #[test]
425 fn test_find_by_hash_found() {
426 let mut idx = DedupIndex::new();
427 idx.add_or_increment(hash("alpha"), 200, 0);
428 let entry = idx.find_by_hash(&hash("alpha"));
429 assert!(entry.is_some());
430 assert_eq!(entry.expect("operation should succeed").size_bytes, 200);
431 }
432
433 #[test]
434 fn test_find_by_hash_not_found() {
435 let idx = DedupIndex::new();
436 assert!(idx.find_by_hash(&hash("missing")).is_none());
437 }
438
439 #[test]
440 fn test_find_by_id() {
441 let mut idx = DedupIndex::new();
442 let id = idx.add_or_increment(hash("entry_1"), 512, 100);
443 let entry = idx.find_by_id(id);
444 assert!(entry.is_some());
445 assert_eq!(
446 entry.expect("operation should succeed").content_hash,
447 hash("entry_1")
448 );
449 }
450
451 #[test]
452 fn test_find_duplicates() {
453 let mut idx = DedupIndex::new();
454 idx.add_or_increment(hash("unique"), 10, 0);
455 idx.add_or_increment(hash("dup"), 20, 0);
456 idx.add_or_increment(hash("dup"), 20, 1);
457 idx.add_or_increment(hash("dup"), 20, 2);
458
459 let dups = idx.find_duplicates();
460 assert_eq!(dups.len(), 1);
461 assert_eq!(dups[0].content_hash, hash("dup"));
462 assert_eq!(dups[0].occurrence_count, 3);
463 }
464
465 #[test]
466 fn test_total_space_savings() {
467 let mut idx = DedupIndex::new();
468 idx.add_or_increment(hash("a"), 100, 0);
470 idx.add_or_increment(hash("b"), 200, 0);
472 idx.add_or_increment(hash("b"), 200, 1);
473 idx.add_or_increment(hash("b"), 200, 2);
474
475 assert_eq!(idx.total_space_savings(), 400);
476 }
477
478 #[test]
479 fn test_remove_by_hash() {
480 let mut idx = DedupIndex::new();
481 idx.add_or_increment(hash("to_remove"), 50, 0);
482 assert_eq!(idx.unique_count(), 1);
483 let removed = idx.remove_by_hash(&hash("to_remove"));
484 assert!(removed);
485 assert_eq!(idx.unique_count(), 0);
486 }
487
488 #[test]
489 fn test_remove_nonexistent_returns_false() {
490 let mut idx = DedupIndex::new();
491 assert!(!idx.remove_by_hash(&hash("ghost")));
492 }
493
494 #[test]
495 fn test_clear() {
496 let mut idx = DedupIndex::new();
497 idx.add_or_increment(hash("x"), 10, 0);
498 idx.add_or_increment(hash("y"), 20, 0);
499 idx.clear();
500 assert_eq!(idx.unique_count(), 0);
501 assert_eq!(idx.next_id, 1);
502 }
503
504 #[test]
505 fn test_duplicate_rate() {
506 let mut idx = DedupIndex::new();
507 idx.add_or_increment(hash("h"), 100, 0);
509 idx.add_or_increment(hash("h"), 100, 1);
510 idx.add_or_increment(hash("h"), 100, 2);
511
512 let rate = idx.duplicate_rate();
513 let expected = 2.0 / 3.0;
514 assert!((rate - expected).abs() < 1e-9);
515 }
516
517 #[test]
518 fn test_most_common_ordering() {
519 let mut idx = DedupIndex::new();
520 idx.add_or_increment(hash("rare"), 10, 0);
521 idx.add_or_increment(hash("common"), 100, 0);
522 idx.add_or_increment(hash("common"), 100, 1);
523 idx.add_or_increment(hash("common"), 100, 2);
524 idx.add_or_increment(hash("mid"), 50, 0);
525 idx.add_or_increment(hash("mid"), 50, 1);
526
527 let mc = idx.most_common();
528 assert_eq!(mc[0].content_hash, hash("common"));
529 assert_eq!(mc[1].content_hash, hash("mid"));
530 assert_eq!(mc[2].content_hash, hash("rare"));
531 }
532
533 #[test]
534 fn test_total_insertions() {
535 let mut idx = DedupIndex::new();
536 idx.add_or_increment(hash("a"), 1, 0);
537 idx.add_or_increment(hash("a"), 1, 1);
538 idx.add_or_increment(hash("b"), 1, 0);
539 assert_eq!(idx.total_insertions(), 3);
541 }
542
543 #[test]
546 fn test_add_files_parallel_same_result_as_sequential() {
547 let paths = make_temp_files(10);
548
549 let mut par_idx = DedupIndex::new();
551 let (par_ids, par_skipped) = par_idx.add_files(&paths);
552 assert!(par_skipped.is_empty(), "no files should be skipped");
553 assert_eq!(par_ids.len(), 10, "all 10 files should produce an ID");
554
555 let mut seq_idx = DedupIndex::new();
557 let mut seq_ids = Vec::with_capacity(paths.len());
558 for p in &paths {
559 let feat = compute_file_features(p.as_ref()).expect("compute features");
560 let id = seq_idx.add_or_increment(feat.content_hash, feat.size_bytes, feat.mtime_epoch);
561 seq_ids.push(id);
562 }
563
564 assert_eq!(
566 par_idx.unique_count(),
567 seq_idx.unique_count(),
568 "unique count must match between parallel and sequential"
569 );
570
571 assert_eq!(par_idx.find_duplicates().len(), 0);
573 assert_eq!(seq_idx.find_duplicates().len(), 0);
574
575 for p in &paths {
577 let _ = std::fs::remove_file(p);
578 }
579 }
580
581 #[test]
582 fn test_add_files_skips_missing_files() {
583 let real = make_temp_files(3);
584 let mut all_paths = real.clone();
585 all_paths.push(temp_dir().join("nonexistent_dedup_test_file.bin"));
586
587 let mut idx = DedupIndex::new();
588 let (_ids, skipped) = idx.add_files(&all_paths);
589 assert_eq!(
590 skipped.len(),
591 1,
592 "exactly one missing file should be skipped"
593 );
594 assert_eq!(idx.unique_count(), 3, "three real files should be indexed");
595
596 for p in &real {
597 let _ = std::fs::remove_file(p);
598 }
599 }
600}