Skip to main content

oximedia_dedup/
hash_store.rs

1//! Persistent hash store for deduplication lookups.
2//!
3//! Provides an in-memory hash store backed by sorted vectors for fast
4//! lookup, insertion, and range queries. Designed to hold millions of
5//! hash entries with minimal overhead.
6
7#![allow(dead_code)]
8#![allow(clippy::cast_precision_loss)]
9
10use std::collections::HashMap;
11use std::path::PathBuf;
12
13// ---------------------------------------------------------------------------
14// HashEntry
15// ---------------------------------------------------------------------------
16
17/// A single hash entry in the store.
18#[derive(Debug, Clone, PartialEq, Eq, Hash)]
19pub struct HashEntry {
20    /// The 256-bit hash digest as a hex string.
21    pub digest: String,
22    /// The file path that produced this hash.
23    pub file_path: PathBuf,
24    /// File size in bytes.
25    pub file_size: u64,
26    /// Unix timestamp when the entry was added.
27    pub added_at: u64,
28}
29
30impl HashEntry {
31    /// Create a new hash entry.
32    pub fn new(digest: &str, file_path: PathBuf, file_size: u64, added_at: u64) -> Self {
33        Self {
34            digest: digest.to_string(),
35            file_path,
36            file_size,
37            added_at,
38        }
39    }
40}
41
42// ---------------------------------------------------------------------------
43// HashStore
44// ---------------------------------------------------------------------------
45
46/// An in-memory hash store that maps digests to file entries.
47///
48/// Supports fast lookup by digest and retrieval of duplicate groups.
49pub struct HashStore {
50    /// Map from digest string to list of entries with that digest.
51    entries: HashMap<String, Vec<HashEntry>>,
52    /// Total number of entries.
53    count: usize,
54}
55
56impl HashStore {
57    /// Create an empty hash store.
58    #[must_use]
59    pub fn new() -> Self {
60        Self {
61            entries: HashMap::new(),
62            count: 0,
63        }
64    }
65
66    /// Create a hash store with pre-allocated capacity.
67    #[must_use]
68    pub fn with_capacity(cap: usize) -> Self {
69        Self {
70            entries: HashMap::with_capacity(cap),
71            count: 0,
72        }
73    }
74
75    /// Insert an entry into the store.
76    pub fn insert(&mut self, entry: HashEntry) {
77        self.entries
78            .entry(entry.digest.clone())
79            .or_default()
80            .push(entry);
81        self.count += 1;
82    }
83
84    /// Check if a digest exists in the store.
85    #[must_use]
86    pub fn contains(&self, digest: &str) -> bool {
87        self.entries.contains_key(digest)
88    }
89
90    /// Look up entries by digest.
91    #[must_use]
92    pub fn get(&self, digest: &str) -> Option<&[HashEntry]> {
93        self.entries.get(digest).map(Vec::as_slice)
94    }
95
96    /// Return the total number of entries.
97    #[must_use]
98    pub fn len(&self) -> usize {
99        self.count
100    }
101
102    /// Return `true` if the store is empty.
103    #[must_use]
104    pub fn is_empty(&self) -> bool {
105        self.count == 0
106    }
107
108    /// Return the number of unique digests.
109    #[must_use]
110    pub fn unique_count(&self) -> usize {
111        self.entries.len()
112    }
113
114    /// Return all digest groups that have more than one entry (duplicates).
115    #[must_use]
116    pub fn duplicate_groups(&self) -> Vec<(&str, &[HashEntry])> {
117        self.entries
118            .iter()
119            .filter(|(_, v)| v.len() > 1)
120            .map(|(k, v)| (k.as_str(), v.as_slice()))
121            .collect()
122    }
123
124    /// Return the number of duplicate entries (entries beyond the first in each group).
125    #[must_use]
126    pub fn duplicate_count(&self) -> usize {
127        self.entries
128            .values()
129            .filter(|v| v.len() > 1)
130            .map(|v| v.len() - 1)
131            .sum()
132    }
133
134    /// Return the total file size of all duplicate entries.
135    #[must_use]
136    pub fn duplicate_bytes(&self) -> u64 {
137        self.entries
138            .values()
139            .filter(|v| v.len() > 1)
140            .flat_map(|v| v.iter().skip(1))
141            .map(|e| e.file_size)
142            .sum()
143    }
144
145    /// Remove all entries for a given digest. Returns the removed entries.
146    pub fn remove(&mut self, digest: &str) -> Vec<HashEntry> {
147        if let Some(removed) = self.entries.remove(digest) {
148            self.count -= removed.len();
149            removed
150        } else {
151            Vec::new()
152        }
153    }
154
155    /// Clear the entire store.
156    pub fn clear(&mut self) {
157        self.entries.clear();
158        self.count = 0;
159    }
160
161    /// Return an iterator over all unique digests.
162    pub fn digests(&self) -> impl Iterator<Item = &str> {
163        self.entries.keys().map(String::as_str)
164    }
165
166    /// Return deduplication ratio as a fraction.
167    ///
168    /// `1.0` means no duplicates, lower means more duplication.
169    #[must_use]
170    pub fn dedup_ratio(&self) -> f64 {
171        if self.count == 0 {
172            return 1.0;
173        }
174        self.unique_count() as f64 / self.count as f64
175    }
176}
177
178impl Default for HashStore {
179    fn default() -> Self {
180        Self::new()
181    }
182}
183
184// ---------------------------------------------------------------------------
185// Tests
186// ---------------------------------------------------------------------------
187
188#[cfg(test)]
189mod tests {
190    use super::*;
191
192    fn make_entry(digest: &str, path: &str, size: u64) -> HashEntry {
193        HashEntry::new(digest, PathBuf::from(path), size, 1000)
194    }
195
196    #[test]
197    fn test_new_store_empty() {
198        let store = HashStore::new();
199        assert!(store.is_empty());
200        assert_eq!(store.len(), 0);
201        assert_eq!(store.unique_count(), 0);
202    }
203
204    #[test]
205    fn test_insert_and_lookup() {
206        let mut store = HashStore::new();
207        store.insert(make_entry("aaa", "/a.mp4", 100));
208        assert!(store.contains("aaa"));
209        assert!(!store.contains("bbb"));
210        assert_eq!(store.len(), 1);
211    }
212
213    #[test]
214    fn test_get_entries() {
215        let mut store = HashStore::new();
216        store.insert(make_entry("hash1", "/file1.mp4", 200));
217        let entries = store.get("hash1").expect("operation should succeed");
218        assert_eq!(entries.len(), 1);
219        assert_eq!(entries[0].file_size, 200);
220    }
221
222    #[test]
223    fn test_duplicate_groups() {
224        let mut store = HashStore::new();
225        store.insert(make_entry("dup", "/a.mp4", 100));
226        store.insert(make_entry("dup", "/b.mp4", 100));
227        store.insert(make_entry("unique", "/c.mp4", 50));
228
229        let groups = store.duplicate_groups();
230        assert_eq!(groups.len(), 1);
231        assert_eq!(groups[0].0, "dup");
232        assert_eq!(groups[0].1.len(), 2);
233    }
234
235    #[test]
236    fn test_duplicate_count() {
237        let mut store = HashStore::new();
238        store.insert(make_entry("d", "/a.mp4", 10));
239        store.insert(make_entry("d", "/b.mp4", 10));
240        store.insert(make_entry("d", "/c.mp4", 10));
241        store.insert(make_entry("u", "/d.mp4", 10));
242
243        assert_eq!(store.duplicate_count(), 2); // 3-1 = 2 duplicates
244    }
245
246    #[test]
247    fn test_duplicate_bytes() {
248        let mut store = HashStore::new();
249        store.insert(make_entry("h", "/a.mp4", 1000));
250        store.insert(make_entry("h", "/b.mp4", 1000));
251        store.insert(make_entry("h", "/c.mp4", 1500));
252
253        // skip first, sum rest = 1000 + 1500 = 2500
254        assert_eq!(store.duplicate_bytes(), 2500);
255    }
256
257    #[test]
258    fn test_remove() {
259        let mut store = HashStore::new();
260        store.insert(make_entry("rm", "/a.mp4", 10));
261        store.insert(make_entry("rm", "/b.mp4", 20));
262        store.insert(make_entry("keep", "/c.mp4", 30));
263
264        let removed = store.remove("rm");
265        assert_eq!(removed.len(), 2);
266        assert!(!store.contains("rm"));
267        assert_eq!(store.len(), 1);
268    }
269
270    #[test]
271    fn test_remove_nonexistent() {
272        let mut store = HashStore::new();
273        let removed = store.remove("nope");
274        assert!(removed.is_empty());
275    }
276
277    #[test]
278    fn test_clear() {
279        let mut store = HashStore::new();
280        store.insert(make_entry("a", "/x.mp4", 1));
281        store.insert(make_entry("b", "/y.mp4", 2));
282        store.clear();
283        assert!(store.is_empty());
284        assert_eq!(store.unique_count(), 0);
285    }
286
287    #[test]
288    fn test_dedup_ratio_no_dupes() {
289        let mut store = HashStore::new();
290        store.insert(make_entry("a", "/1.mp4", 10));
291        store.insert(make_entry("b", "/2.mp4", 20));
292        assert!((store.dedup_ratio() - 1.0).abs() < 1e-9);
293    }
294
295    #[test]
296    fn test_dedup_ratio_all_dupes() {
297        let mut store = HashStore::new();
298        store.insert(make_entry("same", "/1.mp4", 10));
299        store.insert(make_entry("same", "/2.mp4", 10));
300        store.insert(make_entry("same", "/3.mp4", 10));
301        assert!((store.dedup_ratio() - 1.0 / 3.0).abs() < 1e-9);
302    }
303
304    #[test]
305    fn test_dedup_ratio_empty() {
306        let store = HashStore::new();
307        assert!((store.dedup_ratio() - 1.0).abs() < 1e-9);
308    }
309
310    #[test]
311    fn test_with_capacity() {
312        let store = HashStore::with_capacity(1000);
313        assert!(store.is_empty());
314    }
315}