Skip to main content

oximedia_dedup/
content_id.rs

1//! Content ID and fingerprinting for media assets.
2//!
3//! This module provides:
4//! - `ContentId`: a UUID-like content identifier derived from FNV-128 hashing
5//! - `ContentFingerprint`: combined audio/visual/metadata fingerprint
6//! - `ContentIdRegistry`: registry for deduplication and lookup
7//! - `ContentIdStats`: deduplication statistics
8
9#![allow(dead_code)]
10
11// ---------------------------------------------------------------------------
12// ContentId
13// ---------------------------------------------------------------------------
14
15/// A UUID-like content identifier derived from the data's FNV-128 hash.
16#[derive(Debug, Clone, PartialEq, Eq, Hash)]
17pub struct ContentId(pub String);
18
19impl ContentId {
20    /// Generate a `ContentId` from arbitrary byte data using FNV-128.
21    ///
22    /// The result is formatted as a 32-character lowercase hex string.
23    #[must_use]
24    pub fn generate(data: &[u8]) -> Self {
25        let hash = fnv128(data);
26        // Format as two 64-bit hex segments
27        let s = format!("{:016x}{:016x}", hash.0, hash.1);
28        Self(s)
29    }
30
31    /// Return a string slice of the identifier.
32    #[must_use]
33    pub fn as_str(&self) -> &str {
34        &self.0
35    }
36}
37
38impl std::fmt::Display for ContentId {
39    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
40        write!(f, "{}", self.0)
41    }
42}
43
/// Compute the 128-bit FNV-1a hash of `data`.
///
/// Returns the digest as `(high 64 bits, low 64 bits)`.
///
/// For every input byte the state is XORed with the byte and then multiplied
/// by the 128-bit FNV prime modulo 2^128. The arithmetic is done on a native
/// `u128` so that the carry out of the low 64 bits and the `2^88` term of the
/// prime both reach the high half; splitting the multiply into two independent
/// 64-bit products loses that carry and leaves part of the high half
/// independent of the data.
///
/// Empty input yields the FNV offset basis unchanged.
// The `as u64` casts below intentionally keep only the low 64 bits of each half.
#[allow(clippy::cast_possible_truncation)]
fn fnv128(data: &[u8]) -> (u64, u64) {
    // FNV-1a 128-bit offset basis.
    const OFFSET_BASIS: u128 = 0x6c62_272e_07bb_0142_62b8_2175_6295_c58d;
    // FNV 128-bit prime: 2^88 + 2^8 + 0x3b.
    const PRIME: u128 = 0x0000_0000_0100_0000_0000_0000_0000_013b;

    let hash = data.iter().fold(OFFSET_BASIS, |state, &byte| {
        (state ^ u128::from(byte)).wrapping_mul(PRIME)
    });

    ((hash >> 64) as u64, hash as u64)
}
72
73// ---------------------------------------------------------------------------
74// ContentFingerprint
75// ---------------------------------------------------------------------------
76
77/// A combined fingerprint for a media asset.
78#[derive(Debug, Clone)]
79pub struct ContentFingerprint {
80    /// Content identifier.
81    pub id: ContentId,
82    /// Audio fingerprint codes (optional).
83    pub audio_fingerprint: Option<Vec<u32>>,
84    /// Visual fingerprint hashes (optional).
85    pub visual_fingerprint: Option<Vec<u64>>,
86    /// Metadata hash.
87    pub metadata_hash: u64,
88}
89
90impl ContentFingerprint {
91    /// Create a new fingerprint.
92    #[must_use]
93    pub fn new(
94        id: ContentId,
95        audio_fingerprint: Option<Vec<u32>>,
96        visual_fingerprint: Option<Vec<u64>>,
97        metadata_hash: u64,
98    ) -> Self {
99        Self {
100            id,
101            audio_fingerprint,
102            visual_fingerprint,
103            metadata_hash,
104        }
105    }
106}
107
108// ---------------------------------------------------------------------------
109// ContentIdRegistry
110// ---------------------------------------------------------------------------
111
112/// Registry for content fingerprints supporting deduplication and lookup.
113pub struct ContentIdRegistry {
114    /// Stored fingerprints indexed by `ContentId`.
115    fingerprints: Vec<ContentFingerprint>,
116    /// Deduplication statistics.
117    stats: ContentIdStats,
118}
119
120impl ContentIdRegistry {
121    /// Create an empty registry.
122    #[must_use]
123    pub fn new() -> Self {
124        Self {
125            fingerprints: Vec::new(),
126            stats: ContentIdStats::default(),
127        }
128    }
129
130    /// Register a fingerprint.
131    ///
132    /// If an exact `ContentId` already exists, the duplicate count is incremented.
133    pub fn register(&mut self, fingerprint: ContentFingerprint) {
134        let is_duplicate = self.fingerprints.iter().any(|fp| fp.id == fingerprint.id);
135
136        if is_duplicate {
137            self.stats.duplicates_found += 1;
138        } else {
139            self.stats.total_registered += 1;
140            self.fingerprints.push(fingerprint);
141        }
142    }
143
144    /// Look up a fingerprint by `ContentId`.
145    #[must_use]
146    pub fn lookup(&self, id: &ContentId) -> Option<&ContentFingerprint> {
147        self.fingerprints.iter().find(|fp| &fp.id == id)
148    }
149
150    /// Find fingerprints whose audio fingerprint matches the query.
151    ///
152    /// `min_match` is the minimum fraction of matching 32-bit codes (0.0–1.0).
153    #[must_use]
154    pub fn find_by_audio(&self, query: &[u32], min_match: f32) -> Vec<(ContentId, f32)> {
155        self.fingerprints
156            .iter()
157            .filter_map(|fp| {
158                fp.audio_fingerprint.as_ref().map(|audio| {
159                    let sim = audio_code_similarity(query, audio);
160                    (fp.id.clone(), sim)
161                })
162            })
163            .filter(|(_, sim)| *sim >= min_match)
164            .collect()
165    }
166
167    /// Find fingerprints whose visual fingerprint is similar to the query.
168    ///
169    /// `min_match` is the minimum Jaccard similarity.
170    #[must_use]
171    pub fn find_by_visual(&self, query: &[u64], min_match: f32) -> Vec<(ContentId, f32)> {
172        self.fingerprints
173            .iter()
174            .filter_map(|fp| {
175                fp.visual_fingerprint.as_ref().map(|visual| {
176                    let sim = visual_hash_similarity(query, visual);
177                    (fp.id.clone(), sim)
178                })
179            })
180            .filter(|(_, sim)| *sim >= min_match)
181            .collect()
182    }
183
184    /// Get current statistics.
185    #[must_use]
186    pub fn stats(&self) -> &ContentIdStats {
187        &self.stats
188    }
189
190    /// Return the number of registered (unique) fingerprints.
191    #[must_use]
192    pub fn len(&self) -> usize {
193        self.fingerprints.len()
194    }
195
196    /// Return true if no fingerprints have been registered.
197    #[must_use]
198    pub fn is_empty(&self) -> bool {
199        self.fingerprints.is_empty()
200    }
201}
202
203impl Default for ContentIdRegistry {
204    fn default() -> Self {
205        Self::new()
206    }
207}
208
/// Compute the fraction of `query` audio codes that also occur in `candidate`.
///
/// The score is `matches / max(query.len(), candidate.len())`, where `matches`
/// counts every element of `query` (repeats included) whose value appears
/// anywhere in `candidate`. The result lies in `0.0..=1.0`.
///
/// Two empty inputs are treated as identical and score `1.0`.
fn audio_code_similarity(query: &[u32], candidate: &[u32]) -> f32 {
    let longest = query.len().max(candidate.len());
    if longest == 0 {
        return 1.0;
    }

    // Hash the candidate once so each membership probe is O(1) instead of a
    // linear scan of the candidate for every query code.
    let known: std::collections::HashSet<u32> = candidate.iter().copied().collect();
    let matches = query.iter().filter(|code| known.contains(code)).count();

    // Only the ratio matters, so the lossy usize -> f32 conversion is acceptable.
    matches as f32 / longest as f32
}
222
/// Compute the Jaccard similarity between two sets of visual hashes.
///
/// Both slices are treated as sets: repeated hashes are collapsed before
/// comparing, so the result is `|A ∩ B| / |A ∪ B|` and always lies in
/// `0.0..=1.0`. Counting repeats directly would let a slice with duplicated
/// hashes push the score above `1.0`.
///
/// Two empty inputs are treated as identical and score `1.0`.
fn visual_hash_similarity(query: &[u64], candidate: &[u64]) -> f32 {
    let query_set: std::collections::HashSet<u64> = query.iter().copied().collect();
    let candidate_set: std::collections::HashSet<u64> = candidate.iter().copied().collect();

    let shared = query_set.intersection(&candidate_set).count();
    // `shared` never exceeds either set's size, so this subtraction cannot underflow.
    let combined = query_set.len() + candidate_set.len() - shared;
    if combined == 0 {
        return 1.0;
    }

    // Only the ratio matters, so the lossy usize -> f32 conversion is acceptable.
    shared as f32 / combined as f32
}
232
233// ---------------------------------------------------------------------------
234// ContentIdStats
235// ---------------------------------------------------------------------------
236
/// Counters maintained by `ContentIdRegistry`.
///
/// All counters start at zero via `Default`.
#[derive(Clone, Debug, Default)]
pub struct ContentIdStats {
    /// Count of unique items that were registered.
    pub total_registered: u64,
    /// Count of duplicates that were detected (and not stored again).
    pub duplicates_found: u64,
    /// Estimated number of bytes of storage saved (placeholder).
    pub storage_saved_bytes: u64,
}
247
248// ---------------------------------------------------------------------------
249// Unit tests
250// ---------------------------------------------------------------------------
251
#[cfg(test)]
mod tests {
    use super::*;

    // --- ContentId tests ---

    /// A generated identifier is 32 characters long.
    #[test]
    fn test_content_id_generate_length() {
        let generated = ContentId::generate(b"Hello, World!");
        assert_eq!(generated.0.len(), 32);
    }

    /// A generated identifier contains only hexadecimal digits.
    #[test]
    fn test_content_id_generate_hex_chars() {
        let generated = ContentId::generate(b"test data");
        let all_hex = generated.0.chars().all(|ch| ch.is_ascii_hexdigit());
        assert!(all_hex);
    }

    /// Hashing the same bytes twice gives the same identifier.
    #[test]
    fn test_content_id_deterministic() {
        let first = ContentId::generate(b"same input");
        let second = ContentId::generate(b"same input");
        assert_eq!(first, second);
    }

    /// Different inputs give different identifiers.
    #[test]
    fn test_content_id_different_inputs() {
        let from_a = ContentId::generate(b"input A");
        let from_b = ContentId::generate(b"input B");
        assert_ne!(from_a, from_b);
    }

    /// Empty input still produces a full-length identifier.
    #[test]
    fn test_content_id_empty_data() {
        let generated = ContentId::generate(b"");
        assert_eq!(generated.0.len(), 32);
    }

    /// `Display` renders the full 32-character identifier.
    #[test]
    fn test_content_id_display() {
        let generated = ContentId::generate(b"display test");
        let rendered = format!("{generated}");
        assert_eq!(rendered.len(), 32);
    }

    // --- ContentFingerprint / Registry tests ---

    /// Build a fingerprint whose id is derived from `data`, with fixed
    /// audio codes, visual hashes and metadata hash.
    fn make_fp(data: &[u8]) -> ContentFingerprint {
        let audio = vec![1u32, 2, 3, 4];
        let visual = vec![10u64, 20, 30];
        ContentFingerprint::new(
            ContentId::generate(data),
            Some(audio),
            Some(visual),
            0xDEAD_BEEF,
        )
    }

    /// A fresh registry holds nothing.
    #[test]
    fn test_registry_empty() {
        let fresh = ContentIdRegistry::new();
        assert!(fresh.is_empty());
        assert_eq!(fresh.len(), 0);
    }

    /// A registered fingerprint can be found again by its id.
    #[test]
    fn test_registry_register_and_lookup() {
        let mut registry = ContentIdRegistry::new();
        let fingerprint = make_fp(b"video1.mp4");
        let wanted = fingerprint.id.clone();
        registry.register(fingerprint);

        let hit = registry.lookup(&wanted);
        assert!(hit.is_some());
        assert_eq!(hit.expect("operation should succeed").id, wanted);
    }

    /// Registering the same id twice stores one entry and counts one duplicate.
    #[test]
    fn test_registry_duplicate_not_stored() {
        let mut registry = ContentIdRegistry::new();
        // Both fingerprints are derived from the same bytes, hence the same id.
        registry.register(make_fp(b"video1.mp4"));
        registry.register(make_fp(b"video1.mp4"));

        assert_eq!(registry.len(), 1);
        assert_eq!(registry.stats().duplicates_found, 1);
    }

    /// Querying with the stored audio codes yields a perfect match.
    #[test]
    fn test_registry_find_by_audio_match() {
        let mut registry = ContentIdRegistry::new();
        let id = ContentId::generate(b"audio test");
        registry.register(ContentFingerprint::new(
            id,
            Some(vec![1u32, 2, 3, 4, 5]),
            None,
            0,
        ));

        // Identical codes, so the similarity is 1.0.
        let hits = registry.find_by_audio(&[1, 2, 3, 4, 5], 0.9);
        assert_eq!(hits.len(), 1);
        assert_eq!(hits[0].1, 1.0);
    }

    /// Querying with unrelated audio codes yields no results.
    #[test]
    fn test_registry_find_by_audio_no_match() {
        let mut registry = ContentIdRegistry::new();
        let id = ContentId::generate(b"audio test");
        registry.register(ContentFingerprint::new(
            id,
            Some(vec![100u32, 200, 300]),
            None,
            0,
        ));

        // None of the query codes occur in the stored fingerprint.
        let hits = registry.find_by_audio(&[1, 2, 3], 0.5);
        assert!(hits.is_empty());
    }

    /// Querying with the stored visual hashes yields a perfect match.
    #[test]
    fn test_registry_find_by_visual_match() {
        let mut registry = ContentIdRegistry::new();
        let id = ContentId::generate(b"visual test");
        registry.register(ContentFingerprint::new(
            id,
            None,
            Some(vec![10u64, 20, 30, 40]),
            0,
        ));

        let hits = registry.find_by_visual(&[10, 20, 30, 40], 0.9);
        assert_eq!(hits.len(), 1);
        assert_eq!(hits[0].1, 1.0);
    }

    /// Statistics start at zero.
    #[test]
    fn test_registry_stats_initial() {
        let fresh = ContentIdRegistry::new();
        let stats = fresh.stats();
        assert_eq!(stats.total_registered, 0);
        assert_eq!(stats.duplicates_found, 0);
    }

    /// Five distinct ids are all stored and none is counted as a duplicate.
    #[test]
    fn test_registry_multiple_unique() {
        let mut registry = ContentIdRegistry::new();
        (0u8..5).for_each(|seed| registry.register(make_fp(&[seed])));

        assert_eq!(registry.len(), 5);
        assert_eq!(registry.stats().total_registered, 5);
        assert_eq!(registry.stats().duplicates_found, 0);
    }

    /// Two empty code lists count as identical.
    #[test]
    fn test_audio_code_similarity_empty() {
        let score = audio_code_similarity(&[], &[]);
        assert_eq!(score, 1.0);
    }

    /// Code lists with nothing in common score zero.
    #[test]
    fn test_audio_code_similarity_disjoint() {
        let score = audio_code_similarity(&[1, 2, 3], &[4, 5, 6]);
        assert_eq!(score, 0.0);
    }
}
417}