Skip to main content

oximedia_dedup/
cross_format.rs

1//! Cross-format duplicate detection: same content in different containers/codecs.
2//!
3//! Detects when the same media content exists in multiple file formats
4//! (e.g., the same video in MP4/MKV/WebM or the same audio in FLAC/OGG/WAV).
5//!
6//! # Approach
7//!
8//! Cross-format duplicates cannot be found by cryptographic hashing because
9//! different containers/codecs produce entirely different byte streams.  Instead
10//! we combine multiple format-agnostic signals:
11//!
12//! 1. **Duration matching** -- content in different containers should have
13//!    nearly identical duration.
14//! 2. **Perceptual hash matching** -- visual content produces similar pHash /
15//!    dHash regardless of codec.
16//! 3. **Audio fingerprint matching** -- spectral fingerprints survive
17//!    re-encoding.
18//! 4. **Resolution / channel layout matching** -- same content typically
19//!    retains the same frame size and audio channel count.
20//!
21//! The module assigns a **cross-format confidence** score (0.0 - 1.0) to
22//! each candidate pair and groups files above a configurable threshold.
23
24#![allow(dead_code)]
25#![allow(clippy::cast_precision_loss)]
26
27use std::collections::HashMap;
28
29// ---------------------------------------------------------------------------
30// FormatInfo
31// ---------------------------------------------------------------------------
32
/// Format-agnostic description of a single media file.
///
/// Only `path` and `container` are required. Every other signal is optional
/// and stays `None` until one of the `with_*` builders fills it in.
#[derive(Debug, Clone)]
pub struct FormatInfo {
    /// Path identifying the file.
    pub path: String,
    /// Container label such as "mp4", "mkv", "webm" or "flac".
    pub container: String,
    /// Label of the video codec (e.g. "av1", "vp9") when the file has video.
    pub video_codec: Option<String>,
    /// Label of the audio codec (e.g. "opus", "vorbis", "flac") when the file has audio.
    pub audio_codec: Option<String>,
    /// Playback duration, in seconds.
    pub duration_secs: Option<f64>,
    /// Frame width, in pixels.
    pub width: Option<u32>,
    /// Frame height, in pixels.
    pub height: Option<u32>,
    /// Audio sample rate, in Hz.
    pub sample_rate: Option<u32>,
    /// Count of audio channels.
    pub audio_channels: Option<u32>,
    /// 64-bit perceptual hash taken from a representative frame, when available.
    pub phash: Option<u64>,
    /// Raw audio fingerprint bytes, when available.
    pub audio_fingerprint: Option<Vec<u8>>,
}

impl FormatInfo {
    /// Builds a descriptor that knows only its path and container.
    ///
    /// All optional signals start out as `None`.
    #[must_use]
    pub fn new(path: impl Into<String>, container: impl Into<String>) -> Self {
        let path = path.into();
        let container = container.into();
        Self {
            path,
            container,
            video_codec: None,
            audio_codec: None,
            duration_secs: None,
            width: None,
            height: None,
            sample_rate: None,
            audio_channels: None,
            phash: None,
            audio_fingerprint: None,
        }
    }

    /// Builder: records the duration in seconds.
    #[must_use]
    pub fn with_duration(self, secs: f64) -> Self {
        Self {
            duration_secs: Some(secs),
            ..self
        }
    }

    /// Builder: records the frame width and height in pixels.
    #[must_use]
    pub fn with_resolution(self, w: u32, h: u32) -> Self {
        Self {
            width: Some(w),
            height: Some(h),
            ..self
        }
    }

    /// Builder: replaces both codec labels (passing `None` clears a label).
    #[must_use]
    pub fn with_codecs(self, video: Option<String>, audio: Option<String>) -> Self {
        Self {
            video_codec: video,
            audio_codec: audio,
            ..self
        }
    }

    /// Builder: records the perceptual hash.
    #[must_use]
    pub fn with_phash(self, hash: u64) -> Self {
        Self {
            phash: Some(hash),
            ..self
        }
    }

    /// Builder: records the audio fingerprint bytes.
    #[must_use]
    pub fn with_audio_fingerprint(self, fp: Vec<u8>) -> Self {
        Self {
            audio_fingerprint: Some(fp),
            ..self
        }
    }

    /// Builder: records the audio sample rate (Hz) and channel count.
    #[must_use]
    pub fn with_audio_info(self, sample_rate: u32, channels: u32) -> Self {
        Self {
            sample_rate: Some(sample_rate),
            audio_channels: Some(channels),
            ..self
        }
    }

    /// Reports whether `self` and `other` use different containers.
    ///
    /// The container labels are lower-cased before comparison, so "mp4" and
    /// "MP4" count as the same format.
    #[must_use]
    pub fn is_different_format(&self, other: &Self) -> bool {
        let ours = self.container.to_lowercase();
        let theirs = other.container.to_lowercase();
        ours != theirs
    }
}
130
131// ---------------------------------------------------------------------------
132// CrossFormatConfig
133// ---------------------------------------------------------------------------
134
/// Tunable thresholds and signal weights for cross-format detection.
///
/// The four `weight_*` fields are relative: they are rescaled by
/// [`CrossFormatConfig::normalised_weights`] before use, so they do not need
/// to sum to 1.0.
#[derive(Debug, Clone)]
pub struct CrossFormatConfig {
    /// Maximum allowed duration difference in seconds.
    pub max_duration_diff_secs: f64,
    /// Maximum Hamming distance for perceptual hash match (out of 64 bits).
    pub max_phash_distance: u32,
    /// Minimum audio fingerprint similarity (0.0 - 1.0).
    ///
    /// NOTE(review): the scoring code in this module does not currently read
    /// this field -- confirm whether it is meant to gate the audio signal.
    pub min_audio_similarity: f64,
    /// Overall confidence threshold for declaring a cross-format duplicate.
    pub confidence_threshold: f64,
    /// Weight for duration similarity in composite score.
    pub weight_duration: f64,
    /// Weight for resolution match in composite score.
    pub weight_resolution: f64,
    /// Weight for perceptual hash match in composite score.
    pub weight_phash: f64,
    /// Weight for audio fingerprint match in composite score.
    pub weight_audio: f64,
}

impl Default for CrossFormatConfig {
    /// Default thresholds; the default weights already sum to 1.0.
    fn default() -> Self {
        Self {
            max_duration_diff_secs: 0.5,
            max_phash_distance: 8,
            min_audio_similarity: 0.80,
            confidence_threshold: 0.75,
            weight_duration: 0.25,
            weight_resolution: 0.15,
            weight_phash: 0.35,
            weight_audio: 0.25,
        }
    }
}

impl CrossFormatConfig {
    /// Rescales the four signal weights so that they sum to 1.0.
    ///
    /// Returns `(duration, resolution, phash, audio)`.
    ///
    /// A weight that is NaN, infinite or negative is treated as 0.0: left
    /// untouched it would turn every normalised weight (and therefore every
    /// confidence score) into NaN or a negative share.  When no usable weight
    /// remains, or the total is effectively zero, all four signals are
    /// weighted equally at 0.25.
    #[must_use]
    pub fn normalised_weights(&self) -> (f64, f64, f64, f64) {
        // Keep only weights that can act as a finite, non-negative share.
        let usable = |w: f64| if w.is_finite() && w > 0.0 { w } else { 0.0 };

        let duration = usable(self.weight_duration);
        let resolution = usable(self.weight_resolution);
        let phash = usable(self.weight_phash);
        let audio = usable(self.weight_audio);

        let total = duration + resolution + phash + audio;
        // `total` can still overflow to infinity when the weights are enormous.
        if !total.is_finite() || total < f64::EPSILON {
            return (0.25, 0.25, 0.25, 0.25);
        }

        (
            duration / total,
            resolution / total,
            phash / total,
            audio / total,
        )
    }
}
188
189// ---------------------------------------------------------------------------
190// CrossFormatMatch
191// ---------------------------------------------------------------------------
192
193/// A confirmed cross-format duplicate pair.
194#[derive(Debug, Clone)]
195pub struct CrossFormatMatch {
196    /// Path of the first file.
197    pub path_a: String,
198    /// Path of the second file.
199    pub path_b: String,
200    /// Container of the first file.
201    pub container_a: String,
202    /// Container of the second file.
203    pub container_b: String,
204    /// Overall confidence score (0.0 - 1.0).
205    pub confidence: f64,
206    /// Individual signal scores.
207    pub signal_scores: SignalScores,
208}
209
/// Per-signal similarity scores for one compared pair.
///
/// A field is `None` when the corresponding signal could not be computed
/// because at least one of the two files lacks the required data.
#[derive(Debug, Clone)]
pub struct SignalScores {
    /// Duration similarity (1.0 = identical durations).
    pub duration: Option<f64>,
    /// Resolution similarity (1.0 = identical frame size, lower values for
    /// near or partial matches, 0.0 = unrelated sizes).
    pub resolution: Option<f64>,
    /// Perceptual hash similarity (0.0 - 1.0).
    pub phash: Option<f64>,
    /// Audio fingerprint similarity (0.0 - 1.0).
    pub audio: Option<f64>,
}
222
223// ---------------------------------------------------------------------------
224// CrossFormatGroup
225// ---------------------------------------------------------------------------
226
/// A set of files that carry the same content in more than one format.
#[derive(Debug, Clone)]
pub struct CrossFormatGroup {
    /// Paths of the files belonging to the group.
    pub files: Vec<String>,
    /// Container labels that occur within the group.
    pub containers: Vec<String>,
    /// Highest confidence found among the matched pairs of the group.
    pub best_confidence: f64,
}
237
238// ---------------------------------------------------------------------------
239// Comparison functions
240// ---------------------------------------------------------------------------
241
/// Scores how close two durations (in seconds) are.
///
/// Yields `None` unless both durations are known.  Otherwise the score is 1.0
/// for equal durations and decreases linearly to 0.0 once the gap reaches
/// `max_diff` seconds.  With a (near-)zero or negative `max_diff` the result
/// is all-or-nothing: 1.0 for a gap below `f64::EPSILON`, else 0.0.
fn duration_similarity(a: Option<f64>, b: Option<f64>, max_diff: f64) -> Option<f64> {
    a.zip(b).map(|(lhs, rhs)| {
        let gap = (lhs - rhs).abs();
        if max_diff < f64::EPSILON {
            // No tolerance configured: avoid dividing by ~0 and demand equality.
            if gap < f64::EPSILON {
                1.0
            } else {
                0.0
            }
        } else {
            (1.0 - gap / max_diff).max(0.0)
        }
    })
}
257
/// Scores how close two frame sizes are.
///
/// Yields `None` unless all four dimensions are known.  Otherwise:
///
/// * 1.0  -- width and height are both identical;
/// * 0.95 -- both dimensions agree to better than 99 % (e.g. 1920 vs 1918);
/// * 0.85 -- both dimensions agree to better than 95 %;
/// * 0.5  -- exactly one dimension is identical;
/// * 0.0  -- anything else.
fn resolution_similarity(
    w_a: Option<u32>,
    h_a: Option<u32>,
    w_b: Option<u32>,
    h_b: Option<u32>,
) -> Option<f64> {
    let (wa, ha, wb, hb) = (w_a?, h_a?, w_b?, h_b?);

    if wa == wb && ha == hb {
        return Some(1.0);
    }

    // Smaller-over-larger ratio of one dimension; the divisor is clamped to 1
    // so that two zero-sized dimensions do not divide by zero.
    let ratio = |x: u32, y: u32| f64::from(x.min(y)) / f64::from(x.max(y).max(1));

    // Both ratios exceed a threshold exactly when the smaller of them does.
    let worst_ratio = ratio(wa, wb).min(ratio(ha, hb));

    let score = if worst_ratio > 0.99 {
        0.95
    } else if worst_ratio > 0.95 {
        0.85
    } else if wa == wb || ha == hb {
        0.5
    } else {
        0.0
    };

    Some(score)
}
293
/// Scores two 64-bit perceptual hashes by their Hamming distance.
///
/// Yields `None` unless both hashes are known.  A distance above
/// `max_distance` scores 0.0; otherwise the score is the fraction of the
/// 64 bits that agree.
fn phash_similarity(a: Option<u64>, b: Option<u64>, max_distance: u32) -> Option<f64> {
    a.zip(b).map(|(left, right)| {
        let differing = (left ^ right).count_ones();
        if differing <= max_distance {
            1.0 - f64::from(differing) / 64.0
        } else {
            0.0
        }
    })
}
308
/// Scores two audio fingerprints by bit-level Hamming similarity.
///
/// Yields `None` unless both fingerprints are present.  An empty fingerprint
/// on either side scores 0.0.  When the lengths differ, only the common
/// prefix (the length of the shorter fingerprint) is compared; the score is
/// the fraction of bits in that prefix that agree.
fn audio_fingerprint_similarity(a: &Option<Vec<u8>>, b: &Option<Vec<u8>>) -> Option<f64> {
    let (fa, fb) = (a.as_deref()?, b.as_deref()?);
    if fa.is_empty() || fb.is_empty() {
        return Some(0.0);
    }

    // `zip` stops at the shorter fingerprint, so this is the compared length.
    // It is non-zero here because both slices are non-empty.
    let compared_bytes = fa.len().min(fb.len());

    // Count in u64: a u32 counter would overflow for very large fingerprints.
    let differing_bits: u64 = fa
        .iter()
        .zip(fb)
        .map(|(x, y)| u64::from((x ^ y).count_ones()))
        .sum();

    Some(1.0 - differing_bits as f64 / (compared_bytes as f64 * 8.0))
}
332
333// ---------------------------------------------------------------------------
334// CrossFormatDetector
335// ---------------------------------------------------------------------------
336
337/// Detector for cross-format duplicates.
338#[derive(Debug)]
339pub struct CrossFormatDetector {
340    config: CrossFormatConfig,
341    items: Vec<FormatInfo>,
342}
343
344impl CrossFormatDetector {
345    /// Create a new detector.
346    #[must_use]
347    pub fn new(config: CrossFormatConfig) -> Self {
348        Self {
349            config,
350            items: Vec::new(),
351        }
352    }
353
354    /// Create a detector with default configuration.
355    #[must_use]
356    pub fn with_defaults() -> Self {
357        Self::new(CrossFormatConfig::default())
358    }
359
360    /// Add a file to the detection pool.
361    pub fn add(&mut self, info: FormatInfo) {
362        self.items.push(info);
363    }
364
365    /// Add multiple files.
366    pub fn add_batch(&mut self, infos: impl IntoIterator<Item = FormatInfo>) {
367        self.items.extend(infos);
368    }
369
370    /// Number of files in the pool.
371    #[must_use]
372    pub fn item_count(&self) -> usize {
373        self.items.len()
374    }
375
376    /// Compare two items and return a match if above threshold.
377    fn compare_pair(&self, a: &FormatInfo, b: &FormatInfo) -> Option<CrossFormatMatch> {
378        // Only compare files in different formats.
379        if !a.is_different_format(b) {
380            return None;
381        }
382
383        // Quick rejection: duration must be close.
384        if let (Some(da), Some(db)) = (a.duration_secs, b.duration_secs) {
385            if (da - db).abs() > self.config.max_duration_diff_secs * 2.0 {
386                return None;
387            }
388        }
389
390        let dur_sim = duration_similarity(
391            a.duration_secs,
392            b.duration_secs,
393            self.config.max_duration_diff_secs,
394        );
395        let res_sim = resolution_similarity(a.width, a.height, b.width, b.height);
396        let phash_sim = phash_similarity(a.phash, b.phash, self.config.max_phash_distance);
397        let audio_sim = audio_fingerprint_similarity(&a.audio_fingerprint, &b.audio_fingerprint);
398
399        let signal_scores = SignalScores {
400            duration: dur_sim,
401            resolution: res_sim,
402            phash: phash_sim,
403            audio: audio_sim,
404        };
405
406        // Compute weighted confidence.
407        let (wd, wr, wp, wa) = self.config.normalised_weights();
408        let mut weighted_sum = 0.0;
409        let mut weight_sum = 0.0;
410
411        if let Some(s) = dur_sim {
412            weighted_sum += s * wd;
413            weight_sum += wd;
414        }
415        if let Some(s) = res_sim {
416            weighted_sum += s * wr;
417            weight_sum += wr;
418        }
419        if let Some(s) = phash_sim {
420            weighted_sum += s * wp;
421            weight_sum += wp;
422        }
423        if let Some(s) = audio_sim {
424            weighted_sum += s * wa;
425            weight_sum += wa;
426        }
427
428        if weight_sum < f64::EPSILON {
429            return None;
430        }
431
432        let confidence = weighted_sum / weight_sum;
433
434        if confidence >= self.config.confidence_threshold {
435            Some(CrossFormatMatch {
436                path_a: a.path.clone(),
437                path_b: b.path.clone(),
438                container_a: a.container.clone(),
439                container_b: b.container.clone(),
440                confidence,
441                signal_scores,
442            })
443        } else {
444            None
445        }
446    }
447
448    /// Find all cross-format duplicate pairs.
449    #[must_use]
450    pub fn find_matches(&self) -> Vec<CrossFormatMatch> {
451        let mut matches = Vec::new();
452        let mut seen_pairs = std::collections::HashSet::new();
453
454        // Group by approximate duration to reduce comparisons.
455        let buckets = self.bucket_by_duration();
456
457        for bucket in buckets.values() {
458            if bucket.len() < 2 {
459                continue;
460            }
461            for i in 0..bucket.len() {
462                for j in (i + 1)..bucket.len() {
463                    let (lo, hi) = if bucket[i] < bucket[j] {
464                        (bucket[i], bucket[j])
465                    } else {
466                        (bucket[j], bucket[i])
467                    };
468                    if !seen_pairs.insert((lo, hi)) {
469                        continue; // already checked this pair
470                    }
471                    if let Some(m) = self.compare_pair(&self.items[lo], &self.items[hi]) {
472                        matches.push(m);
473                    }
474                }
475            }
476        }
477
478        // Sort by confidence descending.
479        matches.sort_by(|a, b| {
480            b.confidence
481                .partial_cmp(&a.confidence)
482                .unwrap_or(std::cmp::Ordering::Equal)
483        });
484        matches
485    }
486
487    /// Find and group cross-format duplicates using transitive closure.
488    #[must_use]
489    pub fn find_groups(&self) -> Vec<CrossFormatGroup> {
490        let matches = self.find_matches();
491        if matches.is_empty() {
492            return Vec::new();
493        }
494
495        // Build path index.
496        let mut path_to_idx: HashMap<&str, usize> = HashMap::new();
497        for (i, item) in self.items.iter().enumerate() {
498            path_to_idx.insert(&item.path, i);
499        }
500
501        // Union-Find for grouping.
502        let n = self.items.len();
503        let mut parent: Vec<usize> = (0..n).collect();
504
505        let find = |parent: &mut Vec<usize>, mut x: usize| -> usize {
506            while parent[x] != x {
507                parent[x] = parent[parent[x]]; // path halving
508                x = parent[x];
509            }
510            x
511        };
512
513        let mut best_confidence: Vec<f64> = vec![0.0; n];
514
515        for m in &matches {
516            if let (Some(&ia), Some(&ib)) = (
517                path_to_idx.get(m.path_a.as_str()),
518                path_to_idx.get(m.path_b.as_str()),
519            ) {
520                let ra = find(&mut parent, ia);
521                let rb = find(&mut parent, ib);
522                if ra != rb {
523                    parent[ra] = rb;
524                }
525                best_confidence[ia] = best_confidence[ia].max(m.confidence);
526                best_confidence[ib] = best_confidence[ib].max(m.confidence);
527            }
528        }
529
530        // Collect groups.
531        let mut groups_map: HashMap<usize, Vec<usize>> = HashMap::new();
532        for i in 0..n {
533            let root = find(&mut parent, i);
534            groups_map.entry(root).or_default().push(i);
535        }
536
537        groups_map
538            .into_values()
539            .filter(|g| g.len() > 1)
540            .filter(|g| {
541                // Ensure at least 2 different containers.
542                let containers: std::collections::HashSet<&str> = g
543                    .iter()
544                    .map(|&i| self.items[i].container.as_str())
545                    .collect();
546                containers.len() > 1
547            })
548            .map(|g| {
549                let mut containers: Vec<String> =
550                    g.iter().map(|&i| self.items[i].container.clone()).collect();
551                containers.sort();
552                containers.dedup();
553
554                let bc = g.iter().map(|&i| best_confidence[i]).fold(0.0f64, f64::max);
555
556                CrossFormatGroup {
557                    files: g.iter().map(|&i| self.items[i].path.clone()).collect(),
558                    containers,
559                    best_confidence: bc,
560                }
561            })
562            .collect()
563    }
564
565    /// Bucket items by rounded duration for efficient comparison.
566    fn bucket_by_duration(&self) -> HashMap<i64, Vec<usize>> {
567        let mut buckets: HashMap<i64, Vec<usize>> = HashMap::new();
568        let bucket_width = self.config.max_duration_diff_secs.max(0.5);
569
570        for (idx, item) in self.items.iter().enumerate() {
571            match item.duration_secs {
572                Some(d) => {
573                    // Insert into the primary bucket and adjacent buckets
574                    // to handle boundary cases.
575                    let primary = (d / bucket_width) as i64;
576                    for offset in -1..=1 {
577                        buckets.entry(primary + offset).or_default().push(idx);
578                    }
579                }
580                None => {
581                    // Items without duration go into a special bucket.
582                    buckets.entry(i64::MIN).or_default().push(idx);
583                }
584            }
585        }
586
587        // Deduplicate indices within each bucket.
588        for bucket in buckets.values_mut() {
589            bucket.sort_unstable();
590            bucket.dedup();
591        }
592
593        buckets
594    }
595}
596
597// ---------------------------------------------------------------------------
598// Tests
599// ---------------------------------------------------------------------------
600
601#[cfg(test)]
602mod tests {
603    use super::*;
604
605    #[test]
606    fn test_format_info_creation() {
607        let info = FormatInfo::new("video.mp4", "mp4")
608            .with_duration(120.5)
609            .with_resolution(1920, 1080);
610        assert_eq!(info.path, "video.mp4");
611        assert_eq!(info.container, "mp4");
612        assert_eq!(info.duration_secs, Some(120.5));
613        assert_eq!(info.width, Some(1920));
614        assert_eq!(info.height, Some(1080));
615    }
616
617    #[test]
618    fn test_is_different_format() {
619        let a = FormatInfo::new("a.mp4", "mp4");
620        let b = FormatInfo::new("b.mkv", "mkv");
621        let c = FormatInfo::new("c.mp4", "MP4");
622
623        assert!(a.is_different_format(&b));
624        assert!(!a.is_different_format(&c)); // case-insensitive
625    }
626
627    #[test]
628    fn test_duration_similarity_identical() {
629        let sim = duration_similarity(Some(120.0), Some(120.0), 0.5);
630        assert_eq!(sim, Some(1.0));
631    }
632
633    #[test]
634    fn test_duration_similarity_close() {
635        let sim = duration_similarity(Some(120.0), Some(120.3), 0.5);
636        let s = sim.expect("should be Some");
637        assert!(s > 0.3 && s < 1.0, "sim = {s}");
638    }
639
640    #[test]
641    fn test_duration_similarity_too_far() {
642        let sim = duration_similarity(Some(120.0), Some(121.0), 0.5);
643        let s = sim.expect("should be Some");
644        assert_eq!(s, 0.0);
645    }
646
647    #[test]
648    fn test_duration_similarity_missing() {
649        assert!(duration_similarity(None, Some(120.0), 0.5).is_none());
650        assert!(duration_similarity(Some(120.0), None, 0.5).is_none());
651    }
652
653    #[test]
654    fn test_resolution_similarity_exact() {
655        let sim = resolution_similarity(Some(1920), Some(1080), Some(1920), Some(1080));
656        assert_eq!(sim, Some(1.0));
657    }
658
659    #[test]
660    fn test_resolution_similarity_different() {
661        let sim = resolution_similarity(Some(1920), Some(1080), Some(1280), Some(720));
662        let s = sim.expect("should be Some");
663        assert_eq!(s, 0.0);
664    }
665
666    #[test]
667    fn test_resolution_similarity_partial() {
668        let sim = resolution_similarity(Some(1920), Some(1080), Some(1920), Some(720));
669        let s = sim.expect("should be Some");
670        assert_eq!(s, 0.5);
671    }
672
673    #[test]
674    fn test_resolution_similarity_missing() {
675        assert!(resolution_similarity(None, Some(1080), Some(1920), Some(1080)).is_none());
676    }
677
678    #[test]
679    fn test_phash_similarity_identical() {
680        let sim = phash_similarity(Some(0xDEADBEEF), Some(0xDEADBEEF), 8);
681        assert_eq!(sim, Some(1.0));
682    }
683
684    #[test]
685    fn test_phash_similarity_close() {
686        let a = 0xFFFF_FFFF_FFFF_FFFFu64;
687        let b = a ^ 0b1111; // 4 bits different
688        let sim = phash_similarity(Some(a), Some(b), 8);
689        let s = sim.expect("should be Some");
690        assert!(s > 0.9, "sim = {s}");
691    }
692
693    #[test]
694    fn test_phash_similarity_too_far() {
695        let sim = phash_similarity(Some(0x0), Some(0xFFFF_FFFF_FFFF_FFFF), 8);
696        let s = sim.expect("should be Some");
697        assert_eq!(s, 0.0);
698    }
699
700    #[test]
701    fn test_audio_fingerprint_similarity_identical() {
702        let fp = vec![0xAB, 0xCD, 0xEF, 0x01];
703        let sim = audio_fingerprint_similarity(&Some(fp.clone()), &Some(fp));
704        assert_eq!(sim, Some(1.0));
705    }
706
707    #[test]
708    fn test_audio_fingerprint_similarity_different() {
709        let a = vec![0xFF, 0xFF, 0xFF, 0xFF];
710        let b = vec![0x00, 0x00, 0x00, 0x00];
711        let sim = audio_fingerprint_similarity(&Some(a), &Some(b));
712        assert_eq!(sim, Some(0.0));
713    }
714
715    #[test]
716    fn test_audio_fingerprint_similarity_missing() {
717        let fp = vec![0xAB];
718        assert!(audio_fingerprint_similarity(&None, &Some(fp)).is_none());
719    }
720
721    #[test]
722    fn test_cross_format_config_normalised_weights() {
723        let config = CrossFormatConfig::default();
724        let (wd, wr, wp, wa) = config.normalised_weights();
725        let total = wd + wr + wp + wa;
726        assert!((total - 1.0).abs() < 1e-10);
727    }
728
729    #[test]
730    fn test_detector_identical_content_different_format() {
731        let mut detector = CrossFormatDetector::with_defaults();
732
733        let hash = 0xDEAD_BEEF_CAFE_BABEu64;
734        let fp = vec![0xAB, 0xCD, 0xEF, 0x01, 0x23, 0x45, 0x67, 0x89];
735
736        detector.add(
737            FormatInfo::new("video.mp4", "mp4")
738                .with_duration(120.0)
739                .with_resolution(1920, 1080)
740                .with_phash(hash)
741                .with_audio_fingerprint(fp.clone()),
742        );
743        detector.add(
744            FormatInfo::new("video.mkv", "mkv")
745                .with_duration(120.0)
746                .with_resolution(1920, 1080)
747                .with_phash(hash)
748                .with_audio_fingerprint(fp),
749        );
750
751        let matches = detector.find_matches();
752        assert_eq!(matches.len(), 1);
753        assert!(matches[0].confidence > 0.99);
754    }
755
756    #[test]
757    fn test_detector_same_format_not_matched() {
758        let mut detector = CrossFormatDetector::with_defaults();
759
760        let hash = 0xDEAD_BEEF_CAFE_BABEu64;
761        detector.add(
762            FormatInfo::new("a.mp4", "mp4")
763                .with_duration(120.0)
764                .with_phash(hash),
765        );
766        detector.add(
767            FormatInfo::new("b.mp4", "mp4")
768                .with_duration(120.0)
769                .with_phash(hash),
770        );
771
772        let matches = detector.find_matches();
773        assert!(matches.is_empty(), "same format should not be matched");
774    }
775
776    #[test]
777    fn test_detector_duration_too_different() {
778        let mut detector = CrossFormatDetector::with_defaults();
779
780        detector.add(
781            FormatInfo::new("short.mp4", "mp4")
782                .with_duration(60.0)
783                .with_resolution(1920, 1080),
784        );
785        detector.add(
786            FormatInfo::new("long.mkv", "mkv")
787                .with_duration(120.0)
788                .with_resolution(1920, 1080),
789        );
790
791        let matches = detector.find_matches();
792        assert!(
793            matches.is_empty(),
794            "very different durations should not match"
795        );
796    }
797
798    #[test]
799    fn test_detector_find_groups() {
800        let mut detector = CrossFormatDetector::with_defaults();
801
802        let hash = 0xAAAA_BBBB_CCCC_DDDDu64;
803        let fp = vec![0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88];
804
805        for (path, container) in &[
806            ("video.mp4", "mp4"),
807            ("video.mkv", "mkv"),
808            ("video.webm", "webm"),
809        ] {
810            detector.add(
811                FormatInfo::new(*path, *container)
812                    .with_duration(90.0)
813                    .with_resolution(1280, 720)
814                    .with_phash(hash)
815                    .with_audio_fingerprint(fp.clone()),
816            );
817        }
818
819        let groups = detector.find_groups();
820        assert_eq!(groups.len(), 1);
821        assert_eq!(groups[0].files.len(), 3);
822        assert!(groups[0].containers.len() >= 2);
823        assert!(groups[0].best_confidence > 0.9);
824    }
825
826    #[test]
827    fn test_detector_two_separate_groups() {
828        let mut detector = CrossFormatDetector::with_defaults();
829
830        // Group 1
831        detector.add(
832            FormatInfo::new("a.mp4", "mp4")
833                .with_duration(60.0)
834                .with_resolution(1920, 1080)
835                .with_phash(0x1111_1111_1111_1111),
836        );
837        detector.add(
838            FormatInfo::new("a.mkv", "mkv")
839                .with_duration(60.0)
840                .with_resolution(1920, 1080)
841                .with_phash(0x1111_1111_1111_1111),
842        );
843
844        // Group 2 (different content)
845        detector.add(
846            FormatInfo::new("b.mp4", "mp4")
847                .with_duration(300.0)
848                .with_resolution(1280, 720)
849                .with_phash(0xFFFF_FFFF_FFFF_FFFF),
850        );
851        detector.add(
852            FormatInfo::new("b.webm", "webm")
853                .with_duration(300.0)
854                .with_resolution(1280, 720)
855                .with_phash(0xFFFF_FFFF_FFFF_FFFF),
856        );
857
858        let groups = detector.find_groups();
859        assert_eq!(groups.len(), 2);
860    }
861
862    #[test]
863    fn test_detector_empty_pool() {
864        let detector = CrossFormatDetector::with_defaults();
865        assert!(detector.find_matches().is_empty());
866        assert!(detector.find_groups().is_empty());
867    }
868
869    #[test]
870    fn test_detector_single_item() {
871        let mut detector = CrossFormatDetector::with_defaults();
872        detector.add(FormatInfo::new("only.mp4", "mp4").with_duration(60.0));
873        assert!(detector.find_matches().is_empty());
874    }
875
876    #[test]
877    fn test_detector_partial_signals() {
878        // Only duration and resolution, no phash/audio
879        let mut detector = CrossFormatDetector::new(CrossFormatConfig {
880            confidence_threshold: 0.5, // lower threshold since we have fewer signals
881            ..CrossFormatConfig::default()
882        });
883
884        detector.add(
885            FormatInfo::new("video.mp4", "mp4")
886                .with_duration(120.0)
887                .with_resolution(1920, 1080),
888        );
889        detector.add(
890            FormatInfo::new("video.mkv", "mkv")
891                .with_duration(120.0)
892                .with_resolution(1920, 1080),
893        );
894
895        let matches = detector.find_matches();
896        assert_eq!(matches.len(), 1);
897        // Score should reflect only the available signals
898        assert!(matches[0].confidence >= 0.5);
899    }
900
901    #[test]
902    fn test_detector_audio_only_content() {
903        let mut detector = CrossFormatDetector::with_defaults();
904
905        let fp = vec![0xAA; 32];
906        detector.add(
907            FormatInfo::new("song.flac", "flac")
908                .with_duration(180.0)
909                .with_audio_fingerprint(fp.clone())
910                .with_audio_info(44100, 2),
911        );
912        detector.add(
913            FormatInfo::new("song.ogg", "ogg")
914                .with_duration(180.0)
915                .with_audio_fingerprint(fp)
916                .with_audio_info(44100, 2),
917        );
918
919        let matches = detector.find_matches();
920        assert_eq!(matches.len(), 1);
921        assert!(matches[0].confidence > 0.7);
922    }
923
924    #[test]
925    fn test_signal_scores_populated() {
926        let mut detector = CrossFormatDetector::with_defaults();
927
928        let hash = 0xDEAD_BEEF_CAFE_BABEu64;
929        detector.add(
930            FormatInfo::new("a.mp4", "mp4")
931                .with_duration(100.0)
932                .with_resolution(1920, 1080)
933                .with_phash(hash),
934        );
935        detector.add(
936            FormatInfo::new("a.mkv", "mkv")
937                .with_duration(100.0)
938                .with_resolution(1920, 1080)
939                .with_phash(hash),
940        );
941
942        let matches = detector.find_matches();
943        assert_eq!(matches.len(), 1);
944
945        let scores = &matches[0].signal_scores;
946        assert_eq!(scores.duration, Some(1.0));
947        assert_eq!(scores.resolution, Some(1.0));
948        assert_eq!(scores.phash, Some(1.0));
949        assert!(scores.audio.is_none()); // no audio fingerprint provided
950    }
951
952    #[test]
953    fn test_item_count() {
954        let mut detector = CrossFormatDetector::with_defaults();
955        assert_eq!(detector.item_count(), 0);
956        detector.add(FormatInfo::new("a.mp4", "mp4"));
957        detector.add(FormatInfo::new("b.mkv", "mkv"));
958        assert_eq!(detector.item_count(), 2);
959    }
960
961    #[test]
962    fn test_add_batch() {
963        let mut detector = CrossFormatDetector::with_defaults();
964        detector.add_batch(vec![
965            FormatInfo::new("a.mp4", "mp4"),
966            FormatInfo::new("b.mkv", "mkv"),
967            FormatInfo::new("c.webm", "webm"),
968        ]);
969        assert_eq!(detector.item_count(), 3);
970    }
971
972    #[test]
973    fn test_resolution_similarity_near_identical() {
974        // Encoding quirk: 1920 vs 1918
975        let sim = resolution_similarity(Some(1920), Some(1080), Some(1918), Some(1080));
976        let s = sim.expect("should be Some");
977        assert!(s > 0.8, "near-identical resolution should score high: {s}");
978    }
979
980    #[test]
981    fn test_matches_sorted_by_confidence() {
982        let mut detector = CrossFormatDetector::new(CrossFormatConfig {
983            confidence_threshold: 0.3,
984            ..CrossFormatConfig::default()
985        });
986
987        // High confidence pair
988        detector.add(
989            FormatInfo::new("a.mp4", "mp4")
990                .with_duration(100.0)
991                .with_resolution(1920, 1080)
992                .with_phash(0xAAAA),
993        );
994        detector.add(
995            FormatInfo::new("a.mkv", "mkv")
996                .with_duration(100.0)
997                .with_resolution(1920, 1080)
998                .with_phash(0xAAAA),
999        );
1000
1001        // Lower confidence pair (duration slightly off)
1002        detector.add(
1003            FormatInfo::new("b.mp4", "mp4")
1004                .with_duration(200.0)
1005                .with_resolution(1280, 720)
1006                .with_phash(0xBBBB),
1007        );
1008        detector.add(
1009            FormatInfo::new("b.webm", "webm")
1010                .with_duration(200.2)
1011                .with_resolution(1280, 720)
1012                .with_phash(0xBBBB),
1013        );
1014
1015        let matches = detector.find_matches();
1016        assert!(matches.len() >= 2);
1017        // Should be sorted descending by confidence
1018        for i in 1..matches.len() {
1019            assert!(matches[i - 1].confidence >= matches[i].confidence);
1020        }
1021    }
1022
1023    #[test]
1024    fn test_format_info_builders() {
1025        let info = FormatInfo::new("test.mp4", "mp4")
1026            .with_codecs(Some("av1".into()), Some("opus".into()))
1027            .with_audio_info(48000, 6);
1028        assert_eq!(info.video_codec.as_deref(), Some("av1"));
1029        assert_eq!(info.audio_codec.as_deref(), Some("opus"));
1030        assert_eq!(info.sample_rate, Some(48000));
1031        assert_eq!(info.audio_channels, Some(6));
1032    }
1033}