Skip to main content

oximedia_dedup/
metadata.rs

1//! Metadata-based deduplication and fuzzy matching.
2//!
3//! This module provides:
4//! - Filename similarity (Levenshtein distance, fuzzy matching)
5//! - Duration matching with tolerance
6//! - Resolution matching
7//! - Codec and format matching
8//! - Fuzzy metadata comparison
9
10use crate::{DedupError, DedupResult};
11use std::path::{Path, PathBuf};
12
/// Descriptive properties of a single media file.
///
/// Only `path` and `size` are mandatory; every other field is optional and
/// stays `None` until a value has been supplied.
#[derive(Clone, Debug, PartialEq)]
pub struct MediaMetadata {
    /// Location of the file.
    pub path: PathBuf,

    /// Size of the file, in bytes.
    pub size: u64,

    /// Embedded human-readable title (e.g., an ID3/MP4 title tag). It is
    /// fuzzy-matched in addition to the filename.
    pub title: Option<String>,

    /// Playback length in seconds (audio/video only).
    pub duration: Option<f64>,

    /// Frame width, when the file contains video.
    pub width: Option<u32>,

    /// Frame height, when the file contains video.
    pub height: Option<u32>,

    /// Bitrate, in bits per second.
    pub bitrate: Option<u64>,

    /// Video frame rate, when the file contains video.
    pub framerate: Option<f64>,

    /// Audio sample rate, when the file contains audio.
    pub sample_rate: Option<u32>,

    /// Count of audio channels.
    pub channels: Option<u16>,

    /// Name of the video codec.
    pub video_codec: Option<String>,

    /// Name of the audio codec.
    pub audio_codec: Option<String>,

    /// Name of the container format.
    pub container: Option<String>,

    /// Creation timestamp.
    pub created: Option<i64>,

    /// Last-modified timestamp.
    pub modified: Option<i64>,
}
62
63impl MediaMetadata {
64    /// Create new metadata.
65    #[must_use]
66    pub fn new(path: PathBuf, size: u64) -> Self {
67        Self {
68            path,
69            size,
70            title: None,
71            duration: None,
72            width: None,
73            height: None,
74            bitrate: None,
75            framerate: None,
76            sample_rate: None,
77            channels: None,
78            video_codec: None,
79            audio_codec: None,
80            container: None,
81            created: None,
82            modified: None,
83        }
84    }
85
86    /// Get filename without extension.
87    #[must_use]
88    pub fn filename(&self) -> String {
89        self.path
90            .file_stem()
91            .and_then(|s| s.to_str())
92            .unwrap_or("")
93            .to_string()
94    }
95
96    /// Get file extension.
97    #[must_use]
98    pub fn extension(&self) -> String {
99        self.path
100            .extension()
101            .and_then(|s| s.to_str())
102            .unwrap_or("")
103            .to_lowercase()
104    }
105
106    /// Get resolution as string (e.g., "1920x1080").
107    #[must_use]
108    pub fn resolution(&self) -> Option<String> {
109        match (self.width, self.height) {
110            (Some(w), Some(h)) => Some(format!("{w}x{h}")),
111            _ => None,
112        }
113    }
114
115    /// Check if this is a video file.
116    #[must_use]
117    pub fn is_video(&self) -> bool {
118        self.width.is_some() && self.height.is_some()
119    }
120
121    /// Check if this is an audio file.
122    #[must_use]
123    pub fn is_audio(&self) -> bool {
124        self.sample_rate.is_some() && !self.is_video()
125    }
126
127    /// Calculate aspect ratio.
128    #[must_use]
129    pub fn aspect_ratio(&self) -> Option<f64> {
130        match (self.width, self.height) {
131            (Some(w), Some(h)) if h > 0 => Some(f64::from(w) / f64::from(h)),
132            _ => None,
133        }
134    }
135}
136
/// Per-signal similarity scores produced by comparing two metadata records.
#[derive(Clone, Debug)]
pub struct MetadataSimilarity {
    /// Similarity of the file stems (0.0-1.0), combining edit distance,
    /// token Jaccard, and bigram Dice coefficient.
    pub filename_similarity: f64,

    /// Similarity of the embedded title tags (0.0-1.0), computed with the
    /// media-aware [`crate::fuzzy_match::FilenameMatcher`].
    /// It is 0.0 whenever either file has no embedded title.
    pub title_fuzzy_score: f64,

    /// How closely the durations agree (0.0-1.0).
    pub duration_match: f64,

    /// How closely the resolutions agree (0.0-1.0).
    pub resolution_match: f64,

    /// How closely the codecs agree (0.0-1.0).
    pub codec_match: f64,

    /// How closely the file sizes agree (0.0-1.0).
    pub size_similarity: f64,

    /// How closely the container formats agree (0.0-1.0).
    pub container_match: f64,
}
164
165impl MetadataSimilarity {
166    /// Calculate overall similarity score.
167    ///
168    /// Weights (sum to 1.0):
169    ///
170    /// | Signal               | Weight |
171    /// |----------------------|--------|
172    /// | name_score           |  0.30  |
173    /// | duration_match       |  0.20  |
174    /// | resolution_match     |  0.20  |
175    /// | codec_match          |  0.15  |
176    /// | size_similarity      |  0.10  |
177    /// | container_match      |  0.05  |
178    ///
179    /// `name_score` is the maximum of `filename_similarity` and
180    /// `title_fuzzy_score`. When both embedded title tags are present,
181    /// `title_fuzzy_score` can lift the name signal above what the filename
182    /// alone would contribute (e.g., `"Movie (1080p)"` vs `"Movie (720p)"`).
183    /// When `title_fuzzy_score` is 0.0 (no title tags), the formula is
184    /// identical to the original 6-signal formula.
185    #[must_use]
186    pub fn overall_score(&self) -> f64 {
187        let name_score = self.filename_similarity.max(self.title_fuzzy_score);
188        name_score * 0.30
189            + self.duration_match * 0.20
190            + self.resolution_match * 0.20
191            + self.codec_match * 0.15
192            + self.size_similarity * 0.10
193            + self.container_match * 0.05
194    }
195
196    /// Check if metadata is similar above threshold.
197    #[must_use]
198    pub fn is_similar(&self, threshold: f64) -> bool {
199        self.overall_score() >= threshold
200    }
201}
202
203/// Compare two metadata objects.
204///
205/// When both files carry an embedded `title` tag, the title similarity is
206/// computed with [`crate::fuzzy_match::FilenameMatcher`] (which strips codec /
207/// resolution noise tokens). Otherwise `title_fuzzy_score` is 0.0 and the
208/// overall score is identical to the pre-title-field formula.
209#[must_use]
210pub fn compare_metadata(meta1: &MediaMetadata, meta2: &MediaMetadata) -> MetadataSimilarity {
211    let filename_similarity = compare_filenames(&meta1.filename(), &meta2.filename());
212    let title_fuzzy_score = compare_titles(meta1.title.as_deref(), meta2.title.as_deref());
213    let duration_match = compare_durations(meta1.duration, meta2.duration);
214    let resolution_match = compare_resolutions(meta1, meta2);
215    let codec_match = compare_codecs(meta1, meta2);
216    let size_similarity = compare_sizes(meta1.size, meta2.size);
217    let container_match = compare_containers(&meta1.container, &meta2.container);
218
219    MetadataSimilarity {
220        filename_similarity,
221        title_fuzzy_score,
222        duration_match,
223        resolution_match,
224        codec_match,
225        size_similarity,
226        container_match,
227    }
228}
229
230/// Compare embedded media title tags using the media-aware `FilenameMatcher`.
231///
232/// Returns 0.0 when either title is absent (the caller decides how much weight
233/// to give this signal).  Returns a value in 0.0–1.0 when both titles are set.
234#[must_use]
235pub fn compare_titles(title1: Option<&str>, title2: Option<&str>) -> f64 {
236    match (title1, title2) {
237        (Some(t1), Some(t2)) => {
238            if t1.eq_ignore_ascii_case(t2) {
239                return 1.0;
240            }
241            // Media-aware comparison strips codec / resolution noise tokens so
242            // that "Movie (1080p)" and "Movie (720p)" both resolve to "Movie".
243            let matcher = crate::fuzzy_match::FilenameMatcher::new(0.0);
244            matcher.similarity(t1, t2).value()
245        }
246        // One or both titles absent — treat as unknown / neutral.
247        _ => 0.0,
248    }
249}
250
251/// Compare filenames using a combination of Levenshtein edit distance,
252/// Jaccard token overlap, and bigram (Dice coefficient) similarity.
253///
254/// This multi-signal approach is robust against common filename variations:
255/// - `"My Video (1080p)"` vs `"my_video_1080p"` (separators, case)
256/// - `"vacation_2024_final"` vs `"vacation_2024_final_v2"` (suffixes)
257/// - `"clip_001"` vs `"clip_002"` (numeric increments)
258///
259/// The function uses two comparison strategies and returns the **maximum**
260/// of the two scores:
261///
262/// 1. **Raw comparison**: Normalized filenames (lowercase, collapsed separators)
263///    compared via edit distance (40%), token Jaccard (35%), bigram Dice (25%).
264/// 2. **Media-aware comparison**: Uses [`crate::fuzzy_match::FilenameMatcher`]
265///    which strips common noise tokens (resolution tags like "1080p", codec names
266///    like "x264", release markers like "BluRay") before comparing. This ensures
267///    that files like `"Movie.2024.1080p.x264.mkv"` and `"Movie.2024.720p.h265.mp4"`
268///    are recognized as the same content despite different encoding parameters.
269#[must_use]
270pub fn compare_filenames(name1: &str, name2: &str) -> f64 {
271    let norm1 = normalize_filename(name1);
272    let norm2 = normalize_filename(name2);
273
274    if norm1 == norm2 {
275        return 1.0;
276    }
277
278    if norm1.is_empty() || norm2.is_empty() {
279        return 0.0;
280    }
281
282    // Strategy 1: Raw comparison (original logic)
283    let raw_score = raw_filename_similarity(&norm1, &norm2);
284
285    // Strategy 2: Media-aware comparison using FilenameMatcher
286    // This strips codec tags, resolution markers, release-group tokens, etc.
287    let matcher = crate::fuzzy_match::FilenameMatcher::new(0.0); // threshold=0 to always get a score
288    let media_aware_score = matcher.similarity(name1, name2).value();
289
290    // Return the maximum of the two strategies so that either approach
291    // can detect duplicates that the other would miss.
292    raw_score.max(media_aware_score)
293}
294
295/// Raw filename similarity using edit distance, token Jaccard, and bigram Dice.
296fn raw_filename_similarity(norm1: &str, norm2: &str) -> f64 {
297    // 1. Levenshtein edit distance score
298    let distance = levenshtein_distance(norm1, norm2);
299    let max_len = norm1.len().max(norm2.len());
300    let edit_score = 1.0 - (distance as f64 / max_len as f64);
301
302    // 2. Token Jaccard index (bag-of-words overlap)
303    let tokens1 = tokenize_filename(norm1);
304    let tokens2 = tokenize_filename(norm2);
305    let token_score = if tokens1.is_empty() && tokens2.is_empty() {
306        1.0
307    } else {
308        let intersection = tokens1.intersection(&tokens2).count();
309        let union = tokens1.union(&tokens2).count();
310        if union == 0 {
311            0.0
312        } else {
313            intersection as f64 / union as f64
314        }
315    };
316
317    // 3. Bigram Dice coefficient
318    let bigrams1 = char_bigrams(norm1);
319    let bigrams2 = char_bigrams(norm2);
320    let bigram_score = if bigrams1.is_empty() && bigrams2.is_empty() {
321        1.0
322    } else {
323        let mut overlap = 0usize;
324        for (bg, count_a) in &bigrams1 {
325            if let Some(count_b) = bigrams2.get(bg) {
326                overlap += (*count_a).min(*count_b);
327            }
328        }
329        let total_a: usize = bigrams1.values().sum();
330        let total_b: usize = bigrams2.values().sum();
331        let denom = total_a + total_b;
332        if denom == 0 {
333            0.0
334        } else {
335            2.0 * overlap as f64 / denom as f64
336        }
337    };
338
339    // Weighted combination
340    edit_score * 0.40 + token_score * 0.35 + bigram_score * 0.25
341}
342
/// Canonicalizes a filename for comparison.
///
/// The name is lowercased, every run of non-alphanumeric characters
/// (underscore, dot, dash, whitespace, ...) acts as a separator, and the
/// remaining words are joined with single spaces.
fn normalize_filename(name: &str) -> String {
    let lowered = name.to_lowercase();
    let words: Vec<&str> = lowered
        .split(|c: char| !c.is_alphanumeric())
        // Adjacent separators produce empty pieces; drop them.
        .filter(|word| !word.is_empty())
        .collect();
    words.join(" ")
}
360
/// Splits a normalized filename on whitespace and returns the distinct words.
fn tokenize_filename(name: &str) -> std::collections::HashSet<String> {
    let mut words = std::collections::HashSet::new();
    for word in name.split_whitespace() {
        words.insert(word.to_owned());
    }
    words
}
365
/// Counts every pair of adjacent characters in `s`.
///
/// Strings shorter than two characters produce an empty map.
fn char_bigrams(s: &str) -> std::collections::HashMap<(char, char), usize> {
    // Pair each character with its successor.
    s.chars().zip(s.chars().skip(1)).fold(
        std::collections::HashMap::<(char, char), usize>::new(),
        |mut counts, pair| {
            *counts.entry(pair).or_insert(0) += 1;
            counts
        },
    )
}
377
/// Calculate Levenshtein distance between two strings.
///
/// The distance is the minimum number of single-character insertions,
/// deletions, and substitutions needed to turn `s1` into `s2`. Characters are
/// Unicode scalar values (`char`), not bytes.
///
/// Only two rows of the dynamic-programming table are kept, so memory use is
/// proportional to the length of `s2` rather than to the product of both
/// lengths.
fn levenshtein_distance(s1: &str, s2: &str) -> usize {
    let source: Vec<char> = s1.chars().collect();
    let target: Vec<char> = s2.chars().collect();

    if source.is_empty() {
        return target.len();
    }
    if target.is_empty() {
        return source.len();
    }

    // `previous[j]` = distance between the processed prefix of `source` and
    // the first `j` characters of `target`. Row 0 is the empty prefix.
    let mut previous: Vec<usize> = (0..=target.len()).collect();
    let mut current = vec![0usize; target.len() + 1];

    for (row, &source_char) in source.iter().enumerate() {
        // Turning `row + 1` characters into the empty string takes that many deletions.
        current[0] = row + 1;

        for (col, &target_char) in target.iter().enumerate() {
            let cost = usize::from(source_char != target_char);

            current[col + 1] = (previous[col + 1] + 1) // deletion
                .min(current[col] + 1) // insertion
                .min(previous[col] + cost); // substitution
        }

        std::mem::swap(&mut previous, &mut current);
    }

    // After the final swap the last computed row lives in `previous`.
    previous[target.len()]
}
420
/// Compare durations with tolerance.
///
/// Returns a score in `0.0..=1.0`:
///
/// - both known: 1.0 when they differ by at most 2% of the larger magnitude,
///   otherwise `1.0 - diff / larger`, clamped to the valid range;
/// - both unknown: 0.5;
/// - exactly one known: 0.0.
///
/// NaN or infinite durations are treated as a mismatch (0.0) so that corrupt
/// values can never yield a perfect or out-of-range score.
#[must_use]
pub fn compare_durations(dur1: Option<f64>, dur2: Option<f64>) -> f64 {
    match (dur1, dur2) {
        (Some(d1), Some(d2)) => {
            if !d1.is_finite() || !d2.is_finite() {
                return 0.0;
            }

            // Scale by magnitude so negative inputs cannot flip the sign of
            // the ratio and push the score above 1.0.
            let scale = d1.abs().max(d2.abs());
            if scale == 0.0 {
                // Both durations are zero: identical.
                return 1.0;
            }

            let diff = (d1 - d2).abs();
            let tolerance = scale * 0.02; // 2% tolerance

            if diff <= tolerance {
                1.0
            } else {
                (1.0 - diff / scale).clamp(0.0, 1.0)
            }
        }
        (None, None) => 0.5, // Unknown
        _ => 0.0,            // One has duration, other doesn't
    }
}
448
449/// Compare resolutions.
450#[must_use]
451pub fn compare_resolutions(meta1: &MediaMetadata, meta2: &MediaMetadata) -> f64 {
452    match ((meta1.width, meta1.height), (meta2.width, meta2.height)) {
453        ((Some(w1), Some(h1)), (Some(w2), Some(h2))) => {
454            if w1 == w2 && h1 == h2 {
455                1.0
456            } else {
457                // Compare aspect ratios
458                let ar1 = f64::from(w1) / f64::from(h1);
459                let ar2 = f64::from(w2) / f64::from(h2);
460
461                let ar_diff = (ar1 - ar2).abs();
462                if ar_diff < 0.01 {
463                    // Same aspect ratio, different resolution
464                    0.5
465                } else {
466                    0.0
467                }
468            }
469        }
470        ((None, None), (None, None)) => 0.5, // Both unknown
471        _ => 0.0,                            // One has resolution, other doesn't
472    }
473}
474
475/// Compare codecs.
476#[must_use]
477pub fn compare_codecs(meta1: &MediaMetadata, meta2: &MediaMetadata) -> f64 {
478    let video_match = compare_strings(&meta1.video_codec, &meta2.video_codec);
479    let audio_match = compare_strings(&meta1.audio_codec, &meta2.audio_codec);
480
481    // Average of video and audio codec matches
482    (video_match + audio_match) / 2.0
483}
484
/// Scores two optional strings against each other.
///
/// Returns 1.0 when both are present and equal ignoring ASCII case, 0.5 when
/// both are absent, and 0.0 in every other case.
fn compare_strings(s1: &Option<String>, s2: &Option<String>) -> f64 {
    match (s1.as_deref(), s2.as_deref()) {
        // Both unknown
        (None, None) => 0.5,
        (Some(left), Some(right)) if left.eq_ignore_ascii_case(right) => 1.0,
        // Different values, or only one side known
        _ => 0.0,
    }
}
499
/// Compare file sizes.
///
/// Returns 1.0 when the sizes differ by at most 5% of the larger one (or
/// both are zero); otherwise `1.0 - diff / larger`, floored at 0.0.
///
/// The difference is computed in unsigned arithmetic, so it is correct for
/// the full `u64` range and cannot overflow.
#[must_use]
pub fn compare_sizes(size1: u64, size2: u64) -> f64 {
    let larger = size1.max(size2);
    if larger == 0 {
        // Two empty files.
        return 1.0;
    }

    // `larger >= smaller`, so this subtraction never underflows.
    let diff = larger - size1.min(size2);
    let tolerance = (larger as f64 * 0.05) as u64; // 5% tolerance

    if diff <= tolerance {
        1.0
    } else {
        (1.0 - diff as f64 / larger as f64).max(0.0)
    }
}
521
/// Scores two optional container names against each other.
///
/// Returns 1.0 when both are present and equal ignoring ASCII case, 0.5 when
/// both are absent, and 0.0 otherwise (the same rules as `compare_strings`).
#[must_use]
pub fn compare_containers(cont1: &Option<String>, cont2: &Option<String>) -> f64 {
    match (cont1, cont2) {
        (None, None) => 0.5,
        (Some(left), Some(right)) if left.eq_ignore_ascii_case(right) => 1.0,
        _ => 0.0,
    }
}
527
528/// Extract metadata from file.
529///
530/// # Errors
531///
532/// Returns an error if the file cannot be read.
533pub fn extract_metadata(path: impl AsRef<Path>) -> DedupResult<MediaMetadata> {
534    let path = path.as_ref();
535
536    if !path.exists() {
537        return Err(DedupError::FileNotFound(path.to_path_buf()));
538    }
539
540    let file_metadata = std::fs::metadata(path)?;
541    let size = file_metadata.len();
542
543    let mut metadata = MediaMetadata::new(path.to_path_buf(), size);
544
545    // Get timestamps
546    if let Ok(created) = file_metadata.created() {
547        if let Ok(duration) = created.duration_since(std::time::UNIX_EPOCH) {
548            metadata.created = Some(duration.as_secs() as i64);
549        }
550    }
551
552    if let Ok(modified) = file_metadata.modified() {
553        if let Ok(duration) = modified.duration_since(std::time::UNIX_EPOCH) {
554            metadata.modified = Some(duration.as_secs() as i64);
555        }
556    }
557
558    // Set container based on file extension
559    let ext = path
560        .extension()
561        .and_then(|s| s.to_str())
562        .unwrap_or("")
563        .to_lowercase();
564
565    metadata.container = Some(ext);
566
567    // Extract detailed metadata using magic-byte detection
568    detect_format_from_magic(path, &mut metadata);
569
570    Ok(metadata)
571}
572
573/// Detect media format and codec information from file magic bytes.
574fn detect_format_from_magic(path: &Path, metadata: &mut MediaMetadata) {
575    use std::io::Read;
576
577    let mut file = match std::fs::File::open(path) {
578        Ok(f) => f,
579        Err(_) => return,
580    };
581
582    let mut buf = [0u8; 64];
583    let n = match file.read(&mut buf) {
584        Ok(n) => n,
585        Err(_) => return,
586    };
587
588    if n < 4 {
589        return;
590    }
591
592    let bytes = &buf[..n];
593
594    // EBML / Matroska / WebM: magic [0x1A, 0x45, 0xDF, 0xA3]
595    if bytes.starts_with(&[0x1A, 0x45, 0xDF, 0xA3]) {
596        // Look for "webm" string in the header area (bytes 4..32)
597        let search_range = &bytes[4..n.min(32)];
598        let is_webm = search_range.windows(4).any(|w| w == b"webm");
599        if is_webm {
600            metadata.container = Some("webm".to_string());
601        } else {
602            metadata.container = Some("mkv".to_string());
603        }
604        metadata.video_codec = Some("vp9".to_string());
605        metadata.audio_codec = Some("opus".to_string());
606        return;
607    }
608
609    // ftyp box: MP4 family — bytes[4..8] == b"ftyp"
610    if n >= 12 && &bytes[4..8] == b"ftyp" {
611        let brand = &bytes[8..12];
612        if brand == b"qt  " {
613            metadata.container = Some("mov".to_string());
614        } else if brand == b"M4A " {
615            metadata.container = Some("m4a".to_string());
616        } else if brand == b"M4V " {
617            metadata.container = Some("m4v".to_string());
618        } else {
619            metadata.container = Some("mp4".to_string());
620        }
621        metadata.video_codec = Some("h264".to_string());
622        metadata.audio_codec = Some("aac".to_string());
623        return;
624    }
625
626    // RIFF / WAV: b"RIFF" at start and bytes[8..12] == b"WAVE"
627    if n >= 12 && bytes.starts_with(b"RIFF") && &bytes[8..12] == b"WAVE" {
628        metadata.container = Some("wav".to_string());
629        metadata.audio_codec = Some("pcm".to_string());
630        // Parse WAV fmt chunk: channels at bytes 22..24 (u16 LE),
631        // sample_rate at bytes 24..28 (u32 LE).
632        if n >= 28 {
633            let channels = u16::from_le_bytes([bytes[22], bytes[23]]);
634            let sample_rate = u32::from_le_bytes([bytes[24], bytes[25], bytes[26], bytes[27]]);
635            if channels > 0 {
636                metadata.channels = Some(channels);
637            }
638            if sample_rate > 0 {
639                metadata.sample_rate = Some(sample_rate);
640            }
641        }
642        return;
643    }
644
645    // FLAC: b"fLaC" at start
646    if bytes.starts_with(b"fLaC") {
647        metadata.container = Some("flac".to_string());
648        metadata.audio_codec = Some("flac".to_string());
649        // STREAMINFO block begins at byte 4.
650        // Bytes 4: block type/last-metadata-block flag (0x00 = STREAMINFO, not last).
651        // Sample rate is stored as a 20-bit big-endian field starting at offset 18
652        // within the STREAMINFO data (i.e., file byte 4+4+14 = 22).
653        // Layout: bytes[8..10] = min block size, bytes[10..12] = max block size,
654        //         bytes[12..15] = min frame size (24-bit),
655        //         bytes[15..18] = max frame size (24-bit),
656        //         bytes[18..21] = sample_rate (20 bits) | channels (3 bits) | bits/sample-1 (5 bits)
657        // We need offset 18 within STREAMINFO which is file offset 4 (fLaC) + 4 (block header) + 18 = 26.
658        if n >= 29 {
659            // sample_rate is top 20 bits of 3 bytes at file offsets 26,27,28
660            let b0 = bytes[26] as u32;
661            let b1 = bytes[27] as u32;
662            let b2 = bytes[28] as u32;
663            let sample_rate = (b0 << 12) | (b1 << 4) | (b2 >> 4);
664            if sample_rate > 0 {
665                metadata.sample_rate = Some(sample_rate);
666            }
667            // channels: bits [3:1] of byte 28 (3 bits, value+1)
668            let channels = ((b2 >> 1) & 0x07) + 1;
669            metadata.channels = Some(channels as u16);
670        }
671        return;
672    }
673
674    // OGG: b"OggS" at start
675    if bytes.starts_with(b"OggS") {
676        metadata.container = Some("ogg".to_string());
677        // Check for Opus or Vorbis identification in the first page payload.
678        // The first page data typically starts around byte 28.
679        let page_data = if n > 28 { &bytes[28..] } else { &bytes[4..] };
680        if page_data.windows(8).any(|w| w == b"OpusHead") {
681            metadata.audio_codec = Some("opus".to_string());
682        } else if page_data
683            .windows(7)
684            .any(|w| w == b"\x01vorbis" || w == b"\x03vorbis")
685        {
686            metadata.audio_codec = Some("vorbis".to_string());
687        } else if page_data.windows(6).any(|w| w == b"vorbis") {
688            metadata.audio_codec = Some("vorbis".to_string());
689        } else {
690            metadata.audio_codec = Some("vorbis".to_string());
691        }
692        return;
693    }
694
695    // MPEG-TS: sync byte 0x47 at intervals of 188 bytes
696    if n >= 1 && bytes[0] == 0x47 {
697        // Check if additional sync bytes appear at expected 188-byte intervals.
698        let is_ts = (n >= 189 && bytes[188] == 0x47) || (n >= 1 && bytes[0] == 0x47 && n < 189);
699        if is_ts {
700            metadata.container = Some("ts".to_string());
701            metadata.video_codec = Some("h264".to_string());
702            metadata.audio_codec = Some("aac".to_string());
703            return;
704        }
705    }
706
707    // ID3 tag (MP3): b"ID3"
708    if bytes.starts_with(b"ID3") {
709        metadata.container = Some("mp3".to_string());
710        metadata.audio_codec = Some("mp3".to_string());
711        return;
712    }
713
714    // MP3 sync word: 0xFF followed by 0xE0..=0xFF
715    if n >= 2 && bytes[0] == 0xFF && bytes[1] >= 0xE0 {
716        metadata.container = Some("mp3".to_string());
717        metadata.audio_codec = Some("mp3".to_string());
718    }
719}
720
721/// Find potential duplicates based on metadata.
722#[must_use]
723pub fn find_metadata_duplicates(
724    metadata_list: &[MediaMetadata],
725    threshold: f64,
726) -> Vec<Vec<usize>> {
727    let mut groups = Vec::new();
728    let mut processed = vec![false; metadata_list.len()];
729
730    for i in 0..metadata_list.len() {
731        if processed[i] {
732            continue;
733        }
734
735        let mut group = vec![i];
736
737        for j in (i + 1)..metadata_list.len() {
738            if processed[j] {
739                continue;
740            }
741
742            let similarity = compare_metadata(&metadata_list[i], &metadata_list[j]);
743
744            if similarity.is_similar(threshold) {
745                group.push(j);
746                processed[j] = true;
747            }
748        }
749
750        if group.len() > 1 {
751            groups.push(group);
752        }
753
754        processed[i] = true;
755    }
756
757    groups
758}
759
760/// Fuzzy search for similar filenames.
761#[must_use]
762pub fn fuzzy_search(query: &str, candidates: &[String], threshold: f64) -> Vec<(usize, f64)> {
763    let mut results = Vec::new();
764
765    for (i, candidate) in candidates.iter().enumerate() {
766        let similarity = compare_filenames(query, candidate);
767
768        if similarity >= threshold {
769            results.push((i, similarity));
770        }
771    }
772
773    // Sort by similarity (highest first)
774    results.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
775
776    results
777}
778
779/// Calculate metadata quality score (completeness).
780#[must_use]
781pub fn metadata_quality(metadata: &MediaMetadata) -> f64 {
782    let mut score = 0.0;
783    let mut total = 0.0;
784
785    // Check each field
786    total += 1.0;
787    if metadata.duration.is_some() {
788        score += 1.0;
789    }
790
791    total += 1.0;
792    if metadata.width.is_some() && metadata.height.is_some() {
793        score += 1.0;
794    }
795
796    total += 1.0;
797    if metadata.bitrate.is_some() {
798        score += 1.0;
799    }
800
801    total += 1.0;
802    if metadata.framerate.is_some() || metadata.sample_rate.is_some() {
803        score += 1.0;
804    }
805
806    total += 1.0;
807    if metadata.video_codec.is_some() || metadata.audio_codec.is_some() {
808        score += 1.0;
809    }
810
811    total += 1.0;
812    if metadata.container.is_some() {
813        score += 1.0;
814    }
815
816    score / total
817}
818
819#[cfg(test)]
820mod tests {
821    use super::*;
822
823    fn create_test_metadata(name: &str, duration: f64, width: u32, height: u32) -> MediaMetadata {
824        let mut meta = MediaMetadata::new(PathBuf::from(name), 1000000);
825        meta.duration = Some(duration);
826        meta.width = Some(width);
827        meta.height = Some(height);
828        meta
829    }
830
831    #[test]
832    fn test_metadata_creation() {
833        let meta = MediaMetadata::new(PathBuf::from("test.mp4"), 1000);
834        assert_eq!(meta.size, 1000);
835        assert_eq!(meta.extension(), "mp4");
836    }
837
838    #[test]
839    fn test_filename_extraction() {
840        let meta = MediaMetadata::new(PathBuf::from("/path/to/video.mp4"), 1000);
841        assert_eq!(meta.filename(), "video");
842        assert_eq!(meta.extension(), "mp4");
843    }
844
845    #[test]
846    fn test_resolution() {
847        let mut meta = MediaMetadata::new(PathBuf::from("test.mp4"), 1000);
848        meta.width = Some(1920);
849        meta.height = Some(1080);
850
851        assert_eq!(meta.resolution(), Some("1920x1080".to_string()));
852        assert!(meta.is_video());
853    }
854
855    #[test]
856    fn test_aspect_ratio() {
857        let mut meta = MediaMetadata::new(PathBuf::from("test.mp4"), 1000);
858        meta.width = Some(1920);
859        meta.height = Some(1080);
860
861        let ar = meta.aspect_ratio().expect("operation should succeed");
862        assert!((ar - 16.0 / 9.0).abs() < 0.01);
863    }
864
865    #[test]
866    fn test_filename_comparison() {
867        assert_eq!(compare_filenames("video", "video"), 1.0);
868        assert!(compare_filenames("video1", "video2") > 0.5);
869        assert!(compare_filenames("test", "completely_different") < 0.5);
870
871        // Case insensitive
872        assert_eq!(compare_filenames("VIDEO", "video"), 1.0);
873
874        // Special characters ignored
875        assert_eq!(compare_filenames("my-video", "my_video"), 1.0);
876    }
877
878    #[test]
879    fn test_levenshtein_distance() {
880        assert_eq!(levenshtein_distance("", ""), 0);
881        assert_eq!(levenshtein_distance("abc", "abc"), 0);
882        assert_eq!(levenshtein_distance("abc", "ab"), 1);
883        assert_eq!(levenshtein_distance("abc", "def"), 3);
884        assert_eq!(levenshtein_distance("kitten", "sitting"), 3);
885    }
886
887    #[test]
888    fn test_duration_comparison() {
889        assert_eq!(compare_durations(Some(100.0), Some(100.0)), 1.0);
890        assert!(compare_durations(Some(100.0), Some(101.0)) > 0.9); // Within tolerance
891        assert!(compare_durations(Some(100.0), Some(200.0)) < 0.9);
892        assert_eq!(compare_durations(None, None), 0.5);
893        assert_eq!(compare_durations(Some(100.0), None), 0.0);
894    }
895
896    #[test]
897    fn test_resolution_comparison() {
898        let meta1 = create_test_metadata("video1.mp4", 100.0, 1920, 1080);
899        let meta2 = create_test_metadata("video2.mp4", 100.0, 1920, 1080);
900        let meta3 = create_test_metadata("video3.mp4", 100.0, 1280, 720);
901        let meta4 = create_test_metadata("video4.mp4", 100.0, 3840, 2160);
902
903        assert_eq!(compare_resolutions(&meta1, &meta2), 1.0); // Same resolution
904        assert_eq!(compare_resolutions(&meta1, &meta4), 0.5); // Same aspect ratio
905        assert_eq!(compare_resolutions(&meta1, &meta3), 0.5); // Same aspect ratio
906    }
907
908    #[test]
909    fn test_size_comparison() {
910        assert_eq!(compare_sizes(1000, 1000), 1.0);
911        assert!(compare_sizes(1000, 1040) > 0.9); // Within 5% tolerance
912        assert!(compare_sizes(1000, 2000) < 0.9);
913    }
914
915    #[test]
916    fn test_codec_comparison() {
917        let mut meta1 = create_test_metadata("video1.mp4", 100.0, 1920, 1080);
918        meta1.video_codec = Some("av1".to_string());
919        meta1.audio_codec = Some("opus".to_string());
920
921        let mut meta2 = create_test_metadata("video2.mp4", 100.0, 1920, 1080);
922        meta2.video_codec = Some("av1".to_string());
923        meta2.audio_codec = Some("opus".to_string());
924
925        let mut meta3 = create_test_metadata("video3.mp4", 100.0, 1920, 1080);
926        meta3.video_codec = Some("vp9".to_string());
927        meta3.audio_codec = Some("opus".to_string());
928
929        let match12 = compare_codecs(&meta1, &meta2);
930        let match13 = compare_codecs(&meta1, &meta3);
931
932        assert_eq!(match12, 1.0); // Same codecs
933        assert_eq!(match13, 0.5); // One codec different
934    }
935
936    #[test]
937    fn test_metadata_similarity() {
938        let meta1 = create_test_metadata("video_clip.mp4", 100.0, 1920, 1080);
939        let meta2 = create_test_metadata("video_clip_copy.mp4", 100.0, 1920, 1080);
940
941        let similarity = compare_metadata(&meta1, &meta2);
942
943        assert!(similarity.filename_similarity > 0.6);
944        assert_eq!(similarity.duration_match, 1.0);
945        assert_eq!(similarity.resolution_match, 1.0);
946        assert!(similarity.is_similar(0.8));
947    }
948
949    #[test]
950    fn test_fuzzy_search() {
951        let candidates = vec![
952            "video_clip.mp4".to_string(),
953            "audio_track.mp3".to_string(),
954            "video_clip_2.mp4".to_string(),
955            "completely_different.mov".to_string(),
956        ];
957
958        let results = fuzzy_search("video clip", &candidates, 0.5);
959
960        assert!(!results.is_empty());
961        assert!(results[0].1 > 0.5); // First result should be most similar
962    }
963
964    #[test]
965    fn test_metadata_quality() {
966        let mut meta = MediaMetadata::new(PathBuf::from("test.mp4"), 1000);
967        assert!(metadata_quality(&meta) < 0.2); // Minimal metadata
968
969        meta.duration = Some(100.0);
970        meta.width = Some(1920);
971        meta.height = Some(1080);
972        meta.bitrate = Some(5000000);
973        meta.framerate = Some(30.0);
974        meta.video_codec = Some("av1".to_string());
975        meta.container = Some("mp4".to_string());
976
977        assert!(metadata_quality(&meta) > 0.9); // Complete metadata
978    }
979
980    #[test]
981    fn test_find_metadata_duplicates() {
982        let metadata_list = vec![
983            create_test_metadata("video1.mp4", 100.0, 1920, 1080),
984            create_test_metadata("video1_copy.mp4", 100.0, 1920, 1080),
985            create_test_metadata("video2.mp4", 200.0, 1280, 720),
986            create_test_metadata("video1_copy2.mp4", 100.0, 1920, 1080),
987        ];
988
989        let groups = find_metadata_duplicates(&metadata_list, 0.8);
990
991        assert_eq!(groups.len(), 1); // One group of duplicates
992        assert!(groups[0].len() >= 2); // At least two files in the group
993    }
994
995    // ---- Media-aware filename comparison tests ----
996
997    #[test]
998    fn test_filename_comparison_strips_codec_tags() {
999        // Same movie, different encoding parameters
1000        let score = compare_filenames(
1001            "The.Movie.2024.1080p.x264.mkv",
1002            "The.Movie.2024.720p.x265.mp4",
1003        );
1004        assert!(
1005            score > 0.8,
1006            "Same movie with different codecs/resolutions should score > 0.8, got {score}"
1007        );
1008    }
1009
1010    #[test]
1011    fn test_filename_comparison_strips_release_markers() {
1012        let score = compare_filenames(
1013            "Movie.Title.2024.BluRay.REMUX.mkv",
1014            "Movie.Title.2024.WEB-DL.mp4",
1015        );
1016        assert!(
1017            score > 0.8,
1018            "Same movie with different release types should score > 0.8, got {score}"
1019        );
1020    }
1021
1022    #[test]
1023    fn test_filename_comparison_different_content() {
1024        // Completely different movies should score low
1025        let score = compare_filenames("Inception.2010.1080p.mkv", "Interstellar.2014.720p.mp4");
1026        assert!(
1027            score < 0.8,
1028            "Different movies should score < 0.8, got {score}"
1029        );
1030    }
1031
1032    #[test]
1033    fn test_filename_comparison_uhd_vs_hd() {
1034        let score = compare_filenames(
1035            "Documentary.2024.2160p.HDR.mkv",
1036            "Documentary.2024.1080p.SDR.mp4",
1037        );
1038        assert!(
1039            score > 0.7,
1040            "Same content at different quality tiers should be similar, got {score}"
1041        );
1042    }
1043
1044    #[test]
1045    fn test_filename_comparison_audio_codecs_stripped() {
1046        let score = compare_filenames("Concert.2024.FLAC.mkv", "Concert.2024.AAC.mp4");
1047        assert!(
1048            score > 0.8,
1049            "Same content with different audio codecs should match, got {score}"
1050        );
1051    }
1052
1053    #[test]
1054    fn test_metadata_comparison_uses_fuzzy_filename() {
1055        let mut meta1 = create_test_metadata("Movie.2024.1080p.x264.BluRay.mkv", 100.0, 1920, 1080);
1056        meta1.video_codec = Some("h264".to_string());
1057        meta1.audio_codec = Some("aac".to_string());
1058
1059        let mut meta2 = create_test_metadata("Movie.2024.720p.x265.WEB-DL.mp4", 100.0, 1280, 720);
1060        meta2.video_codec = Some("h265".to_string());
1061        meta2.audio_codec = Some("opus".to_string());
1062
1063        let sim = compare_metadata(&meta1, &meta2);
1064        // filename_similarity should be high due to media-aware stripping
1065        assert!(
1066            sim.filename_similarity > 0.7,
1067            "Media-aware filename comparison should score > 0.7, got {}",
1068            sim.filename_similarity
1069        );
1070    }
1071
1072    // ---- title_fuzzy_score tests ----
1073
1074    #[test]
1075    fn test_metadata_fuzzy_similar_titles() {
1076        let mut meta1 = create_test_metadata("video_a.mp4", 100.0, 1920, 1080);
1077        meta1.title = Some("The Great Documentary 2024".to_string());
1078
1079        let mut meta2 = create_test_metadata("video_b.mp4", 100.0, 1920, 1080);
1080        meta2.title = Some("the great documentary 2024".to_string());
1081
1082        let sim = compare_metadata(&meta1, &meta2);
1083        // Identical-modulo-case titles should yield title_fuzzy_score == 1.0
1084        assert!(
1085            sim.title_fuzzy_score >= 0.9,
1086            "Near-identical titles (case difference) should score >= 0.9, got {}",
1087            sim.title_fuzzy_score
1088        );
1089        // And the overall_score uses the maximum of filename and title signals
1090        assert!(
1091            sim.overall_score() >= 0.5,
1092            "Overall score should be raised by the high title signal, got {}",
1093            sim.overall_score()
1094        );
1095    }
1096
1097    #[test]
1098    fn test_metadata_fuzzy_different_titles() {
1099        let mut meta1 = create_test_metadata("video_a.mp4", 100.0, 1920, 1080);
1100        meta1.title = Some("Inception".to_string());
1101
1102        let mut meta2 = create_test_metadata("video_b.mp4", 100.0, 1920, 1080);
1103        meta2.title = Some("Interstellar".to_string());
1104
1105        let sim = compare_metadata(&meta1, &meta2);
1106        assert!(
1107            sim.title_fuzzy_score < 0.7,
1108            "Clearly different titles should score < 0.7, got {}",
1109            sim.title_fuzzy_score
1110        );
1111    }
1112
1113    #[test]
1114    fn test_metadata_no_title_title_score_is_zero() {
1115        let meta1 = create_test_metadata("video_a.mp4", 100.0, 1920, 1080);
1116        let meta2 = create_test_metadata("video_b.mp4", 100.0, 1920, 1080);
1117        let sim = compare_metadata(&meta1, &meta2);
1118        assert_eq!(
1119            sim.title_fuzzy_score, 0.0,
1120            "Missing title tags should produce title_fuzzy_score of 0.0"
1121        );
1122    }
1123
1124    #[test]
1125    fn test_compare_titles_identical_case_insensitive() {
1126        assert_eq!(
1127            compare_titles(Some("Hello World"), Some("hello world")),
1128            1.0
1129        );
1130    }
1131
1132    #[test]
1133    fn test_compare_titles_none_returns_zero() {
1134        assert_eq!(compare_titles(None, Some("title")), 0.0);
1135        assert_eq!(compare_titles(Some("title"), None), 0.0);
1136        assert_eq!(compare_titles(None, None), 0.0);
1137    }
1138
1139    #[test]
1140    fn test_compare_titles_similar_strips_noise() {
1141        // "Movie 1080p" vs "Movie 720p" — differ only in resolution tag
1142        let score = compare_titles(Some("The Movie 1080p"), Some("The Movie 720p"));
1143        assert!(
1144            score > 0.7,
1145            "Title comparison should strip resolution noise, got {score}"
1146        );
1147    }
1148}