oximedia_dedup/
fuzzy_match.rs

1#![allow(dead_code)]
2
3//! Fuzzy / approximate matching for media deduplication.
4//!
5//! This module provides edit-distance and similarity metrics that detect
6//! near-duplicate media by comparing fingerprints, metadata strings, or
7//! byte sequences that may differ slightly due to re-encoding, cropping,
8//! or metadata edits.
9//!
10//! # Key Types
11//!
12//! - [`EditDistance`] - Levenshtein edit distance calculator
13//! - [`FuzzyScore`] - A normalised similarity score (0.0 to 1.0)
14//! - [`TokenMatcher`] - Token-based (bag-of-words) similarity
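//! - [`BigramSimilarity`] - Character bigram overlap metric
//! - [`FilenameMatcher`] - Noise-tolerant filename / title matcher combining the metrics above
//!
//! The free functions [`hamming_distance`] and [`hamming_similarity`] compare
//! equal-length byte slices position by position.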
16
17use std::collections::{HashMap, HashSet};
18use std::fmt;
19
/// A normalised similarity score in the range `[0.0, 1.0]`.
///
/// - `1.0` means an exact match.
/// - `0.0` means completely dissimilar.
///
/// The stored value is never NaN: [`FuzzyScore::new`] maps NaN to `0.0`.
#[derive(Debug, Clone, Copy)]
pub struct FuzzyScore {
    /// The raw score value, always within `[0.0, 1.0]`.
    value: f64,
}

impl FuzzyScore {
    /// Create a new score, clamping to `[0.0, 1.0]`.
    ///
    /// NaN is stored as `0.0` (no similarity). `f64::clamp` returns NaN for a
    /// NaN input, which would otherwise break the range invariant and make
    /// every later comparison silently false.
    #[must_use]
    pub fn new(value: f64) -> Self {
        let value = if value.is_nan() {
            0.0
        } else {
            value.clamp(0.0, 1.0)
        };
        Self { value }
    }

    /// Return the score value.
    #[must_use]
    pub fn value(self) -> f64 {
        self.value
    }

    /// Check whether the score meets a given threshold (inclusive).
    #[must_use]
    pub fn meets_threshold(self, threshold: f64) -> bool {
        self.value >= threshold
    }

    /// Exact match (score within `f64::EPSILON` of 1.0).
    #[must_use]
    pub fn is_exact(self) -> bool {
        (self.value - 1.0).abs() < f64::EPSILON
    }

    /// Combine two scores by averaging.
    #[must_use]
    pub fn average(self, other: Self) -> Self {
        Self::new((self.value + other.value) / 2.0)
    }

    /// Combine any number of `(score, weight)` pairs using a weighted average.
    ///
    /// Entries whose weight is NaN, infinite, zero or negative are ignored,
    /// because they have no meaningful contribution to an average and would
    /// otherwise poison the result. Returns a score of `0.0` when no entry
    /// with a usable weight remains (including an empty slice).
    #[must_use]
    pub fn weighted_average(scores: &[(Self, f64)]) -> Self {
        let (weighted_sum, total_weight) = scores
            .iter()
            .filter(|(_, weight)| weight.is_finite() && *weight > 0.0)
            .fold((0.0_f64, 0.0_f64), |(sum, total), (score, weight)| {
                (sum + score.value * weight, total + weight)
            });

        // `total_weight` can still overflow to infinity for huge weights.
        if total_weight > 0.0 && total_weight.is_finite() {
            Self::new(weighted_sum / total_weight)
        } else {
            Self::new(0.0)
        }
    }
}

impl fmt::Display for FuzzyScore {
    /// Format the score with four decimal places.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{:.4}", self.value)
    }
}

impl PartialEq for FuzzyScore {
    /// Two scores are equal when they differ by less than `1e-10`.
    fn eq(&self, other: &Self) -> bool {
        (self.value - other.value).abs() < 1e-10
    }
}
90
91/// Levenshtein edit distance calculator.
92pub struct EditDistance;
93
94impl EditDistance {
95    /// Compute the Levenshtein distance between two byte slices.
96    #[must_use]
97    pub fn bytes(a: &[u8], b: &[u8]) -> usize {
98        let m = a.len();
99        let n = b.len();
100
101        if m == 0 {
102            return n;
103        }
104        if n == 0 {
105            return m;
106        }
107
108        // Use single-row optimisation
109        let mut prev = vec![0usize; n + 1];
110        let mut curr = vec![0usize; n + 1];
111
112        for j in 0..=n {
113            prev[j] = j;
114        }
115
116        for i in 1..=m {
117            curr[0] = i;
118            for j in 1..=n {
119                let cost = if a[i - 1] == b[j - 1] { 0 } else { 1 };
120                curr[j] = (prev[j] + 1).min(curr[j - 1] + 1).min(prev[j - 1] + cost);
121            }
122            std::mem::swap(&mut prev, &mut curr);
123        }
124
125        prev[n]
126    }
127
128    /// Compute the Levenshtein distance between two strings.
129    #[must_use]
130    pub fn strings(a: &str, b: &str) -> usize {
131        let a_chars: Vec<char> = a.chars().collect();
132        let b_chars: Vec<char> = b.chars().collect();
133        let m = a_chars.len();
134        let n = b_chars.len();
135
136        if m == 0 {
137            return n;
138        }
139        if n == 0 {
140            return m;
141        }
142
143        let mut prev = vec![0usize; n + 1];
144        let mut curr = vec![0usize; n + 1];
145
146        for j in 0..=n {
147            prev[j] = j;
148        }
149
150        for i in 1..=m {
151            curr[0] = i;
152            for j in 1..=n {
153                let cost = if a_chars[i - 1] == b_chars[j - 1] {
154                    0
155                } else {
156                    1
157                };
158                curr[j] = (prev[j] + 1).min(curr[j - 1] + 1).min(prev[j - 1] + cost);
159            }
160            std::mem::swap(&mut prev, &mut curr);
161        }
162
163        prev[n]
164    }
165
166    /// Convert edit distance to a normalised similarity score.
167    #[must_use]
168    #[allow(clippy::cast_precision_loss)]
169    pub fn similarity(a: &str, b: &str) -> FuzzyScore {
170        let dist = Self::strings(a, b);
171        let max_len = a.chars().count().max(b.chars().count());
172        if max_len == 0 {
173            return FuzzyScore::new(1.0);
174        }
175        FuzzyScore::new(1.0 - dist as f64 / max_len as f64)
176    }
177}
178
179/// Token-based (bag-of-words) similarity.
180///
181/// Computes the Jaccard index of the token sets extracted from two strings.
182pub struct TokenMatcher {
183    /// Separator characters used for tokenisation.
184    separators: Vec<char>,
185    /// Whether to compare case-insensitively.
186    case_insensitive: bool,
187}
188
189impl TokenMatcher {
190    /// Create a new token matcher with default settings.
191    #[must_use]
192    pub fn new() -> Self {
193        Self {
194            separators: vec![' ', '-', '_', '.', ',', ';', '/', '\\'],
195            case_insensitive: true,
196        }
197    }
198
199    /// Set whether comparison is case-insensitive.
200    #[must_use]
201    pub fn case_insensitive(mut self, yes: bool) -> Self {
202        self.case_insensitive = yes;
203        self
204    }
205
206    /// Tokenise a string into a set of tokens.
207    fn tokenize(&self, s: &str) -> HashSet<String> {
208        let input = if self.case_insensitive {
209            s.to_lowercase()
210        } else {
211            s.to_string()
212        };
213
214        let mut tokens = HashSet::new();
215        let mut current = String::new();
216
217        for ch in input.chars() {
218            if self.separators.contains(&ch) {
219                if !current.is_empty() {
220                    tokens.insert(std::mem::take(&mut current));
221                }
222            } else {
223                current.push(ch);
224            }
225        }
226        if !current.is_empty() {
227            tokens.insert(current);
228        }
229
230        tokens
231    }
232
233    /// Compute the Jaccard similarity between two strings.
234    #[must_use]
235    #[allow(clippy::cast_precision_loss)]
236    pub fn similarity(&self, a: &str, b: &str) -> FuzzyScore {
237        let set_a = self.tokenize(a);
238        let set_b = self.tokenize(b);
239
240        if set_a.is_empty() && set_b.is_empty() {
241            return FuzzyScore::new(1.0);
242        }
243
244        let intersection = set_a.intersection(&set_b).count();
245        let union = set_a.union(&set_b).count();
246
247        if union == 0 {
248            FuzzyScore::new(0.0)
249        } else {
250            FuzzyScore::new(intersection as f64 / union as f64)
251        }
252    }
253}
254
255impl Default for TokenMatcher {
256    fn default() -> Self {
257        Self::new()
258    }
259}
260
261/// Character bigram overlap metric (Dice coefficient).
262pub struct BigramSimilarity;
263
264impl BigramSimilarity {
265    /// Extract character bigrams from a string.
266    fn bigrams(s: &str) -> HashMap<(char, char), usize> {
267        let chars: Vec<char> = s.chars().collect();
268        let mut map = HashMap::new();
269        if chars.len() < 2 {
270            return map;
271        }
272        for pair in chars.windows(2) {
273            *map.entry((pair[0], pair[1])).or_insert(0) += 1;
274        }
275        map
276    }
277
278    /// Compute the Dice coefficient between two strings.
279    #[must_use]
280    #[allow(clippy::cast_precision_loss)]
281    pub fn similarity(a: &str, b: &str) -> FuzzyScore {
282        let bg_a = Self::bigrams(&a.to_lowercase());
283        let bg_b = Self::bigrams(&b.to_lowercase());
284
285        if bg_a.is_empty() && bg_b.is_empty() {
286            return FuzzyScore::new(1.0);
287        }
288
289        let mut intersection_count: usize = 0;
290        for (bigram, count_a) in &bg_a {
291            if let Some(count_b) = bg_b.get(bigram) {
292                intersection_count += (*count_a).min(*count_b);
293            }
294        }
295
296        let total_a: usize = bg_a.values().sum();
297        let total_b: usize = bg_b.values().sum();
298        let denom = total_a + total_b;
299
300        if denom == 0 {
301            FuzzyScore::new(0.0)
302        } else {
303            FuzzyScore::new(2.0 * intersection_count as f64 / denom as f64)
304        }
305    }
306}
307
/// Byte-wise Hamming distance between two slices of equal length.
///
/// The result is the number of indices at which the two slices hold
/// different bytes. Slices of unequal length have no Hamming distance, so
/// `None` is returned for them.
#[must_use]
pub fn hamming_distance(a: &[u8], b: &[u8]) -> Option<usize> {
    (a.len() == b.len()).then(|| {
        a.iter()
            .zip(b)
            .filter(|(left, right)| left != right)
            .count()
    })
}
319
320/// Normalised Hamming similarity (1.0 = identical, 0.0 = all bits differ).
321#[must_use]
322#[allow(clippy::cast_precision_loss)]
323pub fn hamming_similarity(a: &[u8], b: &[u8]) -> Option<FuzzyScore> {
324    let dist = hamming_distance(a, b)?;
325    let len = a.len();
326    if len == 0 {
327        return Some(FuzzyScore::new(1.0));
328    }
329    Some(FuzzyScore::new(1.0 - dist as f64 / len as f64))
330}
331
332// ---------------------------------------------------------------------------
333// Filename / title matching for media deduplication
334// ---------------------------------------------------------------------------
335
336/// Normalized filename / title matcher for media deduplication.
337///
338/// Strips common media-file noise (resolution tags, codec names, release-group
339/// markers, punctuation) and computes a combined similarity from Levenshtein
340/// edit distance, token Jaccard, and bigram Dice coefficient.
341pub struct FilenameMatcher {
342    /// Weight for edit-distance similarity (0.0–1.0).
343    edit_weight: f64,
344    /// Weight for token Jaccard similarity (0.0–1.0).
345    token_weight: f64,
346    /// Weight for bigram Dice similarity (0.0–1.0).
347    bigram_weight: f64,
348    /// Minimum combined score to consider a match.
349    threshold: f64,
350}
351
352impl FilenameMatcher {
353    /// Create a new matcher with default weights.
354    #[must_use]
355    pub fn new(threshold: f64) -> Self {
356        Self {
357            edit_weight: 0.4,
358            token_weight: 0.35,
359            bigram_weight: 0.25,
360            threshold: threshold.clamp(0.0, 1.0),
361        }
362    }
363
364    /// Create with custom weights.  Weights are normalized internally.
365    #[must_use]
366    pub fn with_weights(threshold: f64, edit_w: f64, token_w: f64, bigram_w: f64) -> Self {
367        let total = edit_w + token_w + bigram_w;
368        let (ew, tw, bw) = if total <= 0.0 {
369            (1.0 / 3.0, 1.0 / 3.0, 1.0 / 3.0)
370        } else {
371            (edit_w / total, token_w / total, bigram_w / total)
372        };
373        Self {
374            edit_weight: ew,
375            token_weight: tw,
376            bigram_weight: bw,
377            threshold: threshold.clamp(0.0, 1.0),
378        }
379    }
380
381    /// Normalize a filename/title for comparison.
382    ///
383    /// Strips extension, converts to lowercase, removes common noise tokens
384    /// (resolution tags, codec names, quality markers), and collapses whitespace.
385    #[must_use]
386    pub fn normalize(name: &str) -> String {
387        // Strip directory components – keep only the filename.
388        let base = name.rsplit(['/', '\\']).next().unwrap_or(name);
389
390        // Strip extension
391        let stem = base.rsplit_once('.').map_or(base, |(s, _)| s);
392
393        let lower = stem.to_lowercase();
394
395        // Remove common noise tokens (resolution, codec, quality, release)
396        let noise: &[&str] = &[
397            "1080p", "720p", "480p", "2160p", "4k", "uhd", "hdr", "hdr10", "x264", "x265", "h264",
398            "h265", "hevc", "avc", "vp9", "av1", "aac", "ac3", "dts", "flac", "opus", "mp3",
399            "bluray", "bdrip", "brrip", "webrip", "web-dl", "webdl", "dvdrip", "remux", "remaster",
400            "proper", "repack", "mkv", "mp4", "avi", "mov", "wmv", "webm",
401        ];
402
403        let mut cleaned = lower;
404        for &tag in noise {
405            // Replace noise tokens bounded by non-alphanumerics or start/end
406            cleaned = remove_noise_token(&cleaned, tag);
407        }
408
409        // Replace common separators with spaces
410        let normalized: String = cleaned
411            .chars()
412            .map(|c| {
413                if c.is_alphanumeric() || c == ' ' {
414                    c
415                } else {
416                    ' '
417                }
418            })
419            .collect();
420
421        // Collapse whitespace
422        let parts: Vec<&str> = normalized.split_whitespace().collect();
423        parts.join(" ")
424    }
425
426    /// Compute the combined similarity score between two filenames/titles.
427    #[must_use]
428    pub fn similarity(&self, name_a: &str, name_b: &str) -> FuzzyScore {
429        let norm_a = Self::normalize(name_a);
430        let norm_b = Self::normalize(name_b);
431
432        if norm_a.is_empty() && norm_b.is_empty() {
433            return FuzzyScore::new(1.0);
434        }
435
436        let edit_sim = EditDistance::similarity(&norm_a, &norm_b);
437        let token_sim = TokenMatcher::new().similarity(&norm_a, &norm_b);
438        let bigram_sim = BigramSimilarity::similarity(&norm_a, &norm_b);
439
440        let combined = edit_sim.value() * self.edit_weight
441            + token_sim.value() * self.token_weight
442            + bigram_sim.value() * self.bigram_weight;
443
444        FuzzyScore::new(combined)
445    }
446
447    /// Returns `true` if the two filenames are considered matching.
448    #[must_use]
449    pub fn is_match(&self, name_a: &str, name_b: &str) -> bool {
450        self.similarity(name_a, name_b)
451            .meets_threshold(self.threshold)
452    }
453
454    /// Find all matching pairs within a list of filenames.
455    ///
456    /// Returns `Vec<(usize, usize, FuzzyScore)>` with `i < j`.
457    #[must_use]
458    pub fn find_matching_pairs(&self, names: &[&str]) -> Vec<(usize, usize, FuzzyScore)> {
459        let mut pairs = Vec::new();
460        for i in 0..names.len() {
461            for j in (i + 1)..names.len() {
462                let score = self.similarity(names[i], names[j]);
463                if score.meets_threshold(self.threshold) {
464                    pairs.push((i, j, score));
465                }
466            }
467        }
468        pairs
469    }
470
471    /// Return the threshold.
472    #[must_use]
473    pub fn threshold(&self) -> f64 {
474        self.threshold
475    }
476}
477
478impl Default for FilenameMatcher {
479    fn default() -> Self {
480        Self::new(0.80)
481    }
482}
483
/// Remove every standalone occurrence of `token` from `input`.
///
/// An occurrence is standalone when the byte before it and the byte after it
/// are each either absent (start/end of the string) or not an ASCII
/// alphanumeric. Boundaries are evaluated against the original `input`.
/// Occurrences embedded in a longer word are kept, and the scan continues
/// past them so later standalone occurrences are still removed.
///
/// Matching ignores ASCII case in `input`; `token` is compared as given, so
/// callers should pass it in lowercase. Only ASCII case is folded because
/// that keeps byte offsets identical between the folded copy and `input`,
/// which makes slicing `input` with those offsets safe.
///
/// An empty `token` removes nothing.
fn remove_noise_token(input: &str, token: &str) -> String {
    if token.is_empty() {
        return input.to_string();
    }

    let haystack = input.to_ascii_lowercase();
    let bytes = haystack.as_bytes();

    let mut result = String::with_capacity(input.len());
    // Start of the part of `input` that has not been copied to `result` yet.
    let mut copied_to = 0;
    // Where the next search for `token` begins.
    let mut search_from = 0;

    while let Some(offset) = haystack[search_from..].find(token) {
        let start = search_from + offset;
        let end = start + token.len();

        let before_ok = start == 0 || !bytes[start - 1].is_ascii_alphanumeric();
        let after_ok = end == bytes.len() || !bytes[end].is_ascii_alphanumeric();

        if before_ok && after_ok {
            result.push_str(&input[copied_to..start]);
            copied_to = end;
            search_from = end;
        } else {
            // Embedded in a word: step over one character and keep looking.
            let step = haystack[start..].chars().next().map_or(1, char::len_utf8);
            search_from = start + step;
        }
    }

    result.push_str(&input[copied_to..]);
    result
}
512
#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that `actual` is strictly closer than `tolerance` to `expected`.
    fn assert_close(actual: f64, expected: f64, tolerance: f64) {
        assert!((actual - expected).abs() < tolerance);
    }

    // ---- FuzzyScore ----

    #[test]
    fn test_fuzzy_score_clamp() {
        // Out-of-range inputs are pulled back to the nearest bound.
        assert_close(FuzzyScore::new(1.5).value(), 1.0, f64::EPSILON);
        assert_close(FuzzyScore::new(-0.3).value(), 0.0, f64::EPSILON);
        assert_close(FuzzyScore::new(0.75).value(), 0.75, f64::EPSILON);
    }

    #[test]
    fn test_fuzzy_score_threshold() {
        let score = FuzzyScore::new(0.85);
        assert!(score.meets_threshold(0.8));
        // The threshold comparison is inclusive.
        assert!(score.meets_threshold(0.85));
        assert!(!score.meets_threshold(0.9));
    }

    #[test]
    fn test_fuzzy_score_is_exact() {
        assert!(FuzzyScore::new(1.0).is_exact());
        assert!(!FuzzyScore::new(0.999).is_exact());
    }

    #[test]
    fn test_fuzzy_score_average() {
        let mean = FuzzyScore::new(0.6).average(FuzzyScore::new(0.8));
        assert_close(mean.value(), 0.7, 1e-10);
    }

    #[test]
    fn test_fuzzy_score_weighted_average() {
        let weighted = vec![(FuzzyScore::new(1.0), 3.0), (FuzzyScore::new(0.0), 1.0)];
        let mean = FuzzyScore::weighted_average(&weighted);
        assert_close(mean.value(), 0.75, 1e-10);
    }

    // ---- EditDistance ----

    #[test]
    fn test_edit_distance_strings_identical() {
        assert_eq!(EditDistance::strings("hello", "hello"), 0);
    }

    #[test]
    fn test_edit_distance_strings_basic() {
        assert_eq!(EditDistance::strings("kitten", "sitting"), 3);
        assert_eq!(EditDistance::strings("", "abc"), 3);
        assert_eq!(EditDistance::strings("abc", ""), 3);
    }

    #[test]
    fn test_edit_distance_bytes() {
        assert_eq!(EditDistance::bytes(b"abc", b"abc"), 0);
        assert_eq!(EditDistance::bytes(b"abc", b"adc"), 1);
        assert_eq!(EditDistance::bytes(b"", b"xyz"), 3);
    }

    #[test]
    fn test_edit_distance_similarity() {
        assert!(EditDistance::similarity("hello", "hello").is_exact());

        let one_edit = EditDistance::similarity("hello", "hxllo");
        assert!(one_edit.value() > 0.5);

        // Two empty strings are treated as identical.
        assert!(EditDistance::similarity("", "").is_exact());
    }

    // ---- TokenMatcher ----

    #[test]
    fn test_token_matcher_identical() {
        let score = TokenMatcher::new().similarity("hello world", "hello world");
        assert!(score.is_exact());
    }

    #[test]
    fn test_token_matcher_case_insensitive() {
        let score = TokenMatcher::new()
            .case_insensitive(true)
            .similarity("Hello World", "hello world");
        assert!(score.is_exact());
    }

    #[test]
    fn test_token_matcher_partial() {
        let score = TokenMatcher::new().similarity("the quick brown fox", "the quick red fox");
        assert!(score.value() > 0.5);
        assert!(!score.is_exact());
    }

    // ---- BigramSimilarity ----

    #[test]
    fn test_bigram_similarity_identical() {
        assert!(BigramSimilarity::similarity("night", "night").is_exact());
    }

    #[test]
    fn test_bigram_similarity_similar() {
        let score = BigramSimilarity::similarity("night", "nacht");
        assert!(score.value() > 0.0);
        assert!(!score.is_exact());
    }

    // ---- Hamming helpers ----

    #[test]
    fn test_hamming_distance_equal() {
        assert_eq!(hamming_distance(b"abc", b"abc"), Some(0));
    }

    #[test]
    fn test_hamming_distance_different() {
        assert_eq!(hamming_distance(b"abc", b"axc"), Some(1));
    }

    #[test]
    fn test_hamming_distance_length_mismatch() {
        assert_eq!(hamming_distance(b"ab", b"abc"), None);
    }

    #[test]
    fn test_hamming_similarity() {
        let identical = hamming_similarity(b"abcd", b"abcd").expect("operation should succeed");
        assert!(identical.is_exact());

        // Two of the four positions differ.
        let half = hamming_similarity(b"abcd", b"axyd").expect("operation should succeed");
        assert_close(half.value(), 0.5, f64::EPSILON);
    }

    // ---- FilenameMatcher ----

    #[test]
    fn test_filename_normalize_basic() {
        assert_eq!(
            FilenameMatcher::normalize("The.Movie.2024.1080p.x264.mkv"),
            "the movie 2024"
        );
    }

    #[test]
    fn test_filename_normalize_strips_extension() {
        assert_eq!(FilenameMatcher::normalize("video.mp4"), "video");
    }

    #[test]
    fn test_filename_normalize_strips_directory() {
        assert_eq!(FilenameMatcher::normalize("/path/to/video.mp4"), "video");
    }

    #[test]
    fn test_filename_normalize_codec_tags() {
        assert_eq!(
            FilenameMatcher::normalize("Movie.2024.h265.AAC.BluRay.mp4"),
            "movie 2024"
        );
    }

    #[test]
    fn test_filename_matcher_identical() {
        let matcher = FilenameMatcher::new(0.8);
        let score = matcher.similarity("The.Movie.2024.mkv", "The.Movie.2024.mkv");
        assert!(score.is_exact());
    }

    #[test]
    fn test_filename_matcher_same_content_different_codec() {
        let matcher = FilenameMatcher::new(0.8);
        let score = matcher.similarity(
            "The.Movie.2024.1080p.x264.mkv",
            "The.Movie.2024.720p.x265.mp4",
        );
        assert!(score.meets_threshold(0.8), "Score was {}", score.value());
    }

    #[test]
    fn test_filename_matcher_different_movies() {
        let matcher = FilenameMatcher::new(0.8);
        let score = matcher.similarity("Inception.2010.mkv", "Interstellar.2014.mkv");
        assert!(!score.meets_threshold(0.8));
    }

    #[test]
    fn test_filename_matcher_is_match() {
        let matcher = FilenameMatcher::new(0.9);
        assert!(matcher.is_match("movie.1080p.mkv", "movie.720p.mp4"));
    }

    #[test]
    fn test_filename_matcher_find_matching_pairs() {
        let matcher = FilenameMatcher::new(0.8);
        let candidates = [
            "The.Movie.2024.1080p.mkv",
            "The.Movie.2024.720p.mp4",
            "Totally.Different.2023.mkv",
        ];
        let found = matcher.find_matching_pairs(&candidates);
        // The first two names pair up; the third pairs with neither of them.
        assert!(found.iter().any(|(i, j, _)| *i == 0 && *j == 1));
        assert!(!found.iter().any(|(_, j, _)| *j == 2));
    }

    #[test]
    fn test_filename_matcher_empty_strings() {
        let matcher = FilenameMatcher::new(0.5);
        assert!(matcher.similarity("", "").is_exact());
    }

    #[test]
    fn test_filename_matcher_default() {
        let matcher = FilenameMatcher::default();
        assert_close(matcher.threshold(), 0.80, f64::EPSILON);
    }

    #[test]
    fn test_filename_matcher_custom_weights() {
        // All weight on edit distance, so identical names must score exactly 1.0.
        let matcher = FilenameMatcher::with_weights(0.7, 1.0, 0.0, 0.0);
        let score = matcher.similarity("hello.mp4", "hello.mp4");
        assert!(score.is_exact());
    }

    #[test]
    fn test_filename_normalize_preserves_year() {
        let normalized = FilenameMatcher::normalize("Movie.Title.2024.Remaster.mkv");
        // The year survives while the "remaster" noise tag is dropped.
        assert!(normalized.contains("2024"));
        assert!(!normalized.contains("remaster"));
    }

    // ---- remove_noise_token ----

    #[test]
    fn test_remove_noise_token_boundary() {
        // The tag is glued to alphanumerics on both sides, so it must stay.
        let untouched = remove_noise_token("test1080pin", "1080p");
        assert_eq!(untouched, "test1080pin");
    }

    #[test]
    fn test_remove_noise_token_standalone() {
        let stripped = remove_noise_token("test.1080p.file", "1080p");
        assert!(!stripped.contains("1080p"));
    }
}
oximedia_dedup/fuzzy_match.rs

oximedia_dedup/
fuzzy_match.rs