1#![allow(dead_code)]
2
3use std::collections::{HashMap, HashSet};
18use std::fmt;
19
/// A similarity score that is always a finite number in `[0.0, 1.0]`.
///
/// `0.0` means "nothing in common" and `1.0` means "identical". Scores compare
/// approximately: see the [`PartialEq`] implementation below.
#[derive(Debug, Clone, Copy)]
pub struct FuzzyScore {
    /// The clamped score; never `NaN` because [`FuzzyScore::new`] sanitizes it.
    value: f64,
}

impl FuzzyScore {
    /// Creates a score from `value`, clamping it into `[0.0, 1.0]`.
    ///
    /// `NaN` is mapped to `0.0`. `f64::clamp` returns `NaN` unchanged, which would
    /// break the range invariant and make a score compare unequal to itself.
    #[must_use]
    pub fn new(value: f64) -> Self {
        let value = if value.is_nan() {
            0.0
        } else {
            value.clamp(0.0, 1.0)
        };
        Self { value }
    }

    /// Returns the raw score in `[0.0, 1.0]`.
    #[must_use]
    pub fn value(self) -> f64 {
        self.value
    }

    /// Returns `true` when the score is greater than or equal to `threshold`.
    #[must_use]
    pub fn meets_threshold(self, threshold: f64) -> bool {
        self.value >= threshold
    }

    /// Returns `true` when the score is within `f64::EPSILON` of `1.0`.
    #[must_use]
    pub fn is_exact(self) -> bool {
        (self.value - 1.0).abs() < f64::EPSILON
    }

    /// Returns the arithmetic mean of `self` and `other`.
    #[must_use]
    pub fn average(self, other: Self) -> Self {
        Self::new((self.value + other.value) / 2.0)
    }

    /// Returns the weighted mean of `scores`, given as `(score, weight)` pairs.
    ///
    /// Yields `0.0` when `scores` is empty or when the weights do not sum to a
    /// positive number (including a `NaN` sum caused by a `NaN` weight).
    #[must_use]
    pub fn weighted_average(scores: &[(Self, f64)]) -> Self {
        let total_weight: f64 = scores.iter().map(|&(_, weight)| weight).sum();
        // An empty slice sums to 0.0, so it is covered by the same guard.
        if total_weight.is_nan() || total_weight <= 0.0 {
            return Self::new(0.0);
        }
        let weighted_sum: f64 = scores
            .iter()
            .map(|&(score, weight)| score.value * weight)
            .sum();
        Self::new(weighted_sum / total_weight)
    }
}

impl fmt::Display for FuzzyScore {
    /// Formats the score with exactly four decimal places, e.g. `0.7500`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{:.4}", self.value)
    }
}

impl PartialEq for FuzzyScore {
    /// Two scores are equal when they differ by less than `1e-10`.
    fn eq(&self, other: &Self) -> bool {
        (self.value - other.value).abs() < 1e-10
    }
}
90
91pub struct EditDistance;
93
94impl EditDistance {
95 #[must_use]
97 pub fn bytes(a: &[u8], b: &[u8]) -> usize {
98 let m = a.len();
99 let n = b.len();
100
101 if m == 0 {
102 return n;
103 }
104 if n == 0 {
105 return m;
106 }
107
108 let mut prev = vec![0usize; n + 1];
110 let mut curr = vec![0usize; n + 1];
111
112 for j in 0..=n {
113 prev[j] = j;
114 }
115
116 for i in 1..=m {
117 curr[0] = i;
118 for j in 1..=n {
119 let cost = if a[i - 1] == b[j - 1] { 0 } else { 1 };
120 curr[j] = (prev[j] + 1).min(curr[j - 1] + 1).min(prev[j - 1] + cost);
121 }
122 std::mem::swap(&mut prev, &mut curr);
123 }
124
125 prev[n]
126 }
127
128 #[must_use]
130 pub fn strings(a: &str, b: &str) -> usize {
131 let a_chars: Vec<char> = a.chars().collect();
132 let b_chars: Vec<char> = b.chars().collect();
133 let m = a_chars.len();
134 let n = b_chars.len();
135
136 if m == 0 {
137 return n;
138 }
139 if n == 0 {
140 return m;
141 }
142
143 let mut prev = vec![0usize; n + 1];
144 let mut curr = vec![0usize; n + 1];
145
146 for j in 0..=n {
147 prev[j] = j;
148 }
149
150 for i in 1..=m {
151 curr[0] = i;
152 for j in 1..=n {
153 let cost = if a_chars[i - 1] == b_chars[j - 1] {
154 0
155 } else {
156 1
157 };
158 curr[j] = (prev[j] + 1).min(curr[j - 1] + 1).min(prev[j - 1] + cost);
159 }
160 std::mem::swap(&mut prev, &mut curr);
161 }
162
163 prev[n]
164 }
165
166 #[must_use]
168 #[allow(clippy::cast_precision_loss)]
169 pub fn similarity(a: &str, b: &str) -> FuzzyScore {
170 let dist = Self::strings(a, b);
171 let max_len = a.chars().count().max(b.chars().count());
172 if max_len == 0 {
173 return FuzzyScore::new(1.0);
174 }
175 FuzzyScore::new(1.0 - dist as f64 / max_len as f64)
176 }
177}
178
179pub struct TokenMatcher {
183 separators: Vec<char>,
185 case_insensitive: bool,
187}
188
189impl TokenMatcher {
190 #[must_use]
192 pub fn new() -> Self {
193 Self {
194 separators: vec![' ', '-', '_', '.', ',', ';', '/', '\\'],
195 case_insensitive: true,
196 }
197 }
198
199 #[must_use]
201 pub fn case_insensitive(mut self, yes: bool) -> Self {
202 self.case_insensitive = yes;
203 self
204 }
205
206 fn tokenize(&self, s: &str) -> HashSet<String> {
208 let input = if self.case_insensitive {
209 s.to_lowercase()
210 } else {
211 s.to_string()
212 };
213
214 let mut tokens = HashSet::new();
215 let mut current = String::new();
216
217 for ch in input.chars() {
218 if self.separators.contains(&ch) {
219 if !current.is_empty() {
220 tokens.insert(std::mem::take(&mut current));
221 }
222 } else {
223 current.push(ch);
224 }
225 }
226 if !current.is_empty() {
227 tokens.insert(current);
228 }
229
230 tokens
231 }
232
233 #[must_use]
235 #[allow(clippy::cast_precision_loss)]
236 pub fn similarity(&self, a: &str, b: &str) -> FuzzyScore {
237 let set_a = self.tokenize(a);
238 let set_b = self.tokenize(b);
239
240 if set_a.is_empty() && set_b.is_empty() {
241 return FuzzyScore::new(1.0);
242 }
243
244 let intersection = set_a.intersection(&set_b).count();
245 let union = set_a.union(&set_b).count();
246
247 if union == 0 {
248 FuzzyScore::new(0.0)
249 } else {
250 FuzzyScore::new(intersection as f64 / union as f64)
251 }
252 }
253}
254
255impl Default for TokenMatcher {
256 fn default() -> Self {
257 Self::new()
258 }
259}
260
261pub struct BigramSimilarity;
263
264impl BigramSimilarity {
265 fn bigrams(s: &str) -> HashMap<(char, char), usize> {
267 let chars: Vec<char> = s.chars().collect();
268 let mut map = HashMap::new();
269 if chars.len() < 2 {
270 return map;
271 }
272 for pair in chars.windows(2) {
273 *map.entry((pair[0], pair[1])).or_insert(0) += 1;
274 }
275 map
276 }
277
278 #[must_use]
280 #[allow(clippy::cast_precision_loss)]
281 pub fn similarity(a: &str, b: &str) -> FuzzyScore {
282 let bg_a = Self::bigrams(&a.to_lowercase());
283 let bg_b = Self::bigrams(&b.to_lowercase());
284
285 if bg_a.is_empty() && bg_b.is_empty() {
286 return FuzzyScore::new(1.0);
287 }
288
289 let mut intersection_count: usize = 0;
290 for (bigram, count_a) in &bg_a {
291 if let Some(count_b) = bg_b.get(bigram) {
292 intersection_count += (*count_a).min(*count_b);
293 }
294 }
295
296 let total_a: usize = bg_a.values().sum();
297 let total_b: usize = bg_b.values().sum();
298 let denom = total_a + total_b;
299
300 if denom == 0 {
301 FuzzyScore::new(0.0)
302 } else {
303 FuzzyScore::new(2.0 * intersection_count as f64 / denom as f64)
304 }
305 }
306}
307
/// Counts the positions at which `a` and `b` hold different bytes.
///
/// Returns `None` when the slices differ in length, because the Hamming
/// distance is only defined for equal-length inputs.
#[must_use]
pub fn hamming_distance(a: &[u8], b: &[u8]) -> Option<usize> {
    let same_length = a.len() == b.len();
    same_length.then(|| {
        a.iter()
            .zip(b)
            .filter(|&(left, right)| left != right)
            .count()
    })
}
319
320#[must_use]
322#[allow(clippy::cast_precision_loss)]
323pub fn hamming_similarity(a: &[u8], b: &[u8]) -> Option<FuzzyScore> {
324 let dist = hamming_distance(a, b)?;
325 let len = a.len();
326 if len == 0 {
327 return Some(FuzzyScore::new(1.0));
328 }
329 Some(FuzzyScore::new(1.0 - dist as f64 / len as f64))
330}
331
332pub struct FilenameMatcher {
342 edit_weight: f64,
344 token_weight: f64,
346 bigram_weight: f64,
348 threshold: f64,
350}
351
352impl FilenameMatcher {
353 #[must_use]
355 pub fn new(threshold: f64) -> Self {
356 Self {
357 edit_weight: 0.4,
358 token_weight: 0.35,
359 bigram_weight: 0.25,
360 threshold: threshold.clamp(0.0, 1.0),
361 }
362 }
363
364 #[must_use]
366 pub fn with_weights(threshold: f64, edit_w: f64, token_w: f64, bigram_w: f64) -> Self {
367 let total = edit_w + token_w + bigram_w;
368 let (ew, tw, bw) = if total <= 0.0 {
369 (1.0 / 3.0, 1.0 / 3.0, 1.0 / 3.0)
370 } else {
371 (edit_w / total, token_w / total, bigram_w / total)
372 };
373 Self {
374 edit_weight: ew,
375 token_weight: tw,
376 bigram_weight: bw,
377 threshold: threshold.clamp(0.0, 1.0),
378 }
379 }
380
381 #[must_use]
386 pub fn normalize(name: &str) -> String {
387 let base = name.rsplit(['/', '\\']).next().unwrap_or(name);
389
390 let stem = base.rsplit_once('.').map_or(base, |(s, _)| s);
392
393 let lower = stem.to_lowercase();
394
395 let noise: &[&str] = &[
397 "1080p", "720p", "480p", "2160p", "4k", "uhd", "hdr", "hdr10", "x264", "x265", "h264",
398 "h265", "hevc", "avc", "vp9", "av1", "aac", "ac3", "dts", "flac", "opus", "mp3",
399 "bluray", "bdrip", "brrip", "webrip", "web-dl", "webdl", "dvdrip", "remux", "remaster",
400 "proper", "repack", "mkv", "mp4", "avi", "mov", "wmv", "webm",
401 ];
402
403 let mut cleaned = lower;
404 for &tag in noise {
405 cleaned = remove_noise_token(&cleaned, tag);
407 }
408
409 let normalized: String = cleaned
411 .chars()
412 .map(|c| {
413 if c.is_alphanumeric() || c == ' ' {
414 c
415 } else {
416 ' '
417 }
418 })
419 .collect();
420
421 let parts: Vec<&str> = normalized.split_whitespace().collect();
423 parts.join(" ")
424 }
425
426 #[must_use]
428 pub fn similarity(&self, name_a: &str, name_b: &str) -> FuzzyScore {
429 let norm_a = Self::normalize(name_a);
430 let norm_b = Self::normalize(name_b);
431
432 if norm_a.is_empty() && norm_b.is_empty() {
433 return FuzzyScore::new(1.0);
434 }
435
436 let edit_sim = EditDistance::similarity(&norm_a, &norm_b);
437 let token_sim = TokenMatcher::new().similarity(&norm_a, &norm_b);
438 let bigram_sim = BigramSimilarity::similarity(&norm_a, &norm_b);
439
440 let combined = edit_sim.value() * self.edit_weight
441 + token_sim.value() * self.token_weight
442 + bigram_sim.value() * self.bigram_weight;
443
444 FuzzyScore::new(combined)
445 }
446
447 #[must_use]
449 pub fn is_match(&self, name_a: &str, name_b: &str) -> bool {
450 self.similarity(name_a, name_b)
451 .meets_threshold(self.threshold)
452 }
453
454 #[must_use]
458 pub fn find_matching_pairs(&self, names: &[&str]) -> Vec<(usize, usize, FuzzyScore)> {
459 let mut pairs = Vec::new();
460 for i in 0..names.len() {
461 for j in (i + 1)..names.len() {
462 let score = self.similarity(names[i], names[j]);
463 if score.meets_threshold(self.threshold) {
464 pairs.push((i, j, score));
465 }
466 }
467 }
468 pairs
469 }
470
471 #[must_use]
473 pub fn threshold(&self) -> f64 {
474 self.threshold
475 }
476}
477
478impl Default for FilenameMatcher {
479 fn default() -> Self {
480 Self::new(0.80)
481 }
482}
483
/// Removes every standalone occurrence of `token` from `input`.
///
/// Matching ignores ASCII case in `input`; `token` itself is expected to be
/// lowercase. An occurrence is "standalone" when the bytes directly before and
/// after it are not ASCII alphanumeric (or it touches the start or end of the
/// string). Occurrences embedded in a longer word are left in place, and the
/// search continues past them. An empty `token` removes nothing.
fn remove_noise_token(input: &str, token: &str) -> String {
    if token.is_empty() {
        // An empty needle matches everywhere and would never make progress.
        return input.to_string();
    }

    // ASCII lowercasing keeps every byte offset identical to `input`, so offsets
    // found in `haystack` can be used to slice `input` safely. Full Unicode
    // lowercasing may change the byte length of the text.
    let haystack = input.to_ascii_lowercase();
    let bytes = haystack.as_bytes();
    // Step used to move past a rejected occurrence while staying on a char boundary.
    let first_char_len = token.chars().next().map_or(1, char::len_utf8);

    let mut output = String::with_capacity(input.len());
    let mut copied_to = 0; // start of the part of `input` not yet written to `output`
    let mut search_from = 0;

    while let Some(offset) = haystack[search_from..].find(token) {
        let start = search_from + offset;
        let end = start + token.len();

        let before_ok = start == 0 || !bytes[start - 1].is_ascii_alphanumeric();
        let after_ok = end >= bytes.len() || !bytes[end].is_ascii_alphanumeric();

        if before_ok && after_ok {
            // Keep the text in front of the tag and skip the tag itself.
            output.push_str(&input[copied_to..start]);
            copied_to = end;
            search_from = end;
        } else {
            // Part of a longer word: leave it and look for a later occurrence.
            search_from = start + first_char_len;
        }
    }

    output.push_str(&input[copied_to..]);
    output
}
512
#[cfg(test)]
mod tests {
    use super::*;

    /// Returns `true` when `actual` is within `tolerance` of `expected`.
    fn close(actual: f64, expected: f64, tolerance: f64) -> bool {
        (actual - expected).abs() < tolerance
    }

    // ---- FuzzyScore -------------------------------------------------------

    #[test]
    fn test_fuzzy_score_clamp() {
        let cases = [(1.5, 1.0), (-0.3, 0.0), (0.75, 0.75)];
        for &(input, expected) in &cases {
            assert!(close(FuzzyScore::new(input).value(), expected, f64::EPSILON));
        }
    }

    #[test]
    fn test_fuzzy_score_threshold() {
        let score = FuzzyScore::new(0.85);
        for &reachable in &[0.8, 0.85] {
            assert!(score.meets_threshold(reachable));
        }
        assert!(!score.meets_threshold(0.9));
    }

    #[test]
    fn test_fuzzy_score_is_exact() {
        let perfect = FuzzyScore::new(1.0);
        let almost = FuzzyScore::new(0.999);
        assert!(perfect.is_exact());
        assert!(!almost.is_exact());
    }

    #[test]
    fn test_fuzzy_score_average() {
        let mean = FuzzyScore::new(0.6).average(FuzzyScore::new(0.8));
        assert!(close(mean.value(), 0.7, 1e-10));
    }

    #[test]
    fn test_fuzzy_score_weighted_average() {
        let weighted = vec![(FuzzyScore::new(1.0), 3.0), (FuzzyScore::new(0.0), 1.0)];
        let mean = FuzzyScore::weighted_average(&weighted);
        assert!(close(mean.value(), 0.75, 1e-10));
    }

    // ---- EditDistance -----------------------------------------------------

    #[test]
    fn test_edit_distance_strings_identical() {
        let distance = EditDistance::strings("hello", "hello");
        assert_eq!(distance, 0);
    }

    #[test]
    fn test_edit_distance_strings_basic() {
        let cases = [("kitten", "sitting", 3), ("", "abc", 3), ("abc", "", 3)];
        for &(left, right, expected) in &cases {
            assert_eq!(EditDistance::strings(left, right), expected);
        }
    }

    #[test]
    fn test_edit_distance_bytes() {
        let same = EditDistance::bytes(b"abc", b"abc");
        let one_substitution = EditDistance::bytes(b"abc", b"adc");
        let from_empty = EditDistance::bytes(b"", b"xyz");
        assert_eq!(same, 0);
        assert_eq!(one_substitution, 1);
        assert_eq!(from_empty, 3);
    }

    #[test]
    fn test_edit_distance_similarity() {
        let identical = EditDistance::similarity("hello", "hello");
        assert!(identical.is_exact());

        let one_typo = EditDistance::similarity("hello", "hxllo");
        assert!(one_typo.value() > 0.5);

        let both_empty = EditDistance::similarity("", "");
        assert!(both_empty.is_exact());
    }

    // ---- TokenMatcher -----------------------------------------------------

    #[test]
    fn test_token_matcher_identical() {
        let score = TokenMatcher::new().similarity("hello world", "hello world");
        assert!(score.is_exact());
    }

    #[test]
    fn test_token_matcher_case_insensitive() {
        let folding = TokenMatcher::new().case_insensitive(true);
        let score = folding.similarity("Hello World", "hello world");
        assert!(score.is_exact());
    }

    #[test]
    fn test_token_matcher_partial() {
        let score = TokenMatcher::new().similarity("the quick brown fox", "the quick red fox");
        assert!(score.value() > 0.5);
        assert!(!score.is_exact());
    }

    // ---- BigramSimilarity -------------------------------------------------

    #[test]
    fn test_bigram_similarity_identical() {
        let score = BigramSimilarity::similarity("night", "night");
        assert!(score.is_exact());
    }

    #[test]
    fn test_bigram_similarity_similar() {
        let score = BigramSimilarity::similarity("night", "nacht");
        assert!(score.value() > 0.0);
        assert!(!score.is_exact());
    }

    // ---- Hamming ----------------------------------------------------------

    #[test]
    fn test_hamming_distance_equal() {
        let distance = hamming_distance(b"abc", b"abc");
        assert_eq!(distance, Some(0));
    }

    #[test]
    fn test_hamming_distance_different() {
        let distance = hamming_distance(b"abc", b"axc");
        assert_eq!(distance, Some(1));
    }

    #[test]
    fn test_hamming_distance_length_mismatch() {
        let distance = hamming_distance(b"ab", b"abc");
        assert_eq!(distance, None);
    }

    #[test]
    fn test_hamming_similarity() {
        let identical = hamming_similarity(b"abcd", b"abcd").expect("operation should succeed");
        assert!(identical.is_exact());

        let half = hamming_similarity(b"abcd", b"axyd").expect("operation should succeed");
        assert!(close(half.value(), 0.5, f64::EPSILON));
    }

    // ---- FilenameMatcher::normalize ---------------------------------------

    #[test]
    fn test_filename_normalize_basic() {
        let normalized = FilenameMatcher::normalize("The.Movie.2024.1080p.x264.mkv");
        assert_eq!(normalized, "the movie 2024");
    }

    #[test]
    fn test_filename_normalize_strips_extension() {
        let normalized = FilenameMatcher::normalize("video.mp4");
        assert_eq!(normalized, "video");
    }

    #[test]
    fn test_filename_normalize_strips_directory() {
        let normalized = FilenameMatcher::normalize("/path/to/video.mp4");
        assert_eq!(normalized, "video");
    }

    #[test]
    fn test_filename_normalize_codec_tags() {
        let normalized = FilenameMatcher::normalize("Movie.2024.h265.AAC.BluRay.mp4");
        assert_eq!(normalized, "movie 2024");
    }

    // ---- FilenameMatcher scoring ------------------------------------------

    #[test]
    fn test_filename_matcher_identical() {
        let matcher = FilenameMatcher::new(0.8);
        let score = matcher.similarity("The.Movie.2024.mkv", "The.Movie.2024.mkv");
        assert!(score.is_exact());
    }

    #[test]
    fn test_filename_matcher_same_content_different_codec() {
        let matcher = FilenameMatcher::new(0.8);
        let first = "The.Movie.2024.1080p.x264.mkv";
        let second = "The.Movie.2024.720p.x265.mp4";
        let score = matcher.similarity(first, second);
        assert!(score.meets_threshold(0.8), "Score was {}", score.value());
    }

    #[test]
    fn test_filename_matcher_different_movies() {
        let matcher = FilenameMatcher::new(0.8);
        let score = matcher.similarity("Inception.2010.mkv", "Interstellar.2014.mkv");
        assert!(!score.meets_threshold(0.8));
    }

    #[test]
    fn test_filename_matcher_is_match() {
        let strict = FilenameMatcher::new(0.9);
        assert!(strict.is_match("movie.1080p.mkv", "movie.720p.mp4"));
    }

    #[test]
    fn test_filename_matcher_find_matching_pairs() {
        let matcher = FilenameMatcher::new(0.8);
        let names = [
            "The.Movie.2024.1080p.mkv",
            "The.Movie.2024.720p.mp4",
            "Totally.Different.2023.mkv",
        ];
        let pairs = matcher.find_matching_pairs(&names);

        // The two encodes of the same movie must be paired with each other ...
        let same_movie_paired = pairs.iter().any(|&(i, j, _)| i == 0 && j == 1);
        assert!(same_movie_paired);

        // ... and nothing may be paired with the unrelated third name.
        let unrelated_untouched = pairs.iter().all(|&(_, j, _)| j != 2);
        assert!(unrelated_untouched);
    }

    #[test]
    fn test_filename_matcher_empty_strings() {
        let matcher = FilenameMatcher::new(0.5);
        assert!(matcher.similarity("", "").is_exact());
    }

    #[test]
    fn test_filename_matcher_default() {
        let matcher = FilenameMatcher::default();
        assert!(close(matcher.threshold(), 0.80, f64::EPSILON));
    }

    #[test]
    fn test_filename_matcher_custom_weights() {
        // Only the edit-distance component carries weight here.
        let matcher = FilenameMatcher::with_weights(0.7, 1.0, 0.0, 0.0);
        let score = matcher.similarity("hello.mp4", "hello.mp4");
        assert!(score.is_exact());
    }

    #[test]
    fn test_filename_normalize_preserves_year() {
        let normalized = FilenameMatcher::normalize("Movie.Title.2024.Remaster.mkv");
        assert!(normalized.contains("2024"));
        assert!(!normalized.contains("remaster"));
    }

    // ---- remove_noise_token -----------------------------------------------

    #[test]
    fn test_remove_noise_token_boundary() {
        // The tag is glued to letters on both sides, so it must stay.
        let untouched = remove_noise_token("test1080pin", "1080p");
        assert_eq!(untouched, "test1080pin");
    }

    #[test]
    fn test_remove_noise_token_standalone() {
        let stripped = remove_noise_token("test.1080p.file", "1080p");
        assert!(!stripped.contains("1080p"));
    }
}