audiobook_forge/utils/
scoring.rs

1//! Scoring and distance calculation for metadata matching
2
3use crate::models::{AudibleMetadata, CurrentMetadata, MatchCandidate, MetadataDistance, MatchConfidence};
4
5/// Calculate distance between current metadata and Audible candidate
6pub fn calculate_distance(
7    current: &CurrentMetadata,
8    candidate: &AudibleMetadata,
9) -> MetadataDistance {
10    let mut distance = MetadataDistance::new();
11
12    // Title comparison (weight: 0.4)
13    if let Some(cur_title) = &current.title {
14        let cand_title = &candidate.title;
15        let title_dist = string_distance(cur_title, cand_title);
16        distance.add_penalty("title", title_dist, 0.4);
17    }
18
19    // Author comparison (weight: 0.3)
20    if let Some(cur_author) = &current.author {
21        // Compare against all Audible authors, use best match
22        let author_dist = candidate.authors.iter()
23            .map(|a| string_distance(cur_author, &a.name))
24            .min_by(|a, b| a.partial_cmp(b).unwrap())
25            .unwrap_or(1.0);
26        distance.add_penalty("author", author_dist, 0.3);
27    }
28
29    // Year comparison (weight: 0.1)
30    if let (Some(cur_year), Some(cand_year)) = (current.year, candidate.published_year) {
31        let year_dist = year_distance(cur_year, cand_year);
32        distance.add_penalty("year", year_dist, 0.1);
33    }
34
35    // Duration comparison (weight: 0.2)
36    if let (Some(cur_dur), Some(cand_dur_ms)) = (current.duration, candidate.runtime_length_ms) {
37        let cand_dur_sec = cand_dur_ms as f64 / 1000.0;
38        let dur_dist = duration_distance(cur_dur, cand_dur_sec);
39        distance.add_penalty("duration", dur_dist, 0.2);
40    }
41
42    distance
43}
44
45/// Normalized string distance using Levenshtein (0.0 = identical, 1.0 = completely different)
46pub fn string_distance(a: &str, b: &str) -> f64 {
47    // Normalize: lowercase, trim, remove "the" prefix
48    let a_norm = normalize_string(a);
49    let b_norm = normalize_string(b);
50
51    // Use strsim::normalized_levenshtein (returns 0.0-1.0 similarity)
52    let similarity = strsim::normalized_levenshtein(&a_norm, &b_norm);
53
54    // Convert similarity to distance
55    1.0 - similarity
56}
57
/// Year distance on a ten-year linear scale.
///
/// Returns `0.0` for identical years, grows by `0.1` per year of difference and is
/// capped at `1.0` once the years are ten or more apart. The order of the arguments
/// does not matter.
fn year_distance(a: u32, b: u32) -> f64 {
    // `abs_diff` stays in unsigned arithmetic. Casting to `i32` first would wrap for
    // values above `i32::MAX` and the subtraction itself could overflow.
    let diff = a.abs_diff(b);
    (f64::from(diff) / 10.0).min(1.0)
}
63
/// Duration distance with a 5% tolerance band.
///
/// The difference is measured relative to the larger of the two values:
///
/// * below 5%  -> `0.0` (treated as the same length)
/// * 5% to 20% -> linear ramp from `0.0` up to (but not reaching) `0.75`
/// * 20% or more -> `1.0`
fn duration_distance(a: f64, b: f64) -> f64 {
    let longer = a.max(b);
    let gap = (a - b).abs();

    // `f64::max` returns the non-NaN operand, so an undefined ratio such as
    // 0.0 / 0.0 ends up as 0.0 here; negative ratios are clamped to 0.0 as well.
    let ratio = (gap / longer).max(0.0);

    match ratio {
        r if r < 0.05 => 0.0,
        r if r < 0.20 => ((r - 0.05) / 0.15) * 0.75,
        _ => 1.0,
    }
}
79
/// Normalize a string for comparison.
///
/// Steps, in order:
///
/// 1. lowercase and trim surrounding whitespace,
/// 2. drop a leading `"the "`,
/// 3. remove every character that is neither alphanumeric nor whitespace,
/// 4. collapse runs of whitespace into single spaces and drop leading/trailing
///    whitespace left behind by the removed characters.
///
/// For example `"Title! @ # $"` becomes `"title"` and `"Author's Name"` becomes
/// `"authors name"`.
pub fn normalize_string(s: &str) -> String {
    let lowered = s.to_lowercase();
    let trimmed = lowered.trim();

    // Remove leading "the " if present
    let without_article = trimmed.strip_prefix("the ").unwrap_or(trimmed);

    // Remove special characters but keep spaces
    let cleaned: String = without_article
        .chars()
        .filter(|c| c.is_alphanumeric() || c.is_whitespace())
        .collect();

    // Stripping punctuation can leave stray or doubled whitespace behind
    // ("title   ", "dune  messiah"); re-join the words with single spaces.
    cleaned.split_whitespace().collect::<Vec<_>>().join(" ")
}
94
95/// Score candidates and sort by distance (best first)
96pub fn score_and_sort(
97    current: &CurrentMetadata,
98    candidates: Vec<AudibleMetadata>,
99) -> Vec<MatchCandidate> {
100    let mut scored: Vec<MatchCandidate> = candidates
101        .into_iter()
102        .map(|metadata| {
103            let distance = calculate_distance(current, &metadata);
104            let confidence = determine_confidence(distance.total_distance());
105            MatchCandidate {
106                distance,
107                metadata,
108                confidence,
109            }
110        })
111        .collect();
112
113    // Sort by distance (ascending = best first)
114    scored.sort_by(|a, b| {
115        a.distance.total_distance()
116            .partial_cmp(&b.distance.total_distance())
117            .unwrap()
118    });
119
120    scored
121}
122
123/// Determine confidence level based on distance
124pub fn determine_confidence(distance: f64) -> MatchConfidence {
125    if distance < 0.04 {
126        MatchConfidence::Strong
127    } else if distance < 0.12 {
128        MatchConfidence::Medium
129    } else if distance < 0.20 {
130        MatchConfidence::Low
131    } else {
132        MatchConfidence::None
133    }
134}
135
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_string_distance() {
        // Identical strings (case is ignored)
        assert_eq!(string_distance("Hello World", "Hello World"), 0.0);
        assert_eq!(string_distance("hello world", "HELLO WORLD"), 0.0);

        // Similar strings: a single inserted character
        let dist = string_distance("Project Hail Mary", "Project Haile Mary");
        assert!(dist > 0.0 && dist < 0.15);

        // Different strings
        let dist = string_distance("Completely Different", "Not the Same");
        assert!(dist > 0.5);
    }

    #[test]
    fn test_normalize_string() {
        assert_eq!(normalize_string("The Hobbit"), "hobbit");
        assert_eq!(normalize_string("  Project Hail Mary  "), "project hail mary");
        assert_eq!(normalize_string("Author's Name"), "authors name");
        assert_eq!(normalize_string("Title! @ # $"), "title");
    }

    #[test]
    fn test_year_distance() {
        assert_eq!(year_distance(2020, 2020), 0.0); // Same year
        assert_eq!(year_distance(2020, 2025), 0.5); // 5 years apart
        assert_eq!(year_distance(2020, 2030), 1.0); // 10 years apart
        assert_eq!(year_distance(2020, 2035), 1.0); // >10 years is clamped to 1.0
    }

    #[test]
    fn test_duration_distance() {
        // The ratio is measured against the longer of the two durations.

        // Within 5% tolerance
        assert_eq!(duration_distance(3600.0, 3620.0), 0.0); // ~0.6% diff

        // 5-20% range
        let dist = duration_distance(3600.0, 3960.0); // 360 / 3960 = ~9.1% diff
        assert!(dist > 0.0 && dist < 0.75);

        // Exactly on the 20% boundary: 900 / 4500 = 20%, which already counts as 1.0
        let dist = duration_distance(3600.0, 4500.0);
        assert_eq!(dist, 1.0);

        // Clearly over 20% difference
        let dist = duration_distance(3600.0, 5400.0); // 1800 / 5400 = ~33% diff
        assert_eq!(dist, 1.0);
    }

    #[test]
    fn test_determine_confidence() {
        assert_eq!(determine_confidence(0.02), MatchConfidence::Strong);
        assert_eq!(determine_confidence(0.08), MatchConfidence::Medium);
        assert_eq!(determine_confidence(0.15), MatchConfidence::Low);
        assert_eq!(determine_confidence(0.50), MatchConfidence::None);
    }
}