butterfly_common/
error.rs

1//! Error types and utilities for butterfly-osm toolkit
2//!
3//! Provides comprehensive error handling and fuzzy matching for OSM source identification.
4
5use std::fmt;
6use std::sync::OnceLock;
7use strsim::{jaro_winkler, normalized_levenshtein};
8
/// Process-wide cache of the recognised source identifiers.
static VALID_SOURCES_CACHE: OnceLock<Vec<String>> = OnceLock::new();

/// Fill [`VALID_SOURCES_CACHE`] the first time it is needed.
///
/// The initialiser runs at most once; subsequent calls leave the cache untouched.
fn ensure_sources_loaded() {
    // Static catalogue of identifiers, kept in the order they are stored in the cache.
    const KNOWN_SOURCES: &[&str] = &[
        // Root level
        "planet",
        // Continents
        "africa",
        "antarctica",
        "asia",
        "australia-oceania",
        "europe",
        "north-america",
        "south-america",
        "central-america",
        // Europe
        "europe/albania",
        "europe/andorra",
        "europe/austria",
        "europe/belarus",
        "europe/belgium",
        "europe/bosnia-herzegovina",
        "europe/bulgaria",
        "europe/croatia",
        "europe/cyprus",
        "europe/czech-republic",
        "europe/denmark",
        "europe/estonia",
        "europe/faroe-islands",
        "europe/finland",
        "europe/france",
        "europe/germany",
        "europe/great-britain",
        "europe/greece",
        "europe/hungary",
        "europe/iceland",
        "europe/ireland",
        "europe/isle-of-man",
        "europe/italy",
        "europe/kosovo",
        "europe/latvia",
        "europe/liechtenstein",
        "europe/lithuania",
        "europe/luxembourg",
        "europe/malta",
        "europe/moldova",
        "europe/monaco",
        "europe/montenegro",
        "europe/netherlands",
        "europe/north-macedonia",
        "europe/norway",
        "europe/poland",
        "europe/portugal",
        "europe/romania",
        "europe/russia",
        "europe/san-marino",
        "europe/serbia",
        "europe/slovakia",
        "europe/slovenia",
        "europe/spain",
        "europe/sweden",
        "europe/switzerland",
        "europe/turkey",
        "europe/ukraine",
        "europe/united-kingdom",
        "europe/vatican-city",
        // North America
        "north-america/canada",
        "north-america/greenland",
        "north-america/mexico",
        "north-america/us",
        // Asia
        "asia/afghanistan",
        "asia/bangladesh",
        "asia/bhutan",
        "asia/cambodia",
        "asia/china",
        "asia/gcc-states",
        "asia/india",
        "asia/indonesia",
        "asia/iran",
        "asia/iraq",
        "asia/israel-and-palestine",
        "asia/japan",
        "asia/jordan",
        "asia/kazakhstan",
        "asia/kyrgyzstan",
        "asia/lebanon",
        "asia/malaysia-singapore-brunei",
        "asia/maldives",
        "asia/mongolia",
        "asia/myanmar",
        "asia/nepal",
        "asia/north-korea",
        "asia/pakistan",
        "asia/philippines",
        "asia/south-korea",
        "asia/sri-lanka",
        "asia/syria",
        "asia/taiwan",
        "asia/tajikistan",
        "asia/thailand",
        "asia/tibet",
        "asia/turkmenistan",
        "asia/uzbekistan",
        "asia/vietnam",
        "asia/yemen",
    ];

    VALID_SOURCES_CACHE.get_or_init(|| {
        KNOWN_SOURCES
            .iter()
            .map(|name| (*name).to_owned())
            .collect()
    });
}
123
// Note: dynamic source loading from the Geofabrik JSON API is not implemented yet.
// A static list is used for now, for reliability and to avoid runtime conflicts.
126
127/// Get valid sources (cached)  
128fn get_valid_sources_sync() -> &'static [String] {
129    // Ensure sources are loaded (lazy initialization)
130    ensure_sources_loaded();
131
132    // Get cached sources (will always be available after ensure_sources_loaded)
133    VALID_SOURCES_CACHE
134        .get()
135        .map(|v| v.as_slice())
136        .unwrap_or(&[])
137}
138
139/// Find the best fuzzy match using hybrid semantic + character-based scoring
140///
141/// Combines character-based similarity (Jaro-Winkler 70% + Normalized Levenshtein 30%)
142/// with semantic bonuses:
143/// - Prefix matching: 20% bonus for strong prefix similarity (≥7 chars)
144/// - Substring matching: 12% bonus for compound word parts (australia-oceania)
145/// - Length similarity: 10% bonus for appropriate length matches
146/// - Anti-bias penalty: -10% for inappropriate short matches
147///
148/// Minimum threshold: 0.65 similarity to balance precision vs recall
149fn find_best_fuzzy_match(input: &str, candidates: &[String]) -> Option<String> {
150    if candidates.is_empty() {
151        return None;
152    }
153
154    let input_lower = input.to_lowercase();
155    let mut best_match = None;
156    let mut best_score = 0.0f64;
157
158    // Minimum similarity threshold (0.0 to 1.0)
159    let min_threshold = 0.65;
160
161    for candidate in candidates {
162        let candidate_lower = candidate.to_lowercase();
163
164        // Use Jaro-Winkler for typos (especially good for prefixes)
165        let jw_score = jaro_winkler(&input_lower, &candidate_lower);
166
167        // Use normalized Levenshtein as backup
168        let lev_score = normalized_levenshtein(&input_lower, &candidate_lower);
169
170        // Combine scores with weight toward Jaro-Winkler
171        let combined_score = (jw_score * 0.7) + (lev_score * 0.3);
172
173        // Semantic scoring bonuses
174        let mut semantic_bonus = 0.0;
175
176        // Strong prefix matching bonus (for cases like "austrailia" -> "australia-oceania")
177        let prefix_len = input_lower.chars().count().min(7); // Look at first 7 chars
178        if prefix_len >= 4 {
179            let input_prefix = input_lower.chars().take(prefix_len).collect::<String>();
180            let candidate_prefix = candidate_lower.chars().take(prefix_len).collect::<String>();
181
182            // Bonus for strong prefix similarity
183            let prefix_similarity = normalized_levenshtein(&input_prefix, &candidate_prefix);
184            if prefix_similarity > 0.7 {
185                semantic_bonus += 0.2 * prefix_similarity;
186            }
187        }
188
189        // Length-based semantic bonus (longer strings that match well are more meaningful)
190        if input_lower.len() >= 8 && candidate_lower.len() >= 8 {
191            let length_ratio = 1.0
192                - ((input_lower.len() as f64 - candidate_lower.len() as f64).abs()
193                    / input_lower.len().max(candidate_lower.len()) as f64);
194            if length_ratio > 0.7 {
195                semantic_bonus += 0.1 * length_ratio;
196            }
197        }
198
199        // Substring matching bonus (for compound words like "australia-oceania")
200        if candidate_lower.contains('-') || candidate_lower.contains('/') {
201            let parts: Vec<&str> = candidate_lower.split(&['-', '/'][..]).collect();
202            for part in parts {
203                if part.len() >= 4 {
204                    let part_similarity = jaro_winkler(&input_lower, part);
205                    if part_similarity > 0.85 {
206                        // More strict threshold
207                        semantic_bonus += 0.12 * part_similarity; // Reduced bonus
208                    }
209                }
210            }
211        }
212
213        // Anti-bonus for very short matches when input is long (reduces "austria" for "austrailia")
214        if input_lower.len() >= 8 && candidate_lower.len() <= 7 && !candidate_lower.contains('/') {
215            semantic_bonus -= 0.1;
216        }
217
218        let final_score = combined_score + semantic_bonus;
219
220        if final_score >= min_threshold && final_score > best_score {
221            best_score = final_score;
222            best_match = Some(candidate.clone());
223        }
224    }
225
226    best_match
227}
228
229/// Suggest a correction for a potentially misspelled source using fuzzy matching
230pub fn suggest_correction(source: &str) -> Option<String> {
231    // Get valid sources (cached)
232    let valid_sources = get_valid_sources_sync();
233
234    // First, check for exact case-insensitive match (no suggestion needed)
235    for valid_source in valid_sources {
236        if valid_source.eq_ignore_ascii_case(source) {
237            return None; // Exact match, no suggestion needed
238        }
239    }
240
241    // For standalone inputs (no '/'), check if it's a country name first
242    if !source.contains('/') {
243        // Check for exact country matches that should suggest continent/country path
244        for valid_source in valid_sources {
245            if let Some(slash_pos) = valid_source.find('/') {
246                let country_part = &valid_source[slash_pos + 1..];
247                if country_part.eq_ignore_ascii_case(source) {
248                    return Some(valid_source.clone());
249                }
250            }
251        }
252
253        // Create separate lists for different types of matches
254        let mut continent_level: Vec<String> = Vec::new();
255        let mut country_level: Vec<String> = Vec::new();
256
257        for valid_source in valid_sources {
258            if valid_source.contains('/') {
259                country_level.push(valid_source.clone());
260            } else {
261                continent_level.push(valid_source.clone());
262            }
263        }
264
265        // For longer inputs (likely continents), try continents first
266        if source.len() >= 6 {
267            if let Some(match_result) = find_best_fuzzy_match(source, &continent_level) {
268                return Some(match_result);
269            }
270        }
271
272        // For short inputs, also try continents first to catch "plant" -> "planet"
273        if source.len() <= 6 {
274            if let Some(match_result) = find_best_fuzzy_match(source, &continent_level) {
275                // Check if it's a really good match (high similarity)
276                let source_lower = source.to_lowercase();
277                let match_result_lower = match_result.to_lowercase();
278                let similarity = jaro_winkler(&source_lower, &match_result_lower);
279                if similarity > 0.8 {
280                    return Some(match_result);
281                }
282            }
283        }
284
285        // Then try country names (just the country part, but fuzzy match against country part only)
286        let country_names: Vec<String> = country_level
287            .iter()
288            .filter_map(|s| s.split('/').nth(1).map(|c| c.to_string()))
289            .collect();
290
291        if let Some(best_country) = find_best_fuzzy_match(source, &country_names) {
292            // Find the full path for this country
293            for full_path in &country_level {
294                if let Some(country_part) = full_path.split('/').nth(1) {
295                    if country_part == best_country {
296                        return Some(full_path.clone());
297                    }
298                }
299            }
300        }
301
302        // Finally try all sources
303        return find_best_fuzzy_match(source, valid_sources);
304    }
305
306    // For paths (continent/country), handle geographic corrections
307    if let Some(slash_pos) = source.find('/') {
308        let continent = &source[..slash_pos];
309        let country = &source[slash_pos + 1..];
310
311        // Check if the country exists in any valid continent (geographic correction)
312        for valid_source in valid_sources {
313            if let Some(valid_slash_pos) = valid_source.find('/') {
314                let valid_country = &valid_source[valid_slash_pos + 1..];
315                if valid_country.eq_ignore_ascii_case(country) {
316                    // Found correct geography, suggest the right continent
317                    return Some(valid_source.clone());
318                }
319            }
320        }
321
322        // If country not found, check if continent is close to a valid continent
323        let continents: Vec<String> = valid_sources
324            .iter()
325            .filter(|s| !s.contains('/'))
326            .cloned()
327            .collect();
328
329        if let Some(corrected_continent) = find_best_fuzzy_match(continent, &continents) {
330            // Only suggest continent if the country part is clearly invalid
331            if country.len() > 8
332                && !country
333                    .chars()
334                    .all(|c| c.is_ascii_alphanumeric() || c == '-')
335            {
336                return Some(corrected_continent);
337            }
338            // For plausible but unknown countries, suggest the corrected continent
339            return Some(corrected_continent);
340        }
341    }
342
343    // Default: fuzzy match against all sources
344    find_best_fuzzy_match(source, valid_sources)
345}
346
/// Errors produced by butterfly-osm operations.
#[derive(Debug)]
pub enum Error {
    /// The requested source identifier is unknown or unsupported.
    SourceNotFound(String),

    /// A download failed for a network or HTTP-related reason.
    DownloadFailed(String),

    /// An error specific to the HTTP layer.
    HttpError(String),

    /// An underlying file I/O operation failed.
    IoError(std::io::Error),

    /// The supplied configuration or parameters are invalid.
    InvalidInput(String),

    /// Network connectivity problems.
    NetworkError(String),
}
368
369impl fmt::Display for Error {
370    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
371        match self {
372            Error::SourceNotFound(source) => {
373                write!(f, "Source '{source}' not found or not supported")
374            }
375            Error::DownloadFailed(msg) => {
376                write!(f, "Download failed: {msg}")
377            }
378            Error::HttpError(msg) => {
379                write!(f, "HTTP error: {msg}")
380            }
381            Error::IoError(err) => {
382                write!(f, "I/O error: {err}")
383            }
384            Error::InvalidInput(msg) => {
385                write!(f, "Invalid input: {msg}")
386            }
387            Error::NetworkError(msg) => {
388                write!(f, "Network error: {msg}")
389            }
390        }
391    }
392}
393
394impl std::error::Error for Error {
395    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
396        match self {
397            Error::IoError(err) => Some(err),
398            _ => None,
399        }
400    }
401}
402
403impl From<std::io::Error> for Error {
404    fn from(err: std::io::Error) -> Self {
405        Error::IoError(err)
406    }
407}
408
#[cfg(feature = "http")]
impl From<reqwest::Error> for Error {
    /// Map connection and timeout failures to `NetworkError`; everything else
    /// becomes `HttpError`. The message is the error's `to_string()` output.
    fn from(err: reqwest::Error) -> Self {
        let is_transport_failure = err.is_connect() || err.is_timeout();
        let detail = err.to_string();

        if is_transport_failure {
            Self::NetworkError(detail)
        } else {
            Self::HttpError(detail)
        }
    }
}
419
420/// Convenience result type for butterfly-osm operations
421pub type Result<T> = std::result::Result<T, Error>;
422
#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that every `(input, expected suggestion)` pair holds for `suggest_correction`.
    fn assert_suggestions(cases: &[(&str, Option<&str>)]) {
        for &(input, expected) in cases {
            assert_eq!(
                suggest_correction(input).as_deref(),
                expected,
                "unexpected suggestion for {input:?}"
            );
        }
    }

    /// Build an owned candidate list from string literals.
    fn owned(items: &[&str]) -> Vec<String> {
        items.iter().map(|item| (*item).to_owned()).collect()
    }

    #[test]
    fn test_suggest_correction_fuzzy_matching() {
        assert_suggestions(&[
            // Common typos
            ("antartica", Some("antarctica")),
            // Semantic scoring must prefer "australia-oceania" here
            ("austrailia", Some("australia-oceania")),
            ("eurpoe", Some("europe")),
            ("afirca", Some("africa")),
            // Planet typos
            ("plant", Some("planet")),
            ("plnet", Some("planet")),
        ]);
    }

    #[test]
    fn test_suggest_correction_standalone_country_names() {
        // Bare country names should resolve to their continent/country path.
        assert_suggestions(&[
            ("monaco", Some("europe/monaco")),
            ("belgium", Some("europe/belgium")),
            ("germany", Some("europe/germany")),
            ("france", Some("europe/france")),
            // Case-insensitive
            ("MONACO", Some("europe/monaco")),
            ("Belgium", Some("europe/belgium")),
        ]);
    }

    #[test]
    fn test_suggest_correction_standalone_country_typos() {
        // Misspelled bare country names.
        assert_suggestions(&[
            ("monac", Some("europe/monaco")),
            ("belgum", Some("europe/belgium")),
            ("germay", Some("europe/germany")),
        ]);
    }

    #[test]
    fn test_suggest_correction_country_paths() {
        assert_suggestions(&[
            // Belgium is in Europe, so the geography is corrected
            ("antartica/belgium", Some("europe/belgium")),
            ("europ/france", Some("europe/france")),
            ("eurpoe/germany", Some("europe/germany")),
            // An unknown country falls back to the corrected continent
            ("europ/unknown-country", Some("europe")),
        ]);
    }

    #[test]
    fn test_suggest_correction_no_match() {
        assert_suggestions(&[
            ("totally-invalid-place", None), // Too different
            ("europe", None),                // Correct spelling
            ("a", None),                     // Too short and different
        ]);
    }

    #[test]
    fn test_suggest_correction_case_insensitive() {
        assert_suggestions(&[
            ("ANTARTICA", Some("antarctica")),
            ("AntArTiCa", Some("antarctica")),
            ("EuRoPe", None), // Correct spelling, just wrong case
        ]);
    }

    #[test]
    fn test_strsim_fuzzy_matching() {
        // The scorer must prioritise the semantic match over the short look-alike.
        let candidates = owned(&[
            "australia-oceania",
            "austria",
            "europe/austria",
            "antarctica",
        ]);

        // "austrailia" should match "australia-oceania" better than "austria"
        assert_eq!(
            find_best_fuzzy_match("austrailia", &candidates).as_deref(),
            Some("australia-oceania")
        );
    }

    #[test]
    fn test_semantic_bonuses() {
        // Anti-bias penalty: a long input should not match a very short candidate.
        let candidates = owned(&[
            "austria",           // Short candidate - should get penalty
            "europe/austria",    // Contains '/' - no penalty
            "australia-oceania", // Long candidate - gets bonuses
        ]);
        let penalised = find_best_fuzzy_match("very-long-input-string", &candidates);
        assert_ne!(penalised.as_deref(), Some("austria"));

        // Length bonus: the candidate of similar length should win.
        let length_candidates = owned(&[
            "short",
            "medium-length-string",
            "very-long-similar-length",
        ]);
        let by_length = find_best_fuzzy_match("very-long-similar-input", &length_candidates);
        assert_eq!(by_length.as_deref(), Some("very-long-similar-length"));

        // Prefix bonus: the strong prefix match should win.
        let prefix_candidates = owned(&["australia-oceania", "antarctica", "africa"]);
        let by_prefix = find_best_fuzzy_match("austr", &prefix_candidates);
        assert_eq!(by_prefix.as_deref(), Some("australia-oceania"));
    }
}
butterfly_common/error.rs

butterfly_common/
error.rs