/// Outcome of a [`detect_mojibake`] scan that found enough evidence to report.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MojibakeReport {
    /// Number of distinct entries of the signature table that occur in the
    /// scanned text (always at least the reporting threshold).
    pub distinct_patterns: usize,
}

/// Minimum number of distinct signatures that must be present before the
/// input is reported. A lone hit is not treated as evidence of corruption.
const MIN_DISTINCT_PATTERNS: usize = 3;

/// Two-character signatures left behind when UTF-8 text is decoded as
/// Windows-1252 (bytes that code page leaves undefined are taken as the
/// matching C1 control, e.g. 0x81 -> U+0081).
///
/// Every entry is spelled with explicit code-point escapes on purpose: the
/// sequences are *meant* to look broken, so tools that "repair" or re-encode
/// the source file must not be able to turn them into the legitimate letters
/// they derive from. The trailing comment names the letter each signature is
/// the corrupted form of.
const PATTERNS: &[&str] = &[
    "\u{C3}\u{A9}",   // é
    "\u{C3}\u{A8}",   // è
    "\u{C3}\u{AA}",   // ê
    "\u{C3}\u{AB}",   // ë
    "\u{C3}\u{A0}",   // à
    "\u{C3}\u{A1}",   // á
    "\u{C3}\u{A2}",   // â
    "\u{C3}\u{A3}",   // ã
    "\u{C3}\u{A4}",   // ä
    "\u{C3}\u{A5}",   // å
    "\u{C3}\u{AC}",   // ì
    "\u{C3}\u{AD}",   // í
    "\u{C3}\u{AE}",   // î
    "\u{C3}\u{AF}",   // ï
    "\u{C3}\u{B2}",   // ò
    "\u{C3}\u{B3}",   // ó
    "\u{C3}\u{B4}",   // ô
    "\u{C3}\u{B6}",   // ö
    "\u{C3}\u{B9}",   // ù
    "\u{C3}\u{BA}",   // ú
    "\u{C3}\u{BB}",   // û
    "\u{C3}\u{BC}",   // ü
    "\u{C3}\u{B1}",   // ñ
    "\u{C3}\u{A7}",   // ç
    "\u{C3}\u{178}",  // ß
    "\u{C3}\u{20AC}", // À
    "\u{C3}\u{81}",   // Á
    "\u{C3}\u{201A}", // Â
    "\u{C3}\u{192}",  // Ã
    "\u{C3}\u{201E}", // Ä
    "\u{C3}\u{2026}", // Å
    "\u{C3}\u{2030}", // É
    "\u{C3}\u{160}",  // Ê
    "\u{C3}\u{2039}", // Ë
    "\u{C3}\u{2018}", // Ñ
    "\u{C3}\u{201C}", // Ó
    "\u{C3}\u{201D}", // Ô
    "\u{C3}\u{2013}", // Ö
    "\u{C3}\u{161}",  // Ú
    "\u{C3}\u{203A}", // Û
    "\u{C3}\u{153}",  // Ü
    // Shared prefix of mis-decoded general punctuation (curly quotes, en/em
    // dashes, ellipsis, ...), whose UTF-8 form starts with bytes E2 80.
    "\u{E2}\u{20AC}",
];

/// Heuristically decides whether `input` contains mojibake, i.e. UTF-8 text
/// that was at some point decoded with a single-byte Western code page.
///
/// The text is searched for each signature in the pattern table; a signature
/// counts once no matter how often it occurs.
///
/// # Returns
///
/// * `Some(MojibakeReport)` carrying the number of distinct signatures found
///   when that number is at least three.
/// * `None` otherwise, including for empty, pure-ASCII, and correctly encoded
///   accented text.
pub fn detect_mojibake(input: &str) -> Option<MojibakeReport> {
    let distinct_patterns = PATTERNS
        .iter()
        .filter(|&&pattern| input.contains(pattern))
        .count();

    if distinct_patterns < MIN_DISTINCT_PATTERNS {
        return None;
    }
    Some(MojibakeReport { distinct_patterns })
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn detects_classic_double_encoded_paragraph() {
        // A sentence with accented words and an em dash after a
        // UTF-8 -> Windows-1252 mis-decode.
        let input = "R\u{C3}\u{A9}sum\u{C3}\u{A9} of M\u{C3}\u{B6}bius strip: \
                     a caf\u{C3}\u{A9} favourite. \u{E2}\u{20AC}\u{201D} indeed.";
        let report = detect_mojibake(input).expect("clear mojibake should report");
        assert!(
            report.distinct_patterns >= 3,
            "expected ≥3 distinct patterns, got {}",
            report.distinct_patterns
        );
    }

    #[test]
    fn ignores_legitimate_text_with_isolated_accents() {
        let input = "El niño visitó la ciudad. Le maître prépare le dîner.";
        assert_eq!(detect_mojibake(input), None);
    }

    #[test]
    fn ignores_pure_ascii() {
        let input = "Just plain ASCII text with no special characters.\n";
        assert_eq!(detect_mojibake(input), None);
    }

    #[test]
    fn ignores_single_pattern_hit() {
        let input = "An odd \u{C3}\u{A9} occurrence, nothing else.";
        assert_eq!(detect_mojibake(input), None);
    }

    #[test]
    fn flags_smart_quote_em_dash_storm() {
        let input = "He said \u{E2}\u{20AC}\u{201C}hello\u{E2}\u{20AC}\u{201D}, then \
                     \u{E2}\u{20AC}\u{201D} she replied. \
                     Caf\u{C3}\u{A9} \u{C3}\u{B6}pen \u{C3}\u{B1} kids.";
        let report = detect_mojibake(input).expect("should detect");
        assert!(report.distinct_patterns >= 3);
    }

    #[test]
    fn reports_exact_distinct_count() {
        // Three different signatures, one of them repeated.
        let input = "R\u{C3}\u{A9}sum\u{C3}\u{A9}, M\u{C3}\u{B6}bius, ni\u{C3}\u{B1}o";
        assert_eq!(
            detect_mojibake(input),
            Some(MojibakeReport {
                distinct_patterns: 3
            })
        );
    }

    #[test]
    fn pattern_table_holds_unique_two_char_signatures() {
        // Guards against the table being "repaired" into plain accented
        // letters, which would make correctly encoded text look corrupted.
        for pat in PATTERNS {
            assert_eq!(pat.chars().count(), 2, "unexpected pattern {:?}", pat);
            assert!(!pat.is_ascii(), "unexpected pattern {:?}", pat);
        }
        let unique: std::collections::HashSet<_> = PATTERNS.iter().collect();
        assert_eq!(unique.len(), PATTERNS.len());
    }
}