rust-switcher 1.0.13

Windows keyboard layout switcher and text conversion utility
Documentation
use std::sync::OnceLock;

use lingua::{Language, LanguageDetector, LanguageDetectorBuilder};

use super::mapping::{ConversionDirection, convert_ru_en_with_direction};

/// Lowest detector confidence, for the language matching the text's own script, at which
/// `text_looks_correct` keeps considering the text as possibly correct.
const MIN_SOURCE_CONFIDENCE: f64 = 0.45;
/// Detector confidence, for the opposite-script language, at or above which the converted
/// text counts as a convincing result (one of the conditions `text_looks_correct` checks).
const MIN_TARGET_CONFIDENCE: f64 = 0.60;

/// Writing system assigned to a token (or to a whole list of tokens) by the classifier
/// functions in this module.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
enum Script {
    /// Every alphabetic character is an ASCII letter.
    Latin,
    /// Every alphabetic character satisfies `is_cyrillic`.
    Cyrillic,
}

/// Heuristically reports whether `text` already looks like correctly typed text.
///
/// Returns `false` when any of the following holds:
/// * `text` contains no word tokens;
/// * the tokens do not all belong to one script, or any token fails the plausibility
///   heuristic for that script;
/// * the detector's confidence for the script's language is below `MIN_SOURCE_CONFIDENCE`;
/// * the text converted with `convert_ru_en_with_direction` is a convincing text in the
///   opposite script (single script, all tokens plausible, and detector confidence of at
///   least `MIN_TARGET_CONFIDENCE`).
///
/// Otherwise returns `true`.
pub(crate) fn text_looks_correct(text: &str) -> bool {
    let source_tokens = word_tokens(text);
    if source_tokens.is_empty() {
        return false;
    }

    // The source must be single-script and every word must pass the per-script heuristic.
    let source_script = match classify_tokens(&source_tokens) {
        Some(script) => script,
        None => return false,
    };
    let has_implausible_source_token = source_tokens
        .iter()
        .any(|word| !token_plausible_for_script(word, source_script));
    if has_implausible_source_token {
        return false;
    }

    let detector = language_detector();
    if confidence(detector, text, language_for_script(source_script)) < MIN_SOURCE_CONFIDENCE {
        return false;
    }

    // Convert towards the opposite script; the source only "looks correct" if that
    // conversion does NOT produce a convincing text in the other language.
    let retyped = convert_ru_en_with_direction(text, direction_for_script(source_script));
    let retyped_tokens = word_tokens(&retyped);
    let target_script = opposite_script(source_script);

    if classify_tokens(&retyped_tokens) != Some(target_script) {
        return true;
    }
    let every_retyped_token_plausible = retyped_tokens
        .iter()
        .all(|word| token_plausible_for_script(word, target_script));
    if !every_retyped_token_plausible {
        return true;
    }

    let target_confidence = confidence(detector, &retyped, language_for_script(target_script));
    let retyped_is_convincing = target_confidence >= MIN_TARGET_CONFIDENCE;
    !retyped_is_convincing
}

/// Returns the shared English/Russian language detector, building it on first use.
///
/// The detector is stored in a function-local `OnceLock`, so it is constructed at most once
/// and every later call returns the same instance.
fn language_detector() -> &'static LanguageDetector {
    /// Builds the detector restricted to English and Russian.
    fn build_detector() -> LanguageDetector {
        let languages = [Language::English, Language::Russian];
        LanguageDetectorBuilder::from_languages(&languages)
            .with_minimum_relative_distance(0.20)
            .build()
    }

    static DETECTOR: OnceLock<LanguageDetector> = OnceLock::new();

    DETECTOR.get_or_init(build_detector)
}

fn word_tokens(text: &str) -> Vec<&str> {
    text.split_whitespace()
        .filter_map(trim_word_token)
        .filter(|token| token.chars().any(char::is_alphabetic))
        .collect()
}

fn trim_word_token(token: &str) -> Option<&str> {
    let start = token
        .char_indices()
        .find_map(|(idx, ch)| is_word_token_char(ch).then_some(idx))?;
    let end = token
        .char_indices()
        .rev()
        .find_map(|(idx, ch)| is_word_token_char(ch).then_some(idx + ch.len_utf8()))?;
    (start < end).then_some(&token[start..end])
}

/// Returns `true` for characters allowed at the edges of a word token: any alphabetic
/// character, the apostrophe, and the ASCII hyphen.
fn is_word_token_char(ch: char) -> bool {
    matches!(ch, '\'' | '-') || ch.is_alphabetic()
}

/// Determines the single script shared by all `tokens`.
///
/// Returns `None` when `tokens` is empty, when any token cannot be classified by
/// `classify_token`, or when two tokens belong to different scripts.
fn classify_tokens(tokens: &[&str]) -> Option<Script> {
    let mut seen: Option<Script> = None;
    for token in tokens {
        let current = classify_token(token)?;
        match seen {
            Some(previous) if previous != current => return None,
            _ => seen = Some(current),
        }
    }
    seen
}

/// Determines the script of a single token from its alphabetic characters.
///
/// Non-alphabetic characters are ignored. Returns `None` when the token has no alphabetic
/// character, contains an alphabetic character that is neither an ASCII letter nor
/// Cyrillic, or mixes ASCII letters with Cyrillic ones.
fn classify_token(token: &str) -> Option<Script> {
    let mut seen: Option<Script> = None;
    for ch in token.chars() {
        if !ch.is_alphabetic() {
            continue;
        }

        let current = if ch.is_ascii_alphabetic() {
            Script::Latin
        } else if is_cyrillic(ch) {
            Script::Cyrillic
        } else {
            // A letter from some third script: the token is unclassifiable.
            return None;
        };

        match seen {
            Some(previous) if previous != current => return None,
            _ => seen = Some(current),
        }
    }
    seen
}

/// Applies the word-shape heuristic that belongs to `script` to `token`: the Russian-like
/// check for Cyrillic, the English-like check for Latin.
fn token_plausible_for_script(token: &str, script: Script) -> bool {
    match script {
        Script::Cyrillic => is_plausible_russian_like_token(token),
        Script::Latin => is_plausible_english_like_token(token),
    }
}

/// Picks the conversion direction that starts from text written in `script`:
/// Cyrillic maps to `RuToEn`, Latin maps to `EnToRu`.
fn direction_for_script(script: Script) -> ConversionDirection {
    match script {
        Script::Cyrillic => ConversionDirection::RuToEn,
        Script::Latin => ConversionDirection::EnToRu,
    }
}

/// Returns the other member of the Latin/Cyrillic pair.
fn opposite_script(script: Script) -> Script {
    match script {
        Script::Cyrillic => Script::Latin,
        Script::Latin => Script::Cyrillic,
    }
}

/// Maps a script to the detector language checked for it: Russian for Cyrillic,
/// English for Latin.
fn language_for_script(script: Script) -> Language {
    match script {
        Script::Cyrillic => Language::Russian,
        Script::Latin => Language::English,
    }
}

/// Looks up the confidence value that `detector` reports for `lang` on `text`.
///
/// Returns `0.0` when `lang` is absent from the detector's result list.
fn confidence(detector: &LanguageDetector, text: &str, lang: Language) -> f64 {
    let scores = detector.compute_language_confidence_values(text);
    scores
        .iter()
        .find_map(|(candidate, score)| (*candidate == lang).then_some(*score))
        .unwrap_or(0.0)
}

/// Cheap word-shape heuristic: could `token` be an English-looking word?
///
/// Only ASCII letters are examined (case-insensitively); everything else is ignored.
/// The token is accepted when it
/// * contains at least one of the vowels `a e i o u` (`y` is counted as a consonant),
/// * has no run of more than four consecutive consonants, and
/// * contains at most one of the rare letters `j q x z`.
fn is_plausible_english_like_token(token: &str) -> bool {
    fn is_vowel(letter: char) -> bool {
        matches!(letter, 'a' | 'e' | 'i' | 'o' | 'u')
    }
    fn is_rare(letter: char) -> bool {
        matches!(letter, 'j' | 'q' | 'x' | 'z')
    }

    let mut saw_vowel = false;
    let mut current_run = 0usize;
    let mut longest_run = 0usize;
    let mut rare_letters = 0usize;

    let ascii_letters = token
        .chars()
        .filter(char::is_ascii_alphabetic)
        .map(|letter| letter.to_ascii_lowercase());
    for letter in ascii_letters {
        if is_vowel(letter) {
            saw_vowel = true;
            current_run = 0;
            continue;
        }
        current_run += 1;
        longest_run = longest_run.max(current_run);
        if is_rare(letter) {
            rare_letters += 1;
        }
    }

    // A token without ASCII letters never sets `saw_vowel`, so it is rejected as well.
    saw_vowel && longest_run <= 4 && rare_letters <= 1
}

/// Cheap word-shape heuristic: could `token` be a Russian-looking word?
///
/// Only alphabetic characters are examined (lower-cased); everything else is ignored.
/// The token is accepted when all of its letters are Cyrillic and either
/// * it is one of the single-letter prepositions "в", "к" or "с", or
/// * it contains at least one Russian vowel and no run of more than four consecutive
///   non-vowel letters.
fn is_plausible_russian_like_token(token: &str) -> bool {
    fn is_vowel(letter: char) -> bool {
        matches!(letter, 'а' | 'е' | 'ё' | 'и' | 'о' | 'у' | 'ы' | 'э' | 'ю' | 'я')
    }

    let letters: Vec<char> = token
        .chars()
        .filter(|ch| ch.is_alphabetic())
        .map(|ch| ch.to_lowercase().next().unwrap_or(ch))
        .collect();
    if letters.is_empty() || !letters.iter().all(|ch| is_cyrillic(*ch)) {
        return false;
    }

    // "в", "к" and "с" are complete, very common Russian words that contain no vowel.
    // Without this exception a single such preposition makes an otherwise correct phrase
    // fail the per-token check. Other lone consonants stay rejected.
    if matches!(letters.as_slice(), ['в' | 'к' | 'с']) {
        return true;
    }

    if !letters.iter().copied().any(is_vowel) {
        return false;
    }

    let mut consonant_run = 0usize;
    let mut max_consonant_run = 0usize;
    for ch in letters {
        if is_vowel(ch) {
            consonant_run = 0;
        } else {
            consonant_run += 1;
            max_consonant_run = max_consonant_run.max(consonant_run);
        }
    }

    max_consonant_run <= 4
}

/// Returns `true` when `ch` lies in U+0400..=U+052F, i.e. the Unicode "Cyrillic" block
/// (U+0400..=U+04FF) or the adjacent "Cyrillic Supplement" block (U+0500..=U+052F).
fn is_cyrillic(ch: char) -> bool {
    matches!(ch, '\u{0400}'..='\u{052F}')
}

// End-to-end checks of `text_looks_correct` on short sample phrases. These run the real
// language detector returned by `language_detector`.
#[cfg(test)]
mod tests {
    use super::*;

    // A Russian phrase written in Cyrillic is reported as correct.
    #[test]
    fn smart_guard_accepts_correct_russian_phrase() {
        assert!(text_looks_correct("привет как дела"));
    }

    // Latin gibberish is reported as not correct.
    // NOTE(review): presumably this is the Russian phrase above typed on a Latin layout —
    // confirm against the mapping module.
    #[test]
    fn smart_guard_rejects_wrong_layout_latin_phrase() {
        assert!(!text_looks_correct("ghbdtn rfr ltkf"));
    }

    // A Cyrillic token that passes the Russian word-shape heuristic is still rejected.
    // NOTE(review): per the test name, the converted text is expected to be a convincing
    // English word — confirm against the mapping module.
    #[test]
    fn smart_guard_rejects_wrong_layout_cyrillic_word_with_good_conversion() {
        assert!(!text_looks_correct("руддщ"));
    }

    // An English phrase written in Latin letters is reported as correct.
    #[test]
    fn smart_guard_accepts_correct_english_phrase() {
        assert!(text_looks_correct("hello how are you"));
    }
}