disarm 0.10.0

Unicode canonicalization and TR39 confusable analysis: building blocks for text-security pipelines (homoglyph/bidi/zalgo handling) plus standards-based transliteration
Documentation
//! Reverse transliteration: romanized Latin text → native script.
//!
//! Given a romanized string and a target language, converts back to the
//! native script using longest-match greedy scanning to handle multi-letter
//! keys (e.g., the digraph "sh" → ш and the four-letter "shch" → щ).
//!
//! Layer 1 (pure-Rust core): no pyo3. Shims in `src/py/reverse.rs`; crates.io
//! surface is `crate::api::{reverse_transliterate, reverse_langs}` (`ReverseLang`).

#![allow(clippy::unreadable_literal)]

// Reverse transliteration tables — generated by build.rs from TSV data.
include!(concat!(env!("OUT_DIR"), "/reverse_translit_phf.rs"));

/// Maximum key length, in bytes, of any entry in the reverse tables.
///
/// `reverse_transliterate_impl` never tries a match window wider than this,
/// so a table key longer than `MAX_KEY_LEN` bytes could never be matched.
/// Keep the value in step with the generated tables.
const MAX_KEY_LEN: usize = 4; // "shch" is the longest key

/// Look up a romanized substring in the appropriate reverse table.
fn reverse_lookup(key: &str, lang: &str) -> Option<&'static str> {
    let table: &phf::Map<&'static str, &'static str> = match lang {
        "ru" => &REVERSE_RU,
        "uk" => &REVERSE_UK,
        "el" => &REVERSE_EL,
        _ => return None,
    };
    table.get(key).copied()
}

/// Language codes for which reverse transliteration is offered.
///
/// Backs `supports_reverse` and `reverse_langs`. `reverse_lookup` picks its
/// table with a separate `match`, so a code added here also needs an arm
/// there (and vice versa).
const REVERSE_LANGS: &[&str] = &["el", "ru", "uk"];

/// Report whether `lang` is one of the codes listed in `REVERSE_LANGS`.
///
/// The comparison is an exact string match, so `"RU"` or `"ru-RU"` do not
/// count as `"ru"`.
pub(crate) fn supports_reverse(lang: &str) -> bool {
    REVERSE_LANGS.iter().any(|&code| code == lang)
}

/// List every language code for which reverse transliteration is available.
///
/// The codes are returned as owned `String`s, in the same order as they
/// appear in `REVERSE_LANGS`.
pub(crate) fn reverse_langs() -> Vec<String> {
    REVERSE_LANGS.iter().copied().map(String::from).collect()
}

/// Convert romanized text back into the native script of `lang`.
///
/// The input is scanned left to right. At each position the widest window
/// (at most `MAX_KEY_LEN` bytes) is looked up first, then progressively
/// narrower ones; the first hit is emitted and scanning resumes right after
/// it. A position where no window matches contributes its character
/// unchanged. For a `lang` that `reverse_lookup` has no table for, nothing
/// ever matches, so the text comes back as it went in.
///
/// **Case handling (#255 C4):** lookups use the window text exactly as
/// written. When a window of two or more bytes misses and consists solely of
/// ASCII uppercase letters (e.g. `SHCH`), its lowercase form is looked up as
/// well; on a hit the native replacement is uppercased before being emitted.
/// An all-caps romanization therefore still resolves multi-letter keys (Щ)
/// instead of being split into single letters.
pub(crate) fn reverse_transliterate_impl(text: &str, lang: &str) -> String {
    let total = text.len();
    let mut out = String::with_capacity(total);
    let mut pos = 0;

    'scan: while pos < total {
        let widest = MAX_KEY_LEN.min(total - pos);

        // Longest window first, shrinking down to a single byte.
        for width in (1..=widest).rev() {
            // `pos` always sits on a char boundary, so `get` returns `None`
            // exactly when the window's end would cut through a multi-byte
            // char; such widths cannot be keys and are skipped. (P8)
            let candidate = match text.get(pos..pos + width) {
                Some(slice) => slice,
                None => continue,
            };

            if let Some(native) = reverse_lookup(candidate, lang) {
                out.push_str(native);
                pos += width;
                continue 'scan;
            }

            // #255 C4: all-caps multi-letter windows get a second chance via
            // their lowercase key; the replacement is uppercased to match.
            let all_caps = width > 1 && candidate.bytes().all(|b| b.is_ascii_uppercase());
            if all_caps {
                if let Some(native) = reverse_lookup(&candidate.to_ascii_lowercase(), lang) {
                    out.push_str(&native.to_uppercase());
                    pos += width;
                    continue 'scan;
                }
            }
        }

        // No window matched here: copy one whole character (which may span
        // several bytes) and step past it.
        match text[pos..].chars().next() {
            Some(ch) => {
                out.push(ch);
                pos += ch.len_utf8();
            }
            None => pos += 1,
        }
    }

    out
}

#[cfg(test)]
mod tests {
    //! Unit tests for the reverse transliteration core.
    //!
    //! Besides the table-driven round trips, these pin the scanner's
    //! table-independent behavior: empty input, verbatim passthrough
    //! (including multi-byte characters) and the all-caps fallback.

    use super::*;

    #[test]
    fn test_russian_basic() {
        assert_eq!(reverse_transliterate_impl("Moskva", "ru"), "Москва");
    }

    #[test]
    fn test_russian_digraphs() {
        assert_eq!(reverse_transliterate_impl("zh", "ru"), "ж");
        assert_eq!(reverse_transliterate_impl("sh", "ru"), "ш");
        assert_eq!(reverse_transliterate_impl("ch", "ru"), "ч");
    }

    #[test]
    fn test_russian_trigraphs() {
        assert_eq!(reverse_transliterate_impl("shch", "ru"), "щ");
    }

    #[test]
    fn test_russian_all_caps_multigraph() {
        // #255 C4: "SHCH" has no exact-case entry; the scanner retries the
        // lowercase key and uppercases the replacement.
        assert_eq!(reverse_transliterate_impl("SHCH", "ru"), "Щ");
    }

    #[test]
    fn test_passthrough() {
        assert_eq!(reverse_transliterate_impl("123!", "ru"), "123!");
    }

    #[test]
    fn test_empty_input() {
        assert_eq!(reverse_transliterate_impl("", "ru"), "");
        assert_eq!(reverse_transliterate_impl("", "de"), "");
    }

    #[test]
    fn test_unsupported_lang_is_verbatim() {
        // No table exists for "de", so every character is copied through.
        // The 2- and 3-byte characters make the byte windows land inside a
        // char, which must be skipped rather than panic. (P8)
        let input = "Grüße, 世界! shch";
        assert_eq!(reverse_transliterate_impl(input, "de"), input);
    }

    #[test]
    fn test_greek_upsilon_no_latin_leak() {
        // #82: the forward direction romanizes Υ/υ as "Y"/"y" (incl. in the
        // ου/αυ/ευ diphthongs), so reverse must map those back to Greek instead
        // of leaving a literal Latin letter in the output.
        assert_eq!(reverse_transliterate_impl("psychi", "el"), "ψυχη");
        assert_eq!(reverse_transliterate_impl("oyzo", "el"), "ουζο");
        assert_eq!(reverse_transliterate_impl("Y", "el"), "Υ");
        assert_eq!(reverse_transliterate_impl("y", "el"), "υ");
        // No Latin letter may survive a reverse-to-Greek of forward Greek output.
        for s in ["psychi", "oyzo", "ayrio", "Kypros"] {
            let rev = reverse_transliterate_impl(s, "el");
            assert!(
                !rev.chars().any(|c| c.is_ascii_alphabetic()),
                "reverse el leaked a Latin letter: {s:?} -> {rev:?}"
            );
        }
    }

    #[test]
    fn test_supports_reverse() {
        assert!(supports_reverse("ru"));
        assert!(supports_reverse("uk"));
        assert!(supports_reverse("el"));
        assert!(!supports_reverse("de"));
    }

    #[test]
    fn test_reverse_langs_agrees_with_supports_reverse() {
        let langs = reverse_langs();
        assert_eq!(langs, vec!["el", "ru", "uk"]);
        for lang in &langs {
            assert!(supports_reverse(lang), "{lang:?} listed but not supported");
        }
    }
}