disarm 0.10.0

Unicode canonicalization and TR39 confusable analysis: building blocks for text-security pipelines (homoglyph/bidi/zalgo handling) plus standards-based transliteration
Documentation
//! Layer 1 (pure-Rust core): TR39 confusable folding. No pyo3.
//!
//! The PyO3 shims for these functions live in `src/py/confusables.rs`; the
//! idiomatic crates.io surface is `crate::api::{normalize_confusables,
//! is_confusable}`. This module is the algorithm, returning the native
//! [`crate::ErrorRepr`] (never a `PyErr`).
//!
//! These fns are `pub(crate)` while [`crate::ErrorRepr`] is `pub(crate)` (avoiding a
//! private-in-public leak). They are promoted to `pub` together with the opaque
//! public `Error` in the first fallible-module extraction sub-PR (#38).

use crate::tables;

/// Validate the `target_script` parameter.
///
/// Supported values: `"latin"`, `"cyrillic"`.
fn validate_target_script(target_script: &str) -> Result<(), crate::ErrorRepr> {
    match target_script {
        "latin" | "cyrillic" => Ok(()),
        _ => Err(crate::ErrorRepr::InvalidTargetScript {
            got: target_script.to_owned(),
        }),
    }
}

/// Replace Unicode confusable homoglyphs with target-script equivalents.
///
/// # NFKC interaction warning
/// This function does **not** apply NFKC normalization. If NFKC is ever added
/// as a pre-processing step, ~31 codepoints in the TR39 confusables table
/// conflict with NFKC mappings (e.g. ſ U+017F: TR39→f but NFKC→s). In that
/// case, `gen_confusables.py` must filter entries where the TR39 target
/// differs from `unicodedata.normalize('NFKC', chr(cp))`.
/// See: <https://paultendo.github.io/posts/unicode-confusables-nfkc-conflict/>
///
/// # Valid `target_script` values
/// `"latin"` or `"cyrillic"`. Any other value returns [`crate::ErrorRepr`].
pub(crate) fn normalize_confusables(
    text: &str,
    target_script: &str,
) -> Result<String, crate::ErrorRepr> {
    let mut out = String::new();
    normalize_confusables_into(text, target_script, &mut out)?;
    Ok(out)
}

/// Borrowing form of [`normalize_confusables`] (#352): returns `Cow::Borrowed`
/// when `text` contains no confusable for the target (the common case), so a
/// no-op never allocates. A single pass — it only starts building an owned
/// string at the first character that actually folds.
pub(crate) fn normalize_confusables_cow<'a>(
    text: &'a str,
    target_script: &str,
) -> Result<std::borrow::Cow<'a, str>, crate::ErrorRepr> {
    use std::borrow::Cow;

    validate_target_script(target_script)?;
    let map = tables::resolve_confusable_map(target_script);

    for (i, ch) in text.char_indices() {
        if let Some(replacement) = map.and_then(|m| m.get(&ch).copied()) {
            // First fold found: copy the borrowed prefix, then fold the rest.
            let mut out = String::with_capacity(text.len());
            out.push_str(&text[..i]);
            out.push_str(replacement);
            for ch in text[i + ch.len_utf8()..].chars() {
                match map.and_then(|m| m.get(&ch).copied()) {
                    Some(replacement) => out.push_str(replacement),
                    None => out.push(ch),
                }
            }
            return Ok(Cow::Owned(out));
        }
    }
    Ok(Cow::Borrowed(text))
}

/// In-place form of [`normalize_confusables`] writing into `out` (cleared
/// first), so the pipeline can reuse one buffer across steps (#236 item 7).
pub(crate) fn normalize_confusables_into(
    text: &str,
    target_script: &str,
    out: &mut String,
) -> Result<(), crate::ErrorRepr> {
    validate_target_script(target_script)?;
    out.clear();
    out.reserve(text.len());

    // Resolve the confusables map once (#236 / #233 review item) instead of
    // re-dispatching `target_script` for every character. `validate_target_script`
    // above guarantees `Some`. There is deliberately no ASCII fast path: the
    // latin table maps ASCII source code points (U+007C `|`→`l`, U+0022 `"`→`''`,
    // U+0060 `` ` ``→`'`), so ASCII input is not identity even for `target="latin"`.
    let map = tables::resolve_confusable_map(target_script);

    for ch in text.chars() {
        match map.and_then(|m| m.get(&ch).copied()) {
            Some(replacement) => out.push_str(replacement),
            None => out.push(ch),
        }
    }

    Ok(())
}

/// True if text contains any characters confusable with target-script characters.
///
/// # Valid `target_script` values
/// `"latin"` or `"cyrillic"`. Any other value returns [`crate::ErrorRepr`].
pub(crate) fn is_confusable(text: &str, target_script: &str) -> Result<bool, crate::ErrorRepr> {
    validate_target_script(target_script)?;

    let map = tables::resolve_confusable_map(target_script);
    for ch in text.chars() {
        if map.is_some_and(|m| m.contains_key(&ch)) {
            return Ok(true);
        }
    }
    Ok(false)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_normalize_confusables_cyrillic() {
        // Cyrillic 'а' (U+0430) → Latin 'a'
        let result = normalize_confusables("\u{0430}", "latin").unwrap();
        assert_eq!(result, "a");
    }

    #[test]
    fn test_normalize_confusables_passthrough() {
        let result = normalize_confusables("hello", "latin").unwrap();
        assert_eq!(result, "hello");
    }

    #[test]
    fn test_normalize_confusables_empty() {
        let result = normalize_confusables("", "latin").unwrap();
        assert_eq!(result, "");
    }

    #[test]
    fn test_is_confusable_true() {
        // Cyrillic 'а' is confusable with Latin 'a'
        assert!(is_confusable("\u{0430}", "latin").unwrap());
    }

    #[test]
    fn test_is_confusable_false() {
        assert!(!is_confusable("hello", "latin").unwrap());
    }

    #[test]
    fn test_is_confusable_empty() {
        assert!(!is_confusable("", "latin").unwrap());
    }

    #[test]
    fn test_validate_target_script_latin_ok() {
        assert!(validate_target_script("latin").is_ok());
    }

    #[test]
    fn test_validate_target_script_cyrillic_ok() {
        assert!(validate_target_script("cyrillic").is_ok());
    }

    #[test]
    fn test_validate_target_script_invalid() {
        assert!(validate_target_script("greek").is_err());
        assert!(validate_target_script("").is_err());
        assert!(validate_target_script("Latin").is_err()); // case-sensitive
        assert!(validate_target_script("Cyrillic").is_err()); // case-sensitive
    }

    #[test]
    fn test_normalize_confusables_mixed_long() {
        // String with confusable Cyrillic chars interspersed with ASCII
        let input = "h\u{0435}ll\u{043E} w\u{043E}rld"; // Cyrillic е and о
        let result = normalize_confusables(input, "latin").unwrap();
        // Cyrillic е→e, о→o
        assert_eq!(result, "hello world");
    }

    #[test]
    fn test_normalize_confusables_nfc_vs_nfd() {
        // Confusable lookup operates on individual codepoints; NFC and NFD
        // should both work (combining marks aren't confusable targets).
        let nfc = "\u{00e9}"; // é as single codepoint
        let result = normalize_confusables(nfc, "latin").unwrap();
        // é is not a confusable — it should pass through unchanged
        assert_eq!(result, nfc);
    }

    // ── Property-based tests ─────────────────────────────────────────

    mod proptest_properties {
        use super::*;
        use proptest::prelude::*;

        proptest! {
            #![proptest_config(ProptestConfig::with_cases(1000))]

            /// Normalizing confusables is idempotent: applying it twice
            /// yields the same result as applying it once. This must hold
            /// because every confusable maps to an ASCII target, and ASCII
            /// characters are never themselves confusable.
            #[test]
            fn normalize_confusables_idempotent(s in "\\PC*") {
                let once = normalize_confusables(&s, "latin").unwrap();
                let twice = normalize_confusables(&once, "latin").unwrap();
                prop_assert_eq!(&once, &twice,
                    "normalize_confusables is not idempotent on: {:?}", s);
            }

            /// After normalizing confusables, is_confusable must return false.
            /// This is the completeness invariant: if the table is self-consistent,
            /// no confusable characters survive normalization.
            #[test]
            fn normalized_is_not_confusable(s in "\\PC*") {
                let normalized = normalize_confusables(&s, "latin").unwrap();
                let still_confusable = is_confusable(&normalized, "latin").unwrap();
                prop_assert!(!still_confusable,
                    "is_confusable returned true after normalize_confusables on: {:?} → {:?}",
                    s, normalized);
            }

            /// normalize_confusables never drops characters — it only replaces.
            /// Output character count must be >= input character count
            /// (replacements may expand, e.g. a ligature confusable, but never shrink).
            #[test]
            fn normalize_confusables_never_drops_chars(s in "\\PC*") {
                let result = normalize_confusables(&s, "latin").unwrap();
                prop_assert!(
                    result.chars().count() >= s.chars().count(),
                    "normalize_confusables dropped chars: {:?} ({} chars) → {:?} ({} chars)",
                    s, s.chars().count(), result, result.chars().count()
                );
            }

            /// normalize_confusables output is always valid UTF-8 (trivially
            /// true since we return String, but this catches memory corruption).
            #[test]
            fn normalize_confusables_valid_utf8(s in "\\PC*") {
                let result = normalize_confusables(&s, "latin").unwrap();
                // If this compiles and doesn't panic, the result is valid UTF-8.
                let _ = result.len(); // forces evaluation
            }
        }
    }
}