disarm 0.10.0

Unicode canonicalization and TR39 confusable analysis: building blocks for text-security pipelines (homoglyph/bidi/zalgo handling) plus standards-based transliteration
Documentation
//! Terminal column-width measurement (#224).
//!
//! `terminal_width` / `grapheme_width` report how many terminal cells text
//! occupies, summed over UAX #29 grapheme clusters (via [`crate::grapheme`], so
//! the segmenter is shared — see #226). Base width follows UAX #11 East Asian
//! Width; emoji-presented clusters are 2 cells as a whole. This measures cells,
//! not pixels or font metrics; it does not expand tabs or model line wrapping.
//!
//! See the axioms A0–A8 and invariants I_w1–I_w5 in #224. Width data is generated
//! at build time from the pinned UCD (`scripts/gen_width_data.py`).
//!
//! Layer 1 (pure-Rust core): no pyo3. The PyO3 shims live in `src/py/width.rs`;
//! the idiomatic crates.io surface is `crate::api::{terminal_width,
//! grapheme_width}`.

// The generated range tables are bare code-point literals (e.g. `(12288, 12289, 2)`);
// underscore grouping would be noise in machine-generated data.
#![allow(clippy::unreadable_literal)]

use std::cmp::Ordering;

use crate::grapheme::clusters;

// Sorted, non-overlapping range tables generated by build.rs.
include!(concat!(env!("OUT_DIR"), "/char_width_ranges.rs")); // WIDTH_RANGES: &[(u32,u32,u8)]
include!(concat!(env!("OUT_DIR"), "/emoji_presentation_ranges.rs")); // EMOJI_PRESENTATION_RANGES

const VS15: char = '\u{FE0E}'; // text-presentation selector
const VS16: char = '\u{FE0F}'; // emoji-presentation selector
const KEYCAP: char = '\u{20E3}'; // combining enclosing keycap

// Looks up the width class of `cp` in WIDTH_RANGES.
// Table encoding: 0 = zero-width, 2 = wide, 3 = ambiguous. A code point that
// falls in no range is narrow, reported as class 1.
fn width_class(cp: u32) -> u8 {
    WIDTH_RANGES
        .binary_search_by(|&(lo, hi, _)| {
            // Compare the probed range against `cp`: a range lying entirely
            // above `cp` is "Greater", one lying entirely below is "Less".
            if lo > cp {
                return Ordering::Greater;
            }
            if hi < cp {
                return Ordering::Less;
            }
            Ordering::Equal
        })
        .map_or(1, |hit| WIDTH_RANGES[hit].2)
}

/// Reports whether `cp` lies inside any inclusive `(start, end)` range of `set`.
///
/// `set` is searched with a binary search, so it must be sorted by range and
/// non-overlapping for the answer to be meaningful.
fn in_range_set(set: &[(u32, u32)], cp: u32) -> bool {
    let position = |&(lo, hi): &(u32, u32)| {
        if lo > cp {
            // The whole range sits above `cp`.
            Ordering::Greater
        } else if hi < cp {
            // The whole range sits below `cp`.
            Ordering::Less
        } else {
            Ordering::Equal
        }
    };
    set.binary_search_by(position).is_ok()
}

/// True when `c` is one of the regional-indicator symbols U+1F1E6..=U+1F1FF.
fn is_regional_indicator(c: char) -> bool {
    matches!(c, '\u{1F1E6}'..='\u{1F1FF}')
}

/// Converts a width class into a cell count under the `ambiguous_wide` policy.
///
/// Class 0 is 0 cells and class 2 is 2 cells. Class 3 (ambiguous) is 2 cells
/// when `ambiguous_wide` is set and 1 otherwise. Every other class is 1 cell.
fn resolve(class: u8, ambiguous_wide: bool) -> usize {
    match (class, ambiguous_wide) {
        (0, _) => 0,
        (2, _) | (3, true) => 2,
        _ => 1,
    }
}

/// Column width of a single grapheme cluster, with the `ambiguous_wide` policy.
///
/// This is the workhorse; [`grapheme_width`] is the `ambiguous_wide = false` form.
pub(crate) fn grapheme_width_opts(cluster: &str, ambiguous_wide: bool) -> usize {
    let mut rest = cluster.chars();
    let Some(base) = rest.next() else {
        return 0;
    };

    let base_emoji = in_range_set(EMOJI_PRESENTATION_RANGES, base as u32) // default-emoji base
        || is_regional_indicator(base); // flag (segmenter pairs regional indicators)
    let base_class = width_class(base as u32);

    // A zero-width base with no emoji presentation is a combining/selector/control-led
    // cluster (e.g. a lone combining mark, ZWJ, or variation selector): width 0 (I_w5).
    // Handling this first also stops a lone VS16/keycap from triggering the emoji branch.
    if base_class == 0 && !base_emoji {
        return 0;
    }

    // A6: detect emoji presentation. VS15 forces text presentation (width by base).
    // A ZWJ emoji sequence is covered by its base (Emoji_Presentation) or a VS16,
    // so ZWJ itself is not a trigger.
    let mut has_vs15 = false;
    let mut has_vs16 = false;
    let mut has_keycap = false;
    for c in rest {
        match c {
            VS15 => has_vs15 = true,
            VS16 => has_vs16 = true,
            KEYCAP => has_keycap = true,
            _ => {}
        }
    }
    let is_keycap_base = matches!(base, '0'..='9' | '#' | '*');
    // A6: VS15 forces text presentation. For an emoji base that is one text glyph
    // (1 column) even when the base is East-Asian-Wide; a stray VS15 on a
    // non-emoji base is ignored and the base's own width applies.
    if has_vs15 {
        return if base_emoji {
            1
        } else {
            resolve(base_class, ambiguous_wide)
        };
    }
    if has_vs16 || base_emoji || (has_keycap && is_keycap_base) {
        return 2;
    }

    // Non-emoji: the base scalar determines the cell width; combining /
    // default-ignorable scalars in the cluster contribute 0.
    resolve(base_class, ambiguous_wide)
}

/// Number of terminal columns `text` occupies under the `ambiguous_wide` policy.
///
/// `text` is split into grapheme clusters by [`clusters`] and the width that
/// [`grapheme_width_opts`] reports for each cluster is added up.
#[must_use]
pub(crate) fn terminal_width_opts(text: &str, ambiguous_wide: bool) -> usize {
    let mut columns = 0;
    for cluster in clusters(text) {
        columns += grapheme_width_opts(cluster, ambiguous_wide);
    }
    columns
}

#[cfg(test)]
mod tests {
    use super::*;

    // Fixture convention: a single-scalar or invisible fixture (one CJK
    // ideograph, a variation-selector base, ZWJ joiners) is written as a
    // `\u{…}` escape so the exact code point under test is visible in the
    // source and cannot be dropped silently by an editor or extraction step.

    // The no-arg (`ambiguous_wide = false`) convenience forms — the default
    // policy used throughout the golden vectors. The public surface is
    // `crate::api::{terminal_width, grapheme_width}`, which take the flag
    // explicitly; here they keep the assertions terse.
    fn grapheme_width(cluster: &str) -> usize {
        grapheme_width_opts(cluster, false)
    }
    fn terminal_width(text: &str) -> usize {
        terminal_width_opts(text, false)
    }

    // --- Golden vectors (A2–A6) ---

    #[test]
    fn golden_ascii() {
        assert_eq!(terminal_width("hello"), 5);
        assert_eq!(grapheme_width("a"), 1);
    }

    #[test]
    fn golden_wide_cjk_and_hangul() {
        assert_eq!(grapheme_width("\u{4E16}"), 2); // 世 — CJK ideograph, EAW=Wide
        assert_eq!(terminal_width("世界"), 4);
        assert_eq!(grapheme_width("\u{D55C}"), 2); // 한 — composed Hangul syllable
    }

    #[test]
    fn golden_fullwidth_halfwidth() {
        assert_eq!(grapheme_width("\u{FF21}"), 2); // U+FF21 fullwidth A
        assert_eq!(grapheme_width("\u{FF71}"), 1); // U+FF71 halfwidth katakana
    }

    #[test]
    fn golden_combining() {
        assert_eq!(grapheme_width("e\u{0301}"), 1); // é (NFD): base + combining
        assert_eq!(grapheme_width("\u{0301}"), 0); // lone combining mark (I_w5)
        assert_eq!(terminal_width("caf\u{00E9}"), 4); // NFC
        assert_eq!(terminal_width("cafe\u{0301}"), 4); // NFD
    }

    #[test]
    fn golden_emoji_presentation() {
        assert_eq!(grapheme_width("😀"), 2); // Emoji_Presentation base

        // U+2764 HEAVY BLACK HEART defaults to text presentation and is not
        // East-Asian-Wide, so only the selector decides its width.
        assert_eq!(grapheme_width("\u{2764}\u{FE0F}"), 2); // VS16 → emoji
        assert_eq!(grapheme_width("\u{2764}\u{FE0E}"), 1); // VS15 → text, width 1

        // VS15 forces text presentation to 1 even for an East-Asian-Wide emoji
        // base (⌚ U+231A is Emoji_Presentation AND EAW=Wide). (#224 review)
        assert_eq!(grapheme_width("\u{231A}\u{FE0E}"), 1);
        assert_eq!(grapheme_width("\u{231A}"), 2); // no selector → emoji width 2

        assert_eq!(grapheme_width("🇫🇷"), 2); // regional-indicator flag
        assert_eq!(grapheme_width("1\u{FE0F}\u{20E3}"), 2); // keycap
        // ZWJ family (man, woman, girl, boy) with the joiners spelled out.
        assert_eq!(
            grapheme_width("\u{1F468}\u{200D}\u{1F469}\u{200D}\u{1F467}\u{200D}\u{1F466}"),
            2
        );
        assert_eq!(terminal_width("hi 😀"), 5); // "hi " (3) + emoji (2)
    }

    #[test]
    fn golden_controls_and_zero_width() {
        assert_eq!(terminal_width("\t"), 0); // tab not expanded (A5)
        assert_eq!(terminal_width("\u{200B}"), 0); // ZWSP
        assert_eq!(terminal_width("a\u{0000}b"), 2); // NUL contributes 0
    }

    #[test]
    fn ambiguous_policy() {
        // U+00A1 INVERTED EXCLAMATION MARK is East Asian Ambiguous.
        assert_eq!(grapheme_width_opts("¡", false), 1);
        assert_eq!(grapheme_width_opts("¡", true), 2);
        assert_eq!(terminal_width("¡"), 1); // default narrow
        assert_eq!(terminal_width_opts("¡", true), 2);
    }

    // --- Invariants I_w1–I_w5 ---

    #[test]
    fn iw1_ascii_equals_len() {
        for s in ["", "hello world", "a-b_c.123!"] {
            assert_eq!(terminal_width(s), s.len(), "I_w1 for {s:?}");
        }
    }

    #[test]
    fn iw2_bounds() {
        for s in ["世界", "café", "😀🇫🇷", "a\u{0301}b", "한국어"] {
            let w = terminal_width(s);
            let upper = 2 * crate::grapheme::grapheme_len(s);
            assert!(w <= upper, "I_w2: {w} <= {upper} for {s:?}");
        }
    }

    #[test]
    fn iw3_additivity_no_cluster_merge() {
        // I_w3: additivity holds when the inserted space forms its own cluster,
        // i.e. neither side attaches across the join (no leading Extend/ZWJ/
        // SpacingMark in `b`). `b = "ok"` starts with a base scalar, so the
        // space is its own cluster and width is additive.
        let a = "世界";
        let b = "ok";
        assert_eq!(
            terminal_width(&format!("{a} {b}")),
            terminal_width(a) + 1 + terminal_width(b)
        );
    }

    #[test]
    fn iw3_leading_extend_absorbs_space() {
        // #279: when `b` begins with a grapheme-Extend scalar, UAX #29 (GB9)
        // attaches it leftward across the space — `" 🏻"` is ONE cluster, not
        // two — so additivity across the space does not (and should not) hold.
        // This is grapheme-cluster-accurate, not a width bug.
        let fitzpatrick = "\u{1F3FB}"; // GCB=Extend; the modifier itself has Emoji_Presentation
        let joined = format!(" {fitzpatrick}");
        // The space + modifier are a single cluster of width 1.
        assert_eq!(crate::grapheme::grapheme_len(&joined), 1);
        assert_eq!(terminal_width(&joined), 1);
        // The lone modifier is its own cluster, width 2.
        assert_eq!(terminal_width(fitzpatrick), 2);
        // So the naive additive identity fails across the space here.
        assert_ne!(
            terminal_width(&joined),
            terminal_width("") + 1 + terminal_width(fitzpatrick)
        );
    }

    #[test]
    fn iw4_determinism() {
        let s = "Hello 世界 😀 café";
        let first = terminal_width(s);
        for _ in 0..5 {
            assert_eq!(terminal_width(s), first);
        }
    }

    #[test]
    fn iw5_zero_width_clusters() {
        assert_eq!(grapheme_width("\u{0301}"), 0);
        assert_eq!(grapheme_width("\u{200D}"), 0); // lone ZWJ: default-ignorable
        assert_eq!(grapheme_width("\u{FE0F}"), 0); // lone VS16
    }
}