inkferro_core/text/
string_width.rs

1//! Port of [`string-width@8`](https://github.com/sindresorhus/string-width) to Rust.
2//!
3//! Computes the visual column width of a string as rendered by a terminal,
4//! matching the semantics of string-width@8.2.1 exactly. This is the width
5//! measure function fed to Taffy's layout engine. The spec is the JS source
6//! (`string-width/index.js`, `ansi-regex/index.js`, `get-east-asian-width/`);
7//! where this comment and the source disagree, the source wins.
8//!
9//! # Ported algorithm
10//!
11//! ANSI escapes are stripped first (unless `count_ansi_escape_codes`) via a
12//! faithful port of [`ansi-regex@6.2.2`] — the regex `strip-ansi@7`
13//! (string-width's dependency) delegates to. Then each [`Intl.Segmenter`]
14//! grapheme cluster (here: `unicode-segmentation`, empirically identical on the
15//! suspect classes) is measured by, in order:
16//!
17//! 1. **Zero-width cluster** — every char is `Default_Ignorable | Control |
18//!    Format | Mark | Surrogate` (`Surrogate` is unreachable inside a Rust
19//!    `&str`, which holds only scalar values). Tabs are `Control` → width 0.
20//! 2. **Emoji width 2** — `^\p{RGI_Emoji}$` (regex v-flag) OR
21//!    `isDoubleWidthNonRgiEmojiSequence`. `\p{RGI_Emoji}` has no Rust crate; it
22//!    is approximated by [`is_double_width_emoji`]'s rule-set (keycap, valid RGI
23//!    flag pair, ZWJ with ≥2 Extended_Pictographic, VS16-on-pictographic,
24//!    modifier-on-base). See that function for each rule's JS anchor.
25//! 3. **Hangul jamo** — modern L+V(+T) syllable blocks collapse to width 2;
26//!    unmatched jamo stay additive (`hangul_cluster_width`, ported exactly).
27//! 4. **East Asian Width** — `eastAsianWidth` of the first visible scalar, plus
28//!    each trailing Halfwidth/Fullwidth Forms char (U+FF00–U+FFEF) by its own
29//!    EAW (`trailing_halfwidth_width`).
30//!
31//! # Approximation boundary
32//!
33//! The only approximation is `\p{RGI_Emoji}` (replaced by [`is_double_width_emoji`]).
34//! Any RGI sequence the rule-set fails to classify as width 2 would diverge from
35//! Node; a ≥3000-case differential fuzz against Node string-width@8.2.1 (every
36//! RGI class, Indic, Hangul, prepend, HW/FW, combining, tabs/controls, ANSI,
37//! and random multi-class concatenations) found **zero** divergences. Every
38//! property and EAW range table is Node-derived (Node 24 / Unicode 16) with a
39//! provenance comment and regen recipe, mirroring `slice_ansi/tokenize_ansi.rs`.
40//!
41//! # Options
42//!
43//! `ambiguous_is_narrow` (default `true`): East Asian Ambiguous chars are narrow
44//! (1) unless set to `false` (CJK context → 2). `count_ansi_escape_codes`
45//! (default `false`): count escape bytes instead of stripping them.
46
47use std::sync::LazyLock;
48
49use regex::Regex;
50use unicode_segmentation::UnicodeSegmentation;
51
52/// Faithful port of [`ansi-regex@6.2.2`](https://github.com/chalk/ansi-regex)'s
53/// pattern (the regex `strip-ansi@7` — and thus string-width@8 — uses).
54///
55/// JS source (`ansi-regex/index.js`, version 6.2.2 verified in its
56/// `package.json`):
57///
58/// ```text
59/// const ST  = '(?:\\u0007|\\u001B\\u005C|\\u009C)';
60/// const osc = `(?:\\u001B\\][\\s\\S]*?${ST})`;
61/// const csi = '[\\u001B\\u009B][[\\]()#;?]*(?:\\d{1,4}(?:[;:]\\d{0,4})*)?[\\dA-PR-TZcf-nq-uy=><~]';
62/// const pattern = `${osc}|${csi}`;
63/// ```
64///
65/// Adaptations for the Rust `regex` crate (semantics preserved — no
66/// backreferences or lookaround are used by ansi-regex, verified):
67/// - `[\s\S]*?` → `[\x00-\x{10FFFF}]*?` — an explicit Unicode scalar range
68///   covering every code point including newlines, lazily matched; mirrors
69///   `[\s\S]` without relying on a DOTALL flag.
70/// - `[[\]()#;?]*` → `[\[\]()#;?]*` — escape the leading `[` inside the class.
71/// - `\d` → `[0-9]` — avoid needing the `unicode-perl` feature.
72///
73/// Alternation order (`osc` before `csi`) is preserved: Rust `regex` is
74/// leftmost-first, matching Node's first-alternative-wins on overlapping input
75/// (e.g. a generic OSC string is consumed wholesale, not split by the CSI arm).
76static ANSI_RE: LazyLock<Regex> = LazyLock::new(|| {
77    // ST: BEL | ESC '\' | 0x9C
78    const ST: &str = r"(?:\x07|\x1b\x5c|\x9c)";
79    let osc = format!(r"(?:\x1b\][\x00-\x{{10FFFF}}]*?{ST})");
80    let csi = r"[\x1b\x9b][\[\]()#;?]*(?:[0-9]{1,4}(?:[;:][0-9]{0,4})*)?[0-9A-PR-TZcf-nq-uy=><~]";
81    Regex::new(&format!("{osc}|{csi}")).expect("ANSI_RE is a valid regex")
82});
83
/// Options for [`string_width_with`] ([`string_width`] uses the defaults).
///
/// Both fields are plain flags, so the type is `Copy` and additionally
/// derives `PartialEq`, `Eq` and `Hash`: option sets can be compared directly
/// and used as (part of) a cache key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct Options {
    /// Treat East Asian Ambiguous characters as narrow (1 column).
    ///
    /// Default: `true` (non-CJK / terminal-generic context). When `false`,
    /// ambiguous characters are measured as wide (2 columns).
    pub ambiguous_is_narrow: bool,

    /// Count ANSI escape code bytes towards the width instead of stripping them.
    ///
    /// Default: `false`.
    pub count_ansi_escape_codes: bool,
}

impl Default for Options {
    /// The string-width defaults: ambiguous characters are narrow and ANSI
    /// escapes are stripped before measuring.
    fn default() -> Self {
        Self {
            ambiguous_is_narrow: true,
            count_ansi_escape_codes: false,
        }
    }
}
106
107/// Returns the visual column width of `input` as rendered by a monospace terminal.
108///
109/// Equivalent to `stringWidth(input)` from string-width@8 with default options.
110///
111/// # Examples
112///
113/// ```
114/// use inkferro_core::text::string_width::string_width;
115///
116/// assert_eq!(string_width("hello"), 5);
117/// assert_eq!(string_width("中文"), 4);
118/// assert_eq!(string_width("\x1b[31mred\x1b[0m"), 3);
119/// assert_eq!(string_width("😀"), 2);
120/// ```
121#[inline]
122pub fn string_width(input: &str) -> usize {
123    string_width_with(input, Options::default())
124}
125
126/// Returns the visual column width of `input` using the given options.
127///
128/// Equivalent to `stringWidth(input, options)` from string-width@8.
129pub fn string_width_with(input: &str, opts: Options) -> usize {
130    if input.is_empty() {
131        return 0;
132    }
133
134    // JS: strip ANSI only when an opener (ESC = U+001B / CSI = U+009B) is present.
135    let owned: String;
136    let s: &str = if !opts.count_ansi_escape_codes && ansi_present(input) {
137        owned = ANSI_RE.replace_all(input, "").into_owned();
138        &owned
139    } else {
140        input
141    };
142
143    if s.is_empty() {
144        return 0;
145    }
146
147    // JS fast path: `/^[ -~]*$/` → width equals byte length.
148    if is_all_printable_ascii(s) {
149        return s.len();
150    }
151
152    let ambiguous_as_wide = !opts.ambiguous_is_narrow;
153    let mut width = 0usize;
154
155    for segment in s.graphemes(true) {
156        // Fast path: a single printable-ASCII scalar is its own grapheme and is
157        // never zero-width (Control is 0x00–0x1F/0x7F; the DI/Format/Mark tables
158        // start at U+00AD/U+00AD/U+0300), never an emoji rule match (keycap
159        // needs U+20E3, flag/ZWJ/VS16/modifier need non-ASCII scalars), never
160        // hangul jamo, and EAW-narrow (the FW/Wide/Ambiguous tables start at
161        // U+3000/U+1100/U+00A1) with no trailing forms — so its width is
162        // exactly 1 on every path below. Identical output, no table walks.
163        if segment.len() == 1 && (0x20..=0x7E).contains(&segment.as_bytes()[0]) {
164            width += 1;
165            continue;
166        }
167
168        if is_zero_width_cluster(segment) {
169            continue;
170        }
171
172        if is_double_width_emoji(segment) {
173            width += 2;
174            continue;
175        }
176
177        let visible = base_visible(segment);
178
179        if let Some(hangul) = hangul_cluster_width(visible, ambiguous_as_wide) {
180            width += hangul;
181            continue;
182        }
183
184        // EAW of the first visible scalar, plus trailing Halfwidth/Fullwidth Forms.
185        let Some(first) = visible.chars().next() else {
186            continue;
187        };
188        width += east_asian_width(first as u32, ambiguous_as_wide);
189        width += trailing_halfwidth_width(visible, ambiguous_as_wide);
190    }
191
192    width
193}
194
/// Removes ANSI escape sequences from `input` with the production [`ANSI_RE`].
///
/// Test-only helper for cases that need the stripped text itself rather than
/// a width. It uses the same opener guard and the same regex as
/// [`string_width_with`], so stripping always agrees with width measurement.
/// Input without an opener is returned borrowed; otherwise the result is owned.
#[cfg(test)]
pub(crate) fn strip_ansi(input: &str) -> std::borrow::Cow<'_, str> {
    use std::borrow::Cow;

    if !ansi_present(input) {
        return Cow::Borrowed(input);
    }
    let stripped = ANSI_RE.replace_all(input, "").into_owned();
    Cow::Owned(stripped)
}
208
209// ─── ANSI / fast helpers ─────────────────────────────────────────────────────
210
/// Cheap pre-check mirroring the JS guard
/// `string.includes('\u001B') || string.includes('\u009B')`.
///
/// Returns `true` when `s` contains ESC (U+001B) or the C1 CSI (U+009B) —
/// the only two scalars an [`ANSI_RE`] match can start with.
#[inline]
fn ansi_present(s: &str) -> bool {
    const ESC: char = '\u{1B}';
    const C1_CSI: char = '\u{9B}';
    s.chars().any(|c| matches!(c, ESC | C1_CSI))
}
216
/// JS fast-path predicate `/^[ -~]*$/`: `true` when every byte of `s` is
/// printable ASCII (`b - 0x20 < 0x5F` ⟺ `0x20 <= b <= 0x7E`). The empty
/// string qualifies.
///
/// The result is accumulated branchlessly on purpose instead of using
/// `all(..)`: a per-byte early exit defeats auto-vectorization, and its scalar
/// codegen is alignment-luck-sensitive (±40% run-to-run as the surrounding
/// function changes). The accumulate form compiles to a stable SIMD scan;
/// measure-func inputs are short, so giving up the early exit on non-ASCII
/// input costs at most one full pass over a single line. `inline(never)`
/// keeps that codegen independent of the caller's body.
#[inline(never)]
fn is_all_printable_ascii(s: &str) -> bool {
    let mut printable = true;
    for &byte in s.as_bytes() {
        // Non-short-circuiting `&` keeps the loop body branch-free.
        printable &= byte.wrapping_sub(0x20) < 0x5F;
    }
    printable
}
231
232// ─── Range-table lookup ──────────────────────────────────────────────────────
233
/// Reports whether `cp` lies inside any `(lo, hi)` pair of `table`.
///
/// Both bounds are inclusive. The lookup is a binary search, so `table` is
/// expected to be sorted ascending with non-overlapping ranges.
fn in_ranges(table: &[(u32, u32)], cp: u32) -> bool {
    use std::cmp::Ordering;

    table
        .binary_search_by(|&(lo, hi)| match (cp < lo, cp > hi) {
            // The range sits entirely above `cp`: keep looking to the left.
            (true, _) => Ordering::Greater,
            // The range sits entirely below `cp`: keep looking to the right.
            (false, true) => Ordering::Less,
            // `lo <= cp <= hi`: found it.
            (false, false) => Ordering::Equal,
        })
        .is_ok()
}
248
249// ─── Zero-width / base-visible ───────────────────────────────────────────────
250
251/// JS `zeroWidthClusterRegex.test(c)` for a single scalar — `c` matches
252/// `\p{Default_Ignorable_Code_Point} | \p{Control} | \p{Format} | \p{Mark} |
253/// \p{Surrogate}`. `\p{Control}` = Cc = [`char::is_control`]; `\p{Surrogate}`
254/// is unreachable in `&str`.
255#[inline]
256fn is_zero_width_scalar(c: char) -> bool {
257    let cp = c as u32;
258    c.is_control()
259        || in_ranges(DEFAULT_IGNORABLE_RANGES, cp)
260        || in_ranges(FORMAT_RANGES, cp)
261        || in_ranges(MARK_RANGES, cp)
262}
263
264/// JS `isZeroWidthCluster(segment)`: `^(?:DI|Control|Format|Mark|Surrogate)+$`,
265/// i.e. every scalar is zero-width.
266#[inline]
267fn is_zero_width_cluster(segment: &str) -> bool {
268    segment.chars().all(is_zero_width_scalar)
269}
270
271/// JS `baseVisible(segment)`: strip the LEADING run of
272/// `[DI Control Format Mark Surrogate]` scalars.
273#[inline]
274fn base_visible(segment: &str) -> &str {
275    let mut end = 0;
276    for c in segment.chars() {
277        if is_zero_width_scalar(c) {
278            end += c.len_utf8();
279        } else {
280            break;
281        }
282    }
283    &segment[end..]
284}
285
286// ─── East Asian Width (get-east-asian-width@1.6.0) ───────────────────────────
287
288/// JS `eastAsianWidth(codePoint, {ambiguousAsWide})`: 2 if fullwidth, wide, or
289/// (ambiguousAsWide && ambiguous); otherwise 1.
290#[inline]
291fn east_asian_width(cp: u32, ambiguous_as_wide: bool) -> usize {
292    if in_ranges(EAW_FULLWIDTH_RANGES, cp)
293        || in_ranges(EAW_WIDE_RANGES, cp)
294        || (ambiguous_as_wide && in_ranges(EAW_AMBIGUOUS_RANGES, cp))
295    {
296        2
297    } else {
298        1
299    }
300}
301
302/// JS `trailingHalfwidthWidth(visibleSegment, …)`: skip the first scalar, then
303/// each subsequent scalar in U+FF00–U+FFEF contributes its own EAW.
304fn trailing_halfwidth_width(visible: &str, ambiguous_as_wide: bool) -> usize {
305    visible
306        .chars()
307        .skip(1)
308        .filter(|&c| ('\u{FF00}'..='\u{FFEF}').contains(&c))
309        .map(|c| east_asian_width(c as u32, ambiguous_as_wide))
310        .sum()
311}
312
313// ─── Hangul jamo (string-width hangulClusterWidth) ───────────────────────────
314
/// `true` for a leading (L) jamo: U+1100–U+115F or U+A960–U+A97C.
#[inline]
fn is_hangul_leading_jamo(cp: u32) -> bool {
    matches!(cp, 0x1100..=0x115F | 0xA960..=0xA97C)
}
319
/// `true` for a vowel (V) jamo: U+1160–U+11A7 or U+D7B0–U+D7C6.
#[inline]
fn is_hangul_vowel_jamo(cp: u32) -> bool {
    matches!(cp, 0x1160..=0x11A7 | 0xD7B0..=0xD7C6)
}
324
/// `true` for a trailing (T) jamo: U+11A8–U+11FF or U+D7CB–U+D7FB.
#[inline]
fn is_hangul_trailing_jamo(cp: u32) -> bool {
    matches!(cp, 0x11A8..=0x11FF | 0xD7CB..=0xD7FB)
}
329
330#[inline]
331fn is_hangul_jamo(cp: u32) -> bool {
332    is_hangul_leading_jamo(cp) || is_hangul_vowel_jamo(cp) || is_hangul_trailing_jamo(cp)
333}
334
335/// `Option`-aware vowel/trailing checks: a missing neighbour (`None`) is `false`,
336/// matching JS where `isHangulVowelJamo(undefined)` is `false`.
337#[inline]
338fn opt_is_vowel(cp: Option<u32>) -> bool {
339    cp.is_some_and(is_hangul_vowel_jamo)
340}
341
342#[inline]
343fn opt_is_trailing(cp: Option<u32>) -> bool {
344    cp.is_some_and(is_hangul_trailing_jamo)
345}
346
347/// JS `hangulClusterWidth(visibleSegment, …)`. Returns `None` when the cluster
348/// is not a (leading) jamo cluster (JS `undefined`), else its collapsed width.
349/// `inline(never)`: with the cheap early exit below, LLVM otherwise inlines
350/// this whole body (Vec alloc included) into `string_width_with`, which
351/// measurably de-optimizes the unrelated printable-ASCII fast-path scan there
352/// (+45% on pure-ASCII input). Keeping it out of line preserves the caller's
353/// codegen; the early exit still skips the per-grapheme allocation.
354#[inline(never)]
355fn hangul_cluster_width(visible: &str, ambiguous_as_wide: bool) -> Option<usize> {
356    // Early exit: `visible` already has its LEADING zero-width run stripped
357    // (see `base_visible`), so its first scalar is exactly `codePoints[0]` of
358    // the JS filter below. If that scalar is not hangul jamo, the loop's first
359    // iteration hits `!is_hangul_jamo(cp) && width == 0` and returns `None` —
360    // skip the per-grapheme Vec allocation for every non-jamo cluster. An
361    // empty `visible` also returns `None` either way. Output-identical.
362    let first = visible.chars().next()?;
363    if !is_hangul_jamo(first as u32) {
364        return None;
365    }
366
367    // JS: collect code points, skipping per-char zero-width scalars.
368    let code_points: Vec<u32> = visible
369        .chars()
370        .filter(|&c| !is_zero_width_scalar(c))
371        .map(|c| c as u32)
372        .collect();
373
374    if code_points.is_empty() {
375        return None;
376    }
377
378    let mut width = 0usize;
379    let mut index = 0usize;
380    while index < code_points.len() {
381        let cp = code_points[index];
382
383        if !is_hangul_jamo(cp) {
384            if width == 0 {
385                return None;
386            }
387            // Mixed cluster: EAW for the non-jamo remainder.
388            for &remaining in &code_points[index..] {
389                width += east_asian_width(remaining, ambiguous_as_wide);
390            }
391            return Some(width);
392        }
393
394        // Modern L+V(+T) collapses to one width-2 syllable block. JS advances
395        // `index += isTrailing ? 2 : 1` and the `for` loop then adds its own
396        // post-increment `+1`; here the `while` loop has no implicit step, so we
397        // fold that `+1` in: consume L+V (2 scalars) or L+V+T (3 scalars).
398        if is_hangul_leading_jamo(cp) && opt_is_vowel(code_points.get(index + 1).copied()) {
399            width += 2;
400            index += if opt_is_trailing(code_points.get(index + 2).copied()) {
401                3
402            } else {
403                2
404            };
405            continue;
406        }
407
408        // Unmatched jamo stays additive via its EAW.
409        width += east_asian_width(cp, ambiguous_as_wide);
410        index += 1;
411    }
412
413    Some(width)
414}
415
416// ─── Emoji width-2 rule-set (replaces \p{RGI_Emoji}) ─────────────────────────
417
/// U+200D ZERO WIDTH JOINER, the glue scalar of emoji ZWJ sequences.
const ZWJ: char = '\u{200D}';
/// U+FE0F VARIATION SELECTOR-16 (emoji presentation selector).
const VS16: char = '\u{FE0F}';
/// U+20E3 COMBINING ENCLOSING KEYCAP, the final scalar of a keycap sequence.
const COMBINING_ENCLOSING_KEYCAP: char = '\u{20E3}';
/// U+1F1E6 REGIONAL INDICATOR SYMBOL LETTER A — first Regional Indicator.
const REGIONAL_INDICATOR_A: u32 = 0x1F1E6;
/// U+1F1FF REGIONAL INDICATOR SYMBOL LETTER Z — last Regional Indicator.
const REGIONAL_INDICATOR_Z: u32 = 0x1F1FF;
423
424#[inline]
425fn is_extended_pictographic(cp: u32) -> bool {
426    in_ranges(EXTENDED_PICTOGRAPHIC_RANGES, cp)
427}
428
429#[inline]
430fn is_emoji_modifier_base(cp: u32) -> bool {
431    in_ranges(EMOJI_MODIFIER_BASE_RANGES, cp)
432}
433
/// `true` for an emoji (skin-tone) modifier, U+1F3FB–U+1F3FF.
#[inline]
fn is_emoji_modifier(cp: u32) -> bool {
    matches!(cp, 0x1F3FB..=0x1F3FF)
}
438
439#[inline]
440fn is_regional_indicator(cp: u32) -> bool {
441    (REGIONAL_INDICATOR_A..=REGIONAL_INDICATOR_Z).contains(&cp)
442}
443
444/// Whether the two-letter code for an RGI flag (derived from a Regional
445/// Indicator pair) is a valid RGI flag sequence.
446fn is_rgi_flag_pair(first: u32, second: u32) -> bool {
447    let a = (b'A' as u32 + first - REGIONAL_INDICATOR_A) as u8;
448    let b = (b'A' as u32 + second - REGIONAL_INDICATOR_A) as u8;
449    RGI_FLAG_PAIRS.binary_search(&[a, b]).is_ok()
450}
451
452/// JS `rgiEmojiRegex.test(segment) || isDoubleWidthNonRgiEmojiSequence(segment)`.
453///
454/// `\p{RGI_Emoji}` ships in no Rust crate, so it is approximated by this flat
455/// rule-set, validated against Node oracle probes and a ≥3000-case differential
456/// fuzz (0 divergences). Each rule cites its JS anchor:
457///
458/// - **length guard** — JS `isDoubleWidthNonRgiEmojiSequence` returns false for
459///   `segment.length > 50` (UTF-16 units). The longest real RGI sequence is 15
460///   UTF-16 units, so folding this guard over `rgiEmoji` is safe.
461/// - **keycap** — `^[\d#*](️)?⃣$` (union of `rgiEmoji`'s qualified form
462///   and `unqualifiedKeycapRegex`'s `^[\d#*]⃣$`).
463/// - **flag** — two Regional Indicators forming a valid RGI pair. An invalid
464///   pair / lone RI / 3 RIs falls through to the EAW path (each RI → 1).
465/// - **ZWJ** — `segment.includes('‍')` with ≥2 `Extended_Pictographic`
466///   matches (JS `isDoubleWidthNonRgiEmojiSequence`; also covers every RGI ZWJ
467///   sequence, all of which have ≥2 Extended_Pictographic).
468/// - **VS16** — visible segment is EXACTLY `[Extended_Pictographic, U+FE0F]`
469///   (digits/`#`/`*` are NOT `Extended_Pictographic`, so `1️`/`#️` → 1).
470/// - **modifier** — visible segment is EXACTLY `[Emoji_Modifier_Base, emoji
471///   modifier]` (U+1F3FB–U+1F3FF); covers narrow-EAW bases like `✌🏽`.
472///
473/// The VS16 and modifier rules are *anchored* (exactly two visible scalars)
474/// because `^\p{RGI_Emoji}$` matches the whole segment: a trailing ZWJ (e.g.
475/// `✌🏽\u{200D}`) breaks RGI validity, so Node takes the EAW path instead. The
476/// keycap, flag, and ZWJ rules are likewise anchored except ZWJ, which mirrors
477/// the un-anchored `isDoubleWidthNonRgiEmojiSequence`.
478///
479/// Tag flags (`🏴` + tags) and Emoji_Presentation singles need no rule: their
480/// base is EAW Wide → 2 via the EAW path; tag chars are Format (stripped/zero).
481fn is_double_width_emoji(segment: &str) -> bool {
482    // JS guard: pathological-length input is never a (short) emoji sequence.
483    if utf16_len(segment) > 50 {
484        return false;
485    }
486
487    if is_keycap_sequence(segment) {
488        return true;
489    }
490
491    if is_rgi_flag_sequence(segment) {
492        return true;
493    }
494
495    if segment.contains(ZWJ)
496        && segment
497            .chars()
498            .filter(|&c| is_extended_pictographic(c as u32))
499            .count()
500            >= 2
501    {
502        return true;
503    }
504
505    // VS16 and modifier rules are ANCHORED: `^\p{RGI_Emoji}$` requires the whole
506    // segment to be the emoji, so the minimal RGI VS16/modifier forms are exactly
507    // two visible scalars. A cluster like `✌\u{FE0F}\u{200D}…` (a trailing ZWJ
508    // breaking RGI validity) must NOT match — Node falls through to the EAW path.
509    // Longer fully-qualified forms contain ZWJ with ≥2 Extended_Pictographic and
510    // are already claimed by the (un-anchored) ZWJ rule above.
511    let visible = base_visible(segment);
512    is_vs16_sequence(visible) || is_modifier_sequence(visible)
513}
514
515/// Exactly `[Extended_Pictographic, U+FE0F]` (an RGI VS16 single). Digits/`#`/`*`
516/// are not `Extended_Pictographic`, so `1️`/`#️` correctly do NOT match.
517fn is_vs16_sequence(visible: &str) -> bool {
518    let mut chars = visible.chars();
519    let (Some(first), Some(VS16), None) = (chars.next(), chars.next(), chars.next()) else {
520        return false;
521    };
522    is_extended_pictographic(first as u32)
523}
524
525/// `^[\d#*](️)?⃣$` — qualified or unqualified keycap.
526fn is_keycap_sequence(segment: &str) -> bool {
527    let mut chars = segment.chars();
528    let Some(base) = chars.next() else {
529        return false;
530    };
531    if !matches!(base, '0'..='9' | '#' | '*') {
532        return false;
533    }
534    let next = chars.next();
535    let after = match next {
536        Some(VS16) => chars.next(),
537        other => other,
538    };
539    after == Some(COMBINING_ENCLOSING_KEYCAP) && chars.next().is_none()
540}
541
542/// Exactly two Regional Indicators forming a valid RGI flag pair.
543fn is_rgi_flag_sequence(segment: &str) -> bool {
544    let mut chars = segment.chars();
545    let (Some(a), Some(b), None) = (chars.next(), chars.next(), chars.next()) else {
546        return false;
547    };
548    let (a, b) = (a as u32, b as u32);
549    is_regional_indicator(a) && is_regional_indicator(b) && is_rgi_flag_pair(a, b)
550}
551
552/// Exactly `[Emoji_Modifier_Base, emoji modifier]` (U+1F3FB–U+1F3FF). Anchored:
553/// covers narrow-EAW bases like `✌🏽`, but not `✌🏽\u{200D}…` (handled by the
554/// ZWJ rule or the EAW fallback).
555fn is_modifier_sequence(visible: &str) -> bool {
556    let mut chars = visible.chars();
557    let (Some(first), Some(second), None) = (chars.next(), chars.next(), chars.next()) else {
558        return false;
559    };
560    is_emoji_modifier_base(first as u32) && is_emoji_modifier(second as u32)
561}
562
/// Length of `s` in UTF-16 code units — what JS `string.length` reports for
/// the same text (scalars outside the BMP count as two).
#[inline]
fn utf16_len(s: &str) -> usize {
    s.encode_utf16().count()
}
568
569// ─── Vendored Node-derived tables (Node 24, Unicode 16.0) ────────────────────
570//
571// Each table below is generated by enumerating every scalar value 0..=0x10FFFF
572// (skipping the surrogate gap U+D800–U+DFFF, unreachable in a Rust `&str`),
573// testing the corresponding predicate, and coalescing the true code points into
574// inclusive ranges. The exact generator was:
575//
576//   for (let cp = 0; cp <= 0x10FFFF; cp++) {
577//     if (cp >= 0xD800 && cp <= 0xDFFF) continue; // surrogates
578//     if (PREDICATE) { /* extend current range */ } else { /* close range */ }
579//   }
580//
581// with PREDICATE per table:
582//   DEFAULT_IGNORABLE_RANGES     /\p{Default_Ignorable_Code_Point}/v.test(chr)
583//   FORMAT_RANGES                /\p{Format}/v.test(chr)              (Cf)
584//   MARK_RANGES                  /\p{Mark}/v.test(chr)                (Mn+Mc+Me)
585//   EXTENDED_PICTOGRAPHIC_RANGES /\p{Extended_Pictographic}/v.test(chr)
586//   EMOJI_MODIFIER_BASE_RANGES   /\p{Emoji_Modifier_Base}/v.test(chr)
587//
588// and for the EAW tables, get-east-asian-width@1.6.0's own predicates:
589//   EAW_FULLWIDTH_RANGES         _isFullWidth(cp)
590//   EAW_WIDE_RANGES              _isWide(cp)
591//   EAW_AMBIGUOUS_RANGES         eastAsianWidthType(cp) === 'ambiguous'
592//
593// Run from `node_modules`-resolving dir with Node 24 (Unicode 16). Regenerate
594// when bumping the pinned string-width/get-east-asian-width versions.
595
596include!("string_width_tables.rs");
597
598// ─── Tests ──────────────────────────────────────────────────────────────────
599
600#[cfg(test)]
601mod tests {
602    use super::*;
603
604    fn narrow_false(input: &str) -> usize {
605        string_width_with(
606            input,
607            Options {
608                ambiguous_is_narrow: false,
609                ..Default::default()
610            },
611        )
612    }
613
614    // ── Originals (Node-pinned) ──────────────────────────────────────────────
615
616    #[test]
617    fn empty_string_is_zero() {
618        assert_eq!(string_width(""), 0);
619    }
620
621    #[test]
622    fn printable_ascii() {
623        assert_eq!(string_width("hello"), 5);
624    }
625
626    #[test]
627    fn cjk_ideographs() {
628        assert_eq!(string_width("中文"), 4); // node: 4
629    }
630
631    #[test]
632    fn ansi_colored_string_stripped() {
633        assert_eq!(string_width("\x1b[31mred\x1b[0m"), 3); // node: 3
634    }
635
636    #[test]
637    fn single_emoji_is_double_width() {
638        assert_eq!(string_width("😀"), 2); // node: 2
639    }
640
641    #[test]
642    fn keycap_one_is_double_width() {
643        assert_eq!(string_width("1\u{20E3}"), 2); // node: 2
644    }
645
646    #[test]
647    fn tab_is_zero_width() {
648        assert_eq!(string_width("\t"), 0); // node: 0
649    }
650
651    #[test]
652    fn fullwidth_latin_is_double_width() {
653        assert_eq!(string_width("ａ"), 2); // node: 2 (U+FF41 Fullwidth)
654    }
655
656    #[test]
657    fn zwj_family_emoji_is_double_width() {
658        assert_eq!(string_width("👨\u{200D}👩\u{200D}👧"), 2); // node: 2
659    }
660
661    #[test]
662    fn combining_acute_on_base_char() {
663        assert_eq!(string_width("e\u{0301}"), 1); // node: 1
664    }
665
666    #[test]
667    fn lone_combining_mark_is_zero_width() {
668        assert_eq!(string_width("\u{0301}"), 0); // node: 0
669    }
670
671    #[test]
672    fn ambiguous_narrow_by_default() {
673        assert_eq!(string_width("¡"), 1); // node: 1 (U+00A1 Ambiguous)
674    }
675
676    #[test]
677    fn ambiguous_wide_in_cjk_mode() {
678        assert_eq!(narrow_false("¡"), 2); // node (ambiguousIsNarrow:false): 2
679    }
680
681    #[test]
682    fn ellipsis_ambiguous_narrow() {
683        assert_eq!(string_width("…"), 1); // node: 1 (U+2026 Ambiguous)
684    }
685
686    #[test]
687    fn ellipsis_ambiguous_wide() {
688        assert_eq!(narrow_false("…"), 2); // node (ambiguousIsNarrow:false): 2
689    }
690
691    #[test]
692    fn keycap_variants() {
693        assert_eq!(string_width("#\u{20E3}"), 2); // node: 2
694        assert_eq!(string_width("*\u{20E3}"), 2); // node: 2
695        assert_eq!(string_width("0\u{20E3}"), 2); // node: 2
696        assert_eq!(string_width("9\u{20E3}"), 2); // node: 2
697    }
698
699    #[test]
700    fn emoji_modifier_sequence() {
701        assert_eq!(string_width("👍\u{1F3FB}"), 2); // node: 2
702    }
703
704    #[test]
705    fn control_chars_zero_width() {
706        assert_eq!(string_width("\n"), 0); // node: 0
707        assert_eq!(string_width("\r"), 0); // node: 0
708        assert_eq!(string_width("\x00"), 0); // node: 0
709    }
710
711    #[test]
712    fn mixed_ascii_and_wide() {
713        assert_eq!(string_width("hi中"), 4); // node: 4
714    }
715
716    #[test]
717    fn default_ignorable_zero_width() {
718        assert_eq!(string_width("\u{200B}"), 0); // node: 0 (ZWSP)
719        assert_eq!(string_width("\u{FEFF}"), 0); // node: 0 (BOM)
720    }
721
722    #[test]
723    fn complex_ansi_sequences() {
724        assert_eq!(string_width("\x1b[38;5;200mcolored\x1b[0m"), 7); // node: 7
725        assert_eq!(string_width("\x1b[1mbold\x1b[0m"), 4); // node: 4
726    }
727
728    #[test]
729    fn ansi_sgr_31m() {
730        assert_eq!(string_width("\x1b[31mX\x1b[0m"), 1); // node: 1
731    }
732
733    #[test]
734    fn ansi_sgr_1_31m() {
735        assert_eq!(string_width("\x1b[1;31mX\x1b[0m"), 1); // node: 1
736    }
737
738    #[test]
739    fn ansi_csi_hide_cursor() {
740        assert_eq!(string_width("\x1b[?25lX\x1b[?25h"), 1); // node: 1
741    }
742
743    #[test]
744    fn ansi_osc8_hyperlink() {
745        let s = "\x1b]8;;https://example.com\x07link\x1b]8;;\x07";
746        assert_eq!(string_width(s), 4); // node: 4
747    }
748
749    #[test]
750    fn keycap_fully_qualified() {
751        assert_eq!(string_width("1\u{FE0F}\u{20E3}"), 2); // node: 2
752        assert_eq!(string_width("#\u{FE0F}\u{20E3}"), 2); // node: 2
753        assert_eq!(string_width("*\u{FE0F}\u{20E3}"), 2); // node: 2
754    }
755
756    #[test]
757    fn halfwidth_katakana_with_voiced_mark() {
758        assert_eq!(string_width("\u{FF76}\u{FF9E}"), 2); // node: 2 (ｶﾞ)
759        assert_eq!(string_width("\u{FF76}\u{FF9F}"), 2); // node: 2 (ｶﾟ)
760    }
761
762    #[test]
763    fn soft_hyphen_format_category() {
764        assert_eq!(string_width("\u{00AD}"), 0); // node: 0
765        assert_eq!(string_width("a\u{00AD}b"), 2); // node: 2
766    }
767
768    #[test]
769    fn zero_width_joiners_and_ignorables() {
770        assert_eq!(string_width("\u{200C}"), 0); // node: 0 (ZWNJ)
771        assert_eq!(string_width("\u{200D}"), 0); // node: 0 (lone ZWJ)
772        assert_eq!(string_width("\u{FEFF}"), 0); // node: 0 (BOM)
773        assert_eq!(string_width("\u{200B}"), 0); // node: 0 (ZWSP)
774    }
775
776    // ── ANSI: upgraded / new exact pins ──────────────────────────────────────
777
778    #[test]
779    fn ansi_count_mode_includes_escapes() {
780        // Upgraded from `w > 3` to the exact Node value.
781        let opts = Options {
782            count_ansi_escape_codes: true,
783            ..Default::default()
784        };
785        assert_eq!(string_width_with("\x1b[31mred\x1b[0m", opts), 10); // node: 10
786    }
787
788    #[test]
789    fn ansi_count_mode_bare_sgr() {
790        // The JS suite's own exact case.
791        let opts = Options {
792            count_ansi_escape_codes: true,
793            ..Default::default()
794        };
795        assert_eq!(string_width_with("\x1b[31m", opts), 4); // node: 4
796    }
797
798    #[test]
799    fn ansi_osc_generic_strip() {
800        // ansi-regex@6.2.2 OSC branch strips a generic OSC string (incl. spaces)
801        // up to the first ST (here BEL). Pre-6.2.2 the old port over-counted.
802        assert_eq!(string_width("\x1b]0;My Title\x07hello"), 5); // node: 5
803        assert_eq!(string_width("\x1b]0;title\x07hello"), 5); // node: 5
804        // Window-title case: only the trailing visible "x" remains.
805        assert_eq!(string_width("\x1b]0;title with spaces\x07x"), 1); // node: 1
806    }
807
808    #[test]
809    fn ansi_colon_sgr_strip() {
810        // Colon-delimited SGR params (ansi-regex@6.2.2 `[;:]`).
811        assert_eq!(string_width("\x1b[38:2:1:2:3m "), 1); // node: 1
812    }
813
814    // ── Indic / prepend ──────────────────────────────────────────────────────
815
816    #[test]
817    fn indic_tamil_clusters() {
818        assert_eq!(string_width("நி"), 1); // node: 1
819        assert_eq!(string_width("நிநி"), 2); // node: 2
820        // Source-vs-prompt: prompt claimed 'க்‍ஷ'→1, but Node 8.2.1 segments it
821        // into two clusters (KA+virama+ZWJ, SSA) → 2. Pin the Node value.
822        assert_eq!(string_width("க்\u{200D}ஷ"), 2); // node: 2
823        assert_eq!(string_width("ி"), 0); // node: 0 (lone U+0BBF matra)
824    }
825
826    #[test]
827    fn arabic_prepend_mark() {
828        // U+0600 ARABIC NUMBER SIGN (Prepend / Format) + 'A' → one cluster, 1.
829        assert_eq!(string_width("\u{0600}A"), 1); // node: 1
830    }
831
832    // ── Emoji classes ────────────────────────────────────────────────────────
833
834    #[test]
835    fn zwj_minimally_qualified() {
836        assert_eq!(string_width("❤\u{200D}🔥"), 2); // node: 2 (heart-fire, no VS16)
837        assert_eq!(string_width("🏳\u{200D}🌈"), 2); // node: 2 (rainbow flag)
838        assert_eq!(string_width("👁\u{200D}🗨"), 2); // node: 2 (eye in speech)
839    }
840
841    #[test]
842    fn flag_sequences() {
843        assert_eq!(string_width("🇺🇸"), 2); // node: 2 (US, valid pair)
844        assert_eq!(string_width("🇦🇦"), 1); // node: 1 (AA, invalid pair → EAW)
845        assert_eq!(string_width("🇦"), 1); // node: 1 (lone RI)
846        assert_eq!(string_width("🇦🇺🇸"), 3); // node: 3 (3 RIs: AU pair + lone S)
847    }
848
849    #[test]
850    fn zwj_non_emoji_prefix() {
851        assert_eq!(string_width("a\u{200D}🔥"), 3); // node: 3 (a + ZWJ + fire)
852    }
853
854    // Anchoring regressions found by the differential fuzz: the VS16/modifier
855    // rules must require the EXACT minimal RGI form (2 visible scalars), not a
856    // prefix. A trailing ZWJ breaks `^\p{RGI_Emoji}$`, so Node falls to the EAW
857    // path.
858    #[test]
859    fn modifier_with_trailing_zwj_not_double() {
860        // ✌ + skin modifier + dangling ZWJ → not a complete RGI sequence.
861        // node: 1 (EAW of ✌ = 1; modifier/ZWJ chars are zero-width).
862        assert_eq!(string_width("\u{270C}\u{1F3FB}\u{200D}"), 1);
863    }
864
865    #[test]
866    fn emoji_with_trailing_halfwidth_keeps_eaw_extra() {
867        // 😀 + skin + VS16 + halfwidth voiced mark (one cluster). The rule must
868        // NOT fire and swallow the trailing Halfwidth Form: Node takes the EAW
869        // path → wide base (2) + trailing U+FF9E (1) = 3.
870        // node: 3
871        assert_eq!(string_width("\u{1F600}\u{1F3FB}\u{FE0F}\u{FF9E}"), 3);
872    }
873
874    #[test]
875    fn vs16_presentation() {
876        assert_eq!(string_width("✌"), 1); // node: 1 (text presentation default)
877        assert_eq!(string_width("✌\u{FE0F}"), 2); // node: 2 (VS16 → emoji)
878        assert_eq!(string_width("✌🏽"), 2); // node: 2 (modifier on narrow base)
879        assert_eq!(string_width("1\u{FE0F}"), 1); // node: 1 (digit not ExtPict)
880        assert_eq!(string_width("#\u{FE0F}"), 1); // node: 1 (# not ExtPict)
881        assert_eq!(string_width("1\u{FE0F}\u{20E3}"), 2); // node: 2 (keycap)
882        assert_eq!(string_width("🔥\u{FE0F}"), 2); // node: 2 (redundant VS16)
883        assert_eq!(string_width("❤"), 1); // node: 1 (text default)
884        assert_eq!(string_width("❤\u{FE0F}"), 2); // node: 2
885    }
886
887    #[test]
888    fn tag_flag_sequences() {
889        // 🏴 + GBSCT tag flag (Scotland): base U+1F3F4 Wide → 2; tags are Format.
890        let scotland = "🏴\u{E0067}\u{E0062}\u{E0073}\u{E0063}\u{E0074}\u{E007F}";
891        assert_eq!(string_width(scotland), 2); // node: 2
892        // Invalid/fake tag sequence still → 2 (base Wide, tags Format).
893        assert_eq!(string_width("🏴\u{E0041}\u{E007F}"), 2); // node: 2
894    }
895
896    #[test]
897    fn modifier_on_wide_base() {
898        assert_eq!(string_width("👍🏽"), 2); // node: 2
899    }
900
901    // ── Hangul jamo ──────────────────────────────────────────────────────────
902
903    #[test]
904    fn hangul_jamo_clusters() {
905        assert_eq!(string_width("\u{1100}\u{1161}"), 2); // node: 2 (L+V)
906        assert_eq!(string_width("\u{1100}\u{1100}\u{1161}"), 4); // node: 4 (L, L+V)
907        assert_eq!(string_width("\u{1161}"), 1); // node: 1 (V alone)
908        assert_eq!(string_width("\u{11A8}"), 1); // node: 1 (T alone)
909        assert_eq!(string_width("\u{1100}\u{1161}\u{11A8}"), 2); // node: 2 (L+V+T)
910    }
911
912    // ── HW/FW + combining + Thai ─────────────────────────────────────────────
913
914    #[test]
915    fn halfwidth_forms() {
916        assert_eq!(string_width("ｶﾞ"), 2); // node: 2 (HW ka + dakuten)
917        assert_eq!(string_width("ｱｰ"), 2); // node: 2 (HW a + prolonged mark)
918    }
919
920    #[test]
921    fn cjk_with_combining() {
922        assert_eq!(string_width("中\u{0300}"), 2); // node: 2 (CJK + combining grave)
923    }
924
925    #[test]
926    fn thai_sara_am() {
927        assert_eq!(string_width("กำ"), 1); // node: 1 (Thai + sara am)
928    }
929
930    // ── Tabs / controls embedded ─────────────────────────────────────────────
931
932    #[test]
933    fn tabs_are_zero_width() {
934        assert_eq!(string_width("a\tb"), 2); // node: 2
935        assert_eq!(string_width("a\t\tb"), 2); // node: 2
936        assert_eq!(string_width("\ta"), 1); // node: 1
937        assert_eq!(string_width("a\t"), 1); // node: 1
938        assert_eq!(string_width("\t\t"), 0); // node: 0
939    }
940
941    // ── Adversarial: emoji/jamo orphan-cluster boundaries (Node-pinned) ───────
942
943    // L+T with no intervening V must NOT collapse to a syllable block: the L+V
944    // collapse guard in `hangul_cluster_width` fails, so each orphan jamo stays
945    // additive via EAW (L=2 Wide + T=1).
946    #[test]
947    fn hangul_leading_plus_trailing_without_vowel_is_additive() {
948        assert_eq!(string_width("\u{1100}\u{11A8}"), 3); // node: string-width@8.2.1 => 3
949    }
950
951    // ── Hangul early-exit discriminators (Node-pinned) ───────────────────────
952    // These pin the "first VISIBLE scalar decides jamo-vs-not" contract: the
953    // early exit must key on the first scalar AFTER the leading zero-width
954    // strip (JS codePoints[0] of the stripped cluster), and precomposed
955    // syllables (U+AC00..) must NOT be treated as jamo even when jamo follows.
956
957    #[test]
958    fn hangul_filler_leading_jamo_cluster() {
959        // U+115F CHOSEONG FILLER is BOTH jamo and Default_Ignorable — the
960        // zero-width strip removes it, and the jamo decision falls to U+1161.
961        assert_eq!(string_width("\u{115F}\u{1161}"), 1); // node: 1
962    }
963
964    #[test]
965    fn hangul_jamo_then_precomposed_cluster_is_additive() {
966        // First visible scalar IS jamo → full hangul body runs over the
967        // whole cluster (a naive "bail unless pure jamo" check breaks this).
968        assert_eq!(string_width("\u{1100}\u{AC00}"), 4); // node: 4
969    }
970
971    #[test]
972    fn precomposed_then_jamo_cluster_takes_eaw_path() {
973        // First visible scalar U+AC00 is NOT jamo → the early exit must fire
974        // and the cluster falls through to the EAW path (wide, 2).
975        assert_eq!(string_width("\u{AC00}\u{1161}"), 2); // node: 2
976    }
977
978    // ── DEL boundary of the printable-ASCII fast paths (Node-pinned) ─────────
979
980    #[test]
981    fn del_is_zero_width() {
982        // 0x7F is the first byte EXCLUDED by `b - 0x20 < 0x5F`; it is Control
983        // and must not take either ASCII fast path.
984        assert_eq!(string_width("\u{7F}"), 0); // node: 0
985    }
986
987    #[test]
988    fn del_plus_ascii_counts_only_the_ascii() {
989        assert_eq!(string_width("\u{7F}a"), 1); // node: 1
990    }
991
992    // A bare skin-tone modifier with no base is itself Emoji_Presentation and
993    // renders width 2 — guards the orphan-modifier emoji-presentation path.
994    #[test]
995    fn lone_emoji_modifier_is_double_width() {
996        assert_eq!(string_width("\u{1F3FB}"), 2); // node: string-width@8.2.1 => 2
997    }
998
999    // RI followed by a skin modifier is neither a valid flag pair nor a modifier
1000    // sequence (the base is not Emoji_Modifier_Base), so it falls through to EAW.
1001    // Stresses the is_rgi_flag_sequence + is_modifier_sequence anchoring.
1002    #[test]
1003    fn regional_indicator_plus_modifier_not_flag() {
1004        assert_eq!(string_width("\u{1F1E6}\u{1F3FB}"), 1); // node: string-width@8.2.1 => 1
1005    }
1006
1007    // RTL override + ZWJ + RI + modifier + ZWJ + RI: exercises the
1008    // ZWJ-with-≥2-Extended_Pictographic emoji rule together with grapheme
1009    // segmentation — exactly where the documented \p{RGI_Emoji} approximation
1010    // could diverge from Node. It does not.
1011    #[test]
1012    fn rtl_override_zwj_flag_modifier_garbage_chain() {
1013        assert_eq!(
1014            string_width("\u{202E}\u{200D}\u{1F1E6}\u{1F3FB}\u{200D}\u{1F1FF}"),
1015            2
1016        ); // node: string-width@8.2.1 => 2
1017    }
1018
1019    // With count_ansi_escape_codes:true an unterminated OSC sequence is NOT
1020    // stripped; every char counts (ESC is width 0, the 12 trailing chars width 1
1021    // each). Pins the count-mode branch against malformed escape input.
1022    #[test]
1023    fn count_ansi_mode_unterminated_osc_counts_bytes() {
1024        let opts = Options {
1025            count_ansi_escape_codes: true,
1026            ..Default::default()
1027        };
1028        assert_eq!(string_width_with("\x1b]8;;http://x", opts), 12); // node: stringWidth(s,{countAnsiEscapeCodes:true}) => 12
1029        assert_eq!(string_width("\x1b]8;;http://x"), 7); // node (default/strip): 7
1030    }
1031
1032    // ESC sequence immediately after an emoji: strip mode keeps the emoji at
1033    // width 2, count mode adds the 4-char SGR (ESC width 0 + "[0m" width 3).
1034    // Pins that count mode does not corrupt the emoji grapheme width.
1035    #[test]
1036    fn esc_mid_emoji_count_vs_strip() {
1037        let count = Options {
1038            count_ansi_escape_codes: true,
1039            ..Default::default()
1040        };
1041        assert_eq!(string_width("\u{1F600}\x1b[0m"), 2); // node: 2
1042        assert_eq!(string_width_with("\u{1F600}\x1b[0m", count), 5); // node: 5
1043    }
1044
1045    // ── Adversarial: no-panic totality ───────────────────────────────────────
1046
1047    // A panic in this width fn (slice in base_visible, index math in
1048    // hangul_cluster_width, regex over pathological ANSI) would kill the host
1049    // terminal renderer. Every &str input — malformed escapes, NUL+combining,
1050    // garbage emoji chains, orphan jamo, and 10 MB stress strings — must return
1051    // a usize without panic, in both default and count_ansi_escape_codes modes.
1052    // A panic fails the test; reaching the final assert proves totality.
1053    #[test]
1054    fn no_panic_on_adversarial_battery() {
1055        let count = Options {
1056            count_ansi_escape_codes: true,
1057            ..Default::default()
1058        };
1059        let small = [
1060            "abc\x1b",                                             // ESC at EOS
1061            "\x1b]8;;http://x",                                    // unterminated OSC
1062            "\x1b[",                                               // unterminated CSI
1063            "\x1b[38;5;",                                          // incomplete CSI
1064            "a\x00b",                                              // NUL
1065            "\x00\u{0301}",                                        // NUL + combining
1066            "\u{202E}\u{200D}\u{1F1E6}\u{1F3FB}\u{200D}\u{1F1FF}", // RTL+ZWJ+flag+modifier garbage
1067            "\u{1100}\u{11A8}",                                    // Hangul L+T orphan
1068        ];
1069        let big = [
1070            "a".repeat(10_000_000),
1071            "中".repeat(3_000_000),
1072            "\x1b".repeat(10_000_000),
1073        ];
1074        for s in small.iter().map(|s| s.to_string()).chain(big) {
1075            let _: usize = string_width(&s);
1076            let _: usize = string_width_with(&s, count);
1077        }
1078    }
1079}
inkferro_core/text/string_width.rs

inkferro_core/text/
string_width.rs