inkferro_core/text/string_width.rs
1//! Port of [`string-width@8`](https://github.com/sindresorhus/string-width) to Rust.
2//!
3//! Computes the visual column width of a string as rendered by a terminal,
4//! matching the semantics of string-width@8.2.1 exactly. This is the width
5//! measure function fed to Taffy's layout engine. The spec is the JS source
6//! (`string-width/index.js`, `ansi-regex/index.js`, `get-east-asian-width/`);
7//! where this comment and the source disagree, the source wins.
8//!
9//! # Ported algorithm
10//!
11//! ANSI escapes are stripped first (unless `count_ansi_escape_codes`) via a
12//! faithful port of [`ansi-regex@6.2.2`] — the regex `strip-ansi@7`
13//! (string-width's dependency) delegates to. Then each [`Intl.Segmenter`]
14//! grapheme cluster (here: `unicode-segmentation`, empirically identical on the
15//! suspect classes) is measured by, in order:
16//!
17//! 1. **Zero-width cluster** — every char is `Default_Ignorable | Control |
18//! Format | Mark | Surrogate` (`Surrogate` is unreachable inside a Rust
19//! `&str`, which holds only scalar values). Tabs are `Control` → width 0.
20//! 2. **Emoji width 2** — `^\p{RGI_Emoji}$` (regex v-flag) OR
21//! `isDoubleWidthNonRgiEmojiSequence`. `\p{RGI_Emoji}` has no Rust crate; it
22//! is approximated by [`is_double_width_emoji`]'s rule-set (keycap, valid RGI
23//! flag pair, ZWJ with ≥2 Extended_Pictographic, VS16-on-pictographic,
24//! modifier-on-base). See that function for each rule's JS anchor.
25//! 3. **Hangul jamo** — modern L+V(+T) syllable blocks collapse to width 2;
26//! unmatched jamo stay additive (`hangul_cluster_width`, ported exactly).
27//! 4. **East Asian Width** — `eastAsianWidth` of the first visible scalar, plus
28//! each trailing Halfwidth/Fullwidth Forms char (U+FF00–U+FFEF) by its own
29//! EAW (`trailing_halfwidth_width`).
30//!
31//! # Approximation boundary
32//!
33//! The only approximation is `\p{RGI_Emoji}` (replaced by [`is_double_width_emoji`]).
34//! Any RGI sequence the rule-set fails to classify as width 2 would diverge from
35//! Node; a ≥3000-case differential fuzz against Node string-width@8.2.1 (every
36//! RGI class, Indic, Hangul, prepend, HW/FW, combining, tabs/controls, ANSI,
37//! and random multi-class concatenations) found **zero** divergences. Every
38//! property and EAW range table is Node-derived (Node 24 / Unicode 16) with a
39//! provenance comment and regen recipe, mirroring `slice_ansi/tokenize_ansi.rs`.
40//!
41//! # Options
42//!
43//! `ambiguous_is_narrow` (default `true`): East Asian Ambiguous chars are narrow
44//! (1) unless set to `false` (CJK context → 2). `count_ansi_escape_codes`
45//! (default `false`): count escape bytes instead of stripping them.
46
47use std::sync::LazyLock;
48
49use regex::Regex;
50use unicode_segmentation::UnicodeSegmentation;
51
52/// Faithful port of [`ansi-regex@6.2.2`](https://github.com/chalk/ansi-regex)'s
53/// pattern (the regex `strip-ansi@7` — and thus string-width@8 — uses).
54///
55/// JS source (`ansi-regex/index.js`, version 6.2.2 verified in its
56/// `package.json`):
57///
58/// ```text
59/// const ST = '(?:\\u0007|\\u001B\\u005C|\\u009C)';
60/// const osc = `(?:\\u001B\\][\\s\\S]*?${ST})`;
61/// const csi = '[\\u001B\\u009B][[\\]()#;?]*(?:\\d{1,4}(?:[;:]\\d{0,4})*)?[\\dA-PR-TZcf-nq-uy=><~]';
62/// const pattern = `${osc}|${csi}`;
63/// ```
64///
65/// Adaptations for the Rust `regex` crate (semantics preserved — no
66/// backreferences or lookaround are used by ansi-regex, verified):
67/// - `[\s\S]*?` → `[\x00-\x{10FFFF}]*?` — an explicit Unicode scalar range
68/// covering every code point including newlines, lazily matched; mirrors
69/// `[\s\S]` without relying on a DOTALL flag.
70/// - `[[\]()#;?]*` → `[\[\]()#;?]*` — escape the leading `[` inside the class.
71/// - `\d` → `[0-9]` — avoid needing the `unicode-perl` feature.
72///
73/// Alternation order (`osc` before `csi`) is preserved: Rust `regex` is
74/// leftmost-first, matching Node's first-alternative-wins on overlapping input
75/// (e.g. a generic OSC string is consumed wholesale, not split by the CSI arm).
76static ANSI_RE: LazyLock<Regex> = LazyLock::new(|| {
77 // ST: BEL | ESC '\' | 0x9C
78 const ST: &str = r"(?:\x07|\x1b\x5c|\x9c)";
79 let osc = format!(r"(?:\x1b\][\x00-\x{{10FFFF}}]*?{ST})");
80 let csi = r"[\x1b\x9b][\[\]()#;?]*(?:[0-9]{1,4}(?:[;:][0-9]{0,4})*)?[0-9A-PR-TZcf-nq-uy=><~]";
81 Regex::new(&format!("{osc}|{csi}")).expect("ANSI_RE is a valid regex")
82});
83
/// Tunables accepted by [`string_width_with`].
#[derive(Debug, Clone, Copy)]
pub struct Options {
    /// When `true`, East Asian Ambiguous characters take one column; when
    /// `false` they are measured as wide (two columns).
    ///
    /// Default: `true` (non-CJK / terminal-generic context).
    pub ambiguous_is_narrow: bool,

    /// When `true`, ANSI escape sequences are left in the input and measured
    /// like any other text instead of being stripped first.
    ///
    /// Default: `false`.
    pub count_ansi_escape_codes: bool,
}

impl Default for Options {
    /// Ambiguous characters narrow, ANSI escapes stripped.
    fn default() -> Self {
        let ambiguous_is_narrow = true;
        let count_ansi_escape_codes = false;
        Self {
            ambiguous_is_narrow,
            count_ansi_escape_codes,
        }
    }
}
106
107/// Returns the visual column width of `input` as rendered by a monospace terminal.
108///
109/// Equivalent to `stringWidth(input)` from string-width@8 with default options.
110///
111/// # Examples
112///
113/// ```
114/// use inkferro_core::text::string_width::string_width;
115///
116/// assert_eq!(string_width("hello"), 5);
117/// assert_eq!(string_width("中文"), 4);
118/// assert_eq!(string_width("\x1b[31mred\x1b[0m"), 3);
119/// assert_eq!(string_width("😀"), 2);
120/// ```
121#[inline]
122pub fn string_width(input: &str) -> usize {
123 string_width_with(input, Options::default())
124}
125
126/// Returns the visual column width of `input` using the given options.
127///
128/// Equivalent to `stringWidth(input, options)` from string-width@8.
129pub fn string_width_with(input: &str, opts: Options) -> usize {
130 if input.is_empty() {
131 return 0;
132 }
133
134 // JS: strip ANSI only when an opener (ESC = U+001B / CSI = U+009B) is present.
135 let owned: String;
136 let s: &str = if !opts.count_ansi_escape_codes && ansi_present(input) {
137 owned = ANSI_RE.replace_all(input, "").into_owned();
138 &owned
139 } else {
140 input
141 };
142
143 if s.is_empty() {
144 return 0;
145 }
146
147 // JS fast path: `/^[ -~]*$/` → width equals byte length.
148 if is_all_printable_ascii(s) {
149 return s.len();
150 }
151
152 let ambiguous_as_wide = !opts.ambiguous_is_narrow;
153 let mut width = 0usize;
154
155 for segment in s.graphemes(true) {
156 // Fast path: a single printable-ASCII scalar is its own grapheme and is
157 // never zero-width (Control is 0x00–0x1F/0x7F; the DI/Format/Mark tables
158 // start at U+00AD/U+00AD/U+0300), never an emoji rule match (keycap
159 // needs U+20E3, flag/ZWJ/VS16/modifier need non-ASCII scalars), never
160 // hangul jamo, and EAW-narrow (the FW/Wide/Ambiguous tables start at
161 // U+3000/U+1100/U+00A1) with no trailing forms — so its width is
162 // exactly 1 on every path below. Identical output, no table walks.
163 if segment.len() == 1 && (0x20..=0x7E).contains(&segment.as_bytes()[0]) {
164 width += 1;
165 continue;
166 }
167
168 if is_zero_width_cluster(segment) {
169 continue;
170 }
171
172 if is_double_width_emoji(segment) {
173 width += 2;
174 continue;
175 }
176
177 let visible = base_visible(segment);
178
179 if let Some(hangul) = hangul_cluster_width(visible, ambiguous_as_wide) {
180 width += hangul;
181 continue;
182 }
183
184 // EAW of the first visible scalar, plus trailing Halfwidth/Fullwidth Forms.
185 let Some(first) = visible.chars().next() else {
186 continue;
187 };
188 width += east_asian_width(first as u32, ambiguous_as_wide);
189 width += trailing_halfwidth_width(visible, ambiguous_as_wide);
190 }
191
192 width
193}
194
/// Strip ANSI escape sequences from `input` using the production [`ANSI_RE`].
///
/// Exposed for tests that need stripped text rather than a width count.
/// Uses the same regex as [`string_width_with`], so stripping is always
/// consistent with width measurement.
///
/// Returns a borrow of `input` whenever nothing was removed (no opener
/// present, or an opener that is not part of a matching sequence); an owned
/// string is produced only when at least one escape sequence was stripped.
#[cfg(test)]
pub(crate) fn strip_ansi(input: &str) -> std::borrow::Cow<'_, str> {
    // Cheap guard: skip the regex entirely when no ESC/CSI opener exists.
    if !ansi_present(input) {
        return std::borrow::Cow::Borrowed(input);
    }
    // `replace_all` already yields a `Cow` tied to `input`; pass it through
    // rather than forcing an allocation with `into_owned()`.
    ANSI_RE.replace_all(input, "")
}
208
209// ─── ANSI / fast helpers ─────────────────────────────────────────────────────
210
/// Cheap pre-check mirroring JS
/// `string.includes('\u001B') || string.includes('\u009B')`: reports whether
/// `s` contains an escape opener (ESC = U+001B or CSI = U+009B) at all.
#[inline]
fn ansi_present(s: &str) -> bool {
    s.contains(['\u{1B}', '\u{9B}'])
}
216
/// JS fast-path predicate `/^[ -~]*$/`: `true` when every byte of `s` is
/// printable ASCII (`b - 0x20 < 0x5F` ⟺ `0x20 <= b <= 0x7E`). The empty string
/// is `true`.
///
/// Deliberately branchless: the result is AND-accumulated over every byte
/// rather than exiting at the first failure. Per the original measurements, a
/// per-byte early exit defeats auto-vectorization and its scalar codegen is
/// alignment-luck-sensitive (±40% run-to-run as the surrounding function
/// changes). Measure-func inputs are short, so the lost early exit on
/// non-ASCII input costs at most one full pass over a single line.
/// `inline(never)` keeps that codegen independent of the caller's body.
#[inline(never)]
fn is_all_printable_ascii(s: &str) -> bool {
    let mut printable = true;
    for &byte in s.as_bytes() {
        // Wrapping subtraction folds both range bounds into one comparison.
        printable &= byte.wrapping_sub(0x20) < 0x5F;
    }
    printable
}
231
232// ─── Range-table lookup ──────────────────────────────────────────────────────
233
/// Reports whether `cp` lies inside any inclusive `(lo, hi)` range of `table`.
///
/// `table` must be sorted and non-overlapping; the lookup is a binary search.
fn in_ranges(table: &[(u32, u32)], cp: u32) -> bool {
    use std::cmp::Ordering;

    table
        .binary_search_by(|range| match *range {
            // Range sits entirely above `cp`: keep searching to the left.
            (lo, _) if cp < lo => Ordering::Greater,
            // Range sits entirely below `cp`: keep searching to the right.
            (_, hi) if cp > hi => Ordering::Less,
            _ => Ordering::Equal,
        })
        .is_ok()
}
248
249// ─── Zero-width / base-visible ───────────────────────────────────────────────
250
251/// JS `zeroWidthClusterRegex.test(c)` for a single scalar — `c` matches
252/// `\p{Default_Ignorable_Code_Point} | \p{Control} | \p{Format} | \p{Mark} |
253/// \p{Surrogate}`. `\p{Control}` = Cc = [`char::is_control`]; `\p{Surrogate}`
254/// is unreachable in `&str`.
255#[inline]
256fn is_zero_width_scalar(c: char) -> bool {
257 let cp = c as u32;
258 c.is_control()
259 || in_ranges(DEFAULT_IGNORABLE_RANGES, cp)
260 || in_ranges(FORMAT_RANGES, cp)
261 || in_ranges(MARK_RANGES, cp)
262}
263
264/// JS `isZeroWidthCluster(segment)`: `^(?:DI|Control|Format|Mark|Surrogate)+$`,
265/// i.e. every scalar is zero-width.
266#[inline]
267fn is_zero_width_cluster(segment: &str) -> bool {
268 segment.chars().all(is_zero_width_scalar)
269}
270
271/// JS `baseVisible(segment)`: strip the LEADING run of
272/// `[DI Control Format Mark Surrogate]` scalars.
273#[inline]
274fn base_visible(segment: &str) -> &str {
275 let mut end = 0;
276 for c in segment.chars() {
277 if is_zero_width_scalar(c) {
278 end += c.len_utf8();
279 } else {
280 break;
281 }
282 }
283 &segment[end..]
284}
285
286// ─── East Asian Width (get-east-asian-width@1.6.0) ───────────────────────────
287
288/// JS `eastAsianWidth(codePoint, {ambiguousAsWide})`: 2 if fullwidth, wide, or
289/// (ambiguousAsWide && ambiguous); otherwise 1.
290#[inline]
291fn east_asian_width(cp: u32, ambiguous_as_wide: bool) -> usize {
292 if in_ranges(EAW_FULLWIDTH_RANGES, cp)
293 || in_ranges(EAW_WIDE_RANGES, cp)
294 || (ambiguous_as_wide && in_ranges(EAW_AMBIGUOUS_RANGES, cp))
295 {
296 2
297 } else {
298 1
299 }
300}
301
302/// JS `trailingHalfwidthWidth(visibleSegment, …)`: skip the first scalar, then
303/// each subsequent scalar in U+FF00–U+FFEF contributes its own EAW.
304fn trailing_halfwidth_width(visible: &str, ambiguous_as_wide: bool) -> usize {
305 visible
306 .chars()
307 .skip(1)
308 .filter(|&c| ('\u{FF00}'..='\u{FFEF}').contains(&c))
309 .map(|c| east_asian_width(c as u32, ambiguous_as_wide))
310 .sum()
311}
312
313// ─── Hangul jamo (string-width hangulClusterWidth) ───────────────────────────
314
/// `true` for the leading-consonant (L) jamo ranges U+1100–U+115F and
/// U+A960–U+A97C.
#[inline]
fn is_hangul_leading_jamo(cp: u32) -> bool {
    matches!(cp, 0x1100..=0x115F | 0xA960..=0xA97C)
}
319
/// `true` for the vowel (V) jamo ranges U+1160–U+11A7 and U+D7B0–U+D7C6.
#[inline]
fn is_hangul_vowel_jamo(cp: u32) -> bool {
    matches!(cp, 0x1160..=0x11A7 | 0xD7B0..=0xD7C6)
}
324
/// `true` for the trailing-consonant (T) jamo ranges U+11A8–U+11FF and
/// U+D7CB–U+D7FB.
#[inline]
fn is_hangul_trailing_jamo(cp: u32) -> bool {
    matches!(cp, 0x11A8..=0x11FF | 0xD7CB..=0xD7FB)
}
329
330#[inline]
331fn is_hangul_jamo(cp: u32) -> bool {
332 is_hangul_leading_jamo(cp) || is_hangul_vowel_jamo(cp) || is_hangul_trailing_jamo(cp)
333}
334
335/// `Option`-aware vowel/trailing checks: a missing neighbour (`None`) is `false`,
336/// matching JS where `isHangulVowelJamo(undefined)` is `false`.
337#[inline]
338fn opt_is_vowel(cp: Option<u32>) -> bool {
339 cp.is_some_and(is_hangul_vowel_jamo)
340}
341
342#[inline]
343fn opt_is_trailing(cp: Option<u32>) -> bool {
344 cp.is_some_and(is_hangul_trailing_jamo)
345}
346
347/// JS `hangulClusterWidth(visibleSegment, …)`. Returns `None` when the cluster
348/// is not a (leading) jamo cluster (JS `undefined`), else its collapsed width.
349/// `inline(never)`: with the cheap early exit below, LLVM otherwise inlines
350/// this whole body (Vec alloc included) into `string_width_with`, which
351/// measurably de-optimizes the unrelated printable-ASCII fast-path scan there
352/// (+45% on pure-ASCII input). Keeping it out of line preserves the caller's
353/// codegen; the early exit still skips the per-grapheme allocation.
354#[inline(never)]
355fn hangul_cluster_width(visible: &str, ambiguous_as_wide: bool) -> Option<usize> {
356 // Early exit: `visible` already has its LEADING zero-width run stripped
357 // (see `base_visible`), so its first scalar is exactly `codePoints[0]` of
358 // the JS filter below. If that scalar is not hangul jamo, the loop's first
359 // iteration hits `!is_hangul_jamo(cp) && width == 0` and returns `None` —
360 // skip the per-grapheme Vec allocation for every non-jamo cluster. An
361 // empty `visible` also returns `None` either way. Output-identical.
362 let first = visible.chars().next()?;
363 if !is_hangul_jamo(first as u32) {
364 return None;
365 }
366
367 // JS: collect code points, skipping per-char zero-width scalars.
368 let code_points: Vec<u32> = visible
369 .chars()
370 .filter(|&c| !is_zero_width_scalar(c))
371 .map(|c| c as u32)
372 .collect();
373
374 if code_points.is_empty() {
375 return None;
376 }
377
378 let mut width = 0usize;
379 let mut index = 0usize;
380 while index < code_points.len() {
381 let cp = code_points[index];
382
383 if !is_hangul_jamo(cp) {
384 if width == 0 {
385 return None;
386 }
387 // Mixed cluster: EAW for the non-jamo remainder.
388 for &remaining in &code_points[index..] {
389 width += east_asian_width(remaining, ambiguous_as_wide);
390 }
391 return Some(width);
392 }
393
394 // Modern L+V(+T) collapses to one width-2 syllable block. JS advances
395 // `index += isTrailing ? 2 : 1` and the `for` loop then adds its own
396 // post-increment `+1`; here the `while` loop has no implicit step, so we
397 // fold that `+1` in: consume L+V (2 scalars) or L+V+T (3 scalars).
398 if is_hangul_leading_jamo(cp) && opt_is_vowel(code_points.get(index + 1).copied()) {
399 width += 2;
400 index += if opt_is_trailing(code_points.get(index + 2).copied()) {
401 3
402 } else {
403 2
404 };
405 continue;
406 }
407
408 // Unmatched jamo stays additive via its EAW.
409 width += east_asian_width(cp, ambiguous_as_wide);
410 index += 1;
411 }
412
413 Some(width)
414}
415
416// ─── Emoji width-2 rule-set (replaces \p{RGI_Emoji}) ─────────────────────────
417
/// U+200D ZERO WIDTH JOINER, the glue scalar of emoji ZWJ sequences.
const ZWJ: char = '\u{200D}';
/// U+FE0F VARIATION SELECTOR-16, which requests emoji presentation.
const VS16: char = '\u{FE0F}';
/// U+20E3 COMBINING ENCLOSING KEYCAP, the final scalar of a keycap sequence.
const COMBINING_ENCLOSING_KEYCAP: char = '\u{20E3}';
/// First Regional Indicator code point (the one standing for the letter A).
const REGIONAL_INDICATOR_A: u32 = 0x1F1E6;
/// Last Regional Indicator code point (the one standing for the letter Z).
const REGIONAL_INDICATOR_Z: u32 = 0x1F1FF;
423
/// `true` when `cp` is listed in `EXTENDED_PICTOGRAPHIC_RANGES`, the vendored
/// table for `\p{Extended_Pictographic}`.
#[inline]
fn is_extended_pictographic(cp: u32) -> bool {
    in_ranges(EXTENDED_PICTOGRAPHIC_RANGES, cp)
}
428
/// `true` when `cp` is listed in `EMOJI_MODIFIER_BASE_RANGES`, the vendored
/// table for `\p{Emoji_Modifier_Base}`.
#[inline]
fn is_emoji_modifier_base(cp: u32) -> bool {
    in_ranges(EMOJI_MODIFIER_BASE_RANGES, cp)
}
433
/// `true` for the emoji modifier scalars U+1F3FB–U+1F3FF.
#[inline]
fn is_emoji_modifier(cp: u32) -> bool {
    matches!(cp, 0x1F3FB..=0x1F3FF)
}
438
439#[inline]
440fn is_regional_indicator(cp: u32) -> bool {
441 (REGIONAL_INDICATOR_A..=REGIONAL_INDICATOR_Z).contains(&cp)
442}
443
444/// Whether the two-letter code for an RGI flag (derived from a Regional
445/// Indicator pair) is a valid RGI flag sequence.
446fn is_rgi_flag_pair(first: u32, second: u32) -> bool {
447 let a = (b'A' as u32 + first - REGIONAL_INDICATOR_A) as u8;
448 let b = (b'A' as u32 + second - REGIONAL_INDICATOR_A) as u8;
449 RGI_FLAG_PAIRS.binary_search(&[a, b]).is_ok()
450}
451
452/// JS `rgiEmojiRegex.test(segment) || isDoubleWidthNonRgiEmojiSequence(segment)`.
453///
454/// `\p{RGI_Emoji}` ships in no Rust crate, so it is approximated by this flat
455/// rule-set, validated against Node oracle probes and a ≥3000-case differential
456/// fuzz (0 divergences). Each rule cites its JS anchor:
457///
458/// - **length guard** — JS `isDoubleWidthNonRgiEmojiSequence` returns false for
459/// `segment.length > 50` (UTF-16 units). The longest real RGI sequence is 15
460/// UTF-16 units, so folding this guard over `rgiEmoji` is safe.
461/// - **keycap** — `^[\d#*](️)?⃣$` (union of `rgiEmoji`'s qualified form
462/// and `unqualifiedKeycapRegex`'s `^[\d#*]⃣$`).
463/// - **flag** — two Regional Indicators forming a valid RGI pair. An invalid
464/// pair / lone RI / 3 RIs falls through to the EAW path (each RI → 1).
465/// - **ZWJ** — `segment.includes('')` with ≥2 `Extended_Pictographic`
466/// matches (JS `isDoubleWidthNonRgiEmojiSequence`; also covers every RGI ZWJ
467/// sequence, all of which have ≥2 Extended_Pictographic).
468/// - **VS16** — visible segment is EXACTLY `[Extended_Pictographic, U+FE0F]`
469/// (digits/`#`/`*` are NOT `Extended_Pictographic`, so `1️`/`#️` → 1).
470/// - **modifier** — visible segment is EXACTLY `[Emoji_Modifier_Base, emoji
471/// modifier]` (U+1F3FB–U+1F3FF); covers narrow-EAW bases like `✌🏽`.
472///
473/// The VS16 and modifier rules are *anchored* (exactly two visible scalars)
474/// because `^\p{RGI_Emoji}$` matches the whole segment: a trailing ZWJ (e.g.
475/// `✌🏽\u{200D}`) breaks RGI validity, so Node takes the EAW path instead. The
476/// keycap, flag, and ZWJ rules are likewise anchored except ZWJ, which mirrors
477/// the un-anchored `isDoubleWidthNonRgiEmojiSequence`.
478///
479/// Tag flags (`🏴` + tags) and Emoji_Presentation singles need no rule: their
480/// base is EAW Wide → 2 via the EAW path; tag chars are Format (stripped/zero).
481fn is_double_width_emoji(segment: &str) -> bool {
482 // JS guard: pathological-length input is never a (short) emoji sequence.
483 if utf16_len(segment) > 50 {
484 return false;
485 }
486
487 if is_keycap_sequence(segment) {
488 return true;
489 }
490
491 if is_rgi_flag_sequence(segment) {
492 return true;
493 }
494
495 if segment.contains(ZWJ)
496 && segment
497 .chars()
498 .filter(|&c| is_extended_pictographic(c as u32))
499 .count()
500 >= 2
501 {
502 return true;
503 }
504
505 // VS16 and modifier rules are ANCHORED: `^\p{RGI_Emoji}$` requires the whole
506 // segment to be the emoji, so the minimal RGI VS16/modifier forms are exactly
507 // two visible scalars. A cluster like `✌\u{FE0F}\u{200D}…` (a trailing ZWJ
508 // breaking RGI validity) must NOT match — Node falls through to the EAW path.
509 // Longer fully-qualified forms contain ZWJ with ≥2 Extended_Pictographic and
510 // are already claimed by the (un-anchored) ZWJ rule above.
511 let visible = base_visible(segment);
512 is_vs16_sequence(visible) || is_modifier_sequence(visible)
513}
514
515/// Exactly `[Extended_Pictographic, U+FE0F]` (an RGI VS16 single). Digits/`#`/`*`
516/// are not `Extended_Pictographic`, so `1️`/`#️` correctly do NOT match.
517fn is_vs16_sequence(visible: &str) -> bool {
518 let mut chars = visible.chars();
519 let (Some(first), Some(VS16), None) = (chars.next(), chars.next(), chars.next()) else {
520 return false;
521 };
522 is_extended_pictographic(first as u32)
523}
524
525/// `^[\d#*](️)?⃣$` — qualified or unqualified keycap.
526fn is_keycap_sequence(segment: &str) -> bool {
527 let mut chars = segment.chars();
528 let Some(base) = chars.next() else {
529 return false;
530 };
531 if !matches!(base, '0'..='9' | '#' | '*') {
532 return false;
533 }
534 let next = chars.next();
535 let after = match next {
536 Some(VS16) => chars.next(),
537 other => other,
538 };
539 after == Some(COMBINING_ENCLOSING_KEYCAP) && chars.next().is_none()
540}
541
542/// Exactly two Regional Indicators forming a valid RGI flag pair.
543fn is_rgi_flag_sequence(segment: &str) -> bool {
544 let mut chars = segment.chars();
545 let (Some(a), Some(b), None) = (chars.next(), chars.next(), chars.next()) else {
546 return false;
547 };
548 let (a, b) = (a as u32, b as u32);
549 is_regional_indicator(a) && is_regional_indicator(b) && is_rgi_flag_pair(a, b)
550}
551
552/// Exactly `[Emoji_Modifier_Base, emoji modifier]` (U+1F3FB–U+1F3FF). Anchored:
553/// covers narrow-EAW bases like `✌🏽`, but not `✌🏽\u{200D}…` (handled by the
554/// ZWJ rule or the EAW fallback).
555fn is_modifier_sequence(visible: &str) -> bool {
556 let mut chars = visible.chars();
557 let (Some(first), Some(second), None) = (chars.next(), chars.next(), chars.next()) else {
558 return false;
559 };
560 is_emoji_modifier_base(first as u32) && is_emoji_modifier(second as u32)
561}
562
/// Length of `s` in UTF-16 code units, i.e. what JS `string.length` reports for
/// the same text.
#[inline]
fn utf16_len(s: &str) -> usize {
    s.encode_utf16().count()
}
568
569// ─── Vendored Node-derived tables (Node 24, Unicode 16.0) ────────────────────
570//
571// Each table below is generated by enumerating every scalar value 0..=0x10FFFF
572// (skipping the surrogate gap U+D800–U+DFFF, unreachable in a Rust `&str`),
573// testing the corresponding predicate, and coalescing the true code points into
574// inclusive ranges. The exact generator was:
575//
576// for (let cp = 0; cp <= 0x10FFFF; cp++) {
577// if (cp >= 0xD800 && cp <= 0xDFFF) continue; // surrogates
578// if (PREDICATE) { /* extend current range */ } else { /* close range */ }
579// }
580//
581// with PREDICATE per table:
582// DEFAULT_IGNORABLE_RANGES /\p{Default_Ignorable_Code_Point}/v.test(chr)
583// FORMAT_RANGES /\p{Format}/v.test(chr) (Cf)
584// MARK_RANGES /\p{Mark}/v.test(chr) (Mn+Mc+Me)
585// EXTENDED_PICTOGRAPHIC_RANGES /\p{Extended_Pictographic}/v.test(chr)
586// EMOJI_MODIFIER_BASE_RANGES /\p{Emoji_Modifier_Base}/v.test(chr)
587//
588// and for the EAW tables, get-east-asian-width@1.6.0's own predicates:
589// EAW_FULLWIDTH_RANGES _isFullWidth(cp)
590// EAW_WIDE_RANGES _isWide(cp)
591// EAW_AMBIGUOUS_RANGES eastAsianWidthType(cp) === 'ambiguous'
592//
593// Run from `node_modules`-resolving dir with Node 24 (Unicode 16). Regenerate
594// when bumping the pinned string-width/get-east-asian-width versions.
595
596include!("string_width_tables.rs");
597
598// ─── Tests ──────────────────────────────────────────────────────────────────
599
600#[cfg(test)]
601mod tests {
602 use super::*;
603
604 fn narrow_false(input: &str) -> usize {
605 string_width_with(
606 input,
607 Options {
608 ambiguous_is_narrow: false,
609 ..Default::default()
610 },
611 )
612 }
613
614 // ── Originals (Node-pinned) ──────────────────────────────────────────────
615
616 #[test]
617 fn empty_string_is_zero() {
618 assert_eq!(string_width(""), 0);
619 }
620
621 #[test]
622 fn printable_ascii() {
623 assert_eq!(string_width("hello"), 5);
624 }
625
626 #[test]
627 fn cjk_ideographs() {
628 assert_eq!(string_width("中文"), 4); // node: 4
629 }
630
631 #[test]
632 fn ansi_colored_string_stripped() {
633 assert_eq!(string_width("\x1b[31mred\x1b[0m"), 3); // node: 3
634 }
635
636 #[test]
637 fn single_emoji_is_double_width() {
638 assert_eq!(string_width("😀"), 2); // node: 2
639 }
640
641 #[test]
642 fn keycap_one_is_double_width() {
643 assert_eq!(string_width("1\u{20E3}"), 2); // node: 2
644 }
645
646 #[test]
647 fn tab_is_zero_width() {
648 assert_eq!(string_width("\t"), 0); // node: 0
649 }
650
651 #[test]
652 fn fullwidth_latin_is_double_width() {
653 assert_eq!(string_width("a"), 2); // node: 2 (U+FF41 Fullwidth)
654 }
655
656 #[test]
657 fn zwj_family_emoji_is_double_width() {
658 assert_eq!(string_width("👨\u{200D}👩\u{200D}👧"), 2); // node: 2
659 }
660
661 #[test]
662 fn combining_acute_on_base_char() {
663 assert_eq!(string_width("e\u{0301}"), 1); // node: 1
664 }
665
666 #[test]
667 fn lone_combining_mark_is_zero_width() {
668 assert_eq!(string_width("\u{0301}"), 0); // node: 0
669 }
670
671 #[test]
672 fn ambiguous_narrow_by_default() {
673 assert_eq!(string_width("¡"), 1); // node: 1 (U+00A1 Ambiguous)
674 }
675
676 #[test]
677 fn ambiguous_wide_in_cjk_mode() {
678 assert_eq!(narrow_false("¡"), 2); // node (ambiguousIsNarrow:false): 2
679 }
680
681 #[test]
682 fn ellipsis_ambiguous_narrow() {
683 assert_eq!(string_width("…"), 1); // node: 1 (U+2026 Ambiguous)
684 }
685
686 #[test]
687 fn ellipsis_ambiguous_wide() {
688 assert_eq!(narrow_false("…"), 2); // node (ambiguousIsNarrow:false): 2
689 }
690
691 #[test]
692 fn keycap_variants() {
693 assert_eq!(string_width("#\u{20E3}"), 2); // node: 2
694 assert_eq!(string_width("*\u{20E3}"), 2); // node: 2
695 assert_eq!(string_width("0\u{20E3}"), 2); // node: 2
696 assert_eq!(string_width("9\u{20E3}"), 2); // node: 2
697 }
698
699 #[test]
700 fn emoji_modifier_sequence() {
701 assert_eq!(string_width("👍\u{1F3FB}"), 2); // node: 2
702 }
703
704 #[test]
705 fn control_chars_zero_width() {
706 assert_eq!(string_width("\n"), 0); // node: 0
707 assert_eq!(string_width("\r"), 0); // node: 0
708 assert_eq!(string_width("\x00"), 0); // node: 0
709 }
710
711 #[test]
712 fn mixed_ascii_and_wide() {
713 assert_eq!(string_width("hi中"), 4); // node: 4
714 }
715
716 #[test]
717 fn default_ignorable_zero_width() {
718 assert_eq!(string_width("\u{200B}"), 0); // node: 0 (ZWSP)
719 assert_eq!(string_width("\u{FEFF}"), 0); // node: 0 (BOM)
720 }
721
722 #[test]
723 fn complex_ansi_sequences() {
724 assert_eq!(string_width("\x1b[38;5;200mcolored\x1b[0m"), 7); // node: 7
725 assert_eq!(string_width("\x1b[1mbold\x1b[0m"), 4); // node: 4
726 }
727
728 #[test]
729 fn ansi_sgr_31m() {
730 assert_eq!(string_width("\x1b[31mX\x1b[0m"), 1); // node: 1
731 }
732
733 #[test]
734 fn ansi_sgr_1_31m() {
735 assert_eq!(string_width("\x1b[1;31mX\x1b[0m"), 1); // node: 1
736 }
737
738 #[test]
739 fn ansi_csi_hide_cursor() {
740 assert_eq!(string_width("\x1b[?25lX\x1b[?25h"), 1); // node: 1
741 }
742
743 #[test]
744 fn ansi_osc8_hyperlink() {
745 let s = "\x1b]8;;https://example.com\x07link\x1b]8;;\x07";
746 assert_eq!(string_width(s), 4); // node: 4
747 }
748
749 #[test]
750 fn keycap_fully_qualified() {
751 assert_eq!(string_width("1\u{FE0F}\u{20E3}"), 2); // node: 2
752 assert_eq!(string_width("#\u{FE0F}\u{20E3}"), 2); // node: 2
753 assert_eq!(string_width("*\u{FE0F}\u{20E3}"), 2); // node: 2
754 }
755
756 #[test]
757 fn halfwidth_katakana_with_voiced_mark() {
758 assert_eq!(string_width("\u{FF76}\u{FF9E}"), 2); // node: 2 (ガ)
759 assert_eq!(string_width("\u{FF76}\u{FF9F}"), 2); // node: 2 (カ゚)
760 }
761
762 #[test]
763 fn soft_hyphen_format_category() {
764 assert_eq!(string_width("\u{00AD}"), 0); // node: 0
765 assert_eq!(string_width("a\u{00AD}b"), 2); // node: 2
766 }
767
768 #[test]
769 fn zero_width_joiners_and_ignorables() {
770 assert_eq!(string_width("\u{200C}"), 0); // node: 0 (ZWNJ)
771 assert_eq!(string_width("\u{200D}"), 0); // node: 0 (lone ZWJ)
772 assert_eq!(string_width("\u{FEFF}"), 0); // node: 0 (BOM)
773 assert_eq!(string_width("\u{200B}"), 0); // node: 0 (ZWSP)
774 }
775
776 // ── ANSI: upgraded / new exact pins ──────────────────────────────────────
777
778 #[test]
779 fn ansi_count_mode_includes_escapes() {
780 // Upgraded from `w > 3` to the exact Node value.
781 let opts = Options {
782 count_ansi_escape_codes: true,
783 ..Default::default()
784 };
785 assert_eq!(string_width_with("\x1b[31mred\x1b[0m", opts), 10); // node: 10
786 }
787
788 #[test]
789 fn ansi_count_mode_bare_sgr() {
790 // The JS suite's own exact case.
791 let opts = Options {
792 count_ansi_escape_codes: true,
793 ..Default::default()
794 };
795 assert_eq!(string_width_with("\x1b[31m", opts), 4); // node: 4
796 }
797
798 #[test]
799 fn ansi_osc_generic_strip() {
800 // ansi-regex@6.2.2 OSC branch strips a generic OSC string (incl. spaces)
801 // up to the first ST (here BEL). Pre-6.2.2 the old port over-counted.
802 assert_eq!(string_width("\x1b]0;My Title\x07hello"), 5); // node: 5
803 assert_eq!(string_width("\x1b]0;title\x07hello"), 5); // node: 5
804 // Window-title case: only the trailing visible "x" remains.
805 assert_eq!(string_width("\x1b]0;title with spaces\x07x"), 1); // node: 1
806 }
807
808 #[test]
809 fn ansi_colon_sgr_strip() {
810 // Colon-delimited SGR params (ansi-regex@6.2.2 `[;:]`).
811 assert_eq!(string_width("\x1b[38:2:1:2:3m "), 1); // node: 1
812 }
813
814 // ── Indic / prepend ──────────────────────────────────────────────────────
815
816 #[test]
817 fn indic_tamil_clusters() {
818 assert_eq!(string_width("நி"), 1); // node: 1
819 assert_eq!(string_width("நிநி"), 2); // node: 2
820 // Source-vs-prompt: prompt claimed 'க்ஷ'→1, but Node 8.2.1 segments it
821 // into two clusters (KA+virama+ZWJ, SSA) → 2. Pin the Node value.
822 assert_eq!(string_width("க்\u{200D}ஷ"), 2); // node: 2
823 assert_eq!(string_width("ி"), 0); // node: 0 (lone U+0BBF matra)
824 }
825
826 #[test]
827 fn arabic_prepend_mark() {
828 // U+0600 ARABIC NUMBER SIGN (Prepend / Format) + 'A' → one cluster, 1.
829 assert_eq!(string_width("\u{0600}A"), 1); // node: 1
830 }
831
832 // ── Emoji classes ────────────────────────────────────────────────────────
833
834 #[test]
835 fn zwj_minimally_qualified() {
836 assert_eq!(string_width("❤\u{200D}🔥"), 2); // node: 2 (heart-fire, no VS16)
837 assert_eq!(string_width("🏳\u{200D}🌈"), 2); // node: 2 (rainbow flag)
838 assert_eq!(string_width("👁\u{200D}🗨"), 2); // node: 2 (eye in speech)
839 }
840
841 #[test]
842 fn flag_sequences() {
843 assert_eq!(string_width("🇺🇸"), 2); // node: 2 (US, valid pair)
844 assert_eq!(string_width("🇦🇦"), 1); // node: 1 (AA, invalid pair → EAW)
845 assert_eq!(string_width("🇦"), 1); // node: 1 (lone RI)
846 assert_eq!(string_width("🇦🇺🇸"), 3); // node: 3 (3 RIs: AU pair + lone S)
847 }
848
849 #[test]
850 fn zwj_non_emoji_prefix() {
851 assert_eq!(string_width("a\u{200D}🔥"), 3); // node: 3 (a + ZWJ + fire)
852 }
853
854 // Anchoring regressions found by the differential fuzz: the VS16/modifier
855 // rules must require the EXACT minimal RGI form (2 visible scalars), not a
856 // prefix. A trailing ZWJ breaks `^\p{RGI_Emoji}$`, so Node falls to the EAW
857 // path.
858 #[test]
859 fn modifier_with_trailing_zwj_not_double() {
860 // ✌ + skin modifier + dangling ZWJ → not a complete RGI sequence.
861 // node: 1 (EAW of ✌ = 1; modifier/ZWJ chars are zero-width).
862 assert_eq!(string_width("\u{270C}\u{1F3FB}\u{200D}"), 1);
863 }
864
865 #[test]
866 fn emoji_with_trailing_halfwidth_keeps_eaw_extra() {
867 // 😀 + skin + VS16 + halfwidth voiced mark (one cluster). The rule must
868 // NOT fire and swallow the trailing Halfwidth Form: Node takes the EAW
869 // path → wide base (2) + trailing U+FF9E (1) = 3.
870 // node: 3
871 assert_eq!(string_width("\u{1F600}\u{1F3FB}\u{FE0F}\u{FF9E}"), 3);
872 }
873
#[test]
fn vs16_presentation() {
    // Each row pairs an input with the width Node reports for it.
    let cases = [
        ("✌", 1), // node: 1 (text presentation default)
        ("✌\u{FE0F}", 2), // node: 2 (VS16 → emoji)
        ("✌🏽", 2), // node: 2 (modifier on narrow base)
        ("1\u{FE0F}", 1), // node: 1 (digit not ExtPict)
        ("#\u{FE0F}", 1), // node: 1 (# not ExtPict)
        ("1\u{FE0F}\u{20E3}", 2), // node: 2 (keycap)
        ("🔥\u{FE0F}", 2), // node: 2 (redundant VS16)
        ("❤", 1), // node: 1 (text default)
        ("❤\u{FE0F}", 2), // node: 2
    ];
    for (input, expected) in cases {
        assert_eq!(string_width(input), expected, "input: {:?}", input);
    }
}
886
#[test]
fn tag_flag_sequences() {
    // In both rows the base U+1F3F4 is Wide → 2 and the tag characters are
    // Format, so the width is that of the base alone.
    let cases = [
        // 🏴 + GBSCT tag flag (Scotland).
        (
            "🏴\u{E0067}\u{E0062}\u{E0073}\u{E0063}\u{E0074}\u{E007F}",
            2,
        ), // node: 2
        // Invalid/fake tag sequence still → 2.
        ("🏴\u{E0041}\u{E007F}", 2), // node: 2
    ];
    for (input, expected) in cases {
        assert_eq!(string_width(input), expected, "input: {:?}", input);
    }
}
895
#[test]
fn modifier_on_wide_base() {
    // A skin-tone modifier applied to a base that is already wide.
    let toned_thumbs_up = "👍🏽";
    let width = string_width(toned_thumbs_up);
    assert_eq!(width, 2); // node: 2
}
900
901 // ── Hangul jamo ──────────────────────────────────────────────────────────
902
#[test]
fn hangul_jamo_clusters() {
    // Each row pairs a conjoining-jamo input with the width Node reports.
    let cases = [
        ("\u{1100}\u{1161}", 2), // node: 2 (L+V)
        ("\u{1100}\u{1100}\u{1161}", 4), // node: 4 (L, L+V)
        ("\u{1161}", 1), // node: 1 (V alone)
        ("\u{11A8}", 1), // node: 1 (T alone)
        ("\u{1100}\u{1161}\u{11A8}", 2), // node: 2 (L+V+T)
    ];
    for (input, expected) in cases {
        assert_eq!(string_width(input), expected, "input: {:?}", input);
    }
}
911
912 // ── HW/FW + combining + Thai ─────────────────────────────────────────────
913
#[test]
fn halfwidth_forms() {
    // Halfwidth katakana followed by a trailing halfwidth mark.
    let cases = [
        ("ガ", 2), // node: 2 (HW ka + dakuten)
        ("アー", 2), // node: 2 (HW a + prolonged mark)
    ];
    for (input, expected) in cases {
        assert_eq!(string_width(input), expected, "input: {:?}", input);
    }
}
919
#[test]
fn cjk_with_combining() {
    // CJK ideograph followed by a combining grave accent.
    let ideograph_with_grave = "中\u{0300}";
    let width = string_width(ideograph_with_grave);
    assert_eq!(width, 2); // node: 2 (CJK + combining grave)
}
924
#[test]
fn thai_sara_am() {
    // Thai consonant followed by sara am.
    let consonant_with_sara_am = "กำ";
    let width = string_width(consonant_with_sara_am);
    assert_eq!(width, 1); // node: 1 (Thai + sara am)
}
929
930 // ── Tabs / controls embedded ─────────────────────────────────────────────
931
#[test]
fn tabs_are_zero_width() {
    // Tabs in every position (middle, doubled, leading, trailing, alone):
    // only the non-tab characters contribute to the width.
    let cases = [
        ("a\tb", 2), // node: 2
        ("a\t\tb", 2), // node: 2
        ("\ta", 1), // node: 1
        ("a\t", 1), // node: 1
        ("\t\t", 0), // node: 0
    ];
    for (input, expected) in cases {
        assert_eq!(string_width(input), expected, "input: {:?}", input);
    }
}
940
941 // ── Adversarial: emoji/jamo orphan-cluster boundaries (Node-pinned) ───────
942
943 // L+T with no intervening V must NOT collapse to a syllable block: the L+V
944 // collapse guard in `hangul_cluster_width` fails, so each orphan jamo stays
945 // additive via EAW (L=2 Wide + T=1).
#[test]
fn hangul_leading_plus_trailing_without_vowel_is_additive() {
    // Leading consonant followed directly by a trailing consonant (no vowel).
    let leading_then_trailing = "\u{1100}\u{11A8}";
    let width = string_width(leading_then_trailing);
    assert_eq!(width, 3); // node: string-width@8.2.1 => 3
}
950
951 // ── Hangul early-exit discriminators (Node-pinned) ───────────────────────
952 // These pin the "first VISIBLE scalar decides jamo-vs-not" contract: the
953 // early exit must key on the first scalar AFTER the leading zero-width
954 // strip (JS codePoints[0] of the stripped cluster), and precomposed
955 // syllables (U+AC00..) must NOT be treated as jamo even when jamo follows.
956
#[test]
fn hangul_filler_leading_jamo_cluster() {
    // U+115F CHOSEONG FILLER is BOTH jamo and Default_Ignorable: the
    // zero-width strip drops it, so the jamo decision is made on U+1161.
    let filler_then_vowel = "\u{115F}\u{1161}";
    let width = string_width(filler_then_vowel);
    assert_eq!(width, 1); // node: 1
}
963
#[test]
fn hangul_jamo_then_precomposed_cluster_is_additive() {
    // The first visible scalar IS jamo, so the full hangul body runs over the
    // whole cluster (a naive "bail unless pure jamo" check breaks this).
    let jamo_then_syllable = "\u{1100}\u{AC00}";
    let width = string_width(jamo_then_syllable);
    assert_eq!(width, 4); // node: 4
}
970
#[test]
fn precomposed_then_jamo_cluster_takes_eaw_path() {
    // The first visible scalar U+AC00 is NOT jamo, so the early exit must
    // fire and the cluster falls through to the EAW path (wide, 2).
    let syllable_then_jamo = "\u{AC00}\u{1161}";
    let width = string_width(syllable_then_jamo);
    assert_eq!(width, 2); // node: 2
}
977
978 // ── DEL boundary of the printable-ASCII fast paths (Node-pinned) ─────────
979
#[test]
fn del_is_zero_width() {
    // 0x7F is the first byte EXCLUDED by `b - 0x20 < 0x5F`; it is Control
    // and must not take either ASCII fast path.
    let del = "\u{7F}";
    let width = string_width(del);
    assert_eq!(width, 0); // node: 0
}
986
#[test]
fn del_plus_ascii_counts_only_the_ascii() {
    // DEL followed by one printable ASCII letter.
    let del_then_letter = "\u{7F}a";
    let width = string_width(del_then_letter);
    assert_eq!(width, 1); // node: 1
}
991
992 // A bare skin-tone modifier with no base is itself Emoji_Presentation and
993 // renders width 2 — guards the orphan-modifier emoji-presentation path.
#[test]
fn lone_emoji_modifier_is_double_width() {
    // A skin-tone modifier on its own, with no base in front of it.
    let bare_modifier = "\u{1F3FB}";
    let width = string_width(bare_modifier);
    assert_eq!(width, 2); // node: string-width@8.2.1 => 2
}
998
999 // RI followed by a skin modifier is neither a valid flag pair nor a modifier
1000 // sequence (the base is not Emoji_Modifier_Base), so it falls through to EAW.
1001 // Stresses the is_rgi_flag_sequence + is_modifier_sequence anchoring.
#[test]
fn regional_indicator_plus_modifier_not_flag() {
    // Regional indicator followed by a skin-tone modifier.
    let ri_then_modifier = "\u{1F1E6}\u{1F3FB}";
    let width = string_width(ri_then_modifier);
    assert_eq!(width, 1); // node: string-width@8.2.1 => 1
}
1006
1007 // RTL override + ZWJ + RI + modifier + ZWJ + RI: exercises the
1008 // ZWJ-with-≥2-Extended_Pictographic emoji rule together with grapheme
1009 // segmentation — exactly where the documented \p{RGI_Emoji} approximation
1010 // could diverge from Node. It does not.
#[test]
fn rtl_override_zwj_flag_modifier_garbage_chain() {
    // RTL override, ZWJ, RI, modifier, ZWJ, RI, in that order.
    let garbage_chain = "\u{202E}\u{200D}\u{1F1E6}\u{1F3FB}\u{200D}\u{1F1FF}";
    let width = string_width(garbage_chain);
    assert_eq!(width, 2); // node: string-width@8.2.1 => 2
}
1018
1019 // With count_ansi_escape_codes:true an unterminated OSC sequence is NOT
1020 // stripped; every char counts (ESC is width 0, the 12 trailing chars width 1
1021 // each). Pins the count-mode branch against malformed escape input.
#[test]
fn count_ansi_mode_unterminated_osc_counts_bytes() {
    // The same malformed input is measured in both modes.
    let unterminated_osc = "\x1b]8;;http://x";
    let count_mode = Options {
        count_ansi_escape_codes: true,
        ..Options::default()
    };

    // node: stringWidth(s,{countAnsiEscapeCodes:true}) => 12
    assert_eq!(string_width_with(unterminated_osc, count_mode), 12);
    // node (default/strip): 7
    assert_eq!(string_width(unterminated_osc), 7);
}
1031
1032 // ESC sequence immediately after an emoji: strip mode keeps the emoji at
1033 // width 2, count mode adds the 4-char SGR (ESC width 0 + "[0m" width 3).
1034 // Pins that count mode does not corrupt the emoji grapheme width.
#[test]
fn esc_mid_emoji_count_vs_strip() {
    // An emoji immediately followed by an SGR reset sequence.
    let emoji_then_sgr = "\u{1F600}\x1b[0m";
    let count_mode = Options {
        count_ansi_escape_codes: true,
        ..Options::default()
    };

    // Strip mode: only the emoji is measured.
    assert_eq!(string_width(emoji_then_sgr), 2); // node: 2
    // Count mode: the emoji plus the escape sequence's characters.
    assert_eq!(string_width_with(emoji_then_sgr, count_mode), 5); // node: 5
}
1044
1045 // ── Adversarial: no-panic totality ───────────────────────────────────────
1046
1047 // A panic in this width fn (slice in base_visible, index math in
1048 // hangul_cluster_width, regex over pathological ANSI) would kill the host
1049 // terminal renderer. Every &str input — malformed escapes, NUL+combining,
1050 // garbage emoji chains, orphan jamo, and 10 MB stress strings — must return
1051 // a usize without panic, in both default and count_ansi_escape_codes modes.
// A panic fails the test; finishing the loop without one proves totality.
#[test]
fn no_panic_on_adversarial_battery() {
    let count_mode = Options {
        count_ansi_escape_codes: true,
        ..Options::default()
    };
    // Short malformed or boundary inputs.
    let small = [
        "abc\x1b", // ESC at EOS
        "\x1b]8;;http://x", // unterminated OSC
        "\x1b[", // unterminated CSI
        "\x1b[38;5;", // incomplete CSI
        "a\x00b", // NUL
        "\x00\u{0301}", // NUL + combining
        "\u{202E}\u{200D}\u{1F1E6}\u{1F3FB}\u{200D}\u{1F1FF}", // RTL+ZWJ+flag+modifier garbage
        "\u{1100}\u{11A8}", // Hangul L+T orphan
    ];
    // Multi-megabyte stress strings.
    let big = [
        "a".repeat(10_000_000),
        "中".repeat(3_000_000),
        "\x1b".repeat(10_000_000),
    ];
    // Small cases first, then the big ones; each input is measured in the
    // default mode and then in count mode. The widths are discarded: the
    // test only checks that both calls return.
    let battery = small.iter().copied().chain(big.iter().map(String::as_str));
    for input in battery {
        let _: usize = string_width(input);
        let _: usize = string_width_with(input, count_mode);
    }
}
1079}