// inkferro-core 0.1.0

//! Port of [`string-width@8`](https://github.com/sindresorhus/string-width) to Rust.
//!
//! Computes the visual column width of a string as rendered by a terminal,
//! matching the semantics of string-width@8.2.1 exactly. This is the width
//! measure function fed to Taffy's layout engine. The spec is the JS source
//! (`string-width/index.js`, `ansi-regex/index.js`, `get-east-asian-width/`);
//! where this comment and the source disagree, the source wins.
//!
//! # Ported algorithm
//!
//! ANSI escapes are stripped first (unless `count_ansi_escape_codes`) via a
//! faithful port of [`ansi-regex@6.2.2`](https://github.com/chalk/ansi-regex)
//! — the regex `strip-ansi@7` (string-width's dependency) delegates to. Then
//! each `Intl.Segmenter` grapheme cluster (here: `unicode-segmentation`,
//! empirically identical on the suspect classes) is measured by, in order:
//!
//! 1. **Zero-width cluster** — every char is `Default_Ignorable | Control |
//!    Format | Mark | Surrogate` (`Surrogate` is unreachable inside a Rust
//!    `&str`, which holds only scalar values). Tabs are `Control` → width 0.
//! 2. **Emoji width 2** — `^\p{RGI_Emoji}$` (regex v-flag) OR
//!    `isDoubleWidthNonRgiEmojiSequence`. `\p{RGI_Emoji}` has no Rust crate; it
//!    is approximated by [`is_double_width_emoji`]'s rule-set (keycap, valid RGI
//!    flag pair, ZWJ with ≥2 Extended_Pictographic, VS16-on-pictographic,
//!    modifier-on-base). See that function for each rule's JS anchor.
//! 3. **Hangul jamo** — modern L+V(+T) syllable blocks collapse to width 2;
//!    unmatched jamo stay additive (`hangul_cluster_width`, ported exactly).
//! 4. **East Asian Width** — `eastAsianWidth` of the first visible scalar, plus
//!    each trailing Halfwidth/Fullwidth Forms char (U+FF00–U+FFEF) by its own
//!    EAW (`trailing_halfwidth_width`).
//!
//! # Approximation boundary
//!
//! The only approximation is `\p{RGI_Emoji}` (replaced by [`is_double_width_emoji`]).
//! Any RGI sequence the rule-set fails to classify as width 2 would diverge from
//! Node; a ≥3000-case differential fuzz against Node string-width@8.2.1 (every
//! RGI class, Indic, Hangul, prepend, HW/FW, combining, tabs/controls, ANSI,
//! and random multi-class concatenations) found **zero** divergences. Every
//! property and EAW range table is Node-derived (Node 24 / Unicode 16) with a
//! provenance comment and regen recipe, mirroring `slice_ansi/tokenize_ansi.rs`.
//!
//! # Options
//!
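//! `ambiguous_is_narrow` (default `true`): East Asian Ambiguous chars are narrow
//! (1) unless set to `false` (CJK context → 2). `count_ansi_escape_codes`
//! (default `false`): skip ANSI stripping, so escape sequences are measured like
//! any other text rather than counted per byte (the ESC/CSI opener is a
//! zero-width control char, so `"\x1b[31m"` measures 4, not 5).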

use std::sync::LazyLock;

use regex::Regex;
use unicode_segmentation::UnicodeSegmentation;

/// Faithful port of [`ansi-regex@6.2.2`](https://github.com/chalk/ansi-regex)'s
/// pattern (the regex `strip-ansi@7` — and thus string-width@8 — uses).
///
/// JS source (`ansi-regex/index.js`, version 6.2.2 verified in its
/// `package.json`):
///
/// ```text
/// const ST  = '(?:\\u0007|\\u001B\\u005C|\\u009C)';
/// const osc = `(?:\\u001B\\][\\s\\S]*?${ST})`;
/// const csi = '[\\u001B\\u009B][[\\]()#;?]*(?:\\d{1,4}(?:[;:]\\d{0,4})*)?[\\dA-PR-TZcf-nq-uy=><~]';
/// const pattern = `${osc}|${csi}`;
/// ```
///
/// Adaptations for the Rust `regex` crate (semantics preserved — no
/// backreferences or lookaround are used by ansi-regex, verified):
/// - `[\s\S]*?` → `[\x00-\x{10FFFF}]*?` — an explicit Unicode scalar range
///   covering every code point including newlines, lazily matched; mirrors
///   `[\s\S]` without relying on a DOTALL flag.
/// - `[[\]()#;?]*` → `[\[\]()#;?]*` — escape the leading `[` inside the class.
/// - `\d` → `[0-9]` — avoid needing the `unicode-perl` feature.
///
/// Alternation order (`osc` before `csi`) is preserved: Rust `regex` is
/// leftmost-first, matching Node's first-alternative-wins on overlapping input
/// (e.g. a generic OSC string is consumed wholesale, not split by the CSI arm).
static ANSI_RE: LazyLock<Regex> = LazyLock::new(|| {
    // String terminator: BEL | ESC '\' | U+009C.
    const ST: &str = r"(?:\x07|\x1b\x5c|\x9c)";
    // CSI arm: opener, optional intermediates, optional parameters, final byte.
    const CSI: &str =
        r"[\x1b\x9b][\[\]()#;?]*(?:[0-9]{1,4}(?:[;:][0-9]{0,4})*)?[0-9A-PR-TZcf-nq-uy=><~]";

    // OSC arm first, then CSI: the alternation order is part of the semantics
    // (leftmost-first), so a generic OSC string is consumed as one match.
    let pattern = format!(r"(?:\x1b\][\x00-\x{{10FFFF}}]*?{ST})|{CSI}");
    Regex::new(&pattern).expect("ANSI_RE is a valid regex")
});

/// Options for [`string_width_with`] ([`string_width`] always uses the
/// [`Default`] values).
#[derive(Debug, Clone, Copy)]
pub struct Options {
    /// Treat East Asian Ambiguous characters as narrow (1 column).
    ///
    /// When `false`, Ambiguous characters measure 2 columns (CJK context).
    ///
    /// Default: `true` (non-CJK / terminal-generic context).
    pub ambiguous_is_narrow: bool,

    /// Keep ANSI escape sequences in the measured text instead of stripping
    /// them first.
    ///
    /// The sequences are then measured like any other text, not counted per
    /// byte: the ESC/CSI opener is a zero-width control character, so
    /// `"\x1b[31m"` measures 4.
    ///
    /// Default: `false`.
    pub count_ansi_escape_codes: bool,
}

impl Default for Options {
    /// The option set used by [`string_width`]: Ambiguous characters are
    /// narrow and ANSI escape sequences are stripped before measuring.
    fn default() -> Self {
        let ambiguous_is_narrow = true;
        let count_ansi_escape_codes = false;
        Self {
            ambiguous_is_narrow,
            count_ansi_escape_codes,
        }
    }
}

/// Measures how many terminal columns `input` occupies in a monospace terminal.
///
/// Shorthand for [`string_width_with`] called with [`Options::default`];
/// equivalent to `stringWidth(input)` from string-width@8 with default options.
///
/// # Examples
///
/// ```
/// use inkferro_core::text::string_width::string_width;
///
/// assert_eq!(string_width("hello"), 5);
/// assert_eq!(string_width("中文"), 4);
/// assert_eq!(string_width("\x1b[31mred\x1b[0m"), 3);
/// assert_eq!(string_width("😀"), 2);
/// ```
#[inline]
pub fn string_width(input: &str) -> usize {
    let defaults = Options::default();
    string_width_with(input, defaults)
}

/// Returns the visual column width of `input` using the given options.
///
/// Equivalent to `stringWidth(input, options)` from string-width@8.
///
/// Steps, in order:
/// 1. Strip ANSI escapes, unless `opts.count_ansi_escape_codes` is set.
/// 2. If the remaining text is all printable ASCII, return its byte length.
/// 3. Otherwise sum a width per grapheme cluster: zero-width cluster → 0,
///    emoji rule-set → 2, hangul jamo collapse, else the East Asian Width of
///    the first visible scalar plus any trailing Halfwidth/Fullwidth Forms.
///
/// `opts.ambiguous_is_narrow` is inverted into the `ambiguous_as_wide` flag
/// passed to the East Asian Width helpers.
pub fn string_width_with(input: &str, opts: Options) -> usize {
    if input.is_empty() {
        return 0;
    }

    // JS: strip ANSI only when an opener (ESC = U+001B / CSI = U+009B) is present.
    // `owned` is declared up front so the stripped copy outlives the borrow `s`;
    // when nothing is stripped, `s` borrows `input` and no allocation happens.
    let owned: String;
    let s: &str = if !opts.count_ansi_escape_codes && ansi_present(input) {
        owned = ANSI_RE.replace_all(input, "").into_owned();
        &owned
    } else {
        input
    };

    // Input made solely of escape sequences strips down to nothing.
    if s.is_empty() {
        return 0;
    }

    // JS fast path: `/^[ -~]*$/` → width equals byte length.
    if is_all_printable_ascii(s) {
        return s.len();
    }

    let ambiguous_as_wide = !opts.ambiguous_is_narrow;
    let mut width = 0usize;

    // `graphemes(true)` = extended grapheme clusters.
    for segment in s.graphemes(true) {
        // Fast path: a single printable-ASCII scalar is its own grapheme and is
        // never zero-width (Control is 0x00–0x1F/0x7F; the DI/Format/Mark tables
        // start at U+00AD/U+00AD/U+0300), never an emoji rule match (keycap
        // needs U+20E3, flag/ZWJ/VS16/modifier need non-ASCII scalars), never
        // hangul jamo, and EAW-narrow (the FW/Wide/Ambiguous tables start at
        // U+3000/U+1100/U+00A1) with no trailing forms — so its width is
        // exactly 1 on every path below. Identical output, no table walks.
        if segment.len() == 1 && (0x20..=0x7E).contains(&segment.as_bytes()[0]) {
            width += 1;
            continue;
        }

        // Rule 1: every scalar is zero-width → the cluster contributes nothing.
        if is_zero_width_cluster(segment) {
            continue;
        }

        // Rule 2: emoji rule-set (stand-in for `\p{RGI_Emoji}`) → 2 columns.
        if is_double_width_emoji(segment) {
            width += 2;
            continue;
        }

        // Rules 3 and 4 operate on the cluster minus its leading zero-width run.
        let visible = base_visible(segment);

        // Rule 3: hangul jamo cluster; `None` means "not a jamo cluster".
        if let Some(hangul) = hangul_cluster_width(visible, ambiguous_as_wide) {
            width += hangul;
            continue;
        }

        // EAW of the first visible scalar, plus trailing Halfwidth/Fullwidth Forms.
        // An empty `visible` would mean every scalar was zero-width, which rule 1
        // already handled, so the `else` arm is a defensive fallback.
        let Some(first) = visible.chars().next() else {
            continue;
        };
        width += east_asian_width(first as u32, ambiguous_as_wide);
        width += trailing_halfwidth_width(visible, ambiguous_as_wide);
    }

    width
}

/// Removes ANSI escape sequences from `input` with the production [`ANSI_RE`].
///
/// Test-only helper for cases that need the stripped text itself rather than
/// a width. It shares both the opener guard and the regex with
/// [`string_width_with`], so stripping always agrees with width measurement.
/// Input without an ESC/CSI opener is returned borrowed, untouched.
#[cfg(test)]
pub(crate) fn strip_ansi(input: &str) -> std::borrow::Cow<'_, str> {
    use std::borrow::Cow;

    if !ansi_present(input) {
        return Cow::Borrowed(input);
    }
    let stripped = ANSI_RE.replace_all(input, "").into_owned();
    Cow::Owned(stripped)
}

// ─── ANSI / fast helpers ─────────────────────────────────────────────────────

/// Fast guard mirroring JS
/// `string.includes('\u001B') || string.includes('\u009B')`: true when `s`
/// contains an escape opener (ESC = U+001B or CSI = U+009B).
#[inline]
fn ansi_present(s: &str) -> bool {
    s.contains(|c: char| matches!(c, '\u{1B}' | '\u{9B}'))
}

/// JS fast-path predicate `/^[ -~]*$/`: true when every byte of `s` is
/// printable ASCII (`b - 0x20 < 0x5F` ⟺ `0x20 <= b <= 0x7E`). Empty input
/// is vacuously true.
///
/// Deliberately a branchless accumulate rather than `all(..)`: a per-byte
/// early exit defeats auto-vectorization and its scalar codegen is
/// alignment-luck-sensitive (±40% run-to-run as the surrounding function
/// changes). The fold compiles to a stable SIMD scan; measure-func inputs are
/// short, so the lost early exit on non-ASCII input costs at most one full
/// pass over a single line. `inline(never)` keeps that codegen independent of
/// the caller's body.
#[inline(never)]
fn is_all_printable_ascii(s: &str) -> bool {
    s.as_bytes()
        .iter()
        .fold(true, |all_ok, &byte| all_ok & (byte.wrapping_sub(b' ') < 0x5F))
}

// ─── Range-table lookup ──────────────────────────────────────────────────────

/// Reports whether `cp` falls inside any inclusive `(lo, hi)` range of
/// `table`, which must be sorted and non-overlapping (binary search).
fn in_ranges(table: &[(u32, u32)], cp: u32) -> bool {
    use std::cmp::Ordering;

    table
        .binary_search_by(|&(lo, hi)| match (cp < lo, cp > hi) {
            // Range lies entirely above `cp`.
            (true, _) => Ordering::Greater,
            // Range lies entirely below `cp`.
            (false, true) => Ordering::Less,
            // `lo <= cp <= hi`.
            (false, false) => Ordering::Equal,
        })
        .is_ok()
}

// ─── Zero-width / base-visible ───────────────────────────────────────────────

/// Single-scalar form of JS `zeroWidthClusterRegex.test(c)`: true when `c` is
/// `\p{Default_Ignorable_Code_Point}`, `\p{Control}`, `\p{Format}` or
/// `\p{Mark}`. `\p{Control}` = Cc = [`char::is_control`]; the regex's
/// `\p{Surrogate}` arm is unreachable in `&str`, so it has no counterpart here.
#[inline]
fn is_zero_width_scalar(c: char) -> bool {
    if c.is_control() {
        return true;
    }
    let cp = u32::from(c);
    if in_ranges(DEFAULT_IGNORABLE_RANGES, cp) {
        return true;
    }
    if in_ranges(FORMAT_RANGES, cp) {
        return true;
    }
    in_ranges(MARK_RANGES, cp)
}

/// JS `isZeroWidthCluster(segment)`: `^(?:DI|Control|Format|Mark|Surrogate)+$`,
/// i.e. every scalar is zero-width.
#[inline]
fn is_zero_width_cluster(segment: &str) -> bool {
    segment.chars().all(is_zero_width_scalar)
}

/// JS `baseVisible(segment)`: strip the LEADING run of
/// `[DI Control Format Mark Surrogate]` scalars.
#[inline]
fn base_visible(segment: &str) -> &str {
    let mut end = 0;
    for c in segment.chars() {
        if is_zero_width_scalar(c) {
            end += c.len_utf8();
        } else {
            break;
        }
    }
    &segment[end..]
}

// ─── East Asian Width (get-east-asian-width@1.6.0) ───────────────────────────

/// JS `eastAsianWidth(codePoint, {ambiguousAsWide})`: 2 columns for a
/// Fullwidth or Wide code point, and for an Ambiguous one when
/// `ambiguous_as_wide` is set; 1 column for everything else.
#[inline]
fn east_asian_width(cp: u32, ambiguous_as_wide: bool) -> usize {
    let double_width = in_ranges(EAW_FULLWIDTH_RANGES, cp)
        || in_ranges(EAW_WIDE_RANGES, cp)
        || (ambiguous_as_wide && in_ranges(EAW_AMBIGUOUS_RANGES, cp));
    1 + usize::from(double_width)
}

/// JS `trailingHalfwidthWidth(visibleSegment, …)`: the extra width contributed
/// by scalars after the first one. Only scalars in the Halfwidth/Fullwidth
/// Forms range U+FF00–U+FFEF count, each by its own East Asian Width.
fn trailing_halfwidth_width(visible: &str, ambiguous_as_wide: bool) -> usize {
    let mut extra = 0usize;
    // The first scalar is measured by the caller, so skip it here.
    for c in visible.chars().skip(1) {
        if matches!(c, '\u{FF00}'..='\u{FFEF}') {
            extra += east_asian_width(u32::from(c), ambiguous_as_wide);
        }
    }
    extra
}

// ─── Hangul jamo (string-width hangulClusterWidth) ───────────────────────────

/// Leading consonant (L) jamo: U+1100–U+115F or U+A960–U+A97C.
#[inline]
fn is_hangul_leading_jamo(cp: u32) -> bool {
    matches!(cp, 0x1100..=0x115F | 0xA960..=0xA97C)
}

/// Vowel (V) jamo: U+1160–U+11A7 or U+D7B0–U+D7C6.
#[inline]
fn is_hangul_vowel_jamo(cp: u32) -> bool {
    matches!(cp, 0x1160..=0x11A7 | 0xD7B0..=0xD7C6)
}

/// Trailing consonant (T) jamo: U+11A8–U+11FF or U+D7CB–U+D7FB.
#[inline]
fn is_hangul_trailing_jamo(cp: u32) -> bool {
    matches!(cp, 0x11A8..=0x11FF | 0xD7CB..=0xD7FB)
}

/// Any hangul jamo: leading, vowel, or trailing.
#[inline]
fn is_hangul_jamo(cp: u32) -> bool {
    if is_hangul_leading_jamo(cp) {
        return true;
    }
    is_hangul_vowel_jamo(cp) || is_hangul_trailing_jamo(cp)
}

/// `Option`-aware vowel/trailing checks: a missing neighbour (`None`) is `false`,
/// matching JS where `isHangulVowelJamo(undefined)` is `false`.
#[inline]
fn opt_is_vowel(cp: Option<u32>) -> bool {
    cp.is_some_and(is_hangul_vowel_jamo)
}

#[inline]
fn opt_is_trailing(cp: Option<u32>) -> bool {
    cp.is_some_and(is_hangul_trailing_jamo)
}

/// JS `hangulClusterWidth(visibleSegment, …)`. Returns `None` when the cluster
/// is not a (leading) jamo cluster (JS `undefined`), else its collapsed width.
///
/// `visible` is expected to be a grapheme cluster whose leading zero-width run
/// has already been removed (the caller passes `base_visible(segment)`);
/// `ambiguous_as_wide` is forwarded unchanged to [`east_asian_width`].
///
/// Width rules, applied left to right over the non-zero-width scalars:
/// - leading jamo followed by a vowel jamo (and optionally a trailing jamo)
///   collapses to one width-2 syllable block;
/// - any other jamo adds its own East Asian Width;
/// - once a non-jamo scalar is reached, it and everything after it add their
///   East Asian Widths and the walk stops.
///
/// `inline(never)`: with the cheap early exit below, LLVM otherwise inlines
/// this whole body (Vec alloc included) into `string_width_with`, which
/// measurably de-optimizes the unrelated printable-ASCII fast-path scan there
/// (+45% on pure-ASCII input). Keeping it out of line preserves the caller's
/// codegen; the early exit still skips the per-grapheme allocation.
#[inline(never)]
fn hangul_cluster_width(visible: &str, ambiguous_as_wide: bool) -> Option<usize> {
    // Early exit: `visible` already has its LEADING zero-width run stripped
    // (see `base_visible`), so its first scalar is exactly `codePoints[0]` of
    // the JS filter below. If that scalar is not hangul jamo, the loop's first
    // iteration hits `!is_hangul_jamo(cp) && width == 0` and returns `None` —
    // skip the per-grapheme Vec allocation for every non-jamo cluster. An
    // empty `visible` also returns `None` either way. Output-identical.
    let first = visible.chars().next()?;
    if !is_hangul_jamo(first as u32) {
        return None;
    }

    // JS: collect code points, skipping per-char zero-width scalars.
    let code_points: Vec<u32> = visible
        .chars()
        .filter(|&c| !is_zero_width_scalar(c))
        .map(|c| c as u32)
        .collect();

    // Reachable only if every scalar was filtered out as zero-width, i.e.
    // `visible` was not pre-stripped by the caller.
    if code_points.is_empty() {
        return None;
    }

    let mut width = 0usize;
    let mut index = 0usize;
    while index < code_points.len() {
        let cp = code_points[index];

        if !is_hangul_jamo(cp) {
            // Nothing measured yet → this is not a jamo cluster at all.
            if width == 0 {
                return None;
            }
            // Mixed cluster: EAW for the non-jamo remainder.
            for &remaining in &code_points[index..] {
                width += east_asian_width(remaining, ambiguous_as_wide);
            }
            return Some(width);
        }

        // Modern L+V(+T) collapses to one width-2 syllable block. JS advances
        // `index += isTrailing ? 2 : 1` and the `for` loop then adds its own
        // post-increment `+1`; here the `while` loop has no implicit step, so we
        // fold that `+1` in: consume L+V (2 scalars) or L+V+T (3 scalars).
        // `.get(..)` yields `None` past the end, which the `opt_*` helpers
        // treat as "not a match".
        if is_hangul_leading_jamo(cp) && opt_is_vowel(code_points.get(index + 1).copied()) {
            width += 2;
            index += if opt_is_trailing(code_points.get(index + 2).copied()) {
                3
            } else {
                2
            };
            continue;
        }

        // Unmatched jamo stays additive via its EAW.
        width += east_asian_width(cp, ambiguous_as_wide);
        index += 1;
    }

    Some(width)
}

// ─── Emoji width-2 rule-set (replaces \p{RGI_Emoji}) ─────────────────────────

/// U+200D ZERO WIDTH JOINER, the glue of emoji ZWJ sequences.
const ZWJ: char = '\u{200D}';
/// U+FE0F VARIATION SELECTOR-16 (emoji presentation selector).
const VS16: char = '\u{FE0F}';
/// U+20E3 COMBINING ENCLOSING KEYCAP, the final scalar of a keycap sequence.
const COMBINING_ENCLOSING_KEYCAP: char = '\u{20E3}';
/// First Regional Indicator symbol (letter A), U+1F1E6.
const REGIONAL_INDICATOR_A: u32 = 0x1F1E6;
/// Last Regional Indicator symbol (letter Z), U+1F1FF.
const REGIONAL_INDICATOR_Z: u32 = 0x1F1FF;

/// `\p{Extended_Pictographic}` membership via the vendored range table.
#[inline]
fn is_extended_pictographic(cp: u32) -> bool {
    let table = EXTENDED_PICTOGRAPHIC_RANGES;
    in_ranges(table, cp)
}

/// `\p{Emoji_Modifier_Base}` membership via the vendored range table.
#[inline]
fn is_emoji_modifier_base(cp: u32) -> bool {
    let table = EMOJI_MODIFIER_BASE_RANGES;
    in_ranges(table, cp)
}

/// Emoji (skin-tone) modifier: U+1F3FB–U+1F3FF.
#[inline]
fn is_emoji_modifier(cp: u32) -> bool {
    matches!(cp, 0x1F3FB..=0x1F3FF)
}

/// Regional Indicator symbol: `REGIONAL_INDICATOR_A..=REGIONAL_INDICATOR_Z`.
#[inline]
fn is_regional_indicator(cp: u32) -> bool {
    matches!(cp, REGIONAL_INDICATOR_A..=REGIONAL_INDICATOR_Z)
}

/// Whether the two-letter code for an RGI flag (derived from a Regional
/// Indicator pair) is a valid RGI flag sequence.
fn is_rgi_flag_pair(first: u32, second: u32) -> bool {
    let a = (b'A' as u32 + first - REGIONAL_INDICATOR_A) as u8;
    let b = (b'A' as u32 + second - REGIONAL_INDICATOR_A) as u8;
    RGI_FLAG_PAIRS.binary_search(&[a, b]).is_ok()
}

/// JS `rgiEmojiRegex.test(segment) || isDoubleWidthNonRgiEmojiSequence(segment)`.
///
/// `\p{RGI_Emoji}` ships in no Rust crate, so it is approximated by this flat
/// rule-set, validated against Node oracle probes and a ≥3000-case differential
/// fuzz (0 divergences). Each rule cites its JS anchor:
///
/// - **length guard** — JS `isDoubleWidthNonRgiEmojiSequence` returns false for
///   `segment.length > 50` (UTF-16 units). The longest real RGI sequence is 15
///   UTF-16 units, so applying the guard to the `rgiEmoji` rules as well is
///   safe.
/// - **keycap** — `^[\d#*](\uFE0F)?\u20E3$` (union of `rgiEmoji`'s qualified
///   form and `unqualifiedKeycapRegex`'s `^[\d#*]\u20E3$`).
/// - **flag** — two Regional Indicators forming a valid RGI pair. An invalid
///   pair / lone RI / 3 RIs falls through to the EAW path (each RI → 1).
/// - **ZWJ** — `segment.includes('\u200D')` with ≥2 `Extended_Pictographic`
///   matches (JS `isDoubleWidthNonRgiEmojiSequence`; also covers every RGI ZWJ
///   sequence, all of which have ≥2 Extended_Pictographic).
/// - **VS16** — visible segment is EXACTLY `[Extended_Pictographic, U+FE0F]`
///   (digits/`#`/`*` are NOT `Extended_Pictographic`, so `1\uFE0F`/`#\uFE0F`
///   → 1).
/// - **modifier** — visible segment is EXACTLY `[Emoji_Modifier_Base, emoji
///   modifier]` (U+1F3FB–U+1F3FF); covers narrow-EAW bases like `✌🏽`.
///
/// The VS16 and modifier rules are *anchored* (exactly two visible scalars)
/// because `^\p{RGI_Emoji}$` matches the whole segment: a trailing ZWJ (e.g.
/// `✌🏽\u{200D}`) breaks RGI validity, so Node takes the EAW path instead. The
/// keycap and flag rules are anchored too; only the ZWJ rule is not, mirroring
/// the un-anchored `isDoubleWidthNonRgiEmojiSequence`.
///
/// Tag flags (`🏴` + tags) and Emoji_Presentation singles need no rule: their
/// base is EAW Wide → 2 via the EAW path; tag chars are Format (stripped/zero).
fn is_double_width_emoji(segment: &str) -> bool {
    // JS guard: pathological-length input is never a (short) emoji sequence.
    if utf16_len(segment) > 50 {
        return false;
    }

    // Un-anchored ZWJ rule: a joiner somewhere in the segment plus at least
    // two Extended_Pictographic scalars (a second match exists ⟺ count >= 2).
    let joins_two_pictographs = || {
        segment.contains(ZWJ)
            && segment
                .chars()
                .filter(|&c| is_extended_pictographic(u32::from(c)))
                .nth(1)
                .is_some()
    };

    if is_keycap_sequence(segment) || is_rgi_flag_sequence(segment) || joins_two_pictographs() {
        return true;
    }

    // The remaining rules are ANCHORED: `^\p{RGI_Emoji}$` requires the whole
    // segment to be the emoji, so the minimal RGI VS16/modifier forms are
    // exactly two visible scalars. A cluster like `✌\u{FE0F}\u{200D}…` (a
    // trailing ZWJ breaking RGI validity) must NOT match — Node falls through
    // to the EAW path. Longer fully-qualified forms contain ZWJ with ≥2
    // Extended_Pictographic and were already claimed by the ZWJ rule above.
    let visible = base_visible(segment);
    is_vs16_sequence(visible) || is_modifier_sequence(visible)
}

/// True when `visible` is exactly `[Extended_Pictographic, U+FE0F]` (an RGI
/// VS16 single). Digits/`#`/`*` are not `Extended_Pictographic`, so
/// `1\uFE0F`/`#\uFE0F` correctly do NOT match.
fn is_vs16_sequence(visible: &str) -> bool {
    let mut scalars = visible.chars();
    match (scalars.next(), scalars.next(), scalars.next()) {
        (Some(base), Some(VS16), None) => is_extended_pictographic(u32::from(base)),
        _ => false,
    }
}

/// `^[\d#*](️)?⃣$` — qualified or unqualified keycap.
fn is_keycap_sequence(segment: &str) -> bool {
    let mut chars = segment.chars();
    let Some(base) = chars.next() else {
        return false;
    };
    if !matches!(base, '0'..='9' | '#' | '*') {
        return false;
    }
    let next = chars.next();
    let after = match next {
        Some(VS16) => chars.next(),
        other => other,
    };
    after == Some(COMBINING_ENCLOSING_KEYCAP) && chars.next().is_none()
}

/// True when `segment` is exactly two Regional Indicators that form a valid
/// RGI flag pair.
fn is_rgi_flag_sequence(segment: &str) -> bool {
    let mut scalars = segment.chars().map(u32::from);
    match (scalars.next(), scalars.next(), scalars.next()) {
        (Some(first), Some(second), None) => {
            is_regional_indicator(first)
                && is_regional_indicator(second)
                && is_rgi_flag_pair(first, second)
        }
        _ => false,
    }
}

/// True when `visible` is exactly `[Emoji_Modifier_Base, emoji modifier]`
/// (modifier = U+1F3FB–U+1F3FF). Anchored: covers narrow-EAW bases like `✌🏽`,
/// but not `✌🏽\u{200D}…` (handled by the ZWJ rule or the EAW fallback).
fn is_modifier_sequence(visible: &str) -> bool {
    let mut scalars = visible.chars().map(u32::from);
    match (scalars.next(), scalars.next(), scalars.next()) {
        (Some(base), Some(modifier), None) => {
            is_emoji_modifier_base(base) && is_emoji_modifier(modifier)
        }
        _ => false,
    }
}

/// JS `string.length` for a substring: the number of UTF-16 code units needed
/// to encode `s` (scalars above U+FFFF count as two).
#[inline]
fn utf16_len(s: &str) -> usize {
    s.encode_utf16().count()
}

// ─── Vendored Node-derived tables (Node 24, Unicode 16.0) ────────────────────
//
// Each table below is generated by enumerating every scalar value 0..=0x10FFFF
// (skipping the surrogate gap U+D800–U+DFFF, unreachable in a Rust `&str`),
// testing the corresponding predicate, and coalescing the true code points into
// inclusive ranges. The exact generator was:
//
//   for (let cp = 0; cp <= 0x10FFFF; cp++) {
//     if (cp >= 0xD800 && cp <= 0xDFFF) continue; // surrogates
//     if (PREDICATE) { /* extend current range */ } else { /* close range */ }
//   }
//
// with PREDICATE per table:
//   DEFAULT_IGNORABLE_RANGES     /\p{Default_Ignorable_Code_Point}/v.test(chr)
//   FORMAT_RANGES                /\p{Format}/v.test(chr)              (Cf)
//   MARK_RANGES                  /\p{Mark}/v.test(chr)                (Mn+Mc+Me)
//   EXTENDED_PICTOGRAPHIC_RANGES /\p{Extended_Pictographic}/v.test(chr)
//   EMOJI_MODIFIER_BASE_RANGES   /\p{Emoji_Modifier_Base}/v.test(chr)
//
// and for the EAW tables, get-east-asian-width@1.6.0's own predicates:
//   EAW_FULLWIDTH_RANGES         _isFullWidth(cp)
//   EAW_WIDE_RANGES              _isWide(cp)
//   EAW_AMBIGUOUS_RANGES         eastAsianWidthType(cp) === 'ambiguous'
//
// Run from `node_modules`-resolving dir with Node 24 (Unicode 16). Regenerate
// when bumping the pinned string-width/get-east-asian-width versions.

include!("string_width_tables.rs");

// ─── Tests ──────────────────────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    use super::*;

    fn narrow_false(input: &str) -> usize {
        string_width_with(
            input,
            Options {
                ambiguous_is_narrow: false,
                ..Default::default()
            },
        )
    }

    // ── Originals (Node-pinned) ──────────────────────────────────────────────

    #[test]
    fn empty_string_is_zero() {
        assert_eq!(string_width(""), 0);
    }

    #[test]
    fn printable_ascii() {
        assert_eq!(string_width("hello"), 5);
    }

    #[test]
    fn cjk_ideographs() {
        assert_eq!(string_width("中文"), 4); // node: 4
    }

    #[test]
    fn ansi_colored_string_stripped() {
        assert_eq!(string_width("\x1b[31mred\x1b[0m"), 3); // node: 3
    }

    #[test]
    fn single_emoji_is_double_width() {
        assert_eq!(string_width("😀"), 2); // node: 2
    }

    #[test]
    fn keycap_one_is_double_width() {
        assert_eq!(string_width("1\u{20E3}"), 2); // node: 2
    }

    #[test]
    fn tab_is_zero_width() {
        assert_eq!(string_width("\t"), 0); // node: 0
    }

    #[test]
    fn fullwidth_latin_is_double_width() {
        assert_eq!(string_width("ａ"), 2); // node: 2 (U+FF41 Fullwidth)
    }

    #[test]
    fn zwj_family_emoji_is_double_width() {
        assert_eq!(string_width("👨\u{200D}👩\u{200D}👧"), 2); // node: 2
    }

    #[test]
    fn combining_acute_on_base_char() {
        assert_eq!(string_width("e\u{0301}"), 1); // node: 1
    }

    #[test]
    fn lone_combining_mark_is_zero_width() {
        assert_eq!(string_width("\u{0301}"), 0); // node: 0
    }

    #[test]
    fn ambiguous_narrow_by_default() {
        assert_eq!(string_width("¡"), 1); // node: 1 (U+00A1 Ambiguous)
    }

    #[test]
    fn ambiguous_wide_in_cjk_mode() {
        assert_eq!(narrow_false("¡"), 2); // node (ambiguousIsNarrow:false): 2
    }

    #[test]
    fn ellipsis_ambiguous_narrow() {
        assert_eq!(string_width("…"), 1); // node: 1 (U+2026 Ambiguous)
    }

    #[test]
    fn ellipsis_ambiguous_wide() {
        assert_eq!(narrow_false("…"), 2); // node (ambiguousIsNarrow:false): 2
    }

    #[test]
    fn keycap_variants() {
        assert_eq!(string_width("#\u{20E3}"), 2); // node: 2
        assert_eq!(string_width("*\u{20E3}"), 2); // node: 2
        assert_eq!(string_width("0\u{20E3}"), 2); // node: 2
        assert_eq!(string_width("9\u{20E3}"), 2); // node: 2
    }

    #[test]
    fn emoji_modifier_sequence() {
        assert_eq!(string_width("👍\u{1F3FB}"), 2); // node: 2
    }

    #[test]
    fn control_chars_zero_width() {
        assert_eq!(string_width("\n"), 0); // node: 0
        assert_eq!(string_width("\r"), 0); // node: 0
        assert_eq!(string_width("\x00"), 0); // node: 0
    }

    #[test]
    fn mixed_ascii_and_wide() {
        assert_eq!(string_width("hi中"), 4); // node: 4
    }

    #[test]
    fn default_ignorable_zero_width() {
        assert_eq!(string_width("\u{200B}"), 0); // node: 0 (ZWSP)
        assert_eq!(string_width("\u{FEFF}"), 0); // node: 0 (BOM)
    }

    #[test]
    fn complex_ansi_sequences() {
        assert_eq!(string_width("\x1b[38;5;200mcolored\x1b[0m"), 7); // node: 7
        assert_eq!(string_width("\x1b[1mbold\x1b[0m"), 4); // node: 4
    }

    #[test]
    fn ansi_sgr_31m() {
        assert_eq!(string_width("\x1b[31mX\x1b[0m"), 1); // node: 1
    }

    #[test]
    fn ansi_sgr_1_31m() {
        assert_eq!(string_width("\x1b[1;31mX\x1b[0m"), 1); // node: 1
    }

    #[test]
    fn ansi_csi_hide_cursor() {
        assert_eq!(string_width("\x1b[?25lX\x1b[?25h"), 1); // node: 1
    }

    #[test]
    fn ansi_osc8_hyperlink() {
        let s = "\x1b]8;;https://example.com\x07link\x1b]8;;\x07";
        assert_eq!(string_width(s), 4); // node: 4
    }

    #[test]
    fn keycap_fully_qualified() {
        assert_eq!(string_width("1\u{FE0F}\u{20E3}"), 2); // node: 2
        assert_eq!(string_width("#\u{FE0F}\u{20E3}"), 2); // node: 2
        assert_eq!(string_width("*\u{FE0F}\u{20E3}"), 2); // node: 2
    }

    #[test]
    fn halfwidth_katakana_with_voiced_mark() {
        assert_eq!(string_width("\u{FF76}\u{FF9E}"), 2); // node: 2 (ｶﾞ)
        assert_eq!(string_width("\u{FF76}\u{FF9F}"), 2); // node: 2 (ｶﾟ)
    }

    #[test]
    fn soft_hyphen_format_category() {
        assert_eq!(string_width("\u{00AD}"), 0); // node: 0
        assert_eq!(string_width("a\u{00AD}b"), 2); // node: 2
    }

    #[test]
    fn zero_width_joiners_and_ignorables() {
        assert_eq!(string_width("\u{200C}"), 0); // node: 0 (ZWNJ)
        assert_eq!(string_width("\u{200D}"), 0); // node: 0 (lone ZWJ)
        assert_eq!(string_width("\u{FEFF}"), 0); // node: 0 (BOM)
        assert_eq!(string_width("\u{200B}"), 0); // node: 0 (ZWSP)
    }

    // ── ANSI: upgraded / new exact pins ──────────────────────────────────────

    #[test]
    fn ansi_count_mode_includes_escapes() {
        // Upgraded from `w > 3` to the exact Node value.
        let opts = Options {
            count_ansi_escape_codes: true,
            ..Default::default()
        };
        assert_eq!(string_width_with("\x1b[31mred\x1b[0m", opts), 10); // node: 10
    }

    #[test]
    fn ansi_count_mode_bare_sgr() {
        // The JS suite's own exact case.
        let opts = Options {
            count_ansi_escape_codes: true,
            ..Default::default()
        };
        assert_eq!(string_width_with("\x1b[31m", opts), 4); // node: 4
    }

    #[test]
    fn ansi_osc_generic_strip() {
        // ansi-regex@6.2.2 OSC branch strips a generic OSC string (incl. spaces)
        // up to the first ST (here BEL). Pre-6.2.2 the old port over-counted.
        assert_eq!(string_width("\x1b]0;My Title\x07hello"), 5); // node: 5
        assert_eq!(string_width("\x1b]0;title\x07hello"), 5); // node: 5
        // Window-title case: only the trailing visible "x" remains.
        assert_eq!(string_width("\x1b]0;title with spaces\x07x"), 1); // node: 1
    }

    #[test]
    fn ansi_colon_sgr_strip() {
        // Colon-delimited SGR params (ansi-regex@6.2.2 `[;:]`).
        assert_eq!(string_width("\x1b[38:2:1:2:3m "), 1); // node: 1
    }

    // ── Indic / prepend ──────────────────────────────────────────────────────

    #[test]
    fn indic_tamil_clusters() {
        assert_eq!(string_width("நி"), 1); // node: 1
        assert_eq!(string_width("நிநி"), 2); // node: 2
        // Source-vs-prompt: prompt claimed 'க்‍ஷ'→1, but Node 8.2.1 segments it
        // into two clusters (KA+virama+ZWJ, SSA) → 2. Pin the Node value.
        assert_eq!(string_width("க்\u{200D}ஷ"), 2); // node: 2
        assert_eq!(string_width("ி"), 0); // node: 0 (lone U+0BBF matra)
    }

    #[test]
    fn arabic_prepend_mark() {
        // U+0600 ARABIC NUMBER SIGN (Prepend / Format) + 'A' → one cluster, 1.
        assert_eq!(string_width("\u{0600}A"), 1); // node: 1
    }

    // ── Emoji classes ────────────────────────────────────────────────────────

    #[test]
    fn zwj_minimally_qualified() {
        assert_eq!(string_width("❤\u{200D}🔥"), 2); // node: 2 (heart-fire, no VS16)
        assert_eq!(string_width("🏳\u{200D}🌈"), 2); // node: 2 (rainbow flag)
        assert_eq!(string_width("👁\u{200D}🗨"), 2); // node: 2 (eye in speech)
    }

    #[test]
    fn flag_sequences() {
        assert_eq!(string_width("🇺🇸"), 2); // node: 2 (US, valid pair)
        assert_eq!(string_width("🇦🇦"), 1); // node: 1 (AA, invalid pair → EAW)
        assert_eq!(string_width("🇦"), 1); // node: 1 (lone RI)
        assert_eq!(string_width("🇦🇺🇸"), 3); // node: 3 (3 RIs: AU pair + lone S)
    }

    #[test]
    fn zwj_non_emoji_prefix() {
        assert_eq!(string_width("a\u{200D}🔥"), 3); // node: 3 (a + ZWJ + fire)
    }

    // Anchoring regressions found by the differential fuzz: the VS16/modifier
    // rules must require the EXACT minimal RGI form (2 visible scalars), not a
    // prefix. A trailing ZWJ breaks `^\p{RGI_Emoji}$`, so Node falls to the EAW
    // path.
    #[test]
    fn modifier_with_trailing_zwj_not_double() {
        // ✌ + skin modifier + dangling ZWJ → not a complete RGI sequence.
        // node: 1 (EAW of ✌ = 1; modifier/ZWJ chars are zero-width).
        assert_eq!(string_width("\u{270C}\u{1F3FB}\u{200D}"), 1);
    }

    #[test]
    fn emoji_with_trailing_halfwidth_keeps_eaw_extra() {
        // 😀 + skin + VS16 + halfwidth voiced mark (one cluster). The rule must
        // NOT fire and swallow the trailing Halfwidth Form: Node takes the EAW
        // path → wide base (2) + trailing U+FF9E (1) = 3.
        // node: 3
        assert_eq!(string_width("\u{1F600}\u{1F3FB}\u{FE0F}\u{FF9E}"), 3);
    }

    #[test]
    fn vs16_presentation() {
        // Text-vs-emoji presentation cases. Each row: (input, width Node reports).
        let cases: &[(&str, usize)] = &[
            ("✌", 1), // node: 1 (text presentation default)
            ("✌\u{FE0F}", 2), // node: 2 (VS16 → emoji)
            ("✌🏽", 2), // node: 2 (modifier on narrow base)
            ("1\u{FE0F}", 1), // node: 1 (digit not ExtPict)
            ("#\u{FE0F}", 1), // node: 1 (# not ExtPict)
            ("1\u{FE0F}\u{20E3}", 2), // node: 2 (keycap)
            ("🔥\u{FE0F}", 2), // node: 2 (redundant VS16)
            ("❤", 1), // node: 1 (text default)
            ("❤\u{FE0F}", 2), // node: 2
        ];
        for &(input, expected) in cases {
            assert_eq!(string_width(input), expected, "input: {input:?}");
        }
    }

    #[test]
    fn tag_flag_sequences() {
        // The base U+1F3F4 is Wide and tag characters are Format, so the
        // cluster measures 2 whether or not the tag payload is a real flag.
        let cases: &[(&str, usize)] = &[
            // 🏴 + GBSCT tag flag (Scotland).
            ("🏴\u{E0067}\u{E0062}\u{E0073}\u{E0063}\u{E0074}\u{E007F}", 2), // node: 2
            // Invalid/fake tag sequence.
            ("🏴\u{E0041}\u{E007F}", 2), // node: 2
        ];
        for &(input, expected) in cases {
            assert_eq!(string_width(input), expected, "input: {input:?}");
        }
    }

    #[test]
    fn modifier_on_wide_base() {
        // Thumbs-up followed by a skin-tone modifier.
        let toned_thumbs_up = "👍🏽";
        let width = string_width(toned_thumbs_up);
        assert_eq!(width, 2); // node: 2
    }

    // ── Hangul jamo ──────────────────────────────────────────────────────────

    #[test]
    fn hangul_jamo_clusters() {
        // Conjoining jamo (L = leading, V = vowel, T = trailing).
        // Each row: (input, width Node reports).
        let cases: &[(&str, usize)] = &[
            ("\u{1100}\u{1161}", 2), // node: 2 (L+V)
            ("\u{1100}\u{1100}\u{1161}", 4), // node: 4 (L, L+V)
            ("\u{1161}", 1), // node: 1 (V alone)
            ("\u{11A8}", 1), // node: 1 (T alone)
            ("\u{1100}\u{1161}\u{11A8}", 2), // node: 2 (L+V+T)
        ];
        for &(input, expected) in cases {
            assert_eq!(string_width(input), expected, "input: {input:?}");
        }
    }

    // ── HW/FW + combining + Thai ─────────────────────────────────────────────

    #[test]
    fn halfwidth_forms() {
        let ka_dakuten = "ｶﾞ"; // HW ka + dakuten
        let a_prolonged = "ｱｰ"; // HW a + prolonged mark
        assert_eq!(string_width(ka_dakuten), 2); // node: 2
        assert_eq!(string_width(a_prolonged), 2); // node: 2
    }

    #[test]
    fn cjk_with_combining() {
        // CJK ideograph followed by a combining grave accent.
        let ideograph_with_grave = "中\u{0300}";
        let width = string_width(ideograph_with_grave);
        assert_eq!(width, 2); // node: 2
    }

    #[test]
    fn thai_sara_am() {
        // Thai consonant followed by sara am.
        let syllable = "กำ";
        let width = string_width(syllable);
        assert_eq!(width, 1); // node: 1
    }

    // ── Tabs / controls embedded ─────────────────────────────────────────────

    #[test]
    fn tabs_are_zero_width() {
        // Tabs in every position (inner, doubled, leading, trailing, alone)
        // contribute nothing. Each row: (input, width Node reports).
        let cases: &[(&str, usize)] = &[
            ("a\tb", 2), // node: 2
            ("a\t\tb", 2), // node: 2
            ("\ta", 1), // node: 1
            ("a\t", 1), // node: 1
            ("\t\t", 0), // node: 0
        ];
        for &(input, expected) in cases {
            assert_eq!(string_width(input), expected, "input: {input:?}");
        }
    }

    // ── Adversarial: emoji/jamo orphan-cluster boundaries (Node-pinned) ───────

    // A leading consonant followed directly by a trailing consonant (no vowel in
    // between) is not a syllable block: the L+V collapse guard in
    // `hangul_cluster_width` does not match, so both orphan jamo are summed via
    // EAW (L=2 Wide + T=1).
    #[test]
    fn hangul_leading_plus_trailing_without_vowel_is_additive() {
        let orphan_l_t = "\u{1100}\u{11A8}";
        let width = string_width(orphan_l_t);
        assert_eq!(width, 3); // node: string-width@8.2.1 => 3
    }

    // ── Hangul early-exit discriminators (Node-pinned) ───────────────────────
    // These pin the "first VISIBLE scalar decides jamo-vs-not" contract: the
    // early exit must key on the first scalar AFTER the leading zero-width
    // strip (JS codePoints[0] of the stripped cluster), and precomposed
    // syllables (U+AC00..) must NOT be treated as jamo even when jamo follows.

    #[test]
    fn hangul_filler_leading_jamo_cluster() {
        // U+115F CHOSEONG FILLER is jamo AND Default_Ignorable at once. The
        // zero-width strip drops it, leaving U+1161 to drive the jamo decision.
        let filler_then_vowel = "\u{115F}\u{1161}";
        let width = string_width(filler_then_vowel);
        assert_eq!(width, 1); // node: 1
    }

    #[test]
    fn hangul_jamo_then_precomposed_cluster_is_additive() {
        // The first visible scalar is jamo, so the full hangul body must run
        // over the entire cluster. A naive "bail unless pure jamo" check would
        // get this wrong.
        let jamo_then_syllable = "\u{1100}\u{AC00}";
        let width = string_width(jamo_then_syllable);
        assert_eq!(width, 4); // node: 4
    }

    #[test]
    fn precomposed_then_jamo_cluster_takes_eaw_path() {
        // The first visible scalar, U+AC00, is NOT jamo: the early exit has to
        // fire so the cluster is measured on the EAW path (wide, 2).
        let syllable_then_jamo = "\u{AC00}\u{1161}";
        let width = string_width(syllable_then_jamo);
        assert_eq!(width, 2); // node: 2
    }

    // ── DEL boundary of the printable-ASCII fast paths (Node-pinned) ─────────

    #[test]
    fn del_is_zero_width() {
        // `b - 0x20 < 0x5F` first rejects byte 0x7F. DEL is Control, so it must
        // bypass both ASCII fast paths.
        let del = "\u{7F}";
        let width = string_width(del);
        assert_eq!(width, 0); // node: 0
    }

    #[test]
    fn del_plus_ascii_counts_only_the_ascii() {
        // DEL followed by one printable ASCII letter.
        let del_then_letter = "\u{7F}a";
        let width = string_width(del_then_letter);
        assert_eq!(width, 1); // node: 1
    }

    // A skin-tone modifier standing alone (no base) is itself
    // Emoji_Presentation and renders width 2. Guards the orphan-modifier
    // emoji-presentation path.
    #[test]
    fn lone_emoji_modifier_is_double_width() {
        let bare_modifier = "\u{1F3FB}";
        let width = string_width(bare_modifier);
        assert_eq!(width, 2); // node: string-width@8.2.1 => 2
    }

    // A regional indicator followed by a skin modifier is not a valid flag pair,
    // and not a modifier sequence either (the base is not Emoji_Modifier_Base),
    // so the cluster falls through to EAW. Stresses the anchoring of
    // is_rgi_flag_sequence and is_modifier_sequence.
    #[test]
    fn regional_indicator_plus_modifier_not_flag() {
        let ri_then_modifier = "\u{1F1E6}\u{1F3FB}";
        let width = string_width(ri_then_modifier);
        assert_eq!(width, 1); // node: string-width@8.2.1 => 1
    }

    // Garbage chain: RTL override, ZWJ, RI, modifier, ZWJ, RI. It exercises the
    // ZWJ-with-≥2-Extended_Pictographic emoji rule together with grapheme
    // segmentation, which is exactly where the documented \p{RGI_Emoji}
    // approximation could diverge from Node. It does not.
    #[test]
    fn rtl_override_zwj_flag_modifier_garbage_chain() {
        let garbage = "\u{202E}\u{200D}\u{1F1E6}\u{1F3FB}\u{200D}\u{1F1FF}";
        let width = string_width(garbage);
        assert_eq!(width, 2); // node: string-width@8.2.1 => 2
    }

    // In count mode (count_ansi_escape_codes: true) an unterminated OSC sequence
    // is NOT stripped and every char is measured: ESC contributes 0 and each of
    // the 12 chars after it contributes 1. Pins the count-mode branch against
    // malformed escape input, alongside the default (strip) result.
    #[test]
    fn count_ansi_mode_unterminated_osc_counts_bytes() {
        let unterminated_osc = "\x1b]8;;http://x";
        let counting = Options {
            count_ansi_escape_codes: true,
            ..Default::default()
        };
        let counted = string_width_with(unterminated_osc, counting);
        assert_eq!(counted, 12); // node: stringWidth(s,{countAnsiEscapeCodes:true}) => 12
        let stripped = string_width(unterminated_osc);
        assert_eq!(stripped, 7); // node (default/strip): 7
    }

    // An SGR escape placed directly after an emoji. Strip mode leaves the emoji
    // at width 2; count mode adds the 4-char SGR on top (ESC width 0 + "[0m"
    // width 3). Pins that count mode does not corrupt the emoji grapheme width.
    #[test]
    fn esc_mid_emoji_count_vs_strip() {
        let emoji_then_sgr = "\u{1F600}\x1b[0m";
        let counting = Options {
            count_ansi_escape_codes: true,
            ..Default::default()
        };
        let stripped = string_width(emoji_then_sgr);
        assert_eq!(stripped, 2); // node: 2
        let counted = string_width_with(emoji_then_sgr, counting);
        assert_eq!(counted, 5); // node: 5
    }

    // ── Adversarial: no-panic totality ───────────────────────────────────────

    // A panic in this width fn (slice in base_visible, index math in
    // hangul_cluster_width, regex over pathological ANSI) would kill the host
    // terminal renderer. Every &str input — malformed escapes, NUL+combining,
    // garbage emoji chains, orphan jamo, and ~10 MB stress strings — must return
    // a usize without panic, in both default and count_ansi_escape_codes modes.
    // There is deliberately no value assertion: a panic fails the test, and
    // returning normally after the last input is the proof of totality.
    #[test]
    fn no_panic_on_adversarial_battery() {
        let count = Options {
            count_ansi_escape_codes: true,
            ..Default::default()
        };
        // Measure one input in both modes. Only "did not panic" matters, so the
        // widths are discarded; the `usize` annotations pin the return type.
        let probe = |s: &str| {
            let _: usize = string_width(s);
            let _: usize = string_width_with(s, count);
        };
        let small = [
            "abc\x1b",                                             // ESC at EOS
            "\x1b]8;;http://x",                                    // unterminated OSC
            "\x1b[",                                               // unterminated CSI
            "\x1b[38;5;",                                          // incomplete CSI
            "a\x00b",                                              // NUL
            "\x00\u{0301}",                                        // NUL + combining
            "\u{202E}\u{200D}\u{1F1E6}\u{1F3FB}\u{200D}\u{1F1FF}", // RTL+ZWJ+flag+modifier garbage
            "\u{1100}\u{11A8}",                                    // Hangul L+T orphan
        ];
        let big = [
            "a".repeat(10_000_000),
            "中".repeat(3_000_000),
            "\x1b".repeat(10_000_000),
        ];
        // Same order as before (small cases, then stress strings), but the
        // literals are borrowed directly instead of being copied into Strings.
        for s in small {
            probe(s);
        }
        for s in &big {
            probe(s.as_str());
        }
    }
}