// inkferro-core 0.1.0

//! ANSI SGR end-code table and related helpers.
//!
//! Faithfully ported from `@alcalzone/ansi-tokenize@0.3.0 / ansiCodes.js`,
//! which derives its `endCodesSet` / `endCodesMap` from `ansi-styles@6`.
//!
//! The `ansi-styles` `codes` map (start → end), verbatim:
//! ```text
//!   0→0, 1→22, 2→22, 3→23, 4→24, 53→55, 7→27, 8→28, 9→29,
//!   30..=37→39, 90..=97→39, 40..=47→49, 100..=107→49
//! ```
//! and the resulting `endCodesSet` (the *values* of that map) is exactly:
//! ```text
//!   { 0, 22, 23, 24, 55, 27, 28, 29, 39, 49 }
//! ```
//! Note in particular: `5`, `6`, `21`, `25`, `54`, `59` are **not** keys of the
//! map, so e.g. `getEndCode("\x1B[25m")` falls through to the reset `"\x1B[0m"`.

use compact_str::{CompactString, format_compact};

use crate::text::ansi_tokenize::consts::{
    C1_ST, LINK_CODE_PREFIX, LINK_END_CODE, LINK_END_CODE_C1ST, LINK_END_CODE_ST,
};

use super::types::AnsiToken;

/// The SGR reset code `\x1B[0m`.
///
/// [`get_end_code`] returns this as its final fallback when no more specific
/// end code applies.
pub(crate) const RESET_CODE: &str = "\x1B[0m";

/// Looks up `n` in the `ansi-styles` `codes` table (SGR start number → SGR
/// end number).
///
/// Yields `None` for any `n` that is not a key of the table, which is the
/// counterpart of `codes.get(n)` evaluating to `undefined` in the JS source.
///
/// `ansi_styles_close` in `text/wrap_ansi.rs` intentionally duplicates this
/// table: each port follows its own upstream npm package, so the two must not
/// be merged.
fn end_code_for_num(n: u32) -> Option<u32> {
    // Colour ranges first: normal and bright variants share one end code.
    let is_foreground = matches!(n, 30..=37 | 90..=97);
    let is_background = matches!(n, 40..=47 | 100..=107);
    if is_foreground {
        return Some(39);
    }
    if is_background {
        return Some(49);
    }

    // Remaining keys are individual modifier codes.
    match n {
        0 => Some(0),
        1 | 2 => Some(22),
        3 => Some(23),
        4 => Some(24),
        7 => Some(27),
        8 => Some(28),
        9 => Some(29),
        53 => Some(55),
        _ => None,
    }
}

/// Tells whether `n` is the number of an *end* code, i.e. whether `\x1B[{n}m`
/// belongs to `endCodesSet`.
///
/// The set consists of the values of the `codes` map:
/// `{0, 22, 23, 24, 27, 28, 29, 39, 49, 55}`.
fn is_end_code_num(n: u32) -> bool {
    const END_CODE_NUMS: [u32; 10] = [0, 22, 23, 24, 27, 28, 29, 39, 49, 55];
    END_CODE_NUMS.contains(&n)
}

/// Extracts the first numeric parameter of an SGR code string, e.g.
/// `\x1B[31m` → `31` and `\x1B[38;5;200m` → `38`.
///
/// The result is `None` when `code` does not have the shape
/// `\x1B[<params>m`, or when the text before the first `;` does not parse as
/// a `u32`. The JS source reaches the same "no match" outcome through
/// `parseInt(code, 10)` producing `NaN` and falling through to reset.
fn parse_sgr_first_param(code: &str) -> Option<u32> {
    // Peel off the CSI introducer and the final `m`; either missing → None.
    let params = code.strip_prefix("\x1B[")?.strip_suffix('m')?;
    // Everything up to the first `;` (or all of it when there is none).
    let head = match params.find(';') {
        Some(semicolon) => &params[..semicolon],
        None => params,
    };
    head.parse::<u32>().ok()
}

/// Returns the end code for the given ANSI start code.
///
/// Faithfully matches `getEndCode` from `ansiCodes.js`, preserving its lookup
/// order:
/// 1. if `code` is itself an end code (`endCodesSet.has`) → return it unchanged;
/// 2. if `code` is a known start code (`endCodesMap.has`) → return its end;
/// 3. link prefix → BEL / ST / C1-ST link end code, chosen by the suffix;
/// 4. extended colour `38`/`48` → `\x1B[39m` / `\x1B[49m`;
/// 5. otherwise numeric `codes` lookup, falling back to the reset `\x1B[0m`.
pub(crate) fn get_end_code(code: &str) -> CompactString {
    // Steps 1 & 2 operate on the *whole* code string. For plain SGR codes they
    // are driven by the leading numeric parameter; the JS keys its maps on the
    // single-parameter strings `ansiStyles.color.ansi(n)` == `"\x1B[{n}m"`, so a
    // compound code such as `"\x1B[38;5;200m"` is never a map member and falls
    // through to the special handling below — which is exactly what we want.
    if let Some(n) = parse_sgr_first_param(code) {
        // Only treat it as a map member when the code is the canonical
        // single-parameter form `"\x1B[{n}m"`.
        if is_canonical_sgr(code, n) {
            // (1) endCodesSet.has(code) → its own end code.
            if is_end_code_num(n) {
                return code.into();
            }
            // (2) endCodesMap.has(code) → mapped end code.
            if let Some(end) = end_code_for_num(n) {
                return format_compact!("\x1B[{end}m");
            }
        }
    }

    // (3) Links.
    if code.starts_with(LINK_CODE_PREFIX) {
        if code.ends_with("\x1B\\") {
            return LINK_END_CODE_ST.into();
        }
        if code.ends_with(C1_ST) {
            return LINK_END_CODE_C1ST.into();
        }
        return LINK_END_CODE.into();
    }

    // (4) & (5): JS does `code = code.slice(2)` *unconditionally* (dropping the
    // first two characters, normally `\x1B[`) and then inspects the remainder.
    // We mirror that char-based slice exactly so that quirky inputs — e.g. the
    // `"[31m"` code produced from a C1 CSI (U+009B) opener — match the JS result.
    // Borrowing the post-slice remainder (instead of `chars().skip(2).collect()`
    // into a `String`) is byte-identical: the slice starts at the byte offset of
    // the third char, or is empty when there are fewer than three chars.
    let rest: &str = code
        .char_indices()
        .nth(2)
        .map(|(off, _)| &code[off..])
        .unwrap_or("");
    if rest.starts_with("38") {
        return "\x1B[39m".into();
    }
    if rest.starts_with("48") {
        return "\x1B[49m".into();
    }
    // `parseInt(rest, 10)` on the remainder, then numeric `codes` lookup.
    if let Some(end) = leading_int(rest).and_then(end_code_for_num) {
        return format_compact!("\x1B[{end}m");
    }

    // Fallback: global reset.
    RESET_CODE.into()
}

/// Reports whether `code` is spelled exactly as the single-parameter SGR
/// sequence `"\x1B[{n}m"` for the given `n`.
///
/// The expected spelling is rendered with `format_compact!` rather than
/// `format!`. It is at most 13 bytes long (2-byte prefix, up to 10 digits,
/// final `m`), which is short enough that the `CompactString` is expected to
/// stay inline instead of heap-allocating. That matters because
/// `get_end_code` / `is_end_code` run once per SGR token.
fn is_canonical_sgr(code: &str, n: u32) -> bool {
    let canonical = format_compact!("\x1B[{n}m");
    canonical.as_str() == code
}

/// Mimics JS `parseInt(str, 10)` for the non-negative integers this module
/// looks up in the `codes` table.
///
/// Like `parseInt`, leading ECMAScript whitespace is skipped and a single
/// optional `+` sign is accepted before the digit run; parsing stops at the
/// first non-digit. Returns `None` when no digit follows (JS `NaN`).
///
/// Two inputs that are *not* `NaN` in JS are also reported as `None`, because
/// neither can select a different end code than the caller's fallback:
/// * a `-` sign — a negative value is never a table key, and `-0` would hit
///   key `0`, whose end code is the same reset `get_end_code` falls back to;
/// * a digit run that overflows `u32` — far outside the table.
///
/// The digit run is parsed as a borrowed slice — no intermediate `String`.
fn leading_int(s: &str) -> Option<u32> {
    // ECMAScript's `StrWhiteSpaceChar` equals Unicode `White_Space` minus
    // U+0085 (NEL) plus U+FEFF (BOM), so adjust `char::is_whitespace`.
    let is_js_whitespace = |c: char| (c.is_whitespace() && c != '\u{85}') || c == '\u{FEFF}';
    let unpadded = s.trim_start_matches(is_js_whitespace);

    if unpadded.starts_with('-') {
        return None;
    }
    // At most one `+` is consumed; a second sign leaves an empty digit run.
    let digits = unpadded.strip_prefix('+').unwrap_or(unpadded);

    // The prefix before `end` is pure ASCII, so `end` is a char boundary.
    let end = digits
        .bytes()
        .position(|b| !b.is_ascii_digit())
        .unwrap_or(digits.len());
    digits[..end].parse().ok()
}

/// Tells whether `code` is an end/reset code, i.e. `endCodesSet.has(code)`.
///
/// Set members are always written in the canonical single-parameter form
/// `"\x1B[{n}m"`, so any other spelling is rejected even when its first
/// parameter is an end-code number.
pub(crate) fn is_end_code(code: &str) -> bool {
    match parse_sgr_first_param(code) {
        // Cheap numeric membership test first, exact-spelling check second.
        Some(n) => is_end_code_num(n) && is_canonical_sgr(code, n),
        None => false,
    }
}

/// Tells whether `token` carries an intensity start code: bold (`\x1B[1m`) or
/// dim (`\x1B[2m`).
///
/// Only the token's `code` field is inspected.
pub(crate) fn is_intensity_code(token: &AnsiToken) -> bool {
    matches!(token.code.as_str(), "\x1B[1m" | "\x1B[2m")
}

/// Concatenates the `code` strings of `codes`, emitting each distinct code
/// only once.
///
/// Mirrors `ansiCodesToString` from `ansiCodes.js`: like `new Set(...)`, a
/// repeated code keeps the position of its first occurrence and later
/// duplicates are dropped.
pub(crate) fn ansi_codes_to_string(codes: &[AnsiToken]) -> String {
    // A style stack holds only a handful of entries, so scanning a small Vec
    // is cheaper than hashing and keeps first-seen order.
    let mut emitted: Vec<&str> = Vec::with_capacity(codes.len());
    let mut joined = String::new();
    for piece in codes.iter().map(|token| token.code.as_str()) {
        if emitted.iter().any(|&earlier| earlier == piece) {
            continue;
        }
        emitted.push(piece);
        joined.push_str(piece);
    }
    joined
}

#[cfg(test)]
mod tests {
    use super::*;

    // Ground truth captured directly from `@alcalzone/ansi-tokenize@0.3.0`.
    // Each entry is `(input code, expected end code)`.
    #[test]
    fn get_end_code_parity_table() {
        let cases = [
            // End codes map to themselves.
            ("\x1B[0m", "\x1B[0m"),
            ("\x1B[39m", "\x1B[39m"),
            ("\x1B[49m", "\x1B[49m"),
            ("\x1B[55m", "\x1B[55m"),
            // 25 / 5 / 6 / 21 / 99 are NOT in the ansi-styles codes map → reset.
            ("\x1B[25m", "\x1B[0m"),
            ("\x1B[5m", "\x1B[0m"),
            ("\x1B[6m", "\x1B[0m"),
            ("\x1B[21m", "\x1B[0m"),
            ("\x1B[99m", "\x1B[0m"),
            // Known start codes map through the `codes` table.
            ("\x1B[53m", "\x1B[55m"),
            ("\x1B[1m", "\x1B[22m"),
            ("\x1B[2m", "\x1B[22m"),
            ("\x1B[31m", "\x1B[39m"),
            // Extended (compound) colours close with the plain fg/bg end code.
            ("\x1B[38;5;200m", "\x1B[39m"),
            ("\x1B[48;2;1;2;3m", "\x1B[49m"),
            // Quirky codes produced from a C1 CSI (U+009B) opener — slice(2).
            // With no `\x1B[` prefix the first two characters are still
            // dropped: "[31m" → "1m" → 1 → 22, "[38;5;1m" → "8;5;1m" → 8 → 28,
            // and "[1m" → "m" has no leading digits → reset.
            ("[31m", "\x1B[22m"),
            ("[38;5;1m", "\x1B[28m"),
            ("[1m", "\x1B[0m"),
        ];
        for (input, expected) in cases {
            assert_eq!(
                get_end_code(input),
                expected,
                "get_end_code({input:?}) mismatch"
            );
        }
    }

    // A link start code closes with a link end code that uses the same
    // terminator as the start code: BEL (`\x07`), ST (`\x1B\\`), or C1 ST
    // (U+009C).
    #[test]
    fn get_end_code_link_suffix_variants() {
        assert_eq!(get_end_code("\x1B]8;;http://x\x07"), "\x1B]8;;\x07");
        assert_eq!(get_end_code("\x1B]8;;http://x\x1B\\"), "\x1B]8;;\x1B\\");
        assert_eq!(get_end_code("\x1B]8;;http://x\u{9C}"), "\x1B]8;;\u{9C}");
    }

    // `is_end_code` must accept exactly the ten members of `endCodesSet` and
    // reject everything else.
    #[test]
    fn is_end_code_matches_set() {
        for c in [
            "\x1B[0m", "\x1B[22m", "\x1B[23m", "\x1B[24m", "\x1B[27m", "\x1B[28m", "\x1B[29m",
            "\x1B[39m", "\x1B[49m", "\x1B[55m",
        ] {
            assert!(is_end_code(c), "{c:?} should be an end code");
        }
        // Non-members: numbers outside the set, start codes, and `[39m`,
        // which has an end-code number but lacks the `\x1B` prefix.
        for c in [
            "\x1B[25m", "\x1B[54m", "\x1B[59m", "\x1B[1m", "\x1B[31m", "[39m",
        ] {
            assert!(!is_end_code(c), "{c:?} should NOT be an end code");
        }
    }

    // Duplicate codes are emitted only once, at their first position.
    #[test]
    fn ansi_codes_to_string_dedup() {
        let red = AnsiToken {
            code: "\x1B[31m".into(),
            end_code: "\x1B[39m".into(),
        };
        // Two identical codes collapse to one (Test 19).
        let out = ansi_codes_to_string(&[red.clone(), red]);
        assert_eq!(out, "\x1B[31m");
    }
}