inkferro-core 0.1.0

//! Port of ink's `sanitize-ansi.ts` + the parts of `ansi-tokenizer.ts` it
//! consumes (ink commit 9da2dfa "Strip non-SGR ANSI escape sequences from text
//! content").
//!
//! # Oracle contract (sanitize-ansi.ts:9-33, probed on the live build)
//!
//! `sanitizeAnsi` runs at the squash boundary (`squash-text-nodes.ts:45`) on
//! every `squashTextNodes` return — i.e. once per text-subtree fold, at every
//! nesting level. It tokenizes the string (ansi-tokenizer.ts `tokenizeAnsi`)
//! and keeps ONLY:
//!
//! * `text` tokens — plain characters (including C0 controls like BEL/TAB/LF
//!   that are not sequence introducers);
//! * `osc` tokens — ALL OSC control strings, byte-verbatim, in BOTH the
//!   `ESC ]` and the C1 `U+009D` introducer forms, terminated by BEL, `ESC \`,
//!   or C1 ST `U+009C` (hyperlinks OSC 8, titles OSC 0, anything else);
//! * `csi` tokens that are SGR — final byte `m`, NO intermediate bytes, and a
//!   parameter string matching `/^[\d:;]*$/` (sanitize-ansi.ts:3,22-28) — in
//!   BOTH the `ESC [` and the C1 `U+009B` introducer forms, byte-verbatim
//!   (the C1 introducer byte is preserved, hex-probed: `\x9b31m` survives).
//!
//! Everything else is STRIPPED:
//!
//! * non-SGR CSI (`\x1b[2J`, `\x1b[1A`, `\x1b[2K`, private `\x1b[?25l`,
//!   SGR-with-intermediate `\x1b[1 m`);
//! * `esc` tokens — two-/multi-byte ESC sequences with final in `0x30..=0x7E`
//!   (`\x1b7` DECSC, `\x1b8` DECRC, and even `\x1bB`: "A\x1bB" → "A");
//! * DCS / PM / APC / SOS control strings INCLUDING their payload (both ESC
//!   `P ^ _ X` and C1 `U+0090 U+009E U+009F U+0098` introducers);
//! * a standalone C1 ST `U+009C`;
//! * standalone C1 controls `U+0080..=U+009F` (NEL `U+0085` etc.);
//! * a lone ESC whose follower is neither an introducer, an escape
//!   intermediate (`0x20..=0x2F`), nor an escape final (`0x30..=0x7E`) — the
//!   ESC alone is dropped and scanning resumes at the follower
//!   (ansi-tokenizer.ts comment: "Ignore lone ESC and continue tokenizing
//!   the rest");
//! * MALFORMED tails — an unterminated/invalid sequence (ESC at end of input,
//!   CSI/OSC/DCS without a terminator, ESC + intermediate without a final)
//!   drops the ENTIRE remainder of the string from the introducer on
//!   (`malformedFromIndex` → `invalid` token, dropped by sanitize):
//!   "A\x1b[31" → "A".
//!
//! Probe matrix captured from the live oracle build (`/tmp/t129`, non-TTY;
//! the function is pure — no TTY dependence):
//!
//! | input                          | output                       |
//! |--------------------------------|------------------------------|
//! | `A\x1b[2JB`                    | `AB`                         |
//! | `A\x1b[31mR\x1b[39mB`          | unchanged (SGR kept)         |
//! | `A\x1b[38:5:196mR\x1b[39mB`    | unchanged (colon SGR kept)   |
//! | `A\x1b]8;;u\x07L\x1b]8;;\x07B` | unchanged (OSC 8 kept)       |
//! | `A\x1b]0;title\x07B`           | unchanged (OSC 0 kept)       |
//! | `A\x1b7B` / `A\x1b8B`          | `AB`                         |
//! | `A\u{85}B`                     | `AB` (NEL stripped)          |
//! | `A\u{9b}2JB`                   | `AB` (C1 CSI stripped)       |
//! | `A\u{9b}31mR\u{9b}39mB`        | unchanged (C1 SGR kept)      |
//! | `A\x1bB`                       | `A` (ESC-B is an esc token)  |
//! | `A\x1b[?25lB` / `A\x1b[1 mB`   | `AB`                         |
//! | `A\x1bPq\x1b\\B`               | `AB` (DCS stripped)          |
//! | `A\u{9c}B`                     | `AB` (lone ST stripped)      |
//! | `A\x1b[31`                     | `A` (malformed tail dropped) |
//!
//! # Implementation shape
//!
//! The JS builds a token vec then filters; we fuse tokenize+filter into one
//! pass over a `Vec<char>`. JS indexes UTF-16 code units, but every character
//! that participates in the sequence grammar is ASCII or C1 (one UTF-16 code
//! unit in JS, one `char` here), so char-indexing is decision-equivalent and
//! slice-content-identical. The [`has_ansi_control_characters`] fast path
//! returns the input untouched — the common no-ANSI squash pays one scan and
//! zero allocations.

/// BEL (`U+0007`): accepted as a terminator for OSC control strings only.
const BELL: char = '\x07';
/// ESC (`U+001B`): the 7-bit sequence introducer.
const ESC: char = '\x1B';
/// C1 ST (`U+009C`): terminates a control string.
const ST: char = '\u{9C}';
/// C1 CSI introducer (`U+009B`), the single-character form of `ESC [`.
const C1_CSI: char = '\u{9B}';
/// C1 OSC introducer (`U+009D`), the single-character form of `ESC ]`.
const C1_OSC: char = '\u{9D}';
/// C1 DCS introducer (`U+0090`), the single-character form of `ESC P`.
const C1_DCS: char = '\u{90}';
/// C1 PM introducer (`U+009E`), the single-character form of `ESC ^`.
const C1_PM: char = '\u{9E}';
/// C1 APC introducer (`U+009F`), the single-character form of `ESC _`.
const C1_APC: char = '\u{9F}';
/// C1 SOS introducer (`U+0098`), the single-character form of `ESC X`.
const C1_SOS: char = '\u{98}';

/// Reports whether `c` is a C1 control character (`U+0080..=U+009F`),
/// mirroring ansi-tokenizer.ts `isC1ControlCharacter`.
fn is_c1_control(c: char) -> bool {
    matches!(c, '\u{80}'..='\u{9F}')
}

/// Reports whether `c` is a CSI parameter character (`0x30..=0x3F`, i.e.
/// `0`–`9` and `: ; < = > ?`), mirroring ansi-tokenizer.ts
/// `isCsiParameterCharacter`.
fn is_csi_parameter(c: char) -> bool {
    matches!(u32::from(c), 0x30..=0x3F)
}

/// Reports whether `c` is an intermediate character (`0x20..=0x2F`, space
/// through `/`). ansi-tokenizer.ts uses the same range for both
/// `isCsiIntermediateCharacter` and `isEscapeIntermediateCharacter`, so one
/// predicate serves both grammars.
fn is_intermediate(c: char) -> bool {
    matches!(u32::from(c), 0x20..=0x2F)
}

/// Reports whether `c` can end a CSI sequence (`0x40..=0x7E`, `@` through
/// `~`), mirroring ansi-tokenizer.ts `isCsiFinalCharacter`.
fn is_csi_final(c: char) -> bool {
    matches!(u32::from(c), 0x40..=0x7E)
}

/// Reports whether `c` can end a plain escape sequence (`0x30..=0x7E`, `0`
/// through `~`), mirroring ansi-tokenizer.ts `isEscapeFinalCharacter`.
fn is_escape_final(c: char) -> bool {
    matches!(u32::from(c), 0x30..=0x7E)
}

/// Reports whether `text` contains anything the sanitizer could act on: an
/// ESC or any C1 control character (ansi-tokenizer.ts
/// `hasAnsiControlCharacters`). Stops at the first such character.
pub(crate) fn has_ansi_control_characters(text: &str) -> bool {
    text.contains(|ch: char| ch == ESC || is_c1_control(ch))
}

/// A parsed CSI body, as produced by `read_csi`. Every index is a position
/// in the `chars` slice that was parsed.
struct CsiBody {
    /// Index one past the final byte — where scanning resumes.
    end: usize,
    /// Index one past the last parameter character; equals the parse start
    /// when the sequence has no parameters.
    params_end: usize,
    /// Index one past the last intermediate character, which is also the
    /// index of the final byte; equals `params_end` when the sequence has no
    /// intermediates.
    intermediates_end: usize,
    /// The final byte (`0x40..=0x7E`).
    final_char: char,
}

/// Parses a CSI body — a run of parameter characters, then a run of
/// intermediate characters, then exactly one final byte — beginning at
/// `from`, the index one past the introducer (ansi-tokenizer.ts
/// `readCsiSequence`).
///
/// Returns `None` when the sequence is malformed: the input ends before a
/// final byte, or the character after the two runs is not a CSI final.
fn read_csi(chars: &[char], from: usize) -> Option<CsiBody> {
    let tail = chars.get(from..)?;

    // Measure the two optional runs as lengths, then convert to indices.
    let param_count = tail.iter().take_while(|&&c| is_csi_parameter(c)).count();
    let intermediate_count = tail[param_count..]
        .iter()
        .take_while(|&&c| is_intermediate(c))
        .count();

    let params_end = from + param_count;
    let intermediates_end = params_end + intermediate_count;

    match chars.get(intermediates_end).copied() {
        Some(final_char) if is_csi_final(final_char) => Some(CsiBody {
            end: intermediates_end + 1,
            params_end,
            intermediates_end,
            final_char,
        }),
        _ => None,
    }
}

/// Locates the end of a control string whose payload starts at `from`
/// (ansi-tokenizer.ts `findControlStringTerminatorIndex`).
///
/// Terminators are C1 ST, the two-character `ESC \`, and — only when
/// `allow_bell_terminator` is set (OSC) — BEL. An `ESC ESC` pair is skipped
/// as a unit (tmux doubles ESC bytes inside payloads), so the second ESC of
/// the pair can never begin an `ESC \` terminator.
///
/// Returns the index one past the terminator, or `None` when the input ends
/// before any terminator is found.
fn find_control_string_terminator(
    chars: &[char],
    from: usize,
    allow_bell_terminator: bool,
) -> Option<usize> {
    let mut cursor = from;

    while let Some(&current) = chars.get(cursor) {
        // Single-character terminators.
        if current == ST || (allow_bell_terminator && current == BELL) {
            return Some(cursor + 1);
        }

        // Anything that is not ESC is ordinary payload.
        if current != ESC {
            cursor += 1;
            continue;
        }

        match chars.get(cursor + 1) {
            Some(&follower) if follower == '\\' => return Some(cursor + 2),
            // Tmux escapes ESC bytes in payload as ESC ESC.
            Some(&follower) if follower == ESC => cursor += 2,
            // ESC followed by anything else (or by end of input) is payload.
            _ => cursor += 1,
        }
    }

    None
}

/// Parses a plain escape sequence body — zero or more intermediates followed
/// by one final in `0x30..=0x7E` — beginning at `from`, the index one past
/// ESC (ansi-tokenizer.ts `readEscapeSequence`).
///
/// Returns the index one past the final, or `None` when the input ends
/// before a final or the first non-intermediate character is not a final.
fn read_escape_sequence(chars: &[char], from: usize) -> Option<usize> {
    let tail = chars.get(from..)?;
    // The first non-intermediate character is the only candidate final.
    let final_offset = tail.iter().position(|&c| !is_intermediate(c))?;
    is_escape_final(tail[final_offset]).then_some(from + final_offset + 1)
}

/// Control-string kind. Only OSC is KEPT by sanitize; the kind distinction
/// otherwise only matters for the BEL-terminator rule (OSC-only).
#[derive(PartialEq, Eq, Clone, Copy)]
enum ControlString {
    /// OSC (`ESC ]` or C1 `U+009D`) — kept verbatim; the only kind that may
    /// also be terminated by BEL.
    Osc,
    /// DCS / PM / APC / SOS — all stripped, all ST/ESC\-terminated.
    Other,
}

/// Classifies the character following ESC as a control-string introducer
/// (ansi-tokenizer.ts `getControlStringFromEscapeIntroducer`): `]` opens an
/// OSC; `P`, `^`, `_` and `X` open DCS, PM, APC and SOS respectively. Any
/// other character is not a control-string introducer.
fn control_string_from_escape_introducer(c: char) -> Option<ControlString> {
    if c == ']' {
        Some(ControlString::Osc)
    } else if matches!(c, 'P' | '^' | '_' | 'X') {
        Some(ControlString::Other)
    } else {
        None
    }
}

/// ansi-tokenizer.ts `getControlStringFromC1Introducer`.
fn control_string_from_c1_introducer(c: char) -> Option<ControlString> {
    match c {
        C1_OSC => Some(ControlString::Osc),
        C1_DCS | C1_PM | C1_APC | C1_SOS => Some(ControlString::Other),
        _ => None,
    }
}

/// Checks a CSI parameter run against sanitize-ansi.ts:3
/// `sgrParametersRegex = /^[\d:;]*$/`: every character must be an ASCII
/// digit (JS `\d` is `0-9` only), `:` or `;`. An empty run matches.
fn params_are_sgr(params: &[char]) -> bool {
    params
        .iter()
        .all(|&ch| matches!(ch, '0'..='9' | ':' | ';'))
}

/// Port of `sanitizeAnsi` (sanitize-ansi.ts:9-33): strip ANSI escape
/// sequences that would conflict with layout, keeping SGR and OSC verbatim.
///
/// Takes the squashed `String` by value: the no-control fast path
/// (sanitize-ansi.ts:10-12) returns it unchanged with zero allocation.
pub(crate) fn sanitize_ansi(text: String) -> String {
    if !has_ansi_control_characters(&text) {
        return text;
    }

    let chars: Vec<char> = text.chars().collect();
    let mut out = String::with_capacity(text.len());
    let mut i = 0;

    while i < chars.len() {
        let c = chars[i];

        if c == ESC {
            // ansi-tokenizer.ts: ESC at end of input → malformedFromIndex →
            // the `invalid` remainder token is dropped by sanitize.
            let Some(&following) = chars.get(i + 1) else {
                return out;
            };

            // ESC [ — CSI.
            if following == '[' {
                let Some(csi) = read_csi(&chars, i + 2) else {
                    // Malformed CSI: drop the remainder.
                    return out;
                };
                // sanitize-ansi.ts:18-28: keep only SGR (final 'm', no
                // intermediates, params all digits/:/;), byte-verbatim.
                if csi.final_char == 'm'
                    && csi.intermediates_end == csi.params_end
                    && params_are_sgr(&chars[i + 2..csi.params_end])
                {
                    out.extend(&chars[i..csi.end]);
                }
                i = csi.end;
                continue;
            }

            // ESC ] / P / ^ / _ / X — control string.
            if let Some(kind) = control_string_from_escape_introducer(following) {
                let Some(end) =
                    find_control_string_terminator(&chars, i + 2, kind == ControlString::Osc)
                else {
                    // Unterminated control string: drop the remainder.
                    return out;
                };
                // sanitize-ansi.ts:13-16: OSC kept verbatim; DCS/PM/APC/SOS
                // stripped including payload.
                if kind == ControlString::Osc {
                    out.extend(&chars[i..end]);
                }
                i = end;
                continue;
            }

            // Plain escape sequence (ESC + intermediates + final) — stripped.
            match read_escape_sequence(&chars, i + 1) {
                Some(end) => {
                    i = end;
                    continue;
                }
                None => {
                    // ansi-tokenizer.ts: ESC + intermediate without a final is
                    // a malformed control string → drop the remainder; any
                    // other lone ESC is ignored and scanning continues.
                    if is_intermediate(following) {
                        return out;
                    }
                    i += 1;
                    continue;
                }
            }
        }

        // C1 CSI introducer (U+009B) — same CSI grammar, introducer kept
        // verbatim when the sequence is SGR (hex-probed on the oracle).
        if c == C1_CSI {
            let Some(csi) = read_csi(&chars, i + 1) else {
                return out;
            };
            if csi.final_char == 'm'
                && csi.intermediates_end == csi.params_end
                && params_are_sgr(&chars[i + 1..csi.params_end])
            {
                out.extend(&chars[i..csi.end]);
            }
            i = csi.end;
            continue;
        }

        // C1 control-string introducers (OSC kept, DCS/PM/APC/SOS stripped).
        if let Some(kind) = control_string_from_c1_introducer(c) {
            let Some(end) =
                find_control_string_terminator(&chars, i + 1, kind == ControlString::Osc)
            else {
                return out;
            };
            if kind == ControlString::Osc {
                out.extend(&chars[i..end]);
            }
            i = end;
            continue;
        }

        // Standalone C1 ST and remaining standalone C1 controls (NEL etc.):
        // stripped (ansi-tokenizer.ts `st` / `c1` tokens, dropped by sanitize).
        if is_c1_control(c) {
            i += 1;
            continue;
        }

        out.push(c);
        i += 1;
    }

    out
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Test shorthand: runs `sanitize_ansi` on an owned copy of `input`.
    ///
    /// Every literal below is ORACLE-CAPTURED: the input was fed to the live
    /// ink build's `sanitizeAnsi` (`/home/alpha/rewrite/ink/build/sanitize-ansi.js`,
    /// probe /tmp/t129) and the output recorded byte-for-byte (hex-dumped for
    /// the C1 cases, where the introducer byte is invisible in a terminal).
    fn s(input: &str) -> String {
        sanitize_ansi(input.to_string())
    }

    // ── stripped: standalone non-SGR controls (the #129 defect class) ──────

    #[test]
    fn strips_csi_clear_screen() {
        // Well-formed CSI whose final is 'J', not 'm' → stripped.
        assert_eq!(s("A\x1b[2JB"), "AB");
    }

    #[test]
    fn strips_csi_cursor_up() {
        // Final 'A' belongs to the CSI; the trailing 'B' is plain text.
        assert_eq!(s("A\x1b[1AB"), "AB");
    }

    #[test]
    fn strips_csi_erase_line() {
        // Well-formed CSI whose final is 'K', not 'm' → stripped.
        assert_eq!(s("A\x1b[2KB"), "AB");
    }

    #[test]
    fn strips_private_mode_csi() {
        // Params "?25" fail /^[\d:;]*$/ and the final is 'l' anyway.
        assert_eq!(s("A\x1b[?25lB"), "AB");
    }

    #[test]
    fn strips_csi_with_intermediate_even_when_final_is_m() {
        // \x1b[1 m has intermediate 0x20 → NOT plain SGR → stripped.
        assert_eq!(s("A\x1b[1 mB"), "AB");
    }

    #[test]
    fn strips_decsc_decrc() {
        // ESC 7 (DECSC) and ESC 8 (DECRC) are complete esc tokens: their
        // finals 0x37 / 0x38 are within 0x30..=0x7E.
        assert_eq!(s("A\x1b7B"), "AB");
        assert_eq!(s("A\x1b8B"), "AB");
    }

    #[test]
    fn strips_esc_with_letter_final_consuming_it() {
        // ESC+'B' is a complete esc token (final 0x42 ∈ 0x30..=0x7E): the 'B'
        // is part of the stripped sequence, NOT text. Oracle: "A".
        assert_eq!(s("A\x1bB"), "A");
    }

    #[test]
    fn strips_dcs_with_payload() {
        // ESC P opens a DCS, 'q' is payload, ESC \ terminates; the whole
        // control string is dropped.
        assert_eq!(s("A\x1bPq\x1b\\B"), "AB");
    }

    #[test]
    fn strips_standalone_c1_nel() {
        // U+0085 is a C1 control that introduces nothing → dropped alone.
        assert_eq!(s("A\u{85}B"), "AB");
    }

    #[test]
    fn strips_c1_csi_clear_screen() {
        // Same CSI grammar behind the one-character C1 introducer U+009B.
        assert_eq!(s("A\u{9b}2JB"), "AB");
    }

    #[test]
    fn strips_lone_st() {
        // U+009C outside any control string is just a C1 control → dropped.
        assert_eq!(s("A\u{9c}B"), "AB");
    }

    #[test]
    fn lone_esc_before_non_sequence_char_drops_only_the_esc() {
        // ESC + C1 follower: not intermediate, not escape-final → lone ESC
        // ignored, then the C1 NEL is stripped on its own. Oracle: "AB".
        assert_eq!(s("A\x1b\u{85}B"), "AB");
    }

    // ── malformed: remainder dropped from the introducer on ────────────────

    #[test]
    fn malformed_csi_at_eof_drops_remainder() {
        // Params "31" run to end of input with no final byte.
        assert_eq!(s("A\x1b[31"), "A");
    }

    #[test]
    fn lone_esc_at_eof_drops_it() {
        // ESC with no follower at all is malformed.
        assert_eq!(s("A\x1b"), "A");
    }

    #[test]
    fn esc_intermediate_without_final_drops_remainder() {
        // ESC + 0x20 intermediate + EOF → malformed → remainder dropped.
        assert_eq!(s("A\x1b "), "A");
        // Probe "A\x1b B": ESC+SP+'B' is a COMPLETE esc token ('B' final) →
        // stripped whole. Oracle: "A".
        assert_eq!(s("A\x1b B"), "A");
    }

    #[test]
    fn unterminated_osc_drops_remainder() {
        // No BEL, C1 ST or ESC \ follows the OSC introducer.
        assert_eq!(s("A\x1b]8;;x"), "A");
    }

    // ── kept: SGR byte-verbatim (styling passthrough) ───────────────────────

    #[test]
    fn keeps_sgr_pair() {
        // Final 'm', digit-only params, no intermediates → kept as-is.
        assert_eq!(s("A\x1b[31mR\x1b[39mB"), "A\x1b[31mR\x1b[39mB");
    }

    #[test]
    fn keeps_colon_parameter_sgr() {
        // ':' is allowed by the SGR parameter check alongside digits and ';'.
        assert_eq!(s("A\x1b[38:5:196mR\x1b[39mB"), "A\x1b[38:5:196mR\x1b[39mB");
    }

    #[test]
    fn keeps_c1_csi_sgr_with_introducer_byte() {
        // Hex-probed on the oracle: 41 9b 33 31 6d 52 9b 33 39 6d 42 —
        // the C1 introducer byte SURVIVES.
        assert_eq!(s("A\u{9b}31mR\u{9b}39mB"), "A\u{9b}31mR\u{9b}39mB");
    }

    // ── kept: OSC byte-verbatim (hyperlinks, titles) ────────────────────────

    #[test]
    fn keeps_osc8_hyperlink_bel_terminated() {
        // Both OSC 8 strings (open and close) end at BEL and are kept whole.
        let t = "A\x1b]8;;https://x\x07L\x1b]8;;\x07B";
        assert_eq!(s(t), t);
    }

    #[test]
    fn keeps_osc8_hyperlink_esc_backslash_terminated() {
        // Same hyperlink pair, terminated by the two-character ESC \ form.
        let t = "A\x1b]8;;https://x\x1b\\L\x1b]8;;\x1b\\B";
        assert_eq!(s(t), t);
    }

    #[test]
    fn keeps_osc0_title() {
        // OSC is kept regardless of its numeric command, not just OSC 8.
        let t = "A\x1b]0;title\x07B";
        assert_eq!(s(t), t);
    }

    #[test]
    fn keeps_c1_osc_with_introducer_byte() {
        // Hex-probed: 41 9d 30 3b 74 07 42 — C1 OSC kept verbatim.
        let t = "A\u{9d}0;t\x07B";
        assert_eq!(s(t), t);
    }

    // ── fast path: plain text untouched ─────────────────────────────────────

    #[test]
    fn plain_text_passthrough() {
        // No ESC and no C1 character: LF, TAB and a non-BMP character all
        // pass through unchanged.
        assert_eq!(s("AB"), "AB");
        assert_eq!(
            s("multi\nline\ttext \u{1F600}"),
            "multi\nline\ttext \u{1F600}"
        );
    }
}