// inkferro-core 0.1.0

//! Port of `tokenize.js` from `@alcalzone/ansi-tokenize@0.3.0`.
//!
//! Key design choices:
//! - ANSI sequences are (almost) pure ASCII, so they are scanned by byte offset.
//! - Grapheme clusters are extracted with `unicode-segmentation` for visible chars.
//! - `end_char = None` means unlimited (`Number.POSITIVE_INFINITY` in JS).
//!
//! The escape *opener* may be either `ESC` (U+001B, one byte) or the C1 control
//! `U+009B` (`ESCAPES` in the JS source). U+009B encodes to two UTF-8 bytes
//! (`0xC2 0x9B`), so the opener is matched on code points, not raw bytes.

use compact_str::{CompactString, format_compact};
use unicode_segmentation::UnicodeSegmentation;
use unicode_width::UnicodeWidthChar;

use crate::text::ansi_tokenize::ansi_codes::get_end_code;
use crate::text::ansi_tokenize::consts::{CSI, ESC, LINK_CODE_PREFIX, OSC, SGR_FINAL};
use crate::text::ansi_tokenize::types::{AnsiToken, CharToken, ControlToken, Token};

/// The C1 control code point U+009B, accepted alongside `ESC` as the first
/// character of an escape sequence (a member of the JS `ESCAPES` set).
///
/// Unlike `ESC` it is not ASCII: it encodes to two UTF-8 bytes (`0xC2 0x9B`),
/// which is why opener detection compares code points rather than raw bytes.
const C1_OPENER: char = '\u{9B}';

// ─── Full-width detection ────────────────────────────────────────────────────

/// Rust counterpart of `isFullwidthGrapheme` from `tokenize.js`.
///
/// Decides whether a grapheme cluster occupies two terminal columns.
/// `grapheme` is the whole cluster and `base_code_point` is the scalar value
/// of its first character. The cluster is wide when any of these holds:
///
/// 1. `unicode-width` reports the base character as 2 columns wide (this
///    stands in for the JS `is-fullwidth-code-point` check);
/// 2. the cluster contains Variation Selector-16 (U+FE0F), which requests
///    emoji presentation;
/// 3. the base character is a Regional Indicator letter
///    (U+1F1E6..=U+1F1FF), the building block of flag pairs.
///
/// A `base_code_point` that is not a valid `char` simply fails rule 1.
pub(crate) fn is_fullwidth_grapheme(grapheme: &str, base_code_point: u32) -> bool {
    const VS16: char = '\u{FE0F}';
    const REGIONAL_INDICATOR_FIRST: u32 = 0x1F1E6;
    const REGIONAL_INDICATOR_LAST: u32 = 0x1F1FF;

    let base_is_wide = matches!(
        char::from_u32(base_code_point).and_then(UnicodeWidthChar::width),
        Some(2)
    );

    base_is_wide
        || grapheme.contains(VS16)
        || (REGIONAL_INDICATOR_FIRST..=REGIONAL_INDICATOR_LAST).contains(&base_code_point)
}

// ─── SGR parsing ─────────────────────────────────────────────────────────────

/// Scans `s` (which starts with a CSI opener `<ESC-or-C1> [`) for the end of an
/// SGR sequence. Returns the byte index of the `m` terminator, or `None`.
///
/// Mirrors `findSGRSequenceEndIndex`, which begins scanning *after* the 2-char
/// opener. The opener is 2 UTF-16 units in JS but may be 2 or 3 UTF-8 bytes here
/// (`ESC [` = 2 bytes, `\u{9B} [` = 3 bytes), so the caller passes `scan_start`
/// as the byte offset just past the opener.
fn find_sgr_end(s: &[u8], scan_start: usize) -> Option<usize> {
    for (i, &byte) in s.iter().enumerate().skip(scan_start) {
        match byte {
            SGR_FINAL => return Some(i),
            b';' | b'0'..=b'9' => {}
            _ => return None,
        }
    }
    None
}

/// Extracts the SGR sequence that starts at byte `offset` of `input`.
///
/// `opener_len` is the UTF-8 length of the opener character (1 for `ESC`,
/// 2 for `U+009B`); the single-byte `[` follows it, so parameter scanning
/// starts `opener_len + 1` bytes in.
///
/// Returns the raw sequence including opener and terminator (for example
/// `"\x1B[31m"`), borrowed from `input`, or `None` when no valid terminator
/// is found.
fn parse_sgr_sequence(input: &str, offset: usize, opener_len: usize) -> Option<&str> {
    let params_at = opener_len + 1;
    let final_idx = find_sgr_end(&input.as_bytes()[offset..], params_at)?;
    let last = offset + final_idx;
    Some(&input[offset..=last])
}

/// Breaks a compound SGR such as `\x1B[1;3;31m` into one `\x1B[Nm` code per
/// attribute.
///
/// Extended colour selectors stay in one piece: `38;5;N` / `48;5;N` (8-bit)
/// and `38;2;R;G;B` / `48;2;R;G;B` (24-bit). A `38`/`48` without enough
/// following parameters is emitted on its own like any other parameter.
///
/// Mirrors `splitCompoundSGRSequences`, including its `slice(2, -1)`: the
/// first two *characters* (not bytes) and the trailing `m` are dropped, so a
/// C1 opener (U+009B, two UTF-8 bytes) is handled the same way as in JS. Every
/// emitted code is re-prefixed with `ESC [`, whatever opener the input used.
///
/// `tokenize` only calls this for codes containing `;`; the no-`;` early
/// return is kept to match the JS source.
fn split_compound_sgr(code: &str) -> Vec<CompactString> {
    if !code.contains(';') {
        return vec![code.into()];
    }

    // Byte offset of the third character, i.e. the first parameter byte.
    let params_start = code
        .char_indices()
        .nth(2)
        .map_or(code.len(), |(offset, _)| offset);
    // The trailing `m` is always a single byte.
    let params: Vec<&str> = code[params_start..code.len() - 1].split(';').collect();

    let mut codes = Vec::new();
    let mut pending = params.as_slice();
    while let Some((&head, tail)) = pending.split_first() {
        // How many parameters belong to the code that starts at `head`.
        let group_len = match (head, tail.first().copied()) {
            ("38" | "48", Some("5")) if tail.len() >= 2 => 3,
            ("38" | "48", Some("2")) if tail.len() >= 4 => 5,
            _ => 1,
        };
        let (group, remaining) = pending.split_at(group_len);
        let part = group.join(";");
        codes.push(format_compact!("\x1B[{part}m"));
        pending = remaining;
    }
    codes
}

// ─── OSC / link parsing ───────────────────────────────────────────────────────

/// Finds the first OSC terminator in `input` at or after byte offset `start`
/// and returns the byte index of the terminator's **last** byte.
///
/// Recognised terminators:
/// - BEL (`0x07`), one byte;
/// - C1 ST (U+009C), two UTF-8 bytes `0xC2 0x9C`;
/// - `ESC \` (`0x1B 0x5C`), two bytes.
///
/// An `ESC` that is not immediately followed by `\` is skipped. Scanning is
/// done on code points rather than raw bytes so that the two-byte C1 ST is
/// recognised. Returns `None` when no terminator exists.
///
/// `start` must lie on a UTF-8 character boundary within `input`.
///
/// Mirrors `findOSCTerminatorIndex`.
fn find_osc_terminator(input: &str, start: usize) -> Option<usize> {
    let tail = &input[start..];
    tail.char_indices().find_map(|(rel, ch)| {
        // Byte length of the terminator that begins at `rel`, if there is one.
        let terminator_len = match ch {
            '\u{07}' => 1,
            '\u{9C}' => ch.len_utf8(),
            // ESC is one byte, so `rel + 1` is always a valid boundary.
            '\u{1B}' if tail[rel + 1..].starts_with('\\') => 2,
            _ => return None,
        };
        Some(start + rel + terminator_len - 1)
    })
}

/// Attempts to read an OSC 8 hyperlink sequence starting at byte `offset` of
/// `input`. `opener_len` is the UTF-8 length of the opener character (`ESC`
/// or U+009B).
///
/// Returns the raw sequence, terminator included, borrowed from `input`; or
/// `None` when the text is not an OSC 8 link or is unterminated.
///
/// Mirrors `parseLinkCode`: the JS compares `linkCodePrefixCharCodes` starting
/// at index **1**, so the opener itself is never re-checked and a C1 opener
/// followed by `]8;` is accepted too. Accordingly only the `]8;` part of
/// `LINK_CODE_PREFIX` is matched here. After the prefix, the next `;` closes
/// the params field, and the first OSC terminator after that ends the code.
fn parse_link_code(input: &str, offset: usize, opener_len: usize) -> Option<&str> {
    let tail = &input[offset..];
    // `LINK_CODE_PREFIX` without its leading 1-byte ESC, i.e. `]8;`.
    let expected = &LINK_CODE_PREFIX[1..];
    let after_prefix = tail[opener_len..].strip_prefix(expected)?;

    // Position (relative to `tail`) of the `;` that closes the params field.
    let params_end = opener_len + expected.len() + after_prefix.find(';')?;
    let term_last = find_osc_terminator(tail, params_end + 1)?;
    Some(&tail[..=term_last])
}

// ─── Main tokenize function ───────────────────────────────────────────────────

/// Splits an ANSI-escaped string into a sequence of [`Token`]s.
///
/// Three kinds of token are produced:
/// - `Token::Ansi` for SGR codes (compound codes are split into one token per
///   attribute) and OSC 8 hyperlinks, each paired with its end code;
/// - `Token::Control` for any other terminated OSC sequence;
/// - `Token::Char` for every visible grapheme cluster, flagged as full-width
///   when it occupies two columns.
///
/// An opener (`ESC` or U+009B) that does not start a recognised, terminated
/// sequence is emitted as an ordinary `Char` token.
///
/// `end_char` caps the number of visible columns: scanning stops right after
/// the character that reaches or exceeds the cap. `None` means unlimited
/// (`Number.POSITIVE_INFINITY` in the JS source).
pub fn tokenize(input: &str, end_char: Option<usize>) -> Vec<Token<'_>> {
    // Builds an `Ansi` token from a borrowed escape code.
    fn ansi<'t>(code: &str) -> Token<'t> {
        Token::Ansi(AnsiToken {
            code: code.into(),
            end_code: get_end_code(code),
        })
    }

    // Tries to consume one escape sequence whose opener sits at byte `pos`.
    // On success the resulting token(s) are appended to `out` and the number
    // of bytes consumed is returned; on failure `out` is left untouched.
    fn take_escape<'t>(
        input: &'t str,
        pos: usize,
        opener_len: usize,
        out: &mut Vec<Token<'t>>,
    ) -> Option<usize> {
        let follower = input[pos + opener_len..].chars().next()?;

        if follower == OSC as char {
            // A hyperlink takes precedence over a generic OSC sequence.
            if let Some(link) = parse_link_code(input, pos, opener_len) {
                out.push(ansi(link));
                return Some(link.len());
            }
            // Generic OSC (window title, notifications, …): the terminator
            // scan starts after the two-character opener.
            let scan_from = pos + opener_len + follower.len_utf8();
            let term_last = find_osc_terminator(input, scan_from)?;
            out.push(Token::Control(ControlToken {
                code: input[pos..=term_last].to_owned(),
            }));
            Some(term_last + 1 - pos)
        } else if follower == CSI as char {
            let sgr = parse_sgr_sequence(input, pos, opener_len)?;
            if sgr.contains(';') {
                for part in split_compound_sgr(sgr) {
                    let end_code = get_end_code(&part);
                    out.push(Token::Ansi(AnsiToken {
                        code: part,
                        end_code,
                    }));
                }
            } else {
                // Common case: a single attribute, no splitting required.
                out.push(ansi(sgr));
            }
            Some(sgr.len())
        } else {
            None
        }
    }

    let column_limit = end_char.unwrap_or(usize::MAX);
    let mut out = Vec::new();
    let mut columns = 0usize;
    let mut pos = 0usize; // byte offset into `input`

    while pos < input.len() {
        let tail = &input[pos..];
        // Invariant: `pos` is always on a UTF-8 boundary, because it only ever
        // advances by whole sequences or whole grapheme clusters.
        let first = tail.chars().next().expect("non-empty slice has a char");

        if first == ESC as char || first == C1_OPENER {
            if let Some(consumed) = take_escape(input, pos, first.len_utf8(), &mut out) {
                pos += consumed;
                continue;
            }
            // Not a recognised sequence: the opener is treated as a visible
            // character below, as in the JS source.
        }

        let cluster = tail
            .graphemes(true)
            .next()
            .expect("non-empty slice has at least one grapheme");
        let base_cp = cluster.chars().next().map_or(0, u32::from);
        let full_width = is_fullwidth_grapheme(cluster, base_cp);
        out.push(Token::Char(CharToken {
            value: cluster,
            full_width,
        }));

        columns += if full_width { 2 } else { 1 };
        if columns >= column_limit {
            break;
        }
        pos += cluster.len();
    }

    out
}

// ─── Tests ───────────────────────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    // Unit tests for `tokenize`. Each test feeds a literal input string and
    // inspects the resulting token stream through the two helpers below.
    use super::*;
    use crate::text::ansi_tokenize::types::Token;

    /// Returns the `value` of every `Token::Char` in `tokens`, in order;
    /// all other token kinds are skipped.
    fn chars<'a>(tokens: &[Token<'a>]) -> Vec<&'a str> {
        tokens
            .iter()
            .filter_map(|t| match t {
                Token::Char(c) => Some(c.value),
                _ => None,
            })
            .collect()
    }

    /// Returns `(code, end_code)` for every `Token::Ansi` in `tokens`, in
    /// order; all other token kinds are skipped.
    fn ansi_tokens<'a>(tokens: &'a [Token<'a>]) -> Vec<(&'a str, &'a str)> {
        tokens
            .iter()
            .filter_map(|t| match t {
                Token::Ansi(a) => Some((a.code.as_str(), a.end_code.as_str())),
                _ => None,
            })
            .collect()
    }

    // Test 1: Plain text → all CharTokens, full_width = false.
    #[test]
    fn plain_text_chars() {
        let tokens = tokenize("abc", None);
        assert_eq!(tokens.len(), 3);
        assert_eq!(chars(&tokens), vec!["a", "b", "c"]);
        // The two assertions above already establish that all three tokens are
        // `Char` tokens, so the `if let` below never skips anything.
        for t in &tokens {
            if let Token::Char(c) = t {
                assert!(!c.full_width);
            }
        }
    }

    // Test 2: CJK → full_width = true.
    #[test]
    fn cjk_fullwidth() {
        let tokens = tokenize("中", None);
        assert_eq!(tokens.len(), 1);
        match &tokens[0] {
            Token::Char(c) => assert!(c.full_width),
            _ => panic!("expected Char"),
        }
    }

    // Test 3: Emoji with VS16 → full_width = true (VS16 rule).
    #[test]
    fn vs16_fullwidth() {
        let tokens = tokenize("✏️", None);
        assert_eq!(tokens.len(), 1);
        match &tokens[0] {
            Token::Char(c) => {
                assert_eq!(c.value, "✏️");
                assert!(c.full_width);
            }
            _ => panic!("expected Char"),
        }
    }

    // Test 4: Flag → single grapheme, full_width = true (regional indicator rule).
    #[test]
    fn flag_fullwidth() {
        let tokens = tokenize("🇩🇪", None);
        assert_eq!(tokens.len(), 1, "flag is one grapheme cluster");
        match &tokens[0] {
            Token::Char(c) => assert!(c.full_width),
            _ => panic!("expected Char"),
        }
    }

    // Test 5: `\x1B[31mred\x1B[39m`.
    #[test]
    fn red_then_reset_fg() {
        let tokens = tokenize("\x1B[31mred\x1B[39m", None);
        assert_eq!(
            ansi_tokens(&tokens),
            vec![("\x1B[31m", "\x1B[39m"), ("\x1B[39m", "\x1B[39m")]
        );
        assert_eq!(chars(&tokens), vec!["r", "e", "d"]);
    }

    // Test 6: Reset → end_code == itself.
    #[test]
    fn reset_code() {
        let tokens = tokenize("\x1B[0m", None);
        assert_eq!(ansi_tokens(&tokens), vec![("\x1B[0m", "\x1B[0m")]);
    }

    // Test 7: Compound → 3 AnsiTokens.
    #[test]
    fn compound_sgr_split() {
        let tokens = tokenize("\x1B[1;3;31m", None);
        assert_eq!(
            ansi_tokens(&tokens),
            vec![
                ("\x1B[1m", "\x1B[22m"),
                ("\x1B[3m", "\x1B[23m"),
                ("\x1B[31m", "\x1B[39m"),
            ]
        );
    }

    // Test 8: 8-bit color → ONE AnsiToken, end `\x1B[39m`.
    #[test]
    fn eight_bit_color() {
        let tokens = tokenize("\x1B[38;5;200m", None);
        assert_eq!(ansi_tokens(&tokens), vec![("\x1B[38;5;200m", "\x1B[39m")]);
    }

    // Test 9: 24-bit color → ONE AnsiToken, end `\x1B[39m`.
    #[test]
    fn twenty_four_bit_color() {
        let tokens = tokenize("\x1B[38;2;255;0;128m", None);
        assert_eq!(
            ansi_tokens(&tokens),
            vec![("\x1B[38;2;255;0;128m", "\x1B[39m")]
        );
    }

    // Test 10: Compound with embedded 24-bit → bold, 24-bit fg, underline.
    #[test]
    fn compound_with_embedded_24bit() {
        let tokens = tokenize("\x1B[1;38;2;10;20;30;4m", None);
        assert_eq!(
            ansi_tokens(&tokens),
            vec![
                ("\x1B[1m", "\x1B[22m"),
                ("\x1B[38;2;10;20;30m", "\x1B[39m"),
                ("\x1B[4m", "\x1B[24m"),
            ]
        );
    }

    // Test 11: OSC 8 link (BEL) → AnsiToken(link), 4 chars, AnsiToken.
    #[test]
    fn osc8_link_bel() {
        let s = "\x1B]8;;https://example.com\x07text\x1B]8;;\x07";
        let tokens = tokenize(s, None);
        assert_eq!(
            ansi_tokens(&tokens),
            vec![
                ("\x1B]8;;https://example.com\x07", "\x1B]8;;\x07"),
                ("\x1B]8;;\x07", "\x1B]8;;\x07"),
            ]
        );
        assert_eq!(chars(&tokens), vec!["t", "e", "x", "t"]);
    }

    // OSC 8 link terminated by ST (`ESC \`) — exercises the full terminator set.
    #[test]
    fn osc8_link_st() {
        let s = "\x1B]8;;https://e.com\x1B\\hi\x1B]8;;\x1B\\";
        let tokens = tokenize(s, None);
        assert_eq!(
            ansi_tokens(&tokens),
            vec![
                ("\x1B]8;;https://e.com\x1B\\", "\x1B]8;;\x1B\\"),
                ("\x1B]8;;\x1B\\", "\x1B]8;;\x1B\\"),
            ]
        );
        assert_eq!(chars(&tokens), vec!["h", "i"]);
    }

    // Test 12: OSC window title → ControlToken.
    #[test]
    fn osc_window_title() {
        let tokens = tokenize("\x1B]0;title\x07X", None);
        assert_eq!(tokens.len(), 2);
        match &tokens[0] {
            Token::Control(c) => assert_eq!(c.code, "\x1B]0;title\x07"),
            _ => panic!("expected Control"),
        }
        assert_eq!(chars(&tokens), vec!["X"]);
    }

    // OSC title terminated by ST.
    #[test]
    fn osc_window_title_st() {
        let tokens = tokenize("\x1B]0;title\x1B\\X", None);
        match &tokens[0] {
            Token::Control(c) => assert_eq!(c.code, "\x1B]0;title\x1B\\"),
            _ => panic!("expected Control"),
        }
        assert_eq!(chars(&tokens), vec!["X"]);
    }

    // Test 13: Invalid SGR → no ansi token; ESC and following chars are visible.
    #[test]
    fn invalid_sgr_falls_through() {
        let tokens = tokenize("\x1B[31xred", None);
        assert!(ansi_tokens(&tokens).is_empty());
        assert_eq!(
            chars(&tokens),
            vec!["\x1B", "[", "3", "1", "x", "r", "e", "d"]
        );
    }

    // Test 14: end_char limit (narrow chars).
    #[test]
    fn end_char_limit() {
        let tokens = tokenize("abcdef", Some(3));
        assert_eq!(chars(&tokens), vec!["a", "b", "c"]);
    }

    // end_char with full-width chars (each counts as 2 columns).
    #[test]
    fn end_char_fullwidth_limit() {
        // 中(2) + 文(2) + X(1): end_char = 4 stops after 文.
        let tokens = tokenize("中文X", Some(4));
        assert_eq!(chars(&tokens), vec!["中", "文"]);
    }

    // C1 CSI opener (U+009B) followed by `[` parses as an SGR sequence.
    #[test]
    fn c1_csi_opener_parses_sgr() {
        let tokens = tokenize("\u{9B}[31mX", None);
        // The parsed code retains the C1 opener byte(s); endCode is from slice(2).
        assert_eq!(ansi_tokens(&tokens), vec![("\u{9B}[31m", "\x1B[39m")]);
        assert_eq!(chars(&tokens), vec!["X"]);
    }

    // C1 opener (U+009B) followed by `]8;` parses as an OSC-8 link, matching
    // JS `parseLinkCode` which verifies the prefix from index 1 and never
    // re-checks the opener byte. The endCode is the JS slice(2)+parseInt quirk:
    // chars \u{9B} and `]` are dropped, "8;;…" parses to 8 → end code 28.
    // Verified empirically against @alcalzone/ansi-tokenize@0.3.0 on Node.
    #[test]
    fn c1_opener_parses_osc8_link() {
        let tokens = tokenize("\u{9B}]8;;https://x\u{07}T\u{9B}]8;;\u{07}", None);
        assert_eq!(
            ansi_tokens(&tokens),
            vec![
                ("\u{9B}]8;;https://x\u{07}", "\x1B[28m"),
                ("\u{9B}]8;;\u{07}", "\x1B[28m"),
            ]
        );
        assert_eq!(chars(&tokens), vec!["T"]);
    }

    // ── Adversarial: full-pipeline no-panic totality ─────────────────────────

    // The ink-style render path is tokenize → styled_chars_from_tokens →
    // styled_chars_to_string. Control-token / unterminated-sequence streams must
    // round-trip without panic (a panic here kills ink-style rendering): raw C1
    // SGR openers, a NUL run, and nested OSC-8 links. A panic fails the test;
    // reaching the final assert proves the pipeline is total. The C1 case is also
    // pinned: it round-trips to "\u{9b}[31mhi\x1b[39m".
    #[test]
    fn ansi_tokenize_pipeline_raw_c1_and_null_bytes_no_panic() {
        use crate::text::ansi_tokenize::{styled_chars_from_tokens, styled_chars_to_string};
        // `pipe` runs the whole tokenize → styled chars → string round trip.
        let pipe = |s: &str| styled_chars_to_string(&styled_chars_from_tokens(&tokenize(s, None)));
        assert_eq!(pipe("\u{9b}[31mhi\u{9b}[39m"), "\u{9b}[31mhi\x1b[39m");
        let _ = pipe("\x00\x00\x00"); // NUL run: returns without panic
        let _ = pipe("\x1b]8;;a\x07x\x1b]8;;b\x07y\x1b]8;;\x07"); // nested OSC-8: no panic
    }
}