inkferro-core 0.1.0

//! Port of `styledChars.js` from `@alcalzone/ansi-tokenize@0.3.0`.

use std::rc::Rc;

use unicode_segmentation::UnicodeSegmentation;

use super::ansi_codes::ansi_codes_to_string;
use super::diff::diff_ansi_codes;
use super::reduce::reduce_ansi_codes_in_place;
use super::tokenize::is_fullwidth_grapheme;
use super::types::{AnsiToken, StyledChar, Token, empty_styles};

/// Convert a token stream into one `StyledChar` per visible character.
///
/// Every emitted character records the ANSI style stack that was active at
/// the point it appeared. Mirrors `styledCharsFromTokens(tokens)` from the JS
/// source.
///
/// * `Token::Ansi` updates the running style stack and emits nothing.
/// * `Token::Char` emits a `StyledChar` sharing the current stack snapshot.
/// * `Token::Control` is ignored entirely.
pub fn styled_chars_from_tokens(tokens: &[Token]) -> Vec<StyledChar> {
    // Reduced stack of active styles; one buffer, mutated in place for the
    // whole walk.
    let mut stack: Vec<AnsiToken> = Vec::new();
    // Immutable snapshot of `stack` that is shared (via `Rc::clone`) by every
    // character until the next ANSI token replaces it. It begins as the
    // `empty_styles()` value so unstyled leading text never snapshots `stack`.
    let mut snapshot: Rc<[AnsiToken]> = empty_styles();

    tokens
        .iter()
        .filter_map(|token| match token {
            Token::Ansi(sgr) => {
                // Fold this single token into the stack without building a
                // temporary `Vec`.
                reduce_ansi_codes_in_place(&mut stack, std::slice::from_ref(sgr));
                // Re-snapshot once per ANSI token; an empty stack falls back
                // to `empty_styles()` instead of copying an empty slice.
                snapshot = match stack.as_slice() {
                    [] => empty_styles(),
                    active => Rc::from(active),
                };
                None
            }
            Token::Char(c) => Some(StyledChar {
                // The token's text is converted straight into the target
                // string type; no intermediate owned copy is made here.
                value: c.value.into(),
                full_width: c.full_width,
                styles: Rc::clone(&snapshot),
            }),
            // Control tokens (non-link OSC) never touch the style stack and
            // produce no output character.
            Token::Control(_) => None,
        })
        .collect()
}

/// Produce `StyledChar`s for a string that contains no ANSI escape opener.
///
/// This is the single-pass shortcut for plain text: when `input` has neither
/// `ESC` (U+001B) nor the C1 CSI (U+009B), `tokenize` would yield nothing but
/// [`Token::Char`] entries and [`styled_chars_from_tokens`] would attach the
/// empty style set to each of them. Walking the grapheme clusters directly
/// gives the same result without building the intermediate token vector or a
/// per-grapheme token string.
///
/// For any input satisfying `!input.contains(['\u{1B}', '\u{9B}'])` the
/// result is intended to equal
/// `styled_chars_from_tokens(&tokenize(input, None))`: segmentation comes
/// from `unicode-segmentation`, width classification from
/// [`is_fullwidth_grapheme`], and every `styles` field is a clone of one
/// `empty_styles()` handle.
///
/// # Precondition
///
/// The caller is responsible for checking that `input` is opener-free. The
/// check here is a `debug_assert!` only; in release builds styled input would
/// be treated as visible graphemes and diverge from the tokenizer.
pub(crate) fn styled_chars_from_plain(input: &str) -> Vec<StyledChar> {
    debug_assert!(
        !input.contains(['\u{1B}', '\u{9B}']),
        "styled_chars_from_plain requires ANSI-free input"
    );

    // One handle to the empty style set, cloned per character.
    let unstyled = empty_styles();
    let mut chars = Vec::new();

    for grapheme in input.graphemes(true) {
        // Code point of the cluster's first scalar; 0 if the cluster has none.
        let lead = grapheme.chars().next().map_or(0, u32::from);
        chars.push(StyledChar {
            value: grapheme.into(),
            full_width: is_fullwidth_grapheme(grapheme, lead),
            styles: Rc::clone(&unstyled),
        });
    }

    chars
}

/// Serialize a slice of `StyledChar`s back into an ANSI-escaped string.
///
/// Escape sequences are emitted only where the style set changes between
/// neighbouring characters (computed with [`diff_ansi_codes`]), so the output
/// carries the minimum set of codes needed to reproduce the styling.
///
/// Mirrors `styledCharsToString(chars)` from the JS source.
pub fn styled_chars_to_string(chars: &[StyledChar]) -> String {
    // A `&[StyledChar]` is itself an `IntoIterator<Item = &StyledChar>`, so it
    // can be handed to the borrowed serializer as-is.
    styled_chars_to_string_borrowed(chars)
}

/// Reference-driven implementation behind [`styled_chars_to_string`].
///
/// Takes anything that iterates over `&StyledChar`, so a caller that only
/// holds references (for example `Grid::get`) can serialize without first
/// cloning each cell into an owned `Vec<StyledChar>`. The output is the same
/// as for the slice-based entry point: adjacent characters are compared
/// pairwise while walking the iterator.
///
/// Returns a freshly allocated `String` containing the serialized text.
pub(crate) fn styled_chars_to_string_borrowed<'a, I>(chars: I) -> String
where
    I: IntoIterator<Item = &'a StyledChar>,
{
    // Start from an empty buffer and let the appending variant fill it.
    let mut serialized = String::new();
    styled_chars_to_string_into(chars, &mut serialized);
    serialized
}

/// Appending variant of [`styled_chars_to_string_borrowed`].
///
/// Writes the serialized form of `chars` onto the end of `out` instead of
/// returning a new `String`. The caller owns `out` and is responsible for
/// clearing or truncating it, which lets a multi-row serializer such as
/// `Grid::get` reuse one allocation across rows.
///
/// The bytes appended are exactly those [`styled_chars_to_string_borrowed`]
/// would return for the same `chars`: the opening codes of the first
/// character, a style delta before each later character whose styles differ
/// from its predecessor, each character's text, and finally the codes that
/// close whatever the last character left active. Existing contents of `out`
/// are left untouched, and an empty `chars` appends nothing.
pub(crate) fn styled_chars_to_string_into<'a, I>(chars: I, out: &mut String)
where
    I: IntoIterator<Item = &'a StyledChar>,
{
    let mut rest = chars.into_iter();

    // Nothing to serialize: leave `out` exactly as it was.
    let first = match rest.next() {
        Some(ch) => ch,
        None => return,
    };

    // The first character opens its complete style set from a blank slate.
    out.push_str(&ansi_codes_to_string(&first.styles));
    out.push_str(&first.value);

    let mut last = first;
    for ch in rest {
        // Two characters backed by the very same `Rc` allocation (every char
        // of one style run, and all unstyled chars sharing the
        // `empty_styles` handle) have identical style slices. Diffing a
        // slice against itself is expected to yield no codes, which
        // serialize to "", so the diff is skipped for that case.
        let same_run = Rc::ptr_eq(&last.styles, &ch.styles);
        if !same_run {
            let delta = diff_ansi_codes(&last.styles, &ch.styles);
            out.push_str(&ansi_codes_to_string(&delta));
        }
        out.push_str(&ch.value);
        last = ch;
    }

    // Close every style still active after the final character.
    let closing = diff_ansi_codes(&last.styles, &[]);
    out.push_str(&ansi_codes_to_string(&closing));
}

// Unit tests for the styled-char conversion and serialization functions
// defined in this file. Expected strings are exact byte sequences; several are
// noted below as ground truth captured from the JS package.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::text::ansi_tokenize::tokenize::tokenize;

    /// Full round trip used by most tests: `tokenize` the input, convert the
    /// tokens to styled chars, then serialize them back to a string.
    fn pipeline(input: &str) -> String {
        styled_chars_to_string(&styled_chars_from_tokens(&tokenize(input, None)))
    }

    // Test 18: styled_chars round-trip. `\x1B[31mred\x1B[39m` is already in
    // normalized form, so the pipeline reproduces it exactly. Ground truth
    // captured from `@alcalzone/ansi-tokenize@0.3.0`.
    #[test]
    fn round_trip_normalized_literal() {
        assert_eq!(pipeline("\x1B[31mred\x1B[39m"), "\x1B[31mred\x1B[39m");
    }

    // A global reset `\x1B[0m` normalizes to the specific close code `\x1B[39m`.
    #[test]
    fn reset_normalizes_to_specific_close() {
        assert_eq!(pipeline("\x1B[31mred\x1B[0m"), "\x1B[31mred\x1B[39m");
    }

    // Compound code + reset: exact ground-truth output from the JS package.
    #[test]
    fn compound_round_trip_exact() {
        assert_eq!(
            pipeline("\x1B[1;3;31mred\x1B[0m"),
            "\x1B[1m\x1B[3m\x1B[31mred\x1B[39m\x1B[23m\x1B[22m"
        );
    }

    // Running the pipeline on its own output must not change it: the first
    // pass normalizes, the second pass must be a no-op.
    #[test]
    fn round_trip_is_idempotent() {
        let once = pipeline("\x1B[1;31mhi\x1B[0m");
        let twice = pipeline(&once);
        assert_eq!(once, twice, "normalized output must be a fixed point");
    }

    // Text without any escape sequence comes back byte-for-byte.
    #[test]
    fn unstyled_string_unchanged() {
        assert_eq!(pipeline("hello"), "hello");
    }

    // Each char carries the style stack that was active where it appeared:
    // styled before the reset, empty after it.
    #[test]
    fn styled_chars_from_tokens_correct_styles() {
        let tokens = tokenize("\x1B[31mab\x1B[0mc", None);
        let styled = styled_chars_from_tokens(&tokens);
        assert_eq!(styled.len(), 3);
        // 'a' and 'b' are red.
        assert_eq!(styled[0].styles.len(), 1);
        assert_eq!(styled[0].styles[0].code, "\x1B[31m");
        assert_eq!(styled[1].styles.len(), 1);
        // 'c' has no styles (reset cleared them).
        assert!(styled[2].styles.is_empty());
    }

    // The fused plain fast path must produce byte-identical `StyledChar`s to
    // the full `tokenize` → `styled_chars_from_tokens` pipeline for every
    // ANSI-free input — including grapheme clusters the tokenizer treats
    // specially (wide CJK, VS16 emoji, regional-indicator flags, combining
    // marks) and the C1 OSC introducer U+009D, which the tokenizer does NOT
    // recognise as an opener (only ESC U+001B and U+009B), so it is a plain
    // grapheme on both paths. A divergence here would corrupt grid content.
    #[test]
    fn styled_chars_from_plain_matches_tokenizer() {
        let cases = [
            "",
            "hello",
            "* item one",
            "╭──────────╮",
            "中文字",
            "a中b",
            "👍🏽 ok",
            "🇺🇸 flag",
            "e\u{0301}x",        // combining acute
            "tab\tend",          // control char (not an opener)
            "C1-OSC:\u{9D}here", // U+009D: plain grapheme, not an opener
            "mix 中\u{FE0F}!",   // VS16 forces full-width
        ];
        for input in cases {
            // Reference result from the two-pass pipeline vs. the fast path.
            let via_tokenizer = styled_chars_from_tokens(&tokenize(input, None));
            let via_plain = styled_chars_from_plain(input);
            assert_eq!(
                via_plain, via_tokenizer,
                "plain fast path diverged from tokenizer for {input:?}"
            );
        }
    }

    // The `Rc::ptr_eq` fast path in `styled_chars_to_string_into` must be
    // unobservable: serializing chars whose runs SHARE one Rc allocation must
    // byte-equal serializing the same logical sequence rebuilt so every char
    // owns a DISTINCT Rc allocation (which can never take the fast path and
    // always runs the full `diff_ansi_codes`). Guards the skip against any
    // future change that would make Rc identity diverge from value identity.
    #[test]
    fn serializer_rc_sharing_is_unobservable() {
        let inputs = [
            "\x1B[31mred and \x1B[1mbold\x1B[0m plain",
            "plain only",
            "\x1B[38;2;1;2;3mtruecolor\x1B[39m mix \x1B[4mu\x1B[24m",
            "\x1B]8;;https://e.com\x07link\x1B]8;;\x07 tail",
            // C1 CSI opener (U+009B, 2 bytes in UTF-8) incl. a compound code:
            // pins the skip independent of opener flavor.
            "\u{9B}[31mc1 \u{9B}[1;4mcompound\u{9B}[0m tail",
            "",
        ];
        for input in inputs {
            // `shared`: chars as produced by the converter, with style runs
            // sharing one `Rc` each.
            let shared = styled_chars_from_tokens(&tokenize(input, None));
            // `unshared`: same values, but each char gets its own copy of the
            // style slice.
            let unshared: Vec<StyledChar> = shared
                .iter()
                .map(|c| StyledChar {
                    value: c.value.clone(),
                    full_width: c.full_width,
                    // Fresh allocation per char: `Rc::ptr_eq` is false for
                    // every adjacent pair (even empty styles).
                    styles: Rc::from(c.styles.to_vec()),
                })
                .collect();
            assert_eq!(
                styled_chars_to_string(&shared),
                styled_chars_to_string(&unshared),
                "Rc-sharing fast path diverged for {input:?}"
            );
        }
    }

    // Control (non-link OSC) tokens are dropped and do not affect styles.
    #[test]
    fn control_tokens_dropped() {
        let tokens = tokenize("\x1B[31ma\x1B]0;t\x07b\x1B[0m", None);
        let styled = styled_chars_from_tokens(&tokens);
        // Only the two visible chars survive; the OSC title sequence between
        // them yields no `StyledChar`.
        assert_eq!(styled.len(), 2);
        assert_eq!(styled[0].value, "a");
        assert_eq!(styled[1].value, "b");
        // 'b' is still red: the control token did not disturb the stack.
        assert_eq!(styled[1].styles[0].code, "\x1B[31m");
    }
}