inkferro-core 0.1.0

Layout, text measurement, ANSI render, and frame-diff engine for inkferro — a Rust-backed, byte-for-byte drop-in for the ink terminal UI library.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
//! Port of `tokenize.js` from `@alcalzone/ansi-tokenize@0.3.0`.
//!
//! Key design choices:
//! - ANSI sequences are (almost) pure ASCII, so they are scanned by byte offset.
//! - Grapheme clusters are extracted with `unicode-segmentation` for visible chars.
//! - `end_char = None` means unlimited (`Number.POSITIVE_INFINITY` in JS).
//!
//! The escape *opener* may be either `ESC` (U+001B, one byte) or the C1 control
//! `U+009B` (`ESCAPES` in the JS source). U+009B encodes to two UTF-8 bytes
//! (`0xC2 0x9B`), so the opener is matched on code points, not raw bytes.

use compact_str::{CompactString, format_compact};
use unicode_segmentation::UnicodeSegmentation;
use unicode_width::UnicodeWidthChar;

use crate::text::ansi_tokenize::ansi_codes::get_end_code;
use crate::text::ansi_tokenize::consts::{CSI, ESC, LINK_CODE_PREFIX, OSC, SGR_FINAL};
use crate::text::ansi_tokenize::types::{AnsiToken, CharToken, ControlToken, Token};

/// C1 CSI / OSC opener code point (U+009B), a member of the JS `ESCAPES` set.
const C1_OPENER: char = '\u{9B}';

// ─── Full-width detection ────────────────────────────────────────────────────

/// Rust counterpart of `isFullwidthGrapheme` from `tokenize.js`.
///
/// Decides whether `grapheme` (whose first code point is `base_code_point`)
/// occupies two terminal columns. Any one of the following is sufficient:
///
/// 1. `unicode-width` reports the base code point as 2 columns wide
///    (stands in for `is-fullwidth-code-point`);
/// 2. the cluster contains Variation Selector-16 (U+FE0F), which forces emoji
///    presentation;
/// 3. the base code point is a Regional Indicator letter
///    (U+1F1E6..=U+1F1FF), i.e. part of a flag pair.
///
/// A `base_code_point` that is not a valid `char` simply fails check 1.
pub(crate) fn is_fullwidth_grapheme(grapheme: &str, base_code_point: u32) -> bool {
    let wide_base = matches!(
        char::from_u32(base_code_point).and_then(|c| UnicodeWidthChar::width(c)),
        Some(2)
    );

    wide_base
        || grapheme.contains('\u{FE0F}')
        || (0x1F1E6..=0x1F1FF).contains(&base_code_point)
}

// ─── SGR parsing ─────────────────────────────────────────────────────────────

/// Scans `s` (which starts with a CSI opener `<ESC-or-C1> [`) for the end of an
/// SGR sequence. Returns the byte index of the `m` terminator, or `None`.
///
/// Mirrors `findSGRSequenceEndIndex`, which begins scanning *after* the 2-char
/// opener. The opener is 2 UTF-16 units in JS but may be 2 or 3 UTF-8 bytes here
/// (`ESC [` = 2 bytes, `\u{9B} [` = 3 bytes), so the caller passes `scan_start`
/// as the byte offset just past the opener.
fn find_sgr_end(s: &[u8], scan_start: usize) -> Option<usize> {
    for (i, &byte) in s.iter().enumerate().skip(scan_start) {
        match byte {
            SGR_FINAL => return Some(i),
            b';' | b'0'..=b'9' => {}
            _ => return None,
        }
    }
    None
}

/// Extract the raw SGR sequence that starts at byte `offset` of `input`.
///
/// `opener_len` is the byte length of the escape opener (1 for `ESC`, 2 for
/// `U+009B`); the single-byte `[` follows it, so parameter scanning starts at
/// `opener_len + 1`. On success the returned slice runs from the opener up to
/// and including the terminating `m` (e.g. `"\x1B[31m"`); `None` means the
/// bytes at `offset` do not form a well-formed SGR sequence.
fn parse_sgr_sequence(input: &str, offset: usize, opener_len: usize) -> Option<&str> {
    let tail = &input.as_bytes()[offset..];
    find_sgr_end(tail, opener_len + 1).map(|m_idx| &input[offset..=offset + m_idx])
}

/// Split a compound SGR like `\x1B[1;3;31m` into individual `\x1B[Nm` tokens.
///
/// Keeps `38;5;N` / `48;5;N` (8-bit) and `38;2;R;G;B` / `48;2;R;G;B` (24-bit)
/// colour codes together. Mirrors `splitCompoundSGRSequences`: the parameter
/// list is what remains after dropping the first two *characters* (so a C1-CSI
/// opener, whose first character is the two-byte U+009B, is handled like
/// `ESC [`) and the final character.
///
/// Every produced token is re-emitted with the canonical `\x1B[` opener,
/// regardless of which opener `code` used.
///
/// The caller (`tokenize`) handles the non-compound (no `;`) case inline, so
/// this normally only sees compound codes; the no-`;` early return is kept for
/// safety. For the same reason the slicing is total: a `code` too short to have
/// anything between opener and terminator, or one whose last character is
/// multi-byte, yields an empty parameter list (as JS `slice(2, -1)` would)
/// instead of panicking on an invalid byte range.
fn split_compound_sgr(code: &str) -> Vec<CompactString> {
    if !code.contains(';') {
        return vec![code.into()];
    }

    // JS: `code.slice(2, -1)` — drop the first two chars and the last char.
    // Both bounds are char boundaries, so `get` only fails when start > end.
    let inner_start = code
        .char_indices()
        .nth(2)
        .map_or(code.len(), |(off, _)| off);
    let inner_end = code.char_indices().next_back().map_or(0, |(off, _)| off);
    let inner = code.get(inner_start..inner_end).unwrap_or("");

    let parts: Vec<&str> = inner.split(';').collect();
    let mut tokens = Vec::with_capacity(parts.len());
    let mut i = 0;
    while i < parts.len() {
        let is_color_intro = matches!(parts[i], "38" | "48");
        // Number of parameters that belong to the token starting at `i`:
        // 3 for `38;5;N`, 5 for `38;2;R;G;B`, otherwise just the one.
        let span = match parts.get(i + 1).copied() {
            Some("5") if is_color_intro && i + 2 < parts.len() => 3,
            Some("2") if is_color_intro && i + 4 < parts.len() => 5,
            _ => 1,
        };
        tokens.push(format_compact!("\x1B[{}m", parts[i..i + span].join(";")));
        i += span;
    }
    tokens
}

// ─── OSC / link parsing ───────────────────────────────────────────────────────

/// Search `input` from byte offset `start` for the first OSC terminator and
/// return the byte offset of that terminator's **last** byte.
///
/// Recognised terminators:
/// - BEL (`0x07`);
/// - C1 ST (U+009C, encoded as `0xC2 0x9C`);
/// - `ESC \` (`0x1B 0x5C`).
///
/// Mirrors `findOSCTerminatorIndex`. The scan walks code points rather than
/// raw bytes so that the two-byte C1 ST is matched as a unit. An `ESC` not
/// immediately followed by `\` is skipped. Returns `None` when no terminator
/// is present.
fn find_osc_terminator(input: &str, start: usize) -> Option<usize> {
    let tail = &input[start..];
    tail.char_indices().find_map(|(rel, ch)| {
        let last_byte = match ch {
            '\u{07}' => rel,
            '\u{9C}' => rel + ch.len_utf8() - 1,
            // ESC is one byte, so `rel + 1` is the start of the next char;
            // the backslash there is the terminator's last byte.
            '\u{1B}' if tail[rel + 1..].starts_with('\\') => rel + 1,
            _ => return None,
        };
        Some(start + last_byte)
    })
}

/// Attempt to read an OSC 8 hyperlink sequence beginning at byte `offset`.
///
/// `opener_len` is the byte length of the escape opener found at `offset`
/// (ESC or the C1 control U+009B). On success the returned slice spans the
/// whole sequence, terminator included; otherwise `None`.
///
/// Mirrors `parseLinkCode`: the JS checks `linkCodePrefixCharCodes` starting
/// at index **1**, never re-examining the opener itself, so a C1 opener
/// followed by `]8;` is accepted as a link too. Accordingly only `]8;` is
/// verified here. The `;` that closes the params field is then looked up from
/// just past the prefix, and the sequence ends at the next OSC terminator.
fn parse_link_code(input: &str, offset: usize, opener_len: usize) -> Option<&str> {
    let tail = &input[offset..];
    // `LINK_CODE_PREFIX` without its leading 1-byte ESC, i.e. `]8;`.
    let expected = &LINK_CODE_PREFIX[1..];
    if !tail[opener_len..].starts_with(expected) {
        return None;
    }

    let params_start = opener_len + expected.len();
    // Position (relative to `tail`) of the `;` closing the params field.
    let Some(params_len) = tail[params_start..].find(';') else {
        return None;
    };
    let params_end = params_start + params_len;

    find_osc_terminator(tail, params_end + 1).map(|last| &input[offset..=offset + last])
}

// ─── Main tokenize function ───────────────────────────────────────────────────

/// Tokenize an ANSI-escaped string into a vector of [`Token`]s.
///
/// `end_char` limits the output to this many visible columns; `None` means
/// unlimited (matching `Number.POSITIVE_INFINITY` in the JS source).
pub fn tokenize(input: &str, end_char: Option<usize>) -> Vec<Token<'_>> {
    let end_char = end_char.unwrap_or(usize::MAX);
    let mut tokens = Vec::new();
    let mut visible = 0usize;
    let mut i = 0usize; // byte offset

    while i < input.len() {
        let rest = &input[i..];
        // SAFETY: `i` always sits on a UTF-8 boundary (we only ever advance by
        // whole code points / grapheme clusters / parsed-sequence byte lengths).
        let cp = rest.chars().next().expect("non-empty slice has a char");

        if cp == ESC as char || cp == C1_OPENER {
            // Peek the next code point after the opener.
            let opener_len = cp.len_utf8();
            let next_cp = rest[opener_len..].chars().next();

            if next_cp == Some(OSC as char) {
                // OSC — try a hyperlink first, then a generic control sequence.
                if let Some(code) = parse_link_code(input, i, opener_len) {
                    let len = code.len();
                    let end_code = get_end_code(code);
                    tokens.push(Token::Ansi(AnsiToken {
                        code: code.into(),
                        end_code,
                    }));
                    i += len;
                    continue;
                }
                // Generic OSC (window title, notifications, …): terminator scan
                // begins after the 2-char `ESC ]` opener (`startIndex = 2`).
                let scan_start = i + opener_len + (OSC as char).len_utf8();
                if let Some(term_last) = find_osc_terminator(input, scan_start) {
                    let code = &input[i..=term_last];
                    tokens.push(Token::Control(ControlToken {
                        code: code.to_owned(),
                    }));
                    i = term_last + 1;
                    continue;
                }
            } else if next_cp == Some(CSI as char) {
                // CSI / SGR sequence.
                if let Some(code) = parse_sgr_sequence(input, i, opener_len) {
                    let len = code.len();
                    if !code.contains(';') {
                        // Non-compound (the common case): push directly,
                        // skipping `split_compound_sgr`'s `vec![code.into()]`
                        // round trip. Identical output: the splitter's no-`;`
                        // early return yields exactly this one token.
                        let end_code = get_end_code(code);
                        tokens.push(Token::Ansi(AnsiToken {
                            code: code.into(),
                            end_code,
                        }));
                    } else {
                        for part in split_compound_sgr(code) {
                            let end_code = get_end_code(&part);
                            tokens.push(Token::Ansi(AnsiToken {
                                code: part,
                                end_code,
                            }));
                        }
                    }
                    i += len;
                    continue;
                }
            }
            // Fall through: the opener is consumed as an ordinary visible
            // character (matching the JS char-handling fallthrough).
        }

        // Visible character — extract one grapheme cluster.
        let cluster = rest
            .graphemes(true)
            .next()
            .expect("non-empty slice has at least one grapheme");
        let base_cp = cluster.chars().next().map(|c| c as u32).unwrap_or(0);
        let full_width = is_fullwidth_grapheme(cluster, base_cp);
        tokens.push(Token::Char(CharToken {
            value: cluster,
            full_width,
        }));
        visible += if full_width { 2 } else { 1 };
        if visible >= end_char {
            break;
        }
        i += cluster.len();
    }

    tokens
}

// ─── Tests ───────────────────────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    use super::*;
    use crate::text::ansi_tokenize::types::Token;

    /// Collect the values of all `Char` tokens, in order.
    fn chars<'a>(tokens: &[Token<'a>]) -> Vec<&'a str> {
        tokens
            .iter()
            .filter_map(|t| match t {
                Token::Char(c) => Some(c.value),
                _ => None,
            })
            .collect()
    }

    /// Collect `(code, end_code)` for all `Ansi` tokens, in order.
    fn ansi_tokens<'a>(tokens: &'a [Token<'a>]) -> Vec<(&'a str, &'a str)> {
        tokens
            .iter()
            .filter_map(|t| match t {
                Token::Ansi(a) => Some((a.code.as_str(), a.end_code.as_str())),
                _ => None,
            })
            .collect()
    }

    // Test 1: Plain text → all CharTokens, full_width = false.
    #[test]
    fn plain_text_chars() {
        let tokens = tokenize("abc", None);
        assert_eq!(tokens.len(), 3);
        assert_eq!(chars(&tokens), vec!["a", "b", "c"]);
        for t in &tokens {
            if let Token::Char(c) = t {
                assert!(!c.full_width);
            }
        }
    }

    // Test 2: CJK → full_width = true.
    #[test]
    fn cjk_fullwidth() {
        // A single CJK ideograph: one grapheme, two columns.
        let tokens = tokenize("中", None);
        assert_eq!(tokens.len(), 1);
        match &tokens[0] {
            Token::Char(c) => assert!(c.full_width),
            _ => panic!("expected Char"),
        }
    }

    // Test 3: Emoji with VS16 → full_width = true (VS16 rule).
    #[test]
    fn vs16_fullwidth() {
        let tokens = tokenize("✏️", None);
        assert_eq!(tokens.len(), 1);
        match &tokens[0] {
            Token::Char(c) => {
                assert_eq!(c.value, "✏️");
                assert!(c.full_width);
            }
            _ => panic!("expected Char"),
        }
    }

    // Test 4: Flag → single grapheme, full_width = true (regional indicator rule).
    #[test]
    fn flag_fullwidth() {
        let tokens = tokenize("🇩🇪", None);
        assert_eq!(tokens.len(), 1, "flag is one grapheme cluster");
        match &tokens[0] {
            Token::Char(c) => assert!(c.full_width),
            _ => panic!("expected Char"),
        }
    }

    // Test 5: `\x1B[31mred\x1B[39m`.
    #[test]
    fn red_then_reset_fg() {
        let tokens = tokenize("\x1B[31mred\x1B[39m", None);
        assert_eq!(
            ansi_tokens(&tokens),
            vec![("\x1B[31m", "\x1B[39m"), ("\x1B[39m", "\x1B[39m")]
        );
        assert_eq!(chars(&tokens), vec!["r", "e", "d"]);
    }

    // Test 6: Reset → end_code == itself.
    #[test]
    fn reset_code() {
        let tokens = tokenize("\x1B[0m", None);
        assert_eq!(ansi_tokens(&tokens), vec![("\x1B[0m", "\x1B[0m")]);
    }

    // Test 7: Compound → 3 AnsiTokens.
    #[test]
    fn compound_sgr_split() {
        let tokens = tokenize("\x1B[1;3;31m", None);
        assert_eq!(
            ansi_tokens(&tokens),
            vec![
                ("\x1B[1m", "\x1B[22m"),
                ("\x1B[3m", "\x1B[23m"),
                ("\x1B[31m", "\x1B[39m"),
            ]
        );
    }

    // Test 8: 8-bit color → ONE AnsiToken, end `\x1B[39m`.
    #[test]
    fn eight_bit_color() {
        let tokens = tokenize("\x1B[38;5;200m", None);
        assert_eq!(ansi_tokens(&tokens), vec![("\x1B[38;5;200m", "\x1B[39m")]);
    }

    // Test 9: 24-bit color → ONE AnsiToken, end `\x1B[39m`.
    #[test]
    fn twenty_four_bit_color() {
        let tokens = tokenize("\x1B[38;2;255;0;128m", None);
        assert_eq!(
            ansi_tokens(&tokens),
            vec![("\x1B[38;2;255;0;128m", "\x1B[39m")]
        );
    }

    // Test 10: Compound with embedded 24-bit → bold, 24-bit fg, underline.
    #[test]
    fn compound_with_embedded_24bit() {
        let tokens = tokenize("\x1B[1;38;2;10;20;30;4m", None);
        assert_eq!(
            ansi_tokens(&tokens),
            vec![
                ("\x1B[1m", "\x1B[22m"),
                ("\x1B[38;2;10;20;30m", "\x1B[39m"),
                ("\x1B[4m", "\x1B[24m"),
            ]
        );
    }

    // Test 11: OSC 8 link (BEL) → AnsiToken(link), 4 chars, AnsiToken.
    #[test]
    fn osc8_link_bel() {
        let s = "\x1B]8;;https://example.com\x07text\x1B]8;;\x07";
        let tokens = tokenize(s, None);
        assert_eq!(
            ansi_tokens(&tokens),
            vec![
                ("\x1B]8;;https://example.com\x07", "\x1B]8;;\x07"),
                ("\x1B]8;;\x07", "\x1B]8;;\x07"),
            ]
        );
        assert_eq!(chars(&tokens), vec!["t", "e", "x", "t"]);
    }

    // OSC 8 link terminated by ST (`ESC \`) — exercises the full terminator set.
    #[test]
    fn osc8_link_st() {
        let s = "\x1B]8;;https://e.com\x1B\\hi\x1B]8;;\x1B\\";
        let tokens = tokenize(s, None);
        assert_eq!(
            ansi_tokens(&tokens),
            vec![
                ("\x1B]8;;https://e.com\x1B\\", "\x1B]8;;\x1B\\"),
                ("\x1B]8;;\x1B\\", "\x1B]8;;\x1B\\"),
            ]
        );
        assert_eq!(chars(&tokens), vec!["h", "i"]);
    }

    // Test 12: OSC window title → ControlToken.
    #[test]
    fn osc_window_title() {
        let tokens = tokenize("\x1B]0;title\x07X", None);
        assert_eq!(tokens.len(), 2);
        match &tokens[0] {
            Token::Control(c) => assert_eq!(c.code, "\x1B]0;title\x07"),
            _ => panic!("expected Control"),
        }
        assert_eq!(chars(&tokens), vec!["X"]);
    }

    // OSC title terminated by ST.
    #[test]
    fn osc_window_title_st() {
        let tokens = tokenize("\x1B]0;title\x1B\\X", None);
        match &tokens[0] {
            Token::Control(c) => assert_eq!(c.code, "\x1B]0;title\x1B\\"),
            _ => panic!("expected Control"),
        }
        assert_eq!(chars(&tokens), vec!["X"]);
    }

    // Test 13: Invalid SGR → no ansi token; ESC and following chars are visible.
    #[test]
    fn invalid_sgr_falls_through() {
        let tokens = tokenize("\x1B[31xred", None);
        assert!(ansi_tokens(&tokens).is_empty());
        assert_eq!(
            chars(&tokens),
            vec!["\x1B", "[", "3", "1", "x", "r", "e", "d"]
        );
    }

    // Test 14: end_char limit (narrow chars).
    #[test]
    fn end_char_limit() {
        let tokens = tokenize("abcdef", Some(3));
        assert_eq!(chars(&tokens), vec!["a", "b", "c"]);
    }

    // end_char with full-width chars (each counts as 2 columns).
    #[test]
    fn end_char_fullwidth_limit() {
        // 中(2) + 文(2) + X(1): end_char = 4 stops after 文.
        let tokens = tokenize("中文X", Some(4));
        assert_eq!(chars(&tokens), vec!["中", "文"]);
    }

    // C1 CSI opener (U+009B) followed by `[` parses as an SGR sequence.
    #[test]
    fn c1_csi_opener_parses_sgr() {
        let tokens = tokenize("\u{9B}[31mX", None);
        // The parsed code retains the C1 opener byte(s); endCode is from slice(2).
        assert_eq!(ansi_tokens(&tokens), vec![("\u{9B}[31m", "\x1B[39m")]);
        assert_eq!(chars(&tokens), vec!["X"]);
    }

    // C1 opener (U+009B) followed by `]8;` parses as an OSC-8 link, matching
    // JS `parseLinkCode` which verifies the prefix from index 1 and never
    // re-checks the opener byte. The endCode is the JS slice(2)+parseInt quirk:
    // chars \u{9B} and `]` are dropped, "8;;…" parses to 8 → end code 28.
    // Verified empirically against @alcalzone/ansi-tokenize@0.3.0 on Node.
    #[test]
    fn c1_opener_parses_osc8_link() {
        let tokens = tokenize("\u{9B}]8;;https://x\u{07}T\u{9B}]8;;\u{07}", None);
        assert_eq!(
            ansi_tokens(&tokens),
            vec![
                ("\u{9B}]8;;https://x\u{07}", "\x1B[28m"),
                ("\u{9B}]8;;\u{07}", "\x1B[28m"),
            ]
        );
        assert_eq!(chars(&tokens), vec!["T"]);
    }

    // ── Adversarial: full-pipeline no-panic totality ─────────────────────────

    // The ink-style render path is tokenize → styled_chars_from_tokens →
    // styled_chars_to_string. Control-token / unterminated-sequence streams must
    // round-trip without panic (a panic here kills ink-style rendering): raw C1
    // SGR openers, a NUL run, and nested OSC-8 links. A panic fails the test;
    // reaching the final assert proves the pipeline is total. The C1 case is also
    // pinned: it round-trips to "\u{9b}[31mhi\x1b[39m".
    #[test]
    fn ansi_tokenize_pipeline_raw_c1_and_null_bytes_no_panic() {
        use crate::text::ansi_tokenize::{styled_chars_from_tokens, styled_chars_to_string};
        let pipe = |s: &str| styled_chars_to_string(&styled_chars_from_tokens(&tokenize(s, None)));
        assert_eq!(pipe("\u{9b}[31mhi\u{9b}[39m"), "\u{9b}[31mhi\x1b[39m");
        let _ = pipe("\x00\x00\x00"); // NUL run: returns without panic
        let _ = pipe("\x1b]8;;a\x07x\x1b]8;;b\x07y\x1b]8;;\x07"); // nested OSC-8: no panic
    }
}