Skip to main content

oo_ide/
vt_parser.rs

//! VT100 SGR parser for task output streams.
//!
//! Converts raw byte chunks into [`StyledLine`] values — structured, styled
//! representations of individual terminal output lines.
//!
//! # Parsing scope (MVP)
//!
//! Handles what is needed for build-tool output:
//!
//! * `\n` — emit current line
//! * `\r` — move cursor to column 0 (overwrite mode; does **not** clear the
//!   line — correct POSIX terminal behaviour, which also handles CRLF correctly)
//! * SGR sequences (`\x1b[...m`):
//!   * reset (0)
//!   * bold (1), italic (3), underline (4)
//!   * standard foreground (30–37), bright foreground (90–97), default fg (39)
//!   * standard background (40–47), bright background (100–107), default bg (49)
//!   * 256-colour fg/bg (`38;5;n` / `48;5;n`)
//!   * truecolour fg/bg (`38;2;r;g;b` / `48;2;r;g;b`)
//! * Unknown / malformed sequences — silently skipped, never panics
//! * Invalid UTF-8 — lossy replacement
22
23// ---------------------------------------------------------------------------
24// Public types
25// ---------------------------------------------------------------------------
26
/// A terminal colour value.
///
/// This is a tiny plain value (at most three payload bytes), so besides the
/// comparison traits it is also `Copy` (no `.clone()` needed when passing it
/// around) and `Hash` (usable as a map or set key, e.g. for palette caches).
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum Color {
    /// Inherited from the terminal default.
    Default,
    /// One of the 256 indexed palette colours (0–255).
    Indexed(u8),
    /// 24-bit RGB truecolour, in `(red, green, blue)` order.
    Rgb(u8, u8, u8),
}
37
38/// Text rendering style at a given position.
39#[derive(Clone, Debug, PartialEq, Eq)]
40#[derive(Default)]
41pub struct Style {
42    pub fg: Option<Color>,
43    pub bg: Option<Color>,
44    pub bold: bool,
45    pub italic: bool,
46    pub underline: bool,
47}
48
49impl Style {
50    /// Returns `true` if all fields are at their default (zero) values.
51    pub fn is_default(&self) -> bool {
52        self.fg.is_none() && self.bg.is_none() && !self.bold && !self.italic && !self.underline
53    }
54}
55
56
57/// A styled region within a [`StyledLine`].
58///
59/// `start` and `end` are **byte offsets** into [`StyledLine::text`].
60#[derive(Clone, Debug, PartialEq, Eq)]
61pub struct StyleSpan {
62    /// Start byte offset (inclusive).
63    pub start: usize,
64    /// End byte offset (exclusive).
65    pub end: usize,
66    pub style: Style,
67}
68
69/// A single completed terminal output line with optional styling.
70///
71/// Only non-default styled regions appear in `spans`; unstyled text simply has
72/// no span entry.  Spans are non-overlapping and sorted by byte offset.
73#[derive(Clone, Debug, PartialEq, Eq)]
74pub struct StyledLine {
75    pub text: String,
76    pub spans: Vec<StyleSpan>,
77}
78
79// ---------------------------------------------------------------------------
80// Parser state
81// ---------------------------------------------------------------------------
82
/// Where the byte-level state machine currently is.
enum ParseState {
    /// Ordinary text: bytes are decoded and written to the current line.
    Normal,
    /// An `\x1b` has just been consumed; a following `[` opens a CSI sequence.
    Escape,
    /// Inside `\x1b[`: the bytes collected so far for the sequence.
    Csi(Vec<u8>),
}
91
92// ---------------------------------------------------------------------------
93// TerminalParser
94// ---------------------------------------------------------------------------
95
96/// Stateful, incremental VT100 SGR parser.
97///
98/// Each task output stream (stdout / stderr) should have its own `TerminalParser`
99/// instance because SGR state is per-stream.
100///
101/// # Usage
102///
103/// ```ignore
104/// let mut parser = TerminalParser::new();
105/// for chunk in byte_stream {
106///     for line in parser.push(&chunk) {
107///         // handle completed line
108///     }
109/// }
110/// if let Some(line) = parser.flush() {
111///     // handle trailing line with no newline
112/// }
113/// ```
114pub struct TerminalParser {
115    state: ParseState,
116    current_style: Style,
117    /// Per-character line buffer: `(character, style-at-that-position)`.
118    line_cells: Vec<(char, Style)>,
119    /// Write cursor in `line_cells`; advances on write, resets on `\r`.
120    cursor: usize,
121    /// Leftover bytes from an incomplete UTF-8 multi-byte sequence.
122    utf8_buf: Vec<u8>,
123}
124
125impl TerminalParser {
126    pub fn new() -> Self {
127        Self {
128            state: ParseState::Normal,
129            current_style: Style::default(),
130            line_cells: Vec::new(),
131            cursor: 0,
132            utf8_buf: Vec::new(),
133        }
134    }
135
136    /// Push a raw byte chunk through the parser.
137    ///
138    /// Returns all lines that were **completed** (terminated by `\n`) within
139    /// this chunk.  Partial lines are buffered internally.
140    pub fn push(&mut self, bytes: &[u8]) -> Vec<StyledLine> {
141        // Prepend any leftover bytes from an incomplete multi-byte sequence.
142        let data: Vec<u8> = if self.utf8_buf.is_empty() {
143            bytes.to_vec()
144        } else {
145            let mut v = std::mem::take(&mut self.utf8_buf);
146            v.extend_from_slice(bytes);
147            v
148        };
149
150        let mut completed = Vec::new();
151        let mut i = 0;
152
153        while i < data.len() {
154            let b = data[i];
155
156            // Take ownership of current state (replacing with Normal) so we can
157            // freely reassign `self.state` inside each arm without borrow issues.
158            let state = std::mem::replace(&mut self.state, ParseState::Normal);
159
160            match state {
161                ParseState::Escape => {
162                    if b == b'[' {
163                        self.state = ParseState::Csi(Vec::new());
164                    }
165                    // Any other byte after \x1b: discard the escape, state stays Normal.
166                    i += 1;
167                }
168
169                ParseState::Csi(mut params) => {
170                    if b == b'm' {
171                        // End of SGR sequence — apply it; state stays Normal.
172                        self.apply_sgr(&params);
173                    } else if b.is_ascii_digit() || b == b';' {
174                        params.push(b);
175                        self.state = ParseState::Csi(params);
176                    }
177                    // Any other terminator: discard the sequence, state stays Normal.
178                    i += 1;
179                }
180
181                ParseState::Normal => {
182                    if b == b'\x1b' {
183                        self.state = ParseState::Escape;
184                        i += 1;
185                    } else if b == b'\n' {
186                        completed.push(self.emit_line());
187                        i += 1;
188                    } else if b == b'\r' {
189                        // Move cursor to column 0; existing cell content is preserved
190                        // (correct POSIX behaviour — handles CRLF endings naturally).
191                        self.cursor = 0;
192                        i += 1;
193                    } else {
194                        let char_len = utf8_char_len(b);
195                        if i + char_len > data.len() {
196                            // Incomplete multi-byte sequence at end of chunk.
197                            self.utf8_buf = data[i..].to_vec();
198                            return completed;
199                        }
200                        let ch = String::from_utf8_lossy(&data[i..i + char_len])
201                            .chars()
202                            .next()
203                            .unwrap_or('\u{FFFD}');
204                        self.write_char(ch);
205                        i += char_len;
206                    }
207                }
208            }
209        }
210
211        completed
212    }
213
214    /// Flush any remaining partial line (content that has no trailing `\n`).
215    ///
216    /// Should be called once when the underlying stream is closed.
217    pub fn flush(&mut self) -> Option<StyledLine> {
218        // Consume any buffered partial UTF-8 bytes as replacement characters.
219        if !self.utf8_buf.is_empty() {
220            let buf = std::mem::take(&mut self.utf8_buf);
221            for ch in String::from_utf8_lossy(&buf).chars() {
222                self.write_char(ch);
223            }
224        }
225        if self.line_cells.is_empty() {
226            None
227        } else {
228            Some(self.emit_line())
229        }
230    }
231
232    // --- private helpers ---------------------------------------------------
233
234    fn write_char(&mut self, ch: char) {
235        if self.cursor < self.line_cells.len() {
236            self.line_cells[self.cursor] = (ch, self.current_style.clone());
237        } else {
238            self.line_cells.push((ch, self.current_style.clone()));
239        }
240        self.cursor += 1;
241    }
242
243    fn emit_line(&mut self) -> StyledLine {
244        let cells = std::mem::take(&mut self.line_cells);
245        self.cursor = 0;
246        cells_to_styled_line(&cells)
247    }
248
249    fn apply_sgr(&mut self, param_bytes: &[u8]) {
250        let param_str = std::str::from_utf8(param_bytes).unwrap_or("");
251
252        let nums: Vec<u32> = if param_str.is_empty() {
253            // `\x1b[m` with no params is equivalent to reset.
254            vec![0]
255        } else {
256            param_str
257                .split(';')
258                .map(|s| s.parse::<u32>().unwrap_or(0))
259                .collect()
260        };
261
262        let mut idx = 0;
263        while idx < nums.len() {
264            match nums[idx] {
265                0 => self.current_style = Style::default(),
266                1 => self.current_style.bold = true,
267                3 => self.current_style.italic = true,
268                4 => self.current_style.underline = true,
269                22 => self.current_style.bold = false,
270                23 => self.current_style.italic = false,
271                24 => self.current_style.underline = false,
272                // Standard foreground colours (30–37) map to Indexed(0–7).
273                n @ 30..=37 => self.current_style.fg = Some(Color::Indexed((n - 30) as u8)),
274                39 => self.current_style.fg = None,
275                // Standard background colours (40–47) map to Indexed(0–7).
276                n @ 40..=47 => self.current_style.bg = Some(Color::Indexed((n - 40) as u8)),
277                49 => self.current_style.bg = None,
278                // Bright/high-intensity foreground (90–97) map to Indexed(8–15).
279                n @ 90..=97 => self.current_style.fg = Some(Color::Indexed((n - 90 + 8) as u8)),
280                // Bright background (100–107) map to Indexed(8–15).
281                n @ 100..=107 => self.current_style.bg = Some(Color::Indexed((n - 100 + 8) as u8)),
282                // Extended colour: 38 = fg, 48 = bg.
283                n @ (38 | 48) => {
284                    let is_fg = n == 38;
285                    if idx + 1 < nums.len() {
286                        match nums[idx + 1] {
287                            5 if idx + 2 < nums.len() => {
288                                // 256-colour: `38;5;n`
289                                let color = Color::Indexed(nums[idx + 2] as u8);
290                                if is_fg {
291                                    self.current_style.fg = Some(color);
292                                } else {
293                                    self.current_style.bg = Some(color);
294                                }
295                                idx += 2;
296                            }
297                            2 if idx + 4 < nums.len() => {
298                                // Truecolour: `38;2;r;g;b`
299                                let color = Color::Rgb(
300                                    nums[idx + 2] as u8,
301                                    nums[idx + 3] as u8,
302                                    nums[idx + 4] as u8,
303                                );
304                                if is_fg {
305                                    self.current_style.fg = Some(color);
306                                } else {
307                                    self.current_style.bg = Some(color);
308                                }
309                                idx += 4;
310                            }
311                            _ => {}
312                        }
313                    }
314                }
315                _ => {} // Unknown parameter — ignore.
316            }
317            idx += 1;
318        }
319    }
320}
321
322impl Default for TerminalParser {
323    fn default() -> Self {
324        Self::new()
325    }
326}
327
328// ---------------------------------------------------------------------------
329// Helpers
330// ---------------------------------------------------------------------------
331
/// Returns the expected byte length of a UTF-8 character given its first byte.
///
/// Only bytes that can legally start a multi-byte sequence report a length
/// greater than one: `0xC2..=0xDF` (2 bytes), `0xE0..=0xEF` (3 bytes) and
/// `0xF0..=0xF4` (4 bytes).  Everything else — ASCII, continuation bytes and
/// the never-valid lead bytes `0xC0`, `0xC1` and `0xF5..=0xFF` — reports 1,
/// so a caller consumes exactly that one byte and cannot swallow (or wait
/// for) bytes that do not belong to it.
fn utf8_char_len(first_byte: u8) -> usize {
    match first_byte {
        0xC2..=0xDF => 2,
        0xE0..=0xEF => 3,
        0xF0..=0xF4 => 4,
        // ASCII, continuation byte or invalid lead — a single byte.
        _ => 1,
    }
}
343
344/// Convert a slice of `(char, Style)` cells into a [`StyledLine`].
345///
346/// Consecutive cells with the same non-default style are merged into a single
347/// [`StyleSpan`].  Cells with default style produce no span entry.
348fn cells_to_styled_line(cells: &[(char, Style)]) -> StyledLine {
349    let text: String = cells.iter().map(|(c, _)| *c).collect();
350    let mut spans: Vec<StyleSpan> = Vec::new();
351
352    if cells.is_empty() {
353        return StyledLine { text, spans };
354    }
355
356    let mut byte_pos = 0usize;
357    let mut span_start_byte = 0usize;
358    let mut current_style = cells[0].1.clone();
359
360    for (i, (ch, style)) in cells.iter().enumerate() {
361        let ch_bytes = ch.len_utf8();
362        byte_pos += ch_bytes;
363
364        // Close the current span when the style changes or we reach the end.
365        let style_ends = cells.get(i + 1).is_none_or(|(_, next)| next != style);
366        if style_ends {
367            if !current_style.is_default() {
368                spans.push(StyleSpan {
369                    start: span_start_byte,
370                    end: byte_pos,
371                    style: current_style.clone(),
372                });
373            }
374            span_start_byte = byte_pos;
375            if let Some((_, next_style)) = cells.get(i + 1) {
376                current_style = next_style.clone();
377            }
378        }
379    }
380
381    StyledLine { text, spans }
382}
383
384// ---------------------------------------------------------------------------
385// Tests
386// ---------------------------------------------------------------------------
387
#[cfg(test)]
mod tests {
    use super::*;

    /// Feed `input` as a single chunk and return the completed lines
    /// (no final flush).
    fn parse_all(input: &[u8]) -> Vec<StyledLine> {
        let mut p = TerminalParser::new();
        p.push(input)
    }

    /// Feed bytes in multiple chunks and collect all lines including a final flush.
    fn parse_chunks(chunks: &[&[u8]]) -> Vec<StyledLine> {
        let mut p = TerminalParser::new();
        let mut lines = Vec::new();
        for chunk in chunks {
            lines.extend(p.push(chunk));
        }
        if let Some(line) = p.flush() {
            lines.push(line);
        }
        lines
    }

    // 1. Plain text with no escapes → single span-free line.
    #[test]
    fn plain_text_single_line() {
        let lines = parse_all(b"hello\n");
        assert_eq!(lines.len(), 1);
        assert_eq!(lines[0].text, "hello");
        assert!(lines[0].spans.is_empty(), "plain text should have no spans");
    }

    // 2. Multiple `\n` characters split into separate lines.
    #[test]
    fn newline_splits_lines() {
        let lines = parse_all(b"foo\nbar\nbaz\n");
        assert_eq!(lines.len(), 3);
        assert_eq!(lines[0].text, "foo");
        assert_eq!(lines[1].text, "bar");
        assert_eq!(lines[2].text, "baz");
    }

    // 3. `\x1b[31m` → red foreground span, reset by `\x1b[0m`.
    #[test]
    fn red_color_span() {
        // SGR 31 → standard red → Indexed(1).
        let lines = parse_all(b"\x1b[31merror\x1b[0m\n");
        assert_eq!(lines.len(), 1);
        assert_eq!(lines[0].text, "error");
        assert_eq!(lines[0].spans.len(), 1);
        let span = &lines[0].spans[0];
        assert_eq!(span.start, 0);
        assert_eq!(span.end, 5);
        assert_eq!(span.style.fg, Some(Color::Indexed(1)));
    }

    // 4. Escape sequence split across two chunks.
    #[test]
    fn escape_split_across_chunks() {
        // `\x1b[31m` really is split: `\x1b[31` ends chunk 1 and the final
        // `m` opens chunk 2, so the parser must carry its CSI state over.
        let lines = parse_chunks(&[b"\x1b[31", b"mErr", b"or\x1b[0m\n"]);
        assert_eq!(lines.len(), 1);
        assert_eq!(lines[0].text, "Error");
        assert_eq!(lines[0].spans.len(), 1);
        assert_eq!(lines[0].spans[0].style.fg, Some(Color::Indexed(1)));
        assert_eq!(lines[0].spans[0].start, 0);
        assert_eq!(lines[0].spans[0].end, 5);
    }

    // 5. `\r` moves cursor to column 0; subsequent chars overwrite existing content.
    //    CRLF (`\r\n`) — common on Windows — should produce the line content
    //    that was written before the `\r`.
    #[test]
    fn crlf_line_ending() {
        let lines = parse_all(b"hello\r\n");
        assert_eq!(lines.len(), 1);
        assert_eq!(lines[0].text, "hello");
    }

    // 6. `\r` overwrite — new content shorter than old leaves a suffix.
    //    "ABCDE\rXY\n" → cursor resets to 0, X and Y overwrite A and B,
    //    C D E remain → "XYCDE".
    #[test]
    fn carriage_return_partial_overwrite() {
        let lines = parse_all(b"ABCDE\rXY\n");
        assert_eq!(lines.len(), 1);
        assert_eq!(lines[0].text, "XYCDE");
    }

    // 7. Multiple style spans in the same line.
    #[test]
    fn multiple_spans_same_line() {
        let lines = parse_all(b"\x1b[31mred\x1b[32mgreen\x1b[0m\n");
        assert_eq!(lines.len(), 1);
        assert_eq!(lines[0].text, "redgreen");
        assert_eq!(lines[0].spans.len(), 2);
        assert_eq!(lines[0].spans[0].style.fg, Some(Color::Indexed(1))); // red
        assert_eq!(lines[0].spans[0].start, 0);
        assert_eq!(lines[0].spans[0].end, 3);
        assert_eq!(lines[0].spans[1].style.fg, Some(Color::Indexed(2))); // green
        assert_eq!(lines[0].spans[1].start, 3);
        assert_eq!(lines[0].spans[1].end, 8);
    }

    // 8. Style does not leak from one line to the next after reset.
    #[test]
    fn style_no_leak_across_lines() {
        let lines = parse_all(b"\x1b[31mred\x1b[0m\nnormal\n");
        assert_eq!(lines.len(), 2);
        assert_eq!(lines[0].text, "red");
        assert!(!lines[0].spans.is_empty());
        assert_eq!(lines[1].text, "normal");
        assert!(lines[1].spans.is_empty(), "second line should be unstyled");
    }

    // 9. Bold and colour combined via `\x1b[1;31m`.
    #[test]
    fn bold_and_color_combined() {
        let lines = parse_all(b"\x1b[1;31mbold red\x1b[0m\n");
        assert_eq!(lines.len(), 1);
        assert_eq!(lines[0].spans.len(), 1);
        let s = &lines[0].spans[0].style;
        assert!(s.bold);
        assert_eq!(s.fg, Some(Color::Indexed(1)));
    }

    // 10. `flush()` returns the partial line when there is no trailing `\n`.
    #[test]
    fn flush_returns_partial_line() {
        let mut p = TerminalParser::new();
        let completed = p.push(b"partial");
        assert!(completed.is_empty(), "no newline → no completed lines yet");
        let line = p.flush().expect("flush should return the partial line");
        assert_eq!(line.text, "partial");
    }

    // 11. 256-colour foreground via `38;5;n`.
    #[test]
    fn color_256() {
        let lines = parse_all(b"\x1b[38;5;200mtext\x1b[0m\n");
        assert_eq!(lines[0].spans[0].style.fg, Some(Color::Indexed(200)));
    }

    // 12. Truecolour foreground via `38;2;r;g;b`.
    #[test]
    fn truecolor() {
        let lines = parse_all(b"\x1b[38;2;255;0;128mtext\x1b[0m\n");
        assert_eq!(lines[0].spans[0].style.fg, Some(Color::Rgb(255, 0, 128)));
    }

    // 13. Malformed / unknown sequence — plain text fallback, no panic.
    #[test]
    fn malformed_escape_sequence() {
        // `\x1b[999X` is an unknown sequence (terminator ≠ 'm') → discarded.
        let lines = parse_all(b"a\x1b[999Xb\n");
        assert_eq!(lines.len(), 1);
        assert_eq!(lines[0].text, "ab");
        assert!(lines[0].spans.is_empty());
    }

    // 14. Empty input → empty output and no panic.
    #[test]
    fn empty_input() {
        let mut p = TerminalParser::new();
        assert!(p.push(b"").is_empty());
        assert!(p.flush().is_none());
    }

    // 15. `\n` alone produces an empty line.
    #[test]
    fn newline_only_creates_empty_line() {
        let lines = parse_all(b"\n");
        assert_eq!(lines.len(), 1);
        assert_eq!(lines[0].text, "");
        assert!(lines[0].spans.is_empty());
    }

    // 16. Multi-byte UTF-8 characters are decoded correctly.
    #[test]
    fn utf8_multibyte() {
        let lines = parse_all("héllo\n".as_bytes());
        assert_eq!(lines.len(), 1);
        assert_eq!(lines[0].text, "héllo");
    }

    // 17. Multi-byte UTF-8 sequence split across two chunks.
    #[test]
    fn utf8_split_across_chunks() {
        // 'é' = 0xC3 0xA9; split so that 0xC3 is in chunk 1 and 0xA9 in chunk 2.
        let e_bytes = "é".as_bytes();
        let chunk1: &[u8] = &[b'h', e_bytes[0]];
        let chunk2: &[u8] = &[e_bytes[1], b'\n'];
        let lines = parse_chunks(&[chunk1, chunk2]);
        assert_eq!(lines.len(), 1);
        assert_eq!(lines[0].text, "hé");
    }

    // 18. Bare reset `\x1b[m` (no params) resets all styles.
    #[test]
    fn bare_reset() {
        let lines = parse_all(b"\x1b[31mred\x1b[mnormal\n");
        assert_eq!(lines[0].text, "rednormal");
        assert_eq!(lines[0].spans.len(), 1, "only 'red' should be styled");
        assert_eq!(lines[0].spans[0].end, 3); // "red" = bytes 0..3
    }

    // 19. 256-colour background via `48;5;n`.
    #[test]
    fn color_256_background() {
        let lines = parse_all(b"\x1b[48;5;100mtext\x1b[0m\n");
        assert_eq!(lines[0].spans[0].style.bg, Some(Color::Indexed(100)));
    }

    // 20. Bright foreground colours (90–97) map to Indexed(8–15).
    #[test]
    fn bright_foreground_colors() {
        let lines = parse_all(b"\x1b[91mbright red\x1b[0m\n");
        // SGR 91 → 91 - 90 + 8 = 9 → Indexed(9)
        assert_eq!(lines[0].spans[0].style.fg, Some(Color::Indexed(9)));
    }
}