vtcode_commons/
ansi.rs

//! Shared ANSI escape parser and stripping utilities for VT Code.
//!
//! See `docs/reference/ansi-in-vtcode.md` for the workspace usage map.
4
5use crate::ansi_codes::{BEL_BYTE, ESC_BYTE};
6use memchr::memchr;
7
8const ESC: u8 = ESC_BYTE;
9const BEL: u8 = BEL_BYTE;
10const DEL: u8 = 0x7f;
11const C1_ST: u8 = 0x9c;
12const C1_DCS: u8 = 0x90;
13const C1_SOS: u8 = 0x98;
14const C1_CSI: u8 = 0x9b;
15const C1_OSC: u8 = 0x9d;
16const C1_PM: u8 = 0x9e;
17const C1_APC: u8 = 0x9f;
18const CAN: u8 = 0x18;
19const SUB: u8 = 0x1a;
20const MAX_STRING_SEQUENCE_BYTES: usize = 4096;
21const MAX_CSI_SEQUENCE_BYTES: usize = 64;
22
/// Which bytes are allowed to terminate a string-type control sequence.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum StringSequenceTerminator {
    /// Only ST (`ESC \` or the raw C1 byte `0x9c`) ends the string.
    StOnly,
    /// Either BEL or ST ends the string (used for OSC).
    BelOrSt,
}

impl StringSequenceTerminator {
    /// Returns `true` when a BEL byte terminates the sequence.
    #[inline]
    const fn allows_bel(self) -> bool {
        // Exhaustive on purpose: adding a variant must force a decision here.
        match self {
            Self::StOnly => false,
            Self::BelOrSt => true,
        }
    }
}
35
/// Returns the byte at `start` paired with its length (always `1`) when that
/// byte is a raw 8-bit C1 control (`0x80..=0x9f`).
///
/// Yields `None` when `start` is out of bounds or the byte is outside the C1
/// range.
#[inline]
fn parse_c1_at(bytes: &[u8], start: usize) -> Option<(u8, usize)> {
    bytes
        .get(start)
        .copied()
        .filter(|candidate| matches!(candidate, 0x80..=0x9f))
        .map(|c1| (c1, 1))
}
44
45#[inline]
46fn parse_csi(bytes: &[u8], start: usize) -> Option<usize> {
47    // ECMA-48 / ISO 6429 CSI grammar:
48    // - parameter bytes: 0x30..0x3F
49    // - intermediate bytes: 0x20..0x2F
50    // - final byte: 0x40..0x7E
51    // (See ANSI escape code article on Wikipedia, CSI section.)
52    let mut index = start;
53    let mut phase = 0u8; // 0=parameter, 1=intermediate
54    let mut consumed = 0usize;
55
56    while index < bytes.len() {
57        let byte = bytes[index];
58        if byte == ESC {
59            // VT100: ESC aborts current control sequence and starts a new one.
60            return Some(index);
61        }
62        if byte == CAN || byte == SUB {
63            // VT100: CAN/SUB abort current control sequence.
64            return Some(index + 1);
65        }
66
67        consumed += 1;
68        if consumed > MAX_CSI_SEQUENCE_BYTES {
69            // Bound malformed or hostile input.
70            return Some(index + 1);
71        }
72
73        if phase == 0 && (0x30..=0x3f).contains(&byte) {
74            index += 1;
75            continue;
76        }
77        if (0x20..=0x2f).contains(&byte) {
78            phase = 1;
79            index += 1;
80            continue;
81        }
82        if (0x40..=0x7e).contains(&byte) {
83            return Some(index + 1);
84        }
85
86        // Invalid CSI byte: abort sequence without consuming this byte.
87        return Some(index);
88    }
89
90    None
91}
92
93#[inline]
94fn parse_string_sequence(
95    bytes: &[u8],
96    start: usize,
97    terminator: StringSequenceTerminator,
98) -> Option<usize> {
99    let mut consumed = 0usize;
100    for index in start..bytes.len() {
101        if bytes[index] == ESC && !(index + 1 < bytes.len() && bytes[index + 1] == b'\\') {
102            // VT100: ESC aborts current sequence and begins a new one.
103            return Some(index);
104        }
105        if bytes[index] == CAN || bytes[index] == SUB {
106            return Some(index + 1);
107        }
108
109        if let Some((c1, len)) = parse_c1_at(bytes, index)
110            && c1 == C1_ST
111        {
112            return Some(index + len);
113        }
114
115        match bytes[index] {
116            BEL if terminator.allows_bel() => return Some(index + 1),
117            ESC if index + 1 < bytes.len() && bytes[index + 1] == b'\\' => return Some(index + 2),
118            _ => {}
119        }
120
121        consumed += 1;
122        if consumed > MAX_STRING_SEQUENCE_BYTES {
123            // Cap unbounded strings when terminator is missing.
124            return Some(index + 1);
125        }
126    }
127    None
128}
129
/// Appends `byte` to `output` unless it is a control byte.
///
/// Newline, carriage return and tab are kept; every other byte below `0x20`
/// and DEL are dropped. Bytes `0x80` and above are always kept.
#[inline]
fn push_visible_byte(output: &mut Vec<u8>, byte: u8) {
    let keep = match byte {
        b'\n' | b'\r' | b'\t' => true,
        DEL => false,
        control if control < 0x20 => false,
        _ => true,
    };
    if keep {
        output.push(byte);
    }
}
136
137#[inline]
138fn parse_ansi_sequence_bytes(bytes: &[u8]) -> Option<usize> {
139    if bytes.is_empty() {
140        return None;
141    }
142
143    if let Some((c1, c1_len)) = parse_c1_at(bytes, 0) {
144        return match c1 {
145            C1_CSI => parse_csi(bytes, c1_len),
146            C1_OSC => parse_string_sequence(bytes, c1_len, StringSequenceTerminator::BelOrSt),
147            C1_DCS | C1_SOS | C1_PM | C1_APC => {
148                parse_string_sequence(bytes, c1_len, StringSequenceTerminator::StOnly)
149            }
150            _ => Some(c1_len),
151        };
152    }
153
154    match bytes[0] {
155        ESC => {
156            if bytes.len() < 2 {
157                return None;
158            }
159
160            match bytes[1] {
161                b'[' => parse_csi(bytes, 2),
162                b']' => parse_string_sequence(bytes, 2, StringSequenceTerminator::BelOrSt),
163                b'P' | b'^' | b'_' | b'X' => {
164                    parse_string_sequence(bytes, 2, StringSequenceTerminator::StOnly)
165                }
166                // Three-byte sequences: ESC + intermediate + final
167                // ESC SP {F,G,L,M,N} — 7/8-bit controls, ANSI conformance
168                // ESC # {3,4,5,6,8} — DEC line attributes / screen alignment
169                // ESC % {@ ,G} — character set selection (ISO 2022)
170                // ESC ( C / ESC ) C / ESC * C / ESC + C — G0-G3 designation
171                b' ' | b'#' | b'%' | b'(' | b')' | b'*' | b'+' => {
172                    if bytes.len() > 2 {
173                        Some(3)
174                    } else {
175                        None
176                    }
177                }
178                next if next < 128 => Some(2),
179                _ => Some(1),
180            }
181        }
182        _ => None,
183    }
184}
185
186/// Strip ANSI escape codes from text, returning a borrowed `Cow` when the
187/// input contains no ESC byte.  This is the preferred API for call-sites that
188/// want to avoid allocation on the common "no ANSI codes" path.
189pub fn strip_ansi_codes(text: &str) -> std::borrow::Cow<'_, str> {
190    if !text.contains('\x1b') {
191        return std::borrow::Cow::Borrowed(text);
192    }
193    std::borrow::Cow::Owned(strip_ansi(text))
194}
195
196/// Strip ANSI escape codes from text, keeping only plain text
197pub fn strip_ansi(text: &str) -> String {
198    let mut output = Vec::with_capacity(text.len());
199    let bytes = text.as_bytes();
200    let mut i = 0;
201
202    while i < bytes.len() {
203        let next_esc = memchr(ESC, &bytes[i..]).map_or(bytes.len(), |offset| i + offset);
204        // Pre-slice to avoid bounds checks in the inner loop — the range
205        // i..next_esc is provably within bytes[..].
206        for &b in &bytes[i..next_esc] {
207            push_visible_byte(&mut output, b);
208        }
209        i = next_esc;
210
211        if i >= bytes.len() {
212            break;
213        }
214
215        if let Some(len) = parse_ansi_sequence_bytes(&bytes[i..]) {
216            i += len;
217            continue;
218        } else {
219            // Incomplete/unterminated control sequence at end of available text.
220            break;
221        }
222    }
223
224    String::from_utf8_lossy(&output).into_owned()
225}
226
227/// Strip ANSI escape codes from arbitrary bytes, preserving non-control bytes.
228///
229/// This is the preferred API when input may contain raw C1 (8-bit) controls.
230pub fn strip_ansi_bytes(input: &[u8]) -> Vec<u8> {
231    let mut output = Vec::with_capacity(input.len());
232    let bytes = input;
233    let mut i = 0;
234
235    while i < bytes.len() {
236        // Pre-slice to the remaining portion so all indexing below shares one bounds edge.
237        let rest = &bytes[i..];
238
239        if (rest[0] == ESC || parse_c1_at(bytes, i).is_some())
240            && let Some(len) = parse_ansi_sequence_bytes(rest)
241        {
242            i += len;
243            continue;
244        }
245        if rest[0] == ESC || parse_c1_at(bytes, i).is_some() {
246            // Incomplete/unterminated control sequence at end of available text.
247            break;
248        }
249
250        push_visible_byte(&mut output, rest[0]);
251        i += 1;
252    }
253    output
254}
255
256/// Parse and determine the length of the ANSI escape sequence at the start of text
257pub fn parse_ansi_sequence(text: &str) -> Option<usize> {
258    let bytes = text.as_bytes();
259    parse_ansi_sequence_bytes(bytes)
260}
261
262/// Fast ASCII-only ANSI stripping for performance-critical paths
263pub fn strip_ansi_ascii_only(text: &str) -> String {
264    let mut output = String::with_capacity(text.len());
265    let bytes = text.as_bytes();
266    let mut search_start = 0;
267    let mut copy_start = 0;
268
269    while let Some(offset) = memchr(ESC, &bytes[search_start..]) {
270        let esc_index = search_start + offset;
271        if let Some(len) = parse_ansi_sequence_bytes(&bytes[esc_index..]) {
272            if copy_start < esc_index {
273                output.push_str(&text[copy_start..esc_index]);
274            }
275            copy_start = esc_index + len;
276            search_start = copy_start;
277        } else {
278            search_start = esc_index + 1;
279        }
280    }
281
282    if copy_start < text.len() {
283        output.push_str(&text[copy_start..]);
284    }
285
286    output
287}
288
/// Reports whether `text` contains any non-ASCII character, i.e. any byte
/// with a value of `0x80` or above.
#[must_use]
pub fn contains_unicode(text: &str) -> bool {
    !text.is_ascii()
}
294
#[cfg(test)]
mod tests {
    use super::{CAN, SUB, strip_ansi, strip_ansi_ascii_only, strip_ansi_bytes};

    #[test]
    fn strips_esc_csi_sequences() {
        let colored = "a\x1b[31mred\x1b[0mz";
        assert_eq!(strip_ansi(colored), "aredz");
        assert_eq!(strip_ansi_ascii_only(colored), "aredz");
    }

    #[test]
    fn utf8_encoded_c1_is_not_reprocessed_as_control() {
        // XTerm/ECMA-48: controls are processed once; decoded UTF-8 text is not reprocessed as C1.
        let text = "a\u{009b}31mred";
        assert_eq!(strip_ansi(text), text);
    }

    #[test]
    fn strip_removes_ascii_del_control() {
        assert_eq!(strip_ansi("a\x7fb"), "ab");
    }

    #[test]
    fn csi_aborts_on_esc_then_new_sequence_parses() {
        let restarted = "a\x1b[31\x1b[32mgreen\x1b[0mz";
        assert_eq!(strip_ansi(restarted), "agreenz");
    }

    #[test]
    fn csi_aborts_on_can_and_sub() {
        for abort in [CAN, SUB] {
            let input = format!("a\x1b[31{}b", char::from(abort));
            assert_eq!(strip_ansi(&input), "ab");
        }
    }

    #[test]
    fn osc_aborts_on_esc_non_st() {
        let unterminated_osc = "a\x1b]title\x1b[31mred\x1b[0mz";
        assert_eq!(strip_ansi(unterminated_osc), "aredz");
    }

    #[test]
    fn incomplete_sequence_drops_tail() {
        assert_eq!(strip_ansi("text\x1b[31"), "text");
    }

    #[test]
    fn ascii_only_incomplete_sequence_keeps_tail() {
        let truncated = "text\x1b[31";
        assert_eq!(strip_ansi_ascii_only(truncated), truncated);
    }

    #[test]
    fn strips_common_progress_redraw_sequences() {
        // Common pattern for dynamic CLI updates:
        // carriage return + erase line + redraw text.
        let redraw = "\r\x1b[2KProgress 10%\r\x1b[2KDone\n";
        assert_eq!(strip_ansi(redraw), "\rProgress 10%\rDone\n");
    }

    #[test]
    fn strips_cursor_navigation_sequences() {
        let navigation = "left\x1b[1D!\nup\x1b[1Arow";
        assert_eq!(strip_ansi(navigation), "left!\nuprow");
    }

    #[test]
    fn strip_ansi_bytes_supports_raw_c1_csi() {
        // 0x9b is the raw 8-bit CSI introducer.
        let stripped = strip_ansi_bytes(b"a\x9b31mred\x9b0mz");
        assert_eq!(stripped, b"aredz");
    }

    #[test]
    fn strip_ansi_bytes_supports_raw_c1_osc_and_st() {
        // 0x9d opens an OSC, 0x9c (ST) closes it.
        let parts: [&[u8]; 9] = [
            b"pre",
            &[0x9d],
            b"8;;https://example.com",
            &[0x9c],
            b"link",
            &[0x9d],
            b"8;;",
            &[0x9c],
            b"post",
        ];
        let stripped = strip_ansi_bytes(&parts.concat());
        assert_eq!(stripped, b"prelinkpost");
    }

    #[test]
    fn csi_respects_parameter_intermediate_final_grammar() {
        // Parameter bytes ("1;2"), intermediate bytes (" "), then final ("m")
        let styled = "a\x1b[1;2 mred\x1b[0mz";
        assert_eq!(strip_ansi(styled), "aredz");
    }

    #[test]
    fn malformed_csi_does_not_consume_following_text() {
        // 0x10 is not valid CSI parameter/intermediate/final.
        assert_eq!(strip_ansi("a\x1b[12\x10visible"), "avisible");
    }

    #[test]
    fn strips_wikipedia_sgr_8bit_color_pattern() {
        let indexed_color = "x\x1b[38;5;196mred\x1b[0my";
        assert_eq!(strip_ansi(indexed_color), "xredy");
    }

    #[test]
    fn strips_wikipedia_sgr_truecolor_pattern() {
        let truecolor = "x\x1b[48;2;12;34;56mblock\x1b[0my";
        assert_eq!(strip_ansi(truecolor), "xblocky");
    }

    #[test]
    fn strips_wikipedia_osc8_hyperlink_pattern() {
        let hyperlink = "go \x1b]8;;https://example.com\x1b\\here\x1b]8;;\x1b\\ now";
        assert_eq!(strip_ansi(hyperlink), "go here now");
    }

    #[test]
    fn strips_dec_private_mode_csi() {
        let cursor_toggle = "a\x1b[?25lb\x1b[?25hc";
        assert_eq!(strip_ansi(cursor_toggle), "abc");
    }

    #[test]
    fn strips_three_byte_esc_sequences() {
        let cases = [
            // ESC # 8 = DEC screen alignment test
            "a\x1b#8b",
            // ESC ( B = designate US ASCII as G0
            "a\x1b(Bb",
            // ESC SP F = 7-bit controls
            "a\x1b Fb",
            // ESC % G = select UTF-8
            "a\x1b%Gb",
        ];
        for input in cases {
            assert_eq!(strip_ansi(input), "ab");
        }
    }

    #[test]
    fn incomplete_three_byte_esc_sequence_drops_tail() {
        // ESC # at end — incomplete, should not consume past end
        assert_eq!(strip_ansi("text\x1b#"), "text");
    }
}
vtcode_commons/ansi.rs

vtcode_commons/
ansi.rs