Skip to main content

vtcode_commons/
ansi.rs

//! ANSI escape sequence parser and utilities.
2
3use memchr::memchr;
4
// --- C0 (7-bit) control bytes -------------------------------------------

/// Bell; accepted by `parse_osc` as an OSC terminator.
const BEL: u8 = 0x07;
/// Cancel; aborts a control sequence that is in progress.
const CAN: u8 = 0x18;
/// Substitute; aborts a control sequence that is in progress.
const SUB: u8 = 0x1a;
/// Escape; introduces every 7-bit control sequence.
const ESC: u8 = 0x1b;
/// Delete; dropped from stripped output like other non-printing bytes.
const DEL: u8 = 0x7f;

// --- C1 (8-bit, single-byte) control bytes ------------------------------

/// Device Control String introducer.
const C1_DCS: u8 = 0x90;
/// Start Of String introducer.
const C1_SOS: u8 = 0x98;
/// Control Sequence Introducer.
const C1_CSI: u8 = 0x9b;
/// String Terminator.
const C1_ST: u8 = 0x9c;
/// Operating System Command introducer.
const C1_OSC: u8 = 0x9d;
/// Privacy Message introducer.
const C1_PM: u8 = 0x9e;
/// Application Program Command introducer.
const C1_APC: u8 = 0x9f;

// --- Scan limits for malformed or hostile input -------------------------

/// Maximum number of payload bytes scanned in a string-type sequence
/// (OSC, DCS, SOS, PM, APC) before the scan is cut off.
const MAX_STRING_SEQUENCE_BYTES: usize = 4096;
/// Maximum number of bytes scanned after a CSI introducer before the scan is cut off.
const MAX_CSI_SEQUENCE_BYTES: usize = 64;
19
/// Reports whether the byte at `start` is a raw (single-byte) C1 control.
///
/// Returns `Some((byte, 1))` when `bytes[start]` lies in `0x80..=0x9f`; the
/// second tuple field is the encoded length, which is always one byte.
/// Returns `None` when `start` is out of range or the byte is not a C1 control.
#[inline]
fn parse_c1_at(bytes: &[u8], start: usize) -> Option<(u8, usize)> {
    bytes
        .get(start)
        .copied()
        .filter(|candidate| matches!(candidate, 0x80..=0x9f))
        .map(|control| (control, 1))
}
28
29#[inline]
30fn parse_csi(bytes: &[u8], start: usize) -> Option<usize> {
31    // ECMA-48 / ISO 6429 CSI grammar:
32    // - parameter bytes: 0x30..0x3F
33    // - intermediate bytes: 0x20..0x2F
34    // - final byte: 0x40..0x7E
35    // (See ANSI escape code article on Wikipedia, CSI section.)
36    let mut index = start;
37    let mut phase = 0u8; // 0=parameter, 1=intermediate
38    let mut consumed = 0usize;
39
40    while index < bytes.len() {
41        let byte = bytes[index];
42        if byte == ESC {
43            // VT100: ESC aborts current control sequence and starts a new one.
44            return Some(index);
45        }
46        if byte == CAN || byte == SUB {
47            // VT100: CAN/SUB abort current control sequence.
48            return Some(index + 1);
49        }
50
51        consumed += 1;
52        if consumed > MAX_CSI_SEQUENCE_BYTES {
53            // Bound malformed or hostile input.
54            return Some(index + 1);
55        }
56
57        if phase == 0 && (0x30..=0x3f).contains(&byte) {
58            index += 1;
59            continue;
60        }
61        if (0x20..=0x2f).contains(&byte) {
62            phase = 1;
63            index += 1;
64            continue;
65        }
66        if (0x40..=0x7e).contains(&byte) {
67            return Some(index + 1);
68        }
69
70        // Invalid CSI byte: abort sequence without consuming this byte.
71        return Some(index);
72    }
73
74    None
75}
76
77#[inline]
78fn parse_osc(bytes: &[u8], start: usize) -> Option<usize> {
79    let mut consumed = 0usize;
80    for index in start..bytes.len() {
81        if bytes[index] == ESC && !(index + 1 < bytes.len() && bytes[index + 1] == b'\\') {
82            // VT100: ESC aborts current sequence and begins a new one.
83            return Some(index);
84        }
85        if bytes[index] == CAN || bytes[index] == SUB {
86            return Some(index + 1);
87        }
88
89        if let Some((c1, len)) = parse_c1_at(bytes, index)
90            && c1 == C1_ST
91        {
92            return Some(index + len);
93        }
94
95        match bytes[index] {
96            BEL | C1_ST => return Some(index + 1),
97            ESC if index + 1 < bytes.len() && bytes[index + 1] == b'\\' => return Some(index + 2),
98            _ => {}
99        }
100
101        consumed += 1;
102        if consumed > MAX_STRING_SEQUENCE_BYTES {
103            // Cap unbounded strings when terminator is missing.
104            return Some(index + 1);
105        }
106    }
107    None
108}
109
110#[inline]
111fn parse_st_terminated(bytes: &[u8], start: usize) -> Option<usize> {
112    let mut consumed = 0usize;
113    for index in start..bytes.len() {
114        if bytes[index] == ESC && !(index + 1 < bytes.len() && bytes[index + 1] == b'\\') {
115            return Some(index);
116        }
117        if bytes[index] == CAN || bytes[index] == SUB {
118            return Some(index + 1);
119        }
120
121        if let Some((c1, len)) = parse_c1_at(bytes, index)
122            && c1 == C1_ST
123        {
124            return Some(index + len);
125        }
126
127        match bytes[index] {
128            C1_ST => return Some(index + 1),
129            ESC if index + 1 < bytes.len() && bytes[index + 1] == b'\\' => return Some(index + 2),
130            _ => {}
131        }
132
133        consumed += 1;
134        if consumed > MAX_STRING_SEQUENCE_BYTES {
135            return Some(index + 1);
136        }
137    }
138    None
139}
140
141#[inline]
142fn parse_ansi_sequence_bytes(bytes: &[u8]) -> Option<usize> {
143    if bytes.is_empty() {
144        return None;
145    }
146
147    if let Some((c1, c1_len)) = parse_c1_at(bytes, 0) {
148        return match c1 {
149            C1_CSI => parse_csi(bytes, c1_len),
150            C1_OSC => parse_osc(bytes, c1_len),
151            C1_DCS | C1_SOS | C1_PM | C1_APC => parse_st_terminated(bytes, c1_len),
152            _ => Some(c1_len),
153        };
154    }
155
156    match bytes[0] {
157        ESC => {
158            if bytes.len() < 2 {
159                return None;
160            }
161
162            match bytes[1] {
163                b'[' => parse_csi(bytes, 2),
164                b']' => parse_osc(bytes, 2),
165                b'P' | b'^' | b'_' | b'X' => parse_st_terminated(bytes, 2),
166                next if next < 128 => Some(2),
167                _ => Some(1),
168            }
169        }
170        _ => None,
171    }
172}
173
174/// Strip ANSI escape codes from text, keeping only plain text
175pub fn strip_ansi(text: &str) -> String {
176    let mut output = Vec::with_capacity(text.len());
177    let bytes = text.as_bytes();
178    let mut i = 0;
179
180    while i < bytes.len() {
181        let next_esc = memchr(ESC, &bytes[i..]).map_or(bytes.len(), |offset| i + offset);
182        while i < next_esc {
183            if bytes[i] == b'\n' || bytes[i] == b'\r' || bytes[i] == b'\t' {
184                output.push(bytes[i]);
185                i += 1;
186            } else if bytes[i] < 32 || bytes[i] == DEL {
187                i += 1;
188            } else {
189                output.push(bytes[i]);
190                i += 1;
191            }
192        }
193
194        if i >= bytes.len() {
195            break;
196        }
197
198        if let Some(len) = parse_ansi_sequence_bytes(&bytes[i..]) {
199            i += len;
200            continue;
201        } else {
202            // Incomplete/unterminated control sequence at end of available text.
203            break;
204        }
205    }
206
207    String::from_utf8_lossy(&output).into_owned()
208}
209
210/// Strip ANSI escape codes from arbitrary bytes, preserving non-control bytes.
211///
212/// This is the preferred API when input may contain raw C1 (8-bit) controls.
213pub fn strip_ansi_bytes(input: &[u8]) -> Vec<u8> {
214    let mut output = Vec::with_capacity(input.len());
215    let bytes = input;
216    let mut i = 0;
217
218    while i < bytes.len() {
219        if (bytes[i] == ESC || parse_c1_at(bytes, i).is_some())
220            && let Some(len) = parse_ansi_sequence_bytes(&bytes[i..])
221        {
222            i += len;
223            continue;
224        }
225        if bytes[i] == ESC || parse_c1_at(bytes, i).is_some() {
226            // Incomplete/unterminated control sequence at end of available text.
227            break;
228        }
229
230        if bytes[i] == b'\n' || bytes[i] == b'\r' || bytes[i] == b'\t' {
231            output.push(bytes[i]);
232            i += 1;
233        } else if bytes[i] < 32 || bytes[i] == DEL {
234            i += 1;
235        } else {
236            output.push(bytes[i]);
237            i += 1;
238        }
239    }
240    output
241}
242
243/// Parse and determine the length of the ANSI escape sequence at the start of text
244pub fn parse_ansi_sequence(text: &str) -> Option<usize> {
245    let bytes = text.as_bytes();
246    parse_ansi_sequence_bytes(bytes)
247}
248
249/// Fast ASCII-only ANSI stripping for performance-critical paths
250pub fn strip_ansi_ascii_only(text: &str) -> String {
251    let mut output = String::with_capacity(text.len());
252    let bytes = text.as_bytes();
253    let mut search_start = 0;
254    let mut copy_start = 0;
255
256    while let Some(offset) = memchr(ESC, &bytes[search_start..]) {
257        let esc_index = search_start + offset;
258        if let Some(len) = parse_ansi_sequence_bytes(&bytes[esc_index..]) {
259            if copy_start < esc_index {
260                output.push_str(&text[copy_start..esc_index]);
261            }
262            copy_start = esc_index + len;
263            search_start = copy_start;
264        } else {
265            search_start = esc_index + 1;
266        }
267    }
268
269    if copy_start < text.len() {
270        output.push_str(&text[copy_start..]);
271    }
272
273    output
274}
275
/// Reports whether `text` contains any non-ASCII content, i.e. at least one
/// byte with the high bit set.
pub fn contains_unicode(text: &str) -> bool {
    !text.is_ascii()
}
280
#[cfg(test)]
mod tests {
    use super::{CAN, SUB, strip_ansi, strip_ansi_ascii_only, strip_ansi_bytes};

    /// Basic SGR color sequences are removed by both string-based strippers.
    #[test]
    fn strips_esc_csi_sequences() {
        let colored = "a\x1b[31mred\x1b[0mz";
        let strippers: [fn(&str) -> String; 2] = [strip_ansi, strip_ansi_ascii_only];
        for strip in strippers {
            assert_eq!(strip(colored), "aredz");
        }
    }

    /// XTerm/ECMA-48: controls are processed once; decoded UTF-8 text is not
    /// reprocessed as C1.
    #[test]
    fn utf8_encoded_c1_is_not_reprocessed_as_control() {
        let text = "a\u{009b}31mred";
        let stripped = strip_ansi(text);
        assert_eq!(stripped, text);
    }

    /// DEL is dropped like other non-printing bytes.
    #[test]
    fn strip_removes_ascii_del_control() {
        let del = char::from(0x7f_u8);
        let text = format!("a{}b", del);
        assert_eq!(strip_ansi(&text), "ab");
    }

    /// An ESC inside a CSI abandons it; the sequence that follows is parsed normally.
    #[test]
    fn csi_aborts_on_esc_then_new_sequence_parses() {
        let stripped = strip_ansi("a\x1b[31\x1b[32mgreen\x1b[0mz");
        assert_eq!(stripped, "agreenz");
    }

    /// CAN and SUB both abort a CSI and are swallowed with it.
    #[test]
    fn csi_aborts_on_can_and_sub() {
        for abort_byte in [CAN, SUB] {
            let text = format!("a\x1b[31{}b", char::from(abort_byte));
            assert_eq!(strip_ansi(&text), "ab");
        }
    }

    /// An ESC that is not part of `ESC \` ends an OSC and starts a new sequence.
    #[test]
    fn osc_aborts_on_esc_non_st() {
        let stripped = strip_ansi("a\x1b]title\x1b[31mred\x1b[0mz");
        assert_eq!(stripped, "aredz");
    }

    /// `strip_ansi` discards an unterminated sequence at the end of the text.
    #[test]
    fn incomplete_sequence_drops_tail() {
        let truncated = "text\x1b[31";
        assert_eq!(strip_ansi(truncated), "text");
    }

    /// `strip_ansi_ascii_only` leaves an unterminated trailing sequence untouched.
    #[test]
    fn ascii_only_incomplete_sequence_keeps_tail() {
        let truncated = "text\x1b[31";
        assert_eq!(strip_ansi_ascii_only(truncated), truncated);
    }

    /// Common pattern for dynamic CLI updates: carriage return + erase line +
    /// redraw text.
    #[test]
    fn strips_common_progress_redraw_sequences() {
        let redraw = "\r\x1b[2KProgress 10%\r\x1b[2KDone\n";
        let stripped = strip_ansi(redraw);
        assert_eq!(stripped, "\rProgress 10%\rDone\n");
    }

    /// Cursor movement sequences vanish; newlines are kept.
    #[test]
    fn strips_cursor_navigation_sequences() {
        let stripped = strip_ansi("left\x1b[1D!\nup\x1b[1Arow");
        assert_eq!(stripped, "left!\nuprow");
    }

    /// The byte API understands the single-byte CSI introducer (0x9b).
    #[test]
    fn strip_ansi_bytes_supports_raw_c1_csi() {
        let raw = b"a\x9b31mred\x9b0mz";
        let stripped = strip_ansi_bytes(raw);
        assert_eq!(stripped, b"aredz");
    }

    /// The byte API understands single-byte OSC (0x9d) terminated by ST (0x9c).
    #[test]
    fn strip_ansi_bytes_supports_raw_c1_osc_and_st() {
        let raw = b"pre\x9d8;;https://example.com\x9clink\x9d8;;\x9cpost";
        let stripped = strip_ansi_bytes(raw);
        assert_eq!(stripped, b"prelinkpost");
    }

    /// Parameter bytes ("1;2"), intermediate bytes (" "), then final ("m").
    #[test]
    fn csi_respects_parameter_intermediate_final_grammar() {
        let stripped = strip_ansi("a\x1b[1;2 mred\x1b[0mz");
        assert_eq!(stripped, "aredz");
    }

    /// 0x10 is not a valid CSI parameter/intermediate/final byte, so the CSI
    /// is abandoned there and the text after it survives.
    #[test]
    fn malformed_csi_does_not_consume_following_text() {
        let invalid = char::from(0x10_u8);
        let text = format!("a\x1b[12{}visible", invalid);
        assert_eq!(strip_ansi(&text), "avisible");
    }

    /// SGR 256-color selection (`38;5;N`).
    #[test]
    fn strips_wikipedia_sgr_8bit_color_pattern() {
        let stripped = strip_ansi("x\x1b[38;5;196mred\x1b[0my");
        assert_eq!(stripped, "xredy");
    }

    /// SGR truecolor selection (`48;2;R;G;B`).
    #[test]
    fn strips_wikipedia_sgr_truecolor_pattern() {
        let stripped = strip_ansi("x\x1b[48;2;12;34;56mblock\x1b[0my");
        assert_eq!(stripped, "xblocky");
    }

    /// OSC 8 hyperlinks terminated by the 7-bit ST (`ESC \`).
    #[test]
    fn strips_wikipedia_osc8_hyperlink_pattern() {
        let linked = "go \x1b]8;;https://example.com\x1b\\here\x1b]8;;\x1b\\ now";
        assert_eq!(strip_ansi(linked), "go here now");
    }

    /// DEC private mode sequences (`?` parameter prefix) are ordinary CSI.
    #[test]
    fn strips_dec_private_mode_csi() {
        let stripped = strip_ansi("a\x1b[?25lb\x1b[?25hc");
        assert_eq!(stripped, "abc");
    }
}