Skip to main content

vtcode_commons/
ansi.rs

//! ANSI escape sequence parser and utilities.
2
// C0 (7-bit) control characters.

/// ESC: introduces every 7-bit escape sequence (`ESC [`, `ESC ]`, ...).
const ESC: u8 = 0x1b;
/// BEL: accepted as a terminator for OSC strings.
const BEL: u8 = 0x07;
/// CAN: aborts a control sequence that is in progress.
const CAN: u8 = 0x18;
/// SUB: aborts a control sequence that is in progress.
const SUB: u8 = 0x1a;
/// DEL: removed from stripped output together with the other non-printing controls.
const DEL: u8 = 0x7f;

// C1 (8-bit) control characters, in their raw single-byte encoding.

/// DCS: Device Control String introducer.
const C1_DCS: u8 = 0x90;
/// SOS: Start Of String introducer.
const C1_SOS: u8 = 0x98;
/// CSI: Control Sequence Introducer.
const C1_CSI: u8 = 0x9b;
/// ST: String Terminator.
const C1_ST: u8 = 0x9c;
/// OSC: Operating System Command introducer.
const C1_OSC: u8 = 0x9d;
/// PM: Privacy Message introducer.
const C1_PM: u8 = 0x9e;
/// APC: Application Program Command introducer.
const C1_APC: u8 = 0x9f;

// Scan limits that bound the work done on malformed or unterminated input.

/// Maximum number of payload bytes scanned inside an OSC/DCS/SOS/PM/APC string.
const MAX_STRING_SEQUENCE_BYTES: usize = 4 * 1024;
/// Maximum number of bytes scanned after a CSI introducer.
const MAX_CSI_SEQUENCE_BYTES: usize = 64;
17
/// Returns the raw 8-bit C1 control byte found at `start`, if there is one.
///
/// A byte counts as C1 when it lies in `0x80..=0x9f`. On success the result is
/// `(byte, 1)`: the control byte itself and the number of input bytes it
/// occupies (always one here). `None` is returned when `start` is out of
/// bounds or the byte at `start` is outside the C1 range.
#[inline]
fn parse_c1_at(bytes: &[u8], start: usize) -> Option<(u8, usize)> {
    bytes
        .get(start)
        .copied()
        .filter(|candidate| (0x80..=0x9f).contains(candidate))
        .map(|control| (control, 1))
}
26
27#[inline]
28fn parse_csi(bytes: &[u8], start: usize) -> Option<usize> {
29    // ECMA-48 / ISO 6429 CSI grammar:
30    // - parameter bytes: 0x30..0x3F
31    // - intermediate bytes: 0x20..0x2F
32    // - final byte: 0x40..0x7E
33    // (See ANSI escape code article on Wikipedia, CSI section.)
34    let mut index = start;
35    let mut phase = 0u8; // 0=parameter, 1=intermediate
36    let mut consumed = 0usize;
37
38    while index < bytes.len() {
39        let byte = bytes[index];
40        if byte == ESC {
41            // VT100: ESC aborts current control sequence and starts a new one.
42            return Some(index);
43        }
44        if byte == CAN || byte == SUB {
45            // VT100: CAN/SUB abort current control sequence.
46            return Some(index + 1);
47        }
48
49        consumed += 1;
50        if consumed > MAX_CSI_SEQUENCE_BYTES {
51            // Bound malformed or hostile input.
52            return Some(index + 1);
53        }
54
55        if phase == 0 && (0x30..=0x3f).contains(&byte) {
56            index += 1;
57            continue;
58        }
59        if (0x20..=0x2f).contains(&byte) {
60            phase = 1;
61            index += 1;
62            continue;
63        }
64        if (0x40..=0x7e).contains(&byte) {
65            return Some(index + 1);
66        }
67
68        // Invalid CSI byte: abort sequence without consuming this byte.
69        return Some(index);
70    }
71
72    None
73}
74
75#[inline]
76fn parse_osc(bytes: &[u8], start: usize) -> Option<usize> {
77    let mut consumed = 0usize;
78    for index in start..bytes.len() {
79        if bytes[index] == ESC && !(index + 1 < bytes.len() && bytes[index + 1] == b'\\') {
80            // VT100: ESC aborts current sequence and begins a new one.
81            return Some(index);
82        }
83        if bytes[index] == CAN || bytes[index] == SUB {
84            return Some(index + 1);
85        }
86
87        if let Some((c1, len)) = parse_c1_at(bytes, index)
88            && c1 == C1_ST
89        {
90            return Some(index + len);
91        }
92
93        match bytes[index] {
94            BEL | C1_ST => return Some(index + 1),
95            ESC if index + 1 < bytes.len() && bytes[index + 1] == b'\\' => return Some(index + 2),
96            _ => {}
97        }
98
99        consumed += 1;
100        if consumed > MAX_STRING_SEQUENCE_BYTES {
101            // Cap unbounded strings when terminator is missing.
102            return Some(index + 1);
103        }
104    }
105    None
106}
107
108#[inline]
109fn parse_st_terminated(bytes: &[u8], start: usize) -> Option<usize> {
110    let mut consumed = 0usize;
111    for index in start..bytes.len() {
112        if bytes[index] == ESC && !(index + 1 < bytes.len() && bytes[index + 1] == b'\\') {
113            return Some(index);
114        }
115        if bytes[index] == CAN || bytes[index] == SUB {
116            return Some(index + 1);
117        }
118
119        if let Some((c1, len)) = parse_c1_at(bytes, index)
120            && c1 == C1_ST
121        {
122            return Some(index + len);
123        }
124
125        match bytes[index] {
126            C1_ST => return Some(index + 1),
127            ESC if index + 1 < bytes.len() && bytes[index + 1] == b'\\' => return Some(index + 2),
128            _ => {}
129        }
130
131        consumed += 1;
132        if consumed > MAX_STRING_SEQUENCE_BYTES {
133            return Some(index + 1);
134        }
135    }
136    None
137}
138
139#[inline]
140fn parse_ansi_sequence_bytes(bytes: &[u8]) -> Option<usize> {
141    if bytes.is_empty() {
142        return None;
143    }
144
145    if let Some((c1, c1_len)) = parse_c1_at(bytes, 0) {
146        return match c1 {
147            C1_CSI => parse_csi(bytes, c1_len),
148            C1_OSC => parse_osc(bytes, c1_len),
149            C1_DCS | C1_SOS | C1_PM | C1_APC => parse_st_terminated(bytes, c1_len),
150            _ => Some(c1_len),
151        };
152    }
153
154    match bytes[0] {
155        ESC => {
156            if bytes.len() < 2 {
157                return None;
158            }
159
160            match bytes[1] {
161                b'[' => parse_csi(bytes, 2),
162                b']' => parse_osc(bytes, 2),
163                b'P' | b'^' | b'_' | b'X' => parse_st_terminated(bytes, 2),
164                next if next < 128 => Some(2),
165                _ => Some(1),
166            }
167        }
168        _ => None,
169    }
170}
171
172/// Strip ANSI escape codes from text, keeping only plain text
173pub fn strip_ansi(text: &str) -> String {
174    let mut output = Vec::with_capacity(text.len());
175    let bytes = text.as_bytes();
176    let mut i = 0;
177
178    while i < bytes.len() {
179        if bytes[i] == ESC
180            && let Some(len) = parse_ansi_sequence_bytes(&bytes[i..])
181        {
182            i += len;
183            continue;
184        }
185        if bytes[i] == ESC {
186            // Incomplete/unterminated control sequence at end of available text.
187            break;
188        }
189
190        if bytes[i] == b'\n' || bytes[i] == b'\r' || bytes[i] == b'\t' {
191            output.push(bytes[i]);
192            i += 1;
193        } else if bytes[i] < 32 || bytes[i] == DEL {
194            i += 1;
195        } else {
196            output.push(bytes[i]);
197            i += 1;
198        }
199    }
200
201    String::from_utf8_lossy(&output).into_owned()
202}
203
204/// Strip ANSI escape codes from arbitrary bytes, preserving non-control bytes.
205///
206/// This is the preferred API when input may contain raw C1 (8-bit) controls.
207pub fn strip_ansi_bytes(input: &[u8]) -> Vec<u8> {
208    let mut output = Vec::with_capacity(input.len());
209    let bytes = input;
210    let mut i = 0;
211
212    while i < bytes.len() {
213        if (bytes[i] == ESC || parse_c1_at(bytes, i).is_some())
214            && let Some(len) = parse_ansi_sequence_bytes(&bytes[i..])
215        {
216            i += len;
217            continue;
218        }
219        if bytes[i] == ESC || parse_c1_at(bytes, i).is_some() {
220            // Incomplete/unterminated control sequence at end of available text.
221            break;
222        }
223
224        if bytes[i] == b'\n' || bytes[i] == b'\r' || bytes[i] == b'\t' {
225            output.push(bytes[i]);
226            i += 1;
227        } else if bytes[i] < 32 || bytes[i] == DEL {
228            i += 1;
229        } else {
230            output.push(bytes[i]);
231            i += 1;
232        }
233    }
234    output
235}
236
237/// Parse and determine the length of the ANSI escape sequence at the start of text
238pub fn parse_ansi_sequence(text: &str) -> Option<usize> {
239    let bytes = text.as_bytes();
240    parse_ansi_sequence_bytes(bytes)
241}
242
243/// Fast ASCII-only ANSI stripping for performance-critical paths
244pub fn strip_ansi_ascii_only(text: &str) -> String {
245    let mut output = String::with_capacity(text.len());
246    let bytes = text.as_bytes();
247    let mut i = 0;
248    let mut last_valid = 0;
249
250    while i < bytes.len() {
251        if (bytes[i] == ESC || parse_c1_at(bytes, i).is_some())
252            && let Some(len) = parse_ansi_sequence_bytes(&bytes[i..])
253        {
254            if last_valid < i {
255                output.push_str(&text[last_valid..i]);
256            }
257            i += len;
258            last_valid = i;
259            continue;
260        }
261
262        i += 1;
263    }
264
265    if last_valid < text.len() {
266        output.push_str(&text[last_valid..]);
267    }
268
269    output
270}
271
/// Reports whether `text` contains anything outside ASCII.
///
/// Returns `true` as soon as any byte of the UTF-8 encoding is `>= 0x80`, and
/// `false` for empty or pure-ASCII input (control characters included).
pub fn contains_unicode(text: &str) -> bool {
    !text.is_ascii()
}
276
#[cfg(test)]
mod tests {
    use super::{CAN, SUB, strip_ansi, strip_ansi_ascii_only};

    /// Asserts that `strip_ansi` turns `input` into `expected`.
    #[track_caller]
    fn check(input: &str, expected: &str) {
        assert_eq!(strip_ansi(input), expected);
    }

    #[test]
    fn strips_esc_csi_sequences() {
        let colored = "a\x1b[31mred\x1b[0mz";
        check(colored, "aredz");
        assert_eq!(strip_ansi_ascii_only(colored), "aredz");
    }

    #[test]
    fn utf8_encoded_c1_is_not_reprocessed_as_control() {
        // XTerm/ECMA-48: controls are processed once; decoded UTF-8 text is
        // not reprocessed as C1, so the text must come back unchanged.
        let text = "a\u{009b}31mred";
        check(text, text);
    }

    #[test]
    fn strip_removes_ascii_del_control() {
        check("a\x7fb", "ab");
    }

    #[test]
    fn csi_aborts_on_esc_then_new_sequence_parses() {
        check("a\x1b[31\x1b[32mgreen\x1b[0mz", "agreenz");
    }

    #[test]
    fn csi_aborts_on_can_and_sub() {
        for abort in [CAN, SUB] {
            let mut input = String::from("a\x1b[31");
            input.push(char::from(abort));
            input.push('b');
            check(&input, "ab");
        }
    }

    #[test]
    fn osc_aborts_on_esc_non_st() {
        check("a\x1b]title\x1b[31mred\x1b[0mz", "aredz");
    }

    #[test]
    fn incomplete_sequence_drops_tail() {
        check("text\x1b[31", "text");
    }

    #[test]
    fn strips_common_progress_redraw_sequences() {
        // Typical dynamic CLI update: carriage return, erase line, redraw.
        check(
            "\r\x1b[2KProgress 10%\r\x1b[2KDone\n",
            "\rProgress 10%\rDone\n",
        );
    }

    #[test]
    fn strips_cursor_navigation_sequences() {
        check("left\x1b[1D!\nup\x1b[1Arow", "left!\nuprow");
    }

    #[test]
    fn strip_ansi_bytes_supports_raw_c1_csi() {
        let parts: [&[u8]; 5] = [b"a", &[0x9b], b"31mred", &[0x9b], b"0mz"];
        let input = parts.concat();
        assert_eq!(super::strip_ansi_bytes(&input), b"aredz");
    }

    #[test]
    fn strip_ansi_bytes_supports_raw_c1_osc_and_st() {
        let parts: [&[u8]; 9] = [
            b"pre",
            &[0x9d],
            b"8;;https://example.com",
            &[0x9c],
            b"link",
            &[0x9d],
            b"8;;",
            &[0x9c],
            b"post",
        ];
        let input = parts.concat();
        assert_eq!(super::strip_ansi_bytes(&input), b"prelinkpost");
    }

    #[test]
    fn csi_respects_parameter_intermediate_final_grammar() {
        // Parameter bytes ("1;2"), an intermediate byte (" "), then the final "m".
        check("a\x1b[1;2 mred\x1b[0mz", "aredz");
    }

    #[test]
    fn malformed_csi_does_not_consume_following_text() {
        // 0x10 is neither a CSI parameter, intermediate nor final byte.
        check("a\x1b[12\x10visible", "avisible");
    }

    #[test]
    fn strips_wikipedia_sgr_8bit_color_pattern() {
        check("x\x1b[38;5;196mred\x1b[0my", "xredy");
    }

    #[test]
    fn strips_wikipedia_sgr_truecolor_pattern() {
        check("x\x1b[48;2;12;34;56mblock\x1b[0my", "xblocky");
    }

    #[test]
    fn strips_wikipedia_osc8_hyperlink_pattern() {
        check(
            "go \x1b]8;;https://example.com\x1b\\here\x1b]8;;\x1b\\ now",
            "go here now",
        );
    }

    #[test]
    fn strips_dec_private_mode_csi() {
        check("a\x1b[?25lb\x1b[?25hc", "abc");
    }
}