edit/
vt.rs

1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4//! Our VT parser.
5
6use std::time;
7
8use crate::simd::memchr2;
9use crate::unicode::Utf8Chars;
10
11/// The parser produces these tokens.
12pub enum Token<'parser, 'input> {
13    /// A bunch of text. Doesn't contain any control characters.
14    Text(&'input str),
15    /// A single control character, like backspace or return.
16    Ctrl(char),
17    /// We encountered `ESC x` and this contains `x`.
18    Esc(char),
19    /// We encountered `ESC O x` and this contains `x`.
20    SS3(char),
21    /// A CSI sequence started with `ESC [`.
22    ///
23    /// They are the most common escape sequences. See [`Csi`].
24    Csi(&'parser Csi),
25    /// An OSC sequence started with `ESC ]`.
26    ///
27    /// The sequence may be split up into multiple tokens if the input
28    /// is given in chunks. This is indicated by the `partial` field.
29    Osc { data: &'input str, partial: bool },
30    /// An DCS sequence started with `ESC P`.
31    ///
32    /// The sequence may be split up into multiple tokens if the input
33    /// is given in chunks. This is indicated by the `partial` field.
34    Dcs { data: &'input str, partial: bool },
35}
36
/// Stores the state of the parser.
///
/// The state is kept in [`Parser`] so that a sequence which is split
/// across two input chunks can be resumed with the next chunk.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum State {
    /// Not inside any escape sequence.
    Ground,
    /// An escape character was seen; the next character decides what follows.
    Esc,
    /// `ESC O` was seen; the next character completes the SS3 sequence.
    Ss3,
    /// `ESC [` was seen; parameters are collected until the final byte.
    Csi,
    /// `ESC ]` was seen; data is collected until the terminator.
    Osc,
    /// `ESC P` was seen; data is collected until the terminator.
    Dcs,
    /// Inside an OSC sequence and the previous chunk ended with an escape
    /// character, which may or may not be the start of a string terminator.
    OscEsc,
    /// Inside a DCS sequence and the previous chunk ended with an escape
    /// character, which may or may not be the start of a string terminator.
    DcsEsc,
}
49
/// A single CSI sequence, parsed for your convenience.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Csi {
    /// The numeric parameters of the CSI sequence.
    ///
    /// Omitted parameters are stored as `0`, and values that do not
    /// fit into a `u16` are saturated to `u16::MAX`.
    pub params: [u16; 32],
    /// The number of parameters stored in [`Csi::params`].
    pub param_count: usize,
    /// The private byte, if any. `'\0'` if none.
    ///
    /// This is one of the characters `<`, `=`, `>` or `?`. It usually
    /// appears right after the `ESC [` introducer (e.g. `?` or `<`).
    /// If several of them occur in one sequence, the last one is kept.
    pub private_byte: char,
    /// The final byte of the CSI sequence.
    ///
    /// This is the last character of the sequence, e.g. `m` or `H`.
    pub final_byte: char,
}
66
67pub struct Parser {
68    state: State,
69    // Csi is not part of State, because it allows us
70    // to more quickly erase and reuse the struct.
71    csi: Csi,
72}
73
74impl Parser {
75    pub fn new() -> Self {
76        Self {
77            state: State::Ground,
78            csi: Csi { params: [0; 32], param_count: 0, private_byte: '\0', final_byte: '\0' },
79        }
80    }
81
82    /// Suggests a timeout for the next call to `read()`.
83    ///
84    /// We need this because of the ambiguity of whether a trailing
85    /// escape character in an input is starting another escape sequence or
86    /// is just the result of the user literally pressing the Escape key.
87    pub fn read_timeout(&mut self) -> std::time::Duration {
88        match self.state {
89            // 100ms is a upper ceiling for a responsive feel.
90            // Realistically though, this could be much lower.
91            //
92            // However, there seems to be issues with OpenSSH on Windows.
93            // See: https://github.com/PowerShell/Win32-OpenSSH/issues/2275
94            State::Esc => time::Duration::from_millis(100),
95            _ => time::Duration::MAX,
96        }
97    }
98
99    /// Parses the given input into VT sequences.
100    ///
101    /// You should call this function even if your `read()`
102    /// had a timeout (pass an empty string in that case).
103    pub fn parse<'parser, 'input>(
104        &'parser mut self,
105        input: &'input str,
106    ) -> Stream<'parser, 'input> {
107        Stream { parser: self, input, off: 0 }
108    }
109}
110
111/// An iterator that parses VT sequences into [`Token`]s.
112///
113/// Can't implement [`Iterator`], because this is a "lending iterator".
114pub struct Stream<'parser, 'input> {
115    parser: &'parser mut Parser,
116    input: &'input str,
117    off: usize,
118}
119
120impl<'input> Stream<'_, 'input> {
121    /// Returns the input that is being parsed.
122    pub fn input(&self) -> &'input str {
123        self.input
124    }
125
126    /// Returns the current parser offset.
127    pub fn offset(&self) -> usize {
128        self.off
129    }
130
131    /// Reads and consumes raw bytes from the input.
132    pub fn read(&mut self, dst: &mut [u8]) -> usize {
133        let bytes = self.input.as_bytes();
134        let off = self.off.min(bytes.len());
135        let len = dst.len().min(bytes.len() - off);
136        dst[..len].copy_from_slice(&bytes[off..off + len]);
137        self.off += len;
138        len
139    }
140
141    fn decode_next(&mut self) -> char {
142        let mut iter = Utf8Chars::new(self.input.as_bytes(), self.off);
143        let c = iter.next().unwrap_or('\0');
144        self.off = iter.offset();
145        c
146    }
147
148    /// Parses the next VT sequence from the previously given input.
149    #[allow(
150        clippy::should_implement_trait,
151        reason = "can't implement Iterator because this is a lending iterator"
152    )]
153    pub fn next(&mut self) -> Option<Token<'_, 'input>> {
154        let input = self.input;
155        let bytes = input.as_bytes();
156
157        // If the previous input ended with an escape character, `read_timeout()`
158        // returned `Some(..)` timeout, and if the caller did everything correctly
159        // and there was indeed a timeout, we should be called with an empty
160        // input. In that case we'll return the escape as its own token.
161        if input.is_empty() && matches!(self.parser.state, State::Esc) {
162            self.parser.state = State::Ground;
163            return Some(Token::Esc('\0'));
164        }
165
166        while self.off < bytes.len() {
167            // TODO: The state machine can be roughly broken up into two parts:
168            // * Wants to parse 1 `char` at a time: Ground, Esc, Ss3
169            //   These could all be unified to a single call to `decode_next()`.
170            // * Wants to bulk-process bytes: Csi, Osc, Dcs
171            // We should do that so the UTF8 handling is a bit more "unified".
172            match self.parser.state {
173                State::Ground => match bytes[self.off] {
174                    0x1b => {
175                        self.parser.state = State::Esc;
176                        self.off += 1;
177                    }
178                    c @ (0x00..0x20 | 0x7f) => {
179                        self.off += 1;
180                        return Some(Token::Ctrl(c as char));
181                    }
182                    _ => {
183                        let beg = self.off;
184                        while {
185                            self.off += 1;
186                            self.off < bytes.len()
187                                && bytes[self.off] >= 0x20
188                                && bytes[self.off] != 0x7f
189                        } {}
190                        return Some(Token::Text(&input[beg..self.off]));
191                    }
192                },
193                State::Esc => match self.decode_next() {
194                    '[' => {
195                        self.parser.state = State::Csi;
196                        self.parser.csi.private_byte = '\0';
197                        self.parser.csi.final_byte = '\0';
198                        while self.parser.csi.param_count > 0 {
199                            self.parser.csi.param_count -= 1;
200                            self.parser.csi.params[self.parser.csi.param_count] = 0;
201                        }
202                    }
203                    ']' => {
204                        self.parser.state = State::Osc;
205                    }
206                    'O' => {
207                        self.parser.state = State::Ss3;
208                    }
209                    'P' => {
210                        self.parser.state = State::Dcs;
211                    }
212                    c => {
213                        self.parser.state = State::Ground;
214                        return Some(Token::Esc(c));
215                    }
216                },
217                State::Ss3 => {
218                    self.parser.state = State::Ground;
219                    return Some(Token::SS3(self.decode_next()));
220                }
221                State::Csi => {
222                    loop {
223                        // If we still have slots left, parse the parameter.
224                        if self.parser.csi.param_count < self.parser.csi.params.len() {
225                            let dst = &mut self.parser.csi.params[self.parser.csi.param_count];
226                            while self.off < bytes.len() && bytes[self.off].is_ascii_digit() {
227                                let add = bytes[self.off] as u32 - b'0' as u32;
228                                let value = *dst as u32 * 10 + add;
229                                *dst = value.min(u16::MAX as u32) as u16;
230                                self.off += 1;
231                            }
232                        } else {
233                            // ...otherwise, skip the parameters until we find the final byte.
234                            while self.off < bytes.len() && bytes[self.off].is_ascii_digit() {
235                                self.off += 1;
236                            }
237                        }
238
239                        // Encountered the end of the input before finding the final byte.
240                        if self.off >= bytes.len() {
241                            return None;
242                        }
243
244                        let c = bytes[self.off];
245                        self.off += 1;
246
247                        match c {
248                            0x40..=0x7e => {
249                                self.parser.state = State::Ground;
250                                self.parser.csi.final_byte = c as char;
251                                if self.parser.csi.param_count != 0
252                                    || self.parser.csi.params[0] != 0
253                                {
254                                    self.parser.csi.param_count += 1;
255                                }
256                                return Some(Token::Csi(&self.parser.csi));
257                            }
258                            b';' => self.parser.csi.param_count += 1,
259                            b'<'..=b'?' => self.parser.csi.private_byte = c as char,
260                            _ => {}
261                        }
262                    }
263                }
264                State::Osc | State::Dcs => {
265                    let beg = self.off;
266                    let mut data;
267                    let mut partial;
268
269                    loop {
270                        // Find any indication for the end of the OSC/DCS sequence.
271                        self.off = memchr2(b'\x07', b'\x1b', bytes, self.off);
272
273                        data = &input[beg..self.off];
274                        partial = self.off >= bytes.len();
275
276                        // Encountered the end of the input before finding the terminator.
277                        if partial {
278                            break;
279                        }
280
281                        let c = bytes[self.off];
282                        self.off += 1;
283
284                        if c == 0x1b {
285                            // It's only a string terminator if it's followed by \.
286                            // We're at the end so we're saving the state and will continue next time.
287                            if self.off >= bytes.len() {
288                                self.parser.state = match self.parser.state {
289                                    State::Osc => State::OscEsc,
290                                    _ => State::DcsEsc,
291                                };
292                                partial = true;
293                                break;
294                            }
295
296                            // False alarm: Not a string terminator.
297                            if bytes[self.off] != b'\\' {
298                                continue;
299                            }
300
301                            self.off += 1;
302                        }
303
304                        break;
305                    }
306
307                    let state = self.parser.state;
308                    if !partial {
309                        self.parser.state = State::Ground;
310                    }
311                    return match state {
312                        State::Osc => Some(Token::Osc { data, partial }),
313                        _ => Some(Token::Dcs { data, partial }),
314                    };
315                }
316                State::OscEsc | State::DcsEsc => {
317                    // We were processing an OSC/DCS sequence and the last byte was an escape character.
318                    // It's only a string terminator if it's followed by \ (= "\x1b\\").
319                    if bytes[self.off] == b'\\' {
320                        // It was indeed a string terminator and we can now tell the caller about it.
321                        let state = self.parser.state;
322
323                        // Consume the terminator (one byte in the previous input and this byte).
324                        self.parser.state = State::Ground;
325                        self.off += 1;
326
327                        return match state {
328                            State::OscEsc => Some(Token::Osc { data: "", partial: false }),
329                            _ => Some(Token::Dcs { data: "", partial: false }),
330                        };
331                    } else {
332                        // False alarm: Not a string terminator.
333                        // We'll return the escape character as a separate token.
334                        // Processing will continue from the current state (`bytes[self.off]`).
335                        self.parser.state = match self.parser.state {
336                            State::OscEsc => State::Osc,
337                            _ => State::Dcs,
338                        };
339                        return match self.parser.state {
340                            State::Osc => Some(Token::Osc { data: "\x1b", partial: true }),
341                            _ => Some(Token::Dcs { data: "\x1b", partial: true }),
342                        };
343                    }
344                }
345            }
346        }
347
348        None
349    }
350}