edit/vt.rs
1// Copyright (c) Microsoft Corporation.
2// Licensed under the MIT License.
3
4//! Our VT parser.
5
6use std::time;
7
8use crate::simd::memchr2;
9use crate::unicode::Utf8Chars;
10
11/// The parser produces these tokens.
12pub enum Token<'parser, 'input> {
13 /// A bunch of text. Doesn't contain any control characters.
14 Text(&'input str),
15 /// A single control character, like backspace or return.
16 Ctrl(char),
17 /// We encountered `ESC x` and this contains `x`.
18 Esc(char),
19 /// We encountered `ESC O x` and this contains `x`.
20 SS3(char),
21 /// A CSI sequence started with `ESC [`.
22 ///
23 /// They are the most common escape sequences. See [`Csi`].
24 Csi(&'parser Csi),
25 /// An OSC sequence started with `ESC ]`.
26 ///
27 /// The sequence may be split up into multiple tokens if the input
28 /// is given in chunks. This is indicated by the `partial` field.
29 Osc { data: &'input str, partial: bool },
30 /// An DCS sequence started with `ESC P`.
31 ///
32 /// The sequence may be split up into multiple tokens if the input
33 /// is given in chunks. This is indicated by the `partial` field.
34 Dcs { data: &'input str, partial: bool },
35}
36
/// The parser's position inside the escape sequence state machine.
#[derive(Copy, Clone)]
enum State {
    /// Not inside any escape sequence.
    Ground,
    /// An escape character was consumed; the next character picks the sequence type.
    Esc,
    /// `ESC O` was consumed; the next character completes the sequence.
    Ss3,
    /// Inside `ESC [`, collecting parameters until the final byte shows up.
    Csi,
    /// Inside `ESC ]`, collecting data until a terminator shows up.
    Osc,
    /// Inside `ESC P`, collecting data until a terminator shows up.
    Dcs,
    /// Inside an OSC sequence whose previous input ended with an escape character.
    OscEsc,
    /// Inside a DCS sequence whose previous input ended with an escape character.
    DcsEsc,
}
49
/// One fully parsed CSI sequence.
pub struct Csi {
    /// Numeric parameters of the sequence, in the order they appeared.
    pub params: [u16; 32],
    /// How many entries of [`Csi::params`] are in use.
    pub param_count: usize,
    /// The private byte, or `'\0'` if the sequence has none.
    ///
    /// This is the character that directly follows `ESC [`,
    /// which is usually a `?` or `<`.
    pub private_byte: char,
    /// The final byte, i.e. the character that ends the sequence.
    ///
    /// Examples are `m` or `H`.
    pub final_byte: char,
}
66
67pub struct Parser {
68 state: State,
69 // Csi is not part of State, because it allows us
70 // to more quickly erase and reuse the struct.
71 csi: Csi,
72}
73
74impl Parser {
75 pub fn new() -> Self {
76 Self {
77 state: State::Ground,
78 csi: Csi { params: [0; 32], param_count: 0, private_byte: '\0', final_byte: '\0' },
79 }
80 }
81
82 /// Suggests a timeout for the next call to `read()`.
83 ///
84 /// We need this because of the ambiguity of whether a trailing
85 /// escape character in an input is starting another escape sequence or
86 /// is just the result of the user literally pressing the Escape key.
87 pub fn read_timeout(&mut self) -> std::time::Duration {
88 match self.state {
89 // 100ms is a upper ceiling for a responsive feel.
90 // Realistically though, this could be much lower.
91 //
92 // However, there seems to be issues with OpenSSH on Windows.
93 // See: https://github.com/PowerShell/Win32-OpenSSH/issues/2275
94 State::Esc => time::Duration::from_millis(100),
95 _ => time::Duration::MAX,
96 }
97 }
98
99 /// Parses the given input into VT sequences.
100 ///
101 /// You should call this function even if your `read()`
102 /// had a timeout (pass an empty string in that case).
103 pub fn parse<'parser, 'input>(
104 &'parser mut self,
105 input: &'input str,
106 ) -> Stream<'parser, 'input> {
107 Stream { parser: self, input, off: 0 }
108 }
109}
110
111/// An iterator that parses VT sequences into [`Token`]s.
112///
113/// Can't implement [`Iterator`], because this is a "lending iterator".
114pub struct Stream<'parser, 'input> {
115 parser: &'parser mut Parser,
116 input: &'input str,
117 off: usize,
118}
119
120impl<'input> Stream<'_, 'input> {
121 /// Returns the input that is being parsed.
122 pub fn input(&self) -> &'input str {
123 self.input
124 }
125
126 /// Returns the current parser offset.
127 pub fn offset(&self) -> usize {
128 self.off
129 }
130
131 /// Reads and consumes raw bytes from the input.
132 pub fn read(&mut self, dst: &mut [u8]) -> usize {
133 let bytes = self.input.as_bytes();
134 let off = self.off.min(bytes.len());
135 let len = dst.len().min(bytes.len() - off);
136 dst[..len].copy_from_slice(&bytes[off..off + len]);
137 self.off += len;
138 len
139 }
140
141 fn decode_next(&mut self) -> char {
142 let mut iter = Utf8Chars::new(self.input.as_bytes(), self.off);
143 let c = iter.next().unwrap_or('\0');
144 self.off = iter.offset();
145 c
146 }
147
148 /// Parses the next VT sequence from the previously given input.
149 #[allow(
150 clippy::should_implement_trait,
151 reason = "can't implement Iterator because this is a lending iterator"
152 )]
153 pub fn next(&mut self) -> Option<Token<'_, 'input>> {
154 let input = self.input;
155 let bytes = input.as_bytes();
156
157 // If the previous input ended with an escape character, `read_timeout()`
158 // returned `Some(..)` timeout, and if the caller did everything correctly
159 // and there was indeed a timeout, we should be called with an empty
160 // input. In that case we'll return the escape as its own token.
161 if input.is_empty() && matches!(self.parser.state, State::Esc) {
162 self.parser.state = State::Ground;
163 return Some(Token::Esc('\0'));
164 }
165
166 while self.off < bytes.len() {
167 // TODO: The state machine can be roughly broken up into two parts:
168 // * Wants to parse 1 `char` at a time: Ground, Esc, Ss3
169 // These could all be unified to a single call to `decode_next()`.
170 // * Wants to bulk-process bytes: Csi, Osc, Dcs
171 // We should do that so the UTF8 handling is a bit more "unified".
172 match self.parser.state {
173 State::Ground => match bytes[self.off] {
174 0x1b => {
175 self.parser.state = State::Esc;
176 self.off += 1;
177 }
178 c @ (0x00..0x20 | 0x7f) => {
179 self.off += 1;
180 return Some(Token::Ctrl(c as char));
181 }
182 _ => {
183 let beg = self.off;
184 while {
185 self.off += 1;
186 self.off < bytes.len()
187 && bytes[self.off] >= 0x20
188 && bytes[self.off] != 0x7f
189 } {}
190 return Some(Token::Text(&input[beg..self.off]));
191 }
192 },
193 State::Esc => match self.decode_next() {
194 '[' => {
195 self.parser.state = State::Csi;
196 self.parser.csi.private_byte = '\0';
197 self.parser.csi.final_byte = '\0';
198 while self.parser.csi.param_count > 0 {
199 self.parser.csi.param_count -= 1;
200 self.parser.csi.params[self.parser.csi.param_count] = 0;
201 }
202 }
203 ']' => {
204 self.parser.state = State::Osc;
205 }
206 'O' => {
207 self.parser.state = State::Ss3;
208 }
209 'P' => {
210 self.parser.state = State::Dcs;
211 }
212 c => {
213 self.parser.state = State::Ground;
214 return Some(Token::Esc(c));
215 }
216 },
217 State::Ss3 => {
218 self.parser.state = State::Ground;
219 return Some(Token::SS3(self.decode_next()));
220 }
221 State::Csi => {
222 loop {
223 // If we still have slots left, parse the parameter.
224 if self.parser.csi.param_count < self.parser.csi.params.len() {
225 let dst = &mut self.parser.csi.params[self.parser.csi.param_count];
226 while self.off < bytes.len() && bytes[self.off].is_ascii_digit() {
227 let add = bytes[self.off] as u32 - b'0' as u32;
228 let value = *dst as u32 * 10 + add;
229 *dst = value.min(u16::MAX as u32) as u16;
230 self.off += 1;
231 }
232 } else {
233 // ...otherwise, skip the parameters until we find the final byte.
234 while self.off < bytes.len() && bytes[self.off].is_ascii_digit() {
235 self.off += 1;
236 }
237 }
238
239 // Encountered the end of the input before finding the final byte.
240 if self.off >= bytes.len() {
241 return None;
242 }
243
244 let c = bytes[self.off];
245 self.off += 1;
246
247 match c {
248 0x40..=0x7e => {
249 self.parser.state = State::Ground;
250 self.parser.csi.final_byte = c as char;
251 if self.parser.csi.param_count != 0
252 || self.parser.csi.params[0] != 0
253 {
254 self.parser.csi.param_count += 1;
255 }
256 return Some(Token::Csi(&self.parser.csi));
257 }
258 b';' => self.parser.csi.param_count += 1,
259 b'<'..=b'?' => self.parser.csi.private_byte = c as char,
260 _ => {}
261 }
262 }
263 }
264 State::Osc | State::Dcs => {
265 let beg = self.off;
266 let mut data;
267 let mut partial;
268
269 loop {
270 // Find any indication for the end of the OSC/DCS sequence.
271 self.off = memchr2(b'\x07', b'\x1b', bytes, self.off);
272
273 data = &input[beg..self.off];
274 partial = self.off >= bytes.len();
275
276 // Encountered the end of the input before finding the terminator.
277 if partial {
278 break;
279 }
280
281 let c = bytes[self.off];
282 self.off += 1;
283
284 if c == 0x1b {
285 // It's only a string terminator if it's followed by \.
286 // We're at the end so we're saving the state and will continue next time.
287 if self.off >= bytes.len() {
288 self.parser.state = match self.parser.state {
289 State::Osc => State::OscEsc,
290 _ => State::DcsEsc,
291 };
292 partial = true;
293 break;
294 }
295
296 // False alarm: Not a string terminator.
297 if bytes[self.off] != b'\\' {
298 continue;
299 }
300
301 self.off += 1;
302 }
303
304 break;
305 }
306
307 let state = self.parser.state;
308 if !partial {
309 self.parser.state = State::Ground;
310 }
311 return match state {
312 State::Osc => Some(Token::Osc { data, partial }),
313 _ => Some(Token::Dcs { data, partial }),
314 };
315 }
316 State::OscEsc | State::DcsEsc => {
317 // We were processing an OSC/DCS sequence and the last byte was an escape character.
318 // It's only a string terminator if it's followed by \ (= "\x1b\\").
319 if bytes[self.off] == b'\\' {
320 // It was indeed a string terminator and we can now tell the caller about it.
321 let state = self.parser.state;
322
323 // Consume the terminator (one byte in the previous input and this byte).
324 self.parser.state = State::Ground;
325 self.off += 1;
326
327 return match state {
328 State::OscEsc => Some(Token::Osc { data: "", partial: false }),
329 _ => Some(Token::Dcs { data: "", partial: false }),
330 };
331 } else {
332 // False alarm: Not a string terminator.
333 // We'll return the escape character as a separate token.
334 // Processing will continue from the current state (`bytes[self.off]`).
335 self.parser.state = match self.parser.state {
336 State::OscEsc => State::Osc,
337 _ => State::Dcs,
338 };
339 return match self.parser.state {
340 State::Osc => Some(Token::Osc { data: "\x1b", partial: true }),
341 _ => Some(Token::Dcs { data: "\x1b", partial: true }),
342 };
343 }
344 }
345 }
346 }
347
348 None
349 }
350}