prettytty 0.3.0

Simple, lightweight terminal I/O and configuration
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
//! The machinery for recognizing UTF-8 and ANSI escape codes.

mod buffer;
mod machine;
mod utf8;

use self::buffer::Buffer;
use self::machine::{transition, Action, State};
use self::utf8::scan_utf8_length;

use super::err::{Error, ErrorKind};
use super::opt::Options;
use super::{Control, Token};

// ================================================================================================

/// A scanner for text and control tokens.
///
/// This struct builds on Paul Flo Williams' [parser for DEC's ANSI-compatible
/// terminals](https://vt100.net/emu/dec_ansi_parser) to implement a state
/// machine for recognizing UTF-8 characters and ANSI control sequences alike.
/// Notably, [`Scanner::read_token`] produces the corresponding [`Token`]s. To
/// minimize overhead, the implementation combines a run of consecutive UTF-8
/// characters into a single text token. It is zero-copy as long as no control
/// characters appear in the middle of control sequences. As a result, tokens
/// have the same lifetime as the scanner itself, and each token must be
/// processed before the next invocation of `read_token`.
///
/// The implementation of the state machine has been carefully engineered to
/// return to the well-known start state if at all possible, including for
/// errors. Still, that is not always possible, notably for errors in the
/// underlying input and when recognizing a control code while already
/// processing a control sequence. Unless the underlying input keeps rejecting
/// read requests, reading more tokens is a viable strategy for eventually
/// returning to the start state.
pub struct Scanner<R> {
    /// The underlying reader.
    reader: R,
    /// The state machine state for the escape sequence being recognized.
    state: State,
    /// The control for the escape sequence being recognized.
    control: Option<Control>,
    /// The byte data being scanned.
    buffer: Buffer,
    /// The flag for the current escape sequence being too long for the
    /// buffer. It is cleared again when the next sequence starts.
    did_overflow: bool,
    /// The actual length of the current escape sequence.
    sequence_length: usize,
    /// The maximum length for any escape sequence, which must be at least as
    /// large as the buffer size.
    max_sequence_length: usize,
    /// A single byte buffer for control characters in the middle of an escape
    /// sequence.
    extra: [u8; 1],
}

impl<R: std::io::Read> Scanner<R> {
    /// Create a new scanner that reads from `reader` and is configured by
    /// `options`.
    ///
    /// The options determine both the internal buffer and the maximum length
    /// of an escape sequence (the options' pathological size). The scanner
    /// starts out in the ground state with no control and no overflow
    /// recorded.
    pub fn with_options(options: &Options, reader: R) -> Self {
        let buffer = Buffer::with_options(options);
        let max_sequence_length = options.pathological_size();

        Self {
            reader,
            buffer,
            max_sequence_length,
            state: State::Ground,
            control: None,
            did_overflow: false,
            sequence_length: 0,
            extra: [0; 1],
        }
    }

    // ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
    // Manage the internal buffer

    /// Determine whether the scanner's buffer has readable content available.
    ///
    /// This method only inspects the internal buffer; it does not touch the
    /// underlying reader.
    #[inline]
    pub fn is_readable(&self) -> bool {
        self.buffer.is_readable()
    }

    /// Make sure the buffer holds readable bytes, reading more if necessary.
    ///
    /// The result tells the caller whether the underlying input was consulted:
    ///
    ///   * `Ok(None)` means the buffer already had readable bytes and no read
    ///     was attempted.
    ///   * `Ok(Some(n))` means the buffer was empty, space was made, and the
    ///     underlying input delivered `n` bytes. `n` may be zero.
    ///
    /// Errors from filling the buffer are propagated to the caller.
    fn ensure_readable(&mut self) -> Result<Option<usize>, Error> {
        if self.buffer.is_readable() {
            return Ok(None);
        }

        let at_ground = matches!(self.state, State::Ground);
        if at_ground {
            // Neither readable data nor a partial token to keep around, so
            // the buffer can simply start over.
            self.buffer.reset();
        } else if self.buffer.is_exhausted() {
            // The only way to make progress is to start over, but the
            // in-flight sequence is now incomplete. Remember that.
            self.buffer.reset();
            self.did_overflow = true;
        } else if !self.buffer.has_capacity() {
            // Some terminals require two reads for an OSC/ST sequence, hence
            // only shift contents backwards when there is no capacity left.
            self.buffer.defrag();
        }

        let bytes_read = self.buffer.fill(&mut self.reader)?;
        Ok(Some(bytes_read))
    }

    // ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
    // Support for reading bytes

    /// Determine whether this scanner's state machine is in-flight.
    ///
    /// The state machine is in-flight whenever its state is anything other
    /// than the ground state, i.e., while a control sequence is being
    /// recognized.
    #[inline]
    pub fn in_flight(&self) -> bool {
        !matches!(self.state, State::Ground)
    }

    /// Get a slice with the unread bytes.
    ///
    /// The underlying input is only consulted if no unread bytes are buffered
    /// already. If that read yields no bytes, the returned slice is empty.
    ///
    /// # Errors
    ///
    /// Fails with `ErrorKind::InFlight` while the state machine is in the
    /// middle of a control sequence and propagates any error from reading the
    /// underlying input.
    pub fn fill_buf(&mut self) -> Result<&[u8], Error> {
        if self.in_flight() {
            return Err(ErrorKind::InFlight.into());
        }

        match self.ensure_readable()? {
            Some(0) => Ok(&[]),
            _ => Ok(self.buffer.peek_many()),
        }
    }

    /// Consume unread bytes.
    ///
    /// This method marks at most `count` buffered bytes as consumed.
    ///
    /// # Errors
    ///
    /// Fails with `ErrorKind::InFlight` while the state machine is in the
    /// middle of a control sequence; no bytes are consumed in that case.
    pub fn consume(&mut self, count: usize) -> Result<(), Error> {
        if self.in_flight() {
            Err(ErrorKind::InFlight.into())
        } else {
            self.buffer.consume_many(count);
            Ok(())
        }
    }

    // ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~
    // Support for reading tokens

    /// Scan the buffer for one or more UTF-8 characters.
    ///
    /// Starting with the first unread byte, this method accepts well-formed
    /// UTF-8 characters until it reaches the end of the buffered bytes, a C0
    /// or C1 control byte, or a malformed character. If it accepted at least
    /// one character, it marks the accepted bytes as the current token and
    /// returns `Ok(true)`. Otherwise, it returns `Ok(false)` and leaves the
    /// buffer untouched.
    ///
    /// # Errors
    ///
    /// If the very first character is malformed, this method consumes the
    /// offending bytes and fails with `ErrorKind::MalformedUtf8`. A malformed
    /// character after some well-formed text only terminates the text; the
    /// error is reported by the next scan.
    fn scan_text(&mut self) -> Result<bool, Error> {
        let mut bytes = self.buffer.peek_many();
        let mut index = 0;

        // Looking at the first byte through `first()` covers the emptiness
        // test and the access with a single check.
        while let Some(&first) = bytes.first() {
            // The first byte of an UTF-8 character is either ASCII or
            // 0xC2..=0xF4. That means that treating 0x80..0xA0 as C1 does not
            // interfere with UTF-8 start bytes. That, however, is not possible
            // for continuation bytes.
            if first < 0x20 || (0x80..0xa0).contains(&first) {
                break;
            }

            match scan_utf8_length(bytes) {
                Ok(size) => {
                    index += size;
                    bytes = &bytes[size..];
                }
                Err(size) if index == 0 => {
                    // Nothing to return as text, so skip the malformed bytes
                    // and report the error right away.
                    self.buffer.consume_many(size);
                    return Err(ErrorKind::MalformedUtf8.into());
                }
                // Return the text scanned so far; the malformed bytes remain
                // unread and are reported by the next invocation.
                Err(_) => break,
            }
        }

        if index == 0 {
            return Ok(false);
        }

        self.buffer.start_token();
        self.buffer.consume_many(index);
        self.buffer.retain_many(index);
        Ok(true)
    }

    /// Advance the state machine for ANSI escape sequences by one byte.
    ///
    /// The byte must have been peeked from the buffer. This method performs
    /// the state transition, tracks the control and length of the current
    /// sequence, and consumes and possibly retains the byte in the buffer.
    /// The caller only needs to act on complete controls and control
    /// sequences, as signalled by the returned action.
    ///
    /// # Errors
    ///
    ///   * `ErrorKind::PathologicalSequence` if the sequence reaches the
    ///     maximum sequence length. The scanner returns to the ground state
    ///     and discards all buffered bytes.
    ///   * `ErrorKind::MalformedSequence` if the transition aborts the
    ///     sequence. For the abort-then-retry action, the byte is left
    ///     unconsumed.
    ///
    /// # Panics
    ///
    /// Panics if the transition yields the print action.
    fn step_sequence(&mut self, byte: u8) -> Result<Action, Error> {
        use self::Action::*;
        use self::Control::*;

        let (next_state, action, control) = transition(self.state, byte);
        self.state = next_state;

        // Track the control as well as the start and length of the sequence.
        match control {
            Some(_) => {
                // A new control always marks the start of a new sequence.
                self.control = control;
                self.buffer.start_token();
                self.did_overflow = false;
                self.sequence_length = 1;
            }
            None if !matches!(self.state, State::Ground) => {
                self.sequence_length += 1;

                if self.sequence_length >= self.max_sequence_length {
                    // Hard reset the scanner upon a pathological control
                    // sequence, which includes discarding buffered bytes.
                    self.state = State::Ground;
                    self.buffer.reset();
                    return Err(ErrorKind::PathologicalSequence.into());
                }
            }
            None => {}
        }

        // Bail out before touching the buffer: one action indicates a bug,
        // the other requires that the byte remains unread.
        match action {
            Print => panic!("printable characters should not appear within control sequence"),
            AbortThenRetry => return Err(ErrorKind::MalformedSequence.into()),
            _ => {}
        }

        // From here on, the byte is consumed no matter what.
        self.buffer.consume();

        if matches!(action, AbortSequence) {
            return Err(ErrorKind::MalformedSequence.into());
        }

        match action {
            RetainByte => self.buffer.retain(),
            Dispatch => {
                let current = self
                    .control
                    .expect("dispatching a control sequence requires a control");
                // For these controls, the final byte is part of the token.
                if matches!(current, CSI | ESC | SS2 | SS3) {
                    self.buffer.retain();
                }
            }
            _ => {}
        }

        Ok(action)
    }

    /// Create a control token for the byte.
    ///
    /// The byte is stored in the scanner's single-byte side buffer, and the
    /// returned token borrows that buffer. This method always succeeds.
    #[inline]
    fn new_control_token(&mut self, byte: u8) -> Result<Token<'_>, Error> {
        self.extra = [byte];
        let token = Token::Control(&self.extra);
        Ok(token)
    }

    /// Create a sequence token for the current control and buffered token.
    ///
    /// # Errors
    ///
    /// Fails with `ErrorKind::OutOfMemory` if the sequence overflowed the
    /// buffer while being scanned.
    ///
    /// # Panics
    ///
    /// Panics if no control has been recorded for the sequence.
    #[inline]
    fn new_sequence_token(&self) -> Result<Token<'_>, Error> {
        if self.did_overflow {
            return Err(ErrorKind::OutOfMemory.into());
        }

        let control = self.control.expect("a control sequence has a control");
        Ok(Token::Sequence(control, self.buffer.token()))
    }

    // ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~

    /// Read the next token.
    pub fn read_token(&mut self) -> Result<Token<'_>, Error> {
        loop {
            // Make sure that we have some bytes to process
            if let Some(0) = self.ensure_readable()? {
                return Err(ErrorKind::NoData.into());
            }

            // Try fast path for text
            if matches!(self.state, State::Ground) && self.scan_text()? {
                return Ok(Token::Text(self.buffer.token()));
            }

            // Run the state machine for control sequences
            while let Some(byte) = self.buffer.peek() {
                use self::Action::*;

                match self.step_sequence(byte)? {
                    HandleControl => return self.new_control_token(byte),
                    Dispatch => return self.new_sequence_token(),
                    _ => (),
                }
            }
        }
    }
}

/// Format the scanner's state for debugging.
///
/// The output omits the reader, so this implementation places no bounds on
/// `R`, and the single-byte side buffer. The formatter marks the output as
/// non-exhaustive.
impl<R> core::fmt::Debug for Scanner<R> {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        let Self {
            state,
            control,
            buffer,
            did_overflow,
            sequence_length,
            max_sequence_length,
            ..
        } = self;

        let mut out = f.debug_struct("Scanner");
        out.field("state", state);
        out.field("control", control);
        out.field("buffer", buffer);
        out.field("did_overflow", did_overflow);
        out.field("sequence_length", sequence_length);
        out.field("max_sequence_length", max_sequence_length);
        out.finish_non_exhaustive()
    }
}

// ================================================================================================

#[cfg(test)]
mod test {
    use super::{transition, Action, Control, Error, ErrorKind, Scanner, State, Token};
    use crate::opt::Options;
    use std::mem::size_of;

    // Guard the compact representation of the state machine: a pair of state
    // and action must occupy exactly two bytes.
    #[test]
    fn test_size() {
        assert_eq!(size_of::<(State, Action)>(), 2);
    }

    // Walk the raw transition function through the bytes of `ESC [ 3 1 m`
    // and check the action and control produced by every step.
    #[test]
    fn test_state_machine() {
        use self::Action::*;
        use self::Control::*;

        let mut state = State::Ground;

        // Feed one byte to the state machine while threading the state.
        let mut step = |byte: u8| {
            let (next, action, control) = transition(state, byte);
            state = next;
            (action, control)
        };

        assert!(matches!(step(0x1b), (StartSequence, Some(ESC))));
        assert!(matches!(step(b'['), (IgnoreByte, Some(CSI))));
        assert!(matches!(step(b'3'), (RetainByte, None)));
        assert!(matches!(step(b'1'), (RetainByte, None)));
        assert!(matches!(step(b'm'), (Dispatch, None)));
    }

    // Scan a mix of text, controls, well-formed sequences, and malformed
    // input, checking every token or error kind against the expected one.
    #[test]
    fn test_events() {
        let mut input = Vec::new();
        input.extend_from_slice(b"a \xe2\x9c\xb6 \x1b[+=+@ \x1bOR \xe2x \x07 ");
        input.extend_from_slice(b"\x1b[31m \x1b[123$$4<=>m \xe2\x81\x82\x1b[38:5:90m");
        let input = input.as_slice();

        let expected_output: &[Result<Token, Error>] = &[
            Ok(Token::Text(b"a \xe2\x9c\xb6 ".as_slice())),
            Err(ErrorKind::MalformedSequence.into()),
            Ok(Token::Text(b" ".as_slice())),
            Ok(Token::Sequence(Control::SS3, &[b'R'])),
            Ok(Token::Text(b" ".as_slice())),
            Err(ErrorKind::MalformedUtf8.into()),
            Ok(Token::Text(b"x ".as_slice())),
            Ok(Token::Control(&[0x07])),
            Ok(Token::Text(b" ".as_slice())),
            Ok(Token::Sequence(Control::CSI, b"31m".as_slice())),
            Ok(Token::Text(b" ".as_slice())),
            Err(ErrorKind::MalformedSequence.into()),
            Ok(Token::Text(b" \xe2\x81\x82".as_slice())),
            Ok(Token::Sequence(Control::CSI, b"38:5:90m".as_slice())),
        ];

        let mut scanner = Scanner::with_options(&Options::default(), input);
        for expected in expected_output {
            println!("\n{:#?}", scanner);
            let result = scanner.read_token();
            println!("got {:?}, expected {:?}", result, expected);

            // Success and failure must line up before comparing details.
            assert_eq!(result.is_ok(), expected.is_ok());
            match (result, expected) {
                (Ok(actual), Ok(wanted)) => assert_eq!(&actual, wanted),
                (Err(actual), Err(wanted)) => assert_eq!(actual.kind(), wanted.kind()),
                _ => unreachable!("the preceding assertion rules out mixed results"),
            }
        }
    }

    // An OSC sequence interrupted by the start of another OSC sequence must
    // be reported as malformed, while the second sequence scans just fine.
    #[test]
    fn test_bad_osc() {
        let mut scanner = Scanner::with_options(
            &Options::default(),
            b"\x1b]junk\x1b]text\x1b\\".as_slice(),
        );

        let first = scanner.read_token();
        assert!(first.is_err());
        assert_eq!(first.unwrap_err().kind(), ErrorKind::MalformedSequence);

        let second = scanner.read_token();
        assert!(second.is_ok());
        assert_eq!(second.unwrap(), Token::Sequence(Control::OSC, b"text"));
    }
}