miden_assembly_syntax/parser/
scanner.rs

1use core::ops::Range;
2
3// SCANNER
4// ================================================================================================
5
/// [Scanner] handles the low-level details of walking the characters of a source string.
///
/// It iterates over the input with [`str::char_indices`], so every character is paired with the
/// byte offset (a `usize`) at which it starts, and it keeps a small look-ahead window over that
/// stream of `(offset, char)` pairs.
///
/// The [Scanner] is intended to be consumed by a lexer, which handles converting the stream of
/// characters into a token stream for use by the parser.
///
/// ## Scanner Lifecycle
///
/// The following illustrates how content flows from the input through the scanner.
///
/// ```ignore
/// lexer <- (read)      <- current
///       <- (peek)      <- pending
///       <- (peek_next) <- chars (one-item look-ahead, nothing is consumed)
///       <- (pop)       <- current <- pending <- chars
/// ```
///
/// As shown above, the lexer is "pulling" characters from the scanner.
///
/// Both `current` and `pending` are always populated: the constructor loads the first two
/// characters of the input into them, and every advance moves `pending` into `current` and
/// refills `pending` from `chars`.
///
/// When the input has no more characters, a slot is filled with the sentinel `(end, '\0')`,
/// where `end` is the byte length of the input. A literal NUL character inside the input can be
/// told apart from the sentinel by its offset, which is always smaller than `end`.
#[derive(Debug)]
pub struct Scanner<'input> {
    /// The complete source text being scanned.
    input: &'input str,
    /// The characters of `input` that have not been loaded into `pending` yet.
    chars: core::iter::Peekable<core::str::CharIndices<'input>>,
    /// The character the scanner is positioned at, together with its byte offset.
    current: (usize, char),
    /// The character that follows `current`, together with its byte offset.
    pending: (usize, char),
    /// Byte offset of the start of the source; the constructor sets this to zero.
    start: usize,
    /// Byte length of `input`; used as the offset of the end-of-input sentinel.
    end: usize,
}
40
41impl<'input> Scanner<'input> {
42    /// Construct a new [Scanner] for the given `source`.
43    pub fn new(input: &'input str) -> Self {
44        let end = input.len();
45        assert!(end < u32::MAX as usize, "file too large");
46
47        let mut chars = input.char_indices().peekable();
48        let current = chars.next().unwrap_or((0, '\0'));
49        let pending = chars.next().unwrap_or((end, '\0'));
50        Self {
51            input,
52            chars,
53            current,
54            pending,
55            start: 0,
56            end,
57        }
58    }
59
60    /// Returns the byte offset representing the start of the source
61    pub fn start(&self) -> usize {
62        self.start
63    }
64
65    /// Advance scanner pipeline by a single character.
66    ///
67    /// `pending` becomes `current`, and bytes are read from the input to repopulate `pending`.
68    #[inline]
69    pub fn advance(&mut self) {
70        self.current = self.pending;
71        self.pending = self.chars.next().unwrap_or((self.end, '\0'));
72    }
73
74    /// Return the current character and advance our position in the source
75    #[inline]
76    pub fn pop(&mut self) -> (usize, char) {
77        let current = self.current;
78        self.advance();
79        current
80    }
81
82    /// Return the next character in the input, but do not advance.
83    #[inline]
84    pub fn peek(&self) -> (usize, char) {
85        self.pending
86    }
87
88    /// Return the character after the next character in the input, but do not advance.
89    #[inline]
90    pub fn peek_next(&mut self) -> (usize, char) {
91        self.chars.peek().copied().unwrap_or((self.end, '\0'))
92    }
93
94    /// Get current character in the input.
95    #[inline]
96    pub fn read(&self) -> (usize, char) {
97        self.current
98    }
99
100    /// Get a string slice representing the given range in the underlying source
101    #[inline]
102    pub fn slice(&self, span: impl Into<Range<usize>>) -> &'input str {
103        let range = span.into();
104        let bytes = &self.input.as_bytes()[range];
105        core::str::from_utf8(bytes).expect("invalid slice indices")
106    }
107}