miden_assembly_syntax/parser/scanner.rs
1use core::ops::Range;
2
// SCANNER
// ================================================================================================
5
/// [Scanner] handles the low-level details of walking the characters of a source string.
/// It iterates over the UTF-8 characters of the input, and associates each character with the
/// byte offset (cf. [miden_debug_types::ByteIndex]) at which it occurs.
///
/// The [Scanner] is intended to be consumed by a lexer, which handles converting the stream of
/// characters into a token stream for use by the parser.
///
/// ## Scanner Lifecycle
///
/// The following illustrates how content flows from the input through the scanner.
///
/// ```ignore
/// lexer <- (peek) <- pending <- source
///       <- (pop)  <- current <- pending <- source
/// ```
///
/// As shown above, the lexer is "pulling" characters from the scanner.
///
/// The scanner keeps a two-character window over the input: `current` is the character at the
/// scanner's position, and `pending` is the character immediately after it. Both are populated
/// eagerly by [Scanner::new] and kept populated by [Scanner::advance].
///
/// When "peeking", the character in `pending` is returned and nothing moves.
///
/// When "popping" (i.e. we are advancing the scanner in the input), the character in `current` is
/// returned, the character in `pending` is moved into `current`, and the next character of the
/// input is read into `pending`.
///
/// ## End of input
///
/// There is no separate "empty" state. Once the input is exhausted, the window is filled with the
/// sentinel `'\0'`: `pending` (and [Scanner::peek_next]) report it at the byte length of the
/// input, and for an empty input `current` reports it at offset `0`.
pub struct Scanner<'input> {
    /// The complete source text being scanned.
    input: &'input str,
    /// The characters (with their byte offsets) which have not yet been loaded into `pending`.
    chars: core::iter::Peekable<core::str::CharIndices<'input>>,
    /// The character at the scanner's current position.
    current: (usize, char),
    /// The character immediately following `current`.
    pending: (usize, char),
    /// The byte offset at which the input starts.
    start: usize,
    /// The length of the input in bytes; doubles as the offset of the end-of-input sentinel.
    end: usize,
}

impl<'input> Scanner<'input> {
    /// Construct a new [Scanner] for the given `input`.
    ///
    /// The first two characters of `input` are loaded immediately, so [Scanner::read] and
    /// [Scanner::peek] are usable right away.
    ///
    /// # Panics
    ///
    /// Panics with "file too large" if `input` is `u32::MAX` bytes long or longer.
    pub fn new(input: &'input str) -> Self {
        let end = input.len();
        assert!(end < u32::MAX as usize, "file too large");

        let mut chars = input.char_indices().peekable();
        // An empty input has no first character; its sentinel sits at offset 0.
        let current = chars.next().unwrap_or((0, '\0'));
        let pending = chars.next().unwrap_or((end, '\0'));
        Self {
            input,
            chars,
            current,
            pending,
            start: 0,
            end,
        }
    }

    /// Returns the byte offset representing the start of the source
    pub fn start(&self) -> usize {
        self.start
    }

    /// Advance scanner pipeline by a single character.
    ///
    /// `pending` becomes `current`, and the next character of the input is read to repopulate
    /// `pending`. Once the input is exhausted, `pending` is set to `'\0'` at the end offset.
    #[inline]
    pub fn advance(&mut self) {
        self.current = self.pending;
        self.pending = self.chars.next().unwrap_or((self.end, '\0'));
    }

    /// Return the current character and advance our position in the source
    #[inline]
    pub fn pop(&mut self) -> (usize, char) {
        let current = self.current;
        self.advance();
        current
    }

    /// Return the next character in the input (the one following the current character), but do
    /// not advance.
    #[inline]
    pub fn peek(&self) -> (usize, char) {
        self.pending
    }

    /// Return the character after the next character in the input (two positions past the current
    /// character), but do not advance.
    ///
    /// Returns `'\0'` at the end offset when there is no such character.
    #[inline]
    pub fn peek_next(&mut self) -> (usize, char) {
        // `Peekable::peek` needs `&mut self` to buffer the item, but the scanner's position is
        // left unchanged.
        self.chars.peek().copied().unwrap_or((self.end, '\0'))
    }

    /// Get current character in the input.
    #[inline]
    pub fn read(&self) -> (usize, char) {
        self.current
    }

    /// Get a string slice representing the given range in the underlying source
    ///
    /// # Panics
    ///
    /// Panics with "invalid slice indices" if the range is out of bounds, is inverted, or does
    /// not start and end on UTF-8 character boundaries.
    #[inline]
    pub fn slice(&self, span: impl Into<Range<usize>>) -> &'input str {
        // `str::get` checks bounds and character boundaries in constant time, so the slice does
        // not have to be re-validated as UTF-8 on every call.
        self.input.get(span.into()).expect("invalid slice indices")
    }
}