chisel_lexers/
scanner.rs

1//! Implementation of an LA(1) scanner backend.
2//!
3//! # Usage
4//!
5//! Usage of the scanner is pretty straightforward. Construct an instance based on a supplied
6//! decoder (which is responsible for decoding byte streams into streams of UTF8 characters),
7//! and then use the [Scanner::advance] and [Scanner::advance_n] functions to move through the
8//! underlying input and populate the internal scanner buffer.
9//!
//! To look into the scanner buffer, use [Scanner::front] for the most recently read character
//! and [Scanner::back] for the oldest character still held.  To grab the entire contents of the
//! buffer, functions such as [Scanner::buffer_as_char_array] may be used.
13//!
14//! Once a chunk of input has been processed, the scanner state (i.e. the buffer) can be reset
15//! with a call to [Scanner::clear].
16//!
17//! # Examples
18//!
19//! ```rust
20//!  use std::io::BufReader;
21//!  use chisel_common::reader_from_bytes;
22//!  use chisel_decoders::utf8::Utf8Decoder;
23//!  use chisel_lexers::scanner::Scanner;
24//!
25//!  // construct a new scanner instance, based on a decoded byte source
26//!  let buffer: &[u8] = "let goodly sin and sunshine in".as_bytes();
27//!  let mut reader = BufReader::new(buffer);
28//!  let mut decoder = Utf8Decoder::new(&mut reader);
29//!  let mut scanner = Scanner::new(&mut decoder);
30//!  
31//! // consume from the scanner...
32//! let first = scanner.advance(true);
33//! assert!(first.is_ok());
34//! assert_eq!(scanner.front().unwrap().ch, 'l');
35//! assert_eq!(scanner.front().unwrap().coords.column, 1);
36//!
37//! // reset the scanner state
38//! scanner.clear();
39//!
40//! ```
41#![allow(dead_code)]
42use chisel_common::char::coords::Coords;
43use chisel_common::char::span::Span;
44use std::fmt::{Display, Formatter};
45
/// Result type for the scanner: either a value of type `T`, or a [ScannerError]
pub type ScannerResult<T> = Result<T, ScannerError>;
48
/// An enumeration of possible faults
///
/// The enum carries no payload, so equality is total and `Eq` is derived alongside `PartialEq`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ScannerErrorDetails {
    /// The underlying character source has been exhausted
    EndOfInput,
}

/// Convert specific fault codes into human-readable strings
impl Display for ScannerErrorDetails {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        // Exhaustive match on purpose: adding a variant must force a message to be written.
        let message = match self {
            ScannerErrorDetails::EndOfInput => "end of input reached",
        };
        f.write_str(message)
    }
}
63
/// An error raised by the scanner: a fault code plus, where available, the location in the
/// input that the fault relates to
#[derive(Debug, Clone)]
pub struct ScannerError {
    /// The error code associated with the error
    pub details: ScannerErrorDetails,
    /// [Coords] providing location information relating to the error. `None` when the error
    /// was raised without a location
    pub coords: Option<Coords>,
}
71
72/// Convert a [ScannerError] into a human-readable format
73impl Display for ScannerError {
74    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
75        match self.coords {
76            Some(coords) => write!(f, "details: {}, coords: {}", self.details, coords),
77            None => write!(f, "details: {}", self.details),
78        }
79    }
80}
81
/// Helper macro for the quick definition of a [ScannerError], already wrapped in `Err`.
///
/// * `scanner_error!(details)` builds an error with no location attached
/// * `scanner_error!(details, coords)` builds an error located at `coords`
macro_rules! scanner_error {
    ($kind:expr) => {
        Err(ScannerError {
            details: $kind,
            coords: None,
        })
    };
    ($kind:expr, $location:expr) => {
        Err(ScannerError {
            details: $kind,
            coords: Some($location),
        })
    };
}
97
98/// A [char] and a [Coord] providing positional information
99pub struct CharWithCoords {
100    pub ch: char,
101    pub coords: Coords,
102}
103
/// A [String] along with the [Span] it occupies in the input
pub struct StringWithSpan {
    /// The extracted text
    pub str: String,
    /// The start and end coordinates of the text within the input
    pub span: Span,
}
109
/// Build a fresh [CharWithCoords] duplicating the character and coordinates of an existing one.
///
/// Note that the argument expression is expanded twice (once per field).
macro_rules! clone_char_with_coords {
    ($original:expr) => {
        CharWithCoords {
            ch: $original.ch,
            coords: $original.coords.clone(),
        }
    };
}
119
/// Shorthand for assembling a [CharWithCoords] from a character and its coordinates
macro_rules! char_with_coords {
    ($character:expr, $location:expr) => {
        CharWithCoords {
            ch: $character,
            coords: $location,
        }
    };
}
129
130/// Simple scanner which wraps itself around a source of [char]s and converts raw characters
131/// into [CharWithCoords] structures. Provides a running buffer which can be used to accumulate
132/// input characters, prior to extracting them for further downstream processing.
133#[derive()]
134pub struct Scanner<'a> {
135    /// The underlying source of characters
136    source: &'a mut dyn Iterator<Item = char>,
137
138    /// Accumulation buffer
139    accumulator: Vec<CharWithCoords>,
140
141    /// Input buffer
142    buffer: Vec<CharWithCoords>,
143
144    /// Overall position
145    position: Coords,
146
147    /// Newline flag in order ensure correct position reporting
148    newline: bool,
149}
150
151/// An input adapter used by the lexer. A [Scanner] is responsible for managing input
152/// state to to provide access to segments (or individual characters) from within the source input.
153impl<'a> Scanner<'a> {
154    /// New instance, based on an [Iterator] of [char]
155    pub fn new(chars: &'a mut dyn Iterator<Item = char>) -> Self {
156        Scanner {
157            source: chars,
158            accumulator: vec![],
159            buffer: vec![],
160            position: Coords {
161                column: 0,
162                line: 1,
163                absolute: 0,
164            },
165            newline: false,
166        }
167    }
168
169    /// Reset the internal state of the scanner, without resetting the state of the underlying char iterator
170    pub fn clear(&mut self) {
171        self.accumulator = vec![];
172    }
173
174    /// Push the last read character (and it's coords) onto the pushback buffer. Noop if there's
175    /// currently nothing in the accumulator
176    pub fn pushback(&mut self) {
177        if !self.accumulator.is_empty() {
178            self.buffer.push(self.accumulator.pop().unwrap())
179        }
180    }
181
182    /// Get the absolute position in the underlying input
183    pub fn position(&self) -> Coords {
184        self.position
185    }
186
187    /// Get the optional [char] at the front of the scanner buffer
188    pub fn front(&self) -> Option<CharWithCoords> {
189        return if !self.accumulator.is_empty() {
190            Some(clone_char_with_coords!(self.accumulator.last().unwrap()))
191        } else {
192            None
193        };
194    }
195
196    /// Get the optional [char] at the back of the scanner buffer
197    pub fn back(&self) -> Option<CharWithCoords> {
198        return if !self.accumulator.is_empty() {
199            Some(clone_char_with_coords!(self.accumulator.first().unwrap()))
200        } else {
201            None
202        };
203    }
204
205    /// Advance the scanner to the next available character, optionally skipping whitespace.
206    pub fn advance(&mut self, skip_whitespace: bool) -> ScannerResult<()> {
207        loop {
208            match self.next() {
209                Some(cwc) => {
210                    // update overall position
211                    self.position.copy_from(&cwc.coords);
212
213                    // check for whitespace
214                    if skip_whitespace {
215                        if !cwc.ch.is_whitespace() {
216                            self.accumulator.push(cwc);
217                            return Ok(());
218                        }
219                    } else {
220                        self.accumulator.push(cwc);
221                        return Ok(());
222                    }
223                }
224                None => return scanner_error!(ScannerErrorDetails::EndOfInput),
225            }
226        }
227    }
228
229    /// Try and look ahead one [char] in the input stream
230    pub fn try_lookahead(&mut self) -> Option<&CharWithCoords> {
231        return if !self.buffer.is_empty() {
232            self.buffer.last()
233        } else {
234            match self.next() {
235                Some(cwc) => {
236                    self.buffer.push(cwc);
237                    self.buffer.last()
238                }
239                None => None,
240            }
241        };
242    }
243
244    /// Grab the next available character and update the current position if we retrieve a new
245    /// character from the underlying input
246    fn next(&mut self) -> Option<CharWithCoords> {
247        // early return from the buffer if possible
248        return if !self.buffer.is_empty() {
249            Some(self.buffer.pop().unwrap())
250        } else {
251            // check next character and adjust position taking into account line endings
252            match self.source.next() {
253                Some(ch) => match ch {
254                    '\n' => {
255                        self.newline = true;
256                        Some(char_with_coords!(ch, self.position.copy_increment()))
257                    }
258                    _ => {
259                        if self.newline {
260                            self.newline = false;
261                            Some(char_with_coords!(
262                                ch,
263                                self.position.copy_increment_newline()
264                            ))
265                        } else {
266                            Some(char_with_coords!(ch, self.position.copy_increment()))
267                        }
268                    }
269                },
270                None => None,
271            }
272        };
273    }
274
275    /// Advance the scanner over n available characters, returning a [ScannerError] if it's not
276    /// possible to do so. After calling this method the input state should be read using the
277    /// other associated functions available for this type
278    pub fn advance_n(&mut self, n: usize, skip_whitespace: bool) -> ScannerResult<()> {
279        for _ in 0..n {
280            self.advance(skip_whitespace)?;
281        }
282        Ok(())
283    }
284
285    /// Extract the scanner buffer as a [StringWithSpan]. Will return an empty string if there's
286    /// nothing in the buffer
287    pub fn buffer_as_string_with_span(&mut self) -> StringWithSpan {
288        return if !self.accumulator.is_empty() {
289            let mut s = String::with_capacity(self.accumulator.len());
290            self.accumulator.iter().for_each(|cwc| s.push(cwc.ch));
291            StringWithSpan {
292                str: s,
293                span: Span {
294                    start: self.back().unwrap().coords,
295                    end: self.front().unwrap().coords,
296                },
297            }
298        } else {
299            StringWithSpan {
300                str: String::new(),
301                span: Span {
302                    start: self.position,
303                    end: self.position,
304                },
305            }
306        };
307    }
308
309    /// Extract the scanner buffer as a [char] slice
310    pub fn buffer_as_char_array(&mut self) -> Vec<char> {
311        return if !self.accumulator.is_empty() {
312            let mut arr: Vec<char> = vec![];
313            self.accumulator.iter().for_each(|cwc| arr.push(cwc.ch));
314            arr
315        } else {
316            vec![]
317        };
318    }
319
320    /// Extract the scanner buffer as a byte buffer.  You just get an empty vec if the buffer is
321    /// currently empty
322    pub fn buffer_as_byte_array(&self) -> Vec<u8> {
323        return if !self.accumulator.is_empty() {
324            self.accumulator.iter().map(|cwc| cwc.ch as u8).collect()
325        } else {
326            vec![]
327        };
328    }
329}
330
/// Unit tests for the scanner, driven by in-memory byte sources decoded as UTF-8
#[cfg(test)]
mod test {
    use crate::scanner::Scanner;
    use chisel_common::reader_from_bytes;
    use chisel_decoders::utf8::Utf8Decoder;
    use std::io::BufReader;

    /// Construction over a decoder should succeed without consuming anything
    #[test]
    fn should_create_new() {
        let mut reader = reader_from_bytes!("{}[],:");
        let mut decoder = Utf8Decoder::new(&mut reader);
        let _ = Scanner::new(&mut decoder);
    }

    /// Characters and columns should be reported correctly on a single line, both when
    /// whitespace is skipped and when it is kept
    #[test]
    fn should_consume_single_lines_correctly() {
        let mut reader = reader_from_bytes!("this is a test line");
        let mut decoder = Utf8Decoder::new(&mut reader);
        let mut input = Scanner::new(&mut decoder);
        let result = input.advance(true);
        assert!(result.is_ok());
        assert_eq!(input.front().unwrap().ch, 't');
        // four more non-whitespace characters: 'h', 'i', 's', then the 'i' of "is"
        for _ in 1..5 {
            let result = input.advance(true);
            assert!(result.is_ok());
        }
        // the skipped space still counts towards the column
        assert_eq!(input.front().unwrap().ch, 'i');
        assert_eq!(input.front().unwrap().coords.column, 6);

        input.clear();
        // four characters without skipping whitespace: 's', ' ', 'a', ' '
        for _ in 1..5 {
            let result = input.advance(false);
            assert!(result.is_ok());
        }
        assert_eq!(input.front().unwrap().ch, ' ');
        assert_eq!(input.front().unwrap().coords.column, 10)
    }

    /// A pushed-back character should be returned again by the next advance, with its
    /// original coordinates preserved
    #[test]
    fn should_handle_pushbacks_correctly() {
        // construct a new scanner instance, based on a decoded byte source
        let buffer: &[u8] = "let goodly sin and sunshine in".as_bytes();
        let mut reader = BufReader::new(buffer);
        let mut decoder = Utf8Decoder::new(&mut reader);
        let mut scanner = Scanner::new(&mut decoder);

        // consume the first character from the scanner...
        let first = scanner.advance(true);
        assert!(first.is_ok());
        assert_eq!(scanner.front().unwrap().ch, 'l');
        assert_eq!(scanner.front().unwrap().coords.column, 1);

        // consume a second character
        assert!(scanner.advance(true).is_ok());

        // ...and then pushback onto the buffer
        scanner.pushback();

        // front of the buffer should still be 'l'
        assert_eq!(scanner.front().unwrap().ch, 'l');

        // advance again - this time char will be taken from the pushback buffer
        let _ = scanner.advance(true);
        assert_eq!(scanner.front().unwrap().ch, 'e');

        // grab the contents of the buffer as a string
        let buffer_contents = scanner.buffer_as_string_with_span();
        assert_eq!(buffer_contents.str, String::from("le"));

        // reset the scanner and empty the buffer
        scanner.clear();

        // buffer should now be empty
        assert!(scanner.buffer_as_string_with_span().str.is_empty());

        // advance yet again
        assert!(scanner.advance(true).is_ok());

        // the third character read will be from the 3rd column in the input
        assert_eq!(scanner.front().unwrap().ch, 't');
        assert_eq!(scanner.front().unwrap().coords.column, 3);
    }
}