Skip to main content

limbo_sqlite3_parser/lexer/
scan.rs

//! Adaptation/port of Go's [`bufio.Scanner`](https://pkg.go.dev/bufio#Scanner).
2
3use std::error::Error;
4use std::fmt;
5use std::io;
6
/// An error that can be annotated with the place in the input where it arose.
///
/// Implementors must also be constructible from an [`io::Error`].
pub trait ScanError: Sized + Error + From<io::Error> {
    /// Record where the error occurred.
    ///
    /// `line` and `column` are the scanner's current line and column
    /// (the column is counted in bytes), `offset` is the byte offset in the
    /// input.
    fn position(&mut self, line: u64, column: usize, offset: usize);
}
12
/// Outcome of one [`Splitter::split`] call.
///
/// On success the tuple holds the token, if any, as `(bytes, token type)`,
/// followed by the number of bytes to consume from the input.
type SplitResult<'input, T, E> = Result<(Option<(&'input [u8], T)>, usize), E>;
17
18/// Split function used to tokenize the input
19pub trait Splitter: Sized {
20    /// Potential error raised
21    type Error: ScanError;
22    //type Item: ?Sized;
23    /// Token generated
24    type TokenType;
25
26    /// The arguments are an initial substring of the remaining unprocessed
27    /// data.
28    ///
29    /// If the returned error is non-nil, scanning stops and the error
30    /// is returned to the client.
31    ///
32    /// The function is never called with an empty data slice.
33    fn split<'input>(
34        &mut self,
35        data: &'input [u8],
36    ) -> SplitResult<'input, Self::TokenType, Self::Error>;
37}
38
39/// Like a `BufReader` but with a growable buffer.
40/// Successive calls to the `scan` method will step through the 'tokens'
41/// of a file, skipping the bytes between the tokens.
42///
43/// Scanning stops unrecoverably at EOF, the first I/O error, or a token too
44/// large to fit in the buffer. When a scan stops, the reader may have
45/// advanced arbitrarily far past the last token.
46pub struct Scanner<S: Splitter> {
47    /// offset in `input`
48    offset: usize,
49    /// mark
50    mark: (usize, u64, usize),
51    /// The function to tokenize the input.
52    splitter: S,
53    /// current line number
54    line: u64,
55    /// current column number (byte offset, not char offset)
56    column: usize,
57}
58
59impl<S: Splitter> Scanner<S> {
60    /// Constructor
61    pub fn new(splitter: S) -> Self {
62        Self {
63            offset: 0,
64            mark: (0, 0, 0),
65            splitter,
66            line: 1,
67            column: 1,
68        }
69    }
70
71    /// Current line number
72    pub fn line(&self) -> u64 {
73        self.line
74    }
75
76    /// Current column number (byte offset, not char offset)
77    pub fn column(&self) -> usize {
78        self.column
79    }
80
81    /// Current byte offset in the source string
82    pub fn offset(&self) -> usize {
83        self.offset
84    }
85
86    /// Associated splitter
87    pub fn splitter(&self) -> &S {
88        &self.splitter
89    }
90    /// Mark current position
91    pub fn mark(&mut self) {
92        self.mark = (self.offset, self.line, self.column);
93    }
94    /// Reset to mark
95    pub fn reset_to_mark(&mut self) {
96        (self.offset, self.line, self.column) = self.mark;
97    }
98
99    /// Reset the scanner such that it behaves as if it had never been used.
100    pub fn reset(&mut self) {
101        self.offset = 0;
102        self.line = 1;
103        self.column = 1;
104    }
105}
106
/// Outcome of one `Scanner::scan` call.
///
/// On success the tuple is `(start, token, end)`: the byte offset at which
/// the token starts, the token itself as `(bytes, token type)` (or `None`
/// when no token was produced), and the scanner's byte offset afterwards.
type ScanResult<'input, T, E> = Result<(usize, Option<(&'input [u8], T)>, usize), E>;
109
110impl<S: Splitter> Scanner<S> {
111    /// Advance the Scanner to next token.
112    /// Return the token as a byte slice.
113    /// Return `None` when the end of the input is reached.
114    /// Return any error that occurs while reading the input.
115    pub fn scan<'input>(
116        &mut self,
117        input: &'input [u8],
118    ) -> ScanResult<'input, S::TokenType, S::Error> {
119        // Loop until we have a token.
120        loop {
121            // See if we can get a token with what we already have.
122            if self.offset < input.len() {
123                let data = &input[self.offset..];
124                match self.splitter.split(data) {
125                    Err(mut e) => {
126                        e.position(self.line, self.column, self.offset);
127                        return Err(e);
128                    }
129                    Ok((None, 0)) => {
130                        // Done
131                    }
132                    Ok((None, amt)) => {
133                        // Ignore/skip this data
134                        self.consume(data, amt);
135                        continue;
136                    }
137                    Ok((tok, amt)) => {
138                        let start = self.offset;
139                        self.consume(data, amt);
140                        return Ok((start, tok, self.offset));
141                    }
142                }
143            }
144            // We cannot generate a token with what we are holding.
145            // we are done.
146            return Ok((self.offset, None, self.offset));
147        }
148    }
149
150    /// Consume `amt` bytes of the buffer.
151    fn consume(&mut self, data: &[u8], amt: usize) {
152        debug_assert!(amt <= data.len());
153        for byte in &data[..amt] {
154            if *byte == b'\n' {
155                self.line += 1;
156                self.column = 1;
157            } else {
158                self.column += 1;
159            }
160        }
161        self.offset += amt;
162    }
163}
164
165impl<S: Splitter> fmt::Debug for Scanner<S> {
166    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
167        f.debug_struct("Scanner")
168            .field("offset", &self.offset)
169            .field("mark", &self.mark)
170            .field("line", &self.line)
171            .field("column", &self.column)
172            .finish()
173    }
174}