libsql_sqlite3_parser/lexer/
scan.rs

//! Adaptation/port of [Go scanner](https://pkg.go.dev/bufio#Scanner).
2
3use log::trace;
4
5use std::error::Error;
6use std::fmt;
7use std::io;
8
/// Error type produced by a [`Splitter`] while tokenizing.
///
/// Implementors must be constructible from an `io::Error` and must be able
/// to record the input position at which scanning failed.
pub trait ScanError: Error + From<io::Error> + Sized {
    /// Attach the position (`line`, 1-based; `column`, byte offset) at which
    /// the error occurred. Called by the scanner before the error is
    /// returned to the caller.
    fn position(&mut self, line: u64, column: usize);
}
12
/// Result of one [`Splitter::split`] call.
///
/// On success, the `Option<(&[u8], TokenType)>` is the recognized token
/// (`None` means no token was produced), and the `usize` is the number of
/// input bytes to consume.
type SplitResult<'input, TokenType, Error> =
    Result<(Option<(&'input [u8], TokenType)>, usize), Error>;
17
/// Split function used to tokenize the input.
pub trait Splitter: Sized {
    /// Error type returned when splitting fails; must carry position info.
    type Error: ScanError;
    //type Item: ?Sized;
    /// Kind tag attached to each recognized token.
    type TokenType;

    /// The argument is an initial substring of the remaining unprocessed
    /// data.
    ///
    /// If an `Err` is returned, scanning stops and the error
    /// is returned to the client.
    ///
    /// The function is never called with an empty data slice.
    fn split<'input>(
        &mut self,
        data: &'input [u8],
    ) -> SplitResult<'input, Self::TokenType, Self::Error>;
}
36
/// Like a `BufReader` but with a growable buffer.
/// Successive calls to the `scan` method will step through the 'tokens'
/// of a file, skipping the bytes between the tokens.
///
/// Scanning stops unrecoverably at EOF, the first I/O error, or a token too
/// large to fit in the buffer. When a scan stops, the reader may have
/// advanced arbitrarily far past the last token.
pub struct Scanner<S: Splitter> {
    /// offset in `input`
    offset: usize,
    /// saved position as `(offset, line, column)`; set by `mark`,
    /// restored by `reset_to_mark`
    mark: (usize, u64, usize),
    /// The function to tokenize the input.
    splitter: S,
    /// current line number (1-based)
    line: u64,
    /// current column number (byte offset, not char offset)
    column: usize,
}
56
57impl<S: Splitter> Scanner<S> {
58    pub fn new(splitter: S) -> Scanner<S> {
59        Scanner {
60            offset: 0,
61            mark: (0, 0, 0),
62            splitter,
63            line: 1,
64            column: 1,
65        }
66    }
67
68    /// Current line number
69    pub fn line(&self) -> u64 {
70        self.line
71    }
72
73    /// Current column number (byte offset, not char offset)
74    pub fn column(&self) -> usize {
75        self.column
76    }
77
78    pub fn splitter(&self) -> &S {
79        &self.splitter
80    }
81
82    pub fn mark(&mut self) {
83        self.mark = (self.offset, self.line, self.column);
84    }
85    pub fn reset_to_mark(&mut self) {
86        (self.offset, self.line, self.column) = self.mark;
87    }
88
89    /// Reset the scanner such that it behaves as if it had never been used.
90    pub fn reset(&mut self) {
91        self.offset = 0;
92        self.line = 1;
93        self.column = 1;
94    }
95
96    pub(crate) fn offset(&self) -> usize {
97        self.offset
98    }
99}
100
/// Result of one [`Scanner::scan`] call:
/// `(token start offset, optional token, token end offset)`.
type ScanResult<'input, TokenType, Error> =
    Result<(usize, Option<(&'input [u8], TokenType)>, usize), Error>;
103
104impl<S: Splitter> Scanner<S> {
105    /// Advance the Scanner to next token.
106    /// Return the token as a byte slice.
107    /// Return `None` when the end of the input is reached.
108    /// Return any error that occurs while reading the input.
109    pub fn scan<'input>(
110        &mut self,
111        input: &'input [u8],
112    ) -> ScanResult<'input, S::TokenType, S::Error> {
113        trace!(target: "scanner", "scan(line: {}, column: {})", self.line, self.column);
114        // Loop until we have a token.
115        loop {
116            // See if we can get a token with what we already have.
117            if self.offset < input.len() {
118                let data = &input[self.offset..];
119                match self.splitter.split(data) {
120                    Err(mut e) => {
121                        e.position(self.line, self.column);
122                        return Err(e);
123                    }
124                    Ok((None, 0)) => {
125                        // Done
126                    }
127                    Ok((None, amt)) => {
128                        // Ignore/skip this data
129                        self.consume(data, amt);
130                        continue;
131                    }
132                    Ok((tok, amt)) => {
133                        let start = self.offset;
134                        self.consume(data, amt);
135                        return Ok((start, tok, self.offset));
136                    }
137                }
138            }
139            // We cannot generate a token with what we are holding.
140            // we are done.
141            return Ok((self.offset, None, self.offset));
142        }
143    }
144
145    /// Consume `amt` bytes of the buffer.
146    fn consume(&mut self, data: &[u8], amt: usize) {
147        trace!(target: "scanner", "consume({})", amt);
148        debug_assert!(amt <= data.len());
149        for byte in &data[..amt] {
150            if *byte == b'\n' {
151                self.line += 1;
152                self.column = 1;
153            } else {
154                self.column += 1;
155            }
156        }
157        self.offset += amt;
158    }
159}
160
161impl<S: Splitter> fmt::Debug for Scanner<S> {
162    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
163        f.debug_struct("Scanner")
164            .field("offset", &self.offset)
165            .field("mark", &self.mark)
166            .field("line", &self.line)
167            .field("column", &self.column)
168            .finish()
169    }
170}