sqlite3_parser/lexer/scan.rs

//! Adaptation/port of [Go scanner](http://tip.golang.org/pkg/bufio/#Scanner).

use log::debug;

use std::error::Error;
use std::fmt;
use std::io;

/// Error with position
pub trait ScanError: Error + From<io::Error> + Sized {
    /// Update the position where the error occurs
    fn position(&mut self, line: u64, column: usize);
}

/// The `(&[u8], TokenType)` pair is the token,
/// and the `usize` is the number of bytes to consume.
type SplitResult<'input, TokenType, Error> =
    Result<(Option<(&'input [u8], TokenType)>, usize), Error>;

/// Split function used to tokenize the input
pub trait Splitter: Sized {
    /// Potential error raised
    type Error: ScanError;
    //type Item: ?Sized;
    /// Token generated
    type TokenType;

    /// The argument is an initial prefix of the remaining unprocessed
    /// data.
    ///
    /// If the function returns an error, scanning stops and the error
    /// is returned to the caller.
    ///
    /// The function is never called with an empty data slice.
    fn split<'input>(
        &mut self,
        data: &'input [u8],
    ) -> SplitResult<'input, Self::TokenType, Self::Error>;
}
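// Editorial sketch (not part of the original module): a minimal `Splitter`
// that yields whitespace-separated "words". `WordError` and `WordSplitter`
// are illustrative names introduced here, not part of the original API; they
// are exercised by the test module at the end of this file.
#[derive(Debug)]
#[allow(dead_code)]
struct WordError(io::Error);

impl fmt::Display for WordError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "word error: {}", self.0)
    }
}

impl Error for WordError {}

impl From<io::Error> for WordError {
    fn from(err: io::Error) -> Self {
        WordError(err)
    }
}

impl ScanError for WordError {
    fn position(&mut self, _line: u64, _column: usize) {
        // A real error type would record the line/column here.
    }
}

#[allow(dead_code)]
struct WordSplitter;

impl Splitter for WordSplitter {
    type Error = WordError;
    type TokenType = ();

    fn split<'input>(
        &mut self,
        data: &'input [u8],
    ) -> SplitResult<'input, Self::TokenType, Self::Error> {
        // Skip any leading ASCII whitespace without producing a token.
        let start = data
            .iter()
            .position(|b| !b.is_ascii_whitespace())
            .unwrap_or(data.len());
        if start > 0 {
            return Ok((None, start));
        }
        // Otherwise, everything up to the next whitespace byte (or the end
        // of the data) is one token.
        let end = data
            .iter()
            .position(|b| b.is_ascii_whitespace())
            .unwrap_or(data.len());
        Ok((Some((&data[..end], ())), end))
    }
}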

/// Like a `BufReader` but with a growable buffer.
/// Successive calls to the `scan` method will step through the 'tokens'
/// of a file, skipping the bytes between the tokens.
///
/// Scanning stops unrecoverably at EOF, the first I/O error, or a token too
/// large to fit in the buffer. When a scan stops, the reader may have
/// advanced arbitrarily far past the last token.
pub struct Scanner<S: Splitter> {
    /// offset in `input`
    offset: usize,
    /// mark
    mark: (usize, u64, usize),
    /// The function to tokenize the input.
    splitter: S,
    /// current line number
    line: u64,
    /// current column number (byte offset, not char offset)
    column: usize,
}

impl<S: Splitter> Scanner<S> {
    /// Constructor
    pub fn new(splitter: S) -> Self {
        Self {
            offset: 0,
            mark: (0, 0, 0),
            splitter,
            line: 1,
            column: 1,
        }
    }

    /// Current line number
    pub fn line(&self) -> u64 {
        self.line
    }

    /// Current column number (byte offset, not char offset)
    pub fn column(&self) -> usize {
        self.column
    }

    /// Associated splitter
    pub fn splitter(&self) -> &S {
        &self.splitter
    }

    /// Mark current position
    pub fn mark(&mut self) {
        self.mark = (self.offset, self.line, self.column);
    }

    /// Reset to mark
    pub fn reset_to_mark(&mut self) {
        (self.offset, self.line, self.column) = self.mark;
    }

    /// Reset the scanner such that it behaves as if it had never been used.
    pub fn reset(&mut self) {
        self.offset = 0;
        self.line = 1;
        self.column = 1;
    }
}

type ScanResult<'input, TokenType, Error> =
    Result<(usize, Option<(&'input [u8], TokenType)>, usize), Error>;

impl<S: Splitter> Scanner<S> {
    /// Advance the Scanner to the next token.
    /// Return the token as a byte slice.
    /// Return `None` when the end of the input is reached.
    /// Return any error that occurs while reading the input.
    pub fn scan<'input>(
        &mut self,
        input: &'input [u8],
    ) -> ScanResult<'input, S::TokenType, S::Error> {
        debug!(target: "scanner", "scan(line: {}, column: {})", self.line, self.column);
        // Loop until we have a token.
        loop {
            // See if we can get a token with what we already have.
            if self.offset < input.len() {
                let data = &input[self.offset..];
                match self.splitter.split(data) {
                    Err(mut e) => {
                        e.position(self.line, self.column);
                        return Err(e);
                    }
                    Ok((None, 0)) => {
                        // Done
                    }
                    Ok((None, amt)) => {
                        // Ignore/skip this data
                        self.consume(data, amt);
                        continue;
                    }
                    Ok((tok, amt)) => {
                        let start = self.offset;
                        self.consume(data, amt);
                        return Ok((start, tok, self.offset));
                    }
                }
            }
            // We cannot generate a token with what we are holding.
            // We are done.
            return Ok((self.offset, None, self.offset));
        }
    }

    /// Consume `amt` bytes of the buffer.
    fn consume(&mut self, data: &[u8], amt: usize) {
        debug!(target: "scanner", "consume({})", amt);
        debug_assert!(amt <= data.len());
        for byte in &data[..amt] {
            if *byte == b'\n' {
                self.line += 1;
                self.column = 1;
            } else {
                self.column += 1;
            }
        }
        self.offset += amt;
    }
}

impl<S: Splitter> fmt::Debug for Scanner<S> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.debug_struct("Scanner")
            .field("offset", &self.offset)
            .field("mark", &self.mark)
            .field("line", &self.line)
            .field("column", &self.column)
            .finish()
    }
}
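
// Editorial sketch (not part of the original file): a small test that drives
// `Scanner` with the illustrative `WordSplitter` defined above, showing the
// typical scan loop and the `mark`/`reset_to_mark` rewind mechanism.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn scan_words() -> Result<(), WordError> {
        let input = b"  hello   world ";
        let mut scanner = Scanner::new(WordSplitter);

        // Drive the scanner: call `scan` until it yields `None`.
        let mut words = Vec::new();
        loop {
            let (_, token, _) = scanner.scan(input)?;
            match token {
                Some((bytes, ())) => words.push(bytes.to_vec()),
                None => break,
            }
        }
        assert_eq!(words, vec![b"hello".to_vec(), b"world".to_vec()]);

        // `mark`/`reset_to_mark` rewind the scanner to a saved position.
        scanner.reset();
        scanner.mark();
        let (_, first, _) = scanner.scan(input)?;
        assert_eq!(first.map(|(b, _)| b), Some(&b"hello"[..]));
        scanner.reset_to_mark();
        let (_, again, _) = scanner.scan(input)?;
        assert_eq!(again.map(|(b, _)| b), Some(&b"hello"[..]));
        Ok(())
    }
}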