sqlite3_parser/lexer/
scan.rs

1//! Adaptation/port of [Go scanner](http://tip.golang.org/pkg/bufio/#Scanner).
2
3use log::debug;
4
5use std::error::Error;
6use std::fmt;
7use std::io;
8
/// A location in the input, expressed as a line and a column.
#[derive(Debug)]
pub struct Pos {
    /// Line number.
    pub line: usize,
    /// Column number, counted in bytes rather than characters.
    pub column: usize,
}
17
18impl Pos {
19    pub fn from(input: &[u8], offset: usize) -> Self {
20        let (mut line, mut column) = (1, 1);
21        for byte in &input[..offset] {
22            if *byte == b'\n' {
23                line += 1;
24                column = 1;
25            } else {
26                column += 1;
27            }
28        }
29        Self { line, column }
30    }
31}
32
33impl fmt::Display for Pos {
34    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
35        write!(f, "line: {}, column: {}", self.line, self.column)
36    }
37}
38
39/// Error with position
40pub trait ScanError: Error + From<io::Error> + Sized {
41    /// Update the position where the error occurs
42    fn position(&mut self, p: Pos);
43}
44
/// Outcome of a single [`Splitter::split`] call.
///
/// On success the pair holds the token, if one was recognized (its bytes
/// together with its type), followed by the number of bytes to consume.
type SplitResult<'input, T, E> = Result<(Option<(&'input [u8], T)>, usize), E>;
49
50/// Split function used to tokenize the input
51pub trait Splitter: Sized {
52    /// Potential error raised
53    type Error: ScanError;
54    //type Item: ?Sized;
55    /// Token generated
56    type TokenType: std::fmt::Debug;
57
58    /// The arguments are an initial substring of the remaining unprocessed
59    /// data.
60    ///
61    /// If the returned error is non-nil, scanning stops and the error
62    /// is returned to the client.
63    ///
64    /// The function is never called with an empty data slice.
65    fn split<'input>(
66        &mut self,
67        data: &'input [u8],
68    ) -> SplitResult<'input, Self::TokenType, Self::Error>;
69}
70
71/// Like a `BufReader` but with a growable buffer.
72/// Successive calls to the `scan` method will step through the 'tokens'
73/// of a file, skipping the bytes between the tokens.
74///
75/// Scanning stops unrecoverably at EOF, the first I/O error, or a token too
76/// large to fit in the buffer. When a scan stops, the reader may have
77/// advanced arbitrarily far past the last token.
78pub struct Scanner<S: Splitter> {
79    /// offset in `input`
80    offset: usize,
81    /// mark
82    mark: usize,
83    /// The function to tokenize the input.
84    splitter: S,
85}
86
87impl<S: Splitter> Scanner<S> {
88    /// Constructor
89    pub fn new(splitter: S) -> Self {
90        Self {
91            offset: 0,
92            mark: 0,
93            splitter,
94        }
95    }
96
97    /// Current position
98    pub fn position(&self, input: &[u8]) -> Pos {
99        Pos::from(input, self.offset)
100    }
101
102    /// Associated splitter
103    pub fn splitter(&self) -> &S {
104        &self.splitter
105    }
106    /// Mark current position
107    pub fn mark(&mut self) {
108        self.mark = self.offset;
109    }
110    /// Reset to mark
111    pub fn reset_to_mark(&mut self) {
112        self.offset = self.mark;
113    }
114
115    /// Reset the scanner such that it behaves as if it had never been used.
116    pub fn reset(&mut self) {
117        self.offset = 0;
118    }
119}
120
/// Outcome of a single `Scanner::scan` call.
///
/// On success the tuple is `(start, token, end)`: the offset at which the
/// scan result starts, the token (its bytes and its type) or `None` when no
/// further token is available, and the offset reached after consuming it.
type ScanResult<'input, T, E> = Result<(usize, Option<(&'input [u8], T)>, usize), E>;
123
124impl<S: Splitter> Scanner<S> {
125    /// Advance the Scanner to next token.
126    /// Return the token as a byte slice.
127    /// Return `None` when the end of the input is reached.
128    /// Return any error that occurs while reading the input.
129    pub fn scan<'input>(
130        &mut self,
131        input: &'input [u8],
132    ) -> ScanResult<'input, S::TokenType, S::Error> {
133        debug!(target: "scanner", "scan({})", Pos::from(input, self.offset));
134        // Loop until we have a token.
135        loop {
136            // See if we can get a token with what we already have.
137            if self.offset < input.len() {
138                let data = &input[self.offset..];
139                match self.splitter.split(data) {
140                    Err(mut e) => {
141                        e.position(Pos::from(input, self.offset));
142                        return Err(e);
143                    }
144                    Ok((None, 0)) => {
145                        // Done
146                    }
147                    Ok((None, amt)) => {
148                        // Ignore/skip this data
149                        self.consume(data, amt);
150                        continue;
151                    }
152                    Ok((tok, amt)) => {
153                        let start = self.offset;
154                        self.consume(data, amt);
155                        debug!(target: "scanner", "scan(start: {}, tok: {:?}, offset: {})", start, tok, self.offset);
156                        return Ok((start, tok, self.offset));
157                    }
158                }
159            }
160            // We cannot generate a token with what we are holding.
161            // we are done.
162            return Ok((self.offset, None, self.offset));
163        }
164    }
165
166    /// Consume `amt` bytes of the buffer.
167    fn consume(&mut self, data: &[u8], amt: usize) {
168        debug!(target: "scanner", "consume({amt})");
169        debug_assert!(amt <= data.len());
170        self.offset += amt;
171    }
172}
173
174impl<S: Splitter> fmt::Debug for Scanner<S> {
175    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
176        f.debug_struct("Scanner")
177            .field("offset", &self.offset)
178            .field("mark", &self.mark)
179            .finish()
180    }
181}