1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
//! Adaptation/port of [Go scanner](http://tip.golang.org/pkg/bufio/#Scanner).

use log::debug;

use std::error::Error;
use std::fmt;
use std::io;

/// Error type produced while scanning.
///
/// Implementors must be convertible from an [`io::Error`] and must be able to
/// record the position at which the error occurred.
pub trait ScanError: Error + From<io::Error> + Sized {
    /// Records the position associated with this error.
    ///
    /// `Scanner::scan` calls this with its current line and column before
    /// returning an error reported by the splitter.
    fn position(&mut self, line: u64, column: usize);
}

/// Result of one `Splitter::split` call: `(token, amt)` on success.
///
/// `token` is the optional `(&[u8], TokenType)` pair (the token's bytes and
/// its `TokenType`), and `amt` is the number of bytes to consume.
type SplitResult<'input, TokenType, Error> =
    Result<(Option<(&'input [u8], TokenType)>, usize), Error>;

/// Split function used to tokenize the input.
pub trait Splitter: Sized {
    /// Error reported when splitting fails.
    type Error: ScanError;
    /// Value returned alongside the bytes of each token.
    type TokenType;

    /// Tries to extract the next token from `data`, an initial substring of
    /// the remaining unprocessed input.
    ///
    /// On success the result is `(token, amt)`, which `Scanner::scan`
    /// interprets as follows:
    ///
    /// * `(None, 0)`: no token can be produced; the scan returns no token.
    /// * `(None, amt)` with `amt > 0`: `amt` bytes are skipped and `split`
    ///   is called again on what remains.
    /// * `(Some(..), amt)`: `amt` bytes are consumed and the token is
    ///   returned to the client.
    ///
    /// If an error is returned, scanning stops and the error is returned to
    /// the client.
    ///
    /// The function is never called with an empty data slice.
    fn split<'input>(
        &mut self,
        data: &'input [u8],
    ) -> SplitResult<'input, Self::TokenType, Self::Error>;
}

/// Tokenizer state driven by a [`Splitter`].
///
/// The scanner does not own the input: the byte slice to tokenize is passed
/// to each `scan` call, and the scanner only tracks how far into it scanning
/// has progressed. Successive calls to the `scan` method will step through
/// the 'tokens' of the input, skipping the bytes between the tokens.
///
/// A scan yields no token once the offset reaches the end of the input or the
/// splitter reports that no token can be produced, and it fails with the
/// first error returned by the splitter.
pub struct Scanner<S: Splitter> {
    /// Byte offset into the input of the next unprocessed byte.
    offset: usize,
    /// Position saved by `mark`, stored as `(offset, line, column)`.
    mark: (usize, u64, usize),
    /// The function to tokenize the input.
    splitter: S,
    /// Current line number.
    line: u64,
    /// Current column number (byte offset, not char offset).
    column: usize,
}

impl<S: Splitter> Scanner<S> {
    /// Mark held by a scanner on which [`Scanner::mark`] has not been called:
    /// the start of the input, as `(offset, line, column)`.
    ///
    /// Lines and columns are 1-based, so the start is `(0, 1, 1)`.
    const START_MARK: (usize, u64, usize) = (0, 1, 1);

    /// Creates a scanner positioned at the start of the input
    /// (offset 0, line 1, column 1) that tokenizes with `splitter`.
    pub fn new(splitter: S) -> Scanner<S> {
        Scanner {
            offset: 0,
            mark: Self::START_MARK,
            splitter,
            line: 1,
            column: 1,
        }
    }

    /// Current line number
    pub fn line(&self) -> u64 {
        self.line
    }

    /// Current column number (byte offset, not char offset)
    pub fn column(&self) -> usize {
        self.column
    }

    /// Shared reference to the splitter used to tokenize the input.
    pub fn splitter(&self) -> &S {
        &self.splitter
    }

    /// Saves the current position (offset, line and column) so that it can be
    /// restored later with [`Scanner::reset_to_mark`].
    pub fn mark(&mut self) {
        self.mark = (self.offset, self.line, self.column);
    }

    /// Restores the position saved by the last call to [`Scanner::mark`].
    ///
    /// If no mark was set since creation or since the last
    /// [`Scanner::reset`], this rewinds to the start of the input.
    pub fn reset_to_mark(&mut self) {
        (self.offset, self.line, self.column) = self.mark;
    }

    /// Reset the scanner such that it behaves as if it had never been used.
    ///
    /// The saved mark is cleared as well; otherwise a later
    /// [`Scanner::reset_to_mark`] would jump back to a position belonging to
    /// the previously scanned input.
    pub fn reset(&mut self) {
        self.offset = 0;
        self.line = 1;
        self.column = 1;
        self.mark = Self::START_MARK;
    }
}

/// Result of `Scanner::scan`: `(start, token, end)` on success.
///
/// `start` and `end` are the scanner's offsets into the input before and
/// after the token was consumed. `token` is the optional
/// `(&[u8], TokenType)` pair; when it is `None`, `start` and `end` are equal.
type ScanResult<'input, TokenType, Error> =
    Result<(usize, Option<(&'input [u8], TokenType)>, usize), Error>;

impl<S: Splitter> Scanner<S> {
    /// Advances the scanner to the next token of `input`.
    ///
    /// On success returns `(start, token, end)`: `token` holds the token's
    /// bytes together with its `TokenType`, and `start`/`end` are the offsets
    /// into `input` before and after the bytes consumed for it.
    ///
    /// `token` is `None` (and `start == end`) when the end of the input has
    /// been reached or the splitter cannot produce a token from what remains.
    ///
    /// # Errors
    ///
    /// Returns the splitter's error, after recording the current line and
    /// column on it.
    pub fn scan<'input>(
        &mut self,
        input: &'input [u8],
    ) -> ScanResult<'input, S::TokenType, S::Error> {
        debug!(target: "scanner", "scan(line: {}, column: {})", self.line, self.column);
        // Keep asking the splitter until it yields a token or gives up.
        while self.offset < input.len() {
            let rest = &input[self.offset..];
            let (line, column) = (self.line, self.column);
            let (token, amt) = self.splitter.split(rest).map_err(|mut err| {
                err.position(line, column);
                err
            })?;
            if token.is_none() {
                if amt == 0 {
                    // Nothing to skip and nothing to return: we are done.
                    break;
                }
                // Bytes between tokens: skip them and try again.
                self.consume(rest, amt);
                continue;
            }
            let start = self.offset;
            self.consume(rest, amt);
            return Ok((start, token, self.offset));
        }
        // No token can be generated from what is left.
        Ok((self.offset, None, self.offset))
    }

    /// Advances past the first `amt` bytes of `data`, updating the line and
    /// column counters: each `\n` starts a new line at column 1, and every
    /// other byte moves one column forward.
    fn consume(&mut self, data: &[u8], amt: usize) {
        debug!(target: "scanner", "consume({})", amt);
        debug_assert!(amt <= data.len());
        let (line, column) = data[..amt].iter().fold(
            (self.line, self.column),
            |(line, column), &byte| {
                if byte == b'\n' {
                    (line + 1, 1)
                } else {
                    (line, column + 1)
                }
            },
        );
        self.line = line;
        self.column = column;
        self.offset += amt;
    }
}

impl<S: Splitter> fmt::Debug for Scanner<S> {
    /// Formats the scanner's positional state (`offset`, `mark`, `line`,
    /// `column`); the splitter is not included in the output.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self {
            offset,
            mark,
            line,
            column,
            splitter: _,
        } = self;
        let mut out = f.debug_struct("Scanner");
        out.field("offset", offset);
        out.field("mark", mark);
        out.field("line", line);
        out.field("column", column);
        out.finish()
    }
}