use std::ops::Range;

// =================================================================
// Errors
// =================================================================

/// Indicates that a particular kind of token was expected, but that
/// something else was actually found.
#[derive(Debug)]
pub enum SnapError<T>
where T:Clone+Copy+PartialEq {
    /// Expected a token of the given kind, but found this token.
    Expected(T,Span<T>),
    /// Expected a token of one of the given kinds, but found this
    /// token.
    ExpectedIn(Vec<T>,Span<T>)
}

/// The type of result returned from `snap()` and related methods.
pub type SnapResult<T> = Result<Span<T>,SnapError<T>>;

// =================================================================
// Region
// =================================================================

/// Basically the same as `std::ops::Range`, but implements `Copy`.
/// Note, like `Range`, this is _half open_.  That means `start`
/// identifies the first index in the region, whilst `end` is one past
/// the last index.
#[derive(Clone,Copy,Debug,PartialEq)]
pub struct Region {
    pub start: usize,
    pub end: usize
}

impl Region {
    pub fn new(start: usize, end: usize) -> Self {
        Self {start,end}
    }
    /// Determine the number of items this region covers.
    pub fn len(&self) -> usize {
        self.end - self.start
    }

    /// Shift the region by `delta` positions up the underlying
    /// sequence (i.e. both `start` and `end` increase by `delta`).
    pub fn shift(&mut self, delta: usize) {
        self.start += delta;
        self.end += delta;
    }
}

/// Simple mechanism for constructing a `Region` from a `Range`.
impl From<Range<usize>> for Region {
    fn from(r: Range<usize>) -> Region {
       Region{start:r.start,end:r.end}
    }
}

/// Simple mechanism for extracting a `Range` from a `Region`.  Note
/// that implementing `From` in this direction also provides `Into`
/// for free.
impl From<Region> for Range<usize> {
    fn from(r: Region) -> Range<usize> { r.start .. r.end }
}
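
// A minimal sanity check of `Region` semantics (half-open bounds,
// shifting, and conversion to/from `Range`); purely illustrative.
#[cfg(test)]
mod region_examples {
    use super::*;

    #[test]
    fn half_open_semantics() {
        let mut r = Region::from(0..3);
        // Covers indices 0,1,2 (end is one past the last index).
        assert_eq!(r.len(), 3);
        r.shift(2);
        assert_eq!(r, Region::new(2, 5));
        let range: Range<usize> = r.into();
        assert_eq!(range, 2..5);
    }
}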

// =================================================================
// Token
// =================================================================

/// Represents a single token: a kind, together with the (half open)
/// region of the underlying sequence which it covers.
#[derive(Clone,Copy,Debug,PartialEq)]
pub struct Span<T>
where T:Clone+Copy+PartialEq {
    /// Type of the token
    pub kind : T,
    /// Identifies the (half open) region in the sequence.
    pub region: Region
}

impl<T> Span<T>
where T:Clone+Copy+PartialEq {

    pub fn new(kind: T, range: Range<usize>) -> Self {
        Self { kind, region: Region::from(range) }
    }

    /// Get the first index of this token.
    pub fn start(&self) -> usize {
        self.region.start
    }

    /// Get the end of this token (that is, one past its last item).
    pub fn end(&self) -> usize {
        self.region.end
    }

    /// Get the length (in items) of this token.
    pub fn len(&self) -> usize {
        self.region.len()
    }

    /// Extract the underlying region covered by this span as a
    /// `Range`.  This is really just for convenience.
    pub fn range(&self) -> Range<usize> { self.start() .. self.end() }

    /// Shift the span to a different position in the underlying
    /// sequence.  The position is taken as a delta from the current
    /// position (e.g. `delta==1` means we shift one up the sequence).
    pub fn shift(&mut self, delta: usize) {
        self.region.shift(delta);
    }
}
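
// An illustrative example of constructing and shifting a `Span`; the
// `Tok` kind below is hypothetical.
#[cfg(test)]
mod span_examples {
    use super::*;

    #[derive(Clone,Copy,Debug,PartialEq)]
    enum Tok { Word }

    #[test]
    fn shifting_a_span() {
        let mut s = Span::new(Tok::Word, 0..4);
        assert_eq!((s.start(), s.end(), s.len()), (0, 4, 4));
        // Move the span three positions up the underlying sequence.
        s.shift(3);
        assert_eq!(s.range(), 3..7);
    }
}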

// =================================================================
// Tokenizer
// =================================================================

/// Provides a generic description of something which splits items in
/// the input sequence up into tokens.
pub trait Tokenizer {
    /// Identifies items in the underlying sequence being tokenized.
    type Item;
    /// Identifies the token type produced by this tokenizer.
    type Token:Clone+Copy+PartialEq;
    /// Responsible for producing a token from a given position in the
    /// input.  Note that this may be called on an empty input (once
    /// the end of the underlying sequence is reached), in which case
    /// it must produce a suitable (zero-length) token.
    fn scan(&self, input: &[Self::Item]) -> Span<Self::Token>;
}
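
// A minimal sketch of implementing `Tokenizer` over a sequence of
// `char`s; the `Kind` enum and `CharTokenizer` below are purely
// illustrative.  Note how `scan` produces a zero-length `Eof` token
// on empty input, as required above.
#[cfg(test)]
mod tokenizer_example {
    use super::*;

    /// A hypothetical token kind, for illustration only.
    #[derive(Clone,Copy,Debug,PartialEq)]
    pub enum Kind { Digit, Letter, Eof }

    /// A toy tokenizer which classifies one character at a time.
    pub struct CharTokenizer;

    impl Tokenizer for CharTokenizer {
        type Item = char;
        type Token = Kind;

        fn scan(&self, input: &[char]) -> Span<Kind> {
            match input.first() {
                None => Span::new(Kind::Eof, 0..0),
                Some(c) if c.is_ascii_digit() => Span::new(Kind::Digit, 0..1),
                Some(_) => Span::new(Kind::Letter, 0..1),
            }
        }
    }
}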

// =================================================================
// Lexer
// =================================================================

/// Provides machinery for splitting up an _underlying sequence_ of
/// items into a sequence of tokens, where each token can correspond
/// to one or more items in the underlying sequence.
pub struct Lexer<T:Tokenizer> {
    /// Underlying sequence being tokenised
    input: Vec<T::Item>,
    /// Current position in the underlying sequence
    offset: usize,
    /// Responsible for dividing items into tokens
    tokeniser: T
}

impl<T:Tokenizer> Lexer<T> {
    /// Construct a new lexer for a given input sequence.
    pub fn new(input: Vec<T::Item>, tokeniser: T) -> Self {
        Self { input, offset: 0, tokeniser }
    }

    /// Check whether the lexer has reached the end of the input
    /// sequence or not.
    pub fn is_eof(&self) -> bool {
        self.offset >= self.input.len()
    }

    /// Get the slice which corresponds to a given span from the
    /// underlying sequence.
    pub fn get(&self, span: Span<T::Token>) -> &[T::Item] {
        &self.input[span.range()]
    }

    /// Peek at the next token in the sequence without advancing past
    /// it.  At the end of the sequence, this returns whatever
    /// (zero-length) token the tokenizer produces for an empty input.
    pub fn peek(&self) -> Span<T::Token> { self.scan(self.offset) }

    /// Get the next token in the sequence, advancing past it.  At the
    /// end of the sequence, this returns whatever (zero-length) token
    /// the tokenizer produces for an empty input.
    pub fn next(&mut self) -> Span<T::Token> {
        let t = self.scan(self.offset);
        self.offset = t.end();
        t
    }

    /// Match a given token type in the current stream.  If the kind
    /// matches, then the token stream advances.  Otherwise, it
    /// remains at the same position and an error is returned.
    pub fn snap(&mut self, kind: T::Token) -> SnapResult<T::Token> {
        // Peek at the next token
        let lookahead = self.peek();
        // Check it!
        if lookahead.kind == kind {
            // Accept it
            self.next();
            //
            Ok(lookahead)
        } else {
            // Reject
            Err(SnapError::Expected(kind,lookahead))
        }
    }

    /// Match one of several candidate token types in the current
    /// stream.  If one of the candidates matches, then the token
    /// stream advances.  Otherwise, it remains at the same position
    /// and an error is returned.
    pub fn snap_any(&mut self, kinds: &[T::Token]) -> SnapResult<T::Token> {
        for k in kinds {
            if let Ok(tok) = self.snap(*k) {
                return Ok(tok);
            }
        }
        // Reject
        Err(SnapError::ExpectedIn(kinds.to_vec(),self.peek()))
    }

    /// Scan the token starting at a given position in the input.  The
    /// actual work is offloaded to the tokenizer, after which the
    /// resulting span is shifted into position.
    fn scan(&self, start: usize) -> Span<T::Token> {
        // Scan next token
        let mut span = self.tokeniser.scan(&self.input[start..]);
        // Shift to correct position
        span.shift(start);
        // Done
        span
    }
}
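
// A minimal end-to-end sketch of driving a `Lexer`, reusing the toy
// `CharTokenizer` defined above; all names are illustrative.
#[cfg(test)]
mod lexer_examples {
    use super::*;
    use super::tokenizer_example::{CharTokenizer,Kind};

    #[test]
    fn snapping_tokens() {
        let input: Vec<char> = "a1".chars().collect();
        let mut lexer = Lexer::new(input, CharTokenizer);
        // `snap` only advances the stream on a match ...
        assert!(lexer.snap(Kind::Digit).is_err());
        // ... so the letter is still there to be matched.
        let tok = lexer.snap(Kind::Letter).unwrap();
        assert_eq!(lexer.get(tok), &['a']);
        // `snap_any` tries each candidate in turn.
        assert!(lexer.snap_any(&[Kind::Digit,Kind::Letter]).is_ok());
        assert!(lexer.is_eof());
    }
}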

// =================================================================
// Table Tokenizer
// =================================================================

/// Defines a very simple concept of a stateless scanner: a function
/// which either recognises a token at the start of the input, or
/// fails.  Tokenizers can be built out of scanners, for example.
pub type Scanner<S,T> = fn(&[S])->Result<Span<T>,()>;

/// A tokenizer constructed from one or more scanners, which are tried
/// in order of appearance.
pub struct TableTokenizer<S,T>
where T: Copy+Clone+PartialEq {
    /// The table of tokenizers to use for scanning.
    table : Vec<Scanner<S,T>>
}

impl<S,T> TableTokenizer<S,T>
where T: Copy+Clone+PartialEq {
    /// Construct a new tokenizer from a given table.
    pub fn new(table: Vec<Scanner<S,T>>) -> Self {
        Self{table}
    }
}

impl<S,T> Tokenizer for TableTokenizer<S,T>
where T: Copy+Clone+PartialEq {
    type Item = S;
    type Token = T;

    fn scan(&self, input: &[Self::Item]) -> Span<Self::Token> {
        for scanner in &self.table {
            if let Ok(span) = scanner(input) {
                return span;
            }
        }
        // No scanner in the table matched at this position.
        panic!("no scanner matched the input");
    }
}
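
// A sketch of building a `TableTokenizer` from plain scanner
// functions; the scanners below are illustrative only.  Note that the
// table must include a scanner which accepts the empty input (here
// `scan_eof`), since otherwise `scan` panics at the end of the
// sequence.
#[cfg(test)]
mod table_examples {
    use super::*;
    use super::tokenizer_example::Kind;

    fn scan_digit(input: &[char]) -> Result<Span<Kind>,()> {
        match input.first() {
            Some(c) if c.is_ascii_digit() => Ok(Span::new(Kind::Digit, 0..1)),
            _ => Err(())
        }
    }

    fn scan_eof(input: &[char]) -> Result<Span<Kind>,()> {
        if input.is_empty() { Ok(Span::new(Kind::Eof, 0..0)) } else { Err(()) }
    }

    #[test]
    fn scanners_tried_in_order() {
        let table: Vec<Scanner<char,Kind>> = vec![scan_digit, scan_eof];
        let mut lexer = Lexer::new("7".chars().collect(), TableTokenizer::new(table));
        assert_eq!(lexer.next().kind, Kind::Digit);
        assert_eq!(lexer.peek().kind, Kind::Eof);
    }
}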