1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
// Copyright (c) ZeroC, Inc.
use super::super::common::SourceBlock;
use super::tokens::*;
use crate::slice_file::Location;
use std::iter::Peekable;
use std::str::Chars;
type LexerResult<'a> = Result<Token<'a>, Error>;
/// Converts a string into a stream of tokens representing blocks of source code and preprocessor tokens.
///
/// This token stream is in turn consumed by the [preprocessor parser](super::parser::Preprocessor) which parses the
/// tokens and evaluates the preprocessor directives represented by them.
#[derive(Debug)]
pub struct Lexer<'input> {
    /// The string that this lexer is lexing over.
    input: &'input str,

    /// Iterator over the characters in the input string.
    /// This is what the lexer actually operates on, by peeking at and consuming codepoints from this buffer.
    buffer: Peekable<Chars<'input>>,

    /// The lexer's current position in the buffer, as a byte offset into `input`.
    /// Advanced by each consumed character's UTF-8 length, so it always lies on a `char` boundary
    /// and can be used to slice `input` directly.
    position: usize,

    /// The lexer's current [location](crate::slice_file::Location) in the input string.
    /// Used to tag tokens with their starting and ending locations in the input.
    cursor: Location,

    /// The current mode of the lexer; controls how the input is tokenized in a context-dependent manner.
    mode: LexerMode,
}
impl<'input> Lexer<'input> {
    /// Creates a new lexer over the provided input, starting in [`LexerMode::Unknown`].
    pub fn new(input: &'input str) -> Self {
        Lexer {
            input,
            buffer: input.chars().peekable(),
            position: 0,
            cursor: Location::default(),
            mode: LexerMode::Unknown,
        }
    }

    /// Consumes the next character in the buffer and moves the lexer's cursor forward accordingly.
    /// A newline moves the cursor to column 1 of the next row; any other character advances the column.
    fn advance_buffer(&mut self) {
        // Consume the next character and check if it's a newline.
        if let Some(c) = self.buffer.next() {
            // `position` is a byte offset, so advance it by the character's UTF-8 length.
            self.position += c.len_utf8();
            if c == '\n' {
                self.cursor.row += 1;
                self.cursor.col = 1;
            } else {
                self.cursor.col += 1;
            }
        }
    }

    /// Skips characters in the buffer until end-of-line (doesn't consume the EOL) or end-of-buffer is reached.
    /// After calling this function, the next char will be '\n' or `None` (end-of-buffer).
    fn advance_to_end_of_line(&mut self) {
        // Loop while the next character is not '\n'.
        while matches!(self.buffer.peek(), Some(c) if *c != '\n') {
            self.advance_buffer(); // Consume the character.
        }
    }

    /// Skips over inline whitespace characters (whitespace other than '\n') in the buffer.
    /// After calling this function, the next char will be '\n', a non-whitespace character, or `None` (end-of-buffer).
    fn skip_inline_whitespace(&mut self) {
        // Loop while the next character in the buffer is whitespace (except '\n').
        while matches!(self.buffer.peek(), Some(c) if (c.is_whitespace() && *c != '\n')) {
            self.advance_buffer(); // Consume the character.
        }
    }

    /// Reads, consumes, and returns a string of ASCII alphanumeric/underscore characters from the buffer.
    /// After calling this function, the next char will be a non-identifier character or `None` (end-of-buffer).
    fn read_identifier(&mut self) -> &'input str {
        let start_position = self.position;

        // Loop while the next character in the buffer is an alphanumeric or underscore.
        while matches!(self.buffer.peek(), Some(c) if (c.is_ascii_alphanumeric() || *c == '_')) {
            self.advance_buffer(); // Consume the character.
        }

        // `start_position` and `position` are both byte offsets on char boundaries, so slicing is safe.
        &self.input[start_position..self.position]
    }

    /// Constructs and returns a preprocessor token representing a block of source code.
    /// This function assumes that the lexer's cursor is at the end of the token being created.
    fn create_source_block_token(
        &self,
        start_location: Location,
        start_position: usize,
        end_position: usize,
    ) -> Token<'input> {
        let source_block = TokenKind::SourceBlock(SourceBlock {
            content: &self.input[start_position..end_position],
            start: start_location,
            end: self.cursor,
        });
        (start_location, source_block, self.cursor)
    }

    /// Consumes a single character from the lexer's buffer and returns a token of the specified kind.
    /// This is a convenience function for the common case where a token's lexeme is a single character.
    fn return_simple_token(&mut self, token: TokenKind<'input>, start: Location) -> LexerResult<'input> {
        self.advance_buffer(); // Consume the token from the buffer.
        Ok((start, token, self.cursor)) // Return it.
    }

    /// Attempts to read and return a preprocessor directive token from the buffer.
    ///
    /// Returns `Some(Ok(x))` when the next token (`x`) was lexed successfully, `Some(Err(y))` when an
    /// error occurred during lexing, and `None` when the characters produced no token at all (currently
    /// only for `//` comments, which are skipped entirely).
    ///
    /// # Panics
    /// Panics if called on inline whitespace; callers must skip inline whitespace first.
    fn lex_next_preprocessor_token(&mut self, c: char) -> Option<LexerResult<'input>> {
        let start_location = self.cursor;
        match c {
            '(' => Some(self.return_simple_token(TokenKind::LeftParenthesis, start_location)),
            ')' => Some(self.return_simple_token(TokenKind::RightParenthesis, start_location)),
            '!' => Some(self.return_simple_token(TokenKind::Not, start_location)),
            '&' => {
                self.advance_buffer(); // Consume the '&' character.
                // Ensure the next character is also an '&' (since the whole token should be "&&").
                if matches!(self.buffer.peek(), Some('&')) {
                    Some(self.return_simple_token(TokenKind::And, start_location))
                } else {
                    let error = ErrorKind::UnknownSymbol {
                        symbol: "&".to_owned(),
                        suggestion: Some("&&".to_owned()),
                    };
                    Some(Err((start_location, error, self.cursor)))
                }
            }
            '|' => {
                self.advance_buffer(); // Consume the '|' character.
                // Ensure the next character is also a '|' (since the whole token should be "||").
                if matches!(self.buffer.peek(), Some('|')) {
                    Some(self.return_simple_token(TokenKind::Or, start_location))
                } else {
                    let error = ErrorKind::UnknownSymbol {
                        symbol: "|".to_owned(),
                        suggestion: Some("||".to_owned()),
                    };
                    Some(Err((start_location, error, self.cursor)))
                }
            }
            '#' => {
                self.advance_buffer(); // Consume the '#' character.
                self.skip_inline_whitespace(); // Consume any inline whitespace characters.
                let identifier = self.read_identifier(); // Reads and consumes an identifier from the buffer.
                match identifier {
                    "define" => Some(Ok((start_location, TokenKind::DefineKeyword, self.cursor))),
                    "undef" => Some(Ok((start_location, TokenKind::UndefineKeyword, self.cursor))),
                    "if" => Some(Ok((start_location, TokenKind::IfKeyword, self.cursor))),
                    "elif" => Some(Ok((start_location, TokenKind::ElifKeyword, self.cursor))),
                    "else" => Some(Ok((start_location, TokenKind::ElseKeyword, self.cursor))),
                    "endif" => Some(Ok((start_location, TokenKind::EndifKeyword, self.cursor))),
                    "" => Some(Err((start_location, ErrorKind::MissingDirective, self.cursor))),
                    keyword => {
                        let error = ErrorKind::UnknownDirective {
                            keyword: keyword.to_owned(),
                        };
                        Some(Err((start_location, error, self.cursor)))
                    }
                }
            }
            '/' => {
                self.advance_buffer(); // Consume the '/' character.
                match self.buffer.peek() {
                    Some('/') => {
                        // Consume the rest of the line, ending at either `\n` or `EOF`.
                        self.advance_to_end_of_line();
                        None
                    }
                    _ => {
                        let error = ErrorKind::UnknownSymbol {
                            symbol: "/".to_owned(),
                            suggestion: Some("//".to_owned()),
                        };
                        Some(Err((start_location, error, self.cursor)))
                    }
                }
            }
            '\n' => {
                // End of line also means the end of a preprocessor directive.
                // The '\n' itself is deliberately NOT consumed here; the iterator's main loop handles it.
                self.mode = LexerMode::Unknown;
                Some(Ok((start_location, TokenKind::DirectiveEnd, start_location)))
            }
            ch if ch.is_ascii_alphabetic() => {
                let identifier = self.read_identifier();
                Some(Ok((start_location, TokenKind::Identifier(identifier), self.cursor)))
            }
            ch if !ch.is_whitespace() => {
                self.advance_buffer(); // Consume the unknown character.
                let error = ErrorKind::UnknownSymbol {
                    symbol: c.to_string(),
                    suggestion: None,
                };
                Some(Err((start_location, error, self.cursor)))
            }
            // Inline whitespace is skipped before this function is called, and '\n' is handled above,
            // so reaching this arm indicates a bug in the lexer itself.
            _ => panic!("'lex_next_preprocessor_token' encountered whitespace that should have been skipped"),
        }
    }
}
impl<'input> Iterator for Lexer<'input> {
    type Item = LexerResult<'input>;

    /// Attempts to lex and return the next token in this lexer's token stream.
    /// Returns `None` to indicate end-of-stream, `Some(Ok(x))` to indicate success (where `x` is the next token),
    /// and `Some(Err(y))` to indicate that an error occurred during lexing.
    fn next(&mut self) -> Option<Self::Item> {
        // The starting location of a token.
        let mut start_location = None;
        // The starting buffer position of a token.
        let mut start_position = None;

        self.skip_inline_whitespace();
        while let Some(c) = self.buffer.peek().cloned() {
            if self.mode == LexerMode::PreprocessorDirective {
                // Inside a directive, delegate tokenization to the preprocessor tokenizer.
                // A `None` result means the characters produced no token (a comment was skipped),
                // so we fall through and keep looping.
                if let Some(token) = self.lex_next_preprocessor_token(c) {
                    return Some(token);
                };
            } else if c == '\n' {
                // Blank line (or the newline ending a directive); consume it and keep scanning.
                self.advance_buffer();
            } else if c == '#' {
                // The first non-whitespace character on this line is '#'. This line must be a directive.
                // If the lexer's mode is currently `SourceBlock`, this is the end of that source block.
                // We create and return a `SourceBlock` as the next token; otherwise, we `continue`.
                // Either way, we skip the rest of the loop to ensure we don't consume the '#', so it's
                // preserved for preprocessor directive lexing.
                let next_token = match self.mode {
                    LexerMode::SourceBlock => Some(Ok(self.create_source_block_token(
                        start_location.take().unwrap(),
                        start_position.take().unwrap(),
                        self.position,
                    ))),
                    _ => self.lex_next_preprocessor_token('#'),
                };
                self.mode = LexerMode::PreprocessorDirective;
                return next_token;
            } else {
                // The first non-whitespace character on this line isn't '#'. This line must be source code.
                // If the lexer's mode is currently `Unknown`, this is the start of a new source block.
                // We switch lexing modes to `SourceBlock` and store information about the start of the block.
                if self.mode == LexerMode::Unknown {
                    self.mode = LexerMode::SourceBlock;
                    // Store the starting position (in buffer) and location (row, col) of the source block.
                    debug_assert!(start_location.is_none());
                    debug_assert!(start_position.is_none());
                    start_location = Some(self.cursor);
                    start_position = Some(self.position);
                }
                // We know that this line is purely source code, so we skip the rest of the line.
                self.advance_to_end_of_line();
            }
            self.skip_inline_whitespace();
        }

        // We've reached the end of the input.
        match self.mode {
            // If the lexer was in the middle of lexing a source block, return the source block as the final token.
            LexerMode::SourceBlock => {
                self.mode = LexerMode::Unknown;
                Some(Ok(self.create_source_block_token(
                    start_location.take().unwrap(),
                    start_position.take().unwrap(),
                    self.input.len(),
                )))
            }
            // If the lexer was in the middle of lexing a preprocessor directive, return a `DirectiveEnd` token.
            LexerMode::PreprocessorDirective => {
                self.mode = LexerMode::Unknown;
                Some(Ok((self.cursor, TokenKind::DirectiveEnd, self.cursor)))
            }
            // Otherwise the lexer finished cleanly between tokens; signal end-of-stream.
            LexerMode::Unknown => {
                debug_assert!(start_location.is_none());
                debug_assert!(start_position.is_none());
                None
            }
        }
    }
}
// Allows string slices to be converted into `Lexer`s.
impl<'input> From<&'input str> for Lexer<'input> {
fn from(s: &'input str) -> Self {
Lexer::new(s)
}
}
/// The lexing modes a [`Lexer`] moves between while tokenizing input.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum LexerMode {
    /// The lexer doesn't have enough context to know what mode it should be in. This is the initial mode of a newly
    /// created lexer, and the mode lexers switch to after reaching the end of a preprocessor directive.
    ///
    /// No lexing is performed in this state. The lexer simply checks the first non-whitespace character of the next
    /// line to determine which mode to switch into, before consuming input. If the character is '#' it switches to
    /// [`PreprocessorDirective`](LexerMode::PreprocessorDirective) mode, otherwise it switches to
    /// [`SourceBlock`](LexerMode::SourceBlock) mode.
    Unknown,

    /// Indicates that the lexer is currently lexing a block of source code.
    /// While in this mode, the lexer treats everything as string literals and performs no tokenization of the input.
    ///
    /// This mode ends when the lexer sees a line where the first non-whitespace character is a '#', at which point it
    /// switches into [`PreprocessorDirective`](LexerMode::PreprocessorDirective) mode.
    SourceBlock,

    /// Indicates that the lexer is currently lexing a preprocessor directive.
    /// While in this mode, the lexer tokenizes input as preprocessor keywords and expressions.
    ///
    /// This mode ends when the lexer hits end-of-line, at which point it switches into
    /// [`Unknown`](LexerMode::Unknown) mode.
    PreprocessorDirective,
}