// Copyright (c) ZeroC, Inc.
use super::super::common::SourceBlock;
use super::tokens::*;
use crate::slice_file::Location;
use std::iter::Peekable;
use std::str::CharIndices;
type LexerResult<'a> = Result<Token<'a>, Error>;
/// Converts a stream of [source blocks](super::super::common::SourceBlock) (blocks of source code) into a stream of
/// Slice tokens.
///
/// This token stream is in turn consumed by the [Slice parser](super::parser::Parser) which parses the tokens into an
/// [AST](crate::ast::Ast).
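///
/// # Example
///
/// A minimal sketch of driving the lexer. The `From<&str>` conversion used here is only
/// compiled for tests, so this snippet is illustrative rather than a compiled doctest:
/// ```ignore
/// let lexer = Lexer::from("module Greeter");
/// for result in lexer {
///     match result {
///         Ok((start, kind, end)) => println!("{kind:?} spans {start:?} to {end:?}"),
///         Err((_, error_kind, _)) => eprintln!("lexing failed: {error_kind:?}"),
///     }
/// }
/// ```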
#[derive(Debug)]
pub struct Lexer<'input, T>
where
T: Iterator<Item = SourceBlock<'input>>,
{
/// Iterator over the source blocks that this lexer is operating on.
source_blocks: Peekable<T>,
/// The source block that the lexer is currently lexing within.
current_block: SourceBlock<'input>,
/// Iterator over the characters in the current block.
/// This is what the lexer actually operates on, by peeking at and consuming codepoints from this buffer.
buffer: Peekable<CharIndices<'input>>,
/// The lexer's current [location](crate::slice_file::Location) in the slice file.
/// Used to tag tokens with their starting and ending locations in the source input.
///
/// Since source blocks can be non-adjacent (separated by a preprocessor directive) in a slice file,
    /// its value can jump forward when switching to a new source block, making it unreliable for indexing.
cursor: Location,
/// This flag stores whether the lexer is currently lexing the inside of an attribute.
    /// It is set to true upon encountering a '[' character, and back to false upon a ']' character.
attribute_mode: bool,
}
impl<'input, T> Lexer<'input, T>
where
T: Iterator<Item = SourceBlock<'input>>,
{
fn new(mut input: T) -> Self {
        // If the input is empty, create an empty dummy source block that causes the lexer to immediately exit.
let current_block = input.next().unwrap_or_else(|| SourceBlock {
content: "",
start: Location::default(),
end: Location::default(),
});
let buffer = current_block.content.char_indices().peekable();
let start_location = current_block.start;
Lexer {
source_blocks: input.peekable(),
current_block,
buffer,
cursor: start_location,
attribute_mode: false,
}
}
/// Returns the lexer's position in the buffer of the source block it's currently lexing.
fn get_position(&mut self) -> usize {
if let Some((i, _)) = self.buffer.peek() {
*i
} else {
// `None` means we're at the end of the current source block's buffer.
self.current_block.content.len()
}
}
/// Consumes the next character in the buffer and moves the lexer's cursor forward accordingly.
fn advance_buffer(&mut self) {
// Consume the next character and check if it's a newline.
if let Some((_, c)) = self.buffer.next() {
if c == '\n' {
self.cursor.row += 1;
self.cursor.col = 1;
} else {
self.cursor.col += 1;
}
}
}
/// Consumes characters in the buffer until end-of-line (doesn't consume the EOL) or end-of-buffer is reached.
fn advance_to_end_of_line(&mut self) {
// Loop until the next character is '\n'.
while matches!(self.buffer.peek(), Some((_, c)) if *c != '\n') {
self.advance_buffer(); // Consume the character.
}
}
/// Consumes whitespace characters in the buffer until a non-whitespace character is reached.
/// After calling this function, the next character will be non-whitespace or `None` (end of buffer).
fn skip_whitespace(&mut self) {
// Loop while the next character in the buffer is whitespace.
while matches!(self.buffer.peek(), Some((_, c)) if c.is_whitespace()) {
self.advance_buffer(); // Consume the character.
}
}
    /// Reads, consumes, and returns a string of alphanumeric and underscore characters from the buffer.
    /// After calling this function, the next character will be neither alphanumeric nor an underscore, or `None` (end of buffer).
fn read_alphanumeric(&mut self) -> &'input str {
let start_position = self.get_position();
// Loop while the next character in the buffer is alphanumeric or an underscore.
while matches!(self.buffer.peek(), Some((_, c)) if (c.is_ascii_alphanumeric() || *c == '_')) {
self.advance_buffer(); // Consume the alphanumeric character.
}
let end_position = self.get_position();
&self.current_block.content[start_position..end_position]
}
/// Reads, consumes, and returns a string literal from the buffer.
    /// String literals are any characters contained within a pair of unescaped double quotes.
/// The returned string doesn't include the opening and closing quotation marks, just the content between them.
///
/// This function expects the lexer's cursor to be immediately before the opening '"' character.
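    ///
    /// For example, lexing the input `"escaped \" quote"` returns the slice `escaped \" quote`:
    /// escape sequences are left in place; only the surrounding quotation marks are dropped.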
fn read_string_literal(&mut self) -> Result<&'input str, ErrorKind> {
self.advance_buffer(); // Consume the opening quotation mark.
let start_position = self.get_position();
let mut is_next_char_escaped = false;
while let Some((_, c)) = self.buffer.peek() {
if *c == '\n' {
// String literals cannot contain newlines.
return Err(ErrorKind::UnterminatedStringLiteral);
} else if is_next_char_escaped {
// If this character is escaped, don't check it and just reset the flag.
is_next_char_escaped = false;
} else {
match c {
'"' => {
// We've reached the end of the string literal.
let end_position = self.get_position();
self.advance_buffer(); // Consume the closing quotation mark.
return Ok(&self.current_block.content[start_position..end_position]);
}
'\\' => is_next_char_escaped = true,
_ => {}
}
}
self.advance_buffer(); // Consume the character.
}
// Reaching this means we hit the end of a buffer before the end of the string literal.
Err(ErrorKind::UnterminatedStringLiteral)
}
    /// Reads, consumes, and returns a line comment from the buffer.
/// This function expects the lexer's cursor to be immediately after the last '/' character.
fn read_line_comment(&mut self) -> &'input str {
let start_position = self.get_position();
self.advance_to_end_of_line();
let end_position = self.get_position();
&self.current_block.content[start_position..end_position]
}
/// Reads and consumes a block comment from the buffer, ignoring it.
/// This function expects the lexer's cursor to be immediately after the opening "/*".
fn consume_block_comment(&mut self) -> Result<(), ErrorKind> {
let mut last_character_was_an_asterisk = false;
while let Some((_, c)) = self.buffer.peek().cloned() {
self.advance_buffer(); // Consume the character.
match c {
'/' if last_character_was_an_asterisk => return Ok(()),
'*' => last_character_was_an_asterisk = true,
_ => last_character_was_an_asterisk = false,
}
}
// Reaching this means we hit the end of a buffer before the end of the block comment.
Err(ErrorKind::UnterminatedBlockComment)
}
    /// Checks if an identifier corresponds to a Slice keyword. If it does, this returns the
    /// keyword's token. Otherwise, it returns a [`TokenKind::Identifier`] token.
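    ///
    /// For example, `check_if_keyword("struct")` yields `TokenKind::StructKeyword`, while
    /// `check_if_keyword("greeting")` yields `TokenKind::Identifier("greeting")`.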
fn check_if_keyword(identifier: &str) -> TokenKind<'_> {
debug_assert!(identifier.chars().all(|c| c.is_ascii_alphanumeric() || c == '_'));
debug_assert!(!identifier.is_empty());
match identifier {
"module" => TokenKind::ModuleKeyword,
"struct" => TokenKind::StructKeyword,
"exception" => TokenKind::ExceptionKeyword,
"class" => TokenKind::ClassKeyword,
"interface" => TokenKind::InterfaceKeyword,
"enum" => TokenKind::EnumKeyword,
"custom" => TokenKind::CustomKeyword,
"typealias" => TokenKind::TypeAliasKeyword,
"Result" => TokenKind::ResultKeyword,
"Sequence" => TokenKind::SequenceKeyword,
"Dictionary" => TokenKind::DictionaryKeyword,
"bool" => TokenKind::BoolKeyword,
"int8" => TokenKind::Int8Keyword,
"uint8" => TokenKind::UInt8Keyword,
"int16" => TokenKind::Int16Keyword,
"uint16" => TokenKind::UInt16Keyword,
"int32" => TokenKind::Int32Keyword,
"uint32" => TokenKind::UInt32Keyword,
"varint32" => TokenKind::VarInt32Keyword,
"varuint32" => TokenKind::VarUInt32Keyword,
"int64" => TokenKind::Int64Keyword,
"uint64" => TokenKind::UInt64Keyword,
"varint62" => TokenKind::VarInt62Keyword,
"varuint62" => TokenKind::VarUInt62Keyword,
"float32" => TokenKind::Float32Keyword,
"float64" => TokenKind::Float64Keyword,
"string" => TokenKind::StringKeyword,
"AnyClass" => TokenKind::AnyClassKeyword,
"compact" => TokenKind::CompactKeyword,
"idempotent" => TokenKind::IdempotentKeyword,
"mode" => TokenKind::ModeKeyword,
"stream" => TokenKind::StreamKeyword,
"tag" => TokenKind::TagKeyword,
"throws" => TokenKind::ThrowsKeyword,
"unchecked" => TokenKind::UncheckedKeyword,
ident => TokenKind::Identifier(ident),
}
}
/// Consumes a single character from the lexer's buffer and returns a token of the specified kind.
/// This is a convenience function for the common case where a token's lexeme is a single character.
fn return_simple_token(&mut self, token: TokenKind<'input>, start: Location) -> Option<LexerResult<'input>> {
self.advance_buffer(); // Consume the token from the buffer.
Some(Ok((start, token, self.cursor))) // Return it.
}
/// Attempts to read and return a Slice token from the buffer.
    /// Returns `None` to indicate it consumed input without producing a token (non-doc comments, whitespace, etc.),
/// `Some(Ok(x))` to indicate success (where `x` is the next token),
/// and `Some(Err(y))` to indicate an error occurred during lexing.
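    ///
    /// For example, a `//` comment is consumed and ignored (returning `None`), while a `///` doc
    /// comment is consumed and returned as a [`TokenKind::DocComment`] token.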
fn lex_next_slice_token(&mut self, c: char) -> Option<LexerResult<'input>> {
let start_location = self.cursor;
match c {
'(' => self.return_simple_token(TokenKind::LeftParenthesis, start_location),
')' => self.return_simple_token(TokenKind::RightParenthesis, start_location),
'[' => {
// Set the 'attribute_mode' flag since this must be the start of an attribute.
self.attribute_mode = true;
self.advance_buffer(); // Consume the '[' character.
// Check if the next character is also '['.
if matches!(self.buffer.peek(), Some((_, '['))) {
self.advance_buffer(); // Consume the second '[' character.
Some(Ok((start_location, TokenKind::DoubleLeftBracket, self.cursor)))
} else {
Some(Ok((start_location, TokenKind::LeftBracket, self.cursor)))
}
}
']' => {
// Clear the 'attribute_mode' flag since this must be the end of an attribute.
self.attribute_mode = false;
self.advance_buffer(); // Consume the ']' character.
// Check if the next character is also ']'.
if matches!(self.buffer.peek(), Some((_, ']'))) {
self.advance_buffer(); // Consume the second ']' character.
Some(Ok((start_location, TokenKind::DoubleRightBracket, self.cursor)))
} else {
Some(Ok((start_location, TokenKind::RightBracket, self.cursor)))
}
}
'{' => self.return_simple_token(TokenKind::LeftBrace, start_location),
'}' => self.return_simple_token(TokenKind::RightBrace, start_location),
'<' => self.return_simple_token(TokenKind::LeftChevron, start_location),
'>' => self.return_simple_token(TokenKind::RightChevron, start_location),
',' => self.return_simple_token(TokenKind::Comma, start_location),
':' => {
self.advance_buffer(); // Consume the ':' character.
// Check if the next character is also ':'.
if matches!(self.buffer.peek(), Some((_, ':'))) {
self.advance_buffer(); // Consume the second ':' character.
Some(Ok((start_location, TokenKind::DoubleColon, self.cursor)))
} else {
Some(Ok((start_location, TokenKind::Colon, self.cursor)))
}
}
'=' => self.return_simple_token(TokenKind::Equals, start_location),
'?' => self.return_simple_token(TokenKind::QuestionMark, start_location),
'-' => {
self.advance_buffer(); // Consume the '-' character.
// Check if the next character is '>'.
if matches!(self.buffer.peek(), Some((_, '>'))) {
                    self.advance_buffer(); // Consume the '>' character.
Some(Ok((start_location, TokenKind::Arrow, self.cursor)))
} else {
Some(Ok((start_location, TokenKind::Minus, self.cursor)))
}
}
'"' => {
let result = self.read_string_literal();
Some(match result {
Ok(s) => Ok((start_location, TokenKind::StringLiteral(s), self.cursor)),
Err(err) => Err((start_location, err, self.cursor)),
})
}
'/' => {
self.advance_buffer(); // Consume the '/' character.
match self.buffer.peek() {
// The token is at least '//', indicating a line comment.
Some((_, '/')) => {
self.advance_buffer(); // Consume the 2nd '/' character.
// Check if there's a 3rd '/' character indicating this may be a doc comment.
let mut is_doc_comment = matches!(self.buffer.peek(), Some((_, '/')));
if is_doc_comment {
self.advance_buffer(); // Consume the 3rd '/' character.
// Check if there's a 4th '/' character, which would turn this back into a non-doc comment.
// Doc comments must start with _exactly_ 3 '/' characters.
is_doc_comment = !matches!(self.buffer.peek(), Some((_, '/')));
}
let content_start_loc = self.cursor;
let comment = self.read_line_comment();
match is_doc_comment {
true => Some(Ok((content_start_loc, TokenKind::DocComment(comment), self.cursor))),
false => None, // Non-doc comments are ignored.
}
}
// The token is "/*", indicating the start of a block comment.
Some((_, '*')) => {
self.advance_buffer(); // Consume the '*'.
match self.consume_block_comment() {
Ok(_) => None, // Block comments are always ignored.
Err(err) => Some(Err((start_location, err, self.cursor))),
}
}
// The token is just "/", indicating a syntax error. '/' on its own isn't a valid Slice token.
_ => {
let error = ErrorKind::UnknownSymbol {
symbol: "/".to_owned(),
suggestion: Some("//".to_owned()),
};
Some(Err((start_location, error, self.cursor)))
}
}
}
'\\' => {
self.advance_buffer(); // Consume the '\' character.
// Check if the next character could be the start of an identifier.
if matches!(self.buffer.peek(), Some((_, ch)) if ch.is_ascii_alphabetic()) {
let identifier = self.read_alphanumeric();
Some(Ok((start_location, TokenKind::Identifier(identifier), self.cursor)))
} else {
// The token is just "\", indicating a syntax error. '\' on its own isn't a valid Slice token.
let error = ErrorKind::UnknownSymbol {
symbol: "\\".to_string(),
suggestion: Some("\\<identifier>".to_owned()),
};
Some(Err((start_location, error, self.cursor)))
}
}
_ if c.is_ascii_alphabetic() => {
let token = if self.attribute_mode {
// If we're lexing an attribute, return the identifier as-is, without checking if it's a keyword.
TokenKind::Identifier(self.read_alphanumeric())
} else {
Self::check_if_keyword(self.read_alphanumeric())
};
Some(Ok((start_location, token, self.cursor)))
}
_ if c.is_ascii_digit() => {
let integer = self.read_alphanumeric();
Some(Ok((start_location, TokenKind::IntegerLiteral(integer), self.cursor)))
}
_ if c.is_whitespace() => {
self.skip_whitespace();
None
}
unknown => {
self.advance_buffer(); // Consume the unknown symbol.
let error = ErrorKind::UnknownSymbol {
symbol: unknown.to_string(),
suggestion: None,
};
Some(Err((start_location, error, self.cursor)))
}
}
}
}
impl<'input, T> Iterator for Lexer<'input, T>
where
T: Iterator<Item = SourceBlock<'input>>,
{
type Item = LexerResult<'input>;
/// Attempts to lex and return the next token in this lexer's token stream.
/// Returns `None` to indicate end-of-stream, `Some(Ok(x))` to indicate success (where `x` is the next token),
/// and `Some(Err(y))` to indicate an error occurred during lexing.
fn next(&mut self) -> Option<Self::Item> {
// Continue iterating until we return a token, or reach the end of our source blocks.
loop {
// Continue iterating until we return a token, or reach the end of the current source block.
while let Some((_, c)) = self.buffer.peek().cloned() {
// If the lexer has lexed a token or encountered an error, return it.
if let Some(token) = self.lex_next_slice_token(c) {
return Some(token);
}
}
// We've reached the end of the current source block.
if let Some(next_source_block) = self.source_blocks.next() {
// Drop the current source block and replace it with the next source block.
self.current_block = next_source_block;
self.buffer = self.current_block.content.char_indices().peekable();
self.cursor = self.current_block.start;
} else {
                // There are no more source blocks to lex; the lexer has reached the end of its input.
return None;
}
}
}
}
// Allows iterators of source blocks to be converted into `Lexer`s.
impl<'input, T> From<T> for Lexer<'input, T>
where
T: Iterator<Item = SourceBlock<'input>>,
{
fn from(source_blocks: T) -> Self {
Lexer::new(source_blocks)
}
}
// Allows string slices to be converted into `Lexer`s.
#[cfg(test)]
impl<'input> From<&'input str> for Lexer<'input, std::iter::Once<SourceBlock<'input>>> {
    fn from(s: &'input str) -> Self {
        let newlines = s.char_indices().filter(|&(_, c)| c == '\n').collect::<Vec<_>>();
        // Columns are 1-based: the end column is the number of characters after the last
        // newline, plus 1. Using `map_or` avoids panicking on input without any newlines.
        let last_line_start = newlines.last().map_or(0, |&(i, _)| i + 1);
        let chars_in_last_line = s[last_line_start..].chars().count();
        let source_block = SourceBlock {
            content: s,
            start: Location::default(),
            end: Location {
                row: newlines.len() + 1,
                col: chars_in_last_line + 1,
            },
        };
        Lexer::new(std::iter::once(source_block))
    }
}
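// A minimal sketch of exercising the lexer end-to-end through the test-only `From<&str>`
// conversion above. The module and test names are illustrative, and the assertions use
// pattern matching so they don't assume any particular trait derives on the token types.
#[cfg(test)]
mod usage_sketch_tests {
    use super::*;

    #[test]
    fn lexes_a_simple_struct_definition() {
        let lexer = Lexer::from("struct Foo {}");
        // Collect just the token kinds, dropping the start/end locations.
        let kinds: Vec<TokenKind> = lexer
            .filter_map(|result| result.ok())
            .map(|(_, kind, _)| kind)
            .collect();
        assert!(matches!(
            kinds.as_slice(),
            [
                TokenKind::StructKeyword,
                TokenKind::Identifier("Foo"),
                TokenKind::LeftBrace,
                TokenKind::RightBrace,
            ]
        ));
    }

    #[test]
    fn reports_unterminated_string_literals() {
        let mut lexer = Lexer::from("\"no closing quote");
        assert!(matches!(
            lexer.next(),
            Some(Err((_, ErrorKind::UnterminatedStringLiteral, _)))
        ));
    }
}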