1use crate::char::CharFilter;
2use crate::char::DIGIT;
3use crate::char::DIGIT_BIN;
4use crate::char::DIGIT_HEX;
5use crate::char::DIGIT_OCT;
6use crate::char::ID_CONTINUE;
7use crate::char::ID_START_CHARSTR;
8use crate::char::WHITESPACE;
9use crate::error::SyntaxError;
10use crate::error::SyntaxErrorType;
11use crate::error::SyntaxResult;
12use crate::source::SourceRange;
13use crate::token::Token;
14use crate::token::TokenType;
15use aho_corasick::AhoCorasick;
16use aho_corasick::AhoCorasickBuilder;
17use aho_corasick::MatchKind;
18use core::ops::Index;
19use lazy_static::lazy_static;
20use memchr::memchr;
21use memchr::memchr2;
22use memchr::memchr3;
23use std::collections::HashMap;
24
/// Saved lexer position; restore it with `Lexer::apply_checkpoint` or
/// measure the span consumed since it with `Lexer::since_checkpoint`.
#[derive(Copy, Clone)]
pub struct LexerCheckpoint {
  // Byte offset into the source that `Lexer::next` is reset to.
  next: usize,
}
29
/// Length (in bytes) of a match starting at the lexer's current position.
#[derive(Copy, Clone)]
struct Match {
  // Number of bytes matched; zero means "no match".
  len: usize,
}
34
35impl Match {
36 pub fn len(&self) -> usize {
37 self.len
38 }
39
40 pub fn is_empty(&self) -> bool {
41 self.len == 0
42 }
43}
44
/// Result of an Aho-Corasick search: which pattern matched, and its extent.
#[derive(Copy, Clone)]
struct AhoCorasickMatch {
  // Index of the matched pattern in the list the automaton was built from.
  id: usize,
  // Extent of the match, measured from the lexer's current position
  // through the end of the match.
  mat: Match,
}
50
/// Streaming lexer over a byte slice of source code.
pub struct Lexer<'a> {
  // The entire source being lexed.
  source: &'a [u8],
  // Byte offset of the next unconsumed byte.
  next: usize,
}
55
56#[allow(dead_code)]
57impl<'a> Lexer<'a> {
58 pub fn new(code: &'a [u8]) -> Lexer<'a> {
59 Lexer {
60 source: code,
61 next: 0,
62 }
63 }
64
65 fn end(&self) -> usize {
66 self.source.len()
67 }
68
69 fn remaining(&self) -> usize {
70 self.end() - self.next
71 }
72
73 pub fn source_range(&self) -> SourceRange {
74 SourceRange::new(0, self.end())
75 }
76
77 fn eof_range(&self) -> SourceRange {
78 SourceRange::new(self.end(), self.end())
79 }
80
81 fn error(&self, typ: SyntaxErrorType) -> SyntaxError {
82 SyntaxError::new(typ, SourceRange::new(self.next, self.end()), None)
83 }
84
85 fn at_end(&self) -> bool {
86 self.next >= self.end()
87 }
88
89 fn peek(&self, n: usize) -> SyntaxResult<u8> {
90 self
91 .peek_or_eof(n)
92 .ok_or_else(|| self.error(SyntaxErrorType::UnexpectedEnd))
93 }
94
95 fn peek_or_eof(&self, n: usize) -> Option<u8> {
96 self.source.get(self.next + n).map(|&c| c)
97 }
98
99 pub fn checkpoint(&self) -> LexerCheckpoint {
100 LexerCheckpoint { next: self.next }
101 }
102
103 pub fn since_checkpoint(&self, checkpoint: LexerCheckpoint) -> SourceRange {
104 SourceRange::new(checkpoint.next, self.next)
105 }
106
107 pub fn apply_checkpoint(&mut self, checkpoint: LexerCheckpoint) -> () {
108 self.next = checkpoint.next;
109 }
110
111 fn n(&self, n: usize) -> SyntaxResult<Match> {
112 if self.next + n > self.end() {
113 return Err(self.error(SyntaxErrorType::UnexpectedEnd));
114 };
115 Ok(Match { len: n })
116 }
117
118 fn if_char(&self, c: u8) -> Match {
119 Match {
120 len: (!self.at_end() && self.source[self.next] == c) as usize,
121 }
122 }
123
124 fn through_char_or_end(&self, c: u8) -> Match {
125 memchr(c, &self.source[self.next..])
126 .map(|pos| Match { len: pos + 1 })
127 .unwrap_or_else(|| Match {
128 len: self.remaining(),
129 })
130 }
131
132 fn through_char(&self, c: u8) -> SyntaxResult<Match> {
133 memchr(c, &self.source[self.next..])
134 .map(|pos| Match { len: pos + 1 })
135 .ok_or_else(|| self.error(SyntaxErrorType::UnexpectedEnd))
136 }
137
138 fn while_not_char(&self, a: u8) -> Match {
139 Match {
140 len: memchr(a, &self.source[self.next..]).unwrap_or(self.remaining()),
141 }
142 }
143
144 fn while_not_2_chars(&self, a: u8, b: u8) -> Match {
145 Match {
146 len: memchr2(a, b, &self.source[self.next..]).unwrap_or(self.remaining()),
147 }
148 }
149
150 fn while_not_3_chars(&self, a: u8, b: u8, c: u8) -> Match {
151 Match {
152 len: memchr3(a, b, c, &self.source[self.next..]).unwrap_or(self.remaining()),
153 }
154 }
155
156 fn while_chars(&self, chars: &CharFilter) -> Match {
157 let mut len = 0;
158 while len < self.remaining() && chars.has(self.source[self.next + len]) {
159 len += 1;
160 }
161 Match { len }
162 }
163
164 fn aho_corasick(&self, ac: &AhoCorasick) -> SyntaxResult<AhoCorasickMatch> {
165 ac.find(&self.source[self.next..])
166 .map(|m| AhoCorasickMatch {
167 id: m.pattern(),
168 mat: Match { len: m.end() },
169 })
170 .ok_or_else(|| self.error(SyntaxErrorType::ExpectedNotFound))
171 }
172
173 fn range(&self, m: Match) -> SourceRange {
174 SourceRange::new(self.next, self.next + m.len)
175 }
176
177 fn consume(&mut self, m: Match) -> Match {
178 self.next += m.len;
179 m
180 }
181
182 fn consume_next(&mut self) -> SyntaxResult<u8> {
183 let c = self.peek(0)?;
184 self.next += 1;
185 Ok(c)
186 }
187
188 fn skip_expect(&mut self, n: usize) -> () {
189 debug_assert!(self.next + n <= self.end());
190 self.next += n;
191 }
192}
193
194impl<'a> Index<SourceRange> for Lexer<'a> {
195 type Output = [u8];
196
197 fn index(&self, index: SourceRange) -> &Self::Output {
198 &self.source[index.start()..index.end()]
199 }
200}
201
202impl<'a> Index<Match> for Lexer<'a> {
203 type Output = [u8];
204
205 fn index(&self, index: Match) -> &Self::Output {
206 &self.source[self.next - index.len..self.next]
207 }
208}
209
210lazy_static! {
211 pub static ref OPERATORS_MAPPING: HashMap<TokenType, &'static [u8]> = {
212 let mut map = HashMap::<TokenType, &'static [u8]>::new();
213 map.insert(TokenType::Ampersand, b"&");
214 map.insert(TokenType::AmpersandAmpersand, b"&&");
215 map.insert(TokenType::AmpersandAmpersandEquals, b"&&=");
216 map.insert(TokenType::AmpersandEquals, b"&=");
217 map.insert(TokenType::Asterisk, b"*");
218 map.insert(TokenType::AsteriskAsterisk, b"**");
219 map.insert(TokenType::AsteriskAsteriskEquals, b"**=");
220 map.insert(TokenType::AsteriskEquals, b"*=");
221 map.insert(TokenType::Bar, b"|");
222 map.insert(TokenType::BarBar, b"||");
223 map.insert(TokenType::BarBarEquals, b"||=");
224 map.insert(TokenType::BarEquals, b"|=");
225 map.insert(TokenType::BraceClose, b"}");
226 map.insert(TokenType::BraceOpen, b"{");
227 map.insert(TokenType::BracketClose, b"]");
228 map.insert(TokenType::BracketOpen, b"[");
229 map.insert(TokenType::Caret, b"^");
230 map.insert(TokenType::CaretEquals, b"^=");
231 map.insert(TokenType::ChevronLeft, b"<");
232 map.insert(TokenType::ChevronLeftChevronLeft, b"<<");
233 map.insert(TokenType::ChevronLeftChevronLeftEquals, b"<<=");
234 map.insert(TokenType::ChevronLeftEquals, b"<=");
235 map.insert(TokenType::ChevronRight, b">");
236 map.insert(TokenType::ChevronRightChevronRight, b">>");
237 map.insert(TokenType::ChevronRightChevronRightChevronRight, b">>>");
238 map.insert(TokenType::ChevronRightChevronRightChevronRightEquals, b">>>=");
239 map.insert(TokenType::ChevronRightChevronRightEquals, b">>=");
240 map.insert(TokenType::ChevronRightEquals, b">=");
241 map.insert(TokenType::Colon, b":");
242 map.insert(TokenType::ColonColon, b"::");
243 map.insert(TokenType::Comma, b",");
244 map.insert(TokenType::Dot, b".");
245 map.insert(TokenType::DotDotDot, b"...");
246 map.insert(TokenType::Equals, b"=");
247 map.insert(TokenType::EqualsEquals, b"==");
248 map.insert(TokenType::Exclamation, b"!");
249 map.insert(TokenType::ExclamationEquals, b"!=");
250 map.insert(TokenType::Hyphen, b"-");
251 map.insert(TokenType::HyphenChevronRight, b"->");
252 map.insert(TokenType::HyphenEquals, b"-=");
253 map.insert(TokenType::ParenthesisClose, b")");
254 map.insert(TokenType::ParenthesisOpen, b"(");
255 map.insert(TokenType::Percent, b"%");
256 map.insert(TokenType::PercentEquals, b"%=");
257 map.insert(TokenType::Plus, b"+");
258 map.insert(TokenType::PlusEquals, b"+=");
259 map.insert(TokenType::QuestionDot, b"?.");
260 map.insert(TokenType::QuestionBracketOpen, b"?[");
261 map.insert(TokenType::QuestionParenthesisOpen, b"?(");
262 map.insert(TokenType::QuestionQuestion, b"??");
263 map.insert(TokenType::QuestionQuestionEquals, b"??=");
264 map.insert(TokenType::Semicolon, b";");
265 map.insert(TokenType::Slash, b"/");
266 map.insert(TokenType::SlashEquals, b"/=");
267 map
268 };
269
270 pub static ref KEYWORDS_MAPPING: HashMap<TokenType, &'static [u8]> = {
271 let mut map = HashMap::<TokenType, &'static [u8]>::new();
272 map.insert(TokenType::KeywordAs, b"as");
273 map.insert(TokenType::KeywordBreak, b"break");
274 map.insert(TokenType::KeywordContinue, b"continue");
275 map.insert(TokenType::KeywordCrate, b"crate");
276 map.insert(TokenType::KeywordElse, b"else");
277 map.insert(TokenType::KeywordFor, b"for");
278 map.insert(TokenType::KeywordIf, b"if");
279 map.insert(TokenType::KeywordImpl, b"impl");
280 map.insert(TokenType::KeywordIn, b"in");
281 map.insert(TokenType::KeywordLet, b"let");
282 map.insert(TokenType::KeywordLoop, b"loop");
283 map.insert(TokenType::KeywordMut, b"mut");
284 map.insert(TokenType::KeywordPub, b"pub");
285 map.insert(TokenType::KeywordReturn, b"return");
286 map.insert(TokenType::KeywordStruct, b"struct");
287 map.insert(TokenType::LiteralFalse, b"false");
288 map.insert(TokenType::LiteralNone, b"None");
289 map.insert(TokenType::LiteralTrue, b"true");
290 map
291 };
292
293 pub static ref KEYWORD_STRS: HashMap<&'static [u8], usize> = {
294 HashMap::<&'static [u8], usize>::from_iter(KEYWORDS_MAPPING.values().enumerate().map(|(i, v)| (*v, i)))
295 };
296
297 static ref PATTERNS: Vec<(TokenType, &'static [u8])> = {
299 let mut patterns: Vec<(TokenType, &'static [u8])> = Vec::new();
300 for (&k, &v) in OPERATORS_MAPPING.iter() {
301 patterns.push((k, v));
302 };
303 for (&k, &v) in KEYWORDS_MAPPING.iter() {
304 patterns.push((k, &v));
305 };
306 patterns.push((TokenType::ChevronLeftSlash, b"</"));
307 patterns.push((TokenType::CommentMultiple, b"/*"));
308 patterns.push((TokenType::CommentSingle, b"//"));
309 for c in ID_START_CHARSTR.chunks(1) {
310 patterns.push((TokenType::Identifier, c));
311 };
312 for c in b"0123456789".chunks(1) {
313 patterns.push((TokenType::_LiteralNumberDec, c));
314 };
315 patterns.push((TokenType::_LiteralNumberBin, b"0b"));
316 patterns.push((TokenType::_LiteralNumberBin, b"0B"));
317 patterns.push((TokenType::_LiteralNumberHex, b"0x"));
318 patterns.push((TokenType::_LiteralNumberHex, b"0X"));
319 patterns.push((TokenType::_LiteralNumberOct, b"0o"));
320 patterns.push((TokenType::_LiteralNumberOct, b"0O"));
321 patterns.push((TokenType::LiteralTemplatePartString, b"`"));
322 patterns
323 };
324
325 static ref MATCHER: AhoCorasick = AhoCorasickBuilder::new()
326 .anchored(true)
327 .dfa(true)
328 .match_kind(MatchKind::LeftmostLongest)
329 .build(PATTERNS.iter().map(|(_, pat)| pat));
330
331 static ref COMMENT_END: AhoCorasick = AhoCorasick::new(&[b"*/"]);
332}
333
334fn lex_multiple_comment<'a>(lexer: &mut Lexer<'a>) -> SyntaxResult<()> {
335 lexer.skip_expect(2);
337 lexer.consume(lexer.aho_corasick(&COMMENT_END)?.mat);
338 Ok(())
339}
340
341fn lex_single_comment<'a>(lexer: &mut Lexer<'a>) -> SyntaxResult<()> {
342 lexer.skip_expect(2);
344 lexer.consume(lexer.through_char_or_end(b'\n'));
346 Ok(())
347}
348
349fn lex_identifier<'a>(lexer: &mut Lexer<'a>) -> SyntaxResult<Token> {
350 let cp = lexer.checkpoint();
351 lexer.skip_expect(1);
353 loop {
354 lexer.consume(lexer.while_chars(&ID_CONTINUE));
355 if lexer.peek_or_eof(0).filter(|c| !c.is_ascii()).is_none() {
356 break;
357 };
358 lexer.skip_expect(1);
359 }
360 Ok(Token::new(
361 lexer.since_checkpoint(cp),
362 TokenType::Identifier,
363 ))
364}
365
366fn lex_number_dec<'a>(lexer: &mut Lexer<'a>) -> SyntaxResult<Token> {
367 let cp = lexer.checkpoint();
368 lexer.consume(lexer.while_chars(&DIGIT));
370 if !lexer.consume(lexer.if_char(b'n')).is_empty() {
371 return Ok(Token::new(
372 lexer.since_checkpoint(cp),
373 TokenType::LiteralBigInt,
374 ));
375 }
376 lexer.consume(lexer.if_char(b'.'));
377 lexer.consume(lexer.while_chars(&DIGIT));
378 if lexer
379 .peek_or_eof(0)
380 .filter(|&c| c == b'e' || c == b'E')
381 .is_some()
382 {
383 lexer.skip_expect(1);
384 match lexer.peek(0)? {
385 b'+' | b'-' => lexer.skip_expect(1),
386 _ => {}
387 };
388 lexer.consume(lexer.while_chars(&DIGIT));
389 }
390 Ok(Token::new(
391 lexer.since_checkpoint(cp),
392 TokenType::LiteralInt,
393 ))
394}
395
396fn lex_number_bin<'a>(lexer: &mut Lexer<'a>) -> SyntaxResult<Token> {
397 let cp = lexer.checkpoint();
398 lexer.skip_expect(2);
399 lexer.consume(lexer.while_chars(&DIGIT_BIN));
400 if !lexer.consume(lexer.if_char(b'n')).is_empty() {
401 return Ok(Token::new(
402 lexer.since_checkpoint(cp),
403 TokenType::LiteralBigInt,
404 ));
405 }
406 Ok(Token::new(
407 lexer.since_checkpoint(cp),
408 TokenType::LiteralInt,
409 ))
410}
411
412fn lex_number_hex<'a>(lexer: &mut Lexer<'a>) -> SyntaxResult<Token> {
413 let cp = lexer.checkpoint();
414 lexer.skip_expect(2);
415 lexer.consume(lexer.while_chars(&DIGIT_HEX));
416 if !lexer.consume(lexer.if_char(b'n')).is_empty() {
417 return Ok(Token::new(
418 lexer.since_checkpoint(cp),
419 TokenType::LiteralBigInt,
420 ));
421 }
422 Ok(Token::new(
423 lexer.since_checkpoint(cp),
424 TokenType::LiteralInt,
425 ))
426}
427
428fn lex_number_oct<'a>(lexer: &mut Lexer<'a>) -> SyntaxResult<Token> {
429 let cp = lexer.checkpoint();
430 lexer.skip_expect(2);
431 lexer.consume(lexer.while_chars(&DIGIT_OCT));
432 if !lexer.consume(lexer.if_char(b'n')).is_empty() {
433 return Ok(Token::new(
434 lexer.since_checkpoint(cp),
435 TokenType::LiteralBigInt,
436 ));
437 }
438 Ok(Token::new(
439 lexer.since_checkpoint(cp),
440 TokenType::LiteralInt,
441 ))
442}
443
444pub fn lex_template_string_continue<'a>(lexer: &mut Lexer<'a>) -> SyntaxResult<Token> {
445 let cp = lexer.checkpoint();
446 let mut ended = false;
447 let loc = loop {
448 lexer.consume(lexer.while_not_3_chars(b'\\', b'`', b'$'));
449 match lexer.peek(0)? {
450 b'\\' => {
451 lexer.consume(lexer.n(2)?);
452 }
453 b'`' => {
454 ended = true;
455 let loc = Some(lexer.since_checkpoint(cp));
456 lexer.skip_expect(1);
457 break loc;
458 }
459 b'$' => {
460 if lexer.peek(1)? == b'{' {
461 let loc = Some(lexer.since_checkpoint(cp));
462 lexer.skip_expect(2);
463 break loc;
464 } else {
465 lexer.skip_expect(1);
466 }
467 }
468 _ => unreachable!(),
469 };
470 };
471 Ok(Token::new(
472 loc.unwrap(),
473 if ended {
474 TokenType::LiteralTemplatePartStringEnd
475 } else {
476 TokenType::LiteralTemplatePartString
477 },
478 ))
479}
480
/// Lexes a template literal from its opening backtick (already matched by
/// the pattern table): skips the backtick, then delegates to
/// `lex_template_string_continue` for the first string part.
fn lex_template<'a>(lexer: &mut Lexer<'a>) -> SyntaxResult<Token> {
  lexer.skip_expect(1);
  lex_template_string_continue(lexer)
}
487
488pub fn lex_next<'a>(lexer: &mut Lexer<'a>) -> SyntaxResult<Token> {
489 loop {
490 let ws = lexer.while_chars(&WHITESPACE);
491 lexer.consume(ws);
492
493 if lexer.at_end() {
494 return Ok(Token::new(lexer.eof_range(), TokenType::EOF));
495 };
496
497 let AhoCorasickMatch { id, mat } = lexer.aho_corasick(&MATCHER)?;
498 match PATTERNS[id].0 {
499 TokenType::CommentMultiple => lex_multiple_comment(lexer)?,
500 TokenType::CommentSingle => lex_single_comment(lexer)?,
501 pat => {
502 return match pat {
503 TokenType::Identifier => lex_identifier(lexer),
504 TokenType::_LiteralNumberDec => lex_number_dec(lexer),
505 TokenType::_LiteralNumberBin => lex_number_bin(lexer),
506 TokenType::_LiteralNumberHex => lex_number_hex(lexer),
507 TokenType::_LiteralNumberOct => lex_number_oct(lexer),
508 TokenType::LiteralTemplatePartString => lex_template(lexer),
509 typ => {
510 if KEYWORDS_MAPPING.contains_key(&typ)
511 && lexer
512 .peek_or_eof(mat.len())
513 .filter(|c| ID_CONTINUE.has(*c))
514 .is_some()
515 {
516 return lex_identifier(lexer);
518 };
519 let loc = lexer.range(mat);
520 lexer.consume(mat);
521 Ok(Token::new(loc, typ))
522 }
523 };
524 }
525 };
526 }
527}