arrow_parser/
lex.rs

use crate::char::CharFilter;
use crate::char::DIGIT;
use crate::char::DIGIT_BIN;
use crate::char::DIGIT_HEX;
use crate::char::DIGIT_OCT;
use crate::char::ID_CONTINUE;
use crate::char::ID_START_CHARSTR;
use crate::char::WHITESPACE;
use crate::error::SyntaxError;
use crate::error::SyntaxErrorType;
use crate::error::SyntaxResult;
use crate::source::SourceRange;
use crate::token::Token;
use crate::token::TokenType;
use aho_corasick::AhoCorasick;
use aho_corasick::AhoCorasickBuilder;
use aho_corasick::MatchKind;
use core::ops::Index;
use lazy_static::lazy_static;
use memchr::memchr;
use memchr::memchr2;
use memchr::memchr3;
use std::collections::HashMap;

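/// A saved lexer position. Create one with `Lexer::checkpoint`, measure the
/// bytes consumed since it with `Lexer::since_checkpoint`, or rewind to it
/// with `Lexer::apply_checkpoint`:
///
/// ```ignore
/// let cp = lexer.checkpoint();
/// // ... consume some input ...
/// let range = lexer.since_checkpoint(cp); // range of the consumed bytes
/// lexer.apply_checkpoint(cp); // rewind
/// ```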
#[derive(Copy, Clone)]
pub struct LexerCheckpoint {
  next: usize,
}

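/// The length of a match starting at the lexer's current position. A
/// zero-length `Match` means the pattern did not match.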
#[derive(Copy, Clone)]
struct Match {
  len: usize,
}

impl Match {
  pub fn len(&self) -> usize {
    self.len
  }

  pub fn is_empty(&self) -> bool {
    self.len == 0
  }
}

#[derive(Copy, Clone)]
struct AhoCorasickMatch {
  id: usize,
  mat: Match,
}

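/// A byte-oriented lexer. `next` is the offset of the next unconsumed byte in
/// `source`.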
pub struct Lexer<'a> {
  source: &'a [u8],
  next: usize,
}

#[allow(dead_code)]
impl<'a> Lexer<'a> {
  pub fn new(code: &'a [u8]) -> Lexer<'a> {
    Lexer {
      source: code,
      next: 0,
    }
  }

  fn end(&self) -> usize {
    self.source.len()
  }

  fn remaining(&self) -> usize {
    self.end() - self.next
  }

  pub fn source_range(&self) -> SourceRange {
    SourceRange::new(0, self.end())
  }

  fn eof_range(&self) -> SourceRange {
    SourceRange::new(self.end(), self.end())
  }

  // The error's range spans from the current position to the end of the source.
  fn error(&self, typ: SyntaxErrorType) -> SyntaxError {
    SyntaxError::new(typ, SourceRange::new(self.next, self.end()), None)
  }

  fn at_end(&self) -> bool {
    self.next >= self.end()
  }

  fn peek(&self, n: usize) -> SyntaxResult<u8> {
    self
      .peek_or_eof(n)
      .ok_or_else(|| self.error(SyntaxErrorType::UnexpectedEnd))
  }

  fn peek_or_eof(&self, n: usize) -> Option<u8> {
    self.source.get(self.next + n).copied()
  }

  pub fn checkpoint(&self) -> LexerCheckpoint {
    LexerCheckpoint { next: self.next }
  }

  pub fn since_checkpoint(&self, checkpoint: LexerCheckpoint) -> SourceRange {
    SourceRange::new(checkpoint.next, self.next)
  }

  pub fn apply_checkpoint(&mut self, checkpoint: LexerCheckpoint) {
    self.next = checkpoint.next;
  }

  // Matches exactly `n` bytes, if at least that many remain.
  fn n(&self, n: usize) -> SyntaxResult<Match> {
    if self.next + n > self.end() {
      return Err(self.error(SyntaxErrorType::UnexpectedEnd));
    };
    Ok(Match { len: n })
  }

  // Matches one byte if the next byte is `c`; otherwise a zero-length match.
  fn if_char(&self, c: u8) -> Match {
    Match {
      len: (!self.at_end() && self.source[self.next] == c) as usize,
    }
  }

  // Matches up to and including the next `c`, or all remaining input if `c`
  // does not occur again.
  fn through_char_or_end(&self, c: u8) -> Match {
    memchr(c, &self.source[self.next..])
      .map(|pos| Match { len: pos + 1 })
      .unwrap_or_else(|| Match {
        len: self.remaining(),
      })
  }

  // Matches up to and including the next `c`; errors if `c` does not occur.
  fn through_char(&self, c: u8) -> SyntaxResult<Match> {
    memchr(c, &self.source[self.next..])
      .map(|pos| Match { len: pos + 1 })
      .ok_or_else(|| self.error(SyntaxErrorType::UnexpectedEnd))
  }

  fn while_not_char(&self, a: u8) -> Match {
    Match {
      len: memchr(a, &self.source[self.next..]).unwrap_or(self.remaining()),
    }
  }

  fn while_not_2_chars(&self, a: u8, b: u8) -> Match {
    Match {
      len: memchr2(a, b, &self.source[self.next..]).unwrap_or(self.remaining()),
    }
  }

  fn while_not_3_chars(&self, a: u8, b: u8, c: u8) -> Match {
    Match {
      len: memchr3(a, b, c, &self.source[self.next..]).unwrap_or(self.remaining()),
    }
  }

  fn while_chars(&self, chars: &CharFilter) -> Match {
    let mut len = 0;
    while len < self.remaining() && chars.has(self.source[self.next + len]) {
      len += 1;
    }
    Match { len }
  }

  // Matches from the current position through the end of the first match of
  // `ac` in the remaining input; errors if `ac` finds no match.
  fn aho_corasick(&self, ac: &AhoCorasick) -> SyntaxResult<AhoCorasickMatch> {
    ac.find(&self.source[self.next..])
      .map(|m| AhoCorasickMatch {
        id: m.pattern(),
        mat: Match { len: m.end() },
      })
      .ok_or_else(|| self.error(SyntaxErrorType::ExpectedNotFound))
  }

  fn range(&self, m: Match) -> SourceRange {
    SourceRange::new(self.next, self.next + m.len)
  }

  // Advances past `m` and returns it.
  fn consume(&mut self, m: Match) -> Match {
    self.next += m.len;
    m
  }

  fn consume_next(&mut self) -> SyntaxResult<u8> {
    let c = self.peek(0)?;
    self.next += 1;
    Ok(c)
  }

  // Advances `n` bytes that are already known to exist.
  fn skip_expect(&mut self, n: usize) {
    debug_assert!(self.next + n <= self.end());
    self.next += n;
  }
}

impl<'a> Index<SourceRange> for Lexer<'a> {
  type Output = [u8];

  fn index(&self, index: SourceRange) -> &Self::Output {
    &self.source[index.start()..index.end()]
  }
}

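// Indexing by a `Match` yields the bytes of the most recently consumed match:
// the `len` bytes ending at the current position.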
impl<'a> Index<Match> for Lexer<'a> {
  type Output = [u8];

  fn index(&self, index: Match) -> &Self::Output {
    &self.source[self.next - index.len..self.next]
  }
}

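// Token-to-source-bytes tables. PATTERNS and MATCHER below are derived from
// them, so each operator and keyword is declared exactly once.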
lazy_static! {
  pub static ref OPERATORS_MAPPING: HashMap<TokenType, &'static [u8]> = {
    let mut map = HashMap::<TokenType, &'static [u8]>::new();
    map.insert(TokenType::Ampersand, b"&");
    map.insert(TokenType::AmpersandAmpersand, b"&&");
    map.insert(TokenType::AmpersandAmpersandEquals, b"&&=");
    map.insert(TokenType::AmpersandEquals, b"&=");
    map.insert(TokenType::Asterisk, b"*");
    map.insert(TokenType::AsteriskAsterisk, b"**");
    map.insert(TokenType::AsteriskAsteriskEquals, b"**=");
    map.insert(TokenType::AsteriskEquals, b"*=");
    map.insert(TokenType::Bar, b"|");
    map.insert(TokenType::BarBar, b"||");
    map.insert(TokenType::BarBarEquals, b"||=");
    map.insert(TokenType::BarEquals, b"|=");
    map.insert(TokenType::BraceClose, b"}");
    map.insert(TokenType::BraceOpen, b"{");
    map.insert(TokenType::BracketClose, b"]");
    map.insert(TokenType::BracketOpen, b"[");
    map.insert(TokenType::Caret, b"^");
    map.insert(TokenType::CaretEquals, b"^=");
    map.insert(TokenType::ChevronLeft, b"<");
    map.insert(TokenType::ChevronLeftChevronLeft, b"<<");
    map.insert(TokenType::ChevronLeftChevronLeftEquals, b"<<=");
    map.insert(TokenType::ChevronLeftEquals, b"<=");
    map.insert(TokenType::ChevronRight, b">");
    map.insert(TokenType::ChevronRightChevronRight, b">>");
    map.insert(TokenType::ChevronRightChevronRightChevronRight, b">>>");
    map.insert(TokenType::ChevronRightChevronRightChevronRightEquals, b">>>=");
    map.insert(TokenType::ChevronRightChevronRightEquals, b">>=");
    map.insert(TokenType::ChevronRightEquals, b">=");
    map.insert(TokenType::Colon, b":");
    map.insert(TokenType::ColonColon, b"::");
    map.insert(TokenType::Comma, b",");
    map.insert(TokenType::Dot, b".");
    map.insert(TokenType::DotDotDot, b"...");
    map.insert(TokenType::Equals, b"=");
    map.insert(TokenType::EqualsEquals, b"==");
    map.insert(TokenType::Exclamation, b"!");
    map.insert(TokenType::ExclamationEquals, b"!=");
    map.insert(TokenType::Hyphen, b"-");
    map.insert(TokenType::HyphenChevronRight, b"->");
    map.insert(TokenType::HyphenEquals, b"-=");
    map.insert(TokenType::ParenthesisClose, b")");
    map.insert(TokenType::ParenthesisOpen, b"(");
    map.insert(TokenType::Percent, b"%");
    map.insert(TokenType::PercentEquals, b"%=");
    map.insert(TokenType::Plus, b"+");
    map.insert(TokenType::PlusEquals, b"+=");
    map.insert(TokenType::QuestionDot, b"?.");
    map.insert(TokenType::QuestionBracketOpen, b"?[");
    map.insert(TokenType::QuestionParenthesisOpen, b"?(");
    map.insert(TokenType::QuestionQuestion, b"??");
    map.insert(TokenType::QuestionQuestionEquals, b"??=");
    map.insert(TokenType::Semicolon, b";");
    map.insert(TokenType::Slash, b"/");
    map.insert(TokenType::SlashEquals, b"/=");
    map
  };

  pub static ref KEYWORDS_MAPPING: HashMap<TokenType, &'static [u8]> = {
    let mut map = HashMap::<TokenType, &'static [u8]>::new();
    map.insert(TokenType::KeywordAs, b"as");
    map.insert(TokenType::KeywordBreak, b"break");
    map.insert(TokenType::KeywordContinue, b"continue");
    map.insert(TokenType::KeywordCrate, b"crate");
    map.insert(TokenType::KeywordElse, b"else");
    map.insert(TokenType::KeywordFor, b"for");
    map.insert(TokenType::KeywordIf, b"if");
    map.insert(TokenType::KeywordImpl, b"impl");
    map.insert(TokenType::KeywordIn, b"in");
    map.insert(TokenType::KeywordLet, b"let");
    map.insert(TokenType::KeywordLoop, b"loop");
    map.insert(TokenType::KeywordMut, b"mut");
    map.insert(TokenType::KeywordPub, b"pub");
    map.insert(TokenType::KeywordReturn, b"return");
    map.insert(TokenType::KeywordStruct, b"struct");
    map.insert(TokenType::LiteralFalse, b"false");
    map.insert(TokenType::LiteralNone, b"None");
    map.insert(TokenType::LiteralTrue, b"true");
    map
  };

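  // Note: these indices come from `HashMap` iteration order, which is
  // unspecified; they are only stable within a single process run.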
  pub static ref KEYWORD_STRS: HashMap<&'static [u8], usize> = {
    HashMap::<&'static [u8], usize>::from_iter(KEYWORDS_MAPPING.values().enumerate().map(|(i, v)| (*v, i)))
  };

  // The i-th entry here becomes pattern ID i in MATCHER, so a MATCHER match
  // can be mapped back to its TokenType through this Vec.
  static ref PATTERNS: Vec<(TokenType, &'static [u8])> = {
    let mut patterns: Vec<(TokenType, &'static [u8])> = Vec::new();
    for (&k, &v) in OPERATORS_MAPPING.iter() {
      patterns.push((k, v));
    };
    for (&k, &v) in KEYWORDS_MAPPING.iter() {
      patterns.push((k, v));
    };
    patterns.push((TokenType::ChevronLeftSlash, b"</"));
    patterns.push((TokenType::CommentMultiple, b"/*"));
    patterns.push((TokenType::CommentSingle, b"//"));
    for c in ID_START_CHARSTR.chunks(1) {
      patterns.push((TokenType::Identifier, c));
    };
    for c in b"0123456789".chunks(1) {
      patterns.push((TokenType::_LiteralNumberDec, c));
    };
    patterns.push((TokenType::_LiteralNumberBin, b"0b"));
    patterns.push((TokenType::_LiteralNumberBin, b"0B"));
    patterns.push((TokenType::_LiteralNumberHex, b"0x"));
    patterns.push((TokenType::_LiteralNumberHex, b"0X"));
    patterns.push((TokenType::_LiteralNumberOct, b"0o"));
    patterns.push((TokenType::_LiteralNumberOct, b"0O"));
    patterns.push((TokenType::LiteralTemplatePartString, b"`"));
    patterns
  };

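  // MATCHER is anchored (it only matches at the very start of the haystack)
  // and leftmost-longest (it prefers e.g. `>>=` over `>>` and `>`).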
  static ref MATCHER: AhoCorasick = AhoCorasickBuilder::new()
    .anchored(true)
    .dfa(true)
    .match_kind(MatchKind::LeftmostLongest)
    .build(PATTERNS.iter().map(|(_, pat)| pat));

  static ref COMMENT_END: AhoCorasick = AhoCorasick::new(&[b"*/"]);
}

fn lex_multiple_comment<'a>(lexer: &mut Lexer<'a>) -> SyntaxResult<()> {
  // Consume `/*`.
  lexer.skip_expect(2);
  // Consume up to and including the closing `*/`.
  lexer.consume(lexer.aho_corasick(&COMMENT_END)?.mat);
  Ok(())
}

fn lex_single_comment<'a>(lexer: &mut Lexer<'a>) -> SyntaxResult<()> {
  // Consume `//`.
  lexer.skip_expect(2);
  // WARNING: Does not consider other line terminators allowed by spec.
  lexer.consume(lexer.through_char_or_end(b'\n'));
  Ok(())
}

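// Lexes an identifier starting at its first byte. ASCII bytes must be in
// ID_CONTINUE to be included; any non-ASCII byte is accepted as part of the
// identifier.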
fn lex_identifier<'a>(lexer: &mut Lexer<'a>) -> SyntaxResult<Token> {
  let cp = lexer.checkpoint();
  // Consume starter.
  lexer.skip_expect(1);
  loop {
    lexer.consume(lexer.while_chars(&ID_CONTINUE));
    // Stop at EOF or at an ASCII byte outside ID_CONTINUE.
    if lexer.peek_or_eof(0).filter(|c| !c.is_ascii()).is_none() {
      break;
    };
    // Skip a non-ASCII byte and keep going.
    lexer.skip_expect(1);
  }
  Ok(Token::new(
    lexer.since_checkpoint(cp),
    TokenType::Identifier,
  ))
}

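// Lexes a decimal literal: digits, then either an `n` big-integer suffix, or
// an optional `.` fraction and an optional `e`/`E` exponent with optional sign.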
fn lex_number_dec<'a>(lexer: &mut Lexer<'a>) -> SyntaxResult<Token> {
  let cp = lexer.checkpoint();
  // TODO
  lexer.consume(lexer.while_chars(&DIGIT));
  if !lexer.consume(lexer.if_char(b'n')).is_empty() {
    return Ok(Token::new(
      lexer.since_checkpoint(cp),
      TokenType::LiteralBigInt,
    ));
  }
  lexer.consume(lexer.if_char(b'.'));
  lexer.consume(lexer.while_chars(&DIGIT));
  if lexer
    .peek_or_eof(0)
    .filter(|&c| c == b'e' || c == b'E')
    .is_some()
  {
    lexer.skip_expect(1);
    match lexer.peek(0)? {
      b'+' | b'-' => lexer.skip_expect(1),
      _ => {}
    };
    lexer.consume(lexer.while_chars(&DIGIT));
  }
  Ok(Token::new(
    lexer.since_checkpoint(cp),
    TokenType::LiteralInt,
  ))
}

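// The radix lexers below consume their two-byte prefix (`0b`, `0x`, `0o`),
// then the digits of that radix; a trailing `n` marks a big integer.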
fn lex_number_bin<'a>(lexer: &mut Lexer<'a>) -> SyntaxResult<Token> {
  let cp = lexer.checkpoint();
  lexer.skip_expect(2);
  lexer.consume(lexer.while_chars(&DIGIT_BIN));
  if !lexer.consume(lexer.if_char(b'n')).is_empty() {
    return Ok(Token::new(
      lexer.since_checkpoint(cp),
      TokenType::LiteralBigInt,
    ));
  }
  Ok(Token::new(
    lexer.since_checkpoint(cp),
    TokenType::LiteralInt,
  ))
}

fn lex_number_hex<'a>(lexer: &mut Lexer<'a>) -> SyntaxResult<Token> {
  let cp = lexer.checkpoint();
  lexer.skip_expect(2);
  lexer.consume(lexer.while_chars(&DIGIT_HEX));
  if !lexer.consume(lexer.if_char(b'n')).is_empty() {
    return Ok(Token::new(
      lexer.since_checkpoint(cp),
      TokenType::LiteralBigInt,
    ));
  }
  Ok(Token::new(
    lexer.since_checkpoint(cp),
    TokenType::LiteralInt,
  ))
}

fn lex_number_oct<'a>(lexer: &mut Lexer<'a>) -> SyntaxResult<Token> {
  let cp = lexer.checkpoint();
  lexer.skip_expect(2);
  lexer.consume(lexer.while_chars(&DIGIT_OCT));
  if !lexer.consume(lexer.if_char(b'n')).is_empty() {
    return Ok(Token::new(
      lexer.since_checkpoint(cp),
      TokenType::LiteralBigInt,
    ));
  }
  Ok(Token::new(
    lexer.since_checkpoint(cp),
    TokenType::LiteralInt,
  ))
}

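// Lexes one template-string segment, consuming through the closing backtick
// or through the `${` that starts an interpolation. A backslash escapes the
// following byte; a `$` not followed by `{` is literal text.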
pub fn lex_template_string_continue<'a>(lexer: &mut Lexer<'a>) -> SyntaxResult<Token> {
  let cp = lexer.checkpoint();
  let mut ended = false;
  let loc = loop {
    lexer.consume(lexer.while_not_3_chars(b'\\', b'`', b'$'));
    match lexer.peek(0)? {
      b'\\' => {
        lexer.consume(lexer.n(2)?);
      }
      b'`' => {
        ended = true;
        let loc = lexer.since_checkpoint(cp);
        lexer.skip_expect(1);
        break loc;
      }
      b'$' => {
        if lexer.peek(1)? == b'{' {
          let loc = lexer.since_checkpoint(cp);
          lexer.skip_expect(2);
          break loc;
        } else {
          lexer.skip_expect(1);
        }
      }
      _ => unreachable!(),
    };
  };
  Ok(Token::new(
    loc,
    if ended {
      TokenType::LiteralTemplatePartStringEnd
    } else {
      TokenType::LiteralTemplatePartString
    },
  ))
}

// TODO Validate template.
fn lex_template<'a>(lexer: &mut Lexer<'a>) -> SyntaxResult<Token> {
  // Consume backtick.
  lexer.skip_expect(1);
  lex_template_string_continue(lexer)
}

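// Returns the next token, after skipping any whitespace and comments; returns
// an EOF token at the end of input.
//
// A minimal driver loop (a sketch; the `typ` accessor on `Token` is an
// assumption, not defined in this file):
//
//   let mut lexer = Lexer::new(b"let x = 0x2a;");
//   loop {
//     let token = lex_next(&mut lexer)?;
//     if token.typ() == TokenType::EOF {
//       break;
//     }
//     // ... hand `token` to the parser ...
//   }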
pub fn lex_next<'a>(lexer: &mut Lexer<'a>) -> SyntaxResult<Token> {
  loop {
    let ws = lexer.while_chars(&WHITESPACE);
    lexer.consume(ws);

    if lexer.at_end() {
      return Ok(Token::new(lexer.eof_range(), TokenType::EOF));
    };

    let AhoCorasickMatch { id, mat } = lexer.aho_corasick(&MATCHER)?;
    match PATTERNS[id].0 {
      // Comments produce no token; skip them and loop again.
      TokenType::CommentMultiple => lex_multiple_comment(lexer)?,
      TokenType::CommentSingle => lex_single_comment(lexer)?,
      pat => {
        // The sub-lexers below re-lex from the current position; `mat` is not
        // consumed first.
        return match pat {
          TokenType::Identifier => lex_identifier(lexer),
          TokenType::_LiteralNumberDec => lex_number_dec(lexer),
          TokenType::_LiteralNumberBin => lex_number_bin(lexer),
          TokenType::_LiteralNumberHex => lex_number_hex(lexer),
          TokenType::_LiteralNumberOct => lex_number_oct(lexer),
          TokenType::LiteralTemplatePartString => lex_template(lexer),
          typ => {
            if KEYWORDS_MAPPING.contains_key(&typ)
              && lexer
                .peek_or_eof(mat.len())
                .filter(|c| ID_CONTINUE.has(*c))
                .is_some()
            {
              // We've accidentally matched a prefix of an identifier as a keyword.
              return lex_identifier(lexer);
            };
            let loc = lexer.range(mat);
            lexer.consume(mat);
            Ok(Token::new(loc, typ))
          }
        };
      }
    };
  }
}