mod delimiter;
mod error;
mod patterns;
mod token;

use std::collections::VecDeque;

use crate::{
    ast::{self, CheckModifier},
    common::*,
    parse::ParserError,
    pattern::search::{AhoCorasickSearcher, RegexSearcher},
};

pub use self::error::LexerError;
pub use self::token::Token;

/// A spanned token: `(start_offset, token, end_offset)`, or a parser error.
pub type Lexed<'input> = Result<(usize, Token<'input>, usize), ParserError>;

use self::delimiter::Delimiter;
use self::patterns::{Check, Pattern};

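/// A streaming lexer for check files.
///
/// The lexer scans the source for check and comment prefixes using a
/// multi-pattern regex, then tokenizes each matched line: the prefix itself,
/// any modifiers, and the check pattern that follows. Because a single line
/// produces several tokens, they are staged in an internal `VecDeque` and
/// drained by `lex`/`peek`.
///
/// A minimal usage sketch (the `"CHECK"`/`"COM"` prefixes and the
/// `SourceFile` value `source` are illustrative, not fixed by this module):
///
/// ```ignore
/// let check_prefixes: Vec<Arc<str>> = vec![Arc::from("CHECK")];
/// let comment_prefixes: Vec<Arc<str>> = vec![Arc::from("COM")];
/// let mut lexer = Lexer::new(&source, &check_prefixes, &comment_prefixes);
/// while let Some(lexed) = lexer.next() {
///     let (start, token, end) = lexed?;
///     // ... hand the spanned token to the parser
/// }
/// ```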
pub struct Lexer<'input> {
    input: Input<'input>,
    patterns: Vec<Pattern>,
    check_prefixes: Vec<Arc<str>>,
    seen_prefixes: Vec<bool>,
    regex: Regex,
    searcher: RegexSearcher<'input>,
    cache: regex_automata::meta::Cache,
    captures: regex_automata::util::captures::Captures,
    delimiter_patterns: aho_corasick::AhoCorasick,
    delimiter_searcher: AhoCorasickSearcher<'input>,
    eof: bool,
    leading_lf: bool,
    buffer: VecDeque<Lexed<'input>>,
}
impl<'input> Lexer<'input> {
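    /// Create a lexer over `source` that recognizes the given check and
    /// comment prefixes.
    ///
    /// Panics if the generated prefix or delimiter patterns fail to compile,
    /// which indicates a bug in pattern generation rather than bad user input.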
    pub fn new<S>(
        source: &'input S,
        check_prefixes: &[Arc<str>],
        comment_prefixes: &[Arc<str>],
    ) -> Self
    where
        S: SourceFile + ?Sized + 'input,
    {
        let buffer = source.source().as_bytes();
        let input = Input::new(buffer, false);
        let mut patterns = Pattern::generate_check_patterns(check_prefixes).collect::<Vec<_>>();
        patterns.extend(Pattern::generate_comment_patterns(comment_prefixes));
        let regex =
            Regex::new_many(&patterns).expect("expected valid prefix searcher configuration");
        let searcher = RegexSearcher::new(input.into());
        let eof = input.is_empty();
        let captures = regex.create_captures();
        let cache = regex.create_cache();

        let mut builder = aho_corasick::AhoCorasickBuilder::new();
        builder
            .match_kind(aho_corasick::MatchKind::LeftmostLongest)
            .start_kind(aho_corasick::StartKind::Both)
            .kind(Some(aho_corasick::AhoCorasickKind::DFA));
        let delimiter_patterns = builder
            .build(Delimiter::ALL)
            .expect("expected valid delimiter searcher configuration");
        let delimiter_searcher = AhoCorasickSearcher::new(input.into());

        Lexer {
            input,
            patterns,
            check_prefixes: check_prefixes.to_vec(),
            seen_prefixes: vec![false; check_prefixes.len()],
            regex,
            searcher,
            cache,
            captures,
            delimiter_patterns,
            delimiter_searcher,
            eof,
            leading_lf: true,
            buffer: VecDeque::with_capacity(128),
        }
    }

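    /// Returns the check prefixes that never matched anywhere in the input.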
    pub fn unused_prefixes(&self) -> Vec<Arc<str>> {
        self.check_prefixes
            .iter()
            .zip(self.seen_prefixes.iter().copied())
            .filter_map(|(prefix, seen)| if seen { None } else { Some(prefix.clone()) })
            .collect()
    }

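    /// Returns an empty span at the lexer's current position in the input.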
    pub fn current_offset(&self) -> SourceSpan {
        let at = self.input.start();
        SourceSpan::from(at..at)
    }

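    /// Peeks at the next token without consuming it, tokenizing more input if
    /// necessary. Returns `None` at end of input, or if the next entry in the
    /// buffer is an error rather than a token.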
    pub fn peek(&mut self) -> Option<&Token<'input>> {
        loop {
            if !self.buffer.is_empty() {
                break self
                    .buffer
                    .front()
                    .and_then(|lexed| lexed.as_ref().ok().map(|(_, t, _)| t));
            } else if self.eof {
                break None;
            }

            self.tokenize();
        }
    }

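    /// Returns the next lexed token (or error), tokenizing more input if
    /// necessary. Returns `None` once the input is exhausted.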
    pub fn lex(&mut self) -> Option<Lexed<'input>> {
        loop {
            if !self.buffer.is_empty() {
                break self.buffer.pop_front();
            } else if self.eof {
                break None;
            }

            self.tokenize();
        }
    }

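    /// Drive the lexer forward until at least one entry is staged in the
    /// buffer or the end of input is reached.
    ///
    /// Emits a line feed token if the cursor sits on one, skips leading
    /// whitespace, then searches for the next check or comment prefix.
    /// Comments are delegated to `tokenize_comment`; check prefixes emit
    /// check/modifier/colon tokens and then hand the rest of the line to
    /// `tokenize_check_pattern`.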
    fn tokenize(&mut self) {
        match self.input.peek_byte() {
            (_, b'\0') => {
                self.eof = true;
                return;
            }
            (offset, b'\n') => {
                let next_offset = offset + 1;
                self.input.set_start(next_offset);
                self.buffer.push_back(Ok((offset, Token::Lf, next_offset)));
            }
            _ => (),
        }

        // Skip over any leading whitespace before searching for a prefix.
        let bytes = self.input.buffer();
        let start = self.input.start();
        let eof = self.input.end();
        let mut word_boundary = self.input.start();
        while word_boundary < eof && bytes[word_boundary].is_ascii_whitespace() {
            word_boundary += 1;
        }
        if start < word_boundary {
            self.input.set_start(word_boundary);
        }

        let start = self.input.start();
        if self.searcher.input().start() < start {
            self.searcher.set_last_match_end(start);
        }

        let search_result = self.searcher.advance(|input| {
            self.regex
                .search_captures_with(&mut self.cache, input, &mut self.captures);
            Ok(self.captures.get_match())
        });
        if let Some(matched) = search_result {
            let pid = matched.pattern();
            let range = Range::from(matched.range());
            let pattern = &self.patterns[pid.as_usize()];
            let pattern_ty = pattern.ty;
            if let Check::Comment = pattern_ty {
                return self.tokenize_comment(range);
            }

            // Record that this check prefix was seen, so `unused_prefixes`
            // can report the ones that never matched.
            let prefix_span = self.captures.get_group_by_name("prefix").unwrap();
            let prefix = self.input.as_str(prefix_span.start..prefix_span.end);
            if let Some(index) = self
                .check_prefixes
                .iter()
                .position(|pfx| pfx.as_ref() == prefix)
            {
                self.seen_prefixes[index] = true;
            }
            match pattern_ty {
                Check::Count => {
                    let valid = self.tokenize_check_count_prefix(range);
                    if !valid {
                        self.captures.set_pattern(None);
                        return;
                    }
                }
                ty => {
                    self.buffer.push_back(Ok((
                        range.start,
                        Token::Check(ty.try_into().unwrap()),
                        range.end,
                    )));
                }
            }
            let literal = self.tokenize_optional_modifiers();
            self.buffer
                .push_back(Ok((range.end - 1, Token::Colon, range.end)));
            self.input.set_start(range.end);
            self.tokenize_check_pattern(literal);
            self.captures.set_pattern(None);
        } else {
            // No more check/comment prefixes in the remaining input.
            self.input.set_start(self.input.end());
            self.eof = true;
        }
    }

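    /// Emits a modifier token if the prefix match captured one. Returns `true`
    /// if the `LITERAL` modifier was present, i.e. if the check pattern should
    /// be lexed as raw text.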
    fn tokenize_optional_modifiers(&mut self) -> bool {
        if let Some(span) = self.captures.get_group_by_name("modifiers") {
            if self.input.buffer()[span.start..].starts_with(b"LITERAL") {
                self.buffer.push_back(Ok((
                    span.start,
                    Token::Modifier(CheckModifier::LITERAL),
                    span.end,
                )));
                true
            } else {
                unreachable!("no other modifiers are recognized by the regex pattern")
            }
        } else {
            false
        }
    }

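    /// Emits a single comment token spanning the text captured after a
    /// comment prefix, stripping at most one leading space.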
    fn tokenize_comment(&mut self, range: Range<usize>) {
        let span = self.captures.get_group_by_name("comment").unwrap();
        let comment = self.input.as_str(span.start..span.end);
        let comment = comment.strip_prefix(' ').unwrap_or(comment);
        self.buffer.push_back(Ok((
            range.start,
            Token::Comment(Cow::Borrowed(comment)),
            span.end,
        )));
        self.input.set_start(span.end);
        self.captures.set_pattern(None);
    }

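    /// Lexes the repeat count of a counted check prefix, emitting a plain
    /// check token and a count modifier. On a malformed count, emits an error
    /// token, skips to the end of the line, and returns `false`.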
    fn tokenize_check_count_prefix(&mut self, prefix_range: Range<usize>) -> bool {
        let count_span = self.captures.get_group_by_name("count").unwrap();
        let count = self.input.as_str(count_span.start..count_span.end);
        match count.parse::<u8>() {
            Ok(count) => {
                self.buffer.push_back(Ok((
                    prefix_range.start,
                    Token::Check(ast::Check::Plain),
                    count_span.end,
                )));
                self.buffer.push_back(Ok((
                    prefix_range.start,
                    Token::Modifier(CheckModifier::from_count(count)),
                    count_span.end,
                )));
                true
            }
            Err(error) => {
                let token = Token::Error(LexerError::BadCount {
                    span: SourceSpan::from(count_span.start..count_span.end),
                    error,
                });
                self.buffer
                    .push_back(Ok((count_span.start, token, count_span.end)));
                // Skip the rest of the line so lexing resynchronizes at the
                // next newline.
                let eol = self
                    .input
                    .next_newline_from(prefix_range.end)
                    .unwrap_or_else(|| self.input.end());
                self.input.set_start(eol);
                false
            }
        }
    }

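    /// Tokenizes the remainder of the line following a check prefix and its
    /// colon.
    ///
    /// In literal mode the whole line becomes a single raw token. Otherwise
    /// the line is scanned for delimiters: text outside any delimited block is
    /// emitted as raw tokens, match/substitution blocks are delegated to
    /// `tokenize_capture_or_match` (or its numeric variant), and regex blocks
    /// are emitted verbatim between `RegexStart`/`RegexEnd`. Unbalanced
    /// delimiters produce parser errors.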
    fn tokenize_check_pattern(&mut self, literal: bool) {
        let start = self.input.start();
        let eol = self.input.next_newline().unwrap_or(self.input.end());

        if literal {
            // In literal mode the entire line is a single raw token, minus at
            // most one leading space separating it from the colon.
            let raw = self.input.as_str(start..eol);
            if let Some(raw) = raw.strip_prefix(' ') {
                self.buffer.push_back(Ok((start + 1, Token::Raw(raw), eol)));
            } else {
                self.buffer.push_back(Ok((start, Token::Raw(raw), eol)));
            }
            self.input.set_start(eol);
            return;
        }

        let mut in_match: Option<Span<Delimiter>> = None;
        let mut in_regex: Option<Span<Delimiter>> = None;

        let mut last_delimiter_end = start;
        self.delimiter_searcher.set_range(start..eol);
        let mut is_first = true;
        while let Some(matched) = self
            .delimiter_searcher
            .advance(|input| Ok(self.delimiter_patterns.find(input.clone())))
        {
            let pid = matched.pattern();
            let delim_range = Range::from(matched.range());
            match Delimiter::from_pid(pid.as_usize()) {
                delim @ (Delimiter::MatchStart | Delimiter::NumericMatchStart)
                    if in_match.is_none() && in_regex.is_none() =>
                {
                    in_match = Some(Span::new(delim_range, delim));
                    // Emit any raw text between the previous delimiter and
                    // this one.
                    if delim_range.start > last_delimiter_end {
                        let raw = &self.input.buffer()[last_delimiter_end..delim_range.start];
                        if !raw.iter().all(u8::is_ascii_whitespace) {
                            let content = self.input.as_str(last_delimiter_end..delim_range.start);
                            let content = if is_first {
                                content.strip_prefix(' ').unwrap_or(content)
                            } else {
                                content
                            };
                            self.buffer.push_back(Ok((
                                last_delimiter_end,
                                Token::Raw(content),
                                delim_range.start,
                            )));
                        }
                    }
                    if matches!(delim, Delimiter::NumericMatchStart) {
                        self.buffer.push_back(Ok((
                            delim_range.start,
                            Token::MatchStart,
                            delim_range.end - 1,
                        )));
                        self.buffer.push_back(Ok((
                            delim_range.end - 1,
                            Token::Hash,
                            delim_range.end,
                        )));
                    } else {
                        self.buffer.push_back(Ok((
                            delim_range.start,
                            Token::MatchStart,
                            delim_range.end,
                        )));
                    }
                    is_first = false;
                }
                Delimiter::RegexStart if in_match.is_none() && in_regex.is_none() => {
                    in_regex = Some(Span::new(delim_range, Delimiter::RegexStart));
                    if delim_range.start > last_delimiter_end {
                        let raw = &self.input.buffer()[last_delimiter_end..delim_range.start];
                        if !raw.iter().all(u8::is_ascii_whitespace) {
                            let content = self.input.as_str(last_delimiter_end..delim_range.start);
                            let content = if is_first {
                                content.strip_prefix(' ').unwrap_or(content)
                            } else {
                                content
                            };
                            self.buffer.push_back(Ok((
                                last_delimiter_end,
                                Token::Raw(content),
                                delim_range.start,
                            )));
                        }
                    }
                    self.buffer.push_back(Ok((
                        delim_range.start,
                        Token::RegexStart,
                        delim_range.end,
                    )));
                    is_first = false;
                }
                Delimiter::MatchEnd if in_match.is_some() => {
                    last_delimiter_end = delim_range.end;
                    let match_start = in_match.take().unwrap();
                    if matches!(match_start.into_inner(), Delimiter::NumericMatchStart) {
                        self.tokenize_capture_or_match_numeric(Range::new(
                            match_start.end(),
                            delim_range.start,
                        ));
                    } else {
                        self.tokenize_capture_or_match(Range::new(
                            match_start.end(),
                            delim_range.start,
                        ));
                    }
                    self.buffer.push_back(Ok((
                        delim_range.start,
                        Token::MatchEnd,
                        delim_range.end,
                    )));
                    self.input.set_start(delim_range.end);
                    self.searcher.set_last_match_end(delim_range.end);
                }
                Delimiter::RegexEnd if in_regex.is_some() => {
                    last_delimiter_end = delim_range.end;
                    let regex_start = in_regex.take().unwrap();
                    let pattern_start = regex_start.end();
                    let raw = self.input.as_str(pattern_start..delim_range.start).trim();
                    self.buffer
                        .push_back(Ok((pattern_start, Token::Raw(raw), delim_range.start)));
                    self.buffer.push_back(Ok((
                        delim_range.start,
                        Token::RegexEnd,
                        delim_range.end,
                    )));
                    self.input.set_start(delim_range.end);
                    self.searcher.set_last_match_end(delim_range.end);
                }
                delim @ (Delimiter::RegexEnd | Delimiter::MatchEnd)
                    if in_match.is_none() && in_regex.is_none() =>
                {
                    // A closing delimiter with no matching opener.
                    self.buffer.push_back(Err(ParserError::UnrecognizedToken {
                        span: delim_range.into(),
                        token: AsRef::<str>::as_ref(&delim).to_string(),
                        expected: vec![
                            "literal".to_string(),
                            Token::MatchStart.to_string(),
                            Token::RegexStart.to_string(),
                        ],
                    }));
                }
                _ => continue,
            }
        }

        // Emit any trailing raw text after the last delimiter.
        if last_delimiter_end < eol && in_match.is_none() && in_regex.is_none() {
            let line = self.input.as_str(last_delimiter_end..eol);
            if !line.trim().is_empty() {
                let line = if is_first {
                    line.strip_prefix(' ').unwrap_or(line)
                } else {
                    line
                };
                self.buffer
                    .push_back(Ok((last_delimiter_end, Token::Raw(line), eol)));
            }
            self.input.set_start(eol);
            self.searcher.set_last_match_end(eol);
            self.delimiter_searcher.set_last_match_end(eol);
        }

        // A block still open at end of line is an error.
        match (in_match, in_regex) {
            (None, None) => (),
            (Some(delim), _) => {
                self.buffer
                    .push_back(Err(ParserError::UnclosedSubstitution {
                        span: delim.span(),
                    }));
            }
            (_, Some(delim)) => {
                self.buffer
                    .push_back(Err(ParserError::UnclosedRegex { span: delim.span() }));
            }
        }
    }

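    /// Tokenizes the interior of a match/substitution block: keywords or
    /// identifiers, `@` and `$` sigils, and an optional `:` followed by the
    /// raw pattern that makes up the rest of the block.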
    fn tokenize_capture_or_match(&mut self, range: Range<usize>) {
        let mut chars = self.input.as_str(range).chars().peekable();
        let mut offset = range.start;
        while let Some(c) = chars.next() {
            let next_offset = offset + c.len_utf8();
            match c {
                c if c.is_ascii_alphabetic() || c == '_' => {
                    let start = offset;
                    let mut end = next_offset;

                    while let Some(&c) = chars.peek() {
                        match c {
                            c if c.is_ascii_alphanumeric() => {
                                end += c.len_utf8();
                                chars.next();
                            }
                            '_' => {
                                end += '_'.len_utf8();
                                chars.next();
                            }
                            c if c.is_whitespace() || c == ':' => {
                                break;
                            }
                            _ => {
                                // Use the offending character's real width so
                                // a multi-byte character doesn't produce an
                                // offset that splits a UTF-8 boundary.
                                let bad_end = end + c.len_utf8();
                                self.buffer.push_back(Ok((
                                    start,
                                    Token::Error(LexerError::InvalidIdentifier {
                                        span: SourceSpan::from(start..bad_end),
                                    }),
                                    bad_end,
                                )));
                                self.buffer.push_back(Ok((
                                    bad_end,
                                    Token::Raw(self.input.as_str(bad_end..range.end)),
                                    range.end,
                                )));
                                return;
                            }
                        }
                    }
                    self.buffer.push_back(Ok((
                        start,
                        Token::from_keyword_or_ident(self.input.as_str(start..end)),
                        end,
                    )));
                    offset = end;
                    continue;
                }
                '@' => self.buffer.push_back(Ok((offset, Token::At, next_offset))),
                '$' => self
                    .buffer
                    .push_back(Ok((offset, Token::Dollar, next_offset))),
                ':' => {
                    // Everything after the colon is the raw pattern to match.
                    self.buffer
                        .push_back(Ok((offset, Token::Colon, next_offset)));
                    let raw = self.input.as_str(next_offset..range.end);
                    self.buffer
                        .push_back(Ok((next_offset, Token::Raw(raw), range.end)));
                    return;
                }
                c if c.is_whitespace() => (),
                unexpected => {
                    self.buffer.push_back(Ok((
                        offset,
                        Token::Error(LexerError::UnexpectedCharacter {
                            span: SourceSpan::from(offset..next_offset),
                            unexpected,
                        }),
                        next_offset,
                    )));
                    self.buffer.push_back(Ok((
                        next_offset,
                        Token::Raw(self.input.as_str(next_offset..range.end)),
                        range.end,
                    )));
                    return;
                }
            }
            offset = next_offset;
        }
    }

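    /// Tokenizes the interior of a numeric match block, which admits a small
    /// expression grammar: identifiers and keywords, integer literals,
    /// `+`/`-`/`==` operators, parentheses, and the `#`, `%`, `.`, `,`, `@`,
    /// `$`, and `:` punctuation tokens.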
    fn tokenize_capture_or_match_numeric(&mut self, range: Range<usize>) {
        let mut chars = self.input.as_str(range).chars().peekable();
        let mut offset = range.start;
        let mut strip_whitespace = true;
        while let Some(c) = chars.next() {
            let mut next_offset = offset + c.len_utf8();
            match c {
                '#' => {
                    strip_whitespace = false;
                    self.buffer
                        .push_back(Ok((offset, Token::Hash, next_offset)));
                }
                '%' => {
                    strip_whitespace = false;
                    self.buffer
                        .push_back(Ok((offset, Token::Percent, next_offset)));
                }
                '.' => {
                    strip_whitespace = false;
                    self.buffer.push_back(Ok((offset, Token::Dot, next_offset)));
                }
                ',' => {
                    self.buffer
                        .push_back(Ok((offset, Token::Comma, next_offset)));
                }
                '+' => {
                    strip_whitespace = true;
                    self.buffer
                        .push_back(Ok((offset, Token::Plus, next_offset)));
                }
                '-' => {
                    strip_whitespace = true;
                    self.buffer
                        .push_back(Ok((offset, Token::Minus, next_offset)));
                }
                '@' => {
                    strip_whitespace = false;
                    self.buffer.push_back(Ok((offset, Token::At, next_offset)));
                }
                '$' => {
                    strip_whitespace = false;
                    self.buffer
                        .push_back(Ok((offset, Token::Dollar, next_offset)));
                }
                '=' if matches!(chars.peek(), Some(&'=')) => {
                    strip_whitespace = true;
                    chars.next();
                    next_offset += '='.len_utf8();
                    self.buffer
                        .push_back(Ok((offset, Token::Equals, next_offset)));
                }
                '(' => {
                    strip_whitespace = true;
                    self.buffer
                        .push_back(Ok((offset, Token::LParen, next_offset)));
                }
                ')' => {
                    strip_whitespace = true;
                    self.buffer
                        .push_back(Ok((offset, Token::RParen, next_offset)));
                }
                ':' => {
                    strip_whitespace = true;
                    self.buffer
                        .push_back(Ok((offset, Token::Colon, next_offset)));
                }
                c if c.is_ascii_alphabetic() || c == '_' => {
                    let mut end = next_offset;
                    while let Some(&c) = chars.peek() {
                        match c {
                            c if c.is_ascii_alphanumeric() || c == '_' => {
                                end += c.len_utf8();
                                chars.next();
                            }
                            _ => break,
                        }
                    }
                    self.buffer.push_back(Ok((
                        offset,
                        Token::from_keyword_or_ident(self.input.as_str(offset..end)),
                        end,
                    )));
                    strip_whitespace = true;
                    offset = end;
                    continue;
                }
                c if c.is_ascii_digit() => {
                    let mut end = next_offset;
                    while let Some(&c) = chars.peek() {
                        if c.is_ascii_digit() {
                            end += c.len_utf8();
                            chars.next();
                        } else {
                            break;
                        }
                    }
                    match self.input.as_str(offset..end).parse::<i64>() {
                        Ok(value) => {
                            self.buffer.push_back(Ok((offset, Token::Num(value), end)));
                        }
                        Err(error) => {
                            self.buffer.push_back(Ok((
                                offset,
                                Token::Error(LexerError::InvalidNumber {
                                    span: SourceSpan::from(offset..end),
                                    error,
                                }),
                                end,
                            )));
                        }
                    }
                    strip_whitespace = true;
                    offset = end;
                    continue;
                }
                c if c.is_ascii_whitespace() && strip_whitespace => (),
                unexpected => {
                    self.buffer.push_back(Ok((
                        offset,
                        Token::Error(LexerError::UnexpectedCharacter {
                            span: SourceSpan::from(offset..next_offset),
                            unexpected,
                        }),
                        next_offset,
                    )));
                    self.buffer.push_back(Ok((
                        next_offset,
                        Token::Raw(self.input.as_str(next_offset..range.end)),
                        range.end,
                    )));
                    return;
                }
            }
            offset = next_offset;
        }
    }
}
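// The iterator adapter collapses line feeds: runs of consecutive `Lf` tokens
// yield a single token, and line feeds before the first real token are
// dropped entirely.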
impl<'input> Iterator for Lexer<'input> {
    type Item = Lexed<'input>;

    #[track_caller]
    fn next(&mut self) -> Option<Self::Item> {
        let mut res = self.lex();
        loop {
            if let Some(Ok((_, Token::Lf, _))) = res.as_ref() {
                // Skip line feeds at the start of the input.
                if self.leading_lf {
                    res = self.lex();
                    continue;
                }
                // Collapse consecutive line feeds into a single token.
                if let Some(Ok((_, Token::Lf, _))) = self.buffer.front() {
                    res = self.lex();
                    continue;
                }
                break;
            } else {
                self.leading_lf = false;
                break;
            }
        }
        res
    }
}