mod delimiter;
mod error;
mod patterns;
mod token;

use std::collections::VecDeque;

use crate::{
    ast::{self, CheckModifier},
    common::*,
    parse::ParserError,
    pattern::search::{AhoCorasickSearcher, RegexSearcher},
};

pub use self::error::LexerError;
pub use self::token::Token;

pub type Lexed<'input> = Result<(usize, Token<'input>, usize), ParserError>;

use self::delimiter::Delimiter;
use self::patterns::{Check, Pattern};

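/// A lexer over the raw input of a check file, producing a stream of `Token`s
/// tagged with their byte offsets.
///
/// Tokens are produced lazily: each call into the lexer tokenizes just enough
/// of the input to yield the next check or comment line, buffering the
/// resulting tokens internally.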
pub struct Lexer<'input> {
    /// The input being tokenized
    input: Input<'input>,
    patterns: Vec<Pattern>,
    check_prefixes: Vec<Arc<str>>,
    seen_prefixes: Vec<bool>,
    regex: Regex,
    searcher: RegexSearcher<'input>,
    cache: regex_automata::meta::Cache,
    captures: regex_automata::util::captures::Captures,
    delimiter_patterns: aho_corasick::AhoCorasick,
    delimiter_searcher: AhoCorasickSearcher<'input>,
    /// Tokens produced so far but not yet consumed by the caller
    buffer: VecDeque<Lexed<'input>>,
    /// Set once the end of the input has been reached
    eof: bool,
    /// True until the first non-newline token has been produced; used to
    /// skip line feeds at the start of the input
    leading_lf: bool,
    /// When false, leading whitespace in a check pattern is trimmed before
    /// the pattern is tokenized
    strict_whitespace: bool,
}
impl<'input> Lexer<'input> {
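    /// Creates a new `Lexer` over `source`, building the prefix and delimiter
    /// searchers from the check and comment prefixes in `config`.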
    pub fn new<S>(source: &'input S, config: &Config) -> Self
    where
        S: SourceFile + ?Sized + 'input,
    {
        let buffer = source.source().as_bytes();
        let input = Input::new(buffer, false);
        let mut patterns =
            Pattern::generate_check_patterns(&config.check_prefixes).collect::<Vec<_>>();
        patterns.extend(Pattern::generate_comment_patterns(&config.comment_prefixes));
        let regex =
            Regex::new_many(&patterns).expect("expected valid prefix searcher configuration");
        let searcher = RegexSearcher::new(input.into());
        let eof = input.is_empty();
        let captures = regex.create_captures();
        let cache = regex.create_cache();

        let mut builder = aho_corasick::AhoCorasickBuilder::new();
        builder
            .match_kind(aho_corasick::MatchKind::LeftmostLongest)
            .start_kind(aho_corasick::StartKind::Both)
            .kind(Some(aho_corasick::AhoCorasickKind::DFA));
        let delimiter_patterns = builder
            .build(Delimiter::ALL)
            .expect("expected valid delimiter searcher configuration");
        let delimiter_searcher = AhoCorasickSearcher::new(input.into());

        Lexer {
            input,
            patterns,
            check_prefixes: config.check_prefixes.to_vec(),
            seen_prefixes: vec![false; config.check_prefixes.len()],
            regex,
            searcher,
            cache,
            captures,
            delimiter_patterns,
            delimiter_searcher,
            buffer: VecDeque::with_capacity(128),
            eof,
            leading_lf: true,
            strict_whitespace: config.strict_whitespace,
        }
    }

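    /// Returns the check prefixes from the configuration that were never seen
    /// anywhere in the input.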
    pub fn unused_prefixes(&self) -> Vec<Arc<str>> {
        self.check_prefixes
            .iter()
            .zip(self.seen_prefixes.iter().copied())
            .filter_map(|(prefix, used)| if used { None } else { Some(prefix.clone()) })
            .collect()
    }

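    /// Returns the current position of the lexer in the input as an empty
    /// `SourceSpan`.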
    pub fn current_offset(&self) -> SourceSpan {
        let at = self.input.start();
        SourceSpan::from(at..at)
    }

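    /// Returns the next token without consuming it, tokenizing more input as
    /// needed. Returns `None` at end of input, or if the next buffered entry
    /// is an error.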
    pub fn peek(&mut self) -> Option<&Token<'input>> {
        loop {
            if !self.buffer.is_empty() {
                break self
                    .buffer
                    .front()
                    .and_then(|lexed| lexed.as_ref().ok().map(|(_, t, _)| t));
            } else if self.eof {
                break None;
            }

            self.tokenize();
        }
    }

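    /// Consumes and returns the next buffered token (or error), tokenizing
    /// more input as needed; returns `None` once the end of input is reached.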
    pub fn lex(&mut self) -> Option<Lexed<'input>> {
        loop {
            if !self.buffer.is_empty() {
                break self.buffer.pop_front();
            } else if self.eof {
                break None;
            }

            self.tokenize();
        }
    }

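    /// Advances through the input until the next check or comment line is
    /// found, pushing the tokens it produces onto the internal buffer.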
    fn tokenize(&mut self) {
        match self.input.peek_byte() {
            (_, b'\0') => {
                self.eof = true;
                return;
            }
            (offset, b'\n') => {
                let next_offset = offset + 1;
                self.input.set_start(next_offset);
                self.buffer.push_back(Ok((offset, Token::Lf, next_offset)));
            }
            _ => (),
        }

        let bytes = self.input.buffer();
        let start = self.input.start();
        let eof = self.input.end();
        let mut word_boundary = self.input.start();
        while word_boundary < eof && bytes[word_boundary].is_ascii_whitespace() {
            word_boundary += 1;
        }
        if start < word_boundary {
            self.input.set_start(word_boundary);
        }

        let start = self.input.start();
        // Keep the prefix searcher in sync with input that has already been consumed
        if self.searcher.input().start() < start {
            self.searcher.set_last_match_end(start);
        }

        let search_result = self.searcher.advance(|input| {
            self.regex
                .search_captures_with(&mut self.cache, input, &mut self.captures);
            Ok(self.captures.get_match())
        });
        if let Some(matched) = search_result {
            let pid = matched.pattern();
            let range = Range::from(matched.range());
            let pattern = &self.patterns[pid.as_usize()];
            let pattern_ty = pattern.ty;
            if let Check::Comment = pattern_ty {
                return self.tokenize_comment(range);
            }

            let prefix_span = self.captures.get_group_by_name("prefix").unwrap();
            let prefix = self.input.as_str(prefix_span.start..prefix_span.end);
            if let Some(index) = self
                .check_prefixes
                .iter()
                .position(|pfx| pfx.as_ref() == prefix)
            {
                self.seen_prefixes[index] = true;
            }
            match pattern_ty {
                Check::Count => {
                    let valid = self.tokenize_check_count_prefix(range);
                    if !valid {
                        self.captures.set_pattern(None);
                        return;
                    }
                }
                ty => {
                    self.buffer.push_back(Ok((
                        range.start,
                        Token::Check(ty.try_into().unwrap()),
                        range.end,
                    )));
                }
            }
            let literal = self.tokenize_optional_modifiers();
            self.buffer
                .push_back(Ok((range.end - 1, Token::Colon, range.end)));
            self.input.set_start(range.end);
            self.tokenize_check_pattern(literal);
            self.captures.set_pattern(None);
        } else {
            // No more check or comment lines remain in the input
            self.input.set_start(self.input.end());
            self.eof = true;
        }
    }

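    /// Emits a token for any recognized check modifier captured by the prefix
    /// pattern (currently only `LITERAL`), returning `true` if the pattern
    /// should be lexed literally.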
    fn tokenize_optional_modifiers(&mut self) -> bool {
        if let Some(span) = self.captures.get_group_by_name("modifiers") {
            if self.input.buffer()[span.start..].starts_with(b"LITERAL") {
                self.buffer.push_back(Ok((
                    span.start,
                    Token::Modifier(CheckModifier::LITERAL),
                    span.end,
                )));
                true
            } else {
                unreachable!("no other modifiers are recognized by the regex pattern")
            }
        } else {
            false
        }
    }

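    /// Tokenizes a comment line, emitting a single `Token::Comment` containing
    /// the comment text (minus one leading space, if present).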
    fn tokenize_comment(&mut self, range: Range<usize>) {
        let span = self.captures.get_group_by_name("comment").unwrap();
        let comment = self.input.as_str(span.start..span.end);
        let comment = comment.strip_prefix(' ').unwrap_or(comment);
        self.buffer.push_back(Ok((
            range.start,
            Token::Comment(Cow::Borrowed(comment)),
            span.end,
        )));
        self.input.set_start(span.end);
        self.captures.set_pattern(None);
    }

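    /// Tokenizes the count portion of a counted check directive, emitting a
    /// plain check token followed by a count modifier. Returns `false` if the
    /// count cannot be parsed, in which case the rest of the line is skipped.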
    fn tokenize_check_count_prefix(&mut self, prefix_range: Range<usize>) -> bool {
        let count_span = self.captures.get_group_by_name("count").unwrap();
        let count = self.input.as_str(count_span.start..count_span.end);
        match count.parse::<u8>() {
            Ok(count) => {
                self.buffer.push_back(Ok((
                    prefix_range.start,
                    Token::Check(ast::Check::Plain),
                    count_span.end,
                )));
                self.buffer.push_back(Ok((
                    prefix_range.start,
                    Token::Modifier(CheckModifier::from_count(count)),
                    count_span.end,
                )));
                true
            }
            Err(error) => {
                let token = Token::Error(LexerError::BadCount {
                    span: SourceSpan::from(count_span.start..count_span.end),
                    error,
                });
                self.buffer
                    .push_back(Ok((count_span.start, token, count_span.end)));
                // The count is invalid, so skip the rest of this line
                let eol = self
                    .input
                    .next_newline_from(prefix_range.end)
                    .unwrap_or_else(|| self.input.end());
                self.input.set_start(eol);
                false
            }
        }
    }

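    /// Tokenizes the pattern portion of a check line, up to the end of the
    /// current line.
    ///
    /// Plain text becomes `Token::Raw` tokens, while substitution and regex
    /// blocks are broken into their delimiter and content tokens. When
    /// `literal` is true, the entire remainder of the line is emitted as a
    /// single raw token.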
    fn tokenize_check_pattern(&mut self, literal: bool) {
        let mut start = self.input.start();
        let eol = self.input.next_newline().unwrap_or(self.input.end());

        if literal {
            let raw = self.input.as_str(start..eol);
            if let Some(raw) = raw.strip_prefix(' ') {
                self.buffer.push_back(Ok((start + 1, Token::Raw(raw), eol)));
            } else {
                self.buffer.push_back(Ok((start, Token::Raw(raw), eol)));
            }
            self.input.set_start(eol);
            return;
        } else if !self.strict_whitespace {
            let raw = self.input.as_str(start..eol);
            let stripped = raw.trim_ascii_start();
            let shift = raw.len().abs_diff(stripped.len());
            start += shift;
            self.input.set_start(start);
        }

        let mut in_match: Option<Span<Delimiter>> = None;
        let mut in_regex: Option<Span<Delimiter>> = None;

        let mut last_delimiter_end = start;
        self.delimiter_searcher.set_range(start..eol);
        let mut is_first = true;
        while let Some(matched) = self
            .delimiter_searcher
            .advance(|input| Ok(self.delimiter_patterns.find(input.clone())))
        {
            let pid = matched.pattern();
            let delim_range = Range::from(matched.range());
            match Delimiter::from_pid(pid.as_usize()) {
                delim @ (Delimiter::MatchStart | Delimiter::NumericMatchStart)
                    if in_match.is_none() && in_regex.is_none() =>
                {
                    in_match = Some(Span::new(delim_range, delim));
                    if delim_range.start > last_delimiter_end {
                        let raw = &self.input.buffer()[last_delimiter_end..delim_range.start];
                        if !raw.iter().all(u8::is_ascii_whitespace) {
                            let content = self.input.as_str(last_delimiter_end..delim_range.start);
                            let content = if is_first {
                                content.strip_prefix(' ').unwrap_or(content)
                            } else {
                                content
                            };
                            self.buffer.push_back(Ok((
                                last_delimiter_end,
                                Token::Raw(content),
                                delim_range.start,
                            )));
                        }
                    }
                    if matches!(delim, Delimiter::NumericMatchStart) {
                        self.buffer.push_back(Ok((
                            delim_range.start,
                            Token::MatchStart,
                            delim_range.end - 1,
                        )));
                        self.buffer.push_back(Ok((
                            delim_range.end - 1,
                            Token::Hash,
                            delim_range.end,
                        )));
                    } else {
                        self.buffer.push_back(Ok((
                            delim_range.start,
                            Token::MatchStart,
                            delim_range.end,
                        )));
                    }
                    is_first = false;
                }
                Delimiter::RegexStart if in_match.is_none() && in_regex.is_none() => {
                    in_regex = Some(Span::new(delim_range, Delimiter::RegexStart));
                    if delim_range.start > last_delimiter_end {
                        let raw = &self.input.buffer()[last_delimiter_end..delim_range.start];
                        if !raw.iter().all(u8::is_ascii_whitespace) {
                            let content = self.input.as_str(last_delimiter_end..delim_range.start);
                            let content = if is_first {
                                content.strip_prefix(' ').unwrap_or(content)
                            } else {
                                content
                            };
                            self.buffer.push_back(Ok((
                                last_delimiter_end,
                                Token::Raw(content),
                                delim_range.start,
                            )));
                        }
                    }
                    self.buffer.push_back(Ok((
                        delim_range.start,
                        Token::RegexStart,
                        delim_range.end,
                    )));
                    is_first = false;
                }
                Delimiter::MatchEnd if in_match.is_some() => {
                    last_delimiter_end = delim_range.end;
                    let match_start = in_match.take().unwrap();
                    if matches!(match_start.into_inner(), Delimiter::NumericMatchStart) {
                        self.tokenize_capture_or_match_numeric(Range::new(
                            match_start.end(),
                            delim_range.start,
                        ));
                    } else {
                        self.tokenize_capture_or_match(Range::new(
                            match_start.end(),
                            delim_range.start,
                        ));
                    }
                    self.buffer.push_back(Ok((
                        delim_range.start,
                        Token::MatchEnd,
                        delim_range.end,
                    )));
                    self.input.set_start(delim_range.end);
                    self.searcher.set_last_match_end(delim_range.end);
                }
                Delimiter::RegexEnd if in_regex.is_some() => {
                    last_delimiter_end = delim_range.end;
                    let regex_start = in_regex.take().unwrap();
                    let pattern_start = regex_start.end();
                    let raw = self.input.as_str(pattern_start..delim_range.start).trim();
                    self.buffer
                        .push_back(Ok((pattern_start, Token::Raw(raw), delim_range.start)));
                    self.buffer.push_back(Ok((
                        delim_range.start,
                        Token::RegexEnd,
                        delim_range.end,
                    )));
                    self.input.set_start(delim_range.end);
                    self.searcher.set_last_match_end(delim_range.end);
                }
                delim @ (Delimiter::RegexEnd | Delimiter::MatchEnd)
                    if in_match.is_none() && in_regex.is_none() =>
                {
                    self.buffer.push_back(Err(ParserError::UnrecognizedToken {
                        span: delim_range.into(),
                        token: AsRef::<str>::as_ref(&delim).to_string(),
                        expected: vec![
                            "literal".to_string(),
                            Token::MatchStart.to_string(),
                            Token::RegexStart.to_string(),
                        ],
                    }));
                }
                _ => continue,
            }
        }

        // Emit any remaining raw content on the line, as long as no block was left unclosed
        if last_delimiter_end < eol && in_match.is_none() && in_regex.is_none() {
            let line = self.input.as_str(last_delimiter_end..eol);
            if !line.trim().is_empty() {
                let line = if is_first {
                    line.strip_prefix(' ').unwrap_or(line)
                } else {
                    line
                };
                self.buffer
                    .push_back(Ok((last_delimiter_end, Token::Raw(line), eol)));
            }
            self.input.set_start(eol);
            self.searcher.set_last_match_end(eol);
            self.delimiter_searcher.set_last_match_end(eol);
        }

        match (in_match, in_regex) {
            (None, None) => (),
            (Some(delim), _) => {
                self.buffer
                    .push_back(Err(ParserError::UnclosedSubstitution {
                        span: delim.span(),
                    }));
            }
            (_, Some(delim)) => {
                self.buffer
                    .push_back(Err(ParserError::UnclosedRegex { span: delim.span() }));
            }
        }
    }

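    /// Tokenizes the contents of a capture or substitution block (the text
    /// between its start and end delimiters), producing identifier, keyword,
    /// and punctuation tokens.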
    fn tokenize_capture_or_match(&mut self, range: Range<usize>) {
        let mut chars = self.input.as_str(range).chars().peekable();
        let mut offset = range.start;
        while let Some(c) = chars.next() {
            let next_offset = offset + c.len_utf8();
            match c {
                c if c.is_ascii_alphabetic() || c == '_' => {
                    let start = offset;
                    let mut end = next_offset;

                    while let Some(&c) = chars.peek() {
                        match c {
                            c if c.is_ascii_alphanumeric() => {
                                end += c.len_utf8();
                                chars.next();
                            }
                            '_' => {
                                end += '_'.len_utf8();
                                chars.next();
                            }
                            c if c.is_whitespace() || c == ':' => {
                                break;
                            }
                            _ => {
                                self.buffer.push_back(Ok((
                                    start,
                                    Token::Error(LexerError::InvalidIdentifier {
                                        span: SourceSpan::from(start..(end + 1)),
                                    }),
                                    end + 1,
                                )));
                                self.buffer.push_back(Ok((
                                    end + 1,
                                    Token::Raw(self.input.as_str((end + 1)..range.end)),
                                    range.end,
                                )));
                                return;
                            }
                        }
                    }
                    self.buffer.push_back(Ok((
                        start,
                        Token::from_keyword_or_ident(self.input.as_str(start..end)),
                        end,
                    )));
                    offset = end;
                    continue;
                }
                '@' => self.buffer.push_back(Ok((offset, Token::At, next_offset))),
                '$' => self
                    .buffer
                    .push_back(Ok((offset, Token::Dollar, next_offset))),
                ':' => {
                    self.buffer
                        .push_back(Ok((offset, Token::Colon, next_offset)));
                    let raw = self.input.as_str(next_offset..range.end);
                    self.buffer
                        .push_back(Ok((offset + 1, Token::Raw(raw), range.end)));
                    return;
                }
                c if c.is_whitespace() => (),
                unexpected => {
                    self.buffer.push_back(Ok((
                        offset,
                        Token::Error(LexerError::UnexpectedCharacter {
                            span: SourceSpan::from(offset..next_offset),
                            unexpected,
                        }),
                        next_offset,
                    )));
                    self.buffer.push_back(Ok((
                        next_offset,
                        Token::Raw(self.input.as_str(next_offset..range.end)),
                        range.end,
                    )));
                    return;
                }
            }
            offset = next_offset;
        }
    }

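    /// Tokenizes the contents of a numeric capture or substitution block,
    /// producing identifier, number, operator, and punctuation tokens.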
    fn tokenize_capture_or_match_numeric(&mut self, range: Range<usize>) {
        let mut chars = self.input.as_str(range).chars().peekable();
        let mut offset = range.start;
        let mut strip_whitespace = true;
        while let Some(c) = chars.next() {
            let mut next_offset = offset + c.len_utf8();
            match c {
                '#' => {
                    strip_whitespace = false;
                    self.buffer
                        .push_back(Ok((offset, Token::Hash, next_offset)));
                }
                '%' => {
                    strip_whitespace = false;
                    self.buffer
                        .push_back(Ok((offset, Token::Percent, next_offset)));
                }
                '.' => {
                    strip_whitespace = false;
                    self.buffer.push_back(Ok((offset, Token::Dot, next_offset)));
                }
                ',' => {
                    self.buffer
                        .push_back(Ok((offset, Token::Comma, next_offset)));
                }
                '+' => {
                    strip_whitespace = true;
                    self.buffer
                        .push_back(Ok((offset, Token::Plus, next_offset)));
                }
                '-' => {
                    strip_whitespace = true;
                    self.buffer
                        .push_back(Ok((offset, Token::Minus, next_offset)));
                }
                '@' => {
                    strip_whitespace = false;
                    self.buffer.push_back(Ok((offset, Token::At, next_offset)));
                }
                '$' => {
                    strip_whitespace = false;
                    self.buffer
                        .push_back(Ok((offset, Token::Dollar, next_offset)));
                }
                '=' if matches!(chars.peek(), Some(&'=')) => {
                    strip_whitespace = true;
                    chars.next();
                    next_offset += '='.len_utf8();
                    self.buffer
                        .push_back(Ok((offset, Token::Equals, next_offset)));
                }
                '(' => {
                    strip_whitespace = true;
                    self.buffer
                        .push_back(Ok((offset, Token::LParen, next_offset)));
                }
                ')' => {
                    strip_whitespace = true;
                    self.buffer
                        .push_back(Ok((offset, Token::RParen, next_offset)));
                }
                ':' => {
                    strip_whitespace = true;
                    self.buffer
                        .push_back(Ok((offset, Token::Colon, next_offset)));
                }
                c if c.is_ascii_alphabetic() || c == '_' => {
                    let mut end = next_offset;
                    while let Some(&c) = chars.peek() {
                        match c {
                            c if c.is_ascii_alphanumeric() => {
                                end += c.len_utf8();
                                chars.next();
                            }
                            '_' => {
                                end += c.len_utf8();
                                chars.next();
                            }
                            _ => break,
                        }
                    }
                    self.buffer.push_back(Ok((
                        offset,
                        Token::from_keyword_or_ident(self.input.as_str(offset..end)),
                        end,
                    )));
                    strip_whitespace = true;
                    offset = end;
                    continue;
                }
                c if c.is_ascii_digit() => {
                    let mut end = next_offset;
                    while let Some(&c) = chars.peek() {
                        match c {
                            c if c.is_ascii_digit() => {
                                end += 1;
                                chars.next();
                            }
                            _ => break,
                        }
                    }
                    match self.input.as_str(offset..end).parse::<i64>() {
                        Ok(value) => {
                            self.buffer.push_back(Ok((offset, Token::Num(value), end)));
                        }
                        Err(err) => {
                            self.buffer.push_back(Ok((
                                offset,
                                Token::Error(LexerError::InvalidNumber {
                                    span: SourceSpan::from(offset..end),
                                    error: err,
                                }),
                                end,
                            )));
                        }
                    }
                    strip_whitespace = true;
                    offset = end;
                    continue;
                }
                c if c.is_ascii_whitespace() && strip_whitespace => (),
                unexpected => {
                    self.buffer.push_back(Ok((
                        offset,
                        Token::Error(LexerError::UnexpectedCharacter {
                            span: SourceSpan::from(offset..next_offset),
                            unexpected,
                        }),
                        next_offset,
                    )));
                    self.buffer.push_back(Ok((
                        next_offset,
                        Token::Raw(self.input.as_str(next_offset..range.end)),
                        range.end,
                    )));
                    return;
                }
            }
            offset = next_offset;
        }
    }
}
impl<'input> Iterator for Lexer<'input> {
    type Item = Lexed<'input>;

    #[track_caller]
    fn next(&mut self) -> Option<Self::Item> {
        let mut res = self.lex();
        loop {
            if let Some(Ok((_, Token::Lf, _))) = res.as_ref() {
                if self.leading_lf {
                    // Skip line feeds at the start of the input
                    res = self.lex();
                    continue;
                }
                if let Some(Ok((_, Token::Lf, _))) = self.buffer.front() {
                    // Collapse runs of consecutive line feeds into a single token
                    res = self.lex();
                    continue;
                }
                break;
            } else {
                self.leading_lf = false;
                break;
            }
        }
        res
    }
}