litcheck_filecheck/parse/lexer/mod.rs

mod delimiter;
mod error;
mod patterns;
mod token;

use std::collections::VecDeque;

use crate::{
    ast::{self, CheckModifier},
    common::*,
    parse::ParserError,
    pattern::search::{AhoCorasickSearcher, RegexSearcher},
};

pub use self::error::LexerError;
pub use self::token::Token;

/// The value produced by the [Lexer] when iterated
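///
/// Each item is a `(start, token, end)` triple, where `start` and `end` are byte
/// offsets into the source being tokenized.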
pub type Lexed<'input> = Result<(usize, Token<'input>, usize), ParserError>;

use self::delimiter::Delimiter;
use self::patterns::{Check, Pattern};

/// The lexer that is used to tokenize a check file
pub struct Lexer<'input> {
    input: Input<'input>,
    patterns: Vec<Pattern>,
    check_prefixes: Vec<Arc<str>>,
    seen_prefixes: Vec<bool>,
    regex: Regex,
    searcher: RegexSearcher<'input>,
    cache: regex_automata::meta::Cache,
    captures: regex_automata::util::captures::Captures,
    delimiter_patterns: aho_corasick::AhoCorasick,
    delimiter_searcher: AhoCorasickSearcher<'input>,
    /// The token buffer, used when tokenizing lines
    buffer: VecDeque<Lexed<'input>>,
    /// Set to true once we have reached true end-of-file; from that point on, the
    /// only token produced is `Token::Eof` (or `None`, depending on how you are
    /// consuming the lexer).
    eof: bool,
    /// True from the start of parsing until we observe a token other than Lf or
    /// Eof, so that all leading empty lines (which would be ignored anyway) are
    /// trimmed. This simplifies parsing by ensuring the rules only need to
    /// account for actual content lines in the file.
    leading_lf: bool,
    /// Is --strict-whitespace enabled
    strict_whitespace: bool,
}
impl<'input> Lexer<'input> {
    /// Produces an instance of the lexer, with lexical analysis to be performed on the `source`
    /// string. Note that no lexical analysis occurs until the lexer has been iterated over.
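    ///
    /// # Example
    ///
    /// An illustrative sketch (assumes `Config` implements `Default` and that `str`
    /// implements `SourceFile`; adjust to the crate's actual APIs):
    ///
    /// ```ignore
    /// let config = Config::default();
    /// let mut lexer = Lexer::new("; CHECK: foo", &config);
    /// while let Some(token) = lexer.lex() {
    ///     println!("{token:?}");
    /// }
    /// ```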
    pub fn new<S>(source: &'input S, config: &Config) -> Self
    where
        S: SourceFile + ?Sized + 'input,
    {
        let buffer = source.source().as_bytes();
        let input = Input::new(buffer, false);
        let mut patterns =
            Pattern::generate_check_patterns(&config.check_prefixes).collect::<Vec<_>>();
        patterns.extend(Pattern::generate_comment_patterns(&config.comment_prefixes));
        let regex =
            Regex::new_many(&patterns).expect("expected valid prefix searcher configuration");
        let searcher = RegexSearcher::new(input.into());
        let eof = input.is_empty();
        let captures = regex.create_captures();
        let cache = regex.create_cache();

        let mut builder = aho_corasick::AhoCorasickBuilder::new();
        builder
            .match_kind(aho_corasick::MatchKind::LeftmostLongest)
            .start_kind(aho_corasick::StartKind::Both)
            .kind(Some(aho_corasick::AhoCorasickKind::DFA));
        let delimiter_patterns = builder
            .build(Delimiter::ALL)
            .expect("expected delimiter searcher configuration");
        let delimiter_searcher = AhoCorasickSearcher::new(input.into());

        Lexer {
            input,
            patterns,
            check_prefixes: config.check_prefixes.to_vec(),
            seen_prefixes: vec![false; config.check_prefixes.len()],
            regex,
            searcher,
            cache,
            captures,
            delimiter_patterns,
            delimiter_searcher,
            buffer: VecDeque::with_capacity(128),
            eof,
            leading_lf: true,
            strict_whitespace: config.strict_whitespace,
        }
    }

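    /// Returns the check prefixes which were never matched while tokenizing the input.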
    pub fn unused_prefixes(&self) -> Vec<Arc<str>> {
        self.check_prefixes
            .iter()
            .zip(self.seen_prefixes.iter().copied())
            .filter_map(|(prefix, used)| if used { None } else { Some(prefix.clone()) })
            .collect()
    }

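    /// Returns an empty `SourceSpan` located at the lexer's current position in the input.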
    pub fn current_offset(&self) -> SourceSpan {
        let at = self.input.start();
        SourceSpan::from(at..at)
    }

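    /// Returns a reference to the next token without consuming it, tokenizing more of the
    /// input as needed. Returns `None` at the end of the input, or if the next buffered
    /// item is an error rather than a token.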
    pub fn peek(&mut self) -> Option<&Token<'input>> {
        loop {
            if !self.buffer.is_empty() {
                break self
                    .buffer
                    .front()
                    .and_then(|lexed| lexed.as_ref().ok().map(|(_, t, _)| t));
            } else if self.eof {
                break None;
            }

            self.tokenize();
        }
    }

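    /// Returns the next buffered [Lexed] result, tokenizing more of the input on demand.
    /// Returns `None` once the end of the input has been reached.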
    pub fn lex(&mut self) -> Option<Lexed<'input>> {
        loop {
            if !self.buffer.is_empty() {
                break self.buffer.pop_front();
            } else if self.eof {
                break None;
            }

            self.tokenize();
        }
    }

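    /// Refills the token buffer by tokenizing the next portion of the input.
    ///
    /// Skips leading whitespace, then searches for the next check or comment prefix. When
    /// one is found, the directive's tokens (check kind, optional modifiers, colon, and the
    /// pattern itself) are pushed into the buffer; when none is found, the rest of the
    /// input contains no check lines and the lexer is marked as having reached end-of-file.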
    fn tokenize(&mut self) {
        match self.input.peek_byte() {
            (_, b'\0') => {
                self.eof = true;
                return;
            }
            (offset, b'\n') => {
                let next_offset = offset + 1;
                self.input.set_start(next_offset);
                self.buffer.push_back(Ok((offset, Token::Lf, next_offset)));
            }
            _ => (),
        }

        let bytes = self.input.buffer();
        let start = self.input.start();
        let eof = self.input.end();
        let mut word_boundary = self.input.start();
        while word_boundary < eof && bytes[word_boundary].is_ascii_whitespace() {
            word_boundary += 1;
        }
        if start < word_boundary {
            self.input.set_start(word_boundary);
        }

        // Force the searcher forward if necessary to ensure it is caught up to our view of the input
        let start = self.input.start();
        if self.searcher.input().start() < start {
            self.searcher.set_last_match_end(start);
        }

        let search_result = self.searcher.advance(|input| {
            self.regex
                .search_captures_with(&mut self.cache, input, &mut self.captures);
            Ok(self.captures.get_match())
        });
        if let Some(matched) = search_result {
            let pid = matched.pattern();
            let range = Range::from(matched.range());
            let pattern = &self.patterns[pid.as_usize()];
            let pattern_ty = pattern.ty;
            if let Check::Comment = pattern_ty {
                return self.tokenize_comment(range);
            }

            let prefix_span = self.captures.get_group_by_name("prefix").unwrap();
            let prefix = self.input.as_str(prefix_span.start..prefix_span.end);
            if let Some(index) = self
                .check_prefixes
                .iter()
                .position(|pfx| pfx.as_ref() == prefix)
            {
                self.seen_prefixes[index] = true;
            }
            match pattern_ty {
                Check::Count => {
                    let valid = self.tokenize_check_count_prefix(range);
                    if !valid {
                        self.captures.set_pattern(None);
                        return;
                    }
                }
                ty => {
                    self.buffer.push_back(Ok((
                        range.start,
                        Token::Check(ty.try_into().unwrap()),
                        range.end,
                    )));
                }
            }
            let literal = self.tokenize_optional_modifiers();
            self.buffer
                .push_back(Ok((range.end - 1, Token::Colon, range.end)));
            self.input.set_start(range.end);
            self.tokenize_check_pattern(literal);
            self.captures.set_pattern(None);
        } else {
            // There are no more check lines in the input
            self.input.set_start(self.input.end());
            self.eof = true;
        }
    }

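    /// Emits a modifier token when the optional `LITERAL` modifier was captured as part of
    /// the check prefix, returning `true` if it was present.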
    fn tokenize_optional_modifiers(&mut self) -> bool {
        if let Some(span) = self.captures.get_group_by_name("modifiers") {
            if self.input.buffer()[span.start..].starts_with(b"LITERAL") {
                self.buffer.push_back(Ok((
                    span.start,
                    Token::Modifier(CheckModifier::LITERAL),
                    span.end,
                )));
                true
            } else {
                unreachable!("no other modifiers are recognized by the regex pattern")
            }
        } else {
            false
        }
    }

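    /// Emits a comment token for the text captured after a comment prefix, then advances the
    /// input past the end of the comment.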
    fn tokenize_comment(&mut self, range: Range<usize>) {
        // Find the next newline, and take all the input up to that point
        let span = self.captures.get_group_by_name("comment").unwrap();
        let comment = self.input.as_str(span.start..span.end);
        let comment = comment.strip_prefix(' ').unwrap_or(comment);
        self.buffer.push_back(Ok((
            range.start,
            Token::Comment(Cow::Borrowed(comment)),
            span.end,
        )));
        self.input.set_start(span.end);
        self.captures.set_pattern(None);
    }

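    /// Handles a count-style check prefix (e.g. `CHECK-COUNT-2`) by parsing the captured
    /// count and emitting a plain check token followed by a count modifier token. Returns
    /// `false`, and skips to the end of the line, if the count is not a valid number.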
    fn tokenize_check_count_prefix(&mut self, prefix_range: Range<usize>) -> bool {
        let count_span = self.captures.get_group_by_name("count").unwrap();
        let count = self.input.as_str(count_span.start..count_span.end);
        match count.parse::<u8>() {
            Ok(count) => {
                self.buffer.push_back(Ok((
                    prefix_range.start,
                    Token::Check(ast::Check::Plain),
                    count_span.end,
                )));
                self.buffer.push_back(Ok((
                    prefix_range.start,
                    Token::Modifier(CheckModifier::from_count(count)),
                    count_span.end,
                )));
                true
            }
            Err(error) => {
                let token = Token::Error(LexerError::BadCount {
                    span: SourceSpan::from(count_span.start..count_span.end),
                    error,
                });
                self.buffer
                    .push_back(Ok((count_span.start, token, count_span.end)));
                // Skip to end of line
                let eol = self
                    .input
                    .next_newline_from(prefix_range.end)
                    .unwrap_or_else(|| self.input.end());
                self.input.set_start(eol);
                false
            }
        }
    }

    /// We've successfully parsed a CHECK directive.
    ///
    /// We're parsing the rest of the line as a CHECK pattern, so we need to look
    /// for `[[` `]]` and `{{` `}}` markers indicating substitution/capture and regex
    /// matches respectively.
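    ///
    /// For example, a pattern such as `foo [[REG:r[0-9]+]] = {{add|sub}}` is sliced into a
    /// raw `foo ` prefix, a `[[ ... ]]` substitution/capture block, a raw ` = ` segment, and
    /// a `{{ ... }}` regex block (illustrative only; the exact token spans depend on the
    /// surrounding directive and whitespace handling).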
    fn tokenize_check_pattern(&mut self, literal: bool) {
        let mut start = self.input.start();
        let eol = self.input.next_newline().unwrap_or(self.input.end());

        if literal {
            let raw = self.input.as_str(start..eol);
            if let Some(raw) = raw.strip_prefix(' ') {
                self.buffer.push_back(Ok((start + 1, Token::Raw(raw), eol)));
            } else {
                self.buffer.push_back(Ok((start, Token::Raw(raw), eol)));
            }
            self.input.set_start(eol);
            return;
        } else if !self.strict_whitespace {
            // If --strict-whitespace is not in use, remove leading whitespace
            let raw = self.input.as_str(start..eol);
            let stripped = raw.trim_ascii_start();
            let shift = raw.len().abs_diff(stripped.len());
            start += shift;
            self.input.set_start(start);
        }

        let mut in_match: Option<Span<Delimiter>> = None;
        let mut in_regex: Option<Span<Delimiter>> = None;

        let mut last_delimiter_end = start;
        self.delimiter_searcher.set_range(start..eol);
        let mut is_first = true;
        while let Some(matched) = self
            .delimiter_searcher
            .advance(|input| Ok(self.delimiter_patterns.find(input.clone())))
        {
            let pid = matched.pattern();
            let delim_range = Range::from(matched.range());
            match Delimiter::from_pid(pid.as_usize()) {
                delim @ (Delimiter::MatchStart | Delimiter::NumericMatchStart)
                    if in_match.is_none() && in_regex.is_none() =>
                {
                    in_match = Some(Span::new(delim_range, delim));
                    if delim_range.start > last_delimiter_end {
                        let raw = &self.input.buffer()[last_delimiter_end..delim_range.start];
                        if !raw.iter().all(u8::is_ascii_whitespace) {
                            let content = self.input.as_str(last_delimiter_end..delim_range.start);
                            let content = if is_first {
                                content.strip_prefix(' ').unwrap_or(content)
                            } else {
                                content
                            };
                            self.buffer.push_back(Ok((
                                last_delimiter_end,
                                Token::Raw(content),
                                delim_range.start,
                            )));
                        }
                    }
                    if matches!(delim, Delimiter::NumericMatchStart) {
                        self.buffer.push_back(Ok((
                            delim_range.start,
                            Token::MatchStart,
                            delim_range.end - 1,
                        )));
                        self.buffer.push_back(Ok((
                            delim_range.end - 1,
                            Token::Hash,
                            delim_range.end,
                        )));
                    } else {
                        self.buffer.push_back(Ok((
                            delim_range.start,
                            Token::MatchStart,
                            delim_range.end,
                        )));
                    }
                    is_first = false;
                }
                Delimiter::RegexStart if in_match.is_none() && in_regex.is_none() => {
                    in_regex = Some(Span::new(delim_range, Delimiter::RegexStart));
                    if delim_range.start > last_delimiter_end {
                        let raw = &self.input.buffer()[last_delimiter_end..delim_range.start];
                        if !raw.iter().all(u8::is_ascii_whitespace) {
                            let content = self.input.as_str(last_delimiter_end..delim_range.start);
                            let content = if is_first {
                                content.strip_prefix(' ').unwrap_or(content)
                            } else {
                                content
                            };
                            self.buffer.push_back(Ok((
                                last_delimiter_end,
                                Token::Raw(content),
                                delim_range.start,
                            )));
                        }
                    }
                    self.buffer.push_back(Ok((
                        delim_range.start,
                        Token::RegexStart,
                        delim_range.end,
                    )));
                    is_first = false;
                }
                Delimiter::MatchEnd if in_match.is_some() => {
                    last_delimiter_end = delim_range.end;
                    let match_start = in_match.take().unwrap();
                    if matches!(match_start.into_inner(), Delimiter::NumericMatchStart) {
                        self.tokenize_capture_or_match_numeric(Range::new(
                            match_start.end(),
                            delim_range.start,
                        ));
                    } else {
                        self.tokenize_capture_or_match(Range::new(
                            match_start.end(),
                            delim_range.start,
                        ));
                    }
                    self.buffer.push_back(Ok((
                        delim_range.start,
                        Token::MatchEnd,
                        delim_range.end,
                    )));
                    self.input.set_start(delim_range.end);
                    self.searcher.set_last_match_end(delim_range.end);
                }
                Delimiter::RegexEnd if in_regex.is_some() => {
                    last_delimiter_end = delim_range.end;
                    let regex_start = in_regex.take().unwrap();
                    let pattern_start = regex_start.end();
                    let raw = self.input.as_str(pattern_start..delim_range.start).trim();
                    self.buffer
                        .push_back(Ok((pattern_start, Token::Raw(raw), delim_range.start)));
                    self.buffer.push_back(Ok((
                        delim_range.start,
                        Token::RegexEnd,
                        delim_range.end,
                    )));
                    self.input.set_start(delim_range.end);
                    self.searcher.set_last_match_end(delim_range.end);
                }
                delim @ (Delimiter::RegexEnd | Delimiter::MatchEnd)
                    if in_match.is_none() && in_regex.is_none() =>
                {
                    self.buffer.push_back(Err(ParserError::UnrecognizedToken {
                        span: delim_range.into(),
                        token: AsRef::<str>::as_ref(&delim).to_string(),
                        expected: vec![
                            "literal".to_string(),
                            Token::MatchStart.to_string(),
                            Token::RegexStart.to_string(),
                        ],
                    }));
                }
                _ => continue,
            }
        }

        // The line has been sliced up into delimited parts, but we still have to handle any trailing content
        if last_delimiter_end < eol && in_match.is_none() && in_regex.is_none() {
            let line = self.input.as_str(last_delimiter_end..eol);
            if !line.trim().is_empty() {
                let line = if is_first {
                    line.strip_prefix(' ').unwrap_or(line)
                } else {
                    line
                };
                self.buffer
                    .push_back(Ok((last_delimiter_end, Token::Raw(line), eol)));
            }
            self.input.set_start(eol);
            self.searcher.set_last_match_end(eol);
            self.delimiter_searcher.set_last_match_end(eol);
        }

        // Handle unclosed delimiters
        match (in_match, in_regex) {
            (None, None) => (),
            (Some(delim), _) => {
                // Unclosed match start, i.e. [[ or [[#
                self.buffer
                    .push_back(Err(ParserError::UnclosedSubstitution {
                        span: delim.span(),
                    }));
            }
            (_, Some(delim)) => {
                // Unclosed regex start, i.e. {{
                self.buffer
                    .push_back(Err(ParserError::UnclosedRegex { span: delim.span() }));
            }
        }
    }

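    /// Tokenizes the contents of a `[[ ... ]]` substitution/capture block: an identifier or
    /// keyword, the `@` and `$` sigils, and an optional `:` followed by the remainder of the
    /// block as a raw pattern.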
    fn tokenize_capture_or_match(&mut self, range: Range<usize>) {
        let mut chars = self.input.as_str(range).chars().peekable();
        let mut offset = range.start;
        while let Some(c) = chars.next() {
            let next_offset = offset + c.len_utf8();
            match c {
                c if c.is_ascii_alphabetic() || c == '_' => {
                    let start = offset;
                    let mut end = next_offset;

                    while let Some(&c) = chars.peek() {
                        match c {
                            c if c.is_ascii_alphanumeric() => {
                                end += c.len_utf8();
                                chars.next();
                            }
                            '_' => {
                                end += '_'.len_utf8();
                                chars.next();
                            }
                            c if c.is_whitespace() || c == ':' => {
                                break;
                            }
                            _ => {
                                self.buffer.push_back(Ok((
                                    start,
                                    Token::Error(LexerError::InvalidIdentifier {
                                        span: SourceSpan::from(start..(end + 1)),
                                    }),
                                    end + 1,
                                )));
                                self.buffer.push_back(Ok((
                                    end + 1,
                                    Token::Raw(self.input.as_str((end + 1)..range.end)),
                                    range.end,
                                )));
                                return;
                            }
                        }
                    }
                    self.buffer.push_back(Ok((
                        start,
                        Token::from_keyword_or_ident(self.input.as_str(start..end)),
                        end,
                    )));
                    offset = end;
                    continue;
                }
                '@' => self.buffer.push_back(Ok((offset, Token::At, next_offset))),
                '$' => self
                    .buffer
                    .push_back(Ok((offset, Token::Dollar, next_offset))),
                ':' => {
                    self.buffer
                        .push_back(Ok((offset, Token::Colon, next_offset)));
                    // The remainder of the match block is raw
                    let raw = self.input.as_str(next_offset..range.end);
                    self.buffer
                        .push_back(Ok((offset + 1, Token::Raw(raw), range.end)));
                    return;
                }
                c if c.is_whitespace() => (),
                unexpected => {
                    self.buffer.push_back(Ok((
                        offset,
                        Token::Error(LexerError::UnexpectedCharacter {
                            span: SourceSpan::from(offset..next_offset),
                            unexpected,
                        }),
                        next_offset,
                    )));
                    self.buffer.push_back(Ok((
                        next_offset,
                        Token::Raw(self.input.as_str(next_offset..range.end)),
                        range.end,
                    )));
                    return;
                }
            }
            offset = next_offset;
        }
    }

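    /// Tokenizes the contents of a numeric `[[# ... ]]` block: identifiers, integer literals,
    /// and the punctuation of a numeric substitution expression (`#`, `%`, `.`, `,`, `+`, `-`,
    /// `@`, `$`, `==`, parentheses, and `:`).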
    fn tokenize_capture_or_match_numeric(&mut self, range: Range<usize>) {
        let mut chars = self.input.as_str(range).chars().peekable();
        let mut offset = range.start;
        let mut strip_whitespace = true;
        while let Some(c) = chars.next() {
            let mut next_offset = offset + c.len_utf8();
            match c {
                '#' => {
                    strip_whitespace = false;
                    self.buffer
                        .push_back(Ok((offset, Token::Hash, next_offset)));
                }
                '%' => {
                    strip_whitespace = false;
                    self.buffer
                        .push_back(Ok((offset, Token::Percent, next_offset)));
                }
                '.' => {
                    strip_whitespace = false;
                    self.buffer.push_back(Ok((offset, Token::Dot, next_offset)));
                }
                ',' => {
                    self.buffer
                        .push_back(Ok((offset, Token::Comma, next_offset)));
                }
                '+' => {
                    strip_whitespace = true;
                    self.buffer
                        .push_back(Ok((offset, Token::Plus, next_offset)));
                }
                '-' => {
                    strip_whitespace = true;
                    // Assumes a `Token::Minus` variant exists for the `-` operator in
                    // numeric substitution expressions.
                    self.buffer
                        .push_back(Ok((offset, Token::Minus, next_offset)));
                }
                '@' => {
                    strip_whitespace = false;
                    self.buffer.push_back(Ok((offset, Token::At, next_offset)));
                }
                '$' => {
                    strip_whitespace = false;
                    self.buffer
                        .push_back(Ok((offset, Token::Dollar, next_offset)));
                }
                '=' if matches!(chars.peek(), Some(&'=')) => {
                    strip_whitespace = true;
                    chars.next();
                    next_offset += '='.len_utf8();
                    self.buffer
                        .push_back(Ok((offset, Token::Equals, next_offset)));
                }
                '(' => {
                    strip_whitespace = true;
                    self.buffer
                        .push_back(Ok((offset, Token::LParen, next_offset)));
                }
                ')' => {
                    strip_whitespace = true;
                    self.buffer
                        .push_back(Ok((offset, Token::RParen, next_offset)));
                }
                ':' => {
                    strip_whitespace = true;
                    self.buffer
                        .push_back(Ok((offset, Token::Colon, next_offset)));
                }
                c if c.is_ascii_alphabetic() || c == '_' => {
                    let mut end = next_offset;
                    while let Some(&c) = chars.peek() {
                        match c {
                            c if c.is_ascii_alphanumeric() => {
                                end += c.len_utf8();
                                chars.next();
                            }
                            '_' => {
                                end += c.len_utf8();
                                chars.next();
                            }
                            _ => break,
                        }
                    }
                    self.buffer.push_back(Ok((
                        offset,
                        Token::from_keyword_or_ident(self.input.as_str(offset..end)),
                        end,
                    )));
                    strip_whitespace = true;
                    offset = end;
                    continue;
                }
                c if c.is_ascii_digit() => {
                    let mut end = next_offset;
                    while let Some(&c) = chars.peek() {
                        match c {
                            c if c.is_ascii_digit() => {
                                end += 1;
                                chars.next();
                            }
                            _ => break,
                        }
                    }
                    match self.input.as_str(offset..end).parse::<i64>() {
                        Ok(value) => {
                            self.buffer.push_back(Ok((offset, Token::Num(value), end)));
                        }
                        Err(err) => {
                            self.buffer.push_back(Ok((
                                offset,
                                Token::Error(LexerError::InvalidNumber {
                                    span: SourceSpan::from(offset..end),
                                    error: err,
                                }),
                                end,
                            )));
                        }
                    }
                    strip_whitespace = true;
                    offset = end;
                    continue;
                }
                c if c.is_ascii_whitespace() && strip_whitespace => (),
                unexpected => {
                    self.buffer.push_back(Ok((
                        offset,
                        Token::Error(LexerError::UnexpectedCharacter {
                            span: SourceSpan::from(offset..next_offset),
                            unexpected,
                        }),
                        next_offset,
                    )));
                    self.buffer.push_back(Ok((
                        next_offset,
                        Token::Raw(self.input.as_str(next_offset..range.end)),
                        range.end,
                    )));
                    return;
                }
            }
            offset = next_offset;
        }
    }
}
impl<'input> Iterator for Lexer<'input> {
    type Item = Lexed<'input>;

    #[track_caller]
    fn next(&mut self) -> Option<Self::Item> {
        let mut res = self.lex();
        loop {
            if let Some(Ok((_, Token::Lf, _))) = res.as_ref() {
                // Drop leading newlines
                if self.leading_lf {
                    res = self.lex();
                    continue;
                }
                // Collapse newlines into last newline token
                if let Some(Ok((_, Token::Lf, _))) = self.buffer.front() {
                    res = self.lex();
                    continue;
                }
                break;
            } else {
                self.leading_lf = false;
                break;
            }
        }
        res
    }
}