litcheck_filecheck/parse/lexer/mod.rs

mod delimiter;
mod error;
mod patterns;
mod token;

use std::collections::VecDeque;

use crate::{
    ast::{self, CheckModifier},
    common::*,
    parse::ParserError,
    pattern::search::{AhoCorasickSearcher, RegexSearcher},
};

pub use self::error::LexerError;
pub use self::token::Token;

/// The value produced by the [Lexer] when iterated
pub type Lexed<'input> = Result<(usize, Token<'input>, usize), ParserError>;

use self::delimiter::Delimiter;
use self::patterns::{Check, Pattern};

/// The lexer used to tokenize a check file
pub struct Lexer<'input> {
    input: Input<'input>,
    patterns: Vec<Pattern>,
    check_prefixes: Vec<Arc<str>>,
    seen_prefixes: Vec<bool>,
    regex: Regex,
    searcher: RegexSearcher<'input>,
    cache: regex_automata::meta::Cache,
    captures: regex_automata::util::captures::Captures,
    delimiter_patterns: aho_corasick::AhoCorasick,
    delimiter_searcher: AhoCorasickSearcher<'input>,
    /// Set to true once we reach true end-of-file; from that point on, the
    /// only token produced is Token::Eof (or None, depending on how the
    /// lexer is being consumed)
    eof: bool,
    /// True from the start of parsing until we observe a token which is not
    /// Lf or Eof, so that we trim all leading empty lines, which would be
    /// ignored anyway. This simplifies parsing by ensuring the rules only
    /// need to account for actual content lines in the file.
    leading_lf: bool,
    /// The token buffer, used when tokenizing lines
    buffer: VecDeque<Lexed<'input>>,
}
impl<'input> Lexer<'input> {
    /// Produces a [Lexer] which will perform lexical analysis of the given
    /// `source`. Note that no lexical analysis occurs until the lexer is iterated.
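    ///
    /// # Example
    ///
    /// A minimal sketch of constructing and draining the lexer (marked
    /// `ignore` since `source` here stands in for any concrete [SourceFile]
    /// implementation):
    ///
    /// ```ignore
    /// let check_prefixes = [Arc::from("CHECK")];
    /// let comment_prefixes = [Arc::from("COM"), Arc::from("RUN")];
    /// let mut lexer = Lexer::new(source, &check_prefixes, &comment_prefixes);
    /// while let Some(lexed) = lexer.next() {
    ///     let (start, token, end) = lexed?;
    ///     // ...
    /// }
    /// ```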
    pub fn new<S>(
        source: &'input S,
        check_prefixes: &[Arc<str>],
        comment_prefixes: &[Arc<str>],
    ) -> Self
    where
        S: SourceFile + ?Sized + 'input,
    {
        let buffer = source.source().as_bytes();
        let input = Input::new(buffer, false);
        let mut patterns = Pattern::generate_check_patterns(check_prefixes).collect::<Vec<_>>();
        patterns.extend(Pattern::generate_comment_patterns(comment_prefixes));
        let regex =
            Regex::new_many(&patterns).expect("expected valid prefix searcher configuration");
        let searcher = RegexSearcher::new(input.into());
        let eof = input.is_empty();
        let captures = regex.create_captures();
        let cache = regex.create_cache();

        let mut builder = aho_corasick::AhoCorasickBuilder::new();
        builder
            .match_kind(aho_corasick::MatchKind::LeftmostLongest)
            .start_kind(aho_corasick::StartKind::Both)
            .kind(Some(aho_corasick::AhoCorasickKind::DFA));
        let delimiter_patterns = builder
            .build(Delimiter::ALL)
            .expect("expected valid delimiter searcher configuration");
        let delimiter_searcher = AhoCorasickSearcher::new(input.into());

        Lexer {
            input,
            patterns,
            check_prefixes: check_prefixes.to_vec(),
            seen_prefixes: vec![false; check_prefixes.len()],
            regex,
            searcher,
            cache,
            captures,
            delimiter_patterns,
            delimiter_searcher,
            eof,
            leading_lf: true,
            buffer: VecDeque::with_capacity(128),
        }
    }

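    /// Returns the check prefixes which were never matched in the input.
    ///
    /// This can be used to diagnose prefixes that were requested but never
    /// appear in the check file, e.g. due to a typo.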
    pub fn unused_prefixes(&self) -> Vec<Arc<str>> {
        self.check_prefixes
            .iter()
            .zip(self.seen_prefixes.iter().copied())
            .filter_map(|(prefix, used)| if used { None } else { Some(prefix.clone()) })
            .collect()
    }

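    /// Returns an empty [SourceSpan] at the lexer's current position in the input.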
    pub fn current_offset(&self) -> SourceSpan {
        let at = self.input.start();
        SourceSpan::from(at..at)
    }

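    /// Peeks at the next token without consuming it.
    ///
    /// Returns `None` at end of input, or when the next buffered item is an
    /// error rather than a token.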
    pub fn peek(&mut self) -> Option<&Token<'input>> {
        loop {
            if !self.buffer.is_empty() {
                break self
                    .buffer
                    .front()
                    .and_then(|lexed| lexed.as_ref().ok().map(|(_, t, _)| t));
            } else if self.eof {
                break None;
            }

            self.tokenize();
        }
    }

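    /// Consumes and returns the next [Lexed] result, tokenizing more of the
    /// input as needed to refill the buffer.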
    pub fn lex(&mut self) -> Option<Lexed<'input>> {
        loop {
            if !self.buffer.is_empty() {
                break self.buffer.pop_front();
            } else if self.eof {
                break None;
            }

            self.tokenize();
        }
    }

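    /// Tokenizes the next check directive (or comment) in the input, pushing
    /// the resulting tokens into the internal buffer.
    ///
    /// If no more check lines remain, marks the lexer as having reached Eof.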
    fn tokenize(&mut self) {
        match self.input.peek_byte() {
            (_, b'\0') => {
                self.eof = true;
                return;
            }
            (offset, b'\n') => {
                let next_offset = offset + 1;
                self.input.set_start(next_offset);
                self.buffer.push_back(Ok((offset, Token::Lf, next_offset)));
            }
            _ => (),
        }

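        // Skip leading whitespace so the prefix search starts at the first
        // non-whitespace byte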
        let bytes = self.input.buffer();
        let start = self.input.start();
        let eof = self.input.end();
        let mut word_boundary = self.input.start();
        while word_boundary < eof && bytes[word_boundary].is_ascii_whitespace() {
            word_boundary += 1;
        }
        if start < word_boundary {
            self.input.set_start(word_boundary);
        }

        // Force the searcher forward if necessary to ensure it is caught up to our view of the input
        let start = self.input.start();
        if self.searcher.input().start() < start {
            self.searcher.set_last_match_end(start);
        }

        let search_result = self.searcher.advance(|input| {
            self.regex
                .search_captures_with(&mut self.cache, input, &mut self.captures);
            Ok(self.captures.get_match())
        });
        if let Some(matched) = search_result {
            let pid = matched.pattern();
            let range = Range::from(matched.range());
            let pattern = &self.patterns[pid.as_usize()];
            let pattern_ty = pattern.ty;
            if let Check::Comment = pattern_ty {
                return self.tokenize_comment(range);
            }

            let prefix_span = self.captures.get_group_by_name("prefix").unwrap();
            let prefix = self.input.as_str(prefix_span.start..prefix_span.end);
            if let Some(index) = self
                .check_prefixes
                .iter()
                .position(|pfx| pfx.as_ref() == prefix)
            {
                self.seen_prefixes[index] = true;
            }
            match pattern_ty {
                Check::Count => {
                    let valid = self.tokenize_check_count_prefix(range);
                    if !valid {
                        self.captures.set_pattern(None);
                        return;
                    }
                }
                ty => {
                    self.buffer.push_back(Ok((
                        range.start,
                        Token::Check(ty.try_into().unwrap()),
                        range.end,
                    )));
                }
            }
            let literal = self.tokenize_optional_modifiers();
            self.buffer
                .push_back(Ok((range.end - 1, Token::Colon, range.end)));
            self.input.set_start(range.end);
            self.tokenize_check_pattern(literal);
            self.captures.set_pattern(None);
        } else {
            // There are no more check lines in the input
            self.input.set_start(self.input.end());
            self.eof = true;
        }
    }

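    /// Tokenizes the optional `LITERAL` modifier (e.g. `CHECK{LITERAL}:`) if
    /// present, returning `true` when it was found, in which case the rest of
    /// the line is lexed as a raw literal pattern.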
    fn tokenize_optional_modifiers(&mut self) -> bool {
        if let Some(span) = self.captures.get_group_by_name("modifiers") {
            if self.input.buffer()[span.start..].starts_with(b"LITERAL") {
                self.buffer.push_back(Ok((
                    span.start,
                    Token::Modifier(CheckModifier::LITERAL),
                    span.end,
                )));
                true
            } else {
                unreachable!("no other modifiers are recognized by the regex pattern")
            }
        } else {
            false
        }
    }

    fn tokenize_comment(&mut self, range: Range<usize>) {
        // Find the next newline, and take all the input up to that point
        let span = self.captures.get_group_by_name("comment").unwrap();
        let comment = self.input.as_str(span.start..span.end);
        let comment = comment.strip_prefix(' ').unwrap_or(comment);
        self.buffer.push_back(Ok((
            range.start,
            Token::Comment(Cow::Borrowed(comment)),
            span.end,
        )));
        self.input.set_start(span.end);
        self.captures.set_pattern(None);
    }

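    /// Tokenizes the count of a `CHECK-COUNT-<n>` directive.
    ///
    /// Returns `false` if the count is not a valid `u8`, in which case an
    /// error token is emitted and the rest of the line is skipped.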
    fn tokenize_check_count_prefix(&mut self, prefix_range: Range<usize>) -> bool {
        let count_span = self.captures.get_group_by_name("count").unwrap();
        let count = self.input.as_str(count_span.start..count_span.end);
        match count.parse::<u8>() {
            Ok(count) => {
                self.buffer.push_back(Ok((
                    prefix_range.start,
                    Token::Check(ast::Check::Plain),
                    count_span.end,
                )));
                self.buffer.push_back(Ok((
                    prefix_range.start,
                    Token::Modifier(CheckModifier::from_count(count)),
                    count_span.end,
                )));
                true
            }
            Err(error) => {
                let token = Token::Error(LexerError::BadCount {
                    span: SourceSpan::from(count_span.start..count_span.end),
                    error,
                });
                self.buffer
                    .push_back(Ok((count_span.start, token, count_span.end)));
                // Skip to end of line
                let eol = self
                    .input
                    .next_newline_from(prefix_range.end)
                    .unwrap_or_else(|| self.input.end());
                self.input.set_start(eol);
                false
            }
        }
    }

    /// We've successfully parsed a CHECK directive.
    ///
    /// We're parsing the rest of the line as a CHECK pattern, so we need to look
    /// for `[[` `]]` and `{{` `}}` markers indicating substitution/capture and regex
    /// matches respectively.
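    ///
    /// For example, in `CHECK: foo [[REG:r[0-9]+]] {{.*}}`, the pattern body
    /// contains a capture block (`[[REG:r[0-9]+]]`) and a regex block
    /// (`{{.*}}`) alongside raw literal text.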
    fn tokenize_check_pattern(&mut self, literal: bool) {
        let start = self.input.start();
        let eol = self.input.next_newline().unwrap_or_else(|| self.input.end());

        if literal {
            let raw = self.input.as_str(start..eol);
            if let Some(raw) = raw.strip_prefix(' ') {
                self.buffer.push_back(Ok((start + 1, Token::Raw(raw), eol)));
            } else {
                self.buffer.push_back(Ok((start, Token::Raw(raw), eol)));
            }
            self.input.set_start(eol);
            return;
        }

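        // Scan the rest of the line for `[[`, `[[#`, `{{`, `]]`, and `}}`
        // delimiters, emitting raw tokens for any text between them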
        let mut in_match: Option<Span<Delimiter>> = None;
        let mut in_regex: Option<Span<Delimiter>> = None;

        let mut last_delimiter_end = start;
        self.delimiter_searcher.set_range(start..eol);
        let mut is_first = true;
        while let Some(matched) = self
            .delimiter_searcher
            .advance(|input| Ok(self.delimiter_patterns.find(input.clone())))
        {
            let pid = matched.pattern();
            let delim_range = Range::from(matched.range());
            match Delimiter::from_pid(pid.as_usize()) {
                delim @ (Delimiter::MatchStart | Delimiter::NumericMatchStart)
                    if in_match.is_none() && in_regex.is_none() =>
                {
                    in_match = Some(Span::new(delim_range, delim));
                    if delim_range.start > last_delimiter_end {
                        let raw = &self.input.buffer()[last_delimiter_end..delim_range.start];
                        if !raw.iter().all(u8::is_ascii_whitespace) {
                            let content = self.input.as_str(last_delimiter_end..delim_range.start);
                            let content = if is_first {
                                content.strip_prefix(' ').unwrap_or(content)
                            } else {
                                content
                            };
                            self.buffer.push_back(Ok((
                                last_delimiter_end,
                                Token::Raw(content),
                                delim_range.start,
                            )));
                        }
                    }
                    if matches!(delim, Delimiter::NumericMatchStart) {
                        self.buffer.push_back(Ok((
                            delim_range.start,
                            Token::MatchStart,
                            delim_range.end - 1,
                        )));
                        self.buffer.push_back(Ok((
                            delim_range.end - 1,
                            Token::Hash,
                            delim_range.end,
                        )));
                    } else {
                        self.buffer.push_back(Ok((
                            delim_range.start,
                            Token::MatchStart,
                            delim_range.end,
                        )));
                    }
                    is_first = false;
                }
                Delimiter::RegexStart if in_match.is_none() && in_regex.is_none() => {
                    in_regex = Some(Span::new(delim_range, Delimiter::RegexStart));
                    if delim_range.start > last_delimiter_end {
                        let raw = &self.input.buffer()[last_delimiter_end..delim_range.start];
                        if !raw.iter().all(u8::is_ascii_whitespace) {
                            let content = self.input.as_str(last_delimiter_end..delim_range.start);
                            let content = if is_first {
                                content.strip_prefix(' ').unwrap_or(content)
                            } else {
                                content
                            };
                            self.buffer.push_back(Ok((
                                last_delimiter_end,
                                Token::Raw(content),
                                delim_range.start,
                            )));
                        }
                    }
                    self.buffer.push_back(Ok((
                        delim_range.start,
                        Token::RegexStart,
                        delim_range.end,
                    )));
                    is_first = false;
                }
                Delimiter::MatchEnd if in_match.is_some() => {
                    last_delimiter_end = delim_range.end;
                    let match_start = in_match.take().unwrap();
                    if matches!(match_start.into_inner(), Delimiter::NumericMatchStart) {
                        self.tokenize_capture_or_match_numeric(Range::new(
                            match_start.end(),
                            delim_range.start,
                        ));
                    } else {
                        self.tokenize_capture_or_match(Range::new(
                            match_start.end(),
                            delim_range.start,
                        ));
                    }
                    self.buffer.push_back(Ok((
                        delim_range.start,
                        Token::MatchEnd,
                        delim_range.end,
                    )));
                    self.input.set_start(delim_range.end);
                    self.searcher.set_last_match_end(delim_range.end);
                }
                Delimiter::RegexEnd if in_regex.is_some() => {
                    last_delimiter_end = delim_range.end;
                    let regex_start = in_regex.take().unwrap();
                    let pattern_start = regex_start.end();
                    let raw = self.input.as_str(pattern_start..delim_range.start).trim();
                    self.buffer
                        .push_back(Ok((pattern_start, Token::Raw(raw), delim_range.start)));
                    self.buffer.push_back(Ok((
                        delim_range.start,
                        Token::RegexEnd,
                        delim_range.end,
                    )));
                    self.input.set_start(delim_range.end);
                    self.searcher.set_last_match_end(delim_range.end);
                }
                delim @ (Delimiter::RegexEnd | Delimiter::MatchEnd)
                    if in_match.is_none() && in_regex.is_none() =>
                {
                    self.buffer.push_back(Err(ParserError::UnrecognizedToken {
                        span: delim_range.into(),
                        token: AsRef::<str>::as_ref(&delim).to_string(),
                        expected: vec![
                            "literal".to_string(),
                            Token::MatchStart.to_string(),
                            Token::RegexStart.to_string(),
                        ],
                    }));
                }
                _ => continue,
            }
        }

        // The line has been sliced up into delimited parts, but we still have to handle any trailing content
        if last_delimiter_end < eol && in_match.is_none() && in_regex.is_none() {
            let line = self.input.as_str(last_delimiter_end..eol);
            if !line.trim().is_empty() {
                let line = if is_first {
                    line.strip_prefix(' ').unwrap_or(line)
                } else {
                    line
                };
                self.buffer
                    .push_back(Ok((last_delimiter_end, Token::Raw(line), eol)));
            }
            self.input.set_start(eol);
            self.searcher.set_last_match_end(eol);
            self.delimiter_searcher.set_last_match_end(eol);
        }

        // Handle unclosed delimiters
        match (in_match, in_regex) {
            (None, None) => (),
            (Some(delim), _) => {
                // Unclosed match start, i.e. `[[` or `[[#`
                self.buffer
                    .push_back(Err(ParserError::UnclosedSubstitution {
                        span: delim.span(),
                    }));
            }
            (_, Some(delim)) => {
                // Unclosed regex start, i.e. `{{`
                self.buffer
                    .push_back(Err(ParserError::UnclosedRegex { span: delim.span() }));
            }
        }
    }

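    /// Tokenizes the contents of a `[[..]]` match block, i.e. a substitution
    /// (`[[NAME]]`) or a capture (`[[NAME:pattern]]`); everything after the
    /// `:` is emitted as a raw pattern.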
    fn tokenize_capture_or_match(&mut self, range: Range<usize>) {
        let mut chars = self.input.as_str(range).chars().peekable();
        let mut offset = range.start;
        while let Some(c) = chars.next() {
            let next_offset = offset + c.len_utf8();
            match c {
                c if c.is_ascii_alphabetic() || c == '_' => {
                    let start = offset;
                    let mut end = next_offset;

                    while let Some(&c) = chars.peek() {
                        match c {
                            c if c.is_ascii_alphanumeric() => {
                                end += c.len_utf8();
                                chars.next();
                            }
                            '_' => {
                                end += '_'.len_utf8();
                                chars.next();
                            }
                            c if c.is_whitespace() || c == ':' => {
                                break;
                            }
                            _ => {
                                self.buffer.push_back(Ok((
                                    start,
                                    Token::Error(LexerError::InvalidIdentifier {
                                        span: SourceSpan::from(start..(end + 1)),
                                    }),
                                    end + 1,
                                )));
                                self.buffer.push_back(Ok((
                                    end + 1,
                                    Token::Raw(self.input.as_str((end + 1)..range.end)),
                                    range.end,
                                )));
                                return;
                            }
                        }
                    }
                    self.buffer.push_back(Ok((
                        start,
                        Token::from_keyword_or_ident(self.input.as_str(start..end)),
                        end,
                    )));
                    offset = end;
                    continue;
                }
                '@' => self.buffer.push_back(Ok((offset, Token::At, next_offset))),
                '$' => self
                    .buffer
                    .push_back(Ok((offset, Token::Dollar, next_offset))),
                ':' => {
                    self.buffer
                        .push_back(Ok((offset, Token::Colon, next_offset)));
                    // The remainder of the match block is raw
                    let raw = self.input.as_str(next_offset..range.end);
                    self.buffer
                        .push_back(Ok((offset + 1, Token::Raw(raw), range.end)));
                    return;
                }
                c if c.is_whitespace() => (),
                unexpected => {
                    self.buffer.push_back(Ok((
                        offset,
                        Token::Error(LexerError::UnexpectedCharacter {
                            span: SourceSpan::from(offset..next_offset),
                            unexpected,
                        }),
                        next_offset,
                    )));
                    self.buffer.push_back(Ok((
                        next_offset,
                        Token::Raw(self.input.as_str(next_offset..range.end)),
                        range.end,
                    )));
                    return;
                }
            }
            offset = next_offset;
        }
    }

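    /// Tokenizes the contents of a `[[#..]]` numeric match block, which may
    /// contain a numeric expression such as `[[#VAR+1]]`, optionally preceded
    /// by a format specifier, e.g. `[[#%X,VAR:]]`.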
    fn tokenize_capture_or_match_numeric(&mut self, range: Range<usize>) {
        let mut chars = self.input.as_str(range).chars().peekable();
        let mut offset = range.start;
        let mut strip_whitespace = true;
        while let Some(c) = chars.next() {
            let mut next_offset = offset + c.len_utf8();
            match c {
                '#' => {
                    strip_whitespace = false;
                    self.buffer
                        .push_back(Ok((offset, Token::Hash, next_offset)));
                }
                '%' => {
                    strip_whitespace = false;
                    self.buffer
                        .push_back(Ok((offset, Token::Percent, next_offset)));
                }
                '.' => {
                    strip_whitespace = false;
                    self.buffer.push_back(Ok((offset, Token::Dot, next_offset)));
                }
                ',' => {
                    self.buffer
                        .push_back(Ok((offset, Token::Comma, next_offset)));
                }
                '+' => {
                    strip_whitespace = true;
                    self.buffer
                        .push_back(Ok((offset, Token::Plus, next_offset)));
                }
                '-' => {
                    strip_whitespace = true;
                    self.buffer
                        .push_back(Ok((offset, Token::Minus, next_offset)));
                }
                '@' => {
                    strip_whitespace = false;
                    self.buffer.push_back(Ok((offset, Token::At, next_offset)));
                }
                '$' => {
                    strip_whitespace = false;
                    self.buffer
                        .push_back(Ok((offset, Token::Dollar, next_offset)));
                }
                '=' if matches!(chars.peek(), Some(&'=')) => {
                    strip_whitespace = true;
                    chars.next();
                    next_offset += '='.len_utf8();
                    self.buffer
                        .push_back(Ok((offset, Token::Equals, next_offset)));
                }
                '(' => {
                    strip_whitespace = true;
                    self.buffer
                        .push_back(Ok((offset, Token::LParen, next_offset)));
                }
                ')' => {
                    strip_whitespace = true;
                    self.buffer
                        .push_back(Ok((offset, Token::RParen, next_offset)));
                }
                ':' => {
                    strip_whitespace = true;
                    self.buffer
                        .push_back(Ok((offset, Token::Colon, next_offset)));
                }
                c if c.is_ascii_alphabetic() || c == '_' => {
                    let mut end = next_offset;
                    while let Some(&c) = chars.peek() {
                        match c {
                            c if c.is_ascii_alphanumeric() => {
                                end += c.len_utf8();
                                chars.next();
                            }
                            '_' => {
                                end += c.len_utf8();
                                chars.next();
                            }
                            _ => break,
                        }
                    }
                    self.buffer.push_back(Ok((
                        offset,
                        Token::from_keyword_or_ident(self.input.as_str(offset..end)),
                        end,
                    )));
                    strip_whitespace = true;
                    offset = end;
                    continue;
                }
                c if c.is_ascii_digit() => {
                    let mut end = next_offset;
                    while let Some(&c) = chars.peek() {
                        match c {
                            c if c.is_ascii_digit() => {
                                end += 1;
                                chars.next();
                            }
                            _ => break,
                        }
                    }
                    match self.input.as_str(offset..end).parse::<i64>() {
                        Ok(value) => {
                            self.buffer.push_back(Ok((offset, Token::Num(value), end)));
                        }
                        Err(err) => {
                            self.buffer.push_back(Ok((
                                offset,
                                Token::Error(LexerError::InvalidNumber {
                                    span: SourceSpan::from(offset..end),
                                    error: err,
                                }),
                                end,
                            )));
                        }
                    }
                    strip_whitespace = true;
                    offset = end;
                    continue;
                }
                c if c.is_ascii_whitespace() && strip_whitespace => (),
                unexpected => {
                    self.buffer.push_back(Ok((
                        offset,
                        Token::Error(LexerError::UnexpectedCharacter {
                            span: SourceSpan::from(offset..next_offset),
                            unexpected,
                        }),
                        next_offset,
                    )));
                    self.buffer.push_back(Ok((
                        next_offset,
                        Token::Raw(self.input.as_str(next_offset..range.end)),
                        range.end,
                    )));
                    return;
                }
            }
            offset = next_offset;
        }
    }
}
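// Iterating the lexer drops leading newlines and collapses consecutive Lf
// tokens into a single one, so the parser only ever sees meaningful line breaks.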
impl<'input> Iterator for Lexer<'input> {
    type Item = Lexed<'input>;

    #[track_caller]
    fn next(&mut self) -> Option<Self::Item> {
        let mut res = self.lex();
        loop {
            if let Some(Ok((_, Token::Lf, _))) = res.as_ref() {
                // Drop leading newlines
                if self.leading_lf {
                    res = self.lex();
                    continue;
                }
                // Collapse runs of newlines into the last newline token
                if let Some(Ok((_, Token::Lf, _))) = self.buffer.front() {
                    res = self.lex();
                    continue;
                }
                break;
            } else {
                self.leading_lf = false;
                break;
            }
        }
        res
    }
}