Skip to main content

litcheck_filecheck/pattern/matcher/matchers/
regex_set.rs

1use regex_automata::{
2    Anchored, MatchKind, PatternID,
3    dfa::{self, Automaton, OverlappingState, StartKind, dense, onepass},
4    meta,
5    nfa::thompson,
6    util::{
7        captures::{Captures, GroupInfo},
8        syntax,
9    },
10};
11
12use crate::{
13    ast::{Capture, RegexPattern},
14    common::*,
15    pattern::{matcher::regex, search::Input as _},
16};
17
18#[derive(Default, Clone)]
19#[allow(clippy::large_enum_variant)]
20pub enum CapturingRegex {
21    #[default]
22    None,
23    Onepass {
24        re: onepass::DFA,
25        cache: onepass::Cache,
26    },
27    Default {
28        re: meta::Regex,
29        cache: meta::Cache,
30    },
31}
32
33struct Search<'input> {
34    crlf: bool,
35    forward: bool,
36    input: regex_automata::Input<'input>,
37    reverse_input: regex_automata::Input<'input>,
38    last_match_end: Option<usize>,
39    captures: Captures,
40    overlapping: OverlappingState,
41    overlapping_reverse: OverlappingState,
42}
43impl<'input> Search<'input> {
44    fn start(input: Input<'input>, captures: Captures) -> Self {
45        let crlf = input.is_crlf();
46        let input: regex_automata::Input<'input> = input.into();
47        let reverse_input = input.clone().earliest(false);
48        let overlapping = OverlappingState::start();
49        let overlapping_reverse = OverlappingState::start();
50        Self {
51            crlf,
52            forward: true,
53            input,
54            reverse_input,
55            last_match_end: None,
56            overlapping,
57            overlapping_reverse,
58            captures,
59        }
60    }
61}
62
63pub struct RegexSetSearcher<'a, 'input, A = dense::DFA<Vec<u32>>> {
64    search: Search<'input>,
65    /// The set of raw input patterns from which
66    /// this matcher was constructed
67    patterns: Vec<Span<Cow<'a, str>>>,
68    /// The compiled regex which will be used to search the input buffer
69    regex: dfa::regex::Regex<A>,
70    /// The regex used to obtain capturing groups, if there are any
71    capturing_regex: CapturingRegex,
72    /// Metadata about captures in the given patterns
73    ///
74    /// Each pattern gets its own vector of capture info, since
75    /// there is no requirement that all patterns have the same
76    /// number or type of captures
77    capture_types: Vec<Vec<Capture>>,
78}
79impl<'a, 'input> RegexSetSearcher<'a, 'input> {
80    pub fn new(
81        input: Input<'input>,
82        patterns: Vec<RegexPattern<'a>>,
83        config: &Config,
84    ) -> DiagResult<Self> {
85        let start_kind = if input.is_anchored() {
86            StartKind::Anchored
87        } else {
88            StartKind::Unanchored
89        };
90        Ok(
91            RegexSetMatcher::new_with_start_kind(start_kind, patterns, config)?
92                .into_searcher(input),
93        )
94    }
95}
96impl<'a, 'input, A: Automaton> RegexSetSearcher<'a, 'input, A> {
97    pub fn from_matcher(matcher: RegexSetMatcher<'a, A>, input: Input<'input>) -> Self {
98        let captures = matcher.captures;
99        let search = Search::start(input, captures);
100        Self {
101            search,
102            patterns: matcher.patterns,
103            regex: matcher.regex,
104            capturing_regex: matcher.capturing_regex,
105            capture_types: matcher.capture_types,
106        }
107    }
108}
109
110impl<'a, 'input, A: Automaton + Clone> RegexSetSearcher<'a, 'input, A> {
111    pub fn from_matcher_ref(matcher: &RegexSetMatcher<'a, A>, input: Input<'input>) -> Self {
112        let captures = matcher.captures.clone();
113        let search = Search::start(input, captures);
114        Self {
115            search,
116            patterns: matcher.patterns.clone(),
117            regex: matcher.regex.clone(),
118            capturing_regex: matcher.capturing_regex.clone(),
119            capture_types: matcher.capture_types.clone(),
120        }
121    }
122}
123impl<'a, 'input, A> fmt::Debug for RegexSetSearcher<'a, 'input, A> {
124    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
125        f.debug_struct("RegexSetSearcher")
126            .field("patterns", &self.patterns)
127            .field("capture_types", &self.search.captures)
128            .field("crlf", &self.search.crlf)
129            .field("forward", &self.search.forward)
130            .field("last_match_end", &self.search.last_match_end)
131            .finish()
132    }
133}
134impl<'a, 'input, A> Spanned for RegexSetSearcher<'a, 'input, A> {
135    fn span(&self) -> SourceSpan {
136        let start = self
137            .patterns
138            .iter()
139            .map(|p| p.span())
140            .min_by_key(|span| span.start())
141            .unwrap();
142        let end = self
143            .patterns
144            .iter()
145            .map(|p| p.span())
146            .max_by_key(|span| span.end())
147            .unwrap();
148        SourceSpan::from_range_unchecked(
149            start.source_id(),
150            start.start().to_usize()..end.end().to_usize(),
151        )
152    }
153}
154
155pub struct RegexSetMatcher<'a, A = dense::DFA<Vec<u32>>> {
156    /// The set of raw input patterns from which
157    /// this matcher was constructed
158    patterns: Vec<Span<Cow<'a, str>>>,
159    /// The compiled regex which will be used to search the input buffer
160    regex: dfa::regex::Regex<A>,
161    /// The regex used to obtain capturing groups, if there are any
162    capturing_regex: CapturingRegex,
163    /// Captures storage
164    captures: Captures,
165    /// Metadata about captures in the given patterns
166    ///
167    /// Each pattern gets its own vector of capture info, since
168    /// there is no requirement that all patterns have the same
169    /// number or type of captures
170    capture_types: Vec<Vec<Capture>>,
171}
172impl<'a> RegexSetMatcher<'a> {
173    pub fn new(patterns: Vec<RegexPattern<'a>>, config: &Config) -> DiagResult<Self> {
174        Self::new_with_start_kind(StartKind::Both, patterns, config)
175    }
176
177    pub fn new_with_start_kind(
178        start_kind: StartKind,
179        patterns: Vec<RegexPattern<'a>>,
180        config: &Config,
181    ) -> DiagResult<Self> {
182        let regex = dfa::regex::Regex::builder()
183            .dense(
184                dense::Config::new()
185                    .match_kind(MatchKind::All)
186                    .start_kind(start_kind)
187                    .starts_for_each_pattern(true),
188            )
189            .syntax(
190                syntax::Config::new()
191                    .multi_line(true)
192                    .case_insensitive(config.options.ignore_case),
193            )
194            .build_many(&patterns)
195            .map_err(|error| {
196                regex::build_error_to_diagnostic(error, patterns.len(), |id| patterns[id].span())
197            })?;
198
199        let has_captures = patterns.iter().any(|p| !p.captures.is_empty());
200        let (capturing_regex, captures) = if !has_captures {
201            (CapturingRegex::None, Captures::empty(GroupInfo::empty()))
202        } else {
203            onepass::DFA::builder()
204                .syntax(
205                    syntax::Config::new()
206                        .utf8(false)
207                        .multi_line(true)
208                        .case_insensitive(config.options.ignore_case),
209                )
210                .thompson(thompson::Config::new().utf8(false))
211                .configure(onepass::Config::new().starts_for_each_pattern(true))
212                .build_many(&patterns)
213                .map_or_else(
214                    |_| {
215                        let re = Regex::builder()
216                            .configure(Regex::config().match_kind(MatchKind::All))
217                            .syntax(
218                                syntax::Config::new()
219                                    .multi_line(true)
220                                    .case_insensitive(config.options.ignore_case),
221                            )
222                            .build_many(&patterns)
223                            .unwrap();
224                        let cache = re.create_cache();
225                        let captures = re.create_captures();
226                        (CapturingRegex::Default { re, cache }, captures)
227                    },
228                    |re| {
229                        let cache = re.create_cache();
230                        let captures = re.create_captures();
231                        (CapturingRegex::Onepass { re, cache }, captures)
232                    },
233                )
234        };
235
236        // Compute capture group information
237        let mut capture_types = vec![vec![]; patterns.len()];
238        let mut strings = Vec::with_capacity(patterns.len());
239        let groups = captures.group_info();
240        for (
241            i,
242            RegexPattern {
243                pattern,
244                captures: pattern_captures,
245            },
246        ) in patterns.into_iter().enumerate()
247        {
248            let span = pattern.span();
249            strings.push(pattern);
250            let pid = PatternID::new_unchecked(i);
251            let num_captures = groups.group_len(pid);
252            capture_types[i].resize(num_captures, Capture::Ignore(span));
253            for capture in pattern_captures.into_iter() {
254                if let Capture::Ignore(_) = capture {
255                    continue;
256                }
257                if let Some(group_name) = capture.group_name() {
258                    let group_id = groups
259                        .to_index(pid, group_name.as_str())
260                        .unwrap_or_else(|| {
261                            panic!("expected group for capture of '{group_name}' in pattern {i}")
262                        });
263                    capture_types[i][group_id] = capture;
264                } else {
265                    assert_eq!(
266                        &capture_types[i][0],
267                        &Capture::Ignore(span),
268                        "{capture:?} would overwrite a previous implicit capture group in pattern {i}"
269                    );
270                    capture_types[i][0] = capture;
271                }
272            }
273        }
274
275        Ok(Self {
276            patterns: strings,
277            regex,
278            capturing_regex,
279            captures,
280            capture_types,
281        })
282    }
283
284    pub fn patterns_len(&self) -> usize {
285        self.patterns.len()
286    }
287
288    pub fn first_pattern(&self) -> Span<usize> {
289        self.patterns
290            .iter()
291            .enumerate()
292            .map(|(i, p)| Span::new(p.span(), i))
293            .min_by_key(|span| span.span().start())
294            .unwrap()
295    }
296
297    pub fn first_pattern_span(&self) -> SourceSpan {
298        self.first_pattern().span()
299    }
300}
301impl<'a, A: Automaton + Clone> RegexSetMatcher<'a, A> {
302    pub fn search<'input>(&self, input: Input<'input>) -> RegexSetSearcher<'a, 'input, A> {
303        RegexSetSearcher::from_matcher_ref(self, input)
304    }
305}
306impl<'a, A: Automaton> RegexSetMatcher<'a, A> {
307    #[inline]
308    pub fn into_searcher<'input>(self, input: Input<'input>) -> RegexSetSearcher<'a, 'input, A> {
309        RegexSetSearcher::from_matcher(self, input)
310    }
311}
312impl<'a, A> fmt::Debug for RegexSetMatcher<'a, A> {
313    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
314        f.debug_struct("RegexSetMatcher")
315            .field("patterns", &self.patterns)
316            .field("captures", &self.captures)
317            .field("capture_types", &self.capture_types)
318            .finish()
319    }
320}
321impl<'a, A> Spanned for RegexSetMatcher<'a, A> {
322    fn span(&self) -> SourceSpan {
323        let start = self
324            .patterns
325            .iter()
326            .map(|p| p.span())
327            .min_by_key(|span| span.start())
328            .unwrap();
329        let end = self
330            .patterns
331            .iter()
332            .map(|p| p.span())
333            .max_by_key(|span| span.end())
334            .unwrap();
335        SourceSpan::from_range_unchecked(
336            start.source_id(),
337            start.start().to_usize()..end.end().to_usize(),
338        )
339    }
340}
341impl<'a, A: Automaton + Clone> MatcherMut for RegexSetMatcher<'a, A> {
342    fn try_match_mut<'input, 'context, C>(
343        &self,
344        input: Input<'input>,
345        context: &mut C,
346    ) -> DiagResult<MatchResult<'input>>
347    where
348        C: Context<'input, 'context> + ?Sized,
349    {
350        let mut searcher = self.search(input);
351        let matched = searcher.try_match_next(context)?;
352        matched.bind_captures_in(context);
353        Ok(matched)
354    }
355}
356impl<'a, 'input, A> PatternSearcher<'input> for RegexSetSearcher<'a, 'input, A>
357where
358    A: Automaton,
359{
360    type Input = regex_automata::Input<'input>;
361    type PatternID = PatternID;
362
363    fn input(&self) -> &Self::Input {
364        &self.search.input
365    }
366    fn last_match_end(&self) -> Option<usize> {
367        self.search.last_match_end
368    }
369    fn set_last_match_end(&mut self, end: usize) {
370        self.search.last_match_end = Some(end);
371        self.search.input.set_start(end);
372    }
373    fn patterns_len(&self) -> usize {
374        self.patterns.len()
375    }
376    fn pattern_span(&self, id: Self::PatternID) -> SourceSpan {
377        self.patterns[id.as_usize()].span()
378    }
379    fn try_match_next<'context, C>(&mut self, context: &mut C) -> DiagResult<MatchResult<'input>>
380    where
381        C: Context<'input, 'context> + ?Sized,
382    {
383        let (fwd_dfa, rev_dfa) = (self.regex.forward(), self.regex.reverse());
384        let matched;
385        let mut last_end = self
386            .search
387            .last_match_end
388            .unwrap_or(self.search.input.start());
389        loop {
390            if self.search.forward {
391                if let Some((pattern_id, end)) = {
392                    fwd_dfa
393                        .try_search_overlapping_fwd(
394                            &self.search.input,
395                            &mut self.search.overlapping,
396                        )
397                        .expect("match error");
398                    self.search
399                        .overlapping
400                        .get_match()
401                        .map(|hm| (hm.pattern(), hm.offset()))
402                } {
403                    last_end = end;
404                    self.search
405                        .reverse_input
406                        .set_anchored(Anchored::Pattern(pattern_id));
407                    self.search
408                        .reverse_input
409                        .set_range(self.search.input.start()..end);
410                    self.search.forward = false;
411                    self.search.overlapping_reverse = OverlappingState::start();
412                    continue;
413                } else {
414                    matched = None;
415                    break;
416                }
417            } else if let Some((pattern_id, start)) = {
418                rev_dfa
419                    .try_search_overlapping_rev(
420                        &self.search.reverse_input,
421                        &mut self.search.overlapping_reverse,
422                    )
423                    .expect("match error");
424                self.search
425                    .overlapping_reverse
426                    .get_match()
427                    .map(|hm| (hm.pattern(), hm.offset()))
428            } {
429                if start == last_end && !self.search.input.is_char_boundary(last_end) {
430                    continue;
431                }
432                self.search.last_match_end = Some(last_end);
433                matched = Some(regex_automata::Match::new(pattern_id, start..last_end));
434                break;
435            } else {
436                self.search.forward = true;
437            }
438        }
439
440        if let Some(matched) = matched {
441            self.search.captures.clear();
442            let pattern_id = matched.pattern();
443            match self.capturing_regex {
444                CapturingRegex::None => {
445                    let overall_span = SourceSpan::from_range_unchecked(
446                        self.search.input.source_id(),
447                        matched.range(),
448                    );
449                    let pattern_index = pattern_id.as_usize();
450                    let pattern_span = self.patterns[pattern_index].span();
451                    Ok(MatchResult::ok(MatchInfo {
452                        span: overall_span,
453                        pattern_span,
454                        pattern_id: pattern_index,
455                        captures: vec![],
456                    }))
457                }
458                CapturingRegex::Default {
459                    ref re,
460                    ref mut cache,
461                } => {
462                    let input = self
463                        .search
464                        .input
465                        .clone()
466                        .anchored(Anchored::Pattern(pattern_id))
467                        .range(matched.range());
468                    re.search_captures_with(cache, &input, &mut self.search.captures);
469                    if let Some(matched) = self.search.captures.get_match() {
470                        extract_captures_from_match(
471                            matched,
472                            &self.search,
473                            &self.patterns,
474                            &self.capture_types,
475                            context,
476                        )
477                    } else {
478                        let span = SourceSpan::from_range_unchecked(
479                            self.search.input.source_id(),
480                            matched.range(),
481                        );
482                        let pattern_span = self.patterns[pattern_id.as_usize()].span();
483                        let error = CheckFailedError::MatchError {
484                            span,
485                            input_file: context.input_file(),
486                            labels: vec![RelatedLabel::note(Label::at(pattern_span), context.source_file(pattern_span.source_id()).unwrap())],
487                            help: Some("meta regex searcher failed to match the input even though an initial DFA pass found a match".to_string()),
488                        };
489                        Err(Report::new(error))
490                    }
491                }
492                CapturingRegex::Onepass {
493                    ref re,
494                    ref mut cache,
495                } => {
496                    let input = self
497                        .search
498                        .input
499                        .clone()
500                        .anchored(Anchored::Pattern(pattern_id))
501                        .range(matched.range());
502                    re.captures(cache, input, &mut self.search.captures);
503                    if let Some(matched) = self.search.captures.get_match() {
504                        extract_captures_from_match(
505                            matched,
506                            &self.search,
507                            &self.patterns,
508                            &self.capture_types,
509                            context,
510                        )
511                    } else {
512                        let span = SourceSpan::from_range_unchecked(
513                            self.search.input.source_id(),
514                            matched.range(),
515                        );
516                        let pattern_span = self.patterns[pattern_id.as_usize()].span();
517                        let error = CheckFailedError::MatchError {
518                            span,
519                            input_file: context.input_file(),
520                            labels: vec![RelatedLabel::note(Label::at(pattern_span), context.source_file(pattern_span.source_id()).unwrap())],
521                            help: Some("onepass regex searcher failed to match the input even though an initial DFA pass found a match".to_string()),
522                        };
523                        Err(Report::new(error))
524                    }
525                }
526            }
527        } else {
528            Ok(MatchResult::failed(
529                CheckFailedError::MatchNoneButExpected {
530                    span: self.span(),
531                    match_file: context.source_file(self.span().source_id()).unwrap(),
532                    note: None,
533                },
534            ))
535        }
536    }
537}
538
539fn extract_captures_from_match<'a, 'input, 'context, C>(
540    matched: regex_automata::Match,
541    search: &Search<'_>,
542    patterns: &[Span<Cow<'a, str>>],
543    capture_types: &[Vec<Capture>],
544    context: &C,
545) -> DiagResult<MatchResult<'input>>
546where
547    C: Context<'input, 'context> + ?Sized,
548{
549    let pattern_id = matched.pattern();
550    let pattern_index = pattern_id.as_usize();
551    let pattern_span = patterns[pattern_index].span();
552    let overall_span = SourceSpan::from_range_unchecked(search.input.source_id(), matched.range());
553    let mut capture_infos = Vec::with_capacity(search.captures.group_len());
554    for (index, (maybe_capture_span, capture)) in search
555        .captures
556        .iter()
557        .zip(capture_types[pattern_id].iter().copied())
558        .enumerate()
559    {
560        if let Some(capture_span) = maybe_capture_span {
561            let input = context.search();
562            let captured = input.as_str(capture_span.range());
563            let capture_span =
564                SourceSpan::from_range_unchecked(search.input.source_id(), capture_span.range());
565            let result = regex::try_convert_capture_to_type(
566                pattern_id,
567                index,
568                pattern_span,
569                overall_span,
570                Span::new(capture_span, captured),
571                capture,
572                &search.captures,
573                context,
574            );
575            match result {
576                Ok(capture_info) => {
577                    capture_infos.push(capture_info);
578                }
579                Err(error) => {
580                    return Ok(MatchResult::failed(error));
581                }
582            }
583        }
584    }
585    Ok(MatchResult::ok(MatchInfo {
586        span: overall_span,
587        pattern_span,
588        pattern_id: pattern_index,
589        captures: capture_infos,
590    }))
591}