1use regex_automata::{
2 Anchored, MatchKind, PatternID,
3 dfa::{self, Automaton, OverlappingState, StartKind, dense, onepass},
4 meta,
5 nfa::thompson,
6 util::{
7 captures::{Captures, GroupInfo},
8 syntax,
9 },
10};
11
12use crate::{
13 ast::{Capture, RegexPattern},
14 common::*,
15 pattern::{matcher::regex, search::Input as _},
16};
17
18#[derive(Default, Clone)]
19#[allow(clippy::large_enum_variant)]
20pub enum CapturingRegex {
21 #[default]
22 None,
23 Onepass {
24 re: onepass::DFA,
25 cache: onepass::Cache,
26 },
27 Default {
28 re: meta::Regex,
29 cache: meta::Cache,
30 },
31}
32
33struct Search<'input> {
34 crlf: bool,
35 forward: bool,
36 input: regex_automata::Input<'input>,
37 reverse_input: regex_automata::Input<'input>,
38 last_match_end: Option<usize>,
39 captures: Captures,
40 overlapping: OverlappingState,
41 overlapping_reverse: OverlappingState,
42}
43impl<'input> Search<'input> {
44 fn start(input: Input<'input>, captures: Captures) -> Self {
45 let crlf = input.is_crlf();
46 let input: regex_automata::Input<'input> = input.into();
47 let reverse_input = input.clone().earliest(false);
48 let overlapping = OverlappingState::start();
49 let overlapping_reverse = OverlappingState::start();
50 Self {
51 crlf,
52 forward: true,
53 input,
54 reverse_input,
55 last_match_end: None,
56 overlapping,
57 overlapping_reverse,
58 captures,
59 }
60 }
61}
62
63pub struct RegexSetSearcher<'a, 'input, A = dense::DFA<Vec<u32>>> {
64 search: Search<'input>,
65 patterns: Vec<Span<Cow<'a, str>>>,
68 regex: dfa::regex::Regex<A>,
70 capturing_regex: CapturingRegex,
72 capture_types: Vec<Vec<Capture>>,
78}
79impl<'a, 'input> RegexSetSearcher<'a, 'input> {
80 pub fn new(
81 input: Input<'input>,
82 patterns: Vec<RegexPattern<'a>>,
83 config: &Config,
84 ) -> DiagResult<Self> {
85 let start_kind = if input.is_anchored() {
86 StartKind::Anchored
87 } else {
88 StartKind::Unanchored
89 };
90 Ok(
91 RegexSetMatcher::new_with_start_kind(start_kind, patterns, config)?
92 .into_searcher(input),
93 )
94 }
95}
96impl<'a, 'input, A: Automaton> RegexSetSearcher<'a, 'input, A> {
97 pub fn from_matcher(matcher: RegexSetMatcher<'a, A>, input: Input<'input>) -> Self {
98 let captures = matcher.captures;
99 let search = Search::start(input, captures);
100 Self {
101 search,
102 patterns: matcher.patterns,
103 regex: matcher.regex,
104 capturing_regex: matcher.capturing_regex,
105 capture_types: matcher.capture_types,
106 }
107 }
108}
109
110impl<'a, 'input, A: Automaton + Clone> RegexSetSearcher<'a, 'input, A> {
111 pub fn from_matcher_ref(matcher: &RegexSetMatcher<'a, A>, input: Input<'input>) -> Self {
112 let captures = matcher.captures.clone();
113 let search = Search::start(input, captures);
114 Self {
115 search,
116 patterns: matcher.patterns.clone(),
117 regex: matcher.regex.clone(),
118 capturing_regex: matcher.capturing_regex.clone(),
119 capture_types: matcher.capture_types.clone(),
120 }
121 }
122}
123impl<'a, 'input, A> fmt::Debug for RegexSetSearcher<'a, 'input, A> {
124 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
125 f.debug_struct("RegexSetSearcher")
126 .field("patterns", &self.patterns)
127 .field("capture_types", &self.search.captures)
128 .field("crlf", &self.search.crlf)
129 .field("forward", &self.search.forward)
130 .field("last_match_end", &self.search.last_match_end)
131 .finish()
132 }
133}
134impl<'a, 'input, A> Spanned for RegexSetSearcher<'a, 'input, A> {
135 fn span(&self) -> SourceSpan {
136 let start = self
137 .patterns
138 .iter()
139 .map(|p| p.span())
140 .min_by_key(|span| span.start())
141 .unwrap();
142 let end = self
143 .patterns
144 .iter()
145 .map(|p| p.span())
146 .max_by_key(|span| span.end())
147 .unwrap();
148 SourceSpan::from_range_unchecked(
149 start.source_id(),
150 start.start().to_usize()..end.end().to_usize(),
151 )
152 }
153}
154
155pub struct RegexSetMatcher<'a, A = dense::DFA<Vec<u32>>> {
156 patterns: Vec<Span<Cow<'a, str>>>,
159 regex: dfa::regex::Regex<A>,
161 capturing_regex: CapturingRegex,
163 captures: Captures,
165 capture_types: Vec<Vec<Capture>>,
171}
172impl<'a> RegexSetMatcher<'a> {
173 pub fn new(patterns: Vec<RegexPattern<'a>>, config: &Config) -> DiagResult<Self> {
174 Self::new_with_start_kind(StartKind::Both, patterns, config)
175 }
176
177 pub fn new_with_start_kind(
178 start_kind: StartKind,
179 patterns: Vec<RegexPattern<'a>>,
180 config: &Config,
181 ) -> DiagResult<Self> {
182 let regex = dfa::regex::Regex::builder()
183 .dense(
184 dense::Config::new()
185 .match_kind(MatchKind::All)
186 .start_kind(start_kind)
187 .starts_for_each_pattern(true),
188 )
189 .syntax(
190 syntax::Config::new()
191 .multi_line(true)
192 .case_insensitive(config.options.ignore_case),
193 )
194 .build_many(&patterns)
195 .map_err(|error| {
196 regex::build_error_to_diagnostic(error, patterns.len(), |id| patterns[id].span())
197 })?;
198
199 let has_captures = patterns.iter().any(|p| !p.captures.is_empty());
200 let (capturing_regex, captures) = if !has_captures {
201 (CapturingRegex::None, Captures::empty(GroupInfo::empty()))
202 } else {
203 onepass::DFA::builder()
204 .syntax(
205 syntax::Config::new()
206 .utf8(false)
207 .multi_line(true)
208 .case_insensitive(config.options.ignore_case),
209 )
210 .thompson(thompson::Config::new().utf8(false))
211 .configure(onepass::Config::new().starts_for_each_pattern(true))
212 .build_many(&patterns)
213 .map_or_else(
214 |_| {
215 let re = Regex::builder()
216 .configure(Regex::config().match_kind(MatchKind::All))
217 .syntax(
218 syntax::Config::new()
219 .multi_line(true)
220 .case_insensitive(config.options.ignore_case),
221 )
222 .build_many(&patterns)
223 .unwrap();
224 let cache = re.create_cache();
225 let captures = re.create_captures();
226 (CapturingRegex::Default { re, cache }, captures)
227 },
228 |re| {
229 let cache = re.create_cache();
230 let captures = re.create_captures();
231 (CapturingRegex::Onepass { re, cache }, captures)
232 },
233 )
234 };
235
236 let mut capture_types = vec![vec![]; patterns.len()];
238 let mut strings = Vec::with_capacity(patterns.len());
239 let groups = captures.group_info();
240 for (
241 i,
242 RegexPattern {
243 pattern,
244 captures: pattern_captures,
245 },
246 ) in patterns.into_iter().enumerate()
247 {
248 let span = pattern.span();
249 strings.push(pattern);
250 let pid = PatternID::new_unchecked(i);
251 let num_captures = groups.group_len(pid);
252 capture_types[i].resize(num_captures, Capture::Ignore(span));
253 for capture in pattern_captures.into_iter() {
254 if let Capture::Ignore(_) = capture {
255 continue;
256 }
257 if let Some(group_name) = capture.group_name() {
258 let group_id = groups
259 .to_index(pid, group_name.as_str())
260 .unwrap_or_else(|| {
261 panic!("expected group for capture of '{group_name}' in pattern {i}")
262 });
263 capture_types[i][group_id] = capture;
264 } else {
265 assert_eq!(
266 &capture_types[i][0],
267 &Capture::Ignore(span),
268 "{capture:?} would overwrite a previous implicit capture group in pattern {i}"
269 );
270 capture_types[i][0] = capture;
271 }
272 }
273 }
274
275 Ok(Self {
276 patterns: strings,
277 regex,
278 capturing_regex,
279 captures,
280 capture_types,
281 })
282 }
283
284 pub fn patterns_len(&self) -> usize {
285 self.patterns.len()
286 }
287
288 pub fn first_pattern(&self) -> Span<usize> {
289 self.patterns
290 .iter()
291 .enumerate()
292 .map(|(i, p)| Span::new(p.span(), i))
293 .min_by_key(|span| span.span().start())
294 .unwrap()
295 }
296
297 pub fn first_pattern_span(&self) -> SourceSpan {
298 self.first_pattern().span()
299 }
300}
301impl<'a, A: Automaton + Clone> RegexSetMatcher<'a, A> {
302 pub fn search<'input>(&self, input: Input<'input>) -> RegexSetSearcher<'a, 'input, A> {
303 RegexSetSearcher::from_matcher_ref(self, input)
304 }
305}
306impl<'a, A: Automaton> RegexSetMatcher<'a, A> {
307 #[inline]
308 pub fn into_searcher<'input>(self, input: Input<'input>) -> RegexSetSearcher<'a, 'input, A> {
309 RegexSetSearcher::from_matcher(self, input)
310 }
311}
312impl<'a, A> fmt::Debug for RegexSetMatcher<'a, A> {
313 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
314 f.debug_struct("RegexSetMatcher")
315 .field("patterns", &self.patterns)
316 .field("captures", &self.captures)
317 .field("capture_types", &self.capture_types)
318 .finish()
319 }
320}
321impl<'a, A> Spanned for RegexSetMatcher<'a, A> {
322 fn span(&self) -> SourceSpan {
323 let start = self
324 .patterns
325 .iter()
326 .map(|p| p.span())
327 .min_by_key(|span| span.start())
328 .unwrap();
329 let end = self
330 .patterns
331 .iter()
332 .map(|p| p.span())
333 .max_by_key(|span| span.end())
334 .unwrap();
335 SourceSpan::from_range_unchecked(
336 start.source_id(),
337 start.start().to_usize()..end.end().to_usize(),
338 )
339 }
340}
341impl<'a, A: Automaton + Clone> MatcherMut for RegexSetMatcher<'a, A> {
342 fn try_match_mut<'input, 'context, C>(
343 &self,
344 input: Input<'input>,
345 context: &mut C,
346 ) -> DiagResult<MatchResult<'input>>
347 where
348 C: Context<'input, 'context> + ?Sized,
349 {
350 let mut searcher = self.search(input);
351 let matched = searcher.try_match_next(context)?;
352 matched.bind_captures_in(context);
353 Ok(matched)
354 }
355}
356impl<'a, 'input, A> PatternSearcher<'input> for RegexSetSearcher<'a, 'input, A>
357where
358 A: Automaton,
359{
360 type Input = regex_automata::Input<'input>;
361 type PatternID = PatternID;
362
363 fn input(&self) -> &Self::Input {
364 &self.search.input
365 }
366 fn last_match_end(&self) -> Option<usize> {
367 self.search.last_match_end
368 }
369 fn set_last_match_end(&mut self, end: usize) {
370 self.search.last_match_end = Some(end);
371 self.search.input.set_start(end);
372 }
373 fn patterns_len(&self) -> usize {
374 self.patterns.len()
375 }
376 fn pattern_span(&self, id: Self::PatternID) -> SourceSpan {
377 self.patterns[id.as_usize()].span()
378 }
379 fn try_match_next<'context, C>(&mut self, context: &mut C) -> DiagResult<MatchResult<'input>>
380 where
381 C: Context<'input, 'context> + ?Sized,
382 {
383 let (fwd_dfa, rev_dfa) = (self.regex.forward(), self.regex.reverse());
384 let matched;
385 let mut last_end = self
386 .search
387 .last_match_end
388 .unwrap_or(self.search.input.start());
389 loop {
390 if self.search.forward {
391 if let Some((pattern_id, end)) = {
392 fwd_dfa
393 .try_search_overlapping_fwd(
394 &self.search.input,
395 &mut self.search.overlapping,
396 )
397 .expect("match error");
398 self.search
399 .overlapping
400 .get_match()
401 .map(|hm| (hm.pattern(), hm.offset()))
402 } {
403 last_end = end;
404 self.search
405 .reverse_input
406 .set_anchored(Anchored::Pattern(pattern_id));
407 self.search
408 .reverse_input
409 .set_range(self.search.input.start()..end);
410 self.search.forward = false;
411 self.search.overlapping_reverse = OverlappingState::start();
412 continue;
413 } else {
414 matched = None;
415 break;
416 }
417 } else if let Some((pattern_id, start)) = {
418 rev_dfa
419 .try_search_overlapping_rev(
420 &self.search.reverse_input,
421 &mut self.search.overlapping_reverse,
422 )
423 .expect("match error");
424 self.search
425 .overlapping_reverse
426 .get_match()
427 .map(|hm| (hm.pattern(), hm.offset()))
428 } {
429 if start == last_end && !self.search.input.is_char_boundary(last_end) {
430 continue;
431 }
432 self.search.last_match_end = Some(last_end);
433 matched = Some(regex_automata::Match::new(pattern_id, start..last_end));
434 break;
435 } else {
436 self.search.forward = true;
437 }
438 }
439
440 if let Some(matched) = matched {
441 self.search.captures.clear();
442 let pattern_id = matched.pattern();
443 match self.capturing_regex {
444 CapturingRegex::None => {
445 let overall_span = SourceSpan::from_range_unchecked(
446 self.search.input.source_id(),
447 matched.range(),
448 );
449 let pattern_index = pattern_id.as_usize();
450 let pattern_span = self.patterns[pattern_index].span();
451 Ok(MatchResult::ok(MatchInfo {
452 span: overall_span,
453 pattern_span,
454 pattern_id: pattern_index,
455 captures: vec![],
456 }))
457 }
458 CapturingRegex::Default {
459 ref re,
460 ref mut cache,
461 } => {
462 let input = self
463 .search
464 .input
465 .clone()
466 .anchored(Anchored::Pattern(pattern_id))
467 .range(matched.range());
468 re.search_captures_with(cache, &input, &mut self.search.captures);
469 if let Some(matched) = self.search.captures.get_match() {
470 extract_captures_from_match(
471 matched,
472 &self.search,
473 &self.patterns,
474 &self.capture_types,
475 context,
476 )
477 } else {
478 let span = SourceSpan::from_range_unchecked(
479 self.search.input.source_id(),
480 matched.range(),
481 );
482 let pattern_span = self.patterns[pattern_id.as_usize()].span();
483 let error = CheckFailedError::MatchError {
484 span,
485 input_file: context.input_file(),
486 labels: vec![RelatedLabel::note(Label::at(pattern_span), context.source_file(pattern_span.source_id()).unwrap())],
487 help: Some("meta regex searcher failed to match the input even though an initial DFA pass found a match".to_string()),
488 };
489 Err(Report::new(error))
490 }
491 }
492 CapturingRegex::Onepass {
493 ref re,
494 ref mut cache,
495 } => {
496 let input = self
497 .search
498 .input
499 .clone()
500 .anchored(Anchored::Pattern(pattern_id))
501 .range(matched.range());
502 re.captures(cache, input, &mut self.search.captures);
503 if let Some(matched) = self.search.captures.get_match() {
504 extract_captures_from_match(
505 matched,
506 &self.search,
507 &self.patterns,
508 &self.capture_types,
509 context,
510 )
511 } else {
512 let span = SourceSpan::from_range_unchecked(
513 self.search.input.source_id(),
514 matched.range(),
515 );
516 let pattern_span = self.patterns[pattern_id.as_usize()].span();
517 let error = CheckFailedError::MatchError {
518 span,
519 input_file: context.input_file(),
520 labels: vec![RelatedLabel::note(Label::at(pattern_span), context.source_file(pattern_span.source_id()).unwrap())],
521 help: Some("onepass regex searcher failed to match the input even though an initial DFA pass found a match".to_string()),
522 };
523 Err(Report::new(error))
524 }
525 }
526 }
527 } else {
528 Ok(MatchResult::failed(
529 CheckFailedError::MatchNoneButExpected {
530 span: self.span(),
531 match_file: context.source_file(self.span().source_id()).unwrap(),
532 note: None,
533 },
534 ))
535 }
536 }
537}
538
539fn extract_captures_from_match<'a, 'input, 'context, C>(
540 matched: regex_automata::Match,
541 search: &Search<'_>,
542 patterns: &[Span<Cow<'a, str>>],
543 capture_types: &[Vec<Capture>],
544 context: &C,
545) -> DiagResult<MatchResult<'input>>
546where
547 C: Context<'input, 'context> + ?Sized,
548{
549 let pattern_id = matched.pattern();
550 let pattern_index = pattern_id.as_usize();
551 let pattern_span = patterns[pattern_index].span();
552 let overall_span = SourceSpan::from_range_unchecked(search.input.source_id(), matched.range());
553 let mut capture_infos = Vec::with_capacity(search.captures.group_len());
554 for (index, (maybe_capture_span, capture)) in search
555 .captures
556 .iter()
557 .zip(capture_types[pattern_id].iter().copied())
558 .enumerate()
559 {
560 if let Some(capture_span) = maybe_capture_span {
561 let input = context.search();
562 let captured = input.as_str(capture_span.range());
563 let capture_span =
564 SourceSpan::from_range_unchecked(search.input.source_id(), capture_span.range());
565 let result = regex::try_convert_capture_to_type(
566 pattern_id,
567 index,
568 pattern_span,
569 overall_span,
570 Span::new(capture_span, captured),
571 capture,
572 &search.captures,
573 context,
574 );
575 match result {
576 Ok(capture_info) => {
577 capture_infos.push(capture_info);
578 }
579 Err(error) => {
580 return Ok(MatchResult::failed(error));
581 }
582 }
583 }
584 }
585 Ok(MatchResult::ok(MatchInfo {
586 span: overall_span,
587 pattern_span,
588 pattern_id: pattern_index,
589 captures: capture_infos,
590 }))
591}