#![doc(hidden)]
use alloc::{fmt, vec::Vec};
use core::marker::PhantomData;

use crate::ParseError;

use regex_automata::hybrid::dfa::{Cache, DFA};
use regex_automata::hybrid::{BuildError, LazyStateID};
use regex_automata::nfa::thompson::Config as NfaConfig;
use regex_automata::util::syntax::Config as SyntaxConfig;
use regex_automata::{Anchored, Input, MatchKind};

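/// A token produced by the lexer: the index of the pattern that matched,
/// paired with the slice of the input it matched.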
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub struct Token<'input>(pub usize, pub &'input str);
impl fmt::Display for Token<'_> {
    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
        fmt::Display::fmt(self.1, formatter)
    }
}

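/// Compiles a list of `(regex, skip)` pairs into a single multi-pattern
/// lazy DFA shared by every [`Matcher`] it hands out. Patterns flagged as
/// `skip` (typically whitespace) are matched but never yielded as tokens.
///
/// A sketch of intended usage; the token set and the `()` error type below
/// are illustrative placeholders, not part of this module:
///
/// ```ignore
/// let builder = MatcherBuilder::new([
///     (r"[a-zA-Z_][a-zA-Z0-9_]*", false), // pattern 0: identifier
///     (r"[0-9]+", false),                 // pattern 1: integer
///     (r"\s+", true),                     // pattern 2: whitespace, skipped
/// ])?;
/// let tokens: Vec<_> = builder.matcher::<()>("foo 42").collect();
/// // yields Ok((0, Token(0, "foo"), 3)) and Ok((4, Token(1, "42"), 6))
/// ```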
pub struct MatcherBuilder {
    dfa: DFA,
    skip_vec: Vec<bool>,
}

impl MatcherBuilder {
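    /// Compiles every `(regex, skip)` pair into one multi-pattern lazy DFA.
    /// `MatchKind::All` keeps every matching pattern visible in a match
    /// state so that ties can be broken by pattern index later; Unicode mode
    /// follows the `unicode` cargo feature.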
    #[allow(clippy::result_large_err)]
    pub fn new<S>(exprs: impl IntoIterator<Item = (S, bool)>) -> Result<MatcherBuilder, BuildError>
    where
        S: AsRef<str>,
    {
        let exprs = exprs.into_iter();
        let mut regex_vec = Vec::with_capacity(exprs.size_hint().0);
        let mut skip_vec = Vec::with_capacity(exprs.size_hint().0);
        for (regex, skip) in exprs {
            regex_vec.push(regex);
            skip_vec.push(skip);
        }

        let enable_unicode = cfg!(feature = "unicode");
        let dfa = DFA::builder()
            .configure(DFA::config().match_kind(MatchKind::All))
            .syntax(
                SyntaxConfig::new()
                    .unicode(enable_unicode)
                    .utf8(enable_unicode),
            )
            .thompson(NfaConfig::new().utf8(enable_unicode).shrink(true))
            .build_many(&regex_vec)?;

        Ok(MatcherBuilder { dfa, skip_vec })
    }

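    /// Returns an iterator over the tokens of `text`. The search is anchored
    /// (`Anchored::Yes`): every token must start exactly where the previous
    /// one ended, so unmatched input surfaces as an `InvalidToken` error
    /// rather than being scanned past.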
    pub fn matcher<'input, 'builder, E>(
        &'builder self,
        text: &'input str,
    ) -> Matcher<'input, 'builder, E> {
        let input = Input::new(text).anchored(Anchored::Yes);
        let mut cache = self.dfa.create_cache();
        let start = self.dfa.start_state_forward(&mut cache, &input).unwrap();
        Matcher {
            text,
            consumed: 0,
            cache,
            start,
            dfa: &self.dfa,
            skip_vec: &self.skip_vec,
            _marker: PhantomData,
        }
    }
}

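/// Token iterator over a borrowed input string, yielding
/// `(start, Token, end)` triples of byte offsets. The `E` parameter only
/// fixes the user-error type inside `ParseError`; no `E` value is ever
/// produced here (hence the `PhantomData<fn() -> E>`).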
pub struct Matcher<'input, 'builder, E> {
    text: &'input str,
    consumed: usize,
    cache: Cache,
    start: LazyStateID,
    dfa: &'builder DFA,
    skip_vec: &'builder [bool],
    _marker: PhantomData<fn() -> E>,
}

impl<'input, E> Iterator for Matcher<'input, '_, E> {
    type Item = Result<(usize, Token<'input>, usize), ParseError<usize, Token<'input>, E>>;

    fn next(&mut self) -> Option<Self::Item> {
        loop {
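            // Each pass scans one token starting at `self.consumed`; matches
            // of skipped patterns (e.g. whitespace) restart the loop instead
            // of being yielded.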
            let text = self.text;
            let start_offset = self.consumed;
            if text.is_empty() {
                self.consumed = start_offset;
                return None;
            }

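            // Walk the anchored DFA over the remaining text, remembering the
            // last (and therefore longest) match seen. Match states in the
            // lazy DFA are delayed by one byte: landing in a match state
            // after consuming the byte at index `i` means the match ends at
            // `i`, and `next_eoi_state` surfaces a match that runs to the
            // end of the input.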
            let mut match_ = None;
            'search: {
                let mut state = self.start;
                for (i, byte) in text.bytes().enumerate() {
                    state = self.dfa.next_state(&mut self.cache, state, byte).unwrap();
                    if state.is_match() {
                        match_ = Some((state, i));
                    } else if state.is_dead() {
                        break 'search;
                    }
                }
                state = self.dfa.next_eoi_state(&mut self.cache, state).unwrap();
                if state.is_match() {
                    match_ = Some((state, text.len()));
                }
            }

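            // Nothing matched at this position, so the input is not a valid
            // token. When several patterns match the same longest prefix,
            // the highest pattern index wins the tie.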
            let (match_state, longest_match) = match match_ {
                Some(match_) => match_,
                None => {
                    return Some(Err(ParseError::InvalidToken {
                        location: start_offset,
                    }))
                }
            };
            let index = (0..self.dfa.match_len(&self.cache, match_state))
                .map(|n| {
                    self.dfa
                        .match_pattern(&self.cache, match_state, n)
                        .as_usize()
                })
                .max()
                .unwrap();

            let result = &text[..longest_match];
            let remaining = &text[longest_match..];
            let end_offset = start_offset + longest_match;
            self.text = remaining;
            self.consumed = end_offset;

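            // A skipped pattern yields no token. A zero-length skip match
            // would never advance `self.text`, so report it as an invalid
            // token rather than looping forever.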
            if self.skip_vec[index] {
                if longest_match == 0 {
                    return Some(Err(ParseError::InvalidToken {
                        location: start_offset,
                    }));
                }
                continue;
            }

            return Some(Ok((start_offset, Token(index, result), end_offset)));
        }
    }
}