Skip to main content

resharp/
lib.rs

1//! resharp - a regex engine with all boolean operations and lookarounds,
2//! powered by symbolic derivatives and lazy DFA construction.
3
4#![deny(missing_docs)]
5
6pub(crate) mod accel;
7pub(crate) mod engine;
8pub(crate) mod simd;
9
10#[doc(hidden)]
11pub use engine::calc_potential_start;
12#[doc(hidden)]
13pub use engine::calc_prefix_sets;
14#[doc(hidden)]
15pub use resharp_algebra::solver::TSetId;
16
17pub use resharp_algebra::nulls::Nullability;
18pub use resharp_algebra::NodeId;
19pub use resharp_algebra::RegexBuilder;
20
21use std::sync::Mutex;
22
23/// error from compiling or matching a regex.
24#[derive(Debug)]
25#[non_exhaustive]
26pub enum Error {
27    /// parse failure.
28    Parse(resharp_parser::ResharpError),
29    /// algebra error (unsupported pattern, anchor limit).
30    Algebra(resharp_algebra::AlgebraError),
31    /// DFA state cache exceeded `max_dfa_capacity`.
32    CapacityExceeded,
33}
34
35impl std::fmt::Display for Error {
36    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
37        match self {
38            Error::Parse(e) => write!(f, "parse error: {}", e),
39            Error::Algebra(e) => write!(f, "{}", e),
40            Error::CapacityExceeded => write!(f, "DFA state capacity exceeded"),
41        }
42    }
43}
44
45impl std::error::Error for Error {
46    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
47        match self {
48            Error::Parse(e) => Some(e),
49            Error::Algebra(e) => Some(e),
50            Error::CapacityExceeded => None,
51        }
52    }
53}
54
55impl From<resharp_parser::ResharpError> for Error {
56    fn from(e: resharp_parser::ResharpError) -> Self {
57        Error::Parse(e)
58    }
59}
60
61impl From<resharp_algebra::AlgebraError> for Error {
62    fn from(e: resharp_algebra::AlgebraError) -> Self {
63        Error::Algebra(e)
64    }
65}
66
67/// lazy DFA engine options.
68pub struct EngineOptions {
69    /// states to eagerly precompile (0 = fully lazy).
70    pub dfa_threshold: usize,
71    /// max cached DFA states; clamped to `u16::MAX`.
72    pub max_dfa_capacity: usize,
73    /// max lookahead context distance (default: 800).
74    pub lookahead_context_max: u32,
75}
76
77impl Default for EngineOptions {
78    fn default() -> Self {
79        Self {
80            dfa_threshold: 0,
81            max_dfa_capacity: u16::MAX as usize,
82            lookahead_context_max: 800,
83        }
84    }
85}
86
87/// byte-offset range `[start, end)`.
88#[derive(Debug, Clone, Copy, PartialEq, Eq)]
89#[repr(C)]
90pub struct Match {
91    /// inclusive start.
92    pub start: usize,
93    /// exclusive end.
94    pub end: usize,
95}
96
97struct RegexInner {
98    b: RegexBuilder,
99    fwd: engine::LazyDFA,
100    rev: engine::LazyDFA,
101    nulls_buf: Vec<usize>,
102}
103
104/// compiled regex backed by a lazy DFA.
105///
106/// uses a `Mutex` for mutable DFA state; clone for per-thread matching.
107pub struct Regex {
108    inner: Mutex<RegexInner>,
109    fwd_prefix: Option<accel::FwdPrefixSearch>,
110    fixed_length: Option<u32>,
111    #[allow(dead_code)]
112    max_length: Option<u32>,
113    empty_nullable: bool,
114}
115
116impl Regex {
117    /// compile with default options.
118    pub fn new(pattern: &str) -> Result<Regex, Error> {
119        Self::with_options(pattern, EngineOptions::default())
120    }
121
122    /// compile with custom options.
123    pub fn with_options(pattern: &str, opts: EngineOptions) -> Result<Regex, Error> {
124        let mut b = RegexBuilder::new();
125        b.lookahead_context_max = opts.lookahead_context_max;
126        let node = resharp_parser::parse_ast(&mut b, pattern)?;
127        Self::from_node(b, node, opts)
128    }
129
130    /// build from a pre-constructed AST node.
131    pub fn from_node(
132        mut b: RegexBuilder,
133        node: NodeId,
134        opts: EngineOptions,
135    ) -> Result<Regex, Error> {
136        let empty_nullable = b
137            .nullability_emptystring(node)
138            .has(Nullability::EMPTYSTRING);
139
140        let fwd_start = b.strip_lb(node)?;
141        let rev_start = b.reverse(node)?;
142        let ts_rev_start = b.mk_concat(NodeId::TS, rev_start);
143
144        let fixed_length = b.get_fixed_length(node);
145        let (min_len, max_len) = b.get_min_max_length(node);
146        let max_length = if max_len != u32::MAX {
147            Some(max_len)
148        } else {
149            None
150        };
151        let can_match_fwd = !b.is_infinite(node) && !b.contains_look(node);
152
153        let max_cap = opts.max_dfa_capacity.min(u16::MAX as usize);
154        let mut fwd = engine::LazyDFA::new(&mut b, fwd_start, max_cap)?;
155        let mut rev = engine::LazyDFA::new(&mut b, ts_rev_start, max_cap)?;
156
157        if opts.dfa_threshold > 0 {
158            fwd.precompile(&mut b, opts.dfa_threshold);
159            rev.precompile(&mut b, opts.dfa_threshold);
160        }
161
162        let fwd_prefix = if min_len > 0 && can_match_fwd {
163            engine::build_fwd_prefix(&mut b, node)?
164        } else {
165            None
166        };
167
168        rev.compute_skip(&mut b, rev_start)?;
169
170        Ok(Regex {
171            inner: Mutex::new(RegexInner {
172                b,
173                fwd,
174                rev,
175                nulls_buf: Vec::new(),
176            }),
177            fwd_prefix,
178            fixed_length,
179            max_length,
180            empty_nullable,
181        })
182    }
183
184    /// (fwd_states, rev_states) count.
185    pub fn dfa_stats(&self) -> (usize, usize) {
186        let inner = self.inner.lock().unwrap();
187        (inner.fwd.state_nodes.len(), inner.rev.state_nodes.len())
188    }
189
190    /// all non-overlapping matches, left-to-right.
191    pub fn find_all(&self, input: &[u8]) -> Result<Vec<Match>, Error> {
192        if input.is_empty() {
193            return if self.empty_nullable {
194                Ok(vec![Match { start: 0, end: 0 }])
195            } else {
196                Ok(vec![])
197            };
198        }
199        if self.fwd_prefix.is_some() {
200            return self.find_all_fwd_prefix(input);
201        }
202        self.find_all_dfa(input)
203    }
204
205    /// debug: dump rev DFA effects_id and effects.
206    pub fn effects_debug(&self) -> String {
207        let inner = self.inner.lock().unwrap();
208        let rev = &inner.rev;
209        let mut out = String::new();
210        for (i, &eid) in rev.effects_id.iter().enumerate() {
211            if eid != 0 {
212                let nulls: Vec<String> = rev.effects[eid as usize]
213                    .iter()
214                    .map(|n| format!("(mask={},rel={})", n.mask.0, n.rel))
215                    .collect();
216                out += &format!("  state[{}] eid={} nulls=[{}]\n", i, eid, nulls.join(", "));
217            }
218        }
219        out
220    }
221
222    /// debug: run only the reverse DFA, return null positions.
223    pub fn collect_rev_nulls_debug(&self, input: &[u8]) -> Vec<usize> {
224        let inner = &mut *self.inner.lock().unwrap();
225        inner.nulls_buf.clear();
226        inner
227            .rev
228            .collect_rev(&mut inner.b, input.len() - 1, input, &mut inner.nulls_buf)
229            .unwrap();
230        inner.nulls_buf.clone()
231    }
232
233    fn find_all_dfa(&self, input: &[u8]) -> Result<Vec<Match>, Error> {
234        let inner = &mut *self.inner.lock().unwrap();
235
236        inner.nulls_buf.clear();
237
238        inner
239            .rev
240            .collect_rev(&mut inner.b, input.len() - 1, input, &mut inner.nulls_buf)?;
241
242        let mut matches = Vec::new();
243        if let Some(fl) = self.fixed_length {
244            let fl = fl as usize;
245            let mut last_end = 0;
246            for &start in inner.nulls_buf.iter().rev() {
247                if start >= last_end && start + fl <= input.len() {
248                    matches.push(Match {
249                        start,
250                        end: start + fl,
251                    });
252                    last_end = start + fl;
253                }
254            }
255        } else {
256            inner
257                .fwd
258                .scan_fwd_all(&mut inner.b, &inner.nulls_buf, input, &mut matches)?;
259        }
260
261        if inner.rev.effects_id[inner.rev.initial as usize] != 0 {
262            matches.push(Match {
263                start: input.len(),
264                end: input.len(),
265            });
266        }
267
268        Ok(matches)
269    }
270
271    fn find_all_fwd_prefix(&self, input: &[u8]) -> Result<Vec<Match>, Error> {
272        let fwd_prefix = self.fwd_prefix.as_ref().unwrap();
273        let mut matches = Vec::new();
274        let mut search_start = 0;
275
276        if self.fixed_length.is_some() && fwd_prefix.find_all_literal(input, &mut matches) {
277            // done
278        } else if let Some(fl) = self.fixed_length {
279            while let Some(candidate) = fwd_prefix.find_fwd(input, search_start) {
280                let end = candidate + fl as usize;
281                if end <= input.len() {
282                    matches.push(Match {
283                        start: candidate,
284                        end,
285                    });
286                    search_start = end;
287                } else {
288                    break;
289                }
290            }
291        } else {
292            let inner = &mut *self.inner.lock().unwrap();
293            let prefix_len = fwd_prefix.len();
294            while let Some(candidate) = fwd_prefix.find_fwd(input, search_start) {
295                let state = inner
296                    .fwd
297                    .walk_input(&mut inner.b, candidate, prefix_len, input)?;
298                if state != 0 {
299                    let max_end = inner.fwd.scan_fwd_from(
300                        &mut inner.b,
301                        state,
302                        candidate + prefix_len,
303                        input,
304                    )?;
305                    if max_end != engine::NO_MATCH && max_end > candidate {
306                        matches.push(Match {
307                            start: candidate,
308                            end: max_end,
309                        });
310                        search_start = max_end;
311                        continue;
312                    }
313                }
314                search_start = candidate + 1;
315            }
316        }
317
318        Ok(matches)
319    }
320
321    /// longest match anchored at position 0, forward DFA only.
322    pub fn find_anchored(&self, input: &[u8]) -> Result<Option<Match>, Error> {
323        if input.is_empty() {
324            return if self.empty_nullable {
325                Ok(Some(Match { start: 0, end: 0 }))
326            } else {
327                Ok(None)
328            };
329        }
330        let inner = &mut *self.inner.lock().unwrap();
331        let max_end = inner.fwd.scan_fwd(&mut inner.b, 0, input)?;
332        if max_end != engine::NO_MATCH {
333            Ok(Some(Match {
334                start: 0,
335                end: max_end,
336            }))
337        } else {
338            Ok(None)
339        }
340    }
341
342    /// whether the pattern matches anywhere in the input.
343    pub fn is_match(&self, input: &[u8]) -> Result<bool, Error> {
344        if input.is_empty() {
345            return Ok(self.empty_nullable);
346        }
347        let inner = &mut *self.inner.lock().unwrap();
348        if inner.rev.effects_id[inner.rev.initial as usize] != 0 {
349            return Ok(true);
350        }
351        inner
352            .rev
353            .any_nullable_rev(&mut inner.b, input.len() - 1, input)
354    }
355}