regexr/
lib.rs

1//! regexr - A high-performance regex engine built from scratch
2//!
3//! This crate provides a regex engine with multiple execution backends:
4//! - PikeVM: Thread-based NFA simulation (supports backreferences, lookaround)
5//! - Shift-Or: Bit-parallel NFA for patterns with ≤64 states
6//! - Lazy DFA: On-demand determinization with caching
7//! - JIT: Native x86-64 code generation (optional, requires `jit` feature)
8//! - SIMD: AVX2-accelerated literal search (optional, requires `simd` feature)
9
10#![warn(missing_docs)]
11#![warn(rust_2018_idioms)]
12
13pub mod dfa;
14pub mod engine;
15pub mod error;
16pub mod hir;
17pub mod literal;
18pub mod nfa;
19pub mod parser;
20pub mod vm;
21
22#[cfg(feature = "jit")]
23pub mod jit;
24
25#[cfg(feature = "simd")]
26pub mod simd;
27
28pub use error::{Error, Result};
29
30use engine::CompiledRegex;
31use std::collections::HashMap;
32use std::sync::Arc;
33
34/// Configuration options for regex compilation.
35#[derive(Debug, Clone, Default)]
36pub struct RegexBuilder {
37    pattern: String,
38    /// Whether to enable JIT compilation.
39    jit: bool,
40    /// Whether to enable prefix optimization for large alternations.
41    /// This is critical for tokenizer-style patterns with many literal alternatives.
42    optimize_prefixes: bool,
43}
44
45impl RegexBuilder {
46    /// Creates a new RegexBuilder with the given pattern.
47    pub fn new(pattern: &str) -> Self {
48        Self {
49            pattern: pattern.to_string(),
50            jit: false,
51            optimize_prefixes: false,
52        }
53    }
54
55    /// Enables or disables JIT compilation.
56    ///
57    /// When enabled, the regex will be compiled to native machine code
58    /// for maximum performance. This is ideal for patterns that will be
59    /// matched many times (e.g., tokenization).
60    ///
61    /// JIT compilation has higher upfront cost but faster matching.
62    /// Only available on x86-64 with the `jit` feature enabled.
63    ///
64    /// # Example
65    ///
66    /// ```
67    /// use regexr::RegexBuilder;
68    ///
69    /// let re = RegexBuilder::new(r"\w+")
70    ///     .jit(true)
71    ///     .build()
72    ///     .unwrap();
73    /// assert!(re.is_match("hello"));
74    /// ```
75    pub fn jit(mut self, enabled: bool) -> Self {
76        self.jit = enabled;
77        self
78    }
79
80    /// Enables or disables prefix optimization for large alternations.
81    ///
82    /// When enabled, large alternations of literals (like `(token1|token2|...|tokenN)`)
83    /// will be optimized by merging common prefixes into a trie structure.
84    /// This reduces the number of active NFA threads from O(vocabulary_size) to O(token_length).
85    ///
86    /// This is critical for tokenizer-style patterns with many literal alternatives.
87    ///
88    /// # Example
89    ///
90    /// ```
91    /// use regexr::RegexBuilder;
92    ///
93    /// // Pattern with many tokens sharing common prefixes
94    /// let re = RegexBuilder::new(r"(the|that|them|they|this)")
95    ///     .optimize_prefixes(true)
96    ///     .build()
97    ///     .unwrap();
98    /// assert!(re.is_match("the"));
99    /// ```
100    pub fn optimize_prefixes(mut self, enabled: bool) -> Self {
101        self.optimize_prefixes = enabled;
102        self
103    }
104
105    /// Builds the regex with the configured options.
106    pub fn build(self) -> Result<Regex> {
107        let ast = parser::parse(&self.pattern)?;
108        let mut hir_result = hir::translate(&ast)?;
109
110        // Apply prefix optimization if enabled
111        if self.optimize_prefixes {
112            hir_result = hir::optimize_prefixes(hir_result);
113        }
114
115        let named_groups = Arc::new(hir_result.props.named_groups.clone());
116
117        let inner = if self.jit {
118            engine::compile_with_jit(&hir_result)?
119        } else {
120            // Use compile_from_hir for optimal engine selection (ShiftOr, LazyDfa, etc.)
121            engine::compile_from_hir(&hir_result)?
122        };
123
124        Ok(Regex {
125            inner,
126            pattern: self.pattern,
127            named_groups,
128        })
129    }
130}
131
132/// A compiled regular expression.
133#[derive(Debug)]
134pub struct Regex {
135    inner: CompiledRegex,
136    pattern: String,
137    /// Named capture groups: maps name to index.
138    named_groups: Arc<HashMap<String, u32>>,
139}
140
141impl Regex {
142    /// Compiles a regular expression pattern.
143    ///
144    /// # Errors
145    /// Returns an error if the pattern is invalid.
146    pub fn new(pattern: &str) -> Result<Regex> {
147        let ast = parser::parse(pattern)?;
148        let hir = hir::translate(&ast)?;
149        let named_groups = Arc::new(hir.props.named_groups.clone());
150        // Use HIR-based compilation to enable Shift-Or and prefilters
151        let inner = engine::compile_from_hir(&hir)?;
152
153        Ok(Regex {
154            inner,
155            pattern: pattern.to_string(),
156            named_groups,
157        })
158    }
159
160    /// Returns the names of all named capture groups.
161    pub fn capture_names(&self) -> impl Iterator<Item = &str> {
162        self.named_groups.keys().map(|s| s.as_str())
163    }
164
165    /// Returns the original pattern string.
166    pub fn as_str(&self) -> &str {
167        &self.pattern
168    }
169
170    /// Returns true if the regex matches anywhere in the text.
171    pub fn is_match(&self, text: &str) -> bool {
172        self.inner.is_match(text.as_bytes())
173    }
174
175    /// Returns the first match in the text.
176    pub fn find<'t>(&self, text: &'t str) -> Option<Match<'t>> {
177        self.inner
178            .find(text.as_bytes())
179            .map(|(start, end)| Match { text, start, end })
180    }
181
182    /// Returns an iterator over all non-overlapping matches.
183    pub fn find_iter<'a>(&'a self, text: &'a str) -> Matches<'a> {
184        Matches::new(self, text)
185    }
186
187    /// Returns the capture groups for the first match.
188    pub fn captures<'t>(&self, text: &'t str) -> Option<Captures<'t>> {
189        self.inner.captures(text.as_bytes()).map(|slots| Captures {
190            text,
191            slots,
192            named_groups: Arc::clone(&self.named_groups),
193        })
194    }
195
196    /// Returns an iterator over all non-overlapping captures.
197    pub fn captures_iter<'r, 't>(&'r self, text: &'t str) -> CapturesIter<'r, 't> {
198        CapturesIter {
199            regex: self,
200            text,
201            last_end: 0,
202        }
203    }
204
205    /// Replaces the first match with the replacement string.
206    pub fn replace<'t>(&self, text: &'t str, rep: &str) -> std::borrow::Cow<'t, str> {
207        match self.find(text) {
208            None => std::borrow::Cow::Borrowed(text),
209            Some(m) => {
210                let mut result = String::with_capacity(text.len() + rep.len());
211                result.push_str(&text[..m.start()]);
212                result.push_str(rep);
213                result.push_str(&text[m.end()..]);
214                std::borrow::Cow::Owned(result)
215            }
216        }
217    }
218
219    /// Returns the name of the engine being used (for debugging).
220    pub fn engine_name(&self) -> &'static str {
221        self.inner.engine_name()
222    }
223
224    /// Replaces all matches with the replacement string.
225    pub fn replace_all<'t>(&self, text: &'t str, rep: &str) -> std::borrow::Cow<'t, str> {
226        let mut last_end = 0;
227        let mut result = String::new();
228        let mut had_match = false;
229
230        for m in self.find_iter(text) {
231            had_match = true;
232            result.push_str(&text[last_end..m.start()]);
233            result.push_str(rep);
234            last_end = m.end();
235        }
236
237        if !had_match {
238            std::borrow::Cow::Borrowed(text)
239        } else {
240            result.push_str(&text[last_end..]);
241            std::borrow::Cow::Owned(result)
242        }
243    }
244}
245
246/// A single match in the text.
247#[derive(Debug, Clone, Copy)]
248pub struct Match<'t> {
249    text: &'t str,
250    start: usize,
251    end: usize,
252}
253
254impl<'t> Match<'t> {
255    /// Returns the start byte offset of the match.
256    pub fn start(&self) -> usize {
257        self.start
258    }
259
260    /// Returns the end byte offset of the match.
261    pub fn end(&self) -> usize {
262        self.end
263    }
264
265    /// Returns the matched text.
266    pub fn as_str(&self) -> &'t str {
267        &self.text[self.start..self.end]
268    }
269
270    /// Returns the byte range of the match.
271    pub fn range(&self) -> std::ops::Range<usize> {
272        self.start..self.end
273    }
274
275    /// Returns the length of the match in bytes.
276    pub fn len(&self) -> usize {
277        self.end - self.start
278    }
279
280    /// Returns true if the match is empty.
281    pub fn is_empty(&self) -> bool {
282        self.start == self.end
283    }
284}
285
286/// An iterator over all non-overlapping matches.
287pub struct Matches<'a> {
288    inner: MatchesInner<'a>,
289    text: &'a str,
290}
291
292impl<'a> std::fmt::Debug for Matches<'a> {
293    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
294        f.debug_struct("Matches")
295            .field("text_len", &self.text.len())
296            .finish_non_exhaustive()
297    }
298}
299
300/// Internal iterator state - either uses TeddyFull fast path or generic find().
301enum MatchesInner<'a> {
302    /// Fast path: use TeddyFull prefilter iterator directly.
303    TeddyFull(literal::FullMatchIter<'a, 'a>),
304    /// Generic path: call find() repeatedly.
305    Generic { regex: &'a Regex, last_end: usize },
306}
307
308impl<'a> Matches<'a> {
309    /// Creates a new matches iterator.
310    fn new(regex: &'a Regex, text: &'a str) -> Self {
311        let inner = if regex.inner.is_full_match_prefilter() {
312            // Fast path: use Teddy iterator directly
313            MatchesInner::TeddyFull(regex.inner.find_full_matches(text.as_bytes()))
314        } else {
315            // Generic path
316            MatchesInner::Generic { regex, last_end: 0 }
317        };
318        Matches { inner, text }
319    }
320}
321
322impl<'a> Iterator for Matches<'a> {
323    type Item = Match<'a>;
324
325    fn next(&mut self) -> Option<Match<'a>> {
326        match &mut self.inner {
327            MatchesInner::TeddyFull(iter) => {
328                // Fast path: get match directly from Teddy iterator
329                iter.next().map(|(start, end)| Match {
330                    text: self.text,
331                    start,
332                    end,
333                })
334            }
335            MatchesInner::Generic { regex, last_end } => {
336                if *last_end > self.text.len() {
337                    return None;
338                }
339
340                let search_text = &self.text[*last_end..];
341                match regex.inner.find(search_text.as_bytes()) {
342                    None => None,
343                    Some((start, end)) => {
344                        let abs_start = *last_end + start;
345                        let abs_end = *last_end + end;
346
347                        // Advance past the match, but ensure progress on empty matches
348                        // For empty matches, advance to the next UTF-8 character boundary
349                        *last_end = if abs_start == abs_end {
350                            // Find the next char boundary after abs_end
351                            let remaining = &self.text[abs_end..];
352                            let next_char_len =
353                                remaining.chars().next().map(|c| c.len_utf8()).unwrap_or(1);
354                            abs_end + next_char_len
355                        } else {
356                            abs_end
357                        };
358
359                        Some(Match {
360                            text: self.text,
361                            start: abs_start,
362                            end: abs_end,
363                        })
364                    }
365                }
366            }
367        }
368    }
369}
370
371/// An iterator over all non-overlapping captures.
372#[derive(Debug)]
373pub struct CapturesIter<'r, 't> {
374    regex: &'r Regex,
375    text: &'t str,
376    last_end: usize,
377}
378
379impl<'r, 't> Iterator for CapturesIter<'r, 't> {
380    type Item = Captures<'t>;
381
382    fn next(&mut self) -> Option<Captures<'t>> {
383        if self.last_end > self.text.len() {
384            return None;
385        }
386
387        let search_text = &self.text[self.last_end..];
388        match self.regex.inner.captures(search_text.as_bytes()) {
389            None => None,
390            Some(slots) => {
391                // Get the full match bounds (slot 0)
392                let (start, end) = slots.first().and_then(|s| *s)?;
393                let offset = self.last_end;
394
395                // Advance past the match, but ensure progress on empty matches
396                // For empty matches, advance to the next UTF-8 character boundary
397                let abs_end = offset + end;
398                self.last_end = if start == end {
399                    let remaining = &self.text[abs_end..];
400                    let next_char_len = remaining.chars().next().map(|c| c.len_utf8()).unwrap_or(1);
401                    abs_end + next_char_len
402                } else {
403                    abs_end
404                };
405
406                // Adjust all slot positions to absolute positions
407                let adjusted_slots: Vec<_> = slots
408                    .into_iter()
409                    .map(|slot| slot.map(|(s, e)| (offset + s, offset + e)))
410                    .collect();
411
412                Some(Captures {
413                    text: self.text,
414                    slots: adjusted_slots,
415                    named_groups: Arc::clone(&self.regex.named_groups),
416                })
417            }
418        }
419    }
420}
421
422/// Captured groups from a regex match.
423#[derive(Debug, Clone)]
424pub struct Captures<'t> {
425    text: &'t str,
426    slots: Vec<Option<(usize, usize)>>,
427    named_groups: Arc<HashMap<String, u32>>,
428}
429
430impl<'t> Captures<'t> {
431    /// Returns the number of capture groups (including group 0 for the full match).
432    pub fn len(&self) -> usize {
433        self.slots.len()
434    }
435
436    /// Returns true if there are no captures.
437    pub fn is_empty(&self) -> bool {
438        self.slots.is_empty()
439    }
440
441    /// Returns the capture group at the given index.
442    pub fn get(&self, i: usize) -> Option<Match<'t>> {
443        self.slots.get(i).and_then(|slot| {
444            slot.map(|(start, end)| Match {
445                text: self.text,
446                start,
447                end,
448            })
449        })
450    }
451
452    /// Returns the capture group with the given name.
453    pub fn name(&self, name: &str) -> Option<Match<'t>> {
454        self.named_groups
455            .get(name)
456            .and_then(|&idx| self.get(idx as usize))
457    }
458}
459
460impl<'t> std::ops::Index<usize> for Captures<'t> {
461    type Output = str;
462
463    fn index(&self, i: usize) -> &str {
464        self.get(i)
465            .map(|m| m.as_str())
466            .unwrap_or_else(|| panic!("no capture group at index {}", i))
467    }
468}
469
470impl<'t> std::ops::Index<&str> for Captures<'t> {
471    type Output = str;
472
473    fn index(&self, name: &str) -> &str {
474        self.name(name)
475            .map(|m| m.as_str())
476            .unwrap_or_else(|| panic!("no capture group named '{}'", name))
477    }
478}