regexr 0.1.2

A high-performance regex engine built from scratch with JIT compilation and SIMD acceleration
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
//! Literal prefix/suffix extraction.
//!
//! Extracts literal prefixes and suffixes from HIR patterns for prefiltering.
//! Supports both single-literal and multi-literal extraction for Teddy.

use crate::hir::{Hir, HirExpr};

/// Extracted literals from a pattern.
#[derive(Debug, Clone, Default)]
pub struct Literals {
    /// Required prefix literals.
    /// For alternations like `hello|world`, contains `[b"hello", b"world"]`.
    /// For concatenations like `hello.*`, contains `[b"hello"]`.
    pub prefixes: Vec<Vec<u8>>,
    /// Required suffix literals.
    pub suffixes: Vec<Vec<u8>>,
    /// Whether the prefix set is complete (all match positions start with one of these).
    pub prefix_complete: bool,
    /// True if the pattern starts with a digit class (0-9).
    /// Used to create StartsWithDigit prefilter when no literal prefix exists.
    pub starts_with_digit: bool,
}

impl Literals {
    /// Returns true if there are no literals.
    pub fn is_empty(&self) -> bool {
        self.prefixes.is_empty() && self.suffixes.is_empty()
    }

    /// Returns the single prefix if there's exactly one.
    pub fn single_prefix(&self) -> Option<&[u8]> {
        if self.prefixes.len() == 1 {
            Some(&self.prefixes[0])
        } else {
            None
        }
    }

    /// Returns true if there are multiple prefixes (suitable for Teddy).
    pub fn has_multiple_prefixes(&self) -> bool {
        self.prefixes.len() > 1
    }

    /// Returns the number of prefix alternatives.
    pub fn prefix_count(&self) -> usize {
        self.prefixes.len()
    }
}

/// Extracts literals from an HIR.
pub fn extract_literals(hir: &Hir) -> Literals {
    let mut extractor = LiteralExtractor::new();
    let result = extractor.extract(&hir.expr);

    // Patterns with backreferences, lookarounds, or word boundaries cannot be
    // fully matched by literals alone - they require NFA verification.
    // Set prefix_complete = false to prevent TeddyFull from bypassing NFA.
    let prefix_complete = result.complete
        && !hir.props.has_backrefs
        && !hir.props.has_lookaround
        && !hir.props.has_word_boundary;

    // If no prefix literals found, check if pattern starts with digit class
    let starts_with_digit = result.prefixes.is_empty() && starts_with_digit_class(&hir.expr);

    Literals {
        prefixes: result.prefixes,
        suffixes: vec![],
        prefix_complete,
        starts_with_digit,
    }
}

/// Checks if an HIR expression starts with a pure digit character class.
/// Returns true only if the class exclusively matches digits (0-9), not if it
/// merely includes digits among other characters (like \w which includes letters).
fn starts_with_digit_class(expr: &HirExpr) -> bool {
    match expr {
        HirExpr::Class(class) => {
            // Only return true if ALL ranges are within the digit range 0-9.
            // This ensures we don't match \w which includes [A-Za-z0-9_].
            !class.ranges.is_empty()
                && class
                    .ranges
                    .iter()
                    .all(|(lo, hi)| *lo >= b'0' && *hi <= b'9')
        }
        HirExpr::Concat(exprs) => {
            // Skip anchors and find first non-anchor
            for e in exprs {
                if matches!(e, HirExpr::Anchor(_)) {
                    continue;
                }
                return starts_with_digit_class(e);
            }
            false
        }
        HirExpr::Repeat(rep) => {
            // Repeat of digits still starts with digit
            if rep.min > 0 {
                starts_with_digit_class(&rep.expr)
            } else {
                false
            }
        }
        HirExpr::Capture(cap) => starts_with_digit_class(&cap.expr),
        _ => false,
    }
}

/// Result of extracting prefixes from an expression.
#[derive(Debug, Clone, Default)]
struct ExtractionResult {
    /// The extracted prefixes.
    prefixes: Vec<Vec<u8>>,
    /// Whether the extraction is complete (all branches have literals).
    complete: bool,
    /// Whether the expression has a nullable suffix (e.g., ends with `?`, `*`).
    /// If true, we cannot safely extend with subsequent literals.
    has_nullable_suffix: bool,
}

struct LiteralExtractor {
    /// Maximum number of prefixes to extract (for Teddy limit).
    max_prefixes: usize,
    /// Maximum length of each prefix.
    max_prefix_len: usize,
}

impl LiteralExtractor {
    fn new() -> Self {
        Self {
            max_prefixes: 8,   // Teddy limit
            max_prefix_len: 8, // Teddy limit
        }
    }

    fn extract(&mut self, expr: &HirExpr) -> ExtractionResult {
        match expr {
            HirExpr::Literal(bytes) => {
                // Truncate to max length
                let truncated = bytes.len() > self.max_prefix_len;
                let prefix = if truncated {
                    bytes[..self.max_prefix_len].to_vec()
                } else {
                    bytes.clone()
                };
                ExtractionResult {
                    prefixes: vec![prefix],
                    // Only complete if we didn't truncate - truncated prefixes
                    // cannot provide full match bounds
                    complete: !truncated,
                    has_nullable_suffix: false,
                }
            }
            HirExpr::Concat(exprs) => {
                // Extract from the first non-anchor element, extend with subsequent literals.
                // Anchors (including word boundaries) are zero-width and should be skipped
                // during literal extraction. For example, `\bthe\b` should extract "the".
                if exprs.is_empty() {
                    return ExtractionResult::default();
                }

                // Skip leading anchors to find the first literal-producing expression
                let mut start_idx = 0;
                while start_idx < exprs.len() && matches!(&exprs[start_idx], HirExpr::Anchor(_)) {
                    start_idx += 1;
                }

                if start_idx >= exprs.len() {
                    // All anchors, no literals
                    return ExtractionResult::default();
                }

                let mut result = self.extract(&exprs[start_idx]);

                // Track whether we've seen all literals so far
                let mut all_literals_so_far = matches!(&exprs[start_idx], HirExpr::Literal(_));

                // Only extend prefixes with subsequent literals if there's no
                // nullable suffix. A nullable suffix means the prefix might not
                // consume all of what we extracted.
                if !result.has_nullable_suffix {
                    // Try to extend prefixes with subsequent literals
                    for expr in &exprs[start_idx + 1..] {
                        // Skip anchors (zero-width, don't affect literals)
                        if matches!(expr, HirExpr::Anchor(_)) {
                            continue;
                        }
                        if let HirExpr::Literal(bytes) = expr {
                            // Extend each prefix (up to max length)
                            for prefix in &mut result.prefixes {
                                let remaining = self.max_prefix_len.saturating_sub(prefix.len());
                                if remaining > 0 {
                                    let extend_len = bytes.len().min(remaining);
                                    prefix.extend_from_slice(&bytes[..extend_len]);
                                    // If we couldn't fit all bytes, mark incomplete
                                    if extend_len < bytes.len() {
                                        result.complete = false;
                                    }
                                } else {
                                    // No room to extend - subsequent literal was skipped
                                    result.complete = false;
                                }
                            }
                        } else {
                            // Stop extending if we hit a non-literal.
                            // The prefix is no longer complete since there's
                            // a non-literal suffix that must also match.
                            all_literals_so_far = false;
                            result.complete = false;
                            // Check if this expression has a nullable suffix
                            let sub = self.extract(expr);
                            if sub.has_nullable_suffix {
                                result.has_nullable_suffix = true;
                            }
                            break;
                        }
                    }
                } else {
                    // If first element has nullable suffix, check if there are
                    // subsequent elements - if so, prefix isn't complete
                    if start_idx + 1 < exprs.len() {
                        result.complete = false;
                    }
                }

                // Check if the concat ends with a nullable expression
                // Also, if the last element is not a literal or anchor, the prefix isn't complete
                // (Anchors are zero-width and don't affect completeness)
                if let Some(last) = exprs.last() {
                    // Skip trailing anchors to find the actual last element
                    let actual_last = exprs
                        .iter()
                        .rev()
                        .find(|e| !matches!(e, HirExpr::Anchor(_)))
                        .unwrap_or(last);

                    let last_result = self.extract(actual_last);
                    if last_result.has_nullable_suffix {
                        result.has_nullable_suffix = true;
                    }
                    // If the last expression is not a complete literal, mark as incomplete
                    if !last_result.complete || !matches!(actual_last, HirExpr::Literal(_)) {
                        // Only if we haven't already extended through all literals
                        if !all_literals_so_far {
                            result.complete = false;
                        }
                    }
                }

                result
            }
            HirExpr::Alt(exprs) => {
                // Collect prefixes from all branches
                let mut all_prefixes: Vec<Vec<u8>> = Vec::new();
                let mut all_complete = true;
                let mut any_nullable_suffix = false;

                for expr in exprs {
                    let sub_result = self.extract(expr);

                    if sub_result.prefixes.is_empty() {
                        // One branch has no prefix - can't use multi-prefix
                        // Try to find common prefix instead
                        return self.extract_common_prefix(exprs);
                    }

                    all_complete = all_complete && sub_result.complete;
                    any_nullable_suffix = any_nullable_suffix || sub_result.has_nullable_suffix;
                    all_prefixes.extend(sub_result.prefixes);

                    // Check if we've exceeded the limit
                    if all_prefixes.len() > self.max_prefixes {
                        // Too many prefixes - fall back to common prefix
                        return self.extract_common_prefix(exprs);
                    }
                }

                // Deduplicate prefixes
                all_prefixes.sort();
                all_prefixes.dedup();

                ExtractionResult {
                    prefixes: all_prefixes,
                    complete: all_complete,
                    has_nullable_suffix: any_nullable_suffix,
                }
            }
            HirExpr::Repeat(rep) => {
                if rep.min > 0 {
                    // Required repetition - extract inner prefix
                    // But the expression has a nullable suffix since repetition
                    // can match more or less than what's required
                    let mut result = self.extract(&rep.expr);
                    // Even with min > 0, repetition can match variable amounts,
                    // which means it has a "nullable suffix" in the sense that
                    // subsequent literals might not directly follow the required part.
                    // For example, a+b can match "ab" or "aab", so we shouldn't
                    // extend the prefix "a" with "b".
                    result.has_nullable_suffix = true;
                    result
                } else {
                    // Zero-or-more means no required prefix
                    ExtractionResult {
                        has_nullable_suffix: true,
                        ..Default::default()
                    }
                }
            }
            HirExpr::Capture(cap) => self.extract(&cap.expr),
            HirExpr::Class(_) => {
                // Can't extract literals from character classes
                ExtractionResult::default()
            }
            _ => ExtractionResult::default(),
        }
    }

    /// Extracts the common prefix from alternation branches.
    fn extract_common_prefix(&mut self, exprs: &[HirExpr]) -> ExtractionResult {
        let mut all_prefixes: Vec<Vec<u8>> = Vec::new();

        for expr in exprs {
            let sub_result = self.extract(expr);
            if sub_result.prefixes.is_empty() {
                return ExtractionResult::default();
            }
            all_prefixes.extend(sub_result.prefixes);
        }

        if let Some(common) = find_common_prefix(&all_prefixes) {
            if !common.is_empty() {
                return ExtractionResult {
                    prefixes: vec![common],
                    complete: false, // Common prefix isn't complete
                    has_nullable_suffix: false,
                };
            }
        }

        ExtractionResult::default()
    }
}

/// Finds the common prefix among a set of byte sequences.
fn find_common_prefix(seqs: &[Vec<u8>]) -> Option<Vec<u8>> {
    if seqs.is_empty() {
        return None;
    }

    let first = &seqs[0];
    let mut prefix_len = first.len();

    for seq in &seqs[1..] {
        let common_len = first
            .iter()
            .zip(seq.iter())
            .take_while(|(a, b)| a == b)
            .count();
        prefix_len = prefix_len.min(common_len);
    }

    if prefix_len == 0 {
        None
    } else {
        Some(first[..prefix_len].to_vec())
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::hir::translate;
    use crate::parser::parse;

    fn get_literals(pattern: &str) -> Literals {
        let ast = parse(pattern).unwrap();
        let hir = translate(&ast).unwrap();
        extract_literals(&hir)
    }

    #[test]
    fn test_simple_literal() {
        let lits = get_literals("hello");
        assert_eq!(lits.prefixes.len(), 1);
        assert_eq!(lits.prefixes[0], b"hello");
        assert!(lits.prefix_complete);
    }

    #[test]
    fn test_long_literal_truncated() {
        let lits = get_literals("helloworld123");
        assert_eq!(lits.prefixes.len(), 1);
        assert_eq!(lits.prefixes[0], b"hellowor"); // Truncated to 8 bytes
                                                   // Truncated literals cannot be "complete" - they're only prefixes
        assert!(!lits.prefix_complete);
    }

    #[test]
    fn test_no_prefix() {
        let lits = get_literals(".*hello");
        assert!(lits.prefixes.is_empty());
    }

    #[test]
    fn test_alternation_multi_prefix() {
        // Different prefixes - should extract both for Teddy
        let lits = get_literals("hello|world");
        assert_eq!(lits.prefixes.len(), 2);
        assert!(lits.prefixes.contains(&b"hello".to_vec()));
        assert!(lits.prefixes.contains(&b"world".to_vec()));
    }

    #[test]
    fn test_alternation_common_prefix() {
        // Same prefix - should extract common prefix
        let lits = get_literals("hello|help");
        // Both start with "hel", so we get both as separate prefixes
        assert_eq!(lits.prefixes.len(), 2);
        assert!(lits.prefixes.contains(&b"hello".to_vec()));
        assert!(lits.prefixes.contains(&b"help".to_vec()));
    }

    #[test]
    fn test_concat_extends_prefix() {
        let lits = get_literals("ab");
        assert_eq!(lits.prefixes.len(), 1);
        assert_eq!(lits.prefixes[0], b"ab");
    }

    #[test]
    fn test_class_no_prefix() {
        let lits = get_literals("[abc]hello");
        assert!(lits.prefixes.is_empty());
    }

    #[test]
    fn test_repeat_one_or_more() {
        let lits = get_literals("a+b");
        assert_eq!(lits.prefixes.len(), 1);
        // a+ means at least one 'a', but we cannot extend with 'b' because
        // the match could be "aab" or "aaab" - the prefix is just "a"
        assert_eq!(lits.prefixes[0], b"a");
    }

    #[test]
    fn test_repeat_zero_or_more_no_prefix() {
        let lits = get_literals("a*b");
        assert!(lits.prefixes.is_empty());
    }

    #[test]
    fn test_too_many_alternations() {
        // More than 8 alternations - falls back to common prefix (none in this case)
        let lits = get_literals("a|b|c|d|e|f|g|h|i|j");
        // Since there's no common prefix among a,b,c..., should be empty
        assert!(lits.prefixes.is_empty());
    }

    #[test]
    fn test_nested_alternation() {
        let lits = get_literals("(cat|dog)food");
        assert_eq!(lits.prefixes.len(), 2);
        assert!(lits.prefixes.contains(&b"catfood".to_vec()));
        assert!(lits.prefixes.contains(&b"dogfood".to_vec()));
    }

    #[test]
    fn test_literal_then_star() {
        // hello.*world should extract "hello" as prefix
        let lits = get_literals("hello.*world");
        assert_eq!(lits.prefixes.len(), 1);
        assert_eq!(lits.prefixes[0], b"hello");
    }

    #[test]
    fn test_literal_then_class() {
        // hello[0-9]+ should extract "hello" as prefix
        let lits = get_literals("hello[0-9]+");
        assert_eq!(lits.prefixes.len(), 1);
        assert_eq!(lits.prefixes[0], b"hello");
    }
}