regexr/literal/
extractor.rs

1//! Literal prefix/suffix extraction.
2//!
3//! Extracts literal prefixes and suffixes from HIR patterns for prefiltering.
4//! Supports both single-literal and multi-literal extraction for Teddy.
5
6use crate::hir::{Hir, HirExpr};
7
8/// Extracted literals from a pattern.
9#[derive(Debug, Clone, Default)]
10pub struct Literals {
11    /// Required prefix literals.
12    /// For alternations like `hello|world`, contains `[b"hello", b"world"]`.
13    /// For concatenations like `hello.*`, contains `[b"hello"]`.
14    pub prefixes: Vec<Vec<u8>>,
15    /// Required suffix literals.
16    pub suffixes: Vec<Vec<u8>>,
17    /// Whether the prefix set is complete (all match positions start with one of these).
18    pub prefix_complete: bool,
19    /// True if the pattern starts with a digit class (0-9).
20    /// Used to create StartsWithDigit prefilter when no literal prefix exists.
21    pub starts_with_digit: bool,
22}
23
24impl Literals {
25    /// Returns true if there are no literals.
26    pub fn is_empty(&self) -> bool {
27        self.prefixes.is_empty() && self.suffixes.is_empty()
28    }
29
30    /// Returns the single prefix if there's exactly one.
31    pub fn single_prefix(&self) -> Option<&[u8]> {
32        if self.prefixes.len() == 1 {
33            Some(&self.prefixes[0])
34        } else {
35            None
36        }
37    }
38
39    /// Returns true if there are multiple prefixes (suitable for Teddy).
40    pub fn has_multiple_prefixes(&self) -> bool {
41        self.prefixes.len() > 1
42    }
43
44    /// Returns the number of prefix alternatives.
45    pub fn prefix_count(&self) -> usize {
46        self.prefixes.len()
47    }
48}
49
50/// Extracts literals from an HIR.
51pub fn extract_literals(hir: &Hir) -> Literals {
52    let mut extractor = LiteralExtractor::new();
53    let result = extractor.extract(&hir.expr);
54
55    // Patterns with backreferences, lookarounds, or word boundaries cannot be
56    // fully matched by literals alone - they require NFA verification.
57    // Set prefix_complete = false to prevent TeddyFull from bypassing NFA.
58    let prefix_complete = result.complete
59        && !hir.props.has_backrefs
60        && !hir.props.has_lookaround
61        && !hir.props.has_word_boundary;
62
63    // If no prefix literals found, check if pattern starts with digit class
64    let starts_with_digit = result.prefixes.is_empty() && starts_with_digit_class(&hir.expr);
65
66    Literals {
67        prefixes: result.prefixes,
68        suffixes: vec![],
69        prefix_complete,
70        starts_with_digit,
71    }
72}
73
74/// Checks if an HIR expression starts with a pure digit character class.
75/// Returns true only if the class exclusively matches digits (0-9), not if it
76/// merely includes digits among other characters (like \w which includes letters).
77fn starts_with_digit_class(expr: &HirExpr) -> bool {
78    match expr {
79        HirExpr::Class(class) => {
80            // Only return true if ALL ranges are within the digit range 0-9.
81            // This ensures we don't match \w which includes [A-Za-z0-9_].
82            !class.ranges.is_empty()
83                && class
84                    .ranges
85                    .iter()
86                    .all(|(lo, hi)| *lo >= b'0' && *hi <= b'9')
87        }
88        HirExpr::Concat(exprs) => {
89            // Skip anchors and find first non-anchor
90            for e in exprs {
91                if matches!(e, HirExpr::Anchor(_)) {
92                    continue;
93                }
94                return starts_with_digit_class(e);
95            }
96            false
97        }
98        HirExpr::Repeat(rep) => {
99            // Repeat of digits still starts with digit
100            if rep.min > 0 {
101                starts_with_digit_class(&rep.expr)
102            } else {
103                false
104            }
105        }
106        HirExpr::Capture(cap) => starts_with_digit_class(&cap.expr),
107        _ => false,
108    }
109}
110
111/// Result of extracting prefixes from an expression.
112#[derive(Debug, Clone, Default)]
113struct ExtractionResult {
114    /// The extracted prefixes.
115    prefixes: Vec<Vec<u8>>,
116    /// Whether the extraction is complete (all branches have literals).
117    complete: bool,
118    /// Whether the expression has a nullable suffix (e.g., ends with `?`, `*`).
119    /// If true, we cannot safely extend with subsequent literals.
120    has_nullable_suffix: bool,
121}
122
123struct LiteralExtractor {
124    /// Maximum number of prefixes to extract (for Teddy limit).
125    max_prefixes: usize,
126    /// Maximum length of each prefix.
127    max_prefix_len: usize,
128}
129
130impl LiteralExtractor {
131    fn new() -> Self {
132        Self {
133            max_prefixes: 8,   // Teddy limit
134            max_prefix_len: 8, // Teddy limit
135        }
136    }
137
138    fn extract(&mut self, expr: &HirExpr) -> ExtractionResult {
139        match expr {
140            HirExpr::Literal(bytes) => {
141                // Truncate to max length
142                let truncated = bytes.len() > self.max_prefix_len;
143                let prefix = if truncated {
144                    bytes[..self.max_prefix_len].to_vec()
145                } else {
146                    bytes.clone()
147                };
148                ExtractionResult {
149                    prefixes: vec![prefix],
150                    // Only complete if we didn't truncate - truncated prefixes
151                    // cannot provide full match bounds
152                    complete: !truncated,
153                    has_nullable_suffix: false,
154                }
155            }
156            HirExpr::Concat(exprs) => {
157                // Extract from the first non-anchor element, extend with subsequent literals.
158                // Anchors (including word boundaries) are zero-width and should be skipped
159                // during literal extraction. For example, `\bthe\b` should extract "the".
160                if exprs.is_empty() {
161                    return ExtractionResult::default();
162                }
163
164                // Skip leading anchors to find the first literal-producing expression
165                let mut start_idx = 0;
166                while start_idx < exprs.len() && matches!(&exprs[start_idx], HirExpr::Anchor(_)) {
167                    start_idx += 1;
168                }
169
170                if start_idx >= exprs.len() {
171                    // All anchors, no literals
172                    return ExtractionResult::default();
173                }
174
175                let mut result = self.extract(&exprs[start_idx]);
176
177                // Track whether we've seen all literals so far
178                let mut all_literals_so_far = matches!(&exprs[start_idx], HirExpr::Literal(_));
179
180                // Only extend prefixes with subsequent literals if there's no
181                // nullable suffix. A nullable suffix means the prefix might not
182                // consume all of what we extracted.
183                if !result.has_nullable_suffix {
184                    // Try to extend prefixes with subsequent literals
185                    for expr in &exprs[start_idx + 1..] {
186                        // Skip anchors (zero-width, don't affect literals)
187                        if matches!(expr, HirExpr::Anchor(_)) {
188                            continue;
189                        }
190                        if let HirExpr::Literal(bytes) = expr {
191                            // Extend each prefix (up to max length)
192                            for prefix in &mut result.prefixes {
193                                let remaining = self.max_prefix_len.saturating_sub(prefix.len());
194                                if remaining > 0 {
195                                    let extend_len = bytes.len().min(remaining);
196                                    prefix.extend_from_slice(&bytes[..extend_len]);
197                                    // If we couldn't fit all bytes, mark incomplete
198                                    if extend_len < bytes.len() {
199                                        result.complete = false;
200                                    }
201                                } else {
202                                    // No room to extend - subsequent literal was skipped
203                                    result.complete = false;
204                                }
205                            }
206                        } else {
207                            // Stop extending if we hit a non-literal.
208                            // The prefix is no longer complete since there's
209                            // a non-literal suffix that must also match.
210                            all_literals_so_far = false;
211                            result.complete = false;
212                            // Check if this expression has a nullable suffix
213                            let sub = self.extract(expr);
214                            if sub.has_nullable_suffix {
215                                result.has_nullable_suffix = true;
216                            }
217                            break;
218                        }
219                    }
220                } else {
221                    // If first element has nullable suffix, check if there are
222                    // subsequent elements - if so, prefix isn't complete
223                    if start_idx + 1 < exprs.len() {
224                        result.complete = false;
225                    }
226                }
227
228                // Check if the concat ends with a nullable expression
229                // Also, if the last element is not a literal or anchor, the prefix isn't complete
230                // (Anchors are zero-width and don't affect completeness)
231                if let Some(last) = exprs.last() {
232                    // Skip trailing anchors to find the actual last element
233                    let actual_last = exprs
234                        .iter()
235                        .rev()
236                        .find(|e| !matches!(e, HirExpr::Anchor(_)))
237                        .unwrap_or(last);
238
239                    let last_result = self.extract(actual_last);
240                    if last_result.has_nullable_suffix {
241                        result.has_nullable_suffix = true;
242                    }
243                    // If the last expression is not a complete literal, mark as incomplete
244                    if !last_result.complete || !matches!(actual_last, HirExpr::Literal(_)) {
245                        // Only if we haven't already extended through all literals
246                        if !all_literals_so_far {
247                            result.complete = false;
248                        }
249                    }
250                }
251
252                result
253            }
254            HirExpr::Alt(exprs) => {
255                // Collect prefixes from all branches
256                let mut all_prefixes: Vec<Vec<u8>> = Vec::new();
257                let mut all_complete = true;
258                let mut any_nullable_suffix = false;
259
260                for expr in exprs {
261                    let sub_result = self.extract(expr);
262
263                    if sub_result.prefixes.is_empty() {
264                        // One branch has no prefix - can't use multi-prefix
265                        // Try to find common prefix instead
266                        return self.extract_common_prefix(exprs);
267                    }
268
269                    all_complete = all_complete && sub_result.complete;
270                    any_nullable_suffix = any_nullable_suffix || sub_result.has_nullable_suffix;
271                    all_prefixes.extend(sub_result.prefixes);
272
273                    // Check if we've exceeded the limit
274                    if all_prefixes.len() > self.max_prefixes {
275                        // Too many prefixes - fall back to common prefix
276                        return self.extract_common_prefix(exprs);
277                    }
278                }
279
280                // Deduplicate prefixes
281                all_prefixes.sort();
282                all_prefixes.dedup();
283
284                ExtractionResult {
285                    prefixes: all_prefixes,
286                    complete: all_complete,
287                    has_nullable_suffix: any_nullable_suffix,
288                }
289            }
290            HirExpr::Repeat(rep) => {
291                if rep.min > 0 {
292                    // Required repetition - extract inner prefix
293                    // But the expression has a nullable suffix since repetition
294                    // can match more or less than what's required
295                    let mut result = self.extract(&rep.expr);
296                    // Even with min > 0, repetition can match variable amounts,
297                    // which means it has a "nullable suffix" in the sense that
298                    // subsequent literals might not directly follow the required part.
299                    // For example, a+b can match "ab" or "aab", so we shouldn't
300                    // extend the prefix "a" with "b".
301                    result.has_nullable_suffix = true;
302                    result
303                } else {
304                    // Zero-or-more means no required prefix
305                    ExtractionResult {
306                        has_nullable_suffix: true,
307                        ..Default::default()
308                    }
309                }
310            }
311            HirExpr::Capture(cap) => self.extract(&cap.expr),
312            HirExpr::Class(_) => {
313                // Can't extract literals from character classes
314                ExtractionResult::default()
315            }
316            _ => ExtractionResult::default(),
317        }
318    }
319
320    /// Extracts the common prefix from alternation branches.
321    fn extract_common_prefix(&mut self, exprs: &[HirExpr]) -> ExtractionResult {
322        let mut all_prefixes: Vec<Vec<u8>> = Vec::new();
323
324        for expr in exprs {
325            let sub_result = self.extract(expr);
326            if sub_result.prefixes.is_empty() {
327                return ExtractionResult::default();
328            }
329            all_prefixes.extend(sub_result.prefixes);
330        }
331
332        if let Some(common) = find_common_prefix(&all_prefixes) {
333            if !common.is_empty() {
334                return ExtractionResult {
335                    prefixes: vec![common],
336                    complete: false, // Common prefix isn't complete
337                    has_nullable_suffix: false,
338                };
339            }
340        }
341
342        ExtractionResult::default()
343    }
344}
345
346/// Finds the common prefix among a set of byte sequences.
347fn find_common_prefix(seqs: &[Vec<u8>]) -> Option<Vec<u8>> {
348    if seqs.is_empty() {
349        return None;
350    }
351
352    let first = &seqs[0];
353    let mut prefix_len = first.len();
354
355    for seq in &seqs[1..] {
356        let common_len = first
357            .iter()
358            .zip(seq.iter())
359            .take_while(|(a, b)| a == b)
360            .count();
361        prefix_len = prefix_len.min(common_len);
362    }
363
364    if prefix_len == 0 {
365        None
366    } else {
367        Some(first[..prefix_len].to_vec())
368    }
369}
370
371#[cfg(test)]
372mod tests {
373    use super::*;
374    use crate::hir::translate;
375    use crate::parser::parse;
376
377    fn get_literals(pattern: &str) -> Literals {
378        let ast = parse(pattern).unwrap();
379        let hir = translate(&ast).unwrap();
380        extract_literals(&hir)
381    }
382
383    #[test]
384    fn test_simple_literal() {
385        let lits = get_literals("hello");
386        assert_eq!(lits.prefixes.len(), 1);
387        assert_eq!(lits.prefixes[0], b"hello");
388        assert!(lits.prefix_complete);
389    }
390
391    #[test]
392    fn test_long_literal_truncated() {
393        let lits = get_literals("helloworld123");
394        assert_eq!(lits.prefixes.len(), 1);
395        assert_eq!(lits.prefixes[0], b"hellowor"); // Truncated to 8 bytes
396                                                   // Truncated literals cannot be "complete" - they're only prefixes
397        assert!(!lits.prefix_complete);
398    }
399
400    #[test]
401    fn test_no_prefix() {
402        let lits = get_literals(".*hello");
403        assert!(lits.prefixes.is_empty());
404    }
405
406    #[test]
407    fn test_alternation_multi_prefix() {
408        // Different prefixes - should extract both for Teddy
409        let lits = get_literals("hello|world");
410        assert_eq!(lits.prefixes.len(), 2);
411        assert!(lits.prefixes.contains(&b"hello".to_vec()));
412        assert!(lits.prefixes.contains(&b"world".to_vec()));
413    }
414
415    #[test]
416    fn test_alternation_common_prefix() {
417        // Same prefix - should extract common prefix
418        let lits = get_literals("hello|help");
419        // Both start with "hel", so we get both as separate prefixes
420        assert_eq!(lits.prefixes.len(), 2);
421        assert!(lits.prefixes.contains(&b"hello".to_vec()));
422        assert!(lits.prefixes.contains(&b"help".to_vec()));
423    }
424
425    #[test]
426    fn test_concat_extends_prefix() {
427        let lits = get_literals("ab");
428        assert_eq!(lits.prefixes.len(), 1);
429        assert_eq!(lits.prefixes[0], b"ab");
430    }
431
432    #[test]
433    fn test_class_no_prefix() {
434        let lits = get_literals("[abc]hello");
435        assert!(lits.prefixes.is_empty());
436    }
437
438    #[test]
439    fn test_repeat_one_or_more() {
440        let lits = get_literals("a+b");
441        assert_eq!(lits.prefixes.len(), 1);
442        // a+ means at least one 'a', but we cannot extend with 'b' because
443        // the match could be "aab" or "aaab" - the prefix is just "a"
444        assert_eq!(lits.prefixes[0], b"a");
445    }
446
447    #[test]
448    fn test_repeat_zero_or_more_no_prefix() {
449        let lits = get_literals("a*b");
450        assert!(lits.prefixes.is_empty());
451    }
452
453    #[test]
454    fn test_too_many_alternations() {
455        // More than 8 alternations - falls back to common prefix (none in this case)
456        let lits = get_literals("a|b|c|d|e|f|g|h|i|j");
457        // Since there's no common prefix among a,b,c..., should be empty
458        assert!(lits.prefixes.is_empty());
459    }
460
461    #[test]
462    fn test_nested_alternation() {
463        let lits = get_literals("(cat|dog)food");
464        assert_eq!(lits.prefixes.len(), 2);
465        assert!(lits.prefixes.contains(&b"catfood".to_vec()));
466        assert!(lits.prefixes.contains(&b"dogfood".to_vec()));
467    }
468
469    #[test]
470    fn test_literal_then_star() {
471        // hello.*world should extract "hello" as prefix
472        let lits = get_literals("hello.*world");
473        assert_eq!(lits.prefixes.len(), 1);
474        assert_eq!(lits.prefixes[0], b"hello");
475    }
476
477    #[test]
478    fn test_literal_then_class() {
479        // hello[0-9]+ should extract "hello" as prefix
480        let lits = get_literals("hello[0-9]+");
481        assert_eq!(lits.prefixes.len(), 1);
482        assert_eq!(lits.prefixes[0], b"hello");
483    }
484}