regexr/literal/extractor.rs
1//! Literal prefix/suffix extraction.
2//!
3//! Extracts literal prefixes and suffixes from HIR patterns for prefiltering.
4//! Supports both single-literal and multi-literal extraction for Teddy.
5
6use crate::hir::{Hir, HirExpr};
7
8/// Extracted literals from a pattern.
9#[derive(Debug, Clone, Default)]
10pub struct Literals {
11 /// Required prefix literals.
12 /// For alternations like `hello|world`, contains `[b"hello", b"world"]`.
13 /// For concatenations like `hello.*`, contains `[b"hello"]`.
14 pub prefixes: Vec<Vec<u8>>,
15 /// Required suffix literals.
16 pub suffixes: Vec<Vec<u8>>,
17 /// Whether the prefix set is complete (all match positions start with one of these).
18 pub prefix_complete: bool,
19 /// True if the pattern starts with a digit class (0-9).
20 /// Used to create StartsWithDigit prefilter when no literal prefix exists.
21 pub starts_with_digit: bool,
22}
23
24impl Literals {
25 /// Returns true if there are no literals.
26 pub fn is_empty(&self) -> bool {
27 self.prefixes.is_empty() && self.suffixes.is_empty()
28 }
29
30 /// Returns the single prefix if there's exactly one.
31 pub fn single_prefix(&self) -> Option<&[u8]> {
32 if self.prefixes.len() == 1 {
33 Some(&self.prefixes[0])
34 } else {
35 None
36 }
37 }
38
39 /// Returns true if there are multiple prefixes (suitable for Teddy).
40 pub fn has_multiple_prefixes(&self) -> bool {
41 self.prefixes.len() > 1
42 }
43
44 /// Returns the number of prefix alternatives.
45 pub fn prefix_count(&self) -> usize {
46 self.prefixes.len()
47 }
48}
49
50/// Extracts literals from an HIR.
51pub fn extract_literals(hir: &Hir) -> Literals {
52 let mut extractor = LiteralExtractor::new();
53 let result = extractor.extract(&hir.expr);
54
55 // Patterns with backreferences, lookarounds, or word boundaries cannot be
56 // fully matched by literals alone - they require NFA verification.
57 // Set prefix_complete = false to prevent TeddyFull from bypassing NFA.
58 let prefix_complete = result.complete
59 && !hir.props.has_backrefs
60 && !hir.props.has_lookaround
61 && !hir.props.has_word_boundary;
62
63 // If no prefix literals found, check if pattern starts with digit class
64 let starts_with_digit = result.prefixes.is_empty() && starts_with_digit_class(&hir.expr);
65
66 Literals {
67 prefixes: result.prefixes,
68 suffixes: vec![],
69 prefix_complete,
70 starts_with_digit,
71 }
72}
73
74/// Checks if an HIR expression starts with a pure digit character class.
75/// Returns true only if the class exclusively matches digits (0-9), not if it
76/// merely includes digits among other characters (like \w which includes letters).
77fn starts_with_digit_class(expr: &HirExpr) -> bool {
78 match expr {
79 HirExpr::Class(class) => {
80 // Only return true if ALL ranges are within the digit range 0-9.
81 // This ensures we don't match \w which includes [A-Za-z0-9_].
82 !class.ranges.is_empty()
83 && class
84 .ranges
85 .iter()
86 .all(|(lo, hi)| *lo >= b'0' && *hi <= b'9')
87 }
88 HirExpr::Concat(exprs) => {
89 // Skip anchors and find first non-anchor
90 for e in exprs {
91 if matches!(e, HirExpr::Anchor(_)) {
92 continue;
93 }
94 return starts_with_digit_class(e);
95 }
96 false
97 }
98 HirExpr::Repeat(rep) => {
99 // Repeat of digits still starts with digit
100 if rep.min > 0 {
101 starts_with_digit_class(&rep.expr)
102 } else {
103 false
104 }
105 }
106 HirExpr::Capture(cap) => starts_with_digit_class(&cap.expr),
107 _ => false,
108 }
109}
110
111/// Result of extracting prefixes from an expression.
112#[derive(Debug, Clone, Default)]
113struct ExtractionResult {
114 /// The extracted prefixes.
115 prefixes: Vec<Vec<u8>>,
116 /// Whether the extraction is complete (all branches have literals).
117 complete: bool,
118 /// Whether the expression has a nullable suffix (e.g., ends with `?`, `*`).
119 /// If true, we cannot safely extend with subsequent literals.
120 has_nullable_suffix: bool,
121}
122
123struct LiteralExtractor {
124 /// Maximum number of prefixes to extract (for Teddy limit).
125 max_prefixes: usize,
126 /// Maximum length of each prefix.
127 max_prefix_len: usize,
128}
129
130impl LiteralExtractor {
131 fn new() -> Self {
132 Self {
133 max_prefixes: 8, // Teddy limit
134 max_prefix_len: 8, // Teddy limit
135 }
136 }
137
138 fn extract(&mut self, expr: &HirExpr) -> ExtractionResult {
139 match expr {
140 HirExpr::Literal(bytes) => {
141 // Truncate to max length
142 let truncated = bytes.len() > self.max_prefix_len;
143 let prefix = if truncated {
144 bytes[..self.max_prefix_len].to_vec()
145 } else {
146 bytes.clone()
147 };
148 ExtractionResult {
149 prefixes: vec![prefix],
150 // Only complete if we didn't truncate - truncated prefixes
151 // cannot provide full match bounds
152 complete: !truncated,
153 has_nullable_suffix: false,
154 }
155 }
156 HirExpr::Concat(exprs) => {
157 // Extract from the first non-anchor element, extend with subsequent literals.
158 // Anchors (including word boundaries) are zero-width and should be skipped
159 // during literal extraction. For example, `\bthe\b` should extract "the".
160 if exprs.is_empty() {
161 return ExtractionResult::default();
162 }
163
164 // Skip leading anchors to find the first literal-producing expression
165 let mut start_idx = 0;
166 while start_idx < exprs.len() && matches!(&exprs[start_idx], HirExpr::Anchor(_)) {
167 start_idx += 1;
168 }
169
170 if start_idx >= exprs.len() {
171 // All anchors, no literals
172 return ExtractionResult::default();
173 }
174
175 let mut result = self.extract(&exprs[start_idx]);
176
177 // Track whether we've seen all literals so far
178 let mut all_literals_so_far = matches!(&exprs[start_idx], HirExpr::Literal(_));
179
180 // Only extend prefixes with subsequent literals if there's no
181 // nullable suffix. A nullable suffix means the prefix might not
182 // consume all of what we extracted.
183 if !result.has_nullable_suffix {
184 // Try to extend prefixes with subsequent literals
185 for expr in &exprs[start_idx + 1..] {
186 // Skip anchors (zero-width, don't affect literals)
187 if matches!(expr, HirExpr::Anchor(_)) {
188 continue;
189 }
190 if let HirExpr::Literal(bytes) = expr {
191 // Extend each prefix (up to max length)
192 for prefix in &mut result.prefixes {
193 let remaining = self.max_prefix_len.saturating_sub(prefix.len());
194 if remaining > 0 {
195 let extend_len = bytes.len().min(remaining);
196 prefix.extend_from_slice(&bytes[..extend_len]);
197 // If we couldn't fit all bytes, mark incomplete
198 if extend_len < bytes.len() {
199 result.complete = false;
200 }
201 } else {
202 // No room to extend - subsequent literal was skipped
203 result.complete = false;
204 }
205 }
206 } else {
207 // Stop extending if we hit a non-literal.
208 // The prefix is no longer complete since there's
209 // a non-literal suffix that must also match.
210 all_literals_so_far = false;
211 result.complete = false;
212 // Check if this expression has a nullable suffix
213 let sub = self.extract(expr);
214 if sub.has_nullable_suffix {
215 result.has_nullable_suffix = true;
216 }
217 break;
218 }
219 }
220 } else {
221 // If first element has nullable suffix, check if there are
222 // subsequent elements - if so, prefix isn't complete
223 if start_idx + 1 < exprs.len() {
224 result.complete = false;
225 }
226 }
227
228 // Check if the concat ends with a nullable expression
229 // Also, if the last element is not a literal or anchor, the prefix isn't complete
230 // (Anchors are zero-width and don't affect completeness)
231 if let Some(last) = exprs.last() {
232 // Skip trailing anchors to find the actual last element
233 let actual_last = exprs
234 .iter()
235 .rev()
236 .find(|e| !matches!(e, HirExpr::Anchor(_)))
237 .unwrap_or(last);
238
239 let last_result = self.extract(actual_last);
240 if last_result.has_nullable_suffix {
241 result.has_nullable_suffix = true;
242 }
243 // If the last expression is not a complete literal, mark as incomplete
244 if !last_result.complete || !matches!(actual_last, HirExpr::Literal(_)) {
245 // Only if we haven't already extended through all literals
246 if !all_literals_so_far {
247 result.complete = false;
248 }
249 }
250 }
251
252 result
253 }
254 HirExpr::Alt(exprs) => {
255 // Collect prefixes from all branches
256 let mut all_prefixes: Vec<Vec<u8>> = Vec::new();
257 let mut all_complete = true;
258 let mut any_nullable_suffix = false;
259
260 for expr in exprs {
261 let sub_result = self.extract(expr);
262
263 if sub_result.prefixes.is_empty() {
264 // One branch has no prefix - can't use multi-prefix
265 // Try to find common prefix instead
266 return self.extract_common_prefix(exprs);
267 }
268
269 all_complete = all_complete && sub_result.complete;
270 any_nullable_suffix = any_nullable_suffix || sub_result.has_nullable_suffix;
271 all_prefixes.extend(sub_result.prefixes);
272
273 // Check if we've exceeded the limit
274 if all_prefixes.len() > self.max_prefixes {
275 // Too many prefixes - fall back to common prefix
276 return self.extract_common_prefix(exprs);
277 }
278 }
279
280 // Deduplicate prefixes
281 all_prefixes.sort();
282 all_prefixes.dedup();
283
284 ExtractionResult {
285 prefixes: all_prefixes,
286 complete: all_complete,
287 has_nullable_suffix: any_nullable_suffix,
288 }
289 }
290 HirExpr::Repeat(rep) => {
291 if rep.min > 0 {
292 // Required repetition - extract inner prefix
293 // But the expression has a nullable suffix since repetition
294 // can match more or less than what's required
295 let mut result = self.extract(&rep.expr);
296 // Even with min > 0, repetition can match variable amounts,
297 // which means it has a "nullable suffix" in the sense that
298 // subsequent literals might not directly follow the required part.
299 // For example, a+b can match "ab" or "aab", so we shouldn't
300 // extend the prefix "a" with "b".
301 result.has_nullable_suffix = true;
302 result
303 } else {
304 // Zero-or-more means no required prefix
305 ExtractionResult {
306 has_nullable_suffix: true,
307 ..Default::default()
308 }
309 }
310 }
311 HirExpr::Capture(cap) => self.extract(&cap.expr),
312 HirExpr::Class(_) => {
313 // Can't extract literals from character classes
314 ExtractionResult::default()
315 }
316 _ => ExtractionResult::default(),
317 }
318 }
319
320 /// Extracts the common prefix from alternation branches.
321 fn extract_common_prefix(&mut self, exprs: &[HirExpr]) -> ExtractionResult {
322 let mut all_prefixes: Vec<Vec<u8>> = Vec::new();
323
324 for expr in exprs {
325 let sub_result = self.extract(expr);
326 if sub_result.prefixes.is_empty() {
327 return ExtractionResult::default();
328 }
329 all_prefixes.extend(sub_result.prefixes);
330 }
331
332 if let Some(common) = find_common_prefix(&all_prefixes) {
333 if !common.is_empty() {
334 return ExtractionResult {
335 prefixes: vec![common],
336 complete: false, // Common prefix isn't complete
337 has_nullable_suffix: false,
338 };
339 }
340 }
341
342 ExtractionResult::default()
343 }
344}
345
346/// Finds the common prefix among a set of byte sequences.
347fn find_common_prefix(seqs: &[Vec<u8>]) -> Option<Vec<u8>> {
348 if seqs.is_empty() {
349 return None;
350 }
351
352 let first = &seqs[0];
353 let mut prefix_len = first.len();
354
355 for seq in &seqs[1..] {
356 let common_len = first
357 .iter()
358 .zip(seq.iter())
359 .take_while(|(a, b)| a == b)
360 .count();
361 prefix_len = prefix_len.min(common_len);
362 }
363
364 if prefix_len == 0 {
365 None
366 } else {
367 Some(first[..prefix_len].to_vec())
368 }
369}
370
371#[cfg(test)]
372mod tests {
373 use super::*;
374 use crate::hir::translate;
375 use crate::parser::parse;
376
377 fn get_literals(pattern: &str) -> Literals {
378 let ast = parse(pattern).unwrap();
379 let hir = translate(&ast).unwrap();
380 extract_literals(&hir)
381 }
382
383 #[test]
384 fn test_simple_literal() {
385 let lits = get_literals("hello");
386 assert_eq!(lits.prefixes.len(), 1);
387 assert_eq!(lits.prefixes[0], b"hello");
388 assert!(lits.prefix_complete);
389 }
390
391 #[test]
392 fn test_long_literal_truncated() {
393 let lits = get_literals("helloworld123");
394 assert_eq!(lits.prefixes.len(), 1);
395 assert_eq!(lits.prefixes[0], b"hellowor"); // Truncated to 8 bytes
396 // Truncated literals cannot be "complete" - they're only prefixes
397 assert!(!lits.prefix_complete);
398 }
399
400 #[test]
401 fn test_no_prefix() {
402 let lits = get_literals(".*hello");
403 assert!(lits.prefixes.is_empty());
404 }
405
406 #[test]
407 fn test_alternation_multi_prefix() {
408 // Different prefixes - should extract both for Teddy
409 let lits = get_literals("hello|world");
410 assert_eq!(lits.prefixes.len(), 2);
411 assert!(lits.prefixes.contains(&b"hello".to_vec()));
412 assert!(lits.prefixes.contains(&b"world".to_vec()));
413 }
414
415 #[test]
416 fn test_alternation_common_prefix() {
417 // Same prefix - should extract common prefix
418 let lits = get_literals("hello|help");
419 // Both start with "hel", so we get both as separate prefixes
420 assert_eq!(lits.prefixes.len(), 2);
421 assert!(lits.prefixes.contains(&b"hello".to_vec()));
422 assert!(lits.prefixes.contains(&b"help".to_vec()));
423 }
424
425 #[test]
426 fn test_concat_extends_prefix() {
427 let lits = get_literals("ab");
428 assert_eq!(lits.prefixes.len(), 1);
429 assert_eq!(lits.prefixes[0], b"ab");
430 }
431
432 #[test]
433 fn test_class_no_prefix() {
434 let lits = get_literals("[abc]hello");
435 assert!(lits.prefixes.is_empty());
436 }
437
438 #[test]
439 fn test_repeat_one_or_more() {
440 let lits = get_literals("a+b");
441 assert_eq!(lits.prefixes.len(), 1);
442 // a+ means at least one 'a', but we cannot extend with 'b' because
443 // the match could be "aab" or "aaab" - the prefix is just "a"
444 assert_eq!(lits.prefixes[0], b"a");
445 }
446
447 #[test]
448 fn test_repeat_zero_or_more_no_prefix() {
449 let lits = get_literals("a*b");
450 assert!(lits.prefixes.is_empty());
451 }
452
453 #[test]
454 fn test_too_many_alternations() {
455 // More than 8 alternations - falls back to common prefix (none in this case)
456 let lits = get_literals("a|b|c|d|e|f|g|h|i|j");
457 // Since there's no common prefix among a,b,c..., should be empty
458 assert!(lits.prefixes.is_empty());
459 }
460
461 #[test]
462 fn test_nested_alternation() {
463 let lits = get_literals("(cat|dog)food");
464 assert_eq!(lits.prefixes.len(), 2);
465 assert!(lits.prefixes.contains(&b"catfood".to_vec()));
466 assert!(lits.prefixes.contains(&b"dogfood".to_vec()));
467 }
468
469 #[test]
470 fn test_literal_then_star() {
471 // hello.*world should extract "hello" as prefix
472 let lits = get_literals("hello.*world");
473 assert_eq!(lits.prefixes.len(), 1);
474 assert_eq!(lits.prefixes[0], b"hello");
475 }
476
477 #[test]
478 fn test_literal_then_class() {
479 // hello[0-9]+ should extract "hello" as prefix
480 let lits = get_literals("hello[0-9]+");
481 assert_eq!(lits.prefixes.len(), 1);
482 assert_eq!(lits.prefixes[0], b"hello");
483 }
484}