1use alloc::boxed::Box;
24use alloc::string::String;
25use alloc::vec::Vec;
26use core::cmp::min;
27
28use bit_set::BitSet;
29
30use crate::alloc::string::ToString;
31use crate::parse::ExprTree;
32use crate::vm::CaptureGroupRange;
33use crate::{CompileError, Error, Expr, Result};
34
35#[cfg(not(feature = "std"))]
36use alloc::collections::BTreeMap as Map;
37#[cfg(feature = "std")]
38use std::collections::HashMap as Map;
39
40#[derive(Debug)]
41pub struct Info<'a> {
42 pub(crate) capture_groups: CaptureGroupRange,
43 pub(crate) min_size: usize,
44 pub(crate) const_size: bool,
45 pub(crate) min_pos_in_group: usize,
48 pub(crate) hard: bool,
49 pub(crate) expr: &'a Expr,
50 pub(crate) children: Vec<Info<'a>>,
51}
52
53impl<'a> Info<'a> {
54 pub(crate) fn start_group(&self) -> usize {
56 self.capture_groups.start()
57 }
58
59 pub(crate) fn end_group(&self) -> usize {
61 self.capture_groups.end()
62 }
63
64 pub(crate) fn is_literal(&self) -> bool {
65 match *self.expr {
66 Expr::Literal { casei, .. } => !casei,
67 Expr::Concat(_) => self.children.iter().all(|child| child.is_literal()),
68 _ => false,
69 }
70 }
71
72 pub(crate) fn push_literal(&self, buf: &mut String) {
73 match *self.expr {
74 Expr::Literal { ref val, .. } => buf.push_str(val),
76 Expr::Concat(_) => {
77 for child in &self.children {
78 child.push_literal(buf);
79 }
80 }
81 _ => panic!("push_literal called on non-literal"),
82 }
83 }
84}
85
86struct SizeInfo {
87 min_size: usize,
88 const_size: bool,
89}
90
91struct Analyzer<'a> {
92 backrefs: &'a BitSet,
93 group_ix: usize,
94 group_info: Map<usize, SizeInfo>,
97}
98
99impl<'a> Analyzer<'a> {
100 fn visit(&mut self, expr: &'a Expr, min_pos_in_group: usize) -> Result<Info<'a>> {
101 let start_group = self.group_ix;
102 let mut children = Vec::new();
103 let mut min_size = 0;
104 let mut const_size = false;
105 let mut hard = false;
106 match *expr {
107 Expr::Assertion(assertion) if assertion.is_hard() => {
108 const_size = true;
109 hard = true;
110 }
111 Expr::Empty | Expr::Assertion(_) => {
112 const_size = true;
113 }
114 Expr::Any { .. } => {
115 min_size = 1;
116 const_size = true;
117 }
118 Expr::Literal { ref val, casei } => {
119 min_size = 1;
121 const_size = literal_const_size(val, casei);
122 }
123 Expr::Concat(ref v) => {
124 const_size = true;
125 let mut pos_in_group = min_pos_in_group;
126 for child in v {
127 let child_info = self.visit(child, pos_in_group)?;
128 min_size += child_info.min_size;
129 const_size &= child_info.const_size;
130 hard |= child_info.hard;
131 pos_in_group += child_info.min_size;
132 children.push(child_info);
133 }
134 }
135 Expr::Alt(ref v) => {
136 let child_info = self.visit(&v[0], min_pos_in_group)?;
137 min_size = child_info.min_size;
138 const_size = child_info.const_size;
139 hard = child_info.hard;
140 children.push(child_info);
141 for child in &v[1..] {
142 let child_info = self.visit(child, min_pos_in_group)?;
143 const_size &= child_info.const_size && min_size == child_info.min_size;
144 min_size = min(min_size, child_info.min_size);
145 hard |= child_info.hard;
146 children.push(child_info);
147 }
148 }
149 Expr::Group(ref child) => {
150 let group = self.group_ix;
151 self.group_ix += 1;
152 let child_info = self.visit(child, 0)?;
153 min_size = child_info.min_size;
154 const_size = child_info.const_size;
155 self.group_info.insert(
157 group,
158 SizeInfo {
159 min_size,
160 const_size,
161 },
162 );
163 hard = child_info.hard | self.backrefs.contains(group);
167 children.push(child_info);
168 }
169 Expr::LookAround(ref child, _) => {
170 let child_info = self.visit(child, min_pos_in_group)?;
172 const_size = true;
174 hard = true;
175 children.push(child_info);
176 }
177 Expr::Repeat {
178 ref child, lo, hi, ..
179 } => {
180 let child_info = self.visit(child, min_pos_in_group)?;
181 min_size = child_info.min_size * lo;
182 const_size = child_info.const_size && lo == hi;
183 hard = child_info.hard;
184 children.push(child_info);
185 }
186 Expr::Delegate { size, .. } => {
187 min_size = size;
189 const_size = true;
190 }
191 Expr::Backref { group, .. } => {
192 if group == 0 {
193 return Err(Error::CompileError(Box::new(CompileError::InvalidBackref(
194 group,
195 ))));
196 }
197 if let Some(&SizeInfo {
199 min_size: group_min_size,
200 const_size: group_const_size,
201 }) = self.group_info.get(&group)
202 {
203 min_size = group_min_size;
204 const_size = group_const_size;
205 }
206 hard = true;
207 }
208 Expr::AtomicGroup(ref child) => {
209 let child_info = self.visit(child, min_pos_in_group)?;
210 min_size = child_info.min_size;
211 const_size = child_info.const_size;
212 hard = true; children.push(child_info);
214 }
215 Expr::KeepOut => {
216 hard = true;
217 const_size = true;
218 }
219 Expr::ContinueFromPreviousMatchEnd => {
220 hard = true;
221 const_size = true;
222 }
223 Expr::BackrefExistsCondition(_) => {
224 hard = true;
225 const_size = true;
226 }
227 Expr::Conditional {
228 ref condition,
229 ref true_branch,
230 ref false_branch,
231 } => {
232 hard = true;
233
234 let child_info_condition = self.visit(condition, min_pos_in_group)?;
235 let child_info_truth = self.visit(
236 true_branch,
237 min_pos_in_group + child_info_condition.min_size,
238 )?;
239 let child_info_false = self.visit(false_branch, min_pos_in_group)?;
240
241 min_size = child_info_condition.min_size
242 + min(child_info_truth.min_size, child_info_false.min_size);
243 const_size = child_info_condition.const_size
244 && child_info_truth.const_size
245 && child_info_false.const_size
246 && child_info_condition.min_size + child_info_truth.min_size == child_info_false.min_size;
248
249 children.push(child_info_condition);
250 children.push(child_info_truth);
251 children.push(child_info_false);
252 }
253 Expr::SubroutineCall(_) => {
254 return Err(Error::CompileError(Box::new(
255 CompileError::FeatureNotYetSupported("Subroutine Call".to_string()),
256 )));
257 }
258 Expr::UnresolvedNamedSubroutineCall { ref name, ix } => {
259 return Err(Error::CompileError(Box::new(
260 CompileError::SubroutineCallTargetNotFound(name.to_string(), ix),
261 )));
262 }
263 Expr::BackrefWithRelativeRecursionLevel { .. } => {
264 return Err(Error::CompileError(Box::new(
265 CompileError::FeatureNotYetSupported("Backref at recursion level".to_string()),
266 )));
267 }
268 };
269
270 Ok(Info {
271 expr,
272 children,
273 capture_groups: CaptureGroupRange(start_group, self.group_ix),
274 min_size,
275 const_size,
276 hard,
277 min_pos_in_group,
278 })
279 }
280}
281
282fn literal_const_size(_: &str, _: bool) -> bool {
283 true
287}
288
289pub fn analyze<'a>(tree: &'a ExprTree, explicit_capture_group_0: bool) -> Result<Info<'a>> {
291 let start_group = if explicit_capture_group_0 { 0 } else { 1 };
292 let mut analyzer = Analyzer {
293 backrefs: &tree.backrefs,
294 group_ix: start_group,
295 group_info: Map::new(),
296 };
297
298 let analyzed = analyzer.visit(&tree.expr, 0);
299 if analyzer.backrefs.contains(0) {
300 return Err(Error::CompileError(Box::new(CompileError::InvalidBackref(
301 0,
302 ))));
303 }
304 if let Some(highest_backref) = analyzer.backrefs.into_iter().last() {
305 if highest_backref > analyzer.group_ix - start_group
306 || highest_backref == analyzer.group_ix && start_group == 0
311 {
312 return Err(Error::CompileError(Box::new(CompileError::InvalidBackref(
313 highest_backref,
314 ))));
315 }
316 }
317 analyzed
318}
319
320pub fn can_compile_as_anchored(root_expr: &Expr) -> bool {
324 use crate::Assertion;
325
326 match root_expr {
327 Expr::Concat(children) => match children[0] {
328 Expr::Assertion(assertion) => assertion == Assertion::StartText,
329 _ => false,
330 },
331 Expr::Assertion(assertion) => *assertion == Assertion::StartText,
332 _ => false,
333 }
334}
335
336#[cfg(test)]
337mod tests {
338 use super::analyze;
339 use crate::{can_compile_as_anchored, CompileError, Error, Expr};
341
342 #[test]
357 fn invalid_backref_zero() {
358 let tree = Expr::parse_tree(r".\0").unwrap();
359 let result = analyze(&tree, false);
360 assert!(matches!(
361 result.err(),
362 Some(Error::CompileError(ref box_err)) if matches!(**box_err, CompileError::InvalidBackref(0))
363 ));
364
365 let result = analyze(&tree, true);
366 assert!(matches!(
367 result.err(),
368 Some(Error::CompileError(ref box_err)) if matches!(**box_err, CompileError::InvalidBackref(0))
369 ));
370
371 let tree = Expr::parse_tree(r"(.)\0").unwrap();
372 let result = analyze(&tree, false);
373 assert!(matches!(
374 result.err(),
375 Some(Error::CompileError(ref box_err)) if matches!(**box_err, CompileError::InvalidBackref(0))
376 ));
377
378 let result = analyze(&tree, true);
379 assert!(matches!(
380 result.err(),
381 Some(Error::CompileError(ref box_err)) if matches!(**box_err, CompileError::InvalidBackref(0))
382 ));
383
384 let tree = Expr::parse_tree(r"(.)\0\1").unwrap();
385 let result = analyze(&tree, false);
386 assert!(matches!(
387 result.err(),
388 Some(Error::CompileError(ref box_err)) if matches!(**box_err, CompileError::InvalidBackref(0))
389 ));
390 }
391
392 #[test]
393 fn invalid_backref_no_captures() {
394 let tree = Expr::parse_tree(r"aa\1").unwrap();
395 let result = analyze(&tree, false);
396 assert!(matches!(
397 result.err(),
398 Some(Error::CompileError(ref box_err)) if matches!(**box_err, CompileError::InvalidBackref(1))
399 ));
400
401 let tree = Expr::parse_tree(r"aaaa\2").unwrap();
402 let result = analyze(&tree, false);
403 assert!(matches!(
404 result.err(),
405 Some(Error::CompileError(ref box_err)) if matches!(**box_err, CompileError::InvalidBackref(2))
406 ));
407 }
408
409 #[test]
410 fn invalid_backref_with_captures() {
411 let tree = Expr::parse_tree(r"a(a)\2").unwrap();
412 let result = analyze(&tree, false);
413 assert!(matches!(
414 result.err(),
415 Some(Error::CompileError(ref box_err)) if matches!(**box_err, CompileError::InvalidBackref(2))
416 ));
417
418 let tree = Expr::parse_tree(r"a(a)\2\1").unwrap();
419 let result = analyze(&tree, false);
420 assert!(matches!(
421 result.err(),
422 Some(Error::CompileError(ref box_err)) if matches!(**box_err, CompileError::InvalidBackref(2))
423 ));
424 }
425
426 #[test]
427 fn invalid_backref_with_captures_explict_capture_group_zero() {
428 let tree = Expr::parse_tree(r"(a(b)\2)c").unwrap();
429 let result = analyze(&tree, true);
430 assert!(matches!(
431 result.err(),
432 Some(Error::CompileError(ref box_err)) if matches!(**box_err, CompileError::InvalidBackref(2))
433 ));
434
435 let tree = Expr::parse_tree(r"(a(b)\1\2)c").unwrap();
436 let result = analyze(&tree, true);
437 assert!(matches!(
438 result.err(),
439 Some(Error::CompileError(ref box_err)) if matches!(**box_err, CompileError::InvalidBackref(2))
440 ));
441
442 let tree = Expr::parse_tree(r"(a\1)b").unwrap();
443 let result = analyze(&tree, true);
444 assert!(matches!(
445 result.err(),
446 Some(Error::CompileError(ref box_err)) if matches!(**box_err, CompileError::InvalidBackref(1))
447 ));
448
449 let tree = Expr::parse_tree(r"(a(b))\2").unwrap();
450 let result = analyze(&tree, true);
451 assert!(matches!(
452 result.err(),
453 Some(Error::CompileError(ref box_err)) if matches!(**box_err, CompileError::InvalidBackref(2))
454 ));
455 }
456
457 #[test]
458 fn allow_analysis_of_self_backref() {
459 assert!(!analyze(&Expr::parse_tree(r"(.\1)").unwrap(), false).is_err());
461 assert!(!analyze(&Expr::parse_tree(r"((.\1))").unwrap(), true).is_err());
462 assert!(!analyze(&Expr::parse_tree(r"(([ab]+)\1b)").unwrap(), false).is_err());
463 assert!(!analyze(&Expr::parse_tree(r"(([ab]+?)(?(1)\1| )c)+").unwrap(), false).is_err());
465 }
466
467 #[test]
468 fn allow_backref_even_when_capture_group_occurs_after_backref() {
469 assert!(!analyze(&Expr::parse_tree(r"\1(.)").unwrap(), false).is_err());
470 assert!(!analyze(&Expr::parse_tree(r"(\1(.))").unwrap(), true).is_err());
471 }
472
473 #[test]
474 fn valid_backref_occurs_after_capture_group() {
475 assert!(!analyze(&Expr::parse_tree(r"(.)\1").unwrap(), false).is_err());
476 assert!(!analyze(&Expr::parse_tree(r"((.)\1)").unwrap(), true).is_err());
477
478 assert!(!analyze(&Expr::parse_tree(r"((.)\2\2)\1").unwrap(), false).is_err());
479 assert!(!analyze(&Expr::parse_tree(r"(.)\1(.)\2").unwrap(), false).is_err());
480 assert!(!analyze(&Expr::parse_tree(r"(.)foo(.)\2").unwrap(), false).is_err());
481 assert!(!analyze(&Expr::parse_tree(r"(.)(foo)(.)\3\2\1").unwrap(), false).is_err());
482 assert!(!analyze(&Expr::parse_tree(r"(.)(foo)(.)\3\1").unwrap(), false).is_err());
483 assert!(!analyze(&Expr::parse_tree(r"(.)(foo)(.)\2\1").unwrap(), false).is_err());
484 }
485
486 #[test]
487 fn feature_not_yet_supported() {
488 let tree = &Expr::parse_tree(r"(a)\g<1>").unwrap();
489 let result = analyze(tree, false);
490 assert!(result.is_err());
491 assert!(matches!(
492 result.err(),
493 Some(Error::CompileError(ref box_err)) if matches!(**box_err, CompileError::FeatureNotYetSupported(_))
494 ));
495
496 let tree = &Expr::parse_tree(r"(a)\k<1-0>").unwrap();
497 let result = analyze(tree, false);
498 assert!(result.is_err());
499 assert!(matches!(
500 result.err(),
501 Some(Error::CompileError(ref box_err)) if matches!(**box_err, CompileError::FeatureNotYetSupported(_))
502 ));
503 }
504
505 #[test]
506 fn subroutine_call_undefined() {
507 let tree = &Expr::parse_tree(r"\g<wrong_name>(?<different_name>a)").unwrap();
508 let result = analyze(tree, false);
509 assert!(result.is_err());
510 assert!(matches!(
511 result.err(),
512 Some(Error::CompileError(ref box_err)) if matches!(**box_err, CompileError::SubroutineCallTargetNotFound(_, _))
513 ));
514 }
515
516 #[test]
517 fn is_literal() {
518 let tree = Expr::parse_tree("abc").unwrap();
519 let info = analyze(&tree, false).unwrap();
520 assert_eq!(info.is_literal(), true);
521 }
522
523 #[test]
524 fn is_literal_with_repeat() {
525 let tree = Expr::parse_tree("abc*").unwrap();
526 let info = analyze(&tree, false).unwrap();
527 assert_eq!(info.is_literal(), false);
528 }
529
530 #[test]
531 fn anchored_for_starttext_assertions() {
532 let tree = Expr::parse_tree(r"^(\w+)\1").unwrap();
533 assert_eq!(can_compile_as_anchored(&tree.expr), true);
534
535 let tree = Expr::parse_tree(r"^").unwrap();
536 assert_eq!(can_compile_as_anchored(&tree.expr), true);
537 }
538
539 #[test]
540 fn backref_inherits_group_size_info() {
541 let tree = Expr::parse_tree(r"(abc)\1").unwrap();
543 let info = analyze(&tree, false).unwrap();
544 assert_eq!(info.min_size, 6);
546 assert!(info.const_size);
547
548 let tree = Expr::parse_tree(r"(a+)\1").unwrap();
550 let info = analyze(&tree, false).unwrap();
551 assert_eq!(info.min_size, 2);
554 assert!(!info.const_size);
555
556 let tree = Expr::parse_tree(r"(a?)\1").unwrap();
558 let info = analyze(&tree, false).unwrap();
559 assert_eq!(info.min_size, 0);
561 assert!(!info.const_size);
562 }
563
564 #[test]
565 fn backref_forward_reference() {
566 let tree = Expr::parse_tree(r"\1(abc)").unwrap();
569 let info = analyze(&tree, false).unwrap();
570 assert_eq!(info.min_size, 3);
572 assert!(!info.const_size);
574 }
575
576 #[test]
577 fn backref_in_lookbehind() {
578 assert!(!analyze(&Expr::parse_tree(r"(hello)(?<=\b\1)").unwrap(), false).is_err());
579 assert!(!analyze(&Expr::parse_tree(r"(..)(?<=\1\1)").unwrap(), false).is_err());
580 assert!(!analyze(&Expr::parse_tree(r"(abc)(?<=\1)def").unwrap(), false).is_err());
581 }
582
583 #[test]
584 fn not_anchored_for_startline_assertions() {
585 let tree = Expr::parse_tree(r"(?m)^(\w+)\1").unwrap();
586 assert_eq!(can_compile_as_anchored(&tree.expr), false);
587 }
588
589 #[test]
590 fn min_pos_in_group_calculated_correctly_with_no_groups() {
591 let tree = Expr::parse_tree(r"\G").unwrap();
592 let info = analyze(&tree, false).unwrap();
593 assert_eq!(info.min_size, 0);
594 assert_eq!(info.min_pos_in_group, 0);
595 assert!(info.const_size);
596
597 let tree = Expr::parse_tree(r"\G(?=abc)\w+").unwrap();
598 let info = analyze(&tree, false).unwrap();
599 assert_eq!(info.children[1].min_size, 0);
601 assert!(info.children[1].const_size);
602 assert_eq!(info.children[1].children[0].min_size, 3);
604 assert!(info.children[1].children[0].const_size);
605 assert_eq!(info.children[2].min_pos_in_group, 0);
607 assert_eq!(info.children[2].min_size, 1);
608 assert_eq!(info.min_pos_in_group, 0);
609 assert!(!info.const_size);
610
611 let tree = Expr::parse_tree(r"(?:ab*|cd){2}(?=bar)\w").unwrap();
612 let info = analyze(&tree, false).unwrap();
613 assert_eq!(info.min_size, 3);
615 assert_eq!(info.children[1].min_pos_in_group, 2);
617 assert_eq!(info.children[2].min_pos_in_group, 2);
619 assert_eq!(info.children[2].min_size, 1);
620 assert!(!info.const_size);
621 }
622
623 #[test]
624 fn min_pos_in_group_calculated_correctly_with_capture_groups() {
625 use matches::assert_matches;
626
627 let tree = Expr::parse_tree(r"a(bc)d(e(f)g)").unwrap();
628 let info = analyze(&tree, false).unwrap();
629 assert_eq!(info.min_pos_in_group, 0);
630 assert_eq!(info.children[1].min_pos_in_group, 1);
632 assert_matches!(info.children[1].children[0].expr, Expr::Concat(_));
634 assert_eq!(info.children[1].children[0].min_pos_in_group, 0);
635 assert!(info.children[1].children[0].const_size);
636 assert_matches!(info.children[1].children[0].children[1].expr, Expr::Literal { val, casei: false } if val == "c");
638 assert_eq!(info.children[1].children[0].children[1].min_pos_in_group, 1);
639
640 assert_matches!(info.children[2].expr, Expr::Literal { val, casei: false } if val == "d");
642 assert_eq!(info.children[2].min_pos_in_group, 3);
643 assert_eq!(info.children[2].start_group(), 2);
644 assert_eq!(info.children[2].min_size, 1);
645
646 assert_matches!(info.children[3].children[0].children[0].expr, Expr::Literal { val, casei: false } if val == "e");
648 assert_eq!(info.children[3].children[0].children[0].min_pos_in_group, 0);
649 }
650}