smt_str/re/
build.rs

1//! The builder for regular expressions.
2
3use smallvec::smallvec;
4
5use crate::SmtChar;
6
7use super::*;
8use std::collections::{BTreeSet, HashMap};
9
10/// A builder for constructing regular expressions in the SMT-LIB string theory.
11///
12/// This is the only way to create [`Regex`] instances.
13/// The `ReBuilder` ensures structural sharing by deduplicates identical sub-expressions and  assigns globally unique identifiers to each node.
14/// An instance of `ReBuilder` is obtained by calling [`ReBuilder::default()`] or [`ReBuilder::non_optimizing()`].
15///
16/// Internally, the builder maintains a registry of expressions:
17/// if the same regular expression is constructed multiple times, it will return
18/// a shared, reference-counted pointer to the same [`ReNode`] instance.
19/// The builder automatically performs garbage collection on unused expressions to keep memory usage low.
20///
21/// For each SMT-LIB regex operation (e.g., `re.*`, `re.union`, `re.++`, `re.comp`, etc.),
22/// the builder provides a corresponding method to create the appropriate [`Regex`] node.
23///
24/// # Optimization
25/// By default, the builder performs lightweight optimizations during construction.
26/// For instance, it may simplify expressions like `(re.++ ε r)` to just `r`.
27/// This behavior can be disabled by constructing a builder using [ReBuilder::non_optimizing()],
28/// which suppresses all such optimizations.
29///
30/// # Using multiple builders
31/// Mixing regular expressions from different builders results in logical errors.
32/// It is recommended to use a single builder instance to construct all regular expressions in a program.
33///
34/// # Example
35/// ```
36/// use smt_str::re::*;
37/// use smallvec::smallvec;
38///
39/// let mut builder1 = ReBuilder::default();
40/// let mut builder2 = ReBuilder::default();
41///
42/// // Construct a regex using the first builder
43/// let r1 = builder1.to_re("a".into());
44///
45/// // Construct a regex using the second builder
46/// let r2 = builder2.to_re("b".into());
47///
48/// // The regexes are structurally different but the following assertion will hold
49/// assert_eq!(r1, r2);
50///
51/// // Construct compound regexes using regexes from different builder will also result in unexpected behavior
52/// let r3 = builder1.union(smallvec![r1.clone(), r2.clone()]);
53///
54/// // The following assertion will hold, although we wanted r3 to accept "a" and "b"
55/// assert!(!r3.accepts(&"b".into()), "Expected {} to accept 'b'", r3);
56/// ```
57#[derive(Debug)]
58pub struct ReBuilder {
59    registry: Registry,
60    optimize: bool,
61
62    /* base expressions */
63    re_none: Regex,
64    re_all: Regex,
65    re_allchar: Regex,
66    re_epsilon: Regex,
67}
68
69impl Default for ReBuilder {
70    /// Creates a new `RegexBuilder` with an empty internal registry.
71    /// By default, on-the-fly optimization is enabled.
72    fn default() -> Self {
73        let mut registry = Registry::new();
74        let re_none = registry.intern(ReOp::None);
75        let re_all = registry.intern(ReOp::All);
76        let re_allchar = registry.intern(ReOp::Any);
77        let re_epsilon = registry.intern(ReOp::Literal(SmtString::empty()));
78
79        Self {
80            registry,
81            optimize: true,
82            re_none,
83            re_all,
84            re_allchar,
85            re_epsilon,
86        }
87    }
88}
89
90impl ReBuilder {
91    /// Creates a new `RegexBuilder` with on-the-fly optimization disabled.
92    pub fn non_optimizing() -> Self {
93        Self {
94            optimize: false,
95            ..Default::default()
96        }
97    }
98
99    fn intern(&mut self, regex: ReOp) -> Regex {
100        self.registry.intern(regex)
101    }
102
103    /// Checks if the builder manages the given regex.
104    /// Returns true if the regex was constructed by this builder.
105    /// Returns false otherwise.
106    ///
107    /// # Example
108    /// ```
109    /// use smt_str::re::*;
110    /// use smallvec::smallvec;
111    ///
112    /// let mut builder1 = ReBuilder::default();
113    /// let mut builder2 = ReBuilder::default();
114    ///
115    /// // Construct a regex using the first builder
116    /// let r1 = builder1.to_re("a".into());
117    ///
118    /// // Construct a regex using the second builder
119    /// let r2 = builder2.to_re("b".into());
120    ///
121    ///
122    /// assert!(builder1.manages(&r1));
123    /// assert!(!builder1.manages(&r2));
124    /// assert!(builder2.manages(&r2));
125    /// assert!(!builder2.manages(&r1));
126    /// ```
127    pub fn manages(&self, regex: &Regex) -> bool {
128        // We need to check if the registry has the regex stored with the same id.
129        // After that we need to check that that the stored regex is pointer equal to the given regex
130        // Only checking the id is not enough as the id is only unique to the builder instance
131        // We could also recurse the structure and check if all children are managed by this builder.
132        match self.registry.registry.get(regex.op()) {
133            Some(r) => Rc::ptr_eq(r, regex),
134            None => false,
135        }
136    }
137
138    /// Re-create a structurally identical regex as the given one using this builder.
139    ///
140    /// /// # Example
141    /// ```
142    /// use smt_str::re::*;
143    /// use smallvec::smallvec;
144    ///
145    /// let mut builder1 = ReBuilder::default();
146    /// let mut builder2 = ReBuilder::default();
147    ///
148    ///
149    /// // Construct a regex using the second builder
150    /// let r = builder2.to_re("b".into());
151    ///
152    /// assert!(!builder1.manages(&r));
153    ///
154    /// let rr = builder1.regex(&r);
155    /// assert!(builder1.manages(&rr));
156    /// ```
157    pub fn regex(&mut self, regex: &Regex) -> Regex {
158        match regex.op() {
159            ReOp::Literal(w) => self.to_re(w.clone()),
160            ReOp::None => self.none(),
161            ReOp::All => self.all(),
162            ReOp::Any => self.allchar(),
163            ReOp::Concat(rs) => {
164                let rs = rs.iter().map(|r| self.regex(r)).collect();
165                self.concat(rs)
166            }
167            ReOp::Union(rs) => {
168                let rs = rs.iter().map(|r| self.regex(r)).collect();
169                self.union(rs)
170            }
171            ReOp::Inter(rs) => {
172                let rs = rs.iter().map(|r| self.regex(r)).collect();
173                self.inter(rs)
174            }
175            ReOp::Star(r) => {
176                let r = self.regex(r);
177                self.star(r)
178            }
179            ReOp::Plus(r) => {
180                let r = self.regex(r);
181                self.plus(r)
182            }
183            ReOp::Opt(r) => {
184                let r = self.regex(r);
185                self.opt(r)
186            }
187            ReOp::Range(r) => self.range_from_to(r.start(), r.end()),
188            ReOp::Comp(r) => {
189                let r = self.regex(r);
190                self.comp(r)
191            }
192            ReOp::Diff(r1, r2) => {
193                let r1 = self.regex(r1);
194                let r2 = self.regex(r2);
195                self.diff(r1, r2)
196            }
197            ReOp::Pow(r, e) => {
198                let r = self.regex(r);
199                self.pow(r, *e)
200            }
201            ReOp::Loop(r, l, u) => {
202                let r = self.regex(r);
203                self.loop_(r, *l, *u)
204            }
205        }
206    }
207
208    /// Constructor for `str.to_re`.
209    /// Returns a regular expression of a constant word.
210    ///
211    /// # Example
212    /// ```
213    /// use smt_str::re::*;
214    /// use smt_str::SmtString;
215    ///
216    /// let mut builder = ReBuilder::default();
217    /// let string = SmtString::from("abc");
218    /// let r = builder.to_re(string.clone());
219    /// assert!(r.accepts(&string));
220    /// ```
221    ///
222    pub fn to_re(&mut self, w: SmtString) -> Regex {
223        self.intern(ReOp::Literal(w))
224    }
225
226    /// Constructs a regular expression denoting the empty word.
227    /// This is exactly the regular expression `(str.to_re "")`.
228    ///
229    /// # Example
230    /// ```
231    /// use smt_str::re::*;
232    ///
233    /// let mut builder = ReBuilder::default();
234    /// let r = builder.epsilon();
235    /// assert!(r.accepts(&"".into()));
236    /// ```
237    pub fn epsilon(&self) -> Regex {
238        self.re_epsilon.clone()
239    }
240
241    /// Constructor for `re.none`.
242    /// Returns a regular expression denoting the empty set.
243    ///
244    /// # Example
245    /// ```
246    /// use smt_str::re::*;
247    ///
248    ///
249    /// let mut builder = ReBuilder::default();
250    /// let r = builder.none();
251    /// assert!(!r.accepts(&"a".into()));
252    /// assert!(!r.accepts(&"".into()));
253    /// assert_eq!(r.none(), Some(true));
254    /// ```
255    pub fn none(&self) -> Regex {
256        self.re_none.clone()
257    }
258
259    /// Constructor for `re.all`.
260    /// Returns a regular expression denoting the set of all strings.
261    ///
262    /// # Example
263    /// ```
264    /// use smt_str::re::*;
265    ///
266    /// let mut builder = ReBuilder::default();
267    /// let r = builder.all();
268    /// assert!(r.accepts(&"a".into()));
269    /// assert!(r.accepts(&"".into()));
270    /// assert_eq!(r.universal(), Some(true));
271    /// ```
272    pub fn all(&self) -> Regex {
273        self.re_all.clone()
274    }
275
276    /// Constructor for `re.allchar`.
277    /// Returns a regular expression accepting any character in the SMT-LIB alphabet (0 - 0x2FFFF).
278    ///
279    /// # Example
280    /// ```
281    /// use smt_str::re::*;
282    ///
283    /// let mut builder = ReBuilder::default();
284    /// let r = builder.allchar();
285    /// assert!(r.accepts(&"a".into()));
286    /// assert!(r.accepts(&"🦀".into()));
287    /// assert!(!r.accepts(&"".into()));
288    /// ```
289    pub fn allchar(&self) -> Regex {
290        self.re_allchar.clone()
291    }
292
293    /// Constructor for `re.range`.
294    /// Returns a regular expression denoting the set of characters in the given range.
295    ///
296    /// # Example
297    /// ```
298    /// use smt_str::re::*;
299    /// use smt_str::alphabet::CharRange;
300    ///
301    /// let mut builder = ReBuilder::default();
302    /// let r = builder.range(CharRange::new('a', 'z'));
303    /// assert!(r.accepts(&"a".into()));
304    /// assert!(r.accepts(&"c".into()));
305    /// assert!(r.accepts(&"a".into()));
306    /// assert!(r.accepts(&"b".into()));
307    /// assert!(!r.accepts(&"A".into()));
308    /// ```
309    pub fn range(&mut self, r: CharRange) -> Regex {
310        if self.optimize {
311            self.range_opt(r)
312        } else {
313            self.intern(ReOp::Range(r))
314        }
315    }
316
317    /// Wrapper for [`ReBuilder::range`] that creates a range from the given characters.
318    pub fn range_from_to(&mut self, l: impl Into<SmtChar>, u: impl Into<SmtChar>) -> Regex {
319        self.range(CharRange::new(l.into(), u.into()))
320    }
321
322    fn range_opt(&mut self, r: CharRange) -> Regex {
323        if r.is_empty() {
324            self.none()
325        } else if r.is_full() {
326            self.allchar()
327        } else if let Some(c) = r.is_singleton() {
328            self.to_re(c.into())
329        } else {
330            self.intern(ReOp::Range(r))
331        }
332    }
333
334    /// Constructor for `re.++`.
335    /// Constructs a regular expression denoting the concatenation of the given regular expressions.
336    ///
337    /// # Example
338    /// ```
339    /// use smt_str::re::*;
340    /// use smallvec::smallvec;
341    ///
342    /// let mut builder = ReBuilder::default();
343    /// let r1 = builder.to_re("abc".into());
344    /// let r2 = builder.to_re("def".into());
345    /// let r = builder.concat(smallvec![r1, r2]);
346    /// assert!(r.accepts(&"abcdef".into()));
347    /// assert!(!r.accepts(&"abc".into()));
348    /// ```
349    pub fn concat(&mut self, rs: SmallVec<[Regex; 2]>) -> Regex {
350        if self.optimize {
351            self.concat_opt(rs)
352        } else {
353            self.intern(ReOp::Concat(rs))
354        }
355    }
356
357    fn concat_opt(&mut self, rs: SmallVec<[Regex; 2]>) -> Regex {
358        // Check if any of the children is the empty set, then the concatenation is the empty set
359        if rs.iter().any(|i| i.none() == Some(true)) {
360            return self.none();
361        }
362
363        // Filter out empty terms as they are the identity element of concatenation
364        // Flatten nested concatenations
365        let rs = rs
366            .into_iter()
367            .filter(|i| i.epsilon() != Some(true))
368            .flat_map(|i| {
369                if let ReOp::Concat(rs) = &i.op() {
370                    rs.clone()
371                } else {
372                    smallvec![i]
373                }
374            })
375            .collect_vec();
376
377        // Collapse adjacent words
378        // TODO: Do this in one iteration
379        let mut collapsed: SmallVec<[Regex; 2]> = SmallVec::with_capacity(rs.len());
380        let mut last_word: Option<SmtString> = None;
381        for r in rs.into_iter() {
382            if let ReOp::Literal(w) = &r.op() {
383                if let Some(last) = last_word {
384                    last_word = Some(last.concat(w));
385                } else {
386                    last_word = Some(w.clone());
387                }
388            } else {
389                if let Some(last) = last_word {
390                    collapsed.push(self.to_re(last));
391                    last_word = None;
392                }
393                collapsed.push(r);
394            }
395        }
396        if let Some(last) = last_word {
397            collapsed.push(self.to_re(last));
398        }
399
400        if collapsed.is_empty() {
401            self.epsilon()
402        } else if collapsed.len() == 1 {
403            collapsed[0].clone()
404        } else {
405            self.intern(ReOp::Concat(collapsed))
406        }
407    }
408
409    /// Constructor for `re.union`.
410    /// Returns a regular expression denoting the union of the given regular expressions.
411    ///
412    /// # Example
413    /// ```
414    /// use smt_str::re::*;
415    /// use smallvec::smallvec;
416    ///
417    /// let mut builder = ReBuilder::default();
418    /// let r1 = builder.to_re("ac".into());
419    /// let r2 = builder.to_re("ab".into());
420    /// let r = builder.union(smallvec![r1, r2]);
421    /// assert!(r.accepts(&"ac".into()));
422    /// assert!(r.accepts(&"ab".into()));
423    /// assert!(!r.accepts(&"cb".into()));
424    /// ```
425    pub fn union(&mut self, rs: SmallVec<[Regex; 2]>) -> Regex {
426        if self.optimize {
427            self.union_opt(rs)
428        } else {
429            self.intern(ReOp::Union(rs))
430        }
431    }
432
433    fn union_opt(&mut self, rs: SmallVec<[Regex; 2]>) -> Regex {
434        // Filter out empty terms as they are the identity element of union
435
436        // Filter out empty terms as they are the identity element of union
437        // Collect the remaining terms into a set to deduplicate and make the order deterministic
438        #[allow(clippy::mutable_key_type)]
439        let mut cleaned: BTreeSet<Regex> = BTreeSet::new();
440        let mut ranges: Option<Alphabet> = None;
441        for r in rs.into_iter().filter(|r| r.none() != Some(true)) {
442            if cleaned.contains(&self.comp(r.clone())) {
443                // if a union contains a regex and its complement, then it is the unviversal regex
444                return self.all();
445            }
446
447            match r.op() {
448                ReOp::Range(r) => match ranges {
449                    Some(ref mut ranges) => ranges.insert(*r),
450                    None => ranges = Some(Alphabet::from(*r)),
451                },
452                ReOp::None => (),
453                ReOp::Any => ranges = Some(Alphabet::full()),
454                ReOp::All => return self.all(),
455                ReOp::Union(rs) => {
456                    // Flatten
457                    cleaned.extend(BTreeSet::from_iter(rs.clone()));
458                }
459                _ => {
460                    cleaned.insert(r);
461                }
462            }
463        }
464        if let Some(a) = ranges {
465            let rs: SmallVec<[Regex; 2]> = a.iter_ranges().map(|r| self.range(r)).collect();
466            cleaned.insert(self.intern(ReOp::Union(rs)));
467        }
468        let rs: SmallVec<[Rc<ReNode>; 2]> = cleaned.into_iter().collect();
469        if rs.is_empty() {
470            self.none()
471        } else if rs.len() == 1 {
472            rs[0].clone()
473        } else {
474            self.intern(ReOp::Union(rs))
475        }
476    }
477
478    /// Constructor for `re.inter`.
479    /// Returns a regular expression denoting the intersection of the given regular expressions.
480    ///
481    /// # Example
482    /// ```
483    /// use smt_str::re::*;
484    /// use smallvec::smallvec;
485    ///
486    /// let mut builder = ReBuilder::default();
487    /// let r1 = builder.range_from_to('a', 'z');
488    /// let r2 = builder.range_from_to('o', 'x');
489    /// let r = builder.inter(smallvec![r1, r2]);
490    /// assert!(r.accepts(&"o".into()));
491    /// assert!(r.accepts(&"q".into()));
492    /// assert!(r.accepts(&"x".into()));
493    /// assert!(!r.accepts(&"z".into()));
494    /// assert!(!r.accepts(&"1".into()));
495    /// ```
496    pub fn inter(&mut self, rs: SmallVec<[Regex; 2]>) -> Regex {
497        if self.optimize {
498            self.inter_opt(rs)
499        } else {
500            self.intern(ReOp::Inter(rs))
501        }
502    }
503
504    fn inter_opt(&mut self, rs: SmallVec<[Regex; 2]>) -> Regex {
505        // Check if any of the children is the empty set, then the intersection is the empty set
506        if rs.iter().any(|i| i.none() == Some(true)) {
507            return self.none();
508        }
509
510        // Filter out re.all terms as they are the identity element of intersection
511        // As as side effect, this also flattens nested intersections, and sorts them to make the order deterministic
512        #[allow(clippy::mutable_key_type)]
513        let rs: BTreeSet<_> = rs
514            .into_iter()
515            .filter(|i| i.universal() != Some(true))
516            .flat_map(|i| {
517                if let ReOp::Inter(rs) = &i.op() {
518                    rs.clone()
519                } else {
520                    smallvec![i]
521                }
522            })
523            .collect();
524
525        let rs: SmallVec<[Rc<ReNode>; 2]> = rs.into_iter().collect();
526        if rs.is_empty() {
527            self.all()
528        } else if rs.len() == 1 {
529            rs[0].clone()
530        } else {
531            self.intern(ReOp::Inter(rs))
532        }
533    }
534
535    /// Constructor for `re.*`.
536    /// Returns a regular expression denoting the Kleene star of the given regular expression.
537    ///
538    /// # Example
539    /// ```
540    /// use smt_str::re::*;
541    ///
542    /// let mut builder = ReBuilder::default();
543    /// let r = builder.to_re("a".into());
544    /// let s = builder.star(r);
545    /// assert!(s.accepts(&"".into()));
546    /// assert!(s.accepts(&"a".into()));
547    /// assert!(s.accepts(&"aaaa".into()));
548    /// assert!(s.accepts(&"aaaaaaaa".into()));
549    /// assert!(!s.accepts(&"b".into()));
550    /// ```
551    pub fn star(&mut self, r: Regex) -> Regex {
552        if self.optimize {
553            self.star_opt(r)
554        } else {
555            self.intern(ReOp::Star(r))
556        }
557    }
558
559    fn star_opt(&mut self, r: Regex) -> Regex {
560        if r.none().unwrap_or(false) || r.epsilon().unwrap_or(false) {
561            return self.epsilon();
562        }
563
564        // If r is e+ or e* or e? then return e
565        // Otherwise, return r
566        fn stip_closures(r: Regex) -> Regex {
567            match r.op() {
568                ReOp::Star(inner) | ReOp::Plus(inner) | ReOp::Opt(inner) => inner.clone(),
569                _ => r,
570            }
571        }
572
573        match r.op() {
574            ReOp::Union(rs) => {
575                // Strip closures from each branch
576                let rs = rs.iter().map(|r| stip_closures(r.clone())).collect();
577                let u = self.union(rs);
578                self.intern(ReOp::Star(u))
579            }
580            ReOp::Star(_) => {
581                // Flatten nested stars
582                r.clone()
583            }
584            ReOp::Plus(rr) | ReOp::Opt(rr) => {
585                // Flatten (R+)* = R*
586                // Flatten (R?)* = R*
587                self.star(rr.clone())
588            }
589            _ => self.intern(ReOp::Star(r)),
590        }
591    }
592
593    /// Constructor for `re.+`.
594    /// Returns a regular expression denoting the positive closure of the given regular expression.
595    ///
596    /// # Example
597    /// ```
598    /// use smt_str::re::*;
599    ///
600    /// let mut builder = ReBuilder::default();
601    /// let r = builder.to_re("a".into());
602    /// let p = builder.plus(r);
603    /// assert!(!p.accepts(&"".into()));
604    /// assert!(p.accepts(&"a".into()));
605    /// assert!(p.accepts(&"aaaa".into()));
606    /// assert!(p.accepts(&"aaaaaaaa".into()));
607    /// assert!(!p.accepts(&"b".into()));
608    /// ```
609    pub fn plus(&mut self, r: Regex) -> Regex {
610        if self.optimize {
611            self.plus_opt(r)
612        } else {
613            self.intern(ReOp::Plus(r))
614        }
615    }
616
617    fn plus_opt(&mut self, r: Regex) -> Regex {
618        // (∅)+ = ∅
619        if r.none().unwrap_or(false) {
620            return self.none();
621        }
622
623        // If r is e+ return e
624        // Otherwise, return r
625        fn stip_plus(r: Regex) -> Regex {
626            match r.op() {
627                ReOp::Plus(inner) => inner.clone(),
628                _ => r,
629            }
630        }
631
632        // ε+ = ε
633        if r.epsilon().unwrap_or(false) {
634            return self.epsilon();
635        }
636
637        if r.nullable() {
638            // (R)+ = R* with R nullable
639            if let ReOp::Star(inner) = r.op() {
640                return self.star(inner.clone());
641            }
642        }
643
644        match r.op() {
645            // (R+)+ → R+
646            ReOp::Plus(_) => r.clone(),
647
648            // (R*)+ → R*
649            ReOp::Star(inner) => self.star(inner.clone()),
650
651            // (⋃ R_i+)+ → (⋃ R_i)+ → ⋃ R_i+
652            ReOp::Union(rs) => {
653                let rs = rs.iter().map(|r| stip_plus(r.clone())).collect();
654                let u = self.union(rs);
655                self.intern(ReOp::Plus(u))
656            }
657
658            _ => self.intern(ReOp::Plus(r)),
659        }
660    }
661
662    /// Constructor for `re.comp`.
663    /// Returns a regular expression denoting the complement of the given regular expression.
664    ///
665    /// # Example
666    /// ```
667    /// use smt_str::re::*;
668    ///
669    /// let mut builder = ReBuilder::default();
670    /// let r = builder.to_re("a".into());
671    /// let c = builder.comp(r);
672    /// assert!(!c.accepts(&"a".into()));
673    /// assert!(c.accepts(&"b".into()));
674    /// assert!(c.accepts(&"".into()));
675    /// assert!(c.accepts(&"aa".into()));
676    /// ```
677    pub fn comp(&mut self, r: Regex) -> Regex {
678        if self.optimize {
679            self.comp_opt(r)
680        } else {
681            self.intern(ReOp::Comp(r))
682        }
683    }
684
685    fn comp_opt(&mut self, r: Regex) -> Regex {
686        if r.none().unwrap_or(false) {
687            self.all()
688        } else if r.universal().unwrap_or(false) {
689            self.none()
690        } else if let ReOp::Comp(r) = r.op() {
691            r.clone()
692        } else {
693            self.intern(ReOp::Comp(r))
694        }
695    }
696
697    /// Constructor for `re.diff`.
698    /// Returns a regular expression denoting the difference of the given regular expressions.
699    ///
700    /// # Example
701    /// ```
702    /// use smt_str::re::*;
703    ///
704    /// let mut builder = ReBuilder::default();
705    /// let a = builder.to_re("a".into());
706    /// let s = builder.star(a.clone());
707    /// let eps = builder.epsilon();
708    /// let r = builder.diff(s, eps);
709    ///
710    /// assert!(r.accepts(&"a".into()));
711    /// assert!(r.accepts(&"aaaa".into()));
712    /// assert!(!r.accepts(&"".into()));
713    /// assert!(!r.accepts(&"b".into()));
714    /// ```
715    pub fn diff(&mut self, r1: Regex, r2: Regex) -> Regex {
716        if self.optimize {
717            self.diff_opt(r1, r2)
718        } else {
719            self.intern(ReOp::Diff(r1, r2))
720        }
721    }
722
723    fn diff_opt(&mut self, r1: Regex, r2: Regex) -> Regex {
724        if r1.none().unwrap_or(false) || r2.universal().unwrap_or(false) {
725            self.none()
726        } else if r2.none().unwrap_or(false) {
727            r1.clone()
728        } else {
729            match (r1.op(), r2.op()) {
730                (ReOp::Opt(r), _) if r2.epsilon().unwrap_or(false) => r.clone(),
731                (ReOp::Star(r), _) if r2.epsilon().unwrap_or(false) => self.plus(r.clone()),
732                _ => self.intern(ReOp::Diff(r1, r2)),
733            }
734        }
735    }
736
737    /// Constructor for `re.opt`.
738    /// Returns a regular expression denoting the optional version of the given regular expression.
739    ///
740    /// # Example
741    /// ```
742    /// use smt_str::re::*;
743    ///
744    /// let mut builder = ReBuilder::default();
745    /// let r = builder.to_re("a".into());
746    /// let o = builder.opt(r);
747    /// assert!(o.accepts(&"a".into()));
748    /// assert!(o.accepts(&"".into()));
749    /// assert!(!o.accepts(&"b".into()));
750    /// ```
751    pub fn opt(&mut self, r: Regex) -> Regex {
752        if self.optimize {
753            self.opt_opt(r)
754        } else {
755            self.intern(ReOp::Opt(r))
756        }
757    }
758
759    fn opt_opt(&mut self, r: Regex) -> Regex {
760        if r.none().unwrap_or(false) || r.epsilon().unwrap_or(false) {
761            self.epsilon()
762        } else {
763            self.intern(ReOp::Opt(r))
764        }
765    }
766
767    /// Constructor for `re.pow`.
768    /// Returns a regular expression denoting the `n`th power of the given regular expression.
769    ///
770    /// # Example
771    /// ```
772    /// use smt_str::re::*;
773    ///
774    /// let mut builder = ReBuilder::default();
775    /// let r = builder.to_re("a".into());
776    /// let p = builder.pow(r, 3);
777    /// assert!(p.accepts(&"aaa".into()));
778    /// assert!(!p.accepts(&"aaaa".into()));
779    /// assert!(!p.accepts(&"aa".into()));
780    /// ```
781    pub fn pow(&mut self, r: Regex, n: u32) -> Regex {
782        if self.optimize {
783            self.pow_opt(r, n)
784        } else {
785            self.intern(ReOp::Pow(r, n))
786        }
787    }
788
789    fn pow_opt(&mut self, r: Regex, n: u32) -> Regex {
790        if n == 0 {
791            self.epsilon()
792        } else if n == 1 {
793            r.clone()
794        } else {
795            self.intern(ReOp::Pow(r, n))
796        }
797    }
798
799    /// Constructor for `re.loop`.
800    /// Returns a regular expression denoting the loop of the given regular expression.
801    /// That is, the regular expression accepts any number of repetitions of the given regular expression between `l` and `u` times (inclusive).
802    ///
803    /// # Example
804    /// ```
805    /// use smt_str::re::*;
806    ///
807    /// let mut builder = ReBuilder::default();
808    /// let r = builder.to_re("a".into());
809    /// let l = 2;
810    /// let u = 4;
811    /// let looped = builder.loop_(r, l, u);
812    /// assert!(looped.accepts(&"aa".into()));
813    /// assert!(looped.accepts(&"aaa".into()));
814    /// assert!(looped.accepts(&"aaaa".into()));
815    /// assert!(!looped.accepts(&"aaaaa".into()));
816    /// assert!(!looped.accepts(&"a".into()));
817    /// ```
818    pub fn loop_(&mut self, r: Regex, l: u32, u: u32) -> Regex {
819        if self.optimize {
820            self.loop_opt(r, l, u)
821        } else {
822            self.intern(ReOp::Loop(r, l, u))
823        }
824    }
825
826    fn loop_opt(&mut self, r: Regex, l: u32, u: u32) -> Regex {
827        if l > u {
828            self.none()
829        } else if u == 0 {
830            self.epsilon()
831        } else if l == u {
832            self.pow(r, u)
833        } else {
834            self.intern(ReOp::Loop(r, l, u))
835        }
836    }
837
838    /// Unrolls a loop of the given regular expression.
839    /// The loop is unrolled as a concatenation of the given regular expression repeated `l` times followed by the optional regular expression repeated `u-l` times.
840    #[allow(dead_code)]
841    fn unroll_loop(&mut self, r: Regex, l: u32, u: u32) -> Regex {
842        let mut concats: SmallVec<[Rc<ReNode>; 2]> = SmallVec::with_capacity(u as usize);
843        let opt = self.opt(r.clone());
844        for i in 0..u {
845            if i < l {
846                concats.push(r.clone());
847            } else {
848                concats.push(opt.clone());
849            }
850        }
851        self.concat(concats)
852    }
853
854    /// Unrolls a power of the given regular expression.
855    /// The power is unrolled as a concatenation of the given regular expression repeated `n` times.
856    #[allow(dead_code)]
857    fn unroll_pow(&mut self, r: Regex, n: u32) -> Regex {
858        let mut concats = SmallVec::with_capacity(n as usize);
859        for _ in 0..n {
860            concats.push(r.clone());
861        }
862        self.concat(concats)
863    }
864
865    /* Aux methods */
866
867    /// Constructs a regular expression denoting the reverse of the given regular expression.
868    /// If a word w is accepted by the given regular expression, then the reverse of w is accepted by the returned regular expression.
869    pub fn reversed(&mut self, r: &Regex) -> Regex {
870        match r.op() {
871            ReOp::Literal(word) => self.to_re(word.reversed()),
872            ReOp::None | ReOp::All | ReOp::Any | ReOp::Range(_) => r.clone(),
873            ReOp::Concat(rs) => {
874                let rs = rs.iter().rev().map(|r| self.reversed(r)).collect();
875                self.concat(rs)
876            }
877            ReOp::Union(rs) => {
878                let rs = rs.iter().map(|r| self.reversed(r)).collect();
879                self.union(rs)
880            }
881            ReOp::Inter(rs) => {
882                let rs = rs.iter().map(|r| self.reversed(r)).collect();
883                self.inter(rs)
884            }
885            ReOp::Star(r) => {
886                let rev = self.reversed(r);
887                self.star(rev)
888            }
889            ReOp::Plus(r) => {
890                let rev = self.reversed(r);
891                self.plus(rev)
892            }
893            ReOp::Opt(r) => {
894                let rev = self.reversed(r);
895                self.opt(rev)
896            }
897            ReOp::Comp(r) => {
898                let rev = self.reversed(r);
899                self.comp(rev)
900            }
901            ReOp::Diff(r, r1) => {
902                let rev = self.reversed(r);
903                let rev1 = self.reversed(r1);
904                self.diff(rev, rev1)
905            }
906            ReOp::Pow(r, e) => {
907                let rev = self.reversed(r);
908                self.pow(rev, *e)
909            }
910            ReOp::Loop(r, l, u) => {
911                let rev = self.reversed(r);
912                self.loop_(rev, *l, *u)
913            }
914        }
915    }
916}
917
918/// Ensures that identical regex patterns are reused and provides construction methods for each regex op(), optimizing memory usage and construction efficiency.
919#[derive(Debug)]
920struct Registry {
921    /// Stores the unique instances of `Regex`.
922    /// The key is the regex pattern itself, and the value is a shared reference-counted handle to the pattern.
923    registry: HashMap<ReOp, Regex>,
924    /// The id to assign to the next regex.
925    next_id: usize,
926}
927
928impl Registry {
929    /// Creates a new `Interner` with an empty internal registry.
930    /// It starts without any regex patterns and will only populate its internal map when `intern` is called.
931    fn new() -> Self {
932        Registry {
933            registry: HashMap::new(),
934            next_id: 0,
935        }
936    }
937
938    /// Interns a regex pattern, ensuring each unique regex is stored and reused.
939    /// This method serves as the primary access point for adding new regex patterns to the builder.
940    ///
941    /// # Arguments
942    /// * `regex` - The `Regex` pattern to intern.
943    ///
944    /// # Returns
945    /// A [RegexRef] pointing to the stored or newly created regex instance.
946    fn intern(&mut self, op: ReOp) -> Regex {
947        if let Some(existing) = self.registry.get(&op) {
948            existing.clone()
949        } else {
950            let regex = ReNode::new(self.next_id, op.clone());
951            self.next_id += 1;
952            let re = Rc::new(regex.clone());
953            self.registry.insert(op, re.clone());
954            re
955        }
956    }
957}
958
959#[cfg(test)]
960mod test {
961    use test::build::ReBuilder;
962
963    use super::*;
964
965    #[test]
966    fn intern_word() {
967        let mut builder = ReBuilder::default();
968        let w1 = builder.to_re("abc".into());
969        let w2 = builder.to_re("abc".into());
970        assert!(Rc::ptr_eq(&w1, &w2));
971    }
972
973    #[test]
974    fn intern_epsilon() {
975        let builder = ReBuilder::default();
976        let e1 = builder.epsilon();
977        let e2 = builder.epsilon();
978        assert!(Rc::ptr_eq(&e1, &e2));
979    }
980
981    #[test]
982    fn intern_none() {
983        let builder = ReBuilder::default();
984        let n1 = builder.none();
985        let n2 = builder.none();
986        assert!(Rc::ptr_eq(&n1, &n2));
987    }
988
989    #[test]
990    fn intern_all() {
991        let builder = ReBuilder::default();
992        let a1 = builder.all();
993        let a2 = builder.all();
994        assert!(Rc::ptr_eq(&a1, &a2));
995    }
996
997    #[test]
998    fn intern_all_char() {
999        let builder = ReBuilder::default();
1000        let ac1 = builder.allchar();
1001        let ac2 = builder.allchar();
1002        assert!(Rc::ptr_eq(&ac1, &ac2));
1003    }
1004
1005    #[test]
1006    fn intern_range() {
1007        let mut builder = ReBuilder::default();
1008        let r1 = builder.range_from_to('a', 'z');
1009        let r2 = builder.range_from_to('a', 'z');
1010        assert!(Rc::ptr_eq(&r1, &r2));
1011    }
1012
1013    #[test]
1014    fn intern_range_full() {
1015        let mut builder = ReBuilder::default();
1016        let r1 = builder.range_from_to('\0', SmtChar::MAX);
1017        let r2 = builder.allchar();
1018        assert!(Rc::ptr_eq(&r1, &r2));
1019    }
1020
1021    #[test]
1022    fn intern_concat() {
1023        let mut builder = ReBuilder::default();
1024        let w1 = builder.to_re("abc".into());
1025        let w2 = builder.to_re("def".into());
1026        let c1 = builder.concat(smallvec![w1.clone(), w2.clone()]);
1027        let c2 = builder.concat(smallvec![w1.clone(), w2.clone()]);
1028        assert!(Rc::ptr_eq(&c1, &c2));
1029    }
1030
1031    #[test]
1032    fn intern_union() {
1033        let mut builder = ReBuilder::default();
1034        let w1 = builder.to_re("abc".into());
1035        let w2 = builder.to_re("def".into());
1036        let u1 = builder.union(smallvec![w1.clone(), w2.clone()]);
1037        let u2 = builder.union(smallvec![w1.clone(), w2.clone()]);
1038        assert!(Rc::ptr_eq(&u1, &u2));
1039    }
1040
1041    #[test]
1042    fn intern_inter() {
1043        let mut builder = ReBuilder::default();
1044        let w1 = builder.to_re("abc".into());
1045        let w2 = builder.to_re("def".into());
1046        let args = smallvec![w1.clone(), w2.clone()];
1047        let i1 = builder.inter(args.clone());
1048        let i2 = builder.inter(args.clone());
1049        assert!(Rc::ptr_eq(&i1, &i2));
1050    }
1051
1052    #[test]
1053    fn intern_star() {
1054        let mut builder = ReBuilder::default();
1055        let w1 = builder.to_re("abc".into());
1056        let s1 = builder.star(w1.clone());
1057        let s2 = builder.star(w1.clone());
1058        assert!(Rc::ptr_eq(&s1, &s2));
1059    }
1060
1061    #[test]
1062    fn intern_plus() {
1063        let mut builder = ReBuilder::default();
1064        let w1 = builder.to_re("abc".into());
1065        let p1 = builder.plus(w1.clone());
1066        let p2 = builder.plus(w1.clone());
1067        assert!(Rc::ptr_eq(&p1, &p2));
1068    }
1069
1070    #[test]
1071    fn intern_opt() {
1072        let mut builder = ReBuilder::default();
1073        let w1 = builder.to_re("abc".into());
1074        let o1 = builder.opt(w1.clone());
1075        let o2 = builder.opt(w1.clone());
1076        assert!(Rc::ptr_eq(&o1, &o2));
1077    }
1078
1079    #[test]
1080    fn intern_comp() {
1081        let mut builder = ReBuilder::default();
1082        let w1 = builder.to_re("abc".into());
1083        let c1 = builder.comp(w1.clone());
1084        let c2 = builder.comp(w1.clone());
1085        assert!(Rc::ptr_eq(&c1, &c2));
1086    }
1087
1088    #[test]
1089    fn builder_equal() {
1090        let mut builder = ReBuilder::default();
1091        let w1 = builder.to_re("abc".into());
1092        let w2 = builder.to_re("abc".into());
1093        assert_eq!(w1, w2);
1094        assert!(Rc::ptr_eq(&w1, &w2));
1095
1096        let mut builder2 = ReBuilder::default();
1097        let w3 = builder2.to_re("abc".into());
1098
1099        assert_eq!(w1, w3);
1100        assert!(!Rc::ptr_eq(&w1, &w3));
1101    }
1102
1103    #[test]
1104    fn universal_concat() {
1105        let mut builder = ReBuilder::default();
1106        let a = builder.all();
1107        let c = builder.concat(smallvec![a.clone(), a.clone()]);
1108        assert_eq!(c.universal(), Some(true));
1109    }
1110
1111    #[test]
1112    fn non_universal_concat() {
1113        let mut builder = ReBuilder::default();
1114        let a = builder.all();
1115        let b = builder.allchar();
1116        let c = builder.concat(smallvec![a.clone(), b.clone()]);
1117        assert_eq!(c.universal(), Some(false));
1118    }
1119
1120    #[test]
1121    fn universal_union() {
1122        let mut builder = ReBuilder::default();
1123        let all = builder.all();
1124        let none = builder.none();
1125        let c = builder.union(smallvec![all, none]);
1126        assert_eq!(c.universal(), Some(true));
1127    }
1128
1129    #[test]
1130    fn non_universal_union() {
1131        let mut builder = ReBuilder::default();
1132        let b = builder.allchar();
1133        let c = builder.concat(smallvec![b.clone(), b.clone()]);
1134        assert_eq!(c.universal(), Some(false));
1135    }
1136
1137    #[test]
1138    fn test_union_with_comp_literals() {
1139        let mut rb = ReBuilder::default();
1140
1141        let a = rb.to_re("a".into());
1142        let comp_a = rb.comp(a.clone());
1143        let got = rb.union(smallvec![comp_a, a]);
1144
1145        assert_eq!(
1146            got.op(),
1147            &ReOp::All,
1148            "Expected:\n{}\nGot\n{}",
1149            ReOp::All,
1150            got
1151        );
1152    }
1153
1154    #[test]
1155    fn test_reversed_constant() {
1156        let mut rb = ReBuilder::default();
1157        let regex = rb.to_re("abc".into());
1158        let reversed = rb.reversed(&regex);
1159        assert_eq!(reversed.to_string(), rb.to_re("cba".into()).to_string());
1160    }
1161
1162    #[test]
1163    fn test_reversed_concat() {
1164        let mut rb = ReBuilder::default();
1165        let regex1 = rb.to_re("abc".into());
1166        let regex2 = rb.to_re("def".into());
1167        let concat = rb.concat(smallvec![regex1.clone(), regex2.clone()]);
1168        let reversed = rb.reversed(&concat);
1169        let args = smallvec![rb.reversed(&regex2), rb.reversed(&regex1)];
1170        let expected = rb.concat(args);
1171        assert_eq!(reversed.to_string(), expected.to_string());
1172    }
1173
1174    #[test]
1175    fn test_reversed_concat_2() {
1176        let mut rb = ReBuilder::default();
1177        let a = rb.to_re("a".into());
1178        let astar = rb.star(a);
1179
1180        let b = rb.to_re("b".into());
1181        let bstar = rb.star(b);
1182
1183        // a*b*
1184        let concat = rb.concat(smallvec![astar.clone(), bstar.clone()]);
1185        // Should be b*a*
1186        let reversed = rb.reversed(&concat);
1187
1188        // b*a*
1189        let args = smallvec![bstar, astar];
1190        let expected = rb.concat(args);
1191        assert_eq!(
1192            reversed, expected,
1193            "Expected: {}, Got: {}",
1194            expected, reversed
1195        );
1196    }
1197
1198    #[test]
1199    fn test_reversed_union() {
1200        let mut rb = ReBuilder::default();
1201        let regex1 = rb.to_re("abc".into());
1202        let regex2 = rb.to_re("xyz".into());
1203        let union = rb.union(smallvec![regex1.clone(), regex2.clone()]);
1204        let reversed = rb.reversed(&union);
1205        let args = smallvec![rb.reversed(&regex1), rb.reversed(&regex2)];
1206        let expected = rb.union(args);
1207        assert_eq!(reversed.to_string(), expected.to_string());
1208    }
1209
1210    #[test]
1211    fn test_reversed_star() {
1212        let mut rb = ReBuilder::default();
1213        let regex = rb.to_re("abc".into());
1214        let star = rb.star(regex.clone());
1215        let reversed = rb.reversed(&star);
1216        let revd = rb.reversed(&regex);
1217        let expected = rb.star(revd);
1218        assert_eq!(reversed.to_string(), expected.to_string());
1219    }
1220
1221    #[test]
1222    fn test_reversed_plus() {
1223        let mut rb = ReBuilder::default();
1224        let regex = rb.to_re("abc".into());
1225        let plus = rb.plus(regex.clone());
1226        let reversed = rb.reversed(&plus);
1227
1228        let rev = rb.reversed(&regex);
1229        let expected = rb.plus(rev);
1230        assert_eq!(reversed.to_string(), expected.to_string());
1231    }
1232
1233    #[test]
1234    fn test_reversed_opt() {
1235        let mut rb = ReBuilder::default();
1236        let regex = rb.to_re("abc".into());
1237        let opt = rb.opt(regex.clone());
1238        let reversed = rb.reversed(&opt);
1239
1240        let rev = rb.reversed(&regex);
1241        let expected = rb.opt(rev);
1242        assert_eq!(reversed.to_string(), expected.to_string());
1243    }
1244
1245    #[test]
1246    fn test_reversed_inter() {
1247        let mut rb = ReBuilder::default();
1248        let regex1 = rb.to_re("abc".into());
1249        let regex2 = rb.to_re("xyz".into());
1250        let inter = rb.inter(smallvec![regex1.clone(), regex2.clone()]);
1251        let reversed = rb.reversed(&inter);
1252
1253        let args = smallvec![rb.reversed(&regex1), rb.reversed(&regex2)];
1254        let expected = rb.inter(args);
1255        assert_eq!(reversed.to_string(), expected.to_string());
1256    }
1257
1258    #[test]
1259    fn test_reversed_diff() {
1260        let mut rb = ReBuilder::default();
1261        let regex1 = rb.to_re("abc".into());
1262        let regex2 = rb.to_re("xyz".into());
1263        let diff = rb.diff(regex1.clone(), regex2.clone());
1264        let reversed = rb.reversed(&diff);
1265
1266        let l = rb.reversed(&regex1);
1267        let r = rb.reversed(&regex2);
1268        let expected = rb.diff(l, r);
1269        assert_eq!(reversed.to_string(), expected.to_string());
1270    }
1271
1272    #[test]
1273    fn test_reversed_comp() {
1274        let mut rb = ReBuilder::default();
1275        let regex = rb.to_re("abc".into());
1276        let comp = rb.comp(regex.clone());
1277        let reversed = rb.reversed(&comp);
1278
1279        let rev = rb.reversed(&regex);
1280        let expected = rb.comp(rev);
1281        assert_eq!(reversed.to_string(), expected.to_string());
1282    }
1283
1284    #[test]
1285    fn test_reversed_pow() {
1286        let mut rb = ReBuilder::default();
1287        let regex = rb.to_re("abc".into());
1288        let pow = rb.pow(regex.clone(), 3);
1289        let reversed = rb.reversed(&pow);
1290
1291        let rev = rb.reversed(&regex);
1292        let expected = rb.pow(rev, 3);
1293        assert_eq!(reversed.to_string(), expected.to_string());
1294    }
1295
1296    #[test]
1297    fn test_reversed_loop() {
1298        let mut rb = ReBuilder::default();
1299        let regex = rb.to_re("abc".into());
1300        let looped = rb.loop_(regex.clone(), 2, 5);
1301        let reversed = rb.reversed(&looped);
1302
1303        let rev = rb.reversed(&regex);
1304        let expected = rb.loop_(rev, 2, 5);
1305        assert_eq!(reversed.to_string(), expected.to_string());
1306    }
1307
1308    #[test]
1309    fn test_unroll_loop_zero() {
1310        let mut rb = ReBuilder::default();
1311        let regex = rb.to_re("abc".into());
1312        let unrolled = rb.unroll_loop(regex.clone(), 0, 0);
1313        assert_eq!(unrolled.to_string(), rb.epsilon().to_string());
1314    }
1315
1316    #[test]
1317    fn test_unroll_loop_single() {
1318        let mut rb = ReBuilder::default();
1319        let regex = rb.to_re("abc".into());
1320        let unrolled = rb.unroll_loop(regex.clone(), 1, 1);
1321        assert_eq!(unrolled.to_string(), regex.to_string());
1322    }
1323
1324    #[test]
1325    fn test_unroll_loop_multiple() {
1326        let mut rb = ReBuilder::default();
1327        let regex = rb.to_re("abc".into());
1328        let unrolled = rb.unroll_loop(regex.clone(), 2, 4);
1329        let opt = rb.opt(regex.clone());
1330
1331        let expected = rb.concat(smallvec![
1332            regex.clone(),
1333            regex.clone(),
1334            opt.clone(),
1335            opt.clone()
1336        ]);
1337        assert_eq!(unrolled.to_string(), expected.to_string());
1338    }
1339
1340    #[test]
1341    fn test_unroll_loop_full() {
1342        let mut rb = ReBuilder::default();
1343        let regex = rb.to_re("abc".into());
1344        let unrolled = rb.unroll_loop(regex.clone(), 3, 3);
1345        let expected = rb.concat(smallvec![regex.clone(), regex.clone(), regex.clone()]);
1346        assert_eq!(unrolled.to_string(), expected.to_string());
1347    }
1348
1349    #[test]
1350    fn test_unroll_loop_opt() {
1351        let mut rb = ReBuilder::default();
1352        let regex = rb.to_re("abc".into());
1353        let unrolled = rb.unroll_loop(regex.clone(), 0, 2);
1354        let opt = rb.opt(regex.clone());
1355        let expected = rb.concat(smallvec![opt.clone(), opt.clone()]);
1356        assert_eq!(unrolled.to_string(), expected.to_string());
1357    }
1358
1359    #[test]
1360    fn test_unroll_pow_zero() {
1361        let mut builder = ReBuilder::default();
1362        let r = builder.to_re("a".into());
1363        let result = builder.unroll_pow(r, 0);
1364        assert_eq!(builder.epsilon(), result);
1365    }
1366
1367    #[test]
1368    fn test_unroll_pow_one() {
1369        let mut builder = ReBuilder::default();
1370        let r = builder.to_re("a".into());
1371        let result = builder.unroll_pow(r.clone(), 1);
1372        assert_eq!(r, result);
1373    }
1374
1375    #[test]
1376    fn test_unroll_pow_multiple() {
1377        let mut builder = ReBuilder::default();
1378        let r = builder.to_re("a".into());
1379        let result = builder.unroll_pow(r.clone(), 3);
1380        let expected = builder.concat(smallvec![r.clone(), r.clone(), r.clone()]);
1381        assert_eq!(expected, result);
1382    }
1383}