aws_smt_strings/
regular_expressions.rs

1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// SPDX-License-Identifier: Apache-2.0
3
4//!
5//! Regular expressions
6//!
7//! This module defines the abstract syntax of regular expressions [BaseRegLan]
8//! and the regular expression type [RE]. Regular expressions are built using
9//! an [ReManager], which provides hash consing.
10//!
11//! **Important**: Because of hash consing, regular expressions created with different
12//! [ReManager]s are not compatible. It's recommended to use only one [ReManager].
13//!
14//! Input to the manager's methods are static references to [RE] objects
15//! (see type [RegLan]). The manager also returns objects of type [RegLan]
16//! when producing regular expressions.
17//!
18//! [ReManager] also implements the *derivative* operation. The derivative of a regular
19//! expression R with respect to a character c is another regular expression S that
20//! defines all the strings that can follow c in the language of R. For example,
21//! the derivative of regular expression '(abc + cd)\*' with respect to 'a'  is
22//! 'bc(abc + cd)\*': all words of '(abc + cd)\*' that start with 'a' are of the
23//! form 'a.w' where 'w' is a word 'bc(abc + cd)\*'.
24//!
25//! For a regular expression R, we use a [CharPartition] that divides the alphabet into
26//! equivalent *derivative classes*. If two characters `c1` and `c2` are in the same
27//! derivative class, then the derivative of R with respect to `c1` and the derivative of R
28//! with respect to `c2` are equal. The [ReManager] implements derivative of R with respect
29//! to one of its derivative class. More generally, the derivative of R with respect to a
30//! character set C is well defined if C is included in a derivative class of R.
31//!
32//! Derivatives allows one to convert REs to deterministic automata and support
33//! other operations such as checking whether a string matches an RE.
34//!
35
36use std::collections::HashMap;
37use std::fmt::Display;
38use std::hash::Hash;
39use std::rc::Rc;
40
41use crate::automata::Automaton;
42use crate::automata::AutomatonBuilder;
43use crate::bfs_queues::*;
44use crate::character_sets::*;
45use crate::errors::*;
46use crate::labeled_queues::LabeledQueue;
47use crate::loop_ranges::*;
48use crate::matcher::SearchResult;
49use crate::smt_strings::SmtString;
50use crate::smt_strings::MAX_CHAR;
51use crate::store::*;
52
53#[derive(Debug, Clone, PartialEq, Eq, Hash)]
54///
55/// Abstract syntax for regular expressions
56///
57pub enum BaseRegLan {
58    /// Empty language
59    Empty,
60
61    /// The language that contains only the empty string
62    Epsilon,
63
64    /// Words of length one with characters is a range [a, b]
65    Range(CharSet),
66
67    /// Concatenation of two languages
68    Concat(RegLan, RegLan),
69
70    /// Generalized loop: see [loop_ranges][crate::loop_ranges]
71    Loop(RegLan, LoopRange),
72
73    /// Complement of a language
74    Complement(RegLan),
75
76    /// Union of two or more languages
77    Union(Box<[RegLan]>),
78
79    /// Intersection of two or more languages
80    Inter(Box<[RegLan]>),
81}
82
83/// Reference to a Regular Expression descriptor
84pub type RegLan = &'static RE;
85
86///
87/// Regular expression structure
88///
89/// A regular expression includes an expression of type [BaseRegLan],
90/// which is an abstract syntax tree.
91///
92/// In addition, each regular expression e has a
93/// unique integer id and three attributes:
94/// - e.nullable is true if the language of e contains the empty string
95/// - e.singleton is true if the language of e contains a single string
96/// - e.deriv_class is the list of derivative classes of e.
97///
98/// The derivative classes are disjoint interval characters that cover
99/// a subset of the alphabet, and a complementary class that covers the rest.
100/// See [CharPartition]. The `deriv_class` partition is
101/// constructed to ensure that all the characters in a class produce the same
102/// derivative of e: if c1 and c2 are in the same derivative class of e then
103/// deriv(e, c1) and deriv(e, c2) are equal.
104///
105/// Operations on regular expressions use hash-consing and are performed with
106/// an [ReManager].
107#[derive(Debug)]
108pub struct RE {
109    /// Abstract syntax tree
110    expr: BaseRegLan,
111    /// Unique id for this RE
112    id: usize,
113    /// Whether the language contains the empty string
114    pub nullable: bool,
115    /// Whether this RE has a single element
116    singleton: bool,
117    /// Whether this RE is a simple pattern, i.e.,
118    /// a concatenation of Loops and Ranges
119    simple_pattern: bool,
120    /// Partition of character sets relevant to this RE
121    deriv_class: Rc<CharPartition>,
122}
123
124/// Equality on RE is derived from the unique ids.
125///
126/// Two REs are equal iff they have the same id.
127impl PartialEq for RE {
128    fn eq(&self, other: &Self) -> bool {
129        self.id == other.id
130    }
131}
132
133impl Eq for RE {}
134
135/// Ordering on REs is based on unique ids.
136///
137/// We have re1 < re2 iff re1.id < re2.id
138impl PartialOrd for RE {
139    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
140        // self.id.partial_cmp(&other.id)
141        Some(self.cmp(other))
142    }
143}
144
145impl Ord for RE {
146    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
147        self.id.cmp(&other.id)
148    }
149}
150
151/// The hash code of an RE is just the hash code of its id.
152impl Hash for RE {
153    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
154        self.id.hash(state)
155    }
156}
157
158// utility: add n copies of x to vector v
159fn push_multiple<T: Copy>(v: &mut Vec<T>, n: u32, x: T) {
160    for _ in 0..n {
161        v.push(x);
162    }
163}
164
165impl BaseRegLan {
166    /// Check whether the empty word is in this language
167    fn is_nullable(&self) -> bool {
168        match self {
169            BaseRegLan::Empty => false,
170            BaseRegLan::Epsilon => true,
171            BaseRegLan::Range(_) => false,
172            BaseRegLan::Concat(e1, e2) => e1.nullable && e2.nullable,
173            BaseRegLan::Loop(e, range) => range.start() == 0 || e.nullable,
174            BaseRegLan::Complement(e) => !e.nullable,
175            BaseRegLan::Inter(args) => args.iter().all(|x| x.nullable),
176            BaseRegLan::Union(args) => args.iter().any(|x| x.nullable),
177        }
178    }
179
180    /// Check whether this RE is atomic (either Empty, Epsilon, or a character Range)
181    pub fn is_atomic(&self) -> bool {
182        matches!(
183            self,
184            BaseRegLan::Empty | BaseRegLan::Epsilon | BaseRegLan::Range(_)
185        )
186    }
187
188    /// Check whether this RE is Concat/Loop or Atomic
189    fn concat_or_atomic(&self) -> bool {
190        matches!(
191            self,
192            BaseRegLan::Empty
193                | BaseRegLan::Epsilon
194                | BaseRegLan::Range(..)
195                | BaseRegLan::Concat(..)
196                | BaseRegLan::Loop(..)
197        )
198    }
199
200    /// Check whether this RE is sigma (all chars)
201    fn is_all_chars(&self) -> bool {
202        if let BaseRegLan::Range(s) = self {
203            s.is_alphabet()
204        } else {
205            false
206        }
207    }
208
209    /// Check whether this RE is sigma^* (full language)
210    fn is_full(&self) -> bool {
211        if let BaseRegLan::Loop(r, range) = self {
212            range.is_all() && r.expr.is_all_chars()
213        } else {
214            false
215        }
216    }
217
218    /// Check whether this regular expression is of the form (str.to_re <some string>)
219    /// This holds if the RE is epsilon or if it's a concatenation of characters
220    fn is_singleton(&self) -> bool {
221        match self {
222            BaseRegLan::Epsilon => true,
223            BaseRegLan::Range(c) => c.is_singleton(),
224            BaseRegLan::Concat(e1, e2) => e1.singleton && e2.singleton,
225            BaseRegLan::Loop(e, range) => e.singleton && range.is_point(),
226            _ => false,
227        }
228    }
229
230    /// Check whether this regular expression is a concatenation of ranges
231    /// and loops over ranges.
232    fn is_simple_pattern(&self) -> bool {
233        match self {
234            BaseRegLan::Epsilon => true,
235            BaseRegLan::Range(..) => true,
236            BaseRegLan::Loop(r, ..) => matches!(r.expr, BaseRegLan::Range(..)),
237            BaseRegLan::Concat(e1, e2) => e1.simple_pattern && e2.simple_pattern,
238            _ => false,
239        }
240    }
241
242    /// Check whether this regular expression is a Range
243    /// (i.e., an interval of characters [c0, c1])
244    fn is_range(&self) -> bool {
245        matches!(self, BaseRegLan::Range(..))
246    }
247
248    /// Check whether all strings of `self` are one-character long and belong to s
249    fn match_char_set(&self, s: &CharSet) -> bool {
250        match self {
251            BaseRegLan::Range(x) => s.covers(x),
252            _ => false,
253        }
254    }
255
256    /// Compute the derivation classes for this regular expression
257    fn deriv_class(&self) -> Rc<CharPartition> {
258        fn rc(p: CharPartition) -> Rc<CharPartition> {
259            Rc::new(p)
260        }
261
262        fn merge_deriv_classes(a: &[RegLan]) -> Rc<CharPartition> {
263            let mut result = CharPartition::new();
264            for &re in a {
265                result = merge_partitions(&result, &re.deriv_class)
266            }
267            rc(result)
268        }
269
270        fn empty_partition() -> Rc<CharPartition> {
271            rc(CharPartition::new())
272        }
273
274        match self {
275            BaseRegLan::Empty => empty_partition(),
276            BaseRegLan::Epsilon => empty_partition(),
277            BaseRegLan::Range(c) => rc(CharPartition::from_set(c)),
278            BaseRegLan::Concat(e1, e2) => {
279                if e1.nullable {
280                    rc(merge_partitions(&e1.deriv_class, &e2.deriv_class))
281                } else {
282                    e1.deriv_class.clone()
283                }
284            }
285            BaseRegLan::Loop(e, _) => e.deriv_class.clone(),
286            BaseRegLan::Complement(e) => e.deriv_class.clone(),
287            BaseRegLan::Inter(args) => merge_deriv_classes(args.as_ref()),
288            BaseRegLan::Union(args) => merge_deriv_classes(args.as_ref()),
289        }
290    }
291
292    /// Collect all characters of this RE (if it's a singleton)
293    #[allow(dead_code)]
294    fn collect_chars(&self, v: &mut Vec<u32>) {
295        match self {
296            BaseRegLan::Range(c) => {
297                if c.is_singleton() {
298                    v.push(c.pick());
299                }
300            }
301            BaseRegLan::Loop(r, range) => {
302                if range.is_point() {
303                    if let BaseRegLan::Range(c) = r.expr {
304                        if c.is_singleton() {
305                            push_multiple(v, range.start(), c.pick());
306                        }
307                    }
308                }
309            }
310            BaseRegLan::Concat(e1, e2) => {
311                e1.expr.collect_chars(v);
312                e2.expr.collect_chars(v);
313            }
314            _ => (),
315        }
316    }
317}
318
319impl HashConsed for RE {
320    type Key = BaseRegLan;
321
322    fn make(index: usize, k: &Self::Key) -> Self {
323        RE {
324            expr: k.clone(),
325            id: index,
326            nullable: k.is_nullable(),
327            singleton: k.is_singleton(),
328            simple_pattern: k.is_simple_pattern(),
329            deriv_class: k.deriv_class(),
330        }
331    }
332}
333
334impl Display for BaseRegLan {
335    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
336        // write either e or '(e)' when e is a sub-expression
337        fn write_sub(f: &mut std::fmt::Formatter<'_>, e: RegLan) -> std::fmt::Result {
338            if e.singleton || e.expr.is_atomic() {
339                write!(f, "{}", e.expr)
340            } else {
341                write!(f, "({})", e.expr)
342            }
343        }
344
345        // write a list of operands separated by an symbol
346        fn write_list(
347            f: &mut std::fmt::Formatter<'_>,
348            l: &[RegLan],
349            symbol: char,
350        ) -> std::fmt::Result {
351            debug_assert!(!l.is_empty());
352            write_sub(f, l[0])?;
353            for e in &l[1..] {
354                write!(f, " {symbol} ")?;
355                write_sub(f, e)?;
356            }
357            Ok(())
358        }
359
360        match self {
361            BaseRegLan::Empty => write!(f, "\u{2205}"), // empty set
362            BaseRegLan::Epsilon => write!(f, "\u{03B5}"),
363            BaseRegLan::Range(r) => write!(f, "{r}"),
364            BaseRegLan::Concat(e1, e2) => {
365                let mut v = Vec::new();
366                flatten_concat(e1, &mut v);
367                flatten_concat(e2, &mut v);
368                for e in v {
369                    write_sub(f, e)?
370                }
371                Ok(())
372            }
373            BaseRegLan::Loop(e, range) => {
374                write_sub(f, e)?;
375                write!(f, "^{range}")
376            }
377            BaseRegLan::Complement(e) => {
378                write!(f, "\u{00AC}")?;
379                write_sub(f, e)
380            }
381            BaseRegLan::Inter(args) => write_list(f, args, '&'),
382            BaseRegLan::Union(args) => write_list(f, args, '+'),
383        }
384    }
385}
386
387impl Display for RE {
388    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
389        self.expr.fmt(f)
390    }
391}
392
393impl RE {
394    /// check whether the complementary class is empty
395    pub fn empty_complement(&self) -> bool {
396        self.deriv_class.empty_complement()
397    }
398
399    /// number of derivative classes (not including the complementary class)
400    pub fn num_deriv_classes(&self) -> usize {
401        self.deriv_class.len()
402    }
403
404    /// check whether cid is a valid class id
405    pub fn valid_class_id(&self, cid: ClassId) -> bool {
406        self.deriv_class.valid_class_id(cid)
407    }
408
409    /// Check whether this RE is equal to the empty RE
410    pub fn is_empty(&self) -> bool {
411        matches!(self.expr, BaseRegLan::Empty)
412    }
413
414    /// pick a representative element in a derivative class
415    ///
416    /// - if cid is Interval(i): pick in class i
417    /// - if cid is Complement: pick in the complementary class
418    ///
419    fn pick_class_rep(&self, cid: ClassId) -> u32 {
420        self.deriv_class.pick_in_class(cid)
421    }
422
423    /// class of point x
424    fn class_of_char(&self, x: u32) -> ClassId {
425        debug_assert!(x <= MAX_CHAR);
426        self.deriv_class.class_of_char(x)
427    }
428
429    /// class of set s
430    fn class_of_set(&self, s: &CharSet) -> Result<ClassId, Error> {
431        self.deriv_class.class_of_set(s)
432    }
433
434    /// iterator to go through valid classIds
435    pub fn class_ids(&self) -> ClassIdIterator<'_> {
436        self.deriv_class.class_ids()
437    }
438
439    /// iterator to go through the intervals in an RE derivative classes
440    pub fn char_ranges(&self) -> impl Iterator<Item = &CharSet> {
441        self.deriv_class.ranges()
442    }
443
444    /// incomplete check for inclusion
445    /// - if this returns true then self is included in other
446    /// - otherwise, we don't know
447    pub fn included_in(&self, other: &Self) -> bool {
448        sub_language(self, other)
449    }
450}
451
452/// Iterator to go through all sub-terms of a RegLan
453/// We can't implement this in RE because of lifetime issues
454#[derive(Debug)]
455struct ReIterator {
456    queue: BfsQueue<RegLan>,
457}
458
459impl Iterator for ReIterator {
460    type Item = RegLan;
461
462    /// List all sub-terms in breadth-first order, without duplicates
463    fn next(&mut self) -> Option<Self::Item> {
464        fn get_next(queue: &mut BfsQueue<RegLan>) -> Option<RegLan> {
465            queue.pop().inspect(|x| match x.expr {
466                BaseRegLan::Concat(left, right) => {
467                    queue.push(left);
468                    queue.push(right);
469                }
470                BaseRegLan::Loop(x, _) => {
471                    queue.push(x);
472                }
473                BaseRegLan::Complement(x) => {
474                    queue.push(x);
475                }
476                BaseRegLan::Inter(ref list) => {
477                    queue.push_all(list.iter().copied());
478                }
479                BaseRegLan::Union(ref list) => {
480                    queue.push_all(list.iter().copied());
481                }
482                _ => (),
483            })
484        }
485
486        get_next(&mut self.queue)
487    }
488}
489
490///
491/// Iterator for the sub-terms of r
492/// - This enumerates the sub-terms of r, without duplicates,
493///   in a breadth-first order. The term r is included.
494///   It comes first in the iteration.
495///
496pub fn sub_terms(r: RegLan) -> impl Iterator<Item = RegLan> {
497    let mut queue = BfsQueue::new();
498    queue.push(r);
499    ReIterator { queue }
500}
501
502///
503/// Iterator that enumerates the leaves of r
504/// - A leaf is an atomic sub-term of r (i.e., a term t such that f.expr is either
505///   [BaseRegLan::Empty], or [BaseRegLan::Epsilon] or [BaseRegLan::Range])
506/// - All leaves are listed once (no duplicates)
507/// - If r itself is atomic, the iterator just produces r and nothing else.
508///  
509pub fn leaves(r: RegLan) -> impl Iterator<Item = RegLan> {
510    sub_terms(r).filter(|&x| x.expr.is_atomic())
511}
512
513///
514/// Collect a list L = (R_1,...R_n) such that r = concat(R_1,...,R_n)
515/// and no R_i is itself of the form concat(...) or epsilon, then
516/// add the R_is to vector v.
517///
518fn flatten_concat<'a>(r: &'a RE, v: &mut Vec<&'a RE>) {
519    match &r.expr {
520        BaseRegLan::Epsilon => (), // skip epsilon
521        BaseRegLan::Concat(x, y) => {
522            flatten_concat(x, v);
523            flatten_concat(y, v)
524        }
525        _ => v.push(r),
526    }
527}
528
529///
530/// Same as flatten concat but return a vector of R_1, ..., R_n
531///
532fn decompose_concat(r: &RE) -> Vec<&RE> {
533    let mut result = Vec::new();
534    flatten_concat(r, &mut result);
535    result
536}
537
538///
539/// collect a list L= {R_1, ..., R_n} of languages such
540/// that R = inter(R_1, ..., R_n) and no R_i is itself of the form inter(...)
541/// add R_1 ... R_n to v
542///
543fn flatten_inter<'a>(r: &'a RE, v: &mut Vec<&'a RE>) {
544    match &r.expr {
545        BaseRegLan::Inter(x) => {
546            for &s in x.as_ref() {
547                flatten_inter(s, v)
548            }
549        }
550        _ => v.push(r),
551    }
552}
553
554///
555/// collect a list L= {R_1, ..., R_n} of languages such
556/// that R = union(R_1, ..., R_n) and no R_i is itself of the form union(...)
557/// add R_1 ... R_n to v
558///
559fn flatten_union<'a>(r: &'a RE, v: &mut Vec<&'a RE>) {
560    match &r.expr {
561        BaseRegLan::Union(x) => {
562            for &s in x.as_ref() {
563                flatten_union(s, v)
564            }
565        }
566        _ => v.push(r),
567    }
568}
569
570/// check whether a sorted slice v contains x
571/// this is used for x=empty or x=full or x=epsilon, which have small ids,
572/// so if x occurs, that will be at the beginning of v.
573fn contains<'a>(v: &[&'a RE], x: &'a RE) -> bool {
574    for &y in v {
575        if *y == *x {
576            return true;
577        }
578        if *y > *x {
579            return false;
580        }
581    }
582    false
583}
584
585/// reset a then store x as its unique element
586fn set_to_singleton<'a>(a: &mut Vec<&'a RE>, x: &'a RE) {
587    a.clear();
588    a.push(x);
589}
590
591///
592/// Subsumption/Language inclusion
593///
594/// A regular expression R subsumes another regular expression S is
595/// the language of S is included in the language of R. We try to
596/// detect subsumptions to simplify unions and intersections
597/// of regular expressions.
598///
599/// We do this when R is a simple pattern, that is, a concatenation
600/// of Range and Loop REs. In this case, we can write R as a concatenation
601/// of basic patterns. Each basic pattern is either a sequence of Range
602/// expressions or a sequence of loop expressions. We say that a sequence
603/// of Range expression is a rigid pattern (e.g., it can be something like a string
604/// 'abc'). A sequence of loop expression is a flexible pattern (e.g., something like
605/// Sigma^*).
606///
607/// To check whether R subsumes S:
608/// - construct the list of basic patterns of R
609/// - first pass: find matches in S for all the rigid patterns of R. Each match is a slice of S
610///   say S[i, j]
611/// - second pass: the parts of S that are not matched in the first pass must now match
612///   flexible patterns of R in sequence.
613///
614/// Data structure to represent a basic pattern of array R of RegLan
615/// - start and end are indices in R such that start < end
616/// - this means that R[start, end-1] is a base pattern
617/// - if we find a match for R in an array S, we set
618///   start_match and end_match to mean that S[start_match, end_match-1]
619///   matches the base_pattern. We must have start_match <= end_match.
620///
621#[derive(Debug)]
622struct BasePattern {
623    start: usize,
624    end: usize,
625    is_rigid: bool,
626    start_match: usize,
627    end_match: usize,
628}
629
630impl BasePattern {
631    fn len(&self) -> usize {
632        self.end - self.start
633    }
634
635    fn set_match(&mut self, start: usize, end: usize) {
636        self.start_match = start;
637        self.end_match = end;
638    }
639
640    fn make(start: usize, end: usize, is_rigid: bool) -> BasePattern {
641        debug_assert!(start < end);
642        BasePattern {
643            start,
644            end,
645            is_rigid,
646            start_match: 0,
647            end_match: 0,
648        }
649    }
650}
651
652impl Display for BasePattern {
653    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
654        let k = if self.is_rigid { "Rigid" } else { "Flex" };
655        write!(f, "{}({}, {})", k, self.start, self.end)
656    }
657}
658
659/// Construct a vector of base patterns from a slice of RegLan
660fn base_patterns(r: &[&RE]) -> Vec<BasePattern> {
661    let mut result = Vec::new();
662    if !r.is_empty() {
663        let mut j = 0;
664        let mut rigid_slice = r[0].expr.is_range();
665        for (i, re) in r.iter().enumerate().skip(1) {
666            let rigid_i = re.expr.is_range();
667            if rigid_slice != rigid_i {
668                result.push(BasePattern::make(j, i, rigid_slice));
669                rigid_slice = rigid_i;
670                j = i;
671            }
672        }
673        // last base_pattern
674        result.push(BasePattern::make(j, r.len(), rigid_slice))
675    }
676    result
677}
678
679/// Check whether s[i..pattern.len()-1] matches pattern
680/// Requires i + pattern.len() <= s.lend()
681fn rigid_match_at(pattern: &[&CharSet], s: &[&RE], i: usize) -> bool {
682    debug_assert!(i + pattern.len() <= s.len());
683    for j in 0..pattern.len() {
684        if !s[i + j].expr.match_char_set(pattern[j]) {
685            return false;
686        }
687    }
688    true
689}
690
691/// Search for a match of pattern in sequence s, starting at index i
692/// `pattern` is a rigid pattern represented as a sequence of CharSets
693fn next_rigid_match(pattern: &[&CharSet], s: &[&RE], i: usize) -> SearchResult {
694    let p_len = pattern.len();
695    let s_len = s.len();
696    if s_len >= p_len {
697        for j in i..=(s_len - p_len) {
698            if rigid_match_at(pattern, s, j) {
699                return SearchResult::Found(j, j + p_len);
700            }
701        }
702    }
703    SearchResult::NotFound
704}
705
706/// Search for a match of pattern in sequence s, starting from index i and going down
707/// `pattern` is a rigid pattern represented as a sequence of CharSets
708fn prev_rigid_match(pattern: &[&CharSet], s: &[&RE], i: usize) -> SearchResult {
709    let p_len = pattern.len();
710    for j in (p_len..=i).rev() {
711        if rigid_match_at(pattern, s, j - p_len) {
712            return SearchResult::Found(j - p_len, j);
713        }
714    }
715    SearchResult::NotFound
716}
717
718/// Collect the list of CharSets from a rigid pattern
719/// the pattern `p` is given as a list of regular expressions.
720/// each regular expression in `p` is assumed to be character Range.
721fn char_sets_of_pattern<'a>(p: &[&'a RE]) -> Vec<&'a CharSet> {
722    let mut result = Vec::new();
723    for r in p {
724        if let BaseRegLan::Range(s) = &r.expr {
725            result.push(s)
726        } else {
727            unreachable!()
728        }
729    }
730    result
731}
732
733/// Check whether a concatenation of REs starts with a rigid pattern
734/// - 'u' is a sequence of REs (concatenated)
735/// - 'v' and 'p' define the pattern to search for.
736///   `p` identifies a slice of 'v' (from p.start to p.end)
737///   this slice is a concatenation of Range expressions
738/// - return true if 'u' starts with a sequence of Range expressions
739///   of the same length as the pattern and that each range expression in `u` is
740///   included in the corresponding range expression in the pattern.
741fn rigid_prefix_match<'a>(u: &[&'a RE], v: &[&'a RE], p: &BasePattern) -> bool {
742    if u.len() >= p.len() {
743        let p = char_sets_of_pattern(&v[p.start..p.end]);
744        rigid_match_at(&p, u, 0)
745    } else {
746        false
747    }
748}
749
750/// Check whether a concatenation of REs end with a rigid pattern
751/// - 'u' is a sequence of REs
752/// - 'v' and 'p' define the pattern to search for
753/// - return true if 'u' ends with a sequence of REs that match this pattern
754fn rigid_suffix_match<'a>(u: &[&'a RE], v: &[&'a RE], p: &BasePattern) -> bool {
755    if u.len() >= p.len() {
756        let p = char_sets_of_pattern(&v[p.start..p.end]);
757        rigid_match_at(&p, u, u.len() - p.len())
758    } else {
759        false
760    }
761}
762
763/// Check whether all rigid patterns in a sequence are matched
764/// - `u` is a sequence/concatenation of REs
765/// - 'v' and 'patterns' define a sequence of rigid patterns
766/// - return true if all patterns defined by 'v' and 'patterns' are matched by
767///   successive, disjoint sub-sequences of `u`
768///
769/// This version does a left-to-right search in `u'
770#[allow(clippy::many_single_char_names)]
771fn find_rigid_matches<'a>(u: &[&'a RE], v: &[&'a RE], patterns: &mut [BasePattern]) -> bool {
772    let mut i = 0;
773    for p in patterns {
774        if p.is_rigid {
775            let pattern = char_sets_of_pattern(&v[p.start..p.end]);
776            match next_rigid_match(&pattern, u, i) {
777                SearchResult::NotFound => return false,
778                SearchResult::Found(j, k) => {
779                    p.set_match(j, k);
780                    i = k;
781                }
782            }
783        }
784    }
785    true
786}
787
788/// Same a [find_rigid_matches] but with a right-to-left search
789#[allow(clippy::many_single_char_names)]
790fn find_rigid_matches_rev<'a>(u: &[&'a RE], v: &[&'a RE], patterns: &mut [BasePattern]) -> bool {
791    let mut i = u.len();
792    for p in patterns.iter_mut().rev() {
793        if p.is_rigid {
794            let pattern = char_sets_of_pattern(&v[p.start..p.end]);
795            match prev_rigid_match(&pattern, u, i) {
796                SearchResult::NotFound => return false,
797                SearchResult::Found(j, k) => {
798                    p.set_match(j, k);
799                    i = j;
800                }
801            }
802        }
803    }
804    true
805}
806
807// set the matching regions of every flexible pattern of b after rigid patterns
808// have been matched. We use the fact that the predecessor and successor of a flexible
809// pattern p are both rigid.
810// So p.start_match = pred(p).end_match and p.end_match = suc(p).start_match.
811fn set_flexible_regions(p: &mut [BasePattern], string_len: usize) {
812    for i in 0..p.len() {
813        if !p[i].is_rigid {
814            let prev = if i == 0 { 0 } else { p[i - 1].end_match };
815            let next = if i == p.len() - 1 {
816                string_len
817            } else {
818                p[i + 1].start_match
819            };
820            p[i].set_match(prev, next);
821        }
822    }
823}
824
825// check whether u matches v
826// TBD: improve this
827fn flexible_match<'a>(_u: &[&'a RE], v: &[&'a RE]) -> bool {
828    v.len() == 1 && v[0].expr.is_full()
829}
830
831// try to complete a partial matching, where all rigid base_patterns are matched
832fn match_flexible_patterns<'a>(u: &[&'a RE], v: &[&'a RE], patterns: &mut [BasePattern]) -> bool {
833    if patterns.is_empty() {
834        // equivalent to matching with epsilon
835        u.is_empty()
836    } else {
837        set_flexible_regions(patterns, u.len());
838        for p in patterns {
839            if !p.is_rigid && !flexible_match(&u[p.start_match..p.end_match], &v[p.start..p.end]) {
840                return false;
841            }
842        }
843        true
844    }
845}
846
847// shift all base_patterns of p by delta (subtract delta)
848fn shift_pattern_start(patterns: &mut [BasePattern], delta: usize) {
849    for p in patterns {
850        debug_assert!(p.end > p.start && p.start >= delta);
851        p.start -= delta;
852        p.end -= delta;
853    }
854}
855
856// check whether (concat u) is a sub-language of (concat v)
857// - u and v are non-empty
858// - elements of u and v can be comp, inter, union, loop, range
859fn concat_inclusion<'a>(u: &[&'a RE], v: &[&'a RE]) -> bool {
860    let mut b = base_patterns(v);
861    let mut p = b.as_mut_slice();
862    let mut u = u;
863    let mut v = v;
864
865    // a rigid prefix must match
866    if let Some(pat) = p.first() {
867        if pat.is_rigid {
868            if rigid_prefix_match(u, v, pat) {
869                let len = pat.len();
870                p = &mut p[1..];
871                shift_pattern_start(p, len);
872                u = &u[len..];
873                v = &v[len..];
874            } else {
875                return false;
876            }
877        }
878    }
879
880    // a rigid suffix must match
881    if let Some(pat) = p.last() {
882        if pat.is_rigid {
883            if rigid_suffix_match(u, v, pat) {
884                let len = pat.len();
885                let p_len = p.len();
886                p = &mut p[..p_len - 1];
887                u = &u[..u.len() - len];
888                v = &v[..v.len() - len];
889            } else {
890                return false;
891            }
892        }
893    }
894
895    if find_rigid_matches(u, v, p) && match_flexible_patterns(u, v, p) {
896        return true;
897    }
898
899    if find_rigid_matches_rev(u, v, p) && match_flexible_patterns(u, v, p) {
900        return true;
901    }
902
903    false
904}
905
906///
907/// Check language inclusion
908///
909/// This relies on simple/incomplete checks
910/// - if the function returns true then `r` is a sub-language of `s`
911/// - otherwise, we don't know.
912///
913fn sub_language<'a>(r: &'a RE, s: &'a RE) -> bool {
914    use BaseRegLan::*;
915
916    if r == s {
917        true
918    } else {
919        match (&r.expr, &s.expr) {
920            (Empty, _) => true,
921            (_, Empty) => false,
922            (Epsilon, _) => s.nullable,
923            (_, Epsilon) => false,
924            (Complement(r1), Complement(s2)) => sub_language(s2, r1),
925
926            // for union and intersection, we iterate through the list only if the other side is simple
927            // (i.e., it's either an atom or a concatenation/loop)
928            (_, Union(list)) => {
929                r.expr.concat_or_atomic() && list.iter().any(|&x| sub_language(r, x))
930            }
931            (Inter(list), _) => {
932                s.expr.concat_or_atomic() && list.iter().any(|&x| sub_language(x, s))
933            }
934            (Union(list), _) => {
935                s.expr.concat_or_atomic() && list.iter().all(|&x| sub_language(x, s))
936            }
937            (_, Inter(list)) => {
938                r.expr.concat_or_atomic() && list.iter().all(|&x| sub_language(r, x))
939            }
940
941            // concatenation of loops and ranges
942            (_, _) => {
943                let u = decompose_concat(r);
944                let v = decompose_concat(s);
945                concat_inclusion(&u, &v)
946            }
947        }
948    }
949}
950
951///
952/// Simplify a vector for building unions or intersections of languages
953/// - v = a vector of languages
954/// - bottom = neutral element
955/// - top = absorbing element
956///
957/// The function updates v to remove neutral elements and complementary pairs.
958/// This implements the following simplification rules where op is either union
959/// or intersection:
960///  - op(X, bottom) = X
961///  - op(top, X) = top
962///  - op(X, complement(X)) = top
963///  - op(X, X) = X
964///
965/// For op=intersection, we must have bottom = full and top = empty
966///
967/// For op=union, we must have bottom = empty and top = full
968//
969// We use the property that X and complement(X) have successive ids
970// so after sorting v, X and complement(X) occur next to each other in v.
971//
972fn simplify_set_operation<'a>(v: &mut Vec<&'a RE>, bottom: &'a RE, top: &'a RE) {
973    if !v.is_empty() {
974        v.sort();
975        v.dedup();
976        if contains(v, top) {
977            set_to_singleton(v, top)
978        } else {
979            let mut j = 0;
980            let mut previous = v[0];
981            if previous != bottom {
982                v[j] = previous;
983                j += 1;
984            }
985            for i in 1..v.len() {
986                let current = v[i];
987                if current.id == previous.id + 1 && previous.id % 2 == 0 {
988                    // current is the complement of previous
989                    set_to_singleton(v, top);
990                    return;
991                }
992                if current != bottom {
993                    v[j] = current;
994                    previous = current;
995                    j += 1;
996                }
997            }
998            v.truncate(j)
999        }
1000    }
1001}
1002
1003///
1004/// Pairs RegLan, ClassId used as keys in hash maps.
1005///
1006/// - Some(i) means the i-th interval of an RE's deriv_class
1007/// - None means the RE's complementary class
1008///
1009#[derive(Debug, PartialEq, Eq, Hash)]
1010struct DerivKey(RegLan, ClassId);
1011
1012/// A store for constructing regular expressions using hash-consing.
1013///
1014/// The store ensures that each regular expression has a unique integer id.
1015///
1016/// For all regular expressions e1 and e2 constructed with the same manager,
1017/// we have e1.expr == e2.expr iff e1.id == e2.id
1018///
1019/// # Examples
1020///
1021/// This example shows how to create the regular expression `(ac + bc)*` and
1022/// compute its derivatives.
1023///
1024/// ```
1025/// use aws_smt_strings::regular_expressions::*;
1026///
1027/// let re = &mut ReManager::new();
1028/// let ac = re.str(&"ac".into());  // ac
1029/// let bc = re.str(&"bc".into());  // bc
1030/// let sum = re.union(ac, bc);     // ac + bc
1031/// let e = re.star(sum);           // (ac + bc)*
1032///
1033/// let d1 = re.char_derivative(e, 'a' as u32); // derivative of e w.r.t. 'a'
1034/// let d2 = re.char_derivative(e, 'b' as u32); // derivative of e w.r.t. 'b'
1035///
1036/// // by hash-consing: d1 and d2 are equal
1037/// assert_eq!(d1, d2);
1038/// assert!(std::ptr::eq(d1, d2));
1039/// ```
1040//
1041// We maintain the invariant that x and complement(x) have successive
1042// ids.
1043#[derive(Debug)]
1044pub struct ReManager {
1045    store: Store<RE>,
1046    id2re: Vec<RegLan>, // map id to RE
1047    sigma: RegLan,      // all one-letter strings
1048    empty: RegLan,
1049    sigma_star: RegLan, // complement of empty (all strings)
1050    epsilon: RegLan,
1051    sigma_plus: RegLan, // complement of epsilon (all strings of positive length)
1052    deriv_cache: HashMap<DerivKey, RegLan>, // cache of known derivatives
1053}
1054
1055impl ReManager {
1056    /// Create a new ReManager
1057    pub fn new() -> Self {
1058        let mut store = Store::new();
1059        let sigma = store.make(BaseRegLan::Range(CharSet::all_chars()));
1060        let not_sigma = store.make(BaseRegLan::Complement(sigma));
1061        let empty = store.make(BaseRegLan::Empty);
1062        let sigma_star = store.make(BaseRegLan::Loop(sigma, LoopRange::star()));
1063        let epsilon = store.make(BaseRegLan::Epsilon);
1064        let sigma_plus = store.make(BaseRegLan::Loop(sigma, LoopRange::plus()));
1065        debug_assert_eq!(sigma.id, 0);
1066        debug_assert_eq!(not_sigma.id, 1);
1067        debug_assert_eq!(empty.id, 2);
1068        debug_assert_eq!(sigma_star.id, 3);
1069        debug_assert_eq!(epsilon.id, 4);
1070        debug_assert_eq!(sigma_plus.id, 5);
1071        ReManager {
1072            store,
1073            id2re: vec![sigma, not_sigma, empty, sigma_star, epsilon, sigma_plus],
1074            sigma,
1075            empty,
1076            sigma_star,
1077            epsilon,
1078            sigma_plus,
1079            deriv_cache: HashMap::new(),
1080        }
1081    }
1082
1083    fn id_to_re(&self, id: usize) -> RegLan {
1084        self.id2re[id]
1085    }
1086
1087    /// Internal hash-consing constructor
1088    ///
1089    /// - When we create X, we also create complement(X) to make
1090    ///   sure X and complement(X) have consecutive ids.
1091    fn make(&mut self, ast: BaseRegLan) -> RegLan {
1092        match ast {
1093            BaseRegLan::Complement(x) => self.id_to_re(x.id + 1),
1094            _ => {
1095                let i = self.store.counter;
1096                debug_assert!(i == self.id2re.len());
1097                let x = self.store.make(ast);
1098                debug_assert!(x.id <= i);
1099                if x.id == i {
1100                    // new term
1101                    let y = self.store.make(BaseRegLan::Complement(x));
1102                    debug_assert!(y.id == i + 1);
1103                    self.id2re.push(x);
1104                    self.id2re.push(y);
1105                }
1106                x
1107            }
1108        }
1109    }
1110
1111    /// The empty language
1112    ///
1113    /// # Example
1114    ///
1115    /// ```
1116    /// use aws_smt_strings::regular_expressions::*;
1117    ///
1118    /// let mut re = ReManager::new(); // create a manager
1119    /// let e = re.empty();        // get the empty language
1120    ///
1121    /// // no string belongs to e.
1122    /// assert!(! re.str_in_re(&"0129".into(), e));
1123    /// ```
1124    pub fn empty(&self) -> RegLan {
1125        self.empty
1126    }
1127
1128    /// The full language
1129    ///
1130    /// This language contains every strings. It's the complement of [empty](Self::empty).
1131    ///
1132    /// # Example
1133    ///
1134    /// ```
1135    /// use aws_smt_strings::regular_expressions::*;
1136    ///
1137    /// let mut re = ReManager::new(); // create a manager
1138    /// let e = re.full();        // get the full language
1139    ///
1140    /// // every string belongs to e.
1141    /// assert!(re.str_in_re(&"0129".into(), e));
1142    /// ```
1143    pub fn full(&self) -> RegLan {
1144        self.sigma_star
1145    }
1146
1147    /// The RE that contains only the empty string
1148    ///
1149    /// # Example
1150    ///
1151    /// ```
1152    /// use aws_smt_strings::regular_expressions::*;
1153    ///
1154    /// let mut re = ReManager::new();
1155    /// let e = re.epsilon();
1156    ///
1157    /// // the empty string belongs to e
1158    /// assert!(re.str_in_re(&"".into(), e));
1159    /// // a non-empty string does not belong to e
1160    /// assert!(! re.str_in_re(&"a".into(), e));
1161    /// ```
1162    pub fn epsilon(&self) -> RegLan {
1163        self.epsilon
1164    }
1165
1166    /// The RE that contains all non-empty strings
1167    ///
1168    /// This is the complement of [epsilon](Self::epsilon).
1169    ///
1170    /// # Example
1171    ///
1172    /// ```
1173    /// use aws_smt_strings::regular_expressions::*;
1174    ///
1175    /// let mut re = ReManager::new();
1176    /// let e = re.sigma_plus();
1177    ///
1178    /// // the empty string does not belong to e
1179    /// assert!(! re.str_in_re(&"".into(), e));
1180    /// // any non-empty string belongs to e
1181    /// assert!(re.str_in_re(&"a".into(), e));
1182    /// ```
1183    pub fn sigma_plus(&self) -> RegLan {
1184        self.sigma_plus
1185    }
1186
1187    /// Regular expression defined by a character set
1188    ///
1189    /// Return the regular expression that contains all single-character
1190    /// strings with characters in the specified set. See also [range](Self::range).
1191    ///
1192    /// # Example
1193    ///
1194    /// Lower-case letters in ASCII.
1195    /// ```
1196    /// use aws_smt_strings::regular_expressions::*;
1197    /// use aws_smt_strings::character_sets::*;
1198    ///
1199    /// let mut re = ReManager::new();
1200    /// let set = CharSet::range('a' as u32, 'z' as u32);
1201    /// let e = re.char_set(set);
1202    ///
1203    /// // a single-character string that's in e
1204    /// assert!(re.str_in_re(&"c".into(), e));
1205    /// // a single-character string that's not in e
1206    /// assert!(!re.str_in_re(&"7".into(), e));
1207    /// // strings with more than one characters are not in e
1208    /// assert!(!re.str_in_re(&"abc".into(), e));
1209    /// ```
1210    pub fn char_set(&mut self, set: CharSet) -> RegLan {
1211        self.make(BaseRegLan::Range(set))
1212    }
1213
1214    ///
1215    /// Complement of a language
1216    ///
1217    /// Return the complement of RE `e`.
1218    ///
1219    /// # Example
1220    ///
1221    /// ```
1222    /// use aws_smt_strings::regular_expressions::*;
1223    ///
1224    /// let mut re = ReManager::new();
1225    /// let a_single_digit = re.range('0' as u32, '9' as u32);
1226    /// let not_a_digit = re.complement(a_single_digit);
1227    ///
1228    /// assert!(re.str_in_re(&"7".into(), a_single_digit));
1229    /// assert!(! re.str_in_re(&"7".into(), not_a_digit));
1230    ///
1231    /// // any string of more than 2 characters is not a single digit!
1232    /// assert!(re.str_in_re(&"94".into(), not_a_digit))
1233    /// ```
1234    pub fn complement(&mut self, e: RegLan) -> RegLan {
1235        self.id_to_re(e.id ^ 1)
1236    }
1237
1238    /// Concatenation of two languages
1239    ///
1240    /// Concatenate languages `e1` and `e2` in that order.
1241    ///
1242    /// # Example
1243    ///
1244    /// ```
1245    /// use aws_smt_strings::regular_expressions::*;
1246    ///
1247    /// let mut re = ReManager::new();
1248    /// let a_letter = re.range('a' as u32, 'z' as u32);
1249    /// let a_digit = re.range('0' as u32, '9' as u32);
1250    /// let e = re.concat(a_letter, a_digit);
1251    ///
1252    /// assert!(re.str_in_re(&"h4".into(), e));
1253    /// ```
1254    pub fn concat(&mut self, e1: RegLan, e2: RegLan) -> RegLan {
1255        match (&e1.expr, &e2.expr) {
1256            // empty . R --> empty
1257            (BaseRegLan::Empty, _) => self.empty,
1258            (_, BaseRegLan::Empty) => self.empty,
1259            // epsilon . R --> R
1260            (BaseRegLan::Epsilon, _) => e2,
1261            (_, BaseRegLan::Epsilon) => e1,
1262            // R . R^[i,j] --> R^[i+1, j+1]
1263            (_, BaseRegLan::Loop(y, rng)) if *e1 == **y => {
1264                self.make(BaseRegLan::Loop(e1, rng.add_point(1)))
1265            }
1266            (BaseRegLan::Loop(x, rng), _) if *e2 == **x => {
1267                self.make(BaseRegLan::Loop(e2, rng.add_point(1)))
1268            }
1269            // R^[a,b] . R^[b,c] -> R^[a+b, b+c]
1270            (BaseRegLan::Loop(x, x_rng), BaseRegLan::Loop(y, y_rng)) if *x == *y => {
1271                self.make(BaseRegLan::Loop(x, x_rng.add(y_rng)))
1272            }
1273            // R . R -> R^2
1274            _ if *e1 == *e2 => self.make(BaseRegLan::Loop(e1, LoopRange::point(2))),
1275            // (R . S) . T -> R . (S . T)
1276            (BaseRegLan::Concat(x, y), _) => {
1277                let right = self.concat(y, e2);
1278                self.concat(x, right)
1279            }
1280            //
1281            _ => {
1282                if e1.nullable && e2 == self.sigma_star {
1283                    // S . Sigma^* --> Sigma^* if S is nullable
1284                    e2
1285                } else {
1286                    self.make(BaseRegLan::Concat(e1, e2))
1287                }
1288            }
1289        }
1290    }
1291
1292    /// Concatenation of multiple languages
1293    ///
1294    /// Build the concatenation of `a[0]`, `a[1]`, ... in this order.
1295    /// - return [epsilon](Self::epsilon) is `a` is empty.
1296    /// - return `a[0]` is `a.len() == 1`
1297    ///
1298    /// See [concat](Self::concat)
1299    ///
1300    /// # Example
1301    ///
1302    /// ```
1303    /// use aws_smt_strings::regular_expressions::*;
1304    ///
1305    /// let mut re = ReManager::new();
1306    ///
1307    /// let a_letter = re.range('a' as u32, 'z' as u32);
1308    /// let a_digit = re.range('0' as u32, '9' as u32);
1309    /// let e = re.concat_list([a_letter, a_letter, a_digit]);
1310    ///
1311    /// assert!(re.str_in_re(&"ab3".into(), e));
1312    /// ```
1313    pub fn concat_list(&mut self, a: impl IntoIterator<Item = RegLan>) -> RegLan {
1314        let mut v = Vec::new();
1315        for x in a {
1316            flatten_concat(x, &mut v);
1317        }
1318        let mut result = self.epsilon;
1319        for &x in v.iter().rev() {
1320            result = self.concat(x, result);
1321        }
1322        result
1323    }
1324
1325    /// Generalized loop
1326    ///
1327    /// Parameter `range` defines an interval of natural numbers `[i, j]` where `j` may be infinity.
1328    /// This method returns the regular language equal to the union of `e`<sup>k</sup> for k in the interval `[i, j]`.
1329    /// See [loop_ranges](crate::loop_ranges).
1330    ///
1331    /// # Example
1332    ///
1333    /// Regular expression that recognizes sequences of 3 to 5 digits.
1334    ///
1335    /// ```
1336    /// use aws_smt_strings::{regular_expressions::*, loop_ranges::*};
1337    ///
1338    /// let mut re = ReManager::new();
1339    ///
1340    /// let digits = re.range('0' as u32, '9' as u32);
1341    /// let e = re.mk_loop(digits, LoopRange::finite(3, 5));
1342    ///
1343    /// assert!(re.str_in_re(&"12345".into(), e));
1344    /// assert!(! re.str_in_re(&"123456".into(), e));
1345    /// ```
1346    pub fn mk_loop(&mut self, e: RegLan, range: LoopRange) -> RegLan {
1347        if range.is_zero() {
1348            // R ^ 0 --> epsilon
1349            self.epsilon
1350        } else if range.is_one() {
1351            // R ^ 1 --> R
1352            e
1353        } else {
1354            match &e.expr {
1355                // empty ^ [i, j] --> empty
1356                BaseRegLan::Empty => self.empty,
1357                // epsilon ^ [i, j] --> epsilon
1358                BaseRegLan::Epsilon => self.epsilon,
1359                // (R ^[i,j]) ^ [k, l] --> R ^[i *k, j*l] if the product is exact
1360                BaseRegLan::Loop(x, x_rng) if x_rng.right_mul_is_exact(&range) => {
1361                    self.make(BaseRegLan::Loop(x, x_rng.mul(&range)))
1362                }
1363                _ => self.make(BaseRegLan::Loop(e, range)),
1364            }
1365        }
1366    }
1367
1368    // Intersection of REs in v
1369    fn make_inter(&mut self, mut v: Vec<RegLan>) -> RegLan {
1370        simplify_set_operation(&mut v, self.sigma_star, self.empty);
1371        if contains(&v, self.epsilon) {
1372            // v contains epsilon: the intersection is either epsilon or empty
1373            if v.iter().all(|&r| r.nullable) {
1374                self.epsilon
1375            } else {
1376                self.empty
1377            }
1378        } else {
1379            match v.len() {
1380                0 => self.sigma_star,
1381                1 => v[0],
1382                _ => self.make(BaseRegLan::Inter(v.into())),
1383            }
1384        }
1385    }
1386
1387    // Union of REs in v
1388    fn make_union(&mut self, mut v: Vec<RegLan>) -> RegLan {
1389        // verbose version of 'sub_language'
1390        #[allow(dead_code)]
1391        fn is_included(r: RegLan, s: RegLan) -> bool {
1392            let result = sub_language(r, s);
1393            if result {
1394                println!("---> subsumption: {r} subsumed by {s}");
1395            }
1396            result
1397        }
1398
1399        // we skip the subsumption check when x == r
1400        // this works since there are no duplicates in a
1401        fn is_subsumed(r: RegLan, a: &[RegLan]) -> bool {
1402            // a.iter().any(|&x| x != r && is_included(r, x))
1403            a.iter().any(|&x| x != r && sub_language(r, x))
1404        }
1405
1406        fn remove_subsumed(a: &mut Vec<RegLan>) {
1407            let mut i = 0;
1408            while i < a.len() {
1409                if is_subsumed(a[i], a) {
1410                    a.remove(i);
1411                } else {
1412                    i += 1;
1413                }
1414            }
1415        }
1416
1417        simplify_set_operation(&mut v, self.empty, self.sigma_star);
1418        if v.len() >= 2 {
1419            remove_subsumed(&mut v);
1420        }
1421        match v.len() {
1422            0 => self.empty,
1423            1 => v[0],
1424            _ => self.make(BaseRegLan::Union(v.into())),
1425        }
1426    }
1427
1428    /// Intersection of two languages
1429    ///
1430    /// Return the intersection of `e1` and `e2`
1431    ///
1432    /// # Example
1433    ///
1434    /// ```
1435    /// use aws_smt_strings::regular_expressions::*;
1436    ///
1437    /// let mut re = ReManager::new();
1438    ///
1439    /// let sigma = re.all_chars();
1440    /// let b = re.exp(sigma, 4);    // all strings of length 4
1441    ///
1442    /// let digits = re.range('0' as u32, '9' as u32);
1443    /// let c = re.star(digits);    // all sequences of digits
1444    ///
1445    /// let e = re.inter(b, c);     // all sequences of four digits
1446    ///
1447    /// assert!(re.str_in_re(&"0000".into(), e));
1448    /// ```
1449    pub fn inter(&mut self, e1: RegLan, e2: RegLan) -> RegLan {
1450        let mut v = Vec::new();
1451        flatten_inter(e1, &mut v);
1452        flatten_inter(e2, &mut v);
1453        self.make_inter(v)
1454    }
1455
1456    /// Intersection of multiple languages
1457    ///
1458    /// This returns the intersection of `a[0]`, `a[1]`, etc.
1459    /// - return the full language (see [full](Self::full)) if `a` is empty
1460    /// - return `a[0]` if `a.len() == 1`
1461    /// - otherwise construct the intersection.
1462    ///
1463    /// See [inter](Self::inter) for an example.
1464    pub fn inter_list(&mut self, a: impl IntoIterator<Item = RegLan>) -> RegLan {
1465        let mut v = Vec::new();
1466        for r in a {
1467            flatten_inter(r, &mut v);
1468        }
1469        self.make_inter(v)
1470    }
1471
1472    /// Union of two languages
1473    ///
1474    /// Return the union of `e1` and `e2`.
1475    ///
1476    /// # Example
1477    ///
1478    /// ```
1479    /// use aws_smt_strings::regular_expressions::*;
1480    ///
1481    /// let mut re = ReManager::new();
1482    ///
1483    /// let abc = re.str(&"abc".into());
1484    /// let de = re.str(&"de".into());
1485    /// let u = re.union(abc, de);
1486    ///
1487    /// // u contains two strings: "abc" and "de"
1488    /// assert!(re.str_in_re(&"de".into(), u));
1489    /// assert!(re.str_in_re(&"abc".into(), u));
1490    /// ```
1491    pub fn union(&mut self, e1: RegLan, e2: RegLan) -> RegLan {
1492        let mut v = Vec::new();
1493        flatten_union(e1, &mut v);
1494        flatten_union(e2, &mut v);
1495        self.make_union(v)
1496    }
1497
1498    /// Union of several languages
1499    ///
1500    /// Return the union of `a[0]`, `a[1]`, ...
1501    /// - if `a` is empty, return the empty language (see [empty](Self::empty))
1502    /// - if `a.len() == 1`, return `a[0]`
1503    /// - otherwise, build the union.
1504    ///
1505    /// See [union](Self::union).
1506    ///
1507    pub fn union_list(&mut self, a: impl IntoIterator<Item = RegLan>) -> RegLan {
1508        let mut v = Vec::new();
1509        for r in a {
1510            flatten_union(r, &mut v);
1511        }
1512        self.make_union(v)
1513    }
1514
1515    /// Difference of two languages
1516    ///
1517    /// Return the difference of `e1` and `e2`. This is the same as
1518    /// the intersection of `e1` and the complement of `e2`.
1519    ///
1520    /// # Example
1521    ///
1522    /// ```
1523    /// use aws_smt_strings::regular_expressions::*;
1524    ///
1525    /// let mut re = ReManager::new();
1526    ///
1527    /// let sigma = re.all_chars();
1528    /// let b = re.exp(sigma, 4);    // all strings of length 4
1529    ///
1530    /// let digits = re.range('0' as u32, '9' as u32);
1531    /// let c = re.star(digits);    // all sequences of digits
1532    ///
1533    /// let e = re.diff(c, b);  // sequences of digits of length other than four
1534    ///
1535    /// assert!(re.str_in_re(&"".into(), e)); // the empty sequence is included
1536    /// assert!(! re.str_in_re(&"0000".into(), e));
1537    /// assert!(re.str_in_re(&"123456".into(), e));
1538    /// ```
1539    pub fn diff(&mut self, e1: RegLan, e2: RegLan) -> RegLan {
1540        let comp_e2 = self.complement(e2);
1541        self.inter(e1, comp_e2)
1542    }
1543
1544    /// Difference of several languages
1545    ///
1546    /// Return the difference of `e1` and all regular expressions of `a`
1547    /// - return `e1` if `a` is empty.
1548    ///
1549    pub fn diff_list(&mut self, e1: RegLan, a: impl IntoIterator<Item = RegLan>) -> RegLan {
1550        let mut v = Vec::new();
1551        flatten_inter(e1, &mut v);
1552        for r in a {
1553            flatten_inter(self.complement(r), &mut v);
1554        }
1555        self.make_inter(v)
1556    }
1557
1558    /// All one-character strings
1559    ///
1560    /// This is the same as `range(CharSet::all_chars)`. See [range](Self::range) and [CharSet::all_chars].
1561    ///
1562    /// See [diff](Self::diff) for an example.
1563    pub fn all_chars(&mut self) -> RegLan {
1564        self.sigma
1565    }
1566
1567    /// A character as a regular expression
1568    ///
1569    /// Return the language that contains the one-character string `x` and nothing else.
1570    /// - this is the same as `char_set(CharSet::singleton(x))`. See [char_set](Self::char_set)
1571    ///   and [CharSet::singleton].
1572    ///
1573    /// # Panics
1574    ///
1575    /// If x is not a valid SMT character (i.e., x > MAX_CHAR).
1576    ///
1577    /// # Example
1578    ///
1579    /// ```
1580    /// use aws_smt_strings::regular_expressions::*;
1581    ///
1582    /// let mut re = ReManager::new();
1583    ///
1584    /// let e = re.char('Z' as u32);
1585    ///
1586    /// assert!(re.str_in_re(&"Z".into(), e));
1587    /// ```
1588    pub fn char(&mut self, x: u32) -> RegLan {
1589        assert!(x <= MAX_CHAR);
1590        self.char_set(CharSet::singleton(x))
1591    }
1592
1593    /// Range of characters
1594    ///
1595    /// Return the language that contains all one-character strings that contains character in range [start, end]
1596    /// - this is the same as `char_set(CharSet::range(start, end))`. See [char_set](Self::char_set)
1597    ///   and [CharSet::range].
1598    ///
1599    /// # Panics
1600    ///
1601    /// If the range is empty (i.e., start > end) or if end > [MAX_CHAR].
1602    ///
1603    /// # Example
1604    ///
1605    /// ```
1606    /// use aws_smt_strings::regular_expressions::*;
1607    ///
1608    /// let mut re = ReManager::new();
1609    ///
1610    /// let e = re.range('0' as u32, '9' as u32);
1611    ///
1612    /// assert!(re.str_in_re(&"4".into(), e));
1613    /// ```
1614    pub fn range(&mut self, start: u32, end: u32) -> RegLan {
1615        assert!(start <= end && end <= MAX_CHAR);
1616        self.char_set(CharSet::range(start, end))
1617    }
1618
1619    /// Kleene star closure
1620    ///
1621    /// Return the star closure of language `e` (i.e., the concatenations of an arbitrary number
1622    /// of strings of `e`).
1623    ///
1624    /// # Example
1625    ///
1626    /// ```
1627    /// use aws_smt_strings::regular_expressions::*;
1628    ///
1629    /// let mut re = ReManager::new();
1630    ///
1631    /// let letters = re.range('a' as u32, 'z' as u32);
1632    /// let letter_sequences = re.star(letters);
1633    ///
1634    /// assert!(re.str_in_re(&"abcd".into(), letter_sequences));
1635    /// assert!(re.str_in_re(&"".into(), letter_sequences));
1636    /// assert!(! re.str_in_re(&"abc-def".into(), letter_sequences));
1637    /// ```
1638    pub fn star(&mut self, e: RegLan) -> RegLan {
1639        self.mk_loop(e, LoopRange::star())
1640    }
1641
1642    /// Kleene plus
1643    ///
1644    /// Return the closure of `e` (i.e., the concatenation of one or more strings of `e`)
1645    ///
1646    /// # Example
1647    ///
1648    /// ```
1649    /// use aws_smt_strings::regular_expressions::*;
1650    ///
1651    /// let mut re = ReManager::new();
1652    ///
1653    /// let letters = re.range('a' as u32, 'z' as u32);
1654    /// let letter_sequences = re.plus(letters);
1655    ///
1656    /// assert!(re.str_in_re(&"abcd".into(), letter_sequences));
1657    /// assert!(! re.str_in_re(&"".into(), letter_sequences));
1658    /// assert!(! re.str_in_re(&"abc-def".into(), letter_sequences));
1659    /// ```
1660    pub fn plus(&mut self, e: RegLan) -> RegLan {
1661        self.mk_loop(e, LoopRange::plus())
1662    }
1663
1664    /// Option
1665    ///
1666    /// Returns the union of [epsilon](Self::epsilon) and `e`.
1667    ///
1668    /// # Example
1669    ///
1670    /// ```
1671    /// use aws_smt_strings::regular_expressions::*;
1672    ///
1673    /// let mut re = ReManager::new();
1674    ///
1675    /// let e = re.char('A' as u32);
1676    /// let opt_e = re.opt(e);
1677    ///
1678    /// // Both "A" and the empty strings are in `opt_e`
1679    /// assert!(re.str_in_re(&"A".into(), opt_e));
1680    /// assert!(re.str_in_re(&"".into(), opt_e));
1681    /// ```
1682    pub fn opt(&mut self, e: RegLan) -> RegLan {
1683        self.mk_loop(e, LoopRange::opt())
1684    }
1685
1686    /// Exponentiation
1687    ///
1688    /// Concatenates `e` with itself `k` times.
1689    /// - return [epsilon](Self::epsilon) if `k==0`.
1690    ///
1691    /// # Example
1692    ///
1693    /// All strings of length 5.
1694    ///
1695    /// ```
1696    /// use aws_smt_strings::regular_expressions::*;
1697    ///
1698    /// let mut re = ReManager::new();
1699    ///
1700    /// let a = re.all_chars();
1701    /// let b = re.exp(a, 5);
1702    ///
1703    /// assert!(re.str_in_re(&"ABCDE".into(), b));
1704    /// assert!(! re.str_in_re(&"ABCD".into(), b));
1705    /// ```
1706    pub fn exp(&mut self, e: RegLan, k: u32) -> RegLan {
1707        self.mk_loop(e, LoopRange::point(k))
1708    }
1709
1710    /// Finite loop as defined in SMT-LIB
1711    ///
1712    /// - if `i <= j`, return the regular expression `Loop(e, [i, j])`.
1713    ///   See [mk_loop](Self::mk_loop)
1714    /// - if `i > j`, return the empty language. See [empty](Self::empty).
1715    ///
1716    /// # Example
1717    ///
1718    /// ```
1719    /// use aws_smt_strings::regular_expressions::*;
1720    ///
1721    /// let mut re = ReManager::new();
1722    ///
1723    /// let a = re.all_chars();
1724    /// let b = re.smt_loop(a, 3, 7); // strings of length 3 to 7
1725    ///
1726    /// assert!(re.str_in_re(&"abcdef".into(), b));
1727    /// ```
1728    pub fn smt_loop(&mut self, e: RegLan, i: u32, j: u32) -> RegLan {
1729        if i <= j {
1730            self.mk_loop(e, LoopRange::finite(i, j))
1731        } else {
1732            self.empty
1733        }
1734    }
1735
1736    /// Character range as defined in SMT-LIB
1737    ///
1738    /// - if `s1` and `s2` are both singleton strings, and `s1 <= s2` in the
1739    ///   lexicographic ordering, return self.range(CharSet::range(c1, c2)) where c1 = unique
1740    ///   character of s1 and c2 = unique character of s2.
1741    /// - otherwise, return the empty language
1742    ///
1743    /// ```
1744    /// use aws_smt_strings::regular_expressions::*;
1745    ///
1746    /// let mut re = ReManager::new();
1747    ///
1748    /// let b = re.smt_range(&"a".into(), &"z".into());
1749    ///
1750    /// assert!(re.str_in_re(&"h".into(), b));
1751    /// ```
1752    pub fn smt_range(&mut self, s1: &SmtString, s2: &SmtString) -> RegLan {
1753        if s1.len() == 1 && s2.len() == 1 {
1754            let c1 = s1.char(0);
1755            let c2 = s2.char(0);
1756            if c1 <= c2 {
1757                return self.char_set(CharSet::range(c1, c2));
1758            }
1759        }
1760        self.empty
1761    }
1762
1763    /// Language that contains a single string
1764    ///
1765    /// - Return the language that contains string `s` and nothing else.
1766    /// - This is the same as the SMT-LIB 'str.to_re' function.
1767    ///
1768    /// # Example
1769    ///
1770    /// ```
1771    /// use aws_smt_strings::regular_expressions::*;
1772    ///
1773    /// let mut re = ReManager::new();
1774    ///
1775    /// let s = re.str(&"alpha".into());
1776    ///
1777    /// assert!(re.str_in_re(&"alpha".into(), s));
1778    /// assert!(! re.str_in_re(&"beta".into(), s));
1779    /// ```
1780    pub fn str(&mut self, s: &SmtString) -> RegLan {
1781        let mut re = self.epsilon();
1782        for c in s.iter().rev() {
1783            let c = self.char(*c);
1784            re = self.concat(c, re);
1785        }
1786        re
1787    }
1788
1789    //
1790    // DERIVATIVES
1791    //
1792
1793    /// Compute the derivative w.r.t. c of all elements of list
1794    fn deriv_list(&mut self, list: &[RegLan], c: u32) -> Vec<RegLan> {
1795        list.iter().map(|r| self.deriv(r, c)).collect()
1796    }
1797
1798    /// Derivative of e with respect to a character c
1799    fn compute_derivative(&mut self, e: RegLan, c: u32) -> RegLan {
1800        match e.expr {
1801            BaseRegLan::Empty => self.empty,
1802            BaseRegLan::Epsilon => self.empty,
1803            BaseRegLan::Range(r) => {
1804                if r.contains(c) {
1805                    self.epsilon
1806                } else {
1807                    self.empty
1808                }
1809            }
1810            BaseRegLan::Concat(e1, e2) => {
1811                let d1 = self.deriv(e1, c);
1812                let d1 = self.concat(d1, e2);
1813                if e1.nullable {
1814                    let d2 = self.deriv(e2, c);
1815                    self.union(d1, d2)
1816                } else {
1817                    d1
1818                }
1819            }
1820            BaseRegLan::Loop(e1, range) => {
1821                let d1 = self.deriv(e1, c);
1822                let e2 = self.mk_loop(e1, range.shift());
1823                self.concat(d1, e2)
1824            }
1825            BaseRegLan::Complement(e1) => {
1826                let d1 = self.deriv(e1, c);
1827                self.complement(d1)
1828            }
1829            BaseRegLan::Inter(ref v) => {
1830                let d = self.deriv_list(&v[..], c);
1831                self.inter_list(d)
1832            }
1833            BaseRegLan::Union(ref v) => {
1834                let d = self.deriv_list(&v[..], c);
1835                self.union_list(d)
1836            }
1837        }
1838    }
1839
1840    /// Derivative with respect to a character c using the cache
1841    fn deriv(&mut self, e: RegLan, c: u32) -> RegLan {
1842        let cid = e.class_of_char(c);
1843        self.cached_deriv(e, cid)
1844    }
1845
1846    ///
1847    /// Check whether character c can start expression e
1848    ///
1849    /// # Example
1850    ///
1851    /// ```
1852    /// use aws_smt_strings::regular_expressions::*;
1853    ///
1854    /// let mut re = ReManager::new();
1855    ///
1856    /// let digits = re.range('0' as u32, '9' as u32);
1857    /// let s = re.star(digits);
1858    ///
1859    /// assert!(re.start_char(s, '1' as u32));
1860    /// assert!(! re.start_char(s, 'a' as u32));
1861    /// ```
1862    pub fn start_char(&mut self, e: RegLan, c: u32) -> bool {
1863        match &e.expr {
1864            BaseRegLan::Empty => false,
1865            BaseRegLan::Epsilon => false,
1866            BaseRegLan::Range(set) => set.contains(c),
1867            BaseRegLan::Concat(e1, e2) => {
1868                self.start_char(e1, c) || e1.nullable && self.start_char(e2, c)
1869            }
1870            BaseRegLan::Loop(e, _) => self.start_char(e, c),
1871            BaseRegLan::Inter(args) => args.iter().all(|x| self.start_char(x, c)),
1872            BaseRegLan::Union(args) => args.iter().any(|x| self.start_char(x, c)),
1873            BaseRegLan::Complement(_) => {
1874                // expensive case
1875                let d = self.deriv(e, c);
1876                !self.is_empty_re(d)
1877            }
1878        }
1879    }
1880
1881    ///
1882    /// Check whether all characters in class cid can start e
1883    /// - return Error if cid is not a valid class for e
1884    ///
1885    pub fn start_class(&mut self, e: RegLan, cid: ClassId) -> Result<bool, Error> {
1886        if e.valid_class_id(cid) {
1887            let c = e.pick_class_rep(cid);
1888            Ok(self.start_char(e, c))
1889        } else {
1890            Err(Error::BadClassId)
1891        }
1892    }
1893
1894    ///
1895    /// Cached derivative
1896    ///
1897    /// - cid identifies a derivative class of e.
1898    /// - cid must be either Interval(i) where i is the id of a deriv class of e
1899    ///   or Complement, which denotes the complementary class of e.
1900    ///
1901    /// In either cases, all characters in the class are equivalent:
1902    /// - if c1 and c2 are in deriv class i then deriv(e, c1) = deriv(e, c2)
1903    /// - if c1 and c2 are outside all deriv class then deriv(e, c1) = deriv(e, c2)
1904    ///
1905    /// This method panics in the following cases:
1906    /// - cid is Interval(i) but i is not the index of a derivative class of e
1907    /// - cid is Complementary but the complementary derivative class of e is empty.
1908    ///
1909    fn cached_deriv(&mut self, e: RegLan, cid: ClassId) -> RegLan {
1910        let key = DerivKey(e, cid);
1911        match self.deriv_cache.get(&key) {
1912            Some(&r) => r,
1913            None => {
1914                let c = e.pick_class_rep(cid);
1915                let r = self.compute_derivative(e, c);
1916                self.deriv_cache.insert(key, r);
1917                r
1918            }
1919        }
1920    }
1921
1922    ///
1923    /// Derivative with respect to a class id
1924    ///
1925    /// Compute the derivative of e with respect to a class defined by `cid`
1926    /// - if `cid` is `Interval(i)`: class = i-th derivative class of `e`
1927    /// - if `cid` is `Complement`: class = complementary derivative class of `e`
1928    ///
1929    /// Derivative classes of `e` are indexed from 0 to `n`-1 where `n` is the
1930    /// number of classes.
1931    ///
1932    /// # Errors
1933    ///
1934    /// Return Err([Error::BadClassId]) if the class id is invalid.
1935    /// - Class id `Interval(i)` is invalid if i is larger than or equal to the number
1936    ///   of derivative classes of `e`.
1937    /// - Class if `Complement` is invalid if the complementary class of e is empty.
1938    ///
1939    /// # Example
1940    ///
1941    /// ```
1942    /// use aws_smt_strings::{regular_expressions::*, character_sets::*};
1943    /// # use std::error::Error;
1944    /// #
1945    /// # fn main() -> Result<(), Box<dyn Error>> {
1946    /// let mut re = ReManager::new();
1947    ///
1948    /// let abc = re.str(&"abc".into());
1949    /// let efg = re.str(&"efg".into());
1950    /// let r = re.union(abc, efg); // 'abc' + 'efg': two derivative classes
1951    ///
1952    /// let n = r.num_deriv_classes();
1953    /// assert_eq!(n, 2);
1954    ///
1955    /// let test1 = re.class_derivative(r, ClassId::Interval(0))?;
1956    /// let test2 = re.class_derivative(r, ClassId::Interval(1))?;
1957    /// let test3 = re.class_derivative(r, ClassId::Complement)?;
1958    ///
1959    /// assert_eq!(test1, re.str(&"bc".into()));
1960    /// assert_eq!(test2, re.str(&"fg".into()));
1961    /// assert_eq!(test3, re.empty());
1962    /// # Ok(())
1963    /// # }
1964    /// ```
1965    pub fn class_derivative(&mut self, e: RegLan, cid: ClassId) -> Result<RegLan, Error> {
1966        if e.valid_class_id(cid) {
1967            Ok(self.cached_deriv(e, cid))
1968        } else {
1969            Err(Error::BadClassId)
1970        }
1971    }
1972
1973    ///
1974    /// Unchecked derivative with respect to a class id
1975    ///
1976    /// Compute the derivative of e with respect to a class defined by cid.
1977    /// Does not check whether cid is a valid class id for e.
1978    /// See [class_derivative](Self::class_derivative)
1979    ///
1980    /// # Panics
1981    ///
1982    /// If cid is not a valid class id for e.
1983    ///
1984    pub fn class_derivative_unchecked(&mut self, e: RegLan, cid: ClassId) -> RegLan {
1985        self.cached_deriv(e, cid)
1986    }
1987
1988    ///
1989    /// Derivative with respect to a character set
1990    ///
1991    /// Return the derivative of e with respect to c provided this is well defined.
1992    ///
1993    /// # Errors
1994    ///
1995    /// The derivative with respect to c is well defined either if c is included in a
1996    /// derivative class of e or if c is included in the complementary class.
1997    /// If these conditions do not hold, the method return Err([Error::UndefinedDerivative]).
1998    /// See [Error].
1999    ///
2000    /// # Example
2001    /// ```
2002    /// use aws_smt_strings::{regular_expressions::*, character_sets::*};
2003    /// # use std::error::Error;
2004    /// #
2005    /// # fn main() -> Result<(), Box<dyn Error>> {
2006    /// let mut re = ReManager::new();
2007    ///
2008    /// let a_to_z = re.range('a' as u32, 'z' as u32);
2009    /// let e = re.plus(a_to_z); // non-empty sequences of lower-case ascii letters
2010    ///
2011    /// // the derivative of e w.r.t. ['c', 't'] is defined.
2012    /// let test = re.set_derivative(e, &CharSet::range('c' as u32, 't' as u32))?;
2013    ///
2014    /// assert_eq!(test, re.star(a_to_z));
2015    /// # Ok(())
2016    /// # }
2017    /// ```
2018    ///
2019    pub fn set_derivative(&mut self, e: RegLan, c: &CharSet) -> Result<RegLan, Error> {
2020        let cid = e.class_of_set(c)?;
2021        Ok(self.cached_deriv(e, cid))
2022    }
2023
2024    ///
2025    /// Unchecked derivative with respect to a character set.
2026    ///
2027    /// See [derivative](Self::set_derivative).
2028    ///
2029    /// # Panics
2030    ///
2031    /// If the derivative is not defined for this character set.
2032    ///
2033    pub fn set_derivative_unchecked(&mut self, e: RegLan, c: &CharSet) -> RegLan {
2034        let cid = e.class_of_set(c).unwrap();
2035        self.cached_deriv(e, cid)
2036    }
2037
2038    ///
2039    /// Derivative with respect to a character
2040    ///
2041    /// The derivative of e with respect to c is a regular expression e1 such
2042    /// every string of e that starts with c is formed by concatenating c and
2043    /// a string of e1. So the language of e1 is
2044    ///  L(e1) = { w | c.w is in L(e) }
2045    ///
2046    /// # Example
2047    ///
2048    /// ```
2049    /// use aws_smt_strings::regular_expressions::*;
2050    ///
2051    /// fn sum_of_str(re: &mut ReManager, s1: &str, s2: &str) -> RegLan {
2052    ///     let s1 = re.str(&s1.into());
2053    ///     let s2 = re.str(&s2.into());
2054    ///     re.union(s1, s2)
2055    /// }
2056    ///
2057    /// let re = &mut ReManager::new();
2058    /// let e = sum_of_str(re, "abc", "acc");
2059    ///
2060    /// // e is 'abc + acc'
2061    /// // the derivative of e w.r.t. 'a' is 'bc + cc'
2062    /// let d = re.char_derivative(e, 'a' as u32);
2063    /// assert_eq!(d, sum_of_str(re, "bc", "cc"));
2064    /// ```
2065    pub fn char_derivative(&mut self, e: RegLan, c: u32) -> RegLan {
2066        debug_assert!(c <= MAX_CHAR);
2067        self.deriv(e, c)
2068    }
2069
2070    ///
2071    /// Derivative with respect to a string
2072    ///
2073    /// The derivative with respect to s is defined by induction on the length of s:
2074    /// - if s is empty, deriv(e, s) = e
2075    /// - if s is of the form a.w, then deriv(e, s) = deriv(w, deriv(a, s))
2076    ///
2077    /// # Example
2078    ///
2079    /// ```
2080    /// use aws_smt_strings::regular_expressions::*;
2081    ///
2082    /// let re = &mut ReManager::new();
2083    /// let abc = re.str(&"abc".into());
2084    /// let acc = re.str(&"acc".into());
2085    /// let e = re.union(abc, acc);
2086    ///
2087    /// // e is 'abc + acc'
2088    /// // the derivative of e with respect to "ab" is 'c'
2089    /// let d1 = re.str_derivative(e, &"ab".into());
2090    /// assert_eq!(d1, re.char('c' as u32));
2091    ///
2092    /// // the derivative of e with respect to the empty string is e
2093    /// let d2 = re.str_derivative(e, &"".into());
2094    /// assert_eq!(d2, e);
2095    /// ```
2096    pub fn str_derivative(&mut self, e: RegLan, s: &SmtString) -> RegLan {
2097        s.iter().fold(e, |r, &c| self.char_derivative(r, c))
2098    }
2099
2100    /// Construct an iterator to list the derivatives of a regular expression
2101    ///
2102    /// The iterator produces `e`, then the derivatives of `e`, then the derivatives
2103    /// of these derivatives, and so forth. There are finitely many such derivatives.
2104    /// The iterator produces them without duplicates.
2105    pub fn iter_derivatives(&mut self, e: RegLan) -> DerivativeIterator<'_> {
2106        let mut queue = BfsQueue::new();
2107        queue.push(e);
2108        DerivativeIterator {
2109            manager: self,
2110            queue,
2111        }
2112    }
2113
2114    ///
2115    /// Check whether a string belongs to the language defined by a regular expression
2116    ///
2117    /// # Example
2118    ///
2119    /// ```
2120    /// use aws_smt_strings::regular_expressions::*;
2121    ///
2122    /// let re = &mut ReManager::new();
2123    ///
2124    /// // Build regular expression (ac + bc)*
2125    /// let ac = re.str(&"ac".into());
2126    /// let bc = re.str(&"bc".into());
2127    /// let sum = re.union(ac, bc);
2128    /// let e = re.star(sum);
2129    ///
2130    /// // Check that "acacbc" is in the language
2131    /// assert!(re.str_in_re(&"acacbc".into(), e))
2132    /// ```
2133    ///
2134    pub fn str_in_re(&mut self, s: &SmtString, e: RegLan) -> bool {
2135        self.str_derivative(e, s).nullable
2136    }
2137
2138    ///
2139    /// Check whether a regular expression is empty
2140    ///
2141    /// # Example
2142    /// ```
2143    /// use aws_smt_strings::regular_expressions::*;
2144    ///
2145    /// let re = &mut ReManager::new();
2146    ///
2147    /// let full = re.full();
2148    /// let abcd = re.str(&"abcd".into());
2149    /// let bc = re.str(&"bc".into());
2150    ///
2151    /// let a = re.concat(abcd, full); // strings that start with 'abcd'
2152    /// let b = re.concat_list([full, bc, full]); // strings that contain 'bc'
2153    ///
2154    /// let test = re.diff(a, b); // strings that start with 'abcd' but don't contain 'bc'
2155    /// assert!(re.is_empty_re(test));
2156    /// ```
2157    pub fn is_empty_re(&mut self, e: RegLan) -> bool {
2158        self.iter_derivatives(e).all(|x| !x.nullable)
2159    }
2160
2161    /// Check whether a regular expression is empty, with bounded derivative exploration
2162    ///
2163    /// This is a resource-bounded version of [`is_empty_re`](Self::is_empty_re) that limits the
2164    /// number of derivatives computed to avoid potentially expensive operations.
2165    ///
2166    /// # Returns
2167    /// - `None` if the derivative limit was reached before determining emptiness, otherwise
2168    /// - `Some(true)` if the language is empty
2169    /// - `Some(false)` if the language is non-empty
2170    ///
2171    /// # Example
2172    ///
2173    /// ```
2174    /// use aws_smt_strings::regular_expressions::*;
2175    /// let mut re = ReManager::new();
2176    /// let abc = re.str(&"abc".into());
2177    /// let def = re.str(&"def".into());
2178    /// let intersection = re.inter(abc, def);
2179    ///
2180    /// let small_bound_result = re.is_empty_re_bounded(intersection, 1);
2181    /// assert_eq!(small_bound_result, None);
2182    ///
2183    /// let result = re.is_empty_re_bounded(intersection, 3);
2184    /// assert_eq!(result, Some(true));
2185    /// ```
2186    pub fn is_empty_re_bounded(&mut self, e: RegLan, max_derivatives: usize) -> Option<bool> {
2187        for (count, d) in self.iter_derivatives(e).enumerate() {
2188            if count >= max_derivatives {
2189                return None;
2190            }
2191            if d.nullable {
2192                return Some(false);
2193            }
2194        }
2195        Some(true)
2196    }
2197
2198    ///
2199    /// Search for a symbolic string of e
2200    /// - the result is None, if e is an empty regular expression.
2201    /// - otherwise the result is Some(list or pairs (RegLan, ClassId) such that:
2202    ///   1) in each pair, the classId is valid for the RegLan
2203    ///   2) the RegLan in the first pair is e
2204    ///   3) in two successive pairs (r1, cid1) (r2, cid2),
2205    ///      we have r2 = class_derivative(r1, cid1)
2206    ///   4) for the last pair in the list (r, cid), the derivative
2207    ///      of r w.r.t. cid is nullable.
2208    /// - the list is empty if e itself is nullable (this represents the empty string)
2209    ///
2210    fn get_string_path(&mut self, e: RegLan) -> Option<Vec<(RegLan, ClassId)>> {
2211        let mut queue: LabeledQueue<RegLan, ClassId> = LabeledQueue::new(e);
2212        while let Some(r) = queue.pop() {
2213            if r.nullable {
2214                return queue.full_path(&r);
2215            } else {
2216                for cid in r.class_ids() {
2217                    let d = self.class_derivative_unchecked(r, cid);
2218                    queue.push(r, cid, d);
2219                }
2220            }
2221        }
2222        None
2223    }
2224
2225    ///
2226    /// Get a string that belongs to a regular expression
2227    ///
2228    /// Return None if the regular expression `e` is empty.
2229    ///
2230    /// # Example
2231    /// ```
2232    /// use aws_smt_strings::{regular_expressions::*, smt_strings::*};
2233    ///
2234    /// let re = &mut ReManager::new();
2235    ///
2236    /// let str1 = SmtString::from("abc");
2237    /// let str2 = SmtString::from("bcd");
2238    ///
2239    /// let abc = re.str(&str1);
2240    /// let bcd = re.str(&str2);
2241    /// let u = re.union(abc, bcd);
2242    ///
2243    /// let str = re.get_string(u);
2244    ///
2245    /// assert!(str == Some(str1) || str == Some(str2));
2246    /// ```
2247    pub fn get_string(&mut self, e: RegLan) -> Option<SmtString> {
2248        match self.get_string_path(e) {
2249            None => None,
2250            Some(path) => {
2251                let result: Vec<u32> = path
2252                    .iter()
2253                    .map(|(re, cid)| re.pick_class_rep(*cid))
2254                    .collect();
2255                Some(result.into())
2256            }
2257        }
2258    }
2259
2260    //
2261    // try to compile to a DFA of no more than max_states.
2262    // return None if that fails (i.e., if the automaton will have more than max_states)
2263    //
2264    fn compile_with_bound(&mut self, e: RegLan, max_states: usize) -> Option<Automaton> {
2265        if max_states == 0 {
2266            None
2267        } else {
2268            let mut builder = AutomatonBuilder::new(&e.expr);
2269            let mut queue = BfsQueue::new();
2270            let mut state_count = 0;
2271            queue.push(e);
2272            while let Some(e) = queue.pop() {
2273                debug_assert!(state_count <= max_states);
2274                if state_count == max_states {
2275                    return None;
2276                }
2277                state_count += 1;
2278                for set in e.char_ranges() {
2279                    let d = self.set_derivative_unchecked(e, set);
2280                    queue.push(d);
2281                    builder.add_transition(&e.expr, set, &d.expr);
2282                }
2283                if !e.empty_complement() {
2284                    let d = self.class_derivative_unchecked(e, ClassId::Complement);
2285                    queue.push(d);
2286                    builder.set_default_successor(&e.expr, &d.expr);
2287                }
2288                if e.nullable {
2289                    builder.mark_final(&e.expr);
2290                }
2291            }
2292            Some(builder.build_unchecked())
2293        }
2294    }
2295
2296    ///
2297    /// Compile a regular expression to a deterministic finite state automaton
2298    ///
2299    /// # Example
2300    ///
2301    /// ```
2302    /// use aws_smt_strings::{regular_expressions::*, automata::*};
2303    ///
2304    /// let re = &mut ReManager::new();
2305    ///
2306    /// // (ac + bc)*
2307    /// let ac = re.str(&"ac".into());
2308    /// let bc = re.str(&"bc".into());
2309    /// let sum = re.union(ac, bc);
2310    /// let e = re.star(sum);
2311    ///
2312    /// // convert e to an automaton
2313    /// let auto = re.compile(e);
2314    ///
2315    /// // string accepted by the automaton
2316    /// assert!(auto.accepts(&"acbcbc".into()))
2317    /// ```
2318    pub fn compile(&mut self, e: RegLan) -> Automaton {
2319        self.compile_with_bound(e, usize::MAX).unwrap()
2320    }
2321
2322    ///
2323    /// Compile a regular expression to a DFA of bounded size
2324    ///
2325    /// Try to compile a regular expression `e` to a deterministic finite-state automaton
2326    /// of size no more than `max_states`.
2327    /// - e: regular expression
2328    /// - max_states: bound
2329    ///
2330    /// Return None if the DFA has more than `max_states`
2331    /// Return `Some(a)` otherwise where `a` is the automaton
2332    ///
2333    /// # Example
2334    ///
2335    /// ```
2336    /// use aws_smt_strings::{regular_expressions::*, automata::*};
2337    ///
2338    /// let re = &mut ReManager::new();
2339    ///
2340    /// // (ac + bc)+
2341    /// let ac = re.str(&"ac".into());
2342    /// let bc = re.str(&"bc".into());
2343    /// let sum = re.union(ac, bc);
2344    /// let e = re.plus(sum);
2345    ///
2346    /// // the smallest DFA that recognizes e has four states
2347    /// let test1 = re.try_compile(e, 3);
2348    /// assert!(test1.is_none());
2349    ///
2350    /// let test2 = re.try_compile(e, 4);
2351    /// assert!(test2.is_some());
2352    /// ```
2353    pub fn try_compile(&mut self, e: RegLan, max_states: usize) -> Option<Automaton> {
2354        self.compile_with_bound(e, max_states)
2355    }
2356}
2357
2358impl Default for ReManager {
2359    fn default() -> Self {
2360        Self::new()
2361    }
2362}
2363
2364/// Iterator to enumerate all the derivatives of a regular expression
2365///
2366/// See [iter_derivatives](crate::regular_expressions::ReManager::iter_derivatives).
2367#[derive(Debug)]
2368pub struct DerivativeIterator<'a> {
2369    queue: BfsQueue<RegLan>,
2370    manager: &'a mut ReManager,
2371}
2372
2373impl<'a> Iterator for DerivativeIterator<'a> {
2374    type Item = &'a RE;
2375
2376    fn next(&mut self) -> Option<Self::Item> {
2377        if let Some(r) = self.queue.pop() {
2378            for cid in r.class_ids() {
2379                let d = self.manager.class_derivative_unchecked(r, cid);
2380                self.queue.push(d);
2381            }
2382            Some(r)
2383        } else {
2384            None
2385        }
2386    }
2387}
2388
2389#[allow(clippy::uninlined_format_args)]
2390#[cfg(test)]
2391mod tests {
2392    use crate::smt_strings::char_to_smt;
2393
2394    use super::*;
2395
2396    #[allow(clippy::uninlined_format_args)]
2397    fn print_term(name: &str, r: RegLan) {
2398        println!("term {} = {}", name, r);
2399        println!("   ptr:       {:p}", r);
2400        println!("   id:        {}", r.id);
2401        println!("   nullable:  {}", r.nullable);
2402        println!("   singleton: {}", r.singleton);
2403        println!("   pattern:   {}", r.simple_pattern);
2404        println!("   deriv:     {}", r.deriv_class);
2405        println!();
2406    }
2407
2408    fn build_atoms(re: &mut ReManager) -> Vec<RegLan> {
2409        vec![
2410            re.empty(),
2411            re.epsilon(),
2412            re.all_chars(),
2413            re.char('a' as u32),
2414            re.char('b' as u32),
2415            re.range('0' as u32, '9' as u32),
2416            re.range('A' as u32, 'Z' as u32),
2417        ]
2418    }
2419
2420    fn build_test_res(re: &mut ReManager) -> Vec<RegLan> {
2421        let mut v = build_atoms(re);
2422        let w = v.clone();
2423
2424        for &x in &w {
2425            v.push(re.complement(x));
2426            v.push(re.opt(x));
2427            v.push(re.star(x));
2428            v.push(re.plus(x));
2429            v.push(re.exp(x, 2));
2430        }
2431        for &x in &w {
2432            for &y in &w {
2433                v.push(re.concat(x, y));
2434                v.push(re.inter(x, y));
2435                v.push(re.union(x, y));
2436            }
2437        }
2438        v.sort();
2439        v.dedup();
2440        v
2441    }
2442
2443    fn check_equal(re1: RegLan, re2: RegLan) {
2444        assert_eq!(re1, re2);
2445        assert_eq!(re1.id, re2.id);
2446        assert!(std::ptr::eq(re1, re2));
2447    }
2448
2449    #[test]
2450    fn hash_atoms() {
2451        let re = &mut ReManager::new();
2452
2453        let v1 = build_atoms(re);
2454        let v2 = build_atoms(re);
2455
2456        for (i, &t) in v1.iter().enumerate() {
2457            let name = format!("t{i}");
2458            print_term(&name, t);
2459            check_equal(t, v2[i]);
2460        }
2461    }
2462
2463    #[test]
2464    fn test_loop() {
2465        let re = &mut ReManager::new();
2466
2467        let v = build_atoms(re);
2468
2469        for &t in &v {
2470            let x = re.star(t);
2471            print_term(&format!("star({t})"), x);
2472            check_equal(x, re.star(t));
2473        }
2474
2475        for &t in &v {
2476            let x = re.plus(t);
2477            print_term(&format!("plus({t})"), x);
2478            check_equal(x, re.plus(t));
2479        }
2480
2481        for &t in &v {
2482            let x = re.opt(t);
2483            print_term(&format!("opt({t})"), x);
2484            check_equal(x, re.opt(t));
2485        }
2486
2487        for &t in &v {
2488            for k in 0..3 {
2489                let x = re.exp(t, k);
2490                print_term(&format!("exp({t}, {k})"), x);
2491                check_equal(x, re.exp(t, k));
2492            }
2493        }
2494
2495        let a = re.all_chars();
2496        let a2 = re.exp(a, 2);
2497        let a2_star = re.star(a2);
2498        let a2_plus = re.plus(a2);
2499        let a_star = re.star(a);
2500        let a_plus = re.plus(a);
2501        let a_star2 = re.exp(a_star, 2);
2502        let a_star_star = re.star(a_star);
2503        let a_plus_star = re.star(a_plus);
2504        let a_star_plus = re.plus(a_star);
2505        print_term("(Sigma^2)^*", a2_star);
2506        print_term("(Sigma^2)^+", a2_plus);
2507        print_term("(Sigma^*)^2", a_star2);
2508        print_term("(Sigma^*)^*", a_star_star);
2509        print_term("(Sigma^*)^+)", a_star_plus);
2510        print_term("(Sigma^+)^*", a_plus_star);
2511
2512        assert_eq!(a_star2, a_star);
2513        assert_ne!(a2_star, a_star);
2514        assert_ne!(a2_star, a2_plus);
2515        assert_ne!(a2_plus, a_star);
2516        assert_eq!(a_plus_star, a_star);
2517        assert_eq!(a_star_plus, a_star);
2518        assert_eq!(a_star_star, a_star);
2519    }
2520
2521    #[test]
2522    fn test_concat() {
2523        let re = &mut ReManager::new();
2524        let v = build_atoms(re);
2525
2526        for &t in &v {
2527            for &u in &v {
2528                let x = re.concat(t, u);
2529                print_term(&format!("concat({t}, {u})"), x);
2530                check_equal(x, re.concat(t, u));
2531            }
2532        }
2533    }
2534
2535    #[test]
2536    fn test_inter() {
2537        let re = &mut ReManager::new();
2538        let v = build_atoms(re);
2539
2540        for &t in &v {
2541            for &u in &v {
2542                let x = re.inter(t, u);
2543                print_term(&format!("inter({t}, {u})"), x);
2544                check_equal(x, re.inter(t, u));
2545            }
2546        }
2547    }
2548
2549    #[test]
2550    fn test_union() {
2551        let re = &mut ReManager::new();
2552        let v = build_atoms(re);
2553
2554        for &t in &v {
2555            for &u in &v {
2556                let x = re.union(t, u);
2557                print_term(&format!("union({t}, {u})"), x);
2558                check_equal(x, re.union(t, u));
2559            }
2560        }
2561    }
2562
2563    #[test]
2564    fn test_complement() {
2565        let re = &mut ReManager::new();
2566        let v = build_atoms(re);
2567
2568        for &t in &v {
2569            let x = re.complement(t);
2570            print_term(&format!("complement({t})"), x);
2571            check_equal(x, re.complement(t));
2572
2573            let y = re.complement(x);
2574            print_term(&format!("complement({x})"), y);
2575            check_equal(y, t);
2576            check_equal(y, re.complement(x));
2577        }
2578    }
2579
2580    #[test]
2581    fn test_from_str() {
2582        let re = &mut ReManager::new();
2583
2584        let x = re.str(&SmtString::from("abcde"));
2585        print_term("(str.to_re \"abcde\")", x);
2586        check_equal(x, re.str(&SmtString::from("abcde")));
2587
2588        let v = vec![
2589            re.char('a' as u32),
2590            re.char('b' as u32),
2591            re.epsilon(),
2592            re.epsilon(),
2593            re.char('c' as u32),
2594            re.char('d' as u32),
2595            re.char('e' as u32),
2596        ];
2597
2598        let y = re.concat_list(v);
2599        check_equal(x, y);
2600    }
2601
2602    #[test]
2603    fn bigger_test() {
2604        let re = &mut ReManager::new();
2605        let v = build_test_res(re);
2606
2607        for &t in &v {
2608            for &u in &v {
2609                let x = re.concat(t, u);
2610                print_term(&format!("concat({t}, {u})"), x);
2611                check_equal(x, re.concat(t, u));
2612
2613                let x = re.inter(t, u);
2614                print_term(&format!("inter({t}, {u})"), x);
2615                check_equal(x, re.inter(t, u));
2616
2617                let x = re.union(t, u);
2618                print_term(&format!("union({t}, {u})"), x);
2619                check_equal(x, re.union(t, u));
2620            }
2621        }
2622    }
2623
2624    #[test]
2625    fn test_sub_terms() {
2626        fn print_sub_terms(t: RegLan) {
2627            println!("Base term: {t}");
2628            println!("Sub terms = [");
2629            for x in sub_terms(t) {
2630                println!("  {x}");
2631            }
2632            println!("]");
2633
2634            println!("Leaves = [");
2635            for leaf in leaves(t) {
2636                println!("  {leaf}");
2637            }
2638            println!("]\n");
2639        }
2640
2641        let re = &mut ReManager::new();
2642        let v = build_test_res(re);
2643        for &t in &v {
2644            print_sub_terms(t)
2645        }
2646
2647        let t = re.str(&"0987654321aabd".into());
2648        print_sub_terms(t)
2649    }
2650
2651    #[test]
2652    fn test_base_patterns() {
2653        let re = &mut ReManager::new();
2654
2655        fn show_patterns(r: RegLan) {
2656            let v = decompose_concat(r);
2657            let test = base_patterns(&v);
2658            println!("Expression: {r} ");
2659            println!("   vector:");
2660            for x in &v {
2661                println!("     {x}");
2662            }
2663            println!("   base patterns:");
2664            for x in &test {
2665                println!("     {x}");
2666            }
2667            println!()
2668        }
2669
2670        let test1 = re.all_chars();
2671        show_patterns(test1);
2672
2673        let test2 = re.epsilon();
2674        show_patterns(test2);
2675
2676        let test3 = re.full();
2677        show_patterns(test3);
2678
2679        let digits = re.range('0' as u32, '9' as u32);
2680        let d = re.star(digits);
2681
2682        let test4 = re.concat_list([test1, digits, test3, test3, d, test1, d, d]);
2683        show_patterns(test4);
2684    }
2685
2686    #[test]
2687    fn test_deriv() {
2688        let re = &mut ReManager::new();
2689        let v = build_test_res(re);
2690
2691        for &t in &v {
2692            for c in t.deriv_class.ranges() {
2693                let x = re.set_derivative(t, c);
2694                match x {
2695                    Ok(d) => println!("deriv {t} wrt {c} = {d}"),
2696                    Err(e) => panic!("deriv {} wrt {} failed with error {:?}", t, c, e),
2697                }
2698            }
2699            if !t.deriv_class.empty_complement() {
2700                let y = re.class_derivative(t, ClassId::Complement);
2701                match y {
2702                    Ok(d) => println!("deriv {t} wrt CompClass = {d}"),
2703                    Err(e) => panic!("deriv {} wrt CompClass failed with error {:?}", t, e),
2704                }
2705            }
2706        }
2707    }
2708
2709    // deriv e w.r.t. 'a', 'b', and 'c' and e's complementary class
2710    fn show_derivatives(re: &mut ReManager, e: RegLan) {
2711        println!("Expression: {e}");
2712        println!("  deriv classes: {}", e.deriv_class);
2713        for c in 'a'..='c' {
2714            println!("  deriv({e}, {c}) = {}", re.char_derivative(e, c as u32))
2715        }
2716        if !e.empty_complement() {
2717            println!(
2718                "  deriv({e}, CompClass) = {}",
2719                re.class_derivative(e, ClassId::Complement).unwrap()
2720            )
2721        }
2722        println!()
2723    }
2724
2725    #[test]
2726    fn test_deriv2() {
2727        let re = &mut ReManager::new();
2728        // a + ac + bc
2729        let a = re.str(&"a".into());
2730        let ac = re.str(&"ac".into());
2731        let bc = re.str(&"bc".into());
2732        let e = re.union_list([a, ac, bc]);
2733
2734        show_derivatives(re, e);
2735        let d1 = re.char_derivative(e, 'a' as u32);
2736        show_derivatives(re, d1);
2737        let d2 = re.char_derivative(e, 'b' as u32);
2738        show_derivatives(re, d2);
2739        let d3 = re.char_derivative(e, 'c' as u32);
2740        show_derivatives(re, d3);
2741
2742        assert!(re.str_in_re(&"a".into(), e));
2743        assert!(re.str_in_re(&"ac".into(), e));
2744        assert!(re.str_in_re(&"bc".into(), e));
2745        assert!(!re.str_in_re(&"b".into(), e));
2746        assert!(!re.str_in_re(&"c".into(), e));
2747    }
2748
2749    #[test]
2750    fn test_deriv3() {
2751        let re = &mut ReManager::new();
2752        // (ac + bc)*
2753        let ac = re.str(&"ac".into());
2754        let bc = re.str(&"bc".into());
2755        let sum = re.union(ac, bc);
2756        let e = re.star(sum);
2757
2758        show_derivatives(re, e);
2759        let d1 = re.char_derivative(e, 'a' as u32);
2760        show_derivatives(re, d1);
2761        let d2 = re.char_derivative(e, 'b' as u32);
2762        show_derivatives(re, d2);
2763        let d3 = re.char_derivative(e, 'c' as u32);
2764        show_derivatives(re, d3);
2765
2766        assert!(re.str_in_re(&"acacbc".into(), e));
2767        assert!(re.str_in_re(&"".into(), e));
2768    }
2769
2770    fn all_derivatives(re: &mut ReManager, e: RegLan) -> Vec<RegLan> {
2771        let mut queue = BfsQueue::new();
2772        let mut result = Vec::new();
2773
2774        queue.push(e);
2775        result.push(e);
2776
2777        while let Some(r) = queue.pop() {
2778            for cid in r.class_ids() {
2779                if let Ok(d) = re.class_derivative(r, cid) {
2780                    if queue.push(d) {
2781                        result.push(d)
2782                    }
2783                } else {
2784                    panic!("Unexpected failure: deriv {} w.r.t Class{}", r, cid)
2785                }
2786            }
2787        }
2788        result
2789    }
2790
2791    #[test]
2792    fn test_all_deriv() {
2793        fn show_derivs(e: RegLan, v: &[RegLan]) {
2794            println!("All derivatives of {e}");
2795            for &d in v {
2796                println!("   {d}")
2797            }
2798            if v.len() == 1 {
2799                println!("Total: 1 derivative")
2800            } else {
2801                println!("Total: {} derivatives", v.len())
2802            }
2803        }
2804
2805        let re = &mut ReManager::new();
2806
2807        // (ac + bc)*
2808        let ac = re.str(&"ac".into());
2809        let bc = re.str(&"bc".into());
2810        let sum = re.union(ac, bc);
2811        let e = re.star(sum);
2812
2813        let v = all_derivatives(re, e);
2814        show_derivs(e, &v);
2815
2816        let a = [
2817            re.sigma_plus(),
2818            re.str(&"something".into()),
2819            re.all_chars(),
2820            re.all_chars(),
2821            re.char(':' as u32),
2822            re.full(),
2823            re.char(':' as u32),
2824            re.full(),
2825            re.str(&".jpg".into()),
2826        ];
2827
2828        let e = re.concat_list(a);
2829        let v = all_derivatives(re, e);
2830        show_derivs(e, &v);
2831    }
2832
2833    #[test]
2834    fn test_derivative_iter() {
2835        let re = &mut ReManager::new();
2836
2837        // abc/(Σ^*)/ΣΣ-ΣΣ/def/(Σ^*)
2838        let a = [
2839            re.str(&"abc/".into()),
2840            re.full(),
2841            re.char('/' as u32),
2842            re.all_chars(),
2843            re.all_chars(),
2844            re.char('-' as u32),
2845            re.all_chars(),
2846            re.all_chars(),
2847            re.str(&"/def/".into()),
2848            re.full(),
2849        ];
2850
2851        let e = re.concat_list(a);
2852        println!("All derivatives of {e}");
2853        let mut count = 0;
2854        for r in re.iter_derivatives(e) {
2855            println!("  {r}");
2856            count += 1;
2857        }
2858        println!("Total: {count} derivatives");
2859    }
2860
2861    #[test]
2862    fn test_char_deriv() {
2863        fn sum_of_str(re: &mut ReManager, s1: &str, s2: &str) -> RegLan {
2864            let s1 = re.str(&s1.into());
2865            let s2 = re.str(&s2.into());
2866            re.union(s1, s2)
2867        }
2868
2869        let mut re = ReManager::new();
2870        let e = sum_of_str(&mut re, "abc", "acc");
2871
2872        // e is 'abc + acc'
2873        // the derivative of e w.r.t. 'a' is 'bc + cc'
2874        let d = re.char_derivative(e, 'a' as u32);
2875        assert_eq!(d, sum_of_str(&mut re, "bc", "cc"));
2876    }
2877
2878    #[test]
2879    fn test_empty_check() {
2880        let re = &mut ReManager::new();
2881
2882        let full = re.full();
2883        let abcd = re.str(&"abcd".into());
2884        let bc = re.str(&"bc".into());
2885
2886        let a = re.concat(abcd, full); // strings that start with 'abcd'
2887        let b = re.concat_list([full, bc, full]); // strings that contain 'bc'
2888
2889        let test = re.diff(a, b); // strings that start with 'abcd' but don't contain 'bc'
2890        assert!(re.is_empty_re(test));
2891    }
2892
2893    #[test]
2894    fn test_empty_check2() {
2895        let re = &mut ReManager::new();
2896
2897        let c1 = [
2898            re.all_chars(),
2899            re.all_chars(),
2900            re.all_chars(),
2901            re.all_chars(),
2902            re.full(),
2903            re.str(&"/abcd/".into()),
2904            re.all_chars(),
2905            re.str(&"/end".into()),
2906        ];
2907
2908        let c2 = [re.all_chars(), re.str(&"/zhfg".into()), re.full()];
2909
2910        let e1 = re.concat_list(c1);
2911        let e2 = re.concat_list(c2);
2912
2913        let test = re.inter(e1, e2);
2914        assert!(!re.is_empty_re(test));
2915
2916        let sample = re.get_string(test);
2917        assert!(sample.is_some());
2918        println!("Sample string in {}: {}", test, sample.unwrap());
2919    }
2920
2921    #[test]
2922    fn test_empty_check_bounded() {
2923        let re = &mut ReManager::new();
2924
2925        let full = re.full();
2926        let abcd = re.str(&"abcd".into());
2927        let bc = re.str(&"bc".into());
2928
2929        let a = re.concat(abcd, full); // strings that start with 'abcd'
2930        let b = re.concat_list([full, bc, full]); // strings that contain 'bc'
2931
2932        let test = re.diff(a, b); // strings that start with 'abcd' but don't contain 'bc'
2933        let derivatives_count = re.iter_derivatives(test).count();
2934
2935        assert_eq!(re.is_empty_re_bounded(test, derivatives_count - 1), None);
2936        assert_eq!(re.is_empty_re_bounded(test, derivatives_count), Some(true));
2937    }
2938
2939    #[test]
2940    fn test_empty_check_bounded_2() {
2941        let re = &mut ReManager::new();
2942
2943        let c1 = [
2944            re.all_chars(),
2945            re.all_chars(),
2946            re.full(),
2947            re.str(&"/abcd/".into()),
2948            re.all_chars(),
2949            re.str(&"/end".into()),
2950        ];
2951
2952        let e1 = re.concat_list(c1);
2953        let derivatives_count = re.iter_derivatives(e1).count();
2954
2955        assert_eq!(re.is_empty_re_bounded(e1, derivatives_count - 1), None);
2956        assert_eq!(re.is_empty_re_bounded(e1, derivatives_count), Some(false));
2957    }
2958
2959    #[test]
2960    fn test_compile() {
2961        let re = &mut ReManager::new();
2962
2963        // (ac + bc)*
2964        let ac = re.str(&"ac".into());
2965        let bc = re.str(&"bc".into());
2966        let sum = re.union(ac, bc);
2967        let e = re.star(sum);
2968
2969        // convert e to an automaton
2970        let auto = re.compile(e);
2971        println!("Automaton for (ac + bc)*");
2972        println!("{}", auto);
2973
2974        println!("Char partition: {}", auto.combined_char_partition());
2975
2976        let reps = auto.pick_alphabet();
2977        print!("Alphabet:");
2978        for x in reps {
2979            print!(" {}", char_to_smt(x));
2980        }
2981        println!();
2982
2983        let m = auto.compile_successors();
2984        println!("Compiled transition table: size = {}", m.size());
2985
2986        assert_eq!(auto.num_states(), 3);
2987        assert_eq!(auto.num_final_states(), 1);
2988    }
2989
2990    #[test]
2991    fn test_compile2() {
2992        let re = &mut ReManager::new();
2993
2994        let a = [
2995            re.sigma_plus(),
2996            re.str(&"something".into()),
2997            re.all_chars(),
2998            re.all_chars(),
2999            re.char(':' as u32),
3000            re.full(),
3001            re.char(':' as u32),
3002            re.full(),
3003            re.str(&".jpg".into()),
3004        ];
3005
3006        let e = re.concat_list(a);
3007
3008        let auto = re.compile(e);
3009        println!("Automaton for {}", e);
3010        println!("{}", auto);
3011
3012        println!("Char partition: {}", auto.combined_char_partition());
3013
3014        let a = auto.pick_alphabet();
3015        print!("Alphabet representatives:");
3016        for x in a {
3017            print!(" {}", char_to_smt(x));
3018        }
3019        println!();
3020
3021        let m = auto.compile_successors();
3022        println!("Compiled transition table: size = {}", m.size());
3023        println!("Transition table: {}", m);
3024
3025        assert!(auto.accepts(&"prefix_then_somethingAB:middle:mores_stuff.jpg".into()));
3026        assert!(!auto.accepts(&"prefix_then_something:middle:mores_stuff.jpg".into()));
3027
3028        auto.test_minimizer();
3029    }
3030
3031    #[test]
3032    fn test_compile3() {
3033        let re = &mut ReManager::new();
3034
3035        let c1 = [
3036            re.all_chars(),
3037            re.all_chars(),
3038            re.all_chars(),
3039            re.all_chars(),
3040            re.full(),
3041            re.str(&"/abcd/".into()),
3042            re.all_chars(),
3043            re.str(&"/end".into()),
3044        ];
3045
3046        let c2 = [re.all_chars(), re.str(&"/zhfg".into()), re.full()];
3047
3048        let e1 = re.concat_list(c1);
3049        let e2 = re.concat_list(c2);
3050
3051        let test = re.inter(e1, e2);
3052
3053        let auto = re.compile(test);
3054        println!("Automaton for {}", test);
3055        println!("{}", auto);
3056
3057        println!("Char partition: {}", auto.combined_char_partition());
3058        let a = auto.pick_alphabet();
3059        print!("Alphabet representatives:");
3060        for x in a {
3061            print!(" {}", char_to_smt(x));
3062        }
3063        println!();
3064
3065        let m = auto.compile_successors();
3066        println!("Compiled transition table: size = {}", m.size());
3067        println!("Table: {}", m);
3068
3069        let w = re.get_string(test).unwrap();
3070        assert!(auto.accepts(&w));
3071
3072        auto.test_minimizer();
3073    }
3074
3075    #[test]
3076    fn test_bounded_compile() {
3077        let re = &mut ReManager::new();
3078
3079        // abc/(Σ^*)/ΣΣ-ΣΣ/def/(Σ^*)
3080        let a = [
3081            re.str(&"abc/".into()),
3082            re.full(),
3083            re.char('/' as u32),
3084            re.all_chars(),
3085            re.all_chars(),
3086            re.char('-' as u32),
3087            re.all_chars(),
3088            re.all_chars(),
3089            re.str(&"/def/".into()),
3090            re.full(),
3091        ];
3092
3093        let e = re.concat_list(a);
3094
3095        let test0 = re.try_compile(e, 0);
3096        assert!(test0.is_none());
3097
3098        let test1 = re.try_compile(e, 46);
3099        assert!(test1.is_none());
3100
3101        let test2 = re.try_compile(e, 47);
3102        assert!(test2.is_some());
3103        assert_eq!(test2.unwrap().num_states(), 47);
3104    }
3105
3106    #[test]
3107    fn test_compile4() {
3108        let re = &mut ReManager::new();
3109
3110        // bb([0-9]+)
3111        let bb = re.str(&"bb".into());
3112        let digits = re.range('0' as u32, '9' as u32);
3113        let digits = re.plus(digits);
3114        let first = re.concat(bb, digits);
3115
3116        // \sigma^[0..4]
3117        let sigma = re.all_chars();
3118        let lop = re.smt_loop(sigma, 0, 4);
3119        let second = re.complement(lop);
3120
3121        let to_compile = re.inter(second, first);
3122
3123        let test1 = re.try_compile(to_compile, 10000);
3124        assert!(test1.is_some());
3125        assert!(test1.unwrap().accepts(&"bb01234".into()));
3126    }
3127
3128    #[test]
3129    fn test_compile5() {
3130        let re = &mut ReManager::new();
3131
3132        // (a^5)+
3133        let a = re.char('a' as u32);
3134        let a5 = re.smt_loop(a, 5, 5);
3135        let a5plus = re.plus(a5);
3136
3137        println!("testing {a5plus}");
3138        let test = re.try_compile(a5plus, 10000);
3139        assert!(test.is_some());
3140        let dfa = test.unwrap();
3141        println!("Resulting automaton: {dfa}");
3142        assert!(dfa.accepts(&"aaaaaaaaaa".into()));
3143        assert!(dfa.accepts(&"aaaaa".into()));
3144        assert!(!dfa.accepts(&"aaaa".into()));
3145    }
3146
3147    #[test]
3148    fn test_compile6() {
3149        let re = &mut ReManager::new();
3150
3151        // (a^5)+
3152        let a = re.char('a' as u32);
3153        let a5 = re.smt_loop(a, 5, 5);
3154        let a5plus = re.plus(a5);
3155
3156        // (a^2)+
3157        let a2 = re.smt_loop(a, 2, 2);
3158        let a2plus = re.plus(a2);
3159
3160        let inter = re.inter(a5plus, a2plus);
3161        println!("testing {inter}");
3162        let test = re.try_compile(inter, 10000);
3163        assert!(test.is_some());
3164        let dfa = test.unwrap();
3165        println!("Resulting automaton: {dfa}");
3166
3167        assert!(dfa.accepts(&"aaaaaaaaaa".into()));
3168        assert!(!dfa.accepts(&"aaaaa".into()));
3169        assert!(!dfa.accepts(&"aa".into()));
3170    }
3171}
aws_smt_strings/regular_expressions.rs

aws_smt_strings/
regular_expressions.rs