aws_smt_strings/regular_expressions.rs
1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2// SPDX-License-Identifier: Apache-2.0
3
4//!
5//! Regular expressions
6//!
7//! This module defines the abstract syntax of regular expressions [BaseRegLan]
8//! and the regular expression type [RE]. Regular expressions are built using
9//! an [ReManager], which provides hash consing.
10//!
11//! **Important**: Because of hash consing, regular expressions created with different
12//! [ReManager]s are not compatible. It's recommended to use only one [ReManager].
13//!
14//! Input to the manager's methods are static references to [RE] objects
15//! (see type [RegLan]). The manager also returns objects of type [RegLan]
16//! when producing regular expressions.
17//!
18//! [ReManager] also implements the *derivative* operation. The derivative of a regular
19//! expression R with respect to a character c is another regular expression S that
20//! defines all the strings that can follow c in the language of R. For example,
21//! the derivative of regular expression '(abc + cd)\*' with respect to 'a' is
22//! 'bc(abc + cd)\*': all words of '(abc + cd)\*' that start with 'a' are of the
23//! form 'a.w' where 'w' is a word 'bc(abc + cd)\*'.
24//!
25//! For a regular expression R, we use a [CharPartition] that divides the alphabet into
26//! equivalent *derivative classes*. If two characters `c1` and `c2` are in the same
27//! derivative class, then the derivative of R with respect to `c1` and the derivative of R
28//! with respect to `c2` are equal. The [ReManager] implements derivative of R with respect
29//! to one of its derivative class. More generally, the derivative of R with respect to a
30//! character set C is well defined if C is included in a derivative class of R.
31//!
32//! Derivatives allows one to convert REs to deterministic automata and support
33//! other operations such as checking whether a string matches an RE.
34//!
35
36use std::collections::HashMap;
37use std::fmt::Display;
38use std::hash::Hash;
39use std::rc::Rc;
40
41use crate::automata::Automaton;
42use crate::automata::AutomatonBuilder;
43use crate::bfs_queues::*;
44use crate::character_sets::*;
45use crate::errors::*;
46use crate::labeled_queues::LabeledQueue;
47use crate::loop_ranges::*;
48use crate::matcher::SearchResult;
49use crate::smt_strings::SmtString;
50use crate::smt_strings::MAX_CHAR;
51use crate::store::*;
52
53#[derive(Debug, Clone, PartialEq, Eq, Hash)]
54///
55/// Abstract syntax for regular expressions
56///
57pub enum BaseRegLan {
58 /// Empty language
59 Empty,
60
61 /// The language that contains only the empty string
62 Epsilon,
63
64 /// Words of length one with characters is a range [a, b]
65 Range(CharSet),
66
67 /// Concatenation of two languages
68 Concat(RegLan, RegLan),
69
70 /// Generalized loop: see [loop_ranges][crate::loop_ranges]
71 Loop(RegLan, LoopRange),
72
73 /// Complement of a language
74 Complement(RegLan),
75
76 /// Union of two or more languages
77 Union(Box<[RegLan]>),
78
79 /// Intersection of two or more languages
80 Inter(Box<[RegLan]>),
81}
82
83/// Reference to a Regular Expression descriptor
84pub type RegLan = &'static RE;
85
86///
87/// Regular expression structure
88///
89/// A regular expression includes an expression of type [BaseRegLan],
90/// which is an abstract syntax tree.
91///
92/// In addition, each regular expression e has a
93/// unique integer id and three attributes:
94/// - e.nullable is true if the language of e contains the empty string
95/// - e.singleton is true if the language of e contains a single string
96/// - e.deriv_class is the list of derivative classes of e.
97///
98/// The derivative classes are disjoint interval characters that cover
99/// a subset of the alphabet, and a complementary class that covers the rest.
100/// See [CharPartition]. The `deriv_class` partition is
101/// constructed to ensure that all the characters in a class produce the same
102/// derivative of e: if c1 and c2 are in the same derivative class of e then
103/// deriv(e, c1) and deriv(e, c2) are equal.
104///
105/// Operations on regular expressions use hash-consing and are performed with
106/// an [ReManager].
107#[derive(Debug)]
108pub struct RE {
109 /// Abstract syntax tree
110 expr: BaseRegLan,
111 /// Unique id for this RE
112 id: usize,
113 /// Whether the language contains the empty string
114 pub nullable: bool,
115 /// Whether this RE has a single element
116 singleton: bool,
117 /// Whether this RE is a simple pattern, i.e.,
118 /// a concatenation of Loops and Ranges
119 simple_pattern: bool,
120 /// Partition of character sets relevant to this RE
121 deriv_class: Rc<CharPartition>,
122}
123
124/// Equality on RE is derived from the unique ids.
125///
126/// Two REs are equal iff they have the same id.
127impl PartialEq for RE {
128 fn eq(&self, other: &Self) -> bool {
129 self.id == other.id
130 }
131}
132
133impl Eq for RE {}
134
135/// Ordering on REs is based on unique ids.
136///
137/// We have re1 < re2 iff re1.id < re2.id
138impl PartialOrd for RE {
139 fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
140 // self.id.partial_cmp(&other.id)
141 Some(self.cmp(other))
142 }
143}
144
145impl Ord for RE {
146 fn cmp(&self, other: &Self) -> std::cmp::Ordering {
147 self.id.cmp(&other.id)
148 }
149}
150
151/// The hash code of an RE is just the hash code of its id.
152impl Hash for RE {
153 fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
154 self.id.hash(state)
155 }
156}
157
158// utility: add n copies of x to vector v
159fn push_multiple<T: Copy>(v: &mut Vec<T>, n: u32, x: T) {
160 for _ in 0..n {
161 v.push(x);
162 }
163}
164
165impl BaseRegLan {
166 /// Check whether the empty word is in this language
167 fn is_nullable(&self) -> bool {
168 match self {
169 BaseRegLan::Empty => false,
170 BaseRegLan::Epsilon => true,
171 BaseRegLan::Range(_) => false,
172 BaseRegLan::Concat(e1, e2) => e1.nullable && e2.nullable,
173 BaseRegLan::Loop(e, range) => range.start() == 0 || e.nullable,
174 BaseRegLan::Complement(e) => !e.nullable,
175 BaseRegLan::Inter(args) => args.iter().all(|x| x.nullable),
176 BaseRegLan::Union(args) => args.iter().any(|x| x.nullable),
177 }
178 }
179
180 /// Check whether this RE is atomic (either Empty, Epsilon, or a character Range)
181 pub fn is_atomic(&self) -> bool {
182 matches!(
183 self,
184 BaseRegLan::Empty | BaseRegLan::Epsilon | BaseRegLan::Range(_)
185 )
186 }
187
188 /// Check whether this RE is Concat/Loop or Atomic
189 fn concat_or_atomic(&self) -> bool {
190 matches!(
191 self,
192 BaseRegLan::Empty
193 | BaseRegLan::Epsilon
194 | BaseRegLan::Range(..)
195 | BaseRegLan::Concat(..)
196 | BaseRegLan::Loop(..)
197 )
198 }
199
200 /// Check whether this RE is sigma (all chars)
201 fn is_all_chars(&self) -> bool {
202 if let BaseRegLan::Range(s) = self {
203 s.is_alphabet()
204 } else {
205 false
206 }
207 }
208
209 /// Check whether this RE is sigma^* (full language)
210 fn is_full(&self) -> bool {
211 if let BaseRegLan::Loop(r, range) = self {
212 range.is_all() && r.expr.is_all_chars()
213 } else {
214 false
215 }
216 }
217
218 /// Check whether this regular expression is of the form (str.to_re <some string>)
219 /// This holds if the RE is epsilon or if it's a concatenation of characters
220 fn is_singleton(&self) -> bool {
221 match self {
222 BaseRegLan::Epsilon => true,
223 BaseRegLan::Range(c) => c.is_singleton(),
224 BaseRegLan::Concat(e1, e2) => e1.singleton && e2.singleton,
225 BaseRegLan::Loop(e, range) => e.singleton && range.is_point(),
226 _ => false,
227 }
228 }
229
230 /// Check whether this regular expression is a concatenation of ranges
231 /// and loops over ranges.
232 fn is_simple_pattern(&self) -> bool {
233 match self {
234 BaseRegLan::Epsilon => true,
235 BaseRegLan::Range(..) => true,
236 BaseRegLan::Loop(r, ..) => matches!(r.expr, BaseRegLan::Range(..)),
237 BaseRegLan::Concat(e1, e2) => e1.simple_pattern && e2.simple_pattern,
238 _ => false,
239 }
240 }
241
242 /// Check whether this regular expression is a Range
243 /// (i.e., an interval of characters [c0, c1])
244 fn is_range(&self) -> bool {
245 matches!(self, BaseRegLan::Range(..))
246 }
247
248 /// Check whether all strings of `self` are one-character long and belong to s
249 fn match_char_set(&self, s: &CharSet) -> bool {
250 match self {
251 BaseRegLan::Range(x) => s.covers(x),
252 _ => false,
253 }
254 }
255
256 /// Compute the derivation classes for this regular expression
257 fn deriv_class(&self) -> Rc<CharPartition> {
258 fn rc(p: CharPartition) -> Rc<CharPartition> {
259 Rc::new(p)
260 }
261
262 fn merge_deriv_classes(a: &[RegLan]) -> Rc<CharPartition> {
263 let mut result = CharPartition::new();
264 for &re in a {
265 result = merge_partitions(&result, &re.deriv_class)
266 }
267 rc(result)
268 }
269
270 fn empty_partition() -> Rc<CharPartition> {
271 rc(CharPartition::new())
272 }
273
274 match self {
275 BaseRegLan::Empty => empty_partition(),
276 BaseRegLan::Epsilon => empty_partition(),
277 BaseRegLan::Range(c) => rc(CharPartition::from_set(c)),
278 BaseRegLan::Concat(e1, e2) => {
279 if e1.nullable {
280 rc(merge_partitions(&e1.deriv_class, &e2.deriv_class))
281 } else {
282 e1.deriv_class.clone()
283 }
284 }
285 BaseRegLan::Loop(e, _) => e.deriv_class.clone(),
286 BaseRegLan::Complement(e) => e.deriv_class.clone(),
287 BaseRegLan::Inter(args) => merge_deriv_classes(args.as_ref()),
288 BaseRegLan::Union(args) => merge_deriv_classes(args.as_ref()),
289 }
290 }
291
292 /// Collect all characters of this RE (if it's a singleton)
293 #[allow(dead_code)]
294 fn collect_chars(&self, v: &mut Vec<u32>) {
295 match self {
296 BaseRegLan::Range(c) => {
297 if c.is_singleton() {
298 v.push(c.pick());
299 }
300 }
301 BaseRegLan::Loop(r, range) => {
302 if range.is_point() {
303 if let BaseRegLan::Range(c) = r.expr {
304 if c.is_singleton() {
305 push_multiple(v, range.start(), c.pick());
306 }
307 }
308 }
309 }
310 BaseRegLan::Concat(e1, e2) => {
311 e1.expr.collect_chars(v);
312 e2.expr.collect_chars(v);
313 }
314 _ => (),
315 }
316 }
317}
318
319impl HashConsed for RE {
320 type Key = BaseRegLan;
321
322 fn make(index: usize, k: &Self::Key) -> Self {
323 RE {
324 expr: k.clone(),
325 id: index,
326 nullable: k.is_nullable(),
327 singleton: k.is_singleton(),
328 simple_pattern: k.is_simple_pattern(),
329 deriv_class: k.deriv_class(),
330 }
331 }
332}
333
334impl Display for BaseRegLan {
335 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
336 // write either e or '(e)' when e is a sub-expression
337 fn write_sub(f: &mut std::fmt::Formatter<'_>, e: RegLan) -> std::fmt::Result {
338 if e.singleton || e.expr.is_atomic() {
339 write!(f, "{}", e.expr)
340 } else {
341 write!(f, "({})", e.expr)
342 }
343 }
344
345 // write a list of operands separated by an symbol
346 fn write_list(
347 f: &mut std::fmt::Formatter<'_>,
348 l: &[RegLan],
349 symbol: char,
350 ) -> std::fmt::Result {
351 debug_assert!(!l.is_empty());
352 write_sub(f, l[0])?;
353 for e in &l[1..] {
354 write!(f, " {symbol} ")?;
355 write_sub(f, e)?;
356 }
357 Ok(())
358 }
359
360 match self {
361 BaseRegLan::Empty => write!(f, "\u{2205}"), // empty set
362 BaseRegLan::Epsilon => write!(f, "\u{03B5}"),
363 BaseRegLan::Range(r) => write!(f, "{r}"),
364 BaseRegLan::Concat(e1, e2) => {
365 let mut v = Vec::new();
366 flatten_concat(e1, &mut v);
367 flatten_concat(e2, &mut v);
368 for e in v {
369 write_sub(f, e)?
370 }
371 Ok(())
372 }
373 BaseRegLan::Loop(e, range) => {
374 write_sub(f, e)?;
375 write!(f, "^{range}")
376 }
377 BaseRegLan::Complement(e) => {
378 write!(f, "\u{00AC}")?;
379 write_sub(f, e)
380 }
381 BaseRegLan::Inter(args) => write_list(f, args, '&'),
382 BaseRegLan::Union(args) => write_list(f, args, '+'),
383 }
384 }
385}
386
387impl Display for RE {
388 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
389 self.expr.fmt(f)
390 }
391}
392
393impl RE {
394 /// check whether the complementary class is empty
395 pub fn empty_complement(&self) -> bool {
396 self.deriv_class.empty_complement()
397 }
398
399 /// number of derivative classes (not including the complementary class)
400 pub fn num_deriv_classes(&self) -> usize {
401 self.deriv_class.len()
402 }
403
404 /// check whether cid is a valid class id
405 pub fn valid_class_id(&self, cid: ClassId) -> bool {
406 self.deriv_class.valid_class_id(cid)
407 }
408
409 /// Check whether this RE is equal to the empty RE
410 pub fn is_empty(&self) -> bool {
411 matches!(self.expr, BaseRegLan::Empty)
412 }
413
414 /// pick a representative element in a derivative class
415 ///
416 /// - if cid is Interval(i): pick in class i
417 /// - if cid is Complement: pick in the complementary class
418 ///
419 fn pick_class_rep(&self, cid: ClassId) -> u32 {
420 self.deriv_class.pick_in_class(cid)
421 }
422
423 /// class of point x
424 fn class_of_char(&self, x: u32) -> ClassId {
425 debug_assert!(x <= MAX_CHAR);
426 self.deriv_class.class_of_char(x)
427 }
428
429 /// class of set s
430 fn class_of_set(&self, s: &CharSet) -> Result<ClassId, Error> {
431 self.deriv_class.class_of_set(s)
432 }
433
434 /// iterator to go through valid classIds
435 pub fn class_ids(&self) -> ClassIdIterator<'_> {
436 self.deriv_class.class_ids()
437 }
438
439 /// iterator to go through the intervals in an RE derivative classes
440 pub fn char_ranges(&self) -> impl Iterator<Item = &CharSet> {
441 self.deriv_class.ranges()
442 }
443
444 /// incomplete check for inclusion
445 /// - if this returns true then self is included in other
446 /// - otherwise, we don't know
447 pub fn included_in(&self, other: &Self) -> bool {
448 sub_language(self, other)
449 }
450}
451
452/// Iterator to go through all sub-terms of a RegLan
453/// We can't implement this in RE because of lifetime issues
454#[derive(Debug)]
455struct ReIterator {
456 queue: BfsQueue<RegLan>,
457}
458
459impl Iterator for ReIterator {
460 type Item = RegLan;
461
462 /// List all sub-terms in breadth-first order, without duplicates
463 fn next(&mut self) -> Option<Self::Item> {
464 fn get_next(queue: &mut BfsQueue<RegLan>) -> Option<RegLan> {
465 queue.pop().inspect(|x| match x.expr {
466 BaseRegLan::Concat(left, right) => {
467 queue.push(left);
468 queue.push(right);
469 }
470 BaseRegLan::Loop(x, _) => {
471 queue.push(x);
472 }
473 BaseRegLan::Complement(x) => {
474 queue.push(x);
475 }
476 BaseRegLan::Inter(ref list) => {
477 queue.push_all(list.iter().copied());
478 }
479 BaseRegLan::Union(ref list) => {
480 queue.push_all(list.iter().copied());
481 }
482 _ => (),
483 })
484 }
485
486 get_next(&mut self.queue)
487 }
488}
489
490///
491/// Iterator for the sub-terms of r
492/// - This enumerates the sub-terms of r, without duplicates,
493/// in a breadth-first order. The term r is included.
494/// It comes first in the iteration.
495///
496pub fn sub_terms(r: RegLan) -> impl Iterator<Item = RegLan> {
497 let mut queue = BfsQueue::new();
498 queue.push(r);
499 ReIterator { queue }
500}
501
502///
503/// Iterator that enumerates the leaves of r
504/// - A leaf is an atomic sub-term of r (i.e., a term t such that f.expr is either
505/// [BaseRegLan::Empty], or [BaseRegLan::Epsilon] or [BaseRegLan::Range])
506/// - All leaves are listed once (no duplicates)
507/// - If r itself is atomic, the iterator just produces r and nothing else.
508///
509pub fn leaves(r: RegLan) -> impl Iterator<Item = RegLan> {
510 sub_terms(r).filter(|&x| x.expr.is_atomic())
511}
512
513///
514/// Collect a list L = (R_1,...R_n) such that r = concat(R_1,...,R_n)
515/// and no R_i is itself of the form concat(...) or epsilon, then
516/// add the R_is to vector v.
517///
518fn flatten_concat<'a>(r: &'a RE, v: &mut Vec<&'a RE>) {
519 match &r.expr {
520 BaseRegLan::Epsilon => (), // skip epsilon
521 BaseRegLan::Concat(x, y) => {
522 flatten_concat(x, v);
523 flatten_concat(y, v)
524 }
525 _ => v.push(r),
526 }
527}
528
529///
530/// Same as flatten concat but return a vector of R_1, ..., R_n
531///
532fn decompose_concat(r: &RE) -> Vec<&RE> {
533 let mut result = Vec::new();
534 flatten_concat(r, &mut result);
535 result
536}
537
538///
539/// collect a list L= {R_1, ..., R_n} of languages such
540/// that R = inter(R_1, ..., R_n) and no R_i is itself of the form inter(...)
541/// add R_1 ... R_n to v
542///
543fn flatten_inter<'a>(r: &'a RE, v: &mut Vec<&'a RE>) {
544 match &r.expr {
545 BaseRegLan::Inter(x) => {
546 for &s in x.as_ref() {
547 flatten_inter(s, v)
548 }
549 }
550 _ => v.push(r),
551 }
552}
553
554///
555/// collect a list L= {R_1, ..., R_n} of languages such
556/// that R = union(R_1, ..., R_n) and no R_i is itself of the form union(...)
557/// add R_1 ... R_n to v
558///
559fn flatten_union<'a>(r: &'a RE, v: &mut Vec<&'a RE>) {
560 match &r.expr {
561 BaseRegLan::Union(x) => {
562 for &s in x.as_ref() {
563 flatten_union(s, v)
564 }
565 }
566 _ => v.push(r),
567 }
568}
569
570/// check whether a sorted slice v contains x
571/// this is used for x=empty or x=full or x=epsilon, which have small ids,
572/// so if x occurs, that will be at the beginning of v.
573fn contains<'a>(v: &[&'a RE], x: &'a RE) -> bool {
574 for &y in v {
575 if *y == *x {
576 return true;
577 }
578 if *y > *x {
579 return false;
580 }
581 }
582 false
583}
584
585/// reset a then store x as its unique element
586fn set_to_singleton<'a>(a: &mut Vec<&'a RE>, x: &'a RE) {
587 a.clear();
588 a.push(x);
589}
590
591///
592/// Subsumption/Language inclusion
593///
594/// A regular expression R subsumes another regular expression S is
595/// the language of S is included in the language of R. We try to
596/// detect subsumptions to simplify unions and intersections
597/// of regular expressions.
598///
599/// We do this when R is a simple pattern, that is, a concatenation
600/// of Range and Loop REs. In this case, we can write R as a concatenation
601/// of basic patterns. Each basic pattern is either a sequence of Range
602/// expressions or a sequence of loop expressions. We say that a sequence
603/// of Range expression is a rigid pattern (e.g., it can be something like a string
604/// 'abc'). A sequence of loop expression is a flexible pattern (e.g., something like
605/// Sigma^*).
606///
607/// To check whether R subsumes S:
608/// - construct the list of basic patterns of R
609/// - first pass: find matches in S for all the rigid patterns of R. Each match is a slice of S
610/// say S[i, j]
611/// - second pass: the parts of S that are not matched in the first pass must now match
612/// flexible patterns of R in sequence.
613///
614/// Data structure to represent a basic pattern of array R of RegLan
615/// - start and end are indices in R such that start < end
616/// - this means that R[start, end-1] is a base pattern
617/// - if we find a match for R in an array S, we set
618/// start_match and end_match to mean that S[start_match, end_match-1]
619/// matches the base_pattern. We must have start_match <= end_match.
620///
621#[derive(Debug)]
622struct BasePattern {
623 start: usize,
624 end: usize,
625 is_rigid: bool,
626 start_match: usize,
627 end_match: usize,
628}
629
630impl BasePattern {
631 fn len(&self) -> usize {
632 self.end - self.start
633 }
634
635 fn set_match(&mut self, start: usize, end: usize) {
636 self.start_match = start;
637 self.end_match = end;
638 }
639
640 fn make(start: usize, end: usize, is_rigid: bool) -> BasePattern {
641 debug_assert!(start < end);
642 BasePattern {
643 start,
644 end,
645 is_rigid,
646 start_match: 0,
647 end_match: 0,
648 }
649 }
650}
651
652impl Display for BasePattern {
653 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
654 let k = if self.is_rigid { "Rigid" } else { "Flex" };
655 write!(f, "{}({}, {})", k, self.start, self.end)
656 }
657}
658
659/// Construct a vector of base patterns from a slice of RegLan
660fn base_patterns(r: &[&RE]) -> Vec<BasePattern> {
661 let mut result = Vec::new();
662 if !r.is_empty() {
663 let mut j = 0;
664 let mut rigid_slice = r[0].expr.is_range();
665 for (i, re) in r.iter().enumerate().skip(1) {
666 let rigid_i = re.expr.is_range();
667 if rigid_slice != rigid_i {
668 result.push(BasePattern::make(j, i, rigid_slice));
669 rigid_slice = rigid_i;
670 j = i;
671 }
672 }
673 // last base_pattern
674 result.push(BasePattern::make(j, r.len(), rigid_slice))
675 }
676 result
677}
678
679/// Check whether s[i..pattern.len()-1] matches pattern
680/// Requires i + pattern.len() <= s.lend()
681fn rigid_match_at(pattern: &[&CharSet], s: &[&RE], i: usize) -> bool {
682 debug_assert!(i + pattern.len() <= s.len());
683 for j in 0..pattern.len() {
684 if !s[i + j].expr.match_char_set(pattern[j]) {
685 return false;
686 }
687 }
688 true
689}
690
691/// Search for a match of pattern in sequence s, starting at index i
692/// `pattern` is a rigid pattern represented as a sequence of CharSets
693fn next_rigid_match(pattern: &[&CharSet], s: &[&RE], i: usize) -> SearchResult {
694 let p_len = pattern.len();
695 let s_len = s.len();
696 if s_len >= p_len {
697 for j in i..=(s_len - p_len) {
698 if rigid_match_at(pattern, s, j) {
699 return SearchResult::Found(j, j + p_len);
700 }
701 }
702 }
703 SearchResult::NotFound
704}
705
706/// Search for a match of pattern in sequence s, starting from index i and going down
707/// `pattern` is a rigid pattern represented as a sequence of CharSets
708fn prev_rigid_match(pattern: &[&CharSet], s: &[&RE], i: usize) -> SearchResult {
709 let p_len = pattern.len();
710 for j in (p_len..=i).rev() {
711 if rigid_match_at(pattern, s, j - p_len) {
712 return SearchResult::Found(j - p_len, j);
713 }
714 }
715 SearchResult::NotFound
716}
717
718/// Collect the list of CharSets from a rigid pattern
719/// the pattern `p` is given as a list of regular expressions.
720/// each regular expression in `p` is assumed to be character Range.
721fn char_sets_of_pattern<'a>(p: &[&'a RE]) -> Vec<&'a CharSet> {
722 let mut result = Vec::new();
723 for r in p {
724 if let BaseRegLan::Range(s) = &r.expr {
725 result.push(s)
726 } else {
727 unreachable!()
728 }
729 }
730 result
731}
732
733/// Check whether a concatenation of REs starts with a rigid pattern
734/// - 'u' is a sequence of REs (concatenated)
735/// - 'v' and 'p' define the pattern to search for.
736/// `p` identifies a slice of 'v' (from p.start to p.end)
737/// this slice is a concatenation of Range expressions
738/// - return true if 'u' starts with a sequence of Range expressions
739/// of the same length as the pattern and that each range expression in `u` is
740/// included in the corresponding range expression in the pattern.
741fn rigid_prefix_match<'a>(u: &[&'a RE], v: &[&'a RE], p: &BasePattern) -> bool {
742 if u.len() >= p.len() {
743 let p = char_sets_of_pattern(&v[p.start..p.end]);
744 rigid_match_at(&p, u, 0)
745 } else {
746 false
747 }
748}
749
750/// Check whether a concatenation of REs end with a rigid pattern
751/// - 'u' is a sequence of REs
752/// - 'v' and 'p' define the pattern to search for
753/// - return true if 'u' ends with a sequence of REs that match this pattern
754fn rigid_suffix_match<'a>(u: &[&'a RE], v: &[&'a RE], p: &BasePattern) -> bool {
755 if u.len() >= p.len() {
756 let p = char_sets_of_pattern(&v[p.start..p.end]);
757 rigid_match_at(&p, u, u.len() - p.len())
758 } else {
759 false
760 }
761}
762
763/// Check whether all rigid patterns in a sequence are matched
764/// - `u` is a sequence/concatenation of REs
765/// - 'v' and 'patterns' define a sequence of rigid patterns
766/// - return true if all patterns defined by 'v' and 'patterns' are matched by
767/// successive, disjoint sub-sequences of `u`
768///
769/// This version does a left-to-right search in `u'
770#[allow(clippy::many_single_char_names)]
771fn find_rigid_matches<'a>(u: &[&'a RE], v: &[&'a RE], patterns: &mut [BasePattern]) -> bool {
772 let mut i = 0;
773 for p in patterns {
774 if p.is_rigid {
775 let pattern = char_sets_of_pattern(&v[p.start..p.end]);
776 match next_rigid_match(&pattern, u, i) {
777 SearchResult::NotFound => return false,
778 SearchResult::Found(j, k) => {
779 p.set_match(j, k);
780 i = k;
781 }
782 }
783 }
784 }
785 true
786}
787
788/// Same a [find_rigid_matches] but with a right-to-left search
789#[allow(clippy::many_single_char_names)]
790fn find_rigid_matches_rev<'a>(u: &[&'a RE], v: &[&'a RE], patterns: &mut [BasePattern]) -> bool {
791 let mut i = u.len();
792 for p in patterns.iter_mut().rev() {
793 if p.is_rigid {
794 let pattern = char_sets_of_pattern(&v[p.start..p.end]);
795 match prev_rigid_match(&pattern, u, i) {
796 SearchResult::NotFound => return false,
797 SearchResult::Found(j, k) => {
798 p.set_match(j, k);
799 i = j;
800 }
801 }
802 }
803 }
804 true
805}
806
807// set the matching regions of every flexible pattern of b after rigid patterns
808// have been matched. We use the fact that the predecessor and successor of a flexible
809// pattern p are both rigid.
810// So p.start_match = pred(p).end_match and p.end_match = suc(p).start_match.
811fn set_flexible_regions(p: &mut [BasePattern], string_len: usize) {
812 for i in 0..p.len() {
813 if !p[i].is_rigid {
814 let prev = if i == 0 { 0 } else { p[i - 1].end_match };
815 let next = if i == p.len() - 1 {
816 string_len
817 } else {
818 p[i + 1].start_match
819 };
820 p[i].set_match(prev, next);
821 }
822 }
823}
824
825// check whether u matches v
826// TBD: improve this
827fn flexible_match<'a>(_u: &[&'a RE], v: &[&'a RE]) -> bool {
828 v.len() == 1 && v[0].expr.is_full()
829}
830
831// try to complete a partial matching, where all rigid base_patterns are matched
832fn match_flexible_patterns<'a>(u: &[&'a RE], v: &[&'a RE], patterns: &mut [BasePattern]) -> bool {
833 if patterns.is_empty() {
834 // equivalent to matching with epsilon
835 u.is_empty()
836 } else {
837 set_flexible_regions(patterns, u.len());
838 for p in patterns {
839 if !p.is_rigid && !flexible_match(&u[p.start_match..p.end_match], &v[p.start..p.end]) {
840 return false;
841 }
842 }
843 true
844 }
845}
846
847// shift all base_patterns of p by delta (subtract delta)
848fn shift_pattern_start(patterns: &mut [BasePattern], delta: usize) {
849 for p in patterns {
850 debug_assert!(p.end > p.start && p.start >= delta);
851 p.start -= delta;
852 p.end -= delta;
853 }
854}
855
856// check whether (concat u) is a sub-language of (concat v)
857// - u and v are non-empty
858// - elements of u and v can be comp, inter, union, loop, range
859fn concat_inclusion<'a>(u: &[&'a RE], v: &[&'a RE]) -> bool {
860 let mut b = base_patterns(v);
861 let mut p = b.as_mut_slice();
862 let mut u = u;
863 let mut v = v;
864
865 // a rigid prefix must match
866 if let Some(pat) = p.first() {
867 if pat.is_rigid {
868 if rigid_prefix_match(u, v, pat) {
869 let len = pat.len();
870 p = &mut p[1..];
871 shift_pattern_start(p, len);
872 u = &u[len..];
873 v = &v[len..];
874 } else {
875 return false;
876 }
877 }
878 }
879
880 // a rigid suffix must match
881 if let Some(pat) = p.last() {
882 if pat.is_rigid {
883 if rigid_suffix_match(u, v, pat) {
884 let len = pat.len();
885 let p_len = p.len();
886 p = &mut p[..p_len - 1];
887 u = &u[..u.len() - len];
888 v = &v[..v.len() - len];
889 } else {
890 return false;
891 }
892 }
893 }
894
895 if find_rigid_matches(u, v, p) && match_flexible_patterns(u, v, p) {
896 return true;
897 }
898
899 if find_rigid_matches_rev(u, v, p) && match_flexible_patterns(u, v, p) {
900 return true;
901 }
902
903 false
904}
905
906///
907/// Check language inclusion
908///
909/// This relies on simple/incomplete checks
910/// - if the function returns true then `r` is a sub-language of `s`
911/// - otherwise, we don't know.
912///
913fn sub_language<'a>(r: &'a RE, s: &'a RE) -> bool {
914 use BaseRegLan::*;
915
916 if r == s {
917 true
918 } else {
919 match (&r.expr, &s.expr) {
920 (Empty, _) => true,
921 (_, Empty) => false,
922 (Epsilon, _) => s.nullable,
923 (_, Epsilon) => false,
924 (Complement(r1), Complement(s2)) => sub_language(s2, r1),
925
926 // for union and intersection, we iterate through the list only if the other side is simple
927 // (i.e., it's either an atom or a concatenation/loop)
928 (_, Union(list)) => {
929 r.expr.concat_or_atomic() && list.iter().any(|&x| sub_language(r, x))
930 }
931 (Inter(list), _) => {
932 s.expr.concat_or_atomic() && list.iter().any(|&x| sub_language(x, s))
933 }
934 (Union(list), _) => {
935 s.expr.concat_or_atomic() && list.iter().all(|&x| sub_language(x, s))
936 }
937 (_, Inter(list)) => {
938 r.expr.concat_or_atomic() && list.iter().all(|&x| sub_language(r, x))
939 }
940
941 // concatenation of loops and ranges
942 (_, _) => {
943 let u = decompose_concat(r);
944 let v = decompose_concat(s);
945 concat_inclusion(&u, &v)
946 }
947 }
948 }
949}
950
951///
952/// Simplify a vector for building unions or intersections of languages
953/// - v = a vector of languages
954/// - bottom = neutral element
955/// - top = absorbing element
956///
957/// The function updates v to remove neutral elements and complementary pairs.
958/// This implements the following simplification rules where op is either union
959/// or intersection:
960/// - op(X, bottom) = X
961/// - op(top, X) = top
962/// - op(X, complement(X)) = top
963/// - op(X, X) = X
964///
965/// For op=intersection, we must have bottom = full and top = empty
966///
967/// For op=union, we must have bottom = empty and top = full
968//
969// We use the property that X and complement(X) have successive ids
970// so after sorting v, X and complement(X) occur next to each other in v.
971//
972fn simplify_set_operation<'a>(v: &mut Vec<&'a RE>, bottom: &'a RE, top: &'a RE) {
973 if !v.is_empty() {
974 v.sort();
975 v.dedup();
976 if contains(v, top) {
977 set_to_singleton(v, top)
978 } else {
979 let mut j = 0;
980 let mut previous = v[0];
981 if previous != bottom {
982 v[j] = previous;
983 j += 1;
984 }
985 for i in 1..v.len() {
986 let current = v[i];
987 if current.id == previous.id + 1 && previous.id % 2 == 0 {
988 // current is the complement of previous
989 set_to_singleton(v, top);
990 return;
991 }
992 if current != bottom {
993 v[j] = current;
994 previous = current;
995 j += 1;
996 }
997 }
998 v.truncate(j)
999 }
1000 }
1001}
1002
1003///
1004/// Pairs RegLan, ClassId used as keys in hash maps.
1005///
1006/// - Some(i) means the i-th interval of an RE's deriv_class
1007/// - None means the RE's complementary class
1008///
1009#[derive(Debug, PartialEq, Eq, Hash)]
1010struct DerivKey(RegLan, ClassId);
1011
1012/// A store for constructing regular expressions using hash-consing.
1013///
1014/// The store ensures that each regular expression has a unique integer id.
1015///
1016/// For all regular expressions e1 and e2 constructed with the same manager,
1017/// we have e1.expr == e2.expr iff e1.id == e2.id
1018///
1019/// # Examples
1020///
1021/// This example shows how to create the regular expression `(ac + bc)*` and
1022/// compute its derivatives.
1023///
1024/// ```
1025/// use aws_smt_strings::regular_expressions::*;
1026///
1027/// let re = &mut ReManager::new();
1028/// let ac = re.str(&"ac".into()); // ac
1029/// let bc = re.str(&"bc".into()); // bc
1030/// let sum = re.union(ac, bc); // ac + bc
1031/// let e = re.star(sum); // (ac + bc)*
1032///
1033/// let d1 = re.char_derivative(e, 'a' as u32); // derivative of e w.r.t. 'a'
1034/// let d2 = re.char_derivative(e, 'b' as u32); // derivative of e w.r.t. 'b'
1035///
1036/// // by hash-consing: d1 and d2 are equal
1037/// assert_eq!(d1, d2);
1038/// assert!(std::ptr::eq(d1, d2));
1039/// ```
1040//
1041// We maintain the invariant that x and complement(x) have successive
1042// ids.
1043#[derive(Debug)]
1044pub struct ReManager {
1045 store: Store<RE>,
1046 id2re: Vec<RegLan>, // map id to RE
1047 sigma: RegLan, // all one-letter strings
1048 empty: RegLan,
1049 sigma_star: RegLan, // complement of empty (all strings)
1050 epsilon: RegLan,
1051 sigma_plus: RegLan, // complement of epsilon (all strings of positive length)
1052 deriv_cache: HashMap<DerivKey, RegLan>, // cache of known derivatives
1053}
1054
1055impl ReManager {
1056 /// Create a new ReManager
1057 pub fn new() -> Self {
1058 let mut store = Store::new();
1059 let sigma = store.make(BaseRegLan::Range(CharSet::all_chars()));
1060 let not_sigma = store.make(BaseRegLan::Complement(sigma));
1061 let empty = store.make(BaseRegLan::Empty);
1062 let sigma_star = store.make(BaseRegLan::Loop(sigma, LoopRange::star()));
1063 let epsilon = store.make(BaseRegLan::Epsilon);
1064 let sigma_plus = store.make(BaseRegLan::Loop(sigma, LoopRange::plus()));
1065 debug_assert_eq!(sigma.id, 0);
1066 debug_assert_eq!(not_sigma.id, 1);
1067 debug_assert_eq!(empty.id, 2);
1068 debug_assert_eq!(sigma_star.id, 3);
1069 debug_assert_eq!(epsilon.id, 4);
1070 debug_assert_eq!(sigma_plus.id, 5);
1071 ReManager {
1072 store,
1073 id2re: vec![sigma, not_sigma, empty, sigma_star, epsilon, sigma_plus],
1074 sigma,
1075 empty,
1076 sigma_star,
1077 epsilon,
1078 sigma_plus,
1079 deriv_cache: HashMap::new(),
1080 }
1081 }
1082
1083 fn id_to_re(&self, id: usize) -> RegLan {
1084 self.id2re[id]
1085 }
1086
1087 /// Internal hash-consing constructor
1088 ///
1089 /// - When we create X, we also create complement(X) to make
1090 /// sure X and complement(X) have consecutive ids.
1091 fn make(&mut self, ast: BaseRegLan) -> RegLan {
1092 match ast {
1093 BaseRegLan::Complement(x) => self.id_to_re(x.id + 1),
1094 _ => {
1095 let i = self.store.counter;
1096 debug_assert!(i == self.id2re.len());
1097 let x = self.store.make(ast);
1098 debug_assert!(x.id <= i);
1099 if x.id == i {
1100 // new term
1101 let y = self.store.make(BaseRegLan::Complement(x));
1102 debug_assert!(y.id == i + 1);
1103 self.id2re.push(x);
1104 self.id2re.push(y);
1105 }
1106 x
1107 }
1108 }
1109 }
1110
1111 /// The empty language
1112 ///
1113 /// # Example
1114 ///
1115 /// ```
1116 /// use aws_smt_strings::regular_expressions::*;
1117 ///
1118 /// let mut re = ReManager::new(); // create a manager
1119 /// let e = re.empty(); // get the empty language
1120 ///
1121 /// // no string belongs to e.
1122 /// assert!(! re.str_in_re(&"0129".into(), e));
1123 /// ```
1124 pub fn empty(&self) -> RegLan {
1125 self.empty
1126 }
1127
1128 /// The full language
1129 ///
1130 /// This language contains every strings. It's the complement of [empty](Self::empty).
1131 ///
1132 /// # Example
1133 ///
1134 /// ```
1135 /// use aws_smt_strings::regular_expressions::*;
1136 ///
1137 /// let mut re = ReManager::new(); // create a manager
1138 /// let e = re.full(); // get the full language
1139 ///
1140 /// // every string belongs to e.
1141 /// assert!(re.str_in_re(&"0129".into(), e));
1142 /// ```
1143 pub fn full(&self) -> RegLan {
1144 self.sigma_star
1145 }
1146
1147 /// The RE that contains only the empty string
1148 ///
1149 /// # Example
1150 ///
1151 /// ```
1152 /// use aws_smt_strings::regular_expressions::*;
1153 ///
1154 /// let mut re = ReManager::new();
1155 /// let e = re.epsilon();
1156 ///
1157 /// // the empty string belongs to e
1158 /// assert!(re.str_in_re(&"".into(), e));
1159 /// // a non-empty string does not belong to e
1160 /// assert!(! re.str_in_re(&"a".into(), e));
1161 /// ```
1162 pub fn epsilon(&self) -> RegLan {
1163 self.epsilon
1164 }
1165
1166 /// The RE that contains all non-empty strings
1167 ///
1168 /// This is the complement of [epsilon](Self::epsilon).
1169 ///
1170 /// # Example
1171 ///
1172 /// ```
1173 /// use aws_smt_strings::regular_expressions::*;
1174 ///
1175 /// let mut re = ReManager::new();
1176 /// let e = re.sigma_plus();
1177 ///
1178 /// // the empty string does not belong to e
1179 /// assert!(! re.str_in_re(&"".into(), e));
1180 /// // any non-empty string belongs to e
1181 /// assert!(re.str_in_re(&"a".into(), e));
1182 /// ```
1183 pub fn sigma_plus(&self) -> RegLan {
1184 self.sigma_plus
1185 }
1186
1187 /// Regular expression defined by a character set
1188 ///
1189 /// Return the regular expression that contains all single-character
1190 /// strings with characters in the specified set. See also [range](Self::range).
1191 ///
1192 /// # Example
1193 ///
1194 /// Lower-case letters in ASCII.
1195 /// ```
1196 /// use aws_smt_strings::regular_expressions::*;
1197 /// use aws_smt_strings::character_sets::*;
1198 ///
1199 /// let mut re = ReManager::new();
1200 /// let set = CharSet::range('a' as u32, 'z' as u32);
1201 /// let e = re.char_set(set);
1202 ///
1203 /// // a single-character string that's in e
1204 /// assert!(re.str_in_re(&"c".into(), e));
1205 /// // a single-character string that's not in e
1206 /// assert!(!re.str_in_re(&"7".into(), e));
1207 /// // strings with more than one characters are not in e
1208 /// assert!(!re.str_in_re(&"abc".into(), e));
1209 /// ```
1210 pub fn char_set(&mut self, set: CharSet) -> RegLan {
1211 self.make(BaseRegLan::Range(set))
1212 }
1213
1214 ///
1215 /// Complement of a language
1216 ///
1217 /// Return the complement of RE `e`.
1218 ///
1219 /// # Example
1220 ///
1221 /// ```
1222 /// use aws_smt_strings::regular_expressions::*;
1223 ///
1224 /// let mut re = ReManager::new();
1225 /// let a_single_digit = re.range('0' as u32, '9' as u32);
1226 /// let not_a_digit = re.complement(a_single_digit);
1227 ///
1228 /// assert!(re.str_in_re(&"7".into(), a_single_digit));
1229 /// assert!(! re.str_in_re(&"7".into(), not_a_digit));
1230 ///
1231 /// // any string of more than 2 characters is not a single digit!
1232 /// assert!(re.str_in_re(&"94".into(), not_a_digit))
1233 /// ```
1234 pub fn complement(&mut self, e: RegLan) -> RegLan {
1235 self.id_to_re(e.id ^ 1)
1236 }
1237
1238 /// Concatenation of two languages
1239 ///
1240 /// Concatenate languages `e1` and `e2` in that order.
1241 ///
1242 /// # Example
1243 ///
1244 /// ```
1245 /// use aws_smt_strings::regular_expressions::*;
1246 ///
1247 /// let mut re = ReManager::new();
1248 /// let a_letter = re.range('a' as u32, 'z' as u32);
1249 /// let a_digit = re.range('0' as u32, '9' as u32);
1250 /// let e = re.concat(a_letter, a_digit);
1251 ///
1252 /// assert!(re.str_in_re(&"h4".into(), e));
1253 /// ```
1254 pub fn concat(&mut self, e1: RegLan, e2: RegLan) -> RegLan {
1255 match (&e1.expr, &e2.expr) {
1256 // empty . R --> empty
1257 (BaseRegLan::Empty, _) => self.empty,
1258 (_, BaseRegLan::Empty) => self.empty,
1259 // epsilon . R --> R
1260 (BaseRegLan::Epsilon, _) => e2,
1261 (_, BaseRegLan::Epsilon) => e1,
1262 // R . R^[i,j] --> R^[i+1, j+1]
1263 (_, BaseRegLan::Loop(y, rng)) if *e1 == **y => {
1264 self.make(BaseRegLan::Loop(e1, rng.add_point(1)))
1265 }
1266 (BaseRegLan::Loop(x, rng), _) if *e2 == **x => {
1267 self.make(BaseRegLan::Loop(e2, rng.add_point(1)))
1268 }
1269 // R^[a,b] . R^[b,c] -> R^[a+b, b+c]
1270 (BaseRegLan::Loop(x, x_rng), BaseRegLan::Loop(y, y_rng)) if *x == *y => {
1271 self.make(BaseRegLan::Loop(x, x_rng.add(y_rng)))
1272 }
1273 // R . R -> R^2
1274 _ if *e1 == *e2 => self.make(BaseRegLan::Loop(e1, LoopRange::point(2))),
1275 // (R . S) . T -> R . (S . T)
1276 (BaseRegLan::Concat(x, y), _) => {
1277 let right = self.concat(y, e2);
1278 self.concat(x, right)
1279 }
1280 //
1281 _ => {
1282 if e1.nullable && e2 == self.sigma_star {
1283 // S . Sigma^* --> Sigma^* if S is nullable
1284 e2
1285 } else {
1286 self.make(BaseRegLan::Concat(e1, e2))
1287 }
1288 }
1289 }
1290 }
1291
1292 /// Concatenation of multiple languages
1293 ///
1294 /// Build the concatenation of `a[0]`, `a[1]`, ... in this order.
1295 /// - return [epsilon](Self::epsilon) is `a` is empty.
1296 /// - return `a[0]` is `a.len() == 1`
1297 ///
1298 /// See [concat](Self::concat)
1299 ///
1300 /// # Example
1301 ///
1302 /// ```
1303 /// use aws_smt_strings::regular_expressions::*;
1304 ///
1305 /// let mut re = ReManager::new();
1306 ///
1307 /// let a_letter = re.range('a' as u32, 'z' as u32);
1308 /// let a_digit = re.range('0' as u32, '9' as u32);
1309 /// let e = re.concat_list([a_letter, a_letter, a_digit]);
1310 ///
1311 /// assert!(re.str_in_re(&"ab3".into(), e));
1312 /// ```
1313 pub fn concat_list(&mut self, a: impl IntoIterator<Item = RegLan>) -> RegLan {
1314 let mut v = Vec::new();
1315 for x in a {
1316 flatten_concat(x, &mut v);
1317 }
1318 let mut result = self.epsilon;
1319 for &x in v.iter().rev() {
1320 result = self.concat(x, result);
1321 }
1322 result
1323 }
1324
1325 /// Generalized loop
1326 ///
1327 /// Parameter `range` defines an interval of natural numbers `[i, j]` where `j` may be infinity.
1328 /// This method returns the regular language equal to the union of `e`<sup>k</sup> for k in the interval `[i, j]`.
1329 /// See [loop_ranges](crate::loop_ranges).
1330 ///
1331 /// # Example
1332 ///
1333 /// Regular expression that recognizes sequences of 3 to 5 digits.
1334 ///
1335 /// ```
1336 /// use aws_smt_strings::{regular_expressions::*, loop_ranges::*};
1337 ///
1338 /// let mut re = ReManager::new();
1339 ///
1340 /// let digits = re.range('0' as u32, '9' as u32);
1341 /// let e = re.mk_loop(digits, LoopRange::finite(3, 5));
1342 ///
1343 /// assert!(re.str_in_re(&"12345".into(), e));
1344 /// assert!(! re.str_in_re(&"123456".into(), e));
1345 /// ```
1346 pub fn mk_loop(&mut self, e: RegLan, range: LoopRange) -> RegLan {
1347 if range.is_zero() {
1348 // R ^ 0 --> epsilon
1349 self.epsilon
1350 } else if range.is_one() {
1351 // R ^ 1 --> R
1352 e
1353 } else {
1354 match &e.expr {
1355 // empty ^ [i, j] --> empty
1356 BaseRegLan::Empty => self.empty,
1357 // epsilon ^ [i, j] --> epsilon
1358 BaseRegLan::Epsilon => self.epsilon,
1359 // (R ^[i,j]) ^ [k, l] --> R ^[i *k, j*l] if the product is exact
1360 BaseRegLan::Loop(x, x_rng) if x_rng.right_mul_is_exact(&range) => {
1361 self.make(BaseRegLan::Loop(x, x_rng.mul(&range)))
1362 }
1363 _ => self.make(BaseRegLan::Loop(e, range)),
1364 }
1365 }
1366 }
1367
1368 // Intersection of REs in v
1369 fn make_inter(&mut self, mut v: Vec<RegLan>) -> RegLan {
1370 simplify_set_operation(&mut v, self.sigma_star, self.empty);
1371 if contains(&v, self.epsilon) {
1372 // v contains epsilon: the intersection is either epsilon or empty
1373 if v.iter().all(|&r| r.nullable) {
1374 self.epsilon
1375 } else {
1376 self.empty
1377 }
1378 } else {
1379 match v.len() {
1380 0 => self.sigma_star,
1381 1 => v[0],
1382 _ => self.make(BaseRegLan::Inter(v.into())),
1383 }
1384 }
1385 }
1386
1387 // Union of REs in v
1388 fn make_union(&mut self, mut v: Vec<RegLan>) -> RegLan {
1389 // verbose version of 'sub_language'
1390 #[allow(dead_code)]
1391 fn is_included(r: RegLan, s: RegLan) -> bool {
1392 let result = sub_language(r, s);
1393 if result {
1394 println!("---> subsumption: {r} subsumed by {s}");
1395 }
1396 result
1397 }
1398
1399 // we skip the subsumption check when x == r
1400 // this works since there are no duplicates in a
1401 fn is_subsumed(r: RegLan, a: &[RegLan]) -> bool {
1402 // a.iter().any(|&x| x != r && is_included(r, x))
1403 a.iter().any(|&x| x != r && sub_language(r, x))
1404 }
1405
1406 fn remove_subsumed(a: &mut Vec<RegLan>) {
1407 let mut i = 0;
1408 while i < a.len() {
1409 if is_subsumed(a[i], a) {
1410 a.remove(i);
1411 } else {
1412 i += 1;
1413 }
1414 }
1415 }
1416
1417 simplify_set_operation(&mut v, self.empty, self.sigma_star);
1418 if v.len() >= 2 {
1419 remove_subsumed(&mut v);
1420 }
1421 match v.len() {
1422 0 => self.empty,
1423 1 => v[0],
1424 _ => self.make(BaseRegLan::Union(v.into())),
1425 }
1426 }
1427
1428 /// Intersection of two languages
1429 ///
1430 /// Return the intersection of `e1` and `e2`
1431 ///
1432 /// # Example
1433 ///
1434 /// ```
1435 /// use aws_smt_strings::regular_expressions::*;
1436 ///
1437 /// let mut re = ReManager::new();
1438 ///
1439 /// let sigma = re.all_chars();
1440 /// let b = re.exp(sigma, 4); // all strings of length 4
1441 ///
1442 /// let digits = re.range('0' as u32, '9' as u32);
1443 /// let c = re.star(digits); // all sequences of digits
1444 ///
1445 /// let e = re.inter(b, c); // all sequences of four digits
1446 ///
1447 /// assert!(re.str_in_re(&"0000".into(), e));
1448 /// ```
1449 pub fn inter(&mut self, e1: RegLan, e2: RegLan) -> RegLan {
1450 let mut v = Vec::new();
1451 flatten_inter(e1, &mut v);
1452 flatten_inter(e2, &mut v);
1453 self.make_inter(v)
1454 }
1455
1456 /// Intersection of multiple languages
1457 ///
1458 /// This returns the intersection of `a[0]`, `a[1]`, etc.
1459 /// - return the full language (see [full](Self::full)) if `a` is empty
1460 /// - return `a[0]` if `a.len() == 1`
1461 /// - otherwise construct the intersection.
1462 ///
1463 /// See [inter](Self::inter) for an example.
1464 pub fn inter_list(&mut self, a: impl IntoIterator<Item = RegLan>) -> RegLan {
1465 let mut v = Vec::new();
1466 for r in a {
1467 flatten_inter(r, &mut v);
1468 }
1469 self.make_inter(v)
1470 }
1471
1472 /// Union of two languages
1473 ///
1474 /// Return the union of `e1` and `e2`.
1475 ///
1476 /// # Example
1477 ///
1478 /// ```
1479 /// use aws_smt_strings::regular_expressions::*;
1480 ///
1481 /// let mut re = ReManager::new();
1482 ///
1483 /// let abc = re.str(&"abc".into());
1484 /// let de = re.str(&"de".into());
1485 /// let u = re.union(abc, de);
1486 ///
1487 /// // u contains two strings: "abc" and "de"
1488 /// assert!(re.str_in_re(&"de".into(), u));
1489 /// assert!(re.str_in_re(&"abc".into(), u));
1490 /// ```
1491 pub fn union(&mut self, e1: RegLan, e2: RegLan) -> RegLan {
1492 let mut v = Vec::new();
1493 flatten_union(e1, &mut v);
1494 flatten_union(e2, &mut v);
1495 self.make_union(v)
1496 }
1497
1498 /// Union of several languages
1499 ///
1500 /// Return the union of `a[0]`, `a[1]`, ...
1501 /// - if `a` is empty, return the empty language (see [empty](Self::empty))
1502 /// - if `a.len() == 1`, return `a[0]`
1503 /// - otherwise, build the union.
1504 ///
1505 /// See [union](Self::union).
1506 ///
1507 pub fn union_list(&mut self, a: impl IntoIterator<Item = RegLan>) -> RegLan {
1508 let mut v = Vec::new();
1509 for r in a {
1510 flatten_union(r, &mut v);
1511 }
1512 self.make_union(v)
1513 }
1514
1515 /// Difference of two languages
1516 ///
1517 /// Return the difference of `e1` and `e2`. This is the same as
1518 /// the intersection of `e1` and the complement of `e2`.
1519 ///
1520 /// # Example
1521 ///
1522 /// ```
1523 /// use aws_smt_strings::regular_expressions::*;
1524 ///
1525 /// let mut re = ReManager::new();
1526 ///
1527 /// let sigma = re.all_chars();
1528 /// let b = re.exp(sigma, 4); // all strings of length 4
1529 ///
1530 /// let digits = re.range('0' as u32, '9' as u32);
1531 /// let c = re.star(digits); // all sequences of digits
1532 ///
1533 /// let e = re.diff(c, b); // sequences of digits of length other than four
1534 ///
1535 /// assert!(re.str_in_re(&"".into(), e)); // the empty sequence is included
1536 /// assert!(! re.str_in_re(&"0000".into(), e));
1537 /// assert!(re.str_in_re(&"123456".into(), e));
1538 /// ```
1539 pub fn diff(&mut self, e1: RegLan, e2: RegLan) -> RegLan {
1540 let comp_e2 = self.complement(e2);
1541 self.inter(e1, comp_e2)
1542 }
1543
1544 /// Difference of several languages
1545 ///
1546 /// Return the difference of `e1` and all regular expressions of `a`
1547 /// - return `e1` if `a` is empty.
1548 ///
1549 pub fn diff_list(&mut self, e1: RegLan, a: impl IntoIterator<Item = RegLan>) -> RegLan {
1550 let mut v = Vec::new();
1551 flatten_inter(e1, &mut v);
1552 for r in a {
1553 flatten_inter(self.complement(r), &mut v);
1554 }
1555 self.make_inter(v)
1556 }
1557
1558 /// All one-character strings
1559 ///
1560 /// This is the same as `range(CharSet::all_chars)`. See [range](Self::range) and [CharSet::all_chars].
1561 ///
1562 /// See [diff](Self::diff) for an example.
1563 pub fn all_chars(&mut self) -> RegLan {
1564 self.sigma
1565 }
1566
1567 /// A character as a regular expression
1568 ///
1569 /// Return the language that contains the one-character string `x` and nothing else.
1570 /// - this is the same as `char_set(CharSet::singleton(x))`. See [char_set](Self::char_set)
1571 /// and [CharSet::singleton].
1572 ///
1573 /// # Panics
1574 ///
1575 /// If x is not a valid SMT character (i.e., x > MAX_CHAR).
1576 ///
1577 /// # Example
1578 ///
1579 /// ```
1580 /// use aws_smt_strings::regular_expressions::*;
1581 ///
1582 /// let mut re = ReManager::new();
1583 ///
1584 /// let e = re.char('Z' as u32);
1585 ///
1586 /// assert!(re.str_in_re(&"Z".into(), e));
1587 /// ```
1588 pub fn char(&mut self, x: u32) -> RegLan {
1589 assert!(x <= MAX_CHAR);
1590 self.char_set(CharSet::singleton(x))
1591 }
1592
1593 /// Range of characters
1594 ///
1595 /// Return the language that contains all one-character strings that contains character in range [start, end]
1596 /// - this is the same as `char_set(CharSet::range(start, end))`. See [char_set](Self::char_set)
1597 /// and [CharSet::range].
1598 ///
1599 /// # Panics
1600 ///
1601 /// If the range is empty (i.e., start > end) or if end > [MAX_CHAR].
1602 ///
1603 /// # Example
1604 ///
1605 /// ```
1606 /// use aws_smt_strings::regular_expressions::*;
1607 ///
1608 /// let mut re = ReManager::new();
1609 ///
1610 /// let e = re.range('0' as u32, '9' as u32);
1611 ///
1612 /// assert!(re.str_in_re(&"4".into(), e));
1613 /// ```
1614 pub fn range(&mut self, start: u32, end: u32) -> RegLan {
1615 assert!(start <= end && end <= MAX_CHAR);
1616 self.char_set(CharSet::range(start, end))
1617 }
1618
1619 /// Kleene star closure
1620 ///
1621 /// Return the star closure of language `e` (i.e., the concatenations of an arbitrary number
1622 /// of strings of `e`).
1623 ///
1624 /// # Example
1625 ///
1626 /// ```
1627 /// use aws_smt_strings::regular_expressions::*;
1628 ///
1629 /// let mut re = ReManager::new();
1630 ///
1631 /// let letters = re.range('a' as u32, 'z' as u32);
1632 /// let letter_sequences = re.star(letters);
1633 ///
1634 /// assert!(re.str_in_re(&"abcd".into(), letter_sequences));
1635 /// assert!(re.str_in_re(&"".into(), letter_sequences));
1636 /// assert!(! re.str_in_re(&"abc-def".into(), letter_sequences));
1637 /// ```
1638 pub fn star(&mut self, e: RegLan) -> RegLan {
1639 self.mk_loop(e, LoopRange::star())
1640 }
1641
1642 /// Kleene plus
1643 ///
1644 /// Return the closure of `e` (i.e., the concatenation of one or more strings of `e`)
1645 ///
1646 /// # Example
1647 ///
1648 /// ```
1649 /// use aws_smt_strings::regular_expressions::*;
1650 ///
1651 /// let mut re = ReManager::new();
1652 ///
1653 /// let letters = re.range('a' as u32, 'z' as u32);
1654 /// let letter_sequences = re.plus(letters);
1655 ///
1656 /// assert!(re.str_in_re(&"abcd".into(), letter_sequences));
1657 /// assert!(! re.str_in_re(&"".into(), letter_sequences));
1658 /// assert!(! re.str_in_re(&"abc-def".into(), letter_sequences));
1659 /// ```
1660 pub fn plus(&mut self, e: RegLan) -> RegLan {
1661 self.mk_loop(e, LoopRange::plus())
1662 }
1663
1664 /// Option
1665 ///
1666 /// Returns the union of [epsilon](Self::epsilon) and `e`.
1667 ///
1668 /// # Example
1669 ///
1670 /// ```
1671 /// use aws_smt_strings::regular_expressions::*;
1672 ///
1673 /// let mut re = ReManager::new();
1674 ///
1675 /// let e = re.char('A' as u32);
1676 /// let opt_e = re.opt(e);
1677 ///
1678 /// // Both "A" and the empty strings are in `opt_e`
1679 /// assert!(re.str_in_re(&"A".into(), opt_e));
1680 /// assert!(re.str_in_re(&"".into(), opt_e));
1681 /// ```
1682 pub fn opt(&mut self, e: RegLan) -> RegLan {
1683 self.mk_loop(e, LoopRange::opt())
1684 }
1685
1686 /// Exponentiation
1687 ///
1688 /// Concatenates `e` with itself `k` times.
1689 /// - return [epsilon](Self::epsilon) if `k==0`.
1690 ///
1691 /// # Example
1692 ///
1693 /// All strings of length 5.
1694 ///
1695 /// ```
1696 /// use aws_smt_strings::regular_expressions::*;
1697 ///
1698 /// let mut re = ReManager::new();
1699 ///
1700 /// let a = re.all_chars();
1701 /// let b = re.exp(a, 5);
1702 ///
1703 /// assert!(re.str_in_re(&"ABCDE".into(), b));
1704 /// assert!(! re.str_in_re(&"ABCD".into(), b));
1705 /// ```
1706 pub fn exp(&mut self, e: RegLan, k: u32) -> RegLan {
1707 self.mk_loop(e, LoopRange::point(k))
1708 }
1709
1710 /// Finite loop as defined in SMT-LIB
1711 ///
1712 /// - if `i <= j`, return the regular expression `Loop(e, [i, j])`.
1713 /// See [mk_loop](Self::mk_loop)
1714 /// - if `i > j`, return the empty language. See [empty](Self::empty).
1715 ///
1716 /// # Example
1717 ///
1718 /// ```
1719 /// use aws_smt_strings::regular_expressions::*;
1720 ///
1721 /// let mut re = ReManager::new();
1722 ///
1723 /// let a = re.all_chars();
1724 /// let b = re.smt_loop(a, 3, 7); // strings of length 3 to 7
1725 ///
1726 /// assert!(re.str_in_re(&"abcdef".into(), b));
1727 /// ```
1728 pub fn smt_loop(&mut self, e: RegLan, i: u32, j: u32) -> RegLan {
1729 if i <= j {
1730 self.mk_loop(e, LoopRange::finite(i, j))
1731 } else {
1732 self.empty
1733 }
1734 }
1735
1736 /// Character range as defined in SMT-LIB
1737 ///
1738 /// - if `s1` and `s2` are both singleton strings, and `s1 <= s2` in the
1739 /// lexicographic ordering, return self.range(CharSet::range(c1, c2)) where c1 = unique
1740 /// character of s1 and c2 = unique character of s2.
1741 /// - otherwise, return the empty language
1742 ///
1743 /// ```
1744 /// use aws_smt_strings::regular_expressions::*;
1745 ///
1746 /// let mut re = ReManager::new();
1747 ///
1748 /// let b = re.smt_range(&"a".into(), &"z".into());
1749 ///
1750 /// assert!(re.str_in_re(&"h".into(), b));
1751 /// ```
1752 pub fn smt_range(&mut self, s1: &SmtString, s2: &SmtString) -> RegLan {
1753 if s1.len() == 1 && s2.len() == 1 {
1754 let c1 = s1.char(0);
1755 let c2 = s2.char(0);
1756 if c1 <= c2 {
1757 return self.char_set(CharSet::range(c1, c2));
1758 }
1759 }
1760 self.empty
1761 }
1762
1763 /// Language that contains a single string
1764 ///
1765 /// - Return the language that contains string `s` and nothing else.
1766 /// - This is the same as the SMT-LIB 'str.to_re' function.
1767 ///
1768 /// # Example
1769 ///
1770 /// ```
1771 /// use aws_smt_strings::regular_expressions::*;
1772 ///
1773 /// let mut re = ReManager::new();
1774 ///
1775 /// let s = re.str(&"alpha".into());
1776 ///
1777 /// assert!(re.str_in_re(&"alpha".into(), s));
1778 /// assert!(! re.str_in_re(&"beta".into(), s));
1779 /// ```
1780 pub fn str(&mut self, s: &SmtString) -> RegLan {
1781 let mut re = self.epsilon();
1782 for c in s.iter().rev() {
1783 let c = self.char(*c);
1784 re = self.concat(c, re);
1785 }
1786 re
1787 }
1788
1789 //
1790 // DERIVATIVES
1791 //
1792
1793 /// Compute the derivative w.r.t. c of all elements of list
1794 fn deriv_list(&mut self, list: &[RegLan], c: u32) -> Vec<RegLan> {
1795 list.iter().map(|r| self.deriv(r, c)).collect()
1796 }
1797
1798 /// Derivative of e with respect to a character c
1799 fn compute_derivative(&mut self, e: RegLan, c: u32) -> RegLan {
1800 match e.expr {
1801 BaseRegLan::Empty => self.empty,
1802 BaseRegLan::Epsilon => self.empty,
1803 BaseRegLan::Range(r) => {
1804 if r.contains(c) {
1805 self.epsilon
1806 } else {
1807 self.empty
1808 }
1809 }
1810 BaseRegLan::Concat(e1, e2) => {
1811 let d1 = self.deriv(e1, c);
1812 let d1 = self.concat(d1, e2);
1813 if e1.nullable {
1814 let d2 = self.deriv(e2, c);
1815 self.union(d1, d2)
1816 } else {
1817 d1
1818 }
1819 }
1820 BaseRegLan::Loop(e1, range) => {
1821 let d1 = self.deriv(e1, c);
1822 let e2 = self.mk_loop(e1, range.shift());
1823 self.concat(d1, e2)
1824 }
1825 BaseRegLan::Complement(e1) => {
1826 let d1 = self.deriv(e1, c);
1827 self.complement(d1)
1828 }
1829 BaseRegLan::Inter(ref v) => {
1830 let d = self.deriv_list(&v[..], c);
1831 self.inter_list(d)
1832 }
1833 BaseRegLan::Union(ref v) => {
1834 let d = self.deriv_list(&v[..], c);
1835 self.union_list(d)
1836 }
1837 }
1838 }
1839
1840 /// Derivative with respect to a character c using the cache
1841 fn deriv(&mut self, e: RegLan, c: u32) -> RegLan {
1842 let cid = e.class_of_char(c);
1843 self.cached_deriv(e, cid)
1844 }
1845
1846 ///
1847 /// Check whether character c can start expression e
1848 ///
1849 /// # Example
1850 ///
1851 /// ```
1852 /// use aws_smt_strings::regular_expressions::*;
1853 ///
1854 /// let mut re = ReManager::new();
1855 ///
1856 /// let digits = re.range('0' as u32, '9' as u32);
1857 /// let s = re.star(digits);
1858 ///
1859 /// assert!(re.start_char(s, '1' as u32));
1860 /// assert!(! re.start_char(s, 'a' as u32));
1861 /// ```
1862 pub fn start_char(&mut self, e: RegLan, c: u32) -> bool {
1863 match &e.expr {
1864 BaseRegLan::Empty => false,
1865 BaseRegLan::Epsilon => false,
1866 BaseRegLan::Range(set) => set.contains(c),
1867 BaseRegLan::Concat(e1, e2) => {
1868 self.start_char(e1, c) || e1.nullable && self.start_char(e2, c)
1869 }
1870 BaseRegLan::Loop(e, _) => self.start_char(e, c),
1871 BaseRegLan::Inter(args) => args.iter().all(|x| self.start_char(x, c)),
1872 BaseRegLan::Union(args) => args.iter().any(|x| self.start_char(x, c)),
1873 BaseRegLan::Complement(_) => {
1874 // expensive case
1875 let d = self.deriv(e, c);
1876 !self.is_empty_re(d)
1877 }
1878 }
1879 }
1880
1881 ///
1882 /// Check whether all characters in class cid can start e
1883 /// - return Error if cid is not a valid class for e
1884 ///
1885 pub fn start_class(&mut self, e: RegLan, cid: ClassId) -> Result<bool, Error> {
1886 if e.valid_class_id(cid) {
1887 let c = e.pick_class_rep(cid);
1888 Ok(self.start_char(e, c))
1889 } else {
1890 Err(Error::BadClassId)
1891 }
1892 }
1893
1894 ///
1895 /// Cached derivative
1896 ///
1897 /// - cid identifies a derivative class of e.
1898 /// - cid must be either Interval(i) where i is the id of a deriv class of e
1899 /// or Complement, which denotes the complementary class of e.
1900 ///
1901 /// In either cases, all characters in the class are equivalent:
1902 /// - if c1 and c2 are in deriv class i then deriv(e, c1) = deriv(e, c2)
1903 /// - if c1 and c2 are outside all deriv class then deriv(e, c1) = deriv(e, c2)
1904 ///
1905 /// This method panics in the following cases:
1906 /// - cid is Interval(i) but i is not the index of a derivative class of e
1907 /// - cid is Complementary but the complementary derivative class of e is empty.
1908 ///
1909 fn cached_deriv(&mut self, e: RegLan, cid: ClassId) -> RegLan {
1910 let key = DerivKey(e, cid);
1911 match self.deriv_cache.get(&key) {
1912 Some(&r) => r,
1913 None => {
1914 let c = e.pick_class_rep(cid);
1915 let r = self.compute_derivative(e, c);
1916 self.deriv_cache.insert(key, r);
1917 r
1918 }
1919 }
1920 }
1921
1922 ///
1923 /// Derivative with respect to a class id
1924 ///
1925 /// Compute the derivative of e with respect to a class defined by `cid`
1926 /// - if `cid` is `Interval(i)`: class = i-th derivative class of `e`
1927 /// - if `cid` is `Complement`: class = complementary derivative class of `e`
1928 ///
1929 /// Derivative classes of `e` are indexed from 0 to `n`-1 where `n` is the
1930 /// number of classes.
1931 ///
1932 /// # Errors
1933 ///
1934 /// Return Err([Error::BadClassId]) if the class id is invalid.
1935 /// - Class id `Interval(i)` is invalid if i is larger than or equal to the number
1936 /// of derivative classes of `e`.
1937 /// - Class if `Complement` is invalid if the complementary class of e is empty.
1938 ///
1939 /// # Example
1940 ///
1941 /// ```
1942 /// use aws_smt_strings::{regular_expressions::*, character_sets::*};
1943 /// # use std::error::Error;
1944 /// #
1945 /// # fn main() -> Result<(), Box<dyn Error>> {
1946 /// let mut re = ReManager::new();
1947 ///
1948 /// let abc = re.str(&"abc".into());
1949 /// let efg = re.str(&"efg".into());
1950 /// let r = re.union(abc, efg); // 'abc' + 'efg': two derivative classes
1951 ///
1952 /// let n = r.num_deriv_classes();
1953 /// assert_eq!(n, 2);
1954 ///
1955 /// let test1 = re.class_derivative(r, ClassId::Interval(0))?;
1956 /// let test2 = re.class_derivative(r, ClassId::Interval(1))?;
1957 /// let test3 = re.class_derivative(r, ClassId::Complement)?;
1958 ///
1959 /// assert_eq!(test1, re.str(&"bc".into()));
1960 /// assert_eq!(test2, re.str(&"fg".into()));
1961 /// assert_eq!(test3, re.empty());
1962 /// # Ok(())
1963 /// # }
1964 /// ```
1965 pub fn class_derivative(&mut self, e: RegLan, cid: ClassId) -> Result<RegLan, Error> {
1966 if e.valid_class_id(cid) {
1967 Ok(self.cached_deriv(e, cid))
1968 } else {
1969 Err(Error::BadClassId)
1970 }
1971 }
1972
1973 ///
1974 /// Unchecked derivative with respect to a class id
1975 ///
1976 /// Compute the derivative of e with respect to a class defined by cid.
1977 /// Does not check whether cid is a valid class id for e.
1978 /// See [class_derivative](Self::class_derivative)
1979 ///
1980 /// # Panics
1981 ///
1982 /// If cid is not a valid class id for e.
1983 ///
1984 pub fn class_derivative_unchecked(&mut self, e: RegLan, cid: ClassId) -> RegLan {
1985 self.cached_deriv(e, cid)
1986 }
1987
1988 ///
1989 /// Derivative with respect to a character set
1990 ///
1991 /// Return the derivative of e with respect to c provided this is well defined.
1992 ///
1993 /// # Errors
1994 ///
1995 /// The derivative with respect to c is well defined either if c is included in a
1996 /// derivative class of e or if c is included in the complementary class.
1997 /// If these conditions do not hold, the method return Err([Error::UndefinedDerivative]).
1998 /// See [Error].
1999 ///
2000 /// # Example
2001 /// ```
2002 /// use aws_smt_strings::{regular_expressions::*, character_sets::*};
2003 /// # use std::error::Error;
2004 /// #
2005 /// # fn main() -> Result<(), Box<dyn Error>> {
2006 /// let mut re = ReManager::new();
2007 ///
2008 /// let a_to_z = re.range('a' as u32, 'z' as u32);
2009 /// let e = re.plus(a_to_z); // non-empty sequences of lower-case ascii letters
2010 ///
2011 /// // the derivative of e w.r.t. ['c', 't'] is defined.
2012 /// let test = re.set_derivative(e, &CharSet::range('c' as u32, 't' as u32))?;
2013 ///
2014 /// assert_eq!(test, re.star(a_to_z));
2015 /// # Ok(())
2016 /// # }
2017 /// ```
2018 ///
2019 pub fn set_derivative(&mut self, e: RegLan, c: &CharSet) -> Result<RegLan, Error> {
2020 let cid = e.class_of_set(c)?;
2021 Ok(self.cached_deriv(e, cid))
2022 }
2023
2024 ///
2025 /// Unchecked derivative with respect to a character set.
2026 ///
2027 /// See [derivative](Self::set_derivative).
2028 ///
2029 /// # Panics
2030 ///
2031 /// If the derivative is not defined for this character set.
2032 ///
2033 pub fn set_derivative_unchecked(&mut self, e: RegLan, c: &CharSet) -> RegLan {
2034 let cid = e.class_of_set(c).unwrap();
2035 self.cached_deriv(e, cid)
2036 }
2037
2038 ///
2039 /// Derivative with respect to a character
2040 ///
2041 /// The derivative of e with respect to c is a regular expression e1 such
2042 /// every string of e that starts with c is formed by concatenating c and
2043 /// a string of e1. So the language of e1 is
2044 /// L(e1) = { w | c.w is in L(e) }
2045 ///
2046 /// # Example
2047 ///
2048 /// ```
2049 /// use aws_smt_strings::regular_expressions::*;
2050 ///
2051 /// fn sum_of_str(re: &mut ReManager, s1: &str, s2: &str) -> RegLan {
2052 /// let s1 = re.str(&s1.into());
2053 /// let s2 = re.str(&s2.into());
2054 /// re.union(s1, s2)
2055 /// }
2056 ///
2057 /// let re = &mut ReManager::new();
2058 /// let e = sum_of_str(re, "abc", "acc");
2059 ///
2060 /// // e is 'abc + acc'
2061 /// // the derivative of e w.r.t. 'a' is 'bc + cc'
2062 /// let d = re.char_derivative(e, 'a' as u32);
2063 /// assert_eq!(d, sum_of_str(re, "bc", "cc"));
2064 /// ```
2065 pub fn char_derivative(&mut self, e: RegLan, c: u32) -> RegLan {
2066 debug_assert!(c <= MAX_CHAR);
2067 self.deriv(e, c)
2068 }
2069
2070 ///
2071 /// Derivative with respect to a string
2072 ///
2073 /// The derivative with respect to s is defined by induction on the length of s:
2074 /// - if s is empty, deriv(e, s) = e
2075 /// - if s is of the form a.w, then deriv(e, s) = deriv(w, deriv(a, s))
2076 ///
2077 /// # Example
2078 ///
2079 /// ```
2080 /// use aws_smt_strings::regular_expressions::*;
2081 ///
2082 /// let re = &mut ReManager::new();
2083 /// let abc = re.str(&"abc".into());
2084 /// let acc = re.str(&"acc".into());
2085 /// let e = re.union(abc, acc);
2086 ///
2087 /// // e is 'abc + acc'
2088 /// // the derivative of e with respect to "ab" is 'c'
2089 /// let d1 = re.str_derivative(e, &"ab".into());
2090 /// assert_eq!(d1, re.char('c' as u32));
2091 ///
2092 /// // the derivative of e with respect to the empty string is e
2093 /// let d2 = re.str_derivative(e, &"".into());
2094 /// assert_eq!(d2, e);
2095 /// ```
2096 pub fn str_derivative(&mut self, e: RegLan, s: &SmtString) -> RegLan {
2097 s.iter().fold(e, |r, &c| self.char_derivative(r, c))
2098 }
2099
2100 /// Construct an iterator to list the derivatives of a regular expression
2101 ///
2102 /// The iterator produces `e`, then the derivatives of `e`, then the derivatives
2103 /// of these derivatives, and so forth. There are finitely many such derivatives.
2104 /// The iterator produces them without duplicates.
2105 pub fn iter_derivatives(&mut self, e: RegLan) -> DerivativeIterator<'_> {
2106 let mut queue = BfsQueue::new();
2107 queue.push(e);
2108 DerivativeIterator {
2109 manager: self,
2110 queue,
2111 }
2112 }
2113
2114 ///
2115 /// Check whether a string belongs to the language defined by a regular expression
2116 ///
2117 /// # Example
2118 ///
2119 /// ```
2120 /// use aws_smt_strings::regular_expressions::*;
2121 ///
2122 /// let re = &mut ReManager::new();
2123 ///
2124 /// // Build regular expression (ac + bc)*
2125 /// let ac = re.str(&"ac".into());
2126 /// let bc = re.str(&"bc".into());
2127 /// let sum = re.union(ac, bc);
2128 /// let e = re.star(sum);
2129 ///
2130 /// // Check that "acacbc" is in the language
2131 /// assert!(re.str_in_re(&"acacbc".into(), e))
2132 /// ```
2133 ///
2134 pub fn str_in_re(&mut self, s: &SmtString, e: RegLan) -> bool {
2135 self.str_derivative(e, s).nullable
2136 }
2137
2138 ///
2139 /// Check whether a regular expression is empty
2140 ///
2141 /// # Example
2142 /// ```
2143 /// use aws_smt_strings::regular_expressions::*;
2144 ///
2145 /// let re = &mut ReManager::new();
2146 ///
2147 /// let full = re.full();
2148 /// let abcd = re.str(&"abcd".into());
2149 /// let bc = re.str(&"bc".into());
2150 ///
2151 /// let a = re.concat(abcd, full); // strings that start with 'abcd'
2152 /// let b = re.concat_list([full, bc, full]); // strings that contain 'bc'
2153 ///
2154 /// let test = re.diff(a, b); // strings that start with 'abcd' but don't contain 'bc'
2155 /// assert!(re.is_empty_re(test));
2156 /// ```
2157 pub fn is_empty_re(&mut self, e: RegLan) -> bool {
2158 self.iter_derivatives(e).all(|x| !x.nullable)
2159 }
2160
2161 /// Check whether a regular expression is empty, with bounded derivative exploration
2162 ///
2163 /// This is a resource-bounded version of [`is_empty_re`](Self::is_empty_re) that limits the
2164 /// number of derivatives computed to avoid potentially expensive operations.
2165 ///
2166 /// # Returns
2167 /// - `None` if the derivative limit was reached before determining emptiness, otherwise
2168 /// - `Some(true)` if the language is empty
2169 /// - `Some(false)` if the language is non-empty
2170 ///
2171 /// # Example
2172 ///
2173 /// ```
2174 /// use aws_smt_strings::regular_expressions::*;
2175 /// let mut re = ReManager::new();
2176 /// let abc = re.str(&"abc".into());
2177 /// let def = re.str(&"def".into());
2178 /// let intersection = re.inter(abc, def);
2179 ///
2180 /// let small_bound_result = re.is_empty_re_bounded(intersection, 1);
2181 /// assert_eq!(small_bound_result, None);
2182 ///
2183 /// let result = re.is_empty_re_bounded(intersection, 3);
2184 /// assert_eq!(result, Some(true));
2185 /// ```
2186 pub fn is_empty_re_bounded(&mut self, e: RegLan, max_derivatives: usize) -> Option<bool> {
2187 for (count, d) in self.iter_derivatives(e).enumerate() {
2188 if count >= max_derivatives {
2189 return None;
2190 }
2191 if d.nullable {
2192 return Some(false);
2193 }
2194 }
2195 Some(true)
2196 }
2197
2198 ///
2199 /// Search for a symbolic string of e
2200 /// - the result is None, if e is an empty regular expression.
2201 /// - otherwise the result is Some(list or pairs (RegLan, ClassId) such that:
2202 /// 1) in each pair, the classId is valid for the RegLan
2203 /// 2) the RegLan in the first pair is e
2204 /// 3) in two successive pairs (r1, cid1) (r2, cid2),
2205 /// we have r2 = class_derivative(r1, cid1)
2206 /// 4) for the last pair in the list (r, cid), the derivative
2207 /// of r w.r.t. cid is nullable.
2208 /// - the list is empty if e itself is nullable (this represents the empty string)
2209 ///
2210 fn get_string_path(&mut self, e: RegLan) -> Option<Vec<(RegLan, ClassId)>> {
2211 let mut queue: LabeledQueue<RegLan, ClassId> = LabeledQueue::new(e);
2212 while let Some(r) = queue.pop() {
2213 if r.nullable {
2214 return queue.full_path(&r);
2215 } else {
2216 for cid in r.class_ids() {
2217 let d = self.class_derivative_unchecked(r, cid);
2218 queue.push(r, cid, d);
2219 }
2220 }
2221 }
2222 None
2223 }
2224
2225 ///
2226 /// Get a string that belongs to a regular expression
2227 ///
2228 /// Return None if the regular expression `e` is empty.
2229 ///
2230 /// # Example
2231 /// ```
2232 /// use aws_smt_strings::{regular_expressions::*, smt_strings::*};
2233 ///
2234 /// let re = &mut ReManager::new();
2235 ///
2236 /// let str1 = SmtString::from("abc");
2237 /// let str2 = SmtString::from("bcd");
2238 ///
2239 /// let abc = re.str(&str1);
2240 /// let bcd = re.str(&str2);
2241 /// let u = re.union(abc, bcd);
2242 ///
2243 /// let str = re.get_string(u);
2244 ///
2245 /// assert!(str == Some(str1) || str == Some(str2));
2246 /// ```
2247 pub fn get_string(&mut self, e: RegLan) -> Option<SmtString> {
2248 match self.get_string_path(e) {
2249 None => None,
2250 Some(path) => {
2251 let result: Vec<u32> = path
2252 .iter()
2253 .map(|(re, cid)| re.pick_class_rep(*cid))
2254 .collect();
2255 Some(result.into())
2256 }
2257 }
2258 }
2259
2260 //
2261 // try to compile to a DFA of no more than max_states.
2262 // return None if that fails (i.e., if the automaton will have more than max_states)
2263 //
2264 fn compile_with_bound(&mut self, e: RegLan, max_states: usize) -> Option<Automaton> {
2265 if max_states == 0 {
2266 None
2267 } else {
2268 let mut builder = AutomatonBuilder::new(&e.expr);
2269 let mut queue = BfsQueue::new();
2270 let mut state_count = 0;
2271 queue.push(e);
2272 while let Some(e) = queue.pop() {
2273 debug_assert!(state_count <= max_states);
2274 if state_count == max_states {
2275 return None;
2276 }
2277 state_count += 1;
2278 for set in e.char_ranges() {
2279 let d = self.set_derivative_unchecked(e, set);
2280 queue.push(d);
2281 builder.add_transition(&e.expr, set, &d.expr);
2282 }
2283 if !e.empty_complement() {
2284 let d = self.class_derivative_unchecked(e, ClassId::Complement);
2285 queue.push(d);
2286 builder.set_default_successor(&e.expr, &d.expr);
2287 }
2288 if e.nullable {
2289 builder.mark_final(&e.expr);
2290 }
2291 }
2292 Some(builder.build_unchecked())
2293 }
2294 }
2295
2296 ///
2297 /// Compile a regular expression to a deterministic finite state automaton
2298 ///
2299 /// # Example
2300 ///
2301 /// ```
2302 /// use aws_smt_strings::{regular_expressions::*, automata::*};
2303 ///
2304 /// let re = &mut ReManager::new();
2305 ///
2306 /// // (ac + bc)*
2307 /// let ac = re.str(&"ac".into());
2308 /// let bc = re.str(&"bc".into());
2309 /// let sum = re.union(ac, bc);
2310 /// let e = re.star(sum);
2311 ///
2312 /// // convert e to an automaton
2313 /// let auto = re.compile(e);
2314 ///
2315 /// // string accepted by the automaton
2316 /// assert!(auto.accepts(&"acbcbc".into()))
2317 /// ```
2318 pub fn compile(&mut self, e: RegLan) -> Automaton {
2319 self.compile_with_bound(e, usize::MAX).unwrap()
2320 }
2321
2322 ///
2323 /// Compile a regular expression to a DFA of bounded size
2324 ///
2325 /// Try to compile a regular expression `e` to a deterministic finite-state automaton
2326 /// of size no more than `max_states`.
2327 /// - e: regular expression
2328 /// - max_states: bound
2329 ///
2330 /// Return None if the DFA has more than `max_states`
2331 /// Return `Some(a)` otherwise where `a` is the automaton
2332 ///
2333 /// # Example
2334 ///
2335 /// ```
2336 /// use aws_smt_strings::{regular_expressions::*, automata::*};
2337 ///
2338 /// let re = &mut ReManager::new();
2339 ///
2340 /// // (ac + bc)+
2341 /// let ac = re.str(&"ac".into());
2342 /// let bc = re.str(&"bc".into());
2343 /// let sum = re.union(ac, bc);
2344 /// let e = re.plus(sum);
2345 ///
2346 /// // the smallest DFA that recognizes e has four states
2347 /// let test1 = re.try_compile(e, 3);
2348 /// assert!(test1.is_none());
2349 ///
2350 /// let test2 = re.try_compile(e, 4);
2351 /// assert!(test2.is_some());
2352 /// ```
2353 pub fn try_compile(&mut self, e: RegLan, max_states: usize) -> Option<Automaton> {
2354 self.compile_with_bound(e, max_states)
2355 }
2356}
2357
2358impl Default for ReManager {
2359 fn default() -> Self {
2360 Self::new()
2361 }
2362}
2363
2364/// Iterator to enumerate all the derivatives of a regular expression
2365///
2366/// See [iter_derivatives](crate::regular_expressions::ReManager::iter_derivatives).
2367#[derive(Debug)]
2368pub struct DerivativeIterator<'a> {
2369 queue: BfsQueue<RegLan>,
2370 manager: &'a mut ReManager,
2371}
2372
2373impl<'a> Iterator for DerivativeIterator<'a> {
2374 type Item = &'a RE;
2375
2376 fn next(&mut self) -> Option<Self::Item> {
2377 if let Some(r) = self.queue.pop() {
2378 for cid in r.class_ids() {
2379 let d = self.manager.class_derivative_unchecked(r, cid);
2380 self.queue.push(d);
2381 }
2382 Some(r)
2383 } else {
2384 None
2385 }
2386 }
2387}
2388
2389#[allow(clippy::uninlined_format_args)]
2390#[cfg(test)]
2391mod tests {
2392 use crate::smt_strings::char_to_smt;
2393
2394 use super::*;
2395
2396 #[allow(clippy::uninlined_format_args)]
2397 fn print_term(name: &str, r: RegLan) {
2398 println!("term {} = {}", name, r);
2399 println!(" ptr: {:p}", r);
2400 println!(" id: {}", r.id);
2401 println!(" nullable: {}", r.nullable);
2402 println!(" singleton: {}", r.singleton);
2403 println!(" pattern: {}", r.simple_pattern);
2404 println!(" deriv: {}", r.deriv_class);
2405 println!();
2406 }
2407
2408 fn build_atoms(re: &mut ReManager) -> Vec<RegLan> {
2409 vec![
2410 re.empty(),
2411 re.epsilon(),
2412 re.all_chars(),
2413 re.char('a' as u32),
2414 re.char('b' as u32),
2415 re.range('0' as u32, '9' as u32),
2416 re.range('A' as u32, 'Z' as u32),
2417 ]
2418 }
2419
2420 fn build_test_res(re: &mut ReManager) -> Vec<RegLan> {
2421 let mut v = build_atoms(re);
2422 let w = v.clone();
2423
2424 for &x in &w {
2425 v.push(re.complement(x));
2426 v.push(re.opt(x));
2427 v.push(re.star(x));
2428 v.push(re.plus(x));
2429 v.push(re.exp(x, 2));
2430 }
2431 for &x in &w {
2432 for &y in &w {
2433 v.push(re.concat(x, y));
2434 v.push(re.inter(x, y));
2435 v.push(re.union(x, y));
2436 }
2437 }
2438 v.sort();
2439 v.dedup();
2440 v
2441 }
2442
2443 fn check_equal(re1: RegLan, re2: RegLan) {
2444 assert_eq!(re1, re2);
2445 assert_eq!(re1.id, re2.id);
2446 assert!(std::ptr::eq(re1, re2));
2447 }
2448
2449 #[test]
2450 fn hash_atoms() {
2451 let re = &mut ReManager::new();
2452
2453 let v1 = build_atoms(re);
2454 let v2 = build_atoms(re);
2455
2456 for (i, &t) in v1.iter().enumerate() {
2457 let name = format!("t{i}");
2458 print_term(&name, t);
2459 check_equal(t, v2[i]);
2460 }
2461 }
2462
2463 #[test]
2464 fn test_loop() {
2465 let re = &mut ReManager::new();
2466
2467 let v = build_atoms(re);
2468
2469 for &t in &v {
2470 let x = re.star(t);
2471 print_term(&format!("star({t})"), x);
2472 check_equal(x, re.star(t));
2473 }
2474
2475 for &t in &v {
2476 let x = re.plus(t);
2477 print_term(&format!("plus({t})"), x);
2478 check_equal(x, re.plus(t));
2479 }
2480
2481 for &t in &v {
2482 let x = re.opt(t);
2483 print_term(&format!("opt({t})"), x);
2484 check_equal(x, re.opt(t));
2485 }
2486
2487 for &t in &v {
2488 for k in 0..3 {
2489 let x = re.exp(t, k);
2490 print_term(&format!("exp({t}, {k})"), x);
2491 check_equal(x, re.exp(t, k));
2492 }
2493 }
2494
2495 let a = re.all_chars();
2496 let a2 = re.exp(a, 2);
2497 let a2_star = re.star(a2);
2498 let a2_plus = re.plus(a2);
2499 let a_star = re.star(a);
2500 let a_plus = re.plus(a);
2501 let a_star2 = re.exp(a_star, 2);
2502 let a_star_star = re.star(a_star);
2503 let a_plus_star = re.star(a_plus);
2504 let a_star_plus = re.plus(a_star);
2505 print_term("(Sigma^2)^*", a2_star);
2506 print_term("(Sigma^2)^+", a2_plus);
2507 print_term("(Sigma^*)^2", a_star2);
2508 print_term("(Sigma^*)^*", a_star_star);
2509 print_term("(Sigma^*)^+)", a_star_plus);
2510 print_term("(Sigma^+)^*", a_plus_star);
2511
2512 assert_eq!(a_star2, a_star);
2513 assert_ne!(a2_star, a_star);
2514 assert_ne!(a2_star, a2_plus);
2515 assert_ne!(a2_plus, a_star);
2516 assert_eq!(a_plus_star, a_star);
2517 assert_eq!(a_star_plus, a_star);
2518 assert_eq!(a_star_star, a_star);
2519 }
2520
2521 #[test]
2522 fn test_concat() {
2523 let re = &mut ReManager::new();
2524 let v = build_atoms(re);
2525
2526 for &t in &v {
2527 for &u in &v {
2528 let x = re.concat(t, u);
2529 print_term(&format!("concat({t}, {u})"), x);
2530 check_equal(x, re.concat(t, u));
2531 }
2532 }
2533 }
2534
2535 #[test]
2536 fn test_inter() {
2537 let re = &mut ReManager::new();
2538 let v = build_atoms(re);
2539
2540 for &t in &v {
2541 for &u in &v {
2542 let x = re.inter(t, u);
2543 print_term(&format!("inter({t}, {u})"), x);
2544 check_equal(x, re.inter(t, u));
2545 }
2546 }
2547 }
2548
2549 #[test]
2550 fn test_union() {
2551 let re = &mut ReManager::new();
2552 let v = build_atoms(re);
2553
2554 for &t in &v {
2555 for &u in &v {
2556 let x = re.union(t, u);
2557 print_term(&format!("union({t}, {u})"), x);
2558 check_equal(x, re.union(t, u));
2559 }
2560 }
2561 }
2562
2563 #[test]
2564 fn test_complement() {
2565 let re = &mut ReManager::new();
2566 let v = build_atoms(re);
2567
2568 for &t in &v {
2569 let x = re.complement(t);
2570 print_term(&format!("complement({t})"), x);
2571 check_equal(x, re.complement(t));
2572
2573 let y = re.complement(x);
2574 print_term(&format!("complement({x})"), y);
2575 check_equal(y, t);
2576 check_equal(y, re.complement(x));
2577 }
2578 }
2579
2580 #[test]
2581 fn test_from_str() {
2582 let re = &mut ReManager::new();
2583
2584 let x = re.str(&SmtString::from("abcde"));
2585 print_term("(str.to_re \"abcde\")", x);
2586 check_equal(x, re.str(&SmtString::from("abcde")));
2587
2588 let v = vec![
2589 re.char('a' as u32),
2590 re.char('b' as u32),
2591 re.epsilon(),
2592 re.epsilon(),
2593 re.char('c' as u32),
2594 re.char('d' as u32),
2595 re.char('e' as u32),
2596 ];
2597
2598 let y = re.concat_list(v);
2599 check_equal(x, y);
2600 }
2601
2602 #[test]
2603 fn bigger_test() {
2604 let re = &mut ReManager::new();
2605 let v = build_test_res(re);
2606
2607 for &t in &v {
2608 for &u in &v {
2609 let x = re.concat(t, u);
2610 print_term(&format!("concat({t}, {u})"), x);
2611 check_equal(x, re.concat(t, u));
2612
2613 let x = re.inter(t, u);
2614 print_term(&format!("inter({t}, {u})"), x);
2615 check_equal(x, re.inter(t, u));
2616
2617 let x = re.union(t, u);
2618 print_term(&format!("union({t}, {u})"), x);
2619 check_equal(x, re.union(t, u));
2620 }
2621 }
2622 }
2623
2624 #[test]
2625 fn test_sub_terms() {
2626 fn print_sub_terms(t: RegLan) {
2627 println!("Base term: {t}");
2628 println!("Sub terms = [");
2629 for x in sub_terms(t) {
2630 println!(" {x}");
2631 }
2632 println!("]");
2633
2634 println!("Leaves = [");
2635 for leaf in leaves(t) {
2636 println!(" {leaf}");
2637 }
2638 println!("]\n");
2639 }
2640
2641 let re = &mut ReManager::new();
2642 let v = build_test_res(re);
2643 for &t in &v {
2644 print_sub_terms(t)
2645 }
2646
2647 let t = re.str(&"0987654321aabd".into());
2648 print_sub_terms(t)
2649 }
2650
2651 #[test]
2652 fn test_base_patterns() {
2653 let re = &mut ReManager::new();
2654
2655 fn show_patterns(r: RegLan) {
2656 let v = decompose_concat(r);
2657 let test = base_patterns(&v);
2658 println!("Expression: {r} ");
2659 println!(" vector:");
2660 for x in &v {
2661 println!(" {x}");
2662 }
2663 println!(" base patterns:");
2664 for x in &test {
2665 println!(" {x}");
2666 }
2667 println!()
2668 }
2669
2670 let test1 = re.all_chars();
2671 show_patterns(test1);
2672
2673 let test2 = re.epsilon();
2674 show_patterns(test2);
2675
2676 let test3 = re.full();
2677 show_patterns(test3);
2678
2679 let digits = re.range('0' as u32, '9' as u32);
2680 let d = re.star(digits);
2681
2682 let test4 = re.concat_list([test1, digits, test3, test3, d, test1, d, d]);
2683 show_patterns(test4);
2684 }
2685
2686 #[test]
2687 fn test_deriv() {
2688 let re = &mut ReManager::new();
2689 let v = build_test_res(re);
2690
2691 for &t in &v {
2692 for c in t.deriv_class.ranges() {
2693 let x = re.set_derivative(t, c);
2694 match x {
2695 Ok(d) => println!("deriv {t} wrt {c} = {d}"),
2696 Err(e) => panic!("deriv {} wrt {} failed with error {:?}", t, c, e),
2697 }
2698 }
2699 if !t.deriv_class.empty_complement() {
2700 let y = re.class_derivative(t, ClassId::Complement);
2701 match y {
2702 Ok(d) => println!("deriv {t} wrt CompClass = {d}"),
2703 Err(e) => panic!("deriv {} wrt CompClass failed with error {:?}", t, e),
2704 }
2705 }
2706 }
2707 }
2708
2709 // deriv e w.r.t. 'a', 'b', and 'c' and e's complementary class
2710 fn show_derivatives(re: &mut ReManager, e: RegLan) {
2711 println!("Expression: {e}");
2712 println!(" deriv classes: {}", e.deriv_class);
2713 for c in 'a'..='c' {
2714 println!(" deriv({e}, {c}) = {}", re.char_derivative(e, c as u32))
2715 }
2716 if !e.empty_complement() {
2717 println!(
2718 " deriv({e}, CompClass) = {}",
2719 re.class_derivative(e, ClassId::Complement).unwrap()
2720 )
2721 }
2722 println!()
2723 }
2724
2725 #[test]
2726 fn test_deriv2() {
2727 let re = &mut ReManager::new();
2728 // a + ac + bc
2729 let a = re.str(&"a".into());
2730 let ac = re.str(&"ac".into());
2731 let bc = re.str(&"bc".into());
2732 let e = re.union_list([a, ac, bc]);
2733
2734 show_derivatives(re, e);
2735 let d1 = re.char_derivative(e, 'a' as u32);
2736 show_derivatives(re, d1);
2737 let d2 = re.char_derivative(e, 'b' as u32);
2738 show_derivatives(re, d2);
2739 let d3 = re.char_derivative(e, 'c' as u32);
2740 show_derivatives(re, d3);
2741
2742 assert!(re.str_in_re(&"a".into(), e));
2743 assert!(re.str_in_re(&"ac".into(), e));
2744 assert!(re.str_in_re(&"bc".into(), e));
2745 assert!(!re.str_in_re(&"b".into(), e));
2746 assert!(!re.str_in_re(&"c".into(), e));
2747 }
2748
2749 #[test]
2750 fn test_deriv3() {
2751 let re = &mut ReManager::new();
2752 // (ac + bc)*
2753 let ac = re.str(&"ac".into());
2754 let bc = re.str(&"bc".into());
2755 let sum = re.union(ac, bc);
2756 let e = re.star(sum);
2757
2758 show_derivatives(re, e);
2759 let d1 = re.char_derivative(e, 'a' as u32);
2760 show_derivatives(re, d1);
2761 let d2 = re.char_derivative(e, 'b' as u32);
2762 show_derivatives(re, d2);
2763 let d3 = re.char_derivative(e, 'c' as u32);
2764 show_derivatives(re, d3);
2765
2766 assert!(re.str_in_re(&"acacbc".into(), e));
2767 assert!(re.str_in_re(&"".into(), e));
2768 }
2769
2770 fn all_derivatives(re: &mut ReManager, e: RegLan) -> Vec<RegLan> {
2771 let mut queue = BfsQueue::new();
2772 let mut result = Vec::new();
2773
2774 queue.push(e);
2775 result.push(e);
2776
2777 while let Some(r) = queue.pop() {
2778 for cid in r.class_ids() {
2779 if let Ok(d) = re.class_derivative(r, cid) {
2780 if queue.push(d) {
2781 result.push(d)
2782 }
2783 } else {
2784 panic!("Unexpected failure: deriv {} w.r.t Class{}", r, cid)
2785 }
2786 }
2787 }
2788 result
2789 }
2790
2791 #[test]
2792 fn test_all_deriv() {
2793 fn show_derivs(e: RegLan, v: &[RegLan]) {
2794 println!("All derivatives of {e}");
2795 for &d in v {
2796 println!(" {d}")
2797 }
2798 if v.len() == 1 {
2799 println!("Total: 1 derivative")
2800 } else {
2801 println!("Total: {} derivatives", v.len())
2802 }
2803 }
2804
2805 let re = &mut ReManager::new();
2806
2807 // (ac + bc)*
2808 let ac = re.str(&"ac".into());
2809 let bc = re.str(&"bc".into());
2810 let sum = re.union(ac, bc);
2811 let e = re.star(sum);
2812
2813 let v = all_derivatives(re, e);
2814 show_derivs(e, &v);
2815
2816 let a = [
2817 re.sigma_plus(),
2818 re.str(&"something".into()),
2819 re.all_chars(),
2820 re.all_chars(),
2821 re.char(':' as u32),
2822 re.full(),
2823 re.char(':' as u32),
2824 re.full(),
2825 re.str(&".jpg".into()),
2826 ];
2827
2828 let e = re.concat_list(a);
2829 let v = all_derivatives(re, e);
2830 show_derivs(e, &v);
2831 }
2832
2833 #[test]
2834 fn test_derivative_iter() {
2835 let re = &mut ReManager::new();
2836
2837 // abc/(Σ^*)/ΣΣ-ΣΣ/def/(Σ^*)
2838 let a = [
2839 re.str(&"abc/".into()),
2840 re.full(),
2841 re.char('/' as u32),
2842 re.all_chars(),
2843 re.all_chars(),
2844 re.char('-' as u32),
2845 re.all_chars(),
2846 re.all_chars(),
2847 re.str(&"/def/".into()),
2848 re.full(),
2849 ];
2850
2851 let e = re.concat_list(a);
2852 println!("All derivatives of {e}");
2853 let mut count = 0;
2854 for r in re.iter_derivatives(e) {
2855 println!(" {r}");
2856 count += 1;
2857 }
2858 println!("Total: {count} derivatives");
2859 }
2860
2861 #[test]
2862 fn test_char_deriv() {
2863 fn sum_of_str(re: &mut ReManager, s1: &str, s2: &str) -> RegLan {
2864 let s1 = re.str(&s1.into());
2865 let s2 = re.str(&s2.into());
2866 re.union(s1, s2)
2867 }
2868
2869 let mut re = ReManager::new();
2870 let e = sum_of_str(&mut re, "abc", "acc");
2871
2872 // e is 'abc + acc'
2873 // the derivative of e w.r.t. 'a' is 'bc + cc'
2874 let d = re.char_derivative(e, 'a' as u32);
2875 assert_eq!(d, sum_of_str(&mut re, "bc", "cc"));
2876 }
2877
2878 #[test]
2879 fn test_empty_check() {
2880 let re = &mut ReManager::new();
2881
2882 let full = re.full();
2883 let abcd = re.str(&"abcd".into());
2884 let bc = re.str(&"bc".into());
2885
2886 let a = re.concat(abcd, full); // strings that start with 'abcd'
2887 let b = re.concat_list([full, bc, full]); // strings that contain 'bc'
2888
2889 let test = re.diff(a, b); // strings that start with 'abcd' but don't contain 'bc'
2890 assert!(re.is_empty_re(test));
2891 }
2892
2893 #[test]
2894 fn test_empty_check2() {
2895 let re = &mut ReManager::new();
2896
2897 let c1 = [
2898 re.all_chars(),
2899 re.all_chars(),
2900 re.all_chars(),
2901 re.all_chars(),
2902 re.full(),
2903 re.str(&"/abcd/".into()),
2904 re.all_chars(),
2905 re.str(&"/end".into()),
2906 ];
2907
2908 let c2 = [re.all_chars(), re.str(&"/zhfg".into()), re.full()];
2909
2910 let e1 = re.concat_list(c1);
2911 let e2 = re.concat_list(c2);
2912
2913 let test = re.inter(e1, e2);
2914 assert!(!re.is_empty_re(test));
2915
2916 let sample = re.get_string(test);
2917 assert!(sample.is_some());
2918 println!("Sample string in {}: {}", test, sample.unwrap());
2919 }
2920
2921 #[test]
2922 fn test_empty_check_bounded() {
2923 let re = &mut ReManager::new();
2924
2925 let full = re.full();
2926 let abcd = re.str(&"abcd".into());
2927 let bc = re.str(&"bc".into());
2928
2929 let a = re.concat(abcd, full); // strings that start with 'abcd'
2930 let b = re.concat_list([full, bc, full]); // strings that contain 'bc'
2931
2932 let test = re.diff(a, b); // strings that start with 'abcd' but don't contain 'bc'
2933 let derivatives_count = re.iter_derivatives(test).count();
2934
2935 assert_eq!(re.is_empty_re_bounded(test, derivatives_count - 1), None);
2936 assert_eq!(re.is_empty_re_bounded(test, derivatives_count), Some(true));
2937 }
2938
2939 #[test]
2940 fn test_empty_check_bounded_2() {
2941 let re = &mut ReManager::new();
2942
2943 let c1 = [
2944 re.all_chars(),
2945 re.all_chars(),
2946 re.full(),
2947 re.str(&"/abcd/".into()),
2948 re.all_chars(),
2949 re.str(&"/end".into()),
2950 ];
2951
2952 let e1 = re.concat_list(c1);
2953 let derivatives_count = re.iter_derivatives(e1).count();
2954
2955 assert_eq!(re.is_empty_re_bounded(e1, derivatives_count - 1), None);
2956 assert_eq!(re.is_empty_re_bounded(e1, derivatives_count), Some(false));
2957 }
2958
2959 #[test]
2960 fn test_compile() {
2961 let re = &mut ReManager::new();
2962
2963 // (ac + bc)*
2964 let ac = re.str(&"ac".into());
2965 let bc = re.str(&"bc".into());
2966 let sum = re.union(ac, bc);
2967 let e = re.star(sum);
2968
2969 // convert e to an automaton
2970 let auto = re.compile(e);
2971 println!("Automaton for (ac + bc)*");
2972 println!("{}", auto);
2973
2974 println!("Char partition: {}", auto.combined_char_partition());
2975
2976 let reps = auto.pick_alphabet();
2977 print!("Alphabet:");
2978 for x in reps {
2979 print!(" {}", char_to_smt(x));
2980 }
2981 println!();
2982
2983 let m = auto.compile_successors();
2984 println!("Compiled transition table: size = {}", m.size());
2985
2986 assert_eq!(auto.num_states(), 3);
2987 assert_eq!(auto.num_final_states(), 1);
2988 }
2989
2990 #[test]
2991 fn test_compile2() {
2992 let re = &mut ReManager::new();
2993
2994 let a = [
2995 re.sigma_plus(),
2996 re.str(&"something".into()),
2997 re.all_chars(),
2998 re.all_chars(),
2999 re.char(':' as u32),
3000 re.full(),
3001 re.char(':' as u32),
3002 re.full(),
3003 re.str(&".jpg".into()),
3004 ];
3005
3006 let e = re.concat_list(a);
3007
3008 let auto = re.compile(e);
3009 println!("Automaton for {}", e);
3010 println!("{}", auto);
3011
3012 println!("Char partition: {}", auto.combined_char_partition());
3013
3014 let a = auto.pick_alphabet();
3015 print!("Alphabet representatives:");
3016 for x in a {
3017 print!(" {}", char_to_smt(x));
3018 }
3019 println!();
3020
3021 let m = auto.compile_successors();
3022 println!("Compiled transition table: size = {}", m.size());
3023 println!("Transition table: {}", m);
3024
3025 assert!(auto.accepts(&"prefix_then_somethingAB:middle:mores_stuff.jpg".into()));
3026 assert!(!auto.accepts(&"prefix_then_something:middle:mores_stuff.jpg".into()));
3027
3028 auto.test_minimizer();
3029 }
3030
3031 #[test]
3032 fn test_compile3() {
3033 let re = &mut ReManager::new();
3034
3035 let c1 = [
3036 re.all_chars(),
3037 re.all_chars(),
3038 re.all_chars(),
3039 re.all_chars(),
3040 re.full(),
3041 re.str(&"/abcd/".into()),
3042 re.all_chars(),
3043 re.str(&"/end".into()),
3044 ];
3045
3046 let c2 = [re.all_chars(), re.str(&"/zhfg".into()), re.full()];
3047
3048 let e1 = re.concat_list(c1);
3049 let e2 = re.concat_list(c2);
3050
3051 let test = re.inter(e1, e2);
3052
3053 let auto = re.compile(test);
3054 println!("Automaton for {}", test);
3055 println!("{}", auto);
3056
3057 println!("Char partition: {}", auto.combined_char_partition());
3058 let a = auto.pick_alphabet();
3059 print!("Alphabet representatives:");
3060 for x in a {
3061 print!(" {}", char_to_smt(x));
3062 }
3063 println!();
3064
3065 let m = auto.compile_successors();
3066 println!("Compiled transition table: size = {}", m.size());
3067 println!("Table: {}", m);
3068
3069 let w = re.get_string(test).unwrap();
3070 assert!(auto.accepts(&w));
3071
3072 auto.test_minimizer();
3073 }
3074
3075 #[test]
3076 fn test_bounded_compile() {
3077 let re = &mut ReManager::new();
3078
3079 // abc/(Σ^*)/ΣΣ-ΣΣ/def/(Σ^*)
3080 let a = [
3081 re.str(&"abc/".into()),
3082 re.full(),
3083 re.char('/' as u32),
3084 re.all_chars(),
3085 re.all_chars(),
3086 re.char('-' as u32),
3087 re.all_chars(),
3088 re.all_chars(),
3089 re.str(&"/def/".into()),
3090 re.full(),
3091 ];
3092
3093 let e = re.concat_list(a);
3094
3095 let test0 = re.try_compile(e, 0);
3096 assert!(test0.is_none());
3097
3098 let test1 = re.try_compile(e, 46);
3099 assert!(test1.is_none());
3100
3101 let test2 = re.try_compile(e, 47);
3102 assert!(test2.is_some());
3103 assert_eq!(test2.unwrap().num_states(), 47);
3104 }
3105
3106 #[test]
3107 fn test_compile4() {
3108 let re = &mut ReManager::new();
3109
3110 // bb([0-9]+)
3111 let bb = re.str(&"bb".into());
3112 let digits = re.range('0' as u32, '9' as u32);
3113 let digits = re.plus(digits);
3114 let first = re.concat(bb, digits);
3115
3116 // \sigma^[0..4]
3117 let sigma = re.all_chars();
3118 let lop = re.smt_loop(sigma, 0, 4);
3119 let second = re.complement(lop);
3120
3121 let to_compile = re.inter(second, first);
3122
3123 let test1 = re.try_compile(to_compile, 10000);
3124 assert!(test1.is_some());
3125 assert!(test1.unwrap().accepts(&"bb01234".into()));
3126 }
3127
3128 #[test]
3129 fn test_compile5() {
3130 let re = &mut ReManager::new();
3131
3132 // (a^5)+
3133 let a = re.char('a' as u32);
3134 let a5 = re.smt_loop(a, 5, 5);
3135 let a5plus = re.plus(a5);
3136
3137 println!("testing {a5plus}");
3138 let test = re.try_compile(a5plus, 10000);
3139 assert!(test.is_some());
3140 let dfa = test.unwrap();
3141 println!("Resulting automaton: {dfa}");
3142 assert!(dfa.accepts(&"aaaaaaaaaa".into()));
3143 assert!(dfa.accepts(&"aaaaa".into()));
3144 assert!(!dfa.accepts(&"aaaa".into()));
3145 }
3146
3147 #[test]
3148 fn test_compile6() {
3149 let re = &mut ReManager::new();
3150
3151 // (a^5)+
3152 let a = re.char('a' as u32);
3153 let a5 = re.smt_loop(a, 5, 5);
3154 let a5plus = re.plus(a5);
3155
3156 // (a^2)+
3157 let a2 = re.smt_loop(a, 2, 2);
3158 let a2plus = re.plus(a2);
3159
3160 let inter = re.inter(a5plus, a2plus);
3161 println!("testing {inter}");
3162 let test = re.try_compile(inter, 10000);
3163 assert!(test.is_some());
3164 let dfa = test.unwrap();
3165 println!("Resulting automaton: {dfa}");
3166
3167 assert!(dfa.accepts(&"aaaaaaaaaa".into()));
3168 assert!(!dfa.accepts(&"aaaaa".into()));
3169 assert!(!dfa.accepts(&"aa".into()));
3170 }
3171}